/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_tcpdebug.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <net/if.h>
#include <net/route.h>
#include <net/rss_config.h>
#include <net/vnet.h>
#include <net/netisr.h>

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_rss.h>
#include <netinet/in_systm.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_seq.h>
#include <netinet/cc/cc.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>
#include <netinet/tcp_debug.h>

int	tcp_persmin;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_persmin, 0, sysctl_msec_to_ticks, "I",
    "minimum persistence interval");

int	tcp_persmax;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_persmax, 0, sysctl_msec_to_ticks, "I",
    "maximum persistence interval");

int	tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_keepinit, 0, sysctl_msec_to_ticks, "I",
    "time to establish connection");

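/*
 * Keepalive operation: after TP_KEEPIDLE ticks of idle time, probes are
 * sent every TP_KEEPINTVL ticks; the connection is dropped once it has
 * been idle for TP_KEEPIDLE + TP_MAXIDLE (see tcp_timer_keep() below).
 */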
int	tcp_keepidle;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_keepidle, 0, sysctl_msec_to_ticks, "I",
    "time before keepalive probes begin");

int	tcp_keepintvl;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I",
    "time between keepalive probes");

int	tcp_delacktime;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
    "Time before a delayed ACK is sent");

VNET_DEFINE(int, tcp_msl);
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(tcp_msl), 0, sysctl_msec_to_ticks, "I",
    "Maximum segment lifetime");

int	tcp_rexmit_initial;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_initial,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_rexmit_initial, 0, sysctl_msec_to_ticks, "I",
    "Initial Retransmission Timeout");

int	tcp_rexmit_min;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
    "Minimum Retransmission Timeout");

int	tcp_rexmit_slop;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
    "Retransmission Timer Slop");

VNET_DEFINE(int, tcp_always_keepalive) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_VNET|CTLFLAG_RW,
    &VNET_NAME(tcp_always_keepalive), 0,
    "Assume SO_KEEPALIVE on all TCP connections");

int	tcp_fast_finwait2_recycle = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW,
    &tcp_fast_finwait2_recycle, 0,
    "Recycle closed FIN_WAIT_2 connections faster");

int	tcp_finwait2_timeout;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I",
    "FIN-WAIT2 timeout");

int	tcp_keepcnt = TCPTV_KEEPCNT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
    "Number of keepalive probes to send");

/* max idle probes */
int	tcp_maxpersistidle;

int	tcp_rexmit_drop_options = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
    &tcp_rexmit_drop_options, 0,
    "Drop TCP options from 3rd and later retransmitted SYN");

int	tcp_maxunacktime = TCPTV_MAXUNACKTIME;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxunacktime,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_maxunacktime, 0, sysctl_msec_to_ticks, "I",
    "Maximum time (in ms) that a session can linger without making progress");

VNET_DEFINE(int, tcp_pmtud_blackhole_detect);
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_detect), 0,
    "Path MTU Discovery Black Hole Detection Enabled");

#ifdef INET
VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
    "Path MTU Discovery Black Hole Detection lowered MSS");
#endif

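/*
 * The IPv6 default below matches the minimum IPv6 MTU (1280) less
 * 60 bytes for the IPv6 (40) and TCP (20) headers.
 */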
#ifdef INET6
VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
    "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
#endif

#ifdef RSS
static int	per_cpu_timers = 1;
#else
static int	per_cpu_timers = 0;
#endif
SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
    &per_cpu_timers, 0, "run tcp timers on all cpus");

/*
 * Map the given inp to a CPU id.
 *
 * This queries RSS if it's compiled in, else it defaults to the current
 * CPU ID.
 */
inline int
inp_to_cpuid(struct inpcb *inp)
{
	u_int cpuid;

	if (per_cpu_timers) {
#ifdef	RSS
		cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
		if (cpuid == NETISR_CPUID_NONE)
			return (curcpu);	/* XXX */
		else
			return (cpuid);
#endif
		/*
		 * We don't have a flowid -> cpuid mapping, so cheat and
		 * just map unknown cpuids to curcpu.  Not the best, but
		 * apparently better than defaulting to swi 0.
		 */
		cpuid = inp->inp_flowid % (mp_maxid + 1);
		if (!CPU_ABSENT(cpuid))
			return (cpuid);
		return (curcpu);
	} else {
		return (0);
	}
}

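/*
 * Per-shift multipliers applied to the base RTO: the retransmission
 * timeout doubles on each retry up to a factor of 512, for shifts
 * 0 through TCP_MAXRXTSHIFT (12).  The entries sum to tcp_totbackoff.
 */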
int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };

int	tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */

/*
 * TCP timer processing.
 */

/*
 * Delayed ACK timer: force the pending ACK to be sent now via tcp_output().
 */
void
tcp_timer_delack(void *xtp)
{
	struct epoch_tracker et;
	struct tcpcb *tp = xtp;
	struct inpcb *inp = tptoinpcb(tp);

	INP_WLOCK(inp);
	CURVNET_SET(inp->inp_vnet);

	if (callout_pending(&tp->t_timers->tt_delack) ||
	    !callout_active(&tp->t_timers->tt_delack)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_delack);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	tp->t_flags |= TF_ACKNOW;
	TCPSTAT_INC(tcps_delack);
	NET_EPOCH_ENTER(et);
	(void) tcp_output_unlock(tp);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
}

/*
 * Call tcp_close() from a callout context.
 */
static void
tcp_timer_close(struct tcpcb *tp)
{
	struct epoch_tracker et;
	struct inpcb *inp = tptoinpcb(tp);

	INP_WLOCK_ASSERT(inp);

	NET_EPOCH_ENTER(et);
	tp = tcp_close(tp);
	NET_EPOCH_EXIT(et);
	if (tp != NULL)
		INP_WUNLOCK(inp);
}

/*
 * Call tcp_drop() from a callout context.
 */
static void
tcp_timer_drop(struct tcpcb *tp)
{
	struct epoch_tracker et;
	struct inpcb *inp = tptoinpcb(tp);

	INP_WLOCK_ASSERT(inp);

	NET_EPOCH_ENTER(et);
	tp = tcp_drop(tp, ETIMEDOUT);
	NET_EPOCH_EXIT(et);
	if (tp != NULL)
		INP_WUNLOCK(inp);
}

/*
 * 2MSL timer: expires TIME_WAIT connections and reaps idle FIN_WAIT_2
 * connections.
 */
void
tcp_timer_2msl(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp = tptoinpcb(tp);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif

	INP_WLOCK(inp);
	CURVNET_SET(inp->inp_vnet);

	tcp_log_end_status(tp, TCP_EI_STATUS_2MSL);
	tcp_free_sackholes(tp);
	if (callout_pending(&tp->t_timers->tt_2msl) ||
	    !callout_active(&tp->t_timers->tt_2msl)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_2msl);
	if (inp->inp_flags & INP_DROPPED) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	/*
	 * The 2 MSL timeout in shutdown went off.  If we're closed but
	 * still waiting for the peer to close and the connection has been
	 * idle too long, delete the connection control block.  Otherwise,
	 * check again in a bit.
	 *
	 * If fast recycling of FIN_WAIT_2 connections is enabled, we are
	 * in FIN_WAIT_2, and the receiver has closed, there's no point in
	 * hanging onto this socket; just close it, ignoring the fact that
	 * there were recent incoming segments.
	 *
	 * XXXGL: check if inp_socket shall always be !NULL here?
	 */
	if (tp->t_state == TCPS_TIME_WAIT) {
		tcp_timer_close(tp);
		CURVNET_RESTORE();
		return;
	} else if (tp->t_state == TCPS_FIN_WAIT_2 &&
	    tcp_fast_finwait2_recycle && inp->inp_socket &&
	    (inp->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
		TCPSTAT_INC(tcps_finwait2_drops);
		tcp_timer_close(tp);
		CURVNET_RESTORE();
		return;
	} else {
		if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
			callout_reset(&tp->t_timers->tt_2msl,
			    TP_KEEPINTVL(tp), tcp_timer_2msl, tp);
		} else {
			tcp_timer_close(tp);
			CURVNET_RESTORE();
			return;
		}
	}

#ifdef TCPDEBUG
	if (tptosocket(tp)->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);

	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
}

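/*
 * Keepalive timer: probe an idle peer, or drop the connection once it
 * has been idle for TP_KEEPIDLE + TP_MAXIDLE ticks.
 */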
void
tcp_timer_keep(void *xtp)
{
	struct epoch_tracker et;
	struct tcpcb *tp = xtp;
	struct inpcb *inp = tptoinpcb(tp);
	struct tcptemp *t_template;
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif

	INP_WLOCK(inp);
	CURVNET_SET(inp->inp_vnet);

	if (callout_pending(&tp->t_timers->tt_keep) ||
	    !callout_active(&tp->t_timers->tt_keep)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_keep);
	if (inp->inp_flags & INP_DROPPED) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));

	/*
	 * Because we don't regularly reset the keepalive callout in
	 * the ESTABLISHED state, it may be that we don't actually need
	 * to send a keepalive yet.  If that occurs, schedule another
	 * call for the next time the keepalive timer might expire.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state)) {
		u_int idletime;

		idletime = ticks - tp->t_rcvtime;
		if (idletime < TP_KEEPIDLE(tp)) {
			callout_reset(&tp->t_timers->tt_keep,
			    TP_KEEPIDLE(tp) - idletime, tcp_timer_keep, tp);
			INP_WUNLOCK(inp);
			CURVNET_RESTORE();
			return;
		}
	}

	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	TCPSTAT_INC(tcps_keeptimeo);
	if (tp->t_state < TCPS_ESTABLISHED)
		goto dropit;
	if ((V_tcp_always_keepalive ||
	    inp->inp_socket->so_options & SO_KEEPALIVE) &&
	    tp->t_state <= TCPS_CLOSING) {
		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
			goto dropit;
		/*
		 * Send a packet designed to force a response
		 * if the peer is up and reachable:
		 * either an ACK if the connection is still alive,
		 * or an RST if the peer has closed the connection
		 * due to timeout or reboot.
		 * Using sequence number tp->snd_una-1
		 * causes the transmitted zero-length segment
		 * to lie outside the receive window;
		 * by the protocol spec, this requires the
		 * correspondent TCP to respond.
		 */
		TCPSTAT_INC(tcps_keepprobe);
		t_template = tcpip_maketemplate(inp);
		if (t_template) {
			NET_EPOCH_ENTER(et);
			tcp_respond(tp, t_template->tt_ipgen,
			    &t_template->tt_t, (struct mbuf *)NULL,
			    tp->rcv_nxt, tp->snd_una - 1, 0);
			NET_EPOCH_EXIT(et);
			free(t_template, M_TEMP);
		}
		callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
		    tcp_timer_keep, tp);
	} else
		callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
		    tcp_timer_keep, tp);

#ifdef TCPDEBUG
	if (inp->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
	return;

dropit:
	TCPSTAT_INC(tcps_keepdrops);
	NET_EPOCH_ENTER(et);
	tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
	tp = tcp_drop(tp, ETIMEDOUT);

#ifdef TCPDEBUG
	if (tp != NULL && (tptosocket(tp)->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	NET_EPOCH_EXIT(et);
	if (tp != NULL)
		INP_WUNLOCK(inp);
	CURVNET_RESTORE();
}

/*
 * Has this session exceeded the maximum time without seeing a substantive
 * acknowledgement?  If so, return true; otherwise false.
 */
static bool
tcp_maxunacktime_check(struct tcpcb *tp)
{

	/* Are we tracking this timer for this session? */
	if (TP_MAXUNACKTIME(tp) == 0)
		return false;

	/* Do we have a current measurement? */
	if (tp->t_acktime == 0)
		return false;

	/* Are we within the acceptable range? */
	if (TSTMP_GT(TP_MAXUNACKTIME(tp) + tp->t_acktime, (u_int)ticks))
		return false;

	/* We exceeded the timer. */
	TCPSTAT_INC(tcps_progdrops);
	return true;
}

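/*
 * Persist timer: the send window is zero; periodically force a window
 * probe, and drop the connection if the peer stays unresponsive.
 */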
void
tcp_timer_persist(void *xtp)
{
	struct epoch_tracker et;
	struct tcpcb *tp = xtp;
	struct inpcb *inp = tptoinpcb(tp);
	bool progdrop;
	int outrv;
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif

	INP_WLOCK(inp);
	CURVNET_SET(inp->inp_vnet);

	if (callout_pending(&tp->t_timers->tt_persist) ||
	    !callout_active(&tp->t_timers->tt_persist)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_persist);
	if (inp->inp_flags & INP_DROPPED) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	/*
	 * Persistence timer into zero window.
	 * Force a byte to be output, if possible.
	 */
	TCPSTAT_INC(tcps_persisttimeo);
	/*
	 * Hack: if the peer is dead/unreachable, we do not
	 * time out if the window is closed.  After a full
	 * backoff, drop the connection if the idle time
	 * (no responses to probes) reaches the maximum
	 * backoff that we would use if retransmitting.
	 * Also, drop the connection if we haven't been making
	 * progress.
	 */
	progdrop = tcp_maxunacktime_check(tp);
	if (progdrop || (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
	    ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff))) {
		if (!progdrop)
			TCPSTAT_INC(tcps_persistdrop);
		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
		tcp_timer_drop(tp);
		CURVNET_RESTORE();
		return;
	}
	/*
	 * If the user has closed the socket then drop a persisting
	 * connection after a much reduced timeout.
	 */
	if (tp->t_state > TCPS_CLOSE_WAIT &&
	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
		TCPSTAT_INC(tcps_persistdrop);
		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
		tcp_timer_drop(tp);
		CURVNET_RESTORE();
		return;
	}
	tcp_setpersist(tp);
	tp->t_flags |= TF_FORCEDATA;
	NET_EPOCH_ENTER(et);
	outrv = tcp_output_nodrop(tp);
	tp->t_flags &= ~TF_FORCEDATA;

#ifdef TCPDEBUG
	if (tp != NULL && tptosocket(tp)->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	(void) tcp_unlock_or_drop(tp, outrv);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
}

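/*
 * Retransmission timer: back off the retransmit interval, resend the
 * oldest unacknowledged segment, and run PMTU black-hole detection.
 */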
void
tcp_timer_rexmt(void *xtp)
{
	struct epoch_tracker et;
	struct tcpcb *tp = xtp;
	struct inpcb *inp = tptoinpcb(tp);
	int rexmt, outrv;
	bool isipv6;
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif

	INP_WLOCK(inp);
	CURVNET_SET(inp->inp_vnet);

	if (callout_pending(&tp->t_timers->tt_rexmt) ||
	    !callout_active(&tp->t_timers->tt_rexmt)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_rexmt);
	if (inp->inp_flags & INP_DROPPED) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	tcp_free_sackholes(tp);
	TCP_LOG_EVENT(tp, NULL, NULL, NULL, TCP_LOG_RTO, 0, 0, NULL, false);
	if (tp->t_fb->tfb_tcp_rexmit_tmr) {
		/* The stack has a timer action too. */
		(*tp->t_fb->tfb_tcp_rexmit_tmr)(tp);
	}
	/*
	 * Retransmission timer went off.  The message has not been
	 * acked within the retransmit interval.  Back off to a longer
	 * retransmit interval and retransmit one segment.
	 *
	 * If we've either exceeded the maximum number of retransmissions,
	 * or we've gone long enough without making progress, then drop
	 * the session.
	 */
	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT || tcp_maxunacktime_check(tp)) {
		if (tp->t_rxtshift > TCP_MAXRXTSHIFT)
			TCPSTAT_INC(tcps_timeoutdrop);
		tp->t_rxtshift = TCP_MAXRXTSHIFT;
		tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
		tcp_timer_drop(tp);
		CURVNET_RESTORE();
		return;
	}
	if (tp->t_state == TCPS_SYN_SENT) {
		/*
		 * If the SYN was retransmitted, indicate CWND to be
		 * limited to 1 segment in cc_conn_init().
		 */
		tp->snd_cwnd = 1;
	} else if (tp->t_rxtshift == 1) {
		/*
		 * First retransmit; record ssthresh and cwnd so they can
		 * be recovered if this turns out to be a "bad" retransmit.
		 * A retransmit is considered "bad" if an ACK for this
		 * segment is received within RTT/2 interval; the assumption
		 * here is that the ACK was already in flight.  See
		 * "On Estimating End-to-End Network Path Properties" by
		 * Allman and Paxson for more details.
		 */
		tp->snd_cwnd_prev = tp->snd_cwnd;
		tp->snd_ssthresh_prev = tp->snd_ssthresh;
		tp->snd_recover_prev = tp->snd_recover;
		if (IN_FASTRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASFRECOVERY;
		else
			tp->t_flags &= ~TF_WASFRECOVERY;
		if (IN_CONGRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASCRECOVERY;
		else
			tp->t_flags &= ~TF_WASCRECOVERY;
		if ((tp->t_flags & TF_RCVD_TSTMP) == 0)
			tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
		/*
		 * If we've negotiated timestamps, badrxtwin will be set
		 * to the value that tcp_output() assigns to the
		 * retransmitted packet's to_tsval.
		 */
		tp->t_flags |= TF_PREVVALID;
	} else
		tp->t_flags &= ~TF_PREVVALID;
	TCPSTAT_INC(tcps_rexmttimeo);
	if ((tp->t_state == TCPS_SYN_SENT) ||
	    (tp->t_state == TCPS_SYN_RECEIVED))
		rexmt = tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift];
	else
		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
	TCPT_RANGESET(tp->t_rxtcur, rexmt,
	    tp->t_rttmin, TCPTV_REXMTMAX);

	/*
	 * We enter the PLMTUD path if the connection is in the ESTABLISHED
	 * or FIN_WAIT_1 state.  The latter matters because, if the amount
	 * of data we send is very small, we could send it in a couple of
	 * packets and proceed straight to FIN, never passing through the
	 * ESTABLISHED state.
	 */
#ifdef INET6
	isipv6 = (inp->inp_vflag & INP_IPV6) ? true : false;
#else
	isipv6 = false;
#endif
	if (((V_tcp_pmtud_blackhole_detect == 1) ||
	    (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) ||
	    (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) &&
	    ((tp->t_state == TCPS_ESTABLISHED) ||
	    (tp->t_state == TCPS_FIN_WAIT_1))) {
		if (tp->t_rxtshift == 1) {
			/*
			 * We enter blackhole detection after the first
			 * unsuccessful timer-based retransmission.  We then
			 * reduce the MSS up to two times; each candidate
			 * value gets two retransmission tries, but only if
			 * it actually lowers the MSS.
			 */
			tp->t_blackhole_enter = 2;
			tp->t_blackhole_exit = tp->t_blackhole_enter;
			if (isipv6) {
#ifdef INET6
				if (tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss)
					tp->t_blackhole_exit += 2;
				if (tp->t_maxseg > V_tcp_v6mssdflt &&
				    V_tcp_v6pmtud_blackhole_mss > V_tcp_v6mssdflt)
					tp->t_blackhole_exit += 2;
#endif
			} else {
#ifdef INET
				if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss)
					tp->t_blackhole_exit += 2;
				if (tp->t_maxseg > V_tcp_mssdflt &&
				    V_tcp_pmtud_blackhole_mss > V_tcp_mssdflt)
					tp->t_blackhole_exit += 2;
#endif
			}
		}
		if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
		    (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
		    (tp->t_rxtshift >= tp->t_blackhole_enter &&
		    tp->t_rxtshift < tp->t_blackhole_exit &&
		    (tp->t_rxtshift - tp->t_blackhole_enter) % 2 == 0)) {
			/*
			 * Enter Path MTU Black-hole Detection mechanism:
			 * - Disable Path MTU Discovery (IP "DF" bit).
			 * - Reduce MTU to lower value than what we
			 *   negotiated with peer.
			 */
			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
				/* Record that we may have found a black hole. */
				tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
				/* Keep track of previous MSS. */
				tp->t_pmtud_saved_maxseg = tp->t_maxseg;
			}

			/*
			 * Reduce the MSS to blackhole value or to the default
			 * in an attempt to retransmit.
			 */
#ifdef INET6
			if (isipv6 &&
			    tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss &&
			    V_tcp_v6pmtud_blackhole_mss > V_tcp_v6mssdflt) {
				/* Use the sysctl tuneable blackhole MSS. */
				tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated);
			} else if (isipv6) {
				/* Use the default MSS. */
				tp->t_maxseg = V_tcp_v6mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch to
				 * minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
			}
#endif
#if defined(INET6) && defined(INET)
			else
#endif
#ifdef INET
			if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss &&
			    V_tcp_pmtud_blackhole_mss > V_tcp_mssdflt) {
				/* Use the sysctl tuneable blackhole MSS. */
				tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated);
			} else {
				/* Use the default MSS. */
				tp->t_maxseg = V_tcp_mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch to
				 * minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
			}
#endif
			/*
			 * Reset the slow-start flight size
			 * as it may depend on the new MSS.
			 */
			if (CC_ALGO(tp)->conn_init != NULL)
				CC_ALGO(tp)->conn_init(tp->ccv);
		} else {
			/*
			 * If further retransmissions are still unsuccessful
			 * with a lowered MTU, maybe this isn't a blackhole,
			 * so we restore the previous MSS and blackhole
			 * detection flags.
			 */
			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
			    (tp->t_rxtshift >= tp->t_blackhole_exit)) {
				tp->t_flags2 |= TF2_PLPMTU_PMTUD;
				tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
				tp->t_maxseg = tp->t_pmtud_saved_maxseg;
				TCPSTAT_INC(tcps_pmtud_blackhole_failed);
				/*
				 * Reset the slow-start flight size as it
				 * may depend on the new MSS.
				 */
				if (CC_ALGO(tp)->conn_init != NULL)
					CC_ALGO(tp)->conn_init(tp->ccv);
			}
		}
	}

	/*
	 * Disable RFC1323 and SACK if we haven't got any response to
	 * our third SYN to work around some broken terminal servers
	 * (most of which have hopefully been retired) that have bad VJ
	 * header compression code which trashes TCP segments containing
	 * unknown-to-them TCP options.
	 */
	if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
	    (tp->t_rxtshift == 3))
		tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
	/*
	 * If we backed off this far, notify the L3 protocol that we're having
	 * connection problems.
	 */
	if (tp->t_rxtshift > TCP_RTT_INVALIDATE) {
#ifdef INET6
		if ((inp->inp_vflag & INP_IPV6) != 0)
			in6_losing(inp);
		else
#endif
			in_losing(inp);
	}
	tp->snd_nxt = tp->snd_una;
	tp->snd_recover = tp->snd_max;
	/*
	 * Force a segment to be sent.
	 */
	tp->t_flags |= TF_ACKNOW;
	/*
	 * If timing a segment in this window, stop the timer.
	 */
	tp->t_rtttime = 0;

	cc_cong_signal(tp, NULL, CC_RTO);
	NET_EPOCH_ENTER(et);
	outrv = tcp_output_nodrop(tp);
#ifdef TCPDEBUG
	if (tp != NULL && (tptosocket(tp)->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	(void) tcp_unlock_or_drop(tp, outrv);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
}

/*
 * Arm (delta != 0) or disarm (delta == 0) the given timer, dispatching
 * to the stack-specific handler for timer types the base stack doesn't
 * know about.
 */
void
tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
{
	struct callout *t_callout;
	callout_func_t *f_callout;
	struct inpcb *inp = tptoinpcb(tp);
	int cpu = inp_to_cpuid(inp);

#ifdef TCP_OFFLOAD
	if (tp->t_flags & TF_TOE)
		return;
#endif

	if (tp->t_timers->tt_flags & TT_STOPPED)
		return;

	switch (timer_type) {
	case TT_DELACK:
		t_callout = &tp->t_timers->tt_delack;
		f_callout = tcp_timer_delack;
		break;
	case TT_REXMT:
		t_callout = &tp->t_timers->tt_rexmt;
		f_callout = tcp_timer_rexmt;
		break;
	case TT_PERSIST:
		t_callout = &tp->t_timers->tt_persist;
		f_callout = tcp_timer_persist;
		break;
	case TT_KEEP:
		t_callout = &tp->t_timers->tt_keep;
		f_callout = tcp_timer_keep;
		break;
	case TT_2MSL:
		t_callout = &tp->t_timers->tt_2msl;
		f_callout = tcp_timer_2msl;
		break;
	default:
		if (tp->t_fb->tfb_tcp_timer_activate) {
			tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta);
			return;
		}
		panic("tp %p bad timer_type %#x", tp, timer_type);
	}
	if (delta == 0) {
		callout_stop(t_callout);
	} else {
		callout_reset_on(t_callout, delta, f_callout, tp, cpu);
	}
}

/*
 * Report whether the given timer is currently armed.
 */
int
tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;

	switch (timer_type) {
	case TT_DELACK:
		t_callout = &tp->t_timers->tt_delack;
		break;
	case TT_REXMT:
		t_callout = &tp->t_timers->tt_rexmt;
		break;
	case TT_PERSIST:
		t_callout = &tp->t_timers->tt_persist;
		break;
	case TT_KEEP:
		t_callout = &tp->t_timers->tt_keep;
		break;
	case TT_2MSL:
		t_callout = &tp->t_timers->tt_2msl;
		break;
	default:
		if (tp->t_fb->tfb_tcp_timer_active) {
			return (tp->t_fb->tfb_tcp_timer_active(tp, timer_type));
		}
		panic("tp %p bad timer_type %#x", tp, timer_type);
	}
	return (callout_active(t_callout));
}

/*
 * Stop the timer from running and set a flag in tt_flags that forces the
 * timer never to run again.  The flag is needed to ensure that a race
 * does not leave the callout running and allow the timer to restart
 * itself (the keepalive and persist timers especially do this).
 */
int
tcp_timer_suspend(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;
	uint32_t t_flags;

	switch (timer_type) {
	case TT_DELACK:
		t_flags = TT_DELACK_SUS;
		t_callout = &tp->t_timers->tt_delack;
		break;
	case TT_REXMT:
		t_flags = TT_REXMT_SUS;
		t_callout = &tp->t_timers->tt_rexmt;
		break;
	case TT_PERSIST:
		t_flags = TT_PERSIST_SUS;
		t_callout = &tp->t_timers->tt_persist;
		break;
	case TT_KEEP:
		t_flags = TT_KEEP_SUS;
		t_callout = &tp->t_timers->tt_keep;
		break;
	case TT_2MSL:
		t_flags = TT_2MSL_SUS;
		t_callout = &tp->t_timers->tt_2msl;
		break;
	default:
		panic("tp:%p bad timer_type 0x%x", tp, timer_type);
	}
	tp->t_timers->tt_flags |= t_flags;
	return (callout_stop(t_callout));
}

/*
 * Clear the suspension flag for the given timer and re-arm it if the
 * connection state calls for it.
 */
void
tcp_timers_unsuspend(struct tcpcb *tp, uint32_t timer_type)
{
	switch (timer_type) {
	case TT_DELACK:
		if (tp->t_timers->tt_flags & TT_DELACK_SUS) {
			tp->t_timers->tt_flags &= ~TT_DELACK_SUS;
			if (tp->t_flags & TF_DELACK) {
				/* A delayed ACK is pending; activate the timer. */
				tp->t_flags &= ~TF_DELACK;
				tcp_timer_activate(tp, TT_DELACK,
				    tcp_delacktime);
			}
		}
		break;
	case TT_REXMT:
		if (tp->t_timers->tt_flags & TT_REXMT_SUS) {
			tp->t_timers->tt_flags &= ~TT_REXMT_SUS;
			if (SEQ_GT(tp->snd_max, tp->snd_una) &&
			    (tcp_timer_active((tp), TT_PERSIST) == 0) &&
			    tp->snd_wnd) {
				/* We have outstanding data; activate the timer. */
				tcp_timer_activate(tp, TT_REXMT,
				    tp->t_rxtcur);
			}
		}
		break;
	case TT_PERSIST:
		if (tp->t_timers->tt_flags & TT_PERSIST_SUS) {
			tp->t_timers->tt_flags &= ~TT_PERSIST_SUS;
			if (tp->snd_wnd == 0) {
				/* Activate the persist timer. */
				tp->t_rxtshift = 0;
				tcp_setpersist(tp);
			}
		}
		break;
	case TT_KEEP:
		if (tp->t_timers->tt_flags & TT_KEEP_SUS) {
			tp->t_timers->tt_flags &= ~TT_KEEP_SUS;
			tcp_timer_activate(tp, TT_KEEP,
			    TCPS_HAVEESTABLISHED(tp->t_state) ?
			    TP_KEEPIDLE(tp) : TP_KEEPINIT(tp));
		}
		break;
	case TT_2MSL:
		if (tp->t_timers->tt_flags & TT_2MSL_SUS) {
			struct socket *so = tptosocket(tp);

			tp->t_timers->tt_flags &= ~TT_2MSL_SUS;
			if ((tp->t_state == TCPS_FIN_WAIT_2) &&
			    (so == NULL ||	/* XXXGL: needed? */
			    (so->so_rcv.sb_state & SBS_CANTRCVMORE))) {
				/* Start the 2MSL timer. */
				tcp_timer_activate(tp, TT_2MSL,
				    (tcp_fast_finwait2_recycle) ?
				    tcp_finwait2_timeout : TP_MAXIDLE(tp));
			}
		}
		break;
	default:
		panic("tp:%p bad timer_type 0x%x", tp, timer_type);
	}
}

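/*
 * Callout drain handler for tcp_timer_stop(): frees the tcpcb once the
 * last pending timer has drained.
 */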
static void
tcp_timer_discard(void *ptp)
{
	struct epoch_tracker et;
	struct tcpcb *tp = (struct tcpcb *)ptp;
	struct inpcb *inp = tptoinpcb(tp);

	INP_WLOCK(inp);
	CURVNET_SET(inp->inp_vnet);
	NET_EPOCH_ENTER(et);

	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) != 0,
	    ("%s: tcpcb has to be stopped here", __func__));
	if (--tp->t_timers->tt_draincnt > 0 ||
	    tcp_freecb(tp) == false)
		INP_WUNLOCK(inp);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
}

/*
 * Mark the connection's timers as stopped and stop the given timer.
 */
void
tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;

	tp->t_timers->tt_flags |= TT_STOPPED;
	switch (timer_type) {
	case TT_DELACK:
		t_callout = &tp->t_timers->tt_delack;
		break;
	case TT_REXMT:
		t_callout = &tp->t_timers->tt_rexmt;
		break;
	case TT_PERSIST:
		t_callout = &tp->t_timers->tt_persist;
		break;
	case TT_KEEP:
		t_callout = &tp->t_timers->tt_keep;
		break;
	case TT_2MSL:
		t_callout = &tp->t_timers->tt_2msl;
		break;
	default:
		if (tp->t_fb->tfb_tcp_timer_stop) {
			/*
			 * XXXrrs we need to look at this with the
			 * stop case below (flags).
			 */
			tp->t_fb->tfb_tcp_timer_stop(tp, timer_type);
			return;
		}
		panic("tp %p bad timer_type %#x", tp, timer_type);
	}

	if (callout_async_drain(t_callout, tcp_timer_discard) == 0) {
		/*
		 * Can't stop the callout; defer the actual deletion of
		 * the tcpcb to the last timer to drain.  We do this using
		 * the async drain function and incrementing the count in
		 * tt_draincnt.
		 */
		tp->t_timers->tt_draincnt++;
	}
}