/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_tcpdebug.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <net/if.h>
#include <net/route.h>
#include <net/rss_config.h>
#include <net/vnet.h>
#include <net/netisr.h>

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_rss.h>
#include <netinet/in_systm.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_seq.h>
#include <netinet/cc/cc.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>
#include <netinet/tcp_debug.h>
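/*
 * Added commentary, not from the original source: the interval sysctls
 * below are stored internally in ticks but are read and written in
 * milliseconds; sysctl_msec_to_ticks converts on the way in and out.
 * Illustrative arithmetic: with hz = 1000, setting net.inet.tcp.keepidle
 * to 7200000 (two hours in ms) stores 7200000 ticks, i.e. 7200 seconds.
 */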
connection"); 99 100 int tcp_keepidle; 101 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, 102 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 103 &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", 104 "time before keepalive probes begin"); 105 106 int tcp_keepintvl; 107 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, 108 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 109 &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", 110 "time between keepalive probes"); 111 112 int tcp_delacktime; 113 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, 114 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 115 &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", 116 "Time before a delayed ACK is sent"); 117 118 VNET_DEFINE(int, tcp_msl); 119 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, 120 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_VNET, 121 &VNET_NAME(tcp_msl), 0, sysctl_msec_to_ticks, "I", 122 "Maximum segment lifetime"); 123 124 int tcp_rexmit_initial; 125 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_initial, 126 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 127 &tcp_rexmit_initial, 0, sysctl_msec_to_ticks, "I", 128 "Initial Retransmission Timeout"); 129 130 int tcp_rexmit_min; 131 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, 132 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 133 &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", 134 "Minimum Retransmission Timeout"); 135 136 int tcp_rexmit_slop; 137 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, 138 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 139 &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", 140 "Retransmission Timer Slop"); 141 142 VNET_DEFINE(int, tcp_always_keepalive) = 1; 143 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_VNET|CTLFLAG_RW, 144 &VNET_NAME(tcp_always_keepalive) , 0, 145 "Assume SO_KEEPALIVE on all TCP connections"); 146 147 int tcp_fast_finwait2_recycle = 0; 148 SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, 149 &tcp_fast_finwait2_recycle, 0, 150 "Recycle closed FIN_WAIT_2 connections faster"); 151 152 int tcp_finwait2_timeout; 153 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, 154 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 155 &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", 156 "FIN-WAIT2 timeout"); 157 158 int tcp_keepcnt = TCPTV_KEEPCNT; 159 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0, 160 "Number of keepalive probes to send"); 161 162 /* max idle probes */ 163 int tcp_maxpersistidle; 164 165 int tcp_rexmit_drop_options = 0; 166 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW, 167 &tcp_rexmit_drop_options, 0, 168 "Drop TCP options from 3rd and later retransmitted SYN"); 169 170 VNET_DEFINE(int, tcp_pmtud_blackhole_detect); 171 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, 172 CTLFLAG_RW|CTLFLAG_VNET, 173 &VNET_NAME(tcp_pmtud_blackhole_detect), 0, 174 "Path MTU Discovery Black Hole Detection Enabled"); 175 176 #ifdef INET 177 VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200; 178 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, 179 CTLFLAG_RW|CTLFLAG_VNET, 180 &VNET_NAME(tcp_pmtud_blackhole_mss), 0, 181 "Path MTU Discovery Black Hole Detection lowered MSS"); 182 #endif 183 184 #ifdef INET6 185 VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220; 186 SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss, 187 CTLFLAG_RW|CTLFLAG_VNET, 188 &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0, 189 "Path MTU Discovery IPv6 Black Hole Detection lowered MSS"); 190 #endif 191 192 #ifdef RSS 193 static int per_cpu_timers = 1; 194 
#ifdef RSS
static int	per_cpu_timers = 1;
#else
static int	per_cpu_timers = 0;
#endif
SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
    &per_cpu_timers, 0, "run tcp timers on all cpus");

/*
 * Map the given inp to a CPU id.
 *
 * This queries RSS if it's compiled in, else it defaults to the current
 * CPU ID.
 */
inline int
inp_to_cpuid(struct inpcb *inp)
{
	u_int cpuid;

	if (per_cpu_timers) {
#ifdef RSS
		cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
		if (cpuid == NETISR_CPUID_NONE)
			return (curcpu);	/* XXX */
		else
			return (cpuid);
#endif
		/*
		 * We don't have a flowid -> cpuid mapping, so cheat and
		 * just map unknown cpuids to curcpu.  Not the best, but
		 * apparently better than defaulting to swi 0.
		 */
		cpuid = inp->inp_flowid % (mp_maxid + 1);
		if (!CPU_ABSENT(cpuid))
			return (cpuid);
		return (curcpu);
	} else {
		return (0);
	}
}

/*
 * Legacy TCP global callout routine called every 500 ms.
 * Used to clean up timewait states, which lack their own callouts.
 */
static struct callout tcpslow_callout;
static void
tcp_slowtimo(void *arg __unused)
{
	struct epoch_tracker et;
	VNET_ITERATOR_DECL(vnet_iter);

	NET_EPOCH_ENTER(et);
	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		(void) tcp_tw_2msl_scan(0);
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
	NET_EPOCH_EXIT(et);

	callout_reset_sbt(&tcpslow_callout, SBT_1MS * 500, SBT_1MS * 10,
	    tcp_slowtimo, NULL, 0);
}

static void
tcp_slowtimo_init(void *arg __unused)
{

	callout_init(&tcpslow_callout, 1);
	callout_reset_sbt(&tcpslow_callout, SBT_1MS * 500, SBT_1MS * 10,
	    tcp_slowtimo, NULL, 0);
}
SYSINIT(tcp_timer, SI_SUB_VNET_DONE, SI_ORDER_ANY, tcp_slowtimo_init, NULL);

int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };

int	tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */
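/*
 * Sanity check on the constant above (added commentary, not from the
 * original source): the first nine entries of tcp_backoff[] sum to
 * 2^9 - 1 = 511, and the four trailing 512s add 2048, giving
 * 511 + 2048 = 2559, which is exactly tcp_totbackoff.
 */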
/*
 * TCP timer processing.
 */
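/*
 * A note on the idiom shared by the handlers below (added commentary,
 * not from the original source): a handler that finds its callout
 * either pending (rescheduled after this invocation was already queued)
 * or no longer active (stopped while this invocation waited for the
 * inpcb lock) must return without acting, since in either case the
 * invocation no longer corresponds to a live expiry.  Only when neither
 * holds does callout_deactivate() commit this invocation as the real
 * one.
 */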
void
tcp_timer_delack(void *xtp)
{
	struct epoch_tracker et;
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);

	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_delack) ||
	    !callout_active(&tp->t_timers->tt_delack)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_delack);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	tp->t_flags |= TF_ACKNOW;
	TCPSTAT_INC(tcps_delack);
	NET_EPOCH_ENTER(et);
	(void) tcp_output_unlock(tp);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
}

void
tcp_inpinfo_lock_del(struct inpcb *inp, struct tcpcb *tp)
{
	if (inp != NULL && tp != NULL)
		INP_WUNLOCK(inp);
}

void
tcp_timer_2msl(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	struct epoch_tracker et;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	tcp_free_sackholes(tp);
	if (callout_pending(&tp->t_timers->tt_2msl) ||
	    !callout_active(&tp->t_timers->tt_2msl)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_2msl);
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	/*
	 * The 2 MSL timeout in shutdown went off.  If we're closed but
	 * still waiting for the peer to close and the connection has been
	 * idle too long, delete the connection control block.  Otherwise,
	 * check again in a bit.
	 *
	 * If fast recycling of FIN_WAIT_2 connections is enabled, we are
	 * in FIN_WAIT_2, and the receiver has closed, there's no point in
	 * hanging onto the FIN_WAIT_2 socket.  Just close it, ignoring the
	 * fact that there were recent incoming segments.
	 */
	if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 &&
	    tp->t_inpcb && tp->t_inpcb->inp_socket &&
	    (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
		TCPSTAT_INC(tcps_finwait2_drops);
		NET_EPOCH_ENTER(et);
		tp = tcp_close(tp);
		NET_EPOCH_EXIT(et);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	} else {
		if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
			callout_reset(&tp->t_timers->tt_2msl,
			    TP_KEEPINTVL(tp), tcp_timer_2msl, tp);
		} else {
			NET_EPOCH_ENTER(et);
			tp = tcp_close(tp);
			NET_EPOCH_EXIT(et);
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
	}

#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);

	if (tp != NULL)
		INP_WUNLOCK(inp);
out:
	CURVNET_RESTORE();
}

void
tcp_timer_keep(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct tcptemp *t_template;
	struct inpcb *inp;
	struct epoch_tracker et;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_keep) ||
	    !callout_active(&tp->t_timers->tt_keep)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_keep);
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));

	/*
	 * Because we don't regularly reset the keepalive callout in
	 * the ESTABLISHED state, it may be that we don't actually need
	 * to send a keepalive yet.  If that occurs, schedule another
	 * call for the next time the keepalive timer might expire.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state)) {
		u_int idletime;

		idletime = ticks - tp->t_rcvtime;
		if (idletime < TP_KEEPIDLE(tp)) {
			callout_reset(&tp->t_timers->tt_keep,
			    TP_KEEPIDLE(tp) - idletime, tcp_timer_keep, tp);
			INP_WUNLOCK(inp);
			CURVNET_RESTORE();
			return;
		}
	}

	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	TCPSTAT_INC(tcps_keeptimeo);
	if (tp->t_state < TCPS_ESTABLISHED)
		goto dropit;
	if ((V_tcp_always_keepalive ||
	    inp->inp_socket->so_options & SO_KEEPALIVE) &&
	    tp->t_state <= TCPS_CLOSING) {
		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
			goto dropit;
		/*
		 * Send a packet designed to force a response
		 * if the peer is up and reachable:
		 * either an ACK if the connection is still alive,
		 * or an RST if the peer has closed the connection
		 * due to timeout or reboot.
		 * Using sequence number tp->snd_una-1
		 * causes the transmitted zero-length segment
		 * to lie outside the receive window;
		 * by the protocol spec, this requires the
		 * correspondent TCP to respond.
		 */
		TCPSTAT_INC(tcps_keepprobe);
		t_template = tcpip_maketemplate(inp);
		if (t_template) {
			NET_EPOCH_ENTER(et);
			tcp_respond(tp, t_template->tt_ipgen,
			    &t_template->tt_t, (struct mbuf *)NULL,
			    tp->rcv_nxt, tp->snd_una - 1, 0);
			NET_EPOCH_EXIT(et);
			free(t_template, M_TEMP);
		}
		callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
		    tcp_timer_keep, tp);
	} else
		callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
		    tcp_timer_keep, tp);

#ifdef TCPDEBUG
	if (inp->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
	return;

dropit:
	TCPSTAT_INC(tcps_keepdrops);
	NET_EPOCH_ENTER(et);
	tp = tcp_drop(tp, ETIMEDOUT);

#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	NET_EPOCH_EXIT(et);
	tcp_inpinfo_lock_del(inp, tp);
	CURVNET_RESTORE();
}
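/*
 * Worked example for the keepalive schedule above (added commentary,
 * not from the original source), assuming the stock defaults of
 * keepidle = 2 h, keepintvl = 75 s and keepcnt = 8: probing starts
 * after 7200 s of idle time, TP_MAXIDLE() is 75 * 8 = 600 s of
 * unanswered probes, so an unreachable peer is dropped roughly
 * 7200 + 600 = 7800 s after the last segment was received.
 */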
void
tcp_timer_persist(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	struct epoch_tracker et;
	int outrv;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_persist) ||
	    !callout_active(&tp->t_timers->tt_persist)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_persist);
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	/*
	 * Persistence timer into zero window.
	 * Force a byte to be output, if possible.
	 */
	TCPSTAT_INC(tcps_persisttimeo);
	/*
	 * Hack: if the peer is dead/unreachable, we do not
	 * time out if the window is closed.  After a full
	 * backoff, drop the connection if the idle time
	 * (no responses to probes) reaches the maximum
	 * backoff that we would use if retransmitting.
	 */
	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
	    ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
		TCPSTAT_INC(tcps_persistdrop);
		NET_EPOCH_ENTER(et);
		tp = tcp_drop(tp, ETIMEDOUT);
		NET_EPOCH_EXIT(et);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	/*
	 * If the user has closed the socket then drop a persisting
	 * connection after a much reduced timeout.
	 */
	if (tp->t_state > TCPS_CLOSE_WAIT &&
	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
		TCPSTAT_INC(tcps_persistdrop);
		NET_EPOCH_ENTER(et);
		tp = tcp_drop(tp, ETIMEDOUT);
		NET_EPOCH_EXIT(et);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	tcp_setpersist(tp);
	tp->t_flags |= TF_FORCEDATA;
	NET_EPOCH_ENTER(et);
	outrv = tcp_output_nodrop(tp);
	tp->t_flags &= ~TF_FORCEDATA;

#ifdef TCPDEBUG
	if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	(void) tcp_unlock_or_drop(tp, outrv);
	NET_EPOCH_EXIT(et);
out:
	CURVNET_RESTORE();
}
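/*
 * Illustrative arithmetic for the first drop test above (added
 * commentary, not from the original source): with TCP_REXMTVAL(tp) at
 * 1 s, TCP_REXMTVAL(tp) * tcp_totbackoff is 2559 s, roughly the total
 * time a full exponential-backoff retransmit cycle would have taken,
 * so a probed-but-silent peer gets the same overall grace period as a
 * retransmitting one.
 */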
void
tcp_timer_rexmt(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	struct epoch_tracker et;
	int rexmt, outrv;
	bool isipv6;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_rexmt) ||
	    !callout_active(&tp->t_timers->tt_rexmt)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_rexmt);
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	tcp_free_sackholes(tp);
	TCP_LOG_EVENT(tp, NULL, NULL, NULL, TCP_LOG_RTO, 0, 0, NULL, false);
	if (tp->t_fb->tfb_tcp_rexmit_tmr) {
		/* The stack has a timer action too. */
		(*tp->t_fb->tfb_tcp_rexmit_tmr)(tp);
	}
	/*
	 * Retransmission timer went off.  Message has not
	 * been acked within retransmit interval.  Back off
	 * to a longer retransmit interval and retransmit one segment.
	 */
	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
		tp->t_rxtshift = TCP_MAXRXTSHIFT;
		TCPSTAT_INC(tcps_timeoutdrop);
		NET_EPOCH_ENTER(et);
		tp = tcp_drop(tp, ETIMEDOUT);
		NET_EPOCH_EXIT(et);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	if (tp->t_state == TCPS_SYN_SENT) {
		/*
		 * If the SYN was retransmitted, indicate CWND to be
		 * limited to 1 segment in cc_conn_init().
		 */
		tp->snd_cwnd = 1;
	} else if (tp->t_rxtshift == 1) {
		/*
		 * First retransmit; record ssthresh and cwnd so they can
		 * be recovered if this turns out to be a "bad" retransmit.
		 * A retransmit is considered "bad" if an ACK for this
		 * segment is received within RTT/2 interval; the assumption
		 * here is that the ACK was already in flight.  See
		 * "On Estimating End-to-End Network Path Properties" by
		 * Allman and Paxson for more details.
		 */
		tp->snd_cwnd_prev = tp->snd_cwnd;
		tp->snd_ssthresh_prev = tp->snd_ssthresh;
		tp->snd_recover_prev = tp->snd_recover;
		if (IN_FASTRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASFRECOVERY;
		else
			tp->t_flags &= ~TF_WASFRECOVERY;
		if (IN_CONGRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASCRECOVERY;
		else
			tp->t_flags &= ~TF_WASCRECOVERY;
		if ((tp->t_flags & TF_RCVD_TSTMP) == 0)
			tp->t_badrxtwin = ticks +
			    (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
		/*
		 * If timestamps were negotiated, t_badrxtwin is instead
		 * set by tcp_output() to the to_tsval carried by the
		 * retransmitted packet.
		 */
		tp->t_flags |= TF_PREVVALID;
	} else
		tp->t_flags &= ~TF_PREVVALID;
	TCPSTAT_INC(tcps_rexmttimeo);
	if ((tp->t_state == TCPS_SYN_SENT) ||
	    (tp->t_state == TCPS_SYN_RECEIVED))
		rexmt = tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift];
	else
		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
	TCPT_RANGESET(tp->t_rxtcur, rexmt,
	    tp->t_rttmin, TCPTV_REXMTMAX);
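	/*
	 * Illustrative backoff schedule (added commentary, not from the
	 * original source): with TCP_REXMTVAL(tp) at 1 s, successive
	 * timeouts wait roughly 1, 2, 4, ... seconds, doubling with each
	 * shift until tcp_backoff[] plateaus at 512; TCPT_RANGESET()
	 * then clamps every value into [t_rttmin, TCPTV_REXMTMAX].
	 */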

	/*
	 * We enter the path for PLMTUD if the connection is established
	 * or in the FIN_WAIT_1 state; the reason for the latter is that
	 * if the amount of data we send is very small, we could send it
	 * in a couple of packets and proceed straight to FIN, in which
	 * case we would never catch the connection in the ESTABLISHED
	 * state.
	 */
#ifdef INET6
	isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? true : false;
#else
	isipv6 = false;
#endif
	if (((V_tcp_pmtud_blackhole_detect == 1) ||
	    (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) ||
	    (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) &&
	    ((tp->t_state == TCPS_ESTABLISHED) ||
	    (tp->t_state == TCPS_FIN_WAIT_1))) {
		if (tp->t_rxtshift == 1) {
			/*
			 * We enter blackhole detection after the first
			 * unsuccessful timer-based retransmission.  We then
			 * reduce the MSS up to two times, giving each
			 * candidate value two retransmission tries; a
			 * candidate only gets its two tries if it actually
			 * lowers the MSS.
			 */
			tp->t_blackhole_enter = 2;
			tp->t_blackhole_exit = tp->t_blackhole_enter;
			if (isipv6) {
#ifdef INET6
				if (tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss)
					tp->t_blackhole_exit += 2;
				if (tp->t_maxseg > V_tcp_v6mssdflt &&
				    V_tcp_v6pmtud_blackhole_mss >
				    V_tcp_v6mssdflt)
					tp->t_blackhole_exit += 2;
#endif
			} else {
#ifdef INET
				if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss)
					tp->t_blackhole_exit += 2;
				if (tp->t_maxseg > V_tcp_mssdflt &&
				    V_tcp_pmtud_blackhole_mss > V_tcp_mssdflt)
					tp->t_blackhole_exit += 2;
#endif
			}
		}
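		/*
		 * Worked example of the window computed above (added
		 * commentary, not from the original source): for an IPv4
		 * connection with t_maxseg = 1460, blackhole MSS 1200 and
		 * default MSS 536, both tests fire and t_blackhole_exit
		 * becomes 6; MSS reduction is then attempted at shifts 2
		 * and 4, and the original settings are restored once the
		 * shift reaches 6.
		 */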
		if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
		    (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
		    (tp->t_rxtshift >= tp->t_blackhole_enter &&
		    tp->t_rxtshift < tp->t_blackhole_exit &&
		    (tp->t_rxtshift - tp->t_blackhole_enter) % 2 == 0)) {
			/*
			 * Enter Path MTU Black-hole Detection mechanism:
			 * - Disable Path MTU Discovery (IP "DF" bit).
			 * - Reduce MTU to lower value than what we
			 *   negotiated with peer.
			 */
			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
				/* Record that we may have found a black hole. */
				tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
				/* Keep track of previous MSS. */
				tp->t_pmtud_saved_maxseg = tp->t_maxseg;
			}

			/*
			 * Reduce the MSS to the blackhole value or to the
			 * default in an attempt to retransmit.
			 */
#ifdef INET6
			if (isipv6 &&
			    tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss &&
			    V_tcp_v6pmtud_blackhole_mss > V_tcp_v6mssdflt) {
				/* Use the sysctl tuneable blackhole MSS. */
				tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated);
			} else if (isipv6) {
				/* Use the default MSS. */
				tp->t_maxseg = V_tcp_v6mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch
				 * to minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
			}
#endif
#if defined(INET6) && defined(INET)
			else
#endif
#ifdef INET
			if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss &&
			    V_tcp_pmtud_blackhole_mss > V_tcp_mssdflt) {
				/* Use the sysctl tuneable blackhole MSS. */
				tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated);
			} else {
				/* Use the default MSS. */
				tp->t_maxseg = V_tcp_mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch
				 * to minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
			}
#endif
			/*
			 * Reset the slow-start flight size
			 * as it may depend on the new MSS.
			 */
			if (CC_ALGO(tp)->conn_init != NULL)
				CC_ALGO(tp)->conn_init(tp->ccv);
		} else {
			/*
			 * If further retransmissions are still unsuccessful
			 * with a lowered MTU, maybe this isn't a blackhole,
			 * so restore the previous MSS and the blackhole
			 * detection flags.
			 */
			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
			    (tp->t_rxtshift >= tp->t_blackhole_exit)) {
				tp->t_flags2 |= TF2_PLPMTU_PMTUD;
				tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
				tp->t_maxseg = tp->t_pmtud_saved_maxseg;
				TCPSTAT_INC(tcps_pmtud_blackhole_failed);
				/*
				 * Reset the slow-start flight size as it
				 * may depend on the new MSS.
				 */
				if (CC_ALGO(tp)->conn_init != NULL)
					CC_ALGO(tp)->conn_init(tp->ccv);
			}
		}
	}

	/*
	 * Disable RFC1323 and SACK if we haven't got any response to
	 * our third SYN to work around some broken terminal servers
	 * (most of which have hopefully been retired) that have bad VJ
	 * header compression code which trashes TCP segments containing
	 * unknown-to-them TCP options.
	 */
	if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
	    (tp->t_rxtshift == 3))
		tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
	/*
	 * If we backed off this far, notify the L3 protocol that we're having
	 * connection problems.
	 */
	if (tp->t_rxtshift > TCP_RTT_INVALIDATE) {
#ifdef INET6
		if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
			in6_losing(tp->t_inpcb);
		else
#endif
			in_losing(tp->t_inpcb);
	}
	tp->snd_nxt = tp->snd_una;
	tp->snd_recover = tp->snd_max;
	/*
	 * Force a segment to be sent.
	 */
	tp->t_flags |= TF_ACKNOW;
	/*
	 * If timing a segment in this window, stop the timer.
	 */
	tp->t_rtttime = 0;

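	/*
	 * Added commentary, not from the original source: cc_cong_signal()
	 * with CC_RTO hands the timeout to the congestion control module,
	 * which typically collapses the congestion window for the restart
	 * from snd_una (e.g. NewReno restarts from one segment, per RFC
	 * 5681).
	 */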
	cc_cong_signal(tp, NULL, CC_RTO);
	NET_EPOCH_ENTER(et);
	outrv = tcp_output_nodrop(tp);
#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	(void) tcp_unlock_or_drop(tp, outrv);
	NET_EPOCH_EXIT(et);
out:
	CURVNET_RESTORE();
}

void
tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
{
	struct callout *t_callout;
	callout_func_t *f_callout;
	struct inpcb *inp = tp->t_inpcb;
	int cpu = inp_to_cpuid(inp);

#ifdef TCP_OFFLOAD
	if (tp->t_flags & TF_TOE)
		return;
#endif

	if (tp->t_timers->tt_flags & TT_STOPPED)
		return;

	switch (timer_type) {
	case TT_DELACK:
		t_callout = &tp->t_timers->tt_delack;
		f_callout = tcp_timer_delack;
		break;
	case TT_REXMT:
		t_callout = &tp->t_timers->tt_rexmt;
		f_callout = tcp_timer_rexmt;
		break;
	case TT_PERSIST:
		t_callout = &tp->t_timers->tt_persist;
		f_callout = tcp_timer_persist;
		break;
	case TT_KEEP:
		t_callout = &tp->t_timers->tt_keep;
		f_callout = tcp_timer_keep;
		break;
	case TT_2MSL:
		t_callout = &tp->t_timers->tt_2msl;
		f_callout = tcp_timer_2msl;
		break;
	default:
		if (tp->t_fb->tfb_tcp_timer_activate) {
			tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta);
			return;
		}
		panic("tp %p bad timer_type %#x", tp, timer_type);
	}
	if (delta == 0) {
		callout_stop(t_callout);
	} else {
		callout_reset_on(t_callout, delta, f_callout, tp, cpu);
	}
}

int
tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;

	switch (timer_type) {
	case TT_DELACK:
		t_callout = &tp->t_timers->tt_delack;
		break;
	case TT_REXMT:
		t_callout = &tp->t_timers->tt_rexmt;
		break;
	case TT_PERSIST:
		t_callout = &tp->t_timers->tt_persist;
		break;
	case TT_KEEP:
		t_callout = &tp->t_timers->tt_keep;
		break;
	case TT_2MSL:
		t_callout = &tp->t_timers->tt_2msl;
		break;
	default:
		if (tp->t_fb->tfb_tcp_timer_active) {
			return (tp->t_fb->tfb_tcp_timer_active(tp, timer_type));
		}
		panic("tp %p bad timer_type %#x", tp, timer_type);
	}
	return (callout_active(t_callout));
}
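/*
 * Usage sketch (added commentary, not from the original source): a
 * caller arms a timer with a tick count, e.g.
 *
 *	tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
 *
 * and cancels it by passing a delta of 0, which maps to callout_stop()
 * above.
 */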
/*
 * Stop the timer from running, and apply a flag
 * against the timer_flags that will force the
 * timer never to run.  The flag is needed to ensure
 * a race does not leave it running and cause
 * the timer to possibly restart itself (keep and persist
 * especially do this).
 */
int
tcp_timer_suspend(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;
	uint32_t t_flags;

	switch (timer_type) {
	case TT_DELACK:
		t_flags = TT_DELACK_SUS;
		t_callout = &tp->t_timers->tt_delack;
		break;
	case TT_REXMT:
		t_flags = TT_REXMT_SUS;
		t_callout = &tp->t_timers->tt_rexmt;
		break;
	case TT_PERSIST:
		t_flags = TT_PERSIST_SUS;
		t_callout = &tp->t_timers->tt_persist;
		break;
	case TT_KEEP:
		t_flags = TT_KEEP_SUS;
		t_callout = &tp->t_timers->tt_keep;
		break;
	case TT_2MSL:
		t_flags = TT_2MSL_SUS;
		t_callout = &tp->t_timers->tt_2msl;
		break;
	default:
		panic("tp:%p bad timer_type 0x%x", tp, timer_type);
	}
	tp->t_timers->tt_flags |= t_flags;
	return (callout_stop(t_callout));
}

void
tcp_timers_unsuspend(struct tcpcb *tp, uint32_t timer_type)
{
	switch (timer_type) {
	case TT_DELACK:
		if (tp->t_timers->tt_flags & TT_DELACK_SUS) {
			tp->t_timers->tt_flags &= ~TT_DELACK_SUS;
			if (tp->t_flags & TF_DELACK) {
				/* A delayed ACK is pending; activate the timer. */
				tp->t_flags &= ~TF_DELACK;
				tcp_timer_activate(tp, TT_DELACK,
				    tcp_delacktime);
			}
		}
		break;
	case TT_REXMT:
		if (tp->t_timers->tt_flags & TT_REXMT_SUS) {
			tp->t_timers->tt_flags &= ~TT_REXMT_SUS;
			if (SEQ_GT(tp->snd_max, tp->snd_una) &&
			    (tcp_timer_active((tp), TT_PERSIST) == 0) &&
			    tp->snd_wnd) {
				/* We have outstanding data; activate the timer. */
				tcp_timer_activate(tp, TT_REXMT,
				    tp->t_rxtcur);
			}
		}
		break;
	case TT_PERSIST:
		if (tp->t_timers->tt_flags & TT_PERSIST_SUS) {
			tp->t_timers->tt_flags &= ~TT_PERSIST_SUS;
			if (tp->snd_wnd == 0) {
				/* Activate the persist timer. */
				tp->t_rxtshift = 0;
				tcp_setpersist(tp);
			}
		}
		break;
	case TT_KEEP:
		if (tp->t_timers->tt_flags & TT_KEEP_SUS) {
			tp->t_timers->tt_flags &= ~TT_KEEP_SUS;
			tcp_timer_activate(tp, TT_KEEP,
			    TCPS_HAVEESTABLISHED(tp->t_state) ?
			    TP_KEEPIDLE(tp) : TP_KEEPINIT(tp));
		}
		break;
	case TT_2MSL:
		if (tp->t_timers->tt_flags & TT_2MSL_SUS) {
			tp->t_timers->tt_flags &= ~TT_2MSL_SUS;
			if ((tp->t_state == TCPS_FIN_WAIT_2) &&
			    ((tp->t_inpcb->inp_socket == NULL) ||
			    (tp->t_inpcb->inp_socket->so_rcv.sb_state &
			    SBS_CANTRCVMORE))) {
				/* Start the 2MSL timer. */
				tcp_timer_activate(tp, TT_2MSL,
				    (tcp_fast_finwait2_recycle) ?
				    tcp_finwait2_timeout : TP_MAXIDLE(tp));
			}
		}
		break;
	default:
		panic("tp:%p bad timer_type 0x%x", tp, timer_type);
	}
}
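/*
 * Added commentary, not from the original source: suspend/unsuspend are
 * intended to be used as a pair by code that must briefly guarantee no
 * timer fires, such as while a connection is handed from one TCP
 * function block to another.  tcp_timer_suspend() marks the timer so a
 * racing handler cannot rearm it, and tcp_timers_unsuspend()
 * re-evaluates whether the timer should be running and rearms it if so.
 */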
static void
tcp_timer_discard(void *ptp)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	struct epoch_tracker et;

	tp = (struct tcpcb *)ptp;
	CURVNET_SET(tp->t_vnet);
	NET_EPOCH_ENTER(et);
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL",
	    __func__, tp));
	INP_WLOCK(inp);
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) != 0,
	    ("%s: tcpcb has to be stopped here", __func__));
	if (--tp->t_timers->tt_draincnt > 0 ||
	    tcp_freecb(tp) == false)
		INP_WUNLOCK(inp);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();
}

void
tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;

	tp->t_timers->tt_flags |= TT_STOPPED;
	switch (timer_type) {
	case TT_DELACK:
		t_callout = &tp->t_timers->tt_delack;
		break;
	case TT_REXMT:
		t_callout = &tp->t_timers->tt_rexmt;
		break;
	case TT_PERSIST:
		t_callout = &tp->t_timers->tt_persist;
		break;
	case TT_KEEP:
		t_callout = &tp->t_timers->tt_keep;
		break;
	case TT_2MSL:
		t_callout = &tp->t_timers->tt_2msl;
		break;
	default:
		if (tp->t_fb->tfb_tcp_timer_stop) {
			/*
			 * XXXrrs we need to look at this with the
			 * stop case below (flags).
			 */
			tp->t_fb->tfb_tcp_timer_stop(tp, timer_type);
			return;
		}
		panic("tp %p bad timer_type %#x", tp, timer_type);
	}

	if (callout_async_drain(t_callout, tcp_timer_discard) == 0) {
		/*
		 * Can't stop the callout; defer the actual deletion of
		 * the tcpcb to the last drain.  We do this using the
		 * async drain function and incrementing the count in
		 * tt_draincnt.
		 */
		tp->t_timers->tt_draincnt++;
	}
}