/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_tcpdebug.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <net/if.h>
#include <net/route.h>
#include <net/rss_config.h>
#include <net/vnet.h>
#include <net/netisr.h>

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_rss.h>
#include <netinet/in_systm.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_seq.h>
#include <netinet/cc/cc.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif

int	tcp_persmin;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_persmin, 0, sysctl_msec_to_ticks, "I",
    "minimum persistence interval");

int	tcp_persmax;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_persmax, 0, sysctl_msec_to_ticks, "I",
    "maximum persistence interval");

int	tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepinit, 0, sysctl_msec_to_ticks, "I",
    "time to establish connection");

int	tcp_keepidle;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepidle, 0, sysctl_msec_to_ticks, "I",
    "time before keepalive probes begin");

int	tcp_keepintvl;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I",
    "time between keepalive probes");

int	tcp_delacktime;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime,
    CTLTYPE_INT|CTLFLAG_RW, &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
    "Time before a delayed ACK is sent");

int	tcp_msl;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");

int	tcp_rexmit_min;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
    "Minimum Retransmission Timeout");

int	tcp_rexmit_slop;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
    "Retransmission Timer Slop");

int	tcp_always_keepalive = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW,
    &tcp_always_keepalive, 0, "Assume SO_KEEPALIVE on all TCP connections");

int	tcp_fast_finwait2_recycle = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW,
    &tcp_fast_finwait2_recycle, 0,
    "Recycle closed FIN_WAIT_2 connections faster");

int	tcp_finwait2_timeout;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout");

int	tcp_keepcnt = TCPTV_KEEPCNT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
    "Number of keepalive probes to send");

/* max idle probes */
int	tcp_maxpersistidle;

int	tcp_rexmit_drop_options = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
    &tcp_rexmit_drop_options, 0,
    "Drop TCP options from 3rd and later retransmitted SYN");

VNET_DEFINE(int, tcp_pmtud_blackhole_detect);
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_detect), 0,
    "Path MTU Discovery Black Hole Detection Enabled");

#ifdef INET
VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
    "Path MTU Discovery Black Hole Detection lowered MSS");
#endif

#ifdef INET6
VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
    "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
#endif

#ifdef RSS
static int	per_cpu_timers = 1;
#else
static int	per_cpu_timers = 0;
#endif
SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
    &per_cpu_timers, 0, "Run TCP timers on all CPUs");
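
/*
 * Illustrative sketch, not from the original file: the sysctl_msec_to_ticks
 * handlers above store these intervals internally in clock ticks but expose
 * them to userland in milliseconds.  Roughly (assuming the global 'hz',
 * ticks per second, and ignoring rounding details of the real handler):
 */
#if 0
	/* On write: milliseconds to ticks, e.g. 75000 ms at hz=1000 -> 75000. */
	ticks_val = msec_val * hz / 1000;
	/* On read: ticks back to milliseconds. */
	msec_val = ticks_val * 1000 / hz;
#endif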

/*
 * Map the given inp to a CPU id.
 *
 * This queries RSS if it's compiled in, else it defaults to the current
 * CPU ID.
 */
inline int
inp_to_cpuid(struct inpcb *inp)
{
	u_int cpuid;

#ifdef RSS
	if (per_cpu_timers) {
		cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
		if (cpuid == NETISR_CPUID_NONE)
			return (curcpu);	/* XXX */
		else
			return (cpuid);
	}
#else
	/* Legacy, pre-RSS behaviour */
	if (per_cpu_timers) {
		/*
		 * We don't have a flowid -> cpuid mapping, so cheat and
		 * just map unknown cpuids to curcpu.  Not the best, but
		 * apparently better than defaulting to swi 0.
		 */
		cpuid = inp->inp_flowid % (mp_maxid + 1);
		if (!CPU_ABSENT(cpuid))
			return (cpuid);
		return (curcpu);
	}
#endif
	/* Default for RSS and non-RSS - cpuid 0 */
	return (0);
}

/*
 * TCP protocol timeout routine called every 500 ms.
 * Updates timestamps used for TCP and causes
 * finite state machine actions if timers expire.
 */
void
tcp_slowtimo(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		(void) tcp_tw_2msl_scan(0);
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
}

int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };

int	tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */
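
/*
 * Illustrative check, not from the original file: the table above sums to
 * 1 + 2 + 4 + 8 + 16 + 32 + 64 + 128 + 256 + (4 * 512) = 511 + 2048 = 2559,
 * which matches the tcp_totbackoff value that is kept in sync by hand.
 */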

/*
 * TCP timer processing.
 */

void
tcp_timer_delack(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);

	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_delack) ||
	    !callout_active(&tp->t_timers->tt_delack)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_delack);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	tp->t_flags |= TF_ACKNOW;
	TCPSTAT_INC(tcps_delack);
	(void) tp->t_fb->tfb_tcp_output(tp);
	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
}
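
/*
 * Illustrative sketch, not from the original file: every timer handler in
 * this file opens with the same callout race check.  Under the inpcb write
 * lock it distinguishes a stale expiry from a live one:
 */
#if 0
	INP_WLOCK(inp);
	if (callout_pending(&co) ||	/* rescheduled after this expiry fired */
	    !callout_active(&co)) {	/* stopped while we waited for the lock */
		INP_WUNLOCK(inp);
		return;			/* stale expiry; do nothing */
	}
	callout_deactivate(&co);	/* claim this expiry as ours */
#endif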

void
tcp_inpinfo_lock_del(struct inpcb *inp, struct tcpcb *tp)
{
	if (inp != NULL && tp != NULL)
		INP_WUNLOCK(inp);
}

void
tcp_timer_2msl(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	struct epoch_tracker et;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	tcp_free_sackholes(tp);
	if (callout_pending(&tp->t_timers->tt_2msl) ||
	    !callout_active(&tp->t_timers->tt_2msl)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_2msl);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	/*
	 * The 2 MSL timeout in shutdown went off.  If we're closed but
	 * still waiting for the peer to close and the connection has been
	 * idle too long, delete the connection control block.  Otherwise,
	 * check again in a bit.
	 *
	 * If in TIME_WAIT state, just ignore this timeout; it is handled
	 * in tcp_tw_2msl_scan().
	 *
	 * If fast recycling of FIN_WAIT_2 connections is enabled, we are
	 * in FIN_WAIT_2, and the receiver has closed, there's no point in
	 * hanging onto a FIN_WAIT_2 socket.  Just close it, ignoring the
	 * fact that there were recent incoming segments.
	 */
	if ((inp->inp_flags & INP_TIMEWAIT) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 &&
	    tp->t_inpcb && tp->t_inpcb->inp_socket &&
	    (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
		TCPSTAT_INC(tcps_finwait2_drops);
		if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
		INP_INFO_RLOCK_ET(&V_tcbinfo, et);
		tp = tcp_close(tp);
		INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	} else {
		if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
			callout_reset(&tp->t_timers->tt_2msl,
			    TP_KEEPINTVL(tp), tcp_timer_2msl, tp);
		} else {
			if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
				tcp_inpinfo_lock_del(inp, tp);
				goto out;
			}
			INP_INFO_RLOCK_ET(&V_tcbinfo, et);
			tp = tcp_close(tp);
			INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
	}

#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);

	if (tp != NULL)
		INP_WUNLOCK(inp);
out:
	CURVNET_RESTORE();
}

void
tcp_timer_keep(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct tcptemp *t_template;
	struct inpcb *inp;
	struct epoch_tracker et;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_keep) ||
	    !callout_active(&tp->t_timers->tt_keep)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_keep);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));

	/*
	 * Because we don't regularly reset the keepalive callout in
	 * the ESTABLISHED state, it may be that we don't actually need
	 * to send a keepalive yet.  If that occurs, schedule another
	 * call for the next time the keepalive timer might expire.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state)) {
		u_int idletime;

		idletime = ticks - tp->t_rcvtime;
		if (idletime < TP_KEEPIDLE(tp)) {
			callout_reset(&tp->t_timers->tt_keep,
			    TP_KEEPIDLE(tp) - idletime, tcp_timer_keep, tp);
			INP_WUNLOCK(inp);
			CURVNET_RESTORE();
			return;
		}
	}

	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	TCPSTAT_INC(tcps_keeptimeo);
	if (tp->t_state < TCPS_ESTABLISHED)
		goto dropit;
	if ((tcp_always_keepalive ||
	    inp->inp_socket->so_options & SO_KEEPALIVE) &&
	    tp->t_state <= TCPS_CLOSING) {
		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
			goto dropit;
		/*
		 * Send a packet designed to force a response
		 * if the peer is up and reachable:
		 * either an ACK if the connection is still alive,
		 * or an RST if the peer has closed the connection
		 * due to timeout or reboot.
		 * Using sequence number tp->snd_una-1
		 * causes the transmitted zero-length segment
		 * to lie outside the receive window;
		 * by the protocol spec, this requires the
		 * correspondent TCP to respond.
		 */
		TCPSTAT_INC(tcps_keepprobe);
		t_template = tcpip_maketemplate(inp);
		if (t_template) {
			tcp_respond(tp, t_template->tt_ipgen,
			    &t_template->tt_t, (struct mbuf *)NULL,
			    tp->rcv_nxt, tp->snd_una - 1, 0);
			free(t_template, M_TEMP);
		}
		callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
		    tcp_timer_keep, tp);
	} else
		callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
		    tcp_timer_keep, tp);

#ifdef TCPDEBUG
	if (inp->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
	return;

dropit:
	TCPSTAT_INC(tcps_keepdrops);
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	INP_INFO_RLOCK_ET(&V_tcbinfo, et);
	tp = tcp_drop(tp, ETIMEDOUT);

#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
	tcp_inpinfo_lock_del(inp, tp);
out:
	CURVNET_RESTORE();
}
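
/*
 * Illustrative arithmetic, not from the original file: assuming the stock
 * defaults of keepidle = 7200 s, keepintvl = 75 s and keepcnt = 8, an
 * established but unresponsive peer is first probed after two hours of
 * silence and dropped TP_MAXIDLE (keepcnt * keepintvl) later, i.e. after
 * 7200 + 8 * 75 = 7800 seconds of idle time in total.
 */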

void
tcp_timer_persist(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	struct epoch_tracker et;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_persist) ||
	    !callout_active(&tp->t_timers->tt_persist)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_persist);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	/*
	 * Persistence timer into zero window.
	 * Force a byte to be output, if possible.
	 */
	TCPSTAT_INC(tcps_persisttimeo);
	/*
	 * Hack: if the peer is dead/unreachable, we do not
	 * time out if the window is closed.  After a full
	 * backoff, drop the connection if the idle time
	 * (no responses to probes) reaches the maximum
	 * backoff that we would use if retransmitting.
	 */
	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
	    ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
		TCPSTAT_INC(tcps_persistdrop);
		if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
		INP_INFO_RLOCK_ET(&V_tcbinfo, et);
		tp = tcp_drop(tp, ETIMEDOUT);
		INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	/*
	 * If the user has closed the socket then drop a persisting
	 * connection after a much reduced timeout.
	 */
	if (tp->t_state > TCPS_CLOSE_WAIT &&
	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
		TCPSTAT_INC(tcps_persistdrop);
		if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
		INP_INFO_RLOCK_ET(&V_tcbinfo, et);
		tp = tcp_drop(tp, ETIMEDOUT);
		INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	tcp_setpersist(tp);
	tp->t_flags |= TF_FORCEDATA;
	(void) tp->t_fb->tfb_tcp_output(tp);
	tp->t_flags &= ~TF_FORCEDATA;

#ifdef TCPDEBUG
	if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	INP_WUNLOCK(inp);
out:
	CURVNET_RESTORE();
}

void
tcp_timer_rexmt(void *xtp)
{
	struct tcpcb *tp = xtp;
	int rexmt;
	struct inpcb *inp;
	struct epoch_tracker et;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_rexmt) ||
	    !callout_active(&tp->t_timers->tt_rexmt)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_rexmt);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	tcp_free_sackholes(tp);
	TCP_LOG_EVENT(tp, NULL, NULL, NULL, TCP_LOG_RTO, 0, 0, NULL, false);
	if (tp->t_fb->tfb_tcp_rexmit_tmr) {
		/* The stack has a timer action too. */
		(*tp->t_fb->tfb_tcp_rexmit_tmr)(tp);
	}
	/*
	 * Retransmission timer went off.  Message has not
	 * been acked within retransmit interval.  Back off
	 * to a longer retransmit interval and retransmit one segment.
	 */
	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
		tp->t_rxtshift = TCP_MAXRXTSHIFT;
		TCPSTAT_INC(tcps_timeoutdrop);
		if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
		INP_INFO_RLOCK_ET(&V_tcbinfo, et);
		tp = tcp_drop(tp, ETIMEDOUT);
		INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	if (tp->t_state == TCPS_SYN_SENT) {
		/*
		 * If the SYN was retransmitted, indicate CWND to be
		 * limited to 1 segment in cc_conn_init().
		 */
		tp->snd_cwnd = 1;
	} else if (tp->t_rxtshift == 1) {
		/*
		 * First retransmit; record ssthresh and cwnd so they can
		 * be recovered if this turns out to be a "bad" retransmit.
		 * A retransmit is considered "bad" if an ACK for this
		 * segment is received within RTT/2 interval; the assumption
		 * here is that the ACK was already in flight.  See
		 * "On Estimating End-to-End Network Path Properties" by
		 * Allman and Paxson for more details.
		 */
		tp->snd_cwnd_prev = tp->snd_cwnd;
		tp->snd_ssthresh_prev = tp->snd_ssthresh;
		tp->snd_recover_prev = tp->snd_recover;
		if (IN_FASTRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASFRECOVERY;
		else
			tp->t_flags &= ~TF_WASFRECOVERY;
		if (IN_CONGRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASCRECOVERY;
		else
			tp->t_flags &= ~TF_WASCRECOVERY;
		if ((tp->t_flags & TF_RCVD_TSTMP) == 0)
			tp->t_badrxtwin = ticks +
			    (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
		/*
		 * In the event that we've negotiated timestamps,
		 * badrxtwin will be set to the value that we set
		 * the retransmitted packet's to_tsval to in tcp_output().
		 */
		tp->t_flags |= TF_PREVVALID;
	} else
		tp->t_flags &= ~TF_PREVVALID;
	TCPSTAT_INC(tcps_rexmttimeo);
	if ((tp->t_state == TCPS_SYN_SENT) ||
	    (tp->t_state == TCPS_SYN_RECEIVED))
		rexmt = TCPTV_RTOBASE * tcp_backoff[tp->t_rxtshift];
	else
		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
	TCPT_RANGESET(tp->t_rxtcur, rexmt,
	    tp->t_rttmin, TCPTV_REXMTMAX);
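
	/*
	 * Illustrative arithmetic, not from the original file: for an
	 * established connection with a base RTO of 200 ms, successive
	 * expiries scale it by tcp_backoff[]: 200, 400, 800, 1600, ... ms,
	 * with TCPT_RANGESET() clamping each value between t_rttmin and
	 * TCPTV_REXMTMAX (64 seconds), so the later shifts all wait the
	 * clamped 64 s maximum.
	 */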

	/*
	 * We enter the PLMTUD path if the connection is in the ESTABLISHED
	 * or the FIN_WAIT_1 state.  The reason for including the latter is
	 * that if the amount of data we are sending is very small, we could
	 * send it in a couple of packets and proceed straight to FIN,
	 * never passing through the ESTABLISHED state at retransmit time.
	 */
	if (V_tcp_pmtud_blackhole_detect &&
	    ((tp->t_state == TCPS_ESTABLISHED) ||
	    (tp->t_state == TCPS_FIN_WAIT_1))) {
#ifdef INET6
		int isipv6;
#endif

		/*
		 * The idea here is that each stage of the MTU probe
		 * (usually 1448 -> 1188 -> 524) should be given 2 chances
		 * to recover before we clamp down further.
		 * 'tp->t_rxtshift % 2 == 0' takes care of that.
		 */
		if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
		    (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
		    (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 &&
		    tp->t_rxtshift % 2 == 0)) {
			/*
			 * Enter Path MTU Black-hole Detection mechanism:
			 * - Disable Path MTU Discovery (IP "DF" bit).
			 * - Reduce MTU to a lower value than what we
			 *   negotiated with the peer.
			 */
			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
				/* Record that we may have found a black hole. */
				tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
				/* Keep track of previous MSS. */
				tp->t_pmtud_saved_maxseg = tp->t_maxseg;
			}

			/*
			 * Reduce the MSS to the blackhole value or to the
			 * default in an attempt to retransmit.
			 */
#ifdef INET6
			isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
			if (isipv6 &&
			    tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
				/* Use the sysctl tunable blackhole MSS. */
				tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated);
			} else if (isipv6) {
				/* Use the default MSS. */
				tp->t_maxseg = V_tcp_v6mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch
				 * to minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
			}
#endif
#if defined(INET6) && defined(INET)
			else
#endif
#ifdef INET
			if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
				/* Use the sysctl tunable blackhole MSS. */
				tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated);
			} else {
				/* Use the default MSS. */
				tp->t_maxseg = V_tcp_mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch
				 * to minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
			}
#endif
			/*
			 * Reset the slow-start flight size
			 * as it may depend on the new MSS.
			 */
			if (CC_ALGO(tp)->conn_init != NULL)
				CC_ALGO(tp)->conn_init(tp->ccv);
		} else {
			/*
			 * If further retransmissions are still unsuccessful
			 * with a lowered MTU, maybe this isn't a blackhole,
			 * so we restore the previous MSS and blackhole
			 * detection flags.  The limit '6' is determined by
			 * giving each probe stage (1448, 1188, 524) 2
			 * chances to recover.
			 */
			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
			    (tp->t_rxtshift >= 6)) {
				tp->t_flags2 |= TF2_PLPMTU_PMTUD;
				tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
				tp->t_maxseg = tp->t_pmtud_saved_maxseg;
				TCPSTAT_INC(tcps_pmtud_blackhole_failed);
				/*
				 * Reset the slow-start flight size as it
				 * may depend on the new MSS.
				 */
				if (CC_ALGO(tp)->conn_init != NULL)
					CC_ALGO(tp)->conn_init(tp->ccv);
			}
		}
	}
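
	/*
	 * Illustrative timeline, not from the original file, for an IPv4
	 * connection that negotiated a 1448-byte MSS over a blackholed
	 * path: retransmits 1-2 keep MSS 1448; at t_rxtshift == 2 the MSS
	 * is clamped to pmtud_blackhole_mss (1200 by default); at
	 * t_rxtshift == 4 it drops to V_tcp_mssdflt (536 by default) with
	 * PMTUD disabled; and if there is still no ACK by t_rxtshift == 6,
	 * the saved MSS and PMTUD flags are restored, since the path is
	 * probably not a blackhole after all.
	 */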

	/*
	 * Disable RFC1323 and SACK if we haven't got any response to
	 * our third SYN to work around some broken terminal servers
	 * (most of which have hopefully been retired) that have bad VJ
	 * header compression code which trashes TCP segments containing
	 * unknown-to-them TCP options.
	 */
	if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
	    (tp->t_rxtshift == 3))
		tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
	/*
	 * If we backed off this far, notify the L3 protocol that we're having
	 * connection problems.
	 */
	if (tp->t_rxtshift > TCP_RTT_INVALIDATE) {
#ifdef INET6
		if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
			in6_losing(tp->t_inpcb);
		else
#endif
			in_losing(tp->t_inpcb);
	}
	tp->snd_nxt = tp->snd_una;
	tp->snd_recover = tp->snd_max;
	/*
	 * Force a segment to be sent.
	 */
	tp->t_flags |= TF_ACKNOW;
	/*
	 * If timing a segment in this window, stop the timer.
	 */
	tp->t_rtttime = 0;

	cc_cong_signal(tp, NULL, CC_RTO);

	(void) tp->t_fb->tfb_tcp_output(tp);

#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	INP_WUNLOCK(inp);
out:
	CURVNET_RESTORE();
}

void
tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
{
	struct callout *t_callout;
	timeout_t *f_callout;
	struct inpcb *inp = tp->t_inpcb;
	int cpu = inp_to_cpuid(inp);

#ifdef TCP_OFFLOAD
	if (tp->t_flags & TF_TOE)
		return;
#endif

	if (tp->t_timers->tt_flags & TT_STOPPED)
		return;

	switch (timer_type) {
	case TT_DELACK:
		t_callout = &tp->t_timers->tt_delack;
		f_callout = tcp_timer_delack;
		break;
	case TT_REXMT:
		t_callout = &tp->t_timers->tt_rexmt;
		f_callout = tcp_timer_rexmt;
		break;
	case TT_PERSIST:
		t_callout = &tp->t_timers->tt_persist;
		f_callout = tcp_timer_persist;
		break;
	case TT_KEEP:
		t_callout = &tp->t_timers->tt_keep;
		f_callout = tcp_timer_keep;
		break;
	case TT_2MSL:
		t_callout = &tp->t_timers->tt_2msl;
		f_callout = tcp_timer_2msl;
		break;
	default:
		if (tp->t_fb->tfb_tcp_timer_activate) {
			tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta);
			return;
		}
		panic("tp %p bad timer_type %#x", tp, timer_type);
	}
	if (delta == 0) {
		callout_stop(t_callout);
	} else {
		callout_reset_on(t_callout, delta, f_callout, tp, cpu);
	}
}

int
tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;

	switch (timer_type) {
	case TT_DELACK:
		t_callout = &tp->t_timers->tt_delack;
		break;
	case TT_REXMT:
		t_callout = &tp->t_timers->tt_rexmt;
		break;
	case TT_PERSIST:
		t_callout = &tp->t_timers->tt_persist;
		break;
	case TT_KEEP:
		t_callout = &tp->t_timers->tt_keep;
		break;
	case TT_2MSL:
		t_callout = &tp->t_timers->tt_2msl;
		break;
	default:
		if (tp->t_fb->tfb_tcp_timer_active) {
			return (tp->t_fb->tfb_tcp_timer_active(tp, timer_type));
		}
		panic("tp %p bad timer_type %#x", tp, timer_type);
	}
	return (callout_active(t_callout));
}
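
/*
 * Illustrative callers, not from the original file: a delta of 0 disarms
 * a timer, while any other delta (in ticks) arms it on the CPU chosen by
 * inp_to_cpuid().
 */
#if 0
	/* (Re)arm the retransmit timer for the current RTO. */
	tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
	/* Cancel a pending delayed ACK. */
	tcp_timer_activate(tp, TT_DELACK, 0);
	/* The retransmit and persist timers are mutually exclusive. */
	if (tcp_timer_active(tp, TT_PERSIST))
		tcp_timer_activate(tp, TT_PERSIST, 0);
#endif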

/*
 * Stop the timer from running, and set a flag in tt_flags that
 * forces the timer never to run.  The flag is needed to ensure
 * that a race does not leave the timer running and allow it to
 * restart itself (the keep and persist timers especially do this).
 */
int
tcp_timer_suspend(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;
	uint32_t t_flags;

	switch (timer_type) {
	case TT_DELACK:
		t_flags = TT_DELACK_SUS;
		t_callout = &tp->t_timers->tt_delack;
		break;
	case TT_REXMT:
		t_flags = TT_REXMT_SUS;
		t_callout = &tp->t_timers->tt_rexmt;
		break;
	case TT_PERSIST:
		t_flags = TT_PERSIST_SUS;
		t_callout = &tp->t_timers->tt_persist;
		break;
	case TT_KEEP:
		t_flags = TT_KEEP_SUS;
		t_callout = &tp->t_timers->tt_keep;
		break;
	case TT_2MSL:
		t_flags = TT_2MSL_SUS;
		t_callout = &tp->t_timers->tt_2msl;
		break;
	default:
		panic("tp:%p bad timer_type 0x%x", tp, timer_type);
	}
	tp->t_timers->tt_flags |= t_flags;
	return (callout_stop(t_callout));
}

void
tcp_timers_unsuspend(struct tcpcb *tp, uint32_t timer_type)
{
	switch (timer_type) {
	case TT_DELACK:
		if (tp->t_timers->tt_flags & TT_DELACK_SUS) {
			tp->t_timers->tt_flags &= ~TT_DELACK_SUS;
			if (tp->t_flags & TF_DELACK) {
				/* A delayed ACK is pending; activate the timer. */
				tp->t_flags &= ~TF_DELACK;
				tcp_timer_activate(tp, TT_DELACK,
				    tcp_delacktime);
			}
		}
		break;
	case TT_REXMT:
		if (tp->t_timers->tt_flags & TT_REXMT_SUS) {
			tp->t_timers->tt_flags &= ~TT_REXMT_SUS;
			if (SEQ_GT(tp->snd_max, tp->snd_una) &&
			    (tcp_timer_active((tp), TT_PERSIST) == 0) &&
			    tp->snd_wnd) {
				/* We have outstanding data; activate the timer. */
				tcp_timer_activate(tp, TT_REXMT,
				    tp->t_rxtcur);
			}
		}
		break;
	case TT_PERSIST:
		if (tp->t_timers->tt_flags & TT_PERSIST_SUS) {
			tp->t_timers->tt_flags &= ~TT_PERSIST_SUS;
			if (tp->snd_wnd == 0) {
				/* Activate the persist timer. */
				tp->t_rxtshift = 0;
				tcp_setpersist(tp);
			}
		}
		break;
	case TT_KEEP:
		if (tp->t_timers->tt_flags & TT_KEEP_SUS) {
			tp->t_timers->tt_flags &= ~TT_KEEP_SUS;
			tcp_timer_activate(tp, TT_KEEP,
			    TCPS_HAVEESTABLISHED(tp->t_state) ?
			    TP_KEEPIDLE(tp) : TP_KEEPINIT(tp));
		}
		break;
	case TT_2MSL:
		if (tp->t_timers->tt_flags & TT_2MSL_SUS) {
			tp->t_timers->tt_flags &= ~TT_2MSL_SUS;
			if ((tp->t_state == TCPS_FIN_WAIT_2) &&
			    ((tp->t_inpcb->inp_socket == NULL) ||
			    (tp->t_inpcb->inp_socket->so_rcv.sb_state &
			    SBS_CANTRCVMORE))) {
				/* Start the 2MSL timer. */
				tcp_timer_activate(tp, TT_2MSL,
				    (tcp_fast_finwait2_recycle) ?
				    tcp_finwait2_timeout : TP_MAXIDLE(tp));
			}
		}
		break;
	default:
		panic("tp:%p bad timer_type 0x%x", tp, timer_type);
	}
}
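
/*
 * Illustrative pairing, not from the original file (the caller and the
 * reconfiguration step are assumptions): suspend is meant to bracket a
 * window in which the tcpcb is reworked, after which each timer is
 * unsuspended and re-armed only if its preconditions still hold.
 */
#if 0
	tcp_timer_suspend(tp, TT_REXMT);
	tcp_timer_suspend(tp, TT_PERSIST);
	/* ... reconfigure the connection, e.g. switch tp->t_fb ... */
	tcp_timers_unsuspend(tp, TT_REXMT);
	tcp_timers_unsuspend(tp, TT_PERSIST);
#endif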

void
tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;

	tp->t_timers->tt_flags |= TT_STOPPED;
	switch (timer_type) {
	case TT_DELACK:
		t_callout = &tp->t_timers->tt_delack;
		break;
	case TT_REXMT:
		t_callout = &tp->t_timers->tt_rexmt;
		break;
	case TT_PERSIST:
		t_callout = &tp->t_timers->tt_persist;
		break;
	case TT_KEEP:
		t_callout = &tp->t_timers->tt_keep;
		break;
	case TT_2MSL:
		t_callout = &tp->t_timers->tt_2msl;
		break;
	default:
		if (tp->t_fb->tfb_tcp_timer_stop) {
			/*
			 * XXXrrs we need to look at this with the
			 * stop case below (flags).
			 */
			tp->t_fb->tfb_tcp_timer_stop(tp, timer_type);
			return;
		}
		panic("tp %p bad timer_type %#x", tp, timer_type);
	}

	if (callout_async_drain(t_callout, tcp_timer_discard) == 0) {
		/*
		 * Can't stop the callout, so defer the actual deletion
		 * of the tcpcb until the last timer has drained.  We do
		 * this using the async drain function and incrementing
		 * the count in tt_draincnt.
		 */
		tp->t_timers->tt_draincnt++;
	}
}