/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_tcpdebug.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <net/if.h>
#include <net/route.h>
#include <net/rss_config.h>
#include <net/vnet.h>
#include <net/netisr.h>

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_rss.h>
#include <netinet/in_systm.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_seq.h>
#include <netinet/cc/cc.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif

int tcp_persmin;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval");

int tcp_persmax;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval");

int tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection");

int tcp_keepidle;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepidle, 0, sysctl_msec_to_ticks, "I",
    "time before keepalive probes begin");

int tcp_keepintvl;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I",
    "time between keepalive probes");
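
/*
 * A note on units (a sketch, not part of the original sources): because
 * the SYSCTL_PROC handlers above go through sysctl_msec_to_ticks(), these
 * OIDs are read and written in milliseconds from userland while the kernel
 * variables hold ticks.  For example, assuming the stock two-hour
 * keepalive idle time:
 *
 *	sysctl net.inet.tcp.keepidle=7200000	(7200000 ms = 2 hours)
 *
 * The default values themselves are assigned at initialization time
 * elsewhere, not in this file.
 */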

int tcp_delacktime;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
    "Time before a delayed ACK is sent");

int tcp_msl;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");

int tcp_rexmit_initial;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_initial, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_rexmit_initial, 0, sysctl_msec_to_ticks, "I",
    "Initial Retransmission Timeout");

int tcp_rexmit_min;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
    "Minimum Retransmission Timeout");

int tcp_rexmit_slop;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
    "Retransmission Timer Slop");

int tcp_always_keepalive = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW,
    &tcp_always_keepalive, 0, "Assume SO_KEEPALIVE on all TCP connections");

int tcp_fast_finwait2_recycle = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW,
    &tcp_fast_finwait2_recycle, 0,
    "Recycle closed FIN_WAIT_2 connections faster");

int tcp_finwait2_timeout;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout");

int tcp_keepcnt = TCPTV_KEEPCNT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
    "Number of keepalive probes to send");

/* max idle probes */
int tcp_maxpersistidle;

int tcp_rexmit_drop_options = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
    &tcp_rexmit_drop_options, 0,
    "Drop TCP options from 3rd and later retransmitted SYN");

VNET_DEFINE(int, tcp_pmtud_blackhole_detect);
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_detect), 0,
    "Path MTU Discovery Black Hole Detection Enabled");

#ifdef INET
VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
    "Path MTU Discovery Black Hole Detection lowered MSS");
#endif

#ifdef INET6
VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
    "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
#endif
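
/*
 * The clamped MSS defaults above line up with minimum MTUs: 1220 is the
 * IPv6 minimum link MTU of 1280 less 40 bytes of IPv6 header and 20 bytes
 * of TCP header, and 1200 is a conservative IPv4 value sized so the
 * resulting datagram also survives common tunnel encapsulations.  (This
 * rationale is inferred from the constants, not stated in the code.)
 */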

#ifdef RSS
static int per_cpu_timers = 1;
#else
static int per_cpu_timers = 0;
#endif
SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
    &per_cpu_timers, 0, "run tcp timers on all cpus");

/*
 * Map the given inp to a CPU id.
 *
 * This queries RSS if it's compiled in, else it defaults to the current
 * CPU ID.
 */
inline int
inp_to_cpuid(struct inpcb *inp)
{
	u_int cpuid;

#ifdef RSS
	if (per_cpu_timers) {
		cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
		if (cpuid == NETISR_CPUID_NONE)
			return (curcpu);	/* XXX */
		else
			return (cpuid);
	}
#else
	/* Legacy, pre-RSS behaviour */
	if (per_cpu_timers) {
		/*
		 * We don't have a flowid -> cpuid mapping, so cheat and
		 * just map unknown cpuids to curcpu.  Not the best, but
		 * apparently better than defaulting to swi 0.
		 */
		cpuid = inp->inp_flowid % (mp_maxid + 1);
		if (!CPU_ABSENT(cpuid))
			return (cpuid);
		return (curcpu);
	}
#endif
	/* Default for RSS and non-RSS - cpuid 0 */
	return (0);
}

/*
 * TCP protocol timeout routine called every 500 ms.
 * Updates timestamps used for TCP and causes finite state machine
 * actions if timers expire.
 */
void
tcp_slowtimo(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		(void) tcp_tw_2msl_scan(0);
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
}

int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };

int tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */
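
/*
 * Sanity check on the table above: the first nine entries sum to
 * 1 + 2 + 4 + ... + 256 = 511, and the four trailing entries add
 * 4 * 512 = 2048, giving the 2559 recorded in tcp_totbackoff.  The table
 * has TCP_MAXRXTSHIFT + 1 (13) entries because t_rxtshift is clamped to
 * the range 0..TCP_MAXRXTSHIFT.
 */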

/*
 * TCP timer processing.
 */

void
tcp_timer_delack(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);

	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_delack) ||
	    !callout_active(&tp->t_timers->tt_delack)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_delack);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	tp->t_flags |= TF_ACKNOW;
	TCPSTAT_INC(tcps_delack);
	(void) tp->t_fb->tfb_tcp_output(tp);
	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
}

void
tcp_inpinfo_lock_del(struct inpcb *inp, struct tcpcb *tp)
{
	if (inp != NULL && tp != NULL)
		INP_WUNLOCK(inp);
}

void
tcp_timer_2msl(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	struct epoch_tracker et;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	tcp_free_sackholes(tp);
	if (callout_pending(&tp->t_timers->tt_2msl) ||
	    !callout_active(&tp->t_timers->tt_2msl)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_2msl);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	/*
	 * The 2 MSL timeout in shutdown went off.  If we're closed but
	 * still waiting for the peer to close and the connection has been
	 * idle too long, delete the connection control block.  Otherwise,
	 * check again in a bit.
	 *
	 * If we are in TIME_WAIT state, just ignore this timeout; it is
	 * handled in tcp_tw_2msl_scan().
	 *
	 * If fast recycling of FIN_WAIT_2 connections is enabled, we are in
	 * FIN_WAIT_2, and the receiver has closed, there's no point in
	 * hanging onto the FIN_WAIT_2 socket.  Just close it, ignoring the
	 * fact that there were recent incoming segments.
	 */
	if ((inp->inp_flags & INP_TIMEWAIT) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 &&
	    tp->t_inpcb && tp->t_inpcb->inp_socket &&
	    (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
		TCPSTAT_INC(tcps_finwait2_drops);
		if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
		INP_INFO_RLOCK_ET(&V_tcbinfo, et);
		tp = tcp_close(tp);
		INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	} else {
		if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
			callout_reset(&tp->t_timers->tt_2msl,
			    TP_KEEPINTVL(tp), tcp_timer_2msl, tp);
		} else {
			if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
				tcp_inpinfo_lock_del(inp, tp);
				goto out;
			}
			INP_INFO_RLOCK_ET(&V_tcbinfo, et);
			tp = tcp_close(tp);
			INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
	}

#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);

	if (tp != NULL)
		INP_WUNLOCK(inp);
out:
	CURVNET_RESTORE();
}
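
/*
 * A scheduling note on the non-recycling path above: while the connection
 * stays below TP_MAXIDLE(tp) of idle time, the callout is simply re-armed
 * every TP_KEEPINTVL(tp) ticks, so a lingering FIN_WAIT_2 socket is polled
 * at the keepalive-probe interval until it either sees traffic or ages out
 * and is closed.
 */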

void
tcp_timer_keep(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct tcptemp *t_template;
	struct inpcb *inp;
	struct epoch_tracker et;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_keep) ||
	    !callout_active(&tp->t_timers->tt_keep)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_keep);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));

	/*
	 * Because we don't regularly reset the keepalive callout in
	 * the ESTABLISHED state, it may be that we don't actually need
	 * to send a keepalive yet.  If that occurs, schedule another
	 * call for the next time the keepalive timer might expire.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state)) {
		u_int idletime;

		idletime = ticks - tp->t_rcvtime;
		if (idletime < TP_KEEPIDLE(tp)) {
			callout_reset(&tp->t_timers->tt_keep,
			    TP_KEEPIDLE(tp) - idletime, tcp_timer_keep, tp);
			INP_WUNLOCK(inp);
			CURVNET_RESTORE();
			return;
		}
	}

	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	TCPSTAT_INC(tcps_keeptimeo);
	if (tp->t_state < TCPS_ESTABLISHED)
		goto dropit;
	if ((tcp_always_keepalive ||
	    inp->inp_socket->so_options & SO_KEEPALIVE) &&
	    tp->t_state <= TCPS_CLOSING) {
		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
			goto dropit;
		/*
		 * Send a packet designed to force a response
		 * if the peer is up and reachable:
		 * either an ACK if the connection is still alive,
		 * or an RST if the peer has closed the connection
		 * due to timeout or reboot.
		 * Using sequence number tp->snd_una-1
		 * causes the transmitted zero-length segment
		 * to lie outside the receive window;
		 * by the protocol spec, this requires the
		 * correspondent TCP to respond.
		 */
		TCPSTAT_INC(tcps_keepprobe);
		t_template = tcpip_maketemplate(inp);
		if (t_template) {
			tcp_respond(tp, t_template->tt_ipgen,
			    &t_template->tt_t, (struct mbuf *)NULL,
			    tp->rcv_nxt, tp->snd_una - 1, 0);
			free(t_template, M_TEMP);
		}
		callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
		    tcp_timer_keep, tp);
	} else
		callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
		    tcp_timer_keep, tp);

#ifdef TCPDEBUG
	if (inp->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
	return;

dropit:
	TCPSTAT_INC(tcps_keepdrops);
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	INP_INFO_RLOCK_ET(&V_tcbinfo, et);
	tp = tcp_drop(tp, ETIMEDOUT);

#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
	tcp_inpinfo_lock_del(inp, tp);
out:
	CURVNET_RESTORE();
}
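
/*
 * Worked example of the keepalive timeline (a sketch, assuming the stock
 * defaults of keepidle = 2 h, keepintvl = 75 s, keepcnt = 8): the first
 * probe is sent after two hours of idle time, probes then repeat every
 * TP_KEEPINTVL(tp), and the connection is dropped once the idle time
 * reaches TP_KEEPIDLE(tp) + TP_MAXIDLE(tp), i.e. after
 * 7200 s + 8 * 75 s = 7800 s (about 2 h 10 min) with no response.
 */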

void
tcp_timer_persist(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	struct epoch_tracker et;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_persist) ||
	    !callout_active(&tp->t_timers->tt_persist)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_persist);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	/*
	 * Persistence timer went off; the send window is zero.
	 * Force a byte to be output, if possible.
	 */
	TCPSTAT_INC(tcps_persisttimeo);
	/*
	 * Hack: if the peer is dead/unreachable, we do not
	 * time out if the window is closed.  After a full
	 * backoff, drop the connection if the idle time
	 * (no responses to probes) reaches the maximum
	 * backoff that we would use if retransmitting.
	 */
	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
	    ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
		TCPSTAT_INC(tcps_persistdrop);
		if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
		INP_INFO_RLOCK_ET(&V_tcbinfo, et);
		tp = tcp_drop(tp, ETIMEDOUT);
		INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	/*
	 * If the user has closed the socket then drop a persisting
	 * connection after a much reduced timeout.
	 */
	if (tp->t_state > TCPS_CLOSE_WAIT &&
	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
		TCPSTAT_INC(tcps_persistdrop);
		if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
		INP_INFO_RLOCK_ET(&V_tcbinfo, et);
		tp = tcp_drop(tp, ETIMEDOUT);
		INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	tcp_setpersist(tp);
	tp->t_flags |= TF_FORCEDATA;
	(void) tp->t_fb->tfb_tcp_output(tp);
	tp->t_flags &= ~TF_FORCEDATA;

#ifdef TCPDEBUG
	if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	INP_WUNLOCK(inp);
out:
	CURVNET_RESTORE();
}
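
/*
 * The two drop conditions above, by the numbers: a connection in full
 * backoff (t_rxtshift == TCP_MAXRXTSHIFT) is dropped once it has been idle
 * for tcp_maxpersistidle, or for tcp_totbackoff (2559) times the current
 * retransmit value, whichever bound is reached first; a connection whose
 * user has already closed the socket (state past CLOSE_WAIT) gets only
 * TCPTV_PERSMAX of persisting before it is dropped.
 */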

void
tcp_timer_rexmt(void *xtp)
{
	struct tcpcb *tp = xtp;
	CURVNET_SET(tp->t_vnet);
	int rexmt;
	struct inpcb *inp;
	struct epoch_tracker et;
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_rexmt) ||
	    !callout_active(&tp->t_timers->tt_rexmt)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_rexmt);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	tcp_free_sackholes(tp);
	TCP_LOG_EVENT(tp, NULL, NULL, NULL, TCP_LOG_RTO, 0, 0, NULL, false);
	if (tp->t_fb->tfb_tcp_rexmit_tmr) {
		/* The stack has a timer action too. */
		(*tp->t_fb->tfb_tcp_rexmit_tmr)(tp);
	}
	/*
	 * Retransmission timer went off.  Message has not
	 * been acked within retransmit interval.  Back off
	 * to a longer retransmit interval and retransmit one segment.
	 */
	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
		tp->t_rxtshift = TCP_MAXRXTSHIFT;
		TCPSTAT_INC(tcps_timeoutdrop);
		if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
		INP_INFO_RLOCK_ET(&V_tcbinfo, et);
		tp = tcp_drop(tp, ETIMEDOUT);
		INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	if (tp->t_state == TCPS_SYN_SENT) {
		/*
		 * If the SYN was retransmitted, indicate CWND to be
		 * limited to 1 segment in cc_conn_init().
		 */
		tp->snd_cwnd = 1;
	} else if (tp->t_rxtshift == 1) {
		/*
		 * First retransmit; record ssthresh and cwnd so they can
		 * be recovered if this turns out to be a "bad" retransmit.
		 * A retransmit is considered "bad" if an ACK for this
		 * segment is received within RTT/2 interval; the assumption
		 * here is that the ACK was already in flight.  See
		 * "On Estimating End-to-End Network Path Properties" by
		 * Allman and Paxson for more details.
		 */
		tp->snd_cwnd_prev = tp->snd_cwnd;
		tp->snd_ssthresh_prev = tp->snd_ssthresh;
		tp->snd_recover_prev = tp->snd_recover;
		if (IN_FASTRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASFRECOVERY;
		else
			tp->t_flags &= ~TF_WASFRECOVERY;
		if (IN_CONGRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASCRECOVERY;
		else
			tp->t_flags &= ~TF_WASCRECOVERY;
		if ((tp->t_flags & TF_RCVD_TSTMP) == 0)
			tp->t_badrxtwin = ticks +
			    (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
		/*
		 * If we have negotiated timestamps, badrxtwin is instead
		 * set by tcp_output() to the to_tsval it stamps on the
		 * retransmitted packet.
		 */
		tp->t_flags |= TF_PREVVALID;
	} else
		tp->t_flags &= ~TF_PREVVALID;
	TCPSTAT_INC(tcps_rexmttimeo);
	if ((tp->t_state == TCPS_SYN_SENT) ||
	    (tp->t_state == TCPS_SYN_RECEIVED))
		rexmt = tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift];
	else
		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
	TCPT_RANGESET(tp->t_rxtcur, rexmt,
	    tp->t_rttmin, TCPTV_REXMTMAX);
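
	/*
	 * By way of illustration (the numbers are hypothetical): with a
	 * current retransmit value (TCP_REXMTVAL) of 200 ms, the third
	 * timeout (t_rxtshift == 3) yields rexmt = 200 ms * tcp_backoff[3]
	 * = 1.6 s, which TCPT_RANGESET then clamps to the range
	 * [t_rttmin, TCPTV_REXMTMAX]; once the doubling passes that
	 * ceiling the interval stays pinned at the maximum.
	 */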

	/*
	 * We enter the path for PLMTUD if the connection is established or
	 * in FIN_WAIT_1 state.  The latter is included because, if the
	 * amount of data we send is very small, we could send it in a
	 * couple of packets and proceed straight to FIN; in that case we
	 * would never observe the ESTABLISHED state.
	 */
	if (V_tcp_pmtud_blackhole_detect && ((tp->t_state == TCPS_ESTABLISHED)
	    || (tp->t_state == TCPS_FIN_WAIT_1))) {
#ifdef INET6
		int isipv6;
#endif

		/*
		 * The idea here is that each stage of the MTU probe
		 * (usually 1448 -> 1188 -> 524) should be given two chances
		 * to recover before clamping down further.
		 * 'tp->t_rxtshift % 2 == 0' takes care of that.
		 */
		if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
		    (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
		    (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 &&
		    tp->t_rxtshift % 2 == 0)) {
			/*
			 * Enter Path MTU Black-hole Detection mechanism:
			 * - Disable Path MTU Discovery (IP "DF" bit).
			 * - Reduce MTU to lower value than what we
			 *   negotiated with peer.
			 */
			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
				/* Record that we may have found a black hole. */
				tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
				/* Keep track of previous MSS. */
				tp->t_pmtud_saved_maxseg = tp->t_maxseg;
			}

			/*
			 * Reduce the MSS to the blackhole value or to the
			 * default in an attempt to retransmit.
			 */
#ifdef INET6
			isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
			if (isipv6 &&
			    tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
				/* Use the sysctl tuneable blackhole MSS. */
				tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated);
			} else if (isipv6) {
				/* Use the default MSS. */
				tp->t_maxseg = V_tcp_v6mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch
				 * to minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
			}
#endif
#if defined(INET6) && defined(INET)
			else
#endif
#ifdef INET
			if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
				/* Use the sysctl tuneable blackhole MSS. */
				tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated);
			} else {
				/* Use the default MSS. */
				tp->t_maxseg = V_tcp_mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch
				 * to minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
			}
#endif
			/*
			 * Reset the slow-start flight size
			 * as it may depend on the new MSS.
			 */
			if (CC_ALGO(tp)->conn_init != NULL)
				CC_ALGO(tp)->conn_init(tp->ccv);
		} else {
			/*
			 * If further retransmissions are still unsuccessful
			 * with a lowered MTU, this may not be a blackhole
			 * after all; restore the previous MSS and clear the
			 * blackhole detection flags.
			 * The limit '6' is determined by giving each probe
			 * stage (1448, 1188, 524) two chances to recover.
			 */
			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
			    (tp->t_rxtshift >= 6)) {
				tp->t_flags2 |= TF2_PLPMTU_PMTUD;
				tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
				tp->t_maxseg = tp->t_pmtud_saved_maxseg;
				TCPSTAT_INC(tcps_pmtud_blackhole_failed);
				/*
				 * Reset the slow-start flight size as it
				 * may depend on the new MSS.
				 */
				if (CC_ALGO(tp)->conn_init != NULL)
					CC_ALGO(tp)->conn_init(tp->ccv);
			}
		}
	}
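
	/*
	 * To make the staging above concrete (a sketch, assuming the IPv4
	 * sysctl defaults and a negotiated MSS above the clamp): on the 2nd
	 * consecutive timeout the MSS is clamped down to
	 * V_tcp_pmtud_blackhole_mss; on the 4th it drops again to
	 * V_tcp_mssdflt with PMTUD disabled; and from the 6th onward, if
	 * the lowered MTU did not help, the saved MSS is restored and
	 * blackhole detection is abandoned for this connection.
	 */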

	/*
	 * Disable RFC1323 and SACK if we haven't got any response to
	 * our third SYN to work-around some broken terminal servers
	 * (most of which have hopefully been retired) that have bad VJ
	 * header compression code which trashes TCP segments containing
	 * unknown-to-them TCP options.
	 */
	if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
	    (tp->t_rxtshift == 3))
		tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
	/*
	 * If we backed off this far, notify the L3 protocol that we're having
	 * connection problems.
	 */
	if (tp->t_rxtshift > TCP_RTT_INVALIDATE) {
#ifdef INET6
		if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
			in6_losing(tp->t_inpcb);
		else
#endif
			in_losing(tp->t_inpcb);
	}
	tp->snd_nxt = tp->snd_una;
	tp->snd_recover = tp->snd_max;
	/*
	 * Force a segment to be sent.
	 */
	tp->t_flags |= TF_ACKNOW;
	/*
	 * If timing a segment in this window, stop the timer.
	 */
	tp->t_rtttime = 0;

	cc_cong_signal(tp, NULL, CC_RTO);

	(void) tp->t_fb->tfb_tcp_output(tp);

#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	INP_WUNLOCK(inp);
out:
	CURVNET_RESTORE();
}

void
tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
{
	struct callout *t_callout;
	timeout_t *f_callout;
	struct inpcb *inp = tp->t_inpcb;
	int cpu = inp_to_cpuid(inp);

#ifdef TCP_OFFLOAD
	if (tp->t_flags & TF_TOE)
		return;
#endif

	if (tp->t_timers->tt_flags & TT_STOPPED)
		return;

	switch (timer_type) {
	case TT_DELACK:
		t_callout = &tp->t_timers->tt_delack;
		f_callout = tcp_timer_delack;
		break;
	case TT_REXMT:
		t_callout = &tp->t_timers->tt_rexmt;
		f_callout = tcp_timer_rexmt;
		break;
	case TT_PERSIST:
		t_callout = &tp->t_timers->tt_persist;
		f_callout = tcp_timer_persist;
		break;
	case TT_KEEP:
		t_callout = &tp->t_timers->tt_keep;
		f_callout = tcp_timer_keep;
		break;
	case TT_2MSL:
		t_callout = &tp->t_timers->tt_2msl;
		f_callout = tcp_timer_2msl;
		break;
	default:
		if (tp->t_fb->tfb_tcp_timer_activate) {
			tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta);
			return;
		}
		panic("tp %p bad timer_type %#x", tp, timer_type);
	}
	if (delta == 0) {
		callout_stop(t_callout);
	} else {
		callout_reset_on(t_callout, delta, f_callout, tp, cpu);
	}
}

int
tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;

	switch (timer_type) {
	case TT_DELACK:
		t_callout = &tp->t_timers->tt_delack;
		break;
	case TT_REXMT:
		t_callout = &tp->t_timers->tt_rexmt;
		break;
	case TT_PERSIST:
		t_callout = &tp->t_timers->tt_persist;
		break;
	case TT_KEEP:
		t_callout = &tp->t_timers->tt_keep;
		break;
	case TT_2MSL:
		t_callout = &tp->t_timers->tt_2msl;
		break;
	default:
		if (tp->t_fb->tfb_tcp_timer_active) {
			return (tp->t_fb->tfb_tcp_timer_active(tp, timer_type));
		}
		panic("tp %p bad timer_type %#x", tp, timer_type);
	}
	return (callout_active(t_callout));
}
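
/*
 * Typical usage of the activation interface, as seen at the call sites in
 * this file (delta is in ticks; a delta of 0 cancels the timer):
 *
 *	tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);	(arm or re-arm)
 *	tcp_timer_activate(tp, TT_REXMT, 0);		(cancel)
 *
 * The callout is bound to the CPU chosen by inp_to_cpuid() above.
 */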

/*
 * Stop the timer from running, and set a flag in the timer flags that
 * will force the timer never to run.  The flag is needed to ensure that
 * a race does not leave it running and cause the timer to possibly
 * restart itself (the keep and persist timers especially do this).
 */
int
tcp_timer_suspend(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;
	uint32_t t_flags;

	switch (timer_type) {
	case TT_DELACK:
		t_flags = TT_DELACK_SUS;
		t_callout = &tp->t_timers->tt_delack;
		break;
	case TT_REXMT:
		t_flags = TT_REXMT_SUS;
		t_callout = &tp->t_timers->tt_rexmt;
		break;
	case TT_PERSIST:
		t_flags = TT_PERSIST_SUS;
		t_callout = &tp->t_timers->tt_persist;
		break;
	case TT_KEEP:
		t_flags = TT_KEEP_SUS;
		t_callout = &tp->t_timers->tt_keep;
		break;
	case TT_2MSL:
		t_flags = TT_2MSL_SUS;
		t_callout = &tp->t_timers->tt_2msl;
		break;
	default:
		panic("tp:%p bad timer_type 0x%x", tp, timer_type);
	}
	tp->t_timers->tt_flags |= t_flags;
	return (callout_stop(t_callout));
}

void
tcp_timers_unsuspend(struct tcpcb *tp, uint32_t timer_type)
{
	switch (timer_type) {
	case TT_DELACK:
		if (tp->t_timers->tt_flags & TT_DELACK_SUS) {
			tp->t_timers->tt_flags &= ~TT_DELACK_SUS;
			if (tp->t_flags & TF_DELACK) {
				/* A delayed ACK is pending; activate the timer. */
				tp->t_flags &= ~TF_DELACK;
				tcp_timer_activate(tp, TT_DELACK,
				    tcp_delacktime);
			}
		}
		break;
	case TT_REXMT:
		if (tp->t_timers->tt_flags & TT_REXMT_SUS) {
			tp->t_timers->tt_flags &= ~TT_REXMT_SUS;
			if (SEQ_GT(tp->snd_max, tp->snd_una) &&
			    (tcp_timer_active((tp), TT_PERSIST) == 0) &&
			    tp->snd_wnd) {
				/* We have outstanding data; activate the timer. */
				tcp_timer_activate(tp, TT_REXMT,
				    tp->t_rxtcur);
			}
		}
		break;
	case TT_PERSIST:
		if (tp->t_timers->tt_flags & TT_PERSIST_SUS) {
			tp->t_timers->tt_flags &= ~TT_PERSIST_SUS;
			if (tp->snd_wnd == 0) {
				/* Activate the persist timer. */
				tp->t_rxtshift = 0;
				tcp_setpersist(tp);
			}
		}
		break;
	case TT_KEEP:
		if (tp->t_timers->tt_flags & TT_KEEP_SUS) {
			tp->t_timers->tt_flags &= ~TT_KEEP_SUS;
			tcp_timer_activate(tp, TT_KEEP,
			    TCPS_HAVEESTABLISHED(tp->t_state) ?
			    TP_KEEPIDLE(tp) : TP_KEEPINIT(tp));
		}
		break;
	case TT_2MSL:
		if (tp->t_timers->tt_flags & TT_2MSL_SUS) {
			tp->t_timers->tt_flags &= ~TT_2MSL_SUS;
			if ((tp->t_state == TCPS_FIN_WAIT_2) &&
			    ((tp->t_inpcb->inp_socket == NULL) ||
			    (tp->t_inpcb->inp_socket->so_rcv.sb_state &
			    SBS_CANTRCVMORE))) {
				/* Start the 2MSL timer. */
				tcp_timer_activate(tp, TT_2MSL,
				    (tcp_fast_finwait2_recycle) ?
				    tcp_finwait2_timeout : TP_MAXIDLE(tp));
			}
		}
		break;
	default:
		panic("tp:%p bad timer_type 0x%x", tp, timer_type);
	}
}
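
/*
 * Note that unsuspending is not a blind re-arm: each case above re-derives
 * whether the timer should run from the current connection state
 * (outstanding data for TT_REXMT, a closed window for TT_PERSIST, and so
 * on), so a suspend/unsuspend pair around a critical section leaves only
 * the timers that are still relevant running.
 */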

void
tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;

	tp->t_timers->tt_flags |= TT_STOPPED;
	switch (timer_type) {
	case TT_DELACK:
		t_callout = &tp->t_timers->tt_delack;
		break;
	case TT_REXMT:
		t_callout = &tp->t_timers->tt_rexmt;
		break;
	case TT_PERSIST:
		t_callout = &tp->t_timers->tt_persist;
		break;
	case TT_KEEP:
		t_callout = &tp->t_timers->tt_keep;
		break;
	case TT_2MSL:
		t_callout = &tp->t_timers->tt_2msl;
		break;
	default:
		if (tp->t_fb->tfb_tcp_timer_stop) {
			/*
			 * XXXrrs we need to look at this with the
			 * stop case below (flags).
			 */
			tp->t_fb->tfb_tcp_timer_stop(tp, timer_type);
			return;
		}
		panic("tp %p bad timer_type %#x", tp, timer_type);
	}

	if (callout_async_drain(t_callout, tcp_timer_discard) == 0) {
		/*
		 * Can't stop the callout, defer tcpcb actual deletion
		 * to the last one.  We do this using the async drain
		 * function and incrementing the count in tt_draincnt.
		 */
		tp->t_timers->tt_draincnt++;
	}
}
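
/*
 * (An assumption, for readers following the drain path: tcp_timer_discard,
 * presumably defined alongside these routines, is expected to decrement
 * tt_draincnt as each still-running callout finishes and to perform the
 * deferred tcpcb release once the count reaches zero.)
 */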