/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_tcpdebug.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <net/if.h>
#include <net/route.h>
#include <net/rss_config.h>
#include <net/vnet.h>
#include <net/netisr.h>

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_rss.h>
#include <netinet/in_systm.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_seq.h>
#include <netinet/cc/cc.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>
#include <netinet/tcp_debug.h>

int	tcp_persmin;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_persmin, 0, sysctl_msec_to_ticks, "I",
    "minimum persistence interval");

int	tcp_persmax;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_persmax, 0, sysctl_msec_to_ticks, "I",
    "maximum persistence interval");

int	tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_keepinit, 0, sysctl_msec_to_ticks, "I",
    "time to establish connection");
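
/*
 * A hedged note on units, as a minimal sketch: each interval sysctl in
 * this file is exposed in milliseconds but stored in ticks, with
 * sysctl_msec_to_ticks doing the conversion.  Assuming the conventional
 * semantics of that handler, the stored value is roughly
 *
 *	value_in_ticks = value_in_msec * hz / 1000;
 *
 * so with hz = 1000, e.g. "sysctl net.inet.tcp.keepinit=75000" stores
 * 75000 ticks (75 seconds).
 */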
connection"); 99 100 int tcp_keepidle; 101 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, 102 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 103 &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", 104 "time before keepalive probes begin"); 105 106 int tcp_keepintvl; 107 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, 108 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 109 &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", 110 "time between keepalive probes"); 111 112 int tcp_delacktime; 113 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, 114 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 115 &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", 116 "Time before a delayed ACK is sent"); 117 118 VNET_DEFINE(int, tcp_msl); 119 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, 120 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_VNET, 121 &VNET_NAME(tcp_msl), 0, sysctl_msec_to_ticks, "I", 122 "Maximum segment lifetime"); 123 124 int tcp_rexmit_initial; 125 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_initial, 126 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 127 &tcp_rexmit_initial, 0, sysctl_msec_to_ticks, "I", 128 "Initial Retransmission Timeout"); 129 130 int tcp_rexmit_min; 131 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, 132 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 133 &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", 134 "Minimum Retransmission Timeout"); 135 136 int tcp_rexmit_slop; 137 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, 138 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 139 &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", 140 "Retransmission Timer Slop"); 141 142 VNET_DEFINE(int, tcp_always_keepalive) = 1; 143 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_VNET|CTLFLAG_RW, 144 &VNET_NAME(tcp_always_keepalive) , 0, 145 "Assume SO_KEEPALIVE on all TCP connections"); 146 147 int tcp_fast_finwait2_recycle = 0; 148 SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, 149 &tcp_fast_finwait2_recycle, 0, 150 "Recycle closed FIN_WAIT_2 connections faster"); 151 152 int tcp_finwait2_timeout; 153 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, 154 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 155 &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", 156 "FIN-WAIT2 timeout"); 157 158 int tcp_keepcnt = TCPTV_KEEPCNT; 159 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0, 160 "Number of keepalive probes to send"); 161 162 /* max idle probes */ 163 int tcp_maxpersistidle; 164 165 int tcp_rexmit_drop_options = 0; 166 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW, 167 &tcp_rexmit_drop_options, 0, 168 "Drop TCP options from 3rd and later retransmitted SYN"); 169 170 int tcp_maxunacktime = TCPTV_MAXUNACKTIME; 171 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxunacktime, 172 CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_NEEDGIANT, 173 &tcp_maxunacktime, 0, sysctl_msec_to_ticks, "I", 174 "Maximum time (in ms) that a session can linger without making progress"); 175 176 VNET_DEFINE(int, tcp_pmtud_blackhole_detect); 177 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, 178 CTLFLAG_RW|CTLFLAG_VNET, 179 &VNET_NAME(tcp_pmtud_blackhole_detect), 0, 180 "Path MTU Discovery Black Hole Detection Enabled"); 181 182 #ifdef INET 183 VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200; 184 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, 185 CTLFLAG_RW|CTLFLAG_VNET, 186 &VNET_NAME(tcp_pmtud_blackhole_mss), 0, 187 "Path MTU Discovery Black Hole Detection lowered MSS"); 188 #endif 189 190 #ifdef INET6 191 VNET_DEFINE(int, 

#ifdef INET
VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
    "Path MTU Discovery Black Hole Detection lowered MSS");
#endif

#ifdef INET6
VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
    "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
#endif

#ifdef RSS
static int	per_cpu_timers = 1;
#else
static int	per_cpu_timers = 0;
#endif
SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
    &per_cpu_timers, 0, "run tcp timers on all cpus");

/*
 * Map the given inp to a CPU id.
 *
 * This queries RSS if it's compiled in, else it defaults to the current
 * CPU ID.
 */
inline int
inp_to_cpuid(struct inpcb *inp)
{
	u_int cpuid;

	if (per_cpu_timers) {
#ifdef RSS
		cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
		if (cpuid == NETISR_CPUID_NONE)
			return (curcpu);	/* XXX */
		else
			return (cpuid);
#endif
		/*
		 * We don't have a flowid -> cpuid mapping, so cheat and
		 * just map unknown cpuids to curcpu.  Not the best, but
		 * apparently better than defaulting to swi 0.
		 */
		cpuid = inp->inp_flowid % (mp_maxid + 1);
		if (!CPU_ABSENT(cpuid))
			return (cpuid);
		return (curcpu);
	} else {
		return (0);
	}
}

int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };

int	tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */
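
/*
 * Worked check of the constant above: the first nine entries of
 * tcp_backoff[] are the powers of two 1 through 256, which sum to 511,
 * and the remaining four entries contribute 4 * 512 = 2048;
 * 511 + 2048 = 2559.
 */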

/*
 * TCP timer processing.
 *
 * Each connection has 5 timers associated with it, which can be scheduled
 * simultaneously.  They are all serviced by one callout, tcp_timer_enter().
 * This function executes the next timer via the tcp_timersw[] vector.  Each
 * timer is supposed to return 'true' unless the connection was destroyed.
 * In the former case tcp_timer_enter() will schedule the callout for the
 * next timer.
 */

typedef bool tcp_timer_t(struct tcpcb *);
static tcp_timer_t tcp_timer_delack;
static tcp_timer_t tcp_timer_2msl;
static tcp_timer_t tcp_timer_keep;
static tcp_timer_t tcp_timer_persist;
static tcp_timer_t tcp_timer_rexmt;

static tcp_timer_t * const tcp_timersw[TT_N] = {
	[TT_DELACK] = tcp_timer_delack,
	[TT_REXMT] = tcp_timer_rexmt,
	[TT_PERSIST] = tcp_timer_persist,
	[TT_KEEP] = tcp_timer_keep,
	[TT_2MSL] = tcp_timer_2msl,
};

/*
 * tcp_output_locked() is a timer-specific variation of a call to
 * tcp_output(); see tcp_var.h for the rest.  It handles a drop request
 * from advanced stacks, but keeps the tcpcb locked unless tcp_drop()
 * destroyed it.
 * Returns true if the tcpcb is valid and locked.
 */
static inline bool
tcp_output_locked(struct tcpcb *tp)
{
	int rv;

	INP_WLOCK_ASSERT(tptoinpcb(tp));

	if ((rv = tp->t_fb->tfb_tcp_output(tp)) < 0) {
		KASSERT(tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP,
		    ("TCP stack %s requested tcp_drop(%p)",
		    tp->t_fb->tfb_tcp_block_name, tp));
		tp = tcp_drop(tp, rv);
	}

	return (tp != NULL);
}
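
/*
 * Typical call pattern inside a timer handler, mirroring
 * tcp_timer_delack() below (an illustrative sketch, not an additional
 * API):
 *
 *	NET_EPOCH_ENTER(et);
 *	rv = tcp_output_locked(tp);
 *	NET_EPOCH_EXIT(et);
 *	return (rv);	(false means the tcpcb was destroyed)
 */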

static bool
tcp_timer_delack(struct tcpcb *tp)
{
	struct epoch_tracker et;
#if defined(INVARIANTS) || defined(VIMAGE)
	struct inpcb *inp = tptoinpcb(tp);
#endif
	bool rv;

	INP_WLOCK_ASSERT(inp);

	CURVNET_SET(inp->inp_vnet);
	tp->t_flags |= TF_ACKNOW;
	TCPSTAT_INC(tcps_delack);
	NET_EPOCH_ENTER(et);
	rv = tcp_output_locked(tp);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();

	return (rv);
}

static bool
tcp_timer_2msl(struct tcpcb *tp)
{
	struct inpcb *inp = tptoinpcb(tp);
	bool close = false;

	INP_WLOCK_ASSERT(inp);

	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	CURVNET_SET(inp->inp_vnet);
	tcp_log_end_status(tp, TCP_EI_STATUS_2MSL);
	tcp_free_sackholes(tp);
	/*
	 * The 2 MSL timeout in shutdown went off.  If we're closed but
	 * still waiting for the peer to close and the connection has been
	 * idle too long, delete the connection control block.  Otherwise,
	 * check again in a bit.
	 *
	 * If fast recycling of FIN_WAIT_2 connections is enabled, we are
	 * in FIN_WAIT_2, and the receiver has closed, there's no point in
	 * hanging onto the FIN_WAIT_2 socket.  Just close it.  Ignore the
	 * fact that there were recent incoming segments.
	 *
	 * XXXGL: check if inp_socket shall always be !NULL here?
	 */
	if (tp->t_state == TCPS_TIME_WAIT) {
		close = true;
	} else if (tp->t_state == TCPS_FIN_WAIT_2 &&
	    tcp_fast_finwait2_recycle && inp->inp_socket &&
	    (inp->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
		TCPSTAT_INC(tcps_finwait2_drops);
		close = true;
	} else {
		if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp))
			tcp_timer_activate(tp, TT_2MSL, TP_KEEPINTVL(tp));
		else
			close = true;
	}
	if (close) {
		struct epoch_tracker et;

		NET_EPOCH_ENTER(et);
		tp = tcp_close(tp);
		NET_EPOCH_EXIT(et);
	}
	CURVNET_RESTORE();

	return (tp != NULL);
}

static bool
tcp_timer_keep(struct tcpcb *tp)
{
	struct epoch_tracker et;
	struct inpcb *inp = tptoinpcb(tp);
	struct tcptemp *t_template;

	INP_WLOCK_ASSERT(inp);

	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	CURVNET_SET(inp->inp_vnet);
	/*
	 * Because we don't regularly reset the keepalive callout in
	 * the ESTABLISHED state, it may be that we don't actually need
	 * to send a keepalive yet.  If that occurs, schedule another
	 * call for the next time the keepalive timer might expire.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state)) {
		u_int idletime;

		idletime = ticks - tp->t_rcvtime;
		if (idletime < TP_KEEPIDLE(tp)) {
			tcp_timer_activate(tp, TT_KEEP,
			    TP_KEEPIDLE(tp) - idletime);
			CURVNET_RESTORE();
			return (true);
		}
	}

	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	TCPSTAT_INC(tcps_keeptimeo);
	if (tp->t_state < TCPS_ESTABLISHED)
		goto dropit;
	if ((V_tcp_always_keepalive ||
	    inp->inp_socket->so_options & SO_KEEPALIVE) &&
	    tp->t_state <= TCPS_CLOSING) {
		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
			goto dropit;
		/*
		 * Send a packet designed to force a response
		 * if the peer is up and reachable:
		 * either an ACK if the connection is still alive,
		 * or an RST if the peer has closed the connection
		 * due to timeout or reboot.
		 * Using sequence number tp->snd_una-1
		 * causes the transmitted zero-length segment
		 * to lie outside the receive window;
		 * by the protocol spec, this requires the
		 * correspondent TCP to respond.
		 */
		TCPSTAT_INC(tcps_keepprobe);
		t_template = tcpip_maketemplate(inp);
		if (t_template) {
			NET_EPOCH_ENTER(et);
			tcp_respond(tp, t_template->tt_ipgen,
			    &t_template->tt_t, (struct mbuf *)NULL,
			    tp->rcv_nxt, tp->snd_una - 1, 0);
			NET_EPOCH_EXIT(et);
			free(t_template, M_TEMP);
		}
		tcp_timer_activate(tp, TT_KEEP, TP_KEEPINTVL(tp));
	} else
		tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));

	CURVNET_RESTORE();
	return (true);

dropit:
	TCPSTAT_INC(tcps_keepdrops);
	NET_EPOCH_ENTER(et);
	tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
	tp = tcp_drop(tp, ETIMEDOUT);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();

	return (tp != NULL);
}
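
/*
 * A hedged worked example of the keepalive drop condition above,
 * assuming TP_MAXIDLE is keepcnt * keepintvl per tcp_timer.h and the
 * stock defaults (keepidle 2 h, keepintvl 75 s, keepcnt 8): an
 * unresponsive peer is dropped once the connection has been idle for
 * TP_KEEPIDLE + TP_MAXIDLE = 2 h + 8 * 75 s, i.e. ten minutes after
 * probing begins.
 */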

/*
 * Has this session exceeded the maximum time without seeing a substantive
 * acknowledgement?  If so, return true; otherwise false.
 */
static bool
tcp_maxunacktime_check(struct tcpcb *tp)
{

	/* Are we tracking this timer for this session? */
	if (TP_MAXUNACKTIME(tp) == 0)
		return false;

	/* Do we have a current measurement? */
	if (tp->t_acktime == 0)
		return false;

	/* Are we within the acceptable range? */
	if (TSTMP_GT(TP_MAXUNACKTIME(tp) + tp->t_acktime, (u_int)ticks))
		return false;

	/* We exceeded the timer. */
	TCPSTAT_INC(tcps_progdrops);
	return true;
}

static bool
tcp_timer_persist(struct tcpcb *tp)
{
	struct epoch_tracker et;
#if defined(INVARIANTS) || defined(VIMAGE)
	struct inpcb *inp = tptoinpcb(tp);
#endif
	bool progdrop, rv;

	INP_WLOCK_ASSERT(inp);

	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	CURVNET_SET(inp->inp_vnet);
	/*
	 * Persistence timer into zero window.
	 * Force a byte to be output, if possible.
	 */
	TCPSTAT_INC(tcps_persisttimeo);
	/*
	 * Hack: if the peer is dead/unreachable, we do not
	 * time out if the window is closed.  After a full
	 * backoff, drop the connection if the idle time
	 * (no responses to probes) reaches the maximum
	 * backoff that we would use if retransmitting.
	 * Also, drop the connection if we haven't been making
	 * progress.
	 */
	progdrop = tcp_maxunacktime_check(tp);
	if (progdrop || (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
	    ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff))) {
		if (!progdrop)
			TCPSTAT_INC(tcps_persistdrop);
		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
		goto dropit;
	}
	/*
	 * If the user has closed the socket then drop a persisting
	 * connection after a much reduced timeout.
	 */
	if (tp->t_state > TCPS_CLOSE_WAIT &&
	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
		TCPSTAT_INC(tcps_persistdrop);
		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
		goto dropit;
	}
	tcp_setpersist(tp);
	tp->t_flags |= TF_FORCEDATA;
	NET_EPOCH_ENTER(et);
	if ((rv = tcp_output_locked(tp)))
		tp->t_flags &= ~TF_FORCEDATA;
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();

	return (rv);

dropit:
	NET_EPOCH_ENTER(et);
	tp = tcp_drop(tp, ETIMEDOUT);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();

	return (tp != NULL);
}
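
/*
 * Hedged note on the thresholds above: TCP_MAXRXTSHIFT is 12 (the
 * tcp_backoff[] array has 13 entries), so a fully backed-off persist
 * connection is dropped once idle for tcp_maxpersistidle or for
 * tcp_totbackoff (2559) times the current retransmit value, whichever
 * condition is met first; and assuming the stock TCPTV_PERSMAX of 60 s,
 * a persisting connection whose user has closed the socket is dropped
 * after about a minute.
 */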

static bool
tcp_timer_rexmt(struct tcpcb *tp)
{
	struct epoch_tracker et;
	struct inpcb *inp = tptoinpcb(tp);
	int rexmt;
	bool isipv6, rv;

	INP_WLOCK_ASSERT(inp);

	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	CURVNET_SET(inp->inp_vnet);
	tcp_free_sackholes(tp);
	TCP_LOG_EVENT(tp, NULL, NULL, NULL, TCP_LOG_RTO, 0, 0, NULL, false);
	if (tp->t_fb->tfb_tcp_rexmit_tmr) {
		/* The stack has a timer action too. */
		(*tp->t_fb->tfb_tcp_rexmit_tmr)(tp);
	}
	/*
	 * Retransmission timer went off.  Message has not
	 * been acked within retransmit interval.  Back off
	 * to a longer retransmit interval and retransmit one segment.
	 *
	 * If we've either exceeded the maximum number of retransmissions,
	 * or we've gone long enough without making progress, then drop
	 * the session.
	 */
	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT || tcp_maxunacktime_check(tp)) {
		if (tp->t_rxtshift > TCP_MAXRXTSHIFT)
			TCPSTAT_INC(tcps_timeoutdrop);
		tp->t_rxtshift = TCP_MAXRXTSHIFT;
		tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
		NET_EPOCH_ENTER(et);
		tp = tcp_drop(tp, ETIMEDOUT);
		NET_EPOCH_EXIT(et);
		CURVNET_RESTORE();

		return (tp != NULL);
	}
	if (tp->t_state == TCPS_SYN_SENT) {
		/*
		 * If the SYN was retransmitted, indicate CWND to be
		 * limited to 1 segment in cc_conn_init().
		 */
		tp->snd_cwnd = 1;
	} else if (tp->t_rxtshift == 1) {
		/*
		 * First retransmit; record ssthresh and cwnd so they can
		 * be recovered if this turns out to be a "bad" retransmit.
		 * A retransmit is considered "bad" if an ACK for this
		 * segment is received within RTT/2 interval; the assumption
		 * here is that the ACK was already in flight.  See
		 * "On Estimating End-to-End Network Path Properties" by
		 * Allman and Paxson for more details.
		 */
		tp->snd_cwnd_prev = tp->snd_cwnd;
		tp->snd_ssthresh_prev = tp->snd_ssthresh;
		tp->snd_recover_prev = tp->snd_recover;
		if (IN_FASTRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASFRECOVERY;
		else
			tp->t_flags &= ~TF_WASFRECOVERY;
		if (IN_CONGRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASCRECOVERY;
		else
			tp->t_flags &= ~TF_WASCRECOVERY;
		if ((tp->t_flags & TF_RCVD_TSTMP) == 0)
			tp->t_badrxtwin = ticks +
			    (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
		/*
		 * If we have negotiated timestamps, badrxtwin will be
		 * set to the value that we set the retransmitted packet's
		 * to_tsval to in tcp_output().
		 */
		tp->t_flags |= TF_PREVVALID;
	} else
		tp->t_flags &= ~TF_PREVVALID;
	TCPSTAT_INC(tcps_rexmttimeo);
	if ((tp->t_state == TCPS_SYN_SENT) ||
	    (tp->t_state == TCPS_SYN_RECEIVED))
		rexmt = tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift];
	else
		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
	TCPT_RANGESET(tp->t_rxtcur, rexmt,
	    tp->t_rttmin, TCPTV_REXMTMAX);
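
	/*
	 * Hedged worked example of the backoff just computed: with a
	 * 200 ms retransmit value, the interval grows as
	 * 200 ms * tcp_backoff[t_rxtshift], i.e. 400 ms after the first
	 * timeout, 800 ms after the second, and so on, clamped between
	 * t_rttmin and TCPTV_REXMTMAX (64 s with the stock definition).
	 */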

	/*
	 * We enter the path for PLMTUD if the connection is established
	 * or in the FIN_WAIT_1 state.  The reason for the latter is that
	 * if the amount of data we send is very small, we could send it
	 * in a couple of packets and proceed straight to FIN.  In that
	 * case we won't catch the ESTABLISHED state.
	 */
#ifdef INET6
	isipv6 = (inp->inp_vflag & INP_IPV6) ? true : false;
#else
	isipv6 = false;
#endif
	if (((V_tcp_pmtud_blackhole_detect == 1) ||
	    (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) ||
	    (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) &&
	    ((tp->t_state == TCPS_ESTABLISHED) ||
	    (tp->t_state == TCPS_FIN_WAIT_1))) {
		if (tp->t_rxtshift == 1) {
			/*
			 * We enter blackhole detection after the first
			 * unsuccessful timer based retransmission.
			 * Then we reduce the MSS up to two times, each
			 * candidate getting two retransmission tries, but
			 * a candidate is only given its two tries if it
			 * actually reduces the MSS.
			 */
			tp->t_blackhole_enter = 2;
			tp->t_blackhole_exit = tp->t_blackhole_enter;
			if (isipv6) {
#ifdef INET6
				if (tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss)
					tp->t_blackhole_exit += 2;
				if (tp->t_maxseg > V_tcp_v6mssdflt &&
				    V_tcp_v6pmtud_blackhole_mss >
				    V_tcp_v6mssdflt)
					tp->t_blackhole_exit += 2;
#endif
			} else {
#ifdef INET
				if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss)
					tp->t_blackhole_exit += 2;
				if (tp->t_maxseg > V_tcp_mssdflt &&
				    V_tcp_pmtud_blackhole_mss > V_tcp_mssdflt)
					tp->t_blackhole_exit += 2;
#endif
			}
		}
		if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
		    (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
		    (tp->t_rxtshift >= tp->t_blackhole_enter &&
		    tp->t_rxtshift < tp->t_blackhole_exit &&
		    (tp->t_rxtshift - tp->t_blackhole_enter) % 2 == 0)) {
			/*
			 * Enter Path MTU Black-hole Detection mechanism:
			 * - Disable Path MTU Discovery (IP "DF" bit).
			 * - Reduce MTU to lower value than what we
			 *   negotiated with peer.
			 */
			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
				/* Record that we may have found a black hole. */
				tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
				/* Keep track of previous MSS. */
				tp->t_pmtud_saved_maxseg = tp->t_maxseg;
			}

			/*
			 * Reduce the MSS to the blackhole value or to the
			 * default in an attempt to retransmit.
			 */
#ifdef INET6
			if (isipv6 &&
			    tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss &&
			    V_tcp_v6pmtud_blackhole_mss > V_tcp_v6mssdflt) {
				/* Use the sysctl tuneable blackhole MSS. */
				tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated);
			} else if (isipv6) {
				/* Use the default MSS. */
				tp->t_maxseg = V_tcp_v6mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch
				 * to minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
			}
#endif
#if defined(INET6) && defined(INET)
			else
#endif
#ifdef INET
			if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss &&
			    V_tcp_pmtud_blackhole_mss > V_tcp_mssdflt) {
				/* Use the sysctl tuneable blackhole MSS. */
				tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated);
			} else {
				/* Use the default MSS. */
				tp->t_maxseg = V_tcp_mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch
				 * to minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
			}
#endif
			/*
			 * Reset the slow-start flight size
			 * as it may depend on the new MSS.
			 */
			if (CC_ALGO(tp)->conn_init != NULL)
				CC_ALGO(tp)->conn_init(&tp->t_ccv);
		} else {
			/*
			 * If further retransmissions are still unsuccessful
			 * with a lowered MTU, maybe this isn't a blackhole and
			 * we restore the previous MSS and blackhole detection
			 * flags.
			 */
			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
			    (tp->t_rxtshift >= tp->t_blackhole_exit)) {
				tp->t_flags2 |= TF2_PLPMTU_PMTUD;
				tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
				tp->t_maxseg = tp->t_pmtud_saved_maxseg;
				TCPSTAT_INC(tcps_pmtud_blackhole_failed);
				/*
				 * Reset the slow-start flight size as it
				 * may depend on the new MSS.
				 */
				if (CC_ALGO(tp)->conn_init != NULL)
					CC_ALGO(tp)->conn_init(&tp->t_ccv);
			}
		}
	}
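
	/*
	 * Hedged worked example of the enter/exit window above, assuming
	 * default sysctl values: an IPv4 connection with t_maxseg 1460
	 * has both candidates available (pmtud_blackhole_mss 1200, then
	 * V_tcp_mssdflt), so t_blackhole_exit becomes 2 + 2 + 2 = 6.
	 * The MSS is lowered on the 2nd timeout (to 1200) and the 4th
	 * (to the default), and is restored, with PMTUD re-enabled, once
	 * t_rxtshift reaches 6.
	 */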

	/*
	 * Disable RFC1323 and SACK if we haven't got any response to
	 * our third SYN to work around some broken terminal servers
	 * (most of which have hopefully been retired) that have bad VJ
	 * header compression code which trashes TCP segments containing
	 * unknown-to-them TCP options.
	 */
	if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
	    (tp->t_rxtshift == 3))
		tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
	/*
	 * If we backed off this far, notify the L3 protocol that we're having
	 * connection problems.
	 */
	if (tp->t_rxtshift > TCP_RTT_INVALIDATE) {
#ifdef INET6
		if ((inp->inp_vflag & INP_IPV6) != 0)
			in6_losing(inp);
		else
#endif
			in_losing(inp);
	}
	tp->snd_nxt = tp->snd_una;
	tp->snd_recover = tp->snd_max;
	/*
	 * Force a segment to be sent.
	 */
	tp->t_flags |= TF_ACKNOW;
	/*
	 * If timing a segment in this window, stop the timer.
	 */
	tp->t_rtttime = 0;

	cc_cong_signal(tp, NULL, CC_RTO);
	NET_EPOCH_ENTER(et);
	rv = tcp_output_locked(tp);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();

	return (rv);
}

static inline tt_which
tcp_timer_next(struct tcpcb *tp, sbintime_t *precision)
{
	tt_which i, rv;
	sbintime_t after, before;

	for (i = 0, rv = TT_N, after = before = SBT_MAX; i < TT_N; i++) {
		if (tp->t_timers[i] < after) {
			after = tp->t_timers[i];
			rv = i;
		}
		before = MIN(before, tp->t_timers[i] + tp->t_precisions[i]);
	}
	if (precision != NULL)
		*precision = before - after;

	return (rv);
}

static void
tcp_timer_enter(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp = tptoinpcb(tp);
	sbintime_t precision;
	tt_which which;

	INP_WLOCK_ASSERT(inp);
	MPASS((curthread->td_pflags & TDP_INTCPCALLOUT) == 0);

	curthread->td_pflags |= TDP_INTCPCALLOUT;

	which = tcp_timer_next(tp, NULL);
	MPASS(which < TT_N);
	tp->t_timers[which] = SBT_MAX;
	tp->t_precisions[which] = 0;

	if (tcp_timersw[which](tp)) {
		if ((which = tcp_timer_next(tp, &precision)) != TT_N) {
			callout_reset_sbt_on(&tp->t_callout,
			    tp->t_timers[which], precision, tcp_timer_enter,
			    tp, inp_to_cpuid(inp), C_ABSOLUTE);
		}
		INP_WUNLOCK(inp);
	}

	curthread->td_pflags &= ~TDP_INTCPCALLOUT;
}
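
/*
 * Sketch of what tcp_timer_next() computes: suppose only two timers are
 * armed, TT_DELACK due at T+40 ms with 5 ms precision and TT_REXMT due
 * at T+200 ms with 20 ms precision.  Then "after" is T+40 ms (TT_DELACK
 * is returned) and "before" is T+45 ms, so the reported precision is
 * 5 ms: the single callout may be coalesced anywhere inside
 * [T+40 ms, T+45 ms] without violating any armed timer's precision.
 */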

/*
 * Activate or stop (delta == 0) a TCP timer.
 */
void
tcp_timer_activate(struct tcpcb *tp, tt_which which, u_int delta)
{
	struct inpcb *inp = tptoinpcb(tp);
	sbintime_t precision;

#ifdef TCP_OFFLOAD
	if (tp->t_flags & TF_TOE)
		return;
#endif

	INP_WLOCK_ASSERT(inp);

	if (delta > 0)
		callout_when(tick_sbt * delta, 0, C_HARDCLOCK,
		    &tp->t_timers[which], &tp->t_precisions[which]);
	else
		tp->t_timers[which] = SBT_MAX;

	if ((which = tcp_timer_next(tp, &precision)) != TT_N)
		callout_reset_sbt_on(&tp->t_callout, tp->t_timers[which],
		    precision, tcp_timer_enter, tp, inp_to_cpuid(inp),
		    C_ABSOLUTE);
	else
		callout_stop(&tp->t_callout);
}

bool
tcp_timer_active(struct tcpcb *tp, tt_which which)
{

	INP_WLOCK_ASSERT(tptoinpcb(tp));

	return (tp->t_timers[which] != SBT_MAX);
}

/*
 * Stop all timers associated with the tcpcb.
 *
 * Called only on tcpcb destruction.  The tcpcb shall already be dropped
 * from the pcb lookup database, and the socket shall not be losing its
 * last reference.
 *
 * XXXGL: unfortunately our callout(9) is not able to fully stop a locked
 * callout even when only two threads are involved: the callout itself and
 * the thread that does callout_stop().  See where softclock_call_cc() swaps
 * the callwheel lock to the callout lock and then checks cc_exec_cancel().
 * This is the race window.  If it happens, tcp_timer_enter() won't be
 * executed, but the pcb lock will be locked and released, hence we can't
 * free memory.  Until callout(9) is improved, just keep retrying.  In my
 * profiling I've seen such an event happening less than once per hour with
 * 20-30 Gbit/s of traffic.
 */
void
tcp_timer_stop(struct tcpcb *tp)
{
	struct inpcb *inp = tptoinpcb(tp);

	INP_WLOCK_ASSERT(inp);

	if (curthread->td_pflags & TDP_INTCPCALLOUT) {
		int stopped __diagused;

		stopped = callout_stop(&tp->t_callout);
		MPASS(stopped == 0);
	} else while (__predict_false(callout_stop(&tp->t_callout) == 0)) {
		INP_WUNLOCK(inp);
		kern_yield(PRI_UNCHANGED);
		INP_WLOCK(inp);
	}
}