/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <net/if.h>
#include <net/route.h>
#include <net/rss_config.h>
#include <net/vnet.h>
#include <net/netisr.h>

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_rss.h>
#include <netinet/in_systm.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_seq.h>
#include <netinet/cc/cc.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>

int tcp_persmin;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT | CTLFLAG_RW,
    &tcp_persmin, 0, sysctl_msec_to_ticks, "I",
    "minimum persistence interval");

int tcp_persmax;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT | CTLFLAG_RW,
    &tcp_persmax, 0, sysctl_msec_to_ticks, "I",
    "maximum persistence interval");

int tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT | CTLFLAG_RW,
    &tcp_keepinit, 0, sysctl_msec_to_ticks, "I",
    "time to establish connection");

int tcp_keepidle;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT | CTLFLAG_RW,
    &tcp_keepidle, 0, sysctl_msec_to_ticks, "I",
    "time before keepalive probes begin");

int tcp_keepintvl;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl,
    CTLTYPE_INT | CTLFLAG_RW, &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I",
    "time between keepalive probes");

int tcp_delacktime;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime,
    CTLTYPE_INT | CTLFLAG_RW, &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
    "Time before a delayed ACK is sent");

VNET_DEFINE(int, tcp_msl);
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(tcp_msl), 0, sysctl_msec_to_ticks, "I",
    "Maximum segment lifetime");

int tcp_rexmit_initial;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_initial, CTLTYPE_INT | CTLFLAG_RW,
    &tcp_rexmit_initial, 0, sysctl_msec_to_ticks, "I",
    "Initial Retransmission Timeout");

int tcp_rexmit_min;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT | CTLFLAG_RW,
    &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
    "Minimum Retransmission Timeout");

int tcp_rexmit_max;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_max, CTLTYPE_INT | CTLFLAG_RW,
    &tcp_rexmit_max, 0, sysctl_msec_to_ticks, "I",
    "Maximum Retransmission Timeout");

int tcp_rexmit_slop;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT | CTLFLAG_RW,
    &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
    "Retransmission Timer Slop");

VNET_DEFINE(int, tcp_always_keepalive) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_VNET|CTLFLAG_RW,
    &VNET_NAME(tcp_always_keepalive), 0,
    "Assume SO_KEEPALIVE on all TCP connections");

int tcp_fast_finwait2_recycle = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW,
    &tcp_fast_finwait2_recycle, 0,
    "Recycle closed FIN_WAIT_2 connections faster");

int tcp_finwait2_timeout;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT | CTLFLAG_RW,
    &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I",
    "FIN-WAIT2 timeout");

int tcp_keepcnt = TCPTV_KEEPCNT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
    "Number of keepalive probes to send");

/* max idle probes */
int tcp_maxpersistidle;

int tcp_rexmit_drop_options = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
    &tcp_rexmit_drop_options, 0,
    "Drop TCP options from 3rd and later retransmitted SYN");

int tcp_maxunacktime = TCPTV_MAXUNACKTIME;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxunacktime, CTLTYPE_INT | CTLFLAG_RW,
    &tcp_maxunacktime, 0, sysctl_msec_to_ticks, "I",
    "Maximum time (in ms) that a session can linger without making progress");

VNET_DEFINE(int, tcp_pmtud_blackhole_detect);
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_detect), 0,
    "Path MTU Discovery Black Hole Detection Enabled");

#ifdef INET
VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
    "Path MTU Discovery Black Hole Detection lowered MSS");
#endif

#ifdef INET6
VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
    "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
#endif

#ifdef RSS
static int per_cpu_timers = 1;
#else
static int per_cpu_timers = 0;
#endif
SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
    &per_cpu_timers, 0, "run tcp timers on all cpus");
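
/*
 * All of the intervals above are reported and set in milliseconds via
 * sysctl_msec_to_ticks(), but stored internally in kernel ticks.  For
 * example, keepalive probing could be made more aggressive from userland
 * (illustrative values, not recommendations):
 *
 *	sysctl net.inet.tcp.keepidle=600000	# probe after 10 min idle
 *	sysctl net.inet.tcp.keepintvl=30000	# 30 s between probes
 */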

static int
sysctl_net_inet_tcp_retries(SYSCTL_HANDLER_ARGS)
{
        int error, new;

        new = V_tcp_retries;
        error = sysctl_handle_int(oidp, &new, 0, req);
        if (error == 0 && req->newptr) {
                if ((new < 1) || (new > TCP_MAXRXTSHIFT))
                        error = EINVAL;
                else
                        V_tcp_retries = new;
        }
        return (error);
}

VNET_DEFINE(int, tcp_retries) = TCP_MAXRXTSHIFT;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, retries,
    CTLTYPE_INT | CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_retries), 0, sysctl_net_inet_tcp_retries, "I",
    "maximum number of consecutive timer based retransmissions");

/*
 * Map the given inp to a CPU id.
 *
 * This queries RSS if it's compiled in, else it defaults to the current
 * CPU ID.
 */
inline int
inp_to_cpuid(struct inpcb *inp)
{
        u_int cpuid;

        if (per_cpu_timers) {
#ifdef RSS
                cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
                if (cpuid == NETISR_CPUID_NONE)
                        return (curcpu);	/* XXX */
                else
                        return (cpuid);
#endif
                /*
                 * We don't have a flowid -> cpuid mapping, so cheat and
                 * just map unknown cpuids to curcpu.  Not the best, but
                 * apparently better than defaulting to swi 0.
                 */
                cpuid = inp->inp_flowid % (mp_maxid + 1);
                if (!CPU_ABSENT(cpuid))
                        return (cpuid);
                return (curcpu);
        } else {
                return (0);
        }
}

int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };

int tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */

/*
 * TCP timer processing.
 *
 * Each connection has 5 timers associated with it, which can be scheduled
 * simultaneously.  They are all serviced by a single callout, tcp_timer_enter().
 * This function executes the next pending timer via the tcp_timersw[] vector.
 * Each timer is supposed to return 'true' unless the connection was destroyed;
 * if a timer returns 'true', tcp_timer_enter() will schedule the callout for
 * the next pending timer.
 */

typedef bool tcp_timer_t(struct tcpcb *);
static tcp_timer_t tcp_timer_delack;
static tcp_timer_t tcp_timer_2msl;
static tcp_timer_t tcp_timer_keep;
static tcp_timer_t tcp_timer_persist;
static tcp_timer_t tcp_timer_rexmt;

static tcp_timer_t * const tcp_timersw[TT_N] = {
        [TT_DELACK] = tcp_timer_delack,
        [TT_REXMT] = tcp_timer_rexmt,
        [TT_PERSIST] = tcp_timer_persist,
        [TT_KEEP] = tcp_timer_keep,
        [TT_2MSL] = tcp_timer_2msl,
};
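
/*
 * Illustrative arming sequence (a sketch; real call sites live in
 * tcp_output.c, tcp_input.c and friends).  A stack arms one timer with a
 * tick-based delta, and the earliest pending deadline drives the single
 * per-connection callout:
 *
 *	tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
 *		=> callout fires tcp_timer_enter()
 *		=> tcp_timer_next() picks the earliest t_timers[] slot
 *		=> tcp_timersw[which](tp) runs the matching handler below
 */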

/*
 * tcp_output_locked() is a timer specific variation of a call to
 * tcp_output(), see tcp_var.h for the rest.  It handles a drop request
 * from advanced stacks, but keeps the tcpcb locked unless tcp_drop()
 * destroyed it.  Returns true if the tcpcb is valid and locked.
 */
static inline bool
tcp_output_locked(struct tcpcb *tp)
{
        int rv;

        INP_WLOCK_ASSERT(tptoinpcb(tp));

        if ((rv = tp->t_fb->tfb_tcp_output(tp)) < 0) {
                KASSERT(tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP,
                    ("TCP stack %s requested tcp_drop(%p)",
                    tp->t_fb->tfb_tcp_block_name, tp));
                tp = tcp_drop(tp, -rv);
        }

        return (tp != NULL);
}

static bool
tcp_timer_delack(struct tcpcb *tp)
{
        struct epoch_tracker et;
#if defined(INVARIANTS) || defined(VIMAGE)
        struct inpcb *inp = tptoinpcb(tp);
#endif
        bool rv;

        INP_WLOCK_ASSERT(inp);

        CURVNET_SET(inp->inp_vnet);
        tp->t_flags |= TF_ACKNOW;
        TCPSTAT_INC(tcps_delack);
        NET_EPOCH_ENTER(et);
        rv = tcp_output_locked(tp);
        NET_EPOCH_EXIT(et);
        CURVNET_RESTORE();

        return (rv);
}

static bool
tcp_timer_2msl(struct tcpcb *tp)
{
        struct inpcb *inp = tptoinpcb(tp);
        bool close = false;

        INP_WLOCK_ASSERT(inp);

        TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
        CURVNET_SET(inp->inp_vnet);
        tcp_log_end_status(tp, TCP_EI_STATUS_2MSL);
        tcp_free_sackholes(tp);
        /*
         * The 2 MSL timeout in shutdown went off.  If we're closed but
         * still waiting for the peer to close and the connection has been
         * idle too long, delete the connection control block.  Otherwise,
         * check again in a bit.
         *
         * If fast recycling of FIN_WAIT_2 connections is enabled, we are
         * in FIN_WAIT_2, and the receiver has closed, there's no point in
         * hanging onto this socket; just close it, ignoring the fact that
         * there were recent incoming segments.
         *
         * XXXGL: check if inp_socket shall always be !NULL here?
         */
        if (tp->t_state == TCPS_TIME_WAIT) {
                close = true;
        } else if (tp->t_state == TCPS_FIN_WAIT_2 &&
            tcp_fast_finwait2_recycle && inp->inp_socket &&
            (inp->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
                TCPSTAT_INC(tcps_finwait2_drops);
                close = true;
        } else {
                if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp))
                        tcp_timer_activate(tp, TT_2MSL, TP_KEEPINTVL(tp));
                else
                        close = true;
        }
        if (close) {
                struct epoch_tracker et;

                NET_EPOCH_ENTER(et);
                tp = tcp_close(tp);
                NET_EPOCH_EXIT(et);
        }
        CURVNET_RESTORE();

        return (tp != NULL);
}
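
/*
 * Keepalive timing at a glance (a sketch assuming the stock defaults;
 * per-socket values may differ via setsockopt(2) TCP_KEEPIDLE and
 * friends): probing starts after TP_KEEPIDLE(tp) of idle time, then up
 * to TP_KEEPCNT(tp) probes are sent TP_KEEPINTVL(tp) apart, so a dead
 * peer is dropped after roughly
 *
 *	TP_KEEPIDLE(tp) + TP_KEEPCNT(tp) * TP_KEEPINTVL(tp)
 *
 * e.g. 7200 s + 8 * 75 s = 7800 s with the defaults.
 */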
static bool
tcp_timer_keep(struct tcpcb *tp)
{
        struct epoch_tracker et;
        struct inpcb *inp = tptoinpcb(tp);
        struct tcptemp *t_template;

        INP_WLOCK_ASSERT(inp);

        TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
        CURVNET_SET(inp->inp_vnet);
        /*
         * Because we don't regularly reset the keepalive callout in
         * the ESTABLISHED state, it may be that we don't actually need
         * to send a keepalive yet.  If that occurs, schedule another
         * call for the next time the keepalive timer might expire.
         */
        if (TCPS_HAVEESTABLISHED(tp->t_state)) {
                u_int idletime;

                idletime = ticks - tp->t_rcvtime;
                if (idletime < TP_KEEPIDLE(tp)) {
                        tcp_timer_activate(tp, TT_KEEP,
                            TP_KEEPIDLE(tp) - idletime);
                        CURVNET_RESTORE();
                        return (true);
                }
        }

        /*
         * Keep-alive timer went off; send something
         * or drop connection if idle for too long.
         */
        TCPSTAT_INC(tcps_keeptimeo);
        if (tp->t_state < TCPS_ESTABLISHED)
                goto dropit;
        if ((V_tcp_always_keepalive ||
            inp->inp_socket->so_options & SO_KEEPALIVE) &&
            tp->t_state <= TCPS_CLOSING) {
                if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
                        goto dropit;
                /*
                 * Send a packet designed to force a response
                 * if the peer is up and reachable:
                 * either an ACK if the connection is still alive,
                 * or an RST if the peer has closed the connection
                 * due to timeout or reboot.
                 * Using sequence number tp->snd_una-1
                 * causes the transmitted zero-length segment
                 * to lie outside the receive window;
                 * by the protocol spec, this requires the
                 * correspondent TCP to respond.
                 */
                TCPSTAT_INC(tcps_keepprobe);
                t_template = tcpip_maketemplate(inp);
                if (t_template) {
                        NET_EPOCH_ENTER(et);
                        tcp_respond(tp, t_template->tt_ipgen,
                            &t_template->tt_t, (struct mbuf *)NULL,
                            tp->rcv_nxt, tp->snd_una - 1, 0);
                        NET_EPOCH_EXIT(et);
                        free(t_template, M_TEMP);
                }
                tcp_timer_activate(tp, TT_KEEP, TP_KEEPINTVL(tp));
        } else
                tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));

        CURVNET_RESTORE();
        return (true);

dropit:
        TCPSTAT_INC(tcps_keepdrops);
        NET_EPOCH_ENTER(et);
        tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
        tp = tcp_drop(tp, ETIMEDOUT);
        NET_EPOCH_EXIT(et);
        CURVNET_RESTORE();

        return (tp != NULL);
}

/*
 * Has this session exceeded the maximum time without seeing a substantive
 * acknowledgement?  If so, return true; otherwise false.
 */
static bool
tcp_maxunacktime_check(struct tcpcb *tp)
{

        /* Are we tracking this timer for this session? */
        if (TP_MAXUNACKTIME(tp) == 0)
                return (false);

        /* Do we have a current measurement? */
        if (tp->t_acktime == 0)
                return (false);

        /* Are we within the acceptable range? */
        if (TSTMP_GT(TP_MAXUNACKTIME(tp) + tp->t_acktime, (u_int)ticks))
                return (false);

        /* We exceeded the timer. */
        TCPSTAT_INC(tcps_progdrops);
        return (true);
}
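
/*
 * The interval between persist probes is not computed here but by
 * tcp_setpersist() in tcp_output.c: roughly, an srtt-derived value
 * scaled by tcp_backoff[t_rxtshift] and clamped by TCPT_RANGESET() to
 * [tcp_persmin, tcp_persmax], so probes into a zero window back off but
 * never stop entirely.  (A sketch of behavior implemented elsewhere;
 * see tcp_setpersist() for the authoritative computation.)
 */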
static bool
tcp_timer_persist(struct tcpcb *tp)
{
        struct epoch_tracker et;
#if defined(INVARIANTS) || defined(VIMAGE)
        struct inpcb *inp = tptoinpcb(tp);
#endif
        bool progdrop, rv;

        INP_WLOCK_ASSERT(inp);

        TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
        CURVNET_SET(inp->inp_vnet);
        /*
         * Persistence timer into zero window.
         * Force a byte to be output, if possible.
         */
        TCPSTAT_INC(tcps_persisttimeo);
        /*
         * Hack: if the peer is dead/unreachable, we do not
         * time out if the window is closed.  After a full
         * backoff, drop the connection if the idle time
         * (no responses to probes) reaches the maximum
         * backoff that we would use if retransmitting.
         * Also, drop the connection if we haven't been making
         * progress.
         */
        progdrop = tcp_maxunacktime_check(tp);
        if (progdrop || (tp->t_rxtshift >= V_tcp_retries &&
            (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
            ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff))) {
                if (!progdrop)
                        TCPSTAT_INC(tcps_persistdrop);
                tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
                goto dropit;
        }
        /*
         * If the user has closed the socket then drop a persisting
         * connection after a much reduced timeout.
         */
        if (tp->t_state > TCPS_CLOSE_WAIT &&
            (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
                TCPSTAT_INC(tcps_persistdrop);
                tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
                goto dropit;
        }
        tcp_setpersist(tp);
        tp->t_flags |= TF_FORCEDATA;
        NET_EPOCH_ENTER(et);
        if ((rv = tcp_output_locked(tp)))
                tp->t_flags &= ~TF_FORCEDATA;
        NET_EPOCH_EXIT(et);
        CURVNET_RESTORE();

        return (rv);

dropit:
        NET_EPOCH_ENTER(et);
        tp = tcp_drop(tp, ETIMEDOUT);
        NET_EPOCH_EXIT(et);
        CURVNET_RESTORE();

        return (tp != NULL);
}
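
/*
 * For reference, a worked example of the backoff computed in
 * tcp_timer_rexmt() below (illustrative numbers; real values depend on
 * the measured RTT and the tunables above): with TCP_REXMTVAL(tp) of
 * one second, successive timeouts scale by tcp_backoff[t_rxtshift],
 *
 *	shift:   1  2  3   4   5   6    7    8    9..12
 *	timeout: 2  4  8  16  32  64  128  256  512 seconds
 *
 * each clamped by TCPT_RANGESET() to [t_rttmin, tcp_rexmit_max].
 */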
static bool
tcp_timer_rexmt(struct tcpcb *tp)
{
        struct epoch_tracker et;
        struct inpcb *inp = tptoinpcb(tp);
        int rexmt;
        bool isipv6, rv;

        INP_WLOCK_ASSERT(inp);

        TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
        CURVNET_SET(inp->inp_vnet);
        if (tp->t_fb->tfb_tcp_rexmit_tmr) {
                /* The stack has a timer action too. */
                (*tp->t_fb->tfb_tcp_rexmit_tmr)(tp);
        }
        /*
         * Retransmission timer went off.  Message has not
         * been acked within retransmit interval.  Back off
         * to a longer retransmit interval and retransmit one segment.
         *
         * If we've either exceeded the maximum number of retransmissions,
         * or we've gone long enough without making progress, then drop
         * the session.
         */
        if (++tp->t_rxtshift > V_tcp_retries || tcp_maxunacktime_check(tp)) {
                if (tp->t_rxtshift > V_tcp_retries)
                        TCPSTAT_INC(tcps_timeoutdrop);
                tp->t_rxtshift = V_tcp_retries;
                tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
                NET_EPOCH_ENTER(et);
                tp = tcp_drop(tp, ETIMEDOUT);
                NET_EPOCH_EXIT(et);
                CURVNET_RESTORE();

                return (tp != NULL);
        }
        if (tp->t_state == TCPS_SYN_SENT) {
                /*
                 * If the SYN was retransmitted, indicate CWND to be
                 * limited to 1 segment in cc_conn_init().
                 */
                tp->snd_cwnd = 1;
        } else if (tp->t_rxtshift == 1) {
                /*
                 * First retransmit; record ssthresh and cwnd so they can
                 * be recovered if this turns out to be a "bad" retransmit.
                 * A retransmit is considered "bad" if an ACK for this
                 * segment is received within RTT/2 interval; the assumption
                 * here is that the ACK was already in flight.  See
                 * "On Estimating End-to-End Network Path Properties" by
                 * Allman and Paxson for more details.
                 */
                tp->snd_cwnd_prev = tp->snd_cwnd;
                tp->snd_ssthresh_prev = tp->snd_ssthresh;
                tp->snd_recover_prev = tp->snd_recover;
                if (IN_FASTRECOVERY(tp->t_flags))
                        tp->t_flags |= TF_WASFRECOVERY;
                else
                        tp->t_flags &= ~TF_WASFRECOVERY;
                if (IN_CONGRECOVERY(tp->t_flags))
                        tp->t_flags |= TF_WASCRECOVERY;
                else
                        tp->t_flags &= ~TF_WASCRECOVERY;
                if ((tp->t_flags & TF_RCVD_TSTMP) == 0)
                        tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
                /*
                 * In the event that we've negotiated timestamps,
                 * t_badrxtwin will be set to the value that we set the
                 * retransmitted packet's to_tsval to by tcp_output().
                 */
                tp->t_flags |= TF_PREVVALID;
                tcp_resend_sackholes(tp);
        } else {
                tp->t_flags &= ~TF_PREVVALID;
                tcp_free_sackholes(tp);
        }
        TCPSTAT_INC(tcps_rexmttimeo);
        if ((tp->t_state == TCPS_SYN_SENT) ||
            (tp->t_state == TCPS_SYN_RECEIVED))
                rexmt = tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift];
        else
                rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
        TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, tcp_rexmit_max);

        /*
         * We enter the path for PLMTUD if the connection is established
         * or in the FIN_WAIT_1 state; the reason for the latter is that
         * if the amount of data we send is very small, we could send it
         * in a couple of packets and proceed straight to FIN.  In that
         * case we won't catch the ESTABLISHED state.
         */
#ifdef INET6
        isipv6 = (inp->inp_vflag & INP_IPV6) ? true : false;
#else
        isipv6 = false;
#endif
        if (((V_tcp_pmtud_blackhole_detect == 1) ||
            (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) ||
            (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) &&
            ((tp->t_state == TCPS_ESTABLISHED) ||
            (tp->t_state == TCPS_FIN_WAIT_1))) {
                if (tp->t_rxtshift == 1) {
                        /*
                         * We enter blackhole detection after the first
                         * unsuccessful timer based retransmission.
                         * Then we reduce the MSS up to two times; each
                         * candidate value gets two retransmission tries,
                         * but only if it actually reduces the MSS.
                         */
                        tp->t_blackhole_enter = 2;
                        tp->t_blackhole_exit = tp->t_blackhole_enter;
                        if (isipv6) {
#ifdef INET6
                                if (tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss)
                                        tp->t_blackhole_exit += 2;
                                if (tp->t_maxseg > V_tcp_v6mssdflt &&
                                    V_tcp_v6pmtud_blackhole_mss > V_tcp_v6mssdflt)
                                        tp->t_blackhole_exit += 2;
#endif
                        } else {
#ifdef INET
                                if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss)
                                        tp->t_blackhole_exit += 2;
                                if (tp->t_maxseg > V_tcp_mssdflt &&
                                    V_tcp_pmtud_blackhole_mss > V_tcp_mssdflt)
                                        tp->t_blackhole_exit += 2;
#endif
                        }
                }
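
                /*
                 * A worked example of the window above (illustrative
                 * numbers, assuming default tunables): on an IPv4 path
                 * with t_maxseg = 1448, t_blackhole_exit grows from 2 to
                 * 4 (1448 > 1200) and then to 6 (1448 > V_tcp_mssdflt
                 * and 1200 > V_tcp_mssdflt), so the reduced-MSS
                 * candidates are tried at t_rxtshift 2 and 4.
                 */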
                if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
                    (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
                    (tp->t_rxtshift >= tp->t_blackhole_enter &&
                    tp->t_rxtshift < tp->t_blackhole_exit &&
                    (tp->t_rxtshift - tp->t_blackhole_enter) % 2 == 0)) {
                        /*
                         * Enter Path MTU Black-hole Detection mechanism:
                         * - Disable Path MTU Discovery (IP "DF" bit).
                         * - Reduce MTU to lower value than what we
                         *   negotiated with the peer.
                         */
                        if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
                                /* Record that we may have found a black hole. */
                                tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
                                /* Keep track of previous MSS. */
                                tp->t_pmtud_saved_maxseg = tp->t_maxseg;
                        }

                        /*
                         * Reduce the MSS to the blackhole value or to the
                         * default in an attempt to retransmit.
                         */
#ifdef INET6
                        if (isipv6 &&
                            tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss &&
                            V_tcp_v6pmtud_blackhole_mss > V_tcp_v6mssdflt) {
                                /* Use the sysctl tuneable blackhole MSS. */
                                tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
                                TCPSTAT_INC(tcps_pmtud_blackhole_activated);
                        } else if (isipv6) {
                                /* Use the default MSS. */
                                tp->t_maxseg = V_tcp_v6mssdflt;
                                /*
                                 * Disable Path MTU Discovery when we switch
                                 * to minmss.
                                 */
                                tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
                                TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
                        }
#endif
#if defined(INET6) && defined(INET)
                        else
#endif
#ifdef INET
                        if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss &&
                            V_tcp_pmtud_blackhole_mss > V_tcp_mssdflt) {
                                /* Use the sysctl tuneable blackhole MSS. */
                                tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
                                TCPSTAT_INC(tcps_pmtud_blackhole_activated);
                        } else {
                                /* Use the default MSS. */
                                tp->t_maxseg = V_tcp_mssdflt;
                                /*
                                 * Disable Path MTU Discovery when we switch
                                 * to minmss.
                                 */
                                tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
                                TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
                        }
#endif
                        /*
                         * Reset the slow-start flight size
                         * as it may depend on the new MSS.
                         */
                        if (CC_ALGO(tp)->conn_init != NULL)
                                CC_ALGO(tp)->conn_init(&tp->t_ccv);
                } else {
                        /*
                         * If further retransmissions are still unsuccessful
                         * with a lowered MTU, maybe this isn't a blackhole,
                         * so restore the previous MSS and the blackhole
                         * detection flags.
                         */
                        if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
                            (tp->t_rxtshift >= tp->t_blackhole_exit)) {
                                tp->t_flags2 |= TF2_PLPMTU_PMTUD;
                                tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
                                tp->t_maxseg = tp->t_pmtud_saved_maxseg;
                                if (tp->t_maxseg < V_tcp_mssdflt) {
                                        /*
                                         * The MSS is so small we should not
                                         * process incoming SACK's since we are
                                         * subject to attack in such a case.
                                         */
                                        tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT;
                                } else {
                                        tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT;
                                }
                                TCPSTAT_INC(tcps_pmtud_blackhole_failed);
                                /*
                                 * Reset the slow-start flight size as it
                                 * may depend on the new MSS.
                                 */
                                if (CC_ALGO(tp)->conn_init != NULL)
                                        CC_ALGO(tp)->conn_init(&tp->t_ccv);
                        }
                }
        }

        /*
         * Disable RFC1323 and SACK if we haven't got any response to
         * our third SYN to work around some broken terminal servers
         * (most of which have hopefully been retired) that have bad VJ
         * header compression code which trashes TCP segments containing
         * unknown-to-them TCP options.
         */
        if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
            (tp->t_rxtshift == 3))
                tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
        /*
         * If we backed off this far, notify the L3 protocol that we're having
         * connection problems.
         */
        if (tp->t_rxtshift > TCP_RTT_INVALIDATE) {
#ifdef INET6
                if ((inp->inp_vflag & INP_IPV6) != 0)
                        in6_losing(inp);
                else
#endif
                        in_losing(inp);
        }
        tp->snd_nxt = tp->snd_una;
        tp->snd_recover = tp->snd_max;
        /*
         * Force a segment to be sent.
         */
        tp->t_flags |= TF_ACKNOW;
        /*
         * If timing a segment in this window, stop the timer.
         */
        tp->t_rtttime = 0;

        /* Do not overwrite the snd_cwnd on SYN retransmissions. */
        if (tp->t_state != TCPS_SYN_SENT)
                cc_cong_signal(tp, NULL, CC_RTO);
        NET_EPOCH_ENTER(et);
        rv = tcp_output_locked(tp);
        NET_EPOCH_EXIT(et);
        CURVNET_RESTORE();

        return (rv);
}
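
/*
 * BB (black box) logging of timer events.  tcp_bblog_timer() below packs
 * the event and timer type into tlb_flex1 and, for TT_STARTING, stores
 * the timeout converted from ticks to milliseconds (rounded up) in
 * tlb_flex2.  For example (illustrative, with hz = 100), a 25-tick delta
 * logs as (25 * 1000 + 99) / 100 = 250 ms.
 */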
static void
tcp_bblog_timer(struct tcpcb *tp, tt_which which, tt_what what, uint32_t ticks)
{
        struct tcp_log_buffer *lgb;
        uint64_t ms;

        INP_WLOCK_ASSERT(tptoinpcb(tp));
        if (tcp_bblogging_on(tp))
                lgb = tcp_log_event(tp, NULL, NULL, NULL, TCP_LOG_RTO, 0, 0,
                    NULL, false, NULL, NULL, 0, NULL);
        else
                lgb = NULL;
        if (lgb != NULL) {
                lgb->tlb_flex1 = (what << 8) | which;
                if (what == TT_STARTING) {
                        /* Convert ticks to ms and store it in tlb_flex2. */
                        if (hz == 1000)
                                lgb->tlb_flex2 = ticks;
                        else {
                                ms = (((uint64_t)ticks * 1000) + (hz - 1)) / hz;
                                if (ms > UINT32_MAX)
                                        lgb->tlb_flex2 = UINT32_MAX;
                                else
                                        lgb->tlb_flex2 = (uint32_t)ms;
                        }
                }
        }
}

static inline tt_which
tcp_timer_next(struct tcpcb *tp, sbintime_t *precision)
{
        tt_which i, rv;
        sbintime_t after, before;

        for (i = 0, rv = TT_N, after = before = SBT_MAX; i < TT_N; i++) {
                if (tp->t_timers[i] < after) {
                        after = tp->t_timers[i];
                        rv = i;
                }
                before = MIN(before, tp->t_timers[i] + tp->t_precisions[i]);
        }
        if (precision != NULL)
                *precision = before - after;

        return (rv);
}

static void
tcp_timer_enter(void *xtp)
{
        struct tcpcb *tp = xtp;
        struct inpcb *inp = tptoinpcb(tp);
        sbintime_t precision;
        tt_which which;

        INP_WLOCK_ASSERT(inp);

        which = tcp_timer_next(tp, NULL);
        MPASS(which < TT_N);
        tp->t_timers[which] = SBT_MAX;
        tp->t_precisions[which] = 0;

        tcp_bblog_timer(tp, which, TT_PROCESSING, 0);
        if (tcp_timersw[which](tp)) {
                tcp_bblog_timer(tp, which, TT_PROCESSED, 0);
                if ((which = tcp_timer_next(tp, &precision)) != TT_N) {
                        MPASS(tp->t_state > TCPS_CLOSED);
                        callout_reset_sbt_on(&tp->t_callout,
                            tp->t_timers[which], precision, tcp_timer_enter,
                            tp, inp_to_cpuid(inp), C_ABSOLUTE);
                }
                INP_WUNLOCK(inp);
        }
}

/*
 * Activate or stop (delta == 0) a TCP timer.
 */
void
tcp_timer_activate(struct tcpcb *tp, tt_which which, u_int delta)
{
        struct inpcb *inp = tptoinpcb(tp);
        sbintime_t precision;
        tt_what what;

#ifdef TCP_OFFLOAD
        if (tp->t_flags & TF_TOE)
                return;
#endif

        INP_WLOCK_ASSERT(inp);
        MPASS(tp->t_state > TCPS_CLOSED);

        if (delta > 0) {
                what = TT_STARTING;
                callout_when(tick_sbt * delta, 0, C_HARDCLOCK,
                    &tp->t_timers[which], &tp->t_precisions[which]);
        } else {
                what = TT_STOPPING;
                tp->t_timers[which] = SBT_MAX;
        }
        tcp_bblog_timer(tp, which, what, delta);

        if ((which = tcp_timer_next(tp, &precision)) != TT_N)
                callout_reset_sbt_on(&tp->t_callout, tp->t_timers[which],
                    precision, tcp_timer_enter, tp, inp_to_cpuid(inp),
                    C_ABSOLUTE);
        else
                callout_stop(&tp->t_callout);
}

bool
tcp_timer_active(struct tcpcb *tp, tt_which which)
{

        INP_WLOCK_ASSERT(tptoinpcb(tp));

        return (tp->t_timers[which] != SBT_MAX);
}
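
/*
 * Callers typically pair tcp_timer_active() with tcp_timer_activate(),
 * e.g. (a sketch of the idiom used by the input path, not a verbatim
 * excerpt):
 *
 *	if (!tcp_timer_active(tp, TT_REXMT))
 *		tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
 */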

/*
 * Stop all timers associated with the tcpcb.
 * Called when the tcpcb moves to TCPS_CLOSED.
 */
void
tcp_timer_stop(struct tcpcb *tp)
{

        INP_WLOCK_ASSERT(tptoinpcb(tp));

        /*
         * We don't check the return value from callout_stop().  There are
         * two reasons why it can return 0.  First, a legitimate one: we
         * could have been called from the callout itself.  Second,
         * callout(9) has a bug.  It can race internally in
         * softclock_call_cc(), when the callout has already completed, but
         * cc_exec_curr still points at the callout.
         */
        (void)callout_stop(&tp->t_callout);
        /*
         * In case of being called from the callout itself, we must make
         * sure that we don't reschedule.
         */
        for (tt_which i = 0; i < TT_N; i++)
                tp->t_timers[i] = SBT_MAX;
}