/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_tcpdebug.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <net/if.h>
#include <net/route.h>
#include <net/rss_config.h>
#include <net/vnet.h>
#include <net/netisr.h>

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_rss.h>
#include <netinet/in_systm.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_seq.h>
#include <netinet/cc/cc.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif

int	tcp_persmin;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_persmin, 0, sysctl_msec_to_ticks, "I",
    "minimum persistence interval");

int	tcp_persmax;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_persmax, 0, sysctl_msec_to_ticks, "I",
    "maximum persistence interval");

int	tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepinit, 0, sysctl_msec_to_ticks, "I",
    "time to establish connection");
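/*
 * Note on units (illustrative): the interval sysctls in this file are
 * stored internally in kernel ticks but exported in milliseconds;
 * sysctl_msec_to_ticks converts in both directions.  For example, with
 * hz = 1000 (one tick per millisecond, a common but not universal
 * setting), a two-hour keepalive idle time would be set from userland as
 *
 *	sysctl net.inet.tcp.keepidle=7200000
 *
 * which stores 7200000 ticks in tcp_keepidle.
 */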
int	tcp_keepidle;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepidle, 0, sysctl_msec_to_ticks, "I",
    "time before keepalive probes begin");

int	tcp_keepintvl;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I",
    "time between keepalive probes");

int	tcp_delacktime;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime,
    CTLTYPE_INT|CTLFLAG_RW, &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
    "Time before a delayed ACK is sent");

int	tcp_msl;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");

int	tcp_rexmit_initial;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_initial, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_rexmit_initial, 0, sysctl_msec_to_ticks, "I",
    "Initial Retransmission Timeout");

int	tcp_rexmit_min;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
    "Minimum Retransmission Timeout");

int	tcp_rexmit_slop;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
    "Retransmission Timer Slop");

VNET_DEFINE(int, tcp_always_keepalive) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_VNET|CTLFLAG_RW,
    &VNET_NAME(tcp_always_keepalive), 0,
    "Assume SO_KEEPALIVE on all TCP connections");

int	tcp_fast_finwait2_recycle = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW,
    &tcp_fast_finwait2_recycle, 0,
    "Recycle closed FIN_WAIT_2 connections faster");

int	tcp_finwait2_timeout;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I",
    "FIN-WAIT2 timeout");

int	tcp_keepcnt = TCPTV_KEEPCNT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
    "Number of keepalive probes to send");

/* max idle probes */
int	tcp_maxpersistidle;

int	tcp_rexmit_drop_options = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
    &tcp_rexmit_drop_options, 0,
    "Drop TCP options from 3rd and later retransmitted SYN");

VNET_DEFINE(int, tcp_pmtud_blackhole_detect);
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_detect), 0,
    "Path MTU Discovery Black Hole Detection Enabled");

#ifdef INET
VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
    "Path MTU Discovery Black Hole Detection lowered MSS");
#endif

#ifdef INET6
VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
    "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
#endif

#ifdef RSS
static int	per_cpu_timers = 1;
#else
static int	per_cpu_timers = 0;
#endif
SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
    &per_cpu_timers, 0, "run tcp timers on all cpus");

/*
 * Map the given inp to a CPU id.
 *
 * This queries RSS if it's compiled in, else it defaults to the current
 * CPU ID.
 */
inline int
inp_to_cpuid(struct inpcb *inp)
{
	u_int cpuid;

#ifdef RSS
	if (per_cpu_timers) {
		cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
		if (cpuid == NETISR_CPUID_NONE)
			return (curcpu);	/* XXX */
		else
			return (cpuid);
	}
#else
	/* Legacy, pre-RSS behaviour */
	if (per_cpu_timers) {
		/*
		 * We don't have a flowid -> cpuid mapping, so cheat and
		 * just map unknown cpuids to curcpu.  Not the best, but
		 * apparently better than defaulting to swi 0.
		 */
		cpuid = inp->inp_flowid % (mp_maxid + 1);
		if (!CPU_ABSENT(cpuid))
			return (cpuid);
		return (curcpu);
	}
#endif
	/* Default for both RSS and non-RSS: cpuid 0. */
	return (0);
}
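/*
 * Example (illustrative only): in the legacy non-RSS path with
 * mp_maxid + 1 == 8 CPUs, a connection whose inp_flowid happens to be
 * 0x12345 (74565) is mapped to CPU 74565 % 8 == 5, provided CPU 5 is
 * present.  The flowid value here is made up for the arithmetic; real
 * flowids come from the NIC or the protocol hash.
 */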
/*
 * TCP protocol timeout routine called every 500 ms.
 * Updates timestamps used for TCP and causes finite state machine
 * actions if timers expire.
 */
void
tcp_slowtimo(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		(void) tcp_tw_2msl_scan(0);
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
}

int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };

int	tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */
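/*
 * Sanity-check example for the table above: the doubling entries
 * 1 + 2 + 4 + 8 + 16 + 32 + 64 + 128 + 256 sum to 511, and the four
 * capped entries contribute 4 * 512 = 2048, giving 511 + 2048 = 2559,
 * which is exactly tcp_totbackoff.  Keep the two in sync if the table
 * is ever changed.
 */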
/*
 * TCP timer processing.
 */

void
tcp_timer_delack(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);

	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_delack) ||
	    !callout_active(&tp->t_timers->tt_delack)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_delack);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	tp->t_flags |= TF_ACKNOW;
	TCPSTAT_INC(tcps_delack);
	(void) tp->t_fb->tfb_tcp_output(tp);
	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
}

void
tcp_inpinfo_lock_del(struct inpcb *inp, struct tcpcb *tp)
{
	/*
	 * Release the write lock only while the tcpcb still exists;
	 * when tcp_close()/tcp_drop() returns NULL the inpcb has
	 * already been unlocked.
	 */
	if (inp && tp != NULL)
		INP_WUNLOCK(inp);
}

void
tcp_timer_2msl(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	struct epoch_tracker et;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	tcp_free_sackholes(tp);
	if (callout_pending(&tp->t_timers->tt_2msl) ||
	    !callout_active(&tp->t_timers->tt_2msl)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_2msl);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	/*
	 * The 2 MSL timeout in shutdown went off.  If we're closed but
	 * still waiting for the peer to close and the connection has been
	 * idle too long, delete the connection control block.  Otherwise,
	 * check again in a bit.
	 *
	 * If the connection is in TIME_WAIT state, just ignore this
	 * timeout; it is handled in tcp_tw_2msl_scan().
	 *
	 * If fast recycling of FIN_WAIT_2 connections is enabled, the
	 * connection is in FIN_WAIT_2, and the receiver has closed,
	 * there's no point in hanging onto the socket: close it now,
	 * ignoring the fact that there were recent incoming segments.
	 */
	if ((inp->inp_flags & INP_TIMEWAIT) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 &&
	    tp->t_inpcb && tp->t_inpcb->inp_socket &&
	    (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
		TCPSTAT_INC(tcps_finwait2_drops);
		if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
		NET_EPOCH_ENTER(et);
		tp = tcp_close(tp);
		NET_EPOCH_EXIT(et);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	} else {
		if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
			callout_reset(&tp->t_timers->tt_2msl,
			    TP_KEEPINTVL(tp), tcp_timer_2msl, tp);
		} else {
			if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
				tcp_inpinfo_lock_del(inp, tp);
				goto out;
			}
			NET_EPOCH_ENTER(et);
			tp = tcp_close(tp);
			NET_EPOCH_EXIT(et);
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
	}

#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);

	if (tp != NULL)
		INP_WUNLOCK(inp);
out:
	CURVNET_RESTORE();
}
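/*
 * Worked example (stock defaults, hedged): TP_MAXIDLE(tp) is the
 * keepalive probe count times the probe interval.  With
 * net.inet.tcp.keepcnt = 8 and net.inet.tcp.keepintvl = 75000 ms, a
 * FIN_WAIT_2 connection that is not fast-recycled is re-checked every
 * TP_KEEPINTVL(tp) (75 s) and finally closed once it has been idle for
 * more than 8 * 75 = 600 seconds.  Actual values follow the sysctls
 * and any per-socket overrides.
 */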
void
tcp_timer_keep(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct tcptemp *t_template;
	struct inpcb *inp;
	struct epoch_tracker et;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_keep) ||
	    !callout_active(&tp->t_timers->tt_keep)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_keep);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));

	/*
	 * Because we don't regularly reset the keepalive callout in
	 * the ESTABLISHED state, it may be that we don't actually need
	 * to send a keepalive yet.  If that occurs, schedule another
	 * call for the next time the keepalive timer might expire.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state)) {
		u_int idletime;

		idletime = ticks - tp->t_rcvtime;
		if (idletime < TP_KEEPIDLE(tp)) {
			callout_reset(&tp->t_timers->tt_keep,
			    TP_KEEPIDLE(tp) - idletime, tcp_timer_keep, tp);
			INP_WUNLOCK(inp);
			CURVNET_RESTORE();
			return;
		}
	}

	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	TCPSTAT_INC(tcps_keeptimeo);
	if (tp->t_state < TCPS_ESTABLISHED)
		goto dropit;
	if ((V_tcp_always_keepalive ||
	    inp->inp_socket->so_options & SO_KEEPALIVE) &&
	    tp->t_state <= TCPS_CLOSING) {
		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
			goto dropit;
		/*
		 * Send a packet designed to force a response
		 * if the peer is up and reachable:
		 * either an ACK if the connection is still alive,
		 * or an RST if the peer has closed the connection
		 * due to timeout or reboot.
		 * Using sequence number tp->snd_una-1
		 * causes the transmitted zero-length segment
		 * to lie outside the receive window;
		 * by the protocol spec, this requires the
		 * correspondent TCP to respond.
		 */
		TCPSTAT_INC(tcps_keepprobe);
		t_template = tcpip_maketemplate(inp);
		if (t_template) {
			tcp_respond(tp, t_template->tt_ipgen,
			    &t_template->tt_t, (struct mbuf *)NULL,
			    tp->rcv_nxt, tp->snd_una - 1, 0);
			free(t_template, M_TEMP);
		}
		callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
		    tcp_timer_keep, tp);
	} else
		callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
		    tcp_timer_keep, tp);

#ifdef TCPDEBUG
	if (inp->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
	return;

dropit:
	TCPSTAT_INC(tcps_keepdrops);
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	NET_EPOCH_ENTER(et);
	tp = tcp_drop(tp, ETIMEDOUT);

#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	NET_EPOCH_EXIT(et);
	tcp_inpinfo_lock_del(inp, tp);
out:
	CURVNET_RESTORE();
}
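/*
 * Worked example (stock defaults, for illustration): with
 * net.inet.tcp.keepidle = 7200000 ms, keepintvl = 75000 ms and
 * keepcnt = 8, an ESTABLISHED connection with SO_KEEPALIVE set (or
 * with net.inet.tcp.always_keepalive enabled) sees its first probe
 * after two hours of silence, then one probe every 75 seconds; if
 * none of the eight probes is answered, the connection is dropped
 * roughly 7200 + 600 seconds after the last segment was received.
 */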
void
tcp_timer_persist(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	struct epoch_tracker et;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_persist) ||
	    !callout_active(&tp->t_timers->tt_persist)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_persist);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	/*
	 * Persistence timer into zero window.
	 * Force a byte to be output, if possible.
	 */
	TCPSTAT_INC(tcps_persisttimeo);
	/*
	 * Hack: if the peer is dead/unreachable, we do not
	 * time out if the window is closed.  After a full
	 * backoff, drop the connection if the idle time
	 * (no responses to probes) reaches the maximum
	 * backoff that we would use if retransmitting.
	 */
	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
	    ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
		TCPSTAT_INC(tcps_persistdrop);
		if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
		NET_EPOCH_ENTER(et);
		tp = tcp_drop(tp, ETIMEDOUT);
		NET_EPOCH_EXIT(et);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	/*
	 * If the user has closed the socket then drop a persisting
	 * connection after a much reduced timeout.
	 */
	if (tp->t_state > TCPS_CLOSE_WAIT &&
	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
		TCPSTAT_INC(tcps_persistdrop);
		if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
		NET_EPOCH_ENTER(et);
		tp = tcp_drop(tp, ETIMEDOUT);
		NET_EPOCH_EXIT(et);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	tcp_setpersist(tp);
	tp->t_flags |= TF_FORCEDATA;
	(void) tp->t_fb->tfb_tcp_output(tp);
	tp->t_flags &= ~TF_FORCEDATA;

#ifdef TCPDEBUG
	if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	INP_WUNLOCK(inp);
out:
	CURVNET_RESTORE();
}
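/*
 * Illustration (defaults, not normative): tcp_setpersist() backs the
 * probe interval off between net.inet.tcp.persmin (5 s) and
 * net.inet.tcp.persmax (60 s), so a peer that keeps advertising a
 * zero window is probed at roughly 5, 10, 20, 40, 60, 60, ... second
 * intervals while it remains reachable; the exact schedule also
 * depends on the smoothed RTT.
 */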
void
tcp_timer_rexmt(void *xtp)
{
	struct tcpcb *tp = xtp;
	CURVNET_SET(tp->t_vnet);
	int rexmt;
	struct inpcb *inp;
	struct epoch_tracker et;
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_rexmt) ||
	    !callout_active(&tp->t_timers->tt_rexmt)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_rexmt);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	tcp_free_sackholes(tp);
	TCP_LOG_EVENT(tp, NULL, NULL, NULL, TCP_LOG_RTO, 0, 0, NULL, false);
	if (tp->t_fb->tfb_tcp_rexmit_tmr) {
		/* The stack has a timer action too. */
		(*tp->t_fb->tfb_tcp_rexmit_tmr)(tp);
	}
	/*
	 * Retransmission timer went off.  Message has not
	 * been acked within retransmit interval.  Back off
	 * to a longer retransmit interval and retransmit one segment.
	 */
	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
		tp->t_rxtshift = TCP_MAXRXTSHIFT;
		TCPSTAT_INC(tcps_timeoutdrop);
		if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
		NET_EPOCH_ENTER(et);
		tp = tcp_drop(tp, ETIMEDOUT);
		NET_EPOCH_EXIT(et);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	if (tp->t_state == TCPS_SYN_SENT) {
		/*
		 * If the SYN was retransmitted, indicate CWND to be
		 * limited to 1 segment in cc_conn_init().
		 */
		tp->snd_cwnd = 1;
	} else if (tp->t_rxtshift == 1) {
		/*
		 * First retransmit; record ssthresh and cwnd so they can
		 * be recovered if this turns out to be a "bad" retransmit.
		 * A retransmit is considered "bad" if an ACK for this
		 * segment is received within RTT/2 interval; the assumption
		 * here is that the ACK was already in flight.  See
		 * "On Estimating End-to-End Network Path Properties" by
		 * Allman and Paxson for more details.
		 */
		tp->snd_cwnd_prev = tp->snd_cwnd;
		tp->snd_ssthresh_prev = tp->snd_ssthresh;
		tp->snd_recover_prev = tp->snd_recover;
		if (IN_FASTRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASFRECOVERY;
		else
			tp->t_flags &= ~TF_WASFRECOVERY;
		if (IN_CONGRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASCRECOVERY;
		else
			tp->t_flags &= ~TF_WASCRECOVERY;
		if ((tp->t_flags & TF_RCVD_TSTMP) == 0)
			tp->t_badrxtwin = ticks +
			    (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
		/*
		 * If timestamps were negotiated, t_badrxtwin is instead
		 * set by tcp_output() to the to_tsval it places in the
		 * retransmitted packet.
		 */
		tp->t_flags |= TF_PREVVALID;
	} else
		tp->t_flags &= ~TF_PREVVALID;
	TCPSTAT_INC(tcps_rexmttimeo);
	if ((tp->t_state == TCPS_SYN_SENT) ||
	    (tp->t_state == TCPS_SYN_RECEIVED))
		rexmt = tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift];
	else
		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
	TCPT_RANGESET(tp->t_rxtcur, rexmt,
	    tp->t_rttmin, TCPTV_REXMTMAX);
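	/*
	 * Worked example (illustrative): for a SYN in SYN_SENT with the
	 * default net.inet.tcp.rexmit_initial of 1000 ms, successive
	 * timeouts scale by tcp_backoff[], i.e. 1, 2, 4, 8, ... seconds,
	 * until TCPT_RANGESET() clamps the interval at TCPTV_REXMTMAX
	 * (64 s).  For an established connection the base value is
	 * TCP_REXMTVAL(tp), derived from srtt and rttvar, instead.
	 */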
	/*
	 * We enter the path for PLMTUD if the connection is established
	 * or in FIN_WAIT_1.  The latter is included because, if the
	 * amount of data we send is very small, we could send it in a
	 * couple of packets and proceed straight to FIN, never passing
	 * through a state where the ESTABLISHED check would fire.
	 */
	if (V_tcp_pmtud_blackhole_detect &&
	    ((tp->t_state == TCPS_ESTABLISHED) ||
	    (tp->t_state == TCPS_FIN_WAIT_1))) {
#ifdef INET6
		int isipv6;
#endif

		/*
		 * The idea is that each stage of the MTU probe (usually
		 * 1448 -> 1188 -> 524) gets two chances to recover before
		 * we clamp down further; the 'tp->t_rxtshift % 2 == 0'
		 * test takes care of that.
		 */
		if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
		    (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
		    (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 &&
		    tp->t_rxtshift % 2 == 0)) {
			/*
			 * Enter Path MTU Black-hole Detection mechanism:
			 * - Disable Path MTU Discovery (IP "DF" bit).
			 * - Reduce MTU to lower value than what we
			 *   negotiated with the peer.
			 */
			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
				/* Record that we may have found a black hole. */
				tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
				/* Keep track of previous MSS. */
				tp->t_pmtud_saved_maxseg = tp->t_maxseg;
			}

			/*
			 * Reduce the MSS to the blackhole value or to the
			 * default in an attempt to retransmit.
			 */
#ifdef INET6
			isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
			if (isipv6 &&
			    tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
				/* Use the sysctl tunable blackhole MSS. */
				tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated);
			} else if (isipv6) {
				/* Use the default MSS. */
				tp->t_maxseg = V_tcp_v6mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch
				 * to minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
			}
#endif
#if defined(INET6) && defined(INET)
			else
#endif
#ifdef INET
			if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
				/* Use the sysctl tunable blackhole MSS. */
				tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated);
			} else {
				/* Use the default MSS. */
				tp->t_maxseg = V_tcp_mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch
				 * to minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
			}
#endif
			/*
			 * Reset the slow-start flight size
			 * as it may depend on the new MSS.
			 */
			if (CC_ALGO(tp)->conn_init != NULL)
				CC_ALGO(tp)->conn_init(tp->ccv);
		} else {
			/*
			 * If further retransmissions are still unsuccessful
			 * with a lowered MTU, maybe this isn't a blackhole,
			 * so we restore the previous MSS and blackhole
			 * detection flags.  The limit '6' is determined by
			 * giving each probe stage (1448, 1188, 524) two
			 * chances to recover.
			 */
			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
			    (tp->t_rxtshift >= 6)) {
				tp->t_flags2 |= TF2_PLPMTU_PMTUD;
				tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
				tp->t_maxseg = tp->t_pmtud_saved_maxseg;
				TCPSTAT_INC(tcps_pmtud_blackhole_failed);
				/*
				 * Reset the slow-start flight size as it
				 * may depend on the new MSS.
				 */
				if (CC_ALGO(tp)->conn_init != NULL)
					CC_ALGO(tp)->conn_init(tp->ccv);
			}
		}
	}
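	/*
	 * Timeline sketch of the logic above (IPv4, defaults assumed):
	 * retransmits at shifts 1 and 2 fail with a full-sized segment,
	 * so shift 2 clamps the MSS to net.inet.tcp.pmtud_blackhole_mss
	 * (1200); if shifts 3 and 4 also fail, shift 4 clamps further to
	 * V_tcp_mssdflt and stops setting DF; if nothing has gotten
	 * through by shift 6, the saved MSS and PMTUD flags are restored
	 * and the episode is counted as a failed detection instead.
	 */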
	/*
	 * Disable RFC1323 and SACK if we haven't got any response to
	 * our third SYN to work-around some broken terminal servers
	 * (most of which have hopefully been retired) that have bad VJ
	 * header compression code which trashes TCP segments containing
	 * unknown-to-them TCP options.
	 */
	if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
	    (tp->t_rxtshift == 3))
		tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
	/*
	 * If we backed off this far, notify the L3 protocol that we're having
	 * connection problems.
	 */
	if (tp->t_rxtshift > TCP_RTT_INVALIDATE) {
#ifdef INET6
		if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
			in6_losing(tp->t_inpcb);
		else
#endif
			in_losing(tp->t_inpcb);
	}
	tp->snd_nxt = tp->snd_una;
	tp->snd_recover = tp->snd_max;
	/*
	 * Force a segment to be sent.
	 */
	tp->t_flags |= TF_ACKNOW;
	/*
	 * If timing a segment in this window, stop the timer.
	 */
	tp->t_rtttime = 0;

	cc_cong_signal(tp, NULL, CC_RTO);

	(void) tp->t_fb->tfb_tcp_output(tp);

#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	INP_WUNLOCK(inp);
out:
	CURVNET_RESTORE();
}

void
tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
{
	struct callout *t_callout;
	callout_func_t *f_callout;
	struct inpcb *inp = tp->t_inpcb;
	int cpu = inp_to_cpuid(inp);

#ifdef TCP_OFFLOAD
	if (tp->t_flags & TF_TOE)
		return;
#endif

	if (tp->t_timers->tt_flags & TT_STOPPED)
		return;

	switch (timer_type) {
	case TT_DELACK:
		t_callout = &tp->t_timers->tt_delack;
		f_callout = tcp_timer_delack;
		break;
	case TT_REXMT:
		t_callout = &tp->t_timers->tt_rexmt;
		f_callout = tcp_timer_rexmt;
		break;
	case TT_PERSIST:
		t_callout = &tp->t_timers->tt_persist;
		f_callout = tcp_timer_persist;
		break;
	case TT_KEEP:
		t_callout = &tp->t_timers->tt_keep;
		f_callout = tcp_timer_keep;
		break;
	case TT_2MSL:
		t_callout = &tp->t_timers->tt_2msl;
		f_callout = tcp_timer_2msl;
		break;
	default:
		if (tp->t_fb->tfb_tcp_timer_activate) {
			tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta);
			return;
		}
		panic("tp %p bad timer_type %#x", tp, timer_type);
	}
	if (delta == 0) {
		callout_stop(t_callout);
	} else {
		callout_reset_on(t_callout, delta, f_callout, tp, cpu);
	}
}

int
tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;

	switch (timer_type) {
	case TT_DELACK:
		t_callout = &tp->t_timers->tt_delack;
		break;
	case TT_REXMT:
		t_callout = &tp->t_timers->tt_rexmt;
		break;
	case TT_PERSIST:
		t_callout = &tp->t_timers->tt_persist;
		break;
	case TT_KEEP:
		t_callout = &tp->t_timers->tt_keep;
		break;
	case TT_2MSL:
		t_callout = &tp->t_timers->tt_2msl;
		break;
	default:
		if (tp->t_fb->tfb_tcp_timer_active) {
			return (tp->t_fb->tfb_tcp_timer_active(tp, timer_type));
		}
		panic("tp %p bad timer_type %#x", tp, timer_type);
	}
	return (callout_active(t_callout));
}
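/*
 * Usage sketch (the pattern seen in callers such as tcp_output()): a
 * delta of zero stops a timer, a non-zero delta (in ticks) arms or
 * re-arms it, and tcp_timer_active() queries it, e.g.
 *
 *	tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);	(arm)
 *	tcp_timer_activate(tp, TT_REXMT, 0);		(stop)
 *	if (!tcp_timer_active(tp, TT_PERSIST)) ...	(query)
 */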
/*
 * Stop the timer from running, and apply a flag
 * against the timer_flags that will force the
 * timer never to run.  The flag is needed to assure
 * a race does not leave it running and cause
 * the timer to possibly restart itself (keep and persist
 * especially do this).
 */
int
tcp_timer_suspend(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;
	uint32_t t_flags;

	switch (timer_type) {
	case TT_DELACK:
		t_flags = TT_DELACK_SUS;
		t_callout = &tp->t_timers->tt_delack;
		break;
	case TT_REXMT:
		t_flags = TT_REXMT_SUS;
		t_callout = &tp->t_timers->tt_rexmt;
		break;
	case TT_PERSIST:
		t_flags = TT_PERSIST_SUS;
		t_callout = &tp->t_timers->tt_persist;
		break;
	case TT_KEEP:
		t_flags = TT_KEEP_SUS;
		t_callout = &tp->t_timers->tt_keep;
		break;
	case TT_2MSL:
		t_flags = TT_2MSL_SUS;
		t_callout = &tp->t_timers->tt_2msl;
		break;
	default:
		panic("tp:%p bad timer_type 0x%x", tp, timer_type);
	}
	tp->t_timers->tt_flags |= t_flags;
	return (callout_stop(t_callout));
}

void
tcp_timers_unsuspend(struct tcpcb *tp, uint32_t timer_type)
{
	switch (timer_type) {
	case TT_DELACK:
		if (tp->t_timers->tt_flags & TT_DELACK_SUS) {
			tp->t_timers->tt_flags &= ~TT_DELACK_SUS;
			if (tp->t_flags & TF_DELACK) {
				/* A delayed ACK is pending; re-arm the timer. */
				tp->t_flags &= ~TF_DELACK;
				tcp_timer_activate(tp, TT_DELACK,
				    tcp_delacktime);
			}
		}
		break;
	case TT_REXMT:
		if (tp->t_timers->tt_flags & TT_REXMT_SUS) {
			tp->t_timers->tt_flags &= ~TT_REXMT_SUS;
			if (SEQ_GT(tp->snd_max, tp->snd_una) &&
			    (tcp_timer_active((tp), TT_PERSIST) == 0) &&
			    tp->snd_wnd) {
				/* We have outstanding data; activate the timer. */
				tcp_timer_activate(tp, TT_REXMT,
				    tp->t_rxtcur);
			}
		}
		break;
	case TT_PERSIST:
		if (tp->t_timers->tt_flags & TT_PERSIST_SUS) {
			tp->t_timers->tt_flags &= ~TT_PERSIST_SUS;
			if (tp->snd_wnd == 0) {
				/* Activate the persist timer. */
				tp->t_rxtshift = 0;
				tcp_setpersist(tp);
			}
		}
		break;
	case TT_KEEP:
		if (tp->t_timers->tt_flags & TT_KEEP_SUS) {
			tp->t_timers->tt_flags &= ~TT_KEEP_SUS;
			tcp_timer_activate(tp, TT_KEEP,
			    TCPS_HAVEESTABLISHED(tp->t_state) ?
			    TP_KEEPIDLE(tp) : TP_KEEPINIT(tp));
		}
		break;
	case TT_2MSL:
		if (tp->t_timers->tt_flags & TT_2MSL_SUS) {
			tp->t_timers->tt_flags &= ~TT_2MSL_SUS;
			if ((tp->t_state == TCPS_FIN_WAIT_2) &&
			    ((tp->t_inpcb->inp_socket == NULL) ||
			    (tp->t_inpcb->inp_socket->so_rcv.sb_state &
			    SBS_CANTRCVMORE))) {
				/* Start the 2MSL timer. */
				tcp_timer_activate(tp, TT_2MSL,
				    (tcp_fast_finwait2_recycle) ?
				    tcp_finwait2_timeout : TP_MAXIDLE(tp));
			}
		}
		break;
	default:
		panic("tp:%p bad timer_type 0x%x", tp, timer_type);
	}
}
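/*
 * Pairing sketch: suspend/unsuspend exist so that a timer can be
 * quiesced across an operation that must not race with it, e.g.
 * (hypothetical call site) when handing a connection to another TCP
 * function block:
 *
 *	tcp_timer_suspend(tp, TT_REXMT);
 *	... switch tp->t_fb ...
 *	tcp_timers_unsuspend(tp, TT_REXMT);
 */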
void
tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;

	tp->t_timers->tt_flags |= TT_STOPPED;
	switch (timer_type) {
	case TT_DELACK:
		t_callout = &tp->t_timers->tt_delack;
		break;
	case TT_REXMT:
		t_callout = &tp->t_timers->tt_rexmt;
		break;
	case TT_PERSIST:
		t_callout = &tp->t_timers->tt_persist;
		break;
	case TT_KEEP:
		t_callout = &tp->t_timers->tt_keep;
		break;
	case TT_2MSL:
		t_callout = &tp->t_timers->tt_2msl;
		break;
	default:
		if (tp->t_fb->tfb_tcp_timer_stop) {
			/*
			 * XXXrrs we need to look at this with the
			 * stop case below (flags).
			 */
			tp->t_fb->tfb_tcp_timer_stop(tp, timer_type);
			return;
		}
		panic("tp %p bad timer_type %#x", tp, timer_type);
	}

	if (callout_async_drain(t_callout, tcp_timer_discard) == 0) {
		/*
		 * Can't stop the callout; defer the actual tcpcb
		 * deletion to the last one.  We do this using the
		 * async drain function and incrementing the count
		 * in tt_draincnt.
		 */
		tp->t_timers->tt_draincnt++;
	}
}