1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 5 * The Regents of the University of California. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 * 31 * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 32 */ 33 34 #include <sys/cdefs.h> 35 __FBSDID("$FreeBSD$"); 36 37 #include "opt_inet.h" 38 #include "opt_inet6.h" 39 #include "opt_tcpdebug.h" 40 #include "opt_rss.h" 41 42 #include <sys/param.h> 43 #include <sys/kernel.h> 44 #include <sys/lock.h> 45 #include <sys/mbuf.h> 46 #include <sys/mutex.h> 47 #include <sys/protosw.h> 48 #include <sys/smp.h> 49 #include <sys/socket.h> 50 #include <sys/socketvar.h> 51 #include <sys/sysctl.h> 52 #include <sys/systm.h> 53 54 #include <net/if.h> 55 #include <net/route.h> 56 #include <net/rss_config.h> 57 #include <net/vnet.h> 58 #include <net/netisr.h> 59 60 #include <netinet/in.h> 61 #include <netinet/in_kdtrace.h> 62 #include <netinet/in_pcb.h> 63 #include <netinet/in_rss.h> 64 #include <netinet/in_systm.h> 65 #ifdef INET6 66 #include <netinet6/in6_pcb.h> 67 #endif 68 #include <netinet/ip_var.h> 69 #include <netinet/tcp.h> 70 #include <netinet/tcp_fsm.h> 71 #include <netinet/tcp_log_buf.h> 72 #include <netinet/tcp_timer.h> 73 #include <netinet/tcp_var.h> 74 #include <netinet/tcp_seq.h> 75 #include <netinet/cc/cc.h> 76 #ifdef INET6 77 #include <netinet6/tcp6_var.h> 78 #endif 79 #include <netinet/tcpip.h> 80 #ifdef TCPDEBUG 81 #include <netinet/tcp_debug.h> 82 #endif 83 84 int tcp_persmin; 85 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW, 86 &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval"); 87 88 int tcp_persmax; 89 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW, 90 &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval"); 91 92 int tcp_keepinit; 93 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, 94 &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection"); 95 96 int tcp_keepidle; 97 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW, 98 &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin"); 99 100 int tcp_keepintvl; 101 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW, 102 &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes"); 103 104 int tcp_delacktime; 105 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW, 106 &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", 107 "Time before a delayed ACK is sent"); 108 109 int tcp_msl; 110 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, 111 &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); 112 113 int tcp_rexmit_initial; 114 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_initial, CTLTYPE_INT|CTLFLAG_RW, 115 &tcp_rexmit_initial, 0, sysctl_msec_to_ticks, "I", 116 "Initial Retransmission Timeout"); 117 118 int tcp_rexmit_min; 119 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW, 120 &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", 121 "Minimum Retransmission Timeout"); 122 123 int tcp_rexmit_slop; 124 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW, 125 &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", 126 "Retransmission Timer Slop"); 127 128 VNET_DEFINE(int, tcp_always_keepalive) = 1; 129 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_VNET|CTLFLAG_RW, 130 &VNET_NAME(tcp_always_keepalive) , 0, 131 "Assume SO_KEEPALIVE on all TCP connections"); 132 133 int tcp_fast_finwait2_recycle = 0; 134 SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, 135 &tcp_fast_finwait2_recycle, 0, 136 "Recycle closed FIN_WAIT_2 connections faster"); 137 138 int tcp_finwait2_timeout; 139 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW, 140 &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout"); 141 142 int tcp_keepcnt = TCPTV_KEEPCNT; 143 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0, 144 "Number of keepalive probes to send"); 145 146 /* max idle probes */ 147 int tcp_maxpersistidle; 148 149 int tcp_rexmit_drop_options = 0; 150 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW, 151 &tcp_rexmit_drop_options, 0, 152 "Drop TCP options from 3rd and later retransmitted SYN"); 153 154 VNET_DEFINE(int, tcp_pmtud_blackhole_detect); 155 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, 156 CTLFLAG_RW|CTLFLAG_VNET, 157 &VNET_NAME(tcp_pmtud_blackhole_detect), 0, 158 "Path MTU Discovery Black Hole Detection Enabled"); 159 160 #ifdef INET 161 VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200; 162 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, 163 CTLFLAG_RW|CTLFLAG_VNET, 164 &VNET_NAME(tcp_pmtud_blackhole_mss), 0, 165 "Path MTU Discovery Black Hole Detection lowered MSS"); 166 #endif 167 168 #ifdef INET6 169 VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220; 170 SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss, 171 CTLFLAG_RW|CTLFLAG_VNET, 172 &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0, 173 "Path MTU Discovery IPv6 Black Hole Detection lowered MSS"); 174 #endif 175 176 #ifdef RSS 177 static int per_cpu_timers = 1; 178 #else 179 static int per_cpu_timers = 0; 180 #endif 181 SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW, 182 &per_cpu_timers , 0, "run tcp timers on all cpus"); 183 184 /* 185 * Map the given inp to a CPU id. 186 * 187 * This queries RSS if it's compiled in, else it defaults to the current 188 * CPU ID. 189 */ 190 inline int 191 inp_to_cpuid(struct inpcb *inp) 192 { 193 u_int cpuid; 194 195 #ifdef RSS 196 if (per_cpu_timers) { 197 cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); 198 if (cpuid == NETISR_CPUID_NONE) 199 return (curcpu); /* XXX */ 200 else 201 return (cpuid); 202 } 203 #else 204 /* Legacy, pre-RSS behaviour */ 205 if (per_cpu_timers) { 206 /* 207 * We don't have a flowid -> cpuid mapping, so cheat and 208 * just map unknown cpuids to curcpu. Not the best, but 209 * apparently better than defaulting to swi 0. 210 */ 211 cpuid = inp->inp_flowid % (mp_maxid + 1); 212 if (! CPU_ABSENT(cpuid)) 213 return (cpuid); 214 return (curcpu); 215 } 216 #endif 217 /* Default for RSS and non-RSS - cpuid 0 */ 218 else { 219 return (0); 220 } 221 } 222 223 /* 224 * Tcp protocol timeout routine called every 500 ms. 225 * Updates timestamps used for TCP 226 * causes finite state machine actions if timers expire. 227 */ 228 void 229 tcp_slowtimo(void) 230 { 231 VNET_ITERATOR_DECL(vnet_iter); 232 233 VNET_LIST_RLOCK_NOSLEEP(); 234 VNET_FOREACH(vnet_iter) { 235 CURVNET_SET(vnet_iter); 236 (void) tcp_tw_2msl_scan(0); 237 CURVNET_RESTORE(); 238 } 239 VNET_LIST_RUNLOCK_NOSLEEP(); 240 } 241 242 int tcp_backoff[TCP_MAXRXTSHIFT + 1] = 243 { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 }; 244 245 int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */ 246 247 /* 248 * TCP timer processing. 249 */ 250 251 void 252 tcp_timer_delack(void *xtp) 253 { 254 struct epoch_tracker et; 255 struct tcpcb *tp = xtp; 256 struct inpcb *inp; 257 CURVNET_SET(tp->t_vnet); 258 259 inp = tp->t_inpcb; 260 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 261 INP_WLOCK(inp); 262 if (callout_pending(&tp->t_timers->tt_delack) || 263 !callout_active(&tp->t_timers->tt_delack)) { 264 INP_WUNLOCK(inp); 265 CURVNET_RESTORE(); 266 return; 267 } 268 callout_deactivate(&tp->t_timers->tt_delack); 269 if ((inp->inp_flags & INP_DROPPED) != 0) { 270 INP_WUNLOCK(inp); 271 CURVNET_RESTORE(); 272 return; 273 } 274 tp->t_flags |= TF_ACKNOW; 275 TCPSTAT_INC(tcps_delack); 276 NET_EPOCH_ENTER(et); 277 (void) tp->t_fb->tfb_tcp_output(tp); 278 INP_WUNLOCK(inp); 279 NET_EPOCH_EXIT(et); 280 CURVNET_RESTORE(); 281 } 282 283 void 284 tcp_inpinfo_lock_del(struct inpcb *inp, struct tcpcb *tp) 285 { 286 if (inp && tp != NULL) 287 INP_WUNLOCK(inp); 288 } 289 290 void 291 tcp_timer_2msl(void *xtp) 292 { 293 struct tcpcb *tp = xtp; 294 struct inpcb *inp; 295 struct epoch_tracker et; 296 CURVNET_SET(tp->t_vnet); 297 #ifdef TCPDEBUG 298 int ostate; 299 300 ostate = tp->t_state; 301 #endif 302 inp = tp->t_inpcb; 303 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 304 INP_WLOCK(inp); 305 tcp_free_sackholes(tp); 306 if (callout_pending(&tp->t_timers->tt_2msl) || 307 !callout_active(&tp->t_timers->tt_2msl)) { 308 INP_WUNLOCK(tp->t_inpcb); 309 CURVNET_RESTORE(); 310 return; 311 } 312 callout_deactivate(&tp->t_timers->tt_2msl); 313 if ((inp->inp_flags & INP_DROPPED) != 0) { 314 INP_WUNLOCK(inp); 315 CURVNET_RESTORE(); 316 return; 317 } 318 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 319 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 320 /* 321 * 2 MSL timeout in shutdown went off. If we're closed but 322 * still waiting for peer to close and connection has been idle 323 * too long delete connection control block. Otherwise, check 324 * again in a bit. 325 * 326 * If in TIME_WAIT state just ignore as this timeout is handled in 327 * tcp_tw_2msl_scan(). 328 * 329 * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed, 330 * there's no point in hanging onto FIN_WAIT_2 socket. Just close it. 331 * Ignore fact that there were recent incoming segments. 332 */ 333 if ((inp->inp_flags & INP_TIMEWAIT) != 0) { 334 INP_WUNLOCK(inp); 335 CURVNET_RESTORE(); 336 return; 337 } 338 if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 && 339 tp->t_inpcb && tp->t_inpcb->inp_socket && 340 (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { 341 TCPSTAT_INC(tcps_finwait2_drops); 342 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 343 tcp_inpinfo_lock_del(inp, tp); 344 goto out; 345 } 346 NET_EPOCH_ENTER(et); 347 tp = tcp_close(tp); 348 NET_EPOCH_EXIT(et); 349 tcp_inpinfo_lock_del(inp, tp); 350 goto out; 351 } else { 352 if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) { 353 callout_reset(&tp->t_timers->tt_2msl, 354 TP_KEEPINTVL(tp), tcp_timer_2msl, tp); 355 } else { 356 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 357 tcp_inpinfo_lock_del(inp, tp); 358 goto out; 359 } 360 NET_EPOCH_ENTER(et); 361 tp = tcp_close(tp); 362 NET_EPOCH_EXIT(et); 363 tcp_inpinfo_lock_del(inp, tp); 364 goto out; 365 } 366 } 367 368 #ifdef TCPDEBUG 369 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 370 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 371 PRU_SLOWTIMO); 372 #endif 373 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 374 375 if (tp != NULL) 376 INP_WUNLOCK(inp); 377 out: 378 CURVNET_RESTORE(); 379 } 380 381 void 382 tcp_timer_keep(void *xtp) 383 { 384 struct tcpcb *tp = xtp; 385 struct tcptemp *t_template; 386 struct inpcb *inp; 387 struct epoch_tracker et; 388 CURVNET_SET(tp->t_vnet); 389 #ifdef TCPDEBUG 390 int ostate; 391 392 ostate = tp->t_state; 393 #endif 394 inp = tp->t_inpcb; 395 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 396 INP_WLOCK(inp); 397 if (callout_pending(&tp->t_timers->tt_keep) || 398 !callout_active(&tp->t_timers->tt_keep)) { 399 INP_WUNLOCK(inp); 400 CURVNET_RESTORE(); 401 return; 402 } 403 callout_deactivate(&tp->t_timers->tt_keep); 404 if ((inp->inp_flags & INP_DROPPED) != 0) { 405 INP_WUNLOCK(inp); 406 CURVNET_RESTORE(); 407 return; 408 } 409 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 410 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 411 412 /* 413 * Because we don't regularly reset the keepalive callout in 414 * the ESTABLISHED state, it may be that we don't actually need 415 * to send a keepalive yet. If that occurs, schedule another 416 * call for the next time the keepalive timer might expire. 417 */ 418 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 419 u_int idletime; 420 421 idletime = ticks - tp->t_rcvtime; 422 if (idletime < TP_KEEPIDLE(tp)) { 423 callout_reset(&tp->t_timers->tt_keep, 424 TP_KEEPIDLE(tp) - idletime, tcp_timer_keep, tp); 425 INP_WUNLOCK(inp); 426 CURVNET_RESTORE(); 427 return; 428 } 429 } 430 431 /* 432 * Keep-alive timer went off; send something 433 * or drop connection if idle for too long. 434 */ 435 TCPSTAT_INC(tcps_keeptimeo); 436 if (tp->t_state < TCPS_ESTABLISHED) 437 goto dropit; 438 if ((V_tcp_always_keepalive || 439 inp->inp_socket->so_options & SO_KEEPALIVE) && 440 tp->t_state <= TCPS_CLOSING) { 441 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 442 goto dropit; 443 /* 444 * Send a packet designed to force a response 445 * if the peer is up and reachable: 446 * either an ACK if the connection is still alive, 447 * or an RST if the peer has closed the connection 448 * due to timeout or reboot. 449 * Using sequence number tp->snd_una-1 450 * causes the transmitted zero-length segment 451 * to lie outside the receive window; 452 * by the protocol spec, this requires the 453 * correspondent TCP to respond. 454 */ 455 TCPSTAT_INC(tcps_keepprobe); 456 t_template = tcpip_maketemplate(inp); 457 if (t_template) { 458 NET_EPOCH_ENTER(et); 459 tcp_respond(tp, t_template->tt_ipgen, 460 &t_template->tt_t, (struct mbuf *)NULL, 461 tp->rcv_nxt, tp->snd_una - 1, 0); 462 NET_EPOCH_EXIT(et); 463 free(t_template, M_TEMP); 464 } 465 callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp), 466 tcp_timer_keep, tp); 467 } else 468 callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp), 469 tcp_timer_keep, tp); 470 471 #ifdef TCPDEBUG 472 if (inp->inp_socket->so_options & SO_DEBUG) 473 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 474 PRU_SLOWTIMO); 475 #endif 476 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 477 INP_WUNLOCK(inp); 478 CURVNET_RESTORE(); 479 return; 480 481 dropit: 482 TCPSTAT_INC(tcps_keepdrops); 483 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 484 tcp_inpinfo_lock_del(inp, tp); 485 goto out; 486 } 487 NET_EPOCH_ENTER(et); 488 tp = tcp_drop(tp, ETIMEDOUT); 489 490 #ifdef TCPDEBUG 491 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 492 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 493 PRU_SLOWTIMO); 494 #endif 495 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 496 NET_EPOCH_EXIT(et); 497 tcp_inpinfo_lock_del(inp, tp); 498 out: 499 CURVNET_RESTORE(); 500 } 501 502 void 503 tcp_timer_persist(void *xtp) 504 { 505 struct tcpcb *tp = xtp; 506 struct inpcb *inp; 507 struct epoch_tracker et; 508 CURVNET_SET(tp->t_vnet); 509 #ifdef TCPDEBUG 510 int ostate; 511 512 ostate = tp->t_state; 513 #endif 514 inp = tp->t_inpcb; 515 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 516 INP_WLOCK(inp); 517 if (callout_pending(&tp->t_timers->tt_persist) || 518 !callout_active(&tp->t_timers->tt_persist)) { 519 INP_WUNLOCK(inp); 520 CURVNET_RESTORE(); 521 return; 522 } 523 callout_deactivate(&tp->t_timers->tt_persist); 524 if ((inp->inp_flags & INP_DROPPED) != 0) { 525 INP_WUNLOCK(inp); 526 CURVNET_RESTORE(); 527 return; 528 } 529 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 530 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 531 /* 532 * Persistence timer into zero window. 533 * Force a byte to be output, if possible. 534 */ 535 TCPSTAT_INC(tcps_persisttimeo); 536 /* 537 * Hack: if the peer is dead/unreachable, we do not 538 * time out if the window is closed. After a full 539 * backoff, drop the connection if the idle time 540 * (no responses to probes) reaches the maximum 541 * backoff that we would use if retransmitting. 542 */ 543 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 544 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 545 ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 546 TCPSTAT_INC(tcps_persistdrop); 547 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 548 tcp_inpinfo_lock_del(inp, tp); 549 goto out; 550 } 551 NET_EPOCH_ENTER(et); 552 tp = tcp_drop(tp, ETIMEDOUT); 553 NET_EPOCH_EXIT(et); 554 tcp_inpinfo_lock_del(inp, tp); 555 goto out; 556 } 557 /* 558 * If the user has closed the socket then drop a persisting 559 * connection after a much reduced timeout. 560 */ 561 if (tp->t_state > TCPS_CLOSE_WAIT && 562 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 563 TCPSTAT_INC(tcps_persistdrop); 564 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 565 tcp_inpinfo_lock_del(inp, tp); 566 goto out; 567 } 568 NET_EPOCH_ENTER(et); 569 tp = tcp_drop(tp, ETIMEDOUT); 570 NET_EPOCH_EXIT(et); 571 tcp_inpinfo_lock_del(inp, tp); 572 goto out; 573 } 574 tcp_setpersist(tp); 575 tp->t_flags |= TF_FORCEDATA; 576 NET_EPOCH_ENTER(et); 577 (void) tp->t_fb->tfb_tcp_output(tp); 578 NET_EPOCH_EXIT(et); 579 tp->t_flags &= ~TF_FORCEDATA; 580 581 #ifdef TCPDEBUG 582 if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG) 583 tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO); 584 #endif 585 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 586 INP_WUNLOCK(inp); 587 out: 588 CURVNET_RESTORE(); 589 } 590 591 void 592 tcp_timer_rexmt(void * xtp) 593 { 594 struct tcpcb *tp = xtp; 595 CURVNET_SET(tp->t_vnet); 596 int rexmt; 597 struct inpcb *inp; 598 struct epoch_tracker et; 599 #ifdef TCPDEBUG 600 int ostate; 601 602 ostate = tp->t_state; 603 #endif 604 inp = tp->t_inpcb; 605 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 606 INP_WLOCK(inp); 607 if (callout_pending(&tp->t_timers->tt_rexmt) || 608 !callout_active(&tp->t_timers->tt_rexmt)) { 609 INP_WUNLOCK(inp); 610 CURVNET_RESTORE(); 611 return; 612 } 613 callout_deactivate(&tp->t_timers->tt_rexmt); 614 if ((inp->inp_flags & INP_DROPPED) != 0) { 615 INP_WUNLOCK(inp); 616 CURVNET_RESTORE(); 617 return; 618 } 619 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 620 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 621 tcp_free_sackholes(tp); 622 TCP_LOG_EVENT(tp, NULL, NULL, NULL, TCP_LOG_RTO, 0, 0, NULL, false); 623 if (tp->t_fb->tfb_tcp_rexmit_tmr) { 624 /* The stack has a timer action too. */ 625 (*tp->t_fb->tfb_tcp_rexmit_tmr)(tp); 626 } 627 /* 628 * Retransmission timer went off. Message has not 629 * been acked within retransmit interval. Back off 630 * to a longer retransmit interval and retransmit one segment. 631 */ 632 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { 633 tp->t_rxtshift = TCP_MAXRXTSHIFT; 634 TCPSTAT_INC(tcps_timeoutdrop); 635 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 636 tcp_inpinfo_lock_del(inp, tp); 637 goto out; 638 } 639 NET_EPOCH_ENTER(et); 640 tp = tcp_drop(tp, ETIMEDOUT); 641 NET_EPOCH_EXIT(et); 642 tcp_inpinfo_lock_del(inp, tp); 643 goto out; 644 } 645 if (tp->t_state == TCPS_SYN_SENT) { 646 /* 647 * If the SYN was retransmitted, indicate CWND to be 648 * limited to 1 segment in cc_conn_init(). 649 */ 650 tp->snd_cwnd = 1; 651 } else if (tp->t_rxtshift == 1) { 652 /* 653 * first retransmit; record ssthresh and cwnd so they can 654 * be recovered if this turns out to be a "bad" retransmit. 655 * A retransmit is considered "bad" if an ACK for this 656 * segment is received within RTT/2 interval; the assumption 657 * here is that the ACK was already in flight. See 658 * "On Estimating End-to-End Network Path Properties" by 659 * Allman and Paxson for more details. 660 */ 661 tp->snd_cwnd_prev = tp->snd_cwnd; 662 tp->snd_ssthresh_prev = tp->snd_ssthresh; 663 tp->snd_recover_prev = tp->snd_recover; 664 if (IN_FASTRECOVERY(tp->t_flags)) 665 tp->t_flags |= TF_WASFRECOVERY; 666 else 667 tp->t_flags &= ~TF_WASFRECOVERY; 668 if (IN_CONGRECOVERY(tp->t_flags)) 669 tp->t_flags |= TF_WASCRECOVERY; 670 else 671 tp->t_flags &= ~TF_WASCRECOVERY; 672 if ((tp->t_flags & TF_RCVD_TSTMP) == 0) 673 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 674 /* In the event that we've negotiated timestamps 675 * badrxtwin will be set to the value that we set 676 * the retransmitted packet's to_tsval to by tcp_output 677 */ 678 tp->t_flags |= TF_PREVVALID; 679 } else 680 tp->t_flags &= ~TF_PREVVALID; 681 TCPSTAT_INC(tcps_rexmttimeo); 682 if ((tp->t_state == TCPS_SYN_SENT) || 683 (tp->t_state == TCPS_SYN_RECEIVED)) 684 rexmt = tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift]; 685 else 686 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 687 TCPT_RANGESET(tp->t_rxtcur, rexmt, 688 tp->t_rttmin, TCPTV_REXMTMAX); 689 690 /* 691 * We enter the path for PLMTUD if connection is established or, if 692 * connection is FIN_WAIT_1 status, reason for the last is that if 693 * amount of data we send is very small, we could send it in couple of 694 * packets and process straight to FIN. In that case we won't catch 695 * ESTABLISHED state. 696 */ 697 if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) 698 || (tp->t_state == TCPS_FIN_WAIT_1))) { 699 #ifdef INET6 700 int isipv6; 701 #endif 702 703 /* 704 * Idea here is that at each stage of mtu probe (usually, 1448 705 * -> 1188 -> 524) should be given 2 chances to recover before 706 * further clamping down. 'tp->t_rxtshift % 2 == 0' should 707 * take care of that. 708 */ 709 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) == 710 (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) && 711 (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && 712 tp->t_rxtshift % 2 == 0)) { 713 /* 714 * Enter Path MTU Black-hole Detection mechanism: 715 * - Disable Path MTU Discovery (IP "DF" bit). 716 * - Reduce MTU to lower value than what we 717 * negotiated with peer. 718 */ 719 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { 720 /* Record that we may have found a black hole. */ 721 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 722 /* Keep track of previous MSS. */ 723 tp->t_pmtud_saved_maxseg = tp->t_maxseg; 724 } 725 726 /* 727 * Reduce the MSS to blackhole value or to the default 728 * in an attempt to retransmit. 729 */ 730 #ifdef INET6 731 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; 732 if (isipv6 && 733 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { 734 /* Use the sysctl tuneable blackhole MSS. */ 735 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; 736 TCPSTAT_INC(tcps_pmtud_blackhole_activated); 737 } else if (isipv6) { 738 /* Use the default MSS. */ 739 tp->t_maxseg = V_tcp_v6mssdflt; 740 /* 741 * Disable Path MTU Discovery when we switch to 742 * minmss. 743 */ 744 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 745 TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 746 } 747 #endif 748 #if defined(INET6) && defined(INET) 749 else 750 #endif 751 #ifdef INET 752 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { 753 /* Use the sysctl tuneable blackhole MSS. */ 754 tp->t_maxseg = V_tcp_pmtud_blackhole_mss; 755 TCPSTAT_INC(tcps_pmtud_blackhole_activated); 756 } else { 757 /* Use the default MSS. */ 758 tp->t_maxseg = V_tcp_mssdflt; 759 /* 760 * Disable Path MTU Discovery when we switch to 761 * minmss. 762 */ 763 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 764 TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 765 } 766 #endif 767 /* 768 * Reset the slow-start flight size 769 * as it may depend on the new MSS. 770 */ 771 if (CC_ALGO(tp)->conn_init != NULL) 772 CC_ALGO(tp)->conn_init(tp->ccv); 773 } else { 774 /* 775 * If further retransmissions are still unsuccessful 776 * with a lowered MTU, maybe this isn't a blackhole and 777 * we restore the previous MSS and blackhole detection 778 * flags. 779 * The limit '6' is determined by giving each probe 780 * stage (1448, 1188, 524) 2 chances to recover. 781 */ 782 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 783 (tp->t_rxtshift >= 6)) { 784 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 785 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 786 tp->t_maxseg = tp->t_pmtud_saved_maxseg; 787 TCPSTAT_INC(tcps_pmtud_blackhole_failed); 788 /* 789 * Reset the slow-start flight size as it 790 * may depend on the new MSS. 791 */ 792 if (CC_ALGO(tp)->conn_init != NULL) 793 CC_ALGO(tp)->conn_init(tp->ccv); 794 } 795 } 796 } 797 798 /* 799 * Disable RFC1323 and SACK if we haven't got any response to 800 * our third SYN to work-around some broken terminal servers 801 * (most of which have hopefully been retired) that have bad VJ 802 * header compression code which trashes TCP segments containing 803 * unknown-to-them TCP options. 804 */ 805 if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && 806 (tp->t_rxtshift == 3)) 807 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); 808 /* 809 * If we backed off this far, notify the L3 protocol that we're having 810 * connection problems. 811 */ 812 if (tp->t_rxtshift > TCP_RTT_INVALIDATE) { 813 #ifdef INET6 814 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 815 in6_losing(tp->t_inpcb); 816 else 817 #endif 818 in_losing(tp->t_inpcb); 819 } 820 tp->snd_nxt = tp->snd_una; 821 tp->snd_recover = tp->snd_max; 822 /* 823 * Force a segment to be sent. 824 */ 825 tp->t_flags |= TF_ACKNOW; 826 /* 827 * If timing a segment in this window, stop the timer. 828 */ 829 tp->t_rtttime = 0; 830 831 cc_cong_signal(tp, NULL, CC_RTO); 832 NET_EPOCH_ENTER(et); 833 (void) tp->t_fb->tfb_tcp_output(tp); 834 NET_EPOCH_EXIT(et); 835 #ifdef TCPDEBUG 836 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 837 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 838 PRU_SLOWTIMO); 839 #endif 840 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 841 INP_WUNLOCK(inp); 842 out: 843 CURVNET_RESTORE(); 844 } 845 846 void 847 tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta) 848 { 849 struct callout *t_callout; 850 callout_func_t *f_callout; 851 struct inpcb *inp = tp->t_inpcb; 852 int cpu = inp_to_cpuid(inp); 853 854 #ifdef TCP_OFFLOAD 855 if (tp->t_flags & TF_TOE) 856 return; 857 #endif 858 859 if (tp->t_timers->tt_flags & TT_STOPPED) 860 return; 861 862 switch (timer_type) { 863 case TT_DELACK: 864 t_callout = &tp->t_timers->tt_delack; 865 f_callout = tcp_timer_delack; 866 break; 867 case TT_REXMT: 868 t_callout = &tp->t_timers->tt_rexmt; 869 f_callout = tcp_timer_rexmt; 870 break; 871 case TT_PERSIST: 872 t_callout = &tp->t_timers->tt_persist; 873 f_callout = tcp_timer_persist; 874 break; 875 case TT_KEEP: 876 t_callout = &tp->t_timers->tt_keep; 877 f_callout = tcp_timer_keep; 878 break; 879 case TT_2MSL: 880 t_callout = &tp->t_timers->tt_2msl; 881 f_callout = tcp_timer_2msl; 882 break; 883 default: 884 if (tp->t_fb->tfb_tcp_timer_activate) { 885 tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta); 886 return; 887 } 888 panic("tp %p bad timer_type %#x", tp, timer_type); 889 } 890 if (delta == 0) { 891 callout_stop(t_callout); 892 } else { 893 callout_reset_on(t_callout, delta, f_callout, tp, cpu); 894 } 895 } 896 897 int 898 tcp_timer_active(struct tcpcb *tp, uint32_t timer_type) 899 { 900 struct callout *t_callout; 901 902 switch (timer_type) { 903 case TT_DELACK: 904 t_callout = &tp->t_timers->tt_delack; 905 break; 906 case TT_REXMT: 907 t_callout = &tp->t_timers->tt_rexmt; 908 break; 909 case TT_PERSIST: 910 t_callout = &tp->t_timers->tt_persist; 911 break; 912 case TT_KEEP: 913 t_callout = &tp->t_timers->tt_keep; 914 break; 915 case TT_2MSL: 916 t_callout = &tp->t_timers->tt_2msl; 917 break; 918 default: 919 if (tp->t_fb->tfb_tcp_timer_active) { 920 return(tp->t_fb->tfb_tcp_timer_active(tp, timer_type)); 921 } 922 panic("tp %p bad timer_type %#x", tp, timer_type); 923 } 924 return callout_active(t_callout); 925 } 926 927 /* 928 * Stop the timer from running, and apply a flag 929 * against the timer_flags that will force the 930 * timer never to run. The flag is needed to assure 931 * a race does not leave it running and cause 932 * the timer to possibly restart itself (keep and persist 933 * especially do this). 934 */ 935 int 936 tcp_timer_suspend(struct tcpcb *tp, uint32_t timer_type) 937 { 938 struct callout *t_callout; 939 uint32_t t_flags; 940 941 switch (timer_type) { 942 case TT_DELACK: 943 t_flags = TT_DELACK_SUS; 944 t_callout = &tp->t_timers->tt_delack; 945 break; 946 case TT_REXMT: 947 t_flags = TT_REXMT_SUS; 948 t_callout = &tp->t_timers->tt_rexmt; 949 break; 950 case TT_PERSIST: 951 t_flags = TT_PERSIST_SUS; 952 t_callout = &tp->t_timers->tt_persist; 953 break; 954 case TT_KEEP: 955 t_flags = TT_KEEP_SUS; 956 t_callout = &tp->t_timers->tt_keep; 957 break; 958 case TT_2MSL: 959 t_flags = TT_2MSL_SUS; 960 t_callout = &tp->t_timers->tt_2msl; 961 break; 962 default: 963 panic("tp:%p bad timer_type 0x%x", tp, timer_type); 964 } 965 tp->t_timers->tt_flags |= t_flags; 966 return (callout_stop(t_callout)); 967 } 968 969 void 970 tcp_timers_unsuspend(struct tcpcb *tp, uint32_t timer_type) 971 { 972 switch (timer_type) { 973 case TT_DELACK: 974 if (tp->t_timers->tt_flags & TT_DELACK_SUS) { 975 tp->t_timers->tt_flags &= ~TT_DELACK_SUS; 976 if (tp->t_flags & TF_DELACK) { 977 /* Delayed ack timer should be up activate a timer */ 978 tp->t_flags &= ~TF_DELACK; 979 tcp_timer_activate(tp, TT_DELACK, 980 tcp_delacktime); 981 } 982 } 983 break; 984 case TT_REXMT: 985 if (tp->t_timers->tt_flags & TT_REXMT_SUS) { 986 tp->t_timers->tt_flags &= ~TT_REXMT_SUS; 987 if (SEQ_GT(tp->snd_max, tp->snd_una) && 988 (tcp_timer_active((tp), TT_PERSIST) == 0) && 989 tp->snd_wnd) { 990 /* We have outstanding data activate a timer */ 991 tcp_timer_activate(tp, TT_REXMT, 992 tp->t_rxtcur); 993 } 994 } 995 break; 996 case TT_PERSIST: 997 if (tp->t_timers->tt_flags & TT_PERSIST_SUS) { 998 tp->t_timers->tt_flags &= ~TT_PERSIST_SUS; 999 if (tp->snd_wnd == 0) { 1000 /* Activate the persists timer */ 1001 tp->t_rxtshift = 0; 1002 tcp_setpersist(tp); 1003 } 1004 } 1005 break; 1006 case TT_KEEP: 1007 if (tp->t_timers->tt_flags & TT_KEEP_SUS) { 1008 tp->t_timers->tt_flags &= ~TT_KEEP_SUS; 1009 tcp_timer_activate(tp, TT_KEEP, 1010 TCPS_HAVEESTABLISHED(tp->t_state) ? 1011 TP_KEEPIDLE(tp) : TP_KEEPINIT(tp)); 1012 } 1013 break; 1014 case TT_2MSL: 1015 if (tp->t_timers->tt_flags &= TT_2MSL_SUS) { 1016 tp->t_timers->tt_flags &= ~TT_2MSL_SUS; 1017 if ((tp->t_state == TCPS_FIN_WAIT_2) && 1018 ((tp->t_inpcb->inp_socket == NULL) || 1019 (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE))) { 1020 /* Star the 2MSL timer */ 1021 tcp_timer_activate(tp, TT_2MSL, 1022 (tcp_fast_finwait2_recycle) ? 1023 tcp_finwait2_timeout : TP_MAXIDLE(tp)); 1024 } 1025 } 1026 break; 1027 default: 1028 panic("tp:%p bad timer_type 0x%x", tp, timer_type); 1029 } 1030 } 1031 1032 void 1033 tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type) 1034 { 1035 struct callout *t_callout; 1036 1037 tp->t_timers->tt_flags |= TT_STOPPED; 1038 switch (timer_type) { 1039 case TT_DELACK: 1040 t_callout = &tp->t_timers->tt_delack; 1041 break; 1042 case TT_REXMT: 1043 t_callout = &tp->t_timers->tt_rexmt; 1044 break; 1045 case TT_PERSIST: 1046 t_callout = &tp->t_timers->tt_persist; 1047 break; 1048 case TT_KEEP: 1049 t_callout = &tp->t_timers->tt_keep; 1050 break; 1051 case TT_2MSL: 1052 t_callout = &tp->t_timers->tt_2msl; 1053 break; 1054 default: 1055 if (tp->t_fb->tfb_tcp_timer_stop) { 1056 /* 1057 * XXXrrs we need to look at this with the 1058 * stop case below (flags). 1059 */ 1060 tp->t_fb->tfb_tcp_timer_stop(tp, timer_type); 1061 return; 1062 } 1063 panic("tp %p bad timer_type %#x", tp, timer_type); 1064 } 1065 1066 if (callout_async_drain(t_callout, tcp_timer_discard) == 0) { 1067 /* 1068 * Can't stop the callout, defer tcpcb actual deletion 1069 * to the last one. We do this using the async drain 1070 * function and incrementing the count in 1071 */ 1072 tp->t_timers->tt_draincnt++; 1073 } 1074 } 1075