/*-
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_tcpdebug.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <net/if.h>
#include <net/route.h>
#include <net/rss_config.h>
#include <net/vnet.h>
#include <net/netisr.h>

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_rss.h>
#include <netinet/in_systm.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_cc.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif

/*
 * Tunable TCP timer intervals.  Each variable below is exported read/write
 * under net.inet.tcp through the sysctl_msec_to_ticks handler.
 * NOTE(review): the handler is defined outside this file; presumably it
 * presents the value in milliseconds while the variable holds ticks --
 * confirm against its definition.
 */

/* net.inet.tcp.keepinit: time allowed to establish a connection. */
int	tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection");

/* net.inet.tcp.keepidle: idle time before keepalive probing starts. */
int	tcp_keepidle;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin");

/* net.inet.tcp.keepintvl: spacing between successive keepalive probes. */
int	tcp_keepintvl;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes");

/* net.inet.tcp.delacktime: delay before a delayed ACK is transmitted. */
int	tcp_delacktime;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
    "Time before a delayed ACK is sent");

/* Maximum segment lifetime; its sysctl node is declared right below. */
int	tcp_msl;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, 99 &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); 100 101 int tcp_rexmit_min; 102 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW, 103 &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", 104 "Minimum Retransmission Timeout"); 105 106 int tcp_rexmit_slop; 107 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW, 108 &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", 109 "Retransmission Timer Slop"); 110 111 static int always_keepalive = 1; 112 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, 113 &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); 114 115 int tcp_fast_finwait2_recycle = 0; 116 SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, 117 &tcp_fast_finwait2_recycle, 0, 118 "Recycle closed FIN_WAIT_2 connections faster"); 119 120 int tcp_finwait2_timeout; 121 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW, 122 &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout"); 123 124 int tcp_keepcnt = TCPTV_KEEPCNT; 125 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0, 126 "Number of keepalive probes to send"); 127 128 /* max idle probes */ 129 int tcp_maxpersistidle; 130 131 static int tcp_rexmit_drop_options = 0; 132 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW, 133 &tcp_rexmit_drop_options, 0, 134 "Drop TCP options from 3rd and later retransmitted SYN"); 135 136 static VNET_DEFINE(int, tcp_pmtud_blackhole_detect); 137 #define V_tcp_pmtud_blackhole_detect VNET(tcp_pmtud_blackhole_detect) 138 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, 139 CTLFLAG_RW|CTLFLAG_VNET, 140 &VNET_NAME(tcp_pmtud_blackhole_detect), 0, 141 "Path MTU Discovery Black Hole Detection Enabled"); 142 143 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated); 144 #define V_tcp_pmtud_blackhole_activated \ 145 
VNET(tcp_pmtud_blackhole_activated) 146 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated, 147 CTLFLAG_RD|CTLFLAG_VNET, 148 &VNET_NAME(tcp_pmtud_blackhole_activated), 0, 149 "Path MTU Discovery Black Hole Detection, Activation Count"); 150 151 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss); 152 #define V_tcp_pmtud_blackhole_activated_min_mss \ 153 VNET(tcp_pmtud_blackhole_activated_min_mss) 154 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss, 155 CTLFLAG_RD|CTLFLAG_VNET, 156 &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0, 157 "Path MTU Discovery Black Hole Detection, Activation Count at min MSS"); 158 159 static VNET_DEFINE(int, tcp_pmtud_blackhole_failed); 160 #define V_tcp_pmtud_blackhole_failed VNET(tcp_pmtud_blackhole_failed) 161 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed, 162 CTLFLAG_RD|CTLFLAG_VNET, 163 &VNET_NAME(tcp_pmtud_blackhole_failed), 0, 164 "Path MTU Discovery Black Hole Detection, Failure Count"); 165 166 #ifdef INET 167 static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200; 168 #define V_tcp_pmtud_blackhole_mss VNET(tcp_pmtud_blackhole_mss) 169 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, 170 CTLFLAG_RW|CTLFLAG_VNET, 171 &VNET_NAME(tcp_pmtud_blackhole_mss), 0, 172 "Path MTU Discovery Black Hole Detection lowered MSS"); 173 #endif 174 175 #ifdef INET6 176 static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220; 177 #define V_tcp_v6pmtud_blackhole_mss VNET(tcp_v6pmtud_blackhole_mss) 178 SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss, 179 CTLFLAG_RW|CTLFLAG_VNET, 180 &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0, 181 "Path MTU Discovery IPv6 Black Hole Detection lowered MSS"); 182 #endif 183 184 #ifdef RSS 185 static int per_cpu_timers = 1; 186 #else 187 static int per_cpu_timers = 0; 188 #endif 189 SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW, 190 &per_cpu_timers , 0, "run tcp timers on all cpus"); 191 192 #if 0 193 #define INP_CPU(inp) 
(per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \ 194 ((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0) 195 #endif 196 197 /* 198 * Map the given inp to a CPU id. 199 * 200 * This queries RSS if it's compiled in, else it defaults to the current 201 * CPU ID. 202 */ 203 static inline int 204 inp_to_cpuid(struct inpcb *inp) 205 { 206 u_int cpuid; 207 208 #ifdef RSS 209 if (per_cpu_timers) { 210 cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); 211 if (cpuid == NETISR_CPUID_NONE) 212 return (curcpu); /* XXX */ 213 else 214 return (cpuid); 215 } 216 #else 217 /* Legacy, pre-RSS behaviour */ 218 if (per_cpu_timers) { 219 /* 220 * We don't have a flowid -> cpuid mapping, so cheat and 221 * just map unknown cpuids to curcpu. Not the best, but 222 * apparently better than defaulting to swi 0. 223 */ 224 cpuid = inp->inp_flowid % (mp_maxid + 1); 225 if (! CPU_ABSENT(cpuid)) 226 return (cpuid); 227 return (curcpu); 228 } 229 #endif 230 /* Default for RSS and non-RSS - cpuid 0 */ 231 else { 232 return (0); 233 } 234 } 235 236 /* 237 * Tcp protocol timeout routine called every 500 ms. 238 * Updates timestamps used for TCP 239 * causes finite state machine actions if timers expire. 240 */ 241 void 242 tcp_slowtimo(void) 243 { 244 VNET_ITERATOR_DECL(vnet_iter); 245 246 VNET_LIST_RLOCK_NOSLEEP(); 247 VNET_FOREACH(vnet_iter) { 248 CURVNET_SET(vnet_iter); 249 (void) tcp_tw_2msl_scan(0); 250 CURVNET_RESTORE(); 251 } 252 VNET_LIST_RUNLOCK_NOSLEEP(); 253 } 254 255 int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = 256 { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; 257 258 int tcp_backoff[TCP_MAXRXTSHIFT + 1] = 259 { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 }; 260 261 static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */ 262 263 /* 264 * TCP timer processing. 
265 */ 266 267 void 268 tcp_timer_delack(void *xtp) 269 { 270 struct tcpcb *tp = xtp; 271 struct inpcb *inp; 272 CURVNET_SET(tp->t_vnet); 273 274 inp = tp->t_inpcb; 275 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 276 INP_WLOCK(inp); 277 if (callout_pending(&tp->t_timers->tt_delack) || 278 !callout_active(&tp->t_timers->tt_delack)) { 279 INP_WUNLOCK(inp); 280 CURVNET_RESTORE(); 281 return; 282 } 283 callout_deactivate(&tp->t_timers->tt_delack); 284 if ((inp->inp_flags & INP_DROPPED) != 0) { 285 INP_WUNLOCK(inp); 286 CURVNET_RESTORE(); 287 return; 288 } 289 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 290 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 291 KASSERT((tp->t_timers->tt_flags & TT_DELACK) != 0, 292 ("%s: tp %p delack callout should be running", __func__, tp)); 293 294 tp->t_flags |= TF_ACKNOW; 295 TCPSTAT_INC(tcps_delack); 296 (void) tp->t_fb->tfb_tcp_output(tp); 297 INP_WUNLOCK(inp); 298 CURVNET_RESTORE(); 299 } 300 301 void 302 tcp_timer_2msl(void *xtp) 303 { 304 struct tcpcb *tp = xtp; 305 struct inpcb *inp; 306 CURVNET_SET(tp->t_vnet); 307 #ifdef TCPDEBUG 308 int ostate; 309 310 ostate = tp->t_state; 311 #endif 312 INP_INFO_RLOCK(&V_tcbinfo); 313 inp = tp->t_inpcb; 314 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 315 INP_WLOCK(inp); 316 tcp_free_sackholes(tp); 317 if (callout_pending(&tp->t_timers->tt_2msl) || 318 !callout_active(&tp->t_timers->tt_2msl)) { 319 INP_WUNLOCK(tp->t_inpcb); 320 INP_INFO_RUNLOCK(&V_tcbinfo); 321 CURVNET_RESTORE(); 322 return; 323 } 324 callout_deactivate(&tp->t_timers->tt_2msl); 325 if ((inp->inp_flags & INP_DROPPED) != 0) { 326 INP_WUNLOCK(inp); 327 INP_INFO_RUNLOCK(&V_tcbinfo); 328 CURVNET_RESTORE(); 329 return; 330 } 331 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 332 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 333 KASSERT((tp->t_timers->tt_flags & TT_2MSL) != 0, 334 ("%s: tp %p 2msl callout should be running", __func__, tp)); 335 
/* 336 * 2 MSL timeout in shutdown went off. If we're closed but 337 * still waiting for peer to close and connection has been idle 338 * too long delete connection control block. Otherwise, check 339 * again in a bit. 340 * 341 * If in TIME_WAIT state just ignore as this timeout is handled in 342 * tcp_tw_2msl_scan(). 343 * 344 * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed, 345 * there's no point in hanging onto FIN_WAIT_2 socket. Just close it. 346 * Ignore fact that there were recent incoming segments. 347 */ 348 if ((inp->inp_flags & INP_TIMEWAIT) != 0) { 349 INP_WUNLOCK(inp); 350 INP_INFO_RUNLOCK(&V_tcbinfo); 351 CURVNET_RESTORE(); 352 return; 353 } 354 if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 && 355 tp->t_inpcb && tp->t_inpcb->inp_socket && 356 (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { 357 TCPSTAT_INC(tcps_finwait2_drops); 358 tp = tcp_close(tp); 359 } else { 360 if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) { 361 if (!callout_reset(&tp->t_timers->tt_2msl, 362 TP_KEEPINTVL(tp), tcp_timer_2msl, tp)) { 363 tp->t_timers->tt_flags &= ~TT_2MSL_RST; 364 } 365 } else 366 tp = tcp_close(tp); 367 } 368 369 #ifdef TCPDEBUG 370 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 371 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 372 PRU_SLOWTIMO); 373 #endif 374 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 375 376 if (tp != NULL) 377 INP_WUNLOCK(inp); 378 INP_INFO_RUNLOCK(&V_tcbinfo); 379 CURVNET_RESTORE(); 380 } 381 382 void 383 tcp_timer_keep(void *xtp) 384 { 385 struct tcpcb *tp = xtp; 386 struct tcptemp *t_template; 387 struct inpcb *inp; 388 CURVNET_SET(tp->t_vnet); 389 #ifdef TCPDEBUG 390 int ostate; 391 392 ostate = tp->t_state; 393 #endif 394 INP_INFO_RLOCK(&V_tcbinfo); 395 inp = tp->t_inpcb; 396 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 397 INP_WLOCK(inp); 398 if (callout_pending(&tp->t_timers->tt_keep) || 399 
!callout_active(&tp->t_timers->tt_keep)) { 400 INP_WUNLOCK(inp); 401 INP_INFO_RUNLOCK(&V_tcbinfo); 402 CURVNET_RESTORE(); 403 return; 404 } 405 callout_deactivate(&tp->t_timers->tt_keep); 406 if ((inp->inp_flags & INP_DROPPED) != 0) { 407 INP_WUNLOCK(inp); 408 INP_INFO_RUNLOCK(&V_tcbinfo); 409 CURVNET_RESTORE(); 410 return; 411 } 412 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 413 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 414 KASSERT((tp->t_timers->tt_flags & TT_KEEP) != 0, 415 ("%s: tp %p keep callout should be running", __func__, tp)); 416 /* 417 * Keep-alive timer went off; send something 418 * or drop connection if idle for too long. 419 */ 420 TCPSTAT_INC(tcps_keeptimeo); 421 if (tp->t_state < TCPS_ESTABLISHED) 422 goto dropit; 423 if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 424 tp->t_state <= TCPS_CLOSING) { 425 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 426 goto dropit; 427 /* 428 * Send a packet designed to force a response 429 * if the peer is up and reachable: 430 * either an ACK if the connection is still alive, 431 * or an RST if the peer has closed the connection 432 * due to timeout or reboot. 433 * Using sequence number tp->snd_una-1 434 * causes the transmitted zero-length segment 435 * to lie outside the receive window; 436 * by the protocol spec, this requires the 437 * correspondent TCP to respond. 
438 */ 439 TCPSTAT_INC(tcps_keepprobe); 440 t_template = tcpip_maketemplate(inp); 441 if (t_template) { 442 tcp_respond(tp, t_template->tt_ipgen, 443 &t_template->tt_t, (struct mbuf *)NULL, 444 tp->rcv_nxt, tp->snd_una - 1, 0); 445 free(t_template, M_TEMP); 446 } 447 if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp), 448 tcp_timer_keep, tp)) { 449 tp->t_timers->tt_flags &= ~TT_KEEP_RST; 450 } 451 } else if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp), 452 tcp_timer_keep, tp)) { 453 tp->t_timers->tt_flags &= ~TT_KEEP_RST; 454 } 455 456 #ifdef TCPDEBUG 457 if (inp->inp_socket->so_options & SO_DEBUG) 458 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 459 PRU_SLOWTIMO); 460 #endif 461 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 462 INP_WUNLOCK(inp); 463 INP_INFO_RUNLOCK(&V_tcbinfo); 464 CURVNET_RESTORE(); 465 return; 466 467 dropit: 468 TCPSTAT_INC(tcps_keepdrops); 469 tp = tcp_drop(tp, ETIMEDOUT); 470 471 #ifdef TCPDEBUG 472 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 473 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 474 PRU_SLOWTIMO); 475 #endif 476 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 477 if (tp != NULL) 478 INP_WUNLOCK(tp->t_inpcb); 479 INP_INFO_RUNLOCK(&V_tcbinfo); 480 CURVNET_RESTORE(); 481 } 482 483 void 484 tcp_timer_persist(void *xtp) 485 { 486 struct tcpcb *tp = xtp; 487 struct inpcb *inp; 488 CURVNET_SET(tp->t_vnet); 489 #ifdef TCPDEBUG 490 int ostate; 491 492 ostate = tp->t_state; 493 #endif 494 INP_INFO_RLOCK(&V_tcbinfo); 495 inp = tp->t_inpcb; 496 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 497 INP_WLOCK(inp); 498 if (callout_pending(&tp->t_timers->tt_persist) || 499 !callout_active(&tp->t_timers->tt_persist)) { 500 INP_WUNLOCK(inp); 501 INP_INFO_RUNLOCK(&V_tcbinfo); 502 CURVNET_RESTORE(); 503 return; 504 } 505 callout_deactivate(&tp->t_timers->tt_persist); 506 if ((inp->inp_flags & INP_DROPPED) != 0) { 507 INP_WUNLOCK(inp); 508 
INP_INFO_RUNLOCK(&V_tcbinfo); 509 CURVNET_RESTORE(); 510 return; 511 } 512 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 513 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 514 KASSERT((tp->t_timers->tt_flags & TT_PERSIST) != 0, 515 ("%s: tp %p persist callout should be running", __func__, tp)); 516 /* 517 * Persistance timer into zero window. 518 * Force a byte to be output, if possible. 519 */ 520 TCPSTAT_INC(tcps_persisttimeo); 521 /* 522 * Hack: if the peer is dead/unreachable, we do not 523 * time out if the window is closed. After a full 524 * backoff, drop the connection if the idle time 525 * (no responses to probes) reaches the maximum 526 * backoff that we would use if retransmitting. 527 */ 528 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 529 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 530 ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 531 TCPSTAT_INC(tcps_persistdrop); 532 tp = tcp_drop(tp, ETIMEDOUT); 533 goto out; 534 } 535 /* 536 * If the user has closed the socket then drop a persisting 537 * connection after a much reduced timeout. 
538 */ 539 if (tp->t_state > TCPS_CLOSE_WAIT && 540 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 541 TCPSTAT_INC(tcps_persistdrop); 542 tp = tcp_drop(tp, ETIMEDOUT); 543 goto out; 544 } 545 tcp_setpersist(tp); 546 tp->t_flags |= TF_FORCEDATA; 547 (void) tp->t_fb->tfb_tcp_output(tp); 548 tp->t_flags &= ~TF_FORCEDATA; 549 550 out: 551 #ifdef TCPDEBUG 552 if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG) 553 tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO); 554 #endif 555 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 556 if (tp != NULL) 557 INP_WUNLOCK(inp); 558 INP_INFO_RUNLOCK(&V_tcbinfo); 559 CURVNET_RESTORE(); 560 } 561 562 void 563 tcp_timer_rexmt(void * xtp) 564 { 565 struct tcpcb *tp = xtp; 566 CURVNET_SET(tp->t_vnet); 567 int rexmt; 568 int headlocked; 569 struct inpcb *inp; 570 #ifdef TCPDEBUG 571 int ostate; 572 573 ostate = tp->t_state; 574 #endif 575 576 INP_INFO_RLOCK(&V_tcbinfo); 577 inp = tp->t_inpcb; 578 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 579 INP_WLOCK(inp); 580 if (callout_pending(&tp->t_timers->tt_rexmt) || 581 !callout_active(&tp->t_timers->tt_rexmt)) { 582 INP_WUNLOCK(inp); 583 INP_INFO_RUNLOCK(&V_tcbinfo); 584 CURVNET_RESTORE(); 585 return; 586 } 587 callout_deactivate(&tp->t_timers->tt_rexmt); 588 if ((inp->inp_flags & INP_DROPPED) != 0) { 589 INP_WUNLOCK(inp); 590 INP_INFO_RUNLOCK(&V_tcbinfo); 591 CURVNET_RESTORE(); 592 return; 593 } 594 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 595 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 596 KASSERT((tp->t_timers->tt_flags & TT_REXMT) != 0, 597 ("%s: tp %p rexmt callout should be running", __func__, tp)); 598 tcp_free_sackholes(tp); 599 /* 600 * Retransmission timer went off. Message has not 601 * been acked within retransmit interval. Back off 602 * to a longer retransmit interval and retransmit one segment. 
603 */ 604 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { 605 tp->t_rxtshift = TCP_MAXRXTSHIFT; 606 TCPSTAT_INC(tcps_timeoutdrop); 607 608 tp = tcp_drop(tp, tp->t_softerror ? 609 tp->t_softerror : ETIMEDOUT); 610 headlocked = 1; 611 goto out; 612 } 613 INP_INFO_RUNLOCK(&V_tcbinfo); 614 headlocked = 0; 615 if (tp->t_state == TCPS_SYN_SENT) { 616 /* 617 * If the SYN was retransmitted, indicate CWND to be 618 * limited to 1 segment in cc_conn_init(). 619 */ 620 tp->snd_cwnd = 1; 621 } else if (tp->t_rxtshift == 1) { 622 /* 623 * first retransmit; record ssthresh and cwnd so they can 624 * be recovered if this turns out to be a "bad" retransmit. 625 * A retransmit is considered "bad" if an ACK for this 626 * segment is received within RTT/2 interval; the assumption 627 * here is that the ACK was already in flight. See 628 * "On Estimating End-to-End Network Path Properties" by 629 * Allman and Paxson for more details. 630 */ 631 tp->snd_cwnd_prev = tp->snd_cwnd; 632 tp->snd_ssthresh_prev = tp->snd_ssthresh; 633 tp->snd_recover_prev = tp->snd_recover; 634 if (IN_FASTRECOVERY(tp->t_flags)) 635 tp->t_flags |= TF_WASFRECOVERY; 636 else 637 tp->t_flags &= ~TF_WASFRECOVERY; 638 if (IN_CONGRECOVERY(tp->t_flags)) 639 tp->t_flags |= TF_WASCRECOVERY; 640 else 641 tp->t_flags &= ~TF_WASCRECOVERY; 642 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 643 tp->t_flags |= TF_PREVVALID; 644 } else 645 tp->t_flags &= ~TF_PREVVALID; 646 TCPSTAT_INC(tcps_rexmttimeo); 647 if ((tp->t_state == TCPS_SYN_SENT) || 648 (tp->t_state == TCPS_SYN_RECEIVED)) 649 rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift]; 650 else 651 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 652 TCPT_RANGESET(tp->t_rxtcur, rexmt, 653 tp->t_rttmin, TCPTV_REXMTMAX); 654 655 /* 656 * We enter the path for PLMTUD if connection is established or, if 657 * connection is FIN_WAIT_1 status, reason for the last is that if 658 * amount of data we send is very small, we could send it in couple of 659 * 
packets and process straight to FIN. In that case we won't catch 660 * ESTABLISHED state. 661 */ 662 if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) 663 || (tp->t_state == TCPS_FIN_WAIT_1))) { 664 #ifdef INET6 665 int isipv6; 666 #endif 667 668 /* 669 * Idea here is that at each stage of mtu probe (usually, 1448 670 * -> 1188 -> 524) should be given 2 chances to recover before 671 * further clamping down. 'tp->t_rxtshift % 2 == 0' should 672 * take care of that. 673 */ 674 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) == 675 (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) && 676 (tp->t_rxtshift >= 2 && tp->t_rxtshift % 2 == 0)) { 677 /* 678 * Enter Path MTU Black-hole Detection mechanism: 679 * - Disable Path MTU Discovery (IP "DF" bit). 680 * - Reduce MTU to lower value than what we 681 * negotiated with peer. 682 */ 683 /* Record that we may have found a black hole. */ 684 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 685 686 /* Keep track of previous MSS. */ 687 tp->t_pmtud_saved_maxseg = tp->t_maxseg; 688 689 /* 690 * Reduce the MSS to blackhole value or to the default 691 * in an attempt to retransmit. 692 */ 693 #ifdef INET6 694 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; 695 if (isipv6 && 696 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { 697 /* Use the sysctl tuneable blackhole MSS. */ 698 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; 699 V_tcp_pmtud_blackhole_activated++; 700 } else if (isipv6) { 701 /* Use the default MSS. */ 702 tp->t_maxseg = V_tcp_v6mssdflt; 703 /* 704 * Disable Path MTU Discovery when we switch to 705 * minmss. 706 */ 707 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 708 V_tcp_pmtud_blackhole_activated_min_mss++; 709 } 710 #endif 711 #if defined(INET6) && defined(INET) 712 else 713 #endif 714 #ifdef INET 715 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { 716 /* Use the sysctl tuneable blackhole MSS. 
*/ 717 tp->t_maxseg = V_tcp_pmtud_blackhole_mss; 718 V_tcp_pmtud_blackhole_activated++; 719 } else { 720 /* Use the default MSS. */ 721 tp->t_maxseg = V_tcp_mssdflt; 722 /* 723 * Disable Path MTU Discovery when we switch to 724 * minmss. 725 */ 726 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 727 V_tcp_pmtud_blackhole_activated_min_mss++; 728 } 729 #endif 730 /* 731 * Reset the slow-start flight size 732 * as it may depend on the new MSS. 733 */ 734 if (CC_ALGO(tp)->conn_init != NULL) 735 CC_ALGO(tp)->conn_init(tp->ccv); 736 } else { 737 /* 738 * If further retransmissions are still unsuccessful 739 * with a lowered MTU, maybe this isn't a blackhole and 740 * we restore the previous MSS and blackhole detection 741 * flags. 742 * The limit '6' is determined by giving each probe 743 * stage (1448, 1188, 524) 2 chances to recover. 744 */ 745 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 746 (tp->t_rxtshift > 6)) { 747 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 748 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 749 tp->t_maxseg = tp->t_pmtud_saved_maxseg; 750 V_tcp_pmtud_blackhole_failed++; 751 /* 752 * Reset the slow-start flight size as it 753 * may depend on the new MSS. 754 */ 755 if (CC_ALGO(tp)->conn_init != NULL) 756 CC_ALGO(tp)->conn_init(tp->ccv); 757 } 758 } 759 } 760 761 /* 762 * Disable RFC1323 and SACK if we haven't got any response to 763 * our third SYN to work-around some broken terminal servers 764 * (most of which have hopefully been retired) that have bad VJ 765 * header compression code which trashes TCP segments containing 766 * unknown-to-them TCP options. 767 */ 768 if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && 769 (tp->t_rxtshift == 3)) 770 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); 771 /* 772 * If we backed off this far, our srtt estimate is probably bogus. 773 * Clobber it so we'll take the next rtt measurement as our srtt; 774 * move the current srtt into rttvar to keep the current 775 * retransmit times until then. 
776 */ 777 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 778 #ifdef INET6 779 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 780 in6_losing(tp->t_inpcb); 781 #endif 782 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 783 tp->t_srtt = 0; 784 } 785 tp->snd_nxt = tp->snd_una; 786 tp->snd_recover = tp->snd_max; 787 /* 788 * Force a segment to be sent. 789 */ 790 tp->t_flags |= TF_ACKNOW; 791 /* 792 * If timing a segment in this window, stop the timer. 793 */ 794 tp->t_rtttime = 0; 795 796 cc_cong_signal(tp, NULL, CC_RTO); 797 798 (void) tp->t_fb->tfb_tcp_output(tp); 799 800 out: 801 #ifdef TCPDEBUG 802 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 803 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 804 PRU_SLOWTIMO); 805 #endif 806 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 807 if (tp != NULL) 808 INP_WUNLOCK(inp); 809 if (headlocked) 810 INP_INFO_RUNLOCK(&V_tcbinfo); 811 CURVNET_RESTORE(); 812 } 813 814 void 815 tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta) 816 { 817 struct callout *t_callout; 818 timeout_t *f_callout; 819 struct inpcb *inp = tp->t_inpcb; 820 int cpu = inp_to_cpuid(inp); 821 uint32_t f_reset; 822 823 #ifdef TCP_OFFLOAD 824 if (tp->t_flags & TF_TOE) 825 return; 826 #endif 827 828 if (tp->t_timers->tt_flags & TT_STOPPED) 829 return; 830 831 switch (timer_type) { 832 case TT_DELACK: 833 t_callout = &tp->t_timers->tt_delack; 834 f_callout = tcp_timer_delack; 835 f_reset = TT_DELACK_RST; 836 break; 837 case TT_REXMT: 838 t_callout = &tp->t_timers->tt_rexmt; 839 f_callout = tcp_timer_rexmt; 840 f_reset = TT_REXMT_RST; 841 break; 842 case TT_PERSIST: 843 t_callout = &tp->t_timers->tt_persist; 844 f_callout = tcp_timer_persist; 845 f_reset = TT_PERSIST_RST; 846 break; 847 case TT_KEEP: 848 t_callout = &tp->t_timers->tt_keep; 849 f_callout = tcp_timer_keep; 850 f_reset = TT_KEEP_RST; 851 break; 852 case TT_2MSL: 853 t_callout = &tp->t_timers->tt_2msl; 854 f_callout = tcp_timer_2msl; 855 f_reset = 
TT_2MSL_RST; 856 break; 857 default: 858 if (tp->t_fb->tfb_tcp_timer_activate) { 859 tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta); 860 return; 861 } 862 panic("tp %p bad timer_type %#x", tp, timer_type); 863 } 864 if (delta == 0) { 865 if ((tp->t_timers->tt_flags & timer_type) && 866 (callout_stop(t_callout) > 0) && 867 (tp->t_timers->tt_flags & f_reset)) { 868 tp->t_timers->tt_flags &= ~(timer_type | f_reset); 869 } 870 } else { 871 if ((tp->t_timers->tt_flags & timer_type) == 0) { 872 tp->t_timers->tt_flags |= (timer_type | f_reset); 873 callout_reset_on(t_callout, delta, f_callout, tp, cpu); 874 } else { 875 /* Reset already running callout on the same CPU. */ 876 if (!callout_reset(t_callout, delta, f_callout, tp)) { 877 /* 878 * Callout not cancelled, consider it as not 879 * properly restarted. */ 880 tp->t_timers->tt_flags &= ~f_reset; 881 } 882 } 883 } 884 } 885 886 int 887 tcp_timer_active(struct tcpcb *tp, uint32_t timer_type) 888 { 889 struct callout *t_callout; 890 891 switch (timer_type) { 892 case TT_DELACK: 893 t_callout = &tp->t_timers->tt_delack; 894 break; 895 case TT_REXMT: 896 t_callout = &tp->t_timers->tt_rexmt; 897 break; 898 case TT_PERSIST: 899 t_callout = &tp->t_timers->tt_persist; 900 break; 901 case TT_KEEP: 902 t_callout = &tp->t_timers->tt_keep; 903 break; 904 case TT_2MSL: 905 t_callout = &tp->t_timers->tt_2msl; 906 break; 907 default: 908 if (tp->t_fb->tfb_tcp_timer_active) { 909 return(tp->t_fb->tfb_tcp_timer_active(tp, timer_type)); 910 } 911 panic("tp %p bad timer_type %#x", tp, timer_type); 912 } 913 return callout_active(t_callout); 914 } 915 916 void 917 tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type) 918 { 919 struct callout *t_callout; 920 timeout_t *f_callout; 921 uint32_t f_reset; 922 923 tp->t_timers->tt_flags |= TT_STOPPED; 924 925 switch (timer_type) { 926 case TT_DELACK: 927 t_callout = &tp->t_timers->tt_delack; 928 f_callout = tcp_timer_delack_discard; 929 f_reset = TT_DELACK_RST; 930 break; 931 case 
TT_REXMT: 932 t_callout = &tp->t_timers->tt_rexmt; 933 f_callout = tcp_timer_rexmt_discard; 934 f_reset = TT_REXMT_RST; 935 break; 936 case TT_PERSIST: 937 t_callout = &tp->t_timers->tt_persist; 938 f_callout = tcp_timer_persist_discard; 939 f_reset = TT_PERSIST_RST; 940 break; 941 case TT_KEEP: 942 t_callout = &tp->t_timers->tt_keep; 943 f_callout = tcp_timer_keep_discard; 944 f_reset = TT_KEEP_RST; 945 break; 946 case TT_2MSL: 947 t_callout = &tp->t_timers->tt_2msl; 948 f_callout = tcp_timer_2msl_discard; 949 f_reset = TT_2MSL_RST; 950 break; 951 default: 952 if (tp->t_fb->tfb_tcp_timer_stop) { 953 /* 954 * XXXrrs we need to look at this with the 955 * stop case below (flags). 956 */ 957 tp->t_fb->tfb_tcp_timer_stop(tp, timer_type); 958 return; 959 } 960 panic("tp %p bad timer_type %#x", tp, timer_type); 961 } 962 963 if (tp->t_timers->tt_flags & timer_type) { 964 if ((callout_stop(t_callout) > 0) && 965 (tp->t_timers->tt_flags & f_reset)) { 966 tp->t_timers->tt_flags &= ~(timer_type | f_reset); 967 } else { 968 /* 969 * Can't stop the callout, defer tcpcb actual deletion 970 * to the last tcp timer discard callout. 
971 * The TT_STOPPED flag will ensure that no tcp timer 972 * callouts can be restarted on our behalf, and 973 * past this point currently running callouts waiting 974 * on inp lock will return right away after the 975 * classical check for callout reset/stop events: 976 * callout_pending() || !callout_active() 977 */ 978 callout_reset(t_callout, 1, f_callout, tp); 979 } 980 } 981 } 982 983 #define ticks_to_msecs(t) (1000*(t) / hz) 984 985 void 986 tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, 987 struct xtcp_timer *xtimer) 988 { 989 sbintime_t now; 990 991 bzero(xtimer, sizeof(*xtimer)); 992 if (timer == NULL) 993 return; 994 now = getsbinuptime(); 995 if (callout_active(&timer->tt_delack)) 996 xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS; 997 if (callout_active(&timer->tt_rexmt)) 998 xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS; 999 if (callout_active(&timer->tt_persist)) 1000 xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS; 1001 if (callout_active(&timer->tt_keep)) 1002 xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS; 1003 if (callout_active(&timer->tt_2msl)) 1004 xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS; 1005 xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime); 1006 } 1007