1 /*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 30 */ 31 32 #include <sys/cdefs.h> 33 __FBSDID("$FreeBSD$"); 34 35 #include "opt_inet.h" 36 #include "opt_inet6.h" 37 #include "opt_tcpdebug.h" 38 #include "opt_rss.h" 39 40 #include <sys/param.h> 41 #include <sys/kernel.h> 42 #include <sys/lock.h> 43 #include <sys/mbuf.h> 44 #include <sys/mutex.h> 45 #include <sys/protosw.h> 46 #include <sys/smp.h> 47 #include <sys/socket.h> 48 #include <sys/socketvar.h> 49 #include <sys/sysctl.h> 50 #include <sys/systm.h> 51 52 #include <net/if.h> 53 #include <net/route.h> 54 #include <net/rss_config.h> 55 #include <net/vnet.h> 56 #include <net/netisr.h> 57 58 #include <netinet/cc.h> 59 #include <netinet/in.h> 60 #include <netinet/in_kdtrace.h> 61 #include <netinet/in_pcb.h> 62 #include <netinet/in_rss.h> 63 #include <netinet/in_systm.h> 64 #ifdef INET6 65 #include <netinet6/in6_pcb.h> 66 #endif 67 #include <netinet/ip_var.h> 68 #include <netinet/tcp_fsm.h> 69 #include <netinet/tcp_timer.h> 70 #include <netinet/tcp_var.h> 71 #ifdef INET6 72 #include <netinet6/tcp6_var.h> 73 #endif 74 #include <netinet/tcpip.h> 75 #ifdef TCPDEBUG 76 #include <netinet/tcp_debug.h> 77 #endif 78 79 int tcp_keepinit; 80 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, 81 &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection"); 82 83 int tcp_keepidle; 84 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW, 85 &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin"); 86 87 int tcp_keepintvl; 88 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW, 89 &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes"); 90 91 int tcp_delacktime; 92 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW, 93 &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", 94 "Time before a delayed ACK is sent"); 95 96 int tcp_msl; 97 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, 98 &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); 99 100 int tcp_rexmit_min; 101 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW, 102 &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", 103 "Minimum Retransmission Timeout"); 104 105 int tcp_rexmit_slop; 106 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW, 107 &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", 108 "Retransmission Timer Slop"); 109 110 static int always_keepalive = 1; 111 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, 112 &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); 113 114 int tcp_fast_finwait2_recycle = 0; 115 SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, 116 &tcp_fast_finwait2_recycle, 0, 117 "Recycle closed FIN_WAIT_2 connections faster"); 118 119 int tcp_finwait2_timeout; 120 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW, 121 &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout"); 122 123 int tcp_keepcnt = TCPTV_KEEPCNT; 124 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0, 125 "Number of keepalive probes to send"); 126 127 /* max idle probes */ 128 int tcp_maxpersistidle; 129 130 static int tcp_rexmit_drop_options = 0; 131 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW, 132 &tcp_rexmit_drop_options, 0, 133 "Drop TCP options from 3rd and later retransmitted SYN"); 134 135 static VNET_DEFINE(int, tcp_pmtud_blackhole_detect); 136 #define V_tcp_pmtud_blackhole_detect VNET(tcp_pmtud_blackhole_detect) 137 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, 138 CTLFLAG_RW|CTLFLAG_VNET, 139 &VNET_NAME(tcp_pmtud_blackhole_detect), 0, 140 "Path MTU Discovery Black Hole Detection Enabled"); 141 142 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated); 143 #define V_tcp_pmtud_blackhole_activated \ 144 VNET(tcp_pmtud_blackhole_activated) 145 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated, 146 CTLFLAG_RD|CTLFLAG_VNET, 147 &VNET_NAME(tcp_pmtud_blackhole_activated), 0, 148 "Path MTU Discovery Black Hole Detection, Activation Count"); 149 150 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss); 151 #define V_tcp_pmtud_blackhole_activated_min_mss \ 152 VNET(tcp_pmtud_blackhole_activated_min_mss) 153 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss, 154 CTLFLAG_RD|CTLFLAG_VNET, 155 &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0, 156 "Path MTU Discovery Black Hole Detection, Activation Count at min MSS"); 157 158 static VNET_DEFINE(int, tcp_pmtud_blackhole_failed); 159 #define V_tcp_pmtud_blackhole_failed VNET(tcp_pmtud_blackhole_failed) 160 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed, 161 CTLFLAG_RD|CTLFLAG_VNET, 162 &VNET_NAME(tcp_pmtud_blackhole_failed), 0, 163 "Path MTU Discovery Black Hole Detection, Failure Count"); 164 165 #ifdef INET 166 static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200; 167 #define V_tcp_pmtud_blackhole_mss VNET(tcp_pmtud_blackhole_mss) 168 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, 169 CTLFLAG_RW|CTLFLAG_VNET, 170 &VNET_NAME(tcp_pmtud_blackhole_mss), 0, 171 "Path MTU Discovery Black Hole Detection lowered MSS"); 172 #endif 173 174 #ifdef INET6 175 static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220; 176 #define V_tcp_v6pmtud_blackhole_mss VNET(tcp_v6pmtud_blackhole_mss) 177 SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss, 178 CTLFLAG_RW|CTLFLAG_VNET, 179 &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0, 180 "Path MTU Discovery IPv6 Black Hole Detection lowered MSS"); 181 #endif 182 183 #ifdef RSS 184 static int per_cpu_timers = 1; 185 #else 186 static int per_cpu_timers = 0; 187 #endif 188 SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW, 189 &per_cpu_timers , 0, "run tcp timers on all cpus"); 190 191 #if 0 192 #define INP_CPU(inp) (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \ 193 ((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0) 194 #endif 195 196 /* 197 * Map the given inp to a CPU id. 198 * 199 * This queries RSS if it's compiled in, else it defaults to the current 200 * CPU ID. 201 */ 202 static inline int 203 inp_to_cpuid(struct inpcb *inp) 204 { 205 u_int cpuid; 206 207 #ifdef RSS 208 if (per_cpu_timers) { 209 cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); 210 if (cpuid == NETISR_CPUID_NONE) 211 return (curcpu); /* XXX */ 212 else 213 return (cpuid); 214 } 215 #else 216 /* Legacy, pre-RSS behaviour */ 217 if (per_cpu_timers) { 218 /* 219 * We don't have a flowid -> cpuid mapping, so cheat and 220 * just map unknown cpuids to curcpu. Not the best, but 221 * apparently better than defaulting to swi 0. 222 */ 223 cpuid = inp->inp_flowid % (mp_maxid + 1); 224 if (! CPU_ABSENT(cpuid)) 225 return (cpuid); 226 return (curcpu); 227 } 228 #endif 229 /* Default for RSS and non-RSS - cpuid 0 */ 230 else { 231 return (0); 232 } 233 } 234 235 /* 236 * Tcp protocol timeout routine called every 500 ms. 237 * Updates timestamps used for TCP 238 * causes finite state machine actions if timers expire. 239 */ 240 void 241 tcp_slowtimo(void) 242 { 243 VNET_ITERATOR_DECL(vnet_iter); 244 245 VNET_LIST_RLOCK_NOSLEEP(); 246 VNET_FOREACH(vnet_iter) { 247 CURVNET_SET(vnet_iter); 248 (void) tcp_tw_2msl_scan(0); 249 CURVNET_RESTORE(); 250 } 251 VNET_LIST_RUNLOCK_NOSLEEP(); 252 } 253 254 int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = 255 { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; 256 257 int tcp_backoff[TCP_MAXRXTSHIFT + 1] = 258 { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 }; 259 260 static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */ 261 262 /* 263 * TCP timer processing. 264 */ 265 266 void 267 tcp_timer_delack(void *xtp) 268 { 269 struct tcpcb *tp = xtp; 270 struct inpcb *inp; 271 CURVNET_SET(tp->t_vnet); 272 273 inp = tp->t_inpcb; 274 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 275 INP_WLOCK(inp); 276 if (callout_pending(&tp->t_timers->tt_delack) || 277 !callout_active(&tp->t_timers->tt_delack)) { 278 INP_WUNLOCK(inp); 279 CURVNET_RESTORE(); 280 return; 281 } 282 callout_deactivate(&tp->t_timers->tt_delack); 283 if ((inp->inp_flags & INP_DROPPED) != 0) { 284 INP_WUNLOCK(inp); 285 CURVNET_RESTORE(); 286 return; 287 } 288 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 289 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 290 KASSERT((tp->t_timers->tt_flags & TT_DELACK) != 0, 291 ("%s: tp %p delack callout should be running", __func__, tp)); 292 293 tp->t_flags |= TF_ACKNOW; 294 TCPSTAT_INC(tcps_delack); 295 (void) tcp_output(tp); 296 INP_WUNLOCK(inp); 297 CURVNET_RESTORE(); 298 } 299 300 void 301 tcp_timer_2msl(void *xtp) 302 { 303 struct tcpcb *tp = xtp; 304 struct inpcb *inp; 305 CURVNET_SET(tp->t_vnet); 306 #ifdef TCPDEBUG 307 int ostate; 308 309 ostate = tp->t_state; 310 #endif 311 INP_INFO_RLOCK(&V_tcbinfo); 312 inp = tp->t_inpcb; 313 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 314 INP_WLOCK(inp); 315 tcp_free_sackholes(tp); 316 if (callout_pending(&tp->t_timers->tt_2msl) || 317 !callout_active(&tp->t_timers->tt_2msl)) { 318 INP_WUNLOCK(tp->t_inpcb); 319 INP_INFO_RUNLOCK(&V_tcbinfo); 320 CURVNET_RESTORE(); 321 return; 322 } 323 callout_deactivate(&tp->t_timers->tt_2msl); 324 if ((inp->inp_flags & INP_DROPPED) != 0) { 325 INP_WUNLOCK(inp); 326 INP_INFO_RUNLOCK(&V_tcbinfo); 327 CURVNET_RESTORE(); 328 return; 329 } 330 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 331 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 332 KASSERT((tp->t_timers->tt_flags & TT_2MSL) != 0, 333 ("%s: tp %p 2msl callout should be running", __func__, tp)); 334 /* 335 * 2 MSL timeout in shutdown went off. If we're closed but 336 * still waiting for peer to close and connection has been idle 337 * too long delete connection control block. Otherwise, check 338 * again in a bit. 339 * 340 * If in TIME_WAIT state just ignore as this timeout is handled in 341 * tcp_tw_2msl_scan(). 342 * 343 * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed, 344 * there's no point in hanging onto FIN_WAIT_2 socket. Just close it. 345 * Ignore fact that there were recent incoming segments. 346 */ 347 if ((inp->inp_flags & INP_TIMEWAIT) != 0) { 348 INP_WUNLOCK(inp); 349 INP_INFO_RUNLOCK(&V_tcbinfo); 350 CURVNET_RESTORE(); 351 return; 352 } 353 if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 && 354 tp->t_inpcb && tp->t_inpcb->inp_socket && 355 (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { 356 TCPSTAT_INC(tcps_finwait2_drops); 357 tp = tcp_close(tp); 358 } else { 359 if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) { 360 if (!callout_reset(&tp->t_timers->tt_2msl, 361 TP_KEEPINTVL(tp), tcp_timer_2msl, tp)) { 362 tp->t_timers->tt_flags &= ~TT_2MSL_RST; 363 } 364 } else 365 tp = tcp_close(tp); 366 } 367 368 #ifdef TCPDEBUG 369 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 370 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 371 PRU_SLOWTIMO); 372 #endif 373 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 374 375 if (tp != NULL) 376 INP_WUNLOCK(inp); 377 INP_INFO_RUNLOCK(&V_tcbinfo); 378 CURVNET_RESTORE(); 379 } 380 381 void 382 tcp_timer_keep(void *xtp) 383 { 384 struct tcpcb *tp = xtp; 385 struct tcptemp *t_template; 386 struct inpcb *inp; 387 CURVNET_SET(tp->t_vnet); 388 #ifdef TCPDEBUG 389 int ostate; 390 391 ostate = tp->t_state; 392 #endif 393 INP_INFO_RLOCK(&V_tcbinfo); 394 inp = tp->t_inpcb; 395 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 396 INP_WLOCK(inp); 397 if (callout_pending(&tp->t_timers->tt_keep) || 398 !callout_active(&tp->t_timers->tt_keep)) { 399 INP_WUNLOCK(inp); 400 INP_INFO_RUNLOCK(&V_tcbinfo); 401 CURVNET_RESTORE(); 402 return; 403 } 404 callout_deactivate(&tp->t_timers->tt_keep); 405 if ((inp->inp_flags & INP_DROPPED) != 0) { 406 INP_WUNLOCK(inp); 407 INP_INFO_RUNLOCK(&V_tcbinfo); 408 CURVNET_RESTORE(); 409 return; 410 } 411 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 412 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 413 KASSERT((tp->t_timers->tt_flags & TT_KEEP) != 0, 414 ("%s: tp %p keep callout should be running", __func__, tp)); 415 /* 416 * Keep-alive timer went off; send something 417 * or drop connection if idle for too long. 418 */ 419 TCPSTAT_INC(tcps_keeptimeo); 420 if (tp->t_state < TCPS_ESTABLISHED) 421 goto dropit; 422 if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 423 tp->t_state <= TCPS_CLOSING) { 424 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 425 goto dropit; 426 /* 427 * Send a packet designed to force a response 428 * if the peer is up and reachable: 429 * either an ACK if the connection is still alive, 430 * or an RST if the peer has closed the connection 431 * due to timeout or reboot. 432 * Using sequence number tp->snd_una-1 433 * causes the transmitted zero-length segment 434 * to lie outside the receive window; 435 * by the protocol spec, this requires the 436 * correspondent TCP to respond. 437 */ 438 TCPSTAT_INC(tcps_keepprobe); 439 t_template = tcpip_maketemplate(inp); 440 if (t_template) { 441 tcp_respond(tp, t_template->tt_ipgen, 442 &t_template->tt_t, (struct mbuf *)NULL, 443 tp->rcv_nxt, tp->snd_una - 1, 0); 444 free(t_template, M_TEMP); 445 } 446 if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp), 447 tcp_timer_keep, tp)) { 448 tp->t_timers->tt_flags &= ~TT_KEEP_RST; 449 } 450 } else if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp), 451 tcp_timer_keep, tp)) { 452 tp->t_timers->tt_flags &= ~TT_KEEP_RST; 453 } 454 455 #ifdef TCPDEBUG 456 if (inp->inp_socket->so_options & SO_DEBUG) 457 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 458 PRU_SLOWTIMO); 459 #endif 460 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 461 INP_WUNLOCK(inp); 462 INP_INFO_RUNLOCK(&V_tcbinfo); 463 CURVNET_RESTORE(); 464 return; 465 466 dropit: 467 TCPSTAT_INC(tcps_keepdrops); 468 tp = tcp_drop(tp, ETIMEDOUT); 469 470 #ifdef TCPDEBUG 471 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 472 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 473 PRU_SLOWTIMO); 474 #endif 475 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 476 if (tp != NULL) 477 INP_WUNLOCK(tp->t_inpcb); 478 INP_INFO_RUNLOCK(&V_tcbinfo); 479 CURVNET_RESTORE(); 480 } 481 482 void 483 tcp_timer_persist(void *xtp) 484 { 485 struct tcpcb *tp = xtp; 486 struct inpcb *inp; 487 CURVNET_SET(tp->t_vnet); 488 #ifdef TCPDEBUG 489 int ostate; 490 491 ostate = tp->t_state; 492 #endif 493 INP_INFO_RLOCK(&V_tcbinfo); 494 inp = tp->t_inpcb; 495 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 496 INP_WLOCK(inp); 497 if (callout_pending(&tp->t_timers->tt_persist) || 498 !callout_active(&tp->t_timers->tt_persist)) { 499 INP_WUNLOCK(inp); 500 INP_INFO_RUNLOCK(&V_tcbinfo); 501 CURVNET_RESTORE(); 502 return; 503 } 504 callout_deactivate(&tp->t_timers->tt_persist); 505 if ((inp->inp_flags & INP_DROPPED) != 0) { 506 INP_WUNLOCK(inp); 507 INP_INFO_RUNLOCK(&V_tcbinfo); 508 CURVNET_RESTORE(); 509 return; 510 } 511 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 512 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 513 KASSERT((tp->t_timers->tt_flags & TT_PERSIST) != 0, 514 ("%s: tp %p persist callout should be running", __func__, tp)); 515 /* 516 * Persistance timer into zero window. 517 * Force a byte to be output, if possible. 518 */ 519 TCPSTAT_INC(tcps_persisttimeo); 520 /* 521 * Hack: if the peer is dead/unreachable, we do not 522 * time out if the window is closed. After a full 523 * backoff, drop the connection if the idle time 524 * (no responses to probes) reaches the maximum 525 * backoff that we would use if retransmitting. 526 */ 527 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 528 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 529 ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 530 TCPSTAT_INC(tcps_persistdrop); 531 tp = tcp_drop(tp, ETIMEDOUT); 532 goto out; 533 } 534 /* 535 * If the user has closed the socket then drop a persisting 536 * connection after a much reduced timeout. 537 */ 538 if (tp->t_state > TCPS_CLOSE_WAIT && 539 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 540 TCPSTAT_INC(tcps_persistdrop); 541 tp = tcp_drop(tp, ETIMEDOUT); 542 goto out; 543 } 544 tcp_setpersist(tp); 545 tp->t_flags |= TF_FORCEDATA; 546 (void) tcp_output(tp); 547 tp->t_flags &= ~TF_FORCEDATA; 548 549 out: 550 #ifdef TCPDEBUG 551 if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG) 552 tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO); 553 #endif 554 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 555 if (tp != NULL) 556 INP_WUNLOCK(inp); 557 INP_INFO_RUNLOCK(&V_tcbinfo); 558 CURVNET_RESTORE(); 559 } 560 561 void 562 tcp_timer_rexmt(void * xtp) 563 { 564 struct tcpcb *tp = xtp; 565 CURVNET_SET(tp->t_vnet); 566 int rexmt; 567 int headlocked; 568 struct inpcb *inp; 569 #ifdef TCPDEBUG 570 int ostate; 571 572 ostate = tp->t_state; 573 #endif 574 575 INP_INFO_RLOCK(&V_tcbinfo); 576 inp = tp->t_inpcb; 577 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 578 INP_WLOCK(inp); 579 if (callout_pending(&tp->t_timers->tt_rexmt) || 580 !callout_active(&tp->t_timers->tt_rexmt)) { 581 INP_WUNLOCK(inp); 582 INP_INFO_RUNLOCK(&V_tcbinfo); 583 CURVNET_RESTORE(); 584 return; 585 } 586 callout_deactivate(&tp->t_timers->tt_rexmt); 587 if ((inp->inp_flags & INP_DROPPED) != 0) { 588 INP_WUNLOCK(inp); 589 INP_INFO_RUNLOCK(&V_tcbinfo); 590 CURVNET_RESTORE(); 591 return; 592 } 593 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 594 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 595 KASSERT((tp->t_timers->tt_flags & TT_REXMT) != 0, 596 ("%s: tp %p rexmt callout should be running", __func__, tp)); 597 tcp_free_sackholes(tp); 598 /* 599 * Retransmission timer went off. Message has not 600 * been acked within retransmit interval. Back off 601 * to a longer retransmit interval and retransmit one segment. 602 */ 603 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { 604 tp->t_rxtshift = TCP_MAXRXTSHIFT; 605 TCPSTAT_INC(tcps_timeoutdrop); 606 607 tp = tcp_drop(tp, tp->t_softerror ? 608 tp->t_softerror : ETIMEDOUT); 609 headlocked = 1; 610 goto out; 611 } 612 INP_INFO_RUNLOCK(&V_tcbinfo); 613 headlocked = 0; 614 if (tp->t_state == TCPS_SYN_SENT) { 615 /* 616 * If the SYN was retransmitted, indicate CWND to be 617 * limited to 1 segment in cc_conn_init(). 618 */ 619 tp->snd_cwnd = 1; 620 } else if (tp->t_rxtshift == 1) { 621 /* 622 * first retransmit; record ssthresh and cwnd so they can 623 * be recovered if this turns out to be a "bad" retransmit. 624 * A retransmit is considered "bad" if an ACK for this 625 * segment is received within RTT/2 interval; the assumption 626 * here is that the ACK was already in flight. See 627 * "On Estimating End-to-End Network Path Properties" by 628 * Allman and Paxson for more details. 629 */ 630 tp->snd_cwnd_prev = tp->snd_cwnd; 631 tp->snd_ssthresh_prev = tp->snd_ssthresh; 632 tp->snd_recover_prev = tp->snd_recover; 633 if (IN_FASTRECOVERY(tp->t_flags)) 634 tp->t_flags |= TF_WASFRECOVERY; 635 else 636 tp->t_flags &= ~TF_WASFRECOVERY; 637 if (IN_CONGRECOVERY(tp->t_flags)) 638 tp->t_flags |= TF_WASCRECOVERY; 639 else 640 tp->t_flags &= ~TF_WASCRECOVERY; 641 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 642 tp->t_flags |= TF_PREVVALID; 643 } else 644 tp->t_flags &= ~TF_PREVVALID; 645 TCPSTAT_INC(tcps_rexmttimeo); 646 if (tp->t_state == TCPS_SYN_SENT) 647 rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift]; 648 else 649 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 650 TCPT_RANGESET(tp->t_rxtcur, rexmt, 651 tp->t_rttmin, TCPTV_REXMTMAX); 652 653 /* 654 * We enter the path for PLMTUD if connection is established or, if 655 * connection is FIN_WAIT_1 status, reason for the last is that if 656 * amount of data we send is very small, we could send it in couple of 657 * packets and process straight to FIN. In that case we won't catch 658 * ESTABLISHED state. 659 */ 660 if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) 661 || (tp->t_state == TCPS_FIN_WAIT_1))) { 662 int optlen; 663 #ifdef INET6 664 int isipv6; 665 #endif 666 667 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) == 668 (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) && 669 (tp->t_rxtshift <= 2)) { 670 /* 671 * Enter Path MTU Black-hole Detection mechanism: 672 * - Disable Path MTU Discovery (IP "DF" bit). 673 * - Reduce MTU to lower value than what we 674 * negotiated with peer. 675 */ 676 /* Record that we may have found a black hole. */ 677 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 678 679 /* Keep track of previous MSS. */ 680 optlen = tp->t_maxopd - tp->t_maxseg; 681 tp->t_pmtud_saved_maxopd = tp->t_maxopd; 682 683 /* 684 * Reduce the MSS to blackhole value or to the default 685 * in an attempt to retransmit. 686 */ 687 #ifdef INET6 688 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; 689 if (isipv6 && 690 tp->t_maxopd > V_tcp_v6pmtud_blackhole_mss) { 691 /* Use the sysctl tuneable blackhole MSS. */ 692 tp->t_maxopd = V_tcp_v6pmtud_blackhole_mss; 693 V_tcp_pmtud_blackhole_activated++; 694 } else if (isipv6) { 695 /* Use the default MSS. */ 696 tp->t_maxopd = V_tcp_v6mssdflt; 697 /* 698 * Disable Path MTU Discovery when we switch to 699 * minmss. 700 */ 701 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 702 V_tcp_pmtud_blackhole_activated_min_mss++; 703 } 704 #endif 705 #if defined(INET6) && defined(INET) 706 else 707 #endif 708 #ifdef INET 709 if (tp->t_maxopd > V_tcp_pmtud_blackhole_mss) { 710 /* Use the sysctl tuneable blackhole MSS. */ 711 tp->t_maxopd = V_tcp_pmtud_blackhole_mss; 712 V_tcp_pmtud_blackhole_activated++; 713 } else { 714 /* Use the default MSS. */ 715 tp->t_maxopd = V_tcp_mssdflt; 716 /* 717 * Disable Path MTU Discovery when we switch to 718 * minmss. 719 */ 720 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 721 V_tcp_pmtud_blackhole_activated_min_mss++; 722 } 723 #endif 724 tp->t_maxseg = tp->t_maxopd - optlen; 725 /* 726 * Reset the slow-start flight size 727 * as it may depend on the new MSS. 728 */ 729 if (CC_ALGO(tp)->conn_init != NULL) 730 CC_ALGO(tp)->conn_init(tp->ccv); 731 } else { 732 /* 733 * If further retransmissions are still unsuccessful 734 * with a lowered MTU, maybe this isn't a blackhole and 735 * we restore the previous MSS and blackhole detection 736 * flags. 737 */ 738 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 739 (tp->t_rxtshift > 4)) { 740 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 741 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 742 optlen = tp->t_maxopd - tp->t_maxseg; 743 tp->t_maxopd = tp->t_pmtud_saved_maxopd; 744 tp->t_maxseg = tp->t_maxopd - optlen; 745 V_tcp_pmtud_blackhole_failed++; 746 /* 747 * Reset the slow-start flight size as it 748 * may depend on the new MSS. 749 */ 750 if (CC_ALGO(tp)->conn_init != NULL) 751 CC_ALGO(tp)->conn_init(tp->ccv); 752 } 753 } 754 } 755 756 /* 757 * Disable RFC1323 and SACK if we haven't got any response to 758 * our third SYN to work-around some broken terminal servers 759 * (most of which have hopefully been retired) that have bad VJ 760 * header compression code which trashes TCP segments containing 761 * unknown-to-them TCP options. 762 */ 763 if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && 764 (tp->t_rxtshift == 3)) 765 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); 766 /* 767 * If we backed off this far, our srtt estimate is probably bogus. 768 * Clobber it so we'll take the next rtt measurement as our srtt; 769 * move the current srtt into rttvar to keep the current 770 * retransmit times until then. 771 */ 772 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 773 #ifdef INET6 774 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 775 in6_losing(tp->t_inpcb); 776 #endif 777 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 778 tp->t_srtt = 0; 779 } 780 tp->snd_nxt = tp->snd_una; 781 tp->snd_recover = tp->snd_max; 782 /* 783 * Force a segment to be sent. 784 */ 785 tp->t_flags |= TF_ACKNOW; 786 /* 787 * If timing a segment in this window, stop the timer. 788 */ 789 tp->t_rtttime = 0; 790 791 cc_cong_signal(tp, NULL, CC_RTO); 792 793 (void) tcp_output(tp); 794 795 out: 796 #ifdef TCPDEBUG 797 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 798 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 799 PRU_SLOWTIMO); 800 #endif 801 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 802 if (tp != NULL) 803 INP_WUNLOCK(inp); 804 if (headlocked) 805 INP_INFO_RUNLOCK(&V_tcbinfo); 806 CURVNET_RESTORE(); 807 } 808 809 void 810 tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta) 811 { 812 struct callout *t_callout; 813 timeout_t *f_callout; 814 struct inpcb *inp = tp->t_inpcb; 815 int cpu = inp_to_cpuid(inp); 816 uint32_t f_reset; 817 818 #ifdef TCP_OFFLOAD 819 if (tp->t_flags & TF_TOE) 820 return; 821 #endif 822 823 if (tp->t_timers->tt_flags & TT_STOPPED) 824 return; 825 826 switch (timer_type) { 827 case TT_DELACK: 828 t_callout = &tp->t_timers->tt_delack; 829 f_callout = tcp_timer_delack; 830 f_reset = TT_DELACK_RST; 831 break; 832 case TT_REXMT: 833 t_callout = &tp->t_timers->tt_rexmt; 834 f_callout = tcp_timer_rexmt; 835 f_reset = TT_REXMT_RST; 836 break; 837 case TT_PERSIST: 838 t_callout = &tp->t_timers->tt_persist; 839 f_callout = tcp_timer_persist; 840 f_reset = TT_PERSIST_RST; 841 break; 842 case TT_KEEP: 843 t_callout = &tp->t_timers->tt_keep; 844 f_callout = tcp_timer_keep; 845 f_reset = TT_KEEP_RST; 846 break; 847 case TT_2MSL: 848 t_callout = &tp->t_timers->tt_2msl; 849 f_callout = tcp_timer_2msl; 850 f_reset = TT_2MSL_RST; 851 break; 852 default: 853 panic("tp %p bad timer_type %#x", tp, timer_type); 854 } 855 if (delta == 0) { 856 if ((tp->t_timers->tt_flags & timer_type) && 857 callout_stop(t_callout) && 858 (tp->t_timers->tt_flags & f_reset)) { 859 tp->t_timers->tt_flags &= ~(timer_type | f_reset); 860 } 861 } else { 862 if ((tp->t_timers->tt_flags & timer_type) == 0) { 863 tp->t_timers->tt_flags |= (timer_type | f_reset); 864 callout_reset_on(t_callout, delta, f_callout, tp, cpu); 865 } else { 866 /* Reset already running callout on the same CPU. */ 867 if (!callout_reset(t_callout, delta, f_callout, tp)) { 868 /* 869 * Callout not cancelled, consider it as not 870 * properly restarted. */ 871 tp->t_timers->tt_flags &= ~f_reset; 872 } 873 } 874 } 875 } 876 877 int 878 tcp_timer_active(struct tcpcb *tp, uint32_t timer_type) 879 { 880 struct callout *t_callout; 881 882 switch (timer_type) { 883 case TT_DELACK: 884 t_callout = &tp->t_timers->tt_delack; 885 break; 886 case TT_REXMT: 887 t_callout = &tp->t_timers->tt_rexmt; 888 break; 889 case TT_PERSIST: 890 t_callout = &tp->t_timers->tt_persist; 891 break; 892 case TT_KEEP: 893 t_callout = &tp->t_timers->tt_keep; 894 break; 895 case TT_2MSL: 896 t_callout = &tp->t_timers->tt_2msl; 897 break; 898 default: 899 panic("tp %p bad timer_type %#x", tp, timer_type); 900 } 901 return callout_active(t_callout); 902 } 903 904 void 905 tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type) 906 { 907 struct callout *t_callout; 908 timeout_t *f_callout; 909 uint32_t f_reset; 910 911 tp->t_timers->tt_flags |= TT_STOPPED; 912 913 switch (timer_type) { 914 case TT_DELACK: 915 t_callout = &tp->t_timers->tt_delack; 916 f_callout = tcp_timer_delack_discard; 917 f_reset = TT_DELACK_RST; 918 break; 919 case TT_REXMT: 920 t_callout = &tp->t_timers->tt_rexmt; 921 f_callout = tcp_timer_rexmt_discard; 922 f_reset = TT_REXMT_RST; 923 break; 924 case TT_PERSIST: 925 t_callout = &tp->t_timers->tt_persist; 926 f_callout = tcp_timer_persist_discard; 927 f_reset = TT_PERSIST_RST; 928 break; 929 case TT_KEEP: 930 t_callout = &tp->t_timers->tt_keep; 931 f_callout = tcp_timer_keep_discard; 932 f_reset = TT_KEEP_RST; 933 break; 934 case TT_2MSL: 935 t_callout = &tp->t_timers->tt_2msl; 936 f_callout = tcp_timer_2msl_discard; 937 f_reset = TT_2MSL_RST; 938 break; 939 default: 940 panic("tp %p bad timer_type %#x", tp, timer_type); 941 } 942 943 if (tp->t_timers->tt_flags & timer_type) { 944 if (callout_stop(t_callout) && 945 (tp->t_timers->tt_flags & f_reset)) { 946 tp->t_timers->tt_flags &= ~(timer_type | f_reset); 947 } else { 948 /* 949 * Can't stop the callout, defer tcpcb actual deletion 950 * to the last tcp timer discard callout. 951 * The TT_STOPPED flag will ensure that no tcp timer 952 * callouts can be restarted on our behalf, and 953 * past this point currently running callouts waiting 954 * on inp lock will return right away after the 955 * classical check for callout reset/stop events: 956 * callout_pending() || !callout_active() 957 */ 958 callout_reset(t_callout, 1, f_callout, tp); 959 } 960 } 961 } 962 963 #define ticks_to_msecs(t) (1000*(t) / hz) 964 965 void 966 tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, 967 struct xtcp_timer *xtimer) 968 { 969 sbintime_t now; 970 971 bzero(xtimer, sizeof(*xtimer)); 972 if (timer == NULL) 973 return; 974 now = getsbinuptime(); 975 if (callout_active(&timer->tt_delack)) 976 xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS; 977 if (callout_active(&timer->tt_rexmt)) 978 xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS; 979 if (callout_active(&timer->tt_persist)) 980 xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS; 981 if (callout_active(&timer->tt_keep)) 982 xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS; 983 if (callout_active(&timer->tt_2msl)) 984 xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS; 985 xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime); 986 } 987