1 /*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 30 */ 31 32 #include <sys/cdefs.h> 33 __FBSDID("$FreeBSD$"); 34 35 #include "opt_inet.h" 36 #include "opt_inet6.h" 37 #include "opt_tcpdebug.h" 38 #include "opt_rss.h" 39 40 #include <sys/param.h> 41 #include <sys/kernel.h> 42 #include <sys/lock.h> 43 #include <sys/mbuf.h> 44 #include <sys/mutex.h> 45 #include <sys/protosw.h> 46 #include <sys/smp.h> 47 #include <sys/socket.h> 48 #include <sys/socketvar.h> 49 #include <sys/sysctl.h> 50 #include <sys/systm.h> 51 52 #include <net/if.h> 53 #include <net/route.h> 54 #include <net/rss_config.h> 55 #include <net/vnet.h> 56 #include <net/netisr.h> 57 58 #include <netinet/cc.h> 59 #include <netinet/in.h> 60 #include <netinet/in_kdtrace.h> 61 #include <netinet/in_pcb.h> 62 #include <netinet/in_rss.h> 63 #include <netinet/in_systm.h> 64 #ifdef INET6 65 #include <netinet6/in6_pcb.h> 66 #endif 67 #include <netinet/ip_var.h> 68 #include <netinet/tcp_fsm.h> 69 #include <netinet/tcp_timer.h> 70 #include <netinet/tcp_var.h> 71 #ifdef INET6 72 #include <netinet6/tcp6_var.h> 73 #endif 74 #include <netinet/tcpip.h> 75 #ifdef TCPDEBUG 76 #include <netinet/tcp_debug.h> 77 #endif 78 79 int tcp_keepinit; 80 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, 81 &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection"); 82 83 int tcp_keepidle; 84 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW, 85 &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin"); 86 87 int tcp_keepintvl; 88 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW, 89 &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes"); 90 91 int tcp_delacktime; 92 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW, 93 &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", 94 "Time before a delayed ACK is sent"); 95 96 int tcp_msl; 97 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, 98 &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); 99 100 int tcp_rexmit_min; 101 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW, 102 &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", 103 "Minimum Retransmission Timeout"); 104 105 int tcp_rexmit_slop; 106 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW, 107 &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", 108 "Retransmission Timer Slop"); 109 110 static int always_keepalive = 1; 111 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, 112 &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); 113 114 int tcp_fast_finwait2_recycle = 0; 115 SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, 116 &tcp_fast_finwait2_recycle, 0, 117 "Recycle closed FIN_WAIT_2 connections faster"); 118 119 int tcp_finwait2_timeout; 120 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW, 121 &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout"); 122 123 int tcp_keepcnt = TCPTV_KEEPCNT; 124 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0, 125 "Number of keepalive probes to send"); 126 127 /* max idle probes */ 128 int tcp_maxpersistidle; 129 130 static int tcp_rexmit_drop_options = 0; 131 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW, 132 &tcp_rexmit_drop_options, 0, 133 "Drop TCP options from 3rd and later retransmitted SYN"); 134 135 static VNET_DEFINE(int, tcp_pmtud_blackhole_detect); 136 #define V_tcp_pmtud_blackhole_detect VNET(tcp_pmtud_blackhole_detect) 137 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, 138 CTLFLAG_RW|CTLFLAG_VNET, 139 &VNET_NAME(tcp_pmtud_blackhole_detect), 0, 140 "Path MTU Discovery Black Hole Detection Enabled"); 141 142 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated); 143 #define V_tcp_pmtud_blackhole_activated \ 144 VNET(tcp_pmtud_blackhole_activated) 145 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated, 146 CTLFLAG_RD|CTLFLAG_VNET, 147 &VNET_NAME(tcp_pmtud_blackhole_activated), 0, 148 "Path MTU Discovery Black Hole Detection, Activation Count"); 149 150 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss); 151 #define V_tcp_pmtud_blackhole_activated_min_mss \ 152 VNET(tcp_pmtud_blackhole_activated_min_mss) 153 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss, 154 CTLFLAG_RD|CTLFLAG_VNET, 155 &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0, 156 "Path MTU Discovery Black Hole Detection, Activation Count at min MSS"); 157 158 static VNET_DEFINE(int, tcp_pmtud_blackhole_failed); 159 #define V_tcp_pmtud_blackhole_failed VNET(tcp_pmtud_blackhole_failed) 160 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed, 161 CTLFLAG_RD|CTLFLAG_VNET, 162 &VNET_NAME(tcp_pmtud_blackhole_failed), 0, 163 "Path MTU Discovery Black Hole Detection, Failure Count"); 164 165 #ifdef INET 166 static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200; 167 #define V_tcp_pmtud_blackhole_mss VNET(tcp_pmtud_blackhole_mss) 168 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, 169 CTLFLAG_RW|CTLFLAG_VNET, 170 &VNET_NAME(tcp_pmtud_blackhole_mss), 0, 171 "Path MTU Discovery Black Hole Detection lowered MSS"); 172 #endif 173 174 #ifdef INET6 175 static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220; 176 #define V_tcp_v6pmtud_blackhole_mss VNET(tcp_v6pmtud_blackhole_mss) 177 SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss, 178 CTLFLAG_RW|CTLFLAG_VNET, 179 &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0, 180 "Path MTU Discovery IPv6 Black Hole Detection lowered MSS"); 181 #endif 182 183 #ifdef RSS 184 static int per_cpu_timers = 1; 185 #else 186 static int per_cpu_timers = 0; 187 #endif 188 SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW, 189 &per_cpu_timers , 0, "run tcp timers on all cpus"); 190 191 #if 0 192 #define INP_CPU(inp) (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \ 193 ((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0) 194 #endif 195 196 /* 197 * Map the given inp to a CPU id. 198 * 199 * This queries RSS if it's compiled in, else it defaults to the current 200 * CPU ID. 201 */ 202 static inline int 203 inp_to_cpuid(struct inpcb *inp) 204 { 205 u_int cpuid; 206 207 #ifdef RSS 208 if (per_cpu_timers) { 209 cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); 210 if (cpuid == NETISR_CPUID_NONE) 211 return (curcpu); /* XXX */ 212 else 213 return (cpuid); 214 } 215 #else 216 /* Legacy, pre-RSS behaviour */ 217 if (per_cpu_timers) { 218 /* 219 * We don't have a flowid -> cpuid mapping, so cheat and 220 * just map unknown cpuids to curcpu. Not the best, but 221 * apparently better than defaulting to swi 0. 222 */ 223 cpuid = inp->inp_flowid % (mp_maxid + 1); 224 if (! CPU_ABSENT(cpuid)) 225 return (cpuid); 226 return (curcpu); 227 } 228 #endif 229 /* Default for RSS and non-RSS - cpuid 0 */ 230 else { 231 return (0); 232 } 233 } 234 235 /* 236 * Tcp protocol timeout routine called every 500 ms. 237 * Updates timestamps used for TCP 238 * causes finite state machine actions if timers expire. 239 */ 240 void 241 tcp_slowtimo(void) 242 { 243 VNET_ITERATOR_DECL(vnet_iter); 244 245 VNET_LIST_RLOCK_NOSLEEP(); 246 VNET_FOREACH(vnet_iter) { 247 CURVNET_SET(vnet_iter); 248 (void) tcp_tw_2msl_scan(0); 249 CURVNET_RESTORE(); 250 } 251 VNET_LIST_RUNLOCK_NOSLEEP(); 252 } 253 254 int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = 255 { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; 256 257 int tcp_backoff[TCP_MAXRXTSHIFT + 1] = 258 { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 }; 259 260 static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */ 261 262 /* 263 * TCP timer processing. 264 */ 265 266 void 267 tcp_timer_delack(void *xtp) 268 { 269 struct tcpcb *tp = xtp; 270 struct inpcb *inp; 271 CURVNET_SET(tp->t_vnet); 272 273 inp = tp->t_inpcb; 274 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 275 INP_WLOCK(inp); 276 if (callout_pending(&tp->t_timers->tt_delack) || 277 !callout_active(&tp->t_timers->tt_delack)) { 278 INP_WUNLOCK(inp); 279 CURVNET_RESTORE(); 280 return; 281 } 282 callout_deactivate(&tp->t_timers->tt_delack); 283 if ((inp->inp_flags & INP_DROPPED) != 0) { 284 INP_WUNLOCK(inp); 285 CURVNET_RESTORE(); 286 return; 287 } 288 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 289 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 290 KASSERT((tp->t_timers->tt_flags & TT_DELACK) != 0, 291 ("%s: tp %p delack callout should be running", __func__, tp)); 292 293 tp->t_flags |= TF_ACKNOW; 294 TCPSTAT_INC(tcps_delack); 295 (void) tcp_output(tp); 296 INP_WUNLOCK(inp); 297 CURVNET_RESTORE(); 298 } 299 300 void 301 tcp_timer_2msl(void *xtp) 302 { 303 struct tcpcb *tp = xtp; 304 struct inpcb *inp; 305 CURVNET_SET(tp->t_vnet); 306 #ifdef TCPDEBUG 307 int ostate; 308 309 ostate = tp->t_state; 310 #endif 311 INP_INFO_RLOCK(&V_tcbinfo); 312 inp = tp->t_inpcb; 313 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 314 INP_WLOCK(inp); 315 tcp_free_sackholes(tp); 316 if (callout_pending(&tp->t_timers->tt_2msl) || 317 !callout_active(&tp->t_timers->tt_2msl)) { 318 INP_WUNLOCK(tp->t_inpcb); 319 INP_INFO_RUNLOCK(&V_tcbinfo); 320 CURVNET_RESTORE(); 321 return; 322 } 323 callout_deactivate(&tp->t_timers->tt_2msl); 324 if ((inp->inp_flags & INP_DROPPED) != 0) { 325 INP_WUNLOCK(inp); 326 INP_INFO_RUNLOCK(&V_tcbinfo); 327 CURVNET_RESTORE(); 328 return; 329 } 330 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 331 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 332 KASSERT((tp->t_timers->tt_flags & TT_2MSL) != 0, 333 ("%s: tp %p 2msl callout should be running", __func__, tp)); 334 /* 335 * 2 MSL timeout in shutdown went off. If we're closed but 336 * still waiting for peer to close and connection has been idle 337 * too long delete connection control block. Otherwise, check 338 * again in a bit. 339 * 340 * If in TIME_WAIT state just ignore as this timeout is handled in 341 * tcp_tw_2msl_scan(). 342 * 343 * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed, 344 * there's no point in hanging onto FIN_WAIT_2 socket. Just close it. 345 * Ignore fact that there were recent incoming segments. 346 */ 347 if ((inp->inp_flags & INP_TIMEWAIT) != 0) { 348 INP_WUNLOCK(inp); 349 INP_INFO_RUNLOCK(&V_tcbinfo); 350 CURVNET_RESTORE(); 351 return; 352 } 353 if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 && 354 tp->t_inpcb && tp->t_inpcb->inp_socket && 355 (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { 356 TCPSTAT_INC(tcps_finwait2_drops); 357 tp = tcp_close(tp); 358 } else { 359 if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) { 360 if (!callout_reset(&tp->t_timers->tt_2msl, 361 TP_KEEPINTVL(tp), tcp_timer_2msl, tp)) { 362 tp->t_timers->tt_flags &= ~TT_2MSL_RST; 363 } 364 } else 365 tp = tcp_close(tp); 366 } 367 368 #ifdef TCPDEBUG 369 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 370 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 371 PRU_SLOWTIMO); 372 #endif 373 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 374 375 if (tp != NULL) 376 INP_WUNLOCK(inp); 377 INP_INFO_RUNLOCK(&V_tcbinfo); 378 CURVNET_RESTORE(); 379 } 380 381 void 382 tcp_timer_keep(void *xtp) 383 { 384 struct tcpcb *tp = xtp; 385 struct tcptemp *t_template; 386 struct inpcb *inp; 387 CURVNET_SET(tp->t_vnet); 388 #ifdef TCPDEBUG 389 int ostate; 390 391 ostate = tp->t_state; 392 #endif 393 INP_INFO_RLOCK(&V_tcbinfo); 394 inp = tp->t_inpcb; 395 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 396 INP_WLOCK(inp); 397 if (callout_pending(&tp->t_timers->tt_keep) || 398 !callout_active(&tp->t_timers->tt_keep)) { 399 INP_WUNLOCK(inp); 400 INP_INFO_RUNLOCK(&V_tcbinfo); 401 CURVNET_RESTORE(); 402 return; 403 } 404 callout_deactivate(&tp->t_timers->tt_keep); 405 if ((inp->inp_flags & INP_DROPPED) != 0) { 406 INP_WUNLOCK(inp); 407 INP_INFO_RUNLOCK(&V_tcbinfo); 408 CURVNET_RESTORE(); 409 return; 410 } 411 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 412 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 413 KASSERT((tp->t_timers->tt_flags & TT_KEEP) != 0, 414 ("%s: tp %p keep callout should be running", __func__, tp)); 415 /* 416 * Keep-alive timer went off; send something 417 * or drop connection if idle for too long. 418 */ 419 TCPSTAT_INC(tcps_keeptimeo); 420 if (tp->t_state < TCPS_ESTABLISHED) 421 goto dropit; 422 if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 423 tp->t_state <= TCPS_CLOSING) { 424 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 425 goto dropit; 426 /* 427 * Send a packet designed to force a response 428 * if the peer is up and reachable: 429 * either an ACK if the connection is still alive, 430 * or an RST if the peer has closed the connection 431 * due to timeout or reboot. 432 * Using sequence number tp->snd_una-1 433 * causes the transmitted zero-length segment 434 * to lie outside the receive window; 435 * by the protocol spec, this requires the 436 * correspondent TCP to respond. 437 */ 438 TCPSTAT_INC(tcps_keepprobe); 439 t_template = tcpip_maketemplate(inp); 440 if (t_template) { 441 tcp_respond(tp, t_template->tt_ipgen, 442 &t_template->tt_t, (struct mbuf *)NULL, 443 tp->rcv_nxt, tp->snd_una - 1, 0); 444 free(t_template, M_TEMP); 445 } 446 if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp), 447 tcp_timer_keep, tp)) { 448 tp->t_timers->tt_flags &= ~TT_KEEP_RST; 449 } 450 } else if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp), 451 tcp_timer_keep, tp)) { 452 tp->t_timers->tt_flags &= ~TT_KEEP_RST; 453 } 454 455 #ifdef TCPDEBUG 456 if (inp->inp_socket->so_options & SO_DEBUG) 457 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 458 PRU_SLOWTIMO); 459 #endif 460 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 461 INP_WUNLOCK(inp); 462 INP_INFO_RUNLOCK(&V_tcbinfo); 463 CURVNET_RESTORE(); 464 return; 465 466 dropit: 467 TCPSTAT_INC(tcps_keepdrops); 468 tp = tcp_drop(tp, ETIMEDOUT); 469 470 #ifdef TCPDEBUG 471 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 472 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 473 PRU_SLOWTIMO); 474 #endif 475 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 476 if (tp != NULL) 477 INP_WUNLOCK(tp->t_inpcb); 478 INP_INFO_RUNLOCK(&V_tcbinfo); 479 CURVNET_RESTORE(); 480 } 481 482 void 483 tcp_timer_persist(void *xtp) 484 { 485 struct tcpcb *tp = xtp; 486 struct inpcb *inp; 487 CURVNET_SET(tp->t_vnet); 488 #ifdef TCPDEBUG 489 int ostate; 490 491 ostate = tp->t_state; 492 #endif 493 INP_INFO_RLOCK(&V_tcbinfo); 494 inp = tp->t_inpcb; 495 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 496 INP_WLOCK(inp); 497 if (callout_pending(&tp->t_timers->tt_persist) || 498 !callout_active(&tp->t_timers->tt_persist)) { 499 INP_WUNLOCK(inp); 500 INP_INFO_RUNLOCK(&V_tcbinfo); 501 CURVNET_RESTORE(); 502 return; 503 } 504 callout_deactivate(&tp->t_timers->tt_persist); 505 if ((inp->inp_flags & INP_DROPPED) != 0) { 506 INP_WUNLOCK(inp); 507 INP_INFO_RUNLOCK(&V_tcbinfo); 508 CURVNET_RESTORE(); 509 return; 510 } 511 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 512 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 513 KASSERT((tp->t_timers->tt_flags & TT_PERSIST) != 0, 514 ("%s: tp %p persist callout should be running", __func__, tp)); 515 /* 516 * Persistance timer into zero window. 517 * Force a byte to be output, if possible. 518 */ 519 TCPSTAT_INC(tcps_persisttimeo); 520 /* 521 * Hack: if the peer is dead/unreachable, we do not 522 * time out if the window is closed. After a full 523 * backoff, drop the connection if the idle time 524 * (no responses to probes) reaches the maximum 525 * backoff that we would use if retransmitting. 526 */ 527 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 528 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 529 ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 530 TCPSTAT_INC(tcps_persistdrop); 531 tp = tcp_drop(tp, ETIMEDOUT); 532 goto out; 533 } 534 /* 535 * If the user has closed the socket then drop a persisting 536 * connection after a much reduced timeout. 537 */ 538 if (tp->t_state > TCPS_CLOSE_WAIT && 539 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 540 TCPSTAT_INC(tcps_persistdrop); 541 tp = tcp_drop(tp, ETIMEDOUT); 542 goto out; 543 } 544 tcp_setpersist(tp); 545 tp->t_flags |= TF_FORCEDATA; 546 (void) tcp_output(tp); 547 tp->t_flags &= ~TF_FORCEDATA; 548 549 out: 550 #ifdef TCPDEBUG 551 if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG) 552 tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO); 553 #endif 554 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 555 if (tp != NULL) 556 INP_WUNLOCK(inp); 557 INP_INFO_RUNLOCK(&V_tcbinfo); 558 CURVNET_RESTORE(); 559 } 560 561 void 562 tcp_timer_rexmt(void * xtp) 563 { 564 struct tcpcb *tp = xtp; 565 CURVNET_SET(tp->t_vnet); 566 int rexmt; 567 int headlocked; 568 struct inpcb *inp; 569 #ifdef TCPDEBUG 570 int ostate; 571 572 ostate = tp->t_state; 573 #endif 574 575 INP_INFO_RLOCK(&V_tcbinfo); 576 inp = tp->t_inpcb; 577 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 578 INP_WLOCK(inp); 579 if (callout_pending(&tp->t_timers->tt_rexmt) || 580 !callout_active(&tp->t_timers->tt_rexmt)) { 581 INP_WUNLOCK(inp); 582 INP_INFO_RUNLOCK(&V_tcbinfo); 583 CURVNET_RESTORE(); 584 return; 585 } 586 callout_deactivate(&tp->t_timers->tt_rexmt); 587 if ((inp->inp_flags & INP_DROPPED) != 0) { 588 INP_WUNLOCK(inp); 589 INP_INFO_RUNLOCK(&V_tcbinfo); 590 CURVNET_RESTORE(); 591 return; 592 } 593 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 594 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 595 KASSERT((tp->t_timers->tt_flags & TT_REXMT) != 0, 596 ("%s: tp %p rexmt callout should be running", __func__, tp)); 597 tcp_free_sackholes(tp); 598 /* 599 * Retransmission timer went off. Message has not 600 * been acked within retransmit interval. Back off 601 * to a longer retransmit interval and retransmit one segment. 602 */ 603 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { 604 tp->t_rxtshift = TCP_MAXRXTSHIFT; 605 TCPSTAT_INC(tcps_timeoutdrop); 606 607 tp = tcp_drop(tp, tp->t_softerror ? 608 tp->t_softerror : ETIMEDOUT); 609 headlocked = 1; 610 goto out; 611 } 612 INP_INFO_RUNLOCK(&V_tcbinfo); 613 headlocked = 0; 614 if (tp->t_state == TCPS_SYN_SENT) { 615 /* 616 * If the SYN was retransmitted, indicate CWND to be 617 * limited to 1 segment in cc_conn_init(). 618 */ 619 tp->snd_cwnd = 1; 620 } else if (tp->t_rxtshift == 1) { 621 /* 622 * first retransmit; record ssthresh and cwnd so they can 623 * be recovered if this turns out to be a "bad" retransmit. 624 * A retransmit is considered "bad" if an ACK for this 625 * segment is received within RTT/2 interval; the assumption 626 * here is that the ACK was already in flight. See 627 * "On Estimating End-to-End Network Path Properties" by 628 * Allman and Paxson for more details. 629 */ 630 tp->snd_cwnd_prev = tp->snd_cwnd; 631 tp->snd_ssthresh_prev = tp->snd_ssthresh; 632 tp->snd_recover_prev = tp->snd_recover; 633 if (IN_FASTRECOVERY(tp->t_flags)) 634 tp->t_flags |= TF_WASFRECOVERY; 635 else 636 tp->t_flags &= ~TF_WASFRECOVERY; 637 if (IN_CONGRECOVERY(tp->t_flags)) 638 tp->t_flags |= TF_WASCRECOVERY; 639 else 640 tp->t_flags &= ~TF_WASCRECOVERY; 641 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 642 tp->t_flags |= TF_PREVVALID; 643 } else 644 tp->t_flags &= ~TF_PREVVALID; 645 TCPSTAT_INC(tcps_rexmttimeo); 646 if (tp->t_state == TCPS_SYN_SENT) 647 rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift]; 648 else 649 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 650 TCPT_RANGESET(tp->t_rxtcur, rexmt, 651 tp->t_rttmin, TCPTV_REXMTMAX); 652 653 /* 654 * We enter the path for PLMTUD if connection is established or, if 655 * connection is FIN_WAIT_1 status, reason for the last is that if 656 * amount of data we send is very small, we could send it in couple of 657 * packets and process straight to FIN. In that case we won't catch 658 * ESTABLISHED state. 659 */ 660 if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) 661 || (tp->t_state == TCPS_FIN_WAIT_1))) { 662 int optlen; 663 #ifdef INET6 664 int isipv6; 665 #endif 666 667 /* 668 * Idea here is that at each stage of mtu probe (usually, 1448 669 * -> 1188 -> 524) should be given 2 chances to recover before 670 * further clamping down. 'tp->t_rxtshift % 2 == 0' should 671 * take care of that. 672 */ 673 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) == 674 (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) && 675 (tp->t_rxtshift >= 2 && tp->t_rxtshift % 2 == 0)) { 676 /* 677 * Enter Path MTU Black-hole Detection mechanism: 678 * - Disable Path MTU Discovery (IP "DF" bit). 679 * - Reduce MTU to lower value than what we 680 * negotiated with peer. 681 */ 682 /* Record that we may have found a black hole. */ 683 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 684 685 /* Keep track of previous MSS. */ 686 optlen = tp->t_maxopd - tp->t_maxseg; 687 tp->t_pmtud_saved_maxopd = tp->t_maxopd; 688 689 /* 690 * Reduce the MSS to blackhole value or to the default 691 * in an attempt to retransmit. 692 */ 693 #ifdef INET6 694 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; 695 if (isipv6 && 696 tp->t_maxopd > V_tcp_v6pmtud_blackhole_mss) { 697 /* Use the sysctl tuneable blackhole MSS. */ 698 tp->t_maxopd = V_tcp_v6pmtud_blackhole_mss; 699 V_tcp_pmtud_blackhole_activated++; 700 } else if (isipv6) { 701 /* Use the default MSS. */ 702 tp->t_maxopd = V_tcp_v6mssdflt; 703 /* 704 * Disable Path MTU Discovery when we switch to 705 * minmss. 706 */ 707 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 708 V_tcp_pmtud_blackhole_activated_min_mss++; 709 } 710 #endif 711 #if defined(INET6) && defined(INET) 712 else 713 #endif 714 #ifdef INET 715 if (tp->t_maxopd > V_tcp_pmtud_blackhole_mss) { 716 /* Use the sysctl tuneable blackhole MSS. */ 717 tp->t_maxopd = V_tcp_pmtud_blackhole_mss; 718 V_tcp_pmtud_blackhole_activated++; 719 } else { 720 /* Use the default MSS. */ 721 tp->t_maxopd = V_tcp_mssdflt; 722 /* 723 * Disable Path MTU Discovery when we switch to 724 * minmss. 725 */ 726 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 727 V_tcp_pmtud_blackhole_activated_min_mss++; 728 } 729 #endif 730 tp->t_maxseg = tp->t_maxopd - optlen; 731 /* 732 * Reset the slow-start flight size 733 * as it may depend on the new MSS. 734 */ 735 if (CC_ALGO(tp)->conn_init != NULL) 736 CC_ALGO(tp)->conn_init(tp->ccv); 737 } else { 738 /* 739 * If further retransmissions are still unsuccessful 740 * with a lowered MTU, maybe this isn't a blackhole and 741 * we restore the previous MSS and blackhole detection 742 * flags. 743 * The limit '6' is determined by giving each probe 744 * stage (1448, 1188, 524) 2 chances to recover. 745 */ 746 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 747 (tp->t_rxtshift > 6)) { 748 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 749 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 750 optlen = tp->t_maxopd - tp->t_maxseg; 751 tp->t_maxopd = tp->t_pmtud_saved_maxopd; 752 tp->t_maxseg = tp->t_maxopd - optlen; 753 V_tcp_pmtud_blackhole_failed++; 754 /* 755 * Reset the slow-start flight size as it 756 * may depend on the new MSS. 757 */ 758 if (CC_ALGO(tp)->conn_init != NULL) 759 CC_ALGO(tp)->conn_init(tp->ccv); 760 } 761 } 762 } 763 764 /* 765 * Disable RFC1323 and SACK if we haven't got any response to 766 * our third SYN to work-around some broken terminal servers 767 * (most of which have hopefully been retired) that have bad VJ 768 * header compression code which trashes TCP segments containing 769 * unknown-to-them TCP options. 770 */ 771 if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && 772 (tp->t_rxtshift == 3)) 773 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); 774 /* 775 * If we backed off this far, our srtt estimate is probably bogus. 776 * Clobber it so we'll take the next rtt measurement as our srtt; 777 * move the current srtt into rttvar to keep the current 778 * retransmit times until then. 779 */ 780 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 781 #ifdef INET6 782 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 783 in6_losing(tp->t_inpcb); 784 #endif 785 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 786 tp->t_srtt = 0; 787 } 788 tp->snd_nxt = tp->snd_una; 789 tp->snd_recover = tp->snd_max; 790 /* 791 * Force a segment to be sent. 792 */ 793 tp->t_flags |= TF_ACKNOW; 794 /* 795 * If timing a segment in this window, stop the timer. 796 */ 797 tp->t_rtttime = 0; 798 799 cc_cong_signal(tp, NULL, CC_RTO); 800 801 (void) tcp_output(tp); 802 803 out: 804 #ifdef TCPDEBUG 805 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 806 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 807 PRU_SLOWTIMO); 808 #endif 809 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 810 if (tp != NULL) 811 INP_WUNLOCK(inp); 812 if (headlocked) 813 INP_INFO_RUNLOCK(&V_tcbinfo); 814 CURVNET_RESTORE(); 815 } 816 817 void 818 tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta) 819 { 820 struct callout *t_callout; 821 timeout_t *f_callout; 822 struct inpcb *inp = tp->t_inpcb; 823 int cpu = inp_to_cpuid(inp); 824 uint32_t f_reset; 825 826 #ifdef TCP_OFFLOAD 827 if (tp->t_flags & TF_TOE) 828 return; 829 #endif 830 831 if (tp->t_timers->tt_flags & TT_STOPPED) 832 return; 833 834 switch (timer_type) { 835 case TT_DELACK: 836 t_callout = &tp->t_timers->tt_delack; 837 f_callout = tcp_timer_delack; 838 f_reset = TT_DELACK_RST; 839 break; 840 case TT_REXMT: 841 t_callout = &tp->t_timers->tt_rexmt; 842 f_callout = tcp_timer_rexmt; 843 f_reset = TT_REXMT_RST; 844 break; 845 case TT_PERSIST: 846 t_callout = &tp->t_timers->tt_persist; 847 f_callout = tcp_timer_persist; 848 f_reset = TT_PERSIST_RST; 849 break; 850 case TT_KEEP: 851 t_callout = &tp->t_timers->tt_keep; 852 f_callout = tcp_timer_keep; 853 f_reset = TT_KEEP_RST; 854 break; 855 case TT_2MSL: 856 t_callout = &tp->t_timers->tt_2msl; 857 f_callout = tcp_timer_2msl; 858 f_reset = TT_2MSL_RST; 859 break; 860 default: 861 panic("tp %p bad timer_type %#x", tp, timer_type); 862 } 863 if (delta == 0) { 864 if ((tp->t_timers->tt_flags & timer_type) && 865 (callout_stop(t_callout) > 0) && 866 (tp->t_timers->tt_flags & f_reset)) { 867 tp->t_timers->tt_flags &= ~(timer_type | f_reset); 868 } 869 } else { 870 if ((tp->t_timers->tt_flags & timer_type) == 0) { 871 tp->t_timers->tt_flags |= (timer_type | f_reset); 872 callout_reset_on(t_callout, delta, f_callout, tp, cpu); 873 } else { 874 /* Reset already running callout on the same CPU. */ 875 if (!callout_reset(t_callout, delta, f_callout, tp)) { 876 /* 877 * Callout not cancelled, consider it as not 878 * properly restarted. */ 879 tp->t_timers->tt_flags &= ~f_reset; 880 } 881 } 882 } 883 } 884 885 int 886 tcp_timer_active(struct tcpcb *tp, uint32_t timer_type) 887 { 888 struct callout *t_callout; 889 890 switch (timer_type) { 891 case TT_DELACK: 892 t_callout = &tp->t_timers->tt_delack; 893 break; 894 case TT_REXMT: 895 t_callout = &tp->t_timers->tt_rexmt; 896 break; 897 case TT_PERSIST: 898 t_callout = &tp->t_timers->tt_persist; 899 break; 900 case TT_KEEP: 901 t_callout = &tp->t_timers->tt_keep; 902 break; 903 case TT_2MSL: 904 t_callout = &tp->t_timers->tt_2msl; 905 break; 906 default: 907 panic("tp %p bad timer_type %#x", tp, timer_type); 908 } 909 return callout_active(t_callout); 910 } 911 912 void 913 tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type) 914 { 915 struct callout *t_callout; 916 timeout_t *f_callout; 917 uint32_t f_reset; 918 919 tp->t_timers->tt_flags |= TT_STOPPED; 920 921 switch (timer_type) { 922 case TT_DELACK: 923 t_callout = &tp->t_timers->tt_delack; 924 f_callout = tcp_timer_delack_discard; 925 f_reset = TT_DELACK_RST; 926 break; 927 case TT_REXMT: 928 t_callout = &tp->t_timers->tt_rexmt; 929 f_callout = tcp_timer_rexmt_discard; 930 f_reset = TT_REXMT_RST; 931 break; 932 case TT_PERSIST: 933 t_callout = &tp->t_timers->tt_persist; 934 f_callout = tcp_timer_persist_discard; 935 f_reset = TT_PERSIST_RST; 936 break; 937 case TT_KEEP: 938 t_callout = &tp->t_timers->tt_keep; 939 f_callout = tcp_timer_keep_discard; 940 f_reset = TT_KEEP_RST; 941 break; 942 case TT_2MSL: 943 t_callout = &tp->t_timers->tt_2msl; 944 f_callout = tcp_timer_2msl_discard; 945 f_reset = TT_2MSL_RST; 946 break; 947 default: 948 panic("tp %p bad timer_type %#x", tp, timer_type); 949 } 950 951 if (tp->t_timers->tt_flags & timer_type) { 952 if ((callout_stop(t_callout) > 0) && 953 (tp->t_timers->tt_flags & f_reset)) { 954 tp->t_timers->tt_flags &= ~(timer_type | f_reset); 955 } else { 956 /* 957 * Can't stop the callout, defer tcpcb actual deletion 958 * to the last tcp timer discard callout. 959 * The TT_STOPPED flag will ensure that no tcp timer 960 * callouts can be restarted on our behalf, and 961 * past this point currently running callouts waiting 962 * on inp lock will return right away after the 963 * classical check for callout reset/stop events: 964 * callout_pending() || !callout_active() 965 */ 966 callout_reset(t_callout, 1, f_callout, tp); 967 } 968 } 969 } 970 971 #define ticks_to_msecs(t) (1000*(t) / hz) 972 973 void 974 tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, 975 struct xtcp_timer *xtimer) 976 { 977 sbintime_t now; 978 979 bzero(xtimer, sizeof(*xtimer)); 980 if (timer == NULL) 981 return; 982 now = getsbinuptime(); 983 if (callout_active(&timer->tt_delack)) 984 xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS; 985 if (callout_active(&timer->tt_rexmt)) 986 xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS; 987 if (callout_active(&timer->tt_persist)) 988 xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS; 989 if (callout_active(&timer->tt_keep)) 990 xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS; 991 if (callout_active(&timer->tt_2msl)) 992 xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS; 993 xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime); 994 } 995