1 /*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 30 */ 31 32 #include <sys/cdefs.h> 33 __FBSDID("$FreeBSD$"); 34 35 #include "opt_inet.h" 36 #include "opt_inet6.h" 37 #include "opt_tcpdebug.h" 38 #include "opt_rss.h" 39 40 #include <sys/param.h> 41 #include <sys/kernel.h> 42 #include <sys/lock.h> 43 #include <sys/mbuf.h> 44 #include <sys/mutex.h> 45 #include <sys/protosw.h> 46 #include <sys/smp.h> 47 #include <sys/socket.h> 48 #include <sys/socketvar.h> 49 #include <sys/sysctl.h> 50 #include <sys/systm.h> 51 52 #include <net/if.h> 53 #include <net/route.h> 54 #include <net/rss_config.h> 55 #include <net/vnet.h> 56 #include <net/netisr.h> 57 58 #include <netinet/in.h> 59 #include <netinet/in_kdtrace.h> 60 #include <netinet/in_pcb.h> 61 #include <netinet/in_rss.h> 62 #include <netinet/in_systm.h> 63 #ifdef INET6 64 #include <netinet6/in6_pcb.h> 65 #endif 66 #include <netinet/ip_var.h> 67 #include <netinet/tcp.h> 68 #include <netinet/tcp_fsm.h> 69 #include <netinet/tcp_timer.h> 70 #include <netinet/tcp_var.h> 71 #include <netinet/cc/cc.h> 72 #ifdef INET6 73 #include <netinet6/tcp6_var.h> 74 #endif 75 #include <netinet/tcpip.h> 76 #ifdef TCPDEBUG 77 #include <netinet/tcp_debug.h> 78 #endif 79 80 int tcp_persmin; 81 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW, 82 &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval"); 83 84 int tcp_persmax; 85 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW, 86 &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval"); 87 88 int tcp_keepinit; 89 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, 90 &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection"); 91 92 int tcp_keepidle; 93 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW, 94 &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin"); 95 96 int tcp_keepintvl; 97 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW, 98 &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes"); 99 100 int tcp_delacktime; 101 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW, 102 &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", 103 "Time before a delayed ACK is sent"); 104 105 int tcp_msl; 106 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, 107 &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); 108 109 int tcp_rexmit_min; 110 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW, 111 &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", 112 "Minimum Retransmission Timeout"); 113 114 int tcp_rexmit_slop; 115 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW, 116 &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", 117 "Retransmission Timer Slop"); 118 119 static int always_keepalive = 1; 120 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, 121 &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); 122 123 int tcp_fast_finwait2_recycle = 0; 124 SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, 125 &tcp_fast_finwait2_recycle, 0, 126 "Recycle closed FIN_WAIT_2 connections faster"); 127 128 int tcp_finwait2_timeout; 129 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW, 130 &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout"); 131 132 int tcp_keepcnt = TCPTV_KEEPCNT; 133 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0, 134 "Number of keepalive probes to send"); 135 136 /* max idle probes */ 137 int tcp_maxpersistidle; 138 139 static int tcp_rexmit_drop_options = 0; 140 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW, 141 &tcp_rexmit_drop_options, 0, 142 "Drop TCP options from 3rd and later retransmitted SYN"); 143 144 static VNET_DEFINE(int, tcp_pmtud_blackhole_detect); 145 #define V_tcp_pmtud_blackhole_detect VNET(tcp_pmtud_blackhole_detect) 146 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, 147 CTLFLAG_RW|CTLFLAG_VNET, 148 &VNET_NAME(tcp_pmtud_blackhole_detect), 0, 149 "Path MTU Discovery Black Hole Detection Enabled"); 150 151 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated); 152 #define V_tcp_pmtud_blackhole_activated \ 153 VNET(tcp_pmtud_blackhole_activated) 154 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated, 155 CTLFLAG_RD|CTLFLAG_VNET, 156 &VNET_NAME(tcp_pmtud_blackhole_activated), 0, 157 "Path MTU Discovery Black Hole Detection, Activation Count"); 158 159 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss); 160 #define V_tcp_pmtud_blackhole_activated_min_mss \ 161 VNET(tcp_pmtud_blackhole_activated_min_mss) 162 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss, 163 CTLFLAG_RD|CTLFLAG_VNET, 164 &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0, 165 "Path MTU Discovery Black Hole Detection, Activation Count at min MSS"); 166 167 static VNET_DEFINE(int, tcp_pmtud_blackhole_failed); 168 #define V_tcp_pmtud_blackhole_failed VNET(tcp_pmtud_blackhole_failed) 169 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed, 170 CTLFLAG_RD|CTLFLAG_VNET, 171 &VNET_NAME(tcp_pmtud_blackhole_failed), 0, 172 "Path MTU Discovery Black Hole Detection, Failure Count"); 173 174 #ifdef INET 175 static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200; 176 #define V_tcp_pmtud_blackhole_mss VNET(tcp_pmtud_blackhole_mss) 177 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, 178 CTLFLAG_RW|CTLFLAG_VNET, 179 &VNET_NAME(tcp_pmtud_blackhole_mss), 0, 180 "Path MTU Discovery Black Hole Detection lowered MSS"); 181 #endif 182 183 #ifdef INET6 184 static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220; 185 #define V_tcp_v6pmtud_blackhole_mss VNET(tcp_v6pmtud_blackhole_mss) 186 SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss, 187 CTLFLAG_RW|CTLFLAG_VNET, 188 &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0, 189 "Path MTU Discovery IPv6 Black Hole Detection lowered MSS"); 190 #endif 191 192 #ifdef RSS 193 static int per_cpu_timers = 1; 194 #else 195 static int per_cpu_timers = 0; 196 #endif 197 SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW, 198 &per_cpu_timers , 0, "run tcp timers on all cpus"); 199 200 #if 0 201 #define INP_CPU(inp) (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \ 202 ((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0) 203 #endif 204 205 /* 206 * Map the given inp to a CPU id. 207 * 208 * This queries RSS if it's compiled in, else it defaults to the current 209 * CPU ID. 210 */ 211 static inline int 212 inp_to_cpuid(struct inpcb *inp) 213 { 214 u_int cpuid; 215 216 #ifdef RSS 217 if (per_cpu_timers) { 218 cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); 219 if (cpuid == NETISR_CPUID_NONE) 220 return (curcpu); /* XXX */ 221 else 222 return (cpuid); 223 } 224 #else 225 /* Legacy, pre-RSS behaviour */ 226 if (per_cpu_timers) { 227 /* 228 * We don't have a flowid -> cpuid mapping, so cheat and 229 * just map unknown cpuids to curcpu. Not the best, but 230 * apparently better than defaulting to swi 0. 231 */ 232 cpuid = inp->inp_flowid % (mp_maxid + 1); 233 if (! CPU_ABSENT(cpuid)) 234 return (cpuid); 235 return (curcpu); 236 } 237 #endif 238 /* Default for RSS and non-RSS - cpuid 0 */ 239 else { 240 return (0); 241 } 242 } 243 244 /* 245 * Tcp protocol timeout routine called every 500 ms. 246 * Updates timestamps used for TCP 247 * causes finite state machine actions if timers expire. 248 */ 249 void 250 tcp_slowtimo(void) 251 { 252 VNET_ITERATOR_DECL(vnet_iter); 253 254 VNET_LIST_RLOCK_NOSLEEP(); 255 VNET_FOREACH(vnet_iter) { 256 CURVNET_SET(vnet_iter); 257 (void) tcp_tw_2msl_scan(0); 258 CURVNET_RESTORE(); 259 } 260 VNET_LIST_RUNLOCK_NOSLEEP(); 261 } 262 263 int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = 264 { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; 265 266 int tcp_backoff[TCP_MAXRXTSHIFT + 1] = 267 { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 }; 268 269 static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */ 270 271 /* 272 * TCP timer processing. 273 */ 274 275 void 276 tcp_timer_delack(void *xtp) 277 { 278 struct tcpcb *tp = xtp; 279 struct inpcb *inp; 280 CURVNET_SET(tp->t_vnet); 281 282 inp = tp->t_inpcb; 283 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 284 INP_WLOCK(inp); 285 if (callout_pending(&tp->t_timers->tt_delack) || 286 !callout_active(&tp->t_timers->tt_delack)) { 287 INP_WUNLOCK(inp); 288 CURVNET_RESTORE(); 289 return; 290 } 291 callout_deactivate(&tp->t_timers->tt_delack); 292 if ((inp->inp_flags & INP_DROPPED) != 0) { 293 INP_WUNLOCK(inp); 294 CURVNET_RESTORE(); 295 return; 296 } 297 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 298 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 299 KASSERT((tp->t_timers->tt_flags & TT_DELACK) != 0, 300 ("%s: tp %p delack callout should be running", __func__, tp)); 301 302 tp->t_flags |= TF_ACKNOW; 303 TCPSTAT_INC(tcps_delack); 304 (void) tp->t_fb->tfb_tcp_output(tp); 305 INP_WUNLOCK(inp); 306 CURVNET_RESTORE(); 307 } 308 309 void 310 tcp_timer_2msl(void *xtp) 311 { 312 struct tcpcb *tp = xtp; 313 struct inpcb *inp; 314 CURVNET_SET(tp->t_vnet); 315 #ifdef TCPDEBUG 316 int ostate; 317 318 ostate = tp->t_state; 319 #endif 320 INP_INFO_RLOCK(&V_tcbinfo); 321 inp = tp->t_inpcb; 322 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 323 INP_WLOCK(inp); 324 tcp_free_sackholes(tp); 325 if (callout_pending(&tp->t_timers->tt_2msl) || 326 !callout_active(&tp->t_timers->tt_2msl)) { 327 INP_WUNLOCK(tp->t_inpcb); 328 INP_INFO_RUNLOCK(&V_tcbinfo); 329 CURVNET_RESTORE(); 330 return; 331 } 332 callout_deactivate(&tp->t_timers->tt_2msl); 333 if ((inp->inp_flags & INP_DROPPED) != 0) { 334 INP_WUNLOCK(inp); 335 INP_INFO_RUNLOCK(&V_tcbinfo); 336 CURVNET_RESTORE(); 337 return; 338 } 339 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 340 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 341 KASSERT((tp->t_timers->tt_flags & TT_2MSL) != 0, 342 ("%s: tp %p 2msl callout should be running", __func__, tp)); 343 /* 344 * 2 MSL timeout in shutdown went off. If we're closed but 345 * still waiting for peer to close and connection has been idle 346 * too long delete connection control block. Otherwise, check 347 * again in a bit. 348 * 349 * If in TIME_WAIT state just ignore as this timeout is handled in 350 * tcp_tw_2msl_scan(). 351 * 352 * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed, 353 * there's no point in hanging onto FIN_WAIT_2 socket. Just close it. 354 * Ignore fact that there were recent incoming segments. 355 */ 356 if ((inp->inp_flags & INP_TIMEWAIT) != 0) { 357 INP_WUNLOCK(inp); 358 INP_INFO_RUNLOCK(&V_tcbinfo); 359 CURVNET_RESTORE(); 360 return; 361 } 362 if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 && 363 tp->t_inpcb && tp->t_inpcb->inp_socket && 364 (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { 365 TCPSTAT_INC(tcps_finwait2_drops); 366 tp = tcp_close(tp); 367 } else { 368 if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) { 369 if (!callout_reset(&tp->t_timers->tt_2msl, 370 TP_KEEPINTVL(tp), tcp_timer_2msl, tp)) { 371 tp->t_timers->tt_flags &= ~TT_2MSL_RST; 372 } 373 } else 374 tp = tcp_close(tp); 375 } 376 377 #ifdef TCPDEBUG 378 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 379 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 380 PRU_SLOWTIMO); 381 #endif 382 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 383 384 if (tp != NULL) 385 INP_WUNLOCK(inp); 386 INP_INFO_RUNLOCK(&V_tcbinfo); 387 CURVNET_RESTORE(); 388 } 389 390 void 391 tcp_timer_keep(void *xtp) 392 { 393 struct tcpcb *tp = xtp; 394 struct tcptemp *t_template; 395 struct inpcb *inp; 396 CURVNET_SET(tp->t_vnet); 397 #ifdef TCPDEBUG 398 int ostate; 399 400 ostate = tp->t_state; 401 #endif 402 INP_INFO_RLOCK(&V_tcbinfo); 403 inp = tp->t_inpcb; 404 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 405 INP_WLOCK(inp); 406 if (callout_pending(&tp->t_timers->tt_keep) || 407 !callout_active(&tp->t_timers->tt_keep)) { 408 INP_WUNLOCK(inp); 409 INP_INFO_RUNLOCK(&V_tcbinfo); 410 CURVNET_RESTORE(); 411 return; 412 } 413 callout_deactivate(&tp->t_timers->tt_keep); 414 if ((inp->inp_flags & INP_DROPPED) != 0) { 415 INP_WUNLOCK(inp); 416 INP_INFO_RUNLOCK(&V_tcbinfo); 417 CURVNET_RESTORE(); 418 return; 419 } 420 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 421 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 422 KASSERT((tp->t_timers->tt_flags & TT_KEEP) != 0, 423 ("%s: tp %p keep callout should be running", __func__, tp)); 424 /* 425 * Keep-alive timer went off; send something 426 * or drop connection if idle for too long. 427 */ 428 TCPSTAT_INC(tcps_keeptimeo); 429 if (tp->t_state < TCPS_ESTABLISHED) 430 goto dropit; 431 if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 432 tp->t_state <= TCPS_CLOSING) { 433 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 434 goto dropit; 435 /* 436 * Send a packet designed to force a response 437 * if the peer is up and reachable: 438 * either an ACK if the connection is still alive, 439 * or an RST if the peer has closed the connection 440 * due to timeout or reboot. 441 * Using sequence number tp->snd_una-1 442 * causes the transmitted zero-length segment 443 * to lie outside the receive window; 444 * by the protocol spec, this requires the 445 * correspondent TCP to respond. 446 */ 447 TCPSTAT_INC(tcps_keepprobe); 448 t_template = tcpip_maketemplate(inp); 449 if (t_template) { 450 tcp_respond(tp, t_template->tt_ipgen, 451 &t_template->tt_t, (struct mbuf *)NULL, 452 tp->rcv_nxt, tp->snd_una - 1, 0); 453 free(t_template, M_TEMP); 454 } 455 if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp), 456 tcp_timer_keep, tp)) { 457 tp->t_timers->tt_flags &= ~TT_KEEP_RST; 458 } 459 } else if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp), 460 tcp_timer_keep, tp)) { 461 tp->t_timers->tt_flags &= ~TT_KEEP_RST; 462 } 463 464 #ifdef TCPDEBUG 465 if (inp->inp_socket->so_options & SO_DEBUG) 466 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 467 PRU_SLOWTIMO); 468 #endif 469 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 470 INP_WUNLOCK(inp); 471 INP_INFO_RUNLOCK(&V_tcbinfo); 472 CURVNET_RESTORE(); 473 return; 474 475 dropit: 476 TCPSTAT_INC(tcps_keepdrops); 477 tp = tcp_drop(tp, ETIMEDOUT); 478 479 #ifdef TCPDEBUG 480 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 481 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 482 PRU_SLOWTIMO); 483 #endif 484 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 485 if (tp != NULL) 486 INP_WUNLOCK(tp->t_inpcb); 487 INP_INFO_RUNLOCK(&V_tcbinfo); 488 CURVNET_RESTORE(); 489 } 490 491 void 492 tcp_timer_persist(void *xtp) 493 { 494 struct tcpcb *tp = xtp; 495 struct inpcb *inp; 496 CURVNET_SET(tp->t_vnet); 497 #ifdef TCPDEBUG 498 int ostate; 499 500 ostate = tp->t_state; 501 #endif 502 INP_INFO_RLOCK(&V_tcbinfo); 503 inp = tp->t_inpcb; 504 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 505 INP_WLOCK(inp); 506 if (callout_pending(&tp->t_timers->tt_persist) || 507 !callout_active(&tp->t_timers->tt_persist)) { 508 INP_WUNLOCK(inp); 509 INP_INFO_RUNLOCK(&V_tcbinfo); 510 CURVNET_RESTORE(); 511 return; 512 } 513 callout_deactivate(&tp->t_timers->tt_persist); 514 if ((inp->inp_flags & INP_DROPPED) != 0) { 515 INP_WUNLOCK(inp); 516 INP_INFO_RUNLOCK(&V_tcbinfo); 517 CURVNET_RESTORE(); 518 return; 519 } 520 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 521 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 522 KASSERT((tp->t_timers->tt_flags & TT_PERSIST) != 0, 523 ("%s: tp %p persist callout should be running", __func__, tp)); 524 /* 525 * Persistence timer into zero window. 526 * Force a byte to be output, if possible. 527 */ 528 TCPSTAT_INC(tcps_persisttimeo); 529 /* 530 * Hack: if the peer is dead/unreachable, we do not 531 * time out if the window is closed. After a full 532 * backoff, drop the connection if the idle time 533 * (no responses to probes) reaches the maximum 534 * backoff that we would use if retransmitting. 535 */ 536 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 537 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 538 ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 539 TCPSTAT_INC(tcps_persistdrop); 540 tp = tcp_drop(tp, ETIMEDOUT); 541 goto out; 542 } 543 /* 544 * If the user has closed the socket then drop a persisting 545 * connection after a much reduced timeout. 546 */ 547 if (tp->t_state > TCPS_CLOSE_WAIT && 548 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 549 TCPSTAT_INC(tcps_persistdrop); 550 tp = tcp_drop(tp, ETIMEDOUT); 551 goto out; 552 } 553 tcp_setpersist(tp); 554 tp->t_flags |= TF_FORCEDATA; 555 (void) tp->t_fb->tfb_tcp_output(tp); 556 tp->t_flags &= ~TF_FORCEDATA; 557 558 out: 559 #ifdef TCPDEBUG 560 if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG) 561 tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO); 562 #endif 563 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 564 if (tp != NULL) 565 INP_WUNLOCK(inp); 566 INP_INFO_RUNLOCK(&V_tcbinfo); 567 CURVNET_RESTORE(); 568 } 569 570 void 571 tcp_timer_rexmt(void * xtp) 572 { 573 struct tcpcb *tp = xtp; 574 CURVNET_SET(tp->t_vnet); 575 int rexmt; 576 int headlocked; 577 struct inpcb *inp; 578 #ifdef TCPDEBUG 579 int ostate; 580 581 ostate = tp->t_state; 582 #endif 583 584 INP_INFO_RLOCK(&V_tcbinfo); 585 inp = tp->t_inpcb; 586 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 587 INP_WLOCK(inp); 588 if (callout_pending(&tp->t_timers->tt_rexmt) || 589 !callout_active(&tp->t_timers->tt_rexmt)) { 590 INP_WUNLOCK(inp); 591 INP_INFO_RUNLOCK(&V_tcbinfo); 592 CURVNET_RESTORE(); 593 return; 594 } 595 callout_deactivate(&tp->t_timers->tt_rexmt); 596 if ((inp->inp_flags & INP_DROPPED) != 0) { 597 INP_WUNLOCK(inp); 598 INP_INFO_RUNLOCK(&V_tcbinfo); 599 CURVNET_RESTORE(); 600 return; 601 } 602 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 603 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 604 KASSERT((tp->t_timers->tt_flags & TT_REXMT) != 0, 605 ("%s: tp %p rexmt callout should be running", __func__, tp)); 606 tcp_free_sackholes(tp); 607 if (tp->t_fb->tfb_tcp_rexmit_tmr) { 608 /* The stack has a timer action too. */ 609 (*tp->t_fb->tfb_tcp_rexmit_tmr)(tp); 610 } 611 /* 612 * Retransmission timer went off. Message has not 613 * been acked within retransmit interval. Back off 614 * to a longer retransmit interval and retransmit one segment. 615 */ 616 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { 617 tp->t_rxtshift = TCP_MAXRXTSHIFT; 618 TCPSTAT_INC(tcps_timeoutdrop); 619 620 tp = tcp_drop(tp, tp->t_softerror ? 621 tp->t_softerror : ETIMEDOUT); 622 headlocked = 1; 623 goto out; 624 } 625 INP_INFO_RUNLOCK(&V_tcbinfo); 626 headlocked = 0; 627 if (tp->t_state == TCPS_SYN_SENT) { 628 /* 629 * If the SYN was retransmitted, indicate CWND to be 630 * limited to 1 segment in cc_conn_init(). 631 */ 632 tp->snd_cwnd = 1; 633 } else if (tp->t_rxtshift == 1) { 634 /* 635 * first retransmit; record ssthresh and cwnd so they can 636 * be recovered if this turns out to be a "bad" retransmit. 637 * A retransmit is considered "bad" if an ACK for this 638 * segment is received within RTT/2 interval; the assumption 639 * here is that the ACK was already in flight. See 640 * "On Estimating End-to-End Network Path Properties" by 641 * Allman and Paxson for more details. 642 */ 643 tp->snd_cwnd_prev = tp->snd_cwnd; 644 tp->snd_ssthresh_prev = tp->snd_ssthresh; 645 tp->snd_recover_prev = tp->snd_recover; 646 if (IN_FASTRECOVERY(tp->t_flags)) 647 tp->t_flags |= TF_WASFRECOVERY; 648 else 649 tp->t_flags &= ~TF_WASFRECOVERY; 650 if (IN_CONGRECOVERY(tp->t_flags)) 651 tp->t_flags |= TF_WASCRECOVERY; 652 else 653 tp->t_flags &= ~TF_WASCRECOVERY; 654 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 655 tp->t_flags |= TF_PREVVALID; 656 } else 657 tp->t_flags &= ~TF_PREVVALID; 658 TCPSTAT_INC(tcps_rexmttimeo); 659 if ((tp->t_state == TCPS_SYN_SENT) || 660 (tp->t_state == TCPS_SYN_RECEIVED)) 661 rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift]; 662 else 663 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 664 TCPT_RANGESET(tp->t_rxtcur, rexmt, 665 tp->t_rttmin, TCPTV_REXMTMAX); 666 667 /* 668 * We enter the path for PLMTUD if connection is established or, if 669 * connection is FIN_WAIT_1 status, reason for the last is that if 670 * amount of data we send is very small, we could send it in couple of 671 * packets and process straight to FIN. In that case we won't catch 672 * ESTABLISHED state. 673 */ 674 if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) 675 || (tp->t_state == TCPS_FIN_WAIT_1))) { 676 #ifdef INET6 677 int isipv6; 678 #endif 679 680 /* 681 * Idea here is that at each stage of mtu probe (usually, 1448 682 * -> 1188 -> 524) should be given 2 chances to recover before 683 * further clamping down. 'tp->t_rxtshift % 2 == 0' should 684 * take care of that. 685 */ 686 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) == 687 (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) && 688 (tp->t_rxtshift >= 2 && tp->t_rxtshift % 2 == 0)) { 689 /* 690 * Enter Path MTU Black-hole Detection mechanism: 691 * - Disable Path MTU Discovery (IP "DF" bit). 692 * - Reduce MTU to lower value than what we 693 * negotiated with peer. 694 */ 695 /* Record that we may have found a black hole. */ 696 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 697 698 /* Keep track of previous MSS. */ 699 tp->t_pmtud_saved_maxseg = tp->t_maxseg; 700 701 /* 702 * Reduce the MSS to blackhole value or to the default 703 * in an attempt to retransmit. 704 */ 705 #ifdef INET6 706 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; 707 if (isipv6 && 708 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { 709 /* Use the sysctl tuneable blackhole MSS. */ 710 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; 711 V_tcp_pmtud_blackhole_activated++; 712 } else if (isipv6) { 713 /* Use the default MSS. */ 714 tp->t_maxseg = V_tcp_v6mssdflt; 715 /* 716 * Disable Path MTU Discovery when we switch to 717 * minmss. 718 */ 719 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 720 V_tcp_pmtud_blackhole_activated_min_mss++; 721 } 722 #endif 723 #if defined(INET6) && defined(INET) 724 else 725 #endif 726 #ifdef INET 727 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { 728 /* Use the sysctl tuneable blackhole MSS. */ 729 tp->t_maxseg = V_tcp_pmtud_blackhole_mss; 730 V_tcp_pmtud_blackhole_activated++; 731 } else { 732 /* Use the default MSS. */ 733 tp->t_maxseg = V_tcp_mssdflt; 734 /* 735 * Disable Path MTU Discovery when we switch to 736 * minmss. 737 */ 738 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 739 V_tcp_pmtud_blackhole_activated_min_mss++; 740 } 741 #endif 742 /* 743 * Reset the slow-start flight size 744 * as it may depend on the new MSS. 745 */ 746 if (CC_ALGO(tp)->conn_init != NULL) 747 CC_ALGO(tp)->conn_init(tp->ccv); 748 } else { 749 /* 750 * If further retransmissions are still unsuccessful 751 * with a lowered MTU, maybe this isn't a blackhole and 752 * we restore the previous MSS and blackhole detection 753 * flags. 754 * The limit '6' is determined by giving each probe 755 * stage (1448, 1188, 524) 2 chances to recover. 756 */ 757 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 758 (tp->t_rxtshift > 6)) { 759 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 760 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 761 tp->t_maxseg = tp->t_pmtud_saved_maxseg; 762 V_tcp_pmtud_blackhole_failed++; 763 /* 764 * Reset the slow-start flight size as it 765 * may depend on the new MSS. 766 */ 767 if (CC_ALGO(tp)->conn_init != NULL) 768 CC_ALGO(tp)->conn_init(tp->ccv); 769 } 770 } 771 } 772 773 /* 774 * Disable RFC1323 and SACK if we haven't got any response to 775 * our third SYN to work-around some broken terminal servers 776 * (most of which have hopefully been retired) that have bad VJ 777 * header compression code which trashes TCP segments containing 778 * unknown-to-them TCP options. 779 */ 780 if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && 781 (tp->t_rxtshift == 3)) 782 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); 783 /* 784 * If we backed off this far, our srtt estimate is probably bogus. 785 * Clobber it so we'll take the next rtt measurement as our srtt; 786 * move the current srtt into rttvar to keep the current 787 * retransmit times until then. 788 */ 789 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 790 #ifdef INET6 791 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 792 in6_losing(tp->t_inpcb); 793 else 794 #endif 795 in_losing(tp->t_inpcb); 796 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 797 tp->t_srtt = 0; 798 } 799 tp->snd_nxt = tp->snd_una; 800 tp->snd_recover = tp->snd_max; 801 /* 802 * Force a segment to be sent. 803 */ 804 tp->t_flags |= TF_ACKNOW; 805 /* 806 * If timing a segment in this window, stop the timer. 807 */ 808 tp->t_rtttime = 0; 809 810 cc_cong_signal(tp, NULL, CC_RTO); 811 812 (void) tp->t_fb->tfb_tcp_output(tp); 813 814 out: 815 #ifdef TCPDEBUG 816 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 817 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 818 PRU_SLOWTIMO); 819 #endif 820 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 821 if (tp != NULL) 822 INP_WUNLOCK(inp); 823 if (headlocked) 824 INP_INFO_RUNLOCK(&V_tcbinfo); 825 CURVNET_RESTORE(); 826 } 827 828 void 829 tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta) 830 { 831 struct callout *t_callout; 832 timeout_t *f_callout; 833 struct inpcb *inp = tp->t_inpcb; 834 int cpu = inp_to_cpuid(inp); 835 uint32_t f_reset; 836 837 #ifdef TCP_OFFLOAD 838 if (tp->t_flags & TF_TOE) 839 return; 840 #endif 841 842 if (tp->t_timers->tt_flags & TT_STOPPED) 843 return; 844 845 switch (timer_type) { 846 case TT_DELACK: 847 t_callout = &tp->t_timers->tt_delack; 848 f_callout = tcp_timer_delack; 849 f_reset = TT_DELACK_RST; 850 break; 851 case TT_REXMT: 852 t_callout = &tp->t_timers->tt_rexmt; 853 f_callout = tcp_timer_rexmt; 854 f_reset = TT_REXMT_RST; 855 break; 856 case TT_PERSIST: 857 t_callout = &tp->t_timers->tt_persist; 858 f_callout = tcp_timer_persist; 859 f_reset = TT_PERSIST_RST; 860 break; 861 case TT_KEEP: 862 t_callout = &tp->t_timers->tt_keep; 863 f_callout = tcp_timer_keep; 864 f_reset = TT_KEEP_RST; 865 break; 866 case TT_2MSL: 867 t_callout = &tp->t_timers->tt_2msl; 868 f_callout = tcp_timer_2msl; 869 f_reset = TT_2MSL_RST; 870 break; 871 default: 872 if (tp->t_fb->tfb_tcp_timer_activate) { 873 tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta); 874 return; 875 } 876 panic("tp %p bad timer_type %#x", tp, timer_type); 877 } 878 if (delta == 0) { 879 if ((tp->t_timers->tt_flags & timer_type) && 880 (callout_stop(t_callout) > 0) && 881 (tp->t_timers->tt_flags & f_reset)) { 882 tp->t_timers->tt_flags &= ~(timer_type | f_reset); 883 } 884 } else { 885 if ((tp->t_timers->tt_flags & timer_type) == 0) { 886 tp->t_timers->tt_flags |= (timer_type | f_reset); 887 callout_reset_on(t_callout, delta, f_callout, tp, cpu); 888 } else { 889 /* Reset already running callout on the same CPU. */ 890 if (!callout_reset(t_callout, delta, f_callout, tp)) { 891 /* 892 * Callout not cancelled, consider it as not 893 * properly restarted. */ 894 tp->t_timers->tt_flags &= ~f_reset; 895 } 896 } 897 } 898 } 899 900 int 901 tcp_timer_active(struct tcpcb *tp, uint32_t timer_type) 902 { 903 struct callout *t_callout; 904 905 switch (timer_type) { 906 case TT_DELACK: 907 t_callout = &tp->t_timers->tt_delack; 908 break; 909 case TT_REXMT: 910 t_callout = &tp->t_timers->tt_rexmt; 911 break; 912 case TT_PERSIST: 913 t_callout = &tp->t_timers->tt_persist; 914 break; 915 case TT_KEEP: 916 t_callout = &tp->t_timers->tt_keep; 917 break; 918 case TT_2MSL: 919 t_callout = &tp->t_timers->tt_2msl; 920 break; 921 default: 922 if (tp->t_fb->tfb_tcp_timer_active) { 923 return(tp->t_fb->tfb_tcp_timer_active(tp, timer_type)); 924 } 925 panic("tp %p bad timer_type %#x", tp, timer_type); 926 } 927 return callout_active(t_callout); 928 } 929 930 void 931 tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type) 932 { 933 struct callout *t_callout; 934 uint32_t f_reset; 935 936 tp->t_timers->tt_flags |= TT_STOPPED; 937 938 switch (timer_type) { 939 case TT_DELACK: 940 t_callout = &tp->t_timers->tt_delack; 941 f_reset = TT_DELACK_RST; 942 break; 943 case TT_REXMT: 944 t_callout = &tp->t_timers->tt_rexmt; 945 f_reset = TT_REXMT_RST; 946 break; 947 case TT_PERSIST: 948 t_callout = &tp->t_timers->tt_persist; 949 f_reset = TT_PERSIST_RST; 950 break; 951 case TT_KEEP: 952 t_callout = &tp->t_timers->tt_keep; 953 f_reset = TT_KEEP_RST; 954 break; 955 case TT_2MSL: 956 t_callout = &tp->t_timers->tt_2msl; 957 f_reset = TT_2MSL_RST; 958 break; 959 default: 960 if (tp->t_fb->tfb_tcp_timer_stop) { 961 /* 962 * XXXrrs we need to look at this with the 963 * stop case below (flags). 964 */ 965 tp->t_fb->tfb_tcp_timer_stop(tp, timer_type); 966 return; 967 } 968 panic("tp %p bad timer_type %#x", tp, timer_type); 969 } 970 971 if (tp->t_timers->tt_flags & timer_type) { 972 if (callout_async_drain(t_callout, tcp_timer_discard) == 0) { 973 /* 974 * Can't stop the callout, defer tcpcb actual deletion 975 * to the last one. We do this using the async drain 976 * function and incrementing the count in 977 */ 978 tp->t_timers->tt_draincnt++; 979 } 980 } 981 } 982 983 #define ticks_to_msecs(t) (1000*(t) / hz) 984 985 void 986 tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, 987 struct xtcp_timer *xtimer) 988 { 989 sbintime_t now; 990 991 bzero(xtimer, sizeof(*xtimer)); 992 if (timer == NULL) 993 return; 994 now = getsbinuptime(); 995 if (callout_active(&timer->tt_delack)) 996 xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS; 997 if (callout_active(&timer->tt_rexmt)) 998 xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS; 999 if (callout_active(&timer->tt_persist)) 1000 xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS; 1001 if (callout_active(&timer->tt_keep)) 1002 xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS; 1003 if (callout_active(&timer->tt_2msl)) 1004 xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS; 1005 xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime); 1006 } 1007