1 /*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 30 */ 31 32 #include <sys/cdefs.h> 33 __FBSDID("$FreeBSD$"); 34 35 #include "opt_inet.h" 36 #include "opt_inet6.h" 37 #include "opt_tcpdebug.h" 38 #include "opt_rss.h" 39 40 #include <sys/param.h> 41 #include <sys/kernel.h> 42 #include <sys/lock.h> 43 #include <sys/mbuf.h> 44 #include <sys/mutex.h> 45 #include <sys/protosw.h> 46 #include <sys/smp.h> 47 #include <sys/socket.h> 48 #include <sys/socketvar.h> 49 #include <sys/sysctl.h> 50 #include <sys/systm.h> 51 52 #include <net/if.h> 53 #include <net/route.h> 54 #include <net/rss_config.h> 55 #include <net/vnet.h> 56 #include <net/netisr.h> 57 58 #include <netinet/cc.h> 59 #include <netinet/in.h> 60 #include <netinet/in_pcb.h> 61 #include <netinet/in_rss.h> 62 #include <netinet/in_systm.h> 63 #ifdef INET6 64 #include <netinet6/in6_pcb.h> 65 #endif 66 #include <netinet/ip_var.h> 67 #include <netinet/tcp_fsm.h> 68 #include <netinet/tcp_timer.h> 69 #include <netinet/tcp_var.h> 70 #ifdef INET6 71 #include <netinet6/tcp6_var.h> 72 #endif 73 #include <netinet/tcpip.h> 74 #ifdef TCPDEBUG 75 #include <netinet/tcp_debug.h> 76 #endif 77 78 int tcp_keepinit; 79 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, 80 &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection"); 81 82 int tcp_keepidle; 83 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW, 84 &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin"); 85 86 int tcp_keepintvl; 87 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW, 88 &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes"); 89 90 int tcp_delacktime; 91 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW, 92 &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", 93 "Time before a delayed ACK is sent"); 94 95 int tcp_msl; 96 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, 97 &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); 98 99 int tcp_rexmit_min; 100 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW, 101 &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", 102 "Minimum Retransmission Timeout"); 103 104 int tcp_rexmit_slop; 105 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW, 106 &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", 107 "Retransmission Timer Slop"); 108 109 static int always_keepalive = 1; 110 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, 111 &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); 112 113 int tcp_fast_finwait2_recycle = 0; 114 SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, 115 &tcp_fast_finwait2_recycle, 0, 116 "Recycle closed FIN_WAIT_2 connections faster"); 117 118 int tcp_finwait2_timeout; 119 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW, 120 &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout"); 121 122 int tcp_keepcnt = TCPTV_KEEPCNT; 123 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0, 124 "Number of keepalive probes to send"); 125 126 /* max idle probes */ 127 int tcp_maxpersistidle; 128 129 static int tcp_rexmit_drop_options = 0; 130 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW, 131 &tcp_rexmit_drop_options, 0, 132 "Drop TCP options from 3rd and later retransmitted SYN"); 133 134 static VNET_DEFINE(int, tcp_pmtud_blackhole_detect); 135 #define V_tcp_pmtud_blackhole_detect VNET(tcp_pmtud_blackhole_detect) 136 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, 137 CTLFLAG_RW|CTLFLAG_VNET, 138 &VNET_NAME(tcp_pmtud_blackhole_detect), 0, 139 "Path MTU Discovery Black Hole Detection Enabled"); 140 141 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated); 142 #define V_tcp_pmtud_blackhole_activated \ 143 VNET(tcp_pmtud_blackhole_activated) 144 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated, 145 CTLFLAG_RD|CTLFLAG_VNET, 146 &VNET_NAME(tcp_pmtud_blackhole_activated), 0, 147 "Path MTU Discovery Black Hole Detection, Activation Count"); 148 149 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss); 150 #define V_tcp_pmtud_blackhole_activated_min_mss \ 151 VNET(tcp_pmtud_blackhole_activated_min_mss) 152 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss, 153 CTLFLAG_RD|CTLFLAG_VNET, 154 &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0, 155 "Path MTU Discovery Black Hole Detection, Activation Count at min MSS"); 156 157 static VNET_DEFINE(int, tcp_pmtud_blackhole_failed); 158 #define V_tcp_pmtud_blackhole_failed VNET(tcp_pmtud_blackhole_failed) 159 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed, 160 CTLFLAG_RD|CTLFLAG_VNET, 161 &VNET_NAME(tcp_pmtud_blackhole_failed), 0, 162 "Path MTU Discovery Black Hole Detection, Failure Count"); 163 164 #ifdef INET 165 static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200; 166 #define V_tcp_pmtud_blackhole_mss VNET(tcp_pmtud_blackhole_mss) 167 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, 168 CTLFLAG_RW|CTLFLAG_VNET, 169 &VNET_NAME(tcp_pmtud_blackhole_mss), 0, 170 "Path MTU Discovery Black Hole Detection lowered MSS"); 171 #endif 172 173 #ifdef INET6 174 static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220; 175 #define V_tcp_v6pmtud_blackhole_mss VNET(tcp_v6pmtud_blackhole_mss) 176 SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss, 177 CTLFLAG_RW|CTLFLAG_VNET, 178 &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0, 179 "Path MTU Discovery IPv6 Black Hole Detection lowered MSS"); 180 #endif 181 182 #ifdef RSS 183 static int per_cpu_timers = 1; 184 #else 185 static int per_cpu_timers = 0; 186 #endif 187 SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW, 188 &per_cpu_timers , 0, "run tcp timers on all cpus"); 189 190 #if 0 191 #define INP_CPU(inp) (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \ 192 ((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0) 193 #endif 194 195 /* 196 * Map the given inp to a CPU id. 197 * 198 * This queries RSS if it's compiled in, else it defaults to the current 199 * CPU ID. 200 */ 201 static inline int 202 inp_to_cpuid(struct inpcb *inp) 203 { 204 u_int cpuid; 205 206 #ifdef RSS 207 if (per_cpu_timers) { 208 cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); 209 if (cpuid == NETISR_CPUID_NONE) 210 return (curcpu); /* XXX */ 211 else 212 return (cpuid); 213 } 214 #else 215 /* Legacy, pre-RSS behaviour */ 216 if (per_cpu_timers) { 217 /* 218 * We don't have a flowid -> cpuid mapping, so cheat and 219 * just map unknown cpuids to curcpu. Not the best, but 220 * apparently better than defaulting to swi 0. 221 */ 222 cpuid = inp->inp_flowid % (mp_maxid + 1); 223 if (! CPU_ABSENT(cpuid)) 224 return (cpuid); 225 return (curcpu); 226 } 227 #endif 228 /* Default for RSS and non-RSS - cpuid 0 */ 229 else { 230 return (0); 231 } 232 } 233 234 /* 235 * Tcp protocol timeout routine called every 500 ms. 236 * Updates timestamps used for TCP 237 * causes finite state machine actions if timers expire. 238 */ 239 void 240 tcp_slowtimo(void) 241 { 242 VNET_ITERATOR_DECL(vnet_iter); 243 244 VNET_LIST_RLOCK_NOSLEEP(); 245 VNET_FOREACH(vnet_iter) { 246 CURVNET_SET(vnet_iter); 247 (void) tcp_tw_2msl_scan(0); 248 CURVNET_RESTORE(); 249 } 250 VNET_LIST_RUNLOCK_NOSLEEP(); 251 } 252 253 int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = 254 { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; 255 256 int tcp_backoff[TCP_MAXRXTSHIFT + 1] = 257 { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 }; 258 259 static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */ 260 261 /* 262 * TCP timer processing. 263 */ 264 265 void 266 tcp_timer_delack(void *xtp) 267 { 268 struct tcpcb *tp = xtp; 269 struct inpcb *inp; 270 CURVNET_SET(tp->t_vnet); 271 272 inp = tp->t_inpcb; 273 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 274 INP_WLOCK(inp); 275 if (callout_pending(&tp->t_timers->tt_delack) || 276 !callout_active(&tp->t_timers->tt_delack)) { 277 INP_WUNLOCK(inp); 278 CURVNET_RESTORE(); 279 return; 280 } 281 callout_deactivate(&tp->t_timers->tt_delack); 282 if ((inp->inp_flags & INP_DROPPED) != 0) { 283 INP_WUNLOCK(inp); 284 CURVNET_RESTORE(); 285 return; 286 } 287 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 288 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 289 KASSERT((tp->t_timers->tt_flags & TT_DELACK) != 0, 290 ("%s: tp %p delack callout should be running", __func__, tp)); 291 292 tp->t_flags |= TF_ACKNOW; 293 TCPSTAT_INC(tcps_delack); 294 (void) tcp_output(tp); 295 INP_WUNLOCK(inp); 296 CURVNET_RESTORE(); 297 } 298 299 void 300 tcp_timer_2msl(void *xtp) 301 { 302 struct tcpcb *tp = xtp; 303 struct inpcb *inp; 304 CURVNET_SET(tp->t_vnet); 305 #ifdef TCPDEBUG 306 int ostate; 307 308 ostate = tp->t_state; 309 #endif 310 INP_INFO_WLOCK(&V_tcbinfo); 311 inp = tp->t_inpcb; 312 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 313 INP_WLOCK(inp); 314 tcp_free_sackholes(tp); 315 if (callout_pending(&tp->t_timers->tt_2msl) || 316 !callout_active(&tp->t_timers->tt_2msl)) { 317 INP_WUNLOCK(tp->t_inpcb); 318 INP_INFO_WUNLOCK(&V_tcbinfo); 319 CURVNET_RESTORE(); 320 return; 321 } 322 callout_deactivate(&tp->t_timers->tt_2msl); 323 if ((inp->inp_flags & INP_DROPPED) != 0) { 324 INP_WUNLOCK(inp); 325 INP_INFO_WUNLOCK(&V_tcbinfo); 326 CURVNET_RESTORE(); 327 return; 328 } 329 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 330 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 331 KASSERT((tp->t_timers->tt_flags & TT_2MSL) != 0, 332 ("%s: tp %p 2msl callout should be running", __func__, tp)); 333 /* 334 * 2 MSL timeout in shutdown went off. If we're closed but 335 * still waiting for peer to close and connection has been idle 336 * too long, or if 2MSL time is up from TIME_WAIT, delete connection 337 * control block. Otherwise, check again in a bit. 338 * 339 * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed, 340 * there's no point in hanging onto FIN_WAIT_2 socket. Just close it. 341 * Ignore fact that there were recent incoming segments. 342 */ 343 if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 && 344 tp->t_inpcb && tp->t_inpcb->inp_socket && 345 (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { 346 TCPSTAT_INC(tcps_finwait2_drops); 347 tp = tcp_close(tp); 348 } else { 349 if (tp->t_state != TCPS_TIME_WAIT && 350 ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) { 351 if (!callout_reset(&tp->t_timers->tt_2msl, 352 TP_KEEPINTVL(tp), tcp_timer_2msl, tp)) { 353 tp->t_timers->tt_flags &= ~TT_2MSL_RST; 354 } 355 } else 356 tp = tcp_close(tp); 357 } 358 359 #ifdef TCPDEBUG 360 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 361 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 362 PRU_SLOWTIMO); 363 #endif 364 if (tp != NULL) 365 INP_WUNLOCK(inp); 366 INP_INFO_WUNLOCK(&V_tcbinfo); 367 CURVNET_RESTORE(); 368 } 369 370 void 371 tcp_timer_keep(void *xtp) 372 { 373 struct tcpcb *tp = xtp; 374 struct tcptemp *t_template; 375 struct inpcb *inp; 376 CURVNET_SET(tp->t_vnet); 377 #ifdef TCPDEBUG 378 int ostate; 379 380 ostate = tp->t_state; 381 #endif 382 INP_INFO_WLOCK(&V_tcbinfo); 383 inp = tp->t_inpcb; 384 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 385 INP_WLOCK(inp); 386 if (callout_pending(&tp->t_timers->tt_keep) || 387 !callout_active(&tp->t_timers->tt_keep)) { 388 INP_WUNLOCK(inp); 389 INP_INFO_WUNLOCK(&V_tcbinfo); 390 CURVNET_RESTORE(); 391 return; 392 } 393 callout_deactivate(&tp->t_timers->tt_keep); 394 if ((inp->inp_flags & INP_DROPPED) != 0) { 395 INP_WUNLOCK(inp); 396 INP_INFO_WUNLOCK(&V_tcbinfo); 397 CURVNET_RESTORE(); 398 return; 399 } 400 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 401 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 402 KASSERT((tp->t_timers->tt_flags & TT_KEEP) != 0, 403 ("%s: tp %p keep callout should be running", __func__, tp)); 404 /* 405 * Keep-alive timer went off; send something 406 * or drop connection if idle for too long. 407 */ 408 TCPSTAT_INC(tcps_keeptimeo); 409 if (tp->t_state < TCPS_ESTABLISHED) 410 goto dropit; 411 if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 412 tp->t_state <= TCPS_CLOSING) { 413 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 414 goto dropit; 415 /* 416 * Send a packet designed to force a response 417 * if the peer is up and reachable: 418 * either an ACK if the connection is still alive, 419 * or an RST if the peer has closed the connection 420 * due to timeout or reboot. 421 * Using sequence number tp->snd_una-1 422 * causes the transmitted zero-length segment 423 * to lie outside the receive window; 424 * by the protocol spec, this requires the 425 * correspondent TCP to respond. 426 */ 427 TCPSTAT_INC(tcps_keepprobe); 428 t_template = tcpip_maketemplate(inp); 429 if (t_template) { 430 tcp_respond(tp, t_template->tt_ipgen, 431 &t_template->tt_t, (struct mbuf *)NULL, 432 tp->rcv_nxt, tp->snd_una - 1, 0); 433 free(t_template, M_TEMP); 434 } 435 if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp), 436 tcp_timer_keep, tp)) { 437 tp->t_timers->tt_flags &= ~TT_KEEP_RST; 438 } 439 } else if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp), 440 tcp_timer_keep, tp)) { 441 tp->t_timers->tt_flags &= ~TT_KEEP_RST; 442 } 443 444 #ifdef TCPDEBUG 445 if (inp->inp_socket->so_options & SO_DEBUG) 446 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 447 PRU_SLOWTIMO); 448 #endif 449 INP_WUNLOCK(inp); 450 INP_INFO_WUNLOCK(&V_tcbinfo); 451 CURVNET_RESTORE(); 452 return; 453 454 dropit: 455 TCPSTAT_INC(tcps_keepdrops); 456 tp = tcp_drop(tp, ETIMEDOUT); 457 458 #ifdef TCPDEBUG 459 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 460 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 461 PRU_SLOWTIMO); 462 #endif 463 if (tp != NULL) 464 INP_WUNLOCK(tp->t_inpcb); 465 INP_INFO_WUNLOCK(&V_tcbinfo); 466 CURVNET_RESTORE(); 467 } 468 469 void 470 tcp_timer_persist(void *xtp) 471 { 472 struct tcpcb *tp = xtp; 473 struct inpcb *inp; 474 CURVNET_SET(tp->t_vnet); 475 #ifdef TCPDEBUG 476 int ostate; 477 478 ostate = tp->t_state; 479 #endif 480 INP_INFO_WLOCK(&V_tcbinfo); 481 inp = tp->t_inpcb; 482 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 483 INP_WLOCK(inp); 484 if (callout_pending(&tp->t_timers->tt_persist) || 485 !callout_active(&tp->t_timers->tt_persist)) { 486 INP_WUNLOCK(inp); 487 INP_INFO_WUNLOCK(&V_tcbinfo); 488 CURVNET_RESTORE(); 489 return; 490 } 491 callout_deactivate(&tp->t_timers->tt_persist); 492 if ((inp->inp_flags & INP_DROPPED) != 0) { 493 INP_WUNLOCK(inp); 494 INP_INFO_WUNLOCK(&V_tcbinfo); 495 CURVNET_RESTORE(); 496 return; 497 } 498 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 499 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 500 KASSERT((tp->t_timers->tt_flags & TT_PERSIST) != 0, 501 ("%s: tp %p persist callout should be running", __func__, tp)); 502 /* 503 * Persistance timer into zero window. 504 * Force a byte to be output, if possible. 505 */ 506 TCPSTAT_INC(tcps_persisttimeo); 507 /* 508 * Hack: if the peer is dead/unreachable, we do not 509 * time out if the window is closed. After a full 510 * backoff, drop the connection if the idle time 511 * (no responses to probes) reaches the maximum 512 * backoff that we would use if retransmitting. 513 */ 514 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 515 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 516 ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 517 TCPSTAT_INC(tcps_persistdrop); 518 tp = tcp_drop(tp, ETIMEDOUT); 519 goto out; 520 } 521 /* 522 * If the user has closed the socket then drop a persisting 523 * connection after a much reduced timeout. 524 */ 525 if (tp->t_state > TCPS_CLOSE_WAIT && 526 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 527 TCPSTAT_INC(tcps_persistdrop); 528 tp = tcp_drop(tp, ETIMEDOUT); 529 goto out; 530 } 531 tcp_setpersist(tp); 532 tp->t_flags |= TF_FORCEDATA; 533 (void) tcp_output(tp); 534 tp->t_flags &= ~TF_FORCEDATA; 535 536 out: 537 #ifdef TCPDEBUG 538 if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG) 539 tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO); 540 #endif 541 if (tp != NULL) 542 INP_WUNLOCK(inp); 543 INP_INFO_WUNLOCK(&V_tcbinfo); 544 CURVNET_RESTORE(); 545 } 546 547 void 548 tcp_timer_rexmt(void * xtp) 549 { 550 struct tcpcb *tp = xtp; 551 CURVNET_SET(tp->t_vnet); 552 int rexmt; 553 int headlocked; 554 struct inpcb *inp; 555 #ifdef TCPDEBUG 556 int ostate; 557 558 ostate = tp->t_state; 559 #endif 560 561 INP_INFO_RLOCK(&V_tcbinfo); 562 inp = tp->t_inpcb; 563 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 564 INP_WLOCK(inp); 565 if (callout_pending(&tp->t_timers->tt_rexmt) || 566 !callout_active(&tp->t_timers->tt_rexmt)) { 567 INP_WUNLOCK(inp); 568 INP_INFO_RUNLOCK(&V_tcbinfo); 569 CURVNET_RESTORE(); 570 return; 571 } 572 callout_deactivate(&tp->t_timers->tt_rexmt); 573 if ((inp->inp_flags & INP_DROPPED) != 0) { 574 INP_WUNLOCK(inp); 575 INP_INFO_RUNLOCK(&V_tcbinfo); 576 CURVNET_RESTORE(); 577 return; 578 } 579 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 580 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 581 KASSERT((tp->t_timers->tt_flags & TT_REXMT) != 0, 582 ("%s: tp %p rexmt callout should be running", __func__, tp)); 583 tcp_free_sackholes(tp); 584 /* 585 * Retransmission timer went off. Message has not 586 * been acked within retransmit interval. Back off 587 * to a longer retransmit interval and retransmit one segment. 588 */ 589 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { 590 tp->t_rxtshift = TCP_MAXRXTSHIFT; 591 TCPSTAT_INC(tcps_timeoutdrop); 592 in_pcbref(inp); 593 INP_INFO_RUNLOCK(&V_tcbinfo); 594 INP_WUNLOCK(inp); 595 INP_INFO_WLOCK(&V_tcbinfo); 596 INP_WLOCK(inp); 597 if (in_pcbrele_wlocked(inp)) { 598 INP_INFO_WUNLOCK(&V_tcbinfo); 599 CURVNET_RESTORE(); 600 return; 601 } 602 if (inp->inp_flags & INP_DROPPED) { 603 INP_WUNLOCK(inp); 604 INP_INFO_WUNLOCK(&V_tcbinfo); 605 CURVNET_RESTORE(); 606 return; 607 } 608 609 tp = tcp_drop(tp, tp->t_softerror ? 610 tp->t_softerror : ETIMEDOUT); 611 headlocked = 1; 612 goto out; 613 } 614 INP_INFO_RUNLOCK(&V_tcbinfo); 615 headlocked = 0; 616 if (tp->t_state == TCPS_SYN_SENT) { 617 /* 618 * If the SYN was retransmitted, indicate CWND to be 619 * limited to 1 segment in cc_conn_init(). 620 */ 621 tp->snd_cwnd = 1; 622 } else if (tp->t_rxtshift == 1) { 623 /* 624 * first retransmit; record ssthresh and cwnd so they can 625 * be recovered if this turns out to be a "bad" retransmit. 626 * A retransmit is considered "bad" if an ACK for this 627 * segment is received within RTT/2 interval; the assumption 628 * here is that the ACK was already in flight. See 629 * "On Estimating End-to-End Network Path Properties" by 630 * Allman and Paxson for more details. 631 */ 632 tp->snd_cwnd_prev = tp->snd_cwnd; 633 tp->snd_ssthresh_prev = tp->snd_ssthresh; 634 tp->snd_recover_prev = tp->snd_recover; 635 if (IN_FASTRECOVERY(tp->t_flags)) 636 tp->t_flags |= TF_WASFRECOVERY; 637 else 638 tp->t_flags &= ~TF_WASFRECOVERY; 639 if (IN_CONGRECOVERY(tp->t_flags)) 640 tp->t_flags |= TF_WASCRECOVERY; 641 else 642 tp->t_flags &= ~TF_WASCRECOVERY; 643 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 644 tp->t_flags |= TF_PREVVALID; 645 } else 646 tp->t_flags &= ~TF_PREVVALID; 647 TCPSTAT_INC(tcps_rexmttimeo); 648 if (tp->t_state == TCPS_SYN_SENT) 649 rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift]; 650 else 651 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 652 TCPT_RANGESET(tp->t_rxtcur, rexmt, 653 tp->t_rttmin, TCPTV_REXMTMAX); 654 655 /* 656 * We enter the path for PLMTUD if connection is established or, if 657 * connection is FIN_WAIT_1 status, reason for the last is that if 658 * amount of data we send is very small, we could send it in couple of 659 * packets and process straight to FIN. In that case we won't catch 660 * ESTABLISHED state. 661 */ 662 if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) 663 || (tp->t_state == TCPS_FIN_WAIT_1))) { 664 int optlen; 665 #ifdef INET6 666 int isipv6; 667 #endif 668 669 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) == 670 (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) && 671 (tp->t_rxtshift <= 2)) { 672 /* 673 * Enter Path MTU Black-hole Detection mechanism: 674 * - Disable Path MTU Discovery (IP "DF" bit). 675 * - Reduce MTU to lower value than what we 676 * negotiated with peer. 677 */ 678 /* Record that we may have found a black hole. */ 679 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 680 681 /* Keep track of previous MSS. */ 682 optlen = tp->t_maxopd - tp->t_maxseg; 683 tp->t_pmtud_saved_maxopd = tp->t_maxopd; 684 685 /* 686 * Reduce the MSS to blackhole value or to the default 687 * in an attempt to retransmit. 688 */ 689 #ifdef INET6 690 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; 691 if (isipv6 && 692 tp->t_maxopd > V_tcp_v6pmtud_blackhole_mss) { 693 /* Use the sysctl tuneable blackhole MSS. */ 694 tp->t_maxopd = V_tcp_v6pmtud_blackhole_mss; 695 V_tcp_pmtud_blackhole_activated++; 696 } else if (isipv6) { 697 /* Use the default MSS. */ 698 tp->t_maxopd = V_tcp_v6mssdflt; 699 /* 700 * Disable Path MTU Discovery when we switch to 701 * minmss. 702 */ 703 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 704 V_tcp_pmtud_blackhole_activated_min_mss++; 705 } 706 #endif 707 #if defined(INET6) && defined(INET) 708 else 709 #endif 710 #ifdef INET 711 if (tp->t_maxopd > V_tcp_pmtud_blackhole_mss) { 712 /* Use the sysctl tuneable blackhole MSS. */ 713 tp->t_maxopd = V_tcp_pmtud_blackhole_mss; 714 V_tcp_pmtud_blackhole_activated++; 715 } else { 716 /* Use the default MSS. */ 717 tp->t_maxopd = V_tcp_mssdflt; 718 /* 719 * Disable Path MTU Discovery when we switch to 720 * minmss. 721 */ 722 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 723 V_tcp_pmtud_blackhole_activated_min_mss++; 724 } 725 #endif 726 tp->t_maxseg = tp->t_maxopd - optlen; 727 /* 728 * Reset the slow-start flight size 729 * as it may depend on the new MSS. 730 */ 731 if (CC_ALGO(tp)->conn_init != NULL) 732 CC_ALGO(tp)->conn_init(tp->ccv); 733 } else { 734 /* 735 * If further retransmissions are still unsuccessful 736 * with a lowered MTU, maybe this isn't a blackhole and 737 * we restore the previous MSS and blackhole detection 738 * flags. 739 */ 740 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 741 (tp->t_rxtshift > 4)) { 742 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 743 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 744 optlen = tp->t_maxopd - tp->t_maxseg; 745 tp->t_maxopd = tp->t_pmtud_saved_maxopd; 746 tp->t_maxseg = tp->t_maxopd - optlen; 747 V_tcp_pmtud_blackhole_failed++; 748 /* 749 * Reset the slow-start flight size as it 750 * may depend on the new MSS. 751 */ 752 if (CC_ALGO(tp)->conn_init != NULL) 753 CC_ALGO(tp)->conn_init(tp->ccv); 754 } 755 } 756 } 757 758 /* 759 * Disable RFC1323 and SACK if we haven't got any response to 760 * our third SYN to work-around some broken terminal servers 761 * (most of which have hopefully been retired) that have bad VJ 762 * header compression code which trashes TCP segments containing 763 * unknown-to-them TCP options. 764 */ 765 if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && 766 (tp->t_rxtshift == 3)) 767 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); 768 /* 769 * If we backed off this far, our srtt estimate is probably bogus. 770 * Clobber it so we'll take the next rtt measurement as our srtt; 771 * move the current srtt into rttvar to keep the current 772 * retransmit times until then. 773 */ 774 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 775 #ifdef INET6 776 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 777 in6_losing(tp->t_inpcb); 778 #endif 779 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 780 tp->t_srtt = 0; 781 } 782 tp->snd_nxt = tp->snd_una; 783 tp->snd_recover = tp->snd_max; 784 /* 785 * Force a segment to be sent. 786 */ 787 tp->t_flags |= TF_ACKNOW; 788 /* 789 * If timing a segment in this window, stop the timer. 790 */ 791 tp->t_rtttime = 0; 792 793 cc_cong_signal(tp, NULL, CC_RTO); 794 795 (void) tcp_output(tp); 796 797 out: 798 #ifdef TCPDEBUG 799 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 800 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 801 PRU_SLOWTIMO); 802 #endif 803 if (tp != NULL) 804 INP_WUNLOCK(inp); 805 if (headlocked) 806 INP_INFO_WUNLOCK(&V_tcbinfo); 807 CURVNET_RESTORE(); 808 } 809 810 void 811 tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta) 812 { 813 struct callout *t_callout; 814 timeout_t *f_callout; 815 struct inpcb *inp = tp->t_inpcb; 816 int cpu = inp_to_cpuid(inp); 817 uint32_t f_reset; 818 819 #ifdef TCP_OFFLOAD 820 if (tp->t_flags & TF_TOE) 821 return; 822 #endif 823 824 if (tp->t_timers->tt_flags & TT_STOPPED) 825 return; 826 827 switch (timer_type) { 828 case TT_DELACK: 829 t_callout = &tp->t_timers->tt_delack; 830 f_callout = tcp_timer_delack; 831 f_reset = TT_DELACK_RST; 832 break; 833 case TT_REXMT: 834 t_callout = &tp->t_timers->tt_rexmt; 835 f_callout = tcp_timer_rexmt; 836 f_reset = TT_REXMT_RST; 837 break; 838 case TT_PERSIST: 839 t_callout = &tp->t_timers->tt_persist; 840 f_callout = tcp_timer_persist; 841 f_reset = TT_PERSIST_RST; 842 break; 843 case TT_KEEP: 844 t_callout = &tp->t_timers->tt_keep; 845 f_callout = tcp_timer_keep; 846 f_reset = TT_KEEP_RST; 847 break; 848 case TT_2MSL: 849 t_callout = &tp->t_timers->tt_2msl; 850 f_callout = tcp_timer_2msl; 851 f_reset = TT_2MSL_RST; 852 break; 853 default: 854 panic("tp %p bad timer_type %#x", tp, timer_type); 855 } 856 if (delta == 0) { 857 if ((tp->t_timers->tt_flags & timer_type) && 858 callout_stop(t_callout) && 859 (tp->t_timers->tt_flags & f_reset)) { 860 tp->t_timers->tt_flags &= ~(timer_type | f_reset); 861 } 862 } else { 863 if ((tp->t_timers->tt_flags & timer_type) == 0) { 864 tp->t_timers->tt_flags |= (timer_type | f_reset); 865 callout_reset_on(t_callout, delta, f_callout, tp, cpu); 866 } else { 867 /* Reset already running callout on the same CPU. */ 868 if (!callout_reset(t_callout, delta, f_callout, tp)) { 869 /* 870 * Callout not cancelled, consider it as not 871 * properly restarted. */ 872 tp->t_timers->tt_flags &= ~f_reset; 873 } 874 } 875 } 876 } 877 878 int 879 tcp_timer_active(struct tcpcb *tp, uint32_t timer_type) 880 { 881 struct callout *t_callout; 882 883 switch (timer_type) { 884 case TT_DELACK: 885 t_callout = &tp->t_timers->tt_delack; 886 break; 887 case TT_REXMT: 888 t_callout = &tp->t_timers->tt_rexmt; 889 break; 890 case TT_PERSIST: 891 t_callout = &tp->t_timers->tt_persist; 892 break; 893 case TT_KEEP: 894 t_callout = &tp->t_timers->tt_keep; 895 break; 896 case TT_2MSL: 897 t_callout = &tp->t_timers->tt_2msl; 898 break; 899 default: 900 panic("tp %p bad timer_type %#x", tp, timer_type); 901 } 902 return callout_active(t_callout); 903 } 904 905 void 906 tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type) 907 { 908 struct callout *t_callout; 909 timeout_t *f_callout; 910 uint32_t f_reset; 911 912 tp->t_timers->tt_flags |= TT_STOPPED; 913 914 switch (timer_type) { 915 case TT_DELACK: 916 t_callout = &tp->t_timers->tt_delack; 917 f_callout = tcp_timer_delack_discard; 918 f_reset = TT_DELACK_RST; 919 break; 920 case TT_REXMT: 921 t_callout = &tp->t_timers->tt_rexmt; 922 f_callout = tcp_timer_rexmt_discard; 923 f_reset = TT_REXMT_RST; 924 break; 925 case TT_PERSIST: 926 t_callout = &tp->t_timers->tt_persist; 927 f_callout = tcp_timer_persist_discard; 928 f_reset = TT_PERSIST_RST; 929 break; 930 case TT_KEEP: 931 t_callout = &tp->t_timers->tt_keep; 932 f_callout = tcp_timer_keep_discard; 933 f_reset = TT_KEEP_RST; 934 break; 935 case TT_2MSL: 936 t_callout = &tp->t_timers->tt_2msl; 937 f_callout = tcp_timer_2msl_discard; 938 f_reset = TT_2MSL_RST; 939 break; 940 default: 941 panic("tp %p bad timer_type %#x", tp, timer_type); 942 } 943 944 if (tp->t_timers->tt_flags & timer_type) { 945 if (callout_stop(t_callout) && 946 (tp->t_timers->tt_flags & f_reset)) { 947 tp->t_timers->tt_flags &= ~(timer_type | f_reset); 948 } else { 949 /* 950 * Can't stop the callout, defer tcpcb actual deletion 951 * to the last tcp timer discard callout. 952 * The TT_STOPPED flag will ensure that no tcp timer 953 * callouts can be restarted on our behalf, and 954 * past this point currently running callouts waiting 955 * on inp lock will return right away after the 956 * classical check for callout reset/stop events: 957 * callout_pending() || !callout_active() 958 */ 959 callout_reset(t_callout, 1, f_callout, tp); 960 } 961 } 962 } 963 964 #define ticks_to_msecs(t) (1000*(t) / hz) 965 966 void 967 tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, 968 struct xtcp_timer *xtimer) 969 { 970 sbintime_t now; 971 972 bzero(xtimer, sizeof(*xtimer)); 973 if (timer == NULL) 974 return; 975 now = getsbinuptime(); 976 if (callout_active(&timer->tt_delack)) 977 xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS; 978 if (callout_active(&timer->tt_rexmt)) 979 xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS; 980 if (callout_active(&timer->tt_persist)) 981 xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS; 982 if (callout_active(&timer->tt_keep)) 983 xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS; 984 if (callout_active(&timer->tt_2msl)) 985 xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS; 986 xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime); 987 } 988