1 /*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 30 */ 31 32 #include <sys/cdefs.h> 33 __FBSDID("$FreeBSD$"); 34 35 #include "opt_inet.h" 36 #include "opt_inet6.h" 37 #include "opt_tcpdebug.h" 38 #include "opt_rss.h" 39 40 #include <sys/param.h> 41 #include <sys/kernel.h> 42 #include <sys/lock.h> 43 #include <sys/mbuf.h> 44 #include <sys/mutex.h> 45 #include <sys/protosw.h> 46 #include <sys/smp.h> 47 #include <sys/socket.h> 48 #include <sys/socketvar.h> 49 #include <sys/sysctl.h> 50 #include <sys/systm.h> 51 52 #include <net/if.h> 53 #include <net/route.h> 54 #include <net/rss_config.h> 55 #include <net/vnet.h> 56 #include <net/netisr.h> 57 58 #include <netinet/in.h> 59 #include <netinet/in_kdtrace.h> 60 #include <netinet/in_pcb.h> 61 #include <netinet/in_rss.h> 62 #include <netinet/in_systm.h> 63 #ifdef INET6 64 #include <netinet6/in6_pcb.h> 65 #endif 66 #include <netinet/ip_var.h> 67 #include <netinet/tcp.h> 68 #include <netinet/tcp_fsm.h> 69 #include <netinet/tcp_timer.h> 70 #include <netinet/tcp_var.h> 71 #include <netinet/cc/cc.h> 72 #ifdef INET6 73 #include <netinet6/tcp6_var.h> 74 #endif 75 #include <netinet/tcpip.h> 76 #ifdef TCPDEBUG 77 #include <netinet/tcp_debug.h> 78 #endif 79 80 int tcp_persmin; 81 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW, 82 &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval"); 83 84 int tcp_persmax; 85 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW, 86 &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval"); 87 88 int tcp_keepinit; 89 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, 90 &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection"); 91 92 int tcp_keepidle; 93 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW, 94 &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin"); 95 96 int tcp_keepintvl; 97 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW, 98 &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes"); 99 100 int tcp_delacktime; 101 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW, 102 &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", 103 "Time before a delayed ACK is sent"); 104 105 int tcp_msl; 106 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, 107 &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); 108 109 int tcp_rexmit_min; 110 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW, 111 &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", 112 "Minimum Retransmission Timeout"); 113 114 int tcp_rexmit_slop; 115 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW, 116 &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", 117 "Retransmission Timer Slop"); 118 119 static int always_keepalive = 1; 120 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, 121 &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); 122 123 int tcp_fast_finwait2_recycle = 0; 124 SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, 125 &tcp_fast_finwait2_recycle, 0, 126 "Recycle closed FIN_WAIT_2 connections faster"); 127 128 int tcp_finwait2_timeout; 129 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW, 130 &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout"); 131 132 int tcp_keepcnt = TCPTV_KEEPCNT; 133 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0, 134 "Number of keepalive probes to send"); 135 136 /* max idle probes */ 137 int tcp_maxpersistidle; 138 139 static int tcp_rexmit_drop_options = 0; 140 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW, 141 &tcp_rexmit_drop_options, 0, 142 "Drop TCP options from 3rd and later retransmitted SYN"); 143 144 static VNET_DEFINE(int, tcp_pmtud_blackhole_detect); 145 #define V_tcp_pmtud_blackhole_detect VNET(tcp_pmtud_blackhole_detect) 146 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, 147 CTLFLAG_RW|CTLFLAG_VNET, 148 &VNET_NAME(tcp_pmtud_blackhole_detect), 0, 149 "Path MTU Discovery Black Hole Detection Enabled"); 150 151 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated); 152 #define V_tcp_pmtud_blackhole_activated \ 153 VNET(tcp_pmtud_blackhole_activated) 154 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated, 155 CTLFLAG_RD|CTLFLAG_VNET, 156 &VNET_NAME(tcp_pmtud_blackhole_activated), 0, 157 "Path MTU Discovery Black Hole Detection, Activation Count"); 158 159 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss); 160 #define V_tcp_pmtud_blackhole_activated_min_mss \ 161 VNET(tcp_pmtud_blackhole_activated_min_mss) 162 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss, 163 CTLFLAG_RD|CTLFLAG_VNET, 164 &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0, 165 "Path MTU Discovery Black Hole Detection, Activation Count at min MSS"); 166 167 static VNET_DEFINE(int, tcp_pmtud_blackhole_failed); 168 #define V_tcp_pmtud_blackhole_failed VNET(tcp_pmtud_blackhole_failed) 169 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed, 170 CTLFLAG_RD|CTLFLAG_VNET, 171 &VNET_NAME(tcp_pmtud_blackhole_failed), 0, 172 "Path MTU Discovery Black Hole Detection, Failure Count"); 173 174 #ifdef INET 175 static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200; 176 #define V_tcp_pmtud_blackhole_mss VNET(tcp_pmtud_blackhole_mss) 177 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, 178 CTLFLAG_RW|CTLFLAG_VNET, 179 &VNET_NAME(tcp_pmtud_blackhole_mss), 0, 180 "Path MTU Discovery Black Hole Detection lowered MSS"); 181 #endif 182 183 #ifdef INET6 184 static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220; 185 #define V_tcp_v6pmtud_blackhole_mss VNET(tcp_v6pmtud_blackhole_mss) 186 SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss, 187 CTLFLAG_RW|CTLFLAG_VNET, 188 &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0, 189 "Path MTU Discovery IPv6 Black Hole Detection lowered MSS"); 190 #endif 191 192 #ifdef RSS 193 static int per_cpu_timers = 1; 194 #else 195 static int per_cpu_timers = 0; 196 #endif 197 SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW, 198 &per_cpu_timers , 0, "run tcp timers on all cpus"); 199 200 #if 0 201 #define INP_CPU(inp) (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \ 202 ((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0) 203 #endif 204 205 /* 206 * Map the given inp to a CPU id. 207 * 208 * This queries RSS if it's compiled in, else it defaults to the current 209 * CPU ID. 210 */ 211 static inline int 212 inp_to_cpuid(struct inpcb *inp) 213 { 214 u_int cpuid; 215 216 #ifdef RSS 217 if (per_cpu_timers) { 218 cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); 219 if (cpuid == NETISR_CPUID_NONE) 220 return (curcpu); /* XXX */ 221 else 222 return (cpuid); 223 } 224 #else 225 /* Legacy, pre-RSS behaviour */ 226 if (per_cpu_timers) { 227 /* 228 * We don't have a flowid -> cpuid mapping, so cheat and 229 * just map unknown cpuids to curcpu. Not the best, but 230 * apparently better than defaulting to swi 0. 231 */ 232 cpuid = inp->inp_flowid % (mp_maxid + 1); 233 if (! CPU_ABSENT(cpuid)) 234 return (cpuid); 235 return (curcpu); 236 } 237 #endif 238 /* Default for RSS and non-RSS - cpuid 0 */ 239 else { 240 return (0); 241 } 242 } 243 244 /* 245 * Tcp protocol timeout routine called every 500 ms. 246 * Updates timestamps used for TCP 247 * causes finite state machine actions if timers expire. 248 */ 249 void 250 tcp_slowtimo(void) 251 { 252 VNET_ITERATOR_DECL(vnet_iter); 253 254 VNET_LIST_RLOCK_NOSLEEP(); 255 VNET_FOREACH(vnet_iter) { 256 CURVNET_SET(vnet_iter); 257 (void) tcp_tw_2msl_scan(0); 258 CURVNET_RESTORE(); 259 } 260 VNET_LIST_RUNLOCK_NOSLEEP(); 261 } 262 263 int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = 264 { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; 265 266 int tcp_backoff[TCP_MAXRXTSHIFT + 1] = 267 { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 }; 268 269 static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */ 270 271 /* 272 * TCP timer processing. 273 */ 274 275 void 276 tcp_timer_delack(void *xtp) 277 { 278 struct tcpcb *tp = xtp; 279 struct inpcb *inp; 280 CURVNET_SET(tp->t_vnet); 281 282 inp = tp->t_inpcb; 283 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 284 INP_WLOCK(inp); 285 if (callout_pending(&tp->t_timers->tt_delack) || 286 !callout_active(&tp->t_timers->tt_delack)) { 287 INP_WUNLOCK(inp); 288 CURVNET_RESTORE(); 289 return; 290 } 291 callout_deactivate(&tp->t_timers->tt_delack); 292 if ((inp->inp_flags & INP_DROPPED) != 0) { 293 INP_WUNLOCK(inp); 294 CURVNET_RESTORE(); 295 return; 296 } 297 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 298 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 299 KASSERT((tp->t_timers->tt_flags & TT_DELACK) != 0, 300 ("%s: tp %p delack callout should be running", __func__, tp)); 301 302 tp->t_flags |= TF_ACKNOW; 303 TCPSTAT_INC(tcps_delack); 304 (void) tp->t_fb->tfb_tcp_output(tp); 305 INP_WUNLOCK(inp); 306 CURVNET_RESTORE(); 307 } 308 309 void 310 tcp_timer_2msl(void *xtp) 311 { 312 struct tcpcb *tp = xtp; 313 struct inpcb *inp; 314 CURVNET_SET(tp->t_vnet); 315 #ifdef TCPDEBUG 316 int ostate; 317 318 ostate = tp->t_state; 319 #endif 320 INP_INFO_RLOCK(&V_tcbinfo); 321 inp = tp->t_inpcb; 322 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 323 INP_WLOCK(inp); 324 tcp_free_sackholes(tp); 325 if (callout_pending(&tp->t_timers->tt_2msl) || 326 !callout_active(&tp->t_timers->tt_2msl)) { 327 INP_WUNLOCK(tp->t_inpcb); 328 INP_INFO_RUNLOCK(&V_tcbinfo); 329 CURVNET_RESTORE(); 330 return; 331 } 332 callout_deactivate(&tp->t_timers->tt_2msl); 333 if ((inp->inp_flags & INP_DROPPED) != 0) { 334 INP_WUNLOCK(inp); 335 INP_INFO_RUNLOCK(&V_tcbinfo); 336 CURVNET_RESTORE(); 337 return; 338 } 339 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 340 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 341 KASSERT((tp->t_timers->tt_flags & TT_2MSL) != 0, 342 ("%s: tp %p 2msl callout should be running", __func__, tp)); 343 /* 344 * 2 MSL timeout in shutdown went off. If we're closed but 345 * still waiting for peer to close and connection has been idle 346 * too long delete connection control block. Otherwise, check 347 * again in a bit. 348 * 349 * If in TIME_WAIT state just ignore as this timeout is handled in 350 * tcp_tw_2msl_scan(). 351 * 352 * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed, 353 * there's no point in hanging onto FIN_WAIT_2 socket. Just close it. 354 * Ignore fact that there were recent incoming segments. 355 */ 356 if ((inp->inp_flags & INP_TIMEWAIT) != 0) { 357 INP_WUNLOCK(inp); 358 INP_INFO_RUNLOCK(&V_tcbinfo); 359 CURVNET_RESTORE(); 360 return; 361 } 362 if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 && 363 tp->t_inpcb && tp->t_inpcb->inp_socket && 364 (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { 365 TCPSTAT_INC(tcps_finwait2_drops); 366 tp = tcp_close(tp); 367 } else { 368 if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) { 369 if (!callout_reset(&tp->t_timers->tt_2msl, 370 TP_KEEPINTVL(tp), tcp_timer_2msl, tp)) { 371 tp->t_timers->tt_flags &= ~TT_2MSL_RST; 372 } 373 } else 374 tp = tcp_close(tp); 375 } 376 377 #ifdef TCPDEBUG 378 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 379 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 380 PRU_SLOWTIMO); 381 #endif 382 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 383 384 if (tp != NULL) 385 INP_WUNLOCK(inp); 386 INP_INFO_RUNLOCK(&V_tcbinfo); 387 CURVNET_RESTORE(); 388 } 389 390 void 391 tcp_timer_keep(void *xtp) 392 { 393 struct tcpcb *tp = xtp; 394 struct tcptemp *t_template; 395 struct inpcb *inp; 396 CURVNET_SET(tp->t_vnet); 397 #ifdef TCPDEBUG 398 int ostate; 399 400 ostate = tp->t_state; 401 #endif 402 INP_INFO_RLOCK(&V_tcbinfo); 403 inp = tp->t_inpcb; 404 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 405 INP_WLOCK(inp); 406 if (callout_pending(&tp->t_timers->tt_keep) || 407 !callout_active(&tp->t_timers->tt_keep)) { 408 INP_WUNLOCK(inp); 409 INP_INFO_RUNLOCK(&V_tcbinfo); 410 CURVNET_RESTORE(); 411 return; 412 } 413 callout_deactivate(&tp->t_timers->tt_keep); 414 if ((inp->inp_flags & INP_DROPPED) != 0) { 415 INP_WUNLOCK(inp); 416 INP_INFO_RUNLOCK(&V_tcbinfo); 417 CURVNET_RESTORE(); 418 return; 419 } 420 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 421 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 422 KASSERT((tp->t_timers->tt_flags & TT_KEEP) != 0, 423 ("%s: tp %p keep callout should be running", __func__, tp)); 424 /* 425 * Keep-alive timer went off; send something 426 * or drop connection if idle for too long. 427 */ 428 TCPSTAT_INC(tcps_keeptimeo); 429 if (tp->t_state < TCPS_ESTABLISHED) 430 goto dropit; 431 if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 432 tp->t_state <= TCPS_CLOSING) { 433 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 434 goto dropit; 435 /* 436 * Send a packet designed to force a response 437 * if the peer is up and reachable: 438 * either an ACK if the connection is still alive, 439 * or an RST if the peer has closed the connection 440 * due to timeout or reboot. 441 * Using sequence number tp->snd_una-1 442 * causes the transmitted zero-length segment 443 * to lie outside the receive window; 444 * by the protocol spec, this requires the 445 * correspondent TCP to respond. 446 */ 447 TCPSTAT_INC(tcps_keepprobe); 448 t_template = tcpip_maketemplate(inp); 449 if (t_template) { 450 tcp_respond(tp, t_template->tt_ipgen, 451 &t_template->tt_t, (struct mbuf *)NULL, 452 tp->rcv_nxt, tp->snd_una - 1, 0); 453 free(t_template, M_TEMP); 454 } 455 if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp), 456 tcp_timer_keep, tp)) { 457 tp->t_timers->tt_flags &= ~TT_KEEP_RST; 458 } 459 } else if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp), 460 tcp_timer_keep, tp)) { 461 tp->t_timers->tt_flags &= ~TT_KEEP_RST; 462 } 463 464 #ifdef TCPDEBUG 465 if (inp->inp_socket->so_options & SO_DEBUG) 466 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 467 PRU_SLOWTIMO); 468 #endif 469 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 470 INP_WUNLOCK(inp); 471 INP_INFO_RUNLOCK(&V_tcbinfo); 472 CURVNET_RESTORE(); 473 return; 474 475 dropit: 476 TCPSTAT_INC(tcps_keepdrops); 477 tp = tcp_drop(tp, ETIMEDOUT); 478 479 #ifdef TCPDEBUG 480 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 481 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 482 PRU_SLOWTIMO); 483 #endif 484 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 485 if (tp != NULL) 486 INP_WUNLOCK(tp->t_inpcb); 487 INP_INFO_RUNLOCK(&V_tcbinfo); 488 CURVNET_RESTORE(); 489 } 490 491 void 492 tcp_timer_persist(void *xtp) 493 { 494 struct tcpcb *tp = xtp; 495 struct inpcb *inp; 496 CURVNET_SET(tp->t_vnet); 497 #ifdef TCPDEBUG 498 int ostate; 499 500 ostate = tp->t_state; 501 #endif 502 INP_INFO_RLOCK(&V_tcbinfo); 503 inp = tp->t_inpcb; 504 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 505 INP_WLOCK(inp); 506 if (callout_pending(&tp->t_timers->tt_persist) || 507 !callout_active(&tp->t_timers->tt_persist)) { 508 INP_WUNLOCK(inp); 509 INP_INFO_RUNLOCK(&V_tcbinfo); 510 CURVNET_RESTORE(); 511 return; 512 } 513 callout_deactivate(&tp->t_timers->tt_persist); 514 if ((inp->inp_flags & INP_DROPPED) != 0) { 515 INP_WUNLOCK(inp); 516 INP_INFO_RUNLOCK(&V_tcbinfo); 517 CURVNET_RESTORE(); 518 return; 519 } 520 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 521 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 522 KASSERT((tp->t_timers->tt_flags & TT_PERSIST) != 0, 523 ("%s: tp %p persist callout should be running", __func__, tp)); 524 /* 525 * Persistence timer into zero window. 526 * Force a byte to be output, if possible. 527 */ 528 TCPSTAT_INC(tcps_persisttimeo); 529 /* 530 * Hack: if the peer is dead/unreachable, we do not 531 * time out if the window is closed. After a full 532 * backoff, drop the connection if the idle time 533 * (no responses to probes) reaches the maximum 534 * backoff that we would use if retransmitting. 535 */ 536 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 537 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 538 ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 539 TCPSTAT_INC(tcps_persistdrop); 540 tp = tcp_drop(tp, ETIMEDOUT); 541 goto out; 542 } 543 /* 544 * If the user has closed the socket then drop a persisting 545 * connection after a much reduced timeout. 546 */ 547 if (tp->t_state > TCPS_CLOSE_WAIT && 548 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 549 TCPSTAT_INC(tcps_persistdrop); 550 tp = tcp_drop(tp, ETIMEDOUT); 551 goto out; 552 } 553 tcp_setpersist(tp); 554 tp->t_flags |= TF_FORCEDATA; 555 (void) tp->t_fb->tfb_tcp_output(tp); 556 tp->t_flags &= ~TF_FORCEDATA; 557 558 out: 559 #ifdef TCPDEBUG 560 if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG) 561 tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO); 562 #endif 563 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 564 if (tp != NULL) 565 INP_WUNLOCK(inp); 566 INP_INFO_RUNLOCK(&V_tcbinfo); 567 CURVNET_RESTORE(); 568 } 569 570 void 571 tcp_timer_rexmt(void * xtp) 572 { 573 struct tcpcb *tp = xtp; 574 CURVNET_SET(tp->t_vnet); 575 int rexmt; 576 int headlocked; 577 struct inpcb *inp; 578 #ifdef TCPDEBUG 579 int ostate; 580 581 ostate = tp->t_state; 582 #endif 583 584 INP_INFO_RLOCK(&V_tcbinfo); 585 inp = tp->t_inpcb; 586 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 587 INP_WLOCK(inp); 588 if (callout_pending(&tp->t_timers->tt_rexmt) || 589 !callout_active(&tp->t_timers->tt_rexmt)) { 590 INP_WUNLOCK(inp); 591 INP_INFO_RUNLOCK(&V_tcbinfo); 592 CURVNET_RESTORE(); 593 return; 594 } 595 callout_deactivate(&tp->t_timers->tt_rexmt); 596 if ((inp->inp_flags & INP_DROPPED) != 0) { 597 INP_WUNLOCK(inp); 598 INP_INFO_RUNLOCK(&V_tcbinfo); 599 CURVNET_RESTORE(); 600 return; 601 } 602 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 603 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 604 KASSERT((tp->t_timers->tt_flags & TT_REXMT) != 0, 605 ("%s: tp %p rexmt callout should be running", __func__, tp)); 606 tcp_free_sackholes(tp); 607 /* 608 * Retransmission timer went off. Message has not 609 * been acked within retransmit interval. Back off 610 * to a longer retransmit interval and retransmit one segment. 611 */ 612 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { 613 tp->t_rxtshift = TCP_MAXRXTSHIFT; 614 TCPSTAT_INC(tcps_timeoutdrop); 615 616 tp = tcp_drop(tp, tp->t_softerror ? 617 tp->t_softerror : ETIMEDOUT); 618 headlocked = 1; 619 goto out; 620 } 621 INP_INFO_RUNLOCK(&V_tcbinfo); 622 headlocked = 0; 623 if (tp->t_state == TCPS_SYN_SENT) { 624 /* 625 * If the SYN was retransmitted, indicate CWND to be 626 * limited to 1 segment in cc_conn_init(). 627 */ 628 tp->snd_cwnd = 1; 629 } else if (tp->t_rxtshift == 1) { 630 /* 631 * first retransmit; record ssthresh and cwnd so they can 632 * be recovered if this turns out to be a "bad" retransmit. 633 * A retransmit is considered "bad" if an ACK for this 634 * segment is received within RTT/2 interval; the assumption 635 * here is that the ACK was already in flight. See 636 * "On Estimating End-to-End Network Path Properties" by 637 * Allman and Paxson for more details. 638 */ 639 tp->snd_cwnd_prev = tp->snd_cwnd; 640 tp->snd_ssthresh_prev = tp->snd_ssthresh; 641 tp->snd_recover_prev = tp->snd_recover; 642 if (IN_FASTRECOVERY(tp->t_flags)) 643 tp->t_flags |= TF_WASFRECOVERY; 644 else 645 tp->t_flags &= ~TF_WASFRECOVERY; 646 if (IN_CONGRECOVERY(tp->t_flags)) 647 tp->t_flags |= TF_WASCRECOVERY; 648 else 649 tp->t_flags &= ~TF_WASCRECOVERY; 650 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 651 tp->t_flags |= TF_PREVVALID; 652 } else 653 tp->t_flags &= ~TF_PREVVALID; 654 TCPSTAT_INC(tcps_rexmttimeo); 655 if ((tp->t_state == TCPS_SYN_SENT) || 656 (tp->t_state == TCPS_SYN_RECEIVED)) 657 rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift]; 658 else 659 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 660 TCPT_RANGESET(tp->t_rxtcur, rexmt, 661 tp->t_rttmin, TCPTV_REXMTMAX); 662 663 /* 664 * We enter the path for PLMTUD if connection is established or, if 665 * connection is FIN_WAIT_1 status, reason for the last is that if 666 * amount of data we send is very small, we could send it in couple of 667 * packets and process straight to FIN. In that case we won't catch 668 * ESTABLISHED state. 669 */ 670 if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) 671 || (tp->t_state == TCPS_FIN_WAIT_1))) { 672 #ifdef INET6 673 int isipv6; 674 #endif 675 676 /* 677 * Idea here is that at each stage of mtu probe (usually, 1448 678 * -> 1188 -> 524) should be given 2 chances to recover before 679 * further clamping down. 'tp->t_rxtshift % 2 == 0' should 680 * take care of that. 681 */ 682 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) == 683 (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) && 684 (tp->t_rxtshift >= 2 && tp->t_rxtshift % 2 == 0)) { 685 /* 686 * Enter Path MTU Black-hole Detection mechanism: 687 * - Disable Path MTU Discovery (IP "DF" bit). 688 * - Reduce MTU to lower value than what we 689 * negotiated with peer. 690 */ 691 /* Record that we may have found a black hole. */ 692 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 693 694 /* Keep track of previous MSS. */ 695 tp->t_pmtud_saved_maxseg = tp->t_maxseg; 696 697 /* 698 * Reduce the MSS to blackhole value or to the default 699 * in an attempt to retransmit. 700 */ 701 #ifdef INET6 702 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; 703 if (isipv6 && 704 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { 705 /* Use the sysctl tuneable blackhole MSS. */ 706 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; 707 V_tcp_pmtud_blackhole_activated++; 708 } else if (isipv6) { 709 /* Use the default MSS. */ 710 tp->t_maxseg = V_tcp_v6mssdflt; 711 /* 712 * Disable Path MTU Discovery when we switch to 713 * minmss. 714 */ 715 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 716 V_tcp_pmtud_blackhole_activated_min_mss++; 717 } 718 #endif 719 #if defined(INET6) && defined(INET) 720 else 721 #endif 722 #ifdef INET 723 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { 724 /* Use the sysctl tuneable blackhole MSS. */ 725 tp->t_maxseg = V_tcp_pmtud_blackhole_mss; 726 V_tcp_pmtud_blackhole_activated++; 727 } else { 728 /* Use the default MSS. */ 729 tp->t_maxseg = V_tcp_mssdflt; 730 /* 731 * Disable Path MTU Discovery when we switch to 732 * minmss. 733 */ 734 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 735 V_tcp_pmtud_blackhole_activated_min_mss++; 736 } 737 #endif 738 /* 739 * Reset the slow-start flight size 740 * as it may depend on the new MSS. 741 */ 742 if (CC_ALGO(tp)->conn_init != NULL) 743 CC_ALGO(tp)->conn_init(tp->ccv); 744 } else { 745 /* 746 * If further retransmissions are still unsuccessful 747 * with a lowered MTU, maybe this isn't a blackhole and 748 * we restore the previous MSS and blackhole detection 749 * flags. 750 * The limit '6' is determined by giving each probe 751 * stage (1448, 1188, 524) 2 chances to recover. 752 */ 753 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 754 (tp->t_rxtshift > 6)) { 755 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 756 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 757 tp->t_maxseg = tp->t_pmtud_saved_maxseg; 758 V_tcp_pmtud_blackhole_failed++; 759 /* 760 * Reset the slow-start flight size as it 761 * may depend on the new MSS. 762 */ 763 if (CC_ALGO(tp)->conn_init != NULL) 764 CC_ALGO(tp)->conn_init(tp->ccv); 765 } 766 } 767 } 768 769 /* 770 * Disable RFC1323 and SACK if we haven't got any response to 771 * our third SYN to work-around some broken terminal servers 772 * (most of which have hopefully been retired) that have bad VJ 773 * header compression code which trashes TCP segments containing 774 * unknown-to-them TCP options. 775 */ 776 if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && 777 (tp->t_rxtshift == 3)) 778 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); 779 /* 780 * If we backed off this far, our srtt estimate is probably bogus. 781 * Clobber it so we'll take the next rtt measurement as our srtt; 782 * move the current srtt into rttvar to keep the current 783 * retransmit times until then. 784 */ 785 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 786 #ifdef INET6 787 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 788 in6_losing(tp->t_inpcb); 789 else 790 #endif 791 in_losing(tp->t_inpcb); 792 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 793 tp->t_srtt = 0; 794 } 795 tp->snd_nxt = tp->snd_una; 796 tp->snd_recover = tp->snd_max; 797 /* 798 * Force a segment to be sent. 799 */ 800 tp->t_flags |= TF_ACKNOW; 801 /* 802 * If timing a segment in this window, stop the timer. 803 */ 804 tp->t_rtttime = 0; 805 806 cc_cong_signal(tp, NULL, CC_RTO); 807 808 (void) tp->t_fb->tfb_tcp_output(tp); 809 810 out: 811 #ifdef TCPDEBUG 812 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 813 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 814 PRU_SLOWTIMO); 815 #endif 816 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 817 if (tp != NULL) 818 INP_WUNLOCK(inp); 819 if (headlocked) 820 INP_INFO_RUNLOCK(&V_tcbinfo); 821 CURVNET_RESTORE(); 822 } 823 824 void 825 tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta) 826 { 827 struct callout *t_callout; 828 timeout_t *f_callout; 829 struct inpcb *inp = tp->t_inpcb; 830 int cpu = inp_to_cpuid(inp); 831 uint32_t f_reset; 832 833 #ifdef TCP_OFFLOAD 834 if (tp->t_flags & TF_TOE) 835 return; 836 #endif 837 838 if (tp->t_timers->tt_flags & TT_STOPPED) 839 return; 840 841 switch (timer_type) { 842 case TT_DELACK: 843 t_callout = &tp->t_timers->tt_delack; 844 f_callout = tcp_timer_delack; 845 f_reset = TT_DELACK_RST; 846 break; 847 case TT_REXMT: 848 t_callout = &tp->t_timers->tt_rexmt; 849 f_callout = tcp_timer_rexmt; 850 f_reset = TT_REXMT_RST; 851 break; 852 case TT_PERSIST: 853 t_callout = &tp->t_timers->tt_persist; 854 f_callout = tcp_timer_persist; 855 f_reset = TT_PERSIST_RST; 856 break; 857 case TT_KEEP: 858 t_callout = &tp->t_timers->tt_keep; 859 f_callout = tcp_timer_keep; 860 f_reset = TT_KEEP_RST; 861 break; 862 case TT_2MSL: 863 t_callout = &tp->t_timers->tt_2msl; 864 f_callout = tcp_timer_2msl; 865 f_reset = TT_2MSL_RST; 866 break; 867 default: 868 if (tp->t_fb->tfb_tcp_timer_activate) { 869 tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta); 870 return; 871 } 872 panic("tp %p bad timer_type %#x", tp, timer_type); 873 } 874 if (delta == 0) { 875 if ((tp->t_timers->tt_flags & timer_type) && 876 (callout_stop(t_callout) > 0) && 877 (tp->t_timers->tt_flags & f_reset)) { 878 tp->t_timers->tt_flags &= ~(timer_type | f_reset); 879 } 880 } else { 881 if ((tp->t_timers->tt_flags & timer_type) == 0) { 882 tp->t_timers->tt_flags |= (timer_type | f_reset); 883 callout_reset_on(t_callout, delta, f_callout, tp, cpu); 884 } else { 885 /* Reset already running callout on the same CPU. */ 886 if (!callout_reset(t_callout, delta, f_callout, tp)) { 887 /* 888 * Callout not cancelled, consider it as not 889 * properly restarted. */ 890 tp->t_timers->tt_flags &= ~f_reset; 891 } 892 } 893 } 894 } 895 896 int 897 tcp_timer_active(struct tcpcb *tp, uint32_t timer_type) 898 { 899 struct callout *t_callout; 900 901 switch (timer_type) { 902 case TT_DELACK: 903 t_callout = &tp->t_timers->tt_delack; 904 break; 905 case TT_REXMT: 906 t_callout = &tp->t_timers->tt_rexmt; 907 break; 908 case TT_PERSIST: 909 t_callout = &tp->t_timers->tt_persist; 910 break; 911 case TT_KEEP: 912 t_callout = &tp->t_timers->tt_keep; 913 break; 914 case TT_2MSL: 915 t_callout = &tp->t_timers->tt_2msl; 916 break; 917 default: 918 if (tp->t_fb->tfb_tcp_timer_active) { 919 return(tp->t_fb->tfb_tcp_timer_active(tp, timer_type)); 920 } 921 panic("tp %p bad timer_type %#x", tp, timer_type); 922 } 923 return callout_active(t_callout); 924 } 925 926 void 927 tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type) 928 { 929 struct callout *t_callout; 930 uint32_t f_reset; 931 932 tp->t_timers->tt_flags |= TT_STOPPED; 933 934 switch (timer_type) { 935 case TT_DELACK: 936 t_callout = &tp->t_timers->tt_delack; 937 f_reset = TT_DELACK_RST; 938 break; 939 case TT_REXMT: 940 t_callout = &tp->t_timers->tt_rexmt; 941 f_reset = TT_REXMT_RST; 942 break; 943 case TT_PERSIST: 944 t_callout = &tp->t_timers->tt_persist; 945 f_reset = TT_PERSIST_RST; 946 break; 947 case TT_KEEP: 948 t_callout = &tp->t_timers->tt_keep; 949 f_reset = TT_KEEP_RST; 950 break; 951 case TT_2MSL: 952 t_callout = &tp->t_timers->tt_2msl; 953 f_reset = TT_2MSL_RST; 954 break; 955 default: 956 if (tp->t_fb->tfb_tcp_timer_stop) { 957 /* 958 * XXXrrs we need to look at this with the 959 * stop case below (flags). 960 */ 961 tp->t_fb->tfb_tcp_timer_stop(tp, timer_type); 962 return; 963 } 964 panic("tp %p bad timer_type %#x", tp, timer_type); 965 } 966 967 if (tp->t_timers->tt_flags & timer_type) { 968 if (callout_async_drain(t_callout, tcp_timer_discard) == 0) { 969 /* 970 * Can't stop the callout, defer tcpcb actual deletion 971 * to the last one. We do this using the async drain 972 * function and incrementing the count in 973 */ 974 tp->t_timers->tt_draincnt++; 975 } 976 } 977 } 978 979 #define ticks_to_msecs(t) (1000*(t) / hz) 980 981 void 982 tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, 983 struct xtcp_timer *xtimer) 984 { 985 sbintime_t now; 986 987 bzero(xtimer, sizeof(*xtimer)); 988 if (timer == NULL) 989 return; 990 now = getsbinuptime(); 991 if (callout_active(&timer->tt_delack)) 992 xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS; 993 if (callout_active(&timer->tt_rexmt)) 994 xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS; 995 if (callout_active(&timer->tt_persist)) 996 xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS; 997 if (callout_active(&timer->tt_keep)) 998 xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS; 999 if (callout_active(&timer->tt_2msl)) 1000 xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS; 1001 xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime); 1002 } 1003