1 /*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 30 */ 31 32 #include <sys/cdefs.h> 33 __FBSDID("$FreeBSD$"); 34 35 #include "opt_inet.h" 36 #include "opt_inet6.h" 37 #include "opt_tcpdebug.h" 38 #include "opt_rss.h" 39 40 #include <sys/param.h> 41 #include <sys/kernel.h> 42 #include <sys/lock.h> 43 #include <sys/mbuf.h> 44 #include <sys/mutex.h> 45 #include <sys/protosw.h> 46 #include <sys/smp.h> 47 #include <sys/socket.h> 48 #include <sys/socketvar.h> 49 #include <sys/sysctl.h> 50 #include <sys/systm.h> 51 52 #include <net/if.h> 53 #include <net/route.h> 54 #include <net/vnet.h> 55 #include <net/netisr.h> 56 57 #include <netinet/cc.h> 58 #include <netinet/in.h> 59 #include <netinet/in_pcb.h> 60 #include <netinet/in_rss.h> 61 #include <netinet/in_systm.h> 62 #ifdef INET6 63 #include <netinet6/in6_pcb.h> 64 #endif 65 #include <netinet/ip_var.h> 66 #include <netinet/tcp_fsm.h> 67 #include <netinet/tcp_timer.h> 68 #include <netinet/tcp_var.h> 69 #ifdef INET6 70 #include <netinet6/tcp6_var.h> 71 #endif 72 #include <netinet/tcpip.h> 73 #ifdef TCPDEBUG 74 #include <netinet/tcp_debug.h> 75 #endif 76 77 int tcp_keepinit; 78 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, 79 &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection"); 80 81 int tcp_keepidle; 82 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW, 83 &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin"); 84 85 int tcp_keepintvl; 86 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW, 87 &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes"); 88 89 int tcp_delacktime; 90 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW, 91 &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", 92 "Time before a delayed ACK is sent"); 93 94 int tcp_msl; 95 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, 96 &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); 97 98 int tcp_rexmit_min; 99 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW, 100 &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", 101 "Minimum Retransmission Timeout"); 102 103 int tcp_rexmit_slop; 104 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW, 105 &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", 106 "Retransmission Timer Slop"); 107 108 static int always_keepalive = 1; 109 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, 110 &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); 111 112 int tcp_fast_finwait2_recycle = 0; 113 SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, 114 &tcp_fast_finwait2_recycle, 0, 115 "Recycle closed FIN_WAIT_2 connections faster"); 116 117 int tcp_finwait2_timeout; 118 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW, 119 &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout"); 120 121 int tcp_keepcnt = TCPTV_KEEPCNT; 122 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0, 123 "Number of keepalive probes to send"); 124 125 /* max idle probes */ 126 int tcp_maxpersistidle; 127 128 static int tcp_rexmit_drop_options = 0; 129 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW, 130 &tcp_rexmit_drop_options, 0, 131 "Drop TCP options from 3rd and later retransmitted SYN"); 132 133 static VNET_DEFINE(int, tcp_pmtud_blackhole_detect); 134 #define V_tcp_pmtud_blackhole_detect VNET(tcp_pmtud_blackhole_detect) 135 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, 136 CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_VNET, 137 &VNET_NAME(tcp_pmtud_blackhole_detect), 0, 138 "Path MTU Discovery Black Hole Detection Enabled"); 139 140 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated); 141 #define V_tcp_pmtud_blackhole_activated \ 142 VNET(tcp_pmtud_blackhole_activated) 143 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated, 144 CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_VNET, 145 &VNET_NAME(tcp_pmtud_blackhole_activated), 0, 146 "Path MTU Discovery Black Hole Detection, Activation Count"); 147 148 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss); 149 #define V_tcp_pmtud_blackhole_activated_min_mss \ 150 VNET(tcp_pmtud_blackhole_activated_min_mss) 151 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss, 152 CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_VNET, 153 &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0, 154 "Path MTU Discovery Black Hole Detection, Activation Count at min MSS"); 155 156 static VNET_DEFINE(int, tcp_pmtud_blackhole_failed); 157 #define V_tcp_pmtud_blackhole_failed VNET(tcp_pmtud_blackhole_failed) 158 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed, 159 CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_VNET, 160 &VNET_NAME(tcp_pmtud_blackhole_failed), 0, 161 "Path MTU Discovery Black Hole Detection, Failure Count"); 162 163 #ifdef INET 164 static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200; 165 #define V_tcp_pmtud_blackhole_mss VNET(tcp_pmtud_blackhole_mss) 166 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, 167 CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_VNET, 168 &VNET_NAME(tcp_pmtud_blackhole_mss), 0, 169 "Path MTU Discovery Black Hole Detection lowered MSS"); 170 #endif 171 172 #ifdef INET6 173 static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220; 174 #define V_tcp_v6pmtud_blackhole_mss VNET(tcp_v6pmtud_blackhole_mss) 175 SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss, 176 CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_VNET, 177 &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0, 178 "Path MTU Discovery IPv6 Black Hole Detection lowered MSS"); 179 #endif 180 181 #ifdef RSS 182 static int per_cpu_timers = 1; 183 #else 184 static int per_cpu_timers = 0; 185 #endif 186 SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW, 187 &per_cpu_timers , 0, "run tcp timers on all cpus"); 188 189 #if 0 190 #define INP_CPU(inp) (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \ 191 ((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0) 192 #endif 193 194 /* 195 * Map the given inp to a CPU id. 196 * 197 * This queries RSS if it's compiled in, else it defaults to the current 198 * CPU ID. 199 */ 200 static inline int 201 inp_to_cpuid(struct inpcb *inp) 202 { 203 u_int cpuid; 204 205 #ifdef RSS 206 if (per_cpu_timers) { 207 cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); 208 if (cpuid == NETISR_CPUID_NONE) 209 return (curcpu); /* XXX */ 210 else 211 return (cpuid); 212 } 213 #else 214 /* Legacy, pre-RSS behaviour */ 215 if (per_cpu_timers) { 216 /* 217 * We don't have a flowid -> cpuid mapping, so cheat and 218 * just map unknown cpuids to curcpu. Not the best, but 219 * apparently better than defaulting to swi 0. 220 */ 221 cpuid = inp->inp_flowid % (mp_maxid + 1); 222 if (! CPU_ABSENT(cpuid)) 223 return (cpuid); 224 return (curcpu); 225 } 226 #endif 227 /* Default for RSS and non-RSS - cpuid 0 */ 228 else { 229 return (0); 230 } 231 } 232 233 /* 234 * Tcp protocol timeout routine called every 500 ms. 235 * Updates timestamps used for TCP 236 * causes finite state machine actions if timers expire. 237 */ 238 void 239 tcp_slowtimo(void) 240 { 241 VNET_ITERATOR_DECL(vnet_iter); 242 243 VNET_LIST_RLOCK_NOSLEEP(); 244 VNET_FOREACH(vnet_iter) { 245 CURVNET_SET(vnet_iter); 246 tcp_tw_2msl_scan(); 247 CURVNET_RESTORE(); 248 } 249 VNET_LIST_RUNLOCK_NOSLEEP(); 250 } 251 252 int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = 253 { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; 254 255 int tcp_backoff[TCP_MAXRXTSHIFT + 1] = 256 { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 }; 257 258 static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */ 259 260 static int tcp_timer_race; 261 SYSCTL_INT(_net_inet_tcp, OID_AUTO, timer_race, CTLFLAG_RD, &tcp_timer_race, 262 0, "Count of t_inpcb races on tcp_discardcb"); 263 264 /* 265 * TCP timer processing. 266 */ 267 268 void 269 tcp_timer_delack(void *xtp) 270 { 271 struct tcpcb *tp = xtp; 272 struct inpcb *inp; 273 CURVNET_SET(tp->t_vnet); 274 275 inp = tp->t_inpcb; 276 /* 277 * XXXRW: While this assert is in fact correct, bugs in the tcpcb 278 * tear-down mean we need it as a work-around for races between 279 * timers and tcp_discardcb(). 280 * 281 * KASSERT(inp != NULL, ("tcp_timer_delack: inp == NULL")); 282 */ 283 if (inp == NULL) { 284 tcp_timer_race++; 285 CURVNET_RESTORE(); 286 return; 287 } 288 INP_WLOCK(inp); 289 if (callout_pending(&tp->t_timers->tt_delack) || 290 !callout_active(&tp->t_timers->tt_delack)) { 291 INP_WUNLOCK(inp); 292 CURVNET_RESTORE(); 293 return; 294 } 295 callout_deactivate(&tp->t_timers->tt_delack); 296 if ((inp->inp_flags & INP_DROPPED) != 0) { 297 INP_WUNLOCK(inp); 298 CURVNET_RESTORE(); 299 return; 300 } 301 302 tp->t_flags |= TF_ACKNOW; 303 TCPSTAT_INC(tcps_delack); 304 (void) tcp_output(tp); 305 INP_WUNLOCK(inp); 306 CURVNET_RESTORE(); 307 } 308 309 void 310 tcp_timer_2msl(void *xtp) 311 { 312 struct tcpcb *tp = xtp; 313 struct inpcb *inp; 314 CURVNET_SET(tp->t_vnet); 315 #ifdef TCPDEBUG 316 int ostate; 317 318 ostate = tp->t_state; 319 #endif 320 /* 321 * XXXRW: Does this actually happen? 322 */ 323 INP_INFO_WLOCK(&V_tcbinfo); 324 inp = tp->t_inpcb; 325 /* 326 * XXXRW: While this assert is in fact correct, bugs in the tcpcb 327 * tear-down mean we need it as a work-around for races between 328 * timers and tcp_discardcb(). 329 * 330 * KASSERT(inp != NULL, ("tcp_timer_2msl: inp == NULL")); 331 */ 332 if (inp == NULL) { 333 tcp_timer_race++; 334 INP_INFO_WUNLOCK(&V_tcbinfo); 335 CURVNET_RESTORE(); 336 return; 337 } 338 INP_WLOCK(inp); 339 tcp_free_sackholes(tp); 340 if (callout_pending(&tp->t_timers->tt_2msl) || 341 !callout_active(&tp->t_timers->tt_2msl)) { 342 INP_WUNLOCK(tp->t_inpcb); 343 INP_INFO_WUNLOCK(&V_tcbinfo); 344 CURVNET_RESTORE(); 345 return; 346 } 347 callout_deactivate(&tp->t_timers->tt_2msl); 348 if ((inp->inp_flags & INP_DROPPED) != 0) { 349 INP_WUNLOCK(inp); 350 INP_INFO_WUNLOCK(&V_tcbinfo); 351 CURVNET_RESTORE(); 352 return; 353 } 354 /* 355 * 2 MSL timeout in shutdown went off. If we're closed but 356 * still waiting for peer to close and connection has been idle 357 * too long, or if 2MSL time is up from TIME_WAIT, delete connection 358 * control block. Otherwise, check again in a bit. 359 * 360 * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed, 361 * there's no point in hanging onto FIN_WAIT_2 socket. Just close it. 362 * Ignore fact that there were recent incoming segments. 363 */ 364 if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 && 365 tp->t_inpcb && tp->t_inpcb->inp_socket && 366 (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { 367 TCPSTAT_INC(tcps_finwait2_drops); 368 tp = tcp_close(tp); 369 } else { 370 if (tp->t_state != TCPS_TIME_WAIT && 371 ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) 372 callout_reset_on(&tp->t_timers->tt_2msl, 373 TP_KEEPINTVL(tp), tcp_timer_2msl, tp, 374 inp_to_cpuid(inp)); 375 else 376 tp = tcp_close(tp); 377 } 378 379 #ifdef TCPDEBUG 380 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 381 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 382 PRU_SLOWTIMO); 383 #endif 384 if (tp != NULL) 385 INP_WUNLOCK(inp); 386 INP_INFO_WUNLOCK(&V_tcbinfo); 387 CURVNET_RESTORE(); 388 } 389 390 void 391 tcp_timer_keep(void *xtp) 392 { 393 struct tcpcb *tp = xtp; 394 struct tcptemp *t_template; 395 struct inpcb *inp; 396 CURVNET_SET(tp->t_vnet); 397 #ifdef TCPDEBUG 398 int ostate; 399 400 ostate = tp->t_state; 401 #endif 402 INP_INFO_WLOCK(&V_tcbinfo); 403 inp = tp->t_inpcb; 404 /* 405 * XXXRW: While this assert is in fact correct, bugs in the tcpcb 406 * tear-down mean we need it as a work-around for races between 407 * timers and tcp_discardcb(). 408 * 409 * KASSERT(inp != NULL, ("tcp_timer_keep: inp == NULL")); 410 */ 411 if (inp == NULL) { 412 tcp_timer_race++; 413 INP_INFO_WUNLOCK(&V_tcbinfo); 414 CURVNET_RESTORE(); 415 return; 416 } 417 INP_WLOCK(inp); 418 if (callout_pending(&tp->t_timers->tt_keep) || 419 !callout_active(&tp->t_timers->tt_keep)) { 420 INP_WUNLOCK(inp); 421 INP_INFO_WUNLOCK(&V_tcbinfo); 422 CURVNET_RESTORE(); 423 return; 424 } 425 callout_deactivate(&tp->t_timers->tt_keep); 426 if ((inp->inp_flags & INP_DROPPED) != 0) { 427 INP_WUNLOCK(inp); 428 INP_INFO_WUNLOCK(&V_tcbinfo); 429 CURVNET_RESTORE(); 430 return; 431 } 432 /* 433 * Keep-alive timer went off; send something 434 * or drop connection if idle for too long. 435 */ 436 TCPSTAT_INC(tcps_keeptimeo); 437 if (tp->t_state < TCPS_ESTABLISHED) 438 goto dropit; 439 if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 440 tp->t_state <= TCPS_CLOSING) { 441 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 442 goto dropit; 443 /* 444 * Send a packet designed to force a response 445 * if the peer is up and reachable: 446 * either an ACK if the connection is still alive, 447 * or an RST if the peer has closed the connection 448 * due to timeout or reboot. 449 * Using sequence number tp->snd_una-1 450 * causes the transmitted zero-length segment 451 * to lie outside the receive window; 452 * by the protocol spec, this requires the 453 * correspondent TCP to respond. 454 */ 455 TCPSTAT_INC(tcps_keepprobe); 456 t_template = tcpip_maketemplate(inp); 457 if (t_template) { 458 tcp_respond(tp, t_template->tt_ipgen, 459 &t_template->tt_t, (struct mbuf *)NULL, 460 tp->rcv_nxt, tp->snd_una - 1, 0); 461 free(t_template, M_TEMP); 462 } 463 callout_reset_on(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp), 464 tcp_timer_keep, tp, inp_to_cpuid(inp)); 465 } else 466 callout_reset_on(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp), 467 tcp_timer_keep, tp, inp_to_cpuid(inp)); 468 469 #ifdef TCPDEBUG 470 if (inp->inp_socket->so_options & SO_DEBUG) 471 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 472 PRU_SLOWTIMO); 473 #endif 474 INP_WUNLOCK(inp); 475 INP_INFO_WUNLOCK(&V_tcbinfo); 476 CURVNET_RESTORE(); 477 return; 478 479 dropit: 480 TCPSTAT_INC(tcps_keepdrops); 481 tp = tcp_drop(tp, ETIMEDOUT); 482 483 #ifdef TCPDEBUG 484 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 485 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 486 PRU_SLOWTIMO); 487 #endif 488 if (tp != NULL) 489 INP_WUNLOCK(tp->t_inpcb); 490 INP_INFO_WUNLOCK(&V_tcbinfo); 491 CURVNET_RESTORE(); 492 } 493 494 void 495 tcp_timer_persist(void *xtp) 496 { 497 struct tcpcb *tp = xtp; 498 struct inpcb *inp; 499 CURVNET_SET(tp->t_vnet); 500 #ifdef TCPDEBUG 501 int ostate; 502 503 ostate = tp->t_state; 504 #endif 505 INP_INFO_WLOCK(&V_tcbinfo); 506 inp = tp->t_inpcb; 507 /* 508 * XXXRW: While this assert is in fact correct, bugs in the tcpcb 509 * tear-down mean we need it as a work-around for races between 510 * timers and tcp_discardcb(). 511 * 512 * KASSERT(inp != NULL, ("tcp_timer_persist: inp == NULL")); 513 */ 514 if (inp == NULL) { 515 tcp_timer_race++; 516 INP_INFO_WUNLOCK(&V_tcbinfo); 517 CURVNET_RESTORE(); 518 return; 519 } 520 INP_WLOCK(inp); 521 if (callout_pending(&tp->t_timers->tt_persist) || 522 !callout_active(&tp->t_timers->tt_persist)) { 523 INP_WUNLOCK(inp); 524 INP_INFO_WUNLOCK(&V_tcbinfo); 525 CURVNET_RESTORE(); 526 return; 527 } 528 callout_deactivate(&tp->t_timers->tt_persist); 529 if ((inp->inp_flags & INP_DROPPED) != 0) { 530 INP_WUNLOCK(inp); 531 INP_INFO_WUNLOCK(&V_tcbinfo); 532 CURVNET_RESTORE(); 533 return; 534 } 535 /* 536 * Persistance timer into zero window. 537 * Force a byte to be output, if possible. 538 */ 539 TCPSTAT_INC(tcps_persisttimeo); 540 /* 541 * Hack: if the peer is dead/unreachable, we do not 542 * time out if the window is closed. After a full 543 * backoff, drop the connection if the idle time 544 * (no responses to probes) reaches the maximum 545 * backoff that we would use if retransmitting. 546 */ 547 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 548 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 549 ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 550 TCPSTAT_INC(tcps_persistdrop); 551 tp = tcp_drop(tp, ETIMEDOUT); 552 goto out; 553 } 554 /* 555 * If the user has closed the socket then drop a persisting 556 * connection after a much reduced timeout. 557 */ 558 if (tp->t_state > TCPS_CLOSE_WAIT && 559 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 560 TCPSTAT_INC(tcps_persistdrop); 561 tp = tcp_drop(tp, ETIMEDOUT); 562 goto out; 563 } 564 tcp_setpersist(tp); 565 tp->t_flags |= TF_FORCEDATA; 566 (void) tcp_output(tp); 567 tp->t_flags &= ~TF_FORCEDATA; 568 569 out: 570 #ifdef TCPDEBUG 571 if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG) 572 tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO); 573 #endif 574 if (tp != NULL) 575 INP_WUNLOCK(inp); 576 INP_INFO_WUNLOCK(&V_tcbinfo); 577 CURVNET_RESTORE(); 578 } 579 580 void 581 tcp_timer_rexmt(void * xtp) 582 { 583 struct tcpcb *tp = xtp; 584 CURVNET_SET(tp->t_vnet); 585 int rexmt; 586 int headlocked; 587 struct inpcb *inp; 588 #ifdef TCPDEBUG 589 int ostate; 590 591 ostate = tp->t_state; 592 #endif 593 594 INP_INFO_RLOCK(&V_tcbinfo); 595 inp = tp->t_inpcb; 596 /* 597 * XXXRW: While this assert is in fact correct, bugs in the tcpcb 598 * tear-down mean we need it as a work-around for races between 599 * timers and tcp_discardcb(). 600 * 601 * KASSERT(inp != NULL, ("tcp_timer_rexmt: inp == NULL")); 602 */ 603 if (inp == NULL) { 604 tcp_timer_race++; 605 INP_INFO_RUNLOCK(&V_tcbinfo); 606 CURVNET_RESTORE(); 607 return; 608 } 609 INP_WLOCK(inp); 610 if (callout_pending(&tp->t_timers->tt_rexmt) || 611 !callout_active(&tp->t_timers->tt_rexmt)) { 612 INP_WUNLOCK(inp); 613 INP_INFO_RUNLOCK(&V_tcbinfo); 614 CURVNET_RESTORE(); 615 return; 616 } 617 callout_deactivate(&tp->t_timers->tt_rexmt); 618 if ((inp->inp_flags & INP_DROPPED) != 0) { 619 INP_WUNLOCK(inp); 620 INP_INFO_RUNLOCK(&V_tcbinfo); 621 CURVNET_RESTORE(); 622 return; 623 } 624 tcp_free_sackholes(tp); 625 /* 626 * Retransmission timer went off. Message has not 627 * been acked within retransmit interval. Back off 628 * to a longer retransmit interval and retransmit one segment. 629 */ 630 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { 631 tp->t_rxtshift = TCP_MAXRXTSHIFT; 632 TCPSTAT_INC(tcps_timeoutdrop); 633 in_pcbref(inp); 634 INP_INFO_RUNLOCK(&V_tcbinfo); 635 INP_WUNLOCK(inp); 636 INP_INFO_WLOCK(&V_tcbinfo); 637 INP_WLOCK(inp); 638 if (in_pcbrele_wlocked(inp)) { 639 INP_INFO_WUNLOCK(&V_tcbinfo); 640 CURVNET_RESTORE(); 641 return; 642 } 643 if (inp->inp_flags & INP_DROPPED) { 644 INP_WUNLOCK(inp); 645 INP_INFO_WUNLOCK(&V_tcbinfo); 646 CURVNET_RESTORE(); 647 return; 648 } 649 650 tp = tcp_drop(tp, tp->t_softerror ? 651 tp->t_softerror : ETIMEDOUT); 652 headlocked = 1; 653 goto out; 654 } 655 INP_INFO_RUNLOCK(&V_tcbinfo); 656 headlocked = 0; 657 if (tp->t_state == TCPS_SYN_SENT) { 658 /* 659 * If the SYN was retransmitted, indicate CWND to be 660 * limited to 1 segment in cc_conn_init(). 661 */ 662 tp->snd_cwnd = 1; 663 } else if (tp->t_rxtshift == 1) { 664 /* 665 * first retransmit; record ssthresh and cwnd so they can 666 * be recovered if this turns out to be a "bad" retransmit. 667 * A retransmit is considered "bad" if an ACK for this 668 * segment is received within RTT/2 interval; the assumption 669 * here is that the ACK was already in flight. See 670 * "On Estimating End-to-End Network Path Properties" by 671 * Allman and Paxson for more details. 672 */ 673 tp->snd_cwnd_prev = tp->snd_cwnd; 674 tp->snd_ssthresh_prev = tp->snd_ssthresh; 675 tp->snd_recover_prev = tp->snd_recover; 676 if (IN_FASTRECOVERY(tp->t_flags)) 677 tp->t_flags |= TF_WASFRECOVERY; 678 else 679 tp->t_flags &= ~TF_WASFRECOVERY; 680 if (IN_CONGRECOVERY(tp->t_flags)) 681 tp->t_flags |= TF_WASCRECOVERY; 682 else 683 tp->t_flags &= ~TF_WASCRECOVERY; 684 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 685 tp->t_flags |= TF_PREVVALID; 686 } else 687 tp->t_flags &= ~TF_PREVVALID; 688 TCPSTAT_INC(tcps_rexmttimeo); 689 if (tp->t_state == TCPS_SYN_SENT) 690 rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift]; 691 else 692 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 693 TCPT_RANGESET(tp->t_rxtcur, rexmt, 694 tp->t_rttmin, TCPTV_REXMTMAX); 695 696 if (V_tcp_pmtud_blackhole_detect && (tp->t_state == TCPS_ESTABLISHED)) { 697 int optlen; 698 #ifdef INET6 699 int isipv6; 700 #endif 701 702 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) == 703 (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) && 704 (tp->t_rxtshift <= 2)) { 705 /* 706 * Enter Path MTU Black-hole Detection mechanism: 707 * - Disable Path MTU Discovery (IP "DF" bit). 708 * - Reduce MTU to lower value than what we 709 * negotiated with peer. 710 */ 711 /* Record that we may have found a black hole. */ 712 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 713 714 /* Keep track of previous MSS. */ 715 optlen = tp->t_maxopd - tp->t_maxseg; 716 tp->t_pmtud_saved_maxopd = tp->t_maxopd; 717 718 /* 719 * Reduce the MSS to blackhole value or to the default 720 * in an attempt to retransmit. 721 */ 722 #ifdef INET6 723 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; 724 if (isipv6 && 725 tp->t_maxopd > V_tcp_v6pmtud_blackhole_mss) { 726 /* Use the sysctl tuneable blackhole MSS. */ 727 tp->t_maxopd = V_tcp_v6pmtud_blackhole_mss; 728 V_tcp_pmtud_blackhole_activated++; 729 } else if (isipv6) { 730 /* Use the default MSS. */ 731 tp->t_maxopd = V_tcp_v6mssdflt; 732 /* 733 * Disable Path MTU Discovery when we switch to 734 * minmss. 735 */ 736 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 737 V_tcp_pmtud_blackhole_activated_min_mss++; 738 } 739 #endif 740 #if defined(INET6) && defined(INET) 741 else 742 #endif 743 #ifdef INET 744 if (tp->t_maxopd > V_tcp_pmtud_blackhole_mss) { 745 /* Use the sysctl tuneable blackhole MSS. */ 746 tp->t_maxopd = V_tcp_pmtud_blackhole_mss; 747 V_tcp_pmtud_blackhole_activated++; 748 } else { 749 /* Use the default MSS. */ 750 tp->t_maxopd = V_tcp_mssdflt; 751 /* 752 * Disable Path MTU Discovery when we switch to 753 * minmss. 754 */ 755 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 756 V_tcp_pmtud_blackhole_activated_min_mss++; 757 } 758 #endif 759 tp->t_maxseg = tp->t_maxopd - optlen; 760 /* 761 * Reset the slow-start flight size 762 * as it may depend on the new MSS. 763 */ 764 if (CC_ALGO(tp)->conn_init != NULL) 765 CC_ALGO(tp)->conn_init(tp->ccv); 766 } else { 767 /* 768 * If further retransmissions are still unsuccessful 769 * with a lowered MTU, maybe this isn't a blackhole and 770 * we restore the previous MSS and blackhole detection 771 * flags. 772 */ 773 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 774 (tp->t_rxtshift > 4)) { 775 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 776 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 777 optlen = tp->t_maxopd - tp->t_maxseg; 778 tp->t_maxopd = tp->t_pmtud_saved_maxopd; 779 tp->t_maxseg = tp->t_maxopd - optlen; 780 V_tcp_pmtud_blackhole_failed++; 781 /* 782 * Reset the slow-start flight size as it 783 * may depend on the new MSS. 784 */ 785 if (CC_ALGO(tp)->conn_init != NULL) 786 CC_ALGO(tp)->conn_init(tp->ccv); 787 } 788 } 789 } 790 791 /* 792 * Disable RFC1323 and SACK if we haven't got any response to 793 * our third SYN to work-around some broken terminal servers 794 * (most of which have hopefully been retired) that have bad VJ 795 * header compression code which trashes TCP segments containing 796 * unknown-to-them TCP options. 797 */ 798 if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && 799 (tp->t_rxtshift == 3)) 800 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); 801 /* 802 * If we backed off this far, our srtt estimate is probably bogus. 803 * Clobber it so we'll take the next rtt measurement as our srtt; 804 * move the current srtt into rttvar to keep the current 805 * retransmit times until then. 806 */ 807 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 808 #ifdef INET6 809 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 810 in6_losing(tp->t_inpcb); 811 #endif 812 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 813 tp->t_srtt = 0; 814 } 815 tp->snd_nxt = tp->snd_una; 816 tp->snd_recover = tp->snd_max; 817 /* 818 * Force a segment to be sent. 819 */ 820 tp->t_flags |= TF_ACKNOW; 821 /* 822 * If timing a segment in this window, stop the timer. 823 */ 824 tp->t_rtttime = 0; 825 826 cc_cong_signal(tp, NULL, CC_RTO); 827 828 (void) tcp_output(tp); 829 830 out: 831 #ifdef TCPDEBUG 832 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 833 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 834 PRU_SLOWTIMO); 835 #endif 836 if (tp != NULL) 837 INP_WUNLOCK(inp); 838 if (headlocked) 839 INP_INFO_WUNLOCK(&V_tcbinfo); 840 CURVNET_RESTORE(); 841 } 842 843 void 844 tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta) 845 { 846 struct callout *t_callout; 847 void *f_callout; 848 struct inpcb *inp = tp->t_inpcb; 849 int cpu = inp_to_cpuid(inp); 850 851 #ifdef TCP_OFFLOAD 852 if (tp->t_flags & TF_TOE) 853 return; 854 #endif 855 856 switch (timer_type) { 857 case TT_DELACK: 858 t_callout = &tp->t_timers->tt_delack; 859 f_callout = tcp_timer_delack; 860 break; 861 case TT_REXMT: 862 t_callout = &tp->t_timers->tt_rexmt; 863 f_callout = tcp_timer_rexmt; 864 break; 865 case TT_PERSIST: 866 t_callout = &tp->t_timers->tt_persist; 867 f_callout = tcp_timer_persist; 868 break; 869 case TT_KEEP: 870 t_callout = &tp->t_timers->tt_keep; 871 f_callout = tcp_timer_keep; 872 break; 873 case TT_2MSL: 874 t_callout = &tp->t_timers->tt_2msl; 875 f_callout = tcp_timer_2msl; 876 break; 877 default: 878 panic("bad timer_type"); 879 } 880 if (delta == 0) { 881 callout_stop(t_callout); 882 } else { 883 callout_reset_on(t_callout, delta, f_callout, tp, cpu); 884 } 885 } 886 887 int 888 tcp_timer_active(struct tcpcb *tp, int timer_type) 889 { 890 struct callout *t_callout; 891 892 switch (timer_type) { 893 case TT_DELACK: 894 t_callout = &tp->t_timers->tt_delack; 895 break; 896 case TT_REXMT: 897 t_callout = &tp->t_timers->tt_rexmt; 898 break; 899 case TT_PERSIST: 900 t_callout = &tp->t_timers->tt_persist; 901 break; 902 case TT_KEEP: 903 t_callout = &tp->t_timers->tt_keep; 904 break; 905 case TT_2MSL: 906 t_callout = &tp->t_timers->tt_2msl; 907 break; 908 default: 909 panic("bad timer_type"); 910 } 911 return callout_active(t_callout); 912 } 913 914 #define ticks_to_msecs(t) (1000*(t) / hz) 915 916 void 917 tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, 918 struct xtcp_timer *xtimer) 919 { 920 sbintime_t now; 921 922 bzero(xtimer, sizeof(*xtimer)); 923 if (timer == NULL) 924 return; 925 now = getsbinuptime(); 926 if (callout_active(&timer->tt_delack)) 927 xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS; 928 if (callout_active(&timer->tt_rexmt)) 929 xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS; 930 if (callout_active(&timer->tt_persist)) 931 xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS; 932 if (callout_active(&timer->tt_keep)) 933 xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS; 934 if (callout_active(&timer->tt_2msl)) 935 xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS; 936 xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime); 937 } 938