1 /*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 30 */ 31 32 #include <sys/cdefs.h> 33 __FBSDID("$FreeBSD$"); 34 35 #include "opt_inet.h" 36 #include "opt_inet6.h" 37 #include "opt_tcpdebug.h" 38 #include "opt_rss.h" 39 40 #include <sys/param.h> 41 #include <sys/kernel.h> 42 #include <sys/lock.h> 43 #include <sys/mbuf.h> 44 #include <sys/mutex.h> 45 #include <sys/protosw.h> 46 #include <sys/smp.h> 47 #include <sys/socket.h> 48 #include <sys/socketvar.h> 49 #include <sys/sysctl.h> 50 #include <sys/systm.h> 51 52 #include <net/if.h> 53 #include <net/route.h> 54 #include <net/rss_config.h> 55 #include <net/vnet.h> 56 #include <net/netisr.h> 57 58 #include <netinet/cc.h> 59 #include <netinet/in.h> 60 #include <netinet/in_pcb.h> 61 #include <netinet/in_rss.h> 62 #include <netinet/in_systm.h> 63 #ifdef INET6 64 #include <netinet6/in6_pcb.h> 65 #endif 66 #include <netinet/ip_var.h> 67 #include <netinet/tcp_fsm.h> 68 #include <netinet/tcp_timer.h> 69 #include <netinet/tcp_var.h> 70 #ifdef INET6 71 #include <netinet6/tcp6_var.h> 72 #endif 73 #include <netinet/tcpip.h> 74 #ifdef TCPDEBUG 75 #include <netinet/tcp_debug.h> 76 #endif 77 78 int tcp_keepinit; 79 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, 80 &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection"); 81 82 int tcp_keepidle; 83 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW, 84 &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin"); 85 86 int tcp_keepintvl; 87 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW, 88 &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes"); 89 90 int tcp_delacktime; 91 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW, 92 &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", 93 "Time before a delayed ACK is sent"); 94 95 int tcp_msl; 96 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, 97 &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); 98 99 int tcp_rexmit_min; 100 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW, 101 &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", 102 "Minimum Retransmission Timeout"); 103 104 int tcp_rexmit_slop; 105 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW, 106 &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", 107 "Retransmission Timer Slop"); 108 109 static int always_keepalive = 1; 110 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, 111 &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); 112 113 int tcp_fast_finwait2_recycle = 0; 114 SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, 115 &tcp_fast_finwait2_recycle, 0, 116 "Recycle closed FIN_WAIT_2 connections faster"); 117 118 int tcp_finwait2_timeout; 119 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW, 120 &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout"); 121 122 int tcp_keepcnt = TCPTV_KEEPCNT; 123 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0, 124 "Number of keepalive probes to send"); 125 126 /* max idle probes */ 127 int tcp_maxpersistidle; 128 129 static int tcp_rexmit_drop_options = 0; 130 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW, 131 &tcp_rexmit_drop_options, 0, 132 "Drop TCP options from 3rd and later retransmitted SYN"); 133 134 static VNET_DEFINE(int, tcp_pmtud_blackhole_detect); 135 #define V_tcp_pmtud_blackhole_detect VNET(tcp_pmtud_blackhole_detect) 136 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, 137 CTLFLAG_RW|CTLFLAG_VNET, 138 &VNET_NAME(tcp_pmtud_blackhole_detect), 0, 139 "Path MTU Discovery Black Hole Detection Enabled"); 140 141 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated); 142 #define V_tcp_pmtud_blackhole_activated \ 143 VNET(tcp_pmtud_blackhole_activated) 144 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated, 145 CTLFLAG_RD|CTLFLAG_VNET, 146 &VNET_NAME(tcp_pmtud_blackhole_activated), 0, 147 "Path MTU Discovery Black Hole Detection, Activation Count"); 148 149 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss); 150 #define V_tcp_pmtud_blackhole_activated_min_mss \ 151 VNET(tcp_pmtud_blackhole_activated_min_mss) 152 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss, 153 CTLFLAG_RD|CTLFLAG_VNET, 154 &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0, 155 "Path MTU Discovery Black Hole Detection, Activation Count at min MSS"); 156 157 static VNET_DEFINE(int, tcp_pmtud_blackhole_failed); 158 #define V_tcp_pmtud_blackhole_failed VNET(tcp_pmtud_blackhole_failed) 159 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed, 160 CTLFLAG_RD|CTLFLAG_VNET, 161 &VNET_NAME(tcp_pmtud_blackhole_failed), 0, 162 "Path MTU Discovery Black Hole Detection, Failure Count"); 163 164 #ifdef INET 165 static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200; 166 #define V_tcp_pmtud_blackhole_mss VNET(tcp_pmtud_blackhole_mss) 167 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, 168 CTLFLAG_RW|CTLFLAG_VNET, 169 &VNET_NAME(tcp_pmtud_blackhole_mss), 0, 170 "Path MTU Discovery Black Hole Detection lowered MSS"); 171 #endif 172 173 #ifdef INET6 174 static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220; 175 #define V_tcp_v6pmtud_blackhole_mss VNET(tcp_v6pmtud_blackhole_mss) 176 SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss, 177 CTLFLAG_RW|CTLFLAG_VNET, 178 &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0, 179 "Path MTU Discovery IPv6 Black Hole Detection lowered MSS"); 180 #endif 181 182 #ifdef RSS 183 static int per_cpu_timers = 1; 184 #else 185 static int per_cpu_timers = 0; 186 #endif 187 SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW, 188 &per_cpu_timers , 0, "run tcp timers on all cpus"); 189 190 #if 0 191 #define INP_CPU(inp) (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \ 192 ((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0) 193 #endif 194 195 /* 196 * Map the given inp to a CPU id. 197 * 198 * This queries RSS if it's compiled in, else it defaults to the current 199 * CPU ID. 200 */ 201 static inline int 202 inp_to_cpuid(struct inpcb *inp) 203 { 204 u_int cpuid; 205 206 #ifdef RSS 207 if (per_cpu_timers) { 208 cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); 209 if (cpuid == NETISR_CPUID_NONE) 210 return (curcpu); /* XXX */ 211 else 212 return (cpuid); 213 } 214 #else 215 /* Legacy, pre-RSS behaviour */ 216 if (per_cpu_timers) { 217 /* 218 * We don't have a flowid -> cpuid mapping, so cheat and 219 * just map unknown cpuids to curcpu. Not the best, but 220 * apparently better than defaulting to swi 0. 221 */ 222 cpuid = inp->inp_flowid % (mp_maxid + 1); 223 if (! CPU_ABSENT(cpuid)) 224 return (cpuid); 225 return (curcpu); 226 } 227 #endif 228 /* Default for RSS and non-RSS - cpuid 0 */ 229 else { 230 return (0); 231 } 232 } 233 234 /* 235 * Tcp protocol timeout routine called every 500 ms. 236 * Updates timestamps used for TCP 237 * causes finite state machine actions if timers expire. 238 */ 239 void 240 tcp_slowtimo(void) 241 { 242 VNET_ITERATOR_DECL(vnet_iter); 243 244 VNET_LIST_RLOCK_NOSLEEP(); 245 VNET_FOREACH(vnet_iter) { 246 CURVNET_SET(vnet_iter); 247 (void) tcp_tw_2msl_scan(0); 248 CURVNET_RESTORE(); 249 } 250 VNET_LIST_RUNLOCK_NOSLEEP(); 251 } 252 253 int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = 254 { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; 255 256 int tcp_backoff[TCP_MAXRXTSHIFT + 1] = 257 { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 }; 258 259 static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */ 260 261 static int tcp_timer_race; 262 SYSCTL_INT(_net_inet_tcp, OID_AUTO, timer_race, CTLFLAG_RD, &tcp_timer_race, 263 0, "Count of t_inpcb races on tcp_discardcb"); 264 265 /* 266 * TCP timer processing. 267 */ 268 269 void 270 tcp_timer_delack(void *xtp) 271 { 272 struct tcpcb *tp = xtp; 273 struct inpcb *inp; 274 CURVNET_SET(tp->t_vnet); 275 276 inp = tp->t_inpcb; 277 /* 278 * XXXRW: While this assert is in fact correct, bugs in the tcpcb 279 * tear-down mean we need it as a work-around for races between 280 * timers and tcp_discardcb(). 281 * 282 * KASSERT(inp != NULL, ("tcp_timer_delack: inp == NULL")); 283 */ 284 if (inp == NULL) { 285 tcp_timer_race++; 286 CURVNET_RESTORE(); 287 return; 288 } 289 INP_WLOCK(inp); 290 if (callout_pending(&tp->t_timers->tt_delack) || 291 !callout_active(&tp->t_timers->tt_delack)) { 292 INP_WUNLOCK(inp); 293 CURVNET_RESTORE(); 294 return; 295 } 296 callout_deactivate(&tp->t_timers->tt_delack); 297 if ((inp->inp_flags & INP_DROPPED) != 0) { 298 INP_WUNLOCK(inp); 299 CURVNET_RESTORE(); 300 return; 301 } 302 303 tp->t_flags |= TF_ACKNOW; 304 TCPSTAT_INC(tcps_delack); 305 (void) tcp_output(tp); 306 INP_WUNLOCK(inp); 307 CURVNET_RESTORE(); 308 } 309 310 void 311 tcp_timer_2msl(void *xtp) 312 { 313 struct tcpcb *tp = xtp; 314 struct inpcb *inp; 315 CURVNET_SET(tp->t_vnet); 316 #ifdef TCPDEBUG 317 int ostate; 318 319 ostate = tp->t_state; 320 #endif 321 /* 322 * XXXRW: Does this actually happen? 323 */ 324 INP_INFO_WLOCK(&V_tcbinfo); 325 inp = tp->t_inpcb; 326 /* 327 * XXXRW: While this assert is in fact correct, bugs in the tcpcb 328 * tear-down mean we need it as a work-around for races between 329 * timers and tcp_discardcb(). 330 * 331 * KASSERT(inp != NULL, ("tcp_timer_2msl: inp == NULL")); 332 */ 333 if (inp == NULL) { 334 tcp_timer_race++; 335 INP_INFO_WUNLOCK(&V_tcbinfo); 336 CURVNET_RESTORE(); 337 return; 338 } 339 INP_WLOCK(inp); 340 tcp_free_sackholes(tp); 341 if (callout_pending(&tp->t_timers->tt_2msl) || 342 !callout_active(&tp->t_timers->tt_2msl)) { 343 INP_WUNLOCK(tp->t_inpcb); 344 INP_INFO_WUNLOCK(&V_tcbinfo); 345 CURVNET_RESTORE(); 346 return; 347 } 348 callout_deactivate(&tp->t_timers->tt_2msl); 349 if ((inp->inp_flags & INP_DROPPED) != 0) { 350 INP_WUNLOCK(inp); 351 INP_INFO_WUNLOCK(&V_tcbinfo); 352 CURVNET_RESTORE(); 353 return; 354 } 355 /* 356 * 2 MSL timeout in shutdown went off. If we're closed but 357 * still waiting for peer to close and connection has been idle 358 * too long, or if 2MSL time is up from TIME_WAIT, delete connection 359 * control block. Otherwise, check again in a bit. 360 * 361 * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed, 362 * there's no point in hanging onto FIN_WAIT_2 socket. Just close it. 363 * Ignore fact that there were recent incoming segments. 364 */ 365 if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 && 366 tp->t_inpcb && tp->t_inpcb->inp_socket && 367 (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { 368 TCPSTAT_INC(tcps_finwait2_drops); 369 tp = tcp_close(tp); 370 } else { 371 if (tp->t_state != TCPS_TIME_WAIT && 372 ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) 373 callout_reset_on(&tp->t_timers->tt_2msl, 374 TP_KEEPINTVL(tp), tcp_timer_2msl, tp, 375 inp_to_cpuid(inp)); 376 else 377 tp = tcp_close(tp); 378 } 379 380 #ifdef TCPDEBUG 381 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 382 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 383 PRU_SLOWTIMO); 384 #endif 385 if (tp != NULL) 386 INP_WUNLOCK(inp); 387 INP_INFO_WUNLOCK(&V_tcbinfo); 388 CURVNET_RESTORE(); 389 } 390 391 void 392 tcp_timer_keep(void *xtp) 393 { 394 struct tcpcb *tp = xtp; 395 struct tcptemp *t_template; 396 struct inpcb *inp; 397 CURVNET_SET(tp->t_vnet); 398 #ifdef TCPDEBUG 399 int ostate; 400 401 ostate = tp->t_state; 402 #endif 403 INP_INFO_WLOCK(&V_tcbinfo); 404 inp = tp->t_inpcb; 405 /* 406 * XXXRW: While this assert is in fact correct, bugs in the tcpcb 407 * tear-down mean we need it as a work-around for races between 408 * timers and tcp_discardcb(). 409 * 410 * KASSERT(inp != NULL, ("tcp_timer_keep: inp == NULL")); 411 */ 412 if (inp == NULL) { 413 tcp_timer_race++; 414 INP_INFO_WUNLOCK(&V_tcbinfo); 415 CURVNET_RESTORE(); 416 return; 417 } 418 INP_WLOCK(inp); 419 if (callout_pending(&tp->t_timers->tt_keep) || 420 !callout_active(&tp->t_timers->tt_keep)) { 421 INP_WUNLOCK(inp); 422 INP_INFO_WUNLOCK(&V_tcbinfo); 423 CURVNET_RESTORE(); 424 return; 425 } 426 callout_deactivate(&tp->t_timers->tt_keep); 427 if ((inp->inp_flags & INP_DROPPED) != 0) { 428 INP_WUNLOCK(inp); 429 INP_INFO_WUNLOCK(&V_tcbinfo); 430 CURVNET_RESTORE(); 431 return; 432 } 433 /* 434 * Keep-alive timer went off; send something 435 * or drop connection if idle for too long. 436 */ 437 TCPSTAT_INC(tcps_keeptimeo); 438 if (tp->t_state < TCPS_ESTABLISHED) 439 goto dropit; 440 if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 441 tp->t_state <= TCPS_CLOSING) { 442 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 443 goto dropit; 444 /* 445 * Send a packet designed to force a response 446 * if the peer is up and reachable: 447 * either an ACK if the connection is still alive, 448 * or an RST if the peer has closed the connection 449 * due to timeout or reboot. 450 * Using sequence number tp->snd_una-1 451 * causes the transmitted zero-length segment 452 * to lie outside the receive window; 453 * by the protocol spec, this requires the 454 * correspondent TCP to respond. 455 */ 456 TCPSTAT_INC(tcps_keepprobe); 457 t_template = tcpip_maketemplate(inp); 458 if (t_template) { 459 tcp_respond(tp, t_template->tt_ipgen, 460 &t_template->tt_t, (struct mbuf *)NULL, 461 tp->rcv_nxt, tp->snd_una - 1, 0); 462 free(t_template, M_TEMP); 463 } 464 callout_reset_on(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp), 465 tcp_timer_keep, tp, inp_to_cpuid(inp)); 466 } else 467 callout_reset_on(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp), 468 tcp_timer_keep, tp, inp_to_cpuid(inp)); 469 470 #ifdef TCPDEBUG 471 if (inp->inp_socket->so_options & SO_DEBUG) 472 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 473 PRU_SLOWTIMO); 474 #endif 475 INP_WUNLOCK(inp); 476 INP_INFO_WUNLOCK(&V_tcbinfo); 477 CURVNET_RESTORE(); 478 return; 479 480 dropit: 481 TCPSTAT_INC(tcps_keepdrops); 482 tp = tcp_drop(tp, ETIMEDOUT); 483 484 #ifdef TCPDEBUG 485 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 486 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 487 PRU_SLOWTIMO); 488 #endif 489 if (tp != NULL) 490 INP_WUNLOCK(tp->t_inpcb); 491 INP_INFO_WUNLOCK(&V_tcbinfo); 492 CURVNET_RESTORE(); 493 } 494 495 void 496 tcp_timer_persist(void *xtp) 497 { 498 struct tcpcb *tp = xtp; 499 struct inpcb *inp; 500 CURVNET_SET(tp->t_vnet); 501 #ifdef TCPDEBUG 502 int ostate; 503 504 ostate = tp->t_state; 505 #endif 506 INP_INFO_WLOCK(&V_tcbinfo); 507 inp = tp->t_inpcb; 508 /* 509 * XXXRW: While this assert is in fact correct, bugs in the tcpcb 510 * tear-down mean we need it as a work-around for races between 511 * timers and tcp_discardcb(). 512 * 513 * KASSERT(inp != NULL, ("tcp_timer_persist: inp == NULL")); 514 */ 515 if (inp == NULL) { 516 tcp_timer_race++; 517 INP_INFO_WUNLOCK(&V_tcbinfo); 518 CURVNET_RESTORE(); 519 return; 520 } 521 INP_WLOCK(inp); 522 if (callout_pending(&tp->t_timers->tt_persist) || 523 !callout_active(&tp->t_timers->tt_persist)) { 524 INP_WUNLOCK(inp); 525 INP_INFO_WUNLOCK(&V_tcbinfo); 526 CURVNET_RESTORE(); 527 return; 528 } 529 callout_deactivate(&tp->t_timers->tt_persist); 530 if ((inp->inp_flags & INP_DROPPED) != 0) { 531 INP_WUNLOCK(inp); 532 INP_INFO_WUNLOCK(&V_tcbinfo); 533 CURVNET_RESTORE(); 534 return; 535 } 536 /* 537 * Persistance timer into zero window. 538 * Force a byte to be output, if possible. 539 */ 540 TCPSTAT_INC(tcps_persisttimeo); 541 /* 542 * Hack: if the peer is dead/unreachable, we do not 543 * time out if the window is closed. After a full 544 * backoff, drop the connection if the idle time 545 * (no responses to probes) reaches the maximum 546 * backoff that we would use if retransmitting. 547 */ 548 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 549 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 550 ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 551 TCPSTAT_INC(tcps_persistdrop); 552 tp = tcp_drop(tp, ETIMEDOUT); 553 goto out; 554 } 555 /* 556 * If the user has closed the socket then drop a persisting 557 * connection after a much reduced timeout. 558 */ 559 if (tp->t_state > TCPS_CLOSE_WAIT && 560 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 561 TCPSTAT_INC(tcps_persistdrop); 562 tp = tcp_drop(tp, ETIMEDOUT); 563 goto out; 564 } 565 tcp_setpersist(tp); 566 tp->t_flags |= TF_FORCEDATA; 567 (void) tcp_output(tp); 568 tp->t_flags &= ~TF_FORCEDATA; 569 570 out: 571 #ifdef TCPDEBUG 572 if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG) 573 tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO); 574 #endif 575 if (tp != NULL) 576 INP_WUNLOCK(inp); 577 INP_INFO_WUNLOCK(&V_tcbinfo); 578 CURVNET_RESTORE(); 579 } 580 581 void 582 tcp_timer_rexmt(void * xtp) 583 { 584 struct tcpcb *tp = xtp; 585 CURVNET_SET(tp->t_vnet); 586 int rexmt; 587 int headlocked; 588 struct inpcb *inp; 589 #ifdef TCPDEBUG 590 int ostate; 591 592 ostate = tp->t_state; 593 #endif 594 595 INP_INFO_RLOCK(&V_tcbinfo); 596 inp = tp->t_inpcb; 597 /* 598 * XXXRW: While this assert is in fact correct, bugs in the tcpcb 599 * tear-down mean we need it as a work-around for races between 600 * timers and tcp_discardcb(). 601 * 602 * KASSERT(inp != NULL, ("tcp_timer_rexmt: inp == NULL")); 603 */ 604 if (inp == NULL) { 605 tcp_timer_race++; 606 INP_INFO_RUNLOCK(&V_tcbinfo); 607 CURVNET_RESTORE(); 608 return; 609 } 610 INP_WLOCK(inp); 611 if (callout_pending(&tp->t_timers->tt_rexmt) || 612 !callout_active(&tp->t_timers->tt_rexmt)) { 613 INP_WUNLOCK(inp); 614 INP_INFO_RUNLOCK(&V_tcbinfo); 615 CURVNET_RESTORE(); 616 return; 617 } 618 callout_deactivate(&tp->t_timers->tt_rexmt); 619 if ((inp->inp_flags & INP_DROPPED) != 0) { 620 INP_WUNLOCK(inp); 621 INP_INFO_RUNLOCK(&V_tcbinfo); 622 CURVNET_RESTORE(); 623 return; 624 } 625 tcp_free_sackholes(tp); 626 /* 627 * Retransmission timer went off. Message has not 628 * been acked within retransmit interval. Back off 629 * to a longer retransmit interval and retransmit one segment. 630 */ 631 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { 632 tp->t_rxtshift = TCP_MAXRXTSHIFT; 633 TCPSTAT_INC(tcps_timeoutdrop); 634 in_pcbref(inp); 635 INP_INFO_RUNLOCK(&V_tcbinfo); 636 INP_WUNLOCK(inp); 637 INP_INFO_WLOCK(&V_tcbinfo); 638 INP_WLOCK(inp); 639 if (in_pcbrele_wlocked(inp)) { 640 INP_INFO_WUNLOCK(&V_tcbinfo); 641 CURVNET_RESTORE(); 642 return; 643 } 644 if (inp->inp_flags & INP_DROPPED) { 645 INP_WUNLOCK(inp); 646 INP_INFO_WUNLOCK(&V_tcbinfo); 647 CURVNET_RESTORE(); 648 return; 649 } 650 651 tp = tcp_drop(tp, tp->t_softerror ? 652 tp->t_softerror : ETIMEDOUT); 653 headlocked = 1; 654 goto out; 655 } 656 INP_INFO_RUNLOCK(&V_tcbinfo); 657 headlocked = 0; 658 if (tp->t_state == TCPS_SYN_SENT) { 659 /* 660 * If the SYN was retransmitted, indicate CWND to be 661 * limited to 1 segment in cc_conn_init(). 662 */ 663 tp->snd_cwnd = 1; 664 } else if (tp->t_rxtshift == 1) { 665 /* 666 * first retransmit; record ssthresh and cwnd so they can 667 * be recovered if this turns out to be a "bad" retransmit. 668 * A retransmit is considered "bad" if an ACK for this 669 * segment is received within RTT/2 interval; the assumption 670 * here is that the ACK was already in flight. See 671 * "On Estimating End-to-End Network Path Properties" by 672 * Allman and Paxson for more details. 673 */ 674 tp->snd_cwnd_prev = tp->snd_cwnd; 675 tp->snd_ssthresh_prev = tp->snd_ssthresh; 676 tp->snd_recover_prev = tp->snd_recover; 677 if (IN_FASTRECOVERY(tp->t_flags)) 678 tp->t_flags |= TF_WASFRECOVERY; 679 else 680 tp->t_flags &= ~TF_WASFRECOVERY; 681 if (IN_CONGRECOVERY(tp->t_flags)) 682 tp->t_flags |= TF_WASCRECOVERY; 683 else 684 tp->t_flags &= ~TF_WASCRECOVERY; 685 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 686 tp->t_flags |= TF_PREVVALID; 687 } else 688 tp->t_flags &= ~TF_PREVVALID; 689 TCPSTAT_INC(tcps_rexmttimeo); 690 if (tp->t_state == TCPS_SYN_SENT) 691 rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift]; 692 else 693 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 694 TCPT_RANGESET(tp->t_rxtcur, rexmt, 695 tp->t_rttmin, TCPTV_REXMTMAX); 696 697 /* 698 * We enter the path for PLMTUD if connection is established or, if 699 * connection is FIN_WAIT_1 status, reason for the last is that if 700 * amount of data we send is very small, we could send it in couple of 701 * packets and process straight to FIN. In that case we won't catch 702 * ESTABLISHED state. 703 */ 704 if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) 705 || (tp->t_state == TCPS_FIN_WAIT_1))) { 706 int optlen; 707 #ifdef INET6 708 int isipv6; 709 #endif 710 711 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) == 712 (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) && 713 (tp->t_rxtshift <= 2)) { 714 /* 715 * Enter Path MTU Black-hole Detection mechanism: 716 * - Disable Path MTU Discovery (IP "DF" bit). 717 * - Reduce MTU to lower value than what we 718 * negotiated with peer. 719 */ 720 /* Record that we may have found a black hole. */ 721 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 722 723 /* Keep track of previous MSS. */ 724 optlen = tp->t_maxopd - tp->t_maxseg; 725 tp->t_pmtud_saved_maxopd = tp->t_maxopd; 726 727 /* 728 * Reduce the MSS to blackhole value or to the default 729 * in an attempt to retransmit. 730 */ 731 #ifdef INET6 732 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; 733 if (isipv6 && 734 tp->t_maxopd > V_tcp_v6pmtud_blackhole_mss) { 735 /* Use the sysctl tuneable blackhole MSS. */ 736 tp->t_maxopd = V_tcp_v6pmtud_blackhole_mss; 737 V_tcp_pmtud_blackhole_activated++; 738 } else if (isipv6) { 739 /* Use the default MSS. */ 740 tp->t_maxopd = V_tcp_v6mssdflt; 741 /* 742 * Disable Path MTU Discovery when we switch to 743 * minmss. 744 */ 745 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 746 V_tcp_pmtud_blackhole_activated_min_mss++; 747 } 748 #endif 749 #if defined(INET6) && defined(INET) 750 else 751 #endif 752 #ifdef INET 753 if (tp->t_maxopd > V_tcp_pmtud_blackhole_mss) { 754 /* Use the sysctl tuneable blackhole MSS. */ 755 tp->t_maxopd = V_tcp_pmtud_blackhole_mss; 756 V_tcp_pmtud_blackhole_activated++; 757 } else { 758 /* Use the default MSS. */ 759 tp->t_maxopd = V_tcp_mssdflt; 760 /* 761 * Disable Path MTU Discovery when we switch to 762 * minmss. 763 */ 764 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 765 V_tcp_pmtud_blackhole_activated_min_mss++; 766 } 767 #endif 768 tp->t_maxseg = tp->t_maxopd - optlen; 769 /* 770 * Reset the slow-start flight size 771 * as it may depend on the new MSS. 772 */ 773 if (CC_ALGO(tp)->conn_init != NULL) 774 CC_ALGO(tp)->conn_init(tp->ccv); 775 } else { 776 /* 777 * If further retransmissions are still unsuccessful 778 * with a lowered MTU, maybe this isn't a blackhole and 779 * we restore the previous MSS and blackhole detection 780 * flags. 781 */ 782 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 783 (tp->t_rxtshift > 4)) { 784 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 785 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 786 optlen = tp->t_maxopd - tp->t_maxseg; 787 tp->t_maxopd = tp->t_pmtud_saved_maxopd; 788 tp->t_maxseg = tp->t_maxopd - optlen; 789 V_tcp_pmtud_blackhole_failed++; 790 /* 791 * Reset the slow-start flight size as it 792 * may depend on the new MSS. 793 */ 794 if (CC_ALGO(tp)->conn_init != NULL) 795 CC_ALGO(tp)->conn_init(tp->ccv); 796 } 797 } 798 } 799 800 /* 801 * Disable RFC1323 and SACK if we haven't got any response to 802 * our third SYN to work-around some broken terminal servers 803 * (most of which have hopefully been retired) that have bad VJ 804 * header compression code which trashes TCP segments containing 805 * unknown-to-them TCP options. 806 */ 807 if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && 808 (tp->t_rxtshift == 3)) 809 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); 810 /* 811 * If we backed off this far, our srtt estimate is probably bogus. 812 * Clobber it so we'll take the next rtt measurement as our srtt; 813 * move the current srtt into rttvar to keep the current 814 * retransmit times until then. 815 */ 816 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 817 #ifdef INET6 818 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 819 in6_losing(tp->t_inpcb); 820 #endif 821 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 822 tp->t_srtt = 0; 823 } 824 tp->snd_nxt = tp->snd_una; 825 tp->snd_recover = tp->snd_max; 826 /* 827 * Force a segment to be sent. 828 */ 829 tp->t_flags |= TF_ACKNOW; 830 /* 831 * If timing a segment in this window, stop the timer. 832 */ 833 tp->t_rtttime = 0; 834 835 cc_cong_signal(tp, NULL, CC_RTO); 836 837 (void) tcp_output(tp); 838 839 out: 840 #ifdef TCPDEBUG 841 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 842 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 843 PRU_SLOWTIMO); 844 #endif 845 if (tp != NULL) 846 INP_WUNLOCK(inp); 847 if (headlocked) 848 INP_INFO_WUNLOCK(&V_tcbinfo); 849 CURVNET_RESTORE(); 850 } 851 852 void 853 tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta) 854 { 855 struct callout *t_callout; 856 void *f_callout; 857 struct inpcb *inp = tp->t_inpcb; 858 int cpu = inp_to_cpuid(inp); 859 860 #ifdef TCP_OFFLOAD 861 if (tp->t_flags & TF_TOE) 862 return; 863 #endif 864 865 switch (timer_type) { 866 case TT_DELACK: 867 t_callout = &tp->t_timers->tt_delack; 868 f_callout = tcp_timer_delack; 869 break; 870 case TT_REXMT: 871 t_callout = &tp->t_timers->tt_rexmt; 872 f_callout = tcp_timer_rexmt; 873 break; 874 case TT_PERSIST: 875 t_callout = &tp->t_timers->tt_persist; 876 f_callout = tcp_timer_persist; 877 break; 878 case TT_KEEP: 879 t_callout = &tp->t_timers->tt_keep; 880 f_callout = tcp_timer_keep; 881 break; 882 case TT_2MSL: 883 t_callout = &tp->t_timers->tt_2msl; 884 f_callout = tcp_timer_2msl; 885 break; 886 default: 887 panic("bad timer_type"); 888 } 889 if (delta == 0) { 890 callout_stop(t_callout); 891 } else { 892 callout_reset_on(t_callout, delta, f_callout, tp, cpu); 893 } 894 } 895 896 int 897 tcp_timer_active(struct tcpcb *tp, int timer_type) 898 { 899 struct callout *t_callout; 900 901 switch (timer_type) { 902 case TT_DELACK: 903 t_callout = &tp->t_timers->tt_delack; 904 break; 905 case TT_REXMT: 906 t_callout = &tp->t_timers->tt_rexmt; 907 break; 908 case TT_PERSIST: 909 t_callout = &tp->t_timers->tt_persist; 910 break; 911 case TT_KEEP: 912 t_callout = &tp->t_timers->tt_keep; 913 break; 914 case TT_2MSL: 915 t_callout = &tp->t_timers->tt_2msl; 916 break; 917 default: 918 panic("bad timer_type"); 919 } 920 return callout_active(t_callout); 921 } 922 923 #define ticks_to_msecs(t) (1000*(t) / hz) 924 925 void 926 tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, 927 struct xtcp_timer *xtimer) 928 { 929 sbintime_t now; 930 931 bzero(xtimer, sizeof(*xtimer)); 932 if (timer == NULL) 933 return; 934 now = getsbinuptime(); 935 if (callout_active(&timer->tt_delack)) 936 xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS; 937 if (callout_active(&timer->tt_rexmt)) 938 xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS; 939 if (callout_active(&timer->tt_persist)) 940 xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS; 941 if (callout_active(&timer->tt_keep)) 942 xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS; 943 if (callout_active(&timer->tt_2msl)) 944 xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS; 945 xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime); 946 } 947