/*-
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_tcpdebug.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <net/if.h>
#include <net/route.h>
#include <net/rss_config.h>
#include <net/vnet.h>
#include <net/netisr.h>

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_rss.h>
#include <netinet/in_systm.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/cc/cc.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif

int	tcp_persmin;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval");

int	tcp_persmax;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval");

int	tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection");

int	tcp_keepidle;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepidle, 0, sysctl_msec_to_ticks, "I",
    "time before keepalive probes begin");

int	tcp_keepintvl;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I",
    "time between keepalive probes");

int	tcp_delacktime;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime,
    CTLTYPE_INT|CTLFLAG_RW, &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
    "Time before a delayed ACK is sent");

int	tcp_msl;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");

int	tcp_rexmit_min;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
    "Minimum Retransmission Timeout");

int	tcp_rexmit_slop;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
    "Retransmission Timer Slop");

static int	always_keepalive = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW,
    &always_keepalive, 0, "Assume SO_KEEPALIVE on all TCP connections");

int	tcp_fast_finwait2_recycle = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW,
    &tcp_fast_finwait2_recycle, 0,
    "Recycle closed FIN_WAIT_2 connections faster");

int	tcp_finwait2_timeout;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout");

int	tcp_keepcnt = TCPTV_KEEPCNT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
    "Number of keepalive probes to send");

/* max idle probes */
int	tcp_maxpersistidle;

static int	tcp_rexmit_drop_options = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
    &tcp_rexmit_drop_options, 0,
    "Drop TCP options from 3rd and later retransmitted SYN");

static VNET_DEFINE(int, tcp_pmtud_blackhole_detect);
#define	V_tcp_pmtud_blackhole_detect	VNET(tcp_pmtud_blackhole_detect)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_detect), 0,
    "Path MTU Discovery Black Hole Detection Enabled");

static VNET_DEFINE(int, tcp_pmtud_blackhole_activated);
#define	V_tcp_pmtud_blackhole_activated \
    VNET(tcp_pmtud_blackhole_activated)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated,
    CTLFLAG_RD|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_activated), 0,
    "Path MTU Discovery Black Hole Detection, Activation Count");

static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss);
#define	V_tcp_pmtud_blackhole_activated_min_mss \
    VNET(tcp_pmtud_blackhole_activated_min_mss)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss,
    CTLFLAG_RD|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0,
    "Path MTU Discovery Black Hole Detection, Activation Count at min MSS");

static VNET_DEFINE(int, tcp_pmtud_blackhole_failed);
#define	V_tcp_pmtud_blackhole_failed	VNET(tcp_pmtud_blackhole_failed)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed,
    CTLFLAG_RD|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_failed), 0,
    "Path MTU Discovery Black Hole Detection, Failure Count");

#ifdef INET
static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;
#define	V_tcp_pmtud_blackhole_mss	VNET(tcp_pmtud_blackhole_mss)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
    "Path MTU Discovery Black Hole Detection lowered MSS");
#endif

#ifdef INET6
static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
#define	V_tcp_v6pmtud_blackhole_mss	VNET(tcp_v6pmtud_blackhole_mss)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
    "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
#endif

#ifdef RSS
static int	per_cpu_timers = 1;
#else
static int	per_cpu_timers = 0;
#endif
SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
    &per_cpu_timers, 0, "run tcp timers on all cpus");

#if 0
#define	INP_CPU(inp)	(per_cpu_timers ? \
    (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \
    ((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0)
#endif

/*
 * Map the given inp to a CPU id.
 *
 * This queries RSS if it's compiled in, else it defaults to the current
 * CPU ID.
 */
static inline int
inp_to_cpuid(struct inpcb *inp)
{
	u_int cpuid;

#ifdef RSS
	if (per_cpu_timers) {
		cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
		if (cpuid == NETISR_CPUID_NONE)
			return (curcpu);	/* XXX */
		else
			return (cpuid);
	}
#else
	/* Legacy, pre-RSS behaviour */
	if (per_cpu_timers) {
		/*
		 * We don't have a flowid -> cpuid mapping, so cheat and
		 * just map unknown cpuids to curcpu.  Not the best, but
		 * apparently better than defaulting to swi 0.
		 */
		cpuid = inp->inp_flowid % (mp_maxid + 1);
		if (!CPU_ABSENT(cpuid))
			return (cpuid);
		return (curcpu);
	}
#endif
	/* Default for RSS and non-RSS - cpuid 0 */
	else {
		return (0);
	}
}

/*
 * TCP protocol timeout routine called every 500 ms.
 * Updates timestamps used for TCP and causes finite state machine
 * actions if timers expire.
 */
void
tcp_slowtimo(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		(void) tcp_tw_2msl_scan(0);
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
}

int	tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };

int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };

static int tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */

/*
 * TCP timer processing.
 */

void
tcp_timer_delack(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);

	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_delack) ||
	    !callout_active(&tp->t_timers->tt_delack)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_delack);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	tp->t_flags |= TF_ACKNOW;
	TCPSTAT_INC(tcps_delack);
	(void) tp->t_fb->tfb_tcp_output(tp);
	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
}

/*
 * When a timer wants to remove a TCB it must
 * hold the INP_INFO_RLOCK().  The timer function
 * should only have grabbed the INP_WLOCK() when
 * it entered.  To safely switch to holding both the
 * INP_INFO_RLOCK() and the INP_WLOCK() we must first
 * grab a reference on the inp, which will hold the inp
 * so that it can't be removed.  We then unlock the INP_WLOCK(),
 * and grab the INP_INFO_RLOCK().  Once we have the INP_INFO_RLOCK()
 * we proceed again to get the INP_WLOCK() (this preserves proper
 * lock order).  After acquiring the INP_WLOCK we must check if someone
 * else deleted the pcb, i.e. the inp_flags check.
 * If so we return 1, otherwise we return 0.
 *
 * No matter what the tcp_inpinfo_lock_add() function
 * returns, the caller must afterwards call tcp_inpinfo_lock_del()
 * to drop the locks and reference properly.
 */

int
tcp_inpinfo_lock_add(struct inpcb *inp)
{
	in_pcbref(inp);
	INP_WUNLOCK(inp);
	INP_INFO_RLOCK(&V_tcbinfo);
	INP_WLOCK(inp);
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		return (1);
	}
	return (0);
}

void
tcp_inpinfo_lock_del(struct inpcb *inp, struct tcpcb *tp)
{
	INP_INFO_RUNLOCK(&V_tcbinfo);
	if (inp && (tp == NULL)) {
		/*
		 * If tcp_close/drop() gets called and tp
		 * returns NULL, then the function dropped
		 * the inp lock.  We hold a reference keeping
		 * the inp around, so we must re-acquire the
		 * INP_WLOCK() in order to proceed with
		 * dropping the inp reference.
		 */
		INP_WLOCK(inp);
	}
	if (inp && in_pcbrele_wlocked(inp) == 0)
		INP_WUNLOCK(inp);
}

void
tcp_timer_2msl(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	tcp_free_sackholes(tp);
	if (callout_pending(&tp->t_timers->tt_2msl) ||
	    !callout_active(&tp->t_timers->tt_2msl)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_2msl);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	/*
	 * The 2 MSL timeout in shutdown went off.  If we're closed but
	 * still waiting for the peer to close and the connection has been
	 * idle too long, delete the connection control block.  Otherwise,
	 * check again in a bit.
	 *
	 * If in TIME_WAIT state, just ignore this timeout; it is handled
	 * in tcp_tw_2msl_scan().
	 *
	 * If fast recycling of FIN_WAIT_2 sockets is enabled, we are in
	 * FIN_WAIT_2, and the receiver has closed, there's no point in
	 * hanging onto the FIN_WAIT_2 socket.  Just close it, and ignore
	 * the fact that there were recent incoming segments.
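	 *
	 * Added note (not in the original source): TP_MAXIDLE() is the
	 * product of the keepalive probe count and interval; with the
	 * stock defaults of net.inet.tcp.keepcnt = 8 and keepintvl = 75 s
	 * that is roughly 600 s, so an idle half-closed connection is
	 * re-checked every TP_KEEPINTVL() ticks and dropped once it has
	 * been idle for longer than TP_MAXIDLE().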
	 */
	if ((inp->inp_flags & INP_TIMEWAIT) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 &&
	    tp->t_inpcb && tp->t_inpcb->inp_socket &&
	    (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
		TCPSTAT_INC(tcps_finwait2_drops);
		if (tcp_inpinfo_lock_add(inp)) {
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
		tp = tcp_close(tp);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	} else {
		if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
			callout_reset(&tp->t_timers->tt_2msl,
			    TP_KEEPINTVL(tp), tcp_timer_2msl, tp);
		} else {
			if (tcp_inpinfo_lock_add(inp)) {
				tcp_inpinfo_lock_del(inp, tp);
				goto out;
			}
			tp = tcp_close(tp);
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
	}

#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);

	if (tp != NULL)
		INP_WUNLOCK(inp);
out:
	CURVNET_RESTORE();
}

void
tcp_timer_keep(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct tcptemp *t_template;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_keep) ||
	    !callout_active(&tp->t_timers->tt_keep)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_keep);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	TCPSTAT_INC(tcps_keeptimeo);
	if (tp->t_state < TCPS_ESTABLISHED)
		goto dropit;
	if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
	    tp->t_state <= TCPS_CLOSING) {
		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
			goto dropit;
		/*
		 * Send a packet designed to force a response
		 * if the peer is up and reachable:
		 * either an ACK if the connection is still alive,
		 * or an RST if the peer has closed the connection
		 * due to timeout or reboot.
		 * Using sequence number tp->snd_una-1
		 * causes the transmitted zero-length segment
		 * to lie outside the receive window;
		 * by the protocol spec, this requires the
		 * correspondent TCP to respond.
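		 *
		 * Added note (not in the original source): with the stock
		 * defaults of keepidle = 2 h, keepintvl = 75 s and
		 * keepcnt = 8, the first probe goes out after two hours of
		 * idleness, and an unresponsive peer is dropped roughly
		 * 600 s later, once the idle time exceeds
		 * TP_KEEPIDLE() + TP_MAXIDLE().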
		 */
		TCPSTAT_INC(tcps_keepprobe);
		t_template = tcpip_maketemplate(inp);
		if (t_template) {
			tcp_respond(tp, t_template->tt_ipgen,
			    &t_template->tt_t, (struct mbuf *)NULL,
			    tp->rcv_nxt, tp->snd_una - 1, 0);
			free(t_template, M_TEMP);
		}
		callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
		    tcp_timer_keep, tp);
	} else
		callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
		    tcp_timer_keep, tp);

#ifdef TCPDEBUG
	if (inp->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
	return;

dropit:
	TCPSTAT_INC(tcps_keepdrops);

	if (tcp_inpinfo_lock_add(inp)) {
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	tp = tcp_drop(tp, ETIMEDOUT);

#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	tcp_inpinfo_lock_del(inp, tp);
out:
	CURVNET_RESTORE();
}

void
tcp_timer_persist(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_persist) ||
	    !callout_active(&tp->t_timers->tt_persist)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_persist);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	/*
	 * Persistence timer into zero window.
	 * Force a byte to be output, if possible.
	 */
	TCPSTAT_INC(tcps_persisttimeo);
	/*
	 * Hack: if the peer is dead/unreachable, we do not
	 * time out if the window is closed.  After a full
	 * backoff, drop the connection if the idle time
	 * (no responses to probes) reaches the maximum
	 * backoff that we would use if retransmitting.
	 */
	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
	    ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
		TCPSTAT_INC(tcps_persistdrop);
		if (tcp_inpinfo_lock_add(inp)) {
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
		tp = tcp_drop(tp, ETIMEDOUT);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	/*
	 * If the user has closed the socket then drop a persisting
	 * connection after a much reduced timeout.
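	 *
	 * Added note (not in the original source): with the stock
	 * definitions TCPTV_PERSMAX is 60 s, versus the much longer
	 * tcp_maxpersistidle / total-backoff limit applied above while
	 * the socket is still open.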
	 */
	if (tp->t_state > TCPS_CLOSE_WAIT &&
	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
		TCPSTAT_INC(tcps_persistdrop);
		if (tcp_inpinfo_lock_add(inp)) {
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
		tp = tcp_drop(tp, ETIMEDOUT);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	tcp_setpersist(tp);
	tp->t_flags |= TF_FORCEDATA;
	(void) tp->t_fb->tfb_tcp_output(tp);
	tp->t_flags &= ~TF_FORCEDATA;

#ifdef TCPDEBUG
	if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	INP_WUNLOCK(inp);
out:
	CURVNET_RESTORE();
}

void
tcp_timer_rexmt(void *xtp)
{
	struct tcpcb *tp = xtp;
	CURVNET_SET(tp->t_vnet);
	int rexmt;
	struct inpcb *inp;
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	if (callout_pending(&tp->t_timers->tt_rexmt) ||
	    !callout_active(&tp->t_timers->tt_rexmt)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_rexmt);
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	tcp_free_sackholes(tp);
	if (tp->t_fb->tfb_tcp_rexmit_tmr) {
		/* The stack has a timer action too. */
		(*tp->t_fb->tfb_tcp_rexmit_tmr)(tp);
	}
	/*
	 * Retransmission timer went off.  Message has not
	 * been acked within retransmit interval.  Back off
	 * to a longer retransmit interval and retransmit one segment.
	 */
	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
		tp->t_rxtshift = TCP_MAXRXTSHIFT;
		TCPSTAT_INC(tcps_timeoutdrop);
		if (tcp_inpinfo_lock_add(inp)) {
			tcp_inpinfo_lock_del(inp, tp);
			goto out;
		}
		tp = tcp_drop(tp, tp->t_softerror ?
		    tp->t_softerror : ETIMEDOUT);
		tcp_inpinfo_lock_del(inp, tp);
		goto out;
	}
	if (tp->t_state == TCPS_SYN_SENT) {
		/*
		 * If the SYN was retransmitted, indicate CWND to be
		 * limited to 1 segment in cc_conn_init().
		 */
		tp->snd_cwnd = 1;
	} else if (tp->t_rxtshift == 1) {
		/*
		 * First retransmit; record ssthresh and cwnd so they can
		 * be recovered if this turns out to be a "bad" retransmit.
		 * A retransmit is considered "bad" if an ACK for this
		 * segment is received within RTT/2 interval; the assumption
		 * here is that the ACK was already in flight.  See
		 * "On Estimating End-to-End Network Path Properties" by
		 * Allman and Paxson for more details.
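		 *
		 * Added note (not in the original source): t_srtt is stored
		 * scaled by 2^TCP_RTT_SHIFT, so the expression
		 * "ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1))" below sets
		 * t_badrxtwin to now plus half the smoothed RTT, in ticks.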
		 */
		tp->snd_cwnd_prev = tp->snd_cwnd;
		tp->snd_ssthresh_prev = tp->snd_ssthresh;
		tp->snd_recover_prev = tp->snd_recover;
		if (IN_FASTRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASFRECOVERY;
		else
			tp->t_flags &= ~TF_WASFRECOVERY;
		if (IN_CONGRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASCRECOVERY;
		else
			tp->t_flags &= ~TF_WASCRECOVERY;
		tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
		tp->t_flags |= TF_PREVVALID;
	} else
		tp->t_flags &= ~TF_PREVVALID;
	TCPSTAT_INC(tcps_rexmttimeo);
	if ((tp->t_state == TCPS_SYN_SENT) ||
	    (tp->t_state == TCPS_SYN_RECEIVED))
		rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift];
	else
		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
	TCPT_RANGESET(tp->t_rxtcur, rexmt,
	    tp->t_rttmin, TCPTV_REXMTMAX);

	/*
	 * We enter the Path MTU Discovery blackhole path if the connection
	 * is ESTABLISHED or in FIN_WAIT_1.  The latter is included because
	 * if the amount of data we send is very small, we could send it in
	 * a couple of packets and proceed straight to FIN, in which case
	 * we would never observe the ESTABLISHED state.
	 */
	if (V_tcp_pmtud_blackhole_detect &&
	    ((tp->t_state == TCPS_ESTABLISHED) ||
	    (tp->t_state == TCPS_FIN_WAIT_1))) {
#ifdef INET6
		int isipv6;
#endif

		/*
		 * The idea here is that each stage of the MTU probe
		 * (usually 1448 -> 1188 -> 524) should be given 2 chances
		 * to recover before we clamp down further.
		 * 'tp->t_rxtshift % 2 == 0' should take care of that.
		 */
		if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
		    (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
		    (tp->t_rxtshift >= 2 && tp->t_rxtshift % 2 == 0)) {
			/*
			 * Enter Path MTU Black-hole Detection mechanism:
			 * - Disable Path MTU Discovery (IP "DF" bit).
			 * - Reduce MTU to lower value than what we
			 *   negotiated with peer.
			 */
			/* Record that we may have found a black hole. */
			tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;

			/* Keep track of previous MSS. */
			tp->t_pmtud_saved_maxseg = tp->t_maxseg;

			/*
			 * Reduce the MSS to the blackhole value or to the
			 * default in an attempt to retransmit.
			 */
#ifdef INET6
			isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
			if (isipv6 &&
			    tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
				/* Use the sysctl tunable blackhole MSS. */
				tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
				V_tcp_pmtud_blackhole_activated++;
			} else if (isipv6) {
				/* Use the default MSS. */
				tp->t_maxseg = V_tcp_v6mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch
				 * to minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				V_tcp_pmtud_blackhole_activated_min_mss++;
			}
#endif
#if defined(INET6) && defined(INET)
			else
#endif
#ifdef INET
			if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
				/* Use the sysctl tunable blackhole MSS. */
				tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
				V_tcp_pmtud_blackhole_activated++;
			} else {
				/* Use the default MSS. */
				tp->t_maxseg = V_tcp_mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch
				 * to minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				V_tcp_pmtud_blackhole_activated_min_mss++;
			}
#endif
			/*
			 * Reset the slow-start flight size
			 * as it may depend on the new MSS.
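			 *
			 * Added note (not in the original source): the
			 * initial congestion window is computed in units of
			 * full-sized segments (cf. RFC 3390), so once
			 * t_maxseg changes the congestion control module
			 * must recompute it via its conn_init() hook.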
			 */
			if (CC_ALGO(tp)->conn_init != NULL)
				CC_ALGO(tp)->conn_init(tp->ccv);
		} else {
			/*
			 * If further retransmissions are still unsuccessful
			 * with a lowered MTU, maybe this isn't a blackhole,
			 * so we restore the previous MSS and blackhole
			 * detection flags.  The limit '6' is determined by
			 * giving each probe stage (1448, 1188, 524) 2
			 * chances to recover.
			 */
			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
			    (tp->t_rxtshift > 6)) {
				tp->t_flags2 |= TF2_PLPMTU_PMTUD;
				tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
				tp->t_maxseg = tp->t_pmtud_saved_maxseg;
				V_tcp_pmtud_blackhole_failed++;
				/*
				 * Reset the slow-start flight size as it
				 * may depend on the new MSS.
				 */
				if (CC_ALGO(tp)->conn_init != NULL)
					CC_ALGO(tp)->conn_init(tp->ccv);
			}
		}
	}

	/*
	 * Disable RFC 1323 and SACK if we haven't got any response to
	 * our third SYN, to work around some broken terminal servers
	 * (most of which have hopefully been retired) that have bad VJ
	 * header compression code which trashes TCP segments containing
	 * unknown-to-them TCP options.
	 */
	if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
	    (tp->t_rxtshift == 3))
		tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
	/*
	 * If we backed off this far, our srtt estimate is probably bogus.
	 * Clobber it so we'll take the next rtt measurement as our srtt;
	 * move the current srtt into rttvar to keep the current
	 * retransmit times until then.
	 */
	if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
#ifdef INET6
		if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
			in6_losing(tp->t_inpcb);
		else
#endif
		in_losing(tp->t_inpcb);
		tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
		tp->t_srtt = 0;
	}
	tp->snd_nxt = tp->snd_una;
	tp->snd_recover = tp->snd_max;
	/*
	 * Force a segment to be sent.
	 */
	tp->t_flags |= TF_ACKNOW;
	/*
	 * If timing a segment in this window, stop the timer.
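	 * An ACK arriving after a retransmission would be ambiguous (it
	 * may be for the original transmission or for the retransmitted
	 * copy), so per Karn's algorithm the in-progress RTT sample is
	 * discarded rather than risk corrupting srtt.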
	 */
	tp->t_rtttime = 0;

	cc_cong_signal(tp, NULL, CC_RTO);

	(void) tp->t_fb->tfb_tcp_output(tp);

#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
		    PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	INP_WUNLOCK(inp);
out:
	CURVNET_RESTORE();
}

void
tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
{
	struct callout *t_callout;
	timeout_t *f_callout;
	struct inpcb *inp = tp->t_inpcb;
	int cpu = inp_to_cpuid(inp);

#ifdef TCP_OFFLOAD
	if (tp->t_flags & TF_TOE)
		return;
#endif

	if (tp->t_timers->tt_flags & TT_STOPPED)
		return;

	switch (timer_type) {
	case TT_DELACK:
		t_callout = &tp->t_timers->tt_delack;
		f_callout = tcp_timer_delack;
		break;
	case TT_REXMT:
		t_callout = &tp->t_timers->tt_rexmt;
		f_callout = tcp_timer_rexmt;
		break;
	case TT_PERSIST:
		t_callout = &tp->t_timers->tt_persist;
		f_callout = tcp_timer_persist;
		break;
	case TT_KEEP:
		t_callout = &tp->t_timers->tt_keep;
		f_callout = tcp_timer_keep;
		break;
	case TT_2MSL:
		t_callout = &tp->t_timers->tt_2msl;
		f_callout = tcp_timer_2msl;
		break;
	default:
		if (tp->t_fb->tfb_tcp_timer_activate) {
			tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta);
			return;
		}
		panic("tp %p bad timer_type %#x", tp, timer_type);
	}
	if (delta == 0) {
		callout_stop(t_callout);
	} else {
		callout_reset_on(t_callout, delta, f_callout, tp, cpu);
	}
}

int
tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;

	switch (timer_type) {
	case TT_DELACK:
		t_callout = &tp->t_timers->tt_delack;
		break;
	case TT_REXMT:
		t_callout = &tp->t_timers->tt_rexmt;
		break;
	case TT_PERSIST:
		t_callout = &tp->t_timers->tt_persist;
		break;
	case TT_KEEP:
		t_callout = &tp->t_timers->tt_keep;
		break;
	case TT_2MSL:
		t_callout = &tp->t_timers->tt_2msl;
		break;
	default:
		if (tp->t_fb->tfb_tcp_timer_active) {
			return (tp->t_fb->tfb_tcp_timer_active(tp, timer_type));
		}
		panic("tp %p bad timer_type %#x", tp, timer_type);
	}
	return (callout_active(t_callout));
}

void
tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;

	tp->t_timers->tt_flags |= TT_STOPPED;
	switch (timer_type) {
	case TT_DELACK:
		t_callout = &tp->t_timers->tt_delack;
		break;
	case TT_REXMT:
		t_callout = &tp->t_timers->tt_rexmt;
		break;
	case TT_PERSIST:
		t_callout = &tp->t_timers->tt_persist;
		break;
	case TT_KEEP:
		t_callout = &tp->t_timers->tt_keep;
		break;
	case TT_2MSL:
		t_callout = &tp->t_timers->tt_2msl;
		break;
	default:
		if (tp->t_fb->tfb_tcp_timer_stop) {
			/*
			 * XXXrrs we need to look at this with the
			 * stop case below (flags).
			 */
			tp->t_fb->tfb_tcp_timer_stop(tp, timer_type);
			return;
		}
		panic("tp %p bad timer_type %#x", tp, timer_type);
	}

	if (callout_async_drain(t_callout, tcp_timer_discard) == 0) {
		/*
		 * Can't stop the callout, defer tcpcb actual deletion
		 * to the last one.  We do this using the async drain
		 * function and incrementing the count in tt_draincnt.
		 */
		tp->t_timers->tt_draincnt++;
	}
}

#define	ticks_to_msecs(t)	(1000*(t) / hz)

void
tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer,
    struct xtcp_timer *xtimer)
{
	sbintime_t now;

	bzero(xtimer, sizeof(*xtimer));
	if (timer == NULL)
		return;
	now = getsbinuptime();
	if (callout_active(&timer->tt_delack))
		xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS;
	if (callout_active(&timer->tt_rexmt))
		xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS;
	if (callout_active(&timer->tt_persist))
		xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS;
	if (callout_active(&timer->tt_keep))
		xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS;
	if (callout_active(&timer->tt_2msl))
		xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS;
	xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime);
}
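
/*
 * Usage sketch (editor's illustration, not part of the original file; the
 * real callers live elsewhere, e.g. tcp_output.c and tcp_input.c):
 *
 *	tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);	arm/restart REXMT
 *	tcp_timer_activate(tp, TT_REXMT, 0);		cancel it
 *	if (tcp_timer_active(tp, TT_PERSIST)) ...	query its state
 *
 * A delta of 0 maps to callout_stop(); any other value maps to
 * callout_reset_on() on the CPU chosen by inp_to_cpuid().
 */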