1 /*- 2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 30 */ 31 32 #include <sys/cdefs.h> 33 __FBSDID("$FreeBSD$"); 34 35 #include "opt_inet.h" 36 #include "opt_inet6.h" 37 #include "opt_tcpdebug.h" 38 #include "opt_rss.h" 39 40 #include <sys/param.h> 41 #include <sys/kernel.h> 42 #include <sys/lock.h> 43 #include <sys/mbuf.h> 44 #include <sys/mutex.h> 45 #include <sys/protosw.h> 46 #include <sys/smp.h> 47 #include <sys/socket.h> 48 #include <sys/socketvar.h> 49 #include <sys/sysctl.h> 50 #include <sys/systm.h> 51 52 #include <net/if.h> 53 #include <net/route.h> 54 #include <net/rss_config.h> 55 #include <net/vnet.h> 56 #include <net/netisr.h> 57 58 #include <netinet/in.h> 59 #include <netinet/in_kdtrace.h> 60 #include <netinet/in_pcb.h> 61 #include <netinet/in_rss.h> 62 #include <netinet/in_systm.h> 63 #ifdef INET6 64 #include <netinet6/in6_pcb.h> 65 #endif 66 #include <netinet/ip_var.h> 67 #include <netinet/tcp.h> 68 #include <netinet/tcp_fsm.h> 69 #include <netinet/tcp_timer.h> 70 #include <netinet/tcp_var.h> 71 #include <netinet/cc/cc.h> 72 #ifdef INET6 73 #include <netinet6/tcp6_var.h> 74 #endif 75 #include <netinet/tcpip.h> 76 #ifdef TCPDEBUG 77 #include <netinet/tcp_debug.h> 78 #endif 79 80 int tcp_persmin; 81 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW, 82 &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval"); 83 84 int tcp_persmax; 85 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW, 86 &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval"); 87 88 int tcp_keepinit; 89 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, 90 &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection"); 91 92 int tcp_keepidle; 93 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW, 94 &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin"); 95 96 int tcp_keepintvl; 97 SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW, 98 &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes"); 99 100 int tcp_delacktime; 101 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW, 102 &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", 103 "Time before a delayed ACK is sent"); 104 105 int tcp_msl; 106 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW, 107 &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime"); 108 109 int tcp_rexmit_min; 110 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW, 111 &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", 112 "Minimum Retransmission Timeout"); 113 114 int tcp_rexmit_slop; 115 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW, 116 &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", 117 "Retransmission Timer Slop"); 118 119 static int always_keepalive = 1; 120 SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW, 121 &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections"); 122 123 int tcp_fast_finwait2_recycle = 0; 124 SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, 125 &tcp_fast_finwait2_recycle, 0, 126 "Recycle closed FIN_WAIT_2 connections faster"); 127 128 int tcp_finwait2_timeout; 129 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW, 130 &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout"); 131 132 int tcp_keepcnt = TCPTV_KEEPCNT; 133 SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0, 134 "Number of keepalive probes to send"); 135 136 /* max idle probes */ 137 int tcp_maxpersistidle; 138 139 static int tcp_rexmit_drop_options = 0; 140 SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW, 141 &tcp_rexmit_drop_options, 0, 142 "Drop TCP options from 3rd and later retransmitted SYN"); 143 144 static VNET_DEFINE(int, tcp_pmtud_blackhole_detect); 145 #define V_tcp_pmtud_blackhole_detect VNET(tcp_pmtud_blackhole_detect) 146 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, 147 CTLFLAG_RW|CTLFLAG_VNET, 148 &VNET_NAME(tcp_pmtud_blackhole_detect), 0, 149 "Path MTU Discovery Black Hole Detection Enabled"); 150 151 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated); 152 #define V_tcp_pmtud_blackhole_activated \ 153 VNET(tcp_pmtud_blackhole_activated) 154 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated, 155 CTLFLAG_RD|CTLFLAG_VNET, 156 &VNET_NAME(tcp_pmtud_blackhole_activated), 0, 157 "Path MTU Discovery Black Hole Detection, Activation Count"); 158 159 static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss); 160 #define V_tcp_pmtud_blackhole_activated_min_mss \ 161 VNET(tcp_pmtud_blackhole_activated_min_mss) 162 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss, 163 CTLFLAG_RD|CTLFLAG_VNET, 164 &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0, 165 "Path MTU Discovery Black Hole Detection, Activation Count at min MSS"); 166 167 static VNET_DEFINE(int, tcp_pmtud_blackhole_failed); 168 #define V_tcp_pmtud_blackhole_failed VNET(tcp_pmtud_blackhole_failed) 169 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed, 170 CTLFLAG_RD|CTLFLAG_VNET, 171 &VNET_NAME(tcp_pmtud_blackhole_failed), 0, 172 "Path MTU Discovery Black Hole Detection, Failure Count"); 173 174 #ifdef INET 175 static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200; 176 #define V_tcp_pmtud_blackhole_mss VNET(tcp_pmtud_blackhole_mss) 177 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, 178 CTLFLAG_RW|CTLFLAG_VNET, 179 &VNET_NAME(tcp_pmtud_blackhole_mss), 0, 180 "Path MTU Discovery Black Hole Detection lowered MSS"); 181 #endif 182 183 #ifdef INET6 184 static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220; 185 #define V_tcp_v6pmtud_blackhole_mss VNET(tcp_v6pmtud_blackhole_mss) 186 SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss, 187 CTLFLAG_RW|CTLFLAG_VNET, 188 &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0, 189 "Path MTU Discovery IPv6 Black Hole Detection lowered MSS"); 190 #endif 191 192 #ifdef RSS 193 static int per_cpu_timers = 1; 194 #else 195 static int per_cpu_timers = 0; 196 #endif 197 SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW, 198 &per_cpu_timers , 0, "run tcp timers on all cpus"); 199 200 #if 0 201 #define INP_CPU(inp) (per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \ 202 ((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0) 203 #endif 204 205 /* 206 * Map the given inp to a CPU id. 207 * 208 * This queries RSS if it's compiled in, else it defaults to the current 209 * CPU ID. 210 */ 211 static inline int 212 inp_to_cpuid(struct inpcb *inp) 213 { 214 u_int cpuid; 215 216 #ifdef RSS 217 if (per_cpu_timers) { 218 cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); 219 if (cpuid == NETISR_CPUID_NONE) 220 return (curcpu); /* XXX */ 221 else 222 return (cpuid); 223 } 224 #else 225 /* Legacy, pre-RSS behaviour */ 226 if (per_cpu_timers) { 227 /* 228 * We don't have a flowid -> cpuid mapping, so cheat and 229 * just map unknown cpuids to curcpu. Not the best, but 230 * apparently better than defaulting to swi 0. 231 */ 232 cpuid = inp->inp_flowid % (mp_maxid + 1); 233 if (! CPU_ABSENT(cpuid)) 234 return (cpuid); 235 return (curcpu); 236 } 237 #endif 238 /* Default for RSS and non-RSS - cpuid 0 */ 239 else { 240 return (0); 241 } 242 } 243 244 /* 245 * Tcp protocol timeout routine called every 500 ms. 246 * Updates timestamps used for TCP 247 * causes finite state machine actions if timers expire. 248 */ 249 void 250 tcp_slowtimo(void) 251 { 252 VNET_ITERATOR_DECL(vnet_iter); 253 254 VNET_LIST_RLOCK_NOSLEEP(); 255 VNET_FOREACH(vnet_iter) { 256 CURVNET_SET(vnet_iter); 257 (void) tcp_tw_2msl_scan(0); 258 CURVNET_RESTORE(); 259 } 260 VNET_LIST_RUNLOCK_NOSLEEP(); 261 } 262 263 int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = 264 { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; 265 266 int tcp_backoff[TCP_MAXRXTSHIFT + 1] = 267 { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 }; 268 269 static int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */ 270 271 /* 272 * TCP timer processing. 273 */ 274 275 void 276 tcp_timer_delack(void *xtp) 277 { 278 struct tcpcb *tp = xtp; 279 struct inpcb *inp; 280 CURVNET_SET(tp->t_vnet); 281 282 inp = tp->t_inpcb; 283 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 284 INP_WLOCK(inp); 285 if (callout_pending(&tp->t_timers->tt_delack) || 286 !callout_active(&tp->t_timers->tt_delack)) { 287 INP_WUNLOCK(inp); 288 CURVNET_RESTORE(); 289 return; 290 } 291 callout_deactivate(&tp->t_timers->tt_delack); 292 if ((inp->inp_flags & INP_DROPPED) != 0) { 293 INP_WUNLOCK(inp); 294 CURVNET_RESTORE(); 295 return; 296 } 297 tp->t_flags |= TF_ACKNOW; 298 TCPSTAT_INC(tcps_delack); 299 (void) tp->t_fb->tfb_tcp_output(tp); 300 INP_WUNLOCK(inp); 301 CURVNET_RESTORE(); 302 } 303 304 /* 305 * When a timer wants to remove a TCB it must 306 * hold the INP_INFO_RLOCK(). The timer function 307 * should only have grabbed the INP_WLOCK() when 308 * it entered. To safely switch to holding both the 309 * INP_INFO_RLOCK() and the INP_WLOCK() we must first 310 * grab a reference on the inp, which will hold the inp 311 * so that it can't be removed. We then unlock the INP_WLOCK(), 312 * and grab the INP_INFO_RLOCK() lock. Once we have the INP_INFO_RLOCK() 313 * we proceed again to get the INP_WLOCK() (this preserves proper 314 * lock order). After acquiring the INP_WLOCK we must check if someone 315 * else deleted the pcb i.e. the inp_flags check. 316 * If so we return 1 otherwise we return 0. 317 * 318 * No matter what the tcp_inpinfo_lock_add() function 319 * returns the caller must afterwards call tcp_inpinfo_lock_del() 320 * to drop the locks and reference properly. 321 */ 322 323 int 324 tcp_inpinfo_lock_add(struct inpcb *inp) 325 { 326 in_pcbref(inp); 327 INP_WUNLOCK(inp); 328 INP_INFO_RLOCK(&V_tcbinfo); 329 INP_WLOCK(inp); 330 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 331 return(1); 332 } 333 return(0); 334 335 } 336 337 void 338 tcp_inpinfo_lock_del(struct inpcb *inp, struct tcpcb *tp) 339 { 340 INP_INFO_RUNLOCK(&V_tcbinfo); 341 if (inp && (tp == NULL)) { 342 /* 343 * If tcp_close/drop() gets called and tp 344 * returns NULL, then the function dropped 345 * the inp lock, we hold a reference keeping 346 * this around, so we must re-aquire the 347 * INP_WLOCK() in order to proceed with 348 * our dropping the inp reference. 349 */ 350 INP_WLOCK(inp); 351 } 352 if (inp && in_pcbrele_wlocked(inp) == 0) 353 INP_WUNLOCK(inp); 354 } 355 356 void 357 tcp_timer_2msl(void *xtp) 358 { 359 struct tcpcb *tp = xtp; 360 struct inpcb *inp; 361 CURVNET_SET(tp->t_vnet); 362 #ifdef TCPDEBUG 363 int ostate; 364 365 ostate = tp->t_state; 366 #endif 367 inp = tp->t_inpcb; 368 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 369 INP_WLOCK(inp); 370 tcp_free_sackholes(tp); 371 if (callout_pending(&tp->t_timers->tt_2msl) || 372 !callout_active(&tp->t_timers->tt_2msl)) { 373 INP_WUNLOCK(tp->t_inpcb); 374 CURVNET_RESTORE(); 375 return; 376 } 377 callout_deactivate(&tp->t_timers->tt_2msl); 378 if ((inp->inp_flags & INP_DROPPED) != 0) { 379 INP_WUNLOCK(inp); 380 CURVNET_RESTORE(); 381 return; 382 } 383 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 384 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 385 /* 386 * 2 MSL timeout in shutdown went off. If we're closed but 387 * still waiting for peer to close and connection has been idle 388 * too long delete connection control block. Otherwise, check 389 * again in a bit. 390 * 391 * If in TIME_WAIT state just ignore as this timeout is handled in 392 * tcp_tw_2msl_scan(). 393 * 394 * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed, 395 * there's no point in hanging onto FIN_WAIT_2 socket. Just close it. 396 * Ignore fact that there were recent incoming segments. 397 */ 398 if ((inp->inp_flags & INP_TIMEWAIT) != 0) { 399 INP_WUNLOCK(inp); 400 CURVNET_RESTORE(); 401 return; 402 } 403 if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 && 404 tp->t_inpcb && tp->t_inpcb->inp_socket && 405 (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { 406 TCPSTAT_INC(tcps_finwait2_drops); 407 if (tcp_inpinfo_lock_add(inp)) { 408 tcp_inpinfo_lock_del(inp, tp); 409 goto out; 410 } 411 tp = tcp_close(tp); 412 tcp_inpinfo_lock_del(inp, tp); 413 goto out; 414 } else { 415 if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) { 416 callout_reset(&tp->t_timers->tt_2msl, 417 TP_KEEPINTVL(tp), tcp_timer_2msl, tp); 418 } else { 419 if (tcp_inpinfo_lock_add(inp)) { 420 tcp_inpinfo_lock_del(inp, tp); 421 goto out; 422 } 423 tp = tcp_close(tp); 424 tcp_inpinfo_lock_del(inp, tp); 425 goto out; 426 } 427 } 428 429 #ifdef TCPDEBUG 430 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 431 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 432 PRU_SLOWTIMO); 433 #endif 434 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 435 436 if (tp != NULL) 437 INP_WUNLOCK(inp); 438 out: 439 CURVNET_RESTORE(); 440 } 441 442 void 443 tcp_timer_keep(void *xtp) 444 { 445 struct tcpcb *tp = xtp; 446 struct tcptemp *t_template; 447 struct inpcb *inp; 448 CURVNET_SET(tp->t_vnet); 449 #ifdef TCPDEBUG 450 int ostate; 451 452 ostate = tp->t_state; 453 #endif 454 inp = tp->t_inpcb; 455 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 456 INP_WLOCK(inp); 457 if (callout_pending(&tp->t_timers->tt_keep) || 458 !callout_active(&tp->t_timers->tt_keep)) { 459 INP_WUNLOCK(inp); 460 CURVNET_RESTORE(); 461 return; 462 } 463 callout_deactivate(&tp->t_timers->tt_keep); 464 if ((inp->inp_flags & INP_DROPPED) != 0) { 465 INP_WUNLOCK(inp); 466 CURVNET_RESTORE(); 467 return; 468 } 469 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 470 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 471 472 /* 473 * Because we don't regularly reset the keepalive callout in 474 * the ESTABLISHED state, it may be that we don't actually need 475 * to send a keepalive yet. If that occurs, schedule another 476 * call for the next time the keepalive timer might expire. 477 */ 478 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 479 u_int idletime; 480 481 idletime = ticks - tp->t_rcvtime; 482 if (idletime < TP_KEEPIDLE(tp)) { 483 callout_reset(&tp->t_timers->tt_keep, 484 TP_KEEPIDLE(tp) - idletime, tcp_timer_keep, tp); 485 INP_WUNLOCK(inp); 486 CURVNET_RESTORE(); 487 return; 488 } 489 } 490 491 /* 492 * Keep-alive timer went off; send something 493 * or drop connection if idle for too long. 494 */ 495 TCPSTAT_INC(tcps_keeptimeo); 496 if (tp->t_state < TCPS_ESTABLISHED) 497 goto dropit; 498 if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 499 tp->t_state <= TCPS_CLOSING) { 500 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 501 goto dropit; 502 /* 503 * Send a packet designed to force a response 504 * if the peer is up and reachable: 505 * either an ACK if the connection is still alive, 506 * or an RST if the peer has closed the connection 507 * due to timeout or reboot. 508 * Using sequence number tp->snd_una-1 509 * causes the transmitted zero-length segment 510 * to lie outside the receive window; 511 * by the protocol spec, this requires the 512 * correspondent TCP to respond. 513 */ 514 TCPSTAT_INC(tcps_keepprobe); 515 t_template = tcpip_maketemplate(inp); 516 if (t_template) { 517 tcp_respond(tp, t_template->tt_ipgen, 518 &t_template->tt_t, (struct mbuf *)NULL, 519 tp->rcv_nxt, tp->snd_una - 1, 0); 520 free(t_template, M_TEMP); 521 } 522 callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp), 523 tcp_timer_keep, tp); 524 } else 525 callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp), 526 tcp_timer_keep, tp); 527 528 #ifdef TCPDEBUG 529 if (inp->inp_socket->so_options & SO_DEBUG) 530 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 531 PRU_SLOWTIMO); 532 #endif 533 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 534 INP_WUNLOCK(inp); 535 CURVNET_RESTORE(); 536 return; 537 538 dropit: 539 TCPSTAT_INC(tcps_keepdrops); 540 541 if (tcp_inpinfo_lock_add(inp)) { 542 tcp_inpinfo_lock_del(inp, tp); 543 goto out; 544 } 545 tp = tcp_drop(tp, ETIMEDOUT); 546 547 #ifdef TCPDEBUG 548 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 549 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 550 PRU_SLOWTIMO); 551 #endif 552 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 553 tcp_inpinfo_lock_del(inp, tp); 554 out: 555 CURVNET_RESTORE(); 556 } 557 558 void 559 tcp_timer_persist(void *xtp) 560 { 561 struct tcpcb *tp = xtp; 562 struct inpcb *inp; 563 CURVNET_SET(tp->t_vnet); 564 #ifdef TCPDEBUG 565 int ostate; 566 567 ostate = tp->t_state; 568 #endif 569 inp = tp->t_inpcb; 570 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 571 INP_WLOCK(inp); 572 if (callout_pending(&tp->t_timers->tt_persist) || 573 !callout_active(&tp->t_timers->tt_persist)) { 574 INP_WUNLOCK(inp); 575 CURVNET_RESTORE(); 576 return; 577 } 578 callout_deactivate(&tp->t_timers->tt_persist); 579 if ((inp->inp_flags & INP_DROPPED) != 0) { 580 INP_WUNLOCK(inp); 581 CURVNET_RESTORE(); 582 return; 583 } 584 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 585 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 586 /* 587 * Persistence timer into zero window. 588 * Force a byte to be output, if possible. 589 */ 590 TCPSTAT_INC(tcps_persisttimeo); 591 /* 592 * Hack: if the peer is dead/unreachable, we do not 593 * time out if the window is closed. After a full 594 * backoff, drop the connection if the idle time 595 * (no responses to probes) reaches the maximum 596 * backoff that we would use if retransmitting. 597 */ 598 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 599 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 600 ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 601 TCPSTAT_INC(tcps_persistdrop); 602 if (tcp_inpinfo_lock_add(inp)) { 603 tcp_inpinfo_lock_del(inp, tp); 604 goto out; 605 } 606 tp = tcp_drop(tp, ETIMEDOUT); 607 tcp_inpinfo_lock_del(inp, tp); 608 goto out; 609 } 610 /* 611 * If the user has closed the socket then drop a persisting 612 * connection after a much reduced timeout. 613 */ 614 if (tp->t_state > TCPS_CLOSE_WAIT && 615 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 616 TCPSTAT_INC(tcps_persistdrop); 617 if (tcp_inpinfo_lock_add(inp)) { 618 tcp_inpinfo_lock_del(inp, tp); 619 goto out; 620 } 621 tp = tcp_drop(tp, ETIMEDOUT); 622 tcp_inpinfo_lock_del(inp, tp); 623 goto out; 624 } 625 tcp_setpersist(tp); 626 tp->t_flags |= TF_FORCEDATA; 627 (void) tp->t_fb->tfb_tcp_output(tp); 628 tp->t_flags &= ~TF_FORCEDATA; 629 630 #ifdef TCPDEBUG 631 if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG) 632 tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO); 633 #endif 634 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 635 INP_WUNLOCK(inp); 636 out: 637 CURVNET_RESTORE(); 638 } 639 640 void 641 tcp_timer_rexmt(void * xtp) 642 { 643 struct tcpcb *tp = xtp; 644 CURVNET_SET(tp->t_vnet); 645 int rexmt; 646 struct inpcb *inp; 647 #ifdef TCPDEBUG 648 int ostate; 649 650 ostate = tp->t_state; 651 #endif 652 inp = tp->t_inpcb; 653 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 654 INP_WLOCK(inp); 655 if (callout_pending(&tp->t_timers->tt_rexmt) || 656 !callout_active(&tp->t_timers->tt_rexmt)) { 657 INP_WUNLOCK(inp); 658 CURVNET_RESTORE(); 659 return; 660 } 661 callout_deactivate(&tp->t_timers->tt_rexmt); 662 if ((inp->inp_flags & INP_DROPPED) != 0) { 663 INP_WUNLOCK(inp); 664 CURVNET_RESTORE(); 665 return; 666 } 667 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0, 668 ("%s: tp %p tcpcb can't be stopped here", __func__, tp)); 669 tcp_free_sackholes(tp); 670 if (tp->t_fb->tfb_tcp_rexmit_tmr) { 671 /* The stack has a timer action too. */ 672 (*tp->t_fb->tfb_tcp_rexmit_tmr)(tp); 673 } 674 /* 675 * Retransmission timer went off. Message has not 676 * been acked within retransmit interval. Back off 677 * to a longer retransmit interval and retransmit one segment. 678 */ 679 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { 680 tp->t_rxtshift = TCP_MAXRXTSHIFT; 681 TCPSTAT_INC(tcps_timeoutdrop); 682 if (tcp_inpinfo_lock_add(inp)) { 683 tcp_inpinfo_lock_del(inp, tp); 684 goto out; 685 } 686 tp = tcp_drop(tp, tp->t_softerror ? 687 tp->t_softerror : ETIMEDOUT); 688 tcp_inpinfo_lock_del(inp, tp); 689 goto out; 690 } 691 if (tp->t_state == TCPS_SYN_SENT) { 692 /* 693 * If the SYN was retransmitted, indicate CWND to be 694 * limited to 1 segment in cc_conn_init(). 695 */ 696 tp->snd_cwnd = 1; 697 } else if (tp->t_rxtshift == 1) { 698 /* 699 * first retransmit; record ssthresh and cwnd so they can 700 * be recovered if this turns out to be a "bad" retransmit. 701 * A retransmit is considered "bad" if an ACK for this 702 * segment is received within RTT/2 interval; the assumption 703 * here is that the ACK was already in flight. See 704 * "On Estimating End-to-End Network Path Properties" by 705 * Allman and Paxson for more details. 706 */ 707 tp->snd_cwnd_prev = tp->snd_cwnd; 708 tp->snd_ssthresh_prev = tp->snd_ssthresh; 709 tp->snd_recover_prev = tp->snd_recover; 710 if (IN_FASTRECOVERY(tp->t_flags)) 711 tp->t_flags |= TF_WASFRECOVERY; 712 else 713 tp->t_flags &= ~TF_WASFRECOVERY; 714 if (IN_CONGRECOVERY(tp->t_flags)) 715 tp->t_flags |= TF_WASCRECOVERY; 716 else 717 tp->t_flags &= ~TF_WASCRECOVERY; 718 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 719 tp->t_flags |= TF_PREVVALID; 720 } else 721 tp->t_flags &= ~TF_PREVVALID; 722 TCPSTAT_INC(tcps_rexmttimeo); 723 if ((tp->t_state == TCPS_SYN_SENT) || 724 (tp->t_state == TCPS_SYN_RECEIVED)) 725 rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift]; 726 else 727 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 728 TCPT_RANGESET(tp->t_rxtcur, rexmt, 729 tp->t_rttmin, TCPTV_REXMTMAX); 730 731 /* 732 * We enter the path for PLMTUD if connection is established or, if 733 * connection is FIN_WAIT_1 status, reason for the last is that if 734 * amount of data we send is very small, we could send it in couple of 735 * packets and process straight to FIN. In that case we won't catch 736 * ESTABLISHED state. 737 */ 738 if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED)) 739 || (tp->t_state == TCPS_FIN_WAIT_1))) { 740 #ifdef INET6 741 int isipv6; 742 #endif 743 744 /* 745 * Idea here is that at each stage of mtu probe (usually, 1448 746 * -> 1188 -> 524) should be given 2 chances to recover before 747 * further clamping down. 'tp->t_rxtshift % 2 == 0' should 748 * take care of that. 749 */ 750 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) == 751 (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) && 752 (tp->t_rxtshift >= 2 && tp->t_rxtshift % 2 == 0)) { 753 /* 754 * Enter Path MTU Black-hole Detection mechanism: 755 * - Disable Path MTU Discovery (IP "DF" bit). 756 * - Reduce MTU to lower value than what we 757 * negotiated with peer. 758 */ 759 /* Record that we may have found a black hole. */ 760 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 761 762 /* Keep track of previous MSS. */ 763 tp->t_pmtud_saved_maxseg = tp->t_maxseg; 764 765 /* 766 * Reduce the MSS to blackhole value or to the default 767 * in an attempt to retransmit. 768 */ 769 #ifdef INET6 770 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0; 771 if (isipv6 && 772 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { 773 /* Use the sysctl tuneable blackhole MSS. */ 774 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; 775 V_tcp_pmtud_blackhole_activated++; 776 } else if (isipv6) { 777 /* Use the default MSS. */ 778 tp->t_maxseg = V_tcp_v6mssdflt; 779 /* 780 * Disable Path MTU Discovery when we switch to 781 * minmss. 782 */ 783 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 784 V_tcp_pmtud_blackhole_activated_min_mss++; 785 } 786 #endif 787 #if defined(INET6) && defined(INET) 788 else 789 #endif 790 #ifdef INET 791 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { 792 /* Use the sysctl tuneable blackhole MSS. */ 793 tp->t_maxseg = V_tcp_pmtud_blackhole_mss; 794 V_tcp_pmtud_blackhole_activated++; 795 } else { 796 /* Use the default MSS. */ 797 tp->t_maxseg = V_tcp_mssdflt; 798 /* 799 * Disable Path MTU Discovery when we switch to 800 * minmss. 801 */ 802 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 803 V_tcp_pmtud_blackhole_activated_min_mss++; 804 } 805 #endif 806 /* 807 * Reset the slow-start flight size 808 * as it may depend on the new MSS. 809 */ 810 if (CC_ALGO(tp)->conn_init != NULL) 811 CC_ALGO(tp)->conn_init(tp->ccv); 812 } else { 813 /* 814 * If further retransmissions are still unsuccessful 815 * with a lowered MTU, maybe this isn't a blackhole and 816 * we restore the previous MSS and blackhole detection 817 * flags. 818 * The limit '6' is determined by giving each probe 819 * stage (1448, 1188, 524) 2 chances to recover. 820 */ 821 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 822 (tp->t_rxtshift > 6)) { 823 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 824 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 825 tp->t_maxseg = tp->t_pmtud_saved_maxseg; 826 V_tcp_pmtud_blackhole_failed++; 827 /* 828 * Reset the slow-start flight size as it 829 * may depend on the new MSS. 830 */ 831 if (CC_ALGO(tp)->conn_init != NULL) 832 CC_ALGO(tp)->conn_init(tp->ccv); 833 } 834 } 835 } 836 837 /* 838 * Disable RFC1323 and SACK if we haven't got any response to 839 * our third SYN to work-around some broken terminal servers 840 * (most of which have hopefully been retired) that have bad VJ 841 * header compression code which trashes TCP segments containing 842 * unknown-to-them TCP options. 843 */ 844 if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && 845 (tp->t_rxtshift == 3)) 846 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); 847 /* 848 * If we backed off this far, our srtt estimate is probably bogus. 849 * Clobber it so we'll take the next rtt measurement as our srtt; 850 * move the current srtt into rttvar to keep the current 851 * retransmit times until then. 852 */ 853 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 854 #ifdef INET6 855 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 856 in6_losing(tp->t_inpcb); 857 else 858 #endif 859 in_losing(tp->t_inpcb); 860 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 861 tp->t_srtt = 0; 862 } 863 tp->snd_nxt = tp->snd_una; 864 tp->snd_recover = tp->snd_max; 865 /* 866 * Force a segment to be sent. 867 */ 868 tp->t_flags |= TF_ACKNOW; 869 /* 870 * If timing a segment in this window, stop the timer. 871 */ 872 tp->t_rtttime = 0; 873 874 cc_cong_signal(tp, NULL, CC_RTO); 875 876 (void) tp->t_fb->tfb_tcp_output(tp); 877 878 #ifdef TCPDEBUG 879 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) 880 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0, 881 PRU_SLOWTIMO); 882 #endif 883 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 884 INP_WUNLOCK(inp); 885 out: 886 CURVNET_RESTORE(); 887 } 888 889 void 890 tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta) 891 { 892 struct callout *t_callout; 893 timeout_t *f_callout; 894 struct inpcb *inp = tp->t_inpcb; 895 int cpu = inp_to_cpuid(inp); 896 897 #ifdef TCP_OFFLOAD 898 if (tp->t_flags & TF_TOE) 899 return; 900 #endif 901 902 if (tp->t_timers->tt_flags & TT_STOPPED) 903 return; 904 905 switch (timer_type) { 906 case TT_DELACK: 907 t_callout = &tp->t_timers->tt_delack; 908 f_callout = tcp_timer_delack; 909 break; 910 case TT_REXMT: 911 t_callout = &tp->t_timers->tt_rexmt; 912 f_callout = tcp_timer_rexmt; 913 break; 914 case TT_PERSIST: 915 t_callout = &tp->t_timers->tt_persist; 916 f_callout = tcp_timer_persist; 917 break; 918 case TT_KEEP: 919 t_callout = &tp->t_timers->tt_keep; 920 f_callout = tcp_timer_keep; 921 break; 922 case TT_2MSL: 923 t_callout = &tp->t_timers->tt_2msl; 924 f_callout = tcp_timer_2msl; 925 break; 926 default: 927 if (tp->t_fb->tfb_tcp_timer_activate) { 928 tp->t_fb->tfb_tcp_timer_activate(tp, timer_type, delta); 929 return; 930 } 931 panic("tp %p bad timer_type %#x", tp, timer_type); 932 } 933 if (delta == 0) { 934 callout_stop(t_callout); 935 } else { 936 callout_reset_on(t_callout, delta, f_callout, tp, cpu); 937 } 938 } 939 940 int 941 tcp_timer_active(struct tcpcb *tp, uint32_t timer_type) 942 { 943 struct callout *t_callout; 944 945 switch (timer_type) { 946 case TT_DELACK: 947 t_callout = &tp->t_timers->tt_delack; 948 break; 949 case TT_REXMT: 950 t_callout = &tp->t_timers->tt_rexmt; 951 break; 952 case TT_PERSIST: 953 t_callout = &tp->t_timers->tt_persist; 954 break; 955 case TT_KEEP: 956 t_callout = &tp->t_timers->tt_keep; 957 break; 958 case TT_2MSL: 959 t_callout = &tp->t_timers->tt_2msl; 960 break; 961 default: 962 if (tp->t_fb->tfb_tcp_timer_active) { 963 return(tp->t_fb->tfb_tcp_timer_active(tp, timer_type)); 964 } 965 panic("tp %p bad timer_type %#x", tp, timer_type); 966 } 967 return callout_active(t_callout); 968 } 969 970 void 971 tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type) 972 { 973 struct callout *t_callout; 974 975 tp->t_timers->tt_flags |= TT_STOPPED; 976 switch (timer_type) { 977 case TT_DELACK: 978 t_callout = &tp->t_timers->tt_delack; 979 break; 980 case TT_REXMT: 981 t_callout = &tp->t_timers->tt_rexmt; 982 break; 983 case TT_PERSIST: 984 t_callout = &tp->t_timers->tt_persist; 985 break; 986 case TT_KEEP: 987 t_callout = &tp->t_timers->tt_keep; 988 break; 989 case TT_2MSL: 990 t_callout = &tp->t_timers->tt_2msl; 991 break; 992 default: 993 if (tp->t_fb->tfb_tcp_timer_stop) { 994 /* 995 * XXXrrs we need to look at this with the 996 * stop case below (flags). 997 */ 998 tp->t_fb->tfb_tcp_timer_stop(tp, timer_type); 999 return; 1000 } 1001 panic("tp %p bad timer_type %#x", tp, timer_type); 1002 } 1003 1004 if (callout_async_drain(t_callout, tcp_timer_discard) == 0) { 1005 /* 1006 * Can't stop the callout, defer tcpcb actual deletion 1007 * to the last one. We do this using the async drain 1008 * function and incrementing the count in 1009 */ 1010 tp->t_timers->tt_draincnt++; 1011 } 1012 } 1013 1014 #define ticks_to_msecs(t) (1000*(t) / hz) 1015 1016 void 1017 tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer, 1018 struct xtcp_timer *xtimer) 1019 { 1020 sbintime_t now; 1021 1022 bzero(xtimer, sizeof(*xtimer)); 1023 if (timer == NULL) 1024 return; 1025 now = getsbinuptime(); 1026 if (callout_active(&timer->tt_delack)) 1027 xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS; 1028 if (callout_active(&timer->tt_rexmt)) 1029 xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS; 1030 if (callout_active(&timer->tt_persist)) 1031 xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS; 1032 if (callout_active(&timer->tt_keep)) 1033 xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS; 1034 if (callout_active(&timer->tt_2msl)) 1035 xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS; 1036 xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime); 1037 } 1038