1c398230bSWarner Losh /*- 251369649SPedro F. Giffuni * SPDX-License-Identifier: BSD-3-Clause 351369649SPedro F. Giffuni * 4e79adb8eSGarrett Wollman * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 5df8bae1dSRodney W. Grimes * The Regents of the University of California. All rights reserved. 6df8bae1dSRodney W. Grimes * 7df8bae1dSRodney W. Grimes * Redistribution and use in source and binary forms, with or without 8df8bae1dSRodney W. Grimes * modification, are permitted provided that the following conditions 9df8bae1dSRodney W. Grimes * are met: 10df8bae1dSRodney W. Grimes * 1. Redistributions of source code must retain the above copyright 11df8bae1dSRodney W. Grimes * notice, this list of conditions and the following disclaimer. 12df8bae1dSRodney W. Grimes * 2. Redistributions in binary form must reproduce the above copyright 13df8bae1dSRodney W. Grimes * notice, this list of conditions and the following disclaimer in the 14df8bae1dSRodney W. Grimes * documentation and/or other materials provided with the distribution. 15fbbd9655SWarner Losh * 3. Neither the name of the University nor the names of its contributors 16df8bae1dSRodney W. Grimes * may be used to endorse or promote products derived from this software 17df8bae1dSRodney W. Grimes * without specific prior written permission. 18df8bae1dSRodney W. Grimes * 19df8bae1dSRodney W. Grimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20df8bae1dSRodney W. Grimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21df8bae1dSRodney W. Grimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22df8bae1dSRodney W. Grimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23df8bae1dSRodney W. Grimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24df8bae1dSRodney W. Grimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25df8bae1dSRodney W. Grimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26df8bae1dSRodney W. Grimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27df8bae1dSRodney W. Grimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28df8bae1dSRodney W. Grimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29df8bae1dSRodney W. Grimes * SUCH DAMAGE. 30df8bae1dSRodney W. Grimes * 31e79adb8eSGarrett Wollman * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95 32df8bae1dSRodney W. Grimes */ 33df8bae1dSRodney W. Grimes 344b421e2dSMike Silbersack #include <sys/cdefs.h> 354b421e2dSMike Silbersack __FBSDID("$FreeBSD$"); 364b421e2dSMike Silbersack 37825fd1e4SNavdeep Parhar #include "opt_inet.h" 38fb59c426SYoshinobu Inoue #include "opt_inet6.h" 39883831c6SAdrian Chadd #include "opt_rss.h" 400cc12cc5SJoerg Wunsch 41df8bae1dSRodney W. Grimes #include <sys/param.h> 4298163b98SPoul-Henning Kamp #include <sys/kernel.h> 43c74af4faSBruce Evans #include <sys/lock.h> 4408517d53SMike Silbersack #include <sys/mbuf.h> 45c74af4faSBruce Evans #include <sys/mutex.h> 46c74af4faSBruce Evans #include <sys/protosw.h> 4787aedea4SKip Macy #include <sys/smp.h> 48df8bae1dSRodney W. Grimes #include <sys/socket.h> 49df8bae1dSRodney W. Grimes #include <sys/socketvar.h> 50c74af4faSBruce Evans #include <sys/sysctl.h> 51c74af4faSBruce Evans #include <sys/systm.h> 52e79adb8eSGarrett Wollman 534b79449eSBjoern A. Zeeb #include <net/if.h> 54df8bae1dSRodney W. Grimes #include <net/route.h> 55b2bdc62aSAdrian Chadd #include <net/rss_config.h> 56530c0060SRobert Watson #include <net/vnet.h> 57883831c6SAdrian Chadd #include <net/netisr.h> 58df8bae1dSRodney W. Grimes 59df8bae1dSRodney W. Grimes #include <netinet/in.h> 605d06879aSGeorge V. Neville-Neil #include <netinet/in_kdtrace.h> 61df8bae1dSRodney W. Grimes #include <netinet/in_pcb.h> 62883831c6SAdrian Chadd #include <netinet/in_rss.h> 63c74af4faSBruce Evans #include <netinet/in_systm.h> 64fb59c426SYoshinobu Inoue #ifdef INET6 65fb59c426SYoshinobu Inoue #include <netinet6/in6_pcb.h> 66fb59c426SYoshinobu Inoue #endif 67df8bae1dSRodney W. Grimes #include <netinet/ip_var.h> 682de3e790SGleb Smirnoff #include <netinet/tcp.h> 69df8bae1dSRodney W. Grimes #include <netinet/tcp_fsm.h> 702529f56eSJonathan T. Looney #include <netinet/tcp_log_buf.h> 71df8bae1dSRodney W. Grimes #include <netinet/tcp_timer.h> 72df8bae1dSRodney W. Grimes #include <netinet/tcp_var.h> 7389e560f4SRandall Stewart #include <netinet/tcp_seq.h> 744644fda3SGleb Smirnoff #include <netinet/cc/cc.h> 75f6f6703fSSean Bruno #ifdef INET6 76f6f6703fSSean Bruno #include <netinet6/tcp6_var.h> 77f6f6703fSSean Bruno #endif 78df8bae1dSRodney W. Grimes #include <netinet/tcpip.h> 79df8bae1dSRodney W. Grimes 800645c604SHiren Panchasara int tcp_persmin; 817029da5cSPawel Biernacki SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, 827029da5cSPawel Biernacki CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 837029da5cSPawel Biernacki &tcp_persmin, 0, sysctl_msec_to_ticks, "I", 847029da5cSPawel Biernacki "minimum persistence interval"); 850645c604SHiren Panchasara 860645c604SHiren Panchasara int tcp_persmax; 877029da5cSPawel Biernacki SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, 887029da5cSPawel Biernacki CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 897029da5cSPawel Biernacki &tcp_persmax, 0, sysctl_msec_to_ticks, "I", 907029da5cSPawel Biernacki "maximum persistence interval"); 910645c604SHiren Panchasara 929b8b58e0SJonathan Lemon int tcp_keepinit; 937029da5cSPawel Biernacki SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, 947029da5cSPawel Biernacki CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 957029da5cSPawel Biernacki &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", 967029da5cSPawel Biernacki "time to establish connection"); 977b40aa32SPaul Traina 989b8b58e0SJonathan Lemon int tcp_keepidle; 997029da5cSPawel Biernacki SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, 1007029da5cSPawel Biernacki CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 1017029da5cSPawel Biernacki &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", 1027029da5cSPawel Biernacki "time before keepalive probes begin"); 10398163b98SPoul-Henning Kamp 1049b8b58e0SJonathan Lemon int tcp_keepintvl; 1057029da5cSPawel Biernacki SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, 1067029da5cSPawel Biernacki CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 1077029da5cSPawel Biernacki &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", 1087029da5cSPawel Biernacki "time between keepalive probes"); 10998163b98SPoul-Henning Kamp 1109b8b58e0SJonathan Lemon int tcp_delacktime; 1117029da5cSPawel Biernacki SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, 1127029da5cSPawel Biernacki CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 1136489fe65SAndre Oppermann &tcp_delacktime, 0, sysctl_msec_to_ticks, "I", 114ccb4d0c6SJonathan Lemon "Time before a delayed ACK is sent"); 1159b8b58e0SJonathan Lemon 116c2c8e360SAlexander V. Chernikov VNET_DEFINE(int, tcp_msl); 1177029da5cSPawel Biernacki SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, 118c2c8e360SAlexander V. Chernikov CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_VNET, 119c2c8e360SAlexander V. Chernikov &VNET_NAME(tcp_msl), 0, sysctl_msec_to_ticks, "I", 1207029da5cSPawel Biernacki "Maximum segment lifetime"); 1219b8b58e0SJonathan Lemon 1220999766dSMichael Tuexen int tcp_rexmit_initial; 1237029da5cSPawel Biernacki SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_initial, 1247029da5cSPawel Biernacki CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 1250999766dSMichael Tuexen &tcp_rexmit_initial, 0, sysctl_msec_to_ticks, "I", 1260999766dSMichael Tuexen "Initial Retransmission Timeout"); 1270999766dSMichael Tuexen 128701bec5aSMatthew Dillon int tcp_rexmit_min; 1297029da5cSPawel Biernacki SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, 1307029da5cSPawel Biernacki CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 1316489fe65SAndre Oppermann &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I", 1326489fe65SAndre Oppermann "Minimum Retransmission Timeout"); 133701bec5aSMatthew Dillon 134701bec5aSMatthew Dillon int tcp_rexmit_slop; 1357029da5cSPawel Biernacki SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, 1367029da5cSPawel Biernacki CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 1376489fe65SAndre Oppermann &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I", 1386489fe65SAndre Oppermann "Retransmission Timer Slop"); 139701bec5aSMatthew Dillon 140334fc582SBjoern A. Zeeb VNET_DEFINE(int, tcp_always_keepalive) = 1; 141334fc582SBjoern A. Zeeb SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_VNET|CTLFLAG_RW, 142334fc582SBjoern A. Zeeb &VNET_NAME(tcp_always_keepalive) , 0, 143334fc582SBjoern A. Zeeb "Assume SO_KEEPALIVE on all TCP connections"); 14434be9bf3SPoul-Henning Kamp 1457c72af87SMohan Srinivasan int tcp_fast_finwait2_recycle = 0; 1467c72af87SMohan Srinivasan SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW, 1476489fe65SAndre Oppermann &tcp_fast_finwait2_recycle, 0, 1486489fe65SAndre Oppermann "Recycle closed FIN_WAIT_2 connections faster"); 1497c72af87SMohan Srinivasan 1507c72af87SMohan Srinivasan int tcp_finwait2_timeout; 1517029da5cSPawel Biernacki SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, 1527029da5cSPawel Biernacki CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 1537029da5cSPawel Biernacki &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", 1547029da5cSPawel Biernacki "FIN-WAIT2 timeout"); 1557c72af87SMohan Srinivasan 1569077f387SGleb Smirnoff int tcp_keepcnt = TCPTV_KEEPCNT; 1579077f387SGleb Smirnoff SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0, 1589077f387SGleb Smirnoff "Number of keepalive probes to send"); 1597c72af87SMohan Srinivasan 1600312fbe9SPoul-Henning Kamp /* max idle probes */ 1619b8b58e0SJonathan Lemon int tcp_maxpersistidle; 162e79adb8eSGarrett Wollman 16389e560f4SRandall Stewart int tcp_rexmit_drop_options = 0; 1646c0ef895SJohn Baldwin SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW, 1656c0ef895SJohn Baldwin &tcp_rexmit_drop_options, 0, 1666c0ef895SJohn Baldwin "Drop TCP options from 3rd and later retransmitted SYN"); 1676c0ef895SJohn Baldwin 16808af8aacSRandall Stewart int tcp_maxunacktime = TCPTV_MAXUNACKTIME; 16908af8aacSRandall Stewart SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxunacktime, 17008af8aacSRandall Stewart CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_NEEDGIANT, 17108af8aacSRandall Stewart &tcp_maxunacktime, 0, sysctl_msec_to_ticks, "I", 17208af8aacSRandall Stewart "Maximum time (in ms) that a session can linger without making progress"); 17308af8aacSRandall Stewart 174e29c55e4SGleb Smirnoff VNET_DEFINE(int, tcp_pmtud_blackhole_detect); 175f6f6703fSSean Bruno SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, 176f0188618SHans Petter Selasky CTLFLAG_RW|CTLFLAG_VNET, 177f6f6703fSSean Bruno &VNET_NAME(tcp_pmtud_blackhole_detect), 0, 178f6f6703fSSean Bruno "Path MTU Discovery Black Hole Detection Enabled"); 179f6f6703fSSean Bruno 180f6f6703fSSean Bruno #ifdef INET 181e29c55e4SGleb Smirnoff VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200; 182f6f6703fSSean Bruno SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss, 183f0188618SHans Petter Selasky CTLFLAG_RW|CTLFLAG_VNET, 184f6f6703fSSean Bruno &VNET_NAME(tcp_pmtud_blackhole_mss), 0, 185f6f6703fSSean Bruno "Path MTU Discovery Black Hole Detection lowered MSS"); 186f6f6703fSSean Bruno #endif 187f6f6703fSSean Bruno 188f6f6703fSSean Bruno #ifdef INET6 189e29c55e4SGleb Smirnoff VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220; 190f6f6703fSSean Bruno SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss, 191f0188618SHans Petter Selasky CTLFLAG_RW|CTLFLAG_VNET, 192f6f6703fSSean Bruno &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0, 193f6f6703fSSean Bruno "Path MTU Discovery IPv6 Black Hole Detection lowered MSS"); 194f6f6703fSSean Bruno #endif 195f6f6703fSSean Bruno 1968f7e75cbSAdrian Chadd #ifdef RSS 1978f7e75cbSAdrian Chadd static int per_cpu_timers = 1; 1988f7e75cbSAdrian Chadd #else 19987aedea4SKip Macy static int per_cpu_timers = 0; 2008f7e75cbSAdrian Chadd #endif 20187aedea4SKip Macy SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW, 20287aedea4SKip Macy &per_cpu_timers , 0, "run tcp timers on all cpus"); 20387aedea4SKip Macy 204883831c6SAdrian Chadd /* 205883831c6SAdrian Chadd * Map the given inp to a CPU id. 206883831c6SAdrian Chadd * 207883831c6SAdrian Chadd * This queries RSS if it's compiled in, else it defaults to the current 208883831c6SAdrian Chadd * CPU ID. 209883831c6SAdrian Chadd */ 21089e560f4SRandall Stewart inline int 211883831c6SAdrian Chadd inp_to_cpuid(struct inpcb *inp) 212883831c6SAdrian Chadd { 213883831c6SAdrian Chadd u_int cpuid; 214883831c6SAdrian Chadd 215883831c6SAdrian Chadd if (per_cpu_timers) { 21647ded797SFranco Fichtner #ifdef RSS 217883831c6SAdrian Chadd cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); 218883831c6SAdrian Chadd if (cpuid == NETISR_CPUID_NONE) 219883831c6SAdrian Chadd return (curcpu); /* XXX */ 220883831c6SAdrian Chadd else 221883831c6SAdrian Chadd return (cpuid); 22247ded797SFranco Fichtner #endif 223883831c6SAdrian Chadd /* 224883831c6SAdrian Chadd * We don't have a flowid -> cpuid mapping, so cheat and 225883831c6SAdrian Chadd * just map unknown cpuids to curcpu. Not the best, but 226883831c6SAdrian Chadd * apparently better than defaulting to swi 0. 227883831c6SAdrian Chadd */ 228883831c6SAdrian Chadd cpuid = inp->inp_flowid % (mp_maxid + 1); 229883831c6SAdrian Chadd if (! CPU_ABSENT(cpuid)) 230883831c6SAdrian Chadd return (cpuid); 231883831c6SAdrian Chadd return (curcpu); 23247ded797SFranco Fichtner } else { 233883831c6SAdrian Chadd return (0); 234883831c6SAdrian Chadd } 235883831c6SAdrian Chadd } 23687aedea4SKip Macy 237df8bae1dSRodney W. Grimes int tcp_backoff[TCP_MAXRXTSHIFT + 1] = 238f058535dSJeffrey Hsu { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 }; 239df8bae1dSRodney W. Grimes 24089e560f4SRandall Stewart int tcp_totbackoff = 2559; /* sum of tcp_backoff[] */ 241e79adb8eSGarrett Wollman 242df8bae1dSRodney W. Grimes /* 243df8bae1dSRodney W. Grimes * TCP timer processing. 244446ccdd0SGleb Smirnoff * 245446ccdd0SGleb Smirnoff * Each connection has 5 timers associated with it, which can be scheduled 246446ccdd0SGleb Smirnoff * simultaneously. They all are serviced by one callout tcp_timer_enter(). 247446ccdd0SGleb Smirnoff * This function executes the next timer via tcp_timersw[] vector. Each 248446ccdd0SGleb Smirnoff * timer is supposed to return 'true' unless the connection was destroyed. 249446ccdd0SGleb Smirnoff * In the former case tcp_timer_enter() will schedule callout for next timer. 250df8bae1dSRodney W. Grimes */ 25185d94372SRobert Watson 252446ccdd0SGleb Smirnoff typedef bool tcp_timer_t(struct tcpcb *); 253446ccdd0SGleb Smirnoff static tcp_timer_t tcp_timer_delack; 254446ccdd0SGleb Smirnoff static tcp_timer_t tcp_timer_2msl; 255446ccdd0SGleb Smirnoff static tcp_timer_t tcp_timer_keep; 256446ccdd0SGleb Smirnoff static tcp_timer_t tcp_timer_persist; 257446ccdd0SGleb Smirnoff static tcp_timer_t tcp_timer_rexmt; 258446ccdd0SGleb Smirnoff 259446ccdd0SGleb Smirnoff static tcp_timer_t * const tcp_timersw[TT_N] = { 260446ccdd0SGleb Smirnoff [TT_DELACK] = tcp_timer_delack, 261446ccdd0SGleb Smirnoff [TT_REXMT] = tcp_timer_rexmt, 262446ccdd0SGleb Smirnoff [TT_PERSIST] = tcp_timer_persist, 263446ccdd0SGleb Smirnoff [TT_KEEP] = tcp_timer_keep, 264446ccdd0SGleb Smirnoff [TT_2MSL] = tcp_timer_2msl, 265446ccdd0SGleb Smirnoff }; 266446ccdd0SGleb Smirnoff 267446ccdd0SGleb Smirnoff /* 268446ccdd0SGleb Smirnoff * tcp_output_locked() s a timer specific variation of call to tcp_output(), 269446ccdd0SGleb Smirnoff * see tcp_var.h for the rest. It handles drop request from advanced stacks, 270446ccdd0SGleb Smirnoff * but keeps tcpcb locked unless tcp_drop() destroyed it. 271446ccdd0SGleb Smirnoff * Returns true if tcpcb is valid and locked. 272446ccdd0SGleb Smirnoff */ 273446ccdd0SGleb Smirnoff static inline bool 274446ccdd0SGleb Smirnoff tcp_output_locked(struct tcpcb *tp) 275446ccdd0SGleb Smirnoff { 276446ccdd0SGleb Smirnoff int rv; 277446ccdd0SGleb Smirnoff 278446ccdd0SGleb Smirnoff INP_WLOCK_ASSERT(tptoinpcb(tp)); 279446ccdd0SGleb Smirnoff 280446ccdd0SGleb Smirnoff if ((rv = tp->t_fb->tfb_tcp_output(tp)) < 0) { 281446ccdd0SGleb Smirnoff KASSERT(tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP, 282446ccdd0SGleb Smirnoff ("TCP stack %s requested tcp_drop(%p)", 283446ccdd0SGleb Smirnoff tp->t_fb->tfb_tcp_block_name, tp)); 284446ccdd0SGleb Smirnoff tp = tcp_drop(tp, rv); 285446ccdd0SGleb Smirnoff } 286446ccdd0SGleb Smirnoff 287446ccdd0SGleb Smirnoff return (tp != NULL); 288446ccdd0SGleb Smirnoff } 289446ccdd0SGleb Smirnoff 290446ccdd0SGleb Smirnoff static bool 291446ccdd0SGleb Smirnoff tcp_timer_delack(struct tcpcb *tp) 292df8bae1dSRodney W. Grimes { 293109eb549SGleb Smirnoff struct epoch_tracker et; 294446ccdd0SGleb Smirnoff #if defined(INVARIANTS) || defined(VIMAGE) 2959eb0e832SGleb Smirnoff struct inpcb *inp = tptoinpcb(tp); 296446ccdd0SGleb Smirnoff #endif 297446ccdd0SGleb Smirnoff bool rv; 2989eb0e832SGleb Smirnoff 299446ccdd0SGleb Smirnoff INP_WLOCK_ASSERT(inp); 300446ccdd0SGleb Smirnoff 3018840ae22SGleb Smirnoff CURVNET_SET(inp->inp_vnet); 3029b8b58e0SJonathan Lemon tp->t_flags |= TF_ACKNOW; 30378b50714SRobert Watson TCPSTAT_INC(tcps_delack); 304109eb549SGleb Smirnoff NET_EPOCH_ENTER(et); 305446ccdd0SGleb Smirnoff rv = tcp_output_locked(tp); 306109eb549SGleb Smirnoff NET_EPOCH_EXIT(et); 3078b615593SMarko Zec CURVNET_RESTORE(); 308446ccdd0SGleb Smirnoff 309446ccdd0SGleb Smirnoff return (rv); 3109b8b58e0SJonathan Lemon } 3119b8b58e0SJonathan Lemon 312446ccdd0SGleb Smirnoff static bool 313446ccdd0SGleb Smirnoff tcp_timer_2msl(struct tcpcb *tp) 314b07fef50SRandall Stewart { 3159eb0e832SGleb Smirnoff struct inpcb *inp = tptoinpcb(tp); 316446ccdd0SGleb Smirnoff bool close = false; 31777198a94SGleb Smirnoff 31877198a94SGleb Smirnoff INP_WLOCK_ASSERT(inp); 31977198a94SGleb Smirnoff 320446ccdd0SGleb Smirnoff TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 3218840ae22SGleb Smirnoff CURVNET_SET(inp->inp_vnet); 322d1b07f36SRandall Stewart tcp_log_end_status(tp, TCP_EI_STATUS_2MSL); 32385d94372SRobert Watson tcp_free_sackholes(tp); 32485d94372SRobert Watson /* 325df8bae1dSRodney W. Grimes * 2 MSL timeout in shutdown went off. If we're closed but 326df8bae1dSRodney W. Grimes * still waiting for peer to close and connection has been idle 32731a7749dSJulien Charbon * too long delete connection control block. Otherwise, check 32831a7749dSJulien Charbon * again in a bit. 32931a7749dSJulien Charbon * 3307c72af87SMohan Srinivasan * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed, 3317c72af87SMohan Srinivasan * there's no point in hanging onto FIN_WAIT_2 socket. Just close it. 3327c72af87SMohan Srinivasan * Ignore fact that there were recent incoming segments. 333f71cb9f7SGleb Smirnoff * 334f71cb9f7SGleb Smirnoff * XXXGL: check if inp_socket shall always be !NULL here? 335df8bae1dSRodney W. Grimes */ 3360d744519SGleb Smirnoff if (tp->t_state == TCPS_TIME_WAIT) { 337446ccdd0SGleb Smirnoff close = true; 3380d744519SGleb Smirnoff } else if (tp->t_state == TCPS_FIN_WAIT_2 && 3399eb0e832SGleb Smirnoff tcp_fast_finwait2_recycle && inp->inp_socket && 3409eb0e832SGleb Smirnoff (inp->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { 34178b50714SRobert Watson TCPSTAT_INC(tcps_finwait2_drops); 342446ccdd0SGleb Smirnoff close = true; 3437c72af87SMohan Srinivasan } else { 344446ccdd0SGleb Smirnoff if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) 345446ccdd0SGleb Smirnoff tcp_timer_activate(tp, TT_2MSL, TP_KEEPINTVL(tp)); 346446ccdd0SGleb Smirnoff else 347446ccdd0SGleb Smirnoff close = true; 348446ccdd0SGleb Smirnoff } 349446ccdd0SGleb Smirnoff if (close) { 350446ccdd0SGleb Smirnoff struct epoch_tracker et; 351446ccdd0SGleb Smirnoff 352446ccdd0SGleb Smirnoff NET_EPOCH_ENTER(et); 353446ccdd0SGleb Smirnoff tp = tcp_close(tp); 354446ccdd0SGleb Smirnoff NET_EPOCH_EXIT(et); 355446ccdd0SGleb Smirnoff } 35677198a94SGleb Smirnoff CURVNET_RESTORE(); 357446ccdd0SGleb Smirnoff 358446ccdd0SGleb Smirnoff return (tp != NULL); 3597c72af87SMohan Srinivasan } 360df8bae1dSRodney W. Grimes 361446ccdd0SGleb Smirnoff static bool 362446ccdd0SGleb Smirnoff tcp_timer_keep(struct tcpcb *tp) 3639b8b58e0SJonathan Lemon { 3646573d758SMatt Macy struct epoch_tracker et; 3659eb0e832SGleb Smirnoff struct inpcb *inp = tptoinpcb(tp); 3669eb0e832SGleb Smirnoff struct tcptemp *t_template; 3679b8b58e0SJonathan Lemon 368446ccdd0SGleb Smirnoff INP_WLOCK_ASSERT(inp); 3699eb0e832SGleb Smirnoff 370446ccdd0SGleb Smirnoff TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 3718840ae22SGleb Smirnoff CURVNET_SET(inp->inp_vnet); 3726d172f58SJonathan T. Looney /* 3736d172f58SJonathan T. Looney * Because we don't regularly reset the keepalive callout in 3746d172f58SJonathan T. Looney * the ESTABLISHED state, it may be that we don't actually need 3756d172f58SJonathan T. Looney * to send a keepalive yet. If that occurs, schedule another 3766d172f58SJonathan T. Looney * call for the next time the keepalive timer might expire. 3776d172f58SJonathan T. Looney */ 3786d172f58SJonathan T. Looney if (TCPS_HAVEESTABLISHED(tp->t_state)) { 3796d172f58SJonathan T. Looney u_int idletime; 3806d172f58SJonathan T. Looney 3816d172f58SJonathan T. Looney idletime = ticks - tp->t_rcvtime; 3826d172f58SJonathan T. Looney if (idletime < TP_KEEPIDLE(tp)) { 383446ccdd0SGleb Smirnoff tcp_timer_activate(tp, TT_KEEP, 384446ccdd0SGleb Smirnoff TP_KEEPIDLE(tp) - idletime); 3856d172f58SJonathan T. Looney CURVNET_RESTORE(); 386446ccdd0SGleb Smirnoff return (true); 3876d172f58SJonathan T. Looney } 3886d172f58SJonathan T. Looney } 3896d172f58SJonathan T. Looney 3909b8b58e0SJonathan Lemon /* 3919b8b58e0SJonathan Lemon * Keep-alive timer went off; send something 3929b8b58e0SJonathan Lemon * or drop connection if idle for too long. 3939b8b58e0SJonathan Lemon */ 39478b50714SRobert Watson TCPSTAT_INC(tcps_keeptimeo); 3959b8b58e0SJonathan Lemon if (tp->t_state < TCPS_ESTABLISHED) 3969b8b58e0SJonathan Lemon goto dropit; 397334fc582SBjoern A. Zeeb if ((V_tcp_always_keepalive || 398f1798531SJohn Baldwin inp->inp_socket->so_options & SO_KEEPALIVE) && 3999b8b58e0SJonathan Lemon tp->t_state <= TCPS_CLOSING) { 4009077f387SGleb Smirnoff if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 4019b8b58e0SJonathan Lemon goto dropit; 4029b8b58e0SJonathan Lemon /* 4039b8b58e0SJonathan Lemon * Send a packet designed to force a response 4049b8b58e0SJonathan Lemon * if the peer is up and reachable: 4059b8b58e0SJonathan Lemon * either an ACK if the connection is still alive, 4069b8b58e0SJonathan Lemon * or an RST if the peer has closed the connection 4079b8b58e0SJonathan Lemon * due to timeout or reboot. 4089b8b58e0SJonathan Lemon * Using sequence number tp->snd_una-1 4099b8b58e0SJonathan Lemon * causes the transmitted zero-length segment 4109b8b58e0SJonathan Lemon * to lie outside the receive window; 4119b8b58e0SJonathan Lemon * by the protocol spec, this requires the 4129b8b58e0SJonathan Lemon * correspondent TCP to respond. 4139b8b58e0SJonathan Lemon */ 41478b50714SRobert Watson TCPSTAT_INC(tcps_keepprobe); 41579909384SJonathan Lemon t_template = tcpip_maketemplate(inp); 41608517d53SMike Silbersack if (t_template) { 417b9555453SGleb Smirnoff NET_EPOCH_ENTER(et); 41808517d53SMike Silbersack tcp_respond(tp, t_template->tt_ipgen, 41908517d53SMike Silbersack &t_template->tt_t, (struct mbuf *)NULL, 4209b8b58e0SJonathan Lemon tp->rcv_nxt, tp->snd_una - 1, 0); 421b9555453SGleb Smirnoff NET_EPOCH_EXIT(et); 42253640b0eSRobert Watson free(t_template, M_TEMP); 42308517d53SMike Silbersack } 424446ccdd0SGleb Smirnoff tcp_timer_activate(tp, TT_KEEP, TP_KEEPINTVL(tp)); 425b07fef50SRandall Stewart } else 426446ccdd0SGleb Smirnoff tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); 4279b8b58e0SJonathan Lemon 4288b615593SMarko Zec CURVNET_RESTORE(); 429446ccdd0SGleb Smirnoff return (true); 4309b8b58e0SJonathan Lemon 4319b8b58e0SJonathan Lemon dropit: 43278b50714SRobert Watson TCPSTAT_INC(tcps_keepdrops); 43358d94bd0SGleb Smirnoff NET_EPOCH_ENTER(et); 434d1b07f36SRandall Stewart tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); 43585d94372SRobert Watson tp = tcp_drop(tp, ETIMEDOUT); 43658d94bd0SGleb Smirnoff NET_EPOCH_EXIT(et); 4378b615593SMarko Zec CURVNET_RESTORE(); 438446ccdd0SGleb Smirnoff 439446ccdd0SGleb Smirnoff return (tp != NULL); 4409b8b58e0SJonathan Lemon } 4419b8b58e0SJonathan Lemon 44208af8aacSRandall Stewart /* 44308af8aacSRandall Stewart * Has this session exceeded the maximum time without seeing a substantive 44408af8aacSRandall Stewart * acknowledgement? If so, return true; otherwise false. 44508af8aacSRandall Stewart */ 44608af8aacSRandall Stewart static bool 44708af8aacSRandall Stewart tcp_maxunacktime_check(struct tcpcb *tp) 44808af8aacSRandall Stewart { 44908af8aacSRandall Stewart 45008af8aacSRandall Stewart /* Are we tracking this timer for this session? */ 45108af8aacSRandall Stewart if (TP_MAXUNACKTIME(tp) == 0) 45208af8aacSRandall Stewart return false; 45308af8aacSRandall Stewart 45408af8aacSRandall Stewart /* Do we have a current measurement. */ 45508af8aacSRandall Stewart if (tp->t_acktime == 0) 45608af8aacSRandall Stewart return false; 45708af8aacSRandall Stewart 45808af8aacSRandall Stewart /* Are we within the acceptable range? */ 45908af8aacSRandall Stewart if (TSTMP_GT(TP_MAXUNACKTIME(tp) + tp->t_acktime, (u_int)ticks)) 46008af8aacSRandall Stewart return false; 46108af8aacSRandall Stewart 46208af8aacSRandall Stewart /* We exceeded the timer. */ 46308af8aacSRandall Stewart TCPSTAT_INC(tcps_progdrops); 46408af8aacSRandall Stewart return true; 46508af8aacSRandall Stewart } 46608af8aacSRandall Stewart 467446ccdd0SGleb Smirnoff static bool 468446ccdd0SGleb Smirnoff tcp_timer_persist(struct tcpcb *tp) 4699b8b58e0SJonathan Lemon { 4706573d758SMatt Macy struct epoch_tracker et; 471446ccdd0SGleb Smirnoff #if defined(INVARIANTS) || defined(VIMAGE) 4729eb0e832SGleb Smirnoff struct inpcb *inp = tptoinpcb(tp); 4739b8b58e0SJonathan Lemon #endif 474446ccdd0SGleb Smirnoff bool progdrop, rv; 4759eb0e832SGleb Smirnoff 476446ccdd0SGleb Smirnoff INP_WLOCK_ASSERT(inp); 477446ccdd0SGleb Smirnoff 478446ccdd0SGleb Smirnoff TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 4798840ae22SGleb Smirnoff CURVNET_SET(inp->inp_vnet); 4809b8b58e0SJonathan Lemon /* 481a4641f4eSPedro F. Giffuni * Persistence timer into zero window. 4829b8b58e0SJonathan Lemon * Force a byte to be output, if possible. 4839b8b58e0SJonathan Lemon */ 48478b50714SRobert Watson TCPSTAT_INC(tcps_persisttimeo); 4859b8b58e0SJonathan Lemon /* 4869b8b58e0SJonathan Lemon * Hack: if the peer is dead/unreachable, we do not 4879b8b58e0SJonathan Lemon * time out if the window is closed. After a full 4889b8b58e0SJonathan Lemon * backoff, drop the connection if the idle time 4899b8b58e0SJonathan Lemon * (no responses to probes) reaches the maximum 4909b8b58e0SJonathan Lemon * backoff that we would use if retransmitting. 49108af8aacSRandall Stewart * Also, drop the connection if we haven't been making 49208af8aacSRandall Stewart * progress. 4939b8b58e0SJonathan Lemon */ 49408af8aacSRandall Stewart progdrop = tcp_maxunacktime_check(tp); 49508af8aacSRandall Stewart if (progdrop || (tp->t_rxtshift == TCP_MAXRXTSHIFT && 4966b0c5521SJohn Baldwin (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 49708af8aacSRandall Stewart ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff))) { 49808af8aacSRandall Stewart if (!progdrop) 49978b50714SRobert Watson TCPSTAT_INC(tcps_persistdrop); 500d1b07f36SRandall Stewart tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 501446ccdd0SGleb Smirnoff goto dropit; 5029b8b58e0SJonathan Lemon } 503322181c9SAndre Oppermann /* 504322181c9SAndre Oppermann * If the user has closed the socket then drop a persisting 505322181c9SAndre Oppermann * connection after a much reduced timeout. 506322181c9SAndre Oppermann */ 507322181c9SAndre Oppermann if (tp->t_state > TCPS_CLOSE_WAIT && 508322181c9SAndre Oppermann (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 509322181c9SAndre Oppermann TCPSTAT_INC(tcps_persistdrop); 510d1b07f36SRandall Stewart tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 511446ccdd0SGleb Smirnoff goto dropit; 512322181c9SAndre Oppermann } 5139b8b58e0SJonathan Lemon tcp_setpersist(tp); 5142cdbfa66SPaul Saab tp->t_flags |= TF_FORCEDATA; 515109eb549SGleb Smirnoff NET_EPOCH_ENTER(et); 516446ccdd0SGleb Smirnoff if ((rv = tcp_output_locked(tp))) 5172cdbfa66SPaul Saab tp->t_flags &= ~TF_FORCEDATA; 518f64dc2abSGleb Smirnoff NET_EPOCH_EXIT(et); 5198b615593SMarko Zec CURVNET_RESTORE(); 520446ccdd0SGleb Smirnoff 521446ccdd0SGleb Smirnoff return (rv); 522446ccdd0SGleb Smirnoff 523446ccdd0SGleb Smirnoff dropit: 524446ccdd0SGleb Smirnoff NET_EPOCH_ENTER(et); 525446ccdd0SGleb Smirnoff tp = tcp_drop(tp, ETIMEDOUT); 526446ccdd0SGleb Smirnoff NET_EPOCH_EXIT(et); 527446ccdd0SGleb Smirnoff CURVNET_RESTORE(); 528446ccdd0SGleb Smirnoff 529446ccdd0SGleb Smirnoff return (tp != NULL); 5309b8b58e0SJonathan Lemon } 5319b8b58e0SJonathan Lemon 532446ccdd0SGleb Smirnoff static bool 533446ccdd0SGleb Smirnoff tcp_timer_rexmt(struct tcpcb *tp) 5349b8b58e0SJonathan Lemon { 5359eb0e832SGleb Smirnoff struct epoch_tracker et; 5369eb0e832SGleb Smirnoff struct inpcb *inp = tptoinpcb(tp); 537446ccdd0SGleb Smirnoff int rexmt; 538446ccdd0SGleb Smirnoff bool isipv6, rv; 5399b8b58e0SJonathan Lemon 540446ccdd0SGleb Smirnoff INP_WLOCK_ASSERT(inp); 5419eb0e832SGleb Smirnoff 542446ccdd0SGleb Smirnoff TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 5438840ae22SGleb Smirnoff CURVNET_SET(inp->inp_vnet); 5446d90faf3SPaul Saab tcp_free_sackholes(tp); 5455105a92cSRandall Stewart if (tp->t_fb->tfb_tcp_rexmit_tmr) { 5465105a92cSRandall Stewart /* The stack has a timer action too. */ 5475105a92cSRandall Stewart (*tp->t_fb->tfb_tcp_rexmit_tmr)(tp); 5485105a92cSRandall Stewart } 549df8bae1dSRodney W. Grimes /* 550df8bae1dSRodney W. Grimes * Retransmission timer went off. Message has not 551df8bae1dSRodney W. Grimes * been acked within retransmit interval. Back off 552df8bae1dSRodney W. Grimes * to a longer retransmit interval and retransmit one segment. 55308af8aacSRandall Stewart * 55408af8aacSRandall Stewart * If we've either exceeded the maximum number of retransmissions, 55508af8aacSRandall Stewart * or we've gone long enough without making progress, then drop 55608af8aacSRandall Stewart * the session. 557df8bae1dSRodney W. Grimes */ 55808af8aacSRandall Stewart if (++tp->t_rxtshift > TCP_MAXRXTSHIFT || tcp_maxunacktime_check(tp)) { 55908af8aacSRandall Stewart if (tp->t_rxtshift > TCP_MAXRXTSHIFT) 56078b50714SRobert Watson TCPSTAT_INC(tcps_timeoutdrop); 56108af8aacSRandall Stewart tp->t_rxtshift = TCP_MAXRXTSHIFT; 562d1b07f36SRandall Stewart tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 563446ccdd0SGleb Smirnoff NET_EPOCH_ENTER(et); 564446ccdd0SGleb Smirnoff tp = tcp_drop(tp, ETIMEDOUT); 565446ccdd0SGleb Smirnoff NET_EPOCH_EXIT(et); 56677198a94SGleb Smirnoff CURVNET_RESTORE(); 567446ccdd0SGleb Smirnoff 568446ccdd0SGleb Smirnoff return (tp != NULL); 569b07fef50SRandall Stewart } 570cf8f04f4SAndre Oppermann if (tp->t_state == TCPS_SYN_SENT) { 571cf8f04f4SAndre Oppermann /* 572cf8f04f4SAndre Oppermann * If the SYN was retransmitted, indicate CWND to be 573cf8f04f4SAndre Oppermann * limited to 1 segment in cc_conn_init(). 574cf8f04f4SAndre Oppermann */ 575cf8f04f4SAndre Oppermann tp->snd_cwnd = 1; 576cf8f04f4SAndre Oppermann } else if (tp->t_rxtshift == 1) { 5779b8b58e0SJonathan Lemon /* 5789b8b58e0SJonathan Lemon * first retransmit; record ssthresh and cwnd so they can 5799b8b58e0SJonathan Lemon * be recovered if this turns out to be a "bad" retransmit. 5809b8b58e0SJonathan Lemon * A retransmit is considered "bad" if an ACK for this 5819b8b58e0SJonathan Lemon * segment is received within RTT/2 interval; the assumption 5829b8b58e0SJonathan Lemon * here is that the ACK was already in flight. See 5839b8b58e0SJonathan Lemon * "On Estimating End-to-End Network Path Properties" by 5849b8b58e0SJonathan Lemon * Allman and Paxson for more details. 5859b8b58e0SJonathan Lemon */ 5869b8b58e0SJonathan Lemon tp->snd_cwnd_prev = tp->snd_cwnd; 5879b8b58e0SJonathan Lemon tp->snd_ssthresh_prev = tp->snd_ssthresh; 5889d11646dSJeffrey Hsu tp->snd_recover_prev = tp->snd_recover; 589dbc42409SLawrence Stewart if (IN_FASTRECOVERY(tp->t_flags)) 5909d11646dSJeffrey Hsu tp->t_flags |= TF_WASFRECOVERY; 5919d11646dSJeffrey Hsu else 5929d11646dSJeffrey Hsu tp->t_flags &= ~TF_WASFRECOVERY; 593dbc42409SLawrence Stewart if (IN_CONGRECOVERY(tp->t_flags)) 594dbc42409SLawrence Stewart tp->t_flags |= TF_WASCRECOVERY; 595dbc42409SLawrence Stewart else 596dbc42409SLawrence Stewart tp->t_flags &= ~TF_WASCRECOVERY; 59710d20c84SMatt Macy if ((tp->t_flags & TF_RCVD_TSTMP) == 0) 5989b8b58e0SJonathan Lemon tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 59910d20c84SMatt Macy /* In the event that we've negotiated timestamps 60010d20c84SMatt Macy * badrxtwin will be set to the value that we set 60110d20c84SMatt Macy * the retransmitted packet's to_tsval to by tcp_output 60210d20c84SMatt Macy */ 603672dc4aeSJohn Baldwin tp->t_flags |= TF_PREVVALID; 604672dc4aeSJohn Baldwin } else 605672dc4aeSJohn Baldwin tp->t_flags &= ~TF_PREVVALID; 60678b50714SRobert Watson TCPSTAT_INC(tcps_rexmttimeo); 607281a0fd4SPatrick Kelsey if ((tp->t_state == TCPS_SYN_SENT) || 608281a0fd4SPatrick Kelsey (tp->t_state == TCPS_SYN_RECEIVED)) 6090999766dSMichael Tuexen rexmt = tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift]; 6107d42e30cSJonathan Lemon else 611df8bae1dSRodney W. Grimes rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 612df8bae1dSRodney W. Grimes TCPT_RANGESET(tp->t_rxtcur, rexmt, 613df8bae1dSRodney W. Grimes tp->t_rttmin, TCPTV_REXMTMAX); 614f6f6703fSSean Bruno 615882ac53eSSean Bruno /* 616882ac53eSSean Bruno * We enter the path for PLMTUD if connection is established or, if 617882ac53eSSean Bruno * connection is FIN_WAIT_1 status, reason for the last is that if 618882ac53eSSean Bruno * amount of data we send is very small, we could send it in couple of 619882ac53eSSean Bruno * packets and process straight to FIN. In that case we won't catch 620882ac53eSSean Bruno * ESTABLISHED state. 621882ac53eSSean Bruno */ 622f6f6703fSSean Bruno #ifdef INET6 6239eb0e832SGleb Smirnoff isipv6 = (inp->inp_vflag & INP_IPV6) ? true : false; 624413c3db1SMichael Tuexen #else 625413c3db1SMichael Tuexen isipv6 = false; 626f6f6703fSSean Bruno #endif 627413c3db1SMichael Tuexen if (((V_tcp_pmtud_blackhole_detect == 1) || 628413c3db1SMichael Tuexen (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) || 629413c3db1SMichael Tuexen (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) && 630413c3db1SMichael Tuexen ((tp->t_state == TCPS_ESTABLISHED) || 631413c3db1SMichael Tuexen (tp->t_state == TCPS_FIN_WAIT_1))) { 632b89af8e1SMichael Tuexen if (tp->t_rxtshift == 1) { 633adf43a92SHiren Panchasara /* 634b89af8e1SMichael Tuexen * We enter blackhole detection after the first 635b89af8e1SMichael Tuexen * unsuccessful timer based retransmission. 636b89af8e1SMichael Tuexen * Then we reduce up to two times the MSS, each 637b89af8e1SMichael Tuexen * candidate giving two tries of retransmissions. 638b89af8e1SMichael Tuexen * But we give a candidate only two tries, if it 639b89af8e1SMichael Tuexen * actually reduces the MSS. 640adf43a92SHiren Panchasara */ 641b89af8e1SMichael Tuexen tp->t_blackhole_enter = 2; 642b89af8e1SMichael Tuexen tp->t_blackhole_exit = tp->t_blackhole_enter; 643b89af8e1SMichael Tuexen if (isipv6) { 644b89af8e1SMichael Tuexen #ifdef INET6 645b89af8e1SMichael Tuexen if (tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) 646b89af8e1SMichael Tuexen tp->t_blackhole_exit += 2; 647b89af8e1SMichael Tuexen if (tp->t_maxseg > V_tcp_v6mssdflt && 648b89af8e1SMichael Tuexen V_tcp_v6pmtud_blackhole_mss > V_tcp_v6mssdflt) 649b89af8e1SMichael Tuexen tp->t_blackhole_exit += 2; 650b89af8e1SMichael Tuexen #endif 651b89af8e1SMichael Tuexen } else { 652b89af8e1SMichael Tuexen #ifdef INET 653b89af8e1SMichael Tuexen if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) 654b89af8e1SMichael Tuexen tp->t_blackhole_exit += 2; 655b89af8e1SMichael Tuexen if (tp->t_maxseg > V_tcp_mssdflt && 656b89af8e1SMichael Tuexen V_tcp_pmtud_blackhole_mss > V_tcp_mssdflt) 657b89af8e1SMichael Tuexen tp->t_blackhole_exit += 2; 658b89af8e1SMichael Tuexen #endif 659b89af8e1SMichael Tuexen } 660b89af8e1SMichael Tuexen } 661f6f6703fSSean Bruno if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) == 662f6f6703fSSean Bruno (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) && 663b89af8e1SMichael Tuexen (tp->t_rxtshift >= tp->t_blackhole_enter && 664b89af8e1SMichael Tuexen tp->t_rxtshift < tp->t_blackhole_exit && 665b89af8e1SMichael Tuexen (tp->t_rxtshift - tp->t_blackhole_enter) % 2 == 0)) { 666f6f6703fSSean Bruno /* 667f6f6703fSSean Bruno * Enter Path MTU Black-hole Detection mechanism: 668f6f6703fSSean Bruno * - Disable Path MTU Discovery (IP "DF" bit). 669f6f6703fSSean Bruno * - Reduce MTU to lower value than what we 670f6f6703fSSean Bruno * negotiated with peer. 671f6f6703fSSean Bruno */ 6723d5af7a1SMichael Tuexen if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { 673f6f6703fSSean Bruno /* Record that we may have found a black hole. */ 674f6f6703fSSean Bruno tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 675f6f6703fSSean Bruno /* Keep track of previous MSS. */ 6760c39d38dSGleb Smirnoff tp->t_pmtud_saved_maxseg = tp->t_maxseg; 6773d5af7a1SMichael Tuexen } 678f6f6703fSSean Bruno 679f6f6703fSSean Bruno /* 680f6f6703fSSean Bruno * Reduce the MSS to blackhole value or to the default 681f6f6703fSSean Bruno * in an attempt to retransmit. 682f6f6703fSSean Bruno */ 683f6f6703fSSean Bruno #ifdef INET6 684f6f6703fSSean Bruno if (isipv6 && 685b89af8e1SMichael Tuexen tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss && 686b89af8e1SMichael Tuexen V_tcp_v6pmtud_blackhole_mss > V_tcp_v6mssdflt) { 687f6f6703fSSean Bruno /* Use the sysctl tuneable blackhole MSS. */ 6880c39d38dSGleb Smirnoff tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; 68932a04bb8SSean Bruno TCPSTAT_INC(tcps_pmtud_blackhole_activated); 690f6f6703fSSean Bruno } else if (isipv6) { 691f6f6703fSSean Bruno /* Use the default MSS. */ 6920c39d38dSGleb Smirnoff tp->t_maxseg = V_tcp_v6mssdflt; 693f6f6703fSSean Bruno /* 694f6f6703fSSean Bruno * Disable Path MTU Discovery when we switch to 695f6f6703fSSean Bruno * minmss. 696f6f6703fSSean Bruno */ 697f6f6703fSSean Bruno tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 69832a04bb8SSean Bruno TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 699f6f6703fSSean Bruno } 700f6f6703fSSean Bruno #endif 701f6f6703fSSean Bruno #if defined(INET6) && defined(INET) 702f6f6703fSSean Bruno else 703f6f6703fSSean Bruno #endif 704f6f6703fSSean Bruno #ifdef INET 705b89af8e1SMichael Tuexen if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss && 706b89af8e1SMichael Tuexen V_tcp_pmtud_blackhole_mss > V_tcp_mssdflt) { 707f6f6703fSSean Bruno /* Use the sysctl tuneable blackhole MSS. */ 7080c39d38dSGleb Smirnoff tp->t_maxseg = V_tcp_pmtud_blackhole_mss; 70932a04bb8SSean Bruno TCPSTAT_INC(tcps_pmtud_blackhole_activated); 710f6f6703fSSean Bruno } else { 711f6f6703fSSean Bruno /* Use the default MSS. */ 7120c39d38dSGleb Smirnoff tp->t_maxseg = V_tcp_mssdflt; 713f6f6703fSSean Bruno /* 714f6f6703fSSean Bruno * Disable Path MTU Discovery when we switch to 715f6f6703fSSean Bruno * minmss. 716f6f6703fSSean Bruno */ 717f6f6703fSSean Bruno tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 71832a04bb8SSean Bruno TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 719f6f6703fSSean Bruno } 720f6f6703fSSean Bruno #endif 721f6f6703fSSean Bruno /* 722f6f6703fSSean Bruno * Reset the slow-start flight size 723f6f6703fSSean Bruno * as it may depend on the new MSS. 724f6f6703fSSean Bruno */ 725f6f6703fSSean Bruno if (CC_ALGO(tp)->conn_init != NULL) 726e68b3792SGleb Smirnoff CC_ALGO(tp)->conn_init(&tp->t_ccv); 727f6f6703fSSean Bruno } else { 728f6f6703fSSean Bruno /* 729f6f6703fSSean Bruno * If further retransmissions are still unsuccessful 730f6f6703fSSean Bruno * with a lowered MTU, maybe this isn't a blackhole and 731f6f6703fSSean Bruno * we restore the previous MSS and blackhole detection 732f6f6703fSSean Bruno * flags. 733f6f6703fSSean Bruno */ 734f6f6703fSSean Bruno if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 735b89af8e1SMichael Tuexen (tp->t_rxtshift >= tp->t_blackhole_exit)) { 736f6f6703fSSean Bruno tp->t_flags2 |= TF2_PLPMTU_PMTUD; 737f6f6703fSSean Bruno tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 7380c39d38dSGleb Smirnoff tp->t_maxseg = tp->t_pmtud_saved_maxseg; 73932a04bb8SSean Bruno TCPSTAT_INC(tcps_pmtud_blackhole_failed); 740f6f6703fSSean Bruno /* 741f6f6703fSSean Bruno * Reset the slow-start flight size as it 742f6f6703fSSean Bruno * may depend on the new MSS. 743f6f6703fSSean Bruno */ 744f6f6703fSSean Bruno if (CC_ALGO(tp)->conn_init != NULL) 745e68b3792SGleb Smirnoff CC_ALGO(tp)->conn_init(&tp->t_ccv); 746f6f6703fSSean Bruno } 747f6f6703fSSean Bruno } 748f6f6703fSSean Bruno } 749f6f6703fSSean Bruno 750df8bae1dSRodney W. Grimes /* 75177339e1cSAndre Oppermann * Disable RFC1323 and SACK if we haven't got any response to 7527ceb7783SJesper Skriver * our third SYN to work-around some broken terminal servers 7537ceb7783SJesper Skriver * (most of which have hopefully been retired) that have bad VJ 7547ceb7783SJesper Skriver * header compression code which trashes TCP segments containing 7557ceb7783SJesper Skriver * unknown-to-them TCP options. 7567ceb7783SJesper Skriver */ 7576c0ef895SJohn Baldwin if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && 7586c0ef895SJohn Baldwin (tp->t_rxtshift == 3)) 759c4ab59c1SAndre Oppermann tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); 7607ceb7783SJesper Skriver /* 7615ede40dcSRyan Stone * If we backed off this far, notify the L3 protocol that we're having 7625ede40dcSRyan Stone * connection problems. 763df8bae1dSRodney W. Grimes */ 7645ede40dcSRyan Stone if (tp->t_rxtshift > TCP_RTT_INVALIDATE) { 765fb59c426SYoshinobu Inoue #ifdef INET6 7669eb0e832SGleb Smirnoff if ((inp->inp_vflag & INP_IPV6) != 0) 7679eb0e832SGleb Smirnoff in6_losing(inp); 76884cc0778SGeorge V. Neville-Neil else 769fb59c426SYoshinobu Inoue #endif 7709eb0e832SGleb Smirnoff in_losing(inp); 771df8bae1dSRodney W. Grimes } 772df8bae1dSRodney W. Grimes tp->snd_nxt = tp->snd_una; 7739d11646dSJeffrey Hsu tp->snd_recover = tp->snd_max; 77446f58482SJonathan Lemon /* 77574b48c1dSAndras Olah * Force a segment to be sent. 77674b48c1dSAndras Olah */ 77774b48c1dSAndras Olah tp->t_flags |= TF_ACKNOW; 77874b48c1dSAndras Olah /* 779df8bae1dSRodney W. Grimes * If timing a segment in this window, stop the timer. 780df8bae1dSRodney W. Grimes */ 7819b8b58e0SJonathan Lemon tp->t_rtttime = 0; 782dbc42409SLawrence Stewart 783b5af1b88SLawrence Stewart cc_cong_signal(tp, NULL, CC_RTO); 784109eb549SGleb Smirnoff NET_EPOCH_ENTER(et); 785446ccdd0SGleb Smirnoff rv = tcp_output_locked(tp); 786f64dc2abSGleb Smirnoff NET_EPOCH_EXIT(et); 7878b615593SMarko Zec CURVNET_RESTORE(); 788446ccdd0SGleb Smirnoff 789446ccdd0SGleb Smirnoff return (rv); 79085d94372SRobert Watson } 79185d94372SRobert Watson 792*76578d60SMichael Tuexen static void 793*76578d60SMichael Tuexen tcp_bblog_timer(struct tcpcb *tp, tt_which which, tt_what what, uint32_t ticks) 794*76578d60SMichael Tuexen { 795*76578d60SMichael Tuexen struct tcp_log_buffer *lgb; 796*76578d60SMichael Tuexen uint64_t ms; 797*76578d60SMichael Tuexen 798*76578d60SMichael Tuexen INP_WLOCK_ASSERT(tptoinpcb(tp)); 799*76578d60SMichael Tuexen if (tp->t_logstate != TCP_LOG_STATE_OFF) 800*76578d60SMichael Tuexen lgb = tcp_log_event_(tp, NULL, NULL, NULL, TCP_LOG_RTO, 0, 0, 801*76578d60SMichael Tuexen NULL, false, NULL, NULL, 0, NULL); 802*76578d60SMichael Tuexen else 803*76578d60SMichael Tuexen lgb = NULL; 804*76578d60SMichael Tuexen if (lgb != NULL) { 805*76578d60SMichael Tuexen lgb->tlb_flex1 = (what << 8) | which; 806*76578d60SMichael Tuexen if (what == TT_STARTING) { 807*76578d60SMichael Tuexen /* Convert ticks to ms and store it in tlb_flex2. */ 808*76578d60SMichael Tuexen if (hz == 1000) 809*76578d60SMichael Tuexen lgb->tlb_flex2 = ticks; 810*76578d60SMichael Tuexen else { 811*76578d60SMichael Tuexen ms = (((uint64_t)ticks * 1000) + (hz - 1)) / hz; 812*76578d60SMichael Tuexen if (ms > UINT32_MAX) 813*76578d60SMichael Tuexen lgb->tlb_flex2 = UINT32_MAX; 814*76578d60SMichael Tuexen else 815*76578d60SMichael Tuexen lgb->tlb_flex2 = (uint32_t)ms; 816*76578d60SMichael Tuexen } 817*76578d60SMichael Tuexen } 818*76578d60SMichael Tuexen } 819*76578d60SMichael Tuexen } 820*76578d60SMichael Tuexen 821446ccdd0SGleb Smirnoff static inline tt_which 822446ccdd0SGleb Smirnoff tcp_timer_next(struct tcpcb *tp, sbintime_t *precision) 82385d94372SRobert Watson { 824446ccdd0SGleb Smirnoff tt_which i, rv; 825446ccdd0SGleb Smirnoff sbintime_t after, before; 826446ccdd0SGleb Smirnoff 827446ccdd0SGleb Smirnoff for (i = 0, rv = TT_N, after = before = SBT_MAX; i < TT_N; i++) { 828446ccdd0SGleb Smirnoff if (tp->t_timers[i] < after) { 829446ccdd0SGleb Smirnoff after = tp->t_timers[i]; 830446ccdd0SGleb Smirnoff rv = i; 831446ccdd0SGleb Smirnoff } 832446ccdd0SGleb Smirnoff before = MIN(before, tp->t_timers[i] + tp->t_precisions[i]); 833446ccdd0SGleb Smirnoff } 834446ccdd0SGleb Smirnoff if (precision != NULL) 835446ccdd0SGleb Smirnoff *precision = before - after; 836446ccdd0SGleb Smirnoff 837446ccdd0SGleb Smirnoff return (rv); 838446ccdd0SGleb Smirnoff } 839446ccdd0SGleb Smirnoff 840446ccdd0SGleb Smirnoff static void 841446ccdd0SGleb Smirnoff tcp_timer_enter(void *xtp) 842446ccdd0SGleb Smirnoff { 843446ccdd0SGleb Smirnoff struct tcpcb *tp = xtp; 8449eb0e832SGleb Smirnoff struct inpcb *inp = tptoinpcb(tp); 845446ccdd0SGleb Smirnoff sbintime_t precision; 846446ccdd0SGleb Smirnoff tt_which which; 847*76578d60SMichael Tuexen bool tp_valid; 848446ccdd0SGleb Smirnoff 849446ccdd0SGleb Smirnoff INP_WLOCK_ASSERT(inp); 850446ccdd0SGleb Smirnoff MPASS((curthread->td_pflags & TDP_INTCPCALLOUT) == 0); 851446ccdd0SGleb Smirnoff 852446ccdd0SGleb Smirnoff curthread->td_pflags |= TDP_INTCPCALLOUT; 853446ccdd0SGleb Smirnoff 854446ccdd0SGleb Smirnoff which = tcp_timer_next(tp, NULL); 855446ccdd0SGleb Smirnoff MPASS(which < TT_N); 856446ccdd0SGleb Smirnoff tp->t_timers[which] = SBT_MAX; 857446ccdd0SGleb Smirnoff tp->t_precisions[which] = 0; 858446ccdd0SGleb Smirnoff 859*76578d60SMichael Tuexen tcp_bblog_timer(tp, which, TT_PROCESSING, 0); 860*76578d60SMichael Tuexen tp_valid = tcp_timersw[which](tp); 861*76578d60SMichael Tuexen if (tp_valid) { 862*76578d60SMichael Tuexen tcp_bblog_timer(tp, which, TT_PROCESSED, 0); 863446ccdd0SGleb Smirnoff if ((which = tcp_timer_next(tp, &precision)) != TT_N) { 864446ccdd0SGleb Smirnoff callout_reset_sbt_on(&tp->t_callout, 865446ccdd0SGleb Smirnoff tp->t_timers[which], precision, tcp_timer_enter, 866446ccdd0SGleb Smirnoff tp, inp_to_cpuid(inp), C_ABSOLUTE); 867446ccdd0SGleb Smirnoff } 868446ccdd0SGleb Smirnoff INP_WUNLOCK(inp); 869446ccdd0SGleb Smirnoff } 870446ccdd0SGleb Smirnoff 871446ccdd0SGleb Smirnoff curthread->td_pflags &= ~TDP_INTCPCALLOUT; 872446ccdd0SGleb Smirnoff } 873446ccdd0SGleb Smirnoff 874446ccdd0SGleb Smirnoff /* 875446ccdd0SGleb Smirnoff * Activate or stop (delta == 0) a TCP timer. 876446ccdd0SGleb Smirnoff */ 877446ccdd0SGleb Smirnoff void 878446ccdd0SGleb Smirnoff tcp_timer_activate(struct tcpcb *tp, tt_which which, u_int delta) 879446ccdd0SGleb Smirnoff { 880446ccdd0SGleb Smirnoff struct inpcb *inp = tptoinpcb(tp); 881446ccdd0SGleb Smirnoff sbintime_t precision; 882*76578d60SMichael Tuexen tt_what what; 88385d94372SRobert Watson 88409fe6320SNavdeep Parhar #ifdef TCP_OFFLOAD 88509fe6320SNavdeep Parhar if (tp->t_flags & TF_TOE) 88609fe6320SNavdeep Parhar return; 88709fe6320SNavdeep Parhar #endif 88809fe6320SNavdeep Parhar 889446ccdd0SGleb Smirnoff INP_WLOCK_ASSERT(inp); 8905571f9cfSJulien Charbon 891*76578d60SMichael Tuexen if (delta > 0) { 892*76578d60SMichael Tuexen what = TT_STARTING; 893446ccdd0SGleb Smirnoff callout_when(tick_sbt * delta, 0, C_HARDCLOCK, 894446ccdd0SGleb Smirnoff &tp->t_timers[which], &tp->t_precisions[which]); 895*76578d60SMichael Tuexen } else { 896*76578d60SMichael Tuexen what = TT_STOPPING; 897446ccdd0SGleb Smirnoff tp->t_timers[which] = SBT_MAX; 898*76578d60SMichael Tuexen } 899*76578d60SMichael Tuexen tcp_bblog_timer(tp, which, what, delta); 900446ccdd0SGleb Smirnoff 901446ccdd0SGleb Smirnoff if ((which = tcp_timer_next(tp, &precision)) != TT_N) 902446ccdd0SGleb Smirnoff callout_reset_sbt_on(&tp->t_callout, tp->t_timers[which], 903446ccdd0SGleb Smirnoff precision, tcp_timer_enter, tp, inp_to_cpuid(inp), 904446ccdd0SGleb Smirnoff C_ABSOLUTE); 905446ccdd0SGleb Smirnoff else 906446ccdd0SGleb Smirnoff callout_stop(&tp->t_callout); 90785d94372SRobert Watson } 90885d94372SRobert Watson 909446ccdd0SGleb Smirnoff bool 910446ccdd0SGleb Smirnoff tcp_timer_active(struct tcpcb *tp, tt_which which) 91185d94372SRobert Watson { 91285d94372SRobert Watson 913446ccdd0SGleb Smirnoff INP_WLOCK_ASSERT(tptoinpcb(tp)); 914446ccdd0SGleb Smirnoff 915446ccdd0SGleb Smirnoff return (tp->t_timers[which] != SBT_MAX); 916df8bae1dSRodney W. Grimes } 917b8614722SMike Silbersack 918446ccdd0SGleb Smirnoff /* 919446ccdd0SGleb Smirnoff * Stop all timers associated with tcpcb. 920446ccdd0SGleb Smirnoff * 921446ccdd0SGleb Smirnoff * Called only on tcpcb destruction. The tcpcb shall already be dropped from 922446ccdd0SGleb Smirnoff * the pcb lookup database and socket is not losing the last reference. 923446ccdd0SGleb Smirnoff * 924446ccdd0SGleb Smirnoff * XXXGL: unfortunately our callout(9) is not able to fully stop a locked 925446ccdd0SGleb Smirnoff * callout even when only two threads are involved: the callout itself and the 926446ccdd0SGleb Smirnoff * thread that does callout_stop(). See where softclock_call_cc() swaps the 927446ccdd0SGleb Smirnoff * callwheel lock to callout lock and then checks cc_exec_cancel(). This is 928446ccdd0SGleb Smirnoff * the race window. If it happens, the tcp_timer_enter() won't be executed, 929446ccdd0SGleb Smirnoff * however pcb lock will be locked and released, hence we can't free memory. 930446ccdd0SGleb Smirnoff * Until callout(9) is improved, just keep retrying. In my profiling I've seen 931446ccdd0SGleb Smirnoff * such event happening less than 1 time per hour with 20-30 Gbit/s of traffic. 932446ccdd0SGleb Smirnoff */ 933446ccdd0SGleb Smirnoff void 934446ccdd0SGleb Smirnoff tcp_timer_stop(struct tcpcb *tp) 935ff945008SGleb Smirnoff { 9369eb0e832SGleb Smirnoff struct inpcb *inp = tptoinpcb(tp); 937ff945008SGleb Smirnoff 938446ccdd0SGleb Smirnoff INP_WLOCK_ASSERT(inp); 9399eb0e832SGleb Smirnoff 940446ccdd0SGleb Smirnoff if (curthread->td_pflags & TDP_INTCPCALLOUT) { 941446ccdd0SGleb Smirnoff int stopped __diagused; 942446ccdd0SGleb Smirnoff 943446ccdd0SGleb Smirnoff stopped = callout_stop(&tp->t_callout); 944446ccdd0SGleb Smirnoff MPASS(stopped == 0); 945446ccdd0SGleb Smirnoff } else while(__predict_false(callout_stop(&tp->t_callout) == 0)) { 946ff945008SGleb Smirnoff INP_WUNLOCK(inp); 947446ccdd0SGleb Smirnoff kern_yield(PRI_UNCHANGED); 948446ccdd0SGleb Smirnoff INP_WLOCK(inp); 9495571f9cfSJulien Charbon } 9505571f9cfSJulien Charbon } 951