/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <net/if.h>
#include <net/route.h>
#include <net/rss_config.h>
#include <net/vnet.h>
#include <net/netisr.h>
#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_rss.h>
#include <netinet/in_systm.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_seq.h>
#include <netinet/cc/cc.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>

int	tcp_persmin;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_persmin, 0, sysctl_msec_to_ticks, "I",
    "minimum persistence interval");

int	tcp_persmax;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_persmax, 0, sysctl_msec_to_ticks, "I",
    "maximum persistence interval");

int	tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_keepinit, 0, sysctl_msec_to_ticks, "I",
    "time to establish connection");

int	tcp_keepidle;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_keepidle, 0, sysctl_msec_to_ticks, "I",
    "time before keepalive probes begin");

int	tcp_keepintvl;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I",
    "time between keepalive probes");

int	tcp_delacktime;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
    "Time before a delayed ACK is sent");

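/*
 * The MSL is virtualized per-VNET.  As with the interval knobs above, the
 * value is given in milliseconds and converted to ticks by
 * sysctl_msec_to_ticks(), e.g. "sysctl net.inet.tcp.msl=30000" selects a
 * 30 second maximum segment lifetime.
 */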
VNET_DEFINE(int, tcp_msl);
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(tcp_msl), 0, sysctl_msec_to_ticks, "I",
    "Maximum segment lifetime");

int	tcp_rexmit_initial;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_initial,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_rexmit_initial, 0, sysctl_msec_to_ticks, "I",
    "Initial Retransmission Timeout");

int	tcp_rexmit_min;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
    "Minimum Retransmission Timeout");

int	tcp_rexmit_slop;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
    "Retransmission Timer Slop");

VNET_DEFINE(int, tcp_always_keepalive) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_VNET|CTLFLAG_RW,
    &VNET_NAME(tcp_always_keepalive), 0,
    "Assume SO_KEEPALIVE on all TCP connections");

int	tcp_fast_finwait2_recycle = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW,
    &tcp_fast_finwait2_recycle, 0,
    "Recycle closed FIN_WAIT_2 connections faster");

int	tcp_finwait2_timeout;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I",
    "FIN-WAIT2 timeout");

int	tcp_keepcnt = TCPTV_KEEPCNT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
    "Number of keepalive probes to send");

/* max idle probes */
int	tcp_maxpersistidle;

int	tcp_rexmit_drop_options = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
    &tcp_rexmit_drop_options, 0,
    "Drop TCP options from 3rd and later retransmitted SYN");

int	tcp_maxunacktime = TCPTV_MAXUNACKTIME;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxunacktime,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_maxunacktime, 0, sysctl_msec_to_ticks, "I",
    "Maximum time (in ms) that a session can linger without making progress");

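/*
 * Path MTU blackhole detection is virtualized per-VNET; see its use in
 * tcp_timer_rexmt() below.  A value of 1 enables it for both IPv4 and
 * IPv6, 2 enables it for IPv4 only, 3 for IPv6 only, and any other value
 * disables it.
 */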
VNET_DEFINE(int, tcp_pmtud_blackhole_detect);
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_detect), 0,
    "Path MTU Discovery Black Hole Detection Enabled");

#ifdef INET
VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
    "Path MTU Discovery Black Hole Detection lowered MSS");
#endif

#ifdef INET6
VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
    "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
#endif

#ifdef RSS
static int	per_cpu_timers = 1;
#else
static int	per_cpu_timers = 0;
#endif
SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
    &per_cpu_timers, 0, "run tcp timers on all cpus");

static int
sysctl_net_inet_tcp_retries(SYSCTL_HANDLER_ARGS)
{
	int error, new;

	new = V_tcp_retries;
	error = sysctl_handle_int(oidp, &new, 0, req);
	if (error == 0 && req->newptr) {
		if ((new < 1) || (new > TCP_MAXRXTSHIFT))
			error = EINVAL;
		else
			V_tcp_retries = new;
	}
	return (error);
}

VNET_DEFINE(int, tcp_retries) = TCP_MAXRXTSHIFT;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, retries,
    CTLTYPE_INT | CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_retries), 0, sysctl_net_inet_tcp_retries, "I",
    "maximum number of consecutive timer based retransmissions");

/*
 * Map the given inp to a CPU id.
 *
 * This queries RSS if it's compiled in, else it defaults to the current
 * CPU ID.
 */
inline int
inp_to_cpuid(struct inpcb *inp)
{
	u_int cpuid;

	if (per_cpu_timers) {
#ifdef RSS
		cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
		if (cpuid == NETISR_CPUID_NONE)
			return (curcpu);	/* XXX */
		else
			return (cpuid);
#endif
		/*
		 * We don't have a flowid -> cpuid mapping, so cheat and
		 * just map unknown cpuids to curcpu.  Not the best, but
		 * apparently better than defaulting to swi 0.
		 */
		cpuid = inp->inp_flowid % (mp_maxid + 1);
		if (!CPU_ABSENT(cpuid))
			return (cpuid);
		return (curcpu);
	} else {
		return (0);
	}
}

int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };

int	tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */

/*
 * TCP timer processing.
 *
 * Each connection has 5 timers associated with it, which can be scheduled
 * simultaneously.  They all are serviced by one callout, tcp_timer_enter().
 * This function executes the next timer via the tcp_timersw[] vector.  Each
 * timer is supposed to return 'true' unless the connection was destroyed.
 * In the former case tcp_timer_enter() will schedule the callout for the
 * next timer.
 */

typedef bool tcp_timer_t(struct tcpcb *);
static tcp_timer_t tcp_timer_delack;
static tcp_timer_t tcp_timer_2msl;
static tcp_timer_t tcp_timer_keep;
static tcp_timer_t tcp_timer_persist;
static tcp_timer_t tcp_timer_rexmt;

static tcp_timer_t * const tcp_timersw[TT_N] = {
	[TT_DELACK] = tcp_timer_delack,
	[TT_REXMT] = tcp_timer_rexmt,
	[TT_PERSIST] = tcp_timer_persist,
	[TT_KEEP] = tcp_timer_keep,
	[TT_2MSL] = tcp_timer_2msl,
};

/*
 * tcp_output_locked() is a timer specific variation of a call to
 * tcp_output(), see tcp_var.h for the rest.  It handles a drop request
 * from advanced stacks, but keeps the tcpcb locked unless tcp_drop()
 * destroyed it.
 * Returns true if the tcpcb is valid and locked.
 */
static inline bool
tcp_output_locked(struct tcpcb *tp)
{
	int rv;

	INP_WLOCK_ASSERT(tptoinpcb(tp));

	if ((rv = tp->t_fb->tfb_tcp_output(tp)) < 0) {
		KASSERT(tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP,
		    ("TCP stack %s requested tcp_drop(%p)",
		    tp->t_fb->tfb_tcp_block_name, tp));
		tp = tcp_drop(tp, -rv);
	}

	return (tp != NULL);
}

static bool
tcp_timer_delack(struct tcpcb *tp)
{
	struct epoch_tracker et;
#if defined(INVARIANTS) || defined(VIMAGE)
	struct inpcb *inp = tptoinpcb(tp);
#endif
	bool rv;

	INP_WLOCK_ASSERT(inp);

	CURVNET_SET(inp->inp_vnet);
	tp->t_flags |= TF_ACKNOW;
	TCPSTAT_INC(tcps_delack);
	NET_EPOCH_ENTER(et);
	rv = tcp_output_locked(tp);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();

	return (rv);
}

static bool
tcp_timer_2msl(struct tcpcb *tp)
{
	struct inpcb *inp = tptoinpcb(tp);
	bool close = false;

	INP_WLOCK_ASSERT(inp);

	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	CURVNET_SET(inp->inp_vnet);
	tcp_log_end_status(tp, TCP_EI_STATUS_2MSL);
	tcp_free_sackholes(tp);
	/*
	 * The 2 MSL timeout in shutdown went off.  If we're closed but
	 * still waiting for the peer to close and the connection has been
	 * idle too long, delete the connection control block.  Otherwise,
	 * check again in a bit.
	 *
	 * If fastrecycle of FIN_WAIT_2 is enabled, we are in FIN_WAIT_2 and
	 * the receiver has closed, there's no point in hanging onto the
	 * FIN_WAIT_2 socket.  Just close it.  Ignore the fact that there
	 * were recent incoming segments.
	 *
	 * XXXGL: check if inp_socket shall always be !NULL here?
	 */
	if (tp->t_state == TCPS_TIME_WAIT) {
		close = true;
	} else if (tp->t_state == TCPS_FIN_WAIT_2 &&
	    tcp_fast_finwait2_recycle && inp->inp_socket &&
	    (inp->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
		TCPSTAT_INC(tcps_finwait2_drops);
		close = true;
	} else {
		if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp))
			tcp_timer_activate(tp, TT_2MSL, TP_KEEPINTVL(tp));
		else
			close = true;
	}
	if (close) {
		struct epoch_tracker et;

		NET_EPOCH_ENTER(et);
		tp = tcp_close(tp);
		NET_EPOCH_EXIT(et);
	}
	CURVNET_RESTORE();

	return (tp != NULL);
}

static bool
tcp_timer_keep(struct tcpcb *tp)
{
	struct epoch_tracker et;
	struct inpcb *inp = tptoinpcb(tp);
	struct tcptemp *t_template;

	INP_WLOCK_ASSERT(inp);

	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	CURVNET_SET(inp->inp_vnet);
	/*
	 * Because we don't regularly reset the keepalive callout in
	 * the ESTABLISHED state, it may be that we don't actually need
	 * to send a keepalive yet.  If that occurs, schedule another
	 * call for the next time the keepalive timer might expire.
	 */
	if (TCPS_HAVEESTABLISHED(tp->t_state)) {
		u_int idletime;

		idletime = ticks - tp->t_rcvtime;
		if (idletime < TP_KEEPIDLE(tp)) {
			tcp_timer_activate(tp, TT_KEEP,
			    TP_KEEPIDLE(tp) - idletime);
			CURVNET_RESTORE();
			return (true);
		}
	}

	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	TCPSTAT_INC(tcps_keeptimeo);
	if (tp->t_state < TCPS_ESTABLISHED)
		goto dropit;
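	/*
	 * Probe (or drop) only if the application requested keepalives, or
	 * the always_keepalive sysctl forces them, and the connection has
	 * not progressed beyond the CLOSING state; otherwise simply rearm
	 * the timer with the keepalive idle interval.
	 */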
	if ((V_tcp_always_keepalive ||
	    inp->inp_socket->so_options & SO_KEEPALIVE) &&
	    tp->t_state <= TCPS_CLOSING) {
		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
			goto dropit;
		/*
		 * Send a packet designed to force a response
		 * if the peer is up and reachable:
		 * either an ACK if the connection is still alive,
		 * or an RST if the peer has closed the connection
		 * due to timeout or reboot.
		 * Using sequence number tp->snd_una-1
		 * causes the transmitted zero-length segment
		 * to lie outside the receive window;
		 * by the protocol spec, this requires the
		 * correspondent TCP to respond.
		 */
		TCPSTAT_INC(tcps_keepprobe);
		t_template = tcpip_maketemplate(inp);
		if (t_template) {
			NET_EPOCH_ENTER(et);
			tcp_respond(tp, t_template->tt_ipgen,
			    &t_template->tt_t, (struct mbuf *)NULL,
			    tp->rcv_nxt, tp->snd_una - 1, 0);
			NET_EPOCH_EXIT(et);
			free(t_template, M_TEMP);
		}
		tcp_timer_activate(tp, TT_KEEP, TP_KEEPINTVL(tp));
	} else
		tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));

	CURVNET_RESTORE();
	return (true);

dropit:
	TCPSTAT_INC(tcps_keepdrops);
	NET_EPOCH_ENTER(et);
	tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
	tp = tcp_drop(tp, ETIMEDOUT);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();

	return (tp != NULL);
}

/*
 * Has this session exceeded the maximum time without seeing a substantive
 * acknowledgement?  If so, return true; otherwise false.
 */
static bool
tcp_maxunacktime_check(struct tcpcb *tp)
{

	/* Are we tracking this timer for this session? */
	if (TP_MAXUNACKTIME(tp) == 0)
		return false;

	/* Do we have a current measurement? */
	if (tp->t_acktime == 0)
		return false;

	/* Are we within the acceptable range? */
	if (TSTMP_GT(TP_MAXUNACKTIME(tp) + tp->t_acktime, (u_int)ticks))
		return false;

	/* We exceeded the timer. */
	TCPSTAT_INC(tcps_progdrops);
	return true;
}

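/*
 * Persist timer has fired: we are probing a peer that advertised a zero
 * window.  Drop the connection if it has made no progress for too long,
 * otherwise force a window probe to be transmitted.
 */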
*/ 48108af8aacSRandall Stewart TCPSTAT_INC(tcps_progdrops); 48208af8aacSRandall Stewart return true; 48308af8aacSRandall Stewart } 48408af8aacSRandall Stewart 485446ccdd0SGleb Smirnoff static bool 486446ccdd0SGleb Smirnoff tcp_timer_persist(struct tcpcb *tp) 4879b8b58e0SJonathan Lemon { 4886573d758SMatt Macy struct epoch_tracker et; 489446ccdd0SGleb Smirnoff #if defined(INVARIANTS) || defined(VIMAGE) 4909eb0e832SGleb Smirnoff struct inpcb *inp = tptoinpcb(tp); 4919b8b58e0SJonathan Lemon #endif 492446ccdd0SGleb Smirnoff bool progdrop, rv; 4939eb0e832SGleb Smirnoff 494446ccdd0SGleb Smirnoff INP_WLOCK_ASSERT(inp); 495446ccdd0SGleb Smirnoff 496446ccdd0SGleb Smirnoff TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); 4978840ae22SGleb Smirnoff CURVNET_SET(inp->inp_vnet); 4989b8b58e0SJonathan Lemon /* 499a4641f4eSPedro F. Giffuni * Persistence timer into zero window. 5009b8b58e0SJonathan Lemon * Force a byte to be output, if possible. 5019b8b58e0SJonathan Lemon */ 50278b50714SRobert Watson TCPSTAT_INC(tcps_persisttimeo); 5039b8b58e0SJonathan Lemon /* 5049b8b58e0SJonathan Lemon * Hack: if the peer is dead/unreachable, we do not 5059b8b58e0SJonathan Lemon * time out if the window is closed. After a full 5069b8b58e0SJonathan Lemon * backoff, drop the connection if the idle time 5079b8b58e0SJonathan Lemon * (no responses to probes) reaches the maximum 5089b8b58e0SJonathan Lemon * backoff that we would use if retransmitting. 50908af8aacSRandall Stewart * Also, drop the connection if we haven't been making 51008af8aacSRandall Stewart * progress. 5119b8b58e0SJonathan Lemon */ 51208af8aacSRandall Stewart progdrop = tcp_maxunacktime_check(tp); 51343b117f8SRichard Scheffenegger if (progdrop || (tp->t_rxtshift >= V_tcp_retries && 5146b0c5521SJohn Baldwin (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 51508af8aacSRandall Stewart ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff))) { 51608af8aacSRandall Stewart if (!progdrop) 51778b50714SRobert Watson TCPSTAT_INC(tcps_persistdrop); 518d1b07f36SRandall Stewart tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 519446ccdd0SGleb Smirnoff goto dropit; 5209b8b58e0SJonathan Lemon } 521322181c9SAndre Oppermann /* 522322181c9SAndre Oppermann * If the user has closed the socket then drop a persisting 523322181c9SAndre Oppermann * connection after a much reduced timeout. 
	 */
	if (tp->t_state > TCPS_CLOSE_WAIT &&
	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
		TCPSTAT_INC(tcps_persistdrop);
		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
		goto dropit;
	}
	tcp_setpersist(tp);
	tp->t_flags |= TF_FORCEDATA;
	NET_EPOCH_ENTER(et);
	if ((rv = tcp_output_locked(tp)))
		tp->t_flags &= ~TF_FORCEDATA;
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();

	return (rv);

dropit:
	NET_EPOCH_ENTER(et);
	tp = tcp_drop(tp, ETIMEDOUT);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();

	return (tp != NULL);
}

static bool
tcp_timer_rexmt(struct tcpcb *tp)
{
	struct epoch_tracker et;
	struct inpcb *inp = tptoinpcb(tp);
	int rexmt;
	bool isipv6, rv;

	INP_WLOCK_ASSERT(inp);

	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	CURVNET_SET(inp->inp_vnet);
	if (tp->t_fb->tfb_tcp_rexmit_tmr) {
		/* The stack has a timer action too. */
		(*tp->t_fb->tfb_tcp_rexmit_tmr)(tp);
	}
	/*
	 * Retransmission timer went off.  Message has not
	 * been acked within retransmit interval.  Back off
	 * to a longer retransmit interval and retransmit one segment.
	 *
	 * If we've either exceeded the maximum number of retransmissions,
	 * or we've gone long enough without making progress, then drop
	 * the session.
	 */
	if (++tp->t_rxtshift > V_tcp_retries || tcp_maxunacktime_check(tp)) {
		if (tp->t_rxtshift > V_tcp_retries)
			TCPSTAT_INC(tcps_timeoutdrop);
		tp->t_rxtshift = V_tcp_retries;
		tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
		NET_EPOCH_ENTER(et);
		tp = tcp_drop(tp, ETIMEDOUT);
		NET_EPOCH_EXIT(et);
		CURVNET_RESTORE();

		return (tp != NULL);
	}
	if (tp->t_state == TCPS_SYN_SENT) {
		/*
		 * If the SYN was retransmitted, indicate CWND to be
		 * limited to 1 segment in cc_conn_init().
		 */
		tp->snd_cwnd = 1;
	} else if (tp->t_rxtshift == 1) {
		/*
		 * first retransmit; record ssthresh and cwnd so they can
		 * be recovered if this turns out to be a "bad" retransmit.
		 * A retransmit is considered "bad" if an ACK for this
		 * segment is received within RTT/2 interval; the assumption
		 * here is that the ACK was already in flight.  See
		 * "On Estimating End-to-End Network Path Properties" by
		 * Allman and Paxson for more details.
		 */
		tp->snd_cwnd_prev = tp->snd_cwnd;
		tp->snd_ssthresh_prev = tp->snd_ssthresh;
		tp->snd_recover_prev = tp->snd_recover;
		if (IN_FASTRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASFRECOVERY;
		else
			tp->t_flags &= ~TF_WASFRECOVERY;
		if (IN_CONGRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASCRECOVERY;
		else
			tp->t_flags &= ~TF_WASCRECOVERY;
		if ((tp->t_flags & TF_RCVD_TSTMP) == 0)
			tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
		/*
		 * In the event that we've negotiated timestamps,
		 * badrxtwin will be set to the value that tcp_output()
		 * sets the retransmitted packet's to_tsval to.
		 */
		tp->t_flags |= TF_PREVVALID;
		tcp_resend_sackholes(tp);
	} else {
		tp->t_flags &= ~TF_PREVVALID;
		tcp_free_sackholes(tp);
	}
	TCPSTAT_INC(tcps_rexmttimeo);
	if ((tp->t_state == TCPS_SYN_SENT) ||
	    (tp->t_state == TCPS_SYN_RECEIVED))
		rexmt = tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift];
	else
		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
	TCPT_RANGESET(tp->t_rxtcur, rexmt,
	    tp->t_rttmin, TCPTV_REXMTMAX);

	/*
	 * We enter the path for PLMTUD if the connection is established or
	 * in the FIN_WAIT_1 state.  The reason for the latter is that if
	 * the amount of data we send is very small, we could send it in a
	 * couple of packets and proceed straight to FIN.  In that case we
	 * won't catch the ESTABLISHED state.
	 */
#ifdef INET6
	isipv6 = (inp->inp_vflag & INP_IPV6) ? true : false;
#else
	isipv6 = false;
#endif
	if (((V_tcp_pmtud_blackhole_detect == 1) ||
	    (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) ||
	    (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) &&
	    ((tp->t_state == TCPS_ESTABLISHED) ||
	    (tp->t_state == TCPS_FIN_WAIT_1))) {
		if (tp->t_rxtshift == 1) {
			/*
			 * We enter blackhole detection after the first
			 * unsuccessful timer based retransmission.
			 * Then we reduce the MSS up to two times, each
			 * candidate getting two retransmission tries.
			 * A candidate is only given its two tries if it
			 * actually reduces the MSS.
			 */
			tp->t_blackhole_enter = 2;
			tp->t_blackhole_exit = tp->t_blackhole_enter;
			if (isipv6) {
#ifdef INET6
				if (tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss)
					tp->t_blackhole_exit += 2;
				if (tp->t_maxseg > V_tcp_v6mssdflt &&
				    V_tcp_v6pmtud_blackhole_mss > V_tcp_v6mssdflt)
					tp->t_blackhole_exit += 2;
#endif
			} else {
#ifdef INET
				if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss)
					tp->t_blackhole_exit += 2;
				if (tp->t_maxseg > V_tcp_mssdflt &&
				    V_tcp_pmtud_blackhole_mss > V_tcp_mssdflt)
					tp->t_blackhole_exit += 2;
#endif
			}
		}
		if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
		    (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
		    (tp->t_rxtshift >= tp->t_blackhole_enter &&
		    tp->t_rxtshift < tp->t_blackhole_exit &&
		    (tp->t_rxtshift - tp->t_blackhole_enter) % 2 == 0)) {
			/*
			 * Enter Path MTU Black-hole Detection mechanism:
			 * - Disable Path MTU Discovery (IP "DF" bit).
			 * - Reduce MTU to lower value than what we
			 *   negotiated with peer.
			 */
			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
				/* Record that we may have found a black hole. */
				tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
				/* Keep track of previous MSS. */
				tp->t_pmtud_saved_maxseg = tp->t_maxseg;
			}

			/*
			 * Reduce the MSS to the blackhole value or to the
			 * default in an attempt to retransmit.
			 */
#ifdef INET6
			if (isipv6 &&
			    tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss &&
			    V_tcp_v6pmtud_blackhole_mss > V_tcp_v6mssdflt) {
				/* Use the sysctl tuneable blackhole MSS. */
				tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated);
			} else if (isipv6) {
				/* Use the default MSS. */
				tp->t_maxseg = V_tcp_v6mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch to
				 * minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
			}
#endif
#if defined(INET6) && defined(INET)
			else
#endif
#ifdef INET
			if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss &&
			    V_tcp_pmtud_blackhole_mss > V_tcp_mssdflt) {
				/* Use the sysctl tuneable blackhole MSS. */
				tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated);
			} else {
				/* Use the default MSS. */
				tp->t_maxseg = V_tcp_mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch to
				 * minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
			}
#endif
			/*
			 * Reset the slow-start flight size
			 * as it may depend on the new MSS.
			 */
			if (CC_ALGO(tp)->conn_init != NULL)
				CC_ALGO(tp)->conn_init(&tp->t_ccv);
		} else {
			/*
			 * If further retransmissions are still unsuccessful
			 * with a lowered MTU, maybe this isn't a blackhole and
			 * we restore the previous MSS and blackhole detection
			 * flags.
			 */
			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
			    (tp->t_rxtshift >= tp->t_blackhole_exit)) {
				tp->t_flags2 |= TF2_PLPMTU_PMTUD;
				tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
				tp->t_maxseg = tp->t_pmtud_saved_maxseg;
				TCPSTAT_INC(tcps_pmtud_blackhole_failed);
				/*
				 * Reset the slow-start flight size as it
				 * may depend on the new MSS.
				 */
				if (CC_ALGO(tp)->conn_init != NULL)
					CC_ALGO(tp)->conn_init(&tp->t_ccv);
			}
		}
	}

	/*
	 * Disable RFC1323 and SACK if we haven't got any response to
	 * our third SYN to work around some broken terminal servers
	 * (most of which have hopefully been retired) that have bad VJ
	 * header compression code which trashes TCP segments containing
	 * unknown-to-them TCP options.
	 */
	if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
	    (tp->t_rxtshift == 3))
		tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
	/*
	 * If we backed off this far, notify the L3 protocol that we're having
	 * connection problems.
	 */
	if (tp->t_rxtshift > TCP_RTT_INVALIDATE) {
#ifdef INET6
		if ((inp->inp_vflag & INP_IPV6) != 0)
			in6_losing(inp);
		else
#endif
			in_losing(inp);
	}
	tp->snd_nxt = tp->snd_una;
	tp->snd_recover = tp->snd_max;
	/*
	 * Force a segment to be sent.
	 */
	tp->t_flags |= TF_ACKNOW;
	/*
	 * If timing a segment in this window, stop the timer.
	 */
	tp->t_rtttime = 0;

	cc_cong_signal(tp, NULL, CC_RTO);
	NET_EPOCH_ENTER(et);
	rv = tcp_output_locked(tp);
	NET_EPOCH_EXIT(et);
	CURVNET_RESTORE();

	return (rv);
}

static void
tcp_bblog_timer(struct tcpcb *tp, tt_which which, tt_what what, uint32_t ticks)
{
	struct tcp_log_buffer *lgb;
	uint64_t ms;

	INP_WLOCK_ASSERT(tptoinpcb(tp));
	if (tcp_bblogging_on(tp))
		lgb = tcp_log_event(tp, NULL, NULL, NULL, TCP_LOG_RTO, 0, 0,
		    NULL, false, NULL, NULL, 0, NULL);
	else
		lgb = NULL;
	if (lgb != NULL) {
		lgb->tlb_flex1 = (what << 8) | which;
		if (what == TT_STARTING) {
			/* Convert ticks to ms and store it in tlb_flex2. */
			if (hz == 1000)
				lgb->tlb_flex2 = ticks;
			else {
				ms = (((uint64_t)ticks * 1000) + (hz - 1)) / hz;
				if (ms > UINT32_MAX)
					lgb->tlb_flex2 = UINT32_MAX;
				else
					lgb->tlb_flex2 = (uint32_t)ms;
			}
		}
	}
}

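/*
 * Return the index of the pending timer with the nearest deadline, or
 * TT_N if no timer is pending.  If 'precision' is not NULL, it is set to
 * the slack available for callout coalescing, i.e. how far past that
 * deadline the callout may fire without violating the precision of any
 * pending timer.
 */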
static inline tt_which
tcp_timer_next(struct tcpcb *tp, sbintime_t *precision)
{
	tt_which i, rv;
	sbintime_t after, before;

	for (i = 0, rv = TT_N, after = before = SBT_MAX; i < TT_N; i++) {
		if (tp->t_timers[i] < after) {
			after = tp->t_timers[i];
			rv = i;
		}
		before = MIN(before, tp->t_timers[i] + tp->t_precisions[i]);
	}
	if (precision != NULL)
		*precision = before - after;

	return (rv);
}

static void
tcp_timer_enter(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp = tptoinpcb(tp);
	sbintime_t precision;
	tt_which which;
	bool tp_valid;

	INP_WLOCK_ASSERT(inp);
	MPASS((curthread->td_pflags & TDP_INTCPCALLOUT) == 0);

	curthread->td_pflags |= TDP_INTCPCALLOUT;

	which = tcp_timer_next(tp, NULL);
	MPASS(which < TT_N);
	tp->t_timers[which] = SBT_MAX;
	tp->t_precisions[which] = 0;

	tcp_bblog_timer(tp, which, TT_PROCESSING, 0);
	tp_valid = tcp_timersw[which](tp);
	if (tp_valid) {
		tcp_bblog_timer(tp, which, TT_PROCESSED, 0);
		if ((which = tcp_timer_next(tp, &precision)) != TT_N) {
			callout_reset_sbt_on(&tp->t_callout,
			    tp->t_timers[which], precision, tcp_timer_enter,
			    tp, inp_to_cpuid(inp), C_ABSOLUTE);
		}
		INP_WUNLOCK(inp);
	}

	curthread->td_pflags &= ~TDP_INTCPCALLOUT;
}

/*
 * Activate or stop (delta == 0) a TCP timer.
 */
void
tcp_timer_activate(struct tcpcb *tp, tt_which which, u_int delta)
{
	struct inpcb *inp = tptoinpcb(tp);
	sbintime_t precision;
	tt_what what;

#ifdef TCP_OFFLOAD
	if (tp->t_flags & TF_TOE)
		return;
#endif

	INP_WLOCK_ASSERT(inp);

	if (delta > 0) {
		what = TT_STARTING;
		callout_when(tick_sbt * delta, 0, C_HARDCLOCK,
		    &tp->t_timers[which], &tp->t_precisions[which]);
	} else {
		what = TT_STOPPING;
		tp->t_timers[which] = SBT_MAX;
	}
	tcp_bblog_timer(tp, which, what, delta);

	if ((which = tcp_timer_next(tp, &precision)) != TT_N)
		callout_reset_sbt_on(&tp->t_callout, tp->t_timers[which],
		    precision, tcp_timer_enter, tp, inp_to_cpuid(inp),
		    C_ABSOLUTE);
	else
		callout_stop(&tp->t_callout);
}

bool
tcp_timer_active(struct tcpcb *tp, tt_which which)
{

	INP_WLOCK_ASSERT(tptoinpcb(tp));

	return (tp->t_timers[which] != SBT_MAX);
}

/*
 * Stop all timers associated with tcpcb.
 *
 * Called only on tcpcb destruction.  The tcpcb shall already be dropped from
 * the pcb lookup database and socket is not losing the last reference.
 *
 * XXXGL: unfortunately our callout(9) is not able to fully stop a locked
 * callout even when only two threads are involved: the callout itself and the
 * thread that does callout_stop().  See where softclock_call_cc() swaps the
 * callwheel lock to callout lock and then checks cc_exec_cancel().  This is
 * the race window.  If it happens, the tcp_timer_enter() won't be executed,
 * however pcb lock will be locked and released, hence we can't free memory.
 * Until callout(9) is improved, just keep retrying.  In my profiling I've seen
 * such event happening less than 1 time per hour with 20-30 Gbit/s of traffic.
 */
void
tcp_timer_stop(struct tcpcb *tp)
{
	struct inpcb *inp = tptoinpcb(tp);

	INP_WLOCK_ASSERT(inp);

	if (curthread->td_pflags & TDP_INTCPCALLOUT) {
		int stopped __diagused;

		stopped = callout_stop(&tp->t_callout);
		MPASS(stopped == 0);
	} else while (__predict_false(callout_stop(&tp->t_callout) == 0)) {
		INP_WUNLOCK(inp);
		kern_yield(PRI_UNCHANGED);
		INP_WLOCK(inp);
	}
}