1b2441318SGreg Kroah-Hartman // SPDX-License-Identifier: GPL-2.0 21da177e4SLinus Torvalds /* 31da177e4SLinus Torvalds * INET An implementation of the TCP/IP protocol suite for the LINUX 41da177e4SLinus Torvalds * operating system. INET is implemented using the BSD Socket 51da177e4SLinus Torvalds * interface as the means of communication with the user level. 61da177e4SLinus Torvalds * 71da177e4SLinus Torvalds * Implementation of the Transmission Control Protocol(TCP). 81da177e4SLinus Torvalds * 902c30a84SJesper Juhl * Authors: Ross Biro 101da177e4SLinus Torvalds * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 111da177e4SLinus Torvalds * Mark Evans, <evansmp@uhura.aston.ac.uk> 121da177e4SLinus Torvalds * Corey Minyard <wf-rch!minyard@relay.EU.net> 131da177e4SLinus Torvalds * Florian La Roche, <flla@stud.uni-sb.de> 141da177e4SLinus Torvalds * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> 151da177e4SLinus Torvalds * Linus Torvalds, <torvalds@cs.helsinki.fi> 161da177e4SLinus Torvalds * Alan Cox, <gw4pts@gw4pts.ampr.org> 171da177e4SLinus Torvalds * Matthew Dillon, <dillon@apollo.west.oic.com> 181da177e4SLinus Torvalds * Arnt Gulbrandsen, <agulbra@nvg.unit.no> 191da177e4SLinus Torvalds * Jorge Cwik, <jorge@laser.satlink.net> 201da177e4SLinus Torvalds */ 211da177e4SLinus Torvalds 221da177e4SLinus Torvalds /* 231da177e4SLinus Torvalds * Changes: 241da177e4SLinus Torvalds * Pedro Roque : Fast Retransmit/Recovery. 251da177e4SLinus Torvalds * Two receive queues. 261da177e4SLinus Torvalds * Retransmit queue handled by TCP. 271da177e4SLinus Torvalds * Better retransmit timer handling. 281da177e4SLinus Torvalds * New congestion avoidance. 291da177e4SLinus Torvalds * Header prediction. 301da177e4SLinus Torvalds * Variable renaming. 311da177e4SLinus Torvalds * 321da177e4SLinus Torvalds * Eric : Fast Retransmit. 331da177e4SLinus Torvalds * Randy Scott : MSS option defines. 341da177e4SLinus Torvalds * Eric Schenk : Fixes to slow start algorithm. 351da177e4SLinus Torvalds * Eric Schenk : Yet another double ACK bug. 361da177e4SLinus Torvalds * Eric Schenk : Delayed ACK bug fixes. 371da177e4SLinus Torvalds * Eric Schenk : Floyd style fast retrans war avoidance. 381da177e4SLinus Torvalds * David S. Miller : Don't allow zero congestion window. 391da177e4SLinus Torvalds * Eric Schenk : Fix retransmitter so that it sends 401da177e4SLinus Torvalds * next packet on ack of previous packet. 411da177e4SLinus Torvalds * Andi Kleen : Moved open_request checking here 421da177e4SLinus Torvalds * and process RSTs for open_requests. 431da177e4SLinus Torvalds * Andi Kleen : Better prune_queue, and other fixes. 44caa20d9aSStephen Hemminger * Andrey Savochkin: Fix RTT measurements in the presence of 451da177e4SLinus Torvalds * timestamps. 461da177e4SLinus Torvalds * Andrey Savochkin: Check sequence numbers correctly when 471da177e4SLinus Torvalds * removing SACKs due to in sequence incoming 481da177e4SLinus Torvalds * data segments. 491da177e4SLinus Torvalds * Andi Kleen: Make sure we never ack data there is not 501da177e4SLinus Torvalds * enough room for. Also make this condition 511da177e4SLinus Torvalds * a fatal error if it might still happen. 521da177e4SLinus Torvalds * Andi Kleen: Add tcp_measure_rcv_mss to make 531da177e4SLinus Torvalds * connections with MSS<min(MTU,ann. MSS) 541da177e4SLinus Torvalds * work without delayed acks. 551da177e4SLinus Torvalds * Andi Kleen: Process packets with PSH set in the 561da177e4SLinus Torvalds * fast path. 
571da177e4SLinus Torvalds * J Hadi Salim: ECN support 581da177e4SLinus Torvalds * Andrei Gurtov, 591da177e4SLinus Torvalds * Pasi Sarolahti, 601da177e4SLinus Torvalds * Panu Kuhlberg: Experimental audit of TCP (re)transmission 611da177e4SLinus Torvalds * engine. Lots of bugs are found. 621da177e4SLinus Torvalds * Pasi Sarolahti: F-RTO for dealing with spurious RTOs 631da177e4SLinus Torvalds */ 641da177e4SLinus Torvalds 65afd46503SJoe Perches #define pr_fmt(fmt) "TCP: " fmt 66afd46503SJoe Perches 671da177e4SLinus Torvalds #include <linux/mm.h> 685a0e3ad6STejun Heo #include <linux/slab.h> 691da177e4SLinus Torvalds #include <linux/module.h> 701da177e4SLinus Torvalds #include <linux/sysctl.h> 71a0bffffcSIlpo Järvinen #include <linux/kernel.h> 72ad971f61SEric Dumazet #include <linux/prefetch.h> 735ffc02a1SSatoru SATOH #include <net/dst.h> 741da177e4SLinus Torvalds #include <net/tcp.h> 751da177e4SLinus Torvalds #include <net/inet_common.h> 761da177e4SLinus Torvalds #include <linux/ipsec.h> 771da177e4SLinus Torvalds #include <asm/unaligned.h> 78e1c8a607SWillem de Bruijn #include <linux/errqueue.h> 795941521cSSong Liu #include <trace/events/tcp.h> 80494bc1d2SJakub Kicinski #include <linux/jump_label_ratelimit.h> 81c6345ce7SAmritha Nambiar #include <net/busy_poll.h> 821da177e4SLinus Torvalds 83ab32ea5dSBrian Haley int sysctl_tcp_max_orphans __read_mostly = NR_FILE; 841da177e4SLinus Torvalds 851da177e4SLinus Torvalds #define FLAG_DATA 0x01 /* Incoming frame contained data. */ 861da177e4SLinus Torvalds #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ 871da177e4SLinus Torvalds #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ 881da177e4SLinus Torvalds #define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */ 891da177e4SLinus Torvalds #define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ 901da177e4SLinus Torvalds #define FLAG_DATA_SACKED 0x20 /* New SACK. 
*/ 911da177e4SLinus Torvalds #define FLAG_ECE 0x40 /* ECE in this ACK */ 92291a00d1SYuchung Cheng #define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */ 9331770e34SFlorian Westphal #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ 94e33099f9SYuchung Cheng #define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ 952e605294SIlpo Järvinen #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ 96564262c1SRyousei Takano #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ 97df92c839SNeal Cardwell #define FLAG_SET_XMIT_TIMER 0x1000 /* Set TLP or RTO timer */ 98cadbd031SIlpo Järvinen #define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ 9912fb3dd9SEric Dumazet #define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ 100d0e1a1b5SEric Dumazet #define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */ 101eb36be0fSYuchung Cheng #define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */ 1021da177e4SLinus Torvalds 1031da177e4SLinus Torvalds #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) 1041da177e4SLinus Torvalds #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) 105d09b9e60SPriyaranjan Jha #define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK) 1061da177e4SLinus Torvalds #define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) 1071da177e4SLinus Torvalds 1081da177e4SLinus Torvalds #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) 109bdf1ee5dSIlpo Järvinen #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) 1101da177e4SLinus Torvalds 111e662ca40SYuchung Cheng #define REXMIT_NONE 0 /* no loss recovery to do */ 112e662ca40SYuchung Cheng #define REXMIT_LOST 1 /* retransmit packets marked lost */ 113e662ca40SYuchung Cheng #define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */ 114e662ca40SYuchung Cheng 1156dac1523SIlya Lesokhin #if IS_ENABLED(CONFIG_TLS_DEVICE) 116494bc1d2SJakub Kicinski static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ); 1176dac1523SIlya Lesokhin 1186dac1523SIlya Lesokhin void clean_acked_data_enable(struct inet_connection_sock *icsk, 1196dac1523SIlya Lesokhin void (*cad)(struct sock *sk, u32 ack_seq)) 1206dac1523SIlya Lesokhin { 1216dac1523SIlya Lesokhin icsk->icsk_clean_acked = cad; 1227b58139fSWillem de Bruijn static_branch_deferred_inc(&clean_acked_data_enabled); 1236dac1523SIlya Lesokhin } 1246dac1523SIlya Lesokhin EXPORT_SYMBOL_GPL(clean_acked_data_enable); 1256dac1523SIlya Lesokhin 1266dac1523SIlya Lesokhin void clean_acked_data_disable(struct inet_connection_sock *icsk) 1276dac1523SIlya Lesokhin { 128494bc1d2SJakub Kicinski static_branch_slow_dec_deferred(&clean_acked_data_enabled); 1296dac1523SIlya Lesokhin icsk->icsk_clean_acked = NULL; 1306dac1523SIlya Lesokhin } 1316dac1523SIlya Lesokhin EXPORT_SYMBOL_GPL(clean_acked_data_disable); 132494bc1d2SJakub Kicinski 133494bc1d2SJakub Kicinski void clean_acked_data_flush(void) 134494bc1d2SJakub Kicinski { 135494bc1d2SJakub Kicinski static_key_deferred_flush(&clean_acked_data_enabled); 136494bc1d2SJakub Kicinski } 137494bc1d2SJakub Kicinski EXPORT_SYMBOL_GPL(clean_acked_data_flush); 1386dac1523SIlya Lesokhin #endif 1396dac1523SIlya Lesokhin 1400b9aefeaSMarcelo Ricardo Leitner static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb, 1410b9aefeaSMarcelo Ricardo Leitner unsigned int len) 142dcb17d22SMarcelo Ricardo Leitner { 143dcb17d22SMarcelo Ricardo Leitner static 
bool __once __read_mostly; 144dcb17d22SMarcelo Ricardo Leitner 145dcb17d22SMarcelo Ricardo Leitner if (!__once) { 146dcb17d22SMarcelo Ricardo Leitner struct net_device *dev; 147dcb17d22SMarcelo Ricardo Leitner 148dcb17d22SMarcelo Ricardo Leitner __once = true; 149dcb17d22SMarcelo Ricardo Leitner 150dcb17d22SMarcelo Ricardo Leitner rcu_read_lock(); 151dcb17d22SMarcelo Ricardo Leitner dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif); 1520b9aefeaSMarcelo Ricardo Leitner if (!dev || len >= dev->mtu) 153dcb17d22SMarcelo Ricardo Leitner pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n", 154dcb17d22SMarcelo Ricardo Leitner dev ? dev->name : "Unknown driver"); 155dcb17d22SMarcelo Ricardo Leitner rcu_read_unlock(); 156dcb17d22SMarcelo Ricardo Leitner } 157dcb17d22SMarcelo Ricardo Leitner } 158dcb17d22SMarcelo Ricardo Leitner 1591da177e4SLinus Torvalds /* Adapt the MSS value used to make delayed ack decision to the 1601da177e4SLinus Torvalds * real world. 1611da177e4SLinus Torvalds */ 162056834d9SIlpo Järvinen static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) 1631da177e4SLinus Torvalds { 164463c84b9SArnaldo Carvalho de Melo struct inet_connection_sock *icsk = inet_csk(sk); 165463c84b9SArnaldo Carvalho de Melo const unsigned int lss = icsk->icsk_ack.last_seg_size; 166463c84b9SArnaldo Carvalho de Melo unsigned int len; 1671da177e4SLinus Torvalds 168463c84b9SArnaldo Carvalho de Melo icsk->icsk_ack.last_seg_size = 0; 1691da177e4SLinus Torvalds 1701da177e4SLinus Torvalds /* skb->len may jitter because of SACKs, even if peer 1711da177e4SLinus Torvalds * sends good full-sized frames. 1721da177e4SLinus Torvalds */ 173ff9b5e0fSHerbert Xu len = skb_shinfo(skb)->gso_size ? : skb->len; 174463c84b9SArnaldo Carvalho de Melo if (len >= icsk->icsk_ack.rcv_mss) { 175dcb17d22SMarcelo Ricardo Leitner icsk->icsk_ack.rcv_mss = min_t(unsigned int, len, 176dcb17d22SMarcelo Ricardo Leitner tcp_sk(sk)->advmss); 1770b9aefeaSMarcelo Ricardo Leitner /* Account for possibly-removed options */ 1780b9aefeaSMarcelo Ricardo Leitner if (unlikely(len > icsk->icsk_ack.rcv_mss + 1790b9aefeaSMarcelo Ricardo Leitner MAX_TCP_OPTION_SPACE)) 1800b9aefeaSMarcelo Ricardo Leitner tcp_gro_dev_warn(sk, skb, len); 1811da177e4SLinus Torvalds } else { 1821da177e4SLinus Torvalds /* Otherwise, we make more careful check taking into account, 1831da177e4SLinus Torvalds * that SACKs block is variable. 1841da177e4SLinus Torvalds * 1851da177e4SLinus Torvalds * "len" is invariant segment length, including TCP header. 1861da177e4SLinus Torvalds */ 1879c70220bSArnaldo Carvalho de Melo len += skb->data - skb_transport_header(skb); 188bee7ca9eSWilliam Allen Simpson if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) || 1891da177e4SLinus Torvalds /* If PSH is not set, packet should be 1901da177e4SLinus Torvalds * full sized, provided peer TCP is not badly broken. 1911da177e4SLinus Torvalds * This observation (if it is correct 8)) allows 1921da177e4SLinus Torvalds * to handle super-low mtu links fairly. 1931da177e4SLinus Torvalds */ 1941da177e4SLinus Torvalds (len >= TCP_MIN_MSS + sizeof(struct tcphdr) && 195aa8223c7SArnaldo Carvalho de Melo !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) { 1961da177e4SLinus Torvalds /* Subtract also invariant (if peer is RFC compliant), 1971da177e4SLinus Torvalds * tcp header plus fixed timestamp option length. 1981da177e4SLinus Torvalds * Resulting "len" is MSS free of SACK jitter. 
1991da177e4SLinus Torvalds */ 200463c84b9SArnaldo Carvalho de Melo len -= tcp_sk(sk)->tcp_header_len; 201463c84b9SArnaldo Carvalho de Melo icsk->icsk_ack.last_seg_size = len; 2021da177e4SLinus Torvalds if (len == lss) { 203463c84b9SArnaldo Carvalho de Melo icsk->icsk_ack.rcv_mss = len; 2041da177e4SLinus Torvalds return; 2051da177e4SLinus Torvalds } 2061da177e4SLinus Torvalds } 2071ef9696cSAlexey Kuznetsov if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) 2081ef9696cSAlexey Kuznetsov icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2; 209463c84b9SArnaldo Carvalho de Melo icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; 2101da177e4SLinus Torvalds } 2111da177e4SLinus Torvalds } 2121da177e4SLinus Torvalds 2139a9c9b51SEric Dumazet static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks) 2141da177e4SLinus Torvalds { 215463c84b9SArnaldo Carvalho de Melo struct inet_connection_sock *icsk = inet_csk(sk); 21695c96174SEric Dumazet unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); 2171da177e4SLinus Torvalds 2181da177e4SLinus Torvalds if (quickacks == 0) 2191da177e4SLinus Torvalds quickacks = 2; 2209a9c9b51SEric Dumazet quickacks = min(quickacks, max_quickacks); 221463c84b9SArnaldo Carvalho de Melo if (quickacks > icsk->icsk_ack.quick) 2229a9c9b51SEric Dumazet icsk->icsk_ack.quick = quickacks; 2231da177e4SLinus Torvalds } 2241da177e4SLinus Torvalds 225a0496ef2SYuchung Cheng void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) 2261da177e4SLinus Torvalds { 227463c84b9SArnaldo Carvalho de Melo struct inet_connection_sock *icsk = inet_csk(sk); 2289a9c9b51SEric Dumazet 2299a9c9b51SEric Dumazet tcp_incr_quickack(sk, max_quickacks); 23031954cd8SWei Wang inet_csk_exit_pingpong_mode(sk); 231463c84b9SArnaldo Carvalho de Melo icsk->icsk_ack.ato = TCP_ATO_MIN; 2321da177e4SLinus Torvalds } 233a0496ef2SYuchung Cheng EXPORT_SYMBOL(tcp_enter_quickack_mode); 2341da177e4SLinus Torvalds 2351da177e4SLinus Torvalds /* Send ACKs quickly, if "quick" count is not exhausted 2361da177e4SLinus Torvalds * and the session is not interactive. 2371da177e4SLinus Torvalds */ 2381da177e4SLinus Torvalds 2392251ae46SJon Maxwell static bool tcp_in_quickack_mode(struct sock *sk) 2401da177e4SLinus Torvalds { 241463c84b9SArnaldo Carvalho de Melo const struct inet_connection_sock *icsk = inet_csk(sk); 2422251ae46SJon Maxwell const struct dst_entry *dst = __sk_dst_get(sk); 243a2a385d6SEric Dumazet 2442251ae46SJon Maxwell return (dst && dst_metric(dst, RTAX_QUICKACK)) || 24531954cd8SWei Wang (icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk)); 2461da177e4SLinus Torvalds } 2471da177e4SLinus Torvalds 248735d3831SFlorian Westphal static void tcp_ecn_queue_cwr(struct tcp_sock *tp) 249bdf1ee5dSIlpo Järvinen { 250bdf1ee5dSIlpo Järvinen if (tp->ecn_flags & TCP_ECN_OK) 251bdf1ee5dSIlpo Järvinen tp->ecn_flags |= TCP_ECN_QUEUE_CWR; 252bdf1ee5dSIlpo Järvinen } 253bdf1ee5dSIlpo Järvinen 254fd2123a3SYuchung Cheng static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb) 255bdf1ee5dSIlpo Järvinen { 2569aee4000SLawrence Brakmo if (tcp_hdr(skb)->cwr) { 257fd2123a3SYuchung Cheng tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR; 2589aee4000SLawrence Brakmo 2599aee4000SLawrence Brakmo /* If the sender is telling us it has entered CWR, then its 2609aee4000SLawrence Brakmo * cwnd may be very low (even just 1 packet), so we should ACK 2619aee4000SLawrence Brakmo * immediately. 
2629aee4000SLawrence Brakmo */ 263fd2123a3SYuchung Cheng inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; 2649aee4000SLawrence Brakmo } 265bdf1ee5dSIlpo Järvinen } 266bdf1ee5dSIlpo Järvinen 267735d3831SFlorian Westphal static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) 268bdf1ee5dSIlpo Järvinen { 269af38d07eSNeal Cardwell tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; 270bdf1ee5dSIlpo Järvinen } 271bdf1ee5dSIlpo Järvinen 272f4c9f85fSYousuk Seung static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) 273bdf1ee5dSIlpo Järvinen { 274f4c9f85fSYousuk Seung struct tcp_sock *tp = tcp_sk(sk); 275f4c9f85fSYousuk Seung 276b82d1bb4SEric Dumazet switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) { 2777a269ffaSEric Dumazet case INET_ECN_NOT_ECT: 278bdf1ee5dSIlpo Järvinen /* Funny extension: if ECT is not set on a segment, 2797a269ffaSEric Dumazet * and we already seen ECT on a previous segment, 2807a269ffaSEric Dumazet * it is probably a retransmit. 2817a269ffaSEric Dumazet */ 2827a269ffaSEric Dumazet if (tp->ecn_flags & TCP_ECN_SEEN) 28315ecbe94SEric Dumazet tcp_enter_quickack_mode(sk, 2); 2847a269ffaSEric Dumazet break; 2857a269ffaSEric Dumazet case INET_ECN_CE: 286f4c9f85fSYousuk Seung if (tcp_ca_needs_ecn(sk)) 287f4c9f85fSYousuk Seung tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); 2889890092eSFlorian Westphal 289aae06bf5SEric Dumazet if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { 290aae06bf5SEric Dumazet /* Better not delay acks, sender can have a very low cwnd */ 29115ecbe94SEric Dumazet tcp_enter_quickack_mode(sk, 2); 2927a269ffaSEric Dumazet tp->ecn_flags |= TCP_ECN_DEMAND_CWR; 293aae06bf5SEric Dumazet } 2947a269ffaSEric Dumazet tp->ecn_flags |= TCP_ECN_SEEN; 2959890092eSFlorian Westphal break; 2969890092eSFlorian Westphal default: 297f4c9f85fSYousuk Seung if (tcp_ca_needs_ecn(sk)) 298f4c9f85fSYousuk Seung tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); 2999890092eSFlorian Westphal tp->ecn_flags |= TCP_ECN_SEEN; 3009890092eSFlorian Westphal break; 301bdf1ee5dSIlpo Järvinen } 302bdf1ee5dSIlpo Järvinen } 303bdf1ee5dSIlpo Järvinen 304f4c9f85fSYousuk Seung static void tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) 305735d3831SFlorian Westphal { 306f4c9f85fSYousuk Seung if (tcp_sk(sk)->ecn_flags & TCP_ECN_OK) 307f4c9f85fSYousuk Seung __tcp_ecn_check_ce(sk, skb); 308735d3831SFlorian Westphal } 309735d3831SFlorian Westphal 310735d3831SFlorian Westphal static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) 311bdf1ee5dSIlpo Järvinen { 312bdf1ee5dSIlpo Järvinen if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr)) 313bdf1ee5dSIlpo Järvinen tp->ecn_flags &= ~TCP_ECN_OK; 314bdf1ee5dSIlpo Järvinen } 315bdf1ee5dSIlpo Järvinen 316735d3831SFlorian Westphal static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) 317bdf1ee5dSIlpo Järvinen { 318bdf1ee5dSIlpo Järvinen if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr)) 319bdf1ee5dSIlpo Järvinen tp->ecn_flags &= ~TCP_ECN_OK; 320bdf1ee5dSIlpo Järvinen } 321bdf1ee5dSIlpo Järvinen 322735d3831SFlorian Westphal static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) 323bdf1ee5dSIlpo Järvinen { 324bdf1ee5dSIlpo Järvinen if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK)) 325a2a385d6SEric Dumazet return true; 326a2a385d6SEric Dumazet return false; 327bdf1ee5dSIlpo Järvinen } 328bdf1ee5dSIlpo Järvinen 3291da177e4SLinus Torvalds /* Buffer size and advertised window tuning. 3301da177e4SLinus Torvalds * 3311da177e4SLinus Torvalds * 1. 
Tuning sk->sk_sndbuf, when connection enters established state. 3321da177e4SLinus Torvalds */ 3331da177e4SLinus Torvalds 3346ae70532SEric Dumazet static void tcp_sndbuf_expand(struct sock *sk) 3351da177e4SLinus Torvalds { 3366ae70532SEric Dumazet const struct tcp_sock *tp = tcp_sk(sk); 33777bfc174SYuchung Cheng const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; 3386ae70532SEric Dumazet int sndmem, per_mss; 3396ae70532SEric Dumazet u32 nr_segs; 3401da177e4SLinus Torvalds 3416ae70532SEric Dumazet /* Worst case is non GSO/TSO : each frame consumes one skb 3426ae70532SEric Dumazet * and skb->head is kmalloced using power of two area of memory 3436ae70532SEric Dumazet */ 3446ae70532SEric Dumazet per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + 3456ae70532SEric Dumazet MAX_TCP_HEADER + 3466ae70532SEric Dumazet SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 3476ae70532SEric Dumazet 3486ae70532SEric Dumazet per_mss = roundup_pow_of_two(per_mss) + 3496ae70532SEric Dumazet SKB_DATA_ALIGN(sizeof(struct sk_buff)); 3506ae70532SEric Dumazet 3516ae70532SEric Dumazet nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd); 3526ae70532SEric Dumazet nr_segs = max_t(u32, nr_segs, tp->reordering + 1); 3536ae70532SEric Dumazet 3546ae70532SEric Dumazet /* Fast Recovery (RFC 5681 3.2) : 3556ae70532SEric Dumazet * Cubic needs 1.7 factor, rounded to 2 to include 356a9a08845SLinus Torvalds * extra cushion (application might react slowly to EPOLLOUT) 3576ae70532SEric Dumazet */ 35877bfc174SYuchung Cheng sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2; 35977bfc174SYuchung Cheng sndmem *= nr_segs * per_mss; 3606ae70532SEric Dumazet 36106a59ecbSEric Dumazet if (sk->sk_sndbuf < sndmem) 362*e292f05eSEric Dumazet WRITE_ONCE(sk->sk_sndbuf, 363*e292f05eSEric Dumazet min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2])); 3641da177e4SLinus Torvalds } 3651da177e4SLinus Torvalds 3661da177e4SLinus Torvalds /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) 3671da177e4SLinus Torvalds * 3681da177e4SLinus Torvalds * All tcp_full_space() is split to two parts: "network" buffer, allocated 3691da177e4SLinus Torvalds * forward and advertised in receiver window (tp->rcv_wnd) and 3701da177e4SLinus Torvalds * "application buffer", required to isolate scheduling/application 3711da177e4SLinus Torvalds * latencies from network. 3721da177e4SLinus Torvalds * window_clamp is maximal advertised window. It can be less than 3731da177e4SLinus Torvalds * tcp_full_space(), in this case tcp_full_space() - window_clamp 3741da177e4SLinus Torvalds * is reserved for "application" buffer. The less window_clamp is 3751da177e4SLinus Torvalds * the smoother our behaviour from viewpoint of network, but the lower 3761da177e4SLinus Torvalds * throughput and the higher sensitivity of the connection to losses. 8) 3771da177e4SLinus Torvalds * 3781da177e4SLinus Torvalds * rcv_ssthresh is more strict window_clamp used at "slow start" 3791da177e4SLinus Torvalds * phase to predict further behaviour of this connection. 3801da177e4SLinus Torvalds * It is used for two goals: 3811da177e4SLinus Torvalds * - to enforce header prediction at sender, even when application 3821da177e4SLinus Torvalds * requires some significant "application buffer". It is check #1. 3831da177e4SLinus Torvalds * - to prevent pruning of receive queue because of misprediction 3841da177e4SLinus Torvalds * of receiver window. Check #2. 
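 * (In the code below, check #1 corresponds to the "room > 0" test in
 * tcp_grow_window(), and check #2 to the truesize-vs-payload test there,
 * with __tcp_grow_window() as its slow path.)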
3851da177e4SLinus Torvalds * 3861da177e4SLinus Torvalds * The scheme does not work when sender sends good segments opening 387caa20d9aSStephen Hemminger * window and then starts to feed us spaghetti. But it should work 3881da177e4SLinus Torvalds * in common situations. Otherwise, we have to rely on queue collapsing. 3891da177e4SLinus Torvalds */ 3901da177e4SLinus Torvalds 3911da177e4SLinus Torvalds /* Slow part of check#2. */ 3929e412ba7SIlpo Järvinen static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) 3931da177e4SLinus Torvalds { 3949e412ba7SIlpo Järvinen struct tcp_sock *tp = tcp_sk(sk); 3951da177e4SLinus Torvalds /* Optimize this! */ 39694f0893eSEric Dumazet int truesize = tcp_win_from_space(sk, skb->truesize) >> 1; 397356d1833SEric Dumazet int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; 3981da177e4SLinus Torvalds 3991da177e4SLinus Torvalds while (tp->rcv_ssthresh <= window) { 4001da177e4SLinus Torvalds if (truesize <= skb->len) 401463c84b9SArnaldo Carvalho de Melo return 2 * inet_csk(sk)->icsk_ack.rcv_mss; 4021da177e4SLinus Torvalds 4031da177e4SLinus Torvalds truesize >>= 1; 4041da177e4SLinus Torvalds window >>= 1; 4051da177e4SLinus Torvalds } 4061da177e4SLinus Torvalds return 0; 4071da177e4SLinus Torvalds } 4081da177e4SLinus Torvalds 409cf533ea5SEric Dumazet static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) 4101da177e4SLinus Torvalds { 4119e412ba7SIlpo Järvinen struct tcp_sock *tp = tcp_sk(sk); 41250ce163aSEric Dumazet int room; 41350ce163aSEric Dumazet 41450ce163aSEric Dumazet room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh; 4159e412ba7SIlpo Järvinen 4161da177e4SLinus Torvalds /* Check #1 */ 41750ce163aSEric Dumazet if (room > 0 && !tcp_under_memory_pressure(sk)) { 4181da177e4SLinus Torvalds int incr; 4191da177e4SLinus Torvalds 4201da177e4SLinus Torvalds /* Check #2. Increase window, if skb with such overhead 4211da177e4SLinus Torvalds * will fit to rcvbuf in future. 4221da177e4SLinus Torvalds */ 42394f0893eSEric Dumazet if (tcp_win_from_space(sk, skb->truesize) <= skb->len) 4241da177e4SLinus Torvalds incr = 2 * tp->advmss; 4251da177e4SLinus Torvalds else 4269e412ba7SIlpo Järvinen incr = __tcp_grow_window(sk, skb); 4271da177e4SLinus Torvalds 4281da177e4SLinus Torvalds if (incr) { 4294d846f02SEric Dumazet incr = max_t(int, incr, 2 * skb->len); 43050ce163aSEric Dumazet tp->rcv_ssthresh += min(room, incr); 431463c84b9SArnaldo Carvalho de Melo inet_csk(sk)->icsk_ack.quick |= 1; 4321da177e4SLinus Torvalds } 4331da177e4SLinus Torvalds } 4341da177e4SLinus Torvalds } 4351da177e4SLinus Torvalds 436a337531bSYuchung Cheng /* 3. Try to fixup all. It is made immediately after connection enters 4371da177e4SLinus Torvalds * established state. 
4381da177e4SLinus Torvalds */ 43910467163SJerry Chu void tcp_init_buffer_space(struct sock *sk) 4401da177e4SLinus Torvalds { 4410c12654aSEric Dumazet int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win; 4421da177e4SLinus Torvalds struct tcp_sock *tp = tcp_sk(sk); 4431da177e4SLinus Torvalds int maxwin; 4441da177e4SLinus Torvalds 4451da177e4SLinus Torvalds if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) 4466ae70532SEric Dumazet tcp_sndbuf_expand(sk); 4471da177e4SLinus Torvalds 448041a14d2SYuchung Cheng tp->rcvq_space.space = min_t(u32, tp->rcv_wnd, TCP_INIT_CWND * tp->advmss); 4499a568de4SEric Dumazet tcp_mstamp_refresh(tp); 450645f4c6fSEric Dumazet tp->rcvq_space.time = tp->tcp_mstamp; 451b0983d3cSEric Dumazet tp->rcvq_space.seq = tp->copied_seq; 4521da177e4SLinus Torvalds 4531da177e4SLinus Torvalds maxwin = tcp_full_space(sk); 4541da177e4SLinus Torvalds 4551da177e4SLinus Torvalds if (tp->window_clamp >= maxwin) { 4561da177e4SLinus Torvalds tp->window_clamp = maxwin; 4571da177e4SLinus Torvalds 4580c12654aSEric Dumazet if (tcp_app_win && maxwin > 4 * tp->advmss) 4591da177e4SLinus Torvalds tp->window_clamp = max(maxwin - 4600c12654aSEric Dumazet (maxwin >> tcp_app_win), 4611da177e4SLinus Torvalds 4 * tp->advmss); 4621da177e4SLinus Torvalds } 4631da177e4SLinus Torvalds 4641da177e4SLinus Torvalds /* Force reservation of one segment. */ 4650c12654aSEric Dumazet if (tcp_app_win && 4661da177e4SLinus Torvalds tp->window_clamp > 2 * tp->advmss && 4671da177e4SLinus Torvalds tp->window_clamp + tp->advmss > maxwin) 4681da177e4SLinus Torvalds tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss); 4691da177e4SLinus Torvalds 4701da177e4SLinus Torvalds tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); 471c2203cf7SEric Dumazet tp->snd_cwnd_stamp = tcp_jiffies32; 4721da177e4SLinus Torvalds } 4731da177e4SLinus Torvalds 474a337531bSYuchung Cheng /* 4. Recalculate window clamp after socket hit its memory bounds. */ 4759e412ba7SIlpo Järvinen static void tcp_clamp_window(struct sock *sk) 4761da177e4SLinus Torvalds { 4779e412ba7SIlpo Järvinen struct tcp_sock *tp = tcp_sk(sk); 4786687e988SArnaldo Carvalho de Melo struct inet_connection_sock *icsk = inet_csk(sk); 479356d1833SEric Dumazet struct net *net = sock_net(sk); 4801da177e4SLinus Torvalds 4816687e988SArnaldo Carvalho de Melo icsk->icsk_ack.quick = 0; 4821da177e4SLinus Torvalds 483356d1833SEric Dumazet if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] && 4841da177e4SLinus Torvalds !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && 485b8da51ebSEric Dumazet !tcp_under_memory_pressure(sk) && 486180d8cd9SGlauber Costa sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { 487ebb3b78dSEric Dumazet WRITE_ONCE(sk->sk_rcvbuf, 488ebb3b78dSEric Dumazet min(atomic_read(&sk->sk_rmem_alloc), 489ebb3b78dSEric Dumazet net->ipv4.sysctl_tcp_rmem[2])); 4901da177e4SLinus Torvalds } 491326f36e9SJohn Heffner if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) 4921da177e4SLinus Torvalds tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); 4931da177e4SLinus Torvalds } 4941da177e4SLinus Torvalds 49540efc6faSStephen Hemminger /* Initialize RCV_MSS value. 49640efc6faSStephen Hemminger * RCV_MSS is an our guess about MSS used by the peer. 49740efc6faSStephen Hemminger * We haven't any direct information about the MSS. 49840efc6faSStephen Hemminger * It's better to underestimate the RCV_MSS rather than overestimate. 49940efc6faSStephen Hemminger * Overestimations make us ACKing less frequently than needed. 
50040efc6faSStephen Hemminger * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss(). 50140efc6faSStephen Hemminger */ 50240efc6faSStephen Hemminger void tcp_initialize_rcv_mss(struct sock *sk) 50340efc6faSStephen Hemminger { 504cf533ea5SEric Dumazet const struct tcp_sock *tp = tcp_sk(sk); 50540efc6faSStephen Hemminger unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); 50640efc6faSStephen Hemminger 50740efc6faSStephen Hemminger hint = min(hint, tp->rcv_wnd / 2); 508bee7ca9eSWilliam Allen Simpson hint = min(hint, TCP_MSS_DEFAULT); 50940efc6faSStephen Hemminger hint = max(hint, TCP_MIN_MSS); 51040efc6faSStephen Hemminger 51140efc6faSStephen Hemminger inet_csk(sk)->icsk_ack.rcv_mss = hint; 51240efc6faSStephen Hemminger } 5134bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_initialize_rcv_mss); 51440efc6faSStephen Hemminger 5151da177e4SLinus Torvalds /* Receiver "autotuning" code. 5161da177e4SLinus Torvalds * 5171da177e4SLinus Torvalds * The algorithm for RTT estimation w/o timestamps is based on 5181da177e4SLinus Torvalds * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL. 519631dd1a8SJustin P. Mattock * <http://public.lanl.gov/radiant/pubs.html#DRS> 5201da177e4SLinus Torvalds * 5211da177e4SLinus Torvalds * More detail on this code can be found at 522631dd1a8SJustin P. Mattock * <http://staff.psc.edu/jheffner/>, 5231da177e4SLinus Torvalds * though this reference is out of date. A new paper 5241da177e4SLinus Torvalds * is pending. 5251da177e4SLinus Torvalds */ 5261da177e4SLinus Torvalds static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) 5271da177e4SLinus Torvalds { 528645f4c6fSEric Dumazet u32 new_sample = tp->rcv_rtt_est.rtt_us; 5291da177e4SLinus Torvalds long m = sample; 5301da177e4SLinus Torvalds 5311da177e4SLinus Torvalds if (new_sample != 0) { 5321da177e4SLinus Torvalds /* If we sample in larger samples in the non-timestamp 5331da177e4SLinus Torvalds * case, we could grossly overestimate the RTT especially 5341da177e4SLinus Torvalds * with chatty applications or bulk transfer apps which 5351da177e4SLinus Torvalds * are stalled on filesystem I/O. 5361da177e4SLinus Torvalds * 5371da177e4SLinus Torvalds * Also, since we are only going for a minimum in the 53831f34269SStephen Hemminger * non-timestamp case, we do not smooth things out 539caa20d9aSStephen Hemminger * else with timestamps disabled convergence takes too 5401da177e4SLinus Torvalds * long. 5411da177e4SLinus Torvalds */ 5421da177e4SLinus Torvalds if (!win_dep) { 5431da177e4SLinus Torvalds m -= (new_sample >> 3); 5441da177e4SLinus Torvalds new_sample += m; 54518a223e0SNeal Cardwell } else { 54618a223e0SNeal Cardwell m <<= 3; 54718a223e0SNeal Cardwell if (m < new_sample) 54818a223e0SNeal Cardwell new_sample = m; 54918a223e0SNeal Cardwell } 5501da177e4SLinus Torvalds } else { 551caa20d9aSStephen Hemminger /* No previous measure. 
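 * Seed the estimator with this first sample. Like the smoothed value
 * above, the estimate is kept left-shifted by 3 (stored x8), which
 * readers undo with ">> 3".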
*/ 5521da177e4SLinus Torvalds new_sample = m << 3; 5531da177e4SLinus Torvalds } 5541da177e4SLinus Torvalds 555645f4c6fSEric Dumazet tp->rcv_rtt_est.rtt_us = new_sample; 5561da177e4SLinus Torvalds } 5571da177e4SLinus Torvalds 5581da177e4SLinus Torvalds static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) 5591da177e4SLinus Torvalds { 560645f4c6fSEric Dumazet u32 delta_us; 561645f4c6fSEric Dumazet 5629a568de4SEric Dumazet if (tp->rcv_rtt_est.time == 0) 5631da177e4SLinus Torvalds goto new_measure; 5641da177e4SLinus Torvalds if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) 5651da177e4SLinus Torvalds return; 5669a568de4SEric Dumazet delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time); 5679ee11bd0SWei Wang if (!delta_us) 5689ee11bd0SWei Wang delta_us = 1; 569645f4c6fSEric Dumazet tcp_rcv_rtt_update(tp, delta_us, 1); 5701da177e4SLinus Torvalds 5711da177e4SLinus Torvalds new_measure: 5721da177e4SLinus Torvalds tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd; 573645f4c6fSEric Dumazet tp->rcv_rtt_est.time = tp->tcp_mstamp; 5741da177e4SLinus Torvalds } 5751da177e4SLinus Torvalds 576056834d9SIlpo Järvinen static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, 577056834d9SIlpo Järvinen const struct sk_buff *skb) 5781da177e4SLinus Torvalds { 579463c84b9SArnaldo Carvalho de Melo struct tcp_sock *tp = tcp_sk(sk); 5809a568de4SEric Dumazet 5813f6c65d6SWei Wang if (tp->rx_opt.rcv_tsecr == tp->rcv_rtt_last_tsecr) 5823f6c65d6SWei Wang return; 5833f6c65d6SWei Wang tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr; 5843f6c65d6SWei Wang 5853f6c65d6SWei Wang if (TCP_SKB_CB(skb)->end_seq - 5863f6c65d6SWei Wang TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) { 5879a568de4SEric Dumazet u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; 5889ee11bd0SWei Wang u32 delta_us; 5899a568de4SEric Dumazet 5909efdda4eSEric Dumazet if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { 5919ee11bd0SWei Wang if (!delta) 5929ee11bd0SWei Wang delta = 1; 5939ee11bd0SWei Wang delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ); 5949a568de4SEric Dumazet tcp_rcv_rtt_update(tp, delta_us, 0); 5959a568de4SEric Dumazet } 5961da177e4SLinus Torvalds } 5979efdda4eSEric Dumazet } 5981da177e4SLinus Torvalds 5991da177e4SLinus Torvalds /* 6001da177e4SLinus Torvalds * This function should be called every time data is copied to user space. 6011da177e4SLinus Torvalds * It calculates the appropriate TCP receive buffer space. 
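 * Roughly: measure how many bytes the application consumed over the last
 * RTT and provision about twice that, plus a small cushion, for the next
 * window, capped by sysctl_tcp_rmem[2] (illustrative summary only; the
 * precise sizing math is in the body below).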
6021da177e4SLinus Torvalds */ 6031da177e4SLinus Torvalds void tcp_rcv_space_adjust(struct sock *sk) 6041da177e4SLinus Torvalds { 6051da177e4SLinus Torvalds struct tcp_sock *tp = tcp_sk(sk); 606607065baSEric Dumazet u32 copied; 6071da177e4SLinus Torvalds int time; 6081da177e4SLinus Torvalds 6096163849dSYafang Shao trace_tcp_rcv_space_adjust(sk); 6106163849dSYafang Shao 61186323850SEric Dumazet tcp_mstamp_refresh(tp); 6129a568de4SEric Dumazet time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time); 613645f4c6fSEric Dumazet if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0) 6141da177e4SLinus Torvalds return; 6151da177e4SLinus Torvalds 616b0983d3cSEric Dumazet /* Number of bytes copied to user in last RTT */ 617b0983d3cSEric Dumazet copied = tp->copied_seq - tp->rcvq_space.seq; 618b0983d3cSEric Dumazet if (copied <= tp->rcvq_space.space) 619b0983d3cSEric Dumazet goto new_measure; 6201da177e4SLinus Torvalds 621b0983d3cSEric Dumazet /* A bit of theory : 622b0983d3cSEric Dumazet * copied = bytes received in previous RTT, our base window 623b0983d3cSEric Dumazet * To cope with packet losses, we need a 2x factor 624b0983d3cSEric Dumazet * To cope with slow start, and sender growing its cwin by 100 % 625b0983d3cSEric Dumazet * every RTT, we need a 4x factor, because the ACK we are sending 626b0983d3cSEric Dumazet * now is for the next RTT, not the current one : 627b0983d3cSEric Dumazet * <prev RTT . ><current RTT .. ><next RTT .... > 628b0983d3cSEric Dumazet */ 6291da177e4SLinus Torvalds 6304540c0cfSEric Dumazet if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf && 6316fcf9412SJohn Heffner !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { 632607065baSEric Dumazet int rcvmem, rcvbuf; 633c3916ad9SEric Dumazet u64 rcvwin, grow; 6341da177e4SLinus Torvalds 635b0983d3cSEric Dumazet /* minimal window to cope with packet losses, assuming 636b0983d3cSEric Dumazet * steady state. Add some cushion because of small variations. 6371da177e4SLinus Torvalds */ 638607065baSEric Dumazet rcvwin = ((u64)copied << 1) + 16 * tp->advmss; 639b0983d3cSEric Dumazet 640c3916ad9SEric Dumazet /* Accommodate for sender rate increase (eg. slow start) */ 641c3916ad9SEric Dumazet grow = rcvwin * (copied - tp->rcvq_space.space); 642c3916ad9SEric Dumazet do_div(grow, tp->rcvq_space.space); 643c3916ad9SEric Dumazet rcvwin += (grow << 1); 644b0983d3cSEric Dumazet 64587fb4b7bSEric Dumazet rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); 64694f0893eSEric Dumazet while (tcp_win_from_space(sk, rcvmem) < tp->advmss) 6471da177e4SLinus Torvalds rcvmem += 128; 648b0983d3cSEric Dumazet 649607065baSEric Dumazet do_div(rcvwin, tp->advmss); 650607065baSEric Dumazet rcvbuf = min_t(u64, rcvwin * rcvmem, 651356d1833SEric Dumazet sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); 652b0983d3cSEric Dumazet if (rcvbuf > sk->sk_rcvbuf) { 653ebb3b78dSEric Dumazet WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); 6541da177e4SLinus Torvalds 6551da177e4SLinus Torvalds /* Make the window clamp follow along. 
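 * (Illustrative numbers only: with the default tcp_adv_win_scale of 1,
 * tcp_win_from_space() reserves half of the buffer for overhead, so an
 * rcvbuf of 1 MB here yields a window_clamp of about 512 KB.)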
*/ 65602db5571SEric Dumazet tp->window_clamp = tcp_win_from_space(sk, rcvbuf); 6571da177e4SLinus Torvalds } 6581da177e4SLinus Torvalds } 659b0983d3cSEric Dumazet tp->rcvq_space.space = copied; 6601da177e4SLinus Torvalds 6611da177e4SLinus Torvalds new_measure: 6621da177e4SLinus Torvalds tp->rcvq_space.seq = tp->copied_seq; 663645f4c6fSEric Dumazet tp->rcvq_space.time = tp->tcp_mstamp; 6641da177e4SLinus Torvalds } 6651da177e4SLinus Torvalds 6661da177e4SLinus Torvalds /* There is something which you must keep in mind when you analyze the 6671da177e4SLinus Torvalds * behavior of the tp->ato delayed ack timeout interval. When a 6681da177e4SLinus Torvalds * connection starts up, we want to ack as quickly as possible. The 6691da177e4SLinus Torvalds * problem is that "good" TCP's do slow start at the beginning of data 6701da177e4SLinus Torvalds * transmission. The means that until we send the first few ACK's the 6711da177e4SLinus Torvalds * sender will sit on his end and only queue most of his data, because 6721da177e4SLinus Torvalds * he can only send snd_cwnd unacked packets at any given time. For 6731da177e4SLinus Torvalds * each ACK we send, he increments snd_cwnd and transmits more of his 6741da177e4SLinus Torvalds * queue. -DaveM 6751da177e4SLinus Torvalds */ 6769e412ba7SIlpo Järvinen static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) 6771da177e4SLinus Torvalds { 6789e412ba7SIlpo Järvinen struct tcp_sock *tp = tcp_sk(sk); 679463c84b9SArnaldo Carvalho de Melo struct inet_connection_sock *icsk = inet_csk(sk); 6801da177e4SLinus Torvalds u32 now; 6811da177e4SLinus Torvalds 682463c84b9SArnaldo Carvalho de Melo inet_csk_schedule_ack(sk); 6831da177e4SLinus Torvalds 684463c84b9SArnaldo Carvalho de Melo tcp_measure_rcv_mss(sk, skb); 6851da177e4SLinus Torvalds 6861da177e4SLinus Torvalds tcp_rcv_rtt_measure(tp); 6871da177e4SLinus Torvalds 68870eabf0eSEric Dumazet now = tcp_jiffies32; 6891da177e4SLinus Torvalds 690463c84b9SArnaldo Carvalho de Melo if (!icsk->icsk_ack.ato) { 6911da177e4SLinus Torvalds /* The _first_ data packet received, initialize 6921da177e4SLinus Torvalds * delayed ACK engine. 6931da177e4SLinus Torvalds */ 6949a9c9b51SEric Dumazet tcp_incr_quickack(sk, TCP_MAX_QUICKACKS); 695463c84b9SArnaldo Carvalho de Melo icsk->icsk_ack.ato = TCP_ATO_MIN; 6961da177e4SLinus Torvalds } else { 697463c84b9SArnaldo Carvalho de Melo int m = now - icsk->icsk_ack.lrcvtime; 6981da177e4SLinus Torvalds 6991da177e4SLinus Torvalds if (m <= TCP_ATO_MIN / 2) { 7001da177e4SLinus Torvalds /* The fastest case is the first. */ 701463c84b9SArnaldo Carvalho de Melo icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2; 702463c84b9SArnaldo Carvalho de Melo } else if (m < icsk->icsk_ack.ato) { 703463c84b9SArnaldo Carvalho de Melo icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m; 704463c84b9SArnaldo Carvalho de Melo if (icsk->icsk_ack.ato > icsk->icsk_rto) 705463c84b9SArnaldo Carvalho de Melo icsk->icsk_ack.ato = icsk->icsk_rto; 706463c84b9SArnaldo Carvalho de Melo } else if (m > icsk->icsk_rto) { 707caa20d9aSStephen Hemminger /* Too long gap. Apparently sender failed to 7081da177e4SLinus Torvalds * restart window, so that we send ACKs quickly. 
7091da177e4SLinus Torvalds */ 7109a9c9b51SEric Dumazet tcp_incr_quickack(sk, TCP_MAX_QUICKACKS); 7113ab224beSHideo Aoki sk_mem_reclaim(sk); 7121da177e4SLinus Torvalds } 7131da177e4SLinus Torvalds } 714463c84b9SArnaldo Carvalho de Melo icsk->icsk_ack.lrcvtime = now; 7151da177e4SLinus Torvalds 716f4c9f85fSYousuk Seung tcp_ecn_check_ce(sk, skb); 7171da177e4SLinus Torvalds 7181da177e4SLinus Torvalds if (skb->len >= 128) 7199e412ba7SIlpo Järvinen tcp_grow_window(sk, skb); 7201da177e4SLinus Torvalds } 7211da177e4SLinus Torvalds 7221da177e4SLinus Torvalds /* Called to compute a smoothed rtt estimate. The data fed to this 7231da177e4SLinus Torvalds * routine either comes from timestamps, or from segments that were 7241da177e4SLinus Torvalds * known _not_ to have been retransmitted [see Karn/Partridge 7251da177e4SLinus Torvalds * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 7261da177e4SLinus Torvalds * piece by Van Jacobson. 7271da177e4SLinus Torvalds * NOTE: the next three routines used to be one big routine. 7281da177e4SLinus Torvalds * To save cycles in the RFC 1323 implementation it was better to break 7291da177e4SLinus Torvalds * it up into three procedures. -- erics 7301da177e4SLinus Torvalds */ 731740b0f18SEric Dumazet static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) 7321da177e4SLinus Torvalds { 7336687e988SArnaldo Carvalho de Melo struct tcp_sock *tp = tcp_sk(sk); 734740b0f18SEric Dumazet long m = mrtt_us; /* RTT */ 735740b0f18SEric Dumazet u32 srtt = tp->srtt_us; 7361da177e4SLinus Torvalds 7371da177e4SLinus Torvalds /* The following amusing code comes from Jacobson's 7381da177e4SLinus Torvalds * article in SIGCOMM '88. Note that rtt and mdev 7391da177e4SLinus Torvalds * are scaled versions of rtt and mean deviation. 7401da177e4SLinus Torvalds * This is designed to be as fast as possible 7411da177e4SLinus Torvalds * m stands for "measurement". 7421da177e4SLinus Torvalds * 7431da177e4SLinus Torvalds * On a 1990 paper the rto value is changed to: 7441da177e4SLinus Torvalds * RTO = rtt + 4 * mdev 7451da177e4SLinus Torvalds * 7461da177e4SLinus Torvalds * Funny. This algorithm seems to be very broken. 7471da177e4SLinus Torvalds * These formulae increase RTO, when it should be decreased, increase 74831f34269SStephen Hemminger * too slowly, when it should be increased quickly, decrease too quickly 7491da177e4SLinus Torvalds * etc. I guess in BSD RTO takes ONE value, so that it is absolutely 7501da177e4SLinus Torvalds * does not matter how to _calculate_ it. Seems, it was trap 7511da177e4SLinus Torvalds * that VJ failed to avoid. 8) 7521da177e4SLinus Torvalds */ 7534a5ab4e2SEric Dumazet if (srtt != 0) { 7544a5ab4e2SEric Dumazet m -= (srtt >> 3); /* m is now error in rtt est */ 7554a5ab4e2SEric Dumazet srtt += m; /* rtt = 7/8 rtt + 1/8 new */ 7561da177e4SLinus Torvalds if (m < 0) { 7571da177e4SLinus Torvalds m = -m; /* m is now abs(error) */ 758740b0f18SEric Dumazet m -= (tp->mdev_us >> 2); /* similar update on mdev */ 7591da177e4SLinus Torvalds /* This is similar to one of Eifel findings. 7601da177e4SLinus Torvalds * Eifel blocks mdev updates when rtt decreases. 7611da177e4SLinus Torvalds * This solution is a bit different: we use finer gain 7621da177e4SLinus Torvalds * for mdev in this case (alpha*beta). 7631da177e4SLinus Torvalds * Like Eifel it also prevents growth of rto, 7641da177e4SLinus Torvalds * but also it limits too fast rto decreases, 7651da177e4SLinus Torvalds * happening in pure Eifel. 
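 * For reference (summary only): once the fixed-point scaling is factored
 * out (srtt stored x8, mdev stored x4), the updates below are Jacobson's
 * EWMAs srtt <- 7/8*srtt + 1/8*m and mdev <- 3/4*mdev + 1/4*|m - srtt|,
 * i.e. alpha = 1/8, beta = 1/4 as in RFC 6298, plus the finer mdev gain
 * described above when the sample drops below srtt.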
7661da177e4SLinus Torvalds */ 7671da177e4SLinus Torvalds if (m > 0) 7681da177e4SLinus Torvalds m >>= 3; 7691da177e4SLinus Torvalds } else { 770740b0f18SEric Dumazet m -= (tp->mdev_us >> 2); /* similar update on mdev */ 7711da177e4SLinus Torvalds } 772740b0f18SEric Dumazet tp->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ 773740b0f18SEric Dumazet if (tp->mdev_us > tp->mdev_max_us) { 774740b0f18SEric Dumazet tp->mdev_max_us = tp->mdev_us; 775740b0f18SEric Dumazet if (tp->mdev_max_us > tp->rttvar_us) 776740b0f18SEric Dumazet tp->rttvar_us = tp->mdev_max_us; 7771da177e4SLinus Torvalds } 7781da177e4SLinus Torvalds if (after(tp->snd_una, tp->rtt_seq)) { 779740b0f18SEric Dumazet if (tp->mdev_max_us < tp->rttvar_us) 780740b0f18SEric Dumazet tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2; 7811da177e4SLinus Torvalds tp->rtt_seq = tp->snd_nxt; 782740b0f18SEric Dumazet tp->mdev_max_us = tcp_rto_min_us(sk); 78323729ff2SStanislav Fomichev 78423729ff2SStanislav Fomichev tcp_bpf_rtt(sk); 7851da177e4SLinus Torvalds } 7861da177e4SLinus Torvalds } else { 7871da177e4SLinus Torvalds /* no previous measure. */ 7884a5ab4e2SEric Dumazet srtt = m << 3; /* take the measured time to be rtt */ 789740b0f18SEric Dumazet tp->mdev_us = m << 1; /* make sure rto = 3*rtt */ 790740b0f18SEric Dumazet tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk)); 791740b0f18SEric Dumazet tp->mdev_max_us = tp->rttvar_us; 7921da177e4SLinus Torvalds tp->rtt_seq = tp->snd_nxt; 79323729ff2SStanislav Fomichev 79423729ff2SStanislav Fomichev tcp_bpf_rtt(sk); 7951da177e4SLinus Torvalds } 796740b0f18SEric Dumazet tp->srtt_us = max(1U, srtt); 7971da177e4SLinus Torvalds } 7981da177e4SLinus Torvalds 79995bd09ebSEric Dumazet static void tcp_update_pacing_rate(struct sock *sk) 80095bd09ebSEric Dumazet { 80195bd09ebSEric Dumazet const struct tcp_sock *tp = tcp_sk(sk); 80295bd09ebSEric Dumazet u64 rate; 80395bd09ebSEric Dumazet 80495bd09ebSEric Dumazet /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ 80543e122b0SEric Dumazet rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3); 80643e122b0SEric Dumazet 80743e122b0SEric Dumazet /* current rate is (cwnd * mss) / srtt 80843e122b0SEric Dumazet * In Slow Start [1], set sk_pacing_rate to 200 % the current rate. 80943e122b0SEric Dumazet * In Congestion Avoidance phase, set it to 120 % the current rate. 81043e122b0SEric Dumazet * 81143e122b0SEric Dumazet * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh) 81243e122b0SEric Dumazet * If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching 81343e122b0SEric Dumazet * end of slow start and should slow down. 81443e122b0SEric Dumazet */ 81543e122b0SEric Dumazet if (tp->snd_cwnd < tp->snd_ssthresh / 2) 81623a7102aSEric Dumazet rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio; 81743e122b0SEric Dumazet else 818c26e91f8SEric Dumazet rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio; 81995bd09ebSEric Dumazet 82095bd09ebSEric Dumazet rate *= max(tp->snd_cwnd, tp->packets_out); 82195bd09ebSEric Dumazet 822740b0f18SEric Dumazet if (likely(tp->srtt_us)) 823740b0f18SEric Dumazet do_div(rate, tp->srtt_us); 82495bd09ebSEric Dumazet 825a9da6f29SMark Rutland /* WRITE_ONCE() is needed because sch_fq fetches sk_pacing_rate 826ba537427SEric Dumazet * without any lock. We want to make sure compiler wont store 827ba537427SEric Dumazet * intermediate values in this location. 
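 * (Illustrative magnitude only: mss_cache = 1448, snd_cwnd = 10 and
 * srtt = 20 ms give a base rate of ~724 kB/s, stored here as ~1.45 MB/s
 * during slow start (200 %) or ~869 kB/s afterwards (120 %), capped by
 * sk_max_pacing_rate.)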
828ba537427SEric Dumazet */ 829a9da6f29SMark Rutland WRITE_ONCE(sk->sk_pacing_rate, min_t(u64, rate, 830a9da6f29SMark Rutland sk->sk_max_pacing_rate)); 83195bd09ebSEric Dumazet } 83295bd09ebSEric Dumazet 8331da177e4SLinus Torvalds /* Calculate rto without backoff. This is the second half of Van Jacobson's 8341da177e4SLinus Torvalds * routine referred to above. 8351da177e4SLinus Torvalds */ 836f7e56a76Sstephen hemminger static void tcp_set_rto(struct sock *sk) 8371da177e4SLinus Torvalds { 838463c84b9SArnaldo Carvalho de Melo const struct tcp_sock *tp = tcp_sk(sk); 8391da177e4SLinus Torvalds /* Old crap is replaced with new one. 8) 8401da177e4SLinus Torvalds * 8411da177e4SLinus Torvalds * More seriously: 8421da177e4SLinus Torvalds * 1. If rtt variance happened to be less 50msec, it is hallucination. 8431da177e4SLinus Torvalds * It cannot be less due to utterly erratic ACK generation made 8441da177e4SLinus Torvalds * at least by solaris and freebsd. "Erratic ACKs" has _nothing_ 8451da177e4SLinus Torvalds * to do with delayed acks, because at cwnd>2 true delack timeout 8461da177e4SLinus Torvalds * is invisible. Actually, Linux-2.4 also generates erratic 847caa20d9aSStephen Hemminger * ACKs in some circumstances. 8481da177e4SLinus Torvalds */ 849f1ecd5d9SDamian Lukowski inet_csk(sk)->icsk_rto = __tcp_set_rto(tp); 8501da177e4SLinus Torvalds 8511da177e4SLinus Torvalds /* 2. Fixups made earlier cannot be right. 8521da177e4SLinus Torvalds * If we do not estimate RTO correctly without them, 8531da177e4SLinus Torvalds * all the algo is pure shit and should be replaced 854caa20d9aSStephen Hemminger * with correct one. It is exactly, which we pretend to do. 8551da177e4SLinus Torvalds */ 8561da177e4SLinus Torvalds 8571da177e4SLinus Torvalds /* NOTE: clamping at TCP_RTO_MIN is not required, current algo 8581da177e4SLinus Torvalds * guarantees that rto is higher. 8591da177e4SLinus Torvalds */ 860f1ecd5d9SDamian Lukowski tcp_bound_rto(sk); 8611da177e4SLinus Torvalds } 8621da177e4SLinus Torvalds 863cf533ea5SEric Dumazet __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst) 8641da177e4SLinus Torvalds { 8651da177e4SLinus Torvalds __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); 8661da177e4SLinus Torvalds 86722b71c8fSGerrit Renker if (!cwnd) 868442b9635SDavid S. Miller cwnd = TCP_INIT_CWND; 8691da177e4SLinus Torvalds return min_t(__u32, cwnd, tp->snd_cwnd_clamp); 8701da177e4SLinus Torvalds } 8711da177e4SLinus Torvalds 872564262c1SRyousei Takano /* Take a notice that peer is sending D-SACKs */ 873e60402d0SIlpo Järvinen static void tcp_dsack_seen(struct tcp_sock *tp) 874e60402d0SIlpo Järvinen { 875ab56222aSVijay Subramanian tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; 8761f255691SPriyaranjan Jha tp->rack.dsack_seen = 1; 8777e10b655SWei Wang tp->dsack_dups++; 878e60402d0SIlpo Järvinen } 879e60402d0SIlpo Järvinen 880737ff314SYuchung Cheng /* It's reordering when higher sequence was delivered (i.e. sacked) before 881737ff314SYuchung Cheng * some lower never-retransmitted sequence ("low_seq"). The maximum reordering 882737ff314SYuchung Cheng * distance is approximated in full-mss packet distance ("reordering"). 
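 * Example with illustrative numbers: if the highest SACKed sequence is
 * 14480 bytes above a never-retransmitted, not-yet-delivered low_seq and
 * mss_cache is 1448, tp->reordering is raised to 10 segments (bounded by
 * sysctl_tcp_max_reordering).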
883737ff314SYuchung Cheng */ 884737ff314SYuchung Cheng static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq, 8856687e988SArnaldo Carvalho de Melo const int ts) 8861da177e4SLinus Torvalds { 8876687e988SArnaldo Carvalho de Melo struct tcp_sock *tp = tcp_sk(sk); 888737ff314SYuchung Cheng const u32 mss = tp->mss_cache; 889737ff314SYuchung Cheng u32 fack, metric; 89040b215e5SPavel Emelyanov 891737ff314SYuchung Cheng fack = tcp_highest_sack_seq(tp); 892737ff314SYuchung Cheng if (!before(low_seq, fack)) 8936f5b24eeSSoheil Hassas Yeganeh return; 8946f5b24eeSSoheil Hassas Yeganeh 895737ff314SYuchung Cheng metric = fack - low_seq; 896737ff314SYuchung Cheng if ((metric > tp->reordering * mss) && mss) { 8971da177e4SLinus Torvalds #if FASTRETRANS_DEBUG > 1 89891df42beSJoe Perches pr_debug("Disorder%d %d %u f%u s%u rr%d\n", 8996687e988SArnaldo Carvalho de Melo tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, 9001da177e4SLinus Torvalds tp->reordering, 901737ff314SYuchung Cheng 0, 9021da177e4SLinus Torvalds tp->sacked_out, 9031da177e4SLinus Torvalds tp->undo_marker ? tp->undo_retrans : 0); 9041da177e4SLinus Torvalds #endif 905737ff314SYuchung Cheng tp->reordering = min_t(u32, (metric + mss - 1) / mss, 906737ff314SYuchung Cheng sock_net(sk)->ipv4.sysctl_tcp_max_reordering); 9071da177e4SLinus Torvalds } 908eed530b6SYuchung Cheng 9092d2517eeSYuchung Cheng /* This exciting event is worth to be remembered. 8) */ 9107ec65372SWei Wang tp->reord_seen++; 911737ff314SYuchung Cheng NET_INC_STATS(sock_net(sk), 912737ff314SYuchung Cheng ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER); 9131da177e4SLinus Torvalds } 9141da177e4SLinus Torvalds 915006f582cSIlpo Järvinen /* This must be called before lost_out is incremented */ 916c8c213f2SIlpo Järvinen static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) 917c8c213f2SIlpo Järvinen { 91851456b29SIan Morris if (!tp->retransmit_skb_hint || 919c8c213f2SIlpo Järvinen before(TCP_SKB_CB(skb)->seq, 920c8c213f2SIlpo Järvinen TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) 921006f582cSIlpo Järvinen tp->retransmit_skb_hint = skb; 922c8c213f2SIlpo Järvinen } 923c8c213f2SIlpo Järvinen 9240682e690SNeal Cardwell /* Sum the number of packets on the wire we have marked as lost. 9250682e690SNeal Cardwell * There are two cases we care about here: 9260682e690SNeal Cardwell * a) Packet hasn't been marked lost (nor retransmitted), 9270682e690SNeal Cardwell * and this is the first loss. 9280682e690SNeal Cardwell * b) Packet has been marked both lost and retransmitted, 9290682e690SNeal Cardwell * and this means we think it was lost again. 
9300682e690SNeal Cardwell */ 9310682e690SNeal Cardwell static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb) 9320682e690SNeal Cardwell { 9330682e690SNeal Cardwell __u8 sacked = TCP_SKB_CB(skb)->sacked; 9340682e690SNeal Cardwell 9350682e690SNeal Cardwell if (!(sacked & TCPCB_LOST) || 9360682e690SNeal Cardwell ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS))) 9370682e690SNeal Cardwell tp->lost += tcp_skb_pcount(skb); 9380682e690SNeal Cardwell } 9390682e690SNeal Cardwell 94041ea36e3SIlpo Järvinen static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb) 94141ea36e3SIlpo Järvinen { 94241ea36e3SIlpo Järvinen if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { 94341ea36e3SIlpo Järvinen tcp_verify_retransmit_hint(tp, skb); 94441ea36e3SIlpo Järvinen 94541ea36e3SIlpo Järvinen tp->lost_out += tcp_skb_pcount(skb); 9460682e690SNeal Cardwell tcp_sum_lost(tp, skb); 94741ea36e3SIlpo Järvinen TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; 94841ea36e3SIlpo Järvinen } 94941ea36e3SIlpo Järvinen } 95041ea36e3SIlpo Järvinen 9514f41b1c5SYuchung Cheng void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb) 952006f582cSIlpo Järvinen { 953006f582cSIlpo Järvinen tcp_verify_retransmit_hint(tp, skb); 954006f582cSIlpo Järvinen 9550682e690SNeal Cardwell tcp_sum_lost(tp, skb); 956006f582cSIlpo Järvinen if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { 957006f582cSIlpo Järvinen tp->lost_out += tcp_skb_pcount(skb); 958006f582cSIlpo Järvinen TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; 959006f582cSIlpo Järvinen } 960006f582cSIlpo Järvinen } 961006f582cSIlpo Järvinen 9621da177e4SLinus Torvalds /* This procedure tags the retransmission queue when SACKs arrive. 9631da177e4SLinus Torvalds * 9641da177e4SLinus Torvalds * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). 9651da177e4SLinus Torvalds * Packets in queue with these bits set are counted in variables 9661da177e4SLinus Torvalds * sacked_out, retrans_out and lost_out, correspondingly. 9671da177e4SLinus Torvalds * 9681da177e4SLinus Torvalds * Valid combinations are: 9691da177e4SLinus Torvalds * Tag InFlight Description 9701da177e4SLinus Torvalds * 0 1 - orig segment is in flight. 9711da177e4SLinus Torvalds * S 0 - nothing flies, orig reached receiver. 9721da177e4SLinus Torvalds * L 0 - nothing flies, orig lost by net. 9731da177e4SLinus Torvalds * R 2 - both orig and retransmit are in flight. 9741da177e4SLinus Torvalds * L|R 1 - orig is lost, retransmit is in flight. 9751da177e4SLinus Torvalds * S|R 1 - orig reached receiver, retrans is still in flight. 9761da177e4SLinus Torvalds * (L|S|R is logically valid, it could occur when L|R is sacked, 9771da177e4SLinus Torvalds * but it is equivalent to plain S and code short-curcuits it to S. 9781da177e4SLinus Torvalds * L|S is logically invalid, it would mean -1 packet in flight 8)) 9791da177e4SLinus Torvalds * 9801da177e4SLinus Torvalds * These 6 states form finite state machine, controlled by the following events: 9811da177e4SLinus Torvalds * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue()) 9821da177e4SLinus Torvalds * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue()) 983974c1236SYuchung Cheng * 3. Loss detection event of two flavors: 9841da177e4SLinus Torvalds * A. Scoreboard estimator decided the packet is lost. 9851da177e4SLinus Torvalds * A'. Reno "three dupacks" marks head of queue lost. 986974c1236SYuchung Cheng * B. 
SACK arrives sacking SND.NXT at the moment, when the 9871da177e4SLinus Torvalds * segment was retransmitted. 9881da177e4SLinus Torvalds * 4. D-SACK added new rule: D-SACK changes any tag to S. 9891da177e4SLinus Torvalds * 9901da177e4SLinus Torvalds * It is pleasant to note, that state diagram turns out to be commutative, 9911da177e4SLinus Torvalds * so that we are allowed not to be bothered by order of our actions, 9921da177e4SLinus Torvalds * when multiple events arrive simultaneously. (see the function below). 9931da177e4SLinus Torvalds * 9941da177e4SLinus Torvalds * Reordering detection. 9951da177e4SLinus Torvalds * -------------------- 9961da177e4SLinus Torvalds * Reordering metric is maximal distance, which a packet can be displaced 9971da177e4SLinus Torvalds * in packet stream. With SACKs we can estimate it: 9981da177e4SLinus Torvalds * 9991da177e4SLinus Torvalds * 1. SACK fills old hole and the corresponding segment was not 10001da177e4SLinus Torvalds * ever retransmitted -> reordering. Alas, we cannot use it 10011da177e4SLinus Torvalds * when segment was retransmitted. 10021da177e4SLinus Torvalds * 2. The last flaw is solved with D-SACK. D-SACK arrives 10031da177e4SLinus Torvalds * for retransmitted and already SACKed segment -> reordering.. 10041da177e4SLinus Torvalds * Both of these heuristics are not used in Loss state, when we cannot 10051da177e4SLinus Torvalds * account for retransmits accurately. 10065b3c9882SIlpo Järvinen * 10075b3c9882SIlpo Järvinen * SACK block validation. 10085b3c9882SIlpo Järvinen * ---------------------- 10095b3c9882SIlpo Järvinen * 10105b3c9882SIlpo Järvinen * SACK block range validation checks that the received SACK block fits to 10115b3c9882SIlpo Järvinen * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT. 10125b3c9882SIlpo Järvinen * Note that SND.UNA is not included to the range though being valid because 10130e835331SIlpo Järvinen * it means that the receiver is rather inconsistent with itself reporting 10140e835331SIlpo Järvinen * SACK reneging when it should advance SND.UNA. Such SACK block this is 10150e835331SIlpo Järvinen * perfectly valid, however, in light of RFC2018 which explicitly states 10160e835331SIlpo Järvinen * that "SACK block MUST reflect the newest segment. Even if the newest 10170e835331SIlpo Järvinen * segment is going to be discarded ...", not that it looks very clever 10180e835331SIlpo Järvinen * in case of head skb. Due to potentional receiver driven attacks, we 10190e835331SIlpo Järvinen * choose to avoid immediate execution of a walk in write queue due to 10200e835331SIlpo Järvinen * reneging and defer head skb's loss recovery to standard loss recovery 10210e835331SIlpo Järvinen * procedure that will eventually trigger (nothing forbids us doing this). 10225b3c9882SIlpo Järvinen * 10235b3c9882SIlpo Järvinen * Implements also blockage to start_seq wrap-around. Problem lies in the 10245b3c9882SIlpo Järvinen * fact that though start_seq (s) is before end_seq (i.e., not reversed), 10255b3c9882SIlpo Järvinen * there's no guarantee that it will be before snd_nxt (n). 
The problem 10265b3c9882SIlpo Järvinen * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt 10275b3c9882SIlpo Järvinen * wrap (s_w): 10285b3c9882SIlpo Järvinen * 10295b3c9882SIlpo Järvinen * <- outs wnd -> <- wrapzone -> 10305b3c9882SIlpo Järvinen * u e n u_w e_w s n_w 10315b3c9882SIlpo Järvinen * | | | | | | | 10325b3c9882SIlpo Järvinen * |<------------+------+----- TCP seqno space --------------+---------->| 10335b3c9882SIlpo Järvinen * ...-- <2^31 ->| |<--------... 10345b3c9882SIlpo Järvinen * ...---- >2^31 ------>| |<--------... 10355b3c9882SIlpo Järvinen * 10365b3c9882SIlpo Järvinen * Current code wouldn't be vulnerable but it's better still to discard such 10375b3c9882SIlpo Järvinen * crazy SACK blocks. Doing this check for start_seq alone closes somewhat 10385b3c9882SIlpo Järvinen * similar case (end_seq after snd_nxt wrap) as earlier reversed check in 10395b3c9882SIlpo Järvinen * snd_nxt wrap -> snd_una region will then become "well defined", i.e., 10405b3c9882SIlpo Järvinen * equal to the ideal case (infinite seqno space without wrap caused issues). 10415b3c9882SIlpo Järvinen * 10425b3c9882SIlpo Järvinen * With D-SACK the lower bound is extended to cover sequence space below 10435b3c9882SIlpo Järvinen * SND.UNA down to undo_marker, which is the last point of interest. Yet 1044564262c1SRyousei Takano * again, D-SACK block must not to go across snd_una (for the same reason as 10455b3c9882SIlpo Järvinen * for the normal SACK blocks, explained above). But there all simplicity 10465b3c9882SIlpo Järvinen * ends, TCP might receive valid D-SACKs below that. As long as they reside 10475b3c9882SIlpo Järvinen * fully below undo_marker they do not affect behavior in anyway and can 10485b3c9882SIlpo Järvinen * therefore be safely ignored. In rare cases (which are more or less 10495b3c9882SIlpo Järvinen * theoretical ones), the D-SACK will nicely cross that boundary due to skb 10505b3c9882SIlpo Järvinen * fragmentation and packet reordering past skb's retransmission. To consider 10515b3c9882SIlpo Järvinen * them correctly, the acceptable range must be extended even more though 10525b3c9882SIlpo Järvinen * the exact amount is rather hard to quantify. However, tp->max_window can 10535b3c9882SIlpo Järvinen * be used as an exaggerated estimate. 10541da177e4SLinus Torvalds */ 1055a2a385d6SEric Dumazet static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack, 10565b3c9882SIlpo Järvinen u32 start_seq, u32 end_seq) 10575b3c9882SIlpo Järvinen { 10585b3c9882SIlpo Järvinen /* Too far in future, or reversed (interpretation is ambiguous) */ 10595b3c9882SIlpo Järvinen if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq)) 1060a2a385d6SEric Dumazet return false; 10615b3c9882SIlpo Järvinen 10625b3c9882SIlpo Järvinen /* Nasty start_seq wrap-around check (see comments above) */ 10635b3c9882SIlpo Järvinen if (!before(start_seq, tp->snd_nxt)) 1064a2a385d6SEric Dumazet return false; 10655b3c9882SIlpo Järvinen 1066564262c1SRyousei Takano /* In outstanding window? ...This is valid exit for D-SACKs too. 
10675b3c9882SIlpo Järvinen * start_seq == snd_una is non-sensical (see comments above) 10685b3c9882SIlpo Järvinen */ 10695b3c9882SIlpo Järvinen if (after(start_seq, tp->snd_una)) 1070a2a385d6SEric Dumazet return true; 10715b3c9882SIlpo Järvinen 10725b3c9882SIlpo Järvinen if (!is_dsack || !tp->undo_marker) 1073a2a385d6SEric Dumazet return false; 10745b3c9882SIlpo Järvinen 10755b3c9882SIlpo Järvinen /* ...Then it's D-SACK, and must reside below snd_una completely */ 1076f779b2d6SZheng Yan if (after(end_seq, tp->snd_una)) 1077a2a385d6SEric Dumazet return false; 10785b3c9882SIlpo Järvinen 10795b3c9882SIlpo Järvinen if (!before(start_seq, tp->undo_marker)) 1080a2a385d6SEric Dumazet return true; 10815b3c9882SIlpo Järvinen 10825b3c9882SIlpo Järvinen /* Too old */ 10835b3c9882SIlpo Järvinen if (!after(end_seq, tp->undo_marker)) 1084a2a385d6SEric Dumazet return false; 10855b3c9882SIlpo Järvinen 10865b3c9882SIlpo Järvinen /* Undo_marker boundary crossing (overestimates a lot). Known already: 10875b3c9882SIlpo Järvinen * start_seq < undo_marker and end_seq >= undo_marker. 10885b3c9882SIlpo Järvinen */ 10895b3c9882SIlpo Järvinen return !before(start_seq, end_seq - tp->max_window); 10905b3c9882SIlpo Järvinen } 10915b3c9882SIlpo Järvinen 1092a2a385d6SEric Dumazet static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, 1093d06e021dSDavid S. Miller struct tcp_sack_block_wire *sp, int num_sacks, 1094d06e021dSDavid S. Miller u32 prior_snd_una) 1095d06e021dSDavid S. Miller { 10961ed83465SPavel Emelyanov struct tcp_sock *tp = tcp_sk(sk); 1097d3e2ce3bSHarvey Harrison u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq); 1098d3e2ce3bSHarvey Harrison u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq); 1099a2a385d6SEric Dumazet bool dup_sack = false; 1100d06e021dSDavid S. Miller 1101d06e021dSDavid S. Miller if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { 1102a2a385d6SEric Dumazet dup_sack = true; 1103e60402d0SIlpo Järvinen tcp_dsack_seen(tp); 1104c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV); 1105d06e021dSDavid S. Miller } else if (num_sacks > 1) { 1106d3e2ce3bSHarvey Harrison u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq); 1107d3e2ce3bSHarvey Harrison u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq); 1108d06e021dSDavid S. Miller 1109d06e021dSDavid S. Miller if (!after(end_seq_0, end_seq_1) && 1110d06e021dSDavid S. Miller !before(start_seq_0, start_seq_1)) { 1111a2a385d6SEric Dumazet dup_sack = true; 1112e60402d0SIlpo Järvinen tcp_dsack_seen(tp); 1113c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), 1114de0744afSPavel Emelyanov LINUX_MIB_TCPDSACKOFORECV); 1115d06e021dSDavid S. Miller } 1116d06e021dSDavid S. Miller } 1117d06e021dSDavid S. Miller 1118d06e021dSDavid S. Miller /* D-SACK for already forgotten data... Do dumb counting. */ 11196e08d5e3SYuchung Cheng if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 && 1120d06e021dSDavid S. Miller !after(end_seq_0, prior_snd_una) && 1121d06e021dSDavid S. Miller after(end_seq_0, tp->undo_marker)) 1122d06e021dSDavid S. Miller tp->undo_retrans--; 1123d06e021dSDavid S. Miller 1124d06e021dSDavid S. Miller return dup_sack; 1125d06e021dSDavid S. Miller } 1126d06e021dSDavid S. Miller 1127a1197f5aSIlpo Järvinen struct tcp_sacktag_state { 1128737ff314SYuchung Cheng u32 reord; 112931231a8aSKenneth Klette Jonassen /* Timestamps for earliest and latest never-retransmitted segment 113031231a8aSKenneth Klette Jonassen * that was SACKed. 
RTO needs the earliest RTT to stay conservative, 113131231a8aSKenneth Klette Jonassen * but congestion control should still get an accurate delay signal. 113231231a8aSKenneth Klette Jonassen */ 11339a568de4SEric Dumazet u64 first_sackt; 11349a568de4SEric Dumazet u64 last_sackt; 1135b9f64820SYuchung Cheng struct rate_sample *rate; 1136a1197f5aSIlpo Järvinen int flag; 113775c119afSEric Dumazet unsigned int mss_now; 1138a1197f5aSIlpo Järvinen }; 1139a1197f5aSIlpo Järvinen 1140d1935942SIlpo Järvinen /* Check if skb is fully within the SACK block. In presence of GSO skbs, 1141d1935942SIlpo Järvinen * the incoming SACK may not exactly match but we can find smaller MSS 1142d1935942SIlpo Järvinen * aligned portion of it that matches. Therefore we might need to fragment 1143d1935942SIlpo Järvinen * which may fail and creates some hassle (caller must handle error case 1144d1935942SIlpo Järvinen * returns). 1145832d11c5SIlpo Järvinen * 1146832d11c5SIlpo Järvinen * FIXME: this could be merged to shift decision code 1147d1935942SIlpo Järvinen */ 11480f79efdcSAdrian Bunk static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, 1149d1935942SIlpo Järvinen u32 start_seq, u32 end_seq) 1150d1935942SIlpo Järvinen { 1151a2a385d6SEric Dumazet int err; 1152a2a385d6SEric Dumazet bool in_sack; 1153d1935942SIlpo Järvinen unsigned int pkt_len; 1154adb92db8SIlpo Järvinen unsigned int mss; 1155d1935942SIlpo Järvinen 1156d1935942SIlpo Järvinen in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && 1157d1935942SIlpo Järvinen !before(end_seq, TCP_SKB_CB(skb)->end_seq); 1158d1935942SIlpo Järvinen 1159d1935942SIlpo Järvinen if (tcp_skb_pcount(skb) > 1 && !in_sack && 1160d1935942SIlpo Järvinen after(TCP_SKB_CB(skb)->end_seq, start_seq)) { 1161adb92db8SIlpo Järvinen mss = tcp_skb_mss(skb); 1162d1935942SIlpo Järvinen in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq); 1163d1935942SIlpo Järvinen 1164adb92db8SIlpo Järvinen if (!in_sack) { 1165d1935942SIlpo Järvinen pkt_len = start_seq - TCP_SKB_CB(skb)->seq; 1166adb92db8SIlpo Järvinen if (pkt_len < mss) 1167adb92db8SIlpo Järvinen pkt_len = mss; 1168adb92db8SIlpo Järvinen } else { 1169d1935942SIlpo Järvinen pkt_len = end_seq - TCP_SKB_CB(skb)->seq; 1170adb92db8SIlpo Järvinen if (pkt_len < mss) 1171adb92db8SIlpo Järvinen return -EINVAL; 1172adb92db8SIlpo Järvinen } 1173adb92db8SIlpo Järvinen 1174adb92db8SIlpo Järvinen /* Round if necessary so that SACKs cover only full MSSes 1175adb92db8SIlpo Järvinen * and/or the remaining small portion (if present) 1176adb92db8SIlpo Järvinen */ 1177adb92db8SIlpo Järvinen if (pkt_len > mss) { 1178adb92db8SIlpo Järvinen unsigned int new_len = (pkt_len / mss) * mss; 1179b451e5d2SYuchung Cheng if (!in_sack && new_len < pkt_len) 1180adb92db8SIlpo Järvinen new_len += mss; 1181adb92db8SIlpo Järvinen pkt_len = new_len; 1182adb92db8SIlpo Järvinen } 1183b451e5d2SYuchung Cheng 1184b451e5d2SYuchung Cheng if (pkt_len >= skb->len && !in_sack) 1185b451e5d2SYuchung Cheng return 0; 1186b451e5d2SYuchung Cheng 118775c119afSEric Dumazet err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, 118875c119afSEric Dumazet pkt_len, mss, GFP_ATOMIC); 1189d1935942SIlpo Järvinen if (err < 0) 1190d1935942SIlpo Järvinen return err; 1191d1935942SIlpo Järvinen } 1192d1935942SIlpo Järvinen 1193d1935942SIlpo Järvinen return in_sack; 1194d1935942SIlpo Järvinen } 1195d1935942SIlpo Järvinen 1196cc9a672eSNeal Cardwell /* Mark the given newly-SACKed range as such, adjusting counters and hints. 
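 * Summarizing the function below: it clears stale LOST/RETRANS tags, sets
 * TCPCB_SACKED_ACKED, updates sacked_out/lost_out/retrans_out and the
 * delivered count, records reordering via state->reord, and remembers the
 * first/last transmit timestamps of newly SACKed data for later RTT sampling.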
*/ 1197cc9a672eSNeal Cardwell static u8 tcp_sacktag_one(struct sock *sk, 1198cc9a672eSNeal Cardwell struct tcp_sacktag_state *state, u8 sacked, 1199cc9a672eSNeal Cardwell u32 start_seq, u32 end_seq, 1200740b0f18SEric Dumazet int dup_sack, int pcount, 12019a568de4SEric Dumazet u64 xmit_time) 12029e10c47cSIlpo Järvinen { 12036859d494SIlpo Järvinen struct tcp_sock *tp = tcp_sk(sk); 12049e10c47cSIlpo Järvinen 12059e10c47cSIlpo Järvinen /* Account D-SACK for retransmitted packet. */ 12069e10c47cSIlpo Järvinen if (dup_sack && (sacked & TCPCB_RETRANS)) { 12076e08d5e3SYuchung Cheng if (tp->undo_marker && tp->undo_retrans > 0 && 1208cc9a672eSNeal Cardwell after(end_seq, tp->undo_marker)) 12099e10c47cSIlpo Järvinen tp->undo_retrans--; 1210737ff314SYuchung Cheng if ((sacked & TCPCB_SACKED_ACKED) && 1211737ff314SYuchung Cheng before(start_seq, state->reord)) 1212737ff314SYuchung Cheng state->reord = start_seq; 12139e10c47cSIlpo Järvinen } 12149e10c47cSIlpo Järvinen 12159e10c47cSIlpo Järvinen /* Nothing to do; acked frame is about to be dropped (was ACKed). */ 1216cc9a672eSNeal Cardwell if (!after(end_seq, tp->snd_una)) 1217a1197f5aSIlpo Järvinen return sacked; 12189e10c47cSIlpo Järvinen 12199e10c47cSIlpo Järvinen if (!(sacked & TCPCB_SACKED_ACKED)) { 1220d2329f10SEric Dumazet tcp_rack_advance(tp, sacked, end_seq, xmit_time); 1221659a8ad5SYuchung Cheng 12229e10c47cSIlpo Järvinen if (sacked & TCPCB_SACKED_RETRANS) { 12239e10c47cSIlpo Järvinen /* If the segment is not tagged as lost, 12249e10c47cSIlpo Järvinen * we do not clear RETRANS, believing 12259e10c47cSIlpo Järvinen * that retransmission is still in flight. 12269e10c47cSIlpo Järvinen */ 12279e10c47cSIlpo Järvinen if (sacked & TCPCB_LOST) { 1228a1197f5aSIlpo Järvinen sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); 1229f58b22fdSIlpo Järvinen tp->lost_out -= pcount; 1230f58b22fdSIlpo Järvinen tp->retrans_out -= pcount; 12319e10c47cSIlpo Järvinen } 12329e10c47cSIlpo Järvinen } else { 12339e10c47cSIlpo Järvinen if (!(sacked & TCPCB_RETRANS)) { 12349e10c47cSIlpo Järvinen /* New sack for not retransmitted frame, 12359e10c47cSIlpo Järvinen * which was in hole. It is reordering. 12369e10c47cSIlpo Järvinen */ 1237cc9a672eSNeal Cardwell if (before(start_seq, 1238737ff314SYuchung Cheng tcp_highest_sack_seq(tp)) && 1239737ff314SYuchung Cheng before(start_seq, state->reord)) 1240737ff314SYuchung Cheng state->reord = start_seq; 1241737ff314SYuchung Cheng 1242e33099f9SYuchung Cheng if (!after(end_seq, tp->high_seq)) 1243e33099f9SYuchung Cheng state->flag |= FLAG_ORIG_SACK_ACKED; 12449a568de4SEric Dumazet if (state->first_sackt == 0) 12459a568de4SEric Dumazet state->first_sackt = xmit_time; 12469a568de4SEric Dumazet state->last_sackt = xmit_time; 12479e10c47cSIlpo Järvinen } 12489e10c47cSIlpo Järvinen 12499e10c47cSIlpo Järvinen if (sacked & TCPCB_LOST) { 1250a1197f5aSIlpo Järvinen sacked &= ~TCPCB_LOST; 1251f58b22fdSIlpo Järvinen tp->lost_out -= pcount; 12529e10c47cSIlpo Järvinen } 12539e10c47cSIlpo Järvinen } 12549e10c47cSIlpo Järvinen 1255a1197f5aSIlpo Järvinen sacked |= TCPCB_SACKED_ACKED; 1256a1197f5aSIlpo Järvinen state->flag |= FLAG_DATA_SACKED; 1257f58b22fdSIlpo Järvinen tp->sacked_out += pcount; 1258ddf1af6fSYuchung Cheng tp->delivered += pcount; /* Out-of-order packets delivered */ 12599e10c47cSIlpo Järvinen 12609e10c47cSIlpo Järvinen /* Lost marker hint past SACKed? 
Tweak RFC3517 cnt */ 1261713bafeaSYuchung Cheng if (tp->lost_skb_hint && 1262cc9a672eSNeal Cardwell before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) 1263f58b22fdSIlpo Järvinen tp->lost_cnt_hint += pcount; 12649e10c47cSIlpo Järvinen } 12659e10c47cSIlpo Järvinen 12669e10c47cSIlpo Järvinen /* D-SACK. We can detect redundant retransmission in S|R and plain R 12679e10c47cSIlpo Järvinen * frames and clear it. undo_retrans is decreased above, L|R frames 12689e10c47cSIlpo Järvinen * are accounted above as well. 12699e10c47cSIlpo Järvinen */ 1270a1197f5aSIlpo Järvinen if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) { 1271a1197f5aSIlpo Järvinen sacked &= ~TCPCB_SACKED_RETRANS; 1272f58b22fdSIlpo Järvinen tp->retrans_out -= pcount; 12739e10c47cSIlpo Järvinen } 12749e10c47cSIlpo Järvinen 1275a1197f5aSIlpo Järvinen return sacked; 12769e10c47cSIlpo Järvinen } 12779e10c47cSIlpo Järvinen 1278daef52baSNeal Cardwell /* Shift newly-SACKed bytes from this skb to the immediately previous 1279daef52baSNeal Cardwell * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. 1280daef52baSNeal Cardwell */ 1281f3319816SEric Dumazet static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, 1282f3319816SEric Dumazet struct sk_buff *skb, 1283a1197f5aSIlpo Järvinen struct tcp_sacktag_state *state, 12849ec06ff5SIlpo Järvinen unsigned int pcount, int shifted, int mss, 1285a2a385d6SEric Dumazet bool dup_sack) 1286832d11c5SIlpo Järvinen { 1287832d11c5SIlpo Järvinen struct tcp_sock *tp = tcp_sk(sk); 1288daef52baSNeal Cardwell u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */ 1289daef52baSNeal Cardwell u32 end_seq = start_seq + shifted; /* end of newly-SACKed */ 1290832d11c5SIlpo Järvinen 1291832d11c5SIlpo Järvinen BUG_ON(!pcount); 1292832d11c5SIlpo Järvinen 12934c90d3b3SNeal Cardwell /* Adjust counters and hints for the newly sacked sequence 12944c90d3b3SNeal Cardwell * range but discard the return value since prev is already 12954c90d3b3SNeal Cardwell * marked. We must tag the range first because the seq 12964c90d3b3SNeal Cardwell * advancement below implicitly advances 12974c90d3b3SNeal Cardwell * tcp_highest_sack_seq() when skb is highest_sack. 12984c90d3b3SNeal Cardwell */ 12994c90d3b3SNeal Cardwell tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, 130059c9af42SYuchung Cheng start_seq, end_seq, dup_sack, pcount, 13012fd66ffbSEric Dumazet tcp_skb_timestamp_us(skb)); 1302b9f64820SYuchung Cheng tcp_rate_skb_delivered(sk, skb, state->rate); 13034c90d3b3SNeal Cardwell 13044c90d3b3SNeal Cardwell if (skb == tp->lost_skb_hint) 13050af2a0d0SNeal Cardwell tp->lost_cnt_hint += pcount; 13060af2a0d0SNeal Cardwell 1307832d11c5SIlpo Järvinen TCP_SKB_CB(prev)->end_seq += shifted; 1308832d11c5SIlpo Järvinen TCP_SKB_CB(skb)->seq += shifted; 1309832d11c5SIlpo Järvinen 1310cd7d8498SEric Dumazet tcp_skb_pcount_add(prev, pcount); 13113b4929f6SEric Dumazet WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); 1312cd7d8498SEric Dumazet tcp_skb_pcount_add(skb, -pcount); 1313832d11c5SIlpo Järvinen 1314832d11c5SIlpo Järvinen /* When we're adding to gso_segs == 1, gso_size will be zero, 1315832d11c5SIlpo Järvinen * in theory this shouldn't be necessary but as long as DSACK 1316832d11c5SIlpo Järvinen * code can come after this skb later on it's better to keep 1317832d11c5SIlpo Järvinen * setting gso_size to something. 
1318832d11c5SIlpo Järvinen */ 1319f69ad292SEric Dumazet if (!TCP_SKB_CB(prev)->tcp_gso_size) 1320f69ad292SEric Dumazet TCP_SKB_CB(prev)->tcp_gso_size = mss; 1321832d11c5SIlpo Järvinen 1322832d11c5SIlpo Järvinen /* CHECKME: To clear or not to clear? Mimics normal skb currently */ 132351466a75SEric Dumazet if (tcp_skb_pcount(skb) <= 1) 1324f69ad292SEric Dumazet TCP_SKB_CB(skb)->tcp_gso_size = 0; 1325832d11c5SIlpo Järvinen 1326832d11c5SIlpo Järvinen /* Difference in this won't matter, both ACKed by the same cumul. ACK */ 1327832d11c5SIlpo Järvinen TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); 1328832d11c5SIlpo Järvinen 1329832d11c5SIlpo Järvinen if (skb->len > 0) { 1330832d11c5SIlpo Järvinen BUG_ON(!tcp_skb_pcount(skb)); 1331c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTED); 1332a2a385d6SEric Dumazet return false; 1333832d11c5SIlpo Järvinen } 1334832d11c5SIlpo Järvinen 1335832d11c5SIlpo Järvinen /* Whole SKB was eaten :-) */ 1336832d11c5SIlpo Järvinen 133792ee76b6SIlpo Järvinen if (skb == tp->retransmit_skb_hint) 133892ee76b6SIlpo Järvinen tp->retransmit_skb_hint = prev; 133992ee76b6SIlpo Järvinen if (skb == tp->lost_skb_hint) { 134092ee76b6SIlpo Järvinen tp->lost_skb_hint = prev; 134192ee76b6SIlpo Järvinen tp->lost_cnt_hint -= tcp_skb_pcount(prev); 134292ee76b6SIlpo Järvinen } 134392ee76b6SIlpo Järvinen 13445e8a402fSEric Dumazet TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; 1345a643b5d4SMartin KaFai Lau TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor; 13465e8a402fSEric Dumazet if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) 13475e8a402fSEric Dumazet TCP_SKB_CB(prev)->end_seq++; 13485e8a402fSEric Dumazet 1349832d11c5SIlpo Järvinen if (skb == tcp_highest_sack(sk)) 1350832d11c5SIlpo Järvinen tcp_advance_highest_sack(sk, skb); 1351832d11c5SIlpo Järvinen 1352cfea5a68SMartin KaFai Lau tcp_skb_collapse_tstamp(prev, skb); 13539a568de4SEric Dumazet if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp)) 13549a568de4SEric Dumazet TCP_SKB_CB(prev)->tx.delivered_mstamp = 0; 1355b9f64820SYuchung Cheng 135675c119afSEric Dumazet tcp_rtx_queue_unlink_and_free(skb, sk); 1357832d11c5SIlpo Järvinen 1358c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED); 1359111cc8b9SIlpo Järvinen 1360a2a385d6SEric Dumazet return true; 1361832d11c5SIlpo Järvinen } 1362832d11c5SIlpo Järvinen 1363832d11c5SIlpo Järvinen /* I wish gso_size would have a bit more sane initialization than 1364832d11c5SIlpo Järvinen * something-or-zero which complicates things 1365832d11c5SIlpo Järvinen */ 1366cf533ea5SEric Dumazet static int tcp_skb_seglen(const struct sk_buff *skb) 1367832d11c5SIlpo Järvinen { 1368775ffabfSIlpo Järvinen return tcp_skb_pcount(skb) == 1 ? 
skb->len : tcp_skb_mss(skb); 1369832d11c5SIlpo Järvinen } 1370832d11c5SIlpo Järvinen 1371832d11c5SIlpo Järvinen /* Shifting pages past head area doesn't work */ 1372cf533ea5SEric Dumazet static int skb_can_shift(const struct sk_buff *skb) 1373832d11c5SIlpo Järvinen { 1374832d11c5SIlpo Järvinen return !skb_headlen(skb) && skb_is_nonlinear(skb); 1375832d11c5SIlpo Järvinen } 1376832d11c5SIlpo Järvinen 13773b4929f6SEric Dumazet int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, 13783b4929f6SEric Dumazet int pcount, int shiftlen) 13793b4929f6SEric Dumazet { 13803b4929f6SEric Dumazet /* TCP min gso_size is 8 bytes (TCP_MIN_GSO_SIZE) 13813b4929f6SEric Dumazet * Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need 13823b4929f6SEric Dumazet * to make sure not storing more than 65535 * 8 bytes per skb, 13833b4929f6SEric Dumazet * even if current MSS is bigger. 13843b4929f6SEric Dumazet */ 13853b4929f6SEric Dumazet if (unlikely(to->len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE)) 13863b4929f6SEric Dumazet return 0; 13873b4929f6SEric Dumazet if (unlikely(tcp_skb_pcount(to) + pcount > 65535)) 13883b4929f6SEric Dumazet return 0; 13893b4929f6SEric Dumazet return skb_shift(to, from, shiftlen); 13903b4929f6SEric Dumazet } 13913b4929f6SEric Dumazet 1392832d11c5SIlpo Järvinen /* Try collapsing SACK blocks spanning across multiple skbs to a single 1393832d11c5SIlpo Järvinen * skb. 1394832d11c5SIlpo Järvinen */ 1395832d11c5SIlpo Järvinen static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, 1396a1197f5aSIlpo Järvinen struct tcp_sacktag_state *state, 1397832d11c5SIlpo Järvinen u32 start_seq, u32 end_seq, 1398a2a385d6SEric Dumazet bool dup_sack) 1399832d11c5SIlpo Järvinen { 1400832d11c5SIlpo Järvinen struct tcp_sock *tp = tcp_sk(sk); 1401832d11c5SIlpo Järvinen struct sk_buff *prev; 1402832d11c5SIlpo Järvinen int mss; 1403832d11c5SIlpo Järvinen int pcount = 0; 1404832d11c5SIlpo Järvinen int len; 1405832d11c5SIlpo Järvinen int in_sack; 1406832d11c5SIlpo Järvinen 1407832d11c5SIlpo Järvinen /* Normally R but no L won't result in plain S */ 1408832d11c5SIlpo Järvinen if (!dup_sack && 14099969ca5fSIlpo Järvinen (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS) 1410832d11c5SIlpo Järvinen goto fallback; 1411832d11c5SIlpo Järvinen if (!skb_can_shift(skb)) 1412832d11c5SIlpo Järvinen goto fallback; 1413832d11c5SIlpo Järvinen /* This frame is about to be dropped (was ACKed). 
*/ 1414832d11c5SIlpo Järvinen if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) 1415832d11c5SIlpo Järvinen goto fallback; 1416832d11c5SIlpo Järvinen 1417832d11c5SIlpo Järvinen /* Can only happen with delayed DSACK + discard craziness */ 141875c119afSEric Dumazet prev = skb_rb_prev(skb); 141975c119afSEric Dumazet if (!prev) 1420832d11c5SIlpo Järvinen goto fallback; 1421832d11c5SIlpo Järvinen 1422832d11c5SIlpo Järvinen if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) 1423832d11c5SIlpo Järvinen goto fallback; 1424832d11c5SIlpo Järvinen 1425a643b5d4SMartin KaFai Lau if (!tcp_skb_can_collapse_to(prev)) 1426a643b5d4SMartin KaFai Lau goto fallback; 1427a643b5d4SMartin KaFai Lau 1428832d11c5SIlpo Järvinen in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && 1429832d11c5SIlpo Järvinen !before(end_seq, TCP_SKB_CB(skb)->end_seq); 1430832d11c5SIlpo Järvinen 1431832d11c5SIlpo Järvinen if (in_sack) { 1432832d11c5SIlpo Järvinen len = skb->len; 1433832d11c5SIlpo Järvinen pcount = tcp_skb_pcount(skb); 1434775ffabfSIlpo Järvinen mss = tcp_skb_seglen(skb); 1435832d11c5SIlpo Järvinen 1436832d11c5SIlpo Järvinen /* TODO: Fix DSACKs to not fragment already SACKed and we can 1437832d11c5SIlpo Järvinen * drop this restriction as unnecessary 1438832d11c5SIlpo Järvinen */ 1439775ffabfSIlpo Järvinen if (mss != tcp_skb_seglen(prev)) 1440832d11c5SIlpo Järvinen goto fallback; 1441832d11c5SIlpo Järvinen } else { 1442832d11c5SIlpo Järvinen if (!after(TCP_SKB_CB(skb)->end_seq, start_seq)) 1443832d11c5SIlpo Järvinen goto noop; 1444832d11c5SIlpo Järvinen /* CHECKME: This is non-MSS split case only?, this will 1445832d11c5SIlpo Järvinen * cause skipped skbs due to advancing loop btw, original 1446832d11c5SIlpo Järvinen * has that feature too 1447832d11c5SIlpo Järvinen */ 1448832d11c5SIlpo Järvinen if (tcp_skb_pcount(skb) <= 1) 1449832d11c5SIlpo Järvinen goto noop; 1450832d11c5SIlpo Järvinen 1451832d11c5SIlpo Järvinen in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq); 1452832d11c5SIlpo Järvinen if (!in_sack) { 1453832d11c5SIlpo Järvinen /* TODO: head merge to next could be attempted here 1454832d11c5SIlpo Järvinen * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)), 1455832d11c5SIlpo Järvinen * though it might not be worth of the additional hassle 1456832d11c5SIlpo Järvinen * 1457832d11c5SIlpo Järvinen * ...we can probably just fallback to what was done 1458832d11c5SIlpo Järvinen * previously. We could try merging non-SACKed ones 1459832d11c5SIlpo Järvinen * as well but it probably isn't going to buy off 1460832d11c5SIlpo Järvinen * because later SACKs might again split them, and 1461832d11c5SIlpo Järvinen * it would make skb timestamp tracking considerably 1462832d11c5SIlpo Järvinen * harder problem. 1463832d11c5SIlpo Järvinen */ 1464832d11c5SIlpo Järvinen goto fallback; 1465832d11c5SIlpo Järvinen } 1466832d11c5SIlpo Järvinen 1467832d11c5SIlpo Järvinen len = end_seq - TCP_SKB_CB(skb)->seq; 1468832d11c5SIlpo Järvinen BUG_ON(len < 0); 1469832d11c5SIlpo Järvinen BUG_ON(len > skb->len); 1470832d11c5SIlpo Järvinen 1471832d11c5SIlpo Järvinen /* MSS boundaries should be honoured or else pcount will 1472832d11c5SIlpo Järvinen * severely break even though it makes things bit trickier. 
1473832d11c5SIlpo Järvinen * Optimize common case to avoid most of the divides 1474832d11c5SIlpo Järvinen */ 1475832d11c5SIlpo Järvinen mss = tcp_skb_mss(skb); 1476832d11c5SIlpo Järvinen 1477832d11c5SIlpo Järvinen /* TODO: Fix DSACKs to not fragment already SACKed and we can 1478832d11c5SIlpo Järvinen * drop this restriction as unnecessary 1479832d11c5SIlpo Järvinen */ 1480775ffabfSIlpo Järvinen if (mss != tcp_skb_seglen(prev)) 1481832d11c5SIlpo Järvinen goto fallback; 1482832d11c5SIlpo Järvinen 1483832d11c5SIlpo Järvinen if (len == mss) { 1484832d11c5SIlpo Järvinen pcount = 1; 1485832d11c5SIlpo Järvinen } else if (len < mss) { 1486832d11c5SIlpo Järvinen goto noop; 1487832d11c5SIlpo Järvinen } else { 1488832d11c5SIlpo Järvinen pcount = len / mss; 1489832d11c5SIlpo Järvinen len = pcount * mss; 1490832d11c5SIlpo Järvinen } 1491832d11c5SIlpo Järvinen } 1492832d11c5SIlpo Järvinen 14934648dc97SNeal Cardwell /* tcp_sacktag_one() won't SACK-tag ranges below snd_una */ 14944648dc97SNeal Cardwell if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una)) 14954648dc97SNeal Cardwell goto fallback; 14964648dc97SNeal Cardwell 14973b4929f6SEric Dumazet if (!tcp_skb_shift(prev, skb, pcount, len)) 1498832d11c5SIlpo Järvinen goto fallback; 1499f3319816SEric Dumazet if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack)) 1500832d11c5SIlpo Järvinen goto out; 1501832d11c5SIlpo Järvinen 1502832d11c5SIlpo Järvinen /* Hole filled allows collapsing with the next as well, this is very 1503832d11c5SIlpo Järvinen * useful when hole on every nth skb pattern happens 1504832d11c5SIlpo Järvinen */ 150575c119afSEric Dumazet skb = skb_rb_next(prev); 150675c119afSEric Dumazet if (!skb) 1507832d11c5SIlpo Järvinen goto out; 1508832d11c5SIlpo Järvinen 1509f0bc52f3SIlpo Järvinen if (!skb_can_shift(skb) || 1510f0bc52f3SIlpo Järvinen ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) || 1511775ffabfSIlpo Järvinen (mss != tcp_skb_seglen(skb))) 1512832d11c5SIlpo Järvinen goto out; 1513832d11c5SIlpo Järvinen 1514832d11c5SIlpo Järvinen len = skb->len; 15153b4929f6SEric Dumazet pcount = tcp_skb_pcount(skb); 15163b4929f6SEric Dumazet if (tcp_skb_shift(prev, skb, pcount, len)) 15173b4929f6SEric Dumazet tcp_shifted_skb(sk, prev, skb, state, pcount, 1518f3319816SEric Dumazet len, mss, 0); 1519832d11c5SIlpo Järvinen 1520832d11c5SIlpo Järvinen out: 1521832d11c5SIlpo Järvinen return prev; 1522832d11c5SIlpo Järvinen 1523832d11c5SIlpo Järvinen noop: 1524832d11c5SIlpo Järvinen return skb; 1525832d11c5SIlpo Järvinen 1526832d11c5SIlpo Järvinen fallback: 1527c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK); 1528832d11c5SIlpo Järvinen return NULL; 1529832d11c5SIlpo Järvinen } 1530832d11c5SIlpo Järvinen 153168f8353bSIlpo Järvinen static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, 153268f8353bSIlpo Järvinen struct tcp_sack_block *next_dup, 1533a1197f5aSIlpo Järvinen struct tcp_sacktag_state *state, 153468f8353bSIlpo Järvinen u32 start_seq, u32 end_seq, 1535a2a385d6SEric Dumazet bool dup_sack_in) 153668f8353bSIlpo Järvinen { 1537832d11c5SIlpo Järvinen struct tcp_sock *tp = tcp_sk(sk); 1538832d11c5SIlpo Järvinen struct sk_buff *tmp; 1539832d11c5SIlpo Järvinen 154075c119afSEric Dumazet skb_rbtree_walk_from(skb) { 154168f8353bSIlpo Järvinen int in_sack = 0; 1542a2a385d6SEric Dumazet bool dup_sack = dup_sack_in; 154368f8353bSIlpo Järvinen 154468f8353bSIlpo Järvinen /* queue is in-order => we can short-circuit the walk early */ 154568f8353bSIlpo Järvinen if 
(!before(TCP_SKB_CB(skb)->seq, end_seq)) 154668f8353bSIlpo Järvinen break; 154768f8353bSIlpo Järvinen 154800db4124SIan Morris if (next_dup && 154968f8353bSIlpo Järvinen before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) { 155068f8353bSIlpo Järvinen in_sack = tcp_match_skb_to_sack(sk, skb, 155168f8353bSIlpo Järvinen next_dup->start_seq, 155268f8353bSIlpo Järvinen next_dup->end_seq); 155368f8353bSIlpo Järvinen if (in_sack > 0) 1554a2a385d6SEric Dumazet dup_sack = true; 155568f8353bSIlpo Järvinen } 155668f8353bSIlpo Järvinen 1557832d11c5SIlpo Järvinen /* skb reference here is a bit tricky to get right, since 1558832d11c5SIlpo Järvinen * shifting can eat and free both this skb and the next, 1559832d11c5SIlpo Järvinen * so not even _safe variant of the loop is enough. 1560832d11c5SIlpo Järvinen */ 1561832d11c5SIlpo Järvinen if (in_sack <= 0) { 1562a1197f5aSIlpo Järvinen tmp = tcp_shift_skb_data(sk, skb, state, 1563a1197f5aSIlpo Järvinen start_seq, end_seq, dup_sack); 156400db4124SIan Morris if (tmp) { 1565832d11c5SIlpo Järvinen if (tmp != skb) { 1566832d11c5SIlpo Järvinen skb = tmp; 1567832d11c5SIlpo Järvinen continue; 1568832d11c5SIlpo Järvinen } 1569832d11c5SIlpo Järvinen 1570832d11c5SIlpo Järvinen in_sack = 0; 1571832d11c5SIlpo Järvinen } else { 1572832d11c5SIlpo Järvinen in_sack = tcp_match_skb_to_sack(sk, skb, 1573832d11c5SIlpo Järvinen start_seq, 1574056834d9SIlpo Järvinen end_seq); 1575832d11c5SIlpo Järvinen } 1576832d11c5SIlpo Järvinen } 1577832d11c5SIlpo Järvinen 157868f8353bSIlpo Järvinen if (unlikely(in_sack < 0)) 157968f8353bSIlpo Järvinen break; 158068f8353bSIlpo Järvinen 1581832d11c5SIlpo Järvinen if (in_sack) { 1582cc9a672eSNeal Cardwell TCP_SKB_CB(skb)->sacked = 1583cc9a672eSNeal Cardwell tcp_sacktag_one(sk, 1584a1197f5aSIlpo Järvinen state, 1585cc9a672eSNeal Cardwell TCP_SKB_CB(skb)->sacked, 1586cc9a672eSNeal Cardwell TCP_SKB_CB(skb)->seq, 1587cc9a672eSNeal Cardwell TCP_SKB_CB(skb)->end_seq, 1588a1197f5aSIlpo Järvinen dup_sack, 158959c9af42SYuchung Cheng tcp_skb_pcount(skb), 15902fd66ffbSEric Dumazet tcp_skb_timestamp_us(skb)); 1591b9f64820SYuchung Cheng tcp_rate_skb_delivered(sk, skb, state->rate); 1592e2080072SEric Dumazet if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) 1593e2080072SEric Dumazet list_del_init(&skb->tcp_tsorted_anchor); 159468f8353bSIlpo Järvinen 1595832d11c5SIlpo Järvinen if (!before(TCP_SKB_CB(skb)->seq, 1596832d11c5SIlpo Järvinen tcp_highest_sack_seq(tp))) 1597832d11c5SIlpo Järvinen tcp_advance_highest_sack(sk, skb); 1598832d11c5SIlpo Järvinen } 159968f8353bSIlpo Järvinen } 160068f8353bSIlpo Järvinen return skb; 160168f8353bSIlpo Järvinen } 160268f8353bSIlpo Järvinen 16034bfabc46STaehee Yoo static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, u32 seq) 160475c119afSEric Dumazet { 160575c119afSEric Dumazet struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node; 160675c119afSEric Dumazet struct sk_buff *skb; 160775c119afSEric Dumazet 160875c119afSEric Dumazet while (*p) { 160975c119afSEric Dumazet parent = *p; 161075c119afSEric Dumazet skb = rb_to_skb(parent); 161175c119afSEric Dumazet if (before(seq, TCP_SKB_CB(skb)->seq)) { 161275c119afSEric Dumazet p = &parent->rb_left; 161375c119afSEric Dumazet continue; 161475c119afSEric Dumazet } 161575c119afSEric Dumazet if (!before(seq, TCP_SKB_CB(skb)->end_seq)) { 161675c119afSEric Dumazet p = &parent->rb_right; 161775c119afSEric Dumazet continue; 161875c119afSEric Dumazet } 161975c119afSEric Dumazet return skb; 162075c119afSEric Dumazet } 162175c119afSEric Dumazet return NULL; 162275c119afSEric Dumazet } 
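/* A minimal, self-contained sketch (not part of the kernel sources) of the
 * wrap-safe sequence arithmetic that the SACK code above depends on. The
 * names seq_before(), seq_after() and sack_block_sane() are illustrative
 * only; the kernel's equivalents are before()/after() from include/net/tcp.h
 * and tcp_is_sackblock_valid() earlier in this file.
 */
#include <stdbool.h>
#include <stdint.h>

static inline bool seq_before(uint32_t seq1, uint32_t seq2)
{
	/* "seq1 precedes seq2" modulo 2^32; correct across sequence wrap. */
	return (int32_t)(seq1 - seq2) < 0;
}

static inline bool seq_after(uint32_t seq1, uint32_t seq2)
{
	return seq_before(seq2, seq1);
}

/* Simplified form of the non-D-SACK part of the "SACK block validation"
 * rules described earlier: the block must be ordered and must fall inside
 * (SND.UNA, SND.NXT].
 */
static bool sack_block_sane(uint32_t start_seq, uint32_t end_seq,
			    uint32_t snd_una, uint32_t snd_nxt)
{
	if (seq_after(end_seq, snd_nxt) || !seq_before(start_seq, end_seq))
		return false;	/* beyond SND.NXT, or reversed */
	if (!seq_before(start_seq, snd_nxt))
		return false;	/* start_seq wrapped past SND.NXT */
	return seq_after(start_seq, snd_una);
}

/* E.g. sack_block_sane(0xfffffff0, 0x10, 0xffffffe0, 0x100) is true even
 * though the block straddles the 2^32 wrap, because every comparison is done
 * on the wrapped difference.
 */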
162375c119afSEric Dumazet 162468f8353bSIlpo Järvinen static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, 1625a1197f5aSIlpo Järvinen u32 skip_to_seq) 162668f8353bSIlpo Järvinen { 162775c119afSEric Dumazet if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq)) 162868f8353bSIlpo Järvinen return skb; 162975c119afSEric Dumazet 16304bfabc46STaehee Yoo return tcp_sacktag_bsearch(sk, skip_to_seq); 163168f8353bSIlpo Järvinen } 163268f8353bSIlpo Järvinen 163368f8353bSIlpo Järvinen static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, 163468f8353bSIlpo Järvinen struct sock *sk, 163568f8353bSIlpo Järvinen struct tcp_sack_block *next_dup, 1636a1197f5aSIlpo Järvinen struct tcp_sacktag_state *state, 1637a1197f5aSIlpo Järvinen u32 skip_to_seq) 163868f8353bSIlpo Järvinen { 163951456b29SIan Morris if (!next_dup) 164068f8353bSIlpo Järvinen return skb; 164168f8353bSIlpo Järvinen 164268f8353bSIlpo Järvinen if (before(next_dup->start_seq, skip_to_seq)) { 16434bfabc46STaehee Yoo skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq); 1644a1197f5aSIlpo Järvinen skb = tcp_sacktag_walk(skb, sk, NULL, state, 164568f8353bSIlpo Järvinen next_dup->start_seq, next_dup->end_seq, 1646a1197f5aSIlpo Järvinen 1); 164768f8353bSIlpo Järvinen } 164868f8353bSIlpo Järvinen 164968f8353bSIlpo Järvinen return skb; 165068f8353bSIlpo Järvinen } 165168f8353bSIlpo Järvinen 1652cf533ea5SEric Dumazet static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache) 165368f8353bSIlpo Järvinen { 165468f8353bSIlpo Järvinen return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); 165568f8353bSIlpo Järvinen } 165668f8353bSIlpo Järvinen 16571da177e4SLinus Torvalds static int 1658cf533ea5SEric Dumazet tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, 1659196da974SKenneth Klette Jonassen u32 prior_snd_una, struct tcp_sacktag_state *state) 16601da177e4SLinus Torvalds { 16611da177e4SLinus Torvalds struct tcp_sock *tp = tcp_sk(sk); 1662cf533ea5SEric Dumazet const unsigned char *ptr = (skb_transport_header(ack_skb) + 16639c70220bSArnaldo Carvalho de Melo TCP_SKB_CB(ack_skb)->sacked); 1664fd6dad61SIlpo Järvinen struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); 16654389ddedSAdam Langley struct tcp_sack_block sp[TCP_NUM_SACKS]; 166668f8353bSIlpo Järvinen struct tcp_sack_block *cache; 166768f8353bSIlpo Järvinen struct sk_buff *skb; 16684389ddedSAdam Langley int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3); 1669fd6dad61SIlpo Järvinen int used_sacks; 1670a2a385d6SEric Dumazet bool found_dup_sack = false; 167168f8353bSIlpo Järvinen int i, j; 1672fda03fbbSBaruch Even int first_sack_index; 16731da177e4SLinus Torvalds 1674196da974SKenneth Klette Jonassen state->flag = 0; 1675737ff314SYuchung Cheng state->reord = tp->snd_nxt; 1676a1197f5aSIlpo Järvinen 1677737ff314SYuchung Cheng if (!tp->sacked_out) 16786859d494SIlpo Järvinen tcp_highest_sack_reset(sk); 16791da177e4SLinus Torvalds 16801ed83465SPavel Emelyanov found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, 1681d06e021dSDavid S. 
Miller num_sacks, prior_snd_una); 1682b9f64820SYuchung Cheng if (found_dup_sack) { 1683196da974SKenneth Klette Jonassen state->flag |= FLAG_DSACKING_ACK; 1684b9f64820SYuchung Cheng tp->delivered++; /* A spurious retransmission is delivered */ 1685b9f64820SYuchung Cheng } 16866f74651aSBaruch Even 16876f74651aSBaruch Even /* Eliminate too old ACKs, but take into 16886f74651aSBaruch Even * account more or less fresh ones, they can 16896f74651aSBaruch Even * contain valid SACK info. 16906f74651aSBaruch Even */ 16916f74651aSBaruch Even if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window)) 16926f74651aSBaruch Even return 0; 16936f74651aSBaruch Even 169496a2d41aSIlpo Järvinen if (!tp->packets_out) 169596a2d41aSIlpo Järvinen goto out; 169696a2d41aSIlpo Järvinen 1697fd6dad61SIlpo Järvinen used_sacks = 0; 1698fd6dad61SIlpo Järvinen first_sack_index = 0; 1699fd6dad61SIlpo Järvinen for (i = 0; i < num_sacks; i++) { 1700a2a385d6SEric Dumazet bool dup_sack = !i && found_dup_sack; 1701fd6dad61SIlpo Järvinen 1702d3e2ce3bSHarvey Harrison sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq); 1703d3e2ce3bSHarvey Harrison sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq); 1704fd6dad61SIlpo Järvinen 1705fd6dad61SIlpo Järvinen if (!tcp_is_sackblock_valid(tp, dup_sack, 1706fd6dad61SIlpo Järvinen sp[used_sacks].start_seq, 1707fd6dad61SIlpo Järvinen sp[used_sacks].end_seq)) { 170840b215e5SPavel Emelyanov int mib_idx; 170940b215e5SPavel Emelyanov 1710fd6dad61SIlpo Järvinen if (dup_sack) { 1711fd6dad61SIlpo Järvinen if (!tp->undo_marker) 171240b215e5SPavel Emelyanov mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO; 1713fd6dad61SIlpo Järvinen else 171440b215e5SPavel Emelyanov mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD; 1715fd6dad61SIlpo Järvinen } else { 1716fd6dad61SIlpo Järvinen /* Don't count olds caused by ACK reordering */ 1717fd6dad61SIlpo Järvinen if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) && 1718fd6dad61SIlpo Järvinen !after(sp[used_sacks].end_seq, tp->snd_una)) 1719fd6dad61SIlpo Järvinen continue; 172040b215e5SPavel Emelyanov mib_idx = LINUX_MIB_TCPSACKDISCARD; 1721fd6dad61SIlpo Järvinen } 172240b215e5SPavel Emelyanov 1723c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), mib_idx); 1724fd6dad61SIlpo Järvinen if (i == 0) 1725fd6dad61SIlpo Järvinen first_sack_index = -1; 1726fd6dad61SIlpo Järvinen continue; 1727fd6dad61SIlpo Järvinen } 1728fd6dad61SIlpo Järvinen 1729fd6dad61SIlpo Järvinen /* Ignore very old stuff early */ 1730fd6dad61SIlpo Järvinen if (!after(sp[used_sacks].end_seq, prior_snd_una)) 1731fd6dad61SIlpo Järvinen continue; 1732fd6dad61SIlpo Järvinen 1733fd6dad61SIlpo Järvinen used_sacks++; 1734fd6dad61SIlpo Järvinen } 1735fd6dad61SIlpo Järvinen 17366a438bbeSStephen Hemminger /* order SACK blocks to allow in order walk of the retrans queue */ 1737fd6dad61SIlpo Järvinen for (i = used_sacks - 1; i > 0; i--) { 17386a438bbeSStephen Hemminger for (j = 0; j < i; j++) { 1739fd6dad61SIlpo Järvinen if (after(sp[j].start_seq, sp[j + 1].start_seq)) { 1740a0bffffcSIlpo Järvinen swap(sp[j], sp[j + 1]); 1741fda03fbbSBaruch Even 1742fda03fbbSBaruch Even /* Track where the first SACK block goes to */ 1743fda03fbbSBaruch Even if (j == first_sack_index) 1744fda03fbbSBaruch Even first_sack_index = j + 1; 17456a438bbeSStephen Hemminger } 17466a438bbeSStephen Hemminger } 17476a438bbeSStephen Hemminger } 17486a438bbeSStephen Hemminger 174975c119afSEric Dumazet state->mss_now = tcp_current_mss(sk); 175075c119afSEric Dumazet skb = NULL; 175168f8353bSIlpo Järvinen i = 0; 
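	/* The walk below compares each (sorted) SACK block against
	 * recv_sack_cache, the blocks reported by the previous ACK. Ranges
	 * already tagged on the last pass are skipped, so only the genuinely
	 * new head and tail portions of each block are re-walked in the
	 * retransmit queue. For example, if the previous ACK SACKed
	 * [1000,3000) and this ACK reports [1000,4000), only [3000,4000)
	 * still needs to be walked.
	 */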
175268f8353bSIlpo Järvinen 175368f8353bSIlpo Järvinen if (!tp->sacked_out) { 175468f8353bSIlpo Järvinen /* It's already past, so skip checking against it */ 175568f8353bSIlpo Järvinen cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); 175668f8353bSIlpo Järvinen } else { 175768f8353bSIlpo Järvinen cache = tp->recv_sack_cache; 175868f8353bSIlpo Järvinen /* Skip empty blocks in at head of the cache */ 175968f8353bSIlpo Järvinen while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq && 176068f8353bSIlpo Järvinen !cache->end_seq) 176168f8353bSIlpo Järvinen cache++; 1762fda03fbbSBaruch Even } 1763fda03fbbSBaruch Even 176468f8353bSIlpo Järvinen while (i < used_sacks) { 1765fd6dad61SIlpo Järvinen u32 start_seq = sp[i].start_seq; 1766fd6dad61SIlpo Järvinen u32 end_seq = sp[i].end_seq; 1767a2a385d6SEric Dumazet bool dup_sack = (found_dup_sack && (i == first_sack_index)); 176868f8353bSIlpo Järvinen struct tcp_sack_block *next_dup = NULL; 1769e56d6cd6SIlpo Järvinen 177068f8353bSIlpo Järvinen if (found_dup_sack && ((i + 1) == first_sack_index)) 177168f8353bSIlpo Järvinen next_dup = &sp[i + 1]; 17721da177e4SLinus Torvalds 177368f8353bSIlpo Järvinen /* Skip too early cached blocks */ 177468f8353bSIlpo Järvinen while (tcp_sack_cache_ok(tp, cache) && 177568f8353bSIlpo Järvinen !before(start_seq, cache->end_seq)) 177668f8353bSIlpo Järvinen cache++; 17771da177e4SLinus Torvalds 177868f8353bSIlpo Järvinen /* Can skip some work by looking recv_sack_cache? */ 177968f8353bSIlpo Järvinen if (tcp_sack_cache_ok(tp, cache) && !dup_sack && 178068f8353bSIlpo Järvinen after(end_seq, cache->start_seq)) { 1781fe067e8aSDavid S. Miller 178268f8353bSIlpo Järvinen /* Head todo? */ 178368f8353bSIlpo Järvinen if (before(start_seq, cache->start_seq)) { 17844bfabc46STaehee Yoo skb = tcp_sacktag_skip(skb, sk, start_seq); 1785056834d9SIlpo Järvinen skb = tcp_sacktag_walk(skb, sk, next_dup, 1786196da974SKenneth Klette Jonassen state, 1787056834d9SIlpo Järvinen start_seq, 1788056834d9SIlpo Järvinen cache->start_seq, 1789a1197f5aSIlpo Järvinen dup_sack); 1790fda03fbbSBaruch Even } 17916a438bbeSStephen Hemminger 179268f8353bSIlpo Järvinen /* Rest of the block already fully processed? */ 179320de20beSIlpo Järvinen if (!after(end_seq, cache->end_seq)) 179420de20beSIlpo Järvinen goto advance_sp; 179520de20beSIlpo Järvinen 1796056834d9SIlpo Järvinen skb = tcp_maybe_skipping_dsack(skb, sk, next_dup, 1797196da974SKenneth Klette Jonassen state, 1798a1197f5aSIlpo Järvinen cache->end_seq); 179968f8353bSIlpo Järvinen 180068f8353bSIlpo Järvinen /* ...tail remains todo... */ 18016859d494SIlpo Järvinen if (tcp_highest_sack_seq(tp) == cache->end_seq) { 180220de20beSIlpo Järvinen /* ...but better entrypoint exists! 
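 * The highest SACKed skb is already tracked, so when the cached block
 * ends exactly at tcp_highest_sack_seq() we can resume the walk from
 * that skb instead of searching the retransmit queue for cache->end_seq.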
*/ 18036859d494SIlpo Järvinen skb = tcp_highest_sack(sk); 180451456b29SIan Morris if (!skb) 18056859d494SIlpo Järvinen break; 180668f8353bSIlpo Järvinen cache++; 180768f8353bSIlpo Järvinen goto walk; 1808e56d6cd6SIlpo Järvinen } 1809e56d6cd6SIlpo Järvinen 18104bfabc46STaehee Yoo skb = tcp_sacktag_skip(skb, sk, cache->end_seq); 181168f8353bSIlpo Järvinen /* Check overlap against next cached too (past this one already) */ 181268f8353bSIlpo Järvinen cache++; 181368f8353bSIlpo Järvinen continue; 18141da177e4SLinus Torvalds } 1815fbd52eb2SIlpo Järvinen 18166859d494SIlpo Järvinen if (!before(start_seq, tcp_highest_sack_seq(tp))) { 18176859d494SIlpo Järvinen skb = tcp_highest_sack(sk); 181851456b29SIan Morris if (!skb) 18196859d494SIlpo Järvinen break; 182068f8353bSIlpo Järvinen } 18214bfabc46STaehee Yoo skb = tcp_sacktag_skip(skb, sk, start_seq); 182268f8353bSIlpo Järvinen 182368f8353bSIlpo Järvinen walk: 1824196da974SKenneth Klette Jonassen skb = tcp_sacktag_walk(skb, sk, next_dup, state, 1825a1197f5aSIlpo Järvinen start_seq, end_seq, dup_sack); 182668f8353bSIlpo Järvinen 182768f8353bSIlpo Järvinen advance_sp: 182868f8353bSIlpo Järvinen i++; 18291da177e4SLinus Torvalds } 18301da177e4SLinus Torvalds 183168f8353bSIlpo Järvinen /* Clear the head of the cache sack blocks so we can skip it next time */ 183268f8353bSIlpo Järvinen for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) { 183368f8353bSIlpo Järvinen tp->recv_sack_cache[i].start_seq = 0; 183468f8353bSIlpo Järvinen tp->recv_sack_cache[i].end_seq = 0; 183568f8353bSIlpo Järvinen } 183668f8353bSIlpo Järvinen for (j = 0; j < used_sacks; j++) 183768f8353bSIlpo Järvinen tp->recv_sack_cache[i++] = sp[j]; 183868f8353bSIlpo Järvinen 1839737ff314SYuchung Cheng if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss || tp->undo_marker) 1840737ff314SYuchung Cheng tcp_check_sack_reordering(sk, state->reord, 0); 18411da177e4SLinus Torvalds 18429dac8835SYuchung Cheng tcp_verify_left_out(tp); 184396a2d41aSIlpo Järvinen out: 184496a2d41aSIlpo Järvinen 18451da177e4SLinus Torvalds #if FASTRETRANS_DEBUG > 0 1846547b792cSIlpo Järvinen WARN_ON((int)tp->sacked_out < 0); 1847547b792cSIlpo Järvinen WARN_ON((int)tp->lost_out < 0); 1848547b792cSIlpo Järvinen WARN_ON((int)tp->retrans_out < 0); 1849547b792cSIlpo Järvinen WARN_ON((int)tcp_packets_in_flight(tp) < 0); 18501da177e4SLinus Torvalds #endif 1851196da974SKenneth Klette Jonassen return state->flag; 18521da177e4SLinus Torvalds } 18531da177e4SLinus Torvalds 1854882bebaaSIlpo Järvinen /* Limits sacked_out so that sum with lost_out isn't ever larger than 1855a2a385d6SEric Dumazet * packets_out. Returns false if sacked_out adjustement wasn't necessary. 185630935cf4SIlpo Järvinen */ 1857a2a385d6SEric Dumazet static bool tcp_limit_reno_sacked(struct tcp_sock *tp) 18584ddf6676SIlpo Järvinen { 18594ddf6676SIlpo Järvinen u32 holes; 18604ddf6676SIlpo Järvinen 18614ddf6676SIlpo Järvinen holes = max(tp->lost_out, 1U); 18624ddf6676SIlpo Järvinen holes = min(holes, tp->packets_out); 18634ddf6676SIlpo Järvinen 18644ddf6676SIlpo Järvinen if ((tp->sacked_out + holes) > tp->packets_out) { 18654ddf6676SIlpo Järvinen tp->sacked_out = tp->packets_out - holes; 1866a2a385d6SEric Dumazet return true; 18674ddf6676SIlpo Järvinen } 1868a2a385d6SEric Dumazet return false; 1869882bebaaSIlpo Järvinen } 1870882bebaaSIlpo Järvinen 1871882bebaaSIlpo Järvinen /* If we receive more dupacks than we expected counting segments 1872882bebaaSIlpo Järvinen * in assumption of absent reordering, interpret this as reordering. 
1873882bebaaSIlpo Järvinen * The only another reason could be bug in receiver TCP. 1874882bebaaSIlpo Järvinen */ 1875882bebaaSIlpo Järvinen static void tcp_check_reno_reordering(struct sock *sk, const int addend) 1876882bebaaSIlpo Järvinen { 1877882bebaaSIlpo Järvinen struct tcp_sock *tp = tcp_sk(sk); 1878737ff314SYuchung Cheng 1879737ff314SYuchung Cheng if (!tcp_limit_reno_sacked(tp)) 1880737ff314SYuchung Cheng return; 1881737ff314SYuchung Cheng 1882737ff314SYuchung Cheng tp->reordering = min_t(u32, tp->packets_out + addend, 1883737ff314SYuchung Cheng sock_net(sk)->ipv4.sysctl_tcp_max_reordering); 18847ec65372SWei Wang tp->reord_seen++; 1885737ff314SYuchung Cheng NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER); 18864ddf6676SIlpo Järvinen } 18874ddf6676SIlpo Järvinen 18884ddf6676SIlpo Järvinen /* Emulate SACKs for SACKless connection: account for a new dupack. */ 18894ddf6676SIlpo Järvinen 189019119f29SEric Dumazet static void tcp_add_reno_sack(struct sock *sk, int num_dupack) 18914ddf6676SIlpo Järvinen { 189219119f29SEric Dumazet if (num_dupack) { 18934ddf6676SIlpo Järvinen struct tcp_sock *tp = tcp_sk(sk); 1894ddf1af6fSYuchung Cheng u32 prior_sacked = tp->sacked_out; 189519119f29SEric Dumazet s32 delivered; 1896ddf1af6fSYuchung Cheng 189719119f29SEric Dumazet tp->sacked_out += num_dupack; 18984ddf6676SIlpo Järvinen tcp_check_reno_reordering(sk, 0); 189919119f29SEric Dumazet delivered = tp->sacked_out - prior_sacked; 190019119f29SEric Dumazet if (delivered > 0) 190119119f29SEric Dumazet tp->delivered += delivered; 1902005903bcSIlpo Järvinen tcp_verify_left_out(tp); 19034ddf6676SIlpo Järvinen } 190419119f29SEric Dumazet } 19054ddf6676SIlpo Järvinen 19064ddf6676SIlpo Järvinen /* Account for ACK, ACKing some data in Reno Recovery phase. */ 19074ddf6676SIlpo Järvinen 19084ddf6676SIlpo Järvinen static void tcp_remove_reno_sacks(struct sock *sk, int acked) 19094ddf6676SIlpo Järvinen { 19104ddf6676SIlpo Järvinen struct tcp_sock *tp = tcp_sk(sk); 19114ddf6676SIlpo Järvinen 19124ddf6676SIlpo Järvinen if (acked > 0) { 19134ddf6676SIlpo Järvinen /* One ACK acked hole. The rest eat duplicate ACKs. */ 1914ddf1af6fSYuchung Cheng tp->delivered += max_t(int, acked - tp->sacked_out, 1); 19154ddf6676SIlpo Järvinen if (acked - 1 >= tp->sacked_out) 19164ddf6676SIlpo Järvinen tp->sacked_out = 0; 19174ddf6676SIlpo Järvinen else 19184ddf6676SIlpo Järvinen tp->sacked_out -= acked - 1; 19194ddf6676SIlpo Järvinen } 19204ddf6676SIlpo Järvinen tcp_check_reno_reordering(sk, acked); 1921005903bcSIlpo Järvinen tcp_verify_left_out(tp); 19224ddf6676SIlpo Järvinen } 19234ddf6676SIlpo Järvinen 19244ddf6676SIlpo Järvinen static inline void tcp_reset_reno_sack(struct tcp_sock *tp) 19254ddf6676SIlpo Järvinen { 19264ddf6676SIlpo Järvinen tp->sacked_out = 0; 19274ddf6676SIlpo Järvinen } 19284ddf6676SIlpo Järvinen 1929989e04c5SYuchung Cheng void tcp_clear_retrans(struct tcp_sock *tp) 19301da177e4SLinus Torvalds { 19311da177e4SLinus Torvalds tp->retrans_out = 0; 19321da177e4SLinus Torvalds tp->lost_out = 0; 19331da177e4SLinus Torvalds tp->undo_marker = 0; 19346e08d5e3SYuchung Cheng tp->undo_retrans = -1; 19354cd82999SIlpo Järvinen tp->sacked_out = 0; 19364cd82999SIlpo Järvinen } 19374cd82999SIlpo Järvinen 1938989e04c5SYuchung Cheng static inline void tcp_init_undo(struct tcp_sock *tp) 1939989e04c5SYuchung Cheng { 1940989e04c5SYuchung Cheng tp->undo_marker = tp->snd_una; 1941989e04c5SYuchung Cheng /* Retransmission still in flight may cause DSACKs later. */ 1942989e04c5SYuchung Cheng tp->undo_retrans = tp->retrans_out ? 
: -1; 1943989e04c5SYuchung Cheng } 1944989e04c5SYuchung Cheng 1945b8fef65aSYuchung Cheng static bool tcp_is_rack(const struct sock *sk) 1946b8fef65aSYuchung Cheng { 1947b8fef65aSYuchung Cheng return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION; 1948b8fef65aSYuchung Cheng } 1949b8fef65aSYuchung Cheng 19502ad55f56SYuchung Cheng /* If we detect SACK reneging, forget all SACK information 19511da177e4SLinus Torvalds * and reset tags completely, otherwise preserve SACKs. If receiver 19521da177e4SLinus Torvalds * dropped its ofo queue, we will know this due to reneging detection. 19531da177e4SLinus Torvalds */ 19542ad55f56SYuchung Cheng static void tcp_timeout_mark_lost(struct sock *sk) 19552ad55f56SYuchung Cheng { 19562ad55f56SYuchung Cheng struct tcp_sock *tp = tcp_sk(sk); 195756f8c5d7SYuchung Cheng struct sk_buff *skb, *head; 19582ad55f56SYuchung Cheng bool is_reneg; /* is receiver reneging on SACKs? */ 19592ad55f56SYuchung Cheng 196056f8c5d7SYuchung Cheng head = tcp_rtx_queue_head(sk); 196156f8c5d7SYuchung Cheng is_reneg = head && (TCP_SKB_CB(head)->sacked & TCPCB_SACKED_ACKED); 19622ad55f56SYuchung Cheng if (is_reneg) { 19632ad55f56SYuchung Cheng NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); 19642ad55f56SYuchung Cheng tp->sacked_out = 0; 19652ad55f56SYuchung Cheng /* Mark SACK reneging until we recover from this loss event. */ 19662ad55f56SYuchung Cheng tp->is_sack_reneg = 1; 19672ad55f56SYuchung Cheng } else if (tcp_is_reno(tp)) { 19682ad55f56SYuchung Cheng tcp_reset_reno_sack(tp); 19692ad55f56SYuchung Cheng } 19702ad55f56SYuchung Cheng 197156f8c5d7SYuchung Cheng skb = head; 19722ad55f56SYuchung Cheng skb_rbtree_walk_from(skb) { 19732ad55f56SYuchung Cheng if (is_reneg) 19742ad55f56SYuchung Cheng TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; 197556f8c5d7SYuchung Cheng else if (tcp_is_rack(sk) && skb != head && 197656f8c5d7SYuchung Cheng tcp_rack_skb_timeout(tp, skb, 0) > 0) 197756f8c5d7SYuchung Cheng continue; /* Don't mark recently sent ones lost yet */ 19782ad55f56SYuchung Cheng tcp_mark_skb_lost(sk, skb); 19792ad55f56SYuchung Cheng } 19802ad55f56SYuchung Cheng tcp_verify_left_out(tp); 19812ad55f56SYuchung Cheng tcp_clear_all_retrans_hints(tp); 19822ad55f56SYuchung Cheng } 19832ad55f56SYuchung Cheng 19842ad55f56SYuchung Cheng /* Enter Loss state. */ 19855ae344c9SNeal Cardwell void tcp_enter_loss(struct sock *sk) 19861da177e4SLinus Torvalds { 19876687e988SArnaldo Carvalho de Melo const struct inet_connection_sock *icsk = inet_csk(sk); 19881da177e4SLinus Torvalds struct tcp_sock *tp = tcp_sk(sk); 19891043e25fSNikolay Borisov struct net *net = sock_net(sk); 1990cc663f4dSYuchung Cheng bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; 19911da177e4SLinus Torvalds 1992c77d62ffSYuchung Cheng tcp_timeout_mark_lost(sk); 1993c77d62ffSYuchung Cheng 19941da177e4SLinus Torvalds /* Reduce ssthresh if it has not yet been made inside this window. 
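 * That is, do it at most once per loss episode: when we are still in
 * Open/Disorder, when snd_una has already reached the previous episode's
 * high_seq, or on the first timeout of a Loss episode.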
*/ 1995e33099f9SYuchung Cheng if (icsk->icsk_ca_state <= TCP_CA_Disorder || 1996e33099f9SYuchung Cheng !after(tp->high_seq, tp->snd_una) || 19976687e988SArnaldo Carvalho de Melo (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { 19986687e988SArnaldo Carvalho de Melo tp->prior_ssthresh = tcp_current_ssthresh(sk); 19994faf7839SYuchung Cheng tp->prior_cwnd = tp->snd_cwnd; 20006687e988SArnaldo Carvalho de Melo tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); 20016687e988SArnaldo Carvalho de Melo tcp_ca_event(sk, CA_EVENT_LOSS); 2002989e04c5SYuchung Cheng tcp_init_undo(tp); 20031da177e4SLinus Torvalds } 200456f8c5d7SYuchung Cheng tp->snd_cwnd = tcp_packets_in_flight(tp) + 1; 20051da177e4SLinus Torvalds tp->snd_cwnd_cnt = 0; 2006c2203cf7SEric Dumazet tp->snd_cwnd_stamp = tcp_jiffies32; 20071da177e4SLinus Torvalds 200874c181d5SYuchung Cheng /* Timeout in disordered state after receiving substantial DUPACKs 200974c181d5SYuchung Cheng * suggests that the degree of reordering is over-estimated. 201074c181d5SYuchung Cheng */ 201174c181d5SYuchung Cheng if (icsk->icsk_ca_state <= TCP_CA_Disorder && 20121043e25fSNikolay Borisov tp->sacked_out >= net->ipv4.sysctl_tcp_reordering) 20131da177e4SLinus Torvalds tp->reordering = min_t(unsigned int, tp->reordering, 20141043e25fSNikolay Borisov net->ipv4.sysctl_tcp_reordering); 20156687e988SArnaldo Carvalho de Melo tcp_set_ca_state(sk, TCP_CA_Loss); 20161da177e4SLinus Torvalds tp->high_seq = tp->snd_nxt; 2017735d3831SFlorian Westphal tcp_ecn_queue_cwr(tp); 2018e33099f9SYuchung Cheng 2019cc663f4dSYuchung Cheng /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous 2020cc663f4dSYuchung Cheng * loss recovery is underway except recurring timeout(s) on 2021cc663f4dSYuchung Cheng * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing 2022e33099f9SYuchung Cheng */ 2023af9b69a7SEric Dumazet tp->frto = net->ipv4.sysctl_tcp_frto && 2024cc663f4dSYuchung Cheng (new_recovery || icsk->icsk_retransmits) && 2025cc663f4dSYuchung Cheng !inet_csk(sk)->icsk_mtup.probe_size; 20261da177e4SLinus Torvalds } 20271da177e4SLinus Torvalds 2028cadbd031SIlpo Järvinen /* If ACK arrived pointing to a remembered SACK, it means that our 2029cadbd031SIlpo Järvinen * remembered SACKs do not reflect real state of receiver i.e. 20301da177e4SLinus Torvalds * receiver _host_ is heavily congested (or buggy). 2031cadbd031SIlpo Järvinen * 20325ae344c9SNeal Cardwell * To avoid big spurious retransmission bursts due to transient SACK 20335ae344c9SNeal Cardwell * scoreboard oddities that look like reneging, we give the receiver a 20345ae344c9SNeal Cardwell * little time (max(RTT/2, 10ms)) to send us some more ACKs that will 20355ae344c9SNeal Cardwell * restore sanity to the SACK scoreboard. If the apparent reneging 20365ae344c9SNeal Cardwell * persists until this RTO then we'll clear the SACK scoreboard. 
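 * Note: tp->srtt_us holds 8 * SRTT in microseconds, so the srtt_us >> 4
 * below is the RTT/2 term of max(RTT/2, 10ms).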
20371da177e4SLinus Torvalds */ 2038a2a385d6SEric Dumazet static bool tcp_check_sack_reneging(struct sock *sk, int flag) 2039cadbd031SIlpo Järvinen { 2040cadbd031SIlpo Järvinen if (flag & FLAG_SACK_RENEGING) { 20415ae344c9SNeal Cardwell struct tcp_sock *tp = tcp_sk(sk); 20425ae344c9SNeal Cardwell unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4), 20435ae344c9SNeal Cardwell msecs_to_jiffies(10)); 20441da177e4SLinus Torvalds 2045463c84b9SArnaldo Carvalho de Melo inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 20465ae344c9SNeal Cardwell delay, TCP_RTO_MAX); 2047a2a385d6SEric Dumazet return true; 20481da177e4SLinus Torvalds } 2049a2a385d6SEric Dumazet return false; 20501da177e4SLinus Torvalds } 20511da177e4SLinus Torvalds 205285cc391cSIlpo Järvinen /* Heurestics to calculate number of duplicate ACKs. There's no dupACKs 205385cc391cSIlpo Järvinen * counter when SACK is enabled (without SACK, sacked_out is used for 205485cc391cSIlpo Järvinen * that purpose). 205585cc391cSIlpo Järvinen * 205685cc391cSIlpo Järvinen * With reordering, holes may still be in flight, so RFC3517 recovery 205785cc391cSIlpo Järvinen * uses pure sacked_out (total number of SACKed segments) even though 205885cc391cSIlpo Järvinen * it violates the RFC that uses duplicate ACKs, often these are equal 205985cc391cSIlpo Järvinen * but when e.g. out-of-window ACKs or packet duplication occurs, 206085cc391cSIlpo Järvinen * they differ. Since neither occurs due to loss, TCP should really 206185cc391cSIlpo Järvinen * ignore them. 206285cc391cSIlpo Järvinen */ 2063cf533ea5SEric Dumazet static inline int tcp_dupack_heuristics(const struct tcp_sock *tp) 206485cc391cSIlpo Järvinen { 2065713bafeaSYuchung Cheng return tp->sacked_out + 1; 206685cc391cSIlpo Järvinen } 206785cc391cSIlpo Järvinen 2068713bafeaSYuchung Cheng /* Linux NewReno/SACK/ECN state machine. 20691da177e4SLinus Torvalds * -------------------------------------- 20701da177e4SLinus Torvalds * 20711da177e4SLinus Torvalds * "Open" Normal state, no dubious events, fast path. 20721da177e4SLinus Torvalds * "Disorder" In all the respects it is "Open", 20731da177e4SLinus Torvalds * but requires a bit more attention. It is entered when 20741da177e4SLinus Torvalds * we see some SACKs or dupacks. It is split of "Open" 20751da177e4SLinus Torvalds * mainly to move some processing from fast path to slow one. 20761da177e4SLinus Torvalds * "CWR" CWND was reduced due to some Congestion Notification event. 20771da177e4SLinus Torvalds * It can be ECN, ICMP source quench, local device congestion. 20781da177e4SLinus Torvalds * "Recovery" CWND was reduced, we are fast-retransmitting. 20791da177e4SLinus Torvalds * "Loss" CWND was reduced due to RTO timeout or SACK reneging. 20801da177e4SLinus Torvalds * 20811da177e4SLinus Torvalds * tcp_fastretrans_alert() is entered: 20821da177e4SLinus Torvalds * - each incoming ACK, if state is not "Open" 20831da177e4SLinus Torvalds * - when arrived ACK is unusual, namely: 20841da177e4SLinus Torvalds * * SACK 20851da177e4SLinus Torvalds * * Duplicate ACK. 20861da177e4SLinus Torvalds * * ECN ECE. 20871da177e4SLinus Torvalds * 20881da177e4SLinus Torvalds * Counting packets in flight is pretty simple. 20891da177e4SLinus Torvalds * 20901da177e4SLinus Torvalds * in_flight = packets_out - left_out + retrans_out 20911da177e4SLinus Torvalds * 20921da177e4SLinus Torvalds * packets_out is SND.NXT-SND.UNA counted in packets. 20931da177e4SLinus Torvalds * 20941da177e4SLinus Torvalds * retrans_out is number of retransmitted segments. 
20951da177e4SLinus Torvalds  *
20961da177e4SLinus Torvalds  *	left_out is number of segments that left the network, but are not ACKed yet.
20971da177e4SLinus Torvalds  *
20981da177e4SLinus Torvalds  *		left_out = sacked_out + lost_out
20991da177e4SLinus Torvalds  *
21001da177e4SLinus Torvalds  *	sacked_out: Packets, which arrived to receiver out of order
21011da177e4SLinus Torvalds  *		and hence not ACKed. With SACKs this number is simply
21021da177e4SLinus Torvalds  *		amount of SACKed data. Even without SACKs
21031da177e4SLinus Torvalds  *		it is easy to give pretty reliable estimate of this number,
21041da177e4SLinus Torvalds  *		counting duplicate ACKs.
21051da177e4SLinus Torvalds  *
21061da177e4SLinus Torvalds  *	lost_out: Packets lost by network. TCP has no explicit
21071da177e4SLinus Torvalds  *		"loss notification" feedback from network (for now).
21081da177e4SLinus Torvalds  *		It means that this number can be only _guessed_.
21091da177e4SLinus Torvalds  *		Actually, it is the heuristics to predict loss that
21101da177e4SLinus Torvalds  *		distinguishes different algorithms.
21111da177e4SLinus Torvalds  *
21121da177e4SLinus Torvalds  * F.e. after RTO, when all the queue is considered as lost,
21131da177e4SLinus Torvalds  * lost_out = packets_out and in_flight = retrans_out.
21141da177e4SLinus Torvalds  *
2115a0370b3fSYuchung Cheng  * Essentially, we have now a few algorithms detecting
21161da177e4SLinus Torvalds  * lost packets.
21171da177e4SLinus Torvalds  *
2118a0370b3fSYuchung Cheng  * If the receiver supports SACK:
2119a0370b3fSYuchung Cheng  *
2120a0370b3fSYuchung Cheng  * RFC6675/3517: It is the conventional algorithm. A packet is
2121a0370b3fSYuchung Cheng  * considered lost if the number of higher sequence packets
2122a0370b3fSYuchung Cheng  * SACKed is greater than or equal to the DUPACK threshold
2123a0370b3fSYuchung Cheng  * (reordering). This is implemented in tcp_mark_head_lost and
2124a0370b3fSYuchung Cheng  * tcp_update_scoreboard.
2125a0370b3fSYuchung Cheng  *
2126a0370b3fSYuchung Cheng  * RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm
2127a0370b3fSYuchung Cheng  * (2017-) that checks timing instead of counting DUPACKs.
2128a0370b3fSYuchung Cheng  * Essentially a packet is considered lost if it's not S/ACKed
2129a0370b3fSYuchung Cheng  * after RTT + reordering_window, where both metrics are
2130a0370b3fSYuchung Cheng  * dynamically measured and adjusted. This is implemented in
2131a0370b3fSYuchung Cheng  * tcp_rack_mark_lost.
2132a0370b3fSYuchung Cheng  *
2133a0370b3fSYuchung Cheng  * If the receiver does not support SACK:
2134a0370b3fSYuchung Cheng  *
2135a0370b3fSYuchung Cheng  * NewReno (RFC6582): in Recovery we assume that one segment
21361da177e4SLinus Torvalds  * is lost (classic Reno). While we are in Recovery and
21371da177e4SLinus Torvalds  * a partial ACK arrives, we assume that one more packet
21381da177e4SLinus Torvalds  * is lost (NewReno). These heuristics are the same in NewReno
21391da177e4SLinus Torvalds  * and SACK.
21401da177e4SLinus Torvalds  *
21411da177e4SLinus Torvalds  * Really tricky (and requiring careful tuning) part of algorithm
21421da177e4SLinus Torvalds  * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
21431da177e4SLinus Torvalds  * The first determines the moment _when_ we should reduce CWND and,
21441da177e4SLinus Torvalds  * hence, slow down forward transmission. In fact, it determines the moment
21451da177e4SLinus Torvalds  * when we decide that hole is caused by loss, rather than by a reorder.
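 *
 * (A worked example of the classic dupACK-threshold rule, with made-up
 * numbers: with tp->reordering == 3, a hole is only treated as lost once
 * three or more segments above it have been SACKed/dupacked; two SACKed
 * segments above it are still attributed to reordering and do not start
 * Recovery.)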
21461da177e4SLinus Torvalds  *
21471da177e4SLinus Torvalds  * tcp_xmit_retransmit_queue() decides _what_ we should retransmit to fill
21481da177e4SLinus Torvalds  * holes, caused by lost packets.
21491da177e4SLinus Torvalds  *
21501da177e4SLinus Torvalds  * And the most logically complicated part of algorithm is undo
21511da177e4SLinus Torvalds  * heuristics. We detect false retransmits due to both too early
21521da177e4SLinus Torvalds  * fast retransmit (reordering) and underestimated RTO, analyzing
21531da177e4SLinus Torvalds  * timestamps and D-SACKs. When we detect that some segments were
21541da177e4SLinus Torvalds  * retransmitted by mistake and CWND reduction was wrong, we undo
21551da177e4SLinus Torvalds  * window reduction and abort recovery phase. This logic is hidden
21561da177e4SLinus Torvalds  * inside several functions named tcp_try_undo_<something>.
21571da177e4SLinus Torvalds  */
21581da177e4SLinus Torvalds 
21591da177e4SLinus Torvalds /* This function decides when we should leave Disordered state
21601da177e4SLinus Torvalds  * and enter Recovery phase, reducing congestion window.
21611da177e4SLinus Torvalds  *
21621da177e4SLinus Torvalds  * Main question: may we further continue forward transmission
21631da177e4SLinus Torvalds  * with the same cwnd?
21641da177e4SLinus Torvalds  */
2165a2a385d6SEric Dumazet static bool tcp_time_to_recover(struct sock *sk, int flag)
21661da177e4SLinus Torvalds {
21679e412ba7SIlpo Järvinen 	struct tcp_sock *tp = tcp_sk(sk);
21681da177e4SLinus Torvalds 
21691da177e4SLinus Torvalds 	/* Trick#1: The loss is proven. */
21701da177e4SLinus Torvalds 	if (tp->lost_out)
2171a2a385d6SEric Dumazet 		return true;
21721da177e4SLinus Torvalds 
21731da177e4SLinus Torvalds 	/* Not-A-Trick#2 : Classic rule... */
2174b38a51feSYuchung Cheng 	if (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering)
2175a2a385d6SEric Dumazet 		return true;
21761da177e4SLinus Torvalds 
2177a2a385d6SEric Dumazet 	return false;
21781da177e4SLinus Torvalds }
21791da177e4SLinus Torvalds 
2180974c1236SYuchung Cheng /* Detect loss in event "A" above by marking head of queue up as lost.
2181713bafeaSYuchung Cheng  * For non-SACK(Reno) senders, the first "packets" number of segments
2182974c1236SYuchung Cheng  * are considered lost. For RFC3517 SACK, a segment is considered lost if it
2183974c1236SYuchung Cheng  * has at least tp->reordering SACKed segments above it; "packets" refers to
2184974c1236SYuchung Cheng  * the maximum SACKed segments to pass before reaching this limit.
218585cc391cSIlpo Järvinen  */
21861fdb9361SIlpo Järvinen static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
21871da177e4SLinus Torvalds {
21889e412ba7SIlpo Järvinen 	struct tcp_sock *tp = tcp_sk(sk);
21891da177e4SLinus Torvalds 	struct sk_buff *skb;
2190d88270eeSNeal Cardwell 	int cnt, oldcnt, lost;
2191c137f3ddSIlpo Järvinen 	unsigned int mss;
2192974c1236SYuchung Cheng 	/* Use SACK to deduce losses of new sequences sent during recovery */
2193974c1236SYuchung Cheng 	const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;
21941da177e4SLinus Torvalds 
2195547b792cSIlpo Järvinen 	WARN_ON(packets > tp->packets_out);
21966a438bbeSStephen Hemminger 	skb = tp->lost_skb_hint;
21975e76ee4bSEric Dumazet 	if (skb) {
21981fdb9361SIlpo Järvinen 		/* Head already handled?
*/ 21995e76ee4bSEric Dumazet if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una)) 22001fdb9361SIlpo Järvinen return; 22015e76ee4bSEric Dumazet cnt = tp->lost_cnt_hint; 22026a438bbeSStephen Hemminger } else { 220375c119afSEric Dumazet skb = tcp_rtx_queue_head(sk); 22046a438bbeSStephen Hemminger cnt = 0; 22056a438bbeSStephen Hemminger } 22061da177e4SLinus Torvalds 220775c119afSEric Dumazet skb_rbtree_walk_from(skb) { 22086a438bbeSStephen Hemminger /* TODO: do this better */ 22096a438bbeSStephen Hemminger /* this is not the most efficient way to do this... */ 22106a438bbeSStephen Hemminger tp->lost_skb_hint = skb; 22116a438bbeSStephen Hemminger tp->lost_cnt_hint = cnt; 221285cc391cSIlpo Järvinen 2213974c1236SYuchung Cheng if (after(TCP_SKB_CB(skb)->end_seq, loss_high)) 2214c137f3ddSIlpo Järvinen break; 2215c137f3ddSIlpo Järvinen 2216c137f3ddSIlpo Järvinen oldcnt = cnt; 2217713bafeaSYuchung Cheng if (tcp_is_reno(tp) || 221885cc391cSIlpo Järvinen (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) 22196a438bbeSStephen Hemminger cnt += tcp_skb_pcount(skb); 222085cc391cSIlpo Järvinen 2221c137f3ddSIlpo Järvinen if (cnt > packets) { 2222713bafeaSYuchung Cheng if (tcp_is_sack(tp) || 2223c0638c24SNeal Cardwell (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || 2224b3de7559SYuchung Cheng (oldcnt >= packets)) 22251da177e4SLinus Torvalds break; 2226c137f3ddSIlpo Järvinen 2227f69ad292SEric Dumazet mss = tcp_skb_mss(skb); 2228d88270eeSNeal Cardwell /* If needed, chop off the prefix to mark as lost. */ 2229d88270eeSNeal Cardwell lost = (packets - oldcnt) * mss; 2230d88270eeSNeal Cardwell if (lost < skb->len && 223175c119afSEric Dumazet tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, 223275c119afSEric Dumazet lost, mss, GFP_ATOMIC) < 0) 2233c137f3ddSIlpo Järvinen break; 2234c137f3ddSIlpo Järvinen cnt = packets; 2235c137f3ddSIlpo Järvinen } 2236c137f3ddSIlpo Järvinen 223741ea36e3SIlpo Järvinen tcp_skb_mark_lost(tp, skb); 22381fdb9361SIlpo Järvinen 22391fdb9361SIlpo Järvinen if (mark_head) 22401fdb9361SIlpo Järvinen break; 22411da177e4SLinus Torvalds } 2242005903bcSIlpo Järvinen tcp_verify_left_out(tp); 22431da177e4SLinus Torvalds } 22441da177e4SLinus Torvalds 22451da177e4SLinus Torvalds /* Account newly detected lost packet(s) */ 22461da177e4SLinus Torvalds 224785cc391cSIlpo Järvinen static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) 22481da177e4SLinus Torvalds { 22499e412ba7SIlpo Järvinen struct tcp_sock *tp = tcp_sk(sk); 22509e412ba7SIlpo Järvinen 22516ac06ecdSYuchung Cheng if (tcp_is_sack(tp)) { 225285cc391cSIlpo Järvinen int sacked_upto = tp->sacked_out - tp->reordering; 22531fdb9361SIlpo Järvinen if (sacked_upto >= 0) 22541fdb9361SIlpo Järvinen tcp_mark_head_lost(sk, sacked_upto, 0); 22551fdb9361SIlpo Järvinen else if (fast_rexmit) 22561fdb9361SIlpo Järvinen tcp_mark_head_lost(sk, 1, 1); 22571da177e4SLinus Torvalds } 22581da177e4SLinus Torvalds } 22591da177e4SLinus Torvalds 226077c63127SYuchung Cheng static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when) 226177c63127SYuchung Cheng { 226277c63127SYuchung Cheng return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 226377c63127SYuchung Cheng before(tp->rx_opt.rcv_tsecr, when); 226477c63127SYuchung Cheng } 226577c63127SYuchung Cheng 2266659a8ad5SYuchung Cheng /* skb is spurious retransmitted if the returned timestamp echo 2267659a8ad5SYuchung Cheng * reply is prior to the skb transmission time 2268659a8ad5SYuchung Cheng */ 2269659a8ad5SYuchung Cheng static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp, 
2270659a8ad5SYuchung Cheng 				     const struct sk_buff *skb)
2271659a8ad5SYuchung Cheng {
2272659a8ad5SYuchung Cheng 	return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
2273659a8ad5SYuchung Cheng 	       tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb));
2274659a8ad5SYuchung Cheng }
2275659a8ad5SYuchung Cheng 
22761da177e4SLinus Torvalds /* Nothing was retransmitted or returned timestamp is less
22771da177e4SLinus Torvalds  * than timestamp of the first retransmission.
22781da177e4SLinus Torvalds  */
227967b95bd7SVijay Subramanian static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
22801da177e4SLinus Torvalds {
2281bc9f38c8SYuchung Cheng 	return tp->retrans_stamp &&
228277c63127SYuchung Cheng 	       tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
22831da177e4SLinus Torvalds }
22841da177e4SLinus Torvalds 
22851da177e4SLinus Torvalds /* Undo procedures. */
22861da177e4SLinus Torvalds 
22871f37bf87SMarcelo Leitner /* We can clear retrans_stamp when there are no retransmissions in the
22881f37bf87SMarcelo Leitner  * window. It would seem that it is trivially available for us in
22891f37bf87SMarcelo Leitner  * tp->retrans_out, however, that kind of assumption doesn't consider
22901f37bf87SMarcelo Leitner  * what will happen if errors occur when sending retransmission for the
22911f37bf87SMarcelo Leitner  * second time. ...It could be that such a segment has only
22921f37bf87SMarcelo Leitner  * TCPCB_EVER_RETRANS set at the present time. It seems that checking
22931f37bf87SMarcelo Leitner  * the head skb is enough except for some reneging corner cases that
22941f37bf87SMarcelo Leitner  * are not worth the effort.
22951f37bf87SMarcelo Leitner  *
22961f37bf87SMarcelo Leitner  * Main reason for all this complexity is the fact that connection dying
22971f37bf87SMarcelo Leitner  * time now depends on the validity of the retrans_stamp, in particular,
22981f37bf87SMarcelo Leitner  * that successive retransmissions of a segment must not advance
22991f37bf87SMarcelo Leitner  * retrans_stamp under any conditions.
23001f37bf87SMarcelo Leitner */ 23011f37bf87SMarcelo Leitner static bool tcp_any_retrans_done(const struct sock *sk) 23021f37bf87SMarcelo Leitner { 23031f37bf87SMarcelo Leitner const struct tcp_sock *tp = tcp_sk(sk); 23041f37bf87SMarcelo Leitner struct sk_buff *skb; 23051f37bf87SMarcelo Leitner 23061f37bf87SMarcelo Leitner if (tp->retrans_out) 23071f37bf87SMarcelo Leitner return true; 23081f37bf87SMarcelo Leitner 230975c119afSEric Dumazet skb = tcp_rtx_queue_head(sk); 23101f37bf87SMarcelo Leitner if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) 23111f37bf87SMarcelo Leitner return true; 23121f37bf87SMarcelo Leitner 23131f37bf87SMarcelo Leitner return false; 23141f37bf87SMarcelo Leitner } 23151f37bf87SMarcelo Leitner 23169e412ba7SIlpo Järvinen static void DBGUNDO(struct sock *sk, const char *msg) 23171da177e4SLinus Torvalds { 23183934788aSJoe Perches #if FASTRETRANS_DEBUG > 1 23199e412ba7SIlpo Järvinen struct tcp_sock *tp = tcp_sk(sk); 23201da177e4SLinus Torvalds struct inet_sock *inet = inet_sk(sk); 23219e412ba7SIlpo Järvinen 2322569508c9SYOSHIFUJI Hideaki if (sk->sk_family == AF_INET) { 232391df42beSJoe Perches pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n", 23241da177e4SLinus Torvalds msg, 2325288fcee8SJoe Perches &inet->inet_daddr, ntohs(inet->inet_dport), 232683ae4088SIlpo Järvinen tp->snd_cwnd, tcp_left_out(tp), 23271da177e4SLinus Torvalds tp->snd_ssthresh, tp->prior_ssthresh, 23281da177e4SLinus Torvalds tp->packets_out); 23291da177e4SLinus Torvalds } 2330dfd56b8bSEric Dumazet #if IS_ENABLED(CONFIG_IPV6) 2331569508c9SYOSHIFUJI Hideaki else if (sk->sk_family == AF_INET6) { 233291df42beSJoe Perches pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n", 2333569508c9SYOSHIFUJI Hideaki msg, 2334019b1c9fSEric Dumazet &sk->sk_v6_daddr, ntohs(inet->inet_dport), 2335569508c9SYOSHIFUJI Hideaki tp->snd_cwnd, tcp_left_out(tp), 2336569508c9SYOSHIFUJI Hideaki tp->snd_ssthresh, tp->prior_ssthresh, 2337569508c9SYOSHIFUJI Hideaki tp->packets_out); 2338569508c9SYOSHIFUJI Hideaki } 2339569508c9SYOSHIFUJI Hideaki #endif 23401da177e4SLinus Torvalds #endif 23413934788aSJoe Perches } 23421da177e4SLinus Torvalds 23437026b912SYuchung Cheng static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) 23441da177e4SLinus Torvalds { 23456687e988SArnaldo Carvalho de Melo struct tcp_sock *tp = tcp_sk(sk); 23466687e988SArnaldo Carvalho de Melo 23476a63df46SYuchung Cheng if (unmark_loss) { 23486a63df46SYuchung Cheng struct sk_buff *skb; 23496a63df46SYuchung Cheng 235075c119afSEric Dumazet skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { 23516a63df46SYuchung Cheng TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; 23526a63df46SYuchung Cheng } 23536a63df46SYuchung Cheng tp->lost_out = 0; 23546a63df46SYuchung Cheng tcp_clear_all_retrans_hints(tp); 23556a63df46SYuchung Cheng } 23566a63df46SYuchung Cheng 23571da177e4SLinus Torvalds if (tp->prior_ssthresh) { 23586687e988SArnaldo Carvalho de Melo const struct inet_connection_sock *icsk = inet_csk(sk); 23596687e988SArnaldo Carvalho de Melo 23606687e988SArnaldo Carvalho de Melo tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk); 23611da177e4SLinus Torvalds 23627026b912SYuchung Cheng if (tp->prior_ssthresh > tp->snd_ssthresh) { 23631da177e4SLinus Torvalds tp->snd_ssthresh = tp->prior_ssthresh; 2364735d3831SFlorian Westphal tcp_ecn_withdraw_cwr(tp); 23651da177e4SLinus Torvalds } 23661da177e4SLinus Torvalds } 2367c2203cf7SEric Dumazet tp->snd_cwnd_stamp = tcp_jiffies32; 23686a63df46SYuchung Cheng tp->undo_marker = 0; 2369cd1fc85bSYuchung Cheng tp->rack.advanced = 1; /* Force 
RACK to re-exam losses */ 23701da177e4SLinus Torvalds } 23711da177e4SLinus Torvalds 237267b95bd7SVijay Subramanian static inline bool tcp_may_undo(const struct tcp_sock *tp) 23731da177e4SLinus Torvalds { 2374056834d9SIlpo Järvinen return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); 23751da177e4SLinus Torvalds } 23761da177e4SLinus Torvalds 23771da177e4SLinus Torvalds /* People celebrate: "We love our President!" */ 2378a2a385d6SEric Dumazet static bool tcp_try_undo_recovery(struct sock *sk) 23791da177e4SLinus Torvalds { 23809e412ba7SIlpo Järvinen struct tcp_sock *tp = tcp_sk(sk); 23819e412ba7SIlpo Järvinen 23821da177e4SLinus Torvalds if (tcp_may_undo(tp)) { 238340b215e5SPavel Emelyanov int mib_idx; 238440b215e5SPavel Emelyanov 23851da177e4SLinus Torvalds /* Happy end! We did not retransmit anything 23861da177e4SLinus Torvalds * or our original transmission succeeded. 23871da177e4SLinus Torvalds */ 23889e412ba7SIlpo Järvinen DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); 23897026b912SYuchung Cheng tcp_undo_cwnd_reduction(sk, false); 23906687e988SArnaldo Carvalho de Melo if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) 239140b215e5SPavel Emelyanov mib_idx = LINUX_MIB_TCPLOSSUNDO; 23921da177e4SLinus Torvalds else 239340b215e5SPavel Emelyanov mib_idx = LINUX_MIB_TCPFULLUNDO; 239440b215e5SPavel Emelyanov 2395c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), mib_idx); 23961f255691SPriyaranjan Jha } else if (tp->rack.reo_wnd_persist) { 23971f255691SPriyaranjan Jha tp->rack.reo_wnd_persist--; 23981da177e4SLinus Torvalds } 2399e60402d0SIlpo Järvinen if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { 24001da177e4SLinus Torvalds /* Hold old state until something *above* high_seq 24011da177e4SLinus Torvalds * is ACKed. For Reno it is MUST to prevent false 24021da177e4SLinus Torvalds * fast retransmits (RFC2582). SACK TCP is safe. */ 24031f37bf87SMarcelo Leitner if (!tcp_any_retrans_done(sk)) 24041f37bf87SMarcelo Leitner tp->retrans_stamp = 0; 2405a2a385d6SEric Dumazet return true; 24061da177e4SLinus Torvalds } 24076687e988SArnaldo Carvalho de Melo tcp_set_ca_state(sk, TCP_CA_Open); 2408d4761754SYousuk Seung tp->is_sack_reneg = 0; 2409a2a385d6SEric Dumazet return false; 24101da177e4SLinus Torvalds } 24111da177e4SLinus Torvalds 24121da177e4SLinus Torvalds /* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */ 2413c7d9d6a1SYuchung Cheng static bool tcp_try_undo_dsack(struct sock *sk) 24141da177e4SLinus Torvalds { 24159e412ba7SIlpo Järvinen struct tcp_sock *tp = tcp_sk(sk); 24169e412ba7SIlpo Järvinen 24171da177e4SLinus Torvalds if (tp->undo_marker && !tp->undo_retrans) { 24181f255691SPriyaranjan Jha tp->rack.reo_wnd_persist = min(TCP_RACK_RECOVERY_THRESH, 24191f255691SPriyaranjan Jha tp->rack.reo_wnd_persist + 1); 24209e412ba7SIlpo Järvinen DBGUNDO(sk, "D-SACK"); 24217026b912SYuchung Cheng tcp_undo_cwnd_reduction(sk, false); 2422c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); 2423c7d9d6a1SYuchung Cheng return true; 24241da177e4SLinus Torvalds } 2425c7d9d6a1SYuchung Cheng return false; 24261da177e4SLinus Torvalds } 24271da177e4SLinus Torvalds 2428e33099f9SYuchung Cheng /* Undo during loss recovery after partial ACK or using F-RTO. 
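 * tcp_process_loss() passes frto_undo == true when data that was never
 * retransmitted gets (s)acked, which per RFC5682 proves the timeout
 * itself was spurious.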
*/ 2429e33099f9SYuchung Cheng static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) 24301da177e4SLinus Torvalds { 24319e412ba7SIlpo Järvinen struct tcp_sock *tp = tcp_sk(sk); 24329e412ba7SIlpo Järvinen 2433e33099f9SYuchung Cheng if (frto_undo || tcp_may_undo(tp)) { 24347026b912SYuchung Cheng tcp_undo_cwnd_reduction(sk, true); 24356a438bbeSStephen Hemminger 24369e412ba7SIlpo Järvinen DBGUNDO(sk, "partial loss"); 2437c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); 2438e33099f9SYuchung Cheng if (frto_undo) 2439c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), 2440e33099f9SYuchung Cheng LINUX_MIB_TCPSPURIOUSRTOS); 2441463c84b9SArnaldo Carvalho de Melo inet_csk(sk)->icsk_retransmits = 0; 2442d4761754SYousuk Seung if (frto_undo || tcp_is_sack(tp)) { 24436687e988SArnaldo Carvalho de Melo tcp_set_ca_state(sk, TCP_CA_Open); 2444d4761754SYousuk Seung tp->is_sack_reneg = 0; 2445d4761754SYousuk Seung } 2446a2a385d6SEric Dumazet return true; 24471da177e4SLinus Torvalds } 2448a2a385d6SEric Dumazet return false; 24491da177e4SLinus Torvalds } 24501da177e4SLinus Torvalds 24513759824dSYuchung Cheng /* The cwnd reduction in CWR and Recovery uses the PRR algorithm in RFC 6937. 2452fb4d3d1dSYuchung Cheng * It computes the number of packets to send (sndcnt) based on packets newly 2453fb4d3d1dSYuchung Cheng * delivered: 2454fb4d3d1dSYuchung Cheng * 1) If the packets in flight is larger than ssthresh, PRR spreads the 2455fb4d3d1dSYuchung Cheng * cwnd reductions across a full RTT. 24563759824dSYuchung Cheng * 2) Otherwise PRR uses packet conservation to send as much as delivered. 24573759824dSYuchung Cheng * But when the retransmits are acked without further losses, PRR 24583759824dSYuchung Cheng * slow starts cwnd up to ssthresh to speed up the recovery. 
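 *
 * (Worked example with purely illustrative numbers: prior_cwnd = 10 and
 * ssthresh = 7.  While in flight exceeds ssthresh, roughly
 * ssthresh/prior_cwnd of the delivered packets may be sent: after 4
 * packets are delivered in recovery, sndcnt = ceil(7 * 4 / 10) - prr_out,
 * so cwnd is walked down to ssthresh over about one RTT instead of being
 * cut all at once.  The exact arithmetic is in tcp_cwnd_reduction() below.)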
2459fb4d3d1dSYuchung Cheng */ 24605ee2c941SChristoph Paasch static void tcp_init_cwnd_reduction(struct sock *sk) 2461684bad11SYuchung Cheng { 2462684bad11SYuchung Cheng struct tcp_sock *tp = tcp_sk(sk); 2463684bad11SYuchung Cheng 2464684bad11SYuchung Cheng tp->high_seq = tp->snd_nxt; 24659b717a8dSNandita Dukkipati tp->tlp_high_seq = 0; 2466684bad11SYuchung Cheng tp->snd_cwnd_cnt = 0; 2467684bad11SYuchung Cheng tp->prior_cwnd = tp->snd_cwnd; 2468684bad11SYuchung Cheng tp->prr_delivered = 0; 2469684bad11SYuchung Cheng tp->prr_out = 0; 2470684bad11SYuchung Cheng tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); 2471735d3831SFlorian Westphal tcp_ecn_queue_cwr(tp); 2472684bad11SYuchung Cheng } 2473684bad11SYuchung Cheng 247457dde7f7SYuchung Cheng void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag) 2475fb4d3d1dSYuchung Cheng { 2476fb4d3d1dSYuchung Cheng struct tcp_sock *tp = tcp_sk(sk); 2477fb4d3d1dSYuchung Cheng int sndcnt = 0; 2478fb4d3d1dSYuchung Cheng int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp); 2479fb4d3d1dSYuchung Cheng 24808b8a321fSYuchung Cheng if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd)) 24818b8a321fSYuchung Cheng return; 24828b8a321fSYuchung Cheng 2483684bad11SYuchung Cheng tp->prr_delivered += newly_acked_sacked; 24843759824dSYuchung Cheng if (delta < 0) { 2485fb4d3d1dSYuchung Cheng u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + 2486fb4d3d1dSYuchung Cheng tp->prior_cwnd - 1; 2487fb4d3d1dSYuchung Cheng sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out; 24885e13a0d3SYafang Shao } else if ((flag & (FLAG_RETRANS_DATA_ACKED | FLAG_LOST_RETRANS)) == 24895e13a0d3SYafang Shao FLAG_RETRANS_DATA_ACKED) { 2490fb4d3d1dSYuchung Cheng sndcnt = min_t(int, delta, 2491fb4d3d1dSYuchung Cheng max_t(int, tp->prr_delivered - tp->prr_out, 2492fb4d3d1dSYuchung Cheng newly_acked_sacked) + 1); 24933759824dSYuchung Cheng } else { 24943759824dSYuchung Cheng sndcnt = min(delta, newly_acked_sacked); 2495fb4d3d1dSYuchung Cheng } 249631ba0c10SYuchung Cheng /* Force a fast retransmit upon entering fast recovery */ 249731ba0c10SYuchung Cheng sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1)); 2498fb4d3d1dSYuchung Cheng tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt; 2499fb4d3d1dSYuchung Cheng } 2500fb4d3d1dSYuchung Cheng 2501684bad11SYuchung Cheng static inline void tcp_end_cwnd_reduction(struct sock *sk) 25021da177e4SLinus Torvalds { 25036687e988SArnaldo Carvalho de Melo struct tcp_sock *tp = tcp_sk(sk); 2504a262f0cdSNandita Dukkipati 2505c0402760SYuchung Cheng if (inet_csk(sk)->icsk_ca_ops->cong_control) 2506c0402760SYuchung Cheng return; 2507c0402760SYuchung Cheng 2508684bad11SYuchung Cheng /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ 2509ed254971SYuchung Cheng if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH && 2510ed254971SYuchung Cheng (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || tp->undo_marker)) { 251167d4120aSYuchung Cheng tp->snd_cwnd = tp->snd_ssthresh; 2512c2203cf7SEric Dumazet tp->snd_cwnd_stamp = tcp_jiffies32; 251367d4120aSYuchung Cheng } 25146687e988SArnaldo Carvalho de Melo tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); 25151da177e4SLinus Torvalds } 25161da177e4SLinus Torvalds 2517684bad11SYuchung Cheng /* Enter CWR state. 
Disable cwnd undo since congestion is proven with ECN */ 25185ee2c941SChristoph Paasch void tcp_enter_cwr(struct sock *sk) 251909484d1fSYuchung Cheng { 252009484d1fSYuchung Cheng struct tcp_sock *tp = tcp_sk(sk); 252109484d1fSYuchung Cheng 252209484d1fSYuchung Cheng tp->prior_ssthresh = 0; 2523684bad11SYuchung Cheng if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { 252409484d1fSYuchung Cheng tp->undo_marker = 0; 25255ee2c941SChristoph Paasch tcp_init_cwnd_reduction(sk); 252609484d1fSYuchung Cheng tcp_set_ca_state(sk, TCP_CA_CWR); 252709484d1fSYuchung Cheng } 252809484d1fSYuchung Cheng } 25297782ad8bSKenneth Klette Jonassen EXPORT_SYMBOL(tcp_enter_cwr); 253009484d1fSYuchung Cheng 25318aca6cb1SIlpo Järvinen static void tcp_try_keep_open(struct sock *sk) 25328aca6cb1SIlpo Järvinen { 25338aca6cb1SIlpo Järvinen struct tcp_sock *tp = tcp_sk(sk); 25348aca6cb1SIlpo Järvinen int state = TCP_CA_Open; 25358aca6cb1SIlpo Järvinen 2536f698204bSNeal Cardwell if (tcp_left_out(tp) || tcp_any_retrans_done(sk)) 25378aca6cb1SIlpo Järvinen state = TCP_CA_Disorder; 25388aca6cb1SIlpo Järvinen 25398aca6cb1SIlpo Järvinen if (inet_csk(sk)->icsk_ca_state != state) { 25408aca6cb1SIlpo Järvinen tcp_set_ca_state(sk, state); 25418aca6cb1SIlpo Järvinen tp->high_seq = tp->snd_nxt; 25428aca6cb1SIlpo Järvinen } 25438aca6cb1SIlpo Järvinen } 25448aca6cb1SIlpo Järvinen 254531ba0c10SYuchung Cheng static void tcp_try_to_open(struct sock *sk, int flag) 25461da177e4SLinus Torvalds { 25479e412ba7SIlpo Järvinen struct tcp_sock *tp = tcp_sk(sk); 25489e412ba7SIlpo Järvinen 254986426c22SIlpo Järvinen tcp_verify_left_out(tp); 255086426c22SIlpo Järvinen 25519b44190dSYuchung Cheng if (!tcp_any_retrans_done(sk)) 25521da177e4SLinus Torvalds tp->retrans_stamp = 0; 25531da177e4SLinus Torvalds 25541da177e4SLinus Torvalds if (flag & FLAG_ECE) 25555ee2c941SChristoph Paasch tcp_enter_cwr(sk); 25561da177e4SLinus Torvalds 25576687e988SArnaldo Carvalho de Melo if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { 25588aca6cb1SIlpo Järvinen tcp_try_keep_open(sk); 25591da177e4SLinus Torvalds } 25601da177e4SLinus Torvalds } 25611da177e4SLinus Torvalds 25625d424d5aSJohn Heffner static void tcp_mtup_probe_failed(struct sock *sk) 25635d424d5aSJohn Heffner { 25645d424d5aSJohn Heffner struct inet_connection_sock *icsk = inet_csk(sk); 25655d424d5aSJohn Heffner 25665d424d5aSJohn Heffner icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1; 25675d424d5aSJohn Heffner icsk->icsk_mtup.probe_size = 0; 2568c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPFAIL); 25695d424d5aSJohn Heffner } 25705d424d5aSJohn Heffner 257172211e90SIlpo Järvinen static void tcp_mtup_probe_success(struct sock *sk) 25725d424d5aSJohn Heffner { 25735d424d5aSJohn Heffner struct tcp_sock *tp = tcp_sk(sk); 25745d424d5aSJohn Heffner struct inet_connection_sock *icsk = inet_csk(sk); 25755d424d5aSJohn Heffner 25765d424d5aSJohn Heffner /* FIXME: breaks with very large cwnd */ 25775d424d5aSJohn Heffner tp->prior_ssthresh = tcp_current_ssthresh(sk); 25785d424d5aSJohn Heffner tp->snd_cwnd = tp->snd_cwnd * 25795d424d5aSJohn Heffner tcp_mss_to_mtu(sk, tp->mss_cache) / 25805d424d5aSJohn Heffner icsk->icsk_mtup.probe_size; 25815d424d5aSJohn Heffner tp->snd_cwnd_cnt = 0; 2582c2203cf7SEric Dumazet tp->snd_cwnd_stamp = tcp_jiffies32; 25839c6d5e55SJohn Heffner tp->snd_ssthresh = tcp_current_ssthresh(sk); 25845d424d5aSJohn Heffner 25855d424d5aSJohn Heffner icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; 25865d424d5aSJohn Heffner icsk->icsk_mtup.probe_size = 0; 25875d424d5aSJohn 
Heffner tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); 2588c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS); 25895d424d5aSJohn Heffner } 25905d424d5aSJohn Heffner 2591e1aa680fSIlpo Järvinen /* Do a simple retransmit without using the backoff mechanisms in 2592e1aa680fSIlpo Järvinen * tcp_timer. This is used for path mtu discovery. 2593e1aa680fSIlpo Järvinen * The socket is already locked here. 2594e1aa680fSIlpo Järvinen */ 2595e1aa680fSIlpo Järvinen void tcp_simple_retransmit(struct sock *sk) 2596e1aa680fSIlpo Järvinen { 2597e1aa680fSIlpo Järvinen const struct inet_connection_sock *icsk = inet_csk(sk); 2598e1aa680fSIlpo Järvinen struct tcp_sock *tp = tcp_sk(sk); 2599e1aa680fSIlpo Järvinen struct sk_buff *skb; 26000c54b85fSIlpo Järvinen unsigned int mss = tcp_current_mss(sk); 2601e1aa680fSIlpo Järvinen 260275c119afSEric Dumazet skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { 2603775ffabfSIlpo Järvinen if (tcp_skb_seglen(skb) > mss && 2604e1aa680fSIlpo Järvinen !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { 2605e1aa680fSIlpo Järvinen if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { 2606e1aa680fSIlpo Järvinen TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; 2607e1aa680fSIlpo Järvinen tp->retrans_out -= tcp_skb_pcount(skb); 2608e1aa680fSIlpo Järvinen } 2609e1aa680fSIlpo Järvinen tcp_skb_mark_lost_uncond_verify(tp, skb); 2610e1aa680fSIlpo Järvinen } 2611e1aa680fSIlpo Järvinen } 2612e1aa680fSIlpo Järvinen 2613e1aa680fSIlpo Järvinen tcp_clear_retrans_hints_partial(tp); 2614e1aa680fSIlpo Järvinen 26150eb96bf7SYuchung Cheng if (!tp->lost_out) 2616e1aa680fSIlpo Järvinen return; 2617e1aa680fSIlpo Järvinen 2618e1aa680fSIlpo Järvinen if (tcp_is_reno(tp)) 2619e1aa680fSIlpo Järvinen tcp_limit_reno_sacked(tp); 2620e1aa680fSIlpo Järvinen 2621e1aa680fSIlpo Järvinen tcp_verify_left_out(tp); 2622e1aa680fSIlpo Järvinen 2623e1aa680fSIlpo Järvinen /* Don't muck with the congestion window here. 2624e1aa680fSIlpo Järvinen * Reason is that we do not increase amount of _data_ 2625e1aa680fSIlpo Järvinen * in network, but units changed and effective 2626e1aa680fSIlpo Järvinen * cwnd/ssthresh really reduced now. 
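 *
 * (E.g., with an unchanged cwnd of 10 packets, an MSS drop from 1460 to
 * 1140 bytes shrinks the effective window from ~14.6 kB to ~11.4 kB, so
 * the sender is already throttled without touching cwnd itself; the
 * numbers are purely illustrative.)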
2627e1aa680fSIlpo Järvinen */ 2628e1aa680fSIlpo Järvinen if (icsk->icsk_ca_state != TCP_CA_Loss) { 2629e1aa680fSIlpo Järvinen tp->high_seq = tp->snd_nxt; 2630e1aa680fSIlpo Järvinen tp->snd_ssthresh = tcp_current_ssthresh(sk); 2631e1aa680fSIlpo Järvinen tp->prior_ssthresh = 0; 2632e1aa680fSIlpo Järvinen tp->undo_marker = 0; 2633e1aa680fSIlpo Järvinen tcp_set_ca_state(sk, TCP_CA_Loss); 2634e1aa680fSIlpo Järvinen } 2635e1aa680fSIlpo Järvinen tcp_xmit_retransmit_queue(sk); 2636e1aa680fSIlpo Järvinen } 26374bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_simple_retransmit); 2638e1aa680fSIlpo Järvinen 263957dde7f7SYuchung Cheng void tcp_enter_recovery(struct sock *sk, bool ece_ack) 26401fbc3405SYuchung Cheng { 26411fbc3405SYuchung Cheng struct tcp_sock *tp = tcp_sk(sk); 26421fbc3405SYuchung Cheng int mib_idx; 26431fbc3405SYuchung Cheng 26441fbc3405SYuchung Cheng if (tcp_is_reno(tp)) 26451fbc3405SYuchung Cheng mib_idx = LINUX_MIB_TCPRENORECOVERY; 26461fbc3405SYuchung Cheng else 26471fbc3405SYuchung Cheng mib_idx = LINUX_MIB_TCPSACKRECOVERY; 26481fbc3405SYuchung Cheng 2649c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), mib_idx); 26501fbc3405SYuchung Cheng 26511fbc3405SYuchung Cheng tp->prior_ssthresh = 0; 2652989e04c5SYuchung Cheng tcp_init_undo(tp); 26531fbc3405SYuchung Cheng 2654291a00d1SYuchung Cheng if (!tcp_in_cwnd_reduction(sk)) { 26551fbc3405SYuchung Cheng if (!ece_ack) 26561fbc3405SYuchung Cheng tp->prior_ssthresh = tcp_current_ssthresh(sk); 26575ee2c941SChristoph Paasch tcp_init_cwnd_reduction(sk); 26581fbc3405SYuchung Cheng } 26591fbc3405SYuchung Cheng tcp_set_ca_state(sk, TCP_CA_Recovery); 26601fbc3405SYuchung Cheng } 26611fbc3405SYuchung Cheng 2662ab42d9eeSYuchung Cheng /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are 2663ab42d9eeSYuchung Cheng * recovered or spurious. Otherwise retransmits more on partial ACKs. 2664ab42d9eeSYuchung Cheng */ 266519119f29SEric Dumazet static void tcp_process_loss(struct sock *sk, int flag, int num_dupack, 2666e662ca40SYuchung Cheng int *rexmit) 2667ab42d9eeSYuchung Cheng { 2668ab42d9eeSYuchung Cheng struct tcp_sock *tp = tcp_sk(sk); 2669e33099f9SYuchung Cheng bool recovered = !before(tp->snd_una, tp->high_seq); 2670ab42d9eeSYuchung Cheng 2671d983ea6fSEric Dumazet if ((flag & FLAG_SND_UNA_ADVANCED || rcu_access_pointer(tp->fastopen_rsk)) && 2672da34ac76SYuchung Cheng tcp_try_undo_loss(sk, false)) 2673da34ac76SYuchung Cheng return; 2674da34ac76SYuchung Cheng 2675fc68e171SYuchung Cheng if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ 2676fc68e171SYuchung Cheng /* Step 3.b. A timeout is spurious if not all data are 2677fc68e171SYuchung Cheng * lost, i.e., never-retransmitted data are (s)acked. 2678e33099f9SYuchung Cheng */ 2679da34ac76SYuchung Cheng if ((flag & FLAG_ORIG_SACK_ACKED) && 2680fc68e171SYuchung Cheng tcp_try_undo_loss(sk, true)) 2681e33099f9SYuchung Cheng return; 26820cfa5c07SYuchung Cheng 2683b7b0ed91SYuchung Cheng if (after(tp->snd_nxt, tp->high_seq)) { 268419119f29SEric Dumazet if (flag & FLAG_DATA_SACKED || num_dupack) 2685b7b0ed91SYuchung Cheng tp->frto = 0; /* Step 3.a. loss was real */ 2686e33099f9SYuchung Cheng } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) { 2687e33099f9SYuchung Cheng tp->high_seq = tp->snd_nxt; 2688e662ca40SYuchung Cheng /* Step 2.b. Try send new data (but deferred until cwnd 2689e662ca40SYuchung Cheng * is updated in tcp_ack()). Otherwise fall back to 2690e662ca40SYuchung Cheng * the conventional recovery. 
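 *
 * (Illustrative F-RTO timeline: the RTO fires and SND.UNA is
 * retransmitted; the first ACK advances SND.UNA, so one round of
 * never-sent data is transmitted (step 2.b); if a later ACK then (s)acks
 * data that was never retransmitted, the timeout is declared spurious
 * and undone (step 3.b), while further SACKs/dupacks beyond high_seq
 * confirm a real loss (step 3.a).)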
2691e662ca40SYuchung Cheng 			 */
269275c119afSEric Dumazet 			if (!tcp_write_queue_empty(sk) &&
2693e662ca40SYuchung Cheng 			    after(tcp_wnd_end(tp), tp->snd_nxt)) {
2694e662ca40SYuchung Cheng 				*rexmit = REXMIT_NEW;
2695e662ca40SYuchung Cheng 				return;
2696e662ca40SYuchung Cheng 			}
2697e33099f9SYuchung Cheng 			tp->frto = 0;
2698e33099f9SYuchung Cheng 		}
2699e33099f9SYuchung Cheng 	}
2700e33099f9SYuchung Cheng 
2701e33099f9SYuchung Cheng 	if (recovered) {
2702e33099f9SYuchung Cheng 		/* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
2703ab42d9eeSYuchung Cheng 		tcp_try_undo_recovery(sk);
2704ab42d9eeSYuchung Cheng 		return;
2705ab42d9eeSYuchung Cheng 	}
2706e33099f9SYuchung Cheng 	if (tcp_is_reno(tp)) {
2707e33099f9SYuchung Cheng 		/* A Reno DUPACK means new data in F-RTO step 2.b above are
2708e33099f9SYuchung Cheng 		 * delivered. Lower inflight to clock out (re)transmissions.
2709e33099f9SYuchung Cheng 		 */
271019119f29SEric Dumazet 		if (after(tp->snd_nxt, tp->high_seq) && num_dupack)
271119119f29SEric Dumazet 			tcp_add_reno_sack(sk, num_dupack);
2712e33099f9SYuchung Cheng 		else if (flag & FLAG_SND_UNA_ADVANCED)
2713ab42d9eeSYuchung Cheng 			tcp_reset_reno_sack(tp);
2714e33099f9SYuchung Cheng 	}
2715e662ca40SYuchung Cheng 	*rexmit = REXMIT_LOST;
2716ab42d9eeSYuchung Cheng }
2717ab42d9eeSYuchung Cheng 
27186a63df46SYuchung Cheng /* Undo during fast recovery after partial ACK. */
2719737ff314SYuchung Cheng static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
27206a63df46SYuchung Cheng {
27216a63df46SYuchung Cheng 	struct tcp_sock *tp = tcp_sk(sk);
27226a63df46SYuchung Cheng 
27237026b912SYuchung Cheng 	if (tp->undo_marker && tcp_packet_delayed(tp)) {
27246a63df46SYuchung Cheng 		/* Plain luck! Hole is filled with delayed
2725737ff314SYuchung Cheng 		 * packet, rather than with a retransmit. Check reordering.
27266a63df46SYuchung Cheng 		 */
2727737ff314SYuchung Cheng 		tcp_check_sack_reordering(sk, prior_snd_una, 1);
27287026b912SYuchung Cheng 
27297026b912SYuchung Cheng 		/* We are getting evidence that the reordering degree is higher
27307026b912SYuchung Cheng 		 * than we realized. If there are no retransmits out then we
27317026b912SYuchung Cheng 		 * can undo. Otherwise we clock out new packets but do not
27327026b912SYuchung Cheng 		 * mark more packets lost or retransmit more.
27337026b912SYuchung Cheng */ 273431ba0c10SYuchung Cheng if (tp->retrans_out) 27357026b912SYuchung Cheng return true; 27367026b912SYuchung Cheng 27376a63df46SYuchung Cheng if (!tcp_any_retrans_done(sk)) 27386a63df46SYuchung Cheng tp->retrans_stamp = 0; 27396a63df46SYuchung Cheng 27407026b912SYuchung Cheng DBGUNDO(sk, "partial recovery"); 27417026b912SYuchung Cheng tcp_undo_cwnd_reduction(sk, true); 2742c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); 27437026b912SYuchung Cheng tcp_try_keep_open(sk); 27447026b912SYuchung Cheng return true; 27456a63df46SYuchung Cheng } 27467026b912SYuchung Cheng return false; 27476a63df46SYuchung Cheng } 27486a63df46SYuchung Cheng 27496ac06ecdSYuchung Cheng static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag) 275098e36d44SYuchung Cheng { 275198e36d44SYuchung Cheng struct tcp_sock *tp = tcp_sk(sk); 275298e36d44SYuchung Cheng 27536ac06ecdSYuchung Cheng if (tcp_rtx_queue_empty(sk)) 27546ac06ecdSYuchung Cheng return; 27556ac06ecdSYuchung Cheng 27566ac06ecdSYuchung Cheng if (unlikely(tcp_is_reno(tp))) { 27576ac06ecdSYuchung Cheng tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED); 27586ac06ecdSYuchung Cheng } else if (tcp_is_rack(sk)) { 275998e36d44SYuchung Cheng u32 prior_retrans = tp->retrans_out; 276098e36d44SYuchung Cheng 2761128eda86SEric Dumazet tcp_rack_mark_lost(sk); 276298e36d44SYuchung Cheng if (prior_retrans > tp->retrans_out) 276398e36d44SYuchung Cheng *ack_flag |= FLAG_LOST_RETRANS; 276498e36d44SYuchung Cheng } 276598e36d44SYuchung Cheng } 276698e36d44SYuchung Cheng 2767737ff314SYuchung Cheng static bool tcp_force_fast_retransmit(struct sock *sk) 2768737ff314SYuchung Cheng { 2769737ff314SYuchung Cheng struct tcp_sock *tp = tcp_sk(sk); 2770737ff314SYuchung Cheng 2771737ff314SYuchung Cheng return after(tcp_highest_sack_seq(tp), 2772737ff314SYuchung Cheng tp->snd_una + tp->reordering * tp->mss_cache); 2773737ff314SYuchung Cheng } 2774737ff314SYuchung Cheng 27751da177e4SLinus Torvalds /* Process an event, which can update packets-in-flight not trivially. 27761da177e4SLinus Torvalds * Main goal of this function is to calculate new estimate for left_out, 27771da177e4SLinus Torvalds * taking into account both packets sitting in receiver's buffer and 27781da177e4SLinus Torvalds * packets lost by network. 27791da177e4SLinus Torvalds * 278031ba0c10SYuchung Cheng * Besides that it updates the congestion state when packet loss or ECN 278131ba0c10SYuchung Cheng * is detected. But it does not reduce the cwnd, it is done by the 278231ba0c10SYuchung Cheng * congestion control later. 27831da177e4SLinus Torvalds * 27841da177e4SLinus Torvalds * It does _not_ decide what to send, it is made in function 27851da177e4SLinus Torvalds * tcp_xmit_retransmit_queue(). 
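 *
 * On return, *rexmit tells tcp_ack() what to (re)transmit: REXMIT_LOST
 * to retransmit segments marked lost, or REXMIT_NEW (F-RTO) to prefer
 * sending previously unsent data first.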
27861da177e4SLinus Torvalds */ 2787737ff314SYuchung Cheng static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, 278819119f29SEric Dumazet int num_dupack, int *ack_flag, int *rexmit) 27891da177e4SLinus Torvalds { 27906687e988SArnaldo Carvalho de Melo struct inet_connection_sock *icsk = inet_csk(sk); 27911da177e4SLinus Torvalds struct tcp_sock *tp = tcp_sk(sk); 279231ba0c10SYuchung Cheng int fast_rexmit = 0, flag = *ack_flag; 279319119f29SEric Dumazet bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) && 2794737ff314SYuchung Cheng tcp_force_fast_retransmit(sk)); 27951da177e4SLinus Torvalds 27968ba6ddaaSEric Dumazet if (!tp->packets_out && tp->sacked_out) 27971da177e4SLinus Torvalds tp->sacked_out = 0; 27981da177e4SLinus Torvalds 27991da177e4SLinus Torvalds /* Now state machine starts. 28001da177e4SLinus Torvalds * A. ECE, hence prohibit cwnd undoing, the reduction is required. */ 28011da177e4SLinus Torvalds if (flag & FLAG_ECE) 28021da177e4SLinus Torvalds tp->prior_ssthresh = 0; 28031da177e4SLinus Torvalds 28041da177e4SLinus Torvalds /* B. In all the states check for reneging SACKs. */ 2805cadbd031SIlpo Järvinen if (tcp_check_sack_reneging(sk, flag)) 28061da177e4SLinus Torvalds return; 28071da177e4SLinus Torvalds 2808974c1236SYuchung Cheng /* C. Check consistency of the current state. */ 2809005903bcSIlpo Järvinen tcp_verify_left_out(tp); 28101da177e4SLinus Torvalds 2811974c1236SYuchung Cheng /* D. Check state exit conditions. State can be terminated 28121da177e4SLinus Torvalds * when high_seq is ACKed. */ 28136687e988SArnaldo Carvalho de Melo if (icsk->icsk_ca_state == TCP_CA_Open) { 2814547b792cSIlpo Järvinen WARN_ON(tp->retrans_out != 0); 28151da177e4SLinus Torvalds tp->retrans_stamp = 0; 28161da177e4SLinus Torvalds } else if (!before(tp->snd_una, tp->high_seq)) { 28176687e988SArnaldo Carvalho de Melo switch (icsk->icsk_ca_state) { 28181da177e4SLinus Torvalds case TCP_CA_CWR: 28191da177e4SLinus Torvalds /* CWR is to be held something *above* high_seq 28201da177e4SLinus Torvalds * is ACKed for CWR bit to reach receiver. */ 28211da177e4SLinus Torvalds if (tp->snd_una != tp->high_seq) { 2822684bad11SYuchung Cheng tcp_end_cwnd_reduction(sk); 28236687e988SArnaldo Carvalho de Melo tcp_set_ca_state(sk, TCP_CA_Open); 28241da177e4SLinus Torvalds } 28251da177e4SLinus Torvalds break; 28261da177e4SLinus Torvalds 28271da177e4SLinus Torvalds case TCP_CA_Recovery: 2828e60402d0SIlpo Järvinen if (tcp_is_reno(tp)) 28291da177e4SLinus Torvalds tcp_reset_reno_sack(tp); 28309e412ba7SIlpo Järvinen if (tcp_try_undo_recovery(sk)) 28311da177e4SLinus Torvalds return; 2832684bad11SYuchung Cheng tcp_end_cwnd_reduction(sk); 28331da177e4SLinus Torvalds break; 28341da177e4SLinus Torvalds } 28351da177e4SLinus Torvalds } 28361da177e4SLinus Torvalds 2837974c1236SYuchung Cheng /* E. Process state. */ 28386687e988SArnaldo Carvalho de Melo switch (icsk->icsk_ca_state) { 28391da177e4SLinus Torvalds case TCP_CA_Recovery: 28402e605294SIlpo Järvinen if (!(flag & FLAG_SND_UNA_ADVANCED)) { 284119119f29SEric Dumazet if (tcp_is_reno(tp)) 284219119f29SEric Dumazet tcp_add_reno_sack(sk, num_dupack); 28437026b912SYuchung Cheng } else { 2844737ff314SYuchung Cheng if (tcp_try_undo_partial(sk, prior_snd_una)) 28457026b912SYuchung Cheng return; 28467026b912SYuchung Cheng /* Partial ACK arrived. Force fast retransmit. 
*/ 28477026b912SYuchung Cheng do_lost = tcp_is_reno(tp) || 2848737ff314SYuchung Cheng tcp_force_fast_retransmit(sk); 28497026b912SYuchung Cheng } 2850c7d9d6a1SYuchung Cheng if (tcp_try_undo_dsack(sk)) { 2851c7d9d6a1SYuchung Cheng tcp_try_keep_open(sk); 2852c7d9d6a1SYuchung Cheng return; 2853c7d9d6a1SYuchung Cheng } 28546ac06ecdSYuchung Cheng tcp_identify_packet_loss(sk, ack_flag); 28551da177e4SLinus Torvalds break; 28561da177e4SLinus Torvalds case TCP_CA_Loss: 285719119f29SEric Dumazet tcp_process_loss(sk, flag, num_dupack, rexmit); 28586ac06ecdSYuchung Cheng tcp_identify_packet_loss(sk, ack_flag); 285998e36d44SYuchung Cheng if (!(icsk->icsk_ca_state == TCP_CA_Open || 286098e36d44SYuchung Cheng (*ack_flag & FLAG_LOST_RETRANS))) 28611da177e4SLinus Torvalds return; 2862291a00d1SYuchung Cheng /* Change state if cwnd is undone or retransmits are lost */ 2863fcfd6dfaSGustavo A. R. Silva /* fall through */ 28641da177e4SLinus Torvalds default: 2865e60402d0SIlpo Järvinen if (tcp_is_reno(tp)) { 28662e605294SIlpo Järvinen if (flag & FLAG_SND_UNA_ADVANCED) 28671da177e4SLinus Torvalds tcp_reset_reno_sack(tp); 286819119f29SEric Dumazet tcp_add_reno_sack(sk, num_dupack); 28691da177e4SLinus Torvalds } 28701da177e4SLinus Torvalds 2871f698204bSNeal Cardwell if (icsk->icsk_ca_state <= TCP_CA_Disorder) 28729e412ba7SIlpo Järvinen tcp_try_undo_dsack(sk); 28731da177e4SLinus Torvalds 28746ac06ecdSYuchung Cheng tcp_identify_packet_loss(sk, ack_flag); 2875750ea2baSYuchung Cheng if (!tcp_time_to_recover(sk, flag)) { 287631ba0c10SYuchung Cheng tcp_try_to_open(sk, flag); 28771da177e4SLinus Torvalds return; 28781da177e4SLinus Torvalds } 28791da177e4SLinus Torvalds 28805d424d5aSJohn Heffner /* MTU probe failure: don't reduce cwnd */ 28815d424d5aSJohn Heffner if (icsk->icsk_ca_state < TCP_CA_CWR && 28825d424d5aSJohn Heffner icsk->icsk_mtup.probe_size && 28830e7b1368SJohn Heffner tp->snd_una == tp->mtu_probe.probe_seq_start) { 28845d424d5aSJohn Heffner tcp_mtup_probe_failed(sk); 28855d424d5aSJohn Heffner /* Restores the reduction we did in tcp_mtup_probe() */ 28865d424d5aSJohn Heffner tp->snd_cwnd++; 28875d424d5aSJohn Heffner tcp_simple_retransmit(sk); 28885d424d5aSJohn Heffner return; 28895d424d5aSJohn Heffner } 28905d424d5aSJohn Heffner 28911da177e4SLinus Torvalds /* Otherwise enter Recovery state */ 28921fbc3405SYuchung Cheng tcp_enter_recovery(sk, (flag & FLAG_ECE)); 289385cc391cSIlpo Järvinen fast_rexmit = 1; 28941da177e4SLinus Torvalds } 28951da177e4SLinus Torvalds 2896b38a51feSYuchung Cheng if (!tcp_is_rack(sk) && do_lost) 289785cc391cSIlpo Järvinen tcp_update_scoreboard(sk, fast_rexmit); 2898e662ca40SYuchung Cheng *rexmit = REXMIT_LOST; 28991da177e4SLinus Torvalds } 29001da177e4SLinus Torvalds 2901eb36be0fSYuchung Cheng static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag) 2902f6722583SYuchung Cheng { 2903bd239704SEric Dumazet u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ; 290464033892SNeal Cardwell struct tcp_sock *tp = tcp_sk(sk); 2905f6722583SYuchung Cheng 2906eb36be0fSYuchung Cheng if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) { 2907eb36be0fSYuchung Cheng /* If the remote keeps returning delayed ACKs, eventually 2908eb36be0fSYuchung Cheng * the min filter would pick it up and overestimate the 2909eb36be0fSYuchung Cheng * prop. delay when it expires. Skip suspected delayed ACKs. 
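 *
 * (E.g., on a path with a 5 ms RTT, a sample inflated to ~45 ms by a
 * delayed ACK would, once older samples age out of the
 * sysctl_tcp_min_rtt_wlen window, become the new "minimum" and
 * overstate the propagation delay by an order of magnitude; the numbers
 * are illustrative only.)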
2910eb36be0fSYuchung Cheng */ 2911eb36be0fSYuchung Cheng return; 2912eb36be0fSYuchung Cheng } 2913ac9517fcSEric Dumazet minmax_running_min(&tp->rtt_min, wlen, tcp_jiffies32, 291464033892SNeal Cardwell rtt_us ? : jiffies_to_usecs(1)); 2915f6722583SYuchung Cheng } 2916f6722583SYuchung Cheng 2917775e68a9SYuchung Cheng static bool tcp_ack_update_rtt(struct sock *sk, const int flag, 2918f6722583SYuchung Cheng long seq_rtt_us, long sack_rtt_us, 2919775e68a9SYuchung Cheng long ca_rtt_us, struct rate_sample *rs) 292041834b73SIlpo Järvinen { 29215b08e47cSYuchung Cheng const struct tcp_sock *tp = tcp_sk(sk); 292241834b73SIlpo Järvinen 29235b08e47cSYuchung Cheng /* Prefer RTT measured from ACK's timing to TS-ECR. This is because 29245b08e47cSYuchung Cheng * broken middle-boxes or peers may corrupt TS-ECR fields. But 29255b08e47cSYuchung Cheng * Karn's algorithm forbids taking RTT if some retransmitted data 29265b08e47cSYuchung Cheng * is acked (RFC6298). 29271da177e4SLinus Torvalds */ 2928740b0f18SEric Dumazet if (seq_rtt_us < 0) 2929740b0f18SEric Dumazet seq_rtt_us = sack_rtt_us; 2930ed08495cSYuchung Cheng 29311da177e4SLinus Torvalds /* RTTM Rule: A TSecr value received in a segment is used to 29321da177e4SLinus Torvalds * update the averaged RTT measurement only if the segment 29331da177e4SLinus Torvalds * acknowledges some new data, i.e., only if it advances the 29341da177e4SLinus Torvalds * left edge of the send window. 29351da177e4SLinus Torvalds * See draft-ietf-tcplw-high-performance-00, section 3.3. 29361da177e4SLinus Torvalds */ 2937740b0f18SEric Dumazet if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 29389a568de4SEric Dumazet flag & FLAG_ACKED) { 29399a568de4SEric Dumazet u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr; 29409a568de4SEric Dumazet 29419efdda4eSEric Dumazet if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) { 29429efdda4eSEric Dumazet seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ); 29439efdda4eSEric Dumazet ca_rtt_us = seq_rtt_us; 29449efdda4eSEric Dumazet } 29459a568de4SEric Dumazet } 2946775e68a9SYuchung Cheng rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */ 2947740b0f18SEric Dumazet if (seq_rtt_us < 0) 2948ed08495cSYuchung Cheng return false; 29491da177e4SLinus Torvalds 2950f6722583SYuchung Cheng /* ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is 2951f6722583SYuchung Cheng * always taken together with ACK, SACK, or TS-opts. Any negative 2952f6722583SYuchung Cheng * values will be skipped with the seq_rtt_us < 0 check above. 2953f6722583SYuchung Cheng */ 2954eb36be0fSYuchung Cheng tcp_update_rtt_min(sk, ca_rtt_us, flag); 2955740b0f18SEric Dumazet tcp_rtt_estimator(sk, seq_rtt_us); 29565b08e47cSYuchung Cheng tcp_set_rto(sk); 29571da177e4SLinus Torvalds 29585b08e47cSYuchung Cheng /* RFC6298: only reset backoff on valid RTT measurement. */ 29595b08e47cSYuchung Cheng inet_csk(sk)->icsk_backoff = 0; 2960ed08495cSYuchung Cheng return true; 29611da177e4SLinus Torvalds } 29621da177e4SLinus Torvalds 2963375fe02cSYuchung Cheng /* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. 
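 * If the SYN-ACK had to be retransmitted (req->num_retrans != 0), the
 * sample is skipped per Karn's algorithm, since the ACK could be
 * matching either transmission.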
*/ 29640f1c28aeSYuchung Cheng void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req) 2965375fe02cSYuchung Cheng { 2966775e68a9SYuchung Cheng struct rate_sample rs; 29670f1c28aeSYuchung Cheng long rtt_us = -1L; 2968375fe02cSYuchung Cheng 29699a568de4SEric Dumazet if (req && !req->num_retrans && tcp_rsk(req)->snt_synack) 29709a568de4SEric Dumazet rtt_us = tcp_stamp_us_delta(tcp_clock_us(), tcp_rsk(req)->snt_synack); 2971375fe02cSYuchung Cheng 2972775e68a9SYuchung Cheng tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, rtt_us, -1L, rtt_us, &rs); 29730f1c28aeSYuchung Cheng } 29740f1c28aeSYuchung Cheng 29750f1c28aeSYuchung Cheng 297624901551SEric Dumazet static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) 29771da177e4SLinus Torvalds { 29786687e988SArnaldo Carvalho de Melo const struct inet_connection_sock *icsk = inet_csk(sk); 297924901551SEric Dumazet 298024901551SEric Dumazet icsk->icsk_ca_ops->cong_avoid(sk, ack, acked); 2981c2203cf7SEric Dumazet tcp_sk(sk)->snd_cwnd_stamp = tcp_jiffies32; 29821da177e4SLinus Torvalds } 29831da177e4SLinus Torvalds 29841da177e4SLinus Torvalds /* Restart timer after forward progress on connection. 29851da177e4SLinus Torvalds * RFC2988 recommends to restart timer to now+rto. 29861da177e4SLinus Torvalds */ 2987750ea2baSYuchung Cheng void tcp_rearm_rto(struct sock *sk) 29881da177e4SLinus Torvalds { 29896ba8a3b1SNandita Dukkipati const struct inet_connection_sock *icsk = inet_csk(sk); 2990750ea2baSYuchung Cheng struct tcp_sock *tp = tcp_sk(sk); 29919e412ba7SIlpo Järvinen 2992168a8f58SJerry Chu /* If the retrans timer is currently being used by Fast Open 2993168a8f58SJerry Chu * for SYN-ACK retrans purpose, stay put. 2994168a8f58SJerry Chu */ 2995d983ea6fSEric Dumazet if (rcu_access_pointer(tp->fastopen_rsk)) 2996168a8f58SJerry Chu return; 2997168a8f58SJerry Chu 29981da177e4SLinus Torvalds if (!tp->packets_out) { 2999463c84b9SArnaldo Carvalho de Melo inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); 30001da177e4SLinus Torvalds } else { 3001750ea2baSYuchung Cheng u32 rto = inet_csk(sk)->icsk_rto; 3002750ea2baSYuchung Cheng /* Offset the time elapsed after installing regular RTO */ 3003bec41a11SYuchung Cheng if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || 30046ba8a3b1SNandita Dukkipati icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 3005e1a10ef7SNeal Cardwell s64 delta_us = tcp_rto_delta_us(sk); 3006b17b8a20SEric Dumazet /* delta_us may not be positive if the socket is locked 30076ba8a3b1SNandita Dukkipati * when the retrans timer fires and is rescheduled. 3008750ea2baSYuchung Cheng */ 3009cdbeb633SNeal Cardwell rto = usecs_to_jiffies(max_t(int, delta_us, 1)); 30101da177e4SLinus Torvalds } 30113f80e08fSEric Dumazet tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, 30123f80e08fSEric Dumazet TCP_RTO_MAX, tcp_rtx_queue_head(sk)); 3013750ea2baSYuchung Cheng } 3014750ea2baSYuchung Cheng } 3015750ea2baSYuchung Cheng 3016df92c839SNeal Cardwell /* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */ 3017df92c839SNeal Cardwell static void tcp_set_xmit_timer(struct sock *sk) 3018df92c839SNeal Cardwell { 3019ed66dfafSNeal Cardwell if (!tcp_schedule_loss_probe(sk, true)) 3020df92c839SNeal Cardwell tcp_rearm_rto(sk); 3021df92c839SNeal Cardwell } 3022df92c839SNeal Cardwell 30237c46a03eSIlpo Järvinen /* If we get here, the whole TSO packet has not been acked. 
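 * Only the prefix covered by snd_una is accounted: e.g. if a 10-segment
 * TSO skb has 4 segments cumulatively acked, tcp_trim_head() drops those
 * 4 segments and the function reports packets_acked = 4 (illustrative
 * numbers).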
*/ 302413fcf850SIlpo Järvinen static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) 30251da177e4SLinus Torvalds { 30261da177e4SLinus Torvalds struct tcp_sock *tp = tcp_sk(sk); 30277c46a03eSIlpo Järvinen u32 packets_acked; 30281da177e4SLinus Torvalds 30297c46a03eSIlpo Järvinen BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)); 30301da177e4SLinus Torvalds 30311da177e4SLinus Torvalds packets_acked = tcp_skb_pcount(skb); 30327c46a03eSIlpo Järvinen if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) 30331da177e4SLinus Torvalds return 0; 30341da177e4SLinus Torvalds packets_acked -= tcp_skb_pcount(skb); 30351da177e4SLinus Torvalds 30361da177e4SLinus Torvalds if (packets_acked) { 30371da177e4SLinus Torvalds BUG_ON(tcp_skb_pcount(skb) == 0); 30387c46a03eSIlpo Järvinen BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)); 30391da177e4SLinus Torvalds } 30401da177e4SLinus Torvalds 304113fcf850SIlpo Järvinen return packets_acked; 30421da177e4SLinus Torvalds } 30431da177e4SLinus Torvalds 3044ad971f61SEric Dumazet static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, 3045ad971f61SEric Dumazet u32 prior_snd_una) 3046ad971f61SEric Dumazet { 3047ad971f61SEric Dumazet const struct skb_shared_info *shinfo; 3048ad971f61SEric Dumazet 3049ad971f61SEric Dumazet /* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */ 30506b084928SSoheil Hassas Yeganeh if (likely(!TCP_SKB_CB(skb)->txstamp_ack)) 3051ad971f61SEric Dumazet return; 3052ad971f61SEric Dumazet 3053ad971f61SEric Dumazet shinfo = skb_shinfo(skb); 30540a2cf20cSSoheil Hassas Yeganeh if (!before(shinfo->tskey, prior_snd_una) && 3055e2080072SEric Dumazet before(shinfo->tskey, tcp_sk(sk)->snd_una)) { 3056e2080072SEric Dumazet tcp_skb_tsorted_save(skb) { 3057ad971f61SEric Dumazet __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); 3058e2080072SEric Dumazet } tcp_skb_tsorted_restore(skb); 3059e2080072SEric Dumazet } 3060ad971f61SEric Dumazet } 3061ad971f61SEric Dumazet 30627c46a03eSIlpo Järvinen /* Remove acknowledged frames from the retransmission queue. If our packet 30637c46a03eSIlpo Järvinen * is before the ack sequence we can discard it as it's confirmed to have 30647c46a03eSIlpo Järvinen * arrived at the other end. 
30657c46a03eSIlpo Järvinen */ 3066737ff314SYuchung Cheng static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, 3067737ff314SYuchung Cheng u32 prior_snd_una, 3068deed7be7SYuchung Cheng struct tcp_sacktag_state *sack) 30691da177e4SLinus Torvalds { 30702d2abbabSStephen Hemminger const struct inet_connection_sock *icsk = inet_csk(sk); 30719a568de4SEric Dumazet u64 first_ackt, last_ackt; 3072740b0f18SEric Dumazet struct tcp_sock *tp = tcp_sk(sk); 307390638a04SIlpo Järvinen u32 prior_sacked = tp->sacked_out; 3074737ff314SYuchung Cheng u32 reord = tp->snd_nxt; /* lowest acked un-retx un-sacked seq */ 307575c119afSEric Dumazet struct sk_buff *skb, *next; 3076740b0f18SEric Dumazet bool fully_acked = true; 307731231a8aSKenneth Klette Jonassen long sack_rtt_us = -1L; 3078740b0f18SEric Dumazet long seq_rtt_us = -1L; 307931231a8aSKenneth Klette Jonassen long ca_rtt_us = -1L; 3080740b0f18SEric Dumazet u32 pkts_acked = 0; 30816f094b9eSLawrence Brakmo u32 last_in_flight = 0; 30822f715c1dSYuchung Cheng bool rtt_update; 3083740b0f18SEric Dumazet int flag = 0; 3084740b0f18SEric Dumazet 30859a568de4SEric Dumazet first_ackt = 0; 30861da177e4SLinus Torvalds 308775c119afSEric Dumazet for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) { 30881da177e4SLinus Torvalds struct tcp_skb_cb *scb = TCP_SKB_CB(skb); 3089737ff314SYuchung Cheng const u32 start_seq = scb->seq; 30907c46a03eSIlpo Järvinen u8 sacked = scb->sacked; 3091740b0f18SEric Dumazet u32 acked_pcount; 30921da177e4SLinus Torvalds 3093ad971f61SEric Dumazet tcp_ack_tstamp(sk, skb, prior_snd_una); 3094712a7221SWillem de Bruijn 30952072c228SGavin McCullagh /* Determine how many packets and what bytes were acked, tso and else */ 30961da177e4SLinus Torvalds if (after(scb->end_seq, tp->snd_una)) { 309713fcf850SIlpo Järvinen if (tcp_skb_pcount(skb) == 1 || 309813fcf850SIlpo Järvinen !after(tp->snd_una, scb->seq)) 30991da177e4SLinus Torvalds break; 310013fcf850SIlpo Järvinen 310172018835SIlpo Järvinen acked_pcount = tcp_tso_acked(sk, skb); 310272018835SIlpo Järvinen if (!acked_pcount) 310313fcf850SIlpo Järvinen break; 3104a2a385d6SEric Dumazet fully_acked = false; 310513fcf850SIlpo Järvinen } else { 310672018835SIlpo Järvinen acked_pcount = tcp_skb_pcount(skb); 31071da177e4SLinus Torvalds } 31081da177e4SLinus Torvalds 3109ad971f61SEric Dumazet if (unlikely(sacked & TCPCB_RETRANS)) { 31101da177e4SLinus Torvalds if (sacked & TCPCB_SACKED_RETRANS) 311172018835SIlpo Järvinen tp->retrans_out -= acked_pcount; 31127c46a03eSIlpo Järvinen flag |= FLAG_RETRANS_DATA_ACKED; 31133d0d26c7SKenneth Klette Jonassen } else if (!(sacked & TCPCB_SACKED_ACKED)) { 31142fd66ffbSEric Dumazet last_ackt = tcp_skb_timestamp_us(skb); 31159a568de4SEric Dumazet WARN_ON_ONCE(last_ackt == 0); 31169a568de4SEric Dumazet if (!first_ackt) 3117740b0f18SEric Dumazet first_ackt = last_ackt; 3118740b0f18SEric Dumazet 31196f094b9eSLawrence Brakmo last_in_flight = TCP_SKB_CB(skb)->tx.in_flight; 3120737ff314SYuchung Cheng if (before(start_seq, reord)) 3121737ff314SYuchung Cheng reord = start_seq; 3122e33099f9SYuchung Cheng if (!after(scb->end_seq, tp->high_seq)) 3123e33099f9SYuchung Cheng flag |= FLAG_ORIG_SACK_ACKED; 3124c7caf8d3SIlpo Järvinen } 31257c46a03eSIlpo Järvinen 3126ddf1af6fSYuchung Cheng if (sacked & TCPCB_SACKED_ACKED) { 312772018835SIlpo Järvinen tp->sacked_out -= acked_pcount; 3128ddf1af6fSYuchung Cheng } else if (tcp_is_sack(tp)) { 3129ddf1af6fSYuchung Cheng tp->delivered += acked_pcount; 3130ddf1af6fSYuchung Cheng if (!tcp_skb_spurious_retrans(tp, skb)) 
31311d0833dfSYuchung Cheng tcp_rack_advance(tp, sacked, scb->end_seq, 31322fd66ffbSEric Dumazet tcp_skb_timestamp_us(skb)); 3133ddf1af6fSYuchung Cheng } 31341da177e4SLinus Torvalds if (sacked & TCPCB_LOST) 313572018835SIlpo Järvinen tp->lost_out -= acked_pcount; 31367c46a03eSIlpo Järvinen 313772018835SIlpo Järvinen tp->packets_out -= acked_pcount; 313872018835SIlpo Järvinen pkts_acked += acked_pcount; 3139b9f64820SYuchung Cheng tcp_rate_skb_delivered(sk, skb, sack->rate); 314013fcf850SIlpo Järvinen 3141009a2e3eSIlpo Järvinen /* Initial outgoing SYN's get put onto the write_queue 3142009a2e3eSIlpo Järvinen * just like anything else we transmit. It is not 3143009a2e3eSIlpo Järvinen * true data, and if we misinform our callers that 3144009a2e3eSIlpo Järvinen * this ACK acks real data, we will erroneously exit 3145009a2e3eSIlpo Järvinen * connection startup slow start one packet too 3146009a2e3eSIlpo Järvinen * quickly. This is severely frowned upon behavior. 3147009a2e3eSIlpo Järvinen */ 3148ad971f61SEric Dumazet if (likely(!(scb->tcp_flags & TCPHDR_SYN))) { 3149009a2e3eSIlpo Järvinen flag |= FLAG_DATA_ACKED; 3150009a2e3eSIlpo Järvinen } else { 3151009a2e3eSIlpo Järvinen flag |= FLAG_SYN_ACKED; 3152009a2e3eSIlpo Järvinen tp->retrans_stamp = 0; 3153009a2e3eSIlpo Järvinen } 3154009a2e3eSIlpo Järvinen 315513fcf850SIlpo Järvinen if (!fully_acked) 315613fcf850SIlpo Järvinen break; 315713fcf850SIlpo Järvinen 315875c119afSEric Dumazet next = skb_rb_next(skb); 3159ad971f61SEric Dumazet if (unlikely(skb == tp->retransmit_skb_hint)) 3160ef9da47cSIlpo Järvinen tp->retransmit_skb_hint = NULL; 3161ad971f61SEric Dumazet if (unlikely(skb == tp->lost_skb_hint)) 316290638a04SIlpo Järvinen tp->lost_skb_hint = NULL; 316375c119afSEric Dumazet tcp_rtx_queue_unlink_and_free(skb, sk); 31641da177e4SLinus Torvalds } 31651da177e4SLinus Torvalds 31660f87230dSFrancis Yan if (!skb) 31670f87230dSFrancis Yan tcp_chrono_stop(sk, TCP_CHRONO_BUSY); 31680f87230dSFrancis Yan 316933f5f57eSIlpo Järvinen if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una))) 317033f5f57eSIlpo Järvinen tp->snd_up = tp->snd_una; 317133f5f57eSIlpo Järvinen 3172cadbd031SIlpo Järvinen if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) 3173cadbd031SIlpo Järvinen flag |= FLAG_SACK_RENEGING; 3174cadbd031SIlpo Järvinen 31759a568de4SEric Dumazet if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) { 31769a568de4SEric Dumazet seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt); 31779a568de4SEric Dumazet ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt); 3178eb36be0fSYuchung Cheng 3179eb36be0fSYuchung Cheng if (pkts_acked == 1 && last_in_flight < tp->mss_cache && 3180eb36be0fSYuchung Cheng last_in_flight && !prior_sacked && fully_acked && 3181eb36be0fSYuchung Cheng sack->rate->prior_delivered + 1 == tp->delivered && 3182eb36be0fSYuchung Cheng !(flag & (FLAG_CA_ALERT | FLAG_SYN_ACKED))) { 3183eb36be0fSYuchung Cheng /* Conservatively mark a delayed ACK. It's typically 3184eb36be0fSYuchung Cheng * from a lone runt packet over the round trip to 3185eb36be0fSYuchung Cheng * a receiver w/o out-of-order or CE events. 
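 * The checks above require a single acked packet that was sent with
 * less than one MSS in flight, no SACKed data outstanding and exactly
 * one segment newly delivered; FLAG_ACK_MAYBE_DELAYED then feeds
 * rs->is_ack_delayed so congestion control can discount this sample.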
3186eb36be0fSYuchung Cheng */ 3187eb36be0fSYuchung Cheng flag |= FLAG_ACK_MAYBE_DELAYED; 3188eb36be0fSYuchung Cheng } 318931231a8aSKenneth Klette Jonassen } 31909a568de4SEric Dumazet if (sack->first_sackt) { 31919a568de4SEric Dumazet sack_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->first_sackt); 31929a568de4SEric Dumazet ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, sack->last_sackt); 3193740b0f18SEric Dumazet } 3194f6722583SYuchung Cheng rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us, 3195775e68a9SYuchung Cheng ca_rtt_us, sack->rate); 3196ed08495cSYuchung Cheng 31977c46a03eSIlpo Järvinen if (flag & FLAG_ACKED) { 3198df92c839SNeal Cardwell flag |= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */ 319972211e90SIlpo Järvinen if (unlikely(icsk->icsk_mtup.probe_size && 320072211e90SIlpo Järvinen !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) { 320172211e90SIlpo Järvinen tcp_mtup_probe_success(sk); 320272211e90SIlpo Järvinen } 320372211e90SIlpo Järvinen 3204c7caf8d3SIlpo Järvinen if (tcp_is_reno(tp)) { 3205c7caf8d3SIlpo Järvinen tcp_remove_reno_sacks(sk, pkts_acked); 32061236f22fSIlpo Järvinen 32071236f22fSIlpo Järvinen /* If any of the cumulatively ACKed segments was 32081236f22fSIlpo Järvinen * retransmitted, non-SACK case cannot confirm that 32091236f22fSIlpo Järvinen * progress was due to original transmission due to 32101236f22fSIlpo Järvinen * lack of TCPCB_SACKED_ACKED bits even if some of 32111236f22fSIlpo Järvinen * the packets may have been never retransmitted. 32121236f22fSIlpo Järvinen */ 32131236f22fSIlpo Järvinen if (flag & FLAG_RETRANS_DATA_ACKED) 32141236f22fSIlpo Järvinen flag &= ~FLAG_ORIG_SACK_ACKED; 3215c7caf8d3SIlpo Järvinen } else { 321659a08cbaSIlpo Järvinen int delta; 321759a08cbaSIlpo Järvinen 3218c7caf8d3SIlpo Järvinen /* Non-retransmitted hole got filled? That's reordering */ 3219737ff314SYuchung Cheng if (before(reord, prior_fack)) 3220737ff314SYuchung Cheng tcp_check_sack_reordering(sk, reord, 0); 322190638a04SIlpo Järvinen 3222713bafeaSYuchung Cheng delta = prior_sacked - tp->sacked_out; 322359a08cbaSIlpo Järvinen tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); 3224c7caf8d3SIlpo Järvinen } 3225740b0f18SEric Dumazet } else if (skb && rtt_update && sack_rtt_us >= 0 && 32262fd66ffbSEric Dumazet sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, 32272fd66ffbSEric Dumazet tcp_skb_timestamp_us(skb))) { 32282f715c1dSYuchung Cheng /* Do not re-arm RTO if the sack RTT is measured from data sent 32292f715c1dSYuchung Cheng * after when the head was last (re)transmitted. Otherwise the 32302f715c1dSYuchung Cheng * timeout may continue to extend in loss recovery. 
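 * Concretely, the timer is only re-armed here when the SACKed data was
 * sent before the head's last (re)transmission (sack_rtt_us exceeds the
 * time the head has been outstanding); a sample from newer data says
 * nothing about the head and would only keep extending the timeout.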
32312f715c1dSYuchung Cheng */ 3232df92c839SNeal Cardwell flag |= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */ 32331da177e4SLinus Torvalds } 32341da177e4SLinus Torvalds 3235756ee172SLawrence Brakmo if (icsk->icsk_ca_ops->pkts_acked) { 3236756ee172SLawrence Brakmo struct ack_sample sample = { .pkts_acked = pkts_acked, 3237775e68a9SYuchung Cheng .rtt_us = sack->rate->rtt_us, 32386f094b9eSLawrence Brakmo .in_flight = last_in_flight }; 3239756ee172SLawrence Brakmo 3240756ee172SLawrence Brakmo icsk->icsk_ca_ops->pkts_acked(sk, &sample); 3241756ee172SLawrence Brakmo } 3242138998fdSKenneth Klette Jonassen 32431da177e4SLinus Torvalds #if FASTRETRANS_DEBUG > 0 3244547b792cSIlpo Järvinen WARN_ON((int)tp->sacked_out < 0); 3245547b792cSIlpo Järvinen WARN_ON((int)tp->lost_out < 0); 3246547b792cSIlpo Järvinen WARN_ON((int)tp->retrans_out < 0); 3247e60402d0SIlpo Järvinen if (!tp->packets_out && tcp_is_sack(tp)) { 3248cfcabdccSStephen Hemminger icsk = inet_csk(sk); 32491da177e4SLinus Torvalds if (tp->lost_out) { 325091df42beSJoe Perches pr_debug("Leak l=%u %d\n", 32516687e988SArnaldo Carvalho de Melo tp->lost_out, icsk->icsk_ca_state); 32521da177e4SLinus Torvalds tp->lost_out = 0; 32531da177e4SLinus Torvalds } 32541da177e4SLinus Torvalds if (tp->sacked_out) { 325591df42beSJoe Perches pr_debug("Leak s=%u %d\n", 32566687e988SArnaldo Carvalho de Melo tp->sacked_out, icsk->icsk_ca_state); 32571da177e4SLinus Torvalds tp->sacked_out = 0; 32581da177e4SLinus Torvalds } 32591da177e4SLinus Torvalds if (tp->retrans_out) { 326091df42beSJoe Perches pr_debug("Leak r=%u %d\n", 32616687e988SArnaldo Carvalho de Melo tp->retrans_out, icsk->icsk_ca_state); 32621da177e4SLinus Torvalds tp->retrans_out = 0; 32631da177e4SLinus Torvalds } 32641da177e4SLinus Torvalds } 32651da177e4SLinus Torvalds #endif 32667c46a03eSIlpo Järvinen return flag; 32671da177e4SLinus Torvalds } 32681da177e4SLinus Torvalds 32691da177e4SLinus Torvalds static void tcp_ack_probe(struct sock *sk) 32701da177e4SLinus Torvalds { 3271463c84b9SArnaldo Carvalho de Melo struct inet_connection_sock *icsk = inet_csk(sk); 327275c119afSEric Dumazet struct sk_buff *head = tcp_send_head(sk); 327375c119afSEric Dumazet const struct tcp_sock *tp = tcp_sk(sk); 32741da177e4SLinus Torvalds 32751da177e4SLinus Torvalds /* Was it a usable window open? */ 327675c119afSEric Dumazet if (!head) 327775c119afSEric Dumazet return; 327875c119afSEric Dumazet if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) { 3279463c84b9SArnaldo Carvalho de Melo icsk->icsk_backoff = 0; 3280463c84b9SArnaldo Carvalho de Melo inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); 32811da177e4SLinus Torvalds /* Socket must be waked up by subsequent tcp_data_snd_check(). 32821da177e4SLinus Torvalds * This function is not for random using! 
32831da177e4SLinus Torvalds */ 32841da177e4SLinus Torvalds } else { 328521c8fe99SEric Dumazet unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX); 3286fcdd1cf4SEric Dumazet 32873f80e08fSEric Dumazet tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, 32883f80e08fSEric Dumazet when, TCP_RTO_MAX, NULL); 32891da177e4SLinus Torvalds } 32901da177e4SLinus Torvalds } 32911da177e4SLinus Torvalds 329267b95bd7SVijay Subramanian static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag) 32931da177e4SLinus Torvalds { 3294a02cec21SEric Dumazet return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || 3295a02cec21SEric Dumazet inet_csk(sk)->icsk_ca_state != TCP_CA_Open; 32961da177e4SLinus Torvalds } 32971da177e4SLinus Torvalds 32980f7cc9a3SYuchung Cheng /* Decide whether to run the increase function of congestion control. */ 329967b95bd7SVijay Subramanian static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) 33001da177e4SLinus Torvalds { 33010f7cc9a3SYuchung Cheng /* If reordering is high then always grow cwnd whenever data is 33020f7cc9a3SYuchung Cheng * delivered regardless of its ordering. Otherwise stay conservative 330316edfe7eSYuchung Cheng * and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/ 33040f7cc9a3SYuchung Cheng * new SACK or ECE mark may first advance cwnd here and later reduce 33050f7cc9a3SYuchung Cheng * cwnd in tcp_fastretrans_alert() based on more states. 33060f7cc9a3SYuchung Cheng */ 33071043e25fSNikolay Borisov if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering) 33080f7cc9a3SYuchung Cheng return flag & FLAG_FORWARD_PROGRESS; 33090f7cc9a3SYuchung Cheng 331016edfe7eSYuchung Cheng return flag & FLAG_DATA_ACKED; 33111da177e4SLinus Torvalds } 33121da177e4SLinus Torvalds 3313d452e6caSYuchung Cheng /* The "ultimate" congestion control function that aims to replace the rigid 3314d452e6caSYuchung Cheng * cwnd increase and decrease control (tcp_cong_avoid,tcp_*cwnd_reduction). 3315d452e6caSYuchung Cheng * It's called toward the end of processing an ACK with precise rate 3316d452e6caSYuchung Cheng * information. All transmissions or retransmissions are delayed until afterwards. 3317d452e6caSYuchung Cheng */ 3318d452e6caSYuchung Cheng static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked, 3319c0402760SYuchung Cheng int flag, const struct rate_sample *rs) 3320d452e6caSYuchung Cheng { 3321c0402760SYuchung Cheng const struct inet_connection_sock *icsk = inet_csk(sk); 3322c0402760SYuchung Cheng 3323c0402760SYuchung Cheng if (icsk->icsk_ca_ops->cong_control) { 3324c0402760SYuchung Cheng icsk->icsk_ca_ops->cong_control(sk, rs); 3325c0402760SYuchung Cheng return; 3326c0402760SYuchung Cheng } 3327c0402760SYuchung Cheng 3328d452e6caSYuchung Cheng if (tcp_in_cwnd_reduction(sk)) { 3329d452e6caSYuchung Cheng /* Reduce cwnd if state mandates */ 3330d452e6caSYuchung Cheng tcp_cwnd_reduction(sk, acked_sacked, flag); 3331d452e6caSYuchung Cheng } else if (tcp_may_raise_cwnd(sk, flag)) { 3332d452e6caSYuchung Cheng /* Advance cwnd if state allows */ 3333d452e6caSYuchung Cheng tcp_cong_avoid(sk, ack, acked_sacked); 3334d452e6caSYuchung Cheng } 3335d452e6caSYuchung Cheng tcp_update_pacing_rate(sk); 3336d452e6caSYuchung Cheng } 3337d452e6caSYuchung Cheng 33381da177e4SLinus Torvalds /* Check that window update is acceptable. 33391da177e4SLinus Torvalds * The function assumes that snd_una<=ack<=snd_next.
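 *
 * For example, with snd_una == 100, snd_wl1 == 90 and snd_wnd == 1000,
 * a segment carrying ack == 100, ack_seq == 90 and a larger advertised
 * window is still accepted through the third test below.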
33401da177e4SLinus Torvalds */ 334167b95bd7SVijay Subramanian static inline bool tcp_may_update_window(const struct tcp_sock *tp, 3342056834d9SIlpo Järvinen const u32 ack, const u32 ack_seq, 3343056834d9SIlpo Järvinen const u32 nwin) 33441da177e4SLinus Torvalds { 3345a02cec21SEric Dumazet return after(ack, tp->snd_una) || 33461da177e4SLinus Torvalds after(ack_seq, tp->snd_wl1) || 3347a02cec21SEric Dumazet (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd); 33481da177e4SLinus Torvalds } 33491da177e4SLinus Torvalds 33500df48c26SEric Dumazet /* If we update tp->snd_una, also update tp->bytes_acked */ 33510df48c26SEric Dumazet static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack) 33520df48c26SEric Dumazet { 33530df48c26SEric Dumazet u32 delta = ack - tp->snd_una; 33540df48c26SEric Dumazet 335546cc6e49SEric Dumazet sock_owned_by_me((struct sock *)tp); 33560df48c26SEric Dumazet tp->bytes_acked += delta; 33570df48c26SEric Dumazet tp->snd_una = ack; 33580df48c26SEric Dumazet } 33590df48c26SEric Dumazet 3360bdd1f9edSEric Dumazet /* If we update tp->rcv_nxt, also update tp->bytes_received */ 3361bdd1f9edSEric Dumazet static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq) 3362bdd1f9edSEric Dumazet { 3363bdd1f9edSEric Dumazet u32 delta = seq - tp->rcv_nxt; 3364bdd1f9edSEric Dumazet 336546cc6e49SEric Dumazet sock_owned_by_me((struct sock *)tp); 3366bdd1f9edSEric Dumazet tp->bytes_received += delta; 3367dba7d9b8SEric Dumazet WRITE_ONCE(tp->rcv_nxt, seq); 3368bdd1f9edSEric Dumazet } 3369bdd1f9edSEric Dumazet 33701da177e4SLinus Torvalds /* Update our send window. 33711da177e4SLinus Torvalds * 33721da177e4SLinus Torvalds * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2 33731da177e4SLinus Torvalds * and in FreeBSD. NetBSD's one is even worse.) is wrong. 33741da177e4SLinus Torvalds */ 3375cf533ea5SEric Dumazet static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack, 33769e412ba7SIlpo Järvinen u32 ack_seq) 33771da177e4SLinus Torvalds { 33789e412ba7SIlpo Järvinen struct tcp_sock *tp = tcp_sk(sk); 33791da177e4SLinus Torvalds int flag = 0; 3380aa8223c7SArnaldo Carvalho de Melo u32 nwin = ntohs(tcp_hdr(skb)->window); 33811da177e4SLinus Torvalds 3382aa8223c7SArnaldo Carvalho de Melo if (likely(!tcp_hdr(skb)->syn)) 33831da177e4SLinus Torvalds nwin <<= tp->rx_opt.snd_wscale; 33841da177e4SLinus Torvalds 33851da177e4SLinus Torvalds if (tcp_may_update_window(tp, ack, ack_seq, nwin)) { 33861da177e4SLinus Torvalds flag |= FLAG_WIN_UPDATE; 3387ee7537b6SHantzis Fotis tcp_update_wl(tp, ack_seq); 33881da177e4SLinus Torvalds 33891da177e4SLinus Torvalds if (tp->snd_wnd != nwin) { 33901da177e4SLinus Torvalds tp->snd_wnd = nwin; 33911da177e4SLinus Torvalds 339231770e34SFlorian Westphal /* Note, it is the only place, where 339331770e34SFlorian Westphal * fast path is recovered for sending TCP. 
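 * Clearing pred_flags forces the slow path until tcp_fast_path_check()
 * decides header prediction can be re-enabled for the new window.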
339431770e34SFlorian Westphal */ 339531770e34SFlorian Westphal tp->pred_flags = 0; 339631770e34SFlorian Westphal tcp_fast_path_check(sk); 339731770e34SFlorian Westphal 339875c119afSEric Dumazet if (!tcp_write_queue_empty(sk)) 33996f021c62SEric Dumazet tcp_slow_start_after_idle_check(sk); 34006f021c62SEric Dumazet 34011da177e4SLinus Torvalds if (nwin > tp->max_window) { 34021da177e4SLinus Torvalds tp->max_window = nwin; 3403d83d8461SArnaldo Carvalho de Melo tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie); 34041da177e4SLinus Torvalds } 34051da177e4SLinus Torvalds } 34061da177e4SLinus Torvalds } 34071da177e4SLinus Torvalds 34080df48c26SEric Dumazet tcp_snd_una_update(tp, ack); 34091da177e4SLinus Torvalds 34101da177e4SLinus Torvalds return flag; 34111da177e4SLinus Torvalds } 34121da177e4SLinus Torvalds 3413083ae308SJason Baron static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, 3414083ae308SJason Baron u32 *last_oow_ack_time) 3415083ae308SJason Baron { 3416083ae308SJason Baron if (*last_oow_ack_time) { 3417594208afSEric Dumazet s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time); 3418083ae308SJason Baron 34194170ba6bSEric Dumazet if (0 <= elapsed && elapsed < net->ipv4.sysctl_tcp_invalid_ratelimit) { 3420083ae308SJason Baron NET_INC_STATS(net, mib_idx); 3421083ae308SJason Baron return true; /* rate-limited: don't send yet! */ 3422083ae308SJason Baron } 3423083ae308SJason Baron } 3424083ae308SJason Baron 3425594208afSEric Dumazet *last_oow_ack_time = tcp_jiffies32; 3426083ae308SJason Baron 3427083ae308SJason Baron return false; /* not rate-limited: go ahead, send dupack now! */ 3428083ae308SJason Baron } 3429083ae308SJason Baron 34307970ddc8SEric Dumazet /* Return true if we're currently rate-limiting out-of-window ACKs and 34317970ddc8SEric Dumazet * thus shouldn't send a dupack right now. We rate-limit dupacks in 34327970ddc8SEric Dumazet * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS 34337970ddc8SEric Dumazet * attacks that send repeated SYNs or ACKs for the same connection. To 34347970ddc8SEric Dumazet * do this, we do not send a duplicate SYNACK or ACK if the remote 34357970ddc8SEric Dumazet * endpoint is sending out-of-window SYNs or pure ACKs at a high rate. 34367970ddc8SEric Dumazet */ 34377970ddc8SEric Dumazet bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, 34387970ddc8SEric Dumazet int mib_idx, u32 *last_oow_ack_time) 34397970ddc8SEric Dumazet { 34407970ddc8SEric Dumazet /* Data packets without SYNs are not likely part of an ACK loop. */ 34417970ddc8SEric Dumazet if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) && 34427970ddc8SEric Dumazet !tcp_hdr(skb)->syn) 3443083ae308SJason Baron return false; 34447970ddc8SEric Dumazet 3445083ae308SJason Baron return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time); 34467970ddc8SEric Dumazet } 34477970ddc8SEric Dumazet 3448354e4aa3SEric Dumazet /* RFC 5961 7 [ACK Throttling] */ 3449f2b2c582SNeal Cardwell static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) 3450354e4aa3SEric Dumazet { 3451354e4aa3SEric Dumazet /* unprotected vars, we dont care of overwrites */ 3452354e4aa3SEric Dumazet static u32 challenge_timestamp; 3453354e4aa3SEric Dumazet static unsigned int challenge_count; 3454f2b2c582SNeal Cardwell struct tcp_sock *tp = tcp_sk(sk); 3455b530b681SEric Dumazet struct net *net = sock_net(sk); 345675ff39ccSEric Dumazet u32 count, now; 3457354e4aa3SEric Dumazet 3458f2b2c582SNeal Cardwell /* First check our per-socket dupack rate limit. 
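 * Challenge ACKs suppressed by this per-socket limit are counted in
 * LINUX_MIB_TCPACKSKIPPEDCHALLENGE; the host-wide RFC 5961 budget is
 * applied separately below.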
*/ 3459b530b681SEric Dumazet if (__tcp_oow_rate_limited(net, 3460f2b2c582SNeal Cardwell LINUX_MIB_TCPACKSKIPPEDCHALLENGE, 3461f2b2c582SNeal Cardwell &tp->last_oow_ack_time)) 3462f2b2c582SNeal Cardwell return; 3463f2b2c582SNeal Cardwell 346475ff39ccSEric Dumazet /* Then check host-wide RFC 5961 rate limit. */ 3465f2b2c582SNeal Cardwell now = jiffies / HZ; 3466354e4aa3SEric Dumazet if (now != challenge_timestamp) { 3467b530b681SEric Dumazet u32 ack_limit = net->ipv4.sysctl_tcp_challenge_ack_limit; 3468b530b681SEric Dumazet u32 half = (ack_limit + 1) >> 1; 346975ff39ccSEric Dumazet 3470354e4aa3SEric Dumazet challenge_timestamp = now; 3471b530b681SEric Dumazet WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit)); 3472354e4aa3SEric Dumazet } 347375ff39ccSEric Dumazet count = READ_ONCE(challenge_count); 347475ff39ccSEric Dumazet if (count > 0) { 347575ff39ccSEric Dumazet WRITE_ONCE(challenge_count, count - 1); 3476b530b681SEric Dumazet NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK); 3477354e4aa3SEric Dumazet tcp_send_ack(sk); 3478354e4aa3SEric Dumazet } 3479354e4aa3SEric Dumazet } 3480354e4aa3SEric Dumazet 348112fb3dd9SEric Dumazet static void tcp_store_ts_recent(struct tcp_sock *tp) 348212fb3dd9SEric Dumazet { 348312fb3dd9SEric Dumazet tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; 3484cca9bab1SArnd Bergmann tp->rx_opt.ts_recent_stamp = ktime_get_seconds(); 348512fb3dd9SEric Dumazet } 348612fb3dd9SEric Dumazet 348712fb3dd9SEric Dumazet static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) 348812fb3dd9SEric Dumazet { 348912fb3dd9SEric Dumazet if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) { 349012fb3dd9SEric Dumazet /* PAWS bug workaround wrt. ACK frames, the PAWS discard 349112fb3dd9SEric Dumazet * extra check below makes sure this can only happen 349212fb3dd9SEric Dumazet * for pure ACK frames. -DaveM 349312fb3dd9SEric Dumazet * 349412fb3dd9SEric Dumazet * Not only, also it occurs for expired timestamps. 349512fb3dd9SEric Dumazet */ 349612fb3dd9SEric Dumazet 349712fb3dd9SEric Dumazet if (tcp_paws_check(&tp->rx_opt, 0)) 349812fb3dd9SEric Dumazet tcp_store_ts_recent(tp); 349912fb3dd9SEric Dumazet } 350012fb3dd9SEric Dumazet } 350112fb3dd9SEric Dumazet 35029b717a8dSNandita Dukkipati /* This routine deals with acks during a TLP episode. 350308abdffaSSébastien Barré * We mark the end of a TLP episode on receiving TLP dupack or when 350408abdffaSSébastien Barré * ack is after tlp_high_seq. 35059b717a8dSNandita Dukkipati * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe. 35069b717a8dSNandita Dukkipati */ 35079b717a8dSNandita Dukkipati static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) 35089b717a8dSNandita Dukkipati { 35099b717a8dSNandita Dukkipati struct tcp_sock *tp = tcp_sk(sk); 35109b717a8dSNandita Dukkipati 351108abdffaSSébastien Barré if (before(ack, tp->tlp_high_seq)) 35129b717a8dSNandita Dukkipati return; 35139b717a8dSNandita Dukkipati 351408abdffaSSébastien Barré if (flag & FLAG_DSACKING_ACK) { 351508abdffaSSébastien Barré /* This DSACK means original and TLP probe arrived; no loss */ 35169b717a8dSNandita Dukkipati tp->tlp_high_seq = 0; 351708abdffaSSébastien Barré } else if (after(ack, tp->tlp_high_seq)) { 351808abdffaSSébastien Barré /* ACK advances: there was a loss, so reduce cwnd. 
Reset 351908abdffaSSébastien Barré * tlp_high_seq in tcp_init_cwnd_reduction() 352008abdffaSSébastien Barré */ 35215ee2c941SChristoph Paasch tcp_init_cwnd_reduction(sk); 35229b717a8dSNandita Dukkipati tcp_set_ca_state(sk, TCP_CA_CWR); 35239b717a8dSNandita Dukkipati tcp_end_cwnd_reduction(sk); 3524031afe49SYuchung Cheng tcp_try_keep_open(sk); 3525c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), 35269b717a8dSNandita Dukkipati LINUX_MIB_TCPLOSSPROBERECOVERY); 352708abdffaSSébastien Barré } else if (!(flag & (FLAG_SND_UNA_ADVANCED | 352808abdffaSSébastien Barré FLAG_NOT_DUP | FLAG_DATA_SACKED))) { 352908abdffaSSébastien Barré /* Pure dupack: original and TLP probe arrived; no loss */ 353008abdffaSSébastien Barré tp->tlp_high_seq = 0; 35319b717a8dSNandita Dukkipati } 35329b717a8dSNandita Dukkipati } 35339b717a8dSNandita Dukkipati 35347354c8c3SFlorian Westphal static inline void tcp_in_ack_event(struct sock *sk, u32 flags) 35357354c8c3SFlorian Westphal { 35367354c8c3SFlorian Westphal const struct inet_connection_sock *icsk = inet_csk(sk); 35377354c8c3SFlorian Westphal 35387354c8c3SFlorian Westphal if (icsk->icsk_ca_ops->in_ack_event) 35397354c8c3SFlorian Westphal icsk->icsk_ca_ops->in_ack_event(sk, flags); 35407354c8c3SFlorian Westphal } 35417354c8c3SFlorian Westphal 3542e662ca40SYuchung Cheng /* Congestion control has updated the cwnd already. So if we're in 3543e662ca40SYuchung Cheng * loss recovery then now we do any new sends (for FRTO) or 3544e662ca40SYuchung Cheng * retransmits (for CA_Loss or CA_recovery) that make sense. 3545e662ca40SYuchung Cheng */ 3546e662ca40SYuchung Cheng static void tcp_xmit_recovery(struct sock *sk, int rexmit) 3547e662ca40SYuchung Cheng { 3548e662ca40SYuchung Cheng struct tcp_sock *tp = tcp_sk(sk); 3549e662ca40SYuchung Cheng 3550bc9f38c8SYuchung Cheng if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT) 3551e662ca40SYuchung Cheng return; 3552e662ca40SYuchung Cheng 3553e662ca40SYuchung Cheng if (unlikely(rexmit == 2)) { 3554e662ca40SYuchung Cheng __tcp_push_pending_frames(sk, tcp_current_mss(sk), 3555e662ca40SYuchung Cheng TCP_NAGLE_OFF); 3556e662ca40SYuchung Cheng if (after(tp->snd_nxt, tp->high_seq)) 3557e662ca40SYuchung Cheng return; 3558e662ca40SYuchung Cheng tp->frto = 0; 3559e662ca40SYuchung Cheng } 3560e662ca40SYuchung Cheng tcp_xmit_retransmit_queue(sk); 3561e662ca40SYuchung Cheng } 3562e662ca40SYuchung Cheng 3563a77fa010SYuchung Cheng /* Returns the number of packets newly acked or sacked by the current ACK */ 3564a77fa010SYuchung Cheng static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) 3565a77fa010SYuchung Cheng { 3566feb5f2ecSYuchung Cheng const struct net *net = sock_net(sk); 3567a77fa010SYuchung Cheng struct tcp_sock *tp = tcp_sk(sk); 3568a77fa010SYuchung Cheng u32 delivered; 3569a77fa010SYuchung Cheng 3570a77fa010SYuchung Cheng delivered = tp->delivered - prior_delivered; 3571feb5f2ecSYuchung Cheng NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered); 3572feb5f2ecSYuchung Cheng if (flag & FLAG_ECE) { 3573e21db6f6SYuchung Cheng tp->delivered_ce += delivered; 3574feb5f2ecSYuchung Cheng NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered); 3575feb5f2ecSYuchung Cheng } 3576a77fa010SYuchung Cheng return delivered; 3577a77fa010SYuchung Cheng } 3578a77fa010SYuchung Cheng 35791da177e4SLinus Torvalds /* This routine deals with incoming acks, but not outgoing ones. 
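 * Roughly: validate the ACK against snd_una/snd_nxt (sending an RFC 5961
 * challenge ACK for ACKs that are before snd_una - max_window), update
 * the send window, clean the rtx queue, run tcp_fastretrans_alert() when
 * the ACK is dubious, and finally feed a rate sample to congestion control.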
*/ 3580cf533ea5SEric Dumazet static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) 35811da177e4SLinus Torvalds { 35826687e988SArnaldo Carvalho de Melo struct inet_connection_sock *icsk = inet_csk(sk); 35831da177e4SLinus Torvalds struct tcp_sock *tp = tcp_sk(sk); 3584196da974SKenneth Klette Jonassen struct tcp_sacktag_state sack_state; 3585b9f64820SYuchung Cheng struct rate_sample rs = { .prior_delivered = 0 }; 35861da177e4SLinus Torvalds u32 prior_snd_una = tp->snd_una; 3587d4761754SYousuk Seung bool is_sack_reneg = tp->is_sack_reneg; 35881da177e4SLinus Torvalds u32 ack_seq = TCP_SKB_CB(skb)->seq; 35891da177e4SLinus Torvalds u32 ack = TCP_SKB_CB(skb)->ack_seq; 359019119f29SEric Dumazet int num_dupack = 0; 359135f079ebSNandita Dukkipati int prior_packets = tp->packets_out; 3592b9f64820SYuchung Cheng u32 delivered = tp->delivered; 3593b9f64820SYuchung Cheng u32 lost = tp->lost; 3594e662ca40SYuchung Cheng int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ 3595737ff314SYuchung Cheng u32 prior_fack; 3596196da974SKenneth Klette Jonassen 35979a568de4SEric Dumazet sack_state.first_sackt = 0; 3598b9f64820SYuchung Cheng sack_state.rate = &rs; 35991da177e4SLinus Torvalds 360075c119afSEric Dumazet /* We very likely will need to access rtx queue. */ 360175c119afSEric Dumazet prefetch(sk->tcp_rtx_queue.rb_node); 3602ad971f61SEric Dumazet 360396e0bf4bSJohn Dykstra /* If the ack is older than previous acks 36041da177e4SLinus Torvalds * then we can probably ignore it. 36051da177e4SLinus Torvalds */ 3606354e4aa3SEric Dumazet if (before(ack, prior_snd_una)) { 3607354e4aa3SEric Dumazet /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ 3608354e4aa3SEric Dumazet if (before(ack, prior_snd_una - tp->max_window)) { 3609d0e1a1b5SEric Dumazet if (!(flag & FLAG_NO_CHALLENGE_ACK)) 3610f2b2c582SNeal Cardwell tcp_send_challenge_ack(sk, skb); 3611354e4aa3SEric Dumazet return -1; 3612354e4aa3SEric Dumazet } 36131da177e4SLinus Torvalds goto old_ack; 3614354e4aa3SEric Dumazet } 36151da177e4SLinus Torvalds 361696e0bf4bSJohn Dykstra /* If the ack includes data we haven't sent yet, discard 361796e0bf4bSJohn Dykstra * this segment (RFC793 Section 3.9). 361896e0bf4bSJohn Dykstra */ 361996e0bf4bSJohn Dykstra if (after(ack, tp->snd_nxt)) 36209946b341SYafang Shao return -1; 362196e0bf4bSJohn Dykstra 36220c9ab092SNeal Cardwell if (after(ack, prior_snd_una)) { 36232e605294SIlpo Järvinen flag |= FLAG_SND_UNA_ADVANCED; 36240c9ab092SNeal Cardwell icsk->icsk_retransmits = 0; 36256dac1523SIlya Lesokhin 36266dac1523SIlya Lesokhin #if IS_ENABLED(CONFIG_TLS_DEVICE) 3627494bc1d2SJakub Kicinski if (static_branch_unlikely(&clean_acked_data_enabled.key)) 36286dac1523SIlya Lesokhin if (icsk->icsk_clean_acked) 36296dac1523SIlya Lesokhin icsk->icsk_clean_acked(sk, ack); 36306dac1523SIlya Lesokhin #endif 36310c9ab092SNeal Cardwell } 36322e605294SIlpo Järvinen 363350895b9dSEric Dumazet prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; 3634b9f64820SYuchung Cheng rs.prior_in_flight = tcp_packets_in_flight(tp); 3635c7caf8d3SIlpo Järvinen 363612fb3dd9SEric Dumazet /* ts_recent update must be made after we are sure that the packet 363712fb3dd9SEric Dumazet * is in window. 
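 * Otherwise an out-of-window segment could advance ts_recent and make
 * later valid segments fail the PAWS check.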
363812fb3dd9SEric Dumazet */ 363912fb3dd9SEric Dumazet if (flag & FLAG_UPDATE_TS_RECENT) 364012fb3dd9SEric Dumazet tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); 364112fb3dd9SEric Dumazet 36425e13a0d3SYafang Shao if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) == 36435e13a0d3SYafang Shao FLAG_SND_UNA_ADVANCED) { 364431770e34SFlorian Westphal /* Window is constant, pure forward advance. 364531770e34SFlorian Westphal * No more checks are required. 364631770e34SFlorian Westphal * Note, we use the fact that SND.UNA>=SND.WL2. 364731770e34SFlorian Westphal */ 364831770e34SFlorian Westphal tcp_update_wl(tp, ack_seq); 364931770e34SFlorian Westphal tcp_snd_una_update(tp, ack); 365031770e34SFlorian Westphal flag |= FLAG_WIN_UPDATE; 365131770e34SFlorian Westphal 365231770e34SFlorian Westphal tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE); 365331770e34SFlorian Westphal 365431770e34SFlorian Westphal NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS); 365531770e34SFlorian Westphal } else { 3656c1d2b4c3SFlorian Westphal u32 ack_ev_flags = CA_ACK_SLOWPATH; 3657c1d2b4c3SFlorian Westphal 36581da177e4SLinus Torvalds if (ack_seq != TCP_SKB_CB(skb)->end_seq) 36591da177e4SLinus Torvalds flag |= FLAG_DATA; 36601da177e4SLinus Torvalds else 3661c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS); 36621da177e4SLinus Torvalds 36639e412ba7SIlpo Järvinen flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); 36641da177e4SLinus Torvalds 36651da177e4SLinus Torvalds if (TCP_SKB_CB(skb)->sacked) 366659c9af42SYuchung Cheng flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, 3667196da974SKenneth Klette Jonassen &sack_state); 36681da177e4SLinus Torvalds 3669735d3831SFlorian Westphal if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { 36701da177e4SLinus Torvalds flag |= FLAG_ECE; 3671c1d2b4c3SFlorian Westphal ack_ev_flags |= CA_ACK_ECE; 36729890092eSFlorian Westphal } 36731da177e4SLinus Torvalds 36749890092eSFlorian Westphal if (flag & FLAG_WIN_UPDATE) 36759890092eSFlorian Westphal ack_ev_flags |= CA_ACK_WIN_UPDATE; 36769890092eSFlorian Westphal 36779890092eSFlorian Westphal tcp_in_ack_event(sk, ack_ev_flags); 3678c1d2b4c3SFlorian Westphal } 36791da177e4SLinus Torvalds 36801da177e4SLinus Torvalds /* We passed data and got it acked, remove any soft error 36811da177e4SLinus Torvalds * log. Something worked... 36821da177e4SLinus Torvalds */ 36831da177e4SLinus Torvalds sk->sk_err_soft = 0; 36844b53fb67SDavid S. Miller icsk->icsk_probes_out = 0; 368570eabf0eSEric Dumazet tp->rcv_tstamp = tcp_jiffies32; 36861da177e4SLinus Torvalds if (!prior_packets) 36871da177e4SLinus Torvalds goto no_queue; 36881da177e4SLinus Torvalds 36891da177e4SLinus Torvalds /* See if we can take anything off of the retransmit queue. */ 3690737ff314SYuchung Cheng flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state); 3691a262f0cdSNandita Dukkipati 36921f255691SPriyaranjan Jha tcp_rack_update_reo_wnd(sk, &rs); 36931da177e4SLinus Torvalds 3694df92c839SNeal Cardwell if (tp->tlp_high_seq) 3695df92c839SNeal Cardwell tcp_process_tlp_ack(sk, ack, flag); 3696df92c839SNeal Cardwell /* If needed, reset TLP/RTO timer; RACK may later override this. 
*/ 3697df92c839SNeal Cardwell if (flag & FLAG_SET_XMIT_TIMER) 3698df92c839SNeal Cardwell tcp_set_xmit_timer(sk); 3699df92c839SNeal Cardwell 37000f7cc9a3SYuchung Cheng if (tcp_ack_is_dubious(sk, flag)) { 370119119f29SEric Dumazet if (!(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP))) { 370219119f29SEric Dumazet num_dupack = 1; 370319119f29SEric Dumazet /* Consider if pure acks were aggregated in tcp_add_backlog() */ 370419119f29SEric Dumazet if (!(flag & FLAG_DATA)) 370519119f29SEric Dumazet num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs); 370619119f29SEric Dumazet } 370719119f29SEric Dumazet tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, 3708737ff314SYuchung Cheng &rexmit); 37091da177e4SLinus Torvalds } 37109b717a8dSNandita Dukkipati 3711c3a2e837SJulian Anastasov if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) 3712c3a2e837SJulian Anastasov sk_dst_confirm(sk); 37136ba8a3b1SNandita Dukkipati 3714a77fa010SYuchung Cheng delivered = tcp_newly_delivered(sk, delivered, flag); 3715b9f64820SYuchung Cheng lost = tp->lost - lost; /* freshly marked lost */ 3716e4286603SYuchung Cheng rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); 3717d4761754SYousuk Seung tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); 3718deed7be7SYuchung Cheng tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); 3719e662ca40SYuchung Cheng tcp_xmit_recovery(sk, rexmit); 37201da177e4SLinus Torvalds return 1; 37211da177e4SLinus Torvalds 37221da177e4SLinus Torvalds no_queue: 37235628adf1SNeal Cardwell /* If data was DSACKed, see if we can undo a cwnd reduction. */ 3724a77fa010SYuchung Cheng if (flag & FLAG_DSACKING_ACK) { 372519119f29SEric Dumazet tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, 3726737ff314SYuchung Cheng &rexmit); 3727a77fa010SYuchung Cheng tcp_newly_delivered(sk, delivered, flag); 3728a77fa010SYuchung Cheng } 37291da177e4SLinus Torvalds /* If this ack opens up a zero window, clear backoff. It was 37301da177e4SLinus Torvalds * being used to time the probes, and is probably far higher than 37311da177e4SLinus Torvalds * it needs to be for normal retransmission. 37321da177e4SLinus Torvalds */ 37331da177e4SLinus Torvalds tcp_ack_probe(sk); 37349b717a8dSNandita Dukkipati 37359b717a8dSNandita Dukkipati if (tp->tlp_high_seq) 37369b717a8dSNandita Dukkipati tcp_process_tlp_ack(sk, ack, flag); 37371da177e4SLinus Torvalds return 1; 37381da177e4SLinus Torvalds 37391da177e4SLinus Torvalds old_ack: 3740e95ae2f2SNeal Cardwell /* If data was SACKed, tag it and see if we should send more data. 3741e95ae2f2SNeal Cardwell * If data was DSACKed, see if we can undo a cwnd reduction. 
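 * An old ACK can still carry new SACK blocks (e.g. when ACKs are
 * reordered in the network), so SACK tagging and a fastretrans alert
 * still run even though snd_una does not advance.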
3742e95ae2f2SNeal Cardwell */ 37438aca6cb1SIlpo Järvinen if (TCP_SKB_CB(skb)->sacked) { 374459c9af42SYuchung Cheng flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, 3745196da974SKenneth Klette Jonassen &sack_state); 374619119f29SEric Dumazet tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, 3747737ff314SYuchung Cheng &rexmit); 3748a77fa010SYuchung Cheng tcp_newly_delivered(sk, delivered, flag); 3749e662ca40SYuchung Cheng tcp_xmit_recovery(sk, rexmit); 37508aca6cb1SIlpo Järvinen } 37511da177e4SLinus Torvalds 37521da177e4SLinus Torvalds return 0; 37531da177e4SLinus Torvalds } 37541da177e4SLinus Torvalds 37557f9b838bSDaniel Lee static void tcp_parse_fastopen_option(int len, const unsigned char *cookie, 37567f9b838bSDaniel Lee bool syn, struct tcp_fastopen_cookie *foc, 37577f9b838bSDaniel Lee bool exp_opt) 37587f9b838bSDaniel Lee { 37597f9b838bSDaniel Lee /* Valid only in SYN or SYN-ACK with an even length. */ 37607f9b838bSDaniel Lee if (!foc || !syn || len < 0 || (len & 1)) 37617f9b838bSDaniel Lee return; 37627f9b838bSDaniel Lee 37637f9b838bSDaniel Lee if (len >= TCP_FASTOPEN_COOKIE_MIN && 37647f9b838bSDaniel Lee len <= TCP_FASTOPEN_COOKIE_MAX) 37657f9b838bSDaniel Lee memcpy(foc->val, cookie, len); 37667f9b838bSDaniel Lee else if (len != 0) 37677f9b838bSDaniel Lee len = -1; 37687f9b838bSDaniel Lee foc->len = len; 37697f9b838bSDaniel Lee foc->exp = exp_opt; 37707f9b838bSDaniel Lee } 37717f9b838bSDaniel Lee 377260e2a778SUrsula Braun static void smc_parse_options(const struct tcphdr *th, 377360e2a778SUrsula Braun struct tcp_options_received *opt_rx, 377460e2a778SUrsula Braun const unsigned char *ptr, 377560e2a778SUrsula Braun int opsize) 377660e2a778SUrsula Braun { 377760e2a778SUrsula Braun #if IS_ENABLED(CONFIG_SMC) 377860e2a778SUrsula Braun if (static_branch_unlikely(&tcp_have_smc)) { 377960e2a778SUrsula Braun if (th->syn && !(opsize & 1) && 378060e2a778SUrsula Braun opsize >= TCPOLEN_EXP_SMC_BASE && 378160e2a778SUrsula Braun get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) 378260e2a778SUrsula Braun opt_rx->smc_ok = 1; 378360e2a778SUrsula Braun } 378460e2a778SUrsula Braun #endif 378560e2a778SUrsula Braun } 378660e2a778SUrsula Braun 37879349d600SPetar Penkov /* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped 37889349d600SPetar Penkov * value on success. 
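 * On the wire the MSS option is kind 2, length 4, followed by a 16-bit
 * value; e.g. the bytes 02 04 05 b4 advertise an MSS of 1460.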
37899349d600SPetar Penkov */ 37909349d600SPetar Penkov static u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) 37919349d600SPetar Penkov { 37929349d600SPetar Penkov const unsigned char *ptr = (const unsigned char *)(th + 1); 37939349d600SPetar Penkov int length = (th->doff * 4) - sizeof(struct tcphdr); 37949349d600SPetar Penkov u16 mss = 0; 37959349d600SPetar Penkov 37969349d600SPetar Penkov while (length > 0) { 37979349d600SPetar Penkov int opcode = *ptr++; 37989349d600SPetar Penkov int opsize; 37999349d600SPetar Penkov 38009349d600SPetar Penkov switch (opcode) { 38019349d600SPetar Penkov case TCPOPT_EOL: 38029349d600SPetar Penkov return mss; 38039349d600SPetar Penkov case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ 38049349d600SPetar Penkov length--; 38059349d600SPetar Penkov continue; 38069349d600SPetar Penkov default: 38079349d600SPetar Penkov if (length < 2) 38089349d600SPetar Penkov return mss; 38099349d600SPetar Penkov opsize = *ptr++; 38109349d600SPetar Penkov if (opsize < 2) /* "silly options" */ 38119349d600SPetar Penkov return mss; 38129349d600SPetar Penkov if (opsize > length) 38139349d600SPetar Penkov return mss; /* fail on partial options */ 38149349d600SPetar Penkov if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) { 38159349d600SPetar Penkov u16 in_mss = get_unaligned_be16(ptr); 38169349d600SPetar Penkov 38179349d600SPetar Penkov if (in_mss) { 38189349d600SPetar Penkov if (user_mss && user_mss < in_mss) 38199349d600SPetar Penkov in_mss = user_mss; 38209349d600SPetar Penkov mss = in_mss; 38219349d600SPetar Penkov } 38229349d600SPetar Penkov } 38239349d600SPetar Penkov ptr += opsize - 2; 38249349d600SPetar Penkov length -= opsize; 38259349d600SPetar Penkov } 38269349d600SPetar Penkov } 38279349d600SPetar Penkov return mss; 38289349d600SPetar Penkov } 38299349d600SPetar Penkov 38301da177e4SLinus Torvalds /* Look for tcp options. Normally only called on SYN and SYNACK packets. 38311da177e4SLinus Torvalds * But, this can also be called on packets in the established flow when 38321da177e4SLinus Torvalds * the fast version below fails. 
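 * Options are parsed as (kind, length, value) triples: EOL ends the
 * scan, NOP is a one-byte pad, and any malformed length aborts parsing
 * rather than reading past the header.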
38331da177e4SLinus Torvalds */ 3834eed29f17SEric Dumazet void tcp_parse_options(const struct net *net, 3835eed29f17SEric Dumazet const struct sk_buff *skb, 38361a2c6181SChristoph Paasch struct tcp_options_received *opt_rx, int estab, 38372100c8d2SYuchung Cheng struct tcp_fastopen_cookie *foc) 38381da177e4SLinus Torvalds { 3839cf533ea5SEric Dumazet const unsigned char *ptr; 3840cf533ea5SEric Dumazet const struct tcphdr *th = tcp_hdr(skb); 38411da177e4SLinus Torvalds int length = (th->doff * 4) - sizeof(struct tcphdr); 38421da177e4SLinus Torvalds 3843cf533ea5SEric Dumazet ptr = (const unsigned char *)(th + 1); 38441da177e4SLinus Torvalds opt_rx->saw_tstamp = 0; 38451da177e4SLinus Torvalds 38461da177e4SLinus Torvalds while (length > 0) { 38471da177e4SLinus Torvalds int opcode = *ptr++; 38481da177e4SLinus Torvalds int opsize; 38491da177e4SLinus Torvalds 38501da177e4SLinus Torvalds switch (opcode) { 38511da177e4SLinus Torvalds case TCPOPT_EOL: 38521da177e4SLinus Torvalds return; 38531da177e4SLinus Torvalds case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ 38541da177e4SLinus Torvalds length--; 38551da177e4SLinus Torvalds continue; 38561da177e4SLinus Torvalds default: 38579609dad2SYoung Xiao if (length < 2) 38589609dad2SYoung Xiao return; 38591da177e4SLinus Torvalds opsize = *ptr++; 38601da177e4SLinus Torvalds if (opsize < 2) /* "silly options" */ 38611da177e4SLinus Torvalds return; 38621da177e4SLinus Torvalds if (opsize > length) 38631da177e4SLinus Torvalds return; /* don't parse partial options */ 38641da177e4SLinus Torvalds switch (opcode) { 38651da177e4SLinus Torvalds case TCPOPT_MSS: 38661da177e4SLinus Torvalds if (opsize == TCPOLEN_MSS && th->syn && !estab) { 3867d3e2ce3bSHarvey Harrison u16 in_mss = get_unaligned_be16(ptr); 38681da177e4SLinus Torvalds if (in_mss) { 3869f038ac8fSIlpo Järvinen if (opt_rx->user_mss && 3870f038ac8fSIlpo Järvinen opt_rx->user_mss < in_mss) 38711da177e4SLinus Torvalds in_mss = opt_rx->user_mss; 38721da177e4SLinus Torvalds opt_rx->mss_clamp = in_mss; 38731da177e4SLinus Torvalds } 38741da177e4SLinus Torvalds } 38751da177e4SLinus Torvalds break; 38761da177e4SLinus Torvalds case TCPOPT_WINDOW: 3877f038ac8fSIlpo Järvinen if (opsize == TCPOLEN_WINDOW && th->syn && 38789bb37ef0SEric Dumazet !estab && net->ipv4.sysctl_tcp_window_scaling) { 38791da177e4SLinus Torvalds __u8 snd_wscale = *(__u8 *)ptr; 38801da177e4SLinus Torvalds opt_rx->wscale_ok = 1; 3881589c49cbSGao Feng if (snd_wscale > TCP_MAX_WSCALE) { 3882589c49cbSGao Feng net_info_ratelimited("%s: Illegal window scaling value %d > %u received\n", 3883058bd4d2SJoe Perches __func__, 3884589c49cbSGao Feng snd_wscale, 3885589c49cbSGao Feng TCP_MAX_WSCALE); 3886589c49cbSGao Feng snd_wscale = TCP_MAX_WSCALE; 38871da177e4SLinus Torvalds } 38881da177e4SLinus Torvalds opt_rx->snd_wscale = snd_wscale; 38891da177e4SLinus Torvalds } 38901da177e4SLinus Torvalds break; 38911da177e4SLinus Torvalds case TCPOPT_TIMESTAMP: 3892f038ac8fSIlpo Järvinen if ((opsize == TCPOLEN_TIMESTAMP) && 3893f038ac8fSIlpo Järvinen ((estab && opt_rx->tstamp_ok) || 38945d2ed052SEric Dumazet (!estab && net->ipv4.sysctl_tcp_timestamps))) { 38951da177e4SLinus Torvalds opt_rx->saw_tstamp = 1; 3896d3e2ce3bSHarvey Harrison opt_rx->rcv_tsval = get_unaligned_be32(ptr); 3897d3e2ce3bSHarvey Harrison opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4); 38981da177e4SLinus Torvalds } 38991da177e4SLinus Torvalds break; 39001da177e4SLinus Torvalds case TCPOPT_SACK_PERM: 3901f038ac8fSIlpo Järvinen if (opsize == TCPOLEN_SACK_PERM && th->syn && 3902f9301034SEric Dumazet !estab 
&& net->ipv4.sysctl_tcp_sack) { 3903ab56222aSVijay Subramanian opt_rx->sack_ok = TCP_SACK_SEEN; 39041da177e4SLinus Torvalds tcp_sack_reset(opt_rx); 39051da177e4SLinus Torvalds } 39061da177e4SLinus Torvalds break; 39071da177e4SLinus Torvalds 39081da177e4SLinus Torvalds case TCPOPT_SACK: 39091da177e4SLinus Torvalds if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && 39101da177e4SLinus Torvalds !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) && 39111da177e4SLinus Torvalds opt_rx->sack_ok) { 39121da177e4SLinus Torvalds TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th; 39131da177e4SLinus Torvalds } 3914d7ea5b91SIlpo Järvinen break; 3915cfb6eeb4SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG 3916cfb6eeb4SYOSHIFUJI Hideaki case TCPOPT_MD5SIG: 3917cfb6eeb4SYOSHIFUJI Hideaki /* 3918cfb6eeb4SYOSHIFUJI Hideaki * The MD5 Hash has already been 3919cfb6eeb4SYOSHIFUJI Hideaki * checked (see tcp_v{4,6}_do_rcv()). 3920cfb6eeb4SYOSHIFUJI Hideaki */ 3921cfb6eeb4SYOSHIFUJI Hideaki break; 3922cfb6eeb4SYOSHIFUJI Hideaki #endif 39237f9b838bSDaniel Lee case TCPOPT_FASTOPEN: 39247f9b838bSDaniel Lee tcp_parse_fastopen_option( 39257f9b838bSDaniel Lee opsize - TCPOLEN_FASTOPEN_BASE, 39267f9b838bSDaniel Lee ptr, th->syn, foc, false); 39277f9b838bSDaniel Lee break; 39287f9b838bSDaniel Lee 39292100c8d2SYuchung Cheng case TCPOPT_EXP: 39302100c8d2SYuchung Cheng /* Fast Open option shares code 254 using a 39317f9b838bSDaniel Lee * 16 bits magic number. 39322100c8d2SYuchung Cheng */ 39337f9b838bSDaniel Lee if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE && 39347f9b838bSDaniel Lee get_unaligned_be16(ptr) == 39357f9b838bSDaniel Lee TCPOPT_FASTOPEN_MAGIC) 39367f9b838bSDaniel Lee tcp_parse_fastopen_option(opsize - 39377f9b838bSDaniel Lee TCPOLEN_EXP_FASTOPEN_BASE, 39387f9b838bSDaniel Lee ptr + 2, th->syn, foc, true); 393960e2a778SUrsula Braun else 394060e2a778SUrsula Braun smc_parse_options(th, opt_rx, ptr, 394160e2a778SUrsula Braun opsize); 39422100c8d2SYuchung Cheng break; 39432100c8d2SYuchung Cheng 39442100c8d2SYuchung Cheng } 39451da177e4SLinus Torvalds ptr += opsize-2; 39461da177e4SLinus Torvalds length -= opsize; 39473ff50b79SStephen Hemminger } 39481da177e4SLinus Torvalds } 39491da177e4SLinus Torvalds } 39504bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_parse_options); 39511da177e4SLinus Torvalds 3952a2a385d6SEric Dumazet static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th) 3953a4356b29SIlpo Järvinen { 3954cf533ea5SEric Dumazet const __be32 *ptr = (const __be32 *)(th + 1); 3955a4356b29SIlpo Järvinen 3956a4356b29SIlpo Järvinen if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) 3957a4356b29SIlpo Järvinen | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { 3958a4356b29SIlpo Järvinen tp->rx_opt.saw_tstamp = 1; 3959a4356b29SIlpo Järvinen ++ptr; 3960a4356b29SIlpo Järvinen tp->rx_opt.rcv_tsval = ntohl(*ptr); 3961a4356b29SIlpo Järvinen ++ptr; 3962e3e12028SAndrew Vagin if (*ptr) 3963ee684b6fSAndrey Vagin tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset; 3964e3e12028SAndrew Vagin else 3965e3e12028SAndrew Vagin tp->rx_opt.rcv_tsecr = 0; 3966a2a385d6SEric Dumazet return true; 3967a4356b29SIlpo Järvinen } 3968a2a385d6SEric Dumazet return false; 3969a4356b29SIlpo Järvinen } 3970a4356b29SIlpo Järvinen 39711da177e4SLinus Torvalds /* Fast parse options. This hopes to only see timestamps. 39721da177e4SLinus Torvalds * If it is wrong it falls back on tcp_parse_options(). 
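 * The fast check only matches two doff values: a bare 20-byte header,
 * or one whose options are exactly a single aligned timestamp.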
39731da177e4SLinus Torvalds */ 3974eed29f17SEric Dumazet static bool tcp_fast_parse_options(const struct net *net, 3975eed29f17SEric Dumazet const struct sk_buff *skb, 39761a2c6181SChristoph Paasch const struct tcphdr *th, struct tcp_sock *tp) 39771da177e4SLinus Torvalds { 39784957faadSWilliam Allen Simpson /* In the spirit of fast parsing, compare doff directly to constant 39794957faadSWilliam Allen Simpson * values. Because equality is used, short doff can be ignored here. 39804957faadSWilliam Allen Simpson */ 39814957faadSWilliam Allen Simpson if (th->doff == (sizeof(*th) / 4)) { 39821da177e4SLinus Torvalds tp->rx_opt.saw_tstamp = 0; 3983a2a385d6SEric Dumazet return false; 39841da177e4SLinus Torvalds } else if (tp->rx_opt.tstamp_ok && 39854957faadSWilliam Allen Simpson th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { 3986a4356b29SIlpo Järvinen if (tcp_parse_aligned_timestamp(tp, th)) 3987a2a385d6SEric Dumazet return true; 39881da177e4SLinus Torvalds } 3989ee684b6fSAndrey Vagin 3990eed29f17SEric Dumazet tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL); 3991e3e12028SAndrew Vagin if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) 3992ee684b6fSAndrey Vagin tp->rx_opt.rcv_tsecr -= tp->tsoffset; 3993ee684b6fSAndrey Vagin 3994a2a385d6SEric Dumazet return true; 39951da177e4SLinus Torvalds } 39961da177e4SLinus Torvalds 39977d5d5525SYOSHIFUJI Hideaki #ifdef CONFIG_TCP_MD5SIG 39987d5d5525SYOSHIFUJI Hideaki /* 39997d5d5525SYOSHIFUJI Hideaki * Parse MD5 Signature option 40007d5d5525SYOSHIFUJI Hideaki */ 4001cf533ea5SEric Dumazet const u8 *tcp_parse_md5sig_option(const struct tcphdr *th) 40027d5d5525SYOSHIFUJI Hideaki { 40037d5d5525SYOSHIFUJI Hideaki int length = (th->doff << 2) - sizeof(*th); 4004cf533ea5SEric Dumazet const u8 *ptr = (const u8 *)(th + 1); 40057d5d5525SYOSHIFUJI Hideaki 40067e5a206aSJann Horn /* If not enough data remaining, we can short cut */ 40077e5a206aSJann Horn while (length >= TCPOLEN_MD5SIG) { 40087d5d5525SYOSHIFUJI Hideaki int opcode = *ptr++; 40097d5d5525SYOSHIFUJI Hideaki int opsize; 40107d5d5525SYOSHIFUJI Hideaki 40117d5d5525SYOSHIFUJI Hideaki switch (opcode) { 40127d5d5525SYOSHIFUJI Hideaki case TCPOPT_EOL: 40137d5d5525SYOSHIFUJI Hideaki return NULL; 40147d5d5525SYOSHIFUJI Hideaki case TCPOPT_NOP: 40157d5d5525SYOSHIFUJI Hideaki length--; 40167d5d5525SYOSHIFUJI Hideaki continue; 40177d5d5525SYOSHIFUJI Hideaki default: 40187d5d5525SYOSHIFUJI Hideaki opsize = *ptr++; 40197d5d5525SYOSHIFUJI Hideaki if (opsize < 2 || opsize > length) 40207d5d5525SYOSHIFUJI Hideaki return NULL; 40217d5d5525SYOSHIFUJI Hideaki if (opcode == TCPOPT_MD5SIG) 4022ba78e2ddSDmitry Popov return opsize == TCPOLEN_MD5SIG ? ptr : NULL; 40237d5d5525SYOSHIFUJI Hideaki } 40247d5d5525SYOSHIFUJI Hideaki ptr += opsize - 2; 40257d5d5525SYOSHIFUJI Hideaki length -= opsize; 40267d5d5525SYOSHIFUJI Hideaki } 40277d5d5525SYOSHIFUJI Hideaki return NULL; 40287d5d5525SYOSHIFUJI Hideaki } 40294bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_parse_md5sig_option); 40307d5d5525SYOSHIFUJI Hideaki #endif 40317d5d5525SYOSHIFUJI Hideaki 40321da177e4SLinus Torvalds /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM 40331da177e4SLinus Torvalds * 40341da177e4SLinus Torvalds * It is not fatal. If this ACK does _not_ change critical state (seqs, window) 40351da177e4SLinus Torvalds * it can pass through stack. So, the following predicate verifies that 40361da177e4SLinus Torvalds * this segment is not used for anything but congestion avoidance or 40371da177e4SLinus Torvalds * fast retransmit. 
Moreover, we even are able to eliminate most of such 40381da177e4SLinus Torvalds * second order effects, if we apply some small "replay" window (~RTO) 40391da177e4SLinus Torvalds * to timestamp space. 40401da177e4SLinus Torvalds * 40411da177e4SLinus Torvalds * All these measures still do not guarantee that we reject wrapped ACKs 40421da177e4SLinus Torvalds * on networks with high bandwidth, when sequence space is recycled fastly, 40431da177e4SLinus Torvalds * but it guarantees that such events will be very rare and do not affect 40441da177e4SLinus Torvalds * connection seriously. This doesn't look nice, but alas, PAWS is really 40451da177e4SLinus Torvalds * buggy extension. 40461da177e4SLinus Torvalds * 40471da177e4SLinus Torvalds * [ Later note. Even worse! It is buggy for segments _with_ data. RFC 40481da177e4SLinus Torvalds * states that events when retransmit arrives after original data are rare. 40491da177e4SLinus Torvalds * It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is 40501da177e4SLinus Torvalds * the biggest problem on large power networks even with minor reordering. 40511da177e4SLinus Torvalds * OK, let's give it small replay window. If peer clock is even 1hz, it is safe 40521da177e4SLinus Torvalds * up to bandwidth of 18Gigabit/sec. 8) ] 40531da177e4SLinus Torvalds */ 40541da177e4SLinus Torvalds 4055463c84b9SArnaldo Carvalho de Melo static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) 40561da177e4SLinus Torvalds { 4057cf533ea5SEric Dumazet const struct tcp_sock *tp = tcp_sk(sk); 4058cf533ea5SEric Dumazet const struct tcphdr *th = tcp_hdr(skb); 40591da177e4SLinus Torvalds u32 seq = TCP_SKB_CB(skb)->seq; 40601da177e4SLinus Torvalds u32 ack = TCP_SKB_CB(skb)->ack_seq; 40611da177e4SLinus Torvalds 40621da177e4SLinus Torvalds return (/* 1. Pure ACK with correct sequence number. */ 40631da177e4SLinus Torvalds (th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) && 40641da177e4SLinus Torvalds 40651da177e4SLinus Torvalds /* 2. ... and duplicate ACK. */ 40661da177e4SLinus Torvalds ack == tp->snd_una && 40671da177e4SLinus Torvalds 40681da177e4SLinus Torvalds /* 3. ... and does not update window. */ 40691da177e4SLinus Torvalds !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) && 40701da177e4SLinus Torvalds 40711da177e4SLinus Torvalds /* 4. ... and sits in replay window. */ 4072463c84b9SArnaldo Carvalho de Melo (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ); 40731da177e4SLinus Torvalds } 40741da177e4SLinus Torvalds 407567b95bd7SVijay Subramanian static inline bool tcp_paws_discard(const struct sock *sk, 4076056834d9SIlpo Järvinen const struct sk_buff *skb) 40771da177e4SLinus Torvalds { 4078463c84b9SArnaldo Carvalho de Melo const struct tcp_sock *tp = tcp_sk(sk); 4079c887e6d2SIlpo Järvinen 4080c887e6d2SIlpo Järvinen return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) && 4081c887e6d2SIlpo Järvinen !tcp_disordered_ack(sk, skb); 40821da177e4SLinus Torvalds } 40831da177e4SLinus Torvalds 40841da177e4SLinus Torvalds /* Check segment sequence number for validity. 40851da177e4SLinus Torvalds * 40861da177e4SLinus Torvalds * Segment controls are considered valid, if the segment 40871da177e4SLinus Torvalds * fits to the window after truncation to the window. Acceptability 40881da177e4SLinus Torvalds * of data (and SYN, FIN, of course) is checked separately. 40891da177e4SLinus Torvalds * See tcp_data_queue(), for example. 
40901da177e4SLinus Torvalds * 40911da177e4SLinus Torvalds * Also, controls (RST is main one) are accepted using RCV.WUP instead 40921da177e4SLinus Torvalds * of RCV.NXT. Peer still did not advance his SND.UNA when we 40931da177e4SLinus Torvalds * delayed ACK, so that hisSND.UNA<=ourRCV.WUP. 40941da177e4SLinus Torvalds * (borrowed from freebsd) 40951da177e4SLinus Torvalds */ 40961da177e4SLinus Torvalds 409767b95bd7SVijay Subramanian static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) 40981da177e4SLinus Torvalds { 40991da177e4SLinus Torvalds return !before(end_seq, tp->rcv_wup) && 41001da177e4SLinus Torvalds !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); 41011da177e4SLinus Torvalds } 41021da177e4SLinus Torvalds 41031da177e4SLinus Torvalds /* When we get a reset we do this. */ 410410467163SJerry Chu void tcp_reset(struct sock *sk) 41051da177e4SLinus Torvalds { 41065941521cSSong Liu trace_tcp_receive_reset(sk); 41075941521cSSong Liu 41081da177e4SLinus Torvalds /* We want the right error as BSD sees it (and indeed as we do). */ 41091da177e4SLinus Torvalds switch (sk->sk_state) { 41101da177e4SLinus Torvalds case TCP_SYN_SENT: 41111da177e4SLinus Torvalds sk->sk_err = ECONNREFUSED; 41121da177e4SLinus Torvalds break; 41131da177e4SLinus Torvalds case TCP_CLOSE_WAIT: 41141da177e4SLinus Torvalds sk->sk_err = EPIPE; 41151da177e4SLinus Torvalds break; 41161da177e4SLinus Torvalds case TCP_CLOSE: 41171da177e4SLinus Torvalds return; 41181da177e4SLinus Torvalds default: 41191da177e4SLinus Torvalds sk->sk_err = ECONNRESET; 41201da177e4SLinus Torvalds } 4121a4d25803STom Marshall /* This barrier is coupled with smp_rmb() in tcp_poll() */ 4122a4d25803STom Marshall smp_wmb(); 41231da177e4SLinus Torvalds 4124a27fd7a8SSoheil Hassas Yeganeh tcp_write_queue_purge(sk); 41253d476263SEric Dumazet tcp_done(sk); 41263d476263SEric Dumazet 41271da177e4SLinus Torvalds if (!sock_flag(sk, SOCK_DEAD)) 41281da177e4SLinus Torvalds sk->sk_error_report(sk); 41291da177e4SLinus Torvalds } 41301da177e4SLinus Torvalds 41311da177e4SLinus Torvalds /* 41321da177e4SLinus Torvalds * Process the FIN bit. This now behaves as it is supposed to work 41331da177e4SLinus Torvalds * and the FIN takes effect when it is validly part of sequence 41341da177e4SLinus Torvalds * space. Not before when we get holes. 41351da177e4SLinus Torvalds * 41361da177e4SLinus Torvalds * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT 41371da177e4SLinus Torvalds * (and thence onto LAST-ACK and finally, CLOSE, we never enter 41381da177e4SLinus Torvalds * TIME-WAIT) 41391da177e4SLinus Torvalds * 41401da177e4SLinus Torvalds * If we are in FINWAIT-1, a received FIN indicates simultaneous 41411da177e4SLinus Torvalds * close and we go into CLOSING (and later onto TIME-WAIT) 41421da177e4SLinus Torvalds * 41431da177e4SLinus Torvalds * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. 
41441da177e4SLinus Torvalds */ 4145e3e17b77SEric Dumazet void tcp_fin(struct sock *sk) 41461da177e4SLinus Torvalds { 41471da177e4SLinus Torvalds struct tcp_sock *tp = tcp_sk(sk); 41481da177e4SLinus Torvalds 4149463c84b9SArnaldo Carvalho de Melo inet_csk_schedule_ack(sk); 41501da177e4SLinus Torvalds 41511da177e4SLinus Torvalds sk->sk_shutdown |= RCV_SHUTDOWN; 41521da177e4SLinus Torvalds sock_set_flag(sk, SOCK_DONE); 41531da177e4SLinus Torvalds 41541da177e4SLinus Torvalds switch (sk->sk_state) { 41551da177e4SLinus Torvalds case TCP_SYN_RECV: 41561da177e4SLinus Torvalds case TCP_ESTABLISHED: 41571da177e4SLinus Torvalds /* Move to CLOSE_WAIT */ 41581da177e4SLinus Torvalds tcp_set_state(sk, TCP_CLOSE_WAIT); 415931954cd8SWei Wang inet_csk_enter_pingpong_mode(sk); 41601da177e4SLinus Torvalds break; 41611da177e4SLinus Torvalds 41621da177e4SLinus Torvalds case TCP_CLOSE_WAIT: 41631da177e4SLinus Torvalds case TCP_CLOSING: 41641da177e4SLinus Torvalds /* Received a retransmission of the FIN, do 41651da177e4SLinus Torvalds * nothing. 41661da177e4SLinus Torvalds */ 41671da177e4SLinus Torvalds break; 41681da177e4SLinus Torvalds case TCP_LAST_ACK: 41691da177e4SLinus Torvalds /* RFC793: Remain in the LAST-ACK state. */ 41701da177e4SLinus Torvalds break; 41711da177e4SLinus Torvalds 41721da177e4SLinus Torvalds case TCP_FIN_WAIT1: 41731da177e4SLinus Torvalds /* This case occurs when a simultaneous close 41741da177e4SLinus Torvalds * happens, we must ack the received FIN and 41751da177e4SLinus Torvalds * enter the CLOSING state. 41761da177e4SLinus Torvalds */ 41771da177e4SLinus Torvalds tcp_send_ack(sk); 41781da177e4SLinus Torvalds tcp_set_state(sk, TCP_CLOSING); 41791da177e4SLinus Torvalds break; 41801da177e4SLinus Torvalds case TCP_FIN_WAIT2: 41811da177e4SLinus Torvalds /* Received a FIN -- send ACK and enter TIME_WAIT. */ 41821da177e4SLinus Torvalds tcp_send_ack(sk); 41831da177e4SLinus Torvalds tcp_time_wait(sk, TCP_TIME_WAIT, 0); 41841da177e4SLinus Torvalds break; 41851da177e4SLinus Torvalds default: 41861da177e4SLinus Torvalds /* Only TCP_LISTEN and TCP_CLOSE are left, in these 41871da177e4SLinus Torvalds * cases we should never reach this piece of code. 41881da177e4SLinus Torvalds */ 4189058bd4d2SJoe Perches pr_err("%s: Impossible, sk->sk_state=%d\n", 41900dc47877SHarvey Harrison __func__, sk->sk_state); 41911da177e4SLinus Torvalds break; 41923ff50b79SStephen Hemminger } 41931da177e4SLinus Torvalds 41941da177e4SLinus Torvalds /* It _is_ possible, that we have something out-of-order _after_ FIN. 41951da177e4SLinus Torvalds * Probably, we should reset in this case. For now drop them. 41961da177e4SLinus Torvalds */ 41979f5afeaeSYaogong Wang skb_rbtree_purge(&tp->out_of_order_queue); 4198e60402d0SIlpo Järvinen if (tcp_is_sack(tp)) 41991da177e4SLinus Torvalds tcp_sack_reset(&tp->rx_opt); 42003ab224beSHideo Aoki sk_mem_reclaim(sk); 42011da177e4SLinus Torvalds 42021da177e4SLinus Torvalds if (!sock_flag(sk, SOCK_DEAD)) { 42031da177e4SLinus Torvalds sk->sk_state_change(sk); 42041da177e4SLinus Torvalds 42051da177e4SLinus Torvalds /* Do not send POLL_HUP for half duplex close. 
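 * POLL_HUP would signal that both directions are gone; after a FIN from
 * the peer alone we can still transmit, so POLL_IN is used unless our
 * side has shut down as well.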
*/ 42061da177e4SLinus Torvalds if (sk->sk_shutdown == SHUTDOWN_MASK || 42071da177e4SLinus Torvalds sk->sk_state == TCP_CLOSE) 42088d8ad9d7SPavel Emelyanov sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); 42091da177e4SLinus Torvalds else 42108d8ad9d7SPavel Emelyanov sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 42111da177e4SLinus Torvalds } 42121da177e4SLinus Torvalds } 42131da177e4SLinus Torvalds 4214a2a385d6SEric Dumazet static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, 4215056834d9SIlpo Järvinen u32 end_seq) 42161da177e4SLinus Torvalds { 42171da177e4SLinus Torvalds if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { 42181da177e4SLinus Torvalds if (before(seq, sp->start_seq)) 42191da177e4SLinus Torvalds sp->start_seq = seq; 42201da177e4SLinus Torvalds if (after(end_seq, sp->end_seq)) 42211da177e4SLinus Torvalds sp->end_seq = end_seq; 4222a2a385d6SEric Dumazet return true; 42231da177e4SLinus Torvalds } 4224a2a385d6SEric Dumazet return false; 42251da177e4SLinus Torvalds } 42261da177e4SLinus Torvalds 42271ed83465SPavel Emelyanov static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) 42281da177e4SLinus Torvalds { 42291ed83465SPavel Emelyanov struct tcp_sock *tp = tcp_sk(sk); 42301ed83465SPavel Emelyanov 42316496f6bdSEric Dumazet if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { 423240b215e5SPavel Emelyanov int mib_idx; 423340b215e5SPavel Emelyanov 42341da177e4SLinus Torvalds if (before(seq, tp->rcv_nxt)) 423540b215e5SPavel Emelyanov mib_idx = LINUX_MIB_TCPDSACKOLDSENT; 42361da177e4SLinus Torvalds else 423740b215e5SPavel Emelyanov mib_idx = LINUX_MIB_TCPDSACKOFOSENT; 423840b215e5SPavel Emelyanov 4239c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), mib_idx); 42401da177e4SLinus Torvalds 42411da177e4SLinus Torvalds tp->rx_opt.dsack = 1; 42421da177e4SLinus Torvalds tp->duplicate_sack[0].start_seq = seq; 42431da177e4SLinus Torvalds tp->duplicate_sack[0].end_seq = end_seq; 42441da177e4SLinus Torvalds } 42451da177e4SLinus Torvalds } 42461da177e4SLinus Torvalds 42471ed83465SPavel Emelyanov static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq) 42481da177e4SLinus Torvalds { 42491ed83465SPavel Emelyanov struct tcp_sock *tp = tcp_sk(sk); 42501ed83465SPavel Emelyanov 42511da177e4SLinus Torvalds if (!tp->rx_opt.dsack) 42521ed83465SPavel Emelyanov tcp_dsack_set(sk, seq, end_seq); 42531da177e4SLinus Torvalds else 42541da177e4SLinus Torvalds tcp_sack_extend(tp->duplicate_sack, seq, end_seq); 42551da177e4SLinus Torvalds } 42561da177e4SLinus Torvalds 42577788174eSYuchung Cheng static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) 42587788174eSYuchung Cheng { 42597788174eSYuchung Cheng /* When the ACK path fails or drops most ACKs, the sender would 42607788174eSYuchung Cheng * timeout and spuriously retransmit the same segment repeatedly. 42617788174eSYuchung Cheng * The receiver remembers and reflects via DSACKs. Leverage the 42627788174eSYuchung Cheng * DSACK state and change the txhash to re-route speculatively. 
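 * If the incoming segment starts exactly at the sequence we last
 * reported in a D-SACK, it is likely the same spurious retransmit
 * again; sk_rethink_txhash() then picks a new flow hash, which may
 * steer our packets onto a different path (e.g. another ECMP member).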
42637788174eSYuchung Cheng */ 42647788174eSYuchung Cheng if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq) 42657788174eSYuchung Cheng sk_rethink_txhash(sk); 42667788174eSYuchung Cheng } 42677788174eSYuchung Cheng 4268cf533ea5SEric Dumazet static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) 42691da177e4SLinus Torvalds { 42701da177e4SLinus Torvalds struct tcp_sock *tp = tcp_sk(sk); 42711da177e4SLinus Torvalds 42721da177e4SLinus Torvalds if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && 42731da177e4SLinus Torvalds before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { 4274c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); 42759a9c9b51SEric Dumazet tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); 42761da177e4SLinus Torvalds 42776496f6bdSEric Dumazet if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) { 42781da177e4SLinus Torvalds u32 end_seq = TCP_SKB_CB(skb)->end_seq; 42791da177e4SLinus Torvalds 42807788174eSYuchung Cheng tcp_rcv_spurious_retrans(sk, skb); 42811da177e4SLinus Torvalds if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) 42821da177e4SLinus Torvalds end_seq = tp->rcv_nxt; 42831ed83465SPavel Emelyanov tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq); 42841da177e4SLinus Torvalds } 42851da177e4SLinus Torvalds } 42861da177e4SLinus Torvalds 42871da177e4SLinus Torvalds tcp_send_ack(sk); 42881da177e4SLinus Torvalds } 42891da177e4SLinus Torvalds 42901da177e4SLinus Torvalds /* These routines update the SACK block as out-of-order packets arrive or 42911da177e4SLinus Torvalds * in-order packets close up the sequence space. 42921da177e4SLinus Torvalds */ 42931da177e4SLinus Torvalds static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) 42941da177e4SLinus Torvalds { 42951da177e4SLinus Torvalds int this_sack; 42961da177e4SLinus Torvalds struct tcp_sack_block *sp = &tp->selective_acks[0]; 42971da177e4SLinus Torvalds struct tcp_sack_block *swalk = sp + 1; 42981da177e4SLinus Torvalds 42991da177e4SLinus Torvalds /* See if the recent change to the first SACK eats into 43001da177e4SLinus Torvalds * or hits the sequence space of other SACK blocks, if so coalesce. 43011da177e4SLinus Torvalds */ 43021da177e4SLinus Torvalds for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) { 43031da177e4SLinus Torvalds if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) { 43041da177e4SLinus Torvalds int i; 43051da177e4SLinus Torvalds 43061da177e4SLinus Torvalds /* Zap SWALK, by moving every further SACK up by one slot. 43071da177e4SLinus Torvalds * Decrease num_sacks. 
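 *
 * For illustration: if the first block just grew to [100,300) and the
 * next block is [150,250), tcp_sack_extend() above absorbs it into
 * [100,300) and this branch drops the now-redundant entry.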
43081da177e4SLinus Torvalds */ 43091da177e4SLinus Torvalds tp->rx_opt.num_sacks--; 43101da177e4SLinus Torvalds for (i = this_sack; i < tp->rx_opt.num_sacks; i++) 43111da177e4SLinus Torvalds sp[i] = sp[i + 1]; 43121da177e4SLinus Torvalds continue; 43131da177e4SLinus Torvalds } 43141da177e4SLinus Torvalds this_sack++, swalk++; 43151da177e4SLinus Torvalds } 43161da177e4SLinus Torvalds } 43171da177e4SLinus Torvalds 43181da177e4SLinus Torvalds static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) 43191da177e4SLinus Torvalds { 43201da177e4SLinus Torvalds struct tcp_sock *tp = tcp_sk(sk); 43211da177e4SLinus Torvalds struct tcp_sack_block *sp = &tp->selective_acks[0]; 43221da177e4SLinus Torvalds int cur_sacks = tp->rx_opt.num_sacks; 43231da177e4SLinus Torvalds int this_sack; 43241da177e4SLinus Torvalds 43251da177e4SLinus Torvalds if (!cur_sacks) 43261da177e4SLinus Torvalds goto new_sack; 43271da177e4SLinus Torvalds 43281da177e4SLinus Torvalds for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) { 43291da177e4SLinus Torvalds if (tcp_sack_extend(sp, seq, end_seq)) { 43301da177e4SLinus Torvalds /* Rotate this_sack to the first one. */ 43311da177e4SLinus Torvalds for (; this_sack > 0; this_sack--, sp--) 4332a0bffffcSIlpo Järvinen swap(*sp, *(sp - 1)); 43331da177e4SLinus Torvalds if (cur_sacks > 1) 43341da177e4SLinus Torvalds tcp_sack_maybe_coalesce(tp); 43351da177e4SLinus Torvalds return; 43361da177e4SLinus Torvalds } 43371da177e4SLinus Torvalds } 43381da177e4SLinus Torvalds 43391da177e4SLinus Torvalds /* Could not find an adjacent existing SACK, build a new one, 43401da177e4SLinus Torvalds * put it at the front, and shift everyone else down. We 43411da177e4SLinus Torvalds * always know there is at least one SACK present already here. 43421da177e4SLinus Torvalds * 43431da177e4SLinus Torvalds * If the sack array is full, forget about the last one. 43441da177e4SLinus Torvalds */ 43454389ddedSAdam Langley if (this_sack >= TCP_NUM_SACKS) { 434686de5921SEric Dumazet if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) 43475d9f4262SEric Dumazet tcp_send_ack(sk); 43481da177e4SLinus Torvalds this_sack--; 43491da177e4SLinus Torvalds tp->rx_opt.num_sacks--; 43501da177e4SLinus Torvalds sp--; 43511da177e4SLinus Torvalds } 43521da177e4SLinus Torvalds for (; this_sack > 0; this_sack--, sp--) 43531da177e4SLinus Torvalds *sp = *(sp - 1); 43541da177e4SLinus Torvalds 43551da177e4SLinus Torvalds new_sack: 43561da177e4SLinus Torvalds /* Build the new head SACK, and we're done. */ 43571da177e4SLinus Torvalds sp->start_seq = seq; 43581da177e4SLinus Torvalds sp->end_seq = end_seq; 43591da177e4SLinus Torvalds tp->rx_opt.num_sacks++; 43601da177e4SLinus Torvalds } 43611da177e4SLinus Torvalds 43621da177e4SLinus Torvalds /* RCV.NXT advances, some SACKs should be eaten. */ 43631da177e4SLinus Torvalds 43641da177e4SLinus Torvalds static void tcp_sack_remove(struct tcp_sock *tp) 43651da177e4SLinus Torvalds { 43661da177e4SLinus Torvalds struct tcp_sack_block *sp = &tp->selective_acks[0]; 43671da177e4SLinus Torvalds int num_sacks = tp->rx_opt.num_sacks; 43681da177e4SLinus Torvalds int this_sack; 43691da177e4SLinus Torvalds 43701da177e4SLinus Torvalds /* Empty ofo queue, hence, all the SACKs are eaten. Clear. 
*/ 43719f5afeaeSYaogong Wang if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { 43721da177e4SLinus Torvalds tp->rx_opt.num_sacks = 0; 43731da177e4SLinus Torvalds return; 43741da177e4SLinus Torvalds } 43751da177e4SLinus Torvalds 43761da177e4SLinus Torvalds for (this_sack = 0; this_sack < num_sacks;) { 43771da177e4SLinus Torvalds /* Check if the start of the sack is covered by RCV.NXT. */ 43781da177e4SLinus Torvalds if (!before(tp->rcv_nxt, sp->start_seq)) { 43791da177e4SLinus Torvalds int i; 43801da177e4SLinus Torvalds 43811da177e4SLinus Torvalds /* RCV.NXT must cover all the block! */ 4382547b792cSIlpo Järvinen WARN_ON(before(tp->rcv_nxt, sp->end_seq)); 43831da177e4SLinus Torvalds 43841da177e4SLinus Torvalds /* Zap this SACK, by moving forward any other SACKS. */ 43851da177e4SLinus Torvalds for (i = this_sack+1; i < num_sacks; i++) 43861da177e4SLinus Torvalds tp->selective_acks[i-1] = tp->selective_acks[i]; 43871da177e4SLinus Torvalds num_sacks--; 43881da177e4SLinus Torvalds continue; 43891da177e4SLinus Torvalds } 43901da177e4SLinus Torvalds this_sack++; 43911da177e4SLinus Torvalds sp++; 43921da177e4SLinus Torvalds } 43931da177e4SLinus Torvalds tp->rx_opt.num_sacks = num_sacks; 43941da177e4SLinus Torvalds } 43951da177e4SLinus Torvalds 43961402d366SEric Dumazet /** 43971402d366SEric Dumazet * tcp_try_coalesce - try to merge skb to prior one 43981402d366SEric Dumazet * @sk: socket 439998aaa913SMike Maloney * @dest: destination queue 44001402d366SEric Dumazet * @to: prior buffer 44011402d366SEric Dumazet * @from: buffer to add in queue 4402923dd347SEric Dumazet * @fragstolen: pointer to boolean 44031402d366SEric Dumazet * 44041402d366SEric Dumazet * Before queueing skb @from after @to, try to merge them 44051402d366SEric Dumazet * to reduce overall memory use and queue lengths, if cost is small. 44061402d366SEric Dumazet * Packets in ofo or receive queues can stay a long time. 44071402d366SEric Dumazet * Better try to coalesce them right now to avoid future collapses. 
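 *
 * Merging only happens when @from starts exactly at @to's end_seq and
 * skb_try_coalesce() can attach the payload cheaply; otherwise the
 * caller queues @from as a separate skb.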
4408783c175fSEric Dumazet * Returns true if caller should free @from instead of queueing it 44091402d366SEric Dumazet */ 4410783c175fSEric Dumazet static bool tcp_try_coalesce(struct sock *sk, 44111402d366SEric Dumazet struct sk_buff *to, 4412329033f6SEric Dumazet struct sk_buff *from, 4413329033f6SEric Dumazet bool *fragstolen) 44141402d366SEric Dumazet { 4415bad43ca8SEric Dumazet int delta; 44161402d366SEric Dumazet 4417329033f6SEric Dumazet *fragstolen = false; 441834a802a5SAlexander Duyck 44191ca7ee30SEric Dumazet /* Its possible this segment overlaps with prior segment in queue */ 44201ca7ee30SEric Dumazet if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) 44211ca7ee30SEric Dumazet return false; 44221ca7ee30SEric Dumazet 442341ed9c04SBoris Pismenny #ifdef CONFIG_TLS_DEVICE 442441ed9c04SBoris Pismenny if (from->decrypted != to->decrypted) 442541ed9c04SBoris Pismenny return false; 442641ed9c04SBoris Pismenny #endif 442741ed9c04SBoris Pismenny 4428bad43ca8SEric Dumazet if (!skb_try_coalesce(to, from, fragstolen, &delta)) 4429783c175fSEric Dumazet return false; 443034a802a5SAlexander Duyck 44311402d366SEric Dumazet atomic_add(delta, &sk->sk_rmem_alloc); 44321402d366SEric Dumazet sk_mem_charge(sk, delta); 4433c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); 443434a802a5SAlexander Duyck TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; 443534a802a5SAlexander Duyck TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; 4436e93a0435SEric Dumazet TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags; 443798aaa913SMike Maloney 443898aaa913SMike Maloney if (TCP_SKB_CB(from)->has_rxtstamp) { 443998aaa913SMike Maloney TCP_SKB_CB(to)->has_rxtstamp = true; 444098aaa913SMike Maloney to->tstamp = from->tstamp; 4441cadf9df2SStephen Mallon skb_hwtstamps(to)->hwtstamp = skb_hwtstamps(from)->hwtstamp; 444298aaa913SMike Maloney } 444398aaa913SMike Maloney 444434a802a5SAlexander Duyck return true; 44451402d366SEric Dumazet } 44461402d366SEric Dumazet 444758152ecbSEric Dumazet static bool tcp_ooo_try_coalesce(struct sock *sk, 444858152ecbSEric Dumazet struct sk_buff *to, 444958152ecbSEric Dumazet struct sk_buff *from, 445058152ecbSEric Dumazet bool *fragstolen) 445158152ecbSEric Dumazet { 445258152ecbSEric Dumazet bool res = tcp_try_coalesce(sk, to, from, fragstolen); 445358152ecbSEric Dumazet 445458152ecbSEric Dumazet /* In case tcp_drop() is called later, update to->gso_segs */ 445558152ecbSEric Dumazet if (res) { 445658152ecbSEric Dumazet u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) + 445758152ecbSEric Dumazet max_t(u16, 1, skb_shinfo(from)->gso_segs); 445858152ecbSEric Dumazet 445958152ecbSEric Dumazet skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF); 446058152ecbSEric Dumazet } 446158152ecbSEric Dumazet return res; 446258152ecbSEric Dumazet } 446358152ecbSEric Dumazet 4464532182cdSEric Dumazet static void tcp_drop(struct sock *sk, struct sk_buff *skb) 4465532182cdSEric Dumazet { 4466532182cdSEric Dumazet sk_drops_add(sk, skb); 4467532182cdSEric Dumazet __kfree_skb(skb); 4468532182cdSEric Dumazet } 4469532182cdSEric Dumazet 44701da177e4SLinus Torvalds /* This one checks to see if we can put data from the 44711da177e4SLinus Torvalds * out_of_order queue into the receive_queue. 
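 *
 * The ofo rbtree is walked in sequence order: every skb that now fits
 * at rcv_nxt is moved to (or coalesced with the tail of) the receive
 * queue, any prefix we had already received is reported via D-SACK,
 * and the walk stops at the first remaining gap.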
44721da177e4SLinus Torvalds */ 44731da177e4SLinus Torvalds static void tcp_ofo_queue(struct sock *sk) 44741da177e4SLinus Torvalds { 44751da177e4SLinus Torvalds struct tcp_sock *tp = tcp_sk(sk); 44761da177e4SLinus Torvalds __u32 dsack_high = tp->rcv_nxt; 44779f5afeaeSYaogong Wang bool fin, fragstolen, eaten; 4478bd1e75abSEric Dumazet struct sk_buff *skb, *tail; 44799f5afeaeSYaogong Wang struct rb_node *p; 44801da177e4SLinus Torvalds 44819f5afeaeSYaogong Wang p = rb_first(&tp->out_of_order_queue); 44829f5afeaeSYaogong Wang while (p) { 448318a4c0eaSEric Dumazet skb = rb_to_skb(p); 44841da177e4SLinus Torvalds if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) 44851da177e4SLinus Torvalds break; 44861da177e4SLinus Torvalds 44871da177e4SLinus Torvalds if (before(TCP_SKB_CB(skb)->seq, dsack_high)) { 44881da177e4SLinus Torvalds __u32 dsack = dsack_high; 44891da177e4SLinus Torvalds if (before(TCP_SKB_CB(skb)->end_seq, dsack_high)) 44901da177e4SLinus Torvalds dsack_high = TCP_SKB_CB(skb)->end_seq; 44911da177e4SLinus Torvalds tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); 44921da177e4SLinus Torvalds } 44939f5afeaeSYaogong Wang p = rb_next(p); 44949f5afeaeSYaogong Wang rb_erase(&skb->rbnode, &tp->out_of_order_queue); 44951da177e4SLinus Torvalds 44969f5afeaeSYaogong Wang if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) { 4497532182cdSEric Dumazet tcp_drop(sk, skb); 44981da177e4SLinus Torvalds continue; 44991da177e4SLinus Torvalds } 45001da177e4SLinus Torvalds 4501bd1e75abSEric Dumazet tail = skb_peek_tail(&sk->sk_receive_queue); 4502bffa72cfSEric Dumazet eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); 4503bdd1f9edSEric Dumazet tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); 45049f5afeaeSYaogong Wang fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; 4505bd1e75abSEric Dumazet if (!eaten) 4506bd1e75abSEric Dumazet __skb_queue_tail(&sk->sk_receive_queue, skb); 45079f5afeaeSYaogong Wang else 4508bd1e75abSEric Dumazet kfree_skb_partial(skb, fragstolen); 45099f5afeaeSYaogong Wang 45109f5afeaeSYaogong Wang if (unlikely(fin)) { 45119f5afeaeSYaogong Wang tcp_fin(sk); 45129f5afeaeSYaogong Wang /* tcp_fin() purges tp->out_of_order_queue, 45139f5afeaeSYaogong Wang * so we must end this loop right now. 
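 * Continuing the walk would touch rb nodes that skb_rbtree_purge()
 * has just freed.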
45149f5afeaeSYaogong Wang */ 45159f5afeaeSYaogong Wang break; 45169f5afeaeSYaogong Wang } 45171da177e4SLinus Torvalds } 45181da177e4SLinus Torvalds } 45191da177e4SLinus Torvalds 45201da177e4SLinus Torvalds static bool tcp_prune_ofo_queue(struct sock *sk); 45211da177e4SLinus Torvalds static int tcp_prune_queue(struct sock *sk); 45221da177e4SLinus Torvalds 45231da177e4SLinus Torvalds static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, 45241da177e4SLinus Torvalds unsigned int size) 45251da177e4SLinus Torvalds { 45261da177e4SLinus Torvalds if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || 45271da177e4SLinus Torvalds !sk_rmem_schedule(sk, skb, size)) { 45281da177e4SLinus Torvalds 45291da177e4SLinus Torvalds if (tcp_prune_queue(sk) < 0) 45301da177e4SLinus Torvalds return -1; 45311ed83465SPavel Emelyanov 453236a6503fSEric Dumazet while (!sk_rmem_schedule(sk, skb, size)) { 45331da177e4SLinus Torvalds if (!tcp_prune_ofo_queue(sk)) 45341da177e4SLinus Torvalds return -1; 45351da177e4SLinus Torvalds } 45361da177e4SLinus Torvalds } 45371da177e4SLinus Torvalds return 0; 45381da177e4SLinus Torvalds } 45391da177e4SLinus Torvalds 4540e86b2919SEric Dumazet static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) 4541e86b2919SEric Dumazet { 4542e86b2919SEric Dumazet struct tcp_sock *tp = tcp_sk(sk); 454318a4c0eaSEric Dumazet struct rb_node **p, *parent; 4544e86b2919SEric Dumazet struct sk_buff *skb1; 4545e86b2919SEric Dumazet u32 seq, end_seq; 45469f5afeaeSYaogong Wang bool fragstolen; 4547e86b2919SEric Dumazet 4548f4c9f85fSYousuk Seung tcp_ecn_check_ce(sk, skb); 4549e86b2919SEric Dumazet 4550c76562b6SMel Gorman if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { 4551c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP); 4552532182cdSEric Dumazet tcp_drop(sk, skb); 4553e86b2919SEric Dumazet return; 4554e86b2919SEric Dumazet } 4555e86b2919SEric Dumazet 455631770e34SFlorian Westphal /* Disable header prediction. */ 455731770e34SFlorian Westphal tp->pred_flags = 0; 4558e86b2919SEric Dumazet inet_csk_schedule_ack(sk); 4559e86b2919SEric Dumazet 4560f9af2dbbSThomas Higdon tp->rcv_ooopack += max_t(u16, 1, skb_shinfo(skb)->gso_segs); 4561c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); 45629f5afeaeSYaogong Wang seq = TCP_SKB_CB(skb)->seq; 45639f5afeaeSYaogong Wang end_seq = TCP_SKB_CB(skb)->end_seq; 4564e86b2919SEric Dumazet 45659f5afeaeSYaogong Wang p = &tp->out_of_order_queue.rb_node; 45669f5afeaeSYaogong Wang if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) { 4567e86b2919SEric Dumazet /* Initial out of order segment, build 1 SACK. */ 4568e86b2919SEric Dumazet if (tcp_is_sack(tp)) { 4569e86b2919SEric Dumazet tp->rx_opt.num_sacks = 1; 45709f5afeaeSYaogong Wang tp->selective_acks[0].start_seq = seq; 45719f5afeaeSYaogong Wang tp->selective_acks[0].end_seq = end_seq; 4572e86b2919SEric Dumazet } 45739f5afeaeSYaogong Wang rb_link_node(&skb->rbnode, NULL, p); 45749f5afeaeSYaogong Wang rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); 45759f5afeaeSYaogong Wang tp->ooo_last_skb = skb; 4576e86b2919SEric Dumazet goto end; 4577e86b2919SEric Dumazet } 4578e86b2919SEric Dumazet 45799f5afeaeSYaogong Wang /* In the typical case, we are adding an skb to the end of the list. 45809f5afeaeSYaogong Wang * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. 
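 * E.g. a burst of segments arriving behind a single lost one shows up
 * in increasing sequence order, so first trying to coalesce with
 * ooo_last_skb (and, failing that, linking the new skb directly to its
 * right) avoids walking the tree for the common case.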
45819f5afeaeSYaogong Wang */ 458258152ecbSEric Dumazet if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb, 458398aaa913SMike Maloney skb, &fragstolen)) { 45849f5afeaeSYaogong Wang coalesce_done: 45854e4f1fc2SEric Dumazet tcp_grow_window(sk, skb); 4586923dd347SEric Dumazet kfree_skb_partial(skb, fragstolen); 4587c8628155SEric Dumazet skb = NULL; 4588e86b2919SEric Dumazet goto add_sack; 4589e86b2919SEric Dumazet } 45902594a2a9SEric Dumazet /* Can avoid an rbtree lookup if we are adding skb after ooo_last_skb */ 45912594a2a9SEric Dumazet if (!before(seq, TCP_SKB_CB(tp->ooo_last_skb)->end_seq)) { 45922594a2a9SEric Dumazet parent = &tp->ooo_last_skb->rbnode; 45932594a2a9SEric Dumazet p = &parent->rb_right; 45942594a2a9SEric Dumazet goto insert; 4595e86b2919SEric Dumazet } 4596e86b2919SEric Dumazet 45979f5afeaeSYaogong Wang /* Find place to insert this segment. Handle overlaps on the way. */ 45989f5afeaeSYaogong Wang parent = NULL; 45999f5afeaeSYaogong Wang while (*p) { 46009f5afeaeSYaogong Wang parent = *p; 460118a4c0eaSEric Dumazet skb1 = rb_to_skb(parent); 46029f5afeaeSYaogong Wang if (before(seq, TCP_SKB_CB(skb1)->seq)) { 46039f5afeaeSYaogong Wang p = &parent->rb_left; 46049f5afeaeSYaogong Wang continue; 4605e86b2919SEric Dumazet } 46069f5afeaeSYaogong Wang if (before(seq, TCP_SKB_CB(skb1)->end_seq)) { 4607e86b2919SEric Dumazet if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { 4608e86b2919SEric Dumazet /* All the bits are present. Drop. */ 46099f5afeaeSYaogong Wang NET_INC_STATS(sock_net(sk), 46109f5afeaeSYaogong Wang LINUX_MIB_TCPOFOMERGE); 46118541b21eSEric Dumazet tcp_drop(sk, skb); 4612e86b2919SEric Dumazet skb = NULL; 4613e86b2919SEric Dumazet tcp_dsack_set(sk, seq, end_seq); 4614e86b2919SEric Dumazet goto add_sack; 4615e86b2919SEric Dumazet } 4616e86b2919SEric Dumazet if (after(seq, TCP_SKB_CB(skb1)->seq)) { 4617e86b2919SEric Dumazet /* Partial overlap. */ 46189f5afeaeSYaogong Wang tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq); 4619e86b2919SEric Dumazet } else { 46209f5afeaeSYaogong Wang /* skb's seq == skb1's seq and skb covers skb1. 46219f5afeaeSYaogong Wang * Replace skb1 with skb. 46229f5afeaeSYaogong Wang */ 46239f5afeaeSYaogong Wang rb_replace_node(&skb1->rbnode, &skb->rbnode, 46249f5afeaeSYaogong Wang &tp->out_of_order_queue); 46259f5afeaeSYaogong Wang tcp_dsack_extend(sk, 46269f5afeaeSYaogong Wang TCP_SKB_CB(skb1)->seq, 46279f5afeaeSYaogong Wang TCP_SKB_CB(skb1)->end_seq); 46289f5afeaeSYaogong Wang NET_INC_STATS(sock_net(sk), 46299f5afeaeSYaogong Wang LINUX_MIB_TCPOFOMERGE); 46308541b21eSEric Dumazet tcp_drop(sk, skb1); 463176f0dcbbSEric Dumazet goto merge_right; 4632e86b2919SEric Dumazet } 463358152ecbSEric Dumazet } else if (tcp_ooo_try_coalesce(sk, skb1, 463498aaa913SMike Maloney skb, &fragstolen)) { 46359f5afeaeSYaogong Wang goto coalesce_done; 4636e86b2919SEric Dumazet } 46379f5afeaeSYaogong Wang p = &parent->rb_right; 46389f5afeaeSYaogong Wang } 46392594a2a9SEric Dumazet insert: 46409f5afeaeSYaogong Wang /* Insert segment into RB tree. */ 46419f5afeaeSYaogong Wang rb_link_node(&skb->rbnode, parent, p); 46429f5afeaeSYaogong Wang rb_insert_color(&skb->rbnode, &tp->out_of_order_queue); 4643e86b2919SEric Dumazet 464476f0dcbbSEric Dumazet merge_right: 46459f5afeaeSYaogong Wang /* Remove other segments covered by skb. 
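 * A successor that is fully covered is erased and reported via D-SACK;
 * one that only partially overlaps gets a D-SACK for the overlapping
 * part and ends the walk.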
*/ 464618a4c0eaSEric Dumazet while ((skb1 = skb_rb_next(skb)) != NULL) { 4647e86b2919SEric Dumazet if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) 4648e86b2919SEric Dumazet break; 4649e86b2919SEric Dumazet if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { 4650e86b2919SEric Dumazet tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, 4651e86b2919SEric Dumazet end_seq); 4652e86b2919SEric Dumazet break; 4653e86b2919SEric Dumazet } 46549f5afeaeSYaogong Wang rb_erase(&skb1->rbnode, &tp->out_of_order_queue); 4655e86b2919SEric Dumazet tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, 4656e86b2919SEric Dumazet TCP_SKB_CB(skb1)->end_seq); 4657c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); 4658532182cdSEric Dumazet tcp_drop(sk, skb1); 4659e86b2919SEric Dumazet } 46609f5afeaeSYaogong Wang /* If there is no skb after us, we are the last_skb ! */ 466118a4c0eaSEric Dumazet if (!skb1) 46629f5afeaeSYaogong Wang tp->ooo_last_skb = skb; 4663e86b2919SEric Dumazet 4664e86b2919SEric Dumazet add_sack: 4665e86b2919SEric Dumazet if (tcp_is_sack(tp)) 4666e86b2919SEric Dumazet tcp_sack_new_ofo_skb(sk, seq, end_seq); 4667e86b2919SEric Dumazet end: 46684e4f1fc2SEric Dumazet if (skb) { 46694e4f1fc2SEric Dumazet tcp_grow_window(sk, skb); 467060b1af33SEric Dumazet skb_condense(skb); 4671e86b2919SEric Dumazet skb_set_owner_r(skb, sk); 4672e86b2919SEric Dumazet } 46734e4f1fc2SEric Dumazet } 4674e86b2919SEric Dumazet 4675e7395f1fSEric Dumazet static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, 4676b081f85cSEric Dumazet bool *fragstolen) 4677b081f85cSEric Dumazet { 4678b081f85cSEric Dumazet int eaten; 4679b081f85cSEric Dumazet struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue); 4680b081f85cSEric Dumazet 4681b081f85cSEric Dumazet eaten = (tail && 4682bffa72cfSEric Dumazet tcp_try_coalesce(sk, tail, 468398aaa913SMike Maloney skb, fragstolen)) ? 
1 : 0; 4684bdd1f9edSEric Dumazet tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); 4685b081f85cSEric Dumazet if (!eaten) { 4686b081f85cSEric Dumazet __skb_queue_tail(&sk->sk_receive_queue, skb); 4687b081f85cSEric Dumazet skb_set_owner_r(skb, sk); 4688b081f85cSEric Dumazet } 4689b081f85cSEric Dumazet return eaten; 4690b081f85cSEric Dumazet } 4691e86b2919SEric Dumazet 4692292e8d8cSPavel Emelyanov int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) 4693292e8d8cSPavel Emelyanov { 4694cb93471aSEric Dumazet struct sk_buff *skb; 46955d4c9bfbSEric Dumazet int err = -ENOMEM; 46965d4c9bfbSEric Dumazet int data_len = 0; 4697292e8d8cSPavel Emelyanov bool fragstolen; 4698292e8d8cSPavel Emelyanov 4699c454e611SPavel Emelyanov if (size == 0) 4700c454e611SPavel Emelyanov return 0; 4701c454e611SPavel Emelyanov 47025d4c9bfbSEric Dumazet if (size > PAGE_SIZE) { 47035d4c9bfbSEric Dumazet int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS); 47045d4c9bfbSEric Dumazet 47055d4c9bfbSEric Dumazet data_len = npages << PAGE_SHIFT; 47065d4c9bfbSEric Dumazet size = data_len + (size & ~PAGE_MASK); 47075d4c9bfbSEric Dumazet } 47085d4c9bfbSEric Dumazet skb = alloc_skb_with_frags(size - data_len, data_len, 47095d4c9bfbSEric Dumazet PAGE_ALLOC_COSTLY_ORDER, 47105d4c9bfbSEric Dumazet &err, sk->sk_allocation); 4711292e8d8cSPavel Emelyanov if (!skb) 4712292e8d8cSPavel Emelyanov goto err; 4713292e8d8cSPavel Emelyanov 47145d4c9bfbSEric Dumazet skb_put(skb, size - data_len); 47155d4c9bfbSEric Dumazet skb->data_len = data_len; 47165d4c9bfbSEric Dumazet skb->len = size; 47175d4c9bfbSEric Dumazet 4718ea5d0c32SYafang Shao if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { 4719ea5d0c32SYafang Shao NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); 4720c76562b6SMel Gorman goto err_free; 4721ea5d0c32SYafang Shao } 4722c76562b6SMel Gorman 47235d4c9bfbSEric Dumazet err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); 47245d4c9bfbSEric Dumazet if (err) 4725292e8d8cSPavel Emelyanov goto err_free; 4726292e8d8cSPavel Emelyanov 4727292e8d8cSPavel Emelyanov TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt; 4728292e8d8cSPavel Emelyanov TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size; 4729292e8d8cSPavel Emelyanov TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1; 4730292e8d8cSPavel Emelyanov 4731e7395f1fSEric Dumazet if (tcp_queue_rcv(sk, skb, &fragstolen)) { 4732292e8d8cSPavel Emelyanov WARN_ON_ONCE(fragstolen); /* should not happen */ 4733292e8d8cSPavel Emelyanov __kfree_skb(skb); 4734292e8d8cSPavel Emelyanov } 4735292e8d8cSPavel Emelyanov return size; 4736292e8d8cSPavel Emelyanov 4737292e8d8cSPavel Emelyanov err_free: 4738292e8d8cSPavel Emelyanov kfree_skb(skb); 4739292e8d8cSPavel Emelyanov err: 47405d4c9bfbSEric Dumazet return err; 47415d4c9bfbSEric Dumazet 4742292e8d8cSPavel Emelyanov } 4743292e8d8cSPavel Emelyanov 474403f45c88SEric Dumazet void tcp_data_ready(struct sock *sk) 474503f45c88SEric Dumazet { 474603f45c88SEric Dumazet const struct tcp_sock *tp = tcp_sk(sk); 474703f45c88SEric Dumazet int avail = tp->rcv_nxt - tp->copied_seq; 474803f45c88SEric Dumazet 474903f45c88SEric Dumazet if (avail < sk->sk_rcvlowat && !sock_flag(sk, SOCK_DONE)) 475003f45c88SEric Dumazet return; 475103f45c88SEric Dumazet 475203f45c88SEric Dumazet sk->sk_data_ready(sk); 475303f45c88SEric Dumazet } 475403f45c88SEric Dumazet 47551da177e4SLinus Torvalds static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) 47561da177e4SLinus Torvalds { 47571da177e4SLinus Torvalds struct tcp_sock *tp = tcp_sk(sk); 
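	/* Dispatch overview: an in-sequence, in-window segment goes to the
	 * receive queue (possibly coalesced with its tail skb); data we
	 * already have triggers a dup ACK plus D-SACK; data wholly beyond
	 * the window is dropped; out-of-order data goes to the ofo rbtree.
	 */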
47585357f0bdSEric Dumazet bool fragstolen; 47595357f0bdSEric Dumazet int eaten; 47601da177e4SLinus Torvalds 4761532182cdSEric Dumazet if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { 4762532182cdSEric Dumazet __kfree_skb(skb); 4763532182cdSEric Dumazet return; 4764532182cdSEric Dumazet } 4765f84af32cSEric Dumazet skb_dst_drop(skb); 4766155c6e1aSPeter Pan(潘卫平) __skb_pull(skb, tcp_hdr(skb)->doff * 4); 47671da177e4SLinus Torvalds 4768fd2123a3SYuchung Cheng tcp_ecn_accept_cwr(sk, skb); 47691da177e4SLinus Torvalds 47701da177e4SLinus Torvalds tp->rx_opt.dsack = 0; 47711da177e4SLinus Torvalds 47721da177e4SLinus Torvalds /* Queue data for delivery to the user. 47731da177e4SLinus Torvalds * Packets in sequence go to the receive queue. 47741da177e4SLinus Torvalds * Out of sequence packets to the out_of_order_queue. 47751da177e4SLinus Torvalds */ 47761da177e4SLinus Torvalds if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { 4777fb223502SYafang Shao if (tcp_receive_window(tp) == 0) { 4778fb223502SYafang Shao NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); 47791da177e4SLinus Torvalds goto out_of_window; 4780fb223502SYafang Shao } 47811da177e4SLinus Torvalds 47821da177e4SLinus Torvalds /* Ok. In sequence. In window. */ 47831da177e4SLinus Torvalds queue_and_out: 478476dfa608SEric Dumazet if (skb_queue_len(&sk->sk_receive_queue) == 0) 478576dfa608SEric Dumazet sk_forced_mem_schedule(sk, skb->truesize); 4786ea5d0c32SYafang Shao else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) { 4787ea5d0c32SYafang Shao NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP); 47881da177e4SLinus Torvalds goto drop; 4789ea5d0c32SYafang Shao } 47905357f0bdSEric Dumazet 4791e7395f1fSEric Dumazet eaten = tcp_queue_rcv(sk, skb, &fragstolen); 47921da177e4SLinus Torvalds if (skb->len) 47939e412ba7SIlpo Järvinen tcp_event_data_recv(sk, skb); 4794155c6e1aSPeter Pan(潘卫平) if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) 479520c4cb79SEric Dumazet tcp_fin(sk); 47961da177e4SLinus Torvalds 47979f5afeaeSYaogong Wang if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) { 47981da177e4SLinus Torvalds tcp_ofo_queue(sk); 47991da177e4SLinus Torvalds 480015bdd568SYuchung Cheng /* RFC5681. 4.2. SHOULD send immediate ACK, when 48011da177e4SLinus Torvalds * gap in queue is filled. 48021da177e4SLinus Torvalds */ 48039f5afeaeSYaogong Wang if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) 480415bdd568SYuchung Cheng inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; 48051da177e4SLinus Torvalds } 48061da177e4SLinus Torvalds 48071da177e4SLinus Torvalds if (tp->rx_opt.num_sacks) 48081da177e4SLinus Torvalds tcp_sack_remove(tp); 48091da177e4SLinus Torvalds 481031770e34SFlorian Westphal tcp_fast_path_check(sk); 481131770e34SFlorian Westphal 4812923dd347SEric Dumazet if (eaten > 0) 4813923dd347SEric Dumazet kfree_skb_partial(skb, fragstolen); 48141d57f195SEric Dumazet if (!sock_flag(sk, SOCK_DEAD)) 481503f45c88SEric Dumazet tcp_data_ready(sk); 48161da177e4SLinus Torvalds return; 48171da177e4SLinus Torvalds } 48181da177e4SLinus Torvalds 48191da177e4SLinus Torvalds if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { 48207788174eSYuchung Cheng tcp_rcv_spurious_retrans(sk, skb); 48211da177e4SLinus Torvalds /* A retransmit, 2nd most common case. Force an immediate ack. 
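 * The immediate ACK plus the D-SACK below tell the peer exactly which
 * segment arrived twice, letting it detect the spurious retransmission.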
*/ 4822c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); 48231ed83465SPavel Emelyanov tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); 48241da177e4SLinus Torvalds 48251da177e4SLinus Torvalds out_of_window: 48269a9c9b51SEric Dumazet tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); 4827463c84b9SArnaldo Carvalho de Melo inet_csk_schedule_ack(sk); 48281da177e4SLinus Torvalds drop: 4829532182cdSEric Dumazet tcp_drop(sk, skb); 48301da177e4SLinus Torvalds return; 48311da177e4SLinus Torvalds } 48321da177e4SLinus Torvalds 48331da177e4SLinus Torvalds /* Out of window. F.e. zero window probe. */ 48341da177e4SLinus Torvalds if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp))) 48351da177e4SLinus Torvalds goto out_of_window; 48361da177e4SLinus Torvalds 48371da177e4SLinus Torvalds if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { 48381da177e4SLinus Torvalds /* Partial packet, seq < rcv_next < end_seq */ 48391ed83465SPavel Emelyanov tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt); 48401da177e4SLinus Torvalds 48411da177e4SLinus Torvalds /* If window is closed, drop tail of packet. But after 48421da177e4SLinus Torvalds * remembering D-SACK for its head made in previous line. 48431da177e4SLinus Torvalds */ 4844fb223502SYafang Shao if (!tcp_receive_window(tp)) { 4845fb223502SYafang Shao NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP); 48461da177e4SLinus Torvalds goto out_of_window; 4847fb223502SYafang Shao } 48481da177e4SLinus Torvalds goto queue_and_out; 48491da177e4SLinus Torvalds } 48501da177e4SLinus Torvalds 4851e86b2919SEric Dumazet tcp_data_queue_ofo(sk, skb); 48521da177e4SLinus Torvalds } 48531da177e4SLinus Torvalds 48549f5afeaeSYaogong Wang static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *list) 48552cf46637SIlpo Järvinen { 48569f5afeaeSYaogong Wang if (list) 48579f5afeaeSYaogong Wang return !skb_queue_is_last(list, skb) ? skb->next : NULL; 485891521944SDavid S. 
Miller 485918a4c0eaSEric Dumazet return skb_rb_next(skb); 48609f5afeaeSYaogong Wang } 48612cf46637SIlpo Järvinen 48629f5afeaeSYaogong Wang static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, 48639f5afeaeSYaogong Wang struct sk_buff_head *list, 48649f5afeaeSYaogong Wang struct rb_root *root) 48659f5afeaeSYaogong Wang { 48669f5afeaeSYaogong Wang struct sk_buff *next = tcp_skb_next(skb, list); 48679f5afeaeSYaogong Wang 48689f5afeaeSYaogong Wang if (list) 48692cf46637SIlpo Järvinen __skb_unlink(skb, list); 48709f5afeaeSYaogong Wang else 48719f5afeaeSYaogong Wang rb_erase(&skb->rbnode, root); 48729f5afeaeSYaogong Wang 48732cf46637SIlpo Järvinen __kfree_skb(skb); 4874c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); 48752cf46637SIlpo Järvinen 48762cf46637SIlpo Järvinen return next; 48772cf46637SIlpo Järvinen } 48782cf46637SIlpo Järvinen 48799f5afeaeSYaogong Wang /* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ 488075c119afSEric Dumazet void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) 48819f5afeaeSYaogong Wang { 48829f5afeaeSYaogong Wang struct rb_node **p = &root->rb_node; 48839f5afeaeSYaogong Wang struct rb_node *parent = NULL; 48849f5afeaeSYaogong Wang struct sk_buff *skb1; 48859f5afeaeSYaogong Wang 48869f5afeaeSYaogong Wang while (*p) { 48879f5afeaeSYaogong Wang parent = *p; 488818a4c0eaSEric Dumazet skb1 = rb_to_skb(parent); 48899f5afeaeSYaogong Wang if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) 48909f5afeaeSYaogong Wang p = &parent->rb_left; 48919f5afeaeSYaogong Wang else 48929f5afeaeSYaogong Wang p = &parent->rb_right; 48939f5afeaeSYaogong Wang } 48949f5afeaeSYaogong Wang rb_link_node(&skb->rbnode, parent, p); 48959f5afeaeSYaogong Wang rb_insert_color(&skb->rbnode, root); 48969f5afeaeSYaogong Wang } 48979f5afeaeSYaogong Wang 48981da177e4SLinus Torvalds /* Collapse contiguous sequence of skbs head..tail with 48991da177e4SLinus Torvalds * sequence numbers start..end. 490091521944SDavid S. Miller * 49019f5afeaeSYaogong Wang * If tail is NULL, this means until the end of the queue. 490291521944SDavid S. Miller * 49031da177e4SLinus Torvalds * Segments with FIN/SYN are not collapsed (only because this 49041da177e4SLinus Torvalds * simplifies code) 49051da177e4SLinus Torvalds */ 49061da177e4SLinus Torvalds static void 49079f5afeaeSYaogong Wang tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root, 49089f5afeaeSYaogong Wang struct sk_buff *head, struct sk_buff *tail, u32 start, u32 end) 49091da177e4SLinus Torvalds { 49109f5afeaeSYaogong Wang struct sk_buff *skb = head, *n; 49119f5afeaeSYaogong Wang struct sk_buff_head tmp; 491291521944SDavid S. Miller bool end_of_skbs; 49131da177e4SLinus Torvalds 4914caa20d9aSStephen Hemminger /* First, check that queue is collapsible and find 49159f5afeaeSYaogong Wang * the point where collapsing can be useful. 49169f5afeaeSYaogong Wang */ 491791521944SDavid S. Miller restart: 49189f5afeaeSYaogong Wang for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) { 49199f5afeaeSYaogong Wang n = tcp_skb_next(skb, list); 49209f5afeaeSYaogong Wang 49211da177e4SLinus Torvalds /* No new bits? It is possible on ofo queue. */ 49221da177e4SLinus Torvalds if (!before(start, TCP_SKB_CB(skb)->end_seq)) { 49239f5afeaeSYaogong Wang skb = tcp_collapse_one(sk, skb, list, root); 492491521944SDavid S. Miller if (!skb) 492591521944SDavid S. Miller break; 492691521944SDavid S. 
Miller goto restart; 49271da177e4SLinus Torvalds } 49281da177e4SLinus Torvalds 49291da177e4SLinus Torvalds /* The first skb to collapse is: 49301da177e4SLinus Torvalds * - not SYN/FIN and 49311da177e4SLinus Torvalds * - bloated or contains data before "start" or 49321da177e4SLinus Torvalds * overlaps to the next one. 49331da177e4SLinus Torvalds */ 4934e11ecddfSEric Dumazet if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) && 493594f0893eSEric Dumazet (tcp_win_from_space(sk, skb->truesize) > skb->len || 493691521944SDavid S. Miller before(TCP_SKB_CB(skb)->seq, start))) { 493791521944SDavid S. Miller end_of_skbs = false; 49381da177e4SLinus Torvalds break; 493991521944SDavid S. Miller } 494091521944SDavid S. Miller 49419f5afeaeSYaogong Wang if (n && n != tail && 49429f5afeaeSYaogong Wang TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) { 494391521944SDavid S. Miller end_of_skbs = false; 494491521944SDavid S. Miller break; 494591521944SDavid S. Miller } 49461da177e4SLinus Torvalds 49471da177e4SLinus Torvalds /* Decided to skip this, advance start seq. */ 49481da177e4SLinus Torvalds start = TCP_SKB_CB(skb)->end_seq; 49491da177e4SLinus Torvalds } 4950e11ecddfSEric Dumazet if (end_of_skbs || 4951e11ecddfSEric Dumazet (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) 49521da177e4SLinus Torvalds return; 49531da177e4SLinus Torvalds 49549f5afeaeSYaogong Wang __skb_queue_head_init(&tmp); 49559f5afeaeSYaogong Wang 49561da177e4SLinus Torvalds while (before(start, end)) { 4957b3d6cb92SEric Dumazet int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start); 49581da177e4SLinus Torvalds struct sk_buff *nskb; 49591da177e4SLinus Torvalds 4960b3d6cb92SEric Dumazet nskb = alloc_skb(copy, GFP_ATOMIC); 49611da177e4SLinus Torvalds if (!nskb) 49629f5afeaeSYaogong Wang break; 4963c51957daSArnaldo Carvalho de Melo 49641da177e4SLinus Torvalds memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); 496541ed9c04SBoris Pismenny #ifdef CONFIG_TLS_DEVICE 496641ed9c04SBoris Pismenny nskb->decrypted = skb->decrypted; 496741ed9c04SBoris Pismenny #endif 49681da177e4SLinus Torvalds TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; 49699f5afeaeSYaogong Wang if (list) 497043f59c89SDavid S. Miller __skb_queue_before(list, skb, nskb); 49719f5afeaeSYaogong Wang else 49729f5afeaeSYaogong Wang __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */ 49733ab224beSHideo Aoki skb_set_owner_r(nskb, sk); 49741da177e4SLinus Torvalds 49751da177e4SLinus Torvalds /* Copy data, releasing collapsed skbs. */ 49761da177e4SLinus Torvalds while (copy > 0) { 49771da177e4SLinus Torvalds int offset = start - TCP_SKB_CB(skb)->seq; 49781da177e4SLinus Torvalds int size = TCP_SKB_CB(skb)->end_seq - start; 49791da177e4SLinus Torvalds 498009a62660SKris Katterjohn BUG_ON(offset < 0); 49811da177e4SLinus Torvalds if (size > 0) { 49821da177e4SLinus Torvalds size = min(copy, size); 49831da177e4SLinus Torvalds if (skb_copy_bits(skb, offset, skb_put(nskb, size), size)) 49841da177e4SLinus Torvalds BUG(); 49851da177e4SLinus Torvalds TCP_SKB_CB(nskb)->end_seq += size; 49861da177e4SLinus Torvalds copy -= size; 49871da177e4SLinus Torvalds start += size; 49881da177e4SLinus Torvalds } 49891da177e4SLinus Torvalds if (!before(start, TCP_SKB_CB(skb)->end_seq)) { 49909f5afeaeSYaogong Wang skb = tcp_collapse_one(sk, skb, list, root); 499191521944SDavid S. Miller if (!skb || 499291521944SDavid S. 
Miller skb == tail || 4993e11ecddfSEric Dumazet (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) 49949f5afeaeSYaogong Wang goto end; 499541ed9c04SBoris Pismenny #ifdef CONFIG_TLS_DEVICE 499641ed9c04SBoris Pismenny if (skb->decrypted != nskb->decrypted) 499741ed9c04SBoris Pismenny goto end; 499841ed9c04SBoris Pismenny #endif 49991da177e4SLinus Torvalds } 50001da177e4SLinus Torvalds } 50011da177e4SLinus Torvalds } 50029f5afeaeSYaogong Wang end: 50039f5afeaeSYaogong Wang skb_queue_walk_safe(&tmp, skb, n) 50049f5afeaeSYaogong Wang tcp_rbtree_insert(root, skb); 50051da177e4SLinus Torvalds } 50061da177e4SLinus Torvalds 50071da177e4SLinus Torvalds /* Collapse ofo queue. Algorithm: select contiguous sequence of skbs 50081da177e4SLinus Torvalds * and tcp_collapse() them until all the queue is collapsed. 50091da177e4SLinus Torvalds */ 50101da177e4SLinus Torvalds static void tcp_collapse_ofo_queue(struct sock *sk) 50111da177e4SLinus Torvalds { 50121da177e4SLinus Torvalds struct tcp_sock *tp = tcp_sk(sk); 50133d4bf93aSEric Dumazet u32 range_truesize, sum_tiny = 0; 50149f5afeaeSYaogong Wang struct sk_buff *skb, *head; 50151da177e4SLinus Torvalds u32 start, end; 50161da177e4SLinus Torvalds 501718a4c0eaSEric Dumazet skb = skb_rb_first(&tp->out_of_order_queue); 50189f5afeaeSYaogong Wang new_range: 50199f5afeaeSYaogong Wang if (!skb) { 502018a4c0eaSEric Dumazet tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue); 50211da177e4SLinus Torvalds return; 50229f5afeaeSYaogong Wang } 50231da177e4SLinus Torvalds start = TCP_SKB_CB(skb)->seq; 50241da177e4SLinus Torvalds end = TCP_SKB_CB(skb)->end_seq; 50253d4bf93aSEric Dumazet range_truesize = skb->truesize; 50261da177e4SLinus Torvalds 50279f5afeaeSYaogong Wang for (head = skb;;) { 502818a4c0eaSEric Dumazet skb = skb_rb_next(skb); 502991521944SDavid S. Miller 50309f5afeaeSYaogong Wang /* Range is terminated when we see a gap or when 50319f5afeaeSYaogong Wang * we are at the queue end. 50329f5afeaeSYaogong Wang */ 503391521944SDavid S. Miller if (!skb || 50341da177e4SLinus Torvalds after(TCP_SKB_CB(skb)->seq, end) || 50351da177e4SLinus Torvalds before(TCP_SKB_CB(skb)->end_seq, start)) { 50363d4bf93aSEric Dumazet /* Do not attempt collapsing tiny skbs */ 50373d4bf93aSEric Dumazet if (range_truesize != head->truesize || 50383d4bf93aSEric Dumazet end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) { 50399f5afeaeSYaogong Wang tcp_collapse(sk, NULL, &tp->out_of_order_queue, 50408728b834SDavid S. Miller head, skb, start, end); 50413d4bf93aSEric Dumazet } else { 50423d4bf93aSEric Dumazet sum_tiny += range_truesize; 50433d4bf93aSEric Dumazet if (sum_tiny > sk->sk_rcvbuf >> 3) 50443d4bf93aSEric Dumazet return; 50453d4bf93aSEric Dumazet } 50469f5afeaeSYaogong Wang goto new_range; 50479f5afeaeSYaogong Wang } 50489f5afeaeSYaogong Wang 50493d4bf93aSEric Dumazet range_truesize += skb->truesize; 50509f5afeaeSYaogong Wang if (unlikely(before(TCP_SKB_CB(skb)->seq, start))) 50511da177e4SLinus Torvalds start = TCP_SKB_CB(skb)->seq; 50521da177e4SLinus Torvalds if (after(TCP_SKB_CB(skb)->end_seq, end)) 50531da177e4SLinus Torvalds end = TCP_SKB_CB(skb)->end_seq; 50541da177e4SLinus Torvalds } 50551da177e4SLinus Torvalds } 50561da177e4SLinus Torvalds 5057b000cd37SVitaliy Gusev /* 505836a6503fSEric Dumazet * Clean the out-of-order queue to make room. 505936a6503fSEric Dumazet * We drop high sequences packets to : 506036a6503fSEric Dumazet * 1) Let a chance for holes to be filled. 506136a6503fSEric Dumazet * 2) not add too big latencies if thousands of packets sit there. 
506236a6503fSEric Dumazet * (But if application shrinks SO_RCVBUF, we could still end up 506336a6503fSEric Dumazet * freeing whole queue here) 506472cd43baSEric Dumazet * 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks. 506536a6503fSEric Dumazet * 506636a6503fSEric Dumazet * Return true if queue has shrunk. 5067b000cd37SVitaliy Gusev */ 5068a2a385d6SEric Dumazet static bool tcp_prune_ofo_queue(struct sock *sk) 5069b000cd37SVitaliy Gusev { 5070b000cd37SVitaliy Gusev struct tcp_sock *tp = tcp_sk(sk); 50719f5afeaeSYaogong Wang struct rb_node *node, *prev; 507272cd43baSEric Dumazet int goal; 5073b000cd37SVitaliy Gusev 50749f5afeaeSYaogong Wang if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) 507536a6503fSEric Dumazet return false; 507636a6503fSEric Dumazet 5077c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); 507872cd43baSEric Dumazet goal = sk->sk_rcvbuf >> 3; 50799f5afeaeSYaogong Wang node = &tp->ooo_last_skb->rbnode; 50809f5afeaeSYaogong Wang do { 50819f5afeaeSYaogong Wang prev = rb_prev(node); 50829f5afeaeSYaogong Wang rb_erase(node, &tp->out_of_order_queue); 508372cd43baSEric Dumazet goal -= rb_to_skb(node)->truesize; 508418a4c0eaSEric Dumazet tcp_drop(sk, rb_to_skb(node)); 508572cd43baSEric Dumazet if (!prev || goal <= 0) { 508636a6503fSEric Dumazet sk_mem_reclaim(sk); 508736a6503fSEric Dumazet if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && 508836a6503fSEric Dumazet !tcp_under_memory_pressure(sk)) 508936a6503fSEric Dumazet break; 509072cd43baSEric Dumazet goal = sk->sk_rcvbuf >> 3; 509172cd43baSEric Dumazet } 50929f5afeaeSYaogong Wang node = prev; 50939f5afeaeSYaogong Wang } while (node); 509418a4c0eaSEric Dumazet tp->ooo_last_skb = rb_to_skb(prev); 5095b000cd37SVitaliy Gusev 5096b000cd37SVitaliy Gusev /* Reset SACK state. A conforming SACK implementation will 5097b000cd37SVitaliy Gusev * do the same at a timeout based retransmit. When a connection 5098b000cd37SVitaliy Gusev * is in a sad state like this, we care only about integrity 5099b000cd37SVitaliy Gusev * of the connection not performance. 5100b000cd37SVitaliy Gusev */ 5101b000cd37SVitaliy Gusev if (tp->rx_opt.sack_ok) 5102b000cd37SVitaliy Gusev tcp_sack_reset(&tp->rx_opt); 510336a6503fSEric Dumazet return true; 5104b000cd37SVitaliy Gusev } 5105b000cd37SVitaliy Gusev 51061da177e4SLinus Torvalds /* Reduce allocated memory if we can, trying to get 51071da177e4SLinus Torvalds * the socket within its memory limits again. 51081da177e4SLinus Torvalds * 51091da177e4SLinus Torvalds * Return less than zero if we should start dropping frames 51101da177e4SLinus Torvalds * until the socket owning process reads some of the data 51111da177e4SLinus Torvalds * to stabilize the situation. 
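 *
 * The steps below escalate: first clamp the receive window (or shrink
 * rcv_ssthresh under memory pressure), then collapse the ofo and
 * receive queues, then prune the ofo queue outright, and only if all
 * of that fails return -1 so the caller starts dropping incoming data.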
51121da177e4SLinus Torvalds */ 51131da177e4SLinus Torvalds static int tcp_prune_queue(struct sock *sk) 51141da177e4SLinus Torvalds { 51151da177e4SLinus Torvalds struct tcp_sock *tp = tcp_sk(sk); 51161da177e4SLinus Torvalds 5117c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED); 51181da177e4SLinus Torvalds 51191da177e4SLinus Torvalds if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) 51209e412ba7SIlpo Järvinen tcp_clamp_window(sk); 5121b8da51ebSEric Dumazet else if (tcp_under_memory_pressure(sk)) 51221da177e4SLinus Torvalds tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); 51231da177e4SLinus Torvalds 5124f4a3313dSEric Dumazet if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) 5125f4a3313dSEric Dumazet return 0; 5126f4a3313dSEric Dumazet 51271da177e4SLinus Torvalds tcp_collapse_ofo_queue(sk); 512891521944SDavid S. Miller if (!skb_queue_empty(&sk->sk_receive_queue)) 51299f5afeaeSYaogong Wang tcp_collapse(sk, &sk->sk_receive_queue, NULL, 513091521944SDavid S. Miller skb_peek(&sk->sk_receive_queue), 513191521944SDavid S. Miller NULL, 51321da177e4SLinus Torvalds tp->copied_seq, tp->rcv_nxt); 51333ab224beSHideo Aoki sk_mem_reclaim(sk); 51341da177e4SLinus Torvalds 51351da177e4SLinus Torvalds if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) 51361da177e4SLinus Torvalds return 0; 51371da177e4SLinus Torvalds 51381da177e4SLinus Torvalds /* Collapsing did not help, destructive actions follow. 51391da177e4SLinus Torvalds * This must not ever occur. */ 51401da177e4SLinus Torvalds 5141b000cd37SVitaliy Gusev tcp_prune_ofo_queue(sk); 51421da177e4SLinus Torvalds 51431da177e4SLinus Torvalds if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) 51441da177e4SLinus Torvalds return 0; 51451da177e4SLinus Torvalds 51461da177e4SLinus Torvalds /* If we are really being abused, tell the caller to silently 51471da177e4SLinus Torvalds * drop receive data on the floor. It will get retransmitted 51481da177e4SLinus Torvalds * and hopefully then we'll have sufficient space. 51491da177e4SLinus Torvalds */ 5150c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED); 51511da177e4SLinus Torvalds 51521da177e4SLinus Torvalds /* Massive buffer overcommit. */ 515331770e34SFlorian Westphal tp->pred_flags = 0; 51541da177e4SLinus Torvalds return -1; 51551da177e4SLinus Torvalds } 51561da177e4SLinus Torvalds 5157a2a385d6SEric Dumazet static bool tcp_should_expand_sndbuf(const struct sock *sk) 51580d9901dfSDavid S. Miller { 5159cf533ea5SEric Dumazet const struct tcp_sock *tp = tcp_sk(sk); 51609e412ba7SIlpo Järvinen 51610d9901dfSDavid S. Miller /* If the user specified a specific send buffer setting, do 51620d9901dfSDavid S. Miller * not modify it. 51630d9901dfSDavid S. Miller */ 51640d9901dfSDavid S. Miller if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) 5165a2a385d6SEric Dumazet return false; 51660d9901dfSDavid S. Miller 51670d9901dfSDavid S. Miller /* If we are under global TCP memory pressure, do not expand. */ 5168b8da51ebSEric Dumazet if (tcp_under_memory_pressure(sk)) 5169a2a385d6SEric Dumazet return false; 51700d9901dfSDavid S. Miller 51710d9901dfSDavid S. Miller /* If we are under soft global TCP memory pressure, do not expand. */ 5172180d8cd9SGlauber Costa if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0)) 5173a2a385d6SEric Dumazet return false; 51740d9901dfSDavid S. Miller 51750d9901dfSDavid S. Miller /* If we filled the congestion window, do not expand. 
*/ 51766514890fSNeal Cardwell if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) 5177a2a385d6SEric Dumazet return false; 51780d9901dfSDavid S. Miller 5179a2a385d6SEric Dumazet return true; 51800d9901dfSDavid S. Miller } 51811da177e4SLinus Torvalds 51821da177e4SLinus Torvalds /* When incoming ACK allowed to free some skb from write_queue, 51831da177e4SLinus Torvalds * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket 51841da177e4SLinus Torvalds * on the exit from tcp input handler. 51851da177e4SLinus Torvalds * 51861da177e4SLinus Torvalds * PROBLEM: sndbuf expansion does not work well with largesend. 51871da177e4SLinus Torvalds */ 51881da177e4SLinus Torvalds static void tcp_new_space(struct sock *sk) 51891da177e4SLinus Torvalds { 51901da177e4SLinus Torvalds struct tcp_sock *tp = tcp_sk(sk); 51911da177e4SLinus Torvalds 51929e412ba7SIlpo Järvinen if (tcp_should_expand_sndbuf(sk)) { 51936ae70532SEric Dumazet tcp_sndbuf_expand(sk); 5194c2203cf7SEric Dumazet tp->snd_cwnd_stamp = tcp_jiffies32; 51951da177e4SLinus Torvalds } 51961da177e4SLinus Torvalds 51971da177e4SLinus Torvalds sk->sk_write_space(sk); 51981da177e4SLinus Torvalds } 51991da177e4SLinus Torvalds 520040efc6faSStephen Hemminger static void tcp_check_space(struct sock *sk) 52011da177e4SLinus Torvalds { 52021da177e4SLinus Torvalds if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { 52031da177e4SLinus Torvalds sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); 52043c715127Sjbaron@akamai.com /* pairs with tcp_poll() */ 520556d80622SJason Baron smp_mb(); 52061da177e4SLinus Torvalds if (sk->sk_socket && 5207b0f71bd3SFrancis Yan test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { 52081da177e4SLinus Torvalds tcp_new_space(sk); 5209b0f71bd3SFrancis Yan if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) 5210b0f71bd3SFrancis Yan tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); 5211b0f71bd3SFrancis Yan } 52121da177e4SLinus Torvalds } 52131da177e4SLinus Torvalds } 52141da177e4SLinus Torvalds 52159e412ba7SIlpo Järvinen static inline void tcp_data_snd_check(struct sock *sk) 52161da177e4SLinus Torvalds { 52179e412ba7SIlpo Järvinen tcp_push_pending_frames(sk); 52181da177e4SLinus Torvalds tcp_check_space(sk); 52191da177e4SLinus Torvalds } 52201da177e4SLinus Torvalds 52211da177e4SLinus Torvalds /* 52221da177e4SLinus Torvalds * Check if sending an ack is needed. 52231da177e4SLinus Torvalds */ 52241da177e4SLinus Torvalds static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) 52251da177e4SLinus Torvalds { 52261da177e4SLinus Torvalds struct tcp_sock *tp = tcp_sk(sk); 52275d9f4262SEric Dumazet unsigned long rtt, delay; 52281da177e4SLinus Torvalds 52291da177e4SLinus Torvalds /* More than one full frame received... */ 52309d4fb27dSJoe Perches if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && 52311da177e4SLinus Torvalds /* ... and right edge of window advances far enough. 5232796f82eaSEric Dumazet * (tcp_recvmsg() will send ACK otherwise). 5233796f82eaSEric Dumazet * If application uses SO_RCVLOWAT, we want send ack now if 5234796f82eaSEric Dumazet * we have not received enough bytes to satisfy the condition. 52351da177e4SLinus Torvalds */ 5236796f82eaSEric Dumazet (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || 5237796f82eaSEric Dumazet __tcp_select_window(sk) >= tp->rcv_wnd)) || 52381da177e4SLinus Torvalds /* We ACK each frame or... 
*/ 5239466466dcSYuchung Cheng tcp_in_quickack_mode(sk) || 5240466466dcSYuchung Cheng /* Protocol state mandates a one-time immediate ACK */ 5241466466dcSYuchung Cheng inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) { 52425d9f4262SEric Dumazet send_now: 52431da177e4SLinus Torvalds tcp_send_ack(sk); 52445d9f4262SEric Dumazet return; 52451da177e4SLinus Torvalds } 52465d9f4262SEric Dumazet 52475d9f4262SEric Dumazet if (!ofo_possible || RB_EMPTY_ROOT(&tp->out_of_order_queue)) { 52485d9f4262SEric Dumazet tcp_send_delayed_ack(sk); 52495d9f4262SEric Dumazet return; 52505d9f4262SEric Dumazet } 52515d9f4262SEric Dumazet 52529c21d2fcSEric Dumazet if (!tcp_is_sack(tp) || 52539c21d2fcSEric Dumazet tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr) 52545d9f4262SEric Dumazet goto send_now; 525586de5921SEric Dumazet 525686de5921SEric Dumazet if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) { 525786de5921SEric Dumazet tp->compressed_ack_rcv_nxt = tp->rcv_nxt; 525886de5921SEric Dumazet if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) 525986de5921SEric Dumazet NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, 526086de5921SEric Dumazet tp->compressed_ack - TCP_FASTRETRANS_THRESH); 526186de5921SEric Dumazet tp->compressed_ack = 0; 526286de5921SEric Dumazet } 526386de5921SEric Dumazet 526486de5921SEric Dumazet if (++tp->compressed_ack <= TCP_FASTRETRANS_THRESH) 526586de5921SEric Dumazet goto send_now; 52665d9f4262SEric Dumazet 52675d9f4262SEric Dumazet if (hrtimer_is_queued(&tp->compressed_ack_timer)) 52685d9f4262SEric Dumazet return; 52695d9f4262SEric Dumazet 52706d82aa24SEric Dumazet /* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */ 52715d9f4262SEric Dumazet 52725d9f4262SEric Dumazet rtt = tp->rcv_rtt_est.rtt_us; 52735d9f4262SEric Dumazet if (tp->srtt_us && tp->srtt_us < rtt) 52745d9f4262SEric Dumazet rtt = tp->srtt_us; 52755d9f4262SEric Dumazet 52766d82aa24SEric Dumazet delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns, 52775d9f4262SEric Dumazet rtt * (NSEC_PER_USEC >> 3)/20); 52785d9f4262SEric Dumazet sock_hold(sk); 52795d9f4262SEric Dumazet hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay), 52805d9f4262SEric Dumazet HRTIMER_MODE_REL_PINNED_SOFT); 52811da177e4SLinus Torvalds } 52821da177e4SLinus Torvalds 528340efc6faSStephen Hemminger static inline void tcp_ack_snd_check(struct sock *sk) 52841da177e4SLinus Torvalds { 5285463c84b9SArnaldo Carvalho de Melo if (!inet_csk_ack_scheduled(sk)) { 52861da177e4SLinus Torvalds /* We sent a data segment already. */ 52871da177e4SLinus Torvalds return; 52881da177e4SLinus Torvalds } 52891da177e4SLinus Torvalds __tcp_ack_snd_check(sk, 1); 52901da177e4SLinus Torvalds } 52911da177e4SLinus Torvalds 52921da177e4SLinus Torvalds /* 52931da177e4SLinus Torvalds * This routine is only called when we have urgent data 5294caa20d9aSStephen Hemminger * signaled. Its the 'slow' part of tcp_urg. It could be 52951da177e4SLinus Torvalds * moved inline now as tcp_urg is only called from one 52961da177e4SLinus Torvalds * place. We handle URGent data wrong. We have to - as 52971da177e4SLinus Torvalds * BSD still doesn't use the correction from RFC961. 52981da177e4SLinus Torvalds * For 1003.1g we should support a new option TCP_STDURG to permit 52991da177e4SLinus Torvalds * either form (or just set the sysctl tcp_stdurg). 
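 *
 * With sysctl_tcp_stdurg unset, the code below treats the urgent
 * pointer as naming the byte after the last urgent byte and steps it
 * back by one; with the sysctl set, the pointer is used as transmitted.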
53001da177e4SLinus Torvalds */ 53011da177e4SLinus Torvalds 5302cf533ea5SEric Dumazet static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) 53031da177e4SLinus Torvalds { 53041da177e4SLinus Torvalds struct tcp_sock *tp = tcp_sk(sk); 53051da177e4SLinus Torvalds u32 ptr = ntohs(th->urg_ptr); 53061da177e4SLinus Torvalds 53073f4c7c6fSEric Dumazet if (ptr && !sock_net(sk)->ipv4.sysctl_tcp_stdurg) 53081da177e4SLinus Torvalds ptr--; 53091da177e4SLinus Torvalds ptr += ntohl(th->seq); 53101da177e4SLinus Torvalds 53111da177e4SLinus Torvalds /* Ignore urgent data that we've already seen and read. */ 53121da177e4SLinus Torvalds if (after(tp->copied_seq, ptr)) 53131da177e4SLinus Torvalds return; 53141da177e4SLinus Torvalds 53151da177e4SLinus Torvalds /* Do not replay urg ptr. 53161da177e4SLinus Torvalds * 53171da177e4SLinus Torvalds * NOTE: interesting situation not covered by specs. 53181da177e4SLinus Torvalds * Misbehaving sender may send urg ptr, pointing to segment, 53191da177e4SLinus Torvalds * which we already have in ofo queue. We are not able to fetch 53201da177e4SLinus Torvalds * such data and will stay in TCP_URG_NOTYET until will be eaten 53211da177e4SLinus Torvalds * by recvmsg(). Seems, we are not obliged to handle such wicked 53221da177e4SLinus Torvalds * situations. But it is worth to think about possibility of some 53231da177e4SLinus Torvalds * DoSes using some hypothetical application level deadlock. 53241da177e4SLinus Torvalds */ 53251da177e4SLinus Torvalds if (before(ptr, tp->rcv_nxt)) 53261da177e4SLinus Torvalds return; 53271da177e4SLinus Torvalds 53281da177e4SLinus Torvalds /* Do we already have a newer (or duplicate) urgent pointer? */ 53291da177e4SLinus Torvalds if (tp->urg_data && !after(ptr, tp->urg_seq)) 53301da177e4SLinus Torvalds return; 53311da177e4SLinus Torvalds 53321da177e4SLinus Torvalds /* Tell the world about our new urgent pointer. */ 53331da177e4SLinus Torvalds sk_send_sigurg(sk); 53341da177e4SLinus Torvalds 53351da177e4SLinus Torvalds /* We may be adding urgent data when the last byte read was 53361da177e4SLinus Torvalds * urgent. To do this requires some care. We cannot just ignore 53371da177e4SLinus Torvalds * tp->copied_seq since we would read the last urgent byte again 53381da177e4SLinus Torvalds * as data, nor can we alter copied_seq until this data arrives 5339caa20d9aSStephen Hemminger * or we break the semantics of SIOCATMARK (and thus sockatmark()) 53401da177e4SLinus Torvalds * 53411da177e4SLinus Torvalds * NOTE. Double Dutch. Rendering to plain English: author of comment 53421da177e4SLinus Torvalds * above did something sort of send("A", MSG_OOB); send("B", MSG_OOB); 53431da177e4SLinus Torvalds * and expect that both A and B disappear from stream. This is _wrong_. 53441da177e4SLinus Torvalds * Though this happens in BSD with high probability, this is occasional. 53451da177e4SLinus Torvalds * Any application relying on this is buggy. Note also, that fix "works" 53461da177e4SLinus Torvalds * only in this artificial test. Insert some normal data between A and B and we will 53471da177e4SLinus Torvalds * decline of BSD again. Verdict: it is better to remove to trap 53481da177e4SLinus Torvalds * buggy users. 
53491da177e4SLinus Torvalds */ 53501da177e4SLinus Torvalds if (tp->urg_seq == tp->copied_seq && tp->urg_data && 5351056834d9SIlpo Järvinen !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) { 53521da177e4SLinus Torvalds struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); 53531da177e4SLinus Torvalds tp->copied_seq++; 53541da177e4SLinus Torvalds if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) { 53558728b834SDavid S. Miller __skb_unlink(skb, &sk->sk_receive_queue); 53561da177e4SLinus Torvalds __kfree_skb(skb); 53571da177e4SLinus Torvalds } 53581da177e4SLinus Torvalds } 53591da177e4SLinus Torvalds 53601da177e4SLinus Torvalds tp->urg_data = TCP_URG_NOTYET; 5361d9b55bf7SEric Dumazet WRITE_ONCE(tp->urg_seq, ptr); 536231770e34SFlorian Westphal 536331770e34SFlorian Westphal /* Disable header prediction. */ 536431770e34SFlorian Westphal tp->pred_flags = 0; 53651da177e4SLinus Torvalds } 53661da177e4SLinus Torvalds 53671da177e4SLinus Torvalds /* This is the 'fast' part of urgent handling. */ 5368cf533ea5SEric Dumazet static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th) 53691da177e4SLinus Torvalds { 53701da177e4SLinus Torvalds struct tcp_sock *tp = tcp_sk(sk); 53711da177e4SLinus Torvalds 53721da177e4SLinus Torvalds /* Check if we get a new urgent pointer - normally not. */ 53731da177e4SLinus Torvalds if (th->urg) 53741da177e4SLinus Torvalds tcp_check_urg(sk, th); 53751da177e4SLinus Torvalds 53761da177e4SLinus Torvalds /* Do we wait for any urgent data? - normally not... */ 53771da177e4SLinus Torvalds if (tp->urg_data == TCP_URG_NOTYET) { 53781da177e4SLinus Torvalds u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) - 53791da177e4SLinus Torvalds th->syn; 53801da177e4SLinus Torvalds 53811da177e4SLinus Torvalds /* Is the urgent pointer pointing into this packet? */ 53821da177e4SLinus Torvalds if (ptr < skb->len) { 53831da177e4SLinus Torvalds u8 tmp; 53841da177e4SLinus Torvalds if (skb_copy_bits(skb, ptr, &tmp, 1)) 53851da177e4SLinus Torvalds BUG(); 53861da177e4SLinus Torvalds tp->urg_data = TCP_URG_VALID | tmp; 53871da177e4SLinus Torvalds if (!sock_flag(sk, SOCK_DEAD)) 5388676d2369SDavid S. Miller sk->sk_data_ready(sk); 53891da177e4SLinus Torvalds } 53901da177e4SLinus Torvalds } 53911da177e4SLinus Torvalds } 53921da177e4SLinus Torvalds 53930e40f4c9SJason Baron /* Accept RST for rcv_nxt - 1 after a FIN. 53940e40f4c9SJason Baron * When tcp connections are abruptly terminated from Mac OSX (via ^C), a 53950e40f4c9SJason Baron * FIN is sent followed by a RST packet. The RST is sent with the same 53960e40f4c9SJason Baron * sequence number as the FIN, and thus according to RFC 5961 a challenge 53970e40f4c9SJason Baron * ACK should be sent. However, Mac OSX rate limits replies to challenge 53980e40f4c9SJason Baron * ACKs on the closed socket. In addition middleboxes can drop either the 53990e40f4c9SJason Baron * challenge ACK or a subsequent RST. 
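 *
 * Illustrative timeline (sequence numbers invented for the example):
 *
 *	peer sends FIN, seq = 1000	FIN consumes a sequence number,
 *					so rcv_nxt advances to 1001 and
 *					we sit in e.g. CLOSE_WAIT
 *	peer sends RST, seq = 1000	seq == rcv_nxt - 1, which the
 *					helper below accepts, so the RST
 *					tears the connection down instead
 *					of drawing a challenge ACK.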
54000e40f4c9SJason Baron */ 54010e40f4c9SJason Baron static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb) 54020e40f4c9SJason Baron { 54030e40f4c9SJason Baron struct tcp_sock *tp = tcp_sk(sk); 54040e40f4c9SJason Baron 54050e40f4c9SJason Baron return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) && 54060e40f4c9SJason Baron (1 << sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK | 54070e40f4c9SJason Baron TCPF_CLOSING)); 54080e40f4c9SJason Baron } 54090e40f4c9SJason Baron 5410cbe2d128SIlpo Järvinen /* Does PAWS and seqno based validation of an incoming segment, flags will 5411cbe2d128SIlpo Järvinen * play significant role here. 5412cbe2d128SIlpo Järvinen */ 54130c24604bSEric Dumazet static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, 5414cf533ea5SEric Dumazet const struct tcphdr *th, int syn_inerr) 5415cbe2d128SIlpo Järvinen { 5416cbe2d128SIlpo Järvinen struct tcp_sock *tp = tcp_sk(sk); 5417e00431bcSPau Espin Pedrol bool rst_seq_match = false; 5418cbe2d128SIlpo Järvinen 5419cbe2d128SIlpo Järvinen /* RFC1323: H1. Apply PAWS check first. */ 5420eed29f17SEric Dumazet if (tcp_fast_parse_options(sock_net(sk), skb, th, tp) && 5421eed29f17SEric Dumazet tp->rx_opt.saw_tstamp && 5422cbe2d128SIlpo Järvinen tcp_paws_discard(sk, skb)) { 5423cbe2d128SIlpo Järvinen if (!th->rst) { 5424c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); 5425f2b2c582SNeal Cardwell if (!tcp_oow_rate_limited(sock_net(sk), skb, 5426f2b2c582SNeal Cardwell LINUX_MIB_TCPACKSKIPPEDPAWS, 5427f2b2c582SNeal Cardwell &tp->last_oow_ack_time)) 5428cbe2d128SIlpo Järvinen tcp_send_dupack(sk, skb); 5429cbe2d128SIlpo Järvinen goto discard; 5430cbe2d128SIlpo Järvinen } 5431cbe2d128SIlpo Järvinen /* Reset is accepted even if it did not pass PAWS. */ 5432cbe2d128SIlpo Järvinen } 5433cbe2d128SIlpo Järvinen 5434cbe2d128SIlpo Järvinen /* Step 1: check sequence number */ 5435cbe2d128SIlpo Järvinen if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { 5436cbe2d128SIlpo Järvinen /* RFC793, page 37: "In all states except SYN-SENT, all reset 5437cbe2d128SIlpo Järvinen * (RST) segments are validated by checking their SEQ-fields." 5438cbe2d128SIlpo Järvinen * And page 69: "If an incoming segment is not acceptable, 5439cbe2d128SIlpo Järvinen * an acknowledgment should be sent in reply (unless the RST 5440cbe2d128SIlpo Järvinen * bit is set, if so drop the segment and return)". 
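 *
 * For reference, the tcp_sequence() test used here is essentially a
 * receive-window overlap check, roughly
 *
 *	!before(end_seq, tp->rcv_wup) &&
 *	!after(seq, tp->rcv_nxt + tcp_receive_window(tp))
 *
 * so segments lying entirely to the left of rcv_wup, or entirely
 * beyond the advertised window, fail it and earn at most the
 * rate-limited duplicate ACK sent below.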
5441cbe2d128SIlpo Järvinen */ 5442e3715899SEric Dumazet if (!th->rst) { 5443e3715899SEric Dumazet if (th->syn) 5444e3715899SEric Dumazet goto syn_challenge; 5445f2b2c582SNeal Cardwell if (!tcp_oow_rate_limited(sock_net(sk), skb, 5446f2b2c582SNeal Cardwell LINUX_MIB_TCPACKSKIPPEDSEQ, 5447f2b2c582SNeal Cardwell &tp->last_oow_ack_time)) 5448cbe2d128SIlpo Järvinen tcp_send_dupack(sk, skb); 54490e40f4c9SJason Baron } else if (tcp_reset_check(sk, skb)) { 54500e40f4c9SJason Baron tcp_reset(sk); 5451e3715899SEric Dumazet } 5452cbe2d128SIlpo Järvinen goto discard; 5453cbe2d128SIlpo Järvinen } 5454cbe2d128SIlpo Järvinen 5455cbe2d128SIlpo Järvinen /* Step 2: check RST bit */ 5456cbe2d128SIlpo Järvinen if (th->rst) { 54570e40f4c9SJason Baron /* RFC 5961 3.2 (extend to match against (RCV.NXT - 1) after a 54580e40f4c9SJason Baron * FIN and SACK too if available): 54590e40f4c9SJason Baron * If seq num matches RCV.NXT or (RCV.NXT - 1) after a FIN, or 54600e40f4c9SJason Baron * the right-most SACK block, 5461e00431bcSPau Espin Pedrol * then 5462282f23c6SEric Dumazet * RESET the connection 5463282f23c6SEric Dumazet * else 5464282f23c6SEric Dumazet * Send a challenge ACK 5465282f23c6SEric Dumazet */ 54660e40f4c9SJason Baron if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt || 54670e40f4c9SJason Baron tcp_reset_check(sk, skb)) { 5468e00431bcSPau Espin Pedrol rst_seq_match = true; 5469e00431bcSPau Espin Pedrol } else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) { 5470e00431bcSPau Espin Pedrol struct tcp_sack_block *sp = &tp->selective_acks[0]; 5471e00431bcSPau Espin Pedrol int max_sack = sp[0].end_seq; 5472e00431bcSPau Espin Pedrol int this_sack; 5473e00431bcSPau Espin Pedrol 5474e00431bcSPau Espin Pedrol for (this_sack = 1; this_sack < tp->rx_opt.num_sacks; 5475e00431bcSPau Espin Pedrol ++this_sack) { 5476e00431bcSPau Espin Pedrol max_sack = after(sp[this_sack].end_seq, 5477e00431bcSPau Espin Pedrol max_sack) ? 
5478e00431bcSPau Espin Pedrol sp[this_sack].end_seq : max_sack; 5479e00431bcSPau Espin Pedrol } 5480e00431bcSPau Espin Pedrol 5481e00431bcSPau Espin Pedrol if (TCP_SKB_CB(skb)->seq == max_sack) 5482e00431bcSPau Espin Pedrol rst_seq_match = true; 5483e00431bcSPau Espin Pedrol } 5484e00431bcSPau Espin Pedrol 5485e00431bcSPau Espin Pedrol if (rst_seq_match) 5486cbe2d128SIlpo Järvinen tcp_reset(sk); 5487cf1ef3f0SWei Wang else { 5488cf1ef3f0SWei Wang /* Disable TFO if RST is out-of-order 5489cf1ef3f0SWei Wang * and no data has been received 5490cf1ef3f0SWei Wang * for current active TFO socket 5491cf1ef3f0SWei Wang */ 5492cf1ef3f0SWei Wang if (tp->syn_fastopen && !tp->data_segs_in && 5493cf1ef3f0SWei Wang sk->sk_state == TCP_ESTABLISHED) 549446c2fa39SWei Wang tcp_fastopen_active_disable(sk); 5495f2b2c582SNeal Cardwell tcp_send_challenge_ack(sk, skb); 5496cf1ef3f0SWei Wang } 5497cbe2d128SIlpo Järvinen goto discard; 5498cbe2d128SIlpo Järvinen } 5499cbe2d128SIlpo Järvinen 5500cbe2d128SIlpo Järvinen /* step 3: check security and precedence [ignored] */ 5501cbe2d128SIlpo Järvinen 55020c24604bSEric Dumazet /* step 4: Check for a SYN 5503cd214535SSowmini Varadhan * RFC 5961 4.2 : Send a challenge ack 55040c24604bSEric Dumazet */ 55050c24604bSEric Dumazet if (th->syn) { 5506e3715899SEric Dumazet syn_challenge: 5507cbe2d128SIlpo Järvinen if (syn_inerr) 5508c10d9310SEric Dumazet TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 5509c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); 5510f2b2c582SNeal Cardwell tcp_send_challenge_ack(sk, skb); 55110c24604bSEric Dumazet goto discard; 5512cbe2d128SIlpo Järvinen } 5513cbe2d128SIlpo Järvinen 55140c24604bSEric Dumazet return true; 5515cbe2d128SIlpo Järvinen 5516cbe2d128SIlpo Järvinen discard: 5517532182cdSEric Dumazet tcp_drop(sk, skb); 55180c24604bSEric Dumazet return false; 5519cbe2d128SIlpo Järvinen } 5520cbe2d128SIlpo Järvinen 55211da177e4SLinus Torvalds /* 55221da177e4SLinus Torvalds * TCP receive function for the ESTABLISHED state. 552331770e34SFlorian Westphal * 552431770e34SFlorian Westphal * It is split into a fast path and a slow path. The fast path is 552531770e34SFlorian Westphal * disabled when: 552631770e34SFlorian Westphal * - A zero window was announced from us - zero window probing 552731770e34SFlorian Westphal * is only handled properly in the slow path. 552831770e34SFlorian Westphal * - Out of order segments arrived. 552931770e34SFlorian Westphal * - Urgent data is expected. 553031770e34SFlorian Westphal * - There is no buffer space left 553131770e34SFlorian Westphal * - Unexpected TCP flags/window values/header lengths are received 553231770e34SFlorian Westphal * (detected by checking the TCP header against pred_flags) 553331770e34SFlorian Westphal * - Data is sent in both directions. Fast path only supports pure senders 553431770e34SFlorian Westphal * or pure receivers (this means either the sequence number or the ack 553531770e34SFlorian Westphal * value must stay constant) 553631770e34SFlorian Westphal * - Unexpected TCP option. 553731770e34SFlorian Westphal * 553831770e34SFlorian Westphal * When these conditions are not satisfied it drops into a standard 553931770e34SFlorian Westphal * receive procedure patterned after RFC793 to handle all cases. 554031770e34SFlorian Westphal * The first three cases are guaranteed by proper pred_flags setting, 554131770e34SFlorian Westphal * the rest is checked inline. Fast processing is turned on in 554231770e34SFlorian Westphal * tcp_data_queue when everything is OK. 
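 *
 * Concrete illustration of the fast-path qualification: with
 * timestamps negotiated the header is 20 + 12 = 32 bytes, so the 'S'
 * nibble described below is 8, and an in-sequence segment from a
 * well-behaved peer passes
 *
 *	(tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags
 *
 * with a single 32-bit compare (data offset, ACK-only flags and an
 * unchanged window are all folded into pred_flags), leaving just the
 * two sequence-number checks before the segment is handled inline.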
55431da177e4SLinus Torvalds */ 55443d97d88eSYafang Shao void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) 55451da177e4SLinus Torvalds { 55463d97d88eSYafang Shao const struct tcphdr *th = (const struct tcphdr *)skb->data; 55471da177e4SLinus Torvalds struct tcp_sock *tp = tcp_sk(sk); 55483d97d88eSYafang Shao unsigned int len = skb->len; 55491da177e4SLinus Torvalds 5550c3fde1bdSMasami Hiramatsu /* TCP congestion window tracking */ 5551c3fde1bdSMasami Hiramatsu trace_tcp_probe(sk, skb); 5552c3fde1bdSMasami Hiramatsu 55539a568de4SEric Dumazet tcp_mstamp_refresh(tp); 555451456b29SIan Morris if (unlikely(!sk->sk_rx_dst)) 55555d299f3dSEric Dumazet inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); 555631770e34SFlorian Westphal /* 555731770e34SFlorian Westphal * Header prediction. 555831770e34SFlorian Westphal * The code loosely follows the one in the famous 555931770e34SFlorian Westphal * "30 instruction TCP receive" Van Jacobson mail. 556031770e34SFlorian Westphal * 556131770e34SFlorian Westphal * Van's trick is to deposit buffers into socket queue 556231770e34SFlorian Westphal * on a device interrupt, to call tcp_recv function 556331770e34SFlorian Westphal * on the receive process context and checksum and copy 556431770e34SFlorian Westphal * the buffer to user space. smart... 556531770e34SFlorian Westphal * 556631770e34SFlorian Westphal * Our current scheme is not silly either but we take the 556731770e34SFlorian Westphal * extra cost of the net_bh soft interrupt processing... 556831770e34SFlorian Westphal * We do checksum and copy also but from device to kernel. 556931770e34SFlorian Westphal */ 55701da177e4SLinus Torvalds 55711da177e4SLinus Torvalds tp->rx_opt.saw_tstamp = 0; 55721da177e4SLinus Torvalds 557331770e34SFlorian Westphal /* pred_flags is 0xS?10 << 16 + snd_wnd 557431770e34SFlorian Westphal * if header_prediction is to be made 557531770e34SFlorian Westphal * 'S' will always be tp->tcp_header_len >> 2 557631770e34SFlorian Westphal * '?' will be 0 for the fast path, otherwise pred_flags is 0 to 557731770e34SFlorian Westphal * turn it off (when there are holes in the receive 557831770e34SFlorian Westphal * space for instance) 557931770e34SFlorian Westphal * PSH flag is ignored. 558031770e34SFlorian Westphal */ 558131770e34SFlorian Westphal 558231770e34SFlorian Westphal if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags && 558331770e34SFlorian Westphal TCP_SKB_CB(skb)->seq == tp->rcv_nxt && 558431770e34SFlorian Westphal !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) { 558531770e34SFlorian Westphal int tcp_header_len = tp->tcp_header_len; 558631770e34SFlorian Westphal 558731770e34SFlorian Westphal /* Timestamp header prediction: tcp_header_len 558831770e34SFlorian Westphal * is automatically equal to th->doff*4 due to pred_flags 558931770e34SFlorian Westphal * match. 559031770e34SFlorian Westphal */ 559131770e34SFlorian Westphal 559231770e34SFlorian Westphal /* Check timestamp */ 559331770e34SFlorian Westphal if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) { 559431770e34SFlorian Westphal /* No? Slow path! 
*/ 559531770e34SFlorian Westphal if (!tcp_parse_aligned_timestamp(tp, th)) 559631770e34SFlorian Westphal goto slow_path; 559731770e34SFlorian Westphal 559831770e34SFlorian Westphal /* If PAWS failed, check it more carefully in slow path */ 559931770e34SFlorian Westphal if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0) 560031770e34SFlorian Westphal goto slow_path; 560131770e34SFlorian Westphal 560231770e34SFlorian Westphal /* DO NOT update ts_recent here, if checksum fails 560331770e34SFlorian Westphal * and timestamp was corrupted part, it will result 560431770e34SFlorian Westphal * in a hung connection since we will drop all 560531770e34SFlorian Westphal * future packets due to the PAWS test. 560631770e34SFlorian Westphal */ 560731770e34SFlorian Westphal } 560831770e34SFlorian Westphal 560931770e34SFlorian Westphal if (len <= tcp_header_len) { 561031770e34SFlorian Westphal /* Bulk data transfer: sender */ 561131770e34SFlorian Westphal if (len == tcp_header_len) { 561231770e34SFlorian Westphal /* Predicted packet is in window by definition. 561331770e34SFlorian Westphal * seq == rcv_nxt and rcv_wup <= rcv_nxt. 561431770e34SFlorian Westphal * Hence, check seq<=rcv_wup reduces to: 561531770e34SFlorian Westphal */ 561631770e34SFlorian Westphal if (tcp_header_len == 561731770e34SFlorian Westphal (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && 561831770e34SFlorian Westphal tp->rcv_nxt == tp->rcv_wup) 561931770e34SFlorian Westphal tcp_store_ts_recent(tp); 562031770e34SFlorian Westphal 562131770e34SFlorian Westphal /* We know that such packets are checksummed 562231770e34SFlorian Westphal * on entry. 562331770e34SFlorian Westphal */ 562431770e34SFlorian Westphal tcp_ack(sk, skb, 0); 562531770e34SFlorian Westphal __kfree_skb(skb); 562631770e34SFlorian Westphal tcp_data_snd_check(sk); 56273f6c65d6SWei Wang /* When receiving pure ack in fast path, update 56283f6c65d6SWei Wang * last ts ecr directly instead of calling 56293f6c65d6SWei Wang * tcp_rcv_rtt_measure_ts() 56303f6c65d6SWei Wang */ 56313f6c65d6SWei Wang tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr; 563231770e34SFlorian Westphal return; 563331770e34SFlorian Westphal } else { /* Header too small */ 563431770e34SFlorian Westphal TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 563531770e34SFlorian Westphal goto discard; 563631770e34SFlorian Westphal } 563731770e34SFlorian Westphal } else { 563831770e34SFlorian Westphal int eaten = 0; 563931770e34SFlorian Westphal bool fragstolen = false; 564031770e34SFlorian Westphal 564131770e34SFlorian Westphal if (tcp_checksum_complete(skb)) 564231770e34SFlorian Westphal goto csum_error; 564331770e34SFlorian Westphal 564431770e34SFlorian Westphal if ((int)skb->truesize > sk->sk_forward_alloc) 564531770e34SFlorian Westphal goto step5; 564631770e34SFlorian Westphal 564731770e34SFlorian Westphal /* Predicted packet is in window by definition. 564831770e34SFlorian Westphal * seq == rcv_nxt and rcv_wup <= rcv_nxt. 
564931770e34SFlorian Westphal * Hence, check seq<=rcv_wup reduces to: 565031770e34SFlorian Westphal */ 565131770e34SFlorian Westphal if (tcp_header_len == 565231770e34SFlorian Westphal (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && 565331770e34SFlorian Westphal tp->rcv_nxt == tp->rcv_wup) 565431770e34SFlorian Westphal tcp_store_ts_recent(tp); 565531770e34SFlorian Westphal 565631770e34SFlorian Westphal tcp_rcv_rtt_measure_ts(sk, skb); 565731770e34SFlorian Westphal 565831770e34SFlorian Westphal NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS); 565931770e34SFlorian Westphal 566031770e34SFlorian Westphal /* Bulk data transfer: receiver */ 5661e7395f1fSEric Dumazet __skb_pull(skb, tcp_header_len); 5662e7395f1fSEric Dumazet eaten = tcp_queue_rcv(sk, skb, &fragstolen); 566331770e34SFlorian Westphal 566431770e34SFlorian Westphal tcp_event_data_recv(sk, skb); 566531770e34SFlorian Westphal 566631770e34SFlorian Westphal if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { 566731770e34SFlorian Westphal /* Well, only one small jumplet in fast path... */ 566831770e34SFlorian Westphal tcp_ack(sk, skb, FLAG_DATA); 566931770e34SFlorian Westphal tcp_data_snd_check(sk); 567031770e34SFlorian Westphal if (!inet_csk_ack_scheduled(sk)) 567131770e34SFlorian Westphal goto no_ack; 567231770e34SFlorian Westphal } 567331770e34SFlorian Westphal 567431770e34SFlorian Westphal __tcp_ack_snd_check(sk, 0); 567531770e34SFlorian Westphal no_ack: 567631770e34SFlorian Westphal if (eaten) 567731770e34SFlorian Westphal kfree_skb_partial(skb, fragstolen); 567803f45c88SEric Dumazet tcp_data_ready(sk); 567931770e34SFlorian Westphal return; 568031770e34SFlorian Westphal } 568131770e34SFlorian Westphal } 568231770e34SFlorian Westphal 568331770e34SFlorian Westphal slow_path: 5684fb3477c0SEric Dumazet if (len < (th->doff << 2) || tcp_checksum_complete(skb)) 56851da177e4SLinus Torvalds goto csum_error; 56861da177e4SLinus Torvalds 56870c228e83SCalvin Owens if (!th->ack && !th->rst && !th->syn) 5688c3ae62afSEric Dumazet goto discard; 5689c3ae62afSEric Dumazet 569031770e34SFlorian Westphal /* 569131770e34SFlorian Westphal * Standard slow path. 569231770e34SFlorian Westphal */ 569331770e34SFlorian Westphal 56940c24604bSEric Dumazet if (!tcp_validate_incoming(sk, skb, th, 1)) 5695c995ae22SVijay Subramanian return; 56961da177e4SLinus Torvalds 569731770e34SFlorian Westphal step5: 569831770e34SFlorian Westphal if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0) 569996e0bf4bSJohn Dykstra goto discard; 57001da177e4SLinus Torvalds 5701463c84b9SArnaldo Carvalho de Melo tcp_rcv_rtt_measure_ts(sk, skb); 57021da177e4SLinus Torvalds 57031da177e4SLinus Torvalds /* Process urgent data. 
*/ 57041da177e4SLinus Torvalds tcp_urg(sk, skb, th); 57051da177e4SLinus Torvalds 57061da177e4SLinus Torvalds /* step 7: process the segment text */ 57071da177e4SLinus Torvalds tcp_data_queue(sk, skb); 57081da177e4SLinus Torvalds 57099e412ba7SIlpo Järvinen tcp_data_snd_check(sk); 57101da177e4SLinus Torvalds tcp_ack_snd_check(sk); 5711c995ae22SVijay Subramanian return; 57121da177e4SLinus Torvalds 57131da177e4SLinus Torvalds csum_error: 5714c10d9310SEric Dumazet TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); 5715c10d9310SEric Dumazet TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 57161da177e4SLinus Torvalds 57171da177e4SLinus Torvalds discard: 5718532182cdSEric Dumazet tcp_drop(sk, skb); 57191da177e4SLinus Torvalds } 57204bc2f18bSEric Dumazet EXPORT_SYMBOL(tcp_rcv_established); 57211da177e4SLinus Torvalds 572298fa6271SYuchung Cheng void tcp_init_transfer(struct sock *sk, int bpf_op) 572398fa6271SYuchung Cheng { 572498fa6271SYuchung Cheng struct inet_connection_sock *icsk = inet_csk(sk); 572598fa6271SYuchung Cheng struct tcp_sock *tp = tcp_sk(sk); 572698fa6271SYuchung Cheng 572798fa6271SYuchung Cheng tcp_mtup_init(sk); 572898fa6271SYuchung Cheng icsk->icsk_af_ops->rebuild_header(sk); 572998fa6271SYuchung Cheng tcp_init_metrics(sk); 573098fa6271SYuchung Cheng 573198fa6271SYuchung Cheng /* Initialize the congestion window to start the transfer. 573298fa6271SYuchung Cheng * Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been 573398fa6271SYuchung Cheng * retransmitted. In light of RFC6298 more aggressive 1sec 573498fa6271SYuchung Cheng * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK 573598fa6271SYuchung Cheng * retransmission has occurred. 573698fa6271SYuchung Cheng */ 573798fa6271SYuchung Cheng if (tp->total_retrans > 1 && tp->undo_marker) 573898fa6271SYuchung Cheng tp->snd_cwnd = 1; 573998fa6271SYuchung Cheng else 574098fa6271SYuchung Cheng tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); 574198fa6271SYuchung Cheng tp->snd_cwnd_stamp = tcp_jiffies32; 574298fa6271SYuchung Cheng 574398fa6271SYuchung Cheng tcp_call_bpf(sk, bpf_op, 0, NULL); 574498fa6271SYuchung Cheng tcp_init_congestion_control(sk); 574598fa6271SYuchung Cheng tcp_init_buffer_space(sk); 574698fa6271SYuchung Cheng } 574798fa6271SYuchung Cheng 5748370816aeSPavel Emelyanov void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) 5749370816aeSPavel Emelyanov { 5750370816aeSPavel Emelyanov struct tcp_sock *tp = tcp_sk(sk); 5751370816aeSPavel Emelyanov struct inet_connection_sock *icsk = inet_csk(sk); 5752370816aeSPavel Emelyanov 5753370816aeSPavel Emelyanov tcp_set_state(sk, TCP_ESTABLISHED); 575470eabf0eSEric Dumazet icsk->icsk_ack.lrcvtime = tcp_jiffies32; 5755370816aeSPavel Emelyanov 575600db4124SIan Morris if (skb) { 57575d299f3dSEric Dumazet icsk->icsk_af_ops->sk_rx_dst_set(sk, skb); 5758370816aeSPavel Emelyanov security_inet_conn_established(sk, skb); 5759c6345ce7SAmritha Nambiar sk_mark_napi_id(sk, skb); 576041063e9dSDavid S. Miller } 5761370816aeSPavel Emelyanov 576227204aaaSWei Wang tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB); 5763370816aeSPavel Emelyanov 5764370816aeSPavel Emelyanov /* Prevent spurious tcp_cwnd_restart() on first data 5765370816aeSPavel Emelyanov * packet. 
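 *
 * (tcp_cwnd_restart() decays the congestion window once the sender
 *  has been idle for more than roughly an RTO, measured from
 *  lsndtime; stamping lsndtime with the current time here keeps the
 *  handshake itself from being counted as such an idle period when
 *  the first data packet goes out.)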
5766370816aeSPavel Emelyanov */ 5767d635fbe2SEric Dumazet tp->lsndtime = tcp_jiffies32; 5768370816aeSPavel Emelyanov 5769370816aeSPavel Emelyanov if (sock_flag(sk, SOCK_KEEPOPEN)) 5770370816aeSPavel Emelyanov inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); 577131770e34SFlorian Westphal 577231770e34SFlorian Westphal if (!tp->rx_opt.snd_wscale) 577331770e34SFlorian Westphal __tcp_fast_path_on(tp, tp->snd_wnd); 577431770e34SFlorian Westphal else 577531770e34SFlorian Westphal tp->pred_flags = 0; 5776370816aeSPavel Emelyanov } 5777370816aeSPavel Emelyanov 57788e4178c1SYuchung Cheng static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, 57798e4178c1SYuchung Cheng struct tcp_fastopen_cookie *cookie) 57808e4178c1SYuchung Cheng { 57818e4178c1SYuchung Cheng struct tcp_sock *tp = tcp_sk(sk); 578275c119afSEric Dumazet struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL; 57832646c831SDaniel Lee u16 mss = tp->rx_opt.mss_clamp, try_exp = 0; 57842646c831SDaniel Lee bool syn_drop = false; 57858e4178c1SYuchung Cheng 57868e4178c1SYuchung Cheng if (mss == tp->rx_opt.user_mss) { 57878e4178c1SYuchung Cheng struct tcp_options_received opt; 57888e4178c1SYuchung Cheng 57898e4178c1SYuchung Cheng /* Get original SYNACK MSS value if user MSS sets mss_clamp */ 57908e4178c1SYuchung Cheng tcp_clear_options(&opt); 57918e4178c1SYuchung Cheng opt.user_mss = opt.mss_clamp = 0; 5792eed29f17SEric Dumazet tcp_parse_options(sock_net(sk), synack, &opt, 0, NULL); 57938e4178c1SYuchung Cheng mss = opt.mss_clamp; 57948e4178c1SYuchung Cheng } 57958e4178c1SYuchung Cheng 57962646c831SDaniel Lee if (!tp->syn_fastopen) { 57972646c831SDaniel Lee /* Ignore an unsolicited cookie */ 579867da22d2SYuchung Cheng cookie->len = -1; 57992646c831SDaniel Lee } else if (tp->total_retrans) { 58002646c831SDaniel Lee /* SYN timed out and the SYN-ACK neither has a cookie nor 58012646c831SDaniel Lee * acknowledges data. Presumably the remote received only 58022646c831SDaniel Lee * the retransmitted (regular) SYNs: either the original 58032646c831SDaniel Lee * SYN-data or the corresponding SYN-ACK was dropped. 5804aab48743SYuchung Cheng */ 58052646c831SDaniel Lee syn_drop = (cookie->len < 0 && data); 58062646c831SDaniel Lee } else if (cookie->len < 0 && !tp->syn_data) { 58072646c831SDaniel Lee /* We requested a cookie but didn't get it. If we did not use 58082646c831SDaniel Lee * the (old) exp opt format then try so next time (try_exp=1). 58092646c831SDaniel Lee * Otherwise we go back to use the RFC7413 opt (try_exp=2). 58102646c831SDaniel Lee */ 58112646c831SDaniel Lee try_exp = tp->syn_fastopen_exp ? 
2 : 1; 58122646c831SDaniel Lee } 5813aab48743SYuchung Cheng 58142646c831SDaniel Lee tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp); 58158e4178c1SYuchung Cheng 58168e4178c1SYuchung Cheng if (data) { /* Retransmit unacked data in SYN */ 581775c119afSEric Dumazet skb_rbtree_walk_from(data) { 581875c119afSEric Dumazet if (__tcp_retransmit_skb(sk, data, 1)) 581993b174adSYuchung Cheng break; 582093b174adSYuchung Cheng } 58218e4178c1SYuchung Cheng tcp_rearm_rto(sk); 5822c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), 582302a1d6e7SEric Dumazet LINUX_MIB_TCPFASTOPENACTIVEFAIL); 58248e4178c1SYuchung Cheng return true; 58258e4178c1SYuchung Cheng } 58266f73601eSYuchung Cheng tp->syn_data_acked = tp->syn_data; 5827bef5767fSYuchung Cheng if (tp->syn_data_acked) { 5828bef5767fSYuchung Cheng NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); 5829bef5767fSYuchung Cheng /* SYN-data is counted as two separate packets in tcp_ack() */ 5830bef5767fSYuchung Cheng if (tp->delivered > 1) 5831bef5767fSYuchung Cheng --tp->delivered; 5832bef5767fSYuchung Cheng } 583361d2bcaeSEric Dumazet 583461d2bcaeSEric Dumazet tcp_fastopen_add_skb(sk, synack); 583561d2bcaeSEric Dumazet 58368e4178c1SYuchung Cheng return false; 58378e4178c1SYuchung Cheng } 58388e4178c1SYuchung Cheng 583960e2a778SUrsula Braun static void smc_check_reset_syn(struct tcp_sock *tp) 584060e2a778SUrsula Braun { 584160e2a778SUrsula Braun #if IS_ENABLED(CONFIG_SMC) 584260e2a778SUrsula Braun if (static_branch_unlikely(&tcp_have_smc)) { 584360e2a778SUrsula Braun if (tp->syn_smc && !tp->rx_opt.smc_ok) 584460e2a778SUrsula Braun tp->syn_smc = 0; 584560e2a778SUrsula Braun } 584660e2a778SUrsula Braun #endif 584760e2a778SUrsula Braun } 584860e2a778SUrsula Braun 58497c1f0815SYuchung Cheng static void tcp_try_undo_spurious_syn(struct sock *sk) 58507c1f0815SYuchung Cheng { 58517c1f0815SYuchung Cheng struct tcp_sock *tp = tcp_sk(sk); 58527c1f0815SYuchung Cheng u32 syn_stamp; 58537c1f0815SYuchung Cheng 58547c1f0815SYuchung Cheng /* undo_marker is set when SYN or SYNACK times out. The timeout is 58557c1f0815SYuchung Cheng * spurious if the ACK's timestamp option echo value matches the 58567c1f0815SYuchung Cheng * original SYN timestamp. 
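 *
 * Illustrative timeline (TSval in arbitrary clock ticks):
 *
 *	SYN sent, TSval = 100		also recorded in retrans_stamp
 *	timeout, SYN retransmitted	undo_marker gets set
 *	ACK arrives, TSecr = 100	the echo matches the original
 *					SYN, the timeout was spurious,
 *					and undo_marker is cleared below.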
58577c1f0815SYuchung Cheng */ 58587c1f0815SYuchung Cheng syn_stamp = tp->retrans_stamp; 58597c1f0815SYuchung Cheng if (tp->undo_marker && syn_stamp && tp->rx_opt.saw_tstamp && 58607c1f0815SYuchung Cheng syn_stamp == tp->rx_opt.rcv_tsecr) 58617c1f0815SYuchung Cheng tp->undo_marker = 0; 58627c1f0815SYuchung Cheng } 58637c1f0815SYuchung Cheng 58641da177e4SLinus Torvalds static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, 5865bda07a64SEric Dumazet const struct tcphdr *th) 58661da177e4SLinus Torvalds { 5867d83d8461SArnaldo Carvalho de Melo struct inet_connection_sock *icsk = inet_csk(sk); 58684957faadSWilliam Allen Simpson struct tcp_sock *tp = tcp_sk(sk); 58698e4178c1SYuchung Cheng struct tcp_fastopen_cookie foc = { .len = -1 }; 58704957faadSWilliam Allen Simpson int saved_clamp = tp->rx_opt.mss_clamp; 58710f9fa831SEric Dumazet bool fastopen_fail; 58721da177e4SLinus Torvalds 5873eed29f17SEric Dumazet tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc); 5874e3e12028SAndrew Vagin if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) 5875ee684b6fSAndrey Vagin tp->rx_opt.rcv_tsecr -= tp->tsoffset; 58761da177e4SLinus Torvalds 58771da177e4SLinus Torvalds if (th->ack) { 58781da177e4SLinus Torvalds /* rfc793: 58791da177e4SLinus Torvalds * "If the state is SYN-SENT then 58801da177e4SLinus Torvalds * first check the ACK bit 58811da177e4SLinus Torvalds * If the ACK bit is set 58821da177e4SLinus Torvalds * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send 58831da177e4SLinus Torvalds * a reset (unless the RST bit is set, if so drop 58841da177e4SLinus Torvalds * the segment and return)" 58851da177e4SLinus Torvalds */ 58868e4178c1SYuchung Cheng if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) || 58878e4178c1SYuchung Cheng after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) 58881da177e4SLinus Torvalds goto reset_and_undo; 58891da177e4SLinus Torvalds 58901da177e4SLinus Torvalds if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 58911da177e4SLinus Torvalds !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, 58929a568de4SEric Dumazet tcp_time_stamp(tp))) { 5893c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), 589402a1d6e7SEric Dumazet LINUX_MIB_PAWSACTIVEREJECTED); 58951da177e4SLinus Torvalds goto reset_and_undo; 58961da177e4SLinus Torvalds } 58971da177e4SLinus Torvalds 58981da177e4SLinus Torvalds /* Now ACK is acceptable. 58991da177e4SLinus Torvalds * 59001da177e4SLinus Torvalds * "If the RST bit is set 59011da177e4SLinus Torvalds * If the ACK was acceptable then signal the user "error: 59021da177e4SLinus Torvalds * connection reset", drop the segment, enter CLOSED state, 59031da177e4SLinus Torvalds * delete TCB, and return." 59041da177e4SLinus Torvalds */ 59051da177e4SLinus Torvalds 59061da177e4SLinus Torvalds if (th->rst) { 59071da177e4SLinus Torvalds tcp_reset(sk); 59081da177e4SLinus Torvalds goto discard; 59091da177e4SLinus Torvalds } 59101da177e4SLinus Torvalds 59111da177e4SLinus Torvalds /* rfc793: 59121da177e4SLinus Torvalds * "fifth, if neither of the SYN or RST bits is set then 59131da177e4SLinus Torvalds * drop the segment and return." 59141da177e4SLinus Torvalds * 59151da177e4SLinus Torvalds * See note below! 59161da177e4SLinus Torvalds * --ANK(990513) 59171da177e4SLinus Torvalds */ 59181da177e4SLinus Torvalds if (!th->syn) 59191da177e4SLinus Torvalds goto discard_and_undo; 59201da177e4SLinus Torvalds 59211da177e4SLinus Torvalds /* rfc793: 59221da177e4SLinus Torvalds * "If the SYN bit is on ... 59231da177e4SLinus Torvalds * are acceptable then ... 
59241da177e4SLinus Torvalds * (our SYN has been ACKed), change the connection 59251da177e4SLinus Torvalds * state to ESTABLISHED..." 59261da177e4SLinus Torvalds */ 59271da177e4SLinus Torvalds 5928735d3831SFlorian Westphal tcp_ecn_rcv_synack(tp, th); 59291da177e4SLinus Torvalds 59301b3a6926SRazvan Ghitulete tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); 59317c1f0815SYuchung Cheng tcp_try_undo_spurious_syn(sk); 593231770e34SFlorian Westphal tcp_ack(sk, skb, FLAG_SLOWPATH); 59331da177e4SLinus Torvalds 59341da177e4SLinus Torvalds /* Ok.. it's good. Set up sequence numbers and 59351da177e4SLinus Torvalds * move to established. 59361da177e4SLinus Torvalds */ 5937dba7d9b8SEric Dumazet WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); 59381da177e4SLinus Torvalds tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; 59391da177e4SLinus Torvalds 59401da177e4SLinus Torvalds /* RFC1323: The window in SYN & SYN/ACK segments is 59411da177e4SLinus Torvalds * never scaled. 59421da177e4SLinus Torvalds */ 59431da177e4SLinus Torvalds tp->snd_wnd = ntohs(th->window); 59441da177e4SLinus Torvalds 59451da177e4SLinus Torvalds if (!tp->rx_opt.wscale_ok) { 59461da177e4SLinus Torvalds tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; 59471da177e4SLinus Torvalds tp->window_clamp = min(tp->window_clamp, 65535U); 59481da177e4SLinus Torvalds } 59491da177e4SLinus Torvalds 59501da177e4SLinus Torvalds if (tp->rx_opt.saw_tstamp) { 59511da177e4SLinus Torvalds tp->rx_opt.tstamp_ok = 1; 59521da177e4SLinus Torvalds tp->tcp_header_len = 59531da177e4SLinus Torvalds sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; 59541da177e4SLinus Torvalds tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 59551da177e4SLinus Torvalds tcp_store_ts_recent(tp); 59561da177e4SLinus Torvalds } else { 59571da177e4SLinus Torvalds tp->tcp_header_len = sizeof(struct tcphdr); 59581da177e4SLinus Torvalds } 59591da177e4SLinus Torvalds 5960d83d8461SArnaldo Carvalho de Melo tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); 59611da177e4SLinus Torvalds tcp_initialize_rcv_mss(sk); 59621da177e4SLinus Torvalds 59631da177e4SLinus Torvalds /* Remember, tcp_poll() does not lock socket! 59641da177e4SLinus Torvalds * Change state from SYN-SENT only after copied_seq 59651da177e4SLinus Torvalds * is initialized. */ 59667db48e98SEric Dumazet WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); 59674957faadSWilliam Allen Simpson 596860e2a778SUrsula Braun smc_check_reset_syn(tp); 596960e2a778SUrsula Braun 5970e16aa207SRalf Baechle smp_mb(); 59711da177e4SLinus Torvalds 5972370816aeSPavel Emelyanov tcp_finish_connect(sk, skb); 59731da177e4SLinus Torvalds 59740f9fa831SEric Dumazet fastopen_fail = (tp->syn_fastopen || tp->syn_data) && 59750f9fa831SEric Dumazet tcp_rcv_fastopen_synack(sk, skb, &foc); 59768e4178c1SYuchung Cheng 59770f9fa831SEric Dumazet if (!sock_flag(sk, SOCK_DEAD)) { 59780f9fa831SEric Dumazet sk->sk_state_change(sk); 59790f9fa831SEric Dumazet sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); 59800f9fa831SEric Dumazet } 59810f9fa831SEric Dumazet if (fastopen_fail) 59820f9fa831SEric Dumazet return -1; 5983295f7324SArnaldo Carvalho de Melo if (sk->sk_write_pending || 5984295f7324SArnaldo Carvalho de Melo icsk->icsk_accept_queue.rskq_defer_accept || 598531954cd8SWei Wang inet_csk_in_pingpong_mode(sk)) { 59861da177e4SLinus Torvalds /* Save one ACK. Data will be ready after 59871da177e4SLinus Torvalds * several ticks, if write_pending is set. 
59881da177e4SLinus Torvalds * 59891da177e4SLinus Torvalds * It may be deleted, but with this feature tcpdumps 59901da177e4SLinus Torvalds * look so _wonderfully_ clever, that I was not able 59911da177e4SLinus Torvalds * to stand against the temptation 8) --ANK 59921da177e4SLinus Torvalds */ 5993463c84b9SArnaldo Carvalho de Melo inet_csk_schedule_ack(sk); 59949a9c9b51SEric Dumazet tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); 59953f421baaSArnaldo Carvalho de Melo inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, 59963f421baaSArnaldo Carvalho de Melo TCP_DELACK_MAX, TCP_RTO_MAX); 59971da177e4SLinus Torvalds 59981da177e4SLinus Torvalds discard: 5999532182cdSEric Dumazet tcp_drop(sk, skb); 60001da177e4SLinus Torvalds return 0; 60011da177e4SLinus Torvalds } else { 60021da177e4SLinus Torvalds tcp_send_ack(sk); 60031da177e4SLinus Torvalds } 60041da177e4SLinus Torvalds return -1; 60051da177e4SLinus Torvalds } 60061da177e4SLinus Torvalds 60071da177e4SLinus Torvalds /* No ACK in the segment */ 60081da177e4SLinus Torvalds 60091da177e4SLinus Torvalds if (th->rst) { 60101da177e4SLinus Torvalds /* rfc793: 60111da177e4SLinus Torvalds * "If the RST bit is set 60121da177e4SLinus Torvalds * 60131da177e4SLinus Torvalds * Otherwise (no ACK) drop the segment and return." 60141da177e4SLinus Torvalds */ 60151da177e4SLinus Torvalds 60161da177e4SLinus Torvalds goto discard_and_undo; 60171da177e4SLinus Torvalds } 60181da177e4SLinus Torvalds 60191da177e4SLinus Torvalds /* PAWS check. */ 6020056834d9SIlpo Järvinen if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp && 6021c887e6d2SIlpo Järvinen tcp_paws_reject(&tp->rx_opt, 0)) 60221da177e4SLinus Torvalds goto discard_and_undo; 60231da177e4SLinus Torvalds 60241da177e4SLinus Torvalds if (th->syn) { 60251da177e4SLinus Torvalds /* We see SYN without ACK. It is attempt of 60261da177e4SLinus Torvalds * simultaneous connect with crossed SYNs. 60271da177e4SLinus Torvalds * Particularly, it can be connect to self. 60281da177e4SLinus Torvalds */ 60291da177e4SLinus Torvalds tcp_set_state(sk, TCP_SYN_RECV); 60301da177e4SLinus Torvalds 60311da177e4SLinus Torvalds if (tp->rx_opt.saw_tstamp) { 60321da177e4SLinus Torvalds tp->rx_opt.tstamp_ok = 1; 60331da177e4SLinus Torvalds tcp_store_ts_recent(tp); 60341da177e4SLinus Torvalds tp->tcp_header_len = 60351da177e4SLinus Torvalds sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; 60361da177e4SLinus Torvalds } else { 60371da177e4SLinus Torvalds tp->tcp_header_len = sizeof(struct tcphdr); 60381da177e4SLinus Torvalds } 60391da177e4SLinus Torvalds 6040dba7d9b8SEric Dumazet WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); 60417db48e98SEric Dumazet WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); 60421da177e4SLinus Torvalds tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; 60431da177e4SLinus Torvalds 60441da177e4SLinus Torvalds /* RFC1323: The window in SYN & SYN/ACK segments is 60451da177e4SLinus Torvalds * never scaled. 
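 *
 * E.g. even if a snd_wscale of 7 is eventually negotiated, a window
 * field of 512 in this SYN means 512 bytes, not 512 << 7 = 65536;
 * scaling only applies once the connection is established, hence the
 * plain ntohs() below.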
60461da177e4SLinus Torvalds */ 60471da177e4SLinus Torvalds tp->snd_wnd = ntohs(th->window); 60481da177e4SLinus Torvalds tp->snd_wl1 = TCP_SKB_CB(skb)->seq; 60491da177e4SLinus Torvalds tp->max_window = tp->snd_wnd; 60501da177e4SLinus Torvalds 6051735d3831SFlorian Westphal tcp_ecn_rcv_syn(tp, th); 60521da177e4SLinus Torvalds 60535d424d5aSJohn Heffner tcp_mtup_init(sk); 6054d83d8461SArnaldo Carvalho de Melo tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); 60551da177e4SLinus Torvalds tcp_initialize_rcv_mss(sk); 60561da177e4SLinus Torvalds 60571da177e4SLinus Torvalds tcp_send_synack(sk); 60581da177e4SLinus Torvalds #if 0 60591da177e4SLinus Torvalds /* Note, we could accept data and URG from this segment. 6060168a8f58SJerry Chu * There are no obstacles to make this (except that we must 6061168a8f58SJerry Chu * either change tcp_recvmsg() to prevent it from returning data 6062168a8f58SJerry Chu * before 3WHS completes per RFC793, or employ TCP Fast Open). 60631da177e4SLinus Torvalds * 60641da177e4SLinus Torvalds * However, if we ignore data in ACKless segments sometimes, 60651da177e4SLinus Torvalds * we have no reasons to accept it sometimes. 60661da177e4SLinus Torvalds * Also, seems the code doing it in step6 of tcp_rcv_state_process 60671da177e4SLinus Torvalds * is not flawless. So, discard packet for sanity. 60681da177e4SLinus Torvalds * Uncomment this return to process the data. 60691da177e4SLinus Torvalds */ 60701da177e4SLinus Torvalds return -1; 60711da177e4SLinus Torvalds #else 60721da177e4SLinus Torvalds goto discard; 60731da177e4SLinus Torvalds #endif 60741da177e4SLinus Torvalds } 60751da177e4SLinus Torvalds /* "fifth, if neither of the SYN or RST bits is set then 60761da177e4SLinus Torvalds * drop the segment and return." 60771da177e4SLinus Torvalds */ 60781da177e4SLinus Torvalds 60791da177e4SLinus Torvalds discard_and_undo: 60801da177e4SLinus Torvalds tcp_clear_options(&tp->rx_opt); 60811da177e4SLinus Torvalds tp->rx_opt.mss_clamp = saved_clamp; 60821da177e4SLinus Torvalds goto discard; 60831da177e4SLinus Torvalds 60841da177e4SLinus Torvalds reset_and_undo: 60851da177e4SLinus Torvalds tcp_clear_options(&tp->rx_opt); 60861da177e4SLinus Torvalds tp->rx_opt.mss_clamp = saved_clamp; 60871da177e4SLinus Torvalds return 1; 60881da177e4SLinus Torvalds } 60891da177e4SLinus Torvalds 60906b94b1c8SYuchung Cheng static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) 60916b94b1c8SYuchung Cheng { 6092d983ea6fSEric Dumazet struct request_sock *req; 6093d983ea6fSEric Dumazet 60946b94b1c8SYuchung Cheng tcp_try_undo_loss(sk, false); 6095cd736d8bSYuchung Cheng 6096cd736d8bSYuchung Cheng /* Reset rtx states to prevent spurious retransmits_timed_out() */ 6097cd736d8bSYuchung Cheng tcp_sk(sk)->retrans_stamp = 0; 60986b94b1c8SYuchung Cheng inet_csk(sk)->icsk_retransmits = 0; 60996b94b1c8SYuchung Cheng 61006b94b1c8SYuchung Cheng /* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1, 61016b94b1c8SYuchung Cheng * we no longer need req so release it. 61026b94b1c8SYuchung Cheng */ 6103d983ea6fSEric Dumazet req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 6104d983ea6fSEric Dumazet lockdep_sock_is_held(sk)); 6105d983ea6fSEric Dumazet reqsk_fastopen_remove(sk, req, false); 61066b94b1c8SYuchung Cheng 61076b94b1c8SYuchung Cheng /* Re-arm the timer because data may have been sent out. 61086b94b1c8SYuchung Cheng * This is similar to the regular data transmission case 61096b94b1c8SYuchung Cheng * when new data has just been ack'ed. 
61106b94b1c8SYuchung Cheng * 61116b94b1c8SYuchung Cheng * (TFO) - we could try to be more aggressive and 61126b94b1c8SYuchung Cheng * retransmitting any data sooner based on when they 61136b94b1c8SYuchung Cheng * are sent out. 61146b94b1c8SYuchung Cheng */ 61156b94b1c8SYuchung Cheng tcp_rearm_rto(sk); 61166b94b1c8SYuchung Cheng } 61176b94b1c8SYuchung Cheng 61181da177e4SLinus Torvalds /* 61191da177e4SLinus Torvalds * This function implements the receiving procedure of RFC 793 for 61201da177e4SLinus Torvalds * all states except ESTABLISHED and TIME_WAIT. 61211da177e4SLinus Torvalds * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be 61221da177e4SLinus Torvalds * address independent. 61231da177e4SLinus Torvalds */ 61241da177e4SLinus Torvalds 612572ab4a86SEric Dumazet int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) 61261da177e4SLinus Torvalds { 61271da177e4SLinus Torvalds struct tcp_sock *tp = tcp_sk(sk); 61288292a17aSArnaldo Carvalho de Melo struct inet_connection_sock *icsk = inet_csk(sk); 612972ab4a86SEric Dumazet const struct tcphdr *th = tcp_hdr(skb); 6130168a8f58SJerry Chu struct request_sock *req; 61311da177e4SLinus Torvalds int queued = 0; 61321f6afc81SEric Dumazet bool acceptable; 61331da177e4SLinus Torvalds 61341da177e4SLinus Torvalds switch (sk->sk_state) { 61351da177e4SLinus Torvalds case TCP_CLOSE: 61361da177e4SLinus Torvalds goto discard; 61371da177e4SLinus Torvalds 61381da177e4SLinus Torvalds case TCP_LISTEN: 61391da177e4SLinus Torvalds if (th->ack) 61401da177e4SLinus Torvalds return 1; 61411da177e4SLinus Torvalds 61421da177e4SLinus Torvalds if (th->rst) 61431da177e4SLinus Torvalds goto discard; 61441da177e4SLinus Torvalds 61451da177e4SLinus Torvalds if (th->syn) { 6146fdf5af0dSEric Dumazet if (th->fin) 6147fdf5af0dSEric Dumazet goto discard; 6148449809a6SEric Dumazet /* It is possible that we process SYN packets from backlog, 61491ad98e9dSEric Dumazet * so we need to make sure to disable BH and RCU right there. 6150449809a6SEric Dumazet */ 61511ad98e9dSEric Dumazet rcu_read_lock(); 6152449809a6SEric Dumazet local_bh_disable(); 6153449809a6SEric Dumazet acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0; 6154449809a6SEric Dumazet local_bh_enable(); 61551ad98e9dSEric Dumazet rcu_read_unlock(); 61561da177e4SLinus Torvalds 6157449809a6SEric Dumazet if (!acceptable) 6158449809a6SEric Dumazet return 1; 61590aea76d3SEric Dumazet consume_skb(skb); 6160fb7e2399SMasayuki Nakagawa return 0; 61611da177e4SLinus Torvalds } 61621da177e4SLinus Torvalds goto discard; 61631da177e4SLinus Torvalds 61641da177e4SLinus Torvalds case TCP_SYN_SENT: 61658804b272SEric Dumazet tp->rx_opt.saw_tstamp = 0; 61669a568de4SEric Dumazet tcp_mstamp_refresh(tp); 6167bda07a64SEric Dumazet queued = tcp_rcv_synsent_state_process(sk, skb, th); 61681da177e4SLinus Torvalds if (queued >= 0) 61691da177e4SLinus Torvalds return queued; 61701da177e4SLinus Torvalds 61711da177e4SLinus Torvalds /* Do step6 onward by hand. 
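 *
 * ("step6" refers to RFC 793's numbered segment-arrival steps: sixth,
 *  check the URG bit; seventh, process the segment text.  Steps one
 *  to five were already covered by the SYN-SENT handler above, so
 *  only the urgent-data check and the pending-send check are done by
 *  hand here; the segment's payload, if any, is dropped.)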
*/ 61721da177e4SLinus Torvalds tcp_urg(sk, skb, th); 61731da177e4SLinus Torvalds __kfree_skb(skb); 61749e412ba7SIlpo Järvinen tcp_data_snd_check(sk); 61751da177e4SLinus Torvalds return 0; 61761da177e4SLinus Torvalds } 61771da177e4SLinus Torvalds 61789a568de4SEric Dumazet tcp_mstamp_refresh(tp); 61798804b272SEric Dumazet tp->rx_opt.saw_tstamp = 0; 6180d983ea6fSEric Dumazet req = rcu_dereference_protected(tp->fastopen_rsk, 6181d983ea6fSEric Dumazet lockdep_sock_is_held(sk)); 618200db4124SIan Morris if (req) { 6183e0f9759fSEric Dumazet bool req_stolen; 6184e0f9759fSEric Dumazet 618537561f68SJerry Chu WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && 6186168a8f58SJerry Chu sk->sk_state != TCP_FIN_WAIT1); 6187168a8f58SJerry Chu 6188e0f9759fSEric Dumazet if (!tcp_check_req(sk, skb, req, true, &req_stolen)) 6189168a8f58SJerry Chu goto discard; 6190e69bebdeSNeal Cardwell } 6191c3ae62afSEric Dumazet 61920c228e83SCalvin Owens if (!th->ack && !th->rst && !th->syn) 6193c3ae62afSEric Dumazet goto discard; 6194c3ae62afSEric Dumazet 6195e69bebdeSNeal Cardwell if (!tcp_validate_incoming(sk, skb, th, 0)) 61960c24604bSEric Dumazet return 0; 61971da177e4SLinus Torvalds 61981da177e4SLinus Torvalds /* step 5: check the ACK field */ 619931770e34SFlorian Westphal acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH | 620031770e34SFlorian Westphal FLAG_UPDATE_TS_RECENT | 6201d0e1a1b5SEric Dumazet FLAG_NO_CHALLENGE_ACK) > 0; 62021da177e4SLinus Torvalds 6203d0e1a1b5SEric Dumazet if (!acceptable) { 6204d0e1a1b5SEric Dumazet if (sk->sk_state == TCP_SYN_RECV) 6205d0e1a1b5SEric Dumazet return 1; /* send one RST */ 6206d0e1a1b5SEric Dumazet tcp_send_challenge_ack(sk, skb); 6207d0e1a1b5SEric Dumazet goto discard; 6208d0e1a1b5SEric Dumazet } 62091da177e4SLinus Torvalds switch (sk->sk_state) { 62101da177e4SLinus Torvalds case TCP_SYN_RECV: 6211bef5767fSYuchung Cheng tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */ 62120f1c28aeSYuchung Cheng if (!tp->srtt_us) 62130f1c28aeSYuchung Cheng tcp_synack_rtt_meas(sk, req); 62140f1c28aeSYuchung Cheng 6215168a8f58SJerry Chu if (req) { 62166b94b1c8SYuchung Cheng tcp_rcv_synrecv_state_fastopen(sk); 6217168a8f58SJerry Chu } else { 6218336c39a0SYuchung Cheng tcp_try_undo_spurious_syn(sk); 6219336c39a0SYuchung Cheng tp->retrans_stamp = 0; 622027204aaaSWei Wang tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB); 62217db48e98SEric Dumazet WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); 6222168a8f58SJerry Chu } 6223e16aa207SRalf Baechle smp_mb(); 62241da177e4SLinus Torvalds tcp_set_state(sk, TCP_ESTABLISHED); 62251da177e4SLinus Torvalds sk->sk_state_change(sk); 62261da177e4SLinus Torvalds 622761eb9003SJoe Perches /* Note, that this wakeup is only for marginal crossed SYN case. 622861eb9003SJoe Perches * Passively open sockets are not waked up, because 622961eb9003SJoe Perches * sk->sk_sleep == NULL and sk->sk_socket == NULL. 
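 *
 * (A crossed-SYN socket reached SYN_RECV from SYN_SENT because both
 *  ends called connect() towards each other at the same time, so it
 *  still has a live struct socket and possibly a sleeping writer
 *  worth waking; a child minted from a listener's request sock does
 *  not, as noted above.)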
62301da177e4SLinus Torvalds */ 62318d8ad9d7SPavel Emelyanov if (sk->sk_socket) 62321f6afc81SEric Dumazet sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); 62331da177e4SLinus Torvalds 62341da177e4SLinus Torvalds tp->snd_una = TCP_SKB_CB(skb)->ack_seq; 623561eb9003SJoe Perches tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale; 6236ee7537b6SHantzis Fotis tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); 62371da177e4SLinus Torvalds 62381da177e4SLinus Torvalds if (tp->rx_opt.tstamp_ok) 62391da177e4SLinus Torvalds tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 62401da177e4SLinus Torvalds 6241c0402760SYuchung Cheng if (!inet_csk(sk)->icsk_ca_ops->cong_control) 624202cf4ebdSNeal Cardwell tcp_update_pacing_rate(sk); 624302cf4ebdSNeal Cardwell 624461eb9003SJoe Perches /* Prevent spurious tcp_cwnd_restart() on first data packet */ 6245d635fbe2SEric Dumazet tp->lsndtime = tcp_jiffies32; 62461da177e4SLinus Torvalds 62471da177e4SLinus Torvalds tcp_initialize_rcv_mss(sk); 624831770e34SFlorian Westphal tcp_fast_path_on(tp); 62491da177e4SLinus Torvalds break; 62501da177e4SLinus Torvalds 6251c48b22daSJoe Perches case TCP_FIN_WAIT1: { 6252c48b22daSJoe Perches int tmo; 6253c48b22daSJoe Perches 62546b94b1c8SYuchung Cheng if (req) 62556b94b1c8SYuchung Cheng tcp_rcv_synrecv_state_fastopen(sk); 62566b94b1c8SYuchung Cheng 6257c48b22daSJoe Perches if (tp->snd_una != tp->write_seq) 6258c48b22daSJoe Perches break; 62595110effeSDavid S. Miller 62601da177e4SLinus Torvalds tcp_set_state(sk, TCP_FIN_WAIT2); 62611da177e4SLinus Torvalds sk->sk_shutdown |= SEND_SHUTDOWN; 62625110effeSDavid S. Miller 6263c3a2e837SJulian Anastasov sk_dst_confirm(sk); 62641da177e4SLinus Torvalds 62651f6afc81SEric Dumazet if (!sock_flag(sk, SOCK_DEAD)) { 62661da177e4SLinus Torvalds /* Wake up lingering close() */ 62671da177e4SLinus Torvalds sk->sk_state_change(sk); 6268c48b22daSJoe Perches break; 6269c48b22daSJoe Perches } 62701da177e4SLinus Torvalds 6271cf1ef3f0SWei Wang if (tp->linger2 < 0) { 6272cf1ef3f0SWei Wang tcp_done(sk); 6273cf1ef3f0SWei Wang NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); 6274cf1ef3f0SWei Wang return 1; 6275cf1ef3f0SWei Wang } 6276cf1ef3f0SWei Wang if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && 6277cf1ef3f0SWei Wang after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { 6278cf1ef3f0SWei Wang /* Receive out of order FIN after close() */ 6279cf1ef3f0SWei Wang if (tp->syn_fastopen && th->fin) 628046c2fa39SWei Wang tcp_fastopen_active_disable(sk); 62811da177e4SLinus Torvalds tcp_done(sk); 6282c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); 62831da177e4SLinus Torvalds return 1; 62841da177e4SLinus Torvalds } 62851da177e4SLinus Torvalds 6286463c84b9SArnaldo Carvalho de Melo tmo = tcp_fin_time(sk); 62871da177e4SLinus Torvalds if (tmo > TCP_TIMEWAIT_LEN) { 6288463c84b9SArnaldo Carvalho de Melo inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); 62891da177e4SLinus Torvalds } else if (th->fin || sock_owned_by_user(sk)) { 62901da177e4SLinus Torvalds /* Bad case. We could lose such FIN otherwise. 62911da177e4SLinus Torvalds * It is not a big problem, but it looks confusing 62921da177e4SLinus Torvalds * and not so rare event. We still can lose it now, 62931da177e4SLinus Torvalds * if it spins in bh_lock_sock(), but it is really 62941da177e4SLinus Torvalds * marginal case. 
62951da177e4SLinus Torvalds */ 6296463c84b9SArnaldo Carvalho de Melo inet_csk_reset_keepalive_timer(sk, tmo); 62971da177e4SLinus Torvalds } else { 62981da177e4SLinus Torvalds tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); 62991da177e4SLinus Torvalds goto discard; 63001da177e4SLinus Torvalds } 63011da177e4SLinus Torvalds break; 6302c48b22daSJoe Perches } 63031da177e4SLinus Torvalds 63041da177e4SLinus Torvalds case TCP_CLOSING: 63051da177e4SLinus Torvalds if (tp->snd_una == tp->write_seq) { 63061da177e4SLinus Torvalds tcp_time_wait(sk, TCP_TIME_WAIT, 0); 63071da177e4SLinus Torvalds goto discard; 63081da177e4SLinus Torvalds } 63091da177e4SLinus Torvalds break; 63101da177e4SLinus Torvalds 63111da177e4SLinus Torvalds case TCP_LAST_ACK: 63121da177e4SLinus Torvalds if (tp->snd_una == tp->write_seq) { 63131da177e4SLinus Torvalds tcp_update_metrics(sk); 63141da177e4SLinus Torvalds tcp_done(sk); 63151da177e4SLinus Torvalds goto discard; 63161da177e4SLinus Torvalds } 63171da177e4SLinus Torvalds break; 63181da177e4SLinus Torvalds } 63191da177e4SLinus Torvalds 63201da177e4SLinus Torvalds /* step 6: check the URG bit */ 63211da177e4SLinus Torvalds tcp_urg(sk, skb, th); 63221da177e4SLinus Torvalds 63231da177e4SLinus Torvalds /* step 7: process the segment text */ 63241da177e4SLinus Torvalds switch (sk->sk_state) { 63251da177e4SLinus Torvalds case TCP_CLOSE_WAIT: 63261da177e4SLinus Torvalds case TCP_CLOSING: 63271da177e4SLinus Torvalds case TCP_LAST_ACK: 63281da177e4SLinus Torvalds if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) 63291da177e4SLinus Torvalds break; 6330fcfd6dfaSGustavo A. R. Silva /* fall through */ 63311da177e4SLinus Torvalds case TCP_FIN_WAIT1: 63321da177e4SLinus Torvalds case TCP_FIN_WAIT2: 63331da177e4SLinus Torvalds /* RFC 793 says to queue data in these states, 63341da177e4SLinus Torvalds * RFC 1122 says we MUST send a reset. 63351da177e4SLinus Torvalds * BSD 4.4 also does reset. 
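 *
 * (The RFC 1122 requirement referred to above is 4.2.2.13: if new
 *  data arrives after the application has closed the receive side,
 *  the connection should be reset to signal that data was lost,
 *  which is what the TCPABORTONDATA path below implements.)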
63361da177e4SLinus Torvalds */ 63371da177e4SLinus Torvalds if (sk->sk_shutdown & RCV_SHUTDOWN) { 63381da177e4SLinus Torvalds if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && 63391da177e4SLinus Torvalds after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { 6340c10d9310SEric Dumazet NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); 63411da177e4SLinus Torvalds tcp_reset(sk); 63421da177e4SLinus Torvalds return 1; 63431da177e4SLinus Torvalds } 63441da177e4SLinus Torvalds } 63451da177e4SLinus Torvalds /* Fall through */ 63461da177e4SLinus Torvalds case TCP_ESTABLISHED: 63471da177e4SLinus Torvalds tcp_data_queue(sk, skb); 63481da177e4SLinus Torvalds queued = 1; 63491da177e4SLinus Torvalds break; 63501da177e4SLinus Torvalds } 63511da177e4SLinus Torvalds 63521da177e4SLinus Torvalds /* tcp_data could move socket to TIME-WAIT */ 63531da177e4SLinus Torvalds if (sk->sk_state != TCP_CLOSE) { 63549e412ba7SIlpo Järvinen tcp_data_snd_check(sk); 63551da177e4SLinus Torvalds tcp_ack_snd_check(sk); 63561da177e4SLinus Torvalds } 63571da177e4SLinus Torvalds 63581da177e4SLinus Torvalds if (!queued) { 63591da177e4SLinus Torvalds discard: 6360532182cdSEric Dumazet tcp_drop(sk, skb); 63611da177e4SLinus Torvalds } 63621da177e4SLinus Torvalds return 0; 63631da177e4SLinus Torvalds } 63641da177e4SLinus Torvalds EXPORT_SYMBOL(tcp_rcv_state_process); 63651fb6f159SOctavian Purdila 63661fb6f159SOctavian Purdila static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) 63671fb6f159SOctavian Purdila { 63681fb6f159SOctavian Purdila struct inet_request_sock *ireq = inet_rsk(req); 63691fb6f159SOctavian Purdila 63701fb6f159SOctavian Purdila if (family == AF_INET) 6371ba7a46f1SJoe Perches net_dbg_ratelimited("drop open request from %pI4/%u\n", 63721fb6f159SOctavian Purdila &ireq->ir_rmt_addr, port); 63734135ab82SOctavian Purdila #if IS_ENABLED(CONFIG_IPV6) 63744135ab82SOctavian Purdila else if (family == AF_INET6) 6375ba7a46f1SJoe Perches net_dbg_ratelimited("drop open request from %pI6/%u\n", 63761fb6f159SOctavian Purdila &ireq->ir_v6_rmt_addr, port); 63774135ab82SOctavian Purdila #endif 63781fb6f159SOctavian Purdila } 63791fb6f159SOctavian Purdila 6380d82bd122SFlorian Westphal /* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set 6381d82bd122SFlorian Westphal * 6382d82bd122SFlorian Westphal * If we receive a SYN packet with these bits set, it means a 6383d82bd122SFlorian Westphal * network is playing bad games with TOS bits. In order to 6384d82bd122SFlorian Westphal * avoid possible false congestion notifications, we disable 6385f4e715c3Sstephen hemminger * TCP ECN negotiation. 6386d82bd122SFlorian Westphal * 6387d82bd122SFlorian Westphal * Exception: tcp_ca wants ECN. This is required for DCTCP 6388843c2fdfSFlorian Westphal * congestion control: Linux DCTCP asserts ECT on all packets, 6389843c2fdfSFlorian Westphal * including SYN, which is most optimal solution; however, 6390843c2fdfSFlorian Westphal * others, such as FreeBSD do not. 6391f6fee16dSTilmans, Olivier (Nokia - BE/Antwerp) * 6392f6fee16dSTilmans, Olivier (Nokia - BE/Antwerp) * Exception: At least one of the reserved bits of the TCP header (th->res1) is 6393f6fee16dSTilmans, Olivier (Nokia - BE/Antwerp) * set, indicating the use of a future TCP extension (such as AccECN). See 6394f6fee16dSTilmans, Olivier (Nokia - BE/Antwerp) * RFC8311 §4.3 which updates RFC3168 to allow the development of such 6395f6fee16dSTilmans, Olivier (Nokia - BE/Antwerp) * extensions. 
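 *
 * Worked example of the rules implemented below: a SYN carrying ECE
 * and CWR, arriving in a not-ECT IP packet while net.ipv4.tcp_ecn is
 * enabled (or the route asks for ECN), gets ecn_ok = 1.  The same SYN
 * arriving with ECT set in the IP header and all reserved TCP header
 * bits clear is treated as a sign of a misbehaving network and ECN is
 * not negotiated, unless the congestion control, a route metric or a
 * BPF program explicitly insists on ECN.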

static void tcp_openreq_init(struct request_sock *req,
			     const struct tcp_options_received *rx_opt,
			     struct sk_buff *skb, const struct sock *sk)
{
	struct inet_request_sock *ireq = inet_rsk(req);

	req->rsk_rcv_wnd = 0;		/* So that tcp_send_synack() knows! */
	req->cookie_ts = 0;
	tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
	tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
	tcp_rsk(req)->snt_synack = 0;
	tcp_rsk(req)->last_oow_ack_time = 0;
	req->mss = rx_opt->mss_clamp;
	req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
	ireq->tstamp_ok = rx_opt->tstamp_ok;
	ireq->sack_ok = rx_opt->sack_ok;
	ireq->snd_wscale = rx_opt->snd_wscale;
	ireq->wscale_ok = rx_opt->wscale_ok;
	ireq->acked = 0;
	ireq->ecn_ok = 0;
	ireq->ir_rmt_port = tcp_hdr(skb)->source;
	ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
	ireq->ir_mark = inet_request_mark(sk, skb);
#if IS_ENABLED(CONFIG_SMC)
	ireq->smc_ok = rx_opt->smc_ok;
#endif
}
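
/* Editor's note (worked example, not part of the upstream code): for a
 * SYN with sequence number S that advertises "MSS 1460, wscale 7,
 * sackOK, TS val 123" (and no TCP_MAXSEG clamp on the listener), the
 * request sock above ends up with rcv_isn = S and rcv_nxt = S + 1 (the
 * SYN consumes one sequence number, so the SYN-ACK acknowledges S + 1),
 * req->mss = 1460, snd_wscale = 7, wscale_ok = sack_ok = tstamp_ok = 1
 * and ts_recent = 123.  ecn_ok is deliberately left 0 here; it is only
 * set later, by tcp_ecn_create_request(), from tcp_conn_request().
 */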

struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
				      struct sock *sk_listener,
				      bool attach_listener)
{
	struct request_sock *req = reqsk_alloc(ops, sk_listener,
					       attach_listener);

	if (req) {
		struct inet_request_sock *ireq = inet_rsk(req);

		ireq->ireq_opt = NULL;
#if IS_ENABLED(CONFIG_IPV6)
		ireq->pktopts = NULL;
#endif
		atomic64_set(&ireq->ir_cookie, 0);
		ireq->ireq_state = TCP_NEW_SYN_RECV;
		write_pnet(&ireq->ireq_net, sock_net(sk_listener));
		ireq->ireq_family = sk_listener->sk_family;
	}

	return req;
}
EXPORT_SYMBOL(inet_reqsk_alloc);

/*
 * Return true if a syncookie should be sent
 */
static bool tcp_syn_flood_action(const struct sock *sk, const char *proto)
{
	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
	const char *msg = "Dropping request";
	bool want_cookie = false;
	struct net *net = sock_net(sk);

#ifdef CONFIG_SYN_COOKIES
	if (net->ipv4.sysctl_tcp_syncookies) {
		msg = "Sending cookies";
		want_cookie = true;
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
	} else
#endif
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);

	if (!queue->synflood_warned &&
	    net->ipv4.sysctl_tcp_syncookies != 2 &&
	    xchg(&queue->synflood_warned, 1) == 0)
		net_info_ratelimited("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
				     proto, sk->sk_num, msg);

	return want_cookie;
}
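
/* Editor's note (behavioural summary, not part of the upstream code):
 * tcp_syn_flood_action() is only reached when the SYN backlog is under
 * pressure, or when net.ipv4.tcp_syncookies == 2, which requests
 * cookies unconditionally.  With CONFIG_SYN_COOKIES enabled and
 * tcp_syncookies == 0 it returns false and the SYN is dropped
 * (TCPReqQFullDrop); with tcp_syncookies == 1 or 2 it returns true and
 * a cookie is sent instead (TCPReqQFullDoCookies).  The "Possible SYN
 * flooding" message is logged at most once per listen socket and is
 * suppressed entirely when tcp_syncookies == 2, since cookies are then
 * expected rather than a sign of attack.
 */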

static void tcp_reqsk_record_syn(const struct sock *sk,
				 struct request_sock *req,
				 const struct sk_buff *skb)
{
	if (tcp_sk(sk)->save_syn) {
		u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
		u32 *copy;

		copy = kmalloc(len + sizeof(u32), GFP_ATOMIC);
		if (copy) {
			copy[0] = len;
			memcpy(&copy[1], skb_network_header(skb), len);
			req->saved_syn = copy;
		}
	}
}
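
/* Editor's note (illustrative layout, not part of the upstream code):
 * when the listener has enabled the TCP_SAVE_SYN socket option
 * (tcp_sk(sk)->save_syn), req->saved_syn points at a buffer laid out as
 *
 *	copy[0]                       = len, the combined header length
 *	(u8 *)&copy[1] .. + len - 1   = raw network (IPv4/IPv6) + TCP
 *	                                headers of the original SYN
 *
 * which is what a later getsockopt(TCP_SAVED_SYN) on the accepted
 * socket hands back to user space.
 */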

/* If a SYN cookie is required and supported, returns a clamped MSS value to be
 * used for SYN cookie generation.
 */
u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
			  const struct tcp_request_sock_ops *af_ops,
			  struct sock *sk, struct tcphdr *th)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u16 mss;

	if (sock_net(sk)->ipv4.sysctl_tcp_syncookies != 2 &&
	    !inet_csk_reqsk_queue_is_full(sk))
		return 0;

	if (!tcp_syn_flood_action(sk, rsk_ops->slab_name))
		return 0;

	if (sk_acceptq_is_full(sk)) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
		return 0;
	}

	mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss);
	if (!mss)
		mss = af_ops->mss_clamp;

	return mss;
}
EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss);

int tcp_conn_request(struct request_sock_ops *rsk_ops,
		     const struct tcp_request_sock_ops *af_ops,
		     struct sock *sk, struct sk_buff *skb)
{
	struct tcp_fastopen_cookie foc = { .len = -1 };
	__u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
	struct tcp_options_received tmp_opt;
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	struct sock *fastopen_sk = NULL;
	struct request_sock *req;
	bool want_cookie = false;
	struct dst_entry *dst;
	struct flowi fl;

	/* TW buckets are converted to open requests without
	 * limitations; they conserve resources and the peer is
	 * evidently a real one.
	 */
	if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
	     inet_csk_reqsk_queue_is_full(sk)) && !isn) {
		want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
		if (!want_cookie)
			goto drop;
	}

	if (sk_acceptq_is_full(sk)) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
		goto drop;
	}

	req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
	if (!req)
		goto drop;

	tcp_rsk(req)->af_specific = af_ops;
	tcp_rsk(req)->ts_off = 0;

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = af_ops->mss_clamp;
	tmp_opt.user_mss = tp->rx_opt.user_mss;
	tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
			  want_cookie ? NULL : &foc);

	if (want_cookie && !tmp_opt.saw_tstamp)
		tcp_clear_options(&tmp_opt);

	if (IS_ENABLED(CONFIG_SMC) && want_cookie)
		tmp_opt.smc_ok = 0;

	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
	tcp_openreq_init(req, &tmp_opt, skb, sk);
	inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent;

	/* Note: tcp_v6_init_req() might override ir_iif for link locals */
	inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);

	af_ops->init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		goto drop_and_free;

	if (tmp_opt.tstamp_ok)
		tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb);

	dst = af_ops->route_req(sk, &fl, req);
	if (!dst)
		goto drop_and_free;

	if (!want_cookie && !isn) {
		/* Kill the following clause if you dislike this way. */
		if (!net->ipv4.sysctl_tcp_syncookies &&
		    (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
		     (net->ipv4.sysctl_max_syn_backlog >> 2)) &&
		    !tcp_peer_is_proven(req, dst)) {
			/* Without syncookies the last quarter of the
			 * backlog is filled with destinations proven
			 * to be alive.  It means that we continue to
			 * communicate with destinations already
			 * remembered from the moment of the synflood.
			 */
			pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
				    rsk_ops->family);
			goto drop_and_release;
		}

		isn = af_ops->init_seq(skb);
	}
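
	/* Editor's note (worked example, not part of the upstream code):
	 * with net.ipv4.tcp_syncookies = 0 and
	 * net.ipv4.tcp_max_syn_backlog = 256, the clause above fires once
	 * fewer than 256 >> 2 = 64 request slots remain, i.e. once more
	 * than 192 requests are already queued.  From that point on a SYN
	 * from a peer that tcp_peer_is_proven() does not recognise (no
	 * usable entry in the TCP metrics cache) is logged by
	 * pr_drop_req() and dropped, while previously seen peers can
	 * still get in.
	 */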

	tcp_ecn_create_request(req, skb, sk, dst);

	if (want_cookie) {
		isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
		req->cookie_ts = tmp_opt.tstamp_ok;
		if (!tmp_opt.tstamp_ok)
			inet_rsk(req)->ecn_ok = 0;
	}

	tcp_rsk(req)->snt_isn = isn;
	tcp_rsk(req)->txhash = net_tx_rndhash();
	tcp_openreq_init_rwin(req, sk, dst);
	sk_rx_queue_set(req_to_sk(req), skb);
	if (!want_cookie) {
		tcp_reqsk_record_syn(sk, req, skb);
		fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
	}
	if (fastopen_sk) {
		af_ops->send_synack(fastopen_sk, dst, &fl, req,
				    &foc, TCP_SYNACK_FASTOPEN);
		/* Add the child socket directly into the accept queue */
		if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) {
			reqsk_fastopen_remove(fastopen_sk, req, false);
			bh_unlock_sock(fastopen_sk);
			sock_put(fastopen_sk);
			goto drop_and_free;
		}
		sk->sk_data_ready(sk);
		bh_unlock_sock(fastopen_sk);
		sock_put(fastopen_sk);
	} else {
		tcp_rsk(req)->tfo_listener = false;
		if (!want_cookie)
			inet_csk_reqsk_queue_hash_add(sk, req,
				tcp_timeout_init((struct sock *)req));
		af_ops->send_synack(sk, dst, &fl, req, &foc,
				    !want_cookie ? TCP_SYNACK_NORMAL :
						   TCP_SYNACK_COOKIE);
		if (want_cookie) {
			reqsk_free(req);
			return 0;
		}
	}
	reqsk_put(req);
	return 0;

drop_and_release:
	dst_release(dst);
drop_and_free:
	__reqsk_free(req);
drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_SYMBOL(tcp_conn_request);
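
/* Editor's note (illustrative user-space sketch, not part of the upstream
 * code): the two optional paths above, tcp_try_fastopen() and
 * tcp_reqsk_record_syn(), are only exercised for listeners that opted in
 * via setsockopt().  A minimal sketch, assuming a bound IPv4 listening
 * socket 'lfd' and the server bit (0x2) set in net.ipv4.tcp_fastopen:
 *
 *	int qlen = 16, one = 1;
 *
 *	// allow TCP Fast Open on this listener, with up to 16
 *	// pending TFO requests
 *	setsockopt(lfd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen));
 *	// ask the kernel to keep the headers of each connection's SYN
 *	setsockopt(lfd, IPPROTO_TCP, TCP_SAVE_SYN, &one, sizeof(one));
 *	listen(lfd, 128);
 *
 *	int cfd = accept(lfd, NULL, NULL);
 *	char syn[512];
 *	socklen_t len = sizeof(syn);
 *	// retrieve the IP + TCP headers recorded by
 *	// tcp_reqsk_record_syn() for this connection
 *	getsockopt(cfd, IPPROTO_TCP, TCP_SAVED_SYN, syn, &len);
 *
 * Without these options, tcp_conn_request() ends in one of three ways:
 * the SYN is dropped (tcp_listendrop()), a cookie SYN-ACK is sent and
 * the request is freed immediately, or a normal SYN-ACK is sent and the
 * request is hashed into the SYN table to await the final ACK.
 */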