tcp_input.c — diff of commit d27f9bc104375a0a835cf68bb88fc9cec69125da (older) against cd7d8498c9a5d510c64db38d9f4f4fbc41790f09 (newer); where a line changed, the older revision's line is shown immediately above its replacement.
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Authors: Ross Biro

--- 59 unchanged lines hidden (view full) ---

68#include <linux/module.h>
69#include <linux/sysctl.h>
70#include <linux/kernel.h>
71#include <net/dst.h>
72#include <net/tcp.h>
73#include <net/inet_common.h>
74#include <linux/ipsec.h>
75#include <asm/unaligned.h>
76#include <net/netdma.h>
77#include <linux/errqueue.h>
76
77int sysctl_tcp_timestamps __read_mostly = 1;
78int sysctl_tcp_window_scaling __read_mostly = 1;
79int sysctl_tcp_sack __read_mostly = 1;
80int sysctl_tcp_fack __read_mostly = 1;
81int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
82EXPORT_SYMBOL(sysctl_tcp_reordering);
83int sysctl_tcp_dsack __read_mostly = 1;

--- 577 unchanged lines hidden (view full) ---

661 * routine either comes from timestamps, or from segments that were
662 * known _not_ to have been retransmitted [see Karn/Partridge
663 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
664 * piece by Van Jacobson.
665 * NOTE: the next three routines used to be one big routine.
666 * To save cycles in the RFC 1323 implementation it was better to break
667 * it up into three procedures. -- erics
668 */
669static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
671static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
670{
671 struct tcp_sock *tp = tcp_sk(sk);
672 long m = mrtt; /* RTT */
673 u32 srtt = tp->srtt;
674 long m = mrtt_us; /* RTT */
675 u32 srtt = tp->srtt_us;
674
675 /* The following amusing code comes from Jacobson's
676 * article in SIGCOMM '88. Note that rtt and mdev
677 * are scaled versions of rtt and mean deviation.
678 * This is designed to be as fast as possible
679 * m stands for "measurement".
680 *
681 * On a 1990 paper the rto value is changed to:

--- 6 unchanged lines hidden (view full) ---

688 * does not matter how to _calculate_ it. Seems, it was trap
689 * that VJ failed to avoid. 8)
690 */
691 if (srtt != 0) {
692 m -= (srtt >> 3); /* m is now error in rtt est */
693 srtt += m; /* rtt = 7/8 rtt + 1/8 new */
694 if (m < 0) {
695 m = -m; /* m is now abs(error) */
696 m -= (tp->mdev >> 2); /* similar update on mdev */
698 m -= (tp->mdev_us >> 2); /* similar update on mdev */
697 /* This is similar to one of Eifel findings.
698 * Eifel blocks mdev updates when rtt decreases.
699 * This solution is a bit different: we use finer gain
700 * for mdev in this case (alpha*beta).
701 * Like Eifel it also prevents growth of rto,
702 * but also it limits too fast rto decreases,
703 * happening in pure Eifel.
704 */
705 if (m > 0)
706 m >>= 3;
707 } else {
708 m -= (tp->mdev >> 2); /* similar update on mdev */
710 m -= (tp->mdev_us >> 2); /* similar update on mdev */
709 }
710 tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
711 if (tp->mdev > tp->mdev_max) {
712 tp->mdev_max = tp->mdev;
713 if (tp->mdev_max > tp->rttvar)
714 tp->rttvar = tp->mdev_max;
712 tp->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */
713 if (tp->mdev_us > tp->mdev_max_us) {
714 tp->mdev_max_us = tp->mdev_us;
715 if (tp->mdev_max_us > tp->rttvar_us)
716 tp->rttvar_us = tp->mdev_max_us;
715 }
716 if (after(tp->snd_una, tp->rtt_seq)) {
717 if (tp->mdev_max < tp->rttvar)
718 tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2;
719 if (tp->mdev_max_us < tp->rttvar_us)
720 tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
719 tp->rtt_seq = tp->snd_nxt;
720 tp->mdev_max = tcp_rto_min(sk);
722 tp->mdev_max_us = tcp_rto_min_us(sk);
721 }
722 } else {
723 /* no previous measure. */
724 srtt = m << 3; /* take the measured time to be rtt */
725 tp->mdev = m << 1; /* make sure rto = 3*rtt */
726 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
727 tp->mdev_us = m << 1; /* make sure rto = 3*rtt */
728 tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
729 tp->mdev_max_us = tp->rttvar_us;
727 tp->rtt_seq = tp->snd_nxt;
728 }
729 tp->srtt = max(1U, srtt);
732 tp->srtt_us = max(1U, srtt);
730}
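The routine above keeps srtt_us scaled by 8 and mdev_us scaled by 4, so the 1/8 and 1/4 EWMA gains reduce to shifts and the RTO can be formed without divisions. A minimal user-space sketch of that fixed-point update (illustrative only, not kernel code; names such as rtt_est and rtt_sample are invented):

#include <stdint.h>

struct rtt_est {
	uint32_t srtt8;		/* smoothed RTT in usec, scaled by 8 */
	uint32_t mdev4;		/* mean deviation in usec, scaled by 4 */
};

static void rtt_sample(struct rtt_est *e, long m)	/* m: measured RTT, usec */
{
	if (e->srtt8) {
		long err = m - (e->srtt8 >> 3);	/* error vs. unscaled srtt */

		e->srtt8 += err;		/* srtt = 7/8 srtt + 1/8 m */
		if (err < 0)
			err = -err;
		e->mdev4 += err - (e->mdev4 >> 2); /* mdev = 3/4 mdev + 1/4 |err| */
	} else {
		e->srtt8 = m << 3;		/* first sample seeds srtt */
		e->mdev4 = m << 1;		/* so that rto starts near 3*rtt */
	}
}

static long rto_usecs(const struct rtt_est *e)
{
	return (e->srtt8 >> 3) + e->mdev4;	/* srtt + 4*mdev, both unscaled */
}

(The kernel version additionally damps the mdev update when the RTT drops, per the Eifel-style comment above; the sketch omits that refinement.)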
731
732/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
733 * Note: TCP stack does not yet implement pacing.
734 * FQ packet scheduler can be used to implement cheap but effective
735 * TCP pacing, to smooth the burst on large writes when packets
736 * in flight is significantly lower than cwnd (or rwin)
737 */
738static void tcp_update_pacing_rate(struct sock *sk)
739{
740 const struct tcp_sock *tp = tcp_sk(sk);
741 u64 rate;
742
743 /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
744 rate = (u64)tp->mss_cache * 2 * (HZ << 3);
747 rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3);
745
746 rate *= max(tp->snd_cwnd, tp->packets_out);
747
748 /* Correction for small srtt and scheduling constraints.
749 * For small rtt, consider noise is too high, and use
750 * the minimal value (srtt = 1 -> 125 us for HZ=1000)
751 *
752 * We probably need usec resolution in the future.
753 * Note: This also takes care of possible srtt=0 case,
754 * when tcp_rtt_estimator() was not yet called.
755 */
756 if (tp->srtt > 8 + 2)
757 do_div(rate, tp->srtt);
751 if (likely(tp->srtt_us))
752 do_div(rate, tp->srtt_us);
758
759 /* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate
760 * without any lock. We want to make sure compiler wont store
761 * intermediate values in this location.
762 */
763 ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate,
764 sk->sk_max_pacing_rate);
765}
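The computation above is plain unit arithmetic: 2 * mss * cwnd bytes per smoothed RTT, with srtt_us holding the RTT in microseconds scaled by 8, which is why the constant is USEC_PER_SEC << 3. A rough user-space equivalent (illustrative; function name invented):

#include <stdint.h>

#define USEC_PER_SEC 1000000ULL

/* Pacing rate in bytes per second: two cwnds of mss-sized segments per RTT. */
static uint64_t pacing_rate(uint32_t mss, uint32_t cwnd, uint32_t srtt_us8)
{
	uint64_t rate = (uint64_t)mss * 2 * (USEC_PER_SEC << 3);

	rate *= cwnd;
	if (srtt_us8)
		rate /= srtt_us8;	/* the <<3 cancels srtt's 8x scaling */
	return rate;
}

For example, mss = 1448, cwnd = 10 and a 10 ms RTT (srtt_us8 = 80000) give roughly 2.9 MB/s.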

--- 341 unchanged lines hidden (view full) ---

1107 dup_sack = true;
1108 tcp_dsack_seen(tp);
1109 NET_INC_STATS_BH(sock_net(sk),
1110 LINUX_MIB_TCPDSACKOFORECV);
1111 }
1112 }
1113
1114 /* D-SACK for already forgotten data... Do dumb counting. */
1115 if (dup_sack && tp->undo_marker && tp->undo_retrans &&
1110 if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
1116 !after(end_seq_0, prior_snd_una) &&
1117 after(end_seq_0, tp->undo_marker))
1118 tp->undo_retrans--;
1119
1120 return dup_sack;
1121}
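tcp_check_dsack() above, like the rest of this file, compares 32-bit sequence numbers with before()/after(), which are defined elsewhere in the kernel headers as a signed difference so that wraparound is handled. A stand-alone sketch of the idiom (illustrative):

#include <stdint.h>
#include <stdbool.h>

static bool seq_before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;	/* negative signed diff => a precedes b */
}

static bool seq_after(uint32_t a, uint32_t b)
{
	return seq_before(b, a);
}

/* seq_before(0xfffffff0u, 0x10u) is true: 0x10 lies 0x20 bytes later in the
 * stream even though it is numerically smaller.
 */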
1122
1123struct tcp_sacktag_state {
1124 int reord;
1125 int fack_count;
1126 int flag;
1127 s32 rtt; /* RTT measured by SACKing never-retransmitted data */
1119 int reord;
1120 int fack_count;
1121 long rtt_us; /* RTT measured by SACKing never-retransmitted data */
1122 int flag;
1128};
1129
1130/* Check if skb is fully within the SACK block. In presence of GSO skbs,
1131 * the incoming SACK may not exactly match but we can find smaller MSS
1132 * aligned portion of it that matches. Therefore we might need to fragment
1133 * which may fail and creates some hassle (caller must handle error case
1134 * returns).
1135 *

--- 27 unchanged lines hidden (view full) ---

1163
1164 /* Round if necessary so that SACKs cover only full MSSes
1165 * and/or the remaining small portion (if present)
1166 */
1167 if (pkt_len > mss) {
1168 unsigned int new_len = (pkt_len / mss) * mss;
1169 if (!in_sack && new_len < pkt_len) {
1170 new_len += mss;
1171 if (new_len > skb->len)
1166 if (new_len >= skb->len)
1172 return 0;
1173 }
1174 pkt_len = new_len;
1175 }
1176 err = tcp_fragment(sk, skb, pkt_len, mss);
1171 err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC);
1177 if (err < 0)
1178 return err;
1179 }
1180
1181 return in_sack;
1182}
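The hunk above changes the rounding guard from '>' to '>=': when rounding up to the next MSS lands exactly on skb->len there is nothing left to split off, so tcp_fragment() is no longer asked to split at the very end of the skb. A small worked sketch of the arithmetic (illustrative, not the kernel helper):

#include <stdint.h>

/* Trim pkt_len down to whole MSSes; when rounding up instead, bail out
 * (return 0) if the rounded length would already cover the whole skb.
 */
static uint32_t round_sacked_len(uint32_t pkt_len, uint32_t mss,
				 int in_sack, uint32_t skb_len)
{
	if (pkt_len > mss) {
		uint32_t new_len = (pkt_len / mss) * mss;

		if (!in_sack && new_len < pkt_len) {
			new_len += mss;
			if (new_len >= skb_len)	/* the old boundary test used '>' */
				return 0;
		}
		pkt_len = new_len;
	}
	return pkt_len;
}

Example: mss = 1448, skb_len = 2896, pkt_len = 1500, in_sack = 0: new_len rounds up to 2896 == skb_len, so the function now gives up instead of requesting a zero-length fragment.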
1183
1184/* Mark the given newly-SACKed range as such, adjusting counters and hints. */
1185static u8 tcp_sacktag_one(struct sock *sk,
1186 struct tcp_sacktag_state *state, u8 sacked,
1187 u32 start_seq, u32 end_seq,
1188 int dup_sack, int pcount, u32 xmit_time)
1183 int dup_sack, int pcount,
1184 const struct skb_mstamp *xmit_time)
1189{
1190 struct tcp_sock *tp = tcp_sk(sk);
1191 int fack_count = state->fack_count;
1192
1193 /* Account D-SACK for retransmitted packet. */
1194 if (dup_sack && (sacked & TCPCB_RETRANS)) {
1195 if (tp->undo_marker && tp->undo_retrans &&
1191 if (tp->undo_marker && tp->undo_retrans > 0 &&
1196 after(end_seq, tp->undo_marker))
1197 tp->undo_retrans--;
1198 if (sacked & TCPCB_SACKED_ACKED)
1199 state->reord = min(fack_count, state->reord);
1200 }
1201
1202 /* Nothing to do; acked frame is about to be dropped (was ACKed). */
1203 if (!after(end_seq, tp->snd_una))

--- 17 unchanged lines hidden (view full) ---

1221 */
1222 if (before(start_seq,
1223 tcp_highest_sack_seq(tp)))
1224 state->reord = min(fack_count,
1225 state->reord);
1226 if (!after(end_seq, tp->high_seq))
1227 state->flag |= FLAG_ORIG_SACK_ACKED;
1228 /* Pick the earliest sequence sacked for RTT */
1229 if (state->rtt < 0)
1230 state->rtt = tcp_time_stamp - xmit_time;
1225 if (state->rtt_us < 0) {
1226 struct skb_mstamp now;
1227
1228 skb_mstamp_get(&now);
1229 state->rtt_us = skb_mstamp_us_delta(&now,
1230 xmit_time);
1231 }
1231 }
1232
1233 if (sacked & TCPCB_LOST) {
1234 sacked &= ~TCPCB_LOST;
1235 tp->lost_out -= pcount;
1236 }
1237 }
1238

--- 42 unchanged lines hidden (view full) ---

1281 /* Adjust counters and hints for the newly sacked sequence
1282 * range but discard the return value since prev is already
1283 * marked. We must tag the range first because the seq
1284 * advancement below implicitly advances
1285 * tcp_highest_sack_seq() when skb is highest_sack.
1286 */
1287 tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
1288 start_seq, end_seq, dup_sack, pcount,
1289 TCP_SKB_CB(skb)->when);
1290 &skb->skb_mstamp);
1290
1291 if (skb == tp->lost_skb_hint)
1292 tp->lost_cnt_hint += pcount;
1293
1294 TCP_SKB_CB(prev)->end_seq += shifted;
1295 TCP_SKB_CB(skb)->seq += shifted;
1296
1297 skb_shinfo(prev)->gso_segs += pcount;
1298 BUG_ON(skb_shinfo(skb)->gso_segs < pcount);
1299 skb_shinfo(skb)->gso_segs -= pcount;
1298 tcp_skb_pcount_add(prev, pcount);
1299 BUG_ON(tcp_skb_pcount(skb) < pcount);
1300 tcp_skb_pcount_add(skb, -pcount);
1300
1301 /* When we're adding to gso_segs == 1, gso_size will be zero,
1302 * in theory this shouldn't be necessary but as long as DSACK
1303 * code can come after this skb later on it's better to keep
1304 * setting gso_size to something.
1305 */
1306 if (!skb_shinfo(prev)->gso_size) {
1307 skb_shinfo(prev)->gso_size = mss;
1308 skb_shinfo(prev)->gso_type = sk->sk_gso_type;
1309 }
1310
1311 /* CHECKME: To clear or not to clear? Mimics normal skb currently */
1312 if (skb_shinfo(skb)->gso_segs <= 1) {
1313 if (tcp_skb_pcount(skb) <= 1) {
1313 skb_shinfo(skb)->gso_size = 0;
1314 skb_shinfo(skb)->gso_type = 0;
1315 }
1316
1317 /* Difference in this won't matter, both ACKed by the same cumul. ACK */
1318 TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
1319
1320 if (skb->len > 0) {

--- 238 unchanged lines hidden (view full) ---

1559 TCP_SKB_CB(skb)->sacked =
1560 tcp_sacktag_one(sk,
1561 state,
1562 TCP_SKB_CB(skb)->sacked,
1563 TCP_SKB_CB(skb)->seq,
1564 TCP_SKB_CB(skb)->end_seq,
1565 dup_sack,
1566 tcp_skb_pcount(skb),
1567 TCP_SKB_CB(skb)->when);
1568 &skb->skb_mstamp);
1568
1569 if (!before(TCP_SKB_CB(skb)->seq,
1570 tcp_highest_sack_seq(tp)))
1571 tcp_advance_highest_sack(sk, skb);
1572 }
1573
1574 state->fack_count += tcp_skb_pcount(skb);
1575 }

--- 40 unchanged lines hidden (view full) ---

1616
1617static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache)
1618{
1619 return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
1620}
1621
1622static int
1623tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
1624 u32 prior_snd_una, s32 *sack_rtt)
1625 u32 prior_snd_una, long *sack_rtt_us)
1625{
1626 struct tcp_sock *tp = tcp_sk(sk);
1627 const unsigned char *ptr = (skb_transport_header(ack_skb) +
1628 TCP_SKB_CB(ack_skb)->sacked);
1629 struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
1630 struct tcp_sack_block sp[TCP_NUM_SACKS];
1631 struct tcp_sack_block *cache;
1632 struct tcp_sacktag_state state;
1633 struct sk_buff *skb;
1634 int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
1635 int used_sacks;
1636 bool found_dup_sack = false;
1637 int i, j;
1638 int first_sack_index;
1639
1640 state.flag = 0;
1641 state.reord = tp->packets_out;
1642 state.rtt = -1;
1643 state.rtt_us = -1L;
1643
1644 if (!tp->sacked_out) {
1645 if (WARN_ON(tp->fackets_out))
1646 tp->fackets_out = 0;
1647 tcp_highest_sack_reset(sk);
1648 }
1649
1650 found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,

--- 167 unchanged lines hidden (view full) ---

1818out:
1819
1820#if FASTRETRANS_DEBUG > 0
1821 WARN_ON((int)tp->sacked_out < 0);
1822 WARN_ON((int)tp->lost_out < 0);
1823 WARN_ON((int)tp->retrans_out < 0);
1824 WARN_ON((int)tcp_packets_in_flight(tp) < 0);
1825#endif
1826 *sack_rtt = state.rtt;
1827 *sack_rtt_us = state.rtt_us;
1827 return state.flag;
1828}
1829
1830/* Limits sacked_out so that sum with lost_out isn't ever larger than
1831 * packets_out. Returns false if sacked_out adjustement wasn't necessary.
1832 */
1833static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
1834{

--- 47 unchanged lines hidden (view full) ---

1882 tcp_verify_left_out(tp);
1883}
1884
1885static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
1886{
1887 tp->sacked_out = 0;
1888}
1889
1890static void tcp_clear_retrans_partial(struct tcp_sock *tp)
1891void tcp_clear_retrans(struct tcp_sock *tp)
1891{
1892 tp->retrans_out = 0;
1893 tp->lost_out = 0;
1894
1895 tp->undo_marker = 0;
1896 tp->undo_retrans = 0;
1896 tp->undo_retrans = -1;
1897 tp->fackets_out = 0;
1898 tp->sacked_out = 0;
1897}
1898
1899void tcp_clear_retrans(struct tcp_sock *tp)
1901static inline void tcp_init_undo(struct tcp_sock *tp)
1900{
1901 tcp_clear_retrans_partial(tp);
1902
1903 tp->fackets_out = 0;
1904 tp->sacked_out = 0;
1903 tp->undo_marker = tp->snd_una;
1904 /* Retransmission still in flight may cause DSACKs later. */
1905 tp->undo_retrans = tp->retrans_out ? : -1;
1905}
1906
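tcp_init_undo() above uses the GNU C "x ? : y" shorthand, which yields x when x is non-zero and y otherwise; with no retransmissions in flight, undo_retrans therefore starts at the -1 sentinel rather than 0, and the "undo_retrans > 0" tests earlier in this diff leave that sentinel alone. A tiny sketch of the idiom (illustrative; name invented):

/* GNU extension: "x ?: y" is "x ? x : y", with x evaluated only once. */
static int init_undo_retrans(int retrans_out)
{
	return retrans_out ? : -1;
}

/* init_undo_retrans(3) == 3, init_undo_retrans(0) == -1 */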
1907/* Enter Loss state. If "how" is not zero, forget all SACK information
1908/* Enter Loss state. If we detect SACK reneging, forget all SACK information
1908 * and reset tags completely, otherwise preserve SACKs. If receiver
1909 * dropped its ofo queue, we will know this due to reneging detection.
1910 */
1911void tcp_enter_loss(struct sock *sk, int how)
1912void tcp_enter_loss(struct sock *sk)
1912{
1913 const struct inet_connection_sock *icsk = inet_csk(sk);
1914 struct tcp_sock *tp = tcp_sk(sk);
1915 struct sk_buff *skb;
1916 bool new_recovery = false;
1918 bool is_reneg; /* is receiver reneging on SACKs? */
1917
1918 /* Reduce ssthresh if it has not yet been made inside this window. */
1919 if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
1920 !after(tp->high_seq, tp->snd_una) ||
1921 (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
1922 new_recovery = true;
1923 tp->prior_ssthresh = tcp_current_ssthresh(sk);
1924 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
1925 tcp_ca_event(sk, CA_EVENT_LOSS);
1928 tcp_init_undo(tp);
1926 }
1927 tp->snd_cwnd = 1;
1928 tp->snd_cwnd_cnt = 0;
1929 tp->snd_cwnd_stamp = tcp_time_stamp;
1930
1931 tcp_clear_retrans_partial(tp);
1934 tp->retrans_out = 0;
1935 tp->lost_out = 0;
1932
1933 if (tcp_is_reno(tp))
1934 tcp_reset_reno_sack(tp);
1935
1936 tp->undo_marker = tp->snd_una;
1937 if (how) {
1940 skb = tcp_write_queue_head(sk);
1941 is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
1942 if (is_reneg) {
1943 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
1938 tp->sacked_out = 0;
1939 tp->fackets_out = 0;
1940 }
1941 tcp_clear_all_retrans_hints(tp);
1942
1943 tcp_for_write_queue(skb, sk) {
1944 if (skb == tcp_send_head(sk))
1945 break;
1946
1947 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
1948 tp->undo_marker = 0;
1949
1950 TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
1951 if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {
1954 if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) {
1952 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
1953 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
1954 tp->lost_out += tcp_skb_pcount(skb);
1955 tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
1956 }
1957 }
1958 tcp_verify_left_out(tp);
1959

--- 16 unchanged lines hidden (view full) ---

1976 (new_recovery || icsk->icsk_retransmits) &&
1977 !inet_csk(sk)->icsk_mtup.probe_size;
1978}
1979
1980/* If ACK arrived pointing to a remembered SACK, it means that our
1981 * remembered SACKs do not reflect real state of receiver i.e.
1982 * receiver _host_ is heavily congested (or buggy).
1983 *
1984 * Do processing similar to RTO timeout.
1987 * To avoid big spurious retransmission bursts due to transient SACK
1988 * scoreboard oddities that look like reneging, we give the receiver a
1989 * little time (max(RTT/2, 10ms)) to send us some more ACKs that will
1990 * restore sanity to the SACK scoreboard. If the apparent reneging
1991 * persists until this RTO then we'll clear the SACK scoreboard.
1985 */
1986static bool tcp_check_sack_reneging(struct sock *sk, int flag)
1987{
1988 if (flag & FLAG_SACK_RENEGING) {
1989 struct inet_connection_sock *icsk = inet_csk(sk);
1990 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
1996 struct tcp_sock *tp = tcp_sk(sk);
1997 unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
1998 msecs_to_jiffies(10));
1991
1992 tcp_enter_loss(sk, 1);
1993 icsk->icsk_retransmits++;
1994 tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
1995 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
1996 icsk->icsk_rto, TCP_RTO_MAX);
2001 delay, TCP_RTO_MAX);
1997 return true;
1998 }
1999 return false;
2000}
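Because tcp_rtt_estimator() stores eight times the smoothed RTT in srtt_us, RTT fractions fall out of plain shifts: srtt_us >> 4 is RTT/2 (the reneging delay above) and srtt_us >> 5 is RTT/4 (the early-retransmit delay further down). A quick sketch of the arithmetic (illustrative):

#include <stdint.h>

/* srtt8 = 8 * smoothed RTT in microseconds */
static uint32_t half_rtt_us(uint32_t srtt8)    { return srtt8 >> 4; }	/* /16 */
static uint32_t quarter_rtt_us(uint32_t srtt8) { return srtt8 >> 5; }	/* /32 */

/* A 40 ms RTT is stored as 320000, so half_rtt_us() gives 20000 us and
 * quarter_rtt_us() gives 10000 us.
 */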
2001
2002static inline int tcp_fackets_out(const struct tcp_sock *tp)
2003{
2004 return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out;

--- 24 unchanged lines hidden (view full) ---

2029 struct tcp_sock *tp = tcp_sk(sk);
2030 unsigned long delay;
2031
2032 /* Delay early retransmit and entering fast recovery for
2033 * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples
2034 * available, or RTO is scheduled to fire first.
2035 */
2036 if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
2037 (flag & FLAG_ECE) || !tp->srtt)
2042 (flag & FLAG_ECE) || !tp->srtt_us)
2038 return false;
2039
2040 delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2));
2045 delay = max(usecs_to_jiffies(tp->srtt_us >> 5),
2046 msecs_to_jiffies(2));
2047
2041 if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
2042 return false;
2043
2044 inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay,
2045 TCP_RTO_MAX);
2046 return true;
2047}
2048

--- 185 unchanged lines hidden (view full) ---

2234
2235 if (cnt > packets) {
2236 if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
2237 (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
2238 (oldcnt >= packets))
2239 break;
2240
2241 mss = skb_shinfo(skb)->gso_size;
2242 err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss);
2249 err = tcp_fragment(sk, skb, (packets - oldcnt) * mss,
2250 mss, GFP_ATOMIC);
2243 if (err < 0)
2244 break;
2245 cnt = packets;
2246 }
2247
2248 tcp_skb_mark_lost(tp, skb);
2249
2250 if (mark_head)

--- 216 unchanged lines hidden (view full) ---

2467 * It computes the number of packets to send (sndcnt) based on packets newly
2468 * delivered:
2469 * 1) If the packets in flight is larger than ssthresh, PRR spreads the
2470 * cwnd reductions across a full RTT.
2471 * 2) If packets in flight is lower than ssthresh (such as due to excess
2472 * losses and/or application stalls), do not perform any further cwnd
2473 * reductions, but instead slow start up to ssthresh.
2474 */
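The PRR behaviour this comment describes lives in tcp_cwnd_reduction(), whose body falls in the hidden region above. Roughly, following RFC 6937, the send quota tracks how much the receiver has newly delivered; a simplified sketch under those assumptions (illustrative only, not the exact kernel code; all names invented):

#include <stdint.h>

static int prr_sndcnt(uint32_t prr_delivered, uint32_t prr_out,
		      uint32_t newly_acked, uint32_t pipe,
		      uint32_t ssthresh, uint32_t prior_cwnd)
{
	int sndcnt;

	if (pipe > ssthresh) {
		/* proportional phase:
		 * ceil(ssthresh * prr_delivered / prior_cwnd) - prr_out
		 */
		uint64_t dividend = (uint64_t)ssthresh * prr_delivered +
				    prior_cwnd - 1;

		sndcnt = (int)(dividend / prior_cwnd) - (int)prr_out;
	} else {
		/* slow-start back towards ssthresh, bounded by what was
		 * just delivered (plus one segment).
		 */
		int delivered = (int)(prr_delivered - prr_out);
		int limit = (delivered > (int)newly_acked ?
			     delivered : (int)newly_acked) + 1;
		int delta = (int)(ssthresh - pipe);

		sndcnt = limit < delta ? limit : delta;
	}
	return sndcnt > 0 ? sndcnt : 0;
}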
2475static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh)
2483static void tcp_init_cwnd_reduction(struct sock *sk)
2476{
2477 struct tcp_sock *tp = tcp_sk(sk);
2478
2479 tp->high_seq = tp->snd_nxt;
2480 tp->tlp_high_seq = 0;
2481 tp->snd_cwnd_cnt = 0;
2482 tp->prior_cwnd = tp->snd_cwnd;
2483 tp->prr_delivered = 0;
2484 tp->prr_out = 0;
2485 if (set_ssthresh)
2486 tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
2493 tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
2487 TCP_ECN_queue_cwr(tp);
2488}
2489
2490static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
2491 int fast_rexmit)
2492{
2493 struct tcp_sock *tp = tcp_sk(sk);
2494 int sndcnt = 0;

--- 25 unchanged lines hidden (view full) ---

2520 (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
2521 tp->snd_cwnd = tp->snd_ssthresh;
2522 tp->snd_cwnd_stamp = tcp_time_stamp;
2523 }
2524 tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
2525}
2526
2527/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */
2528void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
2535void tcp_enter_cwr(struct sock *sk)
2529{
2530 struct tcp_sock *tp = tcp_sk(sk);
2531
2532 tp->prior_ssthresh = 0;
2533 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
2534 tp->undo_marker = 0;
2535 tcp_init_cwnd_reduction(sk, set_ssthresh);
2542 tcp_init_cwnd_reduction(sk);
2536 tcp_set_ca_state(sk, TCP_CA_CWR);
2537 }
2538}
2539
2540static void tcp_try_keep_open(struct sock *sk)
2541{
2542 struct tcp_sock *tp = tcp_sk(sk);
2543 int state = TCP_CA_Open;

--- 12 unchanged lines hidden (view full) ---

2556 struct tcp_sock *tp = tcp_sk(sk);
2557
2558 tcp_verify_left_out(tp);
2559
2560 if (!tcp_any_retrans_done(sk))
2561 tp->retrans_stamp = 0;
2562
2563 if (flag & FLAG_ECE)
2564 tcp_enter_cwr(sk, 1);
2571 tcp_enter_cwr(sk);
2565
2566 if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
2567 tcp_try_keep_open(sk);
2568 } else {
2569 tcp_cwnd_reduction(sk, prior_unsacked, 0);
2570 }
2571}
2572

--- 83 unchanged lines hidden (view full) ---

2656 if (tcp_is_reno(tp))
2657 mib_idx = LINUX_MIB_TCPRENORECOVERY;
2658 else
2659 mib_idx = LINUX_MIB_TCPSACKRECOVERY;
2660
2661 NET_INC_STATS_BH(sock_net(sk), mib_idx);
2662
2663 tp->prior_ssthresh = 0;
2664 tp->undo_marker = tp->snd_una;
2665 tp->undo_retrans = tp->retrans_out;
2671 tcp_init_undo(tp);
2666
2667 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
2668 if (!ece_ack)
2669 tp->prior_ssthresh = tcp_current_ssthresh(sk);
2670 tcp_init_cwnd_reduction(sk, true);
2676 tcp_init_cwnd_reduction(sk);
2671 }
2672 tcp_set_ca_state(sk, TCP_CA_Recovery);
2673}
2674
2675/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
2676 * recovered or spurious. Otherwise retransmits more on partial ACKs.
2677 */
2678static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
2679{
2680 struct inet_connection_sock *icsk = inet_csk(sk);
2681 struct tcp_sock *tp = tcp_sk(sk);
2682 bool recovered = !before(tp->snd_una, tp->high_seq);
2683
2684 if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
2685 if (flag & FLAG_ORIG_SACK_ACKED) {
2686 /* Step 3.b. A timeout is spurious if not all data are
2687 * lost, i.e., never-retransmitted data are (s)acked.
2688 */
2689 tcp_try_undo_loss(sk, true);
2693 if (tcp_try_undo_loss(sk, flag & FLAG_ORIG_SACK_ACKED))
2690 return;
2691 }
2695
2692 if (after(tp->snd_nxt, tp->high_seq) &&
2693 (flag & FLAG_DATA_SACKED || is_dupack)) {
2694 tp->frto = 0; /* Loss was real: 2nd part of step 3.a */
2695 } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
2696 tp->high_seq = tp->snd_nxt;
2697 __tcp_push_pending_frames(sk, tcp_current_mss(sk),
2698 TCP_NAGLE_OFF);
2699 if (after(tp->snd_nxt, tp->high_seq))
2700 return; /* Step 2.b */
2701 tp->frto = 0;
2702 }
2703 }
2704
2705 if (recovered) {
2706 /* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */
2707 icsk->icsk_retransmits = 0;
2708 tcp_try_undo_recovery(sk);
2709 return;
2710 }
2711 if (flag & FLAG_DATA_ACKED)
2712 icsk->icsk_retransmits = 0;
2713 if (tcp_is_reno(tp)) {
2714 /* A Reno DUPACK means new data in F-RTO step 2.b above are
2715 * delivered. Lower inflight to clock out (re)tranmissions.
2716 */
2717 if (after(tp->snd_nxt, tp->high_seq) && is_dupack)
2718 tcp_add_reno_sack(sk);
2719 else if (flag & FLAG_SND_UNA_ADVANCED)
2720 tcp_reset_reno_sack(tp);

--- 158 unchanged lines hidden (view full) ---

2879
2880 if (do_lost)
2881 tcp_update_scoreboard(sk, fast_rexmit);
2882 tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit);
2883 tcp_xmit_retransmit_queue(sk);
2884}
2885
2886static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
2887 s32 seq_rtt, s32 sack_rtt)
2888 long seq_rtt_us, long sack_rtt_us)
2888{
2889 const struct tcp_sock *tp = tcp_sk(sk);
2890
2891 /* Prefer RTT measured from ACK's timing to TS-ECR. This is because
2892 * broken middle-boxes or peers may corrupt TS-ECR fields. But
2893 * Karn's algorithm forbids taking RTT if some retransmitted data
2894 * is acked (RFC6298).
2895 */
2896 if (flag & FLAG_RETRANS_DATA_ACKED)
2897 seq_rtt = -1;
2898 seq_rtt_us = -1L;
2898
2899 if (seq_rtt < 0)
2900 seq_rtt = sack_rtt;
2900 if (seq_rtt_us < 0)
2901 seq_rtt_us = sack_rtt_us;
2901
2902 /* RTTM Rule: A TSecr value received in a segment is used to
2903 * update the averaged RTT measurement only if the segment
2904 * acknowledges some new data, i.e., only if it advances the
2905 * left edge of the send window.
2906 * See draft-ietf-tcplw-high-performance-00, section 3.3.
2907 */
2908 if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
2909 if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
2909 flag & FLAG_ACKED)
2910 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
2911 seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr);
2911
2912 if (seq_rtt < 0)
2913 if (seq_rtt_us < 0)
2913 return false;
2914
2915 tcp_rtt_estimator(sk, seq_rtt);
2916 tcp_rtt_estimator(sk, seq_rtt_us);
2916 tcp_set_rto(sk);
2917
2918 /* RFC6298: only reset backoff on valid RTT measurement. */
2919 inet_csk(sk)->icsk_backoff = 0;
2920 return true;
2921}
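With the estimator now fed microseconds, the jiffies-based inputs that remain (the TSecr echo above and the SYNACK stamp below) are converted with jiffies_to_usecs() before use. A stand-alone sketch of that conversion (illustrative; HZ assumed to be 1000 for the example):

#include <stdint.h>

#define HZ 1000				/* assumed tick rate for this sketch */

static long rtt_us_from_jiffies(uint32_t now_jiffies, uint32_t then_jiffies)
{
	uint32_t delta = now_jiffies - then_jiffies;	/* wrap-safe unsigned math */

	return (long)delta * (1000000L / HZ);		/* 1 jiffy = 1000 us at HZ=1000 */
}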
2922
2923/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */
2924static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp)
2925{
2926 struct tcp_sock *tp = tcp_sk(sk);
2927 s32 seq_rtt = -1;
2928 long seq_rtt_us = -1L;
2928
2929 if (synack_stamp && !tp->total_retrans)
2930 seq_rtt = tcp_time_stamp - synack_stamp;
2931 seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - synack_stamp);
2931
2932 /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets
2933 * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack()
2934 */
2935 if (!tp->srtt)
2936 tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1);
2936 if (!tp->srtt_us)
2937 tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L);
2937}
2938
2939static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
2940static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
2940{
2941 const struct inet_connection_sock *icsk = inet_csk(sk);
2942 icsk->icsk_ca_ops->cong_avoid(sk, ack, acked, in_flight);
2943
2944 icsk->icsk_ca_ops->cong_avoid(sk, ack, acked);
2943 tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp;
2944}
2945
2946/* Restart timer after forward progress on connection.
2947 * RFC2988 recommends to restart timer to now+rto.
2948 */
2949void tcp_rearm_rto(struct sock *sk)
2950{

--- 9 unchanged lines hidden (view full) ---

2960 if (!tp->packets_out) {
2961 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
2962 } else {
2963 u32 rto = inet_csk(sk)->icsk_rto;
2964 /* Offset the time elapsed after installing regular RTO */
2965 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
2966 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2967 struct sk_buff *skb = tcp_write_queue_head(sk);
2968 const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto;
2970 const u32 rto_time_stamp =
2971 tcp_skb_timestamp(skb) + rto;
2969 s32 delta = (s32)(rto_time_stamp - tcp_time_stamp);
2970 /* delta may not be positive if the socket is locked
2971 * when the retrans timer fires and is rescheduled.
2972 */
2973 if (delta > 0)
2974 rto = delta;
2975 }
2976 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,

--- 40 unchanged lines hidden (view full) ---

3017 return packets_acked;
3018}
3019
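tcp_clean_rtx_queue() below leans on the skb_mstamp helpers visible in this diff: a zeroed .v64 means "no stamp recorded yet", skb_mstamp_get() samples the clock, and skb_mstamp_us_delta() returns the difference in microseconds. A condensed user-space sketch of that record-then-diff pattern with stand-in types (illustrative, not the kernel implementation):

#include <stdint.h>
#include <time.h>

struct mstamp { uint64_t v64; };	/* stand-in for struct skb_mstamp */

static void mstamp_get(struct mstamp *m)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	m->v64 = (uint64_t)ts.tv_sec * 1000000u + (uint64_t)ts.tv_nsec / 1000;
}

static long mstamp_us_delta(const struct mstamp *now, const struct mstamp *then)
{
	return (long)(now->v64 - then->v64);
}

/* Pattern used below: stamp each skb when it is (re)transmitted, leave
 * first_ackt.v64 at 0 until the first newly-acked segment is seen, then take
 * one "now" sample and derive the RTTs from the deltas.
 */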
3020/* Remove acknowledged frames from the retransmission queue. If our packet
3021 * is before the ack sequence we can discard it as it's confirmed to have
3022 * arrived at the other end.
3023 */
3024static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3025 u32 prior_snd_una, s32 sack_rtt)
3028 u32 prior_snd_una, long sack_rtt_us)
3026{
3027 struct tcp_sock *tp = tcp_sk(sk);
3028 const struct inet_connection_sock *icsk = inet_csk(sk);
3029 struct sk_buff *skb;
3030 u32 now = tcp_time_stamp;
3031 struct skb_mstamp first_ackt, last_ackt, now;
3032 struct tcp_sock *tp = tcp_sk(sk);
3033 u32 prior_sacked = tp->sacked_out;
3034 u32 reord = tp->packets_out;
3031 bool fully_acked = true;
3032 int flag = 0;
3036 long ca_seq_rtt_us = -1L;
3037 long seq_rtt_us = -1L;
3038 struct sk_buff *skb;
3033 u32 pkts_acked = 0;
3034 u32 reord = tp->packets_out;
3035 u32 prior_sacked = tp->sacked_out;
3036 s32 seq_rtt = -1;
3037 s32 ca_seq_rtt = -1;
3038 ktime_t last_ackt = net_invalid_timestamp();
3039 bool rtt_update;
3041 int flag = 0;
3040
3043 first_ackt.v64 = 0;
3044
3041 while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
3046 struct skb_shared_info *shinfo = skb_shinfo(skb);
3042 struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
3043 u32 acked_pcount;
3044 u8 sacked = scb->sacked;
3045
3051 if (unlikely(shinfo->tx_flags & SKBTX_ACK_TSTAMP) &&
3052 between(shinfo->tskey, prior_snd_una, tp->snd_una - 1))
3053 __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
3054
3046 /* Determine how many packets and what bytes were acked, tso and else */
3047 if (after(scb->end_seq, tp->snd_una)) {
3048 if (tcp_skb_pcount(skb) == 1 ||
3049 !after(tp->snd_una, scb->seq))
3050 break;
3051
3052 acked_pcount = tcp_tso_acked(sk, skb);
3053 if (!acked_pcount)

--- 4 unchanged lines hidden (view full) ---

3058 acked_pcount = tcp_skb_pcount(skb);
3059 }
3060
3061 if (sacked & TCPCB_RETRANS) {
3062 if (sacked & TCPCB_SACKED_RETRANS)
3063 tp->retrans_out -= acked_pcount;
3064 flag |= FLAG_RETRANS_DATA_ACKED;
3065 } else {
3066 ca_seq_rtt = now - scb->when;
3067 last_ackt = skb->tstamp;
3068 if (seq_rtt < 0) {
3069 seq_rtt = ca_seq_rtt;
3070 }
3075 last_ackt = skb->skb_mstamp;
3076 WARN_ON_ONCE(last_ackt.v64 == 0);
3077 if (!first_ackt.v64)
3078 first_ackt = last_ackt;
3079
3071 if (!(sacked & TCPCB_SACKED_ACKED))
3072 reord = min(pkts_acked, reord);
3073 if (!after(scb->end_seq, tp->high_seq))
3074 flag |= FLAG_ORIG_SACK_ACKED;
3075 }
3076
3077 if (sacked & TCPCB_SACKED_ACKED)
3078 tp->sacked_out -= acked_pcount;

--- 29 unchanged lines hidden (view full) ---

3108 }
3109
3110 if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
3111 tp->snd_up = tp->snd_una;
3112
3113 if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
3114 flag |= FLAG_SACK_RENEGING;
3115
3116 rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt);
3125 skb_mstamp_get(&now);
3126 if (first_ackt.v64) {
3127 seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
3128 ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
3129 }
3117
3130
3131 rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us);
3132
3118 if (flag & FLAG_ACKED) {
3119 const struct tcp_congestion_ops *ca_ops
3120 = inet_csk(sk)->icsk_ca_ops;
3121
3122 tcp_rearm_rto(sk);
3123 if (unlikely(icsk->icsk_mtup.probe_size &&
3124 !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
3125 tcp_mtup_probe_success(sk);

--- 10 unchanged lines hidden (view full) ---

3136
3137 delta = tcp_is_fack(tp) ? pkts_acked :
3138 prior_sacked - tp->sacked_out;
3139 tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
3140 }
3141
3142 tp->fackets_out -= min(pkts_acked, tp->fackets_out);
3143
3144 if (ca_ops->pkts_acked) {
3145 s32 rtt_us = -1;
3159 if (ca_ops->pkts_acked)
3160 ca_ops->pkts_acked(sk, pkts_acked, ca_seq_rtt_us);
3146
3161
3147 /* Is the ACK triggering packet unambiguous? */
3148 if (!(flag & FLAG_RETRANS_DATA_ACKED)) {
3149 /* High resolution needed and available? */
3150 if (ca_ops->flags & TCP_CONG_RTT_STAMP &&
3151 !ktime_equal(last_ackt,
3152 net_invalid_timestamp()))
3153 rtt_us = ktime_us_delta(ktime_get_real(),
3154 last_ackt);
3155 else if (ca_seq_rtt >= 0)
3156 rtt_us = jiffies_to_usecs(ca_seq_rtt);
3157 }
3158
3159 ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
3160 }
3161 } else if (skb && rtt_update && sack_rtt >= 0 &&
3162 sack_rtt > (s32)(now - TCP_SKB_CB(skb)->when)) {
3162 } else if (skb && rtt_update && sack_rtt_us >= 0 &&
3163 sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
3163 /* Do not re-arm RTO if the sack RTT is measured from data sent
3164 * after when the head was last (re)transmitted. Otherwise the
3165 * timeout may continue to extend in loss recovery.
3166 */
3167 tcp_rearm_rto(sk);
3168 }
3169
3170#if FASTRETRANS_DEBUG > 0

--- 31 unchanged lines hidden (view full) ---

3202
3203 if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) {
3204 icsk->icsk_backoff = 0;
3205 inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
3206 /* Socket must be waked up by subsequent tcp_data_snd_check().
3207 * This function is not for random using!
3208 */
3209 } else {
3211 unsigned long when = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
3212
3210 inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
3211 min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
3212 TCP_RTO_MAX);
3214 when, TCP_RTO_MAX);
3213 }
3214}
3215
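The zero-window-probe hunk above swaps the open-coded min(icsk_rto << icsk_backoff, TCP_RTO_MAX) for the inet_csk_rto_backoff() helper; the effect is the same exponential backoff clamped to a ceiling. A minimal sketch of that computation, widened to 64 bits to keep the shift safe (illustrative; name invented):

#include <stdint.h>

static unsigned long rto_backoff(unsigned long rto, unsigned int backoff,
				 unsigned long max_when)
{
	uint64_t when = (uint64_t)rto << backoff;

	return when > max_when ? max_when : (unsigned long)when;
}

/* rto_backoff(200, 3, 120000) == 1600, and large backoffs saturate at 120000
 * (values in jiffies for this example).
 */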
3216static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
3217{
3218 return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
3219 inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
3220}

--- 124 unchanged lines hidden (view full) ---

3345 tp->tlp_high_seq = 0;
3346 return;
3347 }
3348
3349 if (after(ack, tp->tlp_high_seq)) {
3350 tp->tlp_high_seq = 0;
3351 /* Don't reduce cwnd if DSACK arrives for TLP retrans. */
3352 if (!(flag & FLAG_DSACKING_ACK)) {
3353 tcp_init_cwnd_reduction(sk, true);
3355 tcp_init_cwnd_reduction(sk);
3354 tcp_set_ca_state(sk, TCP_CA_CWR);
3355 tcp_end_cwnd_reduction(sk);
3356 tcp_try_keep_open(sk);
3357 NET_INC_STATS_BH(sock_net(sk),
3358 LINUX_MIB_TCPLOSSPROBERECOVERY);
3359 }
3360 }
3361}
3362
3363/* This routine deals with incoming acks, but not outgoing ones. */
3364static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3365{
3366 struct inet_connection_sock *icsk = inet_csk(sk);
3367 struct tcp_sock *tp = tcp_sk(sk);
3368 u32 prior_snd_una = tp->snd_una;
3369 u32 ack_seq = TCP_SKB_CB(skb)->seq;
3370 u32 ack = TCP_SKB_CB(skb)->ack_seq;
3371 bool is_dupack = false;
3356 tcp_set_ca_state(sk, TCP_CA_CWR);
3357 tcp_end_cwnd_reduction(sk);
3358 tcp_try_keep_open(sk);
3359 NET_INC_STATS_BH(sock_net(sk),
3360 LINUX_MIB_TCPLOSSPROBERECOVERY);
3361 }
3362 }
3363}
3364
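The TLP block above and the ACK validation just below both lean on the before()/after() sequence comparisons. Here is a standalone sketch of the usual wraparound-safe idiom (signed difference of unsigned 32-bit sequence numbers); the helper names only echo the kernel's, the code is an illustration.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* True when seq1 precedes seq2 in 32-bit sequence space (wraparound-safe). */
static bool seq_before(uint32_t seq1, uint32_t seq2)
{
	return (int32_t)(seq1 - seq2) < 0;
}

static bool seq_after(uint32_t seq1, uint32_t seq2)
{
	return seq_before(seq2, seq1);
}

int main(void)
{
	uint32_t snd_una = 0xFFFFFF00u;  /* close to the wrap point */
	uint32_t ack     = 0x00000010u;  /* already wrapped around  */

	/* Although ack is numerically smaller, it is the newer value. */
	printf("after(ack, snd_una)  = %d\n", seq_after(ack, snd_una));
	printf("before(ack, snd_una) = %d\n", seq_before(ack, snd_una));
	return 0;
}

The same comparison also bounds the RFC 5961 mitigation below: an ACK more than one maximum window behind snd_una is answered with a challenge ACK instead of being processed.
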
3365/* This routine deals with incoming acks, but not outgoing ones. */
3366static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3367{
3368 struct inet_connection_sock *icsk = inet_csk(sk);
3369 struct tcp_sock *tp = tcp_sk(sk);
3370 u32 prior_snd_una = tp->snd_una;
3371 u32 ack_seq = TCP_SKB_CB(skb)->seq;
3372 u32 ack = TCP_SKB_CB(skb)->ack_seq;
3373 bool is_dupack = false;
3372 u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt;
3373 u32 prior_fackets;
3374 int prior_packets = tp->packets_out;
3375 const int prior_unsacked = tp->packets_out - tp->sacked_out;
3376 int acked = 0; /* Number of packets newly acked */
3374 u32 prior_fackets;
3375 int prior_packets = tp->packets_out;
3376 const int prior_unsacked = tp->packets_out - tp->sacked_out;
3377 int acked = 0; /* Number of packets newly acked */
3377 s32 sack_rtt = -1;
3378 long sack_rtt_us = -1L;
3378
3379 /* If the ack is older than previous acks
3380 * then we can probably ignore it.
3381 */
3382 if (before(ack, prior_snd_una)) {
3383 /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
3384 if (before(ack, prior_snd_una - tp->max_window)) {
3385 tcp_send_challenge_ack(sk);

--- 7 unchanged lines hidden (view full) ---

3393 */
3394 if (after(ack, tp->snd_nxt))
3395 goto invalid_ack;
3396
3397 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
3398 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
3399 tcp_rearm_rto(sk);
3400
3379
3380 /* If the ack is older than previous acks
3381 * then we can probably ignore it.
3382 */
3383 if (before(ack, prior_snd_una)) {
3384 /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
3385 if (before(ack, prior_snd_una - tp->max_window)) {
3386 tcp_send_challenge_ack(sk);

--- 7 unchanged lines hidden (view full) ---

3394 */
3395 if (after(ack, tp->snd_nxt))
3396 goto invalid_ack;
3397
3398 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
3399 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
3400 tcp_rearm_rto(sk);
3401
3401 if (after(ack, prior_snd_una))
3402 if (after(ack, prior_snd_una)) {
3402 flag |= FLAG_SND_UNA_ADVANCED;
3403 flag |= FLAG_SND_UNA_ADVANCED;
3404 icsk->icsk_retransmits = 0;
3405 }
3403
3404 prior_fackets = tp->fackets_out;
3406
3407 prior_fackets = tp->fackets_out;
3405 prior_in_flight = tcp_packets_in_flight(tp);
3406
3407 /* ts_recent update must be made after we are sure that the packet
3408 * is in window.
3409 */
3410 if (flag & FLAG_UPDATE_TS_RECENT)
3411 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
3412
3413 if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {

--- 13 unchanged lines hidden (view full) ---

3427 flag |= FLAG_DATA;
3428 else
3429 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPUREACKS);
3430
3431 flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
3432
3433 if (TCP_SKB_CB(skb)->sacked)
3434 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
3408
3409 /* ts_recent update must be made after we are sure that the packet
3410 * is in window.
3411 */
3412 if (flag & FLAG_UPDATE_TS_RECENT)
3413 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
3414
3415 if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {

--- 13 unchanged lines hidden (view full) ---

3429 flag |= FLAG_DATA;
3430 else
3431 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPUREACKS);
3432
3433 flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
3434
3435 if (TCP_SKB_CB(skb)->sacked)
3436 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
3435 &sack_rtt);
3437 &sack_rtt_us);
3436
3437 if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
3438 flag |= FLAG_ECE;
3439
3440 tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
3441 }
3442
3443 /* We passed data and got it acked, remove any soft error
3444 * log. Something worked...
3445 */
3446 sk->sk_err_soft = 0;
3447 icsk->icsk_probes_out = 0;
3448 tp->rcv_tstamp = tcp_time_stamp;
3449 if (!prior_packets)
3450 goto no_queue;
3451
3452 /* See if we can take anything off of the retransmit queue. */
3453 acked = tp->packets_out;
3438
3439 if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
3440 flag |= FLAG_ECE;
3441
3442 tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
3443 }
3444
3445 /* We passed data and got it acked, remove any soft error
3446 * log. Something worked...
3447 */
3448 sk->sk_err_soft = 0;
3449 icsk->icsk_probes_out = 0;
3450 tp->rcv_tstamp = tcp_time_stamp;
3451 if (!prior_packets)
3452 goto no_queue;
3453
3454 /* See if we can take anything off of the retransmit queue. */
3455 acked = tp->packets_out;
3454 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt);
3456 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una,
3457 sack_rtt_us);
3455 acked -= tp->packets_out;
3456
3457 /* Advance cwnd if state allows */
3458 if (tcp_may_raise_cwnd(sk, flag))
3458 acked -= tp->packets_out;
3459
3460 /* Advance cwnd if state allows */
3461 if (tcp_may_raise_cwnd(sk, flag))
3459 tcp_cong_avoid(sk, ack, acked, prior_in_flight);
3462 tcp_cong_avoid(sk, ack, acked);
3460
3461 if (tcp_ack_is_dubious(sk, flag)) {
3462 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
3463 tcp_fastretrans_alert(sk, acked, prior_unsacked,
3464 is_dupack, flag);
3465 }
3466 if (tp->tlp_high_seq)
3467 tcp_process_tlp_ack(sk, ack, flag);
3468
3469 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
3470 struct dst_entry *dst = __sk_dst_get(sk);
3471 if (dst)
3472 dst_confirm(dst);
3473 }
3474
3475 if (icsk->icsk_pending == ICSK_TIME_RETRANS)
3476 tcp_schedule_loss_probe(sk);
3463
3464 if (tcp_ack_is_dubious(sk, flag)) {
3465 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
3466 tcp_fastretrans_alert(sk, acked, prior_unsacked,
3467 is_dupack, flag);
3468 }
3469 if (tp->tlp_high_seq)
3470 tcp_process_tlp_ack(sk, ack, flag);
3471
3472 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
3473 struct dst_entry *dst = __sk_dst_get(sk);
3474 if (dst)
3475 dst_confirm(dst);
3476 }
3477
3478 if (icsk->icsk_pending == ICSK_TIME_RETRANS)
3479 tcp_schedule_loss_probe(sk);
3477 if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd)
3478 tcp_update_pacing_rate(sk);
3480 tcp_update_pacing_rate(sk);
3479 return 1;
3480
3481no_queue:
3482 /* If data was DSACKed, see if we can undo a cwnd reduction. */
3483 if (flag & FLAG_DSACKING_ACK)
3484 tcp_fastretrans_alert(sk, acked, prior_unsacked,
3485 is_dupack, flag);
3486 /* If this ack opens up a zero window, clear backoff. It was

--- 12 unchanged lines hidden (view full) ---

3499 return -1;
3500
3501old_ack:
3502 /* If data was SACKed, tag it and see if we should send more data.
3503 * If data was DSACKed, see if we can undo a cwnd reduction.
3504 */
3505 if (TCP_SKB_CB(skb)->sacked) {
3506 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
3481 return 1;
3482
3483no_queue:
3484 /* If data was DSACKed, see if we can undo a cwnd reduction. */
3485 if (flag & FLAG_DSACKING_ACK)
3486 tcp_fastretrans_alert(sk, acked, prior_unsacked,
3487 is_dupack, flag);
3488 /* If this ack opens up a zero window, clear backoff. It was

--- 12 unchanged lines hidden (view full) ---

3501 return -1;
3502
3503old_ack:
3504 /* If data was SACKed, tag it and see if we should send more data.
3505 * If data was DSACKed, see if we can undo a cwnd reduction.
3506 */
3507 if (TCP_SKB_CB(skb)->sacked) {
3508 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
3507 &sack_rtt);
3509 &sack_rtt_us);
3508 tcp_fastretrans_alert(sk, acked, prior_unsacked,
3509 is_dupack, flag);
3510 }
3511
3512 SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
3513 return 0;
3514}
3515

--- 538 unchanged lines hidden (view full) ---

4054 continue;
4055 }
4056 this_sack++;
4057 sp++;
4058 }
4059 tp->rx_opt.num_sacks = num_sacks;
4060}
4061
3510 tcp_fastretrans_alert(sk, acked, prior_unsacked,
3511 is_dupack, flag);
3512 }
3513
3514 SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
3515 return 0;
3516}
3517

--- 538 unchanged lines hidden (view full) ---

4056 continue;
4057 }
4058 this_sack++;
4059 sp++;
4060 }
4061 tp->rx_opt.num_sacks = num_sacks;
4062}
4063
4064/**
4065 * tcp_try_coalesce - try to merge skb to prior one
4066 * @sk: socket
4067 * @to: prior buffer
4068 * @from: buffer to add in queue
4069 * @fragstolen: pointer to boolean
4070 *
4071 * Before queueing skb @from after @to, try to merge them
4072 * to reduce overall memory use and queue lengths, if cost is small.
4073 * Packets in ofo or receive queues can stay a long time.
4074 * Better try to coalesce them right now to avoid future collapses.
4075 * Returns true if caller should free @from instead of queueing it
4076 */
4077static bool tcp_try_coalesce(struct sock *sk,
4078 struct sk_buff *to,
4079 struct sk_buff *from,
4080 bool *fragstolen)
4081{
4082 int delta;
4083
4084 *fragstolen = false;
4085
4086 /* It's possible this segment overlaps with a prior segment in the queue */
4087 if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
4088 return false;
4089
4090 if (!skb_try_coalesce(to, from, fragstolen, &delta))
4091 return false;
4092
4093 atomic_add(delta, &sk->sk_rmem_alloc);
4094 sk_mem_charge(sk, delta);
4095 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
4096 TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
4097 TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
4098 TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags;
4099 return true;
4100}
4101
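As the comment block above notes, coalescing is only attempted when @from begins exactly where @to ends. The userspace sketch below models that contiguity rule and the bookkeeping (extending end_seq and summing a truesize-like memory cost); struct seg and try_coalesce() are invented stand-ins for the skb machinery.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct seg {
	uint32_t seq;
	uint32_t end_seq;
	size_t   truesize;   /* memory cost, as charged to the socket */
};

/* Merge "from" into "to" when the two segments are contiguous. */
static bool try_coalesce(struct seg *to, const struct seg *from)
{
	if (from->seq != to->end_seq)
		return false;           /* gap or overlap: keep them separate */

	to->end_seq   = from->end_seq;
	to->truesize += from->truesize;
	return true;                    /* caller frees "from" */
}

int main(void)
{
	struct seg to   = { .seq = 1000, .end_seq = 2000, .truesize = 2048 };
	struct seg from = { .seq = 2000, .end_seq = 2500, .truesize = 1024 };

	if (try_coalesce(&to, &from))
		printf("coalesced: [%u,%u) cost %zu\n",
		       to.seq, to.end_seq, to.truesize);
	return 0;
}
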
4062/* This one checks to see if we can put data from the
4063 * out_of_order queue into the receive_queue.
4064 */
4065static void tcp_ofo_queue(struct sock *sk)
4066{
4067 struct tcp_sock *tp = tcp_sk(sk);
4068 __u32 dsack_high = tp->rcv_nxt;
4102/* This one checks to see if we can put data from the
4103 * out_of_order queue into the receive_queue.
4104 */
4105static void tcp_ofo_queue(struct sock *sk)
4106{
4107 struct tcp_sock *tp = tcp_sk(sk);
4108 __u32 dsack_high = tp->rcv_nxt;
4069 struct sk_buff *skb;
4109 struct sk_buff *skb, *tail;
4110 bool fragstolen, eaten;
4070
4071 while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
4072 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
4073 break;
4074
4075 if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
4076 __u32 dsack = dsack_high;
4077 if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
4078 dsack_high = TCP_SKB_CB(skb)->end_seq;
4079 tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
4080 }
4081
4111
4112 while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
4113 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
4114 break;
4115
4116 if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
4117 __u32 dsack = dsack_high;
4118 if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
4119 dsack_high = TCP_SKB_CB(skb)->end_seq;
4120 tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
4121 }
4122
4123 __skb_unlink(skb, &tp->out_of_order_queue);
4082 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
4083 SOCK_DEBUG(sk, "ofo packet was already received\n");
4124 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
4125 SOCK_DEBUG(sk, "ofo packet was already received\n");
4084 __skb_unlink(skb, &tp->out_of_order_queue);
4085 __kfree_skb(skb);
4086 continue;
4087 }
4088 SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
4089 tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
4090 TCP_SKB_CB(skb)->end_seq);
4091
4126 __kfree_skb(skb);
4127 continue;
4128 }
4129 SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
4130 tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
4131 TCP_SKB_CB(skb)->end_seq);
4132
4092 __skb_unlink(skb, &tp->out_of_order_queue);
4093 __skb_queue_tail(&sk->sk_receive_queue, skb);
4133 tail = skb_peek_tail(&sk->sk_receive_queue);
4134 eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
4094 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
4135 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
4095 if (tcp_hdr(skb)->fin)
4136 if (!eaten)
4137 __skb_queue_tail(&sk->sk_receive_queue, skb);
4138 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
4096 tcp_fin(sk);
4139 tcp_fin(sk);
4140 if (eaten)
4141 kfree_skb_partial(skb, fragstolen);
4097 }
4098}
4099
4100static bool tcp_prune_ofo_queue(struct sock *sk);
4101static int tcp_prune_queue(struct sock *sk);
4102
4103static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
4104 unsigned int size)

--- 10 unchanged lines hidden (view full) ---

4115
4116 if (!sk_rmem_schedule(sk, skb, size))
4117 return -1;
4118 }
4119 }
4120 return 0;
4121}
4122
4142 }
4143}
4144
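tcp_ofo_queue() above walks the out-of-order queue in sequence order, delivers every segment that has become in-order, advances rcv_nxt as it goes, and drops segments that turn out to be entirely duplicate. Below is a compact model of that drain loop, using a sorted array instead of an skb queue and leaving out the DSACK bookkeeping.

#include <stdint.h>
#include <stdio.h>

struct seg { uint32_t seq, end_seq; };

/* Consume segments that start at or before rcv_nxt from the (sorted)
 * out-of-order array; returns the advanced rcv_nxt.
 */
static uint32_t drain_ofo(struct seg *ofo, int *count, uint32_t rcv_nxt)
{
	int i = 0, j;

	while (i < *count && (int32_t)(ofo[i].seq - rcv_nxt) <= 0) {
		if ((int32_t)(ofo[i].end_seq - rcv_nxt) > 0) {
			printf("deliver [%u,%u)\n", ofo[i].seq, ofo[i].end_seq);
			rcv_nxt = ofo[i].end_seq;
		} else {
			printf("drop duplicate [%u,%u)\n",
			       ofo[i].seq, ofo[i].end_seq);
		}
		i++;
	}
	for (j = i; j < *count; j++)    /* compact: drop consumed entries */
		ofo[j - i] = ofo[j];
	*count -= i;
	return rcv_nxt;
}

int main(void)
{
	struct seg ofo[] = { {100, 200}, {150, 180}, {200, 300}, {400, 500} };
	int count = 4;
	uint32_t rcv_nxt = drain_ofo(ofo, &count, 100);

	printf("rcv_nxt now %u, %d segment(s) still out of order\n",
	       rcv_nxt, count);
	return 0;
}
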
4145static bool tcp_prune_ofo_queue(struct sock *sk);
4146static int tcp_prune_queue(struct sock *sk);
4147
4148static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
4149 unsigned int size)

--- 10 unchanged lines hidden (view full) ---

4160
4161 if (!sk_rmem_schedule(sk, skb, size))
4162 return -1;
4163 }
4164 }
4165 return 0;
4166}
4167
4123/**
4124 * tcp_try_coalesce - try to merge skb to prior one
4125 * @sk: socket
4126 * @to: prior buffer
4127 * @from: buffer to add in queue
4128 * @fragstolen: pointer to boolean
4129 *
4130 * Before queueing skb @from after @to, try to merge them
4131 * to reduce overall memory use and queue lengths, if cost is small.
4132 * Packets in ofo or receive queues can stay a long time.
4133 * Better try to coalesce them right now to avoid future collapses.
4134 * Returns true if caller should free @from instead of queueing it
4135 */
4136static bool tcp_try_coalesce(struct sock *sk,
4137 struct sk_buff *to,
4138 struct sk_buff *from,
4139 bool *fragstolen)
4140{
4141 int delta;
4142
4143 *fragstolen = false;
4144
4145 if (tcp_hdr(from)->fin)
4146 return false;
4147
4148 /* It's possible this segment overlaps with a prior segment in the queue */
4149 if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
4150 return false;
4151
4152 if (!skb_try_coalesce(to, from, fragstolen, &delta))
4153 return false;
4154
4155 atomic_add(delta, &sk->sk_rmem_alloc);
4156 sk_mem_charge(sk, delta);
4157 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
4158 TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
4159 TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
4160 return true;
4161}
4162
4163static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4164{
4165 struct tcp_sock *tp = tcp_sk(sk);
4166 struct sk_buff *skb1;
4167 u32 seq, end_seq;
4168
4169 TCP_ECN_check_ce(tp, skb);
4170

--- 129 unchanged lines hidden (view full) ---

4300 __skb_queue_tail(&sk->sk_receive_queue, skb);
4301 skb_set_owner_r(skb, sk);
4302 }
4303 return eaten;
4304}
4305
4306int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
4307{
4168static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4169{
4170 struct tcp_sock *tp = tcp_sk(sk);
4171 struct sk_buff *skb1;
4172 u32 seq, end_seq;
4173
4174 TCP_ECN_check_ce(tp, skb);
4175

--- 129 unchanged lines hidden (view full) ---

4305 __skb_queue_tail(&sk->sk_receive_queue, skb);
4306 skb_set_owner_r(skb, sk);
4307 }
4308 return eaten;
4309}
4310
4311int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
4312{
4308 struct sk_buff *skb = NULL;
4309 struct tcphdr *th;
4313 struct sk_buff *skb;
4310 bool fragstolen;
4311
4312 if (size == 0)
4313 return 0;
4314
4314 bool fragstolen;
4315
4316 if (size == 0)
4317 return 0;
4318
4315 skb = alloc_skb(size + sizeof(*th), sk->sk_allocation);
4319 skb = alloc_skb(size, sk->sk_allocation);
4316 if (!skb)
4317 goto err;
4318
4320 if (!skb)
4321 goto err;
4322
4319 if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th)))
4323 if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
4320 goto err_free;
4321
4324 goto err_free;
4325
4322 th = (struct tcphdr *)skb_put(skb, sizeof(*th));
4323 skb_reset_transport_header(skb);
4324 memset(th, 0, sizeof(*th));
4325
4326 if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size))
4327 goto err_free;
4328
4329 TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
4330 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
4331 TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
4332
4326 if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size))
4327 goto err_free;
4328
4329 TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
4330 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
4331 TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
4332
4333 if (tcp_queue_rcv(sk, skb, sizeof(*th), &fragstolen)) {
4333 if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) {
4334 WARN_ON_ONCE(fragstolen); /* should not happen */
4335 __kfree_skb(skb);
4336 }
4337 return size;
4338
4339err_free:
4340 kfree_skb(skb);
4341err:

--- 71 unchanged lines hidden (view full) ---

4413 if (tp->rx_opt.num_sacks)
4414 tcp_sack_remove(tp);
4415
4416 tcp_fast_path_check(sk);
4417
4418 if (eaten > 0)
4419 kfree_skb_partial(skb, fragstolen);
4420 if (!sock_flag(sk, SOCK_DEAD))
4334 WARN_ON_ONCE(fragstolen); /* should not happen */
4335 __kfree_skb(skb);
4336 }
4337 return size;
4338
4339err_free:
4340 kfree_skb(skb);
4341err:

--- 71 unchanged lines hidden (view full) ---

4413 if (tp->rx_opt.num_sacks)
4414 tcp_sack_remove(tp);
4415
4416 tcp_fast_path_check(sk);
4417
4418 if (eaten > 0)
4419 kfree_skb_partial(skb, fragstolen);
4420 if (!sock_flag(sk, SOCK_DEAD))
4421 sk->sk_data_ready(sk, 0);
4421 sk->sk_data_ready(sk);
4422 return;
4423 }
4424
4425 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
4426 /* A retransmit, 2nd most common case. Force an immediate ack. */
4427 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
4428 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
4429

--- 77 unchanged lines hidden (view full) ---

4507 goto restart;
4508 }
4509
4510 /* The first skb to collapse is:
4511 * - not SYN/FIN and
4512 * - bloated or contains data before "start" or
4513 * overlaps the next one.
4514 */
4422 return;
4423 }
4424
4425 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
4426 /* A retransmit, 2nd most common case. Force an immediate ack. */
4427 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
4428 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
4429

--- 77 unchanged lines hidden (view full) ---

4507 goto restart;
4508 }
4509
4510 /* The first skb to collapse is:
4511 * - not SYN/FIN and
4512 * - bloated or contains data before "start" or
4513 * overlaps the next one.
4514 */
4515 if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin &&
4515 if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
4516 (tcp_win_from_space(skb->truesize) > skb->len ||
4517 before(TCP_SKB_CB(skb)->seq, start))) {
4518 end_of_skbs = false;
4519 break;
4520 }
4521
4522 if (!skb_queue_is_last(list, skb)) {
4523 struct sk_buff *next = skb_queue_next(list, skb);
4524 if (next != tail &&
4525 TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) {
4526 end_of_skbs = false;
4527 break;
4528 }
4529 }
4530
4531 /* Decided to skip this, advance start seq. */
4532 start = TCP_SKB_CB(skb)->end_seq;
4533 }
4516 (tcp_win_from_space(skb->truesize) > skb->len ||
4517 before(TCP_SKB_CB(skb)->seq, start))) {
4518 end_of_skbs = false;
4519 break;
4520 }
4521
4522 if (!skb_queue_is_last(list, skb)) {
4523 struct sk_buff *next = skb_queue_next(list, skb);
4524 if (next != tail &&
4525 TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) {
4526 end_of_skbs = false;
4527 break;
4528 }
4529 }
4530
4531 /* Decided to skip this, advance start seq. */
4532 start = TCP_SKB_CB(skb)->end_seq;
4533 }
4534 if (end_of_skbs || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin)
4534 if (end_of_skbs ||
4535 (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
4535 return;
4536
4537 while (before(start, end)) {
4536 return;
4537
4538 while (before(start, end)) {
4539 int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start);
4538 struct sk_buff *nskb;
4540 struct sk_buff *nskb;
4539 unsigned int header = skb_headroom(skb);
4540 int copy = SKB_MAX_ORDER(header, 0);
4541
4541
4542 /* Too big header? This can happen with IPv6. */
4543 if (copy < 0)
4544 return;
4545 if (end - start < copy)
4546 copy = end - start;
4547 nskb = alloc_skb(copy + header, GFP_ATOMIC);
4542 nskb = alloc_skb(copy, GFP_ATOMIC);
4548 if (!nskb)
4549 return;
4550
4543 if (!nskb)
4544 return;
4545
4551 skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head);
4552 skb_set_network_header(nskb, (skb_network_header(skb) -
4553 skb->head));
4554 skb_set_transport_header(nskb, (skb_transport_header(skb) -
4555 skb->head));
4556 skb_reserve(nskb, header);
4557 memcpy(nskb->head, skb->head, header);
4558 memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
4559 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
4560 __skb_queue_before(list, skb, nskb);
4561 skb_set_owner_r(nskb, sk);
4562
4563 /* Copy data, releasing collapsed skbs. */
4564 while (copy > 0) {
4565 int offset = start - TCP_SKB_CB(skb)->seq;

--- 7 unchanged lines hidden (view full) ---

4573 TCP_SKB_CB(nskb)->end_seq += size;
4574 copy -= size;
4575 start += size;
4576 }
4577 if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
4578 skb = tcp_collapse_one(sk, skb, list);
4579 if (!skb ||
4580 skb == tail ||
4546 memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
4547 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
4548 __skb_queue_before(list, skb, nskb);
4549 skb_set_owner_r(nskb, sk);
4550
4551 /* Copy data, releasing collapsed skbs. */
4552 while (copy > 0) {
4553 int offset = start - TCP_SKB_CB(skb)->seq;

--- 7 unchanged lines hidden (view full) ---

4561 TCP_SKB_CB(nskb)->end_seq += size;
4562 copy -= size;
4563 start += size;
4564 }
4565 if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
4566 skb = tcp_collapse_one(sk, skb, list);
4567 if (!skb ||
4568 skb == tail ||
4581 tcp_hdr(skb)->syn ||
4582 tcp_hdr(skb)->fin)
4569 (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
4583 return;
4584 }
4585 }
4586 }
4587}
4588
4589/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
4590 * and tcp_collapse() them until the whole queue is collapsed.

--- 112 unchanged lines hidden (view full) ---

4703 */
4704 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_RCVPRUNED);
4705
4706 /* Massive buffer overcommit. */
4707 tp->pred_flags = 0;
4708 return -1;
4709}
4710
4570 return;
4571 }
4572 }
4573 }
4574}
4575
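The collapse loop above rebuilds a run of bloated skbs into tightly sized ones: each replacement holds at most roughly a page of payload (the SKB_MAX_ORDER(0, 0) bound in the newer code) and is filled from the old segments. The sketch below reproduces only the chunking arithmetic, with an assumed 4096-byte cap standing in for the real bound.

#include <stdint.h>
#include <stdio.h>

#define MAX_CHUNK 4096u   /* stand-in for the SKB_MAX_ORDER(0, 0) bound */

static unsigned int min_u(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

/* Split the byte range [start, end) into allocations of at most
 * MAX_CHUNK bytes, the way the collapse loop sizes its new skbs.
 */
static void plan_collapse(uint32_t start, uint32_t end)
{
	while ((int32_t)(end - start) > 0) {
		unsigned int copy = min_u(MAX_CHUNK, end - start);

		printf("new skb for [%u,%u) (%u bytes)\n",
		       start, start + copy, copy);
		start += copy;
	}
}

int main(void)
{
	plan_collapse(10000, 10000 + 10500);    /* 10.5 KB -> 3 buffers */
	return 0;
}
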
4576/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
4577 * and tcp_collapse() them until the whole queue is collapsed.

--- 112 unchanged lines hidden (view full) ---

4690 */
4691 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_RCVPRUNED);
4692
4693 /* Massive buffer overcommit. */
4694 tp->pred_flags = 0;
4695 return -1;
4696}
4697
4711/* RFC 2861, slow part. Adjust cwnd when it has not been full during one RTO.
4712 * As additional protection, we do not touch cwnd in retransmission phases,
4713 * or if the application has hit its sndbuf limit recently.
4714 */
4715void tcp_cwnd_application_limited(struct sock *sk)
4716{
4717 struct tcp_sock *tp = tcp_sk(sk);
4718
4719 if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
4720 sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
4721 /* Limited by application or receiver window. */
4722 u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
4723 u32 win_used = max(tp->snd_cwnd_used, init_win);
4724 if (win_used < tp->snd_cwnd) {
4725 tp->snd_ssthresh = tcp_current_ssthresh(sk);
4726 tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
4727 }
4728 tp->snd_cwnd_used = 0;
4729 }
4730 tp->snd_cwnd_stamp = tcp_time_stamp;
4731}
4732
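After a full RTO spent application-limited, the function above pulls cwnd halfway toward the window that was actually used, never letting that reference drop below the initial window. A minimal sketch of the arithmetic, assuming an initial window of 10 segments:

#include <stdio.h>

#define INIT_CWND 10u   /* assumed initial window, in segments */

static unsigned int max_u(unsigned int a, unsigned int b)
{
	return a > b ? a : b;
}

/* RFC 2861-style decay: move cwnd halfway toward what was really used. */
static unsigned int app_limited_cwnd(unsigned int snd_cwnd,
				     unsigned int snd_cwnd_used)
{
	unsigned int win_used = max_u(snd_cwnd_used, INIT_CWND);

	if (win_used < snd_cwnd)
		snd_cwnd = (snd_cwnd + win_used) >> 1;
	return snd_cwnd;
}

int main(void)
{
	/* cwnd grew to 100 segments but the application only used 20. */
	printf("cwnd after one idle RTO: %u\n", app_limited_cwnd(100, 20));
	/* A fully used window is left untouched. */
	printf("cwnd when fully used:    %u\n", app_limited_cwnd(100, 100));
	return 0;
}
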
4733static bool tcp_should_expand_sndbuf(const struct sock *sk)
4734{
4735 const struct tcp_sock *tp = tcp_sk(sk);
4736
4737 /* If the user specified a specific send buffer setting, do
4738 * not modify it.
4739 */
4740 if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)

--- 173 unchanged lines hidden (view full) ---

4914
4915 /* Is the urgent pointer pointing into this packet? */
4916 if (ptr < skb->len) {
4917 u8 tmp;
4918 if (skb_copy_bits(skb, ptr, &tmp, 1))
4919 BUG();
4920 tp->urg_data = TCP_URG_VALID | tmp;
4921 if (!sock_flag(sk, SOCK_DEAD))
4698static bool tcp_should_expand_sndbuf(const struct sock *sk)
4699{
4700 const struct tcp_sock *tp = tcp_sk(sk);
4701
4702 /* If the user specified a specific send buffer setting, do
4703 * not modify it.
4704 */
4705 if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)

--- 173 unchanged lines hidden (view full) ---

4879
4880 /* Is the urgent pointer pointing into this packet? */
4881 if (ptr < skb->len) {
4882 u8 tmp;
4883 if (skb_copy_bits(skb, ptr, &tmp, 1))
4884 BUG();
4885 tp->urg_data = TCP_URG_VALID | tmp;
4886 if (!sock_flag(sk, SOCK_DEAD))
4922 sk->sk_data_ready(sk, 0);
4887 sk->sk_data_ready(sk);
4923 }
4924 }
4925}
4926
4927static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
4928{
4929 struct tcp_sock *tp = tcp_sk(sk);
4930 int chunk = skb->len - hlen;

--- 33 unchanged lines hidden (view full) ---

4964
4965static inline bool tcp_checksum_complete_user(struct sock *sk,
4966 struct sk_buff *skb)
4967{
4968 return !skb_csum_unnecessary(skb) &&
4969 __tcp_checksum_complete_user(sk, skb);
4970}
4971
4888 }
4889 }
4890}
4891
4892static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
4893{
4894 struct tcp_sock *tp = tcp_sk(sk);
4895 int chunk = skb->len - hlen;

--- 33 unchanged lines hidden (view full) ---

4929
4930static inline bool tcp_checksum_complete_user(struct sock *sk,
4931 struct sk_buff *skb)
4932{
4933 return !skb_csum_unnecessary(skb) &&
4934 __tcp_checksum_complete_user(sk, skb);
4935}
4936
4937#ifdef CONFIG_NET_DMA
4938static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
4939 int hlen)
4940{
4941 struct tcp_sock *tp = tcp_sk(sk);
4942 int chunk = skb->len - hlen;
4943 int dma_cookie;
4944 bool copied_early = false;
4945
4946 if (tp->ucopy.wakeup)
4947 return false;
4948
4949 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
4950 tp->ucopy.dma_chan = net_dma_find_channel();
4951
4952 if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) {
4953
4954 dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan,
4955 skb, hlen,
4956 tp->ucopy.iov, chunk,
4957 tp->ucopy.pinned_list);
4958
4959 if (dma_cookie < 0)
4960 goto out;
4961
4962 tp->ucopy.dma_cookie = dma_cookie;
4963 copied_early = true;
4964
4965 tp->ucopy.len -= chunk;
4966 tp->copied_seq += chunk;
4967 tcp_rcv_space_adjust(sk);
4968
4969 if ((tp->ucopy.len == 0) ||
4970 (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) ||
4971 (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) {
4972 tp->ucopy.wakeup = 1;
4973 sk->sk_data_ready(sk);
4974 }
4975 } else if (chunk > 0) {
4976 tp->ucopy.wakeup = 1;
4977 sk->sk_data_ready(sk);
4978 }
4979out:
4980 return copied_early;
4981}
4982#endif /* CONFIG_NET_DMA */
4983
4972/* Does PAWS and seqno-based validation of an incoming segment; flags will
4973 * play a significant role here.
4974 */
4975static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
4976 const struct tcphdr *th, int syn_inerr)
4977{
4978 struct tcp_sock *tp = tcp_sk(sk);
4979

--- 163 unchanged lines hidden (view full) ---

5143 tcp_data_snd_check(sk);
5144 return;
5145 } else { /* Header too small */
5146 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
5147 goto discard;
5148 }
5149 } else {
5150 int eaten = 0;
4984/* Does PAWS and seqno-based validation of an incoming segment; flags will
4985 * play a significant role here.
4986 */
4987static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
4988 const struct tcphdr *th, int syn_inerr)
4989{
4990 struct tcp_sock *tp = tcp_sk(sk);
4991

--- 163 unchanged lines hidden (view full) ---

5155 tcp_data_snd_check(sk);
5156 return;
5157 } else { /* Header too small */
5158 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
5159 goto discard;
5160 }
5161 } else {
5162 int eaten = 0;
5163 int copied_early = 0;
5151 bool fragstolen = false;
5152
5164 bool fragstolen = false;
5165
5153 if (tp->ucopy.task == current &&
5154 tp->copied_seq == tp->rcv_nxt &&
5155 len - tcp_header_len <= tp->ucopy.len &&
5156 sock_owned_by_user(sk)) {
5157 __set_current_state(TASK_RUNNING);
5166 if (tp->copied_seq == tp->rcv_nxt &&
5167 len - tcp_header_len <= tp->ucopy.len) {
5168#ifdef CONFIG_NET_DMA
5169 if (tp->ucopy.task == current &&
5170 sock_owned_by_user(sk) &&
5171 tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
5172 copied_early = 1;
5173 eaten = 1;
5174 }
5175#endif
5176 if (tp->ucopy.task == current &&
5177 sock_owned_by_user(sk) && !copied_early) {
5178 __set_current_state(TASK_RUNNING);
5158
5179
5159 if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
5180 if (!tcp_copy_to_iovec(sk, skb, tcp_header_len))
5181 eaten = 1;
5182 }
5183 if (eaten) {
5160 /* Predicted packet is in window by definition.
5161 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
5162 * Hence, check seq<=rcv_wup reduces to:
5163 */
5164 if (tcp_header_len ==
5165 (sizeof(struct tcphdr) +
5166 TCPOLEN_TSTAMP_ALIGNED) &&
5167 tp->rcv_nxt == tp->rcv_wup)
5168 tcp_store_ts_recent(tp);
5169
5170 tcp_rcv_rtt_measure_ts(sk, skb);
5171
5172 __skb_pull(skb, tcp_header_len);
5173 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
5174 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
5184 /* Predicted packet is in window by definition.
5185 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
5186 * Hence, check seq<=rcv_wup reduces to:
5187 */
5188 if (tcp_header_len ==
5189 (sizeof(struct tcphdr) +
5190 TCPOLEN_TSTAMP_ALIGNED) &&
5191 tp->rcv_nxt == tp->rcv_wup)
5192 tcp_store_ts_recent(tp);
5193
5194 tcp_rcv_rtt_measure_ts(sk, skb);
5195
5196 __skb_pull(skb, tcp_header_len);
5197 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
5198 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
5175 eaten = 1;
5176 }
5199 }
5200 if (copied_early)
5201 tcp_cleanup_rbuf(sk, skb->len);
5177 }
5178 if (!eaten) {
5179 if (tcp_checksum_complete_user(sk, skb))
5180 goto csum_error;
5181
5182 if ((int)skb->truesize > sk->sk_forward_alloc)
5183 goto step5;
5184

--- 20 unchanged lines hidden (view full) ---

5205 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
5206 /* Well, only one small jumplet in fast path... */
5207 tcp_ack(sk, skb, FLAG_DATA);
5208 tcp_data_snd_check(sk);
5209 if (!inet_csk_ack_scheduled(sk))
5210 goto no_ack;
5211 }
5212
5202 }
5203 if (!eaten) {
5204 if (tcp_checksum_complete_user(sk, skb))
5205 goto csum_error;
5206
5207 if ((int)skb->truesize > sk->sk_forward_alloc)
5208 goto step5;
5209

--- 20 unchanged lines hidden (view full) ---

5230 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
5231 /* Well, only one small jumplet in fast path... */
5232 tcp_ack(sk, skb, FLAG_DATA);
5233 tcp_data_snd_check(sk);
5234 if (!inet_csk_ack_scheduled(sk))
5235 goto no_ack;
5236 }
5237
5213 __tcp_ack_snd_check(sk, 0);
5238 if (!copied_early || tp->rcv_nxt != tp->rcv_wup)
5239 __tcp_ack_snd_check(sk, 0);
5214no_ack:
5240no_ack:
5241#ifdef CONFIG_NET_DMA
5242 if (copied_early)
5243 __skb_queue_tail(&sk->sk_async_wait_queue, skb);
5244 else
5245#endif
5215 if (eaten)
5216 kfree_skb_partial(skb, fragstolen);
5246 if (eaten)
5247 kfree_skb_partial(skb, fragstolen);
5217 sk->sk_data_ready(sk, 0);
5248 sk->sk_data_ready(sk);
5218 return;
5219 }
5220 }
5221
5222slow_path:
5223 if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))
5224 goto csum_error;
5225

--- 103 unchanged lines hidden (view full) ---

5329
5330 if (data) { /* Retransmit unacked data in SYN */
5331 tcp_for_write_queue_from(data, sk) {
5332 if (data == tcp_send_head(sk) ||
5333 __tcp_retransmit_skb(sk, data))
5334 break;
5335 }
5336 tcp_rearm_rto(sk);
5249 return;
5250 }
5251 }
5252
5253slow_path:
5254 if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))
5255 goto csum_error;
5256

--- 103 unchanged lines hidden (view full) ---

5360
5361 if (data) { /* Retransmit unacked data in SYN */
5362 tcp_for_write_queue_from(data, sk) {
5363 if (data == tcp_send_head(sk) ||
5364 __tcp_retransmit_skb(sk, data))
5365 break;
5366 }
5367 tcp_rearm_rto(sk);
5368 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL);
5337 return true;
5338 }
5339 tp->syn_data_acked = tp->syn_data;
5369 return true;
5370 }
5371 tp->syn_data_acked = tp->syn_data;
5372 if (tp->syn_data_acked)
5373 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
5340 return false;
5341}
5342
5343static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5344 const struct tcphdr *th, unsigned int len)
5345{
5346 struct inet_connection_sock *icsk = inet_csk(sk);
5347 struct tcp_sock *tp = tcp_sk(sk);

--- 483 unchanged lines hidden (view full) ---

5831
5832 if (!queued) {
5833discard:
5834 __kfree_skb(skb);
5835 }
5836 return 0;
5837}
5838EXPORT_SYMBOL(tcp_rcv_state_process);
5374 return false;
5375}
5376
5377static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5378 const struct tcphdr *th, unsigned int len)
5379{
5380 struct inet_connection_sock *icsk = inet_csk(sk);
5381 struct tcp_sock *tp = tcp_sk(sk);

--- 483 unchanged lines hidden (view full) ---

5865
5866 if (!queued) {
5867discard:
5868 __kfree_skb(skb);
5869 }
5870 return 0;
5871}
5872EXPORT_SYMBOL(tcp_rcv_state_process);
5873
5874static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
5875{
5876 struct inet_request_sock *ireq = inet_rsk(req);
5877
5878 if (family == AF_INET)
5879 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
5880 &ireq->ir_rmt_addr, port);
5881#if IS_ENABLED(CONFIG_IPV6)
5882 else if (family == AF_INET6)
5883 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI6/%u\n"),
5884 &ireq->ir_v6_rmt_addr, port);
5885#endif
5886}
5887
5888int tcp_conn_request(struct request_sock_ops *rsk_ops,
5889 const struct tcp_request_sock_ops *af_ops,
5890 struct sock *sk, struct sk_buff *skb)
5891{
5892 struct tcp_options_received tmp_opt;
5893 struct request_sock *req;
5894 struct tcp_sock *tp = tcp_sk(sk);
5895 struct dst_entry *dst = NULL;
5896 __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
5897 bool want_cookie = false, fastopen;
5898 struct flowi fl;
5899 struct tcp_fastopen_cookie foc = { .len = -1 };
5900 int err;
5901
5902
5903 /* TW buckets are converted to open requests without
5904 * limitation: they conserve resources and the peer is
5905 * evidently a real one.
5906 */
5907 if ((sysctl_tcp_syncookies == 2 ||
5908 inet_csk_reqsk_queue_is_full(sk)) && !isn) {
5909 want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
5910 if (!want_cookie)
5911 goto drop;
5912 }
5913
5914
5915 /* The accept backlog is full. If we have already queued enough
5916 * warm entries in the syn queue, drop the request. That is better than
5917 * clogging the syn queue with openreqs whose timeouts increase
5918 * exponentially.
5919 */
5920 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
5921 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
5922 goto drop;
5923 }
5924
5925 req = inet_reqsk_alloc(rsk_ops);
5926 if (!req)
5927 goto drop;
5928
5929 tcp_rsk(req)->af_specific = af_ops;
5930
5931 tcp_clear_options(&tmp_opt);
5932 tmp_opt.mss_clamp = af_ops->mss_clamp;
5933 tmp_opt.user_mss = tp->rx_opt.user_mss;
5934 tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
5935
5936 if (want_cookie && !tmp_opt.saw_tstamp)
5937 tcp_clear_options(&tmp_opt);
5938
5939 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
5940 tcp_openreq_init(req, &tmp_opt, skb, sk);
5941
5942 af_ops->init_req(req, sk, skb);
5943
5944 if (security_inet_conn_request(sk, skb, req))
5945 goto drop_and_free;
5946
5947 if (!want_cookie || tmp_opt.tstamp_ok)
5948 TCP_ECN_create_request(req, skb, sock_net(sk));
5949
5950 if (want_cookie) {
5951 isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
5952 req->cookie_ts = tmp_opt.tstamp_ok;
5953 } else if (!isn) {
5954 /* VJ's idea. We save the last timestamp seen
5955 * from the destination in the peer table when entering
5956 * TIME-WAIT state, and check against it before
5957 * accepting a new connection request.
5958 *
5959 * If "isn" is not zero, this request hit a live
5960 * timewait bucket, so all the necessary checks
5961 * have been made by the code processing the timewait state.
5962 */
5963 if (tcp_death_row.sysctl_tw_recycle) {
5964 bool strict;
5965
5966 dst = af_ops->route_req(sk, &fl, req, &strict);
5967
5968 if (dst && strict &&
5969 !tcp_peer_is_proven(req, dst, true,
5970 tmp_opt.saw_tstamp)) {
5971 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
5972 goto drop_and_release;
5973 }
5974 }
5975 /* Kill the following clause if you dislike this approach. */
5976 else if (!sysctl_tcp_syncookies &&
5977 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
5978 (sysctl_max_syn_backlog >> 2)) &&
5979 !tcp_peer_is_proven(req, dst, false,
5980 tmp_opt.saw_tstamp)) {
5981 /* Without syncookies, the last quarter of
5982 * the backlog is filled only with destinations
5983 * proven to be alive.
5984 * That means we keep communicating with
5985 * destinations we already remembered at
5986 * the moment the synflood started.
5987 */
5988 pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
5989 rsk_ops->family);
5990 goto drop_and_release;
5991 }
5992
5993 isn = af_ops->init_seq(skb);
5994 }
5995 if (!dst) {
5996 dst = af_ops->route_req(sk, &fl, req, NULL);
5997 if (!dst)
5998 goto drop_and_free;
5999 }
6000
6001 tcp_rsk(req)->snt_isn = isn;
6002 tcp_openreq_init_rwin(req, sk, dst);
6003 fastopen = !want_cookie &&
6004 tcp_try_fastopen(sk, skb, req, &foc, dst);
6005 err = af_ops->send_synack(sk, dst, &fl, req,
6006 skb_get_queue_mapping(skb), &foc);
6007 if (!fastopen) {
6008 if (err || want_cookie)
6009 goto drop_and_free;
6010
6011 tcp_rsk(req)->listener = NULL;
6012 af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
6013 }
6014
6015 return 0;
6016
6017drop_and_release:
6018 dst_release(dst);
6019drop_and_free:
6020 reqsk_free(req);
6021drop:
6022 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
6023 return 0;
6024}
6025EXPORT_SYMBOL(tcp_conn_request);
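tcp_conn_request() makes its admission decisions up front: a full SYN queue may be answered with a syncookie, a full accept backlog with more than one young request is dropped, and without syncookies the last quarter of the SYN backlog is kept for destinations already proven alive. The sketch below strings those checks together with invented counters and helper names purely to make the ordering explicit; none of it is kernel API.

#include <stdbool.h>
#include <stdio.h>

struct listener {
	int  syn_qlen;       /* requests currently in the SYN queue   */
	int  syn_qmax;       /* SYN queue capacity                    */
	int  young;          /* requests that never had to retransmit */
	bool acceptq_full;   /* accept backlog already full?          */
	bool syncookies;     /* are syncookies enabled?               */
};

enum verdict { ACCEPT, SEND_COOKIE, DROP, DROP_UNLESS_PROVEN };

static enum verdict admit_syn(const struct listener *l)
{
	if (l->syn_qlen >= l->syn_qmax)
		return l->syncookies ? SEND_COOKIE : DROP;

	if (l->acceptq_full && l->young > 1)
		return DROP;

	/* Without syncookies, keep the last quarter of the SYN backlog
	 * for peers that have already proven themselves alive.
	 */
	if (!l->syncookies &&
	    l->syn_qmax - l->syn_qlen < (l->syn_qmax >> 2))
		return DROP_UNLESS_PROVEN;

	return ACCEPT;
}

int main(void)
{
	struct listener l = {
		.syn_qlen = 200, .syn_qmax = 256,
		.young = 0, .acceptq_full = false, .syncookies = false,
	};

	/* 256 - 200 = 56 free slots, below the 64-slot last quarter. */
	printf("verdict: %d\n", admit_syn(&l));
	return 0;
}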