tcp_input.c (d27f9bc104375a0a835cf68bb88fc9cec69125da, old) | tcp_input.c (cd7d8498c9a5d510c64db38d9f4f4fbc41790f09, new) |
---|---|
1/* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Authors: Ross Biro --- 59 unchanged lines hidden (view full) --- 68#include <linux/module.h> 69#include <linux/sysctl.h> 70#include <linux/kernel.h> 71#include <net/dst.h> 72#include <net/tcp.h> 73#include <net/inet_common.h> 74#include <linux/ipsec.h> 75#include <asm/unaligned.h> | 1/* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * Implementation of the Transmission Control Protocol(TCP). 7 * 8 * Authors: Ross Biro --- 59 unchanged lines hidden (view full) --- 68#include <linux/module.h> 69#include <linux/sysctl.h> 70#include <linux/kernel.h> 71#include <net/dst.h> 72#include <net/tcp.h> 73#include <net/inet_common.h> 74#include <linux/ipsec.h> 75#include <asm/unaligned.h> |
76#include <net/netdma.h> 77#include <linux/errqueue.h> |
|
76 77int sysctl_tcp_timestamps __read_mostly = 1; 78int sysctl_tcp_window_scaling __read_mostly = 1; 79int sysctl_tcp_sack __read_mostly = 1; 80int sysctl_tcp_fack __read_mostly = 1; 81int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH; 82EXPORT_SYMBOL(sysctl_tcp_reordering); 83int sysctl_tcp_dsack __read_mostly = 1; --- 577 unchanged lines hidden (view full) --- 661 * routine either comes from timestamps, or from segments that were 662 * known _not_ to have been retransmitted [see Karn/Partridge 663 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 664 * piece by Van Jacobson. 665 * NOTE: the next three routines used to be one big routine. 666 * To save cycles in the RFC 1323 implementation it was better to break 667 * it up into three procedures. -- erics 668 */ | 78 79int sysctl_tcp_timestamps __read_mostly = 1; 80int sysctl_tcp_window_scaling __read_mostly = 1; 81int sysctl_tcp_sack __read_mostly = 1; 82int sysctl_tcp_fack __read_mostly = 1; 83int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH; 84EXPORT_SYMBOL(sysctl_tcp_reordering); 85int sysctl_tcp_dsack __read_mostly = 1; --- 577 unchanged lines hidden (view full) --- 663 * routine either comes from timestamps, or from segments that were 664 * known _not_ to have been retransmitted [see Karn/Partridge 665 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 666 * piece by Van Jacobson. 667 * NOTE: the next three routines used to be one big routine. 668 * To save cycles in the RFC 1323 implementation it was better to break 669 * it up into three procedures. -- erics 670 */ |
669static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) | 671static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) |
670{ 671 struct tcp_sock *tp = tcp_sk(sk); | 672{ 673 struct tcp_sock *tp = tcp_sk(sk); |
672 long m = mrtt; /* RTT */ 673 u32 srtt = tp->srtt; | 674 long m = mrtt_us; /* RTT */ 675 u32 srtt = tp->srtt_us; |
674 675 /* The following amusing code comes from Jacobson's 676 * article in SIGCOMM '88. Note that rtt and mdev 677 * are scaled versions of rtt and mean deviation. 678 * This is designed to be as fast as possible 679 * m stands for "measurement". 680 * 681 * On a 1990 paper the rto value is changed to: --- 6 unchanged lines hidden (view full) --- 688 * does not matter how to _calculate_ it. Seems, it was trap 689 * that VJ failed to avoid. 8) 690 */ 691 if (srtt != 0) { 692 m -= (srtt >> 3); /* m is now error in rtt est */ 693 srtt += m; /* rtt = 7/8 rtt + 1/8 new */ 694 if (m < 0) { 695 m = -m; /* m is now abs(error) */ | 676 677 /* The following amusing code comes from Jacobson's 678 * article in SIGCOMM '88. Note that rtt and mdev 679 * are scaled versions of rtt and mean deviation. 680 * This is designed to be as fast as possible 681 * m stands for "measurement". 682 * 683 * On a 1990 paper the rto value is changed to: --- 6 unchanged lines hidden (view full) --- 690 * does not matter how to _calculate_ it. Seems, it was trap 691 * that VJ failed to avoid. 8) 692 */ 693 if (srtt != 0) { 694 m -= (srtt >> 3); /* m is now error in rtt est */ 695 srtt += m; /* rtt = 7/8 rtt + 1/8 new */ 696 if (m < 0) { 697 m = -m; /* m is now abs(error) */ |
696 m -= (tp->mdev >> 2); /* similar update on mdev */ | 698 m -= (tp->mdev_us >> 2); /* similar update on mdev */ |
697 /* This is similar to one of Eifel findings. 698 * Eifel blocks mdev updates when rtt decreases. 699 * This solution is a bit different: we use finer gain 700 * for mdev in this case (alpha*beta). 701 * Like Eifel it also prevents growth of rto, 702 * but also it limits too fast rto decreases, 703 * happening in pure Eifel. 704 */ 705 if (m > 0) 706 m >>= 3; 707 } else { | 699 /* This is similar to one of Eifel findings. 700 * Eifel blocks mdev updates when rtt decreases. 701 * This solution is a bit different: we use finer gain 702 * for mdev in this case (alpha*beta). 703 * Like Eifel it also prevents growth of rto, 704 * but also it limits too fast rto decreases, 705 * happening in pure Eifel. 706 */ 707 if (m > 0) 708 m >>= 3; 709 } else { |
708 m -= (tp->mdev >> 2); /* similar update on mdev */ | 710 m -= (tp->mdev_us >> 2); /* similar update on mdev */ |
709 } | 711 } |
710 tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ 711 if (tp->mdev > tp->mdev_max) { 712 tp->mdev_max = tp->mdev; 713 if (tp->mdev_max > tp->rttvar) 714 tp->rttvar = tp->mdev_max; | 712 tp->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ 713 if (tp->mdev_us > tp->mdev_max_us) { 714 tp->mdev_max_us = tp->mdev_us; 715 if (tp->mdev_max_us > tp->rttvar_us) 716 tp->rttvar_us = tp->mdev_max_us; |
715 } 716 if (after(tp->snd_una, tp->rtt_seq)) { | 717 } 718 if (after(tp->snd_una, tp->rtt_seq)) { |
717 if (tp->mdev_max < tp->rttvar) 718 tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2; | 719 if (tp->mdev_max_us < tp->rttvar_us) 720 tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2; |
719 tp->rtt_seq = tp->snd_nxt; | 721 tp->rtt_seq = tp->snd_nxt; |
720 tp->mdev_max = tcp_rto_min(sk); | 722 tp->mdev_max_us = tcp_rto_min_us(sk); |
721 } 722 } else { 723 /* no previous measure. */ 724 srtt = m << 3; /* take the measured time to be rtt */ | 723 } 724 } else { 725 /* no previous measure. */ 726 srtt = m << 3; /* take the measured time to be rtt */ |
725 tp->mdev = m << 1; /* make sure rto = 3*rtt */ 726 tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); | 727 tp->mdev_us = m << 1; /* make sure rto = 3*rtt */ 728 tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk)); 729 tp->mdev_max_us = tp->rttvar_us; |
727 tp->rtt_seq = tp->snd_nxt; 728 } | 730 tp->rtt_seq = tp->snd_nxt; 731 } |
729 tp->srtt = max(1U, srtt); | 732 tp->srtt_us = max(1U, srtt); |
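A note for readers following the srtt -> srtt_us conversion above: the estimator keeps the smoothed RTT scaled by 8 and the mean deviation scaled by 4 (so the RTO works out to (srtt_us >> 3) + rttvar_us, roughly srtt + 4*mdev), and after this patch both live in microseconds. Below is a minimal user-space sketch of the same update rules; it mirrors the arithmetic of tcp_rtt_estimator() but deliberately leaves out the rto_min clamp and the mdev_max/rttvar windowed-maximum tracking, and the plain long fields are a simplification of the kernel's u32 state.

#include <stdio.h>

struct rtt_est {
	long srtt_us;	/* smoothed RTT, scaled by 8, in usec */
	long mdev_us;	/* mean deviation, scaled by 4, in usec */
};

/* Feed one RTT measurement m (usec) into a Jacobson/Karels estimator:
 * srtt = 7/8 srtt + 1/8 m, mdev = 3/4 mdev + 1/4 |error|.
 */
static void rtt_sample(struct rtt_est *e, long m)
{
	long srtt = e->srtt_us;

	if (srtt != 0) {
		m -= (srtt >> 3);		/* error against current estimate */
		srtt += m;			/* srtt = 7/8 srtt + 1/8 new */
		if (m < 0) {
			m = -m;
			m -= (e->mdev_us >> 2);
			if (m > 0)		/* damp mdev when samples shrink */
				m >>= 3;
		} else {
			m -= (e->mdev_us >> 2);
		}
		e->mdev_us += m;		/* mdev = 3/4 mdev + 1/4 new */
	} else {
		srtt = m << 3;			/* first sample */
		e->mdev_us = m << 1;
	}
	e->srtt_us = srtt > 0 ? srtt : 1;
}

int main(void)
{
	struct rtt_est e = { 0, 0 };
	long samples[4] = { 40000, 42000, 38000, 80000 };	/* usec */

	for (int i = 0; i < 4; i++) {
		rtt_sample(&e, samples[i]);
		printf("srtt=%ld us  rto~=%ld us\n",
		       e.srtt_us >> 3, (e.srtt_us >> 3) + e.mdev_us);
	}
	return 0;
}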
730} 731 732/* Set the sk_pacing_rate to allow proper sizing of TSO packets. 733 * Note: TCP stack does not yet implement pacing. 734 * FQ packet scheduler can be used to implement cheap but effective 735 * TCP pacing, to smooth the burst on large writes when packets 736 * in flight is significantly lower than cwnd (or rwin) 737 */ 738static void tcp_update_pacing_rate(struct sock *sk) 739{ 740 const struct tcp_sock *tp = tcp_sk(sk); 741 u64 rate; 742 743 /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ | 733} 734 735/* Set the sk_pacing_rate to allow proper sizing of TSO packets. 736 * Note: TCP stack does not yet implement pacing. 737 * FQ packet scheduler can be used to implement cheap but effective 738 * TCP pacing, to smooth the burst on large writes when packets 739 * in flight is significantly lower than cwnd (or rwin) 740 */ 741static void tcp_update_pacing_rate(struct sock *sk) 742{ 743 const struct tcp_sock *tp = tcp_sk(sk); 744 u64 rate; 745 746 /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ |
744 rate = (u64)tp->mss_cache * 2 * (HZ << 3); | 747 rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3); |
745 746 rate *= max(tp->snd_cwnd, tp->packets_out); 747 | 748 749 rate *= max(tp->snd_cwnd, tp->packets_out); 750 |
748 /* Correction for small srtt and scheduling constraints. 749 * For small rtt, consider noise is too high, and use 750 * the minimal value (srtt = 1 -> 125 us for HZ=1000) 751 * 752 * We probably need usec resolution in the future. 753 * Note: This also takes care of possible srtt=0 case, 754 * when tcp_rtt_estimator() was not yet called. 755 */ 756 if (tp->srtt > 8 + 2) 757 do_div(rate, tp->srtt); | 751 if (likely(tp->srtt_us)) 752 do_div(rate, tp->srtt_us); |
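Sanity check on the new microsecond formula above: srtt_us holds eight times the smoothed RTT in usec, and the (USEC_PER_SEC << 3) factor cancels exactly that scaling, so the result is 2 * mss * cwnd / srtt in bytes per second. A stand-alone calculation with illustrative numbers (1448-byte MSS, cwnd of 10 segments, 100 ms smoothed RTT; these figures are examples of mine, not anything taken from the patch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t mss = 1448, cwnd = 10;
	uint64_t srtt_us = 8 * 100000;			/* field stores 8 * srtt(us) */
	uint64_t rate = mss * 2 * (1000000ULL << 3);	/* 2 * mss * (USEC_PER_SEC << 3) */

	rate *= cwnd;
	rate /= srtt_us;				/* the kernel uses do_div() here */
	printf("sk_pacing_rate = %llu bytes/sec (about %.1f Mbit/s)\n",
	       (unsigned long long)rate, rate * 8 / 1e6);
	return 0;
}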
758 759 /* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate 760 * without any lock. We want to make sure compiler wont store 761 * intermediate values in this location. 762 */ 763 ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate, 764 sk->sk_max_pacing_rate); 765} --- 341 unchanged lines hidden (view full) --- 1107 dup_sack = true; 1108 tcp_dsack_seen(tp); 1109 NET_INC_STATS_BH(sock_net(sk), 1110 LINUX_MIB_TCPDSACKOFORECV); 1111 } 1112 } 1113 1114 /* D-SACK for already forgotten data... Do dumb counting. */ | 753 754 /* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate 755 * without any lock. We want to make sure compiler wont store 756 * intermediate values in this location. 757 */ 758 ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate, 759 sk->sk_max_pacing_rate); 760} --- 341 unchanged lines hidden (view full) --- 1102 dup_sack = true; 1103 tcp_dsack_seen(tp); 1104 NET_INC_STATS_BH(sock_net(sk), 1105 LINUX_MIB_TCPDSACKOFORECV); 1106 } 1107 } 1108 1109 /* D-SACK for already forgotten data... Do dumb counting. */ |
1115 if (dup_sack && tp->undo_marker && tp->undo_retrans && | 1110 if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 && |
1116 !after(end_seq_0, prior_snd_una) && 1117 after(end_seq_0, tp->undo_marker)) 1118 tp->undo_retrans--; 1119 1120 return dup_sack; 1121} 1122 1123struct tcp_sacktag_state { | 1111 !after(end_seq_0, prior_snd_una) && 1112 after(end_seq_0, tp->undo_marker)) 1113 tp->undo_retrans--; 1114 1115 return dup_sack; 1116} 1117 1118struct tcp_sacktag_state { |
1124 int reord; 1125 int fack_count; 1126 int flag; 1127 s32 rtt; /* RTT measured by SACKing never-retransmitted data */ | 1119 int reord; 1120 int fack_count; 1121 long rtt_us; /* RTT measured by SACKing never-retransmitted data */ 1122 int flag; |
1128}; 1129 1130/* Check if skb is fully within the SACK block. In presence of GSO skbs, 1131 * the incoming SACK may not exactly match but we can find smaller MSS 1132 * aligned portion of it that matches. Therefore we might need to fragment 1133 * which may fail and creates some hassle (caller must handle error case 1134 * returns). 1135 * --- 27 unchanged lines hidden (view full) --- 1163 1164 /* Round if necessary so that SACKs cover only full MSSes 1165 * and/or the remaining small portion (if present) 1166 */ 1167 if (pkt_len > mss) { 1168 unsigned int new_len = (pkt_len / mss) * mss; 1169 if (!in_sack && new_len < pkt_len) { 1170 new_len += mss; | 1123}; 1124 1125/* Check if skb is fully within the SACK block. In presence of GSO skbs, 1126 * the incoming SACK may not exactly match but we can find smaller MSS 1127 * aligned portion of it that matches. Therefore we might need to fragment 1128 * which may fail and creates some hassle (caller must handle error case 1129 * returns). 1130 * --- 27 unchanged lines hidden (view full) --- 1158 1159 /* Round if necessary so that SACKs cover only full MSSes 1160 * and/or the remaining small portion (if present) 1161 */ 1162 if (pkt_len > mss) { 1163 unsigned int new_len = (pkt_len / mss) * mss; 1164 if (!in_sack && new_len < pkt_len) { 1165 new_len += mss; |
1171 if (new_len > skb->len) | 1166 if (new_len >= skb->len) |
1172 return 0; 1173 } 1174 pkt_len = new_len; 1175 } | 1167 return 0; 1168 } 1169 pkt_len = new_len; 1170 } |
1176 err = tcp_fragment(sk, skb, pkt_len, mss); | 1171 err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC); |
1177 if (err < 0) 1178 return err; 1179 } 1180 1181 return in_sack; 1182} 1183 1184/* Mark the given newly-SACKed range as such, adjusting counters and hints. */ 1185static u8 tcp_sacktag_one(struct sock *sk, 1186 struct tcp_sacktag_state *state, u8 sacked, 1187 u32 start_seq, u32 end_seq, | 1172 if (err < 0) 1173 return err; 1174 } 1175 1176 return in_sack; 1177} 1178 1179/* Mark the given newly-SACKed range as such, adjusting counters and hints. */ 1180static u8 tcp_sacktag_one(struct sock *sk, 1181 struct tcp_sacktag_state *state, u8 sacked, 1182 u32 start_seq, u32 end_seq, |
1188 int dup_sack, int pcount, u32 xmit_time) | 1183 int dup_sack, int pcount, 1184 const struct skb_mstamp *xmit_time) |
1189{ 1190 struct tcp_sock *tp = tcp_sk(sk); 1191 int fack_count = state->fack_count; 1192 1193 /* Account D-SACK for retransmitted packet. */ 1194 if (dup_sack && (sacked & TCPCB_RETRANS)) { | 1185{ 1186 struct tcp_sock *tp = tcp_sk(sk); 1187 int fack_count = state->fack_count; 1188 1189 /* Account D-SACK for retransmitted packet. */ 1190 if (dup_sack && (sacked & TCPCB_RETRANS)) { |
1195 if (tp->undo_marker && tp->undo_retrans && | 1191 if (tp->undo_marker && tp->undo_retrans > 0 && |
1196 after(end_seq, tp->undo_marker)) 1197 tp->undo_retrans--; 1198 if (sacked & TCPCB_SACKED_ACKED) 1199 state->reord = min(fack_count, state->reord); 1200 } 1201 1202 /* Nothing to do; acked frame is about to be dropped (was ACKed). */ 1203 if (!after(end_seq, tp->snd_una)) --- 17 unchanged lines hidden (view full) --- 1221 */ 1222 if (before(start_seq, 1223 tcp_highest_sack_seq(tp))) 1224 state->reord = min(fack_count, 1225 state->reord); 1226 if (!after(end_seq, tp->high_seq)) 1227 state->flag |= FLAG_ORIG_SACK_ACKED; 1228 /* Pick the earliest sequence sacked for RTT */ | 1192 after(end_seq, tp->undo_marker)) 1193 tp->undo_retrans--; 1194 if (sacked & TCPCB_SACKED_ACKED) 1195 state->reord = min(fack_count, state->reord); 1196 } 1197 1198 /* Nothing to do; acked frame is about to be dropped (was ACKed). */ 1199 if (!after(end_seq, tp->snd_una)) --- 17 unchanged lines hidden (view full) --- 1217 */ 1218 if (before(start_seq, 1219 tcp_highest_sack_seq(tp))) 1220 state->reord = min(fack_count, 1221 state->reord); 1222 if (!after(end_seq, tp->high_seq)) 1223 state->flag |= FLAG_ORIG_SACK_ACKED; 1224 /* Pick the earliest sequence sacked for RTT */ |
1229 if (state->rtt < 0) 1230 state->rtt = tcp_time_stamp - xmit_time; | 1225 if (state->rtt_us < 0) { 1226 struct skb_mstamp now; 1227 1228 skb_mstamp_get(&now); 1229 state->rtt_us = skb_mstamp_us_delta(&now, 1230 xmit_time); 1231 } |
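skb_mstamp_get() and skb_mstamp_us_delta() are the microsecond-resolution timestamp helpers this series builds on, so the SACK RTT above is simply "now minus the skb's transmit stamp" in usec. A rough user-space analogue of that pattern is sketched below; the clock choice and the absence of wrap-around handling are simplifications of mine, not a description of the kernel's struct skb_mstamp internals.

#include <stdint.h>
#include <time.h>

struct mstamp {
	uint64_t us;	/* microseconds on a monotonic clock */
};

static void mstamp_get(struct mstamp *m)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	m->us = (uint64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
}

/* Elapsed microseconds from t0 to a later stamp t1. */
static uint64_t mstamp_us_delta(const struct mstamp *t1, const struct mstamp *t0)
{
	return t1->us - t0->us;
}

int main(void)
{
	struct mstamp t0, t1;

	mstamp_get(&t0);
	mstamp_get(&t1);
	return (int)mstamp_us_delta(&t1, &t0);
}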
1231 } 1232 1233 if (sacked & TCPCB_LOST) { 1234 sacked &= ~TCPCB_LOST; 1235 tp->lost_out -= pcount; 1236 } 1237 } 1238 --- 42 unchanged lines hidden (view full) --- 1281 /* Adjust counters and hints for the newly sacked sequence 1282 * range but discard the return value since prev is already 1283 * marked. We must tag the range first because the seq 1284 * advancement below implicitly advances 1285 * tcp_highest_sack_seq() when skb is highest_sack. 1286 */ 1287 tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, 1288 start_seq, end_seq, dup_sack, pcount, | 1232 } 1233 1234 if (sacked & TCPCB_LOST) { 1235 sacked &= ~TCPCB_LOST; 1236 tp->lost_out -= pcount; 1237 } 1238 } 1239 --- 42 unchanged lines hidden (view full) --- 1282 /* Adjust counters and hints for the newly sacked sequence 1283 * range but discard the return value since prev is already 1284 * marked. We must tag the range first because the seq 1285 * advancement below implicitly advances 1286 * tcp_highest_sack_seq() when skb is highest_sack. 1287 */ 1288 tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, 1289 start_seq, end_seq, dup_sack, pcount, |
1289 TCP_SKB_CB(skb)->when); | 1290 &skb->skb_mstamp); |
1290 1291 if (skb == tp->lost_skb_hint) 1292 tp->lost_cnt_hint += pcount; 1293 1294 TCP_SKB_CB(prev)->end_seq += shifted; 1295 TCP_SKB_CB(skb)->seq += shifted; 1296 | 1291 1292 if (skb == tp->lost_skb_hint) 1293 tp->lost_cnt_hint += pcount; 1294 1295 TCP_SKB_CB(prev)->end_seq += shifted; 1296 TCP_SKB_CB(skb)->seq += shifted; 1297 |
1297 skb_shinfo(prev)->gso_segs += pcount; 1298 BUG_ON(skb_shinfo(skb)->gso_segs < pcount); 1299 skb_shinfo(skb)->gso_segs -= pcount; | 1298 tcp_skb_pcount_add(prev, pcount); 1299 BUG_ON(tcp_skb_pcount(skb) < pcount); 1300 tcp_skb_pcount_add(skb, -pcount); |
1300 1301 /* When we're adding to gso_segs == 1, gso_size will be zero, 1302 * in theory this shouldn't be necessary but as long as DSACK 1303 * code can come after this skb later on it's better to keep 1304 * setting gso_size to something. 1305 */ 1306 if (!skb_shinfo(prev)->gso_size) { 1307 skb_shinfo(prev)->gso_size = mss; 1308 skb_shinfo(prev)->gso_type = sk->sk_gso_type; 1309 } 1310 1311 /* CHECKME: To clear or not to clear? Mimics normal skb currently */ | 1301 1302 /* When we're adding to gso_segs == 1, gso_size will be zero, 1303 * in theory this shouldn't be necessary but as long as DSACK 1304 * code can come after this skb later on it's better to keep 1305 * setting gso_size to something. 1306 */ 1307 if (!skb_shinfo(prev)->gso_size) { 1308 skb_shinfo(prev)->gso_size = mss; 1309 skb_shinfo(prev)->gso_type = sk->sk_gso_type; 1310 } 1311 1312 /* CHECKME: To clear or not to clear? Mimics normal skb currently */ |
1312 if (skb_shinfo(skb)->gso_segs <= 1) { | 1313 if (tcp_skb_pcount(skb) <= 1) { |
1313 skb_shinfo(skb)->gso_size = 0; 1314 skb_shinfo(skb)->gso_type = 0; 1315 } 1316 1317 /* Difference in this won't matter, both ACKed by the same cumul. ACK */ 1318 TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); 1319 1320 if (skb->len > 0) { --- 238 unchanged lines hidden (view full) --- 1559 TCP_SKB_CB(skb)->sacked = 1560 tcp_sacktag_one(sk, 1561 state, 1562 TCP_SKB_CB(skb)->sacked, 1563 TCP_SKB_CB(skb)->seq, 1564 TCP_SKB_CB(skb)->end_seq, 1565 dup_sack, 1566 tcp_skb_pcount(skb), | 1314 skb_shinfo(skb)->gso_size = 0; 1315 skb_shinfo(skb)->gso_type = 0; 1316 } 1317 1318 /* Difference in this won't matter, both ACKed by the same cumul. ACK */ 1319 TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); 1320 1321 if (skb->len > 0) { --- 238 unchanged lines hidden (view full) --- 1560 TCP_SKB_CB(skb)->sacked = 1561 tcp_sacktag_one(sk, 1562 state, 1563 TCP_SKB_CB(skb)->sacked, 1564 TCP_SKB_CB(skb)->seq, 1565 TCP_SKB_CB(skb)->end_seq, 1566 dup_sack, 1567 tcp_skb_pcount(skb), |
1567 TCP_SKB_CB(skb)->when); | 1568 &skb->skb_mstamp); |
1568 1569 if (!before(TCP_SKB_CB(skb)->seq, 1570 tcp_highest_sack_seq(tp))) 1571 tcp_advance_highest_sack(sk, skb); 1572 } 1573 1574 state->fack_count += tcp_skb_pcount(skb); 1575 } --- 40 unchanged lines hidden (view full) --- 1616 1617static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache) 1618{ 1619 return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); 1620} 1621 1622static int 1623tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, | 1569 1570 if (!before(TCP_SKB_CB(skb)->seq, 1571 tcp_highest_sack_seq(tp))) 1572 tcp_advance_highest_sack(sk, skb); 1573 } 1574 1575 state->fack_count += tcp_skb_pcount(skb); 1576 } --- 40 unchanged lines hidden (view full) --- 1617 1618static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache) 1619{ 1620 return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache); 1621} 1622 1623static int 1624tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, |
1624 u32 prior_snd_una, s32 *sack_rtt) | 1625 u32 prior_snd_una, long *sack_rtt_us) |
1625{ 1626 struct tcp_sock *tp = tcp_sk(sk); 1627 const unsigned char *ptr = (skb_transport_header(ack_skb) + 1628 TCP_SKB_CB(ack_skb)->sacked); 1629 struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); 1630 struct tcp_sack_block sp[TCP_NUM_SACKS]; 1631 struct tcp_sack_block *cache; 1632 struct tcp_sacktag_state state; 1633 struct sk_buff *skb; 1634 int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3); 1635 int used_sacks; 1636 bool found_dup_sack = false; 1637 int i, j; 1638 int first_sack_index; 1639 1640 state.flag = 0; 1641 state.reord = tp->packets_out; | 1626{ 1627 struct tcp_sock *tp = tcp_sk(sk); 1628 const unsigned char *ptr = (skb_transport_header(ack_skb) + 1629 TCP_SKB_CB(ack_skb)->sacked); 1630 struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2); 1631 struct tcp_sack_block sp[TCP_NUM_SACKS]; 1632 struct tcp_sack_block *cache; 1633 struct tcp_sacktag_state state; 1634 struct sk_buff *skb; 1635 int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3); 1636 int used_sacks; 1637 bool found_dup_sack = false; 1638 int i, j; 1639 int first_sack_index; 1640 1641 state.flag = 0; 1642 state.reord = tp->packets_out; |
1642 state.rtt = -1; | 1643 state.rtt_us = -1L; |
1643 1644 if (!tp->sacked_out) { 1645 if (WARN_ON(tp->fackets_out)) 1646 tp->fackets_out = 0; 1647 tcp_highest_sack_reset(sk); 1648 } 1649 1650 found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, --- 167 unchanged lines hidden (view full) --- 1818out: 1819 1820#if FASTRETRANS_DEBUG > 0 1821 WARN_ON((int)tp->sacked_out < 0); 1822 WARN_ON((int)tp->lost_out < 0); 1823 WARN_ON((int)tp->retrans_out < 0); 1824 WARN_ON((int)tcp_packets_in_flight(tp) < 0); 1825#endif | 1644 1645 if (!tp->sacked_out) { 1646 if (WARN_ON(tp->fackets_out)) 1647 tp->fackets_out = 0; 1648 tcp_highest_sack_reset(sk); 1649 } 1650 1651 found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, --- 167 unchanged lines hidden (view full) --- 1819out: 1820 1821#if FASTRETRANS_DEBUG > 0 1822 WARN_ON((int)tp->sacked_out < 0); 1823 WARN_ON((int)tp->lost_out < 0); 1824 WARN_ON((int)tp->retrans_out < 0); 1825 WARN_ON((int)tcp_packets_in_flight(tp) < 0); 1826#endif |
1826 *sack_rtt = state.rtt; | 1827 *sack_rtt_us = state.rtt_us; |
1827 return state.flag; 1828} 1829 1830/* Limits sacked_out so that sum with lost_out isn't ever larger than 1831 * packets_out. Returns false if sacked_out adjustement wasn't necessary. 1832 */ 1833static bool tcp_limit_reno_sacked(struct tcp_sock *tp) 1834{ --- 47 unchanged lines hidden (view full) --- 1882 tcp_verify_left_out(tp); 1883} 1884 1885static inline void tcp_reset_reno_sack(struct tcp_sock *tp) 1886{ 1887 tp->sacked_out = 0; 1888} 1889 | 1828 return state.flag; 1829} 1830 1831/* Limits sacked_out so that sum with lost_out isn't ever larger than 1832 * packets_out. Returns false if sacked_out adjustement wasn't necessary. 1833 */ 1834static bool tcp_limit_reno_sacked(struct tcp_sock *tp) 1835{ --- 47 unchanged lines hidden (view full) --- 1883 tcp_verify_left_out(tp); 1884} 1885 1886static inline void tcp_reset_reno_sack(struct tcp_sock *tp) 1887{ 1888 tp->sacked_out = 0; 1889} 1890 |
1890static void tcp_clear_retrans_partial(struct tcp_sock *tp) | 1891void tcp_clear_retrans(struct tcp_sock *tp) |
1891{ 1892 tp->retrans_out = 0; 1893 tp->lost_out = 0; | 1892{ 1893 tp->retrans_out = 0; 1894 tp->lost_out = 0; |
1894 | |
1895 tp->undo_marker = 0; | 1895 tp->undo_marker = 0; |
1896 tp->undo_retrans = 0; | 1896 tp->undo_retrans = -1; 1897 tp->fackets_out = 0; 1898 tp->sacked_out = 0; |
1897} 1898 | 1899} 1900 |
1899void tcp_clear_retrans(struct tcp_sock *tp) | 1901static inline void tcp_init_undo(struct tcp_sock *tp) |
1900{ | 1902{ |
1901 tcp_clear_retrans_partial(tp); 1902 1903 tp->fackets_out = 0; 1904 tp->sacked_out = 0; | 1903 tp->undo_marker = tp->snd_una; 1904 /* Retransmission still in flight may cause DSACKs later. */ 1905 tp->undo_retrans = tp->retrans_out ? : -1; |
1905} 1906 | 1906} 1907 |
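The new tcp_init_undo() seeds undo_retrans with -1 when nothing is currently retransmitted, which is why the D-SACK accounting earlier in this diff now tests "undo_retrans > 0" rather than plain truthiness. A compact user-space sketch of that counter's life cycle, with names mirrored from the code above (the comments describe the sketch, not every detail of the kernel's undo machinery):

/* undo_retrans:
 *  -1  undo bookkeeping armed while no retransmission was outstanding;
 *      stray DSACKs must not be able to walk it through zero.
 *   N  N retransmitted segments whose DSACKs are still expected.
 */
static long undo_retrans;

static void init_undo(long retrans_out)
{
	undo_retrans = retrans_out ? retrans_out : -1;
}

static void dsack_for_retransmitted_data(void)
{
	if (undo_retrans > 0)	/* the "> 0" guard added in this diff */
		undo_retrans--;
}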
1907/* Enter Loss state. If "how" is not zero, forget all SACK information | 1908/* Enter Loss state. If we detect SACK reneging, forget all SACK information |
1908 * and reset tags completely, otherwise preserve SACKs. If receiver 1909 * dropped its ofo queue, we will know this due to reneging detection. 1910 */ | 1909 * and reset tags completely, otherwise preserve SACKs. If receiver 1910 * dropped its ofo queue, we will know this due to reneging detection. 1911 */ |
1911void tcp_enter_loss(struct sock *sk, int how) | 1912void tcp_enter_loss(struct sock *sk) |
1912{ 1913 const struct inet_connection_sock *icsk = inet_csk(sk); 1914 struct tcp_sock *tp = tcp_sk(sk); 1915 struct sk_buff *skb; 1916 bool new_recovery = false; | 1913{ 1914 const struct inet_connection_sock *icsk = inet_csk(sk); 1915 struct tcp_sock *tp = tcp_sk(sk); 1916 struct sk_buff *skb; 1917 bool new_recovery = false; |
1918 bool is_reneg; /* is receiver reneging on SACKs? */ |
|
1917 1918 /* Reduce ssthresh if it has not yet been made inside this window. */ 1919 if (icsk->icsk_ca_state <= TCP_CA_Disorder || 1920 !after(tp->high_seq, tp->snd_una) || 1921 (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { 1922 new_recovery = true; 1923 tp->prior_ssthresh = tcp_current_ssthresh(sk); 1924 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); 1925 tcp_ca_event(sk, CA_EVENT_LOSS); | 1919 1920 /* Reduce ssthresh if it has not yet been made inside this window. */ 1921 if (icsk->icsk_ca_state <= TCP_CA_Disorder || 1922 !after(tp->high_seq, tp->snd_una) || 1923 (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { 1924 new_recovery = true; 1925 tp->prior_ssthresh = tcp_current_ssthresh(sk); 1926 tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); 1927 tcp_ca_event(sk, CA_EVENT_LOSS); |
1928 tcp_init_undo(tp); |
|
1926 } 1927 tp->snd_cwnd = 1; 1928 tp->snd_cwnd_cnt = 0; 1929 tp->snd_cwnd_stamp = tcp_time_stamp; 1930 | 1929 } 1930 tp->snd_cwnd = 1; 1931 tp->snd_cwnd_cnt = 0; 1932 tp->snd_cwnd_stamp = tcp_time_stamp; 1933 |
1931 tcp_clear_retrans_partial(tp); | 1934 tp->retrans_out = 0; 1935 tp->lost_out = 0; |
1932 1933 if (tcp_is_reno(tp)) 1934 tcp_reset_reno_sack(tp); 1935 | 1936 1937 if (tcp_is_reno(tp)) 1938 tcp_reset_reno_sack(tp); 1939 |
1936 tp->undo_marker = tp->snd_una; 1937 if (how) { | 1940 skb = tcp_write_queue_head(sk); 1941 is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED); 1942 if (is_reneg) { 1943 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); |
1938 tp->sacked_out = 0; 1939 tp->fackets_out = 0; 1940 } 1941 tcp_clear_all_retrans_hints(tp); 1942 1943 tcp_for_write_queue(skb, sk) { 1944 if (skb == tcp_send_head(sk)) 1945 break; 1946 | 1944 tp->sacked_out = 0; 1945 tp->fackets_out = 0; 1946 } 1947 tcp_clear_all_retrans_hints(tp); 1948 1949 tcp_for_write_queue(skb, sk) { 1950 if (skb == tcp_send_head(sk)) 1951 break; 1952 |
1947 if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) 1948 tp->undo_marker = 0; 1949 | |
1950 TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; | 1953 TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; |
1951 if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) { | 1954 if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) { |
1952 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; 1953 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; 1954 tp->lost_out += tcp_skb_pcount(skb); 1955 tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; 1956 } 1957 } 1958 tcp_verify_left_out(tp); 1959 --- 16 unchanged lines hidden (view full) --- 1976 (new_recovery || icsk->icsk_retransmits) && 1977 !inet_csk(sk)->icsk_mtup.probe_size; 1978} 1979 1980/* If ACK arrived pointing to a remembered SACK, it means that our 1981 * remembered SACKs do not reflect real state of receiver i.e. 1982 * receiver _host_ is heavily congested (or buggy). 1983 * | 1955 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; 1956 TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; 1957 tp->lost_out += tcp_skb_pcount(skb); 1958 tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; 1959 } 1960 } 1961 tcp_verify_left_out(tp); 1962 --- 16 unchanged lines hidden (view full) --- 1979 (new_recovery || icsk->icsk_retransmits) && 1980 !inet_csk(sk)->icsk_mtup.probe_size; 1981} 1982 1983/* If ACK arrived pointing to a remembered SACK, it means that our 1984 * remembered SACKs do not reflect real state of receiver i.e. 1985 * receiver _host_ is heavily congested (or buggy). 1986 * |
1984 * Do processing similar to RTO timeout. | 1987 * To avoid big spurious retransmission bursts due to transient SACK 1988 * scoreboard oddities that look like reneging, we give the receiver a 1989 * little time (max(RTT/2, 10ms)) to send us some more ACKs that will 1990 * restore sanity to the SACK scoreboard. If the apparent reneging 1991 * persists until this RTO then we'll clear the SACK scoreboard. |
1985 */ 1986static bool tcp_check_sack_reneging(struct sock *sk, int flag) 1987{ 1988 if (flag & FLAG_SACK_RENEGING) { | 1992 */ 1993static bool tcp_check_sack_reneging(struct sock *sk, int flag) 1994{ 1995 if (flag & FLAG_SACK_RENEGING) { |
1989 struct inet_connection_sock *icsk = inet_csk(sk); 1990 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); | 1996 struct tcp_sock *tp = tcp_sk(sk); 1997 unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4), 1998 msecs_to_jiffies(10)); |
1991 | 1999 |
1992 tcp_enter_loss(sk, 1); 1993 icsk->icsk_retransmits++; 1994 tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); | |
1995 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, | 2000 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, |
1996 icsk->icsk_rto, TCP_RTO_MAX); | 2001 delay, TCP_RTO_MAX); |
1997 return true; 1998 } 1999 return false; 2000} 2001 2002static inline int tcp_fackets_out(const struct tcp_sock *tp) 2003{ 2004 return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out; --- 24 unchanged lines hidden (view full) --- 2029 struct tcp_sock *tp = tcp_sk(sk); 2030 unsigned long delay; 2031 2032 /* Delay early retransmit and entering fast recovery for 2033 * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples 2034 * available, or RTO is scheduled to fire first. 2035 */ 2036 if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 || | 2002 return true; 2003 } 2004 return false; 2005} 2006 2007static inline int tcp_fackets_out(const struct tcp_sock *tp) 2008{ 2009 return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out; --- 24 unchanged lines hidden (view full) --- 2034 struct tcp_sock *tp = tcp_sk(sk); 2035 unsigned long delay; 2036 2037 /* Delay early retransmit and entering fast recovery for 2038 * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples 2039 * available, or RTO is scheduled to fire first. 2040 */ 2041 if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 || |
2037 (flag & FLAG_ECE) || !tp->srtt) | 2042 (flag & FLAG_ECE) || !tp->srtt_us) |
2038 return false; 2039 | 2043 return false; 2044 |
2040 delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2)); | 2045 delay = max(usecs_to_jiffies(tp->srtt_us >> 5), 2046 msecs_to_jiffies(2)); 2047 |
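Since srtt_us stores eight times the smoothed RTT in microseconds, the shift counts in the two new delays read naturally: srtt_us >> 4 is RTT/2 (the SACK-reneging grace period above) and srtt_us >> 5 is RTT/4 (the early-retransmit delay here). A quick stand-alone check with an assumed 40 ms smoothed RTT:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t srtt_us = 8 * 40000;	/* field value for a 40 ms smoothed RTT */

	printf("reneging delay candidate : %u us (RTT/2, floor 10 ms)\n", srtt_us >> 4);
	printf("early-retransmit delay   : %u us (RTT/4, floor 2 ms)\n", srtt_us >> 5);
	return 0;
}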
2041 if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) 2042 return false; 2043 2044 inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay, 2045 TCP_RTO_MAX); 2046 return true; 2047} 2048 --- 185 unchanged lines hidden (view full) --- 2234 2235 if (cnt > packets) { 2236 if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) || 2237 (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || 2238 (oldcnt >= packets)) 2239 break; 2240 2241 mss = skb_shinfo(skb)->gso_size; | 2048 if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) 2049 return false; 2050 2051 inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay, 2052 TCP_RTO_MAX); 2053 return true; 2054} 2055 --- 185 unchanged lines hidden (view full) --- 2241 2242 if (cnt > packets) { 2243 if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) || 2244 (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || 2245 (oldcnt >= packets)) 2246 break; 2247 2248 mss = skb_shinfo(skb)->gso_size; |
2242 err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss); | 2249 err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, 2250 mss, GFP_ATOMIC); |
2243 if (err < 0) 2244 break; 2245 cnt = packets; 2246 } 2247 2248 tcp_skb_mark_lost(tp, skb); 2249 2250 if (mark_head) --- 216 unchanged lines hidden (view full) --- 2467 * It computes the number of packets to send (sndcnt) based on packets newly 2468 * delivered: 2469 * 1) If the packets in flight is larger than ssthresh, PRR spreads the 2470 * cwnd reductions across a full RTT. 2471 * 2) If packets in flight is lower than ssthresh (such as due to excess 2472 * losses and/or application stalls), do not perform any further cwnd 2473 * reductions, but instead slow start up to ssthresh. 2474 */ | 2251 if (err < 0) 2252 break; 2253 cnt = packets; 2254 } 2255 2256 tcp_skb_mark_lost(tp, skb); 2257 2258 if (mark_head) --- 216 unchanged lines hidden (view full) --- 2475 * It computes the number of packets to send (sndcnt) based on packets newly 2476 * delivered: 2477 * 1) If the packets in flight is larger than ssthresh, PRR spreads the 2478 * cwnd reductions across a full RTT. 2479 * 2) If packets in flight is lower than ssthresh (such as due to excess 2480 * losses and/or application stalls), do not perform any further cwnd 2481 * reductions, but instead slow start up to ssthresh. 2482 */ |
2475static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh) | 2483static void tcp_init_cwnd_reduction(struct sock *sk) |
2476{ 2477 struct tcp_sock *tp = tcp_sk(sk); 2478 2479 tp->high_seq = tp->snd_nxt; 2480 tp->tlp_high_seq = 0; 2481 tp->snd_cwnd_cnt = 0; 2482 tp->prior_cwnd = tp->snd_cwnd; 2483 tp->prr_delivered = 0; 2484 tp->prr_out = 0; | 2484{ 2485 struct tcp_sock *tp = tcp_sk(sk); 2486 2487 tp->high_seq = tp->snd_nxt; 2488 tp->tlp_high_seq = 0; 2489 tp->snd_cwnd_cnt = 0; 2490 tp->prior_cwnd = tp->snd_cwnd; 2491 tp->prr_delivered = 0; 2492 tp->prr_out = 0; |
2485 if (set_ssthresh) 2486 tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); | 2493 tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); |
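The tcp_cwnd_reduction() body that consumes the prr_* fields seeded here sits among the lines hidden just below; for orientation, here is a hedged user-space rendering of the Proportional Rate Reduction rule it implements, following RFC 6937 rather than copying the kernel function (the kernel approximates the RFC's RecoverFS with the prior_cwnd captured above).

#include <stdint.h>

struct prr_state {
	uint32_t ssthresh;	/* cwnd target after the reduction */
	uint32_t recover_fs;	/* flight size when recovery started */
	uint32_t prr_delivered;	/* packets delivered since recovery began */
	uint32_t prr_out;	/* packets sent since recovery began */
};

/* Returns how many segments may be sent for this ACK; the new cwnd is
 * then pipe + sndcnt. "delivered" is what this ACK newly acked/sacked,
 * "pipe" the current packets in flight.
 */
static uint32_t prr_sndcnt(struct prr_state *p, uint32_t delivered,
			   uint32_t pipe, int fast_rexmit)
{
	long sndcnt;

	p->prr_delivered += delivered;
	if (pipe > p->ssthresh) {
		/* proportional phase: spread the reduction over one RTT */
		uint64_t num = (uint64_t)p->ssthresh * p->prr_delivered;

		sndcnt = (long)((num + p->recover_fs - 1) / p->recover_fs) -
			 (long)p->prr_out;
	} else {
		/* flight fell below ssthresh: grow back, slow-start style */
		long limit = (long)p->prr_delivered - (long)p->prr_out;

		if (limit < (long)delivered)
			limit = delivered;
		limit += 1;
		sndcnt = (long)p->ssthresh - (long)pipe;
		if (sndcnt > limit)
			sndcnt = limit;
	}
	if (sndcnt < (fast_rexmit ? 1 : 0))
		sndcnt = fast_rexmit ? 1 : 0;
	return (uint32_t)sndcnt;
}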
2487 TCP_ECN_queue_cwr(tp); 2488} 2489 2490static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, 2491 int fast_rexmit) 2492{ 2493 struct tcp_sock *tp = tcp_sk(sk); 2494 int sndcnt = 0; --- 25 unchanged lines hidden (view full) --- 2520 (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) { 2521 tp->snd_cwnd = tp->snd_ssthresh; 2522 tp->snd_cwnd_stamp = tcp_time_stamp; 2523 } 2524 tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); 2525} 2526 2527/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */ | 2494 TCP_ECN_queue_cwr(tp); 2495} 2496 2497static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, 2498 int fast_rexmit) 2499{ 2500 struct tcp_sock *tp = tcp_sk(sk); 2501 int sndcnt = 0; --- 25 unchanged lines hidden (view full) --- 2527 (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) { 2528 tp->snd_cwnd = tp->snd_ssthresh; 2529 tp->snd_cwnd_stamp = tcp_time_stamp; 2530 } 2531 tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); 2532} 2533 2534/* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */ |
2528void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) | 2535void tcp_enter_cwr(struct sock *sk) |
2529{ 2530 struct tcp_sock *tp = tcp_sk(sk); 2531 2532 tp->prior_ssthresh = 0; 2533 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { 2534 tp->undo_marker = 0; | 2536{ 2537 struct tcp_sock *tp = tcp_sk(sk); 2538 2539 tp->prior_ssthresh = 0; 2540 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { 2541 tp->undo_marker = 0; |
2535 tcp_init_cwnd_reduction(sk, set_ssthresh); | 2542 tcp_init_cwnd_reduction(sk); |
2536 tcp_set_ca_state(sk, TCP_CA_CWR); 2537 } 2538} 2539 2540static void tcp_try_keep_open(struct sock *sk) 2541{ 2542 struct tcp_sock *tp = tcp_sk(sk); 2543 int state = TCP_CA_Open; --- 12 unchanged lines hidden (view full) --- 2556 struct tcp_sock *tp = tcp_sk(sk); 2557 2558 tcp_verify_left_out(tp); 2559 2560 if (!tcp_any_retrans_done(sk)) 2561 tp->retrans_stamp = 0; 2562 2563 if (flag & FLAG_ECE) | 2543 tcp_set_ca_state(sk, TCP_CA_CWR); 2544 } 2545} 2546 2547static void tcp_try_keep_open(struct sock *sk) 2548{ 2549 struct tcp_sock *tp = tcp_sk(sk); 2550 int state = TCP_CA_Open; --- 12 unchanged lines hidden (view full) --- 2563 struct tcp_sock *tp = tcp_sk(sk); 2564 2565 tcp_verify_left_out(tp); 2566 2567 if (!tcp_any_retrans_done(sk)) 2568 tp->retrans_stamp = 0; 2569 2570 if (flag & FLAG_ECE) |
2564 tcp_enter_cwr(sk, 1); | 2571 tcp_enter_cwr(sk); |
2565 2566 if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { 2567 tcp_try_keep_open(sk); 2568 } else { 2569 tcp_cwnd_reduction(sk, prior_unsacked, 0); 2570 } 2571} 2572 --- 83 unchanged lines hidden (view full) --- 2656 if (tcp_is_reno(tp)) 2657 mib_idx = LINUX_MIB_TCPRENORECOVERY; 2658 else 2659 mib_idx = LINUX_MIB_TCPSACKRECOVERY; 2660 2661 NET_INC_STATS_BH(sock_net(sk), mib_idx); 2662 2663 tp->prior_ssthresh = 0; | 2572 2573 if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { 2574 tcp_try_keep_open(sk); 2575 } else { 2576 tcp_cwnd_reduction(sk, prior_unsacked, 0); 2577 } 2578} 2579 --- 83 unchanged lines hidden (view full) --- 2663 if (tcp_is_reno(tp)) 2664 mib_idx = LINUX_MIB_TCPRENORECOVERY; 2665 else 2666 mib_idx = LINUX_MIB_TCPSACKRECOVERY; 2667 2668 NET_INC_STATS_BH(sock_net(sk), mib_idx); 2669 2670 tp->prior_ssthresh = 0; |
2664 tp->undo_marker = tp->snd_una; 2665 tp->undo_retrans = tp->retrans_out; | 2671 tcp_init_undo(tp); |
2666 2667 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { 2668 if (!ece_ack) 2669 tp->prior_ssthresh = tcp_current_ssthresh(sk); | 2672 2673 if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { 2674 if (!ece_ack) 2675 tp->prior_ssthresh = tcp_current_ssthresh(sk); |
2670 tcp_init_cwnd_reduction(sk, true); | 2676 tcp_init_cwnd_reduction(sk); |
2671 } 2672 tcp_set_ca_state(sk, TCP_CA_Recovery); 2673} 2674 2675/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are 2676 * recovered or spurious. Otherwise retransmits more on partial ACKs. 2677 */ 2678static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) 2679{ | 2677 } 2678 tcp_set_ca_state(sk, TCP_CA_Recovery); 2679} 2680 2681/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are 2682 * recovered or spurious. Otherwise retransmits more on partial ACKs. 2683 */ 2684static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) 2685{ |
2680 struct inet_connection_sock *icsk = inet_csk(sk); | |
2681 struct tcp_sock *tp = tcp_sk(sk); 2682 bool recovered = !before(tp->snd_una, tp->high_seq); 2683 2684 if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ | 2686 struct tcp_sock *tp = tcp_sk(sk); 2687 bool recovered = !before(tp->snd_una, tp->high_seq); 2688 2689 if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ |
2685 if (flag & FLAG_ORIG_SACK_ACKED) { 2686 /* Step 3.b. A timeout is spurious if not all data are 2687 * lost, i.e., never-retransmitted data are (s)acked. 2688 */ 2689 tcp_try_undo_loss(sk, true); | 2690 /* Step 3.b. A timeout is spurious if not all data are 2691 * lost, i.e., never-retransmitted data are (s)acked. 2692 */ 2693 if (tcp_try_undo_loss(sk, flag & FLAG_ORIG_SACK_ACKED)) |
2690 return; | 2694 return; |
2691 } | 2695 |
2692 if (after(tp->snd_nxt, tp->high_seq) && 2693 (flag & FLAG_DATA_SACKED || is_dupack)) { 2694 tp->frto = 0; /* Loss was real: 2nd part of step 3.a */ 2695 } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) { 2696 tp->high_seq = tp->snd_nxt; 2697 __tcp_push_pending_frames(sk, tcp_current_mss(sk), 2698 TCP_NAGLE_OFF); 2699 if (after(tp->snd_nxt, tp->high_seq)) 2700 return; /* Step 2.b */ 2701 tp->frto = 0; 2702 } 2703 } 2704 2705 if (recovered) { 2706 /* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */ | 2696 if (after(tp->snd_nxt, tp->high_seq) && 2697 (flag & FLAG_DATA_SACKED || is_dupack)) { 2698 tp->frto = 0; /* Loss was real: 2nd part of step 3.a */ 2699 } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) { 2700 tp->high_seq = tp->snd_nxt; 2701 __tcp_push_pending_frames(sk, tcp_current_mss(sk), 2702 TCP_NAGLE_OFF); 2703 if (after(tp->snd_nxt, tp->high_seq)) 2704 return; /* Step 2.b */ 2705 tp->frto = 0; 2706 } 2707 } 2708 2709 if (recovered) { 2710 /* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */ |
2707 icsk->icsk_retransmits = 0; | |
2708 tcp_try_undo_recovery(sk); 2709 return; 2710 } | 2711 tcp_try_undo_recovery(sk); 2712 return; 2713 } |
2711 if (flag & FLAG_DATA_ACKED) 2712 icsk->icsk_retransmits = 0; | |
2713 if (tcp_is_reno(tp)) { 2714 /* A Reno DUPACK means new data in F-RTO step 2.b above are 2715 * delivered. Lower inflight to clock out (re)tranmissions. 2716 */ 2717 if (after(tp->snd_nxt, tp->high_seq) && is_dupack) 2718 tcp_add_reno_sack(sk); 2719 else if (flag & FLAG_SND_UNA_ADVANCED) 2720 tcp_reset_reno_sack(tp); --- 158 unchanged lines hidden (view full) --- 2879 2880 if (do_lost) 2881 tcp_update_scoreboard(sk, fast_rexmit); 2882 tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit); 2883 tcp_xmit_retransmit_queue(sk); 2884} 2885 2886static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, | 2714 if (tcp_is_reno(tp)) { 2715 /* A Reno DUPACK means new data in F-RTO step 2.b above are 2716 * delivered. Lower inflight to clock out (re)tranmissions. 2717 */ 2718 if (after(tp->snd_nxt, tp->high_seq) && is_dupack) 2719 tcp_add_reno_sack(sk); 2720 else if (flag & FLAG_SND_UNA_ADVANCED) 2721 tcp_reset_reno_sack(tp); --- 158 unchanged lines hidden (view full) --- 2880 2881 if (do_lost) 2882 tcp_update_scoreboard(sk, fast_rexmit); 2883 tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit); 2884 tcp_xmit_retransmit_queue(sk); 2885} 2886 2887static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, |
2887 s32 seq_rtt, s32 sack_rtt) | 2888 long seq_rtt_us, long sack_rtt_us) |
2888{ 2889 const struct tcp_sock *tp = tcp_sk(sk); 2890 2891 /* Prefer RTT measured from ACK's timing to TS-ECR. This is because 2892 * broken middle-boxes or peers may corrupt TS-ECR fields. But 2893 * Karn's algorithm forbids taking RTT if some retransmitted data 2894 * is acked (RFC6298). 2895 */ 2896 if (flag & FLAG_RETRANS_DATA_ACKED) | 2889{ 2890 const struct tcp_sock *tp = tcp_sk(sk); 2891 2892 /* Prefer RTT measured from ACK's timing to TS-ECR. This is because 2893 * broken middle-boxes or peers may corrupt TS-ECR fields. But 2894 * Karn's algorithm forbids taking RTT if some retransmitted data 2895 * is acked (RFC6298). 2896 */ 2897 if (flag & FLAG_RETRANS_DATA_ACKED) |
2897 seq_rtt = -1; | 2898 seq_rtt_us = -1L; |
2898 | 2899 |
2899 if (seq_rtt < 0) 2900 seq_rtt = sack_rtt; | 2900 if (seq_rtt_us < 0) 2901 seq_rtt_us = sack_rtt_us; |
2901 2902 /* RTTM Rule: A TSecr value received in a segment is used to 2903 * update the averaged RTT measurement only if the segment 2904 * acknowledges some new data, i.e., only if it advances the 2905 * left edge of the send window. 2906 * See draft-ietf-tcplw-high-performance-00, section 3.3. 2907 */ | 2902 2903 /* RTTM Rule: A TSecr value received in a segment is used to 2904 * update the averaged RTT measurement only if the segment 2905 * acknowledges some new data, i.e., only if it advances the 2906 * left edge of the send window. 2907 * See draft-ietf-tcplw-high-performance-00, section 3.3. 2908 */ |
2908 if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && | 2909 if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && |
2909 flag & FLAG_ACKED) | 2910 flag & FLAG_ACKED) |
2910 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; | 2911 seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr); |
2911 | 2912 |
2912 if (seq_rtt < 0) | 2913 if (seq_rtt_us < 0) |
2913 return false; 2914 | 2914 return false; 2915 |
2915 tcp_rtt_estimator(sk, seq_rtt); | 2916 tcp_rtt_estimator(sk, seq_rtt_us); |
2916 tcp_set_rto(sk); 2917 2918 /* RFC6298: only reset backoff on valid RTT measurement. */ 2919 inet_csk(sk)->icsk_backoff = 0; 2920 return true; 2921} 2922 2923/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */ 2924static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp) 2925{ 2926 struct tcp_sock *tp = tcp_sk(sk); | 2917 tcp_set_rto(sk); 2918 2919 /* RFC6298: only reset backoff on valid RTT measurement. */ 2920 inet_csk(sk)->icsk_backoff = 0; 2921 return true; 2922} 2923 2924/* Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. */ 2925static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp) 2926{ 2927 struct tcp_sock *tp = tcp_sk(sk); |
2927 s32 seq_rtt = -1; | 2928 long seq_rtt_us = -1L; |
2928 2929 if (synack_stamp && !tp->total_retrans) | 2929 2930 if (synack_stamp && !tp->total_retrans) |
2930 seq_rtt = tcp_time_stamp - synack_stamp; | 2931 seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - synack_stamp); |
2931 2932 /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets 2933 * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack() 2934 */ | 2932 2933 /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets 2934 * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack() 2935 */ |
2935 if (!tp->srtt) 2936 tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1); | 2936 if (!tp->srtt_us) 2937 tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L); |
2937} 2938 | 2938} 2939 |
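A unit subtlety in the two functions above: an RTT taken from the timestamp option (tcp_time_stamp - rcv_tsecr) is measured in jiffies, so it is passed through jiffies_to_usecs() before it reaches the microsecond estimator, while ACK-timing samples are already in usec. The code's stated reason for preferring the ACK-timing sample is middlebox corruption of TS-ECR; the coarser 1/HZ resolution of the timestamp clock is a further limitation. With the common HZ=1000 (an assumption for this example) the conversion is simply:

#include <stdio.h>

#define HZ 1000		/* assumed tick rate for this example */

int main(void)
{
	unsigned int delta_jiffies = 25;	/* tcp_time_stamp - rcv_tsecr */
	unsigned int usec = delta_jiffies * (1000000 / HZ);

	printf("TS-ECR RTT sample: %u jiffies -> %u us\n", delta_jiffies, usec);
	return 0;
}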
2939static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) | 2940static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) |
2940{ 2941 const struct inet_connection_sock *icsk = inet_csk(sk); | 2941{ 2942 const struct inet_connection_sock *icsk = inet_csk(sk); |
2942 icsk->icsk_ca_ops->cong_avoid(sk, ack, acked, in_flight); | 2943 2944 icsk->icsk_ca_ops->cong_avoid(sk, ack, acked); |
2943 tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp; 2944} 2945 2946/* Restart timer after forward progress on connection. 2947 * RFC2988 recommends to restart timer to now+rto. 2948 */ 2949void tcp_rearm_rto(struct sock *sk) 2950{ --- 9 unchanged lines hidden (view full) --- 2960 if (!tp->packets_out) { 2961 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); 2962 } else { 2963 u32 rto = inet_csk(sk)->icsk_rto; 2964 /* Offset the time elapsed after installing regular RTO */ 2965 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || 2966 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2967 struct sk_buff *skb = tcp_write_queue_head(sk); | 2945 tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp; 2946} 2947 2948/* Restart timer after forward progress on connection. 2949 * RFC2988 recommends to restart timer to now+rto. 2950 */ 2951void tcp_rearm_rto(struct sock *sk) 2952{ --- 9 unchanged lines hidden (view full) --- 2962 if (!tp->packets_out) { 2963 inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); 2964 } else { 2965 u32 rto = inet_csk(sk)->icsk_rto; 2966 /* Offset the time elapsed after installing regular RTO */ 2967 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || 2968 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { 2969 struct sk_buff *skb = tcp_write_queue_head(sk); |
2968 const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto; | 2970 const u32 rto_time_stamp = 2971 tcp_skb_timestamp(skb) + rto; |
2969 s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); 2970 /* delta may not be positive if the socket is locked 2971 * when the retrans timer fires and is rescheduled. 2972 */ 2973 if (delta > 0) 2974 rto = delta; 2975 } 2976 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, --- 40 unchanged lines hidden (view full) --- 3017 return packets_acked; 3018} 3019 3020/* Remove acknowledged frames from the retransmission queue. If our packet 3021 * is before the ack sequence we can discard it as it's confirmed to have 3022 * arrived at the other end. 3023 */ 3024static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, | 2972 s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); 2973 /* delta may not be positive if the socket is locked 2974 * when the retrans timer fires and is rescheduled. 2975 */ 2976 if (delta > 0) 2977 rto = delta; 2978 } 2979 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, --- 40 unchanged lines hidden (view full) --- 3020 return packets_acked; 3021} 3022 3023/* Remove acknowledged frames from the retransmission queue. If our packet 3024 * is before the ack sequence we can discard it as it's confirmed to have 3025 * arrived at the other end. 3026 */ 3027static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, |
3025 u32 prior_snd_una, s32 sack_rtt) | 3028 u32 prior_snd_una, long sack_rtt_us) |
3026{ | 3029{ |
3027 struct tcp_sock *tp = tcp_sk(sk); | |
3028 const struct inet_connection_sock *icsk = inet_csk(sk); | 3030 const struct inet_connection_sock *icsk = inet_csk(sk); |
3029 struct sk_buff *skb; 3030 u32 now = tcp_time_stamp; | 3031 struct skb_mstamp first_ackt, last_ackt, now; 3032 struct tcp_sock *tp = tcp_sk(sk); 3033 u32 prior_sacked = tp->sacked_out; 3034 u32 reord = tp->packets_out; |
3031 bool fully_acked = true; | 3035 bool fully_acked = true; |
3032 int flag = 0; | 3036 long ca_seq_rtt_us = -1L; 3037 long seq_rtt_us = -1L; 3038 struct sk_buff *skb; |
3033 u32 pkts_acked = 0; | 3039 u32 pkts_acked = 0; |
3034 u32 reord = tp->packets_out; 3035 u32 prior_sacked = tp->sacked_out; 3036 s32 seq_rtt = -1; 3037 s32 ca_seq_rtt = -1; 3038 ktime_t last_ackt = net_invalid_timestamp(); | |
3039 bool rtt_update; | 3040 bool rtt_update; |
3041 int flag = 0; |
|
3040 | 3042 |
3043 first_ackt.v64 = 0; 3044 |
|
3041 while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { | 3045 while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { |
3046 struct skb_shared_info *shinfo = skb_shinfo(skb); |
|
3042 struct tcp_skb_cb *scb = TCP_SKB_CB(skb); | 3047 struct tcp_skb_cb *scb = TCP_SKB_CB(skb); |
3043 u32 acked_pcount; | |
3044 u8 sacked = scb->sacked; | 3048 u8 sacked = scb->sacked; |
3049 u32 acked_pcount; |
|
3045 | 3050 |
3051 if (unlikely(shinfo->tx_flags & SKBTX_ACK_TSTAMP) && 3052 between(shinfo->tskey, prior_snd_una, tp->snd_una - 1)) 3053 __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); 3054 |
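The hunk just above is the delivery point for ACK timestamps requested through SO_TIMESTAMPING: when the newly acked range covers the skb's tskey, a report is queued to the socket's error queue. Below is a minimal user-space sketch of requesting such timestamps, assuming a kernel and headers new enough to define SOF_TIMESTAMPING_TX_ACK (it appeared in the same development cycle as this change); the reports are then read back with recvmsg(..., MSG_ERRQUEUE).

#include <linux/net_tstamp.h>
#include <sys/socket.h>
#include <stdio.h>

#ifndef SO_TIMESTAMPING
#define SO_TIMESTAMPING 37	/* value on asm-generic architectures; fallback for older libc headers */
#endif

int enable_ack_timestamps(int fd)
{
	unsigned int val = SOF_TIMESTAMPING_TX_ACK |	/* stamp when the peer ACKs the data */
			   SOF_TIMESTAMPING_SOFTWARE |	/* report software timestamps */
			   SOF_TIMESTAMPING_OPT_ID;	/* tag each report with a byte-offset id */

	if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val)) < 0) {
		perror("SO_TIMESTAMPING");
		return -1;
	}
	return 0;
}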
|
3046 /* Determine how many packets and what bytes were acked, tso and else */ 3047 if (after(scb->end_seq, tp->snd_una)) { 3048 if (tcp_skb_pcount(skb) == 1 || 3049 !after(tp->snd_una, scb->seq)) 3050 break; 3051 3052 acked_pcount = tcp_tso_acked(sk, skb); 3053 if (!acked_pcount) --- 4 unchanged lines hidden (view full) --- 3058 acked_pcount = tcp_skb_pcount(skb); 3059 } 3060 3061 if (sacked & TCPCB_RETRANS) { 3062 if (sacked & TCPCB_SACKED_RETRANS) 3063 tp->retrans_out -= acked_pcount; 3064 flag |= FLAG_RETRANS_DATA_ACKED; 3065 } else { | 3055 /* Determine how many packets and what bytes were acked, tso and else */ 3056 if (after(scb->end_seq, tp->snd_una)) { 3057 if (tcp_skb_pcount(skb) == 1 || 3058 !after(tp->snd_una, scb->seq)) 3059 break; 3060 3061 acked_pcount = tcp_tso_acked(sk, skb); 3062 if (!acked_pcount) --- 4 unchanged lines hidden (view full) --- 3067 acked_pcount = tcp_skb_pcount(skb); 3068 } 3069 3070 if (sacked & TCPCB_RETRANS) { 3071 if (sacked & TCPCB_SACKED_RETRANS) 3072 tp->retrans_out -= acked_pcount; 3073 flag |= FLAG_RETRANS_DATA_ACKED; 3074 } else { |
3066 ca_seq_rtt = now - scb->when; 3067 last_ackt = skb->tstamp; 3068 if (seq_rtt < 0) { 3069 seq_rtt = ca_seq_rtt; 3070 } | 3075 last_ackt = skb->skb_mstamp; 3076 WARN_ON_ONCE(last_ackt.v64 == 0); 3077 if (!first_ackt.v64) 3078 first_ackt = last_ackt; 3079 |
3071 if (!(sacked & TCPCB_SACKED_ACKED)) 3072 reord = min(pkts_acked, reord); 3073 if (!after(scb->end_seq, tp->high_seq)) 3074 flag |= FLAG_ORIG_SACK_ACKED; 3075 } 3076 3077 if (sacked & TCPCB_SACKED_ACKED) 3078 tp->sacked_out -= acked_pcount; --- 29 unchanged lines hidden (view full) --- 3108 } 3109 3110 if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una))) 3111 tp->snd_up = tp->snd_una; 3112 3113 if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) 3114 flag |= FLAG_SACK_RENEGING; 3115 | 3080 if (!(sacked & TCPCB_SACKED_ACKED)) 3081 reord = min(pkts_acked, reord); 3082 if (!after(scb->end_seq, tp->high_seq)) 3083 flag |= FLAG_ORIG_SACK_ACKED; 3084 } 3085 3086 if (sacked & TCPCB_SACKED_ACKED) 3087 tp->sacked_out -= acked_pcount; --- 29 unchanged lines hidden (view full) --- 3117 } 3118 3119 if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una))) 3120 tp->snd_up = tp->snd_una; 3121 3122 if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) 3123 flag |= FLAG_SACK_RENEGING; 3124 |
3116 rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt); | 3125 skb_mstamp_get(&now); 3126 if (first_ackt.v64) { 3127 seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt); 3128 ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); 3129 } |
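When a single cumulative ACK covers several skbs, the rewritten loop above keeps two samples: seq_rtt_us is measured from the oldest newly-acked skb (first_ackt, the conservative value fed to the RTO/srtt machinery below) and ca_seq_rtt_us from the newest one (last_ackt, handed to the congestion module's pkts_acked hook). A toy illustration with made-up send times:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* transmit stamps (usec) of three skbs acked by one cumulative ACK */
	uint64_t sent_us[3] = { 1000, 1200, 1400 };
	uint64_t now_us = 51400;

	printf("seq_rtt_us    = %llu (from first_ackt, RTO/srtt input)\n",
	       (unsigned long long)(now_us - sent_us[0]));
	printf("ca_seq_rtt_us = %llu (from last_ackt, pkts_acked input)\n",
	       (unsigned long long)(now_us - sent_us[2]));
	return 0;
}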
3117 | 3130 |
3131 rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us); 3132 |
|
3118 if (flag & FLAG_ACKED) { 3119 const struct tcp_congestion_ops *ca_ops 3120 = inet_csk(sk)->icsk_ca_ops; 3121 3122 tcp_rearm_rto(sk); 3123 if (unlikely(icsk->icsk_mtup.probe_size && 3124 !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) { 3125 tcp_mtup_probe_success(sk); --- 10 unchanged lines hidden (view full) --- 3136 3137 delta = tcp_is_fack(tp) ? pkts_acked : 3138 prior_sacked - tp->sacked_out; 3139 tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); 3140 } 3141 3142 tp->fackets_out -= min(pkts_acked, tp->fackets_out); 3143 | 3133 if (flag & FLAG_ACKED) { 3134 const struct tcp_congestion_ops *ca_ops 3135 = inet_csk(sk)->icsk_ca_ops; 3136 3137 tcp_rearm_rto(sk); 3138 if (unlikely(icsk->icsk_mtup.probe_size && 3139 !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) { 3140 tcp_mtup_probe_success(sk); --- 10 unchanged lines hidden (view full) --- 3151 3152 delta = tcp_is_fack(tp) ? pkts_acked : 3153 prior_sacked - tp->sacked_out; 3154 tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); 3155 } 3156 3157 tp->fackets_out -= min(pkts_acked, tp->fackets_out); 3158 |
3144 if (ca_ops->pkts_acked) { 3145 s32 rtt_us = -1; | 3159 if (ca_ops->pkts_acked) 3160 ca_ops->pkts_acked(sk, pkts_acked, ca_seq_rtt_us); |
3146 | 3161 |
3147 /* Is the ACK triggering packet unambiguous? */ 3148 if (!(flag & FLAG_RETRANS_DATA_ACKED)) { 3149 /* High resolution needed and available? */ 3150 if (ca_ops->flags & TCP_CONG_RTT_STAMP && 3151 !ktime_equal(last_ackt, 3152 net_invalid_timestamp())) 3153 rtt_us = ktime_us_delta(ktime_get_real(), 3154 last_ackt); 3155 else if (ca_seq_rtt >= 0) 3156 rtt_us = jiffies_to_usecs(ca_seq_rtt); 3157 } 3158 3159 ca_ops->pkts_acked(sk, pkts_acked, rtt_us); 3160 } 3161 } else if (skb && rtt_update && sack_rtt >= 0 && 3162 sack_rtt > (s32)(now - TCP_SKB_CB(skb)->when)) { | 3162 } else if (skb && rtt_update && sack_rtt_us >= 0 && 3163 sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) { |
3163 /* Do not re-arm RTO if the sack RTT is measured from data sent 3164 * after when the head was last (re)transmitted. Otherwise the 3165 * timeout may continue to extend in loss recovery. 3166 */ 3167 tcp_rearm_rto(sk); 3168 } 3169 3170#if FASTRETRANS_DEBUG > 0 --- 31 unchanged lines hidden (view full) --- 3202 3203 if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) { 3204 icsk->icsk_backoff = 0; 3205 inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); 3206 /* Socket must be waked up by subsequent tcp_data_snd_check(). 3207 * This function is not for random using! 3208 */ 3209 } else { | 3164 /* Do not re-arm RTO if the sack RTT is measured from data sent 3165 * after when the head was last (re)transmitted. Otherwise the 3166 * timeout may continue to extend in loss recovery. 3167 */ 3168 tcp_rearm_rto(sk); 3169 } 3170 3171#if FASTRETRANS_DEBUG > 0 --- 31 unchanged lines hidden (view full) --- 3203 3204 if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) { 3205 icsk->icsk_backoff = 0; 3206 inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); 3207 /* Socket must be waked up by subsequent tcp_data_snd_check(). 3208 * This function is not for random using! 3209 */ 3210 } else { |
3211 unsigned long when = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); 3212 |
|
3210 inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, | 3213 inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, |
3211 min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), 3212 TCP_RTO_MAX); | 3214 when, TCP_RTO_MAX); |
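The open-coded min(icsk_rto << icsk_backoff, TCP_RTO_MAX) on the left is replaced by a "when" computed via inet_csk_rto_backoff(); judging from the call site it yields the same exponentially backed-off, clamped timeout, presumably with a 64-bit intermediate so a large backoff cannot overflow the shift. A stand-alone sketch of that calculation (my rendering, not the helper's actual source):

#include <stdint.h>

/* Back off a retransmission-style timer: rto << backoff, clamped to max_when. */
static unsigned long rto_backoff(unsigned long rto, unsigned int backoff,
				 unsigned long max_when)
{
	uint64_t when = (uint64_t)rto << backoff;

	return when > max_when ? max_when : (unsigned long)when;
}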
3213 } 3214} 3215 3216static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag) 3217{ 3218 return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || 3219 inet_csk(sk)->icsk_ca_state != TCP_CA_Open; 3220} --- 124 unchanged lines hidden (view full) --- 3345 tp->tlp_high_seq = 0; 3346 return; 3347 } 3348 3349 if (after(ack, tp->tlp_high_seq)) { 3350 tp->tlp_high_seq = 0; 3351 /* Don't reduce cwnd if DSACK arrives for TLP retrans. */ 3352 if (!(flag & FLAG_DSACKING_ACK)) { | 3215 } 3216} 3217 3218static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag) 3219{ 3220 return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || 3221 inet_csk(sk)->icsk_ca_state != TCP_CA_Open; 3222} --- 124 unchanged lines hidden (view full) --- 3347 tp->tlp_high_seq = 0; 3348 return; 3349 } 3350 3351 if (after(ack, tp->tlp_high_seq)) { 3352 tp->tlp_high_seq = 0; 3353 /* Don't reduce cwnd if DSACK arrives for TLP retrans. */ 3354 if (!(flag & FLAG_DSACKING_ACK)) { |
3353 tcp_init_cwnd_reduction(sk, true); | 3355 tcp_init_cwnd_reduction(sk); |
3354 tcp_set_ca_state(sk, TCP_CA_CWR); 3355 tcp_end_cwnd_reduction(sk); 3356 tcp_try_keep_open(sk); 3357 NET_INC_STATS_BH(sock_net(sk), 3358 LINUX_MIB_TCPLOSSPROBERECOVERY); 3359 } 3360 } 3361} 3362 3363/* This routine deals with incoming acks, but not outgoing ones. */ 3364static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) 3365{ 3366 struct inet_connection_sock *icsk = inet_csk(sk); 3367 struct tcp_sock *tp = tcp_sk(sk); 3368 u32 prior_snd_una = tp->snd_una; 3369 u32 ack_seq = TCP_SKB_CB(skb)->seq; 3370 u32 ack = TCP_SKB_CB(skb)->ack_seq; 3371 bool is_dupack = false; | 3356 tcp_set_ca_state(sk, TCP_CA_CWR); 3357 tcp_end_cwnd_reduction(sk); 3358 tcp_try_keep_open(sk); 3359 NET_INC_STATS_BH(sock_net(sk), 3360 LINUX_MIB_TCPLOSSPROBERECOVERY); 3361 } 3362 } 3363} 3364 3365/* This routine deals with incoming acks, but not outgoing ones. */ 3366static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) 3367{ 3368 struct inet_connection_sock *icsk = inet_csk(sk); 3369 struct tcp_sock *tp = tcp_sk(sk); 3370 u32 prior_snd_una = tp->snd_una; 3371 u32 ack_seq = TCP_SKB_CB(skb)->seq; 3372 u32 ack = TCP_SKB_CB(skb)->ack_seq; 3373 bool is_dupack = false; |
3372 u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt; | |
3373 u32 prior_fackets; 3374 int prior_packets = tp->packets_out; 3375 const int prior_unsacked = tp->packets_out - tp->sacked_out; 3376 int acked = 0; /* Number of packets newly acked */ | 3374 u32 prior_fackets; 3375 int prior_packets = tp->packets_out; 3376 const int prior_unsacked = tp->packets_out - tp->sacked_out; 3377 int acked = 0; /* Number of packets newly acked */ |
3377 s32 sack_rtt = -1; | 3378 long sack_rtt_us = -1L; |
3378 3379 /* If the ack is older than previous acks 3380 * then we can probably ignore it. 3381 */ 3382 if (before(ack, prior_snd_una)) { 3383 /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ 3384 if (before(ack, prior_snd_una - tp->max_window)) { 3385 tcp_send_challenge_ack(sk); --- 7 unchanged lines hidden (view full) --- 3393 */ 3394 if (after(ack, tp->snd_nxt)) 3395 goto invalid_ack; 3396 3397 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || 3398 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) 3399 tcp_rearm_rto(sk); 3400 | 3379 3380 /* If the ack is older than previous acks 3381 * then we can probably ignore it. 3382 */ 3383 if (before(ack, prior_snd_una)) { 3384 /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ 3385 if (before(ack, prior_snd_una - tp->max_window)) { 3386 tcp_send_challenge_ack(sk); --- 7 unchanged lines hidden (view full) --- 3394 */ 3395 if (after(ack, tp->snd_nxt)) 3396 goto invalid_ack; 3397 3398 if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || 3399 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) 3400 tcp_rearm_rto(sk); 3401 |
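The old-ACK test and the RFC 5961 max_window guard above rely on before()/after(), which compare 32-bit sequence numbers modulo 2^32. A self-contained sketch of that wraparound-safe comparison via signed subtraction (hypothetical helper names, not the kernel macros):

#include <stdint.h>
#include <stdio.h>

/* s1 is "before" s2 when the signed 32-bit difference is negative;
 * this stays correct when the sequence space wraps past 2^32 - 1.
 */
static int seq_before(uint32_t s1, uint32_t s2)
{
        return (int32_t)(s1 - s2) < 0;
}

static int seq_after(uint32_t s1, uint32_t s2)
{
        return seq_before(s2, s1);
}

int main(void)
{
        uint32_t snd_una = 0xfffffff0u;  /* just below the wrap point */
        uint32_t ack     = 0x00000010u;  /* 32 bytes later, after wrapping */

        printf("ack before snd_una? %d\n", seq_before(ack, snd_una));  /* prints 0 */
        printf("ack after  snd_una? %d\n", seq_after(ack, snd_una));   /* prints 1 */
        return 0;
}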
3401 if (after(ack, prior_snd_una)) | 3402 if (after(ack, prior_snd_una)) { |
3402 flag |= FLAG_SND_UNA_ADVANCED; | 3403 flag |= FLAG_SND_UNA_ADVANCED; |
3404 icsk->icsk_retransmits = 0; 3405 } |
|
3403 3404 prior_fackets = tp->fackets_out; | 3406 3407 prior_fackets = tp->fackets_out; |
3405 prior_in_flight = tcp_packets_in_flight(tp); | |
3406 3407 /* ts_recent update must be made after we are sure that the packet 3408 * is in window. 3409 */ 3410 if (flag & FLAG_UPDATE_TS_RECENT) 3411 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); 3412 3413 if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) { --- 13 unchanged lines hidden (view full) --- 3427 flag |= FLAG_DATA; 3428 else 3429 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPUREACKS); 3430 3431 flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); 3432 3433 if (TCP_SKB_CB(skb)->sacked) 3434 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, | 3408 3409 /* ts_recent update must be made after we are sure that the packet 3410 * is in window. 3411 */ 3412 if (flag & FLAG_UPDATE_TS_RECENT) 3413 tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); 3414 3415 if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) { --- 13 unchanged lines hidden (view full) --- 3429 flag |= FLAG_DATA; 3430 else 3431 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPUREACKS); 3432 3433 flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); 3434 3435 if (TCP_SKB_CB(skb)->sacked) 3436 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, |
3435 &sack_rtt); | 3437 &sack_rtt_us); |
3436 3437 if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) 3438 flag |= FLAG_ECE; 3439 3440 tcp_ca_event(sk, CA_EVENT_SLOW_ACK); 3441 } 3442 3443 /* We passed data and got it acked, remove any soft error 3444 * log. Something worked... 3445 */ 3446 sk->sk_err_soft = 0; 3447 icsk->icsk_probes_out = 0; 3448 tp->rcv_tstamp = tcp_time_stamp; 3449 if (!prior_packets) 3450 goto no_queue; 3451 3452 /* See if we can take anything off of the retransmit queue. */ 3453 acked = tp->packets_out; | 3438 3439 if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) 3440 flag |= FLAG_ECE; 3441 3442 tcp_ca_event(sk, CA_EVENT_SLOW_ACK); 3443 } 3444 3445 /* We passed data and got it acked, remove any soft error 3446 * log. Something worked... 3447 */ 3448 sk->sk_err_soft = 0; 3449 icsk->icsk_probes_out = 0; 3450 tp->rcv_tstamp = tcp_time_stamp; 3451 if (!prior_packets) 3452 goto no_queue; 3453 3454 /* See if we can take anything off of the retransmit queue. */ 3455 acked = tp->packets_out; |
3454 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt); | 3456 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, 3457 sack_rtt_us); |
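The RTT plumbing above now carries samples as plain microsecond counts (long, with -1L meaning "no valid sample") rather than jiffies with an optional ktime fallback. A small userspace sketch of taking such a sample from two monotonic timestamps; the names are illustrative only, not kernel interfaces.

#include <stdio.h>
#include <time.h>
#include <unistd.h>

/* Elapsed time between two monotonic timestamps, in microseconds.
 * Callers in this sketch use -1L for "no sample", mirroring sack_rtt_us.
 */
static long stamp_us_delta(const struct timespec *later, const struct timespec *earlier)
{
        return (later->tv_sec - earlier->tv_sec) * 1000000L +
               (later->tv_nsec - earlier->tv_nsec) / 1000L;
}

int main(void)
{
        struct timespec sent, acked;
        long rtt_us = -1L;                       /* no sample yet */

        clock_gettime(CLOCK_MONOTONIC, &sent);
        usleep(2000);                            /* pretend the segment was in flight */
        clock_gettime(CLOCK_MONOTONIC, &acked);

        rtt_us = stamp_us_delta(&acked, &sent);
        printf("rtt sample: %ld us\n", rtt_us);
        return 0;
}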
3455 acked -= tp->packets_out; 3456 3457 /* Advance cwnd if state allows */ 3458 if (tcp_may_raise_cwnd(sk, flag)) | 3458 acked -= tp->packets_out; 3459 3460 /* Advance cwnd if state allows */ 3461 if (tcp_may_raise_cwnd(sk, flag)) |
3459 tcp_cong_avoid(sk, ack, acked, prior_in_flight); | 3462 tcp_cong_avoid(sk, ack, acked); |
3460 3461 if (tcp_ack_is_dubious(sk, flag)) { 3462 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); 3463 tcp_fastretrans_alert(sk, acked, prior_unsacked, 3464 is_dupack, flag); 3465 } 3466 if (tp->tlp_high_seq) 3467 tcp_process_tlp_ack(sk, ack, flag); 3468 3469 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { 3470 struct dst_entry *dst = __sk_dst_get(sk); 3471 if (dst) 3472 dst_confirm(dst); 3473 } 3474 3475 if (icsk->icsk_pending == ICSK_TIME_RETRANS) 3476 tcp_schedule_loss_probe(sk); | 3463 3464 if (tcp_ack_is_dubious(sk, flag)) { 3465 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); 3466 tcp_fastretrans_alert(sk, acked, prior_unsacked, 3467 is_dupack, flag); 3468 } 3469 if (tp->tlp_high_seq) 3470 tcp_process_tlp_ack(sk, ack, flag); 3471 3472 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { 3473 struct dst_entry *dst = __sk_dst_get(sk); 3474 if (dst) 3475 dst_confirm(dst); 3476 } 3477 3478 if (icsk->icsk_pending == ICSK_TIME_RETRANS) 3479 tcp_schedule_loss_probe(sk); |
3477 if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd) 3478 tcp_update_pacing_rate(sk); | 3480 tcp_update_pacing_rate(sk); |
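tcp_update_pacing_rate() is now invoked on every ACK rather than only when srtt or cwnd changed. As a rough mental model only (the helper's body is not shown here, and the kernel uses fixed-point arithmetic), the pacing rate tracks something like factor * cwnd * mss / srtt; the factor and the numbers below are invented for illustration.

#include <stdio.h>

/* Back-of-the-envelope pacing rate: roughly the bytes a full cwnd moves per
 * smoothed RTT, times a safety factor so newly granted cwnd can be used
 * promptly.  The factor of 2 and all names here are illustrative assumptions.
 */
static double sketch_pacing_rate(unsigned int cwnd_pkts, unsigned int mss,
                                 double srtt_us, double factor)
{
        if (srtt_us <= 0.0)
                return 0.0;
        return factor * cwnd_pkts * mss / (srtt_us / 1e6);   /* bytes per second */
}

int main(void)
{
        /* 10 packets of 1448 bytes with a 20 ms smoothed RTT. */
        double rate = sketch_pacing_rate(10, 1448, 20000.0, 2.0);

        printf("pacing rate: %.0f bytes/s (%.1f Mbit/s)\n", rate, rate * 8.0 / 1e6);
        return 0;
}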
3479 return 1; 3480 3481no_queue: 3482 /* If data was DSACKed, see if we can undo a cwnd reduction. */ 3483 if (flag & FLAG_DSACKING_ACK) 3484 tcp_fastretrans_alert(sk, acked, prior_unsacked, 3485 is_dupack, flag); 3486 /* If this ack opens up a zero window, clear backoff. It was --- 12 unchanged lines hidden (view full) --- 3499 return -1; 3500 3501old_ack: 3502 /* If data was SACKed, tag it and see if we should send more data. 3503 * If data was DSACKed, see if we can undo a cwnd reduction. 3504 */ 3505 if (TCP_SKB_CB(skb)->sacked) { 3506 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, | 3481 return 1; 3482 3483no_queue: 3484 /* If data was DSACKed, see if we can undo a cwnd reduction. */ 3485 if (flag & FLAG_DSACKING_ACK) 3486 tcp_fastretrans_alert(sk, acked, prior_unsacked, 3487 is_dupack, flag); 3488 /* If this ack opens up a zero window, clear backoff. It was --- 12 unchanged lines hidden (view full) --- 3501 return -1; 3502 3503old_ack: 3504 /* If data was SACKed, tag it and see if we should send more data. 3505 * If data was DSACKed, see if we can undo a cwnd reduction. 3506 */ 3507 if (TCP_SKB_CB(skb)->sacked) { 3508 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, |
3507 &sack_rtt); | 3509 &sack_rtt_us); |
3508 tcp_fastretrans_alert(sk, acked, prior_unsacked, 3509 is_dupack, flag); 3510 } 3511 3512 SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt); 3513 return 0; 3514} 3515 --- 538 unchanged lines hidden (view full) --- 4054 continue; 4055 } 4056 this_sack++; 4057 sp++; 4058 } 4059 tp->rx_opt.num_sacks = num_sacks; 4060} 4061 | 3510 tcp_fastretrans_alert(sk, acked, prior_unsacked, 3511 is_dupack, flag); 3512 } 3513 3514 SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt); 3515 return 0; 3516} 3517 --- 538 unchanged lines hidden (view full) --- 4056 continue; 4057 } 4058 this_sack++; 4059 sp++; 4060 } 4061 tp->rx_opt.num_sacks = num_sacks; 4062} 4063 |
4064/** 4065 * tcp_try_coalesce - try to merge skb to prior one 4066 * @sk: socket 4067 * @to: prior buffer 4068 * @from: buffer to add in queue 4069 * @fragstolen: pointer to boolean 4070 * 4071 * Before queueing skb @from after @to, try to merge them 4072 * to reduce overall memory use and queue lengths, if cost is small. 4073 * Packets in ofo or receive queues can stay a long time. 4074 * Better try to coalesce them right now to avoid future collapses. 4075 * Returns true if caller should free @from instead of queueing it 4076 */ 4077static bool tcp_try_coalesce(struct sock *sk, 4078 struct sk_buff *to, 4079 struct sk_buff *from, 4080 bool *fragstolen) 4081{ 4082 int delta; 4083 4084 *fragstolen = false; 4085 4086 /* Its possible this segment overlaps with prior segment in queue */ 4087 if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) 4088 return false; 4089 4090 if (!skb_try_coalesce(to, from, fragstolen, &delta)) 4091 return false; 4092 4093 atomic_add(delta, &sk->sk_rmem_alloc); 4094 sk_mem_charge(sk, delta); 4095 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); 4096 TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; 4097 TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; 4098 TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags; 4099 return true; 4100} 4101 |
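tcp_try_coalesce() above merges only when @from starts exactly at @to's end_seq and the underlying skb merge succeeds, then charges the added bytes to the socket. A toy userspace model of that contiguity check and accounting, using made-up segment structures instead of real skbs:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Toy segment: a contiguous byte range plus a small payload buffer. */
struct seg {
        unsigned int seq;
        unsigned int end_seq;
        unsigned int len;
        unsigned char data[256];
};

/* Merge "from" into "to" only when it starts exactly at to->end_seq and fits.
 * Returns true when the caller should drop "from" instead of queueing it.
 */
static bool try_coalesce(struct seg *to, const struct seg *from, unsigned int *delta)
{
        if (from->seq != to->end_seq)
                return false;                  /* not contiguous: keep both */
        if (to->len + from->len > sizeof(to->data))
                return false;                  /* would not fit: keep both */

        memcpy(to->data + to->len, from->data, from->len);
        to->len += from->len;
        to->end_seq = from->end_seq;
        *delta = from->len;                    /* extra memory now charged to "to" */
        return true;
}

int main(void)
{
        struct seg a = { .seq = 1000, .end_seq = 1100, .len = 100 };
        struct seg b = { .seq = 1100, .end_seq = 1150, .len = 50 };
        unsigned int delta = 0;

        printf("coalesced=%d delta=%u end_seq=%u\n",
               try_coalesce(&a, &b, &delta), delta, a.end_seq);
        return 0;
}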
|
4062/* This one checks to see if we can put data from the 4063 * out_of_order queue into the receive_queue. 4064 */ 4065static void tcp_ofo_queue(struct sock *sk) 4066{ 4067 struct tcp_sock *tp = tcp_sk(sk); 4068 __u32 dsack_high = tp->rcv_nxt; | 4102/* This one checks to see if we can put data from the 4103 * out_of_order queue into the receive_queue. 4104 */ 4105static void tcp_ofo_queue(struct sock *sk) 4106{ 4107 struct tcp_sock *tp = tcp_sk(sk); 4108 __u32 dsack_high = tp->rcv_nxt; |
4069 struct sk_buff *skb; | 4109 struct sk_buff *skb, *tail; 4110 bool fragstolen, eaten; |
4070 4071 while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) { 4072 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) 4073 break; 4074 4075 if (before(TCP_SKB_CB(skb)->seq, dsack_high)) { 4076 __u32 dsack = dsack_high; 4077 if (before(TCP_SKB_CB(skb)->end_seq, dsack_high)) 4078 dsack_high = TCP_SKB_CB(skb)->end_seq; 4079 tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); 4080 } 4081 | 4111 4112 while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) { 4113 if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) 4114 break; 4115 4116 if (before(TCP_SKB_CB(skb)->seq, dsack_high)) { 4117 __u32 dsack = dsack_high; 4118 if (before(TCP_SKB_CB(skb)->end_seq, dsack_high)) 4119 dsack_high = TCP_SKB_CB(skb)->end_seq; 4120 tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); 4121 } 4122 |
4123 __skb_unlink(skb, &tp->out_of_order_queue); |
|
4082 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { 4083 SOCK_DEBUG(sk, "ofo packet was already received\n"); | 4124 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { 4125 SOCK_DEBUG(sk, "ofo packet was already received\n"); |
4084 __skb_unlink(skb, &tp->out_of_order_queue); | |
4085 __kfree_skb(skb); 4086 continue; 4087 } 4088 SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n", 4089 tp->rcv_nxt, TCP_SKB_CB(skb)->seq, 4090 TCP_SKB_CB(skb)->end_seq); 4091 | 4126 __kfree_skb(skb); 4127 continue; 4128 } 4129 SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n", 4130 tp->rcv_nxt, TCP_SKB_CB(skb)->seq, 4131 TCP_SKB_CB(skb)->end_seq); 4132 |
4092 __skb_unlink(skb, &tp->out_of_order_queue); 4093 __skb_queue_tail(&sk->sk_receive_queue, skb); | 4133 tail = skb_peek_tail(&sk->sk_receive_queue); 4134 eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); |
4094 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | 4135 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; |
4095 if (tcp_hdr(skb)->fin) | 4136 if (!eaten) 4137 __skb_queue_tail(&sk->sk_receive_queue, skb); 4138 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) |
4096 tcp_fin(sk); | 4139 tcp_fin(sk); |
4140 if (eaten) 4141 kfree_skb_partial(skb, fragstolen); |
|
4097 } 4098} 4099 4100static bool tcp_prune_ofo_queue(struct sock *sk); 4101static int tcp_prune_queue(struct sock *sk); 4102 4103static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, 4104 unsigned int size) --- 10 unchanged lines hidden (view full) --- 4115 4116 if (!sk_rmem_schedule(sk, skb, size)) 4117 return -1; 4118 } 4119 } 4120 return 0; 4121} 4122 | 4142 } 4143} 4144 4145static bool tcp_prune_ofo_queue(struct sock *sk); 4146static int tcp_prune_queue(struct sock *sk); 4147 4148static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, 4149 unsigned int size) --- 10 unchanged lines hidden (view full) --- 4160 4161 if (!sk_rmem_schedule(sk, skb, size)) 4162 return -1; 4163 } 4164 } 4165 return 0; 4166} 4167 |
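The tcp_ofo_queue() loop above drains the out-of-order queue in order: a segment starting beyond rcv_nxt marks a hole and stops the loop, fully duplicate segments are dropped, and everything else is delivered (coalesced onto the receive-queue tail when possible) while rcv_nxt advances. A compact model of that drain over a sorted list of ranges, ignoring DSACK, FIN and sequence wraparound:

#include <stdio.h>

struct range { unsigned int seq, end_seq; };

/* Deliver queued out-of-order ranges while they touch rcv_nxt.
 * Fully duplicate ranges are skipped; overlapping ones simply advance
 * rcv_nxt to their end, mirroring the in-order drain of the real queue.
 */
static unsigned int drain_ofo(unsigned int rcv_nxt,
                              const struct range *q, unsigned int n)
{
        for (unsigned int i = 0; i < n; i++) {
                if (q[i].seq > rcv_nxt)
                        break;                     /* hole: stop delivering */
                if (q[i].end_seq <= rcv_nxt) {
                        printf("drop dup  [%u,%u)\n", q[i].seq, q[i].end_seq);
                        continue;                  /* already received */
                }
                printf("deliver   [%u,%u)\n", q[i].seq, q[i].end_seq);
                rcv_nxt = q[i].end_seq;
        }
        return rcv_nxt;
}

int main(void)
{
        const struct range q[] = {
                { 1000, 1100 }, { 1050, 1200 }, { 1300, 1400 },
        };

        printf("rcv_nxt now %u\n", drain_ofo(1000, q, 3));
        return 0;
}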
4123/** 4124 * tcp_try_coalesce - try to merge skb to prior one 4125 * @sk: socket 4126 * @to: prior buffer 4127 * @from: buffer to add in queue 4128 * @fragstolen: pointer to boolean 4129 * 4130 * Before queueing skb @from after @to, try to merge them 4131 * to reduce overall memory use and queue lengths, if cost is small. 4132 * Packets in ofo or receive queues can stay a long time. 4133 * Better try to coalesce them right now to avoid future collapses. 4134 * Returns true if caller should free @from instead of queueing it 4135 */ 4136static bool tcp_try_coalesce(struct sock *sk, 4137 struct sk_buff *to, 4138 struct sk_buff *from, 4139 bool *fragstolen) 4140{ 4141 int delta; 4142 4143 *fragstolen = false; 4144 4145 if (tcp_hdr(from)->fin) 4146 return false; 4147 4148 /* Its possible this segment overlaps with prior segment in queue */ 4149 if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) 4150 return false; 4151 4152 if (!skb_try_coalesce(to, from, fragstolen, &delta)) 4153 return false; 4154 4155 atomic_add(delta, &sk->sk_rmem_alloc); 4156 sk_mem_charge(sk, delta); 4157 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); 4158 TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; 4159 TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; 4160 return true; 4161} 4162 | |
4163static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) 4164{ 4165 struct tcp_sock *tp = tcp_sk(sk); 4166 struct sk_buff *skb1; 4167 u32 seq, end_seq; 4168 4169 TCP_ECN_check_ce(tp, skb); 4170 --- 129 unchanged lines hidden (view full) --- 4300 __skb_queue_tail(&sk->sk_receive_queue, skb); 4301 skb_set_owner_r(skb, sk); 4302 } 4303 return eaten; 4304} 4305 4306int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) 4307{ | 4168static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) 4169{ 4170 struct tcp_sock *tp = tcp_sk(sk); 4171 struct sk_buff *skb1; 4172 u32 seq, end_seq; 4173 4174 TCP_ECN_check_ce(tp, skb); 4175 --- 129 unchanged lines hidden (view full) --- 4305 __skb_queue_tail(&sk->sk_receive_queue, skb); 4306 skb_set_owner_r(skb, sk); 4307 } 4308 return eaten; 4309} 4310 4311int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) 4312{ |
4308 struct sk_buff *skb = NULL; 4309 struct tcphdr *th; | 4313 struct sk_buff *skb; |
4310 bool fragstolen; 4311 4312 if (size == 0) 4313 return 0; 4314 | 4314 bool fragstolen; 4315 4316 if (size == 0) 4317 return 0; 4318 |
4315 skb = alloc_skb(size + sizeof(*th), sk->sk_allocation); | 4319 skb = alloc_skb(size, sk->sk_allocation); |
4316 if (!skb) 4317 goto err; 4318 | 4320 if (!skb) 4321 goto err; 4322 |
4319 if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th))) | 4323 if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) |
4320 goto err_free; 4321 | 4324 goto err_free; 4325 |
4322 th = (struct tcphdr *)skb_put(skb, sizeof(*th)); 4323 skb_reset_transport_header(skb); 4324 memset(th, 0, sizeof(*th)); 4325 | |
4326 if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size)) 4327 goto err_free; 4328 4329 TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt; 4330 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size; 4331 TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1; 4332 | 4326 if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size)) 4327 goto err_free; 4328 4329 TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt; 4330 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size; 4331 TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1; 4332 |
4333 if (tcp_queue_rcv(sk, skb, sizeof(*th), &fragstolen)) { | 4333 if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) { |
4334 WARN_ON_ONCE(fragstolen); /* should not happen */ 4335 __kfree_skb(skb); 4336 } 4337 return size; 4338 4339err_free: 4340 kfree_skb(skb); 4341err: --- 71 unchanged lines hidden (view full) --- 4413 if (tp->rx_opt.num_sacks) 4414 tcp_sack_remove(tp); 4415 4416 tcp_fast_path_check(sk); 4417 4418 if (eaten > 0) 4419 kfree_skb_partial(skb, fragstolen); 4420 if (!sock_flag(sk, SOCK_DEAD)) | 4334 WARN_ON_ONCE(fragstolen); /* should not happen */ 4335 __kfree_skb(skb); 4336 } 4337 return size; 4338 4339err_free: 4340 kfree_skb(skb); 4341err: --- 71 unchanged lines hidden (view full) --- 4413 if (tp->rx_opt.num_sacks) 4414 tcp_sack_remove(tp); 4415 4416 tcp_fast_path_check(sk); 4417 4418 if (eaten > 0) 4419 kfree_skb_partial(skb, fragstolen); 4420 if (!sock_flag(sk, SOCK_DEAD)) |
4421 sk->sk_data_ready(sk, 0); | 4421 sk->sk_data_ready(sk); |
4422 return; 4423 } 4424 4425 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { 4426 /* A retransmit, 2nd most common case. Force an immediate ack. */ 4427 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); 4428 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); 4429 --- 77 unchanged lines hidden (view full) --- 4507 goto restart; 4508 } 4509 4510 /* The first skb to collapse is: 4511 * - not SYN/FIN and 4512 * - bloated or contains data before "start" or 4513 * overlaps to the next one. 4514 */ | 4422 return; 4423 } 4424 4425 if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { 4426 /* A retransmit, 2nd most common case. Force an immediate ack. */ 4427 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); 4428 tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); 4429 --- 77 unchanged lines hidden (view full) --- 4507 goto restart; 4508 } 4509 4510 /* The first skb to collapse is: 4511 * - not SYN/FIN and 4512 * - bloated or contains data before "start" or 4513 * overlaps to the next one. 4514 */ |
4515 if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin && | 4515 if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) && |
4516 (tcp_win_from_space(skb->truesize) > skb->len || 4517 before(TCP_SKB_CB(skb)->seq, start))) { 4518 end_of_skbs = false; 4519 break; 4520 } 4521 4522 if (!skb_queue_is_last(list, skb)) { 4523 struct sk_buff *next = skb_queue_next(list, skb); 4524 if (next != tail && 4525 TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) { 4526 end_of_skbs = false; 4527 break; 4528 } 4529 } 4530 4531 /* Decided to skip this, advance start seq. */ 4532 start = TCP_SKB_CB(skb)->end_seq; 4533 } | 4516 (tcp_win_from_space(skb->truesize) > skb->len || 4517 before(TCP_SKB_CB(skb)->seq, start))) { 4518 end_of_skbs = false; 4519 break; 4520 } 4521 4522 if (!skb_queue_is_last(list, skb)) { 4523 struct sk_buff *next = skb_queue_next(list, skb); 4524 if (next != tail && 4525 TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) { 4526 end_of_skbs = false; 4527 break; 4528 } 4529 } 4530 4531 /* Decided to skip this, advance start seq. */ 4532 start = TCP_SKB_CB(skb)->end_seq; 4533 } |
4534 if (end_of_skbs || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin) | 4534 if (end_of_skbs || 4535 (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) |
4535 return; 4536 4537 while (before(start, end)) { | 4536 return; 4537 4538 while (before(start, end)) { |
4539 int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start); |
|
4538 struct sk_buff *nskb; | 4540 struct sk_buff *nskb; |
4539 unsigned int header = skb_headroom(skb); 4540 int copy = SKB_MAX_ORDER(header, 0); | |
4541 | 4541 |
4542 /* Too big header? This can happen with IPv6. */ 4543 if (copy < 0) 4544 return; 4545 if (end - start < copy) 4546 copy = end - start; 4547 nskb = alloc_skb(copy + header, GFP_ATOMIC); | 4542 nskb = alloc_skb(copy, GFP_ATOMIC); |
4548 if (!nskb) 4549 return; 4550 | 4543 if (!nskb) 4544 return; 4545 |
4551 skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head); 4552 skb_set_network_header(nskb, (skb_network_header(skb) - 4553 skb->head)); 4554 skb_set_transport_header(nskb, (skb_transport_header(skb) - 4555 skb->head)); 4556 skb_reserve(nskb, header); 4557 memcpy(nskb->head, skb->head, header); | |
4558 memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); 4559 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; 4560 __skb_queue_before(list, skb, nskb); 4561 skb_set_owner_r(nskb, sk); 4562 4563 /* Copy data, releasing collapsed skbs. */ 4564 while (copy > 0) { 4565 int offset = start - TCP_SKB_CB(skb)->seq; --- 7 unchanged lines hidden (view full) --- 4573 TCP_SKB_CB(nskb)->end_seq += size; 4574 copy -= size; 4575 start += size; 4576 } 4577 if (!before(start, TCP_SKB_CB(skb)->end_seq)) { 4578 skb = tcp_collapse_one(sk, skb, list); 4579 if (!skb || 4580 skb == tail || | 4546 memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); 4547 TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; 4548 __skb_queue_before(list, skb, nskb); 4549 skb_set_owner_r(nskb, sk); 4550 4551 /* Copy data, releasing collapsed skbs. */ 4552 while (copy > 0) { 4553 int offset = start - TCP_SKB_CB(skb)->seq; --- 7 unchanged lines hidden (view full) --- 4561 TCP_SKB_CB(nskb)->end_seq += size; 4562 copy -= size; 4563 start += size; 4564 } 4565 if (!before(start, TCP_SKB_CB(skb)->end_seq)) { 4566 skb = tcp_collapse_one(sk, skb, list); 4567 if (!skb || 4568 skb == tail || |
4581 tcp_hdr(skb)->syn || 4582 tcp_hdr(skb)->fin) | 4569 (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) |
4583 return; 4584 } 4585 } 4586 } 4587} 4588 4589/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs 4590 * and tcp_collapse() them until all the queue is collapsed. --- 112 unchanged lines hidden (view full) --- 4703 */ 4704 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_RCVPRUNED); 4705 4706 /* Massive buffer overcommit. */ 4707 tp->pred_flags = 0; 4708 return -1; 4709} 4710 | 4570 return; 4571 } 4572 } 4573 } 4574} 4575 4576/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs 4577 * and tcp_collapse() them until all the queue is collapsed. --- 112 unchanged lines hidden (view full) --- 4690 */ 4691 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_RCVPRUNED); 4692 4693 /* Massive buffer overcommit. */ 4694 tp->pred_flags = 0; 4695 return -1; 4696} 4697 |
4711/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. 4712 * As additional protections, we do not touch cwnd in retransmission phases, 4713 * and if application hit its sndbuf limit recently. 4714 */ 4715void tcp_cwnd_application_limited(struct sock *sk) 4716{ 4717 struct tcp_sock *tp = tcp_sk(sk); 4718 4719 if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open && 4720 sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { 4721 /* Limited by application or receiver window. */ 4722 u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk)); 4723 u32 win_used = max(tp->snd_cwnd_used, init_win); 4724 if (win_used < tp->snd_cwnd) { 4725 tp->snd_ssthresh = tcp_current_ssthresh(sk); 4726 tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; 4727 } 4728 tp->snd_cwnd_used = 0; 4729 } 4730 tp->snd_cwnd_stamp = tcp_time_stamp; 4731} 4732 | |
4733static bool tcp_should_expand_sndbuf(const struct sock *sk) 4734{ 4735 const struct tcp_sock *tp = tcp_sk(sk); 4736 4737 /* If the user specified a specific send buffer setting, do 4738 * not modify it. 4739 */ 4740 if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) --- 173 unchanged lines hidden (view full) --- 4914 4915 /* Is the urgent pointer pointing into this packet? */ 4916 if (ptr < skb->len) { 4917 u8 tmp; 4918 if (skb_copy_bits(skb, ptr, &tmp, 1)) 4919 BUG(); 4920 tp->urg_data = TCP_URG_VALID | tmp; 4921 if (!sock_flag(sk, SOCK_DEAD)) | 4698static bool tcp_should_expand_sndbuf(const struct sock *sk) 4699{ 4700 const struct tcp_sock *tp = tcp_sk(sk); 4701 4702 /* If the user specified a specific send buffer setting, do 4703 * not modify it. 4704 */ 4705 if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) --- 173 unchanged lines hidden (view full) --- 4879 4880 /* Is the urgent pointer pointing into this packet? */ 4881 if (ptr < skb->len) { 4882 u8 tmp; 4883 if (skb_copy_bits(skb, ptr, &tmp, 1)) 4884 BUG(); 4885 tp->urg_data = TCP_URG_VALID | tmp; 4886 if (!sock_flag(sk, SOCK_DEAD)) |
4922 sk->sk_data_ready(sk, 0); | 4887 sk->sk_data_ready(sk); |
4923 } 4924 } 4925} 4926 4927static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) 4928{ 4929 struct tcp_sock *tp = tcp_sk(sk); 4930 int chunk = skb->len - hlen; --- 33 unchanged lines hidden (view full) --- 4964 4965static inline bool tcp_checksum_complete_user(struct sock *sk, 4966 struct sk_buff *skb) 4967{ 4968 return !skb_csum_unnecessary(skb) && 4969 __tcp_checksum_complete_user(sk, skb); 4970} 4971 | 4888 } 4889 } 4890} 4891 4892static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) 4893{ 4894 struct tcp_sock *tp = tcp_sk(sk); 4895 int chunk = skb->len - hlen; --- 33 unchanged lines hidden (view full) --- 4929 4930static inline bool tcp_checksum_complete_user(struct sock *sk, 4931 struct sk_buff *skb) 4932{ 4933 return !skb_csum_unnecessary(skb) && 4934 __tcp_checksum_complete_user(sk, skb); 4935} 4936 |
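tcp_checksum_complete_user() above verifies the TCP checksum in software when the NIC has not already vouched for it. For reference, a self-contained sketch of the 16-bit one's-complement sum that such verification is built on (simplified: no pseudo-header, no scatter-gather, hypothetical names):

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

/* RFC 1071-style 16-bit one's-complement sum over a byte buffer.
 * Real TCP checksumming also covers a pseudo-header; that is omitted here.
 */
static uint16_t csum16(const uint8_t *data, size_t len)
{
        uint32_t sum = 0;

        while (len > 1) {
                sum += (uint32_t)data[0] << 8 | data[1];
                data += 2;
                len -= 2;
        }
        if (len)                                /* odd trailing byte */
                sum += (uint32_t)data[0] << 8;
        while (sum >> 16)                       /* fold the carries back in */
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}

int main(void)
{
        const uint8_t seg[] = { 0x45, 0x00, 0x00, 0x34, 0x12, 0x34 };

        printf("checksum = 0x%04x\n", csum16(seg, sizeof(seg)));
        return 0;
}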
4937#ifdef CONFIG_NET_DMA 4938static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, 4939 int hlen) 4940{ 4941 struct tcp_sock *tp = tcp_sk(sk); 4942 int chunk = skb->len - hlen; 4943 int dma_cookie; 4944 bool copied_early = false; 4945 4946 if (tp->ucopy.wakeup) 4947 return false; 4948 4949 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) 4950 tp->ucopy.dma_chan = net_dma_find_channel(); 4951 4952 if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) { 4953 4954 dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan, 4955 skb, hlen, 4956 tp->ucopy.iov, chunk, 4957 tp->ucopy.pinned_list); 4958 4959 if (dma_cookie < 0) 4960 goto out; 4961 4962 tp->ucopy.dma_cookie = dma_cookie; 4963 copied_early = true; 4964 4965 tp->ucopy.len -= chunk; 4966 tp->copied_seq += chunk; 4967 tcp_rcv_space_adjust(sk); 4968 4969 if ((tp->ucopy.len == 0) || 4970 (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) || 4971 (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) { 4972 tp->ucopy.wakeup = 1; 4973 sk->sk_data_ready(sk); 4974 } 4975 } else if (chunk > 0) { 4976 tp->ucopy.wakeup = 1; 4977 sk->sk_data_ready(sk); 4978 } 4979out: 4980 return copied_early; 4981} 4982#endif /* CONFIG_NET_DMA */ 4983 |
|
4972/* Does PAWS and seqno based validation of an incoming segment, flags will 4973 * play significant role here. 4974 */ 4975static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, 4976 const struct tcphdr *th, int syn_inerr) 4977{ 4978 struct tcp_sock *tp = tcp_sk(sk); 4979 --- 163 unchanged lines hidden (view full) --- 5143 tcp_data_snd_check(sk); 5144 return; 5145 } else { /* Header too small */ 5146 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); 5147 goto discard; 5148 } 5149 } else { 5150 int eaten = 0; | 4984/* Does PAWS and seqno based validation of an incoming segment, flags will 4985 * play significant role here. 4986 */ 4987static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, 4988 const struct tcphdr *th, int syn_inerr) 4989{ 4990 struct tcp_sock *tp = tcp_sk(sk); 4991 --- 163 unchanged lines hidden (view full) --- 5155 tcp_data_snd_check(sk); 5156 return; 5157 } else { /* Header too small */ 5158 TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); 5159 goto discard; 5160 } 5161 } else { 5162 int eaten = 0; |
5163 int copied_early = 0; |
|
5151 bool fragstolen = false; 5152 | 5164 bool fragstolen = false; 5165 |
5153 if (tp->ucopy.task == current && 5154 tp->copied_seq == tp->rcv_nxt && 5155 len - tcp_header_len <= tp->ucopy.len && 5156 sock_owned_by_user(sk)) { 5157 __set_current_state(TASK_RUNNING); | 5166 if (tp->copied_seq == tp->rcv_nxt && 5167 len - tcp_header_len <= tp->ucopy.len) { 5168#ifdef CONFIG_NET_DMA 5169 if (tp->ucopy.task == current && 5170 sock_owned_by_user(sk) && 5171 tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { 5172 copied_early = 1; 5173 eaten = 1; 5174 } 5175#endif 5176 if (tp->ucopy.task == current && 5177 sock_owned_by_user(sk) && !copied_early) { 5178 __set_current_state(TASK_RUNNING); |
5158 | 5179 |
5159 if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) { | 5180 if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) 5181 eaten = 1; 5182 } 5183 if (eaten) { |
5160 /* Predicted packet is in window by definition. 5161 * seq == rcv_nxt and rcv_wup <= rcv_nxt. 5162 * Hence, check seq<=rcv_wup reduces to: 5163 */ 5164 if (tcp_header_len == 5165 (sizeof(struct tcphdr) + 5166 TCPOLEN_TSTAMP_ALIGNED) && 5167 tp->rcv_nxt == tp->rcv_wup) 5168 tcp_store_ts_recent(tp); 5169 5170 tcp_rcv_rtt_measure_ts(sk, skb); 5171 5172 __skb_pull(skb, tcp_header_len); 5173 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; 5174 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER); | 5184 /* Predicted packet is in window by definition. 5185 * seq == rcv_nxt and rcv_wup <= rcv_nxt. 5186 * Hence, check seq<=rcv_wup reduces to: 5187 */ 5188 if (tcp_header_len == 5189 (sizeof(struct tcphdr) + 5190 TCPOLEN_TSTAMP_ALIGNED) && 5191 tp->rcv_nxt == tp->rcv_wup) 5192 tcp_store_ts_recent(tp); 5193 5194 tcp_rcv_rtt_measure_ts(sk, skb); 5195 5196 __skb_pull(skb, tcp_header_len); 5197 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; 5198 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER); |
5175 eaten = 1; | |
5176 } | 5199 } |
5200 if (copied_early) 5201 tcp_cleanup_rbuf(sk, skb->len); |
|
5177 } 5178 if (!eaten) { 5179 if (tcp_checksum_complete_user(sk, skb)) 5180 goto csum_error; 5181 5182 if ((int)skb->truesize > sk->sk_forward_alloc) 5183 goto step5; 5184 --- 20 unchanged lines hidden (view full) --- 5205 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { 5206 /* Well, only one small jumplet in fast path... */ 5207 tcp_ack(sk, skb, FLAG_DATA); 5208 tcp_data_snd_check(sk); 5209 if (!inet_csk_ack_scheduled(sk)) 5210 goto no_ack; 5211 } 5212 | 5202 } 5203 if (!eaten) { 5204 if (tcp_checksum_complete_user(sk, skb)) 5205 goto csum_error; 5206 5207 if ((int)skb->truesize > sk->sk_forward_alloc) 5208 goto step5; 5209 --- 20 unchanged lines hidden (view full) --- 5230 if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { 5231 /* Well, only one small jumplet in fast path... */ 5232 tcp_ack(sk, skb, FLAG_DATA); 5233 tcp_data_snd_check(sk); 5234 if (!inet_csk_ack_scheduled(sk)) 5235 goto no_ack; 5236 } 5237 |
5213 __tcp_ack_snd_check(sk, 0); | 5238 if (!copied_early || tp->rcv_nxt != tp->rcv_wup) 5239 __tcp_ack_snd_check(sk, 0); |
5214no_ack: | 5240no_ack: |
5241#ifdef CONFIG_NET_DMA 5242 if (copied_early) 5243 __skb_queue_tail(&sk->sk_async_wait_queue, skb); 5244 else 5245#endif |
|
5215 if (eaten) 5216 kfree_skb_partial(skb, fragstolen); | 5246 if (eaten) 5247 kfree_skb_partial(skb, fragstolen); |
5217 sk->sk_data_ready(sk, 0); | 5248 sk->sk_data_ready(sk); |
5218 return; 5219 } 5220 } 5221 5222slow_path: 5223 if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb)) 5224 goto csum_error; 5225 --- 103 unchanged lines hidden (view full) --- 5329 5330 if (data) { /* Retransmit unacked data in SYN */ 5331 tcp_for_write_queue_from(data, sk) { 5332 if (data == tcp_send_head(sk) || 5333 __tcp_retransmit_skb(sk, data)) 5334 break; 5335 } 5336 tcp_rearm_rto(sk); | 5249 return; 5250 } 5251 } 5252 5253slow_path: 5254 if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb)) 5255 goto csum_error; 5256 --- 103 unchanged lines hidden (view full) --- 5360 5361 if (data) { /* Retransmit unacked data in SYN */ 5362 tcp_for_write_queue_from(data, sk) { 5363 if (data == tcp_send_head(sk) || 5364 __tcp_retransmit_skb(sk, data)) 5365 break; 5366 } 5367 tcp_rearm_rto(sk); |
5368 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); |
|
5337 return true; 5338 } 5339 tp->syn_data_acked = tp->syn_data; | 5369 return true; 5370 } 5371 tp->syn_data_acked = tp->syn_data; |
5372 if (tp->syn_data_acked) 5373 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); |
|
5340 return false; 5341} 5342 5343static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, 5344 const struct tcphdr *th, unsigned int len) 5345{ 5346 struct inet_connection_sock *icsk = inet_csk(sk); 5347 struct tcp_sock *tp = tcp_sk(sk); --- 483 unchanged lines hidden (view full) --- 5831 5832 if (!queued) { 5833discard: 5834 __kfree_skb(skb); 5835 } 5836 return 0; 5837} 5838EXPORT_SYMBOL(tcp_rcv_state_process); | 5374 return false; 5375} 5376 5377static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, 5378 const struct tcphdr *th, unsigned int len) 5379{ 5380 struct inet_connection_sock *icsk = inet_csk(sk); 5381 struct tcp_sock *tp = tcp_sk(sk); --- 483 unchanged lines hidden (view full) --- 5865 5866 if (!queued) { 5867discard: 5868 __kfree_skb(skb); 5869 } 5870 return 0; 5871} 5872EXPORT_SYMBOL(tcp_rcv_state_process); |
 5873 5874static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) 5875{ 5876 struct inet_request_sock *ireq = inet_rsk(req); 5877 5878 if (family == AF_INET) 5879 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"), 5880 &ireq->ir_rmt_addr, port); 5881#if IS_ENABLED(CONFIG_IPV6) 5882 else if (family == AF_INET6) 5883 LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI6/%u\n"), 5884 &ireq->ir_v6_rmt_addr, port); 5885#endif 5886} 5887 5888int tcp_conn_request(struct request_sock_ops *rsk_ops, 5889 const struct tcp_request_sock_ops *af_ops, 5890 struct sock *sk, struct sk_buff *skb) 5891{ 5892 struct tcp_options_received tmp_opt; 5893 struct request_sock *req; 5894 struct tcp_sock *tp = tcp_sk(sk); 5895 struct dst_entry *dst = NULL; 5896 __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; 5897 bool want_cookie = false, fastopen; 5898 struct flowi fl; 5899 struct tcp_fastopen_cookie foc = { .len = -1 }; 5900 int err; 5901 5902 5903 /* TW buckets are converted to open requests without 5904 * limitations, they conserve resources and peer is 5905 * evidently real one. 5906 */ 5907 if ((sysctl_tcp_syncookies == 2 || 5908 inet_csk_reqsk_queue_is_full(sk)) && !isn) { 5909 want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name); 5910 if (!want_cookie) 5911 goto drop; 5912 } 5913 5914 5915 /* Accept backlog is full. If we have already queued enough 5916 * of warm entries in syn queue, drop request. It is better than 5917 * clogging syn queue with openreqs with exponentially increasing 5918 * timeout. 5919 */ 5920 if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) { 5921 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); 5922 goto drop; 5923 } 5924 5925 req = inet_reqsk_alloc(rsk_ops); 5926 if (!req) 5927 goto drop; 5928 5929 tcp_rsk(req)->af_specific = af_ops; 5930 5931 tcp_clear_options(&tmp_opt); 5932 tmp_opt.mss_clamp = af_ops->mss_clamp; 5933 tmp_opt.user_mss = tp->rx_opt.user_mss; 5934 tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc); 5935 5936 if (want_cookie && !tmp_opt.saw_tstamp) 5937 tcp_clear_options(&tmp_opt); 5938 5939 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; 5940 tcp_openreq_init(req, &tmp_opt, skb, sk); 5941 5942 af_ops->init_req(req, sk, skb); 5943 5944 if (security_inet_conn_request(sk, skb, req)) 5945 goto drop_and_free; 5946 5947 if (!want_cookie || tmp_opt.tstamp_ok) 5948 TCP_ECN_create_request(req, skb, sock_net(sk)); 5949 5950 if (want_cookie) { 5951 isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); 5952 req->cookie_ts = tmp_opt.tstamp_ok; 5953 } else if (!isn) { 5954 /* VJ's idea. We save last timestamp seen 5955 * from the destination in peer table, when entering 5956 * state TIME-WAIT, and check against it before 5957 * accepting new connection request. 5958 * 5959 * If "isn" is not zero, this request hit alive 5960 * timewait bucket, so that all the necessary checks 5961 * are made in the function processing timewait state. 5962 */ 5963 if (tcp_death_row.sysctl_tw_recycle) { 5964 bool strict; 5965 5966 dst = af_ops->route_req(sk, &fl, req, &strict); 5967 5968 if (dst && strict && 5969 !tcp_peer_is_proven(req, dst, true, 5970 tmp_opt.saw_tstamp)) { 5971 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); 5972 goto drop_and_release; 5973 } 5974 } 5975 /* Kill the following clause, if you dislike this way. */ 5976 else if (!sysctl_tcp_syncookies && 5977 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < 5978 (sysctl_max_syn_backlog >> 2)) && 5979 !tcp_peer_is_proven(req, dst, false, 5980 tmp_opt.saw_tstamp)) { 5981 /* Without syncookies last quarter of 5982 * backlog is filled with destinations, 5983 * proven to be alive. 5984 * It means that we continue to communicate 5985 * to destinations, already remembered 5986 * to the moment of synflood. 5987 */ 5988 pr_drop_req(req, ntohs(tcp_hdr(skb)->source), 5989 rsk_ops->family); 5990 goto drop_and_release; 5991 } 5992 5993 isn = af_ops->init_seq(skb); 5994 } 5995 if (!dst) { 5996 dst = af_ops->route_req(sk, &fl, req, NULL); 5997 if (!dst) 5998 goto drop_and_free; 5999 } 6000 6001 tcp_rsk(req)->snt_isn = isn; 6002 tcp_openreq_init_rwin(req, sk, dst); 6003 fastopen = !want_cookie && 6004 tcp_try_fastopen(sk, skb, req, &foc, dst); 6005 err = af_ops->send_synack(sk, dst, &fl, req, 6006 skb_get_queue_mapping(skb), &foc); 6007 if (!fastopen) { 6008 if (err || want_cookie) 6009 goto drop_and_free; 6010 6011 tcp_rsk(req)->listener = NULL; 6012 af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT); 6013 } 6014 6015 return 0; 6016 6017drop_and_release: 6018 dst_release(dst); 6019drop_and_free: 6020 reqsk_free(req); 6021drop: 6022 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); 6023 return 0; 6024} 6025EXPORT_SYMBOL(tcp_conn_request);
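tcp_conn_request() above consolidates the IPv4/IPv6 SYN handling: fall back to SYN cookies when the SYN queue is under pressure (or unconditionally when sysctl_tcp_syncookies == 2), and drop when the accept backlog is full while young requests are still unanswered. A toy decision ladder with the same shape; the thresholds, names and return values are invented and no real socket state is consulted.

#include <stdio.h>

enum syn_action { SYN_QUEUE, SYN_COOKIE, SYN_DROP };

/* Toy version of the admission ladder at the top of a SYN handler: use
 * cookies when the SYN queue is under pressure (or cookies are forced on),
 * drop when cookies are disabled under pressure or when the accept queue is
 * full while young, unanswered requests pile up.  Purely illustrative.
 */
static enum syn_action classify_syn(int syncookies_mode, int syn_queue_full,
                                    int accept_queue_full, int young_reqs)
{
        int want_cookie = syncookies_mode == 2 || syn_queue_full;

        if (want_cookie && !syncookies_mode)
                return SYN_DROP;              /* under pressure, cookies disabled */
        if (accept_queue_full && young_reqs > 1)
                return SYN_DROP;              /* accept backlog already clogged */
        return want_cookie ? SYN_COOKIE : SYN_QUEUE;
}

int main(void)
{
        printf("%d\n", classify_syn(1, 1, 0, 0));   /* 1: cookie under SYN flood */
        printf("%d\n", classify_syn(1, 0, 1, 5));   /* 2: drop, accept queue full */
        printf("%d\n", classify_syn(1, 0, 0, 0));   /* 0: normal queueing path */
        return 0;
}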
|