xref: /linux/net/ipv4/tcp_recovery.c (revision 995231c820e3bd3633cb38bf4ea6f2541e1da331)
#include <linux/tcp.h>
#include <net/tcp.h>

static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_skb_mark_lost_uncond_verify(tp, skb);
	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
		/* Account for retransmits that are lost again */
		TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
		tp->retrans_out -= tcp_skb_pcount(skb);
		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
			      tcp_skb_pcount(skb));
	}
}

static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
{
	return t1 > t2 || (t1 == t2 && after(seq1, seq2));
}
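
/* Illustrative sketch, not part of the upstream file: tcp_rack_sent_after()
 * orders transmissions in the time domain and only falls back to sequence
 * numbers to break a timestamp tie. The hypothetical helper below walks the
 * two cases with made-up timestamps and sequence numbers; its name and the
 * literal values are assumptions for illustration only.
 */
static void __maybe_unused tcp_rack_sent_after_example(void)
{
	/* Equal send timestamps: the skb with the higher end_seq is treated
	 * as "sent after", so ordering stays well defined within one tick.
	 */
	WARN_ON(!tcp_rack_sent_after(1000, 1000, 2000, 1000));

	/* A strictly later send time wins even if it carries an older
	 * sequence, which is how RACK ranks a retransmission of old data
	 * ahead of later original transmissions.
	 */
	WARN_ON(!tcp_rack_sent_after(2000, 1000, 1000, 2000));
}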

/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
 *
 * Marks a packet lost if some packet sent later has been (s)acked.
 * The underlying idea is similar to the traditional dupthresh and FACK
 * but they look at different metrics:
 *
 * dupthresh: 3 OOO packets delivered (packet count)
 * FACK: sequence delta to highest sacked sequence (sequence space)
 * RACK: sent time delta to the latest delivered packet (time domain)
 *
 * The advantage of RACK is that it applies to both original and
 * retransmitted packets and therefore is robust against tail losses.
 * Another advantage is being more resilient to reordering by simply
 * allowing some "settling delay", instead of tweaking the dupthresh.
 *
 * When tcp_rack_detect_loss() detects some packets are lost and we
 * are not already in the CA_Recovery state, either tcp_rack_reo_timeout()
 * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will
 * make us enter the CA_Recovery state.
 */
static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb, *n;
	u32 reo_wnd;

	*reo_timeout = 0;
	/* To be more reordering resilient, allow min_rtt/4 settling delay
	 * (lower-bounded to 1000us). We use min_rtt instead of the smoothed
	 * RTT because reordering is often a path property and less related
	 * to queuing or delayed ACKs.
	 */
	reo_wnd = 1000;
	if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U)
		reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);

	list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue,
				 tcp_tsorted_anchor) {
		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
		s32 remaining;

		/* Skip ones marked lost but not yet retransmitted */
		if ((scb->sacked & TCPCB_LOST) &&
		    !(scb->sacked & TCPCB_SACKED_RETRANS))
			continue;

		if (!tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp,
					 tp->rack.end_seq, scb->end_seq))
			break;

		/* A packet is lost if it has not been (s)acked beyond
		 * the recent RTT plus the reordering window.
		 */
		remaining = tp->rack.rtt_us + reo_wnd -
			    tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
		if (remaining < 0) {
			tcp_rack_mark_skb_lost(sk, skb);
			list_del_init(&skb->tcp_tsorted_anchor);
		} else {
			/* Record maximum wait time (+1 to avoid 0) */
			*reo_timeout = max_t(u32, *reo_timeout, 1 + remaining);
		}
	}
}
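
/* Illustrative sketch, not part of the upstream file: per skb, the loss
 * decision above reduces to one signed comparison. Assuming a hypothetical
 * path with min_rtt = 40000us, the reordering window is
 * max(40000 / 4, 1000) = 10000us, so a packet sent 45000us ago while
 * rack.rtt_us = 40000us has 40000 + 10000 - 45000 = 5000us left and arms
 * the reordering timer instead of being marked lost. The helper below
 * restates that arithmetic without the rack.reord/lost_out gating; its
 * name and parameters are assumptions for illustration only.
 */
static s32 __maybe_unused tcp_rack_remaining_example(u32 rack_rtt_us,
						     u32 min_rtt_us,
						     u32 elapsed_us)
{
	/* min_rtt/4 settling delay, lower-bounded to 1000us as above */
	u32 reo_wnd = max_t(u32, min_rtt_us >> 2, 1000U);

	/* Negative: the skb has outlived one RTT plus the reordering
	 * window and should be marked lost. Positive: how much longer
	 * the ICSK_TIME_REO_TIMEOUT timer still has to wait for it.
	 */
	return (s32)(rack_rtt_us + reo_wnd - elapsed_us);
}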

void tcp_rack_mark_lost(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 timeout;

	if (!tp->rack.advanced)
		return;

	/* Reset the advanced flag to avoid unnecessary queue scanning */
	tp->rack.advanced = 0;
	tcp_rack_detect_loss(sk, &timeout);
	if (timeout) {
		timeout = usecs_to_jiffies(timeout) + TCP_TIMEOUT_MIN;
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
					  timeout, inet_csk(sk)->icsk_rto);
	}
}

/* Record the most recently (re)sent time among the (s)acked packets.
 * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
 * draft-cheng-tcpm-rack-00.txt
 */
void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
		      u64 xmit_time)
{
	u32 rtt_us;

	if (tp->rack.mstamp &&
	    !tcp_rack_sent_after(xmit_time, tp->rack.mstamp,
				 end_seq, tp->rack.end_seq))
		return;

	rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time);
	if (sacked & TCPCB_RETRANS) {
		/* If the sacked packet was retransmitted, it's ambiguous
		 * whether the retransmission or the original (or the prior
		 * retransmission) was sacked.
		 *
		 * If the original is lost, there is no ambiguity. Otherwise
		 * we assume the original can be delayed up to aRTT + min_rtt.
		 * The aRTT term is bounded by the fast recovery or timeout,
		 * so it's at least one RTT (i.e., the retransmission is at
		 * least an RTT later).
		 */
		if (rtt_us < tcp_min_rtt(tp))
			return;
	}
	tp->rack.rtt_us = rtt_us;
	tp->rack.mstamp = xmit_time;
	tp->rack.end_seq = end_seq;
	tp->rack.advanced = 1;
}
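
/* Illustrative sketch, not part of the upstream file: the TCPCB_RETRANS
 * check above drops RTT samples that are too good to have come from the
 * retransmission. With a hypothetical min_rtt of 40000us, a retransmitted
 * skb that appears (s)acked only 3000us after its last (re)send time is
 * assumed to have been acked for an earlier transmission of the same data,
 * so RACK refuses to advance its state on it. The helper below restates
 * that filter; its name and parameters are assumptions for illustration
 * only.
 */
static bool __maybe_unused tcp_rack_rtt_sample_ok(u8 sacked, u32 rtt_us,
						  u32 min_rtt_us)
{
	/* Never-retransmitted skbs always yield an unambiguous sample.
	 * For retransmits, anything faster than the path's minimum RTT
	 * is treated as an ACK of an earlier transmission.
	 */
	if ((sacked & TCPCB_RETRANS) && rtt_us < min_rtt_us)
		return false;
	return true;
}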

/* We have waited long enough to accommodate reordering. Mark the expired
 * packets lost and retransmit them.
 */
void tcp_rack_reo_timeout(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 timeout, prior_inflight;

	prior_inflight = tcp_packets_in_flight(tp);
	tcp_rack_detect_loss(sk, &timeout);
	if (prior_inflight != tcp_packets_in_flight(tp)) {
		if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) {
			tcp_enter_recovery(sk, false);
			if (!inet_csk(sk)->icsk_ca_ops->cong_control)
				tcp_cwnd_reduction(sk, 1, 0);
		}
		tcp_xmit_retransmit_queue(sk);
	}
	if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS)
		tcp_rearm_rto(sk);
}
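
/* Illustrative sketch, not part of the upstream file: the prior_inflight
 * comparison above works because tcp_packets_in_flight() is
 * packets_out - (sacked_out + lost_out) + retrans_out, so every skb that
 * tcp_rack_detect_loss() newly marks lost (raising lost_out, and possibly
 * dropping retrans_out for a lost retransmit) shrinks it. The helper below
 * restates that bookkeeping with plain counters; its name and parameters
 * are assumptions for illustration only.
 */
static u32 __maybe_unused tcp_rack_inflight_example(u32 packets_out,
						    u32 sacked_out,
						    u32 lost_out,
						    u32 retrans_out)
{
	/* Mirrors tcp_packets_in_flight(): segments we still believe are
	 * in the network. A detection pass that finds new losses lowers
	 * this value, which is the signal tcp_rack_reo_timeout() uses to
	 * decide whether to enter recovery and retransmit.
	 */
	return packets_out - (sacked_out + lost_out) + retrans_out;
}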
162