/* xref: /linux/net/ipv4/tcp_recovery.c (revision b24413180f5600bcb3bb70fbed5cf186b60864bd) */
// SPDX-License-Identifier: GPL-2.0
#include <linux/tcp.h>
#include <net/tcp.h>

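/* net.ipv4.tcp_recovery sysctl: the TCP_RACK_LOSS_DETECTION flag enables
 * RACK-based loss detection by default.
 */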
int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOSS_DETECTION;

static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_skb_mark_lost_uncond_verify(tp, skb);
	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
		/* Account for retransmits that are lost again */
		TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
		tp->retrans_out -= tcp_skb_pcount(skb);
		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
			      tcp_skb_pcount(skb));
	}
}

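/* Returns true if the packet sent at t1 with end sequence seq1 was sent
 * after the packet sent at t2 with end sequence seq2; the sequence numbers
 * break ties between packets carrying the same send timestamp.
 */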
static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
{
	return t1 > t2 || (t1 == t2 && after(seq1, seq2));
}

/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
 *
 * Marks a packet lost if some packet sent later has been (s)acked.
 * The underlying idea is similar to the traditional dupthresh and FACK
 * but they look at different metrics:
 *
 * dupthresh: 3 OOO packets delivered (packet count)
 * FACK: sequence delta to highest sacked sequence (sequence space)
 * RACK: sent time delta to the latest delivered packet (time domain)
 *
 * The advantage of RACK is that it applies to both original and
 * retransmitted packets and is therefore robust against tail losses.
 * Another advantage is being more resilient to reordering by simply
 * allowing some "settling delay", instead of tweaking the dupthresh.
 *
 * When tcp_rack_detect_loss() detects some packets are lost and we
 * are not already in the CA_Recovery state, either tcp_rack_reo_timeout()
 * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will
 * make us enter the CA_Recovery state.
 */
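/* Illustrative example (numbers are hypothetical, not from the draft):
 * with min_rtt = 40ms the reordering window is min_rtt/4 = 10ms, so an
 * unacked packet P is marked lost once some packet sent after P has been
 * (s)acked and more than rack.rtt_us + 10ms have elapsed since P was sent.
 */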
static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	u32 reo_wnd;

	*reo_timeout = 0;
	/* To be more reordering resilient, allow min_rtt/4 settling delay
	 * (lower-bounded to 1000us). We use min_rtt instead of the smoothed
	 * RTT because reordering is often a path property and less related
	 * to queuing or delayed ACKs.
	 */
	reo_wnd = 1000;
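	/* Grow the window beyond the 1ms floor only when reordering has been
	 * observed (rack.reord) or no packets are currently marked lost, so
	 * that lost retransmissions are still detected promptly.
	 */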
	if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U)
		reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);

	tcp_for_write_queue(skb, sk) {
		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);

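		/* The write queue also holds not-yet-sent data; stop at the
		 * send head since nothing beyond it has been transmitted.
		 */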
		if (skb == tcp_send_head(sk))
			break;

		/* Skip ones already (s)acked */
		if (!after(scb->end_seq, tp->snd_una) ||
		    scb->sacked & TCPCB_SACKED_ACKED)
			continue;

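		/* Only packets sent before the most recently (s)acked one
		 * (rack.mstamp / rack.end_seq) are loss candidates.
		 */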
		if (tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp,
					tp->rack.end_seq, scb->end_seq)) {
			/* Step 3 in draft-cheng-tcpm-rack-00.txt:
			 * A packet is lost if its elapsed time is beyond
			 * the recent RTT plus the reordering window.
			 */
			u32 elapsed = tcp_stamp_us_delta(tp->tcp_mstamp,
							 skb->skb_mstamp);
			s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed;

			if (remaining < 0) {
				tcp_rack_mark_skb_lost(sk, skb);
				continue;
			}

			/* Skip ones marked lost but not yet retransmitted */
			if ((scb->sacked & TCPCB_LOST) &&
			    !(scb->sacked & TCPCB_SACKED_RETRANS))
				continue;

			/* Record maximum wait time (+1 to avoid 0) */
			*reo_timeout = max_t(u32, *reo_timeout, 1 + remaining);

		} else if (!(scb->sacked & TCPCB_RETRANS)) {
			/* Original data are sent sequentially so stop early
			 * b/c the rest are all sent after rack_sent
			 */
			break;
		}
	}
}

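/* Called on ACK processing when the RACK state has advanced: mark newly
 * detected losses, and if some packets still need time before they can be
 * declared lost, arm the reordering (REO) timer to re-check later.
 */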
void tcp_rack_mark_lost(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 timeout;

	if (!tp->rack.advanced)
		return;

	/* Reset the advanced flag to avoid unnecessary queue scanning */
	tp->rack.advanced = 0;
	tcp_rack_detect_loss(sk, &timeout);
	if (timeout) {
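		/* Pad the timeout with TCP_TIMEOUT_MIN so that jiffies
		 * granularity does not fire the REO timer too early.
		 */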
		timeout = usecs_to_jiffies(timeout) + TCP_TIMEOUT_MIN;
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
					  timeout, inet_csk(sk)->icsk_rto);
	}
}

/* Record the most recently (re)sent time among the (s)acked packets.
 * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
 * draft-cheng-tcpm-rack-00.txt
 */
void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
		      u64 xmit_time)
{
	u32 rtt_us;

	if (tp->rack.mstamp &&
	    !tcp_rack_sent_after(xmit_time, tp->rack.mstamp,
				 end_seq, tp->rack.end_seq))
		return;

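	/* RTT sample: time from the (re)transmission of the newly (s)acked
	 * packet until now.
	 */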
	rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time);
	if (sacked & TCPCB_RETRANS) {
		/* If the sacked packet was retransmitted, it's ambiguous
		 * whether the retransmission or the original (or the prior
		 * retransmission) was sacked.
		 *
		 * If the original is lost, there is no ambiguity. Otherwise
		 * we assume the original can be delayed up to aRTT + min_rtt.
		 * The aRTT term is bounded by the fast recovery or timeout,
		 * so it's at least one RTT (i.e., retransmission is at least
		 * an RTT later).
		 */
		if (rtt_us < tcp_min_rtt(tp))
			return;
	}
	tp->rack.rtt_us = rtt_us;
	tp->rack.mstamp = xmit_time;
	tp->rack.end_seq = end_seq;
	tp->rack.advanced = 1;
}

/* We have waited long enough to accommodate reordering. Mark the expired
 * packets lost and retransmit them.
 */
void tcp_rack_reo_timeout(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 timeout, prior_inflight;

	prior_inflight = tcp_packets_in_flight(tp);
	tcp_rack_detect_loss(sk, &timeout);
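	/* If packets in flight dropped, tcp_rack_detect_loss() just marked
	 * new packets lost: enter recovery if needed and retransmit them.
	 */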
	if (prior_inflight != tcp_packets_in_flight(tp)) {
		if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) {
			tcp_enter_recovery(sk, false);
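			/* Congestion controls that implement cong_control
			 * (e.g. BBR) manage cwnd themselves; otherwise start
			 * the PRR-style cwnd reduction.
			 */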
			if (!inet_csk(sk)->icsk_ca_ops->cong_control)
				tcp_cwnd_reduction(sk, 1, 0);
		}
		tcp_xmit_retransmit_queue(sk);
	}
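	/* Re-arm the RTO unless a retransmission timer is already pending. */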
	if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS)
		tcp_rearm_rto(sk);
}
179