xref: /linux/net/ipv4/tcp_recovery.c (revision 428aec5e69fa17d223e1495f395833c50770f7ae)
// SPDX-License-Identifier: GPL-2.0
#include <linux/tcp.h>
#include <net/tcp.h>

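/* Mark @skb lost and, if it carried a retransmission that is now lost
 * again, drop it from retrans_out and count it in
 * LINUX_MIB_TCPLOSTRETRANSMIT.
 */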
static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_skb_mark_lost_uncond_verify(tp, skb);
	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
		/* Account for retransmits that are lost again */
		TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
		tp->retrans_out -= tcp_skb_pcount(skb);
		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
			      tcp_skb_pcount(skb));
	}
}

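/* Returns true if (t1, seq1) was sent after (t2, seq2): a later send
 * timestamp wins, and the higher end sequence breaks timestamp ties.
 */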
static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
{
	return t1 > t2 || (t1 == t2 && after(seq1, seq2));
}

/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
 *
 * Marks a packet lost if some packet sent later has been (s)acked.
 * The underlying idea is similar to the traditional dupthresh and FACK,
 * but they look at different metrics:
 *
 * dupthresh: 3 OOO packets delivered (packet count)
 * FACK: sequence delta to the highest sacked sequence (sequence space)
 * RACK: sent time delta to the latest delivered packet (time domain)
 *
 * The advantage of RACK is that it applies to both original and
 * retransmitted packets and is therefore robust against tail losses.
 * Another advantage is that it is more resilient to reordering: it simply
 * allows some "settling delay" instead of tweaking the dupthresh.
 *
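 * Illustrative example (not part of the original comment): if P1 is sent
 * at t = 0ms and P2 at t = 10ms, and an ACK (s)acks P2, then P1 is
 * declared lost once roughly RACK.rtt + reo_wnd has elapsed since P1 was
 * sent and it still has not been (s)acked, regardless of how many
 * duplicate ACKs have arrived.
 *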
 * When tcp_rack_detect_loss() detects some packets are lost and we
 * are not already in the CA_Recovery state, either tcp_rack_reo_timeout()
 * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will
 * make us enter the CA_Recovery state.
 */
static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 min_rtt = tcp_min_rtt(tp);
	struct sk_buff *skb, *n;
	u32 reo_wnd;

	*reo_timeout = 0;
	/* To be more resilient to reordering, allow a min_rtt/4 settling
	 * delay (lower-bounded to 1000us). We use min_rtt instead of the
	 * smoothed RTT because reordering is often a path property and less
	 * related to queuing or delayed ACKs.
	 */
	reo_wnd = 1000;
	if ((tp->rack.reord || inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery) &&
	    min_rtt != ~0U) {
		reo_wnd = max((min_rtt >> 2) * tp->rack.reo_wnd_steps, reo_wnd);
		reo_wnd = min(reo_wnd, tp->srtt_us >> 3);
	}
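	/* Worked example (illustrative values): with min_rtt = 40ms,
	 * reo_wnd_steps = 1 and srtt = 100ms, reo_wnd becomes
	 * max(10000us * 1, 1000us) = 10000us, then min(10000us, 100000us),
	 * i.e. a 10ms reordering window.
	 */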

	list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue,
				 tcp_tsorted_anchor) {
		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
		s32 remaining;

		/* Skip ones marked lost but not yet retransmitted */
		if ((scb->sacked & TCPCB_LOST) &&
		    !(scb->sacked & TCPCB_SACKED_RETRANS))
			continue;

		if (!tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp,
					 tp->rack.end_seq, scb->end_seq))
			break;

		/* A packet is lost if it has not been s/acked beyond
		 * the recent RTT plus the reordering window.
		 */
		remaining = tp->rack.rtt_us + reo_wnd -
			    tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
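		/* remaining <= 0: the packet has had a full rtt + reo_wnd
		 * to be (s)acked and was not, so declare it lost. Otherwise
		 * it may merely be reordered; remember how much longer to
		 * wait before rechecking it.
		 */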
		if (remaining <= 0) {
			tcp_rack_mark_skb_lost(sk, skb);
			list_del_init(&skb->tcp_tsorted_anchor);
		} else {
			/* Record maximum wait time */
			*reo_timeout = max_t(u32, *reo_timeout, remaining);
		}
	}
}

void tcp_rack_mark_lost(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 timeout;

	if (!tp->rack.advanced)
		return;

	/* Reset the advanced flag to avoid unnecessary queue scanning */
	tp->rack.advanced = 0;
	tcp_rack_detect_loss(sk, &timeout);
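	/* If some packets are still within their reordering window, arm the
	 * RACK reordering timer so that tcp_rack_reo_timeout() rechecks
	 * them after the remaining wait (at least TCP_TIMEOUT_MIN).
	 */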
	if (timeout) {
		timeout = usecs_to_jiffies(timeout) + TCP_TIMEOUT_MIN;
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
					  timeout, inet_csk(sk)->icsk_rto);
	}
}

/* Record the most recently (re)sent time among the (s)acked packets.
 * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
 * draft-cheng-tcpm-rack-00.txt
 */
void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
		      u64 xmit_time)
{
	u32 rtt_us;

	if (tp->rack.mstamp &&
	    !tcp_rack_sent_after(xmit_time, tp->rack.mstamp,
				 end_seq, tp->rack.end_seq))
		return;

	rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time);
	if (sacked & TCPCB_RETRANS) {
		/* If the sacked packet was retransmitted, it's ambiguous
		 * whether the retransmission or the original (or the prior
		 * retransmission) was sacked.
		 *
		 * If the original is lost, there is no ambiguity. Otherwise
		 * we assume the original can be delayed up to aRTT + min_rtt.
		 * The aRTT term is bounded by the fast recovery or timeout,
		 * so it is at least one RTT (i.e., the retransmission is sent
		 * at least an RTT later).
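		 *
		 * Illustrative example: if min_rtt is 40ms but the rtt_us
		 * measured against the retransmission's send time is only
		 * 5ms, the (s)ack most likely covers the original
		 * transmission, so the sample is discarded below rather than
		 * used to advance RACK.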
		 */
		if (rtt_us < tcp_min_rtt(tp))
			return;
	}
	tp->rack.rtt_us = rtt_us;
	tp->rack.mstamp = xmit_time;
	tp->rack.end_seq = end_seq;
	tp->rack.advanced = 1;
}

/* We have waited long enough to accommodate reordering. Mark the expired
 * packets lost and retransmit them.
 */
void tcp_rack_reo_timeout(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 timeout, prior_inflight;

	prior_inflight = tcp_packets_in_flight(tp);
	tcp_rack_detect_loss(sk, &timeout);
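	/* Marking packets lost raises lost_out (and may drop retrans_out),
	 * which lowers tcp_packets_in_flight(); a changed in-flight count
	 * therefore means new losses were detected above.
	 */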
	if (prior_inflight != tcp_packets_in_flight(tp)) {
		if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) {
			tcp_enter_recovery(sk, false);
			if (!inet_csk(sk)->icsk_ca_ops->cong_control)
				tcp_cwnd_reduction(sk, 1, 0);
		}
		tcp_xmit_retransmit_queue(sk);
	}
	if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS)
		tcp_rearm_rto(sk);
}

/* Update RACK's reo_wnd based on DSACKs and the number of recoveries.
 *
 * If a DSACK is received, increment reo_wnd by min_rtt/4 (upper bounded
 * by srtt), since the spurious retransmission may have been caused by a
 * reordering delay longer than reo_wnd.
 *
 * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16)
 * successful recoveries (this accounts for a full DSACK-based loss
 * recovery undo). After that, reset it to the default (min_rtt/4).
 *
 * reo_wnd is incremented at most once per RTT, so that the DSACK we are
 * reacting to is (approximately) caused by a spurious retransmission sent
 * after reo_wnd was last updated.
 *
 * reo_wnd is tracked in steps of min_rtt/4 rather than as an absolute
 * value, to account for changes in the RTT.
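 *
 * Worked example (illustrative values): with min_rtt = 40ms,
 * reo_wnd_steps = 2 yields a 2 * 10ms = 20ms reordering window in
 * tcp_rack_detect_loss(); after 16 recoveries without a further DSACK,
 * the steps reset to 1, i.e. back to a 10ms window.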
 */
void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_STATIC_REO_WND ||
	    !rs->prior_delivered)
		return;

	/* Disregard the DSACK if an RTT has not passed since we adjusted reo_wnd */
	if (before(rs->prior_delivered, tp->rack.last_delivered))
		tp->rack.dsack_seen = 0;

	/* Adjust the reo_wnd if an update is pending */
	if (tp->rack.dsack_seen) {
		tp->rack.reo_wnd_steps = min_t(u32, 0xFF,
					       tp->rack.reo_wnd_steps + 1);
		tp->rack.dsack_seen = 0;
		tp->rack.last_delivered = tp->delivered;
		tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH;
	} else if (!tp->rack.reo_wnd_persist) {
		tp->rack.reo_wnd_steps = 1;
	}
}