/* linux/net/ipv4/tcp_recovery.c */
#include <linux/tcp.h>
#include <net/tcp.h>

int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOSS_DETECTION;

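/* Mark an skb lost for RACK. If it was a retransmission that is now lost
 * again, also remove it from retrans_out and count the lost retransmit.
 */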
static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_skb_mark_lost_uncond_verify(tp, skb);
	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
		/* Account for retransmits that are lost again */
		TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
		tp->retrans_out -= tcp_skb_pcount(skb);
		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
			      tcp_skb_pcount(skb));
	}
}

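/* Returns true if the packet at (t1, seq1) was sent after the packet at
 * (t2, seq2); sequence numbers break ties between equal send timestamps.
 */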
static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
{
	return t1 > t2 || (t1 == t2 && after(seq1, seq2));
}

/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
 *
 * Marks a packet lost if some packet sent later has been (s)acked.
 * The underlying idea is similar to the traditional dupthresh and FACK
 * but they look at different metrics:
 *
 * dupthresh: 3 OOO packets delivered (packet count)
 * FACK: sequence delta to highest sacked sequence (sequence space)
 * RACK: sent time delta to the latest delivered packet (time domain)
 *
 * The advantage of RACK is it applies to both original and retransmitted
 * packets and therefore is robust against tail losses. Another advantage
 * is being more resilient to reordering by simply allowing some
 * "settling delay", instead of tweaking the dupthresh.
 *
 * When tcp_rack_detect_loss() detects some packets are lost and we
 * are not already in the CA_Recovery state, either tcp_rack_reo_timeout()
 * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will
 * make us enter the CA_Recovery state.
 */
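/* Example: if P1..P3 are sent 1ms apart and only P3 is (s)acked, RACK can
 * mark P1 and P2 lost as soon as they have been outstanding longer than
 * P3's RTT plus the reordering window, instead of waiting for three
 * duplicate ACKs to arrive.
 */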
static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb, *n;
	u32 reo_wnd;

	*reo_timeout = 0;
	/* To be more reordering resilient, allow min_rtt/4 settling delay
	 * (lower-bounded to 1000us). We use min_rtt instead of the smoothed
	 * RTT because reordering is often a path property and less related
	 * to queuing or delayed ACKs.
	 */
	reo_wnd = 1000;
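	/* Extend the window beyond the 1000us floor only when reordering has
	 * been observed, or while nothing is marked lost yet; once losses
	 * are confirmed without any reordering, the floor keeps further
	 * detection prompt.
	 */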
	if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U)
		reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);

	list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue,
				 tcp_tsorted_anchor) {
		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
		s32 remaining;

		/* Skip ones marked lost but not yet retransmitted */
		if ((scb->sacked & TCPCB_LOST) &&
		    !(scb->sacked & TCPCB_SACKED_RETRANS))
			continue;

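		/* tsorted_sent_queue is ordered by send time, oldest first:
		 * once we reach a packet not sent before the most recently
		 * (s)acked one, no later entry can qualify either.
		 */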
		if (!tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp,
					 tp->rack.end_seq, scb->end_seq))
			break;

		/* A packet is lost if it has not been s/acked beyond
		 * the recent RTT plus the reordering window.
		 */
		remaining = tp->rack.rtt_us + reo_wnd -
			    tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
		if (remaining < 0) {
			tcp_rack_mark_skb_lost(sk, skb);
			list_del_init(&skb->tcp_tsorted_anchor);
		} else {
			/* Record maximum wait time (+1 to avoid 0) */
			*reo_timeout = max_t(u32, *reo_timeout, 1 + remaining);
		}
	}
}

void tcp_rack_mark_lost(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 timeout;

	if (!tp->rack.advanced)
		return;

	/* Reset the advanced flag to avoid unnecessary queue scanning */
	tp->rack.advanced = 0;
	tcp_rack_detect_loss(sk, &timeout);
	if (timeout) {
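		/* Some packets may still be (s)acked within the reordering
		 * window; re-check when it expires. Adding TCP_TIMEOUT_MIN
		 * keeps the timer from firing immediately when the usec
		 * value rounds down to zero jiffies.
		 */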
		timeout = usecs_to_jiffies(timeout) + TCP_TIMEOUT_MIN;
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
					  timeout, inet_csk(sk)->icsk_rto);
	}
}

/* Record the most recently (re)sent time among the (s)acked packets.
 * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
 * draft-cheng-tcpm-rack-00.txt
 */
void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
		      u64 xmit_time)
{
	u32 rtt_us;

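	/* Advance only if this (s)acked packet was sent more recently than
	 * the one currently recorded.
	 */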
	if (tp->rack.mstamp &&
	    !tcp_rack_sent_after(xmit_time, tp->rack.mstamp,
				 end_seq, tp->rack.end_seq))
		return;

	rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time);
	if (sacked & TCPCB_RETRANS) {
		/* If the sacked packet was retransmitted, it's ambiguous
		 * whether the retransmission or the original (or the prior
		 * retransmission) was sacked.
		 *
		 * If the original is lost, there is no ambiguity. Otherwise
		 * we assume the original can be delayed up to aRTT + min_rtt.
		 * The aRTT term is bounded by the fast recovery or timeout,
		 * so it's at least one RTT (i.e., retransmission is at least
		 * an RTT later).
		 */
		if (rtt_us < tcp_min_rtt(tp))
			return;
	}
	tp->rack.rtt_us = rtt_us;
	tp->rack.mstamp = xmit_time;
	tp->rack.end_seq = end_seq;
	tp->rack.advanced = 1;
}

/* We have waited long enough to accommodate reordering. Mark the expired
 * packets lost and retransmit them.
 */
void tcp_rack_reo_timeout(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 timeout, prior_inflight;

	prior_inflight = tcp_packets_in_flight(tp);
	tcp_rack_detect_loss(sk, &timeout);
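	/* Newly detected losses reduce the in-flight count; if any were
	 * found, enter recovery (unless already in it) and retransmit the
	 * packets just marked lost.
	 */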
	if (prior_inflight != tcp_packets_in_flight(tp)) {
		if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) {
			tcp_enter_recovery(sk, false);
			if (!inet_csk(sk)->icsk_ca_ops->cong_control)
				tcp_cwnd_reduction(sk, 1, 0);
		}
		tcp_xmit_retransmit_queue(sk);
	}
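	/* The reordering timer shares the xmit timer slot with the RTO, so
	 * re-arm the RTO unless a retransmission timer is already pending.
	 */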
	if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS)
		tcp_rearm_rto(sk);
}