// SPDX-License-Identifier: GPL-2.0-or-later
/* RACK-TLP [RFC8958] Implementation
 *
 * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "ar-internal.h"

static bool rxrpc_rack_sent_after(ktime_t t1, rxrpc_seq_t seq1,
                                  ktime_t t2, rxrpc_seq_t seq2)
{
        if (ktime_after(t1, t2))
                return true;
        return t1 == t2 && after(seq1, seq2);
}

/*
 * Mark a packet lost.
 */
static void rxrpc_rack_mark_lost(struct rxrpc_call *call,
                                 struct rxrpc_txqueue *tq, unsigned int ix)
{
        if (__test_and_set_bit(ix, &tq->segment_lost)) {
                if (__test_and_clear_bit(ix, &tq->segment_retransmitted))
                        call->tx_nr_resent--;
        } else {
                call->tx_nr_lost++;
        }
        tq->segment_xmit_ts[ix] = UINT_MAX;
}

/*
 * Get the transmission time of a packet in the Tx queue.
 */
static ktime_t rxrpc_get_xmit_ts(const struct rxrpc_txqueue *tq, unsigned int ix)
{
        if (tq->segment_xmit_ts[ix] == UINT_MAX)
                return KTIME_MAX;
        return ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]);
}

/*
 * Get a bitmask of the NACK'd segments in a Tx queue and mask off any that
 * aren't yet reported.
 */
static unsigned long rxrpc_tq_nacks(const struct rxrpc_txqueue *tq)
{
        unsigned long nacks = ~tq->segment_acked;

        if (tq->nr_reported_acks < RXRPC_NR_TXQUEUE)
                nacks &= (1UL << tq->nr_reported_acks) - 1;
        return nacks;
}

/*
 * Update the RACK state for the most recently sent packet that has been
 * delivered [RFC8958 6.2 Step 2].
 */
static void rxrpc_rack_update(struct rxrpc_call *call,
                              struct rxrpc_ack_summary *summary,
                              struct rxrpc_txqueue *tq,
                              unsigned int ix)
{
        rxrpc_seq_t seq = tq->qbase + ix;
        ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix);
        ktime_t rtt = ktime_sub(call->acks_latest_ts, xmit_ts);

        if (__test_and_clear_bit(ix, &tq->segment_lost))
                call->tx_nr_lost--;

        if (test_bit(ix, &tq->segment_retransmitted)) {
                /* Use Rx.serial instead of TCP.ACK.ts_option.echo_reply. */
                if (before(call->acks_highest_serial, tq->segment_serial[ix]))
                        return;
                if (rtt < minmax_get(&call->min_rtt))
                        return;
        }

        /* The RACK algorithm requires the segment ACKs to be traversed in
         * order of segment transmission - but the only thing this seems to
         * matter for is that RACK.rtt is set to the rtt of the most recently
         * transmitted segment.  We should be able to achieve the same by only
         * setting RACK.rtt if the xmit time is later.
         */
        if (ktime_after(xmit_ts, call->rack_rtt_ts)) {
                call->rack_rtt    = rtt;
                call->rack_rtt_ts = xmit_ts;
        }

        if (rxrpc_rack_sent_after(xmit_ts, seq, call->rack_xmit_ts, call->rack_end_seq)) {
                call->rack_rtt      = rtt;
                call->rack_xmit_ts  = xmit_ts;
                call->rack_end_seq  = seq;
        }
}

/*
 * Detect data segment reordering [RFC8958 6.2 Step 3].
 */
static void rxrpc_rack_detect_reordering(struct rxrpc_call *call,
                                         struct rxrpc_ack_summary *summary,
                                         struct rxrpc_txqueue *tq,
                                         unsigned int ix)
{
        rxrpc_seq_t seq = tq->qbase + ix;

        /* Track the highest sequence number so far ACK'd.  This is not
         * necessarily the same as ack.firstPacket + ack.nAcks - 1 as the peer
         * could put a NACK in the last SACK slot.
         */
        if (after(seq, call->rack_fack))
                call->rack_fack = seq;
        else if (before(seq, call->rack_fack) &&
                 test_bit(ix, &tq->segment_retransmitted))
                call->rack_reordering_seen = true;
}

void rxrpc_input_rack_one(struct rxrpc_call *call,
                          struct rxrpc_ack_summary *summary,
                          struct rxrpc_txqueue *tq,
                          unsigned int ix)
{
        rxrpc_rack_update(call, summary, tq, ix);
        rxrpc_rack_detect_reordering(call, summary, tq, ix);
}

void rxrpc_input_rack(struct rxrpc_call *call,
                      struct rxrpc_ack_summary *summary,
                      struct rxrpc_txqueue *tq,
                      unsigned long new_acks)
{
        while (new_acks) {
                unsigned int ix = __ffs(new_acks);

                __clear_bit(ix, &new_acks);
                rxrpc_input_rack_one(call, summary, tq, ix);
        }

        trace_rxrpc_rack_update(call, summary);
}

/*
 * Update the reordering window [RFC8958 6.2 Step 4].  Returns the updated
 * duration of the reordering window.
 *
 * Note that the Rx protocol doesn't have a 'DSACK option' per se, but ACKs can
 * be given a 'DUPLICATE' reason with the serial number referring to the
 * duplicated DATA packet.  Rx does not indicate whether this was the same
 * packet received twice or a retransmission of a packet we had already
 * received (though the transmitter could determine this from the serial
 * number).
 */
static ktime_t rxrpc_rack_update_reo_wnd(struct rxrpc_call *call,
                                         struct rxrpc_ack_summary *summary)
{
        rxrpc_seq_t snd_una = call->acks_lowest_nak; /* Lowest unack'd seq */
        rxrpc_seq_t snd_nxt = call->tx_transmitted + 1; /* Next seq to be sent */
        bool have_dsack_option = summary->ack_reason == RXRPC_ACK_DUPLICATE;
        int dup_thresh = 3;

        /* DSACK-based reordering window adaptation */
        if (!call->rack_dsack_round_none &&
            after_eq(snd_una, call->rack_dsack_round))
                call->rack_dsack_round_none = true;

        /* Grow the reordering window per round that sees DSACK.  Reset the
         * window after 16 DSACK-free recoveries.
         */
        if (call->rack_dsack_round_none && have_dsack_option) {
                call->rack_dsack_round_none = false;
                call->rack_dsack_round = snd_nxt;
                call->rack_reo_wnd_mult++;
                call->rack_reo_wnd_persist = 16;
        } else if (summary->exiting_fast_or_rto_recovery) {
                call->rack_reo_wnd_persist--;
                if (call->rack_reo_wnd_persist <= 0)
                        call->rack_reo_wnd_mult = 1;
        }

        if (!call->rack_reordering_seen) {
                if (summary->in_fast_or_rto_recovery)
                        return 0;
                if (call->acks_nr_sacks >= dup_thresh)
                        return 0;
        }

        return us_to_ktime(umin(call->rack_reo_wnd_mult * minmax_get(&call->min_rtt) / 4,
                                call->srtt_us >> 3));
}
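
/* Illustrative arithmetic for the window above (example values, not from the
 * RFC or a trace): the result is min(rack_reo_wnd_mult * min_rtt / 4, SRTT)
 * in microseconds, where call->srtt_us >> 3 is the smoothed RTT.  With
 * min_rtt = 10000us, rack_reo_wnd_mult = 1 and srtt_us = 96000 (SRTT =
 * 12000us), the reordering window is 2500us; once repeated DSACK rounds have
 * pushed rack_reo_wnd_mult to 5, the 12500us result is capped at the 12000us
 * SRTT.
 */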

/*
 * Detect losses [RFC8958 6.2 Step 5].
 */
static ktime_t rxrpc_rack_detect_loss(struct rxrpc_call *call,
                                      struct rxrpc_ack_summary *summary)
{
        struct rxrpc_txqueue *tq;
        ktime_t timeout = 0, lost_after, now = ktime_get_real();

        call->rack_reo_wnd = rxrpc_rack_update_reo_wnd(call, summary);
        lost_after = ktime_add(call->rack_rtt, call->rack_reo_wnd);
        trace_rxrpc_rack_scan_loss(call);

        for (tq = call->tx_queue; tq; tq = tq->next) {
                unsigned long nacks = rxrpc_tq_nacks(tq);

                if (after(tq->qbase, call->tx_transmitted))
                        break;
                trace_rxrpc_rack_scan_loss_tq(call, tq, nacks);

                /* Skip ones marked lost but not yet retransmitted */
                nacks &= ~tq->segment_lost | tq->segment_retransmitted;

                while (nacks) {
                        unsigned int ix = __ffs(nacks);
                        rxrpc_seq_t seq = tq->qbase + ix;
                        ktime_t remaining;
                        ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix);

                        __clear_bit(ix, &nacks);

                        if (rxrpc_rack_sent_after(call->rack_xmit_ts, call->rack_end_seq,
                                                  xmit_ts, seq)) {
                                remaining = ktime_sub(ktime_add(xmit_ts, lost_after), now);
                                if (remaining <= 0) {
                                        rxrpc_rack_mark_lost(call, tq, ix);
                                        trace_rxrpc_rack_detect_loss(call, summary, seq);
                                } else {
                                        timeout = max(remaining, timeout);
                                }
                        }
                }
        }

        return timeout;
}

/*
 * Detect losses and set a timer to retry the detection [RFC8958 6.2 Step 5].
 */
void rxrpc_rack_detect_loss_and_arm_timer(struct rxrpc_call *call,
                                          struct rxrpc_ack_summary *summary)
{
        ktime_t timeout = rxrpc_rack_detect_loss(call, summary);

        if (timeout) {
                call->rack_timer_mode = RXRPC_CALL_RACKTIMER_RACK_REORDER;
                call->rack_timo_at = ktime_add(ktime_get_real(), timeout);
                trace_rxrpc_rack_timer(call, timeout, false);
                trace_rxrpc_timer_set(call, timeout, rxrpc_timer_trace_rack_reo);
        }
}

/*
 * Handle RACK-TLP RTO expiration [RFC8958 6.3].
 */
static void rxrpc_rack_mark_losses_on_rto(struct rxrpc_call *call)
{
        struct rxrpc_txqueue *tq;
        rxrpc_seq_t snd_una = call->acks_lowest_nak; /* Lowest unack'd seq */
        ktime_t lost_after = ktime_add(call->rack_rtt, call->rack_reo_wnd);
        ktime_t deadline = ktime_sub(ktime_get_real(), lost_after);

        for (tq = call->tx_queue; tq; tq = tq->next) {
                unsigned long unacked = ~tq->segment_acked;

                trace_rxrpc_rack_mark_loss_tq(call, tq);
                while (unacked) {
                        unsigned int ix = __ffs(unacked);
                        rxrpc_seq_t seq = tq->qbase + ix;
                        ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix);

                        if (after(seq, call->tx_transmitted))
                                return;
                        __clear_bit(ix, &unacked);

                        if (seq == snd_una ||
                            ktime_before(xmit_ts, deadline))
                                rxrpc_rack_mark_lost(call, tq, ix);
                }
        }
}

/*
 * Calculate the TLP loss probe timeout (PTO) [RFC8958 7.2].
 */
ktime_t rxrpc_tlp_calc_pto(struct rxrpc_call *call, ktime_t now)
{
        unsigned int flight_size = rxrpc_tx_in_flight(call);
        ktime_t rto_at = ktime_add(call->tx_last_sent,
                                   rxrpc_get_rto_backoff(call, false));
        ktime_t pto;

        if (call->rtt_count > 0) {
                /* Use 2*SRTT as the timeout. */
                pto = ns_to_ktime(call->srtt_us * NSEC_PER_USEC / 4);
                if (flight_size)
                        pto = ktime_add(pto, call->tlp_max_ack_delay);
        } else {
                pto = NSEC_PER_SEC;
        }

        if (ktime_after(ktime_add(now, pto), rto_at))
                pto = ktime_sub(rto_at, now);
        return pto;
}
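
/* A rough worked example of the PTO above (numbers invented for
 * illustration): with rtt_count > 0 and srtt_us = 96000, the division by 4
 * yields a 24ms PTO, i.e. 2 * SRTT given that srtt_us carries the smoothed
 * RTT scaled by 8 (12ms here).  tlp_max_ack_delay is added when data is
 * still in flight, and the final clamp ensures the probe never fires later
 * than the pending RTO at rto_at.
 */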

/*
 * Send a TLP loss probe on PTO expiration [RFC8958 7.3].
 */
void rxrpc_tlp_send_probe(struct rxrpc_call *call)
{
        unsigned int in_flight = rxrpc_tx_in_flight(call);

        if (after_eq(call->acks_hard_ack, call->tx_transmitted))
                return; /* Everything we transmitted has been acked. */

        /* There must be no other loss probe still in flight and we need to
         * have taken a new RTT sample since last probe or the start of
         * connection.
         */
        if (!call->tlp_serial &&
            call->tlp_rtt_taken != call->rtt_taken) {
                call->tlp_is_retrans = false;
                if (after(call->send_top, call->tx_transmitted) &&
                    rxrpc_tx_window_space(call) > 0) {
                        /* Transmit the lowest-sequence unsent DATA */
                        call->tx_last_serial = 0;
                        rxrpc_transmit_some_data(call, 1, rxrpc_txdata_tlp_new_data);
                        call->tlp_serial = call->tx_last_serial;
                        call->tlp_seq = call->tx_transmitted;
                        trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_transmit_new);
                        in_flight = rxrpc_tx_in_flight(call);
                } else {
                        /* Retransmit the highest-sequence DATA sent */
                        call->tx_last_serial = 0;
                        rxrpc_resend_tlp(call);
                        call->tlp_is_retrans = true;
                        trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_retransmit);
                }
        } else {
                trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_busy);
        }

        if (in_flight != 0) {
                ktime_t rto = rxrpc_get_rto_backoff(call, false);

                call->rack_timer_mode = RXRPC_CALL_RACKTIMER_RTO;
                call->rack_timo_at = ktime_add(ktime_get_real(), rto);
                trace_rxrpc_rack_timer(call, rto, false);
                trace_rxrpc_timer_set(call, rto, rxrpc_timer_trace_rack_rto);
        }
}

/*
 * Detect losses using the ACK of a TLP loss probe [RFC8958 7.4].
 */
void rxrpc_tlp_process_ack(struct rxrpc_call *call, struct rxrpc_ack_summary *summary)
{
        if (!call->tlp_serial || after(call->tlp_seq, call->acks_hard_ack))
                return;

        if (!call->tlp_is_retrans) {
                /* TLP of new data delivered */
                trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_new_data);
                call->tlp_serial = 0;
        } else if (summary->ack_reason == RXRPC_ACK_DUPLICATE &&
                   summary->acked_serial == call->tlp_serial) {
                /* General Case: Detected packet losses using RACK [7.4.1] */
                trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_dup_acked);
                call->tlp_serial = 0;
        } else if (after(call->acks_hard_ack, call->tlp_seq)) {
                /* Repaired the single loss */
                trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_hard_beyond);
                call->tlp_serial = 0;
                // TODO: Invoke congestion control to react to the loss
                // event the probe has repaired
        } else if (summary->tlp_probe_acked) {
                trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_acked);
                /* Special Case: Detected a single loss repaired by the loss
                 * probe [7.4.2]
                 */
                call->tlp_serial = 0;
        } else {
                trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_incomplete);
        }
}

/*
 * Handle RACK timer expiration.
 */
void rxrpc_rack_timer_expired(struct rxrpc_call *call, ktime_t overran_by)
{
        struct rxrpc_ack_summary summary = {};
        enum rxrpc_rack_timer_mode mode = call->rack_timer_mode;

        trace_rxrpc_rack_timer(call, overran_by, true);
        call->rack_timer_mode = RXRPC_CALL_RACKTIMER_OFF;

        switch (mode) {
        case RXRPC_CALL_RACKTIMER_RACK_REORDER:
                rxrpc_rack_detect_loss_and_arm_timer(call, &summary);
                break;
        case RXRPC_CALL_RACKTIMER_TLP_PTO:
                rxrpc_tlp_send_probe(call);
                break;
        case RXRPC_CALL_RACKTIMER_RTO:
                // Might need to poke the congestion algo in some way
                rxrpc_rack_mark_losses_on_rto(call);
                break;
        //case RXRPC_CALL_RACKTIMER_ZEROWIN:
        default:
                /* Report the mode saved above; rack_timer_mode has already
                 * been cleared to OFF at this point.
                 */
                pr_warn("Unexpected rack timer %u\n", mode);
        }
}