// SPDX-License-Identifier: GPL-2.0-or-later
/* RACK-TLP [RFC8985] Implementation
 *
 * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells ([email protected])
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "ar-internal.h"

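/*
 * Decide whether the packet with transmission time t1 and sequence number
 * seq1 was sent after the one with time t2 and sequence seq2, using the
 * sequence numbers to break a timestamp tie (the equivalent of
 * RACK_sent_after() in RFC8985 6.2).
 */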
static bool rxrpc_rack_sent_after(ktime_t t1, rxrpc_seq_t seq1,
                                  ktime_t t2, rxrpc_seq_t seq2)
{
        if (ktime_after(t1, t2))
                return true;
        return t1 == t2 && after(seq1, seq2);
}

/*
 * Mark a packet lost.
 */
static void rxrpc_rack_mark_lost(struct rxrpc_call *call,
                                 struct rxrpc_txqueue *tq, unsigned int ix)
{
        if (__test_and_set_bit(ix, &tq->segment_lost)) {
                if (__test_and_clear_bit(ix, &tq->segment_retransmitted))
                        call->tx_nr_resent--;
        } else {
                call->tx_nr_lost++;
        }
        /* Invalidate the recorded transmission time (see rxrpc_get_xmit_ts()). */
        tq->segment_xmit_ts[ix] = UINT_MAX;
}

/*
 * Get the transmission time of a packet in the Tx queue.
 */
static ktime_t rxrpc_get_xmit_ts(const struct rxrpc_txqueue *tq, unsigned int ix)
{
        if (tq->segment_xmit_ts[ix] == UINT_MAX)
                return KTIME_MAX;
        return ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]);
}

/*
 * Get a bitmask of nack bits for a queue segment and mask off any that aren't
 * yet reported.
 */
static unsigned long rxrpc_tq_nacks(const struct rxrpc_txqueue *tq)
{
        unsigned long nacks = ~tq->segment_acked;

        if (tq->nr_reported_acks < RXRPC_NR_TXQUEUE)
                nacks &= (1UL << tq->nr_reported_acks) - 1;
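        /* E.g. (illustrative values): with nr_reported_acks == 3 and
         * segment_acked == 0b101, ~segment_acked has bit 1 set in the low
         * bits and the mask (1UL << 3) - 1 == 0b111 confines the result to
         * segments the peer has actually reported on, giving nacks == 0b010.
         */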
        return nacks;
}

/*
 * Update the RACK state for the most recently sent packet that has been
 * delivered [RFC8985 6.2 Step 2].
 */
static void rxrpc_rack_update(struct rxrpc_call *call,
                              struct rxrpc_ack_summary *summary,
                              struct rxrpc_txqueue *tq,
                              unsigned int ix)
{
        rxrpc_seq_t seq = tq->qbase + ix;
        ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix);
        ktime_t rtt = ktime_sub(call->acks_latest_ts, xmit_ts);

        if (__test_and_clear_bit(ix, &tq->segment_lost))
                call->tx_nr_lost--;

        if (test_bit(ix, &tq->segment_retransmitted)) {
                /* Use Rx.serial instead of TCP.ACK.ts_option.echo_reply. */
                if (before(call->acks_highest_serial, tq->segment_serial[ix]))
                        return;
                if (rtt < minmax_get(&call->min_rtt))
                        return;
        }
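        /* (A retransmitted segment may only advance the RACK state if the
         * ACK can't have been triggered by the original transmission: if the
         * highest serial the peer has seen precedes the retransmission's
         * serial, or the implied RTT is shorter than the minimum observed
         * RTT, the ACK is disregarded [RFC8985 6.2 Step 2].)
         */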

        /* The RACK algorithm requires the segment ACKs to be traversed in
         * order of segment transmission - but the only thing this seems to
         * matter for is that RACK.rtt is set to the rtt of the most recently
         * transmitted segment.  We should be able to achieve the same by only
         * setting RACK.rtt if the xmit time is later.
         */
        if (ktime_after(xmit_ts, call->rack_rtt_ts)) {
                call->rack_rtt = rtt;
                call->rack_rtt_ts = xmit_ts;
        }

        if (rxrpc_rack_sent_after(xmit_ts, seq, call->rack_xmit_ts, call->rack_end_seq)) {
                call->rack_rtt = rtt;
                call->rack_xmit_ts = xmit_ts;
                call->rack_end_seq = seq;
        }
}

/*
 * Detect data segment reordering [RFC8985 6.2 Step 3].
 */
static void rxrpc_rack_detect_reordering(struct rxrpc_call *call,
                                         struct rxrpc_ack_summary *summary,
                                         struct rxrpc_txqueue *tq,
                                         unsigned int ix)
{
        rxrpc_seq_t seq = tq->qbase + ix;

        /* Track the highest sequence number so far ACK'd.  This is not
         * necessarily the same as ack.firstPacket + ack.nAcks - 1 as the peer
         * could put a NACK in the last SACK slot.
         */
        if (after(seq, call->rack_fack))
                call->rack_fack = seq;
        else if (before(seq, call->rack_fack) &&
                 test_bit(ix, &tq->segment_retransmitted))
                call->rack_reordering_seen = true;
}

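/*
 * Run RACK processing on a single newly ACK'd segment: update the RACK state
 * [RFC8985 6.2 Step 2] and check for reordering [Step 3].
 */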
void rxrpc_input_rack_one(struct rxrpc_call *call,
                          struct rxrpc_ack_summary *summary,
                          struct rxrpc_txqueue *tq,
                          unsigned int ix)
{
        rxrpc_rack_update(call, summary, tq, ix);
        rxrpc_rack_detect_reordering(call, summary, tq, ix);
}

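/*
 * Feed each segment newly ACK'd in a Tx queue through the RACK processing,
 * lowest set bit first.
 */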
void rxrpc_input_rack(struct rxrpc_call *call,
                      struct rxrpc_ack_summary *summary,
                      struct rxrpc_txqueue *tq,
                      unsigned long new_acks)
{
        while (new_acks) {
                unsigned int ix = __ffs(new_acks);

                __clear_bit(ix, &new_acks);
                rxrpc_input_rack_one(call, summary, tq, ix);
        }

        trace_rxrpc_rack_update(call, summary);
}

/*
 * Update the reordering window [RFC8985 6.2 Step 4].  Returns the updated
 * duration of the reordering window.
 *
 * Note that the Rx protocol doesn't have a 'DSACK option' per se, but ACKs
 * can be given a 'DUPLICATE' reason with the serial number referring to the
 * duplicated DATA packet.  Rx does not inform as to whether this was a
 * reception of the same packet twice or of a retransmission of a packet we
 * already received (though this could be determined by the transmitter based
 * on the serial number).
 */
static ktime_t rxrpc_rack_update_reo_wnd(struct rxrpc_call *call,
                                         struct rxrpc_ack_summary *summary)
{
        rxrpc_seq_t snd_una = call->acks_lowest_nak; /* Lowest unack'd seq */
        rxrpc_seq_t snd_nxt = call->tx_transmitted + 1; /* Next seq to be sent */
        bool have_dsack_option = summary->ack_reason == RXRPC_ACK_DUPLICATE;
        int dup_thresh = 3;

        /* DSACK-based reordering window adaptation */
        if (!call->rack_dsack_round_none &&
            after_eq(snd_una, call->rack_dsack_round))
                call->rack_dsack_round_none = true;

        /* Grow the reordering window per round that sees DSACK.  Reset the
         * window after 16 DSACK-free recoveries.
         */
        if (call->rack_dsack_round_none && have_dsack_option) {
                call->rack_dsack_round_none = false;
                call->rack_dsack_round = snd_nxt;
                call->rack_reo_wnd_mult++;
                call->rack_reo_wnd_persist = 16;
        } else if (summary->exiting_fast_or_rto_recovery) {
                call->rack_reo_wnd_persist--;
                if (call->rack_reo_wnd_persist <= 0)
                        call->rack_reo_wnd_mult = 1;
        }

        if (!call->rack_reordering_seen) {
                /* With no reordering yet observed, the window collapses to
                 * zero once we're in recovery or have accumulated dup_thresh
                 * SACKs [RFC8985 6.2 Step 4].
                 */
                if (summary->in_fast_or_rto_recovery)
                        return 0;
                if (call->acks_nr_sacks >= dup_thresh)
                        return 0;
        }
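        /* Reordering window [RFC8985 6.2 Step 4]: min_RTT / 4, scaled up by
         * the DSACK-driven multiplier and capped at the smoothed RTT
         * (srtt_us stores 8 * SRTT, so >> 3 converts it to microseconds).
         */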

        return us_to_ktime(umin(call->rack_reo_wnd_mult * minmax_get(&call->min_rtt) / 4,
                                call->srtt_us >> 3));
}

/*
 * Detect losses [RFC8985 6.2 Step 5].
 */
static ktime_t rxrpc_rack_detect_loss(struct rxrpc_call *call,
                                      struct rxrpc_ack_summary *summary)
{
        struct rxrpc_txqueue *tq;
        ktime_t timeout = 0, lost_after, now = ktime_get_real();

        call->rack_reo_wnd = rxrpc_rack_update_reo_wnd(call, summary);
        lost_after = ktime_add(call->rack_rtt, call->rack_reo_wnd);
        trace_rxrpc_rack_scan_loss(call);

        for (tq = call->tx_queue; tq; tq = tq->next) {
                unsigned long nacks = rxrpc_tq_nacks(tq);

                if (after(tq->qbase, call->tx_transmitted))
                        break;
                trace_rxrpc_rack_scan_loss_tq(call, tq, nacks);

                /* Skip ones marked lost but not yet retransmitted */
                nacks &= ~tq->segment_lost | tq->segment_retransmitted;

                while (nacks) {
                        unsigned int ix = __ffs(nacks);
                        rxrpc_seq_t seq = tq->qbase + ix;
                        ktime_t remaining;
                        ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix);

                        __clear_bit(ix, &nacks);

                        if (rxrpc_rack_sent_after(call->rack_xmit_ts, call->rack_end_seq,
                                                  xmit_ts, seq)) {
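                                /* The segment is deemed lost once it has
                                 * been outstanding for longer than the
                                 * latest RTT plus the reordering window
                                 * [RFC8985 6.2 Step 5]; otherwise, note how
                                 * long until it would be.
                                 */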
                                remaining = ktime_sub(ktime_add(xmit_ts, lost_after), now);
                                if (remaining <= 0) {
                                        rxrpc_rack_mark_lost(call, tq, ix);
                                        trace_rxrpc_rack_detect_loss(call, summary, seq);
                                } else {
                                        timeout = max(remaining, timeout);
                                }
                        }
                }
        }

        return timeout;
}

/*
 * Detect losses and set a timer to retry the detection [RFC8985 6.2 Step 5].
 */
void rxrpc_rack_detect_loss_and_arm_timer(struct rxrpc_call *call,
                                          struct rxrpc_ack_summary *summary)
{
        ktime_t timeout = rxrpc_rack_detect_loss(call, summary);

        if (timeout) {
                call->rack_timer_mode = RXRPC_CALL_RACKTIMER_RACK_REORDER;
                call->rack_timo_at = ktime_add(ktime_get_real(), timeout);
                trace_rxrpc_rack_timer(call, timeout, false);
                trace_rxrpc_timer_set(call, timeout, rxrpc_timer_trace_rack_reo);
        }
}

/*
 * Handle RACK-TLP RTO expiration [RFC8985 6.3].
 */
static void rxrpc_rack_mark_losses_on_rto(struct rxrpc_call *call)
{
        struct rxrpc_txqueue *tq;
        rxrpc_seq_t snd_una = call->acks_lowest_nak; /* Lowest unack'd seq */
        ktime_t lost_after = ktime_add(call->rack_rtt, call->rack_reo_wnd);
        ktime_t deadline = ktime_sub(ktime_get_real(), lost_after);
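        /* Anything unacked that was transmitted before this deadline has
         * been outstanding for longer than RACK.rtt + RACK.reo_wnd and is
         * treated as lost, as is the lowest unacked sequence itself
         * [RFC8985 6.3].
         */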

        for (tq = call->tx_queue; tq; tq = tq->next) {
                unsigned long unacked = ~tq->segment_acked;

                trace_rxrpc_rack_mark_loss_tq(call, tq);
                while (unacked) {
                        unsigned int ix = __ffs(unacked);
                        rxrpc_seq_t seq = tq->qbase + ix;
                        ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix);

                        if (after(seq, call->tx_transmitted))
                                return;
                        __clear_bit(ix, &unacked);

                        if (seq == snd_una ||
                            ktime_before(xmit_ts, deadline))
                                rxrpc_rack_mark_lost(call, tq, ix);
                }
        }
}

/*
 * Calculate the TLP loss probe timeout (PTO) [RFC8985 7.2].
 */
ktime_t rxrpc_tlp_calc_pto(struct rxrpc_call *call, ktime_t now)
{
        unsigned int flight_size = rxrpc_tx_in_flight(call);
        ktime_t rto_at = ktime_add(call->tx_last_sent,
                                   rxrpc_get_rto_backoff(call, false));
        ktime_t pto;

        if (call->rtt_count > 0) {
                /* Use 2*SRTT as the timeout (srtt_us holds 8 * SRTT in usec,
                 * so dividing by 4 yields 2 * SRTT).
                 */
                pto = ns_to_ktime(call->srtt_us * NSEC_PER_USEC / 4);
                if (flight_size)
                        pto = ktime_add(pto, call->tlp_max_ack_delay);
        } else {
                pto = NSEC_PER_SEC;
        }

        if (ktime_after(ktime_add(now, pto), rto_at))
                pto = ktime_sub(rto_at, now);
        return pto;
}
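
/*
 * (In other words: PTO = 2 * SRTT, extended by the peer's maximum expected
 * ACK delay while data is in flight, defaulting to 1s before any RTT sample
 * exists, and clamped so that the probe cannot fire later than the pending
 * RTO [RFC8985 7.2].)
 */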

/*
 * Send a TLP loss probe on PTO expiration [RFC8985 7.3].
 */
void rxrpc_tlp_send_probe(struct rxrpc_call *call)
{
        unsigned int in_flight = rxrpc_tx_in_flight(call);

        if (after_eq(call->acks_hard_ack, call->tx_transmitted))
                return; /* Everything we transmitted has been acked. */

        /* There must be no other loss probe still in flight and we need to
         * have taken a new RTT sample since last probe or the start of
         * connection.
         */
        if (!call->tlp_serial &&
            call->tlp_rtt_taken != call->rtt_taken) {
                call->tlp_is_retrans = false;
                if (after(call->send_top, call->tx_transmitted) &&
                    rxrpc_tx_window_space(call) > 0) {
                        /* Transmit the lowest-sequence unsent DATA */
                        call->tx_last_serial = 0;
                        rxrpc_transmit_some_data(call, 1, rxrpc_txdata_tlp_new_data);
                        call->tlp_serial = call->tx_last_serial;
                        call->tlp_seq = call->tx_transmitted;
                        trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_transmit_new);
                        in_flight = rxrpc_tx_in_flight(call);
                } else {
                        /* Retransmit the highest-sequence DATA sent */
                        call->tx_last_serial = 0;
                        rxrpc_resend_tlp(call);
                        call->tlp_is_retrans = true;
                        trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_retransmit);
                }
        } else {
                trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_busy);
        }

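        /* Whether or not a probe went out, rearm a conventional RTO while
         * data remains in flight [RFC8985 7.3].
         */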
        if (in_flight != 0) {
                ktime_t rto = rxrpc_get_rto_backoff(call, false);

                call->rack_timer_mode = RXRPC_CALL_RACKTIMER_RTO;
                call->rack_timo_at = ktime_add(ktime_get_real(), rto);
                trace_rxrpc_rack_timer(call, rto, false);
                trace_rxrpc_timer_set(call, rto, rxrpc_timer_trace_rack_rto);
        }
}

/*
 * Detect losses using the ACK of a TLP loss probe [RFC8985 7.4].
 */
void rxrpc_tlp_process_ack(struct rxrpc_call *call, struct rxrpc_ack_summary *summary)
{
        if (!call->tlp_serial || after(call->tlp_seq, call->acks_hard_ack))
                return;

        if (!call->tlp_is_retrans) {
                /* TLP of new data delivered */
                trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_new_data);
                call->tlp_serial = 0;
        } else if (summary->ack_reason == RXRPC_ACK_DUPLICATE &&
                   summary->acked_serial == call->tlp_serial) {
                /* General Case: Detected packet losses using RACK [7.4.1] */
                trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_dup_acked);
                call->tlp_serial = 0;
        } else if (after(call->acks_hard_ack, call->tlp_seq)) {
                /* Repaired the single loss */
                trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_hard_beyond);
                call->tlp_serial = 0;
                // TODO: Invoke congestion control to react to the loss
                // event the probe has repaired
        } else if (summary->tlp_probe_acked) {
                /* Special Case: Detected a single loss repaired by the loss
                 * probe [7.4.2]
                 */
                trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_acked);
                call->tlp_serial = 0;
        } else {
                trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_incomplete);
        }
}

/*
 * Handle RACK timer expiration, dispatching on the mode the timer was armed
 * in.
 */
void rxrpc_rack_timer_expired(struct rxrpc_call *call, ktime_t overran_by)
{
        struct rxrpc_ack_summary summary = {};
        enum rxrpc_rack_timer_mode mode = call->rack_timer_mode;

        trace_rxrpc_rack_timer(call, overran_by, true);
        call->rack_timer_mode = RXRPC_CALL_RACKTIMER_OFF;

        switch (mode) {
        case RXRPC_CALL_RACKTIMER_RACK_REORDER:
                rxrpc_rack_detect_loss_and_arm_timer(call, &summary);
                break;
        case RXRPC_CALL_RACKTIMER_TLP_PTO:
                rxrpc_tlp_send_probe(call);
                break;
        case RXRPC_CALL_RACKTIMER_RTO:
                // Might need to poke the congestion algo in some way
                rxrpc_rack_mark_losses_on_rto(call);
                break;
        //case RXRPC_CALL_RACKTIMER_ZEROWIN:
        default:
                /* Print the saved mode; rack_timer_mode was cleared above. */
                pr_warn("Unexpected rack timer %u\n", mode);
        }
}