/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
 *	The Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*-
 *
 * NRL grants permission for redistribution and use in source and binary
 * forms, with or without modification, of the software and documentation
 * created at NRL provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgements:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 *	This product includes software developed at the Information
 *	Technology Division, US Naval Research Laboratory.
 * 4. Neither the name of the NRL nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation
 * are those of the authors and should not be interpreted as representing
 * official policies, either expressed or implied, of the US Naval
 * Research Laboratory (NRL).
 */

#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>

#include <machine/cpu.h>	/* before tcp_seq.h, for tcp_random18() */

#include <vm/uma.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_var.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>

#include <machine/in_cksum.h>

VNET_DECLARE(struct uma_zone *, sack_hole_zone);
#define	V_sack_hole_zone	VNET(sack_hole_zone)

SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP SACK");

VNET_DEFINE(int, tcp_do_sack) = 1;
SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_do_sack), 0,
    "Enable/Disable TCP SACK support");

VNET_DEFINE(int, tcp_do_newsack) = 1;

static int
sysctl_net_inet_tcp_sack_revised(SYSCTL_HANDLER_ARGS)
{
	int error;
	int new;

	new = V_tcp_do_newsack;
	error = sysctl_handle_int(oidp, &new, 0, req);
	if (error == 0 && req->newptr) {
		V_tcp_do_newsack = new;
		gone_in(16, "net.inet.tcp.sack.revised will be deprecated."
		    " net.inet.tcp.sack.enable will always follow RFC6675 SACK.\n");
	}
	return (error);
}

SYSCTL_PROC(_net_inet_tcp_sack, OID_AUTO, revised, CTLFLAG_VNET | CTLFLAG_RW | CTLTYPE_INT,
    &VNET_NAME(tcp_do_newsack), 0, sysctl_net_inet_tcp_sack_revised, "CU",
    "Use revised SACK loss recovery per RFC 6675");

VNET_DEFINE(int, tcp_do_lrd) = 1;
SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, lrd, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_do_lrd), 1,
    "Perform Lost Retransmission Detection");

VNET_DEFINE(int, tcp_sack_tso) = 0;
SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, tso, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_sack_tso), 0,
    "Allow TSO during SACK loss recovery");

VNET_DEFINE(int, tcp_sack_maxholes) = 128;
SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, maxholes, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_sack_maxholes), 0,
    "Maximum number of TCP SACK holes allowed per connection");

VNET_DEFINE(int, tcp_sack_globalmaxholes) = 65536;
SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalmaxholes, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_sack_globalmaxholes), 0,
    "Global maximum number of TCP SACK holes");

VNET_DEFINE(int, tcp_sack_globalholes) = 0;
SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalholes, CTLFLAG_VNET | CTLFLAG_RD,
    &VNET_NAME(tcp_sack_globalholes), 0,
    "Global number of TCP SACK holes currently allocated");

int
tcp_dsack_block_exists(struct tcpcb *tp)
{
	/* Return true if a DSACK block exists */
	if (tp->rcv_numsacks == 0)
		return (0);
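	/*
	 * Per RFC 2883, a D-SACK block reporting already-received data
	 * is placed first in the SACK list, so testing whether the first
	 * block ends at or below rcv_nxt is sufficient here.
	 */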
	if (SEQ_LEQ(tp->sackblks[0].end, tp->rcv_nxt))
		return (1);
	return (0);
}

/*
 * Find overlaps between the incoming segment and the currently stored
 * SACK blocks, and report any such overlap as a D-SACK block placed up
 * front in the SACK list.
 */
void
tcp_update_dsack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end)
{
	struct sackblk head_blk, mid_blk, saved_blks[MAX_SACK_BLKS];
	int i, j, n, identical;
	tcp_seq start, end;

	INP_WLOCK_ASSERT(tptoinpcb(tp));

	KASSERT(SEQ_LT(rcv_start, rcv_end), ("rcv_start < rcv_end"));

	if (SEQ_LT(rcv_end, tp->rcv_nxt) ||
	    ((rcv_end == tp->rcv_nxt) &&
	     (tp->rcv_numsacks > 0) &&
	     (tp->sackblks[0].end == tp->rcv_nxt))) {
		saved_blks[0].start = rcv_start;
		saved_blks[0].end = rcv_end;
	} else {
		saved_blks[0].start = saved_blks[0].end = 0;
	}

	head_blk.start = head_blk.end = 0;
	mid_blk.start = rcv_start;
	mid_blk.end = rcv_end;
	identical = 0;
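	/*
	 * In the loop below, head_blk accumulates the overlap between the
	 * received segment and any already-ACKed SACK block (the D-SACK
	 * range proper), while mid_blk grows into the maximal merged
	 * block containing the received segment.
	 */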

	for (i = 0; i < tp->rcv_numsacks; i++) {
		start = tp->sackblks[i].start;
		end = tp->sackblks[i].end;
		if (SEQ_LT(rcv_end, start)) {
			/* pkt left to sack blk */
			continue;
		}
		if (SEQ_GT(rcv_start, end)) {
			/* pkt right to sack blk */
			continue;
		}
		if (SEQ_GT(tp->rcv_nxt, end)) {
			if ((SEQ_MAX(rcv_start, start) != SEQ_MIN(rcv_end, end)) &&
			    (SEQ_GT(head_blk.start, SEQ_MAX(rcv_start, start)) ||
			    (head_blk.start == head_blk.end))) {
				head_blk.start = SEQ_MAX(rcv_start, start);
				head_blk.end = SEQ_MIN(rcv_end, end);
			}
			continue;
		}
		if (((head_blk.start == head_blk.end) ||
		    SEQ_LT(start, head_blk.start)) &&
		    (SEQ_GT(end, rcv_start) &&
		    SEQ_LEQ(start, rcv_end))) {
			head_blk.start = start;
			head_blk.end = end;
		}
		mid_blk.start = SEQ_MIN(mid_blk.start, start);
		mid_blk.end = SEQ_MAX(mid_blk.end, end);
		if ((mid_blk.start == start) &&
		    (mid_blk.end == end))
			identical = 1;
	}
	if (SEQ_LT(head_blk.start, head_blk.end)) {
		/* store overlapping range */
		saved_blks[0].start = SEQ_MAX(rcv_start, head_blk.start);
		saved_blks[0].end = SEQ_MIN(rcv_end, head_blk.end);
	}
	n = 1;
	/*
	 * Second, if not ACKed, store the SACK block that
	 * overlaps with the DSACK block unless it is identical
	 */
	if ((SEQ_LT(tp->rcv_nxt, mid_blk.end) &&
	    !((mid_blk.start == saved_blks[0].start) &&
	    (mid_blk.end == saved_blks[0].end))) ||
	    identical == 1) {
		saved_blks[n].start = mid_blk.start;
		saved_blks[n++].end = mid_blk.end;
	}
	for (j = 0; (j < tp->rcv_numsacks) && (n < MAX_SACK_BLKS); j++) {
		if (((SEQ_LT(tp->sackblks[j].end, mid_blk.start) ||
		    SEQ_GT(tp->sackblks[j].start, mid_blk.end)) &&
		    (SEQ_GT(tp->sackblks[j].start, tp->rcv_nxt))))
			saved_blks[n++] = tp->sackblks[j];
	}
	j = 0;
	for (i = 0; i < n; i++) {
		/* we can end up with a stale initial entry */
		if (SEQ_LT(saved_blks[i].start, saved_blks[i].end)) {
			tp->sackblks[j++] = saved_blks[i];
		}
	}
	tp->rcv_numsacks = j;
}

/*
 * This function is called upon receipt of new valid data (while not in
 * header prediction mode), and it updates the ordered list of sacks.
 */
void
tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end)
{
	/*
	 * First reported block MUST be the most recent one. Subsequent
	 * blocks SHOULD be in the order in which they arrived at the
	 * receiver. These two conditions make the implementation fully
	 * compliant with RFC 2018.
	 */
	struct sackblk head_blk, saved_blks[MAX_SACK_BLKS];
	int num_head, num_saved, i;

	INP_WLOCK_ASSERT(tptoinpcb(tp));

	/* Check arguments. */
	KASSERT(SEQ_LEQ(rcv_start, rcv_end), ("SEQ_GT(rcv_start, rcv_end)"));

	if ((rcv_start == rcv_end) &&
	    (tp->rcv_numsacks >= 1) &&
	    (rcv_end == tp->sackblks[0].end)) {
		/* retaining DSACK block below rcv_nxt (todrop) */
		head_blk = tp->sackblks[0];
	} else {
		/* SACK block for the received segment. */
		head_blk.start = rcv_start;
		head_blk.end = rcv_end;
	}

	/*
	 * Merge updated SACK blocks into head_blk, and save unchanged SACK
	 * blocks into saved_blks[]. num_saved will have the number of the
	 * saved SACK blocks.
	 */
	num_saved = 0;
	for (i = 0; i < tp->rcv_numsacks; i++) {
		tcp_seq start = tp->sackblks[i].start;
		tcp_seq end = tp->sackblks[i].end;
		if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) {
			/*
			 * Discard this SACK block.
			 */
		} else if (SEQ_LEQ(head_blk.start, end) &&
		    SEQ_GEQ(head_blk.end, start)) {
			/*
			 * Merge this SACK block into head_blk. This SACK
			 * block itself will be discarded.
			 */
			/*
			 * |-|
			 *   |---|    merge
			 *
			 *     |-|
			 * |---|      merge
			 *
			 * |-----|
			 *   |-|      DSACK smaller
			 *
			 *   |-|
			 * |-----|    DSACK smaller
			 */
			if (head_blk.start == end)
				head_blk.start = start;
			else if (head_blk.end == start)
				head_blk.end = end;
			else {
				if (SEQ_LT(head_blk.start, start)) {
					tcp_seq temp = start;
					start = head_blk.start;
					head_blk.start = temp;
				}
				if (SEQ_GT(head_blk.end, end)) {
					tcp_seq temp = end;
					end = head_blk.end;
					head_blk.end = temp;
				}
				if ((head_blk.start != start) ||
				    (head_blk.end != end)) {
					if ((num_saved >= 1) &&
					    SEQ_GEQ(saved_blks[num_saved-1].start, start) &&
					    SEQ_LEQ(saved_blks[num_saved-1].end, end))
						num_saved--;
					saved_blks[num_saved].start = start;
					saved_blks[num_saved].end = end;
					num_saved++;
				}
			}
		} else {
			/*
			 * This block supersedes the prior block
			 */
			if ((num_saved >= 1) &&
			    SEQ_GEQ(saved_blks[num_saved-1].start, start) &&
			    SEQ_LEQ(saved_blks[num_saved-1].end, end))
				num_saved--;
			/*
			 * Save this SACK block.
			 */
			saved_blks[num_saved].start = start;
			saved_blks[num_saved].end = end;
			num_saved++;
		}
	}

	/*
	 * Update SACK list in tp->sackblks[].
	 */
	num_head = 0;
	if (SEQ_LT(rcv_start, rcv_end)) {
		/*
		 * The received data segment is an out-of-order segment. Put
		 * head_blk at the top of SACK list.
		 */
		tp->sackblks[0] = head_blk;
		num_head = 1;
		/*
		 * If the number of saved SACK blocks exceeds its limit,
		 * discard the last SACK block.
		 */
		if (num_saved >= MAX_SACK_BLKS)
			num_saved--;
	}
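	/*
	 * When a D-SACK block below rcv_nxt is being retained (head_blk
	 * was taken from the top of the list above), it is already in
	 * place at sackblks[0]; just account for it here so the saved
	 * blocks are copied in behind it.
	 */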
	if ((rcv_start == rcv_end) &&
	    (rcv_start == tp->sackblks[0].end)) {
		num_head = 1;
	}
	if (num_saved > 0) {
		/*
		 * Copy the saved SACK blocks back.
		 */
		bcopy(saved_blks, &tp->sackblks[num_head],
		    sizeof(struct sackblk) * num_saved);
	}

	/* Save the number of SACK blocks. */
	tp->rcv_numsacks = num_head + num_saved;
}

void
tcp_clean_dsack_blocks(struct tcpcb *tp)
{
	struct sackblk saved_blks[MAX_SACK_BLKS];
	int num_saved, i;

	INP_WLOCK_ASSERT(tptoinpcb(tp));
	/*
	 * Clean up any DSACK blocks that
	 * are in our queue of sack blocks.
	 */
	num_saved = 0;
	for (i = 0; i < tp->rcv_numsacks; i++) {
		tcp_seq start = tp->sackblks[i].start;
		tcp_seq end = tp->sackblks[i].end;
		if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) {
			/*
			 * Discard this D-SACK block.
			 */
			continue;
		}
		/*
		 * Save this SACK block.
		 */
		saved_blks[num_saved].start = start;
		saved_blks[num_saved].end = end;
		num_saved++;
	}
	if (num_saved > 0) {
		/*
		 * Copy the saved SACK blocks back.
		 */
		bcopy(saved_blks, &tp->sackblks[0],
		    sizeof(struct sackblk) * num_saved);
	}
	tp->rcv_numsacks = num_saved;
}

/*
 * Delete all receiver-side SACK information.
 */
void
tcp_clean_sackreport(struct tcpcb *tp)
{
	int i;

	INP_WLOCK_ASSERT(tptoinpcb(tp));
	tp->rcv_numsacks = 0;
	for (i = 0; i < MAX_SACK_BLKS; i++)
		tp->sackblks[i].start = tp->sackblks[i].end = 0;
}

/*
 * Allocate struct sackhole.
 */
static struct sackhole *
tcp_sackhole_alloc(struct tcpcb *tp, tcp_seq start, tcp_seq end)
{
	struct sackhole *hole;

	if (tp->snd_numholes >= V_tcp_sack_maxholes ||
	    V_tcp_sack_globalholes >= V_tcp_sack_globalmaxholes) {
		TCPSTAT_INC(tcps_sack_sboverflow);
		return (NULL);
	}

	hole = (struct sackhole *)uma_zalloc(V_sack_hole_zone, M_NOWAIT);
	if (hole == NULL)
		return (NULL);

	hole->start = start;
	hole->end = end;
	hole->rxmit = start;

	tp->snd_numholes++;
	atomic_add_int(&V_tcp_sack_globalholes, 1);

	return (hole);
}

/*
 * Free struct sackhole.
 */
static void
tcp_sackhole_free(struct tcpcb *tp, struct sackhole *hole)
{

	uma_zfree(V_sack_hole_zone, hole);

	tp->snd_numholes--;
	atomic_subtract_int(&V_tcp_sack_globalholes, 1);

	KASSERT(tp->snd_numholes >= 0, ("tp->snd_numholes < 0"));
	KASSERT(V_tcp_sack_globalholes >= 0, ("tcp_sack_globalholes < 0"));
}

/*
 * Insert new SACK hole into scoreboard.
 */
static struct sackhole *
tcp_sackhole_insert(struct tcpcb *tp, tcp_seq start, tcp_seq end,
    struct sackhole *after)
{
	struct sackhole *hole;

	/* Allocate a new SACK hole. */
	hole = tcp_sackhole_alloc(tp, start, end);
	if (hole == NULL)
		return (NULL);

	/* Insert the new SACK hole into scoreboard. */
	if (after != NULL)
		TAILQ_INSERT_AFTER(&tp->snd_holes, after, hole, scblink);
	else
		TAILQ_INSERT_TAIL(&tp->snd_holes, hole, scblink);

	/* Update SACK hint. */
	if (tp->sackhint.nexthole == NULL)
		tp->sackhint.nexthole = hole;

	return (hole);
}

/*
 * Remove SACK hole from scoreboard.
 */
static void
tcp_sackhole_remove(struct tcpcb *tp, struct sackhole *hole)
{

	/* Update SACK hint. */
	if (tp->sackhint.nexthole == hole)
		tp->sackhint.nexthole = TAILQ_NEXT(hole, scblink);

	/* Remove this SACK hole. */
	TAILQ_REMOVE(&tp->snd_holes, hole, scblink);

	/* Free this SACK hole. */
	tcp_sackhole_free(tp, hole);
}

/*
 * Process cumulative ACK and the TCP SACK option to update the scoreboard.
 * tp->snd_holes is an ordered list of holes (oldest to newest, in terms of
 * the sequence space).
 * Returns SACK_NEWLOSS if the incoming ACK indicates ongoing loss (hole
 * split, new hole), SACK_CHANGE if the incoming ACK has previously unknown
 * SACK information, SACK_NOCHANGE otherwise.
 */
sackstatus_t
tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
{
	struct sackhole *cur, *temp;
	struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1], *sblkp;
	int i, j, num_sack_blks;
	sackstatus_t sack_changed;
	int delivered_data, left_edge_delta;
	int maxseg = tp->t_maxseg - MAX_TCPOPTLEN;

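	/*
	 * State for the RFC 6675 IsLost() heuristic: data is considered
	 * lost once at least tcprexmtthresh discontiguous SACKed blocks,
	 * or more than (tcprexmtthresh - 1) * SMSS SACKed bytes, lie
	 * above it. notlost_bytes collects the hole bytes still below
	 * that threshold.
	 */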
	tcp_seq loss_hiack = 0;
	int loss_thresh = 0;
	int loss_sblks = 0;
	int notlost_bytes = 0;

	INP_WLOCK_ASSERT(tptoinpcb(tp));

	num_sack_blks = 0;
	sack_changed = SACK_NOCHANGE;
	delivered_data = 0;
	left_edge_delta = 0;
	/*
	 * If SND.UNA will be advanced by SEG.ACK, and if SACK holes exist,
	 * treat [SND.UNA, SEG.ACK) as if it is a SACK block.
	 * Account changes to SND.UNA always in delivered data.
	 */
	if (SEQ_LT(tp->snd_una, th_ack) && !TAILQ_EMPTY(&tp->snd_holes)) {
		left_edge_delta = th_ack - tp->snd_una;
		delivered_data += left_edge_delta;
		sack_blocks[num_sack_blks].start = tp->snd_una;
		sack_blocks[num_sack_blks++].end = th_ack;
		/*
		 * Pulling snd_fack forward if we got here
		 * due to DSACK blocks
		 */
		if (SEQ_LT(tp->snd_fack, th_ack)) {
			tp->snd_fack = th_ack;
			sack_changed = SACK_CHANGE;
		}
	}
	/*
	 * Append received valid SACK blocks to sack_blocks[], but only if we
	 * received new blocks from the other side.
	 */
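	/*
	 * Note that blocks smaller than one full-sized segment are only
	 * accepted when they end at snd_max; this bounds the scoreboard
	 * work a peer can force by advertising many tiny SACK blocks.
	 */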
	if (to->to_flags & TOF_SACK) {
		for (i = 0; i < to->to_nsacks; i++) {
			bcopy((to->to_sacks + i * TCPOLEN_SACK),
			    &sack, sizeof(sack));
			sack.start = ntohl(sack.start);
			sack.end = ntohl(sack.end);
			if (SEQ_GT(sack.end, sack.start) &&
			    SEQ_GT(sack.start, tp->snd_una) &&
			    SEQ_GT(sack.start, th_ack) &&
			    SEQ_LT(sack.start, tp->snd_max) &&
			    SEQ_GT(sack.end, tp->snd_una) &&
			    SEQ_LEQ(sack.end, tp->snd_max) &&
			    ((sack.end - sack.start) >= maxseg ||
			    SEQ_GEQ(sack.end, tp->snd_max))) {
				sack_blocks[num_sack_blks++] = sack;
			} else if (SEQ_LEQ(sack.start, th_ack) &&
			    SEQ_LEQ(sack.end, th_ack)) {
				/*
				 * It's a D-SACK block.
				 */
				tcp_record_dsack(tp, sack.start, sack.end, 0);
			}
		}
	}
	/*
	 * Return if SND.UNA is not advanced and no valid SACK block is
	 * received.
	 */
	if (num_sack_blks == 0)
		return (sack_changed);

	/*
	 * Sort the SACK blocks so we can update the scoreboard with just one
	 * pass. The overhead of sorting up to 4+1 elements is less than
	 * making up to 4+1 passes over the scoreboard.
	 */
	for (i = 0; i < num_sack_blks; i++) {
		for (j = i + 1; j < num_sack_blks; j++) {
			if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
				sack = sack_blocks[i];
				sack_blocks[i] = sack_blocks[j];
				sack_blocks[j] = sack;
			}
		}
	}
	if (TAILQ_EMPTY(&tp->snd_holes)) {
		/*
		 * Empty scoreboard. Need to initialize snd_fack (it may be
		 * uninitialized or have a bogus value). Scoreboard holes
		 * (from the sack blocks received) are created later below
		 * (in the logic that adds holes to the tail of the
		 * scoreboard).
		 */
		tp->snd_fack = SEQ_MAX(tp->snd_una, th_ack);
	}
	/*
	 * In the while-loop below, incoming SACK blocks (sack_blocks[]) and
	 * SACK holes (snd_holes) are traversed from their tails with just
	 * one pass in order to reduce the number of compares especially when
	 * the bandwidth-delay product is large.
	 *
	 * Note: Typically, in the first RTT of SACK recovery, the highest
	 * three or four SACK blocks with the same ack number are received.
	 * In the second RTT, if retransmitted data segments are not lost,
	 * the highest three or four SACK blocks with ack number advancing
	 * are received.
	 */
	sblkp = &sack_blocks[num_sack_blks - 1];	/* Last SACK block */
	tp->sackhint.last_sack_ack = sblkp->end;
	if (SEQ_LT(tp->snd_fack, sblkp->start)) {
		/*
		 * The highest SACK block is beyond fack. First,
		 * check if there was a successful Rescue Retransmission,
		 * and move this hole left. With normal holes, snd_fack
		 * is always to the right of the end.
		 */
		if (((temp = TAILQ_LAST(&tp->snd_holes, sackhole_head)) != NULL) &&
		    SEQ_LEQ(tp->snd_fack, temp->end)) {
			tp->sackhint.hole_bytes -= temp->end - temp->start;
			temp->start = SEQ_MAX(tp->snd_fack, SEQ_MAX(tp->snd_una, th_ack));
			temp->end = sblkp->start;
			temp->rxmit = temp->start;
			delivered_data += sblkp->end - sblkp->start;
			tp->sackhint.hole_bytes += temp->end - temp->start;
			KASSERT(tp->sackhint.hole_bytes >= 0,
			    ("sackhint hole bytes < 0"));
			tp->snd_fack = sblkp->end;
			sblkp--;
			sack_changed = SACK_NEWLOSS;
		} else {
			/*
			 * Append a new SACK hole at the tail. If the
			 * second or later highest SACK blocks are also
			 * beyond the current fack, they will be inserted
			 * by way of hole splitting in the while-loop below.
			 */
			temp = tcp_sackhole_insert(tp, tp->snd_fack, sblkp->start, NULL);
			if (temp != NULL) {
				delivered_data += sblkp->end - sblkp->start;
				tp->sackhint.hole_bytes += temp->end - temp->start;
				tp->snd_fack = sblkp->end;
				/* Go to the previous sack block. */
				sblkp--;
				sack_changed = SACK_CHANGE;
			} else {
				/*
				 * We failed to add a new hole based on the current
				 * sack block. Skip over all the sack blocks that
				 * fall completely to the right of snd_fack and
				 * proceed to trim the scoreboard based on the
				 * remaining sack blocks. This also trims the
				 * scoreboard for th_ack (which is sack_blocks[0]).
				 */
				while (sblkp >= sack_blocks &&
				    SEQ_LT(tp->snd_fack, sblkp->start))
					sblkp--;
				if (sblkp >= sack_blocks &&
				    SEQ_LT(tp->snd_fack, sblkp->end)) {
					delivered_data += sblkp->end - tp->snd_fack;
					tp->snd_fack = sblkp->end;
					/*
					 * While the Scoreboard didn't change in
					 * size, we only ended up here because
					 * some SACK data had to be dismissed.
					 */
					sack_changed = SACK_NEWLOSS;
				}
			}
		}
	} else if (SEQ_LT(tp->snd_fack, sblkp->end)) {
		/* fack is advanced. */
		delivered_data += sblkp->end - tp->snd_fack;
		tp->snd_fack = sblkp->end;
		sack_changed = SACK_CHANGE;
	}
	cur = TAILQ_LAST(&tp->snd_holes, sackhole_head);	/* Last SACK hole. */
	loss_hiack = tp->snd_fack;

	/*
	 * Since the incoming sack blocks are sorted, we can process them
	 * making one sweep of the scoreboard.
	 */
	while (cur != NULL) {
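		/*
		 * If no SACK blocks remain, keep walking the leftover
		 * holes only to finish the lost/not-lost accounting.
		 */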
		if (!(sblkp >= sack_blocks)) {
			if (((loss_sblks >= tcprexmtthresh) ||
			    (loss_thresh > (tcprexmtthresh - 1) * tp->t_maxseg)))
				break;
			loss_thresh += loss_hiack - cur->end;
			loss_hiack = cur->start;
			loss_sblks++;
			if (!((loss_sblks >= tcprexmtthresh) ||
			    (loss_thresh > (tcprexmtthresh - 1) * tp->t_maxseg))) {
				notlost_bytes += cur->end - cur->start;
			} else {
				break;
			}
			cur = TAILQ_PREV(cur, sackhole_head, scblink);
			continue;
		}
		if (SEQ_GEQ(sblkp->start, cur->end)) {
			/*
			 * SACKs data beyond the current hole. Go to the
			 * previous sack block.
			 */
			sblkp--;
			continue;
		}
		if (SEQ_LEQ(sblkp->end, cur->start)) {
			/*
			 * SACKs data before the current hole. Go to the
			 * previous hole.
			 */
			loss_thresh += loss_hiack - cur->end;
			loss_hiack = cur->start;
			loss_sblks++;
			if (!((loss_sblks >= tcprexmtthresh) ||
			    (loss_thresh > (tcprexmtthresh - 1) * tp->t_maxseg)))
				notlost_bytes += cur->end - cur->start;
			cur = TAILQ_PREV(cur, sackhole_head, scblink);
			continue;
		}
		tp->sackhint.sack_bytes_rexmit -=
		    (SEQ_MIN(cur->rxmit, cur->end) - cur->start);
		KASSERT(tp->sackhint.sack_bytes_rexmit >= 0,
		    ("sackhint bytes rtx < 0"));
		sack_changed = SACK_CHANGE;
		if (SEQ_LEQ(sblkp->start, cur->start)) {
			/* Data acks at least the beginning of hole. */
			if (SEQ_GEQ(sblkp->end, cur->end)) {
				/* Acks entire hole, so delete hole. */
				delivered_data += (cur->end - cur->start);
				temp = cur;
				cur = TAILQ_PREV(cur, sackhole_head, scblink);
				tp->sackhint.hole_bytes -= temp->end - temp->start;
				tcp_sackhole_remove(tp, temp);
				/*
				 * The sack block may ack all or part of the
				 * next hole too, so continue onto the next
				 * hole.
				 */
				continue;
			} else {
				/* Move start of hole forward. */
				delivered_data += (sblkp->end - cur->start);
				tp->sackhint.hole_bytes -= sblkp->end - cur->start;
				cur->start = sblkp->end;
				cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
			}
		} else {
			/* Data acks at least the end of hole. */
			if (SEQ_GEQ(sblkp->end, cur->end)) {
				/* Move end of hole backward. */
				delivered_data += (cur->end - sblkp->start);
				tp->sackhint.hole_bytes -= cur->end - sblkp->start;
				cur->end = sblkp->start;
				cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
				if ((tp->t_flags & TF_LRD) && SEQ_GEQ(cur->rxmit, cur->end))
					cur->rxmit = SEQ_MAX(cur->rxmit, tp->snd_recover);
			} else {
				/*
				 * ACKs some data in middle of a hole; need
				 * to split current hole
				 */
				temp = tcp_sackhole_insert(tp, sblkp->end,
				    cur->end, cur);
				sack_changed = SACK_NEWLOSS;
				if (temp != NULL) {
					if (SEQ_GT(cur->rxmit, temp->rxmit)) {
						temp->rxmit = cur->rxmit;
						tp->sackhint.sack_bytes_rexmit +=
						    (SEQ_MIN(temp->rxmit,
						    temp->end) - temp->start);
					}
					tp->sackhint.hole_bytes -= sblkp->end - sblkp->start;
					loss_thresh += loss_hiack - temp->end;
					loss_hiack = temp->start;
					loss_sblks++;
					if (!((loss_sblks >= tcprexmtthresh) ||
					    (loss_thresh > (tcprexmtthresh - 1) * tp->t_maxseg)))
						notlost_bytes += temp->end - temp->start;
					cur->end = sblkp->start;
					cur->rxmit = SEQ_MIN(cur->rxmit,
					    cur->end);
					if ((tp->t_flags & TF_LRD) && SEQ_GEQ(cur->rxmit, cur->end))
						cur->rxmit = SEQ_MAX(cur->rxmit, tp->snd_recover);
					delivered_data += (sblkp->end - sblkp->start);
				}
			}
		}
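		/*
		 * Re-add what still remains to be retransmitted from this
		 * (possibly shrunk or split) hole; together with the
		 * subtraction above this keeps sack_bytes_rexmit accurate.
		 */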
		tp->sackhint.sack_bytes_rexmit +=
		    (SEQ_MIN(cur->rxmit, cur->end) - cur->start);
		/*
		 * Testing sblkp->start against cur->start tells us whether
		 * we're done with the sack block or the sack hole.
		 * Accordingly, we advance one or the other.
		 */
		if (SEQ_LEQ(sblkp->start, cur->start)) {
			loss_thresh += loss_hiack - cur->end;
			loss_hiack = cur->start;
			loss_sblks++;
			if (!((loss_sblks >= tcprexmtthresh) ||
			    (loss_thresh > (tcprexmtthresh - 1) * tp->t_maxseg)))
				notlost_bytes += cur->end - cur->start;
			cur = TAILQ_PREV(cur, sackhole_head, scblink);
		} else {
			sblkp--;
		}
	}

	KASSERT(delivered_data >= 0, ("delivered_data < 0"));
	KASSERT(notlost_bytes <= tp->sackhint.hole_bytes,
	    ("SACK: more bytes marked notlost than in scoreboard holes"));

	if (TAILQ_EMPTY(&tp->snd_holes)) {
		KASSERT(tp->sackhint.hole_bytes == 0,
		    ("SACK scoreboard empty, but sackhint hole bytes != 0"));
		tp->sackhint.sack_bytes_rexmit = 0;
		tp->sackhint.sacked_bytes = 0;
		tp->sackhint.lost_bytes = 0;
	} else {
		KASSERT(tp->sackhint.hole_bytes > 0,
		    ("SACK scoreboard not empty, but sackhint hole bytes <= 0"));
		tp->sackhint.delivered_data = delivered_data;
		tp->sackhint.sacked_bytes += delivered_data - left_edge_delta;
		KASSERT((tp->sackhint.sacked_bytes >= 0), ("sacked_bytes < 0"));
		tp->sackhint.lost_bytes = tp->sackhint.hole_bytes -
		    notlost_bytes;
	}

	if (!(to->to_flags & TOF_SACK))
		/*
		 * If this ACK did not contain any
		 * SACK blocks, and only moved the
		 * left edge to the right, it is a
		 * pure cumulative ACK. Do not count
		 * a DupAck for this. Also required
		 * for RFC 6675 rescue retransmission.
		 */
		sack_changed = SACK_NOCHANGE;
	return (sack_changed);
}

/*
 * Free all SACK holes to clear the scoreboard.
 */
void
tcp_free_sackholes(struct tcpcb *tp)
{
	struct sackhole *q;

	INP_WLOCK_ASSERT(tptoinpcb(tp));
	while ((q = TAILQ_FIRST(&tp->snd_holes)) != NULL)
		tcp_sackhole_remove(tp, q);
	tp->sackhint.sack_bytes_rexmit = 0;
	tp->sackhint.delivered_data = 0;
	tp->sackhint.sacked_bytes = 0;
	tp->sackhint.hole_bytes = 0;
	tp->sackhint.lost_bytes = 0;

	KASSERT(tp->snd_numholes == 0, ("tp->snd_numholes != 0"));
	KASSERT(tp->sackhint.nexthole == NULL,
	    ("tp->sackhint.nexthole != NULL"));
}

/*
 * Resend all the currently existing SACK holes of
 * the scoreboard. This is in line with the Errata to
 * RFC 2018, which allows the use of SACK data past
 * an RTO, typically to good effect.
 */
void
tcp_resend_sackholes(struct tcpcb *tp)
{
	struct sackhole *p;

	INP_WLOCK_ASSERT(tptoinpcb(tp));
	TAILQ_FOREACH(p, &tp->snd_holes, scblink) {
		p->rxmit = p->start;
	}
	tp->sackhint.nexthole = TAILQ_FIRST(&tp->snd_holes);
	tp->sackhint.sack_bytes_rexmit = 0;
}

/*
 * Partial ack handling within a sack recovery episode. Keeping this very
 * simple for now. When a partial ack is received, force snd_cwnd to a value
 * that will allow the sender to transmit no more than 2 segments. If
 * necessary, a better scheme can be adopted at a later point, but for now,
 * the goal is to prevent the sender from bursting a large amount of data in
 * the midst of sack recovery.
 */
void
tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th, u_int *maxsegp)
{
	struct sackhole *temp;
	int num_segs = 1;
	u_int maxseg;

	INP_WLOCK_ASSERT(tptoinpcb(tp));

	if (*maxsegp == 0) {
		*maxsegp = tcp_maxseg(tp);
	}
	maxseg = *maxsegp;
	tcp_timer_activate(tp, TT_REXMT, 0);
	tp->t_rtttime = 0;
	/* Send one or 2 segments based on how much new data was acked. */
	if ((BYTES_THIS_ACK(tp, th) / maxseg) >= 2)
		num_segs = 2;
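	/*
	 * With snd_nxt at snd_max, size cwnd to the estimate of data
	 * still in flight (scoreboard retransmits plus data sent beyond
	 * snd_recover) plus num_segs segments, so only about num_segs
	 * segments can be clocked out by this ACK.
	 */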
	if (tp->snd_nxt == tp->snd_max) {
		tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit +
		    (tp->snd_nxt - tp->snd_recover) + num_segs * maxseg);
	} else {
		/*
		 * Since cwnd is not the expected flightsize during
		 * SACK LR, not deflating cwnd allows the partial
		 * ACKed amount to be sent.
		 */
	}
	if (tp->snd_cwnd > tp->snd_ssthresh)
		tp->snd_cwnd = tp->snd_ssthresh;
	tp->t_flags |= TF_ACKNOW;
	/*
	 * RFC 6675 rescue retransmission
	 * Add a hole between th_ack (snd_una is not yet set) and snd_max,
	 * if this was a pure cumulative ACK and no data was sent beyond
	 * the recovery point. Since the data in the socket has not been
	 * freed at this point, we check if the scoreboard is empty, and
	 * the ACK delivered some new data, indicating a full ACK. Also,
	 * if the recovery point is still at snd_max, we are probably
	 * application limited. However, this inference might not always
	 * be true. The rescue retransmission may rarely be slightly
	 * premature compared to RFC 6675.
	 * The corresponding ACK+SACK will cause any further outstanding
	 * segments to be retransmitted. This addresses a corner case, when
	 * the trailing packets of a window are lost and no further data
	 * is available for sending.
	 */
	if ((V_tcp_do_newsack) &&
	    SEQ_LT(th->th_ack, tp->snd_recover) &&
	    TAILQ_EMPTY(&tp->snd_holes) &&
	    (tp->sackhint.delivered_data > 0)) {
		/*
		 * Exclude FIN sequence space in
		 * the hole for the rescue retransmission,
		 * and also don't create a hole, if only
		 * the ACK for a FIN is outstanding.
		 */
		tcp_seq highdata = tp->snd_max;
		if (tp->t_flags & TF_SENTFIN)
			highdata--;
		highdata = SEQ_MIN(highdata, tp->snd_recover);
		if (SEQ_LT(th->th_ack, highdata)) {
			tp->snd_fack = SEQ_MAX(th->th_ack, tp->snd_fack);
			if ((temp = tcp_sackhole_insert(tp, SEQ_MAX(th->th_ack,
			    highdata - maxseg), highdata, NULL)) != NULL) {
				tp->sackhint.hole_bytes +=
				    temp->end - temp->start;
			}
		}
	}
	(void) tcp_output(tp);
}

/*
 * Returns the next hole to retransmit and the number of retransmitted bytes
 * from the scoreboard. We store both the next hole and the number of
 * retransmitted bytes as hints (and recompute these on the fly upon SACK/ACK
 * reception). This avoids scoreboard traversals completely.
 *
 * The loop here will traverse *at most* one link. Here's the argument. For
 * the loop to traverse more than 1 link before finding the next hole to
 * retransmit, we would need to have at least 1 node following the current
 * hint with (rxmit == end). But, for all holes following the current hint,
 * (start == rxmit), since we have not yet retransmitted from them.
 * Therefore, in order to traverse more than 1 link in the loop below, we
 * need to have at least one node following the current hint with (start ==
 * rxmit == end). But that can't happen: (start == end) means that all the
 * data in that hole has been sacked, in which case the hole would have
 * been removed from the scoreboard.
 */
struct sackhole *
tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt)
{
	struct sackhole *hole = NULL;

	INP_WLOCK_ASSERT(tptoinpcb(tp));
	*sack_bytes_rexmt = tp->sackhint.sack_bytes_rexmit;
	hole = tp->sackhint.nexthole;
	if (hole == NULL)
		return (hole);
	if (SEQ_GEQ(hole->rxmit, hole->end)) {
		for (;;) {
			hole = TAILQ_NEXT(hole, scblink);
			if (hole == NULL)
				return (hole);
			if (SEQ_LT(hole->rxmit, hole->end)) {
				tp->sackhint.nexthole = hole;
				break;
			}
		}
	}
	KASSERT(SEQ_LT(hole->start, hole->end),
	    ("%s: SEQ_GEQ(hole.start, hole.end)", __func__));
	if (!(V_tcp_do_newsack)) {
		KASSERT(SEQ_LT(hole->start, tp->snd_fack),
		    ("%s: SEQ_GEQ(hole.start, snd.fack)", __func__));
		KASSERT(SEQ_LT(hole->end, tp->snd_fack),
		    ("%s: SEQ_GEQ(hole.end, snd.fack)", __func__));
		KASSERT(SEQ_LT(hole->rxmit, tp->snd_fack),
		    ("%s: SEQ_GEQ(hole.rxmit, snd.fack)", __func__));
		if (SEQ_GEQ(hole->start, hole->end) ||
		    SEQ_GEQ(hole->start, tp->snd_fack) ||
		    SEQ_GEQ(hole->end, tp->snd_fack) ||
		    SEQ_GEQ(hole->rxmit, tp->snd_fack)) {
			log(LOG_CRIT, "tcp: invalid SACK hole (%u-%u,%u) vs fwd ack %u, ignoring.\n",
			    hole->start, hole->end, hole->rxmit, tp->snd_fack);
			return (NULL);
		}
	}
	return (hole);
}

/*
 * After a timeout, the SACK list may be rebuilt. This SACK information
 * should be used to avoid retransmitting SACKed data. This function
 * traverses the SACK list to see if snd_nxt should be moved forward.
 * In addition, cwnd will be inflated by the sacked bytes traversed when
 * moving snd_nxt forward. This prevents a traffic burst after the final
 * full ACK, and also keeps ACKs coming back.
 */
int
tcp_sack_adjust(struct tcpcb *tp)
{
	int sacked = 0;
	struct sackhole *p, *cur = TAILQ_FIRST(&tp->snd_holes);

	INP_WLOCK_ASSERT(tptoinpcb(tp));
	if (cur == NULL) {
		/* No holes */
		return (0);
	}
	if (SEQ_GEQ(tp->snd_nxt, tp->snd_fack)) {
		/* We're already beyond any SACKed blocks */
		return (tp->sackhint.sacked_bytes);
	}
	/*
	 * Two cases for which we want to advance snd_nxt:
	 * i) snd_nxt lies between end of one hole and beginning of another
	 * ii) snd_nxt lies between end of last hole and snd_fack
	 */
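	/*
	 *            hole 0           hole 1
	 *      ---|==========|-------|======|---------|
	 *      snd_una      SACKed          SACKed  snd_fack
	 * The ranges between holes (and between the last hole and
	 * snd_fack) are SACKed; snd_nxt is advanced across them, and
	 * "sacked" accumulates the bytes skipped so the caller can
	 * inflate cwnd by that amount.
	 */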
	while ((p = TAILQ_NEXT(cur, scblink)) != NULL) {
		if (SEQ_LT(tp->snd_nxt, cur->end)) {
			return (sacked);
		}
		sacked += p->start - cur->end;
		if (SEQ_GEQ(tp->snd_nxt, p->start)) {
			cur = p;
		} else {
			tp->snd_nxt = p->start;
			return (sacked);
		}
	}
	if (SEQ_LT(tp->snd_nxt, cur->end)) {
		return (sacked);
	}
	tp->snd_nxt = tp->snd_fack;
	return (tp->sackhint.sacked_bytes);
}

/*
 * Lost Retransmission Detection
 * Check if FACK is beyond the rexmit of the leftmost hole.
 * If yes, we restart sending from still existing holes,
 * and adjust cwnd via the congestion control module.
 */
void
tcp_sack_lost_retransmission(struct tcpcb *tp, struct tcphdr *th)
{
	struct sackhole *temp;

	if (IN_RECOVERY(tp->t_flags) &&
	    SEQ_GT(tp->snd_fack, tp->snd_recover) &&
	    ((temp = TAILQ_FIRST(&tp->snd_holes)) != NULL) &&
	    SEQ_GEQ(temp->rxmit, temp->end) &&
	    SEQ_GEQ(tp->snd_fack, temp->rxmit)) {
		TCPSTAT_INC(tcps_sack_lostrexmt);
		/*
		 * Start retransmissions from the first hole, and
		 * subsequently all other remaining holes, including
		 * those, which had been sent completely before.
		 */
		tp->sackhint.nexthole = temp;
		TAILQ_FOREACH(temp, &tp->snd_holes, scblink) {
			if (SEQ_GEQ(tp->snd_fack, temp->rxmit) &&
			    SEQ_GEQ(temp->rxmit, temp->end))
				temp->rxmit = temp->start;
		}
		/*
		 * Remember the old ssthresh, to deduct the beta factor used
		 * by the CC module. Finally, set cwnd to ssthresh just
		 * prior to invoking another cwnd reduction by the CC
		 * module, to not shrink it excessively.
		 */
		tp->snd_cwnd = tp->snd_ssthresh;
		/*
		 * Formally exit recovery, and let the CC module adjust
		 * ssthresh as intended.
		 */
		EXIT_RECOVERY(tp->t_flags);
		cc_cong_signal(tp, th, CC_NDUPACK);
		/*
		 * For PRR, adjust recover_fs as if this new reduction
		 * initialized this variable.
		 * cwnd will be adjusted by SACK or PRR processing
		 * subsequently, only set it to a safe value here.
		 */
		tp->snd_cwnd = tcp_maxseg(tp);
		tp->sackhint.recover_fs = (tp->snd_max - tp->snd_una) -
		    tp->sackhint.recover_fs;
	}
}