xref: /freebsd/sys/netinet/tcp_stacks/rack.c (revision c4e127e24dc9f1322ebe7ade0991de7022010bf1)
1 /*-
2  * Copyright (c) 2016-2019 Netflix, Inc.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 #include "opt_inet.h"
31 #include "opt_inet6.h"
32 #include "opt_ipsec.h"
33 #include "opt_tcpdebug.h"
34 
35 #include <sys/param.h>
36 #include <sys/module.h>
37 #include <sys/kernel.h>
38 #ifdef TCP_HHOOK
39 #include <sys/hhook.h>
40 #endif
41 #include <sys/lock.h>
42 #include <sys/malloc.h>
43 #include <sys/lock.h>
44 #include <sys/mutex.h>
45 #include <sys/mbuf.h>
46 #include <sys/proc.h>		/* for proc0 declaration */
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/sysctl.h>
50 #include <sys/systm.h>
51 #ifdef NETFLIX_STATS
52 #include <sys/stats.h>
53 #endif
54 #include <sys/refcount.h>
55 #include <sys/queue.h>
56 #include <sys/smp.h>
57 #include <sys/kthread.h>
58 #include <sys/kern_prefetch.h>
59 
60 #include <vm/uma.h>
61 
62 #include <net/route.h>
63 #include <net/vnet.h>
64 
65 #define TCPSTATES		/* for logging */
66 
67 #include <netinet/in.h>
68 #include <netinet/in_kdtrace.h>
69 #include <netinet/in_pcb.h>
70 #include <netinet/ip.h>
71 #include <netinet/ip_icmp.h>	/* required for icmp_var.h */
72 #include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
73 #include <netinet/ip_var.h>
74 #include <netinet/ip6.h>
75 #include <netinet6/in6_pcb.h>
76 #include <netinet6/ip6_var.h>
77 #include <netinet/tcp.h>
78 #define	TCPOUTFLAGS
79 #include <netinet/tcp_fsm.h>
80 #include <netinet/tcp_log_buf.h>
81 #include <netinet/tcp_seq.h>
82 #include <netinet/tcp_timer.h>
83 #include <netinet/tcp_var.h>
84 #include <netinet/tcp_hpts.h>
85 #include <netinet/tcpip.h>
86 #include <netinet/cc/cc.h>
87 #ifdef NETFLIX_CWV
88 #include <netinet/tcp_newcwv.h>
89 #endif
90 #include <netinet/tcp_fastopen.h>
91 #ifdef TCPDEBUG
92 #include <netinet/tcp_debug.h>
93 #endif				/* TCPDEBUG */
94 #ifdef TCP_OFFLOAD
95 #include <netinet/tcp_offload.h>
96 #endif
97 #ifdef INET6
98 #include <netinet6/tcp6_var.h>
99 #endif
100 
101 #include <netipsec/ipsec_support.h>
102 
103 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
104 #include <netipsec/ipsec.h>
105 #include <netipsec/ipsec6.h>
106 #endif				/* IPSEC */
107 
108 #include <netinet/udp.h>
109 #include <netinet/udp_var.h>
110 #include <machine/in_cksum.h>
111 
112 #ifdef MAC
113 #include <security/mac/mac_framework.h>
114 #endif
115 #include "sack_filter.h"
116 #include "tcp_rack.h"
117 #include "rack_bbr_common.h"
118 
119 uma_zone_t rack_zone;
120 uma_zone_t rack_pcb_zone;
121 
122 #ifndef TICKS2SBT
123 #define	TICKS2SBT(__t)	(tick_sbt * ((sbintime_t)(__t)))
124 #endif
125 
126 struct sysctl_ctx_list rack_sysctl_ctx;
127 struct sysctl_oid *rack_sysctl_root;
128 
129 #define CUM_ACKED 1
130 #define SACKED 2
131 
132 /*
133  * The RACK module incorporates a number of
134  * TCP ideas that have been put out into the IETF
135  * over the last few years:
136  * - Matt Mathis's Rate Halving which slowly drops
137  *    the congestion window so that the ack clock can
138  *    be maintained during a recovery.
139  * - Yuchung Cheng's RACK TCP (for which its named) that
140  *    will stop us using the number of dup acks and instead
141  *    use time as the gage of when we retransmit.
142  * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
143  *    of Dukkipati et.al.
144  * RACK depends on SACK, so if an endpoint arrives that
145  * cannot do SACK the state machine below will shuttle the
146  * connection back to using the "default" TCP stack that is
147  * in FreeBSD.
148  *
149  * To implement RACK the original TCP stack was first decomposed
150  * into a functional state machine with individual states
151  * for each of the possible TCP connection states. The do_segement
152  * functions role in life is to mandate the connection supports SACK
153  * initially and then assure that the RACK state matches the conenction
154  * state before calling the states do_segment function. Each
155  * state is simplified due to the fact that the original do_segment
156  * has been decomposed and we *know* what state we are in (no
157  * switches on the state) and all tests for SACK are gone. This
158  * greatly simplifies what each state does.
159  *
160  * TCP output is also over-written with a new version since it
161  * must maintain the new rack scoreboard.
162  *
163  */
164 static int32_t rack_precache = 1;
165 static int32_t rack_tlp_thresh = 1;
166 static int32_t rack_reorder_thresh = 2;
167 static int32_t rack_reorder_fade = 60000;	/* 0 - never fade, def 60,000
168 						 * - 60 seconds */
169 static int32_t rack_pkt_delay = 1;
170 static int32_t rack_inc_var = 0;/* For TLP */
171 static int32_t rack_reduce_largest_on_idle = 0;
172 static int32_t rack_min_pace_time = 0;
173 static int32_t rack_min_pace_time_seg_req=6;
174 static int32_t rack_early_recovery = 1;
175 static int32_t rack_early_recovery_max_seg = 6;
176 static int32_t rack_send_a_lot_in_prr = 1;
177 static int32_t rack_min_to = 1;	/* Number of ms minimum timeout */
178 static int32_t rack_tlp_in_recovery = 1;	/* Can we do TLP in recovery? */
179 static int32_t rack_verbose_logging = 0;
180 static int32_t rack_ignore_data_after_close = 1;
181 /*
182  * Currently regular tcp has a rto_min of 30ms
183  * the backoff goes 12 times so that ends up
184  * being a total of 122.850 seconds before a
185  * connection is killed.
186  */
187 static int32_t rack_tlp_min = 10;
188 static int32_t rack_rto_min = 30;	/* 30ms same as main freebsd */
189 static int32_t rack_rto_max = 30000;	/* 30 seconds */
190 static const int32_t rack_free_cache = 2;
191 static int32_t rack_hptsi_segments = 40;
192 static int32_t rack_rate_sample_method = USE_RTT_LOW;
193 static int32_t rack_pace_every_seg = 1;
194 static int32_t rack_delayed_ack_time = 200;	/* 200ms */
195 static int32_t rack_slot_reduction = 4;
196 static int32_t rack_lower_cwnd_at_tlp = 0;
197 static int32_t rack_use_proportional_reduce = 0;
198 static int32_t rack_proportional_rate = 10;
199 static int32_t rack_tlp_max_resend = 2;
200 static int32_t rack_limited_retran = 0;
201 static int32_t rack_always_send_oldest = 0;
202 static int32_t rack_sack_block_limit = 128;
203 static int32_t rack_use_sack_filter = 1;
204 static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;
205 static uint32_t rack_map_split_limit = 0;	/* unlimited by default */
206 
207 /* Rack specific counters */
208 counter_u64_t rack_badfr;
209 counter_u64_t rack_badfr_bytes;
210 counter_u64_t rack_rtm_prr_retran;
211 counter_u64_t rack_rtm_prr_newdata;
212 counter_u64_t rack_timestamp_mismatch;
213 counter_u64_t rack_reorder_seen;
214 counter_u64_t rack_paced_segments;
215 counter_u64_t rack_unpaced_segments;
216 counter_u64_t rack_saw_enobuf;
217 counter_u64_t rack_saw_enetunreach;
218 
219 /* Tail loss probe counters */
220 counter_u64_t rack_tlp_tot;
221 counter_u64_t rack_tlp_newdata;
222 counter_u64_t rack_tlp_retran;
223 counter_u64_t rack_tlp_retran_bytes;
224 counter_u64_t rack_tlp_retran_fail;
225 counter_u64_t rack_to_tot;
226 counter_u64_t rack_to_arm_rack;
227 counter_u64_t rack_to_arm_tlp;
228 counter_u64_t rack_to_alloc;
229 counter_u64_t rack_to_alloc_hard;
230 counter_u64_t rack_to_alloc_emerg;
231 counter_u64_t rack_alloc_limited_conns;
232 counter_u64_t rack_split_limited;
233 
234 counter_u64_t rack_sack_proc_all;
235 counter_u64_t rack_sack_proc_short;
236 counter_u64_t rack_sack_proc_restart;
237 counter_u64_t rack_runt_sacks;
238 counter_u64_t rack_used_tlpmethod;
239 counter_u64_t rack_used_tlpmethod2;
240 counter_u64_t rack_enter_tlp_calc;
241 counter_u64_t rack_input_idle_reduces;
242 counter_u64_t rack_tlp_does_nada;
243 
244 /* Temp CPU counters */
245 counter_u64_t rack_find_high;
246 
247 counter_u64_t rack_progress_drops;
248 counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
249 counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];
250 
251 static void
252 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line);
253 
254 static int
255 rack_process_ack(struct mbuf *m, struct tcphdr *th,
256     struct socket *so, struct tcpcb *tp, struct tcpopt *to,
257     uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
258 static int
259 rack_process_data(struct mbuf *m, struct tcphdr *th,
260     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
261     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
262 static void
263 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
264     struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery);
265 static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
266 static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack,
267     uint8_t limit_type);
268 static struct rack_sendmap *
269 rack_check_recovery_mode(struct tcpcb *tp,
270     uint32_t tsused);
271 static void
272 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th,
273     uint32_t type);
274 static void rack_counter_destroy(void);
275 static int
276 rack_ctloutput(struct socket *so, struct sockopt *sopt,
277     struct inpcb *inp, struct tcpcb *tp);
278 static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
279 static void
280 rack_do_segment(struct mbuf *m, struct tcphdr *th,
281     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
282     uint8_t iptos);
283 static void rack_dtor(void *mem, int32_t size, void *arg);
284 static void
285 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
286     uint32_t t, uint32_t cts);
287 static struct rack_sendmap *
288 rack_find_high_nonack(struct tcp_rack *rack,
289     struct rack_sendmap *rsm);
290 static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
291 static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
292 static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
293 static int
294 rack_get_sockopt(struct socket *so, struct sockopt *sopt,
295     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
296 static int32_t rack_handoff_ok(struct tcpcb *tp);
297 static int32_t rack_init(struct tcpcb *tp);
298 static void rack_init_sysctls(void);
299 static void
300 rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
301     struct tcphdr *th);
302 static void
303 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
304     uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
305     uint8_t pass, struct rack_sendmap *hintrsm);
306 static void
307 rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
308     struct rack_sendmap *rsm);
309 static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num);
310 static int32_t rack_output(struct tcpcb *tp);
311 static void
312 rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th,
313     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
314     uint8_t iptos, int32_t nxt_pkt, struct timeval *tv);
315 
316 static uint32_t
317 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
318     struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
319     uint32_t cts);
320 static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th);
321 static void rack_remxt_tmr(struct tcpcb *tp);
322 static int
323 rack_set_sockopt(struct socket *so, struct sockopt *sopt,
324     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
325 static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
326 static int32_t rack_stopall(struct tcpcb *tp);
327 static void
328 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type,
329     uint32_t delta);
330 static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
331 static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
332 static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
333 static uint32_t
334 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
335     struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp);
336 static void
337 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
338     struct rack_sendmap *rsm, uint32_t ts);
339 static int
340 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
341     struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type);
342 static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
343 static void
344 rack_challenge_ack(struct mbuf *m, struct tcphdr *th,
345     struct tcpcb *tp, int32_t * ret_val);
346 static int
347 rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
348     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
349     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
350 static int
351 rack_do_closing(struct mbuf *m, struct tcphdr *th,
352     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
353     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
354 static void
355 rack_do_drop(struct mbuf *m, struct tcpcb *tp);
356 static void
357 rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp,
358     struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val);
359 static void
360 rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp,
361 	struct tcphdr *th, int32_t rstreason, int32_t tlen);
362 static int
363 rack_do_established(struct mbuf *m, struct tcphdr *th,
364     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
365     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
366 static int
367 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
368     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
369     int32_t tlen, uint32_t tiwin, int32_t nxt_pkt);
370 static int
371 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
372     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
373     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
374 static int
375 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
376     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
377     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
378 static int
379 rack_do_lastack(struct mbuf *m, struct tcphdr *th,
380     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
381     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
382 static int
383 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
384     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
385     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
386 static int
387 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
388     struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
389     int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
390 static int
391 rack_drop_checks(struct tcpopt *to, struct mbuf *m,
392     struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf,
393     int32_t * drop_hdrlen, int32_t * ret_val);
394 static int
395 rack_process_rst(struct mbuf *m, struct tcphdr *th,
396     struct socket *so, struct tcpcb *tp);
397 struct rack_sendmap *
398 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
399     uint32_t tsused);
400 static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt);
401 static void
402      tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th);
403 
404 static int
405 rack_ts_check(struct mbuf *m, struct tcphdr *th,
406     struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val);
407 
408 int32_t rack_clear_counter=0;
409 
410 
411 static int
412 sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
413 {
414 	uint32_t stat;
415 	int32_t error;
416 
417 	error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
418 	if (error || req->newptr == NULL)
419 		return error;
420 
421 	error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
422 	if (error)
423 		return (error);
424 	if (stat == 1) {
425 #ifdef INVARIANTS
426 		printf("Clearing RACK counters\n");
427 #endif
428 		counter_u64_zero(rack_badfr);
429 		counter_u64_zero(rack_badfr_bytes);
430 		counter_u64_zero(rack_rtm_prr_retran);
431 		counter_u64_zero(rack_rtm_prr_newdata);
432 		counter_u64_zero(rack_timestamp_mismatch);
433 		counter_u64_zero(rack_reorder_seen);
434 		counter_u64_zero(rack_tlp_tot);
435 		counter_u64_zero(rack_tlp_newdata);
436 		counter_u64_zero(rack_tlp_retran);
437 		counter_u64_zero(rack_tlp_retran_bytes);
438 		counter_u64_zero(rack_tlp_retran_fail);
439 		counter_u64_zero(rack_to_tot);
440 		counter_u64_zero(rack_to_arm_rack);
441 		counter_u64_zero(rack_to_arm_tlp);
442 		counter_u64_zero(rack_paced_segments);
443 		counter_u64_zero(rack_unpaced_segments);
444 		counter_u64_zero(rack_saw_enobuf);
445 		counter_u64_zero(rack_saw_enetunreach);
446 		counter_u64_zero(rack_to_alloc_hard);
447 		counter_u64_zero(rack_to_alloc_emerg);
448 		counter_u64_zero(rack_sack_proc_all);
449 		counter_u64_zero(rack_sack_proc_short);
450 		counter_u64_zero(rack_sack_proc_restart);
451 		counter_u64_zero(rack_to_alloc);
452 		counter_u64_zero(rack_alloc_limited_conns);
453 		counter_u64_zero(rack_split_limited);
454 		counter_u64_zero(rack_find_high);
455 		counter_u64_zero(rack_runt_sacks);
456 		counter_u64_zero(rack_used_tlpmethod);
457 		counter_u64_zero(rack_used_tlpmethod2);
458 		counter_u64_zero(rack_enter_tlp_calc);
459 		counter_u64_zero(rack_progress_drops);
460 		counter_u64_zero(rack_tlp_does_nada);
461 	}
462 	rack_clear_counter = 0;
463 	return (0);
464 }
465 
466 
467 
468 static void
469 rack_init_sysctls()
470 {
471 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
472 	    SYSCTL_CHILDREN(rack_sysctl_root),
473 	    OID_AUTO, "rate_sample_method", CTLFLAG_RW,
474 	    &rack_rate_sample_method , USE_RTT_LOW,
475 	    "What method should we use for rate sampling 0=high, 1=low ");
476 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
477 	    SYSCTL_CHILDREN(rack_sysctl_root),
478 	    OID_AUTO, "data_after_close", CTLFLAG_RW,
479 	    &rack_ignore_data_after_close, 0,
480 	    "Do we hold off sending a RST until all pending data is ack'd");
481 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
482 	    SYSCTL_CHILDREN(rack_sysctl_root),
483 	    OID_AUTO, "tlpmethod", CTLFLAG_RW,
484 	    &rack_tlp_threshold_use, TLP_USE_TWO_ONE,
485 	    "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2");
486 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
487 	    SYSCTL_CHILDREN(rack_sysctl_root),
488 	    OID_AUTO, "min_pace_time", CTLFLAG_RW,
489 	    &rack_min_pace_time, 0,
490 	    "Should we enforce a minimum pace time of 1ms");
491 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
492 	    SYSCTL_CHILDREN(rack_sysctl_root),
493 	    OID_AUTO, "min_pace_segs", CTLFLAG_RW,
494 	    &rack_min_pace_time_seg_req, 6,
495 	    "How many segments have to be in the len to enforce min-pace-time");
496 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
497 	    SYSCTL_CHILDREN(rack_sysctl_root),
498 	    OID_AUTO, "idle_reduce_high", CTLFLAG_RW,
499 	    &rack_reduce_largest_on_idle, 0,
500 	    "Should we reduce the largest cwnd seen to IW on idle reduction");
501 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
502 	    SYSCTL_CHILDREN(rack_sysctl_root),
503 	    OID_AUTO, "bb_verbose", CTLFLAG_RW,
504 	    &rack_verbose_logging, 0,
505 	    "Should RACK black box logging be verbose");
506 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
507 	    SYSCTL_CHILDREN(rack_sysctl_root),
508 	    OID_AUTO, "sackfiltering", CTLFLAG_RW,
509 	    &rack_use_sack_filter, 1,
510 	    "Do we use sack filtering?");
511 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
512 	    SYSCTL_CHILDREN(rack_sysctl_root),
513 	    OID_AUTO, "delayed_ack", CTLFLAG_RW,
514 	    &rack_delayed_ack_time, 200,
515 	    "Delayed ack time (200ms)");
516 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
517 	    SYSCTL_CHILDREN(rack_sysctl_root),
518 	    OID_AUTO, "tlpminto", CTLFLAG_RW,
519 	    &rack_tlp_min, 10,
520 	    "TLP minimum timeout per the specification (10ms)");
521 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
522 	    SYSCTL_CHILDREN(rack_sysctl_root),
523 	    OID_AUTO, "precache", CTLFLAG_RW,
524 	    &rack_precache, 0,
525 	    "Where should we precache the mcopy (0 is not at all)");
526 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
527 	    SYSCTL_CHILDREN(rack_sysctl_root),
528 	    OID_AUTO, "sblklimit", CTLFLAG_RW,
529 	    &rack_sack_block_limit, 128,
530 	    "When do we start paying attention to small sack blocks");
531 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
532 	    SYSCTL_CHILDREN(rack_sysctl_root),
533 	    OID_AUTO, "send_oldest", CTLFLAG_RW,
534 	    &rack_always_send_oldest, 1,
535 	    "Should we always send the oldest TLP and RACK-TLP");
536 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
537 	    SYSCTL_CHILDREN(rack_sysctl_root),
538 	    OID_AUTO, "rack_tlp_in_recovery", CTLFLAG_RW,
539 	    &rack_tlp_in_recovery, 1,
540 	    "Can we do a TLP during recovery?");
541 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
542 	    SYSCTL_CHILDREN(rack_sysctl_root),
543 	    OID_AUTO, "rack_tlimit", CTLFLAG_RW,
544 	    &rack_limited_retran, 0,
545 	    "How many times can a rack timeout drive out sends");
546 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
547 	    SYSCTL_CHILDREN(rack_sysctl_root),
548 	    OID_AUTO, "minrto", CTLFLAG_RW,
549 	    &rack_rto_min, 0,
550 	    "Minimum RTO in ms -- set with caution below 1000 due to TLP");
551 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
552 	    SYSCTL_CHILDREN(rack_sysctl_root),
553 	    OID_AUTO, "maxrto", CTLFLAG_RW,
554 	    &rack_rto_max, 0,
555 	    "Maxiumum RTO in ms -- should be at least as large as min_rto");
556 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
557 	    SYSCTL_CHILDREN(rack_sysctl_root),
558 	    OID_AUTO, "tlp_retry", CTLFLAG_RW,
559 	    &rack_tlp_max_resend, 2,
560 	    "How many times does TLP retry a single segment or multiple with no ACK");
561 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
562 	    SYSCTL_CHILDREN(rack_sysctl_root),
563 	    OID_AUTO, "recovery_loss_prop", CTLFLAG_RW,
564 	    &rack_use_proportional_reduce, 0,
565 	    "Should we proportionaly reduce cwnd based on the number of losses ");
566 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
567 	    SYSCTL_CHILDREN(rack_sysctl_root),
568 	    OID_AUTO, "recovery_prop", CTLFLAG_RW,
569 	    &rack_proportional_rate, 10,
570 	    "What percent reduction per loss");
571 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
572 	    SYSCTL_CHILDREN(rack_sysctl_root),
573 	    OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
574 	    &rack_lower_cwnd_at_tlp, 0,
575 	    "When a TLP completes a retran should we enter recovery?");
576 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
577 	    SYSCTL_CHILDREN(rack_sysctl_root),
578 	    OID_AUTO, "hptsi_reduces", CTLFLAG_RW,
579 	    &rack_slot_reduction, 4,
580 	    "When setting a slot should we reduce by divisor");
581 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
582 	    SYSCTL_CHILDREN(rack_sysctl_root),
583 	    OID_AUTO, "hptsi_every_seg", CTLFLAG_RW,
584 	    &rack_pace_every_seg, 1,
585 	    "Should we pace out every segment hptsi");
586 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
587 	    SYSCTL_CHILDREN(rack_sysctl_root),
588 	    OID_AUTO, "hptsi_seg_max", CTLFLAG_RW,
589 	    &rack_hptsi_segments, 6,
590 	    "Should we pace out only a limited size of segments");
591 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
592 	    SYSCTL_CHILDREN(rack_sysctl_root),
593 	    OID_AUTO, "prr_sendalot", CTLFLAG_RW,
594 	    &rack_send_a_lot_in_prr, 1,
595 	    "Send a lot in prr");
596 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
597 	    SYSCTL_CHILDREN(rack_sysctl_root),
598 	    OID_AUTO, "minto", CTLFLAG_RW,
599 	    &rack_min_to, 1,
600 	    "Minimum rack timeout in milliseconds");
601 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
602 	    SYSCTL_CHILDREN(rack_sysctl_root),
603 	    OID_AUTO, "earlyrecoveryseg", CTLFLAG_RW,
604 	    &rack_early_recovery_max_seg, 6,
605 	    "Max segments in early recovery");
606 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
607 	    SYSCTL_CHILDREN(rack_sysctl_root),
608 	    OID_AUTO, "earlyrecovery", CTLFLAG_RW,
609 	    &rack_early_recovery, 1,
610 	    "Do we do early recovery with rack");
611 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
612 	    SYSCTL_CHILDREN(rack_sysctl_root),
613 	    OID_AUTO, "reorder_thresh", CTLFLAG_RW,
614 	    &rack_reorder_thresh, 2,
615 	    "What factor for rack will be added when seeing reordering (shift right)");
616 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
617 	    SYSCTL_CHILDREN(rack_sysctl_root),
618 	    OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
619 	    &rack_tlp_thresh, 1,
620 	    "what divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
621 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
622 	    SYSCTL_CHILDREN(rack_sysctl_root),
623 	    OID_AUTO, "reorder_fade", CTLFLAG_RW,
624 	    &rack_reorder_fade, 0,
625 	    "Does reorder detection fade, if so how many ms (0 means never)");
626 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
627 	    SYSCTL_CHILDREN(rack_sysctl_root),
628 	    OID_AUTO, "pktdelay", CTLFLAG_RW,
629 	    &rack_pkt_delay, 1,
630 	    "Extra RACK time (in ms) besides reordering thresh");
631 	SYSCTL_ADD_U32(&rack_sysctl_ctx,
632 	    SYSCTL_CHILDREN(rack_sysctl_root),
633 	    OID_AUTO, "split_limit", CTLFLAG_RW,
634 	    &rack_map_split_limit, 0,
635 	    "Is there a limit on the number of map split entries (0=unlimited)");
636 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
637 	    SYSCTL_CHILDREN(rack_sysctl_root),
638 	    OID_AUTO, "inc_var", CTLFLAG_RW,
639 	    &rack_inc_var, 0,
640 	    "Should rack add to the TLP timer the variance in rtt calculation");
641 	rack_badfr = counter_u64_alloc(M_WAITOK);
642 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
643 	    SYSCTL_CHILDREN(rack_sysctl_root),
644 	    OID_AUTO, "badfr", CTLFLAG_RD,
645 	    &rack_badfr, "Total number of bad FRs");
646 	rack_badfr_bytes = counter_u64_alloc(M_WAITOK);
647 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
648 	    SYSCTL_CHILDREN(rack_sysctl_root),
649 	    OID_AUTO, "badfr_bytes", CTLFLAG_RD,
650 	    &rack_badfr_bytes, "Total number of bad FRs");
651 	rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK);
652 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
653 	    SYSCTL_CHILDREN(rack_sysctl_root),
654 	    OID_AUTO, "prrsndret", CTLFLAG_RD,
655 	    &rack_rtm_prr_retran,
656 	    "Total number of prr based retransmits");
657 	rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK);
658 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
659 	    SYSCTL_CHILDREN(rack_sysctl_root),
660 	    OID_AUTO, "prrsndnew", CTLFLAG_RD,
661 	    &rack_rtm_prr_newdata,
662 	    "Total number of prr based new transmits");
663 	rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK);
664 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
665 	    SYSCTL_CHILDREN(rack_sysctl_root),
666 	    OID_AUTO, "tsnf", CTLFLAG_RD,
667 	    &rack_timestamp_mismatch,
668 	    "Total number of timestamps that we could not find the reported ts");
669 	rack_find_high = counter_u64_alloc(M_WAITOK);
670 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
671 	    SYSCTL_CHILDREN(rack_sysctl_root),
672 	    OID_AUTO, "findhigh", CTLFLAG_RD,
673 	    &rack_find_high,
674 	    "Total number of FIN causing find-high");
675 	rack_reorder_seen = counter_u64_alloc(M_WAITOK);
676 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
677 	    SYSCTL_CHILDREN(rack_sysctl_root),
678 	    OID_AUTO, "reordering", CTLFLAG_RD,
679 	    &rack_reorder_seen,
680 	    "Total number of times we added delay due to reordering");
681 	rack_tlp_tot = counter_u64_alloc(M_WAITOK);
682 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
683 	    SYSCTL_CHILDREN(rack_sysctl_root),
684 	    OID_AUTO, "tlp_to_total", CTLFLAG_RD,
685 	    &rack_tlp_tot,
686 	    "Total number of tail loss probe expirations");
687 	rack_tlp_newdata = counter_u64_alloc(M_WAITOK);
688 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
689 	    SYSCTL_CHILDREN(rack_sysctl_root),
690 	    OID_AUTO, "tlp_new", CTLFLAG_RD,
691 	    &rack_tlp_newdata,
692 	    "Total number of tail loss probe sending new data");
693 
694 	rack_tlp_retran = counter_u64_alloc(M_WAITOK);
695 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
696 	    SYSCTL_CHILDREN(rack_sysctl_root),
697 	    OID_AUTO, "tlp_retran", CTLFLAG_RD,
698 	    &rack_tlp_retran,
699 	    "Total number of tail loss probe sending retransmitted data");
700 	rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK);
701 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
702 	    SYSCTL_CHILDREN(rack_sysctl_root),
703 	    OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD,
704 	    &rack_tlp_retran_bytes,
705 	    "Total bytes of tail loss probe sending retransmitted data");
706 	rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK);
707 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
708 	    SYSCTL_CHILDREN(rack_sysctl_root),
709 	    OID_AUTO, "tlp_retran_fail", CTLFLAG_RD,
710 	    &rack_tlp_retran_fail,
711 	    "Total number of tail loss probe sending retransmitted data that failed (wait for t3)");
712 	rack_to_tot = counter_u64_alloc(M_WAITOK);
713 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
714 	    SYSCTL_CHILDREN(rack_sysctl_root),
715 	    OID_AUTO, "rack_to_tot", CTLFLAG_RD,
716 	    &rack_to_tot,
717 	    "Total number of times the rack to expired?");
718 	rack_to_arm_rack = counter_u64_alloc(M_WAITOK);
719 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
720 	    SYSCTL_CHILDREN(rack_sysctl_root),
721 	    OID_AUTO, "arm_rack", CTLFLAG_RD,
722 	    &rack_to_arm_rack,
723 	    "Total number of times the rack timer armed?");
724 	rack_to_arm_tlp = counter_u64_alloc(M_WAITOK);
725 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
726 	    SYSCTL_CHILDREN(rack_sysctl_root),
727 	    OID_AUTO, "arm_tlp", CTLFLAG_RD,
728 	    &rack_to_arm_tlp,
729 	    "Total number of times the tlp timer armed?");
730 	rack_paced_segments = counter_u64_alloc(M_WAITOK);
731 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
732 	    SYSCTL_CHILDREN(rack_sysctl_root),
733 	    OID_AUTO, "paced", CTLFLAG_RD,
734 	    &rack_paced_segments,
735 	    "Total number of times a segment send caused hptsi");
736 	rack_unpaced_segments = counter_u64_alloc(M_WAITOK);
737 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
738 	    SYSCTL_CHILDREN(rack_sysctl_root),
739 	    OID_AUTO, "unpaced", CTLFLAG_RD,
740 	    &rack_unpaced_segments,
741 	    "Total number of times a segment did not cause hptsi");
742 	rack_saw_enobuf = counter_u64_alloc(M_WAITOK);
743 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
744 	    SYSCTL_CHILDREN(rack_sysctl_root),
745 	    OID_AUTO, "saw_enobufs", CTLFLAG_RD,
746 	    &rack_saw_enobuf,
747 	    "Total number of times a segment did not cause hptsi");
748 	rack_saw_enetunreach = counter_u64_alloc(M_WAITOK);
749 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
750 	    SYSCTL_CHILDREN(rack_sysctl_root),
751 	    OID_AUTO, "saw_enetunreach", CTLFLAG_RD,
752 	    &rack_saw_enetunreach,
753 	    "Total number of times a segment did not cause hptsi");
754 	rack_to_alloc = counter_u64_alloc(M_WAITOK);
755 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
756 	    SYSCTL_CHILDREN(rack_sysctl_root),
757 	    OID_AUTO, "allocs", CTLFLAG_RD,
758 	    &rack_to_alloc,
759 	    "Total allocations of tracking structures");
760 	rack_to_alloc_hard = counter_u64_alloc(M_WAITOK);
761 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
762 	    SYSCTL_CHILDREN(rack_sysctl_root),
763 	    OID_AUTO, "allochard", CTLFLAG_RD,
764 	    &rack_to_alloc_hard,
765 	    "Total allocations done with sleeping the hard way");
766 	rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK);
767 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
768 	    SYSCTL_CHILDREN(rack_sysctl_root),
769 	    OID_AUTO, "allocemerg", CTLFLAG_RD,
770 	    &rack_to_alloc_emerg,
771 	    "Total allocations done from emergency cache");
772 	rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK);
773 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
774 	    SYSCTL_CHILDREN(rack_sysctl_root),
775 	    OID_AUTO, "alloc_limited_conns", CTLFLAG_RD,
776 	    &rack_alloc_limited_conns,
777 	    "Connections with allocations dropped due to limit");
778 	rack_split_limited = counter_u64_alloc(M_WAITOK);
779 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
780 	    SYSCTL_CHILDREN(rack_sysctl_root),
781 	    OID_AUTO, "split_limited", CTLFLAG_RD,
782 	    &rack_split_limited,
783 	    "Split allocations dropped due to limit");
784 	rack_sack_proc_all = counter_u64_alloc(M_WAITOK);
785 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
786 	    SYSCTL_CHILDREN(rack_sysctl_root),
787 	    OID_AUTO, "sack_long", CTLFLAG_RD,
788 	    &rack_sack_proc_all,
789 	    "Total times we had to walk whole list for sack processing");
790 
791 	rack_sack_proc_restart = counter_u64_alloc(M_WAITOK);
792 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
793 	    SYSCTL_CHILDREN(rack_sysctl_root),
794 	    OID_AUTO, "sack_restart", CTLFLAG_RD,
795 	    &rack_sack_proc_restart,
796 	    "Total times we had to walk whole list due to a restart");
797 	rack_sack_proc_short = counter_u64_alloc(M_WAITOK);
798 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
799 	    SYSCTL_CHILDREN(rack_sysctl_root),
800 	    OID_AUTO, "sack_short", CTLFLAG_RD,
801 	    &rack_sack_proc_short,
802 	    "Total times we took shortcut for sack processing");
803 	rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK);
804 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
805 	    SYSCTL_CHILDREN(rack_sysctl_root),
806 	    OID_AUTO, "tlp_calc_entered", CTLFLAG_RD,
807 	    &rack_enter_tlp_calc,
808 	    "Total times we called calc-tlp");
809 	rack_used_tlpmethod = counter_u64_alloc(M_WAITOK);
810 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
811 	    SYSCTL_CHILDREN(rack_sysctl_root),
812 	    OID_AUTO, "hit_tlp_method", CTLFLAG_RD,
813 	    &rack_used_tlpmethod,
814 	    "Total number of runt sacks");
815 	rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK);
816 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
817 	    SYSCTL_CHILDREN(rack_sysctl_root),
818 	    OID_AUTO, "hit_tlp_method2", CTLFLAG_RD,
819 	    &rack_used_tlpmethod2,
820 	    "Total number of runt sacks 2");
821 	rack_runt_sacks = counter_u64_alloc(M_WAITOK);
822 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
823 	    SYSCTL_CHILDREN(rack_sysctl_root),
824 	    OID_AUTO, "runtsacks", CTLFLAG_RD,
825 	    &rack_runt_sacks,
826 	    "Total number of runt sacks");
827 	rack_progress_drops = counter_u64_alloc(M_WAITOK);
828 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
829 	    SYSCTL_CHILDREN(rack_sysctl_root),
830 	    OID_AUTO, "prog_drops", CTLFLAG_RD,
831 	    &rack_progress_drops,
832 	    "Total number of progress drops");
833 	rack_input_idle_reduces = counter_u64_alloc(M_WAITOK);
834 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
835 	    SYSCTL_CHILDREN(rack_sysctl_root),
836 	    OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD,
837 	    &rack_input_idle_reduces,
838 	    "Total number of idle reductions on input");
839 	rack_tlp_does_nada = counter_u64_alloc(M_WAITOK);
840 	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
841 	    SYSCTL_CHILDREN(rack_sysctl_root),
842 	    OID_AUTO, "tlp_nada", CTLFLAG_RD,
843 	    &rack_tlp_does_nada,
844 	    "Total number of nada tlp calls");
845 	COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK);
846 	SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
847 	    OID_AUTO, "outsize", CTLFLAG_RD,
848 	    rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes");
849 	COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK);
850 	SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
851 	    OID_AUTO, "opts", CTLFLAG_RD,
852 	    rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats");
853 	SYSCTL_ADD_PROC(&rack_sysctl_ctx,
854 	    SYSCTL_CHILDREN(rack_sysctl_root),
855 	    OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
856 	    &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters");
857 }
858 
859 static inline int32_t
860 rack_progress_timeout_check(struct tcpcb *tp)
861 {
862 	if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) {
863 		if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) {
864 			/*
865 			 * There is an assumption that the caller
866 			 * will drop the connection so we will
867 			 * increment the counters here.
868 			 */
869 			struct tcp_rack *rack;
870 			rack = (struct tcp_rack *)tp->t_fb_ptr;
871 			counter_u64_add(rack_progress_drops, 1);
872 #ifdef NETFLIX_STATS
873 			TCPSTAT_INC(tcps_progdrops);
874 #endif
875 			rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__);
876 			return (1);
877 		}
878 	}
879 	return (0);
880 }
881 
882 
883 static void
884 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which)
885 {
886 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
887 		union tcp_log_stackspecific log;
888 
889 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
890 		log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT);
891 		log.u_bbr.flex2 = to;
892 		log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
893 		log.u_bbr.flex4 = slot;
894 		log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot;
895 		log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
896 		log.u_bbr.flex8 = which;
897 		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
898 		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
899 		TCP_LOG_EVENT(rack->rc_tp, NULL,
900 		    &rack->rc_inp->inp_socket->so_rcv,
901 		    &rack->rc_inp->inp_socket->so_snd,
902 		    BBR_LOG_TIMERSTAR, 0,
903 		    0, &log, false);
904 	}
905 }
906 
907 static void
908 rack_log_to_event(struct tcp_rack *rack, int32_t to_num)
909 {
910 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
911 		union tcp_log_stackspecific log;
912 
913 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
914 		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
915 		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
916 		log.u_bbr.flex8 = to_num;
917 		log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt;
918 		log.u_bbr.flex2 = rack->rc_rack_rtt;
919 		TCP_LOG_EVENT(rack->rc_tp, NULL,
920 		    &rack->rc_inp->inp_socket->so_rcv,
921 		    &rack->rc_inp->inp_socket->so_snd,
922 		    BBR_LOG_RTO, 0,
923 		    0, &log, false);
924 	}
925 }
926 
927 static void
928 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, int32_t t,
929     uint32_t o_srtt, uint32_t o_var)
930 {
931 	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
932 		union tcp_log_stackspecific log;
933 
934 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
935 		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
936 		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
937 		log.u_bbr.flex1 = t;
938 		log.u_bbr.flex2 = o_srtt;
939 		log.u_bbr.flex3 = o_var;
940 		log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest;
941 		log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest;
942 		log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt;
943 		log.u_bbr.rttProp = rack->r_ctl.rack_rs.rs_rtt_tot;
944 		log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method;
945 		TCP_LOG_EVENT(tp, NULL,
946 		    &rack->rc_inp->inp_socket->so_rcv,
947 		    &rack->rc_inp->inp_socket->so_snd,
948 		    BBR_LOG_BBRRTT, 0,
949 		    0, &log, false);
950 	}
951 }
952 
953 static void
954 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt)
955 {
956 	/*
957 	 * Log the rtt sample we are
958 	 * applying to the srtt algorithm in
959 	 * useconds.
960 	 */
961 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
962 		union tcp_log_stackspecific log;
963 		struct timeval tv;
964 
965 		/* Convert our ms to a microsecond */
966 		log.u_bbr.flex1 = rtt * 1000;
967 		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
968 		TCP_LOG_EVENTP(rack->rc_tp, NULL,
969 		    &rack->rc_inp->inp_socket->so_rcv,
970 		    &rack->rc_inp->inp_socket->so_snd,
971 		    TCP_LOG_RTT, 0,
972 		    0, &log, false, &tv);
973 	}
974 }
975 
976 
977 static inline void
978 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick,  int event, int line)
979 {
980 	if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
981 		union tcp_log_stackspecific log;
982 
983 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
984 		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
985 		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
986 		log.u_bbr.flex1 = line;
987 		log.u_bbr.flex2 = tick;
988 		log.u_bbr.flex3 = tp->t_maxunacktime;
989 		log.u_bbr.flex4 = tp->t_acktime;
990 		log.u_bbr.flex8 = event;
991 		TCP_LOG_EVENT(tp, NULL,
992 		    &rack->rc_inp->inp_socket->so_rcv,
993 		    &rack->rc_inp->inp_socket->so_snd,
994 		    BBR_LOG_PROGRESS, 0,
995 		    0, &log, false);
996 	}
997 }
998 
999 static void
1000 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts)
1001 {
1002 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1003 		union tcp_log_stackspecific log;
1004 
1005 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1006 		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1007 		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1008 		log.u_bbr.flex1 = slot;
1009 		log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags);
1010 		log.u_bbr.flex8 = rack->rc_in_persist;
1011 		TCP_LOG_EVENT(rack->rc_tp, NULL,
1012 		    &rack->rc_inp->inp_socket->so_rcv,
1013 		    &rack->rc_inp->inp_socket->so_snd,
1014 		    BBR_LOG_BBRSND, 0,
1015 		    0, &log, false);
1016 	}
1017 }
1018 
1019 static void
1020 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out)
1021 {
1022 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1023 		union tcp_log_stackspecific log;
1024 		log.u_bbr.flex1 = did_out;
1025 		log.u_bbr.flex2 = nxt_pkt;
1026 		log.u_bbr.flex3 = way_out;
1027 		log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
1028 		log.u_bbr.flex7 = rack->r_wanted_output;
1029 		log.u_bbr.flex8 = rack->rc_in_persist;
1030 		TCP_LOG_EVENT(rack->rc_tp, NULL,
1031 		    &rack->rc_inp->inp_socket->so_rcv,
1032 		    &rack->rc_inp->inp_socket->so_snd,
1033 		    BBR_LOG_DOSEG_DONE, 0,
1034 		    0, &log, false);
1035 	}
1036 }
1037 
1038 
1039 static void
1040 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, uint8_t hpts_calling)
1041 {
1042 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1043 		union tcp_log_stackspecific log;
1044 
1045 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1046 		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1047 		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1048 		log.u_bbr.flex1 = slot;
1049 		log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
1050 		log.u_bbr.flex7 = hpts_calling;
1051 		log.u_bbr.flex8 = rack->rc_in_persist;
1052 		TCP_LOG_EVENT(rack->rc_tp, NULL,
1053 		    &rack->rc_inp->inp_socket->so_rcv,
1054 		    &rack->rc_inp->inp_socket->so_snd,
1055 		    BBR_LOG_JUSTRET, 0,
1056 		    tlen, &log, false);
1057 	}
1058 }
1059 
1060 static void
1061 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line)
1062 {
1063 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1064 		union tcp_log_stackspecific log;
1065 
1066 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1067 		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
1068 		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
1069 		log.u_bbr.flex1 = line;
1070 		log.u_bbr.flex2 = 0;
1071 		log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
1072 		log.u_bbr.flex4 = 0;
1073 		log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
1074 		log.u_bbr.flex8 = hpts_removed;
1075 		TCP_LOG_EVENT(rack->rc_tp, NULL,
1076 		    &rack->rc_inp->inp_socket->so_rcv,
1077 		    &rack->rc_inp->inp_socket->so_snd,
1078 		    BBR_LOG_TIMERCANC, 0,
1079 		    0, &log, false);
1080 	}
1081 }
1082 
1083 static void
1084 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers)
1085 {
1086 	if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
1087 		union tcp_log_stackspecific log;
1088 
1089 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
1090 		log.u_bbr.flex1 = timers;
1091 		log.u_bbr.flex2 = ret;
1092 		log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp;
1093 		log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
1094 		log.u_bbr.flex5 = cts;
1095 		TCP_LOG_EVENT(rack->rc_tp, NULL,
1096 		    &rack->rc_inp->inp_socket->so_rcv,
1097 		    &rack->rc_inp->inp_socket->so_snd,
1098 		    BBR_LOG_TO_PROCESS, 0,
1099 		    0, &log, false);
1100 	}
1101 }
1102 
1103 static void
1104 rack_counter_destroy()
1105 {
1106 	counter_u64_free(rack_badfr);
1107 	counter_u64_free(rack_badfr_bytes);
1108 	counter_u64_free(rack_rtm_prr_retran);
1109 	counter_u64_free(rack_rtm_prr_newdata);
1110 	counter_u64_free(rack_timestamp_mismatch);
1111 	counter_u64_free(rack_reorder_seen);
1112 	counter_u64_free(rack_tlp_tot);
1113 	counter_u64_free(rack_tlp_newdata);
1114 	counter_u64_free(rack_tlp_retran);
1115 	counter_u64_free(rack_tlp_retran_bytes);
1116 	counter_u64_free(rack_tlp_retran_fail);
1117 	counter_u64_free(rack_to_tot);
1118 	counter_u64_free(rack_to_arm_rack);
1119 	counter_u64_free(rack_to_arm_tlp);
1120 	counter_u64_free(rack_paced_segments);
1121 	counter_u64_free(rack_unpaced_segments);
1122 	counter_u64_free(rack_saw_enobuf);
1123 	counter_u64_free(rack_saw_enetunreach);
1124 	counter_u64_free(rack_to_alloc_hard);
1125 	counter_u64_free(rack_to_alloc_emerg);
1126 	counter_u64_free(rack_sack_proc_all);
1127 	counter_u64_free(rack_sack_proc_short);
1128 	counter_u64_free(rack_sack_proc_restart);
1129 	counter_u64_free(rack_to_alloc);
1130 	counter_u64_free(rack_find_high);
1131 	counter_u64_free(rack_runt_sacks);
1132 	counter_u64_free(rack_enter_tlp_calc);
1133 	counter_u64_free(rack_used_tlpmethod);
1134 	counter_u64_free(rack_used_tlpmethod2);
1135 	counter_u64_free(rack_progress_drops);
1136 	counter_u64_free(rack_input_idle_reduces);
1137 	counter_u64_free(rack_tlp_does_nada);
1138 	COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE);
1139 	COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE);
1140 }
1141 
1142 static struct rack_sendmap *
1143 rack_alloc(struct tcp_rack *rack)
1144 {
1145 	struct rack_sendmap *rsm;
1146 
1147 	rsm = uma_zalloc(rack_zone, M_NOWAIT);
1148 	if (rsm) {
1149 alloc_done:
1150 		counter_u64_add(rack_to_alloc, 1);
1151 		rack->r_ctl.rc_num_maps_alloced++;
1152 		return (rsm);
1153 	}
1154 	if (rack->rc_free_cnt) {
1155 		counter_u64_add(rack_to_alloc_emerg, 1);
1156 		rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
1157 		TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next);
1158 		rack->rc_free_cnt--;
1159 		goto alloc_done;
1160 	}
1161 	return (NULL);
1162 }
1163 
1164 /* wrapper to allocate a sendmap entry, subject to a specific limit */
1165 static struct rack_sendmap *
1166 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type)
1167 {
1168 	struct rack_sendmap *rsm;
1169 
1170 	if (limit_type) {
1171 		/* currently there is only one limit type */
1172 		if (rack_map_split_limit > 0 &&
1173 		    rack->r_ctl.rc_num_split_allocs >= rack_map_split_limit) {
1174 			counter_u64_add(rack_split_limited, 1);
1175 			if (!rack->alloc_limit_reported) {
1176 				rack->alloc_limit_reported = 1;
1177 				counter_u64_add(rack_alloc_limited_conns, 1);
1178 			}
1179 			return (NULL);
1180 		}
1181 	}
1182 
1183 	/* allocate and mark in the limit type, if set */
1184 	rsm = rack_alloc(rack);
1185 	if (rsm != NULL && limit_type) {
1186 		rsm->r_limit_type = limit_type;
1187 		rack->r_ctl.rc_num_split_allocs++;
1188 	}
1189 	return (rsm);
1190 }
1191 
1192 static void
1193 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm)
1194 {
1195 	if (rsm->r_limit_type) {
1196 		/* currently there is only one limit type */
1197 		rack->r_ctl.rc_num_split_allocs--;
1198 	}
1199 	rack->r_ctl.rc_num_maps_alloced--;
1200 	if (rack->r_ctl.rc_tlpsend == rsm)
1201 		rack->r_ctl.rc_tlpsend = NULL;
1202 	if (rack->r_ctl.rc_next == rsm)
1203 		rack->r_ctl.rc_next = NULL;
1204 	if (rack->r_ctl.rc_sacklast == rsm)
1205 		rack->r_ctl.rc_sacklast = NULL;
1206 	if (rack->rc_free_cnt < rack_free_cache) {
1207 		memset(rsm, 0, sizeof(struct rack_sendmap));
1208 		TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next);
1209 		rack->rc_free_cnt++;
1210 		return;
1211 	}
1212 	uma_zfree(rack_zone, rsm);
1213 }
1214 
1215 /*
1216  * CC wrapper hook functions
1217  */
1218 static void
1219 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs,
1220     uint16_t type, int32_t recovery)
1221 {
1222 #ifdef NETFLIX_STATS
1223 	int32_t gput;
1224 #endif
1225 #ifdef NETFLIX_CWV
1226 	u_long old_cwnd = tp->snd_cwnd;
1227 #endif
1228 
1229 	INP_WLOCK_ASSERT(tp->t_inpcb);
1230 	tp->ccv->nsegs = nsegs;
1231 	tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
1232 	if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) {
1233 		uint32_t max;
1234 
1235 		max = rack->r_ctl.rc_early_recovery_segs * tp->t_maxseg;
1236 		if (tp->ccv->bytes_this_ack > max) {
1237 			tp->ccv->bytes_this_ack = max;
1238 		}
1239 	}
1240 	if (tp->snd_cwnd <= tp->snd_wnd)
1241 		tp->ccv->flags |= CCF_CWND_LIMITED;
1242 	else
1243 		tp->ccv->flags &= ~CCF_CWND_LIMITED;
1244 
1245 	if (type == CC_ACK) {
1246 #ifdef NETFLIX_STATS
1247 		stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
1248 		    ((int32_t) tp->snd_cwnd) - tp->snd_wnd);
1249 		if ((tp->t_flags & TF_GPUTINPROG) &&
1250 		    SEQ_GEQ(th->th_ack, tp->gput_ack)) {
1251 			gput = (((int64_t) (th->th_ack - tp->gput_seq)) << 3) /
1252 			    max(1, tcp_ts_getticks() - tp->gput_ts);
1253 			stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
1254 			    gput);
1255 			/*
1256 			 * XXXLAS: This is a temporary hack, and should be
1257 			 * chained off VOI_TCP_GPUT when stats(9) grows an
1258 			 * API to deal with chained VOIs.
1259 			 */
1260 			if (tp->t_stats_gput_prev > 0)
1261 				stats_voi_update_abs_s32(tp->t_stats,
1262 				    VOI_TCP_GPUT_ND,
1263 				    ((gput - tp->t_stats_gput_prev) * 100) /
1264 				    tp->t_stats_gput_prev);
1265 			tp->t_flags &= ~TF_GPUTINPROG;
1266 			tp->t_stats_gput_prev = gput;
1267 #ifdef NETFLIX_CWV
1268 			if (tp->t_maxpeakrate) {
1269 				/*
1270 				 * We update t_peakrate_thr. This gives us roughly
1271 				 * one update per round trip time.
1272 				 */
1273 				tcp_update_peakrate_thr(tp);
1274 			}
1275 #endif
1276 		}
1277 #endif
1278 		if (tp->snd_cwnd > tp->snd_ssthresh) {
1279 			tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
1280 			    nsegs * V_tcp_abc_l_var * tp->t_maxseg);
1281 			if (tp->t_bytes_acked >= tp->snd_cwnd) {
1282 				tp->t_bytes_acked -= tp->snd_cwnd;
1283 				tp->ccv->flags |= CCF_ABC_SENTAWND;
1284 			}
1285 		} else {
1286 			tp->ccv->flags &= ~CCF_ABC_SENTAWND;
1287 			tp->t_bytes_acked = 0;
1288 		}
1289 	}
1290 	if (CC_ALGO(tp)->ack_received != NULL) {
1291 		/* XXXLAS: Find a way to live without this */
1292 		tp->ccv->curack = th->th_ack;
1293 		CC_ALGO(tp)->ack_received(tp->ccv, type);
1294 	}
1295 #ifdef NETFLIX_STATS
1296 	stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd);
1297 #endif
1298 	if (rack->r_ctl.rc_rack_largest_cwnd < tp->snd_cwnd) {
1299 		rack->r_ctl.rc_rack_largest_cwnd = tp->snd_cwnd;
1300 	}
1301 #ifdef NETFLIX_CWV
1302 	if (tp->cwv_enabled) {
1303 		/*
1304 		 * Per RFC 7661: The behaviour in the non-validated phase is
1305 		 * specified as: o  A sender determines whether to increase
1306 		 * the cwnd based upon whether it is cwnd-limited (see
1307 		 * Section 4.5.3): * A sender that is cwnd-limited MAY use
1308 		 * the standard TCP method to increase cwnd (i.e., the
1309 		 * standard method permits a TCP sender that fully utilises
1310 		 * the cwnd to increase the cwnd each time it receives an
1311 		 * ACK). * A sender that is not cwnd-limited MUST NOT
1312 		 * increase the cwnd when ACK packets are received in this
1313 		 * phase (i.e., needs to avoid growing the cwnd when it has
1314 		 * not recently sent using the current size of cwnd).
1315 		 */
1316 		if ((tp->snd_cwnd > old_cwnd) &&
1317 		    (tp->cwv_cwnd_valid == 0) &&
1318 		    (!(tp->ccv->flags & CCF_CWND_LIMITED))) {
1319 			tp->snd_cwnd = old_cwnd;
1320 		}
1321 		/* Try to update pipeAck and NCWV state */
1322 		if (TCPS_HAVEESTABLISHED(tp->t_state) &&
1323 		    !IN_RECOVERY(tp->t_flags)) {
1324 			uint32_t data = sbavail(&(tp->t_inpcb->inp_socket->so_snd));
1325 
1326 			tcp_newcwv_update_pipeack(tp, data);
1327 		}
1328 	}
1329 	/* we enforce max peak rate if it is set. */
1330 	if (tp->t_peakrate_thr && tp->snd_cwnd > tp->t_peakrate_thr) {
1331 		tp->snd_cwnd = tp->t_peakrate_thr;
1332 	}
1333 #endif
1334 }
1335 
1336 static void
1337 tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th)
1338 {
1339 	struct tcp_rack *rack;
1340 
1341 	rack = (struct tcp_rack *)tp->t_fb_ptr;
1342 	INP_WLOCK_ASSERT(tp->t_inpcb);
1343 	if (rack->r_ctl.rc_prr_sndcnt > 0)
1344 		rack->r_wanted_output++;
1345 }
1346 
1347 static void
1348 rack_post_recovery(struct tcpcb *tp, struct tcphdr *th)
1349 {
1350 	struct tcp_rack *rack;
1351 
1352 	INP_WLOCK_ASSERT(tp->t_inpcb);
1353 	rack = (struct tcp_rack *)tp->t_fb_ptr;
1354 	if (CC_ALGO(tp)->post_recovery != NULL) {
1355 		tp->ccv->curack = th->th_ack;
1356 		CC_ALGO(tp)->post_recovery(tp->ccv);
1357 	}
1358 	/*
1359 	 * Here we can in theory adjust cwnd to be based on the number of
1360 	 * losses in the window (rack->r_ctl.rc_loss_count). This is done
1361 	 * based on the rack_use_proportional flag.
1362 	 */
1363 	if (rack->r_ctl.rc_prop_reduce && rack->r_ctl.rc_prop_rate) {
1364 		int32_t reduce;
1365 
1366 		reduce = (rack->r_ctl.rc_loss_count * rack->r_ctl.rc_prop_rate);
1367 		if (reduce > 50) {
1368 			reduce = 50;
1369 		}
1370 		tp->snd_cwnd -= ((reduce * tp->snd_cwnd) / 100);
1371 	} else {
1372 		if (tp->snd_cwnd > tp->snd_ssthresh) {
1373 			/* Drop us down to the ssthresh (1/2 cwnd at loss) */
1374 			tp->snd_cwnd = tp->snd_ssthresh;
1375 		}
1376 	}
1377 	if (rack->r_ctl.rc_prr_sndcnt > 0) {
1378 		/* Suck the next prr cnt back into cwnd */
1379 		tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt;
1380 		rack->r_ctl.rc_prr_sndcnt = 0;
1381 	}
1382 	EXIT_RECOVERY(tp->t_flags);
1383 
1384 
1385 #ifdef NETFLIX_CWV
1386 	if (tp->cwv_enabled) {
1387 		if ((tp->cwv_cwnd_valid == 0) &&
1388 		    (tp->snd_cwv.in_recovery))
1389 			tcp_newcwv_end_recovery(tp);
1390 	}
1391 #endif
1392 }
1393 
1394 static void
1395 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
1396 {
1397 	struct tcp_rack *rack;
1398 
1399 	INP_WLOCK_ASSERT(tp->t_inpcb);
1400 
1401 	rack = (struct tcp_rack *)tp->t_fb_ptr;
1402 	switch (type) {
1403 	case CC_NDUPACK:
1404 /*		rack->r_ctl.rc_ssthresh_set = 1;*/
1405 		if (!IN_FASTRECOVERY(tp->t_flags)) {
1406 			rack->r_ctl.rc_tlp_rtx_out = 0;
1407 			rack->r_ctl.rc_prr_delivered = 0;
1408 			rack->r_ctl.rc_prr_out = 0;
1409 			rack->r_ctl.rc_loss_count = 0;
1410 			rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
1411 			rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una;
1412 			tp->snd_recover = tp->snd_max;
1413 			if (tp->t_flags & TF_ECN_PERMIT)
1414 				tp->t_flags |= TF_ECN_SND_CWR;
1415 		}
1416 		break;
1417 	case CC_ECN:
1418 		if (!IN_CONGRECOVERY(tp->t_flags)) {
1419 			TCPSTAT_INC(tcps_ecn_rcwnd);
1420 			tp->snd_recover = tp->snd_max;
1421 			if (tp->t_flags & TF_ECN_PERMIT)
1422 				tp->t_flags |= TF_ECN_SND_CWR;
1423 		}
1424 		break;
1425 	case CC_RTO:
1426 		tp->t_dupacks = 0;
1427 		tp->t_bytes_acked = 0;
1428 		EXIT_RECOVERY(tp->t_flags);
1429 		tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 /
1430 		    tp->t_maxseg) * tp->t_maxseg;
1431 		tp->snd_cwnd = tp->t_maxseg;
1432 		break;
1433 	case CC_RTO_ERR:
1434 		TCPSTAT_INC(tcps_sndrexmitbad);
1435 		/* RTO was unnecessary, so reset everything. */
1436 		tp->snd_cwnd = tp->snd_cwnd_prev;
1437 		tp->snd_ssthresh = tp->snd_ssthresh_prev;
1438 		tp->snd_recover = tp->snd_recover_prev;
1439 		if (tp->t_flags & TF_WASFRECOVERY)
1440 			ENTER_FASTRECOVERY(tp->t_flags);
1441 		if (tp->t_flags & TF_WASCRECOVERY)
1442 			ENTER_CONGRECOVERY(tp->t_flags);
1443 		tp->snd_nxt = tp->snd_max;
1444 		tp->t_badrxtwin = 0;
1445 		break;
1446 	}
1447 
1448 	if (CC_ALGO(tp)->cong_signal != NULL) {
1449 		if (th != NULL)
1450 			tp->ccv->curack = th->th_ack;
1451 		CC_ALGO(tp)->cong_signal(tp->ccv, type);
1452 	}
1453 #ifdef NETFLIX_CWV
1454 	if (tp->cwv_enabled) {
1455 		if (tp->snd_cwv.in_recovery == 0 && IN_RECOVERY(tp->t_flags)) {
1456 			tcp_newcwv_enter_recovery(tp);
1457 		}
1458 		if (type == CC_RTO) {
1459 			tcp_newcwv_reset(tp);
1460 		}
1461 	}
1462 #endif
1463 }
1464 
1465 
1466 
1467 static inline void
1468 rack_cc_after_idle(struct tcpcb *tp, int reduce_largest)
1469 {
1470 	uint32_t i_cwnd;
1471 
1472 	INP_WLOCK_ASSERT(tp->t_inpcb);
1473 
1474 #ifdef NETFLIX_STATS
1475 	TCPSTAT_INC(tcps_idle_restarts);
1476 	if (tp->t_state == TCPS_ESTABLISHED)
1477 		TCPSTAT_INC(tcps_idle_estrestarts);
1478 #endif
1479 	if (CC_ALGO(tp)->after_idle != NULL)
1480 		CC_ALGO(tp)->after_idle(tp->ccv);
1481 
1482 	if (tp->snd_cwnd == 1)
1483 		i_cwnd = tp->t_maxseg;		/* SYN(-ACK) lost */
1484 	else
1485 		i_cwnd = tcp_compute_initwnd(tcp_maxseg(tp));
1486 
1487 	if (reduce_largest) {
1488 		/*
1489 		 * Do we reduce the largest cwnd to make
1490 		 * rack play nice on restart hptsi wise?
1491 		 */
1492 		if (((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd  > i_cwnd)
1493 			((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd = i_cwnd;
1494 	}
1495 	/*
1496 	 * Being idle is no differnt than the initial window. If the cc
1497 	 * clamps it down below the initial window raise it to the initial
1498 	 * window.
1499 	 */
1500 	if (tp->snd_cwnd < i_cwnd) {
1501 		tp->snd_cwnd = i_cwnd;
1502 	}
1503 }
1504 
1505 
1506 /*
1507  * Indicate whether this ack should be delayed.  We can delay the ack if
1508  * following conditions are met:
1509  *	- There is no delayed ack timer in progress.
1510  *	- Our last ack wasn't a 0-sized window. We never want to delay
1511  *	  the ack that opens up a 0-sized window.
1512  *	- LRO wasn't used for this segment. We make sure by checking that the
1513  *	  segment size is not larger than the MSS.
1514  *	- Delayed acks are enabled or this is a half-synchronized T/TCP
1515  *	  connection.
1516  */
1517 #define DELAY_ACK(tp, tlen)			 \
1518 	(((tp->t_flags & TF_RXWIN0SENT) == 0) && \
1519 	((tp->t_flags & TF_DELACK) == 0) && 	 \
1520 	(tlen <= tp->t_maxseg) &&		 \
1521 	(tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN)))
1522 
1523 static inline void
1524 rack_calc_rwin(struct socket *so, struct tcpcb *tp)
1525 {
1526 	int32_t win;
1527 
1528 	/*
1529 	 * Calculate amount of space in receive window, and then do TCP
1530 	 * input processing. Receive window is amount of space in rcv queue,
1531 	 * but not less than advertised window.
1532 	 */
1533 	win = sbspace(&so->so_rcv);
1534 	if (win < 0)
1535 		win = 0;
1536 	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
1537 }
1538 
1539 static void
1540 rack_do_drop(struct mbuf *m, struct tcpcb *tp)
1541 {
1542 	/*
1543 	 * Drop space held by incoming segment and return.
1544 	 */
1545 	if (tp != NULL)
1546 		INP_WUNLOCK(tp->t_inpcb);
1547 	if (m)
1548 		m_freem(m);
1549 }
1550 
1551 static void
1552 rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
1553     int32_t rstreason, int32_t tlen)
1554 {
1555 	if (tp != NULL) {
1556 		tcp_dropwithreset(m, th, tp, tlen, rstreason);
1557 		INP_WUNLOCK(tp->t_inpcb);
1558 	} else
1559 		tcp_dropwithreset(m, th, NULL, tlen, rstreason);
1560 }
1561 
1562 /*
1563  * The value in ret_val informs the caller
1564  * if we dropped the tcb (and lock) or not.
1565  * 1 = we dropped it, 0 = the TCB is still locked
1566  * and valid.
1567  */
1568 static void
1569 rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val)
1570 {
1571 	/*
1572 	 * Generate an ACK dropping incoming segment if it occupies sequence
1573 	 * space, where the ACK reflects our state.
1574 	 *
1575 	 * We can now skip the test for the RST flag since all paths to this
1576 	 * code happen after packets containing RST have been dropped.
1577 	 *
1578 	 * In the SYN-RECEIVED state, don't send an ACK unless the segment
1579 	 * we received passes the SYN-RECEIVED ACK test. If it fails send a
1580 	 * RST.  This breaks the loop in the "LAND" DoS attack, and also
1581 	 * prevents an ACK storm between two listening ports that have been
1582 	 * sent forged SYN segments, each with the source address of the
1583 	 * other.
1584 	 */
1585 	struct tcp_rack *rack;
1586 
1587 	if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
1588 	    (SEQ_GT(tp->snd_una, th->th_ack) ||
1589 	    SEQ_GT(th->th_ack, tp->snd_max))) {
1590 		*ret_val = 1;
1591 		rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
1592 		return;
1593 	} else
1594 		*ret_val = 0;
1595 	rack = (struct tcp_rack *)tp->t_fb_ptr;
1596 	rack->r_wanted_output++;
1597 	tp->t_flags |= TF_ACKNOW;
1598 	if (m)
1599 		m_freem(m);
1600 }
1601 
1602 
1603 static int
1604 rack_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp)
1605 {
1606 	/*
1607 	 * RFC5961 Section 3.2
1608 	 *
1609 	 * - RST drops connection only if SEG.SEQ == RCV.NXT. - If RST is in
1610 	 * window, we send challenge ACK.
1611 	 *
1612 	 * Note: to take into account delayed ACKs, we should test against
1613 	 * last_ack_sent instead of rcv_nxt. Note 2: we handle special case
1614 	 * of closed window, not covered by the RFC.
1615 	 */
1616 	int dropped = 0;
1617 
1618 	if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) &&
1619 	    SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
1620 	    (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
1621 
1622 		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1623 		KASSERT(tp->t_state != TCPS_SYN_SENT,
1624 		    ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
1625 		    __func__, th, tp));
1626 
1627 		if (V_tcp_insecure_rst ||
1628 		    (tp->last_ack_sent == th->th_seq) ||
1629 		    (tp->rcv_nxt == th->th_seq) ||
1630 		    ((tp->last_ack_sent - 1) == th->th_seq)) {
1631 			TCPSTAT_INC(tcps_drops);
1632 			/* Drop the connection. */
1633 			switch (tp->t_state) {
1634 			case TCPS_SYN_RECEIVED:
1635 				so->so_error = ECONNREFUSED;
1636 				goto close;
1637 			case TCPS_ESTABLISHED:
1638 			case TCPS_FIN_WAIT_1:
1639 			case TCPS_FIN_WAIT_2:
1640 			case TCPS_CLOSE_WAIT:
1641 			case TCPS_CLOSING:
1642 			case TCPS_LAST_ACK:
1643 				so->so_error = ECONNRESET;
1644 		close:
1645 				tcp_state_change(tp, TCPS_CLOSED);
1646 				/* FALLTHROUGH */
1647 			default:
1648 				tp = tcp_close(tp);
1649 			}
1650 			dropped = 1;
1651 			rack_do_drop(m, tp);
1652 		} else {
1653 			TCPSTAT_INC(tcps_badrst);
1654 			/* Send challenge ACK. */
1655 			tcp_respond(tp, mtod(m, void *), th, m,
1656 			    tp->rcv_nxt, tp->snd_nxt, TH_ACK);
1657 			tp->last_ack_sent = tp->rcv_nxt;
1658 		}
1659 	} else {
1660 		m_freem(m);
1661 	}
1662 	return (dropped);
1663 }
1664 
1665 /*
1666  * The value in ret_val informs the caller
1667  * if we dropped the tcb (and lock) or not.
1668  * 1 = we dropped it, 0 = the TCB is still locked
1669  * and valid.
1670  */
1671 static void
1672 rack_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val)
1673 {
1674 	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
1675 
1676 	TCPSTAT_INC(tcps_badsyn);
1677 	if (V_tcp_insecure_syn &&
1678 	    SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
1679 	    SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
1680 		tp = tcp_drop(tp, ECONNRESET);
1681 		*ret_val = 1;
1682 		rack_do_drop(m, tp);
1683 	} else {
1684 		/* Send challenge ACK. */
1685 		tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
1686 		    tp->snd_nxt, TH_ACK);
1687 		tp->last_ack_sent = tp->rcv_nxt;
1688 		m = NULL;
1689 		*ret_val = 0;
1690 		rack_do_drop(m, NULL);
1691 	}
1692 }
1693 
1694 /*
1695  * rack_ts_check returns 1 for you should not proceed. It places
1696  * in ret_val what should be returned 1/0 by the caller. The 1 indicates
1697  * that the TCB is unlocked and probably dropped. The 0 indicates the
1698  * TCB is still valid and locked.
1699  */
1700 static int
1701 rack_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val)
1702 {
1703 
1704 	/* Check to see if ts_recent is over 24 days old.  */
1705 	if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
1706 		/*
1707 		 * Invalidate ts_recent.  If this segment updates ts_recent,
1708 		 * the age will be reset later and ts_recent will get a
1709 		 * valid value.  If it does not, setting ts_recent to zero
1710 		 * will at least satisfy the requirement that zero be placed
1711 		 * in the timestamp echo reply when ts_recent isn't valid.
1712 		 * The age isn't reset until we get a valid ts_recent
1713 		 * because we don't want out-of-order segments to be dropped
1714 		 * when ts_recent is old.
1715 		 */
1716 		tp->ts_recent = 0;
1717 	} else {
1718 		TCPSTAT_INC(tcps_rcvduppack);
1719 		TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
1720 		TCPSTAT_INC(tcps_pawsdrop);
1721 		*ret_val = 0;
1722 		if (tlen) {
1723 			rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
1724 		} else {
1725 			rack_do_drop(m, NULL);
1726 		}
1727 		return (1);
1728 	}
1729 	return (0);
1730 }
1731 
1732 /*
1733  * rack_drop_checks returns 1 for you should not proceed. It places
1734  * in ret_val what should be returned 1/0 by the caller. The 1 indicates
1735  * that the TCB is unlocked and probably dropped. The 0 indicates the
1736  * TCB is still valid and locked.
1737  */
1738 static int
1739 rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp,  int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val)
1740 {
1741 	int32_t todrop;
1742 	int32_t thflags;
1743 	int32_t tlen;
1744 
1745 	thflags = *thf;
1746 	tlen = *tlenp;
1747 	todrop = tp->rcv_nxt - th->th_seq;
1748 	if (todrop > 0) {
1749 		if (thflags & TH_SYN) {
1750 			thflags &= ~TH_SYN;
1751 			th->th_seq++;
1752 			if (th->th_urp > 1)
1753 				th->th_urp--;
1754 			else
1755 				thflags &= ~TH_URG;
1756 			todrop--;
1757 		}
1758 		/*
1759 		 * Following if statement from Stevens, vol. 2, p. 960.
1760 		 */
1761 		if (todrop > tlen
1762 		    || (todrop == tlen && (thflags & TH_FIN) == 0)) {
1763 			/*
1764 			 * Any valid FIN must be to the left of the window.
1765 			 * At this point the FIN must be a duplicate or out
1766 			 * of sequence; drop it.
1767 			 */
1768 			thflags &= ~TH_FIN;
1769 			/*
1770 			 * Send an ACK to resynchronize and drop any data.
1771 			 * But keep on processing for RST or ACK.
1772 			 */
1773 			tp->t_flags |= TF_ACKNOW;
1774 			todrop = tlen;
1775 			TCPSTAT_INC(tcps_rcvduppack);
1776 			TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
1777 		} else {
1778 			TCPSTAT_INC(tcps_rcvpartduppack);
1779 			TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
1780 		}
1781 		/*
1782 		 * DSACK - add SACK block for dropped range
1783 		 */
1784 		if (tp->t_flags & TF_SACK_PERMIT) {
1785 			tcp_update_sack_list(tp, th->th_seq, th->th_seq + tlen);
1786 			/*
1787 			 * ACK now, as the next in-sequence segment
1788 			 * will clear the DSACK block again
1789 			 */
1790 			tp->t_flags |= TF_ACKNOW;
1791 		}
1792 		*drop_hdrlen += todrop;	/* drop from the top afterwards */
1793 		th->th_seq += todrop;
1794 		tlen -= todrop;
1795 		if (th->th_urp > todrop)
1796 			th->th_urp -= todrop;
1797 		else {
1798 			thflags &= ~TH_URG;
1799 			th->th_urp = 0;
1800 		}
1801 	}
1802 	/*
1803 	 * If segment ends after window, drop trailing data (and PUSH and
1804 	 * FIN); if nothing left, just ACK.
1805 	 */
1806 	todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
1807 	if (todrop > 0) {
1808 		TCPSTAT_INC(tcps_rcvpackafterwin);
1809 		if (todrop >= tlen) {
1810 			TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
1811 			/*
1812 			 * If window is closed can only take segments at
1813 			 * window edge, and have to drop data and PUSH from
1814 			 * incoming segments.  Continue processing, but
1815 			 * remember to ack.  Otherwise, drop segment and
1816 			 * ack.
1817 			 */
1818 			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
1819 				tp->t_flags |= TF_ACKNOW;
1820 				TCPSTAT_INC(tcps_rcvwinprobe);
1821 			} else {
1822 				rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
1823 				return (1);
1824 			}
1825 		} else
1826 			TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
1827 		m_adj(m, -todrop);
1828 		tlen -= todrop;
1829 		thflags &= ~(TH_PUSH | TH_FIN);
1830 	}
1831 	*thf = thflags;
1832 	*tlenp = tlen;
1833 	return (0);
1834 }
1835 
1836 static struct rack_sendmap *
1837 rack_find_lowest_rsm(struct tcp_rack *rack)
1838 {
1839 	struct rack_sendmap *rsm;
1840 
1841 	/*
1842 	 * Walk the time-order transmitted list looking for an rsm that is
1843 	 * not acked. This will be the one that was sent the longest time
1844 	 * ago that is still outstanding.
1845 	 */
1846 	TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
1847 		if (rsm->r_flags & RACK_ACKED) {
1848 			continue;
1849 		}
1850 		goto finish;
1851 	}
1852 finish:
1853 	return (rsm);
1854 }
1855 
1856 static struct rack_sendmap *
1857 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm)
1858 {
1859 	struct rack_sendmap *prsm;
1860 
1861 	/*
1862 	 * Walk the sequence order list backward until we hit and arrive at
1863 	 * the highest seq not acked. In theory when this is called it
1864 	 * should be the last segment (which it was not).
1865 	 */
1866 	counter_u64_add(rack_find_high, 1);
1867 	prsm = rsm;
1868 	TAILQ_FOREACH_REVERSE_FROM(prsm, &rack->r_ctl.rc_map, rack_head, r_next) {
1869 		if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) {
1870 			continue;
1871 		}
1872 		return (prsm);
1873 	}
1874 	return (NULL);
1875 }
1876 
1877 
1878 static uint32_t
1879 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts)
1880 {
1881 	int32_t lro;
1882 	uint32_t thresh;
1883 
1884 	/*
1885 	 * lro is the flag we use to determine if we have seen reordering.
1886 	 * If it gets set we have seen reordering. The reorder logic either
1887 	 * works in one of two ways:
1888 	 *
1889 	 * If reorder-fade is configured, then we track the last time we saw
1890 	 * re-ordering occur. If we reach the point where enough time as
1891 	 * passed we no longer consider reordering has occuring.
1892 	 *
1893 	 * Or if reorder-face is 0, then once we see reordering we consider
1894 	 * the connection to alway be subject to reordering and just set lro
1895 	 * to 1.
1896 	 *
1897 	 * In the end if lro is non-zero we add the extra time for
1898 	 * reordering in.
1899 	 */
1900 	if (srtt == 0)
1901 		srtt = 1;
1902 	if (rack->r_ctl.rc_reorder_ts) {
1903 		if (rack->r_ctl.rc_reorder_fade) {
1904 			if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) {
1905 				lro = cts - rack->r_ctl.rc_reorder_ts;
1906 				if (lro == 0) {
1907 					/*
1908 					 * No time as passed since the last
1909 					 * reorder, mark it as reordering.
1910 					 */
1911 					lro = 1;
1912 				}
1913 			} else {
1914 				/* Negative time? */
1915 				lro = 0;
1916 			}
1917 			if (lro > rack->r_ctl.rc_reorder_fade) {
1918 				/* Turn off reordering seen too */
1919 				rack->r_ctl.rc_reorder_ts = 0;
1920 				lro = 0;
1921 			}
1922 		} else {
1923 			/* Reodering does not fade */
1924 			lro = 1;
1925 		}
1926 	} else {
1927 		lro = 0;
1928 	}
1929 	thresh = srtt + rack->r_ctl.rc_pkt_delay;
1930 	if (lro) {
1931 		/* It must be set, if not you get 1/4 rtt */
1932 		if (rack->r_ctl.rc_reorder_shift)
1933 			thresh += (srtt >> rack->r_ctl.rc_reorder_shift);
1934 		else
1935 			thresh += (srtt >> 2);
1936 	} else {
1937 		thresh += 1;
1938 	}
1939 	/* We don't let the rack timeout be above a RTO */
1940 
1941 	if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) {
1942 		thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur);
1943 	}
1944 	/* And we don't want it above the RTO max either */
1945 	if (thresh > rack_rto_max) {
1946 		thresh = rack_rto_max;
1947 	}
1948 	return (thresh);
1949 }
1950 
1951 static uint32_t
1952 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack,
1953 		     struct rack_sendmap *rsm, uint32_t srtt)
1954 {
1955 	struct rack_sendmap *prsm;
1956 	uint32_t thresh, len;
1957 	int maxseg;
1958 
1959 	if (srtt == 0)
1960 		srtt = 1;
1961 	if (rack->r_ctl.rc_tlp_threshold)
1962 		thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold);
1963 	else
1964 		thresh = (srtt * 2);
1965 
1966 	/* Get the previous sent packet, if any  */
1967 	maxseg = tcp_maxseg(tp);
1968 	counter_u64_add(rack_enter_tlp_calc, 1);
1969 	len = rsm->r_end - rsm->r_start;
1970 	if (rack->rack_tlp_threshold_use == TLP_USE_ID) {
1971 		/* Exactly like the ID */
1972 		if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= maxseg) {
1973 			uint32_t alt_thresh;
1974 			/*
1975 			 * Compensate for delayed-ack with the d-ack time.
1976 			 */
1977 			counter_u64_add(rack_used_tlpmethod, 1);
1978 			alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
1979 			if (alt_thresh > thresh)
1980 				thresh = alt_thresh;
1981 		}
1982 	} else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) {
1983 		/* 2.1 behavior */
1984 		prsm = TAILQ_PREV(rsm, rack_head, r_tnext);
1985 		if (prsm && (len <= maxseg)) {
1986 			/*
1987 			 * Two packets outstanding, thresh should be (2*srtt) +
1988 			 * possible inter-packet delay (if any).
1989 			 */
1990 			uint32_t inter_gap = 0;
1991 			int idx, nidx;
1992 
1993 			counter_u64_add(rack_used_tlpmethod, 1);
1994 			idx = rsm->r_rtr_cnt - 1;
1995 			nidx = prsm->r_rtr_cnt - 1;
1996 			if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) {
1997 				/* Yes it was sent later (or at the same time) */
1998 				inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx];
1999 			}
2000 			thresh += inter_gap;
2001 		} else 	if (len <= maxseg) {
2002 			/*
2003 			 * Possibly compensate for delayed-ack.
2004 			 */
2005 			uint32_t alt_thresh;
2006 
2007 			counter_u64_add(rack_used_tlpmethod2, 1);
2008 			alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
2009 			if (alt_thresh > thresh)
2010 				thresh = alt_thresh;
2011 		}
2012 	} else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) {
2013 		/* 2.2 behavior */
2014 		if (len <= maxseg) {
2015 			uint32_t alt_thresh;
2016 			/*
2017 			 * Compensate for delayed-ack with the d-ack time.
2018 			 */
2019 			counter_u64_add(rack_used_tlpmethod, 1);
2020 			alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
2021 			if (alt_thresh > thresh)
2022 				thresh = alt_thresh;
2023 		}
2024 	}
2025  	/* Not above an RTO */
2026 	if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) {
2027 		thresh = TICKS_2_MSEC(tp->t_rxtcur);
2028 	}
2029 	/* Not above a RTO max */
2030 	if (thresh > rack_rto_max) {
2031 		thresh = rack_rto_max;
2032 	}
2033 	/* Apply user supplied min TLP */
2034 	if (thresh < rack_tlp_min) {
2035 		thresh = rack_tlp_min;
2036 	}
2037 	return (thresh);
2038 }
2039 
2040 static struct rack_sendmap *
2041 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused)
2042 {
2043 	/*
2044 	 * Check to see that we don't need to fall into recovery. We will
2045 	 * need to do so if our oldest transmit is past the time we should
2046 	 * have had an ack.
2047 	 */
2048 	struct tcp_rack *rack;
2049 	struct rack_sendmap *rsm;
2050 	int32_t idx;
2051 	uint32_t srtt_cur, srtt, thresh;
2052 
2053 	rack = (struct tcp_rack *)tp->t_fb_ptr;
2054 	if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) {
2055 		return (NULL);
2056 	}
2057 	srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT;
2058 	srtt = TICKS_2_MSEC(srtt_cur);
2059 	if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt))
2060 		srtt = rack->rc_rack_rtt;
2061 
2062 	rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2063 	if (rsm == NULL)
2064 		return (NULL);
2065 
2066 	if (rsm->r_flags & RACK_ACKED) {
2067 		rsm = rack_find_lowest_rsm(rack);
2068 		if (rsm == NULL)
2069 			return (NULL);
2070 	}
2071 	idx = rsm->r_rtr_cnt - 1;
2072 	thresh = rack_calc_thresh_rack(rack, srtt, tsused);
2073 	if (tsused < rsm->r_tim_lastsent[idx]) {
2074 		return (NULL);
2075 	}
2076 	if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) {
2077 		return (NULL);
2078 	}
2079 	/* Ok if we reach here we are over-due */
2080 	rack->r_ctl.rc_rsm_start = rsm->r_start;
2081 	rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
2082 	rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
2083 	rack_cong_signal(tp, NULL, CC_NDUPACK);
2084 	return (rsm);
2085 }
2086 
2087 static uint32_t
2088 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack)
2089 {
2090 	int32_t t;
2091 	int32_t tt;
2092 	uint32_t ret_val;
2093 
2094 	t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT));
2095 	TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
2096 	    tcp_persmin, tcp_persmax);
2097 	if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
2098 		tp->t_rxtshift++;
2099 	rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT;
2100 	ret_val = (uint32_t)tt;
2101 	return (ret_val);
2102 }
2103 
2104 static uint32_t
2105 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2106 {
2107 	/*
2108 	 * Start the FR timer, we do this based on getting the first one in
2109 	 * the rc_tmap. Note that if its NULL we must stop the timer. in all
2110 	 * events we need to stop the running timer (if its running) before
2111 	 * starting the new one.
2112 	 */
2113 	uint32_t thresh, exp, to, srtt, time_since_sent;
2114 	uint32_t srtt_cur;
2115 	int32_t idx;
2116 	int32_t is_tlp_timer = 0;
2117 	struct rack_sendmap *rsm;
2118 
2119 	if (rack->t_timers_stopped) {
2120 		/* All timers have been stopped none are to run */
2121 		return (0);
2122 	}
2123 	if (rack->rc_in_persist) {
2124 		/* We can't start any timer in persists */
2125 		return (rack_get_persists_timer_val(tp, rack));
2126 	}
2127 	if (tp->t_state < TCPS_ESTABLISHED)
2128 		goto activate_rxt;
2129 	rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2130 	if (rsm == NULL) {
2131 		/* Nothing on the send map */
2132 activate_rxt:
2133 		if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) {
2134 			rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT;
2135 			to = TICKS_2_MSEC(tp->t_rxtcur);
2136 			if (to == 0)
2137 				to = 1;
2138 			return (to);
2139 		}
2140 		return (0);
2141 	}
2142 	if (rsm->r_flags & RACK_ACKED) {
2143 		rsm = rack_find_lowest_rsm(rack);
2144 		if (rsm == NULL) {
2145 			/* No lowest? */
2146 			goto activate_rxt;
2147 		}
2148 	}
2149 	/* Convert from ms to usecs */
2150 	if (rsm->r_flags & RACK_SACK_PASSED) {
2151 		if ((tp->t_flags & TF_SENTFIN) &&
2152 		    ((tp->snd_max - tp->snd_una) == 1) &&
2153 		    (rsm->r_flags & RACK_HAS_FIN)) {
2154 			/*
2155 			 * We don't start a rack timer if all we have is a
2156 			 * FIN outstanding.
2157 			 */
2158 			goto activate_rxt;
2159 		}
2160 		if (tp->t_srtt) {
2161 			srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT);
2162 			srtt = TICKS_2_MSEC(srtt_cur);
2163 		} else
2164 			srtt = RACK_INITIAL_RTO;
2165 
2166 		thresh = rack_calc_thresh_rack(rack, srtt, cts);
2167 		idx = rsm->r_rtr_cnt - 1;
2168 		exp = rsm->r_tim_lastsent[idx] + thresh;
2169 		if (SEQ_GEQ(exp, cts)) {
2170 			to = exp - cts;
2171 			if (to < rack->r_ctl.rc_min_to) {
2172 				to = rack->r_ctl.rc_min_to;
2173 			}
2174 		} else {
2175 			to = rack->r_ctl.rc_min_to;
2176 		}
2177 	} else {
2178 		/* Ok we need to do a TLP not RACK */
2179 		if ((rack->rc_tlp_in_progress != 0) ||
2180 		    (rack->r_ctl.rc_tlp_rtx_out != 0)) {
2181 			/*
2182 			 * The previous send was a TLP or a tlp_rtx is in
2183 			 * process.
2184 			 */
2185 			goto activate_rxt;
2186 		}
2187 		rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
2188 		if (rsm == NULL) {
2189 			/* We found no rsm to TLP with. */
2190 			goto activate_rxt;
2191 		}
2192 		if (rsm->r_flags & RACK_HAS_FIN) {
2193 			/* If its a FIN we dont do TLP */
2194 			rsm = NULL;
2195 			goto activate_rxt;
2196 		}
2197 		idx = rsm->r_rtr_cnt - 1;
2198 		if (TSTMP_GT(cts,  rsm->r_tim_lastsent[idx]))
2199 			time_since_sent = cts - rsm->r_tim_lastsent[idx];
2200 		else
2201 			time_since_sent = 0;
2202 		is_tlp_timer = 1;
2203 		if (tp->t_srtt) {
2204 			srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT);
2205 			srtt = TICKS_2_MSEC(srtt_cur);
2206 		} else
2207 			srtt = RACK_INITIAL_RTO;
2208 		thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt);
2209 		if (thresh > time_since_sent)
2210 			to = thresh - time_since_sent;
2211 		else
2212 			to = rack->r_ctl.rc_min_to;
2213 		if (to > TCPTV_REXMTMAX) {
2214 			/*
2215 			 * If the TLP time works out to larger than the max
2216 			 * RTO lets not do TLP.. just RTO.
2217 			 */
2218 			goto activate_rxt;
2219 		}
2220 		if (rsm->r_start != rack->r_ctl.rc_last_tlp_seq) {
2221 			/*
2222 			 * The tail is no longer the last one I did a probe
2223 			 * on
2224 			 */
2225 			rack->r_ctl.rc_tlp_seg_send_cnt = 0;
2226 			rack->r_ctl.rc_last_tlp_seq = rsm->r_start;
2227 		}
2228 	}
2229 	if (is_tlp_timer == 0) {
2230 		rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK;
2231 	} else {
2232 		if ((rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) ||
2233 		    (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) {
2234 			/*
2235 			 * We have exceeded how many times we can retran the
2236 			 * current TLP timer, switch to the RTO timer.
2237 			 */
2238 			goto activate_rxt;
2239 		} else {
2240 			rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP;
2241 		}
2242 	}
2243 	if (to == 0)
2244 		to = 1;
2245 	return (to);
2246 }
2247 
2248 static void
2249 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2250 {
2251 	if (rack->rc_in_persist == 0) {
2252 		if (((tp->t_flags & TF_SENTFIN) == 0) &&
2253 		    (tp->snd_max - tp->snd_una) >= sbavail(&rack->rc_inp->inp_socket->so_snd))
2254 			/* Must need to send more data to enter persist */
2255 			return;
2256 		rack->r_ctl.rc_went_idle_time = cts;
2257 		rack_timer_cancel(tp, rack, cts, __LINE__);
2258 		tp->t_rxtshift = 0;
2259 		rack->rc_in_persist = 1;
2260 	}
2261 }
2262 
2263 static void
2264 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack)
2265 {
2266 	if (rack->rc_inp->inp_in_hpts)  {
2267 		tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
2268 		rack->r_ctl.rc_hpts_flags  = 0;
2269 	}
2270 	rack->rc_in_persist = 0;
2271 	rack->r_ctl.rc_went_idle_time = 0;
2272 	tp->t_flags &= ~TF_FORCEDATA;
2273 	tp->t_rxtshift = 0;
2274 }
2275 
2276 static void
2277 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int32_t line,
2278     int32_t slot, uint32_t tot_len_this_send, int32_t frm_out_sbavail)
2279 {
2280 	struct inpcb *inp;
2281 	uint32_t delayed_ack = 0;
2282 	uint32_t hpts_timeout;
2283 	uint8_t stopped;
2284 	uint32_t left = 0;
2285 
2286 	inp = tp->t_inpcb;
2287 	if (inp->inp_in_hpts) {
2288 		/* A previous call is already set up */
2289 		return;
2290 	}
2291 	if (tp->t_state == TCPS_CLOSED) {
2292 		return;
2293 	}
2294 	stopped = rack->rc_tmr_stopped;
2295 	if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) {
2296 		left = rack->r_ctl.rc_timer_exp - cts;
2297 	}
2298 	rack->r_ctl.rc_timer_exp = 0;
2299 	if (rack->rc_inp->inp_in_hpts == 0) {
2300 		rack->r_ctl.rc_hpts_flags = 0;
2301 	}
2302 	if (slot) {
2303 		/* We are hptsi too */
2304 		rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
2305 	} else if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
2306 		/*
2307 		 * We are still left on the hpts when the to goes
2308 		 * it will be for output.
2309 		 */
2310 		if (TSTMP_GT(cts, rack->r_ctl.rc_last_output_to))
2311 			slot = cts - rack->r_ctl.rc_last_output_to;
2312 		else
2313 			slot = 1;
2314 	}
2315 	if ((tp->snd_wnd == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) {
2316 		/* No send window.. we must enter persist */
2317 		rack_enter_persist(tp, rack, cts);
2318 	} else if ((frm_out_sbavail &&
2319 		    (frm_out_sbavail > (tp->snd_max - tp->snd_una)) &&
2320 		    (tp->snd_wnd < tp->t_maxseg)) &&
2321 	    TCPS_HAVEESTABLISHED(tp->t_state)) {
2322 		/*
2323 		 * If we have no window or we can't send a segment (and have
2324 		 * data to send.. we cheat here and frm_out_sbavail is
2325 		 * passed in with the sbavail(sb) only from bbr_output) and
2326 		 * we are established, then we must enter persits (if not
2327 		 * already in persits).
2328 		 */
2329 		rack_enter_persist(tp, rack, cts);
2330 	}
2331 	hpts_timeout = rack_timer_start(tp, rack, cts);
2332 	if (tp->t_flags & TF_DELACK) {
2333 		delayed_ack = TICKS_2_MSEC(tcp_delacktime);
2334 		rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK;
2335 	}
2336 	if (delayed_ack && ((hpts_timeout == 0) ||
2337 			    (delayed_ack < hpts_timeout)))
2338 		hpts_timeout = delayed_ack;
2339 	else
2340 		rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
2341 	/*
2342 	 * If no timers are going to run and we will fall off the hptsi
2343 	 * wheel, we resort to a keep-alive timer if its configured.
2344 	 */
2345 	if ((hpts_timeout == 0) &&
2346 	    (slot == 0)) {
2347 		if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
2348 		    (tp->t_state <= TCPS_CLOSING)) {
2349 			/*
2350 			 * Ok we have no timer (persists, rack, tlp, rxt  or
2351 			 * del-ack), we don't have segments being paced. So
2352 			 * all that is left is the keepalive timer.
2353 			 */
2354 			if (TCPS_HAVEESTABLISHED(tp->t_state)) {
2355 				/* Get the established keep-alive time */
2356 				hpts_timeout = TP_KEEPIDLE(tp);
2357 			} else {
2358 				/* Get the initial setup keep-alive time */
2359 				hpts_timeout = TP_KEEPINIT(tp);
2360 			}
2361 			rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP;
2362 		}
2363 	}
2364 	if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) ==
2365 	    (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) {
2366 		/*
2367 		 * RACK, TLP, persists and RXT timers all are restartable
2368 		 * based on actions input .. i.e we received a packet (ack
2369 		 * or sack) and that changes things (rw, or snd_una etc).
2370 		 * Thus we can restart them with a new value. For
2371 		 * keep-alive, delayed_ack we keep track of what was left
2372 		 * and restart the timer with a smaller value.
2373 		 */
2374 		if (left < hpts_timeout)
2375 			hpts_timeout = left;
2376 	}
2377 	if (hpts_timeout) {
2378 		/*
2379 		 * Hack alert for now we can't time-out over 2,147,483
2380 		 * seconds (a bit more than 596 hours), which is probably ok
2381 		 * :).
2382 		 */
2383 		if (hpts_timeout > 0x7ffffffe)
2384 			hpts_timeout = 0x7ffffffe;
2385 		rack->r_ctl.rc_timer_exp = cts + hpts_timeout;
2386 	}
2387 	if (slot) {
2388 		rack->r_ctl.rc_last_output_to = cts + slot;
2389 		if ((hpts_timeout == 0) || (hpts_timeout > slot)) {
2390 			if (rack->rc_inp->inp_in_hpts == 0)
2391 				tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(slot));
2392 			rack_log_to_start(rack, cts, hpts_timeout, slot, 1);
2393 		} else {
2394 			/*
2395 			 * Arrange for the hpts to kick back in after the
2396 			 * t-o if the t-o does not cause a send.
2397 			 */
2398 			if (rack->rc_inp->inp_in_hpts == 0)
2399 				tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout));
2400 			rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
2401 		}
2402 	} else if (hpts_timeout) {
2403 		if (rack->rc_inp->inp_in_hpts == 0)
2404 			tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout));
2405 		rack_log_to_start(rack, cts, hpts_timeout, slot, 0);
2406 	} else {
2407 		/* No timer starting */
2408 #ifdef INVARIANTS
2409 		if (SEQ_GT(tp->snd_max, tp->snd_una)) {
2410 			panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?",
2411 			    tp, rack, tot_len_this_send, cts, slot, hpts_timeout);
2412 		}
2413 #endif
2414 	}
2415 	rack->rc_tmr_stopped = 0;
2416 	if (slot)
2417 		rack_log_type_bbrsnd(rack, tot_len_this_send, slot, cts);
2418 }
2419 
2420 /*
2421  * RACK Timer, here we simply do logging and house keeping.
2422  * the normal rack_output() function will call the
2423  * appropriate thing to check if we need to do a RACK retransmit.
2424  * We return 1, saying don't proceed with rack_output only
2425  * when all timers have been stopped (destroyed PCB?).
2426  */
2427 static int
2428 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2429 {
2430 	/*
2431 	 * This timer simply provides an internal trigger to send out data.
2432 	 * The check_recovery_mode call will see if there are needed
2433 	 * retransmissions, if so we will enter fast-recovery. The output
2434 	 * call may or may not do the same thing depending on sysctl
2435 	 * settings.
2436 	 */
2437 	struct rack_sendmap *rsm;
2438 	int32_t recovery;
2439 
2440 	if (tp->t_timers->tt_flags & TT_STOPPED) {
2441 		return (1);
2442 	}
2443 	if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
2444 		/* Its not time yet */
2445 		return (0);
2446 	}
2447 	rack_log_to_event(rack, RACK_TO_FRM_RACK);
2448 	recovery = IN_RECOVERY(tp->t_flags);
2449 	counter_u64_add(rack_to_tot, 1);
2450 	if (rack->r_state && (rack->r_state != tp->t_state))
2451 		rack_set_state(tp, rack);
2452 	rsm = rack_check_recovery_mode(tp, cts);
2453 	if (rsm) {
2454 		uint32_t rtt;
2455 
2456 		rtt = rack->rc_rack_rtt;
2457 		if (rtt == 0)
2458 			rtt = 1;
2459 		if ((recovery == 0) &&
2460 		    (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg)) {
2461 			/*
2462 			 * The rack-timeout that enter's us into recovery
2463 			 * will force out one MSS and set us up so that we
2464 			 * can do one more send in 2*rtt (transitioning the
2465 			 * rack timeout into a rack-tlp).
2466 			 */
2467 			rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
2468 		} else if ((rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) &&
2469 		    ((rsm->r_end - rsm->r_start) > rack->r_ctl.rc_prr_sndcnt)) {
2470 			/*
2471 			 * When a rack timer goes, we have to send at
2472 			 * least one segment. They will be paced a min of 1ms
2473 			 * apart via the next rack timer (or further
2474 			 * if the rack timer dictates it).
2475 			 */
2476 			rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
2477 		}
2478 	} else {
2479 		/* This is a case that should happen rarely if ever */
2480 		counter_u64_add(rack_tlp_does_nada, 1);
2481 #ifdef TCP_BLACKBOX
2482 		tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
2483 #endif
2484 		rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2485 	}
2486 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK;
2487 	return (0);
2488 }
2489 
2490 /*
2491  * TLP Timer, here we simply setup what segment we want to
2492  * have the TLP expire on, the normal rack_output() will then
2493  * send it out.
2494  *
2495  * We return 1, saying don't proceed with rack_output only
2496  * when all timers have been stopped (destroyed PCB?).
2497  */
2498 static int
2499 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2500 {
2501 	/*
2502 	 * Tail Loss Probe.
2503 	 */
2504 	struct rack_sendmap *rsm = NULL;
2505 	struct socket *so;
2506 	uint32_t amm, old_prr_snd = 0;
2507 	uint32_t out, avail;
2508 
2509 	if (tp->t_timers->tt_flags & TT_STOPPED) {
2510 		return (1);
2511 	}
2512 	if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
2513 		/* Its not time yet */
2514 		return (0);
2515 	}
2516 	if (rack_progress_timeout_check(tp)) {
2517 		tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
2518 		return (1);
2519 	}
2520 	/*
2521 	 * A TLP timer has expired. We have been idle for 2 rtts. So we now
2522 	 * need to figure out how to force a full MSS segment out.
2523 	 */
2524 	rack_log_to_event(rack, RACK_TO_FRM_TLP);
2525 	counter_u64_add(rack_tlp_tot, 1);
2526 	if (rack->r_state && (rack->r_state != tp->t_state))
2527 		rack_set_state(tp, rack);
2528 	so = tp->t_inpcb->inp_socket;
2529 	avail = sbavail(&so->so_snd);
2530 	out = tp->snd_max - tp->snd_una;
2531 	rack->rc_timer_up = 1;
2532 	/*
2533 	 * If we are in recovery we can jazz out a segment if new data is
2534 	 * present simply by setting rc_prr_sndcnt to a segment.
2535 	 */
2536 	if ((avail > out) &&
2537 	    ((rack_always_send_oldest == 0) || (TAILQ_EMPTY(&rack->r_ctl.rc_tmap)))) {
2538 		/* New data is available */
2539 		amm = avail - out;
2540 		if (amm > tp->t_maxseg) {
2541 			amm = tp->t_maxseg;
2542 		} else if ((amm < tp->t_maxseg) && ((tp->t_flags & TF_NODELAY) == 0)) {
2543 			/* not enough to fill a MTU and no-delay is off */
2544 			goto need_retran;
2545 		}
2546 		if (IN_RECOVERY(tp->t_flags)) {
2547 			/* Unlikely */
2548 			old_prr_snd = rack->r_ctl.rc_prr_sndcnt;
2549 			if (out + amm <= tp->snd_wnd)
2550 				rack->r_ctl.rc_prr_sndcnt = amm;
2551 			else
2552 				goto need_retran;
2553 		} else {
2554 			/* Set the send-new override */
2555 			if (out + amm <= tp->snd_wnd)
2556 				rack->r_ctl.rc_tlp_new_data = amm;
2557 			else
2558 				goto need_retran;
2559 		}
2560 		rack->r_ctl.rc_tlp_seg_send_cnt = 0;
2561 		rack->r_ctl.rc_last_tlp_seq = tp->snd_max;
2562 		rack->r_ctl.rc_tlpsend = NULL;
2563 		counter_u64_add(rack_tlp_newdata, 1);
2564 		goto send;
2565 	}
2566 need_retran:
2567 	/*
2568 	 * Ok we need to arrange the last un-acked segment to be re-sent, or
2569 	 * optionally the first un-acked segment.
2570 	 */
2571 	if (rack_always_send_oldest)
2572 		rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
2573 	else {
2574 		rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next);
2575 		if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) {
2576 			rsm = rack_find_high_nonack(rack, rsm);
2577 		}
2578 	}
2579 	if (rsm == NULL) {
2580 		counter_u64_add(rack_tlp_does_nada, 1);
2581 #ifdef TCP_BLACKBOX
2582 		tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
2583 #endif
2584 		goto out;
2585 	}
2586 	if ((rsm->r_end - rsm->r_start) > tp->t_maxseg) {
2587 		/*
2588 		 * We need to split this the last segment in two.
2589 		 */
2590 		int32_t idx;
2591 		struct rack_sendmap *nrsm;
2592 
2593 		nrsm = rack_alloc(rack);
2594 		if (nrsm == NULL) {
2595 			/*
2596 			 * No memory to split, we will just exit and punt
2597 			 * off to the RXT timer.
2598 			 */
2599 			counter_u64_add(rack_tlp_does_nada, 1);
2600 			goto out;
2601 		}
2602 		nrsm->r_start = (rsm->r_end - tp->t_maxseg);
2603 		nrsm->r_end = rsm->r_end;
2604 		nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
2605 		nrsm->r_flags = rsm->r_flags;
2606 		nrsm->r_sndcnt = rsm->r_sndcnt;
2607 		nrsm->r_rtr_bytes = 0;
2608 		rsm->r_end = nrsm->r_start;
2609 		for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
2610 			nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
2611 		}
2612 		TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
2613 		if (rsm->r_in_tmap) {
2614 			TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
2615 			nrsm->r_in_tmap = 1;
2616 		}
2617 		rsm->r_flags &= (~RACK_HAS_FIN);
2618 		rsm = nrsm;
2619 	}
2620 	rack->r_ctl.rc_tlpsend = rsm;
2621 	rack->r_ctl.rc_tlp_rtx_out = 1;
2622 	if (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) {
2623 		rack->r_ctl.rc_tlp_seg_send_cnt++;
2624 		tp->t_rxtshift++;
2625 	} else {
2626 		rack->r_ctl.rc_last_tlp_seq = rsm->r_start;
2627 		rack->r_ctl.rc_tlp_seg_send_cnt = 1;
2628 	}
2629 send:
2630 	rack->r_ctl.rc_tlp_send_cnt++;
2631 	if (rack->r_ctl.rc_tlp_send_cnt > rack_tlp_max_resend) {
2632 		/*
2633 		 * Can't [re]/transmit a segment we have not heard from the
2634 		 * peer in max times. We need the retransmit timer to take
2635 		 * over.
2636 		 */
2637 restore:
2638 		rack->r_ctl.rc_tlpsend = NULL;
2639 		if (rsm)
2640 			rsm->r_flags &= ~RACK_TLP;
2641 		rack->r_ctl.rc_prr_sndcnt = old_prr_snd;
2642 		counter_u64_add(rack_tlp_retran_fail, 1);
2643 		goto out;
2644 	} else if (rsm) {
2645 		rsm->r_flags |= RACK_TLP;
2646 	}
2647 	if (rsm && (rsm->r_start == rack->r_ctl.rc_last_tlp_seq) &&
2648 	    (rack->r_ctl.rc_tlp_seg_send_cnt > rack_tlp_max_resend)) {
2649 		/*
2650 		 * We don't want to send a single segment more than the max
2651 		 * either.
2652 		 */
2653 		goto restore;
2654 	}
2655 	rack->r_timer_override = 1;
2656 	rack->r_tlp_running = 1;
2657 	rack->rc_tlp_in_progress = 1;
2658 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
2659 	return (0);
2660 out:
2661 	rack->rc_timer_up = 0;
2662 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
2663 	return (0);
2664 }
2665 
2666 /*
2667  * Delayed ack Timer, here we simply need to setup the
2668  * ACK_NOW flag and remove the DELACK flag. From there
2669  * the output routine will send the ack out.
2670  *
2671  * We only return 1, saying don't proceed, if all timers
2672  * are stopped (destroyed PCB?).
2673  */
2674 static int
2675 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2676 {
2677 	if (tp->t_timers->tt_flags & TT_STOPPED) {
2678 		return (1);
2679 	}
2680 	rack_log_to_event(rack, RACK_TO_FRM_DELACK);
2681 	tp->t_flags &= ~TF_DELACK;
2682 	tp->t_flags |= TF_ACKNOW;
2683 	TCPSTAT_INC(tcps_delack);
2684 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
2685 	return (0);
2686 }
2687 
2688 /*
2689  * Persists timer, here we simply need to setup the
2690  * FORCE-DATA flag the output routine will send
2691  * the one byte send.
2692  *
2693  * We only return 1, saying don't proceed, if all timers
2694  * are stopped (destroyed PCB?).
2695  */
2696 static int
2697 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2698 {
2699 	struct inpcb *inp;
2700 	int32_t retval = 0;
2701 
2702 	inp = tp->t_inpcb;
2703 
2704 	if (tp->t_timers->tt_flags & TT_STOPPED) {
2705 		return (1);
2706 	}
2707 	if (rack->rc_in_persist == 0)
2708 		return (0);
2709 	if (rack_progress_timeout_check(tp)) {
2710 		tcp_set_inp_to_drop(inp, ETIMEDOUT);
2711 		return (1);
2712 	}
2713 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
2714 	/*
2715 	 * Persistence timer into zero window. Force a byte to be output, if
2716 	 * possible.
2717 	 */
2718 	TCPSTAT_INC(tcps_persisttimeo);
2719 	/*
2720 	 * Hack: if the peer is dead/unreachable, we do not time out if the
2721 	 * window is closed.  After a full backoff, drop the connection if
2722 	 * the idle time (no responses to probes) reaches the maximum
2723 	 * backoff that we would use if retransmitting.
2724 	 */
2725 	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
2726 	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
2727 	    ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
2728 		TCPSTAT_INC(tcps_persistdrop);
2729 		retval = 1;
2730 		tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
2731 		goto out;
2732 	}
2733 	if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) &&
2734 	    tp->snd_una == tp->snd_max)
2735 		rack_exit_persist(tp, rack);
2736 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT;
2737 	/*
2738 	 * If the user has closed the socket then drop a persisting
2739 	 * connection after a much reduced timeout.
2740 	 */
2741 	if (tp->t_state > TCPS_CLOSE_WAIT &&
2742 	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
2743 		retval = 1;
2744 		TCPSTAT_INC(tcps_persistdrop);
2745 		tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
2746 		goto out;
2747 	}
2748 	tp->t_flags |= TF_FORCEDATA;
2749 out:
2750 	rack_log_to_event(rack, RACK_TO_FRM_PERSIST);
2751 	return (retval);
2752 }
2753 
2754 /*
2755  * If a keepalive goes off, we had no other timers
2756  * happening. We always return 1 here since this
2757  * routine either drops the connection or sends
2758  * out a segment with respond.
2759  */
2760 static int
2761 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2762 {
2763 	struct tcptemp *t_template;
2764 	struct inpcb *inp;
2765 
2766 	if (tp->t_timers->tt_flags & TT_STOPPED) {
2767 		return (1);
2768 	}
2769 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP;
2770 	inp = tp->t_inpcb;
2771 	rack_log_to_event(rack, RACK_TO_FRM_KEEP);
2772 	/*
2773 	 * Keep-alive timer went off; send something or drop connection if
2774 	 * idle for too long.
2775 	 */
2776 	TCPSTAT_INC(tcps_keeptimeo);
2777 	if (tp->t_state < TCPS_ESTABLISHED)
2778 		goto dropit;
2779 	if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
2780 	    tp->t_state <= TCPS_CLOSING) {
2781 		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
2782 			goto dropit;
2783 		/*
2784 		 * Send a packet designed to force a response if the peer is
2785 		 * up and reachable: either an ACK if the connection is
2786 		 * still alive, or an RST if the peer has closed the
2787 		 * connection due to timeout or reboot. Using sequence
2788 		 * number tp->snd_una-1 causes the transmitted zero-length
2789 		 * segment to lie outside the receive window; by the
2790 		 * protocol spec, this requires the correspondent TCP to
2791 		 * respond.
2792 		 */
2793 		TCPSTAT_INC(tcps_keepprobe);
2794 		t_template = tcpip_maketemplate(inp);
2795 		if (t_template) {
2796 			tcp_respond(tp, t_template->tt_ipgen,
2797 			    &t_template->tt_t, (struct mbuf *)NULL,
2798 			    tp->rcv_nxt, tp->snd_una - 1, 0);
2799 			free(t_template, M_TEMP);
2800 		}
2801 	}
2802 	rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0);
2803 	return (1);
2804 dropit:
2805 	TCPSTAT_INC(tcps_keepdrops);
2806 	tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT);
2807 	return (1);
2808 }
2809 
2810 /*
2811  * Retransmit helper function, clear up all the ack
2812  * flags and take care of important book keeping.
2813  */
2814 static void
2815 rack_remxt_tmr(struct tcpcb *tp)
2816 {
2817 	/*
2818 	 * The retransmit timer went off, all sack'd blocks must be
2819 	 * un-acked.
2820 	 */
2821 	struct rack_sendmap *rsm, *trsm = NULL;
2822 	struct tcp_rack *rack;
2823 	int32_t cnt = 0;
2824 
2825 	rack = (struct tcp_rack *)tp->t_fb_ptr;
2826 	rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__);
2827 	rack_log_to_event(rack, RACK_TO_FRM_TMR);
2828 	if (rack->r_state && (rack->r_state != tp->t_state))
2829 		rack_set_state(tp, rack);
2830 	/*
2831 	 * Ideally we would like to be able to
2832 	 * mark SACK-PASS on anything not acked here.
2833 	 * However, if we do that we would burst out
2834 	 * all that data 1ms apart. This would be unwise,
2835 	 * so for now we will just let the normal rxt timer
2836 	 * and tlp timer take care of it.
2837 	 */
2838 	TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) {
2839 		if (rsm->r_flags & RACK_ACKED) {
2840 			cnt++;
2841 			rsm->r_sndcnt = 0;
2842 			if (rsm->r_in_tmap == 0) {
2843 				/* We must re-add it back to the tlist */
2844 				if (trsm == NULL) {
2845 					TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
2846 				} else {
2847 					TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext);
2848 				}
2849 				rsm->r_in_tmap = 1;
2850 				trsm = rsm;
2851 			}
2852 		}
2853 		rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS);
2854 	}
2855 	/* Clear the count (we just un-acked them) */
2856 	rack->r_ctl.rc_sacked = 0;
2857 	/* Clear the tlp rtx mark */
2858 	rack->r_ctl.rc_tlp_rtx_out = 0;
2859 	rack->r_ctl.rc_tlp_seg_send_cnt = 0;
2860 	rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_map);
2861 	/* Setup so we send one segment */
2862 	if (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg)
2863 		rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
2864 	rack->r_timer_override = 1;
2865 }
2866 
2867 /*
2868  * Re-transmit timeout! If we drop the PCB we will return 1, otherwise
2869  * we will setup to retransmit the lowest seq number outstanding.
2870  */
2871 static int
2872 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
2873 {
2874 	int32_t rexmt;
2875 	struct inpcb *inp;
2876 	int32_t retval = 0;
2877 
2878 	inp = tp->t_inpcb;
2879 	if (tp->t_timers->tt_flags & TT_STOPPED) {
2880 		return (1);
2881 	}
2882 	if (rack_progress_timeout_check(tp)) {
2883 		tcp_set_inp_to_drop(inp, ETIMEDOUT);
2884 		return (1);
2885 	}
2886 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT;
2887 	if (TCPS_HAVEESTABLISHED(tp->t_state) &&
2888 	    (tp->snd_una == tp->snd_max)) {
2889 		/* Nothing outstanding .. nothing to do */
2890 		return (0);
2891 	}
2892 	/*
2893 	 * Retransmission timer went off.  Message has not been acked within
2894 	 * retransmit interval.  Back off to a longer retransmit interval
2895 	 * and retransmit one segment.
2896 	 */
2897 	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
2898 		tp->t_rxtshift = TCP_MAXRXTSHIFT;
2899 		TCPSTAT_INC(tcps_timeoutdrop);
2900 		retval = 1;
2901 		tcp_set_inp_to_drop(rack->rc_inp,
2902 		    (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT));
2903 		goto out;
2904 	}
2905 	rack_remxt_tmr(tp);
2906 	if (tp->t_state == TCPS_SYN_SENT) {
2907 		/*
2908 		 * If the SYN was retransmitted, indicate CWND to be limited
2909 		 * to 1 segment in cc_conn_init().
2910 		 */
2911 		tp->snd_cwnd = 1;
2912 	} else if (tp->t_rxtshift == 1) {
2913 		/*
2914 		 * first retransmit; record ssthresh and cwnd so they can be
2915 		 * recovered if this turns out to be a "bad" retransmit. A
2916 		 * retransmit is considered "bad" if an ACK for this segment
2917 		 * is received within RTT/2 interval; the assumption here is
2918 		 * that the ACK was already in flight.  See "On Estimating
2919 		 * End-to-End Network Path Properties" by Allman and Paxson
2920 		 * for more details.
2921 		 */
2922 		tp->snd_cwnd_prev = tp->snd_cwnd;
2923 		tp->snd_ssthresh_prev = tp->snd_ssthresh;
2924 		tp->snd_recover_prev = tp->snd_recover;
2925 		if (IN_FASTRECOVERY(tp->t_flags))
2926 			tp->t_flags |= TF_WASFRECOVERY;
2927 		else
2928 			tp->t_flags &= ~TF_WASFRECOVERY;
2929 		if (IN_CONGRECOVERY(tp->t_flags))
2930 			tp->t_flags |= TF_WASCRECOVERY;
2931 		else
2932 			tp->t_flags &= ~TF_WASCRECOVERY;
2933 		tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
2934 		tp->t_flags |= TF_PREVVALID;
2935 	} else
2936 		tp->t_flags &= ~TF_PREVVALID;
2937 	TCPSTAT_INC(tcps_rexmttimeo);
2938 	if ((tp->t_state == TCPS_SYN_SENT) ||
2939 	    (tp->t_state == TCPS_SYN_RECEIVED))
2940 		rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]);
2941 	else
2942 		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
2943 	TCPT_RANGESET(tp->t_rxtcur, rexmt,
2944 	   max(MSEC_2_TICKS(rack_rto_min), rexmt),
2945 	   MSEC_2_TICKS(rack_rto_max));
2946 	/*
2947 	 * We enter the path for PLMTUD if connection is established or, if
2948 	 * connection is FIN_WAIT_1 status, reason for the last is that if
2949 	 * amount of data we send is very small, we could send it in couple
2950 	 * of packets and process straight to FIN. In that case we won't
2951 	 * catch ESTABLISHED state.
2952 	 */
2953 	if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED))
2954 	    || (tp->t_state == TCPS_FIN_WAIT_1))) {
2955 #ifdef INET6
2956 		int32_t isipv6;
2957 #endif
2958 
2959 		/*
2960 		 * Idea here is that at each stage of mtu probe (usually,
2961 		 * 1448 -> 1188 -> 524) should be given 2 chances to recover
2962 		 * before further clamping down. 'tp->t_rxtshift % 2 == 0'
2963 		 * should take care of that.
2964 		 */
2965 		if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) ==
2966 		    (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) &&
2967 		    (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 &&
2968 		    tp->t_rxtshift % 2 == 0)) {
2969 			/*
2970 			 * Enter Path MTU Black-hole Detection mechanism: -
2971 			 * Disable Path MTU Discovery (IP "DF" bit). -
2972 			 * Reduce MTU to lower value than what we negotiated
2973 			 * with peer.
2974 			 */
2975 			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
2976 				/* Record that we may have found a black hole. */
2977 				tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
2978 				/* Keep track of previous MSS. */
2979 				tp->t_pmtud_saved_maxseg = tp->t_maxseg;
2980 			}
2981 
2982 			/*
2983 			 * Reduce the MSS to blackhole value or to the
2984 			 * default in an attempt to retransmit.
2985 			 */
2986 #ifdef INET6
2987 			isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
2988 			if (isipv6 &&
2989 			    tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
2990 				/* Use the sysctl tuneable blackhole MSS. */
2991 				tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
2992 				TCPSTAT_INC(tcps_pmtud_blackhole_activated);
2993 			} else if (isipv6) {
2994 				/* Use the default MSS. */
2995 				tp->t_maxseg = V_tcp_v6mssdflt;
2996 				/*
2997 				 * Disable Path MTU Discovery when we switch
2998 				 * to minmss.
2999 				 */
3000 				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
3001 				TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
3002 			}
3003 #endif
3004 #if defined(INET6) && defined(INET)
3005 			else
3006 #endif
3007 #ifdef INET
3008 			if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
3009 				/* Use the sysctl tuneable blackhole MSS. */
3010 				tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
3011 				TCPSTAT_INC(tcps_pmtud_blackhole_activated);
3012 			} else {
3013 				/* Use the default MSS. */
3014 				tp->t_maxseg = V_tcp_mssdflt;
3015 				/*
3016 				 * Disable Path MTU Discovery when we switch
3017 				 * to minmss.
3018 				 */
3019 				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
3020 				TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
3021 			}
3022 #endif
3023 		} else {
3024 			/*
3025 			 * If further retransmissions are still unsuccessful
3026 			 * with a lowered MTU, maybe this isn't a blackhole
3027 			 * and we restore the previous MSS and blackhole
3028 			 * detection flags. The limit '6' is determined by
3029 			 * giving each probe stage (1448, 1188, 524) 2
3030 			 * chances to recover.
3031 			 */
3032 			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
3033 			    (tp->t_rxtshift >= 6)) {
3034 				tp->t_flags2 |= TF2_PLPMTU_PMTUD;
3035 				tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
3036 				tp->t_maxseg = tp->t_pmtud_saved_maxseg;
3037 				TCPSTAT_INC(tcps_pmtud_blackhole_failed);
3038 			}
3039 		}
3040 	}
3041 	/*
3042 	 * Disable RFC1323 and SACK if we haven't got any response to our
3043 	 * third SYN to work-around some broken terminal servers (most of
3044 	 * which have hopefully been retired) that have bad VJ header
3045 	 * compression code which trashes TCP segments containing
3046 	 * unknown-to-them TCP options.
3047 	 */
3048 	if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
3049 	    (tp->t_rxtshift == 3))
3050 		tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_SACK_PERMIT);
3051 	/*
3052 	 * If we backed off this far, our srtt estimate is probably bogus.
3053 	 * Clobber it so we'll take the next rtt measurement as our srtt;
3054 	 * move the current srtt into rttvar to keep the current retransmit
3055 	 * times until then.
3056 	 */
3057 	if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
3058 #ifdef INET6
3059 		if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
3060 			in6_losing(tp->t_inpcb);
3061 		else
3062 #endif
3063 			in_losing(tp->t_inpcb);
3064 		tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
3065 		tp->t_srtt = 0;
3066 	}
3067 	if (rack_use_sack_filter)
3068 		sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
3069 	tp->snd_recover = tp->snd_max;
3070 	tp->t_flags |= TF_ACKNOW;
3071 	tp->t_rtttime = 0;
3072 	rack_cong_signal(tp, NULL, CC_RTO);
3073 out:
3074 	return (retval);
3075 }
3076 
3077 static int
3078 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling)
3079 {
3080 	int32_t ret = 0;
3081 	int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK);
3082 
3083 	if (timers == 0) {
3084 		return (0);
3085 	}
3086 	if (tp->t_state == TCPS_LISTEN) {
3087 		/* no timers on listen sockets */
3088 		if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)
3089 			return (0);
3090 		return (1);
3091 	}
3092 	if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
3093 		uint32_t left;
3094 
3095 		if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
3096 			ret = -1;
3097 			rack_log_to_processing(rack, cts, ret, 0);
3098 			return (0);
3099 		}
3100 		if (hpts_calling == 0) {
3101 			ret = -2;
3102 			rack_log_to_processing(rack, cts, ret, 0);
3103 			return (0);
3104 		}
3105 		/*
3106 		 * Ok our timer went off early and we are not paced false
3107 		 * alarm, go back to sleep.
3108 		 */
3109 		ret = -3;
3110 		left = rack->r_ctl.rc_timer_exp - cts;
3111 		tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left));
3112 		rack_log_to_processing(rack, cts, ret, left);
3113 		rack->rc_last_pto_set = 0;
3114 		return (1);
3115 	}
3116 	rack->rc_tmr_stopped = 0;
3117 	rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK;
3118 	if (timers & PACE_TMR_DELACK) {
3119 		ret = rack_timeout_delack(tp, rack, cts);
3120 	} else if (timers & PACE_TMR_RACK) {
3121 		ret = rack_timeout_rack(tp, rack, cts);
3122 	} else if (timers & PACE_TMR_TLP) {
3123 		ret = rack_timeout_tlp(tp, rack, cts);
3124 	} else if (timers & PACE_TMR_RXT) {
3125 		ret = rack_timeout_rxt(tp, rack, cts);
3126 	} else if (timers & PACE_TMR_PERSIT) {
3127 		ret = rack_timeout_persist(tp, rack, cts);
3128 	} else if (timers & PACE_TMR_KEEP) {
3129 		ret = rack_timeout_keepalive(tp, rack, cts);
3130 	}
3131 	rack_log_to_processing(rack, cts, ret, timers);
3132 	return (ret);
3133 }
3134 
3135 static void
3136 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line)
3137 {
3138 	uint8_t hpts_removed = 0;
3139 
3140 	if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
3141 	    TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) {
3142 		tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
3143 		hpts_removed = 1;
3144 	}
3145 	if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
3146 		rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
3147 		if (rack->rc_inp->inp_in_hpts &&
3148 		    ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) {
3149 			/*
3150 			 * Canceling timer's when we have no output being
3151 			 * paced. We also must remove ourselves from the
3152 			 * hpts.
3153 			 */
3154 			tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
3155 			hpts_removed = 1;
3156 		}
3157 		rack_log_to_cancel(rack, hpts_removed, line);
3158 		rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK);
3159 	}
3160 }
3161 
3162 static void
3163 rack_timer_stop(struct tcpcb *tp, uint32_t timer_type)
3164 {
3165 	return;
3166 }
3167 
3168 static int
3169 rack_stopall(struct tcpcb *tp)
3170 {
3171 	struct tcp_rack *rack;
3172 	rack = (struct tcp_rack *)tp->t_fb_ptr;
3173 	rack->t_timers_stopped = 1;
3174 	return (0);
3175 }
3176 
3177 static void
3178 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta)
3179 {
3180 	return;
3181 }
3182 
3183 static int
3184 rack_timer_active(struct tcpcb *tp, uint32_t timer_type)
3185 {
3186 	return (0);
3187 }
3188 
3189 static void
3190 rack_stop_all_timers(struct tcpcb *tp)
3191 {
3192 	struct tcp_rack *rack;
3193 
3194 	/*
3195 	 * Assure no timers are running.
3196 	 */
3197 	if (tcp_timer_active(tp, TT_PERSIST)) {
3198 		/* We enter in persists, set the flag appropriately */
3199 		rack = (struct tcp_rack *)tp->t_fb_ptr;
3200 		rack->rc_in_persist = 1;
3201 	}
3202 	tcp_timer_suspend(tp, TT_PERSIST);
3203 	tcp_timer_suspend(tp, TT_REXMT);
3204 	tcp_timer_suspend(tp, TT_KEEP);
3205 	tcp_timer_suspend(tp, TT_DELACK);
3206 }
3207 
3208 static void
3209 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
3210     struct rack_sendmap *rsm, uint32_t ts)
3211 {
3212 	int32_t idx;
3213 
3214 	rsm->r_rtr_cnt++;
3215 	rsm->r_sndcnt++;
3216 	if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) {
3217 		rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS;
3218 		rsm->r_flags |= RACK_OVERMAX;
3219 	}
3220 	if ((rsm->r_rtr_cnt > 1) && (rack->r_tlp_running == 0)) {
3221 		rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start);
3222 		rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start);
3223 	}
3224 	idx = rsm->r_rtr_cnt - 1;
3225 	rsm->r_tim_lastsent[idx] = ts;
3226 	if (rsm->r_flags & RACK_ACKED) {
3227 		/* Problably MTU discovery messing with us */
3228 		rsm->r_flags &= ~RACK_ACKED;
3229 		rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
3230 	}
3231 	if (rsm->r_in_tmap) {
3232 		TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
3233 	}
3234 	TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
3235 	rsm->r_in_tmap = 1;
3236 	if (rsm->r_flags & RACK_SACK_PASSED) {
3237 		/* We have retransmitted due to the SACK pass */
3238 		rsm->r_flags &= ~RACK_SACK_PASSED;
3239 		rsm->r_flags |= RACK_WAS_SACKPASS;
3240 	}
3241 	/* Update memory for next rtr */
3242 	rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next);
3243 }
3244 
3245 
3246 static uint32_t
3247 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
3248     struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp)
3249 {
3250 	/*
3251 	 * We (re-)transmitted starting at rsm->r_start for some length
3252 	 * (possibly less than r_end.
3253 	 */
3254 	struct rack_sendmap *nrsm;
3255 	uint32_t c_end;
3256 	int32_t len;
3257 	int32_t idx;
3258 
3259 	len = *lenp;
3260 	c_end = rsm->r_start + len;
3261 	if (SEQ_GEQ(c_end, rsm->r_end)) {
3262 		/*
3263 		 * We retransmitted the whole piece or more than the whole
3264 		 * slopping into the next rsm.
3265 		 */
3266 		rack_update_rsm(tp, rack, rsm, ts);
3267 		if (c_end == rsm->r_end) {
3268 			*lenp = 0;
3269 			return (0);
3270 		} else {
3271 			int32_t act_len;
3272 
3273 			/* Hangs over the end return whats left */
3274 			act_len = rsm->r_end - rsm->r_start;
3275 			*lenp = (len - act_len);
3276 			return (rsm->r_end);
3277 		}
3278 		/* We don't get out of this block. */
3279 	}
3280 	/*
3281 	 * Here we retransmitted less than the whole thing which means we
3282 	 * have to split this into what was transmitted and what was not.
3283 	 */
3284 	nrsm = rack_alloc(rack);
3285 	if (nrsm == NULL) {
3286 		/*
3287 		 * We can't get memory, so lets not proceed.
3288 		 */
3289 		*lenp = 0;
3290 		return (0);
3291 	}
3292 	/*
3293 	 * So here we are going to take the original rsm and make it what we
3294 	 * retransmitted. nrsm will be the tail portion we did not
3295 	 * retransmit. For example say the chunk was 1, 11 (10 bytes). And
3296 	 * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to
3297 	 * 1, 6 and the new piece will be 6, 11.
3298 	 */
3299 	nrsm->r_start = c_end;
3300 	nrsm->r_end = rsm->r_end;
3301 	nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
3302 	nrsm->r_flags = rsm->r_flags;
3303 	nrsm->r_sndcnt = rsm->r_sndcnt;
3304 	nrsm->r_rtr_bytes = 0;
3305 	rsm->r_end = c_end;
3306 	for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
3307 		nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
3308 	}
3309 	TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
3310 	if (rsm->r_in_tmap) {
3311 		TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
3312 		nrsm->r_in_tmap = 1;
3313 	}
3314 	rsm->r_flags &= (~RACK_HAS_FIN);
3315 	rack_update_rsm(tp, rack, rsm, ts);
3316 	*lenp = 0;
3317 	return (0);
3318 }
3319 
3320 
3321 static void
3322 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
3323     uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
3324     uint8_t pass, struct rack_sendmap *hintrsm)
3325 {
3326 	struct tcp_rack *rack;
3327 	struct rack_sendmap *rsm, *nrsm;
3328 	register uint32_t snd_max, snd_una;
3329 	int32_t idx;
3330 
3331 	/*
3332 	 * Add to the RACK log of packets in flight or retransmitted. If
3333 	 * there is a TS option we will use the TS echoed, if not we will
3334 	 * grab a TS.
3335 	 *
3336 	 * Retransmissions will increment the count and move the ts to its
3337 	 * proper place. Note that if options do not include TS's then we
3338 	 * won't be able to effectively use the ACK for an RTT on a retran.
3339 	 *
3340 	 * Notes about r_start and r_end. Lets consider a send starting at
3341 	 * sequence 1 for 10 bytes. In such an example the r_start would be
3342 	 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11.
3343 	 * This means that r_end is actually the first sequence for the next
3344 	 * slot (11).
3345 	 *
3346 	 */
3347 	/*
3348 	 * If err is set what do we do XXXrrs? should we not add the thing?
3349 	 * -- i.e. return if err != 0 or should we pretend we sent it? --
3350 	 * i.e. proceed with add ** do this for now.
3351 	 */
3352 	INP_WLOCK_ASSERT(tp->t_inpcb);
3353 	if (err)
3354 		/*
3355 		 * We don't log errors -- we could but snd_max does not
3356 		 * advance in this case either.
3357 		 */
3358 		return;
3359 
3360 	if (th_flags & TH_RST) {
3361 		/*
3362 		 * We don't log resets and we return immediately from
3363 		 * sending
3364 		 */
3365 		return;
3366 	}
3367 	rack = (struct tcp_rack *)tp->t_fb_ptr;
3368 	snd_una = tp->snd_una;
3369 	if (SEQ_LEQ((seq_out + len), snd_una)) {
3370 		/* Are sending an old segment to induce an ack (keep-alive)? */
3371 		return;
3372 	}
3373 	if (SEQ_LT(seq_out, snd_una)) {
3374 		/* huh? should we panic? */
3375 		uint32_t end;
3376 
3377 		end = seq_out + len;
3378 		seq_out = snd_una;
3379 		len = end - seq_out;
3380 	}
3381 	snd_max = tp->snd_max;
3382 	if (th_flags & (TH_SYN | TH_FIN)) {
3383 		/*
3384 		 * The call to rack_log_output is made before bumping
3385 		 * snd_max. This means we can record one extra byte on a SYN
3386 		 * or FIN if seq_out is adding more on and a FIN is present
3387 		 * (and we are not resending).
3388 		 */
3389 		if (th_flags & TH_SYN)
3390 			len++;
3391 		if (th_flags & TH_FIN)
3392 			len++;
3393 		if (SEQ_LT(snd_max, tp->snd_nxt)) {
3394 			/*
3395 			 * The add/update as not been done for the FIN/SYN
3396 			 * yet.
3397 			 */
3398 			snd_max = tp->snd_nxt;
3399 		}
3400 	}
3401 	if (len == 0) {
3402 		/* We don't log zero window probes */
3403 		return;
3404 	}
3405 	rack->r_ctl.rc_time_last_sent = ts;
3406 	if (IN_RECOVERY(tp->t_flags)) {
3407 		rack->r_ctl.rc_prr_out += len;
3408 	}
3409 	/* First question is it a retransmission? */
3410 	if (seq_out == snd_max) {
3411 again:
3412 		rsm = rack_alloc(rack);
3413 		if (rsm == NULL) {
3414 			/*
3415 			 * Hmm out of memory and the tcb got destroyed while
3416 			 * we tried to wait.
3417 			 */
3418 #ifdef INVARIANTS
3419 			panic("Out of memory when we should not be rack:%p", rack);
3420 #endif
3421 			return;
3422 		}
3423 		if (th_flags & TH_FIN) {
3424 			rsm->r_flags = RACK_HAS_FIN;
3425 		} else {
3426 			rsm->r_flags = 0;
3427 		}
3428 		rsm->r_tim_lastsent[0] = ts;
3429 		rsm->r_rtr_cnt = 1;
3430 		rsm->r_rtr_bytes = 0;
3431 		if (th_flags & TH_SYN) {
3432 			/* The data space is one beyond snd_una */
3433 			rsm->r_start = seq_out + 1;
3434 			rsm->r_end = rsm->r_start + (len - 1);
3435 		} else {
3436 			/* Normal case */
3437 			rsm->r_start = seq_out;
3438 			rsm->r_end = rsm->r_start + len;
3439 		}
3440 		rsm->r_sndcnt = 0;
3441 		TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next);
3442 		TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
3443 		rsm->r_in_tmap = 1;
3444 		return;
3445 	}
3446 	/*
3447 	 * If we reach here its a retransmission and we need to find it.
3448 	 */
3449 more:
3450 	if (hintrsm && (hintrsm->r_start == seq_out)) {
3451 		rsm = hintrsm;
3452 		hintrsm = NULL;
3453 	} else if (rack->r_ctl.rc_next) {
3454 		/* We have a hint from a previous run */
3455 		rsm = rack->r_ctl.rc_next;
3456 	} else {
3457 		/* No hints sorry */
3458 		rsm = NULL;
3459 	}
3460 	if ((rsm) && (rsm->r_start == seq_out)) {
3461 		/*
3462 		 * We used rc_next or hintrsm  to retransmit, hopefully the
3463 		 * likely case.
3464 		 */
3465 		seq_out = rack_update_entry(tp, rack, rsm, ts, &len);
3466 		if (len == 0) {
3467 			return;
3468 		} else {
3469 			goto more;
3470 		}
3471 	}
3472 	/* Ok it was not the last pointer go through it the hard way. */
3473 	TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) {
3474 		if (rsm->r_start == seq_out) {
3475 			seq_out = rack_update_entry(tp, rack, rsm, ts, &len);
3476 			rack->r_ctl.rc_next = TAILQ_NEXT(rsm, r_next);
3477 			if (len == 0) {
3478 				return;
3479 			} else {
3480 				continue;
3481 			}
3482 		}
3483 		if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) {
3484 			/* Transmitted within this piece */
3485 			/*
3486 			 * Ok we must split off the front and then let the
3487 			 * update do the rest
3488 			 */
3489 			nrsm = rack_alloc(rack);
3490 			if (nrsm == NULL) {
3491 #ifdef INVARIANTS
3492 				panic("Ran out of memory that was preallocated? rack:%p", rack);
3493 #endif
3494 				rack_update_rsm(tp, rack, rsm, ts);
3495 				return;
3496 			}
3497 			/*
3498 			 * copy rsm to nrsm and then trim the front of rsm
3499 			 * to not include this part.
3500 			 */
3501 			nrsm->r_start = seq_out;
3502 			nrsm->r_end = rsm->r_end;
3503 			nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
3504 			nrsm->r_flags = rsm->r_flags;
3505 			nrsm->r_sndcnt = rsm->r_sndcnt;
3506 			nrsm->r_rtr_bytes = 0;
3507 			for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
3508 				nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
3509 			}
3510 			rsm->r_end = nrsm->r_start;
3511 			TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
3512 			if (rsm->r_in_tmap) {
3513 				TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
3514 				nrsm->r_in_tmap = 1;
3515 			}
3516 			rsm->r_flags &= (~RACK_HAS_FIN);
3517 			seq_out = rack_update_entry(tp, rack, nrsm, ts, &len);
3518 			if (len == 0) {
3519 				return;
3520 			}
3521 		}
3522 	}
3523 	/*
3524 	 * Hmm not found in map did they retransmit both old and on into the
3525 	 * new?
3526 	 */
3527 	if (seq_out == tp->snd_max) {
3528 		goto again;
3529 	} else if (SEQ_LT(seq_out, tp->snd_max)) {
3530 #ifdef INVARIANTS
3531 		printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n",
3532 		    seq_out, len, tp->snd_una, tp->snd_max);
3533 		printf("Starting Dump of all rack entries\n");
3534 		TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) {
3535 			printf("rsm:%p start:%u end:%u\n",
3536 			    rsm, rsm->r_start, rsm->r_end);
3537 		}
3538 		printf("Dump complete\n");
3539 		panic("seq_out not found rack:%p tp:%p",
3540 		    rack, tp);
3541 #endif
3542 	} else {
3543 #ifdef INVARIANTS
3544 		/*
3545 		 * Hmm beyond sndmax? (only if we are using the new rtt-pack
3546 		 * flag)
3547 		 */
3548 		panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p",
3549 		    seq_out, len, tp->snd_max, tp);
3550 #endif
3551 	}
3552 }
3553 
3554 /*
3555  * Record one of the RTT updates from an ack into
3556  * our sample structure.
3557  */
3558 static void
3559 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt)
3560 {
3561 	if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
3562 	    (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) {
3563 		rack->r_ctl.rack_rs.rs_rtt_lowest = rtt;
3564 	}
3565 	if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
3566 	    (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) {
3567 		rack->r_ctl.rack_rs.rs_rtt_highest = rtt;
3568 	}
3569 	rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID;
3570 	rack->r_ctl.rack_rs.rs_rtt_tot += rtt;
3571 	rack->r_ctl.rack_rs.rs_rtt_cnt++;
3572 }
3573 
3574 /*
3575  * Collect new round-trip time estimate
3576  * and update averages and current timeout.
3577  */
3578 static void
3579 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp)
3580 {
3581 	int32_t delta;
3582 	uint32_t o_srtt, o_var;
3583 	int32_t rtt;
3584 
3585 	if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY)
3586 		/* No valid sample */
3587 		return;
3588 	if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) {
3589 		/* We are to use the lowest RTT seen in a single ack */
3590 		rtt = rack->r_ctl.rack_rs.rs_rtt_lowest;
3591 	} else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) {
3592 		/* We are to use the highest RTT seen in a single ack */
3593 		rtt = rack->r_ctl.rack_rs.rs_rtt_highest;
3594 	} else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) {
3595 		/* We are to use the average RTT seen in a single ack */
3596 		rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot /
3597 				(uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt);
3598 	} else {
3599 #ifdef INVARIANTS
3600 		panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method);
3601 #endif
3602 		return;
3603 	}
3604 	if (rtt == 0)
3605 		rtt = 1;
3606 	rack_log_rtt_sample(rack, rtt);
3607 	o_srtt = tp->t_srtt;
3608 	o_var = tp->t_rttvar;
3609 	rack = (struct tcp_rack *)tp->t_fb_ptr;
3610 	if (tp->t_srtt != 0) {
3611 		/*
3612 		 * srtt is stored as fixed point with 5 bits after the
3613 		 * binary point (i.e., scaled by 8).  The following magic is
3614 		 * equivalent to the smoothing algorithm in rfc793 with an
3615 		 * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point).
3616 		 * Adjust rtt to origin 0.
3617 		 */
3618 		delta = ((rtt - 1) << TCP_DELTA_SHIFT)
3619 		    - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
3620 
3621 		tp->t_srtt += delta;
3622 		if (tp->t_srtt <= 0)
3623 			tp->t_srtt = 1;
3624 
3625 		/*
3626 		 * We accumulate a smoothed rtt variance (actually, a
3627 		 * smoothed mean difference), then set the retransmit timer
3628 		 * to smoothed rtt + 4 times the smoothed variance. rttvar
3629 		 * is stored as fixed point with 4 bits after the binary
3630 		 * point (scaled by 16).  The following is equivalent to
3631 		 * rfc793 smoothing with an alpha of .75 (rttvar =
3632 		 * rttvar*3/4 + |delta| / 4).  This replaces rfc793's
3633 		 * wired-in beta.
3634 		 */
3635 		if (delta < 0)
3636 			delta = -delta;
3637 		delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
3638 		tp->t_rttvar += delta;
3639 		if (tp->t_rttvar <= 0)
3640 			tp->t_rttvar = 1;
3641 		if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
3642 			tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
3643 	} else {
3644 		/*
3645 		 * No rtt measurement yet - use the unsmoothed rtt. Set the
3646 		 * variance to half the rtt (so our first retransmit happens
3647 		 * at 3*rtt).
3648 		 */
3649 		tp->t_srtt = rtt << TCP_RTT_SHIFT;
3650 		tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
3651 		tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
3652 	}
3653 	TCPSTAT_INC(tcps_rttupdated);
3654 	rack_log_rtt_upd(tp, rack, rtt, o_srtt, o_var);
3655 	tp->t_rttupdated++;
3656 #ifdef NETFLIX_STATS
3657 	stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt));
3658 #endif
3659 	tp->t_rxtshift = 0;
3660 
3661 	/*
3662 	 * the retransmit should happen at rtt + 4 * rttvar. Because of the
3663 	 * way we do the smoothing, srtt and rttvar will each average +1/2
3664 	 * tick of bias.  When we compute the retransmit timer, we want 1/2
3665 	 * tick of rounding and 1 extra tick because of +-1/2 tick
3666 	 * uncertainty in the firing of the timer.  The bias will give us
3667 	 * exactly the 1.5 tick we need.  But, because the bias is
3668 	 * statistical, we have to test that we don't drop below the minimum
3669 	 * feasible timer (which is 2 ticks).
3670 	 */
3671 	TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
3672 	   max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max));
3673 	tp->t_softerror = 0;
3674 }
3675 
3676 static void
3677 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
3678     uint32_t t, uint32_t cts)
3679 {
3680 	/*
3681 	 * For this RSM, we acknowledged the data from a previous
3682 	 * transmission, not the last one we made. This means we did a false
3683 	 * retransmit.
3684 	 */
3685 	struct tcp_rack *rack;
3686 
3687 	if (rsm->r_flags & RACK_HAS_FIN) {
3688 		/*
3689 		 * The sending of the FIN often is multiple sent when we
3690 		 * have everything outstanding ack'd. We ignore this case
3691 		 * since its over now.
3692 		 */
3693 		return;
3694 	}
3695 	if (rsm->r_flags & RACK_TLP) {
3696 		/*
3697 		 * We expect TLP's to have this occur.
3698 		 */
3699 		return;
3700 	}
3701 	rack = (struct tcp_rack *)tp->t_fb_ptr;
3702 	/* should we undo cc changes and exit recovery? */
3703 	if (IN_RECOVERY(tp->t_flags)) {
3704 		if (rack->r_ctl.rc_rsm_start == rsm->r_start) {
3705 			/*
3706 			 * Undo what we ratched down and exit recovery if
3707 			 * possible
3708 			 */
3709 			EXIT_RECOVERY(tp->t_flags);
3710 			tp->snd_recover = tp->snd_una;
3711 			if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd)
3712 				tp->snd_cwnd = rack->r_ctl.rc_cwnd_at;
3713 			if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh)
3714 				tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at;
3715 		}
3716 	}
3717 	if (rsm->r_flags & RACK_WAS_SACKPASS) {
3718 		/*
3719 		 * We retransmitted based on a sack and the earlier
3720 		 * retransmission ack'd it - re-ordering is occuring.
3721 		 */
3722 		counter_u64_add(rack_reorder_seen, 1);
3723 		rack->r_ctl.rc_reorder_ts = cts;
3724 	}
3725 	counter_u64_add(rack_badfr, 1);
3726 	counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start));
3727 }
3728 
3729 
3730 static int
3731 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
3732     struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type)
3733 {
3734 	int32_t i;
3735 	uint32_t t;
3736 
3737 	if (rsm->r_flags & RACK_ACKED)
3738 		/* Already done */
3739 		return (0);
3740 
3741 
3742 	if ((rsm->r_rtr_cnt == 1) ||
3743 	    ((ack_type == CUM_ACKED) &&
3744 	    (to->to_flags & TOF_TS) &&
3745 	    (to->to_tsecr) &&
3746 	    (rsm->r_tim_lastsent[rsm->r_rtr_cnt - 1] == to->to_tsecr))
3747 	    ) {
3748 		/*
3749 		 * We will only find a matching timestamp if its cum-acked.
3750 		 * But if its only one retransmission its for-sure matching
3751 		 * :-)
3752 		 */
3753 		t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
3754 		if ((int)t <= 0)
3755 			t = 1;
3756 		if (!tp->t_rttlow || tp->t_rttlow > t)
3757 			tp->t_rttlow = t;
3758 		if (!rack->r_ctl.rc_rack_min_rtt ||
3759 		    SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
3760 			rack->r_ctl.rc_rack_min_rtt = t;
3761 			if (rack->r_ctl.rc_rack_min_rtt == 0) {
3762 				rack->r_ctl.rc_rack_min_rtt = 1;
3763 			}
3764 		}
3765 		tcp_rack_xmit_timer(rack, TCP_TS_TO_TICKS(t) + 1);
3766 		if ((rsm->r_flags & RACK_TLP) &&
3767 		    (!IN_RECOVERY(tp->t_flags))) {
3768 			/* Segment was a TLP and our retrans matched */
3769 			if (rack->r_ctl.rc_tlp_cwnd_reduce) {
3770 				rack->r_ctl.rc_rsm_start = tp->snd_max;
3771 				rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
3772 				rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
3773 				rack_cong_signal(tp, NULL, CC_NDUPACK);
3774 				/*
3775 				 * When we enter recovery we need to assure
3776 				 * we send one packet.
3777 				 */
3778 				rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
3779 			} else
3780 				rack->r_ctl.rc_tlp_rtx_out = 0;
3781 		}
3782 		if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
3783 			/* New more recent rack_tmit_time */
3784 			rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
3785 			rack->rc_rack_rtt = t;
3786 		}
3787 		return (1);
3788 	}
3789 	/*
3790 	 * We clear the soft/rxtshift since we got an ack.
3791 	 * There is no assurance we will call the commit() function
3792 	 * so we need to clear these to avoid incorrect handling.
3793 	 */
3794 	tp->t_rxtshift = 0;
3795 	tp->t_softerror = 0;
3796 	if ((to->to_flags & TOF_TS) &&
3797 	    (ack_type == CUM_ACKED) &&
3798 	    (to->to_tsecr) &&
3799 	    ((rsm->r_flags & (RACK_DEFERRED | RACK_OVERMAX)) == 0)) {
3800 		/*
3801 		 * Now which timestamp does it match? In this block the ACK
3802 		 * must be coming from a previous transmission.
3803 		 */
3804 		for (i = 0; i < rsm->r_rtr_cnt; i++) {
3805 			if (rsm->r_tim_lastsent[i] == to->to_tsecr) {
3806 				t = cts - rsm->r_tim_lastsent[i];
3807 				if ((int)t <= 0)
3808 					t = 1;
3809 				if ((i + 1) < rsm->r_rtr_cnt) {
3810 					/* Likely */
3811 					rack_earlier_retran(tp, rsm, t, cts);
3812 				}
3813 				if (!tp->t_rttlow || tp->t_rttlow > t)
3814 					tp->t_rttlow = t;
3815 				if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
3816 					rack->r_ctl.rc_rack_min_rtt = t;
3817 					if (rack->r_ctl.rc_rack_min_rtt == 0) {
3818 						rack->r_ctl.rc_rack_min_rtt = 1;
3819 					}
3820 				}
3821                                 /*
3822 				 * Note the following calls to
3823 				 * tcp_rack_xmit_timer() are being commented
3824 				 * out for now. They give us no more accuracy
3825 				 * and often lead to a wrong choice. We have
3826 				 * enough samples that have not been
3827 				 * retransmitted. I leave the commented out
3828 				 * code in here in case in the future we
3829 				 * decide to add it back (though I can't forsee
3830 				 * doing that). That way we will easily see
3831 				 * where they need to be placed.
3832 				 */
3833 				if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time,
3834 				    rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) {
3835 					/* New more recent rack_tmit_time */
3836 					rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
3837 					rack->rc_rack_rtt = t;
3838 				}
3839 				return (1);
3840 			}
3841 		}
3842 		goto ts_not_found;
3843 	} else {
3844 		/*
3845 		 * Ok its a SACK block that we retransmitted. or a windows
3846 		 * machine without timestamps. We can tell nothing from the
3847 		 * time-stamp since its not there or the time the peer last
3848 		 * recieved a segment that moved forward its cum-ack point.
3849 		 */
3850 ts_not_found:
3851 		i = rsm->r_rtr_cnt - 1;
3852 		t = cts - rsm->r_tim_lastsent[i];
3853 		if ((int)t <= 0)
3854 			t = 1;
3855 		if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
3856 			/*
3857 			 * We retransmitted and the ack came back in less
3858 			 * than the smallest rtt we have observed. We most
3859 			 * likey did an improper retransmit as outlined in
3860 			 * 4.2 Step 3 point 2 in the rack-draft.
3861 			 */
3862 			i = rsm->r_rtr_cnt - 2;
3863 			t = cts - rsm->r_tim_lastsent[i];
3864 			rack_earlier_retran(tp, rsm, t, cts);
3865 		} else if (rack->r_ctl.rc_rack_min_rtt) {
3866 			/*
3867 			 * We retransmitted it and the retransmit did the
3868 			 * job.
3869 			 */
3870 			if (!rack->r_ctl.rc_rack_min_rtt ||
3871 			    SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
3872 				rack->r_ctl.rc_rack_min_rtt = t;
3873 				if (rack->r_ctl.rc_rack_min_rtt == 0) {
3874 					rack->r_ctl.rc_rack_min_rtt = 1;
3875 				}
3876 			}
3877 			if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) {
3878 				/* New more recent rack_tmit_time */
3879 				rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i];
3880 				rack->rc_rack_rtt = t;
3881 			}
3882 			return (1);
3883 		}
3884 	}
3885 	return (0);
3886 }
3887 
3888 /*
3889  * Mark the SACK_PASSED flag on all entries prior to rsm send wise.
3890  */
3891 static void
3892 rack_log_sack_passed(struct tcpcb *tp,
3893     struct tcp_rack *rack, struct rack_sendmap *rsm)
3894 {
3895 	struct rack_sendmap *nrsm;
3896 	uint32_t ts;
3897 	int32_t idx;
3898 
3899 	idx = rsm->r_rtr_cnt - 1;
3900 	ts = rsm->r_tim_lastsent[idx];
3901 	nrsm = rsm;
3902 	TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap,
3903 	    rack_head, r_tnext) {
3904 		if (nrsm == rsm) {
3905 			/* Skip orginal segment he is acked */
3906 			continue;
3907 		}
3908 		if (nrsm->r_flags & RACK_ACKED) {
3909 			/* Skip ack'd segments */
3910 			continue;
3911 		}
3912 		idx = nrsm->r_rtr_cnt - 1;
3913 		if (ts == nrsm->r_tim_lastsent[idx]) {
3914 			/*
3915 			 * For this case lets use seq no, if we sent in a
3916 			 * big block (TSO) we would have a bunch of segments
3917 			 * sent at the same time.
3918 			 *
3919 			 * We would only get a report if its SEQ is earlier.
3920 			 * If we have done multiple retransmits the times
3921 			 * would not be equal.
3922 			 */
3923 			if (SEQ_LT(nrsm->r_start, rsm->r_start)) {
3924 				nrsm->r_flags |= RACK_SACK_PASSED;
3925 				nrsm->r_flags &= ~RACK_WAS_SACKPASS;
3926 			}
3927 		} else {
3928 			/*
3929 			 * Here they were sent at different times, not a big
3930 			 * block. Since we transmitted this one later and
3931 			 * see it sack'd then this must also be missing (or
3932 			 * we would have gotten a sack block for it)
3933 			 */
3934 			nrsm->r_flags |= RACK_SACK_PASSED;
3935 			nrsm->r_flags &= ~RACK_WAS_SACKPASS;
3936 		}
3937 	}
3938 }
3939 
3940 static uint32_t
3941 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack,
3942     struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts)
3943 {
3944 	int32_t idx;
3945 	int32_t times = 0;
3946 	uint32_t start, end, changed = 0;
3947 	struct rack_sendmap *rsm, *nrsm;
3948 	int32_t used_ref = 1;
3949 
3950 	start = sack->start;
3951 	end = sack->end;
3952 	rsm = *prsm;
3953 	if (rsm && SEQ_LT(start, rsm->r_start)) {
3954 		TAILQ_FOREACH_REVERSE_FROM(rsm, &rack->r_ctl.rc_map, rack_head, r_next) {
3955 			if (SEQ_GEQ(start, rsm->r_start) &&
3956 			    SEQ_LT(start, rsm->r_end)) {
3957 				goto do_rest_ofb;
3958 			}
3959 		}
3960 	}
3961 	if (rsm == NULL) {
3962 start_at_beginning:
3963 		rsm = NULL;
3964 		used_ref = 0;
3965 	}
3966 	/* First lets locate the block where this guy is */
3967 	TAILQ_FOREACH_FROM(rsm, &rack->r_ctl.rc_map, r_next) {
3968 		if (SEQ_GEQ(start, rsm->r_start) &&
3969 		    SEQ_LT(start, rsm->r_end)) {
3970 			break;
3971 		}
3972 	}
3973 do_rest_ofb:
3974 	if (rsm == NULL) {
3975 		/*
3976 		 * This happens when we get duplicate sack blocks with the
3977 		 * same end. For example SACK 4: 100 SACK 3: 100 The sort
3978 		 * will not change there location so we would just start at
3979 		 * the end of the first one and get lost.
3980 		 */
3981 		if (tp->t_flags & TF_SENTFIN) {
3982 			/*
3983 			 * Check to see if we have not logged the FIN that
3984 			 * went out.
3985 			 */
3986 			nrsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next);
3987 			if (nrsm && (nrsm->r_end + 1) == tp->snd_max) {
3988 				/*
3989 				 * Ok we did not get the FIN logged.
3990 				 */
3991 				nrsm->r_end++;
3992 				rsm = nrsm;
3993 				goto do_rest_ofb;
3994 			}
3995 		}
3996 		if (times == 1) {
3997 #ifdef INVARIANTS
3998 			panic("tp:%p rack:%p sack:%p to:%p prsm:%p",
3999 			    tp, rack, sack, to, prsm);
4000 #else
4001 			goto out;
4002 #endif
4003 		}
4004 		times++;
4005 		counter_u64_add(rack_sack_proc_restart, 1);
4006 		goto start_at_beginning;
4007 	}
4008 	/* Ok we have an ACK for some piece of rsm */
4009 	if (rsm->r_start != start) {
4010 		/*
4011 		 * Need to split this in two pieces the before and after.
4012 		 */
4013 		nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
4014 		if (nrsm == NULL) {
4015 			/*
4016 			 * failed XXXrrs what can we do but loose the sack
4017 			 * info?
4018 			 */
4019 			goto out;
4020 		}
4021 		nrsm->r_start = start;
4022 		nrsm->r_rtr_bytes = 0;
4023 		nrsm->r_end = rsm->r_end;
4024 		nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
4025 		nrsm->r_flags = rsm->r_flags;
4026 		nrsm->r_sndcnt = rsm->r_sndcnt;
4027 		for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
4028 			nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
4029 		}
4030 		rsm->r_end = nrsm->r_start;
4031 		TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
4032 		if (rsm->r_in_tmap) {
4033 			TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
4034 			nrsm->r_in_tmap = 1;
4035 		}
4036 		rsm->r_flags &= (~RACK_HAS_FIN);
4037 		rsm = nrsm;
4038 	}
4039 	if (SEQ_GEQ(end, rsm->r_end)) {
4040 		/*
4041 		 * The end of this block is either beyond this guy or right
4042 		 * at this guy.
4043 		 */
4044 
4045 		if ((rsm->r_flags & RACK_ACKED) == 0) {
4046 			rack_update_rtt(tp, rack, rsm, to, cts, SACKED);
4047 			changed += (rsm->r_end - rsm->r_start);
4048 			rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
4049 			rack_log_sack_passed(tp, rack, rsm);
4050 			/* Is Reordering occuring? */
4051 			if (rsm->r_flags & RACK_SACK_PASSED) {
4052 				counter_u64_add(rack_reorder_seen, 1);
4053 				rack->r_ctl.rc_reorder_ts = cts;
4054 			}
4055 			rsm->r_flags |= RACK_ACKED;
4056 			rsm->r_flags &= ~RACK_TLP;
4057 			if (rsm->r_in_tmap) {
4058 				TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4059 				rsm->r_in_tmap = 0;
4060 			}
4061 		}
4062 		if (end == rsm->r_end) {
4063 			/* This block only - done */
4064 			goto out;
4065 		}
4066 		/* There is more not coverend by this rsm move on */
4067 		start = rsm->r_end;
4068 		nrsm = TAILQ_NEXT(rsm, r_next);
4069 		rsm = nrsm;
4070 		times = 0;
4071 		goto do_rest_ofb;
4072 	}
4073 	/* Ok we need to split off this one at the tail */
4074 	nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
4075 	if (nrsm == NULL) {
4076 		/* failed rrs what can we do but loose the sack info? */
4077 		goto out;
4078 	}
4079 	/* Clone it */
4080 	nrsm->r_start = end;
4081 	nrsm->r_end = rsm->r_end;
4082 	nrsm->r_rtr_bytes = 0;
4083 	nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
4084 	nrsm->r_flags = rsm->r_flags;
4085 	nrsm->r_sndcnt = rsm->r_sndcnt;
4086 	for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
4087 		nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
4088 	}
4089 	/* The sack block does not cover this guy fully */
4090 	rsm->r_flags &= (~RACK_HAS_FIN);
4091 	rsm->r_end = end;
4092 	TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next);
4093 	if (rsm->r_in_tmap) {
4094 		TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
4095 		nrsm->r_in_tmap = 1;
4096 	}
4097 	if (rsm->r_flags & RACK_ACKED) {
4098 		/* Been here done that */
4099 		goto out;
4100 	}
4101 	rack_update_rtt(tp, rack, rsm, to, cts, SACKED);
4102 	changed += (rsm->r_end - rsm->r_start);
4103 	rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
4104 	rack_log_sack_passed(tp, rack, rsm);
4105 	/* Is Reordering occuring? */
4106 	if (rsm->r_flags & RACK_SACK_PASSED) {
4107 		counter_u64_add(rack_reorder_seen, 1);
4108 		rack->r_ctl.rc_reorder_ts = cts;
4109 	}
4110 	rsm->r_flags |= RACK_ACKED;
4111 	rsm->r_flags &= ~RACK_TLP;
4112 	if (rsm->r_in_tmap) {
4113 		TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4114 		rsm->r_in_tmap = 0;
4115 	}
4116 out:
4117 	if (used_ref == 0) {
4118 		counter_u64_add(rack_sack_proc_all, 1);
4119 	} else {
4120 		counter_u64_add(rack_sack_proc_short, 1);
4121 	}
4122 	/* Save off where we last were */
4123 	if (rsm)
4124 		rack->r_ctl.rc_sacklast = TAILQ_NEXT(rsm, r_next);
4125 	else
4126 		rack->r_ctl.rc_sacklast = NULL;
4127 	*prsm = rsm;
4128 	return (changed);
4129 }
4130 
4131 static void inline
4132 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack)
4133 {
4134 	struct rack_sendmap *tmap;
4135 
4136 	tmap = NULL;
4137 	while (rsm && (rsm->r_flags & RACK_ACKED)) {
4138 		/* Its no longer sacked, mark it so */
4139 		rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
4140 #ifdef INVARIANTS
4141 		if (rsm->r_in_tmap) {
4142 			panic("rack:%p rsm:%p flags:0x%x in tmap?",
4143 			      rack, rsm, rsm->r_flags);
4144 		}
4145 #endif
4146 		rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS);
4147 		/* Rebuild it into our tmap */
4148 		if (tmap == NULL) {
4149 			TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4150 			tmap = rsm;
4151 		} else {
4152 			TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext);
4153 			tmap = rsm;
4154 		}
4155 		tmap->r_in_tmap = 1;
4156 		rsm = TAILQ_NEXT(rsm, r_next);
4157 	}
4158 	/*
4159 	 * Now lets possibly clear the sack filter so we start
4160 	 * recognizing sacks that cover this area.
4161 	 */
4162 	if (rack_use_sack_filter)
4163 		sack_filter_clear(&rack->r_ctl.rack_sf, th_ack);
4164 
4165 }
4166 
4167 static void
4168 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th)
4169 {
4170 	uint32_t changed, last_seq, entered_recovery = 0;
4171 	struct tcp_rack *rack;
4172 	struct rack_sendmap *rsm;
4173 	struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1];
4174 	register uint32_t th_ack;
4175 	int32_t i, j, k, num_sack_blks = 0;
4176 	uint32_t cts, acked, ack_point, sack_changed = 0;
4177 
4178 	INP_WLOCK_ASSERT(tp->t_inpcb);
4179 	if (th->th_flags & TH_RST) {
4180 		/* We don't log resets */
4181 		return;
4182 	}
4183 	rack = (struct tcp_rack *)tp->t_fb_ptr;
4184 	cts = tcp_ts_getticks();
4185 	rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
4186 	changed = 0;
4187 	th_ack = th->th_ack;
4188 
4189 	if (SEQ_GT(th_ack, tp->snd_una)) {
4190 		rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__);
4191 		tp->t_acktime = ticks;
4192 	}
4193 	if (rsm && SEQ_GT(th_ack, rsm->r_start))
4194 		changed = th_ack - rsm->r_start;
4195 	if (changed) {
4196 		/*
4197 		 * The ACK point is advancing to th_ack, we must drop off
4198 		 * the packets in the rack log and calculate any eligble
4199 		 * RTT's.
4200 		 */
4201 		rack->r_wanted_output++;
4202 more:
4203 		rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
4204 		if (rsm == NULL) {
4205 			if ((th_ack - 1) == tp->iss) {
4206 				/*
4207 				 * For the SYN incoming case we will not
4208 				 * have called tcp_output for the sending of
4209 				 * the SYN, so there will be no map. All
4210 				 * other cases should probably be a panic.
4211 				 */
4212 				goto proc_sack;
4213 			}
4214 			if (tp->t_flags & TF_SENTFIN) {
4215 				/* if we send a FIN we will not hav a map */
4216 				goto proc_sack;
4217 			}
4218 #ifdef INVARIANTS
4219 			panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n",
4220 			    tp,
4221 			    th, tp->t_state, rack,
4222 			    tp->snd_una, tp->snd_max, tp->snd_nxt, changed);
4223 #endif
4224 			goto proc_sack;
4225 		}
4226 		if (SEQ_LT(th_ack, rsm->r_start)) {
4227 			/* Huh map is missing this */
4228 #ifdef INVARIANTS
4229 			printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n",
4230 			    rsm->r_start,
4231 			    th_ack, tp->t_state, rack->r_state);
4232 #endif
4233 			goto proc_sack;
4234 		}
4235 		rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED);
4236 		/* Now do we consume the whole thing? */
4237 		if (SEQ_GEQ(th_ack, rsm->r_end)) {
4238 			/* Its all consumed. */
4239 			uint32_t left;
4240 
4241 			rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
4242 			rsm->r_rtr_bytes = 0;
4243 			TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next);
4244 			if (rsm->r_in_tmap) {
4245 				TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
4246 				rsm->r_in_tmap = 0;
4247 			}
4248 			if (rack->r_ctl.rc_next == rsm) {
4249 				/* scoot along the marker */
4250 				rack->r_ctl.rc_next = TAILQ_FIRST(&rack->r_ctl.rc_map);
4251 			}
4252 			if (rsm->r_flags & RACK_ACKED) {
4253 				/*
4254 				 * It was acked on the scoreboard -- remove
4255 				 * it from total
4256 				 */
4257 				rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
4258 			} else if (rsm->r_flags & RACK_SACK_PASSED) {
4259 				/*
4260 				 * There are acked segments ACKED on the
4261 				 * scoreboard further up. We are seeing
4262 				 * reordering.
4263 				 */
4264 				counter_u64_add(rack_reorder_seen, 1);
4265 				rsm->r_flags |= RACK_ACKED;
4266 				rack->r_ctl.rc_reorder_ts = cts;
4267 			}
4268 			left = th_ack - rsm->r_end;
4269 			if (rsm->r_rtr_cnt > 1) {
4270 				/*
4271 				 * Technically we should make r_rtr_cnt be
4272 				 * monotonicly increasing and just mod it to
4273 				 * the timestamp it is replacing.. that way
4274 				 * we would have the last 3 retransmits. Now
4275 				 * rc_loss_count will be wrong if we
4276 				 * retransmit something more than 2 times in
4277 				 * recovery :(
4278 				 */
4279 				rack->r_ctl.rc_loss_count += (rsm->r_rtr_cnt - 1);
4280 			}
4281 			/* Free back to zone */
4282 			rack_free(rack, rsm);
4283 			if (left) {
4284 				goto more;
4285 			}
4286 			goto proc_sack;
4287 		}
4288 		if (rsm->r_flags & RACK_ACKED) {
4289 			/*
4290 			 * It was acked on the scoreboard -- remove it from
4291 			 * total for the part being cum-acked.
4292 			 */
4293 			rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start);
4294 		}
4295 		rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
4296 		rsm->r_rtr_bytes = 0;
4297 		rsm->r_start = th_ack;
4298 	}
4299 proc_sack:
4300 	/* Check for reneging */
4301 	rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
4302 	if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) {
4303 		/*
4304 		 * The peer has moved snd_una up to
4305 		 * the edge of this send, i.e. one
4306 		 * that it had previously acked. The only
4307 		 * way that can be true if the peer threw
4308 		 * away data (space issues) that it had
4309 		 * previously sacked (else it would have
4310 		 * given us snd_una up to (rsm->r_end).
4311 		 * We need to undo the acked markings here.
4312 		 *
4313 		 * Note we have to look to make sure th_ack is
4314 		 * our rsm->r_start in case we get an old ack
4315 		 * where th_ack is behind snd_una.
4316 		 */
4317 		rack_peer_reneges(rack, rsm, th->th_ack);
4318 	}
4319 	if ((to->to_flags & TOF_SACK) == 0) {
4320 		/* We are done nothing left to log */
4321 		goto out;
4322 	}
4323 	rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next);
4324 	if (rsm) {
4325 		last_seq = rsm->r_end;
4326 	} else {
4327 		last_seq = tp->snd_max;
4328 	}
4329 	/* Sack block processing */
4330 	if (SEQ_GT(th_ack, tp->snd_una))
4331 		ack_point = th_ack;
4332 	else
4333 		ack_point = tp->snd_una;
4334 	for (i = 0; i < to->to_nsacks; i++) {
4335 		bcopy((to->to_sacks + i * TCPOLEN_SACK),
4336 		    &sack, sizeof(sack));
4337 		sack.start = ntohl(sack.start);
4338 		sack.end = ntohl(sack.end);
4339 		if (SEQ_GT(sack.end, sack.start) &&
4340 		    SEQ_GT(sack.start, ack_point) &&
4341 		    SEQ_LT(sack.start, tp->snd_max) &&
4342 		    SEQ_GT(sack.end, ack_point) &&
4343 		    SEQ_LEQ(sack.end, tp->snd_max)) {
4344 			if ((rack->r_ctl.rc_num_maps_alloced > rack_sack_block_limit) &&
4345 			    (SEQ_LT(sack.end, last_seq)) &&
4346 			    ((sack.end - sack.start) < (tp->t_maxseg / 8))) {
4347 				/*
4348 				 * Not the last piece and its smaller than
4349 				 * 1/8th of a MSS. We ignore this.
4350 				 */
4351 				counter_u64_add(rack_runt_sacks, 1);
4352 				continue;
4353 			}
4354 			sack_blocks[num_sack_blks] = sack;
4355 			num_sack_blks++;
4356 #ifdef NETFLIX_STATS
4357 		} else if (SEQ_LEQ(sack.start, th_ack) &&
4358 			   SEQ_LEQ(sack.end, th_ack)) {
4359 			/*
4360 			 * Its a D-SACK block.
4361 			 */
4362 			tcp_record_dsack(sack.start, sack.end);
4363 #endif
4364 		}
4365 
4366 	}
4367 	if (num_sack_blks == 0)
4368 		goto out;
4369 	/*
4370 	 * Sort the SACK blocks so we can update the rack scoreboard with
4371 	 * just one pass.
4372 	 */
4373 	if (rack_use_sack_filter) {
4374 		num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, num_sack_blks, th->th_ack);
4375 	}
4376 	if (num_sack_blks < 2) {
4377 		goto do_sack_work;
4378 	}
4379 	/* Sort the sacks */
4380 	for (i = 0; i < num_sack_blks; i++) {
4381 		for (j = i + 1; j < num_sack_blks; j++) {
4382 			if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
4383 				sack = sack_blocks[i];
4384 				sack_blocks[i] = sack_blocks[j];
4385 				sack_blocks[j] = sack;
4386 			}
4387 		}
4388 	}
4389 	/*
4390 	 * Now are any of the sack block ends the same (yes some
4391 	 * implememtations send these)?
4392 	 */
4393 again:
4394 	if (num_sack_blks > 1) {
4395 		for (i = 0; i < num_sack_blks; i++) {
4396 			for (j = i + 1; j < num_sack_blks; j++) {
4397 				if (sack_blocks[i].end == sack_blocks[j].end) {
4398 					/*
4399 					 * Ok these two have the same end we
4400 					 * want the smallest end and then
4401 					 * throw away the larger and start
4402 					 * again.
4403 					 */
4404 					if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) {
4405 						/*
4406 						 * The second block covers
4407 						 * more area use that
4408 						 */
4409 						sack_blocks[i].start = sack_blocks[j].start;
4410 					}
4411 					/*
4412 					 * Now collapse out the dup-sack and
4413 					 * lower the count
4414 					 */
4415 					for (k = (j + 1); k < num_sack_blks; k++) {
4416 						sack_blocks[j].start = sack_blocks[k].start;
4417 						sack_blocks[j].end = sack_blocks[k].end;
4418 						j++;
4419 					}
4420 					num_sack_blks--;
4421 					goto again;
4422 				}
4423 			}
4424 		}
4425 	}
4426 do_sack_work:
4427 	rsm = rack->r_ctl.rc_sacklast;
4428 	for (i = 0; i < num_sack_blks; i++) {
4429 		acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts);
4430 		if (acked) {
4431 			rack->r_wanted_output++;
4432 			changed += acked;
4433 			sack_changed += acked;
4434 		}
4435 	}
4436 out:
4437 	if (changed) {
4438 		/* Something changed cancel the rack timer */
4439 		rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4440 	}
4441 	if ((sack_changed) && (!IN_RECOVERY(tp->t_flags))) {
4442 		/*
4443 		 * Ok we have a high probability that we need to go in to
4444 		 * recovery since we have data sack'd
4445 		 */
4446 		struct rack_sendmap *rsm;
4447 		uint32_t tsused;
4448 
4449 		tsused = tcp_ts_getticks();
4450 		rsm = tcp_rack_output(tp, rack, tsused);
4451 		if (rsm) {
4452 			/* Enter recovery */
4453 			rack->r_ctl.rc_rsm_start = rsm->r_start;
4454 			rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
4455 			rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
4456 			entered_recovery = 1;
4457 			rack_cong_signal(tp, NULL, CC_NDUPACK);
4458 			/*
4459 			 * When we enter recovery we need to assure we send
4460 			 * one packet.
4461 			 */
4462 			rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
4463 			rack->r_timer_override = 1;
4464 		}
4465 	}
4466 	if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) {
4467 		/* Deal with changed an PRR here (in recovery only) */
4468 		uint32_t pipe, snd_una;
4469 
4470 		rack->r_ctl.rc_prr_delivered += changed;
4471 		/* Compute prr_sndcnt */
4472 		if (SEQ_GT(tp->snd_una, th_ack)) {
4473 			snd_una = tp->snd_una;
4474 		} else {
4475 			snd_una = th_ack;
4476 		}
4477 		pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt;
4478 		if (pipe > tp->snd_ssthresh) {
4479 			long sndcnt;
4480 
4481 			sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh;
4482 			if (rack->r_ctl.rc_prr_recovery_fs > 0)
4483 				sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs;
4484 			else {
4485 				rack->r_ctl.rc_prr_sndcnt = 0;
4486 				sndcnt = 0;
4487 			}
4488 			sndcnt++;
4489 			if (sndcnt > (long)rack->r_ctl.rc_prr_out)
4490 				sndcnt -= rack->r_ctl.rc_prr_out;
4491 			else
4492 				sndcnt = 0;
4493 			rack->r_ctl.rc_prr_sndcnt = sndcnt;
4494 		} else {
4495 			uint32_t limit;
4496 
4497 			if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out)
4498 				limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out);
4499 			else
4500 				limit = 0;
4501 			if (changed > limit)
4502 				limit = changed;
4503 			limit += tp->t_maxseg;
4504 			if (tp->snd_ssthresh > pipe) {
4505 				rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit);
4506 			} else {
4507 				rack->r_ctl.rc_prr_sndcnt = min(0, limit);
4508 			}
4509 		}
4510 		if (rack->r_ctl.rc_prr_sndcnt >= tp->t_maxseg) {
4511 			rack->r_timer_override = 1;
4512 		}
4513 	}
4514 }
4515 
4516 /*
4517  * Return value of 1, we do not need to call rack_process_data().
4518  * return value of 0, rack_process_data can be called.
4519  * For ret_val if its 0 the TCP is locked, if its non-zero
4520  * its unlocked and probably unsafe to touch the TCB.
4521  */
4522 static int
4523 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
4524     struct tcpcb *tp, struct tcpopt *to,
4525     uint32_t tiwin, int32_t tlen,
4526     int32_t * ofia, int32_t thflags, int32_t * ret_val)
4527 {
4528 	int32_t ourfinisacked = 0;
4529 	int32_t nsegs, acked_amount;
4530 	int32_t acked;
4531 	struct mbuf *mfree;
4532 	struct tcp_rack *rack;
4533 	int32_t recovery = 0;
4534 
4535 	rack = (struct tcp_rack *)tp->t_fb_ptr;
4536 	if (SEQ_GT(th->th_ack, tp->snd_max)) {
4537 		rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
4538 		return (1);
4539 	}
4540 	if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) {
4541 		rack_log_ack(tp, to, th);
4542 	}
4543 	if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
4544 		/*
4545 		 * Old ack, behind (or duplicate to) the last one rcv'd
4546 		 * Note: Should mark reordering is occuring! We should also
4547 		 * look for sack blocks arriving e.g. ack 1, 4-4 then ack 1,
4548 		 * 3-3, 4-4 would be reording. As well as ack 1, 3-3 <no
4549 		 * retran and> ack 3
4550 		 */
4551 		return (0);
4552 	}
4553 	/*
4554 	 * If we reach this point, ACK is not a duplicate, i.e., it ACKs
4555 	 * something we sent.
4556 	 */
4557 	if (tp->t_flags & TF_NEEDSYN) {
4558 		/*
4559 		 * T/TCP: Connection was half-synchronized, and our SYN has
4560 		 * been ACK'd (so connection is now fully synchronized).  Go
4561 		 * to non-starred state, increment snd_una for ACK of SYN,
4562 		 * and check if we can do window scaling.
4563 		 */
4564 		tp->t_flags &= ~TF_NEEDSYN;
4565 		tp->snd_una++;
4566 		/* Do window scaling? */
4567 		if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
4568 		    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
4569 			tp->rcv_scale = tp->request_r_scale;
4570 			/* Send window already scaled. */
4571 		}
4572 	}
4573 	nsegs = max(1, m->m_pkthdr.lro_nsegs);
4574 	INP_WLOCK_ASSERT(tp->t_inpcb);
4575 
4576 	acked = BYTES_THIS_ACK(tp, th);
4577 	TCPSTAT_ADD(tcps_rcvackpack, nsegs);
4578 	TCPSTAT_ADD(tcps_rcvackbyte, acked);
4579 
4580 	/*
4581 	 * If we just performed our first retransmit, and the ACK arrives
4582 	 * within our recovery window, then it was a mistake to do the
4583 	 * retransmit in the first place.  Recover our original cwnd and
4584 	 * ssthresh, and proceed to transmit where we left off.
4585 	 */
4586 	if (tp->t_flags & TF_PREVVALID) {
4587 		tp->t_flags &= ~TF_PREVVALID;
4588 		if (tp->t_rxtshift == 1 &&
4589 		    (int)(ticks - tp->t_badrxtwin) < 0)
4590 			rack_cong_signal(tp, th, CC_RTO_ERR);
4591 	}
4592 	/*
4593 	 * If we have a timestamp reply, update smoothed round trip time. If
4594 	 * no timestamp is present but transmit timer is running and timed
4595 	 * sequence number was acked, update smoothed round trip time. Since
4596 	 * we now have an rtt measurement, cancel the timer backoff (cf.,
4597 	 * Phil Karn's retransmit alg.). Recompute the initial retransmit
4598 	 * timer.
4599 	 *
4600 	 * Some boxes send broken timestamp replies during the SYN+ACK
4601 	 * phase, ignore timestamps of 0 or we could calculate a huge RTT
4602 	 * and blow up the retransmit timer.
4603 	 */
4604 	/*
4605 	 * If all outstanding data is acked, stop retransmit timer and
4606 	 * remember to restart (more output or persist). If there is more
4607 	 * data to be acked, restart retransmit timer, using current
4608 	 * (possibly backed-off) value.
4609 	 */
4610 	if (th->th_ack == tp->snd_max) {
4611 		rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4612 		rack->r_wanted_output++;
4613 	}
4614 	/*
4615 	 * If no data (only SYN) was ACK'd, skip rest of ACK processing.
4616 	 */
4617 	if (acked == 0) {
4618 		if (ofia)
4619 			*ofia = ourfinisacked;
4620 		return (0);
4621 	}
4622 	if (rack->r_ctl.rc_early_recovery) {
4623 		if (IN_FASTRECOVERY(tp->t_flags)) {
4624 			if (SEQ_LT(th->th_ack, tp->snd_recover)) {
4625 				tcp_rack_partialack(tp, th);
4626 			} else {
4627 				rack_post_recovery(tp, th);
4628 				recovery = 1;
4629 			}
4630 		}
4631 	}
4632 	/*
4633 	 * Let the congestion control algorithm update congestion control
4634 	 * related information. This typically means increasing the
4635 	 * congestion window.
4636 	 */
4637 	rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery);
4638 	SOCKBUF_LOCK(&so->so_snd);
4639 	acked_amount = min(acked, (int)sbavail(&so->so_snd));
4640 	tp->snd_wnd -= acked_amount;
4641 	mfree = sbcut_locked(&so->so_snd, acked_amount);
4642 	if ((sbused(&so->so_snd) == 0) &&
4643 	    (acked > acked_amount) &&
4644 	    (tp->t_state >= TCPS_FIN_WAIT_1)) {
4645 		ourfinisacked = 1;
4646 	}
4647 	/* NB: sowwakeup_locked() does an implicit unlock. */
4648 	sowwakeup_locked(so);
4649 	m_freem(mfree);
4650 	if (rack->r_ctl.rc_early_recovery == 0) {
4651 		if (IN_FASTRECOVERY(tp->t_flags)) {
4652 			if (SEQ_LT(th->th_ack, tp->snd_recover)) {
4653 				tcp_rack_partialack(tp, th);
4654 			} else {
4655 				rack_post_recovery(tp, th);
4656 			}
4657 		}
4658 	}
4659 	tp->snd_una = th->th_ack;
4660 	if (SEQ_GT(tp->snd_una, tp->snd_recover))
4661 		tp->snd_recover = tp->snd_una;
4662 
4663 	if (SEQ_LT(tp->snd_nxt, tp->snd_una)) {
4664 		tp->snd_nxt = tp->snd_una;
4665 	}
4666 	if (tp->snd_una == tp->snd_max) {
4667 		/* Nothing left outstanding */
4668 		rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
4669 		tp->t_acktime = 0;
4670 		rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4671 		/* Set need output so persist might get set */
4672 		rack->r_wanted_output++;
4673 		if (rack_use_sack_filter)
4674 			sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
4675 		if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
4676 		    (sbavail(&so->so_snd) == 0) &&
4677 		    (tp->t_flags2 & TF2_DROP_AF_DATA)) {
4678 			/*
4679 			 * The socket was gone and the
4680 			 * peer sent data, time to
4681 			 * reset him.
4682 			 */
4683 			*ret_val = 1;
4684 			tp = tcp_close(tp);
4685 			rack_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen);
4686 			return (1);
4687 		}
4688 	}
4689 	if (ofia)
4690 		*ofia = ourfinisacked;
4691 	return (0);
4692 }
4693 
4694 
4695 /*
4696  * Return value of 1, the TCB is unlocked and most
4697  * likely gone, return value of 0, the TCP is still
4698  * locked.
4699  */
4700 static int
4701 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
4702     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
4703     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
4704 {
4705 	/*
4706 	 * Update window information. Don't look at window if no ACK: TAC's
4707 	 * send garbage on first SYN.
4708 	 */
4709 	int32_t nsegs;
4710 	int32_t tfo_syn;
4711 	struct tcp_rack *rack;
4712 
4713 	rack = (struct tcp_rack *)tp->t_fb_ptr;
4714 	INP_WLOCK_ASSERT(tp->t_inpcb);
4715 	nsegs = max(1, m->m_pkthdr.lro_nsegs);
4716 	if ((thflags & TH_ACK) &&
4717 	    (SEQ_LT(tp->snd_wl1, th->th_seq) ||
4718 	    (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
4719 	    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
4720 		/* keep track of pure window updates */
4721 		if (tlen == 0 &&
4722 		    tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
4723 			TCPSTAT_INC(tcps_rcvwinupd);
4724 		tp->snd_wnd = tiwin;
4725 		tp->snd_wl1 = th->th_seq;
4726 		tp->snd_wl2 = th->th_ack;
4727 		if (tp->snd_wnd > tp->max_sndwnd)
4728 			tp->max_sndwnd = tp->snd_wnd;
4729 		rack->r_wanted_output++;
4730 	} else if (thflags & TH_ACK) {
4731 		if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) {
4732 			tp->snd_wnd = tiwin;
4733 			tp->snd_wl1 = th->th_seq;
4734 			tp->snd_wl2 = th->th_ack;
4735 		}
4736 	}
4737 	/* Was persist timer active and now we have window space? */
4738 	if ((rack->rc_in_persist != 0) && tp->snd_wnd) {
4739 		rack_exit_persist(tp, rack);
4740 		tp->snd_nxt = tp->snd_max;
4741 		/* Make sure we output to start the timer */
4742 		rack->r_wanted_output++;
4743 	}
4744 	if (tp->t_flags2 & TF2_DROP_AF_DATA) {
4745 		m_freem(m);
4746 		return (0);
4747 	}
4748 	/*
4749 	 * Process segments with URG.
4750 	 */
4751 	if ((thflags & TH_URG) && th->th_urp &&
4752 	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
4753 		/*
4754 		 * This is a kludge, but if we receive and accept random
4755 		 * urgent pointers, we'll crash in soreceive.  It's hard to
4756 		 * imagine someone actually wanting to send this much urgent
4757 		 * data.
4758 		 */
4759 		SOCKBUF_LOCK(&so->so_rcv);
4760 		if (th->th_urp + sbavail(&so->so_rcv) > sb_max) {
4761 			th->th_urp = 0;	/* XXX */
4762 			thflags &= ~TH_URG;	/* XXX */
4763 			SOCKBUF_UNLOCK(&so->so_rcv);	/* XXX */
4764 			goto dodata;	/* XXX */
4765 		}
4766 		/*
4767 		 * If this segment advances the known urgent pointer, then
4768 		 * mark the data stream.  This should not happen in
4769 		 * CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since a
4770 		 * FIN has been received from the remote side. In these
4771 		 * states we ignore the URG.
4772 		 *
4773 		 * According to RFC961 (Assigned Protocols), the urgent
4774 		 * pointer points to the last octet of urgent data.  We
4775 		 * continue, however, to consider it to indicate the first
4776 		 * octet of data past the urgent section as the original
4777 		 * spec states (in one of two places).
4778 		 */
4779 		if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) {
4780 			tp->rcv_up = th->th_seq + th->th_urp;
4781 			so->so_oobmark = sbavail(&so->so_rcv) +
4782 			    (tp->rcv_up - tp->rcv_nxt) - 1;
4783 			if (so->so_oobmark == 0)
4784 				so->so_rcv.sb_state |= SBS_RCVATMARK;
4785 			sohasoutofband(so);
4786 			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
4787 		}
4788 		SOCKBUF_UNLOCK(&so->so_rcv);
4789 		/*
4790 		 * Remove out of band data so doesn't get presented to user.
4791 		 * This can happen independent of advancing the URG pointer,
4792 		 * but if two URG's are pending at once, some out-of-band
4793 		 * data may creep in... ick.
4794 		 */
4795 		if (th->th_urp <= (uint32_t) tlen &&
4796 		    !(so->so_options & SO_OOBINLINE)) {
4797 			/* hdr drop is delayed */
4798 			tcp_pulloutofband(so, th, m, drop_hdrlen);
4799 		}
4800 	} else {
4801 		/*
4802 		 * If no out of band data is expected, pull receive urgent
4803 		 * pointer along with the receive window.
4804 		 */
4805 		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
4806 			tp->rcv_up = tp->rcv_nxt;
4807 	}
4808 dodata:				/* XXX */
4809 	INP_WLOCK_ASSERT(tp->t_inpcb);
4810 
4811 	/*
4812 	 * Process the segment text, merging it into the TCP sequencing
4813 	 * queue, and arranging for acknowledgment of receipt if necessary.
4814 	 * This process logically involves adjusting tp->rcv_wnd as data is
4815 	 * presented to the user (this happens in tcp_usrreq.c, case
4816 	 * PRU_RCVD).  If a FIN has already been received on this connection
4817 	 * then we just ignore the text.
4818 	 */
4819 	tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
4820 		   IS_FASTOPEN(tp->t_flags));
4821 	if ((tlen || (thflags & TH_FIN) || tfo_syn) &&
4822 	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
4823 		tcp_seq save_start = th->th_seq;
4824 		tcp_seq save_rnxt  = tp->rcv_nxt;
4825 		int     save_tlen  = tlen;
4826 
4827 		m_adj(m, drop_hdrlen);	/* delayed header drop */
4828 		/*
4829 		 * Insert segment which includes th into TCP reassembly
4830 		 * queue with control block tp.  Set thflags to whether
4831 		 * reassembly now includes a segment with FIN.  This handles
4832 		 * the common case inline (segment is the next to be
4833 		 * received on an established connection, and the queue is
4834 		 * empty), avoiding linkage into and removal from the queue
4835 		 * and repetition of various conversions. Set DELACK for
4836 		 * segments received in order, but ack immediately when
4837 		 * segments are out of order (so fast retransmit can work).
4838 		 */
4839 		if (th->th_seq == tp->rcv_nxt &&
4840 		    SEGQ_EMPTY(tp) &&
4841 		    (TCPS_HAVEESTABLISHED(tp->t_state) ||
4842 		    tfo_syn)) {
4843 			if (DELAY_ACK(tp, tlen) || tfo_syn) {
4844 				rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4845 				tp->t_flags |= TF_DELACK;
4846 			} else {
4847 				rack->r_wanted_output++;
4848 				tp->t_flags |= TF_ACKNOW;
4849 			}
4850 			tp->rcv_nxt += tlen;
4851 			thflags = th->th_flags & TH_FIN;
4852 			TCPSTAT_ADD(tcps_rcvpack, nsegs);
4853 			TCPSTAT_ADD(tcps_rcvbyte, tlen);
4854 			SOCKBUF_LOCK(&so->so_rcv);
4855 			if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
4856 				m_freem(m);
4857 			else
4858 				sbappendstream_locked(&so->so_rcv, m, 0);
4859 			/* NB: sorwakeup_locked() does an implicit unlock. */
4860 			sorwakeup_locked(so);
4861 		} else {
4862 			/*
4863 			 * XXX: Due to the header drop above "th" is
4864 			 * theoretically invalid by now.  Fortunately
4865 			 * m_adj() doesn't actually frees any mbufs when
4866 			 * trimming from the head.
4867 			 */
4868 			tcp_seq temp = save_start;
4869 			thflags = tcp_reass(tp, th, &temp, &tlen, m);
4870 			tp->t_flags |= TF_ACKNOW;
4871 		}
4872 		if (((tlen == 0) && (save_tlen > 0) &&
4873 		    (SEQ_LT(save_start, save_rnxt)))) {
4874 			/*
4875 			 * DSACK actually handled in the fastpath
4876 			 * above.
4877 			 */
4878 			tcp_update_sack_list(tp, save_start, save_start + save_tlen);
4879 		} else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
4880 			/*
4881 			 * Cleaning sackblks by using zero length
4882 			 * update.
4883 			 */
4884 			tcp_update_sack_list(tp, save_start, save_start);
4885 		} else if ((tlen > 0) && (tlen >= save_tlen)) {
4886 			/* Update of sackblks. */
4887 			tcp_update_sack_list(tp, save_start, save_start + save_tlen);
4888 		} else if (tlen > 0) {
4889 			tcp_update_sack_list(tp, save_start, save_start+tlen);
4890 		}
4891 	} else {
4892 		m_freem(m);
4893 		thflags &= ~TH_FIN;
4894 	}
4895 
4896 	/*
4897 	 * If FIN is received ACK the FIN and let the user know that the
4898 	 * connection is closing.
4899 	 */
4900 	if (thflags & TH_FIN) {
4901 		if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
4902 			socantrcvmore(so);
4903 			/*
4904 			 * If connection is half-synchronized (ie NEEDSYN
4905 			 * flag on) then delay ACK, so it may be piggybacked
4906 			 * when SYN is sent. Otherwise, since we received a
4907 			 * FIN then no more input can be expected, send ACK
4908 			 * now.
4909 			 */
4910 			if (tp->t_flags & TF_NEEDSYN) {
4911 				rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4912 				tp->t_flags |= TF_DELACK;
4913 			} else {
4914 				tp->t_flags |= TF_ACKNOW;
4915 			}
4916 			tp->rcv_nxt++;
4917 		}
4918 		switch (tp->t_state) {
4919 
4920 			/*
4921 			 * In SYN_RECEIVED and ESTABLISHED STATES enter the
4922 			 * CLOSE_WAIT state.
4923 			 */
4924 		case TCPS_SYN_RECEIVED:
4925 			tp->t_starttime = ticks;
4926 			/* FALLTHROUGH */
4927 		case TCPS_ESTABLISHED:
4928 			rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4929 			tcp_state_change(tp, TCPS_CLOSE_WAIT);
4930 			break;
4931 
4932 			/*
4933 			 * If still in FIN_WAIT_1 STATE FIN has not been
4934 			 * acked so enter the CLOSING state.
4935 			 */
4936 		case TCPS_FIN_WAIT_1:
4937 			rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4938 			tcp_state_change(tp, TCPS_CLOSING);
4939 			break;
4940 
4941 			/*
4942 			 * In FIN_WAIT_2 state enter the TIME_WAIT state,
4943 			 * starting the time-wait timer, turning off the
4944 			 * other standard timers.
4945 			 */
4946 		case TCPS_FIN_WAIT_2:
4947 			rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
4948 			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
4949 			tcp_twstart(tp);
4950 			return (1);
4951 		}
4952 	}
4953 	/*
4954 	 * Return any desired output.
4955 	 */
4956 	if ((tp->t_flags & TF_ACKNOW) || (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) {
4957 		rack->r_wanted_output++;
4958 	}
4959 	INP_WLOCK_ASSERT(tp->t_inpcb);
4960 	return (0);
4961 }
4962 
4963 /*
4964  * Here nothing is really faster, its just that we
4965  * have broken out the fast-data path also just like
4966  * the fast-ack.
4967  */
4968 static int
4969 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
4970     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
4971     uint32_t tiwin, int32_t nxt_pkt)
4972 {
4973 	int32_t nsegs;
4974 	int32_t newsize = 0;	/* automatic sockbuf scaling */
4975 	struct tcp_rack *rack;
4976 #ifdef TCPDEBUG
4977 	/*
4978 	 * The size of tcp_saveipgen must be the size of the max ip header,
4979 	 * now IPv6.
4980 	 */
4981 	u_char tcp_saveipgen[IP6_HDR_LEN];
4982 	struct tcphdr tcp_savetcp;
4983 	short ostate = 0;
4984 
4985 #endif
4986 	/*
4987 	 * If last ACK falls within this segment's sequence numbers, record
4988 	 * the timestamp. NOTE that the test is modified according to the
4989 	 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
4990 	 */
4991 	if (__predict_false(th->th_seq != tp->rcv_nxt)) {
4992 		return (0);
4993 	}
4994 	if (__predict_false(tp->snd_nxt != tp->snd_max)) {
4995 		return (0);
4996 	}
4997 	if (tiwin && tiwin != tp->snd_wnd) {
4998 		return (0);
4999 	}
5000 	if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) {
5001 		return (0);
5002 	}
5003 	if (__predict_false((to->to_flags & TOF_TS) &&
5004 	    (TSTMP_LT(to->to_tsval, tp->ts_recent)))) {
5005 		return (0);
5006 	}
5007 	if (__predict_false((th->th_ack != tp->snd_una))) {
5008 		return (0);
5009 	}
5010 	if (__predict_false(tlen > sbspace(&so->so_rcv))) {
5011 		return (0);
5012 	}
5013 	if ((to->to_flags & TOF_TS) != 0 &&
5014 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
5015 		tp->ts_recent_age = tcp_ts_getticks();
5016 		tp->ts_recent = to->to_tsval;
5017 	}
5018 	rack = (struct tcp_rack *)tp->t_fb_ptr;
5019 	/*
5020 	 * This is a pure, in-sequence data packet with nothing on the
5021 	 * reassembly queue and we have enough buffer space to take it.
5022 	 */
5023 	nsegs = max(1, m->m_pkthdr.lro_nsegs);
5024 
5025 
5026 	/* Clean receiver SACK report if present */
5027 	if (tp->rcv_numsacks)
5028 		tcp_clean_sackreport(tp);
5029 	TCPSTAT_INC(tcps_preddat);
5030 	tp->rcv_nxt += tlen;
5031 	/*
5032 	 * Pull snd_wl1 up to prevent seq wrap relative to th_seq.
5033 	 */
5034 	tp->snd_wl1 = th->th_seq;
5035 	/*
5036 	 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt.
5037 	 */
5038 	tp->rcv_up = tp->rcv_nxt;
5039 	TCPSTAT_ADD(tcps_rcvpack, nsegs);
5040 	TCPSTAT_ADD(tcps_rcvbyte, tlen);
5041 #ifdef TCPDEBUG
5042 	if (so->so_options & SO_DEBUG)
5043 		tcp_trace(TA_INPUT, ostate, tp,
5044 		    (void *)tcp_saveipgen, &tcp_savetcp, 0);
5045 #endif
5046 	newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
5047 
5048 	/* Add data to socket buffer. */
5049 	SOCKBUF_LOCK(&so->so_rcv);
5050 	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
5051 		m_freem(m);
5052 	} else {
5053 		/*
5054 		 * Set new socket buffer size. Give up when limit is
5055 		 * reached.
5056 		 */
5057 		if (newsize)
5058 			if (!sbreserve_locked(&so->so_rcv,
5059 			    newsize, so, NULL))
5060 				so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
5061 		m_adj(m, drop_hdrlen);	/* delayed header drop */
5062 		sbappendstream_locked(&so->so_rcv, m, 0);
5063 		rack_calc_rwin(so, tp);
5064 	}
5065 	/* NB: sorwakeup_locked() does an implicit unlock. */
5066 	sorwakeup_locked(so);
5067 	if (DELAY_ACK(tp, tlen)) {
5068 		rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
5069 		tp->t_flags |= TF_DELACK;
5070 	} else {
5071 		tp->t_flags |= TF_ACKNOW;
5072 		rack->r_wanted_output++;
5073 	}
5074 	if ((tp->snd_una == tp->snd_max) && rack_use_sack_filter)
5075 		sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
5076 	return (1);
5077 }
5078 
5079 /*
5080  * This subfunction is used to try to highly optimize the
5081  * fast path. We again allow window updates that are
5082  * in sequence to remain in the fast-path. We also add
5083  * in the __predict's to attempt to help the compiler.
5084  * Note that if we return a 0, then we can *not* process
5085  * it and the caller should push the packet into the
5086  * slow-path.
5087  */
5088 static int
5089 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
5090     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5091     uint32_t tiwin, int32_t nxt_pkt, uint32_t cts)
5092 {
5093 	int32_t acked;
5094 	int32_t nsegs;
5095 
5096 #ifdef TCPDEBUG
5097 	/*
5098 	 * The size of tcp_saveipgen must be the size of the max ip header,
5099 	 * now IPv6.
5100 	 */
5101 	u_char tcp_saveipgen[IP6_HDR_LEN];
5102 	struct tcphdr tcp_savetcp;
5103 	short ostate = 0;
5104 
5105 #endif
5106 	struct tcp_rack *rack;
5107 
5108 	if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
5109 		/* Old ack, behind (or duplicate to) the last one rcv'd */
5110 		return (0);
5111 	}
5112 	if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
5113 		/* Above what we have sent? */
5114 		return (0);
5115 	}
5116 	if (__predict_false(tp->snd_nxt != tp->snd_max)) {
5117 		/* We are retransmitting */
5118 		return (0);
5119 	}
5120 	if (__predict_false(tiwin == 0)) {
5121 		/* zero window */
5122 		return (0);
5123 	}
5124 	if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) {
5125 		/* We need a SYN or a FIN, unlikely.. */
5126 		return (0);
5127 	}
5128 	if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
5129 		/* Timestamp is behind .. old ack with seq wrap? */
5130 		return (0);
5131 	}
5132 	if (__predict_false(IN_RECOVERY(tp->t_flags))) {
5133 		/* Still recovering */
5134 		return (0);
5135 	}
5136 	rack = (struct tcp_rack *)tp->t_fb_ptr;
5137 	if (rack->r_ctl.rc_sacked) {
5138 		/* We have sack holes on our scoreboard */
5139 		return (0);
5140 	}
5141 	/* Ok if we reach here, we can process a fast-ack */
5142 	nsegs = max(1, m->m_pkthdr.lro_nsegs);
5143 	rack_log_ack(tp, to, th);
5144 	/* Did the window get updated? */
5145 	if (tiwin != tp->snd_wnd) {
5146 		tp->snd_wnd = tiwin;
5147 		tp->snd_wl1 = th->th_seq;
5148 		if (tp->snd_wnd > tp->max_sndwnd)
5149 			tp->max_sndwnd = tp->snd_wnd;
5150 	}
5151 	if ((rack->rc_in_persist != 0) && (tp->snd_wnd >= tp->t_maxseg)) {
5152 		rack_exit_persist(tp, rack);
5153 	}
5154 	/*
5155 	 * If last ACK falls within this segment's sequence numbers, record
5156 	 * the timestamp. NOTE that the test is modified according to the
5157 	 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
5158 	 */
5159 	if ((to->to_flags & TOF_TS) != 0 &&
5160 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
5161 		tp->ts_recent_age = tcp_ts_getticks();
5162 		tp->ts_recent = to->to_tsval;
5163 	}
5164 	/*
5165 	 * This is a pure ack for outstanding data.
5166 	 */
5167 	TCPSTAT_INC(tcps_predack);
5168 
5169 	/*
5170 	 * "bad retransmit" recovery.
5171 	 */
5172 	if (tp->t_flags & TF_PREVVALID) {
5173 		tp->t_flags &= ~TF_PREVVALID;
5174 		if (tp->t_rxtshift == 1 &&
5175 		    (int)(ticks - tp->t_badrxtwin) < 0)
5176 			rack_cong_signal(tp, th, CC_RTO_ERR);
5177 	}
5178 	/*
5179 	 * Recalculate the transmit timer / rtt.
5180 	 *
5181 	 * Some boxes send broken timestamp replies during the SYN+ACK
5182 	 * phase, ignore timestamps of 0 or we could calculate a huge RTT
5183 	 * and blow up the retransmit timer.
5184 	 */
5185 	acked = BYTES_THIS_ACK(tp, th);
5186 
5187 #ifdef TCP_HHOOK
5188 	/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
5189 	hhook_run_tcp_est_in(tp, th, to);
5190 #endif
5191 
5192 	TCPSTAT_ADD(tcps_rcvackpack, nsegs);
5193 	TCPSTAT_ADD(tcps_rcvackbyte, acked);
5194 	sbdrop(&so->so_snd, acked);
5195 	/*
5196 	 * Let the congestion control algorithm update congestion control
5197 	 * related information. This typically means increasing the
5198 	 * congestion window.
5199 	 */
5200 	rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0);
5201 
5202 	tp->snd_una = th->th_ack;
5203 	/*
5204 	 * Pull snd_wl2 up to prevent seq wrap relative to th_ack.
5205 	 */
5206 	tp->snd_wl2 = th->th_ack;
5207 	tp->t_dupacks = 0;
5208 	m_freem(m);
5209 	/* ND6_HINT(tp);	 *//* Some progress has been made. */
5210 
5211 	/*
5212 	 * If all outstanding data are acked, stop retransmit timer,
5213 	 * otherwise restart timer using current (possibly backed-off)
5214 	 * value. If process is waiting for space, wakeup/selwakeup/signal.
5215 	 * If data are ready to send, let tcp_output decide between more
5216 	 * output or persist.
5217 	 */
5218 #ifdef TCPDEBUG
5219 	if (so->so_options & SO_DEBUG)
5220 		tcp_trace(TA_INPUT, ostate, tp,
5221 		    (void *)tcp_saveipgen,
5222 		    &tcp_savetcp, 0);
5223 #endif
5224 	if (tp->snd_una == tp->snd_max) {
5225 		rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
5226 		tp->t_acktime = 0;
5227 		rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
5228 	}
5229 	/* Wake up the socket if we have room to write more */
5230 	sowwakeup(so);
5231 	if (sbavail(&so->so_snd)) {
5232 		rack->r_wanted_output++;
5233 	}
5234 	return (1);
5235 }
5236 
5237 /*
5238  * Return value of 1, the TCB is unlocked and most
5239  * likely gone, return value of 0, the TCP is still
5240  * locked.
5241  */
5242 static int
5243 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
5244     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5245     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5246 {
5247 	int32_t ret_val = 0;
5248 	int32_t todrop;
5249 	int32_t ourfinisacked = 0;
5250 
5251 	rack_calc_rwin(so, tp);
5252 	/*
5253 	 * If the state is SYN_SENT: if seg contains an ACK, but not for our
5254 	 * SYN, drop the input. if seg contains a RST, then drop the
5255 	 * connection. if seg does not contain SYN, then drop it. Otherwise
5256 	 * this is an acceptable SYN segment initialize tp->rcv_nxt and
5257 	 * tp->irs if seg contains ack then advance tp->snd_una if seg
5258 	 * contains an ECE and ECN support is enabled, the stream is ECN
5259 	 * capable. if SYN has been acked change to ESTABLISHED else
5260 	 * SYN_RCVD state arrange for segment to be acked (eventually)
5261 	 * continue processing rest of data/controls, beginning with URG
5262 	 */
5263 	if ((thflags & TH_ACK) &&
5264 	    (SEQ_LEQ(th->th_ack, tp->iss) ||
5265 	    SEQ_GT(th->th_ack, tp->snd_max))) {
5266 		rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5267 		return (1);
5268 	}
5269 	if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
5270 		TCP_PROBE5(connect__refused, NULL, tp,
5271 		    mtod(m, const char *), tp, th);
5272 		tp = tcp_drop(tp, ECONNREFUSED);
5273 		rack_do_drop(m, tp);
5274 		return (1);
5275 	}
5276 	if (thflags & TH_RST) {
5277 		rack_do_drop(m, tp);
5278 		return (1);
5279 	}
5280 	if (!(thflags & TH_SYN)) {
5281 		rack_do_drop(m, tp);
5282 		return (1);
5283 	}
5284 	tp->irs = th->th_seq;
5285 	tcp_rcvseqinit(tp);
5286 	if (thflags & TH_ACK) {
5287 		int tfo_partial = 0;
5288 
5289 		TCPSTAT_INC(tcps_connects);
5290 		soisconnected(so);
5291 #ifdef MAC
5292 		mac_socketpeer_set_from_mbuf(m, so);
5293 #endif
5294 		/* Do window scaling on this connection? */
5295 		if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
5296 		    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
5297 			tp->rcv_scale = tp->request_r_scale;
5298 		}
5299 		tp->rcv_adv += min(tp->rcv_wnd,
5300 		    TCP_MAXWIN << tp->rcv_scale);
5301 		/*
5302 		 * If not all the data that was sent in the TFO SYN
5303 		 * has been acked, resend the remainder right away.
5304 		 */
5305 		if (IS_FASTOPEN(tp->t_flags) &&
5306 		    (tp->snd_una != tp->snd_max)) {
5307 			tp->snd_nxt = th->th_ack;
5308 			tfo_partial = 1;
5309 		}
5310 		/*
5311 		 * If there's data, delay ACK; if there's also a FIN ACKNOW
5312 		 * will be turned on later.
5313 		 */
5314 		if (DELAY_ACK(tp, tlen) && tlen != 0 && (tfo_partial == 0)) {
5315 			rack_timer_cancel(tp, (struct tcp_rack *)tp->t_fb_ptr,
5316 					  ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rcvtime, __LINE__);
5317 			tp->t_flags |= TF_DELACK;
5318 		} else {
5319 			((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++;
5320 			tp->t_flags |= TF_ACKNOW;
5321 		}
5322 
5323 		if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) &&
5324 		    V_tcp_do_ecn) {
5325 			tp->t_flags |= TF_ECN_PERMIT;
5326 			TCPSTAT_INC(tcps_ecn_shs);
5327 		}
5328 		if (SEQ_GT(th->th_ack, tp->snd_una)) {
5329 			/*
5330 			 * We advance snd_una for the
5331 			 * fast open case. If th_ack is
5332 			 * acknowledging data beyond
5333 			 * snd_una we can't just call
5334 			 * ack-processing since the
5335 			 * data stream in our send-map
5336 			 * will start at snd_una + 1 (one
5337 			 * beyond the SYN). If its just
5338 			 * equal we don't need to do that
5339 			 * and there is no send_map.
5340 			 */
5341 			tp->snd_una++;
5342 		}
5343 		/*
5344 		 * Received <SYN,ACK> in SYN_SENT[*] state. Transitions:
5345 		 * SYN_SENT  --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1
5346 		 */
5347 		tp->t_starttime = ticks;
5348 		if (tp->t_flags & TF_NEEDFIN) {
5349 			tcp_state_change(tp, TCPS_FIN_WAIT_1);
5350 			tp->t_flags &= ~TF_NEEDFIN;
5351 			thflags &= ~TH_SYN;
5352 		} else {
5353 			tcp_state_change(tp, TCPS_ESTABLISHED);
5354 			TCP_PROBE5(connect__established, NULL, tp,
5355 			    mtod(m, const char *), tp, th);
5356 			cc_conn_init(tp);
5357 		}
5358 	} else {
5359 		/*
5360 		 * Received initial SYN in SYN-SENT[*] state => simultaneous
5361 		 * open.  If segment contains CC option and there is a
5362 		 * cached CC, apply TAO test. If it succeeds, connection is *
5363 		 * half-synchronized. Otherwise, do 3-way handshake:
5364 		 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If
5365 		 * there was no CC option, clear cached CC value.
5366 		 */
5367 		tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
5368 		tcp_state_change(tp, TCPS_SYN_RECEIVED);
5369 	}
5370 	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
5371 	INP_WLOCK_ASSERT(tp->t_inpcb);
5372 	/*
5373 	 * Advance th->th_seq to correspond to first data byte. If data,
5374 	 * trim to stay within window, dropping FIN if necessary.
5375 	 */
5376 	th->th_seq++;
5377 	if (tlen > tp->rcv_wnd) {
5378 		todrop = tlen - tp->rcv_wnd;
5379 		m_adj(m, -todrop);
5380 		tlen = tp->rcv_wnd;
5381 		thflags &= ~TH_FIN;
5382 		TCPSTAT_INC(tcps_rcvpackafterwin);
5383 		TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
5384 	}
5385 	tp->snd_wl1 = th->th_seq - 1;
5386 	tp->rcv_up = th->th_seq;
5387 	/*
5388 	 * Client side of transaction: already sent SYN and data. If the
5389 	 * remote host used T/TCP to validate the SYN, our data will be
5390 	 * ACK'd; if so, enter normal data segment processing in the middle
5391 	 * of step 5, ack processing. Otherwise, goto step 6.
5392 	 */
5393 	if (thflags & TH_ACK) {
5394 		if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val))
5395 			return (ret_val);
5396 		/* We may have changed to FIN_WAIT_1 above */
5397 		if (tp->t_state == TCPS_FIN_WAIT_1) {
5398 			/*
5399 			 * In FIN_WAIT_1 STATE in addition to the processing
5400 			 * for the ESTABLISHED state if our FIN is now
5401 			 * acknowledged then enter FIN_WAIT_2.
5402 			 */
5403 			if (ourfinisacked) {
5404 				/*
5405 				 * If we can't receive any more data, then
5406 				 * closing user can proceed. Starting the
5407 				 * timer is contrary to the specification,
5408 				 * but if we don't get a FIN we'll hang
5409 				 * forever.
5410 				 *
5411 				 * XXXjl: we should release the tp also, and
5412 				 * use a compressed state.
5413 				 */
5414 				if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
5415 					soisdisconnected(so);
5416 					tcp_timer_activate(tp, TT_2MSL,
5417 					    (tcp_fast_finwait2_recycle ?
5418 					    tcp_finwait2_timeout :
5419 					    TP_MAXIDLE(tp)));
5420 				}
5421 				tcp_state_change(tp, TCPS_FIN_WAIT_2);
5422 			}
5423 		}
5424 	}
5425 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5426 	   tiwin, thflags, nxt_pkt));
5427 }
5428 
5429 /*
5430  * Return value of 1, the TCB is unlocked and most
5431  * likely gone, return value of 0, the TCP is still
5432  * locked.
5433  */
5434 static int
5435 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
5436     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5437     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5438 {
5439 	int32_t ret_val = 0;
5440 	int32_t ourfinisacked = 0;
5441 
5442 	rack_calc_rwin(so, tp);
5443 
5444 	if ((thflags & TH_ACK) &&
5445 	    (SEQ_LEQ(th->th_ack, tp->snd_una) ||
5446 	    SEQ_GT(th->th_ack, tp->snd_max))) {
5447 		rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5448 		return (1);
5449 	}
5450 	if (IS_FASTOPEN(tp->t_flags)) {
5451 		/*
5452 		 * When a TFO connection is in SYN_RECEIVED, the
5453 		 * only valid packets are the initial SYN, a
5454 		 * retransmit/copy of the initial SYN (possibly with
5455 		 * a subset of the original data), a valid ACK, a
5456 		 * FIN, or a RST.
5457 		 */
5458 		if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
5459 			rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5460 			return (1);
5461 		} else if (thflags & TH_SYN) {
5462 			/* non-initial SYN is ignored */
5463 			struct tcp_rack *rack;
5464 
5465 			rack = (struct tcp_rack *)tp->t_fb_ptr;
5466 			if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) ||
5467 			    (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) ||
5468 			    (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) {
5469 				rack_do_drop(m, NULL);
5470 				return (0);
5471 			}
5472 		} else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) {
5473 			rack_do_drop(m, NULL);
5474 			return (0);
5475 		}
5476 	}
5477 	if (thflags & TH_RST)
5478 		return (rack_process_rst(m, th, so, tp));
5479 	/*
5480 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
5481 	 * it's less than ts_recent, drop it.
5482 	 */
5483 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
5484 	    TSTMP_LT(to->to_tsval, tp->ts_recent)) {
5485 		if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
5486 			return (ret_val);
5487 	}
5488 	/*
5489 	 * In the SYN-RECEIVED state, validate that the packet belongs to
5490 	 * this connection before trimming the data to fit the receive
5491 	 * window.  Check the sequence number versus IRS since we know the
5492 	 * sequence numbers haven't wrapped.  This is a partial fix for the
5493 	 * "LAND" DoS attack.
5494 	 */
5495 	if (SEQ_LT(th->th_seq, tp->irs)) {
5496 		rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5497 		return (1);
5498 	}
5499 	if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
5500 		return (ret_val);
5501 	}
5502 	/*
5503 	 * If last ACK falls within this segment's sequence numbers, record
5504 	 * its timestamp. NOTE: 1) That the test incorporates suggestions
5505 	 * from the latest proposal of the tcplw@cray.com list (Braden
5506 	 * 1993/04/26). 2) That updating only on newer timestamps interferes
5507 	 * with our earlier PAWS tests, so this check should be solely
5508 	 * predicated on the sequence space of this segment. 3) That we
5509 	 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
5510 	 * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
5511 	 * SEG.Len, This modified check allows us to overcome RFC1323's
5512 	 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
5513 	 * p.869. In such cases, we can still calculate the RTT correctly
5514 	 * when RCV.NXT == Last.ACK.Sent.
5515 	 */
5516 	if ((to->to_flags & TOF_TS) != 0 &&
5517 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
5518 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
5519 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
5520 		tp->ts_recent_age = tcp_ts_getticks();
5521 		tp->ts_recent = to->to_tsval;
5522 	}
5523 	tp->snd_wnd = tiwin;
5524 	/*
5525 	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
5526 	 * is on (half-synchronized state), then queue data for later
5527 	 * processing; else drop segment and return.
5528 	 */
5529 	if ((thflags & TH_ACK) == 0) {
5530 		if (IS_FASTOPEN(tp->t_flags)) {
5531 			cc_conn_init(tp);
5532 		}
5533 		return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5534 		    tiwin, thflags, nxt_pkt));
5535 	}
5536 	TCPSTAT_INC(tcps_connects);
5537 	soisconnected(so);
5538 	/* Do window scaling? */
5539 	if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
5540 	    (TF_RCVD_SCALE | TF_REQ_SCALE)) {
5541 		tp->rcv_scale = tp->request_r_scale;
5542 	}
5543 	/*
5544 	 * Make transitions: SYN-RECEIVED  -> ESTABLISHED SYN-RECEIVED* ->
5545 	 * FIN-WAIT-1
5546 	 */
5547 	tp->t_starttime = ticks;
5548 	if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) {
5549 		tcp_fastopen_decrement_counter(tp->t_tfo_pending);
5550 		tp->t_tfo_pending = NULL;
5551 
5552 		/*
5553 		 * Account for the ACK of our SYN prior to
5554 		 * regular ACK processing below.
5555 		 */
5556 		tp->snd_una++;
5557 	}
5558 	if (tp->t_flags & TF_NEEDFIN) {
5559 		tcp_state_change(tp, TCPS_FIN_WAIT_1);
5560 		tp->t_flags &= ~TF_NEEDFIN;
5561 	} else {
5562 		tcp_state_change(tp, TCPS_ESTABLISHED);
5563 		TCP_PROBE5(accept__established, NULL, tp,
5564 		    mtod(m, const char *), tp, th);
5565 		/*
5566 		 * TFO connections call cc_conn_init() during SYN
5567 		 * processing.  Calling it again here for such connections
5568 		 * is not harmless as it would undo the snd_cwnd reduction
5569 		 * that occurs when a TFO SYN|ACK is retransmitted.
5570 		 */
5571 		if (!IS_FASTOPEN(tp->t_flags))
5572 			cc_conn_init(tp);
5573 	}
5574 	/*
5575 	 * If segment contains data or ACK, will call tcp_reass() later; if
5576 	 * not, do so now to pass queued data to user.
5577 	 */
5578 	if (tlen == 0 && (thflags & TH_FIN) == 0)
5579 		(void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
5580 		    (struct mbuf *)0);
5581 	tp->snd_wl1 = th->th_seq - 1;
5582 	if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
5583 		return (ret_val);
5584 	}
5585 	if (tp->t_state == TCPS_FIN_WAIT_1) {
5586 		/* We could have went to FIN_WAIT_1 (or EST) above */
5587 		/*
5588 		 * In FIN_WAIT_1 STATE in addition to the processing for the
5589 		 * ESTABLISHED state if our FIN is now acknowledged then
5590 		 * enter FIN_WAIT_2.
5591 		 */
5592 		if (ourfinisacked) {
5593 			/*
5594 			 * If we can't receive any more data, then closing
5595 			 * user can proceed. Starting the timer is contrary
5596 			 * to the specification, but if we don't get a FIN
5597 			 * we'll hang forever.
5598 			 *
5599 			 * XXXjl: we should release the tp also, and use a
5600 			 * compressed state.
5601 			 */
5602 			if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
5603 				soisdisconnected(so);
5604 				tcp_timer_activate(tp, TT_2MSL,
5605 				    (tcp_fast_finwait2_recycle ?
5606 				    tcp_finwait2_timeout :
5607 				    TP_MAXIDLE(tp)));
5608 			}
5609 			tcp_state_change(tp, TCPS_FIN_WAIT_2);
5610 		}
5611 	}
5612 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5613 	    tiwin, thflags, nxt_pkt));
5614 }
5615 
5616 /*
5617  * Return value of 1, the TCB is unlocked and most
5618  * likely gone, return value of 0, the TCP is still
5619  * locked.
5620  */
5621 static int
5622 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
5623     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5624     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5625 {
5626 	int32_t ret_val = 0;
5627 
5628 	/*
5629 	 * Header prediction: check for the two common cases of a
5630 	 * uni-directional data xfer.  If the packet has no control flags,
5631 	 * is in-sequence, the window didn't change and we're not
5632 	 * retransmitting, it's a candidate.  If the length is zero and the
5633 	 * ack moved forward, we're the sender side of the xfer.  Just free
5634 	 * the data acked & wake any higher level process that was blocked
5635 	 * waiting for space.  If the length is non-zero and the ack didn't
5636 	 * move, we're the receiver side.  If we're getting packets in-order
5637 	 * (the reassembly queue is empty), add the data toc The socket
5638 	 * buffer and note that we need a delayed ack. Make sure that the
5639 	 * hidden state-flags are also off. Since we check for
5640 	 * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN.
5641 	 */
5642 	if (__predict_true(((to->to_flags & TOF_SACK) == 0)) &&
5643 	    __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK)) == TH_ACK) &&
5644 	    __predict_true(SEGQ_EMPTY(tp)) &&
5645 	    __predict_true(th->th_seq == tp->rcv_nxt)) {
5646 		struct tcp_rack *rack;
5647 
5648 		rack = (struct tcp_rack *)tp->t_fb_ptr;
5649 		if (tlen == 0) {
5650 			if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen,
5651 			    tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) {
5652 				return (0);
5653 			}
5654 		} else {
5655 			if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen,
5656 			    tiwin, nxt_pkt)) {
5657 				return (0);
5658 			}
5659 		}
5660 	}
5661 	rack_calc_rwin(so, tp);
5662 
5663 	if (thflags & TH_RST)
5664 		return (rack_process_rst(m, th, so, tp));
5665 
5666 	/*
5667 	 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
5668 	 * synchronized state.
5669 	 */
5670 	if (thflags & TH_SYN) {
5671 		rack_challenge_ack(m, th, tp, &ret_val);
5672 		return (ret_val);
5673 	}
5674 	/*
5675 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
5676 	 * it's less than ts_recent, drop it.
5677 	 */
5678 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
5679 	    TSTMP_LT(to->to_tsval, tp->ts_recent)) {
5680 		if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
5681 			return (ret_val);
5682 	}
5683 	if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
5684 		return (ret_val);
5685 	}
5686 	/*
5687 	 * If last ACK falls within this segment's sequence numbers, record
5688 	 * its timestamp. NOTE: 1) That the test incorporates suggestions
5689 	 * from the latest proposal of the tcplw@cray.com list (Braden
5690 	 * 1993/04/26). 2) That updating only on newer timestamps interferes
5691 	 * with our earlier PAWS tests, so this check should be solely
5692 	 * predicated on the sequence space of this segment. 3) That we
5693 	 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
5694 	 * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
5695 	 * SEG.Len, This modified check allows us to overcome RFC1323's
5696 	 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
5697 	 * p.869. In such cases, we can still calculate the RTT correctly
5698 	 * when RCV.NXT == Last.ACK.Sent.
5699 	 */
5700 	if ((to->to_flags & TOF_TS) != 0 &&
5701 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
5702 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
5703 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
5704 		tp->ts_recent_age = tcp_ts_getticks();
5705 		tp->ts_recent = to->to_tsval;
5706 	}
5707 	/*
5708 	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
5709 	 * is on (half-synchronized state), then queue data for later
5710 	 * processing; else drop segment and return.
5711 	 */
5712 	if ((thflags & TH_ACK) == 0) {
5713 		if (tp->t_flags & TF_NEEDSYN) {
5714 
5715 			return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5716 			    tiwin, thflags, nxt_pkt));
5717 
5718 		} else if (tp->t_flags & TF_ACKNOW) {
5719 			rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
5720 			return (ret_val);
5721 		} else {
5722 			rack_do_drop(m, NULL);
5723 			return (0);
5724 		}
5725 	}
5726 	/*
5727 	 * Ack processing.
5728 	 */
5729 	if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
5730 		return (ret_val);
5731 	}
5732 	if (sbavail(&so->so_snd)) {
5733 		if (rack_progress_timeout_check(tp)) {
5734 			tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
5735 			rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5736 			return (1);
5737 		}
5738 	}
5739 	/* State changes only happen in rack_process_data() */
5740 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5741 	    tiwin, thflags, nxt_pkt));
5742 }
5743 
5744 /*
5745  * Return value of 1, the TCB is unlocked and most
5746  * likely gone, return value of 0, the TCP is still
5747  * locked.
5748  */
5749 static int
5750 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
5751     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5752     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5753 {
5754 	int32_t ret_val = 0;
5755 
5756 	rack_calc_rwin(so, tp);
5757 	if (thflags & TH_RST)
5758 		return (rack_process_rst(m, th, so, tp));
5759 	/*
5760 	 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
5761 	 * synchronized state.
5762 	 */
5763 	if (thflags & TH_SYN) {
5764 		rack_challenge_ack(m, th, tp, &ret_val);
5765 		return (ret_val);
5766 	}
5767 	/*
5768 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
5769 	 * it's less than ts_recent, drop it.
5770 	 */
5771 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
5772 	    TSTMP_LT(to->to_tsval, tp->ts_recent)) {
5773 		if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
5774 			return (ret_val);
5775 	}
5776 	if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
5777 		return (ret_val);
5778 	}
5779 	/*
5780 	 * If last ACK falls within this segment's sequence numbers, record
5781 	 * its timestamp. NOTE: 1) That the test incorporates suggestions
5782 	 * from the latest proposal of the tcplw@cray.com list (Braden
5783 	 * 1993/04/26). 2) That updating only on newer timestamps interferes
5784 	 * with our earlier PAWS tests, so this check should be solely
5785 	 * predicated on the sequence space of this segment. 3) That we
5786 	 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
5787 	 * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
5788 	 * SEG.Len, This modified check allows us to overcome RFC1323's
5789 	 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
5790 	 * p.869. In such cases, we can still calculate the RTT correctly
5791 	 * when RCV.NXT == Last.ACK.Sent.
5792 	 */
5793 	if ((to->to_flags & TOF_TS) != 0 &&
5794 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
5795 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
5796 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
5797 		tp->ts_recent_age = tcp_ts_getticks();
5798 		tp->ts_recent = to->to_tsval;
5799 	}
5800 	/*
5801 	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
5802 	 * is on (half-synchronized state), then queue data for later
5803 	 * processing; else drop segment and return.
5804 	 */
5805 	if ((thflags & TH_ACK) == 0) {
5806 		if (tp->t_flags & TF_NEEDSYN) {
5807 			return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5808 			    tiwin, thflags, nxt_pkt));
5809 
5810 		} else if (tp->t_flags & TF_ACKNOW) {
5811 			rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
5812 			return (ret_val);
5813 		} else {
5814 			rack_do_drop(m, NULL);
5815 			return (0);
5816 		}
5817 	}
5818 	/*
5819 	 * Ack processing.
5820 	 */
5821 	if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) {
5822 		return (ret_val);
5823 	}
5824 	if (sbavail(&so->so_snd)) {
5825 		if (rack_progress_timeout_check(tp)) {
5826 			tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
5827 			rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5828 			return (1);
5829 		}
5830 	}
5831 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5832 	    tiwin, thflags, nxt_pkt));
5833 }
5834 
5835 static int
5836 rack_check_data_after_close(struct mbuf *m,
5837     struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so)
5838 {
5839 	struct tcp_rack *rack;
5840 
5841 	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
5842 	rack = (struct tcp_rack *)tp->t_fb_ptr;
5843 	if (rack->rc_allow_data_af_clo == 0) {
5844 	close_now:
5845 		tp = tcp_close(tp);
5846 		TCPSTAT_INC(tcps_rcvafterclose);
5847 		rack_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen));
5848 		return (1);
5849 	}
5850 	if (sbavail(&so->so_snd) == 0)
5851 		goto close_now;
5852 	/* Ok we allow data that is ignored and a followup reset */
5853 	tp->rcv_nxt = th->th_seq + *tlen;
5854 	tp->t_flags2 |= TF2_DROP_AF_DATA;
5855 	rack->r_wanted_output = 1;
5856 	*tlen = 0;
5857 	return (0);
5858 }
5859 
5860 /*
5861  * Return value of 1, the TCB is unlocked and most
5862  * likely gone, return value of 0, the TCP is still
5863  * locked.
5864  */
5865 static int
5866 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
5867     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5868     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5869 {
5870 	int32_t ret_val = 0;
5871 	int32_t ourfinisacked = 0;
5872 
5873 	rack_calc_rwin(so, tp);
5874 
5875 	if (thflags & TH_RST)
5876 		return (rack_process_rst(m, th, so, tp));
5877 	/*
5878 	 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
5879 	 * synchronized state.
5880 	 */
5881 	if (thflags & TH_SYN) {
5882 		rack_challenge_ack(m, th, tp, &ret_val);
5883 		return (ret_val);
5884 	}
5885 	/*
5886 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
5887 	 * it's less than ts_recent, drop it.
5888 	 */
5889 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
5890 	    TSTMP_LT(to->to_tsval, tp->ts_recent)) {
5891 		if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
5892 			return (ret_val);
5893 	}
5894 	if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
5895 		return (ret_val);
5896 	}
5897 	/*
5898 	 * If new data are received on a connection after the user processes
5899 	 * are gone, then RST the other end.
5900 	 */
5901 	if ((so->so_state & SS_NOFDREF) && tlen) {
5902 		if (rack_check_data_after_close(m, tp, &tlen, th, so))
5903 			return (1);
5904 	}
5905 	/*
5906 	 * If last ACK falls within this segment's sequence numbers, record
5907 	 * its timestamp. NOTE: 1) That the test incorporates suggestions
5908 	 * from the latest proposal of the tcplw@cray.com list (Braden
5909 	 * 1993/04/26). 2) That updating only on newer timestamps interferes
5910 	 * with our earlier PAWS tests, so this check should be solely
5911 	 * predicated on the sequence space of this segment. 3) That we
5912 	 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
5913 	 * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
5914 	 * SEG.Len, This modified check allows us to overcome RFC1323's
5915 	 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
5916 	 * p.869. In such cases, we can still calculate the RTT correctly
5917 	 * when RCV.NXT == Last.ACK.Sent.
5918 	 */
5919 	if ((to->to_flags & TOF_TS) != 0 &&
5920 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
5921 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
5922 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
5923 		tp->ts_recent_age = tcp_ts_getticks();
5924 		tp->ts_recent = to->to_tsval;
5925 	}
5926 	/*
5927 	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
5928 	 * is on (half-synchronized state), then queue data for later
5929 	 * processing; else drop segment and return.
5930 	 */
5931 	if ((thflags & TH_ACK) == 0) {
5932 		if (tp->t_flags & TF_NEEDSYN) {
5933 			return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5934 			    tiwin, thflags, nxt_pkt));
5935 		} else if (tp->t_flags & TF_ACKNOW) {
5936 			rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
5937 			return (ret_val);
5938 		} else {
5939 			rack_do_drop(m, NULL);
5940 			return (0);
5941 		}
5942 	}
5943 	/*
5944 	 * Ack processing.
5945 	 */
5946 	if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
5947 		return (ret_val);
5948 	}
5949 	if (ourfinisacked) {
5950 		/*
5951 		 * If we can't receive any more data, then closing user can
5952 		 * proceed. Starting the timer is contrary to the
5953 		 * specification, but if we don't get a FIN we'll hang
5954 		 * forever.
5955 		 *
5956 		 * XXXjl: we should release the tp also, and use a
5957 		 * compressed state.
5958 		 */
5959 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
5960 			soisdisconnected(so);
5961 			tcp_timer_activate(tp, TT_2MSL,
5962 			    (tcp_fast_finwait2_recycle ?
5963 			    tcp_finwait2_timeout :
5964 			    TP_MAXIDLE(tp)));
5965 		}
5966 		tcp_state_change(tp, TCPS_FIN_WAIT_2);
5967 	}
5968 	if (sbavail(&so->so_snd)) {
5969 		if (rack_progress_timeout_check(tp)) {
5970 			tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
5971 			rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
5972 			return (1);
5973 		}
5974 	}
5975 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
5976 	    tiwin, thflags, nxt_pkt));
5977 }
5978 
5979 /*
5980  * Return value of 1, the TCB is unlocked and most
5981  * likely gone, return value of 0, the TCP is still
5982  * locked.
5983  */
5984 static int
5985 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
5986     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
5987     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
5988 {
5989 	int32_t ret_val = 0;
5990 	int32_t ourfinisacked = 0;
5991 
5992 	rack_calc_rwin(so, tp);
5993 
5994 	if (thflags & TH_RST)
5995 		return (rack_process_rst(m, th, so, tp));
5996 	/*
5997 	 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
5998 	 * synchronized state.
5999 	 */
6000 	if (thflags & TH_SYN) {
6001 		rack_challenge_ack(m, th, tp, &ret_val);
6002 		return (ret_val);
6003 	}
6004 	/*
6005 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
6006 	 * it's less than ts_recent, drop it.
6007 	 */
6008 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
6009 	    TSTMP_LT(to->to_tsval, tp->ts_recent)) {
6010 		if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
6011 			return (ret_val);
6012 	}
6013 	if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
6014 		return (ret_val);
6015 	}
6016 	/*
6017 	 * If new data are received on a connection after the user processes
6018 	 * are gone, then RST the other end.
6019 	 */
6020 	if ((so->so_state & SS_NOFDREF) && tlen) {
6021 		if (rack_check_data_after_close(m, tp, &tlen, th, so))
6022 			return (1);
6023 	}
6024 	/*
6025 	 * If last ACK falls within this segment's sequence numbers, record
6026 	 * its timestamp. NOTE: 1) That the test incorporates suggestions
6027 	 * from the latest proposal of the tcplw@cray.com list (Braden
6028 	 * 1993/04/26). 2) That updating only on newer timestamps interferes
6029 	 * with our earlier PAWS tests, so this check should be solely
6030 	 * predicated on the sequence space of this segment. 3) That we
6031 	 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
6032 	 * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
6033 	 * SEG.Len, This modified check allows us to overcome RFC1323's
6034 	 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
6035 	 * p.869. In such cases, we can still calculate the RTT correctly
6036 	 * when RCV.NXT == Last.ACK.Sent.
6037 	 */
6038 	if ((to->to_flags & TOF_TS) != 0 &&
6039 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
6040 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
6041 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
6042 		tp->ts_recent_age = tcp_ts_getticks();
6043 		tp->ts_recent = to->to_tsval;
6044 	}
6045 	/*
6046 	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
6047 	 * is on (half-synchronized state), then queue data for later
6048 	 * processing; else drop segment and return.
6049 	 */
6050 	if ((thflags & TH_ACK) == 0) {
6051 		if (tp->t_flags & TF_NEEDSYN) {
6052 			return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6053 			    tiwin, thflags, nxt_pkt));
6054 		} else if (tp->t_flags & TF_ACKNOW) {
6055 			rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
6056 			return (ret_val);
6057 		} else {
6058 			rack_do_drop(m, NULL);
6059 			return (0);
6060 		}
6061 	}
6062 	/*
6063 	 * Ack processing.
6064 	 */
6065 	if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
6066 		return (ret_val);
6067 	}
6068 	if (ourfinisacked) {
6069 		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
6070 		tcp_twstart(tp);
6071 		m_freem(m);
6072 		return (1);
6073 	}
6074 	if (sbavail(&so->so_snd)) {
6075 		if (rack_progress_timeout_check(tp)) {
6076 			tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
6077 			rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6078 			return (1);
6079 		}
6080 	}
6081 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6082 	    tiwin, thflags, nxt_pkt));
6083 }
6084 
6085 /*
6086  * Return value of 1, the TCB is unlocked and most
6087  * likely gone, return value of 0, the TCP is still
6088  * locked.
6089  */
6090 static int
6091 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
6092     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6093     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
6094 {
6095 	int32_t ret_val = 0;
6096 	int32_t ourfinisacked = 0;
6097 
6098 	rack_calc_rwin(so, tp);
6099 
6100 	if (thflags & TH_RST)
6101 		return (rack_process_rst(m, th, so, tp));
6102 	/*
6103 	 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
6104 	 * synchronized state.
6105 	 */
6106 	if (thflags & TH_SYN) {
6107 		rack_challenge_ack(m, th, tp, &ret_val);
6108 		return (ret_val);
6109 	}
6110 	/*
6111 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
6112 	 * it's less than ts_recent, drop it.
6113 	 */
6114 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
6115 	    TSTMP_LT(to->to_tsval, tp->ts_recent)) {
6116 		if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
6117 			return (ret_val);
6118 	}
6119 	if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
6120 		return (ret_val);
6121 	}
6122 	/*
6123 	 * If new data are received on a connection after the user processes
6124 	 * are gone, then RST the other end.
6125 	 */
6126 	if ((so->so_state & SS_NOFDREF) && tlen) {
6127 		if (rack_check_data_after_close(m, tp, &tlen, th, so))
6128 			return (1);
6129 	}
6130 	/*
6131 	 * If last ACK falls within this segment's sequence numbers, record
6132 	 * its timestamp. NOTE: 1) That the test incorporates suggestions
6133 	 * from the latest proposal of the tcplw@cray.com list (Braden
6134 	 * 1993/04/26). 2) That updating only on newer timestamps interferes
6135 	 * with our earlier PAWS tests, so this check should be solely
6136 	 * predicated on the sequence space of this segment. 3) That we
6137 	 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
6138 	 * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
6139 	 * SEG.Len, This modified check allows us to overcome RFC1323's
6140 	 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
6141 	 * p.869. In such cases, we can still calculate the RTT correctly
6142 	 * when RCV.NXT == Last.ACK.Sent.
6143 	 */
6144 	if ((to->to_flags & TOF_TS) != 0 &&
6145 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
6146 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
6147 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
6148 		tp->ts_recent_age = tcp_ts_getticks();
6149 		tp->ts_recent = to->to_tsval;
6150 	}
6151 	/*
6152 	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
6153 	 * is on (half-synchronized state), then queue data for later
6154 	 * processing; else drop segment and return.
6155 	 */
6156 	if ((thflags & TH_ACK) == 0) {
6157 		if (tp->t_flags & TF_NEEDSYN) {
6158 			return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6159 			    tiwin, thflags, nxt_pkt));
6160 		} else if (tp->t_flags & TF_ACKNOW) {
6161 			rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
6162 			return (ret_val);
6163 		} else {
6164 			rack_do_drop(m, NULL);
6165 			return (0);
6166 		}
6167 	}
6168 	/*
6169 	 * case TCPS_LAST_ACK: Ack processing.
6170 	 */
6171 	if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
6172 		return (ret_val);
6173 	}
6174 	if (ourfinisacked) {
6175 		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
6176 		tp = tcp_close(tp);
6177 		rack_do_drop(m, tp);
6178 		return (1);
6179 	}
6180 	if (sbavail(&so->so_snd)) {
6181 		if (rack_progress_timeout_check(tp)) {
6182 			tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
6183 			rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6184 			return (1);
6185 		}
6186 	}
6187 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6188 	    tiwin, thflags, nxt_pkt));
6189 }
6190 
6191 
6192 /*
6193  * Return value of 1, the TCB is unlocked and most
6194  * likely gone, return value of 0, the TCP is still
6195  * locked.
6196  */
6197 static int
6198 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
6199     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
6200     uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
6201 {
6202 	int32_t ret_val = 0;
6203 	int32_t ourfinisacked = 0;
6204 
6205 	rack_calc_rwin(so, tp);
6206 
6207 	/* Reset receive buffer auto scaling when not in bulk receive mode. */
6208 	if (thflags & TH_RST)
6209 		return (rack_process_rst(m, th, so, tp));
6210 	/*
6211 	 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
6212 	 * synchronized state.
6213 	 */
6214 	if (thflags & TH_SYN) {
6215 		rack_challenge_ack(m, th, tp, &ret_val);
6216 		return (ret_val);
6217 	}
6218 	/*
6219 	 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
6220 	 * it's less than ts_recent, drop it.
6221 	 */
6222 	if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
6223 	    TSTMP_LT(to->to_tsval, tp->ts_recent)) {
6224 		if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val))
6225 			return (ret_val);
6226 	}
6227 	if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
6228 		return (ret_val);
6229 	}
6230 	/*
6231 	 * If new data are received on a connection after the user processes
6232 	 * are gone, then RST the other end.
6233 	 */
6234 	if ((so->so_state & SS_NOFDREF) &&
6235 	    tlen) {
6236 		if (rack_check_data_after_close(m, tp, &tlen, th, so))
6237 			return (1);
6238 	}
6239 	/*
6240 	 * If last ACK falls within this segment's sequence numbers, record
6241 	 * its timestamp. NOTE: 1) That the test incorporates suggestions
6242 	 * from the latest proposal of the tcplw@cray.com list (Braden
6243 	 * 1993/04/26). 2) That updating only on newer timestamps interferes
6244 	 * with our earlier PAWS tests, so this check should be solely
6245 	 * predicated on the sequence space of this segment. 3) That we
6246 	 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
6247 	 * + SEG.Len  instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
6248 	 * SEG.Len, This modified check allows us to overcome RFC1323's
6249 	 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
6250 	 * p.869. In such cases, we can still calculate the RTT correctly
6251 	 * when RCV.NXT == Last.ACK.Sent.
6252 	 */
6253 	if ((to->to_flags & TOF_TS) != 0 &&
6254 	    SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
6255 	    SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
6256 	    ((thflags & (TH_SYN | TH_FIN)) != 0))) {
6257 		tp->ts_recent_age = tcp_ts_getticks();
6258 		tp->ts_recent = to->to_tsval;
6259 	}
6260 	/*
6261 	 * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN flag
6262 	 * is on (half-synchronized state), then queue data for later
6263 	 * processing; else drop segment and return.
6264 	 */
6265 	if ((thflags & TH_ACK) == 0) {
6266 		if (tp->t_flags & TF_NEEDSYN) {
6267 			return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6268 			    tiwin, thflags, nxt_pkt));
6269 		} else if (tp->t_flags & TF_ACKNOW) {
6270 			rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
6271 			return (ret_val);
6272 		} else {
6273 			rack_do_drop(m, NULL);
6274 			return (0);
6275 		}
6276 	}
6277 	/*
6278 	 * Ack processing.
6279 	 */
6280 	if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
6281 		return (ret_val);
6282 	}
6283 	if (sbavail(&so->so_snd)) {
6284 		if (rack_progress_timeout_check(tp)) {
6285 			tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
6286 			rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6287 			return (1);
6288 		}
6289 	}
6290 	return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
6291 	    tiwin, thflags, nxt_pkt));
6292 }
6293 
6294 
6295 static void inline
6296 rack_clear_rate_sample(struct tcp_rack *rack)
6297 {
6298 	rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY;
6299 	rack->r_ctl.rack_rs.rs_rtt_cnt = 0;
6300 	rack->r_ctl.rack_rs.rs_rtt_tot = 0;
6301 }
6302 
6303 static int
6304 rack_init(struct tcpcb *tp)
6305 {
6306 	struct tcp_rack *rack = NULL;
6307 
6308 	tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
6309 	if (tp->t_fb_ptr == NULL) {
6310 		/*
6311 		 * We need to allocate memory but cant. The INP and INP_INFO
6312 		 * locks and they are recusive (happens during setup. So a
6313 		 * scheme to drop the locks fails :(
6314 		 *
6315 		 */
6316 		return (ENOMEM);
6317 	}
6318 	memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack));
6319 
6320 	rack = (struct tcp_rack *)tp->t_fb_ptr;
6321 	TAILQ_INIT(&rack->r_ctl.rc_map);
6322 	TAILQ_INIT(&rack->r_ctl.rc_free);
6323 	TAILQ_INIT(&rack->r_ctl.rc_tmap);
6324 	rack->rc_tp = tp;
6325 	if (tp->t_inpcb) {
6326 		rack->rc_inp = tp->t_inpcb;
6327 	}
6328 	/* Probably not needed but lets be sure */
6329 	rack_clear_rate_sample(rack);
6330 	rack->r_cpu = 0;
6331 	rack->r_ctl.rc_reorder_fade = rack_reorder_fade;
6332 	rack->rc_allow_data_af_clo = rack_ignore_data_after_close;
6333 	rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh;
6334 	rack->rc_pace_reduce = rack_slot_reduction;
6335 	if (V_tcp_delack_enabled)
6336 		tp->t_delayed_ack = 1;
6337 	else
6338 		tp->t_delayed_ack = 0;
6339 	rack->rc_pace_max_segs = rack_hptsi_segments;
6340 	rack->r_ctl.rc_early_recovery_segs = rack_early_recovery_max_seg;
6341 	rack->r_ctl.rc_reorder_shift = rack_reorder_thresh;
6342 	rack->r_ctl.rc_pkt_delay = rack_pkt_delay;
6343 	rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce;
6344 	rack->r_idle_reduce_largest  = rack_reduce_largest_on_idle;
6345 	rack->r_enforce_min_pace = rack_min_pace_time;
6346 	rack->r_min_pace_seg_thresh = rack_min_pace_time_seg_req;
6347 	rack->r_ctl.rc_prop_rate = rack_proportional_rate;
6348 	rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp;
6349 	rack->r_ctl.rc_early_recovery = rack_early_recovery;
6350 	rack->rc_always_pace = rack_pace_every_seg;
6351 	rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method;
6352 	rack->rack_tlp_threshold_use = rack_tlp_threshold_use;
6353 	rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr;
6354 	rack->r_ctl.rc_min_to = rack_min_to;
6355 	rack->r_ctl.rc_prr_inc_var = rack_inc_var;
6356 	rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
6357 	if (tp->snd_una != tp->snd_max) {
6358 		/* Create a send map for the current outstanding data */
6359 		struct rack_sendmap *rsm;
6360 
6361 		rsm = rack_alloc(rack);
6362 		if (rsm == NULL) {
6363 			uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
6364 			tp->t_fb_ptr = NULL;
6365 			return (ENOMEM);
6366 		}
6367 		rsm->r_flags = RACK_OVERMAX;
6368 		rsm->r_tim_lastsent[0] = tcp_ts_getticks();
6369 		rsm->r_rtr_cnt = 1;
6370 		rsm->r_rtr_bytes = 0;
6371 		rsm->r_start = tp->snd_una;
6372 		rsm->r_end = tp->snd_max;
6373 		rsm->r_sndcnt = 0;
6374 		TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next);
6375 		TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
6376 		rsm->r_in_tmap = 1;
6377 	}
6378 	return (0);
6379 }
6380 
6381 static int
6382 rack_handoff_ok(struct tcpcb *tp)
6383 {
6384 	if ((tp->t_state == TCPS_CLOSED) ||
6385 	    (tp->t_state == TCPS_LISTEN)) {
6386 		/* Sure no problem though it may not stick */
6387 		return (0);
6388 	}
6389 	if ((tp->t_state == TCPS_SYN_SENT) ||
6390 	    (tp->t_state == TCPS_SYN_RECEIVED)) {
6391 		/*
6392 		 * We really don't know you have to get to ESTAB or beyond
6393 		 * to tell.
6394 		 */
6395 		return (EAGAIN);
6396 	}
6397 	if (tp->t_flags & TF_SACK_PERMIT) {
6398 		return (0);
6399 	}
6400 	/*
6401 	 * If we reach here we don't do SACK on this connection so we can
6402 	 * never do rack.
6403 	 */
6404 	return (EINVAL);
6405 }
6406 
6407 static void
6408 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged)
6409 {
6410 	if (tp->t_fb_ptr) {
6411 		struct tcp_rack *rack;
6412 		struct rack_sendmap *rsm;
6413 
6414 		rack = (struct tcp_rack *)tp->t_fb_ptr;
6415 #ifdef TCP_BLACKBOX
6416 		tcp_log_flowend(tp);
6417 #endif
6418 		rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
6419 		while (rsm) {
6420 			TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next);
6421 			uma_zfree(rack_zone, rsm);
6422 			rsm = TAILQ_FIRST(&rack->r_ctl.rc_map);
6423 		}
6424 		rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
6425 		while (rsm) {
6426 			TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next);
6427 			uma_zfree(rack_zone, rsm);
6428 			rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
6429 		}
6430 		rack->rc_free_cnt = 0;
6431 		uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
6432 		tp->t_fb_ptr = NULL;
6433 	}
6434 }
6435 
6436 static void
6437 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack)
6438 {
6439 	switch (tp->t_state) {
6440 	case TCPS_SYN_SENT:
6441 		rack->r_state = TCPS_SYN_SENT;
6442 		rack->r_substate = rack_do_syn_sent;
6443 		break;
6444 	case TCPS_SYN_RECEIVED:
6445 		rack->r_state = TCPS_SYN_RECEIVED;
6446 		rack->r_substate = rack_do_syn_recv;
6447 		break;
6448 	case TCPS_ESTABLISHED:
6449 		rack->r_state = TCPS_ESTABLISHED;
6450 		rack->r_substate = rack_do_established;
6451 		break;
6452 	case TCPS_CLOSE_WAIT:
6453 		rack->r_state = TCPS_CLOSE_WAIT;
6454 		rack->r_substate = rack_do_close_wait;
6455 		break;
6456 	case TCPS_FIN_WAIT_1:
6457 		rack->r_state = TCPS_FIN_WAIT_1;
6458 		rack->r_substate = rack_do_fin_wait_1;
6459 		break;
6460 	case TCPS_CLOSING:
6461 		rack->r_state = TCPS_CLOSING;
6462 		rack->r_substate = rack_do_closing;
6463 		break;
6464 	case TCPS_LAST_ACK:
6465 		rack->r_state = TCPS_LAST_ACK;
6466 		rack->r_substate = rack_do_lastack;
6467 		break;
6468 	case TCPS_FIN_WAIT_2:
6469 		rack->r_state = TCPS_FIN_WAIT_2;
6470 		rack->r_substate = rack_do_fin_wait_2;
6471 		break;
6472 	case TCPS_LISTEN:
6473 	case TCPS_CLOSED:
6474 	case TCPS_TIME_WAIT:
6475 	default:
6476 #ifdef INVARIANTS
6477 		panic("tcp tp:%p state:%d sees impossible state?", tp, tp->t_state);
6478 #endif
6479 		break;
6480 	};
6481 }
6482 
6483 
6484 static void
6485 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb)
6486 {
6487 	/*
6488 	 * We received an ack, and then did not
6489 	 * call send or were bounced out due to the
6490 	 * hpts was running. Now a timer is up as well, is
6491 	 * it the right timer?
6492 	 */
6493 	struct rack_sendmap *rsm;
6494 	int tmr_up;
6495 
6496 	tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
6497 	if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT))
6498 		return;
6499 	rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
6500 	if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) &&
6501 	    (tmr_up == PACE_TMR_RXT)) {
6502 		/* Should be an RXT */
6503 		return;
6504 	}
6505 	if (rsm == NULL) {
6506 		/* Nothing outstanding? */
6507 		if (tp->t_flags & TF_DELACK) {
6508 			if (tmr_up == PACE_TMR_DELACK)
6509 				/* We are supposed to have delayed ack up and we do */
6510 				return;
6511 		} else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) {
6512 			/*
6513 			 * if we hit enobufs then we would expect the possiblity
6514 			 * of nothing outstanding and the RXT up (and the hptsi timer).
6515 			 */
6516 			return;
6517 		} else if (((tcp_always_keepalive ||
6518 			     rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
6519 			    (tp->t_state <= TCPS_CLOSING)) &&
6520 			   (tmr_up == PACE_TMR_KEEP) &&
6521 			   (tp->snd_max == tp->snd_una)) {
6522 			/* We should have keep alive up and we do */
6523 			return;
6524 		}
6525 	}
6526 	if (rsm && (rsm->r_flags & RACK_SACK_PASSED)) {
6527 		if ((tp->t_flags & TF_SENTFIN) &&
6528 		    ((tp->snd_max - tp->snd_una) == 1) &&
6529 		    (rsm->r_flags & RACK_HAS_FIN)) {
6530 			/* needs to be a RXT */
6531 			if (tmr_up == PACE_TMR_RXT)
6532 				return;
6533 		} else if (tmr_up == PACE_TMR_RACK)
6534 			return;
6535 	} else if (SEQ_GT(tp->snd_max,tp->snd_una) &&
6536 		   ((tmr_up == PACE_TMR_TLP) ||
6537 		    (tmr_up == PACE_TMR_RXT))) {
6538 		/*
6539 		 * Either a TLP or RXT is fine if no sack-passed
6540 		 * is in place and data is outstanding.
6541 		 */
6542 		return;
6543 	} else if (tmr_up == PACE_TMR_DELACK) {
6544 		/*
6545 		 * If the delayed ack was going to go off
6546 		 * before the rtx/tlp/rack timer were going to
6547 		 * expire, then that would be the timer in control.
6548 		 * Note we don't check the time here trusting the
6549 		 * code is correct.
6550 		 */
6551 		return;
6552 	}
6553 	/*
6554 	 * Ok the timer originally started is not what we want now.
6555 	 * We will force the hpts to be stopped if any, and restart
6556 	 * with the slot set to what was in the saved slot.
6557 	 */
6558 	rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
6559 	rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
6560 }
6561 
6562 static void
6563 rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
6564     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos,
6565     int32_t nxt_pkt, struct timeval *tv)
6566 {
6567 	int32_t thflags, retval, did_out = 0;
6568 	int32_t way_out = 0;
6569 	uint32_t cts;
6570 	uint32_t tiwin;
6571 	struct tcpopt to;
6572 	struct tcp_rack *rack;
6573 	struct rack_sendmap *rsm;
6574 	int32_t prev_state = 0;
6575 
6576 	cts = tcp_tv_to_mssectick(tv);
6577 	rack = (struct tcp_rack *)tp->t_fb_ptr;
6578 
6579 	kern_prefetch(rack, &prev_state);
6580 	prev_state = 0;
6581 	thflags = th->th_flags;
6582 	/*
6583 	 * If this is either a state-changing packet or current state isn't
6584 	 * established, we require a read lock on tcbinfo.  Otherwise, we
6585 	 * allow the tcbinfo to be in either locked or unlocked, as the
6586 	 * caller may have unnecessarily acquired a lock due to a race.
6587 	 */
6588 	if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
6589 	    tp->t_state != TCPS_ESTABLISHED) {
6590 		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
6591 	}
6592 	INP_WLOCK_ASSERT(tp->t_inpcb);
6593 	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
6594 	    __func__));
6595 	KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
6596 	    __func__));
6597 	{
6598 		union tcp_log_stackspecific log;
6599 
6600 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
6601 		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
6602 		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
6603 		TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
6604 		    tlen, &log, true);
6605 	}
6606 	if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
6607 		way_out = 4;
6608 		goto done_with_input;
6609 	}
6610 	/*
6611 	 * If a segment with the ACK-bit set arrives in the SYN-SENT state
6612 	 * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9.
6613 	 */
6614 	if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
6615 	    (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
6616 		rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
6617 		return;
6618 	}
6619 	/*
6620 	 * Segment received on connection. Reset idle time and keep-alive
6621 	 * timer. XXX: This should be done after segment validation to
6622 	 * ignore broken/spoofed segs.
6623 	 */
6624 	if  (tp->t_idle_reduce && (tp->snd_max == tp->snd_una)) {
6625 #ifdef NETFLIX_CWV
6626 		if ((tp->cwv_enabled) &&
6627 		    ((tp->cwv_cwnd_valid == 0) &&
6628 		     TCPS_HAVEESTABLISHED(tp->t_state) &&
6629 		     (tp->snd_cwnd > tp->snd_cwv.init_cwnd))) {
6630 			tcp_newcwv_nvp_closedown(tp);
6631 		} else
6632 #endif
6633 		       if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
6634 			counter_u64_add(rack_input_idle_reduces, 1);
6635 			rack_cc_after_idle(tp,
6636 			    (rack->r_idle_reduce_largest ? 1 :0));
6637 		}
6638 	}
6639 	rack->r_ctl.rc_rcvtime = cts;
6640 	tp->t_rcvtime = ticks;
6641 
6642 #ifdef NETFLIX_CWV
6643 	if (tp->cwv_enabled) {
6644 		if ((tp->cwv_cwnd_valid == 0) &&
6645 		    TCPS_HAVEESTABLISHED(tp->t_state) &&
6646 		    (tp->snd_cwnd > tp->snd_cwv.init_cwnd))
6647 			tcp_newcwv_nvp_closedown(tp);
6648 	}
6649 #endif
6650 	/*
6651 	 * Unscale the window into a 32-bit value. For the SYN_SENT state
6652 	 * the scale is zero.
6653 	 */
6654 	tiwin = th->th_win << tp->snd_scale;
6655 #ifdef NETFLIX_STATS
6656 	stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
6657 #endif
6658 	/*
6659 	 * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move
6660 	 * this to occur after we've validated the segment.
6661 	 */
6662 	if (tp->t_flags & TF_ECN_PERMIT) {
6663 		if (thflags & TH_CWR)
6664 			tp->t_flags &= ~TF_ECN_SND_ECE;
6665 		switch (iptos & IPTOS_ECN_MASK) {
6666 		case IPTOS_ECN_CE:
6667 			tp->t_flags |= TF_ECN_SND_ECE;
6668 			TCPSTAT_INC(tcps_ecn_ce);
6669 			break;
6670 		case IPTOS_ECN_ECT0:
6671 			TCPSTAT_INC(tcps_ecn_ect0);
6672 			break;
6673 		case IPTOS_ECN_ECT1:
6674 			TCPSTAT_INC(tcps_ecn_ect1);
6675 			break;
6676 		}
6677 		/* Congestion experienced. */
6678 		if (thflags & TH_ECE) {
6679 			rack_cong_signal(tp, th, CC_ECN);
6680 		}
6681 	}
6682 	/*
6683 	 * Parse options on any incoming segment.
6684 	 */
6685 	tcp_dooptions(&to, (u_char *)(th + 1),
6686 	    (th->th_off << 2) - sizeof(struct tcphdr),
6687 	    (thflags & TH_SYN) ? TO_SYN : 0);
6688 
6689 	/*
6690 	 * If echoed timestamp is later than the current time, fall back to
6691 	 * non RFC1323 RTT calculation.  Normalize timestamp if syncookies
6692 	 * were used when this connection was established.
6693 	 */
6694 	if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
6695 		to.to_tsecr -= tp->ts_offset;
6696 		if (TSTMP_GT(to.to_tsecr, cts))
6697 			to.to_tsecr = 0;
6698 	}
6699 	/*
6700 	 * If its the first time in we need to take care of options and
6701 	 * verify we can do SACK for rack!
6702 	 */
6703 	if (rack->r_state == 0) {
6704 		/* Should be init'd by rack_init() */
6705 		KASSERT(rack->rc_inp != NULL,
6706 		    ("%s: rack->rc_inp unexpectedly NULL", __func__));
6707 		if (rack->rc_inp == NULL) {
6708 			rack->rc_inp = tp->t_inpcb;
6709 		}
6710 
6711 		/*
6712 		 * Process options only when we get SYN/ACK back. The SYN
6713 		 * case for incoming connections is handled in tcp_syncache.
6714 		 * According to RFC1323 the window field in a SYN (i.e., a
6715 		 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX
6716 		 * this is traditional behavior, may need to be cleaned up.
6717 		 */
6718 		rack->r_cpu = inp_to_cpuid(tp->t_inpcb);
6719 		if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
6720 			if ((to.to_flags & TOF_SCALE) &&
6721 			    (tp->t_flags & TF_REQ_SCALE)) {
6722 				tp->t_flags |= TF_RCVD_SCALE;
6723 				tp->snd_scale = to.to_wscale;
6724 			}
6725 			/*
6726 			 * Initial send window.  It will be updated with the
6727 			 * next incoming segment to the scaled value.
6728 			 */
6729 			tp->snd_wnd = th->th_win;
6730 			if (to.to_flags & TOF_TS) {
6731 				tp->t_flags |= TF_RCVD_TSTMP;
6732 				tp->ts_recent = to.to_tsval;
6733 				tp->ts_recent_age = cts;
6734 			}
6735 			if (to.to_flags & TOF_MSS)
6736 				tcp_mss(tp, to.to_mss);
6737 			if ((tp->t_flags & TF_SACK_PERMIT) &&
6738 			    (to.to_flags & TOF_SACKPERM) == 0)
6739 				tp->t_flags &= ~TF_SACK_PERMIT;
6740 			if (IS_FASTOPEN(tp->t_flags)) {
6741 				if (to.to_flags & TOF_FASTOPEN) {
6742 					uint16_t mss;
6743 
6744 					if (to.to_flags & TOF_MSS)
6745 						mss = to.to_mss;
6746 					else
6747 						if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
6748 							mss = TCP6_MSS;
6749 						else
6750 							mss = TCP_MSS;
6751 					tcp_fastopen_update_cache(tp, mss,
6752 					    to.to_tfo_len, to.to_tfo_cookie);
6753 				} else
6754 					tcp_fastopen_disable_path(tp);
6755 			}
6756 		}
6757 		/*
6758 		 * At this point we are at the initial call. Here we decide
6759 		 * if we are doing RACK or not. We do this by seeing if
6760 		 * TF_SACK_PERMIT is set, if not rack is *not* possible and
6761 		 * we switch to the default code.
6762 		 */
6763 		if ((tp->t_flags & TF_SACK_PERMIT) == 0) {
6764 			tcp_switch_back_to_default(tp);
6765 			(*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen,
6766 			    tlen, iptos);
6767 			return;
6768 		}
6769 		/* Set the flag */
6770 		rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
6771 		tcp_set_hpts(tp->t_inpcb);
6772 		rack_stop_all_timers(tp);
6773 		sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack);
6774 	}
6775 	/*
6776 	 * This is the one exception case where we set the rack state
6777 	 * always. All other times (timers etc) we must have a rack-state
6778 	 * set (so we assure we have done the checks above for SACK).
6779 	 */
6780 	if (rack->r_state != tp->t_state)
6781 		rack_set_state(tp, rack);
6782 	if (SEQ_GT(th->th_ack, tp->snd_una) && (rsm = TAILQ_FIRST(&rack->r_ctl.rc_map)) != NULL)
6783 		kern_prefetch(rsm, &prev_state);
6784 	prev_state = rack->r_state;
6785 	rack->r_ctl.rc_tlp_send_cnt = 0;
6786 	rack_clear_rate_sample(rack);
6787 	retval = (*rack->r_substate) (m, th, so,
6788 	    tp, &to, drop_hdrlen,
6789 	    tlen, tiwin, thflags, nxt_pkt);
6790 #ifdef INVARIANTS
6791 	if ((retval == 0) &&
6792 	    (tp->t_inpcb == NULL)) {
6793 		panic("retval:%d tp:%p t_inpcb:NULL state:%d",
6794 		    retval, tp, prev_state);
6795 	}
6796 #endif
6797 	if (retval == 0) {
6798 		/*
6799 		 * If retval is 1 the tcb is unlocked and most likely the tp
6800 		 * is gone.
6801 		 */
6802 		INP_WLOCK_ASSERT(tp->t_inpcb);
6803 		tcp_rack_xmit_timer_commit(rack, tp);
6804 		if (((tp->snd_max - tp->snd_una) > tp->snd_wnd) &&
6805 		    (rack->rc_in_persist == 0)){
6806 			/*
6807 			 * The peer shrunk its window on us to the point
6808 			 * where we have sent too much. The only thing
6809 			 * we can do here is stop any timers and
6810 			 * enter persist. We most likely lost the last
6811 			 * bytes we sent but oh well, we will have to
6812 			 * retransmit them after the peer is caught up.
6813 			 */
6814 			if (rack->rc_inp->inp_in_hpts)
6815 				tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
6816 			rack_timer_cancel(tp, rack, cts, __LINE__);
6817 			rack_enter_persist(tp, rack, cts);
6818 			rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
6819 			way_out = 3;
6820 			goto done_with_input;
6821 		}
6822 		if (nxt_pkt == 0) {
6823 			if (rack->r_wanted_output != 0) {
6824 				did_out = 1;
6825 				(void)tp->t_fb->tfb_tcp_output(tp);
6826 			}
6827 			rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0);
6828 		}
6829 		if (((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) &&
6830 		    (SEQ_GT(tp->snd_max, tp->snd_una) ||
6831 		     (tp->t_flags & TF_DELACK) ||
6832 		     ((tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
6833 		      (tp->t_state <= TCPS_CLOSING)))) {
6834 			/* We could not send (probably in the hpts but stopped the timer earlier)? */
6835 			if ((tp->snd_max == tp->snd_una) &&
6836 			    ((tp->t_flags & TF_DELACK) == 0) &&
6837 			    (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
6838 				/* keep alive not needed if we are hptsi output yet */
6839 				;
6840 			} else {
6841 				if (rack->rc_inp->inp_in_hpts)
6842 					tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
6843 				rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
6844 			}
6845 			way_out = 1;
6846 		} else {
6847 			/* Do we have the correct timer running? */
6848 			rack_timer_audit(tp, rack, &so->so_snd);
6849 			way_out = 2;
6850 		}
6851 	done_with_input:
6852 		rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out);
6853 		if (did_out)
6854 			rack->r_wanted_output = 0;
6855 #ifdef INVARIANTS
6856 		if (tp->t_inpcb == NULL) {
6857 			panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d",
6858 			      did_out,
6859 			      retval, tp, prev_state);
6860 		}
6861 #endif
6862 		INP_WUNLOCK(tp->t_inpcb);
6863 	}
6864 }
6865 
6866 void
6867 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
6868     struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos)
6869 {
6870 	struct timeval tv;
6871 #ifdef RSS
6872 	struct tcp_function_block *tfb;
6873 	struct tcp_rack *rack;
6874 	struct epoch_tracker et;
6875 
6876 	rack = (struct tcp_rack *)tp->t_fb_ptr;
6877 	if (rack->r_state == 0) {
6878 		/*
6879 		 * Initial input (ACK to SYN-ACK etc)lets go ahead and get
6880 		 * it processed
6881 		 */
6882 		INP_INFO_RLOCK_ET(&V_tcbinfo, et);
6883 		tcp_get_usecs(&tv);
6884 		rack_hpts_do_segment(m, th, so, tp, drop_hdrlen,
6885 		    tlen, iptos, 0, &tv);
6886 		INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
6887 		return;
6888 	}
6889 	tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos);
6890 	INP_WUNLOCK(tp->t_inpcb);
6891 #else
6892 	tcp_get_usecs(&tv);
6893 	rack_hpts_do_segment(m, th, so, tp, drop_hdrlen,
6894 	    tlen, iptos, 0, &tv);
6895 #endif
6896 }
6897 
6898 struct rack_sendmap *
6899 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused)
6900 {
6901 	struct rack_sendmap *rsm = NULL;
6902 	int32_t idx;
6903 	uint32_t srtt_cur, srtt = 0, thresh = 0, ts_low = 0;
6904 
6905 	/* Return the next guy to be re-transmitted */
6906 	if (TAILQ_EMPTY(&rack->r_ctl.rc_map)) {
6907 		return (NULL);
6908 	}
6909 	if (tp->t_flags & TF_SENTFIN) {
6910 		/* retran the end FIN? */
6911 		return (NULL);
6912 	}
6913 	/* ok lets look at this one */
6914 	rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
6915 	if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) {
6916 		goto check_it;
6917 	}
6918 	rsm = rack_find_lowest_rsm(rack);
6919 	if (rsm == NULL) {
6920 		return (NULL);
6921 	}
6922 check_it:
6923 	srtt_cur = tp->t_srtt >> TCP_RTT_SHIFT;
6924 	srtt = TICKS_2_MSEC(srtt_cur);
6925 	if (rack->rc_rack_rtt && (srtt > rack->rc_rack_rtt))
6926 		srtt = rack->rc_rack_rtt;
6927 	if (rsm->r_flags & RACK_ACKED) {
6928 		return (NULL);
6929 	}
6930 	if ((rsm->r_flags & RACK_SACK_PASSED) == 0) {
6931 		/* Its not yet ready */
6932 		return (NULL);
6933 	}
6934 	idx = rsm->r_rtr_cnt - 1;
6935 	ts_low = rsm->r_tim_lastsent[idx];
6936 	thresh = rack_calc_thresh_rack(rack, srtt, tsused);
6937 	if (tsused <= ts_low) {
6938 		return (NULL);
6939 	}
6940 	if ((tsused - ts_low) >= thresh) {
6941 		return (rsm);
6942 	}
6943 	return (NULL);
6944 }
6945 
6946 static int
6947 rack_output(struct tcpcb *tp)
6948 {
6949 	struct socket *so;
6950 	uint32_t recwin, sendwin;
6951 	uint32_t sb_offset;
6952 	int32_t len, flags, error = 0;
6953 	struct mbuf *m;
6954 	struct mbuf *mb;
6955 	uint32_t if_hw_tsomaxsegcount = 0;
6956 	uint32_t if_hw_tsomaxsegsize;
6957 	long tot_len_this_send = 0;
6958 	struct ip *ip = NULL;
6959 #ifdef TCPDEBUG
6960 	struct ipovly *ipov = NULL;
6961 #endif
6962 	struct udphdr *udp = NULL;
6963 	struct tcp_rack *rack;
6964 	struct tcphdr *th;
6965 	uint8_t pass = 0;
6966 	uint8_t wanted_cookie = 0;
6967 	u_char opt[TCP_MAXOLEN];
6968 	unsigned ipoptlen, optlen, hdrlen, ulen=0;
6969 	uint32_t rack_seq;
6970 
6971 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
6972 	unsigned ipsec_optlen = 0;
6973 
6974 #endif
6975 	int32_t idle, sendalot;
6976 	int32_t sub_from_prr = 0;
6977 	volatile int32_t sack_rxmit;
6978 	struct rack_sendmap *rsm = NULL;
6979 	int32_t tso, mtu, would_have_fin = 0;
6980 	struct tcpopt to;
6981 	int32_t slot = 0;
6982 	uint32_t cts;
6983 	uint8_t hpts_calling, doing_tlp = 0;
6984 	int32_t do_a_prefetch;
6985 	int32_t prefetch_rsm = 0;
6986 	int32_t prefetch_so_done = 0;
6987 	struct tcp_log_buffer *lgb = NULL;
6988 	struct inpcb *inp;
6989 	struct sockbuf *sb;
6990 #ifdef INET6
6991 	struct ip6_hdr *ip6 = NULL;
6992 	int32_t isipv6;
6993 #endif
6994 	/* setup and take the cache hits here */
6995 	rack = (struct tcp_rack *)tp->t_fb_ptr;
6996 	inp = rack->rc_inp;
6997 	so = inp->inp_socket;
6998 	sb = &so->so_snd;
6999 	kern_prefetch(sb, &do_a_prefetch);
7000 	do_a_prefetch = 1;
7001 
7002 	INP_WLOCK_ASSERT(inp);
7003 #ifdef TCP_OFFLOAD
7004 	if (tp->t_flags & TF_TOE)
7005 		return (tcp_offload_output(tp));
7006 #endif
7007 #ifdef INET6
7008 	if (rack->r_state) {
7009 		/* Use the cache line loaded if possible */
7010 		isipv6 = rack->r_is_v6;
7011 	} else {
7012 		isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
7013 	}
7014 #endif
7015 	cts = tcp_ts_getticks();
7016 	if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) &&
7017 	    inp->inp_in_hpts) {
7018 		/*
7019 		 * We are on the hpts for some timer but not hptsi output.
7020 		 * Remove from the hpts unconditionally.
7021 		 */
7022 		rack_timer_cancel(tp, rack, cts, __LINE__);
7023 	}
7024 	/* Mark that we have called rack_output(). */
7025 	if ((rack->r_timer_override) ||
7026 	    (tp->t_flags & TF_FORCEDATA) ||
7027 	    (tp->t_state < TCPS_ESTABLISHED)) {
7028 		if (tp->t_inpcb->inp_in_hpts)
7029 			tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT);
7030 	} else if (tp->t_inpcb->inp_in_hpts) {
7031 		/*
7032 		 * On the hpts you can't pass even if ACKNOW is on, we will
7033 		 * when the hpts fires.
7034 		 */
7035 		counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1);
7036 		return (0);
7037 	}
7038 	hpts_calling = inp->inp_hpts_calls;
7039 	inp->inp_hpts_calls = 0;
7040 	if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
7041 		if (rack_process_timers(tp, rack, cts, hpts_calling)) {
7042 			counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1);
7043 			return (0);
7044 		}
7045 	}
7046 	rack->r_wanted_output = 0;
7047 	rack->r_timer_override = 0;
7048 	/*
7049 	 * For TFO connections in SYN_SENT or SYN_RECEIVED,
7050 	 * only allow the initial SYN or SYN|ACK and those sent
7051 	 * by the retransmit timer.
7052 	 */
7053 	if (IS_FASTOPEN(tp->t_flags) &&
7054 	    ((tp->t_state == TCPS_SYN_RECEIVED) ||
7055 	     (tp->t_state == TCPS_SYN_SENT)) &&
7056 	    SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */
7057 	    (tp->t_rxtshift == 0))              /* not a retransmit */
7058 		return (0);
7059 	/*
7060 	 * Determine length of data that should be transmitted, and flags
7061 	 * that will be used. If there is some data or critical controls
7062 	 * (SYN, RST) to send, then transmit; otherwise, investigate
7063 	 * further.
7064 	 */
7065 	idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
7066 #ifdef NETFLIX_CWV
7067 	if (tp->cwv_enabled) {
7068 		if ((tp->cwv_cwnd_valid == 0) &&
7069 		    TCPS_HAVEESTABLISHED(tp->t_state) &&
7070 		    (tp->snd_cwnd > tp->snd_cwv.init_cwnd))
7071 			tcp_newcwv_nvp_closedown(tp);
7072 	} else
7073 #endif
7074 	if (tp->t_idle_reduce) {
7075 		if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur))
7076 			rack_cc_after_idle(tp,
7077 		            (rack->r_idle_reduce_largest ? 1 :0));
7078 	}
7079 	tp->t_flags &= ~TF_LASTIDLE;
7080 	if (idle) {
7081 		if (tp->t_flags & TF_MORETOCOME) {
7082 			tp->t_flags |= TF_LASTIDLE;
7083 			idle = 0;
7084 		}
7085 	}
7086 again:
7087 	/*
7088 	 * If we've recently taken a timeout, snd_max will be greater than
7089 	 * snd_nxt.  There may be SACK information that allows us to avoid
7090 	 * resending already delivered data.  Adjust snd_nxt accordingly.
7091 	 */
7092 	sendalot = 0;
7093 	cts = tcp_ts_getticks();
7094 	tso = 0;
7095 	mtu = 0;
7096 	sb_offset = tp->snd_max - tp->snd_una;
7097 	sendwin = min(tp->snd_wnd, tp->snd_cwnd);
7098 
7099 	flags = tcp_outflags[tp->t_state];
7100 	/*
7101 	 * Send any SACK-generated retransmissions.  If we're explicitly
7102 	 * trying to send out new data (when sendalot is 1), bypass this
7103 	 * function. If we retransmit in fast recovery mode, decrement
7104 	 * snd_cwnd, since we're replacing a (future) new transmission with
7105 	 * a retransmission now, and we previously incremented snd_cwnd in
7106 	 * tcp_input().
7107 	 */
7108 	/*
7109 	 * Still in sack recovery , reset rxmit flag to zero.
7110 	 */
7111 	while (rack->rc_free_cnt < rack_free_cache) {
7112 		rsm = rack_alloc(rack);
7113 		if (rsm == NULL) {
7114 			if (inp->inp_hpts_calls)
7115 				/* Retry in a ms */
7116 				slot = 1;
7117 			goto just_return_nolock;
7118 		}
7119 		TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next);
7120 		rack->rc_free_cnt++;
7121 		rsm = NULL;
7122 	}
7123 	if (inp->inp_hpts_calls)
7124 		inp->inp_hpts_calls = 0;
7125 	sack_rxmit = 0;
7126 	len = 0;
7127 	rsm = NULL;
7128 	if (flags & TH_RST) {
7129 		SOCKBUF_LOCK(sb);
7130 		goto send;
7131 	}
7132 	if (rack->r_ctl.rc_tlpsend) {
7133 		/* Tail loss probe */
7134 		long cwin;
7135 		long tlen;
7136 
7137 		doing_tlp = 1;
7138 		rsm = rack->r_ctl.rc_tlpsend;
7139 		rack->r_ctl.rc_tlpsend = NULL;
7140 		sack_rxmit = 1;
7141 		tlen = rsm->r_end - rsm->r_start;
7142 		if (tlen > tp->t_maxseg)
7143 			tlen = tp->t_maxseg;
7144 		KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
7145 		    ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
7146 		    __func__, __LINE__,
7147 		    rsm->r_start, tp->snd_una, tp, rack, rsm));
7148 		sb_offset = rsm->r_start - tp->snd_una;
7149 		cwin = min(tp->snd_wnd, tlen);
7150 		len = cwin;
7151 	} else if (rack->r_ctl.rc_resend) {
7152 		/* Retransmit timer */
7153 		rsm = rack->r_ctl.rc_resend;
7154 		rack->r_ctl.rc_resend = NULL;
7155 		len = rsm->r_end - rsm->r_start;
7156 		sack_rxmit = 1;
7157 		sendalot = 0;
7158 		KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
7159 		    ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
7160 		    __func__, __LINE__,
7161 		    rsm->r_start, tp->snd_una, tp, rack, rsm));
7162 		sb_offset = rsm->r_start - tp->snd_una;
7163 		if (len >= tp->t_maxseg) {
7164 			len = tp->t_maxseg;
7165 		}
7166 	} else if ((rack->rc_in_persist == 0) &&
7167 	    ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) {
7168 		long tlen;
7169 
7170 		if ((!IN_RECOVERY(tp->t_flags)) &&
7171 		    ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) {
7172 			/* Enter recovery if not induced by a time-out */
7173 			rack->r_ctl.rc_rsm_start = rsm->r_start;
7174 			rack->r_ctl.rc_cwnd_at = tp->snd_cwnd;
7175 			rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh;
7176 			rack_cong_signal(tp, NULL, CC_NDUPACK);
7177 			/*
7178 			 * When we enter recovery we need to assure we send
7179 			 * one packet.
7180 			 */
7181 			rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg;
7182 		}
7183 #ifdef INVARIANTS
7184 		if (SEQ_LT(rsm->r_start, tp->snd_una)) {
7185 			panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n",
7186 			    tp, rack, rsm, rsm->r_start, tp->snd_una);
7187 		}
7188 #endif
7189 		tlen = rsm->r_end - rsm->r_start;
7190 		KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
7191 		    ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
7192 		    __func__, __LINE__,
7193 		    rsm->r_start, tp->snd_una, tp, rack, rsm));
7194 		sb_offset = rsm->r_start - tp->snd_una;
7195 		if (tlen > rack->r_ctl.rc_prr_sndcnt) {
7196 			len = rack->r_ctl.rc_prr_sndcnt;
7197 		} else {
7198 			len = tlen;
7199 		}
7200 		if (len >= tp->t_maxseg) {
7201 			sendalot = 1;
7202 			len = tp->t_maxseg;
7203 		} else {
7204 			sendalot = 0;
7205 			if ((rack->rc_timer_up == 0) &&
7206 			    (len < tlen)) {
7207 				/*
7208 				 * If its not a timer don't send a partial
7209 				 * segment.
7210 				 */
7211 				len = 0;
7212 				goto just_return_nolock;
7213 			}
7214 		}
7215 		if (len > 0) {
7216 			sub_from_prr = 1;
7217 			sack_rxmit = 1;
7218 			TCPSTAT_INC(tcps_sack_rexmits);
7219 			TCPSTAT_ADD(tcps_sack_rexmit_bytes,
7220 			    min(len, tp->t_maxseg));
7221 			counter_u64_add(rack_rtm_prr_retran, 1);
7222 		}
7223 	}
7224 	if (rsm && (rsm->r_flags & RACK_HAS_FIN)) {
7225 		/* we are retransmitting the fin */
7226 		len--;
7227 		if (len) {
7228 			/*
7229 			 * When retransmitting data do *not* include the
7230 			 * FIN. This could happen from a TLP probe.
7231 			 */
7232 			flags &= ~TH_FIN;
7233 		}
7234 	}
7235 #ifdef INVARIANTS
7236 	/* For debugging */
7237 	rack->r_ctl.rc_rsm_at_retran = rsm;
7238 #endif
7239 	/*
7240 	 * Get standard flags, and add SYN or FIN if requested by 'hidden'
7241 	 * state flags.
7242 	 */
7243 	if (tp->t_flags & TF_NEEDFIN)
7244 		flags |= TH_FIN;
7245 	if (tp->t_flags & TF_NEEDSYN)
7246 		flags |= TH_SYN;
7247 	if ((sack_rxmit == 0) && (prefetch_rsm == 0)) {
7248 		void *end_rsm;
7249 		end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
7250 		if (end_rsm)
7251 			kern_prefetch(end_rsm, &prefetch_rsm);
7252 		prefetch_rsm = 1;
7253 	}
7254 	SOCKBUF_LOCK(sb);
7255 	/*
7256 	 * If in persist timeout with window of 0, send 1 byte. Otherwise,
7257 	 * if window is small but nonzero and time TF_SENTFIN expired, we
7258 	 * will send what we can and go to transmit state.
7259 	 */
7260 	if (tp->t_flags & TF_FORCEDATA) {
7261 		if (sendwin == 0) {
7262 			/*
7263 			 * If we still have some data to send, then clear
7264 			 * the FIN bit.  Usually this would happen below
7265 			 * when it realizes that we aren't sending all the
7266 			 * data.  However, if we have exactly 1 byte of
7267 			 * unsent data, then it won't clear the FIN bit
7268 			 * below, and if we are in persist state, we wind up
7269 			 * sending the packet without recording that we sent
7270 			 * the FIN bit.
7271 			 *
7272 			 * We can't just blindly clear the FIN bit, because
7273 			 * if we don't have any more data to send then the
7274 			 * probe will be the FIN itself.
7275 			 */
7276 			if (sb_offset < sbused(sb))
7277 				flags &= ~TH_FIN;
7278 			sendwin = 1;
7279 		} else {
7280 			if (rack->rc_in_persist)
7281 				rack_exit_persist(tp, rack);
7282 			/*
7283 			 * If we are dropping persist mode then we need to
7284 			 * correct snd_nxt/snd_max and off.
7285 			 */
7286 			tp->snd_nxt = tp->snd_max;
7287 			sb_offset = tp->snd_nxt - tp->snd_una;
7288 		}
7289 	}
7290 	/*
7291 	 * If snd_nxt == snd_max and we have transmitted a FIN, the
7292 	 * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a
7293 	 * negative length.  This can also occur when TCP opens up its
7294 	 * congestion window while receiving additional duplicate acks after
7295 	 * fast-retransmit because TCP will reset snd_nxt to snd_max after
7296 	 * the fast-retransmit.
7297 	 *
7298 	 * In the normal retransmit-FIN-only case, however, snd_nxt will be
7299 	 * set to snd_una, the sb_offset will be 0, and the length may wind
7300 	 * up 0.
7301 	 *
7302 	 * If sack_rxmit is true we are retransmitting from the scoreboard
7303 	 * in which case len is already set.
7304 	 */
7305 	if (sack_rxmit == 0) {
7306 		uint32_t avail;
7307 
7308 		avail = sbavail(sb);
7309 		if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail)
7310 			sb_offset = tp->snd_nxt - tp->snd_una;
7311 		else
7312 			sb_offset = 0;
7313 		if (IN_RECOVERY(tp->t_flags) == 0) {
7314 			if (rack->r_ctl.rc_tlp_new_data) {
7315 				/* TLP is forcing out new data */
7316 				if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) {
7317 					rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset);
7318 				}
7319 				if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd)
7320 					len = tp->snd_wnd;
7321 				else
7322 					len = rack->r_ctl.rc_tlp_new_data;
7323 				rack->r_ctl.rc_tlp_new_data = 0;
7324 				doing_tlp = 1;
7325 			} else {
7326 				if (sendwin > avail) {
7327 					/* use the available */
7328 					if (avail > sb_offset) {
7329 						len = (int32_t)(avail - sb_offset);
7330 					} else {
7331 						len = 0;
7332 					}
7333 				} else {
7334 					if (sendwin > sb_offset) {
7335 						len = (int32_t)(sendwin - sb_offset);
7336 					} else {
7337 						len = 0;
7338 					}
7339 				}
7340 			}
7341 		} else {
7342 			uint32_t outstanding;
7343 
7344 			/*
7345 			 * We are inside of a SACK recovery episode and are
7346 			 * sending new data, having retransmitted all the
7347 			 * data possible so far in the scoreboard.
7348 			 */
7349 			outstanding = tp->snd_max - tp->snd_una;
7350 			if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd)
7351 				len = 0;
7352 			else if (avail > sb_offset)
7353 				len = avail - sb_offset;
7354 			else
7355 				len = 0;
7356 			if (len > 0) {
7357 				if (len > rack->r_ctl.rc_prr_sndcnt)
7358 					len = rack->r_ctl.rc_prr_sndcnt;
7359 
7360 				if (len > 0) {
7361 					sub_from_prr = 1;
7362 					counter_u64_add(rack_rtm_prr_newdata, 1);
7363 				}
7364 			}
7365 			if (len > tp->t_maxseg) {
7366 				/*
7367 				 * We should never send more than a MSS when
7368 				 * retransmitting or sending new data in prr
7369 				 * mode unless the override flag is on. Most
7370 				 * likely the PRR algorithm is not going to
7371 				 * let us send a lot as well :-)
7372 				 */
7373 				if (rack->r_ctl.rc_prr_sendalot == 0)
7374 					len = tp->t_maxseg;
7375 			} else if (len < tp->t_maxseg) {
7376 				/*
7377 				 * Do we send any? The idea here is if the
7378 				 * send empty's the socket buffer we want to
7379 				 * do it. However if not then lets just wait
7380 				 * for our prr_sndcnt to get bigger.
7381 				 */
7382 				long leftinsb;
7383 
7384 				leftinsb = sbavail(sb) - sb_offset;
7385 				if (leftinsb > len) {
7386 					/* This send does not empty the sb */
7387 					len = 0;
7388 				}
7389 			}
7390 		}
7391 	}
7392 	if (prefetch_so_done == 0) {
7393 		kern_prefetch(so, &prefetch_so_done);
7394 		prefetch_so_done = 1;
7395 	}
7396 	/*
7397 	 * Lop off SYN bit if it has already been sent.  However, if this is
7398 	 * SYN-SENT state and if segment contains data and if we don't know
7399 	 * that foreign host supports TAO, suppress sending segment.
7400 	 */
7401 	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) &&
7402 	    ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) {
7403 		if (tp->t_state != TCPS_SYN_RECEIVED)
7404 			flags &= ~TH_SYN;
7405 		/*
7406 		 * When sending additional segments following a TFO SYN|ACK,
7407 		 * do not include the SYN bit.
7408 		 */
7409 		if (IS_FASTOPEN(tp->t_flags) &&
7410 		    (tp->t_state == TCPS_SYN_RECEIVED))
7411 			flags &= ~TH_SYN;
7412 		sb_offset--, len++;
7413 	}
7414 	/*
7415 	 * Be careful not to send data and/or FIN on SYN segments. This
7416 	 * measure is needed to prevent interoperability problems with not
7417 	 * fully conformant TCP implementations.
7418 	 */
7419 	if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
7420 		len = 0;
7421 		flags &= ~TH_FIN;
7422 	}
7423 	/*
7424 	 * On TFO sockets, ensure no data is sent in the following cases:
7425 	 *
7426 	 *  - When retransmitting SYN|ACK on a passively-created socket
7427 	 *
7428 	 *  - When retransmitting SYN on an actively created socket
7429 	 *
7430 	 *  - When sending a zero-length cookie (cookie request) on an
7431 	 *    actively created socket
7432 	 *
7433 	 *  - When the socket is in the CLOSED state (RST is being sent)
7434 	 */
7435 	if (IS_FASTOPEN(tp->t_flags) &&
7436 	    (((flags & TH_SYN) && (tp->t_rxtshift > 0)) ||
7437 	     ((tp->t_state == TCPS_SYN_SENT) &&
7438 	      (tp->t_tfo_client_cookie_len == 0)) ||
7439 	     (flags & TH_RST))) {
7440 		sack_rxmit = 0;
7441 		len = 0;
7442 	}
7443 	/* Without fast-open there should never be data sent on a SYN */
7444 	if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags)))
7445 		len = 0;
7446 	if (len <= 0) {
7447 		/*
7448 		 * If FIN has been sent but not acked, but we haven't been
7449 		 * called to retransmit, len will be < 0.  Otherwise, window
7450 		 * shrank after we sent into it.  If window shrank to 0,
7451 		 * cancel pending retransmit, pull snd_nxt back to (closed)
7452 		 * window, and set the persist timer if it isn't already
7453 		 * going.  If the window didn't close completely, just wait
7454 		 * for an ACK.
7455 		 *
7456 		 * We also do a general check here to ensure that we will
7457 		 * set the persist timer when we have data to send, but a
7458 		 * 0-byte window. This makes sure the persist timer is set
7459 		 * even if the packet hits one of the "goto send" lines
7460 		 * below.
7461 		 */
7462 		len = 0;
7463 		if ((tp->snd_wnd == 0) &&
7464 		    (TCPS_HAVEESTABLISHED(tp->t_state)) &&
7465 		    (sb_offset < (int)sbavail(sb))) {
7466 			tp->snd_nxt = tp->snd_una;
7467 			rack_enter_persist(tp, rack, cts);
7468 		}
7469 	}
7470 	/* len will be >= 0 after this point. */
7471 	KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
7472 	tcp_sndbuf_autoscale(tp, so, sendwin);
7473 	/*
7474 	 * Decide if we can use TCP Segmentation Offloading (if supported by
7475 	 * hardware).
7476 	 *
7477 	 * TSO may only be used if we are in a pure bulk sending state.  The
7478 	 * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP
7479 	 * options prevent using TSO.  With TSO the TCP header is the same
7480 	 * (except for the sequence number) for all generated packets.  This
7481 	 * makes it impossible to transmit any options which vary per
7482 	 * generated segment or packet.
7483 	 *
7484 	 * IPv4 handling has a clear separation of ip options and ip header
7485 	 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does
7486 	 * the right thing below to provide length of just ip options and thus
7487 	 * checking for ipoptlen is enough to decide if ip options are present.
7488 	 */
7489 
7490 #ifdef INET6
7491 	if (isipv6)
7492 		ipoptlen = ip6_optlen(tp->t_inpcb);
7493 	else
7494 #endif
7495 		if (tp->t_inpcb->inp_options)
7496 			ipoptlen = tp->t_inpcb->inp_options->m_len -
7497 			    offsetof(struct ipoption, ipopt_list);
7498 		else
7499 			ipoptlen = 0;
7500 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
7501 	/*
7502 	 * Pre-calculate here as we save another lookup into the darknesses
7503 	 * of IPsec that way and can actually decide if TSO is ok.
7504 	 */
7505 #ifdef INET6
7506 	if (isipv6 && IPSEC_ENABLED(ipv6))
7507 		ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb);
7508 #ifdef INET
7509 	else
7510 #endif
7511 #endif				/* INET6 */
7512 #ifdef INET
7513 	if (IPSEC_ENABLED(ipv4))
7514 		ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb);
7515 #endif				/* INET */
7516 #endif
7517 
7518 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
7519 	ipoptlen += ipsec_optlen;
7520 #endif
7521 	if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg &&
7522 	    (tp->t_port == 0) &&
7523 	    ((tp->t_flags & TF_SIGNATURE) == 0) &&
7524 	    tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
7525 	    ipoptlen == 0)
7526 		tso = 1;
7527 	{
7528 		uint32_t outstanding;
7529 
7530 		outstanding = tp->snd_max - tp->snd_una;
7531 		if (tp->t_flags & TF_SENTFIN) {
7532 			/*
7533 			 * If we sent a fin, snd_max is 1 higher than
7534 			 * snd_una
7535 			 */
7536 			outstanding--;
7537 		}
7538 		if (outstanding > 0) {
7539 			/*
7540 			 * This is sub-optimal. We only send a stand alone
7541 			 * FIN on its own segment.
7542 			 */
7543 			if (flags & TH_FIN) {
7544 				flags &= ~TH_FIN;
7545 				would_have_fin = 1;
7546 			}
7547 		} else if (sack_rxmit) {
7548 			if ((rsm->r_flags & RACK_HAS_FIN) == 0)
7549 				flags &= ~TH_FIN;
7550 		} else {
7551 			if (SEQ_LT(tp->snd_nxt + len, tp->snd_una +
7552 			    sbused(sb)))
7553 				flags &= ~TH_FIN;
7554 		}
7555 	}
7556 	recwin = sbspace(&so->so_rcv);
7557 
7558 	/*
7559 	 * Sender silly window avoidance.   We transmit under the following
7560 	 * conditions when len is non-zero:
7561 	 *
7562 	 * - We have a full segment (or more with TSO) - This is the last
7563 	 * buffer in a write()/send() and we are either idle or running
7564 	 * NODELAY - we've timed out (e.g. persist timer) - we have more
7565 	 * then 1/2 the maximum send window's worth of data (receiver may be
7566 	 * limited the window size) - we need to retransmit
7567 	 */
7568 	if (len) {
7569 		if (len >= tp->t_maxseg) {
7570 			pass = 1;
7571 			goto send;
7572 		}
7573 		/*
7574 		 * NOTE! on localhost connections an 'ack' from the remote
7575 		 * end may occur synchronously with the output and cause us
7576 		 * to flush a buffer queued with moretocome.  XXX
7577 		 *
7578 		 */
7579 		if (!(tp->t_flags & TF_MORETOCOME) &&	/* normal case */
7580 		    (idle || (tp->t_flags & TF_NODELAY)) &&
7581 		    ((uint32_t)len + (uint32_t)sb_offset >= sbavail(&so->so_snd)) &&
7582 		    (tp->t_flags & TF_NOPUSH) == 0) {
7583 			pass = 2;
7584 			goto send;
7585 		}
7586 		if (tp->t_flags & TF_FORCEDATA) {	/* typ. timeout case */
7587 			pass = 3;
7588 			goto send;
7589 		}
7590 		if ((tp->snd_una == tp->snd_max) && len) {	/* Nothing outstanding */
7591 			goto send;
7592 		}
7593 		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
7594 			pass = 4;
7595 			goto send;
7596 		}
7597 		if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {	/* retransmit case */
7598 			pass = 5;
7599 			goto send;
7600 		}
7601 		if (sack_rxmit) {
7602 			pass = 6;
7603 			goto send;
7604 		}
7605 	}
7606 	/*
7607 	 * Sending of standalone window updates.
7608 	 *
7609 	 * Window updates are important when we close our window due to a
7610 	 * full socket buffer and are opening it again after the application
7611 	 * reads data from it.  Once the window has opened again and the
7612 	 * remote end starts to send again the ACK clock takes over and
7613 	 * provides the most current window information.
7614 	 *
7615 	 * We must avoid the silly window syndrome whereas every read from
7616 	 * the receive buffer, no matter how small, causes a window update
7617 	 * to be sent.  We also should avoid sending a flurry of window
7618 	 * updates when the socket buffer had queued a lot of data and the
7619 	 * application is doing small reads.
7620 	 *
7621 	 * Prevent a flurry of pointless window updates by only sending an
7622 	 * update when we can increase the advertized window by more than
7623 	 * 1/4th of the socket buffer capacity.  When the buffer is getting
7624 	 * full or is very small be more aggressive and send an update
7625 	 * whenever we can increase by two mss sized segments. In all other
7626 	 * situations the ACK's to new incoming data will carry further
7627 	 * window increases.
7628 	 *
7629 	 * Don't send an independent window update if a delayed ACK is
7630 	 * pending (it will get piggy-backed on it) or the remote side
7631 	 * already has done a half-close and won't send more data.  Skip
7632 	 * this if the connection is in T/TCP half-open state.
7633 	 */
7634 	if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
7635 	    !(tp->t_flags & TF_DELACK) &&
7636 	    !TCPS_HAVERCVDFIN(tp->t_state)) {
7637 		/*
7638 		 * "adv" is the amount we could increase the window, taking
7639 		 * into account that we are limited by TCP_MAXWIN <<
7640 		 * tp->rcv_scale.
7641 		 */
7642 		int32_t adv;
7643 		int oldwin;
7644 
7645 		adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale);
7646 		if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
7647 			oldwin = (tp->rcv_adv - tp->rcv_nxt);
7648 			adv -= oldwin;
7649 		} else
7650 			oldwin = 0;
7651 
7652 		/*
7653 		 * If the new window size ends up being the same as the old
7654 		 * size when it is scaled, then don't force a window update.
7655 		 */
7656 		if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale)
7657 			goto dontupdate;
7658 
7659 		if (adv >= (int32_t)(2 * tp->t_maxseg) &&
7660 		    (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) ||
7661 		    recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) ||
7662 		    so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg)) {
7663 			pass = 7;
7664 			goto send;
7665 		}
7666 		if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat)
7667 			goto send;
7668 	}
7669 dontupdate:
7670 
7671 	/*
7672 	 * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
7673 	 * is also a catch-all for the retransmit timer timeout case.
7674 	 */
7675 	if (tp->t_flags & TF_ACKNOW) {
7676 		pass = 8;
7677 		goto send;
7678 	}
7679 	if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) {
7680 		pass = 9;
7681 		goto send;
7682 	}
7683 	if (SEQ_GT(tp->snd_up, tp->snd_una)) {
7684 		pass = 10;
7685 		goto send;
7686 	}
7687 	/*
7688 	 * If our state indicates that FIN should be sent and we have not
7689 	 * yet done so, then we need to send.
7690 	 */
7691 	if ((flags & TH_FIN) &&
7692 	    (tp->snd_nxt == tp->snd_una)) {
7693 		pass = 11;
7694 		goto send;
7695 	}
7696 	/*
7697 	 * No reason to send a segment, just return.
7698 	 */
7699 just_return:
7700 	SOCKBUF_UNLOCK(sb);
7701 just_return_nolock:
7702 	if (tot_len_this_send == 0)
7703 		counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1);
7704 	rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1);
7705 	rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling);
7706 	tp->t_flags &= ~TF_FORCEDATA;
7707 	return (0);
7708 
7709 send:
7710 	if (doing_tlp == 0) {
7711 		/*
7712 		 * Data not a TLP, and its not the rxt firing. If it is the
7713 		 * rxt firing, we want to leave the tlp_in_progress flag on
7714 		 * so we don't send another TLP. It has to be a rack timer
7715 		 * or normal send (response to acked data) to clear the tlp
7716 		 * in progress flag.
7717 		 */
7718 		rack->rc_tlp_in_progress = 0;
7719 	}
7720 	SOCKBUF_LOCK_ASSERT(sb);
7721 	if (len > 0) {
7722 		if (len >= tp->t_maxseg)
7723 			tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
7724 		else
7725 			tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
7726 	}
7727 	/*
7728 	 * Before ESTABLISHED, force sending of initial options unless TCP
7729 	 * set not to do any options. NOTE: we assume that the IP/TCP header
7730 	 * plus TCP options always fit in a single mbuf, leaving room for a
7731 	 * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr)
7732 	 * + optlen <= MCLBYTES
7733 	 */
7734 	optlen = 0;
7735 #ifdef INET6
7736 	if (isipv6)
7737 		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
7738 	else
7739 #endif
7740 		hdrlen = sizeof(struct tcpiphdr);
7741 
7742 	/*
7743 	 * Compute options for segment. We only have to care about SYN and
7744 	 * established connection segments.  Options for SYN-ACK segments
7745 	 * are handled in TCP syncache.
7746 	 */
7747 	to.to_flags = 0;
7748 	if ((tp->t_flags & TF_NOOPT) == 0) {
7749 		/* Maximum segment size. */
7750 		if (flags & TH_SYN) {
7751 			tp->snd_nxt = tp->iss;
7752 			to.to_mss = tcp_mssopt(&inp->inp_inc);
7753 #ifdef NETFLIX_TCPOUDP
7754 			if (tp->t_port)
7755 				to.to_mss -= V_tcp_udp_tunneling_overhead;
7756 #endif
7757 			to.to_flags |= TOF_MSS;
7758 
7759 			/*
7760 			 * On SYN or SYN|ACK transmits on TFO connections,
7761 			 * only include the TFO option if it is not a
7762 			 * retransmit, as the presence of the TFO option may
7763 			 * have caused the original SYN or SYN|ACK to have
7764 			 * been dropped by a middlebox.
7765 			 */
7766 			if (IS_FASTOPEN(tp->t_flags) &&
7767 			    (tp->t_rxtshift == 0)) {
7768 				if (tp->t_state == TCPS_SYN_RECEIVED) {
7769 					to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
7770 					to.to_tfo_cookie =
7771 					    (u_int8_t *)&tp->t_tfo_cookie.server;
7772 					to.to_flags |= TOF_FASTOPEN;
7773 					wanted_cookie = 1;
7774 				} else if (tp->t_state == TCPS_SYN_SENT) {
7775 					to.to_tfo_len =
7776 					    tp->t_tfo_client_cookie_len;
7777 					to.to_tfo_cookie =
7778 					    tp->t_tfo_cookie.client;
7779 					to.to_flags |= TOF_FASTOPEN;
7780 					wanted_cookie = 1;
7781 					/*
7782 					 * If we wind up having more data to
7783 					 * send with the SYN than can fit in
7784 					 * one segment, don't send any more
7785 					 * until the SYN|ACK comes back from
7786 					 * the other end.
7787 					 */
7788 					sendalot = 0;
7789 				}
7790 			}
7791 		}
7792 		/* Window scaling. */
7793 		if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
7794 			to.to_wscale = tp->request_r_scale;
7795 			to.to_flags |= TOF_SCALE;
7796 		}
7797 		/* Timestamps. */
7798 		if ((tp->t_flags & TF_RCVD_TSTMP) ||
7799 		    ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
7800 			to.to_tsval = cts + tp->ts_offset;
7801 			to.to_tsecr = tp->ts_recent;
7802 			to.to_flags |= TOF_TS;
7803 		}
7804 		/* Set receive buffer autosizing timestamp. */
7805 		if (tp->rfbuf_ts == 0 &&
7806 		    (so->so_rcv.sb_flags & SB_AUTOSIZE))
7807 			tp->rfbuf_ts = tcp_ts_getticks();
7808 		/* Selective ACK's. */
7809 		if (flags & TH_SYN)
7810 			to.to_flags |= TOF_SACKPERM;
7811 		else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
7812 		    tp->rcv_numsacks > 0) {
7813 			to.to_flags |= TOF_SACK;
7814 			to.to_nsacks = tp->rcv_numsacks;
7815 			to.to_sacks = (u_char *)tp->sackblks;
7816 		}
7817 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
7818 		/* TCP-MD5 (RFC2385). */
7819 		if (tp->t_flags & TF_SIGNATURE)
7820 			to.to_flags |= TOF_SIGNATURE;
7821 #endif				/* TCP_SIGNATURE */
7822 
7823 		/* Processing the options. */
7824 		hdrlen += optlen = tcp_addoptions(&to, opt);
7825 		/*
7826 		 * If we wanted a TFO option to be added, but it was unable
7827 		 * to fit, ensure no data is sent.
7828 		 */
7829 		if (IS_FASTOPEN(tp->t_flags) && wanted_cookie &&
7830 		    !(to.to_flags & TOF_FASTOPEN))
7831 			len = 0;
7832 	}
7833 #ifdef NETFLIX_TCPOUDP
7834 	if (tp->t_port) {
7835 		if (V_tcp_udp_tunneling_port == 0) {
7836 			/* The port was removed?? */
7837 			SOCKBUF_UNLOCK(&so->so_snd);
7838 			return (EHOSTUNREACH);
7839 		}
7840 		hdrlen += sizeof(struct udphdr);
7841 	}
7842 #endif
7843 	ipoptlen = 0;
7844 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
7845 	ipoptlen += ipsec_optlen;
7846 #endif
7847 
7848 	/*
7849 	 * Adjust data length if insertion of options will bump the packet
7850 	 * length beyond the t_maxseg length. Clear the FIN bit because we
7851 	 * cut off the tail of the segment.
7852 	 */
7853 	if (len + optlen + ipoptlen > tp->t_maxseg) {
7854 		if (flags & TH_FIN) {
7855 			would_have_fin = 1;
7856 			flags &= ~TH_FIN;
7857 		}
7858 		if (tso) {
7859 			uint32_t if_hw_tsomax;
7860 			uint32_t moff;
7861 			int32_t max_len;
7862 
7863 			/* extract TSO information */
7864 			if_hw_tsomax = tp->t_tsomax;
7865 			if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
7866 			if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
7867 			KASSERT(ipoptlen == 0,
7868 			    ("%s: TSO can't do IP options", __func__));
7869 
7870 			/*
7871 			 * Check if we should limit by maximum payload
7872 			 * length:
7873 			 */
7874 			if (if_hw_tsomax != 0) {
7875 				/* compute maximum TSO length */
7876 				max_len = (if_hw_tsomax - hdrlen -
7877 				    max_linkhdr);
7878 				if (max_len <= 0) {
7879 					len = 0;
7880 				} else if (len > max_len) {
7881 					sendalot = 1;
7882 					len = max_len;
7883 				}
7884 			}
7885 			/*
7886 			 * Prevent the last segment from being fractional
7887 			 * unless the send sockbuf can be emptied:
7888 			 */
7889 			max_len = (tp->t_maxseg - optlen);
7890 			if ((sb_offset + len) < sbavail(sb)) {
7891 				moff = len % (u_int)max_len;
7892 				if (moff != 0) {
7893 					len -= moff;
7894 					sendalot = 1;
7895 				}
7896 			}
7897 			/*
7898 			 * In case there are too many small fragments don't
7899 			 * use TSO:
7900 			 */
7901 			if (len <= max_len) {
7902 				len = max_len;
7903 				sendalot = 1;
7904 				tso = 0;
7905 			}
7906 			/*
7907 			 * Send the FIN in a separate segment after the bulk
7908 			 * sending is done. We don't trust the TSO
7909 			 * implementations to clear the FIN flag on all but
7910 			 * the last segment.
7911 			 */
7912 			if (tp->t_flags & TF_NEEDFIN)
7913 				sendalot = 1;
7914 
7915 		} else {
7916 			len = tp->t_maxseg - optlen - ipoptlen;
7917 			sendalot = 1;
7918 		}
7919 	} else
7920 		tso = 0;
7921 	KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
7922 	    ("%s: len > IP_MAXPACKET", __func__));
7923 #ifdef DIAGNOSTIC
7924 #ifdef INET6
7925 	if (max_linkhdr + hdrlen > MCLBYTES)
7926 #else
7927 	if (max_linkhdr + hdrlen > MHLEN)
7928 #endif
7929 		panic("tcphdr too big");
7930 #endif
7931 
7932 	/*
7933 	 * This KASSERT is here to catch edge cases at a well defined place.
7934 	 * Before, those had triggered (random) panic conditions further
7935 	 * down.
7936 	 */
7937 	KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
7938 	if ((len == 0) &&
7939 	    (flags & TH_FIN) &&
7940 	    (sbused(sb))) {
7941 		/*
7942 		 * We have outstanding data, don't send a fin by itself!.
7943 		 */
7944 		goto just_return;
7945 	}
7946 	/*
7947 	 * Grab a header mbuf, attaching a copy of data to be transmitted,
7948 	 * and initialize the header from the template for sends on this
7949 	 * connection.
7950 	 */
7951 	if (len) {
7952 		uint32_t max_val;
7953 		uint32_t moff;
7954 
7955 		if (rack->rc_pace_max_segs)
7956 			max_val = rack->rc_pace_max_segs * tp->t_maxseg;
7957 		else
7958 			max_val = len;
7959 		/*
7960 		 * We allow a limit on sending with hptsi.
7961 		 */
7962 		if (len > max_val) {
7963 			len = max_val;
7964 		}
7965 #ifdef INET6
7966 		if (MHLEN < hdrlen + max_linkhdr)
7967 			m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
7968 		else
7969 #endif
7970 			m = m_gethdr(M_NOWAIT, MT_DATA);
7971 
7972 		if (m == NULL) {
7973 			SOCKBUF_UNLOCK(sb);
7974 			error = ENOBUFS;
7975 			sack_rxmit = 0;
7976 			goto out;
7977 		}
7978 		m->m_data += max_linkhdr;
7979 		m->m_len = hdrlen;
7980 
7981 		/*
7982 		 * Start the m_copy functions from the closest mbuf to the
7983 		 * sb_offset in the socket buffer chain.
7984 		 */
7985 		mb = sbsndptr_noadv(sb, sb_offset, &moff);
7986 		if (len <= MHLEN - hdrlen - max_linkhdr) {
7987 			m_copydata(mb, moff, (int)len,
7988 			    mtod(m, caddr_t)+hdrlen);
7989 			if (SEQ_LT(tp->snd_nxt, tp->snd_max))
7990 				sbsndptr_adv(sb, mb, len);
7991 			m->m_len += len;
7992 		} else {
7993 			struct sockbuf *msb;
7994 
7995 			if (SEQ_LT(tp->snd_nxt, tp->snd_max))
7996 				msb = NULL;
7997 			else
7998 				msb = sb;
7999 			m->m_next = tcp_m_copym(mb, moff, &len,
8000 			    if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb);
8001 			if (len <= (tp->t_maxseg - optlen)) {
8002 				/*
8003 				 * Must have ran out of mbufs for the copy
8004 				 * shorten it to no longer need tso. Lets
8005 				 * not put on sendalot since we are low on
8006 				 * mbufs.
8007 				 */
8008 				tso = 0;
8009 			}
8010 			if (m->m_next == NULL) {
8011 				SOCKBUF_UNLOCK(sb);
8012 				(void)m_free(m);
8013 				error = ENOBUFS;
8014 				sack_rxmit = 0;
8015 				goto out;
8016 			}
8017 		}
8018 		if ((tp->t_flags & TF_FORCEDATA) && len == 1) {
8019 			TCPSTAT_INC(tcps_sndprobe);
8020 #ifdef NETFLIX_STATS
8021 			if (SEQ_LT(tp->snd_nxt, tp->snd_max))
8022 				stats_voi_update_abs_u32(tp->t_stats,
8023 				    VOI_TCP_RETXPB, len);
8024 			else
8025 				stats_voi_update_abs_u64(tp->t_stats,
8026 				    VOI_TCP_TXPB, len);
8027 #endif
8028 		} else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
8029 			if (rsm && (rsm->r_flags & RACK_TLP)) {
8030 				/*
8031 				 * TLP should not count in retran count, but
8032 				 * in its own bin
8033 				 */
8034 				counter_u64_add(rack_tlp_retran, 1);
8035 				counter_u64_add(rack_tlp_retran_bytes, len);
8036 			} else {
8037 				tp->t_sndrexmitpack++;
8038 				TCPSTAT_INC(tcps_sndrexmitpack);
8039 				TCPSTAT_ADD(tcps_sndrexmitbyte, len);
8040 			}
8041 #ifdef NETFLIX_STATS
8042 			stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
8043 			    len);
8044 #endif
8045 		} else {
8046 			TCPSTAT_INC(tcps_sndpack);
8047 			TCPSTAT_ADD(tcps_sndbyte, len);
8048 #ifdef NETFLIX_STATS
8049 			stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
8050 			    len);
8051 #endif
8052 		}
8053 		/*
8054 		 * If we're sending everything we've got, set PUSH. (This
8055 		 * will keep happy those implementations which only give
8056 		 * data to the user when a buffer fills or a PUSH comes in.)
8057 		 */
8058 		if (sb_offset + len == sbused(sb) &&
8059 		    sbused(sb) &&
8060 		    !(flags & TH_SYN))
8061 			flags |= TH_PUSH;
8062 
8063 		/*
8064 		 * Are we doing hptsi, if so we must calculate the slot. We
8065 		 * only do hptsi in ESTABLISHED and with no RESET being
8066 		 * sent where we have data to send.
8067 		 */
8068 		if (((tp->t_state == TCPS_ESTABLISHED) ||
8069 		    (tp->t_state == TCPS_CLOSE_WAIT) ||
8070 		    ((tp->t_state == TCPS_FIN_WAIT_1) &&
8071 		    ((tp->t_flags & TF_SENTFIN) == 0) &&
8072 		    ((flags & TH_FIN) == 0))) &&
8073 		    ((flags & TH_RST) == 0) &&
8074 		    (rack->rc_always_pace)) {
8075 			/*
8076 			 * We use the most optimistic possible cwnd/srtt for
8077 			 * sending calculations. This will make our
8078 			 * calculation anticipate getting more through
8079 			 * quicker then possible. But thats ok we don't want
8080 			 * the peer to have a gap in data sending.
8081 			 */
8082 			uint32_t srtt, cwnd, tr_perms = 0;
8083 
8084 			if (rack->r_ctl.rc_rack_min_rtt)
8085 				srtt = rack->r_ctl.rc_rack_min_rtt;
8086 			else
8087 				srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT));
8088 			if (rack->r_ctl.rc_rack_largest_cwnd)
8089 				cwnd = rack->r_ctl.rc_rack_largest_cwnd;
8090 			else
8091 				cwnd = tp->snd_cwnd;
8092 			tr_perms = cwnd / srtt;
8093 			if (tr_perms == 0) {
8094 				tr_perms = tp->t_maxseg;
8095 			}
8096 			tot_len_this_send += len;
8097 			/*
8098 			 * Calculate how long this will take to drain, if
8099 			 * the calculation comes out to zero, thats ok we
8100 			 * will use send_a_lot to possibly spin around for
8101 			 * more increasing tot_len_this_send to the point
8102 			 * that its going to require a pace, or we hit the
8103 			 * cwnd. Which in that case we are just waiting for
8104 			 * a ACK.
8105 			 */
8106 			slot = tot_len_this_send / tr_perms;
8107 			/* Now do we reduce the time so we don't run dry? */
8108 			if (slot && rack->rc_pace_reduce) {
8109 				int32_t reduce;
8110 
8111 				reduce = (slot / rack->rc_pace_reduce);
8112 				if (reduce < slot) {
8113 					slot -= reduce;
8114 				} else
8115 					slot = 0;
8116 			}
8117 			if (rack->r_enforce_min_pace &&
8118 			    (slot == 0) &&
8119 			    (tot_len_this_send >= (rack->r_min_pace_seg_thresh * tp->t_maxseg))) {
8120 				/* We are enforcing a minimum pace time of 1ms */
8121 				slot = rack->r_enforce_min_pace;
8122 			}
8123 		}
8124 		SOCKBUF_UNLOCK(sb);
8125 	} else {
8126 		SOCKBUF_UNLOCK(sb);
8127 		if (tp->t_flags & TF_ACKNOW)
8128 			TCPSTAT_INC(tcps_sndacks);
8129 		else if (flags & (TH_SYN | TH_FIN | TH_RST))
8130 			TCPSTAT_INC(tcps_sndctrl);
8131 		else if (SEQ_GT(tp->snd_up, tp->snd_una))
8132 			TCPSTAT_INC(tcps_sndurg);
8133 		else
8134 			TCPSTAT_INC(tcps_sndwinup);
8135 
8136 		m = m_gethdr(M_NOWAIT, MT_DATA);
8137 		if (m == NULL) {
8138 			error = ENOBUFS;
8139 			sack_rxmit = 0;
8140 			goto out;
8141 		}
8142 #ifdef INET6
8143 		if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
8144 		    MHLEN >= hdrlen) {
8145 			M_ALIGN(m, hdrlen);
8146 		} else
8147 #endif
8148 			m->m_data += max_linkhdr;
8149 		m->m_len = hdrlen;
8150 	}
8151 	SOCKBUF_UNLOCK_ASSERT(sb);
8152 	m->m_pkthdr.rcvif = (struct ifnet *)0;
8153 #ifdef MAC
8154 	mac_inpcb_create_mbuf(inp, m);
8155 #endif
8156 #ifdef INET6
8157 	if (isipv6) {
8158 		ip6 = mtod(m, struct ip6_hdr *);
8159 #ifdef NETFLIX_TCPOUDP
8160 		if (tp->t_port) {
8161 			udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
8162 			udp->uh_sport = htons(V_tcp_udp_tunneling_port);
8163 			udp->uh_dport = tp->t_port;
8164 			ulen = hdrlen + len - sizeof(struct ip6_hdr);
8165 			udp->uh_ulen = htons(ulen);
8166 			th = (struct tcphdr *)(udp + 1);
8167 		} else
8168 #endif
8169 			th = (struct tcphdr *)(ip6 + 1);
8170 		tcpip_fillheaders(inp, ip6, th);
8171 	} else
8172 #endif				/* INET6 */
8173 	{
8174 		ip = mtod(m, struct ip *);
8175 #ifdef TCPDEBUG
8176 		ipov = (struct ipovly *)ip;
8177 #endif
8178 #ifdef NETFLIX_TCPOUDP
8179 		if (tp->t_port) {
8180 			udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
8181 			udp->uh_sport = htons(V_tcp_udp_tunneling_port);
8182 			udp->uh_dport = tp->t_port;
8183 			ulen = hdrlen + len - sizeof(struct ip);
8184 			udp->uh_ulen = htons(ulen);
8185 			th = (struct tcphdr *)(udp + 1);
8186 		} else
8187 #endif
8188 			th = (struct tcphdr *)(ip + 1);
8189 		tcpip_fillheaders(inp, ip, th);
8190 	}
8191 	/*
8192 	 * Fill in fields, remembering maximum advertised window for use in
8193 	 * delaying messages about window sizes. If resending a FIN, be sure
8194 	 * not to use a new sequence number.
8195 	 */
8196 	if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
8197 	    tp->snd_nxt == tp->snd_max)
8198 		tp->snd_nxt--;
8199 	/*
8200 	 * If we are starting a connection, send ECN setup SYN packet. If we
8201 	 * are on a retransmit, we may resend those bits a number of times
8202 	 * as per RFC 3168.
8203 	 */
8204 	if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) {
8205 		if (tp->t_rxtshift >= 1) {
8206 			if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
8207 				flags |= TH_ECE | TH_CWR;
8208 		} else
8209 			flags |= TH_ECE | TH_CWR;
8210 	}
8211 	if (tp->t_state == TCPS_ESTABLISHED &&
8212 	    (tp->t_flags & TF_ECN_PERMIT)) {
8213 		/*
8214 		 * If the peer has ECN, mark data packets with ECN capable
8215 		 * transmission (ECT). Ignore pure ack packets,
8216 		 * retransmissions and window probes.
8217 		 */
8218 		if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
8219 		    !((tp->t_flags & TF_FORCEDATA) && len == 1)) {
8220 #ifdef INET6
8221 			if (isipv6)
8222 				ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
8223 			else
8224 #endif
8225 				ip->ip_tos |= IPTOS_ECN_ECT0;
8226 			TCPSTAT_INC(tcps_ecn_ect0);
8227 		}
8228 		/*
8229 		 * Reply with proper ECN notifications.
8230 		 */
8231 		if (tp->t_flags & TF_ECN_SND_CWR) {
8232 			flags |= TH_CWR;
8233 			tp->t_flags &= ~TF_ECN_SND_CWR;
8234 		}
8235 		if (tp->t_flags & TF_ECN_SND_ECE)
8236 			flags |= TH_ECE;
8237 	}
8238 	/*
8239 	 * If we are doing retransmissions, then snd_nxt will not reflect
8240 	 * the first unsent octet.  For ACK only packets, we do not want the
8241 	 * sequence number of the retransmitted packet, we want the sequence
8242 	 * number of the next unsent octet.  So, if there is no data (and no
8243 	 * SYN or FIN), use snd_max instead of snd_nxt when filling in
8244 	 * ti_seq.  But if we are in persist state, snd_max might reflect
8245 	 * one byte beyond the right edge of the window, so use snd_nxt in
8246 	 * that case, since we know we aren't doing a retransmission.
8247 	 * (retransmit and persist are mutually exclusive...)
8248 	 */
8249 	if (sack_rxmit == 0) {
8250 		if (len || (flags & (TH_SYN | TH_FIN)) ||
8251 		    rack->rc_in_persist) {
8252 			th->th_seq = htonl(tp->snd_nxt);
8253 			rack_seq = tp->snd_nxt;
8254 		} else if (flags & TH_RST) {
8255 			/*
8256 			 * For a Reset send the last cum ack in sequence
8257 			 * (this like any other choice may still generate a
8258 			 * challenge ack, if a ack-update packet is in
8259 			 * flight).
8260 			 */
8261 			th->th_seq = htonl(tp->snd_una);
8262 			rack_seq = tp->snd_una;
8263 		} else {
8264 			th->th_seq = htonl(tp->snd_max);
8265 			rack_seq = tp->snd_max;
8266 		}
8267 	} else {
8268 		th->th_seq = htonl(rsm->r_start);
8269 		rack_seq = rsm->r_start;
8270 	}
8271 	th->th_ack = htonl(tp->rcv_nxt);
8272 	if (optlen) {
8273 		bcopy(opt, th + 1, optlen);
8274 		th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
8275 	}
8276 	th->th_flags = flags;
8277 	/*
8278 	 * Calculate receive window.  Don't shrink window, but avoid silly
8279 	 * window syndrome.
8280 	 * If a RST segment is sent, advertise a window of zero.
8281 	 */
8282 	if (flags & TH_RST) {
8283 		recwin = 0;
8284 	} else {
8285 		if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
8286 		    recwin < (long)tp->t_maxseg)
8287 			recwin = 0;
8288 		if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
8289 		    recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
8290 			recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
8291 		if (recwin > (long)TCP_MAXWIN << tp->rcv_scale)
8292 			recwin = (long)TCP_MAXWIN << tp->rcv_scale;
8293 	}
8294 
8295 	/*
8296 	 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or
8297 	 * <SYN,ACK>) segment itself is never scaled.  The <SYN,ACK> case is
8298 	 * handled in syncache.
8299 	 */
8300 	if (flags & TH_SYN)
8301 		th->th_win = htons((u_short)
8302 		    (min(sbspace(&so->so_rcv), TCP_MAXWIN)));
8303 	else
8304 		th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
8305 	/*
8306 	 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0
8307 	 * window.  This may cause the remote transmitter to stall.  This
8308 	 * flag tells soreceive() to disable delayed acknowledgements when
8309 	 * draining the buffer.  This can occur if the receiver is
8310 	 * attempting to read more data than can be buffered prior to
8311 	 * transmitting on the connection.
8312 	 */
8313 	if (th->th_win == 0) {
8314 		tp->t_sndzerowin++;
8315 		tp->t_flags |= TF_RXWIN0SENT;
8316 	} else
8317 		tp->t_flags &= ~TF_RXWIN0SENT;
8318 	if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
8319 		th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
8320 		th->th_flags |= TH_URG;
8321 	} else
8322 		/*
8323 		 * If no urgent pointer to send, then we pull the urgent
8324 		 * pointer to the left edge of the send window so that it
8325 		 * doesn't drift into the send window on sequence number
8326 		 * wraparound.
8327 		 */
8328 		tp->snd_up = tp->snd_una;	/* drag it along */
8329 
8330 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
8331 	if (to.to_flags & TOF_SIGNATURE) {
8332 		/*
8333 		 * Calculate MD5 signature and put it into the place
8334 		 * determined before.
8335 		 * NOTE: since TCP options buffer doesn't point into
8336 		 * mbuf's data, calculate offset and use it.
8337 		 */
8338 		if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th,
8339 		    (u_char *)(th + 1) + (to.to_signature - opt)) != 0) {
8340 			/*
8341 			 * Do not send segment if the calculation of MD5
8342 			 * digest has failed.
8343 			 */
8344 			goto out;
8345 		}
8346 	}
8347 #endif
8348 
8349 	/*
8350 	 * Put TCP length in extended header, and then checksum extended
8351 	 * header and data.
8352 	 */
8353 	m->m_pkthdr.len = hdrlen + len;	/* in6_cksum() need this */
8354 #ifdef INET6
8355 	if (isipv6) {
8356 		/*
8357 		 * ip6_plen is not need to be filled now, and will be filled
8358 		 * in ip6_output.
8359 		 */
8360 		if (tp->t_port) {
8361 			m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
8362 			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
8363 			udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
8364 			th->th_sum = htons(0);
8365 		} else {
8366 			m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
8367 			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
8368 			th->th_sum = in6_cksum_pseudo(ip6,
8369 			    sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
8370 			    0);
8371 		}
8372 	}
8373 #endif
8374 #if defined(INET6) && defined(INET)
8375 	else
8376 #endif
8377 #ifdef INET
8378 	{
8379 		if (tp->t_port) {
8380 			m->m_pkthdr.csum_flags = CSUM_UDP;
8381 			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
8382 			udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
8383 			   ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
8384 			th->th_sum = htons(0);
8385 		} else {
8386 			m->m_pkthdr.csum_flags = CSUM_TCP;
8387 			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
8388 			th->th_sum = in_pseudo(ip->ip_src.s_addr,
8389 			    ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
8390 			    IPPROTO_TCP + len + optlen));
8391 		}
8392 		/* IP version must be set here for ipv4/ipv6 checking later */
8393 		KASSERT(ip->ip_v == IPVERSION,
8394 		    ("%s: IP version incorrect: %d", __func__, ip->ip_v));
8395 	}
8396 #endif
8397 
8398 	/*
8399 	 * Enable TSO and specify the size of the segments. The TCP pseudo
8400 	 * header checksum is always provided. XXX: Fixme: This is currently
8401 	 * not the case for IPv6.
8402 	 */
8403 	if (tso) {
8404 		KASSERT(len > tp->t_maxseg - optlen,
8405 		    ("%s: len <= tso_segsz", __func__));
8406 		m->m_pkthdr.csum_flags |= CSUM_TSO;
8407 		m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
8408 	}
8409 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
8410 	KASSERT(len + hdrlen + ipoptlen - ipsec_optlen == m_length(m, NULL),
8411 	    ("%s: mbuf chain shorter than expected: %d + %u + %u - %u != %u",
8412 	    __func__, len, hdrlen, ipoptlen, ipsec_optlen, m_length(m, NULL)));
8413 #else
8414 	KASSERT(len + hdrlen + ipoptlen == m_length(m, NULL),
8415 	    ("%s: mbuf chain shorter than expected: %d + %u + %u != %u",
8416 	    __func__, len, hdrlen, ipoptlen, m_length(m, NULL)));
8417 #endif
8418 
8419 #ifdef TCP_HHOOK
8420 	/* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
8421 	hhook_run_tcp_est_out(tp, th, &to, len, tso);
8422 #endif
8423 
8424 #ifdef TCPDEBUG
8425 	/*
8426 	 * Trace.
8427 	 */
8428 	if (so->so_options & SO_DEBUG) {
8429 		u_short save = 0;
8430 
8431 #ifdef INET6
8432 		if (!isipv6)
8433 #endif
8434 		{
8435 			save = ipov->ih_len;
8436 			ipov->ih_len = htons(m->m_pkthdr.len	/* - hdrlen +
8437 			      * (th->th_off << 2) */ );
8438 		}
8439 		tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0);
8440 #ifdef INET6
8441 		if (!isipv6)
8442 #endif
8443 			ipov->ih_len = save;
8444 	}
8445 #endif				/* TCPDEBUG */
8446 
8447 	/* We're getting ready to send; log now. */
8448 	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
8449 		union tcp_log_stackspecific log;
8450 
8451 		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
8452 		log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
8453 		log.u_bbr.ininput = rack->rc_inp->inp_in_input;
8454 		log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
8455 		if (rsm || sack_rxmit) {
8456 			log.u_bbr.flex8 = 1;
8457 		} else {
8458 			log.u_bbr.flex8 = 0;
8459 		}
8460 		lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
8461 		    len, &log, false, NULL, NULL, 0, NULL);
8462 	} else
8463 		lgb = NULL;
8464 
8465 	/*
8466 	 * Fill in IP length and desired time to live and send to IP level.
8467 	 * There should be a better way to handle ttl and tos; we could keep
8468 	 * them in the template, but need a way to checksum without them.
8469 	 */
8470 	/*
8471 	 * m->m_pkthdr.len should have been set before cksum calcuration,
8472 	 * because in6_cksum() need it.
8473 	 */
8474 #ifdef INET6
8475 	if (isipv6) {
8476 		/*
8477 		 * we separately set hoplimit for every segment, since the
8478 		 * user might want to change the value via setsockopt. Also,
8479 		 * desired default hop limit might be changed via Neighbor
8480 		 * Discovery.
8481 		 */
8482 		ip6->ip6_hlim = in6_selecthlim(inp, NULL);
8483 
8484 		/*
8485 		 * Set the packet size here for the benefit of DTrace
8486 		 * probes. ip6_output() will set it properly; it's supposed
8487 		 * to include the option header lengths as well.
8488 		 */
8489 		ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
8490 
8491 		if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
8492 			tp->t_flags2 |= TF2_PLPMTU_PMTUD;
8493 		else
8494 			tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
8495 
8496 		if (tp->t_state == TCPS_SYN_SENT)
8497 			TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
8498 
8499 		TCP_PROBE5(send, NULL, tp, ip6, tp, th);
8500 		/* TODO: IPv6 IP6TOS_ECT bit on */
8501 		error = ip6_output(m, tp->t_inpcb->in6p_outputopts,
8502 		    &inp->inp_route6,
8503 		    ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0),
8504 		    NULL, NULL, inp);
8505 
8506 		if (error == EMSGSIZE && inp->inp_route6.ro_rt != NULL)
8507 			mtu = inp->inp_route6.ro_rt->rt_mtu;
8508 	}
8509 #endif				/* INET6 */
8510 #if defined(INET) && defined(INET6)
8511 	else
8512 #endif
8513 #ifdef INET
8514 	{
8515 		ip->ip_len = htons(m->m_pkthdr.len);
8516 #ifdef INET6
8517 		if (inp->inp_vflag & INP_IPV6PROTO)
8518 			ip->ip_ttl = in6_selecthlim(inp, NULL);
8519 #endif				/* INET6 */
8520 		/*
8521 		 * If we do path MTU discovery, then we set DF on every
8522 		 * packet. This might not be the best thing to do according
8523 		 * to RFC3390 Section 2. However the tcp hostcache migitates
8524 		 * the problem so it affects only the first tcp connection
8525 		 * with a host.
8526 		 *
8527 		 * NB: Don't set DF on small MTU/MSS to have a safe
8528 		 * fallback.
8529 		 */
8530 		if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
8531 			tp->t_flags2 |= TF2_PLPMTU_PMTUD;
8532 			if (tp->t_port == 0 || len < V_tcp_minmss) {
8533 				ip->ip_off |= htons(IP_DF);
8534 			}
8535 		} else {
8536 			tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
8537 		}
8538 
8539 		if (tp->t_state == TCPS_SYN_SENT)
8540 			TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
8541 
8542 		TCP_PROBE5(send, NULL, tp, ip, tp, th);
8543 
8544 		error = ip_output(m, tp->t_inpcb->inp_options, &inp->inp_route,
8545 		    ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
8546 		    inp);
8547 		if (error == EMSGSIZE && inp->inp_route.ro_rt != NULL)
8548 			mtu = inp->inp_route.ro_rt->rt_mtu;
8549 	}
8550 #endif				/* INET */
8551 
8552 out:
8553 	if (lgb) {
8554 		lgb->tlb_errno = error;
8555 		lgb = NULL;
8556 	}
8557 	/*
8558 	 * In transmit state, time the transmission and arrange for the
8559 	 * retransmit.  In persist state, just set snd_max.
8560 	 */
8561 	if (error == 0) {
8562 		if (len == 0)
8563 			counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1);
8564 		else if (len == 1) {
8565 			counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1);
8566 		} else if (len > 1) {
8567 			int idx;
8568 
8569 			idx = (len / tp->t_maxseg) + 3;
8570 			if (idx >= TCP_MSS_ACCT_ATIMER)
8571 				counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
8572 			else
8573 				counter_u64_add(rack_out_size[idx], 1);
8574 		}
8575 	}
8576 	if (sub_from_prr && (error == 0)) {
8577 		rack->r_ctl.rc_prr_sndcnt -= len;
8578 	}
8579 	sub_from_prr = 0;
8580 	rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts,
8581 	    pass, rsm);
8582 	if ((tp->t_flags & TF_FORCEDATA) == 0 ||
8583 	    (rack->rc_in_persist == 0)) {
8584 		tcp_seq startseq = tp->snd_nxt;
8585 
8586 		/*
8587 		 * Advance snd_nxt over sequence space of this segment.
8588 		 */
8589 		if (error)
8590 			/* We don't log or do anything with errors */
8591 			goto timer;
8592 
8593 		if (flags & (TH_SYN | TH_FIN)) {
8594 			if (flags & TH_SYN)
8595 				tp->snd_nxt++;
8596 			if (flags & TH_FIN) {
8597 				tp->snd_nxt++;
8598 				tp->t_flags |= TF_SENTFIN;
8599 			}
8600 		}
8601 		/* In the ENOBUFS case we do *not* update snd_max */
8602 		if (sack_rxmit)
8603 			goto timer;
8604 
8605 		tp->snd_nxt += len;
8606 		if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
8607 			if (tp->snd_una == tp->snd_max) {
8608 				/*
8609 				 * Update the time we just added data since
8610 				 * none was outstanding.
8611 				 */
8612 				rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
8613 				tp->t_acktime = ticks;
8614 			}
8615 			tp->snd_max = tp->snd_nxt;
8616 			/*
8617 			 * Time this transmission if not a retransmission and
8618 			 * not currently timing anything.
8619 			 * This is only relevant in case of switching back to
8620 			 * the base stack.
8621 			 */
8622 			if (tp->t_rtttime == 0) {
8623 				tp->t_rtttime = ticks;
8624 				tp->t_rtseq = startseq;
8625 				TCPSTAT_INC(tcps_segstimed);
8626 			}
8627 #ifdef NETFLIX_STATS
8628 			if (!(tp->t_flags & TF_GPUTINPROG) && len) {
8629 				tp->t_flags |= TF_GPUTINPROG;
8630 				tp->gput_seq = startseq;
8631 				tp->gput_ack = startseq +
8632 				    ulmin(sbavail(sb) - sb_offset, sendwin);
8633 				tp->gput_ts = tcp_ts_getticks();
8634 			}
8635 #endif
8636 		}
8637 		/*
8638 		 * Set retransmit timer if not currently set, and not doing
8639 		 * a pure ack or a keep-alive probe. Initial value for
8640 		 * retransmit timer is smoothed round-trip time + 2 *
8641 		 * round-trip time variance. Initialize shift counter which
8642 		 * is used for backoff of retransmit time.
8643 		 */
8644 timer:
8645 		if ((tp->snd_wnd == 0) &&
8646 		    TCPS_HAVEESTABLISHED(tp->t_state)) {
8647 			/*
8648 			 * If the persists timer was set above (right before
8649 			 * the goto send), and still needs to be on. Lets
8650 			 * make sure all is canceled. If the persist timer
8651 			 * is not running, we want to get it up.
8652 			 */
8653 			if (rack->rc_in_persist == 0) {
8654 				rack_enter_persist(tp, rack, cts);
8655 			}
8656 		}
8657 	} else {
8658 		/*
8659 		 * Persist case, update snd_max but since we are in persist
8660 		 * mode (no window) we do not update snd_nxt.
8661 		 */
8662 		int32_t xlen = len;
8663 
8664 		if (error)
8665 			goto nomore;
8666 
8667 		if (flags & TH_SYN)
8668 			++xlen;
8669 		if (flags & TH_FIN) {
8670 			++xlen;
8671 			tp->t_flags |= TF_SENTFIN;
8672 		}
8673 		/* In the ENOBUFS case we do *not* update snd_max */
8674 		if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) {
8675 			if (tp->snd_una == tp->snd_max) {
8676 				/*
8677 				 * Update the time we just added data since
8678 				 * none was outstanding.
8679 				 */
8680 				rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
8681 				tp->t_acktime = ticks;
8682 			}
8683 			tp->snd_max = tp->snd_nxt + len;
8684 		}
8685 	}
8686 nomore:
8687 	if (error) {
8688 		SOCKBUF_UNLOCK_ASSERT(sb);	/* Check gotos. */
8689 		/*
8690 		 * Failures do not advance the seq counter above. For the
8691 		 * case of ENOBUFS we will fall out and retry in 1ms with
8692 		 * the hpts. Everything else will just have to retransmit
8693 		 * with the timer.
8694 		 *
8695 		 * In any case, we do not want to loop around for another
8696 		 * send without a good reason.
8697 		 */
8698 		sendalot = 0;
8699 		switch (error) {
8700 		case EPERM:
8701 			tp->t_flags &= ~TF_FORCEDATA;
8702 			tp->t_softerror = error;
8703 			return (error);
8704 		case ENOBUFS:
8705 			if (slot == 0) {
8706 				/*
8707 				 * Pace us right away to retry in a some
8708 				 * time
8709 				 */
8710 				slot = 1 + rack->rc_enobuf;
8711 				if (rack->rc_enobuf < 255)
8712 					rack->rc_enobuf++;
8713 				if (slot > (rack->rc_rack_rtt / 2)) {
8714 					slot = rack->rc_rack_rtt / 2;
8715 				}
8716 				if (slot < 10)
8717 					slot = 10;
8718 			}
8719 			counter_u64_add(rack_saw_enobuf, 1);
8720 			error = 0;
8721 			goto enobufs;
8722 		case EMSGSIZE:
8723 			/*
8724 			 * For some reason the interface we used initially
8725 			 * to send segments changed to another or lowered
8726 			 * its MTU. If TSO was active we either got an
8727 			 * interface without TSO capabilits or TSO was
8728 			 * turned off. If we obtained mtu from ip_output()
8729 			 * then update it and try again.
8730 			 */
8731 			if (tso)
8732 				tp->t_flags &= ~TF_TSO;
8733 			if (mtu != 0) {
8734 				tcp_mss_update(tp, -1, mtu, NULL, NULL);
8735 				goto again;
8736 			}
8737 			slot = 10;
8738 			rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1);
8739 			tp->t_flags &= ~TF_FORCEDATA;
8740 			return (error);
8741 		case ENETUNREACH:
8742 			counter_u64_add(rack_saw_enetunreach, 1);
8743 		case EHOSTDOWN:
8744 		case EHOSTUNREACH:
8745 		case ENETDOWN:
8746 			if (TCPS_HAVERCVDSYN(tp->t_state)) {
8747 				tp->t_softerror = error;
8748 			}
8749 			/* FALLTHROUGH */
8750 		default:
8751 			slot = 10;
8752 			rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1);
8753 			tp->t_flags &= ~TF_FORCEDATA;
8754 			return (error);
8755 		}
8756 	} else {
8757 		rack->rc_enobuf = 0;
8758 	}
8759 	TCPSTAT_INC(tcps_sndtotal);
8760 
8761 	/*
8762 	 * Data sent (as far as we can tell). If this advertises a larger
8763 	 * window than any other segment, then remember the size of the
8764 	 * advertised window. Any pending ACK has now been sent.
8765 	 */
8766 	if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
8767 		tp->rcv_adv = tp->rcv_nxt + recwin;
8768 	tp->last_ack_sent = tp->rcv_nxt;
8769 	tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
8770 enobufs:
8771 	rack->r_tlp_running = 0;
8772 	if ((flags & TH_RST) || (would_have_fin == 1)) {
8773 		/*
8774 		 * We don't send again after a RST. We also do *not* send
8775 		 * again if we would have had a find, but now have
8776 		 * outstanding data.
8777 		 */
8778 		slot = 0;
8779 		sendalot = 0;
8780 	}
8781 	if (slot) {
8782 		/* set the rack tcb into the slot N */
8783 		counter_u64_add(rack_paced_segments, 1);
8784 	} else if (sendalot) {
8785 		if (len)
8786 			counter_u64_add(rack_unpaced_segments, 1);
8787 		sack_rxmit = 0;
8788 		tp->t_flags &= ~TF_FORCEDATA;
8789 		goto again;
8790 	} else if (len) {
8791 		counter_u64_add(rack_unpaced_segments, 1);
8792 	}
8793 	tp->t_flags &= ~TF_FORCEDATA;
8794 	rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1);
8795 	return (error);
8796 }
8797 
8798 /*
8799  * rack_ctloutput() must drop the inpcb lock before performing copyin on
8800  * socket option arguments.  When it re-acquires the lock after the copy, it
8801  * has to revalidate that the connection is still valid for the socket
8802  * option.
8803  */
8804 static int
8805 rack_set_sockopt(struct socket *so, struct sockopt *sopt,
8806     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
8807 {
8808 	int32_t error = 0, optval;
8809 
8810 	switch (sopt->sopt_name) {
8811 	case TCP_RACK_PROP_RATE:
8812 	case TCP_RACK_PROP:
8813 	case TCP_RACK_TLP_REDUCE:
8814 	case TCP_RACK_EARLY_RECOV:
8815 	case TCP_RACK_PACE_ALWAYS:
8816 	case TCP_DELACK:
8817 	case TCP_RACK_PACE_REDUCE:
8818 	case TCP_RACK_PACE_MAX_SEG:
8819 	case TCP_RACK_PRR_SENDALOT:
8820 	case TCP_RACK_MIN_TO:
8821 	case TCP_RACK_EARLY_SEG:
8822 	case TCP_RACK_REORD_THRESH:
8823 	case TCP_RACK_REORD_FADE:
8824 	case TCP_RACK_TLP_THRESH:
8825 	case TCP_RACK_PKT_DELAY:
8826 	case TCP_RACK_TLP_USE:
8827 	case TCP_RACK_TLP_INC_VAR:
8828 	case TCP_RACK_IDLE_REDUCE_HIGH:
8829 	case TCP_RACK_MIN_PACE:
8830 	case TCP_RACK_MIN_PACE_SEG:
8831 	case TCP_BBR_RACK_RTT_USE:
8832 	case TCP_DATA_AFTER_CLOSE:
8833 		break;
8834 	default:
8835 		return (tcp_default_ctloutput(so, sopt, inp, tp));
8836 		break;
8837 	}
8838 	INP_WUNLOCK(inp);
8839 	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
8840 	if (error)
8841 		return (error);
8842 	INP_WLOCK(inp);
8843 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
8844 		INP_WUNLOCK(inp);
8845 		return (ECONNRESET);
8846 	}
8847 	tp = intotcpcb(inp);
8848 	rack = (struct tcp_rack *)tp->t_fb_ptr;
8849 	switch (sopt->sopt_name) {
8850 	case TCP_RACK_PROP_RATE:
8851 		if ((optval <= 0) || (optval >= 100)) {
8852 			error = EINVAL;
8853 			break;
8854 		}
8855 		RACK_OPTS_INC(tcp_rack_prop_rate);
8856 		rack->r_ctl.rc_prop_rate = optval;
8857 		break;
8858 	case TCP_RACK_TLP_USE:
8859 		if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) {
8860 			error = EINVAL;
8861 			break;
8862 		}
8863 		RACK_OPTS_INC(tcp_tlp_use);
8864 		rack->rack_tlp_threshold_use = optval;
8865 		break;
8866 	case TCP_RACK_PROP:
8867 		/* RACK proportional rate reduction (bool) */
8868 		RACK_OPTS_INC(tcp_rack_prop);
8869 		rack->r_ctl.rc_prop_reduce = optval;
8870 		break;
8871 	case TCP_RACK_TLP_REDUCE:
8872 		/* RACK TLP cwnd reduction (bool) */
8873 		RACK_OPTS_INC(tcp_rack_tlp_reduce);
8874 		rack->r_ctl.rc_tlp_cwnd_reduce = optval;
8875 		break;
8876 	case TCP_RACK_EARLY_RECOV:
8877 		/* Should recovery happen early (bool) */
8878 		RACK_OPTS_INC(tcp_rack_early_recov);
8879 		rack->r_ctl.rc_early_recovery = optval;
8880 		break;
8881 	case TCP_RACK_PACE_ALWAYS:
8882 		/* Use the always pace method (bool)  */
8883 		RACK_OPTS_INC(tcp_rack_pace_always);
8884 		if (optval > 0)
8885 			rack->rc_always_pace = 1;
8886 		else
8887 			rack->rc_always_pace = 0;
8888 		break;
8889 	case TCP_RACK_PACE_REDUCE:
8890 		/* RACK Hptsi reduction factor (divisor) */
8891 		RACK_OPTS_INC(tcp_rack_pace_reduce);
8892 		if (optval)
8893 			/* Must be non-zero */
8894 			rack->rc_pace_reduce = optval;
8895 		else
8896 			error = EINVAL;
8897 		break;
8898 	case TCP_RACK_PACE_MAX_SEG:
8899 		/* Max segments in a pace */
8900 		RACK_OPTS_INC(tcp_rack_max_seg);
8901 		rack->rc_pace_max_segs = optval;
8902 		break;
8903 	case TCP_RACK_PRR_SENDALOT:
8904 		/* Allow PRR to send more than one seg */
8905 		RACK_OPTS_INC(tcp_rack_prr_sendalot);
8906 		rack->r_ctl.rc_prr_sendalot = optval;
8907 		break;
8908 	case TCP_RACK_MIN_TO:
8909 		/* Minimum time between rack t-o's in ms */
8910 		RACK_OPTS_INC(tcp_rack_min_to);
8911 		rack->r_ctl.rc_min_to = optval;
8912 		break;
8913 	case TCP_RACK_EARLY_SEG:
8914 		/* If early recovery max segments */
8915 		RACK_OPTS_INC(tcp_rack_early_seg);
8916 		rack->r_ctl.rc_early_recovery_segs = optval;
8917 		break;
8918 	case TCP_RACK_REORD_THRESH:
8919 		/* RACK reorder threshold (shift amount) */
8920 		RACK_OPTS_INC(tcp_rack_reord_thresh);
8921 		if ((optval > 0) && (optval < 31))
8922 			rack->r_ctl.rc_reorder_shift = optval;
8923 		else
8924 			error = EINVAL;
8925 		break;
8926 	case TCP_RACK_REORD_FADE:
8927 		/* Does reordering fade after ms time */
8928 		RACK_OPTS_INC(tcp_rack_reord_fade);
8929 		rack->r_ctl.rc_reorder_fade = optval;
8930 		break;
8931 	case TCP_RACK_TLP_THRESH:
8932 		/* RACK TLP theshold i.e. srtt+(srtt/N) */
8933 		RACK_OPTS_INC(tcp_rack_tlp_thresh);
8934 		if (optval)
8935 			rack->r_ctl.rc_tlp_threshold = optval;
8936 		else
8937 			error = EINVAL;
8938 		break;
8939 	case TCP_RACK_PKT_DELAY:
8940 		/* RACK added ms i.e. rack-rtt + reord + N */
8941 		RACK_OPTS_INC(tcp_rack_pkt_delay);
8942 		rack->r_ctl.rc_pkt_delay = optval;
8943 		break;
8944 	case TCP_RACK_TLP_INC_VAR:
8945 		/* Does TLP include rtt variance in t-o */
8946 		RACK_OPTS_INC(tcp_rack_tlp_inc_var);
8947 		rack->r_ctl.rc_prr_inc_var = optval;
8948 		break;
8949 	case TCP_RACK_IDLE_REDUCE_HIGH:
8950 		RACK_OPTS_INC(tcp_rack_idle_reduce_high);
8951 		if (optval)
8952 			rack->r_idle_reduce_largest = 1;
8953 		else
8954 			rack->r_idle_reduce_largest = 0;
8955 		break;
8956 	case TCP_DELACK:
8957 		if (optval == 0)
8958 			tp->t_delayed_ack = 0;
8959 		else
8960 			tp->t_delayed_ack = 1;
8961 		if (tp->t_flags & TF_DELACK) {
8962 			tp->t_flags &= ~TF_DELACK;
8963 			tp->t_flags |= TF_ACKNOW;
8964 			rack_output(tp);
8965 		}
8966 		break;
8967 	case TCP_RACK_MIN_PACE:
8968 		RACK_OPTS_INC(tcp_rack_min_pace);
8969 		if (optval > 3)
8970 			rack->r_enforce_min_pace = 3;
8971 		else
8972 			rack->r_enforce_min_pace = optval;
8973 		break;
8974 	case TCP_RACK_MIN_PACE_SEG:
8975 		RACK_OPTS_INC(tcp_rack_min_pace_seg);
8976 		if (optval >= 16)
8977 			rack->r_min_pace_seg_thresh = 15;
8978 		else
8979 			rack->r_min_pace_seg_thresh = optval;
8980 		break;
8981 	case TCP_BBR_RACK_RTT_USE:
8982 		if ((optval != USE_RTT_HIGH) &&
8983 		    (optval != USE_RTT_LOW) &&
8984 		    (optval != USE_RTT_AVG))
8985 			error = EINVAL;
8986 		else
8987 			rack->r_ctl.rc_rate_sample_method = optval;
8988 		break;
8989 	case TCP_DATA_AFTER_CLOSE:
8990 		if (optval)
8991 			rack->rc_allow_data_af_clo = 1;
8992 		else
8993 			rack->rc_allow_data_af_clo = 0;
8994 		break;
8995 	default:
8996 		return (tcp_default_ctloutput(so, sopt, inp, tp));
8997 		break;
8998 	}
8999 #ifdef NETFLIX_STATS
9000 	tcp_log_socket_option(tp, sopt->sopt_name, optval, error);
9001 #endif
9002 	INP_WUNLOCK(inp);
9003 	return (error);
9004 }
9005 
9006 static int
9007 rack_get_sockopt(struct socket *so, struct sockopt *sopt,
9008     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
9009 {
9010 	int32_t error, optval;
9011 
9012 	/*
9013 	 * Because all our options are either boolean or an int, we can just
9014 	 * pull everything into optval and then unlock and copy. If we ever
9015 	 * add a option that is not a int, then this will have quite an
9016 	 * impact to this routine.
9017 	 */
9018 	switch (sopt->sopt_name) {
9019 	case TCP_RACK_PROP_RATE:
9020 		optval = rack->r_ctl.rc_prop_rate;
9021 		break;
9022 	case TCP_RACK_PROP:
9023 		/* RACK proportional rate reduction (bool) */
9024 		optval = rack->r_ctl.rc_prop_reduce;
9025 		break;
9026 	case TCP_RACK_TLP_REDUCE:
9027 		/* RACK TLP cwnd reduction (bool) */
9028 		optval = rack->r_ctl.rc_tlp_cwnd_reduce;
9029 		break;
9030 	case TCP_RACK_EARLY_RECOV:
9031 		/* Should recovery happen early (bool) */
9032 		optval = rack->r_ctl.rc_early_recovery;
9033 		break;
9034 	case TCP_RACK_PACE_REDUCE:
9035 		/* RACK Hptsi reduction factor (divisor) */
9036 		optval = rack->rc_pace_reduce;
9037 		break;
9038 	case TCP_RACK_PACE_MAX_SEG:
9039 		/* Max segments in a pace */
9040 		optval = rack->rc_pace_max_segs;
9041 		break;
9042 	case TCP_RACK_PACE_ALWAYS:
9043 		/* Use the always pace method */
9044 		optval = rack->rc_always_pace;
9045 		break;
9046 	case TCP_RACK_PRR_SENDALOT:
9047 		/* Allow PRR to send more than one seg */
9048 		optval = rack->r_ctl.rc_prr_sendalot;
9049 		break;
9050 	case TCP_RACK_MIN_TO:
9051 		/* Minimum time between rack t-o's in ms */
9052 		optval = rack->r_ctl.rc_min_to;
9053 		break;
9054 	case TCP_RACK_EARLY_SEG:
9055 		/* If early recovery max segments */
9056 		optval = rack->r_ctl.rc_early_recovery_segs;
9057 		break;
9058 	case TCP_RACK_REORD_THRESH:
9059 		/* RACK reorder threshold (shift amount) */
9060 		optval = rack->r_ctl.rc_reorder_shift;
9061 		break;
9062 	case TCP_RACK_REORD_FADE:
9063 		/* Does reordering fade after ms time */
9064 		optval = rack->r_ctl.rc_reorder_fade;
9065 		break;
9066 	case TCP_RACK_TLP_THRESH:
9067 		/* RACK TLP theshold i.e. srtt+(srtt/N) */
9068 		optval = rack->r_ctl.rc_tlp_threshold;
9069 		break;
9070 	case TCP_RACK_PKT_DELAY:
9071 		/* RACK added ms i.e. rack-rtt + reord + N */
9072 		optval = rack->r_ctl.rc_pkt_delay;
9073 		break;
9074 	case TCP_RACK_TLP_USE:
9075 		optval = rack->rack_tlp_threshold_use;
9076 		break;
9077 	case TCP_RACK_TLP_INC_VAR:
9078 		/* Does TLP include rtt variance in t-o */
9079 		optval = rack->r_ctl.rc_prr_inc_var;
9080 		break;
9081 	case TCP_RACK_IDLE_REDUCE_HIGH:
9082 		optval = rack->r_idle_reduce_largest;
9083 		break;
9084 	case TCP_RACK_MIN_PACE:
9085 		optval = rack->r_enforce_min_pace;
9086 		break;
9087 	case TCP_RACK_MIN_PACE_SEG:
9088 		optval = rack->r_min_pace_seg_thresh;
9089 		break;
9090 	case TCP_BBR_RACK_RTT_USE:
9091 		optval = rack->r_ctl.rc_rate_sample_method;
9092 		break;
9093 	case TCP_DELACK:
9094 		optval = tp->t_delayed_ack;
9095 		break;
9096 	case TCP_DATA_AFTER_CLOSE:
9097 		optval = rack->rc_allow_data_af_clo;
9098 		break;
9099 	default:
9100 		return (tcp_default_ctloutput(so, sopt, inp, tp));
9101 		break;
9102 	}
9103 	INP_WUNLOCK(inp);
9104 	error = sooptcopyout(sopt, &optval, sizeof optval);
9105 	return (error);
9106 }
9107 
9108 static int
9109 rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
9110 {
9111 	int32_t error = EINVAL;
9112 	struct tcp_rack *rack;
9113 
9114 	rack = (struct tcp_rack *)tp->t_fb_ptr;
9115 	if (rack == NULL) {
9116 		/* Huh? */
9117 		goto out;
9118 	}
9119 	if (sopt->sopt_dir == SOPT_SET) {
9120 		return (rack_set_sockopt(so, sopt, inp, tp, rack));
9121 	} else if (sopt->sopt_dir == SOPT_GET) {
9122 		return (rack_get_sockopt(so, sopt, inp, tp, rack));
9123 	}
9124 out:
9125 	INP_WUNLOCK(inp);
9126 	return (error);
9127 }
9128 
9129 
9130 struct tcp_function_block __tcp_rack = {
9131 	.tfb_tcp_block_name = __XSTRING(STACKNAME),
9132 	.tfb_tcp_output = rack_output,
9133 	.tfb_tcp_do_segment = rack_do_segment,
9134 	.tfb_tcp_hpts_do_segment = rack_hpts_do_segment,
9135 	.tfb_tcp_ctloutput = rack_ctloutput,
9136 	.tfb_tcp_fb_init = rack_init,
9137 	.tfb_tcp_fb_fini = rack_fini,
9138 	.tfb_tcp_timer_stop_all = rack_stopall,
9139 	.tfb_tcp_timer_activate = rack_timer_activate,
9140 	.tfb_tcp_timer_active = rack_timer_active,
9141 	.tfb_tcp_timer_stop = rack_timer_stop,
9142 	.tfb_tcp_rexmit_tmr = rack_remxt_tmr,
9143 	.tfb_tcp_handoff_ok = rack_handoff_ok
9144 };
9145 
9146 static const char *rack_stack_names[] = {
9147 	__XSTRING(STACKNAME),
9148 #ifdef STACKALIAS
9149 	__XSTRING(STACKALIAS),
9150 #endif
9151 };
9152 
9153 static int
9154 rack_ctor(void *mem, int32_t size, void *arg, int32_t how)
9155 {
9156 	memset(mem, 0, size);
9157 	return (0);
9158 }
9159 
9160 static void
9161 rack_dtor(void *mem, int32_t size, void *arg)
9162 {
9163 
9164 }
9165 
9166 static bool rack_mod_inited = false;
9167 
9168 static int
9169 tcp_addrack(module_t mod, int32_t type, void *data)
9170 {
9171 	int32_t err = 0;
9172 	int num_stacks;
9173 
9174 	switch (type) {
9175 	case MOD_LOAD:
9176 		rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map",
9177 		    sizeof(struct rack_sendmap),
9178 		    rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);
9179 
9180 		rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb",
9181 		    sizeof(struct tcp_rack),
9182 		    rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
9183 
9184 		sysctl_ctx_init(&rack_sysctl_ctx);
9185 		rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
9186 		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
9187 		    OID_AUTO,
9188 		    __XSTRING(STACKNAME),
9189 		    CTLFLAG_RW, 0,
9190 		    "");
9191 		if (rack_sysctl_root == NULL) {
9192 			printf("Failed to add sysctl node\n");
9193 			err = EFAULT;
9194 			goto free_uma;
9195 		}
9196 		rack_init_sysctls();
9197 		num_stacks = nitems(rack_stack_names);
9198 		err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK,
9199 		    rack_stack_names, &num_stacks);
9200 		if (err) {
9201 			printf("Failed to register %s stack name for "
9202 			    "%s module\n", rack_stack_names[num_stacks],
9203 			    __XSTRING(MODNAME));
9204 			sysctl_ctx_free(&rack_sysctl_ctx);
9205 free_uma:
9206 			uma_zdestroy(rack_zone);
9207 			uma_zdestroy(rack_pcb_zone);
9208 			rack_counter_destroy();
9209 			printf("Failed to register rack module -- err:%d\n", err);
9210 			return (err);
9211 		}
9212 		rack_mod_inited = true;
9213 		break;
9214 	case MOD_QUIESCE:
9215 		err = deregister_tcp_functions(&__tcp_rack, true, false);
9216 		break;
9217 	case MOD_UNLOAD:
9218 		err = deregister_tcp_functions(&__tcp_rack, false, true);
9219 		if (err == EBUSY)
9220 			break;
9221 		if (rack_mod_inited) {
9222 			uma_zdestroy(rack_zone);
9223 			uma_zdestroy(rack_pcb_zone);
9224 			sysctl_ctx_free(&rack_sysctl_ctx);
9225 			rack_counter_destroy();
9226 			rack_mod_inited = false;
9227 		}
9228 		err = 0;
9229 		break;
9230 	default:
9231 		return (EOPNOTSUPP);
9232 	}
9233 	return (err);
9234 }
9235 
9236 static moduledata_t tcp_rack = {
9237 	.name = __XSTRING(MODNAME),
9238 	.evhand = tcp_addrack,
9239 	.priv = 0
9240 };
9241 
9242 MODULE_VERSION(MODNAME, 1);
9243 DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
9244 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1);
9245