/*-
 * Copyright (c) 2016-2020 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_ratelimit.h"
#include "opt_kern_tls.h"
#if defined(INET) || defined(INET6)
#include <sys/param.h>
#include <sys/arb.h>
#include <sys/module.h>
#include <sys/kernel.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#ifdef STATS
#include <sys/qmath.h>
#include <sys/tree.h>
#include <sys/stats.h>		/* Must come after qmath.h and tree.h */
#else
#include <sys/tree.h>
#endif
#include <sys/refcount.h>
#include <sys/queue.h>
#include <sys/tim_filter.h>
#include <sys/smp.h>
#include <sys/kthread.h>
#include <sys/kern_prefetch.h>
#include <sys/protosw.h>
#ifdef TCP_ACCOUNTING
#include <sys/sched.h>
#include <machine/cpu.h>
#endif
#include <vm/uma.h>

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/vnet.h>

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#define TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_ratelimit.h>
#include <netinet/tcp_accounting.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/cc/cc_newreno.h>
#include <netinet/tcp_fastopen.h>
#include <netinet/tcp_lro.h>
#ifdef NETFLIX_SHARED_CWND
#include <netinet/tcp_shared_cwnd.h>
#endif
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcp_ecn.h>

#include <netipsec/ipsec_support.h>

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif				/* IPSEC */

#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif
#include "sack_filter.h"
#include "tcp_rack.h"
#include "tailq_hash.h"
#include "rack_bbr_common.h"

uma_zone_t rack_zone;
uma_zone_t rack_pcb_zone;

#ifndef TICKS2SBT
#define TICKS2SBT(__t)	(tick_sbt * ((sbintime_t)(__t)))
#endif

VNET_DECLARE(uint32_t, newreno_beta);
VNET_DECLARE(uint32_t, newreno_beta_ecn);
#define V_newreno_beta VNET(newreno_beta)
#define V_newreno_beta_ecn VNET(newreno_beta_ecn)

#define M_TCPFSB	__CONCAT(M_TCPFSB, STACKNAME)
#define M_TCPDO		__CONCAT(M_TCPDO, STACKNAME)

MALLOC_DEFINE(M_TCPFSB, "tcp_fsb_" __XSTRING(STACKNAME), "TCP fast send block");
MALLOC_DEFINE(M_TCPDO, "tcp_do_" __XSTRING(STACKNAME), "TCP deferred options");
MALLOC_DEFINE(M_TCPPCM, "tcp_pcm_" __XSTRING(STACKNAME), "TCP PCM measurement information");

struct sysctl_ctx_list rack_sysctl_ctx;
struct sysctl_oid *rack_sysctl_root;

#define CUM_ACKED 1
#define SACKED 2
/*
 * The RACK module incorporates a number of
 * TCP ideas that have been put out into the IETF
 * over the last few years:
 * - Matt Mathis's Rate Halving which slowly drops
 *   the congestion window so that the ack clock can
 *   be maintained during a recovery.
 * - Yuchung Cheng's RACK TCP (for which it's named) that
 *   stops us using the number of dup acks and instead
 *   uses time as the gauge of when we retransmit.
 * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
 *   of Dukkipati et al.
 * RACK depends on SACK, so if an endpoint arrives that
 * cannot do SACK the state machine below will shuttle the
 * connection back to using the "default" TCP stack that is
 * in FreeBSD.
 *
 * To implement RACK the original TCP stack was first decomposed
 * into a functional state machine with individual states
 * for each of the possible TCP connection states. The do_segment
 * function's role in life is to mandate that the connection supports
 * SACK initially and then assure that the RACK state matches the
 * connection state before calling the state's do_segment function.
 * Each state is simplified due to the fact that the original
 * do_segment has been decomposed and we *know* what state we are
 * in (no switches on the state) and all tests for SACK are gone.
 * This greatly simplifies what each state does.
 *
 * TCP output is also over-written with a new version since it
 * must maintain the new rack scoreboard.
 *
 */
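/*
 * Illustrative sketch of the decomposition just described (an
 * assumption about the shape, not the literal dispatch code): once
 * the connection state is known, a per-state handler can be cached
 * so no switch on the state is needed on the hot path, e.g.:
 *
 *	rack->r_substate = rack_do_established;
 *	...
 *	retval = (*rack->r_substate)(m, th, so, tp, &to, drop_hdrlen,
 *	    tlen, tiwin, thflags, nxt_pkt, iptos);
 */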
static int32_t rack_tlp_thresh = 1;
static int32_t rack_tlp_limit = 2;	/* No more than 2 TLPs w-out new data */
static int32_t rack_tlp_use_greater = 1;
static int32_t rack_reorder_thresh = 2;
static int32_t rack_reorder_fade = 60000000;	/* 0 - never fade, def
						 * 60,000,000 - 60 seconds */
static uint32_t rack_pcm_every_n_rounds = 100;
static uint32_t rack_pcm_blast = 0;
static uint32_t rack_pcm_is_enabled = 1;
static uint8_t rack_ssthresh_rest_rto_rec = 0;	/* Do we restore ssthresh when we have rec -> rto -> rec */

static uint32_t rack_gp_gain_req = 1200;	/* Amount percent wise required to gain to record a round as "gaining" */
static uint32_t rack_rnd_cnt_req = 0x10005;	/* Default number of rounds if we are below rack_gp_gain_req where we exit ss */

static int32_t rack_rxt_scoreboard_clear_thresh = 2;
static int32_t rack_dnd_default = 0;	/* For rr_conf = 3, what is the default for dnd */
static int32_t rack_rxt_controls = 0;
static int32_t rack_fill_cw_state = 0;
static uint8_t rack_req_measurements = 1;
static int32_t rack_enable_hw_pacing = 0;	/* Due to CCSP keep it off by default */
static int32_t rack_hw_rate_caps = 0;	/* 1; */
static int32_t rack_hw_rate_cap_per = 0;	/* 0 -- off */
static int32_t rack_hw_rate_min = 0;	/* 1500000;*/
static int32_t rack_hw_rate_to_low = 0;	/* 1200000; */
static int32_t rack_hw_up_only = 0;
static int32_t rack_stats_gets_ms_rtt = 1;
static int32_t rack_prr_addbackmax = 2;
static int32_t rack_do_hystart = 0;
static int32_t rack_apply_rtt_with_reduced_conf = 0;
static int32_t rack_hibeta_setting = 0;
static int32_t rack_default_pacing_divisor = 250;
static uint16_t rack_pacing_min_seg = 0;
static int32_t rack_timely_off = 0;

static int32_t rack_pkt_delay = 1000;
static int32_t rack_send_a_lot_in_prr = 1;
static int32_t rack_min_to = 1000;	/* Minimum rack timeout in microseconds */
static int32_t rack_verbose_logging = 0;
static int32_t rack_ignore_data_after_close = 1;
static int32_t rack_enable_shared_cwnd = 1;
static int32_t rack_use_cmp_acks = 1;
static int32_t rack_use_fsb = 1;
static int32_t rack_use_rfo = 1;
static int32_t rack_use_rsm_rfo = 1;
static int32_t rack_max_abc_post_recovery = 2;
static int32_t rack_client_low_buf = 0;
static int32_t rack_dsack_std_based = 0x3;	/* Bit field: bit 1 sets rc_rack_tmr_std_based and bit 2 sets rc_rack_use_dsack */
static int32_t rack_bw_multipler = 0;	/* Limit on fill cw's jump up to be this x gp_est */
#ifdef TCP_ACCOUNTING
static int32_t rack_tcp_accounting = 0;
#endif
static int32_t rack_limits_scwnd = 1;
static int32_t rack_enable_mqueue_for_nonpaced = 0;
static int32_t rack_hybrid_allow_set_maxseg = 0;
static int32_t rack_disable_prr = 0;
static int32_t use_rack_rr = 1;
static int32_t rack_non_rxt_use_cr = 0;	/* does a non-rxt in recovery use the configured rate (ss/ca)? */
static int32_t rack_persist_min = 250000;	/* 250ms in usecs */
static int32_t rack_persist_max = 2000000;	/* 2 seconds in usecs */
static int32_t rack_honors_hpts_min_to = 1;	/* Do we honor the hpts minimum timeout for pacing timers */
static uint32_t rack_max_reduce = 10;	/* Percent we can reduce pacing delay by */
static int32_t rack_sack_not_required = 1;	/* set to one to allow non-sack to use rack */
static int32_t rack_limit_time_with_srtt = 0;
static int32_t rack_autosndbuf_inc = 20;	/* In percentage form */
static int32_t rack_enobuf_hw_boost_mult = 0;	/* How many times the hw rate we boost pacing delay using time_between */
static int32_t rack_enobuf_hw_max = 12000;	/* 12 ms in usecs */
static int32_t rack_enobuf_hw_min = 10000;	/* 10 ms in usecs */
static int32_t rack_hw_rwnd_factor = 2;	/* How many max_segs the rwnd must be before we hold off sending */
static int32_t rack_hw_check_queue = 0;	/* Do we always pre-check queue depth of a hw queue */

/*
 * Currently regular tcp has a rto_min of 30ms;
 * the backoff goes 12 times, so that ends up
 * being a total of 122.850 seconds before a
 * connection is killed.
 */
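/*
 * Worked example of the figure above: with rto_min = 30 ms and the
 * backoff doubling 12 times, the cumulative wait is
 *	30 ms * (2^0 + 2^1 + ... + 2^11) = 30 ms * 4095 = 122.85 sec
 * before the connection is killed.
 */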
static uint32_t rack_def_data_window = 20;
static uint32_t rack_goal_bdp = 2;
static uint32_t rack_min_srtts = 1;
static uint32_t rack_min_measure_usec = 0;
static int32_t rack_tlp_min = 10000;	/* 10ms */
static int32_t rack_rto_min = 30000;	/* 30,000 usec same as main freebsd */
static int32_t rack_rto_max = 4000000;	/* 4 seconds in usecs */
static const int32_t rack_free_cache = 2;
static int32_t rack_hptsi_segments = 40;
static int32_t rack_rate_sample_method = USE_RTT_LOW;
static int32_t rack_pace_every_seg = 0;
static int32_t rack_delayed_ack_time = 40000;	/* 40ms in usecs */
static int32_t rack_pacing_delay_reduction = 4;
static int32_t rack_wma_divisor = 8;	/* For WMA calculation */
static int32_t rack_cwnd_block_ends_measure = 0;
static int32_t rack_rwnd_block_ends_measure = 0;
static int32_t rack_def_profile = 0;

static int32_t rack_lower_cwnd_at_tlp = 0;
static int32_t rack_always_send_oldest = 0;
static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;

static uint16_t rack_per_of_gp_ss = 250;	/* 250% slow-start */
static uint16_t rack_per_of_gp_ca = 200;	/* 200% congestion-avoidance */
static uint16_t rack_per_of_gp_rec = 200;	/* 200% of bw */

/* Probertt */
static uint16_t rack_per_of_gp_probertt = 60;	/* 60% of bw */
static uint16_t rack_per_of_gp_lowthresh = 40;	/* 40% is bottom */
static uint16_t rack_per_of_gp_probertt_reduce = 10;	/* 10% reduction */
static uint16_t rack_atexit_prtt_hbp = 130;	/* Clamp to 130% on exit prtt if highly buffered path */
static uint16_t rack_atexit_prtt = 130;	/* Clamp to 130% on exit prtt if non highly buffered path */

static uint32_t rack_max_drain_wait = 2;	/* How many gp srtt's before we give up draining */
static uint32_t rack_must_drain = 1;	/* How many GP srtt's we *must* wait */
static uint32_t rack_probertt_use_min_rtt_entry = 1;	/* Use the min to calculate the goal else gp_srtt */
static uint32_t rack_probertt_use_min_rtt_exit = 0;
static uint32_t rack_probe_rtt_sets_cwnd = 0;
static uint32_t rack_probe_rtt_safety_val = 2000000;	/* No more than 2 sec in probe-rtt */
static uint32_t rack_time_between_probertt = 9600000;	/* 9.6 sec in usecs */
static uint32_t rack_probertt_gpsrtt_cnt_mul = 0;	/* How many srtt periods does probe-rtt last, top fraction */
static uint32_t rack_probertt_gpsrtt_cnt_div = 0;	/* How many srtt periods does probe-rtt last, bottom fraction */
static uint32_t rack_min_probertt_hold = 40000;	/* Equal to delayed ack time */
static uint32_t rack_probertt_filter_life = 10000000;
static uint32_t rack_probertt_lower_within = 10;
static uint32_t rack_min_rtt_movement = 250000;	/* Must move at least 250ms (in microseconds) to count as a lowering */
static int32_t rack_pace_one_seg = 0;	/* Shall we pace for less than 1.4Meg 1MSS at a time */
static int32_t rack_probertt_clear_is = 1;
static int32_t rack_max_drain_hbp = 1;	/* Extra drain times gpsrtt for highly buffered paths */
static int32_t rack_hbp_thresh = 3;	/* what is the divisor max_rtt/min_rtt to decide a hbp */
314
315 /* Part of pacing */
316 static int32_t rack_max_per_above = 30; /* When we go to increment stop if above 100+this% */
317
318 /* Timely information:
319 *
320 * Here we have various control parameters on how
321 * timely may change the multiplier. rack_gain_p5_ub
322 * is associated with timely but not directly influencing
323 * the rate decision like the other variables. It controls
324 * the way fill-cw interacts with timely and caps how much
325 * timely can boost the fill-cw b/w.
326 *
327 * The other values are various boost/shrink numbers as well
328 * as potential caps when adjustments are made to the timely
329 * gain (returned by rack_get_output_gain(). Remember too that
330 * the gain returned can be overriden by other factors such as
331 * probeRTT as well as fixed-rate-pacing.
332 */
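/*
 * Scale note (an assumption based on the other gain knobs, where
 * 100 == 1.0x): with rack_gain_p5_ub = 250, a timely gain that would
 * otherwise push a 10 Mbps fill-cw b/w up by 300% is capped at
 *	10 Mbps * 250 / 100 = 25 Mbps
 * instead of reaching 30 Mbps.
 */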
static int32_t rack_gain_p5_ub = 250;
static int32_t rack_gp_per_bw_mul_up = 2;	/* 2% */
static int32_t rack_gp_per_bw_mul_down = 4;	/* 4% */
static int32_t rack_gp_rtt_maxmul = 3;	/* 3 x maxmin */
static int32_t rack_gp_rtt_minmul = 1;	/* minrtt + (minrtt/mindiv) is lower rtt */
static int32_t rack_gp_rtt_mindiv = 4;	/* minrtt + (minrtt * minmul/mindiv) is lower rtt */
static int32_t rack_gp_decrease_per = 80;	/* Beta value of timely decrease (.8) = 80 */
static int32_t rack_gp_increase_per = 2;	/* 2% increase in multiplier */
static int32_t rack_per_lower_bound = 50;	/* Don't allow to drop below this multiplier */
static int32_t rack_per_upper_bound_ss = 0;	/* Don't allow SS to grow above this */
static int32_t rack_per_upper_bound_ca = 0;	/* Don't allow CA to grow above this */
static int32_t rack_do_dyn_mul = 0;	/* Are the rack gp multipliers dynamic */
static int32_t rack_gp_no_rec_chg = 1;	/* Prohibit recovery from reducing its multiplier */
static int32_t rack_timely_dec_clear = 6;	/* Do we clear decrement count at a value (6)? */
static int32_t rack_timely_max_push_rise = 3;	/* Three rounds of pushing */
static int32_t rack_timely_max_push_drop = 3;	/* Three rounds of pushing */
static int32_t rack_timely_min_segs = 4;	/* 4 segment minimum */
static int32_t rack_timely_no_stopping = 0;
static int32_t rack_down_raise_thresh = 100;
static int32_t rack_req_segs = 1;
static uint64_t rack_bw_rate_cap = 0;
static uint64_t rack_fillcw_bw_cap = 3750000;	/* Cap fillcw at 30Mbps */
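/*
 * Unit check for the default above: 3,750,000 bytes/sec * 8 bits/byte
 * = 30,000,000 bits/sec, i.e. the 30Mbps noted in the comment.
 */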

/* Rack specific counters */
counter_u64_t rack_saw_enobuf;
counter_u64_t rack_saw_enobuf_hw;
counter_u64_t rack_saw_enetunreach;
counter_u64_t rack_persists_sends;
counter_u64_t rack_persists_acks;
counter_u64_t rack_persists_loss;
counter_u64_t rack_persists_lost_ends;
counter_u64_t rack_total_bytes;
#ifdef INVARIANTS
counter_u64_t rack_adjust_map_bw;
#endif
/* Tail loss probe counters */
counter_u64_t rack_tlp_tot;
counter_u64_t rack_tlp_newdata;
counter_u64_t rack_tlp_retran;
counter_u64_t rack_tlp_retran_bytes;
counter_u64_t rack_to_tot;
counter_u64_t rack_hot_alloc;
counter_u64_t rack_to_alloc;
counter_u64_t rack_to_alloc_hard;
counter_u64_t rack_to_alloc_emerg;
counter_u64_t rack_to_alloc_limited;
counter_u64_t rack_alloc_limited_conns;
counter_u64_t rack_split_limited;
counter_u64_t rack_rxt_clamps_cwnd;
counter_u64_t rack_rxt_clamps_cwnd_uniq;

counter_u64_t rack_multi_single_eq;
counter_u64_t rack_proc_non_comp_ack;

counter_u64_t rack_fto_send;
counter_u64_t rack_fto_rsm_send;
counter_u64_t rack_nfto_resend;
counter_u64_t rack_non_fto_send;
counter_u64_t rack_extended_rfo;

counter_u64_t rack_sack_proc_all;
counter_u64_t rack_sack_proc_short;
counter_u64_t rack_sack_proc_restart;

counter_u64_t rack_input_idle_reduces;
counter_u64_t rack_collapsed_win;
counter_u64_t rack_collapsed_win_seen;
counter_u64_t rack_collapsed_win_rxt;
counter_u64_t rack_collapsed_win_rxt_bytes;
counter_u64_t rack_try_scwnd;
counter_u64_t rack_hw_pace_init_fail;
counter_u64_t rack_hw_pace_lost;

counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];

#define RACK_REXMTVAL(tp) max(rack_rto_min, ((tp)->t_srtt + ((tp)->t_rttvar << 2)))

#define RACK_TCPT_RANGESET(tv, value, tvmin, tvmax, slop) do {	\
	(tv) = (value) + slop;					\
	if ((u_long)(tv) < (u_long)(tvmin))			\
		(tv) = (tvmin);					\
	if ((u_long)(tv) > (u_long)(tvmax))			\
		(tv) = (tvmax);					\
} while (0)
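/*
 * Usage sketch for the two macros above (illustrative values, not
 * taken from live code):
 *
 *	uint32_t to;
 *
 *	// With t_srtt = 40000 and t_rttvar = 5000 (usecs),
 *	// RACK_REXMTVAL() yields 40000 + (5000 << 2) = 60000.
 *	RACK_TCPT_RANGESET(to, RACK_REXMTVAL(tp),
 *	    rack_rto_min, rack_rto_max, 0);
 *	// to == 60000, already inside [30000, 4000000].
 */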

static void
rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line);

static int
rack_process_ack(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to,
    uint32_t tiwin, int32_t tlen, int32_t *ofia, int32_t thflags, int32_t *ret_val, int32_t orig_tlen);
static int
rack_process_data(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static void
rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
    uint32_t th_ack, uint16_t nsegs, uint16_t type, int32_t recovery);
static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack,
    uint8_t limit_type);
static struct rack_sendmap *
rack_check_recovery_mode(struct tcpcb *tp,
    uint32_t tsused);
static uint32_t
rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack);
static void
rack_cong_signal(struct tcpcb *tp,
    uint32_t type, uint32_t ack, int);
static void rack_counter_destroy(void);
static int
rack_ctloutput(struct tcpcb *tp, struct sockopt *sopt);
static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
static void
rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override);
static void
rack_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
    int32_t drop_hdrlen, int32_t tlen, uint8_t iptos);
static void rack_dtor(void *mem, int32_t size, void *arg);
static void
rack_log_alt_to_to_cancel(struct tcp_rack *rack,
    uint32_t flex1, uint32_t flex2,
    uint32_t flex3, uint32_t flex4,
    uint32_t flex5, uint32_t flex6,
    uint16_t flex7, uint8_t mod);

static void
rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t pacing_delay,
    uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line,
    struct rack_sendmap *rsm, uint8_t quality);
static struct rack_sendmap *
rack_find_high_nonack(struct tcp_rack *rack,
    struct rack_sendmap *rsm);
static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
static int rack_get_sockopt(struct tcpcb *tp, struct sockopt *sopt);
static void
rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
    tcp_seq th_ack, int line, uint8_t quality);
static void
rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint8_t frm);

static uint32_t
rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss);
static int32_t rack_handoff_ok(struct tcpcb *tp);
static int32_t rack_init(struct tcpcb *tp, void **ptr);
static void rack_init_sysctls(void);

static void
rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
    struct tcphdr *th, int entered_rec, int dup_ack_struck,
    int *dsack_seen, int *sacks_seen);
static void
rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
    uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t ts,
    struct rack_sendmap *hintrsm, uint32_t add_flags, struct mbuf *s_mb, uint32_t s_moff, int hw_tls, int segsiz);

static uint64_t rack_get_gp_est(struct tcp_rack *rack);

static void
rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint32_t cts);
static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm);
static int32_t rack_output(struct tcpcb *tp);

static uint32_t
rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
    struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
    uint32_t cts, uint32_t segsiz);
static void rack_post_recovery(struct tcpcb *tp, uint32_t th_seq);
static void rack_remxt_tmr(struct tcpcb *tp);
static int rack_set_sockopt(struct tcpcb *tp, struct sockopt *sopt);
static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
static int32_t rack_stopall(struct tcpcb *tp);
static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
static uint32_t
rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint64_t ts, int32_t *lenp, uint32_t add_flag, int segsiz);
static void
rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint64_t ts, uint32_t add_flag, int segsiz);
static int
rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack);
static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
static int
rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);

static int
rack_do_closing(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_established(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_lastack(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static void rack_chk_req_and_hybrid_on_out(struct tcp_rack *rack, tcp_seq seq, uint32_t len, uint64_t cts);
struct rack_sendmap *
tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
    uint32_t tsused);
static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt,
    uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt);
static void
tcp_rack_partialack(struct tcpcb *tp);
static int
rack_set_profile(struct tcp_rack *rack, int prof);
static void
rack_apply_deferred_options(struct tcp_rack *rack);
int32_t rack_clear_counter = 0;

static uint64_t
rack_get_lt_bw(struct tcp_rack *rack)
{
	struct timeval tv;
	uint64_t tim, bytes;

	tim = rack->r_ctl.lt_bw_time;
	bytes = rack->r_ctl.lt_bw_bytes;
	if (rack->lt_bw_up) {
		/* Include all the current bytes too */
		microuptime(&tv);
		bytes += (rack->rc_tp->snd_una - rack->r_ctl.lt_seq);
		tim += (tcp_tv_to_lusec(&tv) - rack->r_ctl.lt_timemark);
	}
	if ((bytes != 0) && (tim != 0))
		return ((bytes * (uint64_t)1000000) / tim);
	else
		return (0);
}
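/*
 * Worked example of the return value above: bytes = 1,500,000
 * delivered over tim = 500,000 usecs gives
 *	(1500000 * 1000000) / 500000 = 3,000,000 bytes/sec (24 Mbps).
 */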

static void
rack_swap_beta_values(struct tcp_rack *rack, uint8_t flex8)
{
	struct sockopt sopt;
	struct cc_newreno_opts opt;
	struct tcpcb *tp;
	uint32_t old_beta;
	uint32_t old_beta_ecn;
	int error = 0, failed = 0;

	tp = rack->rc_tp;
	if (tp->t_cc == NULL) {
		/* Tcb is leaving */
		return;
	}
	rack->rc_pacing_cc_set = 1;
	if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) {
		/* Not new-reno; we can't play games with beta! */
		failed = 1;
		goto out;
	}
	if (CC_ALGO(tp)->ctl_output == NULL) {
		/* Huh, not using new-reno so no swaps? */
		failed = 2;
		goto out;
	}
	/* Get the current values out */
	sopt.sopt_valsize = sizeof(struct cc_newreno_opts);
	sopt.sopt_dir = SOPT_GET;
	opt.name = CC_NEWRENO_BETA;
	error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt);
	if (error) {
		failed = 3;
		goto out;
	}
	old_beta = opt.val;
	opt.name = CC_NEWRENO_BETA_ECN;
	error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt);
	if (error) {
		failed = 4;
		goto out;
	}
	old_beta_ecn = opt.val;

	/* Now let's set in the values we have stored */
	sopt.sopt_dir = SOPT_SET;
	opt.name = CC_NEWRENO_BETA;
	opt.val = rack->r_ctl.rc_saved_beta;
	error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt);
	if (error) {
		failed = 5;
		goto out;
	}
	opt.name = CC_NEWRENO_BETA_ECN;
	opt.val = rack->r_ctl.rc_saved_beta_ecn;
	error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt);
	if (error) {
		failed = 6;
		goto out;
	}
	/* Save off the values for restoral */
	rack->r_ctl.rc_saved_beta = old_beta;
	rack->r_ctl.rc_saved_beta_ecn = old_beta_ecn;
out:
	if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
		union tcp_log_stackspecific log;
		struct timeval tv;
		struct newreno *ptr;

		ptr = ((struct newreno *)tp->t_ccv.cc_data);
		memset(&log, 0, sizeof(log));
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		log.u_bbr.flex1 = ptr->beta;
		log.u_bbr.flex2 = ptr->beta_ecn;
		log.u_bbr.flex3 = ptr->newreno_flags;
		log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta;
		log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta_ecn;
		log.u_bbr.flex6 = failed;
		log.u_bbr.flex7 = rack->gp_ready;
		log.u_bbr.flex7 <<= 1;
		log.u_bbr.flex7 |= rack->use_fixed_rate;
		log.u_bbr.flex7 <<= 1;
		log.u_bbr.flex7 |= rack->rc_pacing_cc_set;
		log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
		log.u_bbr.flex8 = flex8;
		tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, error,
		    0, &log, false, NULL, NULL, 0, &tv);
	}
}

static void
rack_set_cc_pacing(struct tcp_rack *rack)
{
	if (rack->rc_pacing_cc_set)
		return;
	/*
	 * Use the swap utility, placing 3 in flex8 to id a
	 * set of new values.
	 */
	rack->rc_pacing_cc_set = 1;
	rack_swap_beta_values(rack, 3);
}

static void
rack_undo_cc_pacing(struct tcp_rack *rack)
{
	if (rack->rc_pacing_cc_set == 0)
		return;
	/*
	 * Use the swap utility, placing 4 in flex8 to id a
	 * restoral of the old values.
	 */
	rack->rc_pacing_cc_set = 0;
	rack_swap_beta_values(rack, 4);
}
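/*
 * Note: the two functions above are used as a pair.
 * rack_set_cc_pacing() swaps the stack's saved beta/beta_ecn into
 * new-reno (flex8 = 3 in the BB log) and rack_undo_cc_pacing() swaps
 * the originals back (flex8 = 4). rc_pacing_cc_set guards both, so
 * calling either twice in a row is a no-op.
 */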

static void
rack_remove_pacing(struct tcp_rack *rack)
{
	if (rack->rc_pacing_cc_set)
		rack_undo_cc_pacing(rack);
	if (rack->r_ctl.pacing_method & RACK_REG_PACING)
		tcp_decrement_paced_conn();
	if (rack->r_ctl.pacing_method & RACK_DGP_PACING)
		tcp_dec_dgp_pacing_cnt();
	rack->rc_always_pace = 0;
	rack->r_ctl.pacing_method = RACK_PACING_NONE;
	rack->dgp_on = 0;
	rack->rc_hybrid_mode = 0;
	rack->use_fixed_rate = 0;
}

static void
rack_log_gpset(struct tcp_rack *rack, uint32_t seq_end, uint32_t ack_end_t,
    uint32_t send_end_t, int line, uint8_t mode, struct rack_sendmap *rsm)
{
	if (tcp_bblogging_on(rack->rc_tp) && (rack_verbose_logging != 0)) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		memset(&log, 0, sizeof(log));
		log.u_bbr.flex1 = seq_end;
		log.u_bbr.flex2 = rack->rc_tp->gput_seq;
		log.u_bbr.flex3 = ack_end_t;
		log.u_bbr.flex4 = rack->rc_tp->gput_ts;
		log.u_bbr.flex5 = send_end_t;
		log.u_bbr.flex6 = rack->rc_tp->gput_ack;
		log.u_bbr.flex7 = mode;
		log.u_bbr.flex8 = 69;
		log.u_bbr.rttProp = rack->r_ctl.rc_gp_cumack_ts;
		log.u_bbr.delRate = rack->r_ctl.rc_gp_output_ts;
		log.u_bbr.pkts_out = line;
		log.u_bbr.cwnd_gain = rack->app_limited_needs_set;
		log.u_bbr.pkt_epoch = rack->r_ctl.rc_app_limited_cnt;
		log.u_bbr.epoch = rack->r_ctl.current_round;
		log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost;
		if (rsm != NULL) {
			log.u_bbr.applimited = rsm->r_start;
			log.u_bbr.delivered = rsm->r_end;
			log.u_bbr.epoch = rsm->r_flags;
		}
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		TCP_LOG_EVENTP(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    BBR_LOG_HPTSI_CALC, 0,
		    0, &log, false, &tv);
	}
}

static int
sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
{
	uint32_t stat;
	int32_t error;

	error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
	if (error || req->newptr == NULL)
		return (error);

	error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
	if (error)
		return (error);
	if (stat == 1) {
#ifdef INVARIANTS
		printf("Clearing RACK counters\n");
#endif
		counter_u64_zero(rack_tlp_tot);
		counter_u64_zero(rack_tlp_newdata);
		counter_u64_zero(rack_tlp_retran);
		counter_u64_zero(rack_tlp_retran_bytes);
		counter_u64_zero(rack_to_tot);
		counter_u64_zero(rack_saw_enobuf);
		counter_u64_zero(rack_saw_enobuf_hw);
		counter_u64_zero(rack_saw_enetunreach);
		counter_u64_zero(rack_persists_sends);
		counter_u64_zero(rack_total_bytes);
		counter_u64_zero(rack_persists_acks);
		counter_u64_zero(rack_persists_loss);
		counter_u64_zero(rack_persists_lost_ends);
#ifdef INVARIANTS
		counter_u64_zero(rack_adjust_map_bw);
#endif
		counter_u64_zero(rack_to_alloc_hard);
		counter_u64_zero(rack_to_alloc_emerg);
		counter_u64_zero(rack_sack_proc_all);
		counter_u64_zero(rack_fto_send);
		counter_u64_zero(rack_fto_rsm_send);
		counter_u64_zero(rack_extended_rfo);
		counter_u64_zero(rack_hw_pace_init_fail);
		counter_u64_zero(rack_hw_pace_lost);
		counter_u64_zero(rack_non_fto_send);
		counter_u64_zero(rack_nfto_resend);
		counter_u64_zero(rack_sack_proc_short);
		counter_u64_zero(rack_sack_proc_restart);
		counter_u64_zero(rack_to_alloc);
		counter_u64_zero(rack_to_alloc_limited);
		counter_u64_zero(rack_alloc_limited_conns);
		counter_u64_zero(rack_split_limited);
		counter_u64_zero(rack_rxt_clamps_cwnd);
		counter_u64_zero(rack_rxt_clamps_cwnd_uniq);
		counter_u64_zero(rack_multi_single_eq);
		counter_u64_zero(rack_proc_non_comp_ack);
		counter_u64_zero(rack_try_scwnd);
		counter_u64_zero(rack_collapsed_win);
		counter_u64_zero(rack_collapsed_win_rxt);
		counter_u64_zero(rack_collapsed_win_seen);
		counter_u64_zero(rack_collapsed_win_rxt_bytes);
	} else if (stat == 2) {
#ifdef INVARIANTS
		printf("Clearing RACK option array\n");
#endif
		COUNTER_ARRAY_ZERO(rack_opts_arry, RACK_OPTS_SIZE);
	} else if (stat == 3) {
		printf("Rack has no stats counters to clear (use 1 to clear all stats in sysctl node)\n");
	} else if (stat == 4) {
#ifdef INVARIANTS
		printf("Clearing RACK out size array\n");
#endif
		COUNTER_ARRAY_ZERO(rack_out_size, TCP_MSS_ACCT_SIZE);
	}
	rack_clear_counter = 0;
	return (0);
}
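/*
 * Usage sketch (assumes this handler gets attached as a "clear" leaf
 * under the stack's sysctl root, e.g. net.inet.tcp.<stackname>.clear;
 * the actual registration happens elsewhere):
 *
 *	# sysctl net.inet.tcp.rack.clear=1	(zero all stats counters)
 *	# sysctl net.inet.tcp.rack.clear=2	(zero the option array)
 *	# sysctl net.inet.tcp.rack.clear=4	(zero the out-size array)
 */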

static void
rack_init_sysctls(void)
{
	struct sysctl_oid *rack_counters;
	struct sysctl_oid *rack_pacing;
	struct sysctl_oid *rack_timely;
	struct sysctl_oid *rack_timers;
	struct sysctl_oid *rack_tlp;
	struct sysctl_oid *rack_misc;
	struct sysctl_oid *rack_features;
	struct sysctl_oid *rack_measure;
	struct sysctl_oid *rack_probertt;
	struct sysctl_oid *rack_hw_pacing;

	rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "stats",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Rack Counters");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "rate_sample_method", CTLFLAG_RW,
	    &rack_rate_sample_method, USE_RTT_LOW,
	    "What method should we use for rate sampling 0=high, 1=low");
	/* Probe rtt related controls */
	rack_probertt = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "probertt",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "ProbeRTT related Controls");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "exit_per_hpb", CTLFLAG_RW,
	    &rack_atexit_prtt_hbp, 130,
	    "What percentage above goodput do we clamp CA/SS to at exit on a high-BDP path (130%)");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "exit_per_nonhpb", CTLFLAG_RW,
	    &rack_atexit_prtt, 130,
	    "What percentage above goodput do we clamp CA/SS to at exit on a non high-BDP path (130%)");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "gp_per_mul", CTLFLAG_RW,
	    &rack_per_of_gp_probertt, 60,
	    "What percentage of goodput do we pace at in probertt");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "gp_per_reduce", CTLFLAG_RW,
	    &rack_per_of_gp_probertt_reduce, 10,
	    "What percentage of goodput do we reduce every gp_srtt");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "gp_per_low", CTLFLAG_RW,
	    &rack_per_of_gp_lowthresh, 40,
	    "What percentage of goodput do we allow the multiplier to fall to");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "time_between", CTLFLAG_RW,
	    &rack_time_between_probertt, 9600000,
	    "How many useconds after the lowest rtt falls must pass before we enter probertt");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "safety", CTLFLAG_RW,
	    &rack_probe_rtt_safety_val, 2000000,
	    "If not zero, provides a maximum usecond that you can stay in probertt (2sec = 2000000)");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "sets_cwnd", CTLFLAG_RW,
	    &rack_probe_rtt_sets_cwnd, 0,
	    "Do we set the cwnd too (if always_lower is on)");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "maxdrainsrtts", CTLFLAG_RW,
	    &rack_max_drain_wait, 2,
	    "Maximum number of gp_srtt's to hold in drain waiting for flight to reach goal");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "mustdrainsrtts", CTLFLAG_RW,
	    &rack_must_drain, 1,
	    "We must drain this many gp_srtt's waiting for flight to reach goal");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "goal_use_min_entry", CTLFLAG_RW,
	    &rack_probertt_use_min_rtt_entry, 1,
	    "Should we use the min-rtt to calculate the goal rtt (else gp_srtt) at entry");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "goal_use_min_exit", CTLFLAG_RW,
	    &rack_probertt_use_min_rtt_exit, 0,
	    "How to set cwnd at exit: 0 - dynamic, 1 - use min-rtt, 2 - use curgprtt, 3 - entry gp-rtt");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "length_div", CTLFLAG_RW,
	    &rack_probertt_gpsrtt_cnt_div, 0,
	    "How many recent goodput srtt periods plus hold time does probertt last (bottom of fraction)");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "length_mul", CTLFLAG_RW,
	    &rack_probertt_gpsrtt_cnt_mul, 0,
	    "How many recent goodput srtt periods plus hold time does probertt last (top of fraction)");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "holdtim_at_target", CTLFLAG_RW,
	    &rack_min_probertt_hold, 40000,
	    "What is the minimum time we hold probertt at target");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "filter_life", CTLFLAG_RW,
	    &rack_probertt_filter_life, 10000000,
	    "What is the time for the filter's life in useconds");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "lower_within", CTLFLAG_RW,
	    &rack_probertt_lower_within, 10,
	    "If the rtt goes lower within this percentage of the time, go into probe-rtt");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "must_move", CTLFLAG_RW,
	    &rack_min_rtt_movement, 250000,
	    "How much is the minimum movement in rtt to count as a drop for probertt purposes");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "clear_is_cnts", CTLFLAG_RW,
	    &rack_probertt_clear_is, 1,
	    "Do we clear I/S counts on exiting probe-rtt");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "hbp_extra_drain", CTLFLAG_RW,
	    &rack_max_drain_hbp, 1,
	    "How many extra drain gpsrtt's do we get in highly buffered paths");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "hbp_threshold", CTLFLAG_RW,
	    &rack_hbp_thresh, 3,
	    "We are highly buffered if max_rtt_seen / min_rtt_seen > this-threshold");
	/* Pacing related sysctls */
	rack_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "pacing",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Pacing related Controls");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "pcm_enabled", CTLFLAG_RW,
	    &rack_pcm_is_enabled, 1,
	    "Do we by default do PCM measurements?");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "pcm_rnds", CTLFLAG_RW,
	    &rack_pcm_every_n_rounds, 100,
	    "How many rounds before we need to do a PCM measurement");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "pcm_blast", CTLFLAG_RW,
	    &rack_pcm_blast, 0,
	    "Blast out the full cwnd/rwnd when doing a PCM measurement");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "rnd_gp_gain", CTLFLAG_RW,
	    &rack_gp_gain_req, 1200,
	    "How much do we have to increase the GP to record the round 1200 = 120.0");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "dgp_out_of_ss_at", CTLFLAG_RW,
	    &rack_rnd_cnt_req, 0x10005,
	    "How many rounds less than rnd_gp_gain will drop us out of SS");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "no_timely", CTLFLAG_RW,
	    &rack_timely_off, 0,
	    "Do we not use timely in DGP?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "fillcw", CTLFLAG_RW,
	    &rack_fill_cw_state, 0,
	    "Enable fillcw on new connections (default=0 off)?");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "min_burst", CTLFLAG_RW,
	    &rack_pacing_min_seg, 0,
	    "What is the min burst size for pacing (0 disables)?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "divisor", CTLFLAG_RW,
	    &rack_default_pacing_divisor, 250,
	    "What is the default divisor given to the rl code?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "fillcw_max_mult", CTLFLAG_RW,
	    &rack_bw_multipler, 0,
	    "What is the limit multiplier of the current gp_est that fillcw can increase the b/w to, 200 == 200% (0 = off)?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "max_pace_over", CTLFLAG_RW,
	    &rack_max_per_above, 30,
	    "What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "allow1mss", CTLFLAG_RW,
	    &rack_pace_one_seg, 0,
	    "Do we allow low b/w pacing of 1MSS instead of two (1.2Meg and less)?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "limit_wsrtt", CTLFLAG_RW,
	    &rack_limit_time_with_srtt, 0,
	    "Do we limit pacing time based on srtt");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "gp_per_ss", CTLFLAG_RW,
	    &rack_per_of_gp_ss, 250,
	    "If non zero, what percentage of goodput to pace at in slow start");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "gp_per_ca", CTLFLAG_RW,
	    &rack_per_of_gp_ca, 200,
	    "If non zero, what percentage of goodput to pace at in congestion avoidance");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "gp_per_rec", CTLFLAG_RW,
	    &rack_per_of_gp_rec, 200,
	    "If non zero, what percentage of goodput to pace at in recovery");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "pace_max_seg", CTLFLAG_RW,
	    &rack_hptsi_segments, 40,
	    "What size is the max for TSO segments in pacing and burst mitigation");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "burst_reduces", CTLFLAG_RW,
	    &rack_pacing_delay_reduction, 4,
	    "When doing only burst mitigation what is the reduce divisor");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "use_pacing", CTLFLAG_RW,
	    &rack_pace_every_seg, 0,
	    "If set we use pacing, if clear we use only the original burst mitigation");
	SYSCTL_ADD_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "rate_cap", CTLFLAG_RW,
	    &rack_bw_rate_cap, 0,
	    "If set we apply this value to the absolute rate cap used by pacing");
	SYSCTL_ADD_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "fillcw_cap", CTLFLAG_RW,
	    &rack_fillcw_bw_cap, 3750000,
	    "Do we have an absolute cap on the amount of b/w fillcw can specify (0 = no)?");
	SYSCTL_ADD_U8(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "req_measure_cnt", CTLFLAG_RW,
	    &rack_req_measurements, 1,
	    "If doing dynamic pacing, how many measurements must be in before we start pacing?");
	/* Hardware pacing */
	rack_hw_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "hdwr_pacing",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Hardware pacing related Controls");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "rwnd_factor", CTLFLAG_RW,
	    &rack_hw_rwnd_factor, 2,
	    "How many times does snd_wnd need to be bigger than pace_max_seg so we will hold off and get more acks?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "precheck", CTLFLAG_RW,
	    &rack_hw_check_queue, 0,
	    "Do we always precheck the hdwr pacing queue to avoid ENOBUF's?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "pace_enobuf_mult", CTLFLAG_RW,
	    &rack_enobuf_hw_boost_mult, 0,
	    "By how many time_betweens should we boost the pacing time if we see an ENOBUFS?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "pace_enobuf_max", CTLFLAG_RW,
	    &rack_enobuf_hw_max, 12000,
	    "What is the max boost of the pacing time if we see an ENOBUFS?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "pace_enobuf_min", CTLFLAG_RW,
	    &rack_enobuf_hw_min, 10000,
	    "What is the min boost of the pacing time if we see an ENOBUFS?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "enable", CTLFLAG_RW,
	    &rack_enable_hw_pacing, 0,
	    "Should RACK attempt to use hw pacing?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "rate_cap", CTLFLAG_RW,
	    &rack_hw_rate_caps, 0,
	    "Does the highest hardware pacing rate cap the rate we will send at?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "uncap_per", CTLFLAG_RW,
	    &rack_hw_rate_cap_per, 0,
	    "If you go over b/w by this amount you will be uncapped (0 = never)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "rate_min", CTLFLAG_RW,
	    &rack_hw_rate_min, 0,
	    "Do we need a minimum estimate of this many bytes per second in order to engage hw pacing?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "rate_to_low", CTLFLAG_RW,
	    &rack_hw_rate_to_low, 0,
	    "If we fall below this rate, disengage hw pacing?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "up_only", CTLFLAG_RW,
	    &rack_hw_up_only, 0,
	    "Do we allow hw pacing to lower the rate selected?");
	rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "timely",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Rack Timely RTT Controls");
	/* Timely based GP dynamics */
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "upper", CTLFLAG_RW,
	    &rack_gp_per_bw_mul_up, 2,
	    "Rack timely upper range for equal b/w (in percentage)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "lower", CTLFLAG_RW,
	    &rack_gp_per_bw_mul_down, 4,
	    "Rack timely lower range for equal b/w (in percentage)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "rtt_max_mul", CTLFLAG_RW,
	    &rack_gp_rtt_maxmul, 3,
	    "Rack timely multiplier of lowest rtt for rtt_max");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "rtt_min_div", CTLFLAG_RW,
	    &rack_gp_rtt_mindiv, 4,
	    "Rack timely divisor used for rtt + (rtt * mul/divisor) for check for lower rtt");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "rtt_min_mul", CTLFLAG_RW,
	    &rack_gp_rtt_minmul, 1,
	    "Rack timely multiplier used for rtt + (rtt * mul/divisor) for check for lower rtt");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "decrease", CTLFLAG_RW,
	    &rack_gp_decrease_per, 80,
	    "Rack timely Beta value 80 = .8 (scaled by 100)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "increase", CTLFLAG_RW,
	    &rack_gp_increase_per, 2,
	    "Rack timely increase percentage of our GP multiplication factor");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "lowerbound", CTLFLAG_RW,
	    &rack_per_lower_bound, 50,
	    "Rack timely lowest percentage we allow GP multiplier to fall to");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "p5_upper", CTLFLAG_RW,
	    &rack_gain_p5_ub, 250,
	    "Profile 5 upper bound to timely gain");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "upperboundss", CTLFLAG_RW,
	    &rack_per_upper_bound_ss, 0,
	    "Rack timely highest percentage we allow GP multiplier in SS to raise to (0 is no upperbound)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "upperboundca", CTLFLAG_RW,
	    &rack_per_upper_bound_ca, 0,
	    "Rack timely highest percentage we allow GP multiplier in CA to raise to (0 is no upperbound)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "dynamicgp", CTLFLAG_RW,
	    &rack_do_dyn_mul, 0,
	    "Rack timely do we enable dynamic timely goodput by default");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "no_rec_red", CTLFLAG_RW,
	    &rack_gp_no_rec_chg, 1,
	    "Rack timely do we prohibit the recovery multiplier from being lowered");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "red_clear_cnt", CTLFLAG_RW,
	    &rack_timely_dec_clear, 6,
	    "Rack timely what threshold do we count to before another boost during b/w descent");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "max_push_rise", CTLFLAG_RW,
	    &rack_timely_max_push_rise, 3,
	    "Rack timely how many times do we push up with b/w increase");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "max_push_drop", CTLFLAG_RW,
	    &rack_timely_max_push_drop, 3,
	    "Rack timely how many times do we push back on b/w descent");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "min_segs", CTLFLAG_RW,
	    &rack_timely_min_segs, 4,
	    "Rack timely when setting the cwnd what is the min num segments");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "nonstop", CTLFLAG_RW,
	    &rack_timely_no_stopping, 0,
	    "Rack timely don't stop increase");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "dec_raise_thresh", CTLFLAG_RW,
	    &rack_down_raise_thresh, 100,
	    "If the CA or SS is below this threshold raise on the first 3 b/w lowers (0=always)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "bottom_drag_segs", CTLFLAG_RW,
	    &rack_req_segs, 1,
	    "Bottom dragging if not this many segments outstanding and room");

	/* TLP and Rack related parameters */
	rack_tlp = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "tlp",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "TLP and Rack related Controls");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "use_rrr", CTLFLAG_RW,
	    &use_rack_rr, 1,
	    "Do we use Rack Rapid Recovery");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "post_rec_labc", CTLFLAG_RW,
	    &rack_max_abc_post_recovery, 2,
	    "Since we do early recovery, do we override the l_abc to a value, if so what?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "nonrxt_use_cr", CTLFLAG_RW,
	    &rack_non_rxt_use_cr, 0,
	    "Do we use ss/ca rate if in recovery we are transmitting a new data chunk");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "tlpmethod", CTLFLAG_RW,
	    &rack_tlp_threshold_use, TLP_USE_TWO_ONE,
	    "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "limit", CTLFLAG_RW,
	    &rack_tlp_limit, 2,
	    "How many TLP's can be sent without sending new data");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "use_greater", CTLFLAG_RW,
	    &rack_tlp_use_greater, 1,
	    "Should we use the rack_rtt time if it's greater than srtt");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "tlpminto", CTLFLAG_RW,
	    &rack_tlp_min, 10000,
	    "TLP minimum timeout per the specification (in microseconds)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "send_oldest", CTLFLAG_RW,
	    &rack_always_send_oldest, 0,
	    "Should we always send the oldest TLP and RACK-TLP");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
	    &rack_lower_cwnd_at_tlp, 0,
	    "When a TLP completes a retran should we enter recovery");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "reorder_thresh", CTLFLAG_RW,
	    &rack_reorder_thresh, 2,
	    "What factor for rack will be added when seeing reordering (shift right)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
	    &rack_tlp_thresh, 1,
	    "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "reorder_fade", CTLFLAG_RW,
	    &rack_reorder_fade, 60000000,
	    "Does reorder detection fade, if so how many microseconds (0 means never)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "pktdelay", CTLFLAG_RW,
	    &rack_pkt_delay, 1000,
	    "Extra RACK time (in microseconds) besides reordering thresh");

	/* Timer related controls */
	rack_timers = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "timers",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Timer related controls");
	SYSCTL_ADD_U8(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timers),
	    OID_AUTO, "reset_ssth_rec_rto", CTLFLAG_RW,
	    &rack_ssthresh_rest_rto_rec, 0,
	    "When doing recovery -> rto -> recovery do we reset SSthresh?");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timers),
	    OID_AUTO, "scoreboard_thresh", CTLFLAG_RW,
	    &rack_rxt_scoreboard_clear_thresh, 2,
	    "How many RTO's are allowed before we clear the scoreboard");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timers),
	    OID_AUTO, "honor_hpts_min", CTLFLAG_RW,
	    &rack_honors_hpts_min_to, 1,
	    "Do rack pacing timers honor hpts min timeout");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timers),
	    OID_AUTO, "hpts_max_reduce", CTLFLAG_RW,
	    &rack_max_reduce, 10,
	    "Max percentage we will reduce pacing delay by for pacing when we are behind");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timers),
	    OID_AUTO, "persmin", CTLFLAG_RW,
	    &rack_persist_min, 250000,
	    "What is the minimum time in microseconds between persists");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timers),
	    OID_AUTO, "persmax", CTLFLAG_RW,
	    &rack_persist_max, 2000000,
	    "What is the largest delay in microseconds between persists");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timers),
	    OID_AUTO, "delayed_ack", CTLFLAG_RW,
	    &rack_delayed_ack_time, 40000,
	    "Delayed ack time (40ms in microseconds)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timers),
	    OID_AUTO, "minrto", CTLFLAG_RW,
	    &rack_rto_min, 30000,
	    "Minimum RTO in microseconds -- set with caution below 1000 due to TLP");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timers),
	    OID_AUTO, "maxrto", CTLFLAG_RW,
	    &rack_rto_max, 4000000,
	    "Maximum RTO in microseconds -- should be at least as large as min_rto");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timers),
	    OID_AUTO, "minto", CTLFLAG_RW,
	    &rack_min_to, 1000,
	    "Minimum rack timeout in microseconds");
1397 /* Measure controls */
1398 rack_measure = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1399 SYSCTL_CHILDREN(rack_sysctl_root),
1400 OID_AUTO,
1401 "measure",
1402 CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1403 "Measure related controls");
1404 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1405 SYSCTL_CHILDREN(rack_measure),
1406 OID_AUTO, "wma_divisor", CTLFLAG_RW,
1407 &rack_wma_divisor, 8,
1408 "When doing b/w calculation what is the divisor for the WMA");
1409 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1410 SYSCTL_CHILDREN(rack_measure),
1411 OID_AUTO, "end_cwnd", CTLFLAG_RW,
1412 &rack_cwnd_block_ends_measure, 0,
1413 "Does a cwnd just-return end the measurement window (app limited)");
1414 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1415 SYSCTL_CHILDREN(rack_measure),
1416 OID_AUTO, "end_rwnd", CTLFLAG_RW,
1417 &rack_rwnd_block_ends_measure, 0,
1418 "Does an rwnd just-return end the measurement window (app limited -- not persists)");
1419 SYSCTL_ADD_U32(&rack_sysctl_ctx,
1420 SYSCTL_CHILDREN(rack_measure),
1421 OID_AUTO, "min_target", CTLFLAG_RW,
1422 &rack_def_data_window, 20,
1423 "What is the minimum target window (in mss) for a GP measurements");
1424 SYSCTL_ADD_U32(&rack_sysctl_ctx,
1425 SYSCTL_CHILDREN(rack_measure),
1426 OID_AUTO, "goal_bdp", CTLFLAG_RW,
1427 &rack_goal_bdp, 2,
1428 "What is the goal BDP to measure");
1429 SYSCTL_ADD_U32(&rack_sysctl_ctx,
1430 SYSCTL_CHILDREN(rack_measure),
1431 OID_AUTO, "min_srtts", CTLFLAG_RW,
1432 &rack_min_srtts, 1,
1433 "What is the goal BDP to measure");
1434 SYSCTL_ADD_U32(&rack_sysctl_ctx,
1435 SYSCTL_CHILDREN(rack_measure),
1436 OID_AUTO, "min_measure_tim", CTLFLAG_RW,
1437 &rack_min_measure_usec, 0,
1438 "What is the Minimum time time for a measurement if 0, this is off");
1439 /* Features */
1440 rack_features = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1441 SYSCTL_CHILDREN(rack_sysctl_root),
1442 OID_AUTO,
1443 "features",
1444 CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1445 "Feature controls");
1446 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1447 SYSCTL_CHILDREN(rack_features),
1448 OID_AUTO, "hybrid_set_maxseg", CTLFLAG_RW,
1449 &rack_hybrid_allow_set_maxseg, 0,
1450 "Should hybrid pacing allow the setmss command");
1451 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1452 SYSCTL_CHILDREN(rack_features),
1453 OID_AUTO, "cmpack", CTLFLAG_RW,
1454 &rack_use_cmp_acks, 1,
1455 "Should RACK have LRO send compressed acks");
1456 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1457 SYSCTL_CHILDREN(rack_features),
1458 OID_AUTO, "fsb", CTLFLAG_RW,
1459 &rack_use_fsb, 1,
1460 "Should RACK use the fast send block?");
1461 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1462 SYSCTL_CHILDREN(rack_features),
1463 OID_AUTO, "rfo", CTLFLAG_RW,
1464 &rack_use_rfo, 1,
1465 "Should RACK use rack_fast_output()?");
1466 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1467 SYSCTL_CHILDREN(rack_features),
1468 OID_AUTO, "rsmrfo", CTLFLAG_RW,
1469 &rack_use_rsm_rfo, 1,
1470 "Should RACK use rack_fast_rsm_output()?");
1471 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1472 SYSCTL_CHILDREN(rack_features),
1473 OID_AUTO, "non_paced_lro_queue", CTLFLAG_RW,
1474 &rack_enable_mqueue_for_nonpaced, 0,
1475 "Should RACK use mbuf queuing for non-paced connections");
1476 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1477 SYSCTL_CHILDREN(rack_features),
1478 OID_AUTO, "hystartplusplus", CTLFLAG_RW,
1479 &rack_do_hystart, 0,
1480 "Should RACK enable HyStart++ on connections?");
1481 /* Misc rack controls */
1482 rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
1483 SYSCTL_CHILDREN(rack_sysctl_root),
1484 OID_AUTO,
1485 "misc",
1486 CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
1487 "Misc related controls");
1488 #ifdef TCP_ACCOUNTING
1489 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1490 SYSCTL_CHILDREN(rack_misc),
1491 OID_AUTO, "tcp_acct", CTLFLAG_RW,
1492 &rack_tcp_accounting, 0,
1493 "Should we turn on TCP accounting for all rack sessions?");
1494 #endif
1495 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1496 SYSCTL_CHILDREN(rack_misc),
1497 OID_AUTO, "dnd", CTLFLAG_RW,
1498 &rack_dnd_default, 0,
1499 "Do not disturb default for rack_rrr = 3");
1500 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1501 SYSCTL_CHILDREN(rack_misc),
1502 OID_AUTO, "rxt_controls", CTLFLAG_RW,
1503 &rack_rxt_controls, 0,
1504 "Retransmit sending size controls (valid values 0, 1, 2 default=1)?");
1505 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1506 SYSCTL_CHILDREN(rack_misc),
1507 OID_AUTO, "rack_hibeta", CTLFLAG_RW,
1508 &rack_hibeta_setting, 0,
1509 "Do we ue a high beta (80 instead of 50)?");
1510 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1511 SYSCTL_CHILDREN(rack_misc),
1512 OID_AUTO, "apply_rtt_with_low_conf", CTLFLAG_RW,
1513 &rack_apply_rtt_with_reduced_conf, 0,
1514 "When a persist or keep-alive probe is not answered do we calculate rtt on subsequent answers?");
1515 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1516 SYSCTL_CHILDREN(rack_misc),
1517 OID_AUTO, "rack_dsack_ctl", CTLFLAG_RW,
1518 &rack_dsack_std_based, 3,
1519 "How do we process dsack with respect to rack timers, bit field, 3 is standards based?");
1520 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1521 SYSCTL_CHILDREN(rack_misc),
1522 OID_AUTO, "prr_addback_max", CTLFLAG_RW,
1523 &rack_prr_addbackmax, 2,
1524 "What is the maximum number of MSS we allow to be added back if prr can't send all its data?");
1525 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1526 SYSCTL_CHILDREN(rack_misc),
1527 OID_AUTO, "stats_gets_ms", CTLFLAG_RW,
1528 &rack_stats_gets_ms_rtt, 1,
1529 "What do we feed the stats framework (1 = ms_rtt, 0 = us_rtt, 2 = ms_rtt from hdwr, > 2 usec rtt from hdwr)?");
1530 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1531 SYSCTL_CHILDREN(rack_misc),
1532 OID_AUTO, "clientlowbuf", CTLFLAG_RW,
1533 &rack_client_low_buf, 0,
1534 "Client low buffer level (below this we are more aggressive in DGP exiting recovery (0 = off)?");
1535 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1536 SYSCTL_CHILDREN(rack_misc),
1537 OID_AUTO, "defprofile", CTLFLAG_RW,
1538 &rack_def_profile, 0,
1539 "Should RACK use a default profile (0=no, num == profile num)?");
1540 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1541 SYSCTL_CHILDREN(rack_misc),
1542 OID_AUTO, "shared_cwnd", CTLFLAG_RW,
1543 &rack_enable_shared_cwnd, 1,
1544 "Should RACK try to use the shared cwnd on connections where allowed");
1545 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1546 SYSCTL_CHILDREN(rack_misc),
1547 OID_AUTO, "limits_on_scwnd", CTLFLAG_RW,
1548 &rack_limits_scwnd, 1,
1549 "Should RACK place low end time limits on the shared cwnd feature");
1550 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1551 SYSCTL_CHILDREN(rack_misc),
1552 OID_AUTO, "no_prr", CTLFLAG_RW,
1553 &rack_disable_prr, 0,
1554 "Should RACK not use prr and only pace (must have pacing on)");
1555 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1556 SYSCTL_CHILDREN(rack_misc),
1557 OID_AUTO, "bb_verbose", CTLFLAG_RW,
1558 &rack_verbose_logging, 0,
1559 "Should RACK black box logging be verbose");
1560 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1561 SYSCTL_CHILDREN(rack_misc),
1562 OID_AUTO, "data_after_close", CTLFLAG_RW,
1563 &rack_ignore_data_after_close, 1,
1564 "Do we hold off sending a RST until all pending data is ack'd");
1565 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1566 SYSCTL_CHILDREN(rack_misc),
1567 OID_AUTO, "no_sack_needed", CTLFLAG_RW,
1568 &rack_sack_not_required, 1,
1569 "Do we allow rack to run on connections not supporting SACK");
1570 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1571 SYSCTL_CHILDREN(rack_misc),
1572 OID_AUTO, "prr_sendalot", CTLFLAG_RW,
1573 &rack_send_a_lot_in_prr, 1,
1574 "Send a lot in prr");
1575 SYSCTL_ADD_S32(&rack_sysctl_ctx,
1576 SYSCTL_CHILDREN(rack_misc),
1577 OID_AUTO, "autoscale", CTLFLAG_RW,
1578 &rack_autosndbuf_inc, 20,
1579 "What percentage should rack scale up its snd buffer by?");
1580
1581 /* Counters */
1582 rack_total_bytes = counter_u64_alloc(M_WAITOK);
1583 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1584 SYSCTL_CHILDREN(rack_counters),
1585 OID_AUTO, "totalbytes", CTLFLAG_RD,
1586 &rack_total_bytes,
1587 "Total number of bytes sent");
1588 rack_fto_send = counter_u64_alloc(M_WAITOK);
1589 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1590 SYSCTL_CHILDREN(rack_counters),
1591 OID_AUTO, "fto_send", CTLFLAG_RD,
1592 &rack_fto_send, "Total number of rack_fast_output sends");
1593 rack_fto_rsm_send = counter_u64_alloc(M_WAITOK);
1594 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1595 SYSCTL_CHILDREN(rack_counters),
1596 OID_AUTO, "fto_rsm_send", CTLFLAG_RD,
1597 &rack_fto_rsm_send, "Total number of rack_fast_rsm_output sends");
1598 rack_nfto_resend = counter_u64_alloc(M_WAITOK);
1599 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1600 SYSCTL_CHILDREN(rack_counters),
1601 OID_AUTO, "nfto_resend", CTLFLAG_RD,
1602 &rack_nfto_resend, "Total number of rack_output retransmissions");
1603 rack_non_fto_send = counter_u64_alloc(M_WAITOK);
1604 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1605 SYSCTL_CHILDREN(rack_counters),
1606 OID_AUTO, "nfto_send", CTLFLAG_RD,
1607 &rack_non_fto_send, "Total number of rack_output first sends");
1608 rack_extended_rfo = counter_u64_alloc(M_WAITOK);
1609 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1610 SYSCTL_CHILDREN(rack_counters),
1611 OID_AUTO, "rfo_extended", CTLFLAG_RD,
1612 &rack_extended_rfo, "Total number of times we extended rfo");
1613
1614 rack_hw_pace_init_fail = counter_u64_alloc(M_WAITOK);
1615 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1616 SYSCTL_CHILDREN(rack_counters),
1617 OID_AUTO, "hwpace_init_fail", CTLFLAG_RD,
1618 &rack_hw_pace_init_fail, "Total number of times we failed to initialize hw pacing");
1619 rack_hw_pace_lost = counter_u64_alloc(M_WAITOK);
1620
1621 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1622 SYSCTL_CHILDREN(rack_counters),
1623 OID_AUTO, "hwpace_lost", CTLFLAG_RD,
1624 &rack_hw_pace_lost, "Total number of times we failed to initialize hw pacing");
1625 rack_tlp_tot = counter_u64_alloc(M_WAITOK);
1626 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1627 SYSCTL_CHILDREN(rack_counters),
1628 OID_AUTO, "tlp_to_total", CTLFLAG_RD,
1629 &rack_tlp_tot,
1630 "Total number of tail loss probe expirations");
1631 rack_tlp_newdata = counter_u64_alloc(M_WAITOK);
1632 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1633 SYSCTL_CHILDREN(rack_counters),
1634 OID_AUTO, "tlp_new", CTLFLAG_RD,
1635 &rack_tlp_newdata,
1636 "Total number of tail loss probe sending new data");
1637 rack_tlp_retran = counter_u64_alloc(M_WAITOK);
1638 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1639 SYSCTL_CHILDREN(rack_counters),
1640 OID_AUTO, "tlp_retran", CTLFLAG_RD,
1641 &rack_tlp_retran,
1642 "Total number of tail loss probe sending retransmitted data");
1643 rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK);
1644 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1645 SYSCTL_CHILDREN(rack_counters),
1646 OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD,
1647 &rack_tlp_retran_bytes,
1648 "Total bytes of tail loss probe sending retransmitted data");
1649 rack_to_tot = counter_u64_alloc(M_WAITOK);
1650 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1651 SYSCTL_CHILDREN(rack_counters),
1652 OID_AUTO, "rack_to_tot", CTLFLAG_RD,
1653 &rack_to_tot,
1654 "Total number of times the rack to expired");
1655 rack_saw_enobuf = counter_u64_alloc(M_WAITOK);
1656 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1657 SYSCTL_CHILDREN(rack_counters),
1658 OID_AUTO, "saw_enobufs", CTLFLAG_RD,
1659 &rack_saw_enobuf,
1660 "Total number of times a sends returned enobuf for non-hdwr paced connections");
1661 rack_saw_enobuf_hw = counter_u64_alloc(M_WAITOK);
1662 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1663 SYSCTL_CHILDREN(rack_counters),
1664 OID_AUTO, "saw_enobufs_hw", CTLFLAG_RD,
1665 &rack_saw_enobuf_hw,
1666 "Total number of times a send returned enobuf for hdwr paced connections");
1667 rack_saw_enetunreach = counter_u64_alloc(M_WAITOK);
1668 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1669 SYSCTL_CHILDREN(rack_counters),
1670 OID_AUTO, "saw_enetunreach", CTLFLAG_RD,
1671 &rack_saw_enetunreach,
1672 "Total number of times a send received a enetunreachable");
1673 rack_hot_alloc = counter_u64_alloc(M_WAITOK);
1674 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1675 SYSCTL_CHILDREN(rack_counters),
1676 OID_AUTO, "alloc_hot", CTLFLAG_RD,
1677 &rack_hot_alloc,
1678 "Total allocations from the top of our list");
1679 rack_to_alloc = counter_u64_alloc(M_WAITOK);
1680 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1681 SYSCTL_CHILDREN(rack_counters),
1682 OID_AUTO, "allocs", CTLFLAG_RD,
1683 &rack_to_alloc,
1684 "Total allocations of tracking structures");
1685 rack_to_alloc_hard = counter_u64_alloc(M_WAITOK);
1686 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1687 SYSCTL_CHILDREN(rack_counters),
1688 OID_AUTO, "allochard", CTLFLAG_RD,
1689 &rack_to_alloc_hard,
1690 "Total allocations done with sleeping the hard way");
1691 rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK);
1692 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1693 SYSCTL_CHILDREN(rack_counters),
1694 OID_AUTO, "allocemerg", CTLFLAG_RD,
1695 &rack_to_alloc_emerg,
1696 "Total allocations done from emergency cache");
1697 rack_to_alloc_limited = counter_u64_alloc(M_WAITOK);
1698 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1699 SYSCTL_CHILDREN(rack_counters),
1700 OID_AUTO, "alloc_limited", CTLFLAG_RD,
1701 &rack_to_alloc_limited,
1702 "Total allocations dropped due to limit");
1703 rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK);
1704 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1705 SYSCTL_CHILDREN(rack_counters),
1706 OID_AUTO, "alloc_limited_conns", CTLFLAG_RD,
1707 &rack_alloc_limited_conns,
1708 "Connections with allocations dropped due to limit");
1709 rack_split_limited = counter_u64_alloc(M_WAITOK);
1710 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1711 SYSCTL_CHILDREN(rack_counters),
1712 OID_AUTO, "split_limited", CTLFLAG_RD,
1713 &rack_split_limited,
1714 "Split allocations dropped due to limit");
1715 rack_rxt_clamps_cwnd = counter_u64_alloc(M_WAITOK);
1716 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1717 SYSCTL_CHILDREN(rack_counters),
1718 OID_AUTO, "rxt_clamps_cwnd", CTLFLAG_RD,
1719 &rack_rxt_clamps_cwnd,
1720 "Number of times that excessive rxt clamped the cwnd down");
1721 rack_rxt_clamps_cwnd_uniq = counter_u64_alloc(M_WAITOK);
1722 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1723 SYSCTL_CHILDREN(rack_counters),
1724 OID_AUTO, "rxt_clamps_cwnd_uniq", CTLFLAG_RD,
1725 &rack_rxt_clamps_cwnd_uniq,
1726 "Number of connections that have had excessive rxt clamped the cwnd down");
1727 rack_persists_sends = counter_u64_alloc(M_WAITOK);
1728 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1729 SYSCTL_CHILDREN(rack_counters),
1730 OID_AUTO, "persist_sends", CTLFLAG_RD,
1731 &rack_persists_sends,
1732 "Number of times we sent a persist probe");
1733 rack_persists_acks = counter_u64_alloc(M_WAITOK);
1734 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1735 SYSCTL_CHILDREN(rack_counters),
1736 OID_AUTO, "persist_acks", CTLFLAG_RD,
1737 &rack_persists_acks,
1738 "Number of times a persist probe was acked");
1739 rack_persists_loss = counter_u64_alloc(M_WAITOK);
1740 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1741 SYSCTL_CHILDREN(rack_counters),
1742 OID_AUTO, "persist_loss", CTLFLAG_RD,
1743 &rack_persists_loss,
1744 "Number of times we detected a lost persist probe (no ack)");
1745 rack_persists_lost_ends = counter_u64_alloc(M_WAITOK);
1746 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1747 SYSCTL_CHILDREN(rack_counters),
1748 OID_AUTO, "persist_loss_ends", CTLFLAG_RD,
1749 &rack_persists_lost_ends,
1750 "Number of lost persist probe (no ack) that the run ended with a PERSIST abort");
1751 #ifdef INVARIANTS
1752 rack_adjust_map_bw = counter_u64_alloc(M_WAITOK);
1753 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1754 SYSCTL_CHILDREN(rack_counters),
1755 OID_AUTO, "map_adjust_req", CTLFLAG_RD,
1756 &rack_adjust_map_bw,
1757 "Number of times we hit the case where the sb went up and down on a sendmap entry");
1758 #endif
1759 rack_multi_single_eq = counter_u64_alloc(M_WAITOK);
1760 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1761 SYSCTL_CHILDREN(rack_counters),
1762 OID_AUTO, "cmp_ack_equiv", CTLFLAG_RD,
1763 &rack_multi_single_eq,
1764 "Number of compressed acks total represented");
1765 rack_proc_non_comp_ack = counter_u64_alloc(M_WAITOK);
1766 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1767 SYSCTL_CHILDREN(rack_counters),
1768 OID_AUTO, "cmp_ack_not", CTLFLAG_RD,
1769 &rack_proc_non_comp_ack,
1770 "Number of non compresseds acks that we processed");
1771
1772
1773 rack_sack_proc_all = counter_u64_alloc(M_WAITOK);
1774 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1775 SYSCTL_CHILDREN(rack_counters),
1776 OID_AUTO, "sack_long", CTLFLAG_RD,
1777 &rack_sack_proc_all,
1778 "Total times we had to walk whole list for sack processing");
1779 rack_sack_proc_restart = counter_u64_alloc(M_WAITOK);
1780 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1781 SYSCTL_CHILDREN(rack_counters),
1782 OID_AUTO, "sack_restart", CTLFLAG_RD,
1783 &rack_sack_proc_restart,
1784 "Total times we had to walk whole list due to a restart");
1785 rack_sack_proc_short = counter_u64_alloc(M_WAITOK);
1786 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1787 SYSCTL_CHILDREN(rack_counters),
1788 OID_AUTO, "sack_short", CTLFLAG_RD,
1789 &rack_sack_proc_short,
1790 "Total times we took shortcut for sack processing");
1791 rack_input_idle_reduces = counter_u64_alloc(M_WAITOK);
1792 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1793 SYSCTL_CHILDREN(rack_counters),
1794 OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD,
1795 &rack_input_idle_reduces,
1796 "Total number of idle reductions on input");
1797 rack_collapsed_win_seen = counter_u64_alloc(M_WAITOK);
1798 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1799 SYSCTL_CHILDREN(rack_counters),
1800 OID_AUTO, "collapsed_win_seen", CTLFLAG_RD,
1801 &rack_collapsed_win_seen,
1802 "Total number of collapsed window events seen (where our window shrinks)");
1803
1804 rack_collapsed_win = counter_u64_alloc(M_WAITOK);
1805 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1806 SYSCTL_CHILDREN(rack_counters),
1807 OID_AUTO, "collapsed_win", CTLFLAG_RD,
1808 &rack_collapsed_win,
1809 "Total number of collapsed window events where we mark packets");
1810 rack_collapsed_win_rxt = counter_u64_alloc(M_WAITOK);
1811 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1812 SYSCTL_CHILDREN(rack_counters),
1813 OID_AUTO, "collapsed_win_rxt", CTLFLAG_RD,
1814 &rack_collapsed_win_rxt,
1815 "Total number of packets that were retransmitted");
1816 rack_collapsed_win_rxt_bytes = counter_u64_alloc(M_WAITOK);
1817 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1818 SYSCTL_CHILDREN(rack_counters),
1819 OID_AUTO, "collapsed_win_bytes", CTLFLAG_RD,
1820 &rack_collapsed_win_rxt_bytes,
1821 "Total number of bytes that were retransmitted");
1822 rack_try_scwnd = counter_u64_alloc(M_WAITOK);
1823 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
1824 SYSCTL_CHILDREN(rack_counters),
1825 OID_AUTO, "tried_scwnd", CTLFLAG_RD,
1826 &rack_try_scwnd,
1827 "Total number of scwnd attempts");
1828 COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK);
1829 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
1830 OID_AUTO, "outsize", CTLFLAG_RD,
1831 rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes");
1832 COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK);
1833 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root),
1834 OID_AUTO, "opts", CTLFLAG_RD,
1835 rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats");
1836 SYSCTL_ADD_PROC(&rack_sysctl_ctx,
1837 SYSCTL_CHILDREN(rack_sysctl_root),
1838 OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
1839 &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters");
1840 }
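
/*
 * Illustrative only: assuming the stack's sysctl root registered above hangs
 * off net.inet.tcp (e.g. net.inet.tcp.<stack>, where <stack> depends on how
 * rack_sysctl_root was created outside this section), an administrator could
 * read and tune the knobs created here with sysctl(8), for example:
 *
 *   sysctl net.inet.tcp.<stack>.timers.minrto          # read the 30000 usec default
 *   sysctl net.inet.tcp.<stack>.misc.shared_cwnd=0     # disable the shared cwnd
 *   sysctl net.inet.tcp.<stack>.clear=1                # zero the read-only counters above
 */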
1841
1842 static uint32_t
1843 rc_init_window(struct tcp_rack *rack)
1844 {
1845 return (tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)));
1846
1847 }
1848
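/*
 * When fixed-rate pacing is in use, pick which of the three configured
 * rates applies: the recovery rate while in fast recovery, the slow-start
 * rate while cwnd is below ssthresh, and otherwise the congestion-avoidance
 * rate.
 */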
1849 static uint64_t
1850 rack_get_fixed_pacing_bw(struct tcp_rack *rack)
1851 {
1852 if (IN_FASTRECOVERY(rack->rc_tp->t_flags))
1853 return (rack->r_ctl.rc_fixed_pacing_rate_rec);
1854 else if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh)
1855 return (rack->r_ctl.rc_fixed_pacing_rate_ss);
1856 else
1857 return (rack->r_ctl.rc_fixed_pacing_rate_ca);
1858 }
1859
1860 static void
1861 rack_log_hybrid_bw(struct tcp_rack *rack, uint32_t seq, uint64_t cbw, uint64_t tim,
1862 uint64_t data, uint8_t mod, uint16_t aux,
1863 struct tcp_sendfile_track *cur, int line)
1864 {
1865 #ifdef TCP_REQUEST_TRK
1866 int do_log = 0;
1867
1868 /*
1869 * The rate-cap logs are noisy and should only come out when normal BB logging
1870 * is enabled; the other logs (not RATE_CAP and not CAP_CALC) only come out
1871 * once per chunk and make up the BBpoint that can be turned on by the client.
1872 */
1873 if ((mod == HYBRID_LOG_RATE_CAP) || (mod == HYBRID_LOG_CAP_CALC)) {
1874 /*
1875 * The very noisy two need to only come out when
1876 * we have verbose logging on.
1877 */
1878 if (rack_verbose_logging != 0)
1879 do_log = tcp_bblogging_on(rack->rc_tp);
1880 else
1881 do_log = 0;
1882 } else if (mod != HYBRID_LOG_BW_MEASURE) {
1883 /*
1884 * All other less noisy logs here except the measure which
1885 * also needs to come out on the point and the log.
1886 */
1887 do_log = tcp_bblogging_on(rack->rc_tp);
1888 } else {
1889 do_log = tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING);
1890 }
1891
1892 if (do_log) {
1893 union tcp_log_stackspecific log;
1894 struct timeval tv;
1895 uint64_t lt_bw;
1896
1897 /* Convert our ms to a microsecond */
1898 memset(&log, 0, sizeof(log));
1899
1900 log.u_bbr.cwnd_gain = line;
1901 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1902 log.u_bbr.rttProp = tim;
1903 log.u_bbr.bw_inuse = cbw;
1904 log.u_bbr.delRate = rack_get_gp_est(rack);
1905 lt_bw = rack_get_lt_bw(rack);
1906 log.u_bbr.flex1 = seq;
1907 log.u_bbr.pacing_gain = aux;
1908 /* lt_bw = < flex3 | flex2 > */
1909 log.u_bbr.flex2 = (uint32_t)(lt_bw & 0x00000000ffffffff);
1910 log.u_bbr.flex3 = (uint32_t)((lt_bw >> 32) & 0x00000000ffffffff);
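/* Readers of this log reassemble lt_bw as ((uint64_t)flex3 << 32) | flex2. */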
1911 /* Record the last obtained us rtt in inflight */
1912 if (cur == NULL) {
1913 /* Make sure we are looking at the right log if an override comes in */
1914 cur = rack->r_ctl.rc_last_sft;
1915 }
1916 if (rack->r_ctl.rack_rs.rs_flags != RACK_RTT_EMPTY)
1917 log.u_bbr.inflight = rack->r_ctl.rack_rs.rs_us_rtt;
1918 else {
1919 /* Use the last known rtt i.e. the rack-rtt */
1920 log.u_bbr.inflight = rack->rc_rack_rtt;
1921 }
1922 if (cur != NULL) {
1923 uint64_t off;
1924
1925 log.u_bbr.cur_del_rate = cur->deadline;
1926 if ((mod == HYBRID_LOG_RATE_CAP) || (mod == HYBRID_LOG_CAP_CALC)) {
1927 /* start = < lost | pkt_epoch > */
1928 log.u_bbr.pkt_epoch = (uint32_t)(cur->start & 0x00000000ffffffff);
1929 log.u_bbr.lost = (uint32_t)((cur->start >> 32) & 0x00000000ffffffff);
1930 log.u_bbr.flex6 = cur->start_seq;
1931 log.u_bbr.pkts_out = cur->end_seq;
1932 } else {
1933 /* start = < lost | pkt_epoch > */
1934 log.u_bbr.pkt_epoch = (uint32_t)(cur->start & 0x00000000ffffffff);
1935 log.u_bbr.lost = (uint32_t)((cur->start >> 32) & 0x00000000ffffffff);
1936 /* end = < pkts_out | flex6 > */
1937 log.u_bbr.flex6 = (uint32_t)(cur->end & 0x00000000ffffffff);
1938 log.u_bbr.pkts_out = (uint32_t)((cur->end >> 32) & 0x00000000ffffffff);
1939 }
1940 /* first_send = <lt_epoch | epoch> */
1941 log.u_bbr.epoch = (uint32_t)(cur->first_send & 0x00000000ffffffff);
1942 log.u_bbr.lt_epoch = (uint32_t)((cur->first_send >> 32) & 0x00000000ffffffff);
1943 /* localtime = <delivered | applimited>*/
1944 log.u_bbr.applimited = (uint32_t)(cur->localtime & 0x00000000ffffffff);
1945 log.u_bbr.delivered = (uint32_t)((cur->localtime >> 32) & 0x00000000ffffffff);
1946 #ifdef TCP_REQUEST_TRK
1947 off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_tcpreq_info[0]);
1948 log.u_bbr.bbr_substate = (uint8_t)(off / sizeof(struct tcp_sendfile_track));
1949 #endif
1950 log.u_bbr.inhpts = 1;
1951 log.u_bbr.flex4 = (uint32_t)(rack->rc_tp->t_sndbytes - cur->sent_at_fs);
1952 log.u_bbr.flex5 = (uint32_t)(rack->rc_tp->t_snd_rxt_bytes - cur->rxt_at_fs);
1953 log.u_bbr.flex7 = (uint16_t)cur->hybrid_flags;
1954 } else {
1955 log.u_bbr.flex7 = 0xffff;
1956 log.u_bbr.cur_del_rate = 0xffffffffffffffff;
1957 }
1958 /*
1959 * Compose bbr_state to be a bitwise 0000ADHF
1960 * where A is the always_pace flag
1961 * where D is the dgp_on flag
1962 * where H is the hybrid_mode on flag
1963 * where F is the use_fixed_rate flag.
1964 */
1965 log.u_bbr.bbr_state = rack->rc_always_pace;
1966 log.u_bbr.bbr_state <<= 1;
1967 log.u_bbr.bbr_state |= rack->dgp_on;
1968 log.u_bbr.bbr_state <<= 1;
1969 log.u_bbr.bbr_state |= rack->rc_hybrid_mode;
1970 log.u_bbr.bbr_state <<= 1;
1971 log.u_bbr.bbr_state |= rack->use_fixed_rate;
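/* e.g. always_pace=1, dgp_on=1, hybrid=0, fixed=0 encodes as 0b1100 (0xc). */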
1972 log.u_bbr.flex8 = mod;
1973 tcp_log_event(rack->rc_tp, NULL,
1974 &rack->rc_inp->inp_socket->so_rcv,
1975 &rack->rc_inp->inp_socket->so_snd,
1976 TCP_HYBRID_PACING_LOG, 0,
1977 0, &log, false, NULL, __func__, __LINE__, &tv);
1978
1979 }
1980 #endif
1981 }
1982
1983 #ifdef TCP_REQUEST_TRK
1984 static void
1985 rack_log_hybrid_sends(struct tcp_rack *rack, struct tcp_sendfile_track *cur, int line)
1986 {
1987 if (tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING)) {
1988 union tcp_log_stackspecific log;
1989 struct timeval tv;
1990 uint64_t off;
1991
1992 /* Convert our ms to a microsecond */
1993 memset(&log, 0, sizeof(log));
1994
1995 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
1996 log.u_bbr.delRate = cur->sent_at_fs;
1997
1998 if ((cur->flags & TCP_TRK_TRACK_FLG_LSND) == 0) {
1999 /*
2000 * We did not get a new Rules Applied to set so
2001 * no overlapping send occurred; this means the
2002 * current byte counts are correct.
2003 */
2004 log.u_bbr.cur_del_rate = rack->rc_tp->t_sndbytes;
2005 log.u_bbr.rttProp = rack->rc_tp->t_snd_rxt_bytes;
2006 } else {
2007 /*
2008 * Overlapping send case, we switched to a new
2009 * send and did a rules applied.
2010 */
2011 log.u_bbr.cur_del_rate = cur->sent_at_ls;
2012 log.u_bbr.rttProp = cur->rxt_at_ls;
2013 }
2014 log.u_bbr.bw_inuse = cur->rxt_at_fs;
2015 log.u_bbr.cwnd_gain = line;
2016 off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_tcpreq_info[0]);
2017 log.u_bbr.bbr_substate = (uint8_t)(off / sizeof(struct tcp_sendfile_track));
2018 /* start = < flex1 | flex2 > */
2019 log.u_bbr.flex2 = (uint32_t)(cur->start & 0x00000000ffffffff);
2020 log.u_bbr.flex1 = (uint32_t)((cur->start >> 32) & 0x00000000ffffffff);
2021 /* end = < flex3 | flex4 > */
2022 log.u_bbr.flex4 = (uint32_t)(cur->end & 0x00000000ffffffff);
2023 log.u_bbr.flex3 = (uint32_t)((cur->end >> 32) & 0x00000000ffffffff);
2024
2025 /* localtime = <delivered | applimited>*/
2026 log.u_bbr.applimited = (uint32_t)(cur->localtime & 0x00000000ffffffff);
2027 log.u_bbr.delivered = (uint32_t)((cur->localtime >> 32) & 0x00000000ffffffff);
2028 /* client timestamp = <lt_epoch | epoch>*/
2029 log.u_bbr.epoch = (uint32_t)(cur->timestamp & 0x00000000ffffffff);
2030 log.u_bbr.lt_epoch = (uint32_t)((cur->timestamp >> 32) & 0x00000000ffffffff);
2031 /* now set all the flags in */
2032 log.u_bbr.pkts_out = cur->hybrid_flags;
2033 log.u_bbr.lost = cur->playout_ms;
2034 log.u_bbr.flex6 = cur->flags;
2035 /*
2036 * Last send time = <flex5 | pkt_epoch> note we do not distinguish cases
2037 * where a false retransmit occurred so first_send <-> lastsend may
2038 * include a longer time than it actually took if we have a false rxt.
2039 */
2040 log.u_bbr.pkt_epoch = (uint32_t)(rack->r_ctl.last_tmit_time_acked & 0x00000000ffffffff);
2041 log.u_bbr.flex5 = (uint32_t)((rack->r_ctl.last_tmit_time_acked >> 32) & 0x00000000ffffffff);
2042 /*
2043 * Compose bbr_state to be a bitwise 0000ADHF
2044 * where A is the always_pace flag
2045 * where D is the dgp_on flag
2046 * where H is the hybrid_mode on flag
2047 * where F is the use_fixed_rate flag.
2048 */
2049 log.u_bbr.bbr_state = rack->rc_always_pace;
2050 log.u_bbr.bbr_state <<= 1;
2051 log.u_bbr.bbr_state |= rack->dgp_on;
2052 log.u_bbr.bbr_state <<= 1;
2053 log.u_bbr.bbr_state |= rack->rc_hybrid_mode;
2054 log.u_bbr.bbr_state <<= 1;
2055 log.u_bbr.bbr_state |= rack->use_fixed_rate;
2056
2057 log.u_bbr.flex8 = HYBRID_LOG_SENT_LOST;
2058 tcp_log_event(rack->rc_tp, NULL,
2059 &rack->rc_inp->inp_socket->so_rcv,
2060 &rack->rc_inp->inp_socket->so_snd,
2061 TCP_HYBRID_PACING_LOG, 0,
2062 0, &log, false, NULL, __func__, __LINE__, &tv);
2063 }
2064 }
2065 #endif
2066
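/*
 * Scale a goodput-style payload bandwidth up to an on-the-wire rate by
 * accounting for TCP/IP/Ethernet header overhead.  Illustrative numbers
 * only (they depend on t_maxseg and the pacing minimum segment size):
 * with a 1448-byte t_maxseg over IPv4, ether = 1448 + 20 (TCP) + 20 (IP)
 * + 14 (Ethernet) = 1502, so assuming u_segsiz also works out to 1448 the
 * payload bandwidth is scaled by roughly 1502/1448, a bit under a 4% boost.
 */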
2067 static inline uint64_t
2068 rack_compensate_for_linerate(struct tcp_rack *rack, uint64_t bw)
2069 {
2070 uint64_t ret_bw, ether;
2071 uint64_t u_segsiz;
2072
2073 ether = rack->rc_tp->t_maxseg + sizeof(struct tcphdr);
2074 if (rack->r_is_v6){
2075 #ifdef INET6
2076 ether += sizeof(struct ip6_hdr);
2077 #endif
2078 ether += 14; /* eheader size 6+6+2 */
2079 } else {
2080 #ifdef INET
2081 ether += sizeof(struct ip);
2082 #endif
2083 ether += 14; /* eheader size 6+6+2 */
2084 }
2085 u_segsiz = (uint64_t)min(ctf_fixed_maxseg(rack->rc_tp), rack->r_ctl.rc_pace_min_segs);
2086 ret_bw = bw;
2087 ret_bw *= ether;
2088 ret_bw /= u_segsiz;
2089 return (ret_bw);
2090 }
2091
2092 static void
2093 rack_rate_cap_bw(struct tcp_rack *rack, uint64_t *bw, int *capped)
2094 {
2095 #ifdef TCP_REQUEST_TRK
2096 struct timeval tv;
2097 uint64_t timenow, timeleft, lenleft, lengone, calcbw;
2098 #endif
2099
2100 if (rack->r_ctl.bw_rate_cap == 0)
2101 return;
2102 #ifdef TCP_REQUEST_TRK
2103 if (rack->rc_catch_up && rack->rc_hybrid_mode &&
2104 (rack->r_ctl.rc_last_sft != NULL)) {
2105 /*
2106 * We have a dynamic cap. The original target
2107 * is in bw_rate_cap, but we need to look at
2108 * how long it is until we hit the deadline.
2109 */
2110 struct tcp_sendfile_track *ent;
2111
2112 ent = rack->r_ctl.rc_last_sft;
2113 microuptime(&tv);
2114 timenow = tcp_tv_to_lusec(&tv);
2115 if (timenow >= ent->deadline) {
2116 /* No time left we do DGP only */
2117 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
2118 0, 0, 0, HYBRID_LOG_OUTOFTIME, 0, ent, __LINE__);
2119 rack->r_ctl.bw_rate_cap = 0;
2120 return;
2121 }
2122 /* We have the time */
2123 timeleft = rack->r_ctl.rc_last_sft->deadline - timenow;
2124 if (timeleft < HPTS_MSEC_IN_SEC) {
2125 /* If there is less than a ms left just use DGPs rate */
2126 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
2127 0, timeleft, 0, HYBRID_LOG_OUTOFTIME, 0, ent, __LINE__);
2128 rack->r_ctl.bw_rate_cap = 0;
2129 return;
2130 }
2131 /*
2132 * Now lets find the amount of data left to send.
2133 *
2134 * Ideally we would use end_seq to figure out how much more remains,
2135 * but that is only possible if TCP_TRK_TRACK_FLG_COMP is set on the entry.
2136 */
2137 if (ent->flags & TCP_TRK_TRACK_FLG_COMP) {
2138 if (SEQ_GT(ent->end_seq, rack->rc_tp->snd_una))
2139 lenleft = ent->end_seq - rack->rc_tp->snd_una;
2140 else {
2141 /* TSNH, we should catch it at the send */
2142 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
2143 0, timeleft, 0, HYBRID_LOG_CAPERROR, 0, ent, __LINE__);
2144 rack->r_ctl.bw_rate_cap = 0;
2145 return;
2146 }
2147 } else {
2148 /*
2149 * The hard way, figure out how much is gone and then
2150 * take that away from the total the client asked for
2151 * (that is off by the TLS overhead if this is TLS).
2152 */
2153 if (SEQ_GT(rack->rc_tp->snd_una, ent->start_seq))
2154 lengone = rack->rc_tp->snd_una - ent->start_seq;
2155 else
2156 lengone = 0;
2157 if (lengone < (ent->end - ent->start))
2158 lenleft = (ent->end - ent->start) - lengone;
2159 else {
2160 /* TSNH, we should catch it at the send */
2161 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
2162 0, timeleft, lengone, HYBRID_LOG_CAPERROR, 0, ent, __LINE__);
2163 rack->r_ctl.bw_rate_cap = 0;
2164 return;
2165 }
2166 }
2167 if (lenleft == 0) {
2168 /* We have it all sent */
2169 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
2170 0, timeleft, lenleft, HYBRID_LOG_ALLSENT, 0, ent, __LINE__);
2171 if (rack->r_ctl.bw_rate_cap)
2172 goto normal_ratecap;
2173 else
2174 return;
2175 }
2176 calcbw = lenleft * HPTS_USEC_IN_SEC;
2177 calcbw /= timeleft;
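/*
 * Worked example (illustrative numbers): with 3,000,000 bytes left
 * and 500,000 usec until the deadline,
 * calcbw = 3,000,000 * 1,000,000 / 500,000 = 6,000,000 bytes/sec
 * (~48 Mbps) before the line-rate compensation below.
 */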
2178 /* Now we must compensate for IP/TCP overhead */
2179 calcbw = rack_compensate_for_linerate(rack, calcbw);
2180 /* Update the bit rate cap */
2181 rack->r_ctl.bw_rate_cap = calcbw;
2182 if ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_S_MSS) &&
2183 (rack_hybrid_allow_set_maxseg == 1) &&
2184 ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_SETMSS) == 0)) {
2185 /* Lets set in a smaller mss possibly here to match our rate-cap */
2186 uint32_t orig_max;
2187
2188 orig_max = rack->r_ctl.rc_pace_max_segs;
2189 rack->r_ctl.rc_last_sft->hybrid_flags |= TCP_HYBRID_PACING_SETMSS;
2190 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, calcbw, ctf_fixed_maxseg(rack->rc_tp));
2191 rack_log_type_pacing_sizes(rack->rc_tp, rack, rack->r_ctl.client_suggested_maxseg, orig_max, __LINE__, 5);
2192 }
2193 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
2194 calcbw, timeleft, lenleft, HYBRID_LOG_CAP_CALC, 0, ent, __LINE__);
2195 if ((calcbw > 0) && (*bw > calcbw)) {
2196 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
2197 *bw, ent->deadline, lenleft, HYBRID_LOG_RATE_CAP, 0, ent, __LINE__);
2198 *capped = 1;
2199 *bw = calcbw;
2200 }
2201 return;
2202 }
2203 normal_ratecap:
2204 #endif
2205 if ((rack->r_ctl.bw_rate_cap > 0) && (*bw > rack->r_ctl.bw_rate_cap)) {
2206 #ifdef TCP_REQUEST_TRK
2207 if (rack->rc_hybrid_mode &&
2208 rack->rc_catch_up &&
2209 (rack->r_ctl.rc_last_sft != NULL) &&
2210 (rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_S_MSS) &&
2211 (rack_hybrid_allow_set_maxseg == 1) &&
2212 ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_SETMSS) == 0)) {
2213 /* Lets set in a smaller mss possibly here to match our rate-cap */
2214 uint32_t orig_max;
2215
2216 orig_max = rack->r_ctl.rc_pace_max_segs;
2217 rack->r_ctl.rc_last_sft->hybrid_flags |= TCP_HYBRID_PACING_SETMSS;
2218 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, rack->r_ctl.bw_rate_cap, ctf_fixed_maxseg(rack->rc_tp));
2219 rack_log_type_pacing_sizes(rack->rc_tp, rack, rack->r_ctl.client_suggested_maxseg, orig_max, __LINE__, 5);
2220 }
2221 #endif
2222 *capped = 1;
2223 *bw = rack->r_ctl.bw_rate_cap;
2224 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
2225 *bw, 0, 0,
2226 HYBRID_LOG_RATE_CAP, 1, NULL, __LINE__);
2227 }
2228 }
2229
2230 static uint64_t
2231 rack_get_gp_est(struct tcp_rack *rack)
2232 {
2233 uint64_t bw, lt_bw, ret_bw;
2234
2235 if (rack->rc_gp_filled == 0) {
2236 /*
2237 * We have yet no b/w measurement,
2238 * if we have a user set initial bw
2239 * return it. If we don't have that and
2240 * we have an srtt, use the tcp IW (10) to
2241 * calculate a fictional b/w over the SRTT
2242 * which is more or less a guess. Note
2243 * we don't use our IW from rack on purpose
2244 * so if we have like IW=30, we are not
2245 * calculating a "huge" b/w.
2246 */
2247 uint64_t srtt;
2248
2249 if (rack->dis_lt_bw == 1)
2250 lt_bw = 0;
2251 else
2252 lt_bw = rack_get_lt_bw(rack);
2253 if (lt_bw) {
2254 /*
2255 * No goodput bw but a long-term b/w does exist
2256 * lets use that.
2257 */
2258 ret_bw = lt_bw;
2259 goto compensate;
2260 }
2261 if (rack->r_ctl.init_rate)
2262 return (rack->r_ctl.init_rate);
2263
2264 /* Ok lets come up with the IW guess, if we have a srtt */
2265 if (rack->rc_tp->t_srtt == 0) {
2266 /*
2267 * Go with old pacing method
2268 * i.e. burst mitigation only.
2269 */
2270 return (0);
2271 }
2272 /* Ok lets get the initial TCP win (not racks) */
2273 bw = tcp_compute_initwnd(tcp_maxseg(rack->rc_tp));
2274 srtt = (uint64_t)rack->rc_tp->t_srtt;
2275 bw *= (uint64_t)USECS_IN_SECOND;
2276 bw /= srtt;
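/*
 * Example of this IW-based guess (illustrative numbers, assuming the
 * initial window works out to 10 * 1448 = 14,480 bytes): over a
 * 20,000 usec SRTT this yields 14,480 * 1,000,000 / 20,000 =
 * 724,000 bytes/sec (~5.8 Mbps).
 */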
2277 ret_bw = bw;
2278 goto compensate;
2279
2280 }
2281 if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) {
2282 /* Averaging is done, we can return the value */
2283 bw = rack->r_ctl.gp_bw;
2284 } else {
2285 /* Still doing initial average must calculate */
2286 bw = rack->r_ctl.gp_bw / max(rack->r_ctl.num_measurements, 1);
2287 }
2288 if (rack->dis_lt_bw) {
2289 /* We are not using lt-bw */
2290 ret_bw = bw;
2291 goto compensate;
2292 }
2293 lt_bw = rack_get_lt_bw(rack);
2294 if (lt_bw == 0) {
2295 /* If we don't have one then equate it to the gp_bw */
2296 lt_bw = rack->r_ctl.gp_bw;
2297 }
2298 if (rack->use_lesser_lt_bw) {
2299 if (lt_bw < bw)
2300 ret_bw = lt_bw;
2301 else
2302 ret_bw = bw;
2303 } else {
2304 if (lt_bw > bw)
2305 ret_bw = lt_bw;
2306 else
2307 ret_bw = bw;
2308 }
2309 /*
2310 * Now lets compensate based on the TCP/IP overhead. Our
2311 * Goodput estimate does not include this so we must pace out
2312 * a bit faster since our pacing calculations do. The pacing
2313 * calculations use the base ETHERNET_SEGMENT_SIZE and the segsiz
2314 * we are using to do this, so we do that here in the opposite
2315 * direction as well. This means that if we are tunneled and the
2316 * segsiz is say 1200 bytes we will get quite a boost, but its
2317 * compensated for in the pacing time the opposite way.
2318 */
2319 compensate:
2320 ret_bw = rack_compensate_for_linerate(rack, ret_bw);
2321 return(ret_bw);
2322 }
2323
2324
2325 static uint64_t
2326 rack_get_bw(struct tcp_rack *rack)
2327 {
2328 uint64_t bw;
2329
2330 if (rack->use_fixed_rate) {
2331 /* Return the fixed pacing rate */
2332 return (rack_get_fixed_pacing_bw(rack));
2333 }
2334 bw = rack_get_gp_est(rack);
2335 return (bw);
2336 }
2337
2338 static uint16_t
2339 rack_get_output_gain(struct tcp_rack *rack, struct rack_sendmap *rsm)
2340 {
2341 if (rack->use_fixed_rate) {
2342 return (100);
2343 } else if (rack->in_probe_rtt && (rsm == NULL))
2344 return (rack->r_ctl.rack_per_of_gp_probertt);
2345 else if ((IN_FASTRECOVERY(rack->rc_tp->t_flags) &&
2346 rack->r_ctl.rack_per_of_gp_rec)) {
2347 if (rsm) {
2348 /* a retransmission always use the recovery rate */
2349 return (rack->r_ctl.rack_per_of_gp_rec);
2350 } else if (rack->rack_rec_nonrxt_use_cr) {
2351 /* Directed to use the configured rate */
2352 goto configured_rate;
2353 } else if (rack->rack_no_prr &&
2354 (rack->r_ctl.rack_per_of_gp_rec > 100)) {
2355 /* No PRR, lets just use the b/w estimate only */
2356 return (100);
2357 } else {
2358 /*
2359 * Here we may have a non-retransmit but we
2360 * have no overrides, so just use the recovery
2361 * rate (prr is in effect).
2362 */
2363 return (rack->r_ctl.rack_per_of_gp_rec);
2364 }
2365 }
2366 configured_rate:
2367 /* For the configured rate we look at our cwnd vs the ssthresh */
2368 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh)
2369 return (rack->r_ctl.rack_per_of_gp_ss);
2370 else
2371 return (rack->r_ctl.rack_per_of_gp_ca);
2372 }
2373
2374 static void
2375 rack_log_dsack_event(struct tcp_rack *rack, uint8_t mod, uint32_t flex4, uint32_t flex5, uint32_t flex6)
2376 {
2377 /*
2378 * Types of logs (mod value)
2379 * 1 = dsack_persists reduced by 1 via T-O or fast recovery exit.
2380 * 2 = a dsack round begins, persist is reset to 16.
2381 * 3 = a dsack round ends
2382 * 4 = Dsack option increases rack rtt flex5 is the srtt input, flex6 is thresh
2383 * 5 = Socket option set changing the control flags rc_rack_tmr_std_based, rc_rack_use_dsack
2384 * 6 = Final rack rtt, flex4 is srtt and flex6 is final limited thresh.
2385 */
2386 if (tcp_bblogging_on(rack->rc_tp)) {
2387 union tcp_log_stackspecific log;
2388 struct timeval tv;
2389
2390 memset(&log, 0, sizeof(log));
2391 log.u_bbr.flex1 = rack->rc_rack_tmr_std_based;
2392 log.u_bbr.flex1 <<= 1;
2393 log.u_bbr.flex1 |= rack->rc_rack_use_dsack;
2394 log.u_bbr.flex1 <<= 1;
2395 log.u_bbr.flex1 |= rack->rc_dsack_round_seen;
2396 log.u_bbr.flex2 = rack->r_ctl.dsack_round_end;
2397 log.u_bbr.flex3 = rack->r_ctl.num_dsack;
2398 log.u_bbr.flex4 = flex4;
2399 log.u_bbr.flex5 = flex5;
2400 log.u_bbr.flex6 = flex6;
2401 log.u_bbr.flex7 = rack->r_ctl.dsack_persist;
2402 log.u_bbr.flex8 = mod;
2403 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2404 log.u_bbr.epoch = rack->r_ctl.current_round;
2405 log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost;
2406 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2407 &rack->rc_inp->inp_socket->so_rcv,
2408 &rack->rc_inp->inp_socket->so_snd,
2409 RACK_DSACK_HANDLING, 0,
2410 0, &log, false, &tv);
2411 }
2412 }
2413
2414 static void
2415 rack_log_hdwr_pacing(struct tcp_rack *rack,
2416 uint64_t rate, uint64_t hw_rate, int line,
2417 int error, uint16_t mod)
2418 {
2419 if (tcp_bblogging_on(rack->rc_tp)) {
2420 union tcp_log_stackspecific log;
2421 struct timeval tv;
2422 const struct ifnet *ifp;
2423 uint64_t ifp64;
2424
2425 memset(&log, 0, sizeof(log));
2426 log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff);
2427 log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff);
2428 if (rack->r_ctl.crte) {
2429 ifp = rack->r_ctl.crte->ptbl->rs_ifp;
2430 } else if (rack->rc_inp->inp_route.ro_nh &&
2431 rack->rc_inp->inp_route.ro_nh->nh_ifp) {
2432 ifp = rack->rc_inp->inp_route.ro_nh->nh_ifp;
2433 } else
2434 ifp = NULL;
2435 if (ifp) {
2436 ifp64 = (uintptr_t)ifp;
2437 log.u_bbr.flex3 = ((ifp64 >> 32) & 0x00000000ffffffff);
2438 log.u_bbr.flex4 = (ifp64 & 0x00000000ffffffff);
2439 }
2440 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2441 log.u_bbr.bw_inuse = rate;
2442 log.u_bbr.flex5 = line;
2443 log.u_bbr.flex6 = error;
2444 log.u_bbr.flex7 = mod;
2445 log.u_bbr.applimited = rack->r_ctl.rc_pace_max_segs;
2446 log.u_bbr.flex8 = rack->use_fixed_rate;
2447 log.u_bbr.flex8 <<= 1;
2448 log.u_bbr.flex8 |= rack->rack_hdrw_pacing;
2449 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg;
2450 log.u_bbr.delRate = rack->r_ctl.crte_prev_rate;
2451 if (rack->r_ctl.crte)
2452 log.u_bbr.cur_del_rate = rack->r_ctl.crte->rate;
2453 else
2454 log.u_bbr.cur_del_rate = 0;
2455 log.u_bbr.rttProp = rack->r_ctl.last_hw_bw_req;
2456 log.u_bbr.epoch = rack->r_ctl.current_round;
2457 log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost;
2458 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2459 &rack->rc_inp->inp_socket->so_rcv,
2460 &rack->rc_inp->inp_socket->so_snd,
2461 BBR_LOG_HDWR_PACE, 0,
2462 0, &log, false, &tv);
2463 }
2464 }
2465
2466 static uint64_t
2467 rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm, int *capped)
2468 {
2469 /*
2470 * We allow rack_per_of_gp_xx to dictate our bw rate we want.
2471 */
2472 uint64_t bw_est, high_rate;
2473 uint64_t gain;
2474
2475 gain = (uint64_t)rack_get_output_gain(rack, rsm);
2476 bw_est = bw * gain;
2477 bw_est /= (uint64_t)100;
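/*
 * The gain is a percentage, e.g. a recovery/ss gain of 250 applied to a
 * 12,500,000 byte/sec estimate yields a 31,250,000 byte/sec pacing rate
 * (illustrative numbers only).
 */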
2478 /* Never fall below the minimum (def 64kbps) */
2479 if (bw_est < RACK_MIN_BW)
2480 bw_est = RACK_MIN_BW;
2481 if (rack->r_rack_hw_rate_caps) {
2482 /* Rate caps are in place */
2483 if (rack->r_ctl.crte != NULL) {
2484 /* We have a hdwr rate already */
2485 high_rate = tcp_hw_highest_rate(rack->r_ctl.crte);
2486 if (bw_est >= high_rate) {
2487 /* We are capping bw at the highest rate table entry */
2488 if (rack_hw_rate_cap_per &&
2489 (((high_rate * (100 + rack_hw_rate_cap_per)) / 100) < bw_est)) {
2490 rack->r_rack_hw_rate_caps = 0;
2491 goto done;
2492 }
2493 rack_log_hdwr_pacing(rack,
2494 bw_est, high_rate, __LINE__,
2495 0, 3);
2496 bw_est = high_rate;
2497 if (capped)
2498 *capped = 1;
2499 }
2500 } else if ((rack->rack_hdrw_pacing == 0) &&
2501 (rack->rack_hdw_pace_ena) &&
2502 (rack->rack_attempt_hdwr_pace == 0) &&
2503 (rack->rc_inp->inp_route.ro_nh != NULL) &&
2504 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) {
2505 /*
2506 * Special case, we have not yet attempted hardware
2507 * pacing, and yet we may, when we do, find out if we are
2508 * above the highest rate. We need to know the maxbw for the interface
2509 * in question (if it supports ratelimiting). We get back
2510 * a 0, if the interface is not found in the RL lists.
2511 */
2512 high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp);
2513 if (high_rate) {
2514 /* Yep, we have a rate; is it above this rate? */
2515 if (bw_est > high_rate) {
2516 bw_est = high_rate;
2517 if (capped)
2518 *capped = 1;
2519 }
2520 }
2521 }
2522 }
2523 done:
2524 return (bw_est);
2525 }
2526
2527 static void
2528 rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod)
2529 {
2530 if (tcp_bblogging_on(rack->rc_tp)) {
2531 union tcp_log_stackspecific log;
2532 struct timeval tv;
2533
2534 if ((mod != 1) && (rack_verbose_logging == 0)) {
2535 /*
2536 * We get 3 values currently for mod
2537 * 1 - We are retransmitting and this tells the reason.
2538 * 2 - We are clearing a dup-ack count.
2539 * 3 - We are incrementing a dup-ack count.
2540 *
2541 * The clear/increment are only logged
2542 * if you have BBverbose on.
2543 */
2544 return;
2545 }
2546 memset(&log, 0, sizeof(log));
2547 log.u_bbr.flex1 = tsused;
2548 log.u_bbr.flex2 = thresh;
2549 log.u_bbr.flex3 = rsm->r_flags;
2550 log.u_bbr.flex4 = rsm->r_dupack;
2551 log.u_bbr.flex5 = rsm->r_start;
2552 log.u_bbr.flex6 = rsm->r_end;
2553 log.u_bbr.flex8 = mod;
2554 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
2555 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2556 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2557 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2558 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2559 log.u_bbr.pacing_gain = rack->r_must_retran;
2560 log.u_bbr.epoch = rack->r_ctl.current_round;
2561 log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost;
2562 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2563 &rack->rc_inp->inp_socket->so_rcv,
2564 &rack->rc_inp->inp_socket->so_snd,
2565 BBR_LOG_SETTINGS_CHG, 0,
2566 0, &log, false, &tv);
2567 }
2568 }
2569
2570 static void
2571 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t pacing_delay, uint8_t which)
2572 {
2573 if (tcp_bblogging_on(rack->rc_tp)) {
2574 union tcp_log_stackspecific log;
2575 struct timeval tv;
2576
2577 memset(&log, 0, sizeof(log));
2578 log.u_bbr.flex1 = rack->rc_tp->t_srtt;
2579 log.u_bbr.flex2 = to;
2580 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags;
2581 log.u_bbr.flex4 = pacing_delay;
2582 log.u_bbr.flex5 = rack->rc_tp->t_hpts_slot;
2583 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
2584 log.u_bbr.flex7 = rack->rc_in_persist;
2585 log.u_bbr.flex8 = which;
2586 if (rack->rack_no_prr)
2587 log.u_bbr.pkts_out = 0;
2588 else
2589 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
2590 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
2591 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2592 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2593 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2594 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2595 log.u_bbr.pacing_gain = rack->r_must_retran;
2596 log.u_bbr.cwnd_gain = rack->rack_deferred_inited;
2597 log.u_bbr.pkt_epoch = rack->rc_has_collapsed;
2598 log.u_bbr.lt_epoch = rack->rc_tp->t_rxtshift;
2599 log.u_bbr.lost = rack_rto_min;
2600 log.u_bbr.epoch = rack->r_ctl.roundends;
2601 log.u_bbr.bw_inuse = rack->r_ctl.current_round;
2602 log.u_bbr.bw_inuse <<= 32;
2603 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
2604 log.u_bbr.applimited = rack->rc_tp->t_flags2;
2605 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2606 &rack->rc_inp->inp_socket->so_rcv,
2607 &rack->rc_inp->inp_socket->so_snd,
2608 BBR_LOG_TIMERSTAR, 0,
2609 0, &log, false, &tv);
2610 }
2611 }
2612
2613 static void
2614 rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm)
2615 {
2616 if (tcp_bblogging_on(rack->rc_tp)) {
2617 union tcp_log_stackspecific log;
2618 struct timeval tv;
2619
2620 memset(&log, 0, sizeof(log));
2621 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
2622 log.u_bbr.flex8 = to_num;
2623 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt;
2624 log.u_bbr.flex2 = rack->rc_rack_rtt;
2625 if (rsm == NULL)
2626 log.u_bbr.flex3 = 0;
2627 else
2628 log.u_bbr.flex3 = rsm->r_end - rsm->r_start;
2629 if (rack->rack_no_prr)
2630 log.u_bbr.flex5 = 0;
2631 else
2632 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
2633 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2634 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2635 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2636 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2637 log.u_bbr.pacing_gain = rack->r_must_retran;
2638 log.u_bbr.bw_inuse = rack->r_ctl.current_round;
2639 log.u_bbr.bw_inuse <<= 32;
2640 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
2641 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2642 &rack->rc_inp->inp_socket->so_rcv,
2643 &rack->rc_inp->inp_socket->so_snd,
2644 BBR_LOG_RTO, 0,
2645 0, &log, false, &tv);
2646 }
2647 }
2648
2649 static void
2650 rack_log_map_chg(struct tcpcb *tp, struct tcp_rack *rack,
2651 struct rack_sendmap *prev,
2652 struct rack_sendmap *rsm,
2653 struct rack_sendmap *next,
2654 int flag, uint32_t th_ack, int line)
2655 {
2656 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
2657 union tcp_log_stackspecific log;
2658 struct timeval tv;
2659
2660 memset(&log, 0, sizeof(log));
2661 log.u_bbr.flex8 = flag;
2662 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
2663 log.u_bbr.cur_del_rate = (uintptr_t)prev;
2664 log.u_bbr.delRate = (uintptr_t)rsm;
2665 log.u_bbr.rttProp = (uintptr_t)next;
2666 log.u_bbr.flex7 = 0;
2667 if (prev) {
2668 log.u_bbr.flex1 = prev->r_start;
2669 log.u_bbr.flex2 = prev->r_end;
2670 log.u_bbr.flex7 |= 0x4;
2671 }
2672 if (rsm) {
2673 log.u_bbr.flex3 = rsm->r_start;
2674 log.u_bbr.flex4 = rsm->r_end;
2675 log.u_bbr.flex7 |= 0x2;
2676 }
2677 if (next) {
2678 log.u_bbr.flex5 = next->r_start;
2679 log.u_bbr.flex6 = next->r_end;
2680 log.u_bbr.flex7 |= 0x1;
2681 }
2682 log.u_bbr.applimited = line;
2683 log.u_bbr.pkts_out = th_ack;
2684 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2685 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2686 if (rack->rack_no_prr)
2687 log.u_bbr.lost = 0;
2688 else
2689 log.u_bbr.lost = rack->r_ctl.rc_prr_sndcnt;
2690 log.u_bbr.bw_inuse = rack->r_ctl.current_round;
2691 log.u_bbr.bw_inuse <<= 32;
2692 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
2693 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2694 &rack->rc_inp->inp_socket->so_rcv,
2695 &rack->rc_inp->inp_socket->so_snd,
2696 TCP_LOG_MAPCHG, 0,
2697 0, &log, false, &tv);
2698 }
2699 }
2700
2701 static void
2702 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t len,
2703 struct rack_sendmap *rsm, int conf)
2704 {
2705 if (tcp_bblogging_on(tp)) {
2706 union tcp_log_stackspecific log;
2707 struct timeval tv;
2708 memset(&log, 0, sizeof(log));
2709 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
2710 log.u_bbr.flex1 = t;
2711 log.u_bbr.flex2 = len;
2712 log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt;
2713 log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest;
2714 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest;
2715 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_us_rtrcnt;
2716 log.u_bbr.flex7 = conf;
2717 log.u_bbr.rttProp = (uint64_t)rack->r_ctl.rack_rs.rs_rtt_tot;
2718 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method;
2719 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2720 log.u_bbr.delivered = rack->r_ctl.rack_rs.rs_us_rtrcnt;
2721 log.u_bbr.pkts_out = rack->r_ctl.rack_rs.rs_flags;
2722 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2723 if (rsm) {
2724 log.u_bbr.pkt_epoch = rsm->r_start;
2725 log.u_bbr.lost = rsm->r_end;
2726 log.u_bbr.cwnd_gain = rsm->r_rtr_cnt;
2727 /* We lose any upper bits; r_flags is truncated to 16 bits here */
2728 log.u_bbr.pacing_gain = (uint16_t)rsm->r_flags;
2729 } else {
2730 /* Its a SYN */
2731 log.u_bbr.pkt_epoch = rack->rc_tp->iss;
2732 log.u_bbr.lost = 0;
2733 log.u_bbr.cwnd_gain = 0;
2734 log.u_bbr.pacing_gain = 0;
2735 }
2736 /* Write out general bits of interest rrs here */
2737 log.u_bbr.use_lt_bw = rack->rc_highly_buffered;
2738 log.u_bbr.use_lt_bw <<= 1;
2739 log.u_bbr.use_lt_bw |= rack->forced_ack;
2740 log.u_bbr.use_lt_bw <<= 1;
2741 log.u_bbr.use_lt_bw |= rack->rc_gp_dyn_mul;
2742 log.u_bbr.use_lt_bw <<= 1;
2743 log.u_bbr.use_lt_bw |= rack->in_probe_rtt;
2744 log.u_bbr.use_lt_bw <<= 1;
2745 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt;
2746 log.u_bbr.use_lt_bw <<= 1;
2747 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set;
2748 log.u_bbr.use_lt_bw <<= 1;
2749 log.u_bbr.use_lt_bw |= rack->rc_gp_filled;
2750 log.u_bbr.use_lt_bw <<= 1;
2751 log.u_bbr.use_lt_bw |= rack->rc_dragged_bottom;
2752 log.u_bbr.applimited = rack->r_ctl.rc_target_probertt_flight;
2753 log.u_bbr.epoch = rack->r_ctl.rc_time_probertt_starts;
2754 log.u_bbr.lt_epoch = rack->r_ctl.rc_time_probertt_entered;
2755 log.u_bbr.cur_del_rate = rack->r_ctl.rc_lower_rtt_us_cts;
2756 log.u_bbr.delRate = rack->r_ctl.rc_gp_srtt;
2757 log.u_bbr.bw_inuse = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time);
2758 log.u_bbr.bw_inuse <<= 32;
2759 if (rsm)
2760 log.u_bbr.bw_inuse |= ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]);
2761 TCP_LOG_EVENTP(tp, NULL,
2762 &rack->rc_inp->inp_socket->so_rcv,
2763 &rack->rc_inp->inp_socket->so_snd,
2764 BBR_LOG_BBRRTT, 0,
2765 0, &log, false, &tv);
2766
2767
2768 }
2769 }
2770
2771 static void
2772 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt)
2773 {
2774 /*
2775 * Log the rtt sample we are
2776 * applying to the srtt algorithm in
2777 * useconds.
2778 */
2779 if (tcp_bblogging_on(rack->rc_tp)) {
2780 union tcp_log_stackspecific log;
2781 struct timeval tv;
2782
2783 /* Convert our ms to a microsecond */
2784 memset(&log, 0, sizeof(log));
2785 log.u_bbr.flex1 = rtt;
2786 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
2787 log.u_bbr.flex7 = 1;
2788 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
2789 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
2790 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
2791 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
2792 log.u_bbr.pacing_gain = rack->r_must_retran;
2793 /*
2794 * We capture in delRate the upper 32 bits as
2795 * the confidence level we had declared, and the
2796 * lower 32 bits as the actual RTT using the arrival
2797 * timestamp.
2798 */
2799 log.u_bbr.delRate = rack->r_ctl.rack_rs.confidence;
2800 log.u_bbr.delRate <<= 32;
2801 log.u_bbr.delRate |= rack->r_ctl.rack_rs.rs_us_rtt;
2802 /* Lets capture all the things that make up t_rxtcur */
2803 log.u_bbr.applimited = rack_rto_min;
2804 log.u_bbr.epoch = rack_rto_max;
2805 log.u_bbr.lt_epoch = rack->r_ctl.timer_slop;
2806 log.u_bbr.lost = rack_rto_min;
2807 log.u_bbr.pkt_epoch = TICKS_2_USEC(tcp_rexmit_slop);
2808 log.u_bbr.rttProp = RACK_REXMTVAL(rack->rc_tp);
2809 log.u_bbr.bw_inuse = rack->r_ctl.act_rcv_time.tv_sec;
2810 log.u_bbr.bw_inuse *= HPTS_USEC_IN_SEC;
2811 log.u_bbr.bw_inuse += rack->r_ctl.act_rcv_time.tv_usec;
2812 TCP_LOG_EVENTP(rack->rc_tp, NULL,
2813 &rack->rc_inp->inp_socket->so_rcv,
2814 &rack->rc_inp->inp_socket->so_snd,
2815 TCP_LOG_RTT, 0,
2816 0, &log, false, &tv);
2817 }
2818 }

static void
rack_log_rtt_sample_calc(struct tcp_rack *rack, uint32_t rtt, uint32_t send_time, uint32_t ack_time, int where)
{
    if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
        union tcp_log_stackspecific log;
        struct timeval tv;

        memset(&log, 0, sizeof(log));
        log.u_bbr.flex1 = rtt;
        log.u_bbr.flex2 = send_time;
        log.u_bbr.flex3 = ack_time;
        log.u_bbr.flex4 = where;
        log.u_bbr.flex7 = 2;
        log.u_bbr.timeStamp = tcp_get_usecs(&tv);
        log.u_bbr.bw_inuse = rack->r_ctl.current_round;
        log.u_bbr.bw_inuse <<= 32;
        log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
        TCP_LOG_EVENTP(rack->rc_tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            TCP_LOG_RTT, 0,
            0, &log, false, &tv);
    }
}

static void
rack_log_rtt_sendmap(struct tcp_rack *rack, uint32_t idx, uint64_t tsv, uint32_t tsecho)
{
    if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
        union tcp_log_stackspecific log;
        struct timeval tv;

        memset(&log, 0, sizeof(log));
        log.u_bbr.flex1 = idx;
        log.u_bbr.flex2 = rack_ts_to_msec(tsv);
        log.u_bbr.flex3 = tsecho;
        log.u_bbr.flex7 = 3;
        log.u_bbr.rttProp = tsv;
        log.u_bbr.timeStamp = tcp_get_usecs(&tv);
        log.u_bbr.bw_inuse = rack->r_ctl.current_round;
        log.u_bbr.bw_inuse <<= 32;
        log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
        TCP_LOG_EVENTP(rack->rc_tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            TCP_LOG_RTT, 0,
            0, &log, false, &tv);
    }
}

static inline void
rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line)
{
    if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
        union tcp_log_stackspecific log;
        struct timeval tv;

        memset(&log, 0, sizeof(log));
        log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
        log.u_bbr.flex1 = line;
        log.u_bbr.flex2 = tick;
        log.u_bbr.flex3 = tp->t_maxunacktime;
        log.u_bbr.flex4 = tp->t_acktime;
        log.u_bbr.flex8 = event;
        log.u_bbr.timeStamp = tcp_get_usecs(&tv);
        log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
        log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
        log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
        log.u_bbr.pacing_gain = rack->r_must_retran;
        log.u_bbr.bw_inuse = rack->r_ctl.current_round;
        log.u_bbr.bw_inuse <<= 32;
        log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
        TCP_LOG_EVENTP(tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            BBR_LOG_PROGRESS, 0,
            0, &log, false, &tv);
    }
}

static void
rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t pacing_delay, uint32_t cts, struct timeval *tv, int line)
{
    if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
        union tcp_log_stackspecific log;

        memset(&log, 0, sizeof(log));
        log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
        log.u_bbr.flex1 = pacing_delay;
        if (rack->rack_no_prr)
            log.u_bbr.flex2 = 0;
        else
            log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt;
        log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
        log.u_bbr.flex6 = line;
        log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags);
        log.u_bbr.flex8 = rack->rc_in_persist;
        log.u_bbr.timeStamp = cts;
        log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
        log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
        log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
        log.u_bbr.pacing_gain = rack->r_must_retran;
        TCP_LOG_EVENTP(rack->rc_tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            BBR_LOG_BBRSND, 0,
            0, &log, false, tv);
    }
}

static void
rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out, int nsegs)
{
    if (tcp_bblogging_on(rack->rc_tp)) {
        union tcp_log_stackspecific log;
        struct timeval tv;

        memset(&log, 0, sizeof(log));
        log.u_bbr.flex1 = did_out;
        log.u_bbr.flex2 = nxt_pkt;
        log.u_bbr.flex3 = way_out;
        log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
        if (rack->rack_no_prr)
            log.u_bbr.flex5 = 0;
        else
            log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
        log.u_bbr.flex6 = nsegs;
        log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs;
        log.u_bbr.flex7 = rack->rc_ack_can_sendout_data;  /* Do we have ack-can-send set */
        log.u_bbr.flex7 <<= 1;
        log.u_bbr.flex7 |= rack->r_fast_output;  /* is fast output primed */
        log.u_bbr.flex7 <<= 1;
        log.u_bbr.flex7 |= rack->r_wanted_output;  /* Do we want output */
        log.u_bbr.flex8 = rack->rc_in_persist;
        log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
        log.u_bbr.timeStamp = tcp_get_usecs(&tv);
        log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
        log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
        log.u_bbr.use_lt_bw <<= 1;
        log.u_bbr.use_lt_bw |= rack->r_might_revert;
        log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
        log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
        log.u_bbr.pacing_gain = rack->r_must_retran;
        log.u_bbr.bw_inuse = rack->r_ctl.current_round;
        log.u_bbr.bw_inuse <<= 32;
        log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
        log.u_bbr.epoch = rack->rc_inp->inp_socket->so_snd.sb_hiwat;
        log.u_bbr.lt_epoch = rack->rc_inp->inp_socket->so_rcv.sb_hiwat;
        log.u_bbr.lost = rack->rc_tp->t_srtt;
        log.u_bbr.pkt_epoch = rack->rc_tp->rfbuf_cnt;
        TCP_LOG_EVENTP(rack->rc_tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            BBR_LOG_DOSEG_DONE, 0,
            0, &log, false, &tv);
    }
}

static void
rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint8_t frm)
{
    if (tcp_bblogging_on(rack->rc_tp)) {
        union tcp_log_stackspecific log;
        struct timeval tv;

        memset(&log, 0, sizeof(log));
        log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs;
        log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
        log.u_bbr.flex4 = arg1;
        log.u_bbr.flex5 = arg2;
        log.u_bbr.flex7 = rack->r_ctl.rc_user_set_min_segs;
        log.u_bbr.flex6 = arg3;
        log.u_bbr.flex8 = frm;
        log.u_bbr.timeStamp = tcp_get_usecs(&tv);
        log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
        log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
        log.u_bbr.applimited = rack->r_ctl.rc_sacked;
        log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
        log.u_bbr.pacing_gain = rack->r_must_retran;
        TCP_LOG_EVENTP(tp, NULL, &tptosocket(tp)->so_rcv,
            &tptosocket(tp)->so_snd,
            TCP_HDWR_PACE_SIZE, 0, 0, &log, false, &tv);
    }
}

static void
rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t pacing_delay,
    uint8_t hpts_calling, int reason, uint32_t cwnd_to_use)
{
    if (tcp_bblogging_on(rack->rc_tp)) {
        union tcp_log_stackspecific log;
        struct timeval tv;

        memset(&log, 0, sizeof(log));
        log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
        log.u_bbr.flex1 = pacing_delay;
        log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags;
        log.u_bbr.flex4 = reason;
        if (rack->rack_no_prr)
            log.u_bbr.flex5 = 0;
        else
            log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
        log.u_bbr.flex7 = hpts_calling;
        log.u_bbr.flex8 = rack->rc_in_persist;
        log.u_bbr.lt_epoch = cwnd_to_use;
        log.u_bbr.timeStamp = tcp_get_usecs(&tv);
        log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
        log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
        log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
        log.u_bbr.pacing_gain = rack->r_must_retran;
        log.u_bbr.cwnd_gain = rack->rc_has_collapsed;
        log.u_bbr.bw_inuse = rack->r_ctl.current_round;
        log.u_bbr.bw_inuse <<= 32;
        log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
        TCP_LOG_EVENTP(rack->rc_tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            BBR_LOG_JUSTRET, 0,
            tlen, &log, false, &tv);
    }
}

static void
rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32_t us_cts,
    struct timeval *tv, uint32_t flags_on_entry)
{
    if (tcp_bblogging_on(rack->rc_tp)) {
        union tcp_log_stackspecific log;

        memset(&log, 0, sizeof(log));
        log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
        log.u_bbr.flex1 = line;
        log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to;
        log.u_bbr.flex3 = flags_on_entry;
        log.u_bbr.flex4 = us_cts;
        if (rack->rack_no_prr)
            log.u_bbr.flex5 = 0;
        else
            log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt;
        log.u_bbr.flex6 = rack->rc_tp->t_rxtcur;
        log.u_bbr.flex7 = hpts_removed;
        log.u_bbr.flex8 = 1;
        log.u_bbr.applimited = rack->r_ctl.rc_hpts_flags;
        log.u_bbr.timeStamp = us_cts;
        log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
        log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
        log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
        log.u_bbr.pacing_gain = rack->r_must_retran;
        log.u_bbr.bw_inuse = rack->r_ctl.current_round;
        log.u_bbr.bw_inuse <<= 32;
        log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost;
        TCP_LOG_EVENTP(rack->rc_tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            BBR_LOG_TIMERCANC, 0,
            0, &log, false, tv);
    }
}

static void
rack_log_alt_to_to_cancel(struct tcp_rack *rack,
    uint32_t flex1, uint32_t flex2,
    uint32_t flex3, uint32_t flex4,
    uint32_t flex5, uint32_t flex6,
    uint16_t flex7, uint8_t mod)
{
    if (tcp_bblogging_on(rack->rc_tp)) {
        union tcp_log_stackspecific log;
        struct timeval tv;

        if (mod == 1) {
            /* No, you can't use 1; it's reserved for the real timer cancel */
            return;
        }
        memset(&log, 0, sizeof(log));
        log.u_bbr.timeStamp = tcp_get_usecs(&tv);
        log.u_bbr.flex1 = flex1;
        log.u_bbr.flex2 = flex2;
        log.u_bbr.flex3 = flex3;
        log.u_bbr.flex4 = flex4;
        log.u_bbr.flex5 = flex5;
        log.u_bbr.flex6 = flex6;
        log.u_bbr.flex7 = flex7;
        log.u_bbr.flex8 = mod;
        TCP_LOG_EVENTP(rack->rc_tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            BBR_LOG_TIMERCANC, 0,
            0, &log, false, &tv);
    }
}

static void
rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers)
{
    if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
        union tcp_log_stackspecific log;
        struct timeval tv;

        memset(&log, 0, sizeof(log));
        log.u_bbr.flex1 = timers;
        log.u_bbr.flex2 = ret;
        log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp;
        log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
        log.u_bbr.flex5 = cts;
        if (rack->rack_no_prr)
            log.u_bbr.flex6 = 0;
        else
            log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt;
        log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto;
        log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto;
        log.u_bbr.pacing_gain = rack->r_must_retran;
        log.u_bbr.timeStamp = tcp_get_usecs(&tv);
        log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
        TCP_LOG_EVENTP(rack->rc_tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            BBR_LOG_TO_PROCESS, 0,
            0, &log, false, &tv);
    }
}

static void
rack_log_to_prr(struct tcp_rack *rack, int frm, int orig_cwnd, int line)
{
    if (tcp_bblogging_on(rack->rc_tp)) {
        union tcp_log_stackspecific log;
        struct timeval tv;

        memset(&log, 0, sizeof(log));
        log.u_bbr.flex1 = rack->r_ctl.rc_prr_out;
        log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs;
        if (rack->rack_no_prr)
            log.u_bbr.flex3 = 0;
        else
            log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt;
        log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered;
        log.u_bbr.flex5 = rack->r_ctl.rc_sacked;
        log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt;
        log.u_bbr.flex7 = line;
        log.u_bbr.flex8 = frm;
        log.u_bbr.pkts_out = orig_cwnd;
        log.u_bbr.timeStamp = tcp_get_usecs(&tv);
        log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
        log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
        log.u_bbr.use_lt_bw <<= 1;
        log.u_bbr.use_lt_bw |= rack->r_might_revert;
        TCP_LOG_EVENTP(rack->rc_tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            BBR_LOG_BBRUPD, 0,
            0, &log, false, &tv);
    }
}

static void
rack_counter_destroy(void)
{
    counter_u64_free(rack_total_bytes);
    counter_u64_free(rack_fto_send);
    counter_u64_free(rack_fto_rsm_send);
    counter_u64_free(rack_nfto_resend);
    counter_u64_free(rack_hw_pace_init_fail);
    counter_u64_free(rack_hw_pace_lost);
    counter_u64_free(rack_non_fto_send);
    counter_u64_free(rack_extended_rfo);
    counter_u64_free(rack_tlp_tot);
    counter_u64_free(rack_tlp_newdata);
    counter_u64_free(rack_tlp_retran);
    counter_u64_free(rack_tlp_retran_bytes);
    counter_u64_free(rack_to_tot);
    counter_u64_free(rack_saw_enobuf);
    counter_u64_free(rack_saw_enobuf_hw);
    counter_u64_free(rack_saw_enetunreach);
    counter_u64_free(rack_hot_alloc);
    counter_u64_free(rack_to_alloc);
    counter_u64_free(rack_to_alloc_hard);
    counter_u64_free(rack_to_alloc_emerg);
    counter_u64_free(rack_to_alloc_limited);
    counter_u64_free(rack_alloc_limited_conns);
    counter_u64_free(rack_split_limited);
    counter_u64_free(rack_multi_single_eq);
    counter_u64_free(rack_rxt_clamps_cwnd);
    counter_u64_free(rack_rxt_clamps_cwnd_uniq);
    counter_u64_free(rack_proc_non_comp_ack);
    counter_u64_free(rack_sack_proc_all);
    counter_u64_free(rack_sack_proc_restart);
    counter_u64_free(rack_sack_proc_short);
    counter_u64_free(rack_input_idle_reduces);
    counter_u64_free(rack_collapsed_win);
    counter_u64_free(rack_collapsed_win_rxt);
    counter_u64_free(rack_collapsed_win_rxt_bytes);
    counter_u64_free(rack_collapsed_win_seen);
    counter_u64_free(rack_try_scwnd);
    counter_u64_free(rack_persists_sends);
    counter_u64_free(rack_persists_acks);
    counter_u64_free(rack_persists_loss);
    counter_u64_free(rack_persists_lost_ends);
#ifdef INVARIANTS
    counter_u64_free(rack_adjust_map_bw);
#endif
    COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE);
    COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE);
}

static struct rack_sendmap *
rack_alloc(struct tcp_rack *rack)
{
    struct rack_sendmap *rsm;

    /*
     * First try the top of the free list; in theory
     * it is the "hottest" rsm we have, possibly just
     * freed by ack processing.
     */
    if (rack->rc_free_cnt > rack_free_cache) {
        rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
        TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
        counter_u64_add(rack_hot_alloc, 1);
        rack->rc_free_cnt--;
        return (rsm);
    }
    /*
     * Once we get under our free cache we probably
     * no longer have a "hot" one available. Let's
     * get one from UMA.
     */
    rsm = uma_zalloc(rack_zone, M_NOWAIT);
    if (rsm) {
        rack->r_ctl.rc_num_maps_alloced++;
        counter_u64_add(rack_to_alloc, 1);
        return (rsm);
    }
    /*
     * Dig into our aux rsm's (the last two) since
     * UMA failed to get us one.
     */
    if (rack->rc_free_cnt) {
        counter_u64_add(rack_to_alloc_emerg, 1);
        rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
        TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
        rack->rc_free_cnt--;
        return (rsm);
    }
    return (NULL);
}

static struct rack_sendmap *
rack_alloc_full_limit(struct tcp_rack *rack)
{
    if ((V_tcp_map_entries_limit > 0) &&
        (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) {
        counter_u64_add(rack_to_alloc_limited, 1);
        if (!rack->alloc_limit_reported) {
            rack->alloc_limit_reported = 1;
            counter_u64_add(rack_alloc_limited_conns, 1);
        }
        return (NULL);
    }
    return (rack_alloc(rack));
}

/* wrapper to allocate a sendmap entry, subject to a specific limit */
static struct rack_sendmap *
rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type)
{
    struct rack_sendmap *rsm;

    if (limit_type) {
        /* currently there is only one limit type */
        if (rack->r_ctl.rc_split_limit > 0 &&
            rack->r_ctl.rc_num_split_allocs >= rack->r_ctl.rc_split_limit) {
            counter_u64_add(rack_split_limited, 1);
            if (!rack->alloc_limit_reported) {
                rack->alloc_limit_reported = 1;
                counter_u64_add(rack_alloc_limited_conns, 1);
            }
            return (NULL);
        }
    }

    /* allocate and mark in the limit type, if set */
    rsm = rack_alloc(rack);
    if (rsm != NULL && limit_type) {
        rsm->r_limit_type = limit_type;
        rack->r_ctl.rc_num_split_allocs++;
    }
    return (rsm);
}

static void
rack_free_trim(struct tcp_rack *rack)
{
    struct rack_sendmap *rsm;

    /*
     * Free up all the tail entries until
     * we get our list down to the limit.
     */
    while (rack->rc_free_cnt > rack_free_cache) {
        rsm = TAILQ_LAST(&rack->r_ctl.rc_free, rack_head);
        TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
        rack->rc_free_cnt--;
        rack->r_ctl.rc_num_maps_alloced--;
        uma_zfree(rack_zone, rsm);
    }
}

static void
rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm)
{
    if (rsm->r_flags & RACK_APP_LIMITED) {
        KASSERT((rack->r_ctl.rc_app_limited_cnt > 0),
            ("app_cnt %u, rsm %p", rack->r_ctl.rc_app_limited_cnt, rsm));
        rack->r_ctl.rc_app_limited_cnt--;
    }
    if (rsm->r_limit_type) {
        /* currently there is only one limit type */
        rack->r_ctl.rc_num_split_allocs--;
    }
    if (rsm == rack->r_ctl.rc_first_appl) {
        rack->r_ctl.cleared_app_ack_seq = rsm->r_end;
        rack->r_ctl.cleared_app_ack = 1;
        if (rack->r_ctl.rc_app_limited_cnt == 0)
            rack->r_ctl.rc_first_appl = NULL;
        else
            rack->r_ctl.rc_first_appl = tqhash_find(rack->r_ctl.tqh, rsm->r_nseq_appl);
    }
    if (rsm == rack->r_ctl.rc_resend)
        rack->r_ctl.rc_resend = NULL;
    if (rsm == rack->r_ctl.rc_end_appl)
        rack->r_ctl.rc_end_appl = NULL;
    if (rack->r_ctl.rc_tlpsend == rsm)
        rack->r_ctl.rc_tlpsend = NULL;
    if (rack->r_ctl.rc_sacklast == rsm)
        rack->r_ctl.rc_sacklast = NULL;
    memset(rsm, 0, sizeof(struct rack_sendmap));
    /* Make sure we are not going to overrun our count limit of 0xff */
    if ((rack->rc_free_cnt + 1) > RACK_FREE_CNT_MAX) {
        rack_free_trim(rack);
    }
    TAILQ_INSERT_HEAD(&rack->r_ctl.rc_free, rsm, r_tnext);
    rack->rc_free_cnt++;
}

static uint32_t
rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack)
{
    uint64_t srtt, bw, len, tim;
    uint32_t segsiz, def_len, minl;

    segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
    def_len = rack_def_data_window * segsiz;
    if (rack->rc_gp_filled == 0) {
        /*
         * We have no measurement (IW is in flight?) so
         * we can only guess using our data_window sysctl
         * value (usually 20MSS).
         */
        return (def_len);
    }
    /*
     * Now we have a number of factors to consider.
     *
     * 1) We have a desired BDP which is usually
     *    at least 2.
     * 2) We have a minimum number of rtt's, usually 1 SRTT,
     *    but we allow it to be more.
     * 3) We want to make sure a measurement lasts N useconds (if
     *    we have set rack_min_measure_usec).
     *
     * We handle the first concern here by trying to create a data
     * window of max(rack_def_data_window, DesiredBDP). The
     * second concern we handle in not letting the measurement
     * window end normally until at least the required SRTT's
     * have gone by which is done further below in
     * rack_enough_for_measurement(). Finally the third concern
     * we also handle here by calculating how long that time
     * would take at the current BW and then return the
     * max of our first calculation and that length. Note
     * that if rack_min_measure_usec is 0, we don't deal
     * with concern 3. Also for both concern 1 and 3 an
     * application limited period could end the measurement
     * earlier.
     *
     * So let's calculate the BDP with the "known" b/w using
     * the SRTT as our rtt and then multiply it by the goal.
     */
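    /*
     * Illustrative numbers (not from the source): with bw at
     * 12,500,000 bytes/sec and an SRTT of 40,000 usec, the BDP is
     * 12500000 * 40000 / 1000000 = 500,000 bytes; with rack_goal_bdp
     * of 2 that becomes 1,000,000 bytes, rounded up to a segment
     * boundary below.
     */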
    bw = rack_get_bw(rack);
    srtt = (uint64_t)tp->t_srtt;
    len = bw * srtt;
    len /= (uint64_t)HPTS_USEC_IN_SEC;
    len *= max(1, rack_goal_bdp);
    /* Now we need to round up to the nearest MSS */
    len = roundup(len, segsiz);
    if (rack_min_measure_usec) {
        /* Now calculate our min length for this b/w */
        tim = rack_min_measure_usec;
        minl = (tim * bw) / (uint64_t)HPTS_USEC_IN_SEC;
        if (minl == 0)
            minl = 1;
        minl = roundup(minl, segsiz);
        if (len < minl)
            len = minl;
    }
    /*
     * Now if we have a very small window we want
     * to attempt to keep the measurement window
     * as small as possible. This happens on
     * low b/w connections and we don't want to
     * span huge numbers of rtt's between measurements.
     *
     * We basically include 2 over our "MIN window" so
     * that the measurement can be shortened (possibly) by
     * an ack'ed packet.
     */
    if (len < def_len)
        return (max((uint32_t)len, ((MIN_GP_WIN+2) * segsiz)));
    else
        return (max((uint32_t)len, def_len));
}

static int
rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack, uint8_t *quality)
{
    uint32_t tim, srtts, segsiz;

    /*
     * Has enough time passed for the GP measurement to be valid?
     */
    if (SEQ_LT(th_ack, tp->gput_seq)) {
        /* Not enough bytes yet */
        return (0);
    }
    if ((tp->snd_max == tp->snd_una) ||
        (th_ack == tp->snd_max)){
        /*
         * All is acked. The quality of an all-acked measurement
         * is usually low or medium, but in theory we could split
         * all-acked into two cases: where you got
         * a significant amount of your window and
         * where you did not. For now we leave it,
         * but it is something to contemplate in the
         * future. The danger here is that delayed ack
         * is affecting the last byte (which is a 50:50 chance).
         */
        *quality = RACK_QUALITY_ALLACKED;
        return (1);
    }
    if (SEQ_GEQ(th_ack, tp->gput_ack)) {
        /*
         * We obtained our entire window of data we wanted,
         * no matter if we are in recovery or not; then
         * it's ok since expanding the window does not
         * make things fuzzy (or at least not as much).
         */
        *quality = RACK_QUALITY_HIGH;
        return (1);
    }
    segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
    if (SEQ_LT(th_ack, tp->gput_ack) &&
        ((th_ack - tp->gput_seq) < max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) {
        /* Not enough bytes yet */
        return (0);
    }
    if (rack->r_ctl.rc_first_appl &&
        (SEQ_GEQ(th_ack, rack->r_ctl.rc_first_appl->r_end))) {
        /*
         * We are up to the app limited send point,
         * so we have to measure irrespective of the time.
         */
        *quality = RACK_QUALITY_APPLIMITED;
        return (1);
    }
    /* Now what about time? */
    srtts = (rack->r_ctl.rc_gp_srtt * rack_min_srtts);
    tim = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time) - tp->gput_ts;
    if ((tim >= srtts) && (IN_RECOVERY(rack->rc_tp->t_flags) == 0)) {
        /*
         * We do not allow a measurement if we are in recovery
         * that would shrink the goodput window we wanted.
         * This is to prevent cloudiness of when the last send
         * was actually made.
         */
        *quality = RACK_QUALITY_HIGH;
        return (1);
    }
    /* Nope, not even a full SRTT has passed */
    return (0);
}

static void
rack_log_timely(struct tcp_rack *rack,
    uint32_t logged, uint64_t cur_bw, uint64_t low_bnd,
    uint64_t up_bnd, int line, uint8_t method)
{
    if (tcp_bblogging_on(rack->rc_tp)) {
        union tcp_log_stackspecific log;
        struct timeval tv;

        memset(&log, 0, sizeof(log));
        log.u_bbr.flex1 = logged;
        log.u_bbr.flex2 = rack->rc_gp_timely_inc_cnt;
        log.u_bbr.flex2 <<= 4;
        log.u_bbr.flex2 |= rack->rc_gp_timely_dec_cnt;
        log.u_bbr.flex2 <<= 4;
        log.u_bbr.flex2 |= rack->rc_gp_incr;
        log.u_bbr.flex2 <<= 4;
        log.u_bbr.flex2 |= rack->rc_gp_bwred;
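        /*
         * flex2 now carries four 4-bit fields, high nibble first:
         * timely increase count, timely decrease count, rc_gp_incr
         * and rc_gp_bwred.
         */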
        log.u_bbr.flex3 = rack->rc_gp_incr;
        log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss;
        log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ca;
        log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_rec;
        log.u_bbr.flex7 = rack->rc_gp_bwred;
        log.u_bbr.flex8 = method;
        log.u_bbr.cur_del_rate = cur_bw;
        log.u_bbr.delRate = low_bnd;
        log.u_bbr.bw_inuse = up_bnd;
        log.u_bbr.rttProp = rack_get_bw(rack);
        log.u_bbr.pkt_epoch = line;
        log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff;
        log.u_bbr.timeStamp = tcp_get_usecs(&tv);
        log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
        log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt;
        log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt;
        log.u_bbr.cwnd_gain = rack->rc_dragged_bottom;
        log.u_bbr.cwnd_gain <<= 1;
        log.u_bbr.cwnd_gain |= rack->rc_gp_saw_rec;
        log.u_bbr.cwnd_gain <<= 1;
        log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss;
        log.u_bbr.cwnd_gain <<= 1;
        log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca;
        log.u_bbr.lost = rack->r_ctl.rc_loss_count;
        TCP_LOG_EVENTP(rack->rc_tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            TCP_TIMELY_WORK, 0,
            0, &log, false, &tv);
    }
}

static int
rack_bw_can_be_raised(struct tcp_rack *rack, uint64_t cur_bw, uint64_t last_bw_est, uint16_t mult)
{
    /*
     * Before we increase we need to know if
     * the estimate just made was less than
     * our pacing goal (i.e. (cur_bw * mult) > last_bw_est)
     *
     * If we already are pacing at a fast enough
     * rate to push us faster there is no sense in
     * increasing.
     *
     * We first calculate our actual pacing rate (ss or ca multiplier
     * times our cur_bw).
     *
     * Then we take the last measured rate and multiply by our
     * maximum pacing overage to give us a max allowable rate.
     *
     * If our act_rate is smaller than our max_allowable rate
     * then we should increase. Else we should hold steady.
     */
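    /*
     * Illustrative numbers (not from the source): cur_bw = 1,000,000
     * bytes/sec with mult = 120 gives act_rate = 1,200,000. If
     * last_bw_est = 1,150,000 and rack_max_per_above = 10, then
     * max_allow_rate = 1,265,000; act_rate is below that, so the
     * raise is allowed.
     */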
    uint64_t act_rate, max_allow_rate;

    if (rack_timely_no_stopping)
        return (1);

    if ((cur_bw == 0) || (last_bw_est == 0)) {
        /*
         * Initial startup case or
         * everything is acked case.
         */
        rack_log_timely(rack, mult, cur_bw, 0, 0,
            __LINE__, 9);
        return (1);
    }
    if (mult <= 100) {
        /*
         * We can always pace at or slightly above our rate.
         */
        rack_log_timely(rack, mult, cur_bw, 0, 0,
            __LINE__, 9);
        return (1);
    }
    act_rate = cur_bw * (uint64_t)mult;
    act_rate /= 100;
    max_allow_rate = last_bw_est * ((uint64_t)rack_max_per_above + (uint64_t)100);
    max_allow_rate /= 100;
    if (act_rate < max_allow_rate) {
        /*
         * Here the rate we are actually pacing at
         * is smaller than 10% above our last measurement.
         * This means we are pacing below what we would
         * like to try to achieve (plus some wiggle room).
         */
        rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate,
            __LINE__, 9);
        return (1);
    } else {
        /*
         * Here we are already pacing at least rack_max_per_above (10%)
         * above what we are getting back. This indicates most likely
         * that we are being limited (cwnd/rwnd/app) and can't
         * get any more b/w. There is no sense in trying to
         * raise the pacing rate; it's not speeding us up
         * and we already are pacing faster than we are getting.
         */
        rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate,
            __LINE__, 8);
        return (0);
    }
}

static void
rack_validate_multipliers_at_or_above100(struct tcp_rack *rack)
{
    /*
     * When we drag bottom, we want to assure
     * that no multiplier is below 1.0; if one
     * is, we restore it to at least that.
     */
    if (rack->r_ctl.rack_per_of_gp_rec < 100) {
        /* This is unlikely; we usually do not touch recovery */
        rack->r_ctl.rack_per_of_gp_rec = 100;
    }
    if (rack->r_ctl.rack_per_of_gp_ca < 100) {
        rack->r_ctl.rack_per_of_gp_ca = 100;
    }
    if (rack->r_ctl.rack_per_of_gp_ss < 100) {
        rack->r_ctl.rack_per_of_gp_ss = 100;
    }
}

static void
rack_validate_multipliers_at_or_below_100(struct tcp_rack *rack)
{
    if (rack->r_ctl.rack_per_of_gp_ca > 100) {
        rack->r_ctl.rack_per_of_gp_ca = 100;
    }
    if (rack->r_ctl.rack_per_of_gp_ss > 100) {
        rack->r_ctl.rack_per_of_gp_ss = 100;
    }
}

static void
rack_increase_bw_mul(struct tcp_rack *rack, int timely_says, uint64_t cur_bw, uint64_t last_bw_est, int override)
{
    int32_t calc, logged, plus;

    logged = 0;

    if (rack->rc_skip_timely)
        return;
    if (override) {
        /*
         * override is passed when we are
         * losing b/w and making one last
         * gasp at trying to not lose out
         * to a new-reno flow.
         */
        goto extra_boost;
    }
    /* In classic timely we boost by 5x if we have 5 increases in a row, let's not */
    if (rack->rc_gp_incr &&
        ((rack->rc_gp_timely_inc_cnt + 1) >= RACK_TIMELY_CNT_BOOST)) {
        /*
         * Reset and get 5 strokes more before the boost. Note
         * that the count is 0 based so we have to add one.
         */
extra_boost:
        plus = (uint32_t)rack_gp_increase_per * RACK_TIMELY_CNT_BOOST;
        rack->rc_gp_timely_inc_cnt = 0;
    } else
        plus = (uint32_t)rack_gp_increase_per;
    /* Must be at least 1% increase for true timely increases */
    if ((plus < 1) &&
        ((rack->r_ctl.rc_rtt_diff <= 0) || (timely_says <= 0)))
        plus = 1;
    if (rack->rc_gp_saw_rec &&
        (rack->rc_gp_no_rec_chg == 0) &&
        rack_bw_can_be_raised(rack, cur_bw, last_bw_est,
        rack->r_ctl.rack_per_of_gp_rec)) {
        /* We have been in recovery; ding it too */
        calc = rack->r_ctl.rack_per_of_gp_rec + plus;
        if (calc > 0xffff)
            calc = 0xffff;
        logged |= 1;
        rack->r_ctl.rack_per_of_gp_rec = (uint16_t)calc;
        if (rack->r_ctl.rack_per_upper_bound_ca &&
            (rack->rc_dragged_bottom == 0) &&
            (rack->r_ctl.rack_per_of_gp_rec > rack->r_ctl.rack_per_upper_bound_ca))
            rack->r_ctl.rack_per_of_gp_rec = rack->r_ctl.rack_per_upper_bound_ca;
    }
    if (rack->rc_gp_saw_ca &&
        (rack->rc_gp_saw_ss == 0) &&
        rack_bw_can_be_raised(rack, cur_bw, last_bw_est,
        rack->r_ctl.rack_per_of_gp_ca)) {
        /* In CA */
        calc = rack->r_ctl.rack_per_of_gp_ca + plus;
        if (calc > 0xffff)
            calc = 0xffff;
        logged |= 2;
        rack->r_ctl.rack_per_of_gp_ca = (uint16_t)calc;
        if (rack->r_ctl.rack_per_upper_bound_ca &&
            (rack->rc_dragged_bottom == 0) &&
            (rack->r_ctl.rack_per_of_gp_ca > rack->r_ctl.rack_per_upper_bound_ca))
            rack->r_ctl.rack_per_of_gp_ca = rack->r_ctl.rack_per_upper_bound_ca;
    }
    if (rack->rc_gp_saw_ss &&
        rack_bw_can_be_raised(rack, cur_bw, last_bw_est,
        rack->r_ctl.rack_per_of_gp_ss)) {
        /* In SS */
        calc = rack->r_ctl.rack_per_of_gp_ss + plus;
        if (calc > 0xffff)
            calc = 0xffff;
        rack->r_ctl.rack_per_of_gp_ss = (uint16_t)calc;
        if (rack->r_ctl.rack_per_upper_bound_ss &&
            (rack->rc_dragged_bottom == 0) &&
            (rack->r_ctl.rack_per_of_gp_ss > rack->r_ctl.rack_per_upper_bound_ss))
            rack->r_ctl.rack_per_of_gp_ss = rack->r_ctl.rack_per_upper_bound_ss;
        logged |= 4;
    }
    if (logged &&
        (rack->rc_gp_incr == 0)){
        /* Go into increment mode */
        rack->rc_gp_incr = 1;
        rack->rc_gp_timely_inc_cnt = 0;
    }
    if (rack->rc_gp_incr &&
        logged &&
        (rack->rc_gp_timely_inc_cnt < RACK_TIMELY_CNT_BOOST)) {
        rack->rc_gp_timely_inc_cnt++;
    }
    rack_log_timely(rack, logged, plus, 0, 0,
        __LINE__, 1);
}

static uint32_t
rack_get_decrease(struct tcp_rack *rack, uint32_t curper, int32_t rtt_diff)
{
    /*-
     * norm_grad = rtt_diff / minrtt;
     * new_per = curper * (1 - B * norm_grad)
     *
     * B = rack_gp_decrease_per (default 80%)
     * rtt_diff = input var current rtt-diff
     * curper = input var current percentage
     * minrtt = from rack filter
     *
     * In order to do the floating point calculations above we
     * do an integer conversion. The code looks confusing so let me
     * translate it into something that uses more variables and
     * is clearer for us humans :)
     *
     * uint64_t norm_grad, inverse, reduce_by, final_result;
     * uint32_t perf;
     *
     * norm_grad = (((uint64_t)rtt_diff * 1000000) /
     *    (uint64_t)get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt));
     * inverse = ((uint64_t)rack_gp_decrease * (uint64_t)1000000) * norm_grad;
     * inverse /= 1000000;
     * reduce_by = (1000000 - inverse);
     * final_result = (curper * reduce_by) / 1000000;
     * perf = (uint32_t)final_result;
     */
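    /*
     * Illustrative numbers (not from the source): curper = 200,
     * rtt_diff = 5000 usec, minrtt = 20000 usec, B = 80. Then
     * norm_grad = 0.25, reduce_by = 1 - 0.8 * 0.25 = 0.8, and the
     * result is 200 * 0.8 = 160.
     */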
    uint64_t perf;

    perf = (((uint64_t)curper * ((uint64_t)1000000 -
        ((uint64_t)rack_gp_decrease_per * (uint64_t)10000 *
        (((uint64_t)rtt_diff * (uint64_t)1000000)/
        (uint64_t)get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)))/
        (uint64_t)1000000)) /
        (uint64_t)1000000);
    if (perf > curper) {
        /* TSNH */
        perf = curper - 1;
    }
    return ((uint32_t)perf);
}

static uint32_t
rack_decrease_highrtt(struct tcp_rack *rack, uint32_t curper, uint32_t rtt)
{
    /*
     *                                   highrttthresh
     * result = curper * (1 - (B * (1 - -------------)))
     *                                      gp_srtt
     *
     * B = rack_gp_decrease_per (default .8 i.e. 80)
     * highrttthresh = filter_min * rack_gp_rtt_maxmul
     */
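    /*
     * Illustrative numbers (not from the source): curper = 200,
     * highrttthresh = 40000 usec, rtt = 80000 usec, B = 80. Then
     * result = 200 * (1 - 0.8 * (1 - 0.5)) = 200 * 0.6 = 120.
     */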
    uint64_t perf;
    uint32_t highrttthresh;

    highrttthresh = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul;

    perf = (((uint64_t)curper * ((uint64_t)1000000 -
        ((uint64_t)rack_gp_decrease_per * ((uint64_t)1000000 -
        ((uint64_t)highrttthresh * (uint64_t)1000000) /
        (uint64_t)rtt)) / 100)) / (uint64_t)1000000);
    if (tcp_bblogging_on(rack->rc_tp)) {
        uint64_t log1;

        log1 = rtt;
        log1 <<= 32;
        log1 |= highrttthresh;
        rack_log_timely(rack,
            rack_gp_decrease_per,
            (uint64_t)curper,
            log1,
            perf,
            __LINE__,
            15);
    }
    return (perf);
}

static void
rack_decrease_bw_mul(struct tcp_rack *rack, int timely_says, uint32_t rtt, int32_t rtt_diff)
{
    uint64_t logvar, logvar2, logvar3;
    uint32_t logged, new_per, ss_red, ca_red, rec_red, alt, val;

    if (rack->rc_skip_timely)
        return;
    if (rack->rc_gp_incr) {
        /* Turn off increment counting */
        rack->rc_gp_incr = 0;
        rack->rc_gp_timely_inc_cnt = 0;
    }
    ss_red = ca_red = rec_red = 0;
    logged = 0;
    /* Calculate the reduction value */
    if (rtt_diff < 0) {
        rtt_diff *= -1;
    }
    /* Must be at least 1% reduction */
    if (rack->rc_gp_saw_rec && (rack->rc_gp_no_rec_chg == 0)) {
        /* We have been in recovery; ding it too */
        if (timely_says == 2) {
            new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_rec, rtt);
            alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff);
            if (alt < new_per)
                val = alt;
            else
                val = new_per;
        } else
            val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff);
        if (rack->r_ctl.rack_per_of_gp_rec > val) {
            rec_red = (rack->r_ctl.rack_per_of_gp_rec - val);
            rack->r_ctl.rack_per_of_gp_rec = (uint16_t)val;
        } else {
            rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound;
            rec_red = 0;
        }
        if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_rec)
            rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound;
        logged |= 1;
    }
    if (rack->rc_gp_saw_ss) {
        /* Sent in SS */
        if (timely_says == 2) {
            new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ss, rtt);
            alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff);
            if (alt < new_per)
                val = alt;
            else
                val = new_per;
        } else
            val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff);
        if (rack->r_ctl.rack_per_of_gp_ss > new_per) {
            ss_red = rack->r_ctl.rack_per_of_gp_ss - val;
            rack->r_ctl.rack_per_of_gp_ss = (uint16_t)val;
        } else {
            ss_red = new_per;
            rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound;
            logvar = new_per;
            logvar <<= 32;
            logvar |= alt;
            logvar2 = (uint32_t)rtt;
            logvar2 <<= 32;
            logvar2 |= (uint32_t)rtt_diff;
            logvar3 = rack_gp_rtt_maxmul;
            logvar3 <<= 32;
            logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
            rack_log_timely(rack, timely_says,
                logvar2, logvar3,
                logvar, __LINE__, 10);
        }
        if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ss)
            rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound;
        logged |= 4;
    } else if (rack->rc_gp_saw_ca) {
        /* Sent in CA */
        if (timely_says == 2) {
            new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ca, rtt);
            alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff);
            if (alt < new_per)
                val = alt;
            else
                val = new_per;
        } else
            val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff);
        if (rack->r_ctl.rack_per_of_gp_ca > val) {
            ca_red = rack->r_ctl.rack_per_of_gp_ca - val;
            rack->r_ctl.rack_per_of_gp_ca = (uint16_t)val;
        } else {
            rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound;
            ca_red = 0;
            logvar = new_per;
            logvar <<= 32;
            logvar |= alt;
            logvar2 = (uint32_t)rtt;
            logvar2 <<= 32;
            logvar2 |= (uint32_t)rtt_diff;
            logvar3 = rack_gp_rtt_maxmul;
            logvar3 <<= 32;
            logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
            rack_log_timely(rack, timely_says,
                logvar2, logvar3,
                logvar, __LINE__, 10);
        }
        if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ca)
            rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound;
        logged |= 2;
    }
    if (rack->rc_gp_timely_dec_cnt < 0x7) {
        rack->rc_gp_timely_dec_cnt++;
        if (rack_timely_dec_clear &&
            (rack->rc_gp_timely_dec_cnt == rack_timely_dec_clear))
            rack->rc_gp_timely_dec_cnt = 0;
    }
    logvar = ss_red;
    logvar <<= 32;
    logvar |= ca_red;
    rack_log_timely(rack, logged, rec_red, rack_per_lower_bound, logvar,
        __LINE__, 2);
}

static void
rack_log_rtt_shrinks(struct tcp_rack *rack, uint32_t us_cts,
    uint32_t rtt, uint32_t line, uint8_t reas)
{
    if (tcp_bblogging_on(rack->rc_tp)) {
        union tcp_log_stackspecific log;
        struct timeval tv;

        memset(&log, 0, sizeof(log));
        log.u_bbr.flex1 = line;
        log.u_bbr.flex2 = rack->r_ctl.rc_time_probertt_starts;
        log.u_bbr.flex3 = rack->r_ctl.rc_lower_rtt_us_cts;
        log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss;
        log.u_bbr.flex5 = rtt;
        log.u_bbr.flex6 = rack->rc_highly_buffered;
        log.u_bbr.flex6 <<= 1;
        log.u_bbr.flex6 |= rack->forced_ack;
        log.u_bbr.flex6 <<= 1;
        log.u_bbr.flex6 |= rack->rc_gp_dyn_mul;
        log.u_bbr.flex6 <<= 1;
        log.u_bbr.flex6 |= rack->in_probe_rtt;
        log.u_bbr.flex6 <<= 1;
        log.u_bbr.flex6 |= rack->measure_saw_probe_rtt;
        log.u_bbr.flex7 = rack->r_ctl.rack_per_of_gp_probertt;
        log.u_bbr.pacing_gain = rack->r_ctl.rack_per_of_gp_ca;
        log.u_bbr.cwnd_gain = rack->r_ctl.rack_per_of_gp_rec;
        log.u_bbr.flex8 = reas;
        log.u_bbr.timeStamp = tcp_get_usecs(&tv);
        log.u_bbr.delRate = rack_get_bw(rack);
        log.u_bbr.cur_del_rate = rack->r_ctl.rc_highest_us_rtt;
        log.u_bbr.cur_del_rate <<= 32;
        log.u_bbr.cur_del_rate |= rack->r_ctl.rc_lowest_us_rtt;
        log.u_bbr.applimited = rack->r_ctl.rc_time_probertt_entered;
        log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff;
        log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
        log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt;
        log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt;
        log.u_bbr.pkt_epoch = rack->r_ctl.rc_lower_rtt_us_cts;
        log.u_bbr.delivered = rack->r_ctl.rc_target_probertt_flight;
        log.u_bbr.lost = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
        log.u_bbr.rttProp = us_cts;
        log.u_bbr.rttProp <<= 32;
        log.u_bbr.rttProp |= rack->r_ctl.rc_entry_gp_rtt;
        TCP_LOG_EVENTP(rack->rc_tp, NULL,
            &rack->rc_inp->inp_socket->so_rcv,
            &rack->rc_inp->inp_socket->so_snd,
            BBR_LOG_RTT_SHRINKS, 0,
            0, &log, false, &rack->r_ctl.act_rcv_time);
    }
}

static void
rack_set_prtt_target(struct tcp_rack *rack, uint32_t segsiz, uint32_t rtt)
{
    uint64_t bwdp;

    bwdp = rack_get_bw(rack);
    bwdp *= (uint64_t)rtt;
    bwdp /= (uint64_t)HPTS_USEC_IN_SEC;
    rack->r_ctl.rc_target_probertt_flight = roundup((uint32_t)bwdp, segsiz);
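    /*
     * Illustrative numbers (not from the source): bw = 1,250,000
     * bytes/sec and rtt = 30000 usec give a bwdp of 37,500 bytes;
     * with segsiz = 1460 that rounds up to 26 segments (37,960
     * bytes) of target flight.
     */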
    if (rack->r_ctl.rc_target_probertt_flight < (segsiz * rack_timely_min_segs)) {
        /*
         * A window protocol must be able to have 4 packets
         * outstanding as the floor in order to function
         * (especially considering delayed ack :D).
         */
        rack->r_ctl.rc_target_probertt_flight = (segsiz * rack_timely_min_segs);
    }
}

static void
rack_enter_probertt(struct tcp_rack *rack, uint32_t us_cts)
{
    /**
     * ProbeRTT is a bit different in rack_pacing than in
     * BBR. It is like BBR in that it uses the lowering of
     * the RTT as a signal that we saw something new and
     * counts from there for how long between. But it is
     * different in that it's quite simple. It does not
     * play with the cwnd and wait until we get down
     * to N segments outstanding and hold that for
     * 200ms. Instead it just sets the pacing reduction
     * rate to a set percentage (70 by default) and holds
     * that for a number of recent GP Srtt's.
     */
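    /*
     * In other words (a sketch of the effect, not additional
     * mechanism): with rack_per_of_gp_probertt at its default of 70,
     * pacing simply runs at 70% of the estimated goodput b/w until
     * the hold period expires; the cwnd is left alone.
     */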
    uint32_t segsiz;

    rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
    if (rack->rc_gp_dyn_mul == 0)
        return;

    if (rack->rc_tp->snd_max == rack->rc_tp->snd_una) {
        /* We are idle */
        return;
    }
    if ((rack->rc_tp->t_flags & TF_GPUTINPROG) &&
        SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) {
        /*
         * Stop the goodput now; the idea here is
         * that future measurements with in_probe_rtt
         * won't register if they are not greater, so
         * we want to get what info (if any) is available
         * now.
         */
        rack_do_goodput_measurement(rack->rc_tp, rack,
            rack->rc_tp->snd_una, __LINE__,
            RACK_QUALITY_PROBERTT);
    }
    rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt;
    rack->r_ctl.rc_time_probertt_entered = us_cts;
    segsiz = min(ctf_fixed_maxseg(rack->rc_tp),
        rack->r_ctl.rc_pace_min_segs);
    rack->in_probe_rtt = 1;
    rack->measure_saw_probe_rtt = 1;
    rack->r_ctl.rc_time_probertt_starts = 0;
    rack->r_ctl.rc_entry_gp_rtt = rack->r_ctl.rc_gp_srtt;
    if (rack_probertt_use_min_rtt_entry)
        rack_set_prtt_target(rack, segsiz, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt));
    else
        rack_set_prtt_target(rack, segsiz, rack->r_ctl.rc_gp_srtt);
    rack_log_rtt_shrinks(rack, us_cts, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
        __LINE__, RACK_RTTS_ENTERPROBE);
}


static void
rack_exit_probertt(struct tcp_rack *rack, uint32_t us_cts)
{
    struct rack_sendmap *rsm;
    uint32_t segsiz;

    segsiz = min(ctf_fixed_maxseg(rack->rc_tp),
        rack->r_ctl.rc_pace_min_segs);
    rack->in_probe_rtt = 0;
    if ((rack->rc_tp->t_flags & TF_GPUTINPROG) &&
        SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) {
        /*
         * Stop the goodput now; the idea here is
         * that future measurements with in_probe_rtt
         * won't register if they are not greater, so
         * we want to get what info (if any) is available
         * now.
         */
        rack_do_goodput_measurement(rack->rc_tp, rack,
            rack->rc_tp->snd_una, __LINE__,
            RACK_QUALITY_PROBERTT);
    } else if (rack->rc_tp->t_flags & TF_GPUTINPROG) {
        /*
         * We don't have enough data to make a measurement.
         * So let's just stop and start here after exiting
         * probe-rtt. We probably are not interested in
         * the results anyway.
         */
        rack->rc_tp->t_flags &= ~TF_GPUTINPROG;
    }
    /*
     * Measurements through the current snd_max are going
     * to be limited by the slower pacing rate.
     *
     * We need to mark these as app-limited so we
     * don't collapse the b/w.
     */
    rsm = tqhash_max(rack->r_ctl.tqh);
    if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) {
        if (rack->r_ctl.rc_app_limited_cnt == 0)
            rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm;
        else {
            /*
             * Go out to the end app limited and mark
             * this new one as next and move the end_appl up
             * to this guy.
             */
            if (rack->r_ctl.rc_end_appl)
                rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start;
            rack->r_ctl.rc_end_appl = rsm;
        }
        rsm->r_flags |= RACK_APP_LIMITED;
        rack->r_ctl.rc_app_limited_cnt++;
    }
    /*
     * Now, we need to examine our pacing rate multipliers.
     * If one is under 100%, we need to kick it back up to
     * 100%. We also don't let it be over our "max" above
     * the actual rate i.e. 100% + rack_clamp_atexit_prtt.
     * Note setting clamp_atexit_prtt to 0 has the effect
     * of setting CA/SS to 100% always at exit (which is
     * the default behavior).
     */
    if (rack_probertt_clear_is) {
        rack->rc_gp_incr = 0;
        rack->rc_gp_bwred = 0;
        rack->rc_gp_timely_inc_cnt = 0;
        rack->rc_gp_timely_dec_cnt = 0;
    }
    /* Do we do any clamping at exit? */
    if (rack->rc_highly_buffered && rack_atexit_prtt_hbp) {
        rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt_hbp;
        rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt_hbp;
    }
    if ((rack->rc_highly_buffered == 0) && rack_atexit_prtt) {
        rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt;
        rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt;
    }
    /*
     * Let's set rtt_diff to 0, so that we will get a "boost"
     * after exiting.
     */
    rack->r_ctl.rc_rtt_diff = 0;

    /* Clear all flags so we start fresh */
    rack->rc_tp->t_bytes_acked = 0;
    rack->rc_tp->t_ccv.flags &= ~CCF_ABC_SENTAWND;
    /*
     * If configured to, set the cwnd and ssthresh to
     * our targets.
     */
    if (rack_probe_rtt_sets_cwnd) {
        uint64_t ebdp;
        uint32_t setto;

        /* Set ssthresh so we get into CA once we hit our target */
        if (rack_probertt_use_min_rtt_exit == 1) {
            /* Set to min rtt */
            rack_set_prtt_target(rack, segsiz,
                get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt));
        } else if (rack_probertt_use_min_rtt_exit == 2) {
            /* Set to current gp rtt */
            rack_set_prtt_target(rack, segsiz,
                rack->r_ctl.rc_gp_srtt);
        } else if (rack_probertt_use_min_rtt_exit == 3) {
            /* Set to entry gp rtt */
            rack_set_prtt_target(rack, segsiz,
                rack->r_ctl.rc_entry_gp_rtt);
        } else {
            uint64_t sum;
            uint32_t setval;

            sum = rack->r_ctl.rc_entry_gp_rtt;
            sum *= 10;
            sum /= (uint64_t)(max(1, rack->r_ctl.rc_gp_srtt));
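            /*
             * sum now holds the ratio of the RTT at probertt entry
             * to the current GP srtt, scaled by 10; sum >= 20 means
             * the entry RTT was at least double the current srtt.
             */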
            if (sum >= 20) {
                /*
                 * A highly buffered path needs
                 * cwnd space for timely to work.
                 * Let's set things up as if
                 * we are heading back here again.
                 */
                setval = rack->r_ctl.rc_entry_gp_rtt;
            } else if (sum >= 15) {
                /*
                 * Let's take the smaller of the
                 * two since we are just somewhat
                 * buffered.
                 */
                setval = rack->r_ctl.rc_gp_srtt;
                if (setval > rack->r_ctl.rc_entry_gp_rtt)
                    setval = rack->r_ctl.rc_entry_gp_rtt;
            } else {
                /*
                 * Here we are not highly buffered
                 * and should pick the min we can to
                 * keep from causing loss.
                 */
                setval = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
            }
            rack_set_prtt_target(rack, segsiz,
                setval);
        }
        if (rack_probe_rtt_sets_cwnd > 1) {
            /* There is a percentage here to boost */
            ebdp = rack->r_ctl.rc_target_probertt_flight;
            ebdp *= rack_probe_rtt_sets_cwnd;
            ebdp /= 100;
            setto = rack->r_ctl.rc_target_probertt_flight + ebdp;
        } else
            setto = rack->r_ctl.rc_target_probertt_flight;
        rack->rc_tp->snd_cwnd = roundup(setto, segsiz);
        if (rack->rc_tp->snd_cwnd < (segsiz * rack_timely_min_segs)) {
            /* Enforce a min */
            rack->rc_tp->snd_cwnd = segsiz * rack_timely_min_segs;
        }
        /* If we set in the cwnd also set the ssthresh point so we are in CA */
        rack->rc_tp->snd_ssthresh = (rack->rc_tp->snd_cwnd - 1);
    }
    rack_log_rtt_shrinks(rack, us_cts,
        get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
        __LINE__, RACK_RTTS_EXITPROBE);
    /* Clear times last so log has all the info */
    rack->r_ctl.rc_probertt_sndmax_atexit = rack->rc_tp->snd_max;
    rack->r_ctl.rc_time_probertt_entered = us_cts;
    rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
    rack->r_ctl.rc_time_of_last_probertt = us_cts;
}

static void
rack_check_probe_rtt(struct tcp_rack *rack, uint32_t us_cts)
{
    /* Check in on probe-rtt */

    if (rack->rc_gp_filled == 0) {
        /* We do not do p-rtt unless we have gp measurements */
        return;
    }
    if (rack->in_probe_rtt) {
        uint64_t no_overflow;
        uint32_t endtime, must_stay;

        if (rack->r_ctl.rc_went_idle_time &&
            ((us_cts - rack->r_ctl.rc_went_idle_time) > rack_min_probertt_hold)) {
            /*
             * We went idle during prtt, just exit now.
             */
            rack_exit_probertt(rack, us_cts);
        } else if (rack_probe_rtt_safety_val &&
            TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered) &&
            ((us_cts - rack->r_ctl.rc_time_probertt_entered) > rack_probe_rtt_safety_val)) {
            /*
             * Probe RTT safety value triggered!
             */
            rack_log_rtt_shrinks(rack, us_cts,
                get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
                __LINE__, RACK_RTTS_SAFETY);
            rack_exit_probertt(rack, us_cts);
        }
        /* Calculate the max we will wait */
        endtime = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_max_drain_wait);
        if (rack->rc_highly_buffered)
            endtime += (rack->r_ctl.rc_gp_srtt * rack_max_drain_hbp);
        /* Calculate the min we must wait */
        must_stay = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_must_drain);
        if ((ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.rc_target_probertt_flight) &&
            TSTMP_LT(us_cts, endtime)) {
            uint32_t calc;
            /* Do we lower more? */
no_exit:
            if (TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered))
                calc = us_cts - rack->r_ctl.rc_time_probertt_entered;
            else
                calc = 0;
            calc /= max(rack->r_ctl.rc_gp_srtt, 1);
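            /*
             * calc now holds the number of whole gp_srtt's we
             * have spent in probe-rtt; each one shaves the pacing
             * percentage down further below.
             */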
            if (calc) {
                /* Maybe */
                calc *= rack_per_of_gp_probertt_reduce;
                if (calc > rack_per_of_gp_probertt)
                    rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh;
                else
                    rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc;
                /* Limit it too */
                if (rack->r_ctl.rack_per_of_gp_probertt < rack_per_of_gp_lowthresh)
                    rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh;
            }
            /* We must reach target or the time set */
            return;
        }
        if (rack->r_ctl.rc_time_probertt_starts == 0) {
            if ((TSTMP_LT(us_cts, must_stay) &&
                rack->rc_highly_buffered) ||
                (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) >
                rack->r_ctl.rc_target_probertt_flight)) {
                /* We are not past the must_stay time */
                goto no_exit;
            }
            rack_log_rtt_shrinks(rack, us_cts,
                get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
                __LINE__, RACK_RTTS_REACHTARGET);
            rack->r_ctl.rc_time_probertt_starts = us_cts;
            if (rack->r_ctl.rc_time_probertt_starts == 0)
                rack->r_ctl.rc_time_probertt_starts = 1;
            /* Restore back to our rate we want to pace at in prtt */
            rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt;
        }
        /*
         * Setup our end time, some number of gp_srtts plus 200ms.
         */
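        /*
         * Illustrative numbers (not from the source; the sysctl
         * defaults may differ): with rc_gp_srtt = 40000 usec,
         * cnt_mul = 3 and cnt_div = 2 the hold is 60000 usec plus
         * rack_min_probertt_hold beyond rc_time_probertt_starts.
         */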
4338 no_overflow = ((uint64_t)rack->r_ctl.rc_gp_srtt *
4339 (uint64_t)rack_probertt_gpsrtt_cnt_mul);
4340 if (rack_probertt_gpsrtt_cnt_div)
4341 endtime = (uint32_t)(no_overflow / (uint64_t)rack_probertt_gpsrtt_cnt_div);
4342 else
4343 endtime = 0;
4344 endtime += rack_min_probertt_hold;
4345 endtime += rack->r_ctl.rc_time_probertt_starts;
4346 if (TSTMP_GEQ(us_cts, endtime)) {
4347 /* yes, exit probertt */
4348 rack_exit_probertt(rack, us_cts);
4349 }
4350
4351 } else if ((rack->rc_skip_timely == 0) &&
4352 (TSTMP_GT(us_cts, rack->r_ctl.rc_lower_rtt_us_cts)) &&
4353 ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt)) {
4354 /* Go into probertt, it's been too long since we went lower */
4355 rack_enter_probertt(rack, us_cts);
4356 }
4357 }
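/*
 * Editor's note -- a worked example of the exit-time math above, using
 * made-up tunable values (not necessarily the defaults): if
 * rack_probertt_gpsrtt_cnt_mul/_div were 5/4, rack_min_probertt_hold
 * were 200000 (200ms), and rc_gp_srtt were 40000us, the exit deadline
 * would be (40000 * 5) / 4 + 200000 = 250000us past
 * rc_time_probertt_starts.
 */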
4358
4359 static void
4360 rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last_bw_est,
4361 uint32_t rtt, int32_t rtt_diff)
4362 {
4363 uint64_t cur_bw, up_bnd, low_bnd, subfr;
4364 uint32_t losses;
4365
4366 if ((rack->rc_gp_dyn_mul == 0) ||
4367 (rack->use_fixed_rate) ||
4368 (rack->in_probe_rtt) ||
4369 (rack->rc_always_pace == 0)) {
4370 /* No dynamic GP multiplier in play */
4371 return;
4372 }
4373 losses = rack->r_ctl.rc_loss_count - rack->r_ctl.rc_loss_at_start;
4374 cur_bw = rack_get_bw(rack);
4375 /* Calculate our up and down range */
4376 up_bnd = rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_up;
4377 up_bnd /= 100;
4378 up_bnd += rack->r_ctl.last_gp_comp_bw;
4379
4380 subfr = (uint64_t)rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_down;
4381 subfr /= 100;
4382 low_bnd = rack->r_ctl.last_gp_comp_bw - subfr;
4383 if ((timely_says == 2) && (rack->r_ctl.rc_no_push_at_mrtt)) {
4384 /*
4385 * This is the case where our RTT is above
4386 * the max target and we have been configured
4387 * to just do timely (no bonus up) in that case.
4388 *
4389 * There are two configurations, set to 1, and we
4390 * just do timely if we are over our max. If it's
4391 * set above 1 then we slam the multipliers down
4392 * to 100 and then decrement per timely.
4393 */
4394 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd,
4395 __LINE__, 3);
4396 if (rack->r_ctl.rc_no_push_at_mrtt > 1)
4397 rack_validate_multipliers_at_or_below_100(rack);
4398 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff);
4399 } else if ((timely_says != 0) && (last_bw_est < low_bnd) && !losses) {
4400 /*
4401 * We are decreasing. This is a bit complicated: it
4402 * means we are losing ground. This could be
4403 * because another flow entered and we are competing
4404 * for b/w with it. This will push the RTT up which
4405 * makes timely unusable unless we want to get shoved
4406 * into a corner and just be backed off (the age
4407 * old problem with delay based CC).
4408 *
4409 * On the other hand if it was a route change we
4410 * would like to stay somewhat contained and not
4411 * blow out the buffers.
4412 */
4413 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd,
4414 __LINE__, 3);
4415 rack->r_ctl.last_gp_comp_bw = cur_bw;
4416 if (rack->rc_gp_bwred == 0) {
4417 /* Go into reduction counting */
4418 rack->rc_gp_bwred = 1;
4419 rack->rc_gp_timely_dec_cnt = 0;
4420 }
4421 if (rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) {
4422 /*
4423 * Push another time with a faster pacing
4424 * to try to gain back (we include override to
4425 * get a full raise factor).
4426 */
4427 if ((rack->rc_gp_saw_ca && rack->r_ctl.rack_per_of_gp_ca <= rack_down_raise_thresh) ||
4428 (rack->rc_gp_saw_ss && rack->r_ctl.rack_per_of_gp_ss <= rack_down_raise_thresh) ||
4429 (timely_says == 0) ||
4430 (rack_down_raise_thresh == 0)) {
4431 /*
4432 * Do an override up in b/w if we were
4433 * below the threshold or if the threshold
4434 * is zero we always do the raise.
4435 */
4436 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 1);
4437 } else {
4438 /* Log it stays the same */
4439 rack_log_timely(rack, 0, last_bw_est, low_bnd, 0,
4440 __LINE__, 11);
4441 }
4442 rack->rc_gp_timely_dec_cnt++;
4443 /* We are not really incrementing, so don't count it */
4444 rack->rc_gp_incr = 0;
4445 rack->rc_gp_timely_inc_cnt = 0;
4446 } else {
4447 /*
4448 * Let's just use the RTT
4449 * information and give up
4450 * pushing.
4451 */
4452 goto use_timely;
4453 }
4454 } else if ((timely_says != 2) &&
4455 !losses &&
4456 (last_bw_est > up_bnd)) {
4457 /*
4458 * We are increasing b/w, so let's keep going, updating
4459 * our b/w and ignoring any timely input, unless
4460 * of course we are at our max raise (if there is one).
4461 */
4462
4463 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd,
4464 __LINE__, 3);
4465 rack->r_ctl.last_gp_comp_bw = cur_bw;
4466 if (rack->rc_gp_saw_ss &&
4467 rack->r_ctl.rack_per_upper_bound_ss &&
4468 (rack->r_ctl.rack_per_of_gp_ss == rack->r_ctl.rack_per_upper_bound_ss)) {
4469 /*
4470 * In cases where we can't go higher
4471 * we should just use timely.
4472 */
4473 goto use_timely;
4474 }
4475 if (rack->rc_gp_saw_ca &&
4476 rack->r_ctl.rack_per_upper_bound_ca &&
4477 (rack->r_ctl.rack_per_of_gp_ca == rack->r_ctl.rack_per_upper_bound_ca)) {
4478 /*
4479 * In cases where we can't go higher
4480 * we should just use timely.
4481 */
4482 goto use_timely;
4483 }
4484 rack->rc_gp_bwred = 0;
4485 rack->rc_gp_timely_dec_cnt = 0;
4486 /* You get a set number of pushes if timely is trying to reduce */
4487 if ((rack->rc_gp_incr < rack_timely_max_push_rise) || (timely_says == 0)) {
4488 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
4489 } else {
4490 /* Log it stays the same */
4491 rack_log_timely(rack, 0, last_bw_est, up_bnd, 0,
4492 __LINE__, 12);
4493 }
4494 return;
4495 } else {
4496 /*
4497 * We are staying between the lower and upper range bounds
4498 * so use timely to decide.
4499 */
4500 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd,
4501 __LINE__, 3);
4502 use_timely:
4503 if (timely_says) {
4504 rack->rc_gp_incr = 0;
4505 rack->rc_gp_timely_inc_cnt = 0;
4506 if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) &&
4507 !losses &&
4508 (last_bw_est < low_bnd)) {
4509 /* We are losing ground */
4510 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
4511 rack->rc_gp_timely_dec_cnt++;
4512 /* We are not really incrementing, so don't count it */
4513 rack->rc_gp_incr = 0;
4514 rack->rc_gp_timely_inc_cnt = 0;
4515 } else
4516 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff);
4517 } else {
4518 rack->rc_gp_bwred = 0;
4519 rack->rc_gp_timely_dec_cnt = 0;
4520 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
4521 }
4522 }
4523 }
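/*
 * Editor's sketch of the up/low band calculation above, pulled out as a
 * standalone helper. Illustrative only (guarded out; the function name
 * is hypothetical): with last_bw = 10,000,000 B/s, up_pct = 8 and
 * down_pct = 4, a new estimate must exceed 10,800,000 B/s to count as a
 * rise, or drop below 9,600,000 B/s to count as losing ground.
 */
#if 0
static void
example_timely_bands(uint64_t last_bw, uint64_t up_pct, uint64_t down_pct,
    uint64_t *up_bnd, uint64_t *low_bnd)
{
	/* Upper bound: last b/w plus up_pct percent of it. */
	*up_bnd = last_bw + ((last_bw * up_pct) / 100);
	/* Lower bound: last b/w minus down_pct percent of it. */
	*low_bnd = last_bw - ((last_bw * down_pct) / 100);
}
#endif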
4524
4525 static int32_t
4526 rack_make_timely_judgement(struct tcp_rack *rack, uint32_t rtt, int32_t rtt_diff, uint32_t prev_rtt)
4527 {
4528 int32_t timely_says;
4529 uint64_t log_mult, log_rtt_a_diff;
4530
4531 log_rtt_a_diff = rtt;
4532 log_rtt_a_diff <<= 32;
4533 log_rtt_a_diff |= (uint32_t)rtt_diff;
4534 if (rtt >= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) *
4535 rack_gp_rtt_maxmul)) {
4536 /* Reduce the b/w multiplier */
4537 timely_says = 2;
4538 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul;
4539 log_mult <<= 32;
4540 log_mult |= prev_rtt;
4541 rack_log_timely(rack, timely_says, log_mult,
4542 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
4543 log_rtt_a_diff, __LINE__, 4);
4544 } else if (rtt <= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) +
4545 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) /
4546 max(rack_gp_rtt_mindiv , 1)))) {
4547 /* Increase the b/w multiplier */
4548 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) +
4549 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) /
4550 max(rack_gp_rtt_mindiv , 1));
4551 log_mult <<= 32;
4552 log_mult |= prev_rtt;
4553 timely_says = 0;
4554 rack_log_timely(rack, timely_says, log_mult ,
4555 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
4556 log_rtt_a_diff, __LINE__, 5);
4557 } else {
4558 /*
4559 * Use a gradient to decide; the timely gradient
4560 * is:
4561 * grad = rc_rtt_diff / min_rtt;
4562 *
4563 * anything below or equal to 0 is an
4564 * increase indication. Anything above
4565 * zero is a decrease. Note we take care
4566 * of the actual gradient calculation
4567 * in the reduction (it's not needed for
4568 * an increase).
4569 */
4570 log_mult = prev_rtt;
4571 if (rtt_diff <= 0) {
4572 /*
4573 * Rttdiff is at or below zero; increase the
4574 * b/w multiplier (it's 0 or negative)
4575 */
4576 timely_says = 0;
4577 rack_log_timely(rack, timely_says, log_mult,
4578 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 6);
4579 } else {
4580 /* Reduce the b/w multiplier */
4581 timely_says = 1;
4582 rack_log_timely(rack, timely_says, log_mult,
4583 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 7);
4584 }
4585 }
4586 return (timely_says);
4587 }
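/*
 * Editor's note -- the three-way judgement above, with illustrative
 * numbers (not necessarily the tunable defaults): given a filtered
 * min_rtt of 10000us, a rack_gp_rtt_maxmul of 3 and
 * rack_gp_rtt_minmul/mindiv of 1/4, any gp_srtt at or above 30000us
 * returns 2 (decrease), anything at or below
 * 10000 + (10000 * 1) / 4 = 12500us returns 0 (increase), and in
 * between the sign of rc_rtt_diff decides: <= 0 returns 0, > 0
 * returns 1.
 */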
4588
4589 static inline int
4590 rack_in_gp_window(struct tcpcb *tp, struct rack_sendmap *rsm)
4591 {
4592 if (SEQ_GEQ(rsm->r_start, tp->gput_seq) &&
4593 SEQ_LEQ(rsm->r_end, tp->gput_ack)) {
4594 /**
4595 * This covers the case that the
4596 * resent is completely inside
4597 * the gp range or up to it.
4598 * |----------------|
4599 * |-----| <or>
4600 * |----|
4601 * <or> |---|
4602 */
4603 return (1);
4604 } else if (SEQ_LT(rsm->r_start, tp->gput_seq) &&
4605 SEQ_GT(rsm->r_end, tp->gput_seq)){
4606 /**
4607 * This covers the case of
4608 *          |--------------|
4609 * |-------->|
4610 */
4611 return (1);
4612 } else if (SEQ_GEQ(rsm->r_start, tp->gput_seq) &&
4613 SEQ_LT(rsm->r_start, tp->gput_ack) &&
4614 SEQ_GEQ(rsm->r_end, tp->gput_ack)) {
4615
4616 /**
4617 * This covers the case of
4618 * |--------------|
4619 *            |-------->|
4620 */
4621 return (1);
4622 }
4623 return (0);
4624 }
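/*
 * Editor's example: with gput_seq = 1000 and gput_ack = 5000, an rsm
 * covering [2000, 3000) matches the first case (fully inside),
 * [500, 1500) matches the second (straddles the left edge), and
 * [4500, 5500) matches the third (straddles the right edge), while
 * [5500, 6000) matches none and returns 0.
 */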
4625
4626 static inline void
4627 rack_mark_in_gp_win(struct tcpcb *tp, struct rack_sendmap *rsm)
4628 {
4629
4630 if ((tp->t_flags & TF_GPUTINPROG) == 0)
4631 return;
4632 /*
4633 * We have a Goodput measurement in progress. Mark
4634 * the send if it's within the window. If it's not
4635 * in the window make sure it does not have the mark.
4636 */
4637 if (rack_in_gp_window(tp, rsm))
4638 rsm->r_flags |= RACK_IN_GP_WIN;
4639 else
4640 rsm->r_flags &= ~RACK_IN_GP_WIN;
4641 }
4642
4643 static inline void
4644 rack_clear_gp_marks(struct tcpcb *tp, struct tcp_rack *rack)
4645 {
4646 /* A GP measurement is ending, clear all marks on the send map */
4647 struct rack_sendmap *rsm = NULL;
4648
4649 rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq);
4650 if (rsm == NULL) {
4651 rsm = tqhash_min(rack->r_ctl.tqh);
4652 }
4653 /* Nothing left? */
4654 while ((rsm != NULL) && (SEQ_GEQ(tp->gput_ack, rsm->r_start))){
4655 rsm->r_flags &= ~RACK_IN_GP_WIN;
4656 rsm = tqhash_next(rack->r_ctl.tqh, rsm);
4657 }
4658 }
4659
4660
4661 static inline void
4662 rack_tend_gp_marks(struct tcpcb *tp, struct tcp_rack *rack)
4663 {
4664 struct rack_sendmap *rsm = NULL;
4665
4666 if (tp->snd_una == tp->snd_max) {
4667 /* Nothing outstanding yet, nothing to do here */
4668 return;
4669 }
4670 if (SEQ_GT(tp->gput_seq, tp->snd_una)) {
4671 /*
4672 * We are measuring ahead of some outstanding
4673 * data. We need to walk through up until we get
4674 * to gput_seq, marking as we go, so that no rsm is set incorrectly
4675 * with RACK_IN_GP_WIN.
4676 */
4677 rsm = tqhash_min(rack->r_ctl.tqh);
4678 while (rsm != NULL) {
4679 rack_mark_in_gp_win(tp, rsm);
4680 if (SEQ_GEQ(rsm->r_end, tp->gput_seq))
4681 break;
4682 rsm = tqhash_next(rack->r_ctl.tqh, rsm);
4683 }
4684 }
4685 if (rsm == NULL) {
4686 /*
4687 * Need to find the GP seq, if rsm is
4688 * set we stopped as we hit it.
4689 */
4690 rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq);
4691 if (rsm == NULL)
4692 return;
4693 rack_mark_in_gp_win(tp, rsm);
4694 }
4695 /*
4696 * Now we may need to mark already sent rsm, ahead of
4697 * gput_seq in the window since they may have been sent
4698 * *before* we started our measurement. The rsm, if non-null
4699 * has been marked (note if rsm would have been NULL we would have
4700 * returned in the previous block). So we go to the next, and continue
4701 * until we run out of entries or we exceed the gp_ack value.
4702 */
4703 rsm = tqhash_next(rack->r_ctl.tqh, rsm);
4704 while (rsm) {
4705 rack_mark_in_gp_win(tp, rsm);
4706 if (SEQ_GT(rsm->r_end, tp->gput_ack))
4707 break;
4708 rsm = tqhash_next(rack->r_ctl.tqh, rsm);
4709 }
4710 }
4711
4712 static void
4713 rack_log_gp_calc(struct tcp_rack *rack, uint32_t add_part, uint32_t sub_part, uint32_t srtt, uint64_t meas_bw, uint64_t utim, uint8_t meth, uint32_t line)
4714 {
4715 if (tcp_bblogging_on(rack->rc_tp)) {
4716 union tcp_log_stackspecific log;
4717 struct timeval tv;
4718
4719 memset(&log, 0, sizeof(log));
4720 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
4721 log.u_bbr.flex1 = add_part;
4722 log.u_bbr.flex2 = sub_part;
4723 log.u_bbr.flex3 = rack_wma_divisor;
4724 log.u_bbr.flex4 = srtt;
4725 log.u_bbr.flex7 = (uint16_t)line;
4726 log.u_bbr.flex8 = meth;
4727 log.u_bbr.delRate = rack->r_ctl.gp_bw;
4728 log.u_bbr.cur_del_rate = meas_bw;
4729 log.u_bbr.rttProp = utim;
4730 TCP_LOG_EVENTP(rack->rc_tp, NULL,
4731 &rack->rc_inp->inp_socket->so_rcv,
4732 &rack->rc_inp->inp_socket->so_snd,
4733 BBR_LOG_THRESH_CALC, 0,
4734 0, &log, false, &rack->r_ctl.act_rcv_time);
4735 }
4736 }
4737
4738 static void
4739 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
4740 tcp_seq th_ack, int line, uint8_t quality)
4741 {
4742 uint64_t tim, bytes_ps, stim, utim;
4743 uint32_t segsiz, bytes, reqbytes, us_cts;
4744 int32_t gput, new_rtt_diff, timely_says;
4745 uint64_t resid_bw, subpart = 0, addpart = 0, srtt;
4746 int did_add = 0;
4747
4748 us_cts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time);
4749 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
4750 if (TSTMP_GEQ(us_cts, tp->gput_ts))
4751 tim = us_cts - tp->gput_ts;
4752 else
4753 tim = 0;
4754 if (rack->r_ctl.rc_gp_cumack_ts > rack->r_ctl.rc_gp_output_ts)
4755 stim = rack->r_ctl.rc_gp_cumack_ts - rack->r_ctl.rc_gp_output_ts;
4756 else
4757 stim = 0;
4758 /*
4759 * Use the larger of the send time or ack time. This prevents us
4760 * from being influenced by ack artifacts to come up with too
4761 * high a measurement. Note that since we are spanning over many more
4762 * bytes in most of our measurements hopefully that is less likely to
4763 * occur.
4764 */
4765 if (tim > stim)
4766 utim = max(tim, 1);
4767 else
4768 utim = max(stim, 1);
4769 reqbytes = min(rc_init_window(rack), (MIN_GP_WIN * segsiz));
4770 rack_log_gpset(rack, th_ack, us_cts, rack->r_ctl.rc_gp_cumack_ts, __LINE__, 3, NULL);
4771 if ((tim == 0) && (stim == 0)) {
4772 /*
4773 * Invalid measurement time, maybe
4774 * all on one ack/one send?
4775 */
4776 bytes = 0;
4777 bytes_ps = 0;
4778 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
4779 0, 0, 0, 10, __LINE__, NULL, quality);
4780 goto skip_measurement;
4781 }
4782 if (rack->r_ctl.rc_gp_lowrtt == 0xffffffff) {
4783 /* We never made a us_rtt measurement? */
4784 bytes = 0;
4785 bytes_ps = 0;
4786 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
4787 0, 0, 0, 10, __LINE__, NULL, quality);
4788 goto skip_measurement;
4789 }
4790 /*
4791 * Calculate the maximum possible b/w this connection
4792 * could have. We base our calculation on the lowest
4793 * rtt we have seen during the measurement and the
4794 * largest rwnd the client has given us in that time. This
4795 * forms a BDP that is the maximum that we could ever
4796 * get to the client. Anything larger is not valid.
4797 *
4798 * I originally had code here that rejected measurements
4799 * where the time was less than 1/2 the latest us_rtt.
4800 * But after thinking on that I realized its wrong since
4801 * say you had a 150Mbps or even 1Gbps link, and you
4802 * were a long way away.. example I am in Europe (100ms rtt)
4803 * talking to my 1Gbps link in S.C. Now measuring say 150,000
4804 * bytes my time would be 1.2ms, and yet my rtt would say
4805 * the measurement was invalid because the time was < 50ms. The
4806 * same thing is true for 150Mb (8ms of time).
4807 *
4808 * A better way I realized is to look at what the maximum
4809 * the connection could possibly do. This is gated on
4810 * the lowest RTT we have seen and the highest rwnd.
4811 * We should in theory never exceed that, if we are
4812 * then something on the path is storing up packets
4813 * and then feeding them all at once to our endpoint
4814 * messing up our measurement.
4815 */
4816 rack->r_ctl.last_max_bw = rack->r_ctl.rc_gp_high_rwnd;
4817 rack->r_ctl.last_max_bw *= HPTS_USEC_IN_SEC;
4818 rack->r_ctl.last_max_bw /= rack->r_ctl.rc_gp_lowrtt;
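/*
 * Editor's worked example: with rc_gp_high_rwnd = 1,000,000 bytes and
 * rc_gp_lowrtt = 10,000us, last_max_bw = (1,000,000 * 1,000,000) /
 * 10,000 = 100,000,000 bytes/sec (about 800Mbps); any measurement
 * above this rwnd/rtt ceiling is treated as bogus below.
 */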
4819 if (SEQ_LT(th_ack, tp->gput_seq)) {
4820 /* No measurement can be made */
4821 bytes = 0;
4822 bytes_ps = 0;
4823 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
4824 0, 0, 0, 10, __LINE__, NULL, quality);
4825 goto skip_measurement;
4826 } else
4827 bytes = (th_ack - tp->gput_seq);
4828 bytes_ps = (uint64_t)bytes;
4829 /*
4830 * Don't measure a b/w for pacing unless we have gotten at least
4831 * an initial window's worth of data in this measurement interval.
4832 *
4833 * Small numbers of bytes get badly influenced by delayed ack and
4834 * other artifacts. Note we take the initial window or our
4835 * defined minimum GP (defaulting to 10 which hopefully is the
4836 * IW).
4837 */
4838 if (rack->rc_gp_filled == 0) {
4839 /*
4840 * The initial estimate is special. We
4841 * have blasted out an IW worth of packets
4842 * without a real valid ack timestamp result. We
4843 * then setup the app_limited_needs_set flag,
4844 * this should get the first ack in (probably 2
4845 * MSS worth) to be recorded as the timestamp.
4846 * We thus allow a smaller number of bytes i.e.
4847 * IW - 2MSS.
4848 */
4849 reqbytes -= (2 * segsiz);
4850 /* Also lets fill previous for our first measurement to be neutral */
4851 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt;
4852 }
4853 if ((bytes_ps < reqbytes) || rack->app_limited_needs_set) {
4854 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
4855 rack->r_ctl.rc_app_limited_cnt,
4856 0, 0, 10, __LINE__, NULL, quality);
4857 goto skip_measurement;
4858 }
4859 /*
4860 * We now need to calculate the Timely like status so
4861 * we can update (possibly) the b/w multipliers.
4862 */
4863 new_rtt_diff = (int32_t)rack->r_ctl.rc_gp_srtt - (int32_t)rack->r_ctl.rc_prev_gp_srtt;
4864 if (rack->rc_gp_filled == 0) {
4865 /* No previous reading */
4866 rack->r_ctl.rc_rtt_diff = new_rtt_diff;
4867 } else {
4868 if (rack->measure_saw_probe_rtt == 0) {
4869 /*
4870 * We don't want a probertt to be counted
4871 * since the diff would incorrectly go negative. We
4872 * expect to be reducing the RTT when we
4873 * pace at a slower rate.
4874 */
4875 rack->r_ctl.rc_rtt_diff -= (rack->r_ctl.rc_rtt_diff / 8);
4876 rack->r_ctl.rc_rtt_diff += (new_rtt_diff / 8);
4877 }
4878 }
4879 timely_says = rack_make_timely_judgement(rack,
4880 rack->r_ctl.rc_gp_srtt,
4881 rack->r_ctl.rc_rtt_diff,
4882 rack->r_ctl.rc_prev_gp_srtt
4883 );
4884 bytes_ps *= HPTS_USEC_IN_SEC;
4885 bytes_ps /= utim;
4886 if (bytes_ps > rack->r_ctl.last_max_bw) {
4887 /*
4888 * Something on the path is playing games,
4889 * since this b/w is not possible based
4890 * on our BDP (highest rwnd and lowest rtt
4891 * we saw in the measurement window).
4892 *
4893 * Another option here would be to
4894 * instead skip the measurement.
4895 */
4896 rack_log_pacing_delay_calc(rack, bytes, reqbytes,
4897 bytes_ps, rack->r_ctl.last_max_bw, 0,
4898 11, __LINE__, NULL, quality);
4899 bytes_ps = rack->r_ctl.last_max_bw;
4900 }
4901 /* We store gp for b/w in bytes per second */
4902 if (rack->rc_gp_filled == 0) {
4903 /* Initial measurement */
4904 if (bytes_ps) {
4905 rack->r_ctl.gp_bw = bytes_ps;
4906 rack->rc_gp_filled = 1;
4907 rack->r_ctl.num_measurements = 1;
4908 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
4909 } else {
4910 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
4911 rack->r_ctl.rc_app_limited_cnt,
4912 0, 0, 10, __LINE__, NULL, quality);
4913 }
4914 if (tcp_in_hpts(rack->rc_tp) &&
4915 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
4916 /*
4917 * Ok we can't trust the pacer in this case
4918 * where we transition from un-paced to paced.
4919 * Or for that matter when the burst mitigation
4920 * was making a wild guess and got it wrong.
4921 * Stop the pacer and clear up all the aggregate
4922 * delays etc.
4923 */
4924 tcp_hpts_remove(rack->rc_tp);
4925 rack->r_ctl.rc_hpts_flags = 0;
4926 rack->r_ctl.rc_last_output_to = 0;
4927 }
4928 did_add = 2;
4929 } else if (rack->r_ctl.num_measurements < RACK_REQ_AVG) {
4930 /* Still a small number, run an average */
4931 rack->r_ctl.gp_bw += bytes_ps;
4932 addpart = rack->r_ctl.num_measurements;
4933 rack->r_ctl.num_measurements++;
4934 if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) {
4935 /* We have collected enough to move forward */
4936 rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_measurements;
4937 }
4938 rack_set_pace_segments(tp, rack, __LINE__, NULL);
4939 did_add = 3;
4940 } else {
4941 /*
4942 * We want to take 1/wma of the goodput and add in to 7/8th
4943 * of the old value weighted by the srtt. So if your measurement
4944 * period is say 2 SRTT's long you would get 1/4 as the
4945 * value, if it was like 1/2 SRTT then you would get 1/16th.
4946 *
4947 * But we must be careful not to take too much i.e. if the
4948 * srtt is say 20ms and the measurement is taken over
4949 * 400ms our weight would be 400/20 i.e. 20. On the
4950 * other hand if we get a measurement over 1ms with a
4951 * 10ms rtt we only want to take a much smaller portion.
4952 */
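/*
 * Editor's worked example of the weighting described above: with
 * srtt = 20000us and a measurement spanning utim = 40000us (2 srtts),
 * subpart = gp_bw * 40000 / (20000 * 8) = gp_bw / 4, so the sample
 * replaces 1/4 of the running average; a 10000us (1/2 srtt)
 * measurement would replace only 1/16th of it.
 */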
4953 uint8_t meth;
4954
4955 if (rack->r_ctl.num_measurements < 0xff) {
4956 rack->r_ctl.num_measurements++;
4957 }
4958 srtt = (uint64_t)tp->t_srtt;
4959 if (srtt == 0) {
4960 /*
4961 * Strange, why did t_srtt go back to zero?
4962 */
4963 if (rack->r_ctl.rc_rack_min_rtt)
4964 srtt = rack->r_ctl.rc_rack_min_rtt;
4965 else
4966 srtt = HPTS_USEC_IN_MSEC;
4967 }
4968 /*
4969 * XXXrrs: Note for reviewers, in playing with
4970 * dynamic pacing I discovered this GP calculation
4971 * as done originally leads to some undesired results.
4972 * Basically you can get longer measurements contributing
4973 * too much to the WMA. Thus I changed it so that, if you are doing
4974 * dynamic adjustments, we only do the apportioned adjustment
4975 * if we have a very small (time wise) measurement. Longer
4976 * measurements just get their weight (defaulting to 1/8)
4977 * added to the WMA. We may want to think about changing
4978 * this to always do that for both sides i.e. dynamic
4979 * and non-dynamic... but considering lots of folks
4980 * were playing with this I did not want to change the
4981 * calculation per se without your thoughts.. Lawrence?
4982 * Peter??
4983 */
4984 if (rack->rc_gp_dyn_mul == 0) {
4985 subpart = rack->r_ctl.gp_bw * utim;
4986 subpart /= (srtt * 8);
4987 if (subpart < (rack->r_ctl.gp_bw / 2)) {
4988 /*
4989 * The b/w update takes no more
4990 * away than 1/2 our running total,
4991 * so factor it in.
4992 */
4993 addpart = bytes_ps * utim;
4994 addpart /= (srtt * 8);
4995 meth = 1;
4996 } else {
4997 /*
4998 * Don't allow a single measurement
4999 * to account for more than 1/2 of the
5000 * WMA. This could happen on a retransmission
5001 * where utim becomes huge compared to
5002 * srtt (multiple retransmissions when using
5003 * the sending rate which factors in all the
5004 * transmissions from the first one).
5005 */
5006 subpart = rack->r_ctl.gp_bw / 2;
5007 addpart = bytes_ps / 2;
5008 meth = 2;
5009 }
5010 rack_log_gp_calc(rack, addpart, subpart, srtt, bytes_ps, utim, meth, __LINE__);
5011 resid_bw = rack->r_ctl.gp_bw - subpart;
5012 rack->r_ctl.gp_bw = resid_bw + addpart;
5013 did_add = 1;
5014 } else {
5015 if ((utim / srtt) <= 1) {
5016 /*
5017 * The b/w update was over a small period
5018 * of time. The idea here is to prevent a small
5019 * measurement time period from counting
5020 * too much. So we scale it based on the
5021 * time so it attributes less than 1/rack_wma_divisor
5022 * of its measurement.
5023 */
5024 subpart = rack->r_ctl.gp_bw * utim;
5025 subpart /= (srtt * rack_wma_divisor);
5026 addpart = bytes_ps * utim;
5027 addpart /= (srtt * rack_wma_divisor);
5028 meth = 3;
5029 } else {
5030 /*
5031 * The scaled measurement was long
5032 * enough so lets just add in the
5033 * portion of the measurement i.e. 1/rack_wma_divisor
5034 */
5035 subpart = rack->r_ctl.gp_bw / rack_wma_divisor;
5036 addpart = bytes_ps / rack_wma_divisor;
5037 meth = 4;
5038 }
5039 if ((rack->measure_saw_probe_rtt == 0) ||
5040 (bytes_ps > rack->r_ctl.gp_bw)) {
5041 /*
5042 * For probe-rtt we only add it in
5043 * if its larger, all others we just
5044 * add in.
5045 */
5046 did_add = 1;
5047 rack_log_gp_calc(rack, addpart, subpart, srtt, bytes_ps, utim, meth, __LINE__);
5048 resid_bw = rack->r_ctl.gp_bw - subpart;
5049 rack->r_ctl.gp_bw = resid_bw + addpart;
5050 }
5051 }
5052 rack_set_pace_segments(tp, rack, __LINE__, NULL);
5053 }
5054 /*
5055 * We only watch the growth of the GP during the initial startup
5056 * or first-slowstart that ensues. If we ever needed to watch
5057 * growth of gp outside of that period all we need to do is
5058 * remove the first clause of this if (rc_initial_ss_comp).
5059 */
5060 if ((rack->rc_initial_ss_comp == 0) &&
5061 (rack->r_ctl.num_measurements >= RACK_REQ_AVG)) {
5062 uint64_t gp_est;
5063
5064 gp_est = bytes_ps;
5065 if (tcp_bblogging_on(rack->rc_tp)) {
5066 union tcp_log_stackspecific log;
5067 struct timeval tv;
5068
5069 memset(&log, 0, sizeof(log));
5070 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
5071 log.u_bbr.flex1 = rack->r_ctl.current_round;
5072 log.u_bbr.flex2 = rack->r_ctl.last_rnd_of_gp_rise;
5073 log.u_bbr.delRate = gp_est;
5074 log.u_bbr.cur_del_rate = rack->r_ctl.last_gpest;
5075 log.u_bbr.flex8 = 41;
5076 (void)tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
5077 0, &log, false, NULL, __func__, __LINE__,&tv);
5078 }
5079 if ((rack->r_ctl.num_measurements == RACK_REQ_AVG) ||
5080 (rack->r_ctl.last_gpest == 0)) {
5081 /*
5082 * The round we get our measurement averaging going
5083 * is the base round so it always is the source point
5084 * for when we had our first increment. From there on
5085 * we only record the round that had a rise.
5086 */
5087 rack->r_ctl.last_rnd_of_gp_rise = rack->r_ctl.current_round;
5088 rack->r_ctl.last_gpest = rack->r_ctl.gp_bw;
5089 } else if (gp_est >= rack->r_ctl.last_gpest) {
5090 /*
5091 * Test to see if it's gone up enough
5092 * to set the round count up to now. Note that
5093 * the seeding of the 4th measurement is handled by the branch above.
5094 */
5095 gp_est *= 1000;
5096 gp_est /= rack->r_ctl.last_gpest;
5097 if ((uint32_t)gp_est > rack->r_ctl.gp_gain_req) {
5098 /*
5099 * We went up enough to record the round.
5100 */
5101 if (tcp_bblogging_on(rack->rc_tp)) {
5102 union tcp_log_stackspecific log;
5103 struct timeval tv;
5104
5105 memset(&log, 0, sizeof(log));
5106 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
5107 log.u_bbr.flex1 = rack->r_ctl.current_round;
5108 log.u_bbr.flex2 = (uint32_t)gp_est;
5109 log.u_bbr.flex3 = rack->r_ctl.gp_gain_req;
5110 log.u_bbr.delRate = gp_est;
5111 log.u_bbr.cur_del_rate = rack->r_ctl.last_gpest;
5112 log.u_bbr.flex8 = 42;
5113 (void)tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
5114 0, &log, false, NULL, __func__, __LINE__,&tv);
5115 }
5116 rack->r_ctl.last_rnd_of_gp_rise = rack->r_ctl.current_round;
5117 if (rack->r_ctl.use_gp_not_last == 1)
5118 rack->r_ctl.last_gpest = rack->r_ctl.gp_bw;
5119 else
5120 rack->r_ctl.last_gpest = bytes_ps;
5121 }
5122 }
5123 }
5124 if ((rack->gp_ready == 0) &&
5125 (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) {
5126 /* We have enough measurements now */
5127 rack->gp_ready = 1;
5128 if (rack->dgp_on ||
5129 rack->rack_hibeta)
5130 rack_set_cc_pacing(rack);
5131 if (rack->defer_options)
5132 rack_apply_deferred_options(rack);
5133 }
5134 rack_log_pacing_delay_calc(rack, subpart, addpart, bytes_ps, stim,
5135 rack_get_bw(rack), 22, did_add, NULL, quality);
5136 /* We do not update any multipliers if we are in or have seen a probe-rtt */
5137
5138 if ((rack->measure_saw_probe_rtt == 0) &&
5139 rack->rc_gp_rtt_set) {
5140 if (rack->rc_skip_timely == 0) {
5141 rack_update_multiplier(rack, timely_says, bytes_ps,
5142 rack->r_ctl.rc_gp_srtt,
5143 rack->r_ctl.rc_rtt_diff);
5144 }
5145 }
5146 rack_log_pacing_delay_calc(rack, bytes, tim, bytes_ps, stim,
5147 rack_get_bw(rack), 3, line, NULL, quality);
5148 rack_log_pacing_delay_calc(rack,
5149 bytes, /* flex2 */
5150 tim, /* flex1 */
5151 bytes_ps, /* bw_inuse */
5152 rack->r_ctl.gp_bw, /* delRate */
5153 rack_get_lt_bw(rack), /* rttProp */
5154 20, line, NULL, 0);
5155 /* reset the gp srtt and setup the new prev */
5156 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt;
5157 /* Record the lost count for the next measurement */
5158 rack->r_ctl.rc_loss_at_start = rack->r_ctl.rc_loss_count;
5159 skip_measurement:
5160 /*
5161 * We restart our diffs based on the gpsrtt in the
5162 * measurement window.
5163 */
5164 rack->rc_gp_rtt_set = 0;
5165 rack->rc_gp_saw_rec = 0;
5166 rack->rc_gp_saw_ca = 0;
5167 rack->rc_gp_saw_ss = 0;
5168 rack->rc_dragged_bottom = 0;
5169 if (quality == RACK_QUALITY_HIGH) {
5170 /*
5171 * Gput in the stats world is in kbps where bytes_ps is
5172 * bytes per second so we do ((x * 8)/ 1000).
5173 */
5174 gput = (int32_t)((bytes_ps << 3) / (uint64_t)1000);
5175 #ifdef STATS
5176 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
5177 gput);
5178 /*
5179 * XXXLAS: This is a temporary hack, and should be
5180 * chained off VOI_TCP_GPUT when stats(9) grows an
5181 * API to deal with chained VOIs.
5182 */
5183 if (tp->t_stats_gput_prev > 0)
5184 stats_voi_update_abs_s32(tp->t_stats,
5185 VOI_TCP_GPUT_ND,
5186 ((gput - tp->t_stats_gput_prev) * 100) /
5187 tp->t_stats_gput_prev);
5188 #endif
5189 tp->t_stats_gput_prev = gput;
5190 }
5191 tp->t_flags &= ~TF_GPUTINPROG;
5192 /*
5193 * Are we app limited now, and is there space from where we
5194 * were to where we want to go?
5195 *
5196 * We don't do the other case i.e. non-applimited here since
5197 * the next send will trigger us picking up the missing data.
5198 */
5199 if (rack->r_ctl.rc_first_appl &&
5200 TCPS_HAVEESTABLISHED(tp->t_state) &&
5201 rack->r_ctl.rc_app_limited_cnt &&
5202 (SEQ_GT(rack->r_ctl.rc_first_appl->r_start, th_ack)) &&
5203 ((rack->r_ctl.rc_first_appl->r_end - th_ack) >
5204 max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) {
5205 /*
5206 * Yep there is enough outstanding to make a measurement here.
5207 */
5208 struct rack_sendmap *rsm;
5209
5210 rack->r_ctl.rc_gp_lowrtt = 0xffffffff;
5211 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd;
5212 tp->gput_ts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time);
5213 rack->app_limited_needs_set = 0;
5214 tp->gput_seq = th_ack;
5215 if (rack->in_probe_rtt)
5216 rack->measure_saw_probe_rtt = 1;
5217 else if ((rack->measure_saw_probe_rtt) &&
5218 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit)))
5219 rack->measure_saw_probe_rtt = 0;
5220 if ((rack->r_ctl.rc_first_appl->r_end - th_ack) >= rack_get_measure_window(tp, rack)) {
5221 /* There is a full window to gain info from */
5222 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
5223 } else {
5224 /* We can only measure up to the applimited point */
5225 tp->gput_ack = tp->gput_seq + (rack->r_ctl.rc_first_appl->r_end - th_ack);
5226 if ((tp->gput_ack - tp->gput_seq) < (MIN_GP_WIN * segsiz)) {
5227 /*
5228 * We don't have enough to make a measurement.
5229 */
5230 tp->t_flags &= ~TF_GPUTINPROG;
5231 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq,
5232 0, 0, 0, 6, __LINE__, NULL, quality);
5233 return;
5234 }
5235 }
5236 if (tp->t_state >= TCPS_FIN_WAIT_1) {
5237 /*
5238 * We will get no more data into the SB;
5239 * this means we need to have the data available
5240 * before we start a measurement.
5241 */
5242 if (sbavail(&tptosocket(tp)->so_snd) < (tp->gput_ack - tp->gput_seq)) {
5243 /* Nope not enough data. */
5244 return;
5245 }
5246 }
5247 tp->t_flags |= TF_GPUTINPROG;
5248 /*
5249 * Now we need to find the timestamp of the send at tp->gput_seq
5250 * for the send based measurement.
5251 */
5252 rack->r_ctl.rc_gp_cumack_ts = 0;
5253 rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq);
5254 if (rsm) {
5255 /* Ok send-based limit is set */
5256 if (SEQ_LT(rsm->r_start, tp->gput_seq)) {
5257 /*
5258 * Move back to include the earlier part
5259 * so our ack time lines up right (this may
5260 * make an overlapping measurement but that's
5261 * ok).
5262 */
5263 tp->gput_seq = rsm->r_start;
5264 }
5265 if (rsm->r_flags & RACK_ACKED) {
5266 struct rack_sendmap *nrsm;
5267
5268 tp->gput_ts = (uint32_t)rsm->r_ack_arrival;
5269 tp->gput_seq = rsm->r_end;
5270 nrsm = tqhash_next(rack->r_ctl.tqh, rsm);
5271 if (nrsm)
5272 rsm = nrsm;
5273 else {
5274 rack->app_limited_needs_set = 1;
5275 }
5276 } else
5277 rack->app_limited_needs_set = 1;
5278 /* We always go from the first send */
5279 rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[0];
5280 } else {
5281 /*
5282 * If we don't find the rsm due to some
5283 * send-limit, set the current time, which
5284 * basically disables the send-limit.
5285 */
5286 struct timeval tv;
5287
5288 microuptime(&tv);
5289 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv);
5290 }
5291 rack_tend_gp_marks(tp, rack);
5292 rack_log_pacing_delay_calc(rack,
5293 tp->gput_seq,
5294 tp->gput_ack,
5295 (uintptr_t)rsm,
5296 tp->gput_ts,
5297 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts),
5298 9,
5299 __LINE__, rsm, quality);
5300 rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL);
5301 } else {
5302 /*
5303 * To make sure proper timestamp merging occurs, we need to clear
5304 * all GP marks if we don't start a measurement.
5305 */
5306 rack_clear_gp_marks(tp, rack);
5307 }
5308 }
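/*
 * Editor's sketch of the dynamic-path WMA update performed above,
 * pulled out as a standalone function. Illustrative only (guarded out;
 * the name and the divisor parameter are hypothetical stand-ins for
 * rack_wma_divisor).
 */
#if 0
static uint64_t
example_gp_wma_update(uint64_t gp_bw, uint64_t sample_bw, uint64_t utim,
    uint64_t srtt, uint64_t divisor)
{
	uint64_t subpart, addpart;

	if ((utim / srtt) <= 1) {
		/* Short measurement: scale its weight by utim/srtt. */
		subpart = (gp_bw * utim) / (srtt * divisor);
		addpart = (sample_bw * utim) / (srtt * divisor);
	} else {
		/* Long enough: swap a plain 1/divisor portion. */
		subpart = gp_bw / divisor;
		addpart = sample_bw / divisor;
	}
	/* Remove the old portion and fold in the new sample's portion. */
	return (gp_bw - subpart + addpart);
}
#endif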
5309
5310 /*
5311 * CC wrapper hook functions
5312 */
5313 static void
5314 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint16_t nsegs,
5315 uint16_t type, int32_t post_recovery)
5316 {
5317 uint32_t prior_cwnd, acked;
5318 struct tcp_log_buffer *lgb = NULL;
5319 uint8_t labc_to_use, quality;
5320
5321 INP_WLOCK_ASSERT(tptoinpcb(tp));
5322 tp->t_ccv.nsegs = nsegs;
5323 acked = tp->t_ccv.bytes_this_ack = (th_ack - tp->snd_una);
5324 if ((post_recovery) && (rack->r_ctl.rc_early_recovery_segs)) {
5325 uint32_t max;
5326
5327 max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp);
5328 if (tp->t_ccv.bytes_this_ack > max) {
5329 tp->t_ccv.bytes_this_ack = max;
5330 }
5331 }
5332 #ifdef STATS
5333 stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
5334 ((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd);
5335 #endif
5336 if ((th_ack == tp->snd_max) && rack->lt_bw_up) {
5337 /*
5338 * We will ack all the data, time to end any
5339 * lt_bw_up we have running until something
5340 * new is sent. Note we need to use the actual
5341 * ack_rcv_time which with pacing may be different.
5342 */
5343 uint64_t tmark;
5344
5345 rack->r_ctl.lt_bw_bytes += (tp->snd_max - rack->r_ctl.lt_seq);
5346 rack->r_ctl.lt_seq = tp->snd_max;
5347 tmark = tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time);
5348 if (tmark >= rack->r_ctl.lt_timemark) {
5349 rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark);
5350 }
5351 rack->r_ctl.lt_timemark = tmark;
5352 rack->lt_bw_up = 0;
5353 }
5354 quality = RACK_QUALITY_NONE;
5355 if ((tp->t_flags & TF_GPUTINPROG) &&
5356 rack_enough_for_measurement(tp, rack, th_ack, &quality)) {
5357 /* Measure the Goodput */
5358 rack_do_goodput_measurement(tp, rack, th_ack, __LINE__, quality);
5359 }
5360 /* Which way are we limited? If not cwnd limited, no advance in CA */
5361 if (tp->snd_cwnd <= tp->snd_wnd)
5362 tp->t_ccv.flags |= CCF_CWND_LIMITED;
5363 else
5364 tp->t_ccv.flags &= ~CCF_CWND_LIMITED;
5365 if (tp->snd_cwnd > tp->snd_ssthresh) {
5366 tp->t_bytes_acked += min(tp->t_ccv.bytes_this_ack,
5367 nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp));
5368 /* For the check that a full window has passed, use the actual scwnd we are using */
5369 if (tp->t_bytes_acked >= rack->r_ctl.cwnd_to_use) {
5370 tp->t_bytes_acked -= rack->r_ctl.cwnd_to_use;
5371 tp->t_ccv.flags |= CCF_ABC_SENTAWND;
5372 }
5373 } else {
5374 tp->t_ccv.flags &= ~CCF_ABC_SENTAWND;
5375 tp->t_bytes_acked = 0;
5376 }
5377 prior_cwnd = tp->snd_cwnd;
5378 if ((post_recovery == 0) || (rack_max_abc_post_recovery == 0) || rack->r_use_labc_for_rec ||
5379 (rack_client_low_buf && rack->client_bufferlvl &&
5380 (rack->client_bufferlvl < rack_client_low_buf)))
5381 labc_to_use = rack->rc_labc;
5382 else
5383 labc_to_use = rack_max_abc_post_recovery;
5384 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
5385 union tcp_log_stackspecific log;
5386 struct timeval tv;
5387
5388 memset(&log, 0, sizeof(log));
5389 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
5390 log.u_bbr.flex1 = th_ack;
5391 log.u_bbr.flex2 = tp->t_ccv.flags;
5392 log.u_bbr.flex3 = tp->t_ccv.bytes_this_ack;
5393 log.u_bbr.flex4 = tp->t_ccv.nsegs;
5394 log.u_bbr.flex5 = labc_to_use;
5395 log.u_bbr.flex6 = prior_cwnd;
5396 log.u_bbr.flex7 = V_tcp_do_newsack;
5397 log.u_bbr.flex8 = 1;
5398 lgb = tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
5399 0, &log, false, NULL, __func__, __LINE__,&tv);
5400 }
5401 if (CC_ALGO(tp)->ack_received != NULL) {
5402 /* XXXLAS: Find a way to live without this */
5403 tp->t_ccv.curack = th_ack;
5404 tp->t_ccv.labc = labc_to_use;
5405 tp->t_ccv.flags |= CCF_USE_LOCAL_ABC;
5406 CC_ALGO(tp)->ack_received(&tp->t_ccv, type);
5407 }
5408 if (lgb) {
5409 lgb->tlb_stackinfo.u_bbr.flex6 = tp->snd_cwnd;
5410 }
5411 if (rack->r_must_retran) {
5412 if (SEQ_GEQ(th_ack, rack->r_ctl.rc_snd_max_at_rto)) {
5413 /*
5414 * We now are beyond the rxt point so lets disable
5415 * the flag.
5416 */
5417 rack->r_ctl.rc_out_at_rto = 0;
5418 rack->r_must_retran = 0;
5419 } else if ((prior_cwnd + ctf_fixed_maxseg(tp)) <= tp->snd_cwnd) {
5420 /*
5421 * Only decrement the rc_out_at_rto if the cwnd advances
5422 * at least a whole segment. Otherwise, the next time the peer
5423 * acks we won't be able to send; this generally happens
5424 * when we are in Congestion Avoidance.
5425 */
5426 if (acked <= rack->r_ctl.rc_out_at_rto){
5427 rack->r_ctl.rc_out_at_rto -= acked;
5428 } else {
5429 rack->r_ctl.rc_out_at_rto = 0;
5430 }
5431 }
5432 }
5433 #ifdef STATS
5434 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, rack->r_ctl.cwnd_to_use);
5435 #endif
5436 if (rack->r_ctl.rc_rack_largest_cwnd < rack->r_ctl.cwnd_to_use) {
5437 rack->r_ctl.rc_rack_largest_cwnd = rack->r_ctl.cwnd_to_use;
5438 }
5439 if ((rack->rc_initial_ss_comp == 0) &&
5440 (tp->snd_cwnd >= tp->snd_ssthresh)) {
5441 /*
5442 * The cwnd has grown beyond ssthresh we have
5443 * entered ca and completed our first Slowstart.
5444 */
5445 rack->rc_initial_ss_comp = 1;
5446 }
5447 }
5448
5449 static void
5450 tcp_rack_partialack(struct tcpcb *tp)
5451 {
5452 struct tcp_rack *rack;
5453
5454 rack = (struct tcp_rack *)tp->t_fb_ptr;
5455 INP_WLOCK_ASSERT(tptoinpcb(tp));
5456 /*
5457 * If we are doing PRR and have enough
5458 * room to send <or> we are pacing and prr
5459 * is disabled we will want to see if we
5460 * can send data (by setting r_wanted_output to
5461 * true).
5462 */
5463 if ((rack->r_ctl.rc_prr_sndcnt > 0) ||
5464 rack->rack_no_prr)
5465 rack->r_wanted_output = 1;
5466 }
5467
5468 static void
5469 rack_exit_recovery(struct tcpcb *tp, struct tcp_rack *rack, int how)
5470 {
5471 /*
5472 * Now exit recovery.
5473 */
5474 EXIT_RECOVERY(tp->t_flags);
5475 }
5476
5477 static void
5478 rack_post_recovery(struct tcpcb *tp, uint32_t th_ack)
5479 {
5480 struct tcp_rack *rack;
5481 uint32_t orig_cwnd;
5482
5483 orig_cwnd = tp->snd_cwnd;
5484 INP_WLOCK_ASSERT(tptoinpcb(tp));
5485 rack = (struct tcp_rack *)tp->t_fb_ptr;
5486 /* only alert CC if we alerted when we entered */
5487 if (CC_ALGO(tp)->post_recovery != NULL) {
5488 tp->t_ccv.curack = th_ack;
5489 CC_ALGO(tp)->post_recovery(&tp->t_ccv);
5490 if (tp->snd_cwnd < tp->snd_ssthresh) {
5491 /*
5492 * Rack has burst control and pacing
5493 * so let's not set this any lower than
5494 * snd_ssthresh per RFC-6582 (option 2).
5495 */
5496 tp->snd_cwnd = tp->snd_ssthresh;
5497 }
5498 }
5499 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
5500 union tcp_log_stackspecific log;
5501 struct timeval tv;
5502
5503 memset(&log, 0, sizeof(log));
5504 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
5505 log.u_bbr.flex1 = th_ack;
5506 log.u_bbr.flex2 = tp->t_ccv.flags;
5507 log.u_bbr.flex3 = tp->t_ccv.bytes_this_ack;
5508 log.u_bbr.flex4 = tp->t_ccv.nsegs;
5509 log.u_bbr.flex5 = V_tcp_abc_l_var;
5510 log.u_bbr.flex6 = orig_cwnd;
5511 log.u_bbr.flex7 = V_tcp_do_newsack;
5512 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
5513 log.u_bbr.flex8 = 2;
5514 tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
5515 0, &log, false, NULL, __func__, __LINE__, &tv);
5516 }
5517 if ((rack->rack_no_prr == 0) &&
5518 (rack->no_prr_addback == 0) &&
5519 (rack->r_ctl.rc_prr_sndcnt > 0)) {
5520 /*
5521 * Suck the next prr cnt back into cwnd, but
5522 * only do that if we are not application limited.
5523 */
5524 if (ctf_outstanding(tp) <= sbavail(&tptosocket(tp)->so_snd)) {
5525 /*
5526 * We are allowed to add back to the cwnd the amount we did
5527 * not get out if:
5528 * a) no_prr_addback is off.
5529 * b) we are not app limited
5530 * c) we are doing prr
5531 * <and>
5532 * d) it is bounded by rack_prr_addbackmax (if addback is 0, then none).
5533 */
5534 tp->snd_cwnd += min((ctf_fixed_maxseg(tp) * rack_prr_addbackmax),
5535 rack->r_ctl.rc_prr_sndcnt);
5536 }
5537 rack->r_ctl.rc_prr_sndcnt = 0;
5538 rack_log_to_prr(rack, 1, 0, __LINE__);
5539 }
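/*
 * Editor's example with hypothetical values: given a 1448-byte maxseg,
 * rack_prr_addbackmax set to 2 and 10000 bytes of unspent prr credit,
 * the addback above grows the cwnd by min(2 * 1448, 10000) = 2896
 * bytes.
 */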
5540 rack_log_to_prr(rack, 14, orig_cwnd, __LINE__);
5541 tp->snd_recover = tp->snd_una;
5542 if (rack->r_ctl.dsack_persist) {
5543 rack->r_ctl.dsack_persist--;
5544 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) {
5545 rack->r_ctl.num_dsack = 0;
5546 }
5547 rack_log_dsack_event(rack, 1, __LINE__, 0, 0);
5548 }
5549 if (rack->rto_from_rec == 1) {
5550 rack->rto_from_rec = 0;
5551 if (rack->r_ctl.rto_ssthresh > tp->snd_ssthresh)
5552 tp->snd_ssthresh = rack->r_ctl.rto_ssthresh;
5553 }
5554 rack_exit_recovery(tp, rack, 1);
5555 }
5556
5557 static void
5558 rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack, int line)
5559 {
5560 struct tcp_rack *rack;
5561 uint32_t ssthresh_enter, cwnd_enter, in_rec_at_entry, orig_cwnd;
5562
5563 INP_WLOCK_ASSERT(tptoinpcb(tp));
5564 #ifdef STATS
5565 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type);
5566 #endif
5567 if (IN_RECOVERY(tp->t_flags) == 0) {
5568 in_rec_at_entry = 0;
5569 ssthresh_enter = tp->snd_ssthresh;
5570 cwnd_enter = tp->snd_cwnd;
5571 } else
5572 in_rec_at_entry = 1;
5573 rack = (struct tcp_rack *)tp->t_fb_ptr;
5574 switch (type) {
5575 case CC_NDUPACK:
5576 tp->t_flags &= ~TF_WASFRECOVERY;
5577 tp->t_flags &= ~TF_WASCRECOVERY;
5578 if (!IN_FASTRECOVERY(tp->t_flags)) {
5579 /* Check if this is the end of the initial Start-up i.e. initial slow-start */
5580 if (rack->rc_initial_ss_comp == 0) {
5581 /* Yep it is the end of the initial slowstart */
5582 rack->rc_initial_ss_comp = 1;
5583 }
5584 rack->r_ctl.rc_prr_delivered = 0;
5585 rack->r_ctl.rc_prr_out = 0;
5586 rack->r_fast_output = 0;
5587 if (rack->rack_no_prr == 0) {
5588 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
5589 rack_log_to_prr(rack, 2, in_rec_at_entry, line);
5590 }
5591 rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una;
5592 tp->snd_recover = tp->snd_max;
5593 if (tp->t_flags2 & TF2_ECN_PERMIT)
5594 tp->t_flags2 |= TF2_ECN_SND_CWR;
5595 }
5596 break;
5597 case CC_ECN:
5598 if (!IN_CONGRECOVERY(tp->t_flags) ||
5599 /*
5600 * Allow ECN reaction on ACK to CWR, if
5601 * that data segment was also CE marked.
5602 */
5603 SEQ_GEQ(ack, tp->snd_recover)) {
5604 EXIT_CONGRECOVERY(tp->t_flags);
5605 KMOD_TCPSTAT_INC(tcps_ecn_rcwnd);
5606 rack->r_fast_output = 0;
5607 tp->snd_recover = tp->snd_max + 1;
5608 if (tp->t_flags2 & TF2_ECN_PERMIT)
5609 tp->t_flags2 |= TF2_ECN_SND_CWR;
5610 }
5611 break;
5612 case CC_RTO:
5613 tp->t_dupacks = 0;
5614 tp->t_bytes_acked = 0;
5615 rack->r_fast_output = 0;
5616 if (IN_RECOVERY(tp->t_flags))
5617 rack_exit_recovery(tp, rack, 2);
5618 orig_cwnd = tp->snd_cwnd;
5619 rack_log_to_prr(rack, 16, orig_cwnd, line);
5620 if (CC_ALGO(tp)->cong_signal == NULL) {
5621 /* TSNH */
5622 tp->snd_ssthresh = max(2,
5623 min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 /
5624 ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp);
5625 tp->snd_cwnd = ctf_fixed_maxseg(tp);
5626 }
5627 if (tp->t_flags2 & TF2_ECN_PERMIT)
5628 tp->t_flags2 |= TF2_ECN_SND_CWR;
5629 break;
5630 case CC_RTO_ERR:
5631 KMOD_TCPSTAT_INC(tcps_sndrexmitbad);
5632 /* RTO was unnecessary, so reset everything. */
5633 tp->snd_cwnd = tp->snd_cwnd_prev;
5634 tp->snd_ssthresh = tp->snd_ssthresh_prev;
5635 tp->snd_recover = tp->snd_recover_prev;
5636 if (tp->t_flags & TF_WASFRECOVERY) {
5637 ENTER_FASTRECOVERY(tp->t_flags);
5638 tp->t_flags &= ~TF_WASFRECOVERY;
5639 }
5640 if (tp->t_flags & TF_WASCRECOVERY) {
5641 ENTER_CONGRECOVERY(tp->t_flags);
5642 tp->t_flags &= ~TF_WASCRECOVERY;
5643 }
5644 tp->snd_nxt = tp->snd_max;
5645 tp->t_badrxtwin = 0;
5646 break;
5647 }
5648 if ((CC_ALGO(tp)->cong_signal != NULL) &&
5649 (type != CC_RTO)){
5650 tp->t_ccv.curack = ack;
5651 CC_ALGO(tp)->cong_signal(&tp->t_ccv, type);
5652 }
5653 if ((in_rec_at_entry == 0) && IN_RECOVERY(tp->t_flags)) {
5654 rack_log_to_prr(rack, 15, cwnd_enter, line);
5655 rack->r_ctl.dsack_byte_cnt = 0;
5656 rack->r_ctl.retran_during_recovery = 0;
5657 rack->r_ctl.rc_cwnd_at_erec = cwnd_enter;
5658 rack->r_ctl.rc_ssthresh_at_erec = ssthresh_enter;
5659 rack->r_ent_rec_ns = 1;
5660 }
5661 }
5662
5663 static inline void
5664 rack_cc_after_idle(struct tcp_rack *rack, struct tcpcb *tp)
5665 {
5666 uint32_t i_cwnd;
5667
5668 INP_WLOCK_ASSERT(tptoinpcb(tp));
5669
5670 if (CC_ALGO(tp)->after_idle != NULL)
5671 CC_ALGO(tp)->after_idle(&tp->t_ccv);
5672
5673 if (tp->snd_cwnd == 1)
5674 i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */
5675 else
5676 i_cwnd = rc_init_window(rack);
5677
5678 /*
5679 * Being idle is no different from the initial window. If the cc
5680 * clamps it down below the initial window raise it to the initial
5681 * window.
5682 */
5683 if (tp->snd_cwnd < i_cwnd) {
5684 tp->snd_cwnd = i_cwnd;
5685 }
5686 }
5687
5688 /*
5689 * Indicate whether this ack should be delayed. We can delay the ack if
5690 * following conditions are met:
5691 * - There is no delayed ack timer in progress.
5692 * - Our last ack wasn't a 0-sized window. We never want to delay
5693 * the ack that opens up a 0-sized window.
5694 * - LRO wasn't used for this segment. We make sure by checking that the
5695 * segment size is not larger than the MSS.
5696 * - Delayed acks are enabled or this is a half-synchronized T/TCP
5697 * connection.
5698 */
5699 #define DELAY_ACK(tp, tlen) \
5700 (((tp->t_flags & TF_RXWIN0SENT) == 0) && \
5701 ((tp->t_flags & TF_DELACK) == 0) && \
5702 (tlen <= tp->t_maxseg) && \
5703 (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN)))
5704
5705 static struct rack_sendmap *
5706 rack_find_lowest_rsm(struct tcp_rack *rack)
5707 {
5708 struct rack_sendmap *rsm;
5709
5710 /*
5711 * Walk the time-order transmitted list looking for an rsm that is
5712 * not acked. This will be the one that was sent the longest time
5713 * ago that is still outstanding.
5714 */
5715 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
5716 if (rsm->r_flags & RACK_ACKED) {
5717 continue;
5718 }
5719 goto finish;
5720 }
5721 finish:
5722 return (rsm);
5723 }
5724
5725 static struct rack_sendmap *
5726 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm)
5727 {
5728 struct rack_sendmap *prsm;
5729
5730 /*
5731 * Walk the sequence-ordered list backward until we arrive at
5732 * the highest seq not acked. In theory when this is called it
5733 * should be the last segment (which it was not).
5734 */
5735 prsm = rsm;
5736
5737 TQHASH_FOREACH_REVERSE_FROM(prsm, rack->r_ctl.tqh) {
5738 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) {
5739 continue;
5740 }
5741 return (prsm);
5742 }
5743 return (NULL);
5744 }
5745
5746 static uint32_t
5747 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts, int line, int log_allowed)
5748 {
5749 int32_t lro;
5750 uint32_t thresh;
5751
5752 /*
5753 * lro is the flag we use to determine if we have seen reordering.
5754 * If it gets set we have seen reordering. The reorder logic either
5755 * works in one of two ways:
5756 *
5757 * If reorder-fade is configured, then we track the last time we saw
5758 * re-ordering occur. If we reach the point where enough time has
5759 * passed we no longer consider reordering as occurring.
5760 *
5761 * Or if reorder-fade is 0, then once we see reordering we consider
5762 * the connection to always be subject to reordering and just set lro
5763 * to 1.
5764 *
5765 * In the end if lro is non-zero we add the extra time for
5766 * reordering in.
5767 */
5768 if (srtt == 0)
5769 srtt = 1;
5770 if (rack->r_ctl.rc_reorder_ts) {
5771 if (rack->r_ctl.rc_reorder_fade) {
5772 if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) {
5773 lro = cts - rack->r_ctl.rc_reorder_ts;
5774 if (lro == 0) {
5775 /*
5776 * No time has passed since the last
5777 * reorder, mark it as reordering.
5778 */
5779 lro = 1;
5780 }
5781 } else {
5782 /* Negative time? */
5783 lro = 0;
5784 }
5785 if (lro > rack->r_ctl.rc_reorder_fade) {
5786 /* Turn off reordering seen too */
5787 rack->r_ctl.rc_reorder_ts = 0;
5788 lro = 0;
5789 }
5790 } else {
5791 /* Reordering does not fade */
5792 lro = 1;
5793 }
5794 } else {
5795 lro = 0;
5796 }
5797 if (rack->rc_rack_tmr_std_based == 0) {
5798 thresh = srtt + rack->r_ctl.rc_pkt_delay;
5799 } else {
5800 /* Standards based pkt-delay is 1/4 srtt */
5801 thresh = srtt + (srtt >> 2);
5802 }
5803 if (lro && (rack->rc_rack_tmr_std_based == 0)) {
5804 /* It must be set, if not you get 1/4 rtt */
5805 if (rack->r_ctl.rc_reorder_shift)
5806 thresh += (srtt >> rack->r_ctl.rc_reorder_shift);
5807 else
5808 thresh += (srtt >> 2);
5809 }
5810 if (rack->rc_rack_use_dsack &&
5811 lro &&
5812 (rack->r_ctl.num_dsack > 0)) {
5813 /*
5814 * We only increase the reordering window if we
5815 * have seen reordering <and> we have a DSACK count.
5816 */
5817 thresh += rack->r_ctl.num_dsack * (srtt >> 2);
5818 if (log_allowed)
5819 rack_log_dsack_event(rack, 4, line, srtt, thresh);
5820 }
5821 /* SRTT * 2 is the ceiling */
5822 if (thresh > (srtt * 2)) {
5823 thresh = srtt * 2;
5824 }
5825 /* And we don't want it above the RTO max either */
5826 if (thresh > rack_rto_max) {
5827 thresh = rack_rto_max;
5828 }
5829 if (log_allowed)
5830 rack_log_dsack_event(rack, 6, line, srtt, thresh);
5831 return (thresh);
5832 }
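/*
 * Editor's sketch of the non-standards-based threshold above as a pure
 * function. Illustrative only (guarded out; the name is hypothetical,
 * and num_dsack should be passed as 0 when DSACK scaling is off).
 * E.g. srtt = 40000us, pkt_delay = 1000us, reordering seen with an
 * unset shift (1/4 srtt) and two DSACKs gives
 * 40000 + 1000 + 10000 + 2 * 10000 = 71000us, under both the
 * 2 * srtt = 80000us ceiling and rack_rto_max.
 */
#if 0
static uint32_t
example_rack_thresh(uint32_t srtt, uint32_t pkt_delay, int reordering,
    uint32_t reorder_shift, uint32_t num_dsack, uint32_t rto_max)
{
	uint32_t thresh;

	thresh = srtt + pkt_delay;
	if (reordering) {
		/* Extra allowance: srtt >> shift, or 1/4 srtt if unset. */
		thresh += srtt >> (reorder_shift ? reorder_shift : 2);
		/* Widen further by 1/4 srtt per observed DSACK. */
		thresh += num_dsack * (srtt >> 2);
	}
	if (thresh > (srtt * 2))	/* SRTT * 2 is the ceiling */
		thresh = srtt * 2;
	if (thresh > rto_max)		/* and never above the RTO max */
		thresh = rto_max;
	return (thresh);
}
#endif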
5833
5834 static uint32_t
5835 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack,
5836 struct rack_sendmap *rsm, uint32_t srtt)
5837 {
5838 struct rack_sendmap *prsm;
5839 uint32_t thresh, len;
5840 int segsiz;
5841
5842 if (srtt == 0)
5843 srtt = 1;
5844 if (rack->r_ctl.rc_tlp_threshold)
5845 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold);
5846 else
5847 thresh = (srtt * 2);
5848
5849 /* Get the previously sent packet, if any */
5850 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
5851 len = rsm->r_end - rsm->r_start;
5852 if (rack->rack_tlp_threshold_use == TLP_USE_ID) {
5853 /* Exactly like the Internet-Draft (ID) */
5854 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= segsiz) {
5855 uint32_t alt_thresh;
5856 /*
5857 * Compensate for delayed-ack with the d-ack time.
5858 */
5859 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
5860 if (alt_thresh > thresh)
5861 thresh = alt_thresh;
5862 }
5863 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) {
5864 /* 2.1 behavior */
5865 prsm = TAILQ_PREV(rsm, rack_head, r_tnext);
5866 if (prsm && (len <= segsiz)) {
5867 /*
5868 * Two packets outstanding, thresh should be (2*srtt) +
5869 * possible inter-packet delay (if any).
5870 */
5871 uint32_t inter_gap = 0;
5872 int idx, nidx;
5873
5874 idx = rsm->r_rtr_cnt - 1;
5875 nidx = prsm->r_rtr_cnt - 1;
5876 if (rsm->r_tim_lastsent[idx] >= prsm->r_tim_lastsent[nidx]) {
5877 /* Yes it was sent later (or at the same time) */
5878 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx];
5879 }
5880 thresh += inter_gap;
5881 } else if (len <= segsiz) {
5882 /*
5883 * Possibly compensate for delayed-ack.
5884 */
5885 uint32_t alt_thresh;
5886
5887 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
5888 if (alt_thresh > thresh)
5889 thresh = alt_thresh;
5890 }
5891 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) {
5892 /* 2.2 behavior */
5893 if (len <= segsiz) {
5894 uint32_t alt_thresh;
5895 /*
5896 * Compensate for delayed-ack with the d-ack time.
5897 */
5898 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time;
5899 if (alt_thresh > thresh)
5900 thresh = alt_thresh;
5901 }
5902 }
5903 /* Not above an RTO */
5904 if (thresh > tp->t_rxtcur) {
5905 thresh = tp->t_rxtcur;
5906 }
5907 /* Not above a RTO max */
5908 if (thresh > rack_rto_max) {
5909 thresh = rack_rto_max;
5910 }
5911 /* Apply user supplied min TLP */
5912 if (thresh < rack_tlp_min) {
5913 thresh = rack_tlp_min;
5914 }
5915 return (thresh);
5916 }
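/*
 * Editor's worked example for the delayed-ack compensation above: with
 * srtt = 20000us and a rack_delayed_ack_time of 40000us (a hypothetical
 * setting), alt_thresh = 20000 + 10000 + 40000 = 70000us, which
 * overrides the base 2 * srtt = 40000us threshold when only a small
 * single segment is outstanding.
 */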
5917
5918 static uint32_t
5919 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack)
5920 {
5921 /*
5922 * We want the rack_rtt which is the
5923 * last rtt we measured. However if that
5924 * does not exist we fall back to the srtt (which
5925 * we probably will never do) and then as a last
5926 * resort we use RACK_INITIAL_RTO if no srtt is
5927 * yet set.
5928 */
5929 if (rack->rc_rack_rtt)
5930 return (rack->rc_rack_rtt);
5931 else if (tp->t_srtt == 0)
5932 return (RACK_INITIAL_RTO);
5933 return (tp->t_srtt);
5934 }
5935
5936 static struct rack_sendmap *
5937 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused)
5938 {
5939 /*
5940 * Check to see that we don't need to fall into recovery. We will
5941 * need to do so if our oldest transmit is past the time we should
5942 * have had an ack.
5943 */
5944 struct tcp_rack *rack;
5945 struct rack_sendmap *rsm;
5946 int32_t idx;
5947 uint32_t srtt, thresh;
5948
5949 rack = (struct tcp_rack *)tp->t_fb_ptr;
5950 if (tqhash_empty(rack->r_ctl.tqh)) {
5951 return (NULL);
5952 }
5953 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
5954 if (rsm == NULL)
5955 return (NULL);
5956
5957
5958 if (rsm->r_flags & RACK_ACKED) {
5959 rsm = rack_find_lowest_rsm(rack);
5960 if (rsm == NULL)
5961 return (NULL);
5962 }
5963 idx = rsm->r_rtr_cnt - 1;
5964 srtt = rack_grab_rtt(tp, rack);
5965 thresh = rack_calc_thresh_rack(rack, srtt, tsused, __LINE__, 1);
5966 if (TSTMP_LT(tsused, ((uint32_t)rsm->r_tim_lastsent[idx]))) {
5967 return (NULL);
5968 }
5969 if ((tsused - ((uint32_t)rsm->r_tim_lastsent[idx])) < thresh) {
5970 return (NULL);
5971 }
5972 /* Ok if we reach here we are over-due and this guy can be sent */
5973 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__);
5974 return (rsm);
5975 }
5976
5977 static uint32_t
5978 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack)
5979 {
5980 int32_t t;
5981 int32_t tt;
5982 uint32_t ret_val;
5983
5984 t = (tp->t_srtt + (tp->t_rttvar << 2));
5985 RACK_TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
5986 rack_persist_min, rack_persist_max, rack->r_ctl.timer_slop);
5987 rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT;
5988 ret_val = (uint32_t)tt;
5989 return (ret_val);
5990 }
5991
5992 static uint32_t
5993 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack)
5994 {
5995 /*
5996 * Start the FR timer, we do this based on getting the first one in
5997 * the rc_tmap. Note that if it's NULL we must stop the timer. In all
5998 * events we need to stop the running timer (if it's running) before
5999 * starting the new one.
6000 */
6001 uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse;
6002 uint32_t srtt_cur;
6003 int32_t idx;
6004 int32_t is_tlp_timer = 0;
6005 struct rack_sendmap *rsm;
6006
6007 if (rack->t_timers_stopped) {
6008 /* All timers have been stopped, none are to run */
6009 return (0);
6010 }
6011 if (rack->rc_in_persist) {
6012 /* We can't start any timer in persists */
6013 return (rack_get_persists_timer_val(tp, rack));
6014 }
6015 rack->rc_on_min_to = 0;
6016 if ((tp->t_state < TCPS_ESTABLISHED) ||
6017 ((tp->t_flags & TF_SACK_PERMIT) == 0)) {
6018 goto activate_rxt;
6019 }
6020 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
6021 if ((rsm == NULL) || sup_rack) {
6022 /* Nothing on the send map or no rack */
6023 activate_rxt:
6024 time_since_sent = 0;
6025 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
6026 if (rsm) {
6027 /*
6028 * Should we discount the RTX timer any?
6029 *
6030 * We want to discount it the smallest amount.
6031 * If a timer (Rack/TLP or RXT) has gone off more
6032 * recently, that's the discount we want to use (now - timer time).
6033 * If the retransmit of the oldest packet was more recent, then
6034 * we want to use that (now - oldest-packet-last_transmit_time).
6035 *
6036 */
6037 idx = rsm->r_rtr_cnt - 1;
6038 if (TSTMP_GEQ(rack->r_ctl.rc_tlp_rxt_last_time, ((uint32_t)rsm->r_tim_lastsent[idx])))
6039 tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time;
6040 else
6041 tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx];
6042 if (TSTMP_GT(cts, tstmp_touse))
6043 time_since_sent = cts - tstmp_touse;
6044 }
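/*
 * If anything is still outstanding or queued to send, arm the
 * retransmit timer for t_rxtcur, discounted by how long the oldest
 * data (or the last timer event) has already been waiting.
 */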
6045 if (SEQ_LT(tp->snd_una, tp->snd_max) ||
6046 sbavail(&tptosocket(tp)->so_snd)) {
6047 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT;
6048 to = tp->t_rxtcur;
6049 if (to > time_since_sent)
6050 to -= time_since_sent;
6051 else
6052 to = rack->r_ctl.rc_min_to;
6053 if (to == 0)
6054 to = 1;
6055 /* Special case for KEEPINIT */
6056 if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) &&
6057 (TP_KEEPINIT(tp) != 0) &&
6058 rsm) {
6059 /*
6060 * We have to put a ceiling on the rxt timer
6061 * of the keep-init timeout.
6062 */
6063 uint32_t max_time, red;
6064
6065 max_time = TICKS_2_USEC(TP_KEEPINIT(tp));
6066 if (TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) {
6067 red = (cts - (uint32_t)rsm->r_tim_lastsent[0]);
6068 if (red < max_time)
6069 max_time -= red;
6070 else
6071 max_time = 1;
6072 }
6073 /* Reduce timeout to the keep value if needed */
6074 if (max_time < to)
6075 to = max_time;
6076 }
6077 return (to);
6078 }
6079 return (0);
6080 }
6081 if (rsm->r_flags & RACK_ACKED) {
6082 rsm = rack_find_lowest_rsm(rack);
6083 if (rsm == NULL) {
6084 /* No lowest? */
6085 goto activate_rxt;
6086 }
6087 }
6088 /* Decide whether to run a RACK timer or a TLP timer */
6089 if ((rsm->r_flags & RACK_SACK_PASSED) ||
6090 (rsm->r_flags & RACK_RWND_COLLAPSED) ||
6091 (rsm->r_dupack >= DUP_ACK_THRESHOLD)) {
6092 if ((tp->t_flags & TF_SENTFIN) &&
6093 ((tp->snd_max - tp->snd_una) == 1) &&
6094 (rsm->r_flags & RACK_HAS_FIN)) {
6095 /*
6096 * We don't start a rack timer if all we have is a
6097 * FIN outstanding.
6098 */
6099 goto activate_rxt;
6100 }
6101 if ((rack->use_rack_rr == 0) &&
6102 (IN_FASTRECOVERY(tp->t_flags)) &&
6103 (rack->rack_no_prr == 0) &&
6104 (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) {
6105 /*
6106 * We are not cheating, we are in recovery and
6107 * do not yet have enough acks to get our next
6108 * retransmission out.
6109 *
6110 * Note that classified attackers do not
6111 * get to use the rack-cheat.
6112 */
6113 goto activate_tlp;
6114 }
6115 srtt = rack_grab_rtt(tp, rack);
6116 thresh = rack_calc_thresh_rack(rack, srtt, cts, __LINE__, 1);
6117 idx = rsm->r_rtr_cnt - 1;
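/*
 * The RACK timer should fire when the oldest outstanding segment's
 * last transmit time plus the reorder threshold passes; arm it for
 * the remainder, never below rc_min_to.
 */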
6118 exp = ((uint32_t)rsm->r_tim_lastsent[idx]) + thresh;
6119 if (SEQ_GEQ(exp, cts)) {
6120 to = exp - cts;
6121 if (to < rack->r_ctl.rc_min_to) {
6122 to = rack->r_ctl.rc_min_to;
6123 if (rack->r_rr_config == 3)
6124 rack->rc_on_min_to = 1;
6125 }
6126 } else {
6127 to = rack->r_ctl.rc_min_to;
6128 if (rack->r_rr_config == 3)
6129 rack->rc_on_min_to = 1;
6130 }
6131 } else {
6132 /* Ok we need to do a TLP not RACK */
6133 activate_tlp:
6134 if ((rack->rc_tlp_in_progress != 0) &&
6135 (rack->r_ctl.rc_tlp_cnt_out >= rack_tlp_limit)) {
6136 /*
6137 * The previous send was a TLP and we have sent
6138 * N TLP's without sending new data.
6139 */
6140 goto activate_rxt;
6141 }
6142 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
6143 if (rsm == NULL) {
6144 /* We found no rsm to TLP with. */
6145 goto activate_rxt;
6146 }
6147 if (rsm->r_flags & RACK_HAS_FIN) {
6148 /* If it's a FIN we don't do TLP */
6149 rsm = NULL;
6150 goto activate_rxt;
6151 }
6152 idx = rsm->r_rtr_cnt - 1;
6153 time_since_sent = 0;
6154 if (TSTMP_GEQ(((uint32_t)rsm->r_tim_lastsent[idx]), rack->r_ctl.rc_tlp_rxt_last_time))
6155 tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx];
6156 else
6157 tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time;
6158 if (TSTMP_GT(cts, tstmp_touse))
6159 time_since_sent = cts - tstmp_touse;
6160 is_tlp_timer = 1;
6161 if (tp->t_srtt) {
6162 if ((rack->rc_srtt_measure_made == 0) &&
6163 (tp->t_srtt == 1)) {
6164 /*
6165 * If another stack has run and set srtt to 1,
6166 * then the srtt was 0, so let's use the initial.
6167 */
6168 srtt = RACK_INITIAL_RTO;
6169 } else {
6170 srtt_cur = tp->t_srtt;
6171 srtt = srtt_cur;
6172 }
6173 } else
6174 srtt = RACK_INITIAL_RTO;
6175 /*
6176 * If the SRTT is not keeping up and the
6177 * rack RTT has spiked we want to use
6178 * the last RTT not the smoothed one.
6179 */
6180 if (rack_tlp_use_greater &&
6181 tp->t_srtt &&
6182 (srtt < rack_grab_rtt(tp, rack))) {
6183 srtt = rack_grab_rtt(tp, rack);
6184 }
6185 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt);
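/*
 * The TLP should fire once the probe threshold has elapsed since
 * the last transmit; arm the timer for whatever part of that
 * threshold is still remaining.
 */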
6186 if (thresh > time_since_sent) {
6187 to = thresh - time_since_sent;
6188 } else {
6189 to = rack->r_ctl.rc_min_to;
6190 rack_log_alt_to_to_cancel(rack,
6191 thresh, /* flex1 */
6192 time_since_sent, /* flex2 */
6193 tstmp_touse, /* flex3 */
6194 rack->r_ctl.rc_tlp_rxt_last_time, /* flex4 */
6195 (uint32_t)rsm->r_tim_lastsent[idx],
6196 srtt,
6197 idx, 99);
6198 }
6199 if (to < rack_tlp_min) {
6200 to = rack_tlp_min;
6201 }
6202 if (to > TICKS_2_USEC(tcp_rexmit_max)) {
6203 /*
6204 * If the TLP time works out to be larger than the max
6205 * RTO, let's not do TLP, just RTO.
6206 */
6207 goto activate_rxt;
6208 }
6209 }
6210 if (is_tlp_timer == 0) {
6211 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK;
6212 } else {
6213 rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP;
6214 }
6215 if (to == 0)
6216 to = 1;
6217 return (to);
6218 }
6219
6220 static void
6221 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, tcp_seq snd_una)
6222 {
6223 if (rack->rc_in_persist == 0) {
6224 if (tp->t_flags & TF_GPUTINPROG) {
6225 /*
6226 * Stop the goodput now, the calling of the
6227 * measurement function clears the flag.
6228 */
6229 rack_do_goodput_measurement(tp, rack, tp->snd_una, __LINE__,
6230 RACK_QUALITY_PERSIST);
6231 }
6232 #ifdef NETFLIX_SHARED_CWND
6233 if (rack->r_ctl.rc_scw) {
6234 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
6235 rack->rack_scwnd_is_idle = 1;
6236 }
6237 #endif
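/*
 * Record when we went idle; zero is used elsewhere to mean
 * "not idle", so bump a zero timestamp to 1.
 */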
6238 rack->r_ctl.rc_went_idle_time = cts;
6239 if (rack->r_ctl.rc_went_idle_time == 0)
6240 rack->r_ctl.rc_went_idle_time = 1;
6241 if (rack->lt_bw_up) {
6242 /* Suspend our LT BW measurement */
6243 uint64_t tmark;
6244
6245 rack->r_ctl.lt_bw_bytes += (snd_una - rack->r_ctl.lt_seq);
6246 rack->r_ctl.lt_seq = snd_una;
6247 tmark = tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time);
6248 if (tmark >= rack->r_ctl.lt_timemark) {
6249 rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark);
6250 }
6251 rack->r_ctl.lt_timemark = tmark;
6252 rack->lt_bw_up = 0;
6253 rack->r_persist_lt_bw_off = 1;
6254 }
6255 rack_timer_cancel(tp, rack, cts, __LINE__);
6256 rack->r_ctl.persist_lost_ends = 0;
6257 rack->probe_not_answered = 0;
6258 rack->forced_ack = 0;
6259 tp->t_rxtshift = 0;
6260 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
6261 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
6262 rack->rc_in_persist = 1;
6263 }
6264 }
6265
6266 static void
6267 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
6268 {
6269 if (tcp_in_hpts(rack->rc_tp)) {
6270 tcp_hpts_remove(rack->rc_tp);
6271 rack->r_ctl.rc_hpts_flags = 0;
6272 }
6273 #ifdef NETFLIX_SHARED_CWND
6274 if (rack->r_ctl.rc_scw) {
6275 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
6276 rack->rack_scwnd_is_idle = 0;
6277 }
6278 #endif
6279 if (rack->rc_gp_dyn_mul &&
6280 (rack->use_fixed_rate == 0) &&
6281 (rack->rc_always_pace)) {
6282 /*
6283 * Do we count this as if a probe-rtt just
6284 * finished?
6285 */
6286 uint32_t time_idle, idle_min;
6287
6288 time_idle = cts - rack->r_ctl.rc_went_idle_time;
6289 idle_min = rack_min_probertt_hold;
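/*
 * Optionally stretch the required idle time by a configured
 * fraction (mul/div) of the goodput-measured srtt.
 */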
6290 if (rack_probertt_gpsrtt_cnt_div) {
6291 uint64_t extra;
6292 extra = (uint64_t)rack->r_ctl.rc_gp_srtt *
6293 (uint64_t)rack_probertt_gpsrtt_cnt_mul;
6294 extra /= (uint64_t)rack_probertt_gpsrtt_cnt_div;
6295 idle_min += (uint32_t)extra;
6296 }
6297 if (time_idle >= idle_min) {
6298 /* Yes, we count it as a probe-rtt. */
6299 uint32_t us_cts;
6300
6301 us_cts = tcp_get_usecs(NULL);
6302 if (rack->in_probe_rtt == 0) {
6303 rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
6304 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts;
6305 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts;
6306 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts;
6307 } else {
6308 rack_exit_probertt(rack, us_cts);
6309 }
6310 }
6311 }
6312 if (rack->r_persist_lt_bw_off) {
6313 /* Continue where we left off */
6314 rack->r_ctl.lt_timemark = tcp_get_u64_usecs(NULL);
6315 rack->lt_bw_up = 1;
6316 rack->r_persist_lt_bw_off = 0;
6317 }
6318 rack->rc_in_persist = 0;
6319 rack->r_ctl.rc_went_idle_time = 0;
6320 tp->t_rxtshift = 0;
6321 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
6322 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
6323 rack->r_ctl.rc_agg_delayed = 0;
6324 rack->r_early = 0;
6325 rack->r_late = 0;
6326 rack->r_ctl.rc_agg_early = 0;
6327 }
6328
6329 static void
6330 rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts,
6331 struct hpts_diag *diag, struct timeval *tv)
6332 {
6333 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
6334 union tcp_log_stackspecific log;
6335
6336 memset(&log, 0, sizeof(log));
6337 log.u_bbr.flex1 = diag->p_nxt_slot;
6338 log.u_bbr.flex2 = diag->p_cur_slot;
6339 log.u_bbr.flex3 = diag->slot_req;
6340 log.u_bbr.flex4 = diag->inp_hptsslot;
6341 log.u_bbr.flex5 = diag->time_remaining;
6342 log.u_bbr.flex6 = diag->need_new_to;
6343 log.u_bbr.flex7 = diag->p_hpts_active;
6344 log.u_bbr.flex8 = diag->p_on_min_sleep;
6345 /* Hijack other fields as needed */
6346 log.u_bbr.epoch = diag->have_slept;
6347 log.u_bbr.lt_epoch = diag->yet_to_sleep;
6348 log.u_bbr.pkts_out = diag->co_ret;
6349 log.u_bbr.applimited = diag->hpts_sleep_time;
6350 log.u_bbr.delivered = diag->p_prev_slot;
6351 log.u_bbr.inflight = diag->p_runningslot;
6352 log.u_bbr.bw_inuse = diag->wheel_slot;
6353 log.u_bbr.rttProp = diag->wheel_cts;
6354 log.u_bbr.timeStamp = cts;
6355 log.u_bbr.delRate = diag->maxslots;
6356 TCP_LOG_EVENTP(rack->rc_tp, NULL,
6357 &rack->rc_inp->inp_socket->so_rcv,
6358 &rack->rc_inp->inp_socket->so_snd,
6359 BBR_LOG_HPTSDIAG, 0,
6360 0, &log, false, tv);
6361 }
6362
6363 }
6364
6365 static void
6366 rack_log_wakeup(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb, uint32_t len, int type)
6367 {
6368 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
6369 union tcp_log_stackspecific log;
6370 struct timeval tv;
6371
6372 memset(&log, 0, sizeof(log));
6373 log.u_bbr.flex1 = sb->sb_flags;
6374 log.u_bbr.flex2 = len;
6375 log.u_bbr.flex3 = sb->sb_state;
6376 log.u_bbr.flex8 = type;
6377 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
6378 TCP_LOG_EVENTP(rack->rc_tp, NULL,
6379 &rack->rc_inp->inp_socket->so_rcv,
6380 &rack->rc_inp->inp_socket->so_snd,
6381 TCP_LOG_SB_WAKE, 0,
6382 len, &log, false, &tv);
6383 }
6384 }
6385
6386 static void
6387 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
6388 int32_t usecs, uint32_t tot_len_this_send, int sup_rack)
6389 {
6390 struct hpts_diag diag;
6391 struct inpcb *inp = tptoinpcb(tp);
6392 struct timeval tv;
6393 uint32_t delayed_ack = 0;
6394 uint32_t hpts_timeout;
6395 uint32_t entry_usecs = usecs;
6396 uint8_t stopped;
6397 uint32_t left = 0;
6398 uint32_t us_cts;
6399
6400 if ((tp->t_state == TCPS_CLOSED) ||
6401 (tp->t_state == TCPS_LISTEN)) {
6402 return;
6403 }
6404 if (tcp_in_hpts(tp)) {
6405 /* Already on the pacer */
6406 return;
6407 }
6408 stopped = rack->rc_tmr_stopped;
6409 if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) {
6410 left = rack->r_ctl.rc_timer_exp - cts;
6411 }
6412 rack->r_ctl.rc_timer_exp = 0;
6413 rack->r_ctl.rc_hpts_flags = 0;
6414 us_cts = tcp_get_usecs(&tv);
6415 /* Now early/late accounting */
6416 rack_log_pacing_delay_calc(rack, entry_usecs, usecs, 0, 0, 0, 26, __LINE__, NULL, 0);
6417 if (rack->r_early && (rack->rc_ack_can_sendout_data == 0)) {
6418 /*
6419 * We have an early carry-over set,
6420 * we can always add more time so we
6421 * can always make this compensation.
6422 *
6423 * Note that if acks are allowed to wake us, do not
6424 * penalize the next timer for being awakened
6425 * by an ack, aka the rc_agg_early (non-paced mode).
6426 */
6427 usecs += rack->r_ctl.rc_agg_early;
6428 rack->r_early = 0;
6429 rack->r_ctl.rc_agg_early = 0;
6430 }
6431 if ((rack->r_late) &&
6432 ((rack->r_use_hpts_min == 0) || (rack->dgp_on == 0))) {
6433 /*
6434 * This is harder, we can
6435 * compensate some but it
6436 * really depends on what
6437 * the current pacing time is.
6438 */
6439 if (rack->r_ctl.rc_agg_delayed >= usecs) {
6440 /*
6441 * We can't compensate for it all.
6442 * And we have to have some time
6443 * on the clock. We always have a min
6444 * 10 HPTS timer units (10 x 10 i.e. 100 usecs).
6445 */
6446 if (usecs <= HPTS_USECS_PER_SLOT) {
6447 /* We gain delay */
6448 rack->r_ctl.rc_agg_delayed += (HPTS_USECS_PER_SLOT - usecs);
6449 usecs = HPTS_USECS_PER_SLOT;
6450 } else {
6451 /* We take off some */
6452 rack->r_ctl.rc_agg_delayed -= (usecs - HPTS_USECS_PER_SLOT);
6453 usecs = HPTS_USECS_PER_SLOT;
6454 }
6455 } else {
6456 usecs -= rack->r_ctl.rc_agg_delayed;
6457 rack->r_ctl.rc_agg_delayed = 0;
6458 /* Make sure we have 100 useconds at minimum */
6459 if (usecs < HPTS_USECS_PER_SLOT) {
6460 rack->r_ctl.rc_agg_delayed = HPTS_USECS_PER_SLOT - usecs;
6461 usecs = HPTS_USECS_PER_SLOT;
6462 }
6463 if (rack->r_ctl.rc_agg_delayed == 0)
6464 rack->r_late = 0;
6465 }
6466 } else if (rack->r_late) {
6467 /* r_use_hpts_min is on and so is DGP */
6468 uint32_t max_red;
6469
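/*
 * max_reduction is a percentage: only claw back up to that share
 * of this pacing interval to make up for being late.
 */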
6470 max_red = (usecs * rack->r_ctl.max_reduction) / 100;
6471 if (max_red >= rack->r_ctl.rc_agg_delayed) {
6472 usecs -= rack->r_ctl.rc_agg_delayed;
6473 rack->r_ctl.rc_agg_delayed = 0;
6474 } else {
6475 usecs -= max_red;
6476 rack->r_ctl.rc_agg_delayed -= max_red;
6477 }
6478 }
6479 if ((rack->r_use_hpts_min == 1) &&
6480 (usecs > 0) &&
6481 (rack->dgp_on == 1)) {
6482 /*
6483 * We are enforcing a min pacing timer
6484 * based on our hpts min timeout.
6485 */
6486 uint32_t min;
6487
6488 min = get_hpts_min_sleep_time();
6489 if (min > usecs) {
6490 usecs = min;
6491 }
6492 }
6493 hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack);
6494 if (tp->t_flags & TF_DELACK) {
6495 delayed_ack = TICKS_2_USEC(tcp_delacktime);
6496 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK;
6497 }
6498 if (delayed_ack && ((hpts_timeout == 0) ||
6499 (delayed_ack < hpts_timeout)))
6500 hpts_timeout = delayed_ack;
6501 else
6502 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
6503 /*
6504 * If no timers are going to run and we will fall off the hptsi
6505 * wheel, we resort to a keep-alive timer if it's configured.
6506 */
6507 if ((hpts_timeout == 0) &&
6508 (usecs == 0)) {
6509 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
6510 (tp->t_state <= TCPS_CLOSING)) {
6511 /*
6512 * Ok we have no timer (persists, rack, tlp, rxt or
6513 * del-ack), we don't have segments being paced. So
6514 * all that is left is the keepalive timer.
6515 */
6516 if (TCPS_HAVEESTABLISHED(tp->t_state)) {
6517 /* Get the established keep-alive time */
6518 hpts_timeout = TICKS_2_USEC(TP_KEEPIDLE(tp));
6519 } else {
6520 /*
6521 * Get the initial setup keep-alive time,
6522 * note that this is probably not going to
6523 * happen, since rack will be running a rxt timer
6524 * if a SYN of some sort is outstanding. It is
6525 * actually handled in rack_timeout_rxt().
6526 */
6527 hpts_timeout = TICKS_2_USEC(TP_KEEPINIT(tp));
6528 }
6529 rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP;
6530 if (rack->in_probe_rtt) {
6531 /*
6532 * We want to instead not wake up a long time from
6533 * now but to wake up about the time we would
6534 * exit probe-rtt and initiate a keep-alive ack.
6535 * This will get us out of probe-rtt and update
6536 * our min-rtt.
6537 */
6538 hpts_timeout = rack_min_probertt_hold;
6539 }
6540 }
6541 }
6542 if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) ==
6543 (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) {
6544 /*
6545 * RACK, TLP, persists and RXT timers all are restartable
6546 * based on actions input, i.e. we received a packet (ack
6547 * or sack) and that changes things (rwnd or snd_una, etc.).
6548 * Thus we can restart them with a new value. For
6549 * keep-alive and delayed-ack we keep track of what was left
6550 * and restart the timer with a smaller value.
6551 */
6552 if (left < hpts_timeout)
6553 hpts_timeout = left;
6554 }
6555 if (hpts_timeout) {
6556 /*
6557 * Hack alert: for now we can't time-out over 2,147,483
6558 * seconds (a bit more than 596 hours), which is probably ok
6559 * :).
6560 */
6561 if (hpts_timeout > 0x7ffffffe)
6562 hpts_timeout = 0x7ffffffe;
6563 rack->r_ctl.rc_timer_exp = cts + hpts_timeout;
6564 }
6565 rack_log_pacing_delay_calc(rack, entry_usecs, usecs, hpts_timeout, 0, 0, 27, __LINE__, NULL, 0);
6566 if ((rack->gp_ready == 0) &&
6567 (rack->use_fixed_rate == 0) &&
6568 (hpts_timeout < usecs) &&
6569 (rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) {
6570 /*
6571 * We have no good estimate yet for the
6572 * old clunky burst mitigation or the
6573 * real pacing. And the tlp or rxt is smaller
6574 * than the pacing calculation. Let's not
6575 * pace that long since we know the calculation
6576 * so far is not accurate.
6577 */
6578 usecs = hpts_timeout;
6579 }
6580 /**
6581 * Turn off all the flags for queuing by default. The
6582 * flags have important meanings to what happens when
6583 * LRO interacts with the transport. Most likely (by default now)
6584 * mbuf_queueing and ack compression are on. So the transport
6585 * has a couple of flags that control what happens (if those
6586 * are not on then these flags won't have any effect since it
6587 * won't go through the queuing LRO path).
6588 *
6589 * TF2_MBUF_QUEUE_READY - This flag says that I am busy
6590 * pacing output, so don't disturb. But
6591 * it also means LRO can wake me if there
6592 * is a SACK arrival.
6593 *
6594 * TF2_DONT_SACK_QUEUE - This flag is used in conjunction
6595 * with the above flag (QUEUE_READY) and
6596 * when present it says don't even wake me
6597 * if a SACK arrives.
6598 *
6599 * The idea behind these flags is that if we are pacing we
6600 * set the MBUF_QUEUE_READY and only get woken up if
6601 * a SACK arrives (which could change things) or if
6602 * our pacing timer expires. If, however, we have a rack
6603 * timer running, then we don't even want a sack to wake
6604 * us since the rack timer has to expire before we can send.
6605 *
6606 * Other cases should usually have none of the flags set
6607 * so LRO can call into us.
6608 */
6609 tp->t_flags2 &= ~(TF2_DONT_SACK_QUEUE|TF2_MBUF_QUEUE_READY);
6610 if (usecs) {
6611 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT;
6612 rack->r_ctl.rc_last_output_to = us_cts + usecs;
6613 /*
6614 * A pacing timer (usecs microseconds) is being set, in
6615 * such a case we cannot send (we are blocked by
6616 * the timer). So let's tell LRO that it should not
6617 * wake us unless there is a SACK. Note this only
6618 * will be effective if mbuf queueing is on or
6619 * compressed acks are being processed.
6620 */
6621 tp->t_flags2 |= TF2_MBUF_QUEUE_READY;
6622 /*
6623 * But wait, if we have a Rack timer running,
6624 * even a SACK should not disturb us (with
6625 * the exception of r_rr_config 3).
6626 */
6627 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) ||
6628 (IN_RECOVERY(tp->t_flags))) {
6629 if (rack->r_rr_config != 3)
6630 tp->t_flags2 |= TF2_DONT_SACK_QUEUE;
6631 else if (rack->rc_pace_dnd) {
6632 /*
6633 * When DND is on, we only let a sack
6634 * interrupt us if we are not in recovery.
6635 *
6636 * If DND is off, then we never hit here
6637 * and let all sacks wake us up.
6638 *
6639 */
6640 tp->t_flags2 |= TF2_DONT_SACK_QUEUE;
6641 }
6642 }
6643 if (rack->rc_ack_can_sendout_data) {
6644 /*
6645 * Ahh but wait, this is that special case
6646 * where the pacing timer can be disturbed;
6647 * back out the changes (used for non-paced
6648 * burst limiting).
6649 */
6650 tp->t_flags2 &= ~(TF2_DONT_SACK_QUEUE |
6651 TF2_MBUF_QUEUE_READY);
6652 }
6653 if ((rack->use_rack_rr) &&
6654 (rack->r_rr_config < 2) &&
6655 ((hpts_timeout) && (hpts_timeout < usecs))) {
6656 /*
6657 * Arrange for the hpts to kick back in after the
6658 * t-o if the t-o does not cause a send.
6659 */
6660 tcp_hpts_insert(tp, hpts_timeout, &diag);
6661 rack_log_hpts_diag(rack, us_cts, &diag, &tv);
6662 rack_log_to_start(rack, cts, hpts_timeout, usecs, 0);
6663 } else {
6664 tcp_hpts_insert(tp, usecs, &diag);
6665 rack_log_hpts_diag(rack, us_cts, &diag, &tv);
6666 rack_log_to_start(rack, cts, hpts_timeout, usecs, 1);
6667 }
6668 } else if (hpts_timeout) {
6669 /*
6670 * With respect to t_flags2(?) here, let's let any new acks wake
6671 * us up here. Since we are not pacing (no pacing timer), output
6672 * can happen so we should let it. If it's a Rack timer, then any inbound
6673 * packet probably won't change the sending (we will be blocked)
6674 * but it may change the prr stats, so letting it in (with the defaults
6675 * set at the start of this block) is good enough.
6676 */
6677 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
6678 tcp_hpts_insert(tp, hpts_timeout, &diag);
6679 rack_log_hpts_diag(rack, us_cts, &diag, &tv);
6680 rack_log_to_start(rack, cts, hpts_timeout, usecs, 0);
6681 } else {
6682 /* No timer starting */
6683 #ifdef INVARIANTS
6684 if (SEQ_GT(tp->snd_max, tp->snd_una)) {
6685 panic("tp:%p rack:%p tlts:%d cts:%u usecs:%u pto:%u -- no timer started?",
6686 tp, rack, tot_len_this_send, cts, usecs, hpts_timeout);
6687 }
6688 #endif
6689 }
6690 rack->rc_tmr_stopped = 0;
6691 if (usecs)
6692 rack_log_type_bbrsnd(rack, tot_len_this_send, usecs, us_cts, &tv, __LINE__);
6693 }
6694
6695 static void
6696 rack_mark_lost(struct tcpcb *tp,
6697 struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t cts)
6698 {
6699 struct rack_sendmap *nrsm;
6700 uint32_t thresh, exp;
6701
6702 thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(tp, rack), cts, __LINE__, 0);
6703 nrsm = rsm;
6704 TAILQ_FOREACH_FROM(nrsm, &rack->r_ctl.rc_tmap, r_tnext) {
6705 if ((nrsm->r_flags & RACK_SACK_PASSED) == 0) {
6706 /* Got up to all that were marked sack-passed */
6707 break;
6708 }
6709 if ((nrsm->r_flags & RACK_WAS_LOST) == 0) {
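/*
 * Declare a segment lost once the send time of the rsm we were
 * handed, plus the rack threshold, is at or before now.
 */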
6710 exp = ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]) + thresh;
6711 if (TSTMP_LT(exp, cts) || (exp == cts)) {
6712 /* We now consider it lost */
6713 nrsm->r_flags |= RACK_WAS_LOST;
6714 rack->r_ctl.rc_considered_lost += nrsm->r_end - nrsm->r_start;
6715 } else {
6716 /* Past here it won't be lost so stop */
6717 break;
6718 }
6719 }
6720 }
6721 }
6722
6723 static inline void
6724 rack_mark_nolonger_lost(struct tcp_rack *rack, struct rack_sendmap *rsm)
6725 {
6726 KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)),
6727 ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack));
6728 rsm->r_flags &= ~RACK_WAS_LOST;
6729 if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start))
6730 rack->r_ctl.rc_considered_lost -= rsm->r_end - rsm->r_start;
6731 else
6732 rack->r_ctl.rc_considered_lost = 0;
6733 }
6734
6735 /*
6736 * RACK Timer, here we simply do logging and housekeeping.
6737 * The normal rack_output() function will call the
6738 * appropriate thing to check if we need to do a RACK retransmit.
6739 * We return 1, saying don't proceed with rack_output, only
6740 * when all timers have been stopped (destroyed PCB?).
6741 */
6742 static int
6743 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
6744 {
6745 /*
6746 * This timer simply provides an internal trigger to send out data.
6747 * The check_recovery_mode call will see if there are needed
6748 * retransmissions, if so we will enter fast-recovery. The output
6749 * call may or may not do the same thing depending on sysctl
6750 * settings.
6751 */
6752 struct rack_sendmap *rsm;
6753
6754 counter_u64_add(rack_to_tot, 1);
6755 if (rack->r_state && (rack->r_state != tp->t_state))
6756 rack_set_state(tp, rack);
6757 rack->rc_on_min_to = 0;
6758 rsm = rack_check_recovery_mode(tp, cts);
6759 rack_log_to_event(rack, RACK_TO_FRM_RACK, rsm);
6760 if (rsm) {
6761 /* We need to mark any segments that are now declared as lost */
6762 rack_mark_lost(tp, rack, rsm, cts);
6763 rack->r_ctl.rc_resend = rsm;
6764 rack->r_timer_override = 1;
6765 if (rack->use_rack_rr) {
6766 /*
6767 * Don't accumulate extra pacing delay;
6768 * we are allowing the rack timer to
6769 * override pacing, i.e. rrr takes precedence
6770 * if the pacing interval is longer than the rrr
6771 * time (in other words we get the min of the pacing
6772 * time versus the rrr pacing time).
6773 */
6774 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
6775 }
6776 }
6777 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK;
6778 if (rsm == NULL) {
6779 /* restart a timer and return 1 */
6780 rack_start_hpts_timer(rack, tp, cts,
6781 0, 0, 0);
6782 return (1);
6783 }
6784 return (0);
6785 }
6786
6787
6788
6789 static void
6790 rack_adjust_orig_mlen(struct rack_sendmap *rsm)
6791 {
6792
6793 if ((M_TRAILINGROOM(rsm->m) != rsm->orig_t_space)) {
6794 /*
6795 * The trailing space changed, mbufs can grow
6796 * at the tail but they can't shrink from
6797 * it, KASSERT that. Adjust the orig_m_len to
6798 * compensate for this change.
6799 */
6800 KASSERT((rsm->orig_t_space > M_TRAILINGROOM(rsm->m)),
6801 ("mbuf:%p rsm:%p trailing_space:%jd ots:%u oml:%u mlen:%u\n",
6802 rsm->m,
6803 rsm,
6804 (intmax_t)M_TRAILINGROOM(rsm->m),
6805 rsm->orig_t_space,
6806 rsm->orig_m_len,
6807 rsm->m->m_len));
6808 rsm->orig_m_len += (rsm->orig_t_space - M_TRAILINGROOM(rsm->m));
6809 rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
6810 }
6811 if (rsm->m->m_len < rsm->orig_m_len) {
6812 /*
6813 * Mbuf shrank, trimmed off the top by an ack, our
6814 * offset changes.
6815 */
6816 KASSERT((rsm->soff >= (rsm->orig_m_len - rsm->m->m_len)),
6817 ("mbuf:%p len:%u rsm:%p oml:%u soff:%u\n",
6818 rsm->m, rsm->m->m_len,
6819 rsm, rsm->orig_m_len,
6820 rsm->soff));
6821 if (rsm->soff >= (rsm->orig_m_len - rsm->m->m_len))
6822 rsm->soff -= (rsm->orig_m_len - rsm->m->m_len);
6823 else
6824 rsm->soff = 0;
6825 rsm->orig_m_len = rsm->m->m_len;
6826 #ifdef INVARIANTS
6827 } else if (rsm->m->m_len > rsm->orig_m_len) {
6828 panic("rsm:%p m:%p m_len grew outside of t_space compensation",
6829 rsm, rsm->m);
6830 #endif
6831 }
6832 }
6833
6834 static void
6835 rack_setup_offset_for_rsm(struct tcp_rack *rack, struct rack_sendmap *src_rsm, struct rack_sendmap *rsm)
6836 {
6837 struct mbuf *m;
6838 uint32_t soff;
6839
6840 if (src_rsm->m &&
6841 ((src_rsm->orig_m_len != src_rsm->m->m_len) ||
6842 (M_TRAILINGROOM(src_rsm->m) != src_rsm->orig_t_space))) {
6843 /* Fix up the orig_m_len and possibly the mbuf offset */
6844 rack_adjust_orig_mlen(src_rsm);
6845 }
6846 m = src_rsm->m;
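/*
 * The new rsm begins where the source rsm ends, so its byte offset
 * is the source's offset plus the source's length; walk forward in
 * the mbuf chain until that offset lands inside an mbuf.
 */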
6847 soff = src_rsm->soff + (src_rsm->r_end - src_rsm->r_start);
6848 while (soff >= m->m_len) {
6849 /* Move out past this mbuf */
6850 soff -= m->m_len;
6851 m = m->m_next;
6852 KASSERT((m != NULL),
6853 ("rsm:%p nrsm:%p hit at soff:%u null m",
6854 src_rsm, rsm, soff));
6855 if (m == NULL) {
6856 /* This should *not* happen, which is why there is a kassert */
6857 src_rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd,
6858 (src_rsm->r_start - rack->rc_tp->snd_una),
6859 &src_rsm->soff);
6860 src_rsm->orig_m_len = src_rsm->m->m_len;
6861 src_rsm->orig_t_space = M_TRAILINGROOM(src_rsm->m);
6862 rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd,
6863 (rsm->r_start - rack->rc_tp->snd_una),
6864 &rsm->soff);
6865 rsm->orig_m_len = rsm->m->m_len;
6866 rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
6867 return;
6868 }
6869 }
6870 rsm->m = m;
6871 rsm->soff = soff;
6872 rsm->orig_m_len = m->m_len;
6873 rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
6874 }
6875
6876 static inline void
6877 rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm,
6878 struct rack_sendmap *rsm, uint32_t start)
6879 {
6880 int idx;
6881
6882 nrsm->r_start = start;
6883 nrsm->r_end = rsm->r_end;
6884 nrsm->r_rtr_cnt = rsm->r_rtr_cnt;
6885 nrsm->r_act_rxt_cnt = rsm->r_act_rxt_cnt;
6886 nrsm->r_flags = rsm->r_flags;
6887 nrsm->r_dupack = rsm->r_dupack;
6888 nrsm->r_no_rtt_allowed = rsm->r_no_rtt_allowed;
6889 nrsm->r_rtr_bytes = 0;
6890 nrsm->r_fas = rsm->r_fas;
6891 nrsm->r_bas = rsm->r_bas;
6892 tqhash_update_end(rack->r_ctl.tqh, rsm, nrsm->r_start);
6893 nrsm->r_just_ret = rsm->r_just_ret;
6894 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) {
6895 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx];
6896 }
6897 /* Now if we have the SYN flag we keep it on the left edge */
6898 if (nrsm->r_flags & RACK_HAS_SYN)
6899 nrsm->r_flags &= ~RACK_HAS_SYN;
6900 /* Now if we have a FIN flag we keep it on the right edge */
6901 if (rsm->r_flags & RACK_HAS_FIN)
6902 rsm->r_flags &= ~RACK_HAS_FIN;
6903 /* Push bit must go to the right edge as well */
6904 if (rsm->r_flags & RACK_HAD_PUSH)
6905 rsm->r_flags &= ~RACK_HAD_PUSH;
6906 /* Update the count if app limited */
6907 if (nrsm->r_flags & RACK_APP_LIMITED)
6908 rack->r_ctl.rc_app_limited_cnt++;
6909 /* Clone over the state of the hw_tls flag */
6910 nrsm->r_hw_tls = rsm->r_hw_tls;
6911 /*
6912 * Now we need to find nrsm's new location in the mbuf chain
6913 * we basically calculate a new offset, which is soff +
6914 * how much is left in the original rsm. Then we walk out the mbuf
6915 * chain to find the right position, it may be the same mbuf
6916 * or maybe not.
6917 */
6918 KASSERT(((rsm->m != NULL) ||
6919 (rsm->r_flags & (RACK_HAS_SYN|RACK_HAS_FIN))),
6920 ("rsm:%p nrsm:%p rack:%p -- rsm->m is NULL?", rsm, nrsm, rack));
6921 if (rsm->m)
6922 rack_setup_offset_for_rsm(rack, rsm, nrsm);
6923 }
6924
6925 static struct rack_sendmap *
6926 rack_merge_rsm(struct tcp_rack *rack,
6927 struct rack_sendmap *l_rsm,
6928 struct rack_sendmap *r_rsm)
6929 {
6930 /*
6931 * We are merging two ack'd RSM's,
6932 * the l_rsm is on the left (lower seq
6933 * values) and the r_rsm is on the right
6934 * (higher seq value). The simplest way
6935 * to merge these is to move the right
6936 * one into the left. I don't think there
6937 * is any reason we need to try to find
6938 * the oldest (or last oldest retransmitted).
6939 */
6940 rack_log_map_chg(rack->rc_tp, rack, NULL,
6941 l_rsm, r_rsm, MAP_MERGE, r_rsm->r_end, __LINE__);
6942 tqhash_update_end(rack->r_ctl.tqh, l_rsm, r_rsm->r_end);
6943 if (l_rsm->r_dupack < r_rsm->r_dupack)
6944 l_rsm->r_dupack = r_rsm->r_dupack;
6945 if (r_rsm->r_rtr_bytes)
6946 l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes;
6947 if (r_rsm->r_in_tmap) {
6948 /* This really should not happen */
6949 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext);
6950 r_rsm->r_in_tmap = 0;
6951 }
6952
6953 /* Now the flags */
6954 if (r_rsm->r_flags & RACK_HAS_FIN)
6955 l_rsm->r_flags |= RACK_HAS_FIN;
6956 if (r_rsm->r_flags & RACK_TLP)
6957 l_rsm->r_flags |= RACK_TLP;
6958 if (r_rsm->r_flags & RACK_RWND_COLLAPSED)
6959 l_rsm->r_flags |= RACK_RWND_COLLAPSED;
6960 if ((r_rsm->r_flags & RACK_APP_LIMITED) &&
6961 ((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) {
6962 /*
6963 * If both are app-limited then let the
6964 * free lower the count. If right is app
6965 * limited and left is not, transfer.
6966 */
6967 l_rsm->r_flags |= RACK_APP_LIMITED;
6968 r_rsm->r_flags &= ~RACK_APP_LIMITED;
6969 if (r_rsm == rack->r_ctl.rc_first_appl)
6970 rack->r_ctl.rc_first_appl = l_rsm;
6971 }
6972 tqhash_remove(rack->r_ctl.tqh, r_rsm, REMOVE_TYPE_MERGE);
6973 /*
6974 * We keep the largest value, which is the newest
6975 * send. We do this in case a segment that is
6976 * joined together and not part of a GP estimate
6977 * later gets expanded into the GP estimate.
6978 *
6979 * We prohibit the merging of unlike kinds, i.e.
6980 * all pieces that are in the GP estimate can be
6981 * merged and all pieces that are not in a GP estimate
6982 * can be merged, but not dissimilar pieces. Combine
6983 * this with taking the highest here and we should
6984 * be ok unless of course the client reneges. Then
6985 * all bets are off.
6986 */
6987 if(l_rsm->r_tim_lastsent[(l_rsm->r_rtr_cnt-1)] <
6988 r_rsm->r_tim_lastsent[(r_rsm->r_rtr_cnt-1)]) {
6989 l_rsm->r_tim_lastsent[(l_rsm->r_rtr_cnt-1)] = r_rsm->r_tim_lastsent[(r_rsm->r_rtr_cnt-1)];
6990 }
6991 /*
6992 * When merging two RSM's we also need to consider the ack time and keep
6993 * newest. If the ack gets merged into a measurement then that is the
6994 * one we will want to be using.
6995 */
6996 if(l_rsm->r_ack_arrival < r_rsm->r_ack_arrival)
6997 l_rsm->r_ack_arrival = r_rsm->r_ack_arrival;
6998
6999 if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) {
7000 /* Transfer the split limit to the map we free */
7001 r_rsm->r_limit_type = l_rsm->r_limit_type;
7002 l_rsm->r_limit_type = 0;
7003 }
7004 rack_free(rack, r_rsm);
7005 l_rsm->r_flags |= RACK_MERGED;
7006 return (l_rsm);
7007 }
7008
7009 /*
7010 * TLP Timer, here we simply setup what segment we want to
7011 * have the TLP expire on, the normal rack_output() will then
7012 * send it out.
7013 *
7014 * We return 1, saying don't proceed with rack_output only
7015 * when all timers have been stopped (destroyed PCB?).
7016 */
7017 static int
7018 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t *doing_tlp)
7019 {
7020 /*
7021 * Tail Loss Probe.
7022 */
7023 struct rack_sendmap *rsm = NULL;
7024 int insret __diagused;
7025 struct socket *so = tptosocket(tp);
7026 uint32_t amm;
7027 uint32_t out, avail;
7028 int collapsed_win = 0;
7029
7030 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
7031 /* It's not time yet */
7032 return (0);
7033 }
7034 if (ctf_progress_timeout_check(tp, true)) {
7035 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
7036 return (-ETIMEDOUT); /* tcp_drop() */
7037 }
7038 /*
7039 * A TLP timer has expired. We have been idle for 2 rtts. So we now
7040 * need to figure out how to force a full MSS segment out.
7041 */
7042 rack_log_to_event(rack, RACK_TO_FRM_TLP, NULL);
7043 rack->r_ctl.retran_during_recovery = 0;
7044 rack->r_might_revert = 0;
7045 rack->r_ctl.dsack_byte_cnt = 0;
7046 counter_u64_add(rack_tlp_tot, 1);
7047 if (rack->r_state && (rack->r_state != tp->t_state))
7048 rack_set_state(tp, rack);
7049 avail = sbavail(&so->so_snd);
7050 out = tp->snd_max - tp->snd_una;
7051 if ((out > tp->snd_wnd) || rack->rc_has_collapsed) {
7052 /* special case, we need a retransmission */
7053 collapsed_win = 1;
7054 goto need_retran;
7055 }
7056 if (rack->r_ctl.dsack_persist && (rack->r_ctl.rc_tlp_cnt_out >= 1)) {
7057 rack->r_ctl.dsack_persist--;
7058 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) {
7059 rack->r_ctl.num_dsack = 0;
7060 }
7061 rack_log_dsack_event(rack, 1, __LINE__, 0, 0);
7062 }
7063 if ((tp->t_flags & TF_GPUTINPROG) &&
7064 (rack->r_ctl.rc_tlp_cnt_out == 1)) {
7065 /*
7066 * If this is the second in a row
7067 * TLP and we are doing a measurement
7068 * it's time to abandon the measurement.
7069 * Something is likely broken on
7070 * the client's network and measuring a
7071 * broken network does us no good.
7072 */
7073 tp->t_flags &= ~TF_GPUTINPROG;
7074 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
7075 rack->r_ctl.rc_gp_srtt /*flex1*/,
7076 tp->gput_seq,
7077 0, 0, 18, __LINE__, NULL, 0);
7078 }
7079 /*
7080 * Check our send oldest always settings, and if
7081 * there is an oldest to send jump to the need_retran.
7082 */
7083 if (rack_always_send_oldest && (TAILQ_EMPTY(&rack->r_ctl.rc_tmap) == 0))
7084 goto need_retran;
7085
7086 if (avail > out) {
7087 /* New data is available */
7088 amm = avail - out;
7089 if (amm > ctf_fixed_maxseg(tp)) {
7090 amm = ctf_fixed_maxseg(tp);
7091 if ((amm + out) > tp->snd_wnd) {
7092 /* We are rwnd limited */
7093 goto need_retran;
7094 }
7095 } else if (amm < ctf_fixed_maxseg(tp)) {
7096 /* not enough to fill a MTU */
7097 goto need_retran;
7098 }
7099 if (IN_FASTRECOVERY(tp->t_flags)) {
7100 /* Unlikely */
7101 if (rack->rack_no_prr == 0) {
7102 if (out + amm <= tp->snd_wnd) {
7103 rack->r_ctl.rc_prr_sndcnt = amm;
7104 rack->r_ctl.rc_tlp_new_data = amm;
7105 rack_log_to_prr(rack, 4, 0, __LINE__);
7106 }
7107 } else
7108 goto need_retran;
7109 } else {
7110 /* Set the send-new override */
7111 if (out + amm <= tp->snd_wnd)
7112 rack->r_ctl.rc_tlp_new_data = amm;
7113 else
7114 goto need_retran;
7115 }
7116 rack->r_ctl.rc_tlpsend = NULL;
7117 counter_u64_add(rack_tlp_newdata, 1);
7118 goto send;
7119 }
7120 need_retran:
7121 /*
7122 * Ok we need to arrange the last un-acked segment to be re-sent, or
7123 * optionally the first un-acked segment.
7124 */
7125 if (collapsed_win == 0) {
7126 if (rack_always_send_oldest)
7127 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
7128 else {
7129 rsm = tqhash_max(rack->r_ctl.tqh);
7130 if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) {
7131 rsm = rack_find_high_nonack(rack, rsm);
7132 }
7133 }
7134 if (rsm == NULL) {
7135 #ifdef TCP_BLACKBOX
7136 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true);
7137 #endif
7138 goto out;
7139 }
7140 } else {
7141 /*
7142 * We had a collapsed window, let's find
7143 * the point before the collapse.
7144 */
7145 if (SEQ_GT((rack->r_ctl.last_collapse_point - 1), rack->rc_tp->snd_una))
7146 rsm = tqhash_find(rack->r_ctl.tqh, (rack->r_ctl.last_collapse_point - 1));
7147 else {
7148 rsm = tqhash_min(rack->r_ctl.tqh);
7149 }
7150 if (rsm == NULL) {
7151 /* Huh */
7152 goto out;
7153 }
7154 }
7155 if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) {
7156 /*
7157 * We need to split this, the last segment, in two.
7158 */
7159 struct rack_sendmap *nrsm;
7160
7161 nrsm = rack_alloc_full_limit(rack);
7162 if (nrsm == NULL) {
7163 /*
7164 * No memory to split, we will just exit and punt
7165 * off to the RXT timer.
7166 */
7167 goto out;
7168 }
7169 rack_clone_rsm(rack, nrsm, rsm,
7170 (rsm->r_end - ctf_fixed_maxseg(tp)));
7171 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__);
7172 #ifndef INVARIANTS
7173 (void)tqhash_insert(rack->r_ctl.tqh, nrsm);
7174 #else
7175 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) {
7176 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p",
7177 nrsm, insret, rack, rsm);
7178 }
7179 #endif
7180 if (rsm->r_in_tmap) {
7181 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
7182 nrsm->r_in_tmap = 1;
7183 }
7184 rsm = nrsm;
7185 }
7186 rack->r_ctl.rc_tlpsend = rsm;
7187 send:
7188 /* Make sure output path knows we are doing a TLP */
7189 *doing_tlp = 1;
7190 rack->r_timer_override = 1;
7191 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
7192 return (0);
7193 out:
7194 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP;
7195 return (0);
7196 }
7197
7198 /*
7199 * Delayed ack timer, here we simply need to set the
7200 * ACK_NOW flag and remove the DELACK flag. From there
7201 * the output routine will send the ack out.
7202 *
7203 * We only return 1, saying don't proceed, if all timers
7204 * are stopped (destroyed PCB?).
7205 */
7206 static int
7207 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
7208 {
7209
7210 rack_log_to_event(rack, RACK_TO_FRM_DELACK, NULL);
7211 tp->t_flags &= ~TF_DELACK;
7212 tp->t_flags |= TF_ACKNOW;
7213 KMOD_TCPSTAT_INC(tcps_delack);
7214 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK;
7215 return (0);
7216 }
7217
7218 static inline int
7219 rack_send_ack_challange(struct tcp_rack *rack)
7220 {
7221 struct tcptemp *t_template;
7222
7223 t_template = tcpip_maketemplate(rack->rc_inp);
7224 if (t_template) {
7225 if (rack->forced_ack == 0) {
7226 rack->forced_ack = 1;
7227 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
7228 } else {
7229 rack->probe_not_answered = 1;
7230 }
7231 tcp_respond(rack->rc_tp, t_template->tt_ipgen,
7232 &t_template->tt_t, (struct mbuf *)NULL,
7233 rack->rc_tp->rcv_nxt, rack->rc_tp->snd_una - 1, 0);
7234 free(t_template, M_TEMP);
7235 /* This does send an ack so kill any D-ack timer */
7236 if (rack->rc_tp->t_flags & TF_DELACK)
7237 rack->rc_tp->t_flags &= ~TF_DELACK;
7238 return(1);
7239 } else
7240 return (0);
7241
7242 }
7243
7244 /*
7245 * Persists timer, here we simply send the
7246 * same thing as a keepalive will:
7247 * the one-byte send.
7248 *
7249 * We only return 1, saying don't proceed, if all timers
7250 * are stopped (destroyed PCB?).
7251 */
7252 static int
7253 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
7254 {
7255 int32_t retval = 1;
7256
7257 if (rack->rc_in_persist == 0)
7258 return (0);
7259 if (ctf_progress_timeout_check(tp, false)) {
7260 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
7261 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
7262 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
7263 return (-ETIMEDOUT); /* tcp_drop() */
7264 }
7265 /*
7266 * Persistence timer into zero window. Force a byte to be output, if
7267 * possible.
7268 */
7269 KMOD_TCPSTAT_INC(tcps_persisttimeo);
7270 /*
7271 * Hack: if the peer is dead/unreachable, we do not time out if the
7272 * window is closed. After a full backoff, drop the connection if
7273 * the idle time (no responses to probes) reaches the maximum
7274 * backoff that we would use if retransmitting.
7275 */
7276 if (tp->t_rxtshift >= V_tcp_retries &&
7277 (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
7278 TICKS_2_USEC(ticks - tp->t_rcvtime) >= RACK_REXMTVAL(tp) * tcp_totbackoff)) {
7279 KMOD_TCPSTAT_INC(tcps_persistdrop);
7280 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
7281 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
7282 retval = -ETIMEDOUT; /* tcp_drop() */
7283 goto out;
7284 }
7285 if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) &&
7286 tp->snd_una == tp->snd_max)
7287 rack_exit_persist(tp, rack, cts);
7288 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT;
7289 /*
7290 * If the user has closed the socket then drop a persisting
7291 * connection after a much reduced timeout.
7292 */
7293 if (tp->t_state > TCPS_CLOSE_WAIT &&
7294 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
7295 KMOD_TCPSTAT_INC(tcps_persistdrop);
7296 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
7297 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends);
7298 retval = -ETIMEDOUT; /* tcp_drop() */
7299 goto out;
7300 }
7301 if (rack_send_ack_challange(rack)) {
7302 /* only set it if we were answered */
7303 if (rack->probe_not_answered) {
7304 counter_u64_add(rack_persists_loss, 1);
7305 rack->r_ctl.persist_lost_ends++;
7306 }
7307 counter_u64_add(rack_persists_sends, 1);
7308 counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1);
7309 }
7310 if (tp->t_rxtshift < V_tcp_retries)
7311 tp->t_rxtshift++;
7312 out:
7313 rack_log_to_event(rack, RACK_TO_FRM_PERSIST, NULL);
7314 rack_start_hpts_timer(rack, tp, cts,
7315 0, 0, 0);
7316 return (retval);
7317 }
7318
7319 /*
7320 * If a keepalive goes off, we had no other timers
7321 * happening. We always return 1 here since this
7322 * routine either drops the connection or sends
7323 * out a segment via tcp_respond().
7324 */
7325 static int
7326 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
7327 {
7328 struct inpcb *inp = tptoinpcb(tp);
7329
7330 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP;
7331 rack_log_to_event(rack, RACK_TO_FRM_KEEP, NULL);
7332 /*
7333 * Keep-alive timer went off; send something or drop connection if
7334 * idle for too long.
7335 */
7336 KMOD_TCPSTAT_INC(tcps_keeptimeo);
7337 if (tp->t_state < TCPS_ESTABLISHED)
7338 goto dropit;
7339 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
7340 tp->t_state <= TCPS_CLOSING) {
7341 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
7342 goto dropit;
7343 /*
7344 * Send a packet designed to force a response if the peer is
7345 * up and reachable: either an ACK if the connection is
7346 * still alive, or an RST if the peer has closed the
7347 * connection due to timeout or reboot. Using sequence
7348 * number tp->snd_una-1 causes the transmitted zero-length
7349 * segment to lie outside the receive window; by the
7350 * protocol spec, this requires the correspondent TCP to
7351 * respond.
7352 */
7353 KMOD_TCPSTAT_INC(tcps_keepprobe);
7354 rack_send_ack_challange(rack);
7355 }
7356 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
7357 return (1);
7358 dropit:
7359 KMOD_TCPSTAT_INC(tcps_keepdrops);
7360 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
7361 return (-ETIMEDOUT); /* tcp_drop() */
7362 }
7363
7364 /*
7365 * Retransmit helper function, clear up all the ack
7366 * flags and take care of important bookkeeping.
7367 */
7368 static void
7369 rack_remxt_tmr(struct tcpcb *tp)
7370 {
7371 /*
7372 * The retransmit timer went off, all sack'd blocks must be
7373 * un-acked.
7374 */
7375 struct rack_sendmap *rsm, *trsm = NULL;
7376 struct tcp_rack *rack;
7377
7378 rack = (struct tcp_rack *)tp->t_fb_ptr;
7379 rack_timer_cancel(tp, rack, tcp_get_usecs(NULL), __LINE__);
7380 rack_log_to_event(rack, RACK_TO_FRM_TMR, NULL);
7381 rack->r_timer_override = 1;
7382 rack->r_ctl.rc_snd_max_at_rto = tp->snd_max;
7383 rack->r_ctl.rc_last_timeout_snduna = tp->snd_una;
7384 rack->r_late = 0;
7385 rack->r_early = 0;
7386 rack->r_ctl.rc_agg_delayed = 0;
7387 rack->r_ctl.rc_agg_early = 0;
7388 if (rack->r_state && (rack->r_state != tp->t_state))
7389 rack_set_state(tp, rack);
7390 if (tp->t_rxtshift <= rack_rxt_scoreboard_clear_thresh) {
7391 /*
7392 * We do not clear the scoreboard until we have had
7393 * more than rack_rxt_scoreboard_clear_thresh time-outs.
7394 */
7395 rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
7396 if (rack->r_ctl.rc_resend != NULL)
7397 rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT;
7398
7399 return;
7400 }
7401 /*
7402 * Ideally we would like to be able to
7403 * mark SACK-PASS on anything not acked here.
7404 *
7405 * However, if we do that we would burst out
7406 * all that data 1ms apart. This would be unwise,
7407 * so for now we will just let the normal rxt timer
7408 * and tlp timer take care of it.
7409 *
7410 * Also we really need to stick them back in sequence
7411 * order. This way we send in the proper order and any
7412 * sacks that come floating in will "re-ack" the data.
7413 * To do this we zap the tmap with an INIT and then
7414 * walk through and place every rsm in the tail queue
7415 * hash table back in its seq ordered place.
7416 */
7417 TAILQ_INIT(&rack->r_ctl.rc_tmap);
7418
7419 TQHASH_FOREACH(rsm, rack->r_ctl.tqh) {
7420 rsm->r_dupack = 0;
7421 if (rack_verbose_logging)
7422 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
7423 /* We must re-add it back to the tlist */
7424 if (trsm == NULL) {
7425 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
7426 } else {
7427 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext);
7428 }
7429 rsm->r_in_tmap = 1;
7430 trsm = rsm;
7431 if (rsm->r_flags & RACK_ACKED)
7432 rsm->r_flags |= RACK_WAS_ACKED;
7433 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS | RACK_RWND_COLLAPSED | RACK_WAS_LOST);
7434 rsm->r_flags |= RACK_MUST_RXT;
7435 }
7436 /* zero the lost since it's all gone */
7437 rack->r_ctl.rc_considered_lost = 0;
7438 /* Clear the count (we just un-acked them) */
7439 rack->r_ctl.rc_sacked = 0;
7440 rack->r_ctl.rc_sacklast = NULL;
7441 /* Clear the tlp rtx mark */
7442 rack->r_ctl.rc_resend = tqhash_min(rack->r_ctl.tqh);
7443 if (rack->r_ctl.rc_resend != NULL)
7444 rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT;
7445 rack->r_ctl.rc_prr_sndcnt = 0;
7446 rack_log_to_prr(rack, 6, 0, __LINE__);
7447 rack->r_ctl.rc_resend = tqhash_min(rack->r_ctl.tqh);
7448 if (rack->r_ctl.rc_resend != NULL)
7449 rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT;
7450 if (((tp->t_flags & TF_SACK_PERMIT) == 0) &&
7451 ((tp->t_flags & TF_SENTFIN) == 0)) {
7452 /*
7453 * For non-sack customers new data
7454 * needs to go out as retransmits until
7455 * we retransmit up to snd_max.
7456 */
7457 rack->r_must_retran = 1;
7458 rack->r_ctl.rc_out_at_rto = ctf_flight_size(rack->rc_tp,
7459 rack->r_ctl.rc_sacked);
7460 }
7461 }
7462
7463 static void
7464 rack_convert_rtts(struct tcpcb *tp)
7465 {
7466 tcp_change_time_units(tp, TCP_TMR_GRANULARITY_USEC);
7467 tp->t_rxtcur = RACK_REXMTVAL(tp);
7468 if (TCPS_HAVEESTABLISHED(tp->t_state)) {
7469 tp->t_rxtcur += TICKS_2_USEC(tcp_rexmit_slop);
7470 }
7471 if (tp->t_rxtcur > rack_rto_max) {
7472 tp->t_rxtcur = rack_rto_max;
7473 }
7474 }
7475
7476 static void
7477 rack_cc_conn_init(struct tcpcb *tp)
7478 {
7479 struct tcp_rack *rack;
7480 uint32_t srtt;
7481
7482 rack = (struct tcp_rack *)tp->t_fb_ptr;
7483 srtt = tp->t_srtt;
7484 cc_conn_init(tp);
7485 /*
7486 * Now convert to rack's internal format,
7487 * if required.
7488 */
7489 if ((srtt == 0) && (tp->t_srtt != 0))
7490 rack_convert_rtts(tp);
7491 /*
7492 * We want a chance to stay in slowstart as
7493 * we create a connection. TCP spec says that
7494 * initially ssthresh is infinite. For our
7495 * purposes that is the snd_wnd.
7496 */
7497 if (tp->snd_ssthresh < tp->snd_wnd) {
7498 tp->snd_ssthresh = tp->snd_wnd;
7499 }
7500 /*
7501 * We also want to assure an IW worth of
7502 * data can get in flight.
7503 */
7504 if (rc_init_window(rack) < tp->snd_cwnd)
7505 tp->snd_cwnd = rc_init_window(rack);
7506 }
7507
7508 /*
7509 * Re-transmit timeout! If we drop the PCB we will return 1, otherwise
7510 * we will setup to retransmit the lowest seq number outstanding.
7511 */
7512 static int
7513 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts)
7514 {
7515 struct inpcb *inp = tptoinpcb(tp);
7516 int32_t rexmt;
7517 int32_t retval = 0;
7518 bool isipv6;
7519
7520 if ((tp->t_flags & TF_GPUTINPROG) &&
7521 (tp->t_rxtshift)) {
7522 /*
7523 * We have had a second timeout;
7524 * measurements on successive rxt's are not profitable.
7525 * It is unlikely to be of any use (the network is
7526 * broken or the client went away).
7527 */
7528 tp->t_flags &= ~TF_GPUTINPROG;
7529 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
7530 rack->r_ctl.rc_gp_srtt /*flex1*/,
7531 tp->gput_seq,
7532 0, 0, 18, __LINE__, NULL, 0);
7533 }
7534 if (ctf_progress_timeout_check(tp, false)) {
7535 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
7536 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
7537 return (-ETIMEDOUT); /* tcp_drop() */
7538 }
7539 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT;
7540 rack->r_ctl.retran_during_recovery = 0;
7541 rack->rc_ack_required = 1;
7542 rack->r_ctl.dsack_byte_cnt = 0;
7543 if (IN_RECOVERY(tp->t_flags) &&
7544 (rack->rto_from_rec == 0)) {
7545 /*
7546 * Mark that we had an RTO while in recovery
7547 * and save the ssthresh so if we go back
7548 * into recovery we will have a chance
7549 * to slow-start back up to that level.
7550 */
7551 rack->rto_from_rec = 1;
7552 rack->r_ctl.rto_ssthresh = tp->snd_ssthresh;
7553 }
7554 if (IN_FASTRECOVERY(tp->t_flags))
7555 tp->t_flags |= TF_WASFRECOVERY;
7556 else
7557 tp->t_flags &= ~TF_WASFRECOVERY;
7558 if (IN_CONGRECOVERY(tp->t_flags))
7559 tp->t_flags |= TF_WASCRECOVERY;
7560 else
7561 tp->t_flags &= ~TF_WASCRECOVERY;
7562 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
7563 (tp->snd_una == tp->snd_max)) {
7564 /* Nothing outstanding .. nothing to do */
7565 return (0);
7566 }
7567 if (rack->r_ctl.dsack_persist) {
7568 rack->r_ctl.dsack_persist--;
7569 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) {
7570 rack->r_ctl.num_dsack = 0;
7571 }
7572 rack_log_dsack_event(rack, 1, __LINE__, 0, 0);
7573 }
7574 /*
7575 * Rack can only run one timer at a time, so we cannot
7576 * run a KEEPINIT (gating SYN sending) and a retransmit
7577 * timer for the SYN. So if we are in a front state and
7578 * have a KEEPINIT timer we need to check the first transmit
7579 * against now to see if we have exceeded the KEEPINIT time
7580 * (if one is set).
7581 */
7582 if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) &&
7583 (TP_KEEPINIT(tp) != 0)) {
7584 struct rack_sendmap *rsm;
7585
7586 rsm = tqhash_min(rack->r_ctl.tqh);
7587 if (rsm) {
7588 /* Ok we have something outstanding to test keepinit with */
7589 if ((TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) &&
7590 ((cts - (uint32_t)rsm->r_tim_lastsent[0]) >= TICKS_2_USEC(TP_KEEPINIT(tp)))) {
7591 /* We have exceeded the KEEPINIT time */
7592 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
7593 goto drop_it;
7594 }
7595 }
7596 }
7597 /*
7598 * Retransmission timer went off. Message has not been acked within
7599 * retransmit interval. Back off to a longer retransmit interval
7600 * and retransmit one segment.
7601 */
7602 if ((rack->r_ctl.rc_resend == NULL) ||
7603 ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) {
7604 /*
7605 * If the rwnd collapsed on
7606 * the one we are retransmitting
7607 * it does not count against the
7608 * rxt count.
7609 */
7610 tp->t_rxtshift++;
7611 }
7612 rack_remxt_tmr(tp);
7613 if (tp->t_rxtshift > V_tcp_retries) {
7614 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
7615 drop_it:
7616 tp->t_rxtshift = V_tcp_retries;
7617 KMOD_TCPSTAT_INC(tcps_timeoutdrop);
7618 /* XXXGL: previously t_softerror was casted to uint16_t */
7619 MPASS(tp->t_softerror >= 0);
7620 retval = tp->t_softerror ? -tp->t_softerror : -ETIMEDOUT;
7621 goto out; /* tcp_drop() */
7622 }
7623 if (tp->t_state == TCPS_SYN_SENT) {
7624 /*
7625 * If the SYN was retransmitted, indicate CWND to be limited
7626 * to 1 segment in cc_conn_init().
7627 */
7628 tp->snd_cwnd = 1;
7629 } else if (tp->t_rxtshift == 1) {
7630 /*
7631 * first retransmit; record ssthresh and cwnd so they can be
7632 * recovered if this turns out to be a "bad" retransmit. A
7633 * retransmit is considered "bad" if an ACK for this segment
7634 * is received within RTT/2 interval; the assumption here is
7635 * that the ACK was already in flight. See "On Estimating
7636 * End-to-End Network Path Properties" by Allman and Paxson
7637 * for more details.
7638 */
7639 tp->snd_cwnd_prev = tp->snd_cwnd;
7640 tp->snd_ssthresh_prev = tp->snd_ssthresh;
7641 tp->snd_recover_prev = tp->snd_recover;
7642 tp->t_badrxtwin = ticks + (USEC_2_TICKS(tp->t_srtt)/2);
7643 tp->t_flags |= TF_PREVVALID;
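/*
 * For illustration: with a t_srtt of 40000 us, t_badrxtwin extends
 * 20000 us worth of ticks past now. An ACK for this segment that
 * arrives inside that window suggests the retransmit was spurious,
 * and the cwnd/ssthresh/recover values saved above can be restored.
 */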
7644 } else if ((tp->t_flags & TF_RCVD_TSTMP) == 0)
7645 tp->t_flags &= ~TF_PREVVALID;
7646 KMOD_TCPSTAT_INC(tcps_rexmttimeo);
7647 if ((tp->t_state == TCPS_SYN_SENT) ||
7648 (tp->t_state == TCPS_SYN_RECEIVED))
7649 rexmt = RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift];
7650 else
7651 rexmt = max(rack_rto_min, (tp->t_srtt + (tp->t_rttvar << 2))) * tcp_backoff[tp->t_rxtshift];
7652
7653 RACK_TCPT_RANGESET(tp->t_rxtcur, rexmt,
7654 max(rack_rto_min, rexmt), rack_rto_max, rack->r_ctl.timer_slop);
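/*
 * For illustration, assuming the stock tcp_backoff[] table
 * (1, 2, 4, 8, 16, ...): with a t_srtt of 100000 us and a t_rttvar
 * of 50000 us, the base value is max(rack_rto_min, 300000) us, so
 * the third timeout (t_rxtshift == 3) backs off to 8 * 300000 =
 * 2400000 us before the RANGESET clamp above bounds it between
 * max(rack_rto_min, rexmt) and rack_rto_max.
 */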
7655 /*
7656 * We enter the path for PLMTUD if the connection is established or
7657 * in FIN_WAIT_1 state. The reason for the latter is that if the
7658 * amount of data we send is very small, we could send it in a couple
7659 * of packets and proceed straight to FIN. In that case we would never
7660 * pass through the ESTABLISHED state.
7661 */
7662 #ifdef INET6
7663 isipv6 = (inp->inp_vflag & INP_IPV6) ? true : false;
7664 #else
7665 isipv6 = false;
7666 #endif
7667 if (((V_tcp_pmtud_blackhole_detect == 1) ||
7668 (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) ||
7669 (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) &&
7670 ((tp->t_state == TCPS_ESTABLISHED) ||
7671 (tp->t_state == TCPS_FIN_WAIT_1))) {
7672 /*
7673 * The idea here is that each stage of the MTU probe (usually
7674 * 1448 -> 1188 -> 524) should be given 2 chances to recover
7675 * before we clamp down further. 'tp->t_rxtshift % 2 == 0'
7676 * should take care of that.
7677 */
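/*
 * Concretely, the condition below fires only at t_rxtshift
 * values 2 and 4 (even, >= 2 and < 6): e.g. a first clamp of
 * 1448 -> 1188 at shift 2 and a second of 1188 -> 524 at
 * shift 4, two retransmit attempts per MSS stage.
 */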
7678 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) ==
7679 (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) &&
7680 (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 &&
7681 tp->t_rxtshift % 2 == 0)) {
7682 /*
7683 * Enter the Path MTU Black-hole Detection mechanism:
7684 * - Disable Path MTU Discovery (IP "DF" bit).
7685 * - Reduce the MTU to a lower value than what we
7686 *   negotiated with the peer.
7687 */
7688 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
7689 /* Record that we may have found a black hole. */
7690 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
7691 /* Keep track of previous MSS. */
7692 tp->t_pmtud_saved_maxseg = tp->t_maxseg;
7693 }
7694
7695 /*
7696 * Reduce the MSS to blackhole value or to the
7697 * default in an attempt to retransmit.
7698 */
7699 #ifdef INET6
7700 if (isipv6 &&
7701 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
7702 /* Use the sysctl tunable blackhole MSS. */
7703 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
7704 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated);
7705 } else if (isipv6) {
7706 /* Use the default MSS. */
7707 tp->t_maxseg = V_tcp_v6mssdflt;
7708 /*
7709 * Disable Path MTU Discovery when we switch
7710 * to minmss.
7711 */
7712 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
7713 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
7714 }
7715 #endif
7716 #if defined(INET6) && defined(INET)
7717 else
7718 #endif
7719 #ifdef INET
7720 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
7721 /* Use the sysctl tunable blackhole MSS. */
7722 tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
7723 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated);
7724 } else {
7725 /* Use the default MSS. */
7726 tp->t_maxseg = V_tcp_mssdflt;
7727 /*
7728 * Disable Path MTU Discovery when we switch
7729 * to minmss.
7730 */
7731 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
7732 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
7733 }
7734 #endif
7735 } else {
7736 /*
7737 * If further retransmissions are still unsuccessful
7738 * with a lowered MTU, maybe this isn't a blackhole
7739 * and we restore the previous MSS and blackhole
7740 * detection flags. The limit '6' is determined by
7741 * giving each probe stage (1448, 1188, 524) 2
7742 * chances to recover.
7743 */
7744 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
7745 (tp->t_rxtshift >= 6)) {
7746 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
7747 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
7748 tp->t_maxseg = tp->t_pmtud_saved_maxseg;
7749 if (tp->t_maxseg < V_tcp_mssdflt) {
7750 /*
7751 * The MSS is so small we should not
7752 * process incoming SACKs since we are
7753 * subject to attack in such a case.
7754 */
7755 tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT;
7756 } else {
7757 tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT;
7758 }
7759 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed);
7760 }
7761 }
7762 }
7763 /*
7764 * Disable RFC1323 and SACK if we haven't got any response to
7765 * our third SYN to work around some broken terminal servers
7766 * (most of which have hopefully been retired) that have bad VJ
7767 * header compression code which trashes TCP segments containing
7768 * unknown-to-them TCP options.
7769 */
7770 if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
7771 (tp->t_rxtshift == 3))
7772 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
7773 /*
7774 * If we backed off this far, our srtt estimate is probably bogus.
7775 * Clobber it so we'll take the next rtt measurement as our srtt;
7776 * move the current srtt into rttvar to keep the current retransmit
7777 * times until then.
7778 */
7779 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
7780 #ifdef INET6
7781 if ((inp->inp_vflag & INP_IPV6) != 0)
7782 in6_losing(inp);
7783 else
7784 #endif
7785 in_losing(inp);
7786 tp->t_rttvar += tp->t_srtt;
7787 tp->t_srtt = 0;
7788 }
7789 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
7790 tp->snd_recover = tp->snd_max;
7791 tp->t_flags |= TF_ACKNOW;
7792 tp->t_rtttime = 0;
7793 rack_cong_signal(tp, CC_RTO, tp->snd_una, __LINE__);
7794 out:
7795 return (retval);
7796 }
7797
7798 static int
7799 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling, uint8_t *doing_tlp)
7800 {
7801 int32_t ret = 0;
7802 int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK);
7803
7804 if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
7805 (tp->t_flags & TF_GPUTINPROG)) {
7806 /*
7807 * We have a goodput in progress
7808 * and we have entered a late state.
7809 * Do we have enough data in the sb
7810 * to handle the GPUT request?
7811 */
7812 uint32_t bytes;
7813
7814 bytes = tp->gput_ack - tp->gput_seq;
7815 if (SEQ_GT(tp->gput_seq, tp->snd_una))
7816 bytes += tp->gput_seq - tp->snd_una;
7817 if (bytes > sbavail(&tptosocket(tp)->so_snd)) {
7818 /*
7819 * There are not enough bytes in the socket
7820 * buffer that have been sent to cover this
7821 * measurement. Cancel it.
7822 */
7823 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
7824 rack->r_ctl.rc_gp_srtt /*flex1*/,
7825 tp->gput_seq,
7826 0, 0, 18, __LINE__, NULL, 0);
7827 tp->t_flags &= ~TF_GPUTINPROG;
7828 }
7829 }
7830 if (timers == 0) {
7831 return (0);
7832 }
7833 if (tp->t_state == TCPS_LISTEN) {
7834 /* no timers on listen sockets */
7835 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)
7836 return (0);
7837 return (1);
7838 }
7839 if ((timers & PACE_TMR_RACK) &&
7840 rack->rc_on_min_to) {
7841 /*
7842 * For the rack timer when we
7843 * are on a min-timeout (which means rrr_conf = 3)
7844 * we don't want to check the timer. It may
7845 * be going off for a pace and that's ok; we
7846 * want to send the retransmit (if it's ready).
7847 *
7848 * If it's on a normal rack timer (non-min) then
7849 * we will check if it has expired.
7850 */
7851 goto skip_time_check;
7852 }
7853 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
7854 uint32_t left;
7855
7856 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
7857 ret = -1;
7858 rack_log_to_processing(rack, cts, ret, 0);
7859 return (0);
7860 }
7861 if (hpts_calling == 0) {
7862 /*
7863 * A user send or a queued mbuf (SACK) has called us. We
7864 * return 0 and let the pacing guards
7865 * decide whether or not they
7866 * should cause a send.
7867 */
7868 ret = -2;
7869 rack_log_to_processing(rack, cts, ret, 0);
7870 return (0);
7871 }
7872 /*
7873 * Ok, our timer went off early and we are not paced: a false
7874 * alarm, go back to sleep. We make sure the no-sack-queue
7875 * flag is cleared since we no longer have a PKT_OUTPUT
7876 * flag in place.
7877 */
7878 rack->rc_tp->t_flags2 &= ~TF2_DONT_SACK_QUEUE;
7879 ret = -3;
7880 left = rack->r_ctl.rc_timer_exp - cts;
7881 tcp_hpts_insert(tp, left, NULL);
7882 rack_log_to_processing(rack, cts, ret, left);
7883 return (1);
7884 }
7885 skip_time_check:
7886 rack->rc_tmr_stopped = 0;
7887 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK;
7888 if (timers & PACE_TMR_DELACK) {
7889 ret = rack_timeout_delack(tp, rack, cts);
7890 } else if (timers & PACE_TMR_RACK) {
7891 rack->r_ctl.rc_tlp_rxt_last_time = cts;
7892 rack->r_fast_output = 0;
7893 ret = rack_timeout_rack(tp, rack, cts);
7894 } else if (timers & PACE_TMR_TLP) {
7895 rack->r_ctl.rc_tlp_rxt_last_time = cts;
7896 rack->r_fast_output = 0;
7897 ret = rack_timeout_tlp(tp, rack, cts, doing_tlp);
7898 } else if (timers & PACE_TMR_RXT) {
7899 rack->r_ctl.rc_tlp_rxt_last_time = cts;
7900 rack->r_fast_output = 0;
7901 ret = rack_timeout_rxt(tp, rack, cts);
7902 } else if (timers & PACE_TMR_PERSIT) {
7903 ret = rack_timeout_persist(tp, rack, cts);
7904 } else if (timers & PACE_TMR_KEEP) {
7905 ret = rack_timeout_keepalive(tp, rack, cts);
7906 }
7907 rack_log_to_processing(rack, cts, ret, timers);
7908 return (ret);
7909 }
7910
7911 static void
7912 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line)
7913 {
7914 struct timeval tv;
7915 uint32_t us_cts, flags_on_entry;
7916 uint8_t hpts_removed = 0;
7917
7918 flags_on_entry = rack->r_ctl.rc_hpts_flags;
7919 us_cts = tcp_get_usecs(&tv);
7920 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
7921 ((TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) ||
7922 ((tp->snd_max - tp->snd_una) == 0))) {
7923 tcp_hpts_remove(rack->rc_tp);
7924 hpts_removed = 1;
7925 /* If we were not delayed cancel out the flag. */
7926 if ((tp->snd_max - tp->snd_una) == 0)
7927 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
7928 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry);
7929 }
7930 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
7931 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
7932 if (tcp_in_hpts(rack->rc_tp) &&
7933 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) {
7934 /*
7935 * Canceling timers when we have no output being
7936 * paced. We also must remove ourselves from the
7937 * hpts.
7938 */
7939 tcp_hpts_remove(rack->rc_tp);
7940 hpts_removed = 1;
7941 }
7942 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK);
7943 }
7944 if (hpts_removed == 0)
7945 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry);
7946 }
7947
7948 static int
7949 rack_stopall(struct tcpcb *tp)
7950 {
7951 struct tcp_rack *rack;
7952
7953 rack = (struct tcp_rack *)tp->t_fb_ptr;
7954 rack->t_timers_stopped = 1;
7955
7956 tcp_hpts_remove(tp);
7957
7958 return (0);
7959 }
7960
7961 static void
7962 rack_stop_all_timers(struct tcpcb *tp, struct tcp_rack *rack)
7963 {
7964 /*
7965 * Assure no timers are running.
7966 */
7967 if (tcp_timer_active(tp, TT_PERSIST)) {
7968 /* We enter in persists, set the flag appropriately */
7969 rack->rc_in_persist = 1;
7970 }
7971 if (tcp_in_hpts(rack->rc_tp)) {
7972 tcp_hpts_remove(rack->rc_tp);
7973 }
7974 }
7975
7976 static void
7977 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
7978 struct rack_sendmap *rsm, uint64_t ts, uint32_t add_flag, int segsiz)
7979 {
7980 int32_t idx;
7981
7982 rsm->r_rtr_cnt++;
7983 if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) {
7984 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS;
7985 rsm->r_flags |= RACK_OVERMAX;
7986 }
7987 rsm->r_act_rxt_cnt++;
7988 /* Peg the count/index */
7989 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
7990 rsm->r_dupack = 0;
7991 if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & RACK_TLP) == 0)) {
7992 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start);
7993 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start);
7994 }
7995 if (rsm->r_flags & RACK_WAS_LOST) {
7996 /*
7997 * We retransmitted it, putting it back in flight;
7998 * remove the lost designation and reduce the
7999 * bytes considered lost.
8000 */
8001 rack_mark_nolonger_lost(rack, rsm);
8002 }
8003 idx = rsm->r_rtr_cnt - 1;
8004 rsm->r_tim_lastsent[idx] = ts;
8005 /*
8006 * Here we don't add in the length of the send, since it's already
8007 * in snd_una <-> snd_max.
8008 */
8009 rsm->r_fas = ctf_flight_size(rack->rc_tp,
8010 rack->r_ctl.rc_sacked);
8011 if (rsm->r_flags & RACK_ACKED) {
8012 /* Probably MTU discovery messing with us */
8013 rsm->r_flags &= ~RACK_ACKED;
8014 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
8015 }
8016 if (rsm->r_in_tmap) {
8017 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
8018 rsm->r_in_tmap = 0;
8019 }
8020 /* Let's make sure it really is, or is not, in the GP window */
8021 rack_mark_in_gp_win(tp, rsm);
8022 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
8023 rsm->r_in_tmap = 1;
8024 rsm->r_bas = (uint8_t)(((rsm->r_end - rsm->r_start) + segsiz - 1) / segsiz);
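/*
 * r_bas above is a ceiling division, e.g. a 3000 byte rsm with a
 * segsiz of 1448 yields (3000 + 1447) / 1448 = 3 segments.
 */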
8025 /* Take off the must retransmit flag, if it's on */
8026 if (rsm->r_flags & RACK_MUST_RXT) {
8027 if (rack->r_must_retran)
8028 rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start);
8029 if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) {
8030 /*
8031 * We have retransmitted all we need. Clear
8032 * any must retransmit flags.
8033 */
8034 rack->r_must_retran = 0;
8035 rack->r_ctl.rc_out_at_rto = 0;
8036 }
8037 rsm->r_flags &= ~RACK_MUST_RXT;
8038 }
8039 /* Remove any collapsed flag */
8040 rsm->r_flags &= ~RACK_RWND_COLLAPSED;
8041 if (rsm->r_flags & RACK_SACK_PASSED) {
8042 /* We have retransmitted due to the SACK pass */
8043 rsm->r_flags &= ~RACK_SACK_PASSED;
8044 rsm->r_flags |= RACK_WAS_SACKPASS;
8045 }
8046 }
8047
8048 static uint32_t
8049 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
8050 struct rack_sendmap *rsm, uint64_t ts, int32_t *lenp, uint32_t add_flag, int segsiz)
8051 {
8052 /*
8053 * We (re-)transmitted starting at rsm->r_start for some length
8054 * (possibly ending before r_end).
8055 */
8056 struct rack_sendmap *nrsm;
8057 int insret __diagused;
8058 uint32_t c_end;
8059 int32_t len;
8060
8061 len = *lenp;
8062 c_end = rsm->r_start + len;
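/*
 * For example, for an rsm covering [1000, 2000): a retransmit of
 * len 1500 gives c_end 2500 >= r_end, so the whole rsm was resent
 * and 500 bytes remain for the caller (we return r_end, 2000); a
 * len of 600 gives c_end 1600 < r_end and forces the split below.
 */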
8063 if (SEQ_GEQ(c_end, rsm->r_end)) {
8064 /*
8065 * We retransmitted the whole piece, or more than the whole,
8066 * slopping over into the next rsm.
8067 */
8068 rack_update_rsm(tp, rack, rsm, ts, add_flag, segsiz);
8069 if (c_end == rsm->r_end) {
8070 *lenp = 0;
8071 return (0);
8072 } else {
8073 int32_t act_len;
8074
8075 /* Hangs over the end; return what's left */
8076 act_len = rsm->r_end - rsm->r_start;
8077 *lenp = (len - act_len);
8078 return (rsm->r_end);
8079 }
8080 /* We don't get out of this block. */
8081 }
8082 /*
8083 * Here we retransmitted less than the whole thing which means we
8084 * have to split this into what was transmitted and what was not.
8085 */
8086 nrsm = rack_alloc_full_limit(rack);
8087 if (nrsm == NULL) {
8088 /*
8089 * We can't get memory, so let's not proceed.
8090 */
8091 *lenp = 0;
8092 return (0);
8093 }
8094 /*
8095 * So here we are going to take the original rsm and make it what we
8096 * retransmitted. nrsm will be the tail portion we did not
8097 * retransmit. For example say the chunk was 1, 11 (10 bytes). And
8098 * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to
8099 * 1, 6 and the new piece will be 6, 11.
8100 */
8101 rack_clone_rsm(rack, nrsm, rsm, c_end);
8102 nrsm->r_dupack = 0;
8103 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2);
8104 #ifndef INVARIANTS
8105 (void)tqhash_insert(rack->r_ctl.tqh, nrsm);
8106 #else
8107 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) {
8108 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p",
8109 nrsm, insret, rack, rsm);
8110 }
8111 #endif
8112 if (rsm->r_in_tmap) {
8113 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
8114 nrsm->r_in_tmap = 1;
8115 }
8116 rsm->r_flags &= (~RACK_HAS_FIN);
8117 rack_update_rsm(tp, rack, rsm, ts, add_flag, segsiz);
8118 /* Log a split of rsm into rsm and nrsm */
8119 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__);
8120 *lenp = 0;
8121 return (0);
8122 }
8123
8124 static void
8125 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
8126 uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t cts,
8127 struct rack_sendmap *hintrsm, uint32_t add_flag, struct mbuf *s_mb,
8128 uint32_t s_moff, int hw_tls, int segsiz)
8129 {
8130 struct tcp_rack *rack;
8131 struct rack_sendmap *rsm, *nrsm;
8132 int insret __diagused;
8133
8134 register uint32_t snd_max, snd_una;
8135
8136 /*
8137 * Add to the RACK log of packets in flight or retransmitted. If
8138 * there is a TS option we will use the TS echoed, if not we will
8139 * grab a TS.
8140 *
8141 * Retransmissions will increment the count and move the ts to its
8142 * proper place. Note that if options do not include TS's then we
8143 * won't be able to effectively use the ACK for an RTT on a retran.
8144 *
8145 * Notes about r_start and r_end. Let's consider a send starting at
8146 * sequence 1 for 10 bytes. In such an example the r_start would be
8147 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11.
8148 * This means that r_end is actually the first sequence for the next
8149 * slot (11).
8150 *
8151 */
8152 /*
8153 * If err is set what do we do XXXrrs? should we not add the thing?
8154 * -- i.e. return if err != 0 or should we pretend we sent it? --
8155 * i.e. proceed with add ** do this for now.
8156 */
8157 INP_WLOCK_ASSERT(tptoinpcb(tp));
8158 if (err)
8159 /*
8160 * We don't log errors -- we could but snd_max does not
8161 * advance in this case either.
8162 */
8163 return;
8164
8165 if (th_flags & TH_RST) {
8166 /*
8167 * We don't log resets and we return immediately from
8168 * sending
8169 */
8170 return;
8171 }
8172 rack = (struct tcp_rack *)tp->t_fb_ptr;
8173 snd_una = tp->snd_una;
8174 snd_max = tp->snd_max;
8175 if (th_flags & (TH_SYN | TH_FIN)) {
8176 /*
8177 * The call to rack_log_output is made before bumping
8178 * snd_max. This means we can record one extra byte on a SYN
8179 * or FIN if seq_out is adding more on and a FIN is present
8180 * (and we are not resending).
8181 */
8182 if ((th_flags & TH_SYN) && (seq_out == tp->iss))
8183 len++;
8184 if (th_flags & TH_FIN)
8185 len++;
8186 }
8187 if (SEQ_LEQ((seq_out + len), snd_una)) {
8188 /* Are we sending an old segment to induce an ack (keep-alive)? */
8189 return;
8190 }
8191 if (SEQ_LT(seq_out, snd_una)) {
8192 /* huh? should we panic? */
8193 uint32_t end;
8194
8195 end = seq_out + len;
8196 seq_out = snd_una;
8197 if (SEQ_GEQ(end, seq_out))
8198 len = end - seq_out;
8199 else
8200 len = 0;
8201 }
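/*
 * E.g. seq_out 900, len 300, snd_una 1000: end is 1200, so we
 * log only the unacked tail [1000, 1200) (seq_out 1000, len 200).
 */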
8202 if (len == 0) {
8203 /* We don't log zero window probes */
8204 return;
8205 }
8206 if (IN_FASTRECOVERY(tp->t_flags)) {
8207 rack->r_ctl.rc_prr_out += len;
8208 }
8209 /* First question is it a retransmission or new? */
8210 if (seq_out == snd_max) {
8211 /* Its new */
8212 rack_chk_req_and_hybrid_on_out(rack, seq_out, len, cts);
8213 again:
8214 rsm = rack_alloc(rack);
8215 if (rsm == NULL) {
8216 /*
8217 * Hmm out of memory and the tcb got destroyed while
8218 * we tried to wait.
8219 */
8220 return;
8221 }
8222 if (th_flags & TH_FIN) {
8223 rsm->r_flags = RACK_HAS_FIN|add_flag;
8224 } else {
8225 rsm->r_flags = add_flag;
8226 }
8227 if (hw_tls)
8228 rsm->r_hw_tls = 1;
8229 rsm->r_tim_lastsent[0] = cts;
8230 rsm->r_rtr_cnt = 1;
8231 rsm->r_act_rxt_cnt = 0;
8232 rsm->r_rtr_bytes = 0;
8233 if (th_flags & TH_SYN) {
8234 /* The data space is one beyond snd_una */
8235 rsm->r_flags |= RACK_HAS_SYN;
8236 }
8237 rsm->r_start = seq_out;
8238 rsm->r_end = rsm->r_start + len;
8239 rack_mark_in_gp_win(tp, rsm);
8240 rsm->r_dupack = 0;
8241 /*
8242 * save off the mbuf location that
8243 * sndmbuf_noadv returned (which is
8244 * where we started copying from).
8245 */
8246 rsm->m = s_mb;
8247 rsm->soff = s_moff;
8248 /*
8249 * Here we do add in the length of the send, since it's not yet
8250 * reflected in snd_una <-> snd_max.
8251 */
8252 rsm->r_fas = (ctf_flight_size(rack->rc_tp,
8253 rack->r_ctl.rc_sacked) +
8254 (rsm->r_end - rsm->r_start));
8255 if ((rack->rc_initial_ss_comp == 0) &&
8256 (rack->r_ctl.ss_hi_fs < rsm->r_fas)) {
8257 rack->r_ctl.ss_hi_fs = rsm->r_fas;
8258 }
8259 /* rsm->m will be NULL if RACK_HAS_SYN or RACK_HAS_FIN is set */
8260 if (rsm->m) {
8261 if (rsm->m->m_len <= rsm->soff) {
8262 /*
8263 * XXXrrs Question, will this happen?
8264 *
8265 * If sbsndptr is set at the correct place
8266 * then s_moff should always be somewhere
8267 * within rsm->m. But if the sbsndptr was
8268 * off then that won't be true. If it occurs
8269 * we need to walk out to the correct location.
8270 */
8271 struct mbuf *lm;
8272
8273 lm = rsm->m;
8274 while (lm->m_len <= rsm->soff) {
8275 rsm->soff -= lm->m_len;
8276 lm = lm->m_next;
8277 KASSERT(lm != NULL, ("%s rack:%p lm goes null orig_off:%u origmb:%p rsm->soff:%u",
8278 __func__, rack, s_moff, s_mb, rsm->soff));
8279 }
8280 rsm->m = lm;
8281 }
8282 rsm->orig_m_len = rsm->m->m_len;
8283 rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
8284 } else {
8285 rsm->orig_m_len = 0;
8286 rsm->orig_t_space = 0;
8287 }
8288 rsm->r_bas = (uint8_t)((len + segsiz - 1) / segsiz);
8289 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
8290 /* Log a new rsm */
8291 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_NEW, 0, __LINE__);
8292 #ifndef INVARIANTS
8293 (void)tqhash_insert(rack->r_ctl.tqh, rsm);
8294 #else
8295 if ((insret = tqhash_insert(rack->r_ctl.tqh, rsm)) != 0) {
8296 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p",
8297 rsm, insret, rack, rsm);
8298 }
8299 #endif
8300 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
8301 rsm->r_in_tmap = 1;
8302 if (rsm->r_flags & RACK_IS_PCM) {
8303 rack->r_ctl.pcm_i.send_time = cts;
8304 rack->r_ctl.pcm_i.eseq = rsm->r_end;
8305 /* First time through we set the start too */
8306 if (rack->pcm_in_progress == 0)
8307 rack->r_ctl.pcm_i.sseq = rsm->r_start;
8308 }
8309 /*
8310 * Special case detection, is there just a single
8311 * packet outstanding when we are not in recovery?
8312 *
8313 * If this is true mark it so.
8314 */
8315 if ((IN_FASTRECOVERY(tp->t_flags) == 0) &&
8316 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) == ctf_fixed_maxseg(tp))) {
8317 struct rack_sendmap *prsm;
8318
8319 prsm = tqhash_prev(rack->r_ctl.tqh, rsm);
8320 if (prsm)
8321 prsm->r_one_out_nr = 1;
8322 }
8323 return;
8324 }
8325 /*
8326 * If we reach here its a retransmission and we need to find it.
8327 */
8328 more:
8329 if (hintrsm && (hintrsm->r_start == seq_out)) {
8330 rsm = hintrsm;
8331 hintrsm = NULL;
8332 } else {
8333 /* No hints sorry */
8334 rsm = NULL;
8335 }
8336 if ((rsm) && (rsm->r_start == seq_out)) {
8337 seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag, segsiz);
8338 if (len == 0) {
8339 return;
8340 } else {
8341 goto more;
8342 }
8343 }
8344 /* Ok, it was not the last pointer; go through it the hard way. */
8345 refind:
8346 rsm = tqhash_find(rack->r_ctl.tqh, seq_out);
8347 if (rsm) {
8348 if (rsm->r_start == seq_out) {
8349 seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag, segsiz);
8350 if (len == 0) {
8351 return;
8352 } else {
8353 goto refind;
8354 }
8355 }
8356 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) {
8357 /* Transmitted within this piece */
8358 /*
8359 * Ok we must split off the front and then let the
8360 * update do the rest
8361 */
8362 nrsm = rack_alloc_full_limit(rack);
8363 if (nrsm == NULL) {
8364 rack_update_rsm(tp, rack, rsm, cts, add_flag, segsiz);
8365 return;
8366 }
8367 /*
8368 * copy rsm to nrsm and then trim the front of rsm
8369 * to not include this part.
8370 */
8371 rack_clone_rsm(rack, nrsm, rsm, seq_out);
8372 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__);
8373 #ifndef INVARIANTS
8374 (void)tqhash_insert(rack->r_ctl.tqh, nrsm);
8375 #else
8376 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) {
8377 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p",
8378 nrsm, insret, rack, rsm);
8379 }
8380 #endif
8381 if (rsm->r_in_tmap) {
8382 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
8383 nrsm->r_in_tmap = 1;
8384 }
8385 rsm->r_flags &= (~RACK_HAS_FIN);
8386 seq_out = rack_update_entry(tp, rack, nrsm, cts, &len, add_flag, segsiz);
8387 if (len == 0) {
8388 return;
8389 } else if (len > 0)
8390 goto refind;
8391 }
8392 }
8393 /*
8394 * Hmm, not found in the map. Did they retransmit both old data and on
8395 * into the new?
8396 */
8397 if (seq_out == tp->snd_max) {
8398 goto again;
8399 } else if (SEQ_LT(seq_out, tp->snd_max)) {
8400 #ifdef INVARIANTS
8401 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n",
8402 seq_out, len, tp->snd_una, tp->snd_max);
8403 printf("Starting Dump of all rack entries\n");
8404 TQHASH_FOREACH(rsm, rack->r_ctl.tqh) {
8405 printf("rsm:%p start:%u end:%u\n",
8406 rsm, rsm->r_start, rsm->r_end);
8407 }
8408 printf("Dump complete\n");
8409 panic("seq_out not found rack:%p tp:%p",
8410 rack, tp);
8411 #endif
8412 } else {
8413 #ifdef INVARIANTS
8414 /*
8415 * Hmm beyond sndmax? (only if we are using the new rtt-pack
8416 * flag)
8417 */
8418 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p",
8419 seq_out, len, tp->snd_max, tp);
8420 #endif
8421 }
8422 }
8423
8424 /*
8425 * Record one of the RTT updates from an ack into
8426 * our sample structure.
8427 */
8428
8429 static void
8430 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, uint32_t len, uint32_t us_rtt,
8431 int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt)
8432 {
8433 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
8434 (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) {
8435 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt;
8436 }
8437 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
8438 (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) {
8439 rack->r_ctl.rack_rs.rs_rtt_highest = rtt;
8440 }
8441 if (rack->rc_tp->t_flags & TF_GPUTINPROG) {
8442 if (us_rtt < rack->r_ctl.rc_gp_lowrtt)
8443 rack->r_ctl.rc_gp_lowrtt = us_rtt;
8444 if (rack->rc_tp->snd_wnd > rack->r_ctl.rc_gp_high_rwnd)
8445 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd;
8446 }
8447 if ((confidence == 1) &&
8448 ((rsm == NULL) ||
8449 (rsm->r_just_ret) ||
8450 (rsm->r_one_out_nr &&
8451 len < (ctf_fixed_maxseg(rack->rc_tp) * 2)))) {
8452 /*
8453 * If the rsm had a just-return
8454 * hit on it then we can't trust the
8455 * rtt measurement for buffer determination.
8456 * Note that a confidence of 2 indicates
8457 * SACK'd, which overrides the r_just_ret or
8458 * the r_one_out_nr. If it was a CUM-ACK and
8459 * we had only two outstanding, but got an
8460 * ack for only 1, then that also lowers our
8461 * confidence.
8462 */
8463 confidence = 0;
8464 }
8465 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
8466 (rack->r_ctl.rack_rs.rs_us_rtt > us_rtt)) {
8467 if (rack->r_ctl.rack_rs.confidence == 0) {
8468 /*
8469 * We take anything with no current confidence
8470 * saved.
8471 */
8472 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt;
8473 rack->r_ctl.rack_rs.confidence = confidence;
8474 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt;
8475 } else if (confidence != 0) {
8476 /*
8477 * Once we have a confident number,
8478 * we can update it with a smaller
8479 * value since this confident number
8480 * may include the DSACK time until
8481 * the next segment (the second one) arrived.
8482 */
8483 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt;
8484 rack->r_ctl.rack_rs.confidence = confidence;
8485 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt;
8486 }
8487 }
8488 rack_log_rtt_upd(rack->rc_tp, rack, us_rtt, len, rsm, confidence);
8489 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID;
8490 rack->r_ctl.rack_rs.rs_rtt_tot += rtt;
8491 rack->r_ctl.rack_rs.rs_rtt_cnt++;
8492 }
8493
8494 /*
8495 * Collect new round-trip time estimate
8496 * and update averages and current timeout.
8497 */
8498 static void
8499 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp)
8500 {
8501 int32_t delta;
8502 int32_t rtt;
8503
8504 if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY)
8505 /* No valid sample */
8506 return;
8507 if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) {
8508 /* We are to use the lowest RTT seen in a single ack */
8509 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest;
8510 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) {
8511 /* We are to use the highest RTT seen in a single ack */
8512 rtt = rack->r_ctl.rack_rs.rs_rtt_highest;
8513 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) {
8514 /* We are to use the average RTT seen in a single ack */
8515 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot /
8516 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt);
8517 } else {
8518 #ifdef INVARIANTS
8519 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method);
8520 #endif
8521 return;
8522 }
8523 if (rtt == 0)
8524 rtt = 1;
8525 if (rack->rc_gp_rtt_set == 0) {
8526 /*
8527 * With no RTT we have to accept
8528 * even one we are not confident of.
8529 */
8530 rack->r_ctl.rc_gp_srtt = rack->r_ctl.rack_rs.rs_us_rtt;
8531 rack->rc_gp_rtt_set = 1;
8532 } else if (rack->r_ctl.rack_rs.confidence) {
8533 /* update the running gp srtt */
8534 rack->r_ctl.rc_gp_srtt -= (rack->r_ctl.rc_gp_srtt/8);
8535 rack->r_ctl.rc_gp_srtt += rack->r_ctl.rack_rs.rs_us_rtt / 8;
8536 }
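/*
 * The update above is a gain-1/8 EWMA, e.g. with rc_gp_srtt of
 * 40000 us and a confident 24000 us sample it becomes
 * 40000 - 5000 + 3000 = 38000 us.
 */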
8537 if (rack->r_ctl.rack_rs.confidence) {
8538 /*
8539 * record the low and high for highly buffered path computation;
8540 * we only do this if we are confident (not a retransmission).
8541 */
8542 if (rack->r_ctl.rc_highest_us_rtt < rack->r_ctl.rack_rs.rs_us_rtt) {
8543 rack->r_ctl.rc_highest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt;
8544 }
8545 if (rack->rc_highly_buffered == 0) {
8546 /*
8547 * Currently once we declare a path as
8548 * highly buffered there is no going
8549 * back, which may be a problem...
8550 */
8551 if ((rack->r_ctl.rc_highest_us_rtt / rack->r_ctl.rc_lowest_us_rtt) > rack_hbp_thresh) {
8552 rack_log_rtt_shrinks(rack, rack->r_ctl.rack_rs.rs_us_rtt,
8553 rack->r_ctl.rc_highest_us_rtt,
8554 rack->r_ctl.rc_lowest_us_rtt,
8555 RACK_RTTS_SEEHBP);
8556 rack->rc_highly_buffered = 1;
8557 }
8558 }
8559 }
8560 if ((rack->r_ctl.rack_rs.confidence) ||
8561 (rack->r_ctl.rack_rs.rs_us_rtrcnt == 1)) {
8562 /*
8563 * If we are highly confident of it <or> it was
8564 * never retransmitted we accept it as the last us_rtt.
8565 */
8566 rack->r_ctl.rc_last_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt;
8567 /* The lowest rtt can be set if it was not retransmitted */
8568 if (rack->r_ctl.rc_lowest_us_rtt > rack->r_ctl.rack_rs.rs_us_rtt) {
8569 rack->r_ctl.rc_lowest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt;
8570 if (rack->r_ctl.rc_lowest_us_rtt == 0)
8571 rack->r_ctl.rc_lowest_us_rtt = 1;
8572 }
8573 }
8574 rack = (struct tcp_rack *)tp->t_fb_ptr;
8575 if (tp->t_srtt != 0) {
8576 /*
8577 * We keep a simple srtt in microseconds, like our rtt
8578 * measurement. We don't need to do any tricks with shifting
8579 * etc. Instead we just add in 1/8th of the new measurement
8580 * and subtract out 1/8 of the old srtt. We do the same with
8581 * the variance after finding the absolute value of the
8582 * difference between this sample and the current srtt.
8583 */
8584 delta = tp->t_srtt - rtt;
8585 /* Take off 1/8th of the current sRTT */
8586 tp->t_srtt -= (tp->t_srtt >> 3);
8587 /* Add in 1/8th of the new RTT just measured */
8588 tp->t_srtt += (rtt >> 3);
8589 if (tp->t_srtt <= 0)
8590 tp->t_srtt = 1;
8591 /* Now lets make the absolute value of the variance */
8592 if (delta < 0)
8593 delta = -delta;
8594 /* Subtract out 1/8th */
8595 tp->t_rttvar -= (tp->t_rttvar >> 3);
8596 /* Add in 1/8th of the new variance we just saw */
8597 tp->t_rttvar += (delta >> 3);
8598 if (tp->t_rttvar <= 0)
8599 tp->t_rttvar = 1;
8600 } else {
8601 /*
8602 * No rtt measurement yet - use the unsmoothed rtt. Set the
8603 * variance to half the rtt (so our first retransmit happens
8604 * at 3*rtt).
8605 */
8606 tp->t_srtt = rtt;
8607 tp->t_rttvar = rtt >> 1;
8608 }
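/*
 * E.g. with a t_srtt of 100000 us and a new rtt of 60000 us:
 * delta is 40000, t_srtt becomes 100000 - 12500 + 7500 = 95000 us,
 * and t_rttvar sheds 1/8th of itself while gaining
 * 40000 >> 3 = 5000 us.
 */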
8609 rack->rc_srtt_measure_made = 1;
8610 KMOD_TCPSTAT_INC(tcps_rttupdated);
8611 if (tp->t_rttupdated < UCHAR_MAX)
8612 tp->t_rttupdated++;
8613 #ifdef STATS
8614 if (rack_stats_gets_ms_rtt == 0) {
8615 /* Send in the microsecond rtt used for rxt timeout purposes */
8616 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt));
8617 } else if (rack_stats_gets_ms_rtt == 1) {
8618 /* Send in the millisecond rtt used for rxt timeout purposes */
8619 int32_t ms_rtt;
8620
8621 /* Round up */
8622 ms_rtt = (rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC;
8623 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt));
8624 } else if (rack_stats_gets_ms_rtt == 2) {
8625 /* Send in the millisecond rtt as close to the path RTT as we can get */
8626 int32_t ms_rtt;
8627
8628 /* Round up */
8629 ms_rtt = (rack->r_ctl.rack_rs.rs_us_rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC;
8630 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt));
8631 } else {
8632 /* Send in the microsecond rtt as close to the path RTT as we can get */
8633 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt));
8634 }
8635 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_PATHRTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt));
8636 #endif
8637 rack->r_ctl.last_rcv_tstmp_for_rtt = tcp_tv_to_msec(&rack->r_ctl.act_rcv_time);
8638 /*
8639 * the retransmit should happen at rtt + 4 * rttvar. Because of the
8640 * way we do the smoothing, srtt and rttvar will each average +1/2
8641 * tick of bias. When we compute the retransmit timer, we want 1/2
8642 * tick of rounding and 1 extra tick because of +-1/2 tick
8643 * uncertainty in the firing of the timer. The bias will give us
8644 * exactly the 1.5 tick we need. But, because the bias is
8645 * statistical, we have to test that we don't drop below the minimum
8646 * feasible timer (which is 2 ticks).
8647 */
8648 tp->t_rxtshift = 0;
8649 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
8650 max(rack_rto_min, rtt + 2), rack_rto_max, rack->r_ctl.timer_slop);
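/*
 * A rough sketch of the resulting RTO, assuming RACK_REXMTVAL()
 * expands to the classic srtt + 4 * rttvar (matching the rexmt
 * computation in rack_timeout_rxt above): with a t_srtt of
 * 95000 us and a t_rttvar of 10000 us the raw value is 135000 us,
 * clamped between max(rack_rto_min, rtt + 2) and rack_rto_max,
 * with timer_slop added by the RANGESET.
 */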
8651 rack_log_rtt_sample(rack, rtt);
8652 tp->t_softerror = 0;
8653 }
8654
8655
8656 static void
8657 rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts)
8658 {
8659 /*
8660 * Apply the inbound us-rtt at us_cts to the filter.
8661 */
8662 uint32_t old_rtt;
8663
8664 old_rtt = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt);
8665 apply_filter_min_small(&rack->r_ctl.rc_gp_min_rtt,
8666 us_rtt, us_cts);
8667 if (old_rtt > us_rtt) {
8668 /* We just hit a new lower rtt time */
8669 rack_log_rtt_shrinks(rack, us_cts, old_rtt,
8670 __LINE__, RACK_RTTS_NEWRTT);
8671 /*
8672 * Only count it if it's lower than what we saw within our
8673 * calculated range.
8674 */
8675 if ((old_rtt - us_rtt) > rack_min_rtt_movement) {
8676 if (rack_probertt_lower_within &&
8677 rack->rc_gp_dyn_mul &&
8678 (rack->use_fixed_rate == 0) &&
8679 (rack->rc_always_pace)) {
8680 /*
8681 * We are seeing a new lower rtt very close
8682 * to the time that we would have entered probe-rtt.
8683 * This is probably due to the fact that a peer flow
8684 * has entered probe-rtt. Let's go in now too.
8685 */
8686 uint32_t val;
8687
8688 val = rack_probertt_lower_within * rack_time_between_probertt;
8689 val /= 100;
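/*
 * E.g. if rack_probertt_lower_within were 10 (percent) and
 * rack_time_between_probertt 9600000 us, val would be 960000 us:
 * a new low rtt seen within the last ~10% of the probe interval
 * pulls us into probe-rtt early.
 */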
8690 if ((rack->in_probe_rtt == 0) &&
8691 (rack->rc_skip_timely == 0) &&
8692 ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) {
8693 rack_enter_probertt(rack, us_cts);
8694 }
8695 }
8696 rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
8697 }
8698 }
8699 }
8700
8701 static int
8702 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
8703 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack)
8704 {
8705 uint32_t us_rtt;
8706 int32_t i, all;
8707 uint32_t t, len_acked;
8708
8709 if ((rsm->r_flags & RACK_ACKED) ||
8710 (rsm->r_flags & RACK_WAS_ACKED))
8711 /* Already done */
8712 return (0);
8713 if (rsm->r_no_rtt_allowed) {
8714 /* Not allowed */
8715 return (0);
8716 }
8717 if (ack_type == CUM_ACKED) {
8718 if (SEQ_GT(th_ack, rsm->r_end)) {
8719 len_acked = rsm->r_end - rsm->r_start;
8720 all = 1;
8721 } else {
8722 len_acked = th_ack - rsm->r_start;
8723 all = 0;
8724 }
8725 } else {
8726 len_acked = rsm->r_end - rsm->r_start;
8727 all = 0;
8728 }
8729 if (rsm->r_rtr_cnt == 1) {
8730
8731 t = cts - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
8732 if ((int)t <= 0)
8733 t = 1;
8734 if (!tp->t_rttlow || tp->t_rttlow > t)
8735 tp->t_rttlow = t;
8736 if (!rack->r_ctl.rc_rack_min_rtt ||
8737 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
8738 rack->r_ctl.rc_rack_min_rtt = t;
8739 if (rack->r_ctl.rc_rack_min_rtt == 0) {
8740 rack->r_ctl.rc_rack_min_rtt = 1;
8741 }
8742 }
8743 if (TSTMP_GT(tcp_tv_to_usec(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]))
8744 us_rtt = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
8745 else
8746 us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)];
8747 if (us_rtt == 0)
8748 us_rtt = 1;
8749 if (CC_ALGO(tp)->rttsample != NULL) {
8750 /* Kick the RTT to the CC */
8751 CC_ALGO(tp)->rttsample(&tp->t_ccv, us_rtt, 1, rsm->r_fas);
8752 }
8753 rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usec(&rack->r_ctl.act_rcv_time));
8754 if (ack_type == SACKED) {
8755 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 1);
8756 tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 2 , rsm, rsm->r_rtr_cnt);
8757 } else {
8758 /*
8759 * We need to set up what our confidence
8760 * is in this ack.
8761 *
8762 * If the rsm was app limited and it is
8763 * less than a mss in length (the end
8764 * of the send) then we have a gap. If we
8765 * were app limited but say we were sending
8766 * multiple MSS's then we are more confident
8767 * in it.
8768 *
8769 * When we are not app-limited then we see if
8770 * the rsm is being included in the current
8771 * measurement; we tell this by the app_limited_needs_set
8772 * flag.
8773 *
8774 * Note that being cwnd blocked is not app-limited,
8775 * and the pacing delay between packets that
8776 * send only 1 or 2 MSS's will also show up
8777 * in the RTT. We probably need to examine this algorithm
8778 * a bit more and enhance it to account for the delay
8779 * between rsm's. We could do that by saving off the
8780 * pacing delay of each rsm (in the rsm) and then
8781 * factoring that in somehow, though for now I am
8782 * not sure how :)
8783 */
8784 int calc_conf = 0;
8785
8786 if (rsm->r_flags & RACK_APP_LIMITED) {
8787 if (all && (len_acked <= ctf_fixed_maxseg(tp)))
8788 calc_conf = 0;
8789 else
8790 calc_conf = 1;
8791 } else if (rack->app_limited_needs_set == 0) {
8792 calc_conf = 1;
8793 } else {
8794 calc_conf = 0;
8795 }
8796 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 2);
8797 tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt,
8798 calc_conf, rsm, rsm->r_rtr_cnt);
8799 }
8800 if ((rsm->r_flags & RACK_TLP) &&
8801 (!IN_FASTRECOVERY(tp->t_flags))) {
8802 /* Segment was a TLP and our retrans matched */
8803 if (rack->r_ctl.rc_tlp_cwnd_reduce) {
8804 rack_cong_signal(tp, CC_NDUPACK, th_ack, __LINE__);
8805 }
8806 }
8807 if ((rack->r_ctl.rc_rack_tmit_time == 0) ||
8808 (SEQ_LT(rack->r_ctl.rc_rack_tmit_time,
8809 (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]))) {
8810 /* New more recent rack_tmit_time */
8811 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
8812 if (rack->r_ctl.rc_rack_tmit_time == 0)
8813 rack->r_ctl.rc_rack_tmit_time = 1;
8814 rack->rc_rack_rtt = t;
8815 }
8816 return (1);
8817 }
8818 /*
8819 * We clear the soft/rxtshift since we got an ack.
8820 * There is no assurance we will call the commit() function
8821 * so we need to clear these to avoid incorrect handling.
8822 */
8823 tp->t_rxtshift = 0;
8824 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
8825 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
8826 tp->t_softerror = 0;
8827 if (to && (to->to_flags & TOF_TS) &&
8828 (ack_type == CUM_ACKED) &&
8829 (to->to_tsecr) &&
8830 ((rsm->r_flags & RACK_OVERMAX) == 0)) {
8831 /*
8832 * Now which timestamp does it match? In this block the ACK
8833 * must be coming from a previous transmission.
8834 */
8835 for (i = 0; i < rsm->r_rtr_cnt; i++) {
8836 if (rack_ts_to_msec(rsm->r_tim_lastsent[i]) == to->to_tsecr) {
8837 t = cts - (uint32_t)rsm->r_tim_lastsent[i];
8838 if ((int)t <= 0)
8839 t = 1;
8840 if (CC_ALGO(tp)->rttsample != NULL) {
8841 /*
8842 * Kick the RTT to the CC. Here
8843 * we lie a bit in that we claim the
8844 * measurement is correct even though
8845 * we retransmitted. This is because
8846 * the timestamps match.
8847 */
8848 if (TSTMP_GT(tcp_tv_to_usec(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[i]))
8849 us_rtt = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[i];
8850 else
8851 us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[i];
8852 CC_ALGO(tp)->rttsample(&tp->t_ccv, us_rtt, 1, rsm->r_fas);
8853 }
8854 if ((i + 1) < rsm->r_rtr_cnt) {
8855 /*
8856 * The peer ack'd from our previous
8857 * transmission. We have a spurious
8858 * retransmission and thus we don't
8859 * want to update our rack_rtt.
8860 *
8861 * Hmm should there be a CC revert here?
8862 *
8863 */
8864 return (0);
8865 }
8866 if (!tp->t_rttlow || tp->t_rttlow > t)
8867 tp->t_rttlow = t;
8868 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
8869 rack->r_ctl.rc_rack_min_rtt = t;
8870 if (rack->r_ctl.rc_rack_min_rtt == 0) {
8871 rack->r_ctl.rc_rack_min_rtt = 1;
8872 }
8873 }
8874 if ((rack->r_ctl.rc_rack_tmit_time == 0) ||
8875 (SEQ_LT(rack->r_ctl.rc_rack_tmit_time,
8876 (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]))) {
8877 /* New more recent rack_tmit_time */
8878 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
8879 if (rack->r_ctl.rc_rack_tmit_time == 0)
8880 rack->r_ctl.rc_rack_tmit_time = 1;
8881 rack->rc_rack_rtt = t;
8882 }
8883 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[i], cts, 3);
8884 tcp_rack_xmit_timer(rack, t + 1, len_acked, t, 0, rsm,
8885 rsm->r_rtr_cnt);
8886 return (1);
8887 }
8888 }
8889 /* If we are logging, log out the sendmap */
8890 if (tcp_bblogging_on(rack->rc_tp)) {
8891 for (i = 0; i < rsm->r_rtr_cnt; i++) {
8892 rack_log_rtt_sendmap(rack, i, rsm->r_tim_lastsent[i], to->to_tsecr);
8893 }
8894 }
8895 goto ts_not_found;
8896 } else {
8897 /*
8898 * Ok, it's a SACK block that we retransmitted, or a Windows
8899 * machine without timestamps. We can tell nothing from the
8900 * timestamp, since either it's not there or it reflects the time
8901 * the peer last received a segment that moved its cum-ack point forward.
8902 */
8903 ts_not_found:
8904 i = rsm->r_rtr_cnt - 1;
8905 t = cts - (uint32_t)rsm->r_tim_lastsent[i];
8906 if ((int)t <= 0)
8907 t = 1;
8908 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
8909 /*
8910 * We retransmitted and the ack came back in less
8911 * than the smallest rtt we have observed. We most
8912 * likely did an improper retransmit as outlined in
8913 * 6.2 Step 2 point 2 in the rack-draft so we
8914 * don't want to update our rack_rtt. We in
8915 * theory (in future) might want to think about reverting our
8916 * cwnd state but we won't for now.
8917 */
8918 return (0);
8919 } else if (rack->r_ctl.rc_rack_min_rtt) {
8920 /*
8921 * We retransmitted it and the retransmit did the
8922 * job.
8923 */
8924 if (!rack->r_ctl.rc_rack_min_rtt ||
8925 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) {
8926 rack->r_ctl.rc_rack_min_rtt = t;
8927 if (rack->r_ctl.rc_rack_min_rtt == 0) {
8928 rack->r_ctl.rc_rack_min_rtt = 1;
8929 }
8930 }
8931 if ((rack->r_ctl.rc_rack_tmit_time == 0) ||
8932 (SEQ_LT(rack->r_ctl.rc_rack_tmit_time,
8933 (uint32_t)rsm->r_tim_lastsent[i]))) {
8934 /* New more recent rack_tmit_time */
8935 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[i];
8936 if (rack->r_ctl.rc_rack_tmit_time == 0)
8937 rack->r_ctl.rc_rack_tmit_time = 1;
8938 rack->rc_rack_rtt = t;
8939 }
8940 return (1);
8941 }
8942 }
8943 return (0);
8944 }
8945
8946 /*
8947 * Mark the SACK_PASSED flag on all entries prior to rsm send wise.
8948 */
8949 static void
8950 rack_log_sack_passed(struct tcpcb *tp,
8951 struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t cts)
8952 {
8953 struct rack_sendmap *nrsm;
8954 uint32_t thresh;
8955
8956 /* Get our rxt threshold for lost consideration */
8957 thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(tp, rack), cts, __LINE__, 0);
8958 /* Now start looking at rsm's */
8959 nrsm = rsm;
8960 TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap,
8961 rack_head, r_tnext) {
8962 if (nrsm == rsm) {
8963 /* Skip the original segment; it is acked */
8964 continue;
8965 }
8966 if (nrsm->r_flags & RACK_ACKED) {
8967 /*
8968 * Skip ack'd segments, though we
8969 * should not see these, since tmap
8970 * should not have ack'd segments.
8971 */
8972 continue;
8973 }
8974 if (nrsm->r_flags & RACK_RWND_COLLAPSED) {
8975 /*
8976 * If the peer dropped the rwnd on
8977 * these then we don't worry about them.
8978 */
8979 continue;
8980 }
8981 /* Check lost state */
8982 if ((nrsm->r_flags & RACK_WAS_LOST) == 0) {
8983 uint32_t exp;
8984
8985 exp = ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]) + thresh;
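/*
 * E.g. if the sacked rsm was last sent at cts 100000 us and
 * thresh is 45000 us, exp is 145000 us: once cts reaches that,
 * this earlier-sent, not-yet-sacked segment is marked lost.
 */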
8986 if (TSTMP_LT(exp, cts) || (exp == cts)) {
8987 /* We consider it lost */
8988 nrsm->r_flags |= RACK_WAS_LOST;
8989 rack->r_ctl.rc_considered_lost += nrsm->r_end - nrsm->r_start;
8990 }
8991 }
8992 if (nrsm->r_flags & RACK_SACK_PASSED) {
8993 /*
8994 * We found one that is already marked
8995 * passed; we have been here before and
8996 * so all others below this are marked.
8997 */
8998 break;
8999 }
9000 nrsm->r_flags |= RACK_SACK_PASSED;
9001 nrsm->r_flags &= ~RACK_WAS_SACKPASS;
9002 }
9003 }
9004
9005 static void
9006 rack_need_set_test(struct tcpcb *tp,
9007 struct tcp_rack *rack,
9008 struct rack_sendmap *rsm,
9009 tcp_seq th_ack,
9010 int line,
9011 int use_which)
9012 {
9013 struct rack_sendmap *s_rsm;
9014
9015 if ((tp->t_flags & TF_GPUTINPROG) &&
9016 SEQ_GEQ(rsm->r_end, tp->gput_seq)) {
9017 /*
9018 * We were app limited, and this ack
9019 * butts up or goes beyond the point where we want
9020 * to start our next measurement. We need
9021 * to record the new gput_ts here and
9022 * possibly update the start sequence.
9023 */
9024 uint32_t seq, ts;
9025
9026 if (rsm->r_rtr_cnt > 1) {
9027 /*
9028 * This is a retransmit, can we
9029 * really make any assessment at this
9030 * point? We are not really sure of
9031 * the timestamp, is it this or the
9032 * previous transmission?
9033 *
9034 * Let's wait for something better that
9035 * is not retransmitted.
9036 */
9037 return;
9038 }
9039 seq = tp->gput_seq;
9040 ts = tp->gput_ts;
9041 rack->app_limited_needs_set = 0;
9042 tp->gput_ts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time);
9043 /* Do we start at a new end? */
9044 if ((use_which == RACK_USE_BEG) &&
9045 SEQ_GEQ(rsm->r_start, tp->gput_seq)) {
9046 /*
9047 * When we get an ACK that just eats
9048 * up some of the rsm, we set RACK_USE_BEG
9049 * since what's at r_start (i.e. th_ack)
9050 * is left unacked and that's where the
9051 * measurement now starts.
9052 */
9053 tp->gput_seq = rsm->r_start;
9054 }
9055 if ((use_which == RACK_USE_END) &&
9056 SEQ_GEQ(rsm->r_end, tp->gput_seq)) {
9057 /*
9058 * We use the end when the cumack
9059 * is moving forward and completely
9060 * deleting the rsm passed, so basically
9061 * r_end holds th_ack.
9062 *
9063 * For SACK's we also want to use the end
9064 * since this piece just got sacked and
9065 * we want to target anything after that
9066 * in our measurement.
9067 */
9068 tp->gput_seq = rsm->r_end;
9069 }
9070 if (use_which == RACK_USE_END_OR_THACK) {
9071 /*
9072 * Special case for the ack moving forward,
9073 * not a sack; we need to move all the
9074 * way up to where this ack's cum-ack moves
9075 * to.
9076 */
9077 if (SEQ_GT(th_ack, rsm->r_end))
9078 tp->gput_seq = th_ack;
9079 else
9080 tp->gput_seq = rsm->r_end;
9081 }
9082 if (SEQ_LT(tp->gput_seq, tp->snd_max))
9083 s_rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq);
9084 else
9085 s_rsm = NULL;
9086 /*
9087 * Pick up the correct send time if we can. The rsm passed in
9088 * may be equal to s_rsm if RACK_USE_BEG was set. For the other
9089 * two cases (RACK_USE_THACK or RACK_USE_END) most likely we will
9090 * find a different seq i.e. the next send up.
9091 *
9092 * If that has not been sent, s_rsm will be NULL and we must
9093 * arrange it so this function will get called again by setting
9094 * app_limited_needs_set.
9095 */
9096 if (s_rsm)
9097 rack->r_ctl.rc_gp_output_ts = s_rsm->r_tim_lastsent[0];
9098 else {
9099 /* If we hit here we have to have *not* sent tp->gput_seq */
9100 rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[0];
9101 /* Set it up so we will go through here again */
9102 rack->app_limited_needs_set = 1;
9103 }
9104 if (SEQ_GT(tp->gput_seq, tp->gput_ack)) {
9105 /*
9106 * We moved beyond this guy's range, re-calculate
9107 * the new end point.
9108 */
9109 if (rack->rc_gp_filled == 0) {
9110 tp->gput_ack = tp->gput_seq + max(rc_init_window(rack), (MIN_GP_WIN * ctf_fixed_maxseg(tp)));
9111 } else {
9112 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
9113 }
9114 }
9115 /*
9116 * We are moving the goal post, we may be able to clear the
9117 * measure_saw_probe_rtt flag.
9118 */
9119 if ((rack->in_probe_rtt == 0) &&
9120 (rack->measure_saw_probe_rtt) &&
9121 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit)))
9122 rack->measure_saw_probe_rtt = 0;
9123 rack_log_pacing_delay_calc(rack, ts, tp->gput_ts,
9124 seq, tp->gput_seq,
9125 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) |
9126 (uint64_t)rack->r_ctl.rc_gp_output_ts),
9127 5, line, NULL, 0);
9128 if (rack->rc_gp_filled &&
9129 ((tp->gput_ack - tp->gput_seq) <
9130 max(rc_init_window(rack), (MIN_GP_WIN *
9131 ctf_fixed_maxseg(tp))))) {
9132 uint32_t ideal_amount;
9133
9134 ideal_amount = rack_get_measure_window(tp, rack);
9135 if (ideal_amount > sbavail(&tptosocket(tp)->so_snd)) {
9136 /*
9137 * There is no sense in continuing this measurement
9138 * because it's too small to gain us anything we
9139 * trust. Skip it and that way we can start a new
9140 * measurement quicker.
9141 */
9142 tp->t_flags &= ~TF_GPUTINPROG;
9143 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq,
9144 0, 0,
9145 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) |
9146 (uint64_t)rack->r_ctl.rc_gp_output_ts),
9147 6, __LINE__, NULL, 0);
9148 } else {
9149 /*
9150 * Reset the window further out.
9151 */
9152 tp->gput_ack = tp->gput_seq + ideal_amount;
9153 }
9154 }
9155 rack_tend_gp_marks(tp, rack);
9156 rack_log_gpset(rack, tp->gput_ack, 0, 0, line, 2, rsm);
9157 }
9158 }
9159
9160 static inline int
9161 is_rsm_inside_declared_tlp_block(struct tcp_rack *rack, struct rack_sendmap *rsm)
9162 {
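/*
 * E.g. with a declared TLP block of [1000, 1100): an rsm of
 * [1040, 1080) is a sub-part and returns 1, while [900, 990)
 * (behind it) or [1150, 1200) (beyond it) returns 0.
 */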
9163 if (SEQ_LT(rsm->r_end, rack->r_ctl.last_tlp_acked_start)) {
9164 /* Behind our TLP definition or right at */
9165 return (0);
9166 }
9167 if (SEQ_GT(rsm->r_start, rack->r_ctl.last_tlp_acked_end)) {
9168 /* The start is beyond or right at our end of TLP definition */
9169 return (0);
9170 }
9171 /* It has to be a sub-part of the original TLP recorded */
9172 return (1);
9173 }
9174
9175 static uint32_t
9176 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack,
9177 struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts,
9178 uint32_t segsiz)
9179 {
9180 uint32_t start, end, changed = 0;
9181 struct rack_sendmap stack_map;
9182 struct rack_sendmap *rsm, *nrsm, *prev, *next;
9183 int insret __diagused;
9184 int32_t used_ref = 1;
9185 int can_use_hookery = 0;
9186
9187 start = sack->start;
9188 end = sack->end;
9189 rsm = *prsm;
9190
9191 do_rest_ofb:
9192 if ((rsm == NULL) ||
9193 (SEQ_LT(end, rsm->r_start)) ||
9194 (SEQ_GEQ(start, rsm->r_end)) ||
9195 (SEQ_LT(start, rsm->r_start))) {
9196 /*
9197 * We are not in the right spot,
9198 * find the correct spot in the tree.
9199 */
9200 used_ref = 0;
9201 rsm = tqhash_find(rack->r_ctl.tqh, start);
9202 }
9203 if (rsm == NULL) {
9204 /* TSNH */
9205 goto out;
9206 }
9207 /* Ok we have an ACK for some piece of this rsm */
9208 if (rsm->r_start != start) {
9209 if ((rsm->r_flags & RACK_ACKED) == 0) {
9210 /*
9211 * Before any splitting or hookery is
9212 * done, is it a TLP of interest, i.e. a rxt?
9213 */
9214 if ((rsm->r_flags & RACK_TLP) &&
9215 (rsm->r_rtr_cnt > 1)) {
9216 /*
9217 * We are splitting a rxt TLP, check
9218 * if we need to save off the start/end
9219 */
9220 if (rack->rc_last_tlp_acked_set &&
9221 (is_rsm_inside_declared_tlp_block(rack, rsm))) {
9222 /*
9223 * We already turned this on since the
9224 * previous one was partially sacked and we are inside
9225 * it; now we are getting another one (maybe all of it).
9226 *
9227 */
9228 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
9229 /*
9230 * Let's make sure we have all of it though.
9231 */
9232 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
9233 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9234 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9235 rack->r_ctl.last_tlp_acked_end);
9236 }
9237 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
9238 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
9239 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9240 rack->r_ctl.last_tlp_acked_end);
9241 }
9242 } else {
9243 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9244 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
9245 rack->rc_last_tlp_past_cumack = 0;
9246 rack->rc_last_tlp_acked_set = 1;
9247 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
9248 }
9249 }
9250 /**
9251 * Need to split this in two pieces the before and after,
9252 * the before remains in the map, the after must be
9253 * added. In other words we have:
9254 * rsm |--------------|
9255 * sackblk |------->
9256 * rsm will become
9257 * rsm |---|
9258 * and nrsm will be the sacked piece
9259 * nrsm |----------|
9260 *
9261 * But before we start down that path let's
9262 * see if the sack spans over on top of
9263 * the next guy and it is already sacked.
9264 *
9265 */
9266 /*
9267 * Hookery can only be used if the two entries
9268 * are in the same bucket and neither one of
9269 * them straddles the bucket line.
9270 */
9271 next = tqhash_next(rack->r_ctl.tqh, rsm);
9272 if (next &&
9273 (rsm->bindex == next->bindex) &&
9274 ((rsm->r_flags & RACK_STRADDLE) == 0) &&
9275 ((next->r_flags & RACK_STRADDLE) == 0) &&
9276 ((rsm->r_flags & RACK_IS_PCM) == 0) &&
9277 ((next->r_flags & RACK_IS_PCM) == 0) &&
9278 (rsm->r_flags & RACK_IN_GP_WIN) &&
9279 (next->r_flags & RACK_IN_GP_WIN))
9280 can_use_hookery = 1;
9281 else
9282 can_use_hookery = 0;
9283 if (next && can_use_hookery &&
9284 (next->r_flags & RACK_ACKED) &&
9285 SEQ_GEQ(end, next->r_start)) {
9286 /**
9287 * So the next one is already acked, and
9288 * we can thus by hookery use our stack_map
9289 * to reflect the piece being sacked and
9290 * then adjust the two tree entries moving
9291 * the start and ends around. So we start like:
9292 * rsm |------------| (not-acked)
9293 * next |-----------| (acked)
9294 * sackblk |-------->
9295 * We want to end like so:
9296 * rsm |------| (not-acked)
9297 * next |-----------------| (acked)
9298 * nrsm |-----|
9299 * Where nrsm is a temporary stack piece we
9300 * use to update all the gizmos.
9301 */
9302 /* Copy up our fudge block */
9303 nrsm = &stack_map;
9304 memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
9305 /* Now adjust our tree blocks */
9306 tqhash_update_end(rack->r_ctl.tqh, rsm, start);
9307 next->r_start = start;
9308 rsm->r_flags |= RACK_SHUFFLED;
9309 next->r_flags |= RACK_SHUFFLED;
9310 /* Now we must adjust back where next->m is */
9311 rack_setup_offset_for_rsm(rack, rsm, next);
9312 /*
9313 * Which timestamp do we keep? It is rather
9314 * important in GP measurements to have the
9315 * accurate end of the send window.
9316 *
9317 * We keep the largest value, which is the newest
9318 * send. We do this in case a segment that is
9319 * joined together and not part of a GP estimate
9320 * later gets expanded into the GP estimate.
9321 *
9322 * We prohibit the merging of unlike kinds i.e.
9323 * all pieces that are in the GP estimate can be
9324 * merged and all pieces that are not in a GP estimate
9325 * can be merged, but not dissimilar pieces. Combine
9326 * this with taking the highest here and we should
9327 * be ok unless of course the client reneges. Then
9328 * all bets are off.
9329 */
9330 if (next->r_tim_lastsent[(next->r_rtr_cnt-1)] <
9331 nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)])
9332 next->r_tim_lastsent[(next->r_rtr_cnt-1)] = nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)];
9333 /*
9334 * And we must keep the newest ack arrival time.
9335 */
9336 if (next->r_ack_arrival <
9337 rack_to_usec_ts(&rack->r_ctl.act_rcv_time))
9338 next->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
9339
9340
9341 /* We don't need to adjust rsm, it did not change */
9342 /* Clear out the dup ack count of the remainder */
9343 rsm->r_dupack = 0;
9344 rsm->r_just_ret = 0;
9345 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
9346 /* Now lets make sure our fudge block is right */
9347 nrsm->r_start = start;
9348 /* Now lets update all the stats and such */
9349 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0);
9350 if (rack->app_limited_needs_set)
9351 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END);
9352 changed += (nrsm->r_end - nrsm->r_start);
9353 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
9354 if (rsm->r_flags & RACK_WAS_LOST) {
9355 int my_chg;
9356
9357 /*
9358 * Note here we do not use our rack_mark_nolonger_lost() function
9359 * since we are moving our data pointer around and the
9360 * ack'ed side is already not considered lost.
9361 */
9362 my_chg = (nrsm->r_end - nrsm->r_start);
9363 KASSERT((rack->r_ctl.rc_considered_lost >= my_chg),
9364 ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack));
9365 if (my_chg <= rack->r_ctl.rc_considered_lost)
9366 rack->r_ctl.rc_considered_lost -= my_chg;
9367 else
9368 rack->r_ctl.rc_considered_lost = 0;
9369 }
9370 if (nrsm->r_flags & RACK_SACK_PASSED) {
9371 rack->r_ctl.rc_reorder_ts = cts;
9372 if (rack->r_ctl.rc_reorder_ts == 0)
9373 rack->r_ctl.rc_reorder_ts = 1;
9374 }
9375 /*
9376 * Now we want to go up from rsm (the
9377 * one left un-acked) to the next one
9378 * in the tmap. We do this so when
9379 * we walk backwards we include marking
9380 * sack-passed on rsm (The one passed in
9381 * is skipped since it is generally called
9382 * on something sacked before removing it
9383 * from the tmap).
9384 */
9385 if (rsm->r_in_tmap) {
9386 nrsm = TAILQ_NEXT(rsm, r_tnext);
9387 /*
9388 * Now that we have the next
9389 * one walk backwards from there.
9390 */
9391 if (nrsm && nrsm->r_in_tmap)
9392 rack_log_sack_passed(tp, rack, nrsm, cts);
9393 }
9394 /* Now are we done? */
9395 if (SEQ_LT(end, next->r_end) ||
9396 (end == next->r_end)) {
9397 /* Done with block */
9398 goto out;
9399 }
9400 rack_log_map_chg(tp, rack, &stack_map, rsm, next, MAP_SACK_M1, end, __LINE__);
9401 /* Position for the next block */
9402 start = next->r_end;
9403 rsm = tqhash_next(rack->r_ctl.tqh, next);
9404 if (rsm == NULL)
9405 goto out;
9406 } else {
9407 /**
9408 * We can't use any hookery here, so we
9409 * need to split the map. We enter like
9410 * so:
9411 * rsm |--------|
9412 * sackblk |----->
9413 * We will add the new block nrsm and
9414 * that will be the new portion, and then
9415 * fall through after resetting rsm. So we
9416 * split and look like this:
9417 * rsm |----|
9418 * sackblk |----->
9419 * nrsm |---|
9420 * We then fall through resetting
9421 * rsm to nrsm, so the next block
9422 * picks it up.
9423 */
9424 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
9425 if (nrsm == NULL) {
9426 /*
9427 * failed XXXrrs what can we do but lose the sack
9428 * info?
9429 */
9430 goto out;
9431 }
9432 rack_clone_rsm(rack, nrsm, rsm, start);
9433 rsm->r_just_ret = 0;
9434 #ifndef INVARIANTS
9435 (void)tqhash_insert(rack->r_ctl.tqh, nrsm);
9436 #else
9437 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) {
9438 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p",
9439 nrsm, insret, rack, rsm);
9440 }
9441 #endif
9442 if (rsm->r_in_tmap) {
9443 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
9444 nrsm->r_in_tmap = 1;
9445 }
9446 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M2, end, __LINE__);
9447 rsm->r_flags &= (~RACK_HAS_FIN);
9448 /* Position us to point to the new nrsm that starts the sack blk */
9449 rsm = nrsm;
9450 }
9451 } else {
9452 /* Already sacked this piece */
9453 if (end == rsm->r_end) {
9454 /* Done with block */
9455 rsm = tqhash_next(rack->r_ctl.tqh, rsm);
9456 goto out;
9457 } else if (SEQ_LT(end, rsm->r_end)) {
9458 /* A partial sack to an already sacked block */
9459 rsm = tqhash_next(rack->r_ctl.tqh, rsm);
9460 goto out;
9461 } else {
9462 /*
9463 * The end goes beyond this guy;
9464 * reposition the start to the
9465 * next block.
9466 */
9467 start = rsm->r_end;
9468 rsm = tqhash_next(rack->r_ctl.tqh, rsm);
9469 if (rsm == NULL)
9470 goto out;
9471 }
9472 }
9473 }
9474 if (SEQ_GEQ(end, rsm->r_end)) {
9475 /**
9476 * The end of this block is either beyond this guy or right
9477 * at this guy. I.e.:
9478 * rsm --- |-----|
9479 * end |-----|
9480 * <or>
9481 * end |---------|
9482 */
9483 if ((rsm->r_flags & RACK_ACKED) == 0) {
9484 /*
9485 * Is it a TLP of interest?
9486 */
9487 if ((rsm->r_flags & RACK_TLP) &&
9488 (rsm->r_rtr_cnt > 1)) {
9489 /*
9490 * We are splitting a rxt TLP, check
9491 * if we need to save off the start/end
9492 */
9493 if (rack->rc_last_tlp_acked_set &&
9494 (is_rsm_inside_declared_tlp_block(rack, rsm))) {
9495 /*
9496 * We already turned this on since we are inside
9497 * the declared TLP block; the previous SACK was partial and
9498 * now we are getting another one (maybe all of it).
9499 */
9500 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
9501 /*
9502 * Lets make sure we have all of it though.
9503 */
9504 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
9505 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9506 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9507 rack->r_ctl.last_tlp_acked_end);
9508 }
9509 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
9510 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
9511 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9512 rack->r_ctl.last_tlp_acked_end);
9513 }
9514 } else {
9515 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9516 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
9517 rack->rc_last_tlp_past_cumack = 0;
9518 rack->rc_last_tlp_acked_set = 1;
9519 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
9520 }
9521 }
9522 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0);
9523 changed += (rsm->r_end - rsm->r_start);
9524 /* You get a count for acking a whole segment or more */
9525 if (rsm->r_flags & RACK_WAS_LOST) {
9526 /*
9527 * Here we can use the inline function since
9528 * the rsm is truly marked lost and now no longer lost.
9529 */
9530 rack_mark_nolonger_lost(rack, rsm);
9531 }
9532 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
9533 if (rsm->r_in_tmap) /* should be true */
9534 rack_log_sack_passed(tp, rack, rsm, cts);
9535 /* Is reordering occurring? */
9536 if (rsm->r_flags & RACK_SACK_PASSED) {
9537 rsm->r_flags &= ~RACK_SACK_PASSED;
9538 rack->r_ctl.rc_reorder_ts = cts;
9539 if (rack->r_ctl.rc_reorder_ts == 0)
9540 rack->r_ctl.rc_reorder_ts = 1;
9541 }
9542 if (rack->app_limited_needs_set)
9543 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END);
9544 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
9545 rsm->r_flags |= RACK_ACKED;
9546 rack_update_pcm_ack(rack, 0, rsm->r_start, rsm->r_end);
9547 if (rsm->r_in_tmap) {
9548 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
9549 rsm->r_in_tmap = 0;
9550 }
9551 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_SACK_M3, end, __LINE__);
9552 }
9553 if (end == rsm->r_end) {
9554 /* This block only - done, setup for next */
9555 goto out;
9556 }
9557 /*
9558 * There is more not covered by this rsm, move on
9559 * to the next block in the tail queue hash table.
9560 */
9561 nrsm = tqhash_next(rack->r_ctl.tqh, rsm);
9562 start = rsm->r_end;
9563 rsm = nrsm;
9564 if (rsm == NULL)
9565 goto out;
9566 goto do_rest_ofb;
9567 }
9568 /**
9569 * The end of this sack block is smaller than
9570 * our rsm i.e.:
9571 * rsm --- |-----|
9572 * end |--|
9573 */
9574 if ((rsm->r_flags & RACK_ACKED) == 0) {
9575 /*
9576 * Is it a TLP of interest?
9577 */
9578 if ((rsm->r_flags & RACK_TLP) &&
9579 (rsm->r_rtr_cnt > 1)) {
9580 /*
9581 * We are splitting a rxt TLP, check
9582 * if we need to save off the start/end
9583 */
9584 if (rack->rc_last_tlp_acked_set &&
9585 (is_rsm_inside_declared_tlp_block(rack, rsm))) {
9586 /*
9587 * We already turned this on since we are inside
9588 * the declared TLP block; the previous SACK was partial and
9589 * now we are getting another one (maybe all of it).
9590 */
9591 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
9592 /*
9593 * Lets make sure we have all of it though.
9594 */
9595 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
9596 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9597 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9598 rack->r_ctl.last_tlp_acked_end);
9599 }
9600 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
9601 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
9602 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9603 rack->r_ctl.last_tlp_acked_end);
9604 }
9605 } else {
9606 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9607 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
9608 rack->rc_last_tlp_past_cumack = 0;
9609 rack->rc_last_tlp_acked_set = 1;
9610 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
9611 }
9612 }
9613 /*
9614 * Hookery can only be used if the two entries
9615 * are in the same bucket and neither one of
9616 * them straddles the bucket line.
9617 */
9618 prev = tqhash_prev(rack->r_ctl.tqh, rsm);
9619 if (prev &&
9620 (rsm->bindex == prev->bindex) &&
9621 ((rsm->r_flags & RACK_STRADDLE) == 0) &&
9622 ((prev->r_flags & RACK_STRADDLE) == 0) &&
9623 ((rsm->r_flags & RACK_IS_PCM) == 0) &&
9624 ((prev->r_flags & RACK_IS_PCM) == 0) &&
9625 (rsm->r_flags & RACK_IN_GP_WIN) &&
9626 (prev->r_flags & RACK_IN_GP_WIN))
9627 can_use_hookery = 1;
9628 else
9629 can_use_hookery = 0;
9630 if (prev && can_use_hookery &&
9631 (prev->r_flags & RACK_ACKED)) {
9632 /**
9633 * Goal, we want the right remainder of rsm to shrink
9634 * in place and span from (rsm->r_start = end) to rsm->r_end.
9635 * We want to expand prev to go all the way
9636 * to prev->r_end <- end.
9637 * so in the tree we have before:
9638 * prev |--------| (acked)
9639 * rsm |-------| (non-acked)
9640 * sackblk |-|
9641 * We churn it so we end up with
9642 * prev |----------| (acked)
9643 * rsm |-----| (non-acked)
9644 * nrsm |-| (temporary)
9645 *
9646 * Note if either prev/rsm is a TLP we don't
9647 * do this.
9648 */
9649 nrsm = &stack_map;
9650 memcpy(nrsm, rsm, sizeof(struct rack_sendmap));
9651 tqhash_update_end(rack->r_ctl.tqh, prev, end);
9652 rsm->r_start = end;
9653 rsm->r_flags |= RACK_SHUFFLED;
9654 prev->r_flags |= RACK_SHUFFLED;
9655 /*
9656 * Now adjust nrsm (stack copy) to be
9657 * the small piece that was "sacked".
9658 */
9659 nrsm->r_end = end;
9660 rsm->r_dupack = 0;
9661 /*
9662 * Which timestamp do we keep? It is rather
9663 * important in GP measurements to have the
9664 * accurate end of the send window.
9665 *
9666 * We keep the largest value, which is the newest
9667 * send. We do this in case a segment that is
9668 * joined together and not part of a GP estimate
9669 * later gets expanded into the GP estimate.
9670 *
9671 * We prohibit the merging of unlike kinds i.e.
9672 * all pieces that are in the GP estimate can be
9673 * merged and all pieces that are not in a GP estimate
9674 * can be merged, but not dissimilar pieces. Combine
9675 * this with taking the highest here and we should
9676 * be ok unless of course the client reneges. Then
9677 * all bets are off.
9678 */
9679 if (prev->r_tim_lastsent[(prev->r_rtr_cnt-1)] <
9680 nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)]) {
9681 prev->r_tim_lastsent[(prev->r_rtr_cnt-1)] = nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)];
9682 }
9683 /*
9684 * And we must keep the newest ack arrival time.
9685 */
9686
9687 if (prev->r_ack_arrival <
9688 rack_to_usec_ts(&rack->r_ctl.act_rcv_time))
9689 prev->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
9690
9691 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
9692 /*
9693 * Now that the rsm has had its start moved forward
9694 * lets go ahead and get its new place in the world.
9695 */
9696 rack_setup_offset_for_rsm(rack, prev, rsm);
9697 /*
9698 * Now nrsm is our new little piece
9699 * that is acked (which was merged
9700 * to prev). Update the rtt and changed
9701 * based on that. Also check for reordering.
9702 */
9703 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0);
9704 if (rack->app_limited_needs_set)
9705 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END);
9706 changed += (nrsm->r_end - nrsm->r_start);
9707 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
9708 if (rsm->r_flags & RACK_WAS_LOST) {
9709 int my_chg;
9710
9711 /*
9712 * Note here we are using hookery again so we can't
9713 * use our rack_mark_nolonger_lost() function.
9714 */
9715 my_chg = (nrsm->r_end - nrsm->r_start);
9716 KASSERT((rack->r_ctl.rc_considered_lost >= my_chg),
9717 ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack));
9718 if (my_chg <= rack->r_ctl.rc_considered_lost)
9719 rack->r_ctl.rc_considered_lost -= my_chg;
9720 else
9721 rack->r_ctl.rc_considered_lost = 0;
9722 }
9723 if (nrsm->r_flags & RACK_SACK_PASSED) {
9724 rack->r_ctl.rc_reorder_ts = cts;
9725 if (rack->r_ctl.rc_reorder_ts == 0)
9726 rack->r_ctl.rc_reorder_ts = 1;
9727 }
9728 rack_log_map_chg(tp, rack, prev, &stack_map, rsm, MAP_SACK_M4, end, __LINE__);
9729 rsm = prev;
9730 } else {
9731 /**
9732 * This is the case where our previous
9733 * block is not acked either, so we must
9734 * split the block in two.
9735 */
9736 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
9737 if (nrsm == NULL) {
9738 /* failed rrs what can we do but lose the sack info? */
9739 goto out;
9740 }
9741 if ((rsm->r_flags & RACK_TLP) &&
9742 (rsm->r_rtr_cnt > 1)) {
9743 /*
9744 * We are splitting a rxt TLP, check
9745 * if we need to save off the start/end
9746 */
9747 if (rack->rc_last_tlp_acked_set &&
9748 (is_rsm_inside_declared_tlp_block(rack, rsm))) {
9749 /*
9750 * We already turned this on since this block is inside
9751 * the declared TLP block; the previous SACK was partial and
9752 * now we are getting another one (maybe all of it).
9753 */
9754 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
9755 /*
9756 * Lets make sure we have all of it though.
9757 */
9758 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
9759 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9760 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9761 rack->r_ctl.last_tlp_acked_end);
9762 }
9763 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
9764 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
9765 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9766 rack->r_ctl.last_tlp_acked_end);
9767 }
9768 } else {
9769 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9770 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
9771 rack->rc_last_tlp_acked_set = 1;
9772 rack->rc_last_tlp_past_cumack = 0;
9773 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
9774 }
9775 }
9776 /**
9777 * In this case nrsm becomes
9778 * nrsm->r_start = end;
9779 * nrsm->r_end = rsm->r_end;
9780 * which is un-acked.
9781 * <and>
9782 * rsm->r_end = nrsm->r_start;
9783 * i.e. the remaining un-acked
9784 * piece is left on the left
9785 * hand side.
9786 *
9787 * So we start like this
9788 * rsm |----------| (not acked)
9789 * sackblk |---|
9790 * build it so we have
9791 * rsm |---| (acked)
9792 * nrsm |------| (not acked)
9793 */
9794 rack_clone_rsm(rack, nrsm, rsm, end);
9795 rsm->r_flags &= (~RACK_HAS_FIN);
9796 rsm->r_just_ret = 0;
9797 #ifndef INVARIANTS
9798 (void)tqhash_insert(rack->r_ctl.tqh, nrsm);
9799 #else
9800 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) {
9801 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p",
9802 nrsm, insret, rack, rsm);
9803 }
9804 #endif
9805 if (rsm->r_in_tmap) {
9806 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
9807 nrsm->r_in_tmap = 1;
9808 }
9809 nrsm->r_dupack = 0;
9810 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2);
9811 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0);
9812 changed += (rsm->r_end - rsm->r_start);
9813 if (rsm->r_flags & RACK_WAS_LOST) {
9814 /*
9815 * Here it is safe to use our function.
9816 */
9817 rack_mark_nolonger_lost(rack, rsm);
9818 }
9819 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
9820
9821 if (rsm->r_in_tmap) /* should be true */
9822 rack_log_sack_passed(tp, rack, rsm, cts);
9823 /* Is reordering occurring? */
9824 if (rsm->r_flags & RACK_SACK_PASSED) {
9825 rsm->r_flags &= ~RACK_SACK_PASSED;
9826 rack->r_ctl.rc_reorder_ts = cts;
9827 if (rack->r_ctl.rc_reorder_ts == 0)
9828 rack->r_ctl.rc_reorder_ts = 1;
9829 }
9830 if (rack->app_limited_needs_set)
9831 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END);
9832 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
9833 rsm->r_flags |= RACK_ACKED;
9834 rack_update_pcm_ack(rack, 0, rsm->r_start, rsm->r_end);
9835 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M5, end, __LINE__);
9836 if (rsm->r_in_tmap) {
9837 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
9838 rsm->r_in_tmap = 0;
9839 }
9840 }
9841 }
9842 out:
9843 if (rsm &&
9844 ((rsm->r_flags & RACK_TLP) == 0) &&
9845 (rsm->r_flags & RACK_ACKED)) {
9846 /*
9847 * Now can we merge where we worked
9848 * with either the previous or
9849 * next block?
9850 */
9851 next = tqhash_next(rack->r_ctl.tqh, rsm);
9852 while (next) {
9853 if (next->r_flags & RACK_TLP)
9854 break;
9855 /* Only allow merges between ones in or out of GP window */
9856 if ((next->r_flags & RACK_IN_GP_WIN) &&
9857 ((rsm->r_flags & RACK_IN_GP_WIN) == 0)) {
9858 break;
9859 }
9860 if ((rsm->r_flags & RACK_IN_GP_WIN) &&
9861 ((next->r_flags & RACK_IN_GP_WIN) == 0)) {
9862 break;
9863 }
9864 if (rsm->bindex != next->bindex)
9865 break;
9866 if (rsm->r_flags & RACK_STRADDLE)
9867 break;
9868 if (rsm->r_flags & RACK_IS_PCM)
9869 break;
9870 if (next->r_flags & RACK_STRADDLE)
9871 break;
9872 if (next->r_flags & RACK_IS_PCM)
9873 break;
9874 if (next->r_flags & RACK_ACKED) {
9875 /* yep this and next can be merged */
9876 rsm = rack_merge_rsm(rack, rsm, next);
9877 next = tqhash_next(rack->r_ctl.tqh, rsm);
9878 } else
9879 break;
9880 }
9881 /* Now what about the previous? */
9882 prev = tqhash_prev(rack->r_ctl.tqh, rsm);
9883 while (prev) {
9884 if (prev->r_flags & RACK_TLP)
9885 break;
9886 /* Only allow merges between ones in or out of GP window */
9887 if ((prev->r_flags & RACK_IN_GP_WIN) &&
9888 ((rsm->r_flags & RACK_IN_GP_WIN) == 0)) {
9889 break;
9890 }
9891 if ((rsm->r_flags & RACK_IN_GP_WIN) &&
9892 ((prev->r_flags & RACK_IN_GP_WIN) == 0)) {
9893 break;
9894 }
9895 if (rsm->bindex != prev->bindex)
9896 break;
9897 if (rsm->r_flags & RACK_STRADDLE)
9898 break;
9899 if (rsm->r_flags & RACK_IS_PCM)
9900 break;
9901 if (prev->r_flags & RACK_STRADDLE)
9902 break;
9903 if (prev->r_flags & RACK_IS_PCM)
9904 break;
9905 if (prev->r_flags & RACK_ACKED) {
9906 /* yep the previous and this can be merged */
9907 rsm = rack_merge_rsm(rack, prev, rsm);
9908 prev = tqhash_prev(rack->r_ctl.tqh, rsm);
9909 } else
9910 break;
9911 }
9912 }
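/*
* Accounting: note whether we could use the caller's cached rsm
* reference (short) or had to look the start up in the tree (all).
*/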
9913 if (used_ref == 0) {
9914 counter_u64_add(rack_sack_proc_all, 1);
9915 } else {
9916 counter_u64_add(rack_sack_proc_short, 1);
9917 }
9918 /* Save off the next one for quick reference. */
9919 nrsm = tqhash_find(rack->r_ctl.tqh, end);
9920 *prsm = rack->r_ctl.rc_sacklast = nrsm;
9921 return (changed);
9922 }
9923
9924 static void inline
9925 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack)
9926 {
9927 struct rack_sendmap *tmap;
9928
9929 tmap = NULL;
9930 while (rsm && (rsm->r_flags & RACK_ACKED)) {
9931 /* Its no longer sacked, mark it so */
9932 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
9933 #ifdef INVARIANTS
9934 if (rsm->r_in_tmap) {
9935 panic("rack:%p rsm:%p flags:0x%x in tmap?",
9936 rack, rsm, rsm->r_flags);
9937 }
9938 #endif
9939 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS);
9940 /* Rebuild it into our tmap */
9941 if (tmap == NULL) {
9942 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext);
9943 tmap = rsm;
9944 } else {
9945 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext);
9946 tmap = rsm;
9947 }
9948 tmap->r_in_tmap = 1;
9949 rsm = tqhash_next(rack->r_ctl.tqh, rsm);
9950 }
9951 /*
9952 * Now lets possibly clear the sack filter so we start
9953 * recognizing sacks that cover this area.
9954 */
9955 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack);
9956
9957 }
9958
9959
9960 static void inline
9961 rack_rsm_sender_update(struct tcp_rack *rack, struct tcpcb *tp, struct rack_sendmap *rsm, uint8_t from)
9962 {
9963 /*
9964 * We look at advancing the end send time for our GP
9965 * measurement tracking only as the cumulative acknowledgment
9966 * moves forward. You might wonder about this, why not
9967 * at every transmission or retransmission within the
9968 * GP window update the rc_gp_cumack_ts? Well its rather
9969 * nuanced but basically the GP window *may* expand (as
9970 * it does below) or, worse and harder to track, it may shrink.
9971 *
9972 * This last makes it impossible to track at the time of
9973 * the send, since you may set forward your rc_gp_cumack_ts
9974 * when you send, because that send *is* in your currently
9975 * "guessed" window, but then it shrinks. Now which was
9976 * the send time of the last bytes in the window, by the
9977 * time you ask that question that part of the sendmap
9978 * is freed. So you don't know and you will have too
9979 * long of a send window. Instead, by updating the time
9980 * marker only when the cumack advances this assures us
9981 * that we will have only the sends in the window of our
9982 * GP measurement.
9983 *
9984 * Another complication from this is the
9985 * merging of sendmap entries. During SACK processing this
9986 * can happen to conserve the sendmap size. That breaks
9987 * everything down in tracking the send window of the GP
9988 * estimate. So to prevent that and keep it working with
9989 * a tiny bit more limited merging, we only allow like
9990 * types to be merged. I.e. if two sends are in the GP window
9991 * then its ok to merge them together. If two sends are not
9992 * in the GP window its ok to merge them together too. Though
9993 * one send in and one send out cannot be merged. We combine
9994 * this with never allowing the shrinking of the GP window when
9995 * we are in recovery so that we can properly calculate the
9996 * sending times.
9997 *
9998 * This all of course seems complicated, because it is.. :)
9999 *
10000 * The cum-ack is being advanced upon the sendmap.
10001 * If we are not doing a GP estimate don't
10002 * proceed.
10003 */
10004 uint64_t ts;
10005
10006 if ((tp->t_flags & TF_GPUTINPROG) == 0)
10007 return;
10008 /*
10009 * If this sendmap entry is going
10010 * beyond the measurement window we had picked,
10011 * expand the measurement window by that much.
10012 */
10013 if (SEQ_GT(rsm->r_end, tp->gput_ack)) {
10014 tp->gput_ack = rsm->r_end;
10015 }
10016 /*
10017 * If we have not set up an ack, then we
10018 * have no idea if the newly acked pieces
10019 * will be "in our seq measurement range". If
10020 * it is when we clear the app_limited_needs_set
10021 * flag the timestamp will be updated.
10022 */
10023 if (rack->app_limited_needs_set)
10024 return;
10025 /*
10026 * Finally, we grab out the latest timestamp
10027 * that this packet was sent and then see
10028 * if:
10029 * a) The packet touches our newly defined GP range.
10030 * b) The time is greater (newer) than the
10031 * one we currently have. If so we update
10032 * our sending end time window.
10033 *
10034 * Note we *do not* do this at send time. The reason
10035 * is that if you do you *may* pick up a newer timestamp
10036 * for a range you are not going to measure. We project
10037 * out how far and then sometimes modify that to be
10038 * smaller. If that occurs then you will have a send
10039 * that does not belong to the range included.
10040 */
10041 if ((ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]) <=
10042 rack->r_ctl.rc_gp_cumack_ts)
10043 return;
10044 if (rack_in_gp_window(tp, rsm)) {
10045 rack->r_ctl.rc_gp_cumack_ts = ts;
10046 rack_log_gpset(rack, tp->gput_ack, (uint32_t)ts, rsm->r_end,
10047 __LINE__, from, rsm);
10048 }
10049 }
10050
10051 static void
10052 rack_process_to_cumack(struct tcpcb *tp, struct tcp_rack *rack, register uint32_t th_ack, uint32_t cts, struct tcpopt *to, uint64_t acktime)
10053 {
10054 struct rack_sendmap *rsm;
10055 /*
10056 * The ACK point is advancing to th_ack, we must drop off
10057 * the packets in the rack log and calculate any eligible
10058 * RTTs.
10059 */
10060
10061 if (sack_filter_blks_used(&rack->r_ctl.rack_sf)) {
10062 /*
10063 * If we have some sack blocks in the filter
10064 * lets prune them out by calling sfb with no blocks.
10065 */
10066 sack_filter_blks(tp, &rack->r_ctl.rack_sf, NULL, 0, th_ack);
10067 }
10068 if (SEQ_GT(th_ack, tp->snd_una)) {
10069 /* Clear any app ack remembered settings */
10070 rack->r_ctl.cleared_app_ack = 0;
10071 }
10072 rack->r_wanted_output = 1;
10073 if (SEQ_GT(th_ack, tp->snd_una))
10074 rack->r_ctl.last_cumack_advance = acktime;
10075
10076 /* Tend any TLP that has been marked for 1/2 the seq space (its old) */
10077 if ((rack->rc_last_tlp_acked_set == 1) &&
10078 (rack->rc_last_tlp_past_cumack == 1) &&
10079 (SEQ_GT(rack->r_ctl.last_tlp_acked_start, th_ack))) {
10080 /*
10081 * We have reached the point where our last rack
10082 * tlp retransmit sequence is ahead of the cum-ack.
10083 * This can only happen when the cum-ack moves all
10084 * the way around (its been a full 2^31+1 bytes
10085 * or more since we sent a retransmitted TLP). Lets
10086 * turn off the valid flag since its not really valid.
10087 *
10088 * Note since sacks also turn on this event we have
10089 * a complication: we have to wait to age it out until
10090 * the cum-ack is past the TLP, which is what the next
10091 * else clause checks.
10092 */
10093 rack_log_dsack_event(rack, 9, __LINE__,
10094 rack->r_ctl.last_tlp_acked_start,
10095 rack->r_ctl.last_tlp_acked_end);
10096 rack->rc_last_tlp_acked_set = 0;
10097 rack->rc_last_tlp_past_cumack = 0;
10098 } else if ((rack->rc_last_tlp_acked_set == 1) &&
10099 (rack->rc_last_tlp_past_cumack == 0) &&
10100 (SEQ_GEQ(th_ack, rack->r_ctl.last_tlp_acked_end))) {
10101 /*
10102 * It is safe to start aging TLP's out.
10103 */
10104 rack->rc_last_tlp_past_cumack = 1;
10105 }
10106 /* We do the same for the tlp send seq as well */
10107 if ((rack->rc_last_sent_tlp_seq_valid == 1) &&
10108 (rack->rc_last_sent_tlp_past_cumack == 1) &&
10109 (SEQ_GT(rack->r_ctl.last_sent_tlp_seq, th_ack))) {
10110 rack_log_dsack_event(rack, 9, __LINE__,
10111 rack->r_ctl.last_sent_tlp_seq,
10112 (rack->r_ctl.last_sent_tlp_seq +
10113 rack->r_ctl.last_sent_tlp_len));
10114 rack->rc_last_sent_tlp_seq_valid = 0;
10115 rack->rc_last_sent_tlp_past_cumack = 0;
10116 } else if ((rack->rc_last_sent_tlp_seq_valid == 1) &&
10117 (rack->rc_last_sent_tlp_past_cumack == 0) &&
10118 (SEQ_GEQ(th_ack, rack->r_ctl.last_sent_tlp_seq))) {
10119 /*
10120 * It is safe to start aging TLP's send.
10121 */
10122 rack->rc_last_sent_tlp_past_cumack = 1;
10123 }
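/*
* Consume sendmap entries covered by th_ack from the lowest
* sequence up; we come back to "more" after each fully acked
* rsm is freed.
*/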
10124 more:
10125 rsm = tqhash_min(rack->r_ctl.tqh);
10126 if (rsm == NULL) {
10127 if ((th_ack - 1) == tp->iss) {
10128 /*
10129 * For the SYN incoming case we will not
10130 * have called tcp_output for the sending of
10131 * the SYN, so there will be no map. All
10132 * other cases should probably be a panic.
10133 */
10134 return;
10135 }
10136 if (tp->t_flags & TF_SENTFIN) {
10137 /* if we sent a FIN we often will not have map */
10138 return;
10139 }
10140 #ifdef INVARIANTS
10141 panic("No rack map tp:%p for state:%d ack:%u rack:%p snd_una:%u snd_max:%u\n",
10142 tp,
10143 tp->t_state, th_ack, rack,
10144 tp->snd_una, tp->snd_max);
10145 #endif
10146 return;
10147 }
10148 if (SEQ_LT(th_ack, rsm->r_start)) {
10149 /* Huh map is missing this */
10150 #ifdef INVARIANTS
10151 printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n",
10152 rsm->r_start,
10153 th_ack, tp->t_state, rack->r_state);
10154 #endif
10155 return;
10156 }
10157 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED, th_ack);
10158
10159 /* Now was it a retransmitted TLP? */
10160 if ((rsm->r_flags & RACK_TLP) &&
10161 (rsm->r_rtr_cnt > 1)) {
10162 /*
10163 * Yes, this rsm was a TLP and retransmitted, remember that
10164 * since if a DSACK comes back on this we don't want
10165 * to think of it as a reordered segment. This may
10166 * get updated again with possibly even other TLPs
10167 * in flight, but thats ok. Only when we don't send
10168 * a retransmitted TLP for 1/2 the sequences space
10169 * will it get turned off (above).
10170 */
10171 if (rack->rc_last_tlp_acked_set &&
10172 (is_rsm_inside_declared_tlp_block(rack, rsm))) {
10173 /*
10174 * We already turned this on since the end matches,
10175 * the previous one was a partial ack; now we
10176 * are getting another one (maybe all of it).
10177 */
10178 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
10179 /*
10180 * Lets make sure we have all of it though.
10181 */
10182 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
10183 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
10184 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
10185 rack->r_ctl.last_tlp_acked_end);
10186 }
10187 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
10188 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
10189 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
10190 rack->r_ctl.last_tlp_acked_end);
10191 }
10192 } else {
10193 rack->rc_last_tlp_past_cumack = 1;
10194 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
10195 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
10196 rack->rc_last_tlp_acked_set = 1;
10197 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
10198 }
10199 }
10200 /* Now do we consume the whole thing? */
10201 rack->r_ctl.last_tmit_time_acked = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)];
10202 if (SEQ_GEQ(th_ack, rsm->r_end)) {
10203 /* Its all consumed. */
10204 uint32_t left;
10205 uint8_t newly_acked;
10206
10207 if (rsm->r_flags & RACK_WAS_LOST) {
10208 /*
10209 * This can happen when we marked it as lost
10210 * and yet before retransmitting we get an ack
10211 * which can happen due to reordering.
10212 */
10213 rack_mark_nolonger_lost(rack, rsm);
10214 }
10215 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_FREE, rsm->r_end, __LINE__);
10216 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes;
10217 rsm->r_rtr_bytes = 0;
10218 /*
10219 * Record the time of highest cumack sent if its in our measurement
10220 * window and possibly bump out the end.
10221 */
10222 rack_rsm_sender_update(rack, tp, rsm, 4);
10223 tqhash_remove(rack->r_ctl.tqh, rsm, REMOVE_TYPE_CUMACK);
10224 if (rsm->r_in_tmap) {
10225 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
10226 rsm->r_in_tmap = 0;
10227 }
10228 newly_acked = 1;
10229 if (rsm->r_flags & RACK_ACKED) {
10230 /*
10231 * It was acked on the scoreboard -- remove
10232 * it from total
10233 */
10234 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start);
10235 newly_acked = 0;
10236 } else if (rsm->r_flags & RACK_SACK_PASSED) {
10237 /*
10238 * There are segments ACKED on the
10239 * scoreboard further up. We are seeing
10240 * reordering.
10241 */
10242 rsm->r_flags &= ~RACK_SACK_PASSED;
10243 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
10244 rsm->r_flags |= RACK_ACKED;
10245 rack->r_ctl.rc_reorder_ts = cts;
10246 if (rack->r_ctl.rc_reorder_ts == 0)
10247 rack->r_ctl.rc_reorder_ts = 1;
10248 if (rack->r_ent_rec_ns) {
10249 /*
10250 * We have sent no more, and we saw a sack
10251 * then ack arrive.
10252 */
10253 rack->r_might_revert = 1;
10254 }
10255 rack_update_pcm_ack(rack, 1, rsm->r_start, rsm->r_end);
10256 } else {
10257 rack_update_pcm_ack(rack, 1, rsm->r_start, rsm->r_end);
10258 }
10259 if ((rsm->r_flags & RACK_TO_REXT) &&
10260 (tp->t_flags & TF_RCVD_TSTMP) &&
10261 (to->to_flags & TOF_TS) &&
10262 (to->to_tsecr != 0) &&
10263 (tp->t_flags & TF_PREVVALID)) {
10264 /*
10265 * We can use the timestamp to see
10266 * if this retransmission was from the
10267 * first transmit. If so we made a mistake.
10268 */
10269 tp->t_flags &= ~TF_PREVVALID;
10270 if (to->to_tsecr == rack_ts_to_msec(rsm->r_tim_lastsent[0])) {
10271 /* The first transmit is what this ack is for */
10272 rack_cong_signal(tp, CC_RTO_ERR, th_ack, __LINE__);
10273 }
10274 }
10275 left = th_ack - rsm->r_end;
10276 if (rack->app_limited_needs_set && newly_acked)
10277 rack_need_set_test(tp, rack, rsm, th_ack, __LINE__, RACK_USE_END_OR_THACK);
10278 /* Free back to zone */
10279 rack_free(rack, rsm);
10280 if (left) {
10281 goto more;
10282 }
10283 /* Check for reneging */
10284 rsm = tqhash_min(rack->r_ctl.tqh);
10285 if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) {
10286 /*
10287 * The peer has moved snd_una up to
10288 * the edge of this send, i.e. one
10289 * that it had previously acked. The only
10290 * way that can be true is if the peer threw
10291 * away data (space issues) that it had
10292 * previously sacked (else it would have
10293 * given us snd_una up to rsm->r_end).
10294 * We need to undo the acked markings here.
10295 *
10296 * Note we have to look to make sure th_ack is
10297 * our rsm->r_start in case we get an old ack
10298 * where th_ack is behind snd_una.
10299 */
10300 rack_peer_reneges(rack, rsm, th_ack);
10301 }
10302 return;
10303 }
10304 if (rsm->r_flags & RACK_ACKED) {
10305 /*
10306 * It was acked on the scoreboard -- remove it from
10307 * total for the part being cum-acked.
10308 */
10309 rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start);
10310 } else {
10311 rack_update_pcm_ack(rack, 1, rsm->r_start, th_ack);
10312 }
10313 /* And what about the lost flag? */
10314 if (rsm->r_flags & RACK_WAS_LOST) {
10315 /*
10316 * This can happen when we marked it as lost
10317 * and yet before retransmitting we get an ack
10318 * which can happen due to reordering. In this
10319 * case its only a partial ack of the send.
10320 */
10321 rack_mark_nolonger_lost(rack, rsm);
10322 }
10323 /*
10324 * Clear the dup ack count for
10325 * the piece that remains.
10326 */
10327 rsm->r_dupack = 0;
10328 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
10329 if (rsm->r_rtr_bytes) {
10330 /*
10331 * It was retransmitted adjust the
10332 * sack holes for what was acked.
10333 */
10334 int ack_am;
10335
10336 ack_am = (th_ack - rsm->r_start);
10337 if (ack_am >= rsm->r_rtr_bytes) {
10338 rack->r_ctl.rc_holes_rxt -= ack_am;
10339 rsm->r_rtr_bytes -= ack_am;
10340 }
10341 }
10342 /*
10343 * Update where the piece starts and record
10344 * the time of send of highest cumack sent if
10345 * its in our GP range.
10346 */
10347 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_TRIM_HEAD, th_ack, __LINE__);
10348 /* Now we need to move our offset forward too */
10349 if (rsm->m &&
10350 ((rsm->orig_m_len != rsm->m->m_len) ||
10351 (M_TRAILINGROOM(rsm->m) != rsm->orig_t_space))) {
10352 /* Fix up the orig_m_len and possibly the mbuf offset */
10353 rack_adjust_orig_mlen(rsm);
10354 }
10355 rsm->soff += (th_ack - rsm->r_start);
10356 rack_rsm_sender_update(rack, tp, rsm, 5);
10357 /* The trim will move th_ack into r_start for us */
10358 tqhash_trim(rack->r_ctl.tqh, th_ack);
10359 /* Now do we need to move the mbuf fwd too? */
10360 {
10361 struct mbuf *m;
10362 uint32_t soff;
10363
10364 m = rsm->m;
10365 soff = rsm->soff;
10366 if (m) {
10367 while (soff >= m->m_len) {
10368 soff -= m->m_len;
10369 KASSERT((m->m_next != NULL),
10370 (" rsm:%p off:%u soff:%u m:%p",
10371 rsm, rsm->soff, soff, m));
10372 m = m->m_next;
10373 if (m == NULL) {
10374 /*
10375 * This is a fall-back that prevents a panic. In reality
10376 * we should be able to walk the mbuf's and find our place.
10377 * At this point snd_una has not been updated with the sbcut() yet
10378 * but tqhash_trim did update rsm->r_start so the offset calculation
10379 * should work fine. This is undesirable since we will take cache
10380 * hits to access the socket buffer. And even more puzzling is that
10381 * it happens occasionally. It should not :(
10382 */
10383 m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd,
10384 (rsm->r_start - tp->snd_una),
10385 &soff);
10386 break;
10387 }
10388 }
10389 /*
10390 * Now save in our updated values.
10391 */
10392 rsm->m = m;
10393 rsm->soff = soff;
10394 rsm->orig_m_len = rsm->m->m_len;
10395 rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
10396 }
10397 }
10398 if (rack->app_limited_needs_set &&
10399 SEQ_GEQ(th_ack, tp->gput_seq))
10400 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG);
10401 }
10402
10403 static void
10404 rack_handle_might_revert(struct tcpcb *tp, struct tcp_rack *rack)
10405 {
10406 struct rack_sendmap *rsm;
10407 int sack_pass_fnd = 0;
10408
10409 if (rack->r_might_revert) {
10410 /*
10411 * Ok we have reordering, have not sent anything, we
10412 * might want to revert the congestion state if nothing
10413 * further has SACK_PASSED on it. Lets check.
10414 *
10415 * We also get here when we have DSACKs come in for
10416 * all the data that we FR'd. Note that a rxt or tlp
10417 * timer clears this from happening.
10418 */
10419
10420 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
10421 if (rsm->r_flags & RACK_SACK_PASSED) {
10422 sack_pass_fnd = 1;
10423 break;
10424 }
10425 }
10426 if (sack_pass_fnd == 0) {
10427 /*
10428 * We went into recovery
10429 * incorrectly due to reordering!
10430 */
10431 int orig_cwnd;
10432
10433 rack->r_ent_rec_ns = 0;
10434 orig_cwnd = tp->snd_cwnd;
10435 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at_erec;
10436 tp->snd_recover = tp->snd_una;
10437 rack_log_to_prr(rack, 14, orig_cwnd, __LINE__);
10438 if (IN_RECOVERY(tp->t_flags)) {
10439 rack_exit_recovery(tp, rack, 3);
10440 if ((rack->rto_from_rec == 1) && (rack_ssthresh_rest_rto_rec != 0)) {
10441 /*
10442 * We were in recovery, had an RTO
10443 * and then re-entered recovery (more sack's arrived)
10444 * and we have properly recorded the old ssthresh from
10445 * the first recovery. We want to be able to slow-start
10446 * back to this level. The ssthresh from the timeout
10447 * and then back into recovery will end up most likely
10448 * to be min(cwnd=1mss, 2mss), which basically means
10449 * we get no slow-start after our RTO.
10450 */
10451 rack->rto_from_rec = 0;
10452 if (rack->r_ctl.rto_ssthresh > tp->snd_ssthresh)
10453 tp->snd_ssthresh = rack->r_ctl.rto_ssthresh;
10454 }
10455 }
10456 }
10457 rack->r_might_revert = 0;
10458 }
10459 }
10460
10461
10462 static int
10463 rack_note_dsack(struct tcp_rack *rack, tcp_seq start, tcp_seq end)
10464 {
10465
10466 uint32_t am, l_end;
10467 int was_tlp = 0;
10468
10469 if (SEQ_GT(end, start))
10470 am = end - start;
10471 else
10472 am = 0;
10473 if ((rack->rc_last_tlp_acked_set ) &&
10474 (SEQ_GEQ(start, rack->r_ctl.last_tlp_acked_start)) &&
10475 (SEQ_LEQ(end, rack->r_ctl.last_tlp_acked_end))) {
10476 /*
10477 * The DSACK is because of a TLP, so we don't
10478 * do anything with the reordering window, since
10479 * it was not reordering that caused the DSACK but
10480 * our previous TLP retransmit.
10481 */
10482 rack_log_dsack_event(rack, 7, __LINE__, start, end);
10483 was_tlp = 1;
10484 goto skip_dsack_round;
10485 }
10486 if (rack->rc_last_sent_tlp_seq_valid) {
10487 l_end = rack->r_ctl.last_sent_tlp_seq + rack->r_ctl.last_sent_tlp_len;
10488 if (SEQ_GEQ(start, rack->r_ctl.last_sent_tlp_seq) &&
10489 (SEQ_LEQ(end, l_end))) {
10490 /*
10491 * This dsack is from the last sent TLP, ignore it
10492 * for reordering purposes.
10493 */
10494 rack_log_dsack_event(rack, 7, __LINE__, start, end);
10495 was_tlp = 1;
10496 goto skip_dsack_round;
10497 }
10498 }
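/* A new DSACK round begins here; it runs until the cum-ack passes the current snd_max */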
10499 if (rack->rc_dsack_round_seen == 0) {
10500 rack->rc_dsack_round_seen = 1;
10501 rack->r_ctl.dsack_round_end = rack->rc_tp->snd_max;
10502 rack->r_ctl.num_dsack++;
10503 rack->r_ctl.dsack_persist = 16; /* 16 is from the standard */
10504 rack_log_dsack_event(rack, 2, __LINE__, 0, 0);
10505 }
10506 skip_dsack_round:
10507 /*
10508 * We keep track of how many DSACK blocks we get
10509 * after a recovery incident.
10510 */
10511 rack->r_ctl.dsack_byte_cnt += am;
10512 if (!IN_FASTRECOVERY(rack->rc_tp->t_flags) &&
10513 rack->r_ctl.retran_during_recovery &&
10514 (rack->r_ctl.dsack_byte_cnt >= rack->r_ctl.retran_during_recovery)) {
10515 /*
10516 * False recovery most likely culprit is reordering. If
10517 * nothing else is missing we need to revert.
10518 */
10519 rack->r_might_revert = 1;
10520 rack_handle_might_revert(rack->rc_tp, rack);
10521 rack->r_might_revert = 0;
10522 rack->r_ctl.retran_during_recovery = 0;
10523 rack->r_ctl.dsack_byte_cnt = 0;
10524 }
10525 return (was_tlp);
10526 }
10527
10528 static uint32_t
10529 do_rack_compute_pipe(struct tcpcb *tp, struct tcp_rack *rack, uint32_t snd_una)
10530 {
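/* pipe = outstanding - (SACKed + considered lost) + retransmitted hole bytes */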
10531 return (((tp->snd_max - snd_una) -
10532 (rack->r_ctl.rc_sacked + rack->r_ctl.rc_considered_lost)) + rack->r_ctl.rc_holes_rxt);
10533 }
10534
10535 static int32_t
10536 rack_compute_pipe(struct tcpcb *tp)
10537 {
10538 return ((int32_t)do_rack_compute_pipe(tp,
10539 (struct tcp_rack *)tp->t_fb_ptr,
10540 tp->snd_una));
10541 }
10542
10543 static void
10544 rack_update_prr(struct tcpcb *tp, struct tcp_rack *rack, uint32_t changed, tcp_seq th_ack)
10545 {
10546 /* Deal with changed and PRR here (in recovery only) */
10547 uint32_t pipe, snd_una;
10548
10549 rack->r_ctl.rc_prr_delivered += changed;
10550
10551 if (sbavail(&rack->rc_inp->inp_socket->so_snd) <= (tp->snd_max - tp->snd_una)) {
10552 /*
10553 * It is all outstanding, we are application limited
10554 * and thus we don't need more room to send anything.
10555 * Note we use tp->snd_una here and not th_ack because
10556 * the data has not yet been cut from the sb.
10557 */
10558 rack->r_ctl.rc_prr_sndcnt = 0;
10559 return;
10560 }
10561 /* Compute prr_sndcnt */
10562 if (SEQ_GT(tp->snd_una, th_ack)) {
10563 snd_una = tp->snd_una;
10564 } else {
10565 snd_una = th_ack;
10566 }
10567 pipe = do_rack_compute_pipe(tp, rack, snd_una);
10568 if (pipe > tp->snd_ssthresh) {
10569 long sndcnt;
10570
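/*
* Proportional part of PRR (RFC 6937):
* sndcnt = CEIL(prr_delivered * ssthresh / RecoverFS) - prr_out,
* with the increment below standing in for the ceiling.
*/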
10571 sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh;
10572 if (rack->r_ctl.rc_prr_recovery_fs > 0)
10573 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs;
10574 else {
10575 rack->r_ctl.rc_prr_sndcnt = 0;
10576 rack_log_to_prr(rack, 9, 0, __LINE__);
10577 sndcnt = 0;
10578 }
10579 sndcnt++;
10580 if (sndcnt > (long)rack->r_ctl.rc_prr_out)
10581 sndcnt -= rack->r_ctl.rc_prr_out;
10582 else
10583 sndcnt = 0;
10584 rack->r_ctl.rc_prr_sndcnt = sndcnt;
10585 rack_log_to_prr(rack, 10, 0, __LINE__);
10586 } else {
10587 uint32_t limit;
10588
10589 if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out)
10590 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out);
10591 else
10592 limit = 0;
10593 if (changed > limit)
10594 limit = changed;
10595 limit += ctf_fixed_maxseg(tp);
10596 if (tp->snd_ssthresh > pipe) {
10597 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit);
10598 rack_log_to_prr(rack, 11, 0, __LINE__);
10599 } else {
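/* pipe == ssthresh at this point, so grant no extra send credit */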
10600 rack->r_ctl.rc_prr_sndcnt = min(0, limit);
10601 rack_log_to_prr(rack, 12, 0, __LINE__);
10602 }
10603 }
10604 }
10605
10606 static void
10607 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered_recovery, int dup_ack_struck,
10608 int *dsack_seen, int *sacks_seen)
10609 {
10610 uint32_t changed;
10611 struct tcp_rack *rack;
10612 struct rack_sendmap *rsm;
10613 struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1];
10614 register uint32_t th_ack;
10615 int32_t i, j, k, num_sack_blks = 0;
10616 uint32_t cts, acked, ack_point;
10617 int loop_start = 0;
10618 uint32_t tsused;
10619 uint32_t segsiz;
10620
10621
10622 INP_WLOCK_ASSERT(tptoinpcb(tp));
10623 if (tcp_get_flags(th) & TH_RST) {
10624 /* We don't log resets */
10625 return;
10626 }
10627 rack = (struct tcp_rack *)tp->t_fb_ptr;
10628 cts = tcp_get_usecs(NULL);
10629 rsm = tqhash_min(rack->r_ctl.tqh);
10630 changed = 0;
10631 th_ack = th->th_ack;
10632 segsiz = ctf_fixed_maxseg(rack->rc_tp);
10633 if (SEQ_GT(th_ack, tp->snd_una)) {
10634 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__);
10635 tp->t_acktime = ticks;
10636 }
10637 if (rsm && SEQ_GT(th_ack, rsm->r_start))
10638 changed = th_ack - rsm->r_start;
10639 if (changed) {
10640 rack_process_to_cumack(tp, rack, th_ack, cts, to,
10641 tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time));
10642 }
10643 if ((to->to_flags & TOF_SACK) == 0) {
10644 /* We are done nothing left and no sack. */
10645 rack_handle_might_revert(tp, rack);
10646 /*
10647 * For cases where we struck a dup-ack
10648 * with no SACK, add to the changes so
10649 * PRR will work right.
10650 */
10651 if (dup_ack_struck && (changed == 0)) {
10652 changed += ctf_fixed_maxseg(rack->rc_tp);
10653 }
10654 goto out;
10655 }
10656 /* Sack block processing */
10657 if (SEQ_GT(th_ack, tp->snd_una))
10658 ack_point = th_ack;
10659 else
10660 ack_point = tp->snd_una;
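/*
* Pull the SACK blocks from the option: in-window blocks above
* the ack point are collected for scoreboard processing, while
* blocks at or below th_ack are treated as D-SACKs.
*/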
10661 for (i = 0; i < to->to_nsacks; i++) {
10662 bcopy((to->to_sacks + i * TCPOLEN_SACK),
10663 &sack, sizeof(sack));
10664 sack.start = ntohl(sack.start);
10665 sack.end = ntohl(sack.end);
10666 if (SEQ_GT(sack.end, sack.start) &&
10667 SEQ_GT(sack.start, ack_point) &&
10668 SEQ_LT(sack.start, tp->snd_max) &&
10669 SEQ_GT(sack.end, ack_point) &&
10670 SEQ_LEQ(sack.end, tp->snd_max)) {
10671 sack_blocks[num_sack_blks] = sack;
10672 num_sack_blks++;
10673 } else if (SEQ_LEQ(sack.start, th_ack) &&
10674 SEQ_LEQ(sack.end, th_ack)) {
10675 int was_tlp;
10676
10677 if (dsack_seen != NULL)
10678 *dsack_seen = 1;
10679 was_tlp = rack_note_dsack(rack, sack.start, sack.end);
10680 /*
10681 * Its a D-SACK block.
10682 */
10683 tcp_record_dsack(tp, sack.start, sack.end, was_tlp);
10684 }
10685 }
10686 if (rack->rc_dsack_round_seen) {
10687 /* Is the dsack round over? */
10688 if (SEQ_GEQ(th_ack, rack->r_ctl.dsack_round_end)) {
10689 /* Yes it is */
10690 rack->rc_dsack_round_seen = 0;
10691 rack_log_dsack_event(rack, 3, __LINE__, 0, 0);
10692 }
10693 }
10694 /*
10695 * Sort the SACK blocks so we can update the rack scoreboard with
10696 * just one pass.
10697 */
10698 num_sack_blks = sack_filter_blks(tp, &rack->r_ctl.rack_sf, sack_blocks,
10699 num_sack_blks, th->th_ack);
10700 ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks);
10701 if (sacks_seen != NULL)
10702 *sacks_seen = num_sack_blks;
10703 if (num_sack_blks == 0) {
10704 /* Nothing to sack */
10705 goto out;
10706 }
10707 /* Its a sack of some sort */
10708 if (num_sack_blks < 2) {
10709 /* Only one, we don't need to sort */
10710 goto do_sack_work;
10711 }
10712 /* Sort the sacks */
10713 for (i = 0; i < num_sack_blks; i++) {
10714 for (j = i + 1; j < num_sack_blks; j++) {
10715 if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
10716 sack = sack_blocks[i];
10717 sack_blocks[i] = sack_blocks[j];
10718 sack_blocks[j] = sack;
10719 }
10720 }
10721 }
10722 /*
10723 * Now are any of the sack block ends the same (yes some
10724 * implementations send these)?
10725 */
10726 again:
10727 if (num_sack_blks == 0)
10728 goto out;
10729 if (num_sack_blks > 1) {
10730 for (i = 0; i < num_sack_blks; i++) {
10731 for (j = i + 1; j < num_sack_blks; j++) {
10732 if (sack_blocks[i].end == sack_blocks[j].end) {
10733 /*
10734 * Ok these two have the same end we
10735 * want the smallest end and then
10736 * throw away the larger and start
10737 * again.
10738 */
10739 if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) {
10740 /*
10741 * The second block covers
10742 * more area use that
10743 */
10744 sack_blocks[i].start = sack_blocks[j].start;
10745 }
10746 /*
10747 * Now collapse out the dup-sack and
10748 * lower the count
10749 */
10750 for (k = (j + 1); k < num_sack_blks; k++) {
10751 sack_blocks[j].start = sack_blocks[k].start;
10752 sack_blocks[j].end = sack_blocks[k].end;
10753 j++;
10754 }
10755 num_sack_blks--;
10756 goto again;
10757 }
10758 }
10759 }
10760 }
10761 do_sack_work:
10762 /*
10763 * First lets look to see if
10764 * we have retransmitted and
10765 * can use the transmit next?
10766 */
10767 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
10768 if (rsm &&
10769 SEQ_GT(sack_blocks[0].end, rsm->r_start) &&
10770 SEQ_LT(sack_blocks[0].start, rsm->r_end)) {
10771 /*
10772 * We probably did the FR and the next
10773 * SACK coming in continues as we would expect.
10774 */
10775 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, segsiz);
10776 if (acked) {
10777 rack->r_wanted_output = 1;
10778 changed += acked;
10779 }
10780 if (num_sack_blks == 1) {
10781 goto out;
10782 } else {
10783 /*
10784 * Start the loop through the
10785 * rest of blocks, past the first block.
10786 */
10787 loop_start = 1;
10788 }
10789 }
10790 rsm = rack->r_ctl.rc_sacklast;
10791 for (i = loop_start; i < num_sack_blks; i++) {
10792 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, segsiz);
10793 if (acked) {
10794 rack->r_wanted_output = 1;
10795 changed += acked;
10796 }
10797 }
10798 out:
10799 if (changed) {
10800 /* Something changed cancel the rack timer */
10801 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
10802 }
10803 tsused = tcp_get_usecs(NULL);
10804 rsm = tcp_rack_output(tp, rack, tsused);
10805 if ((!IN_FASTRECOVERY(tp->t_flags)) &&
10806 rsm &&
10807 ((rsm->r_flags & RACK_MUST_RXT) == 0)) {
10808 /* Enter recovery */
10809 entered_recovery = 1;
10810 rack_cong_signal(tp, CC_NDUPACK, th_ack, __LINE__);
10811 /*
10812 * When we enter recovery we need to assure we send
10813 * one packet.
10814 */
10815 if (rack->rack_no_prr == 0) {
10816 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp);
10817 rack_log_to_prr(rack, 8, 0, __LINE__);
10818 }
10819 rack->r_timer_override = 1;
10820 rack->r_early = 0;
10821 rack->r_ctl.rc_agg_early = 0;
10822 } else if (IN_FASTRECOVERY(tp->t_flags) &&
10823 rsm &&
10824 (rack->r_rr_config == 3)) {
10825 /*
10826 * Assure we can output and we get no
10827 * remembered pace time except the retransmit.
10828 */
10829 rack->r_timer_override = 1;
10830 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
10831 rack->r_ctl.rc_resend = rsm;
10832 }
10833 if (IN_FASTRECOVERY(tp->t_flags) &&
10834 (rack->rack_no_prr == 0) &&
10835 (entered_recovery == 0)) {
10836 rack_update_prr(tp, rack, changed, th_ack);
10837 if ((rsm && (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) &&
10838 ((tcp_in_hpts(rack->rc_tp) == 0) &&
10839 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)))) {
10840 /*
10841 * If you are pacing output you don't want
10842 * to override.
10843 */
10844 rack->r_early = 0;
10845 rack->r_ctl.rc_agg_early = 0;
10846 rack->r_timer_override = 1;
10847 }
10848 }
10849 }
10850
10851 static void
10852 rack_strike_dupack(struct tcp_rack *rack, tcp_seq th_ack)
10853 {
10854 struct rack_sendmap *rsm;
10855
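/*
* Find the first entry in the transmit map not already slated
* for retransmit and strike the dup-ack against it.
*/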
10856 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
10857 while (rsm) {
10858 /*
10859 * We need to skip anything already set
10860 * to be retransmitted.
10861 */
10862 if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) ||
10863 (rsm->r_flags & RACK_MUST_RXT)) {
10864 rsm = TAILQ_NEXT(rsm, r_tnext);
10865 continue;
10866 }
10867 break;
10868 }
10869 if (rsm && (rsm->r_dupack < 0xff)) {
10870 rsm->r_dupack++;
10871 if (rsm->r_dupack >= DUP_ACK_THRESHOLD) {
10872 struct timeval tv;
10873 uint32_t cts;
10874 /*
10875 * Here we see if we need to retransmit. For
10876 * a SACK type connection if enough time has passed
10877 * we will get a return of the rsm. For a non-sack
10878 * connection we will get the rsm returned if the
10879 * dupack value is 3 or more.
10880 */
10881 cts = tcp_get_usecs(&tv);
10882 rack->r_ctl.rc_resend = tcp_rack_output(rack->rc_tp, rack, cts);
10883 if (rack->r_ctl.rc_resend != NULL) {
10884 if (!IN_FASTRECOVERY(rack->rc_tp->t_flags)) {
10885 rack_cong_signal(rack->rc_tp, CC_NDUPACK,
10886 th_ack, __LINE__);
10887 }
10888 rack->r_wanted_output = 1;
10889 rack->r_timer_override = 1;
10890 rack_log_retran_reason(rack, rsm, __LINE__, 1, 3);
10891 }
10892 } else {
10893 rack_log_retran_reason(rack, rsm, __LINE__, 0, 3);
10894 }
10895 }
10896 }
10897
10898 static void
10899 rack_check_bottom_drag(struct tcpcb *tp,
10900 struct tcp_rack *rack,
10901 struct socket *so)
10902 {
10903 /*
10904 * So what is dragging bottom?
10905 *
10906 * Dragging bottom means you were under pacing and had a
10907 * delay in processing inbound acks waiting on our pacing
10908 * timer to expire. While you were waiting all of the acknowledgments
10909 * for the packets you sent have arrived. This means we are pacing
10910 * way underneath the bottleneck to the point where our Goodput
10911 * measurements stop working, since they require more than one
10912 * ack (usually at least 8 packets worth with multiple acks so we can
10913 * gauge the inter-ack times). If that occurs we have a real problem
10914 * since we are stuck in a hole that we can't get out of without
10915 * something speeding us up.
10916 *
10917 * We also check to see if we are whittling down to just one segment
10918 * outstanding. If this occurs and we have room to send in our cwnd/rwnd
10919 * then we are adding the delayed ack interval into our measurements and
10920 * we need to speed up slightly.
10921 */
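/*
 * Illustrative example (not part of the original source): suppose we
 * pace at roughly 1 Mbps while the path could carry 10 Mbps. The
 * pacing timer is still pending when the final ACK arrives, so
 * snd_max == snd_una and no multi-ack Goodput measurement can be
 * taken. Without the boost below (or the long-term b/w fallback) we
 * would keep pacing at the same too-slow rate indefinitely.
 */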
10922 uint32_t segsiz, minseg;
10923
10924 segsiz = ctf_fixed_maxseg(tp);
10925 minseg = segsiz;
10926 if (tp->snd_max == tp->snd_una) {
10927 /*
10928 * We are doing dynamic pacing and we are way
10929 * under. Basically everything got acked while
10930 * we were still waiting on the pacer to expire.
10931 *
10932 * This means we need to boost the b/w in
10933 * addition to any earlier boosting of
10934 * the multiplier.
10935 */
10936 uint64_t lt_bw;
10937
10938 tcp_trace_point(rack->rc_tp, TCP_TP_PACED_BOTTOM);
10939 lt_bw = rack_get_lt_bw(rack);
10940 rack->rc_dragged_bottom = 1;
10941 rack_validate_multipliers_at_or_above100(rack);
10942 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_VALID) &&
10943 (rack->dis_lt_bw == 0) &&
10944 (rack->use_lesser_lt_bw == 0) &&
10945 (lt_bw > 0)) {
10946 /*
10947 * Lets use the long-term b/w we have
10948 * been getting as a base.
10949 */
10950 if (rack->rc_gp_filled == 0) {
10951 if (lt_bw > ONE_POINT_TWO_MEG) {
10952 /*
10953 * If we have no measurement
10954 * don't let us set in more than
10955 * 1.2Mbps. If we are still too
10956 * low after pacing with this we
10957 * will hopefully have a max b/w
10958 * available to sanity check things.
10959 */
10960 lt_bw = ONE_POINT_TWO_MEG;
10961 }
10962 rack->r_ctl.rc_rtt_diff = 0;
10963 rack->r_ctl.gp_bw = lt_bw;
10964 rack->rc_gp_filled = 1;
10965 if (rack->r_ctl.num_measurements < RACK_REQ_AVG)
10966 rack->r_ctl.num_measurements = RACK_REQ_AVG;
10967 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
10968 } else if (lt_bw > rack->r_ctl.gp_bw) {
10969 rack->r_ctl.rc_rtt_diff = 0;
10970 if (rack->r_ctl.num_measurements < RACK_REQ_AVG)
10971 rack->r_ctl.num_measurements = RACK_REQ_AVG;
10972 rack->r_ctl.gp_bw = lt_bw;
10973 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
10974 } else
10975 rack_increase_bw_mul(rack, -1, 0, 0, 1);
10976 if ((rack->gp_ready == 0) &&
10977 (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) {
10978 /* We have enough measurements now */
10979 rack->gp_ready = 1;
10980 if (rack->dgp_on ||
10981 rack->rack_hibeta)
10982 rack_set_cc_pacing(rack);
10983 if (rack->defer_options)
10984 rack_apply_deferred_options(rack);
10985 }
10986 } else {
10987 /*
10988 * zero rtt possibly?, settle for just an old increase.
10989 */
10990 rack_increase_bw_mul(rack, -1, 0, 0, 1);
10991 }
10992 } else if ((IN_FASTRECOVERY(tp->t_flags) == 0) &&
10993 (sbavail(&so->so_snd) > max((segsiz * (4 + rack_req_segs)),
10994 minseg)) &&
10995 (rack->r_ctl.cwnd_to_use > max((segsiz * (rack_req_segs + 2)), minseg)) &&
10996 (tp->snd_wnd > max((segsiz * (rack_req_segs + 2)), minseg)) &&
10997 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) <=
10998 (segsiz * rack_req_segs))) {
10999 /*
11000 * We are doing dynamic GP pacing and
11001 * we have everything except 1 MSS or less
11002 * bytes left outstanding. We are still pacing away.
11003 * And there is data that could be sent. This
11004 * means we are inserting delayed ack time in
11005 * our measurements because we are pacing too slow.
11006 */
11007 rack_validate_multipliers_at_or_above100(rack);
11008 rack->rc_dragged_bottom = 1;
11009 rack_increase_bw_mul(rack, -1, 0, 0, 1);
11010 }
11011 }
11012
11013 #ifdef TCP_REQUEST_TRK
11014 static void
11015 rack_log_hybrid(struct tcp_rack *rack, uint32_t seq,
11016 struct tcp_sendfile_track *cur, uint8_t mod, int line, int err)
11017 {
11018 int do_log;
11019
11020 do_log = tcp_bblogging_on(rack->rc_tp);
11021 if (do_log == 0) {
11022 if ((do_log = tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING)) == 0)
11023 return;
11024 /* We only allow the three below with point logging on */
11025 if ((mod != HYBRID_LOG_RULES_APP) &&
11026 (mod != HYBRID_LOG_RULES_SET) &&
11027 (mod != HYBRID_LOG_REQ_COMP))
11028 return;
11029
11030 }
11031 if (do_log) {
11032 union tcp_log_stackspecific log;
11033 struct timeval tv;
11034
11035 /* Convert our ms to a microsecond */
11036 memset(&log, 0, sizeof(log));
11037 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
11038 log.u_bbr.flex1 = seq;
11039 log.u_bbr.cwnd_gain = line;
11040 if (cur != NULL) {
11041 uint64_t off;
11042
11043 log.u_bbr.flex2 = cur->start_seq;
11044 log.u_bbr.flex3 = cur->end_seq;
11045 log.u_bbr.flex4 = (uint32_t)((cur->localtime >> 32) & 0x00000000ffffffff);
11046 log.u_bbr.flex5 = (uint32_t)(cur->localtime & 0x00000000ffffffff);
11047 log.u_bbr.flex6 = cur->flags;
11048 log.u_bbr.pkts_out = cur->hybrid_flags;
11049 log.u_bbr.rttProp = cur->timestamp;
11050 log.u_bbr.cur_del_rate = cur->cspr;
11051 log.u_bbr.bw_inuse = cur->start;
11052 log.u_bbr.applimited = (uint32_t)(cur->end & 0x00000000ffffffff);
11053 log.u_bbr.delivered = (uint32_t)((cur->end >> 32) & 0x00000000ffffffff);
11054 log.u_bbr.epoch = (uint32_t)(cur->deadline & 0x00000000ffffffff);
11055 log.u_bbr.lt_epoch = (uint32_t)((cur->deadline >> 32) & 0x00000000ffffffff);
11056 log.u_bbr.inhpts = 1;
11057 #ifdef TCP_REQUEST_TRK
11058 off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_tcpreq_info[0]);
11059 log.u_bbr.use_lt_bw = (uint8_t)(off / sizeof(struct tcp_sendfile_track));
11060 #endif
11061 } else {
11062 log.u_bbr.flex2 = err;
11063 }
11064 /*
11065 * Fill in flex7 to be CHD (catchup|hybrid|DGP)
11066 */
11067 log.u_bbr.flex7 = rack->rc_catch_up;
11068 log.u_bbr.flex7 <<= 1;
11069 log.u_bbr.flex7 |= rack->rc_hybrid_mode;
11070 log.u_bbr.flex7 <<= 1;
11071 log.u_bbr.flex7 |= rack->dgp_on;
11072 /*
11073 * Compose bbr_state to be a bit wise 0000ADHF
11074 * where A is the always_pace flag
11075 * where D is the dgp_on flag
11076 * where H is the hybrid_mode on flag
11077 * where F is the use_fixed_rate flag.
11078 */
11079 log.u_bbr.bbr_state = rack->rc_always_pace;
11080 log.u_bbr.bbr_state <<= 1;
11081 log.u_bbr.bbr_state |= rack->dgp_on;
11082 log.u_bbr.bbr_state <<= 1;
11083 log.u_bbr.bbr_state |= rack->rc_hybrid_mode;
11084 log.u_bbr.bbr_state <<= 1;
11085 log.u_bbr.bbr_state |= rack->use_fixed_rate;
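/*
 * Worked example of the encoding above (illustrative only): with
 * rc_always_pace = 1, dgp_on = 1, rc_hybrid_mode = 0 and
 * use_fixed_rate = 0, the shift/or sequence yields
 * bbr_state = 0b1100 (0xc), i.e. A=1 D=1 H=0 F=0.
 */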
11086 log.u_bbr.flex8 = mod;
11087 log.u_bbr.delRate = rack->r_ctl.bw_rate_cap;
11088 log.u_bbr.bbr_substate = rack->r_ctl.client_suggested_maxseg;
11089 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
11090 log.u_bbr.pkt_epoch = rack->rc_tp->tcp_hybrid_start;
11091 log.u_bbr.lost = rack->rc_tp->tcp_hybrid_error;
11092 log.u_bbr.pacing_gain = (uint16_t)rack->rc_tp->tcp_hybrid_stop;
11093 tcp_log_event(rack->rc_tp, NULL,
11094 &rack->rc_inp->inp_socket->so_rcv,
11095 &rack->rc_inp->inp_socket->so_snd,
11096 TCP_HYBRID_PACING_LOG, 0,
11097 0, &log, false, NULL, __func__, __LINE__, &tv);
11098 }
11099 }
11100 #endif
11101
11102 #ifdef TCP_REQUEST_TRK
11103 static void
11104 rack_set_dgp_hybrid_mode(struct tcp_rack *rack, tcp_seq seq, uint32_t len, uint64_t cts)
11105 {
11106 struct tcp_sendfile_track *rc_cur, *orig_ent;
11107 struct tcpcb *tp;
11108 int err = 0;
11109
11110 orig_ent = rack->r_ctl.rc_last_sft;
11111 rc_cur = tcp_req_find_req_for_seq(rack->rc_tp, seq);
11112 if (rc_cur == NULL) {
11113 /* If not in the beginning what about the end piece */
11114 if (rack->rc_hybrid_mode)
11115 rack_log_hybrid(rack, seq, NULL, HYBRID_LOG_NO_RANGE, __LINE__, err);
11116 rc_cur = tcp_req_find_req_for_seq(rack->rc_tp, (seq + len - 1));
11117 } else {
11118 err = 12345;
11119 }
11120 /* If we find no parameters we are in straight DGP mode */
11121 if (rc_cur == NULL) {
11122 /* None found for this seq, just DGP for now */
11123 if (rack->rc_hybrid_mode) {
11124 rack->r_ctl.client_suggested_maxseg = 0;
11125 rack->rc_catch_up = 0;
11126 if (rack->cspr_is_fcc == 0)
11127 rack->r_ctl.bw_rate_cap = 0;
11128 else
11129 rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap;
11130 }
11131 if (rack->rc_hybrid_mode) {
11132 rack_log_hybrid(rack, (seq + len - 1), NULL, HYBRID_LOG_NO_RANGE, __LINE__, err);
11133 }
11134 if (rack->r_ctl.rc_last_sft) {
11135 rack->r_ctl.rc_last_sft = NULL;
11136 }
11137 return;
11138 }
11139 if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_WASSET) == 0) {
11140 /* This entry was never setup for hybrid pacing on/off etc */
11141 if (rack->rc_hybrid_mode) {
11142 rack->r_ctl.client_suggested_maxseg = 0;
11143 rack->rc_catch_up = 0;
11144 rack->r_ctl.bw_rate_cap = 0;
11145 }
11146 if (rack->r_ctl.rc_last_sft) {
11147 rack->r_ctl.rc_last_sft = NULL;
11148 }
11149 if ((rc_cur->flags & TCP_TRK_TRACK_FLG_FSND) == 0) {
11150 rc_cur->flags |= TCP_TRK_TRACK_FLG_FSND;
11151 rc_cur->first_send = cts;
11152 rc_cur->sent_at_fs = rack->rc_tp->t_sndbytes;
11153 rc_cur->rxt_at_fs = rack->rc_tp->t_snd_rxt_bytes;
11154 }
11155 return;
11156 }
11157 /*
11158 * Ok if we have a new entry *or* have never
11159 * set up an entry we need to proceed. If
11160 * we have already set it up this entry we
11161 * we have already set up this entry we
11162 * just continue along with what we already
11163 * set up.
11164 tp = rack->rc_tp;
11165 if ((rack->r_ctl.rc_last_sft != NULL) &&
11166 (rack->r_ctl.rc_last_sft == rc_cur)) {
11167 /* Its already in place */
11168 if (rack->rc_hybrid_mode)
11169 rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_ISSAME, __LINE__, 0);
11170 return;
11171 }
11172 if (rack->rc_hybrid_mode == 0) {
11173 rack->r_ctl.rc_last_sft = rc_cur;
11174 if (orig_ent) {
11175 orig_ent->sent_at_ls = rack->rc_tp->t_sndbytes;
11176 orig_ent->rxt_at_ls = rack->rc_tp->t_snd_rxt_bytes;
11177 orig_ent->flags |= TCP_TRK_TRACK_FLG_LSND;
11178 }
11179 rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_RULES_APP, __LINE__, 0);
11180 return;
11181 }
11182 if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_CSPR) && rc_cur->cspr){
11183 /* Compensate for all the header overheads */
11184 if (rack->cspr_is_fcc == 0)
11185 rack->r_ctl.bw_rate_cap = rack_compensate_for_linerate(rack, rc_cur->cspr);
11186 else
11187 rack->r_ctl.fillcw_cap = rack_compensate_for_linerate(rack, rc_cur->cspr);
11188 } else {
11189 if (rack->rc_hybrid_mode) {
11190 if (rack->cspr_is_fcc == 0)
11191 rack->r_ctl.bw_rate_cap = 0;
11192 else
11193 rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap;
11194 }
11195 }
11196 if (rc_cur->hybrid_flags & TCP_HYBRID_PACING_H_MS)
11197 rack->r_ctl.client_suggested_maxseg = rc_cur->hint_maxseg;
11198 else
11199 rack->r_ctl.client_suggested_maxseg = 0;
11200 if (rc_cur->timestamp == rack->r_ctl.last_tm_mark) {
11201 /*
11202 * It is the same timestamp as the previous one;
11203 * add the hybrid flag that will indicate we use
11204 * sendtime not arrival time for catch-up mode.
11205 */
11206 rc_cur->hybrid_flags |= TCP_HYBRID_PACING_SENDTIME;
11207 }
11208 if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_CU) &&
11209 (rc_cur->cspr > 0)) {
11210 uint64_t len;
11211
11212 rack->rc_catch_up = 1;
11213 /*
11214 * Calculate the deadline time, first set the
11215 * time to when the request arrived.
11216 */
11217 if (rc_cur->hybrid_flags & TCP_HYBRID_PACING_SENDTIME) {
11218 /*
11219 * For cases where it's a duplicate tm (we received more
11220 * than one request for a tm) we want to use now, the point
11221 * where we are just sending the first bit of the request.
11222 */
11223 rc_cur->deadline = cts;
11224 } else {
11225 /*
11226 * Here we have a different tm from the last request
11227 * so we want to use arrival time as our base.
11228 */
11229 rc_cur->deadline = rc_cur->localtime;
11230 }
11231 /*
11232 * Next calculate the length and compensate for
11233 * TLS if need be.
11234 */
11235 len = rc_cur->end - rc_cur->start;
11236 if (tp->t_inpcb.inp_socket->so_snd.sb_tls_info) {
11237 /*
11238 * This session is doing TLS. Take a swag guess
11239 * at the overhead.
11240 */
11241 len += tcp_estimate_tls_overhead(tp->t_inpcb.inp_socket, len);
11242 }
11243 /*
11244 * Now considering the size, and the cspr, what is the time that
11245 * would be required at the cspr rate. Here we use the raw
11246 * cspr value since the client only looks at the raw data. We
11247 * do use len which includes TLS overhead, but not the TCP/IP etc.
11248 * That will get made up for in the CU pacing rate set.
11249 */
11250 len *= HPTS_USEC_IN_SEC;
11251 len /= rc_cur->cspr;
11252 rc_cur->deadline += len;
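/*
 * Worked example (illustrative, assuming cspr is expressed in bytes
 * per second): a 2,000,000 byte request with cspr = 1,000,000 B/s
 * gives len = 2,000,000 * HPTS_USEC_IN_SEC / 1,000,000 = 2,000,000
 * usecs, i.e. the deadline lands two seconds after the chosen base
 * time.
 */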
11253 } else {
11254 rack->rc_catch_up = 0;
11255 rc_cur->deadline = 0;
11256 }
11257 if (rack->r_ctl.client_suggested_maxseg != 0) {
11258 /*
11259 * We need to reset the max pace segs if we have a
11260 * client_suggested_maxseg.
11261 */
11262 rack_set_pace_segments(tp, rack, __LINE__, NULL);
11263 }
11264 if (orig_ent) {
11265 orig_ent->sent_at_ls = rack->rc_tp->t_sndbytes;
11266 orig_ent->rxt_at_ls = rack->rc_tp->t_snd_rxt_bytes;
11267 orig_ent->flags |= TCP_TRK_TRACK_FLG_LSND;
11268 }
11269 rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_RULES_APP, __LINE__, 0);
11270 /* Remember it for next time and for CU mode */
11271 rack->r_ctl.rc_last_sft = rc_cur;
11272 rack->r_ctl.last_tm_mark = rc_cur->timestamp;
11273 }
11274 #endif
11275
11276 static void
11277 rack_chk_req_and_hybrid_on_out(struct tcp_rack *rack, tcp_seq seq, uint32_t len, uint64_t cts)
11278 {
11279 #ifdef TCP_REQUEST_TRK
11280 struct tcp_sendfile_track *ent;
11281
11282 ent = rack->r_ctl.rc_last_sft;
11283 if ((ent == NULL) ||
11284 (ent->flags == TCP_TRK_TRACK_FLG_EMPTY) ||
11285 (SEQ_GEQ(seq, ent->end_seq))) {
11286 /* Time to update the track. */
11287 rack_set_dgp_hybrid_mode(rack, seq, len, cts);
11288 ent = rack->r_ctl.rc_last_sft;
11289 }
11290 /* Out of all */
11291 if (ent == NULL) {
11292 return;
11293 }
11294 if (SEQ_LT(ent->end_seq, (seq + len))) {
11295 /*
11296 * This is the case where our end_seq guess
11297 * was wrong. This is usually due to TLS having
11298 * more bytes than our guess. It could also be the
11299 * case that the client sent in two requests closely
11300 * and the SB is full of both so we are sending part
11301 * of each (end|beg). In such a case let's move this
11302 * guy's end to match the end of this send. That
11303 * way it will complete when all of it is acked.
11304 */
11305 ent->end_seq = (seq + len);
11306 if (rack->rc_hybrid_mode)
11307 rack_log_hybrid_bw(rack, seq, len, 0, 0, HYBRID_LOG_EXTEND, 0, ent, __LINE__);
11308 }
11309 /* Now validate we have set the send time of this one */
11310 if ((ent->flags & TCP_TRK_TRACK_FLG_FSND) == 0) {
11311 ent->flags |= TCP_TRK_TRACK_FLG_FSND;
11312 ent->first_send = cts;
11313 ent->sent_at_fs = rack->rc_tp->t_sndbytes;
11314 ent->rxt_at_fs = rack->rc_tp->t_snd_rxt_bytes;
11315 }
11316 #endif
11317 }
11318
11319 static void
11320 rack_gain_for_fastoutput(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t acked_amount)
11321 {
11322 /*
11323 * The fast output path is enabled and we
11324 * have moved the cumack forward. Lets see if
11325 * we can expand forward the fast path length by
11326 * that amount. What we would ideally like to
11327 * do is increase the number of bytes in the
11328 * fast path block (left_to_send) by the
11329 * acked amount. However we have to gate that
11330 * by two factors:
11331 * 1) The amount outstanding and the rwnd of the peer
11332 * (i.e. we don't want to exceed the rwnd of the peer).
11333 * <and>
11334 * 2) The amount of data left in the socket buffer (i.e.
11335 * we can't send beyond what is in the buffer).
11336 *
11337 * Note that this does not take into account any increase
11338 * in the cwnd. We will only extend the fast path by
11339 * what was acked.
11340 */
11341 uint32_t new_total, gating_val;
11342
11343 new_total = acked_amount + rack->r_ctl.fsb.left_to_send;
11344 gating_val = min((sbavail(&so->so_snd) - (tp->snd_max - tp->snd_una)),
11345 (tp->snd_wnd - (tp->snd_max - tp->snd_una)));
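/*
 * Worked example (illustrative numbers only): with 40,000 bytes
 * outstanding (snd_max - snd_una), sbavail = 100,000 and
 * snd_wnd = 65,535, gating_val = min(60,000, 25,535) = 25,535.
 * left_to_send is only extended when the acked amount plus the
 * current left_to_send stays at or below that value.
 */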
11346 if (new_total <= gating_val) {
11347 /* We can increase left_to_send by the acked amount */
11348 counter_u64_add(rack_extended_rfo, 1);
11349 rack->r_ctl.fsb.left_to_send = new_total;
11350 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(&rack->rc_inp->inp_socket->so_snd) - (tp->snd_max - tp->snd_una))),
11351 ("rack:%p left_to_send:%u sbavail:%u out:%u",
11352 rack, rack->r_ctl.fsb.left_to_send,
11353 sbavail(&rack->rc_inp->inp_socket->so_snd),
11354 (tp->snd_max - tp->snd_una)));
11355
11356 }
11357 }
11358
11359 static void
11360 rack_adjust_sendmap_head(struct tcp_rack *rack, struct sockbuf *sb)
11361 {
11362 /*
11363 * Here any sendmap entry that points to the
11364 * beginning mbuf must be adjusted to the correct
11365 * offset. This must be called with:
11366 * 1) The socket buffer locked
11367 * 2) snd_una adjusted to its new position.
11368 *
11369 * Note that (2) implies rack_ack_received has also
11370 * been called and all the sbcut's have been done.
11371 *
11372 * We grab the first mbuf in the socket buffer and
11373 * then go through the front of the sendmap, recalculating
11374 * the stored offset for any sendmap entry that has
11375 * that mbuf. We must use the sb functions to do this
11376 * since it's possible an add was done as well as
11377 * the subtraction we may have just completed. This should
11378 * not be a penalty though, since we just referenced the sb
11379 * to go in and trim off the mbufs that we freed (of course
11380 * there will be a penalty for the sendmap references though).
11381 *
11382 * Note also with INVARIANTS on, we validate with a KASSERT
11383 * that the first sendmap entry has a soff of 0.
11384 *
11385 */
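/*
 * Hedged example (not from the original source): if an ACK advanced
 * snd_una and sbcut_locked() trimmed part of the head mbuf, sendmap
 * entries that reference that mbuf still carry offsets computed
 * against its old layout. The loop below re-derives each such
 * entry's (m, soff) from the new sb head via sbsndmbuf() until it
 * walks off the head mbuf.
 */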
11386 struct mbuf *m;
11387 struct rack_sendmap *rsm;
11388 tcp_seq snd_una;
11389 #ifdef INVARIANTS
11390 int first_processed = 0;
11391 #endif
11392
11393 snd_una = rack->rc_tp->snd_una;
11394 SOCKBUF_LOCK_ASSERT(sb);
11395 m = sb->sb_mb;
11396 rsm = tqhash_min(rack->r_ctl.tqh);
11397 if ((rsm == NULL) || (m == NULL)) {
11398 /* Nothing outstanding */
11399 return;
11400 }
11401 /* The very first RSM's mbuf must point to the head mbuf in the sb */
11402 KASSERT((rsm->m == m),
11403 ("Rack:%p sb:%p rsm:%p -- first rsm mbuf not aligned to sb",
11404 rack, sb, rsm));
11405 while (rsm->m && (rsm->m == m)) {
11406 /* one to adjust */
11407 #ifdef INVARIANTS
11408 struct mbuf *tm;
11409 uint32_t soff;
11410
11411 tm = sbsndmbuf(sb, (rsm->r_start - snd_una), &soff);
11412 if ((rsm->orig_m_len != m->m_len) ||
11413 (rsm->orig_t_space != M_TRAILINGROOM(m))){
11414 rack_adjust_orig_mlen(rsm);
11415 }
11416 if (first_processed == 0) {
11417 KASSERT((rsm->soff == 0),
11418 ("Rack:%p rsm:%p -- rsm at head but soff not zero",
11419 rack, rsm));
11420 first_processed = 1;
11421 }
11422 if ((rsm->soff != soff) || (rsm->m != tm)) {
11423 /*
11424 * This is not a fatal error, we anticipate it
11425 * might happen (the else code), so we count it here
11426 * so that under INVARIANTS we can see that it really
11427 * does happen.
11428 */
11429 counter_u64_add(rack_adjust_map_bw, 1);
11430 }
11431 rsm->m = tm;
11432 rsm->soff = soff;
11433 if (tm) {
11434 rsm->orig_m_len = rsm->m->m_len;
11435 rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
11436 } else {
11437 rsm->orig_m_len = 0;
11438 rsm->orig_t_space = 0;
11439 }
11440 #else
11441 rsm->m = sbsndmbuf(sb, (rsm->r_start - snd_una), &rsm->soff);
11442 if (rsm->m) {
11443 rsm->orig_m_len = rsm->m->m_len;
11444 rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
11445 } else {
11446 rsm->orig_m_len = 0;
11447 rsm->orig_t_space = 0;
11448 }
11449 #endif
11450 rsm = tqhash_next(rack->r_ctl.tqh, rsm);
11451 if (rsm == NULL)
11452 break;
11453 }
11454 }
11455
11456 #ifdef TCP_REQUEST_TRK
11457 static inline void
11458 rack_req_check_for_comp(struct tcp_rack *rack, tcp_seq th_ack)
11459 {
11460 struct tcp_sendfile_track *ent;
11461 int i;
11462
11463 if ((rack->rc_hybrid_mode == 0) &&
11464 (tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING) == 0)) {
11465 /*
11466 * Just do normal completions; hybrid pacing is not on
11467 * and CLDL is off as well.
11468 */
11469 tcp_req_check_for_comp(rack->rc_tp, th_ack);
11470 return;
11471 }
11472 /*
11473 * Originally I was just going to find the th_ack associated
11474 * with an entry. But then I realized a large stretch ack could
11475 * in theory ack two or more requests at once. So instead we
11476 * need to find all entries that are completed by th_ack not
11477 * just a single entry and do our logging.
11478 */
11479 ent = tcp_req_find_a_req_that_is_completed_by(rack->rc_tp, th_ack, &i);
11480 while (ent != NULL) {
11481 /*
11482 * We may be doing hybrid pacing or CLDL and need more details possibly
11483 * so we do it manually instead of calling
11484 * tcp_req_check_for_comp()
11485 */
11486 uint64_t laa, tim, data, cbw, ftim;
11487
11488 /* Ok this ack frees it */
11489 rack_log_hybrid(rack, th_ack,
11490 ent, HYBRID_LOG_REQ_COMP, __LINE__, 0);
11491 rack_log_hybrid_sends(rack, ent, __LINE__);
11492 /* calculate the time based on the ack arrival */
11493 data = ent->end - ent->start;
11494 laa = tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time);
11495 if (ent->flags & TCP_TRK_TRACK_FLG_FSND) {
11496 if (ent->first_send > ent->localtime)
11497 ftim = ent->first_send;
11498 else
11499 ftim = ent->localtime;
11500 } else {
11501 /* TSNH */
11502 ftim = ent->localtime;
11503 }
11504 if (laa > ent->localtime)
11505 tim = laa - ftim;
11506 else
11507 tim = 0;
11508 cbw = data * HPTS_USEC_IN_SEC;
11509 if (tim > 0)
11510 cbw /= tim;
11511 else
11512 cbw = 0;
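/*
 * Worked example (illustrative numbers): data = 5,000,000 bytes for
 * the request and tim = 2,000,000 usecs between first send and the
 * completing ACK gives cbw = 5,000,000 * HPTS_USEC_IN_SEC / 2,000,000
 * = 2,500,000 bytes/sec (about 20 Mbps) in the log below.
 */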
11513 rack_log_hybrid_bw(rack, th_ack, cbw, tim, data, HYBRID_LOG_BW_MEASURE, 0, ent, __LINE__);
11514 /*
11515 * Check to see if we are freeing what we are pointing to send wise
11516 * if so be sure to NULL the pointer so we know we are no longer
11517 * set to anything.
11518 */
11519 if (ent == rack->r_ctl.rc_last_sft) {
11520 rack->r_ctl.rc_last_sft = NULL;
11521 if (rack->rc_hybrid_mode) {
11522 rack->rc_catch_up = 0;
11523 if (rack->cspr_is_fcc == 0)
11524 rack->r_ctl.bw_rate_cap = 0;
11525 else
11526 rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap;
11527 rack->r_ctl.client_suggested_maxseg = 0;
11528 }
11529 }
11530 /* Generate the log that the tcp_netflix call would have */
11531 tcp_req_log_req_info(rack->rc_tp, ent,
11532 i, TCP_TRK_REQ_LOG_FREED, 0, 0);
11533 /* Free it and see if there is another one */
11534 tcp_req_free_a_slot(rack->rc_tp, ent);
11535 ent = tcp_req_find_a_req_that_is_completed_by(rack->rc_tp, th_ack, &i);
11536 }
11537 }
11538 #endif
11539
11540
11541 /*
11542 * Return value of 1, we do not need to call rack_process_data().
11543 * return value of 0, rack_process_data can be called.
11544 * For ret_val if its 0 the TCP is locked, if its non-zero
11545 * its unlocked and probably unsafe to touch the TCB.
11546 */
11547 static int
11548 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
11549 struct tcpcb *tp, struct tcpopt *to,
11550 uint32_t tiwin, int32_t tlen,
11551 int32_t * ofia, int32_t thflags, int32_t *ret_val, int32_t orig_tlen)
11552 {
11553 int32_t ourfinisacked = 0;
11554 int32_t nsegs, acked_amount;
11555 int32_t acked;
11556 struct mbuf *mfree;
11557 struct tcp_rack *rack;
11558 int32_t under_pacing = 0;
11559 int32_t post_recovery = 0;
11560 uint32_t p_cwnd;
11561
11562 INP_WLOCK_ASSERT(tptoinpcb(tp));
11563
11564 rack = (struct tcp_rack *)tp->t_fb_ptr;
11565 if (SEQ_GEQ(tp->snd_una, tp->iss + (65535 << tp->snd_scale))) {
11566 /* Checking SEG.ACK against ISS is definitely redundant. */
11567 tp->t_flags2 |= TF2_NO_ISS_CHECK;
11568 }
11569 if (!V_tcp_insecure_ack) {
11570 tcp_seq seq_min;
11571 bool ghost_ack_check;
11572
11573 if (tp->t_flags2 & TF2_NO_ISS_CHECK) {
11574 /* Check for too old ACKs (RFC 5961, Section 5.2). */
11575 seq_min = tp->snd_una - tp->max_sndwnd;
11576 ghost_ack_check = false;
11577 } else {
11578 if (SEQ_GT(tp->iss + 1, tp->snd_una - tp->max_sndwnd)) {
11579 /* Checking for ghost ACKs is stricter. */
11580 seq_min = tp->iss + 1;
11581 ghost_ack_check = true;
11582 } else {
11583 /*
11584 * Checking for too old ACKs (RFC 5961,
11585 * Section 5.2) is stricter.
11586 */
11587 seq_min = tp->snd_una - tp->max_sndwnd;
11588 ghost_ack_check = false;
11589 }
11590 }
11591 if (SEQ_LT(th->th_ack, seq_min)) {
11592 if (ghost_ack_check)
11593 TCPSTAT_INC(tcps_rcvghostack);
11594 else
11595 TCPSTAT_INC(tcps_rcvacktooold);
11596 /* Send challenge ACK. */
11597 ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
11598 rack->r_wanted_output = 1;
11599 return (1);
11600 }
11601 }
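/*
 * Illustrative example of the check above (not from the original
 * source): with iss = 1000, snd_una = 5000 and max_sndwnd = 65535,
 * (snd_una - max_sndwnd) wraps to a point sequence-wise before
 * iss + 1, so seq_min becomes 1001 and the ghost-ACK check is armed;
 * an arriving ACK of 500 is then counted as a ghost ACK and answered
 * with a challenge ACK rather than processed.
 */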
11602 if (SEQ_GT(th->th_ack, tp->snd_max)) {
11603 ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
11604 rack->r_wanted_output = 1;
11605 return (1);
11606 }
11607 if (rack->gp_ready &&
11608 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
11609 under_pacing = 1;
11610 }
11611 if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) {
11612 int in_rec, dup_ack_struck = 0;
11613 int dsack_seen = 0, sacks_seen = 0;
11614
11615 in_rec = IN_FASTRECOVERY(tp->t_flags);
11616 if (rack->rc_in_persist) {
11617 tp->t_rxtshift = 0;
11618 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
11619 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
11620 }
11621
11622 if ((th->th_ack == tp->snd_una) &&
11623 (tiwin == tp->snd_wnd) &&
11624 (orig_tlen == 0) &&
11625 ((to->to_flags & TOF_SACK) == 0)) {
11626 rack_strike_dupack(rack, th->th_ack);
11627 dup_ack_struck = 1;
11628 }
11629 rack_log_ack(tp, to, th, ((in_rec == 0) && IN_FASTRECOVERY(tp->t_flags)),
11630 dup_ack_struck, &dsack_seen, &sacks_seen);
11631
11632 }
11633 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
11634 /*
11635 * Old ack, behind (or duplicate to) the last one rcv'd
11636 * Note: We mark reordering as occurring if it is
11637 * less than and we have not closed our window.
11638 */
11639 if (SEQ_LT(th->th_ack, tp->snd_una) && (sbspace(&so->so_rcv) > ctf_fixed_maxseg(tp))) {
11640 rack->r_ctl.rc_reorder_ts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time);
11641 if (rack->r_ctl.rc_reorder_ts == 0)
11642 rack->r_ctl.rc_reorder_ts = 1;
11643 }
11644 return (0);
11645 }
11646 /*
11647 * If we reach this point, ACK is not a duplicate, i.e., it ACKs
11648 * something we sent.
11649 */
11650 if (tp->t_flags & TF_NEEDSYN) {
11651 /*
11652 * T/TCP: Connection was half-synchronized, and our SYN has
11653 * been ACK'd (so connection is now fully synchronized). Go
11654 * to non-starred state, increment snd_una for ACK of SYN,
11655 * and check if we can do window scaling.
11656 */
11657 tp->t_flags &= ~TF_NEEDSYN;
11658 tp->snd_una++;
11659 /* Do window scaling? */
11660 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
11661 (TF_RCVD_SCALE | TF_REQ_SCALE)) {
11662 tp->rcv_scale = tp->request_r_scale;
11663 /* Send window already scaled. */
11664 }
11665 }
11666 nsegs = max(1, m->m_pkthdr.lro_nsegs);
11667
11668 acked = BYTES_THIS_ACK(tp, th);
11669 if (acked) {
11670 /*
11671 * Any time we move the cum-ack forward clear
11672 * keep-alive tied probe-not-answered. The
11673 * persists clears its own on entry.
11674 */
11675 rack->probe_not_answered = 0;
11676 }
11677 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs);
11678 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
11679 /*
11680 * If we just performed our first retransmit, and the ACK arrives
11681 * within our recovery window, then it was a mistake to do the
11682 * retransmit in the first place. Recover our original cwnd and
11683 * ssthresh, and proceed to transmit where we left off.
11684 */
11685 if ((tp->t_flags & TF_PREVVALID) &&
11686 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) {
11687 tp->t_flags &= ~TF_PREVVALID;
11688 if (tp->t_rxtshift == 1 &&
11689 (int)(ticks - tp->t_badrxtwin) < 0)
11690 rack_cong_signal(tp, CC_RTO_ERR, th->th_ack, __LINE__);
11691 }
11692 if (acked) {
11693 /* assure we are not backed off */
11694 tp->t_rxtshift = 0;
11695 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
11696 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
11697 rack->rc_tlp_in_progress = 0;
11698 rack->r_ctl.rc_tlp_cnt_out = 0;
11699 /*
11700 * If it is the RXT timer we want to
11701 * stop it, so we can restart a TLP.
11702 */
11703 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT)
11704 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
11705 #ifdef TCP_REQUEST_TRK
11706 rack_req_check_for_comp(rack, th->th_ack);
11707 #endif
11708 }
11709 /*
11710 * If we have a timestamp reply, update smoothed round trip time. If
11711 * no timestamp is present but transmit timer is running and timed
11712 * sequence number was acked, update smoothed round trip time. Since
11713 * we now have an rtt measurement, cancel the timer backoff (cf.,
11714 * Phil Karn's retransmit alg.). Recompute the initial retransmit
11715 * timer.
11716 *
11717 * Some boxes send broken timestamp replies during the SYN+ACK
11718 * phase, ignore timestamps of 0 or we could calculate a huge RTT
11719 * and blow up the retransmit timer.
11720 */
11721 /*
11722 * If all outstanding data is acked, stop retransmit timer and
11723 * remember to restart (more output or persist). If there is more
11724 * data to be acked, restart retransmit timer, using current
11725 * (possibly backed-off) value.
11726 */
11727 if (acked == 0) {
11728 if (ofia)
11729 *ofia = ourfinisacked;
11730 return (0);
11731 }
11732 if (IN_RECOVERY(tp->t_flags)) {
11733 if (SEQ_LT(th->th_ack, tp->snd_recover) &&
11734 (SEQ_LT(th->th_ack, tp->snd_max))) {
11735 tcp_rack_partialack(tp);
11736 } else {
11737 rack_post_recovery(tp, th->th_ack);
11738 post_recovery = 1;
11739 /*
11740 * Grab the segsiz, multiply by 2 and add the snd_cwnd
11741 * that is the max the CC should add if we are exiting
11742 * recovery and doing a late add.
11743 */
11744 p_cwnd = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
11745 p_cwnd <<= 1;
11746 p_cwnd += tp->snd_cwnd;
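/*
 * Hedged example (illustrative): with a 1448 byte maxseg and
 * rc_pace_min_segs at or above that, p_cwnd = 2 * 1448 + snd_cwnd.
 * After rack_ack_received() lets the CC grow the window below,
 * snd_cwnd is clamped back to p_cwnd, so a non-newreno CC cannot add
 * more than about two segments while exiting recovery.
 */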
11747 }
11748 } else if ((rack->rto_from_rec == 1) &&
11749 SEQ_GEQ(th->th_ack, tp->snd_recover)) {
11750 /*
11751 * We were in recovery, hit a rxt timeout
11752 * and never re-entered recovery. The timeout(s)
11753 * made up all the lost data. In such a case
11754 * we need to clear the rto_from_rec flag.
11755 */
11756 rack->rto_from_rec = 0;
11757 }
11758 /*
11759 * Let the congestion control algorithm update congestion control
11760 * related information. This typically means increasing the
11761 * congestion window.
11762 */
11763 rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, post_recovery);
11764 if (post_recovery &&
11765 (tp->snd_cwnd > p_cwnd)) {
11766 /* Must be non-newreno (cubic) getting too ahead of itself */
11767 tp->snd_cwnd = p_cwnd;
11768 }
11769 SOCK_SENDBUF_LOCK(so);
11770 acked_amount = min(acked, (int)sbavail(&so->so_snd));
11771 tp->snd_wnd -= acked_amount;
11772 mfree = sbcut_locked(&so->so_snd, acked_amount);
11773 if ((sbused(&so->so_snd) == 0) &&
11774 (acked > acked_amount) &&
11775 (tp->t_state >= TCPS_FIN_WAIT_1) &&
11776 (tp->t_flags & TF_SENTFIN)) {
11777 /*
11778 * We must be sure our fin
11779 * was sent and acked (we can be
11780 * in FIN_WAIT_1 without having
11781 * sent the fin).
11782 */
11783 ourfinisacked = 1;
11784 }
11785 tp->snd_una = th->th_ack;
11786 /* wakeups? */
11787 if (acked_amount && sbavail(&so->so_snd))
11788 rack_adjust_sendmap_head(rack, &so->so_snd);
11789 rack_log_wakeup(tp, rack, &so->so_snd, acked, 2);
11790 /* NB: sowwakeup_locked() does an implicit unlock. */
11791 sowwakeup_locked(so);
11792 m_freem(mfree);
11793 if (SEQ_GT(tp->snd_una, tp->snd_recover))
11794 tp->snd_recover = tp->snd_una;
11795
11796 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
11797 tp->snd_nxt = tp->snd_max;
11798 }
11799 if (under_pacing &&
11800 (rack->use_fixed_rate == 0) &&
11801 (rack->in_probe_rtt == 0) &&
11802 rack->rc_gp_dyn_mul &&
11803 rack->rc_always_pace) {
11804 /* Check if we are dragging bottom */
11805 rack_check_bottom_drag(tp, rack, so);
11806 }
11807 if (tp->snd_una == tp->snd_max) {
11808 /* Nothing left outstanding */
11809 tp->t_flags &= ~TF_PREVVALID;
11810 if (rack->r_ctl.rc_went_idle_time == 0)
11811 rack->r_ctl.rc_went_idle_time = 1;
11812 rack->r_ctl.retran_during_recovery = 0;
11813 rack->r_ctl.dsack_byte_cnt = 0;
11814 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
11815 if (sbavail(&tptosocket(tp)->so_snd) == 0)
11816 tp->t_acktime = 0;
11817 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
11818 rack->rc_suspicious = 0;
11819 /* Set need output so persist might get set */
11820 rack->r_wanted_output = 1;
11821 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
11822 if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
11823 (sbavail(&so->so_snd) == 0) &&
11824 (tp->t_flags2 & TF2_DROP_AF_DATA)) {
11825 /*
11826 * The socket was gone and the
11827 * peer sent data (now or in the past), time to
11828 * reset him.
11829 */
11830 *ret_val = 1;
11831 /* tcp_close will kill the inp pre-log the Reset */
11832 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
11833 tp = tcp_close(tp);
11834 ctf_do_dropwithreset(m, tp, th, tlen);
11835 return (1);
11836 }
11837 }
11838 if (ofia)
11839 *ofia = ourfinisacked;
11840 return (0);
11841 }
11842
11843
11844 static void
11845 rack_log_collapse(struct tcp_rack *rack, uint32_t cnt, uint32_t split, uint32_t out, int line,
11846 int dir, uint32_t flags, struct rack_sendmap *rsm)
11847 {
11848 if (tcp_bblogging_on(rack->rc_tp)) {
11849 union tcp_log_stackspecific log;
11850 struct timeval tv;
11851
11852 memset(&log, 0, sizeof(log));
11853 log.u_bbr.flex1 = cnt;
11854 log.u_bbr.flex2 = split;
11855 log.u_bbr.flex3 = out;
11856 log.u_bbr.flex4 = line;
11857 log.u_bbr.flex5 = rack->r_must_retran;
11858 log.u_bbr.flex6 = flags;
11859 log.u_bbr.flex7 = rack->rc_has_collapsed;
11860 log.u_bbr.flex8 = dir; /*
11861 * 1 is collapsed, 0 is uncollapsed,
11862 * 2 is log of a rsm being marked, 3 is a split.
11863 */
11864 if (rsm == NULL)
11865 log.u_bbr.rttProp = 0;
11866 else
11867 log.u_bbr.rttProp = (uintptr_t)rsm;
11868 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
11869 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
11870 TCP_LOG_EVENTP(rack->rc_tp, NULL,
11871 &rack->rc_inp->inp_socket->so_rcv,
11872 &rack->rc_inp->inp_socket->so_snd,
11873 TCP_RACK_LOG_COLLAPSE, 0,
11874 0, &log, false, &tv);
11875 }
11876 }
11877
11878 static void
11879 rack_collapsed_window(struct tcp_rack *rack, uint32_t out, tcp_seq th_ack, int line)
11880 {
11881 /*
11882 * Here all we do is mark the collapsed point and set the flag.
11883 * This may happen again and again, but there is no
11884 * sense splitting our map until we know where the
11885 * peer finally lands in the collapse.
11886 */
11887 tcp_trace_point(rack->rc_tp, TCP_TP_COLLAPSED_WND);
11888 if ((rack->rc_has_collapsed == 0) ||
11889 (rack->r_ctl.last_collapse_point != (th_ack + rack->rc_tp->snd_wnd)))
11890 counter_u64_add(rack_collapsed_win_seen, 1);
11891 rack->r_ctl.last_collapse_point = th_ack + rack->rc_tp->snd_wnd;
11892 rack->r_ctl.high_collapse_point = rack->rc_tp->snd_max;
11893 rack->rc_has_collapsed = 1;
11894 rack->r_collapse_point_valid = 1;
11895 rack_log_collapse(rack, 0, th_ack, rack->r_ctl.last_collapse_point, line, 1, 0, NULL);
11896 }
11897
11898 static void
11899 rack_un_collapse_window(struct tcp_rack *rack, int line)
11900 {
11901 struct rack_sendmap *nrsm, *rsm;
11902 int cnt = 0, split = 0;
11903 int insret __diagused;
11904
11905
11906 tcp_trace_point(rack->rc_tp, TCP_TP_COLLAPSED_WND);
11907 rack->rc_has_collapsed = 0;
11908 rsm = tqhash_find(rack->r_ctl.tqh, rack->r_ctl.last_collapse_point);
11909 if (rsm == NULL) {
11910 /* Nothing to do maybe the peer ack'ed it all */
11911 rack_log_collapse(rack, 0, 0, ctf_outstanding(rack->rc_tp), line, 0, 0, NULL);
11912 return;
11913 }
11914 /* Now do we need to split this one? */
11915 if (SEQ_GT(rack->r_ctl.last_collapse_point, rsm->r_start)) {
11916 rack_log_collapse(rack, rsm->r_start, rsm->r_end,
11917 rack->r_ctl.last_collapse_point, line, 3, rsm->r_flags, rsm);
11918 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
11919 if (nrsm == NULL) {
11920 /* We can't get a rsm, mark all? */
11921 nrsm = rsm;
11922 goto no_split;
11923 }
11924 /* Clone it */
11925 split = 1;
11926 rack_clone_rsm(rack, nrsm, rsm, rack->r_ctl.last_collapse_point);
11927 #ifndef INVARIANTS
11928 (void)tqhash_insert(rack->r_ctl.tqh, nrsm);
11929 #else
11930 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) {
11931 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p",
11932 nrsm, insret, rack, rsm);
11933 }
11934 #endif
11935 rack_log_map_chg(rack->rc_tp, rack, NULL, rsm, nrsm, MAP_SPLIT,
11936 rack->r_ctl.last_collapse_point, __LINE__);
11937 if (rsm->r_in_tmap) {
11938 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
11939 nrsm->r_in_tmap = 1;
11940 }
11941 /*
11942 * Set in the new RSM as the
11943 * collapsed starting point
11944 */
11945 rsm = nrsm;
11946 }
11947
11948 no_split:
11949 TQHASH_FOREACH_FROM(nrsm, rack->r_ctl.tqh, rsm) {
11950 cnt++;
11951 nrsm->r_flags |= RACK_RWND_COLLAPSED;
11952 rack_log_collapse(rack, nrsm->r_start, nrsm->r_end, 0, line, 4, nrsm->r_flags, nrsm);
11953 cnt++;
11954 }
11955 if (cnt) {
11956 counter_u64_add(rack_collapsed_win, 1);
11957 }
11958 rack_log_collapse(rack, cnt, split, ctf_outstanding(rack->rc_tp), line, 0, 0, NULL);
11959 }
11960
11961 static void
11962 rack_handle_delayed_ack(struct tcpcb *tp, struct tcp_rack *rack,
11963 int32_t tlen, int32_t tfo_syn)
11964 {
11965 if (DELAY_ACK(tp, tlen) || tfo_syn) {
11966 rack_timer_cancel(tp, rack,
11967 rack->r_ctl.rc_rcvtime, __LINE__);
11968 tp->t_flags |= TF_DELACK;
11969 } else {
11970 rack->r_wanted_output = 1;
11971 tp->t_flags |= TF_ACKNOW;
11972 }
11973 }
11974
11975 static void
11976 rack_validate_fo_sendwin_up(struct tcpcb *tp, struct tcp_rack *rack)
11977 {
11978 /*
11979 * If fast output is in progress, lets validate that
11980 * the new window did not shrink on us and make it
11981 * so fast output should end.
11982 */
11983 if (rack->r_fast_output) {
11984 uint32_t out;
11985
11986 /*
11987 * Calculate what we will send if left as is
11988 * and compare that to our send window.
11989 */
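/*
 * Illustrative numbers (not from the original source): with
 * out = 30,000 bytes already outstanding, left_to_send = 20,000 and
 * snd_wnd = 40,000, out + left_to_send exceeds the window; since
 * out < snd_wnd we shrink left_to_send to 10,000, and if that ever
 * falls below one full segment fast output is switched off instead.
 */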
11990 out = ctf_outstanding(tp);
11991 if ((out + rack->r_ctl.fsb.left_to_send) > tp->snd_wnd) {
11992 /* ok we have an issue */
11993 if (out >= tp->snd_wnd) {
11994 /* Turn off fast output the window is met or collapsed */
11995 rack->r_fast_output = 0;
11996 } else {
11997 /* we have some room left */
11998 rack->r_ctl.fsb.left_to_send = tp->snd_wnd - out;
11999 if (rack->r_ctl.fsb.left_to_send < ctf_fixed_maxseg(tp)) {
12000 /* If not at least 1 full segment never mind */
12001 rack->r_fast_output = 0;
12002 }
12003 }
12004 }
12005 }
12006 }
12007
12008 /*
12009 * Return value of 1, the TCB is unlocked and most
12010 * likely gone, return value of 0, the TCP is still
12011 * locked.
12012 */
12013 static int
12014 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
12015 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
12016 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
12017 {
12018 /*
12019 * Update window information. Don't look at window if no ACK: TAC's
12020 * send garbage on first SYN.
12021 */
12022 int32_t nsegs;
12023 int32_t tfo_syn;
12024 struct tcp_rack *rack;
12025
12026 INP_WLOCK_ASSERT(tptoinpcb(tp));
12027
12028 rack = (struct tcp_rack *)tp->t_fb_ptr;
12029 nsegs = max(1, m->m_pkthdr.lro_nsegs);
12030 if ((thflags & TH_ACK) &&
12031 (SEQ_LT(tp->snd_wl1, th->th_seq) ||
12032 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
12033 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
12034 /* keep track of pure window updates */
12035 if (tlen == 0 &&
12036 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
12037 KMOD_TCPSTAT_INC(tcps_rcvwinupd);
12038 tp->snd_wnd = tiwin;
12039 rack_validate_fo_sendwin_up(tp, rack);
12040 tp->snd_wl1 = th->th_seq;
12041 tp->snd_wl2 = th->th_ack;
12042 if (tp->snd_wnd > tp->max_sndwnd)
12043 tp->max_sndwnd = tp->snd_wnd;
12044 rack->r_wanted_output = 1;
12045 } else if (thflags & TH_ACK) {
12046 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) {
12047 tp->snd_wnd = tiwin;
12048 rack_validate_fo_sendwin_up(tp, rack);
12049 tp->snd_wl1 = th->th_seq;
12050 tp->snd_wl2 = th->th_ack;
12051 }
12052 }
12053 if (tp->snd_wnd < ctf_outstanding(tp))
12054 /* The peer collapsed the window */
12055 rack_collapsed_window(rack, ctf_outstanding(tp), th->th_ack, __LINE__);
12056 else if (rack->rc_has_collapsed)
12057 rack_un_collapse_window(rack, __LINE__);
12058 if ((rack->r_collapse_point_valid) &&
12059 (SEQ_GT(th->th_ack, rack->r_ctl.high_collapse_point)))
12060 rack->r_collapse_point_valid = 0;
12061 /* Was persist timer active and now we have window space? */
12062 if ((rack->rc_in_persist != 0) &&
12063 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
12064 rack->r_ctl.rc_pace_min_segs))) {
12065 rack_exit_persist(tp, rack, rack->r_ctl.rc_rcvtime);
12066 tp->snd_nxt = tp->snd_max;
12067 /* Make sure we output to start the timer */
12068 rack->r_wanted_output = 1;
12069 }
12070 /* Do we enter persists? */
12071 if ((rack->rc_in_persist == 0) &&
12072 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
12073 TCPS_HAVEESTABLISHED(tp->t_state) &&
12074 ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) &&
12075 sbavail(&tptosocket(tp)->so_snd) &&
12076 (sbavail(&tptosocket(tp)->so_snd) > tp->snd_wnd)) {
12077 /*
12078 * Here the rwnd is less than
12079 * the pacing size, we are established,
12080 * nothing is outstanding, and there is
12081 * data to send. Enter persists.
12082 */
12083 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, tp->snd_una);
12084 }
12085 if (tp->t_flags2 & TF2_DROP_AF_DATA) {
12086 m_freem(m);
12087 return (0);
12088 }
12089 /*
12090 * Don't process the URG bit; ignore it and just
12091 * drag the urgent pointer (rcv_up) along.
12092 */
12093 tp->rcv_up = tp->rcv_nxt;
12094
12095 /*
12096 * Process the segment text, merging it into the TCP sequencing
12097 * queue, and arranging for acknowledgment of receipt if necessary.
12098 * This process logically involves adjusting tp->rcv_wnd as data is
12099 * presented to the user (this happens in tcp_usrreq.c, case
12100 * PRU_RCVD). If a FIN has already been received on this connection
12101 * then we just ignore the text.
12102 */
12103 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
12104 (tp->t_flags & TF_FASTOPEN));
12105 if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) &&
12106 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
12107 tcp_seq save_start = th->th_seq;
12108 tcp_seq save_rnxt = tp->rcv_nxt;
12109 int save_tlen = tlen;
12110
12111 m_adj(m, drop_hdrlen); /* delayed header drop */
12112 /*
12113 * Insert segment which includes th into TCP reassembly
12114 * queue with control block tp. Set thflags to whether
12115 * reassembly now includes a segment with FIN. This handles
12116 * the common case inline (segment is the next to be
12117 * received on an established connection, and the queue is
12118 * empty), avoiding linkage into and removal from the queue
12119 * and repetition of various conversions. Set DELACK for
12120 * segments received in order, but ack immediately when
12121 * segments are out of order (so fast retransmit can work).
12122 */
12123 if (th->th_seq == tp->rcv_nxt &&
12124 SEGQ_EMPTY(tp) &&
12125 (TCPS_HAVEESTABLISHED(tp->t_state) ||
12126 tfo_syn)) {
12127 #ifdef NETFLIX_SB_LIMITS
12128 u_int mcnt, appended;
12129
12130 if (so->so_rcv.sb_shlim) {
12131 mcnt = m_memcnt(m);
12132 appended = 0;
12133 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
12134 CFO_NOSLEEP, NULL) == false) {
12135 counter_u64_add(tcp_sb_shlim_fails, 1);
12136 m_freem(m);
12137 return (0);
12138 }
12139 }
12140 #endif
12141 rack_handle_delayed_ack(tp, rack, tlen, tfo_syn);
12142 tp->rcv_nxt += tlen;
12143 if (tlen &&
12144 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
12145 (tp->t_fbyte_in == 0)) {
12146 tp->t_fbyte_in = ticks;
12147 if (tp->t_fbyte_in == 0)
12148 tp->t_fbyte_in = 1;
12149 if (tp->t_fbyte_out && tp->t_fbyte_in)
12150 tp->t_flags2 |= TF2_FBYTES_COMPLETE;
12151 }
12152 thflags = tcp_get_flags(th) & TH_FIN;
12153 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs);
12154 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen);
12155 SOCK_RECVBUF_LOCK(so);
12156 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
12157 m_freem(m);
12158 } else {
12159 int32_t newsize;
12160
12161 if (tlen > 0) {
12162 newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
12163 if (newsize)
12164 if (!sbreserve_locked(so, SO_RCV, newsize, NULL))
12165 so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
12166 }
12167 #ifdef NETFLIX_SB_LIMITS
12168 appended =
12169 #endif
12170 sbappendstream_locked(&so->so_rcv, m, 0);
12171 }
12172 rack_log_wakeup(tp, rack, &so->so_rcv, tlen, 1);
12173 /* NB: sorwakeup_locked() does an implicit unlock. */
12174 sorwakeup_locked(so);
12175 #ifdef NETFLIX_SB_LIMITS
12176 if (so->so_rcv.sb_shlim && appended != mcnt)
12177 counter_fo_release(so->so_rcv.sb_shlim,
12178 mcnt - appended);
12179 #endif
12180 } else {
12181 /*
12182 * XXX: Due to the header drop above "th" is
12183 * theoretically invalid by now. Fortunately
12184 * m_adj() doesn't actually free any mbufs when
12185 * trimming from the head.
12186 */
12187 tcp_seq temp = save_start;
12188
12189 thflags = tcp_reass(tp, th, &temp, &tlen, m);
12190 tp->t_flags |= TF_ACKNOW;
12191 if (tp->t_flags & TF_WAKESOR) {
12192 tp->t_flags &= ~TF_WAKESOR;
12193 /* NB: sorwakeup_locked() does an implicit unlock. */
12194 sorwakeup_locked(so);
12195 }
12196 }
12197 if ((tp->t_flags & TF_SACK_PERMIT) &&
12198 (save_tlen > 0) &&
12199 TCPS_HAVEESTABLISHED(tp->t_state)) {
12200 if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) {
12201 /*
12202 * DSACK actually handled in the fastpath
12203 * above.
12204 */
12205 tcp_update_sack_list(tp, save_start,
12206 save_start + save_tlen);
12207 } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
12208 if ((tp->rcv_numsacks >= 1) &&
12209 (tp->sackblks[0].end == save_start)) {
12210 /*
12211 * Partial overlap, recorded at todrop
12212 * above.
12213 */
12214 tcp_update_sack_list(tp,
12215 tp->sackblks[0].start,
12216 tp->sackblks[0].end);
12217 } else {
12218 tcp_update_dsack_list(tp, save_start,
12219 save_start + save_tlen);
12220 }
12221 } else if (tlen >= save_tlen) {
12222 /* Update of sackblks. */
12223 tcp_update_dsack_list(tp, save_start,
12224 save_start + save_tlen);
12225 } else if (tlen > 0) {
12226 tcp_update_dsack_list(tp, save_start,
12227 save_start + tlen);
12228 }
12229 }
12230 } else {
12231 m_freem(m);
12232 thflags &= ~TH_FIN;
12233 }
12234
12235 /*
12236 * If FIN is received ACK the FIN and let the user know that the
12237 * connection is closing.
12238 */
12239 if (thflags & TH_FIN) {
12240 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
12241 /* The socket upcall is handled by socantrcvmore. */
12242 socantrcvmore(so);
12243 /*
12244 * If connection is half-synchronized (ie NEEDSYN
12245 * flag on) then delay ACK, so it may be piggybacked
12246 * when SYN is sent. Otherwise, since we received a
12247 * FIN then no more input can be expected, send ACK
12248 * now.
12249 */
12250 if (tp->t_flags & TF_NEEDSYN) {
12251 rack_timer_cancel(tp, rack,
12252 rack->r_ctl.rc_rcvtime, __LINE__);
12253 tp->t_flags |= TF_DELACK;
12254 } else {
12255 tp->t_flags |= TF_ACKNOW;
12256 }
12257 tp->rcv_nxt++;
12258 }
12259 switch (tp->t_state) {
12260 /*
12261 * In SYN_RECEIVED and ESTABLISHED STATES enter the
12262 * CLOSE_WAIT state.
12263 */
12264 case TCPS_SYN_RECEIVED:
12265 tp->t_starttime = ticks;
12266 /* FALLTHROUGH */
12267 case TCPS_ESTABLISHED:
12268 rack_timer_cancel(tp, rack,
12269 rack->r_ctl.rc_rcvtime, __LINE__);
12270 tcp_state_change(tp, TCPS_CLOSE_WAIT);
12271 break;
12272
12273 /*
12274 * If still in FIN_WAIT_1 STATE FIN has not been
12275 * acked so enter the CLOSING state.
12276 */
12277 case TCPS_FIN_WAIT_1:
12278 rack_timer_cancel(tp, rack,
12279 rack->r_ctl.rc_rcvtime, __LINE__);
12280 tcp_state_change(tp, TCPS_CLOSING);
12281 break;
12282
12283 /*
12284 * In FIN_WAIT_2 state enter the TIME_WAIT state,
12285 * starting the time-wait timer, turning off the
12286 * other standard timers.
12287 */
12288 case TCPS_FIN_WAIT_2:
12289 rack_timer_cancel(tp, rack,
12290 rack->r_ctl.rc_rcvtime, __LINE__);
12291 tcp_twstart(tp);
12292 return (1);
12293 }
12294 }
12295 /*
12296 * Return any desired output.
12297 */
12298 if ((tp->t_flags & TF_ACKNOW) ||
12299 (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) {
12300 rack->r_wanted_output = 1;
12301 }
12302 return (0);
12303 }
12304
12305 /*
12306 * Here nothing is really faster, its just that we
12307 * have broken out the fast-data path also just like
12308 * the fast-ack.
12309 */
12310 static int
12311 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
12312 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
12313 uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos)
12314 {
12315 int32_t nsegs;
12316 int32_t newsize = 0; /* automatic sockbuf scaling */
12317 struct tcp_rack *rack;
12318 #ifdef NETFLIX_SB_LIMITS
12319 u_int mcnt, appended;
12320 #endif
12321
12322 /*
12323 * If last ACK falls within this segment's sequence numbers, record
12324 * the timestamp. NOTE that the test is modified according to the
12325 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
12326 */
12327 if (__predict_false(th->th_seq != tp->rcv_nxt)) {
12328 return (0);
12329 }
12330 if (tiwin && tiwin != tp->snd_wnd) {
12331 return (0);
12332 }
12333 if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) {
12334 return (0);
12335 }
12336 if (__predict_false((to->to_flags & TOF_TS) &&
12337 (TSTMP_LT(to->to_tsval, tp->ts_recent)))) {
12338 return (0);
12339 }
12340 if (__predict_false((th->th_ack != tp->snd_una))) {
12341 return (0);
12342 }
12343 if (__predict_false(tlen > sbspace(&so->so_rcv))) {
12344 return (0);
12345 }
12346 if ((to->to_flags & TOF_TS) != 0 &&
12347 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
12348 tp->ts_recent_age = tcp_ts_getticks();
12349 tp->ts_recent = to->to_tsval;
12350 }
12351 rack = (struct tcp_rack *)tp->t_fb_ptr;
12352 /*
12353 * This is a pure, in-sequence data packet with nothing on the
12354 * reassembly queue and we have enough buffer space to take it.
12355 */
12356 nsegs = max(1, m->m_pkthdr.lro_nsegs);
12357
12358 #ifdef NETFLIX_SB_LIMITS
12359 if (so->so_rcv.sb_shlim) {
12360 mcnt = m_memcnt(m);
12361 appended = 0;
12362 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt,
12363 CFO_NOSLEEP, NULL) == false) {
12364 counter_u64_add(tcp_sb_shlim_fails, 1);
12365 m_freem(m);
12366 return (1);
12367 }
12368 }
12369 #endif
12370 /* Clean receiver SACK report if present */
12371 if (tp->rcv_numsacks)
12372 tcp_clean_sackreport(tp);
12373 KMOD_TCPSTAT_INC(tcps_preddat);
12374 tp->rcv_nxt += tlen;
12375 if (tlen &&
12376 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
12377 (tp->t_fbyte_in == 0)) {
12378 tp->t_fbyte_in = ticks;
12379 if (tp->t_fbyte_in == 0)
12380 tp->t_fbyte_in = 1;
12381 if (tp->t_fbyte_out && tp->t_fbyte_in)
12382 tp->t_flags2 |= TF2_FBYTES_COMPLETE;
12383 }
12384 /*
12385 * Pull snd_wl1 up to prevent seq wrap relative to th_seq.
12386 */
12387 tp->snd_wl1 = th->th_seq;
12388 /*
12389 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt.
12390 */
12391 tp->rcv_up = tp->rcv_nxt;
12392 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs);
12393 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen);
12394 newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
12395
12396 /* Add data to socket buffer. */
12397 SOCK_RECVBUF_LOCK(so);
12398 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
12399 m_freem(m);
12400 } else {
12401 /*
12402 * Set new socket buffer size. Give up when limit is
12403 * reached.
12404 */
12405 if (newsize)
12406 if (!sbreserve_locked(so, SO_RCV, newsize, NULL))
12407 so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
12408 m_adj(m, drop_hdrlen); /* delayed header drop */
12409 #ifdef NETFLIX_SB_LIMITS
12410 appended =
12411 #endif
12412 sbappendstream_locked(&so->so_rcv, m, 0);
12413 ctf_calc_rwin(so, tp);
12414 }
12415 rack_log_wakeup(tp, rack, &so->so_rcv, tlen, 1);
12416 /* NB: sorwakeup_locked() does an implicit unlock. */
12417 sorwakeup_locked(so);
12418 #ifdef NETFLIX_SB_LIMITS
12419 if (so->so_rcv.sb_shlim && mcnt != appended)
12420 counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended);
12421 #endif
12422 rack_handle_delayed_ack(tp, rack, tlen, 0);
12423 if (tp->snd_una == tp->snd_max)
12424 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
12425 return (1);
12426 }
12427
12428 /*
12429 * This subfunction is used to try to highly optimize the
12430 * fast path. We again allow window updates that are
12431 * in sequence to remain in the fast-path. We also add
12432 * in the __predict's to attempt to help the compiler.
12433 * Note that if we return a 0, then we can *not* process
12434 * it and the caller should push the packet into the
12435 * slow-path.
12436 */
12437 static int
12438 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
12439 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
12440 uint32_t tiwin, int32_t nxt_pkt, uint32_t cts)
12441 {
12442 int32_t acked;
12443 int32_t nsegs;
12444 int32_t under_pacing = 0;
12445 struct tcp_rack *rack;
12446
12447 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
12448 /* Old ack, behind (or duplicate to) the last one rcv'd */
12449 return (0);
12450 }
12451 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
12452 /* Above what we have sent? */
12453 return (0);
12454 }
12455 if (__predict_false(tiwin == 0)) {
12456 /* zero window */
12457 return (0);
12458 }
12459 if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) {
12460 /* We need a SYN or a FIN, unlikely.. */
12461 return (0);
12462 }
12463 if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
12464 /* Timestamp is behind .. old ack with seq wrap? */
12465 return (0);
12466 }
12467 if (__predict_false(IN_RECOVERY(tp->t_flags))) {
12468 /* Still recovering */
12469 return (0);
12470 }
12471 rack = (struct tcp_rack *)tp->t_fb_ptr;
12472 if (rack->r_ctl.rc_sacked) {
12473 /* We have sack holes on our scoreboard */
12474 return (0);
12475 }
12476 /* Ok if we reach here, we can process a fast-ack */
12477 if (rack->gp_ready &&
12478 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
12479 under_pacing = 1;
12480 }
12481 nsegs = max(1, m->m_pkthdr.lro_nsegs);
12482 rack_log_ack(tp, to, th, 0, 0, NULL, NULL);
12483 /* Did the window get updated? */
12484 if (tiwin != tp->snd_wnd) {
12485 tp->snd_wnd = tiwin;
12486 rack_validate_fo_sendwin_up(tp, rack);
12487 tp->snd_wl1 = th->th_seq;
12488 if (tp->snd_wnd > tp->max_sndwnd)
12489 tp->max_sndwnd = tp->snd_wnd;
12490 }
12491 /* Do we exit persists? */
12492 if ((rack->rc_in_persist != 0) &&
12493 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
12494 rack->r_ctl.rc_pace_min_segs))) {
12495 rack_exit_persist(tp, rack, cts);
12496 }
12497 /* Do we enter persists? */
12498 if ((rack->rc_in_persist == 0) &&
12499 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
12500 TCPS_HAVEESTABLISHED(tp->t_state) &&
12501 ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) &&
12502 sbavail(&tptosocket(tp)->so_snd) &&
12503 (sbavail(&tptosocket(tp)->so_snd) > tp->snd_wnd)) {
12504 /*
12505 * Here the rwnd is less than
12506 * the pacing size, we are established,
12507 * nothing is outstanding, and there is
12508 * data to send. Enter persists.
12509 */
12510 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, th->th_ack);
12511 }
12512 /*
12513 * If last ACK falls within this segment's sequence numbers, record
12514 * the timestamp. NOTE that the test is modified according to the
12515 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26).
12516 */
12517 if ((to->to_flags & TOF_TS) != 0 &&
12518 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
12519 tp->ts_recent_age = tcp_ts_getticks();
12520 tp->ts_recent = to->to_tsval;
12521 }
12522 /*
12523 * This is a pure ack for outstanding data.
12524 */
12525 KMOD_TCPSTAT_INC(tcps_predack);
12526
12527 /*
12528  * "Bad retransmit" recovery: undo the congestion response if our first retransmit proved needless.
12529  */
12530 if ((tp->t_flags & TF_PREVVALID) &&
12531 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) {
12532 tp->t_flags &= ~TF_PREVVALID;
12533 if (tp->t_rxtshift == 1 &&
12534 (int)(ticks - tp->t_badrxtwin) < 0)
12535 rack_cong_signal(tp, CC_RTO_ERR, th->th_ack, __LINE__);
12536 }
12537 /*
12538 * Recalculate the transmit timer / rtt.
12539 *
12540 * Some boxes send broken timestamp replies during the SYN+ACK
12541  * phase; ignore timestamps of 0, or we could calculate a huge RTT
12542 * and blow up the retransmit timer.
12543 */
12544 acked = BYTES_THIS_ACK(tp, th);
12545
12546 #ifdef TCP_HHOOK
12547 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
12548 hhook_run_tcp_est_in(tp, th, to);
12549 #endif
12550 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs);
12551 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
12552 if (acked) {
12553 struct mbuf *mfree;
12554
12555 rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, 0);
12556 SOCK_SENDBUF_LOCK(so);
12557 mfree = sbcut_locked(&so->so_snd, acked);
12558 tp->snd_una = th->th_ack;
12559 /* Note we want to hold the sb lock through the sendmap adjust */
12560 rack_adjust_sendmap_head(rack, &so->so_snd);
12561 /* Wake up the socket if we have room to write more */
12562 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2);
12563 sowwakeup_locked(so);
12564 m_freem(mfree);
12565 tp->t_rxtshift = 0;
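/* New data was acked; recompute the RTO from the current srtt, clamped between rack_rto_min and rack_rto_max. */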
12566 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
12567 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
12568 rack->rc_tlp_in_progress = 0;
12569 rack->r_ctl.rc_tlp_cnt_out = 0;
12570 /*
12571 * If it is the RXT timer we want to
12572 * stop it, so we can restart a TLP.
12573 */
12574 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT)
12575 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
12576
12577 #ifdef TCP_REQUEST_TRK
12578 rack_req_check_for_comp(rack, th->th_ack);
12579 #endif
12580 }
12581 /*
12582 * Let the congestion control algorithm update congestion control
12583 * related information. This typically means increasing the
12584 * congestion window.
12585 */
12586 if (tp->snd_wnd < ctf_outstanding(tp)) {
12587 /* The peer collapsed the window */
12588 rack_collapsed_window(rack, ctf_outstanding(tp), th->th_ack, __LINE__);
12589 } else if (rack->rc_has_collapsed)
12590 rack_un_collapse_window(rack, __LINE__);
12591 if ((rack->r_collapse_point_valid) &&
12592 (SEQ_GT(tp->snd_una, rack->r_ctl.high_collapse_point)))
12593 rack->r_collapse_point_valid = 0;
12594 /*
12595 * Pull snd_wl2 up to prevent seq wrap relative to th_ack.
12596 */
12597 tp->snd_wl2 = th->th_ack;
12598 tp->t_dupacks = 0;
12599 m_freem(m);
12600 /* ND6_HINT(tp); *//* Some progress has been made. */
12601
12602 /*
12603 * If all outstanding data are acked, stop retransmit timer,
12604 * otherwise restart timer using current (possibly backed-off)
12605 * value. If process is waiting for space, wakeup/selwakeup/signal.
12606 * If data are ready to send, let tcp_output decide between more
12607 * output or persist.
12608 */
12609 if (under_pacing &&
12610 (rack->use_fixed_rate == 0) &&
12611 (rack->in_probe_rtt == 0) &&
12612 rack->rc_gp_dyn_mul &&
12613 rack->rc_always_pace) {
12614 /* Check if we are dragging bottom */
12615 rack_check_bottom_drag(tp, rack, so);
12616 }
12617 if (tp->snd_una == tp->snd_max) {
12618 tp->t_flags &= ~TF_PREVVALID;
12619 rack->r_ctl.retran_during_recovery = 0;
12620 rack->rc_suspicious = 0;
12621 rack->r_ctl.dsack_byte_cnt = 0;
12622 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
12623 if (rack->r_ctl.rc_went_idle_time == 0)
12624 rack->r_ctl.rc_went_idle_time = 1;
12625 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
12626 if (sbavail(&tptosocket(tp)->so_snd) == 0)
12627 tp->t_acktime = 0;
12628 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
12629 }
12630 if (acked && rack->r_fast_output)
12631 rack_gain_for_fastoutput(rack, tp, so, (uint32_t)acked);
12632 if (sbavail(&so->so_snd)) {
12633 rack->r_wanted_output = 1;
12634 }
12635 return (1);
12636 }
12637
12638 /*
12639 * Return value of 1, the TCB is unlocked and most
12640 * likely gone, return value of 0, the TCP is still
12641 * locked.
12642 */
12643 static int
12644 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
12645 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
12646 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
12647 {
12648 int32_t ret_val = 0;
12649 int32_t orig_tlen = tlen;
12650 int32_t todrop;
12651 int32_t ourfinisacked = 0;
12652 struct tcp_rack *rack;
12653
12654 INP_WLOCK_ASSERT(tptoinpcb(tp));
12655
12656 ctf_calc_rwin(so, tp);
12657 /*
12658  * If the state is SYN_SENT: if the segment contains an ACK, but not for
12659  * our SYN, drop the input. If the segment contains a RST, then drop the
12660  * connection. If the segment does not contain a SYN, then drop it.
12661  * Otherwise this is an acceptable SYN segment: initialize tp->rcv_nxt
12662  * and tp->irs; if the segment contains an ack then advance tp->snd_una;
12663  * if the segment contains an ECE and ECN support is enabled, the stream
12664  * is ECN capable. If the SYN has been acked, change to ESTABLISHED,
12665  * else to SYN_RCVD state; arrange for the segment to be acked (eventually)
12666  * and continue processing the rest of the data/controls.
12667 */
12668 if ((thflags & TH_ACK) &&
12669 (SEQ_LEQ(th->th_ack, tp->iss) ||
12670 SEQ_GT(th->th_ack, tp->snd_max))) {
12671 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
12672 ctf_do_dropwithreset(m, tp, th, tlen);
12673 return (1);
12674 }
12675 if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) {
12676 TCP_PROBE5(connect__refused, NULL, tp,
12677 mtod(m, const char *), tp, th);
12678 tp = tcp_drop(tp, ECONNREFUSED);
12679 ctf_do_drop(m, tp);
12680 return (1);
12681 }
12682 if (thflags & TH_RST) {
12683 ctf_do_drop(m, tp);
12684 return (1);
12685 }
12686 if (!(thflags & TH_SYN)) {
12687 ctf_do_drop(m, tp);
12688 return (1);
12689 }
12690 tp->irs = th->th_seq;
12691 tcp_rcvseqinit(tp);
12692 rack = (struct tcp_rack *)tp->t_fb_ptr;
12693 if (thflags & TH_ACK) {
12694 int tfo_partial = 0;
12695
12696 KMOD_TCPSTAT_INC(tcps_connects);
12697 soisconnected(so);
12698 #ifdef MAC
12699 mac_socketpeer_set_from_mbuf(m, so);
12700 #endif
12701 /* Do window scaling on this connection? */
12702 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
12703 (TF_RCVD_SCALE | TF_REQ_SCALE)) {
12704 tp->rcv_scale = tp->request_r_scale;
12705 }
12706 tp->rcv_adv += min(tp->rcv_wnd,
12707 TCP_MAXWIN << tp->rcv_scale);
12708 /*
12709 * If not all the data that was sent in the TFO SYN
12710 * has been acked, resend the remainder right away.
12711 */
12712 if ((tp->t_flags & TF_FASTOPEN) &&
12713 (tp->snd_una != tp->snd_max)) {
12714 /* Was it a partial ack? */
12715 if (SEQ_LT(th->th_ack, tp->snd_max))
12716 tfo_partial = 1;
12717 }
12718 /*
12719  * If there's data, delay the ACK; if there's also a FIN, ACKNOW
12720  * will be turned on later.
12721 */
12722 if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial) {
12723 rack_timer_cancel(tp, rack,
12724 rack->r_ctl.rc_rcvtime, __LINE__);
12725 tp->t_flags |= TF_DELACK;
12726 } else {
12727 rack->r_wanted_output = 1;
12728 tp->t_flags |= TF_ACKNOW;
12729 }
12730
12731 tcp_ecn_input_syn_sent(tp, thflags, iptos);
12732
12733 if (SEQ_GT(th->th_ack, tp->snd_una)) {
12734 /*
12735 * We advance snd_una for the
12736 * fast open case. If th_ack is
12737 * acknowledging data beyond
12738 * snd_una we can't just call
12739 * ack-processing since the
12740 * data stream in our send-map
12741 * will start at snd_una + 1 (one
12742  * beyond the SYN). If it's just
12743  * equal, we don't need to do that
12744 * and there is no send_map.
12745 */
12746 tp->snd_una++;
12747 if (tfo_partial && (SEQ_GT(tp->snd_max, tp->snd_una))) {
12748 /*
12749 * We sent a SYN with data, and thus have a
12750 * sendmap entry with a SYN set. Lets find it
12751 * and take off the send bit and the byte and
12752 * set it up to be what we send (send it next).
12753 */
12754 struct rack_sendmap *rsm;
12755
12756 rsm = tqhash_min(rack->r_ctl.tqh);
12757 if (rsm) {
12758 if (rsm->r_flags & RACK_HAS_SYN) {
12759 rsm->r_flags &= ~RACK_HAS_SYN;
12760 rsm->r_start++;
12761 }
12762 rack->r_ctl.rc_resend = rsm;
12763 }
12764 }
12765 }
12766 /*
12767 * Received <SYN,ACK> in SYN_SENT[*] state. Transitions:
12768 * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1
12769 */
12770 tp->t_starttime = ticks;
12771 if (tp->t_flags & TF_NEEDFIN) {
12772 tcp_state_change(tp, TCPS_FIN_WAIT_1);
12773 tp->t_flags &= ~TF_NEEDFIN;
12774 thflags &= ~TH_SYN;
12775 } else {
12776 tcp_state_change(tp, TCPS_ESTABLISHED);
12777 TCP_PROBE5(connect__established, NULL, tp,
12778 mtod(m, const char *), tp, th);
12779 rack_cc_conn_init(tp);
12780 }
12781 } else {
12782 /*
12783 * Received initial SYN in SYN-SENT[*] state => simultaneous
12784 * open. If segment contains CC option and there is a
12785  * cached CC, apply TAO test. If it succeeds, connection is
12786 * half-synchronized. Otherwise, do 3-way handshake:
12787 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If
12788 * there was no CC option, clear cached CC value.
12789 */
12790 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN | TF_SONOTCONN);
12791 tcp_state_change(tp, TCPS_SYN_RECEIVED);
12792 }
12793 /*
12794 * Advance th->th_seq to correspond to first data byte. If data,
12795 * trim to stay within window, dropping FIN if necessary.
12796 */
12797 th->th_seq++;
12798 if (tlen > tp->rcv_wnd) {
12799 todrop = tlen - tp->rcv_wnd;
12800 m_adj(m, -todrop);
12801 tlen = tp->rcv_wnd;
12802 thflags &= ~TH_FIN;
12803 KMOD_TCPSTAT_INC(tcps_rcvpackafterwin);
12804 KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
12805 }
12806 tp->snd_wl1 = th->th_seq - 1;
12807 tp->rcv_up = th->th_seq;
12808 /*
12809 * Client side of transaction: already sent SYN and data. If the
12810 * remote host used T/TCP to validate the SYN, our data will be
12811 * ACK'd; if so, enter normal data segment processing in the middle
12812 * of step 5, ack processing. Otherwise, goto step 6.
12813 */
12814 if (thflags & TH_ACK) {
12815 /* For syn-sent we need to possibly update the rtt */
12816 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
12817 uint32_t t, mcts;
12818
12819 mcts = tcp_ts_getticks();
12820 t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC;
12821 if (!tp->t_rttlow || tp->t_rttlow > t)
12822 tp->t_rttlow = t;
12823 rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 4);
12824 tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2);
12825 tcp_rack_xmit_timer_commit(rack, tp);
12826 }
12827 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen))
12828 return (ret_val);
12829 /* We may have changed to FIN_WAIT_1 above */
12830 if (tp->t_state == TCPS_FIN_WAIT_1) {
12831 /*
12832 * In FIN_WAIT_1 STATE in addition to the processing
12833 * for the ESTABLISHED state if our FIN is now
12834 * acknowledged then enter FIN_WAIT_2.
12835 */
12836 if (ourfinisacked) {
12837 /*
12838 * If we can't receive any more data, then
12839 * closing user can proceed. Starting the
12840 * timer is contrary to the specification,
12841 * but if we don't get a FIN we'll hang
12842 * forever.
12843 *
12844 * XXXjl: we should release the tp also, and
12845 * use a compressed state.
12846 */
12847 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
12848 soisdisconnected(so);
12849 tcp_timer_activate(tp, TT_2MSL,
12850 (tcp_fast_finwait2_recycle ?
12851 tcp_finwait2_timeout :
12852 TP_MAXIDLE(tp)));
12853 }
12854 tcp_state_change(tp, TCPS_FIN_WAIT_2);
12855 }
12856 }
12857 }
12858 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
12859 tiwin, thflags, nxt_pkt));
12860 }
12861
12862 /*
12863 * Return value of 1, the TCB is unlocked and most
12864 * likely gone, return value of 0, the TCP is still
12865 * locked.
12866 */
12867 static int
12868 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
12869 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
12870 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
12871 {
12872 struct tcp_rack *rack;
12873 int32_t orig_tlen = tlen;
12874 int32_t ret_val = 0;
12875 int32_t ourfinisacked = 0;
12876
12877 rack = (struct tcp_rack *)tp->t_fb_ptr;
12878 ctf_calc_rwin(so, tp);
12879 if ((thflags & TH_RST) ||
12880 (tp->t_fin_is_rst && (thflags & TH_FIN)))
12881 return (ctf_process_rst(m, th, so, tp));
12882 if ((thflags & TH_ACK) &&
12883 (SEQ_LEQ(th->th_ack, tp->snd_una) ||
12884 SEQ_GT(th->th_ack, tp->snd_max))) {
12885 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
12886 ctf_do_dropwithreset(m, tp, th, tlen);
12887 return (1);
12888 }
12889 if (tp->t_flags & TF_FASTOPEN) {
12890 /*
12891 * When a TFO connection is in SYN_RECEIVED, the
12892 * only valid packets are the initial SYN, a
12893 * retransmit/copy of the initial SYN (possibly with
12894 * a subset of the original data), a valid ACK, a
12895 * FIN, or a RST.
12896 */
12897 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
12898 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
12899 ctf_do_dropwithreset(m, tp, th, tlen);
12900 return (1);
12901 } else if (thflags & TH_SYN) {
12902 /* non-initial SYN is ignored */
12903 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) ||
12904 (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) ||
12905 (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) {
12906 ctf_do_drop(m, NULL);
12907 return (0);
12908 }
12909 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) {
12910 ctf_do_drop(m, NULL);
12911 return (0);
12912 }
12913 }
12914
12915 /*
12916 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
12917 * it's less than ts_recent, drop it.
12918 */
12919 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
12920 TSTMP_LT(to->to_tsval, tp->ts_recent)) {
12921 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
12922 return (ret_val);
12923 }
12924 /*
12925 * In the SYN-RECEIVED state, validate that the packet belongs to
12926 * this connection before trimming the data to fit the receive
12927 * window. Check the sequence number versus IRS since we know the
12928 * sequence numbers haven't wrapped. This is a partial fix for the
12929 * "LAND" DoS attack.
12930 */
12931 if (SEQ_LT(th->th_seq, tp->irs)) {
12932 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
12933 ctf_do_dropwithreset(m, tp, th, tlen);
12934 return (1);
12935 }
12936 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
12937 return (ret_val);
12938 }
12939 /*
12940 * If last ACK falls within this segment's sequence numbers, record
12941 * its timestamp. NOTE: 1) That the test incorporates suggestions
12942 * from the latest proposal of the tcplw@cray.com list (Braden
12943 * 1993/04/26). 2) That updating only on newer timestamps interferes
12944 * with our earlier PAWS tests, so this check should be solely
12945 * predicated on the sequence space of this segment. 3) That we
12946 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
12947 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
12948  * SEG.Len. This modified check allows us to overcome RFC1323's
12949 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
12950 * p.869. In such cases, we can still calculate the RTT correctly
12951 * when RCV.NXT == Last.ACK.Sent.
12952 */
12953 if ((to->to_flags & TOF_TS) != 0 &&
12954 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
12955 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
12956 ((thflags & (TH_SYN | TH_FIN)) != 0))) {
12957 tp->ts_recent_age = tcp_ts_getticks();
12958 tp->ts_recent = to->to_tsval;
12959 }
12960 tp->snd_wnd = tiwin;
12961 rack_validate_fo_sendwin_up(tp, rack);
12962 /*
12963 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
12964 * is on (half-synchronized state), then queue data for later
12965 * processing; else drop segment and return.
12966 */
12967 if ((thflags & TH_ACK) == 0) {
12968 if (tp->t_flags & TF_FASTOPEN) {
12969 rack_cc_conn_init(tp);
12970 }
12971 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
12972 tiwin, thflags, nxt_pkt));
12973 }
12974 KMOD_TCPSTAT_INC(tcps_connects);
12975 if (tp->t_flags & TF_SONOTCONN) {
12976 tp->t_flags &= ~TF_SONOTCONN;
12977 soisconnected(so);
12978 }
12979 /* Do window scaling? */
12980 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
12981 (TF_RCVD_SCALE | TF_REQ_SCALE)) {
12982 tp->rcv_scale = tp->request_r_scale;
12983 }
12984 /*
12985 * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* ->
12986 * FIN-WAIT-1
12987 */
12988 tp->t_starttime = ticks;
12989 if ((tp->t_flags & TF_FASTOPEN) && tp->t_tfo_pending) {
12990 tcp_fastopen_decrement_counter(tp->t_tfo_pending);
12991 tp->t_tfo_pending = NULL;
12992 }
12993 if (tp->t_flags & TF_NEEDFIN) {
12994 tcp_state_change(tp, TCPS_FIN_WAIT_1);
12995 tp->t_flags &= ~TF_NEEDFIN;
12996 } else {
12997 tcp_state_change(tp, TCPS_ESTABLISHED);
12998 TCP_PROBE5(accept__established, NULL, tp,
12999 mtod(m, const char *), tp, th);
13000 /*
13001 * TFO connections call cc_conn_init() during SYN
13002 * processing. Calling it again here for such connections
13003 * is not harmless as it would undo the snd_cwnd reduction
13004 * that occurs when a TFO SYN|ACK is retransmitted.
13005 */
13006 if (!(tp->t_flags & TF_FASTOPEN))
13007 rack_cc_conn_init(tp);
13008 }
13009 /*
13010 * Account for the ACK of our SYN prior to
13011 * regular ACK processing below, except for
13012 * simultaneous SYN, which is handled later.
13013 */
13014 if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN))
13015 tp->snd_una++;
13016 /*
13017 * If segment contains data or ACK, will call tcp_reass() later; if
13018 * not, do so now to pass queued data to user.
13019 */
13020 if (tlen == 0 && (thflags & TH_FIN) == 0) {
13021 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
13022 (struct mbuf *)0);
13023 if (tp->t_flags & TF_WAKESOR) {
13024 tp->t_flags &= ~TF_WAKESOR;
13025 /* NB: sorwakeup_locked() does an implicit unlock. */
13026 sorwakeup_locked(so);
13027 }
13028 }
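/* Force the next segment from this peer to be accepted as a window update. */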
13029 tp->snd_wl1 = th->th_seq - 1;
13030 /* For syn-recv we need to possibly update the rtt */
13031 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) {
13032 uint32_t t, mcts;
13033
13034 mcts = tcp_ts_getticks();
13035 t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC;
13036 if (!tp->t_rttlow || tp->t_rttlow > t)
13037 tp->t_rttlow = t;
13038 rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 5);
13039 tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2);
13040 tcp_rack_xmit_timer_commit(rack, tp);
13041 }
13042 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) {
13043 return (ret_val);
13044 }
13045 if (tp->t_state == TCPS_FIN_WAIT_1) {
13046 /* We could have gone to FIN_WAIT_1 (or EST) above */
13047 /*
13048 * In FIN_WAIT_1 STATE in addition to the processing for the
13049 * ESTABLISHED state if our FIN is now acknowledged then
13050 * enter FIN_WAIT_2.
13051 */
13052 if (ourfinisacked) {
13053 /*
13054 * If we can't receive any more data, then closing
13055 * user can proceed. Starting the timer is contrary
13056 * to the specification, but if we don't get a FIN
13057 * we'll hang forever.
13058 *
13059 * XXXjl: we should release the tp also, and use a
13060 * compressed state.
13061 */
13062 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
13063 soisdisconnected(so);
13064 tcp_timer_activate(tp, TT_2MSL,
13065 (tcp_fast_finwait2_recycle ?
13066 tcp_finwait2_timeout :
13067 TP_MAXIDLE(tp)));
13068 }
13069 tcp_state_change(tp, TCPS_FIN_WAIT_2);
13070 }
13071 }
13072 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
13073 tiwin, thflags, nxt_pkt));
13074 }
13075
13076 /*
13077 * Return value of 1, the TCB is unlocked and most
13078 * likely gone, return value of 0, the TCP is still
13079 * locked.
13080 */
13081 static int
13082 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
13083 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
13084 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
13085 {
13086 int32_t ret_val = 0;
13087 int32_t orig_tlen = tlen;
13088 struct tcp_rack *rack;
13089
13090 /*
13091 * Header prediction: check for the two common cases of a
13092 * uni-directional data xfer. If the packet has no control flags,
13093 * is in-sequence, the window didn't change and we're not
13094 * retransmitting, it's a candidate. If the length is zero and the
13095 * ack moved forward, we're the sender side of the xfer. Just free
13096 * the data acked & wake any higher level process that was blocked
13097 * waiting for space. If the length is non-zero and the ack didn't
13098 * move, we're the receiver side. If we're getting packets in-order
13099  * (the reassembly queue is empty), add the data to the socket
13100 * buffer and note that we need a delayed ack. Make sure that the
13101 * hidden state-flags are also off. Since we check for
13102 * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN.
13103 */
13104 rack = (struct tcp_rack *)tp->t_fb_ptr;
13105 if (__predict_true(((to->to_flags & TOF_SACK) == 0)) &&
13106 __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_ACK)) == TH_ACK) &&
13107 __predict_true(SEGQ_EMPTY(tp)) &&
13108 __predict_true(th->th_seq == tp->rcv_nxt)) {
13109 if (tlen == 0) {
13110 if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen,
13111 tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) {
13112 return (0);
13113 }
13114 } else {
13115 if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen,
13116 tiwin, nxt_pkt, iptos)) {
13117 return (0);
13118 }
13119 }
13120 }
13121 ctf_calc_rwin(so, tp);
13122
13123 if ((thflags & TH_RST) ||
13124 (tp->t_fin_is_rst && (thflags & TH_FIN)))
13125 return (ctf_process_rst(m, th, so, tp));
13126
13127 /*
13128 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
13129 * synchronized state.
13130 */
13131 if (thflags & TH_SYN) {
13132 ctf_challenge_ack(m, th, tp, iptos, &ret_val);
13133 return (ret_val);
13134 }
13135 /*
13136 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
13137 * it's less than ts_recent, drop it.
13138 */
13139 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
13140 TSTMP_LT(to->to_tsval, tp->ts_recent)) {
13141 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
13142 return (ret_val);
13143 }
13144 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
13145 return (ret_val);
13146 }
13147 /*
13148 * If last ACK falls within this segment's sequence numbers, record
13149 * its timestamp. NOTE: 1) That the test incorporates suggestions
13150 * from the latest proposal of the tcplw@cray.com list (Braden
13151 * 1993/04/26). 2) That updating only on newer timestamps interferes
13152 * with our earlier PAWS tests, so this check should be solely
13153 * predicated on the sequence space of this segment. 3) That we
13154 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
13155 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
13156  * SEG.Len. This modified check allows us to overcome RFC1323's
13157 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
13158 * p.869. In such cases, we can still calculate the RTT correctly
13159 * when RCV.NXT == Last.ACK.Sent.
13160 */
13161 if ((to->to_flags & TOF_TS) != 0 &&
13162 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
13163 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
13164 ((thflags & (TH_SYN | TH_FIN)) != 0))) {
13165 tp->ts_recent_age = tcp_ts_getticks();
13166 tp->ts_recent = to->to_tsval;
13167 }
13168 /*
13169 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
13170 * is on (half-synchronized state), then queue data for later
13171 * processing; else drop segment and return.
13172 */
13173 if ((thflags & TH_ACK) == 0) {
13174 if (tp->t_flags & TF_NEEDSYN) {
13175 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
13176 tiwin, thflags, nxt_pkt));
13177
13178 } else if (tp->t_flags & TF_ACKNOW) {
13179 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
13180 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
13181 return (ret_val);
13182 } else {
13183 ctf_do_drop(m, NULL);
13184 return (0);
13185 }
13186 }
13187 /*
13188 * Ack processing.
13189 */
13190 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val, orig_tlen)) {
13191 return (ret_val);
13192 }
13193 if (sbavail(&so->so_snd)) {
13194 if (ctf_progress_timeout_check(tp, true)) {
13195 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
13196 ctf_do_dropwithreset_conn(m, tp, th, tlen);
13197 return (1);
13198 }
13199 }
13200 /* State changes only happen in rack_process_data() */
13201 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
13202 tiwin, thflags, nxt_pkt));
13203 }
13204
13205 /*
13206 * Return value of 1, the TCB is unlocked and most
13207 * likely gone, return value of 0, the TCP is still
13208 * locked.
13209 */
13210 static int
13211 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
13212 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
13213 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
13214 {
13215 int32_t ret_val = 0;
13216 int32_t orig_tlen = tlen;
13217
13218 ctf_calc_rwin(so, tp);
13219 if ((thflags & TH_RST) ||
13220 (tp->t_fin_is_rst && (thflags & TH_FIN)))
13221 return (ctf_process_rst(m, th, so, tp));
13222 /*
13223 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
13224 * synchronized state.
13225 */
13226 if (thflags & TH_SYN) {
13227 ctf_challenge_ack(m, th, tp, iptos, &ret_val);
13228 return (ret_val);
13229 }
13230 /*
13231 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
13232 * it's less than ts_recent, drop it.
13233 */
13234 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
13235 TSTMP_LT(to->to_tsval, tp->ts_recent)) {
13236 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
13237 return (ret_val);
13238 }
13239 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
13240 return (ret_val);
13241 }
13242 /*
13243 * If last ACK falls within this segment's sequence numbers, record
13244 * its timestamp. NOTE: 1) That the test incorporates suggestions
13245 * from the latest proposal of the tcplw@cray.com list (Braden
13246 * 1993/04/26). 2) That updating only on newer timestamps interferes
13247 * with our earlier PAWS tests, so this check should be solely
13248 * predicated on the sequence space of this segment. 3) That we
13249 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
13250 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
13251  * SEG.Len. This modified check allows us to overcome RFC1323's
13252 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
13253 * p.869. In such cases, we can still calculate the RTT correctly
13254 * when RCV.NXT == Last.ACK.Sent.
13255 */
13256 if ((to->to_flags & TOF_TS) != 0 &&
13257 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
13258 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
13259 ((thflags & (TH_SYN | TH_FIN)) != 0))) {
13260 tp->ts_recent_age = tcp_ts_getticks();
13261 tp->ts_recent = to->to_tsval;
13262 }
13263 /*
13264 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
13265 * is on (half-synchronized state), then queue data for later
13266 * processing; else drop segment and return.
13267 */
13268 if ((thflags & TH_ACK) == 0) {
13269 if (tp->t_flags & TF_NEEDSYN) {
13270 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
13271 tiwin, thflags, nxt_pkt));
13272
13273 } else if (tp->t_flags & TF_ACKNOW) {
13274 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
13275 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
13276 return (ret_val);
13277 } else {
13278 ctf_do_drop(m, NULL);
13279 return (0);
13280 }
13281 }
13282 /*
13283 * Ack processing.
13284 */
13285 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val, orig_tlen)) {
13286 return (ret_val);
13287 }
13288 if (sbavail(&so->so_snd)) {
13289 if (ctf_progress_timeout_check(tp, true)) {
13290 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
13291 tp, tick, PROGRESS_DROP, __LINE__);
13292 ctf_do_dropwithreset_conn(m, tp, th, tlen);
13293 return (1);
13294 }
13295 }
13296 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
13297 tiwin, thflags, nxt_pkt));
13298 }
13299
13300 static int
13301 rack_check_data_after_close(struct mbuf *m,
13302 struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so)
13303 {
13304 struct tcp_rack *rack;
13305
13306 rack = (struct tcp_rack *)tp->t_fb_ptr;
13307 if (rack->rc_allow_data_af_clo == 0) {
13308 close_now:
13309 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE);
13310 /* tcp_close will kill the inp pre-log the Reset */
13311 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
13312 tp = tcp_close(tp);
13313 KMOD_TCPSTAT_INC(tcps_rcvafterclose);
13314 ctf_do_dropwithreset(m, tp, th, *tlen);
13315 return (1);
13316 }
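/* With nothing left to send there is no point absorbing data after close; reset instead. */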
13317 if (sbavail(&so->so_snd) == 0)
13318 goto close_now;
13319 /* OK, we allow the data, but it is ignored and a follow-up reset is sent */
13320 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE);
13321 tp->rcv_nxt = th->th_seq + *tlen;
13322 tp->t_flags2 |= TF2_DROP_AF_DATA;
13323 rack->r_wanted_output = 1;
13324 *tlen = 0;
13325 return (0);
13326 }
13327
13328 /*
13329 * Return value of 1, the TCB is unlocked and most
13330 * likely gone, return value of 0, the TCP is still
13331 * locked.
13332 */
13333 static int
13334 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
13335 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
13336 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
13337 {
13338 int32_t ret_val = 0;
13339 int32_t orig_tlen = tlen;
13340 int32_t ourfinisacked = 0;
13341
13342 ctf_calc_rwin(so, tp);
13343
13344 if ((thflags & TH_RST) ||
13345 (tp->t_fin_is_rst && (thflags & TH_FIN)))
13346 return (ctf_process_rst(m, th, so, tp));
13347 /*
13348 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
13349 * synchronized state.
13350 */
13351 if (thflags & TH_SYN) {
13352 ctf_challenge_ack(m, th, tp, iptos, &ret_val);
13353 return (ret_val);
13354 }
13355 /*
13356 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
13357 * it's less than ts_recent, drop it.
13358 */
13359 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
13360 TSTMP_LT(to->to_tsval, tp->ts_recent)) {
13361 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
13362 return (ret_val);
13363 }
13364 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
13365 return (ret_val);
13366 }
13367 /*
13368 * If new data are received on a connection after the user processes
13369 * are gone, then RST the other end.
13370 */
13371 if ((tp->t_flags & TF_CLOSED) && tlen &&
13372 rack_check_data_after_close(m, tp, &tlen, th, so))
13373 return (1);
13374 /*
13375 * If last ACK falls within this segment's sequence numbers, record
13376 * its timestamp. NOTE: 1) That the test incorporates suggestions
13377 * from the latest proposal of the tcplw@cray.com list (Braden
13378 * 1993/04/26). 2) That updating only on newer timestamps interferes
13379 * with our earlier PAWS tests, so this check should be solely
13380 * predicated on the sequence space of this segment. 3) That we
13381 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
13382 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
13383  * SEG.Len. This modified check allows us to overcome RFC1323's
13384 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
13385 * p.869. In such cases, we can still calculate the RTT correctly
13386 * when RCV.NXT == Last.ACK.Sent.
13387 */
13388 if ((to->to_flags & TOF_TS) != 0 &&
13389 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
13390 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
13391 ((thflags & (TH_SYN | TH_FIN)) != 0))) {
13392 tp->ts_recent_age = tcp_ts_getticks();
13393 tp->ts_recent = to->to_tsval;
13394 }
13395 /*
13396 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
13397 * is on (half-synchronized state), then queue data for later
13398 * processing; else drop segment and return.
13399 */
13400 if ((thflags & TH_ACK) == 0) {
13401 if (tp->t_flags & TF_NEEDSYN) {
13402 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
13403 tiwin, thflags, nxt_pkt));
13404 } else if (tp->t_flags & TF_ACKNOW) {
13405 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
13406 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
13407 return (ret_val);
13408 } else {
13409 ctf_do_drop(m, NULL);
13410 return (0);
13411 }
13412 }
13413 /*
13414 * Ack processing.
13415 */
13416 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) {
13417 return (ret_val);
13418 }
13419 if (ourfinisacked) {
13420 /*
13421 * If we can't receive any more data, then closing user can
13422 * proceed. Starting the timer is contrary to the
13423 * specification, but if we don't get a FIN we'll hang
13424 * forever.
13425 *
13426 * XXXjl: we should release the tp also, and use a
13427 * compressed state.
13428 */
13429 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
13430 soisdisconnected(so);
13431 tcp_timer_activate(tp, TT_2MSL,
13432 (tcp_fast_finwait2_recycle ?
13433 tcp_finwait2_timeout :
13434 TP_MAXIDLE(tp)));
13435 }
13436 tcp_state_change(tp, TCPS_FIN_WAIT_2);
13437 }
13438 if (sbavail(&so->so_snd)) {
13439 if (ctf_progress_timeout_check(tp, true)) {
13440 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
13441 tp, tick, PROGRESS_DROP, __LINE__);
13442 ctf_do_dropwithreset_conn(m, tp, th, tlen);
13443 return (1);
13444 }
13445 }
13446 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
13447 tiwin, thflags, nxt_pkt));
13448 }
13449
13450 /*
13451 * Return value of 1, the TCB is unlocked and most
13452 * likely gone, return value of 0, the TCP is still
13453 * locked.
13454 */
13455 static int
13456 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
13457 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
13458 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
13459 {
13460 int32_t ret_val = 0;
13461 int32_t orig_tlen = tlen;
13462 int32_t ourfinisacked = 0;
13463
13464 ctf_calc_rwin(so, tp);
13465
13466 if ((thflags & TH_RST) ||
13467 (tp->t_fin_is_rst && (thflags & TH_FIN)))
13468 return (ctf_process_rst(m, th, so, tp));
13469 /*
13470 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
13471 * synchronized state.
13472 */
13473 if (thflags & TH_SYN) {
13474 ctf_challenge_ack(m, th, tp, iptos, &ret_val);
13475 return (ret_val);
13476 }
13477 /*
13478 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
13479 * it's less than ts_recent, drop it.
13480 */
13481 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
13482 TSTMP_LT(to->to_tsval, tp->ts_recent)) {
13483 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
13484 return (ret_val);
13485 }
13486 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
13487 return (ret_val);
13488 }
13489 /*
13490 * If last ACK falls within this segment's sequence numbers, record
13491 * its timestamp. NOTE: 1) That the test incorporates suggestions
13492 * from the latest proposal of the tcplw@cray.com list (Braden
13493 * 1993/04/26). 2) That updating only on newer timestamps interferes
13494 * with our earlier PAWS tests, so this check should be solely
13495 * predicated on the sequence space of this segment. 3) That we
13496 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
13497 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
13498  * SEG.Len. This modified check allows us to overcome RFC1323's
13499 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
13500 * p.869. In such cases, we can still calculate the RTT correctly
13501 * when RCV.NXT == Last.ACK.Sent.
13502 */
13503 if ((to->to_flags & TOF_TS) != 0 &&
13504 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
13505 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
13506 ((thflags & (TH_SYN | TH_FIN)) != 0))) {
13507 tp->ts_recent_age = tcp_ts_getticks();
13508 tp->ts_recent = to->to_tsval;
13509 }
13510 /*
13511 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
13512 * is on (half-synchronized state), then queue data for later
13513 * processing; else drop segment and return.
13514 */
13515 if ((thflags & TH_ACK) == 0) {
13516 if (tp->t_flags & TF_NEEDSYN) {
13517 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
13518 tiwin, thflags, nxt_pkt));
13519 } else if (tp->t_flags & TF_ACKNOW) {
13520 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
13521 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
13522 return (ret_val);
13523 } else {
13524 ctf_do_drop(m, NULL);
13525 return (0);
13526 }
13527 }
13528 /*
13529 * Ack processing.
13530 */
13531 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) {
13532 return (ret_val);
13533 }
13534 if (ourfinisacked) {
13535 tcp_twstart(tp);
13536 m_freem(m);
13537 return (1);
13538 }
13539 if (sbavail(&so->so_snd)) {
13540 if (ctf_progress_timeout_check(tp, true)) {
13541 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
13542 tp, tick, PROGRESS_DROP, __LINE__);
13543 ctf_do_dropwithreset_conn(m, tp, th, tlen);
13544 return (1);
13545 }
13546 }
13547 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
13548 tiwin, thflags, nxt_pkt));
13549 }
13550
13551 /*
13552 * Return value of 1, the TCB is unlocked and most
13553 * likely gone, return value of 0, the TCP is still
13554 * locked.
13555 */
13556 static int
13557 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
13558 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
13559 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
13560 {
13561 int32_t ret_val = 0;
13562 int32_t orig_tlen;
13563 int32_t ourfinisacked = 0;
13564
13565 ctf_calc_rwin(so, tp);
13566
13567 if ((thflags & TH_RST) ||
13568 (tp->t_fin_is_rst && (thflags & TH_FIN)))
13569 return (ctf_process_rst(m, th, so, tp));
13570 /*
13571 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
13572 * synchronized state.
13573 */
13574 if (thflags & TH_SYN) {
13575 ctf_challenge_ack(m, th, tp, iptos, &ret_val);
13576 return (ret_val);
13577 }
13578 /*
13579 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
13580 * it's less than ts_recent, drop it.
13581 */
13582 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
13583 TSTMP_LT(to->to_tsval, tp->ts_recent)) {
13584 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
13585 return (ret_val);
13586 }
13587 orig_tlen = tlen;
13588 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
13589 return (ret_val);
13590 }
13591 /*
13592 * If last ACK falls within this segment's sequence numbers, record
13593 * its timestamp. NOTE: 1) That the test incorporates suggestions
13594 * from the latest proposal of the tcplw@cray.com list (Braden
13595 * 1993/04/26). 2) That updating only on newer timestamps interferes
13596 * with our earlier PAWS tests, so this check should be solely
13597 * predicated on the sequence space of this segment. 3) That we
13598 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
13599 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
13600  * SEG.Len. This modified check allows us to overcome RFC1323's
13601 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
13602 * p.869. In such cases, we can still calculate the RTT correctly
13603 * when RCV.NXT == Last.ACK.Sent.
13604 */
13605 if ((to->to_flags & TOF_TS) != 0 &&
13606 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
13607 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
13608 ((thflags & (TH_SYN | TH_FIN)) != 0))) {
13609 tp->ts_recent_age = tcp_ts_getticks();
13610 tp->ts_recent = to->to_tsval;
13611 }
13612 /*
13613 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
13614 * is on (half-synchronized state), then queue data for later
13615 * processing; else drop segment and return.
13616 */
13617 if ((thflags & TH_ACK) == 0) {
13618 if (tp->t_flags & TF_NEEDSYN) {
13619 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
13620 tiwin, thflags, nxt_pkt));
13621 } else if (tp->t_flags & TF_ACKNOW) {
13622 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
13623 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
13624 return (ret_val);
13625 } else {
13626 ctf_do_drop(m, NULL);
13627 return (0);
13628 }
13629 }
13630 /*
13631 * case TCPS_LAST_ACK: Ack processing.
13632 */
13633 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) {
13634 return (ret_val);
13635 }
13636 if (ourfinisacked) {
13637 tp = tcp_close(tp);
13638 ctf_do_drop(m, tp);
13639 return (1);
13640 }
13641 if (sbavail(&so->so_snd)) {
13642 if (ctf_progress_timeout_check(tp, true)) {
13643 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
13644 tp, tick, PROGRESS_DROP, __LINE__);
13645 ctf_do_dropwithreset_conn(m, tp, th, tlen);
13646 return (1);
13647 }
13648 }
13649 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
13650 tiwin, thflags, nxt_pkt));
13651 }
13652
13653 /*
13654 * Return value of 1, the TCB is unlocked and most
13655 * likely gone, return value of 0, the TCP is still
13656 * locked.
13657 */
13658 static int
13659 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
13660 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
13661 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
13662 {
13663 int32_t ret_val = 0;
13664 int32_t orig_tlen = tlen;
13665 int32_t ourfinisacked = 0;
13666
13667 ctf_calc_rwin(so, tp);
13668
13669 /* Reset receive buffer auto scaling when not in bulk receive mode. */
13670 if ((thflags & TH_RST) ||
13671 (tp->t_fin_is_rst && (thflags & TH_FIN)))
13672 return (ctf_process_rst(m, th, so, tp));
13673 /*
13674 * RFC5961 Section 4.2 Send challenge ACK for any SYN in
13675 * synchronized state.
13676 */
13677 if (thflags & TH_SYN) {
13678 ctf_challenge_ack(m, th, tp, iptos, &ret_val);
13679 return (ret_val);
13680 }
13681 /*
13682 * RFC 1323 PAWS: If we have a timestamp reply on this segment and
13683 * it's less than ts_recent, drop it.
13684 */
13685 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent &&
13686 TSTMP_LT(to->to_tsval, tp->ts_recent)) {
13687 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val))
13688 return (ret_val);
13689 }
13690 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) {
13691 return (ret_val);
13692 }
13693 /*
13694 * If new data are received on a connection after the user processes
13695 * are gone, then RST the other end.
13696 */
13697 if ((tp->t_flags & TF_CLOSED) && tlen &&
13698 rack_check_data_after_close(m, tp, &tlen, th, so))
13699 return (1);
13700 /*
13701 * If last ACK falls within this segment's sequence numbers, record
13702 * its timestamp. NOTE: 1) That the test incorporates suggestions
13703 * from the latest proposal of the tcplw@cray.com list (Braden
13704 * 1993/04/26). 2) That updating only on newer timestamps interferes
13705 * with our earlier PAWS tests, so this check should be solely
13706 * predicated on the sequence space of this segment. 3) That we
13707 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ
13708 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ +
13709  * SEG.Len. This modified check allows us to overcome RFC1323's
13710 * limitations as described in Stevens TCP/IP Illustrated Vol. 2
13711 * p.869. In such cases, we can still calculate the RTT correctly
13712 * when RCV.NXT == Last.ACK.Sent.
13713 */
13714 if ((to->to_flags & TOF_TS) != 0 &&
13715 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
13716 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
13717 ((thflags & (TH_SYN | TH_FIN)) != 0))) {
13718 tp->ts_recent_age = tcp_ts_getticks();
13719 tp->ts_recent = to->to_tsval;
13720 }
13721 /*
13722 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
13723 * is on (half-synchronized state), then queue data for later
13724 * processing; else drop segment and return.
13725 */
13726 if ((thflags & TH_ACK) == 0) {
13727 if (tp->t_flags & TF_NEEDSYN) {
13728 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
13729 tiwin, thflags, nxt_pkt));
13730 } else if (tp->t_flags & TF_ACKNOW) {
13731 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val);
13732 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1;
13733 return (ret_val);
13734 } else {
13735 ctf_do_drop(m, NULL);
13736 return (0);
13737 }
13738 }
13739 /*
13740 * Ack processing.
13741 */
13742 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) {
13743 return (ret_val);
13744 }
13745 if (sbavail(&so->so_snd)) {
13746 if (ctf_progress_timeout_check(tp, true)) {
13747 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
13748 tp, tick, PROGRESS_DROP, __LINE__);
13749 ctf_do_dropwithreset_conn(m, tp, th, tlen);
13750 return (1);
13751 }
13752 }
13753 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
13754 tiwin, thflags, nxt_pkt));
13755 }
13756
13757 static void inline
13758 rack_clear_rate_sample(struct tcp_rack *rack)
13759 {
13760 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY;
13761 rack->r_ctl.rack_rs.rs_rtt_cnt = 0;
13762 rack->r_ctl.rack_rs.rs_rtt_tot = 0;
13763 }
13764
13765 static void
13766 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override)
13767 {
13768 uint64_t bw_est, rate_wanted;
13769 int chged = 0;
13770 uint32_t user_max, orig_min, orig_max;
13771
13772 #ifdef TCP_REQUEST_TRK
13773 if (rack->rc_hybrid_mode &&
13774 (rack->r_ctl.rc_pace_max_segs != 0) &&
13775 (rack_hybrid_allow_set_maxseg == 1) &&
13776 (rack->r_ctl.rc_last_sft != NULL)) {
13777 rack->r_ctl.rc_last_sft->hybrid_flags &= ~TCP_HYBRID_PACING_SETMSS;
13778 return;
13779 }
13780 #endif
13781 orig_min = rack->r_ctl.rc_pace_min_segs;
13782 orig_max = rack->r_ctl.rc_pace_max_segs;
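/* The user's configured cap on the pacing burst, converted from segments to bytes. */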
13783 user_max = ctf_fixed_maxseg(tp) * rack->rc_user_set_max_segs;
13784 if (ctf_fixed_maxseg(tp) != rack->r_ctl.rc_pace_min_segs)
13785 chged = 1;
13786 rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp);
13787 if (rack->use_fixed_rate || rack->rc_force_max_seg) {
13788 if (user_max != rack->r_ctl.rc_pace_max_segs)
13789 chged = 1;
13790 }
13791 if (rack->rc_force_max_seg) {
13792 rack->r_ctl.rc_pace_max_segs = user_max;
13793 } else if (rack->use_fixed_rate) {
13794 bw_est = rack_get_bw(rack);
13795 if ((rack->r_ctl.crte == NULL) ||
13796 (bw_est != rack->r_ctl.crte->rate)) {
13797 rack->r_ctl.rc_pace_max_segs = user_max;
13798 } else {
13799 /* We are pacing right at the hardware rate */
13800 uint32_t segsiz, pace_one;
13801
13802 if (rack_pace_one_seg ||
13803 (rack->r_ctl.rc_user_set_min_segs == 1))
13804 pace_one = 1;
13805 else
13806 pace_one = 0;
13807 segsiz = min(ctf_fixed_maxseg(tp),
13808 rack->r_ctl.rc_pace_min_segs);
13809 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size_w_divisor(
13810 tp, bw_est, segsiz, pace_one,
13811 rack->r_ctl.crte, NULL, rack->r_ctl.pace_len_divisor);
13812 }
13813 } else if (rack->rc_always_pace) {
13814 if (rack->r_ctl.gp_bw ||
13815 rack->r_ctl.init_rate) {
13816 /* We have a rate of some sort set */
13817 uint32_t orig;
13818
13819 bw_est = rack_get_bw(rack);
13820 orig = rack->r_ctl.rc_pace_max_segs;
13821 if (fill_override)
13822 rate_wanted = *fill_override;
13823 else
13824 rate_wanted = rack_get_gp_est(rack);
13825 if (rate_wanted) {
13826 /* We have something */
13827 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack,
13828 rate_wanted,
13829 ctf_fixed_maxseg(rack->rc_tp));
13830 } else
13831 rack->r_ctl.rc_pace_max_segs = rack->r_ctl.rc_pace_min_segs;
13832 if (orig != rack->r_ctl.rc_pace_max_segs)
13833 chged = 1;
13834 } else if ((rack->r_ctl.gp_bw == 0) &&
13835 (rack->r_ctl.rc_pace_max_segs == 0)) {
13836 /*
13837 * If we have nothing limit us to bursting
13838 * out IW sized pieces.
13839 */
13840 chged = 1;
13841 rack->r_ctl.rc_pace_max_segs = rc_init_window(rack);
13842 }
13843 }
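/* Never let the pacing burst exceed the most we can place in a single IP datagram. */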
13844 if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) {
13845 chged = 1;
13846 rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES;
13847 }
13848 if (chged)
13849 rack_log_type_pacing_sizes(tp, rack, orig_min, orig_max, line, 2);
13850 }
13851
13852
13853 static void
13854 rack_init_fsb_block(struct tcpcb *tp, struct tcp_rack *rack, int32_t flags)
13855 {
13856 #ifdef INET6
13857 struct ip6_hdr *ip6 = NULL;
13858 #endif
13859 #ifdef INET
13860 struct ip *ip = NULL;
13861 #endif
13862 struct udphdr *udp = NULL;
13863
13864 /* OK, let's fill in the fast send block; it can only be used when there are no IP options! */
13865 #ifdef INET6
13866 if (rack->r_is_v6) {
13867 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
13868 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
13869 if (tp->t_port) {
13870 rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr);
13871 udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr));
13872 udp->uh_sport = htons(V_tcp_udp_tunneling_port);
13873 udp->uh_dport = tp->t_port;
13874 rack->r_ctl.fsb.udp = udp;
13875 rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1);
13876 } else
13877 {
13878 rack->r_ctl.fsb.th = (struct tcphdr *)(ip6 + 1);
13879 rack->r_ctl.fsb.udp = NULL;
13880 }
13881 tcpip_fillheaders(rack->rc_inp,
13882 tp->t_port,
13883 ip6, rack->r_ctl.fsb.th);
13884 rack->r_ctl.fsb.hoplimit = in6_selecthlim(rack->rc_inp, NULL);
13885 } else
13886 #endif /* INET6 */
13887 #ifdef INET
13888 {
13889 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr);
13890 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
13891 if (tp->t_port) {
13892 rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr);
13893 udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip));
13894 udp->uh_sport = htons(V_tcp_udp_tunneling_port);
13895 udp->uh_dport = tp->t_port;
13896 rack->r_ctl.fsb.udp = udp;
13897 rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1);
13898 } else
13899 {
13900 rack->r_ctl.fsb.udp = NULL;
13901 rack->r_ctl.fsb.th = (struct tcphdr *)(ip + 1);
13902 }
13903 tcpip_fillheaders(rack->rc_inp,
13904 tp->t_port,
13905 ip, rack->r_ctl.fsb.th);
13906 rack->r_ctl.fsb.hoplimit = tptoinpcb(tp)->inp_ip_ttl;
13907 }
13908 #endif
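/* Snapshot the receive window we can advertise, clamped to the maximum scaled window. */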
13909 rack->r_ctl.fsb.recwin = lmin(lmax(sbspace(&tptosocket(tp)->so_rcv), 0),
13910 (long)TCP_MAXWIN << tp->rcv_scale);
13911 rack->r_fsb_inited = 1;
13912 }
13913
13914 static int
13915 rack_init_fsb(struct tcpcb *tp, struct tcp_rack *rack)
13916 {
13917 /*
13918  * Allocate the larger of the two header spaces: V6 if available,
13919  * else just V4, and include a udphdr (overbook).
13920 */
13921 #ifdef INET6
13922 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + sizeof(struct udphdr);
13923 #else
13924 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr) + sizeof(struct udphdr);
13925 #endif
13926 rack->r_ctl.fsb.tcp_ip_hdr = malloc(rack->r_ctl.fsb.tcp_ip_hdr_len,
13927 M_TCPFSB, M_NOWAIT|M_ZERO);
13928 if (rack->r_ctl.fsb.tcp_ip_hdr == NULL) {
13929 return (ENOMEM);
13930 }
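/* The header template itself is filled in later by rack_init_fsb_block(), once the address family and ports are known. */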
13931 rack->r_fsb_inited = 0;
13932 return (0);
13933 }
13934
13935 static void
13936 rack_log_hystart_event(struct tcp_rack *rack, uint32_t high_seq, uint8_t mod)
13937 {
13938 /*
13939 * Types of logs (mod value)
13940 * 20 - Initial round setup
13941 * 21 - Rack declares a new round.
13942 */
13943 struct tcpcb *tp;
13944
13945 tp = rack->rc_tp;
13946 if (tcp_bblogging_on(tp)) {
13947 union tcp_log_stackspecific log;
13948 struct timeval tv;
13949
13950 memset(&log, 0, sizeof(log));
13951 log.u_bbr.flex1 = rack->r_ctl.current_round;
13952 log.u_bbr.flex2 = rack->r_ctl.roundends;
13953 log.u_bbr.flex3 = high_seq;
13954 log.u_bbr.flex4 = tp->snd_max;
13955 log.u_bbr.flex8 = mod;
13956 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
13957 log.u_bbr.cur_del_rate = rack->rc_tp->t_sndbytes;
13958 log.u_bbr.delRate = rack->rc_tp->t_snd_rxt_bytes;
13959 TCP_LOG_EVENTP(tp, NULL,
13960 &tptosocket(tp)->so_rcv,
13961 &tptosocket(tp)->so_snd,
13962 TCP_HYSTART, 0,
13963 0, &log, false, &tv);
13964 }
13965 }
13966
13967 static void
13968 rack_deferred_init(struct tcpcb *tp, struct tcp_rack *rack)
13969 {
13970 rack->rack_deferred_inited = 1;
13971 rack->r_ctl.roundends = tp->snd_max;
13972 rack->r_ctl.rc_high_rwnd = tp->snd_wnd;
13973 rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
13974 }
13975
13976 static void
13977 rack_init_retransmit_value(struct tcp_rack *rack, int ctl)
13978 {
13979 /* Retransmit bit controls.
13980 *
13981  * The setting of these values controls one of
13982  * three behaviors and dictates how rack does
13983  * retransmissions. Note this applies in *any*
13984  * mode, i.e. pacing on or off, DGP, fixed rate
13985  * pacing, or just bursting rack.
13986 *
13987 * 1 - Use full sized retransmits i.e. limit
13988 * the size to whatever the pace_max_segments
13989 * size is.
13990 *
13991 * 2 - Use pacer min granularity as a guide to
13992 * the size combined with the current calculated
13993 * goodput b/w measurement. So for example if
13994 * the goodput is measured at 20Mbps we would
13995 * calculate 8125 (pacer minimum 250usec in
13996 * that b/w) and then round it up to the next
13997 * MSS i.e. for 1448 mss 6 MSS or 8688 bytes.
13998 *
13999 * 0 - The rack default 1 MSS (anything not 0/1/2
14000  * falls here too if we are setting via rack_init()).
14001 *
14002 */
14003 if (ctl == 1) {
14004 rack->full_size_rxt = 1;
14005 rack->shape_rxt_to_pacing_min = 0;
14006 } else if (ctl == 2) {
14007 rack->full_size_rxt = 0;
14008 rack->shape_rxt_to_pacing_min = 1;
14009 } else {
14010 rack->full_size_rxt = 0;
14011 rack->shape_rxt_to_pacing_min = 0;
14012 }
14013 }
14014
14015 static void
14016 rack_log_chg_info(struct tcpcb *tp, struct tcp_rack *rack, uint8_t mod,
14017 uint32_t flex1,
14018 uint32_t flex2,
14019 uint32_t flex3)
14020 {
14021 if (tcp_bblogging_on(rack->rc_tp)) {
14022 union tcp_log_stackspecific log;
14023 struct timeval tv;
14024
14025 memset(&log, 0, sizeof(log));
14026 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
14027 log.u_bbr.flex8 = mod;
14028 log.u_bbr.flex1 = flex1;
14029 log.u_bbr.flex2 = flex2;
14030 log.u_bbr.flex3 = flex3;
14031 tcp_log_event(tp, NULL, NULL, NULL, TCP_CHG_QUERY, 0,
14032 0, &log, false, NULL, __func__, __LINE__, &tv);
14033 }
14034 }
14035
14036 static int
14037 rack_chg_query(struct tcpcb *tp, struct tcp_query_resp *reqr)
14038 {
14039 struct tcp_rack *rack;
14040 struct rack_sendmap *rsm;
14041 int i;
14042
14043
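/*
 * Usage sketch (this mirrors what rack_init_outstanding() below does
 * when it walks an old stack's sendmap one entry at a time):
 *
 *	struct tcp_query_resp qr;
 *	tcp_seq at = tp->snd_una;
 *
 *	while (at != tp->snd_max) {
 *		memset(&qr, 0, sizeof(qr));
 *		qr.req = TCP_QUERY_SENDMAP;
 *		qr.req_param = at;
 *		if ((*tp->t_fb->tfb_chg_query)(tp, &qr) == 0)
 *			break;
 *		at = qr.sendmap_end;
 *	}
 */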
14044 rack = (struct tcp_rack *)tp->t_fb_ptr;
14045 switch (reqr->req) {
14046 case TCP_QUERY_SENDMAP:
14047 if ((reqr->req_param == tp->snd_max) ||
14048 (tp->snd_max == tp->snd_una)){
14049 /* Unlikely */
14050 return (0);
14051 }
14052 rsm = tqhash_find(rack->r_ctl.tqh, reqr->req_param);
14053 if (rsm == NULL) {
14054 /* Can't find that seq -- unlikely */
14055 return (0);
14056 }
14057 reqr->sendmap_start = rsm->r_start;
14058 reqr->sendmap_end = rsm->r_end;
14059 reqr->sendmap_send_cnt = rsm->r_rtr_cnt;
14060 reqr->sendmap_fas = rsm->r_fas;
14061 if (reqr->sendmap_send_cnt > SNDMAP_NRTX)
14062 reqr->sendmap_send_cnt = SNDMAP_NRTX;
14063 for(i=0; i<reqr->sendmap_send_cnt; i++)
14064 reqr->sendmap_time[i] = rsm->r_tim_lastsent[i];
14065 reqr->sendmap_ack_arrival = rsm->r_ack_arrival;
14066 reqr->sendmap_flags = rsm->r_flags & SNDMAP_MASK;
14067 reqr->sendmap_r_rtr_bytes = rsm->r_rtr_bytes;
14068 reqr->sendmap_dupacks = rsm->r_dupack;
14069 rack_log_chg_info(tp, rack, 1,
14070 rsm->r_start,
14071 rsm->r_end,
14072 rsm->r_flags);
14073 return(1);
14074 break;
14075 case TCP_QUERY_TIMERS_UP:
14076 if (rack->r_ctl.rc_hpts_flags == 0) {
14077 /* no timers up */
14078 return (0);
14079 }
14080 reqr->timer_hpts_flags = rack->r_ctl.rc_hpts_flags;
14081 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
14082 reqr->timer_pacing_to = rack->r_ctl.rc_last_output_to;
14083 }
14084 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
14085 reqr->timer_timer_exp = rack->r_ctl.rc_timer_exp;
14086 }
14087 rack_log_chg_info(tp, rack, 2,
14088 rack->r_ctl.rc_hpts_flags,
14089 rack->r_ctl.rc_last_output_to,
14090 rack->r_ctl.rc_timer_exp);
14091 return (1);
14092 break;
14093 case TCP_QUERY_RACK_TIMES:
14094 /* Reordering items */
14095 reqr->rack_num_dsacks = rack->r_ctl.num_dsack;
14096 reqr->rack_reorder_ts = rack->r_ctl.rc_reorder_ts;
14097 /* Timestamps and timers */
14098 reqr->rack_rxt_last_time = rack->r_ctl.rc_tlp_rxt_last_time;
14099 reqr->rack_min_rtt = rack->r_ctl.rc_rack_min_rtt;
14100 reqr->rack_rtt = rack->rc_rack_rtt;
14101 reqr->rack_tmit_time = rack->r_ctl.rc_rack_tmit_time;
14102 reqr->rack_srtt_measured = rack->rc_srtt_measure_made;
14103 /* PRR data */
14104 reqr->rack_sacked = rack->r_ctl.rc_sacked;
14105 reqr->rack_holes_rxt = rack->r_ctl.rc_holes_rxt;
14106 reqr->rack_prr_delivered = rack->r_ctl.rc_prr_delivered;
14107 reqr->rack_prr_recovery_fs = rack->r_ctl.rc_prr_recovery_fs;
14108 reqr->rack_prr_sndcnt = rack->r_ctl.rc_prr_sndcnt;
14109 reqr->rack_prr_out = rack->r_ctl.rc_prr_out;
14110 /* TLP and persists info */
14111 reqr->rack_tlp_out = rack->rc_tlp_in_progress;
14112 reqr->rack_tlp_cnt_out = rack->r_ctl.rc_tlp_cnt_out;
14113 if (rack->rc_in_persist) {
14114 reqr->rack_time_went_idle = rack->r_ctl.rc_went_idle_time;
14115 reqr->rack_in_persist = 1;
14116 } else {
14117 reqr->rack_time_went_idle = 0;
14118 reqr->rack_in_persist = 0;
14119 }
14120 if (rack->r_wanted_output)
14121 reqr->rack_wanted_output = 1;
14122 else
14123 reqr->rack_wanted_output = 0;
14124 return (1);
14125 break;
14126 default:
14127 return (-EINVAL);
14128 }
14129 }
14130
14131 static void
14132 rack_switch_failed(struct tcpcb *tp)
14133 {
14134 /*
14135 * This method gets called if a stack switch was
14136 * attempted and it failed. We are left in place,
14137 * but our hpts timers were stopped and we
14138 * need to validate time units and t_flags2.
14139 */
14140 struct tcp_rack *rack;
14141 struct timeval tv;
14142 uint32_t cts;
14143 uint32_t toval;
14144 struct hpts_diag diag;
14145
14146 rack = (struct tcp_rack *)tp->t_fb_ptr;
14147 tcp_change_time_units(tp, TCP_TMR_GRANULARITY_USEC);
14148 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
14149 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ;
14150 else
14151 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ;
14152 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state))
14153 tp->t_flags2 |= TF2_MBUF_ACKCMP;
14154 if (tp->t_in_hpts > IHPTS_NONE) {
14155 /* Strange */
14156 return;
14157 }
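/*
 * Compute how long until whatever the old rack instance had scheduled
 * (pacer output or a protocol timer). If that deadline has already
 * passed we simply ask hpts for a single slot.
 */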
14158 cts = tcp_get_usecs(&tv);
14159 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
14160 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) {
14161 toval = rack->r_ctl.rc_last_output_to - cts;
14162 } else {
14163 /* one slot please */
14164 toval = HPTS_USECS_PER_SLOT;
14165 }
14166 } else if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
14167 if (TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) {
14168 toval = rack->r_ctl.rc_timer_exp - cts;
14169 } else {
14170 /* one slot please */
14171 toval = HPTS_USECS_PER_SLOT;
14172 }
14173 } else
14174 toval = HPTS_USECS_PER_SLOT;
14175 tcp_hpts_insert(tp, toval, &diag);
14176 rack_log_hpts_diag(rack, cts, &diag, &tv);
14177 }
14178
14179 static int
14180 rack_init_outstanding(struct tcpcb *tp, struct tcp_rack *rack, uint32_t us_cts, void *ptr)
14181 {
14182 struct rack_sendmap *rsm, *ersm;
14183 int insret __diagused;
14184 /*
14185 * When initing outstanding, we must be quite careful
14186 * to not refer to tp->t_fb_ptr. This has the old rack
14187 * pointer in it, not the "new" one (when we are doing
14188 * a stack switch).
14189 */
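/*
 * Two cases below: if the previous stack cannot be queried
 * (tfb_chg_query == NULL) we build one rsm covering everything from
 * snd_una to snd_max; otherwise we walk the old stack's sendmap with
 * TCP_QUERY_SENDMAP and rebuild each entry individually.
 */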
14190
14191
14192 if (tp->t_fb->tfb_chg_query == NULL) {
14193 /* Create a send map for the current outstanding data */
14194
14195 rsm = rack_alloc(rack);
14196 if (rsm == NULL) {
14197 uma_zfree(rack_pcb_zone, ptr);
14198 return (ENOMEM);
14199 }
14200 rsm->r_no_rtt_allowed = 1;
14201 rsm->r_tim_lastsent[0] = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
14202 rsm->r_rtr_cnt = 1;
14203 rsm->r_rtr_bytes = 0;
14204 if (tp->t_flags & TF_SENTFIN)
14205 rsm->r_flags |= RACK_HAS_FIN;
14206 rsm->r_end = tp->snd_max;
14207 if (tp->snd_una == tp->iss) {
14208 /* The data space is one beyond snd_una */
14209 rsm->r_flags |= RACK_HAS_SYN;
14210 rsm->r_start = tp->iss;
14211 rsm->r_end = rsm->r_start + (tp->snd_max - tp->snd_una);
14212 } else
14213 rsm->r_start = tp->snd_una;
14214 rsm->r_dupack = 0;
14215 if (rack->rc_inp->inp_socket->so_snd.sb_mb != NULL) {
14216 rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 0, &rsm->soff);
14217 if (rsm->m) {
14218 rsm->orig_m_len = rsm->m->m_len;
14219 rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
14220 } else {
14221 rsm->orig_m_len = 0;
14222 rsm->orig_t_space = 0;
14223 }
14224 } else {
14225 /*
14226 * This can happen if we have a stand-alone FIN or
14227 * SYN.
14228 */
14229 rsm->m = NULL;
14230 rsm->orig_m_len = 0;
14231 rsm->orig_t_space = 0;
14232 rsm->soff = 0;
14233 }
14234 #ifdef INVARIANTS
14235 if ((insret = tqhash_insert(rack->r_ctl.tqh, rsm)) != 0) {
14236 panic("Insert in tailq_hash fails ret:%d rack:%p rsm:%p",
14237 insret, rack, rsm);
14238 }
14239 #else
14240 (void)tqhash_insert(rack->r_ctl.tqh, rsm);
14241 #endif
14242 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
14243 rsm->r_in_tmap = 1;
14244 } else {
14245 /* We have a query mechanism, lets use it */
14246 struct tcp_query_resp qr;
14247 int i;
14248 tcp_seq at;
14249
14250 at = tp->snd_una;
14251 while (at != tp->snd_max) {
14252 memset(&qr, 0, sizeof(qr));
14253 qr.req = TCP_QUERY_SENDMAP;
14254 qr.req_param = at;
14255 if ((*tp->t_fb->tfb_chg_query)(tp, &qr) == 0)
14256 break;
14257 /* Move forward */
14258 at = qr.sendmap_end;
14259 /* Now lets build the entry for this one */
14260 rsm = rack_alloc(rack);
14261 if (rsm == NULL) {
14262 uma_zfree(rack_pcb_zone, ptr);
14263 return (ENOMEM);
14264 }
14265 memset(rsm, 0, sizeof(struct rack_sendmap));
14266 /* Now configure the rsm and insert it */
14267 rsm->r_dupack = qr.sendmap_dupacks;
14268 rsm->r_start = qr.sendmap_start;
14269 rsm->r_end = qr.sendmap_end;
14270 if (qr.sendmap_fas)
14271 rsm->r_fas = qr.sendmap_end;
14272 else
14273 rsm->r_fas = rsm->r_start - tp->snd_una;
14274 /*
14275 * We have carefully aligned the bits
14276 * so that all we have to do is copy over
14277 * the bits with the mask.
14278 */
14279 rsm->r_flags = qr.sendmap_flags & SNDMAP_MASK;
14280 rsm->r_rtr_bytes = qr.sendmap_r_rtr_bytes;
14281 rsm->r_rtr_cnt = qr.sendmap_send_cnt;
14282 rsm->r_ack_arrival = qr.sendmap_ack_arrival;
14283 for (i=0 ; i<rsm->r_rtr_cnt; i++)
14284 rsm->r_tim_lastsent[i] = qr.sendmap_time[i];
14285 rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd,
14286 (rsm->r_start - tp->snd_una), &rsm->soff);
14287 if (rsm->m) {
14288 rsm->orig_m_len = rsm->m->m_len;
14289 rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
14290 } else {
14291 rsm->orig_m_len = 0;
14292 rsm->orig_t_space = 0;
14293 }
14294 #ifdef INVARIANTS
14295 if ((insret = tqhash_insert(rack->r_ctl.tqh, rsm)) != 0) {
14296 panic("Insert in tailq_hash fails ret:%d rack:%p rsm:%p",
14297 insret, rack, rsm);
14298 }
14299 #else
14300 (void)tqhash_insert(rack->r_ctl.tqh, rsm);
14301 #endif
14302 if ((rsm->r_flags & RACK_ACKED) == 0) {
14303 TAILQ_FOREACH(ersm, &rack->r_ctl.rc_tmap, r_tnext) {
14304 if (ersm->r_tim_lastsent[(ersm->r_rtr_cnt-1)] >
14305 rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]) {
14306 /*
14307 * If the existing ersm was sent at
14308 * a later time than the new one, then
14309 * the new one should appear ahead of this
14310 * ersm.
14311 */
14312 rsm->r_in_tmap = 1;
14313 TAILQ_INSERT_BEFORE(ersm, rsm, r_tnext);
14314 break;
14315 }
14316 }
14317 if (rsm->r_in_tmap == 0) {
14318 /*
14319 * Not found so shove it on the tail.
14320 */
14321 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
14322 rsm->r_in_tmap = 1;
14323 }
14324 } else {
14325 if ((rack->r_ctl.rc_sacklast == NULL) ||
14326 (SEQ_GT(rsm->r_end, rack->r_ctl.rc_sacklast->r_end))) {
14327 rack->r_ctl.rc_sacklast = rsm;
14328 }
14329 }
14330 rack_log_chg_info(tp, rack, 3,
14331 rsm->r_start,
14332 rsm->r_end,
14333 rsm->r_flags);
14334 }
14335 }
14336 return (0);
14337 }
14338
14339
14340 static int32_t
14341 rack_init(struct tcpcb *tp, void **ptr)
14342 {
14343 struct inpcb *inp = tptoinpcb(tp);
14344 struct tcp_rack *rack = NULL;
14345 uint32_t iwin, snt, us_cts;
14346 size_t sz;
14347 int err, no_query;
14348
14349 tcp_hpts_init(tp);
14350
14351 /*
14352 * First, are we the initial stack or are we a switched-to stack?
14353 * If we are initing via tcp_newtcppcb the ptr passed
14354 * will be tp->t_fb_ptr. If it's a stack switch that
14355 * has a previous stack we can query, it will be a local
14356 * var that will in the end be set into t_fb_ptr.
14357 */
14358 if (ptr == &tp->t_fb_ptr)
14359 no_query = 1;
14360 else
14361 no_query = 0;
14362 *ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
14363 if (*ptr == NULL) {
14364 /*
14365 * We need to allocate memory but can't. The INP and INP_INFO
14366 * locks are held and they are recursive (this happens during setup), so a
14367 * scheme to drop the locks fails :(
14368 *
14369 */
14370 return(ENOMEM);
14371 }
14372 memset(*ptr, 0, sizeof(struct tcp_rack));
14373 rack = (struct tcp_rack *)*ptr;
14374 rack->r_ctl.tqh = malloc(sizeof(struct tailq_hash), M_TCPFSB, M_NOWAIT);
14375 if (rack->r_ctl.tqh == NULL) {
14376 uma_zfree(rack_pcb_zone, rack);
14377 return(ENOMEM);
14378 }
14379 tqhash_init(rack->r_ctl.tqh);
14380 TAILQ_INIT(&rack->r_ctl.rc_free);
14381 TAILQ_INIT(&rack->r_ctl.rc_tmap);
14382 rack->rc_tp = tp;
14383 rack->rc_inp = inp;
14384 /* Set the flag */
14385 rack->r_is_v6 = (inp->inp_vflag & INP_IPV6) != 0;
14386 /* Probably not needed but lets be sure */
14387 rack_clear_rate_sample(rack);
14388 /*
14389 * Save off the default values, socket options will poke
14390 * at these if pacing is not on or we have not yet
14391 * reached where pacing is on (gp_ready/fixed enabled).
14392 * When they get set into the CC module (when gp_ready
14393 * is enabled or we enable fixed) then we will set these
14394 * values into the CC and place in here the old values
14395 * so we have a restoral. Then we will set the flag
14396 * rc_pacing_cc_set. That way whenever we turn off pacing
14397 * or switch off this stack, we will know to go restore
14398 * the saved values.
14399 *
14400 * We specifically put into the beta the ecn value for pacing.
14401 */
14402 rack->rc_new_rnd_needed = 1;
14403 rack->r_ctl.rc_split_limit = V_tcp_map_split_limit;
14404 /* We want abe like behavior as well */
14405
14406 rack->r_ctl.rc_reorder_fade = rack_reorder_fade;
14407 rack->rc_allow_data_af_clo = rack_ignore_data_after_close;
14408 rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh;
14409 if (rack_fill_cw_state)
14410 rack->rc_pace_to_cwnd = 1;
14411 if (rack_pacing_min_seg)
14412 rack->r_ctl.rc_user_set_min_segs = rack_pacing_min_seg;
14413 if (use_rack_rr)
14414 rack->use_rack_rr = 1;
14415 if (rack_dnd_default) {
14416 rack->rc_pace_dnd = 1;
14417 }
14418 if (V_tcp_delack_enabled)
14419 tp->t_delayed_ack = 1;
14420 else
14421 tp->t_delayed_ack = 0;
14422 #ifdef TCP_ACCOUNTING
14423 if (rack_tcp_accounting) {
14424 tp->t_flags2 |= TF2_TCP_ACCOUNTING;
14425 }
14426 #endif
14427 rack->r_ctl.pcm_i.cnt_alloc = RACK_DEFAULT_PCM_ARRAY;
14428 sz = (sizeof(struct rack_pcm_stats) * rack->r_ctl.pcm_i.cnt_alloc);
14429 rack->r_ctl.pcm_s = malloc(sz,M_TCPPCM, M_NOWAIT);
14430 if (rack->r_ctl.pcm_s == NULL) {
14431 rack->r_ctl.pcm_i.cnt_alloc = 0;
14432 }
14433 rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_per_upper_bound_ss;
14434 rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_per_upper_bound_ca;
14435 if (rack_enable_shared_cwnd)
14436 rack->rack_enable_scwnd = 1;
14437 rack->r_ctl.pace_len_divisor = rack_default_pacing_divisor;
14438 rack->rc_user_set_max_segs = rack_hptsi_segments;
14439 rack->r_ctl.max_reduction = rack_max_reduce;
14440 rack->rc_force_max_seg = 0;
14441 TAILQ_INIT(&rack->r_ctl.opt_list);
14442 rack->r_ctl.rc_saved_beta = V_newreno_beta_ecn;
14443 rack->r_ctl.rc_saved_beta_ecn = V_newreno_beta_ecn;
14444 if (rack_hibeta_setting) {
14445 rack->rack_hibeta = 1;
14446 if ((rack_hibeta_setting >= 50) &&
14447 (rack_hibeta_setting <= 100)) {
14448 rack->r_ctl.rc_saved_beta = rack_hibeta_setting;
14449 rack->r_ctl.saved_hibeta = rack_hibeta_setting;
14450 }
14451 } else {
14452 rack->r_ctl.saved_hibeta = 50;
14453 }
14454 /*
14455 * We initialize to all ones so we never match 0
14456 * just in case the client sends in 0, it hopefully
14457 * will never have all 1's in ms :-)
14458 */
14459 rack->r_ctl.last_tm_mark = 0xffffffffffffffff;
14460 rack->r_ctl.rc_reorder_shift = rack_reorder_thresh;
14461 rack->r_ctl.rc_pkt_delay = rack_pkt_delay;
14462 rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp;
14463 rack->r_ctl.rc_lowest_us_rtt = 0xffffffff;
14464 rack->r_ctl.rc_highest_us_rtt = 0;
14465 rack->r_ctl.bw_rate_cap = rack_bw_rate_cap;
14466 rack->pcm_enabled = rack_pcm_is_enabled;
14467 if (rack_fillcw_bw_cap)
14468 rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap;
14469 rack->r_ctl.timer_slop = TICKS_2_USEC(tcp_rexmit_slop);
14470 if (rack_use_cmp_acks)
14471 rack->r_use_cmp_ack = 1;
14472 if (rack_disable_prr)
14473 rack->rack_no_prr = 1;
14474 if (rack_gp_no_rec_chg)
14475 rack->rc_gp_no_rec_chg = 1;
14476 if (rack_pace_every_seg && tcp_can_enable_pacing()) {
14477 rack->r_ctl.pacing_method |= RACK_REG_PACING;
14478 rack->rc_always_pace = 1;
14479 if (rack->rack_hibeta)
14480 rack_set_cc_pacing(rack);
14481 } else
14482 rack->rc_always_pace = 0;
14483 if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack)
14484 rack->r_mbuf_queue = 1;
14485 else
14486 rack->r_mbuf_queue = 0;
14487 rack_set_pace_segments(tp, rack, __LINE__, NULL);
14488 if (rack_limits_scwnd)
14489 rack->r_limit_scw = 1;
14490 else
14491 rack->r_limit_scw = 0;
14492 rack_init_retransmit_value(rack, rack_rxt_controls);
14493 rack->rc_labc = V_tcp_abc_l_var;
14494 if (rack_honors_hpts_min_to)
14495 rack->r_use_hpts_min = 1;
14496 if (tp->snd_una != 0) {
14497 rack->rc_sendvars_notset = 0;
14498 /*
14499 * Make sure any TCP timers are not running.
14500 */
14501 tcp_timer_stop(tp);
14502 } else {
14503 /*
14504 * Server side, we are called from the
14505 * syn-cache. This means none of the
14506 * snd_una/max are set yet so we have
14507 * to defer this until the first send.
14508 */
14509 rack->rc_sendvars_notset = 1;
14510 }
14511
14512 rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method;
14513 rack->rack_tlp_threshold_use = rack_tlp_threshold_use;
14514 rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr;
14515 rack->r_ctl.rc_min_to = rack_min_to;
14516 microuptime(&rack->r_ctl.act_rcv_time);
14517 rack->r_ctl.rack_per_of_gp_ss = rack_per_of_gp_ss;
14518 if (rack_hw_up_only)
14519 rack->r_up_only = 1;
14520 if (rack_do_dyn_mul) {
14521 /* When dynamic adjustment is on CA needs to start at 100% */
14522 rack->rc_gp_dyn_mul = 1;
14523 if (rack_do_dyn_mul >= 100)
14524 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul;
14525 } else
14526 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca;
14527 rack->r_ctl.rack_per_of_gp_rec = rack_per_of_gp_rec;
14528 if (rack_timely_off) {
14529 rack->rc_skip_timely = 1;
14530 }
14531 if (rack->rc_skip_timely) {
14532 rack->r_ctl.rack_per_of_gp_rec = 90;
14533 rack->r_ctl.rack_per_of_gp_ca = 100;
14534 rack->r_ctl.rack_per_of_gp_ss = 250;
14535 }
14536 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt;
14537 rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_msec(&rack->r_ctl.act_rcv_time);
14538 rack->r_ctl.last_rcv_tstmp_for_rtt = tcp_tv_to_msec(&rack->r_ctl.act_rcv_time);
14539
14540 setup_time_filter_small(&rack->r_ctl.rc_gp_min_rtt, FILTER_TYPE_MIN,
14541 rack_probertt_filter_life);
14542 us_cts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time);
14543 rack->r_ctl.rc_lower_rtt_us_cts = us_cts;
14544 rack->r_ctl.rc_time_of_last_probertt = us_cts;
14545 rack->r_ctl.rc_went_idle_time = us_cts;
14546 rack->r_ctl.rc_time_probertt_starts = 0;
14547
14548 rack->r_ctl.gp_rnd_thresh = rack_rnd_cnt_req & 0xff;
14549 if (rack_rnd_cnt_req & 0x10000)
14550 rack->r_ctl.gate_to_fs = 1;
14551 rack->r_ctl.gp_gain_req = rack_gp_gain_req;
14552 if ((rack_rnd_cnt_req & 0x100) > 0) {
14553
14554 }
14555 if (rack_dsack_std_based & 0x1) {
14556 /* Basically this means all rack timers are at least (srtt + 1/4 srtt) */
14557 rack->rc_rack_tmr_std_based = 1;
14558 }
14559 if (rack_dsack_std_based & 0x2) {
14560 /* Basically this means rack timers are extended based on dsack by up to (2 * srtt) */
14561 rack->rc_rack_use_dsack = 1;
14562 }
14563 /* We require at least one measurement, even if the sysctl is 0 */
14564 if (rack_req_measurements)
14565 rack->r_ctl.req_measurements = rack_req_measurements;
14566 else
14567 rack->r_ctl.req_measurements = 1;
14568 if (rack_enable_hw_pacing)
14569 rack->rack_hdw_pace_ena = 1;
14570 if (rack_hw_rate_caps)
14571 rack->r_rack_hw_rate_caps = 1;
14572 if (rack_non_rxt_use_cr)
14573 rack->rack_rec_nonrxt_use_cr = 1;
14574 /* Lets setup the fsb block */
14575 err = rack_init_fsb(tp, rack);
14576 if (err) {
14577 uma_zfree(rack_pcb_zone, *ptr);
14578 *ptr = NULL;
14579 return (err);
14580 }
14581 if (rack_do_hystart) {
14582 tp->t_ccv.flags |= CCF_HYSTART_ALLOWED;
14583 if (rack_do_hystart > 1)
14584 tp->t_ccv.flags |= CCF_HYSTART_CAN_SH_CWND;
14585 if (rack_do_hystart > 2)
14586 tp->t_ccv.flags |= CCF_HYSTART_CONS_SSTH;
14587 }
14588 /* Log what we will do with queries */
14589 rack_log_chg_info(tp, rack, 7,
14590 no_query, 0, 0);
14591 if (rack_def_profile)
14592 rack_set_profile(rack, rack_def_profile);
14593 /* Cancel the GP measurement in progress */
14594 tp->t_flags &= ~TF_GPUTINPROG;
14595 if ((tp->t_state != TCPS_CLOSED) &&
14596 (tp->t_state != TCPS_TIME_WAIT)) {
14597 /*
14598 * We are already open, we may
14599 * need to adjust a few things.
14600 */
14601 if (SEQ_GT(tp->snd_max, tp->iss))
14602 snt = tp->snd_max - tp->iss;
14603 else
14604 snt = 0;
14605 iwin = rc_init_window(rack);
14606 if ((snt < iwin) &&
14607 (no_query == 1)) {
14608 /* We are not past the initial window
14609 * on the first init (i.e. a stack switch
14610 * has not yet occurred) so we need to make
14611 * sure cwnd and ssthresh are correct.
14612 */
14613 if (tp->snd_cwnd < iwin)
14614 tp->snd_cwnd = iwin;
14615 /*
14616 * If we are within the initial window
14617 * we want ssthresh to be unlimited. Setting
14618 * it to the rwnd (which the default stack does
14619 * and older racks) is not really a good idea
14620 * since we want to be in SS and grow both the
14621 * cwnd and the rwnd (via dynamic rwnd growth). If
14622 * we set it to the rwnd then as the peer grows its
14623 * rwnd we will be stuck in CA and never hit SS.
14624 *
14625 * It's far better to raise it up high (this takes the
14626 * risk that there has been a loss already, probably
14627 * we should have an indicator in all stacks of loss
14628 * but we don't), but considering the normal use this
14629 * is a risk worth taking. The consequences of not
14630 * hitting SS are far worse than going one more time
14631 * into it early on (before we have sent even an IW).
14632 * It is highly unlikely that we will have had a loss
14633 * before getting the IW out.
14634 */
14635 tp->snd_ssthresh = 0xffffffff;
14636 }
14637 /*
14638 * Any init based on sequence numbers
14639 * should be done in the deferred init path
14640 * since we can be CLOSED and not have them
14641 * inited when rack_init() is called. We
14642 * are not closed so lets call it.
14643 */
14644 rack_deferred_init(tp, rack);
14645 }
14646 if ((tp->t_state != TCPS_CLOSED) &&
14647 (tp->t_state != TCPS_TIME_WAIT) &&
14648 (no_query == 0) &&
14649 (tp->snd_una != tp->snd_max)) {
14650 err = rack_init_outstanding(tp, rack, us_cts, *ptr);
14651 if (err) {
14652 *ptr = NULL;
14653 return(err);
14654 }
14655 }
14656 rack_stop_all_timers(tp, rack);
14657 /* Setup all the t_flags2 */
14658 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
14659 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ;
14660 else
14661 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ;
14662 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state))
14663 tp->t_flags2 |= TF2_MBUF_ACKCMP;
14664 /*
14665 * Timers in Rack are kept in microseconds so lets
14666 * convert any initial incoming variables
14667 * from ticks into usecs. Note that we
14668 * also change the values of t_srtt and t_rttvar, if
14669 * they are non-zero. They are kept with a 5
14670 * bit decimal so we have to carefully convert
14671 * these to get the full precision.
14672 */
14673 rack_convert_rtts(tp);
14674 rack_log_hystart_event(rack, rack->r_ctl.roundends, 20);
14675 if ((tptoinpcb(tp)->inp_flags & INP_DROPPED) == 0) {
14676 /* We do not start any timers on DROPPED connections */
14677 if (tp->t_fb->tfb_chg_query == NULL) {
14678 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
14679 } else {
14680 struct tcp_query_resp qr;
14681 int ret;
14682
14683 memset(&qr, 0, sizeof(qr));
14684
14685 /* Get the misc time stamps and such for rack */
14686 qr.req = TCP_QUERY_RACK_TIMES;
14687 ret = (*tp->t_fb->tfb_chg_query)(tp, &qr);
14688 if (ret == 1) {
14689 rack->r_ctl.rc_reorder_ts = qr.rack_reorder_ts;
14690 rack->r_ctl.num_dsack = qr.rack_num_dsacks;
14691 rack->r_ctl.rc_tlp_rxt_last_time = qr.rack_rxt_last_time;
14692 rack->r_ctl.rc_rack_min_rtt = qr.rack_min_rtt;
14693 rack->rc_rack_rtt = qr.rack_rtt;
14694 rack->r_ctl.rc_rack_tmit_time = qr.rack_tmit_time;
14695 rack->r_ctl.rc_sacked = qr.rack_sacked;
14696 rack->r_ctl.rc_holes_rxt = qr.rack_holes_rxt;
14697 rack->r_ctl.rc_prr_delivered = qr.rack_prr_delivered;
14698 rack->r_ctl.rc_prr_recovery_fs = qr.rack_prr_recovery_fs;
14699 rack->r_ctl.rc_prr_sndcnt = qr.rack_prr_sndcnt;
14700 rack->r_ctl.rc_prr_out = qr.rack_prr_out;
14701 if (qr.rack_tlp_out) {
14702 rack->rc_tlp_in_progress = 1;
14703 rack->r_ctl.rc_tlp_cnt_out = qr.rack_tlp_cnt_out;
14704 } else {
14705 rack->rc_tlp_in_progress = 0;
14706 rack->r_ctl.rc_tlp_cnt_out = 0;
14707 }
14708 if (qr.rack_srtt_measured)
14709 rack->rc_srtt_measure_made = 1;
14710 if (qr.rack_in_persist == 1) {
14711 rack->r_ctl.rc_went_idle_time = qr.rack_time_went_idle;
14712 #ifdef NETFLIX_SHARED_CWND
14713 if (rack->r_ctl.rc_scw) {
14714 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
14715 rack->rack_scwnd_is_idle = 1;
14716 }
14717 #endif
14718 rack->r_ctl.persist_lost_ends = 0;
14719 rack->probe_not_answered = 0;
14720 rack->forced_ack = 0;
14721 tp->t_rxtshift = 0;
14722 rack->rc_in_persist = 1;
14723 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
14724 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
14725 }
14726 if (qr.rack_wanted_output)
14727 rack->r_wanted_output = 1;
14728 rack_log_chg_info(tp, rack, 6,
14729 qr.rack_min_rtt,
14730 qr.rack_rtt,
14731 qr.rack_reorder_ts);
14732 }
14733 /* Get the old stack timers */
14734 qr.req_param = 0;
14735 qr.req = TCP_QUERY_TIMERS_UP;
14736 ret = (*tp->t_fb->tfb_chg_query)(tp, &qr);
14737 if (ret) {
14738 /*
14739 * non-zero return means we have a timer('s)
14740 * to start. Zero means no timer (no keepalive
14741 * I suppose).
14742 */
14743 uint32_t tov = 0;
14744
14745 rack->r_ctl.rc_hpts_flags = qr.timer_hpts_flags;
14746 if (qr.timer_hpts_flags & PACE_PKT_OUTPUT) {
14747 rack->r_ctl.rc_last_output_to = qr.timer_pacing_to;
14748 if (TSTMP_GT(qr.timer_pacing_to, us_cts))
14749 tov = qr.timer_pacing_to - us_cts;
14750 else
14751 tov = HPTS_USECS_PER_SLOT;
14752 }
14753 if (qr.timer_hpts_flags & PACE_TMR_MASK) {
14754 rack->r_ctl.rc_timer_exp = qr.timer_timer_exp;
14755 if (tov == 0) {
14756 if (TSTMP_GT(qr.timer_timer_exp, us_cts))
14757 tov = qr.timer_timer_exp - us_cts;
14758 else
14759 tov = HPTS_USECS_PER_SLOT;
14760 }
14761 }
14762 rack_log_chg_info(tp, rack, 4,
14763 rack->r_ctl.rc_hpts_flags,
14764 rack->r_ctl.rc_last_output_to,
14765 rack->r_ctl.rc_timer_exp);
14766 if (tov) {
14767 struct hpts_diag diag;
14768
14769 tcp_hpts_insert(tp, tov, &diag);
14770 rack_log_hpts_diag(rack, us_cts, &diag, &rack->r_ctl.act_rcv_time);
14771 }
14772 }
14773 }
14774 rack_log_rtt_shrinks(rack, us_cts, tp->t_rxtcur,
14775 __LINE__, RACK_RTTS_INIT);
14776 }
14777 return (0);
14778 }
14779
14780 static int
14781 rack_handoff_ok(struct tcpcb *tp)
14782 {
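/*
 * Return value sketch: 0 means the handoff can proceed, EAGAIN means
 * not yet (try again once the connection settles or drains), and
 * EINVAL means rack can never run here (no SACK support).
 */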
14783 if ((tp->t_state == TCPS_CLOSED) ||
14784 (tp->t_state == TCPS_LISTEN)) {
14785 /* Sure no problem though it may not stick */
14786 return (0);
14787 }
14788 if ((tp->t_state == TCPS_SYN_SENT) ||
14789 (tp->t_state == TCPS_SYN_RECEIVED)) {
14790 /*
14791 * We really don't know if you support sack,
14792 * you have to get to ESTAB or beyond to tell.
14793 */
14794 return (EAGAIN);
14795 }
14796 if ((tp->t_flags & TF_SENTFIN) && ((tp->snd_max - tp->snd_una) > 1)) {
14797 /*
14798 * Rack will only send a FIN after all data is acknowledged.
14799 * So in this case we have more data outstanding. We can't
14800 * switch stacks until either all data and only the FIN
14801 * is left (in which case rack_init() now knows how
14802 * to deal with that) <or> all is acknowledged and we
14803 * are only left with incoming data, though why you
14804 * would want to switch to rack after all data is acknowledged
14805 * I have no idea (rrs)!
14806 */
14807 return (EAGAIN);
14808 }
14809 if ((tp->t_flags & TF_SACK_PERMIT) || rack_sack_not_required){
14810 return (0);
14811 }
14812 /*
14813 * If we reach here we don't do SACK on this connection so we can
14814 * never do rack.
14815 */
14816 return (EINVAL);
14817 }
14818
14819 static void
14820 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged)
14821 {
14822
14823 if (tp->t_fb_ptr) {
14824 uint32_t cnt_free = 0;
14825 struct tcp_rack *rack;
14826 struct rack_sendmap *rsm;
14827
14828 tcp_handle_orphaned_packets(tp);
14829 tp->t_flags &= ~TF_FORCEDATA;
14830 rack = (struct tcp_rack *)tp->t_fb_ptr;
14831 rack_log_pacing_delay_calc(rack,
14832 0,
14833 0,
14834 0,
14835 rack_get_gp_est(rack), /* delRate */
14836 rack_get_lt_bw(rack), /* rttProp */
14837 20, __LINE__, NULL, 0);
14838 #ifdef NETFLIX_SHARED_CWND
14839 if (rack->r_ctl.rc_scw) {
14840 uint32_t limit;
14841
14842 if (rack->r_limit_scw)
14843 limit = max(1, rack->r_ctl.rc_lowest_us_rtt);
14844 else
14845 limit = 0;
14846 tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw,
14847 rack->r_ctl.rc_scw_index,
14848 limit);
14849 rack->r_ctl.rc_scw = NULL;
14850 }
14851 #endif
14852 if (rack->r_ctl.fsb.tcp_ip_hdr) {
14853 free(rack->r_ctl.fsb.tcp_ip_hdr, M_TCPFSB);
14854 rack->r_ctl.fsb.tcp_ip_hdr = NULL;
14855 rack->r_ctl.fsb.th = NULL;
14856 }
14857 if (rack->rc_always_pace == 1) {
14858 rack_remove_pacing(rack);
14859 }
14860 /* Clean up any options if they were not applied */
14861 while (!TAILQ_EMPTY(&rack->r_ctl.opt_list)) {
14862 struct deferred_opt_list *dol;
14863
14864 dol = TAILQ_FIRST(&rack->r_ctl.opt_list);
14865 TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next);
14866 free(dol, M_TCPDO);
14867 }
14868 /* rack does not use force data but other stacks may clear it */
14869 if (rack->r_ctl.crte != NULL) {
14870 tcp_rel_pacing_rate(rack->r_ctl.crte, tp);
14871 rack->rack_hdrw_pacing = 0;
14872 rack->r_ctl.crte = NULL;
14873 }
14874 #ifdef TCP_BLACKBOX
14875 tcp_log_flowend(tp);
14876 #endif
14877 /*
14878 * Let's take a different approach to purging: just
14879 * get each one and free it like a cum-ack would and
14880 * not use a foreach loop.
14881 */
14882 rsm = tqhash_min(rack->r_ctl.tqh);
14883 while (rsm) {
14884 tqhash_remove(rack->r_ctl.tqh, rsm, REMOVE_TYPE_CUMACK);
14885 rack->r_ctl.rc_num_maps_alloced--;
14886 uma_zfree(rack_zone, rsm);
14887 rsm = tqhash_min(rack->r_ctl.tqh);
14888 }
14889 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
14890 while (rsm) {
14891 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
14892 rack->r_ctl.rc_num_maps_alloced--;
14893 rack->rc_free_cnt--;
14894 cnt_free++;
14895 uma_zfree(rack_zone, rsm);
14896 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
14897 }
14898 if (rack->r_ctl.pcm_s != NULL) {
14899 free(rack->r_ctl.pcm_s, M_TCPPCM);
14900 rack->r_ctl.pcm_s = NULL;
14901 rack->r_ctl.pcm_i.cnt_alloc = 0;
14902 rack->r_ctl.pcm_i.cnt = 0;
14903 }
14904 if ((rack->r_ctl.rc_num_maps_alloced > 0) &&
14905 (tcp_bblogging_on(tp))) {
14906 union tcp_log_stackspecific log;
14907 struct timeval tv;
14908
14909 memset(&log, 0, sizeof(log));
14910 log.u_bbr.flex8 = 10;
14911 log.u_bbr.flex1 = rack->r_ctl.rc_num_maps_alloced;
14912 log.u_bbr.flex2 = rack->rc_free_cnt;
14913 log.u_bbr.flex3 = cnt_free;
14914 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
14915 rsm = tqhash_min(rack->r_ctl.tqh);
14916 log.u_bbr.delRate = (uintptr_t)rsm;
14917 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
14918 log.u_bbr.cur_del_rate = (uintptr_t)rsm;
14919 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
14920 log.u_bbr.pkt_epoch = __LINE__;
14921 (void)tcp_log_event(tp, NULL, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK,
14922 0, &log, false, NULL, NULL, 0, &tv);
14923 }
14924 KASSERT((rack->r_ctl.rc_num_maps_alloced == 0),
14925 ("rack:%p num_aloc:%u after freeing all?",
14926 rack,
14927 rack->r_ctl.rc_num_maps_alloced));
14928 rack->rc_free_cnt = 0;
14929 free(rack->r_ctl.tqh, M_TCPFSB);
14930 rack->r_ctl.tqh = NULL;
14931 uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
14932 tp->t_fb_ptr = NULL;
14933 }
14934 /* Make sure snd_nxt is correctly set */
14935 tp->snd_nxt = tp->snd_max;
14936 }
14937
14938 static void
14939 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack)
14940 {
14941 if ((rack->r_state == TCPS_CLOSED) && (tp->t_state != TCPS_CLOSED)) {
14942 rack->r_is_v6 = (tptoinpcb(tp)->inp_vflag & INP_IPV6) != 0;
14943 }
14944 switch (tp->t_state) {
14945 case TCPS_SYN_SENT:
14946 rack->r_state = TCPS_SYN_SENT;
14947 rack->r_substate = rack_do_syn_sent;
14948 break;
14949 case TCPS_SYN_RECEIVED:
14950 rack->r_state = TCPS_SYN_RECEIVED;
14951 rack->r_substate = rack_do_syn_recv;
14952 break;
14953 case TCPS_ESTABLISHED:
14954 rack_set_pace_segments(tp, rack, __LINE__, NULL);
14955 rack->r_state = TCPS_ESTABLISHED;
14956 rack->r_substate = rack_do_established;
14957 break;
14958 case TCPS_CLOSE_WAIT:
14959 rack->r_state = TCPS_CLOSE_WAIT;
14960 rack->r_substate = rack_do_close_wait;
14961 break;
14962 case TCPS_FIN_WAIT_1:
14963 rack_set_pace_segments(tp, rack, __LINE__, NULL);
14964 rack->r_state = TCPS_FIN_WAIT_1;
14965 rack->r_substate = rack_do_fin_wait_1;
14966 break;
14967 case TCPS_CLOSING:
14968 rack_set_pace_segments(tp, rack, __LINE__, NULL);
14969 rack->r_state = TCPS_CLOSING;
14970 rack->r_substate = rack_do_closing;
14971 break;
14972 case TCPS_LAST_ACK:
14973 rack_set_pace_segments(tp, rack, __LINE__, NULL);
14974 rack->r_state = TCPS_LAST_ACK;
14975 rack->r_substate = rack_do_lastack;
14976 break;
14977 case TCPS_FIN_WAIT_2:
14978 rack->r_state = TCPS_FIN_WAIT_2;
14979 rack->r_substate = rack_do_fin_wait_2;
14980 break;
14981 case TCPS_LISTEN:
14982 case TCPS_CLOSED:
14983 case TCPS_TIME_WAIT:
14984 default:
14985 break;
14986 };
14987 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state))
14988 rack->rc_tp->t_flags2 |= TF2_MBUF_ACKCMP;
14989
14990 }
14991
14992 static void
14993 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb)
14994 {
14995 /*
14996 * We received an ack, and then did not
14997 * call send, or were bounced out because the
14998 * hpts was running. Now a timer is up as well; is
14999 * it the right timer?
15000 */
15001 struct rack_sendmap *rsm;
15002 int tmr_up;
15003
15004 tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
15005 if (tcp_in_hpts(rack->rc_tp) == 0) {
15006 /*
15007 * Ok we probably need some timer up, but no
15008 * matter what the mask we are not in hpts. We
15009 * may have received an old ack and thus did nothing.
15010 */
15011 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
15012 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
15013 return;
15014 }
15015 if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT))
15016 return;
15017 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
15018 if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) &&
15019 (tmr_up == PACE_TMR_RXT)) {
15020 /* Should be an RXT */
15021 return;
15022 }
15023 if (rsm == NULL) {
15024 /* Nothing outstanding? */
15025 if (tp->t_flags & TF_DELACK) {
15026 if (tmr_up == PACE_TMR_DELACK)
15027 /* We are supposed to have delayed ack up and we do */
15028 return;
15029 } else if (((V_tcp_always_keepalive ||
15030 rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
15031 (tp->t_state <= TCPS_CLOSING)) &&
15032 (tmr_up == PACE_TMR_KEEP) &&
15033 (tp->snd_max == tp->snd_una)) {
15034 /* We should have keep alive up and we do */
15035 return;
15036 }
15037 }
15038 if (SEQ_GT(tp->snd_max, tp->snd_una) &&
15039 ((tmr_up == PACE_TMR_TLP) ||
15040 (tmr_up == PACE_TMR_RACK) ||
15041 (tmr_up == PACE_TMR_RXT))) {
15042 /*
15043 * Either a Rack, TLP or RXT is fine if we
15044 * have outstanding data.
15045 */
15046 return;
15047 } else if (tmr_up == PACE_TMR_DELACK) {
15048 /*
15049 * If the delayed ack was going to go off
15050 * before the rtx/tlp/rack timer were going to
15051 * expire, then that would be the timer in control.
15052 * Note we don't check the time here trusting the
15053 * code is correct.
15054 */
15055 return;
15056 }
15057 /*
15058 * Ok the timer originally started is not what we want now.
15059 * We will force the hpts to be stopped if any, and restart
15060 * with the slot set to what was in the saved slot.
15061 */
15062 if (tcp_in_hpts(rack->rc_tp)) {
15063 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
15064 uint32_t us_cts;
15065
15066 us_cts = tcp_get_usecs(NULL);
15067 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) {
15068 rack->r_early = 1;
15069 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts);
15070 }
15071 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
15072 }
15073 tcp_hpts_remove(rack->rc_tp);
15074 }
15075 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
15076 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
15077 }
15078
15079
15080 static void
15081 rack_do_win_updates(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tiwin, uint32_t seq, uint32_t ack, uint32_t cts)
15082 {
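/*
 * The check below is the classic send-window update acceptance test:
 * accept the advertised window if the segment carries a newer sequence
 * (snd_wl1), the same sequence with a newer ack (snd_wl2), or the same
 * seq/ack pair with a larger window.
 */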
15083 if ((SEQ_LT(tp->snd_wl1, seq) ||
15084 (tp->snd_wl1 == seq && (SEQ_LT(tp->snd_wl2, ack) ||
15085 (tp->snd_wl2 == ack && tiwin > tp->snd_wnd))))) {
15086 /* keep track of pure window updates */
15087 if ((tp->snd_wl2 == ack) && (tiwin > tp->snd_wnd))
15088 KMOD_TCPSTAT_INC(tcps_rcvwinupd);
15089 tp->snd_wnd = tiwin;
15090 rack_validate_fo_sendwin_up(tp, rack);
15091 tp->snd_wl1 = seq;
15092 tp->snd_wl2 = ack;
15093 if (tp->snd_wnd > tp->max_sndwnd)
15094 tp->max_sndwnd = tp->snd_wnd;
15095 rack->r_wanted_output = 1;
15096 } else if ((tp->snd_wl2 == ack) && (tiwin < tp->snd_wnd)) {
15097 tp->snd_wnd = tiwin;
15098 rack_validate_fo_sendwin_up(tp, rack);
15099 tp->snd_wl1 = seq;
15100 tp->snd_wl2 = ack;
15101 } else {
15102 /* Not a valid win update */
15103 return;
15104 }
15105 if (tp->snd_wnd > tp->max_sndwnd)
15106 tp->max_sndwnd = tp->snd_wnd;
15107 /* Do we exit persists? */
15108 if ((rack->rc_in_persist != 0) &&
15109 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
15110 rack->r_ctl.rc_pace_min_segs))) {
15111 rack_exit_persist(tp, rack, cts);
15112 }
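/*
 * Worked example with hypothetical values: if rc_high_rwnd is 64000 and
 * rc_pace_min_segs is 1448, the threshold is min(32000, 1448) = 1448,
 * so we exit persists once the peer advertises at least 1448 bytes and
 * only consider entering while it advertises less than that.
 */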
15113 /* Do we enter persists? */
15114 if ((rack->rc_in_persist == 0) &&
15115 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) &&
15116 TCPS_HAVEESTABLISHED(tp->t_state) &&
15117 ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) &&
15118 sbavail(&tptosocket(tp)->so_snd) &&
15119 (sbavail(&tptosocket(tp)->so_snd) > tp->snd_wnd)) {
15120 /*
15121 * Here the rwnd is less than
15122 * the pacing size, we are established,
15123 * nothing is outstanding, and there is
15124 * data to send. Enter persists.
15125 */
15126 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, ack);
15127 }
15128 }
15129
15130 static void
15131 rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent *ae, int ackval, uint32_t high_seq)
15132 {
15133
15134 if (tcp_bblogging_on(rack->rc_tp)) {
15135 struct inpcb *inp = tptoinpcb(tp);
15136 union tcp_log_stackspecific log;
15137 struct timeval ltv;
15138 char tcp_hdr_buf[60];
15139 struct tcphdr *th;
15140 struct timespec ts;
15141 uint32_t orig_snd_una;
15142 uint8_t xx = 0;
15143
15144 #ifdef TCP_REQUEST_TRK
15145 struct tcp_sendfile_track *tcp_req;
15146
15147 if (SEQ_GT(ae->ack, tp->snd_una)) {
15148 tcp_req = tcp_req_find_req_for_seq(tp, (ae->ack-1));
15149 } else {
15150 tcp_req = tcp_req_find_req_for_seq(tp, ae->ack);
15151 }
15152 #endif
15153 memset(&log, 0, sizeof(log));
15154 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
15155 if (rack->rack_no_prr == 0)
15156 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
15157 else
15158 log.u_bbr.flex1 = 0;
15159 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
15160 log.u_bbr.use_lt_bw <<= 1;
15161 log.u_bbr.use_lt_bw |= rack->r_might_revert;
15162 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced;
15163 log.u_bbr.bbr_state = rack->rc_free_cnt;
15164 log.u_bbr.inflight = ctf_flight_size(tp, rack->r_ctl.rc_sacked);
15165 log.u_bbr.pkts_out = tp->t_maxseg;
15166 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
15167 log.u_bbr.flex7 = 1;
15168 log.u_bbr.lost = ae->flags;
15169 log.u_bbr.cwnd_gain = ackval;
15170 log.u_bbr.pacing_gain = 0x2;
15171 if (ae->flags & TSTMP_HDWR) {
15172 /* Record the hardware timestamp if present */
15173 log.u_bbr.flex3 = M_TSTMP;
15174 ts.tv_sec = ae->timestamp / 1000000000;
15175 ts.tv_nsec = ae->timestamp % 1000000000;
15176 ltv.tv_sec = ts.tv_sec;
15177 ltv.tv_usec = ts.tv_nsec / 1000;
15178 log.u_bbr.lt_epoch = tcp_tv_to_usec(<v);
15179 } else if (ae->flags & TSTMP_LRO) {
15180 /* Record the LRO arrival timestamp */
15181 log.u_bbr.flex3 = M_TSTMP_LRO;
15182 ts.tv_sec = ae->timestamp / 1000000000;
15183 ts.tv_nsec = ae->timestamp % 1000000000;
15184 ltv.tv_sec = ts.tv_sec;
15185 ltv.tv_usec = ts.tv_nsec / 1000;
15186 log.u_bbr.flex5 = tcp_tv_to_usec(<v);
15187 }
15188 log.u_bbr.timeStamp = tcp_get_usecs(<v);
15189 /* Log the rcv time */
15190 log.u_bbr.delRate = ae->timestamp;
15191 #ifdef TCP_REQUEST_TRK
15192 log.u_bbr.applimited = tp->t_tcpreq_closed;
15193 log.u_bbr.applimited <<= 8;
15194 log.u_bbr.applimited |= tp->t_tcpreq_open;
15195 log.u_bbr.applimited <<= 8;
15196 log.u_bbr.applimited |= tp->t_tcpreq_req;
15197 if (tcp_req) {
15198 /* Copy out any client req info */
15199 /* seconds */
15200 log.u_bbr.pkt_epoch = (tcp_req->localtime / HPTS_USEC_IN_SEC);
15201 /* useconds */
15202 log.u_bbr.delivered = (tcp_req->localtime % HPTS_USEC_IN_SEC);
15203 log.u_bbr.rttProp = tcp_req->timestamp;
15204 log.u_bbr.cur_del_rate = tcp_req->start;
15205 if (tcp_req->flags & TCP_TRK_TRACK_FLG_OPEN) {
15206 log.u_bbr.flex8 |= 1;
15207 } else {
15208 log.u_bbr.flex8 |= 2;
15209 log.u_bbr.bw_inuse = tcp_req->end;
15210 }
15211 log.u_bbr.flex6 = tcp_req->start_seq;
15212 if (tcp_req->flags & TCP_TRK_TRACK_FLG_COMP) {
15213 log.u_bbr.flex8 |= 4;
15214 log.u_bbr.epoch = tcp_req->end_seq;
15215 }
15216 }
15217 #endif
15218 memset(tcp_hdr_buf, 0, sizeof(tcp_hdr_buf));
15219 th = (struct tcphdr *)tcp_hdr_buf;
15220 th->th_seq = ae->seq;
15221 th->th_ack = ae->ack;
15222 th->th_win = ae->win;
15223 /* Now fill in the ports */
15224 th->th_sport = inp->inp_fport;
15225 th->th_dport = inp->inp_lport;
15226 tcp_set_flags(th, ae->flags);
15227 /* Now do we have a timestamp option? */
15228 if (ae->flags & HAS_TSTMP) {
15229 u_char *cp;
15230 uint32_t val;
15231
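/*
 * Build the standard padded timestamp option: two NOPs followed by
 * kind 8, length 10, TSval and TSecr, i.e. TCPOLEN_TSTAMP_APPA (12)
 * bytes, which is why th_off below grows by three 32-bit words.
 */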
15232 th->th_off = ((sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA) >> 2);
15233 cp = (u_char *)(th + 1);
15234 *cp = TCPOPT_NOP;
15235 cp++;
15236 *cp = TCPOPT_NOP;
15237 cp++;
15238 *cp = TCPOPT_TIMESTAMP;
15239 cp++;
15240 *cp = TCPOLEN_TIMESTAMP;
15241 cp++;
15242 val = htonl(ae->ts_value);
15243 bcopy((char *)&val,
15244 (char *)cp, sizeof(uint32_t));
15245 val = htonl(ae->ts_echo);
15246 bcopy((char *)&val,
15247 (char *)(cp + 4), sizeof(uint32_t));
15248 } else
15249 th->th_off = (sizeof(struct tcphdr) >> 2);
15250
15251 /*
15252 * For sane logging we need to play a little trick.
15253 * If the ack were fully processed we would have moved
15254 * snd_una to high_seq, but since compressed acks are
15255 * processed in two phases, at this point (logging) snd_una
15256 * won't be advanced. So we would see multiple acks showing
15257 * the advancement. We can prevent that by "pretending" that
15258 * snd_una was advanced and then un-advancing it so that the
15259 * logging code has the right value for tlb_snd_una.
15260 */
15261 if (tp->snd_una != high_seq) {
15262 orig_snd_una = tp->snd_una;
15263 tp->snd_una = high_seq;
15264 xx = 1;
15265 } else
15266 xx = 0;
15267 TCP_LOG_EVENTP(tp, th,
15268 &tptosocket(tp)->so_rcv,
15269 &tptosocket(tp)->so_snd, TCP_LOG_IN, 0,
15270 0, &log, true, <v);
15271 if (xx) {
15272 tp->snd_una = orig_snd_una;
15273 }
15274 }
15275
15276 }
15277
15278 static void
15279 rack_handle_probe_response(struct tcp_rack *rack, uint32_t tiwin, uint32_t us_cts)
15280 {
15281 uint32_t us_rtt;
15282 /*
15283 * A persist or keep-alive was forced out, update our
15284 * min rtt time. Note we now worry about lost responses.
15285 * When a subsequent keep-alive or persist times out
15286 * and forced_ack is still on, then the last probe
15287 * was not responded to. In such cases we have a
15288 * sysctl that controls the behavior. Either we apply
15289 * the rtt but with reduced confidence (0). Or we just
15290 * plain don't apply the rtt estimate. Having data flow
15291 * will clear the probe_not_answered flag i.e. cum-ack
15292 * move forward <or> exiting and reentering persists.
15293 */
15294
15295 rack->forced_ack = 0;
15296 rack->rc_tp->t_rxtshift = 0;
15297 if ((rack->rc_in_persist &&
15298 (tiwin == rack->rc_tp->snd_wnd)) ||
15299 (rack->rc_in_persist == 0)) {
15300 /*
15301 * In persists only apply the RTT update if this is
15302 * a response to our window probe. And that
15303 * means the rwnd sent must match the current
15304 * snd_wnd. If it does not, then we got a
15305 * window update ack instead. For keepalive
15306 * we allow the answer no matter what the window.
15307 *
15308 * Note that if the probe_not_answered is set then
15309 * the forced_ack_ts is the oldest one i.e. the first
15310 * probe sent that might have been lost. This assures
15311 * us that if we do calculate an RTT it errs on the long
15312 * side rather than being some short value.
15313 */
15314 if (rack->rc_in_persist)
15315 counter_u64_add(rack_persists_acks, 1);
15316 us_rtt = us_cts - rack->r_ctl.forced_ack_ts;
15317 if (us_rtt == 0)
15318 us_rtt = 1;
15319 if (rack->probe_not_answered == 0) {
15320 rack_apply_updated_usrtt(rack, us_rtt, us_cts);
15321 tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 3, NULL, 1);
15322 } else {
15323 /* We have a retransmitted probe here too */
15324 if (rack_apply_rtt_with_reduced_conf) {
15325 rack_apply_updated_usrtt(rack, us_rtt, us_cts);
15326 tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 0, NULL, 1);
15327 }
15328 }
15329 }
15330 }
15331
15332 static void
15333 rack_new_round_starts(struct tcpcb *tp, struct tcp_rack *rack, uint32_t high_seq)
15334 {
15335 /*
15336 * The next send has occurred; mark the end of the round
15337 * as when that data gets acknowledged. We can
15338 * also do common things we might need to do when
15339 * a round begins.
15340 */
15341 rack->r_ctl.roundends = tp->snd_max;
15342 rack->rc_new_rnd_needed = 0;
15343 rack_log_hystart_event(rack, tp->snd_max, 4);
15344 }
15345
15346
15347 static void
15348 rack_log_pcm(struct tcp_rack *rack, uint8_t mod, uint32_t flex1, uint32_t flex2,
15349 uint32_t flex3)
15350 {
15351 if (tcp_bblogging_on(rack->rc_tp)) {
15352 union tcp_log_stackspecific log;
15353 struct timeval tv;
15354
15355 (void)tcp_get_usecs(&tv);
15356 memset(&log, 0, sizeof(log));
15357 log.u_bbr.timeStamp = tcp_tv_to_usec(&tv);
15358 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
15359 log.u_bbr.flex8 = mod;
15360 log.u_bbr.flex1 = flex1;
15361 log.u_bbr.flex2 = flex2;
15362 log.u_bbr.flex3 = flex3;
15363 log.u_bbr.flex4 = rack_pcm_every_n_rounds;
15364 log.u_bbr.flex5 = rack->r_ctl.pcm_idle_rounds;
15365 log.u_bbr.bbr_substate = rack->pcm_needed;
15366 log.u_bbr.bbr_substate <<= 1;
15367 log.u_bbr.bbr_substate |= rack->pcm_in_progress;
15368 log.u_bbr.bbr_substate <<= 1;
15369 log.u_bbr.bbr_substate |= rack->pcm_enabled; /* bits are NIE for Needed, Inprogress, Enabled */
15370 (void)tcp_log_event(rack->rc_tp, NULL, NULL, NULL, TCP_PCM_MEASURE, ERRNO_UNK,
15371 0, &log, false, NULL, NULL, 0, &tv);
15372 }
15373 }
15374
15375 static void
15376 rack_new_round_setup(struct tcpcb *tp, struct tcp_rack *rack, uint32_t high_seq)
15377 {
15378 /*
15379 * The round (current_round) has ended. We now
15380 * setup for the next round by incrementing the
15381 * round number and doing any round specific
15382 * things.
15383 */
15384 rack_log_hystart_event(rack, high_seq, 21);
15385 rack->r_ctl.current_round++;
15386 /* New round (current_round) begins at next send */
15387 rack->rc_new_rnd_needed = 1;
15388 if ((rack->pcm_enabled == 1) &&
15389 (rack->pcm_needed == 0) &&
15390 (rack->pcm_in_progress == 0)) {
15391 /*
15392 * If we have enabled PCM, then we need to
15393 * check if the round has advanced to the state
15394 * where one is required.
15395 */
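/*
 * For example (hypothetical numbers): with rack_pcm_every_n_rounds = 5,
 * last_pcm_round = 10 and pcm_idle_rounds = 2, rnds is
 * current_round - 10, so pcm_needed gets set once current_round
 * reaches 13 (3 + 2 >= 5).
 */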
15396 int rnds;
15397
15398 rnds = rack->r_ctl.current_round - rack->r_ctl.last_pcm_round;
15399 if ((rnds + rack->r_ctl.pcm_idle_rounds) >= rack_pcm_every_n_rounds) {
15400 rack->pcm_needed = 1;
15401 rack_log_pcm(rack, 3, rack->r_ctl.last_pcm_round, rack_pcm_every_n_rounds, rack->r_ctl.current_round );
15402 } else if (rack_verbose_logging) {
15403 rack_log_pcm(rack, 3, rack->r_ctl.last_pcm_round, rack_pcm_every_n_rounds, rack->r_ctl.current_round );
15404 }
15405 }
15406 if (tp->t_ccv.flags & CCF_HYSTART_ALLOWED) {
15407 /* We have hystart enabled send the round info in */
15408 if (CC_ALGO(tp)->newround != NULL) {
15409 CC_ALGO(tp)->newround(&tp->t_ccv, rack->r_ctl.current_round);
15410 }
15411 }
15412 /*
15413 * For DGP an initial startup check. We want to validate
15414 * that we are not just pushing on slow-start and just
15415 * not gaining, i.e. filling buffers without getting any
15416 * boost in b/w during the initial slow-start.
15417 */
15418 if (rack->dgp_on &&
15419 (rack->rc_initial_ss_comp == 0) &&
15420 (tp->snd_cwnd < tp->snd_ssthresh) &&
15421 (rack->r_ctl.num_measurements >= RACK_REQ_AVG) &&
15422 (rack->r_ctl.gp_rnd_thresh > 0) &&
15423 ((rack->r_ctl.current_round - rack->r_ctl.last_rnd_of_gp_rise) >= rack->r_ctl.gp_rnd_thresh)) {
15424
15425 /*
15426 * We are in the initial SS and we have had rack_rnd_cnt_req rounds (def:5) where
15427 * we have not gained the required amount in the gp_est (120.0% aka 1200). Let's
15428 * exit SS.
15429 *
15430 * Pick up the flight size now as we enter slowstart (not the
15431 * cwnd which may be inflated).
15432 */
15433 rack->rc_initial_ss_comp = 1;
15434
15435 if (tcp_bblogging_on(rack->rc_tp)) {
15436 union tcp_log_stackspecific log;
15437 struct timeval tv;
15438
15439 memset(&log, 0, sizeof(log));
15440 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
15441 log.u_bbr.flex1 = rack->r_ctl.current_round;
15442 log.u_bbr.flex2 = rack->r_ctl.last_rnd_of_gp_rise;
15443 log.u_bbr.flex3 = rack->r_ctl.gp_rnd_thresh;
15444 log.u_bbr.flex4 = rack->r_ctl.gate_to_fs;
15445 log.u_bbr.flex5 = rack->r_ctl.ss_hi_fs;
15446 log.u_bbr.flex8 = 40;
15447 (void)tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
15448 0, &log, false, NULL, __func__, __LINE__,&tv);
15449 }
15450 if ((rack->r_ctl.gate_to_fs == 1) &&
15451 (tp->snd_cwnd > rack->r_ctl.ss_hi_fs)) {
15452 tp->snd_cwnd = rack->r_ctl.ss_hi_fs;
15453 }
15454 tp->snd_ssthresh = tp->snd_cwnd - 1;
15455 /* Turn off any fast output running */
15456 rack->r_fast_output = 0;
15457 }
15458 }
15459
15460 static int
15461 rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mbuf *m, int nxt_pkt, struct timeval *tv)
15462 {
15463 /*
15464 * Handle a "special" compressed ack mbuf. Each incoming
15465 * ack has only four possible dispositions:
15466 *
15467 * A) It moves the cum-ack forward
15468 * B) It is behind the cum-ack.
15469 * C) It is a window-update ack.
15470 * D) It is a dup-ack.
15471 *
15472 * Note that we can have between 1 -> TCP_COMP_ACK_ENTRIES
15473 * in the incoming mbuf. We also need to still pay attention
15474 * to nxt_pkt since there may be another packet after this
15475 * one.
15476 */
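/*
 * A sketch of how those dispositions map onto ack_val_set in the loop
 * below:
 *   A (moves cum-ack forward) -> ACK_CUMACK
 *   B (behind the cum-ack)    -> ACK_BEHIND
 *   C (window update)         -> ACK_RWND
 *   D (dup-ack)               -> ACK_DUPACK
 */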
15477 #ifdef TCP_ACCOUNTING
15478 uint64_t ts_val;
15479 uint64_t rdstc;
15480 #endif
15481 int segsiz;
15482 struct timespec ts;
15483 struct tcp_rack *rack;
15484 struct tcp_ackent *ae;
15485 uint32_t tiwin, ms_cts, cts, acked, acked_amount, high_seq, win_seq, the_win, win_upd_ack;
15486 int cnt, i, did_out, ourfinisacked = 0;
15487 struct tcpopt to_holder, *to = NULL;
15488 #ifdef TCP_ACCOUNTING
15489 int win_up_req = 0;
15490 #endif
15491 int nsegs = 0;
15492 int under_pacing = 0;
15493 int post_recovery = 0;
15494 #ifdef TCP_ACCOUNTING
15495 sched_pin();
15496 #endif
15497 rack = (struct tcp_rack *)tp->t_fb_ptr;
15498 if (rack->gp_ready &&
15499 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT))
15500 under_pacing = 1;
15501
15502 if (rack->r_state != tp->t_state)
15503 rack_set_state(tp, rack);
15504 if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
15505 (tp->t_flags & TF_GPUTINPROG)) {
15506 /*
15507 * We have a goodput in progress
15508 * and we have entered a late state.
15509 * Do we have enough data in the sb
15510 * to handle the GPUT request?
15511 */
15512 uint32_t bytes;
15513
15514 bytes = tp->gput_ack - tp->gput_seq;
15515 if (SEQ_GT(tp->gput_seq, tp->snd_una))
15516 bytes += tp->gput_seq - tp->snd_una;
15517 if (bytes > sbavail(&tptosocket(tp)->so_snd)) {
15518 /*
15519 * There are not enough bytes in the socket
15520 * buffer that have been sent to cover this
15521 * measurement. Cancel it.
15522 */
15523 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
15524 rack->r_ctl.rc_gp_srtt /*flex1*/,
15525 tp->gput_seq,
15526 0, 0, 18, __LINE__, NULL, 0);
15527 tp->t_flags &= ~TF_GPUTINPROG;
15528 }
15529 }
15530 to = &to_holder;
15531 to->to_flags = 0;
15532 KASSERT((m->m_len >= sizeof(struct tcp_ackent)),
15533 ("tp:%p m_cmpack:%p with invalid len:%u", tp, m, m->m_len));
15534 cnt = m->m_len / sizeof(struct tcp_ackent);
15535 counter_u64_add(rack_multi_single_eq, cnt);
15536 high_seq = tp->snd_una;
15537 the_win = tp->snd_wnd;
15538 win_seq = tp->snd_wl1;
15539 win_upd_ack = tp->snd_wl2;
15540 cts = tcp_tv_to_usec(tv);
15541 ms_cts = tcp_tv_to_msec(tv);
15542 rack->r_ctl.rc_rcvtime = cts;
15543 segsiz = ctf_fixed_maxseg(tp);
15544 if ((rack->rc_gp_dyn_mul) &&
15545 (rack->use_fixed_rate == 0) &&
15546 (rack->rc_always_pace)) {
15547 /* Check in on probertt */
15548 rack_check_probe_rtt(rack, cts);
15549 }
15550 for (i = 0; i < cnt; i++) {
15551 #ifdef TCP_ACCOUNTING
15552 ts_val = get_cyclecount();
15553 #endif
15554 rack_clear_rate_sample(rack);
15555 ae = ((mtod(m, struct tcp_ackent *)) + i);
15556 if (ae->flags & TH_FIN)
15557 rack_log_pacing_delay_calc(rack,
15558 0,
15559 0,
15560 0,
15561 rack_get_gp_est(rack), /* delRate */
15562 rack_get_lt_bw(rack), /* rttProp */
15563 20, __LINE__, NULL, 0);
15564 /* Setup the window */
15565 tiwin = ae->win << tp->snd_scale;
15566 if (tiwin > rack->r_ctl.rc_high_rwnd)
15567 rack->r_ctl.rc_high_rwnd = tiwin;
15568 /* figure out the type of ack */
15569 if (SEQ_LT(ae->ack, high_seq)) {
15570 /* Case B */
15571 ae->ack_val_set = ACK_BEHIND;
15572 } else if (SEQ_GT(ae->ack, high_seq)) {
15573 /* Case A */
15574 ae->ack_val_set = ACK_CUMACK;
15575 } else if ((tiwin == the_win) && (rack->rc_in_persist == 0)){
15576 /* Case D */
15577 ae->ack_val_set = ACK_DUPACK;
15578 } else {
15579 /* Case C */
15580 ae->ack_val_set = ACK_RWND;
15581 }
15582 rack_log_type_bbrsnd(rack, 0, 0, cts, tv, __LINE__);
15583 rack_log_input_packet(tp, rack, ae, ae->ack_val_set, high_seq);
15584 /* Validate timestamp */
15585 if (ae->flags & HAS_TSTMP) {
15586 /* Setup for a timestamp */
15587 to->to_flags = TOF_TS;
15588 ae->ts_echo -= tp->ts_offset;
15589 to->to_tsecr = ae->ts_echo;
15590 to->to_tsval = ae->ts_value;
15591 /*
15592 * If echoed timestamp is later than the current time, fall back to
15593 * non RFC1323 RTT calculation. Normalize timestamp if syncookies
15594 * were used when this connection was established.
15595 */
15596 if (TSTMP_GT(ae->ts_echo, ms_cts))
15597 to->to_tsecr = 0;
15598 if (tp->ts_recent &&
15599 TSTMP_LT(ae->ts_value, tp->ts_recent)) {
15600 if (ctf_ts_check_ac(tp, (ae->flags & 0xff))) {
15601 #ifdef TCP_ACCOUNTING
15602 rdstc = get_cyclecount();
15603 if (rdstc > ts_val) {
15604 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
15605 tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val);
15606 }
15607 }
15608 #endif
15609 continue;
15610 }
15611 }
15612 if (SEQ_LEQ(ae->seq, tp->last_ack_sent) &&
15613 SEQ_LEQ(tp->last_ack_sent, ae->seq)) {
15614 tp->ts_recent_age = tcp_ts_getticks();
15615 tp->ts_recent = ae->ts_value;
15616 }
15617 } else {
15618 /* Setup for a no options */
15619 to->to_flags = 0;
15620 }
15621 /* Update the rcv time and perform idle reduction possibly */
15622 if (tp->t_idle_reduce &&
15623 (tp->snd_max == tp->snd_una) &&
15624 (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur)) {
15625 counter_u64_add(rack_input_idle_reduces, 1);
15626 rack_cc_after_idle(rack, tp);
15627 }
15628 tp->t_rcvtime = ticks;
15629 /* Now what about ECN of a chain of pure ACKs? */
15630 if (tcp_ecn_input_segment(tp, ae->flags, 0,
15631 tcp_packets_this_ack(tp, ae->ack),
15632 ae->codepoint))
15633 rack_cong_signal(tp, CC_ECN, ae->ack, __LINE__);
15634 #ifdef TCP_ACCOUNTING
15635 /* Count for the specific type of ack in */
15636 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
15637 tp->tcp_cnt_counters[ae->ack_val_set]++;
15638 }
15639 #endif
15640 /*
15641 * Note how we could move these up into the determination
15642 * above, but we don't, so that the timestamp checks (and ECN)
15643 * are done first, before we do any processing on the ACK.
15644 * The non-compressed path through the code has this
15645 * weakness (noted by @jtl) in that it actually does some
15646 * processing before verifying the timestamp information.
15647 * We don't take that path here, which is why we set
15648 * the ack_val_set first, do the timestamp and ECN
15649 * processing, and then look at what we have set up.
15650 */
15651 if (ae->ack_val_set == ACK_BEHIND) {
15652 /*
15653 * Case B: flag reordering if the window is not closed;
15654 * otherwise it could be a keep-alive or persist probe.
15655 */
15656 if (SEQ_LT(ae->ack, tp->snd_una) && (sbspace(&so->so_rcv) > segsiz)) {
15657 rack->r_ctl.rc_reorder_ts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time);
15658 if (rack->r_ctl.rc_reorder_ts == 0)
15659 rack->r_ctl.rc_reorder_ts = 1;
15660 }
15661 } else if (ae->ack_val_set == ACK_DUPACK) {
15662 /* Case D */
15663 rack_strike_dupack(rack, ae->ack);
15664 } else if (ae->ack_val_set == ACK_RWND) {
15665 /* Case C */
15666 if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) {
15667 ts.tv_sec = ae->timestamp / 1000000000;
15668 ts.tv_nsec = ae->timestamp % 1000000000;
15669 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
15670 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
15671 } else {
15672 rack->r_ctl.act_rcv_time = *tv;
15673 }
15674 if (rack->forced_ack) {
15675 rack_handle_probe_response(rack, tiwin,
15676 tcp_tv_to_usec(&rack->r_ctl.act_rcv_time));
15677 }
15678 #ifdef TCP_ACCOUNTING
15679 win_up_req = 1;
15680 #endif
15681 win_upd_ack = ae->ack;
15682 win_seq = ae->seq;
15683 the_win = tiwin;
15684 rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts);
15685 } else {
15686 /* Case A */
15687 if (SEQ_GT(ae->ack, tp->snd_max)) {
15688 /*
15689 * We just send an ack since the incoming
15690 * ack is beyond the largest seq we sent.
15691 */
15692 if ((tp->t_flags & TF_ACKNOW) == 0) {
15693 ctf_ack_war_checks(tp);
15694 if (tp->t_flags & TF_ACKNOW)
15695 rack->r_wanted_output = 1;
15696 }
15697 } else {
15698 nsegs++;
15699 /* If the window changed setup to update */
15700 if (tiwin != tp->snd_wnd) {
15701 win_upd_ack = ae->ack;
15702 win_seq = ae->seq;
15703 the_win = tiwin;
15704 rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts);
15705 }
15706 #ifdef TCP_ACCOUNTING
15707 /* Account for the acks */
15708 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
15709 tp->tcp_cnt_counters[CNT_OF_ACKS_IN] += (((ae->ack - high_seq) + segsiz - 1) / segsiz);
15710 }
15711 #endif
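/*
 * Illustrative note on the accounting math above (hypothetical
 * numbers): the cumack advance is rounded up to whole segments,
 * e.g. a 2900 byte advance with segsiz == 1448 counts as
 * (2900 + 1447) / 1448 == 3 acked segments.
 */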
15712 high_seq = ae->ack;
15713 /* Setup our act_rcv_time */
15714 if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) {
15715 ts.tv_sec = ae->timestamp / 1000000000;
15716 ts.tv_nsec = ae->timestamp % 1000000000;
15717 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
15718 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
15719 } else {
15720 rack->r_ctl.act_rcv_time = *tv;
15721 }
15722 rack_process_to_cumack(tp, rack, ae->ack, cts, to,
15723 tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time));
15724 #ifdef TCP_REQUEST_TRK
15725 rack_req_check_for_comp(rack, high_seq);
15726 #endif
15727 if (rack->rc_dsack_round_seen) {
15728 /* Is the dsack round over? */
15729 if (SEQ_GEQ(ae->ack, rack->r_ctl.dsack_round_end)) {
15730 /* Yes it is */
15731 rack->rc_dsack_round_seen = 0;
15732 rack_log_dsack_event(rack, 3, __LINE__, 0, 0);
15733 }
15734 }
15735 }
15736 }
15737 /* And lets be sure to commit the rtt measurements for this ack */
15738 tcp_rack_xmit_timer_commit(rack, tp);
15739 #ifdef TCP_ACCOUNTING
15740 rdstc = get_cyclecount();
15741 if (rdstc > ts_val) {
15742 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
15743 tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val);
15744 if (ae->ack_val_set == ACK_CUMACK)
15745 tp->tcp_proc_time[CYC_HANDLE_MAP] += (rdstc - ts_val);
15746 }
15747 }
15748 #endif
15749 }
15750 #ifdef TCP_ACCOUNTING
15751 ts_val = get_cyclecount();
15752 #endif
15753 /* Tend to any collapsed window */
15754 if (SEQ_GT(tp->snd_max, high_seq) && (tp->snd_wnd < (tp->snd_max - high_seq))) {
15755 /* The peer collapsed the window */
15756 rack_collapsed_window(rack, (tp->snd_max - high_seq), high_seq, __LINE__);
15757 } else if (rack->rc_has_collapsed)
15758 rack_un_collapse_window(rack, __LINE__);
15759 if ((rack->r_collapse_point_valid) &&
15760 (SEQ_GT(high_seq, rack->r_ctl.high_collapse_point)))
15761 rack->r_collapse_point_valid = 0;
15762 acked_amount = acked = (high_seq - tp->snd_una);
15763 if (acked) {
15764 /*
15765 * The draft (v3) calls for us to use SEQ_GEQ, but that
15766 * causes issues when we are just going app limited. Let's
15767 * instead use SEQ_GT <or> where it's equal but more data
15768 * is outstanding.
15769 *
15770 * Also make sure we are on the last ack of a series. We
15771 * have to have all the acks processed in queue to know
15772 * if there is something left outstanding.
15773 *
15774 */
15775 if (SEQ_GEQ(high_seq, rack->r_ctl.roundends) &&
15776 (rack->rc_new_rnd_needed == 0) &&
15777 (nxt_pkt == 0)) {
15778 /*
15779 * We have crossed into a new round with
15780 * this th_ack value.
15781 */
15782 rack_new_round_setup(tp, rack, high_seq);
15783 }
15784 /*
15785 * Clear the probe not answered flag
15786 * since cum-ack moved forward.
15787 */
15788 rack->probe_not_answered = 0;
15789 if (tp->t_flags & TF_NEEDSYN) {
15790 /*
15791 * T/TCP: Connection was half-synchronized, and our SYN has
15792 * been ACK'd (so connection is now fully synchronized). Go
15793 * to non-starred state, increment snd_una for ACK of SYN,
15794 * and check if we can do window scaling.
15795 */
15796 tp->t_flags &= ~TF_NEEDSYN;
15797 tp->snd_una++;
15798 acked_amount = acked = (high_seq - tp->snd_una);
15799 }
15800 if (acked > sbavail(&so->so_snd))
15801 acked_amount = sbavail(&so->so_snd);
15802 if (IN_FASTRECOVERY(tp->t_flags) &&
15803 (rack->rack_no_prr == 0))
15804 rack_update_prr(tp, rack, acked_amount, high_seq);
15805 if (IN_RECOVERY(tp->t_flags)) {
15806 if (SEQ_LT(high_seq, tp->snd_recover) &&
15807 (SEQ_LT(high_seq, tp->snd_max))) {
15808 tcp_rack_partialack(tp);
15809 } else {
15810 rack_post_recovery(tp, high_seq);
15811 post_recovery = 1;
15812 }
15813 } else if ((rack->rto_from_rec == 1) &&
15814 SEQ_GEQ(high_seq, tp->snd_recover)) {
15815 /*
15816 * We were in recovery, hit a rxt timeout
15817 * and never re-entered recovery. The timeout(s)
15818 * made up all the lost data. In such a case
15819 * we need to clear the rto_from_rec flag.
15820 */
15821 rack->rto_from_rec = 0;
15822 }
15823 /* Handle the rack-log-ack part (sendmap) */
15824 if ((sbused(&so->so_snd) == 0) &&
15825 (acked > acked_amount) &&
15826 (tp->t_state >= TCPS_FIN_WAIT_1) &&
15827 (tp->t_flags & TF_SENTFIN)) {
15828 /*
15829 * We must be sure our fin
15830 * was sent and acked (we can be
15831 * in FIN_WAIT_1 without having
15832 * sent the fin).
15833 */
15834 ourfinisacked = 1;
15835 /*
15836 * Lets make sure snd_una is updated
15837 * since most likely acked_amount = 0 (it
15838 * should be).
15839 */
15840 tp->snd_una = high_seq;
15841 }
15842 /* Did we make a RTO error? */
15843 if ((tp->t_flags & TF_PREVVALID) &&
15844 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) {
15845 tp->t_flags &= ~TF_PREVVALID;
15846 if (tp->t_rxtshift == 1 &&
15847 (int)(ticks - tp->t_badrxtwin) < 0)
15848 rack_cong_signal(tp, CC_RTO_ERR, high_seq, __LINE__);
15849 }
15850 /* Handle the data in the socket buffer */
15851 KMOD_TCPSTAT_ADD(tcps_rcvackpack, 1);
15852 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked);
15853 if (acked_amount > 0) {
15854 uint32_t p_cwnd;
15855 struct mbuf *mfree;
15856
15857 if (post_recovery) {
15858 /*
15859 * Grab the segsiz, multiply by 2 and add the snd_cwnd
15860 * that is the max the CC should add if we are exiting
15861 * recovery and doing a late add.
15862 */
15863 p_cwnd = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
15864 p_cwnd <<= 1;
15865 p_cwnd += tp->snd_cwnd;
15866 }
15867 rack_ack_received(tp, rack, high_seq, nsegs, CC_ACK, post_recovery);
15868 if (post_recovery && (tp->snd_cwnd > p_cwnd)) {
15869 /* Must be non-newreno (cubic) getting too ahead of itself */
15870 tp->snd_cwnd = p_cwnd;
15871 }
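/*
 * Worked example of the post-recovery cap above (hypothetical
 * numbers): with a 1448 byte segment and rc_pace_min_segs >= 1448,
 * p_cwnd becomes snd_cwnd + 2 * 1448, so a CC module (e.g. cubic)
 * that grows cwnd past that on this ack is pulled back to the cap.
 */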
15872 SOCK_SENDBUF_LOCK(so);
15873 mfree = sbcut_locked(&so->so_snd, acked_amount);
15874 tp->snd_una = high_seq;
15875 /* Note we want to hold the sb lock through the sendmap adjust */
15876 rack_adjust_sendmap_head(rack, &so->so_snd);
15877 /* Wake up the socket if we have room to write more */
15878 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2);
15879 sowwakeup_locked(so);
15880 m_freem(mfree);
15881 }
15882 /* update progress */
15883 tp->t_acktime = ticks;
15884 rack_log_progress_event(rack, tp, tp->t_acktime,
15885 PROGRESS_UPDATE, __LINE__);
15886 /* Clear out shifts and such */
15887 tp->t_rxtshift = 0;
15888 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
15889 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop);
15890 rack->rc_tlp_in_progress = 0;
15891 rack->r_ctl.rc_tlp_cnt_out = 0;
15892 /* Send recover and snd_nxt must be dragged along */
15893 if (SEQ_GT(tp->snd_una, tp->snd_recover))
15894 tp->snd_recover = tp->snd_una;
15895 if (SEQ_LT(tp->snd_nxt, tp->snd_max))
15896 tp->snd_nxt = tp->snd_max;
15897 /*
15898 * If the RXT timer is running we want to
15899 * stop it, so we can restart a TLP (or new RXT).
15900 */
15901 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT)
15902 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
15903 tp->snd_wl2 = high_seq;
15904 tp->t_dupacks = 0;
15905 if (under_pacing &&
15906 (rack->use_fixed_rate == 0) &&
15907 (rack->in_probe_rtt == 0) &&
15908 rack->rc_gp_dyn_mul &&
15909 rack->rc_always_pace) {
15910 /* Check if we are dragging bottom */
15911 rack_check_bottom_drag(tp, rack, so);
15912 }
15913 if (tp->snd_una == tp->snd_max) {
15914 tp->t_flags &= ~TF_PREVVALID;
15915 rack->r_ctl.retran_during_recovery = 0;
15916 rack->rc_suspicious = 0;
15917 rack->r_ctl.dsack_byte_cnt = 0;
15918 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL);
15919 if (rack->r_ctl.rc_went_idle_time == 0)
15920 rack->r_ctl.rc_went_idle_time = 1;
15921 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__);
15922 if (sbavail(&tptosocket(tp)->so_snd) == 0)
15923 tp->t_acktime = 0;
15924 /* Set so we might enter persists... */
15925 rack->r_wanted_output = 1;
15926 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
15927 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
15928 if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
15929 (sbavail(&so->so_snd) == 0) &&
15930 (tp->t_flags2 & TF2_DROP_AF_DATA)) {
15931 /*
15932 * The socket was gone and the
15933 * peer sent data (not now, in the past); time to
15934 * reset the peer.
15935 */
15936 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
15937 /* tcp_close will kill the inp pre-log the Reset */
15938 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
15939 #ifdef TCP_ACCOUNTING
15940 rdstc = get_cyclecount();
15941 if (rdstc > ts_val) {
15942 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
15943 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
15944 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
15945 }
15946 }
15947 #endif
15948 m_freem(m);
15949 tp = tcp_close(tp);
15950 if (tp == NULL) {
15951 #ifdef TCP_ACCOUNTING
15952 sched_unpin();
15953 #endif
15954 return (1);
15955 }
15956 /*
15957 * We would normally do drop-with-reset which would
15958 * send back a reset. We can't since we don't have
15959 * all the needed bits. Instead lets arrange for
15960 * a call to tcp_output(). That way since we
15961 * are in the closed state we will generate a reset.
15962 *
15963 * Note if tcp_accounting is on we don't unpin since
15964 * we do that after the goto label.
15965 */
15966 goto send_out_a_rst;
15967 }
15968 if ((sbused(&so->so_snd) == 0) &&
15969 (tp->t_state >= TCPS_FIN_WAIT_1) &&
15970 (tp->t_flags & TF_SENTFIN)) {
15971 /*
15972 * If we can't receive any more data, then closing user can
15973 * proceed. Starting the timer is contrary to the
15974 * specification, but if we don't get a FIN we'll hang
15975 * forever.
15976 *
15977 */
15978 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
15979 soisdisconnected(so);
15980 tcp_timer_activate(tp, TT_2MSL,
15981 (tcp_fast_finwait2_recycle ?
15982 tcp_finwait2_timeout :
15983 TP_MAXIDLE(tp)));
15984 }
15985 if (ourfinisacked == 0) {
15986 /*
15987 * We don't change to fin-wait-2 if we have our fin acked
15988 * which means we are probably in TCPS_CLOSING.
15989 */
15990 tcp_state_change(tp, TCPS_FIN_WAIT_2);
15991 }
15992 }
15993 }
15994 /* Wake up the socket if we have room to write more */
15995 if (sbavail(&so->so_snd)) {
15996 rack->r_wanted_output = 1;
15997 if (ctf_progress_timeout_check(tp, true)) {
15998 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr,
15999 tp, tick, PROGRESS_DROP, __LINE__);
16000 /*
16001 * We cheat here and don't send a RST, we should send one
16002 * when the pacer drops the connection.
16003 */
16004 #ifdef TCP_ACCOUNTING
16005 rdstc = get_cyclecount();
16006 if (rdstc > ts_val) {
16007 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16008 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
16009 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
16010 }
16011 }
16012 sched_unpin();
16013 #endif
16014 (void)tcp_drop(tp, ETIMEDOUT);
16015 m_freem(m);
16016 return (1);
16017 }
16018 }
16019 if (ourfinisacked) {
16020 switch(tp->t_state) {
16021 case TCPS_CLOSING:
16022 #ifdef TCP_ACCOUNTING
16023 rdstc = get_cyclecount();
16024 if (rdstc > ts_val) {
16025 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16026 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
16027 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
16028 }
16029 }
16030 sched_unpin();
16031 #endif
16032 tcp_twstart(tp);
16033 m_freem(m);
16034 return (1);
16035 break;
16036 case TCPS_LAST_ACK:
16037 #ifdef TCP_ACCOUNTING
16038 rdstc = get_cyclecount();
16039 if (rdstc > ts_val) {
16040 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16041 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
16042 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
16043 }
16044 }
16045 sched_unpin();
16046 #endif
16047 tp = tcp_close(tp);
16048 ctf_do_drop(m, tp);
16049 return (1);
16050 break;
16051 case TCPS_FIN_WAIT_1:
16052 #ifdef TCP_ACCOUNTING
16053 rdstc = get_cyclecount();
16054 if (rdstc > ts_val) {
16055 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16056 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
16057 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
16058 }
16059 }
16060 #endif
16061 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
16062 soisdisconnected(so);
16063 tcp_timer_activate(tp, TT_2MSL,
16064 (tcp_fast_finwait2_recycle ?
16065 tcp_finwait2_timeout :
16066 TP_MAXIDLE(tp)));
16067 }
16068 tcp_state_change(tp, TCPS_FIN_WAIT_2);
16069 break;
16070 default:
16071 break;
16072 }
16073 }
16074 if (rack->r_fast_output) {
16075 /*
16076 * We are doing fast output... can we expand that?
16077 */
16078 rack_gain_for_fastoutput(rack, tp, so, acked_amount);
16079 }
16080 #ifdef TCP_ACCOUNTING
16081 rdstc = get_cyclecount();
16082 if (rdstc > ts_val) {
16083 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16084 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val);
16085 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val);
16086 }
16087 }
16088
16089 } else if (win_up_req) {
16090 rdstc = get_cyclecount();
16091 if (rdstc > ts_val) {
16092 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16093 tp->tcp_proc_time[ACK_RWND] += (rdstc - ts_val);
16094 }
16095 }
16096 #endif
16097 }
16098 /* Now is there a next packet, if so we are done */
16099 m_freem(m);
16100 did_out = 0;
16101 if (nxt_pkt) {
16102 #ifdef TCP_ACCOUNTING
16103 sched_unpin();
16104 #endif
16105 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 5, nsegs);
16106 return (0);
16107 }
16108 rack_handle_might_revert(tp, rack);
16109 ctf_calc_rwin(so, tp);
16110 if ((rack->r_wanted_output != 0) ||
16111 (rack->r_fast_output != 0) ||
16112 (tp->t_flags & TF_ACKNOW )) {
16113 send_out_a_rst:
16114 if (tcp_output(tp) < 0) {
16115 #ifdef TCP_ACCOUNTING
16116 sched_unpin();
16117 #endif
16118 return (1);
16119 }
16120 did_out = 1;
16121 }
16122 if (tp->t_flags2 & TF2_HPTS_CALLS)
16123 tp->t_flags2 &= ~TF2_HPTS_CALLS;
16124 rack_free_trim(rack);
16125 #ifdef TCP_ACCOUNTING
16126 sched_unpin();
16127 #endif
16128 rack_timer_audit(tp, rack, &so->so_snd);
16129 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 6, nsegs);
16130 return (0);
16131 }
16132
16133 #define TCP_LRO_TS_OPTION \
16134 ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \
16135 (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)
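/*
 * Informational note: with TCPOPT_NOP == 1, TCPOPT_TIMESTAMP == 8 and
 * TCPOLEN_TIMESTAMP == 10, this macro matches the canonical 4-byte
 * option prefix 0x01 0x01 0x08 0x0a (NOP, NOP, TS, len) when the raw
 * option bytes are read as a uint32_t, independent of host byte order
 * thanks to the ntohl().
 */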
16136
16137 static int
16138 rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
16139 int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, int32_t nxt_pkt,
16140 struct timeval *tv)
16141 {
16142 struct inpcb *inp = tptoinpcb(tp);
16143 struct socket *so = tptosocket(tp);
16144 #ifdef TCP_ACCOUNTING
16145 uint64_t ts_val;
16146 #endif
16147 int32_t thflags, retval, did_out = 0;
16148 int32_t way_out = 0;
16149 /*
16150 * cts - is the current time from tv (caller gets ts) in microseconds.
16151 * ms_cts - is the current time from tv in milliseconds.
16152 * us_cts - is the time that LRO or hardware actually got the packet in microseconds.
16153 */
16154 uint32_t cts, us_cts, ms_cts;
16155 uint32_t tiwin;
16156 struct timespec ts;
16157 struct tcpopt to;
16158 struct tcp_rack *rack;
16159 struct rack_sendmap *rsm;
16160 int32_t prev_state = 0;
16161 int no_output = 0;
16162 int time_remaining = 0;
16163 #ifdef TCP_ACCOUNTING
16164 int ack_val_set = 0xf;
16165 #endif
16166 int nsegs;
16167
16168 NET_EPOCH_ASSERT();
16169 INP_WLOCK_ASSERT(inp);
16170
16171 /*
16172 * tv passed from common code is from either M_TSTMP_LRO or
16173 * tcp_get_usecs() if no LRO m_pkthdr timestamp is present.
16174 */
16175 rack = (struct tcp_rack *)tp->t_fb_ptr;
16176 if (rack->rack_deferred_inited == 0) {
16177 /*
16178 * If we are the connecting socket we will
16179 * hit rack_init() when no sequence numbers
16180 * are setup. This makes it so we must defer
16181 * some initialization. Call that now.
16182 */
16183 rack_deferred_init(tp, rack);
16184 }
16185 /*
16186 * Check to see if we need to skip any output plans. This
16187 * can happen in the non-LRO path where we are pacing and
16188 * must process the ack coming in but need to defer sending
16189 * anything because a pacing timer is running.
16190 */
16191 us_cts = tcp_tv_to_usec(tv);
16192 if (m->m_flags & M_ACKCMP) {
16193 /*
16194 * All compressed ack's are ack's by definition so
16195 * remove any ack required flag and then do the processing.
16196 */
16197 rack->rc_ack_required = 0;
16198 return (rack_do_compressed_ack_processing(tp, so, m, nxt_pkt, tv));
16199 }
16200 thflags = tcp_get_flags(th);
16201 if ((rack->rc_always_pace == 1) &&
16202 (rack->rc_ack_can_sendout_data == 0) &&
16203 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
16204 (TSTMP_LT(us_cts, rack->r_ctl.rc_last_output_to))) {
16205 /*
16206 * Ok, conditions are right for queuing the packets,
16207 * but we do have to check the flags in the inp; it
16208 * could be that, if a SACK is present, we want to be awoken
16209 * and so should process the packets.
16210 */
16211 time_remaining = rack->r_ctl.rc_last_output_to - us_cts;
16212 if (rack->rc_tp->t_flags2 & TF2_DONT_SACK_QUEUE) {
16213 no_output = 1;
16214 } else {
16215 /*
16216 * If there are no options, or just a
16217 * timestamp option, we will want to queue
16218 * the packets. This is the same thing LRO does
16219 * and will need to change with accurate ECN.
16220 */
16221 uint32_t *ts_ptr;
16222 int optlen;
16223
16224 optlen = (th->th_off << 2) - sizeof(struct tcphdr);
16225 ts_ptr = (uint32_t *)(th + 1);
16226 if ((optlen == 0) ||
16227 ((optlen == TCPOLEN_TSTAMP_APPA) &&
16228 (*ts_ptr == TCP_LRO_TS_OPTION)))
16229 no_output = 1;
16230 }
16231 if ((no_output == 1) && (time_remaining < tcp_min_hptsi_time)) {
16232 /*
16233 * It is unrealistic to think we can pace in less than
16234 * the minimum granularity of the pacer (def:250usec). So
16235 * if we have less than that time remaining we should go
16236 * ahead and allow output to be "early". We will attempt to
16237 * make up for it in any pacing time we try to apply on
16238 * the outbound packet.
16239 */
16240 no_output = 0;
16241 }
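/*
 * Illustrative example (hypothetical numbers): if the pacing timer
 * has only ~100 usec left and the pacer's minimum granularity is
 * 250 usec (the default noted above), we clear no_output and let
 * this input trigger output slightly early, folding the difference
 * into the next pacing calculation.
 */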
16242 }
16243 /*
16244 * If there is a RST or FIN lets dump out the bw
16245 * with a FIN the connection may go on but we
16246 * may not.
16247 */
16248 if ((thflags & TH_FIN) || (thflags & TH_RST))
16249 rack_log_pacing_delay_calc(rack,
16250 rack->r_ctl.gp_bw,
16251 0,
16252 0,
16253 rack_get_gp_est(rack), /* delRate */
16254 rack_get_lt_bw(rack), /* rttProp */
16255 20, __LINE__, NULL, 0);
16256 if (m->m_flags & M_ACKCMP) {
16257 panic("Impossible reach m has ackcmp? m:%p tp:%p", m, tp);
16258 }
16259 cts = tcp_tv_to_usec(tv);
16260 ms_cts = tcp_tv_to_msec(tv);
16261 nsegs = m->m_pkthdr.lro_nsegs;
16262 counter_u64_add(rack_proc_non_comp_ack, 1);
16263 #ifdef TCP_ACCOUNTING
16264 sched_pin();
16265 if (thflags & TH_ACK)
16266 ts_val = get_cyclecount();
16267 #endif
16268 if ((m->m_flags & M_TSTMP) ||
16269 (m->m_flags & M_TSTMP_LRO)) {
16270 mbuf_tstmp2timespec(m, &ts);
16271 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
16272 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
16273 } else
16274 rack->r_ctl.act_rcv_time = *tv;
16275 kern_prefetch(rack, &prev_state);
16276 prev_state = 0;
16277 /*
16278 * Unscale the window into a 32-bit value. For the SYN_SENT state
16279 * the scale is zero.
16280 */
16281 tiwin = th->th_win << tp->snd_scale;
16282 #ifdef TCP_ACCOUNTING
16283 if (thflags & TH_ACK) {
16284 /*
16285 * We have a tradeoff here. We can either do what we are
16286 * doing i.e. pinning to this CPU and then doing the accounting
16287 * <or> we could do a critical enter, setup the rdtsc and cpu
16288 * as in below, and then validate we are on the same CPU on
16289 * exit. I have chosen to not do the critical enter since
16290 * that often will gain you a context switch, and instead lock
16291 * us (line above this if) to the same CPU with sched_pin(). This
16292 * means we may be context switched out for a higher priority
16293 * interrupt but we won't be moved to another CPU.
16294 *
16295 * If this occurs (which it won't very often since we most likely
16296 * are running this code in interrupt context and only a higher
16297 * priority will bump us ... clock?) we will falsely add the
16298 * interrupt processing time in with the ack processing
16299 * time. This is ok since it's a rare event.
16300 */
16301 ack_val_set = tcp_do_ack_accounting(tp, th, &to, tiwin,
16302 ctf_fixed_maxseg(tp));
16303 }
16304 #endif
16305 /*
16306 * Parse options on any incoming segment.
16307 */
16308 memset(&to, 0, sizeof(to));
16309 tcp_dooptions(&to, (u_char *)(th + 1),
16310 (th->th_off << 2) - sizeof(struct tcphdr),
16311 (thflags & TH_SYN) ? TO_SYN : 0);
16312 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
16313 __func__));
16314 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
16315 __func__));
16316 if (tp->t_flags2 & TF2_PROC_SACK_PROHIBIT) {
16317 /*
16318 * We don't look at SACKs from the
16319 * peer because the MSS is too small, which
16320 * can subject us to an attack.
16321 */
16322 to.to_flags &= ~TOF_SACK;
16323 }
16324 if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
16325 (tp->t_flags & TF_GPUTINPROG)) {
16326 /*
16327 * We have a goodput in progress
16328 * and we have entered a late state.
16329 * Do we have enough data in the sb
16330 * to handle the GPUT request?
16331 */
16332 uint32_t bytes;
16333
16334 bytes = tp->gput_ack - tp->gput_seq;
16335 if (SEQ_GT(tp->gput_seq, tp->snd_una))
16336 bytes += tp->gput_seq - tp->snd_una;
16337 if (bytes > sbavail(&tptosocket(tp)->so_snd)) {
16338 /*
16339 * There are not enough bytes in the socket
16340 * buffer that have been sent to cover this
16341 * measurement. Cancel it.
16342 */
16343 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
16344 rack->r_ctl.rc_gp_srtt /*flex1*/,
16345 tp->gput_seq,
16346 0, 0, 18, __LINE__, NULL, 0);
16347 tp->t_flags &= ~TF_GPUTINPROG;
16348 }
16349 }
16350 if (tcp_bblogging_on(rack->rc_tp)) {
16351 union tcp_log_stackspecific log;
16352 struct timeval ltv;
16353 #ifdef TCP_REQUEST_TRK
16354 struct tcp_sendfile_track *tcp_req;
16355
16356 if (SEQ_GT(th->th_ack, tp->snd_una)) {
16357 tcp_req = tcp_req_find_req_for_seq(tp, (th->th_ack-1));
16358 } else {
16359 tcp_req = tcp_req_find_req_for_seq(tp, th->th_ack);
16360 }
16361 #endif
16362 memset(&log, 0, sizeof(log));
16363 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
16364 if (rack->rack_no_prr == 0)
16365 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
16366 else
16367 log.u_bbr.flex1 = 0;
16368 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
16369 log.u_bbr.use_lt_bw <<= 1;
16370 log.u_bbr.use_lt_bw |= rack->r_might_revert;
16371 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced;
16372 log.u_bbr.bbr_state = rack->rc_free_cnt;
16373 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
16374 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg;
16375 log.u_bbr.flex3 = m->m_flags;
16376 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
16377 log.u_bbr.lost = thflags;
16378 log.u_bbr.pacing_gain = 0x1;
16379 #ifdef TCP_ACCOUNTING
16380 log.u_bbr.cwnd_gain = ack_val_set;
16381 #endif
16382 log.u_bbr.flex7 = 2;
16383 if (m->m_flags & M_TSTMP) {
16384 /* Record the hardware timestamp if present */
16385 mbuf_tstmp2timespec(m, &ts);
16386 ltv.tv_sec = ts.tv_sec;
16387 ltv.tv_usec = ts.tv_nsec / 1000;
16388 log.u_bbr.lt_epoch = tcp_tv_to_usec(<v);
16389 } else if (m->m_flags & M_TSTMP_LRO) {
16390 /* Record the LRO arrival timestamp */
16391 mbuf_tstmp2timespec(m, &ts);
16392 ltv.tv_sec = ts.tv_sec;
16393 ltv.tv_usec = ts.tv_nsec / 1000;
16394 log.u_bbr.flex5 = tcp_tv_to_usec(<v);
16395 }
16396 log.u_bbr.timeStamp = tcp_get_usecs(<v);
16397 /* Log the rcv time */
16398 log.u_bbr.delRate = m->m_pkthdr.rcv_tstmp;
16399 #ifdef TCP_REQUEST_TRK
16400 log.u_bbr.applimited = tp->t_tcpreq_closed;
16401 log.u_bbr.applimited <<= 8;
16402 log.u_bbr.applimited |= tp->t_tcpreq_open;
16403 log.u_bbr.applimited <<= 8;
16404 log.u_bbr.applimited |= tp->t_tcpreq_req;
16405 if (tcp_req) {
16406 /* Copy out any client req info */
16407 /* seconds */
16408 log.u_bbr.pkt_epoch = (tcp_req->localtime / HPTS_USEC_IN_SEC);
16409 /* useconds */
16410 log.u_bbr.delivered = (tcp_req->localtime % HPTS_USEC_IN_SEC);
16411 log.u_bbr.rttProp = tcp_req->timestamp;
16412 log.u_bbr.cur_del_rate = tcp_req->start;
16413 if (tcp_req->flags & TCP_TRK_TRACK_FLG_OPEN) {
16414 log.u_bbr.flex8 |= 1;
16415 } else {
16416 log.u_bbr.flex8 |= 2;
16417 log.u_bbr.bw_inuse = tcp_req->end;
16418 }
16419 log.u_bbr.flex6 = tcp_req->start_seq;
16420 if (tcp_req->flags & TCP_TRK_TRACK_FLG_COMP) {
16421 log.u_bbr.flex8 |= 4;
16422 log.u_bbr.epoch = tcp_req->end_seq;
16423 }
16424 }
16425 #endif
16426 TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
16427 tlen, &log, true, <v);
16428 }
16429 /* Remove ack required flag if set, we have one */
16430 if (thflags & TH_ACK)
16431 rack->rc_ack_required = 0;
16432 rack_log_type_bbrsnd(rack, 0, 0, cts, tv, __LINE__);
16433 if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
16434 way_out = 4;
16435 retval = 0;
16436 m_freem(m);
16437 goto done_with_input;
16438 }
16439 /*
16440 * If a segment with the ACK-bit set arrives in the SYN-SENT state
16441 * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9.
16442 */
16443 if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
16444 (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
16445 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
16446 ctf_do_dropwithreset(m, tp, th, tlen);
16447 #ifdef TCP_ACCOUNTING
16448 sched_unpin();
16449 #endif
16450 return (1);
16451 }
16452 /*
16453 * If timestamps were negotiated during SYN/ACK and a
16454 * segment without a timestamp is received, silently drop
16455 * the segment, unless it is a RST segment or missing timestamps are
16456 * tolerated.
16457 * See section 3.2 of RFC 7323.
16458 */
16459 if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS) &&
16460 ((thflags & TH_RST) == 0) && (V_tcp_tolerate_missing_ts == 0)) {
16461 way_out = 5;
16462 retval = 0;
16463 m_freem(m);
16464 goto done_with_input;
16465 }
16466 /*
16467 * Segment received on connection. Reset idle time and keep-alive
16468 * timer. XXX: This should be done after segment validation to
16469 * ignore broken/spoofed segs.
16470 */
16471 if (tp->t_idle_reduce &&
16472 (tp->snd_max == tp->snd_una) &&
16473 (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur)) {
16474 counter_u64_add(rack_input_idle_reduces, 1);
16475 rack_cc_after_idle(rack, tp);
16476 }
16477 tp->t_rcvtime = ticks;
16478 #ifdef STATS
16479 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
16480 #endif
16481 if (tiwin > rack->r_ctl.rc_high_rwnd)
16482 rack->r_ctl.rc_high_rwnd = tiwin;
16483 /*
16484 * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move
16485 * this to occur after we've validated the segment.
16486 */
16487 if (tcp_ecn_input_segment(tp, thflags, tlen,
16488 tcp_packets_this_ack(tp, th->th_ack),
16489 iptos))
16490 rack_cong_signal(tp, CC_ECN, th->th_ack, __LINE__);
16491
16492 /*
16493 * If echoed timestamp is later than the current time, fall back to
16494 * non RFC1323 RTT calculation. Normalize timestamp if syncookies
16495 * were used when this connection was established.
16496 */
16497 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
16498 to.to_tsecr -= tp->ts_offset;
16499 if (TSTMP_GT(to.to_tsecr, ms_cts))
16500 to.to_tsecr = 0;
16501 }
16502 if ((rack->r_rcvpath_rtt_up == 1) &&
16503 (to.to_flags & TOF_TS) &&
16504 (TSTMP_GEQ(to.to_tsecr, rack->r_ctl.last_rcv_tstmp_for_rtt))) {
16505 uint32_t rtt = 0;
16506
16507 /*
16508 * We are receiving only and thus not sending
16509 * data to do an RTT. We set a flag when we first
16510 * sent this TS to the peer. We now have it back
16511 * and have an RTT to share. We log it as a conf
16512 * 4, since we are not so sure about it... we
16513 * may have lost an ack.
16514 */
16515 if (TSTMP_GT(cts, rack->r_ctl.last_time_of_arm_rcv))
16516 rtt = (cts - rack->r_ctl.last_time_of_arm_rcv);
16517 rack->r_rcvpath_rtt_up = 0;
16518 /* Submit and commit the timer */
16519 if (rtt > 0) {
16520 tcp_rack_xmit_timer(rack, rtt, 0, rtt, 4, NULL, 1);
16521 tcp_rack_xmit_timer_commit(rack, tp);
16522 }
16523 }
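/*
 * Illustrative example (hypothetical numbers): if the timestamp we
 * armed for the receive path was echoed back 30000 usec after
 * last_time_of_arm_rcv, a 30 ms RTT sample is submitted above with
 * confidence 4 and then committed.
 */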
16524 /*
16525 * If its the first time in we need to take care of options and
16526 * verify we can do SACK for rack!
16527 */
16528 if (rack->r_state == 0) {
16529 /* Should be init'd by rack_init() */
16530 KASSERT(rack->rc_inp != NULL,
16531 ("%s: rack->rc_inp unexpectedly NULL", __func__));
16532 if (rack->rc_inp == NULL) {
16533 rack->rc_inp = inp;
16534 }
16535
16536 /*
16537 * Process options only when we get SYN/ACK back. The SYN
16538 * case for incoming connections is handled in tcp_syncache.
16539 * According to RFC1323 the window field in a SYN (i.e., a
16540 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX
16541 * this is traditional behavior, may need to be cleaned up.
16542 */
16543 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
16544 /* Handle parallel SYN for ECN */
16545 tcp_ecn_input_parallel_syn(tp, thflags, iptos);
16546 if ((to.to_flags & TOF_SCALE) &&
16547 (tp->t_flags & TF_REQ_SCALE)) {
16548 tp->t_flags |= TF_RCVD_SCALE;
16549 tp->snd_scale = to.to_wscale;
16550 } else
16551 tp->t_flags &= ~TF_REQ_SCALE;
16552 /*
16553 * Initial send window. It will be updated with the
16554 * next incoming segment to the scaled value.
16555 */
16556 tp->snd_wnd = th->th_win;
16557 rack_validate_fo_sendwin_up(tp, rack);
16558 if ((to.to_flags & TOF_TS) &&
16559 (tp->t_flags & TF_REQ_TSTMP)) {
16560 tp->t_flags |= TF_RCVD_TSTMP;
16561 tp->ts_recent = to.to_tsval;
16562 tp->ts_recent_age = cts;
16563 } else
16564 tp->t_flags &= ~TF_REQ_TSTMP;
16565 if (to.to_flags & TOF_MSS) {
16566 tcp_mss(tp, to.to_mss);
16567 }
16568 if ((tp->t_flags & TF_SACK_PERMIT) &&
16569 (to.to_flags & TOF_SACKPERM) == 0)
16570 tp->t_flags &= ~TF_SACK_PERMIT;
16571 if (tp->t_flags & TF_FASTOPEN) {
16572 if (to.to_flags & TOF_FASTOPEN) {
16573 uint16_t mss;
16574
16575 if (to.to_flags & TOF_MSS)
16576 mss = to.to_mss;
16577 else
16578 if ((inp->inp_vflag & INP_IPV6) != 0)
16579 mss = TCP6_MSS;
16580 else
16581 mss = TCP_MSS;
16582 tcp_fastopen_update_cache(tp, mss,
16583 to.to_tfo_len, to.to_tfo_cookie);
16584 } else
16585 tcp_fastopen_disable_path(tp);
16586 }
16587 }
16588 /*
16589 * At this point we are at the initial call. Here we decide
16590 * if we are doing RACK or not. We do this by seeing if
16591 * TF_SACK_PERMIT is set and the sack-not-required is clear.
16592 * The code now does do dup-ack counting so if you don't
16593 * switch back you won't get rack & TLP, but you will still
16594 * get this stack.
16595 */
16596
16597 if ((rack_sack_not_required == 0) &&
16598 ((tp->t_flags & TF_SACK_PERMIT) == 0)) {
16599 tcp_switch_back_to_default(tp);
16600 (*tp->t_fb->tfb_tcp_do_segment)(tp, m, th, drop_hdrlen,
16601 tlen, iptos);
16602 #ifdef TCP_ACCOUNTING
16603 sched_unpin();
16604 #endif
16605 return (1);
16606 }
16607 tcp_set_hpts(tp);
16608 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack);
16609 }
16610 if (thflags & TH_FIN)
16611 tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN);
16612 us_cts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time);
16613 if ((rack->rc_gp_dyn_mul) &&
16614 (rack->use_fixed_rate == 0) &&
16615 (rack->rc_always_pace)) {
16616 /* Check in on probertt */
16617 rack_check_probe_rtt(rack, cts);
16618 }
16619 rack_clear_rate_sample(rack);
16620 if ((rack->forced_ack) &&
16621 ((tcp_get_flags(th) & TH_RST) == 0)) {
16622 rack_handle_probe_response(rack, tiwin, us_cts);
16623 }
16624 /*
16625 * This is the one exception case where we set the rack state
16626 * always. All other times (timers etc) we must have a rack-state
16627 * set (so we assure we have done the checks above for SACK).
16628 */
16629 rack->r_ctl.rc_rcvtime = cts;
16630 if (rack->r_state != tp->t_state)
16631 rack_set_state(tp, rack);
16632 if (SEQ_GT(th->th_ack, tp->snd_una) &&
16633 (rsm = tqhash_min(rack->r_ctl.tqh)) != NULL)
16634 kern_prefetch(rsm, &prev_state);
16635 prev_state = rack->r_state;
16636 if ((thflags & TH_RST) &&
16637 ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
16638 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
16639 (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq))) {
16640 /* The connection will be killed by a reset check the tracepoint */
16641 tcp_trace_point(rack->rc_tp, TCP_TP_RESET_RCV);
16642 }
16643 retval = (*rack->r_substate) (m, th, so,
16644 tp, &to, drop_hdrlen,
16645 tlen, tiwin, thflags, nxt_pkt, iptos);
16646 if (retval == 0) {
16647 /*
16648 * If retval is 1 the tcb is unlocked and most likely the tp
16649 * is gone.
16650 */
16651 INP_WLOCK_ASSERT(inp);
16652 if ((rack->rc_gp_dyn_mul) &&
16653 (rack->rc_always_pace) &&
16654 (rack->use_fixed_rate == 0) &&
16655 rack->in_probe_rtt &&
16656 (rack->r_ctl.rc_time_probertt_starts == 0)) {
16657 /*
16658 * If we are going for target, lets recheck before
16659 * we output.
16660 */
16661 rack_check_probe_rtt(rack, cts);
16662 }
16663 if (rack->set_pacing_done_a_iw == 0) {
16664 /* How much has been acked? */
16665 if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) {
16666 /* We have enough to set in the pacing segment size */
16667 rack->set_pacing_done_a_iw = 1;
16668 rack_set_pace_segments(tp, rack, __LINE__, NULL);
16669 }
16670 }
16671 tcp_rack_xmit_timer_commit(rack, tp);
16672 #ifdef TCP_ACCOUNTING
16673 /*
16674 * If we set the ack_val_set to what ack processing we are doing,
16675 * we also want to track how many cycles we burned. Note
16676 * the bits after tcp_output we let be "free". This is because
16677 * we are also tracking the tcp_output times as well. Note the
16678 * use of 0xf here since we only have 11 counters (0 - 0xa) and
16679 * 0xf cannot be returned and is what we initialize it to, to
16680 * indicate we are not doing the tabulations.
16681 */
16682 if (ack_val_set != 0xf) {
16683 uint64_t crtsc;
16684
16685 crtsc = get_cyclecount();
16686 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
16687 tp->tcp_proc_time[ack_val_set] += (crtsc - ts_val);
16688 }
16689 }
16690 #endif
16691 if ((nxt_pkt == 0) && (no_output == 0)) {
16692 if ((rack->r_wanted_output != 0) ||
16693 (tp->t_flags & TF_ACKNOW) ||
16694 (rack->r_fast_output != 0)) {
16695
16696 do_output_now:
16697 if (tcp_output(tp) < 0) {
16698 #ifdef TCP_ACCOUNTING
16699 sched_unpin();
16700 #endif
16701 return (1);
16702 }
16703 did_out = 1;
16704 }
16705 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
16706 rack_free_trim(rack);
16707 } else if ((nxt_pkt == 0) && (tp->t_flags & TF_ACKNOW)) {
16708 goto do_output_now;
16709 } else if ((no_output == 1) &&
16710 (nxt_pkt == 0) &&
16711 (tcp_in_hpts(rack->rc_tp) == 0)) {
16712 /*
16713 * We are not in hpts and we had a pacing timer up. Use
16714 * the remaining time (time_remaining) to restart the timer.
16715 */
16716 KASSERT ((time_remaining != 0), ("slot remaining is zero for rack:%p tp:%p", rack, tp));
16717 rack_start_hpts_timer(rack, tp, cts, time_remaining, 0, 0);
16718 rack_free_trim(rack);
16719 }
16720 /* Clear the flag, it may have been cleared by output but we may not have */
16721 if ((nxt_pkt == 0) && (tp->t_flags2 & TF2_HPTS_CALLS))
16722 tp->t_flags2 &= ~TF2_HPTS_CALLS;
16723 /*
16724 * The draft (v3) calls for us to use SEQ_GEQ, but that
16725 * causes issues when we are just going app limited. Let's
16726 * instead use SEQ_GT <or> where it's equal but more data
16727 * is outstanding.
16728 *
16729 * Also make sure we are on the last ack of a series. We
16730 * have to have all the acks processed in queue to know
16731 * if there is something left outstanding.
16732 */
16733 if (SEQ_GEQ(tp->snd_una, rack->r_ctl.roundends) &&
16734 (rack->rc_new_rnd_needed == 0) &&
16735 (nxt_pkt == 0)) {
16736 /*
16737 * We have crossed into a new round with
16738 * the new snd_una.
16739 */
16740 rack_new_round_setup(tp, rack, tp->snd_una);
16741 }
16742 if ((nxt_pkt == 0) &&
16743 ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) &&
16744 (SEQ_GT(tp->snd_max, tp->snd_una) ||
16745 (tp->t_flags & TF_DELACK) ||
16746 ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
16747 (tp->t_state <= TCPS_CLOSING)))) {
16748 /* We could not send (probably in the hpts but stopped the timer earlier)? */
16749 if ((tp->snd_max == tp->snd_una) &&
16750 ((tp->t_flags & TF_DELACK) == 0) &&
16751 (tcp_in_hpts(rack->rc_tp)) &&
16752 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
16753 /* keep alive not needed if we are hptsi output yet */
16754 ;
16755 } else {
16756 int late = 0;
16757 if (tcp_in_hpts(tp)) {
16758 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
16759 us_cts = tcp_get_usecs(NULL);
16760 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) {
16761 rack->r_early = 1;
16762 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts);
16763 } else
16764 late = 1;
16765 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
16766 }
16767 tcp_hpts_remove(tp);
16768 }
16769 if (late && (did_out == 0)) {
16770 /*
16771 * We are late in the sending
16772 * and we did not call the output
16773 * (this probably should not happen).
16774 */
16775 goto do_output_now;
16776 }
16777 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0);
16778 }
16779 way_out = 1;
16780 } else if (nxt_pkt == 0) {
16781 /* Do we have the correct timer running? */
16782 rack_timer_audit(tp, rack, &so->so_snd);
16783 way_out = 2;
16784 }
16785 done_with_input:
16786 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out, max(1, nsegs));
16787 if (did_out)
16788 rack->r_wanted_output = 0;
16789 }
16790
16791 #ifdef TCP_ACCOUNTING
16792 sched_unpin();
16793 #endif
16794 return (retval);
16795 }
16796
16797 static void
16798 rack_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
16799 int32_t drop_hdrlen, int32_t tlen, uint8_t iptos)
16800 {
16801 struct timeval tv;
16802
16803 /* First lets see if we have old packets */
16804 if (!STAILQ_EMPTY(&tp->t_inqueue)) {
16805 if (ctf_do_queued_segments(tp, 1)) {
16806 m_freem(m);
16807 return;
16808 }
16809 }
16810 if (m->m_flags & M_TSTMP_LRO) {
16811 mbuf_tstmp2timeval(m, &tv);
16812 } else {
16813 /* Should not happen; should we KASSERT instead? */
16814 tcp_get_usecs(&tv);
16815 }
16816 if (rack_do_segment_nounlock(tp, m, th, drop_hdrlen, tlen, iptos, 0,
16817 &tv) == 0) {
16818 INP_WUNLOCK(tptoinpcb(tp));
16819 }
16820 }
16821
16822 struct rack_sendmap *
16823 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused)
16824 {
16825 struct rack_sendmap *rsm = NULL;
16826 int32_t idx;
16827 uint32_t srtt = 0, thresh = 0, ts_low = 0;
16828
16829 /* Return the next guy to be re-transmitted */
16830 if (tqhash_empty(rack->r_ctl.tqh)) {
16831 return (NULL);
16832 }
16833 if (tp->t_flags & TF_SENTFIN) {
16834 /* retran the end FIN? */
16835 return (NULL);
16836 }
16837 /* ok lets look at this one */
16838 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
16839 if (rack->r_must_retran && rsm && (rsm->r_flags & RACK_MUST_RXT)) {
16840 return (rsm);
16841 }
16842 if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) {
16843 goto check_it;
16844 }
16845 rsm = rack_find_lowest_rsm(rack);
16846 if (rsm == NULL) {
16847 return (NULL);
16848 }
16849 check_it:
16850 if (((rack->rc_tp->t_flags & TF_SACK_PERMIT) == 0) &&
16851 (rsm->r_dupack >= DUP_ACK_THRESHOLD)) {
16852 /*
16853 * No sack so we automatically do the 3 strikes and
16854 * retransmit (no rack timer would be started).
16855 */
16856 return (rsm);
16857 }
16858 if (rsm->r_flags & RACK_ACKED) {
16859 return (NULL);
16860 }
16861 if (((rsm->r_flags & RACK_SACK_PASSED) == 0) &&
16862 (rsm->r_dupack < DUP_ACK_THRESHOLD)) {
16863 /* Its not yet ready */
16864 return (NULL);
16865 }
16866 srtt = rack_grab_rtt(tp, rack);
16867 idx = rsm->r_rtr_cnt - 1;
16868 ts_low = (uint32_t)rsm->r_tim_lastsent[idx];
16869 thresh = rack_calc_thresh_rack(rack, srtt, tsused, __LINE__, 1);
16870 if ((tsused == ts_low) ||
16871 (TSTMP_LT(tsused, ts_low))) {
16872 /* No time since sending */
16873 return (NULL);
16874 }
16875 if ((tsused - ts_low) < thresh) {
16876 /* It has not been long enough yet */
16877 return (NULL);
16878 }
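/*
 * Illustrative note (hypothetical numbers): if the newest transmit of
 * this rsm was at ts_low and tsused - ts_low is, say, 5000 usec while
 * the RACK threshold works out to 28000 usec (roughly an RTT plus a
 * reordering allowance), the entry is not yet eligible and we return
 * NULL here.
 */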
16879 if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) ||
16880 ((rsm->r_flags & RACK_SACK_PASSED))) {
16881 /*
16882 * We have passed the dup-ack threshold <or>
16883 * a SACK has indicated this is missing.
16884 * Note that if you are a declared attacker
16885 * it is only the dup-ack threshold that
16886 * will cause retransmits.
16887 */
16888 /* log retransmit reason */
16889 rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1);
16890 rack->r_fast_output = 0;
16891 return (rsm);
16892 }
16893 return (NULL);
16894 }
16895
16896 static void
16897 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t pacing_delay,
16898 uint64_t bw_est, uint64_t bw, uint64_t len_time, int method,
16899 int line, struct rack_sendmap *rsm, uint8_t quality)
16900 {
16901 if (tcp_bblogging_on(rack->rc_tp)) {
16902 union tcp_log_stackspecific log;
16903 struct timeval tv;
16904
16905 if (rack_verbose_logging == 0) {
16906 /*
16907 * We are not verbose; screen out all but the
16908 * ones we always want.
16909 */
16910 if ((method != 2) &&
16911 (method != 3) &&
16912 (method != 7) &&
16913 (method != 89) &&
16914 (method != 14) &&
16915 (method != 20)) {
16916 return;
16917 }
16918 }
16919 memset(&log, 0, sizeof(log));
16920 log.u_bbr.flex1 = pacing_delay;
16921 log.u_bbr.flex2 = len;
16922 log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs;
16923 log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs;
16924 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ss;
16925 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_ca;
16926 log.u_bbr.use_lt_bw = rack->rc_ack_can_sendout_data;
16927 log.u_bbr.use_lt_bw <<= 1;
16928 log.u_bbr.use_lt_bw |= rack->r_late;
16929 log.u_bbr.use_lt_bw <<= 1;
16930 log.u_bbr.use_lt_bw |= rack->r_early;
16931 log.u_bbr.use_lt_bw <<= 1;
16932 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set;
16933 log.u_bbr.use_lt_bw <<= 1;
16934 log.u_bbr.use_lt_bw |= rack->rc_gp_filled;
16935 log.u_bbr.use_lt_bw <<= 1;
16936 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt;
16937 log.u_bbr.use_lt_bw <<= 1;
16938 log.u_bbr.use_lt_bw |= rack->in_probe_rtt;
16939 log.u_bbr.use_lt_bw <<= 1;
16940 log.u_bbr.use_lt_bw |= rack->gp_ready;
16941 log.u_bbr.pkt_epoch = line;
16942 log.u_bbr.epoch = rack->r_ctl.rc_agg_delayed;
16943 log.u_bbr.lt_epoch = rack->r_ctl.rc_agg_early;
16944 log.u_bbr.applimited = rack->r_ctl.rack_per_of_gp_rec;
16945 log.u_bbr.bw_inuse = bw_est;
16946 log.u_bbr.delRate = bw;
16947 if (rack->r_ctl.gp_bw == 0)
16948 log.u_bbr.cur_del_rate = 0;
16949 else
16950 log.u_bbr.cur_del_rate = rack_get_bw(rack);
16951 log.u_bbr.rttProp = len_time;
16952 log.u_bbr.pkts_out = rack->r_ctl.rc_rack_min_rtt;
16953 log.u_bbr.lost = rack->r_ctl.rc_probertt_sndmax_atexit;
16954 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm);
16955 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) {
16956 /* We are in slow start */
16957 log.u_bbr.flex7 = 1;
16958 } else {
16959 /* we are on congestion avoidance */
16960 log.u_bbr.flex7 = 0;
16961 }
16962 log.u_bbr.flex8 = method;
16963 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
16964 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
16965 log.u_bbr.cwnd_gain = rack->rc_gp_saw_rec;
16966 log.u_bbr.cwnd_gain <<= 1;
16967 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss;
16968 log.u_bbr.cwnd_gain <<= 1;
16969 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca;
16970 log.u_bbr.cwnd_gain <<= 1;
16971 log.u_bbr.cwnd_gain |= rack->use_fixed_rate;
16972 log.u_bbr.cwnd_gain <<= 1;
16973 log.u_bbr.cwnd_gain |= rack->rc_always_pace;
16974 log.u_bbr.cwnd_gain <<= 1;
16975 log.u_bbr.cwnd_gain |= rack->gp_ready;
16976 log.u_bbr.bbr_substate = quality;
16977 log.u_bbr.bbr_state = rack->dgp_on;
16978 log.u_bbr.bbr_state <<= 1;
16979 log.u_bbr.bbr_state |= rack->rc_pace_to_cwnd;
16980 log.u_bbr.bbr_state <<= 2;
16981 TCP_LOG_EVENTP(rack->rc_tp, NULL,
16982 &rack->rc_inp->inp_socket->so_rcv,
16983 &rack->rc_inp->inp_socket->so_snd,
16984 BBR_LOG_HPTSI_CALC, 0,
16985 0, &log, false, &tv);
16986 }
16987 }
16988
16989 static uint32_t
16990 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss)
16991 {
16992 uint32_t new_tso, user_max, pace_one;
16993
16994 user_max = rack->rc_user_set_max_segs * mss;
16995 if (rack->rc_force_max_seg) {
16996 return (user_max);
16997 }
16998 if (rack->use_fixed_rate &&
16999 ((rack->r_ctl.crte == NULL) ||
17000 (bw != rack->r_ctl.crte->rate))) {
17001 /* Use the user mss since we are not exactly matched */
17002 return (user_max);
17003 }
17004 if (rack_pace_one_seg ||
17005 (rack->r_ctl.rc_user_set_min_segs == 1))
17006 pace_one = 1;
17007 else
17008 pace_one = 0;
17009
17010 new_tso = tcp_get_pacing_burst_size_w_divisor(rack->rc_tp, bw, mss,
17011 pace_one, rack->r_ctl.crte, NULL, rack->r_ctl.pace_len_divisor);
17012 if (new_tso > user_max)
17013 new_tso = user_max;
17014 if (rack->rc_hybrid_mode && rack->r_ctl.client_suggested_maxseg) {
17015 if (((uint32_t)rack->r_ctl.client_suggested_maxseg * mss) > new_tso)
17016 new_tso = (uint32_t)rack->r_ctl.client_suggested_maxseg * mss;
17017 }
17018 if (rack->r_ctl.rc_user_set_min_segs &&
17019 ((rack->r_ctl.rc_user_set_min_segs * mss) > new_tso))
17020 new_tso = rack->r_ctl.rc_user_set_min_segs * mss;
17021 return (new_tso);
17022 }
17023
17024 static uint64_t
17025 rack_arrive_at_discounted_rate(struct tcp_rack *rack, uint64_t window_input, uint32_t *rate_set, uint32_t *gain_b)
17026 {
17027 uint64_t reduced_win;
17028 uint32_t gain;
17029
17030 if (window_input < rc_init_window(rack)) {
17031 /*
17032 * The cwnd is collapsed to
17033 * nearly zero, maybe because of a time-out?
17034 * Lets drop back to the lt-bw.
17035 */
17036 reduced_win = rack_get_lt_bw(rack);
17037 /* Set the flag so the caller knows its a rate and not a reduced window */
17038 *rate_set = 1;
17039 gain = 100;
17040 } else if (IN_RECOVERY(rack->rc_tp->t_flags)) {
17041 /*
17042 * If we are in recovery our cwnd needs to be less for
17043 * our pacing consideration.
17044 */
17045 if (rack->rack_hibeta == 0) {
17046 reduced_win = window_input / 2;
17047 gain = 50;
17048 } else {
17049 reduced_win = window_input * rack->r_ctl.saved_hibeta;
17050 reduced_win /= 100;
17051 gain = rack->r_ctl.saved_hibeta;
17052 }
17053 } else {
17054 /*
17055 * Apply Timely factor to increase/decrease the
17056 * amount we are pacing at.
17057 */
17058 gain = rack_get_output_gain(rack, NULL);
17059 if (gain > rack_gain_p5_ub) {
17060 gain = rack_gain_p5_ub;
17061 }
17062 reduced_win = window_input * gain;
17063 reduced_win /= 100;
17064 }
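/*
 * Worked example (hypothetical numbers): in recovery with rack_hibeta
 * off, a 100000 byte window_input is halved to 50000 with gain 50;
 * outside recovery the Timely output gain (capped at rack_gain_p5_ub)
 * scales the window instead, e.g. a gain of 75 yields 75000.
 */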
17065 if (gain_b != NULL)
17066 *gain_b = gain;
17067 /*
17068 * What is being returned here is a trimmed down
17069 * window value in all cases where rate_set is left
17070 * at 0. In one case we actually return the rate (lt_bw).
17071 * The "reduced_win" is returned as a slimmed down cwnd that
17072 * is then converted by the caller into a rate when rate_set
17073 * is 0.
17074 */
17075 return (reduced_win);
17076 }
17077
17078 static int32_t
17079 pace_to_fill_cwnd(struct tcp_rack *rack, int32_t pacing_delay, uint32_t len, uint32_t segsiz, int *capped, uint64_t *rate_wanted, uint8_t non_paced)
17080 {
17081 uint64_t lentim, fill_bw;
17082
17083 rack->r_via_fill_cw = 0;
17084 if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use)
17085 return (pacing_delay);
17086 if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd)
17087 return (pacing_delay);
17088 if (rack->r_ctl.rc_last_us_rtt == 0)
17089 return (pacing_delay);
17090 if (rack->rc_pace_fill_if_rttin_range &&
17091 (rack->r_ctl.rc_last_us_rtt >=
17092 (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) {
17093 /* The rtt is huge, N * smallest, lets not fill */
17094 return (pacing_delay);
17095 }
17096 if (rack->r_ctl.fillcw_cap && *rate_wanted >= rack->r_ctl.fillcw_cap)
17097 return (pacing_delay);
17098 /*
17099 * First let's calculate the b/w based on the last us-rtt
17100 * and the smallest send window.
17101 */
17102 fill_bw = min(rack->rc_tp->snd_cwnd, rack->r_ctl.cwnd_to_use);
17103 if (rack->rc_fillcw_apply_discount) {
17104 uint32_t rate_set = 0;
17105
17106 fill_bw = rack_arrive_at_discounted_rate(rack, fill_bw, &rate_set, NULL);
17107 if (rate_set) {
17108 goto at_lt_bw;
17109 }
17110 }
17111 /* Take the rwnd if its smaller */
17112 if (fill_bw > rack->rc_tp->snd_wnd)
17113 fill_bw = rack->rc_tp->snd_wnd;
17114 /* Now lets make it into a b/w */
17115 fill_bw *= (uint64_t)HPTS_USEC_IN_SEC;
17116 fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt;
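/*
 * Worked example (hypothetical numbers): with a 100000 byte window
 * and rc_last_us_rtt == 10000 usec, fill_bw becomes
 * 100000 * 1000000 / 10000 == 10,000,000 bytes/sec (~80 Mbps).
 */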
17117 /* Adjust to any cap */
17118 if (rack->r_ctl.fillcw_cap && fill_bw >= rack->r_ctl.fillcw_cap)
17119 fill_bw = rack->r_ctl.fillcw_cap;
17120
17121 at_lt_bw:
17122 if (rack_bw_multipler > 0) {
17123 /*
17124 * We want to limit fill-cw to some multiplier
17125 * of the max(lt_bw, gp_est). The normal default
17126 * is 0 for off, so a sysctl has enabled it.
17127 */
17128 uint64_t lt_bw, gp, rate;
17129
17130 gp = rack_get_gp_est(rack);
17131 lt_bw = rack_get_lt_bw(rack);
17132 if (lt_bw > gp)
17133 rate = lt_bw;
17134 else
17135 rate = gp;
17136 rate *= rack_bw_multipler;
17137 rate /= 100;
17138 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
17139 union tcp_log_stackspecific log;
17140 struct timeval tv;
17141
17142 memset(&log, 0, sizeof(log));
17143 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
17144 log.u_bbr.flex1 = rack_bw_multipler;
17145 log.u_bbr.flex2 = len;
17146 log.u_bbr.cur_del_rate = gp;
17147 log.u_bbr.delRate = lt_bw;
17148 log.u_bbr.bw_inuse = rate;
17149 log.u_bbr.rttProp = fill_bw;
17150 log.u_bbr.flex8 = 44;
17151 tcp_log_event(rack->rc_tp, NULL, NULL, NULL,
17152 BBR_LOG_CWND, 0,
17153 0, &log, false, NULL,
17154 __func__, __LINE__, &tv);
17155 }
17156 if (fill_bw > rate)
17157 fill_bw = rate;
17158 }
17159 /* We are below the min b/w */
17160 if (non_paced)
17161 *rate_wanted = fill_bw;
17162 if ((fill_bw < RACK_MIN_BW) || (fill_bw < *rate_wanted))
17163 return (pacing_delay);
17164 rack->r_via_fill_cw = 1;
17165 if (rack->r_rack_hw_rate_caps &&
17166 (rack->r_ctl.crte != NULL)) {
17167 uint64_t high_rate;
17168
17169 high_rate = tcp_hw_highest_rate(rack->r_ctl.crte);
17170 if (fill_bw > high_rate) {
17171 /* We are capping bw at the highest rate table entry */
17172 if (*rate_wanted > high_rate) {
17173 /* The original rate was also capped */
17174 rack->r_via_fill_cw = 0;
17175 }
17176 rack_log_hdwr_pacing(rack,
17177 fill_bw, high_rate, __LINE__,
17178 0, 3);
17179 fill_bw = high_rate;
17180 if (capped)
17181 *capped = 1;
17182 }
17183 } else if ((rack->r_ctl.crte == NULL) &&
17184 (rack->rack_hdrw_pacing == 0) &&
17185 (rack->rack_hdw_pace_ena) &&
17186 rack->r_rack_hw_rate_caps &&
17187 (rack->rack_attempt_hdwr_pace == 0) &&
17188 (rack->rc_inp->inp_route.ro_nh != NULL) &&
17189 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) {
17190 /*
17191 * Ok we may have a first attempt that is greater than our top rate
17192 * lets check.
17193 */
17194 uint64_t high_rate;
17195
17196 high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp);
17197 if (high_rate) {
17198 if (fill_bw > high_rate) {
17199 fill_bw = high_rate;
17200 if (capped)
17201 *capped = 1;
17202 }
17203 }
17204 }
17205 if (rack->r_ctl.bw_rate_cap && (fill_bw > rack->r_ctl.bw_rate_cap)) {
17206 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max,
17207 fill_bw, 0, 0, HYBRID_LOG_RATE_CAP, 2, NULL, __LINE__);
17208 fill_bw = rack->r_ctl.bw_rate_cap;
17209 }
17210 /*
17211 * Ok fill_bw holds our mythical b/w to fill the cwnd
17212 * in an rtt (unless it was capped), what does that
17213 * equate to time-wise?
17214 */
17215 lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC;
17216 lentim /= fill_bw;
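/*
 * Worked example (hypothetical numbers): pacing len == 64240 bytes at
 * fill_bw == 10,000,000 bytes/sec gives
 * lentim == 64240 * 1000000 / 10000000 == 6424 usec.
 */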
17217 *rate_wanted = fill_bw;
17218 if (non_paced || (lentim < pacing_delay)) {
17219 rack_log_pacing_delay_calc(rack, len, pacing_delay, fill_bw,
17220 0, lentim, 12, __LINE__, NULL, 0);
17221 return ((int32_t)lentim);
17222 } else
17223 return (pacing_delay);
17224 }
17225
17226 static int32_t
17227 rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz, int line)
17228 {
17229 uint64_t srtt;
17230 int32_t pacing_delay = 0;
17231 int can_start_hw_pacing = 1;
17232 int err;
17233 int pace_one;
17234
17235 if (rack_pace_one_seg ||
17236 (rack->r_ctl.rc_user_set_min_segs == 1))
17237 pace_one = 1;
17238 else
17239 pace_one = 0;
17240 if (rack->rc_always_pace == 0) {
17241 /*
17242 * We use the most optimistic possible cwnd/srtt for
17243 * sending calculations. This will make our
17244 * calculation anticipate getting more through
17245 * quicker than possible. But that's ok; we don't want
17246 * the peer to have a gap in data sending.
17247 */
17248 uint64_t cwnd, tr_perms = 0;
17249 int32_t reduce;
17250
17251 old_method:
17252 /*
17253 * We keep no precise pacing with the old method;
17254 * instead we use the pacer to mitigate bursts.
17255 */
17256 if (rack->r_ctl.rc_rack_min_rtt)
17257 srtt = rack->r_ctl.rc_rack_min_rtt;
17258 else
17259 srtt = max(tp->t_srtt, 1);
17260 if (rack->r_ctl.rc_rack_largest_cwnd)
17261 cwnd = rack->r_ctl.rc_rack_largest_cwnd;
17262 else
17263 cwnd = rack->r_ctl.cwnd_to_use;
17264 /* Inflate cwnd by 1000 so srtt of usecs is in ms */
17265 tr_perms = (cwnd * 1000) / srtt;
17266 if (tr_perms == 0) {
17267 tr_perms = ctf_fixed_maxseg(tp);
17268 }
17269 /*
17270 * Calculate how long this will take to drain. If
17271 * the calculation comes out to zero, that's ok; we
17272 * will use send_a_lot to possibly spin around for
17273 * more, increasing tot_len_this_send to the point
17274 * that it is going to require a pace, or we hit the
17275 * cwnd. In that case we are just waiting for
17276 * an ACK.
17277 */
17278 pacing_delay = len / tr_perms;
17279 /* Now do we reduce the time so we don't run dry? */
17280 if (pacing_delay && rack_pacing_delay_reduction) {
17281 reduce = (pacing_delay / rack_pacing_delay_reduction);
17282 if (reduce < pacing_delay) {
17283 pacing_delay -= reduce;
17284 } else
17285 pacing_delay = 0;
17286 } else
17287 reduce = 0;
17288 pacing_delay *= HPTS_USEC_IN_MSEC;
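/*
 * Worked example of the old method (hypothetical numbers): with
 * cwnd = 100000 bytes and srtt = 20000 usec, tr_perms =
 * (100000 * 1000) / 20000 = 5000 bytes per millisecond. For a
 * len of 20000 bytes that gives pacing_delay = 4 ms; if
 * rack_pacing_delay_reduction were 4, we would trim 1 ms off,
 * leaving 3 ms, which the multiply above converts to 3000 usec.
 */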
17289 if (rack->rc_pace_to_cwnd) {
17290 uint64_t rate_wanted = 0;
17291
17292 pacing_delay = pace_to_fill_cwnd(rack, pacing_delay, len, segsiz, NULL, &rate_wanted, 1);
17293 rack->rc_ack_can_sendout_data = 1;
17294 rack_log_pacing_delay_calc(rack, len, pacing_delay, rate_wanted, 0, 0, 14, __LINE__, NULL, 0);
17295 } else
17296 rack_log_pacing_delay_calc(rack, len, pacing_delay, tr_perms, reduce, 0, 7, __LINE__, NULL, 0);
17297 /*******************************************************/
17298 /* RRS: We insert non-paced call to stats here for len */
17299 /*******************************************************/
17300 } else {
17301 uint64_t bw_est, res, lentim, rate_wanted;
17302 uint32_t segs, oh;
17303 int capped = 0;
17304 int prev_fill;
17305
17306 if ((rack->r_rr_config == 1) && rsm) {
17307 return (rack->r_ctl.rc_min_to);
17308 }
17309 if (rack->use_fixed_rate) {
17310 rate_wanted = bw_est = rack_get_fixed_pacing_bw(rack);
17311 } else if ((rack->r_ctl.init_rate == 0) &&
17312 (rack->r_ctl.gp_bw == 0)) {
17313 /* no way to yet do an estimate */
17314 bw_est = rate_wanted = 0;
17315 } else if (rack->dgp_on) {
17316 bw_est = rack_get_bw(rack);
17317 rate_wanted = rack_get_output_bw(rack, bw_est, rsm, &capped);
17318 } else {
17319 uint32_t gain, rate_set = 0;
17320
17321 rate_wanted = min(rack->rc_tp->snd_cwnd, rack->r_ctl.cwnd_to_use);
17322 rate_wanted = rack_arrive_at_discounted_rate(rack, rate_wanted, &rate_set, &gain);
17323 if (rate_set == 0) {
17324 if (rate_wanted > rack->rc_tp->snd_wnd)
17325 rate_wanted = rack->rc_tp->snd_wnd;
17326 /* Now lets make it into a b/w */
17327 rate_wanted *= (uint64_t)HPTS_USEC_IN_SEC;
17328 rate_wanted /= (uint64_t)rack->r_ctl.rc_last_us_rtt;
17329 }
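/*
 * Worked example (hypothetical numbers): if the discounted window
 * came back as 50000 bytes with rc_last_us_rtt = 25000 usec, the
 * conversion above yields 50000 * 1,000,000 / 25000 = 2,000,000
 * bytes/sec (about 16 Mbps) as the wanted b/w.
 */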
17330 bw_est = rate_wanted;
17331 rack_log_pacing_delay_calc(rack, rack->rc_tp->snd_cwnd,
17332 rack->r_ctl.cwnd_to_use,
17333 rate_wanted, bw_est,
17334 rack->r_ctl.rc_last_us_rtt,
17335 88, __LINE__, NULL, gain);
17336 }
17337 if (((bw_est == 0) || (rate_wanted == 0) || (rack->gp_ready == 0)) &&
17338 (rack->use_fixed_rate == 0)) {
17339 /*
17340 * No way yet to make a b/w estimate or
17341 * our raise is set incorrectly.
17342 */
17343 goto old_method;
17344 }
17345 rack_rate_cap_bw(rack, &rate_wanted, &capped);
17346 /* We need to account for all the overheads */
17347 segs = (len + segsiz - 1) / segsiz;
17348 /*
17349 * We need the diff between 1514 bytes (e-mtu with e-hdr)
17350 * and how much data we put in each packet. Yes this
17351 * means we may be off if we are larger than 1500 bytes
17352 * or smaller. But this just makes us more conservative.
17353 */
17354
17355 oh = (tp->t_maxseg - segsiz) + sizeof(struct tcphdr);
17356 if (rack->r_is_v6) {
17357 #ifdef INET6
17358 oh += sizeof(struct ip6_hdr);
17359 #endif
17360 } else {
17361 #ifdef INET
17362 oh += sizeof(struct ip);
17363 #endif
17364 }
17365 /* We add a fixed 14 for the ethernet header */
17366 oh += 14;
17367 segs *= oh;
17368 lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC;
17369 res = lentim / rate_wanted;
17370 pacing_delay = (uint32_t)res;
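/*
 * Worked example (hypothetical, IPv4 with no large options): for
 * len = 14600 bytes and segsiz = t_maxseg = 1460, segs = 10 and
 * oh = 0 + 20 (tcp) + 20 (ip) + 14 (ethernet) = 54, so segs *= oh
 * gives 540 bytes of per-packet overhead. At rate_wanted =
 * 1,250,000 bytes/sec (10 Mbps), lentim = (14600 + 540) * 1,000,000
 * / 1,250,000 = 12112 usec of pacing delay.
 */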
17371 if (rack_hw_rate_min &&
17372 (rate_wanted < rack_hw_rate_min)) {
17373 can_start_hw_pacing = 0;
17374 if (rack->r_ctl.crte) {
17375 /*
17376 * Ok we need to release it, we
17377 * have fallen too low.
17378 */
17379 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp);
17380 rack->r_ctl.crte = NULL;
17381 rack->rack_attempt_hdwr_pace = 0;
17382 rack->rack_hdrw_pacing = 0;
17383 }
17384 }
17385 if (rack->r_ctl.crte &&
17386 (tcp_hw_highest_rate(rack->r_ctl.crte) < rate_wanted)) {
17387 /*
17388 * We want more than the hardware can give us,
17389 * don't start any hw pacing.
17390 */
17391 can_start_hw_pacing = 0;
17392 if (rack->r_rack_hw_rate_caps == 0) {
17393 /*
17394 * Ok we need to release it, we
17395 * want more than the card can give us and
17396 * no rate cap is in place. Set it up so
17397 * when we want less we can retry.
17398 */
17399 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp);
17400 rack->r_ctl.crte = NULL;
17401 rack->rack_attempt_hdwr_pace = 0;
17402 rack->rack_hdrw_pacing = 0;
17403 }
17404 }
17405 if ((rack->r_ctl.crte != NULL) && (rack->rc_inp->inp_snd_tag == NULL)) {
17406 /*
17407 * We lost our rate somehow, this can happen
17408 * if the interface changed underneath us.
17409 */
17410 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp);
17411 rack->r_ctl.crte = NULL;
17412 /* Lets re-allow attempting to setup pacing */
17413 rack->rack_hdrw_pacing = 0;
17414 rack->rack_attempt_hdwr_pace = 0;
17415 rack_log_hdwr_pacing(rack,
17416 rate_wanted, bw_est, __LINE__,
17417 0, 6);
17418 }
17419 prev_fill = rack->r_via_fill_cw;
17420 if ((rack->rc_pace_to_cwnd) &&
17421 (capped == 0) &&
17422 (rack->dgp_on == 1) &&
17423 (rack->use_fixed_rate == 0) &&
17424 (rack->in_probe_rtt == 0) &&
17425 (IN_FASTRECOVERY(rack->rc_tp->t_flags) == 0)) {
17426 /*
17427 * We want to pace at our rate *or* faster to
17428 * fill the cwnd to the max if its not full.
17429 */
17430 pacing_delay = pace_to_fill_cwnd(rack, pacing_delay, (len+segs), segsiz, &capped, &rate_wanted, 0);
17431 /* Re-check to make sure we are not exceeding our max b/w */
17432 if ((rack->r_ctl.crte != NULL) &&
17433 (tcp_hw_highest_rate(rack->r_ctl.crte) < rate_wanted)) {
17434 /*
17435 * We want more than the hardware can give us,
17436 * don't start any hw pacing.
17437 */
17438 can_start_hw_pacing = 0;
17439 if (rack->r_rack_hw_rate_caps == 0) {
17440 /*
17441 * Ok we need to release it, we
17442 * want more than the card can give us and
17443 * no rate cap is in place. Set it up so
17444 * when we want less we can retry.
17445 */
17446 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp);
17447 rack->r_ctl.crte = NULL;
17448 rack->rack_attempt_hdwr_pace = 0;
17449 rack->rack_hdrw_pacing = 0;
17450 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
17451 }
17452 }
17453 }
17454 if ((rack->rc_inp->inp_route.ro_nh != NULL) &&
17455 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) {
17456 if ((rack->rack_hdw_pace_ena) &&
17457 (can_start_hw_pacing > 0) &&
17458 (rack->rack_hdrw_pacing == 0) &&
17459 (rack->rack_attempt_hdwr_pace == 0)) {
17460 /*
17461 * Lets attempt to turn on hardware pacing
17462 * if we can.
17463 */
17464 rack->rack_attempt_hdwr_pace = 1;
17465 rack->r_ctl.crte = tcp_set_pacing_rate(rack->rc_tp,
17466 rack->rc_inp->inp_route.ro_nh->nh_ifp,
17467 rate_wanted,
17468 RS_PACING_GEQ,
17469 &err, &rack->r_ctl.crte_prev_rate);
17470 if (rack->r_ctl.crte) {
17471 rack->rack_hdrw_pacing = 1;
17472 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size_w_divisor(tp, rate_wanted, segsiz,
17473 pace_one, rack->r_ctl.crte,
17474 NULL, rack->r_ctl.pace_len_divisor);
17475 rack_log_hdwr_pacing(rack,
17476 rate_wanted, rack->r_ctl.crte->rate, __LINE__,
17477 err, 0);
17478 rack->r_ctl.last_hw_bw_req = rate_wanted;
17479 } else {
17480 counter_u64_add(rack_hw_pace_init_fail, 1);
17481 }
17482 } else if (rack->rack_hdrw_pacing &&
17483 (rack->r_ctl.last_hw_bw_req != rate_wanted)) {
17484 /* Do we need to adjust our rate? */
17485 const struct tcp_hwrate_limit_table *nrte;
17486
17487 if (rack->r_up_only &&
17488 (rate_wanted < rack->r_ctl.crte->rate)) {
17489 /**
17490 * We have four possible states here
17491 * having to do with the previous time
17492 * and this time.
17493 * previous | this-time
17494 * A) 0 | 0 -- fill_cw not in the picture
17495 * B) 1 | 0 -- we were doing a fill-cw but now are not
17496 * C) 1 | 1 -- all rates from fill_cw
17497 * D) 0 | 1 -- we were doing non-fill and now we are filling
17498 *
17499 * For cases A, C and D we don't allow a drop. But for
17500 * case B, where we are now on our steady rate, we do
17501 * allow a drop.
17502 *
17503 */
17504 if (!((prev_fill == 1) && (rack->r_via_fill_cw == 0)))
17505 goto done_w_hdwr;
17506 }
17507 if ((rate_wanted > rack->r_ctl.crte->rate) ||
17508 (rate_wanted <= rack->r_ctl.crte_prev_rate)) {
17509 if (rack_hw_rate_to_low &&
17510 (bw_est < rack_hw_rate_to_low)) {
17511 /*
17512 * The pacing rate is too low for hardware, but
17513 * do allow hardware pacing to be restarted.
17514 */
17515 rack_log_hdwr_pacing(rack,
17516 bw_est, rack->r_ctl.crte->rate, __LINE__,
17517 0, 5);
17518 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp);
17519 rack->r_ctl.crte = NULL;
17520 rack->rack_attempt_hdwr_pace = 0;
17521 rack->rack_hdrw_pacing = 0;
17522 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted);
17523 goto done_w_hdwr;
17524 }
17525 nrte = tcp_chg_pacing_rate(rack->r_ctl.crte,
17526 rack->rc_tp,
17527 rack->rc_inp->inp_route.ro_nh->nh_ifp,
17528 rate_wanted,
17529 RS_PACING_GEQ,
17530 &err, &rack->r_ctl.crte_prev_rate);
17531 if (nrte == NULL) {
17532 /*
17533 * Lost the rate, lets drop hardware pacing
17534 * period.
17535 */
17536 rack->rack_hdrw_pacing = 0;
17537 rack->r_ctl.crte = NULL;
17538 rack_log_hdwr_pacing(rack,
17539 rate_wanted, 0, __LINE__,
17540 err, 1);
17541 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted);
17542 counter_u64_add(rack_hw_pace_lost, 1);
17543 } else if (nrte != rack->r_ctl.crte) {
17544 rack->r_ctl.crte = nrte;
17545 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size_w_divisor(tp, rate_wanted,
17546 segsiz, pace_one, rack->r_ctl.crte,
17547 NULL, rack->r_ctl.pace_len_divisor);
17548 rack_log_hdwr_pacing(rack,
17549 rate_wanted, rack->r_ctl.crte->rate, __LINE__,
17550 err, 2);
17551 rack->r_ctl.last_hw_bw_req = rate_wanted;
17552 }
17553 } else {
17554 /* We just need to adjust the segment size */
17555 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted);
17556 rack_log_hdwr_pacing(rack,
17557 rate_wanted, rack->r_ctl.crte->rate, __LINE__,
17558 0, 4);
17559 rack->r_ctl.last_hw_bw_req = rate_wanted;
17560 }
17561 }
17562 }
17563 done_w_hdwr:
17564 if (rack_limit_time_with_srtt &&
17565 (rack->use_fixed_rate == 0) &&
17566 (rack->rack_hdrw_pacing == 0)) {
17567 /*
17568 * Sanity check, we do not allow the pacing delay
17569 * to be longer than the SRTT of the path. If it is
17570 * a slow path, then adding a packet should increase
17571 * the RTT and compensate for this i.e. the srtt will
17572 * be greater so the allowed pacing time will be greater.
17573 *
17574 * Note this restriction does not apply where a peak rate
17575 * is set, i.e. when we are doing fixed pacing or hardware pacing.
17576 */
17577 if (rack->rc_tp->t_srtt)
17578 srtt = rack->rc_tp->t_srtt;
17579 else
17580 srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC; /* it's in ms, convert */
17581 if (srtt < (uint64_t)pacing_delay) {
17582 rack_log_pacing_delay_calc(rack, srtt, pacing_delay, rate_wanted, bw_est, lentim, 99, __LINE__, NULL, 0);
17583 pacing_delay = srtt;
17584 }
17585 }
17586 /*******************************************************************/
17587 /* RRS: We insert paced call to stats here for len and rate_wanted */
17588 /*******************************************************************/
17589 rack_log_pacing_delay_calc(rack, len, pacing_delay, rate_wanted, bw_est, lentim, 2, __LINE__, rsm, 0);
17590 }
17591 if (rack->r_ctl.crte && (rack->r_ctl.crte->rs_num_enobufs > 0)) {
17592 /*
17593 * If this rate is seeing enobufs when it
17594 * goes to send then either the nic is out
17595 * of gas or we are mis-estimating the time
17596 * somehow and not letting the queue empty
17597 * completely. Let's add to the pacing time.
17598 */
17599 int hw_boost_delay;
17600
17601 hw_boost_delay = rack->r_ctl.crte->time_between * rack_enobuf_hw_boost_mult;
17602 if (hw_boost_delay > rack_enobuf_hw_max)
17603 hw_boost_delay = rack_enobuf_hw_max;
17604 else if (hw_boost_delay < rack_enobuf_hw_min)
17605 hw_boost_delay = rack_enobuf_hw_min;
17606 pacing_delay += hw_boost_delay;
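/*
 * Worked example (hypothetical numbers): if the rate table entry
 * reports time_between = 500 usec and rack_enobuf_hw_boost_mult is
 * 4, the boost is 2000 usec; it is then clamped into the
 * [rack_enobuf_hw_min, rack_enobuf_hw_max] window before being
 * added to the pacing delay, giving the NIC queue extra time to
 * drain.
 */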
17607 }
17608 return (pacing_delay);
17609 }
17610
17611 static void
17612 rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack,
17613 tcp_seq startseq, uint32_t sb_offset)
17614 {
17615 struct rack_sendmap *my_rsm = NULL;
17616
17617 if (tp->t_state < TCPS_ESTABLISHED) {
17618 /*
17619 * We don't start any measurements if we are
17620 * not at least established.
17621 */
17622 return;
17623 }
17624 if (tp->t_state >= TCPS_FIN_WAIT_1) {
17625 /*
17626 * We will get no more data into the SB;
17627 * this means we need to have the data available
17628 * before we start a measurement.
17629 */
17630
17631 if (sbavail(&tptosocket(tp)->so_snd) <
17632 max(rc_init_window(rack),
17633 (MIN_GP_WIN * ctf_fixed_maxseg(tp)))) {
17634 /* Nope not enough data */
17635 return;
17636 }
17637 }
17638 tp->t_flags |= TF_GPUTINPROG;
17639 rack->r_ctl.rc_gp_cumack_ts = 0;
17640 rack->r_ctl.rc_gp_lowrtt = 0xffffffff;
17641 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd;
17642 tp->gput_seq = startseq;
17643 rack->app_limited_needs_set = 0;
17644 if (rack->in_probe_rtt)
17645 rack->measure_saw_probe_rtt = 1;
17646 else if ((rack->measure_saw_probe_rtt) &&
17647 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit)))
17648 rack->measure_saw_probe_rtt = 0;
17649 if (rack->rc_gp_filled)
17650 tp->gput_ts = rack->r_ctl.last_cumack_advance;
17651 else {
17652 /* Special case initial measurement */
17653 struct timeval tv;
17654
17655 tp->gput_ts = tcp_get_usecs(&tv);
17656 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv);
17657 }
17658 /*
17659 * We take a guess out into the future,
17660 * if we have no measurement and no
17661 * initial rate, we measure the first
17662 * initial-window's worth of data to
17663 * speed up getting some GP measurement and
17664 * thus start pacing.
17665 */
17666 if ((rack->rc_gp_filled == 0) && (rack->r_ctl.init_rate == 0)) {
17667 rack->app_limited_needs_set = 1;
17668 tp->gput_ack = startseq + max(rc_init_window(rack),
17669 (MIN_GP_WIN * ctf_fixed_maxseg(tp)));
17670 rack_log_pacing_delay_calc(rack,
17671 tp->gput_seq,
17672 tp->gput_ack,
17673 0,
17674 tp->gput_ts,
17675 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts),
17676 9,
17677 __LINE__, NULL, 0);
17678 rack_tend_gp_marks(tp, rack);
17679 rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL);
17680 return;
17681 }
17682 if (sb_offset) {
17683 /*
17684 * We are out somewhere in the sb
17685 * can we use the already outstanding data?
17686 */
17687
17688 if (rack->r_ctl.rc_app_limited_cnt == 0) {
17689 /*
17690 * Yes first one is good and in this case
17691 * the tp->gput_ts is correctly set based on
17692 * the last ack that arrived (no need to
17693 * set things up when an ack comes in).
17694 */
17695 my_rsm = tqhash_min(rack->r_ctl.tqh);
17696 if ((my_rsm == NULL) ||
17697 (my_rsm->r_rtr_cnt != 1)) {
17698 /* retransmission? */
17699 goto use_latest;
17700 }
17701 } else {
17702 if (rack->r_ctl.rc_first_appl == NULL) {
17703 /*
17704 * If rc_first_appl is NULL
17705 * then the cnt should be 0.
17706 * This is probably an error, maybe
17707 * a KASSERT would be appropriate.
17708 */
17709 goto use_latest;
17710 }
17711 /*
17712 * If we have a marker pointer to the last one that is
17713 * app limited we can use that, but we need to set
17714 * things up so that when it gets ack'ed we record
17715 * the ack time (if its not already acked).
17716 */
17717 rack->app_limited_needs_set = 1;
17718 /*
17719 * We want to get to the rsm that is either
17720 * next with space, i.e. over 1 MSS, or the one
17721 * after that (after the app-limited).
17722 */
17723 my_rsm = tqhash_next(rack->r_ctl.tqh, rack->r_ctl.rc_first_appl);
17724 if (my_rsm) {
17725 if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp))
17726 /* Have to use the next one */
17727 my_rsm = tqhash_next(rack->r_ctl.tqh, my_rsm);
17728 else {
17729 /* Use after the first MSS of it is acked */
17730 tp->gput_seq = my_rsm->r_start + ctf_fixed_maxseg(tp);
17731 goto start_set;
17732 }
17733 }
17734 if ((my_rsm == NULL) ||
17735 (my_rsm->r_rtr_cnt != 1)) {
17736 /*
17737 * Either its a retransmit or
17738 * the last is the app-limited one.
17739 */
17740 goto use_latest;
17741 }
17742 }
17743 tp->gput_seq = my_rsm->r_start;
17744 start_set:
17745 if (my_rsm->r_flags & RACK_ACKED) {
17746 /*
17747 * This one has been acked; use the arrival ack time.
17748 */
17749 struct rack_sendmap *nrsm;
17750
17751 tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival;
17752 rack->app_limited_needs_set = 0;
17753 /*
17754 * Ok in this path we need to use the r_end now
17755 * since this guy is the starting ack.
17756 */
17757 tp->gput_seq = my_rsm->r_end;
17758 /*
17759 * We also need to adjust up the sendtime
17760 * to the send of the next data after my_rsm.
17761 */
17762 nrsm = tqhash_next(rack->r_ctl.tqh, my_rsm);
17763 if (nrsm != NULL)
17764 my_rsm = nrsm;
17765 else {
17766 /*
17767 * The next has not been sent; that's the
17768 * case for using the latest.
17769 */
17770 goto use_latest;
17771 }
17772 }
17773 rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[0];
17774 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack);
17775 rack->r_ctl.rc_gp_cumack_ts = 0;
17776 if ((rack->r_ctl.cleared_app_ack == 1) &&
17777 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.cleared_app_ack_seq))) {
17778 /*
17779 * We just cleared an application limited period
17780 * so the next seq out needs to skip the first
17781 * ack.
17782 */
17783 rack->app_limited_needs_set = 1;
17784 rack->r_ctl.cleared_app_ack = 0;
17785 }
17786 rack_log_pacing_delay_calc(rack,
17787 tp->gput_seq,
17788 tp->gput_ack,
17789 (uintptr_t)my_rsm,
17790 tp->gput_ts,
17791 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts),
17792 9,
17793 __LINE__, my_rsm, 0);
17794 /* Now lets make sure all are marked as they should be */
17795 rack_tend_gp_marks(tp, rack);
17796 rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL);
17797 return;
17798 }
17799
17800 use_latest:
17801 /*
17802 * We don't know how long we may have been
17803 * idle or if this is the first-send. Let's
17804 * set up the flag so we will trim off
17805 * the first ack'd data so we get a true
17806 * measurement.
17807 */
17808 rack->app_limited_needs_set = 1;
17809 tp->gput_ack = startseq + rack_get_measure_window(tp, rack);
17810 rack->r_ctl.rc_gp_cumack_ts = 0;
17811 /* Find this guy so we can pull the send time */
17812 my_rsm = tqhash_find(rack->r_ctl.tqh, startseq);
17813 if (my_rsm) {
17814 rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[0];
17815 if (my_rsm->r_flags & RACK_ACKED) {
17816 /*
17817 * Unlikely since it's probably what was
17818 * just transmitted (but I am paranoid).
17819 */
17820 tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival;
17821 rack->app_limited_needs_set = 0;
17822 }
17823 if (SEQ_LT(my_rsm->r_start, tp->gput_seq)) {
17824 /* This also is unlikely */
17825 tp->gput_seq = my_rsm->r_start;
17826 }
17827 } else {
17828 /*
17829 * TSNH unless we have some send-map limit,
17830 * and even at that it should not be hitting
17831 * that limit (we should have stopped sending).
17832 */
17833 struct timeval tv;
17834
17835 microuptime(&tv);
17836 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv);
17837 }
17838 rack_tend_gp_marks(tp, rack);
17839 rack_log_pacing_delay_calc(rack,
17840 tp->gput_seq,
17841 tp->gput_ack,
17842 (uintptr_t)my_rsm,
17843 tp->gput_ts,
17844 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts),
17845 9, __LINE__, NULL, 0);
17846 rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL);
17847 }
17848
17849 static inline uint32_t
17850 rack_what_can_we_send(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cwnd_to_use,
17851 uint32_t avail, int32_t sb_offset)
17852 {
17853 uint32_t len;
17854 uint32_t sendwin;
17855
17856 if (tp->snd_wnd > cwnd_to_use)
17857 sendwin = cwnd_to_use;
17858 else
17859 sendwin = tp->snd_wnd;
17860 if (ctf_outstanding(tp) >= tp->snd_wnd) {
17861 /* We never want to go over our peers rcv-window */
17862 len = 0;
17863 } else {
17864 uint32_t flight;
17865
17866 flight = ctf_flight_size(tp, rack->r_ctl.rc_sacked);
17867 if (flight >= sendwin) {
17868 /*
17869 * We have in flight what we are allowed by cwnd (if
17870 * it was rwnd blocking it would have hit the check
17871 * above of >= tp->snd_wnd).
17872 */
17873 return (0);
17874 }
17875 len = sendwin - flight;
17876 if ((len + ctf_outstanding(tp)) > tp->snd_wnd) {
17877 /* We would send too much (beyond the rwnd) */
17878 len = tp->snd_wnd - ctf_outstanding(tp);
17879 }
17880 if ((len + sb_offset) > avail) {
17881 /*
17882 * We don't have that much in the SB, how much is
17883 * there?
17884 */
17885 len = avail - sb_offset;
17886 }
17887 }
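/*
 * Worked example (hypothetical numbers): with snd_wnd = 60000,
 * cwnd_to_use = 40000 and 25000 bytes already in flight, sendwin is
 * 40000 and len starts as 40000 - 25000 = 15000. If only 30000
 * bytes sit in the socket buffer and sb_offset is 25000, the final
 * clamp above trims len to 30000 - 25000 = 5000 bytes.
 */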
17888 return (len);
17889 }
17890
17891 static void
17892 rack_log_fsb(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t flags,
17893 unsigned ipoptlen, int32_t orig_len, int32_t len, int error,
17894 int rsm_is_null, int optlen, int line, uint16_t mode)
17895 {
17896 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
17897 union tcp_log_stackspecific log;
17898 struct timeval tv;
17899
17900 memset(&log, 0, sizeof(log));
17901 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
17902 log.u_bbr.flex1 = error;
17903 log.u_bbr.flex2 = flags;
17904 log.u_bbr.flex3 = rsm_is_null;
17905 log.u_bbr.flex4 = ipoptlen;
17906 log.u_bbr.flex5 = tp->rcv_numsacks;
17907 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
17908 log.u_bbr.flex7 = optlen;
17909 log.u_bbr.flex8 = rack->r_fsb_inited;
17910 log.u_bbr.applimited = rack->r_fast_output;
17911 log.u_bbr.bw_inuse = rack_get_bw(rack);
17912 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL);
17913 log.u_bbr.cwnd_gain = mode;
17914 log.u_bbr.pkts_out = orig_len;
17915 log.u_bbr.lt_epoch = len;
17916 log.u_bbr.delivered = line;
17917 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
17918 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
17919 tcp_log_event(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_FSB, 0,
17920 len, &log, false, NULL, __func__, __LINE__, &tv);
17921 }
17922 }
17923
17924
17925 static struct mbuf *
17926 rack_fo_base_copym(struct mbuf *the_m, uint32_t the_off, int32_t *plen,
17927 struct rack_fast_send_blk *fsb,
17928 int32_t seglimit, int32_t segsize, int hw_tls)
17929 {
17930 #ifdef KERN_TLS
17931 struct ktls_session *tls, *ntls;
17932 #ifdef INVARIANTS
17933 struct mbuf *start;
17934 #endif
17935 #endif
17936 struct mbuf *m, *n, **np, *smb;
17937 struct mbuf *top;
17938 int32_t off, soff;
17939 int32_t len = *plen;
17940 int32_t fragsize;
17941 int32_t len_cp = 0;
17942 uint32_t mlen, frags;
17943
17944 soff = off = the_off;
17945 smb = m = the_m;
17946 np = ⊤
17947 top = NULL;
17948 #ifdef KERN_TLS
17949 if (hw_tls && (m->m_flags & M_EXTPG))
17950 tls = m->m_epg_tls;
17951 else
17952 tls = NULL;
17953 #ifdef INVARIANTS
17954 start = m;
17955 #endif
17956 #endif
17957 while (len > 0) {
17958 if (m == NULL) {
17959 *plen = len_cp;
17960 break;
17961 }
17962 #ifdef KERN_TLS
17963 if (hw_tls) {
17964 if (m->m_flags & M_EXTPG)
17965 ntls = m->m_epg_tls;
17966 else
17967 ntls = NULL;
17968
17969 /*
17970 * Avoid mixing TLS records with handshake
17971 * data or TLS records from different
17972 * sessions.
17973 */
17974 if (tls != ntls) {
17975 MPASS(m != start);
17976 *plen = len_cp;
17977 break;
17978 }
17979 }
17980 #endif
17981 mlen = min(len, m->m_len - off);
17982 if (seglimit) {
17983 /*
17984 * For M_EXTPG mbufs, add 3 segments
17985 * + 1 in case we are crossing page boundaries
17986 * + 2 in case the TLS hdr/trailer are used
17987 * It is cheaper to just add the segments
17988 * than it is to take the cache miss to look
17989 * at the mbuf ext_pgs state in detail.
17990 */
17991 if (m->m_flags & M_EXTPG) {
17992 fragsize = min(segsize, PAGE_SIZE);
17993 frags = 3;
17994 } else {
17995 fragsize = segsize;
17996 frags = 0;
17997 }
17998
17999 /* Break if we really can't fit anymore. */
18000 if ((frags + 1) >= seglimit) {
18001 *plen = len_cp;
18002 break;
18003 }
18004
18005 /*
18006 * Reduce size if you can't copy the whole
18007 * mbuf. If we can't copy the whole mbuf, also
18008 * adjust len so the loop will end after this
18009 * mbuf.
18010 */
18011 if ((frags + howmany(mlen, fragsize)) >= seglimit) {
18012 mlen = (seglimit - frags - 1) * fragsize;
18013 len = mlen;
18014 *plen = len_cp + len;
18015 }
18016 frags += howmany(mlen, fragsize);
18017 if (frags == 0)
18018 frags++;
18019 seglimit -= frags;
18020 KASSERT(seglimit > 0,
18021 ("%s: seglimit went too low", __func__));
18022 }
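/*
 * Worked example of the seglimit math (hypothetical numbers): for an
 * unmapped (M_EXTPG) mbuf with mlen = 8192, segsize = 1460 and
 * seglimit = 8, we charge frags = 3 up front and fragsize = 1460.
 * howmany(8192, 1460) = 6, and 3 + 6 >= 8, so mlen is trimmed to
 * (8 - 3 - 1) * 1460 = 5840 bytes and the copy loop will stop after
 * this mbuf, keeping the descriptor count under the hardware limit.
 */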
18023 n = m_get(M_NOWAIT, m->m_type);
18024 *np = n;
18025 if (n == NULL)
18026 goto nospace;
18027 n->m_len = mlen;
18028 soff += mlen;
18029 len_cp += n->m_len;
18030 if (m->m_flags & (M_EXT | M_EXTPG)) {
18031 n->m_data = m->m_data + off;
18032 mb_dupcl(n, m);
18033 } else {
18034 bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
18035 (u_int)n->m_len);
18036 }
18037 len -= n->m_len;
18038 off = 0;
18039 m = m->m_next;
18040 np = &n->m_next;
18041 if (len || (soff == smb->m_len)) {
18042 /*
18043 * We have more so we move forward, or
18044 * we have consumed the entire mbuf and
18045 * len has fallen to 0.
18046 */
18047 soff = 0;
18048 smb = m;
18049 }
18050
18051 }
18052 if (fsb != NULL) {
18053 fsb->m = smb;
18054 fsb->off = soff;
18055 if (smb) {
18056 /*
18057 * Save off the size of the mbuf. We do
18058 * this so that we can recognize when it
18059 * has been trimmed by sbcut() as acks
18060 * come in.
18061 */
18062 fsb->o_m_len = smb->m_len;
18063 fsb->o_t_len = M_TRAILINGROOM(smb);
18064 } else {
18065 /*
18066 * This is the case where the next mbuf went to NULL. This
18067 * means with this copy we have sent everything in the sb.
18068 * In theory we could clear the fast_output flag, but let's
18069 * not since it's possible that we could get more added
18070 * and acks that call the extend function which would let
18071 * us send more.
18072 */
18073 fsb->o_m_len = 0;
18074 fsb->o_t_len = 0;
18075 }
18076 }
18077 return (top);
18078 nospace:
18079 if (top)
18080 m_freem(top);
18081 return (NULL);
18082
18083 }
18084
18085 /*
18086 * This is a copy of m_copym(), taking the TSO segment size/limit
18087 * constraints into account, and advancing the sndptr as it goes.
18088 */
18089 static struct mbuf *
18090 rack_fo_m_copym(struct tcp_rack *rack, int32_t *plen,
18091 int32_t seglimit, int32_t segsize, struct mbuf **s_mb, int *s_soff)
18092 {
18093 struct mbuf *m, *n;
18094 int32_t soff;
18095
18096 m = rack->r_ctl.fsb.m;
18097 if (M_TRAILINGROOM(m) != rack->r_ctl.fsb.o_t_len) {
18098 /*
18099 * The trailing space changed, mbufs can grow
18100 * at the tail but they can't shrink from
18101 * it, KASSERT that. Adjust the orig_m_len to
18102 * compensate for this change.
18103 */
18104 KASSERT((rack->r_ctl.fsb.o_t_len > M_TRAILINGROOM(m)),
18105 ("mbuf:%p rack:%p trailing_space:%jd ots:%u oml:%u mlen:%u\n",
18106 m,
18107 rack,
18108 (intmax_t)M_TRAILINGROOM(m),
18109 rack->r_ctl.fsb.o_t_len,
18110 rack->r_ctl.fsb.o_m_len,
18111 m->m_len));
18112 rack->r_ctl.fsb.o_m_len += (rack->r_ctl.fsb.o_t_len - M_TRAILINGROOM(m));
18113 rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(m);
18114 }
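/*
 * Worked example (hypothetical numbers): suppose we recorded
 * o_m_len = 1448 and o_t_len = 600 for this mbuf, and new data has
 * since been appended so that M_TRAILINGROOM() now returns 100.
 * The adjustment above grows o_m_len by 600 - 100 = 500 to 1948,
 * so the offset math below still lines up with our original
 * position in the mbuf.
 */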
18115 if (m->m_len < rack->r_ctl.fsb.o_m_len) {
18116 /*
18117 * Mbuf shrank, trimmed off the top by an ack, our
18118 * offset changes.
18119 */
18120 KASSERT((rack->r_ctl.fsb.off >= (rack->r_ctl.fsb.o_m_len - m->m_len)),
18121 ("mbuf:%p len:%u rack:%p oml:%u soff:%u\n",
18122 m, m->m_len,
18123 rack, rack->r_ctl.fsb.o_m_len,
18124 rack->r_ctl.fsb.off));
18125
18126 if (rack->r_ctl.fsb.off >= (rack->r_ctl.fsb.o_m_len- m->m_len))
18127 rack->r_ctl.fsb.off -= (rack->r_ctl.fsb.o_m_len - m->m_len);
18128 else
18129 rack->r_ctl.fsb.off = 0;
18130 rack->r_ctl.fsb.o_m_len = m->m_len;
18131 #ifdef INVARIANTS
18132 } else if (m->m_len > rack->r_ctl.fsb.o_m_len) {
18133 panic("rack:%p m:%p m_len grew outside of t_space compensation",
18134 rack, m);
18135 #endif
18136 }
18137 soff = rack->r_ctl.fsb.off;
18138 KASSERT(soff >= 0, ("%s, negative off %d", __FUNCTION__, soff));
18139 KASSERT(*plen >= 0, ("%s, negative len %d", __FUNCTION__, *plen));
18140 KASSERT(soff < m->m_len, ("%s rack:%p len:%u m:%p m->m_len:%u < off?",
18141 __FUNCTION__,
18142 rack, *plen, m, m->m_len));
18143 /* Save off the right location before we copy and advance */
18144 *s_soff = soff;
18145 *s_mb = rack->r_ctl.fsb.m;
18146 n = rack_fo_base_copym(m, soff, plen,
18147 &rack->r_ctl.fsb,
18148 seglimit, segsize, rack->r_ctl.fsb.hw_tls);
18149 return (n);
18150 }
18151
18152 /* Log the buffer level */
18153 static void
18154 rack_log_queue_level(struct tcpcb *tp, struct tcp_rack *rack,
18155 int len, struct timeval *tv,
18156 uint32_t cts)
18157 {
18158 uint32_t p_rate = 0, p_queue = 0, err = 0;
18159 union tcp_log_stackspecific log;
18160
18161 #ifdef RATELIMIT
18162 err = in_pcbquery_txrlevel(rack->rc_inp, &p_queue);
18163 err = in_pcbquery_txrtlmt(rack->rc_inp, &p_rate);
18164 #endif
18165 memset(&log, 0, sizeof(log));
18166 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
18167 log.u_bbr.flex1 = p_rate;
18168 log.u_bbr.flex2 = p_queue;
18169 log.u_bbr.flex4 = (uint32_t)rack->r_ctl.crte->using;
18170 log.u_bbr.flex5 = (uint32_t)rack->r_ctl.crte->rs_num_enobufs;
18171 log.u_bbr.flex6 = rack->r_ctl.crte->time_between;
18172 log.u_bbr.flex7 = 99;
18173 log.u_bbr.flex8 = 0;
18174 log.u_bbr.pkts_out = err;
18175 log.u_bbr.delRate = rack->r_ctl.crte->rate;
18176 log.u_bbr.timeStamp = cts;
18177 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
18178 tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_HDWR_PACE, 0,
18179 len, &log, false, NULL, __func__, __LINE__, tv);
18180
18181 }
18182
18183 static uint32_t
18184 rack_check_queue_level(struct tcp_rack *rack, struct tcpcb *tp,
18185 struct timeval *tv, uint32_t cts, int len, uint32_t segsiz)
18186 {
18187 uint64_t lentime = 0;
18188 #ifdef RATELIMIT
18189 uint32_t p_rate = 0, p_queue = 0, err;
18190 union tcp_log_stackspecific log;
18191 uint64_t bw;
18192
18193 err = in_pcbquery_txrlevel(rack->rc_inp, &p_queue);
18194 /* Failed or queue is zero */
18195 if (err || (p_queue == 0)) {
18196 lentime = 0;
18197 goto out;
18198 }
18199 err = in_pcbquery_txrtlmt(rack->rc_inp, &p_rate);
18200 if (err) {
18201 lentime = 0;
18202 goto out;
18203 }
18204 /*
18205 * If we reach here we have some bytes in
18206 * the queue. The number returned is a value
18207 * between 0 and 0xffff where ffff is full
18208 * and 0 is empty. So how best to make this into
18209 * something usable?
18210 *
18211 * The "safer" way is to take the b/w gotten
18212 * from the query (which should be our b/w rate)
18213 * and pretend that a full send (our rc_pace_max_segs)
18214 * is outstanding. We factor it so it's as if a full
18215 * number of our MSS-sized segments, in terms of full
18216 * ethernet segments, were outstanding.
18217 */
18218 bw = p_rate / 8;
18219 if (bw) {
18220 lentime = (rack->r_ctl.rc_pace_max_segs / segsiz);
18221 lentime *= ETHERNET_SEGMENT_SIZE;
18222 lentime *= (uint64_t)HPTS_USEC_IN_SEC;
18223 lentime /= bw;
18224 } else {
18225 /* TSNH -- KASSERT? */
18226 lentime = 0;
18227 }
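/*
 * Worked example (hypothetical numbers, assuming an
 * ETHERNET_SEGMENT_SIZE of 1514): with p_rate = 100,000,000 bits/sec,
 * bw = 12,500,000 bytes/sec. If rc_pace_max_segs = 14600 and
 * segsiz = 1460 that is 10 segments, so lentime =
 * 10 * 1514 * 1,000,000 / 12,500,000 = 1211 usec, the time we assume
 * the hardware queue needs to drain a full burst.
 */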
18228 out:
18229 if (tcp_bblogging_on(tp)) {
18230 memset(&log, 0, sizeof(log));
18231 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
18232 log.u_bbr.flex1 = p_rate;
18233 log.u_bbr.flex2 = p_queue;
18234 log.u_bbr.flex4 = (uint32_t)rack->r_ctl.crte->using;
18235 log.u_bbr.flex5 = (uint32_t)rack->r_ctl.crte->rs_num_enobufs;
18236 log.u_bbr.flex6 = rack->r_ctl.crte->time_between;
18237 log.u_bbr.flex7 = 99;
18238 log.u_bbr.flex8 = 0;
18239 log.u_bbr.pkts_out = err;
18240 log.u_bbr.delRate = rack->r_ctl.crte->rate;
18241 log.u_bbr.cur_del_rate = lentime;
18242 log.u_bbr.timeStamp = cts;
18243 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
18244 tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_HDWR_PACE, 0,
18245 len, &log, false, NULL, __func__, __LINE__,tv);
18246 }
18247 #endif
18248 return ((uint32_t)lentime);
18249 }
18250
18251 static int
18252 rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm,
18253 uint64_t ts_val, uint32_t cts, uint32_t ms_cts, struct timeval *tv, int len, uint8_t doing_tlp)
18254 {
18255 /*
18256 * Enter the fast retransmit path. We are given that a sched_pin is
18257 * in place (if accounting is compiled in) and the cycle count taken
18258 * at the entry is in the ts_val. The concept here is that the rsm
18259 * now holds the mbuf offsets and such so we can directly transmit
18260 * without a lot of overhead; the len field is already set for
18261 * us to prohibit us from sending too much (usually it's 1 MSS).
18262 */
18263 struct ip *ip = NULL;
18264 struct udphdr *udp = NULL;
18265 struct tcphdr *th = NULL;
18266 struct mbuf *m = NULL;
18267 struct inpcb *inp;
18268 uint8_t *cpto;
18269 struct tcp_log_buffer *lgb;
18270 #ifdef TCP_ACCOUNTING
18271 uint64_t crtsc;
18272 int cnt_thru = 1;
18273 #endif
18274 struct tcpopt to;
18275 u_char opt[TCP_MAXOLEN];
18276 uint32_t hdrlen, optlen;
18277 int32_t pacing_delay, segsiz, max_val, tso = 0, error = 0, ulen = 0;
18278 uint16_t flags;
18279 uint32_t if_hw_tsomaxsegcount = 0, startseq;
18280 uint32_t if_hw_tsomaxsegsize;
18281 int32_t ip_sendflag = IP_NO_SND_TAG_RL;
18282
18283 #ifdef INET6
18284 struct ip6_hdr *ip6 = NULL;
18285
18286 if (rack->r_is_v6) {
18287 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
18288 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
18289 } else
18290 #endif /* INET6 */
18291 {
18292 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
18293 hdrlen = sizeof(struct tcpiphdr);
18294 }
18295 if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) {
18296 goto failed;
18297 }
18298 if (doing_tlp) {
18299 /* It's a TLP; add the flag, it may already be there but be sure */
18300 rsm->r_flags |= RACK_TLP;
18301 } else {
18302 /* If it was a TLP, it is not one on this retransmit */
18303 rsm->r_flags &= ~RACK_TLP;
18304 }
18305 startseq = rsm->r_start;
18306 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
18307 inp = rack->rc_inp;
18308 to.to_flags = 0;
18309 flags = tcp_outflags[tp->t_state];
18310 if (flags & (TH_SYN|TH_RST)) {
18311 goto failed;
18312 }
18313 if (rsm->r_flags & RACK_HAS_FIN) {
18314 /* We can't send a FIN here */
18315 goto failed;
18316 }
18317 if (flags & TH_FIN) {
18318 /* We never send a FIN */
18319 flags &= ~TH_FIN;
18320 }
18321 if (tp->t_flags & TF_RCVD_TSTMP) {
18322 to.to_tsval = ms_cts + tp->ts_offset;
18323 to.to_tsecr = tp->ts_recent;
18324 to.to_flags = TOF_TS;
18325 }
18326 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
18327 /* TCP-MD5 (RFC2385). */
18328 if (tp->t_flags & TF_SIGNATURE)
18329 to.to_flags |= TOF_SIGNATURE;
18330 #endif
18331 optlen = tcp_addoptions(&to, opt);
18332 hdrlen += optlen;
18333 udp = rack->r_ctl.fsb.udp;
18334 if (udp)
18335 hdrlen += sizeof(struct udphdr);
18336 if (rack->r_ctl.rc_pace_max_segs)
18337 max_val = rack->r_ctl.rc_pace_max_segs;
18338 else if (rack->rc_user_set_max_segs)
18339 max_val = rack->rc_user_set_max_segs * segsiz;
18340 else
18341 max_val = len;
18342 if ((tp->t_flags & TF_TSO) &&
18343 V_tcp_do_tso &&
18344 (len > segsiz) &&
18345 (tp->t_port == 0))
18346 tso = 1;
18347 #ifdef INET6
18348 if (MHLEN < hdrlen + max_linkhdr)
18349 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
18350 else
18351 #endif
18352 m = m_gethdr(M_NOWAIT, MT_DATA);
18353 if (m == NULL)
18354 goto failed;
18355 m->m_data += max_linkhdr;
18356 m->m_len = hdrlen;
18357 th = rack->r_ctl.fsb.th;
18358 /* Establish the len to send */
18359 if (len > max_val)
18360 len = max_val;
18361 if ((tso) && (len + optlen > segsiz)) {
18362 uint32_t if_hw_tsomax;
18363 int32_t max_len;
18364
18365 /* extract TSO information */
18366 if_hw_tsomax = tp->t_tsomax;
18367 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
18368 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
18369 /*
18370 * Check if we should limit by maximum payload
18371 * length:
18372 */
18373 if (if_hw_tsomax != 0) {
18374 /* compute maximum TSO length */
18375 max_len = (if_hw_tsomax - hdrlen -
18376 max_linkhdr);
18377 if (max_len <= 0) {
18378 goto failed;
18379 } else if (len > max_len) {
18380 len = max_len;
18381 }
18382 }
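/*
 * Worked example (hypothetical numbers): with if_hw_tsomax = 65535,
 * hdrlen = 52 (ip + tcp + timestamp option) and a max_linkhdr of 16,
 * max_len = 65535 - 52 - 16 = 65467, so a larger requested len would
 * be trimmed to that before being handed to the TSO engine.
 */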
18383 if (len <= segsiz) {
18384 /*
18385 * In case there are too many small fragments don't
18386 * use TSO:
18387 */
18388 tso = 0;
18389 }
18390 } else {
18391 tso = 0;
18392 }
18393 if ((tso == 0) && (len > segsiz))
18394 len = segsiz;
18395 (void)tcp_get_usecs(tv);
18396 if ((len == 0) ||
18397 (len <= MHLEN - hdrlen - max_linkhdr)) {
18398 goto failed;
18399 }
18400 th->th_seq = htonl(rsm->r_start);
18401 th->th_ack = htonl(tp->rcv_nxt);
18402 /*
18403 * The PUSH bit should only be applied
18404 * if the full retransmission is made. If
18405 * we are sending less, then this is the
18406 * left hand edge and should not have
18407 * the PUSH bit.
18408 */
18409 if ((rsm->r_flags & RACK_HAD_PUSH) &&
18410 (len == (rsm->r_end - rsm->r_start)))
18411 flags |= TH_PUSH;
18412 th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale));
18413 if (th->th_win == 0) {
18414 tp->t_sndzerowin++;
18415 tp->t_flags |= TF_RXWIN0SENT;
18416 } else
18417 tp->t_flags &= ~TF_RXWIN0SENT;
18418 if (rsm->r_flags & RACK_TLP) {
18419 /*
18420 * TLP should not count in retran count, but
18421 * in its own bin
18422 */
18423 counter_u64_add(rack_tlp_retran, 1);
18424 counter_u64_add(rack_tlp_retran_bytes, len);
18425 } else {
18426 tp->t_sndrexmitpack++;
18427 KMOD_TCPSTAT_INC(tcps_sndrexmitpack);
18428 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len);
18429 }
18430 #ifdef STATS
18431 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
18432 len);
18433 #endif
18434 if (rsm->m == NULL)
18435 goto failed;
18436 if (rsm->m &&
18437 ((rsm->orig_m_len != rsm->m->m_len) ||
18438 (M_TRAILINGROOM(rsm->m) != rsm->orig_t_space))) {
18439 /* Fix up the orig_m_len and possibly the mbuf offset */
18440 rack_adjust_orig_mlen(rsm);
18441 }
18442 m->m_next = rack_fo_base_copym(rsm->m, rsm->soff, &len, NULL, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, rsm->r_hw_tls);
18443 if (len <= segsiz) {
18444 /*
18445 * Must have run out of mbufs for the copy;
18446 * shorten it to no longer need tso. Let's
18447 * not put on sendalot since we are low on
18448 * mbufs.
18449 */
18450 tso = 0;
18451 }
18452 if ((m->m_next == NULL) || (len <= 0)){
18453 goto failed;
18454 }
18455 if (udp) {
18456 if (rack->r_is_v6)
18457 ulen = hdrlen + len - sizeof(struct ip6_hdr);
18458 else
18459 ulen = hdrlen + len - sizeof(struct ip);
18460 udp->uh_ulen = htons(ulen);
18461 }
18462 m->m_pkthdr.rcvif = (struct ifnet *)0;
18463 if (TCPS_HAVERCVDSYN(tp->t_state) &&
18464 (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) {
18465 int ect = tcp_ecn_output_established(tp, &flags, len, true);
18466 if ((tp->t_state == TCPS_SYN_RECEIVED) &&
18467 (tp->t_flags2 & TF2_ECN_SND_ECE))
18468 tp->t_flags2 &= ~TF2_ECN_SND_ECE;
18469 #ifdef INET6
18470 if (rack->r_is_v6) {
18471 ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20);
18472 ip6->ip6_flow |= htonl(ect << 20);
18473 }
18474 else
18475 #endif
18476 {
18477 ip->ip_tos &= ~IPTOS_ECN_MASK;
18478 ip->ip_tos |= ect;
18479 }
18480 }
18481 if (rack->r_ctl.crte != NULL) {
18482 /* See if we can send via the hw queue */
18483 pacing_delay = rack_check_queue_level(rack, tp, tv, cts, len, segsiz);
18484 /* If there is nothing in queue (no pacing time) we can send via the hw queue */
18485 if (pacing_delay == 0)
18486 ip_sendflag = 0;
18487 }
18488 tcp_set_flags(th, flags);
18489 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
18490 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
18491 if (to.to_flags & TOF_SIGNATURE) {
18492 /*
18493 * Calculate MD5 signature and put it into the place
18494 * determined before.
18495 * NOTE: since TCP options buffer doesn't point into
18496 * mbuf's data, calculate offset and use it.
18497 */
18498 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th,
18499 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) {
18500 /*
18501 * Do not send segment if the calculation of MD5
18502 * digest has failed.
18503 */
18504 goto failed;
18505 }
18506 }
18507 #endif
18508 #ifdef INET6
18509 if (rack->r_is_v6) {
18510 if (tp->t_port) {
18511 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
18512 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
18513 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
18514 th->th_sum = htons(0);
18515 UDPSTAT_INC(udps_opackets);
18516 } else {
18517 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
18518 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
18519 th->th_sum = in6_cksum_pseudo(ip6,
18520 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
18521 0);
18522 }
18523 }
18524 #endif
18525 #if defined(INET6) && defined(INET)
18526 else
18527 #endif
18528 #ifdef INET
18529 {
18530 if (tp->t_port) {
18531 m->m_pkthdr.csum_flags = CSUM_UDP;
18532 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
18533 udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
18534 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
18535 th->th_sum = htons(0);
18536 UDPSTAT_INC(udps_opackets);
18537 } else {
18538 m->m_pkthdr.csum_flags = CSUM_TCP;
18539 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
18540 th->th_sum = in_pseudo(ip->ip_src.s_addr,
18541 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
18542 IPPROTO_TCP + len + optlen));
18543 }
18544 /* IP version must be set here for ipv4/ipv6 checking later */
18545 KASSERT(ip->ip_v == IPVERSION,
18546 ("%s: IP version incorrect: %d", __func__, ip->ip_v));
18547 }
18548 #endif
18549 if (tso) {
18550 /*
18551 * Here we use segsiz since we have no added options besides
18552 * any standard timestamp options (no DSACKs or SACKS are sent
18553 * via either fast-path).
18554 */
18555 KASSERT(len > segsiz,
18556 ("%s: len <= tso_segsz tp:%p", __func__, tp));
18557 m->m_pkthdr.csum_flags |= CSUM_TSO;
18558 m->m_pkthdr.tso_segsz = segsiz;
18559 }
18560 #ifdef INET6
18561 if (rack->r_is_v6) {
18562 ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit;
18563 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
18564 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
18565 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
18566 else
18567 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
18568 }
18569 #endif
18570 #if defined(INET) && defined(INET6)
18571 else
18572 #endif
18573 #ifdef INET
18574 {
18575 ip->ip_len = htons(m->m_pkthdr.len);
18576 ip->ip_ttl = rack->r_ctl.fsb.hoplimit;
18577 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
18578 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
18579 if (tp->t_port == 0 || len < V_tcp_minmss) {
18580 ip->ip_off |= htons(IP_DF);
18581 }
18582 } else {
18583 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
18584 }
18585 }
18586 #endif
18587 if (doing_tlp == 0) {
18588 /* Set we retransmitted */
18589 rack->rc_gp_saw_rec = 1;
18590 } else {
18591 /* Its a TLP set ca or ss */
18592 if (tp->snd_cwnd > tp->snd_ssthresh) {
18593 /* Set we sent in CA */
18594 rack->rc_gp_saw_ca = 1;
18595 } else {
18596 /* Set we sent in SS */
18597 rack->rc_gp_saw_ss = 1;
18598 }
18599 }
18600 /* Time to copy in our header */
18601 cpto = mtod(m, uint8_t *);
18602 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len);
18603 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr));
18604 if (optlen) {
18605 bcopy(opt, th + 1, optlen);
18606 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
18607 } else {
18608 th->th_off = sizeof(struct tcphdr) >> 2;
18609 }
18610 if (tcp_bblogging_on(rack->rc_tp)) {
18611 union tcp_log_stackspecific log;
18612
18613 if (rsm->r_flags & RACK_RWND_COLLAPSED) {
18614 rack_log_collapse(rack, rsm->r_start, rsm->r_end, 0, __LINE__, 5, rsm->r_flags, rsm);
18615 counter_u64_add(rack_collapsed_win_rxt, 1);
18616 counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start));
18617 }
18618 memset(&log, 0, sizeof(log));
18619 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
18620 if (rack->rack_no_prr)
18621 log.u_bbr.flex1 = 0;
18622 else
18623 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
18624 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
18625 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
18626 log.u_bbr.flex4 = max_val;
18627 /* Save off the early/late values */
18628 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
18629 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
18630 log.u_bbr.bw_inuse = rack_get_bw(rack);
18631 log.u_bbr.cur_del_rate = rack->r_ctl.gp_bw;
18632 if (doing_tlp == 0)
18633 log.u_bbr.flex8 = 1;
18634 else
18635 log.u_bbr.flex8 = 2;
18636 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL);
18637 log.u_bbr.flex7 = 55;
18638 log.u_bbr.pkts_out = tp->t_maxseg;
18639 log.u_bbr.timeStamp = cts;
18640 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
18641 if (rsm->r_rtr_cnt > 0) {
18642 /*
18643 * When we have a retransmit we want to log the
18644 * burst at send and flight at send from before.
18645 */
18646 log.u_bbr.flex5 = rsm->r_fas;
18647 log.u_bbr.bbr_substate = rsm->r_bas;
18648 } else {
18649 /*
18650 * This is currently unlikely until we do the
18651 * packet pair probes but I will add it for completeness.
18652 */
18653 log.u_bbr.flex5 = log.u_bbr.inflight;
18654 log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz);
18655 }
18656 log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use;
18657 log.u_bbr.delivered = 0;
18658 log.u_bbr.rttProp = (uintptr_t)rsm;
18659 log.u_bbr.delRate = rsm->r_flags;
18660 log.u_bbr.delRate <<= 31;
18661 log.u_bbr.delRate |= rack->r_must_retran;
18662 log.u_bbr.delRate <<= 1;
18663 log.u_bbr.delRate |= 1;
18664 log.u_bbr.pkt_epoch = __LINE__;
18665 lgb = tcp_log_event(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK,
18666 len, &log, false, NULL, __func__, __LINE__, tv);
18667 } else
18668 lgb = NULL;
18669 if ((rack->r_ctl.crte != NULL) &&
18670 tcp_bblogging_on(tp)) {
18671 rack_log_queue_level(tp, rack, len, tv, cts);
18672 }
18673 #ifdef INET6
18674 if (rack->r_is_v6) {
18675 error = ip6_output(m, inp->in6p_outputopts,
18676 &inp->inp_route6,
18677 ip_sendflag, NULL, NULL, inp);
18678 }
18679 else
18680 #endif
18681 #ifdef INET
18682 {
18683 error = ip_output(m, NULL,
18684 &inp->inp_route,
18685 ip_sendflag, 0, inp);
18686 }
18687 #endif
18688 m = NULL;
18689 if (lgb) {
18690 lgb->tlb_errno = error;
18691 lgb = NULL;
18692 }
18693 /* Move snd_nxt to snd_max so we don't have false retransmissions */
18694 tp->snd_nxt = tp->snd_max;
18695 if (error) {
18696 goto failed;
18697 } else if (rack->rc_hw_nobuf && (ip_sendflag != IP_NO_SND_TAG_RL)) {
18698 rack->rc_hw_nobuf = 0;
18699 rack->r_ctl.rc_agg_delayed = 0;
18700 rack->r_early = 0;
18701 rack->r_late = 0;
18702 rack->r_ctl.rc_agg_early = 0;
18703 }
18704 rack_log_output(tp, &to, len, rsm->r_start, flags, error, rack_to_usec_ts(tv),
18705 rsm, RACK_SENT_FP, rsm->m, rsm->soff, rsm->r_hw_tls, segsiz);
18706 if (doing_tlp) {
18707 rack->rc_tlp_in_progress = 1;
18708 rack->r_ctl.rc_tlp_cnt_out++;
18709 }
18710 if (error == 0) {
18711 counter_u64_add(rack_total_bytes, len);
18712 tcp_account_for_send(tp, len, 1, doing_tlp, rsm->r_hw_tls);
18713 if (doing_tlp) {
18714 rack->rc_last_sent_tlp_past_cumack = 0;
18715 rack->rc_last_sent_tlp_seq_valid = 1;
18716 rack->r_ctl.last_sent_tlp_seq = rsm->r_start;
18717 rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start;
18718 }
18719 if (rack->r_ctl.rc_prr_sndcnt >= len)
18720 rack->r_ctl.rc_prr_sndcnt -= len;
18721 else
18722 rack->r_ctl.rc_prr_sndcnt = 0;
18723 }
18724 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
18725 rack->forced_ack = 0; /* If we send something zap the FA flag */
18726 if (IN_FASTRECOVERY(tp->t_flags) && rsm)
18727 rack->r_ctl.retran_during_recovery += len;
18728 {
18729 int idx;
18730
18731 idx = (len / segsiz) + 3;
18732 if (idx >= TCP_MSS_ACCT_ATIMER)
18733 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
18734 else
18735 counter_u64_add(rack_out_size[idx], 1);
18736 }
18737 if (tp->t_rtttime == 0) {
18738 tp->t_rtttime = ticks;
18739 tp->t_rtseq = startseq;
18740 KMOD_TCPSTAT_INC(tcps_segstimed);
18741 }
18742 counter_u64_add(rack_fto_rsm_send, 1);
18743 if (error && (error == ENOBUFS)) {
18744 if (rack->r_ctl.crte != NULL) {
18745 tcp_trace_point(rack->rc_tp, TCP_TP_HWENOBUF);
18746 if (tcp_bblogging_on(rack->rc_tp))
18747 rack_log_queue_level(tp, rack, len, tv, cts);
18748 } else
18749 tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF);
18750 pacing_delay = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
18751 if (rack->rc_enobuf < 0x7f)
18752 rack->rc_enobuf++;
18753 if (pacing_delay < (10 * HPTS_USEC_IN_MSEC))
18754 pacing_delay = 10 * HPTS_USEC_IN_MSEC;
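/*
 * Worked example: on the first ENOBUFS (rc_enobuf == 0) the formula
 * above yields 1 ms and is raised to the 10 ms floor; once the
 * counter has crept up to, say, 12, the backoff is 13 ms and the
 * floor no longer applies. The counter saturates at 0x7f, bounding
 * the backoff at 128 ms.
 */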
18755 if (rack->r_ctl.crte != NULL) {
18756 counter_u64_add(rack_saw_enobuf_hw, 1);
18757 tcp_rl_log_enobuf(rack->r_ctl.crte);
18758 }
18759 counter_u64_add(rack_saw_enobuf, 1);
18760 } else {
18761 pacing_delay = rack_get_pacing_delay(rack, tp, len, NULL, segsiz, __LINE__);
18762 }
18763 rack_start_hpts_timer(rack, tp, cts, pacing_delay, len, 0);
18764 #ifdef TCP_ACCOUNTING
18765 crtsc = get_cyclecount();
18766 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
18767 tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru;
18768 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val);
18769 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((len + segsiz - 1) / segsiz);
18770 }
18771 sched_unpin();
18772 #endif
18773 return (0);
18774 failed:
18775 if (m)
18776 m_free(m);
18777 return (-1);
18778 }
18779
18780 static void
18781 rack_sndbuf_autoscale(struct tcp_rack *rack)
18782 {
18783 /*
18784 * Automatic sizing of send socket buffer. Often the send buffer
18785 * size is not optimally adjusted to the actual network conditions
18786 * at hand (delay bandwidth product). Setting the buffer size too
18787 * small limits throughput on links with high bandwidth and high
18788 * delay (e.g. trans-continental/oceanic links). Setting the
18789 * buffer size too big consumes too much real kernel memory,
18790 * especially with many connections on busy servers.
18791 *
18792 * The criteria to step up the send buffer one notch are:
18793 * 1. receive window of remote host is larger than send buffer
18794 * (with a fudge factor of 5/4th);
18795 * 2. send buffer is filled to 7/8th with data (so we actually
18796 * have data to make use of it);
18797 * 3. send buffer fill has not hit maximal automatic size;
18798 * 4. our send window (slow start and congestion controlled) is
18799 * larger than sent but unacknowledged data in send buffer.
18800 *
18801 * Note that the rack version moves things much faster since
18802 * we want to avoid hitting cache lines in the rack_fast_output()
18803 * path, so this is called much less often and thus moves
18804 * the SB forward by a percentage.
18805 */
18806 struct socket *so;
18807 struct tcpcb *tp;
18808 uint32_t sendwin, scaleup;
18809
18810 tp = rack->rc_tp;
18811 so = rack->rc_inp->inp_socket;
18812 sendwin = min(rack->r_ctl.cwnd_to_use, tp->snd_wnd);
18813 if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
18814 if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
18815 sbused(&so->so_snd) >=
18816 (so->so_snd.sb_hiwat / 8 * 7) &&
18817 sbused(&so->so_snd) < V_tcp_autosndbuf_max &&
18818 sendwin >= (sbused(&so->so_snd) -
18819 (tp->snd_max - tp->snd_una))) {
18820 if (rack_autosndbuf_inc)
18821 scaleup = (rack_autosndbuf_inc * so->so_snd.sb_hiwat) / 100;
18822 else
18823 scaleup = V_tcp_autosndbuf_inc;
18824 if (scaleup < V_tcp_autosndbuf_inc)
18825 scaleup = V_tcp_autosndbuf_inc;
18826 scaleup += so->so_snd.sb_hiwat;
18827 if (scaleup > V_tcp_autosndbuf_max)
18828 scaleup = V_tcp_autosndbuf_max;
18829 if (!sbreserve_locked(so, SO_SND, scaleup, curthread))
18830 so->so_snd.sb_flags &= ~SB_AUTOSIZE;
18831 }
18832 }
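/*
 * Worked example (hypothetical numbers): with sb_hiwat = 65536 and
 * rack_autosndbuf_inc = 20 (percent), scaleup = 13107; assuming that
 * exceeds V_tcp_autosndbuf_inc it is kept, added to the current
 * hiwat for a target of 78643 bytes, and then capped at
 * V_tcp_autosndbuf_max before sbreserve_locked() applies it. Each
 * pass therefore grows the send buffer by roughly the configured
 * percentage rather than a fixed step.
 */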
18833 }
18834
18835 static int
18836 rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val,
18837 uint32_t cts, uint32_t ms_cts, struct timeval *tv, long *tot_len, int *send_err, int line)
18838 {
18839 /*
18840 * Enter to do fast output. We are given that the sched_pin is
18841 * in place (if accounting is compiled in) and the cycle count taken
18842 * at entry is in place in ts_val. The idea here is that
18843 * we know how many more bytes need to be sent (presumably either
18844 * during pacing or to fill the cwnd and that was greater than
18845 * the max-burst). We have how much to send and all the info we
18846 * need to just send.
18847 */
18848 #ifdef INET
18849 struct ip *ip = NULL;
18850 #endif
18851 struct udphdr *udp = NULL;
18852 struct tcphdr *th = NULL;
18853 struct mbuf *m, *s_mb;
18854 struct inpcb *inp;
18855 uint8_t *cpto;
18856 struct tcp_log_buffer *lgb;
18857 #ifdef TCP_ACCOUNTING
18858 uint64_t crtsc;
18859 #endif
18860 struct tcpopt to;
18861 u_char opt[TCP_MAXOLEN];
18862 uint32_t hdrlen, optlen;
18863 #ifdef TCP_ACCOUNTING
18864 int cnt_thru = 1;
18865 #endif
18866 int32_t pacing_delay, segsiz, len, max_val, tso = 0, sb_offset, error, ulen = 0;
18867 uint16_t flags;
18868 uint32_t s_soff;
18869 uint32_t if_hw_tsomaxsegcount = 0, startseq;
18870 uint32_t if_hw_tsomaxsegsize;
18871 uint32_t add_flag = RACK_SENT_FP;
18872 #ifdef INET6
18873 struct ip6_hdr *ip6 = NULL;
18874
18875 if (rack->r_is_v6) {
18876 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
18877 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
18878 } else
18879 #endif /* INET6 */
18880 {
18881 #ifdef INET
18882 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
18883 hdrlen = sizeof(struct tcpiphdr);
18884 #endif
18885 }
18886 if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) {
18887 m = NULL;
18888 goto failed;
18889 }
18890 rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
18891 startseq = tp->snd_max;
18892 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
18893 inp = rack->rc_inp;
18894 len = rack->r_ctl.fsb.left_to_send;
18895 to.to_flags = 0;
18896 flags = rack->r_ctl.fsb.tcp_flags;
18897 if (tp->t_flags & TF_RCVD_TSTMP) {
18898 to.to_tsval = ms_cts + tp->ts_offset;
18899 to.to_tsecr = tp->ts_recent;
18900 to.to_flags = TOF_TS;
18901 }
18902 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
18903 /* TCP-MD5 (RFC2385). */
18904 if (tp->t_flags & TF_SIGNATURE)
18905 to.to_flags |= TOF_SIGNATURE;
18906 #endif
18907 optlen = tcp_addoptions(&to, opt);
18908 hdrlen += optlen;
18909 udp = rack->r_ctl.fsb.udp;
18910 if (udp)
18911 hdrlen += sizeof(struct udphdr);
18912 if (rack->r_ctl.rc_pace_max_segs)
18913 max_val = rack->r_ctl.rc_pace_max_segs;
18914 else if (rack->rc_user_set_max_segs)
18915 max_val = rack->rc_user_set_max_segs * segsiz;
18916 else
18917 max_val = len;
18918 if ((tp->t_flags & TF_TSO) &&
18919 V_tcp_do_tso &&
18920 (len > segsiz) &&
18921 (tp->t_port == 0))
18922 tso = 1;
18923 again:
18924 #ifdef INET6
18925 if (MHLEN < hdrlen + max_linkhdr)
18926 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
18927 else
18928 #endif
18929 m = m_gethdr(M_NOWAIT, MT_DATA);
18930 if (m == NULL)
18931 goto failed;
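/*
 * Leave room for the link-layer header and reserve space in this
 * mbuf for the cached IP/TCP header template (hdrlen bytes).
 */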
18932 m->m_data += max_linkhdr;
18933 m->m_len = hdrlen;
18934 th = rack->r_ctl.fsb.th;
18935 /* Establish the len to send */
18936 if (len > max_val)
18937 len = max_val;
18938 if ((tso) && (len + optlen > segsiz)) {
18939 uint32_t if_hw_tsomax;
18940 int32_t max_len;
18941
18942 /* extract TSO information */
18943 if_hw_tsomax = tp->t_tsomax;
18944 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
18945 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
18946 /*
18947 * Check if we should limit by maximum payload
18948 * length:
18949 */
18950 if (if_hw_tsomax != 0) {
18951 /* compute maximum TSO length */
18952 max_len = (if_hw_tsomax - hdrlen -
18953 max_linkhdr);
18954 if (max_len <= 0) {
18955 goto failed;
18956 } else if (len > max_len) {
18957 len = max_len;
18958 }
18959 }
18960 if (len <= segsiz) {
18961 /*
18962 * In case there are too many small fragments don't
18963 * use TSO:
18964 */
18965 tso = 0;
18966 }
18967 } else {
18968 tso = 0;
18969 }
18970 if ((tso == 0) && (len > segsiz))
18971 len = segsiz;
18972 (void)tcp_get_usecs(tv);
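/*
 * Sends that are empty, or small enough to have fit in the header
 * mbuf, are not worth the fast path; give up and let the regular
 * output path handle them.
 */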
18973 if ((len == 0) ||
18974 (len <= MHLEN - hdrlen - max_linkhdr)) {
18975 goto failed;
18976 }
18977 sb_offset = tp->snd_max - tp->snd_una;
18978 th->th_seq = htonl(tp->snd_max);
18979 th->th_ack = htonl(tp->rcv_nxt);
18980 th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale));
18981 if (th->th_win == 0) {
18982 tp->t_sndzerowin++;
18983 tp->t_flags |= TF_RXWIN0SENT;
18984 } else
18985 tp->t_flags &= ~TF_RXWIN0SENT;
18986 tp->snd_up = tp->snd_una; /* drag it along, it's deprecated */
18987 KMOD_TCPSTAT_INC(tcps_sndpack);
18988 KMOD_TCPSTAT_ADD(tcps_sndbyte, len);
18989 #ifdef STATS
18990 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
18991 len);
18992 #endif
18993 if (rack->r_ctl.fsb.m == NULL)
18994 goto failed;
18995
18996 /* s_mb and s_soff are saved for rack_log_output */
18997 m->m_next = rack_fo_m_copym(rack, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize,
18998 &s_mb, &s_soff);
18999 if (len <= segsiz) {
19000 /*
19001 * Must have run out of mbufs for the copy;
19002 * shorten it so we no longer need TSO. Let's
19003 * not set sendalot since we are low on
19004 * mbufs.
19005 */
19006 tso = 0;
19007 }
19008 if (rack->r_ctl.fsb.rfo_apply_push &&
19009 (len == rack->r_ctl.fsb.left_to_send)) {
19010 flags |= TH_PUSH;
19011 add_flag |= RACK_HAD_PUSH;
19012 }
19013 if ((m->m_next == NULL) || (len <= 0)){
19014 goto failed;
19015 }
19016 if (udp) {
19017 if (rack->r_is_v6)
19018 ulen = hdrlen + len - sizeof(struct ip6_hdr);
19019 else
19020 ulen = hdrlen + len - sizeof(struct ip);
19021 udp->uh_ulen = htons(ulen);
19022 }
19023 m->m_pkthdr.rcvif = (struct ifnet *)0;
19024 if (TCPS_HAVERCVDSYN(tp->t_state) &&
19025 (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) {
19026 int ect = tcp_ecn_output_established(tp, &flags, len, false);
19027 if ((tp->t_state == TCPS_SYN_RECEIVED) &&
19028 (tp->t_flags2 & TF2_ECN_SND_ECE))
19029 tp->t_flags2 &= ~TF2_ECN_SND_ECE;
19030 #ifdef INET6
19031 if (rack->r_is_v6) {
19032 ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20);
19033 ip6->ip6_flow |= htonl(ect << 20);
19034 }
19035 else
19036 #endif
19037 {
19038 #ifdef INET
19039 ip->ip_tos &= ~IPTOS_ECN_MASK;
19040 ip->ip_tos |= ect;
19041 #endif
19042 }
19043 }
19044 tcp_set_flags(th, flags);
19045 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
19046 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
19047 if (to.to_flags & TOF_SIGNATURE) {
19048 /*
19049 * Calculate MD5 signature and put it into the place
19050 * determined before.
19051 * NOTE: since TCP options buffer doesn't point into
19052 * mbuf's data, calculate offset and use it.
19053 */
19054 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th,
19055 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) {
19056 /*
19057 * Do not send segment if the calculation of MD5
19058 * digest has failed.
19059 */
19060 goto failed;
19061 }
19062 }
19063 #endif
19064 #ifdef INET6
19065 if (rack->r_is_v6) {
19066 if (tp->t_port) {
19067 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
19068 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
19069 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
19070 th->th_sum = htons(0);
19071 UDPSTAT_INC(udps_opackets);
19072 } else {
19073 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
19074 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
19075 th->th_sum = in6_cksum_pseudo(ip6,
19076 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
19077 0);
19078 }
19079 }
19080 #endif
19081 #if defined(INET6) && defined(INET)
19082 else
19083 #endif
19084 #ifdef INET
19085 {
19086 if (tp->t_port) {
19087 m->m_pkthdr.csum_flags = CSUM_UDP;
19088 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
19089 udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
19090 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
19091 th->th_sum = htons(0);
19092 UDPSTAT_INC(udps_opackets);
19093 } else {
19094 m->m_pkthdr.csum_flags = CSUM_TCP;
19095 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
19096 th->th_sum = in_pseudo(ip->ip_src.s_addr,
19097 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
19098 IPPROTO_TCP + len + optlen));
19099 }
19100 /* IP version must be set here for ipv4/ipv6 checking later */
19101 KASSERT(ip->ip_v == IPVERSION,
19102 ("%s: IP version incorrect: %d", __func__, ip->ip_v));
19103 }
19104 #endif
19105 if (tso) {
19106 /*
19107 * Here we use segsiz since we have no added options besides
19108 * any standard timestamp options (no DSACKs or SACKs are sent
19109 * via either fast-path).
19110 */
19111 KASSERT(len > segsiz,
19112 ("%s: len <= tso_segsz tp:%p", __func__, tp));
19113 m->m_pkthdr.csum_flags |= CSUM_TSO;
19114 m->m_pkthdr.tso_segsz = segsiz;
19115 }
19116 #ifdef INET6
19117 if (rack->r_is_v6) {
19118 ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit;
19119 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
19120 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
19121 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
19122 else
19123 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
19124 }
19125 #endif
19126 #if defined(INET) && defined(INET6)
19127 else
19128 #endif
19129 #ifdef INET
19130 {
19131 ip->ip_len = htons(m->m_pkthdr.len);
19132 ip->ip_ttl = rack->r_ctl.fsb.hoplimit;
19133 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
19134 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
19135 if (tp->t_port == 0 || len < V_tcp_minmss) {
19136 ip->ip_off |= htons(IP_DF);
19137 }
19138 } else {
19139 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
19140 }
19141 }
19142 #endif
19143 if (tp->snd_cwnd > tp->snd_ssthresh) {
19144 /* Set we sent in CA */
19145 rack->rc_gp_saw_ca = 1;
19146 } else {
19147 /* Set we sent in SS */
19148 rack->rc_gp_saw_ss = 1;
19149 }
19150 /* Time to copy in our header */
19151 cpto = mtod(m, uint8_t *);
19152 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len);
19153 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr));
19154 if (optlen) {
19155 bcopy(opt, th + 1, optlen);
19156 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
19157 } else {
19158 th->th_off = sizeof(struct tcphdr) >> 2;
19159 }
19160 if ((rack->r_ctl.crte != NULL) &&
19161 tcp_bblogging_on(tp)) {
19162 rack_log_queue_level(tp, rack, len, tv, cts);
19163 }
19164 if (tcp_bblogging_on(rack->rc_tp)) {
19165 union tcp_log_stackspecific log;
19166
19167 memset(&log, 0, sizeof(log));
19168 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
19169 if (rack->rack_no_prr)
19170 log.u_bbr.flex1 = 0;
19171 else
19172 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
19173 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
19174 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
19175 log.u_bbr.flex4 = max_val;
19176 /* Save off the early/late values */
19177 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
19178 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
19179 log.u_bbr.bw_inuse = rack_get_bw(rack);
19180 log.u_bbr.cur_del_rate = rack->r_ctl.gp_bw;
19181 log.u_bbr.flex8 = 0;
19182 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL);
19183 log.u_bbr.flex7 = 44;
19184 log.u_bbr.pkts_out = tp->t_maxseg;
19185 log.u_bbr.timeStamp = cts;
19186 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
19187 log.u_bbr.flex5 = log.u_bbr.inflight;
19188 log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use;
19189 log.u_bbr.delivered = rack->r_ctl.fsb.left_to_send;
19190 log.u_bbr.rttProp = 0;
19191 log.u_bbr.delRate = rack->r_must_retran;
19192 log.u_bbr.delRate <<= 1;
19193 log.u_bbr.pkt_epoch = line;
19194 /* For fast output there are no retransmits, so just inflight and how many MSS we send */
19195 log.u_bbr.flex5 = log.u_bbr.inflight;
19196 log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz);
19197 lgb = tcp_log_event(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK,
19198 len, &log, false, NULL, __func__, __LINE__, tv);
19199 } else
19200 lgb = NULL;
19201 #ifdef INET6
19202 if (rack->r_is_v6) {
19203 error = ip6_output(m, inp->in6p_outputopts,
19204 &inp->inp_route6,
19205 0, NULL, NULL, inp);
19206 }
19207 #endif
19208 #if defined(INET) && defined(INET6)
19209 else
19210 #endif
19211 #ifdef INET
19212 {
19213 error = ip_output(m, NULL,
19214 &inp->inp_route,
19215 0, 0, inp);
19216 }
19217 #endif
19218 if (lgb) {
19219 lgb->tlb_errno = error;
19220 lgb = NULL;
19221 }
19222 if (error) {
19223 *send_err = error;
19224 m = NULL;
19225 goto failed;
19226 } else if (rack->rc_hw_nobuf) {
19227 rack->rc_hw_nobuf = 0;
19228 rack->r_ctl.rc_agg_delayed = 0;
19229 rack->r_early = 0;
19230 rack->r_late = 0;
19231 rack->r_ctl.rc_agg_early = 0;
19232 }
19233 if ((error == 0) && (rack->lt_bw_up == 0)) {
19234 /* Unlikely */
19235 rack->r_ctl.lt_timemark = tcp_tv_to_lusec(tv);
19236 rack->r_ctl.lt_seq = tp->snd_una;
19237 rack->lt_bw_up = 1;
19238 } else if ((error == 0) &&
19239 (((tp->snd_max + len) - rack->r_ctl.lt_seq) > 0x7fffffff)) {
19240 /*
19241 * Need to record what we have since we are
19242 * approaching seq wrap.
19243 */
19244 struct timeval tv;
19245 uint64_t tmark;
19246
19247 rack->r_ctl.lt_bw_bytes += (tp->snd_una - rack->r_ctl.lt_seq);
19248 rack->r_ctl.lt_seq = tp->snd_una;
19249 tmark = tcp_get_u64_usecs(&tv);
19250 if (tmark > rack->r_ctl.lt_timemark) {
19251 rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark);
19252 rack->r_ctl.lt_timemark = tmark;
19253 }
19254 }
19255 rack_log_output(tp, &to, len, tp->snd_max, flags, error, rack_to_usec_ts(tv),
19256 NULL, add_flag, s_mb, s_soff, rack->r_ctl.fsb.hw_tls, segsiz);
19257 if (tp->snd_una == tp->snd_max) {
19258 rack->r_ctl.rc_tlp_rxt_last_time = cts;
19259 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
19260 tp->t_acktime = ticks;
19261 }
19262 counter_u64_add(rack_total_bytes, len);
19263 tcp_account_for_send(tp, len, 0, 0, rack->r_ctl.fsb.hw_tls);
19264
19265 rack->forced_ack = 0; /* If we send something zap the FA flag */
19266 *tot_len += len;
19267 if ((tp->t_flags & TF_GPUTINPROG) == 0)
19268 rack_start_gp_measurement(tp, rack, tp->snd_max, sb_offset);
19269 tp->snd_max += len;
19270 tp->snd_nxt = tp->snd_max;
19271 if (rack->rc_new_rnd_needed) {
19272 rack_new_round_starts(tp, rack, tp->snd_max);
19273 }
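/* Bucket this send by its size in segments for the rack_out_size counters. */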
19274 {
19275 int idx;
19276
19277 idx = (len / segsiz) + 3;
19278 if (idx >= TCP_MSS_ACCT_ATIMER)
19279 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
19280 else
19281 counter_u64_add(rack_out_size[idx], 1);
19282 }
19283 if (len <= rack->r_ctl.fsb.left_to_send)
19284 rack->r_ctl.fsb.left_to_send -= len;
19285 else
19286 rack->r_ctl.fsb.left_to_send = 0;
19287 if (rack->r_ctl.fsb.left_to_send < segsiz) {
19288 rack->r_fast_output = 0;
19289 rack->r_ctl.fsb.left_to_send = 0;
19290 /* At the end of fast_output scale up the sb */
19291 SOCK_SENDBUF_LOCK(rack->rc_inp->inp_socket);
19292 rack_sndbuf_autoscale(rack);
19293 SOCK_SENDBUF_UNLOCK(rack->rc_inp->inp_socket);
19294 }
19295 if (tp->t_rtttime == 0) {
19296 tp->t_rtttime = ticks;
19297 tp->t_rtseq = startseq;
19298 KMOD_TCPSTAT_INC(tcps_segstimed);
19299 }
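/*
 * If at least a full segment remains, we still have room under both
 * the per-call cap (max_val) and the pacing budget, and TSO is not
 * batching for us, loop back and emit another segment in this pass.
 */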
19300 if ((rack->r_ctl.fsb.left_to_send >= segsiz) &&
19301 (max_val > len) &&
19302 (*tot_len < rack->r_ctl.rc_pace_max_segs) &&
19303 (tso == 0)) {
19304 max_val -= len;
19305 len = segsiz;
19306 th = rack->r_ctl.fsb.th;
19307 #ifdef TCP_ACCOUNTING
19308 cnt_thru++;
19309 #endif
19310 goto again;
19311 }
19312 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
19313 counter_u64_add(rack_fto_send, 1);
19314 pacing_delay = rack_get_pacing_delay(rack, tp, *tot_len, NULL, segsiz, __LINE__);
19315 rack_start_hpts_timer(rack, tp, cts, pacing_delay, *tot_len, 0);
19316 #ifdef TCP_ACCOUNTING
19317 crtsc = get_cyclecount();
19318 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19319 tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru;
19320 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val);
19321 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((*tot_len + segsiz - 1) / segsiz);
19322 }
19323 sched_unpin();
19324 #endif
19325 return (0);
19326 failed:
19327 if (m)
19328 m_free(m);
19329 rack->r_fast_output = 0;
19330 return (-1);
19331 }
19332
19333 static inline void
19334 rack_setup_fast_output(struct tcpcb *tp, struct tcp_rack *rack,
19335 struct sockbuf *sb,
19336 int len, int orig_len, int segsiz, uint32_t pace_max_seg,
19337 bool hw_tls,
19338 uint16_t flags)
19339 {
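/*
 * Prime the fast send block (fsb) with the mbuf position just past
 * the data already outstanding and with how much new data remains,
 * so that subsequent rack_fast_output() calls can transmit without
 * re-walking the socket buffer.
 */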
19340 rack->r_fast_output = 1;
19341 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
19342 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
19343 rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(rack->r_ctl.fsb.m);
19344 rack->r_ctl.fsb.tcp_flags = flags;
19345 rack->r_ctl.fsb.left_to_send = orig_len - len;
19346 if (rack->r_ctl.fsb.left_to_send < pace_max_seg) {
19347 /* Less than a full-sized pace, let's not */
19348 rack->r_fast_output = 0;
19349 return;
19350 } else {
19351 /* Round down to the nearest pace_max_seg */
19352 rack->r_ctl.fsb.left_to_send = rounddown(rack->r_ctl.fsb.left_to_send, pace_max_seg);
19353 }
19354 if (hw_tls)
19355 rack->r_ctl.fsb.hw_tls = 1;
19356 else
19357 rack->r_ctl.fsb.hw_tls = 0;
19358 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))),
19359 ("rack:%p left_to_send:%u sbavail:%u out:%u",
19360 rack, rack->r_ctl.fsb.left_to_send, sbavail(sb),
19361 (tp->snd_max - tp->snd_una)));
19362 if (rack->r_ctl.fsb.left_to_send < segsiz)
19363 rack->r_fast_output = 0;
19364 else {
19365 if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una)))
19366 rack->r_ctl.fsb.rfo_apply_push = 1;
19367 else
19368 rack->r_ctl.fsb.rfo_apply_push = 0;
19369 }
19370 }
19371
19372 static uint32_t
19373 rack_get_hpts_pacing_min_for_bw(struct tcp_rack *rack, int32_t segsiz)
19374 {
19375 uint64_t min_time;
19376 uint32_t maxlen;
19377
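/*
 * A rough sketch of the math, assuming gp_bw is the goodput estimate
 * in bytes per second: return the amount of data that estimate can
 * move during the hpts minimum sleep time, rounded up to a whole
 * segment. Anything smaller per pacer wakeup could not sustain the
 * estimated rate.
 */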
19378 min_time = (uint64_t)get_hpts_min_sleep_time();
19379 maxlen = (uint32_t)((rack->r_ctl.gp_bw * min_time) / (uint64_t)HPTS_USEC_IN_SEC);
19380 maxlen = roundup(maxlen, segsiz);
19381 return (maxlen);
19382 }
19383
19384 static struct rack_sendmap *
19385 rack_check_collapsed(struct tcp_rack *rack, uint32_t cts)
19386 {
19387 struct rack_sendmap *rsm = NULL;
19388 int thresh;
19389
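/*
 * Walk forward from the last collapse point looking for the first
 * still-collapsed block that fits inside the peer's window and has
 * waited at least the RACK reorder threshold since it was last sent.
 */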
19390 restart:
19391 rsm = tqhash_find(rack->r_ctl.tqh, rack->r_ctl.last_collapse_point);
19392 if ((rsm == NULL) || ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0)) {
19393 /* Nothing there (strange); turn off validity */
19394 rack->r_collapse_point_valid = 0;
19395 return (NULL);
19396 }
19397 /* Can we send it yet? */
19398 if (rsm->r_end > (rack->rc_tp->snd_una + rack->rc_tp->snd_wnd)) {
19399 /*
19400 * Receiver window has not grown enough for
19401 * the segment to be put on the wire.
19402 */
19403 return (NULL);
19404 }
19405 if (rsm->r_flags & RACK_ACKED) {
19406 /*
19407 * It has been sacked, let's move to the
19408 * next one if possible.
19409 */
19410 rack->r_ctl.last_collapse_point = rsm->r_end;
19411 /* Are we done? */
19412 if (SEQ_GEQ(rack->r_ctl.last_collapse_point,
19413 rack->r_ctl.high_collapse_point)) {
19414 rack->r_collapse_point_valid = 0;
19415 return (NULL);
19416 }
19417 goto restart;
19418 }
19419 /* Now, has it been long enough? */
19420 thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(rack->rc_tp, rack), cts, __LINE__, 1);
19421 if ((cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])) > thresh) {
19422 rack_log_collapse(rack, rsm->r_start,
19423 (cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])),
19424 thresh, __LINE__, 6, rsm->r_flags, rsm);
19425 return (rsm);
19426 }
19427 /* Not enough time */
19428 rack_log_collapse(rack, rsm->r_start,
19429 (cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])),
19430 thresh, __LINE__, 7, rsm->r_flags, rsm);
19431 return (NULL);
19432 }
19433
19434 static inline void
19435 rack_validate_sizes(struct tcp_rack *rack, int32_t *len, int32_t segsiz, uint32_t pace_max_seg)
19436 {
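/*
 * Clamp a retransmit length according to policy: a single segment by
 * default, a length derived from the hpts pacing minimum when shaping
 * retransmits, or up to pace_max_seg when full-size retransmits are
 * enabled.
 */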
19437 if ((rack->full_size_rxt == 0) &&
19438 (rack->shape_rxt_to_pacing_min == 0) &&
19439 (*len >= segsiz)) {
19440 *len = segsiz;
19441 } else if (rack->shape_rxt_to_pacing_min &&
19442 rack->gp_ready) {
19443 /* We use pacing min as shaping len req */
19444 uint32_t maxlen;
19445
19446 maxlen = rack_get_hpts_pacing_min_for_bw(rack, segsiz);
19447 if (*len > maxlen)
19448 *len = maxlen;
19449 } else {
19450 /*
19451 * The else case is that full_size_rxt is on, so send it all.
19452 * Note we do need to check this for exceeding
19453 * our max segment size due to the fact that
19454 * we do sometimes merge chunks together, i.e.
19455 * we cannot just assume that we will never have
19456 * a chunk greater than pace_max_seg.
19457 */
19458 if (*len > pace_max_seg)
19459 *len = pace_max_seg;
19460 }
19461 }
19462
19463 static int
19464 rack_output(struct tcpcb *tp)
19465 {
19466 struct socket *so;
19467 uint32_t recwin;
19468 uint32_t sb_offset, s_moff = 0;
19469 int32_t len, error = 0;
19470 uint16_t flags;
19471 struct mbuf *m, *s_mb = NULL;
19472 struct mbuf *mb;
19473 uint32_t if_hw_tsomaxsegcount = 0;
19474 uint32_t if_hw_tsomaxsegsize;
19475 int32_t segsiz, minseg;
19476 long tot_len_this_send = 0;
19477 #ifdef INET
19478 struct ip *ip = NULL;
19479 #endif
19480 struct udphdr *udp = NULL;
19481 struct tcp_rack *rack;
19482 struct tcphdr *th;
19483 uint8_t pass = 0;
19484 uint8_t mark = 0;
19485 uint8_t check_done = 0;
19486 uint8_t wanted_cookie = 0;
19487 u_char opt[TCP_MAXOLEN];
19488 unsigned ipoptlen, optlen, hdrlen, ulen=0;
19489 uint32_t rack_seq;
19490
19491 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
19492 unsigned ipsec_optlen = 0;
19493
19494 #endif
19495 int32_t idle, sendalot;
19496 uint32_t tot_idle;
19497 int32_t sub_from_prr = 0;
19498 volatile int32_t sack_rxmit;
19499 struct rack_sendmap *rsm = NULL;
19500 int32_t tso, mtu;
19501 struct tcpopt to;
19502 int32_t pacing_delay = 0;
19503 int32_t sup_rack = 0;
19504 uint32_t cts, ms_cts, delayed, early;
19505 uint32_t add_flag = RACK_SENT_SP;
19506 /* The doing_tlp flag will be set by the actual rack_timeout_tlp() */
19507 uint8_t doing_tlp = 0;
19508 uint32_t cwnd_to_use, pace_max_seg;
19509 int32_t do_a_prefetch = 0;
19510 int32_t prefetch_rsm = 0;
19511 int32_t orig_len = 0;
19512 struct timeval tv;
19513 int32_t prefetch_so_done = 0;
19514 struct tcp_log_buffer *lgb;
19515 struct inpcb *inp = tptoinpcb(tp);
19516 struct sockbuf *sb;
19517 uint64_t ts_val = 0;
19518 #ifdef TCP_ACCOUNTING
19519 uint64_t crtsc;
19520 #endif
19521 #ifdef INET6
19522 struct ip6_hdr *ip6 = NULL;
19523 int32_t isipv6;
19524 #endif
19525 bool hpts_calling, hw_tls = false;
19526
19527 NET_EPOCH_ASSERT();
19528 INP_WLOCK_ASSERT(inp);
19529
19530 /* setup and take the cache hits here */
19531 rack = (struct tcp_rack *)tp->t_fb_ptr;
19532 #ifdef TCP_ACCOUNTING
19533 sched_pin();
19534 ts_val = get_cyclecount();
19535 #endif
19536 hpts_calling = !!(tp->t_flags2 & TF2_HPTS_CALLS);
19537 tp->t_flags2 &= ~TF2_HPTS_CALLS;
19538 #ifdef TCP_OFFLOAD
19539 if (tp->t_flags & TF_TOE) {
19540 #ifdef TCP_ACCOUNTING
19541 sched_unpin();
19542 #endif
19543 return (tcp_offload_output(tp));
19544 }
19545 #endif
19546 if (rack->rack_deferred_inited == 0) {
19547 /*
19548 * If we are the connecting socket we will
19549 * hit rack_init() when no sequence numbers
19550 * are set up. This makes it so we must defer
19551 * some initialization. Call that now.
19552 */
19553 rack_deferred_init(tp, rack);
19554 }
19555 /*
19556 * For TFO connections in SYN_RECEIVED, only allow the initial
19557 * SYN|ACK and those sent by the retransmit timer.
19558 */
19559 if ((tp->t_flags & TF_FASTOPEN) &&
19560 (tp->t_state == TCPS_SYN_RECEIVED) &&
19561 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */
19562 (rack->r_ctl.rc_resend == NULL)) { /* not a retransmit */
19563 #ifdef TCP_ACCOUNTING
19564 sched_unpin();
19565 #endif
19566 return (0);
19567 }
19568 #ifdef INET6
19569 if (rack->r_state) {
19570 /* Use the cache line loaded if possible */
19571 isipv6 = rack->r_is_v6;
19572 } else {
19573 isipv6 = (rack->rc_inp->inp_vflag & INP_IPV6) != 0;
19574 }
19575 #endif
19576 early = 0;
19577 cts = tcp_get_usecs(&tv);
19578 ms_cts = tcp_tv_to_msec(&tv);
19579 if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) &&
19580 tcp_in_hpts(rack->rc_tp)) {
19581 /*
19582 * We are on the hpts for some timer but not hptsi output.
19583 * Remove from the hpts unconditionally.
19584 */
19585 rack_timer_cancel(tp, rack, cts, __LINE__);
19586 }
19587 /* Are we pacing and late? */
19588 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
19589 TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) {
19590 /* We are delayed */
19591 delayed = cts - rack->r_ctl.rc_last_output_to;
19592 } else {
19593 delayed = 0;
19594 }
19595 /* Do the timers, which may override the pacer */
19596 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
19597 int retval;
19598
19599 retval = rack_process_timers(tp, rack, cts, hpts_calling,
19600 &doing_tlp);
19601 if (retval != 0) {
19602 counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1);
19603 #ifdef TCP_ACCOUNTING
19604 sched_unpin();
19605 #endif
19606 /*
19607 * If timers want tcp_drop(), then pass error out,
19608 * otherwise suppress it.
19609 */
19610 return (retval < 0 ? retval : 0);
19611 }
19612 }
19613 if (rack->rc_in_persist) {
19614 if (tcp_in_hpts(rack->rc_tp) == 0) {
19615 /* Timer is not running */
19616 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
19617 }
19618 #ifdef TCP_ACCOUNTING
19619 sched_unpin();
19620 #endif
19621 return (0);
19622 }
19623 if ((rack->rc_ack_required == 1) &&
19624 (rack->r_timer_override == 0)){
19625 /* A timeout occurred and no ack has arrived */
19626 if (tcp_in_hpts(rack->rc_tp) == 0) {
19627 /* Timer is not running */
19628 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
19629 }
19630 #ifdef TCP_ACCOUNTING
19631 sched_unpin();
19632 #endif
19633 return (0);
19634 }
19635 if ((rack->r_timer_override) ||
19636 (rack->rc_ack_can_sendout_data) ||
19637 (delayed) ||
19638 (tp->t_state < TCPS_ESTABLISHED)) {
19639 rack->rc_ack_can_sendout_data = 0;
19640 if (tcp_in_hpts(rack->rc_tp))
19641 tcp_hpts_remove(rack->rc_tp);
19642 } else if (tcp_in_hpts(rack->rc_tp)) {
19643 /*
19644 * On the hpts we can't send even if ACKNOW is on; we will
19645 * send when the hpts fires.
19646 */
19647 #ifdef TCP_ACCOUNTING
19648 crtsc = get_cyclecount();
19649 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
19650 tp->tcp_proc_time[SND_BLOCKED] += (crtsc - ts_val);
19651 tp->tcp_cnt_counters[SND_BLOCKED]++;
19652 }
19653 sched_unpin();
19654 #endif
19655 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1);
19656 return (0);
19657 }
19658 /* Finish out both pacing early and late accounting */
19659 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) &&
19660 TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) {
19661 early = rack->r_ctl.rc_last_output_to - cts;
19662 } else
19663 early = 0;
19664 if (delayed && (rack->rc_always_pace == 1)) {
19665 rack->r_ctl.rc_agg_delayed += delayed;
19666 rack->r_late = 1;
19667 } else if (early && (rack->rc_always_pace == 1)) {
19668 rack->r_ctl.rc_agg_early += early;
19669 rack->r_early = 1;
19670 } else if (rack->rc_always_pace == 0) {
19671 /* Non-paced we are not late */
19672 rack->r_ctl.rc_agg_delayed = rack->r_ctl.rc_agg_early = 0;
19673 rack->r_early = rack->r_late = 0;
19674 }
19675 /* Now that early/late accounting is done turn off the flag */
19676 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
19677 rack->r_wanted_output = 0;
19678 rack->r_timer_override = 0;
19679 if ((tp->t_state != rack->r_state) &&
19680 TCPS_HAVEESTABLISHED(tp->t_state)) {
19681 rack_set_state(tp, rack);
19682 }
19683 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
19684 minseg = segsiz;
19685 if (rack->r_ctl.rc_pace_max_segs == 0)
19686 pace_max_seg = rack->rc_user_set_max_segs * segsiz;
19687 else
19688 pace_max_seg = rack->r_ctl.rc_pace_max_segs;
19689 if ((rack->r_fast_output) &&
19690 (doing_tlp == 0) &&
19691 (tp->rcv_numsacks == 0)) {
19692 int ret;
19693
19694 error = 0;
19695 ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, &tot_len_this_send, &error, __LINE__);
19696 if (ret > 0)
19697 return(ret);
19698 else if (error) {
19699 inp = rack->rc_inp;
19700 so = inp->inp_socket;
19701 sb = &so->so_snd;
19702 goto nomore;
19703 } else {
19704 /* Return == 0; if there is more we can send tot_len-wise, fall through and send */
19705 if (tot_len_this_send >= pace_max_seg)
19706 return (ret);
19707 #ifdef TCP_ACCOUNTING
19708 /* We need to re-pin since fast_output un-pined */
19709 sched_pin();
19710 ts_val = get_cyclecount();
19711 #endif
19712 /* Fall back out so we can send any more that may bring us to pace_max_seg */
19713 }
19714 }
19715 inp = rack->rc_inp;
19716 /*
19717 * For TFO connections in SYN_SENT or SYN_RECEIVED,
19718 * only allow the initial SYN or SYN|ACK and those sent
19719 * by the retransmit timer.
19720 */
19721 if ((tp->t_flags & TF_FASTOPEN) &&
19722 ((tp->t_state == TCPS_SYN_RECEIVED) ||
19723 (tp->t_state == TCPS_SYN_SENT)) &&
19724 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */
19725 (tp->t_rxtshift == 0)) { /* not a retransmit */
19726 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
19727 #ifdef TCP_ACCOUNTING
19728 sched_unpin();
19729 #endif
19730 return (0);
19731 }
19732 /*
19733 * Determine length of data that should be transmitted, and flags
19734 * that will be used. If there is some data or critical controls
19735 * (SYN, RST) to send, then transmit; otherwise, investigate
19736 * further.
19737 */
19738 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
19739 if (tp->t_idle_reduce) {
19740 if (idle && (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur))
19741 rack_cc_after_idle(rack, tp);
19742 }
19743 tp->t_flags &= ~TF_LASTIDLE;
19744 if (idle) {
19745 if (tp->t_flags & TF_MORETOCOME) {
19746 tp->t_flags |= TF_LASTIDLE;
19747 idle = 0;
19748 }
19749 }
19750 if ((tp->snd_una == tp->snd_max) &&
19751 rack->r_ctl.rc_went_idle_time &&
19752 (cts > rack->r_ctl.rc_went_idle_time)) {
19753 tot_idle = (cts - rack->r_ctl.rc_went_idle_time);
19754 if (tot_idle > rack_min_probertt_hold) {
19755 /* Count as a probe rtt */
19756 if (rack->in_probe_rtt == 0) {
19757 rack->r_ctl.rc_lower_rtt_us_cts = cts;
19758 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts;
19759 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts;
19760 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts;
19761 } else {
19762 rack_exit_probertt(rack, cts);
19763 }
19764 }
19765 } else
19766 tot_idle = 0;
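/* Lazily build the cached fast-send header block once state allows it. */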
19767 if (rack_use_fsb &&
19768 (rack->r_ctl.fsb.tcp_ip_hdr) &&
19769 (rack->r_fsb_inited == 0) &&
19770 (rack->r_state != TCPS_CLOSED))
19771 rack_init_fsb_block(tp, rack, tcp_outflags[tp->t_state]);
19772 if (rack->rc_sendvars_notset == 1) {
19773 rack->rc_sendvars_notset = 0;
19774 /*
19775 * Make sure no TCP timers (keep-alive) are running.
19776 */
19777 tcp_timer_stop(tp);
19778 }
19779 if ((rack->rack_no_prr == 1) &&
19780 (rack->rc_always_pace == 0)) {
19781 /*
19782 * Sanity check before sending, if we have
19783 * no-pacing enabled and prr is turned off that
19784 * is a logistics error. Correct this by turning
19785 * prr back on. A user *must* set some form of
19786 * pacing in order to turn PRR off. We do this
19787 * in the output path so that we can avoid socket
19788 * option ordering issues that would occur if we
19789 * tried to do it while setting rack_no_prr on.
19790 */
19791 rack->rack_no_prr = 0;
19792 }
19793 if ((rack->pcm_enabled == 1) &&
19794 (rack->pcm_needed == 0) &&
19795 (tot_idle > 0)) {
19796 /*
19797 * We have been idle for some microseconds. We need
19798 * to factor this in to see if a PCM is needed.
19799 */
19800 uint32_t rtts_idle, rnds;
19801
19802 if (tp->t_srtt)
19803 rtts_idle = tot_idle / tp->t_srtt;
19804 else
19805 rtts_idle = 0;
19806 rnds = rack->r_ctl.current_round - rack->r_ctl.last_pcm_round;
19807 rack->r_ctl.pcm_idle_rounds += rtts_idle;
19808 if ((rnds + rack->r_ctl.pcm_idle_rounds) >= rack_pcm_every_n_rounds) {
19809 rack->pcm_needed = 1;
19810 rack_log_pcm(rack, 8, rack->r_ctl.last_pcm_round, rtts_idle, rack->r_ctl.current_round );
19811 }
19812 }
19813 again:
19814 sendalot = 0;
19815 cts = tcp_get_usecs(&tv);
19816 ms_cts = tcp_tv_to_msec(&tv);
19817 tso = 0;
19818 mtu = 0;
19819 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
19820 (rack->r_ctl.pcm_max_seg == 0)) {
19821 /*
19822 * We set this on our first send so we know that the ctf_fixed_maxseg
19823 * has been fully set. If we do it in rack_init() we most likely
19824 * would see 512 bytes and so end up at 5120, which is not desirable.
19825 */
19826 rack->r_ctl.pcm_max_seg = rc_init_window(rack);
19827 if (rack->r_ctl.pcm_max_seg < (ctf_fixed_maxseg(tp) * 10)) {
19828 /*
19829 * Assure our initial PCM probe is at least 10 MSS.
19830 */
19831 rack->r_ctl.pcm_max_seg = ctf_fixed_maxseg(tp) * 10;
19832 }
19833 }
19834 if ((rack->r_ctl.pcm_max_seg != 0) && (rack->pcm_needed == 1)) {
19835 uint32_t rw_avail, cwa;
19836
19837 if (tp->snd_wnd > ctf_outstanding(tp))
19838 rw_avail = tp->snd_wnd - ctf_outstanding(tp);
19839 else
19840 rw_avail = 0;
19841 if (tp->snd_cwnd > ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked))
19842 cwa = tp->snd_cwnd -ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
19843 else
19844 cwa = 0;
19845 if ((cwa >= rack->r_ctl.pcm_max_seg) &&
19846 (rw_avail > rack->r_ctl.pcm_max_seg)) {
19847 /* Raise up the max seg for this trip through */
19848 pace_max_seg = rack->r_ctl.pcm_max_seg;
19849 /* Disable any fast output */
19850 rack->r_fast_output = 0;
19851 }
19852 if (rack_verbose_logging) {
19853 rack_log_pcm(rack, 4,
19854 cwa, rack->r_ctl.pcm_max_seg, rw_avail);
19855 }
19856 }
19857 sb_offset = tp->snd_max - tp->snd_una;
19858 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd;
19859 flags = tcp_outflags[tp->t_state];
19860 while (rack->rc_free_cnt < rack_free_cache) {
19861 rsm = rack_alloc(rack);
19862 if (rsm == NULL) {
19863 if (hpts_calling)
19864 /* Retry in a ms */
19865 pacing_delay = (1 * HPTS_USEC_IN_MSEC);
19866 so = inp->inp_socket;
19867 sb = &so->so_snd;
19868 goto just_return_nolock;
19869 }
19870 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext);
19871 rack->rc_free_cnt++;
19872 rsm = NULL;
19873 }
19874 sack_rxmit = 0;
19875 len = 0;
19876 rsm = NULL;
19877 if (flags & TH_RST) {
19878 SOCK_SENDBUF_LOCK(inp->inp_socket);
19879 so = inp->inp_socket;
19880 sb = &so->so_snd;
19881 goto send;
19882 }
19883 if (rack->r_ctl.rc_resend) {
19884 /* Retransmit timer */
19885 rsm = rack->r_ctl.rc_resend;
19886 rack->r_ctl.rc_resend = NULL;
19887 len = rsm->r_end - rsm->r_start;
19888 sack_rxmit = 1;
19889 sendalot = 0;
19890 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
19891 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
19892 __func__, __LINE__,
19893 rsm->r_start, tp->snd_una, tp, rack, rsm));
19894 sb_offset = rsm->r_start - tp->snd_una;
19895 rack_validate_sizes(rack, &len, segsiz, pace_max_seg);
19896 } else if (rack->r_collapse_point_valid &&
19897 ((rsm = rack_check_collapsed(rack, cts)) != NULL)) {
19898 /*
19899 * If an RSM is returned then enough time has passed
19900 * for us to retransmit it. Move up the collapse point,
19901 * since this rsm has its chance to retransmit now.
19902 */
19903 tcp_trace_point(rack->rc_tp, TCP_TP_COLLAPSED_RXT);
19904 rack->r_ctl.last_collapse_point = rsm->r_end;
19905 /* Are we done? */
19906 if (SEQ_GEQ(rack->r_ctl.last_collapse_point,
19907 rack->r_ctl.high_collapse_point))
19908 rack->r_collapse_point_valid = 0;
19909 sack_rxmit = 1;
19910 /* We are not doing a TLP */
19911 doing_tlp = 0;
19912 len = rsm->r_end - rsm->r_start;
19913 sb_offset = rsm->r_start - tp->snd_una;
19914 sendalot = 0;
19915 rack_validate_sizes(rack, &len, segsiz, pace_max_seg);
19916 } else if ((rsm = tcp_rack_output(tp, rack, cts)) != NULL) {
19917 /* We have a retransmit that takes precedence */
19918 if ((!IN_FASTRECOVERY(tp->t_flags)) &&
19919 ((rsm->r_flags & RACK_MUST_RXT) == 0) &&
19920 ((tp->t_flags & TF_WASFRECOVERY) == 0)) {
19921 /* Enter recovery if not induced by a time-out */
19922 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__);
19923 }
19924 #ifdef INVARIANTS
19925 if (SEQ_LT(rsm->r_start, tp->snd_una)) {
19926 panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n",
19927 tp, rack, rsm, rsm->r_start, tp->snd_una);
19928 }
19929 #endif
19930 len = rsm->r_end - rsm->r_start;
19931 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
19932 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
19933 __func__, __LINE__,
19934 rsm->r_start, tp->snd_una, tp, rack, rsm));
19935 sb_offset = rsm->r_start - tp->snd_una;
19936 sendalot = 0;
19937 rack_validate_sizes(rack, &len, segsiz, pace_max_seg);
19938 if (len > 0) {
19939 sack_rxmit = 1;
19940 KMOD_TCPSTAT_INC(tcps_sack_rexmits);
19941 KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes,
19942 min(len, segsiz));
19943 }
19944 } else if (rack->r_ctl.rc_tlpsend) {
19945 /* Tail loss probe */
19946 long cwin;
19947 long tlen;
19948
19949 /*
19950 * Check if we can do a TLP with a RACK'd packet;
19951 * this can happen if we are not doing the rack
19952 * cheat and we skipped to a TLP and it
19953 * went off.
19954 */
19955 rsm = rack->r_ctl.rc_tlpsend;
19956 /* We are doing a TLP, make sure the flag is present */
19957 rsm->r_flags |= RACK_TLP;
19958 rack->r_ctl.rc_tlpsend = NULL;
19959 sack_rxmit = 1;
19960 tlen = rsm->r_end - rsm->r_start;
19961 if (tlen > segsiz)
19962 tlen = segsiz;
19963 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
19964 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
19965 __func__, __LINE__,
19966 rsm->r_start, tp->snd_una, tp, rack, rsm));
19967 sb_offset = rsm->r_start - tp->snd_una;
19968 cwin = min(tp->snd_wnd, tlen);
19969 len = cwin;
19970 }
19971 if (rack->r_must_retran &&
19972 (doing_tlp == 0) &&
19973 (SEQ_GT(tp->snd_max, tp->snd_una)) &&
19974 (rsm == NULL)) {
19975 /*
19976 * There are two different ways that we
19977 * can get into this block:
19978 * a) This is a non-sack connection, we had a time-out
19979 * and thus r_must_retran was set and everything
19980 * left outstanding has been marked for retransmit.
19981 * b) The MTU of the path shrank, so that everything
19982 * was marked to be retransmitted with the smaller
19983 * mtu and r_must_retran was set.
19984 *
19985 * This means that we expect the sendmap (outstanding)
19986 * to all be marked must. We can use the tmap to
19987 * look at them.
19988 *
19989 */
19990 int sendwin, flight;
19991
19992 sendwin = min(tp->snd_wnd, tp->snd_cwnd);
19993 flight = ctf_flight_size(tp, rack->r_ctl.rc_out_at_rto);
19994 if (flight >= sendwin) {
19995 /*
19996 * We can't send yet.
19997 */
19998 so = inp->inp_socket;
19999 sb = &so->so_snd;
20000 goto just_return_nolock;
20001 }
20002 /*
20003 * This is the case a/b mentioned above. All
20004 * outstanding/not-acked should be marked.
20005 * We can use the tmap to find them.
20006 */
20007 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
20008 if (rsm == NULL) {
20009 /* TSNH */
20010 rack->r_must_retran = 0;
20011 rack->r_ctl.rc_out_at_rto = 0;
20012 so = inp->inp_socket;
20013 sb = &so->so_snd;
20014 goto just_return_nolock;
20015 }
20016 if ((rsm->r_flags & RACK_MUST_RXT) == 0) {
20017 /*
20018 * The first one does not have the flag, did we collapse
20019 * further up in our list?
20020 */
20021 rack->r_must_retran = 0;
20022 rack->r_ctl.rc_out_at_rto = 0;
20023 rsm = NULL;
20024 sack_rxmit = 0;
20025 } else {
20026 sack_rxmit = 1;
20027 len = rsm->r_end - rsm->r_start;
20028 sb_offset = rsm->r_start - tp->snd_una;
20029 sendalot = 0;
20030 if ((rack->full_size_rxt == 0) &&
20031 (rack->shape_rxt_to_pacing_min == 0) &&
20032 (len >= segsiz))
20033 len = segsiz;
20034 else if (rack->shape_rxt_to_pacing_min &&
20035 rack->gp_ready) {
20036 /* We use pacing min as shaping len req */
20037 uint32_t maxlen;
20038
20039 maxlen = rack_get_hpts_pacing_min_for_bw(rack, segsiz);
20040 if (len > maxlen)
20041 len = maxlen;
20042 }
20043 /*
20044 * Delay removing the flag RACK_MUST_RXT so
20045 * that the fastpath for retransmit will
20046 * work with this rsm.
20047 */
20048 }
20049 }
20050 /*
20051 * Enforce a connection sendmap count limit if set
20052 * as long as we are not retransmitting.
20053 */
20054 if ((rsm == NULL) &&
20055 (V_tcp_map_entries_limit > 0) &&
20056 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) {
20057 counter_u64_add(rack_to_alloc_limited, 1);
20058 if (!rack->alloc_limit_reported) {
20059 rack->alloc_limit_reported = 1;
20060 counter_u64_add(rack_alloc_limited_conns, 1);
20061 }
20062 so = inp->inp_socket;
20063 sb = &so->so_snd;
20064 goto just_return_nolock;
20065 }
20066 if (rsm && (rsm->r_flags & RACK_HAS_FIN)) {
20067 /* we are retransmitting the fin */
20068 len--;
20069 if (len) {
20070 /*
20071 * When retransmitting data do *not* include the
20072 * FIN. This could happen from a TLP probe.
20073 */
20074 flags &= ~TH_FIN;
20075 }
20076 }
20077 if (rsm && rack->r_fsb_inited &&
20078 rack_use_rsm_rfo &&
20079 ((rsm->r_flags & RACK_HAS_FIN) == 0)) {
20080 int ret;
20081
20082 ret = rack_fast_rsm_output(tp, rack, rsm, ts_val, cts, ms_cts, &tv, len, doing_tlp);
20083 if (ret == 0)
20084 return (0);
20085 }
20086 so = inp->inp_socket;
20087 sb = &so->so_snd;
20088 if (do_a_prefetch == 0) {
20089 kern_prefetch(sb, &do_a_prefetch);
20090 do_a_prefetch = 1;
20091 }
20092 #ifdef NETFLIX_SHARED_CWND
20093 if ((tp->t_flags2 & TF2_TCP_SCWND_ALLOWED) &&
20094 rack->rack_enable_scwnd) {
20095 /* We are doing cwnd sharing */
20096 if (rack->gp_ready &&
20097 (rack->rack_attempted_scwnd == 0) &&
20098 (rack->r_ctl.rc_scw == NULL) &&
20099 tp->t_lib) {
20100 /* The pcbid is in, let's make an attempt */
20101 counter_u64_add(rack_try_scwnd, 1);
20102 rack->rack_attempted_scwnd = 1;
20103 rack->r_ctl.rc_scw = tcp_shared_cwnd_alloc(tp,
20104 &rack->r_ctl.rc_scw_index,
20105 segsiz);
20106 }
20107 if (rack->r_ctl.rc_scw &&
20108 (rack->rack_scwnd_is_idle == 1) &&
20109 sbavail(&so->so_snd)) {
20110 /* we are no longer out of data */
20111 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
20112 rack->rack_scwnd_is_idle = 0;
20113 }
20114 if (rack->r_ctl.rc_scw) {
20115 /* First lets update and get the cwnd */
20116 rack->r_ctl.cwnd_to_use = cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw,
20117 rack->r_ctl.rc_scw_index,
20118 tp->snd_cwnd, tp->snd_wnd, segsiz);
20119 }
20120 }
20121 #endif
20122 /*
20123 * Get standard flags, and add SYN or FIN if requested by 'hidden'
20124 * state flags.
20125 */
20126 if (tp->t_flags & TF_NEEDFIN)
20127 flags |= TH_FIN;
20128 if (tp->t_flags & TF_NEEDSYN)
20129 flags |= TH_SYN;
20130 if ((sack_rxmit == 0) && (prefetch_rsm == 0)) {
20131 void *end_rsm;
20132 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
20133 if (end_rsm)
20134 kern_prefetch(end_rsm, &prefetch_rsm);
20135 prefetch_rsm = 1;
20136 }
20137 SOCK_SENDBUF_LOCK(so);
20138 if ((sack_rxmit == 0) &&
20139 (TCPS_HAVEESTABLISHED(tp->t_state) ||
20140 (tp->t_flags & TF_FASTOPEN))) {
20141 /*
20142 * We are not retransmitting (sack_rxmit is 0) so we
20143 * are sending new data. This is always based on snd_max.
20144 * Now in theory snd_max may be equal to snd_una, if so
20145 * then nothing is outstanding and the offset would be 0.
20146 */
20147 uint32_t avail;
20148
20149 avail = sbavail(sb);
20150 if (SEQ_GT(tp->snd_max, tp->snd_una) && avail)
20151 sb_offset = tp->snd_max - tp->snd_una;
20152 else
20153 sb_offset = 0;
20154 if ((IN_FASTRECOVERY(tp->t_flags) == 0) || rack->rack_no_prr) {
20155 if (rack->r_ctl.rc_tlp_new_data) {
20156 /* TLP is forcing out new data */
20157 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) {
20158 rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset);
20159 }
20160 if ((rack->r_ctl.rc_tlp_new_data + sb_offset) > tp->snd_wnd) {
20161 if (tp->snd_wnd > sb_offset)
20162 len = tp->snd_wnd - sb_offset;
20163 else
20164 len = 0;
20165 } else {
20166 len = rack->r_ctl.rc_tlp_new_data;
20167 }
20168 rack->r_ctl.rc_tlp_new_data = 0;
20169 } else {
20170 len = rack_what_can_we_send(tp, rack, cwnd_to_use, avail, sb_offset);
20171 }
20172 if ((rack->r_ctl.crte == NULL) &&
20173 IN_FASTRECOVERY(tp->t_flags) &&
20174 (rack->full_size_rxt == 0) &&
20175 (rack->shape_rxt_to_pacing_min == 0) &&
20176 (len > segsiz)) {
20177 /*
20178 * For prr=off, we need to send only 1 MSS
20179 * at a time. We do this because another sack could
20180 * be arriving that causes us to send retransmits and
20181 * we don't want to be on a long pace due to a larger send
20182 * that keeps us from sending out the retransmit.
20183 */
20184 len = segsiz;
20185 } else if (rack->shape_rxt_to_pacing_min &&
20186 rack->gp_ready) {
20187 /* We use pacing min as shaping len req */
20188 uint32_t maxlen;
20189
20190 maxlen = rack_get_hpts_pacing_min_for_bw(rack, segsiz);
20191 if (len > maxlen)
20192 len = maxlen;
20193 } /* The else case is full_size_rxt is on, so send it all */
20194 } else {
20195 uint32_t outstanding;
20196 /*
20197 * We are inside of a fast recovery episode; this
20198 * is caused by a SACK or 3 dup acks. At this point
20199 * we have sent all the retransmissions and we rely
20200 * on PRR to dictate what we will send in the form of
20201 * new data.
20202 */
20203
20204 outstanding = tp->snd_max - tp->snd_una;
20205 if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) {
20206 if (tp->snd_wnd > outstanding) {
20207 len = tp->snd_wnd - outstanding;
20208 /* Check to see if we have the data */
20209 if ((sb_offset + len) > avail) {
20210 /* It does not all fit */
20211 if (avail > sb_offset)
20212 len = avail - sb_offset;
20213 else
20214 len = 0;
20215 }
20216 } else {
20217 len = 0;
20218 }
20219 } else if (avail > sb_offset) {
20220 len = avail - sb_offset;
20221 } else {
20222 len = 0;
20223 }
20224 if (len > 0) {
20225 if (len > rack->r_ctl.rc_prr_sndcnt) {
20226 len = rack->r_ctl.rc_prr_sndcnt;
20227 }
20228 if (len > 0) {
20229 sub_from_prr = 1;
20230 }
20231 }
20232 if (len > segsiz) {
20233 /*
20234 * We should never send more than a MSS when
20235 * retransmitting or sending new data in prr
20236 * mode unless the override flag is on. Most
20237 * likely the PRR algorithm is not going to
20238 * let us send a lot as well :-)
20239 */
20240 if (rack->r_ctl.rc_prr_sendalot == 0) {
20241 len = segsiz;
20242 }
20243 } else if (len < segsiz) {
20244 /*
20245 * Do we send any? The idea here is if the
20246 * send empties the socket buffer we want to
20247 * do it. However if not then let's just wait
20248 * for our prr_sndcnt to get bigger.
20249 */
20250 long leftinsb;
20251
20252 leftinsb = sbavail(sb) - sb_offset;
20253 if (leftinsb > len) {
20254 /* This send does not empty the sb */
20255 len = 0;
20256 }
20257 }
20258 }
20259 } else if (!TCPS_HAVEESTABLISHED(tp->t_state)) {
20260 /*
20261 * If you have not established
20262 * and are not doing FAST OPEN
20263 * no data please.
20264 */
20265 if ((sack_rxmit == 0) &&
20266 !(tp->t_flags & TF_FASTOPEN)) {
20267 len = 0;
20268 sb_offset = 0;
20269 }
20270 }
20271 if (prefetch_so_done == 0) {
20272 kern_prefetch(so, &prefetch_so_done);
20273 prefetch_so_done = 1;
20274 }
20275 orig_len = len;
20276 /*
20277 * Lop off SYN bit if it has already been sent. However, if this is
20278 * SYN-SENT state and if segment contains data and if we don't know
20279 * that foreign host supports TAO, suppress sending segment.
20280 */
20281 if ((flags & TH_SYN) &&
20282 SEQ_GT(tp->snd_max, tp->snd_una) &&
20283 ((sack_rxmit == 0) &&
20284 (tp->t_rxtshift == 0))) {
20285 /*
20286 * When sending additional segments following a TFO SYN|ACK,
20287 * do not include the SYN bit.
20288 */
20289 if ((tp->t_flags & TF_FASTOPEN) &&
20290 (tp->t_state == TCPS_SYN_RECEIVED))
20291 flags &= ~TH_SYN;
20292 }
20293 /*
20294 * Be careful not to send data and/or FIN on SYN segments. This
20295 * measure is needed to prevent interoperability problems with not
20296 * fully conformant TCP implementations.
20297 */
20298 if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) {
20299 len = 0;
20300 flags &= ~TH_FIN;
20301 }
20302 /*
20303 * On TFO sockets, ensure no data is sent in the following cases:
20304 *
20305 * - When retransmitting SYN|ACK on a passively-created socket
20306 *
20307 * - When retransmitting SYN on an actively created socket
20308 *
20309 * - When sending a zero-length cookie (cookie request) on an
20310 * actively created socket
20311 *
20312 * - When the socket is in the CLOSED state (RST is being sent)
20313 */
20314 if ((tp->t_flags & TF_FASTOPEN) &&
20315 (((flags & TH_SYN) && (tp->t_rxtshift > 0)) ||
20316 ((tp->t_state == TCPS_SYN_SENT) &&
20317 (tp->t_tfo_client_cookie_len == 0)) ||
20318 (flags & TH_RST))) {
20319 sack_rxmit = 0;
20320 len = 0;
20321 }
20322 /* Without fast-open there should never be data sent on a SYN */
20323 if ((flags & TH_SYN) && !(tp->t_flags & TF_FASTOPEN)) {
20324 len = 0;
20325 }
20326 if ((len > segsiz) && (tcp_dsack_block_exists(tp))) {
20327 /* We only send 1 MSS if we have a DSACK block */
20328 add_flag |= RACK_SENT_W_DSACK;
20329 len = segsiz;
20330 }
20331 if (len <= 0) {
20332 /*
20333 * We have nothing to send, or the window shrank, or
20334 * is closed, do we need to go into persists?
20335 */
20336 len = 0;
20337 if ((tp->snd_wnd == 0) &&
20338 (TCPS_HAVEESTABLISHED(tp->t_state)) &&
20339 (tp->snd_una == tp->snd_max) &&
20340 (sb_offset < (int)sbavail(sb))) {
20341 rack_enter_persist(tp, rack, cts, tp->snd_una);
20342 }
20343 } else if ((rsm == NULL) &&
20344 (doing_tlp == 0) &&
20345 (len < pace_max_seg)) {
20346 /*
20347 * We are not sending a maximum sized segment for
20348 * some reason. Should we not send anything (think
20349 * sws or persists)?
20350 */
20351 if ((tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg)) &&
20352 (TCPS_HAVEESTABLISHED(tp->t_state)) &&
20353 (len < minseg) &&
20354 (len < (int)(sbavail(sb) - sb_offset))) {
20355 /*
20356 * Here the rwnd is less than
20357 * the minimum pacing size, this is not a retransmit,
20358 * we are established, and
20359 * the send is not the last in the socket buffer;
20360 * we send nothing, and we may enter persists
20361 * if nothing is outstanding.
20362 */
20363 len = 0;
20364 if (tp->snd_max == tp->snd_una) {
20365 /*
20366 * Nothing is outstanding, we can
20367 * go into persists.
20368 */
20369 rack_enter_persist(tp, rack, cts, tp->snd_una);
20370 }
20371 } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) &&
20372 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) &&
20373 (len < (int)(sbavail(sb) - sb_offset)) &&
20374 (len < minseg)) {
20375 /*
20376 * Here we are not retransmitting, and
20377 * the cwnd is not so small that we could
20378 * not send at least a min size (rxt timer
20379 * not having gone off). We have 2 segments or
20380 * more already in flight, it's not the tail end
20381 * of the socket buffer, and the cwnd is blocking
20382 * us from sending out a minimum pacing segment size.
20383 * Let's not send anything.
20384 */
20385 len = 0;
20386 } else if (((tp->snd_wnd - ctf_outstanding(tp)) <
20387 min((rack->r_ctl.rc_high_rwnd/2), minseg)) &&
20388 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) &&
20389 (len < (int)(sbavail(sb) - sb_offset)) &&
20390 (TCPS_HAVEESTABLISHED(tp->t_state))) {
20391 /*
20392 * Here we have a send window but we have
20393 * filled it up and we can't send another pacing segment.
20394 * We also have in flight more than 2 segments
20395 * and we are not completing the sb, i.e. we allow
20396 * the last bytes of the sb to go out even if
20397 * it's not a full pacing segment.
20398 */
20399 len = 0;
20400 } else if ((rack->r_ctl.crte != NULL) &&
20401 (tp->snd_wnd >= (pace_max_seg * max(1, rack_hw_rwnd_factor))) &&
20402 (cwnd_to_use >= (pace_max_seg + (4 * segsiz))) &&
20403 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) >= (2 * segsiz)) &&
20404 (len < (int)(sbavail(sb) - sb_offset))) {
20405 /*
20406 * Here we are doing hardware pacing, this is not a TLP,
20407 * we are not sending a pace max segment size, there is rwnd
20408 * room to send at least N pace_max_seg, the cwnd is greater
20409 * than or equal to a full pacing segment plus 4 mss and we have 2 or
20410 * more segments in flight and it's not the tail of the socket buffer.
20411 *
20412 * We don't want to send; instead we need to get more ACKs in to
20413 * allow us to send a full pacing segment. Normally, if we are pacing
20414 * at about the right speed, we should have finished our pacing
20415 * send as most of the acks have come back if we are at the
20416 * right rate. This is a bit fuzzy since return path delay
20417 * can delay the acks, which is why we want to make sure we
20418 * have cwnd space to have a bit more than a max pace segment in flight.
20419 *
20420 * If we have not gotten our acks back, we are pacing at too high a
20421 * rate; delaying will not hurt and will bring our GP estimate down by
20422 * injecting the delay. If we don't do this we will send
20423 * 2 MSS out in response to the acks being clocked in, which
20424 * defeats the point of hw-pacing (i.e. to help us get
20425 * larger TSOs out).
20426 */
20427 len = 0;
20428 }
20429
20430 }
20431 /* len will be >= 0 after this point. */
20432 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
20433 rack_sndbuf_autoscale(rack);
20434 /*
20435 * Decide if we can use TCP Segmentation Offloading (if supported by
20436 * hardware).
20437 *
20438 * TSO may only be used if we are in a pure bulk sending state. The
20439 * presence of TCP-MD5, SACK retransmits, SACK advertisements and IP
20440 * options prevent using TSO. With TSO the TCP header is the same
20441 * (except for the sequence number) for all generated packets. This
20442 * makes it impossible to transmit any options which vary per
20443 * generated segment or packet.
20444 *
20445 * IPv4 handling has a clear separation of ip options and ip header
20446 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does
20447 * the right thing below to provide length of just ip options and thus
20448 * checking for ipoptlen is enough to decide if ip options are present.
20449 */
20450 ipoptlen = 0;
20451 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
20452 /*
20453 * Pre-calculate here as we save another lookup into the darknesses
20454 * of IPsec that way and can actually decide if TSO is ok.
20455 */
20456 #ifdef INET6
20457 if (isipv6 && IPSEC_ENABLED(ipv6))
20458 ipsec_optlen = IPSEC_HDRSIZE(ipv6, inp);
20459 #ifdef INET
20460 else
20461 #endif
20462 #endif /* INET6 */
20463 #ifdef INET
20464 if (IPSEC_ENABLED(ipv4))
20465 ipsec_optlen = IPSEC_HDRSIZE(ipv4, inp);
20466 #endif /* INET */
20467 #endif
20468
20469 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
20470 ipoptlen += ipsec_optlen;
20471 #endif
20472 if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > segsiz &&
20473 (tp->t_port == 0) &&
20474 ((tp->t_flags & TF_SIGNATURE) == 0) &&
20475 sack_rxmit == 0 &&
20476 ipoptlen == 0)
20477 tso = 1;
20478 {
20479 uint32_t outstanding __unused;
20480
20481 outstanding = tp->snd_max - tp->snd_una;
20482 if (tp->t_flags & TF_SENTFIN) {
20483 /*
20484 * If we sent a fin, snd_max is 1 higher than
20485 * snd_una
20486 */
20487 outstanding--;
20488 }
20489 if (sack_rxmit) {
20490 if ((rsm->r_flags & RACK_HAS_FIN) == 0)
20491 flags &= ~TH_FIN;
20492 }
20493 }
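/*
 * Receive window to advertise: the free space in the receive buffer,
 * clamped to the maximum window our scale factor allows.
 */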
20494 recwin = lmin(lmax(sbspace(&so->so_rcv), 0),
20495 (long)TCP_MAXWIN << tp->rcv_scale);
20496
20497 /*
20498 * Sender silly window avoidance. We transmit under the following
20499 * conditions when len is non-zero:
20500 *
20501 * - We have a full segment (or more with TSO).
20502 * - This is the last buffer in a write()/send() and we are either idle or running NODELAY.
20503 * - We've timed out (e.g. persist timer).
20504 * - We have more than 1/2 the maximum send window's worth of data (receiver may be limiting the window size).
20505 * - We need to retransmit.
20506 */
20507 if (len) {
20508 if (len >= segsiz) {
20509 goto send;
20510 }
20511 /*
20512 * NOTE! on localhost connections an 'ack' from the remote
20513 * end may occur synchronously with the output and cause us
20514 * to flush a buffer queued with moretocome. XXX
20515 *
20516 */
20517 if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */
20518 (idle || (tp->t_flags & TF_NODELAY)) &&
20519 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) &&
20520 (tp->t_flags & TF_NOPUSH) == 0) {
20521 pass = 2;
20522 goto send;
20523 }
20524 if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */
20525 pass = 22;
20526 goto send;
20527 }
20528 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
20529 pass = 4;
20530 goto send;
20531 }
20532 if (sack_rxmit) {
20533 pass = 6;
20534 goto send;
20535 }
20536 if (((tp->snd_wnd - ctf_outstanding(tp)) < segsiz) &&
20537 (ctf_outstanding(tp) < (segsiz * 2))) {
20538 /*
20539 * We have less than two MSS outstanding (delayed ack)
20540 * and our rwnd will not let us send a full-sized
20541 * MSS. Let's go ahead and let this small segment
20542 * out because we want to try to have at least two
20543 * packets inflight to not be caught by delayed ack.
20544 */
20545 pass = 12;
20546 goto send;
20547 }
20548 }
20549 /*
20550 * Sending of standalone window updates.
20551 *
20552 * Window updates are important when we close our window due to a
20553 * full socket buffer and are opening it again after the application
20554 * reads data from it. Once the window has opened again and the
20555 * remote end starts to send again the ACK clock takes over and
20556 * provides the most current window information.
20557 *
20558 * We must avoid the silly window syndrome, whereby every read from
20559 * the receive buffer, no matter how small, causes a window update
20560 * to be sent. We also should avoid sending a flurry of window
20561 * updates when the socket buffer had queued a lot of data and the
20562 * application is doing small reads.
20563 *
20564 * Prevent a flurry of pointless window updates by only sending an
20565 * update when we can increase the advertised window by more than
20566 * 1/4th of the socket buffer capacity. When the buffer is getting
20567 * full or is very small, be more aggressive and send an update
20568 * whenever we can increase by two mss sized segments. In all other
20569 * situations the ACK's to new incoming data will carry further
20570 * window increases.
20571 *
20572 * Don't send an independent window update if a delayed ACK is
20573 * pending (it will get piggy-backed on it) or the remote side
20574 * already has done a half-close and won't send more data. Skip
20575 * this if the connection is in T/TCP half-open state.
20576 */
20577 if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) &&
20578 !(tp->t_flags & TF_DELACK) &&
20579 !TCPS_HAVERCVDFIN(tp->t_state)) {
20580 /*
20581 * "adv" is the amount we could increase the window, taking
20582 * into account that we are limited by TCP_MAXWIN <<
20583 * tp->rcv_scale.
20584 */
20585 int32_t adv;
20586 int oldwin;
20587
20588 adv = recwin;
20589 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
20590 oldwin = (tp->rcv_adv - tp->rcv_nxt);
20591 if (adv > oldwin)
20592 adv -= oldwin;
20593 else {
20594 /* We can't increase the window */
20595 adv = 0;
20596 }
20597 } else
20598 oldwin = 0;
20599
20600 /*
20601 * If the new window size ends up being the same as or less
20602 * than the old size when it is scaled, then don't force
20603 * a window update.
20604 */
20605 if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale)
20606 goto dontupdate;
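/*
 * Example (illustrative): with rcv_scale == 7 the advertised window
 * moves in 128-byte units, so an increase that does not cross a
 * 128-byte boundary yields the same scaled value as before and is
 * not worth a standalone update.
 */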
20607
20608 if (adv >= (int32_t)(2 * segsiz) &&
20609 (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) ||
20610 recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) ||
20611 so->so_rcv.sb_hiwat <= 8 * segsiz)) {
20612 pass = 7;
20613 goto send;
20614 }
20615 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) {
20616 pass = 23;
20617 goto send;
20618 }
20619 }
20620 dontupdate:
20621
20622 /*
20623 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW
20624 * is also a catch-all for the retransmit timer timeout case.
20625 */
20626 if (tp->t_flags & TF_ACKNOW) {
20627 pass = 8;
20628 goto send;
20629 }
20630 if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) {
20631 pass = 9;
20632 goto send;
20633 }
20634 /*
20635 * If our state indicates that FIN should be sent and we have not
20636 * yet done so, then we need to send.
20637 */
20638 if ((flags & TH_FIN) &&
20639 (tp->snd_max == tp->snd_una)) {
20640 pass = 11;
20641 goto send;
20642 }
20643 /*
20644 * No reason to send a segment, just return.
20645 */
20646 just_return:
20647 SOCK_SENDBUF_UNLOCK(so);
20648 just_return_nolock:
20649 {
20650 int app_limited = CTF_JR_SENT_DATA;
20651
20652 if ((tp->t_flags & TF_FASTOPEN) == 0 &&
20653 (flags & TH_FIN) &&
20654 (len == 0) &&
20655 (sbused(sb) == (tp->snd_max - tp->snd_una)) &&
20656 ((tp->snd_max - tp->snd_una) <= segsiz)) {
20657 /*
20658 * Ok less than or right at a MSS is
20659 * outstanding. The original FreeBSD stack would
20660 * have sent a FIN, which can speed things up for
20661 * a transactional application doing a MSG_WAITALL.
20662 * To speed things up since we do *not* send a FIN
20663 * if data is outstanding, we send a "challenge ack".
20664 * The idea is that instead of forcing the peer to wait
20665 * for its delayed-ack timer to expire, we send an ack
20666 * that elicits an immediate ack from the peer.
20667 */
20668 rack_send_ack_challange(rack);
20669 }
20670 if (tot_len_this_send > 0) {
20671 rack->r_ctl.fsb.recwin = recwin;
20672 pacing_delay = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz, __LINE__);
20673 if ((error == 0) &&
20674 rack_use_rfo &&
20675 ((flags & (TH_SYN|TH_FIN)) == 0) &&
20676 (ipoptlen == 0) &&
20677 rack->r_fsb_inited &&
20678 TCPS_HAVEESTABLISHED(tp->t_state) &&
20679 ((IN_RECOVERY(tp->t_flags)) == 0) &&
20680 (doing_tlp == 0) &&
20681 (rack->r_must_retran == 0) &&
20682 ((tp->t_flags & TF_NEEDFIN) == 0) &&
20683 (len > 0) && (orig_len > 0) &&
20684 (orig_len > len) &&
20685 ((orig_len - len) >= segsiz) &&
20686 ((optlen == 0) ||
20687 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) {
20688 /* We can send at least one more MSS using our fsb */
20689 rack_setup_fast_output(tp, rack, sb, len, orig_len,
20690 segsiz, pace_max_seg, hw_tls, flags);
20691 } else
20692 rack->r_fast_output = 0;
20693 rack_log_fsb(rack, tp, so, flags,
20694 ipoptlen, orig_len, len, 0,
20695 1, optlen, __LINE__, 1);
20696 /* Assure when we leave that snd_nxt will point to top */
20697 if (SEQ_GT(tp->snd_max, tp->snd_nxt))
20698 tp->snd_nxt = tp->snd_max;
20699 } else {
20700 int end_window = 0;
20701 uint32_t seq = tp->gput_ack;
20702
20703 rsm = tqhash_max(rack->r_ctl.tqh);
20704 if (rsm) {
20705 /*
20706 * Mark the last sent that we just-returned (hinting
20707 * that delayed ack may play a role in any rtt measurement).
20708 */
20709 rsm->r_just_ret = 1;
20710 }
20711 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1);
20712 rack->r_ctl.rc_agg_delayed = 0;
20713 rack->r_early = 0;
20714 rack->r_late = 0;
20715 rack->r_ctl.rc_agg_early = 0;
20716 if ((ctf_outstanding(tp) +
20717 min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)),
20718 minseg)) >= tp->snd_wnd) {
20719 /* We are limited by the rwnd */
20720 app_limited = CTF_JR_RWND_LIMITED;
20721 if (IN_FASTRECOVERY(tp->t_flags))
20722 rack->r_ctl.rc_prr_sndcnt = 0;
20723 } else if (ctf_outstanding(tp) >= sbavail(sb)) {
20724 /* We are limited by what's available -- app limited */
20725 app_limited = CTF_JR_APP_LIMITED;
20726 if (IN_FASTRECOVERY(tp->t_flags))
20727 rack->r_ctl.rc_prr_sndcnt = 0;
20728 } else if ((idle == 0) &&
20729 ((tp->t_flags & TF_NODELAY) == 0) &&
20730 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) &&
20731 (len < segsiz)) {
20732 /*
20733 * Nodelay is not on and the
20734 * user is sending less than 1 MSS, so
20735 * SWS avoidance kicks in and we
20736 * don't send. Another app-limited case.
20737 */
20738 app_limited = CTF_JR_APP_LIMITED;
20739 } else if (tp->t_flags & TF_NOPUSH) {
20740 /*
20741 * The user has requested no push of
20742 * the last segment and we are
20743 * at the last segment. Another app
20744 * limited case.
20745 */
20746 app_limited = CTF_JR_APP_LIMITED;
20747 } else if ((ctf_outstanding(tp) + minseg) > cwnd_to_use) {
20748 /* It's the cwnd */
20749 app_limited = CTF_JR_CWND_LIMITED;
20750 } else if (IN_FASTRECOVERY(tp->t_flags) &&
20751 (rack->rack_no_prr == 0) &&
20752 (rack->r_ctl.rc_prr_sndcnt < segsiz)) {
20753 app_limited = CTF_JR_PRR;
20754 } else {
20755 /* Now why are we not sending here? */
20756 #ifdef NOW
20757 #ifdef INVARIANTS
20758 panic("rack:%p hit JR_ASSESSING case cwnd_to_use:%u?", rack, cwnd_to_use);
20759 #endif
20760 #endif
20761 app_limited = CTF_JR_ASSESSING;
20762 }
20763 /*
20764 * App limited in some fashion, for our pacing GP
20765 * measurements we don't want any gap (even cwnd).
20766 * Close down the measurement window.
20767 */
20768 if (rack_cwnd_block_ends_measure &&
20769 ((app_limited == CTF_JR_CWND_LIMITED) ||
20770 (app_limited == CTF_JR_PRR))) {
20771 /*
20772 * The reason we are not sending is
20773 * the cwnd (or prr). We have been configured
20774 * to end the measurement window in
20775 * this case.
20776 */
20777 end_window = 1;
20778 } else if (rack_rwnd_block_ends_measure &&
20779 (app_limited == CTF_JR_RWND_LIMITED)) {
20780 /*
20781 * We are rwnd limited and have been
20782 * configured to end the measurement
20783 * window in this case.
20784 */
20785 end_window = 1;
20786 } else if (app_limited == CTF_JR_APP_LIMITED) {
20787 /*
20788 * A true application-limited period: we have
20789 * run out of data.
20790 */
20791 end_window = 1;
20792 } else if (app_limited == CTF_JR_ASSESSING) {
20793 /*
20794 * In the assessing case we hit the end of
20795 * the if/else chain with no known reason.
20796 * This will panic us under invariants.
20797 *
20798 * If we see this in the logs we need to
20799 * investigate which reason we missed.
20800 */
20801 end_window = 1;
20802 }
20803 if (end_window) {
20804 uint8_t log = 0;
20805
20806 /* Adjust the Gput measurement */
20807 if ((tp->t_flags & TF_GPUTINPROG) &&
20808 SEQ_GT(tp->gput_ack, tp->snd_max)) {
20809 tp->gput_ack = tp->snd_max;
20810 if ((tp->gput_ack - tp->gput_seq) < (MIN_GP_WIN * segsiz)) {
20811 /*
20812 * There is not enough to measure.
20813 */
20814 tp->t_flags &= ~TF_GPUTINPROG;
20815 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
20816 rack->r_ctl.rc_gp_srtt /*flex1*/,
20817 tp->gput_seq,
20818 0, 0, 18, __LINE__, NULL, 0);
20819 } else
20820 log = 1;
20821 }
20822 /* Mark the last packet as app limited */
20823 rsm = tqhash_max(rack->r_ctl.tqh);
20824 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) {
20825 if (rack->r_ctl.rc_app_limited_cnt == 0)
20826 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm;
20827 else {
20828 /*
20829 * Walk out to the current end of the app-limited
20830 * chain, mark this new one as its next, and move
20831 * end_appl up to it.
20832 */
20833 if (rack->r_ctl.rc_end_appl)
20834 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start;
20835 rack->r_ctl.rc_end_appl = rsm;
20836 }
20837 rsm->r_flags |= RACK_APP_LIMITED;
20838 rack->r_ctl.rc_app_limited_cnt++;
20839 }
20840 if (log)
20841 rack_log_pacing_delay_calc(rack,
20842 rack->r_ctl.rc_app_limited_cnt, seq,
20843 tp->gput_ack, 0, 0, 4, __LINE__, NULL, 0);
20844 }
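/*
 * A sketch of the bookkeeping above: the app-limited marks form a
 * chain threaded through the sendmap.  rc_first_appl points at the
 * first marked rsm, each marked rsm's r_nseq_appl holds the start
 * sequence of the next mark, and rc_end_appl always points at the
 * most recent mark so a new one can be appended in O(1).
 */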
20845 }
20846 /* Check if we need to go into persists or not */
20847 if ((tp->snd_max == tp->snd_una) &&
20848 TCPS_HAVEESTABLISHED(tp->t_state) &&
20849 sbavail(sb) &&
20850 (sbavail(sb) > tp->snd_wnd) &&
20851 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg))) {
20852 /* Yes, let's make sure to move to persist before timer-start */
20853 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, tp->snd_una);
20854 }
20855 rack_start_hpts_timer(rack, tp, cts, pacing_delay, tot_len_this_send, sup_rack);
20856 rack_log_type_just_return(rack, cts, tot_len_this_send, pacing_delay, hpts_calling, app_limited, cwnd_to_use);
20857 }
20858 #ifdef NETFLIX_SHARED_CWND
20859 if ((sbavail(sb) == 0) &&
20860 rack->r_ctl.rc_scw) {
20861 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index);
20862 rack->rack_scwnd_is_idle = 1;
20863 }
20864 #endif
20865 #ifdef TCP_ACCOUNTING
20866 if (tot_len_this_send > 0) {
20867 crtsc = get_cyclecount();
20868 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
20869 tp->tcp_cnt_counters[SND_OUT_DATA]++;
20870 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val);
20871 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) / segsiz);
20872 }
20873 } else {
20874 crtsc = get_cyclecount();
20875 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
20876 tp->tcp_cnt_counters[SND_LIMITED]++;
20877 tp->tcp_proc_time[SND_LIMITED] += (crtsc - ts_val);
20878 }
20879 }
20880 sched_unpin();
20881 #endif
20882 return (0);
20883
20884 send:
20885 if ((rack->r_ctl.crte != NULL) &&
20886 (rsm == NULL) &&
20887 ((rack->rc_hw_nobuf == 1) ||
20888 (rack_hw_check_queue && (check_done == 0)))) {
20889 /*
20890 * We only want to do this once with hw_check_queue.
20891 * For the enobuf case we likewise do it only once; if
20892 * we come around again the flag will already be clear.
20893 */
20894 check_done = 1;
20895 pacing_delay = rack_check_queue_level(rack, tp, &tv, cts, len, segsiz);
20896 if (pacing_delay) {
20897 rack->r_ctl.rc_agg_delayed = 0;
20898 rack->r_ctl.rc_agg_early = 0;
20899 rack->r_early = 0;
20900 rack->r_late = 0;
20901 SOCK_SENDBUF_UNLOCK(so);
20902 goto skip_all_send;
20903 }
20904 }
20905 if (rsm || sack_rxmit)
20906 counter_u64_add(rack_nfto_resend, 1);
20907 else
20908 counter_u64_add(rack_non_fto_send, 1);
20909 if ((flags & TH_FIN) &&
20910 sbavail(sb)) {
20911 /*
20912 * We do not transmit a FIN
20913 * with data outstanding. We
20914 * need to make it so all data
20915 * is acked first.
20916 */
20917 flags &= ~TH_FIN;
20918 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
20919 (sbused(sb) == (tp->snd_max - tp->snd_una)) &&
20920 ((tp->snd_max - tp->snd_una) <= segsiz)) {
20921 /*
20922 * Ok less than or right at a MSS is
20923 * outstanding. The original FreeBSD stack would
20924 * have sent a FIN, which can speed things up for
20925 * a transactional application doing a MSG_WAITALL.
20926 * To speed things up since we do *not* send a FIN
20927 * if data is outstanding, we send a "challenge ack".
20928 * The idea is that instead of forcing the peer to wait
20929 * for its delayed-ack timer to expire, we send an ack
20930 * that elicits an immediate ack from the peer.
20931 */
20932 rack_send_ack_challange(rack);
20933 }
20934 }
20935 /* Enforce stack imposed max seg size if we have one */
20936 if (pace_max_seg &&
20937 (len > pace_max_seg)) {
20938 mark = 1;
20939 len = pace_max_seg;
20940 }
20941 if ((rsm == NULL) &&
20942 (rack->pcm_in_progress == 0) &&
20943 (rack->r_ctl.pcm_max_seg > 0) &&
20944 (len >= rack->r_ctl.pcm_max_seg)) {
20945 /* It is large enough for a measurement */
20946 add_flag |= RACK_IS_PCM;
20947 rack_log_pcm(rack, 5, len, rack->r_ctl.pcm_max_seg, add_flag);
20948 } else if (rack_verbose_logging) {
20949 rack_log_pcm(rack, 6, len, rack->r_ctl.pcm_max_seg, add_flag);
20950 }
20951
20952 SOCKBUF_LOCK_ASSERT(sb);
20953 if (len > 0) {
20954 if (len >= segsiz)
20955 tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT;
20956 else
20957 tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT;
20958 }
20959 /*
20960 * Before ESTABLISHED, force sending of initial options unless TCP
20961 * set not to do any options. NOTE: we assume that the IP/TCP header
20962 * plus TCP options always fit in a single mbuf, leaving room for a
20963 * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr)
20964 * + optlen <= MCLBYTES
20965 */
20966 optlen = 0;
20967 #ifdef INET6
20968 if (isipv6)
20969 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
20970 else
20971 #endif
20972 hdrlen = sizeof(struct tcpiphdr);
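/*
 * For reference (assuming the standard structure sizes): an IPv6
 * header is 40 bytes and a bare TCP header 20 bytes, so hdrlen starts
 * at 60 for IPv6, while struct tcpiphdr (IPv4 overlay plus TCP
 * header) is 40 bytes.  TCP options added below can grow this by at
 * most 40 more bytes.
 */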
20973
20974 /*
20975 * Ok, what sequence are we sending from? If we have
20976 * no rsm to use, then we look at various bits:
20977 * if we are putting out a SYN it will be the ISS;
20978 * if we are retransmitting a FIN it will
20979 * be snd_max-1; else it's snd_max.
20980 */
20981 if (rsm == NULL) {
20982 if (flags & TH_SYN)
20983 rack_seq = tp->iss;
20984 else if ((flags & TH_FIN) &&
20985 (tp->t_flags & TF_SENTFIN))
20986 rack_seq = tp->snd_max - 1;
20987 else
20988 rack_seq = tp->snd_max;
20989 } else {
20990 rack_seq = rsm->r_start;
20991 }
20992 /*
20993 * Compute options for segment. We only have to care about SYN and
20994 * established connection segments. Options for SYN-ACK segments
20995 * are handled in TCP syncache.
20996 */
20997 to.to_flags = 0;
20998 if ((tp->t_flags & TF_NOOPT) == 0) {
20999 /* Maximum segment size. */
21000 if (flags & TH_SYN) {
21001 to.to_mss = tcp_mssopt(&inp->inp_inc);
21002 if (tp->t_port)
21003 to.to_mss -= V_tcp_udp_tunneling_overhead;
21004 to.to_flags |= TOF_MSS;
21005
21006 /*
21007 * On SYN or SYN|ACK transmits on TFO connections,
21008 * only include the TFO option if it is not a
21009 * retransmit, as the presence of the TFO option may
21010 * have caused the original SYN or SYN|ACK to have
21011 * been dropped by a middlebox.
21012 */
21013 if ((tp->t_flags & TF_FASTOPEN) &&
21014 (tp->t_rxtshift == 0)) {
21015 if (tp->t_state == TCPS_SYN_RECEIVED) {
21016 to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
21017 to.to_tfo_cookie =
21018 (u_int8_t *)&tp->t_tfo_cookie.server;
21019 to.to_flags |= TOF_FASTOPEN;
21020 wanted_cookie = 1;
21021 } else if (tp->t_state == TCPS_SYN_SENT) {
21022 to.to_tfo_len =
21023 tp->t_tfo_client_cookie_len;
21024 to.to_tfo_cookie =
21025 tp->t_tfo_cookie.client;
21026 to.to_flags |= TOF_FASTOPEN;
21027 wanted_cookie = 1;
21028 /*
21029 * If we wind up having more data to
21030 * send with the SYN than can fit in
21031 * one segment, don't send any more
21032 * until the SYN|ACK comes back from
21033 * the other end.
21034 */
21035 sendalot = 0;
21036 }
21037 }
21038 }
21039 /* Window scaling. */
21040 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
21041 to.to_wscale = tp->request_r_scale;
21042 to.to_flags |= TOF_SCALE;
21043 }
21044 /* Timestamps. */
21045 if ((tp->t_flags & TF_RCVD_TSTMP) ||
21046 ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
21047 uint32_t ts_to_use;
21048
21049 if ((rack->r_rcvpath_rtt_up == 1) &&
21050 (ms_cts == rack->r_ctl.last_rcv_tstmp_for_rtt)) {
21051 /*
21052 * When we are doing a rcv_rtt probe all
21053 * other timestamps use the next msec. This
21054 * is safe since our previous ack is in the
21055 * air and we will just have a few more
21056 * on the next ms. This assures that only
21057 * the one ack has the ms_cts that was on
21058 * our ack-probe.
21059 */
21060 ts_to_use = ms_cts + 1;
21061 } else {
21062 ts_to_use = ms_cts;
21063 }
21064 to.to_tsval = ts_to_use + tp->ts_offset;
21065 to.to_tsecr = tp->ts_recent;
21066 to.to_flags |= TOF_TS;
21067 if ((len == 0) &&
21068 (TCPS_HAVEESTABLISHED(tp->t_state)) &&
21069 ((ms_cts - rack->r_ctl.last_rcv_tstmp_for_rtt) > RCV_PATH_RTT_MS) &&
21070 (tp->snd_una == tp->snd_max) &&
21071 (flags & TH_ACK) &&
21072 (sbavail(sb) == 0) &&
21073 (rack->r_ctl.current_round != 0) &&
21074 ((flags & (TH_SYN|TH_FIN)) == 0) &&
21075 (rack->r_rcvpath_rtt_up == 0)) {
21076 rack->r_ctl.last_rcv_tstmp_for_rtt = ms_cts;
21077 rack->r_ctl.last_time_of_arm_rcv = cts;
21078 rack->r_rcvpath_rtt_up = 1;
21079 /* Subtract 1 from seq to force a response */
21080 rack_seq--;
21081 }
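/*
 * Roughly what the probe above is doing: backing rack_seq up by one
 * makes this pure ACK look like a keepalive-style probe, so the peer
 * answers with an immediate ACK rather than waiting on its
 * delayed-ack timer.  The unique tsval armed above (other ACKs in
 * this msec use ms_cts + 1) lets the response be matched back to the
 * probe and yields an estimate of the receive-path RTT from
 * last_time_of_arm_rcv.
 */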
21082 }
21083 /* Set receive buffer autosizing timestamp. */
21084 if (tp->rfbuf_ts == 0 &&
21085 (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
21086 tp->rfbuf_ts = ms_cts;
21087 }
21088 /* Selective ACK's. */
21089 if (tp->t_flags & TF_SACK_PERMIT) {
21090 if (flags & TH_SYN)
21091 to.to_flags |= TOF_SACKPERM;
21092 else if (TCPS_HAVEESTABLISHED(tp->t_state) &&
21093 tp->rcv_numsacks > 0) {
21094 to.to_flags |= TOF_SACK;
21095 to.to_nsacks = tp->rcv_numsacks;
21096 to.to_sacks = (u_char *)tp->sackblks;
21097 }
21098 }
21099 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
21100 /* TCP-MD5 (RFC2385). */
21101 if (tp->t_flags & TF_SIGNATURE)
21102 to.to_flags |= TOF_SIGNATURE;
21103 #endif
21104
21105 /* Processing the options. */
21106 hdrlen += optlen = tcp_addoptions(&to, opt);
21107 /*
21108 * If we wanted a TFO option to be added, but it was unable
21109 * to fit, ensure no data is sent.
21110 */
21111 if ((tp->t_flags & TF_FASTOPEN) && wanted_cookie &&
21112 !(to.to_flags & TOF_FASTOPEN))
21113 len = 0;
21114 }
21115 if (tp->t_port) {
21116 if (V_tcp_udp_tunneling_port == 0) {
21117 /* The port was removed?? */
21118 SOCK_SENDBUF_UNLOCK(so);
21119 #ifdef TCP_ACCOUNTING
21120 crtsc = get_cyclecount();
21121 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
21122 tp->tcp_cnt_counters[SND_OUT_FAIL]++;
21123 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
21124 }
21125 sched_unpin();
21126 #endif
21127 return (EHOSTUNREACH);
21128 }
21129 hdrlen += sizeof(struct udphdr);
21130 }
21131 #ifdef INET6
21132 if (isipv6)
21133 ipoptlen = ip6_optlen(inp);
21134 else
21135 #endif
21136 if (inp->inp_options)
21137 ipoptlen = inp->inp_options->m_len -
21138 offsetof(struct ipoption, ipopt_list);
21139 else
21140 ipoptlen = 0;
21141 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
21142 ipoptlen += ipsec_optlen;
21143 #endif
21144
21145 /*
21146 * Adjust data length if insertion of options will bump the packet
21147 * length beyond the t_maxseg length. Clear the FIN bit because we
21148 * cut off the tail of the segment.
21149 */
21150 if (len + optlen + ipoptlen > tp->t_maxseg) {
21151 if (tso) {
21152 uint32_t if_hw_tsomax;
21153 uint32_t moff;
21154 int32_t max_len;
21155
21156 /* extract TSO information */
21157 if_hw_tsomax = tp->t_tsomax;
21158 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount;
21159 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize;
21160 KASSERT(ipoptlen == 0,
21161 ("%s: TSO can't do IP options", __func__));
21162
21163 /*
21164 * Check if we should limit by maximum payload
21165 * length:
21166 */
21167 if (if_hw_tsomax != 0) {
21168 /* compute maximum TSO length */
21169 max_len = (if_hw_tsomax - hdrlen -
21170 max_linkhdr);
21171 if (max_len <= 0) {
21172 len = 0;
21173 } else if (len > max_len) {
21174 if (doing_tlp == 0)
21175 sendalot = 1;
21176 len = max_len;
21177 mark = 2;
21178 }
21179 }
21180 /*
21181 * Prevent the last segment from being fractional
21182 * unless the send sockbuf can be emptied:
21183 */
21184 max_len = (tp->t_maxseg - optlen);
21185 if ((sb_offset + len) < sbavail(sb)) {
21186 moff = len % (u_int)max_len;
21187 if (moff != 0) {
21188 mark = 3;
21189 len -= moff;
21190 }
21191 }
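/*
 * Worked example (illustrative values): with max_len = 1448 and
 * len = 10000, moff = 10000 % 1448 = 1312, so len is trimmed to 8688,
 * exactly six full segments; the 1312-byte tail waits for a later
 * send instead of going out as a runt TSO fragment.
 */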
21192 /*
21193 * In case there are too many small fragments don't
21194 * use TSO:
21195 */
21196 if (len <= max_len) {
21197 mark = 4;
21198 tso = 0;
21199 }
21200 /*
21201 * Send the FIN in a separate segment after the bulk
21202 * sending is done. We don't trust the TSO
21203 * implementations to clear the FIN flag on all but
21204 * the last segment.
21205 */
21206 if (tp->t_flags & TF_NEEDFIN) {
21207 sendalot = 4;
21208 }
21209 } else {
21210 mark = 5;
21211 if (optlen + ipoptlen >= tp->t_maxseg) {
21212 /*
21213 * Since we don't have enough space to put
21214 * the IP header chain and the TCP header in
21215 * one packet as required by RFC 7112, don't
21216 * send it. Also ensure that at least one
21217 * byte of the payload can be put into the
21218 * TCP segment.
21219 */
21220 SOCK_SENDBUF_UNLOCK(so);
21221 error = EMSGSIZE;
21222 sack_rxmit = 0;
21223 goto out;
21224 }
21225 len = tp->t_maxseg - optlen - ipoptlen;
21226 sendalot = 5;
21227 }
21228 } else {
21229 tso = 0;
21230 mark = 6;
21231 }
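/*
 * In the non-TSO branch above, for example (illustrative numbers):
 * with t_maxseg = 1460, 12 bytes of TCP options and no IP options, a
 * 4000-byte send is clipped to 1448 bytes here and sendalot forces
 * another pass for the remainder.
 */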
21232 KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET,
21233 ("%s: len > IP_MAXPACKET", __func__));
21234 #ifdef DIAGNOSTIC
21235 #ifdef INET6
21236 if (max_linkhdr + hdrlen > MCLBYTES)
21237 #else
21238 if (max_linkhdr + hdrlen > MHLEN)
21239 #endif
21240 panic("tcphdr too big");
21241 #endif
21242
21243 /*
21244 * This KASSERT is here to catch edge cases at a well defined place.
21245 * Before, those had triggered (random) panic conditions further
21246 * down.
21247 */
21248 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
21249 if ((len == 0) &&
21250 (flags & TH_FIN) &&
21251 (sbused(sb))) {
21252 /*
21253 * We have outstanding data; don't send a FIN by itself!
21254 *
21255 * Check to see if we need to send a challenge ack.
21256 */
21257 if ((sbused(sb) == (tp->snd_max - tp->snd_una)) &&
21258 ((tp->snd_max - tp->snd_una) <= segsiz)) {
21259 /*
21260 * Ok less than or right at a MSS is
21261 * outstanding. The original FreeBSD stack would
21262 * have sent a FIN, which can speed things up for
21263 * a transactional application doing a MSG_WAITALL.
21264 * To speed things up since we do *not* send a FIN
21265 * if data is outstanding, we send a "challenge ack".
21266 * The idea is that instead of forcing the peer to wait
21267 * for its delayed-ack timer to expire, we send an ack
21268 * that elicits an immediate ack from the peer.
21269 */
21270 rack_send_ack_challange(rack);
21271 }
21272 goto just_return;
21273 }
21274 /*
21275 * Grab a header mbuf, attaching a copy of data to be transmitted,
21276 * and initialize the header from the template for sends on this
21277 * connection.
21278 */
21279 hw_tls = tp->t_nic_ktls_xmit != 0;
21280 if (len) {
21281 uint32_t max_val;
21282 uint32_t moff;
21283
21284 if (pace_max_seg)
21285 max_val = pace_max_seg;
21286 else
21287 max_val = len;
21288 /*
21289 * We allow a limit on sending with hptsi.
21290 */
21291 if (len > max_val) {
21292 mark = 7;
21293 len = max_val;
21294 }
21295 #ifdef INET6
21296 if (MHLEN < hdrlen + max_linkhdr)
21297 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
21298 else
21299 #endif
21300 m = m_gethdr(M_NOWAIT, MT_DATA);
21301
21302 if (m == NULL) {
21303 SOCK_SENDBUF_UNLOCK(so);
21304 error = ENOBUFS;
21305 sack_rxmit = 0;
21306 goto out;
21307 }
21308 m->m_data += max_linkhdr;
21309 m->m_len = hdrlen;
21310
21311 /*
21312 * Start the m_copy functions from the closest mbuf to the
21313 * sb_offset in the socket buffer chain.
21314 */
21315 mb = sbsndptr_noadv(sb, sb_offset, &moff);
21316 s_mb = mb;
21317 s_moff = moff;
21318 if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) {
21319 m_copydata(mb, moff, (int)len,
21320 mtod(m, caddr_t)+hdrlen);
21321 /*
21322 * If we are not retransmitting advance the
21323 * sndptr to help remember the next place in
21324 * the sb.
21325 */
21326 if (rsm == NULL)
21327 sbsndptr_adv(sb, mb, len);
21328 m->m_len += len;
21329 } else {
21330 struct sockbuf *msb;
21331
21332 /*
21333 * If we are not retransmitting, pass in msb so
21334 * the socket buffer can be advanced. Otherwise
21335 * set it to NULL, since on a retransmission
21336 * we don't want to change the remembered sb
21337 * location.
21338 */
21339 if (rsm == NULL)
21340 msb = sb;
21341 else
21342 msb = NULL;
21343 m->m_next = tcp_m_copym(
21344 mb, moff, &len,
21345 if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb,
21346 ((rsm == NULL) ? hw_tls : 0));
21347 if (len <= (tp->t_maxseg - optlen)) {
21348 /*
21349 * Must have run out of mbufs for the copy;
21350 * shorten it so we no longer need tso. Let's
21351 * not set sendalot since we are low on
21352 * mbufs.
21353 */
21354 tso = 0;
21355 }
21356 if (m->m_next == NULL) {
21357 SOCK_SENDBUF_UNLOCK(so);
21358 (void)m_free(m);
21359 error = ENOBUFS;
21360 sack_rxmit = 0;
21361 goto out;
21362 }
21363 }
21364 if (sack_rxmit) {
21365 if (rsm && (rsm->r_flags & RACK_TLP)) {
21366 /*
21367 * TLP should not count in retran count, but
21368 * in its own bin
21369 */
21370 counter_u64_add(rack_tlp_retran, 1);
21371 counter_u64_add(rack_tlp_retran_bytes, len);
21372 } else {
21373 tp->t_sndrexmitpack++;
21374 KMOD_TCPSTAT_INC(tcps_sndrexmitpack);
21375 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len);
21376 }
21377 #ifdef STATS
21378 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
21379 len);
21380 #endif
21381 } else {
21382 KMOD_TCPSTAT_INC(tcps_sndpack);
21383 KMOD_TCPSTAT_ADD(tcps_sndbyte, len);
21384 #ifdef STATS
21385 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
21386 len);
21387 #endif
21388 }
21389 /*
21390 * If we're sending everything we've got, set PUSH. (This
21391 * will keep happy those implementations which only give
21392 * data to the user when a buffer fills or a PUSH comes in.)
21393 */
21394 if (sb_offset + len == sbused(sb) &&
21395 sbused(sb) &&
21396 !(flags & TH_SYN)) {
21397 flags |= TH_PUSH;
21398 add_flag |= RACK_HAD_PUSH;
21399 }
21400 SOCK_SENDBUF_UNLOCK(so);
21401 } else {
21402 SOCK_SENDBUF_UNLOCK(so);
21403 if (tp->t_flags & TF_ACKNOW)
21404 KMOD_TCPSTAT_INC(tcps_sndacks);
21405 else if (flags & (TH_SYN | TH_FIN | TH_RST))
21406 KMOD_TCPSTAT_INC(tcps_sndctrl);
21407 else
21408 KMOD_TCPSTAT_INC(tcps_sndwinup);
21409
21410 m = m_gethdr(M_NOWAIT, MT_DATA);
21411 if (m == NULL) {
21412 error = ENOBUFS;
21413 sack_rxmit = 0;
21414 goto out;
21415 }
21416 #ifdef INET6
21417 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) &&
21418 MHLEN >= hdrlen) {
21419 M_ALIGN(m, hdrlen);
21420 } else
21421 #endif
21422 m->m_data += max_linkhdr;
21423 m->m_len = hdrlen;
21424 }
21425 SOCK_SENDBUF_UNLOCK_ASSERT(so);
21426 m->m_pkthdr.rcvif = (struct ifnet *)0;
21427 #ifdef MAC
21428 mac_inpcb_create_mbuf(inp, m);
21429 #endif
21430 if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) {
21431 #ifdef INET6
21432 if (isipv6)
21433 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
21434 else
21435 #endif /* INET6 */
21436 #ifdef INET
21437 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
21438 #endif
21439 th = rack->r_ctl.fsb.th;
21440 udp = rack->r_ctl.fsb.udp;
21441 if (udp) {
21442 #ifdef INET6
21443 if (isipv6)
21444 ulen = hdrlen + len - sizeof(struct ip6_hdr);
21445 else
21446 #endif /* INET6 */
21447 ulen = hdrlen + len - sizeof(struct ip);
21448 udp->uh_ulen = htons(ulen);
21449 }
21450 } else {
21451 #ifdef INET6
21452 if (isipv6) {
21453 ip6 = mtod(m, struct ip6_hdr *);
21454 if (tp->t_port) {
21455 udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr));
21456 udp->uh_sport = htons(V_tcp_udp_tunneling_port);
21457 udp->uh_dport = tp->t_port;
21458 ulen = hdrlen + len - sizeof(struct ip6_hdr);
21459 udp->uh_ulen = htons(ulen);
21460 th = (struct tcphdr *)(udp + 1);
21461 } else
21462 th = (struct tcphdr *)(ip6 + 1);
21463 tcpip_fillheaders(inp, tp->t_port, ip6, th);
21464 } else
21465 #endif /* INET6 */
21466 {
21467 #ifdef INET
21468 ip = mtod(m, struct ip *);
21469 if (tp->t_port) {
21470 udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip));
21471 udp->uh_sport = htons(V_tcp_udp_tunneling_port);
21472 udp->uh_dport = tp->t_port;
21473 ulen = hdrlen + len - sizeof(struct ip);
21474 udp->uh_ulen = htons(ulen);
21475 th = (struct tcphdr *)(udp + 1);
21476 } else
21477 th = (struct tcphdr *)(ip + 1);
21478 tcpip_fillheaders(inp, tp->t_port, ip, th);
21479 #endif
21480 }
21481 }
21482 /*
21483 * If we are starting a connection, send ECN setup SYN packet. If we
21484 * are on a retransmit, we may resend those bits a number of times
21485 * as per RFC 3168.
21486 */
21487 if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) {
21488 flags |= tcp_ecn_output_syn_sent(tp);
21489 }
21490 /* Also handle parallel SYN for ECN */
21491 if (TCPS_HAVERCVDSYN(tp->t_state) &&
21492 (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) {
21493 int ect = tcp_ecn_output_established(tp, &flags, len, sack_rxmit);
21494 if ((tp->t_state == TCPS_SYN_RECEIVED) &&
21495 (tp->t_flags2 & TF2_ECN_SND_ECE))
21496 tp->t_flags2 &= ~TF2_ECN_SND_ECE;
21497 #ifdef INET6
21498 if (isipv6) {
21499 ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20);
21500 ip6->ip6_flow |= htonl(ect << 20);
21501 }
21502 else
21503 #endif
21504 {
21505 #ifdef INET
21506 ip->ip_tos &= ~IPTOS_ECN_MASK;
21507 ip->ip_tos |= ect;
21508 #endif
21509 }
21510 }
21511 th->th_seq = htonl(rack_seq);
21512 th->th_ack = htonl(tp->rcv_nxt);
21513 tcp_set_flags(th, flags);
21514 /*
21515 * Calculate receive window. Don't shrink window, but avoid silly
21516 * window syndrome.
21517 * If a RST segment is sent, advertise a window of zero.
21518 */
21519 if (flags & TH_RST) {
21520 recwin = 0;
21521 } else {
21522 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
21523 recwin < (long)segsiz) {
21524 recwin = 0;
21525 }
21526 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
21527 recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
21528 recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
21529 }
21530
21531 /*
21532 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or
21533 * <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> case is
21534 * handled in syncache.
21535 */
21536 if (flags & TH_SYN)
21537 th->th_win = htons((u_short)
21538 (min(sbspace(&so->so_rcv), TCP_MAXWIN)));
21539 else {
21540 /* Avoid shrinking window with window scaling. */
21541 recwin = roundup2(recwin, 1 << tp->rcv_scale);
21542 th->th_win = htons((u_short)(recwin >> tp->rcv_scale));
21543 }
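/*
 * Example (illustrative): with recwin = 100000 and rcv_scale = 7,
 * roundup2() bumps recwin to 100096 (the next multiple of 128) and
 * th_win becomes 100096 >> 7 = 782, so rounding never shrinks the
 * window the peer actually sees.
 */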
21544 /*
21545 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0
21546 * window. This may cause the remote transmitter to stall. This
21547 * flag tells soreceive() to disable delayed acknowledgements when
21548 * draining the buffer. This can occur if the receiver is
21549 * attempting to read more data than can be buffered prior to
21550 * transmitting on the connection.
21551 */
21552 if (th->th_win == 0) {
21553 tp->t_sndzerowin++;
21554 tp->t_flags |= TF_RXWIN0SENT;
21555 } else
21556 tp->t_flags &= ~TF_RXWIN0SENT;
21557 tp->snd_up = tp->snd_una; /* drag it along, its deprecated */
21558 /* Now, are we using fsb? If so, copy the template data to the mbuf */
21559 if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) {
21560 uint8_t *cpto;
21561
21562 cpto = mtod(m, uint8_t *);
21563 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len);
21564 /*
21565 * We have just copied in:
21566 * IP/IP6
21567 * <optional udphdr>
21568 * tcphdr (no options)
21569 *
21570 * We need to grab the correct pointers into the mbuf
21571 * for both the tcp header, and possibly the udp header (if tunneling).
21572 * We do this by using the offset in the copy buffer and adding it
21573 * to the mbuf base pointer (cpto).
21574 */
21575 #ifdef INET6
21576 if (isipv6)
21577 ip6 = mtod(m, struct ip6_hdr *);
21578 else
21579 #endif /* INET6 */
21580 #ifdef INET
21581 ip = mtod(m, struct ip *);
21582 #endif
21583 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr));
21584 /* If we have a udp header lets set it into the mbuf as well */
21585 if (udp)
21586 udp = (struct udphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.udp - rack->r_ctl.fsb.tcp_ip_hdr));
21587 }
21588 if (optlen) {
21589 bcopy(opt, th + 1, optlen);
21590 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
21591 }
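/*
 * Example (illustrative): with only the timestamp option present
 * (optlen == TCPOLEN_TSTAMP_APPA == 12), th_off = (20 + 12) >> 2 = 8,
 * i.e. a 32-byte TCP header expressed in 32-bit words.
 */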
21592 /*
21593 * Put TCP length in extended header, and then checksum extended
21594 * header and data.
21595 */
21596 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
21597 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
21598 if (to.to_flags & TOF_SIGNATURE) {
21599 /*
21600 * Calculate MD5 signature and put it into the place
21601 * determined before.
21602 * NOTE: since TCP options buffer doesn't point into
21603 * mbuf's data, calculate offset and use it.
21604 */
21605 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th,
21606 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) {
21607 /*
21608 * Do not send segment if the calculation of MD5
21609 * digest has failed.
21610 */
21611 goto out;
21612 }
21613 }
21614 #endif
21615 #ifdef INET6
21616 if (isipv6) {
21617 /*
21618 * ip6_plen does not need to be filled in now; it will be
21619 * filled in by ip6_output.
21620 */
21621 if (tp->t_port) {
21622 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
21623 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
21624 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
21625 th->th_sum = htons(0);
21626 UDPSTAT_INC(udps_opackets);
21627 } else {
21628 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
21629 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
21630 th->th_sum = in6_cksum_pseudo(ip6,
21631 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
21632 0);
21633 }
21634 }
21635 #endif
21636 #if defined(INET6) && defined(INET)
21637 else
21638 #endif
21639 #ifdef INET
21640 {
21641 if (tp->t_port) {
21642 m->m_pkthdr.csum_flags = CSUM_UDP;
21643 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
21644 udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
21645 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
21646 th->th_sum = htons(0);
21647 UDPSTAT_INC(udps_opackets);
21648 } else {
21649 m->m_pkthdr.csum_flags = CSUM_TCP;
21650 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
21651 th->th_sum = in_pseudo(ip->ip_src.s_addr,
21652 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
21653 IPPROTO_TCP + len + optlen));
21654 }
21655 /* IP version must be set here for ipv4/ipv6 checking later */
21656 KASSERT(ip->ip_v == IPVERSION,
21657 ("%s: IP version incorrect: %d", __func__, ip->ip_v));
21658 }
21659 #endif
21660 /*
21661 * Enable TSO and specify the size of the segments. The TCP pseudo
21662 * header checksum is always provided. XXX: Fixme: This is currently
21663 * not the case for IPv6.
21664 */
21665 if (tso) {
21666 /*
21667 * Here we must use t_maxseg and the optlen since
21668 * the optlen may include SACKs (or a DSACK).
21669 */
21670 KASSERT(len > tp->t_maxseg - optlen,
21671 ("%s: len <= tso_segsz", __func__));
21672 m->m_pkthdr.csum_flags |= CSUM_TSO;
21673 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
21674 }
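/*
 * Example (illustrative): with t_maxseg = 1460 and a 12-byte
 * timestamp option, tso_segsz = 1448, so the NIC slices the large
 * payload into 1448-byte chunks and every emitted segment still fits
 * the 1460-byte MSS once its options are counted.
 */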
21675 KASSERT(len + hdrlen == m_length(m, NULL),
21676 ("%s: mbuf chain different than expected: %d + %u != %u",
21677 __func__, len, hdrlen, m_length(m, NULL)));
21678
21679 #ifdef TCP_HHOOK
21680 /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
21681 hhook_run_tcp_est_out(tp, th, &to, len, tso);
21682 #endif
21683 if ((rack->r_ctl.crte != NULL) &&
21684 (rack->rc_hw_nobuf == 0) &&
21685 tcp_bblogging_on(tp)) {
21686 rack_log_queue_level(tp, rack, len, &tv, cts);
21687 }
21688 /* We're getting ready to send; log now. */
21689 if (tcp_bblogging_on(rack->rc_tp)) {
21690 union tcp_log_stackspecific log;
21691
21692 memset(&log, 0, sizeof(log));
21693 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
21694 if (rack->rack_no_prr)
21695 log.u_bbr.flex1 = 0;
21696 else
21697 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
21698 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
21699 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
21700 log.u_bbr.flex4 = orig_len;
21701 /* Save off the early/late values */
21702 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
21703 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
21704 log.u_bbr.bw_inuse = rack_get_bw(rack);
21705 log.u_bbr.cur_del_rate = rack->r_ctl.gp_bw;
21706 log.u_bbr.flex8 = 0;
21707 if (rsm) {
21708 if (rsm->r_flags & RACK_RWND_COLLAPSED) {
21709 rack_log_collapse(rack, rsm->r_start, rsm->r_end, 0, __LINE__, 5, rsm->r_flags, rsm);
21710 counter_u64_add(rack_collapsed_win_rxt, 1);
21711 counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start));
21712 }
21713 if (doing_tlp)
21714 log.u_bbr.flex8 = 2;
21715 else
21716 log.u_bbr.flex8 = 1;
21717 } else {
21718 if (doing_tlp)
21719 log.u_bbr.flex8 = 3;
21720 }
21721 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm);
21722 log.u_bbr.flex7 = mark;
21723 log.u_bbr.flex7 <<= 8;
21724 log.u_bbr.flex7 |= pass;
21725 log.u_bbr.pkts_out = tp->t_maxseg;
21726 log.u_bbr.timeStamp = cts;
21727 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
21728 if (rsm && (rsm->r_rtr_cnt > 0)) {
21729 /*
21730 * When we have a retransmit we want to log the
21731 * burst at send and flight at send from before.
21732 */
21733 log.u_bbr.flex5 = rsm->r_fas;
21734 log.u_bbr.bbr_substate = rsm->r_bas;
21735 } else {
21736 /*
21737 * For new transmits we log the inflight again in flex5,
21738 * as well as the number of segments in our send in the
21739 * substate field.
21740 */
21741 log.u_bbr.flex5 = log.u_bbr.inflight;
21742 log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz);
21743 }
21744 log.u_bbr.lt_epoch = cwnd_to_use;
21745 log.u_bbr.delivered = sendalot;
21746 log.u_bbr.rttProp = (uintptr_t)rsm;
21747 log.u_bbr.pkt_epoch = __LINE__;
21748 if (rsm) {
21749 log.u_bbr.delRate = rsm->r_flags;
21750 log.u_bbr.delRate <<= 31;
21751 log.u_bbr.delRate |= rack->r_must_retran;
21752 log.u_bbr.delRate <<= 1;
21753 log.u_bbr.delRate |= (sack_rxmit & 0x00000001);
21754 } else {
21755 log.u_bbr.delRate = rack->r_must_retran;
21756 log.u_bbr.delRate <<= 1;
21757 log.u_bbr.delRate |= (sack_rxmit & 0x00000001);
21758 }
21759 lgb = tcp_log_event(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
21760 len, &log, false, NULL, __func__, __LINE__, &tv);
21761 } else
21762 lgb = NULL;
21763
21764 /*
21765 * Fill in IP length and desired time to live and send to IP level.
21766 * There should be a better way to handle ttl and tos; we could keep
21767 * them in the template, but need a way to checksum without them.
21768 */
21769 /*
21770 * m->m_pkthdr.len should have been set before checksum calculation,
21771 * because in6_cksum() needs it.
21772 */
21773 #ifdef INET6
21774 if (isipv6) {
21775 /*
21776 * we separately set hoplimit for every segment, since the
21777 * user might want to change the value via setsockopt. Also,
21778 * desired default hop limit might be changed via Neighbor
21779 * Discovery.
21780 */
21781 rack->r_ctl.fsb.hoplimit = ip6->ip6_hlim = in6_selecthlim(inp, NULL);
21782
21783 /*
21784 * Set the packet size here for the benefit of DTrace
21785 * probes. ip6_output() will set it properly; it's supposed
21786 * to include the option header lengths as well.
21787 */
21788 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));
21789
21790 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
21791 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
21792 else
21793 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
21794
21795 if (tp->t_state == TCPS_SYN_SENT)
21796 TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);
21797
21798 TCP_PROBE5(send, NULL, tp, ip6, tp, th);
21799 /* TODO: IPv6 IP6TOS_ECT bit on */
21800 error = ip6_output(m,
21801 inp->in6p_outputopts,
21802 &inp->inp_route6,
21803 ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0),
21804 NULL, NULL, inp);
21805
21806 if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL)
21807 mtu = inp->inp_route6.ro_nh->nh_mtu;
21808 }
21809 #endif /* INET6 */
21810 #if defined(INET) && defined(INET6)
21811 else
21812 #endif
21813 #ifdef INET
21814 {
21815 ip->ip_len = htons(m->m_pkthdr.len);
21816 #ifdef INET6
21817 if (inp->inp_vflag & INP_IPV6PROTO)
21818 ip->ip_ttl = in6_selecthlim(inp, NULL);
21819 #endif /* INET6 */
21820 rack->r_ctl.fsb.hoplimit = ip->ip_ttl;
21821 /*
21822 * If we do path MTU discovery, then we set DF on every
21823 * packet. This might not be the best thing to do according
21824 * to RFC3390 Section 2. However the tcp hostcache mitigates
21825 * the problem so it affects only the first tcp connection
21826 * with a host.
21827 *
21828 * NB: Don't set DF on small MTU/MSS to have a safe
21829 * fallback.
21830 */
21831 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
21832 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
21833 if (tp->t_port == 0 || len < V_tcp_minmss) {
21834 ip->ip_off |= htons(IP_DF);
21835 }
21836 } else {
21837 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
21838 }
21839
21840 if (tp->t_state == TCPS_SYN_SENT)
21841 TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
21842
21843 TCP_PROBE5(send, NULL, tp, ip, tp, th);
21844
21845 error = ip_output(m,
21846 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
21847 inp->inp_options,
21848 #else
21849 NULL,
21850 #endif
21851 &inp->inp_route,
21852 ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0,
21853 inp);
21854 if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL)
21855 mtu = inp->inp_route.ro_nh->nh_mtu;
21856 }
21857 #endif /* INET */
21858 if (lgb) {
21859 lgb->tlb_errno = error;
21860 lgb = NULL;
21861 }
21862
21863 out:
21864 /*
21865 * In transmit state, time the transmission and arrange for the
21866 * retransmit. In persist state, just set snd_max.
21867 */
21868 if ((rsm == NULL) && doing_tlp)
21869 add_flag |= RACK_TLP;
21870 rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error,
21871 rack_to_usec_ts(&tv),
21872 rsm, add_flag, s_mb, s_moff, hw_tls, segsiz);
21873 if (error == 0) {
21874 if (add_flag & RACK_IS_PCM) {
21875 /* We just launched a PCM */
21876 /* rrs here log */
21877 rack->pcm_in_progress = 1;
21878 rack->pcm_needed = 0;
21879 rack_log_pcm(rack, 7, len, rack->r_ctl.pcm_max_seg, add_flag);
21880 }
21881 if (rsm == NULL) {
21882 if (rack->lt_bw_up == 0) {
21883 rack->r_ctl.lt_timemark = tcp_tv_to_lusec(&tv);
21884 rack->r_ctl.lt_seq = tp->snd_una;
21885 rack->lt_bw_up = 1;
21886 } else if (((rack_seq + len) - rack->r_ctl.lt_seq) > 0x7fffffff) {
21887 /*
21888 * Need to record what we have since we are
21889 * approaching seq wrap.
21890 */
21891 uint64_t tmark;
21892
21893 rack->r_ctl.lt_bw_bytes += (tp->snd_una - rack->r_ctl.lt_seq);
21894 rack->r_ctl.lt_seq = tp->snd_una;
21895 tmark = tcp_get_u64_usecs(&tv);
21896 if (tmark > rack->r_ctl.lt_timemark) {
21897 rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark);
21898 rack->r_ctl.lt_timemark = tmark;
21899 }
21900 }
21901 }
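/*
 * The 0x7fffffff test above is, in effect, a guard against sequence
 * wrap: the running totals are folded into lt_bw_bytes/lt_bw_time
 * before the unaccounted span reaches half the 32-bit sequence space,
 * keeping the unsigned subtraction from lt_seq unambiguous.
 */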
21902 rack->forced_ack = 0; /* If we send something zap the FA flag */
21903 counter_u64_add(rack_total_bytes, len);
21904 tcp_account_for_send(tp, len, (rsm != NULL), doing_tlp, hw_tls);
21905 if (rsm && doing_tlp) {
21906 rack->rc_last_sent_tlp_past_cumack = 0;
21907 rack->rc_last_sent_tlp_seq_valid = 1;
21908 rack->r_ctl.last_sent_tlp_seq = rsm->r_start;
21909 rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start;
21910 }
21911 if (rack->rc_hw_nobuf) {
21912 rack->rc_hw_nobuf = 0;
21913 rack->r_ctl.rc_agg_delayed = 0;
21914 rack->r_early = 0;
21915 rack->r_late = 0;
21916 rack->r_ctl.rc_agg_early = 0;
21917 }
21918 if (rsm && (doing_tlp == 0)) {
21919 /* Set we retransmitted */
21920 rack->rc_gp_saw_rec = 1;
21921 } else {
21922 if (cwnd_to_use > tp->snd_ssthresh) {
21923 /* Set we sent in CA */
21924 rack->rc_gp_saw_ca = 1;
21925 } else {
21926 /* Set we sent in SS */
21927 rack->rc_gp_saw_ss = 1;
21928 }
21929 }
21930 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
21931 (tp->t_flags & TF_SACK_PERMIT) &&
21932 tp->rcv_numsacks > 0)
21933 tcp_clean_dsack_blocks(tp);
21934 tot_len_this_send += len;
21935 if (len == 0) {
21936 counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1);
21937 } else {
21938 int idx;
21939
21940 idx = (len / segsiz) + 3;
21941 if (idx >= TCP_MSS_ACCT_ATIMER)
21942 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
21943 else
21944 counter_u64_add(rack_out_size[idx], 1);
21945 }
21946 }
21947 if ((rack->rack_no_prr == 0) &&
21948 sub_from_prr &&
21949 (error == 0)) {
21950 if (rack->r_ctl.rc_prr_sndcnt >= len)
21951 rack->r_ctl.rc_prr_sndcnt -= len;
21952 else
21953 rack->r_ctl.rc_prr_sndcnt = 0;
21954 }
21955 sub_from_prr = 0;
21956 if (rsm != NULL) {
21957 if (doing_tlp)
21958 /* Make sure the TLP is added */
21959 rsm->r_flags |= RACK_TLP;
21960 else
21961 /* If it's a resend without TLP then it must not have the flag */
21962 rsm->r_flags &= ~RACK_TLP;
21963 }
21964 if ((error == 0) &&
21965 (len > 0) &&
21966 (tp->snd_una == tp->snd_max))
21967 rack->r_ctl.rc_tlp_rxt_last_time = cts;
21968
21969 {
21970 /*
21971 * This block is not associated with the above error == 0 test.
21972 * It is used to advance snd_max if we have a new transmit.
21973 */
21974 tcp_seq startseq = tp->snd_max;
21975
21976
21977 if (rsm && (doing_tlp == 0))
21978 rack->r_ctl.rc_loss_count += rsm->r_end - rsm->r_start;
21979 if (error)
21980 /* We don't log or do anything with errors */
21981 goto nomore;
21982 if (doing_tlp == 0) {
21983 if (rsm == NULL) {
21984 /*
21985 * Not a retransmission of some
21986 * sort, new data is going out so
21987 * clear our TLP count and flag.
21988 */
21989 rack->rc_tlp_in_progress = 0;
21990 rack->r_ctl.rc_tlp_cnt_out = 0;
21991 }
21992 } else {
21993 /*
21994 * We have just sent a TLP, mark that it is true
21995 * and make sure our in progress is set so we
21996 * continue to check the count.
21997 */
21998 rack->rc_tlp_in_progress = 1;
21999 rack->r_ctl.rc_tlp_cnt_out++;
22000 }
22001 /*
22002 * If we are retransmitting we are done, snd_max
22003 * does not get updated.
22004 */
22005 if (sack_rxmit)
22006 goto nomore;
22007 if ((tp->snd_una == tp->snd_max) && (len > 0)) {
22008 /*
22009 * Update the time we just added data since
22010 * nothing was outstanding.
22011 */
22012 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
22013 tp->t_acktime = ticks;
22014 }
22015 /*
22016 * Now for special SYN/FIN handling.
22017 */
22018 if (flags & (TH_SYN | TH_FIN)) {
22019 if ((flags & TH_SYN) &&
22020 ((tp->t_flags & TF_SENTSYN) == 0)) {
22021 tp->snd_max++;
22022 tp->t_flags |= TF_SENTSYN;
22023 }
22024 if ((flags & TH_FIN) &&
22025 ((tp->t_flags & TF_SENTFIN) == 0)) {
22026 tp->snd_max++;
22027 tp->t_flags |= TF_SENTFIN;
22028 }
22029 }
22030 tp->snd_max += len;
22031 if (rack->rc_new_rnd_needed) {
22032 rack_new_round_starts(tp, rack, tp->snd_max);
22033 }
22034 /*
22035 * Time this transmission if not a retransmission and
22036 * not currently timing anything.
22037 * This is only relevant in case of switching back to
22038 * the base stack.
22039 */
22040 if (tp->t_rtttime == 0) {
22041 tp->t_rtttime = ticks;
22042 tp->t_rtseq = startseq;
22043 KMOD_TCPSTAT_INC(tcps_segstimed);
22044 }
22045 if (len &&
22046 ((tp->t_flags & TF_GPUTINPROG) == 0))
22047 rack_start_gp_measurement(tp, rack, startseq, sb_offset);
22048 /*
22049 * If we are doing FO we need to update the mbuf position and subtract
22050 * what we sent; this also happens when the peer sends us duplicate
22051 * information and we thus want to send a DSACK.
22052 *
22053 * XXXRRS: This brings to mind a question: when we send a DSACK block,
22054 * is TSO turned off? If not then we are going to echo multiple DSACK
22055 * blocks out (with the TSO), which we should not be doing.
22056 */
22057 if (rack->r_fast_output && len) {
22058 if (rack->r_ctl.fsb.left_to_send > len)
22059 rack->r_ctl.fsb.left_to_send -= len;
22060 else
22061 rack->r_ctl.fsb.left_to_send = 0;
22062 if (rack->r_ctl.fsb.left_to_send < segsiz)
22063 rack->r_fast_output = 0;
22064 if (rack->r_fast_output) {
22065 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
22066 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
22067 rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(rack->r_ctl.fsb.m);
22068 }
22069 }
22070 if (rack_pcm_blast == 0) {
22071 if ((orig_len > len) &&
22072 (add_flag & RACK_IS_PCM) &&
22073 (len < pace_max_seg) &&
22074 ((pace_max_seg - len) > segsiz)) {
22075 /*
22076 * We are doing a PCM measurement and we did
22077 * not get enough data in the TSO to meet the
22078 * burst requirement.
22079 */
22080 uint32_t n_len;
22081
22082 n_len = (orig_len - len);
22083 orig_len -= len;
22084 pace_max_seg -= len;
22085 len = n_len;
22086 sb_offset = tp->snd_max - tp->snd_una;
22087 /* Re-lock for the next spin */
22088 SOCK_SENDBUF_LOCK(so);
22089 goto send;
22090 }
22091 } else {
22092 if ((orig_len > len) &&
22093 (add_flag & RACK_IS_PCM) &&
22094 ((orig_len - len) > segsiz)) {
22095 /*
22096 * We are doing a PCM measurement and we did
22097 * not get enough data in the TSO to meet the
22098 * burst requirement.
22099 */
22100 uint32_t n_len;
22101
22102 n_len = (orig_len - len);
22103 orig_len -= len;
22104 len = n_len;
22105 sb_offset = tp->snd_max - tp->snd_una;
22106 /* Re-lock for the next spin */
22107 SOCK_SENDBUF_LOCK(so);
22108 goto send;
22109 }
22110 }
22111 }
22112 nomore:
22113 if (error) {
22114 rack->r_ctl.rc_agg_delayed = 0;
22115 rack->r_early = 0;
22116 rack->r_late = 0;
22117 rack->r_ctl.rc_agg_early = 0;
22118 SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */
22119 /*
22120 * Failures do not advance the seq counter above. For the
22121 * case of ENOBUFS we will fall out and retry shortly
22122 * (at least 10ms) with the hpts. Everything else will
22123 * just have to retransmit with the timer.
22124 *
22125 * In any case, we do not want to loop around for another
22126 * send without a good reason.
22127 */
22128 sendalot = 0;
22129 switch (error) {
22130 case EPERM:
22131 case EACCES:
22132 tp->t_softerror = error;
22133 #ifdef TCP_ACCOUNTING
22134 crtsc = get_cyclecount();
22135 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
22136 tp->tcp_cnt_counters[SND_OUT_FAIL]++;
22137 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
22138 }
22139 sched_unpin();
22140 #endif
22141 return (error);
22142 case ENOBUFS:
22143 /*
22144 * Pace us right away to retry in some
22145 * time
22146 */
22147 if (rack->r_ctl.crte != NULL) {
22148 tcp_trace_point(rack->rc_tp, TCP_TP_HWENOBUF);
22149 if (tcp_bblogging_on(rack->rc_tp))
22150 rack_log_queue_level(tp, rack, len, &tv, cts);
22151 } else
22152 tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF);
22153 pacing_delay = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
22154 if (rack->rc_enobuf < 0x7f)
22155 rack->rc_enobuf++;
22156 if (pacing_delay < (10 * HPTS_USEC_IN_MSEC))
22157 pacing_delay = 10 * HPTS_USEC_IN_MSEC;
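/*
 * Back-off arithmetic (illustrative): HPTS_USEC_IN_MSEC is 1000, so
 * the first ENOBUFS computes 1 ms, which the clamp just above raises
 * to the 10 ms floor; the delay only starts growing once rc_enobuf
 * exceeds 9, adding 1 ms per further failure up to the 0x7f cap.
 */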
22158 if (rack->r_ctl.crte != NULL) {
22159 counter_u64_add(rack_saw_enobuf_hw, 1);
22160 tcp_rl_log_enobuf(rack->r_ctl.crte);
22161 }
22162 counter_u64_add(rack_saw_enobuf, 1);
22163 goto enobufs;
22164 case EMSGSIZE:
22165 /*
22166 * For some reason the interface we used initially
22167 * to send segments changed to another or lowered
22168 * its MTU. If TSO was active we either got an
22169 * interface without TSO capabilities or TSO was
22170 * turned off. If we obtained mtu from ip_output()
22171 * then update it and try again.
22172 */
22173 if (tso)
22174 tp->t_flags &= ~TF_TSO;
22175 if (mtu != 0) {
22176 int saved_mtu;
22177
22178 saved_mtu = tp->t_maxseg;
22179 tcp_mss_update(tp, -1, mtu, NULL, NULL);
22180 if (saved_mtu > tp->t_maxseg) {
22181 goto again;
22182 }
22183 }
22184 pacing_delay = 10 * HPTS_USEC_IN_MSEC;
22185 rack_start_hpts_timer(rack, tp, cts, pacing_delay, 0, 0);
22186 #ifdef TCP_ACCOUNTING
22187 crtsc = get_cyclecount();
22188 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
22189 tp->tcp_cnt_counters[SND_OUT_FAIL]++;
22190 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
22191 }
22192 sched_unpin();
22193 #endif
22194 return (error);
22195 case ENETUNREACH:
22196 counter_u64_add(rack_saw_enetunreach, 1);
22197 /* FALLTHROUGH */
22198 case EHOSTDOWN:
22199 case EHOSTUNREACH:
22200 case ENETDOWN:
22201 if (TCPS_HAVERCVDSYN(tp->t_state)) {
22202 tp->t_softerror = error;
22203 error = 0;
22204 }
22205 /* FALLTHROUGH */
22206 default:
22207 pacing_delay = 10 * HPTS_USEC_IN_MSEC;
22208 rack_start_hpts_timer(rack, tp, cts, pacing_delay, 0, 0);
22209 #ifdef TCP_ACCOUNTING
22210 crtsc = get_cyclecount();
22211 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
22212 tp->tcp_cnt_counters[SND_OUT_FAIL]++;
22213 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val);
22214 }
22215 sched_unpin();
22216 #endif
22217 return (error);
22218 }
22219 } else {
22220 rack->rc_enobuf = 0;
22221 if (IN_FASTRECOVERY(tp->t_flags) && rsm)
22222 rack->r_ctl.retran_during_recovery += len;
22223 }
22224 KMOD_TCPSTAT_INC(tcps_sndtotal);
22225
22226 /*
22227 * Data sent (as far as we can tell). If this advertises a larger
22228 * window than any other segment, then remember the size of the
22229 * advertised window. Any pending ACK has now been sent.
22230 */
22231 if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
22232 tp->rcv_adv = tp->rcv_nxt + recwin;
22233
22234 tp->last_ack_sent = tp->rcv_nxt;
22235 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
22236 enobufs:
22237 if (sendalot) {
22238 /* Do we need to turn off sendalot? */
22239 if (pace_max_seg &&
22240 (tot_len_this_send >= pace_max_seg)) {
22241 /* We hit our max. */
22242 sendalot = 0;
22243 }
22244 }
22245 if ((error == 0) && (flags & TH_FIN))
22246 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN);
22247 if (flags & TH_RST) {
22248 /*
22249 * We don't send again after sending a RST.
22250 */
22251 pacing_delay = 0;
22252 sendalot = 0;
22253 if (error == 0)
22254 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
22255 } else if ((pacing_delay == 0) && (sendalot == 0) && tot_len_this_send) {
22256 /*
22257 * Get our pacing rate; if an error
22258 * occurred in sending (ENOBUFS) we would
22259 * hit the else-if with pacing_delay preset.
22260 * Other errors return.
22261 */
22262 pacing_delay = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz, __LINE__);
22263 }
22264 /* We have sent, clear the flag */
22265 rack->r_ent_rec_ns = 0;
22266 if (rack->r_must_retran) {
22267 if (rsm) {
22268 rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start);
22269 if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) {
22270 /*
22271 * We have retransmitted all.
22272 */
22273 rack->r_must_retran = 0;
22274 rack->r_ctl.rc_out_at_rto = 0;
22275 }
22276 } else if (SEQ_GEQ(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) {
22277 /*
22278 * Sending new data will also kill
22279 * the loop.
22280 */
22281 rack->r_must_retran = 0;
22282 rack->r_ctl.rc_out_at_rto = 0;
22283 }
22284 }
22285 rack->r_ctl.fsb.recwin = recwin;
22286 if ((tp->t_flags & (TF_WASCRECOVERY|TF_WASFRECOVERY)) &&
22287 SEQ_GT(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) {
22288 /*
22289 * We hit an RTO and have now passed the snd_max at the
22290 * RTO; clear all the WAS flags.
22291 */
22292 tp->t_flags &= ~(TF_WASCRECOVERY|TF_WASFRECOVERY);
22293 }
22294 if (pacing_delay) {
22295 /* set the rack tcb into the slot N */
22296 if ((error == 0) &&
22297 rack_use_rfo &&
22298 ((flags & (TH_SYN|TH_FIN)) == 0) &&
22299 (rsm == NULL) &&
22300 (ipoptlen == 0) &&
22301 (doing_tlp == 0) &&
22302 rack->r_fsb_inited &&
22303 TCPS_HAVEESTABLISHED(tp->t_state) &&
22304 ((IN_RECOVERY(tp->t_flags)) == 0) &&
22305 (rack->r_must_retran == 0) &&
22306 ((tp->t_flags & TF_NEEDFIN) == 0) &&
22307 (len > 0) && (orig_len > 0) &&
22308 (orig_len > len) &&
22309 ((orig_len - len) >= segsiz) &&
22310 ((optlen == 0) ||
22311 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) {
22312 /* We can send at least one more MSS using our fsb */
22313 rack_setup_fast_output(tp, rack, sb, len, orig_len,
22314 segsiz, pace_max_seg, hw_tls, flags);
22315 } else
22316 rack->r_fast_output = 0;
22317 rack_log_fsb(rack, tp, so, flags,
22318 ipoptlen, orig_len, len, error,
22319 (rsm == NULL), optlen, __LINE__, 2);
22320 } else if (sendalot) {
22321 int ret;
22322
22323 sack_rxmit = 0;
22324 if ((error == 0) &&
22325 rack_use_rfo &&
22326 ((flags & (TH_SYN|TH_FIN)) == 0) &&
22327 (rsm == NULL) &&
22328 (doing_tlp == 0) &&
22329 (ipoptlen == 0) &&
22330 (rack->r_must_retran == 0) &&
22331 rack->r_fsb_inited &&
22332 TCPS_HAVEESTABLISHED(tp->t_state) &&
22333 ((IN_RECOVERY(tp->t_flags)) == 0) &&
22334 ((tp->t_flags & TF_NEEDFIN) == 0) &&
22335 (len > 0) && (orig_len > 0) &&
22336 (orig_len > len) &&
22337 ((orig_len - len) >= segsiz) &&
22338 ((optlen == 0) ||
22339 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) {
22340 /* we can use fast_output for more */
22341 rack_setup_fast_output(tp, rack, sb, len, orig_len,
22342 segsiz, pace_max_seg, hw_tls, flags);
22343 if (rack->r_fast_output) {
22344 error = 0;
22345 ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, &tot_len_this_send, &error, __LINE__);
22346 if (ret >= 0)
22347 return (ret);
22348 else if (error)
22349 goto nomore;
22350
22351 }
22352 }
22353 goto again;
22354 }
22355 skip_all_send:
22356 /* Assure when we leave that snd_nxt will point to top */
22357 if (SEQ_GT(tp->snd_max, tp->snd_nxt))
22358 tp->snd_nxt = tp->snd_max;
22359 rack_start_hpts_timer(rack, tp, cts, pacing_delay, tot_len_this_send, 0);
22360 #ifdef TCP_ACCOUNTING
22361 crtsc = get_cyclecount() - ts_val;
22362 if (tot_len_this_send) {
22363 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
22364 tp->tcp_cnt_counters[SND_OUT_DATA]++;
22365 tp->tcp_proc_time[SND_OUT_DATA] += crtsc;
22366 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) /segsiz);
22367 }
22368 } else {
22369 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
22370 tp->tcp_cnt_counters[SND_OUT_ACK]++;
22371 tp->tcp_proc_time[SND_OUT_ACK] += crtsc;
22372 }
22373 }
22374 sched_unpin();
22375 #endif
22376 if (error == ENOBUFS)
22377 error = 0;
22378 return (error);
22379 }
22380
22381 static void
22382 rack_update_seg(struct tcp_rack *rack)
22383 {
22384 uint32_t orig_val;
22385
22386 orig_val = rack->r_ctl.rc_pace_max_segs;
22387 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
22388 if (orig_val != rack->r_ctl.rc_pace_max_segs)
22389 rack_log_pacing_delay_calc(rack, 0, 0, orig_val, 0, 0, 15, __LINE__, NULL, 0);
22390 }
22391
22392 static void
22393 rack_mtu_change(struct tcpcb *tp)
22394 {
22395 /*
22396 * The MSS may have changed
22397 */
22398 struct tcp_rack *rack;
22399 struct rack_sendmap *rsm;
22400
22401 rack = (struct tcp_rack *)tp->t_fb_ptr;
22402 if (rack->r_ctl.rc_pace_min_segs != ctf_fixed_maxseg(tp)) {
22403 /*
22404 * The MTU has changed; we need to resend everything
22405 * since all we have sent is lost. We first fix
22406 * up the MTU though.
22407 */
22408 rack_set_pace_segments(tp, rack, __LINE__, NULL);
22409 /* We treat this like a full retransmit timeout without the cwnd adjustment */
22410 rack_remxt_tmr(tp);
22411 rack->r_fast_output = 0;
22412 rack->r_ctl.rc_out_at_rto = ctf_flight_size(tp,
22413 rack->r_ctl.rc_sacked);
22414 rack->r_ctl.rc_snd_max_at_rto = tp->snd_max;
22415 rack->r_must_retran = 1;
22416 /* Mark all inflight as needing to be rxt'd */
22417 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) {
22418 rsm->r_flags |= (RACK_MUST_RXT|RACK_PMTU_CHG);
22419 }
22420 }
22421 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
22422 /* We don't use snd_nxt to retransmit */
22423 tp->snd_nxt = tp->snd_max;
22424 }
22425
22426 static int
22427 rack_set_dgp(struct tcp_rack *rack)
22428 {
22429 if (rack->dgp_on == 1)
22430 return(0);
22431 if ((rack->use_fixed_rate == 1) &&
22432 (rack->rc_always_pace == 1)) {
22433 /*
22434 * We are already pacing another
22435 * way.
22436 */
22437 return (EBUSY);
22438 }
22439 if (rack->rc_always_pace == 1) {
22440 rack_remove_pacing(rack);
22441 }
22442 if (tcp_incr_dgp_pacing_cnt() == 0)
22443 return (ENOSPC);
22444 rack->r_ctl.pacing_method |= RACK_DGP_PACING;
22445 rack->rc_fillcw_apply_discount = 0;
22446 rack->dgp_on = 1;
22447 rack->rc_always_pace = 1;
22448 rack->rc_pace_dnd = 1;
22449 rack->use_fixed_rate = 0;
22450 if (rack->gp_ready)
22451 rack_set_cc_pacing(rack);
22452 rack->rc_tp->t_flags2 |= TF2_SUPPORTS_MBUFQ;
22453 rack->rack_attempt_hdwr_pace = 0;
22454 /* rxt settings */
22455 rack->full_size_rxt = 1;
22456 rack->shape_rxt_to_pacing_min = 0;
22457 /* cmpack=1 */
22458 rack->r_use_cmp_ack = 1;
22459 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) &&
22460 rack->r_use_cmp_ack)
22461 rack->rc_tp->t_flags2 |= TF2_MBUF_ACKCMP;
22462 /* scwnd=1 */
22463 rack->rack_enable_scwnd = 1;
22464 /* dynamic=100 */
22465 rack->rc_gp_dyn_mul = 1;
22466 /* gp_inc_ca */
22467 rack->r_ctl.rack_per_of_gp_ca = 100;
22468 /* rrr_conf=3 */
22469 rack->r_rr_config = 3;
22470 /* npush=2 */
22471 rack->r_ctl.rc_no_push_at_mrtt = 2;
22472 /* fillcw=1 */
22473 rack->rc_pace_to_cwnd = 1;
22474 rack->rc_pace_fill_if_rttin_range = 0;
22475 rack->rtt_limit_mul = 0;
22476 /* noprr=1 */
22477 rack->rack_no_prr = 1;
22478 /* lscwnd=1 */
22479 rack->r_limit_scw = 1;
22480 /* gp_inc_rec */
22481 rack->r_ctl.rack_per_of_gp_rec = 90;
22482 return (0);
22483 }
22484
22485 static int
22486 rack_set_profile(struct tcp_rack *rack, int prof)
22487 {
22488 int err = EINVAL;
22489 if (prof == 1) {
22490 /*
22491 * Profile 1 is "standard" DGP. It ignores
22492 * client buffer level.
22493 */
22494 err = rack_set_dgp(rack);
22495 if (err)
22496 return (err);
22497 } else if (prof == 6) {
22498 err = rack_set_dgp(rack);
22499 if (err)
22500 return (err);
22501 /*
22502 * Profile 6 tweaks DGP so that it applies to
22503 * fill-cw the same settings that profile 5 uses
22504 * to replace DGP. It then gets the max of the DGP rate and the discounted fill-cw rate.
22505 */
22506 rack->rc_fillcw_apply_discount = 1;
22507 } else if (prof == 0) {
22508 /* This changes things back to the default settings */
22509 if (rack->rc_always_pace == 1) {
22510 rack_remove_pacing(rack);
22511 } else {
22512 /* Make sure any stray flags are off */
22513 rack->dgp_on = 0;
22514 rack->rc_hybrid_mode = 0;
22515 rack->use_fixed_rate = 0;
22516 }
22517 err = 0;
22518 if (rack_fill_cw_state)
22519 rack->rc_pace_to_cwnd = 1;
22520 else
22521 rack->rc_pace_to_cwnd = 0;
22522
22523 if (rack_pace_every_seg && tcp_can_enable_pacing()) {
22524 rack->r_ctl.pacing_method |= RACK_REG_PACING;
22525 rack->rc_always_pace = 1;
22526 if (rack->rack_hibeta)
22527 rack_set_cc_pacing(rack);
22528 } else
22529 rack->rc_always_pace = 0;
22530 if (rack_dsack_std_based & 0x1) {
22531 /* Basically this means all rack timers are at least (srtt + 1/4 srtt) */
22532 rack->rc_rack_tmr_std_based = 1;
22533 }
22534 if (rack_dsack_std_based & 0x2) {
22535 /* Basically this means rack timers are extended based on dsack by up to (2 * srtt) */
22536 rack->rc_rack_use_dsack = 1;
22537 }
22538 if (rack_use_cmp_acks)
22539 rack->r_use_cmp_ack = 1;
22540 else
22541 rack->r_use_cmp_ack = 0;
22542 if (rack_disable_prr)
22543 rack->rack_no_prr = 1;
22544 else
22545 rack->rack_no_prr = 0;
22546 if (rack_gp_no_rec_chg)
22547 rack->rc_gp_no_rec_chg = 1;
22548 else
22549 rack->rc_gp_no_rec_chg = 0;
22550 if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack) {
22551 rack->r_mbuf_queue = 1;
22552 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state))
22553 rack->rc_tp->t_flags2 |= TF2_MBUF_ACKCMP;
22554 rack->rc_tp->t_flags2 |= TF2_SUPPORTS_MBUFQ;
22555 } else {
22556 rack->r_mbuf_queue = 0;
22557 rack->rc_tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ;
22558 }
22559 if (rack_enable_shared_cwnd)
22560 rack->rack_enable_scwnd = 1;
22561 else
22562 rack->rack_enable_scwnd = 0;
22563 if (rack_do_dyn_mul) {
22564 /* When dynamic adjustment is on, CA needs to start at 100% */
22565 rack->rc_gp_dyn_mul = 1;
22566 if (rack_do_dyn_mul >= 100)
22567 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul;
22568 } else {
22569 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca;
22570 rack->rc_gp_dyn_mul = 0;
22571 }
22572 rack->r_rr_config = 0;
22573 rack->r_ctl.rc_no_push_at_mrtt = 0;
22574 rack->rc_pace_fill_if_rttin_range = 0;
22575 rack->rtt_limit_mul = 0;
22576
22577 if (rack_enable_hw_pacing)
22578 rack->rack_hdw_pace_ena = 1;
22579 else
22580 rack->rack_hdw_pace_ena = 0;
22581 if (rack_disable_prr)
22582 rack->rack_no_prr = 1;
22583 else
22584 rack->rack_no_prr = 0;
22585 if (rack_limits_scwnd)
22586 rack->r_limit_scw = 1;
22587 else
22588 rack->r_limit_scw = 0;
22589 rack_init_retransmit_value(rack, rack_rxt_controls);
22590 err = 0;
22591 }
22592 return (err);
22593 }
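/*
 * Illustrative, non-kernel sketch of how a userland application would drive
 * rack_set_profile() through the TCP_RACK_PROFILE socket option handled in
 * rack_process_option()/rack_set_sockopt() below. This is an assumption-laden
 * usage example, not part of the stack; "fd" is assumed to be a connected TCP
 * socket already running the RACK stack.
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <stdio.h>
 *
 *	int profile = 1;	// 1 = standard DGP, 6 = DGP + discounted fill-cw, 0 = defaults
 *
 *	if (setsockopt(fd, IPPROTO_TCP, TCP_RACK_PROFILE,
 *	    &profile, sizeof(profile)) == -1)
 *		perror("TCP_RACK_PROFILE");
 *
 * Errors from rack_set_profile() (for example EBUSY or ENOSPC out of
 * rack_set_dgp()) are returned to the caller through setsockopt().
 */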
22594
22595 static int
22596 rack_add_deferred_option(struct tcp_rack *rack, int sopt_name, uint64_t loptval)
22597 {
22598 struct deferred_opt_list *dol;
22599
22600 dol = malloc(sizeof(struct deferred_opt_list),
22601 M_TCPDO, M_NOWAIT|M_ZERO);
22602 if (dol == NULL) {
22603 /*
22604 * No space, yikes -- fail out.
22605 */
22606 return (0);
22607 }
22608 dol->optname = sopt_name;
22609 dol->optval = loptval;
22610 TAILQ_INSERT_TAIL(&rack->r_ctl.opt_list, dol, next);
22611 return (1);
22612 }
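/*
 * Sketch of the deferral flow, for illustration only (the exact trigger for
 * replay lives outside this excerpt and is an assumption here): when
 * TCP_DEFER_OPTIONS is set and the connection is not yet gp_ready,
 * rack_set_sockopt() queues most options here rather than applying them, and
 * rack_apply_deferred_options() later replays each queued (optname, optval)
 * pair through rack_process_option(). A hypothetical userland sequence:
 *
 *	int one = 1, divisor = 8;
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_DEFER_OPTIONS, &one, sizeof(one));
 *	// queued via rack_add_deferred_option() until the stack is ready
 *	setsockopt(fd, IPPROTO_TCP, TCP_RACK_PACING_DIVISOR, &divisor, sizeof(divisor));
 *
 * As noted in rack_apply_deferred_options(), a deferred option loses its
 * individual error return.
 */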
22613
22614 static int
22615 process_hybrid_pacing(struct tcp_rack *rack, struct tcp_hybrid_req *hybrid)
22616 {
22617 #ifdef TCP_REQUEST_TRK
22618 struct tcp_sendfile_track *sft;
22619 struct timeval tv;
22620 tcp_seq seq;
22621 int err;
22622
22623 microuptime(&tv);
22624
22625 /* Make sure no fixed rate is on */
22626 rack->use_fixed_rate = 0;
22627 rack->r_ctl.rc_fixed_pacing_rate_rec = 0;
22628 rack->r_ctl.rc_fixed_pacing_rate_ca = 0;
22629 rack->r_ctl.rc_fixed_pacing_rate_ss = 0;
22630 /* Now allocate or find our entry that will have these settings */
22631 sft = tcp_req_alloc_req_full(rack->rc_tp, &hybrid->req, tcp_tv_to_lusec(&tv), 0);
22632 if (sft == NULL) {
22633 rack->rc_tp->tcp_hybrid_error++;
22634 /* no space, where would it have gone? */
22635 seq = rack->rc_tp->snd_una + rack->rc_tp->t_inpcb.inp_socket->so_snd.sb_ccc;
22636 rack_log_hybrid(rack, seq, NULL, HYBRID_LOG_NO_ROOM, __LINE__, 0);
22637 return (ENOSPC);
22638 }
22639 /* mask our internal flags */
22640 hybrid->hybrid_flags &= TCP_HYBRID_PACING_USER_MASK;
22641 /* The seq will be snd_una + everything in the buffer */
22642 seq = sft->start_seq;
22643 if ((hybrid->hybrid_flags & TCP_HYBRID_PACING_ENABLE) == 0) {
22644 /* Disabling hybrid pacing */
22645 if (rack->rc_hybrid_mode) {
22646 rack_set_profile(rack, 0);
22647 rack->rc_tp->tcp_hybrid_stop++;
22648 }
22649 rack_log_hybrid(rack, seq, sft, HYBRID_LOG_TURNED_OFF, __LINE__, 0);
22650 return (0);
22651 }
22652 if (rack->dgp_on == 0) {
22653 /*
22654 * If we have not yet turned DGP on, do so
22655 * now setting pure DGP mode, no buffer level
22656 * response.
22657 */
22658 if ((err = rack_set_profile(rack, 1)) != 0){
22659 /* Failed to turn pacing on */
22660 rack->rc_tp->tcp_hybrid_error++;
22661 rack_log_hybrid(rack, seq, sft, HYBRID_LOG_NO_PACING, __LINE__, 0);
22662 return (err);
22663 }
22664 }
22665 /*
22666 * Now we must switch to hybrid mode as well which also
22667 * means moving to regular pacing.
22668 */
22669 if (rack->rc_hybrid_mode == 0) {
22670 /* First time */
22671 if (tcp_can_enable_pacing()) {
22672 rack->r_ctl.pacing_method |= RACK_REG_PACING;
22673 rack->rc_hybrid_mode = 1;
22674 } else {
22675 return (ENOSPC);
22676 }
22677 if (rack->r_ctl.pacing_method & RACK_DGP_PACING) {
22678 /*
22679 * This should be true.
22680 */
22681 tcp_dec_dgp_pacing_cnt();
22682 rack->r_ctl.pacing_method &= ~RACK_DGP_PACING;
22683 }
22684 }
22685 /* Now set in our flags */
22686 sft->hybrid_flags = hybrid->hybrid_flags | TCP_HYBRID_PACING_WASSET;
22687 if (hybrid->hybrid_flags & TCP_HYBRID_PACING_CSPR)
22688 sft->cspr = hybrid->cspr;
22689 else
22690 sft->cspr = 0;
22691 if (hybrid->hybrid_flags & TCP_HYBRID_PACING_H_MS)
22692 sft->hint_maxseg = hybrid->hint_maxseg;
22693 else
22694 sft->hint_maxseg = 0;
22695 rack->rc_tp->tcp_hybrid_start++;
22696 rack_log_hybrid(rack, seq, sft, HYBRID_LOG_RULES_SET, __LINE__,0);
22697 return (0);
22698 #else
22699 return (ENOTSUP);
22700 #endif
22701 }
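/*
 * Illustrative, non-kernel sketch of invoking the hybrid pacing path above.
 * It assumes a kernel built with TCP_REQUEST_TRK and the struct tcp_hybrid_req
 * layout from <netinet/tcp.h>; only fields referenced in this file are shown,
 * and error handling and includes are omitted.
 *
 *	struct tcp_hybrid_req hr;
 *
 *	memset(&hr, 0, sizeof(hr));
 *	// hr.req describes the request range being paced (fields not shown here)
 *	hr.hybrid_flags = TCP_HYBRID_PACING_ENABLE | TCP_HYBRID_PACING_CSPR;
 *	hr.cspr = client_supplied_rate;		// hypothetical variable
 *	setsockopt(fd, IPPROTO_TCP, TCP_HYBRID_PACING, &hr, sizeof(hr));
 *
 * Sending the option again with TCP_HYBRID_PACING_ENABLE cleared takes the
 * path above that calls rack_set_profile(rack, 0) and turns hybrid pacing
 * back off.
 */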
22702
22703 static int
22704 rack_stack_information(struct tcpcb *tp, struct stack_specific_info *si)
22705 {
22706 /* We were asked for SSI info; pull out what is there */
22707 si->bytes_transmitted = tp->t_sndbytes;
22708 si->bytes_retransmitted = tp->t_snd_rxt_bytes;
22709 return (0);
22710 }
22711
22712 static int
22713 rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name,
22714 uint32_t optval, uint64_t loptval, struct tcp_hybrid_req *hybrid)
22715
22716 {
22717 struct epoch_tracker et;
22718 struct sockopt sopt;
22719 struct cc_newreno_opts opt;
22720 uint64_t val;
22721 int error = 0;
22722 uint16_t ca, ss;
22723
22724 switch (sopt_name) {
22725 case TCP_RACK_SET_RXT_OPTIONS:
22726 if (optval <= 2) {
22727 rack_init_retransmit_value(rack, optval);
22728 } else {
22729 /*
22730 * You must send in 0, 1, or 2; all else is
22731 * invalid.
22732 */
22733 error = EINVAL;
22734 }
22735 break;
22736 case TCP_RACK_DSACK_OPT:
22737 RACK_OPTS_INC(tcp_rack_dsack_opt);
22738 if (optval & 0x1) {
22739 rack->rc_rack_tmr_std_based = 1;
22740 } else {
22741 rack->rc_rack_tmr_std_based = 0;
22742 }
22743 if (optval & 0x2) {
22744 rack->rc_rack_use_dsack = 1;
22745 } else {
22746 rack->rc_rack_use_dsack = 0;
22747 }
22748 rack_log_dsack_event(rack, 5, __LINE__, 0, 0);
22749 break;
22750 case TCP_RACK_PACING_DIVISOR:
22751 RACK_OPTS_INC(tcp_rack_pacing_divisor);
22752 if (optval == 0) {
22753 rack->r_ctl.pace_len_divisor = rack_default_pacing_divisor;
22754 } else {
22755 if (optval < RL_MIN_DIVISOR)
22756 rack->r_ctl.pace_len_divisor = RL_MIN_DIVISOR;
22757 else
22758 rack->r_ctl.pace_len_divisor = optval;
22759 }
22760 break;
22761 case TCP_RACK_HI_BETA:
22762 RACK_OPTS_INC(tcp_rack_hi_beta);
22763 if (optval > 0) {
22764 rack->rack_hibeta = 1;
22765 if ((optval >= 50) &&
22766 (optval <= 100)) {
22767 /*
22768 * User wants to set a custom beta.
22769 */
22770 rack->r_ctl.saved_hibeta = optval;
22771 if (rack->rc_pacing_cc_set)
22772 rack_undo_cc_pacing(rack);
22773 rack->r_ctl.rc_saved_beta = optval;
22774 }
22775 if (rack->rc_pacing_cc_set == 0)
22776 rack_set_cc_pacing(rack);
22777 } else {
22778 rack->rack_hibeta = 0;
22779 if (rack->rc_pacing_cc_set)
22780 rack_undo_cc_pacing(rack);
22781 }
22782 break;
22783 case TCP_RACK_PACING_BETA:
22784 error = EINVAL;
22785 break;
22786 case TCP_RACK_TIMER_SLOP:
22787 RACK_OPTS_INC(tcp_rack_timer_slop);
22788 rack->r_ctl.timer_slop = optval;
22789 if (rack->rc_tp->t_srtt) {
22790 /*
22791 * If we have an SRTT lets update t_rxtcur
22792 * to have the new slop.
22793 */
22794 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp),
22795 rack_rto_min, rack_rto_max,
22796 rack->r_ctl.timer_slop);
22797 }
22798 break;
22799 case TCP_RACK_PACING_BETA_ECN:
22800 RACK_OPTS_INC(tcp_rack_beta_ecn);
22801 if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) {
22802 /* This only works for newreno. */
22803 error = EINVAL;
22804 break;
22805 }
22806 if (rack->rc_pacing_cc_set) {
22807 /*
22808 * Set them into the real CC module;
22809 * what's in the rack pcb are the old values
22810 * to be used on restoral.
22811 */
22812 sopt.sopt_dir = SOPT_SET;
22813 opt.name = CC_NEWRENO_BETA_ECN;
22814 opt.val = optval;
22815 if (CC_ALGO(tp)->ctl_output != NULL)
22816 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt);
22817 else
22818 error = ENOENT;
22819 } else {
22820 /*
22821 * Not pacing yet so set it into our local
22822 * rack pcb storage.
22823 */
22824 rack->r_ctl.rc_saved_beta_ecn = optval;
22825 }
22826 break;
22827 case TCP_DEFER_OPTIONS:
22828 RACK_OPTS_INC(tcp_defer_opt);
22829 if (optval) {
22830 if (rack->gp_ready) {
22831 /* Too late */
22832 error = EINVAL;
22833 break;
22834 }
22835 rack->defer_options = 1;
22836 } else
22837 rack->defer_options = 0;
22838 break;
22839 case TCP_RACK_MEASURE_CNT:
22840 RACK_OPTS_INC(tcp_rack_measure_cnt);
22841 if (optval && (optval <= 0xff)) {
22842 rack->r_ctl.req_measurements = optval;
22843 } else
22844 error = EINVAL;
22845 break;
22846 case TCP_REC_ABC_VAL:
22847 RACK_OPTS_INC(tcp_rec_abc_val);
22848 if (optval > 0)
22849 rack->r_use_labc_for_rec = 1;
22850 else
22851 rack->r_use_labc_for_rec = 0;
22852 break;
22853 case TCP_RACK_ABC_VAL:
22854 RACK_OPTS_INC(tcp_rack_abc_val);
22855 if ((optval > 0) && (optval < 255))
22856 rack->rc_labc = optval;
22857 else
22858 error = EINVAL;
22859 break;
22860 case TCP_HDWR_UP_ONLY:
22861 RACK_OPTS_INC(tcp_pacing_up_only);
22862 if (optval)
22863 rack->r_up_only = 1;
22864 else
22865 rack->r_up_only = 0;
22866 break;
22867 case TCP_FILLCW_RATE_CAP: /* URL:fillcw_cap */
22868 RACK_OPTS_INC(tcp_fillcw_rate_cap);
22869 rack->r_ctl.fillcw_cap = loptval;
22870 break;
22871 case TCP_PACING_RATE_CAP:
22872 RACK_OPTS_INC(tcp_pacing_rate_cap);
22873 if ((rack->dgp_on == 1) &&
22874 (rack->r_ctl.pacing_method & RACK_DGP_PACING)) {
22875 /*
22876 * If we are doing DGP we need to switch
22877 * to using the pacing limit.
22878 */
22879 if (tcp_can_enable_pacing() == 0) {
22880 error = ENOSPC;
22881 break;
22882 }
22883 /*
22884 * Now change up the flags and counts to be correct.
22885 */
22886 rack->r_ctl.pacing_method |= RACK_REG_PACING;
22887 tcp_dec_dgp_pacing_cnt();
22888 rack->r_ctl.pacing_method &= ~RACK_DGP_PACING;
22889 }
22890 rack->r_ctl.bw_rate_cap = loptval;
22891 break;
22892 case TCP_HYBRID_PACING:
22893 if (hybrid == NULL) {
22894 error = EINVAL;
22895 break;
22896 }
22897 if (rack->r_ctl.side_chan_dis_mask & HYBRID_DIS_MASK) {
22898 error = EPERM;
22899 break;
22900 }
22901 error = process_hybrid_pacing(rack, hybrid);
22902 break;
22903 case TCP_SIDECHAN_DIS: /* URL:scodm */
22904 if (optval)
22905 rack->r_ctl.side_chan_dis_mask = optval;
22906 else
22907 rack->r_ctl.side_chan_dis_mask = 0;
22908 break;
22909 case TCP_RACK_PROFILE:
22910 RACK_OPTS_INC(tcp_profile);
22911 error = rack_set_profile(rack, optval);
22912 break;
22913 case TCP_USE_CMP_ACKS:
22914 RACK_OPTS_INC(tcp_use_cmp_acks);
22915 if ((optval == 0) && (tp->t_flags2 & TF2_MBUF_ACKCMP)) {
22916 /* You can't turn it off once it's on! */
22917 error = EINVAL;
22918 } else if ((optval == 1) && (rack->r_use_cmp_ack == 0)) {
22919 rack->r_use_cmp_ack = 1;
22920 rack->r_mbuf_queue = 1;
22921 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ;
22922 }
22923 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state))
22924 tp->t_flags2 |= TF2_MBUF_ACKCMP;
22925 break;
22926 case TCP_SHARED_CWND_TIME_LIMIT:
22927 RACK_OPTS_INC(tcp_lscwnd);
22928 if (optval)
22929 rack->r_limit_scw = 1;
22930 else
22931 rack->r_limit_scw = 0;
22932 break;
22933 case TCP_RACK_DGP_IN_REC:
22934 error = EINVAL;
22935 break;
22936 case TCP_RACK_PACE_TO_FILL:
22937 RACK_OPTS_INC(tcp_fillcw);
22938 if (optval == 0)
22939 rack->rc_pace_to_cwnd = 0;
22940 else {
22941 rack->rc_pace_to_cwnd = 1;
22942 }
22943 if ((optval >= rack_gp_rtt_maxmul) &&
22944 rack_gp_rtt_maxmul &&
22945 (optval < 0xf)) {
22946 rack->rc_pace_fill_if_rttin_range = 1;
22947 rack->rtt_limit_mul = optval;
22948 } else {
22949 rack->rc_pace_fill_if_rttin_range = 0;
22950 rack->rtt_limit_mul = 0;
22951 }
22952 break;
22953 case TCP_RACK_NO_PUSH_AT_MAX:
22954 RACK_OPTS_INC(tcp_npush);
22955 if (optval == 0)
22956 rack->r_ctl.rc_no_push_at_mrtt = 0;
22957 else if (optval < 0xff)
22958 rack->r_ctl.rc_no_push_at_mrtt = optval;
22959 else
22960 error = EINVAL;
22961 break;
22962 case TCP_SHARED_CWND_ENABLE:
22963 RACK_OPTS_INC(tcp_rack_scwnd);
22964 if (optval == 0)
22965 rack->rack_enable_scwnd = 0;
22966 else
22967 rack->rack_enable_scwnd = 1;
22968 break;
22969 case TCP_RACK_MBUF_QUEUE:
22970 /* Now do we use the LRO mbuf-queue feature */
22971 RACK_OPTS_INC(tcp_rack_mbufq);
22972 if (optval || rack->r_use_cmp_ack)
22973 rack->r_mbuf_queue = 1;
22974 else
22975 rack->r_mbuf_queue = 0;
22976 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
22977 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ;
22978 else
22979 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ;
22980 break;
22981 case TCP_RACK_NONRXT_CFG_RATE:
22982 RACK_OPTS_INC(tcp_rack_cfg_rate);
22983 if (optval == 0)
22984 rack->rack_rec_nonrxt_use_cr = 0;
22985 else
22986 rack->rack_rec_nonrxt_use_cr = 1;
22987 break;
22988 case TCP_NO_PRR:
22989 RACK_OPTS_INC(tcp_rack_noprr);
22990 if (optval == 0)
22991 rack->rack_no_prr = 0;
22992 else if (optval == 1)
22993 rack->rack_no_prr = 1;
22994 else if (optval == 2)
22995 rack->no_prr_addback = 1;
22996 else
22997 error = EINVAL;
22998 break;
22999 case RACK_CSPR_IS_FCC: /* URL:csprisfcc */
23000 if (optval > 0)
23001 rack->cspr_is_fcc = 1;
23002 else
23003 rack->cspr_is_fcc = 0;
23004 break;
23005 case TCP_TIMELY_DYN_ADJ:
23006 RACK_OPTS_INC(tcp_timely_dyn);
23007 if (optval == 0)
23008 rack->rc_gp_dyn_mul = 0;
23009 else {
23010 rack->rc_gp_dyn_mul = 1;
23011 if (optval >= 100) {
23012 /*
23013 * If the user sets something 100 or more,
23014 * it's the gp_ca value.
23015 */
23016 rack->r_ctl.rack_per_of_gp_ca = optval;
23017 }
23018 }
23019 break;
23020 case TCP_RACK_DO_DETECTION:
23021 error = EINVAL;
23022 break;
23023 case TCP_RACK_TLP_USE:
23024 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) {
23025 error = EINVAL;
23026 break;
23027 }
23028 RACK_OPTS_INC(tcp_tlp_use);
23029 rack->rack_tlp_threshold_use = optval;
23030 break;
23031 case TCP_RACK_TLP_REDUCE:
23032 /* RACK TLP cwnd reduction (bool) */
23033 RACK_OPTS_INC(tcp_rack_tlp_reduce);
23034 rack->r_ctl.rc_tlp_cwnd_reduce = optval;
23035 break;
23036 /* Pacing related ones */
23037 case TCP_RACK_PACE_ALWAYS:
23038 /*
23039 * zero is old rack method, 1 is new
23040 * method using a pacing rate.
23041 */
23042 RACK_OPTS_INC(tcp_rack_pace_always);
23043 if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) {
23044 error = EPERM;
23045 break;
23046 }
23047 if (optval > 0) {
23048 if (rack->rc_always_pace) {
23049 error = EALREADY;
23050 break;
23051 } else if (tcp_can_enable_pacing()) {
23052 rack->r_ctl.pacing_method |= RACK_REG_PACING;
23053 rack->rc_always_pace = 1;
23054 if (rack->rack_hibeta)
23055 rack_set_cc_pacing(rack);
23056 }
23057 else {
23058 error = ENOSPC;
23059 break;
23060 }
23061 } else {
23062 if (rack->rc_always_pace == 1) {
23063 rack_remove_pacing(rack);
23064 }
23065 }
23066 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack)
23067 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ;
23068 else
23069 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ;
23070 /* A rate may be set (irate or other); if so, set the seg size */
23071 rack_update_seg(rack);
23072 break;
23073 case TCP_BBR_RACK_INIT_RATE:
23074 RACK_OPTS_INC(tcp_initial_rate);
23075 val = optval;
23076 /* Change from kbits per second to bytes per second */
23077 val *= 1000;
23078 val /= 8;
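/* Worked example: optval = 5000 (kbits/sec) -> 5000 * 1000 / 8 = 625000 bytes/sec */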
23079 rack->r_ctl.init_rate = val;
23080 if (rack->rc_always_pace)
23081 rack_update_seg(rack);
23082 break;
23083 case TCP_BBR_IWINTSO:
23084 error = EINVAL;
23085 break;
23086 case TCP_RACK_FORCE_MSEG:
23087 RACK_OPTS_INC(tcp_rack_force_max_seg);
23088 if (optval)
23089 rack->rc_force_max_seg = 1;
23090 else
23091 rack->rc_force_max_seg = 0;
23092 break;
23093 case TCP_RACK_PACE_MIN_SEG:
23094 RACK_OPTS_INC(tcp_rack_min_seg);
23095 rack->r_ctl.rc_user_set_min_segs = (0x0000ffff & optval);
23096 rack_set_pace_segments(tp, rack, __LINE__, NULL);
23097 break;
23098 case TCP_RACK_PACE_MAX_SEG:
23099 /* Max segments size in a pace in bytes */
23100 RACK_OPTS_INC(tcp_rack_max_seg);
23101 if ((rack->dgp_on == 1) &&
23102 (rack->r_ctl.pacing_method & RACK_DGP_PACING)) {
23103 /*
23104 * If we set a max-seg and are doing DGP then
23105 * we now fall under the pacing limits not the
23106 * DGP ones.
23107 */
23108 if (tcp_can_enable_pacing() == 0) {
23109 error = ENOSPC;
23110 break;
23111 }
23112 /*
23113 * Now change up the flags and counts to be correct.
23114 */
23115 rack->r_ctl.pacing_method |= RACK_REG_PACING;
23116 tcp_dec_dgp_pacing_cnt();
23117 rack->r_ctl.pacing_method &= ~RACK_DGP_PACING;
23118 }
23119 if (optval <= MAX_USER_SET_SEG)
23120 rack->rc_user_set_max_segs = optval;
23121 else
23122 rack->rc_user_set_max_segs = MAX_USER_SET_SEG;
23123 rack_set_pace_segments(tp, rack, __LINE__, NULL);
23124 break;
23125 case TCP_RACK_PACE_RATE_REC:
23126 /* Set the fixed pacing rate in Bytes per second for recovery (rec) */
23127 RACK_OPTS_INC(tcp_rack_pace_rate_rec);
23128 if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) {
23129 error = EPERM;
23130 break;
23131 }
23132 if (rack->dgp_on) {
23133 /*
23134 * We are already pacing another
23135 * way.
23136 */
23137 error = EBUSY;
23138 break;
23139 }
23140 rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
23141 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0)
23142 rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
23143 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0)
23144 rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
23145 rack->use_fixed_rate = 1;
23146 if (rack->rack_hibeta)
23147 rack_set_cc_pacing(rack);
23148 rack_log_pacing_delay_calc(rack,
23149 rack->r_ctl.rc_fixed_pacing_rate_ss,
23150 rack->r_ctl.rc_fixed_pacing_rate_ca,
23151 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8,
23152 __LINE__, NULL,0);
23153 break;
23154
23155 case TCP_RACK_PACE_RATE_SS:
23156 /* Set the fixed pacing rate in Bytes per second for slow start (ss) */
23157 RACK_OPTS_INC(tcp_rack_pace_rate_ss);
23158 if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) {
23159 error = EPERM;
23160 break;
23161 }
23162 if (rack->dgp_on) {
23163 /*
23164 * We are already pacing another
23165 * way.
23166 */
23167 error = EBUSY;
23168 break;
23169 }
23170 rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
23171 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0)
23172 rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
23173 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0)
23174 rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
23175 rack->use_fixed_rate = 1;
23176 if (rack->rack_hibeta)
23177 rack_set_cc_pacing(rack);
23178 rack_log_pacing_delay_calc(rack,
23179 rack->r_ctl.rc_fixed_pacing_rate_ss,
23180 rack->r_ctl.rc_fixed_pacing_rate_ca,
23181 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8,
23182 __LINE__, NULL, 0);
23183 break;
23184
23185 case TCP_RACK_PACE_RATE_CA:
23186 /* Set the fixed pacing rate in Bytes per second for congestion avoidance (ca) */
23187 RACK_OPTS_INC(tcp_rack_pace_rate_ca);
23188 if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) {
23189 error = EPERM;
23190 break;
23191 }
23192 if (rack->dgp_on) {
23193 /*
23194 * We are already pacing another
23195 * way.
23196 */
23197 error = EBUSY;
23198 break;
23199 }
23200 rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
23201 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0)
23202 rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
23203 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0)
23204 rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
23205 rack->use_fixed_rate = 1;
23206 if (rack->rack_hibeta)
23207 rack_set_cc_pacing(rack);
23208 rack_log_pacing_delay_calc(rack,
23209 rack->r_ctl.rc_fixed_pacing_rate_ss,
23210 rack->r_ctl.rc_fixed_pacing_rate_ca,
23211 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8,
23212 __LINE__, NULL, 0);
23213 break;
23214 case TCP_RACK_GP_INCREASE_REC:
23215 RACK_OPTS_INC(tcp_gp_inc_rec);
23216 rack->r_ctl.rack_per_of_gp_rec = optval;
23217 rack_log_pacing_delay_calc(rack,
23218 rack->r_ctl.rack_per_of_gp_ss,
23219 rack->r_ctl.rack_per_of_gp_ca,
23220 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1,
23221 __LINE__, NULL, 0);
23222 break;
23223 case TCP_RACK_GP_INCREASE_CA:
23224 RACK_OPTS_INC(tcp_gp_inc_ca);
23225 ca = optval;
23226 if (ca < 100) {
23227 /*
23228 * We don't allow any reduction
23229 * over the GP b/w.
23230 */
23231 error = EINVAL;
23232 break;
23233 }
23234 rack->r_ctl.rack_per_of_gp_ca = ca;
23235 rack_log_pacing_delay_calc(rack,
23236 rack->r_ctl.rack_per_of_gp_ss,
23237 rack->r_ctl.rack_per_of_gp_ca,
23238 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1,
23239 __LINE__, NULL, 0);
23240 break;
23241 case TCP_RACK_GP_INCREASE_SS:
23242 RACK_OPTS_INC(tcp_gp_inc_ss);
23243 ss = optval;
23244 if (ss < 100) {
23245 /*
23246 * We don't allow any reduction
23247 * over the GP b/w.
23248 */
23249 error = EINVAL;
23250 break;
23251 }
23252 rack->r_ctl.rack_per_of_gp_ss = ss;
23253 rack_log_pacing_delay_calc(rack,
23254 rack->r_ctl.rack_per_of_gp_ss,
23255 rack->r_ctl.rack_per_of_gp_ca,
23256 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1,
23257 __LINE__, NULL, 0);
23258 break;
23259 case TCP_RACK_RR_CONF:
23260 RACK_OPTS_INC(tcp_rack_rrr_no_conf_rate);
23261 if (optval && optval <= 3)
23262 rack->r_rr_config = optval;
23263 else
23264 rack->r_rr_config = 0;
23265 break;
23266 case TCP_PACING_DND: /* URL:dnd */
23267 if (optval > 0)
23268 rack->rc_pace_dnd = 1;
23269 else
23270 rack->rc_pace_dnd = 0;
23271 break;
23272 case TCP_HDWR_RATE_CAP:
23273 RACK_OPTS_INC(tcp_hdwr_rate_cap);
23274 if (optval) {
23275 if (rack->r_rack_hw_rate_caps == 0)
23276 rack->r_rack_hw_rate_caps = 1;
23277 else
23278 error = EALREADY;
23279 } else {
23280 rack->r_rack_hw_rate_caps = 0;
23281 }
23282 break;
23283 case TCP_DGP_UPPER_BOUNDS:
23284 {
23285 uint8_t val;
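/*
 * optval packs both bounds: bits 0-7 carry the CA upper bound and
 * bits 16-23 carry the SS upper bound, as the two extractions below show.
 */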
23286 val = optval & 0x0000ff;
23287 rack->r_ctl.rack_per_upper_bound_ca = val;
23288 val = (optval >> 16) & 0x0000ff;
23289 rack->r_ctl.rack_per_upper_bound_ss = val;
23290 break;
23291 }
23292 case TCP_SS_EEXIT: /* URL:eexit */
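/*
 * optval is a packed control word: bits 0-7 set gp_rnd_thresh, bit 16 sets
 * gate_to_fs, bit 17 sets use_gp_not_last, and bits 18-31 (when the decoded
 * value is >= 1000) set gp_gain_req. A value of 0 disables slow-start early
 * exit entirely.
 */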
23293 if (optval > 0) {
23294 rack->r_ctl.gp_rnd_thresh = optval & 0x0ff;
23295 if (optval & 0x10000) {
23296 rack->r_ctl.gate_to_fs = 1;
23297 } else {
23298 rack->r_ctl.gate_to_fs = 0;
23299 }
23300 if (optval & 0x20000) {
23301 rack->r_ctl.use_gp_not_last = 1;
23302 } else {
23303 rack->r_ctl.use_gp_not_last = 0;
23304 }
23305 if (optval & 0xfffc0000) {
23306 uint32_t v;
23307
23308 v = (optval >> 18) & 0x00003fff;
23309 if (v >= 1000)
23310 rack->r_ctl.gp_gain_req = v;
23311 }
23312 } else {
23313 /* We do not do ss early exit at all */
23314 rack->rc_initial_ss_comp = 1;
23315 rack->r_ctl.gp_rnd_thresh = 0;
23316 }
23317 break;
23318 case TCP_RACK_SPLIT_LIMIT:
23319 RACK_OPTS_INC(tcp_split_limit);
23320 rack->r_ctl.rc_split_limit = optval;
23321 break;
23322 case TCP_BBR_HDWR_PACE:
23323 RACK_OPTS_INC(tcp_hdwr_pacing);
23324 if (optval){
23325 if (rack->rack_hdrw_pacing == 0) {
23326 rack->rack_hdw_pace_ena = 1;
23327 rack->rack_attempt_hdwr_pace = 0;
23328 } else
23329 error = EALREADY;
23330 } else {
23331 rack->rack_hdw_pace_ena = 0;
23332 #ifdef RATELIMIT
23333 if (rack->r_ctl.crte != NULL) {
23334 rack->rack_hdrw_pacing = 0;
23335 rack->rack_attempt_hdwr_pace = 0;
23336 tcp_rel_pacing_rate(rack->r_ctl.crte, tp);
23337 rack->r_ctl.crte = NULL;
23338 }
23339 #endif
23340 }
23341 break;
23342 /* End Pacing related ones */
23343 case TCP_RACK_PRR_SENDALOT:
23344 /* Allow PRR to send more than one seg */
23345 RACK_OPTS_INC(tcp_rack_prr_sendalot);
23346 rack->r_ctl.rc_prr_sendalot = optval;
23347 break;
23348 case TCP_RACK_MIN_TO:
23349 /* Minimum time between rack t-o's in ms */
23350 RACK_OPTS_INC(tcp_rack_min_to);
23351 rack->r_ctl.rc_min_to = optval;
23352 break;
23353 case TCP_RACK_EARLY_SEG:
23354 /* If early recovery max segments */
23355 RACK_OPTS_INC(tcp_rack_early_seg);
23356 rack->r_ctl.rc_early_recovery_segs = optval;
23357 break;
23358 case TCP_RACK_ENABLE_HYSTART:
23359 {
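/*
 * A non-zero optval allows HyStart; the global rack_do_hystart then decides
 * which additional CCF_HYSTART_* flags are set, as the threshold comparisons
 * below show. A zero optval clears all HyStart flags.
 */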
23360 if (optval) {
23361 tp->t_ccv.flags |= CCF_HYSTART_ALLOWED;
23362 if (rack_do_hystart > RACK_HYSTART_ON)
23363 tp->t_ccv.flags |= CCF_HYSTART_CAN_SH_CWND;
23364 if (rack_do_hystart > RACK_HYSTART_ON_W_SC)
23365 tp->t_ccv.flags |= CCF_HYSTART_CONS_SSTH;
23366 } else {
23367 tp->t_ccv.flags &= ~(CCF_HYSTART_ALLOWED|CCF_HYSTART_CAN_SH_CWND|CCF_HYSTART_CONS_SSTH);
23368 }
23369 }
23370 break;
23371 case TCP_RACK_REORD_THRESH:
23372 /* RACK reorder threshold (shift amount) */
23373 RACK_OPTS_INC(tcp_rack_reord_thresh);
23374 if ((optval > 0) && (optval < 31))
23375 rack->r_ctl.rc_reorder_shift = optval;
23376 else
23377 error = EINVAL;
23378 break;
23379 case TCP_RACK_REORD_FADE:
23380 /* Does reordering fade after ms time */
23381 RACK_OPTS_INC(tcp_rack_reord_fade);
23382 rack->r_ctl.rc_reorder_fade = optval;
23383 break;
23384 case TCP_RACK_TLP_THRESH:
23385 /* RACK TLP threshold i.e. srtt+(srtt/N) */
23386 RACK_OPTS_INC(tcp_rack_tlp_thresh);
23387 if (optval)
23388 rack->r_ctl.rc_tlp_threshold = optval;
23389 else
23390 error = EINVAL;
23391 break;
23392 case TCP_BBR_USE_RACK_RR:
23393 RACK_OPTS_INC(tcp_rack_rr);
23394 if (optval)
23395 rack->use_rack_rr = 1;
23396 else
23397 rack->use_rack_rr = 0;
23398 break;
23399 case TCP_RACK_PKT_DELAY:
23400 /* RACK added ms i.e. rack-rtt + reord + N */
23401 RACK_OPTS_INC(tcp_rack_pkt_delay);
23402 rack->r_ctl.rc_pkt_delay = optval;
23403 break;
23404 case TCP_DELACK:
23405 RACK_OPTS_INC(tcp_rack_delayed_ack);
23406 if (optval == 0)
23407 tp->t_delayed_ack = 0;
23408 else
23409 tp->t_delayed_ack = 1;
23410 if (tp->t_flags & TF_DELACK) {
23411 tp->t_flags &= ~TF_DELACK;
23412 tp->t_flags |= TF_ACKNOW;
23413 NET_EPOCH_ENTER(et);
23414 rack_output(tp);
23415 NET_EPOCH_EXIT(et);
23416 }
23417 break;
23418
23419 case TCP_BBR_RACK_RTT_USE:
23420 RACK_OPTS_INC(tcp_rack_rtt_use);
23421 if ((optval != USE_RTT_HIGH) &&
23422 (optval != USE_RTT_LOW) &&
23423 (optval != USE_RTT_AVG))
23424 error = EINVAL;
23425 else
23426 rack->r_ctl.rc_rate_sample_method = optval;
23427 break;
23428 case TCP_HONOR_HPTS_MIN:
23429 RACK_OPTS_INC(tcp_honor_hpts);
23430 if (optval) {
23431 rack->r_use_hpts_min = 1;
23432 /*
23433 * Must be between 2 and 80% to be a reduction, else
23434 * we keep the default (10%).
23435 */
23436 if ((optval > 1) && (optval <= 80)) {
23437 rack->r_ctl.max_reduction = optval;
23438 }
23439 } else
23440 rack->r_use_hpts_min = 0;
23441 break;
23442 case TCP_REC_IS_DYN: /* URL:dynrec */
23443 RACK_OPTS_INC(tcp_dyn_rec);
23444 if (optval)
23445 rack->rc_gp_no_rec_chg = 1;
23446 else
23447 rack->rc_gp_no_rec_chg = 0;
23448 break;
23449 case TCP_NO_TIMELY:
23450 RACK_OPTS_INC(tcp_notimely);
23451 if (optval) {
23452 rack->rc_skip_timely = 1;
23453 rack->r_ctl.rack_per_of_gp_rec = 90;
23454 rack->r_ctl.rack_per_of_gp_ca = 100;
23455 rack->r_ctl.rack_per_of_gp_ss = 250;
23456 } else {
23457 rack->rc_skip_timely = 0;
23458 }
23459 break;
23460 case TCP_GP_USE_LTBW:
23461 if (optval == 0) {
23462 rack->use_lesser_lt_bw = 0;
23463 rack->dis_lt_bw = 1;
23464 } else if (optval == 1) {
23465 rack->use_lesser_lt_bw = 1;
23466 rack->dis_lt_bw = 0;
23467 } else if (optval == 2) {
23468 rack->use_lesser_lt_bw = 0;
23469 rack->dis_lt_bw = 0;
23470 }
23471 break;
23472 case TCP_DATA_AFTER_CLOSE:
23473 RACK_OPTS_INC(tcp_data_after_close);
23474 if (optval)
23475 rack->rc_allow_data_af_clo = 1;
23476 else
23477 rack->rc_allow_data_af_clo = 0;
23478 break;
23479 default:
23480 break;
23481 }
23482 tcp_log_socket_option(tp, sopt_name, optval, error);
23483 return (error);
23484 }
23485
23486 static void
23487 rack_inherit(struct tcpcb *tp, struct inpcb *parent)
23488 {
23489 /*
23490 * A new connection has been created (tp) and
23491 * the parent is the inpcb given. We want to
23492 * apply a read-lock to the parent (we are already
23493 * holding a write lock on the tp) and copy anything
23494 * out of the rack specific data as long as its tfb is
23495 * the same as ours i.e. we are the same stack. Otherwise
23496 * we just return.
23497 */
23498 struct tcpcb *par;
23499 struct tcp_rack *dest, *src;
23500 int cnt = 0;
23501
23502 par = intotcpcb(parent);
23503 if (par->t_fb != tp->t_fb) {
23504 /* Not the same stack */
23505 tcp_log_socket_option(tp, 0, 0, 1);
23506 return;
23507 }
23508 /* Ok if we reach here lets setup the two rack pointers */
23509 dest = (struct tcp_rack *)tp->t_fb_ptr;
23510 src = (struct tcp_rack *)par->t_fb_ptr;
23511 if ((src == NULL) || (dest == NULL)) {
23512 /* Huh? */
23513 tcp_log_socket_option(tp, 0, 0, 2);
23514 return;
23515 }
23516 /* Now copy out anything we wish to inherit i.e. things in socket-options */
23517 /* TCP_RACK_PROFILE we can't know but we can set DGP if its on */
23518 if ((src->dgp_on) && (dest->dgp_on == 0)) {
23519 /* Profile 1 had to be set via sock opt */
23520 rack_set_dgp(dest);
23521 cnt++;
23522 }
23523 /* TCP_RACK_SET_RXT_OPTIONS */
23524 if (dest->full_size_rxt != src->full_size_rxt) {
23525 dest->full_size_rxt = src->full_size_rxt;
23526 cnt++;
23527 }
23528 if (dest->shape_rxt_to_pacing_min != src->shape_rxt_to_pacing_min) {
23529 dest->shape_rxt_to_pacing_min = src->shape_rxt_to_pacing_min;
23530 cnt++;
23531 }
23532 /* TCP_RACK_DSACK_OPT */
23533 if (dest->rc_rack_tmr_std_based != src->rc_rack_tmr_std_based) {
23534 dest->rc_rack_tmr_std_based = src->rc_rack_tmr_std_based;
23535 cnt++;
23536 }
23537 if (dest->rc_rack_use_dsack != src->rc_rack_use_dsack) {
23538 dest->rc_rack_use_dsack = src->rc_rack_use_dsack;
23539 cnt++;
23540 }
23541 /* TCP_RACK_PACING_DIVISOR */
23542 if (dest->r_ctl.pace_len_divisor != src->r_ctl.pace_len_divisor) {
23543 dest->r_ctl.pace_len_divisor = src->r_ctl.pace_len_divisor;
23544 cnt++;
23545 }
23546 /* TCP_RACK_HI_BETA */
23547 if (src->rack_hibeta != dest->rack_hibeta) {
23548 cnt++;
23549 if (src->rack_hibeta) {
23550 dest->r_ctl.rc_saved_beta = src->r_ctl.rc_saved_beta;
23551 dest->rack_hibeta = 1;
23552 } else {
23553 dest->rack_hibeta = 0;
23554 }
23555 }
23556 /* TCP_RACK_TIMER_SLOP */
23557 if (dest->r_ctl.timer_slop != src->r_ctl.timer_slop) {
23558 dest->r_ctl.timer_slop = src->r_ctl.timer_slop;
23559 cnt++;
23560 }
23561 /* TCP_RACK_PACING_BETA_ECN */
23562 if (dest->r_ctl.rc_saved_beta_ecn != src->r_ctl.rc_saved_beta_ecn) {
23563 dest->r_ctl.rc_saved_beta_ecn = src->r_ctl.rc_saved_beta_ecn;
23564 cnt++;
23565 }
23566 /* We do not do TCP_DEFER_OPTIONS */
23567 /* TCP_RACK_MEASURE_CNT */
23568 if (dest->r_ctl.req_measurements != src->r_ctl.req_measurements) {
23569 dest->r_ctl.req_measurements = src->r_ctl.req_measurements;
23570 cnt++;
23571 }
23572 /* TCP_HDWR_UP_ONLY */
23573 if (dest->r_up_only != src->r_up_only) {
23574 dest->r_up_only = src->r_up_only;
23575 cnt++;
23576 }
23577 /* TCP_FILLCW_RATE_CAP */
23578 if (dest->r_ctl.fillcw_cap != src->r_ctl.fillcw_cap) {
23579 dest->r_ctl.fillcw_cap = src->r_ctl.fillcw_cap;
23580 cnt++;
23581 }
23582 /* TCP_PACING_RATE_CAP */
23583 if (dest->r_ctl.bw_rate_cap != src->r_ctl.bw_rate_cap) {
23584 dest->r_ctl.bw_rate_cap = src->r_ctl.bw_rate_cap;
23585 cnt++;
23586 }
23587 /* A listener can't set TCP_HYBRID_PACING */
23588 /* TCP_SIDECHAN_DIS */
23589 if (dest->r_ctl.side_chan_dis_mask != src->r_ctl.side_chan_dis_mask) {
23590 dest->r_ctl.side_chan_dis_mask = src->r_ctl.side_chan_dis_mask;
23591 cnt++;
23592 }
23593 /* TCP_SHARED_CWND_TIME_LIMIT */
23594 if (dest->r_limit_scw != src->r_limit_scw) {
23595 dest->r_limit_scw = src->r_limit_scw;
23596 cnt++;
23597 }
23598 /* TCP_RACK_PACE_TO_FILL */
23599 if (dest->rc_pace_to_cwnd != src->rc_pace_to_cwnd) {
23600 dest->rc_pace_to_cwnd = src->rc_pace_to_cwnd;
23601 cnt++;
23602 }
23603 if (dest->rc_pace_fill_if_rttin_range != src->rc_pace_fill_if_rttin_range) {
23604 dest->rc_pace_fill_if_rttin_range = src->rc_pace_fill_if_rttin_range;
23605 cnt++;
23606 }
23607 if (dest->rtt_limit_mul != src->rtt_limit_mul) {
23608 dest->rtt_limit_mul = src->rtt_limit_mul;
23609 cnt++;
23610 }
23611 /* TCP_RACK_NO_PUSH_AT_MAX */
23612 if (dest->r_ctl.rc_no_push_at_mrtt != src->r_ctl.rc_no_push_at_mrtt) {
23613 dest->r_ctl.rc_no_push_at_mrtt = src->r_ctl.rc_no_push_at_mrtt;
23614 cnt++;
23615 }
23616 /* TCP_SHARED_CWND_ENABLE */
23617 if (dest->rack_enable_scwnd != src->rack_enable_scwnd) {
23618 dest->rack_enable_scwnd = src->rack_enable_scwnd;
23619 cnt++;
23620 }
23621 /* TCP_USE_CMP_ACKS */
23622 if (dest->r_use_cmp_ack != src->r_use_cmp_ack) {
23623 dest->r_use_cmp_ack = src->r_use_cmp_ack;
23624 cnt++;
23625 }
23626
23627 if (dest->r_mbuf_queue != src->r_mbuf_queue) {
23628 dest->r_mbuf_queue = src->r_mbuf_queue;
23629 cnt++;
23630 }
23631 /* TCP_RACK_MBUF_QUEUE */
23632 if (dest->r_mbuf_queue != src->r_mbuf_queue) {
23633 dest->r_mbuf_queue = src->r_mbuf_queue;
23634 cnt++;
23635 }
23636 if (dest->r_mbuf_queue || dest->rc_always_pace || dest->r_use_cmp_ack) {
23637 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ;
23638 } else {
23639 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ;
23640 }
23641 if (dest->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) {
23642 tp->t_flags2 |= TF2_MBUF_ACKCMP;
23643 }
23644 /* TCP_RACK_NONRXT_CFG_RATE */
23645 if (dest->rack_rec_nonrxt_use_cr != src->rack_rec_nonrxt_use_cr) {
23646 dest->rack_rec_nonrxt_use_cr = src->rack_rec_nonrxt_use_cr;
23647 cnt++;
23648 }
23649 /* TCP_NO_PRR */
23650 if (dest->rack_no_prr != src->rack_no_prr) {
23651 dest->rack_no_prr = src->rack_no_prr;
23652 cnt++;
23653 }
23654 if (dest->no_prr_addback != src->no_prr_addback) {
23655 dest->no_prr_addback = src->no_prr_addback;
23656 cnt++;
23657 }
23658 /* RACK_CSPR_IS_FCC */
23659 if (dest->cspr_is_fcc != src->cspr_is_fcc) {
23660 dest->cspr_is_fcc = src->cspr_is_fcc;
23661 cnt++;
23662 }
23663 /* TCP_TIMELY_DYN_ADJ */
23664 if (dest->rc_gp_dyn_mul != src->rc_gp_dyn_mul) {
23665 dest->rc_gp_dyn_mul = src->rc_gp_dyn_mul;
23666 cnt++;
23667 }
23668 if (dest->r_ctl.rack_per_of_gp_ca != src->r_ctl.rack_per_of_gp_ca) {
23669 dest->r_ctl.rack_per_of_gp_ca = src->r_ctl.rack_per_of_gp_ca;
23670 cnt++;
23671 }
23672 /* TCP_RACK_TLP_USE */
23673 if (dest->rack_tlp_threshold_use != src->rack_tlp_threshold_use) {
23674 dest->rack_tlp_threshold_use = src->rack_tlp_threshold_use;
23675 cnt++;
23676 }
23677 /* we don't allow inheritance of TCP_RACK_PACE_ALWAYS */
23678 /* TCP_BBR_RACK_INIT_RATE */
23679 if (dest->r_ctl.init_rate != src->r_ctl.init_rate) {
23680 dest->r_ctl.init_rate = src->r_ctl.init_rate;
23681 cnt++;
23682 }
23683 /* TCP_RACK_FORCE_MSEG */
23684 if (dest->rc_force_max_seg != src->rc_force_max_seg) {
23685 dest->rc_force_max_seg = src->rc_force_max_seg;
23686 cnt++;
23687 }
23688 /* TCP_RACK_PACE_MIN_SEG */
23689 if (dest->r_ctl.rc_user_set_min_segs != src->r_ctl.rc_user_set_min_segs) {
23690 dest->r_ctl.rc_user_set_min_segs = src->r_ctl.rc_user_set_min_segs;
23691 cnt++;
23692 }
23693 /* we don't allow TCP_RACK_PACE_MAX_SEG */
23694 /* TCP_RACK_PACE_RATE_REC, TCP_RACK_PACE_RATE_SS, TCP_RACK_PACE_RATE_CA */
23695 if (dest->r_ctl.rc_fixed_pacing_rate_ca != src->r_ctl.rc_fixed_pacing_rate_ca) {
23696 dest->r_ctl.rc_fixed_pacing_rate_ca = src->r_ctl.rc_fixed_pacing_rate_ca;
23697 cnt++;
23698 }
23699 if (dest->r_ctl.rc_fixed_pacing_rate_ss != src->r_ctl.rc_fixed_pacing_rate_ss) {
23700 dest->r_ctl.rc_fixed_pacing_rate_ss = src->r_ctl.rc_fixed_pacing_rate_ss;
23701 cnt++;
23702 }
23703 if (dest->r_ctl.rc_fixed_pacing_rate_rec != src->r_ctl.rc_fixed_pacing_rate_rec) {
23704 dest->r_ctl.rc_fixed_pacing_rate_rec = src->r_ctl.rc_fixed_pacing_rate_rec;
23705 cnt++;
23706 }
23707 /* TCP_RACK_GP_INCREASE_REC, TCP_RACK_GP_INCREASE_CA, TCP_RACK_GP_INCREASE_SS */
23708 if (dest->r_ctl.rack_per_of_gp_rec != src->r_ctl.rack_per_of_gp_rec) {
23709 dest->r_ctl.rack_per_of_gp_rec = src->r_ctl.rack_per_of_gp_rec;
23710 cnt++;
23711 }
23712 if (dest->r_ctl.rack_per_of_gp_ca != src->r_ctl.rack_per_of_gp_ca) {
23713 dest->r_ctl.rack_per_of_gp_ca = src->r_ctl.rack_per_of_gp_ca;
23714 cnt++;
23715 }
23716
23717 if (dest->r_ctl.rack_per_of_gp_ss != src->r_ctl.rack_per_of_gp_ss) {
23718 dest->r_ctl.rack_per_of_gp_ss = src->r_ctl.rack_per_of_gp_ss;
23719 cnt++;
23720 }
23721 /* TCP_RACK_RR_CONF */
23722 if (dest->r_rr_config != src->r_rr_config) {
23723 dest->r_rr_config = src->r_rr_config;
23724 cnt++;
23725 }
23726 /* TCP_PACING_DND */
23727 if (dest->rc_pace_dnd != src->rc_pace_dnd) {
23728 dest->rc_pace_dnd = src->rc_pace_dnd;
23729 cnt++;
23730 }
23731 /* TCP_HDWR_RATE_CAP */
23732 if (dest->r_rack_hw_rate_caps != src->r_rack_hw_rate_caps) {
23733 dest->r_rack_hw_rate_caps = src->r_rack_hw_rate_caps;
23734 cnt++;
23735 }
23736 /* TCP_DGP_UPPER_BOUNDS */
23737 if (dest->r_ctl.rack_per_upper_bound_ca != src->r_ctl.rack_per_upper_bound_ca) {
23738 dest->r_ctl.rack_per_upper_bound_ca = src->r_ctl.rack_per_upper_bound_ca;
23739 cnt++;
23740 }
23741 if (dest->r_ctl.rack_per_upper_bound_ss != src->r_ctl.rack_per_upper_bound_ss) {
23742 dest->r_ctl.rack_per_upper_bound_ss = src->r_ctl.rack_per_upper_bound_ss;
23743 cnt++;
23744 }
23745 /* TCP_SS_EEXIT */
23746 if (dest->r_ctl.gp_rnd_thresh != src->r_ctl.gp_rnd_thresh) {
23747 dest->r_ctl.gp_rnd_thresh = src->r_ctl.gp_rnd_thresh;
23748 cnt++;
23749 }
23750 if (dest->r_ctl.gate_to_fs != src->r_ctl.gate_to_fs) {
23751 dest->r_ctl.gate_to_fs = src->r_ctl.gate_to_fs;
23752 cnt++;
23753 }
23754 if (dest->r_ctl.use_gp_not_last != src->r_ctl.use_gp_not_last) {
23755 dest->r_ctl.use_gp_not_last = src->r_ctl.use_gp_not_last;
23756 cnt++;
23757 }
23758 if (dest->r_ctl.gp_gain_req != src->r_ctl.gp_gain_req) {
23759 dest->r_ctl.gp_gain_req = src->r_ctl.gp_gain_req;
23760 cnt++;
23761 }
23762 /* TCP_BBR_HDWR_PACE */
23763 if (dest->rack_hdw_pace_ena != src->rack_hdw_pace_ena) {
23764 dest->rack_hdw_pace_ena = src->rack_hdw_pace_ena;
23765 cnt++;
23766 }
23767 if (dest->rack_attempt_hdwr_pace != src->rack_attempt_hdwr_pace) {
23768 dest->rack_attempt_hdwr_pace = src->rack_attempt_hdwr_pace;
23769 cnt++;
23770 }
23771 /* TCP_RACK_PRR_SENDALOT */
23772 if (dest->r_ctl.rc_prr_sendalot != src->r_ctl.rc_prr_sendalot) {
23773 dest->r_ctl.rc_prr_sendalot = src->r_ctl.rc_prr_sendalot;
23774 cnt++;
23775 }
23776 /* TCP_RACK_MIN_TO */
23777 if (dest->r_ctl.rc_min_to != src->r_ctl.rc_min_to) {
23778 dest->r_ctl.rc_min_to = src->r_ctl.rc_min_to;
23779 cnt++;
23780 }
23781 /* TCP_RACK_EARLY_SEG */
23782 if (dest->r_ctl.rc_early_recovery_segs != src->r_ctl.rc_early_recovery_segs) {
23783 dest->r_ctl.rc_early_recovery_segs = src->r_ctl.rc_early_recovery_segs;
23784 cnt++;
23785 }
23786 /* TCP_RACK_ENABLE_HYSTART */
23787 if (par->t_ccv.flags != tp->t_ccv.flags) {
23788 cnt++;
23789 if (par->t_ccv.flags & CCF_HYSTART_ALLOWED) {
23790 tp->t_ccv.flags |= CCF_HYSTART_ALLOWED;
23791 if (rack_do_hystart > RACK_HYSTART_ON)
23792 tp->t_ccv.flags |= CCF_HYSTART_CAN_SH_CWND;
23793 if (rack_do_hystart > RACK_HYSTART_ON_W_SC)
23794 tp->t_ccv.flags |= CCF_HYSTART_CONS_SSTH;
23795 } else {
23796 tp->t_ccv.flags &= ~(CCF_HYSTART_ALLOWED|CCF_HYSTART_CAN_SH_CWND|CCF_HYSTART_CONS_SSTH);
23797 }
23798 }
23799 /* TCP_RACK_REORD_THRESH */
23800 if (dest->r_ctl.rc_reorder_shift != src->r_ctl.rc_reorder_shift) {
23801 dest->r_ctl.rc_reorder_shift = src->r_ctl.rc_reorder_shift;
23802 cnt++;
23803 }
23804 /* TCP_RACK_REORD_FADE */
23805 if (dest->r_ctl.rc_reorder_fade != src->r_ctl.rc_reorder_fade) {
23806 dest->r_ctl.rc_reorder_fade = src->r_ctl.rc_reorder_fade;
23807 cnt++;
23808 }
23809 /* TCP_RACK_TLP_THRESH */
23810 if (dest->r_ctl.rc_tlp_threshold != src->r_ctl.rc_tlp_threshold) {
23811 dest->r_ctl.rc_tlp_threshold = src->r_ctl.rc_tlp_threshold;
23812 cnt++;
23813 }
23814 /* TCP_BBR_USE_RACK_RR */
23815 if (dest->use_rack_rr != src->use_rack_rr) {
23816 dest->use_rack_rr = src->use_rack_rr;
23817 cnt++;
23818 }
23819 /* TCP_RACK_PKT_DELAY */
23820 if (dest->r_ctl.rc_pkt_delay != src->r_ctl.rc_pkt_delay) {
23821 dest->r_ctl.rc_pkt_delay = src->r_ctl.rc_pkt_delay;
23822 cnt++;
23823 }
23824 /* TCP_DELACK will get copied via the main code if applicable */
23825 /* TCP_BBR_RACK_RTT_USE */
23826 if (dest->r_ctl.rc_rate_sample_method != src->r_ctl.rc_rate_sample_method) {
23827 dest->r_ctl.rc_rate_sample_method = src->r_ctl.rc_rate_sample_method;
23828 cnt++;
23829 }
23830 /* TCP_HONOR_HPTS_MIN */
23831 if (dest->r_use_hpts_min != src->r_use_hpts_min) {
23832 dest->r_use_hpts_min = src->r_use_hpts_min;
23833 cnt++;
23834 }
23835 if (dest->r_ctl.max_reduction != src->r_ctl.max_reduction) {
23836 dest->r_ctl.max_reduction = src->r_ctl.max_reduction;
23837 cnt++;
23838 }
23839 /* TCP_REC_IS_DYN */
23840 if (dest->rc_gp_no_rec_chg != src->rc_gp_no_rec_chg) {
23841 dest->rc_gp_no_rec_chg = src->rc_gp_no_rec_chg;
23842 cnt++;
23843 }
23844 if (dest->rc_skip_timely != src->rc_skip_timely) {
23845 dest->rc_skip_timely = src->rc_skip_timely;
23846 cnt++;
23847 }
23848 /* TCP_DATA_AFTER_CLOSE */
23849 if (dest->rc_allow_data_af_clo != src->rc_allow_data_af_clo) {
23850 dest->rc_allow_data_af_clo = src->rc_allow_data_af_clo;
23851 cnt++;
23852 }
23853 /* TCP_GP_USE_LTBW */
23854 if (src->use_lesser_lt_bw != dest->use_lesser_lt_bw) {
23855 dest->use_lesser_lt_bw = src->use_lesser_lt_bw;
23856 cnt++;
23857 }
23858 if (dest->dis_lt_bw != src->dis_lt_bw) {
23859 dest->dis_lt_bw = src->dis_lt_bw;
23860 cnt++;
23861 }
23862 tcp_log_socket_option(tp, 0, cnt, 0);
23863 }
23864
23865
23866 static void
23867 rack_apply_deferred_options(struct tcp_rack *rack)
23868 {
23869 struct deferred_opt_list *dol, *sdol;
23870 uint32_t s_optval;
23871
23872 TAILQ_FOREACH_SAFE(dol, &rack->r_ctl.opt_list, next, sdol) {
23873 TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next);
23874 /* Disadvantage of deferral is you lose the error return */
23875 s_optval = (uint32_t)dol->optval;
23876 (void)rack_process_option(rack->rc_tp, rack, dol->optname, s_optval, dol->optval, NULL);
23877 free(dol, M_TCPDO);
23878 }
23879 }
23880
23881 static void
23882 rack_hw_tls_change(struct tcpcb *tp, int chg)
23883 {
23884 /* Update HW tls state */
23885 struct tcp_rack *rack;
23886
23887 rack = (struct tcp_rack *)tp->t_fb_ptr;
23888 if (chg)
23889 rack->r_ctl.fsb.hw_tls = 1;
23890 else
23891 rack->r_ctl.fsb.hw_tls = 0;
23892 }
23893
23894 static int
23895 rack_pru_options(struct tcpcb *tp, int flags)
23896 {
23897 if (flags & PRUS_OOB)
23898 return (EOPNOTSUPP);
23899 return (0);
23900 }
23901
23902 static bool
23903 rack_wake_check(struct tcpcb *tp)
23904 {
23905 struct tcp_rack *rack;
23906 struct timeval tv;
23907 uint32_t cts;
23908
23909 rack = (struct tcp_rack *)tp->t_fb_ptr;
23910 if (rack->r_ctl.rc_hpts_flags) {
23911 cts = tcp_get_usecs(&tv);
23912 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == PACE_PKT_OUTPUT){
23913 /*
23914 * Pacing timer is up, check if we are ready.
23915 */
23916 if (TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to))
23917 return (true);
23918 } else if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) != 0) {
23919 /*
23920 * A timer is up, check if we are ready.
23921 */
23922 if (TSTMP_GEQ(cts, rack->r_ctl.rc_timer_exp))
23923 return (true);
23924 }
23925 }
23926 return (false);
23927 }
23928
23929 static struct tcp_function_block __tcp_rack = {
23930 .tfb_tcp_block_name = __XSTRING(STACKNAME),
23931 .tfb_tcp_output = rack_output,
23932 .tfb_do_queued_segments = ctf_do_queued_segments,
23933 .tfb_do_segment_nounlock = rack_do_segment_nounlock,
23934 .tfb_tcp_do_segment = rack_do_segment,
23935 .tfb_tcp_ctloutput = rack_ctloutput,
23936 .tfb_tcp_fb_init = rack_init,
23937 .tfb_tcp_fb_fini = rack_fini,
23938 .tfb_tcp_timer_stop_all = rack_stopall,
23939 .tfb_tcp_rexmit_tmr = rack_remxt_tmr,
23940 .tfb_tcp_handoff_ok = rack_handoff_ok,
23941 .tfb_tcp_mtu_chg = rack_mtu_change,
23942 .tfb_pru_options = rack_pru_options,
23943 .tfb_hwtls_change = rack_hw_tls_change,
23944 .tfb_chg_query = rack_chg_query,
23945 .tfb_switch_failed = rack_switch_failed,
23946 .tfb_early_wake_check = rack_wake_check,
23947 .tfb_compute_pipe = rack_compute_pipe,
23948 .tfb_stack_info = rack_stack_information,
23949 .tfb_inherit = rack_inherit,
23950 .tfb_flags = TCP_FUNC_OUTPUT_CANDROP | TCP_FUNC_DEFAULT_OK,
23951
23952 };
23953
23954 /*
23955 * rack_ctloutput() must drop the inpcb lock before performing copyin on
23956 * socket option arguments. When it re-acquires the lock after the copy, it
23957 * has to revalidate that the connection is still valid for the socket
23958 * option.
23959 */
23960 static int
23961 rack_set_sockopt(struct tcpcb *tp, struct sockopt *sopt)
23962 {
23963 struct inpcb *inp = tptoinpcb(tp);
23964 #ifdef INET
23965 struct ip *ip;
23966 #endif
23967 struct tcp_rack *rack;
23968 struct tcp_hybrid_req hybrid;
23969 uint64_t loptval;
23970 int32_t error = 0, optval;
23971
23972 rack = (struct tcp_rack *)tp->t_fb_ptr;
23973 if (rack == NULL) {
23974 INP_WUNLOCK(inp);
23975 return (EINVAL);
23976 }
23977 #ifdef INET
23978 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
23979 #endif
23980
23981 switch (sopt->sopt_level) {
23982 #ifdef INET6
23983 case IPPROTO_IPV6:
23984 MPASS(inp->inp_vflag & INP_IPV6PROTO);
23985 switch (sopt->sopt_name) {
23986 case IPV6_USE_MIN_MTU:
23987 tcp6_use_min_mtu(tp);
23988 break;
23989 }
23990 INP_WUNLOCK(inp);
23991 return (0);
23992 #endif
23993 #ifdef INET
23994 case IPPROTO_IP:
23995 switch (sopt->sopt_name) {
23996 case IP_TOS:
23997 /*
23998 * The DSCP codepoint has changed, update the fsb.
23999 */
24000 ip->ip_tos = rack->rc_inp->inp_ip_tos;
24001 break;
24002 case IP_TTL:
24003 /*
24004 * The TTL has changed, update the fsb.
24005 */
24006 ip->ip_ttl = rack->rc_inp->inp_ip_ttl;
24007 break;
24008 }
24009 INP_WUNLOCK(inp);
24010 return (0);
24011 #endif
24012 #ifdef SO_PEERPRIO
24013 case SOL_SOCKET:
24014 switch (sopt->sopt_name) {
24015 case SO_PEERPRIO: /* SC-URL:bs */
24016 /* Already read in and sanity checked in sosetopt(). */
24017 if (inp->inp_socket) {
24018 rack->client_bufferlvl = inp->inp_socket->so_peerprio;
24019 }
24020 break;
24021 }
24022 INP_WUNLOCK(inp);
24023 return (0);
24024 #endif
24025 case IPPROTO_TCP:
24026 switch (sopt->sopt_name) {
24027 case TCP_RACK_TLP_REDUCE: /* URL:tlp_reduce */
24028 /* Pacing related ones */
24029 case TCP_RACK_PACE_ALWAYS: /* URL:pace_always */
24030 case TCP_BBR_RACK_INIT_RATE: /* URL:irate */
24031 case TCP_RACK_PACE_MIN_SEG: /* URL:pace_min_seg */
24032 case TCP_RACK_PACE_MAX_SEG: /* URL:pace_max_seg */
24033 case TCP_RACK_FORCE_MSEG: /* URL:force_max_seg */
24034 case TCP_RACK_PACE_RATE_CA: /* URL:pr_ca */
24035 case TCP_RACK_PACE_RATE_SS: /* URL:pr_ss*/
24036 case TCP_RACK_PACE_RATE_REC: /* URL:pr_rec */
24037 case TCP_RACK_GP_INCREASE_CA: /* URL:gp_inc_ca */
24038 case TCP_RACK_GP_INCREASE_SS: /* URL:gp_inc_ss */
24039 case TCP_RACK_GP_INCREASE_REC: /* URL:gp_inc_rec */
24040 case TCP_RACK_RR_CONF: /* URL:rrr_conf */
24041 case TCP_BBR_HDWR_PACE: /* URL:hdwrpace */
24042 case TCP_HDWR_RATE_CAP: /* URL:hdwrcap boolean */
24043 case TCP_PACING_RATE_CAP: /* URL:cap -- used by side-channel */
24044 case TCP_HDWR_UP_ONLY: /* URL:uponly -- hardware pacing boolean */
24045 case TCP_FILLCW_RATE_CAP: /* URL:fillcw_cap */
24046 case TCP_RACK_PACING_BETA_ECN: /* URL:pacing_beta_ecn */
24047 case TCP_RACK_PACE_TO_FILL: /* URL:fillcw */
24048 /* End pacing related */
24049 case TCP_DELACK: /* URL:delack (in base TCP i.e. tcp_hints along with cc etc ) */
24050 case TCP_RACK_PRR_SENDALOT: /* URL:prr_sendalot */
24051 case TCP_RACK_MIN_TO: /* URL:min_to */
24052 case TCP_RACK_EARLY_SEG: /* URL:early_seg */
24053 case TCP_RACK_REORD_THRESH: /* URL:reord_thresh */
24054 case TCP_RACK_REORD_FADE: /* URL:reord_fade */
24055 case TCP_RACK_TLP_THRESH: /* URL:tlp_thresh */
24056 case TCP_RACK_PKT_DELAY: /* URL:pkt_delay */
24057 case TCP_RACK_TLP_USE: /* URL:tlp_use */
24058 case TCP_BBR_RACK_RTT_USE: /* URL:rttuse */
24059 case TCP_BBR_USE_RACK_RR: /* URL:rackrr */
24060 case TCP_NO_PRR: /* URL:noprr */
24061 case TCP_TIMELY_DYN_ADJ: /* URL:dynamic */
24062 case TCP_DATA_AFTER_CLOSE: /* no URL */
24063 case TCP_RACK_NONRXT_CFG_RATE: /* URL:nonrxtcr */
24064 case TCP_SHARED_CWND_ENABLE: /* URL:scwnd */
24065 case TCP_RACK_MBUF_QUEUE: /* URL:mqueue */
24066 case TCP_RACK_NO_PUSH_AT_MAX: /* URL:npush */
24067 case TCP_SHARED_CWND_TIME_LIMIT: /* URL:lscwnd */
24068 case TCP_RACK_PROFILE: /* URL:profile */
24069 case TCP_SIDECHAN_DIS: /* URL:scodm */
24070 case TCP_HYBRID_PACING: /* URL:pacing=hybrid */
24071 case TCP_USE_CMP_ACKS: /* URL:cmpack */
24072 case TCP_RACK_ABC_VAL: /* URL:labc */
24073 case TCP_REC_ABC_VAL: /* URL:reclabc */
24074 case TCP_RACK_MEASURE_CNT: /* URL:measurecnt */
24075 case TCP_DEFER_OPTIONS: /* URL:defer */
24076 case TCP_RACK_DSACK_OPT: /* URL:dsack */
24077 case TCP_RACK_TIMER_SLOP: /* URL:timer_slop */
24078 case TCP_RACK_ENABLE_HYSTART: /* URL:hystart */
24079 case TCP_RACK_SET_RXT_OPTIONS: /* URL:rxtsz */
24080 case TCP_RACK_HI_BETA: /* URL:hibeta */
24081 case TCP_RACK_SPLIT_LIMIT: /* URL:split */
24082 case TCP_SS_EEXIT: /* URL:eexit */
24083 case TCP_DGP_UPPER_BOUNDS: /* URL:upper */
24084 case TCP_RACK_PACING_DIVISOR: /* URL:divisor */
24085 case TCP_PACING_DND: /* URL:dnd */
24086 case TCP_NO_TIMELY: /* URL:notimely */
24087 case RACK_CSPR_IS_FCC: /* URL:csprisfcc */
24088 case TCP_HONOR_HPTS_MIN: /* URL:hptsmin */
24089 case TCP_REC_IS_DYN: /* URL:dynrec */
24090 case TCP_GP_USE_LTBW: /* URL:useltbw */
24091 goto process_opt;
24092 break;
24093 default:
24094 /* Filter off all unknown options to the base stack */
24095 return (tcp_default_ctloutput(tp, sopt));
24096 break;
24097 }
24098 default:
24099 INP_WUNLOCK(inp);
24100 return (0);
24101 }
24102 process_opt:
24103 INP_WUNLOCK(inp);
24104 if ((sopt->sopt_name == TCP_PACING_RATE_CAP) ||
24105 (sopt->sopt_name == TCP_FILLCW_RATE_CAP)) {
24106 error = sooptcopyin(sopt, &loptval, sizeof(loptval), sizeof(loptval));
24107 /*
24108 * We truncate it down to 32 bits for the socket-option trace; this
24109 * means rates > 34Gbps won't show right, but that's probably ok.
24110 */
24111 optval = (uint32_t)loptval;
24112 } else if (sopt->sopt_name == TCP_HYBRID_PACING) {
24113 error = sooptcopyin(sopt, &hybrid, sizeof(hybrid), sizeof(hybrid));
24114 } else {
24115 error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
24116 /* Save it in 64 bit form too */
24117 loptval = optval;
24118 }
	if (error)
		return (error);
	INP_WLOCK(inp);
	if (tp->t_fb != &__tcp_rack) {
		INP_WUNLOCK(inp);
		return (ENOPROTOOPT);
	}
	if (rack->defer_options && (rack->gp_ready == 0) &&
	    (sopt->sopt_name != TCP_DEFER_OPTIONS) &&
	    (sopt->sopt_name != TCP_HYBRID_PACING) &&
	    (sopt->sopt_name != TCP_RACK_SET_RXT_OPTIONS) &&
	    (sopt->sopt_name != TCP_RACK_PACING_BETA_ECN) &&
	    (sopt->sopt_name != TCP_RACK_MEASURE_CNT)) {
		/* Options are being deferred */
		if (rack_add_deferred_option(rack, sopt->sopt_name, loptval)) {
			INP_WUNLOCK(inp);
			return (0);
		} else {
			/* No memory to defer, fail */
			INP_WUNLOCK(inp);
			return (ENOMEM);
		}
	}
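	/*
	 * Options queued by rack_add_deferred_option() above are replayed
	 * later (see rack_apply_deferred_options()) once the connection has
	 * its initial goodput measurements and gp_ready is set.
	 */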
	error = rack_process_option(tp, rack, sopt->sopt_name, optval, loptval, &hybrid);
	INP_WUNLOCK(inp);
	return (error);
}

static void
rack_fill_info(struct tcpcb *tp, struct tcp_info *ti)
{

	INP_WLOCK_ASSERT(tptoinpcb(tp));
	bzero(ti, sizeof(*ti));

	ti->tcpi_state = tp->t_state;
	if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
		ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
	if (tp->t_flags & TF_SACK_PERMIT)
		ti->tcpi_options |= TCPI_OPT_SACK;
	if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
		ti->tcpi_options |= TCPI_OPT_WSCALE;
		ti->tcpi_snd_wscale = tp->snd_scale;
		ti->tcpi_rcv_wscale = tp->rcv_scale;
	}
	if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))
		ti->tcpi_options |= TCPI_OPT_ECN;
	if (tp->t_flags & TF_FASTOPEN)
		ti->tcpi_options |= TCPI_OPT_TFO;
	/* t_rcvtime is still kept in ticks; tick is usecs per tick */
	ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick;
	/* Since we hold everything in precise useconds this is easy */
	ti->tcpi_rtt = tp->t_srtt;
	ti->tcpi_rttvar = tp->t_rttvar;
	ti->tcpi_rto = tp->t_rxtcur;
	ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
	ti->tcpi_snd_cwnd = tp->snd_cwnd;
	/*
	 * FreeBSD-specific extension fields for tcp_info.
	 */
	ti->tcpi_rcv_space = tp->rcv_wnd;
	ti->tcpi_rcv_nxt = tp->rcv_nxt;
	ti->tcpi_snd_wnd = tp->snd_wnd;
	ti->tcpi_snd_bwnd = 0; /* Unused, kept for compat. */
	ti->tcpi_snd_nxt = tp->snd_nxt;
	ti->tcpi_snd_mss = tp->t_maxseg;
	ti->tcpi_rcv_mss = tp->t_maxseg;
	ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
	ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
	ti->tcpi_snd_zerowin = tp->t_sndzerowin;
	ti->tcpi_total_tlp = tp->t_sndtlppack;
	ti->tcpi_total_tlp_bytes = tp->t_sndtlpbyte;
	ti->tcpi_rttmin = tp->t_rttlow;
#ifdef NETFLIX_STATS
	memcpy(&ti->tcpi_rxsyninfo, &tp->t_rxsyninfo, sizeof(struct tcpsyninfo));
#endif
#ifdef TCP_OFFLOAD
	if (tp->t_flags & TF_TOE) {
		ti->tcpi_options |= TCPI_OPT_TOE;
		tcp_offload_tcp_info(tp, ti);
	}
#endif
}

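/*
 * Illustrative userland consumer of the info filled in above (not part
 * of this file):
 *
 *	struct tcp_info ti;
 *	socklen_t len = sizeof(ti);
 *
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
 *		printf("srtt=%uus cwnd=%u\n", ti.tcpi_rtt, ti.tcpi_snd_cwnd);
 */
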
static int
rack_get_sockopt(struct tcpcb *tp, struct sockopt *sopt)
{
	struct inpcb *inp = tptoinpcb(tp);
	struct tcp_rack *rack;
	int32_t error, optval;
	uint64_t val, loptval;
	struct tcp_info ti;
	/*
	 * Because all our options are either boolean or an int, we can just
	 * pull everything into optval and then unlock and copy. If we ever
	 * add an option that is not an int, then this will have quite an
	 * impact on this routine.
	 */
	error = 0;
	rack = (struct tcp_rack *)tp->t_fb_ptr;
	if (rack == NULL) {
		INP_WUNLOCK(inp);
		return (EINVAL);
	}
	switch (sopt->sopt_name) {
	case TCP_INFO:
		/* First get the info filled */
		rack_fill_info(tp, &ti);
		/* Fix up the rtt related fields if needed */
		INP_WUNLOCK(inp);
		error = sooptcopyout(sopt, &ti, sizeof ti);
		return (error);
	/*
	 * Beta is the congestion control value for NewReno that influences how
	 * much of a backoff happens when loss is detected. It is normally set
	 * to 50 for 50%, i.e. the cwnd is reduced to 50% of its previous value
	 * when you exit recovery.
	 */
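	/*
	 * Worked example: with beta = 50 and a pre-recovery cwnd of 100
	 * segments, exiting recovery leaves cwnd = 100 * 50 / 100 = 50
	 * segments.
	 */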
	case TCP_RACK_PACING_BETA:
		if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0)
			error = EINVAL;
		else if (rack->rc_pacing_cc_set == 0)
			optval = rack->r_ctl.rc_saved_beta;
		else {
			/*
			 * Reach out into the CC data and report back what
			 * I have previously set. Yeah it looks hackish but
			 * we don't want to report the saved values.
			 */
			if (tp->t_ccv.cc_data)
				optval = ((struct newreno *)tp->t_ccv.cc_data)->beta;
			else
				error = EINVAL;
		}
		break;
	/*
	 * Beta_ecn is the congestion control value for NewReno that influences how
	 * much of a backoff happens when an ECN mark is detected. It is normally set
	 * to 80 for 80%, i.e. the cwnd is reduced by 20% of its previous value when
	 * you exit recovery. Note that classic ECN has a beta of 50; it is only
	 * ABE ECN that uses this "less" value, but we do too with pacing :)
	 */
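	/*
	 * Worked example: with beta_ecn = 80 and a cwnd of 100 segments, an
	 * ECN mark leaves cwnd = 100 * 80 / 100 = 80 segments (a 20%
	 * reduction).
	 */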
	case TCP_RACK_PACING_BETA_ECN:
		if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0)
			error = EINVAL;
		else if (rack->rc_pacing_cc_set == 0)
			optval = rack->r_ctl.rc_saved_beta_ecn;
		else {
			/*
			 * Reach out into the CC data and report back what
			 * I have previously set. Yeah it looks hackish but
			 * we don't want to report the saved values.
			 */
			if (tp->t_ccv.cc_data)
				optval = ((struct newreno *)tp->t_ccv.cc_data)->beta_ecn;
			else
				error = EINVAL;
		}
		break;
	case TCP_RACK_DSACK_OPT:
		optval = 0;
		if (rack->rc_rack_tmr_std_based) {
			optval |= 1;
		}
		if (rack->rc_rack_use_dsack) {
			optval |= 2;
		}
		break;
	case TCP_RACK_ENABLE_HYSTART:
	{
		if (tp->t_ccv.flags & CCF_HYSTART_ALLOWED) {
			optval = RACK_HYSTART_ON;
			if (tp->t_ccv.flags & CCF_HYSTART_CAN_SH_CWND)
				optval = RACK_HYSTART_ON_W_SC;
			if (tp->t_ccv.flags & CCF_HYSTART_CONS_SSTH)
				optval = RACK_HYSTART_ON_W_SC_C;
		} else {
			optval = RACK_HYSTART_OFF;
		}
	}
		break;
	case TCP_RACK_DGP_IN_REC:
		error = EINVAL;
		break;
	case TCP_RACK_HI_BETA:
		optval = rack->rack_hibeta;
		break;
	case TCP_DEFER_OPTIONS:
		optval = rack->defer_options;
		break;
	case TCP_RACK_MEASURE_CNT:
		optval = rack->r_ctl.req_measurements;
		break;
	case TCP_REC_ABC_VAL:
		optval = rack->r_use_labc_for_rec;
		break;
	case TCP_RACK_ABC_VAL:
		optval = rack->rc_labc;
		break;
	case TCP_HDWR_UP_ONLY:
		optval = rack->r_up_only;
		break;
	case TCP_FILLCW_RATE_CAP:
		loptval = rack->r_ctl.fillcw_cap;
		break;
	case TCP_PACING_RATE_CAP:
		loptval = rack->r_ctl.bw_rate_cap;
		break;
	case TCP_RACK_PROFILE:
		/* You cannot retrieve a profile, it's write only */
		error = EINVAL;
		break;
	case TCP_SIDECHAN_DIS:
		optval = rack->r_ctl.side_chan_dis_mask;
		break;
	case TCP_HYBRID_PACING:
		/* You cannot retrieve hybrid pacing information, it's write only */
		error = EINVAL;
		break;
	case TCP_USE_CMP_ACKS:
		optval = rack->r_use_cmp_ack;
		break;
	case TCP_RACK_PACE_TO_FILL:
		optval = rack->rc_pace_to_cwnd;
		break;
	case TCP_RACK_NO_PUSH_AT_MAX:
		optval = rack->r_ctl.rc_no_push_at_mrtt;
		break;
	case TCP_SHARED_CWND_ENABLE:
		optval = rack->rack_enable_scwnd;
		break;
	case TCP_RACK_NONRXT_CFG_RATE:
		optval = rack->rack_rec_nonrxt_use_cr;
		break;
	case TCP_NO_PRR:
		if (rack->rack_no_prr == 1)
			optval = 1;
		else if (rack->no_prr_addback == 1)
			optval = 2;
		else
			optval = 0;
		break;
	case TCP_GP_USE_LTBW:
		if (rack->dis_lt_bw) {
			/* It is not used */
			optval = 0;
		} else if (rack->use_lesser_lt_bw) {
			/* we use min() */
			optval = 1;
		} else {
			/* we use max() */
			optval = 2;
		}
		break;
	case TCP_RACK_DO_DETECTION:
		error = EINVAL;
		break;
	case TCP_RACK_MBUF_QUEUE:
		/* Do we use the LRO mbuf-queue feature */
		optval = rack->r_mbuf_queue;
		break;
	case RACK_CSPR_IS_FCC:
		optval = rack->cspr_is_fcc;
		break;
	case TCP_TIMELY_DYN_ADJ:
		optval = rack->rc_gp_dyn_mul;
		break;
	case TCP_BBR_IWINTSO:
		error = EINVAL;
		break;
	case TCP_RACK_TLP_REDUCE:
		/* RACK TLP cwnd reduction (bool) */
		optval = rack->r_ctl.rc_tlp_cwnd_reduce;
		break;
	case TCP_BBR_RACK_INIT_RATE:
		val = rack->r_ctl.init_rate;
		/* init_rate is in bytes per second; convert to kbits per sec */
		val *= 8;
		val /= 1000;
		optval = (uint32_t)val;
		break;
	case TCP_RACK_FORCE_MSEG:
		optval = rack->rc_force_max_seg;
		break;
	case TCP_RACK_PACE_MIN_SEG:
		optval = rack->r_ctl.rc_user_set_min_segs;
		break;
	case TCP_RACK_PACE_MAX_SEG:
		/* Max segments in a pace */
		optval = rack->rc_user_set_max_segs;
		break;
	case TCP_RACK_PACE_ALWAYS:
		/* Use the always pace method */
		optval = rack->rc_always_pace;
		break;
	case TCP_RACK_PRR_SENDALOT:
		/* Allow PRR to send more than one seg */
		optval = rack->r_ctl.rc_prr_sendalot;
		break;
	case TCP_RACK_MIN_TO:
		/* Minimum time between rack t-o's in ms */
		optval = rack->r_ctl.rc_min_to;
		break;
	case TCP_RACK_SPLIT_LIMIT:
		optval = rack->r_ctl.rc_split_limit;
		break;
	case TCP_RACK_EARLY_SEG:
		/* Max segments sent in early recovery */
		optval = rack->r_ctl.rc_early_recovery_segs;
		break;
	case TCP_RACK_REORD_THRESH:
		/* RACK reorder threshold (shift amount) */
		optval = rack->r_ctl.rc_reorder_shift;
		break;
	case TCP_SS_EEXIT:
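		/*
		 * Encoded result: bits 0-7 carry gp_rnd_thresh, bit 16
		 * carries gate_to_fs, and gp_gain_req sits in bits 17 and
		 * up, matching the assembly below.
		 */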
		if (rack->r_ctl.gp_rnd_thresh) {
			uint32_t v;

			v = rack->r_ctl.gp_gain_req;
			v <<= 17;
			optval = v | (rack->r_ctl.gp_rnd_thresh & 0xff);
			if (rack->r_ctl.gate_to_fs == 1)
				optval |= 0x10000;
		} else
			optval = 0;
		break;
	case TCP_RACK_REORD_FADE:
		/* Does reordering fade after ms time */
		optval = rack->r_ctl.rc_reorder_fade;
		break;
	case TCP_BBR_USE_RACK_RR:
		/* Do we use the rack cheat for rxt */
		optval = rack->use_rack_rr;
		break;
	case TCP_RACK_RR_CONF:
		optval = rack->r_rr_config;
		break;
	case TCP_HDWR_RATE_CAP:
		optval = rack->r_rack_hw_rate_caps;
		break;
	case TCP_BBR_HDWR_PACE:
		optval = rack->rack_hdw_pace_ena;
		break;
	case TCP_RACK_TLP_THRESH:
		/* RACK TLP threshold i.e. srtt+(srtt/N) */
		optval = rack->r_ctl.rc_tlp_threshold;
		break;
	case TCP_RACK_PKT_DELAY:
		/* RACK added ms i.e. rack-rtt + reord + N */
		optval = rack->r_ctl.rc_pkt_delay;
		break;
	case TCP_RACK_TLP_USE:
		optval = rack->rack_tlp_threshold_use;
		break;
	case TCP_PACING_DND:
		optval = rack->rc_pace_dnd;
		break;
	case TCP_RACK_PACE_RATE_CA:
		optval = rack->r_ctl.rc_fixed_pacing_rate_ca;
		break;
	case TCP_RACK_PACE_RATE_SS:
		optval = rack->r_ctl.rc_fixed_pacing_rate_ss;
		break;
	case TCP_RACK_PACE_RATE_REC:
		optval = rack->r_ctl.rc_fixed_pacing_rate_rec;
		break;
	case TCP_DGP_UPPER_BOUNDS:
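		/* Slow-start bound goes in the upper 16 bits, CA bound in the lower 16. */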
		optval = rack->r_ctl.rack_per_upper_bound_ss;
		optval <<= 16;
		optval |= rack->r_ctl.rack_per_upper_bound_ca;
		break;
	case TCP_RACK_GP_INCREASE_SS:
		optval = rack->r_ctl.rack_per_of_gp_ss;
		break;
	case TCP_RACK_GP_INCREASE_CA:
		optval = rack->r_ctl.rack_per_of_gp_ca;
		break;
	case TCP_RACK_PACING_DIVISOR:
		optval = rack->r_ctl.pace_len_divisor;
		break;
	case TCP_BBR_RACK_RTT_USE:
		optval = rack->r_ctl.rc_rate_sample_method;
		break;
	case TCP_DELACK:
		optval = tp->t_delayed_ack;
		break;
	case TCP_DATA_AFTER_CLOSE:
		optval = rack->rc_allow_data_af_clo;
		break;
	case TCP_SHARED_CWND_TIME_LIMIT:
		optval = rack->r_limit_scw;
		break;
	case TCP_HONOR_HPTS_MIN:
		if (rack->r_use_hpts_min)
			optval = rack->r_ctl.max_reduction;
		else
			optval = 0;
		break;
	case TCP_REC_IS_DYN:
		optval = rack->rc_gp_no_rec_chg;
		break;
	case TCP_NO_TIMELY:
		optval = rack->rc_skip_timely;
		break;
	case TCP_RACK_TIMER_SLOP:
		optval = rack->r_ctl.timer_slop;
		break;
	default:
		return (tcp_default_ctloutput(tp, sopt));
		break;
	}
	INP_WUNLOCK(inp);
	if (error == 0) {
		if ((sopt->sopt_name == TCP_PACING_RATE_CAP) ||
		    (sopt->sopt_name == TCP_FILLCW_RATE_CAP))
			error = sooptcopyout(sopt, &loptval, sizeof loptval);
		else
			error = sooptcopyout(sopt, &optval, sizeof optval);
	}
	return (error);
}

static int
rack_ctloutput(struct tcpcb *tp, struct sockopt *sopt)
{
	if (sopt->sopt_dir == SOPT_SET) {
		return (rack_set_sockopt(tp, sopt));
	} else if (sopt->sopt_dir == SOPT_GET) {
		return (rack_get_sockopt(tp, sopt));
	} else {
		panic("%s: sopt_dir %d", __func__, sopt->sopt_dir);
	}
}

static const char *rack_stack_names[] = {
	__XSTRING(STACKNAME),
#ifdef STACKALIAS
	__XSTRING(STACKALIAS),
#endif
};

static int
rack_ctor(void *mem, int32_t size, void *arg, int32_t how)
{
	memset(mem, 0, size);
	return (0);
}

static void
rack_dtor(void *mem, int32_t size, void *arg)
{

}

static bool rack_mod_inited = false;

static int
tcp_addrack(module_t mod, int32_t type, void *data)
{
	int32_t err = 0;
	int num_stacks;

	switch (type) {
	case MOD_LOAD:
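		/*
		 * Two UMA zones back the stack: rack_zone for the per-packet
		 * rack_sendmap entries and rack_pcb_zone for the
		 * per-connection rack state hung off tp->t_fb_ptr.
		 */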
		rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map",
		    sizeof(struct rack_sendmap),
		    rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);

		rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb",
		    sizeof(struct tcp_rack),
		    rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);

		sysctl_ctx_init(&rack_sysctl_ctx);
		rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
		    OID_AUTO,
#ifdef STACKALIAS
		    __XSTRING(STACKALIAS),
#else
		    __XSTRING(STACKNAME),
#endif
		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
		    "");
		if (rack_sysctl_root == NULL) {
			printf("Failed to add sysctl node\n");
			err = EFAULT;
			goto free_uma;
		}
		rack_init_sysctls();
		num_stacks = nitems(rack_stack_names);
		err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK,
		    rack_stack_names, &num_stacks);
		if (err) {
			printf("Failed to register %s stack name for "
			    "%s module\n", rack_stack_names[num_stacks],
			    __XSTRING(MODNAME));
			sysctl_ctx_free(&rack_sysctl_ctx);
free_uma:
			uma_zdestroy(rack_zone);
			uma_zdestroy(rack_pcb_zone);
			rack_counter_destroy();
			printf("Failed to register rack module -- err:%d\n", err);
			return (err);
		}
		tcp_lro_reg_mbufq();
		rack_mod_inited = true;
		break;
	case MOD_QUIESCE:
		err = deregister_tcp_functions(&__tcp_rack, true, false);
		break;
	case MOD_UNLOAD:
		err = deregister_tcp_functions(&__tcp_rack, false, true);
		if (err == EBUSY)
			break;
		if (rack_mod_inited) {
			uma_zdestroy(rack_zone);
			uma_zdestroy(rack_pcb_zone);
			sysctl_ctx_free(&rack_sysctl_ctx);
			rack_counter_destroy();
			rack_mod_inited = false;
		}
		tcp_lro_dereg_mbufq();
		err = 0;
		break;
	default:
		return (EOPNOTSUPP);
	}
	return (err);
}

static moduledata_t tcp_rack = {
	.name = __XSTRING(MODNAME),
	.evhand = tcp_addrack,
	.priv = 0
};

MODULE_VERSION(MODNAME, 1);
DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1);
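
/*
 * Illustrative usage once this module is available (not part of this
 * file): load it and select it as the default stack, e.g.
 *
 *	kldload tcp_rack
 *	sysctl net.inet.tcp.functions_default=rack
 *
 * or select it per-socket with the TCP_FUNCTION_BLK socket option.
 */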

#endif /* defined(INET) || defined(INET6) */