1 /*- 2 * Copyright (c) 2016-2020 Netflix, Inc. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 * SUCH DAMAGE. 24 * 25 */ 26 27 #include <sys/cdefs.h> 28 #include "opt_inet.h" 29 #include "opt_inet6.h" 30 #include "opt_ipsec.h" 31 #include "opt_ratelimit.h" 32 #include "opt_kern_tls.h" 33 #if defined(INET) || defined(INET6) 34 #include <sys/param.h> 35 #include <sys/arb.h> 36 #include <sys/module.h> 37 #include <sys/kernel.h> 38 #ifdef TCP_HHOOK 39 #include <sys/hhook.h> 40 #endif 41 #include <sys/lock.h> 42 #include <sys/malloc.h> 43 #include <sys/mutex.h> 44 #include <sys/mbuf.h> 45 #include <sys/proc.h> /* for proc0 declaration */ 46 #include <sys/socket.h> 47 #include <sys/socketvar.h> 48 #include <sys/sysctl.h> 49 #include <sys/systm.h> 50 #ifdef STATS 51 #include <sys/qmath.h> 52 #include <sys/tree.h> 53 #include <sys/stats.h> /* Must come after qmath.h and tree.h */ 54 #else 55 #include <sys/tree.h> 56 #endif 57 #include <sys/refcount.h> 58 #include <sys/queue.h> 59 #include <sys/tim_filter.h> 60 #include <sys/smp.h> 61 #include <sys/kthread.h> 62 #include <sys/kern_prefetch.h> 63 #include <sys/protosw.h> 64 #ifdef TCP_ACCOUNTING 65 #include <sys/sched.h> 66 #include <machine/cpu.h> 67 #endif 68 #include <vm/uma.h> 69 70 #include <net/route.h> 71 #include <net/route/nhop.h> 72 #include <net/vnet.h> 73 74 #define TCPSTATES /* for logging */ 75 76 #include <netinet/in.h> 77 #include <netinet/in_kdtrace.h> 78 #include <netinet/in_pcb.h> 79 #include <netinet/ip.h> 80 #include <netinet/ip_var.h> 81 #include <netinet/ip6.h> 82 #include <netinet6/in6_pcb.h> 83 #include <netinet6/ip6_var.h> 84 #include <netinet/tcp.h> 85 #define TCPOUTFLAGS 86 #include <netinet/tcp_fsm.h> 87 #include <netinet/tcp_seq.h> 88 #include <netinet/tcp_timer.h> 89 #include <netinet/tcp_var.h> 90 #include <netinet/tcp_log_buf.h> 91 #include <netinet/tcp_syncache.h> 92 #include <netinet/tcp_hpts.h> 93 #include <netinet/tcp_ratelimit.h> 94 #include <netinet/tcp_accounting.h> 95 #include <netinet/tcpip.h> 96 #include <netinet/cc/cc.h> 97 #include <netinet/cc/cc_newreno.h> 98 #include <netinet/tcp_fastopen.h> 99 #include <netinet/tcp_lro.h> 100 #ifdef NETFLIX_SHARED_CWND 101 #include <netinet/tcp_shared_cwnd.h> 102 #endif 103 #ifdef TCP_OFFLOAD 104 #include <netinet/tcp_offload.h> 105 #endif 106 #ifdef INET6 107 #include <netinet6/tcp6_var.h> 108 #endif 109 #include <netinet/tcp_ecn.h> 110 111 #include <netipsec/ipsec_support.h> 112 113 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 114 #include <netipsec/ipsec.h> 115 #include <netipsec/ipsec6.h> 116 #endif /* IPSEC */ 117 118 #include <netinet/udp.h> 119 #include <netinet/udp_var.h> 120 #include <machine/in_cksum.h> 121 122 #ifdef MAC 123 #include <security/mac/mac_framework.h> 124 #endif 125 #include "sack_filter.h" 126 #include "tcp_rack.h" 127 #include "tailq_hash.h" 128 #include "rack_bbr_common.h" 129 130 uma_zone_t rack_zone; 131 uma_zone_t rack_pcb_zone; 132 133 #ifndef TICKS2SBT 134 #define TICKS2SBT(__t) (tick_sbt * ((sbintime_t)(__t))) 135 #endif 136 137 VNET_DECLARE(uint32_t, newreno_beta); 138 VNET_DECLARE(uint32_t, newreno_beta_ecn); 139 #define V_newreno_beta VNET(newreno_beta) 140 #define V_newreno_beta_ecn VNET(newreno_beta_ecn) 141 142 #define M_TCPFSB __CONCAT(M_TCPFSB, STACKNAME) 143 #define M_TCPDO __CONCAT(M_TCPDO, STACKNAME) 144 145 MALLOC_DEFINE(M_TCPFSB, "tcp_fsb_" __XSTRING(STACKNAME), "TCP fast send block"); 146 MALLOC_DEFINE(M_TCPDO, "tcp_do_" __XSTRING(STACKNAME), "TCP deferred options"); 147 MALLOC_DEFINE(M_TCPPCM, "tcp_pcm_" __XSTRING(STACKNAME), "TCP PCM measurement information"); 148 149 struct sysctl_ctx_list rack_sysctl_ctx; 150 struct sysctl_oid *rack_sysctl_root; 151 152 #define CUM_ACKED 1 153 #define SACKED 2 154 155 /* 156 * The RACK module incorporates a number of 157 * TCP ideas that have been put out into the IETF 158 * over the last few years: 159 * - Matt Mathis's Rate Halving which slowly drops 160 * the congestion window so that the ack clock can 161 * be maintained during a recovery. 162 * - Yuchung Cheng's RACK TCP (for which its named) that 163 * will stop us using the number of dup acks and instead 164 * use time as the gage of when we retransmit. 165 * - Reorder Detection of RFC4737 and the Tail-Loss probe draft 166 * of Dukkipati et.al. 167 * RACK depends on SACK, so if an endpoint arrives that 168 * cannot do SACK the state machine below will shuttle the 169 * connection back to using the "default" TCP stack that is 170 * in FreeBSD. 171 * 172 * To implement RACK the original TCP stack was first decomposed 173 * into a functional state machine with individual states 174 * for each of the possible TCP connection states. The do_segment 175 * functions role in life is to mandate the connection supports SACK 176 * initially and then assure that the RACK state matches the conenction 177 * state before calling the states do_segment function. Each 178 * state is simplified due to the fact that the original do_segment 179 * has been decomposed and we *know* what state we are in (no 180 * switches on the state) and all tests for SACK are gone. This 181 * greatly simplifies what each state does. 182 * 183 * TCP output is also over-written with a new version since it 184 * must maintain the new rack scoreboard. 185 * 186 */ 187 static int32_t rack_tlp_thresh = 1; 188 static int32_t rack_tlp_limit = 2; /* No more than 2 TLPs w-out new data */ 189 static int32_t rack_tlp_use_greater = 1; 190 static int32_t rack_reorder_thresh = 2; 191 static int32_t rack_reorder_fade = 60000000; /* 0 - never fade, def 60,000,000 192 * - 60 seconds */ 193 static uint32_t rack_pcm_every_n_rounds = 100; 194 static uint32_t rack_pcm_blast = 0; 195 static uint32_t rack_pcm_is_enabled = 1; 196 static uint8_t rack_ssthresh_rest_rto_rec = 0; /* Do we restore ssthresh when we have rec -> rto -> rec */ 197 198 static uint32_t rack_gp_gain_req = 1200; /* Amount percent wise required to gain to record a round as "gaining" */ 199 static uint32_t rack_rnd_cnt_req = 0x10005; /* Default number of rounds if we are below rack_gp_gain_req where we exit ss */ 200 201 202 static int32_t rack_rxt_scoreboard_clear_thresh = 2; 203 static int32_t rack_dnd_default = 0; /* For rr_conf = 3, what is the default for dnd */ 204 static int32_t rack_rxt_controls = 0; 205 static int32_t rack_fill_cw_state = 0; 206 static uint8_t rack_req_measurements = 1; 207 static int32_t rack_enable_hw_pacing = 0; /* Due to CCSP keep it off by default */ 208 static int32_t rack_hw_rate_caps = 0; /* 1; */ 209 static int32_t rack_hw_rate_cap_per = 0; /* 0 -- off */ 210 static int32_t rack_hw_rate_min = 0; /* 1500000;*/ 211 static int32_t rack_hw_rate_to_low = 0; /* 1200000; */ 212 static int32_t rack_hw_up_only = 0; 213 static int32_t rack_stats_gets_ms_rtt = 1; 214 static int32_t rack_prr_addbackmax = 2; 215 static int32_t rack_do_hystart = 0; 216 static int32_t rack_apply_rtt_with_reduced_conf = 0; 217 static int32_t rack_hibeta_setting = 0; 218 static int32_t rack_default_pacing_divisor = 250; 219 static uint16_t rack_pacing_min_seg = 0; 220 static int32_t rack_timely_off = 0; 221 222 static int32_t rack_pkt_delay = 1000; 223 static int32_t rack_send_a_lot_in_prr = 1; 224 static int32_t rack_min_to = 1000; /* Number of microsecond min timeout */ 225 static int32_t rack_verbose_logging = 0; 226 static int32_t rack_ignore_data_after_close = 1; 227 static int32_t rack_enable_shared_cwnd = 1; 228 static int32_t rack_use_cmp_acks = 1; 229 static int32_t rack_use_fsb = 1; 230 static int32_t rack_use_rfo = 1; 231 static int32_t rack_use_rsm_rfo = 1; 232 static int32_t rack_max_abc_post_recovery = 2; 233 static int32_t rack_client_low_buf = 0; 234 static int32_t rack_dsack_std_based = 0x3; /* bit field bit 1 sets rc_rack_tmr_std_based and bit 2 sets rc_rack_use_dsack */ 235 static int32_t rack_bw_multipler = 0; /* Limit on fill cw's jump up to be this x gp_est */ 236 #ifdef TCP_ACCOUNTING 237 static int32_t rack_tcp_accounting = 0; 238 #endif 239 static int32_t rack_limits_scwnd = 1; 240 static int32_t rack_enable_mqueue_for_nonpaced = 0; 241 static int32_t rack_hybrid_allow_set_maxseg = 0; 242 static int32_t rack_disable_prr = 0; 243 static int32_t use_rack_rr = 1; 244 static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the configured rate (ss/ca)? */ 245 static int32_t rack_persist_min = 250000; /* 250usec */ 246 static int32_t rack_persist_max = 2000000; /* 2 Second in usec's */ 247 static int32_t rack_honors_hpts_min_to = 1; /* Do we honor the hpts minimum time out for pacing timers */ 248 static uint32_t rack_max_reduce = 10; /* Percent we can reduce pacing delay by */ 249 static int32_t rack_sack_not_required = 1; /* set to one to allow non-sack to use rack */ 250 static int32_t rack_limit_time_with_srtt = 0; 251 static int32_t rack_autosndbuf_inc = 20; /* In percentage form */ 252 static int32_t rack_enobuf_hw_boost_mult = 0; /* How many times the hw rate we boost pacing delay using time_between */ 253 static int32_t rack_enobuf_hw_max = 12000; /* 12 ms in usecs */ 254 static int32_t rack_enobuf_hw_min = 10000; /* 10 ms in usecs */ 255 static int32_t rack_hw_rwnd_factor = 2; /* How many max_segs the rwnd must be before we hold off sending */ 256 static int32_t rack_hw_check_queue = 0; /* Do we always pre-check queue depth of a hw queue */ 257 258 /* 259 * Currently regular tcp has a rto_min of 30ms 260 * the backoff goes 12 times so that ends up 261 * being a total of 122.850 seconds before a 262 * connection is killed. 263 */ 264 static uint32_t rack_def_data_window = 20; 265 static uint32_t rack_goal_bdp = 2; 266 static uint32_t rack_min_srtts = 1; 267 static uint32_t rack_min_measure_usec = 0; 268 static int32_t rack_tlp_min = 10000; /* 10ms */ 269 static int32_t rack_rto_min = 30000; /* 30,000 usec same as main freebsd */ 270 static int32_t rack_rto_max = 4000000; /* 4 seconds in usec's */ 271 static const int32_t rack_free_cache = 2; 272 static int32_t rack_hptsi_segments = 40; 273 static int32_t rack_rate_sample_method = USE_RTT_LOW; 274 static int32_t rack_pace_every_seg = 0; 275 static int32_t rack_delayed_ack_time = 40000; /* 40ms in usecs */ 276 static int32_t rack_pacing_delay_reduction = 4; 277 static int32_t rack_wma_divisor = 8; /* For WMA calculation */ 278 static int32_t rack_cwnd_block_ends_measure = 0; 279 static int32_t rack_rwnd_block_ends_measure = 0; 280 static int32_t rack_def_profile = 0; 281 282 static int32_t rack_lower_cwnd_at_tlp = 0; 283 static int32_t rack_always_send_oldest = 0; 284 static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE; 285 286 static uint16_t rack_per_of_gp_ss = 250; /* 250 % slow-start */ 287 static uint16_t rack_per_of_gp_ca = 200; /* 200 % congestion-avoidance */ 288 static uint16_t rack_per_of_gp_rec = 200; /* 200 % of bw */ 289 290 /* Probertt */ 291 static uint16_t rack_per_of_gp_probertt = 60; /* 60% of bw */ 292 static uint16_t rack_per_of_gp_lowthresh = 40; /* 40% is bottom */ 293 static uint16_t rack_per_of_gp_probertt_reduce = 10; /* 10% reduction */ 294 static uint16_t rack_atexit_prtt_hbp = 130; /* Clamp to 130% on exit prtt if highly buffered path */ 295 static uint16_t rack_atexit_prtt = 130; /* Clamp to 100% on exit prtt if non highly buffered path */ 296 297 static uint32_t rack_max_drain_wait = 2; /* How man gp srtt's before we give up draining */ 298 static uint32_t rack_must_drain = 1; /* How many GP srtt's we *must* wait */ 299 static uint32_t rack_probertt_use_min_rtt_entry = 1; /* Use the min to calculate the goal else gp_srtt */ 300 static uint32_t rack_probertt_use_min_rtt_exit = 0; 301 static uint32_t rack_probe_rtt_sets_cwnd = 0; 302 static uint32_t rack_probe_rtt_safety_val = 2000000; /* No more than 2 sec in probe-rtt */ 303 static uint32_t rack_time_between_probertt = 9600000; /* 9.6 sec in usecs */ 304 static uint32_t rack_probertt_gpsrtt_cnt_mul = 0; /* How many srtt periods does probe-rtt last top fraction */ 305 static uint32_t rack_probertt_gpsrtt_cnt_div = 0; /* How many srtt periods does probe-rtt last bottom fraction */ 306 static uint32_t rack_min_probertt_hold = 40000; /* Equal to delayed ack time */ 307 static uint32_t rack_probertt_filter_life = 10000000; 308 static uint32_t rack_probertt_lower_within = 10; 309 static uint32_t rack_min_rtt_movement = 250000; /* Must move at least 250ms (in microseconds) to count as a lowering */ 310 static int32_t rack_pace_one_seg = 0; /* Shall we pace for less than 1.4Meg 1MSS at a time */ 311 static int32_t rack_probertt_clear_is = 1; 312 static int32_t rack_max_drain_hbp = 1; /* Extra drain times gpsrtt for highly buffered paths */ 313 static int32_t rack_hbp_thresh = 3; /* what is the divisor max_rtt/min_rtt to decided a hbp */ 314 315 /* Part of pacing */ 316 static int32_t rack_max_per_above = 30; /* When we go to increment stop if above 100+this% */ 317 318 /* Timely information: 319 * 320 * Here we have various control parameters on how 321 * timely may change the multiplier. rack_gain_p5_ub 322 * is associated with timely but not directly influencing 323 * the rate decision like the other variables. It controls 324 * the way fill-cw interacts with timely and caps how much 325 * timely can boost the fill-cw b/w. 326 * 327 * The other values are various boost/shrink numbers as well 328 * as potential caps when adjustments are made to the timely 329 * gain (returned by rack_get_output_gain(). Remember too that 330 * the gain returned can be overriden by other factors such as 331 * probeRTT as well as fixed-rate-pacing. 332 */ 333 static int32_t rack_gain_p5_ub = 250; 334 static int32_t rack_gp_per_bw_mul_up = 2; /* 2% */ 335 static int32_t rack_gp_per_bw_mul_down = 4; /* 4% */ 336 static int32_t rack_gp_rtt_maxmul = 3; /* 3 x maxmin */ 337 static int32_t rack_gp_rtt_minmul = 1; /* minrtt + (minrtt/mindiv) is lower rtt */ 338 static int32_t rack_gp_rtt_mindiv = 4; /* minrtt + (minrtt * minmul/mindiv) is lower rtt */ 339 static int32_t rack_gp_decrease_per = 80; /* Beta value of timely decrease (.8) = 80 */ 340 static int32_t rack_gp_increase_per = 2; /* 2% increase in multiplier */ 341 static int32_t rack_per_lower_bound = 50; /* Don't allow to drop below this multiplier */ 342 static int32_t rack_per_upper_bound_ss = 0; /* Don't allow SS to grow above this */ 343 static int32_t rack_per_upper_bound_ca = 0; /* Don't allow CA to grow above this */ 344 static int32_t rack_do_dyn_mul = 0; /* Are the rack gp multipliers dynamic */ 345 static int32_t rack_gp_no_rec_chg = 1; /* Prohibit recovery from reducing it's multiplier */ 346 static int32_t rack_timely_dec_clear = 6; /* Do we clear decrement count at a value (6)? */ 347 static int32_t rack_timely_max_push_rise = 3; /* One round of pushing */ 348 static int32_t rack_timely_max_push_drop = 3; /* Three round of pushing */ 349 static int32_t rack_timely_min_segs = 4; /* 4 segment minimum */ 350 static int32_t rack_timely_no_stopping = 0; 351 static int32_t rack_down_raise_thresh = 100; 352 static int32_t rack_req_segs = 1; 353 static uint64_t rack_bw_rate_cap = 0; 354 static uint64_t rack_fillcw_bw_cap = 3750000; /* Cap fillcw at 30Mbps */ 355 356 357 /* Rack specific counters */ 358 counter_u64_t rack_saw_enobuf; 359 counter_u64_t rack_saw_enobuf_hw; 360 counter_u64_t rack_saw_enetunreach; 361 counter_u64_t rack_persists_sends; 362 counter_u64_t rack_persists_acks; 363 counter_u64_t rack_persists_loss; 364 counter_u64_t rack_persists_lost_ends; 365 counter_u64_t rack_total_bytes; 366 #ifdef INVARIANTS 367 counter_u64_t rack_adjust_map_bw; 368 #endif 369 /* Tail loss probe counters */ 370 counter_u64_t rack_tlp_tot; 371 counter_u64_t rack_tlp_newdata; 372 counter_u64_t rack_tlp_retran; 373 counter_u64_t rack_tlp_retran_bytes; 374 counter_u64_t rack_to_tot; 375 counter_u64_t rack_hot_alloc; 376 counter_u64_t rack_to_alloc; 377 counter_u64_t rack_to_alloc_hard; 378 counter_u64_t rack_to_alloc_emerg; 379 counter_u64_t rack_to_alloc_limited; 380 counter_u64_t rack_alloc_limited_conns; 381 counter_u64_t rack_split_limited; 382 counter_u64_t rack_rxt_clamps_cwnd; 383 counter_u64_t rack_rxt_clamps_cwnd_uniq; 384 385 counter_u64_t rack_multi_single_eq; 386 counter_u64_t rack_proc_non_comp_ack; 387 388 counter_u64_t rack_fto_send; 389 counter_u64_t rack_fto_rsm_send; 390 counter_u64_t rack_nfto_resend; 391 counter_u64_t rack_non_fto_send; 392 counter_u64_t rack_extended_rfo; 393 394 counter_u64_t rack_sack_proc_all; 395 counter_u64_t rack_sack_proc_short; 396 counter_u64_t rack_sack_proc_restart; 397 398 counter_u64_t rack_input_idle_reduces; 399 counter_u64_t rack_collapsed_win; 400 counter_u64_t rack_collapsed_win_seen; 401 counter_u64_t rack_collapsed_win_rxt; 402 counter_u64_t rack_collapsed_win_rxt_bytes; 403 counter_u64_t rack_try_scwnd; 404 counter_u64_t rack_hw_pace_init_fail; 405 counter_u64_t rack_hw_pace_lost; 406 407 counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE]; 408 counter_u64_t rack_opts_arry[RACK_OPTS_SIZE]; 409 410 411 #define RACK_REXMTVAL(tp) max(rack_rto_min, ((tp)->t_srtt + ((tp)->t_rttvar << 2))) 412 413 #define RACK_TCPT_RANGESET(tv, value, tvmin, tvmax, slop) do { \ 414 (tv) = (value) + slop; \ 415 if ((u_long)(tv) < (u_long)(tvmin)) \ 416 (tv) = (tvmin); \ 417 if ((u_long)(tv) > (u_long)(tvmax)) \ 418 (tv) = (tvmax); \ 419 } while (0) 420 421 static void 422 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line); 423 424 static int 425 rack_process_ack(struct mbuf *m, struct tcphdr *th, 426 struct socket *so, struct tcpcb *tp, struct tcpopt *to, 427 uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val, int32_t orig_tlen); 428 static int 429 rack_process_data(struct mbuf *m, struct tcphdr *th, 430 struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 431 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 432 static void 433 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, 434 uint32_t th_ack, uint16_t nsegs, uint16_t type, int32_t recovery); 435 static struct rack_sendmap *rack_alloc(struct tcp_rack *rack); 436 static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack, 437 uint8_t limit_type); 438 static struct rack_sendmap * 439 rack_check_recovery_mode(struct tcpcb *tp, 440 uint32_t tsused); 441 static uint32_t 442 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack); 443 static void 444 rack_cong_signal(struct tcpcb *tp, 445 uint32_t type, uint32_t ack, int ); 446 static void rack_counter_destroy(void); 447 static int 448 rack_ctloutput(struct tcpcb *tp, struct sockopt *sopt); 449 static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how); 450 static void 451 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override); 452 static void 453 rack_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, 454 int32_t drop_hdrlen, int32_t tlen, uint8_t iptos); 455 static void rack_dtor(void *mem, int32_t size, void *arg); 456 static void 457 rack_log_alt_to_to_cancel(struct tcp_rack *rack, 458 uint32_t flex1, uint32_t flex2, 459 uint32_t flex3, uint32_t flex4, 460 uint32_t flex5, uint32_t flex6, 461 uint16_t flex7, uint8_t mod); 462 463 static void 464 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t pacing_delay, 465 uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line, 466 struct rack_sendmap *rsm, uint8_t quality); 467 static struct rack_sendmap * 468 rack_find_high_nonack(struct tcp_rack *rack, 469 struct rack_sendmap *rsm); 470 static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack); 471 static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm); 472 static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged); 473 static int rack_get_sockopt(struct tcpcb *tp, struct sockopt *sopt); 474 static void 475 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, 476 tcp_seq th_ack, int line, uint8_t quality); 477 static void 478 rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint8_t frm); 479 480 static uint32_t 481 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss); 482 static int32_t rack_handoff_ok(struct tcpcb *tp); 483 static int32_t rack_init(struct tcpcb *tp, void **ptr); 484 static void rack_init_sysctls(void); 485 486 static void 487 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, 488 struct tcphdr *th, int entered_rec, int dup_ack_struck, 489 int *dsack_seen, int *sacks_seen); 490 static void 491 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 492 uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t ts, 493 struct rack_sendmap *hintrsm, uint32_t add_flags, struct mbuf *s_mb, uint32_t s_moff, int hw_tls, int segsiz); 494 495 static uint64_t rack_get_gp_est(struct tcp_rack *rack); 496 497 498 static void 499 rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack, 500 struct rack_sendmap *rsm, uint32_t cts); 501 static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm); 502 static int32_t rack_output(struct tcpcb *tp); 503 504 static uint32_t 505 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, 506 struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm, 507 uint32_t cts, uint32_t segsiz); 508 static void rack_post_recovery(struct tcpcb *tp, uint32_t th_seq); 509 static void rack_remxt_tmr(struct tcpcb *tp); 510 static int rack_set_sockopt(struct tcpcb *tp, struct sockopt *sopt); 511 static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack); 512 static int32_t rack_stopall(struct tcpcb *tp); 513 static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line); 514 static uint32_t 515 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 516 struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint32_t add_flag, int segsiz); 517 static void 518 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, 519 struct rack_sendmap *rsm, uint64_t ts, uint32_t add_flag, int segsiz); 520 static int 521 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 522 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack); 523 static int32_t tcp_addrack(module_t mod, int32_t type, void *data); 524 static int 525 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, 526 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 527 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 528 529 static int 530 rack_do_closing(struct mbuf *m, struct tcphdr *th, 531 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 532 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 533 static int 534 rack_do_established(struct mbuf *m, struct tcphdr *th, 535 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 536 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 537 static int 538 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, 539 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 540 int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos); 541 static int 542 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, 543 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 544 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 545 static int 546 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, 547 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 548 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 549 static int 550 rack_do_lastack(struct mbuf *m, struct tcphdr *th, 551 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 552 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 553 static int 554 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, 555 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 556 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 557 static int 558 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, 559 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 560 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 561 static void rack_chk_req_and_hybrid_on_out(struct tcp_rack *rack, tcp_seq seq, uint32_t len, uint64_t cts); 562 struct rack_sendmap * 563 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, 564 uint32_t tsused); 565 static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, 566 uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt); 567 static void 568 tcp_rack_partialack(struct tcpcb *tp); 569 static int 570 rack_set_profile(struct tcp_rack *rack, int prof); 571 static void 572 rack_apply_deferred_options(struct tcp_rack *rack); 573 574 int32_t rack_clear_counter=0; 575 576 static uint64_t 577 rack_get_lt_bw(struct tcp_rack *rack) 578 { 579 struct timeval tv; 580 uint64_t tim, bytes; 581 582 tim = rack->r_ctl.lt_bw_time; 583 bytes = rack->r_ctl.lt_bw_bytes; 584 if (rack->lt_bw_up) { 585 /* Include all the current bytes too */ 586 microuptime(&tv); 587 bytes += (rack->rc_tp->snd_una - rack->r_ctl.lt_seq); 588 tim += (tcp_tv_to_lusec(&tv) - rack->r_ctl.lt_timemark); 589 } 590 if ((bytes != 0) && (tim != 0)) 591 return ((bytes * (uint64_t)1000000) / tim); 592 else 593 return (0); 594 } 595 596 static void 597 rack_swap_beta_values(struct tcp_rack *rack, uint8_t flex8) 598 { 599 struct sockopt sopt; 600 struct cc_newreno_opts opt; 601 struct tcpcb *tp; 602 uint32_t old_beta; 603 uint32_t old_beta_ecn; 604 int error = 0, failed = 0; 605 606 tp = rack->rc_tp; 607 if (tp->t_cc == NULL) { 608 /* Tcb is leaving */ 609 return; 610 } 611 rack->rc_pacing_cc_set = 1; 612 if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) { 613 /* Not new-reno we can't play games with beta! */ 614 failed = 1; 615 goto out; 616 617 } 618 if (CC_ALGO(tp)->ctl_output == NULL) { 619 /* Huh, not using new-reno so no swaps.? */ 620 failed = 2; 621 goto out; 622 } 623 /* Get the current values out */ 624 sopt.sopt_valsize = sizeof(struct cc_newreno_opts); 625 sopt.sopt_dir = SOPT_GET; 626 opt.name = CC_NEWRENO_BETA; 627 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt); 628 if (error) { 629 failed = 3; 630 goto out; 631 } 632 old_beta = opt.val; 633 opt.name = CC_NEWRENO_BETA_ECN; 634 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt); 635 if (error) { 636 failed = 4; 637 goto out; 638 } 639 old_beta_ecn = opt.val; 640 641 /* Now lets set in the values we have stored */ 642 sopt.sopt_dir = SOPT_SET; 643 opt.name = CC_NEWRENO_BETA; 644 opt.val = rack->r_ctl.rc_saved_beta; 645 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt); 646 if (error) { 647 failed = 5; 648 goto out; 649 } 650 opt.name = CC_NEWRENO_BETA_ECN; 651 opt.val = rack->r_ctl.rc_saved_beta_ecn; 652 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt); 653 if (error) { 654 failed = 6; 655 goto out; 656 } 657 /* Save off the values for restoral */ 658 rack->r_ctl.rc_saved_beta = old_beta; 659 rack->r_ctl.rc_saved_beta_ecn = old_beta_ecn; 660 out: 661 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 662 union tcp_log_stackspecific log; 663 struct timeval tv; 664 struct newreno *ptr; 665 666 ptr = ((struct newreno *)tp->t_ccv.cc_data); 667 memset(&log, 0, sizeof(log)); 668 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 669 log.u_bbr.flex1 = ptr->beta; 670 log.u_bbr.flex2 = ptr->beta_ecn; 671 log.u_bbr.flex3 = ptr->newreno_flags; 672 log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta; 673 log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta_ecn; 674 log.u_bbr.flex6 = failed; 675 log.u_bbr.flex7 = rack->gp_ready; 676 log.u_bbr.flex7 <<= 1; 677 log.u_bbr.flex7 |= rack->use_fixed_rate; 678 log.u_bbr.flex7 <<= 1; 679 log.u_bbr.flex7 |= rack->rc_pacing_cc_set; 680 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 681 log.u_bbr.flex8 = flex8; 682 tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, error, 683 0, &log, false, NULL, NULL, 0, &tv); 684 } 685 } 686 687 static void 688 rack_set_cc_pacing(struct tcp_rack *rack) 689 { 690 if (rack->rc_pacing_cc_set) 691 return; 692 /* 693 * Use the swap utility placing in 3 for flex8 to id a 694 * set of a new set of values. 695 */ 696 rack->rc_pacing_cc_set = 1; 697 rack_swap_beta_values(rack, 3); 698 } 699 700 static void 701 rack_undo_cc_pacing(struct tcp_rack *rack) 702 { 703 if (rack->rc_pacing_cc_set == 0) 704 return; 705 /* 706 * Use the swap utility placing in 4 for flex8 to id a 707 * restoral of the old values. 708 */ 709 rack->rc_pacing_cc_set = 0; 710 rack_swap_beta_values(rack, 4); 711 } 712 713 static void 714 rack_remove_pacing(struct tcp_rack *rack) 715 { 716 if (rack->rc_pacing_cc_set) 717 rack_undo_cc_pacing(rack); 718 if (rack->r_ctl.pacing_method & RACK_REG_PACING) 719 tcp_decrement_paced_conn(); 720 if (rack->r_ctl.pacing_method & RACK_DGP_PACING) 721 tcp_dec_dgp_pacing_cnt(); 722 rack->rc_always_pace = 0; 723 rack->r_ctl.pacing_method = RACK_PACING_NONE; 724 rack->dgp_on = 0; 725 rack->rc_hybrid_mode = 0; 726 rack->use_fixed_rate = 0; 727 } 728 729 static void 730 rack_log_gpset(struct tcp_rack *rack, uint32_t seq_end, uint32_t ack_end_t, 731 uint32_t send_end_t, int line, uint8_t mode, struct rack_sendmap *rsm) 732 { 733 if (tcp_bblogging_on(rack->rc_tp) && (rack_verbose_logging != 0)) { 734 union tcp_log_stackspecific log; 735 struct timeval tv; 736 737 memset(&log, 0, sizeof(log)); 738 log.u_bbr.flex1 = seq_end; 739 log.u_bbr.flex2 = rack->rc_tp->gput_seq; 740 log.u_bbr.flex3 = ack_end_t; 741 log.u_bbr.flex4 = rack->rc_tp->gput_ts; 742 log.u_bbr.flex5 = send_end_t; 743 log.u_bbr.flex6 = rack->rc_tp->gput_ack; 744 log.u_bbr.flex7 = mode; 745 log.u_bbr.flex8 = 69; 746 log.u_bbr.rttProp = rack->r_ctl.rc_gp_cumack_ts; 747 log.u_bbr.delRate = rack->r_ctl.rc_gp_output_ts; 748 log.u_bbr.pkts_out = line; 749 log.u_bbr.cwnd_gain = rack->app_limited_needs_set; 750 log.u_bbr.pkt_epoch = rack->r_ctl.rc_app_limited_cnt; 751 log.u_bbr.epoch = rack->r_ctl.current_round; 752 log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost; 753 if (rsm != NULL) { 754 log.u_bbr.applimited = rsm->r_start; 755 log.u_bbr.delivered = rsm->r_end; 756 log.u_bbr.epoch = rsm->r_flags; 757 } 758 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 759 TCP_LOG_EVENTP(rack->rc_tp, NULL, 760 &rack->rc_inp->inp_socket->so_rcv, 761 &rack->rc_inp->inp_socket->so_snd, 762 BBR_LOG_HPTSI_CALC, 0, 763 0, &log, false, &tv); 764 } 765 } 766 767 static int 768 sysctl_rack_clear(SYSCTL_HANDLER_ARGS) 769 { 770 uint32_t stat; 771 int32_t error; 772 773 error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t)); 774 if (error || req->newptr == NULL) 775 return error; 776 777 error = SYSCTL_IN(req, &stat, sizeof(uint32_t)); 778 if (error) 779 return (error); 780 if (stat == 1) { 781 #ifdef INVARIANTS 782 printf("Clearing RACK counters\n"); 783 #endif 784 counter_u64_zero(rack_tlp_tot); 785 counter_u64_zero(rack_tlp_newdata); 786 counter_u64_zero(rack_tlp_retran); 787 counter_u64_zero(rack_tlp_retran_bytes); 788 counter_u64_zero(rack_to_tot); 789 counter_u64_zero(rack_saw_enobuf); 790 counter_u64_zero(rack_saw_enobuf_hw); 791 counter_u64_zero(rack_saw_enetunreach); 792 counter_u64_zero(rack_persists_sends); 793 counter_u64_zero(rack_total_bytes); 794 counter_u64_zero(rack_persists_acks); 795 counter_u64_zero(rack_persists_loss); 796 counter_u64_zero(rack_persists_lost_ends); 797 #ifdef INVARIANTS 798 counter_u64_zero(rack_adjust_map_bw); 799 #endif 800 counter_u64_zero(rack_to_alloc_hard); 801 counter_u64_zero(rack_to_alloc_emerg); 802 counter_u64_zero(rack_sack_proc_all); 803 counter_u64_zero(rack_fto_send); 804 counter_u64_zero(rack_fto_rsm_send); 805 counter_u64_zero(rack_extended_rfo); 806 counter_u64_zero(rack_hw_pace_init_fail); 807 counter_u64_zero(rack_hw_pace_lost); 808 counter_u64_zero(rack_non_fto_send); 809 counter_u64_zero(rack_nfto_resend); 810 counter_u64_zero(rack_sack_proc_short); 811 counter_u64_zero(rack_sack_proc_restart); 812 counter_u64_zero(rack_to_alloc); 813 counter_u64_zero(rack_to_alloc_limited); 814 counter_u64_zero(rack_alloc_limited_conns); 815 counter_u64_zero(rack_split_limited); 816 counter_u64_zero(rack_rxt_clamps_cwnd); 817 counter_u64_zero(rack_rxt_clamps_cwnd_uniq); 818 counter_u64_zero(rack_multi_single_eq); 819 counter_u64_zero(rack_proc_non_comp_ack); 820 counter_u64_zero(rack_try_scwnd); 821 counter_u64_zero(rack_collapsed_win); 822 counter_u64_zero(rack_collapsed_win_rxt); 823 counter_u64_zero(rack_collapsed_win_seen); 824 counter_u64_zero(rack_collapsed_win_rxt_bytes); 825 } else if (stat == 2) { 826 #ifdef INVARIANTS 827 printf("Clearing RACK option array\n"); 828 #endif 829 COUNTER_ARRAY_ZERO(rack_opts_arry, RACK_OPTS_SIZE); 830 } else if (stat == 3) { 831 printf("Rack has no stats counters to clear (use 1 to clear all stats in sysctl node)\n"); 832 } else if (stat == 4) { 833 #ifdef INVARIANTS 834 printf("Clearing RACK out size array\n"); 835 #endif 836 COUNTER_ARRAY_ZERO(rack_out_size, TCP_MSS_ACCT_SIZE); 837 } 838 rack_clear_counter = 0; 839 return (0); 840 } 841 842 static void 843 rack_init_sysctls(void) 844 { 845 struct sysctl_oid *rack_counters; 846 struct sysctl_oid *rack_pacing; 847 struct sysctl_oid *rack_timely; 848 struct sysctl_oid *rack_timers; 849 struct sysctl_oid *rack_tlp; 850 struct sysctl_oid *rack_misc; 851 struct sysctl_oid *rack_features; 852 struct sysctl_oid *rack_measure; 853 struct sysctl_oid *rack_probertt; 854 struct sysctl_oid *rack_hw_pacing; 855 856 rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 857 SYSCTL_CHILDREN(rack_sysctl_root), 858 OID_AUTO, 859 "stats", 860 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 861 "Rack Counters"); 862 SYSCTL_ADD_S32(&rack_sysctl_ctx, 863 SYSCTL_CHILDREN(rack_sysctl_root), 864 OID_AUTO, "rate_sample_method", CTLFLAG_RW, 865 &rack_rate_sample_method , USE_RTT_LOW, 866 "What method should we use for rate sampling 0=high, 1=low "); 867 /* Probe rtt related controls */ 868 rack_probertt = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 869 SYSCTL_CHILDREN(rack_sysctl_root), 870 OID_AUTO, 871 "probertt", 872 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 873 "ProbeRTT related Controls"); 874 SYSCTL_ADD_U16(&rack_sysctl_ctx, 875 SYSCTL_CHILDREN(rack_probertt), 876 OID_AUTO, "exit_per_hpb", CTLFLAG_RW, 877 &rack_atexit_prtt_hbp, 130, 878 "What percentage above goodput do we clamp CA/SS to at exit on high-BDP path 110%"); 879 SYSCTL_ADD_U16(&rack_sysctl_ctx, 880 SYSCTL_CHILDREN(rack_probertt), 881 OID_AUTO, "exit_per_nonhpb", CTLFLAG_RW, 882 &rack_atexit_prtt, 130, 883 "What percentage above goodput do we clamp CA/SS to at exit on a non high-BDP path 100%"); 884 SYSCTL_ADD_U16(&rack_sysctl_ctx, 885 SYSCTL_CHILDREN(rack_probertt), 886 OID_AUTO, "gp_per_mul", CTLFLAG_RW, 887 &rack_per_of_gp_probertt, 60, 888 "What percentage of goodput do we pace at in probertt"); 889 SYSCTL_ADD_U16(&rack_sysctl_ctx, 890 SYSCTL_CHILDREN(rack_probertt), 891 OID_AUTO, "gp_per_reduce", CTLFLAG_RW, 892 &rack_per_of_gp_probertt_reduce, 10, 893 "What percentage of goodput do we reduce every gp_srtt"); 894 SYSCTL_ADD_U16(&rack_sysctl_ctx, 895 SYSCTL_CHILDREN(rack_probertt), 896 OID_AUTO, "gp_per_low", CTLFLAG_RW, 897 &rack_per_of_gp_lowthresh, 40, 898 "What percentage of goodput do we allow the multiplier to fall to"); 899 SYSCTL_ADD_U32(&rack_sysctl_ctx, 900 SYSCTL_CHILDREN(rack_probertt), 901 OID_AUTO, "time_between", CTLFLAG_RW, 902 &rack_time_between_probertt, 96000000, 903 "How many useconds between the lowest rtt falling must past before we enter probertt"); 904 SYSCTL_ADD_U32(&rack_sysctl_ctx, 905 SYSCTL_CHILDREN(rack_probertt), 906 OID_AUTO, "safety", CTLFLAG_RW, 907 &rack_probe_rtt_safety_val, 2000000, 908 "If not zero, provides a maximum usecond that you can stay in probertt (2sec = 2000000)"); 909 SYSCTL_ADD_U32(&rack_sysctl_ctx, 910 SYSCTL_CHILDREN(rack_probertt), 911 OID_AUTO, "sets_cwnd", CTLFLAG_RW, 912 &rack_probe_rtt_sets_cwnd, 0, 913 "Do we set the cwnd too (if always_lower is on)"); 914 SYSCTL_ADD_U32(&rack_sysctl_ctx, 915 SYSCTL_CHILDREN(rack_probertt), 916 OID_AUTO, "maxdrainsrtts", CTLFLAG_RW, 917 &rack_max_drain_wait, 2, 918 "Maximum number of gp_srtt's to hold in drain waiting for flight to reach goal"); 919 SYSCTL_ADD_U32(&rack_sysctl_ctx, 920 SYSCTL_CHILDREN(rack_probertt), 921 OID_AUTO, "mustdrainsrtts", CTLFLAG_RW, 922 &rack_must_drain, 1, 923 "We must drain this many gp_srtt's waiting for flight to reach goal"); 924 SYSCTL_ADD_U32(&rack_sysctl_ctx, 925 SYSCTL_CHILDREN(rack_probertt), 926 OID_AUTO, "goal_use_min_entry", CTLFLAG_RW, 927 &rack_probertt_use_min_rtt_entry, 1, 928 "Should we use the min-rtt to calculate the goal rtt (else gp_srtt) at entry"); 929 SYSCTL_ADD_U32(&rack_sysctl_ctx, 930 SYSCTL_CHILDREN(rack_probertt), 931 OID_AUTO, "goal_use_min_exit", CTLFLAG_RW, 932 &rack_probertt_use_min_rtt_exit, 0, 933 "How to set cwnd at exit, 0 - dynamic, 1 - use min-rtt, 2 - use curgprtt, 3 - entry gp-rtt"); 934 SYSCTL_ADD_U32(&rack_sysctl_ctx, 935 SYSCTL_CHILDREN(rack_probertt), 936 OID_AUTO, "length_div", CTLFLAG_RW, 937 &rack_probertt_gpsrtt_cnt_div, 0, 938 "How many recent goodput srtt periods plus hold tim does probertt last (bottom of fraction)"); 939 SYSCTL_ADD_U32(&rack_sysctl_ctx, 940 SYSCTL_CHILDREN(rack_probertt), 941 OID_AUTO, "length_mul", CTLFLAG_RW, 942 &rack_probertt_gpsrtt_cnt_mul, 0, 943 "How many recent goodput srtt periods plus hold tim does probertt last (top of fraction)"); 944 SYSCTL_ADD_U32(&rack_sysctl_ctx, 945 SYSCTL_CHILDREN(rack_probertt), 946 OID_AUTO, "holdtim_at_target", CTLFLAG_RW, 947 &rack_min_probertt_hold, 200000, 948 "What is the minimum time we hold probertt at target"); 949 SYSCTL_ADD_U32(&rack_sysctl_ctx, 950 SYSCTL_CHILDREN(rack_probertt), 951 OID_AUTO, "filter_life", CTLFLAG_RW, 952 &rack_probertt_filter_life, 10000000, 953 "What is the time for the filters life in useconds"); 954 SYSCTL_ADD_U32(&rack_sysctl_ctx, 955 SYSCTL_CHILDREN(rack_probertt), 956 OID_AUTO, "lower_within", CTLFLAG_RW, 957 &rack_probertt_lower_within, 10, 958 "If the rtt goes lower within this percentage of the time, go into probe-rtt"); 959 SYSCTL_ADD_U32(&rack_sysctl_ctx, 960 SYSCTL_CHILDREN(rack_probertt), 961 OID_AUTO, "must_move", CTLFLAG_RW, 962 &rack_min_rtt_movement, 250, 963 "How much is the minimum movement in rtt to count as a drop for probertt purposes"); 964 SYSCTL_ADD_U32(&rack_sysctl_ctx, 965 SYSCTL_CHILDREN(rack_probertt), 966 OID_AUTO, "clear_is_cnts", CTLFLAG_RW, 967 &rack_probertt_clear_is, 1, 968 "Do we clear I/S counts on exiting probe-rtt"); 969 SYSCTL_ADD_S32(&rack_sysctl_ctx, 970 SYSCTL_CHILDREN(rack_probertt), 971 OID_AUTO, "hbp_extra_drain", CTLFLAG_RW, 972 &rack_max_drain_hbp, 1, 973 "How many extra drain gpsrtt's do we get in highly buffered paths"); 974 SYSCTL_ADD_S32(&rack_sysctl_ctx, 975 SYSCTL_CHILDREN(rack_probertt), 976 OID_AUTO, "hbp_threshold", CTLFLAG_RW, 977 &rack_hbp_thresh, 3, 978 "We are highly buffered if min_rtt_seen / max_rtt_seen > this-threshold"); 979 /* Pacing related sysctls */ 980 rack_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 981 SYSCTL_CHILDREN(rack_sysctl_root), 982 OID_AUTO, 983 "pacing", 984 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 985 "Pacing related Controls"); 986 SYSCTL_ADD_U32(&rack_sysctl_ctx, 987 SYSCTL_CHILDREN(rack_pacing), 988 OID_AUTO, "pcm_enabled", CTLFLAG_RW, 989 &rack_pcm_is_enabled, 1, 990 "Do we by default do PCM measurements?"); 991 SYSCTL_ADD_U32(&rack_sysctl_ctx, 992 SYSCTL_CHILDREN(rack_pacing), 993 OID_AUTO, "pcm_rnds", CTLFLAG_RW, 994 &rack_pcm_every_n_rounds, 100, 995 "How many rounds before we need to do a PCM measurement"); 996 SYSCTL_ADD_U32(&rack_sysctl_ctx, 997 SYSCTL_CHILDREN(rack_pacing), 998 OID_AUTO, "pcm_blast", CTLFLAG_RW, 999 &rack_pcm_blast, 0, 1000 "Blast out the full cwnd/rwnd when doing a PCM measurement"); 1001 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1002 SYSCTL_CHILDREN(rack_pacing), 1003 OID_AUTO, "rnd_gp_gain", CTLFLAG_RW, 1004 &rack_gp_gain_req, 1200, 1005 "How much do we have to increase the GP to record the round 1200 = 120.0"); 1006 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1007 SYSCTL_CHILDREN(rack_pacing), 1008 OID_AUTO, "dgp_out_of_ss_at", CTLFLAG_RW, 1009 &rack_rnd_cnt_req, 0x10005, 1010 "How many rounds less than rnd_gp_gain will drop us out of SS"); 1011 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1012 SYSCTL_CHILDREN(rack_pacing), 1013 OID_AUTO, "no_timely", CTLFLAG_RW, 1014 &rack_timely_off, 0, 1015 "Do we not use timely in DGP?"); 1016 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1017 SYSCTL_CHILDREN(rack_pacing), 1018 OID_AUTO, "fillcw", CTLFLAG_RW, 1019 &rack_fill_cw_state, 0, 1020 "Enable fillcw on new connections (default=0 off)?"); 1021 SYSCTL_ADD_U16(&rack_sysctl_ctx, 1022 SYSCTL_CHILDREN(rack_pacing), 1023 OID_AUTO, "min_burst", CTLFLAG_RW, 1024 &rack_pacing_min_seg, 0, 1025 "What is the min burst size for pacing (0 disables)?"); 1026 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1027 SYSCTL_CHILDREN(rack_pacing), 1028 OID_AUTO, "divisor", CTLFLAG_RW, 1029 &rack_default_pacing_divisor, 250, 1030 "What is the default divisor given to the rl code?"); 1031 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1032 SYSCTL_CHILDREN(rack_pacing), 1033 OID_AUTO, "fillcw_max_mult", CTLFLAG_RW, 1034 &rack_bw_multipler, 0, 1035 "What is the limit multiplier of the current gp_est that fillcw can increase the b/w too, 200 == 200% (0 = off)?"); 1036 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1037 SYSCTL_CHILDREN(rack_pacing), 1038 OID_AUTO, "max_pace_over", CTLFLAG_RW, 1039 &rack_max_per_above, 30, 1040 "What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)"); 1041 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1042 SYSCTL_CHILDREN(rack_pacing), 1043 OID_AUTO, "allow1mss", CTLFLAG_RW, 1044 &rack_pace_one_seg, 0, 1045 "Do we allow low b/w pacing of 1MSS instead of two (1.2Meg and less)?"); 1046 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1047 SYSCTL_CHILDREN(rack_pacing), 1048 OID_AUTO, "limit_wsrtt", CTLFLAG_RW, 1049 &rack_limit_time_with_srtt, 0, 1050 "Do we limit pacing time based on srtt"); 1051 SYSCTL_ADD_U16(&rack_sysctl_ctx, 1052 SYSCTL_CHILDREN(rack_pacing), 1053 OID_AUTO, "gp_per_ss", CTLFLAG_RW, 1054 &rack_per_of_gp_ss, 250, 1055 "If non zero, what percentage of goodput to pace at in slow start"); 1056 SYSCTL_ADD_U16(&rack_sysctl_ctx, 1057 SYSCTL_CHILDREN(rack_pacing), 1058 OID_AUTO, "gp_per_ca", CTLFLAG_RW, 1059 &rack_per_of_gp_ca, 150, 1060 "If non zero, what percentage of goodput to pace at in congestion avoidance"); 1061 SYSCTL_ADD_U16(&rack_sysctl_ctx, 1062 SYSCTL_CHILDREN(rack_pacing), 1063 OID_AUTO, "gp_per_rec", CTLFLAG_RW, 1064 &rack_per_of_gp_rec, 200, 1065 "If non zero, what percentage of goodput to pace at in recovery"); 1066 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1067 SYSCTL_CHILDREN(rack_pacing), 1068 OID_AUTO, "pace_max_seg", CTLFLAG_RW, 1069 &rack_hptsi_segments, 40, 1070 "What size is the max for TSO segments in pacing and burst mitigation"); 1071 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1072 SYSCTL_CHILDREN(rack_pacing), 1073 OID_AUTO, "burst_reduces", CTLFLAG_RW, 1074 &rack_pacing_delay_reduction, 4, 1075 "When doing only burst mitigation what is the reduce divisor"); 1076 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1077 SYSCTL_CHILDREN(rack_sysctl_root), 1078 OID_AUTO, "use_pacing", CTLFLAG_RW, 1079 &rack_pace_every_seg, 0, 1080 "If set we use pacing, if clear we use only the original burst mitigation"); 1081 SYSCTL_ADD_U64(&rack_sysctl_ctx, 1082 SYSCTL_CHILDREN(rack_pacing), 1083 OID_AUTO, "rate_cap", CTLFLAG_RW, 1084 &rack_bw_rate_cap, 0, 1085 "If set we apply this value to the absolute rate cap used by pacing"); 1086 SYSCTL_ADD_U64(&rack_sysctl_ctx, 1087 SYSCTL_CHILDREN(rack_pacing), 1088 OID_AUTO, "fillcw_cap", CTLFLAG_RW, 1089 &rack_fillcw_bw_cap, 3750000, 1090 "Do we have an absolute cap on the amount of b/w fillcw can specify (0 = no)?"); 1091 SYSCTL_ADD_U8(&rack_sysctl_ctx, 1092 SYSCTL_CHILDREN(rack_sysctl_root), 1093 OID_AUTO, "req_measure_cnt", CTLFLAG_RW, 1094 &rack_req_measurements, 1, 1095 "If doing dynamic pacing, how many measurements must be in before we start pacing?"); 1096 /* Hardware pacing */ 1097 rack_hw_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1098 SYSCTL_CHILDREN(rack_sysctl_root), 1099 OID_AUTO, 1100 "hdwr_pacing", 1101 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1102 "Pacing related Controls"); 1103 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1104 SYSCTL_CHILDREN(rack_hw_pacing), 1105 OID_AUTO, "rwnd_factor", CTLFLAG_RW, 1106 &rack_hw_rwnd_factor, 2, 1107 "How many times does snd_wnd need to be bigger than pace_max_seg so we will hold off and get more acks?"); 1108 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1109 SYSCTL_CHILDREN(rack_hw_pacing), 1110 OID_AUTO, "precheck", CTLFLAG_RW, 1111 &rack_hw_check_queue, 0, 1112 "Do we always precheck the hdwr pacing queue to avoid ENOBUF's?"); 1113 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1114 SYSCTL_CHILDREN(rack_hw_pacing), 1115 OID_AUTO, "pace_enobuf_mult", CTLFLAG_RW, 1116 &rack_enobuf_hw_boost_mult, 0, 1117 "By how many time_betweens should we boost the pacing time if we see a ENOBUFS?"); 1118 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1119 SYSCTL_CHILDREN(rack_hw_pacing), 1120 OID_AUTO, "pace_enobuf_max", CTLFLAG_RW, 1121 &rack_enobuf_hw_max, 2, 1122 "What is the max boost the pacing time if we see a ENOBUFS?"); 1123 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1124 SYSCTL_CHILDREN(rack_hw_pacing), 1125 OID_AUTO, "pace_enobuf_min", CTLFLAG_RW, 1126 &rack_enobuf_hw_min, 2, 1127 "What is the min boost the pacing time if we see a ENOBUFS?"); 1128 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1129 SYSCTL_CHILDREN(rack_hw_pacing), 1130 OID_AUTO, "enable", CTLFLAG_RW, 1131 &rack_enable_hw_pacing, 0, 1132 "Should RACK attempt to use hw pacing?"); 1133 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1134 SYSCTL_CHILDREN(rack_hw_pacing), 1135 OID_AUTO, "rate_cap", CTLFLAG_RW, 1136 &rack_hw_rate_caps, 0, 1137 "Does the highest hardware pacing rate cap the rate we will send at??"); 1138 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1139 SYSCTL_CHILDREN(rack_hw_pacing), 1140 OID_AUTO, "uncap_per", CTLFLAG_RW, 1141 &rack_hw_rate_cap_per, 0, 1142 "If you go over b/w by this amount you will be uncapped (0 = never)"); 1143 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1144 SYSCTL_CHILDREN(rack_hw_pacing), 1145 OID_AUTO, "rate_min", CTLFLAG_RW, 1146 &rack_hw_rate_min, 0, 1147 "Do we need a minimum estimate of this many bytes per second in order to engage hw pacing?"); 1148 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1149 SYSCTL_CHILDREN(rack_hw_pacing), 1150 OID_AUTO, "rate_to_low", CTLFLAG_RW, 1151 &rack_hw_rate_to_low, 0, 1152 "If we fall below this rate, dis-engage hw pacing?"); 1153 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1154 SYSCTL_CHILDREN(rack_hw_pacing), 1155 OID_AUTO, "up_only", CTLFLAG_RW, 1156 &rack_hw_up_only, 0, 1157 "Do we allow hw pacing to lower the rate selected?"); 1158 rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1159 SYSCTL_CHILDREN(rack_sysctl_root), 1160 OID_AUTO, 1161 "timely", 1162 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1163 "Rack Timely RTT Controls"); 1164 /* Timely based GP dynmics */ 1165 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1166 SYSCTL_CHILDREN(rack_timely), 1167 OID_AUTO, "upper", CTLFLAG_RW, 1168 &rack_gp_per_bw_mul_up, 2, 1169 "Rack timely upper range for equal b/w (in percentage)"); 1170 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1171 SYSCTL_CHILDREN(rack_timely), 1172 OID_AUTO, "lower", CTLFLAG_RW, 1173 &rack_gp_per_bw_mul_down, 4, 1174 "Rack timely lower range for equal b/w (in percentage)"); 1175 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1176 SYSCTL_CHILDREN(rack_timely), 1177 OID_AUTO, "rtt_max_mul", CTLFLAG_RW, 1178 &rack_gp_rtt_maxmul, 3, 1179 "Rack timely multiplier of lowest rtt for rtt_max"); 1180 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1181 SYSCTL_CHILDREN(rack_timely), 1182 OID_AUTO, "rtt_min_div", CTLFLAG_RW, 1183 &rack_gp_rtt_mindiv, 4, 1184 "Rack timely divisor used for rtt + (rtt * mul/divisor) for check for lower rtt"); 1185 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1186 SYSCTL_CHILDREN(rack_timely), 1187 OID_AUTO, "rtt_min_mul", CTLFLAG_RW, 1188 &rack_gp_rtt_minmul, 1, 1189 "Rack timely multiplier used for rtt + (rtt * mul/divisor) for check for lower rtt"); 1190 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1191 SYSCTL_CHILDREN(rack_timely), 1192 OID_AUTO, "decrease", CTLFLAG_RW, 1193 &rack_gp_decrease_per, 80, 1194 "Rack timely Beta value 80 = .8 (scaled by 100)"); 1195 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1196 SYSCTL_CHILDREN(rack_timely), 1197 OID_AUTO, "increase", CTLFLAG_RW, 1198 &rack_gp_increase_per, 2, 1199 "Rack timely increase perentage of our GP multiplication factor"); 1200 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1201 SYSCTL_CHILDREN(rack_timely), 1202 OID_AUTO, "lowerbound", CTLFLAG_RW, 1203 &rack_per_lower_bound, 50, 1204 "Rack timely lowest percentage we allow GP multiplier to fall to"); 1205 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1206 SYSCTL_CHILDREN(rack_timely), 1207 OID_AUTO, "p5_upper", CTLFLAG_RW, 1208 &rack_gain_p5_ub, 250, 1209 "Profile 5 upper bound to timely gain"); 1210 1211 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1212 SYSCTL_CHILDREN(rack_timely), 1213 OID_AUTO, "upperboundss", CTLFLAG_RW, 1214 &rack_per_upper_bound_ss, 0, 1215 "Rack timely highest percentage we allow GP multiplier in SS to raise to (0 is no upperbound)"); 1216 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1217 SYSCTL_CHILDREN(rack_timely), 1218 OID_AUTO, "upperboundca", CTLFLAG_RW, 1219 &rack_per_upper_bound_ca, 0, 1220 "Rack timely highest percentage we allow GP multiplier to CA raise to (0 is no upperbound)"); 1221 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1222 SYSCTL_CHILDREN(rack_timely), 1223 OID_AUTO, "dynamicgp", CTLFLAG_RW, 1224 &rack_do_dyn_mul, 0, 1225 "Rack timely do we enable dynmaic timely goodput by default"); 1226 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1227 SYSCTL_CHILDREN(rack_timely), 1228 OID_AUTO, "no_rec_red", CTLFLAG_RW, 1229 &rack_gp_no_rec_chg, 1, 1230 "Rack timely do we prohibit the recovery multiplier from being lowered"); 1231 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1232 SYSCTL_CHILDREN(rack_timely), 1233 OID_AUTO, "red_clear_cnt", CTLFLAG_RW, 1234 &rack_timely_dec_clear, 6, 1235 "Rack timely what threshold do we count to before another boost during b/w decent"); 1236 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1237 SYSCTL_CHILDREN(rack_timely), 1238 OID_AUTO, "max_push_rise", CTLFLAG_RW, 1239 &rack_timely_max_push_rise, 3, 1240 "Rack timely how many times do we push up with b/w increase"); 1241 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1242 SYSCTL_CHILDREN(rack_timely), 1243 OID_AUTO, "max_push_drop", CTLFLAG_RW, 1244 &rack_timely_max_push_drop, 3, 1245 "Rack timely how many times do we push back on b/w decent"); 1246 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1247 SYSCTL_CHILDREN(rack_timely), 1248 OID_AUTO, "min_segs", CTLFLAG_RW, 1249 &rack_timely_min_segs, 4, 1250 "Rack timely when setting the cwnd what is the min num segments"); 1251 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1252 SYSCTL_CHILDREN(rack_timely), 1253 OID_AUTO, "nonstop", CTLFLAG_RW, 1254 &rack_timely_no_stopping, 0, 1255 "Rack timely don't stop increase"); 1256 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1257 SYSCTL_CHILDREN(rack_timely), 1258 OID_AUTO, "dec_raise_thresh", CTLFLAG_RW, 1259 &rack_down_raise_thresh, 100, 1260 "If the CA or SS is below this threshold raise on the first 3 b/w lowers (0=always)"); 1261 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1262 SYSCTL_CHILDREN(rack_timely), 1263 OID_AUTO, "bottom_drag_segs", CTLFLAG_RW, 1264 &rack_req_segs, 1, 1265 "Bottom dragging if not these many segments outstanding and room"); 1266 1267 /* TLP and Rack related parameters */ 1268 rack_tlp = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1269 SYSCTL_CHILDREN(rack_sysctl_root), 1270 OID_AUTO, 1271 "tlp", 1272 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1273 "TLP and Rack related Controls"); 1274 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1275 SYSCTL_CHILDREN(rack_tlp), 1276 OID_AUTO, "use_rrr", CTLFLAG_RW, 1277 &use_rack_rr, 1, 1278 "Do we use Rack Rapid Recovery"); 1279 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1280 SYSCTL_CHILDREN(rack_tlp), 1281 OID_AUTO, "post_rec_labc", CTLFLAG_RW, 1282 &rack_max_abc_post_recovery, 2, 1283 "Since we do early recovery, do we override the l_abc to a value, if so what?"); 1284 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1285 SYSCTL_CHILDREN(rack_tlp), 1286 OID_AUTO, "nonrxt_use_cr", CTLFLAG_RW, 1287 &rack_non_rxt_use_cr, 0, 1288 "Do we use ss/ca rate if in recovery we are transmitting a new data chunk"); 1289 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1290 SYSCTL_CHILDREN(rack_tlp), 1291 OID_AUTO, "tlpmethod", CTLFLAG_RW, 1292 &rack_tlp_threshold_use, TLP_USE_TWO_ONE, 1293 "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2"); 1294 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1295 SYSCTL_CHILDREN(rack_tlp), 1296 OID_AUTO, "limit", CTLFLAG_RW, 1297 &rack_tlp_limit, 2, 1298 "How many TLP's can be sent without sending new data"); 1299 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1300 SYSCTL_CHILDREN(rack_tlp), 1301 OID_AUTO, "use_greater", CTLFLAG_RW, 1302 &rack_tlp_use_greater, 1, 1303 "Should we use the rack_rtt time if its greater than srtt"); 1304 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1305 SYSCTL_CHILDREN(rack_tlp), 1306 OID_AUTO, "tlpminto", CTLFLAG_RW, 1307 &rack_tlp_min, 10000, 1308 "TLP minimum timeout per the specification (in microseconds)"); 1309 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1310 SYSCTL_CHILDREN(rack_tlp), 1311 OID_AUTO, "send_oldest", CTLFLAG_RW, 1312 &rack_always_send_oldest, 0, 1313 "Should we always send the oldest TLP and RACK-TLP"); 1314 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1315 SYSCTL_CHILDREN(rack_tlp), 1316 OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW, 1317 &rack_lower_cwnd_at_tlp, 0, 1318 "When a TLP completes a retran should we enter recovery"); 1319 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1320 SYSCTL_CHILDREN(rack_tlp), 1321 OID_AUTO, "reorder_thresh", CTLFLAG_RW, 1322 &rack_reorder_thresh, 2, 1323 "What factor for rack will be added when seeing reordering (shift right)"); 1324 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1325 SYSCTL_CHILDREN(rack_tlp), 1326 OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW, 1327 &rack_tlp_thresh, 1, 1328 "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)"); 1329 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1330 SYSCTL_CHILDREN(rack_tlp), 1331 OID_AUTO, "reorder_fade", CTLFLAG_RW, 1332 &rack_reorder_fade, 60000000, 1333 "Does reorder detection fade, if so how many microseconds (0 means never)"); 1334 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1335 SYSCTL_CHILDREN(rack_tlp), 1336 OID_AUTO, "pktdelay", CTLFLAG_RW, 1337 &rack_pkt_delay, 1000, 1338 "Extra RACK time (in microseconds) besides reordering thresh"); 1339 1340 /* Timer related controls */ 1341 rack_timers = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1342 SYSCTL_CHILDREN(rack_sysctl_root), 1343 OID_AUTO, 1344 "timers", 1345 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1346 "Timer related controls"); 1347 SYSCTL_ADD_U8(&rack_sysctl_ctx, 1348 SYSCTL_CHILDREN(rack_timers), 1349 OID_AUTO, "reset_ssth_rec_rto", CTLFLAG_RW, 1350 &rack_ssthresh_rest_rto_rec, 0, 1351 "When doing recovery -> rto -> recovery do we reset SSthresh?"); 1352 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1353 SYSCTL_CHILDREN(rack_timers), 1354 OID_AUTO, "scoreboard_thresh", CTLFLAG_RW, 1355 &rack_rxt_scoreboard_clear_thresh, 2, 1356 "How many RTO's are allowed before we clear the scoreboard"); 1357 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1358 SYSCTL_CHILDREN(rack_timers), 1359 OID_AUTO, "honor_hpts_min", CTLFLAG_RW, 1360 &rack_honors_hpts_min_to, 1, 1361 "Do rack pacing timers honor hpts min timeout"); 1362 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1363 SYSCTL_CHILDREN(rack_timers), 1364 OID_AUTO, "hpts_max_reduce", CTLFLAG_RW, 1365 &rack_max_reduce, 10, 1366 "Max percentage we will reduce pacing delay by for pacing when we are behind"); 1367 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1368 SYSCTL_CHILDREN(rack_timers), 1369 OID_AUTO, "persmin", CTLFLAG_RW, 1370 &rack_persist_min, 250000, 1371 "What is the minimum time in microseconds between persists"); 1372 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1373 SYSCTL_CHILDREN(rack_timers), 1374 OID_AUTO, "persmax", CTLFLAG_RW, 1375 &rack_persist_max, 2000000, 1376 "What is the largest delay in microseconds between persists"); 1377 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1378 SYSCTL_CHILDREN(rack_timers), 1379 OID_AUTO, "delayed_ack", CTLFLAG_RW, 1380 &rack_delayed_ack_time, 40000, 1381 "Delayed ack time (40ms in microseconds)"); 1382 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1383 SYSCTL_CHILDREN(rack_timers), 1384 OID_AUTO, "minrto", CTLFLAG_RW, 1385 &rack_rto_min, 30000, 1386 "Minimum RTO in microseconds -- set with caution below 1000 due to TLP"); 1387 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1388 SYSCTL_CHILDREN(rack_timers), 1389 OID_AUTO, "maxrto", CTLFLAG_RW, 1390 &rack_rto_max, 4000000, 1391 "Maximum RTO in microseconds -- should be at least as large as min_rto"); 1392 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1393 SYSCTL_CHILDREN(rack_timers), 1394 OID_AUTO, "minto", CTLFLAG_RW, 1395 &rack_min_to, 1000, 1396 "Minimum rack timeout in microseconds"); 1397 /* Measure controls */ 1398 rack_measure = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1399 SYSCTL_CHILDREN(rack_sysctl_root), 1400 OID_AUTO, 1401 "measure", 1402 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1403 "Measure related controls"); 1404 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1405 SYSCTL_CHILDREN(rack_measure), 1406 OID_AUTO, "wma_divisor", CTLFLAG_RW, 1407 &rack_wma_divisor, 8, 1408 "When doing b/w calculation what is the divisor for the WMA"); 1409 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1410 SYSCTL_CHILDREN(rack_measure), 1411 OID_AUTO, "end_cwnd", CTLFLAG_RW, 1412 &rack_cwnd_block_ends_measure, 0, 1413 "Does a cwnd just-return end the measurement window (app limited)"); 1414 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1415 SYSCTL_CHILDREN(rack_measure), 1416 OID_AUTO, "end_rwnd", CTLFLAG_RW, 1417 &rack_rwnd_block_ends_measure, 0, 1418 "Does an rwnd just-return end the measurement window (app limited -- not persists)"); 1419 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1420 SYSCTL_CHILDREN(rack_measure), 1421 OID_AUTO, "min_target", CTLFLAG_RW, 1422 &rack_def_data_window, 20, 1423 "What is the minimum target window (in mss) for a GP measurements"); 1424 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1425 SYSCTL_CHILDREN(rack_measure), 1426 OID_AUTO, "goal_bdp", CTLFLAG_RW, 1427 &rack_goal_bdp, 2, 1428 "What is the goal BDP to measure"); 1429 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1430 SYSCTL_CHILDREN(rack_measure), 1431 OID_AUTO, "min_srtts", CTLFLAG_RW, 1432 &rack_min_srtts, 1, 1433 "What is the goal BDP to measure"); 1434 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1435 SYSCTL_CHILDREN(rack_measure), 1436 OID_AUTO, "min_measure_tim", CTLFLAG_RW, 1437 &rack_min_measure_usec, 0, 1438 "What is the Minimum time time for a measurement if 0, this is off"); 1439 /* Features */ 1440 rack_features = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1441 SYSCTL_CHILDREN(rack_sysctl_root), 1442 OID_AUTO, 1443 "features", 1444 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1445 "Feature controls"); 1446 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1447 SYSCTL_CHILDREN(rack_features), 1448 OID_AUTO, "hybrid_set_maxseg", CTLFLAG_RW, 1449 &rack_hybrid_allow_set_maxseg, 0, 1450 "Should hybrid pacing allow the setmss command"); 1451 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1452 SYSCTL_CHILDREN(rack_features), 1453 OID_AUTO, "cmpack", CTLFLAG_RW, 1454 &rack_use_cmp_acks, 1, 1455 "Should RACK have LRO send compressed acks"); 1456 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1457 SYSCTL_CHILDREN(rack_features), 1458 OID_AUTO, "fsb", CTLFLAG_RW, 1459 &rack_use_fsb, 1, 1460 "Should RACK use the fast send block?"); 1461 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1462 SYSCTL_CHILDREN(rack_features), 1463 OID_AUTO, "rfo", CTLFLAG_RW, 1464 &rack_use_rfo, 1, 1465 "Should RACK use rack_fast_output()?"); 1466 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1467 SYSCTL_CHILDREN(rack_features), 1468 OID_AUTO, "rsmrfo", CTLFLAG_RW, 1469 &rack_use_rsm_rfo, 1, 1470 "Should RACK use rack_fast_rsm_output()?"); 1471 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1472 SYSCTL_CHILDREN(rack_features), 1473 OID_AUTO, "non_paced_lro_queue", CTLFLAG_RW, 1474 &rack_enable_mqueue_for_nonpaced, 0, 1475 "Should RACK use mbuf queuing for non-paced connections"); 1476 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1477 SYSCTL_CHILDREN(rack_features), 1478 OID_AUTO, "hystartplusplus", CTLFLAG_RW, 1479 &rack_do_hystart, 0, 1480 "Should RACK enable HyStart++ on connections?"); 1481 /* Misc rack controls */ 1482 rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1483 SYSCTL_CHILDREN(rack_sysctl_root), 1484 OID_AUTO, 1485 "misc", 1486 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1487 "Misc related controls"); 1488 #ifdef TCP_ACCOUNTING 1489 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1490 SYSCTL_CHILDREN(rack_misc), 1491 OID_AUTO, "tcp_acct", CTLFLAG_RW, 1492 &rack_tcp_accounting, 0, 1493 "Should we turn on TCP accounting for all rack sessions?"); 1494 #endif 1495 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1496 SYSCTL_CHILDREN(rack_misc), 1497 OID_AUTO, "dnd", CTLFLAG_RW, 1498 &rack_dnd_default, 0, 1499 "Do not disturb default for rack_rrr = 3"); 1500 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1501 SYSCTL_CHILDREN(rack_misc), 1502 OID_AUTO, "rxt_controls", CTLFLAG_RW, 1503 &rack_rxt_controls, 0, 1504 "Retransmit sending size controls (valid values 0, 1, 2 default=1)?"); 1505 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1506 SYSCTL_CHILDREN(rack_misc), 1507 OID_AUTO, "rack_hibeta", CTLFLAG_RW, 1508 &rack_hibeta_setting, 0, 1509 "Do we ue a high beta (80 instead of 50)?"); 1510 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1511 SYSCTL_CHILDREN(rack_misc), 1512 OID_AUTO, "apply_rtt_with_low_conf", CTLFLAG_RW, 1513 &rack_apply_rtt_with_reduced_conf, 0, 1514 "When a persist or keep-alive probe is not answered do we calculate rtt on subsequent answers?"); 1515 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1516 SYSCTL_CHILDREN(rack_misc), 1517 OID_AUTO, "rack_dsack_ctl", CTLFLAG_RW, 1518 &rack_dsack_std_based, 3, 1519 "How do we process dsack with respect to rack timers, bit field, 3 is standards based?"); 1520 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1521 SYSCTL_CHILDREN(rack_misc), 1522 OID_AUTO, "prr_addback_max", CTLFLAG_RW, 1523 &rack_prr_addbackmax, 2, 1524 "What is the maximum number of MSS we allow to be added back if prr can't send all its data?"); 1525 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1526 SYSCTL_CHILDREN(rack_misc), 1527 OID_AUTO, "stats_gets_ms", CTLFLAG_RW, 1528 &rack_stats_gets_ms_rtt, 1, 1529 "What do we feed the stats framework (1 = ms_rtt, 0 = us_rtt, 2 = ms_rtt from hdwr, > 2 usec rtt from hdwr)?"); 1530 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1531 SYSCTL_CHILDREN(rack_misc), 1532 OID_AUTO, "clientlowbuf", CTLFLAG_RW, 1533 &rack_client_low_buf, 0, 1534 "Client low buffer level (below this we are more aggressive in DGP exiting recovery (0 = off)?"); 1535 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1536 SYSCTL_CHILDREN(rack_misc), 1537 OID_AUTO, "defprofile", CTLFLAG_RW, 1538 &rack_def_profile, 0, 1539 "Should RACK use a default profile (0=no, num == profile num)?"); 1540 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1541 SYSCTL_CHILDREN(rack_misc), 1542 OID_AUTO, "shared_cwnd", CTLFLAG_RW, 1543 &rack_enable_shared_cwnd, 1, 1544 "Should RACK try to use the shared cwnd on connections where allowed"); 1545 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1546 SYSCTL_CHILDREN(rack_misc), 1547 OID_AUTO, "limits_on_scwnd", CTLFLAG_RW, 1548 &rack_limits_scwnd, 1, 1549 "Should RACK place low end time limits on the shared cwnd feature"); 1550 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1551 SYSCTL_CHILDREN(rack_misc), 1552 OID_AUTO, "no_prr", CTLFLAG_RW, 1553 &rack_disable_prr, 0, 1554 "Should RACK not use prr and only pace (must have pacing on)"); 1555 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1556 SYSCTL_CHILDREN(rack_misc), 1557 OID_AUTO, "bb_verbose", CTLFLAG_RW, 1558 &rack_verbose_logging, 0, 1559 "Should RACK black box logging be verbose"); 1560 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1561 SYSCTL_CHILDREN(rack_misc), 1562 OID_AUTO, "data_after_close", CTLFLAG_RW, 1563 &rack_ignore_data_after_close, 1, 1564 "Do we hold off sending a RST until all pending data is ack'd"); 1565 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1566 SYSCTL_CHILDREN(rack_misc), 1567 OID_AUTO, "no_sack_needed", CTLFLAG_RW, 1568 &rack_sack_not_required, 1, 1569 "Do we allow rack to run on connections not supporting SACK"); 1570 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1571 SYSCTL_CHILDREN(rack_misc), 1572 OID_AUTO, "prr_sendalot", CTLFLAG_RW, 1573 &rack_send_a_lot_in_prr, 1, 1574 "Send a lot in prr"); 1575 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1576 SYSCTL_CHILDREN(rack_misc), 1577 OID_AUTO, "autoscale", CTLFLAG_RW, 1578 &rack_autosndbuf_inc, 20, 1579 "What percentage should rack scale up its snd buffer by?"); 1580 1581 /* Counters */ 1582 rack_total_bytes = counter_u64_alloc(M_WAITOK); 1583 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1584 SYSCTL_CHILDREN(rack_counters), 1585 OID_AUTO, "totalbytes", CTLFLAG_RD, 1586 &rack_total_bytes, 1587 "Total number of bytes sent"); 1588 rack_fto_send = counter_u64_alloc(M_WAITOK); 1589 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1590 SYSCTL_CHILDREN(rack_counters), 1591 OID_AUTO, "fto_send", CTLFLAG_RD, 1592 &rack_fto_send, "Total number of rack_fast_output sends"); 1593 rack_fto_rsm_send = counter_u64_alloc(M_WAITOK); 1594 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1595 SYSCTL_CHILDREN(rack_counters), 1596 OID_AUTO, "fto_rsm_send", CTLFLAG_RD, 1597 &rack_fto_rsm_send, "Total number of rack_fast_rsm_output sends"); 1598 rack_nfto_resend = counter_u64_alloc(M_WAITOK); 1599 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1600 SYSCTL_CHILDREN(rack_counters), 1601 OID_AUTO, "nfto_resend", CTLFLAG_RD, 1602 &rack_nfto_resend, "Total number of rack_output retransmissions"); 1603 rack_non_fto_send = counter_u64_alloc(M_WAITOK); 1604 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1605 SYSCTL_CHILDREN(rack_counters), 1606 OID_AUTO, "nfto_send", CTLFLAG_RD, 1607 &rack_non_fto_send, "Total number of rack_output first sends"); 1608 rack_extended_rfo = counter_u64_alloc(M_WAITOK); 1609 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1610 SYSCTL_CHILDREN(rack_counters), 1611 OID_AUTO, "rfo_extended", CTLFLAG_RD, 1612 &rack_extended_rfo, "Total number of times we extended rfo"); 1613 1614 rack_hw_pace_init_fail = counter_u64_alloc(M_WAITOK); 1615 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1616 SYSCTL_CHILDREN(rack_counters), 1617 OID_AUTO, "hwpace_init_fail", CTLFLAG_RD, 1618 &rack_hw_pace_init_fail, "Total number of times we failed to initialize hw pacing"); 1619 rack_hw_pace_lost = counter_u64_alloc(M_WAITOK); 1620 1621 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1622 SYSCTL_CHILDREN(rack_counters), 1623 OID_AUTO, "hwpace_lost", CTLFLAG_RD, 1624 &rack_hw_pace_lost, "Total number of times we failed to initialize hw pacing"); 1625 rack_tlp_tot = counter_u64_alloc(M_WAITOK); 1626 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1627 SYSCTL_CHILDREN(rack_counters), 1628 OID_AUTO, "tlp_to_total", CTLFLAG_RD, 1629 &rack_tlp_tot, 1630 "Total number of tail loss probe expirations"); 1631 rack_tlp_newdata = counter_u64_alloc(M_WAITOK); 1632 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1633 SYSCTL_CHILDREN(rack_counters), 1634 OID_AUTO, "tlp_new", CTLFLAG_RD, 1635 &rack_tlp_newdata, 1636 "Total number of tail loss probe sending new data"); 1637 rack_tlp_retran = counter_u64_alloc(M_WAITOK); 1638 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1639 SYSCTL_CHILDREN(rack_counters), 1640 OID_AUTO, "tlp_retran", CTLFLAG_RD, 1641 &rack_tlp_retran, 1642 "Total number of tail loss probe sending retransmitted data"); 1643 rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK); 1644 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1645 SYSCTL_CHILDREN(rack_counters), 1646 OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD, 1647 &rack_tlp_retran_bytes, 1648 "Total bytes of tail loss probe sending retransmitted data"); 1649 rack_to_tot = counter_u64_alloc(M_WAITOK); 1650 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1651 SYSCTL_CHILDREN(rack_counters), 1652 OID_AUTO, "rack_to_tot", CTLFLAG_RD, 1653 &rack_to_tot, 1654 "Total number of times the rack to expired"); 1655 rack_saw_enobuf = counter_u64_alloc(M_WAITOK); 1656 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1657 SYSCTL_CHILDREN(rack_counters), 1658 OID_AUTO, "saw_enobufs", CTLFLAG_RD, 1659 &rack_saw_enobuf, 1660 "Total number of times a sends returned enobuf for non-hdwr paced connections"); 1661 rack_saw_enobuf_hw = counter_u64_alloc(M_WAITOK); 1662 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1663 SYSCTL_CHILDREN(rack_counters), 1664 OID_AUTO, "saw_enobufs_hw", CTLFLAG_RD, 1665 &rack_saw_enobuf_hw, 1666 "Total number of times a send returned enobuf for hdwr paced connections"); 1667 rack_saw_enetunreach = counter_u64_alloc(M_WAITOK); 1668 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1669 SYSCTL_CHILDREN(rack_counters), 1670 OID_AUTO, "saw_enetunreach", CTLFLAG_RD, 1671 &rack_saw_enetunreach, 1672 "Total number of times a send received a enetunreachable"); 1673 rack_hot_alloc = counter_u64_alloc(M_WAITOK); 1674 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1675 SYSCTL_CHILDREN(rack_counters), 1676 OID_AUTO, "alloc_hot", CTLFLAG_RD, 1677 &rack_hot_alloc, 1678 "Total allocations from the top of our list"); 1679 rack_to_alloc = counter_u64_alloc(M_WAITOK); 1680 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1681 SYSCTL_CHILDREN(rack_counters), 1682 OID_AUTO, "allocs", CTLFLAG_RD, 1683 &rack_to_alloc, 1684 "Total allocations of tracking structures"); 1685 rack_to_alloc_hard = counter_u64_alloc(M_WAITOK); 1686 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1687 SYSCTL_CHILDREN(rack_counters), 1688 OID_AUTO, "allochard", CTLFLAG_RD, 1689 &rack_to_alloc_hard, 1690 "Total allocations done with sleeping the hard way"); 1691 rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK); 1692 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1693 SYSCTL_CHILDREN(rack_counters), 1694 OID_AUTO, "allocemerg", CTLFLAG_RD, 1695 &rack_to_alloc_emerg, 1696 "Total allocations done from emergency cache"); 1697 rack_to_alloc_limited = counter_u64_alloc(M_WAITOK); 1698 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1699 SYSCTL_CHILDREN(rack_counters), 1700 OID_AUTO, "alloc_limited", CTLFLAG_RD, 1701 &rack_to_alloc_limited, 1702 "Total allocations dropped due to limit"); 1703 rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK); 1704 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1705 SYSCTL_CHILDREN(rack_counters), 1706 OID_AUTO, "alloc_limited_conns", CTLFLAG_RD, 1707 &rack_alloc_limited_conns, 1708 "Connections with allocations dropped due to limit"); 1709 rack_split_limited = counter_u64_alloc(M_WAITOK); 1710 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1711 SYSCTL_CHILDREN(rack_counters), 1712 OID_AUTO, "split_limited", CTLFLAG_RD, 1713 &rack_split_limited, 1714 "Split allocations dropped due to limit"); 1715 rack_rxt_clamps_cwnd = counter_u64_alloc(M_WAITOK); 1716 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1717 SYSCTL_CHILDREN(rack_counters), 1718 OID_AUTO, "rxt_clamps_cwnd", CTLFLAG_RD, 1719 &rack_rxt_clamps_cwnd, 1720 "Number of times that excessive rxt clamped the cwnd down"); 1721 rack_rxt_clamps_cwnd_uniq = counter_u64_alloc(M_WAITOK); 1722 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1723 SYSCTL_CHILDREN(rack_counters), 1724 OID_AUTO, "rxt_clamps_cwnd_uniq", CTLFLAG_RD, 1725 &rack_rxt_clamps_cwnd_uniq, 1726 "Number of connections that have had excessive rxt clamped the cwnd down"); 1727 rack_persists_sends = counter_u64_alloc(M_WAITOK); 1728 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1729 SYSCTL_CHILDREN(rack_counters), 1730 OID_AUTO, "persist_sends", CTLFLAG_RD, 1731 &rack_persists_sends, 1732 "Number of times we sent a persist probe"); 1733 rack_persists_acks = counter_u64_alloc(M_WAITOK); 1734 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1735 SYSCTL_CHILDREN(rack_counters), 1736 OID_AUTO, "persist_acks", CTLFLAG_RD, 1737 &rack_persists_acks, 1738 "Number of times a persist probe was acked"); 1739 rack_persists_loss = counter_u64_alloc(M_WAITOK); 1740 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1741 SYSCTL_CHILDREN(rack_counters), 1742 OID_AUTO, "persist_loss", CTLFLAG_RD, 1743 &rack_persists_loss, 1744 "Number of times we detected a lost persist probe (no ack)"); 1745 rack_persists_lost_ends = counter_u64_alloc(M_WAITOK); 1746 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1747 SYSCTL_CHILDREN(rack_counters), 1748 OID_AUTO, "persist_loss_ends", CTLFLAG_RD, 1749 &rack_persists_lost_ends, 1750 "Number of lost persist probe (no ack) that the run ended with a PERSIST abort"); 1751 #ifdef INVARIANTS 1752 rack_adjust_map_bw = counter_u64_alloc(M_WAITOK); 1753 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1754 SYSCTL_CHILDREN(rack_counters), 1755 OID_AUTO, "map_adjust_req", CTLFLAG_RD, 1756 &rack_adjust_map_bw, 1757 "Number of times we hit the case where the sb went up and down on a sendmap entry"); 1758 #endif 1759 rack_multi_single_eq = counter_u64_alloc(M_WAITOK); 1760 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1761 SYSCTL_CHILDREN(rack_counters), 1762 OID_AUTO, "cmp_ack_equiv", CTLFLAG_RD, 1763 &rack_multi_single_eq, 1764 "Number of compressed acks total represented"); 1765 rack_proc_non_comp_ack = counter_u64_alloc(M_WAITOK); 1766 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1767 SYSCTL_CHILDREN(rack_counters), 1768 OID_AUTO, "cmp_ack_not", CTLFLAG_RD, 1769 &rack_proc_non_comp_ack, 1770 "Number of non compresseds acks that we processed"); 1771 1772 1773 rack_sack_proc_all = counter_u64_alloc(M_WAITOK); 1774 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1775 SYSCTL_CHILDREN(rack_counters), 1776 OID_AUTO, "sack_long", CTLFLAG_RD, 1777 &rack_sack_proc_all, 1778 "Total times we had to walk whole list for sack processing"); 1779 rack_sack_proc_restart = counter_u64_alloc(M_WAITOK); 1780 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1781 SYSCTL_CHILDREN(rack_counters), 1782 OID_AUTO, "sack_restart", CTLFLAG_RD, 1783 &rack_sack_proc_restart, 1784 "Total times we had to walk whole list due to a restart"); 1785 rack_sack_proc_short = counter_u64_alloc(M_WAITOK); 1786 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1787 SYSCTL_CHILDREN(rack_counters), 1788 OID_AUTO, "sack_short", CTLFLAG_RD, 1789 &rack_sack_proc_short, 1790 "Total times we took shortcut for sack processing"); 1791 rack_input_idle_reduces = counter_u64_alloc(M_WAITOK); 1792 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1793 SYSCTL_CHILDREN(rack_counters), 1794 OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD, 1795 &rack_input_idle_reduces, 1796 "Total number of idle reductions on input"); 1797 rack_collapsed_win_seen = counter_u64_alloc(M_WAITOK); 1798 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1799 SYSCTL_CHILDREN(rack_counters), 1800 OID_AUTO, "collapsed_win_seen", CTLFLAG_RD, 1801 &rack_collapsed_win_seen, 1802 "Total number of collapsed window events seen (where our window shrinks)"); 1803 1804 rack_collapsed_win = counter_u64_alloc(M_WAITOK); 1805 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1806 SYSCTL_CHILDREN(rack_counters), 1807 OID_AUTO, "collapsed_win", CTLFLAG_RD, 1808 &rack_collapsed_win, 1809 "Total number of collapsed window events where we mark packets"); 1810 rack_collapsed_win_rxt = counter_u64_alloc(M_WAITOK); 1811 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1812 SYSCTL_CHILDREN(rack_counters), 1813 OID_AUTO, "collapsed_win_rxt", CTLFLAG_RD, 1814 &rack_collapsed_win_rxt, 1815 "Total number of packets that were retransmitted"); 1816 rack_collapsed_win_rxt_bytes = counter_u64_alloc(M_WAITOK); 1817 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1818 SYSCTL_CHILDREN(rack_counters), 1819 OID_AUTO, "collapsed_win_bytes", CTLFLAG_RD, 1820 &rack_collapsed_win_rxt_bytes, 1821 "Total number of bytes that were retransmitted"); 1822 rack_try_scwnd = counter_u64_alloc(M_WAITOK); 1823 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1824 SYSCTL_CHILDREN(rack_counters), 1825 OID_AUTO, "tried_scwnd", CTLFLAG_RD, 1826 &rack_try_scwnd, 1827 "Total number of scwnd attempts"); 1828 COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); 1829 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 1830 OID_AUTO, "outsize", CTLFLAG_RD, 1831 rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes"); 1832 COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK); 1833 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 1834 OID_AUTO, "opts", CTLFLAG_RD, 1835 rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats"); 1836 SYSCTL_ADD_PROC(&rack_sysctl_ctx, 1837 SYSCTL_CHILDREN(rack_sysctl_root), 1838 OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 1839 &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters"); 1840 } 1841 1842 static uint32_t 1843 rc_init_window(struct tcp_rack *rack) 1844 { 1845 return (tcp_compute_initwnd(tcp_maxseg(rack->rc_tp))); 1846 1847 } 1848 1849 static uint64_t 1850 rack_get_fixed_pacing_bw(struct tcp_rack *rack) 1851 { 1852 if (IN_FASTRECOVERY(rack->rc_tp->t_flags)) 1853 return (rack->r_ctl.rc_fixed_pacing_rate_rec); 1854 else if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) 1855 return (rack->r_ctl.rc_fixed_pacing_rate_ss); 1856 else 1857 return (rack->r_ctl.rc_fixed_pacing_rate_ca); 1858 } 1859 1860 static void 1861 rack_log_hybrid_bw(struct tcp_rack *rack, uint32_t seq, uint64_t cbw, uint64_t tim, 1862 uint64_t data, uint8_t mod, uint16_t aux, 1863 struct tcp_sendfile_track *cur, int line) 1864 { 1865 #ifdef TCP_REQUEST_TRK 1866 int do_log = 0; 1867 1868 /* 1869 * The rate cap one is noisy and only should come out when normal BB logging 1870 * is enabled, the other logs (not RATE_CAP and NOT CAP_CALC) only come out 1871 * once per chunk and make up the BBpoint that can be turned on by the client. 1872 */ 1873 if ((mod == HYBRID_LOG_RATE_CAP) || (mod == HYBRID_LOG_CAP_CALC)) { 1874 /* 1875 * The very noisy two need to only come out when 1876 * we have verbose logging on. 1877 */ 1878 if (rack_verbose_logging != 0) 1879 do_log = tcp_bblogging_on(rack->rc_tp); 1880 else 1881 do_log = 0; 1882 } else if (mod != HYBRID_LOG_BW_MEASURE) { 1883 /* 1884 * All other less noisy logs here except the measure which 1885 * also needs to come out on the point and the log. 1886 */ 1887 do_log = tcp_bblogging_on(rack->rc_tp); 1888 } else { 1889 do_log = tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING); 1890 } 1891 1892 if (do_log) { 1893 union tcp_log_stackspecific log; 1894 struct timeval tv; 1895 uint64_t lt_bw; 1896 1897 /* Convert our ms to a microsecond */ 1898 memset(&log, 0, sizeof(log)); 1899 1900 log.u_bbr.cwnd_gain = line; 1901 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1902 log.u_bbr.rttProp = tim; 1903 log.u_bbr.bw_inuse = cbw; 1904 log.u_bbr.delRate = rack_get_gp_est(rack); 1905 lt_bw = rack_get_lt_bw(rack); 1906 log.u_bbr.flex1 = seq; 1907 log.u_bbr.pacing_gain = aux; 1908 /* lt_bw = < flex3 | flex2 > */ 1909 log.u_bbr.flex2 = (uint32_t)(lt_bw & 0x00000000ffffffff); 1910 log.u_bbr.flex3 = (uint32_t)((lt_bw >> 32) & 0x00000000ffffffff); 1911 /* Record the last obtained us rtt in inflight */ 1912 if (cur == NULL) { 1913 /* Make sure we are looking at the right log if an overide comes in */ 1914 cur = rack->r_ctl.rc_last_sft; 1915 } 1916 if (rack->r_ctl.rack_rs.rs_flags != RACK_RTT_EMPTY) 1917 log.u_bbr.inflight = rack->r_ctl.rack_rs.rs_us_rtt; 1918 else { 1919 /* Use the last known rtt i.e. the rack-rtt */ 1920 log.u_bbr.inflight = rack->rc_rack_rtt; 1921 } 1922 if (cur != NULL) { 1923 uint64_t off; 1924 1925 log.u_bbr.cur_del_rate = cur->deadline; 1926 if ((mod == HYBRID_LOG_RATE_CAP) || (mod == HYBRID_LOG_CAP_CALC)) { 1927 /* start = < lost | pkt_epoch > */ 1928 log.u_bbr.pkt_epoch = (uint32_t)(cur->start & 0x00000000ffffffff); 1929 log.u_bbr.lost = (uint32_t)((cur->start >> 32) & 0x00000000ffffffff); 1930 log.u_bbr.flex6 = cur->start_seq; 1931 log.u_bbr.pkts_out = cur->end_seq; 1932 } else { 1933 /* start = < lost | pkt_epoch > */ 1934 log.u_bbr.pkt_epoch = (uint32_t)(cur->start & 0x00000000ffffffff); 1935 log.u_bbr.lost = (uint32_t)((cur->start >> 32) & 0x00000000ffffffff); 1936 /* end = < pkts_out | flex6 > */ 1937 log.u_bbr.flex6 = (uint32_t)(cur->end & 0x00000000ffffffff); 1938 log.u_bbr.pkts_out = (uint32_t)((cur->end >> 32) & 0x00000000ffffffff); 1939 } 1940 /* first_send = <lt_epoch | epoch> */ 1941 log.u_bbr.epoch = (uint32_t)(cur->first_send & 0x00000000ffffffff); 1942 log.u_bbr.lt_epoch = (uint32_t)((cur->first_send >> 32) & 0x00000000ffffffff); 1943 /* localtime = <delivered | applimited>*/ 1944 log.u_bbr.applimited = (uint32_t)(cur->localtime & 0x00000000ffffffff); 1945 log.u_bbr.delivered = (uint32_t)((cur->localtime >> 32) & 0x00000000ffffffff); 1946 #ifdef TCP_REQUEST_TRK 1947 off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_tcpreq_info[0]); 1948 log.u_bbr.bbr_substate = (uint8_t)(off / sizeof(struct tcp_sendfile_track)); 1949 #endif 1950 log.u_bbr.inhpts = 1; 1951 log.u_bbr.flex4 = (uint32_t)(rack->rc_tp->t_sndbytes - cur->sent_at_fs); 1952 log.u_bbr.flex5 = (uint32_t)(rack->rc_tp->t_snd_rxt_bytes - cur->rxt_at_fs); 1953 log.u_bbr.flex7 = (uint16_t)cur->hybrid_flags; 1954 } else { 1955 log.u_bbr.flex7 = 0xffff; 1956 log.u_bbr.cur_del_rate = 0xffffffffffffffff; 1957 } 1958 /* 1959 * Compose bbr_state to be a bit wise 0000ADHF 1960 * where A is the always_pace flag 1961 * where D is the dgp_on flag 1962 * where H is the hybrid_mode on flag 1963 * where F is the use_fixed_rate flag. 1964 */ 1965 log.u_bbr.bbr_state = rack->rc_always_pace; 1966 log.u_bbr.bbr_state <<= 1; 1967 log.u_bbr.bbr_state |= rack->dgp_on; 1968 log.u_bbr.bbr_state <<= 1; 1969 log.u_bbr.bbr_state |= rack->rc_hybrid_mode; 1970 log.u_bbr.bbr_state <<= 1; 1971 log.u_bbr.bbr_state |= rack->use_fixed_rate; 1972 log.u_bbr.flex8 = mod; 1973 tcp_log_event(rack->rc_tp, NULL, 1974 &rack->rc_inp->inp_socket->so_rcv, 1975 &rack->rc_inp->inp_socket->so_snd, 1976 TCP_HYBRID_PACING_LOG, 0, 1977 0, &log, false, NULL, __func__, __LINE__, &tv); 1978 1979 } 1980 #endif 1981 } 1982 1983 #ifdef TCP_REQUEST_TRK 1984 static void 1985 rack_log_hybrid_sends(struct tcp_rack *rack, struct tcp_sendfile_track *cur, int line) 1986 { 1987 if (tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING)) { 1988 union tcp_log_stackspecific log; 1989 struct timeval tv; 1990 uint64_t off; 1991 1992 /* Convert our ms to a microsecond */ 1993 memset(&log, 0, sizeof(log)); 1994 1995 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1996 log.u_bbr.delRate = cur->sent_at_fs; 1997 1998 if ((cur->flags & TCP_TRK_TRACK_FLG_LSND) == 0) { 1999 /* 2000 * We did not get a new Rules Applied to set so 2001 * no overlapping send occured, this means the 2002 * current byte counts are correct. 2003 */ 2004 log.u_bbr.cur_del_rate = rack->rc_tp->t_sndbytes; 2005 log.u_bbr.rttProp = rack->rc_tp->t_snd_rxt_bytes; 2006 } else { 2007 /* 2008 * Overlapping send case, we switched to a new 2009 * send and did a rules applied. 2010 */ 2011 log.u_bbr.cur_del_rate = cur->sent_at_ls; 2012 log.u_bbr.rttProp = cur->rxt_at_ls; 2013 } 2014 log.u_bbr.bw_inuse = cur->rxt_at_fs; 2015 log.u_bbr.cwnd_gain = line; 2016 off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_tcpreq_info[0]); 2017 log.u_bbr.bbr_substate = (uint8_t)(off / sizeof(struct tcp_sendfile_track)); 2018 /* start = < flex1 | flex2 > */ 2019 log.u_bbr.flex2 = (uint32_t)(cur->start & 0x00000000ffffffff); 2020 log.u_bbr.flex1 = (uint32_t)((cur->start >> 32) & 0x00000000ffffffff); 2021 /* end = < flex3 | flex4 > */ 2022 log.u_bbr.flex4 = (uint32_t)(cur->end & 0x00000000ffffffff); 2023 log.u_bbr.flex3 = (uint32_t)((cur->end >> 32) & 0x00000000ffffffff); 2024 2025 /* localtime = <delivered | applimited>*/ 2026 log.u_bbr.applimited = (uint32_t)(cur->localtime & 0x00000000ffffffff); 2027 log.u_bbr.delivered = (uint32_t)((cur->localtime >> 32) & 0x00000000ffffffff); 2028 /* client timestamp = <lt_epoch | epoch>*/ 2029 log.u_bbr.epoch = (uint32_t)(cur->timestamp & 0x00000000ffffffff); 2030 log.u_bbr.lt_epoch = (uint32_t)((cur->timestamp >> 32) & 0x00000000ffffffff); 2031 /* now set all the flags in */ 2032 log.u_bbr.pkts_out = cur->hybrid_flags; 2033 log.u_bbr.lost = cur->playout_ms; 2034 log.u_bbr.flex6 = cur->flags; 2035 /* 2036 * Last send time = <flex5 | pkt_epoch> note we do not distinguish cases 2037 * where a false retransmit occurred so first_send <-> lastsend may 2038 * include longer time then it actually took if we have a false rxt. 2039 */ 2040 log.u_bbr.pkt_epoch = (uint32_t)(rack->r_ctl.last_tmit_time_acked & 0x00000000ffffffff); 2041 log.u_bbr.flex5 = (uint32_t)((rack->r_ctl.last_tmit_time_acked >> 32) & 0x00000000ffffffff); 2042 /* 2043 * Compose bbr_state to be a bit wise 0000ADHF 2044 * where A is the always_pace flag 2045 * where D is the dgp_on flag 2046 * where H is the hybrid_mode on flag 2047 * where F is the use_fixed_rate flag. 2048 */ 2049 log.u_bbr.bbr_state = rack->rc_always_pace; 2050 log.u_bbr.bbr_state <<= 1; 2051 log.u_bbr.bbr_state |= rack->dgp_on; 2052 log.u_bbr.bbr_state <<= 1; 2053 log.u_bbr.bbr_state |= rack->rc_hybrid_mode; 2054 log.u_bbr.bbr_state <<= 1; 2055 log.u_bbr.bbr_state |= rack->use_fixed_rate; 2056 2057 log.u_bbr.flex8 = HYBRID_LOG_SENT_LOST; 2058 tcp_log_event(rack->rc_tp, NULL, 2059 &rack->rc_inp->inp_socket->so_rcv, 2060 &rack->rc_inp->inp_socket->so_snd, 2061 TCP_HYBRID_PACING_LOG, 0, 2062 0, &log, false, NULL, __func__, __LINE__, &tv); 2063 } 2064 } 2065 #endif 2066 2067 static inline uint64_t 2068 rack_compensate_for_linerate(struct tcp_rack *rack, uint64_t bw) 2069 { 2070 uint64_t ret_bw, ether; 2071 uint64_t u_segsiz; 2072 2073 ether = rack->rc_tp->t_maxseg + sizeof(struct tcphdr); 2074 if (rack->r_is_v6){ 2075 #ifdef INET6 2076 ether += sizeof(struct ip6_hdr); 2077 #endif 2078 ether += 14; /* eheader size 6+6+2 */ 2079 } else { 2080 #ifdef INET 2081 ether += sizeof(struct ip); 2082 #endif 2083 ether += 14; /* eheader size 6+6+2 */ 2084 } 2085 u_segsiz = (uint64_t)min(ctf_fixed_maxseg(rack->rc_tp), rack->r_ctl.rc_pace_min_segs); 2086 ret_bw = bw; 2087 ret_bw *= ether; 2088 ret_bw /= u_segsiz; 2089 return (ret_bw); 2090 } 2091 2092 static void 2093 rack_rate_cap_bw(struct tcp_rack *rack, uint64_t *bw, int *capped) 2094 { 2095 #ifdef TCP_REQUEST_TRK 2096 struct timeval tv; 2097 uint64_t timenow, timeleft, lenleft, lengone, calcbw; 2098 #endif 2099 2100 if (rack->r_ctl.bw_rate_cap == 0) 2101 return; 2102 #ifdef TCP_REQUEST_TRK 2103 if (rack->rc_catch_up && rack->rc_hybrid_mode && 2104 (rack->r_ctl.rc_last_sft != NULL)) { 2105 /* 2106 * We have a dynamic cap. The original target 2107 * is in bw_rate_cap, but we need to look at 2108 * how long it is until we hit the deadline. 2109 */ 2110 struct tcp_sendfile_track *ent; 2111 2112 ent = rack->r_ctl.rc_last_sft; 2113 microuptime(&tv); 2114 timenow = tcp_tv_to_lusec(&tv); 2115 if (timenow >= ent->deadline) { 2116 /* No time left we do DGP only */ 2117 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2118 0, 0, 0, HYBRID_LOG_OUTOFTIME, 0, ent, __LINE__); 2119 rack->r_ctl.bw_rate_cap = 0; 2120 return; 2121 } 2122 /* We have the time */ 2123 timeleft = rack->r_ctl.rc_last_sft->deadline - timenow; 2124 if (timeleft < HPTS_MSEC_IN_SEC) { 2125 /* If there is less than a ms left just use DGPs rate */ 2126 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2127 0, timeleft, 0, HYBRID_LOG_OUTOFTIME, 0, ent, __LINE__); 2128 rack->r_ctl.bw_rate_cap = 0; 2129 return; 2130 } 2131 /* 2132 * Now lets find the amount of data left to send. 2133 * 2134 * Now ideally we want to use the end_seq to figure out how much more 2135 * but it might not be possible (only if we have the TRACK_FG_COMP on the entry.. 2136 */ 2137 if (ent->flags & TCP_TRK_TRACK_FLG_COMP) { 2138 if (SEQ_GT(ent->end_seq, rack->rc_tp->snd_una)) 2139 lenleft = ent->end_seq - rack->rc_tp->snd_una; 2140 else { 2141 /* TSNH, we should catch it at the send */ 2142 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2143 0, timeleft, 0, HYBRID_LOG_CAPERROR, 0, ent, __LINE__); 2144 rack->r_ctl.bw_rate_cap = 0; 2145 return; 2146 } 2147 } else { 2148 /* 2149 * The hard way, figure out how much is gone and then 2150 * take that away from the total the client asked for 2151 * (thats off by tls overhead if this is tls). 2152 */ 2153 if (SEQ_GT(rack->rc_tp->snd_una, ent->start_seq)) 2154 lengone = rack->rc_tp->snd_una - ent->start_seq; 2155 else 2156 lengone = 0; 2157 if (lengone < (ent->end - ent->start)) 2158 lenleft = (ent->end - ent->start) - lengone; 2159 else { 2160 /* TSNH, we should catch it at the send */ 2161 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2162 0, timeleft, lengone, HYBRID_LOG_CAPERROR, 0, ent, __LINE__); 2163 rack->r_ctl.bw_rate_cap = 0; 2164 return; 2165 } 2166 } 2167 if (lenleft == 0) { 2168 /* We have it all sent */ 2169 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2170 0, timeleft, lenleft, HYBRID_LOG_ALLSENT, 0, ent, __LINE__); 2171 if (rack->r_ctl.bw_rate_cap) 2172 goto normal_ratecap; 2173 else 2174 return; 2175 } 2176 calcbw = lenleft * HPTS_USEC_IN_SEC; 2177 calcbw /= timeleft; 2178 /* Now we must compensate for IP/TCP overhead */ 2179 calcbw = rack_compensate_for_linerate(rack, calcbw); 2180 /* Update the bit rate cap */ 2181 rack->r_ctl.bw_rate_cap = calcbw; 2182 if ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_S_MSS) && 2183 (rack_hybrid_allow_set_maxseg == 1) && 2184 ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_SETMSS) == 0)) { 2185 /* Lets set in a smaller mss possibly here to match our rate-cap */ 2186 uint32_t orig_max; 2187 2188 orig_max = rack->r_ctl.rc_pace_max_segs; 2189 rack->r_ctl.rc_last_sft->hybrid_flags |= TCP_HYBRID_PACING_SETMSS; 2190 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, calcbw, ctf_fixed_maxseg(rack->rc_tp)); 2191 rack_log_type_pacing_sizes(rack->rc_tp, rack, rack->r_ctl.client_suggested_maxseg, orig_max, __LINE__, 5); 2192 } 2193 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2194 calcbw, timeleft, lenleft, HYBRID_LOG_CAP_CALC, 0, ent, __LINE__); 2195 if ((calcbw > 0) && (*bw > calcbw)) { 2196 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2197 *bw, ent->deadline, lenleft, HYBRID_LOG_RATE_CAP, 0, ent, __LINE__); 2198 *capped = 1; 2199 *bw = calcbw; 2200 } 2201 return; 2202 } 2203 normal_ratecap: 2204 #endif 2205 if ((rack->r_ctl.bw_rate_cap > 0) && (*bw > rack->r_ctl.bw_rate_cap)) { 2206 #ifdef TCP_REQUEST_TRK 2207 if (rack->rc_hybrid_mode && 2208 rack->rc_catch_up && 2209 (rack->r_ctl.rc_last_sft != NULL) && 2210 (rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_S_MSS) && 2211 (rack_hybrid_allow_set_maxseg == 1) && 2212 ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_SETMSS) == 0)) { 2213 /* Lets set in a smaller mss possibly here to match our rate-cap */ 2214 uint32_t orig_max; 2215 2216 orig_max = rack->r_ctl.rc_pace_max_segs; 2217 rack->r_ctl.rc_last_sft->hybrid_flags |= TCP_HYBRID_PACING_SETMSS; 2218 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, rack->r_ctl.bw_rate_cap, ctf_fixed_maxseg(rack->rc_tp)); 2219 rack_log_type_pacing_sizes(rack->rc_tp, rack, rack->r_ctl.client_suggested_maxseg, orig_max, __LINE__, 5); 2220 } 2221 #endif 2222 *capped = 1; 2223 *bw = rack->r_ctl.bw_rate_cap; 2224 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2225 *bw, 0, 0, 2226 HYBRID_LOG_RATE_CAP, 1, NULL, __LINE__); 2227 } 2228 } 2229 2230 static uint64_t 2231 rack_get_gp_est(struct tcp_rack *rack) 2232 { 2233 uint64_t bw, lt_bw, ret_bw; 2234 2235 if (rack->rc_gp_filled == 0) { 2236 /* 2237 * We have yet no b/w measurement, 2238 * if we have a user set initial bw 2239 * return it. If we don't have that and 2240 * we have an srtt, use the tcp IW (10) to 2241 * calculate a fictional b/w over the SRTT 2242 * which is more or less a guess. Note 2243 * we don't use our IW from rack on purpose 2244 * so if we have like IW=30, we are not 2245 * calculating a "huge" b/w. 2246 */ 2247 uint64_t srtt; 2248 2249 if (rack->dis_lt_bw == 1) 2250 lt_bw = 0; 2251 else 2252 lt_bw = rack_get_lt_bw(rack); 2253 if (lt_bw) { 2254 /* 2255 * No goodput bw but a long-term b/w does exist 2256 * lets use that. 2257 */ 2258 ret_bw = lt_bw; 2259 goto compensate; 2260 } 2261 if (rack->r_ctl.init_rate) 2262 return (rack->r_ctl.init_rate); 2263 2264 /* Ok lets come up with the IW guess, if we have a srtt */ 2265 if (rack->rc_tp->t_srtt == 0) { 2266 /* 2267 * Go with old pacing method 2268 * i.e. burst mitigation only. 2269 */ 2270 return (0); 2271 } 2272 /* Ok lets get the initial TCP win (not racks) */ 2273 bw = tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)); 2274 srtt = (uint64_t)rack->rc_tp->t_srtt; 2275 bw *= (uint64_t)USECS_IN_SECOND; 2276 bw /= srtt; 2277 ret_bw = bw; 2278 goto compensate; 2279 2280 } 2281 if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) { 2282 /* Averaging is done, we can return the value */ 2283 bw = rack->r_ctl.gp_bw; 2284 } else { 2285 /* Still doing initial average must calculate */ 2286 bw = rack->r_ctl.gp_bw / max(rack->r_ctl.num_measurements, 1); 2287 } 2288 if (rack->dis_lt_bw) { 2289 /* We are not using lt-bw */ 2290 ret_bw = bw; 2291 goto compensate; 2292 } 2293 lt_bw = rack_get_lt_bw(rack); 2294 if (lt_bw == 0) { 2295 /* If we don't have one then equate it to the gp_bw */ 2296 lt_bw = rack->r_ctl.gp_bw; 2297 } 2298 if (rack->use_lesser_lt_bw) { 2299 if (lt_bw < bw) 2300 ret_bw = lt_bw; 2301 else 2302 ret_bw = bw; 2303 } else { 2304 if (lt_bw > bw) 2305 ret_bw = lt_bw; 2306 else 2307 ret_bw = bw; 2308 } 2309 /* 2310 * Now lets compensate based on the TCP/IP overhead. Our 2311 * Goodput estimate does not include this so we must pace out 2312 * a bit faster since our pacing calculations do. The pacing 2313 * calculations use the base ETHERNET_SEGMENT_SIZE and the segsiz 2314 * we are using to do this, so we do that here in the opposite 2315 * direction as well. This means that if we are tunneled and the 2316 * segsiz is say 1200 bytes we will get quite a boost, but its 2317 * compensated for in the pacing time the opposite way. 2318 */ 2319 compensate: 2320 ret_bw = rack_compensate_for_linerate(rack, ret_bw); 2321 return(ret_bw); 2322 } 2323 2324 2325 static uint64_t 2326 rack_get_bw(struct tcp_rack *rack) 2327 { 2328 uint64_t bw; 2329 2330 if (rack->use_fixed_rate) { 2331 /* Return the fixed pacing rate */ 2332 return (rack_get_fixed_pacing_bw(rack)); 2333 } 2334 bw = rack_get_gp_est(rack); 2335 return (bw); 2336 } 2337 2338 static uint16_t 2339 rack_get_output_gain(struct tcp_rack *rack, struct rack_sendmap *rsm) 2340 { 2341 if (rack->use_fixed_rate) { 2342 return (100); 2343 } else if (rack->in_probe_rtt && (rsm == NULL)) 2344 return (rack->r_ctl.rack_per_of_gp_probertt); 2345 else if ((IN_FASTRECOVERY(rack->rc_tp->t_flags) && 2346 rack->r_ctl.rack_per_of_gp_rec)) { 2347 if (rsm) { 2348 /* a retransmission always use the recovery rate */ 2349 return (rack->r_ctl.rack_per_of_gp_rec); 2350 } else if (rack->rack_rec_nonrxt_use_cr) { 2351 /* Directed to use the configured rate */ 2352 goto configured_rate; 2353 } else if (rack->rack_no_prr && 2354 (rack->r_ctl.rack_per_of_gp_rec > 100)) { 2355 /* No PRR, lets just use the b/w estimate only */ 2356 return (100); 2357 } else { 2358 /* 2359 * Here we may have a non-retransmit but we 2360 * have no overrides, so just use the recovery 2361 * rate (prr is in effect). 2362 */ 2363 return (rack->r_ctl.rack_per_of_gp_rec); 2364 } 2365 } 2366 configured_rate: 2367 /* For the configured rate we look at our cwnd vs the ssthresh */ 2368 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) 2369 return (rack->r_ctl.rack_per_of_gp_ss); 2370 else 2371 return (rack->r_ctl.rack_per_of_gp_ca); 2372 } 2373 2374 static void 2375 rack_log_dsack_event(struct tcp_rack *rack, uint8_t mod, uint32_t flex4, uint32_t flex5, uint32_t flex6) 2376 { 2377 /* 2378 * Types of logs (mod value) 2379 * 1 = dsack_persists reduced by 1 via T-O or fast recovery exit. 2380 * 2 = a dsack round begins, persist is reset to 16. 2381 * 3 = a dsack round ends 2382 * 4 = Dsack option increases rack rtt flex5 is the srtt input, flex6 is thresh 2383 * 5 = Socket option set changing the control flags rc_rack_tmr_std_based, rc_rack_use_dsack 2384 * 6 = Final rack rtt, flex4 is srtt and flex6 is final limited thresh. 2385 */ 2386 if (tcp_bblogging_on(rack->rc_tp)) { 2387 union tcp_log_stackspecific log; 2388 struct timeval tv; 2389 2390 memset(&log, 0, sizeof(log)); 2391 log.u_bbr.flex1 = rack->rc_rack_tmr_std_based; 2392 log.u_bbr.flex1 <<= 1; 2393 log.u_bbr.flex1 |= rack->rc_rack_use_dsack; 2394 log.u_bbr.flex1 <<= 1; 2395 log.u_bbr.flex1 |= rack->rc_dsack_round_seen; 2396 log.u_bbr.flex2 = rack->r_ctl.dsack_round_end; 2397 log.u_bbr.flex3 = rack->r_ctl.num_dsack; 2398 log.u_bbr.flex4 = flex4; 2399 log.u_bbr.flex5 = flex5; 2400 log.u_bbr.flex6 = flex6; 2401 log.u_bbr.flex7 = rack->r_ctl.dsack_persist; 2402 log.u_bbr.flex8 = mod; 2403 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2404 log.u_bbr.epoch = rack->r_ctl.current_round; 2405 log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost; 2406 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2407 &rack->rc_inp->inp_socket->so_rcv, 2408 &rack->rc_inp->inp_socket->so_snd, 2409 RACK_DSACK_HANDLING, 0, 2410 0, &log, false, &tv); 2411 } 2412 } 2413 2414 static void 2415 rack_log_hdwr_pacing(struct tcp_rack *rack, 2416 uint64_t rate, uint64_t hw_rate, int line, 2417 int error, uint16_t mod) 2418 { 2419 if (tcp_bblogging_on(rack->rc_tp)) { 2420 union tcp_log_stackspecific log; 2421 struct timeval tv; 2422 const struct ifnet *ifp; 2423 uint64_t ifp64; 2424 2425 memset(&log, 0, sizeof(log)); 2426 log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff); 2427 log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff); 2428 if (rack->r_ctl.crte) { 2429 ifp = rack->r_ctl.crte->ptbl->rs_ifp; 2430 } else if (rack->rc_inp->inp_route.ro_nh && 2431 rack->rc_inp->inp_route.ro_nh->nh_ifp) { 2432 ifp = rack->rc_inp->inp_route.ro_nh->nh_ifp; 2433 } else 2434 ifp = NULL; 2435 if (ifp) { 2436 ifp64 = (uintptr_t)ifp; 2437 log.u_bbr.flex3 = ((ifp64 >> 32) & 0x00000000ffffffff); 2438 log.u_bbr.flex4 = (ifp64 & 0x00000000ffffffff); 2439 } 2440 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2441 log.u_bbr.bw_inuse = rate; 2442 log.u_bbr.flex5 = line; 2443 log.u_bbr.flex6 = error; 2444 log.u_bbr.flex7 = mod; 2445 log.u_bbr.applimited = rack->r_ctl.rc_pace_max_segs; 2446 log.u_bbr.flex8 = rack->use_fixed_rate; 2447 log.u_bbr.flex8 <<= 1; 2448 log.u_bbr.flex8 |= rack->rack_hdrw_pacing; 2449 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; 2450 log.u_bbr.delRate = rack->r_ctl.crte_prev_rate; 2451 if (rack->r_ctl.crte) 2452 log.u_bbr.cur_del_rate = rack->r_ctl.crte->rate; 2453 else 2454 log.u_bbr.cur_del_rate = 0; 2455 log.u_bbr.rttProp = rack->r_ctl.last_hw_bw_req; 2456 log.u_bbr.epoch = rack->r_ctl.current_round; 2457 log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost; 2458 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2459 &rack->rc_inp->inp_socket->so_rcv, 2460 &rack->rc_inp->inp_socket->so_snd, 2461 BBR_LOG_HDWR_PACE, 0, 2462 0, &log, false, &tv); 2463 } 2464 } 2465 2466 static uint64_t 2467 rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm, int *capped) 2468 { 2469 /* 2470 * We allow rack_per_of_gp_xx to dictate our bw rate we want. 2471 */ 2472 uint64_t bw_est, high_rate; 2473 uint64_t gain; 2474 2475 gain = (uint64_t)rack_get_output_gain(rack, rsm); 2476 bw_est = bw * gain; 2477 bw_est /= (uint64_t)100; 2478 /* Never fall below the minimum (def 64kbps) */ 2479 if (bw_est < RACK_MIN_BW) 2480 bw_est = RACK_MIN_BW; 2481 if (rack->r_rack_hw_rate_caps) { 2482 /* Rate caps are in place */ 2483 if (rack->r_ctl.crte != NULL) { 2484 /* We have a hdwr rate already */ 2485 high_rate = tcp_hw_highest_rate(rack->r_ctl.crte); 2486 if (bw_est >= high_rate) { 2487 /* We are capping bw at the highest rate table entry */ 2488 if (rack_hw_rate_cap_per && 2489 (((high_rate * (100 + rack_hw_rate_cap_per)) / 100) < bw_est)) { 2490 rack->r_rack_hw_rate_caps = 0; 2491 goto done; 2492 } 2493 rack_log_hdwr_pacing(rack, 2494 bw_est, high_rate, __LINE__, 2495 0, 3); 2496 bw_est = high_rate; 2497 if (capped) 2498 *capped = 1; 2499 } 2500 } else if ((rack->rack_hdrw_pacing == 0) && 2501 (rack->rack_hdw_pace_ena) && 2502 (rack->rack_attempt_hdwr_pace == 0) && 2503 (rack->rc_inp->inp_route.ro_nh != NULL) && 2504 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 2505 /* 2506 * Special case, we have not yet attempted hardware 2507 * pacing, and yet we may, when we do, find out if we are 2508 * above the highest rate. We need to know the maxbw for the interface 2509 * in question (if it supports ratelimiting). We get back 2510 * a 0, if the interface is not found in the RL lists. 2511 */ 2512 high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp); 2513 if (high_rate) { 2514 /* Yep, we have a rate is it above this rate? */ 2515 if (bw_est > high_rate) { 2516 bw_est = high_rate; 2517 if (capped) 2518 *capped = 1; 2519 } 2520 } 2521 } 2522 } 2523 done: 2524 return (bw_est); 2525 } 2526 2527 static void 2528 rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod) 2529 { 2530 if (tcp_bblogging_on(rack->rc_tp)) { 2531 union tcp_log_stackspecific log; 2532 struct timeval tv; 2533 2534 if ((mod != 1) && (rack_verbose_logging == 0)) { 2535 /* 2536 * We get 3 values currently for mod 2537 * 1 - We are retransmitting and this tells the reason. 2538 * 2 - We are clearing a dup-ack count. 2539 * 3 - We are incrementing a dup-ack count. 2540 * 2541 * The clear/increment are only logged 2542 * if you have BBverbose on. 2543 */ 2544 return; 2545 } 2546 memset(&log, 0, sizeof(log)); 2547 log.u_bbr.flex1 = tsused; 2548 log.u_bbr.flex2 = thresh; 2549 log.u_bbr.flex3 = rsm->r_flags; 2550 log.u_bbr.flex4 = rsm->r_dupack; 2551 log.u_bbr.flex5 = rsm->r_start; 2552 log.u_bbr.flex6 = rsm->r_end; 2553 log.u_bbr.flex8 = mod; 2554 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2555 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2556 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2557 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2558 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2559 log.u_bbr.pacing_gain = rack->r_must_retran; 2560 log.u_bbr.epoch = rack->r_ctl.current_round; 2561 log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost; 2562 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2563 &rack->rc_inp->inp_socket->so_rcv, 2564 &rack->rc_inp->inp_socket->so_snd, 2565 BBR_LOG_SETTINGS_CHG, 0, 2566 0, &log, false, &tv); 2567 } 2568 } 2569 2570 static void 2571 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t pacing_delay, uint8_t which) 2572 { 2573 if (tcp_bblogging_on(rack->rc_tp)) { 2574 union tcp_log_stackspecific log; 2575 struct timeval tv; 2576 2577 memset(&log, 0, sizeof(log)); 2578 log.u_bbr.flex1 = rack->rc_tp->t_srtt; 2579 log.u_bbr.flex2 = to; 2580 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; 2581 log.u_bbr.flex4 = pacing_delay; 2582 log.u_bbr.flex5 = rack->rc_tp->t_hpts_slot; 2583 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 2584 log.u_bbr.flex7 = rack->rc_in_persist; 2585 log.u_bbr.flex8 = which; 2586 if (rack->rack_no_prr) 2587 log.u_bbr.pkts_out = 0; 2588 else 2589 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 2590 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2591 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2592 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2593 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2594 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2595 log.u_bbr.pacing_gain = rack->r_must_retran; 2596 log.u_bbr.cwnd_gain = rack->rack_deferred_inited; 2597 log.u_bbr.pkt_epoch = rack->rc_has_collapsed; 2598 log.u_bbr.lt_epoch = rack->rc_tp->t_rxtshift; 2599 log.u_bbr.lost = rack_rto_min; 2600 log.u_bbr.epoch = rack->r_ctl.roundends; 2601 log.u_bbr.bw_inuse = rack->r_ctl.current_round; 2602 log.u_bbr.bw_inuse <<= 32; 2603 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; 2604 log.u_bbr.applimited = rack->rc_tp->t_flags2; 2605 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2606 &rack->rc_inp->inp_socket->so_rcv, 2607 &rack->rc_inp->inp_socket->so_snd, 2608 BBR_LOG_TIMERSTAR, 0, 2609 0, &log, false, &tv); 2610 } 2611 } 2612 2613 static void 2614 rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm) 2615 { 2616 if (tcp_bblogging_on(rack->rc_tp)) { 2617 union tcp_log_stackspecific log; 2618 struct timeval tv; 2619 2620 memset(&log, 0, sizeof(log)); 2621 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2622 log.u_bbr.flex8 = to_num; 2623 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt; 2624 log.u_bbr.flex2 = rack->rc_rack_rtt; 2625 if (rsm == NULL) 2626 log.u_bbr.flex3 = 0; 2627 else 2628 log.u_bbr.flex3 = rsm->r_end - rsm->r_start; 2629 if (rack->rack_no_prr) 2630 log.u_bbr.flex5 = 0; 2631 else 2632 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 2633 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2634 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2635 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2636 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2637 log.u_bbr.pacing_gain = rack->r_must_retran; 2638 log.u_bbr.bw_inuse = rack->r_ctl.current_round; 2639 log.u_bbr.bw_inuse <<= 32; 2640 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; 2641 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2642 &rack->rc_inp->inp_socket->so_rcv, 2643 &rack->rc_inp->inp_socket->so_snd, 2644 BBR_LOG_RTO, 0, 2645 0, &log, false, &tv); 2646 } 2647 } 2648 2649 static void 2650 rack_log_map_chg(struct tcpcb *tp, struct tcp_rack *rack, 2651 struct rack_sendmap *prev, 2652 struct rack_sendmap *rsm, 2653 struct rack_sendmap *next, 2654 int flag, uint32_t th_ack, int line) 2655 { 2656 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 2657 union tcp_log_stackspecific log; 2658 struct timeval tv; 2659 2660 memset(&log, 0, sizeof(log)); 2661 log.u_bbr.flex8 = flag; 2662 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2663 log.u_bbr.cur_del_rate = (uintptr_t)prev; 2664 log.u_bbr.delRate = (uintptr_t)rsm; 2665 log.u_bbr.rttProp = (uintptr_t)next; 2666 log.u_bbr.flex7 = 0; 2667 if (prev) { 2668 log.u_bbr.flex1 = prev->r_start; 2669 log.u_bbr.flex2 = prev->r_end; 2670 log.u_bbr.flex7 |= 0x4; 2671 } 2672 if (rsm) { 2673 log.u_bbr.flex3 = rsm->r_start; 2674 log.u_bbr.flex4 = rsm->r_end; 2675 log.u_bbr.flex7 |= 0x2; 2676 } 2677 if (next) { 2678 log.u_bbr.flex5 = next->r_start; 2679 log.u_bbr.flex6 = next->r_end; 2680 log.u_bbr.flex7 |= 0x1; 2681 } 2682 log.u_bbr.applimited = line; 2683 log.u_bbr.pkts_out = th_ack; 2684 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2685 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2686 if (rack->rack_no_prr) 2687 log.u_bbr.lost = 0; 2688 else 2689 log.u_bbr.lost = rack->r_ctl.rc_prr_sndcnt; 2690 log.u_bbr.bw_inuse = rack->r_ctl.current_round; 2691 log.u_bbr.bw_inuse <<= 32; 2692 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; 2693 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2694 &rack->rc_inp->inp_socket->so_rcv, 2695 &rack->rc_inp->inp_socket->so_snd, 2696 TCP_LOG_MAPCHG, 0, 2697 0, &log, false, &tv); 2698 } 2699 } 2700 2701 static void 2702 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t len, 2703 struct rack_sendmap *rsm, int conf) 2704 { 2705 if (tcp_bblogging_on(tp)) { 2706 union tcp_log_stackspecific log; 2707 struct timeval tv; 2708 memset(&log, 0, sizeof(log)); 2709 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2710 log.u_bbr.flex1 = t; 2711 log.u_bbr.flex2 = len; 2712 log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt; 2713 log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest; 2714 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest; 2715 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_us_rtrcnt; 2716 log.u_bbr.flex7 = conf; 2717 log.u_bbr.rttProp = (uint64_t)rack->r_ctl.rack_rs.rs_rtt_tot; 2718 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method; 2719 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2720 log.u_bbr.delivered = rack->r_ctl.rack_rs.rs_us_rtrcnt; 2721 log.u_bbr.pkts_out = rack->r_ctl.rack_rs.rs_flags; 2722 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2723 if (rsm) { 2724 log.u_bbr.pkt_epoch = rsm->r_start; 2725 log.u_bbr.lost = rsm->r_end; 2726 log.u_bbr.cwnd_gain = rsm->r_rtr_cnt; 2727 /* We loose any upper of the 24 bits */ 2728 log.u_bbr.pacing_gain = (uint16_t)rsm->r_flags; 2729 } else { 2730 /* Its a SYN */ 2731 log.u_bbr.pkt_epoch = rack->rc_tp->iss; 2732 log.u_bbr.lost = 0; 2733 log.u_bbr.cwnd_gain = 0; 2734 log.u_bbr.pacing_gain = 0; 2735 } 2736 /* Write out general bits of interest rrs here */ 2737 log.u_bbr.use_lt_bw = rack->rc_highly_buffered; 2738 log.u_bbr.use_lt_bw <<= 1; 2739 log.u_bbr.use_lt_bw |= rack->forced_ack; 2740 log.u_bbr.use_lt_bw <<= 1; 2741 log.u_bbr.use_lt_bw |= rack->rc_gp_dyn_mul; 2742 log.u_bbr.use_lt_bw <<= 1; 2743 log.u_bbr.use_lt_bw |= rack->in_probe_rtt; 2744 log.u_bbr.use_lt_bw <<= 1; 2745 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; 2746 log.u_bbr.use_lt_bw <<= 1; 2747 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set; 2748 log.u_bbr.use_lt_bw <<= 1; 2749 log.u_bbr.use_lt_bw |= rack->rc_gp_filled; 2750 log.u_bbr.use_lt_bw <<= 1; 2751 log.u_bbr.use_lt_bw |= rack->rc_dragged_bottom; 2752 log.u_bbr.applimited = rack->r_ctl.rc_target_probertt_flight; 2753 log.u_bbr.epoch = rack->r_ctl.rc_time_probertt_starts; 2754 log.u_bbr.lt_epoch = rack->r_ctl.rc_time_probertt_entered; 2755 log.u_bbr.cur_del_rate = rack->r_ctl.rc_lower_rtt_us_cts; 2756 log.u_bbr.delRate = rack->r_ctl.rc_gp_srtt; 2757 log.u_bbr.bw_inuse = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time); 2758 log.u_bbr.bw_inuse <<= 32; 2759 if (rsm) 2760 log.u_bbr.bw_inuse |= ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]); 2761 TCP_LOG_EVENTP(tp, NULL, 2762 &rack->rc_inp->inp_socket->so_rcv, 2763 &rack->rc_inp->inp_socket->so_snd, 2764 BBR_LOG_BBRRTT, 0, 2765 0, &log, false, &tv); 2766 2767 2768 } 2769 } 2770 2771 static void 2772 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt) 2773 { 2774 /* 2775 * Log the rtt sample we are 2776 * applying to the srtt algorithm in 2777 * useconds. 2778 */ 2779 if (tcp_bblogging_on(rack->rc_tp)) { 2780 union tcp_log_stackspecific log; 2781 struct timeval tv; 2782 2783 /* Convert our ms to a microsecond */ 2784 memset(&log, 0, sizeof(log)); 2785 log.u_bbr.flex1 = rtt; 2786 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 2787 log.u_bbr.flex7 = 1; 2788 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2789 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2790 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2791 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2792 log.u_bbr.pacing_gain = rack->r_must_retran; 2793 /* 2794 * We capture in delRate the upper 32 bits as 2795 * the confidence level we had declared, and the 2796 * lower 32 bits as the actual RTT using the arrival 2797 * timestamp. 2798 */ 2799 log.u_bbr.delRate = rack->r_ctl.rack_rs.confidence; 2800 log.u_bbr.delRate <<= 32; 2801 log.u_bbr.delRate |= rack->r_ctl.rack_rs.rs_us_rtt; 2802 /* Lets capture all the things that make up t_rtxcur */ 2803 log.u_bbr.applimited = rack_rto_min; 2804 log.u_bbr.epoch = rack_rto_max; 2805 log.u_bbr.lt_epoch = rack->r_ctl.timer_slop; 2806 log.u_bbr.lost = rack_rto_min; 2807 log.u_bbr.pkt_epoch = TICKS_2_USEC(tcp_rexmit_slop); 2808 log.u_bbr.rttProp = RACK_REXMTVAL(rack->rc_tp); 2809 log.u_bbr.bw_inuse = rack->r_ctl.act_rcv_time.tv_sec; 2810 log.u_bbr.bw_inuse *= HPTS_USEC_IN_SEC; 2811 log.u_bbr.bw_inuse += rack->r_ctl.act_rcv_time.tv_usec; 2812 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2813 &rack->rc_inp->inp_socket->so_rcv, 2814 &rack->rc_inp->inp_socket->so_snd, 2815 TCP_LOG_RTT, 0, 2816 0, &log, false, &tv); 2817 } 2818 } 2819 2820 static void 2821 rack_log_rtt_sample_calc(struct tcp_rack *rack, uint32_t rtt, uint32_t send_time, uint32_t ack_time, int where) 2822 { 2823 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 2824 union tcp_log_stackspecific log; 2825 struct timeval tv; 2826 2827 /* Convert our ms to a microsecond */ 2828 memset(&log, 0, sizeof(log)); 2829 log.u_bbr.flex1 = rtt; 2830 log.u_bbr.flex2 = send_time; 2831 log.u_bbr.flex3 = ack_time; 2832 log.u_bbr.flex4 = where; 2833 log.u_bbr.flex7 = 2; 2834 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2835 log.u_bbr.bw_inuse = rack->r_ctl.current_round; 2836 log.u_bbr.bw_inuse <<= 32; 2837 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; 2838 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2839 &rack->rc_inp->inp_socket->so_rcv, 2840 &rack->rc_inp->inp_socket->so_snd, 2841 TCP_LOG_RTT, 0, 2842 0, &log, false, &tv); 2843 } 2844 } 2845 2846 2847 static void 2848 rack_log_rtt_sendmap(struct tcp_rack *rack, uint32_t idx, uint64_t tsv, uint32_t tsecho) 2849 { 2850 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 2851 union tcp_log_stackspecific log; 2852 struct timeval tv; 2853 2854 /* Convert our ms to a microsecond */ 2855 memset(&log, 0, sizeof(log)); 2856 log.u_bbr.flex1 = idx; 2857 log.u_bbr.flex2 = rack_ts_to_msec(tsv); 2858 log.u_bbr.flex3 = tsecho; 2859 log.u_bbr.flex7 = 3; 2860 log.u_bbr.rttProp = tsv; 2861 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2862 log.u_bbr.bw_inuse = rack->r_ctl.current_round; 2863 log.u_bbr.bw_inuse <<= 32; 2864 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; 2865 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2866 &rack->rc_inp->inp_socket->so_rcv, 2867 &rack->rc_inp->inp_socket->so_snd, 2868 TCP_LOG_RTT, 0, 2869 0, &log, false, &tv); 2870 } 2871 } 2872 2873 2874 static inline void 2875 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line) 2876 { 2877 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 2878 union tcp_log_stackspecific log; 2879 struct timeval tv; 2880 2881 memset(&log, 0, sizeof(log)); 2882 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2883 log.u_bbr.flex1 = line; 2884 log.u_bbr.flex2 = tick; 2885 log.u_bbr.flex3 = tp->t_maxunacktime; 2886 log.u_bbr.flex4 = tp->t_acktime; 2887 log.u_bbr.flex8 = event; 2888 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2889 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2890 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2891 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2892 log.u_bbr.pacing_gain = rack->r_must_retran; 2893 log.u_bbr.bw_inuse = rack->r_ctl.current_round; 2894 log.u_bbr.bw_inuse <<= 32; 2895 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; 2896 TCP_LOG_EVENTP(tp, NULL, 2897 &rack->rc_inp->inp_socket->so_rcv, 2898 &rack->rc_inp->inp_socket->so_snd, 2899 BBR_LOG_PROGRESS, 0, 2900 0, &log, false, &tv); 2901 } 2902 } 2903 2904 static void 2905 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t pacing_delay, uint32_t cts, struct timeval *tv, int line) 2906 { 2907 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 2908 union tcp_log_stackspecific log; 2909 2910 memset(&log, 0, sizeof(log)); 2911 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2912 log.u_bbr.flex1 = pacing_delay; 2913 if (rack->rack_no_prr) 2914 log.u_bbr.flex2 = 0; 2915 else 2916 log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt; 2917 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 2918 log.u_bbr.flex6 = line; 2919 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags); 2920 log.u_bbr.flex8 = rack->rc_in_persist; 2921 log.u_bbr.timeStamp = cts; 2922 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2923 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2924 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2925 log.u_bbr.pacing_gain = rack->r_must_retran; 2926 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2927 &rack->rc_inp->inp_socket->so_rcv, 2928 &rack->rc_inp->inp_socket->so_snd, 2929 BBR_LOG_BBRSND, 0, 2930 0, &log, false, tv); 2931 } 2932 } 2933 2934 static void 2935 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out, int nsegs) 2936 { 2937 if (tcp_bblogging_on(rack->rc_tp)) { 2938 union tcp_log_stackspecific log; 2939 struct timeval tv; 2940 2941 memset(&log, 0, sizeof(log)); 2942 log.u_bbr.flex1 = did_out; 2943 log.u_bbr.flex2 = nxt_pkt; 2944 log.u_bbr.flex3 = way_out; 2945 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 2946 if (rack->rack_no_prr) 2947 log.u_bbr.flex5 = 0; 2948 else 2949 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 2950 log.u_bbr.flex6 = nsegs; 2951 log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs; 2952 log.u_bbr.flex7 = rack->rc_ack_can_sendout_data; /* Do we have ack-can-send set */ 2953 log.u_bbr.flex7 <<= 1; 2954 log.u_bbr.flex7 |= rack->r_fast_output; /* is fast output primed */ 2955 log.u_bbr.flex7 <<= 1; 2956 log.u_bbr.flex7 |= rack->r_wanted_output; /* Do we want output */ 2957 log.u_bbr.flex8 = rack->rc_in_persist; 2958 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2959 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2960 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2961 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 2962 log.u_bbr.use_lt_bw <<= 1; 2963 log.u_bbr.use_lt_bw |= rack->r_might_revert; 2964 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2965 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2966 log.u_bbr.pacing_gain = rack->r_must_retran; 2967 log.u_bbr.bw_inuse = rack->r_ctl.current_round; 2968 log.u_bbr.bw_inuse <<= 32; 2969 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; 2970 log.u_bbr.epoch = rack->rc_inp->inp_socket->so_snd.sb_hiwat; 2971 log.u_bbr.lt_epoch = rack->rc_inp->inp_socket->so_rcv.sb_hiwat; 2972 log.u_bbr.lost = rack->rc_tp->t_srtt; 2973 log.u_bbr.pkt_epoch = rack->rc_tp->rfbuf_cnt; 2974 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2975 &rack->rc_inp->inp_socket->so_rcv, 2976 &rack->rc_inp->inp_socket->so_snd, 2977 BBR_LOG_DOSEG_DONE, 0, 2978 0, &log, false, &tv); 2979 } 2980 } 2981 2982 static void 2983 rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint8_t frm) 2984 { 2985 if (tcp_bblogging_on(rack->rc_tp)) { 2986 union tcp_log_stackspecific log; 2987 struct timeval tv; 2988 2989 memset(&log, 0, sizeof(log)); 2990 log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs; 2991 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 2992 log.u_bbr.flex4 = arg1; 2993 log.u_bbr.flex5 = arg2; 2994 log.u_bbr.flex7 = rack->r_ctl.rc_user_set_min_segs; 2995 log.u_bbr.flex6 = arg3; 2996 log.u_bbr.flex8 = frm; 2997 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2998 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2999 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 3000 log.u_bbr.applimited = rack->r_ctl.rc_sacked; 3001 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 3002 log.u_bbr.pacing_gain = rack->r_must_retran; 3003 TCP_LOG_EVENTP(tp, NULL, &tptosocket(tp)->so_rcv, 3004 &tptosocket(tp)->so_snd, 3005 TCP_HDWR_PACE_SIZE, 0, 0, &log, false, &tv); 3006 } 3007 } 3008 3009 static void 3010 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t pacing_delay, 3011 uint8_t hpts_calling, int reason, uint32_t cwnd_to_use) 3012 { 3013 if (tcp_bblogging_on(rack->rc_tp)) { 3014 union tcp_log_stackspecific log; 3015 struct timeval tv; 3016 3017 memset(&log, 0, sizeof(log)); 3018 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 3019 log.u_bbr.flex1 = pacing_delay; 3020 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; 3021 log.u_bbr.flex4 = reason; 3022 if (rack->rack_no_prr) 3023 log.u_bbr.flex5 = 0; 3024 else 3025 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 3026 log.u_bbr.flex7 = hpts_calling; 3027 log.u_bbr.flex8 = rack->rc_in_persist; 3028 log.u_bbr.lt_epoch = cwnd_to_use; 3029 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3030 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3031 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 3032 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 3033 log.u_bbr.pacing_gain = rack->r_must_retran; 3034 log.u_bbr.cwnd_gain = rack->rc_has_collapsed; 3035 log.u_bbr.bw_inuse = rack->r_ctl.current_round; 3036 log.u_bbr.bw_inuse <<= 32; 3037 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; 3038 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3039 &rack->rc_inp->inp_socket->so_rcv, 3040 &rack->rc_inp->inp_socket->so_snd, 3041 BBR_LOG_JUSTRET, 0, 3042 tlen, &log, false, &tv); 3043 } 3044 } 3045 3046 static void 3047 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32_t us_cts, 3048 struct timeval *tv, uint32_t flags_on_entry) 3049 { 3050 if (tcp_bblogging_on(rack->rc_tp)) { 3051 union tcp_log_stackspecific log; 3052 3053 memset(&log, 0, sizeof(log)); 3054 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 3055 log.u_bbr.flex1 = line; 3056 log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to; 3057 log.u_bbr.flex3 = flags_on_entry; 3058 log.u_bbr.flex4 = us_cts; 3059 if (rack->rack_no_prr) 3060 log.u_bbr.flex5 = 0; 3061 else 3062 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 3063 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 3064 log.u_bbr.flex7 = hpts_removed; 3065 log.u_bbr.flex8 = 1; 3066 log.u_bbr.applimited = rack->r_ctl.rc_hpts_flags; 3067 log.u_bbr.timeStamp = us_cts; 3068 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3069 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 3070 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 3071 log.u_bbr.pacing_gain = rack->r_must_retran; 3072 log.u_bbr.bw_inuse = rack->r_ctl.current_round; 3073 log.u_bbr.bw_inuse <<= 32; 3074 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; 3075 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3076 &rack->rc_inp->inp_socket->so_rcv, 3077 &rack->rc_inp->inp_socket->so_snd, 3078 BBR_LOG_TIMERCANC, 0, 3079 0, &log, false, tv); 3080 } 3081 } 3082 3083 static void 3084 rack_log_alt_to_to_cancel(struct tcp_rack *rack, 3085 uint32_t flex1, uint32_t flex2, 3086 uint32_t flex3, uint32_t flex4, 3087 uint32_t flex5, uint32_t flex6, 3088 uint16_t flex7, uint8_t mod) 3089 { 3090 if (tcp_bblogging_on(rack->rc_tp)) { 3091 union tcp_log_stackspecific log; 3092 struct timeval tv; 3093 3094 if (mod == 1) { 3095 /* No you can't use 1, its for the real to cancel */ 3096 return; 3097 } 3098 memset(&log, 0, sizeof(log)); 3099 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3100 log.u_bbr.flex1 = flex1; 3101 log.u_bbr.flex2 = flex2; 3102 log.u_bbr.flex3 = flex3; 3103 log.u_bbr.flex4 = flex4; 3104 log.u_bbr.flex5 = flex5; 3105 log.u_bbr.flex6 = flex6; 3106 log.u_bbr.flex7 = flex7; 3107 log.u_bbr.flex8 = mod; 3108 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3109 &rack->rc_inp->inp_socket->so_rcv, 3110 &rack->rc_inp->inp_socket->so_snd, 3111 BBR_LOG_TIMERCANC, 0, 3112 0, &log, false, &tv); 3113 } 3114 } 3115 3116 static void 3117 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers) 3118 { 3119 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 3120 union tcp_log_stackspecific log; 3121 struct timeval tv; 3122 3123 memset(&log, 0, sizeof(log)); 3124 log.u_bbr.flex1 = timers; 3125 log.u_bbr.flex2 = ret; 3126 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp; 3127 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 3128 log.u_bbr.flex5 = cts; 3129 if (rack->rack_no_prr) 3130 log.u_bbr.flex6 = 0; 3131 else 3132 log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt; 3133 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 3134 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 3135 log.u_bbr.pacing_gain = rack->r_must_retran; 3136 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3137 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3138 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3139 &rack->rc_inp->inp_socket->so_rcv, 3140 &rack->rc_inp->inp_socket->so_snd, 3141 BBR_LOG_TO_PROCESS, 0, 3142 0, &log, false, &tv); 3143 } 3144 } 3145 3146 static void 3147 rack_log_to_prr(struct tcp_rack *rack, int frm, int orig_cwnd, int line) 3148 { 3149 if (tcp_bblogging_on(rack->rc_tp)) { 3150 union tcp_log_stackspecific log; 3151 struct timeval tv; 3152 3153 memset(&log, 0, sizeof(log)); 3154 log.u_bbr.flex1 = rack->r_ctl.rc_prr_out; 3155 log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs; 3156 if (rack->rack_no_prr) 3157 log.u_bbr.flex3 = 0; 3158 else 3159 log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt; 3160 log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered; 3161 log.u_bbr.flex5 = rack->r_ctl.rc_sacked; 3162 log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt; 3163 log.u_bbr.flex7 = line; 3164 log.u_bbr.flex8 = frm; 3165 log.u_bbr.pkts_out = orig_cwnd; 3166 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3167 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3168 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 3169 log.u_bbr.use_lt_bw <<= 1; 3170 log.u_bbr.use_lt_bw |= rack->r_might_revert; 3171 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3172 &rack->rc_inp->inp_socket->so_rcv, 3173 &rack->rc_inp->inp_socket->so_snd, 3174 BBR_LOG_BBRUPD, 0, 3175 0, &log, false, &tv); 3176 } 3177 } 3178 3179 static void 3180 rack_counter_destroy(void) 3181 { 3182 counter_u64_free(rack_total_bytes); 3183 counter_u64_free(rack_fto_send); 3184 counter_u64_free(rack_fto_rsm_send); 3185 counter_u64_free(rack_nfto_resend); 3186 counter_u64_free(rack_hw_pace_init_fail); 3187 counter_u64_free(rack_hw_pace_lost); 3188 counter_u64_free(rack_non_fto_send); 3189 counter_u64_free(rack_extended_rfo); 3190 counter_u64_free(rack_tlp_tot); 3191 counter_u64_free(rack_tlp_newdata); 3192 counter_u64_free(rack_tlp_retran); 3193 counter_u64_free(rack_tlp_retran_bytes); 3194 counter_u64_free(rack_to_tot); 3195 counter_u64_free(rack_saw_enobuf); 3196 counter_u64_free(rack_saw_enobuf_hw); 3197 counter_u64_free(rack_saw_enetunreach); 3198 counter_u64_free(rack_hot_alloc); 3199 counter_u64_free(rack_to_alloc); 3200 counter_u64_free(rack_to_alloc_hard); 3201 counter_u64_free(rack_to_alloc_emerg); 3202 counter_u64_free(rack_to_alloc_limited); 3203 counter_u64_free(rack_alloc_limited_conns); 3204 counter_u64_free(rack_split_limited); 3205 counter_u64_free(rack_multi_single_eq); 3206 counter_u64_free(rack_rxt_clamps_cwnd); 3207 counter_u64_free(rack_rxt_clamps_cwnd_uniq); 3208 counter_u64_free(rack_proc_non_comp_ack); 3209 counter_u64_free(rack_sack_proc_all); 3210 counter_u64_free(rack_sack_proc_restart); 3211 counter_u64_free(rack_sack_proc_short); 3212 counter_u64_free(rack_input_idle_reduces); 3213 counter_u64_free(rack_collapsed_win); 3214 counter_u64_free(rack_collapsed_win_rxt); 3215 counter_u64_free(rack_collapsed_win_rxt_bytes); 3216 counter_u64_free(rack_collapsed_win_seen); 3217 counter_u64_free(rack_try_scwnd); 3218 counter_u64_free(rack_persists_sends); 3219 counter_u64_free(rack_persists_acks); 3220 counter_u64_free(rack_persists_loss); 3221 counter_u64_free(rack_persists_lost_ends); 3222 #ifdef INVARIANTS 3223 counter_u64_free(rack_adjust_map_bw); 3224 #endif 3225 COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE); 3226 COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE); 3227 } 3228 3229 static struct rack_sendmap * 3230 rack_alloc(struct tcp_rack *rack) 3231 { 3232 struct rack_sendmap *rsm; 3233 3234 /* 3235 * First get the top of the list it in 3236 * theory is the "hottest" rsm we have, 3237 * possibly just freed by ack processing. 3238 */ 3239 if (rack->rc_free_cnt > rack_free_cache) { 3240 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 3241 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 3242 counter_u64_add(rack_hot_alloc, 1); 3243 rack->rc_free_cnt--; 3244 return (rsm); 3245 } 3246 /* 3247 * Once we get under our free cache we probably 3248 * no longer have a "hot" one available. Lets 3249 * get one from UMA. 3250 */ 3251 rsm = uma_zalloc(rack_zone, M_NOWAIT); 3252 if (rsm) { 3253 rack->r_ctl.rc_num_maps_alloced++; 3254 counter_u64_add(rack_to_alloc, 1); 3255 return (rsm); 3256 } 3257 /* 3258 * Dig in to our aux rsm's (the last two) since 3259 * UMA failed to get us one. 3260 */ 3261 if (rack->rc_free_cnt) { 3262 counter_u64_add(rack_to_alloc_emerg, 1); 3263 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 3264 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 3265 rack->rc_free_cnt--; 3266 return (rsm); 3267 } 3268 return (NULL); 3269 } 3270 3271 static struct rack_sendmap * 3272 rack_alloc_full_limit(struct tcp_rack *rack) 3273 { 3274 if ((V_tcp_map_entries_limit > 0) && 3275 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 3276 counter_u64_add(rack_to_alloc_limited, 1); 3277 if (!rack->alloc_limit_reported) { 3278 rack->alloc_limit_reported = 1; 3279 counter_u64_add(rack_alloc_limited_conns, 1); 3280 } 3281 return (NULL); 3282 } 3283 return (rack_alloc(rack)); 3284 } 3285 3286 /* wrapper to allocate a sendmap entry, subject to a specific limit */ 3287 static struct rack_sendmap * 3288 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type) 3289 { 3290 struct rack_sendmap *rsm; 3291 3292 if (limit_type) { 3293 /* currently there is only one limit type */ 3294 if (rack->r_ctl.rc_split_limit > 0 && 3295 rack->r_ctl.rc_num_split_allocs >= rack->r_ctl.rc_split_limit) { 3296 counter_u64_add(rack_split_limited, 1); 3297 if (!rack->alloc_limit_reported) { 3298 rack->alloc_limit_reported = 1; 3299 counter_u64_add(rack_alloc_limited_conns, 1); 3300 } 3301 return (NULL); 3302 } 3303 } 3304 3305 /* allocate and mark in the limit type, if set */ 3306 rsm = rack_alloc(rack); 3307 if (rsm != NULL && limit_type) { 3308 rsm->r_limit_type = limit_type; 3309 rack->r_ctl.rc_num_split_allocs++; 3310 } 3311 return (rsm); 3312 } 3313 3314 static void 3315 rack_free_trim(struct tcp_rack *rack) 3316 { 3317 struct rack_sendmap *rsm; 3318 3319 /* 3320 * Free up all the tail entries until 3321 * we get our list down to the limit. 3322 */ 3323 while (rack->rc_free_cnt > rack_free_cache) { 3324 rsm = TAILQ_LAST(&rack->r_ctl.rc_free, rack_head); 3325 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 3326 rack->rc_free_cnt--; 3327 rack->r_ctl.rc_num_maps_alloced--; 3328 uma_zfree(rack_zone, rsm); 3329 } 3330 } 3331 3332 static void 3333 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) 3334 { 3335 if (rsm->r_flags & RACK_APP_LIMITED) { 3336 KASSERT((rack->r_ctl.rc_app_limited_cnt > 0), 3337 ("app_cnt %u, rsm %p", rack->r_ctl.rc_app_limited_cnt, rsm)); 3338 rack->r_ctl.rc_app_limited_cnt--; 3339 } 3340 if (rsm->r_limit_type) { 3341 /* currently there is only one limit type */ 3342 rack->r_ctl.rc_num_split_allocs--; 3343 } 3344 if (rsm == rack->r_ctl.rc_first_appl) { 3345 rack->r_ctl.cleared_app_ack_seq = rsm->r_end; 3346 rack->r_ctl.cleared_app_ack = 1; 3347 if (rack->r_ctl.rc_app_limited_cnt == 0) 3348 rack->r_ctl.rc_first_appl = NULL; 3349 else 3350 rack->r_ctl.rc_first_appl = tqhash_find(rack->r_ctl.tqh, rsm->r_nseq_appl); 3351 } 3352 if (rsm == rack->r_ctl.rc_resend) 3353 rack->r_ctl.rc_resend = NULL; 3354 if (rsm == rack->r_ctl.rc_end_appl) 3355 rack->r_ctl.rc_end_appl = NULL; 3356 if (rack->r_ctl.rc_tlpsend == rsm) 3357 rack->r_ctl.rc_tlpsend = NULL; 3358 if (rack->r_ctl.rc_sacklast == rsm) 3359 rack->r_ctl.rc_sacklast = NULL; 3360 memset(rsm, 0, sizeof(struct rack_sendmap)); 3361 /* Make sure we are not going to overrun our count limit of 0xff */ 3362 if ((rack->rc_free_cnt + 1) > RACK_FREE_CNT_MAX) { 3363 rack_free_trim(rack); 3364 } 3365 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_free, rsm, r_tnext); 3366 rack->rc_free_cnt++; 3367 } 3368 3369 static uint32_t 3370 rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack) 3371 { 3372 uint64_t srtt, bw, len, tim; 3373 uint32_t segsiz, def_len, minl; 3374 3375 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 3376 def_len = rack_def_data_window * segsiz; 3377 if (rack->rc_gp_filled == 0) { 3378 /* 3379 * We have no measurement (IW is in flight?) so 3380 * we can only guess using our data_window sysctl 3381 * value (usually 20MSS). 3382 */ 3383 return (def_len); 3384 } 3385 /* 3386 * Now we have a number of factors to consider. 3387 * 3388 * 1) We have a desired BDP which is usually 3389 * at least 2. 3390 * 2) We have a minimum number of rtt's usually 1 SRTT 3391 * but we allow it too to be more. 3392 * 3) We want to make sure a measurement last N useconds (if 3393 * we have set rack_min_measure_usec. 3394 * 3395 * We handle the first concern here by trying to create a data 3396 * window of max(rack_def_data_window, DesiredBDP). The 3397 * second concern we handle in not letting the measurement 3398 * window end normally until at least the required SRTT's 3399 * have gone by which is done further below in 3400 * rack_enough_for_measurement(). Finally the third concern 3401 * we also handle here by calculating how long that time 3402 * would take at the current BW and then return the 3403 * max of our first calculation and that length. Note 3404 * that if rack_min_measure_usec is 0, we don't deal 3405 * with concern 3. Also for both Concern 1 and 3 an 3406 * application limited period could end the measurement 3407 * earlier. 3408 * 3409 * So lets calculate the BDP with the "known" b/w using 3410 * the SRTT as our rtt and then multiply it by the goal. 3411 */ 3412 bw = rack_get_bw(rack); 3413 srtt = (uint64_t)tp->t_srtt; 3414 len = bw * srtt; 3415 len /= (uint64_t)HPTS_USEC_IN_SEC; 3416 len *= max(1, rack_goal_bdp); 3417 /* Now we need to round up to the nearest MSS */ 3418 len = roundup(len, segsiz); 3419 if (rack_min_measure_usec) { 3420 /* Now calculate our min length for this b/w */ 3421 tim = rack_min_measure_usec; 3422 minl = (tim * bw) / (uint64_t)HPTS_USEC_IN_SEC; 3423 if (minl == 0) 3424 minl = 1; 3425 minl = roundup(minl, segsiz); 3426 if (len < minl) 3427 len = minl; 3428 } 3429 /* 3430 * Now if we have a very small window we want 3431 * to attempt to get the window that is 3432 * as small as possible. This happens on 3433 * low b/w connections and we don't want to 3434 * span huge numbers of rtt's between measurements. 3435 * 3436 * We basically include 2 over our "MIN window" so 3437 * that the measurement can be shortened (possibly) by 3438 * an ack'ed packet. 3439 */ 3440 if (len < def_len) 3441 return (max((uint32_t)len, ((MIN_GP_WIN+2) * segsiz))); 3442 else 3443 return (max((uint32_t)len, def_len)); 3444 3445 } 3446 3447 static int 3448 rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack, uint8_t *quality) 3449 { 3450 uint32_t tim, srtts, segsiz; 3451 3452 /* 3453 * Has enough time passed for the GP measurement to be valid? 3454 */ 3455 if (SEQ_LT(th_ack, tp->gput_seq)) { 3456 /* Not enough bytes yet */ 3457 return (0); 3458 } 3459 if ((tp->snd_max == tp->snd_una) || 3460 (th_ack == tp->snd_max)){ 3461 /* 3462 * All is acked quality of all acked is 3463 * usually low or medium, but we in theory could split 3464 * all acked into two cases, where you got 3465 * a signifigant amount of your window and 3466 * where you did not. For now we leave it 3467 * but it is something to contemplate in the 3468 * future. The danger here is that delayed ack 3469 * is effecting the last byte (which is a 50:50 chance). 3470 */ 3471 *quality = RACK_QUALITY_ALLACKED; 3472 return (1); 3473 } 3474 if (SEQ_GEQ(th_ack, tp->gput_ack)) { 3475 /* 3476 * We obtained our entire window of data we wanted 3477 * no matter if we are in recovery or not then 3478 * its ok since expanding the window does not 3479 * make things fuzzy (or at least not as much). 3480 */ 3481 *quality = RACK_QUALITY_HIGH; 3482 return (1); 3483 } 3484 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 3485 if (SEQ_LT(th_ack, tp->gput_ack) && 3486 ((th_ack - tp->gput_seq) < max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { 3487 /* Not enough bytes yet */ 3488 return (0); 3489 } 3490 if (rack->r_ctl.rc_first_appl && 3491 (SEQ_GEQ(th_ack, rack->r_ctl.rc_first_appl->r_end))) { 3492 /* 3493 * We are up to the app limited send point 3494 * we have to measure irrespective of the time.. 3495 */ 3496 *quality = RACK_QUALITY_APPLIMITED; 3497 return (1); 3498 } 3499 /* Now what about time? */ 3500 srtts = (rack->r_ctl.rc_gp_srtt * rack_min_srtts); 3501 tim = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time) - tp->gput_ts; 3502 if ((tim >= srtts) && (IN_RECOVERY(rack->rc_tp->t_flags) == 0)) { 3503 /* 3504 * We do not allow a measurement if we are in recovery 3505 * that would shrink the goodput window we wanted. 3506 * This is to prevent cloudyness of when the last send 3507 * was actually made. 3508 */ 3509 *quality = RACK_QUALITY_HIGH; 3510 return (1); 3511 } 3512 /* Nope not even a full SRTT has passed */ 3513 return (0); 3514 } 3515 3516 static void 3517 rack_log_timely(struct tcp_rack *rack, 3518 uint32_t logged, uint64_t cur_bw, uint64_t low_bnd, 3519 uint64_t up_bnd, int line, uint8_t method) 3520 { 3521 if (tcp_bblogging_on(rack->rc_tp)) { 3522 union tcp_log_stackspecific log; 3523 struct timeval tv; 3524 3525 memset(&log, 0, sizeof(log)); 3526 log.u_bbr.flex1 = logged; 3527 log.u_bbr.flex2 = rack->rc_gp_timely_inc_cnt; 3528 log.u_bbr.flex2 <<= 4; 3529 log.u_bbr.flex2 |= rack->rc_gp_timely_dec_cnt; 3530 log.u_bbr.flex2 <<= 4; 3531 log.u_bbr.flex2 |= rack->rc_gp_incr; 3532 log.u_bbr.flex2 <<= 4; 3533 log.u_bbr.flex2 |= rack->rc_gp_bwred; 3534 log.u_bbr.flex3 = rack->rc_gp_incr; 3535 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; 3536 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ca; 3537 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_rec; 3538 log.u_bbr.flex7 = rack->rc_gp_bwred; 3539 log.u_bbr.flex8 = method; 3540 log.u_bbr.cur_del_rate = cur_bw; 3541 log.u_bbr.delRate = low_bnd; 3542 log.u_bbr.bw_inuse = up_bnd; 3543 log.u_bbr.rttProp = rack_get_bw(rack); 3544 log.u_bbr.pkt_epoch = line; 3545 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; 3546 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3547 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3548 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; 3549 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; 3550 log.u_bbr.cwnd_gain = rack->rc_dragged_bottom; 3551 log.u_bbr.cwnd_gain <<= 1; 3552 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_rec; 3553 log.u_bbr.cwnd_gain <<= 1; 3554 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; 3555 log.u_bbr.cwnd_gain <<= 1; 3556 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; 3557 log.u_bbr.lost = rack->r_ctl.rc_loss_count; 3558 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3559 &rack->rc_inp->inp_socket->so_rcv, 3560 &rack->rc_inp->inp_socket->so_snd, 3561 TCP_TIMELY_WORK, 0, 3562 0, &log, false, &tv); 3563 } 3564 } 3565 3566 static int 3567 rack_bw_can_be_raised(struct tcp_rack *rack, uint64_t cur_bw, uint64_t last_bw_est, uint16_t mult) 3568 { 3569 /* 3570 * Before we increase we need to know if 3571 * the estimate just made was less than 3572 * our pacing goal (i.e. (cur_bw * mult) > last_bw_est) 3573 * 3574 * If we already are pacing at a fast enough 3575 * rate to push us faster there is no sense of 3576 * increasing. 3577 * 3578 * We first caculate our actual pacing rate (ss or ca multiplier 3579 * times our cur_bw). 3580 * 3581 * Then we take the last measured rate and multipy by our 3582 * maximum pacing overage to give us a max allowable rate. 3583 * 3584 * If our act_rate is smaller than our max_allowable rate 3585 * then we should increase. Else we should hold steady. 3586 * 3587 */ 3588 uint64_t act_rate, max_allow_rate; 3589 3590 if (rack_timely_no_stopping) 3591 return (1); 3592 3593 if ((cur_bw == 0) || (last_bw_est == 0)) { 3594 /* 3595 * Initial startup case or 3596 * everything is acked case. 3597 */ 3598 rack_log_timely(rack, mult, cur_bw, 0, 0, 3599 __LINE__, 9); 3600 return (1); 3601 } 3602 if (mult <= 100) { 3603 /* 3604 * We can always pace at or slightly above our rate. 3605 */ 3606 rack_log_timely(rack, mult, cur_bw, 0, 0, 3607 __LINE__, 9); 3608 return (1); 3609 } 3610 act_rate = cur_bw * (uint64_t)mult; 3611 act_rate /= 100; 3612 max_allow_rate = last_bw_est * ((uint64_t)rack_max_per_above + (uint64_t)100); 3613 max_allow_rate /= 100; 3614 if (act_rate < max_allow_rate) { 3615 /* 3616 * Here the rate we are actually pacing at 3617 * is smaller than 10% above our last measurement. 3618 * This means we are pacing below what we would 3619 * like to try to achieve (plus some wiggle room). 3620 */ 3621 rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate, 3622 __LINE__, 9); 3623 return (1); 3624 } else { 3625 /* 3626 * Here we are already pacing at least rack_max_per_above(10%) 3627 * what we are getting back. This indicates most likely 3628 * that we are being limited (cwnd/rwnd/app) and can't 3629 * get any more b/w. There is no sense of trying to 3630 * raise up the pacing rate its not speeding us up 3631 * and we already are pacing faster than we are getting. 3632 */ 3633 rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate, 3634 __LINE__, 8); 3635 return (0); 3636 } 3637 } 3638 3639 static void 3640 rack_validate_multipliers_at_or_above100(struct tcp_rack *rack) 3641 { 3642 /* 3643 * When we drag bottom, we want to assure 3644 * that no multiplier is below 1.0, if so 3645 * we want to restore it to at least that. 3646 */ 3647 if (rack->r_ctl.rack_per_of_gp_rec < 100) { 3648 /* This is unlikely we usually do not touch recovery */ 3649 rack->r_ctl.rack_per_of_gp_rec = 100; 3650 } 3651 if (rack->r_ctl.rack_per_of_gp_ca < 100) { 3652 rack->r_ctl.rack_per_of_gp_ca = 100; 3653 } 3654 if (rack->r_ctl.rack_per_of_gp_ss < 100) { 3655 rack->r_ctl.rack_per_of_gp_ss = 100; 3656 } 3657 } 3658 3659 static void 3660 rack_validate_multipliers_at_or_below_100(struct tcp_rack *rack) 3661 { 3662 if (rack->r_ctl.rack_per_of_gp_ca > 100) { 3663 rack->r_ctl.rack_per_of_gp_ca = 100; 3664 } 3665 if (rack->r_ctl.rack_per_of_gp_ss > 100) { 3666 rack->r_ctl.rack_per_of_gp_ss = 100; 3667 } 3668 } 3669 3670 static void 3671 rack_increase_bw_mul(struct tcp_rack *rack, int timely_says, uint64_t cur_bw, uint64_t last_bw_est, int override) 3672 { 3673 int32_t calc, logged, plus; 3674 3675 logged = 0; 3676 3677 if (rack->rc_skip_timely) 3678 return; 3679 if (override) { 3680 /* 3681 * override is passed when we are 3682 * loosing b/w and making one last 3683 * gasp at trying to not loose out 3684 * to a new-reno flow. 3685 */ 3686 goto extra_boost; 3687 } 3688 /* In classic timely we boost by 5x if we have 5 increases in a row, lets not */ 3689 if (rack->rc_gp_incr && 3690 ((rack->rc_gp_timely_inc_cnt + 1) >= RACK_TIMELY_CNT_BOOST)) { 3691 /* 3692 * Reset and get 5 strokes more before the boost. Note 3693 * that the count is 0 based so we have to add one. 3694 */ 3695 extra_boost: 3696 plus = (uint32_t)rack_gp_increase_per * RACK_TIMELY_CNT_BOOST; 3697 rack->rc_gp_timely_inc_cnt = 0; 3698 } else 3699 plus = (uint32_t)rack_gp_increase_per; 3700 /* Must be at least 1% increase for true timely increases */ 3701 if ((plus < 1) && 3702 ((rack->r_ctl.rc_rtt_diff <= 0) || (timely_says <= 0))) 3703 plus = 1; 3704 if (rack->rc_gp_saw_rec && 3705 (rack->rc_gp_no_rec_chg == 0) && 3706 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 3707 rack->r_ctl.rack_per_of_gp_rec)) { 3708 /* We have been in recovery ding it too */ 3709 calc = rack->r_ctl.rack_per_of_gp_rec + plus; 3710 if (calc > 0xffff) 3711 calc = 0xffff; 3712 logged |= 1; 3713 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)calc; 3714 if (rack->r_ctl.rack_per_upper_bound_ca && 3715 (rack->rc_dragged_bottom == 0) && 3716 (rack->r_ctl.rack_per_of_gp_rec > rack->r_ctl.rack_per_upper_bound_ca)) 3717 rack->r_ctl.rack_per_of_gp_rec = rack->r_ctl.rack_per_upper_bound_ca; 3718 } 3719 if (rack->rc_gp_saw_ca && 3720 (rack->rc_gp_saw_ss == 0) && 3721 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 3722 rack->r_ctl.rack_per_of_gp_ca)) { 3723 /* In CA */ 3724 calc = rack->r_ctl.rack_per_of_gp_ca + plus; 3725 if (calc > 0xffff) 3726 calc = 0xffff; 3727 logged |= 2; 3728 rack->r_ctl.rack_per_of_gp_ca = (uint16_t)calc; 3729 if (rack->r_ctl.rack_per_upper_bound_ca && 3730 (rack->rc_dragged_bottom == 0) && 3731 (rack->r_ctl.rack_per_of_gp_ca > rack->r_ctl.rack_per_upper_bound_ca)) 3732 rack->r_ctl.rack_per_of_gp_ca = rack->r_ctl.rack_per_upper_bound_ca; 3733 } 3734 if (rack->rc_gp_saw_ss && 3735 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 3736 rack->r_ctl.rack_per_of_gp_ss)) { 3737 /* In SS */ 3738 calc = rack->r_ctl.rack_per_of_gp_ss + plus; 3739 if (calc > 0xffff) 3740 calc = 0xffff; 3741 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)calc; 3742 if (rack->r_ctl.rack_per_upper_bound_ss && 3743 (rack->rc_dragged_bottom == 0) && 3744 (rack->r_ctl.rack_per_of_gp_ss > rack->r_ctl.rack_per_upper_bound_ss)) 3745 rack->r_ctl.rack_per_of_gp_ss = rack->r_ctl.rack_per_upper_bound_ss; 3746 logged |= 4; 3747 } 3748 if (logged && 3749 (rack->rc_gp_incr == 0)){ 3750 /* Go into increment mode */ 3751 rack->rc_gp_incr = 1; 3752 rack->rc_gp_timely_inc_cnt = 0; 3753 } 3754 if (rack->rc_gp_incr && 3755 logged && 3756 (rack->rc_gp_timely_inc_cnt < RACK_TIMELY_CNT_BOOST)) { 3757 rack->rc_gp_timely_inc_cnt++; 3758 } 3759 rack_log_timely(rack, logged, plus, 0, 0, 3760 __LINE__, 1); 3761 } 3762 3763 static uint32_t 3764 rack_get_decrease(struct tcp_rack *rack, uint32_t curper, int32_t rtt_diff) 3765 { 3766 /*- 3767 * norm_grad = rtt_diff / minrtt; 3768 * new_per = curper * (1 - B * norm_grad) 3769 * 3770 * B = rack_gp_decrease_per (default 80%) 3771 * rtt_dif = input var current rtt-diff 3772 * curper = input var current percentage 3773 * minrtt = from rack filter 3774 * 3775 * In order to do the floating point calculations above we 3776 * do an integer conversion. The code looks confusing so let me 3777 * translate it into something that use more variables and 3778 * is clearer for us humans :) 3779 * 3780 * uint64_t norm_grad, inverse, reduce_by, final_result; 3781 * uint32_t perf; 3782 * 3783 * norm_grad = (((uint64_t)rtt_diff * 1000000) / 3784 * (uint64_t)get_filter_small(&rack->r_ctl.rc_gp_min_rtt)); 3785 * inverse = ((uint64_t)rack_gp_decrease * (uint64_t)1000000) * norm_grad; 3786 * inverse /= 1000000; 3787 * reduce_by = (1000000 - inverse); 3788 * final_result = (cur_per * reduce_by) / 1000000; 3789 * perf = (uint32_t)final_result; 3790 */ 3791 uint64_t perf; 3792 3793 perf = (((uint64_t)curper * ((uint64_t)1000000 - 3794 ((uint64_t)rack_gp_decrease_per * (uint64_t)10000 * 3795 (((uint64_t)rtt_diff * (uint64_t)1000000)/ 3796 (uint64_t)get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)))/ 3797 (uint64_t)1000000)) / 3798 (uint64_t)1000000); 3799 if (perf > curper) { 3800 /* TSNH */ 3801 perf = curper - 1; 3802 } 3803 return ((uint32_t)perf); 3804 } 3805 3806 static uint32_t 3807 rack_decrease_highrtt(struct tcp_rack *rack, uint32_t curper, uint32_t rtt) 3808 { 3809 /* 3810 * highrttthresh 3811 * result = curper * (1 - (B * ( 1 - ------ )) 3812 * gp_srtt 3813 * 3814 * B = rack_gp_decrease_per (default .8 i.e. 80) 3815 * highrttthresh = filter_min * rack_gp_rtt_maxmul 3816 */ 3817 uint64_t perf; 3818 uint32_t highrttthresh; 3819 3820 highrttthresh = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; 3821 3822 perf = (((uint64_t)curper * ((uint64_t)1000000 - 3823 ((uint64_t)rack_gp_decrease_per * ((uint64_t)1000000 - 3824 ((uint64_t)highrttthresh * (uint64_t)1000000) / 3825 (uint64_t)rtt)) / 100)) /(uint64_t)1000000); 3826 if (tcp_bblogging_on(rack->rc_tp)) { 3827 uint64_t log1; 3828 3829 log1 = rtt; 3830 log1 <<= 32; 3831 log1 |= highrttthresh; 3832 rack_log_timely(rack, 3833 rack_gp_decrease_per, 3834 (uint64_t)curper, 3835 log1, 3836 perf, 3837 __LINE__, 3838 15); 3839 } 3840 return (perf); 3841 } 3842 3843 static void 3844 rack_decrease_bw_mul(struct tcp_rack *rack, int timely_says, uint32_t rtt, int32_t rtt_diff) 3845 { 3846 uint64_t logvar, logvar2, logvar3; 3847 uint32_t logged, new_per, ss_red, ca_red, rec_red, alt, val; 3848 3849 if (rack->rc_skip_timely) 3850 return; 3851 if (rack->rc_gp_incr) { 3852 /* Turn off increment counting */ 3853 rack->rc_gp_incr = 0; 3854 rack->rc_gp_timely_inc_cnt = 0; 3855 } 3856 ss_red = ca_red = rec_red = 0; 3857 logged = 0; 3858 /* Calculate the reduction value */ 3859 if (rtt_diff < 0) { 3860 rtt_diff *= -1; 3861 } 3862 /* Must be at least 1% reduction */ 3863 if (rack->rc_gp_saw_rec && (rack->rc_gp_no_rec_chg == 0)) { 3864 /* We have been in recovery ding it too */ 3865 if (timely_says == 2) { 3866 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_rec, rtt); 3867 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 3868 if (alt < new_per) 3869 val = alt; 3870 else 3871 val = new_per; 3872 } else 3873 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 3874 if (rack->r_ctl.rack_per_of_gp_rec > val) { 3875 rec_red = (rack->r_ctl.rack_per_of_gp_rec - val); 3876 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)val; 3877 } else { 3878 rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; 3879 rec_red = 0; 3880 } 3881 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_rec) 3882 rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; 3883 logged |= 1; 3884 } 3885 if (rack->rc_gp_saw_ss) { 3886 /* Sent in SS */ 3887 if (timely_says == 2) { 3888 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ss, rtt); 3889 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff); 3890 if (alt < new_per) 3891 val = alt; 3892 else 3893 val = new_per; 3894 } else 3895 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff); 3896 if (rack->r_ctl.rack_per_of_gp_ss > new_per) { 3897 ss_red = rack->r_ctl.rack_per_of_gp_ss - val; 3898 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)val; 3899 } else { 3900 ss_red = new_per; 3901 rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; 3902 logvar = new_per; 3903 logvar <<= 32; 3904 logvar |= alt; 3905 logvar2 = (uint32_t)rtt; 3906 logvar2 <<= 32; 3907 logvar2 |= (uint32_t)rtt_diff; 3908 logvar3 = rack_gp_rtt_maxmul; 3909 logvar3 <<= 32; 3910 logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 3911 rack_log_timely(rack, timely_says, 3912 logvar2, logvar3, 3913 logvar, __LINE__, 10); 3914 } 3915 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ss) 3916 rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; 3917 logged |= 4; 3918 } else if (rack->rc_gp_saw_ca) { 3919 /* Sent in CA */ 3920 if (timely_says == 2) { 3921 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ca, rtt); 3922 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff); 3923 if (alt < new_per) 3924 val = alt; 3925 else 3926 val = new_per; 3927 } else 3928 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff); 3929 if (rack->r_ctl.rack_per_of_gp_ca > val) { 3930 ca_red = rack->r_ctl.rack_per_of_gp_ca - val; 3931 rack->r_ctl.rack_per_of_gp_ca = (uint16_t)val; 3932 } else { 3933 rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; 3934 ca_red = 0; 3935 logvar = new_per; 3936 logvar <<= 32; 3937 logvar |= alt; 3938 logvar2 = (uint32_t)rtt; 3939 logvar2 <<= 32; 3940 logvar2 |= (uint32_t)rtt_diff; 3941 logvar3 = rack_gp_rtt_maxmul; 3942 logvar3 <<= 32; 3943 logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 3944 rack_log_timely(rack, timely_says, 3945 logvar2, logvar3, 3946 logvar, __LINE__, 10); 3947 } 3948 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ca) 3949 rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; 3950 logged |= 2; 3951 } 3952 if (rack->rc_gp_timely_dec_cnt < 0x7) { 3953 rack->rc_gp_timely_dec_cnt++; 3954 if (rack_timely_dec_clear && 3955 (rack->rc_gp_timely_dec_cnt == rack_timely_dec_clear)) 3956 rack->rc_gp_timely_dec_cnt = 0; 3957 } 3958 logvar = ss_red; 3959 logvar <<= 32; 3960 logvar |= ca_red; 3961 rack_log_timely(rack, logged, rec_red, rack_per_lower_bound, logvar, 3962 __LINE__, 2); 3963 } 3964 3965 static void 3966 rack_log_rtt_shrinks(struct tcp_rack *rack, uint32_t us_cts, 3967 uint32_t rtt, uint32_t line, uint8_t reas) 3968 { 3969 if (tcp_bblogging_on(rack->rc_tp)) { 3970 union tcp_log_stackspecific log; 3971 struct timeval tv; 3972 3973 memset(&log, 0, sizeof(log)); 3974 log.u_bbr.flex1 = line; 3975 log.u_bbr.flex2 = rack->r_ctl.rc_time_probertt_starts; 3976 log.u_bbr.flex3 = rack->r_ctl.rc_lower_rtt_us_cts; 3977 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; 3978 log.u_bbr.flex5 = rtt; 3979 log.u_bbr.flex6 = rack->rc_highly_buffered; 3980 log.u_bbr.flex6 <<= 1; 3981 log.u_bbr.flex6 |= rack->forced_ack; 3982 log.u_bbr.flex6 <<= 1; 3983 log.u_bbr.flex6 |= rack->rc_gp_dyn_mul; 3984 log.u_bbr.flex6 <<= 1; 3985 log.u_bbr.flex6 |= rack->in_probe_rtt; 3986 log.u_bbr.flex6 <<= 1; 3987 log.u_bbr.flex6 |= rack->measure_saw_probe_rtt; 3988 log.u_bbr.flex7 = rack->r_ctl.rack_per_of_gp_probertt; 3989 log.u_bbr.pacing_gain = rack->r_ctl.rack_per_of_gp_ca; 3990 log.u_bbr.cwnd_gain = rack->r_ctl.rack_per_of_gp_rec; 3991 log.u_bbr.flex8 = reas; 3992 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3993 log.u_bbr.delRate = rack_get_bw(rack); 3994 log.u_bbr.cur_del_rate = rack->r_ctl.rc_highest_us_rtt; 3995 log.u_bbr.cur_del_rate <<= 32; 3996 log.u_bbr.cur_del_rate |= rack->r_ctl.rc_lowest_us_rtt; 3997 log.u_bbr.applimited = rack->r_ctl.rc_time_probertt_entered; 3998 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; 3999 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 4000 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; 4001 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; 4002 log.u_bbr.pkt_epoch = rack->r_ctl.rc_lower_rtt_us_cts; 4003 log.u_bbr.delivered = rack->r_ctl.rc_target_probertt_flight; 4004 log.u_bbr.lost = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 4005 log.u_bbr.rttProp = us_cts; 4006 log.u_bbr.rttProp <<= 32; 4007 log.u_bbr.rttProp |= rack->r_ctl.rc_entry_gp_rtt; 4008 TCP_LOG_EVENTP(rack->rc_tp, NULL, 4009 &rack->rc_inp->inp_socket->so_rcv, 4010 &rack->rc_inp->inp_socket->so_snd, 4011 BBR_LOG_RTT_SHRINKS, 0, 4012 0, &log, false, &rack->r_ctl.act_rcv_time); 4013 } 4014 } 4015 4016 static void 4017 rack_set_prtt_target(struct tcp_rack *rack, uint32_t segsiz, uint32_t rtt) 4018 { 4019 uint64_t bwdp; 4020 4021 bwdp = rack_get_bw(rack); 4022 bwdp *= (uint64_t)rtt; 4023 bwdp /= (uint64_t)HPTS_USEC_IN_SEC; 4024 rack->r_ctl.rc_target_probertt_flight = roundup((uint32_t)bwdp, segsiz); 4025 if (rack->r_ctl.rc_target_probertt_flight < (segsiz * rack_timely_min_segs)) { 4026 /* 4027 * A window protocol must be able to have 4 packets 4028 * outstanding as the floor in order to function 4029 * (especially considering delayed ack :D). 4030 */ 4031 rack->r_ctl.rc_target_probertt_flight = (segsiz * rack_timely_min_segs); 4032 } 4033 } 4034 4035 static void 4036 rack_enter_probertt(struct tcp_rack *rack, uint32_t us_cts) 4037 { 4038 /** 4039 * ProbeRTT is a bit different in rack_pacing than in 4040 * BBR. It is like BBR in that it uses the lowering of 4041 * the RTT as a signal that we saw something new and 4042 * counts from there for how long between. But it is 4043 * different in that its quite simple. It does not 4044 * play with the cwnd and wait until we get down 4045 * to N segments outstanding and hold that for 4046 * 200ms. Instead it just sets the pacing reduction 4047 * rate to a set percentage (70 by default) and hold 4048 * that for a number of recent GP Srtt's. 4049 */ 4050 uint32_t segsiz; 4051 4052 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 4053 if (rack->rc_gp_dyn_mul == 0) 4054 return; 4055 4056 if (rack->rc_tp->snd_max == rack->rc_tp->snd_una) { 4057 /* We are idle */ 4058 return; 4059 } 4060 if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && 4061 SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { 4062 /* 4063 * Stop the goodput now, the idea here is 4064 * that future measurements with in_probe_rtt 4065 * won't register if they are not greater so 4066 * we want to get what info (if any) is available 4067 * now. 4068 */ 4069 rack_do_goodput_measurement(rack->rc_tp, rack, 4070 rack->rc_tp->snd_una, __LINE__, 4071 RACK_QUALITY_PROBERTT); 4072 } 4073 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 4074 rack->r_ctl.rc_time_probertt_entered = us_cts; 4075 segsiz = min(ctf_fixed_maxseg(rack->rc_tp), 4076 rack->r_ctl.rc_pace_min_segs); 4077 rack->in_probe_rtt = 1; 4078 rack->measure_saw_probe_rtt = 1; 4079 rack->r_ctl.rc_time_probertt_starts = 0; 4080 rack->r_ctl.rc_entry_gp_rtt = rack->r_ctl.rc_gp_srtt; 4081 if (rack_probertt_use_min_rtt_entry) 4082 rack_set_prtt_target(rack, segsiz, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); 4083 else 4084 rack_set_prtt_target(rack, segsiz, rack->r_ctl.rc_gp_srtt); 4085 rack_log_rtt_shrinks(rack, us_cts, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4086 __LINE__, RACK_RTTS_ENTERPROBE); 4087 } 4088 4089 static void 4090 rack_exit_probertt(struct tcp_rack *rack, uint32_t us_cts) 4091 { 4092 struct rack_sendmap *rsm; 4093 uint32_t segsiz; 4094 4095 segsiz = min(ctf_fixed_maxseg(rack->rc_tp), 4096 rack->r_ctl.rc_pace_min_segs); 4097 rack->in_probe_rtt = 0; 4098 if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && 4099 SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { 4100 /* 4101 * Stop the goodput now, the idea here is 4102 * that future measurements with in_probe_rtt 4103 * won't register if they are not greater so 4104 * we want to get what info (if any) is available 4105 * now. 4106 */ 4107 rack_do_goodput_measurement(rack->rc_tp, rack, 4108 rack->rc_tp->snd_una, __LINE__, 4109 RACK_QUALITY_PROBERTT); 4110 } else if (rack->rc_tp->t_flags & TF_GPUTINPROG) { 4111 /* 4112 * We don't have enough data to make a measurement. 4113 * So lets just stop and start here after exiting 4114 * probe-rtt. We probably are not interested in 4115 * the results anyway. 4116 */ 4117 rack->rc_tp->t_flags &= ~TF_GPUTINPROG; 4118 } 4119 /* 4120 * Measurements through the current snd_max are going 4121 * to be limited by the slower pacing rate. 4122 * 4123 * We need to mark these as app-limited so we 4124 * don't collapse the b/w. 4125 */ 4126 rsm = tqhash_max(rack->r_ctl.tqh); 4127 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { 4128 if (rack->r_ctl.rc_app_limited_cnt == 0) 4129 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; 4130 else { 4131 /* 4132 * Go out to the end app limited and mark 4133 * this new one as next and move the end_appl up 4134 * to this guy. 4135 */ 4136 if (rack->r_ctl.rc_end_appl) 4137 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; 4138 rack->r_ctl.rc_end_appl = rsm; 4139 } 4140 rsm->r_flags |= RACK_APP_LIMITED; 4141 rack->r_ctl.rc_app_limited_cnt++; 4142 } 4143 /* 4144 * Now, we need to examine our pacing rate multipliers. 4145 * If its under 100%, we need to kick it back up to 4146 * 100%. We also don't let it be over our "max" above 4147 * the actual rate i.e. 100% + rack_clamp_atexit_prtt. 4148 * Note setting clamp_atexit_prtt to 0 has the effect 4149 * of setting CA/SS to 100% always at exit (which is 4150 * the default behavior). 4151 */ 4152 if (rack_probertt_clear_is) { 4153 rack->rc_gp_incr = 0; 4154 rack->rc_gp_bwred = 0; 4155 rack->rc_gp_timely_inc_cnt = 0; 4156 rack->rc_gp_timely_dec_cnt = 0; 4157 } 4158 /* Do we do any clamping at exit? */ 4159 if (rack->rc_highly_buffered && rack_atexit_prtt_hbp) { 4160 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt_hbp; 4161 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt_hbp; 4162 } 4163 if ((rack->rc_highly_buffered == 0) && rack_atexit_prtt) { 4164 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt; 4165 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt; 4166 } 4167 /* 4168 * Lets set rtt_diff to 0, so that we will get a "boost" 4169 * after exiting. 4170 */ 4171 rack->r_ctl.rc_rtt_diff = 0; 4172 4173 /* Clear all flags so we start fresh */ 4174 rack->rc_tp->t_bytes_acked = 0; 4175 rack->rc_tp->t_ccv.flags &= ~CCF_ABC_SENTAWND; 4176 /* 4177 * If configured to, set the cwnd and ssthresh to 4178 * our targets. 4179 */ 4180 if (rack_probe_rtt_sets_cwnd) { 4181 uint64_t ebdp; 4182 uint32_t setto; 4183 4184 /* Set ssthresh so we get into CA once we hit our target */ 4185 if (rack_probertt_use_min_rtt_exit == 1) { 4186 /* Set to min rtt */ 4187 rack_set_prtt_target(rack, segsiz, 4188 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); 4189 } else if (rack_probertt_use_min_rtt_exit == 2) { 4190 /* Set to current gp rtt */ 4191 rack_set_prtt_target(rack, segsiz, 4192 rack->r_ctl.rc_gp_srtt); 4193 } else if (rack_probertt_use_min_rtt_exit == 3) { 4194 /* Set to entry gp rtt */ 4195 rack_set_prtt_target(rack, segsiz, 4196 rack->r_ctl.rc_entry_gp_rtt); 4197 } else { 4198 uint64_t sum; 4199 uint32_t setval; 4200 4201 sum = rack->r_ctl.rc_entry_gp_rtt; 4202 sum *= 10; 4203 sum /= (uint64_t)(max(1, rack->r_ctl.rc_gp_srtt)); 4204 if (sum >= 20) { 4205 /* 4206 * A highly buffered path needs 4207 * cwnd space for timely to work. 4208 * Lets set things up as if 4209 * we are heading back here again. 4210 */ 4211 setval = rack->r_ctl.rc_entry_gp_rtt; 4212 } else if (sum >= 15) { 4213 /* 4214 * Lets take the smaller of the 4215 * two since we are just somewhat 4216 * buffered. 4217 */ 4218 setval = rack->r_ctl.rc_gp_srtt; 4219 if (setval > rack->r_ctl.rc_entry_gp_rtt) 4220 setval = rack->r_ctl.rc_entry_gp_rtt; 4221 } else { 4222 /* 4223 * Here we are not highly buffered 4224 * and should pick the min we can to 4225 * keep from causing loss. 4226 */ 4227 setval = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 4228 } 4229 rack_set_prtt_target(rack, segsiz, 4230 setval); 4231 } 4232 if (rack_probe_rtt_sets_cwnd > 1) { 4233 /* There is a percentage here to boost */ 4234 ebdp = rack->r_ctl.rc_target_probertt_flight; 4235 ebdp *= rack_probe_rtt_sets_cwnd; 4236 ebdp /= 100; 4237 setto = rack->r_ctl.rc_target_probertt_flight + ebdp; 4238 } else 4239 setto = rack->r_ctl.rc_target_probertt_flight; 4240 rack->rc_tp->snd_cwnd = roundup(setto, segsiz); 4241 if (rack->rc_tp->snd_cwnd < (segsiz * rack_timely_min_segs)) { 4242 /* Enforce a min */ 4243 rack->rc_tp->snd_cwnd = segsiz * rack_timely_min_segs; 4244 } 4245 /* If we set in the cwnd also set the ssthresh point so we are in CA */ 4246 rack->rc_tp->snd_ssthresh = (rack->rc_tp->snd_cwnd - 1); 4247 } 4248 rack_log_rtt_shrinks(rack, us_cts, 4249 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4250 __LINE__, RACK_RTTS_EXITPROBE); 4251 /* Clear times last so log has all the info */ 4252 rack->r_ctl.rc_probertt_sndmax_atexit = rack->rc_tp->snd_max; 4253 rack->r_ctl.rc_time_probertt_entered = us_cts; 4254 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 4255 rack->r_ctl.rc_time_of_last_probertt = us_cts; 4256 } 4257 4258 static void 4259 rack_check_probe_rtt(struct tcp_rack *rack, uint32_t us_cts) 4260 { 4261 /* Check in on probe-rtt */ 4262 4263 if (rack->rc_gp_filled == 0) { 4264 /* We do not do p-rtt unless we have gp measurements */ 4265 return; 4266 } 4267 if (rack->in_probe_rtt) { 4268 uint64_t no_overflow; 4269 uint32_t endtime, must_stay; 4270 4271 if (rack->r_ctl.rc_went_idle_time && 4272 ((us_cts - rack->r_ctl.rc_went_idle_time) > rack_min_probertt_hold)) { 4273 /* 4274 * We went idle during prtt, just exit now. 4275 */ 4276 rack_exit_probertt(rack, us_cts); 4277 } else if (rack_probe_rtt_safety_val && 4278 TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered) && 4279 ((us_cts - rack->r_ctl.rc_time_probertt_entered) > rack_probe_rtt_safety_val)) { 4280 /* 4281 * Probe RTT safety value triggered! 4282 */ 4283 rack_log_rtt_shrinks(rack, us_cts, 4284 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4285 __LINE__, RACK_RTTS_SAFETY); 4286 rack_exit_probertt(rack, us_cts); 4287 } 4288 /* Calculate the max we will wait */ 4289 endtime = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_max_drain_wait); 4290 if (rack->rc_highly_buffered) 4291 endtime += (rack->r_ctl.rc_gp_srtt * rack_max_drain_hbp); 4292 /* Calculate the min we must wait */ 4293 must_stay = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_must_drain); 4294 if ((ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.rc_target_probertt_flight) && 4295 TSTMP_LT(us_cts, endtime)) { 4296 uint32_t calc; 4297 /* Do we lower more? */ 4298 no_exit: 4299 if (TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered)) 4300 calc = us_cts - rack->r_ctl.rc_time_probertt_entered; 4301 else 4302 calc = 0; 4303 calc /= max(rack->r_ctl.rc_gp_srtt, 1); 4304 if (calc) { 4305 /* Maybe */ 4306 calc *= rack_per_of_gp_probertt_reduce; 4307 if (calc > rack_per_of_gp_probertt) 4308 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh; 4309 else 4310 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc; 4311 /* Limit it too */ 4312 if (rack->r_ctl.rack_per_of_gp_probertt < rack_per_of_gp_lowthresh) 4313 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh; 4314 } 4315 /* We must reach target or the time set */ 4316 return; 4317 } 4318 if (rack->r_ctl.rc_time_probertt_starts == 0) { 4319 if ((TSTMP_LT(us_cts, must_stay) && 4320 rack->rc_highly_buffered) || 4321 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > 4322 rack->r_ctl.rc_target_probertt_flight)) { 4323 /* We are not past the must_stay time */ 4324 goto no_exit; 4325 } 4326 rack_log_rtt_shrinks(rack, us_cts, 4327 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4328 __LINE__, RACK_RTTS_REACHTARGET); 4329 rack->r_ctl.rc_time_probertt_starts = us_cts; 4330 if (rack->r_ctl.rc_time_probertt_starts == 0) 4331 rack->r_ctl.rc_time_probertt_starts = 1; 4332 /* Restore back to our rate we want to pace at in prtt */ 4333 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 4334 } 4335 /* 4336 * Setup our end time, some number of gp_srtts plus 200ms. 4337 */ 4338 no_overflow = ((uint64_t)rack->r_ctl.rc_gp_srtt * 4339 (uint64_t)rack_probertt_gpsrtt_cnt_mul); 4340 if (rack_probertt_gpsrtt_cnt_div) 4341 endtime = (uint32_t)(no_overflow / (uint64_t)rack_probertt_gpsrtt_cnt_div); 4342 else 4343 endtime = 0; 4344 endtime += rack_min_probertt_hold; 4345 endtime += rack->r_ctl.rc_time_probertt_starts; 4346 if (TSTMP_GEQ(us_cts, endtime)) { 4347 /* yes, exit probertt */ 4348 rack_exit_probertt(rack, us_cts); 4349 } 4350 4351 } else if ((rack->rc_skip_timely == 0) && 4352 (TSTMP_GT(us_cts, rack->r_ctl.rc_lower_rtt_us_cts)) && 4353 ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt)) { 4354 /* Go into probertt, its been too long since we went lower */ 4355 rack_enter_probertt(rack, us_cts); 4356 } 4357 } 4358 4359 static void 4360 rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last_bw_est, 4361 uint32_t rtt, int32_t rtt_diff) 4362 { 4363 uint64_t cur_bw, up_bnd, low_bnd, subfr; 4364 uint32_t losses; 4365 4366 if ((rack->rc_gp_dyn_mul == 0) || 4367 (rack->use_fixed_rate) || 4368 (rack->in_probe_rtt) || 4369 (rack->rc_always_pace == 0)) { 4370 /* No dynamic GP multiplier in play */ 4371 return; 4372 } 4373 losses = rack->r_ctl.rc_loss_count - rack->r_ctl.rc_loss_at_start; 4374 cur_bw = rack_get_bw(rack); 4375 /* Calculate our up and down range */ 4376 up_bnd = rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_up; 4377 up_bnd /= 100; 4378 up_bnd += rack->r_ctl.last_gp_comp_bw; 4379 4380 subfr = (uint64_t)rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_down; 4381 subfr /= 100; 4382 low_bnd = rack->r_ctl.last_gp_comp_bw - subfr; 4383 if ((timely_says == 2) && (rack->r_ctl.rc_no_push_at_mrtt)) { 4384 /* 4385 * This is the case where our RTT is above 4386 * the max target and we have been configured 4387 * to just do timely no bonus up stuff in that case. 4388 * 4389 * There are two configurations, set to 1, and we 4390 * just do timely if we are over our max. If its 4391 * set above 1 then we slam the multipliers down 4392 * to 100 and then decrement per timely. 4393 */ 4394 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 4395 __LINE__, 3); 4396 if (rack->r_ctl.rc_no_push_at_mrtt > 1) 4397 rack_validate_multipliers_at_or_below_100(rack); 4398 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); 4399 } else if ((timely_says != 0) && (last_bw_est < low_bnd) && !losses) { 4400 /* 4401 * We are decreasing this is a bit complicated this 4402 * means we are loosing ground. This could be 4403 * because another flow entered and we are competing 4404 * for b/w with it. This will push the RTT up which 4405 * makes timely unusable unless we want to get shoved 4406 * into a corner and just be backed off (the age 4407 * old problem with delay based CC). 4408 * 4409 * On the other hand if it was a route change we 4410 * would like to stay somewhat contained and not 4411 * blow out the buffers. 4412 */ 4413 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 4414 __LINE__, 3); 4415 rack->r_ctl.last_gp_comp_bw = cur_bw; 4416 if (rack->rc_gp_bwred == 0) { 4417 /* Go into reduction counting */ 4418 rack->rc_gp_bwred = 1; 4419 rack->rc_gp_timely_dec_cnt = 0; 4420 } 4421 if (rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) { 4422 /* 4423 * Push another time with a faster pacing 4424 * to try to gain back (we include override to 4425 * get a full raise factor). 4426 */ 4427 if ((rack->rc_gp_saw_ca && rack->r_ctl.rack_per_of_gp_ca <= rack_down_raise_thresh) || 4428 (rack->rc_gp_saw_ss && rack->r_ctl.rack_per_of_gp_ss <= rack_down_raise_thresh) || 4429 (timely_says == 0) || 4430 (rack_down_raise_thresh == 0)) { 4431 /* 4432 * Do an override up in b/w if we were 4433 * below the threshold or if the threshold 4434 * is zero we always do the raise. 4435 */ 4436 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 1); 4437 } else { 4438 /* Log it stays the same */ 4439 rack_log_timely(rack, 0, last_bw_est, low_bnd, 0, 4440 __LINE__, 11); 4441 } 4442 rack->rc_gp_timely_dec_cnt++; 4443 /* We are not incrementing really no-count */ 4444 rack->rc_gp_incr = 0; 4445 rack->rc_gp_timely_inc_cnt = 0; 4446 } else { 4447 /* 4448 * Lets just use the RTT 4449 * information and give up 4450 * pushing. 4451 */ 4452 goto use_timely; 4453 } 4454 } else if ((timely_says != 2) && 4455 !losses && 4456 (last_bw_est > up_bnd)) { 4457 /* 4458 * We are increasing b/w lets keep going, updating 4459 * our b/w and ignoring any timely input, unless 4460 * of course we are at our max raise (if there is one). 4461 */ 4462 4463 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 4464 __LINE__, 3); 4465 rack->r_ctl.last_gp_comp_bw = cur_bw; 4466 if (rack->rc_gp_saw_ss && 4467 rack->r_ctl.rack_per_upper_bound_ss && 4468 (rack->r_ctl.rack_per_of_gp_ss == rack->r_ctl.rack_per_upper_bound_ss)) { 4469 /* 4470 * In cases where we can't go higher 4471 * we should just use timely. 4472 */ 4473 goto use_timely; 4474 } 4475 if (rack->rc_gp_saw_ca && 4476 rack->r_ctl.rack_per_upper_bound_ca && 4477 (rack->r_ctl.rack_per_of_gp_ca == rack->r_ctl.rack_per_upper_bound_ca)) { 4478 /* 4479 * In cases where we can't go higher 4480 * we should just use timely. 4481 */ 4482 goto use_timely; 4483 } 4484 rack->rc_gp_bwred = 0; 4485 rack->rc_gp_timely_dec_cnt = 0; 4486 /* You get a set number of pushes if timely is trying to reduce */ 4487 if ((rack->rc_gp_incr < rack_timely_max_push_rise) || (timely_says == 0)) { 4488 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 4489 } else { 4490 /* Log it stays the same */ 4491 rack_log_timely(rack, 0, last_bw_est, up_bnd, 0, 4492 __LINE__, 12); 4493 } 4494 return; 4495 } else { 4496 /* 4497 * We are staying between the lower and upper range bounds 4498 * so use timely to decide. 4499 */ 4500 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 4501 __LINE__, 3); 4502 use_timely: 4503 if (timely_says) { 4504 rack->rc_gp_incr = 0; 4505 rack->rc_gp_timely_inc_cnt = 0; 4506 if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) && 4507 !losses && 4508 (last_bw_est < low_bnd)) { 4509 /* We are loosing ground */ 4510 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 4511 rack->rc_gp_timely_dec_cnt++; 4512 /* We are not incrementing really no-count */ 4513 rack->rc_gp_incr = 0; 4514 rack->rc_gp_timely_inc_cnt = 0; 4515 } else 4516 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); 4517 } else { 4518 rack->rc_gp_bwred = 0; 4519 rack->rc_gp_timely_dec_cnt = 0; 4520 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 4521 } 4522 } 4523 } 4524 4525 static int32_t 4526 rack_make_timely_judgement(struct tcp_rack *rack, uint32_t rtt, int32_t rtt_diff, uint32_t prev_rtt) 4527 { 4528 int32_t timely_says; 4529 uint64_t log_mult, log_rtt_a_diff; 4530 4531 log_rtt_a_diff = rtt; 4532 log_rtt_a_diff <<= 32; 4533 log_rtt_a_diff |= (uint32_t)rtt_diff; 4534 if (rtt >= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * 4535 rack_gp_rtt_maxmul)) { 4536 /* Reduce the b/w multiplier */ 4537 timely_says = 2; 4538 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; 4539 log_mult <<= 32; 4540 log_mult |= prev_rtt; 4541 rack_log_timely(rack, timely_says, log_mult, 4542 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4543 log_rtt_a_diff, __LINE__, 4); 4544 } else if (rtt <= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) + 4545 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) / 4546 max(rack_gp_rtt_mindiv , 1)))) { 4547 /* Increase the b/w multiplier */ 4548 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) + 4549 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) / 4550 max(rack_gp_rtt_mindiv , 1)); 4551 log_mult <<= 32; 4552 log_mult |= prev_rtt; 4553 timely_says = 0; 4554 rack_log_timely(rack, timely_says, log_mult , 4555 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4556 log_rtt_a_diff, __LINE__, 5); 4557 } else { 4558 /* 4559 * Use a gradient to find it the timely gradient 4560 * is: 4561 * grad = rc_rtt_diff / min_rtt; 4562 * 4563 * anything below or equal to 0 will be 4564 * a increase indication. Anything above 4565 * zero is a decrease. Note we take care 4566 * of the actual gradient calculation 4567 * in the reduction (its not needed for 4568 * increase). 4569 */ 4570 log_mult = prev_rtt; 4571 if (rtt_diff <= 0) { 4572 /* 4573 * Rttdiff is less than zero, increase the 4574 * b/w multiplier (its 0 or negative) 4575 */ 4576 timely_says = 0; 4577 rack_log_timely(rack, timely_says, log_mult, 4578 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 6); 4579 } else { 4580 /* Reduce the b/w multiplier */ 4581 timely_says = 1; 4582 rack_log_timely(rack, timely_says, log_mult, 4583 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 7); 4584 } 4585 } 4586 return (timely_says); 4587 } 4588 4589 static inline int 4590 rack_in_gp_window(struct tcpcb *tp, struct rack_sendmap *rsm) 4591 { 4592 if (SEQ_GEQ(rsm->r_start, tp->gput_seq) && 4593 SEQ_LEQ(rsm->r_end, tp->gput_ack)) { 4594 /** 4595 * This covers the case that the 4596 * resent is completely inside 4597 * the gp range or up to it. 4598 * |----------------| 4599 * |-----| <or> 4600 * |----| 4601 * <or> |---| 4602 */ 4603 return (1); 4604 } else if (SEQ_LT(rsm->r_start, tp->gput_seq) && 4605 SEQ_GT(rsm->r_end, tp->gput_seq)){ 4606 /** 4607 * This covers the case of 4608 * |--------------| 4609 * |-------->| 4610 */ 4611 return (1); 4612 } else if (SEQ_GEQ(rsm->r_start, tp->gput_seq) && 4613 SEQ_LT(rsm->r_start, tp->gput_ack) && 4614 SEQ_GEQ(rsm->r_end, tp->gput_ack)) { 4615 4616 /** 4617 * This covers the case of 4618 * |--------------| 4619 * |-------->| 4620 */ 4621 return (1); 4622 } 4623 return (0); 4624 } 4625 4626 static inline void 4627 rack_mark_in_gp_win(struct tcpcb *tp, struct rack_sendmap *rsm) 4628 { 4629 4630 if ((tp->t_flags & TF_GPUTINPROG) == 0) 4631 return; 4632 /* 4633 * We have a Goodput measurement in progress. Mark 4634 * the send if its within the window. If its not 4635 * in the window make sure it does not have the mark. 4636 */ 4637 if (rack_in_gp_window(tp, rsm)) 4638 rsm->r_flags |= RACK_IN_GP_WIN; 4639 else 4640 rsm->r_flags &= ~RACK_IN_GP_WIN; 4641 } 4642 4643 static inline void 4644 rack_clear_gp_marks(struct tcpcb *tp, struct tcp_rack *rack) 4645 { 4646 /* A GP measurement is ending, clear all marks on the send map*/ 4647 struct rack_sendmap *rsm = NULL; 4648 4649 rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq); 4650 if (rsm == NULL) { 4651 rsm = tqhash_min(rack->r_ctl.tqh); 4652 } 4653 /* Nothing left? */ 4654 while ((rsm != NULL) && (SEQ_GEQ(tp->gput_ack, rsm->r_start))){ 4655 rsm->r_flags &= ~RACK_IN_GP_WIN; 4656 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 4657 } 4658 } 4659 4660 4661 static inline void 4662 rack_tend_gp_marks(struct tcpcb *tp, struct tcp_rack *rack) 4663 { 4664 struct rack_sendmap *rsm = NULL; 4665 4666 if (tp->snd_una == tp->snd_max) { 4667 /* Nothing outstanding yet, nothing to do here */ 4668 return; 4669 } 4670 if (SEQ_GT(tp->gput_seq, tp->snd_una)) { 4671 /* 4672 * We are measuring ahead of some outstanding 4673 * data. We need to walk through up until we get 4674 * to gp_seq marking so that no rsm is set incorrectly 4675 * with RACK_IN_GP_WIN. 4676 */ 4677 rsm = tqhash_min(rack->r_ctl.tqh); 4678 while (rsm != NULL) { 4679 rack_mark_in_gp_win(tp, rsm); 4680 if (SEQ_GEQ(rsm->r_end, tp->gput_seq)) 4681 break; 4682 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 4683 } 4684 } 4685 if (rsm == NULL) { 4686 /* 4687 * Need to find the GP seq, if rsm is 4688 * set we stopped as we hit it. 4689 */ 4690 rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq); 4691 if (rsm == NULL) 4692 return; 4693 rack_mark_in_gp_win(tp, rsm); 4694 } 4695 /* 4696 * Now we may need to mark already sent rsm, ahead of 4697 * gput_seq in the window since they may have been sent 4698 * *before* we started our measurment. The rsm, if non-null 4699 * has been marked (note if rsm would have been NULL we would have 4700 * returned in the previous block). So we go to the next, and continue 4701 * until we run out of entries or we exceed the gp_ack value. 4702 */ 4703 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 4704 while (rsm) { 4705 rack_mark_in_gp_win(tp, rsm); 4706 if (SEQ_GT(rsm->r_end, tp->gput_ack)) 4707 break; 4708 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 4709 } 4710 } 4711 4712 static void 4713 rack_log_gp_calc(struct tcp_rack *rack, uint32_t add_part, uint32_t sub_part, uint32_t srtt, uint64_t meas_bw, uint64_t utim, uint8_t meth, uint32_t line) 4714 { 4715 if (tcp_bblogging_on(rack->rc_tp)) { 4716 union tcp_log_stackspecific log; 4717 struct timeval tv; 4718 4719 memset(&log, 0, sizeof(log)); 4720 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 4721 log.u_bbr.flex1 = add_part; 4722 log.u_bbr.flex2 = sub_part; 4723 log.u_bbr.flex3 = rack_wma_divisor; 4724 log.u_bbr.flex4 = srtt; 4725 log.u_bbr.flex7 = (uint16_t)line; 4726 log.u_bbr.flex8 = meth; 4727 log.u_bbr.delRate = rack->r_ctl.gp_bw; 4728 log.u_bbr.cur_del_rate = meas_bw; 4729 log.u_bbr.rttProp = utim; 4730 TCP_LOG_EVENTP(rack->rc_tp, NULL, 4731 &rack->rc_inp->inp_socket->so_rcv, 4732 &rack->rc_inp->inp_socket->so_snd, 4733 BBR_LOG_THRESH_CALC, 0, 4734 0, &log, false, &rack->r_ctl.act_rcv_time); 4735 } 4736 } 4737 4738 static void 4739 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, 4740 tcp_seq th_ack, int line, uint8_t quality) 4741 { 4742 uint64_t tim, bytes_ps, stim, utim; 4743 uint32_t segsiz, bytes, reqbytes, us_cts; 4744 int32_t gput, new_rtt_diff, timely_says; 4745 uint64_t resid_bw, subpart = 0, addpart = 0, srtt; 4746 int did_add = 0; 4747 4748 us_cts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time); 4749 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 4750 if (TSTMP_GEQ(us_cts, tp->gput_ts)) 4751 tim = us_cts - tp->gput_ts; 4752 else 4753 tim = 0; 4754 if (rack->r_ctl.rc_gp_cumack_ts > rack->r_ctl.rc_gp_output_ts) 4755 stim = rack->r_ctl.rc_gp_cumack_ts - rack->r_ctl.rc_gp_output_ts; 4756 else 4757 stim = 0; 4758 /* 4759 * Use the larger of the send time or ack time. This prevents us 4760 * from being influenced by ack artifacts to come up with too 4761 * high of measurement. Note that since we are spanning over many more 4762 * bytes in most of our measurements hopefully that is less likely to 4763 * occur. 4764 */ 4765 if (tim > stim) 4766 utim = max(tim, 1); 4767 else 4768 utim = max(stim, 1); 4769 reqbytes = min(rc_init_window(rack), (MIN_GP_WIN * segsiz)); 4770 rack_log_gpset(rack, th_ack, us_cts, rack->r_ctl.rc_gp_cumack_ts, __LINE__, 3, NULL); 4771 if ((tim == 0) && (stim == 0)) { 4772 /* 4773 * Invalid measurement time, maybe 4774 * all on one ack/one send? 4775 */ 4776 bytes = 0; 4777 bytes_ps = 0; 4778 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4779 0, 0, 0, 10, __LINE__, NULL, quality); 4780 goto skip_measurement; 4781 } 4782 if (rack->r_ctl.rc_gp_lowrtt == 0xffffffff) { 4783 /* We never made a us_rtt measurement? */ 4784 bytes = 0; 4785 bytes_ps = 0; 4786 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4787 0, 0, 0, 10, __LINE__, NULL, quality); 4788 goto skip_measurement; 4789 } 4790 /* 4791 * Calculate the maximum possible b/w this connection 4792 * could have. We base our calculation on the lowest 4793 * rtt we have seen during the measurement and the 4794 * largest rwnd the client has given us in that time. This 4795 * forms a BDP that is the maximum that we could ever 4796 * get to the client. Anything larger is not valid. 4797 * 4798 * I originally had code here that rejected measurements 4799 * where the time was less than 1/2 the latest us_rtt. 4800 * But after thinking on that I realized its wrong since 4801 * say you had a 150Mbps or even 1Gbps link, and you 4802 * were a long way away.. example I am in Europe (100ms rtt) 4803 * talking to my 1Gbps link in S.C. Now measuring say 150,000 4804 * bytes my time would be 1.2ms, and yet my rtt would say 4805 * the measurement was invalid the time was < 50ms. The 4806 * same thing is true for 150Mb (8ms of time). 4807 * 4808 * A better way I realized is to look at what the maximum 4809 * the connection could possibly do. This is gated on 4810 * the lowest RTT we have seen and the highest rwnd. 4811 * We should in theory never exceed that, if we are 4812 * then something on the path is storing up packets 4813 * and then feeding them all at once to our endpoint 4814 * messing up our measurement. 4815 */ 4816 rack->r_ctl.last_max_bw = rack->r_ctl.rc_gp_high_rwnd; 4817 rack->r_ctl.last_max_bw *= HPTS_USEC_IN_SEC; 4818 rack->r_ctl.last_max_bw /= rack->r_ctl.rc_gp_lowrtt; 4819 if (SEQ_LT(th_ack, tp->gput_seq)) { 4820 /* No measurement can be made */ 4821 bytes = 0; 4822 bytes_ps = 0; 4823 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4824 0, 0, 0, 10, __LINE__, NULL, quality); 4825 goto skip_measurement; 4826 } else 4827 bytes = (th_ack - tp->gput_seq); 4828 bytes_ps = (uint64_t)bytes; 4829 /* 4830 * Don't measure a b/w for pacing unless we have gotten at least 4831 * an initial windows worth of data in this measurement interval. 4832 * 4833 * Small numbers of bytes get badly influenced by delayed ack and 4834 * other artifacts. Note we take the initial window or our 4835 * defined minimum GP (defaulting to 10 which hopefully is the 4836 * IW). 4837 */ 4838 if (rack->rc_gp_filled == 0) { 4839 /* 4840 * The initial estimate is special. We 4841 * have blasted out an IW worth of packets 4842 * without a real valid ack ts results. We 4843 * then setup the app_limited_needs_set flag, 4844 * this should get the first ack in (probably 2 4845 * MSS worth) to be recorded as the timestamp. 4846 * We thus allow a smaller number of bytes i.e. 4847 * IW - 2MSS. 4848 */ 4849 reqbytes -= (2 * segsiz); 4850 /* Also lets fill previous for our first measurement to be neutral */ 4851 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; 4852 } 4853 if ((bytes_ps < reqbytes) || rack->app_limited_needs_set) { 4854 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4855 rack->r_ctl.rc_app_limited_cnt, 4856 0, 0, 10, __LINE__, NULL, quality); 4857 goto skip_measurement; 4858 } 4859 /* 4860 * We now need to calculate the Timely like status so 4861 * we can update (possibly) the b/w multipliers. 4862 */ 4863 new_rtt_diff = (int32_t)rack->r_ctl.rc_gp_srtt - (int32_t)rack->r_ctl.rc_prev_gp_srtt; 4864 if (rack->rc_gp_filled == 0) { 4865 /* No previous reading */ 4866 rack->r_ctl.rc_rtt_diff = new_rtt_diff; 4867 } else { 4868 if (rack->measure_saw_probe_rtt == 0) { 4869 /* 4870 * We don't want a probertt to be counted 4871 * since it will be negative incorrectly. We 4872 * expect to be reducing the RTT when we 4873 * pace at a slower rate. 4874 */ 4875 rack->r_ctl.rc_rtt_diff -= (rack->r_ctl.rc_rtt_diff / 8); 4876 rack->r_ctl.rc_rtt_diff += (new_rtt_diff / 8); 4877 } 4878 } 4879 timely_says = rack_make_timely_judgement(rack, 4880 rack->r_ctl.rc_gp_srtt, 4881 rack->r_ctl.rc_rtt_diff, 4882 rack->r_ctl.rc_prev_gp_srtt 4883 ); 4884 bytes_ps *= HPTS_USEC_IN_SEC; 4885 bytes_ps /= utim; 4886 if (bytes_ps > rack->r_ctl.last_max_bw) { 4887 /* 4888 * Something is on path playing 4889 * since this b/w is not possible based 4890 * on our BDP (highest rwnd and lowest rtt 4891 * we saw in the measurement window). 4892 * 4893 * Another option here would be to 4894 * instead skip the measurement. 4895 */ 4896 rack_log_pacing_delay_calc(rack, bytes, reqbytes, 4897 bytes_ps, rack->r_ctl.last_max_bw, 0, 4898 11, __LINE__, NULL, quality); 4899 bytes_ps = rack->r_ctl.last_max_bw; 4900 } 4901 /* We store gp for b/w in bytes per second */ 4902 if (rack->rc_gp_filled == 0) { 4903 /* Initial measurement */ 4904 if (bytes_ps) { 4905 rack->r_ctl.gp_bw = bytes_ps; 4906 rack->rc_gp_filled = 1; 4907 rack->r_ctl.num_measurements = 1; 4908 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 4909 } else { 4910 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4911 rack->r_ctl.rc_app_limited_cnt, 4912 0, 0, 10, __LINE__, NULL, quality); 4913 } 4914 if (tcp_in_hpts(rack->rc_tp) && 4915 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 4916 /* 4917 * Ok we can't trust the pacer in this case 4918 * where we transition from un-paced to paced. 4919 * Or for that matter when the burst mitigation 4920 * was making a wild guess and got it wrong. 4921 * Stop the pacer and clear up all the aggregate 4922 * delays etc. 4923 */ 4924 tcp_hpts_remove(rack->rc_tp); 4925 rack->r_ctl.rc_hpts_flags = 0; 4926 rack->r_ctl.rc_last_output_to = 0; 4927 } 4928 did_add = 2; 4929 } else if (rack->r_ctl.num_measurements < RACK_REQ_AVG) { 4930 /* Still a small number run an average */ 4931 rack->r_ctl.gp_bw += bytes_ps; 4932 addpart = rack->r_ctl.num_measurements; 4933 rack->r_ctl.num_measurements++; 4934 if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) { 4935 /* We have collected enough to move forward */ 4936 rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_measurements; 4937 } 4938 rack_set_pace_segments(tp, rack, __LINE__, NULL); 4939 did_add = 3; 4940 } else { 4941 /* 4942 * We want to take 1/wma of the goodput and add in to 7/8th 4943 * of the old value weighted by the srtt. So if your measurement 4944 * period is say 2 SRTT's long you would get 1/4 as the 4945 * value, if it was like 1/2 SRTT then you would get 1/16th. 4946 * 4947 * But we must be careful not to take too much i.e. if the 4948 * srtt is say 20ms and the measurement is taken over 4949 * 400ms our weight would be 400/20 i.e. 20. On the 4950 * other hand if we get a measurement over 1ms with a 4951 * 10ms rtt we only want to take a much smaller portion. 4952 */ 4953 uint8_t meth; 4954 4955 if (rack->r_ctl.num_measurements < 0xff) { 4956 rack->r_ctl.num_measurements++; 4957 } 4958 srtt = (uint64_t)tp->t_srtt; 4959 if (srtt == 0) { 4960 /* 4961 * Strange why did t_srtt go back to zero? 4962 */ 4963 if (rack->r_ctl.rc_rack_min_rtt) 4964 srtt = rack->r_ctl.rc_rack_min_rtt; 4965 else 4966 srtt = HPTS_USEC_IN_MSEC; 4967 } 4968 /* 4969 * XXXrrs: Note for reviewers, in playing with 4970 * dynamic pacing I discovered this GP calculation 4971 * as done originally leads to some undesired results. 4972 * Basically you can get longer measurements contributing 4973 * too much to the WMA. Thus I changed it if you are doing 4974 * dynamic adjustments to only do the aportioned adjustment 4975 * if we have a very small (time wise) measurement. Longer 4976 * measurements just get there weight (defaulting to 1/8) 4977 * add to the WMA. We may want to think about changing 4978 * this to always do that for both sides i.e. dynamic 4979 * and non-dynamic... but considering lots of folks 4980 * were playing with this I did not want to change the 4981 * calculation per.se. without your thoughts.. Lawerence? 4982 * Peter?? 4983 */ 4984 if (rack->rc_gp_dyn_mul == 0) { 4985 subpart = rack->r_ctl.gp_bw * utim; 4986 subpart /= (srtt * 8); 4987 if (subpart < (rack->r_ctl.gp_bw / 2)) { 4988 /* 4989 * The b/w update takes no more 4990 * away then 1/2 our running total 4991 * so factor it in. 4992 */ 4993 addpart = bytes_ps * utim; 4994 addpart /= (srtt * 8); 4995 meth = 1; 4996 } else { 4997 /* 4998 * Don't allow a single measurement 4999 * to account for more than 1/2 of the 5000 * WMA. This could happen on a retransmission 5001 * where utim becomes huge compared to 5002 * srtt (multiple retransmissions when using 5003 * the sending rate which factors in all the 5004 * transmissions from the first one). 5005 */ 5006 subpart = rack->r_ctl.gp_bw / 2; 5007 addpart = bytes_ps / 2; 5008 meth = 2; 5009 } 5010 rack_log_gp_calc(rack, addpart, subpart, srtt, bytes_ps, utim, meth, __LINE__); 5011 resid_bw = rack->r_ctl.gp_bw - subpart; 5012 rack->r_ctl.gp_bw = resid_bw + addpart; 5013 did_add = 1; 5014 } else { 5015 if ((utim / srtt) <= 1) { 5016 /* 5017 * The b/w update was over a small period 5018 * of time. The idea here is to prevent a small 5019 * measurement time period from counting 5020 * too much. So we scale it based on the 5021 * time so it attributes less than 1/rack_wma_divisor 5022 * of its measurement. 5023 */ 5024 subpart = rack->r_ctl.gp_bw * utim; 5025 subpart /= (srtt * rack_wma_divisor); 5026 addpart = bytes_ps * utim; 5027 addpart /= (srtt * rack_wma_divisor); 5028 meth = 3; 5029 } else { 5030 /* 5031 * The scaled measurement was long 5032 * enough so lets just add in the 5033 * portion of the measurement i.e. 1/rack_wma_divisor 5034 */ 5035 subpart = rack->r_ctl.gp_bw / rack_wma_divisor; 5036 addpart = bytes_ps / rack_wma_divisor; 5037 meth = 4; 5038 } 5039 if ((rack->measure_saw_probe_rtt == 0) || 5040 (bytes_ps > rack->r_ctl.gp_bw)) { 5041 /* 5042 * For probe-rtt we only add it in 5043 * if its larger, all others we just 5044 * add in. 5045 */ 5046 did_add = 1; 5047 rack_log_gp_calc(rack, addpart, subpart, srtt, bytes_ps, utim, meth, __LINE__); 5048 resid_bw = rack->r_ctl.gp_bw - subpart; 5049 rack->r_ctl.gp_bw = resid_bw + addpart; 5050 } 5051 } 5052 rack_set_pace_segments(tp, rack, __LINE__, NULL); 5053 } 5054 /* 5055 * We only watch the growth of the GP during the initial startup 5056 * or first-slowstart that ensues. If we ever needed to watch 5057 * growth of gp outside of that period all we need to do is 5058 * remove the first clause of this if (rc_initial_ss_comp). 5059 */ 5060 if ((rack->rc_initial_ss_comp == 0) && 5061 (rack->r_ctl.num_measurements >= RACK_REQ_AVG)) { 5062 uint64_t gp_est; 5063 5064 gp_est = bytes_ps; 5065 if (tcp_bblogging_on(rack->rc_tp)) { 5066 union tcp_log_stackspecific log; 5067 struct timeval tv; 5068 5069 memset(&log, 0, sizeof(log)); 5070 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 5071 log.u_bbr.flex1 = rack->r_ctl.current_round; 5072 log.u_bbr.flex2 = rack->r_ctl.last_rnd_of_gp_rise; 5073 log.u_bbr.delRate = gp_est; 5074 log.u_bbr.cur_del_rate = rack->r_ctl.last_gpest; 5075 log.u_bbr.flex8 = 41; 5076 (void)tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 5077 0, &log, false, NULL, __func__, __LINE__,&tv); 5078 } 5079 if ((rack->r_ctl.num_measurements == RACK_REQ_AVG) || 5080 (rack->r_ctl.last_gpest == 0)) { 5081 /* 5082 * The round we get our measurement averaging going 5083 * is the base round so it always is the source point 5084 * for when we had our first increment. From there on 5085 * we only record the round that had a rise. 5086 */ 5087 rack->r_ctl.last_rnd_of_gp_rise = rack->r_ctl.current_round; 5088 rack->r_ctl.last_gpest = rack->r_ctl.gp_bw; 5089 } else if (gp_est >= rack->r_ctl.last_gpest) { 5090 /* 5091 * Test to see if its gone up enough 5092 * to set the round count up to now. Note 5093 * that on the seeding of the 4th measurement we 5094 */ 5095 gp_est *= 1000; 5096 gp_est /= rack->r_ctl.last_gpest; 5097 if ((uint32_t)gp_est > rack->r_ctl.gp_gain_req) { 5098 /* 5099 * We went up enough to record the round. 5100 */ 5101 if (tcp_bblogging_on(rack->rc_tp)) { 5102 union tcp_log_stackspecific log; 5103 struct timeval tv; 5104 5105 memset(&log, 0, sizeof(log)); 5106 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 5107 log.u_bbr.flex1 = rack->r_ctl.current_round; 5108 log.u_bbr.flex2 = (uint32_t)gp_est; 5109 log.u_bbr.flex3 = rack->r_ctl.gp_gain_req; 5110 log.u_bbr.delRate = gp_est; 5111 log.u_bbr.cur_del_rate = rack->r_ctl.last_gpest; 5112 log.u_bbr.flex8 = 42; 5113 (void)tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 5114 0, &log, false, NULL, __func__, __LINE__,&tv); 5115 } 5116 rack->r_ctl.last_rnd_of_gp_rise = rack->r_ctl.current_round; 5117 if (rack->r_ctl.use_gp_not_last == 1) 5118 rack->r_ctl.last_gpest = rack->r_ctl.gp_bw; 5119 else 5120 rack->r_ctl.last_gpest = bytes_ps; 5121 } 5122 } 5123 } 5124 if ((rack->gp_ready == 0) && 5125 (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) { 5126 /* We have enough measurements now */ 5127 rack->gp_ready = 1; 5128 if (rack->dgp_on || 5129 rack->rack_hibeta) 5130 rack_set_cc_pacing(rack); 5131 if (rack->defer_options) 5132 rack_apply_deferred_options(rack); 5133 } 5134 rack_log_pacing_delay_calc(rack, subpart, addpart, bytes_ps, stim, 5135 rack_get_bw(rack), 22, did_add, NULL, quality); 5136 /* We do not update any multipliers if we are in or have seen a probe-rtt */ 5137 5138 if ((rack->measure_saw_probe_rtt == 0) && 5139 rack->rc_gp_rtt_set) { 5140 if (rack->rc_skip_timely == 0) { 5141 rack_update_multiplier(rack, timely_says, bytes_ps, 5142 rack->r_ctl.rc_gp_srtt, 5143 rack->r_ctl.rc_rtt_diff); 5144 } 5145 } 5146 rack_log_pacing_delay_calc(rack, bytes, tim, bytes_ps, stim, 5147 rack_get_bw(rack), 3, line, NULL, quality); 5148 rack_log_pacing_delay_calc(rack, 5149 bytes, /* flex2 */ 5150 tim, /* flex1 */ 5151 bytes_ps, /* bw_inuse */ 5152 rack->r_ctl.gp_bw, /* delRate */ 5153 rack_get_lt_bw(rack), /* rttProp */ 5154 20, line, NULL, 0); 5155 /* reset the gp srtt and setup the new prev */ 5156 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; 5157 /* Record the lost count for the next measurement */ 5158 rack->r_ctl.rc_loss_at_start = rack->r_ctl.rc_loss_count; 5159 skip_measurement: 5160 /* 5161 * We restart our diffs based on the gpsrtt in the 5162 * measurement window. 5163 */ 5164 rack->rc_gp_rtt_set = 0; 5165 rack->rc_gp_saw_rec = 0; 5166 rack->rc_gp_saw_ca = 0; 5167 rack->rc_gp_saw_ss = 0; 5168 rack->rc_dragged_bottom = 0; 5169 if (quality == RACK_QUALITY_HIGH) { 5170 /* 5171 * Gput in the stats world is in kbps where bytes_ps is 5172 * bytes per second so we do ((x * 8)/ 1000). 5173 */ 5174 gput = (int32_t)((bytes_ps << 3) / (uint64_t)1000); 5175 #ifdef STATS 5176 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, 5177 gput); 5178 /* 5179 * XXXLAS: This is a temporary hack, and should be 5180 * chained off VOI_TCP_GPUT when stats(9) grows an 5181 * API to deal with chained VOIs. 5182 */ 5183 if (tp->t_stats_gput_prev > 0) 5184 stats_voi_update_abs_s32(tp->t_stats, 5185 VOI_TCP_GPUT_ND, 5186 ((gput - tp->t_stats_gput_prev) * 100) / 5187 tp->t_stats_gput_prev); 5188 #endif 5189 tp->t_stats_gput_prev = gput; 5190 } 5191 tp->t_flags &= ~TF_GPUTINPROG; 5192 /* 5193 * Now are we app limited now and there is space from where we 5194 * were to where we want to go? 5195 * 5196 * We don't do the other case i.e. non-applimited here since 5197 * the next send will trigger us picking up the missing data. 5198 */ 5199 if (rack->r_ctl.rc_first_appl && 5200 TCPS_HAVEESTABLISHED(tp->t_state) && 5201 rack->r_ctl.rc_app_limited_cnt && 5202 (SEQ_GT(rack->r_ctl.rc_first_appl->r_start, th_ack)) && 5203 ((rack->r_ctl.rc_first_appl->r_end - th_ack) > 5204 max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { 5205 /* 5206 * Yep there is enough outstanding to make a measurement here. 5207 */ 5208 struct rack_sendmap *rsm; 5209 5210 rack->r_ctl.rc_gp_lowrtt = 0xffffffff; 5211 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 5212 tp->gput_ts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time); 5213 rack->app_limited_needs_set = 0; 5214 tp->gput_seq = th_ack; 5215 if (rack->in_probe_rtt) 5216 rack->measure_saw_probe_rtt = 1; 5217 else if ((rack->measure_saw_probe_rtt) && 5218 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 5219 rack->measure_saw_probe_rtt = 0; 5220 if ((rack->r_ctl.rc_first_appl->r_end - th_ack) >= rack_get_measure_window(tp, rack)) { 5221 /* There is a full window to gain info from */ 5222 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 5223 } else { 5224 /* We can only measure up to the applimited point */ 5225 tp->gput_ack = tp->gput_seq + (rack->r_ctl.rc_first_appl->r_end - th_ack); 5226 if ((tp->gput_ack - tp->gput_seq) < (MIN_GP_WIN * segsiz)) { 5227 /* 5228 * We don't have enough to make a measurement. 5229 */ 5230 tp->t_flags &= ~TF_GPUTINPROG; 5231 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq, 5232 0, 0, 0, 6, __LINE__, NULL, quality); 5233 return; 5234 } 5235 } 5236 if (tp->t_state >= TCPS_FIN_WAIT_1) { 5237 /* 5238 * We will get no more data into the SB 5239 * this means we need to have the data available 5240 * before we start a measurement. 5241 */ 5242 if (sbavail(&tptosocket(tp)->so_snd) < (tp->gput_ack - tp->gput_seq)) { 5243 /* Nope not enough data. */ 5244 return; 5245 } 5246 } 5247 tp->t_flags |= TF_GPUTINPROG; 5248 /* 5249 * Now we need to find the timestamp of the send at tp->gput_seq 5250 * for the send based measurement. 5251 */ 5252 rack->r_ctl.rc_gp_cumack_ts = 0; 5253 rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq); 5254 if (rsm) { 5255 /* Ok send-based limit is set */ 5256 if (SEQ_LT(rsm->r_start, tp->gput_seq)) { 5257 /* 5258 * Move back to include the earlier part 5259 * so our ack time lines up right (this may 5260 * make an overlapping measurement but thats 5261 * ok). 5262 */ 5263 tp->gput_seq = rsm->r_start; 5264 } 5265 if (rsm->r_flags & RACK_ACKED) { 5266 struct rack_sendmap *nrsm; 5267 5268 tp->gput_ts = (uint32_t)rsm->r_ack_arrival; 5269 tp->gput_seq = rsm->r_end; 5270 nrsm = tqhash_next(rack->r_ctl.tqh, rsm); 5271 if (nrsm) 5272 rsm = nrsm; 5273 else { 5274 rack->app_limited_needs_set = 1; 5275 } 5276 } else 5277 rack->app_limited_needs_set = 1; 5278 /* We always go from the first send */ 5279 rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[0]; 5280 } else { 5281 /* 5282 * If we don't find the rsm due to some 5283 * send-limit set the current time, which 5284 * basically disables the send-limit. 5285 */ 5286 struct timeval tv; 5287 5288 microuptime(&tv); 5289 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); 5290 } 5291 rack_tend_gp_marks(tp, rack); 5292 rack_log_pacing_delay_calc(rack, 5293 tp->gput_seq, 5294 tp->gput_ack, 5295 (uintptr_t)rsm, 5296 tp->gput_ts, 5297 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts), 5298 9, 5299 __LINE__, rsm, quality); 5300 rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL); 5301 } else { 5302 /* 5303 * To make sure proper timestamp merging occurs, we need to clear 5304 * all GP marks if we don't start a measurement. 5305 */ 5306 rack_clear_gp_marks(tp, rack); 5307 } 5308 } 5309 5310 /* 5311 * CC wrapper hook functions 5312 */ 5313 static void 5314 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint16_t nsegs, 5315 uint16_t type, int32_t post_recovery) 5316 { 5317 uint32_t prior_cwnd, acked; 5318 struct tcp_log_buffer *lgb = NULL; 5319 uint8_t labc_to_use, quality; 5320 5321 INP_WLOCK_ASSERT(tptoinpcb(tp)); 5322 tp->t_ccv.nsegs = nsegs; 5323 acked = tp->t_ccv.bytes_this_ack = (th_ack - tp->snd_una); 5324 if ((post_recovery) && (rack->r_ctl.rc_early_recovery_segs)) { 5325 uint32_t max; 5326 5327 max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp); 5328 if (tp->t_ccv.bytes_this_ack > max) { 5329 tp->t_ccv.bytes_this_ack = max; 5330 } 5331 } 5332 #ifdef STATS 5333 stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, 5334 ((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd); 5335 #endif 5336 if ((th_ack == tp->snd_max) && rack->lt_bw_up) { 5337 /* 5338 * We will ack all the data, time to end any 5339 * lt_bw_up we have running until something 5340 * new is sent. Note we need to use the actual 5341 * ack_rcv_time which with pacing may be different. 5342 */ 5343 uint64_t tmark; 5344 5345 rack->r_ctl.lt_bw_bytes += (tp->snd_max - rack->r_ctl.lt_seq); 5346 rack->r_ctl.lt_seq = tp->snd_max; 5347 tmark = tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time); 5348 if (tmark >= rack->r_ctl.lt_timemark) { 5349 rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark); 5350 } 5351 rack->r_ctl.lt_timemark = tmark; 5352 rack->lt_bw_up = 0; 5353 } 5354 quality = RACK_QUALITY_NONE; 5355 if ((tp->t_flags & TF_GPUTINPROG) && 5356 rack_enough_for_measurement(tp, rack, th_ack, &quality)) { 5357 /* Measure the Goodput */ 5358 rack_do_goodput_measurement(tp, rack, th_ack, __LINE__, quality); 5359 } 5360 /* Which way our we limited, if not cwnd limited no advance in CA */ 5361 if (tp->snd_cwnd <= tp->snd_wnd) 5362 tp->t_ccv.flags |= CCF_CWND_LIMITED; 5363 else 5364 tp->t_ccv.flags &= ~CCF_CWND_LIMITED; 5365 if (tp->snd_cwnd > tp->snd_ssthresh) { 5366 tp->t_bytes_acked += min(tp->t_ccv.bytes_this_ack, 5367 nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp)); 5368 /* For the setting of a window past use the actual scwnd we are using */ 5369 if (tp->t_bytes_acked >= rack->r_ctl.cwnd_to_use) { 5370 tp->t_bytes_acked -= rack->r_ctl.cwnd_to_use; 5371 tp->t_ccv.flags |= CCF_ABC_SENTAWND; 5372 } 5373 } else { 5374 tp->t_ccv.flags &= ~CCF_ABC_SENTAWND; 5375 tp->t_bytes_acked = 0; 5376 } 5377 prior_cwnd = tp->snd_cwnd; 5378 if ((post_recovery == 0) || (rack_max_abc_post_recovery == 0) || rack->r_use_labc_for_rec || 5379 (rack_client_low_buf && rack->client_bufferlvl && 5380 (rack->client_bufferlvl < rack_client_low_buf))) 5381 labc_to_use = rack->rc_labc; 5382 else 5383 labc_to_use = rack_max_abc_post_recovery; 5384 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 5385 union tcp_log_stackspecific log; 5386 struct timeval tv; 5387 5388 memset(&log, 0, sizeof(log)); 5389 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 5390 log.u_bbr.flex1 = th_ack; 5391 log.u_bbr.flex2 = tp->t_ccv.flags; 5392 log.u_bbr.flex3 = tp->t_ccv.bytes_this_ack; 5393 log.u_bbr.flex4 = tp->t_ccv.nsegs; 5394 log.u_bbr.flex5 = labc_to_use; 5395 log.u_bbr.flex6 = prior_cwnd; 5396 log.u_bbr.flex7 = V_tcp_do_newsack; 5397 log.u_bbr.flex8 = 1; 5398 lgb = tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 5399 0, &log, false, NULL, __func__, __LINE__,&tv); 5400 } 5401 if (CC_ALGO(tp)->ack_received != NULL) { 5402 /* XXXLAS: Find a way to live without this */ 5403 tp->t_ccv.curack = th_ack; 5404 tp->t_ccv.labc = labc_to_use; 5405 tp->t_ccv.flags |= CCF_USE_LOCAL_ABC; 5406 CC_ALGO(tp)->ack_received(&tp->t_ccv, type); 5407 } 5408 if (lgb) { 5409 lgb->tlb_stackinfo.u_bbr.flex6 = tp->snd_cwnd; 5410 } 5411 if (rack->r_must_retran) { 5412 if (SEQ_GEQ(th_ack, rack->r_ctl.rc_snd_max_at_rto)) { 5413 /* 5414 * We now are beyond the rxt point so lets disable 5415 * the flag. 5416 */ 5417 rack->r_ctl.rc_out_at_rto = 0; 5418 rack->r_must_retran = 0; 5419 } else if ((prior_cwnd + ctf_fixed_maxseg(tp)) <= tp->snd_cwnd) { 5420 /* 5421 * Only decrement the rc_out_at_rto if the cwnd advances 5422 * at least a whole segment. Otherwise next time the peer 5423 * acks, we won't be able to send this generaly happens 5424 * when we are in Congestion Avoidance. 5425 */ 5426 if (acked <= rack->r_ctl.rc_out_at_rto){ 5427 rack->r_ctl.rc_out_at_rto -= acked; 5428 } else { 5429 rack->r_ctl.rc_out_at_rto = 0; 5430 } 5431 } 5432 } 5433 #ifdef STATS 5434 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, rack->r_ctl.cwnd_to_use); 5435 #endif 5436 if (rack->r_ctl.rc_rack_largest_cwnd < rack->r_ctl.cwnd_to_use) { 5437 rack->r_ctl.rc_rack_largest_cwnd = rack->r_ctl.cwnd_to_use; 5438 } 5439 if ((rack->rc_initial_ss_comp == 0) && 5440 (tp->snd_cwnd >= tp->snd_ssthresh)) { 5441 /* 5442 * The cwnd has grown beyond ssthresh we have 5443 * entered ca and completed our first Slowstart. 5444 */ 5445 rack->rc_initial_ss_comp = 1; 5446 } 5447 } 5448 5449 static void 5450 tcp_rack_partialack(struct tcpcb *tp) 5451 { 5452 struct tcp_rack *rack; 5453 5454 rack = (struct tcp_rack *)tp->t_fb_ptr; 5455 INP_WLOCK_ASSERT(tptoinpcb(tp)); 5456 /* 5457 * If we are doing PRR and have enough 5458 * room to send <or> we are pacing and prr 5459 * is disabled we will want to see if we 5460 * can send data (by setting r_wanted_output to 5461 * true). 5462 */ 5463 if ((rack->r_ctl.rc_prr_sndcnt > 0) || 5464 rack->rack_no_prr) 5465 rack->r_wanted_output = 1; 5466 } 5467 5468 static void 5469 rack_exit_recovery(struct tcpcb *tp, struct tcp_rack *rack, int how) 5470 { 5471 /* 5472 * Now exit recovery. 5473 */ 5474 EXIT_RECOVERY(tp->t_flags); 5475 } 5476 5477 static void 5478 rack_post_recovery(struct tcpcb *tp, uint32_t th_ack) 5479 { 5480 struct tcp_rack *rack; 5481 uint32_t orig_cwnd; 5482 5483 orig_cwnd = tp->snd_cwnd; 5484 INP_WLOCK_ASSERT(tptoinpcb(tp)); 5485 rack = (struct tcp_rack *)tp->t_fb_ptr; 5486 /* only alert CC if we alerted when we entered */ 5487 if (CC_ALGO(tp)->post_recovery != NULL) { 5488 tp->t_ccv.curack = th_ack; 5489 CC_ALGO(tp)->post_recovery(&tp->t_ccv); 5490 if (tp->snd_cwnd < tp->snd_ssthresh) { 5491 /* 5492 * Rack has burst control and pacing 5493 * so lets not set this any lower than 5494 * snd_ssthresh per RFC-6582 (option 2). 5495 */ 5496 tp->snd_cwnd = tp->snd_ssthresh; 5497 } 5498 } 5499 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 5500 union tcp_log_stackspecific log; 5501 struct timeval tv; 5502 5503 memset(&log, 0, sizeof(log)); 5504 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 5505 log.u_bbr.flex1 = th_ack; 5506 log.u_bbr.flex2 = tp->t_ccv.flags; 5507 log.u_bbr.flex3 = tp->t_ccv.bytes_this_ack; 5508 log.u_bbr.flex4 = tp->t_ccv.nsegs; 5509 log.u_bbr.flex5 = V_tcp_abc_l_var; 5510 log.u_bbr.flex6 = orig_cwnd; 5511 log.u_bbr.flex7 = V_tcp_do_newsack; 5512 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 5513 log.u_bbr.flex8 = 2; 5514 tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 5515 0, &log, false, NULL, __func__, __LINE__, &tv); 5516 } 5517 if ((rack->rack_no_prr == 0) && 5518 (rack->no_prr_addback == 0) && 5519 (rack->r_ctl.rc_prr_sndcnt > 0)) { 5520 /* 5521 * Suck the next prr cnt back into cwnd, but 5522 * only do that if we are not application limited. 5523 */ 5524 if (ctf_outstanding(tp) <= sbavail(&tptosocket(tp)->so_snd)) { 5525 /* 5526 * We are allowed to add back to the cwnd the amount we did 5527 * not get out if: 5528 * a) no_prr_addback is off. 5529 * b) we are not app limited 5530 * c) we are doing prr 5531 * <and> 5532 * d) it is bounded by rack_prr_addbackmax (if addback is 0, then none). 5533 */ 5534 tp->snd_cwnd += min((ctf_fixed_maxseg(tp) * rack_prr_addbackmax), 5535 rack->r_ctl.rc_prr_sndcnt); 5536 } 5537 rack->r_ctl.rc_prr_sndcnt = 0; 5538 rack_log_to_prr(rack, 1, 0, __LINE__); 5539 } 5540 rack_log_to_prr(rack, 14, orig_cwnd, __LINE__); 5541 tp->snd_recover = tp->snd_una; 5542 if (rack->r_ctl.dsack_persist) { 5543 rack->r_ctl.dsack_persist--; 5544 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) { 5545 rack->r_ctl.num_dsack = 0; 5546 } 5547 rack_log_dsack_event(rack, 1, __LINE__, 0, 0); 5548 } 5549 if (rack->rto_from_rec == 1) { 5550 rack->rto_from_rec = 0; 5551 if (rack->r_ctl.rto_ssthresh > tp->snd_ssthresh) 5552 tp->snd_ssthresh = rack->r_ctl.rto_ssthresh; 5553 } 5554 rack_exit_recovery(tp, rack, 1); 5555 } 5556 5557 static void 5558 rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack, int line) 5559 { 5560 struct tcp_rack *rack; 5561 uint32_t ssthresh_enter, cwnd_enter, in_rec_at_entry, orig_cwnd; 5562 5563 INP_WLOCK_ASSERT(tptoinpcb(tp)); 5564 #ifdef STATS 5565 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type); 5566 #endif 5567 if (IN_RECOVERY(tp->t_flags) == 0) { 5568 in_rec_at_entry = 0; 5569 ssthresh_enter = tp->snd_ssthresh; 5570 cwnd_enter = tp->snd_cwnd; 5571 } else 5572 in_rec_at_entry = 1; 5573 rack = (struct tcp_rack *)tp->t_fb_ptr; 5574 switch (type) { 5575 case CC_NDUPACK: 5576 tp->t_flags &= ~TF_WASFRECOVERY; 5577 tp->t_flags &= ~TF_WASCRECOVERY; 5578 if (!IN_FASTRECOVERY(tp->t_flags)) { 5579 /* Check if this is the end of the initial Start-up i.e. initial slow-start */ 5580 if (rack->rc_initial_ss_comp == 0) { 5581 /* Yep it is the end of the initial slowstart */ 5582 rack->rc_initial_ss_comp = 1; 5583 } 5584 rack->r_ctl.rc_prr_delivered = 0; 5585 rack->r_ctl.rc_prr_out = 0; 5586 rack->r_fast_output = 0; 5587 if (rack->rack_no_prr == 0) { 5588 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 5589 rack_log_to_prr(rack, 2, in_rec_at_entry, line); 5590 } 5591 rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una; 5592 tp->snd_recover = tp->snd_max; 5593 if (tp->t_flags2 & TF2_ECN_PERMIT) 5594 tp->t_flags2 |= TF2_ECN_SND_CWR; 5595 } 5596 break; 5597 case CC_ECN: 5598 if (!IN_CONGRECOVERY(tp->t_flags) || 5599 /* 5600 * Allow ECN reaction on ACK to CWR, if 5601 * that data segment was also CE marked. 5602 */ 5603 SEQ_GEQ(ack, tp->snd_recover)) { 5604 EXIT_CONGRECOVERY(tp->t_flags); 5605 KMOD_TCPSTAT_INC(tcps_ecn_rcwnd); 5606 rack->r_fast_output = 0; 5607 tp->snd_recover = tp->snd_max + 1; 5608 if (tp->t_flags2 & TF2_ECN_PERMIT) 5609 tp->t_flags2 |= TF2_ECN_SND_CWR; 5610 } 5611 break; 5612 case CC_RTO: 5613 tp->t_dupacks = 0; 5614 tp->t_bytes_acked = 0; 5615 rack->r_fast_output = 0; 5616 if (IN_RECOVERY(tp->t_flags)) 5617 rack_exit_recovery(tp, rack, 2); 5618 orig_cwnd = tp->snd_cwnd; 5619 rack_log_to_prr(rack, 16, orig_cwnd, line); 5620 if (CC_ALGO(tp)->cong_signal == NULL) { 5621 /* TSNH */ 5622 tp->snd_ssthresh = max(2, 5623 min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 / 5624 ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp); 5625 tp->snd_cwnd = ctf_fixed_maxseg(tp); 5626 } 5627 if (tp->t_flags2 & TF2_ECN_PERMIT) 5628 tp->t_flags2 |= TF2_ECN_SND_CWR; 5629 break; 5630 case CC_RTO_ERR: 5631 KMOD_TCPSTAT_INC(tcps_sndrexmitbad); 5632 /* RTO was unnecessary, so reset everything. */ 5633 tp->snd_cwnd = tp->snd_cwnd_prev; 5634 tp->snd_ssthresh = tp->snd_ssthresh_prev; 5635 tp->snd_recover = tp->snd_recover_prev; 5636 if (tp->t_flags & TF_WASFRECOVERY) { 5637 ENTER_FASTRECOVERY(tp->t_flags); 5638 tp->t_flags &= ~TF_WASFRECOVERY; 5639 } 5640 if (tp->t_flags & TF_WASCRECOVERY) { 5641 ENTER_CONGRECOVERY(tp->t_flags); 5642 tp->t_flags &= ~TF_WASCRECOVERY; 5643 } 5644 tp->snd_nxt = tp->snd_max; 5645 tp->t_badrxtwin = 0; 5646 break; 5647 } 5648 if ((CC_ALGO(tp)->cong_signal != NULL) && 5649 (type != CC_RTO)){ 5650 tp->t_ccv.curack = ack; 5651 CC_ALGO(tp)->cong_signal(&tp->t_ccv, type); 5652 } 5653 if ((in_rec_at_entry == 0) && IN_RECOVERY(tp->t_flags)) { 5654 rack_log_to_prr(rack, 15, cwnd_enter, line); 5655 rack->r_ctl.dsack_byte_cnt = 0; 5656 rack->r_ctl.retran_during_recovery = 0; 5657 rack->r_ctl.rc_cwnd_at_erec = cwnd_enter; 5658 rack->r_ctl.rc_ssthresh_at_erec = ssthresh_enter; 5659 rack->r_ent_rec_ns = 1; 5660 } 5661 } 5662 5663 static inline void 5664 rack_cc_after_idle(struct tcp_rack *rack, struct tcpcb *tp) 5665 { 5666 uint32_t i_cwnd; 5667 5668 INP_WLOCK_ASSERT(tptoinpcb(tp)); 5669 5670 if (CC_ALGO(tp)->after_idle != NULL) 5671 CC_ALGO(tp)->after_idle(&tp->t_ccv); 5672 5673 if (tp->snd_cwnd == 1) 5674 i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ 5675 else 5676 i_cwnd = rc_init_window(rack); 5677 5678 /* 5679 * Being idle is no different than the initial window. If the cc 5680 * clamps it down below the initial window raise it to the initial 5681 * window. 5682 */ 5683 if (tp->snd_cwnd < i_cwnd) { 5684 tp->snd_cwnd = i_cwnd; 5685 } 5686 } 5687 5688 /* 5689 * Indicate whether this ack should be delayed. We can delay the ack if 5690 * following conditions are met: 5691 * - There is no delayed ack timer in progress. 5692 * - Our last ack wasn't a 0-sized window. We never want to delay 5693 * the ack that opens up a 0-sized window. 5694 * - LRO wasn't used for this segment. We make sure by checking that the 5695 * segment size is not larger than the MSS. 5696 * - Delayed acks are enabled or this is a half-synchronized T/TCP 5697 * connection. 5698 */ 5699 #define DELAY_ACK(tp, tlen) \ 5700 (((tp->t_flags & TF_RXWIN0SENT) == 0) && \ 5701 ((tp->t_flags & TF_DELACK) == 0) && \ 5702 (tlen <= tp->t_maxseg) && \ 5703 (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN))) 5704 5705 static struct rack_sendmap * 5706 rack_find_lowest_rsm(struct tcp_rack *rack) 5707 { 5708 struct rack_sendmap *rsm; 5709 5710 /* 5711 * Walk the time-order transmitted list looking for an rsm that is 5712 * not acked. This will be the one that was sent the longest time 5713 * ago that is still outstanding. 5714 */ 5715 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 5716 if (rsm->r_flags & RACK_ACKED) { 5717 continue; 5718 } 5719 goto finish; 5720 } 5721 finish: 5722 return (rsm); 5723 } 5724 5725 static struct rack_sendmap * 5726 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm) 5727 { 5728 struct rack_sendmap *prsm; 5729 5730 /* 5731 * Walk the sequence order list backward until we hit and arrive at 5732 * the highest seq not acked. In theory when this is called it 5733 * should be the last segment (which it was not). 5734 */ 5735 prsm = rsm; 5736 5737 TQHASH_FOREACH_REVERSE_FROM(prsm, rack->r_ctl.tqh) { 5738 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) { 5739 continue; 5740 } 5741 return (prsm); 5742 } 5743 return (NULL); 5744 } 5745 5746 static uint32_t 5747 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts, int line, int log_allowed) 5748 { 5749 int32_t lro; 5750 uint32_t thresh; 5751 5752 /* 5753 * lro is the flag we use to determine if we have seen reordering. 5754 * If it gets set we have seen reordering. The reorder logic either 5755 * works in one of two ways: 5756 * 5757 * If reorder-fade is configured, then we track the last time we saw 5758 * re-ordering occur. If we reach the point where enough time as 5759 * passed we no longer consider reordering as occurring. 5760 * 5761 * Or if reorder-face is 0, then once we see reordering we consider 5762 * the connection to alway be subject to reordering and just set lro 5763 * to 1. 5764 * 5765 * In the end if lro is non-zero we add the extra time for 5766 * reordering in. 5767 */ 5768 if (srtt == 0) 5769 srtt = 1; 5770 if (rack->r_ctl.rc_reorder_ts) { 5771 if (rack->r_ctl.rc_reorder_fade) { 5772 if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) { 5773 lro = cts - rack->r_ctl.rc_reorder_ts; 5774 if (lro == 0) { 5775 /* 5776 * No time as passed since the last 5777 * reorder, mark it as reordering. 5778 */ 5779 lro = 1; 5780 } 5781 } else { 5782 /* Negative time? */ 5783 lro = 0; 5784 } 5785 if (lro > rack->r_ctl.rc_reorder_fade) { 5786 /* Turn off reordering seen too */ 5787 rack->r_ctl.rc_reorder_ts = 0; 5788 lro = 0; 5789 } 5790 } else { 5791 /* Reodering does not fade */ 5792 lro = 1; 5793 } 5794 } else { 5795 lro = 0; 5796 } 5797 if (rack->rc_rack_tmr_std_based == 0) { 5798 thresh = srtt + rack->r_ctl.rc_pkt_delay; 5799 } else { 5800 /* Standards based pkt-delay is 1/4 srtt */ 5801 thresh = srtt + (srtt >> 2); 5802 } 5803 if (lro && (rack->rc_rack_tmr_std_based == 0)) { 5804 /* It must be set, if not you get 1/4 rtt */ 5805 if (rack->r_ctl.rc_reorder_shift) 5806 thresh += (srtt >> rack->r_ctl.rc_reorder_shift); 5807 else 5808 thresh += (srtt >> 2); 5809 } 5810 if (rack->rc_rack_use_dsack && 5811 lro && 5812 (rack->r_ctl.num_dsack > 0)) { 5813 /* 5814 * We only increase the reordering window if we 5815 * have seen reordering <and> we have a DSACK count. 5816 */ 5817 thresh += rack->r_ctl.num_dsack * (srtt >> 2); 5818 if (log_allowed) 5819 rack_log_dsack_event(rack, 4, line, srtt, thresh); 5820 } 5821 /* SRTT * 2 is the ceiling */ 5822 if (thresh > (srtt * 2)) { 5823 thresh = srtt * 2; 5824 } 5825 /* And we don't want it above the RTO max either */ 5826 if (thresh > rack_rto_max) { 5827 thresh = rack_rto_max; 5828 } 5829 if (log_allowed) 5830 rack_log_dsack_event(rack, 6, line, srtt, thresh); 5831 return (thresh); 5832 } 5833 5834 static uint32_t 5835 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack, 5836 struct rack_sendmap *rsm, uint32_t srtt) 5837 { 5838 struct rack_sendmap *prsm; 5839 uint32_t thresh, len; 5840 int segsiz; 5841 5842 if (srtt == 0) 5843 srtt = 1; 5844 if (rack->r_ctl.rc_tlp_threshold) 5845 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold); 5846 else 5847 thresh = (srtt * 2); 5848 5849 /* Get the previous sent packet, if any */ 5850 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 5851 len = rsm->r_end - rsm->r_start; 5852 if (rack->rack_tlp_threshold_use == TLP_USE_ID) { 5853 /* Exactly like the ID */ 5854 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= segsiz) { 5855 uint32_t alt_thresh; 5856 /* 5857 * Compensate for delayed-ack with the d-ack time. 5858 */ 5859 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 5860 if (alt_thresh > thresh) 5861 thresh = alt_thresh; 5862 } 5863 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) { 5864 /* 2.1 behavior */ 5865 prsm = TAILQ_PREV(rsm, rack_head, r_tnext); 5866 if (prsm && (len <= segsiz)) { 5867 /* 5868 * Two packets outstanding, thresh should be (2*srtt) + 5869 * possible inter-packet delay (if any). 5870 */ 5871 uint32_t inter_gap = 0; 5872 int idx, nidx; 5873 5874 idx = rsm->r_rtr_cnt - 1; 5875 nidx = prsm->r_rtr_cnt - 1; 5876 if (rsm->r_tim_lastsent[nidx] >= prsm->r_tim_lastsent[idx]) { 5877 /* Yes it was sent later (or at the same time) */ 5878 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; 5879 } 5880 thresh += inter_gap; 5881 } else if (len <= segsiz) { 5882 /* 5883 * Possibly compensate for delayed-ack. 5884 */ 5885 uint32_t alt_thresh; 5886 5887 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 5888 if (alt_thresh > thresh) 5889 thresh = alt_thresh; 5890 } 5891 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) { 5892 /* 2.2 behavior */ 5893 if (len <= segsiz) { 5894 uint32_t alt_thresh; 5895 /* 5896 * Compensate for delayed-ack with the d-ack time. 5897 */ 5898 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 5899 if (alt_thresh > thresh) 5900 thresh = alt_thresh; 5901 } 5902 } 5903 /* Not above an RTO */ 5904 if (thresh > tp->t_rxtcur) { 5905 thresh = tp->t_rxtcur; 5906 } 5907 /* Not above a RTO max */ 5908 if (thresh > rack_rto_max) { 5909 thresh = rack_rto_max; 5910 } 5911 /* Apply user supplied min TLP */ 5912 if (thresh < rack_tlp_min) { 5913 thresh = rack_tlp_min; 5914 } 5915 return (thresh); 5916 } 5917 5918 static uint32_t 5919 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack) 5920 { 5921 /* 5922 * We want the rack_rtt which is the 5923 * last rtt we measured. However if that 5924 * does not exist we fallback to the srtt (which 5925 * we probably will never do) and then as a last 5926 * resort we use RACK_INITIAL_RTO if no srtt is 5927 * yet set. 5928 */ 5929 if (rack->rc_rack_rtt) 5930 return (rack->rc_rack_rtt); 5931 else if (tp->t_srtt == 0) 5932 return (RACK_INITIAL_RTO); 5933 return (tp->t_srtt); 5934 } 5935 5936 static struct rack_sendmap * 5937 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) 5938 { 5939 /* 5940 * Check to see that we don't need to fall into recovery. We will 5941 * need to do so if our oldest transmit is past the time we should 5942 * have had an ack. 5943 */ 5944 struct tcp_rack *rack; 5945 struct rack_sendmap *rsm; 5946 int32_t idx; 5947 uint32_t srtt, thresh; 5948 5949 rack = (struct tcp_rack *)tp->t_fb_ptr; 5950 if (tqhash_empty(rack->r_ctl.tqh)) { 5951 return (NULL); 5952 } 5953 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 5954 if (rsm == NULL) 5955 return (NULL); 5956 5957 5958 if (rsm->r_flags & RACK_ACKED) { 5959 rsm = rack_find_lowest_rsm(rack); 5960 if (rsm == NULL) 5961 return (NULL); 5962 } 5963 idx = rsm->r_rtr_cnt - 1; 5964 srtt = rack_grab_rtt(tp, rack); 5965 thresh = rack_calc_thresh_rack(rack, srtt, tsused, __LINE__, 1); 5966 if (TSTMP_LT(tsused, ((uint32_t)rsm->r_tim_lastsent[idx]))) { 5967 return (NULL); 5968 } 5969 if ((tsused - ((uint32_t)rsm->r_tim_lastsent[idx])) < thresh) { 5970 return (NULL); 5971 } 5972 /* Ok if we reach here we are over-due and this guy can be sent */ 5973 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__); 5974 return (rsm); 5975 } 5976 5977 static uint32_t 5978 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack) 5979 { 5980 int32_t t; 5981 int32_t tt; 5982 uint32_t ret_val; 5983 5984 t = (tp->t_srtt + (tp->t_rttvar << 2)); 5985 RACK_TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], 5986 rack_persist_min, rack_persist_max, rack->r_ctl.timer_slop); 5987 rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT; 5988 ret_val = (uint32_t)tt; 5989 return (ret_val); 5990 } 5991 5992 static uint32_t 5993 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack) 5994 { 5995 /* 5996 * Start the FR timer, we do this based on getting the first one in 5997 * the rc_tmap. Note that if its NULL we must stop the timer. in all 5998 * events we need to stop the running timer (if its running) before 5999 * starting the new one. 6000 */ 6001 uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse; 6002 uint32_t srtt_cur; 6003 int32_t idx; 6004 int32_t is_tlp_timer = 0; 6005 struct rack_sendmap *rsm; 6006 6007 if (rack->t_timers_stopped) { 6008 /* All timers have been stopped none are to run */ 6009 return (0); 6010 } 6011 if (rack->rc_in_persist) { 6012 /* We can't start any timer in persists */ 6013 return (rack_get_persists_timer_val(tp, rack)); 6014 } 6015 rack->rc_on_min_to = 0; 6016 if ((tp->t_state < TCPS_ESTABLISHED) || 6017 ((tp->t_flags & TF_SACK_PERMIT) == 0)) { 6018 goto activate_rxt; 6019 } 6020 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 6021 if ((rsm == NULL) || sup_rack) { 6022 /* Nothing on the send map or no rack */ 6023 activate_rxt: 6024 time_since_sent = 0; 6025 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 6026 if (rsm) { 6027 /* 6028 * Should we discount the RTX timer any? 6029 * 6030 * We want to discount it the smallest amount. 6031 * If a timer (Rack/TLP or RXT) has gone off more 6032 * recently thats the discount we want to use (now - timer time). 6033 * If the retransmit of the oldest packet was more recent then 6034 * we want to use that (now - oldest-packet-last_transmit_time). 6035 * 6036 */ 6037 idx = rsm->r_rtr_cnt - 1; 6038 if (TSTMP_GEQ(rack->r_ctl.rc_tlp_rxt_last_time, ((uint32_t)rsm->r_tim_lastsent[idx]))) 6039 tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time; 6040 else 6041 tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx]; 6042 if (TSTMP_GT(cts, tstmp_touse)) 6043 time_since_sent = cts - tstmp_touse; 6044 } 6045 if (SEQ_LT(tp->snd_una, tp->snd_max) || 6046 sbavail(&tptosocket(tp)->so_snd)) { 6047 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; 6048 to = tp->t_rxtcur; 6049 if (to > time_since_sent) 6050 to -= time_since_sent; 6051 else 6052 to = rack->r_ctl.rc_min_to; 6053 if (to == 0) 6054 to = 1; 6055 /* Special case for KEEPINIT */ 6056 if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) && 6057 (TP_KEEPINIT(tp) != 0) && 6058 rsm) { 6059 /* 6060 * We have to put a ceiling on the rxt timer 6061 * of the keep-init timeout. 6062 */ 6063 uint32_t max_time, red; 6064 6065 max_time = TICKS_2_USEC(TP_KEEPINIT(tp)); 6066 if (TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) { 6067 red = (cts - (uint32_t)rsm->r_tim_lastsent[0]); 6068 if (red < max_time) 6069 max_time -= red; 6070 else 6071 max_time = 1; 6072 } 6073 /* Reduce timeout to the keep value if needed */ 6074 if (max_time < to) 6075 to = max_time; 6076 } 6077 return (to); 6078 } 6079 return (0); 6080 } 6081 if (rsm->r_flags & RACK_ACKED) { 6082 rsm = rack_find_lowest_rsm(rack); 6083 if (rsm == NULL) { 6084 /* No lowest? */ 6085 goto activate_rxt; 6086 } 6087 } 6088 /* Convert from ms to usecs */ 6089 if ((rsm->r_flags & RACK_SACK_PASSED) || 6090 (rsm->r_flags & RACK_RWND_COLLAPSED) || 6091 (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { 6092 if ((tp->t_flags & TF_SENTFIN) && 6093 ((tp->snd_max - tp->snd_una) == 1) && 6094 (rsm->r_flags & RACK_HAS_FIN)) { 6095 /* 6096 * We don't start a rack timer if all we have is a 6097 * FIN outstanding. 6098 */ 6099 goto activate_rxt; 6100 } 6101 if ((rack->use_rack_rr == 0) && 6102 (IN_FASTRECOVERY(tp->t_flags)) && 6103 (rack->rack_no_prr == 0) && 6104 (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { 6105 /* 6106 * We are not cheating, in recovery and 6107 * not enough ack's to yet get our next 6108 * retransmission out. 6109 * 6110 * Note that classified attackers do not 6111 * get to use the rack-cheat. 6112 */ 6113 goto activate_tlp; 6114 } 6115 srtt = rack_grab_rtt(tp, rack); 6116 thresh = rack_calc_thresh_rack(rack, srtt, cts, __LINE__, 1); 6117 idx = rsm->r_rtr_cnt - 1; 6118 exp = ((uint32_t)rsm->r_tim_lastsent[idx]) + thresh; 6119 if (SEQ_GEQ(exp, cts)) { 6120 to = exp - cts; 6121 if (to < rack->r_ctl.rc_min_to) { 6122 to = rack->r_ctl.rc_min_to; 6123 if (rack->r_rr_config == 3) 6124 rack->rc_on_min_to = 1; 6125 } 6126 } else { 6127 to = rack->r_ctl.rc_min_to; 6128 if (rack->r_rr_config == 3) 6129 rack->rc_on_min_to = 1; 6130 } 6131 } else { 6132 /* Ok we need to do a TLP not RACK */ 6133 activate_tlp: 6134 if ((rack->rc_tlp_in_progress != 0) && 6135 (rack->r_ctl.rc_tlp_cnt_out >= rack_tlp_limit)) { 6136 /* 6137 * The previous send was a TLP and we have sent 6138 * N TLP's without sending new data. 6139 */ 6140 goto activate_rxt; 6141 } 6142 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 6143 if (rsm == NULL) { 6144 /* We found no rsm to TLP with. */ 6145 goto activate_rxt; 6146 } 6147 if (rsm->r_flags & RACK_HAS_FIN) { 6148 /* If its a FIN we dont do TLP */ 6149 rsm = NULL; 6150 goto activate_rxt; 6151 } 6152 idx = rsm->r_rtr_cnt - 1; 6153 time_since_sent = 0; 6154 if (TSTMP_GEQ(((uint32_t)rsm->r_tim_lastsent[idx]), rack->r_ctl.rc_tlp_rxt_last_time)) 6155 tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx]; 6156 else 6157 tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time; 6158 if (TSTMP_GT(cts, tstmp_touse)) 6159 time_since_sent = cts - tstmp_touse; 6160 is_tlp_timer = 1; 6161 if (tp->t_srtt) { 6162 if ((rack->rc_srtt_measure_made == 0) && 6163 (tp->t_srtt == 1)) { 6164 /* 6165 * If another stack as run and set srtt to 1, 6166 * then the srtt was 0, so lets use the initial. 6167 */ 6168 srtt = RACK_INITIAL_RTO; 6169 } else { 6170 srtt_cur = tp->t_srtt; 6171 srtt = srtt_cur; 6172 } 6173 } else 6174 srtt = RACK_INITIAL_RTO; 6175 /* 6176 * If the SRTT is not keeping up and the 6177 * rack RTT has spiked we want to use 6178 * the last RTT not the smoothed one. 6179 */ 6180 if (rack_tlp_use_greater && 6181 tp->t_srtt && 6182 (srtt < rack_grab_rtt(tp, rack))) { 6183 srtt = rack_grab_rtt(tp, rack); 6184 } 6185 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt); 6186 if (thresh > time_since_sent) { 6187 to = thresh - time_since_sent; 6188 } else { 6189 to = rack->r_ctl.rc_min_to; 6190 rack_log_alt_to_to_cancel(rack, 6191 thresh, /* flex1 */ 6192 time_since_sent, /* flex2 */ 6193 tstmp_touse, /* flex3 */ 6194 rack->r_ctl.rc_tlp_rxt_last_time, /* flex4 */ 6195 (uint32_t)rsm->r_tim_lastsent[idx], 6196 srtt, 6197 idx, 99); 6198 } 6199 if (to < rack_tlp_min) { 6200 to = rack_tlp_min; 6201 } 6202 if (to > TICKS_2_USEC(tcp_rexmit_max)) { 6203 /* 6204 * If the TLP time works out to larger than the max 6205 * RTO lets not do TLP.. just RTO. 6206 */ 6207 goto activate_rxt; 6208 } 6209 } 6210 if (is_tlp_timer == 0) { 6211 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; 6212 } else { 6213 rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; 6214 } 6215 if (to == 0) 6216 to = 1; 6217 return (to); 6218 } 6219 6220 static void 6221 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, tcp_seq snd_una) 6222 { 6223 if (rack->rc_in_persist == 0) { 6224 if (tp->t_flags & TF_GPUTINPROG) { 6225 /* 6226 * Stop the goodput now, the calling of the 6227 * measurement function clears the flag. 6228 */ 6229 rack_do_goodput_measurement(tp, rack, tp->snd_una, __LINE__, 6230 RACK_QUALITY_PERSIST); 6231 } 6232 #ifdef NETFLIX_SHARED_CWND 6233 if (rack->r_ctl.rc_scw) { 6234 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 6235 rack->rack_scwnd_is_idle = 1; 6236 } 6237 #endif 6238 rack->r_ctl.rc_went_idle_time = cts; 6239 if (rack->r_ctl.rc_went_idle_time == 0) 6240 rack->r_ctl.rc_went_idle_time = 1; 6241 if (rack->lt_bw_up) { 6242 /* Suspend our LT BW measurement */ 6243 uint64_t tmark; 6244 6245 rack->r_ctl.lt_bw_bytes += (snd_una - rack->r_ctl.lt_seq); 6246 rack->r_ctl.lt_seq = snd_una; 6247 tmark = tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time); 6248 if (tmark >= rack->r_ctl.lt_timemark) { 6249 rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark); 6250 } 6251 rack->r_ctl.lt_timemark = tmark; 6252 rack->lt_bw_up = 0; 6253 rack->r_persist_lt_bw_off = 1; 6254 } 6255 rack_timer_cancel(tp, rack, cts, __LINE__); 6256 rack->r_ctl.persist_lost_ends = 0; 6257 rack->probe_not_answered = 0; 6258 rack->forced_ack = 0; 6259 tp->t_rxtshift = 0; 6260 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 6261 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 6262 rack->rc_in_persist = 1; 6263 } 6264 } 6265 6266 static void 6267 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 6268 { 6269 if (tcp_in_hpts(rack->rc_tp)) { 6270 tcp_hpts_remove(rack->rc_tp); 6271 rack->r_ctl.rc_hpts_flags = 0; 6272 } 6273 #ifdef NETFLIX_SHARED_CWND 6274 if (rack->r_ctl.rc_scw) { 6275 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 6276 rack->rack_scwnd_is_idle = 0; 6277 } 6278 #endif 6279 if (rack->rc_gp_dyn_mul && 6280 (rack->use_fixed_rate == 0) && 6281 (rack->rc_always_pace)) { 6282 /* 6283 * Do we count this as if a probe-rtt just 6284 * finished? 6285 */ 6286 uint32_t time_idle, idle_min; 6287 6288 time_idle = cts - rack->r_ctl.rc_went_idle_time; 6289 idle_min = rack_min_probertt_hold; 6290 if (rack_probertt_gpsrtt_cnt_div) { 6291 uint64_t extra; 6292 extra = (uint64_t)rack->r_ctl.rc_gp_srtt * 6293 (uint64_t)rack_probertt_gpsrtt_cnt_mul; 6294 extra /= (uint64_t)rack_probertt_gpsrtt_cnt_div; 6295 idle_min += (uint32_t)extra; 6296 } 6297 if (time_idle >= idle_min) { 6298 /* Yes, we count it as a probe-rtt. */ 6299 uint32_t us_cts; 6300 6301 us_cts = tcp_get_usecs(NULL); 6302 if (rack->in_probe_rtt == 0) { 6303 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 6304 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; 6305 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; 6306 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; 6307 } else { 6308 rack_exit_probertt(rack, us_cts); 6309 } 6310 } 6311 } 6312 if (rack->r_persist_lt_bw_off) { 6313 /* Continue where we left off */ 6314 rack->r_ctl.lt_timemark = tcp_get_u64_usecs(NULL); 6315 rack->lt_bw_up = 1; 6316 rack->r_persist_lt_bw_off = 0; 6317 } 6318 rack->rc_in_persist = 0; 6319 rack->r_ctl.rc_went_idle_time = 0; 6320 tp->t_rxtshift = 0; 6321 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 6322 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 6323 rack->r_ctl.rc_agg_delayed = 0; 6324 rack->r_early = 0; 6325 rack->r_late = 0; 6326 rack->r_ctl.rc_agg_early = 0; 6327 } 6328 6329 static void 6330 rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts, 6331 struct hpts_diag *diag, struct timeval *tv) 6332 { 6333 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 6334 union tcp_log_stackspecific log; 6335 6336 memset(&log, 0, sizeof(log)); 6337 log.u_bbr.flex1 = diag->p_nxt_slot; 6338 log.u_bbr.flex2 = diag->p_cur_slot; 6339 log.u_bbr.flex3 = diag->slot_req; 6340 log.u_bbr.flex4 = diag->inp_hptsslot; 6341 log.u_bbr.flex5 = diag->time_remaining; 6342 log.u_bbr.flex6 = diag->need_new_to; 6343 log.u_bbr.flex7 = diag->p_hpts_active; 6344 log.u_bbr.flex8 = diag->p_on_min_sleep; 6345 /* Hijack other fields as needed */ 6346 log.u_bbr.epoch = diag->have_slept; 6347 log.u_bbr.lt_epoch = diag->yet_to_sleep; 6348 log.u_bbr.pkts_out = diag->co_ret; 6349 log.u_bbr.applimited = diag->hpts_sleep_time; 6350 log.u_bbr.delivered = diag->p_prev_slot; 6351 log.u_bbr.inflight = diag->p_runningslot; 6352 log.u_bbr.bw_inuse = diag->wheel_slot; 6353 log.u_bbr.rttProp = diag->wheel_cts; 6354 log.u_bbr.timeStamp = cts; 6355 log.u_bbr.delRate = diag->maxslots; 6356 TCP_LOG_EVENTP(rack->rc_tp, NULL, 6357 &rack->rc_inp->inp_socket->so_rcv, 6358 &rack->rc_inp->inp_socket->so_snd, 6359 BBR_LOG_HPTSDIAG, 0, 6360 0, &log, false, tv); 6361 } 6362 6363 } 6364 6365 static void 6366 rack_log_wakeup(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb, uint32_t len, int type) 6367 { 6368 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 6369 union tcp_log_stackspecific log; 6370 struct timeval tv; 6371 6372 memset(&log, 0, sizeof(log)); 6373 log.u_bbr.flex1 = sb->sb_flags; 6374 log.u_bbr.flex2 = len; 6375 log.u_bbr.flex3 = sb->sb_state; 6376 log.u_bbr.flex8 = type; 6377 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 6378 TCP_LOG_EVENTP(rack->rc_tp, NULL, 6379 &rack->rc_inp->inp_socket->so_rcv, 6380 &rack->rc_inp->inp_socket->so_snd, 6381 TCP_LOG_SB_WAKE, 0, 6382 len, &log, false, &tv); 6383 } 6384 } 6385 6386 static void 6387 rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, 6388 int32_t usecs, uint32_t tot_len_this_send, int sup_rack) 6389 { 6390 struct hpts_diag diag; 6391 struct inpcb *inp = tptoinpcb(tp); 6392 struct timeval tv; 6393 uint32_t delayed_ack = 0; 6394 uint32_t hpts_timeout; 6395 uint32_t entry_usecs = usecs; 6396 uint8_t stopped; 6397 uint32_t left = 0; 6398 uint32_t us_cts; 6399 6400 if ((tp->t_state == TCPS_CLOSED) || 6401 (tp->t_state == TCPS_LISTEN)) { 6402 return; 6403 } 6404 if (tcp_in_hpts(tp)) { 6405 /* Already on the pacer */ 6406 return; 6407 } 6408 stopped = rack->rc_tmr_stopped; 6409 if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { 6410 left = rack->r_ctl.rc_timer_exp - cts; 6411 } 6412 rack->r_ctl.rc_timer_exp = 0; 6413 rack->r_ctl.rc_hpts_flags = 0; 6414 us_cts = tcp_get_usecs(&tv); 6415 /* Now early/late accounting */ 6416 rack_log_pacing_delay_calc(rack, entry_usecs, usecs, 0, 0, 0, 26, __LINE__, NULL, 0); 6417 if (rack->r_early && (rack->rc_ack_can_sendout_data == 0)) { 6418 /* 6419 * We have a early carry over set, 6420 * we can always add more time so we 6421 * can always make this compensation. 6422 * 6423 * Note if ack's are allowed to wake us do not 6424 * penalize the next timer for being awoke 6425 * by an ack aka the rc_agg_early (non-paced mode). 6426 */ 6427 usecs += rack->r_ctl.rc_agg_early; 6428 rack->r_early = 0; 6429 rack->r_ctl.rc_agg_early = 0; 6430 } 6431 if ((rack->r_late) && 6432 ((rack->r_use_hpts_min == 0) || (rack->dgp_on == 0))) { 6433 /* 6434 * This is harder, we can 6435 * compensate some but it 6436 * really depends on what 6437 * the current pacing time is. 6438 */ 6439 if (rack->r_ctl.rc_agg_delayed >= usecs) { 6440 /* 6441 * We can't compensate for it all. 6442 * And we have to have some time 6443 * on the clock. We always have a min 6444 * 10 HPTS timer units (10 x 10 i.e. 100 usecs). 6445 */ 6446 if (usecs <= HPTS_USECS_PER_SLOT) { 6447 /* We gain delay */ 6448 rack->r_ctl.rc_agg_delayed += (HPTS_USECS_PER_SLOT - usecs); 6449 usecs = HPTS_USECS_PER_SLOT; 6450 } else { 6451 /* We take off some */ 6452 rack->r_ctl.rc_agg_delayed -= (usecs - HPTS_USECS_PER_SLOT); 6453 usecs = HPTS_USECS_PER_SLOT; 6454 } 6455 } else { 6456 usecs -= rack->r_ctl.rc_agg_delayed; 6457 rack->r_ctl.rc_agg_delayed = 0; 6458 /* Make sure we have 100 useconds at minimum */ 6459 if (usecs < HPTS_USECS_PER_SLOT) { 6460 rack->r_ctl.rc_agg_delayed = HPTS_USECS_PER_SLOT - usecs; 6461 usecs = HPTS_USECS_PER_SLOT; 6462 } 6463 if (rack->r_ctl.rc_agg_delayed == 0) 6464 rack->r_late = 0; 6465 } 6466 } else if (rack->r_late) { 6467 /* r_use_hpts_min is on and so is DGP */ 6468 uint32_t max_red; 6469 6470 max_red = (usecs * rack->r_ctl.max_reduction) / 100; 6471 if (max_red >= rack->r_ctl.rc_agg_delayed) { 6472 usecs -= rack->r_ctl.rc_agg_delayed; 6473 rack->r_ctl.rc_agg_delayed = 0; 6474 } else { 6475 usecs -= max_red; 6476 rack->r_ctl.rc_agg_delayed -= max_red; 6477 } 6478 } 6479 if ((rack->r_use_hpts_min == 1) && 6480 (usecs > 0) && 6481 (rack->dgp_on == 1)) { 6482 /* 6483 * We are enforcing a min pacing timer 6484 * based on our hpts min timeout. 6485 */ 6486 uint32_t min; 6487 6488 min = get_hpts_min_sleep_time(); 6489 if (min > usecs) { 6490 usecs = min; 6491 } 6492 } 6493 hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack); 6494 if (tp->t_flags & TF_DELACK) { 6495 delayed_ack = TICKS_2_USEC(tcp_delacktime); 6496 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; 6497 } 6498 if (delayed_ack && ((hpts_timeout == 0) || 6499 (delayed_ack < hpts_timeout))) 6500 hpts_timeout = delayed_ack; 6501 else 6502 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 6503 /* 6504 * If no timers are going to run and we will fall off the hptsi 6505 * wheel, we resort to a keep-alive timer if its configured. 6506 */ 6507 if ((hpts_timeout == 0) && 6508 (usecs == 0)) { 6509 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 6510 (tp->t_state <= TCPS_CLOSING)) { 6511 /* 6512 * Ok we have no timer (persists, rack, tlp, rxt or 6513 * del-ack), we don't have segments being paced. So 6514 * all that is left is the keepalive timer. 6515 */ 6516 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 6517 /* Get the established keep-alive time */ 6518 hpts_timeout = TICKS_2_USEC(TP_KEEPIDLE(tp)); 6519 } else { 6520 /* 6521 * Get the initial setup keep-alive time, 6522 * note that this is probably not going to 6523 * happen, since rack will be running a rxt timer 6524 * if a SYN of some sort is outstanding. It is 6525 * actually handled in rack_timeout_rxt(). 6526 */ 6527 hpts_timeout = TICKS_2_USEC(TP_KEEPINIT(tp)); 6528 } 6529 rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; 6530 if (rack->in_probe_rtt) { 6531 /* 6532 * We want to instead not wake up a long time from 6533 * now but to wake up about the time we would 6534 * exit probe-rtt and initiate a keep-alive ack. 6535 * This will get us out of probe-rtt and update 6536 * our min-rtt. 6537 */ 6538 hpts_timeout = rack_min_probertt_hold; 6539 } 6540 } 6541 } 6542 if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) == 6543 (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) { 6544 /* 6545 * RACK, TLP, persists and RXT timers all are restartable 6546 * based on actions input .. i.e we received a packet (ack 6547 * or sack) and that changes things (rw, or snd_una etc). 6548 * Thus we can restart them with a new value. For 6549 * keep-alive, delayed_ack we keep track of what was left 6550 * and restart the timer with a smaller value. 6551 */ 6552 if (left < hpts_timeout) 6553 hpts_timeout = left; 6554 } 6555 if (hpts_timeout) { 6556 /* 6557 * Hack alert for now we can't time-out over 2,147,483 6558 * seconds (a bit more than 596 hours), which is probably ok 6559 * :). 6560 */ 6561 if (hpts_timeout > 0x7ffffffe) 6562 hpts_timeout = 0x7ffffffe; 6563 rack->r_ctl.rc_timer_exp = cts + hpts_timeout; 6564 } 6565 rack_log_pacing_delay_calc(rack, entry_usecs, usecs, hpts_timeout, 0, 0, 27, __LINE__, NULL, 0); 6566 if ((rack->gp_ready == 0) && 6567 (rack->use_fixed_rate == 0) && 6568 (hpts_timeout < usecs) && 6569 (rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) { 6570 /* 6571 * We have no good estimate yet for the 6572 * old clunky burst mitigation or the 6573 * real pacing. And the tlp or rxt is smaller 6574 * than the pacing calculation. Lets not 6575 * pace that long since we know the calculation 6576 * so far is not accurate. 6577 */ 6578 usecs = hpts_timeout; 6579 } 6580 /** 6581 * Turn off all the flags for queuing by default. The 6582 * flags have important meanings to what happens when 6583 * LRO interacts with the transport. Most likely (by default now) 6584 * mbuf_queueing and ack compression are on. So the transport 6585 * has a couple of flags that control what happens (if those 6586 * are not on then these flags won't have any effect since it 6587 * won't go through the queuing LRO path). 6588 * 6589 * TF2_MBUF_QUEUE_READY - This flags says that I am busy 6590 * pacing output, so don't disturb. But 6591 * it also means LRO can wake me if there 6592 * is a SACK arrival. 6593 * 6594 * TF2_DONT_SACK_QUEUE - This flag is used in conjunction 6595 * with the above flag (QUEUE_READY) and 6596 * when present it says don't even wake me 6597 * if a SACK arrives. 6598 * 6599 * The idea behind these flags is that if we are pacing we 6600 * set the MBUF_QUEUE_READY and only get woken up if 6601 * a SACK arrives (which could change things) or if 6602 * our pacing timer expires. If, however, we have a rack 6603 * timer running, then we don't even want a sack to wake 6604 * us since the rack timer has to expire before we can send. 6605 * 6606 * Other cases should usually have none of the flags set 6607 * so LRO can call into us. 6608 */ 6609 tp->t_flags2 &= ~(TF2_DONT_SACK_QUEUE|TF2_MBUF_QUEUE_READY); 6610 if (usecs) { 6611 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; 6612 rack->r_ctl.rc_last_output_to = us_cts + usecs; 6613 /* 6614 * A pacing timer (usecs microseconds) is being set, in 6615 * such a case we cannot send (we are blocked by 6616 * the timer). So lets tell LRO that it should not 6617 * wake us unless there is a SACK. Note this only 6618 * will be effective if mbuf queueing is on or 6619 * compressed acks are being processed. 6620 */ 6621 tp->t_flags2 |= TF2_MBUF_QUEUE_READY; 6622 /* 6623 * But wait if we have a Rack timer running 6624 * even a SACK should not disturb us (with 6625 * the exception of r_rr_config 3). 6626 */ 6627 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) || 6628 (IN_RECOVERY(tp->t_flags))) { 6629 if (rack->r_rr_config != 3) 6630 tp->t_flags2 |= TF2_DONT_SACK_QUEUE; 6631 else if (rack->rc_pace_dnd) { 6632 /* 6633 * When DND is on, we only let a sack 6634 * interrupt us if we are not in recovery. 6635 * 6636 * If DND is off, then we never hit here 6637 * and let all sacks wake us up. 6638 * 6639 */ 6640 tp->t_flags2 |= TF2_DONT_SACK_QUEUE; 6641 } 6642 } 6643 if (rack->rc_ack_can_sendout_data) { 6644 /* 6645 * Ahh but wait, this is that special case 6646 * where the pacing timer can be disturbed 6647 * backout the changes (used for non-paced 6648 * burst limiting). 6649 */ 6650 tp->t_flags2 &= ~(TF2_DONT_SACK_QUEUE | 6651 TF2_MBUF_QUEUE_READY); 6652 } 6653 if ((rack->use_rack_rr) && 6654 (rack->r_rr_config < 2) && 6655 ((hpts_timeout) && (hpts_timeout < usecs))) { 6656 /* 6657 * Arrange for the hpts to kick back in after the 6658 * t-o if the t-o does not cause a send. 6659 */ 6660 tcp_hpts_insert(tp, hpts_timeout, &diag); 6661 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 6662 rack_log_to_start(rack, cts, hpts_timeout, usecs, 0); 6663 } else { 6664 tcp_hpts_insert(tp, usecs, &diag); 6665 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 6666 rack_log_to_start(rack, cts, hpts_timeout, usecs, 1); 6667 } 6668 } else if (hpts_timeout) { 6669 /* 6670 * With respect to t_flags2(?) here, lets let any new acks wake 6671 * us up here. Since we are not pacing (no pacing timer), output 6672 * can happen so we should let it. If its a Rack timer, then any inbound 6673 * packet probably won't change the sending (we will be blocked) 6674 * but it may change the prr stats so letting it in (the set defaults 6675 * at the start of this block) are good enough. 6676 */ 6677 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 6678 tcp_hpts_insert(tp, hpts_timeout, &diag); 6679 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 6680 rack_log_to_start(rack, cts, hpts_timeout, usecs, 0); 6681 } else { 6682 /* No timer starting */ 6683 #ifdef INVARIANTS 6684 if (SEQ_GT(tp->snd_max, tp->snd_una)) { 6685 panic("tp:%p rack:%p tlts:%d cts:%u usecs:%u pto:%u -- no timer started?", 6686 tp, rack, tot_len_this_send, cts, usecs, hpts_timeout); 6687 } 6688 #endif 6689 } 6690 rack->rc_tmr_stopped = 0; 6691 if (usecs) 6692 rack_log_type_bbrsnd(rack, tot_len_this_send, usecs, us_cts, &tv, __LINE__); 6693 } 6694 6695 static void 6696 rack_mark_lost(struct tcpcb *tp, 6697 struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t cts) 6698 { 6699 struct rack_sendmap *nrsm; 6700 uint32_t thresh, exp; 6701 6702 thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(tp, rack), cts, __LINE__, 0); 6703 nrsm = rsm; 6704 TAILQ_FOREACH_FROM(nrsm, &rack->r_ctl.rc_tmap, r_tnext) { 6705 if ((nrsm->r_flags & RACK_SACK_PASSED) == 0) { 6706 /* Got up to all that were marked sack-passed */ 6707 break; 6708 } 6709 if ((nrsm->r_flags & RACK_WAS_LOST) == 0) { 6710 exp = ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]) + thresh; 6711 if (TSTMP_LT(exp, cts) || (exp == cts)) { 6712 /* We now consider it lost */ 6713 nrsm->r_flags |= RACK_WAS_LOST; 6714 rack->r_ctl.rc_considered_lost += nrsm->r_end - nrsm->r_start; 6715 } else { 6716 /* Past here it won't be lost so stop */ 6717 break; 6718 } 6719 } 6720 } 6721 } 6722 6723 static inline void 6724 rack_mark_nolonger_lost(struct tcp_rack *rack, struct rack_sendmap *rsm) 6725 { 6726 KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)), 6727 ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); 6728 rsm->r_flags &= ~RACK_WAS_LOST; 6729 if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)) 6730 rack->r_ctl.rc_considered_lost -= rsm->r_end - rsm->r_start; 6731 else 6732 rack->r_ctl.rc_considered_lost = 0; 6733 } 6734 6735 /* 6736 * RACK Timer, here we simply do logging and house keeping. 6737 * the normal rack_output() function will call the 6738 * appropriate thing to check if we need to do a RACK retransmit. 6739 * We return 1, saying don't proceed with rack_output only 6740 * when all timers have been stopped (destroyed PCB?). 6741 */ 6742 static int 6743 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 6744 { 6745 /* 6746 * This timer simply provides an internal trigger to send out data. 6747 * The check_recovery_mode call will see if there are needed 6748 * retransmissions, if so we will enter fast-recovery. The output 6749 * call may or may not do the same thing depending on sysctl 6750 * settings. 6751 */ 6752 struct rack_sendmap *rsm; 6753 6754 counter_u64_add(rack_to_tot, 1); 6755 if (rack->r_state && (rack->r_state != tp->t_state)) 6756 rack_set_state(tp, rack); 6757 rack->rc_on_min_to = 0; 6758 rsm = rack_check_recovery_mode(tp, cts); 6759 rack_log_to_event(rack, RACK_TO_FRM_RACK, rsm); 6760 if (rsm) { 6761 /* We need to stroke any lost that are now declared as lost */ 6762 rack_mark_lost(tp, rack, rsm, cts); 6763 rack->r_ctl.rc_resend = rsm; 6764 rack->r_timer_override = 1; 6765 if (rack->use_rack_rr) { 6766 /* 6767 * Don't accumulate extra pacing delay 6768 * we are allowing the rack timer to 6769 * over-ride pacing i.e. rrr takes precedence 6770 * if the pacing interval is longer than the rrr 6771 * time (in other words we get the min pacing 6772 * time versus rrr pacing time). 6773 */ 6774 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 6775 } 6776 } 6777 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; 6778 if (rsm == NULL) { 6779 /* restart a timer and return 1 */ 6780 rack_start_hpts_timer(rack, tp, cts, 6781 0, 0, 0); 6782 return (1); 6783 } 6784 return (0); 6785 } 6786 6787 6788 6789 static void 6790 rack_adjust_orig_mlen(struct rack_sendmap *rsm) 6791 { 6792 6793 if ((M_TRAILINGROOM(rsm->m) != rsm->orig_t_space)) { 6794 /* 6795 * The trailing space changed, mbufs can grow 6796 * at the tail but they can't shrink from 6797 * it, KASSERT that. Adjust the orig_m_len to 6798 * compensate for this change. 6799 */ 6800 KASSERT((rsm->orig_t_space > M_TRAILINGROOM(rsm->m)), 6801 ("mbuf:%p rsm:%p trailing_space:%jd ots:%u oml:%u mlen:%u\n", 6802 rsm->m, 6803 rsm, 6804 (intmax_t)M_TRAILINGROOM(rsm->m), 6805 rsm->orig_t_space, 6806 rsm->orig_m_len, 6807 rsm->m->m_len)); 6808 rsm->orig_m_len += (rsm->orig_t_space - M_TRAILINGROOM(rsm->m)); 6809 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 6810 } 6811 if (rsm->m->m_len < rsm->orig_m_len) { 6812 /* 6813 * Mbuf shrank, trimmed off the top by an ack, our 6814 * offset changes. 6815 */ 6816 KASSERT((rsm->soff >= (rsm->orig_m_len - rsm->m->m_len)), 6817 ("mbuf:%p len:%u rsm:%p oml:%u soff:%u\n", 6818 rsm->m, rsm->m->m_len, 6819 rsm, rsm->orig_m_len, 6820 rsm->soff)); 6821 if (rsm->soff >= (rsm->orig_m_len - rsm->m->m_len)) 6822 rsm->soff -= (rsm->orig_m_len - rsm->m->m_len); 6823 else 6824 rsm->soff = 0; 6825 rsm->orig_m_len = rsm->m->m_len; 6826 #ifdef INVARIANTS 6827 } else if (rsm->m->m_len > rsm->orig_m_len) { 6828 panic("rsm:%p m:%p m_len grew outside of t_space compensation", 6829 rsm, rsm->m); 6830 #endif 6831 } 6832 } 6833 6834 static void 6835 rack_setup_offset_for_rsm(struct tcp_rack *rack, struct rack_sendmap *src_rsm, struct rack_sendmap *rsm) 6836 { 6837 struct mbuf *m; 6838 uint32_t soff; 6839 6840 if (src_rsm->m && 6841 ((src_rsm->orig_m_len != src_rsm->m->m_len) || 6842 (M_TRAILINGROOM(src_rsm->m) != src_rsm->orig_t_space))) { 6843 /* Fix up the orig_m_len and possibly the mbuf offset */ 6844 rack_adjust_orig_mlen(src_rsm); 6845 } 6846 m = src_rsm->m; 6847 soff = src_rsm->soff + (src_rsm->r_end - src_rsm->r_start); 6848 while (soff >= m->m_len) { 6849 /* Move out past this mbuf */ 6850 soff -= m->m_len; 6851 m = m->m_next; 6852 KASSERT((m != NULL), 6853 ("rsm:%p nrsm:%p hit at soff:%u null m", 6854 src_rsm, rsm, soff)); 6855 if (m == NULL) { 6856 /* This should *not* happen which is why there is a kassert */ 6857 src_rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 6858 (src_rsm->r_start - rack->rc_tp->snd_una), 6859 &src_rsm->soff); 6860 src_rsm->orig_m_len = src_rsm->m->m_len; 6861 src_rsm->orig_t_space = M_TRAILINGROOM(src_rsm->m); 6862 rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 6863 (rsm->r_start - rack->rc_tp->snd_una), 6864 &rsm->soff); 6865 rsm->orig_m_len = rsm->m->m_len; 6866 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 6867 return; 6868 } 6869 } 6870 rsm->m = m; 6871 rsm->soff = soff; 6872 rsm->orig_m_len = m->m_len; 6873 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 6874 } 6875 6876 static inline void 6877 rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm, 6878 struct rack_sendmap *rsm, uint32_t start) 6879 { 6880 int idx; 6881 6882 nrsm->r_start = start; 6883 nrsm->r_end = rsm->r_end; 6884 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 6885 nrsm->r_act_rxt_cnt = rsm->r_act_rxt_cnt; 6886 nrsm->r_flags = rsm->r_flags; 6887 nrsm->r_dupack = rsm->r_dupack; 6888 nrsm->r_no_rtt_allowed = rsm->r_no_rtt_allowed; 6889 nrsm->r_rtr_bytes = 0; 6890 nrsm->r_fas = rsm->r_fas; 6891 nrsm->r_bas = rsm->r_bas; 6892 tqhash_update_end(rack->r_ctl.tqh, rsm, nrsm->r_start); 6893 nrsm->r_just_ret = rsm->r_just_ret; 6894 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 6895 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 6896 } 6897 /* Now if we have SYN flag we keep it on the left edge */ 6898 if (nrsm->r_flags & RACK_HAS_SYN) 6899 nrsm->r_flags &= ~RACK_HAS_SYN; 6900 /* Now if we have a FIN flag we keep it on the right edge */ 6901 if (rsm->r_flags & RACK_HAS_FIN) 6902 rsm->r_flags &= ~RACK_HAS_FIN; 6903 /* Push bit must go to the right edge as well */ 6904 if (rsm->r_flags & RACK_HAD_PUSH) 6905 rsm->r_flags &= ~RACK_HAD_PUSH; 6906 /* Update the count if app limited */ 6907 if (nrsm->r_flags & RACK_APP_LIMITED) 6908 rack->r_ctl.rc_app_limited_cnt++; 6909 /* Clone over the state of the hw_tls flag */ 6910 nrsm->r_hw_tls = rsm->r_hw_tls; 6911 /* 6912 * Now we need to find nrsm's new location in the mbuf chain 6913 * we basically calculate a new offset, which is soff + 6914 * how much is left in original rsm. Then we walk out the mbuf 6915 * chain to find the righ position, it may be the same mbuf 6916 * or maybe not. 6917 */ 6918 KASSERT(((rsm->m != NULL) || 6919 (rsm->r_flags & (RACK_HAS_SYN|RACK_HAS_FIN))), 6920 ("rsm:%p nrsm:%p rack:%p -- rsm->m is NULL?", rsm, nrsm, rack)); 6921 if (rsm->m) 6922 rack_setup_offset_for_rsm(rack, rsm, nrsm); 6923 } 6924 6925 static struct rack_sendmap * 6926 rack_merge_rsm(struct tcp_rack *rack, 6927 struct rack_sendmap *l_rsm, 6928 struct rack_sendmap *r_rsm) 6929 { 6930 /* 6931 * We are merging two ack'd RSM's, 6932 * the l_rsm is on the left (lower seq 6933 * values) and the r_rsm is on the right 6934 * (higher seq value). The simplest way 6935 * to merge these is to move the right 6936 * one into the left. I don't think there 6937 * is any reason we need to try to find 6938 * the oldest (or last oldest retransmitted). 6939 */ 6940 rack_log_map_chg(rack->rc_tp, rack, NULL, 6941 l_rsm, r_rsm, MAP_MERGE, r_rsm->r_end, __LINE__); 6942 tqhash_update_end(rack->r_ctl.tqh, l_rsm, r_rsm->r_end); 6943 if (l_rsm->r_dupack < r_rsm->r_dupack) 6944 l_rsm->r_dupack = r_rsm->r_dupack; 6945 if (r_rsm->r_rtr_bytes) 6946 l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes; 6947 if (r_rsm->r_in_tmap) { 6948 /* This really should not happen */ 6949 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext); 6950 r_rsm->r_in_tmap = 0; 6951 } 6952 6953 /* Now the flags */ 6954 if (r_rsm->r_flags & RACK_HAS_FIN) 6955 l_rsm->r_flags |= RACK_HAS_FIN; 6956 if (r_rsm->r_flags & RACK_TLP) 6957 l_rsm->r_flags |= RACK_TLP; 6958 if (r_rsm->r_flags & RACK_RWND_COLLAPSED) 6959 l_rsm->r_flags |= RACK_RWND_COLLAPSED; 6960 if ((r_rsm->r_flags & RACK_APP_LIMITED) && 6961 ((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) { 6962 /* 6963 * If both are app-limited then let the 6964 * free lower the count. If right is app 6965 * limited and left is not, transfer. 6966 */ 6967 l_rsm->r_flags |= RACK_APP_LIMITED; 6968 r_rsm->r_flags &= ~RACK_APP_LIMITED; 6969 if (r_rsm == rack->r_ctl.rc_first_appl) 6970 rack->r_ctl.rc_first_appl = l_rsm; 6971 } 6972 tqhash_remove(rack->r_ctl.tqh, r_rsm, REMOVE_TYPE_MERGE); 6973 /* 6974 * We keep the largest value, which is the newest 6975 * send. We do this in case a segment that is 6976 * joined together and not part of a GP estimate 6977 * later gets expanded into the GP estimate. 6978 * 6979 * We prohibit the merging of unlike kinds i.e. 6980 * all pieces that are in the GP estimate can be 6981 * merged and all pieces that are not in a GP estimate 6982 * can be merged, but not disimilar pieces. Combine 6983 * this with taking the highest here and we should 6984 * be ok unless of course the client reneges. Then 6985 * all bets are off. 6986 */ 6987 if(l_rsm->r_tim_lastsent[(l_rsm->r_rtr_cnt-1)] < 6988 r_rsm->r_tim_lastsent[(r_rsm->r_rtr_cnt-1)]) { 6989 l_rsm->r_tim_lastsent[(l_rsm->r_rtr_cnt-1)] = r_rsm->r_tim_lastsent[(r_rsm->r_rtr_cnt-1)]; 6990 } 6991 /* 6992 * When merging two RSM's we also need to consider the ack time and keep 6993 * newest. If the ack gets merged into a measurement then that is the 6994 * one we will want to be using. 6995 */ 6996 if(l_rsm->r_ack_arrival < r_rsm->r_ack_arrival) 6997 l_rsm->r_ack_arrival = r_rsm->r_ack_arrival; 6998 6999 if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) { 7000 /* Transfer the split limit to the map we free */ 7001 r_rsm->r_limit_type = l_rsm->r_limit_type; 7002 l_rsm->r_limit_type = 0; 7003 } 7004 rack_free(rack, r_rsm); 7005 l_rsm->r_flags |= RACK_MERGED; 7006 return (l_rsm); 7007 } 7008 7009 /* 7010 * TLP Timer, here we simply setup what segment we want to 7011 * have the TLP expire on, the normal rack_output() will then 7012 * send it out. 7013 * 7014 * We return 1, saying don't proceed with rack_output only 7015 * when all timers have been stopped (destroyed PCB?). 7016 */ 7017 static int 7018 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t *doing_tlp) 7019 { 7020 /* 7021 * Tail Loss Probe. 7022 */ 7023 struct rack_sendmap *rsm = NULL; 7024 int insret __diagused; 7025 struct socket *so = tptosocket(tp); 7026 uint32_t amm; 7027 uint32_t out, avail; 7028 int collapsed_win = 0; 7029 7030 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 7031 /* Its not time yet */ 7032 return (0); 7033 } 7034 if (ctf_progress_timeout_check(tp, true)) { 7035 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 7036 return (-ETIMEDOUT); /* tcp_drop() */ 7037 } 7038 /* 7039 * A TLP timer has expired. We have been idle for 2 rtts. So we now 7040 * need to figure out how to force a full MSS segment out. 7041 */ 7042 rack_log_to_event(rack, RACK_TO_FRM_TLP, NULL); 7043 rack->r_ctl.retran_during_recovery = 0; 7044 rack->r_might_revert = 0; 7045 rack->r_ctl.dsack_byte_cnt = 0; 7046 counter_u64_add(rack_tlp_tot, 1); 7047 if (rack->r_state && (rack->r_state != tp->t_state)) 7048 rack_set_state(tp, rack); 7049 avail = sbavail(&so->so_snd); 7050 out = tp->snd_max - tp->snd_una; 7051 if ((out > tp->snd_wnd) || rack->rc_has_collapsed) { 7052 /* special case, we need a retransmission */ 7053 collapsed_win = 1; 7054 goto need_retran; 7055 } 7056 if (rack->r_ctl.dsack_persist && (rack->r_ctl.rc_tlp_cnt_out >= 1)) { 7057 rack->r_ctl.dsack_persist--; 7058 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) { 7059 rack->r_ctl.num_dsack = 0; 7060 } 7061 rack_log_dsack_event(rack, 1, __LINE__, 0, 0); 7062 } 7063 if ((tp->t_flags & TF_GPUTINPROG) && 7064 (rack->r_ctl.rc_tlp_cnt_out == 1)) { 7065 /* 7066 * If this is the second in a row 7067 * TLP and we are doing a measurement 7068 * its time to abandon the measurement. 7069 * Something is likely broken on 7070 * the clients network and measuring a 7071 * broken network does us no good. 7072 */ 7073 tp->t_flags &= ~TF_GPUTINPROG; 7074 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 7075 rack->r_ctl.rc_gp_srtt /*flex1*/, 7076 tp->gput_seq, 7077 0, 0, 18, __LINE__, NULL, 0); 7078 } 7079 /* 7080 * Check our send oldest always settings, and if 7081 * there is an oldest to send jump to the need_retran. 7082 */ 7083 if (rack_always_send_oldest && (TAILQ_EMPTY(&rack->r_ctl.rc_tmap) == 0)) 7084 goto need_retran; 7085 7086 if (avail > out) { 7087 /* New data is available */ 7088 amm = avail - out; 7089 if (amm > ctf_fixed_maxseg(tp)) { 7090 amm = ctf_fixed_maxseg(tp); 7091 if ((amm + out) > tp->snd_wnd) { 7092 /* We are rwnd limited */ 7093 goto need_retran; 7094 } 7095 } else if (amm < ctf_fixed_maxseg(tp)) { 7096 /* not enough to fill a MTU */ 7097 goto need_retran; 7098 } 7099 if (IN_FASTRECOVERY(tp->t_flags)) { 7100 /* Unlikely */ 7101 if (rack->rack_no_prr == 0) { 7102 if (out + amm <= tp->snd_wnd) { 7103 rack->r_ctl.rc_prr_sndcnt = amm; 7104 rack->r_ctl.rc_tlp_new_data = amm; 7105 rack_log_to_prr(rack, 4, 0, __LINE__); 7106 } 7107 } else 7108 goto need_retran; 7109 } else { 7110 /* Set the send-new override */ 7111 if (out + amm <= tp->snd_wnd) 7112 rack->r_ctl.rc_tlp_new_data = amm; 7113 else 7114 goto need_retran; 7115 } 7116 rack->r_ctl.rc_tlpsend = NULL; 7117 counter_u64_add(rack_tlp_newdata, 1); 7118 goto send; 7119 } 7120 need_retran: 7121 /* 7122 * Ok we need to arrange the last un-acked segment to be re-sent, or 7123 * optionally the first un-acked segment. 7124 */ 7125 if (collapsed_win == 0) { 7126 if (rack_always_send_oldest) 7127 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 7128 else { 7129 rsm = tqhash_max(rack->r_ctl.tqh); 7130 if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { 7131 rsm = rack_find_high_nonack(rack, rsm); 7132 } 7133 } 7134 if (rsm == NULL) { 7135 #ifdef TCP_BLACKBOX 7136 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 7137 #endif 7138 goto out; 7139 } 7140 } else { 7141 /* 7142 * We had a collapsed window, lets find 7143 * the point before the collapse. 7144 */ 7145 if (SEQ_GT((rack->r_ctl.last_collapse_point - 1), rack->rc_tp->snd_una)) 7146 rsm = tqhash_find(rack->r_ctl.tqh, (rack->r_ctl.last_collapse_point - 1)); 7147 else { 7148 rsm = tqhash_min(rack->r_ctl.tqh); 7149 } 7150 if (rsm == NULL) { 7151 /* Huh */ 7152 goto out; 7153 } 7154 } 7155 if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) { 7156 /* 7157 * We need to split this the last segment in two. 7158 */ 7159 struct rack_sendmap *nrsm; 7160 7161 nrsm = rack_alloc_full_limit(rack); 7162 if (nrsm == NULL) { 7163 /* 7164 * No memory to split, we will just exit and punt 7165 * off to the RXT timer. 7166 */ 7167 goto out; 7168 } 7169 rack_clone_rsm(rack, nrsm, rsm, 7170 (rsm->r_end - ctf_fixed_maxseg(tp))); 7171 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); 7172 #ifndef INVARIANTS 7173 (void)tqhash_insert(rack->r_ctl.tqh, nrsm); 7174 #else 7175 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { 7176 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p", 7177 nrsm, insret, rack, rsm); 7178 } 7179 #endif 7180 if (rsm->r_in_tmap) { 7181 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 7182 nrsm->r_in_tmap = 1; 7183 } 7184 rsm = nrsm; 7185 } 7186 rack->r_ctl.rc_tlpsend = rsm; 7187 send: 7188 /* Make sure output path knows we are doing a TLP */ 7189 *doing_tlp = 1; 7190 rack->r_timer_override = 1; 7191 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 7192 return (0); 7193 out: 7194 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 7195 return (0); 7196 } 7197 7198 /* 7199 * Delayed ack Timer, here we simply need to setup the 7200 * ACK_NOW flag and remove the DELACK flag. From there 7201 * the output routine will send the ack out. 7202 * 7203 * We only return 1, saying don't proceed, if all timers 7204 * are stopped (destroyed PCB?). 7205 */ 7206 static int 7207 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 7208 { 7209 7210 rack_log_to_event(rack, RACK_TO_FRM_DELACK, NULL); 7211 tp->t_flags &= ~TF_DELACK; 7212 tp->t_flags |= TF_ACKNOW; 7213 KMOD_TCPSTAT_INC(tcps_delack); 7214 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 7215 return (0); 7216 } 7217 7218 static inline int 7219 rack_send_ack_challange(struct tcp_rack *rack) 7220 { 7221 struct tcptemp *t_template; 7222 7223 t_template = tcpip_maketemplate(rack->rc_inp); 7224 if (t_template) { 7225 if (rack->forced_ack == 0) { 7226 rack->forced_ack = 1; 7227 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); 7228 } else { 7229 rack->probe_not_answered = 1; 7230 } 7231 tcp_respond(rack->rc_tp, t_template->tt_ipgen, 7232 &t_template->tt_t, (struct mbuf *)NULL, 7233 rack->rc_tp->rcv_nxt, rack->rc_tp->snd_una - 1, 0); 7234 free(t_template, M_TEMP); 7235 /* This does send an ack so kill any D-ack timer */ 7236 if (rack->rc_tp->t_flags & TF_DELACK) 7237 rack->rc_tp->t_flags &= ~TF_DELACK; 7238 return(1); 7239 } else 7240 return (0); 7241 7242 } 7243 7244 /* 7245 * Persists timer, here we simply send the 7246 * same thing as a keepalive will. 7247 * the one byte send. 7248 * 7249 * We only return 1, saying don't proceed, if all timers 7250 * are stopped (destroyed PCB?). 7251 */ 7252 static int 7253 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 7254 { 7255 int32_t retval = 1; 7256 7257 if (rack->rc_in_persist == 0) 7258 return (0); 7259 if (ctf_progress_timeout_check(tp, false)) { 7260 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 7261 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 7262 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends); 7263 return (-ETIMEDOUT); /* tcp_drop() */ 7264 } 7265 /* 7266 * Persistence timer into zero window. Force a byte to be output, if 7267 * possible. 7268 */ 7269 KMOD_TCPSTAT_INC(tcps_persisttimeo); 7270 /* 7271 * Hack: if the peer is dead/unreachable, we do not time out if the 7272 * window is closed. After a full backoff, drop the connection if 7273 * the idle time (no responses to probes) reaches the maximum 7274 * backoff that we would use if retransmitting. 7275 */ 7276 if (tp->t_rxtshift >= V_tcp_retries && 7277 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 7278 TICKS_2_USEC(ticks - tp->t_rcvtime) >= RACK_REXMTVAL(tp) * tcp_totbackoff)) { 7279 KMOD_TCPSTAT_INC(tcps_persistdrop); 7280 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 7281 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends); 7282 retval = -ETIMEDOUT; /* tcp_drop() */ 7283 goto out; 7284 } 7285 if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) && 7286 tp->snd_una == tp->snd_max) 7287 rack_exit_persist(tp, rack, cts); 7288 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; 7289 /* 7290 * If the user has closed the socket then drop a persisting 7291 * connection after a much reduced timeout. 7292 */ 7293 if (tp->t_state > TCPS_CLOSE_WAIT && 7294 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 7295 KMOD_TCPSTAT_INC(tcps_persistdrop); 7296 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 7297 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends); 7298 retval = -ETIMEDOUT; /* tcp_drop() */ 7299 goto out; 7300 } 7301 if (rack_send_ack_challange(rack)) { 7302 /* only set it if we were answered */ 7303 if (rack->probe_not_answered) { 7304 counter_u64_add(rack_persists_loss, 1); 7305 rack->r_ctl.persist_lost_ends++; 7306 } 7307 counter_u64_add(rack_persists_sends, 1); 7308 counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1); 7309 } 7310 if (tp->t_rxtshift < V_tcp_retries) 7311 tp->t_rxtshift++; 7312 out: 7313 rack_log_to_event(rack, RACK_TO_FRM_PERSIST, NULL); 7314 rack_start_hpts_timer(rack, tp, cts, 7315 0, 0, 0); 7316 return (retval); 7317 } 7318 7319 /* 7320 * If a keepalive goes off, we had no other timers 7321 * happening. We always return 1 here since this 7322 * routine either drops the connection or sends 7323 * out a segment with respond. 7324 */ 7325 static int 7326 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 7327 { 7328 struct inpcb *inp = tptoinpcb(tp); 7329 7330 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; 7331 rack_log_to_event(rack, RACK_TO_FRM_KEEP, NULL); 7332 /* 7333 * Keep-alive timer went off; send something or drop connection if 7334 * idle for too long. 7335 */ 7336 KMOD_TCPSTAT_INC(tcps_keeptimeo); 7337 if (tp->t_state < TCPS_ESTABLISHED) 7338 goto dropit; 7339 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 7340 tp->t_state <= TCPS_CLOSING) { 7341 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 7342 goto dropit; 7343 /* 7344 * Send a packet designed to force a response if the peer is 7345 * up and reachable: either an ACK if the connection is 7346 * still alive, or an RST if the peer has closed the 7347 * connection due to timeout or reboot. Using sequence 7348 * number tp->snd_una-1 causes the transmitted zero-length 7349 * segment to lie outside the receive window; by the 7350 * protocol spec, this requires the correspondent TCP to 7351 * respond. 7352 */ 7353 KMOD_TCPSTAT_INC(tcps_keepprobe); 7354 rack_send_ack_challange(rack); 7355 } 7356 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 7357 return (1); 7358 dropit: 7359 KMOD_TCPSTAT_INC(tcps_keepdrops); 7360 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); 7361 return (-ETIMEDOUT); /* tcp_drop() */ 7362 } 7363 7364 /* 7365 * Retransmit helper function, clear up all the ack 7366 * flags and take care of important book keeping. 7367 */ 7368 static void 7369 rack_remxt_tmr(struct tcpcb *tp) 7370 { 7371 /* 7372 * The retransmit timer went off, all sack'd blocks must be 7373 * un-acked. 7374 */ 7375 struct rack_sendmap *rsm, *trsm = NULL; 7376 struct tcp_rack *rack; 7377 7378 rack = (struct tcp_rack *)tp->t_fb_ptr; 7379 rack_timer_cancel(tp, rack, tcp_get_usecs(NULL), __LINE__); 7380 rack_log_to_event(rack, RACK_TO_FRM_TMR, NULL); 7381 rack->r_timer_override = 1; 7382 rack->r_ctl.rc_snd_max_at_rto = tp->snd_max; 7383 rack->r_ctl.rc_last_timeout_snduna = tp->snd_una; 7384 rack->r_late = 0; 7385 rack->r_early = 0; 7386 rack->r_ctl.rc_agg_delayed = 0; 7387 rack->r_ctl.rc_agg_early = 0; 7388 if (rack->r_state && (rack->r_state != tp->t_state)) 7389 rack_set_state(tp, rack); 7390 if (tp->t_rxtshift <= rack_rxt_scoreboard_clear_thresh) { 7391 /* 7392 * We do not clear the scoreboard until we have had 7393 * more than rack_rxt_scoreboard_clear_thresh time-outs. 7394 */ 7395 rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 7396 if (rack->r_ctl.rc_resend != NULL) 7397 rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT; 7398 7399 return; 7400 } 7401 /* 7402 * Ideally we would like to be able to 7403 * mark SACK-PASS on anything not acked here. 7404 * 7405 * However, if we do that we would burst out 7406 * all that data 1ms apart. This would be unwise, 7407 * so for now we will just let the normal rxt timer 7408 * and tlp timer take care of it. 7409 * 7410 * Also we really need to stick them back in sequence 7411 * order. This way we send in the proper order and any 7412 * sacks that come floating in will "re-ack" the data. 7413 * To do this we zap the tmap with an INIT and then 7414 * walk through and place every rsm in the tail queue 7415 * hash table back in its seq ordered place. 7416 */ 7417 TAILQ_INIT(&rack->r_ctl.rc_tmap); 7418 7419 TQHASH_FOREACH(rsm, rack->r_ctl.tqh) { 7420 rsm->r_dupack = 0; 7421 if (rack_verbose_logging) 7422 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7423 /* We must re-add it back to the tlist */ 7424 if (trsm == NULL) { 7425 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7426 } else { 7427 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); 7428 } 7429 rsm->r_in_tmap = 1; 7430 trsm = rsm; 7431 if (rsm->r_flags & RACK_ACKED) 7432 rsm->r_flags |= RACK_WAS_ACKED; 7433 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS | RACK_RWND_COLLAPSED | RACK_WAS_LOST); 7434 rsm->r_flags |= RACK_MUST_RXT; 7435 } 7436 /* zero the lost since it's all gone */ 7437 rack->r_ctl.rc_considered_lost = 0; 7438 /* Clear the count (we just un-acked them) */ 7439 rack->r_ctl.rc_sacked = 0; 7440 rack->r_ctl.rc_sacklast = NULL; 7441 /* Clear the tlp rtx mark */ 7442 rack->r_ctl.rc_resend = tqhash_min(rack->r_ctl.tqh); 7443 if (rack->r_ctl.rc_resend != NULL) 7444 rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT; 7445 rack->r_ctl.rc_prr_sndcnt = 0; 7446 rack_log_to_prr(rack, 6, 0, __LINE__); 7447 rack->r_ctl.rc_resend = tqhash_min(rack->r_ctl.tqh); 7448 if (rack->r_ctl.rc_resend != NULL) 7449 rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT; 7450 if (((tp->t_flags & TF_SACK_PERMIT) == 0) && 7451 ((tp->t_flags & TF_SENTFIN) == 0)) { 7452 /* 7453 * For non-sack customers new data 7454 * needs to go out as retransmits until 7455 * we retransmit up to snd_max. 7456 */ 7457 rack->r_must_retran = 1; 7458 rack->r_ctl.rc_out_at_rto = ctf_flight_size(rack->rc_tp, 7459 rack->r_ctl.rc_sacked); 7460 } 7461 } 7462 7463 static void 7464 rack_convert_rtts(struct tcpcb *tp) 7465 { 7466 tcp_change_time_units(tp, TCP_TMR_GRANULARITY_USEC); 7467 tp->t_rxtcur = RACK_REXMTVAL(tp); 7468 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 7469 tp->t_rxtcur += TICKS_2_USEC(tcp_rexmit_slop); 7470 } 7471 if (tp->t_rxtcur > rack_rto_max) { 7472 tp->t_rxtcur = rack_rto_max; 7473 } 7474 } 7475 7476 static void 7477 rack_cc_conn_init(struct tcpcb *tp) 7478 { 7479 struct tcp_rack *rack; 7480 uint32_t srtt; 7481 7482 rack = (struct tcp_rack *)tp->t_fb_ptr; 7483 srtt = tp->t_srtt; 7484 cc_conn_init(tp); 7485 /* 7486 * Now convert to rack's internal format, 7487 * if required. 7488 */ 7489 if ((srtt == 0) && (tp->t_srtt != 0)) 7490 rack_convert_rtts(tp); 7491 /* 7492 * We want a chance to stay in slowstart as 7493 * we create a connection. TCP spec says that 7494 * initially ssthresh is infinite. For our 7495 * purposes that is the snd_wnd. 7496 */ 7497 if (tp->snd_ssthresh < tp->snd_wnd) { 7498 tp->snd_ssthresh = tp->snd_wnd; 7499 } 7500 /* 7501 * We also want to assure a IW worth of 7502 * data can get inflight. 7503 */ 7504 if (rc_init_window(rack) < tp->snd_cwnd) 7505 tp->snd_cwnd = rc_init_window(rack); 7506 } 7507 7508 /* 7509 * Re-transmit timeout! If we drop the PCB we will return 1, otherwise 7510 * we will setup to retransmit the lowest seq number outstanding. 7511 */ 7512 static int 7513 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 7514 { 7515 struct inpcb *inp = tptoinpcb(tp); 7516 int32_t rexmt; 7517 int32_t retval = 0; 7518 bool isipv6; 7519 7520 if ((tp->t_flags & TF_GPUTINPROG) && 7521 (tp->t_rxtshift)) { 7522 /* 7523 * We have had a second timeout 7524 * measurements on successive rxt's are not profitable. 7525 * It is unlikely to be of any use (the network is 7526 * broken or the client went away). 7527 */ 7528 tp->t_flags &= ~TF_GPUTINPROG; 7529 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 7530 rack->r_ctl.rc_gp_srtt /*flex1*/, 7531 tp->gput_seq, 7532 0, 0, 18, __LINE__, NULL, 0); 7533 } 7534 if (ctf_progress_timeout_check(tp, false)) { 7535 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 7536 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 7537 return (-ETIMEDOUT); /* tcp_drop() */ 7538 } 7539 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; 7540 rack->r_ctl.retran_during_recovery = 0; 7541 rack->rc_ack_required = 1; 7542 rack->r_ctl.dsack_byte_cnt = 0; 7543 if (IN_RECOVERY(tp->t_flags) && 7544 (rack->rto_from_rec == 0)) { 7545 /* 7546 * Mark that we had a rto while in recovery 7547 * and save the ssthresh so if we go back 7548 * into recovery we will have a chance 7549 * to slowstart back to the level. 7550 */ 7551 rack->rto_from_rec = 1; 7552 rack->r_ctl.rto_ssthresh = tp->snd_ssthresh; 7553 } 7554 if (IN_FASTRECOVERY(tp->t_flags)) 7555 tp->t_flags |= TF_WASFRECOVERY; 7556 else 7557 tp->t_flags &= ~TF_WASFRECOVERY; 7558 if (IN_CONGRECOVERY(tp->t_flags)) 7559 tp->t_flags |= TF_WASCRECOVERY; 7560 else 7561 tp->t_flags &= ~TF_WASCRECOVERY; 7562 if (TCPS_HAVEESTABLISHED(tp->t_state) && 7563 (tp->snd_una == tp->snd_max)) { 7564 /* Nothing outstanding .. nothing to do */ 7565 return (0); 7566 } 7567 if (rack->r_ctl.dsack_persist) { 7568 rack->r_ctl.dsack_persist--; 7569 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) { 7570 rack->r_ctl.num_dsack = 0; 7571 } 7572 rack_log_dsack_event(rack, 1, __LINE__, 0, 0); 7573 } 7574 /* 7575 * Rack can only run one timer at a time, so we cannot 7576 * run a KEEPINIT (gating SYN sending) and a retransmit 7577 * timer for the SYN. So if we are in a front state and 7578 * have a KEEPINIT timer we need to check the first transmit 7579 * against now to see if we have exceeded the KEEPINIT time 7580 * (if one is set). 7581 */ 7582 if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) && 7583 (TP_KEEPINIT(tp) != 0)) { 7584 struct rack_sendmap *rsm; 7585 7586 rsm = tqhash_min(rack->r_ctl.tqh); 7587 if (rsm) { 7588 /* Ok we have something outstanding to test keepinit with */ 7589 if ((TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) && 7590 ((cts - (uint32_t)rsm->r_tim_lastsent[0]) >= TICKS_2_USEC(TP_KEEPINIT(tp)))) { 7591 /* We have exceeded the KEEPINIT time */ 7592 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); 7593 goto drop_it; 7594 } 7595 } 7596 } 7597 /* 7598 * Retransmission timer went off. Message has not been acked within 7599 * retransmit interval. Back off to a longer retransmit interval 7600 * and retransmit one segment. 7601 */ 7602 if ((rack->r_ctl.rc_resend == NULL) || 7603 ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) { 7604 /* 7605 * If the rwnd collapsed on 7606 * the one we are retransmitting 7607 * it does not count against the 7608 * rxt count. 7609 */ 7610 tp->t_rxtshift++; 7611 } 7612 rack_remxt_tmr(tp); 7613 if (tp->t_rxtshift > V_tcp_retries) { 7614 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 7615 drop_it: 7616 tp->t_rxtshift = V_tcp_retries; 7617 KMOD_TCPSTAT_INC(tcps_timeoutdrop); 7618 /* XXXGL: previously t_softerror was casted to uint16_t */ 7619 MPASS(tp->t_softerror >= 0); 7620 retval = tp->t_softerror ? -tp->t_softerror : -ETIMEDOUT; 7621 goto out; /* tcp_drop() */ 7622 } 7623 if (tp->t_state == TCPS_SYN_SENT) { 7624 /* 7625 * If the SYN was retransmitted, indicate CWND to be limited 7626 * to 1 segment in cc_conn_init(). 7627 */ 7628 tp->snd_cwnd = 1; 7629 } else if (tp->t_rxtshift == 1) { 7630 /* 7631 * first retransmit; record ssthresh and cwnd so they can be 7632 * recovered if this turns out to be a "bad" retransmit. A 7633 * retransmit is considered "bad" if an ACK for this segment 7634 * is received within RTT/2 interval; the assumption here is 7635 * that the ACK was already in flight. See "On Estimating 7636 * End-to-End Network Path Properties" by Allman and Paxson 7637 * for more details. 7638 */ 7639 tp->snd_cwnd_prev = tp->snd_cwnd; 7640 tp->snd_ssthresh_prev = tp->snd_ssthresh; 7641 tp->snd_recover_prev = tp->snd_recover; 7642 tp->t_badrxtwin = ticks + (USEC_2_TICKS(tp->t_srtt)/2); 7643 tp->t_flags |= TF_PREVVALID; 7644 } else if ((tp->t_flags & TF_RCVD_TSTMP) == 0) 7645 tp->t_flags &= ~TF_PREVVALID; 7646 KMOD_TCPSTAT_INC(tcps_rexmttimeo); 7647 if ((tp->t_state == TCPS_SYN_SENT) || 7648 (tp->t_state == TCPS_SYN_RECEIVED)) 7649 rexmt = RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]; 7650 else 7651 rexmt = max(rack_rto_min, (tp->t_srtt + (tp->t_rttvar << 2))) * tcp_backoff[tp->t_rxtshift]; 7652 7653 RACK_TCPT_RANGESET(tp->t_rxtcur, rexmt, 7654 max(rack_rto_min, rexmt), rack_rto_max, rack->r_ctl.timer_slop); 7655 /* 7656 * We enter the path for PLMTUD if connection is established or, if 7657 * connection is FIN_WAIT_1 status, reason for the last is that if 7658 * amount of data we send is very small, we could send it in couple 7659 * of packets and process straight to FIN. In that case we won't 7660 * catch ESTABLISHED state. 7661 */ 7662 #ifdef INET6 7663 isipv6 = (inp->inp_vflag & INP_IPV6) ? true : false; 7664 #else 7665 isipv6 = false; 7666 #endif 7667 if (((V_tcp_pmtud_blackhole_detect == 1) || 7668 (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) || 7669 (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) && 7670 ((tp->t_state == TCPS_ESTABLISHED) || 7671 (tp->t_state == TCPS_FIN_WAIT_1))) { 7672 /* 7673 * Idea here is that at each stage of mtu probe (usually, 7674 * 1448 -> 1188 -> 524) should be given 2 chances to recover 7675 * before further clamping down. 'tp->t_rxtshift % 2 == 0' 7676 * should take care of that. 7677 */ 7678 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == 7679 (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && 7680 (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && 7681 tp->t_rxtshift % 2 == 0)) { 7682 /* 7683 * Enter Path MTU Black-hole Detection mechanism: - 7684 * Disable Path MTU Discovery (IP "DF" bit). - 7685 * Reduce MTU to lower value than what we negotiated 7686 * with peer. 7687 */ 7688 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { 7689 /* Record that we may have found a black hole. */ 7690 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 7691 /* Keep track of previous MSS. */ 7692 tp->t_pmtud_saved_maxseg = tp->t_maxseg; 7693 } 7694 7695 /* 7696 * Reduce the MSS to blackhole value or to the 7697 * default in an attempt to retransmit. 7698 */ 7699 #ifdef INET6 7700 if (isipv6 && 7701 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { 7702 /* Use the sysctl tuneable blackhole MSS. */ 7703 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; 7704 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 7705 } else if (isipv6) { 7706 /* Use the default MSS. */ 7707 tp->t_maxseg = V_tcp_v6mssdflt; 7708 /* 7709 * Disable Path MTU Discovery when we switch 7710 * to minmss. 7711 */ 7712 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 7713 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 7714 } 7715 #endif 7716 #if defined(INET6) && defined(INET) 7717 else 7718 #endif 7719 #ifdef INET 7720 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { 7721 /* Use the sysctl tuneable blackhole MSS. */ 7722 tp->t_maxseg = V_tcp_pmtud_blackhole_mss; 7723 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 7724 } else { 7725 /* Use the default MSS. */ 7726 tp->t_maxseg = V_tcp_mssdflt; 7727 /* 7728 * Disable Path MTU Discovery when we switch 7729 * to minmss. 7730 */ 7731 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 7732 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 7733 } 7734 #endif 7735 } else { 7736 /* 7737 * If further retransmissions are still unsuccessful 7738 * with a lowered MTU, maybe this isn't a blackhole 7739 * and we restore the previous MSS and blackhole 7740 * detection flags. The limit '6' is determined by 7741 * giving each probe stage (1448, 1188, 524) 2 7742 * chances to recover. 7743 */ 7744 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 7745 (tp->t_rxtshift >= 6)) { 7746 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 7747 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 7748 tp->t_maxseg = tp->t_pmtud_saved_maxseg; 7749 if (tp->t_maxseg < V_tcp_mssdflt) { 7750 /* 7751 * The MSS is so small we should not 7752 * process incoming SACK's since we are 7753 * subject to attack in such a case. 7754 */ 7755 tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT; 7756 } else { 7757 tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT; 7758 } 7759 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed); 7760 } 7761 } 7762 } 7763 /* 7764 * Disable RFC1323 and SACK if we haven't got any response to 7765 * our third SYN to work-around some broken terminal servers 7766 * (most of which have hopefully been retired) that have bad VJ 7767 * header compression code which trashes TCP segments containing 7768 * unknown-to-them TCP options. 7769 */ 7770 if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && 7771 (tp->t_rxtshift == 3)) 7772 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); 7773 /* 7774 * If we backed off this far, our srtt estimate is probably bogus. 7775 * Clobber it so we'll take the next rtt measurement as our srtt; 7776 * move the current srtt into rttvar to keep the current retransmit 7777 * times until then. 7778 */ 7779 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 7780 #ifdef INET6 7781 if ((inp->inp_vflag & INP_IPV6) != 0) 7782 in6_losing(inp); 7783 else 7784 #endif 7785 in_losing(inp); 7786 tp->t_rttvar += tp->t_srtt; 7787 tp->t_srtt = 0; 7788 } 7789 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 7790 tp->snd_recover = tp->snd_max; 7791 tp->t_flags |= TF_ACKNOW; 7792 tp->t_rtttime = 0; 7793 rack_cong_signal(tp, CC_RTO, tp->snd_una, __LINE__); 7794 out: 7795 return (retval); 7796 } 7797 7798 static int 7799 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling, uint8_t *doing_tlp) 7800 { 7801 int32_t ret = 0; 7802 int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK); 7803 7804 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 7805 (tp->t_flags & TF_GPUTINPROG)) { 7806 /* 7807 * We have a goodput in progress 7808 * and we have entered a late state. 7809 * Do we have enough data in the sb 7810 * to handle the GPUT request? 7811 */ 7812 uint32_t bytes; 7813 7814 bytes = tp->gput_ack - tp->gput_seq; 7815 if (SEQ_GT(tp->gput_seq, tp->snd_una)) 7816 bytes += tp->gput_seq - tp->snd_una; 7817 if (bytes > sbavail(&tptosocket(tp)->so_snd)) { 7818 /* 7819 * There are not enough bytes in the socket 7820 * buffer that have been sent to cover this 7821 * measurement. Cancel it. 7822 */ 7823 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 7824 rack->r_ctl.rc_gp_srtt /*flex1*/, 7825 tp->gput_seq, 7826 0, 0, 18, __LINE__, NULL, 0); 7827 tp->t_flags &= ~TF_GPUTINPROG; 7828 } 7829 } 7830 if (timers == 0) { 7831 return (0); 7832 } 7833 if (tp->t_state == TCPS_LISTEN) { 7834 /* no timers on listen sockets */ 7835 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) 7836 return (0); 7837 return (1); 7838 } 7839 if ((timers & PACE_TMR_RACK) && 7840 rack->rc_on_min_to) { 7841 /* 7842 * For the rack timer when we 7843 * are on a min-timeout (which means rrr_conf = 3) 7844 * we don't want to check the timer. It may 7845 * be going off for a pace and thats ok we 7846 * want to send the retransmit (if its ready). 7847 * 7848 * If its on a normal rack timer (non-min) then 7849 * we will check if its expired. 7850 */ 7851 goto skip_time_check; 7852 } 7853 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 7854 uint32_t left; 7855 7856 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 7857 ret = -1; 7858 rack_log_to_processing(rack, cts, ret, 0); 7859 return (0); 7860 } 7861 if (hpts_calling == 0) { 7862 /* 7863 * A user send or queued mbuf (sack) has called us? We 7864 * return 0 and let the pacing guards 7865 * deal with it if they should or 7866 * should not cause a send. 7867 */ 7868 ret = -2; 7869 rack_log_to_processing(rack, cts, ret, 0); 7870 return (0); 7871 } 7872 /* 7873 * Ok our timer went off early and we are not paced false 7874 * alarm, go back to sleep. We make sure we don't have 7875 * no-sack wakeup on since we no longer have a PKT_OUTPUT 7876 * flag in place. 7877 */ 7878 rack->rc_tp->t_flags2 &= ~TF2_DONT_SACK_QUEUE; 7879 ret = -3; 7880 left = rack->r_ctl.rc_timer_exp - cts; 7881 tcp_hpts_insert(tp, left, NULL); 7882 rack_log_to_processing(rack, cts, ret, left); 7883 return (1); 7884 } 7885 skip_time_check: 7886 rack->rc_tmr_stopped = 0; 7887 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; 7888 if (timers & PACE_TMR_DELACK) { 7889 ret = rack_timeout_delack(tp, rack, cts); 7890 } else if (timers & PACE_TMR_RACK) { 7891 rack->r_ctl.rc_tlp_rxt_last_time = cts; 7892 rack->r_fast_output = 0; 7893 ret = rack_timeout_rack(tp, rack, cts); 7894 } else if (timers & PACE_TMR_TLP) { 7895 rack->r_ctl.rc_tlp_rxt_last_time = cts; 7896 rack->r_fast_output = 0; 7897 ret = rack_timeout_tlp(tp, rack, cts, doing_tlp); 7898 } else if (timers & PACE_TMR_RXT) { 7899 rack->r_ctl.rc_tlp_rxt_last_time = cts; 7900 rack->r_fast_output = 0; 7901 ret = rack_timeout_rxt(tp, rack, cts); 7902 } else if (timers & PACE_TMR_PERSIT) { 7903 ret = rack_timeout_persist(tp, rack, cts); 7904 } else if (timers & PACE_TMR_KEEP) { 7905 ret = rack_timeout_keepalive(tp, rack, cts); 7906 } 7907 rack_log_to_processing(rack, cts, ret, timers); 7908 return (ret); 7909 } 7910 7911 static void 7912 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line) 7913 { 7914 struct timeval tv; 7915 uint32_t us_cts, flags_on_entry; 7916 uint8_t hpts_removed = 0; 7917 7918 flags_on_entry = rack->r_ctl.rc_hpts_flags; 7919 us_cts = tcp_get_usecs(&tv); 7920 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 7921 ((TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) || 7922 ((tp->snd_max - tp->snd_una) == 0))) { 7923 tcp_hpts_remove(rack->rc_tp); 7924 hpts_removed = 1; 7925 /* If we were not delayed cancel out the flag. */ 7926 if ((tp->snd_max - tp->snd_una) == 0) 7927 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 7928 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry); 7929 } 7930 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 7931 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 7932 if (tcp_in_hpts(rack->rc_tp) && 7933 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) { 7934 /* 7935 * Canceling timer's when we have no output being 7936 * paced. We also must remove ourselves from the 7937 * hpts. 7938 */ 7939 tcp_hpts_remove(rack->rc_tp); 7940 hpts_removed = 1; 7941 } 7942 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK); 7943 } 7944 if (hpts_removed == 0) 7945 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry); 7946 } 7947 7948 static int 7949 rack_stopall(struct tcpcb *tp) 7950 { 7951 struct tcp_rack *rack; 7952 7953 rack = (struct tcp_rack *)tp->t_fb_ptr; 7954 rack->t_timers_stopped = 1; 7955 7956 tcp_hpts_remove(tp); 7957 7958 return (0); 7959 } 7960 7961 static void 7962 rack_stop_all_timers(struct tcpcb *tp, struct tcp_rack *rack) 7963 { 7964 /* 7965 * Assure no timers are running. 7966 */ 7967 if (tcp_timer_active(tp, TT_PERSIST)) { 7968 /* We enter in persists, set the flag appropriately */ 7969 rack->rc_in_persist = 1; 7970 } 7971 if (tcp_in_hpts(rack->rc_tp)) { 7972 tcp_hpts_remove(rack->rc_tp); 7973 } 7974 } 7975 7976 static void 7977 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, 7978 struct rack_sendmap *rsm, uint64_t ts, uint32_t add_flag, int segsiz) 7979 { 7980 int32_t idx; 7981 7982 rsm->r_rtr_cnt++; 7983 if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) { 7984 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS; 7985 rsm->r_flags |= RACK_OVERMAX; 7986 } 7987 rsm->r_act_rxt_cnt++; 7988 /* Peg the count/index */ 7989 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7990 rsm->r_dupack = 0; 7991 if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & RACK_TLP) == 0)) { 7992 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start); 7993 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start); 7994 } 7995 if (rsm->r_flags & RACK_WAS_LOST) { 7996 /* 7997 * We retransmitted it putting it back in flight 7998 * remove the lost desgination and reduce the 7999 * bytes considered lost. 8000 */ 8001 rack_mark_nolonger_lost(rack, rsm); 8002 } 8003 idx = rsm->r_rtr_cnt - 1; 8004 rsm->r_tim_lastsent[idx] = ts; 8005 /* 8006 * Here we don't add in the len of send, since its already 8007 * in snduna <->snd_max. 8008 */ 8009 rsm->r_fas = ctf_flight_size(rack->rc_tp, 8010 rack->r_ctl.rc_sacked); 8011 if (rsm->r_flags & RACK_ACKED) { 8012 /* Problably MTU discovery messing with us */ 8013 rsm->r_flags &= ~RACK_ACKED; 8014 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 8015 } 8016 if (rsm->r_in_tmap) { 8017 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 8018 rsm->r_in_tmap = 0; 8019 } 8020 /* Lets make sure it really is in or not the GP window */ 8021 rack_mark_in_gp_win(tp, rsm); 8022 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 8023 rsm->r_in_tmap = 1; 8024 rsm->r_bas = (uint8_t)(((rsm->r_end - rsm->r_start) + segsiz - 1) / segsiz); 8025 /* Take off the must retransmit flag, if its on */ 8026 if (rsm->r_flags & RACK_MUST_RXT) { 8027 if (rack->r_must_retran) 8028 rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start); 8029 if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) { 8030 /* 8031 * We have retransmitted all we need. Clear 8032 * any must retransmit flags. 8033 */ 8034 rack->r_must_retran = 0; 8035 rack->r_ctl.rc_out_at_rto = 0; 8036 } 8037 rsm->r_flags &= ~RACK_MUST_RXT; 8038 } 8039 /* Remove any collapsed flag */ 8040 rsm->r_flags &= ~RACK_RWND_COLLAPSED; 8041 if (rsm->r_flags & RACK_SACK_PASSED) { 8042 /* We have retransmitted due to the SACK pass */ 8043 rsm->r_flags &= ~RACK_SACK_PASSED; 8044 rsm->r_flags |= RACK_WAS_SACKPASS; 8045 } 8046 } 8047 8048 static uint32_t 8049 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 8050 struct rack_sendmap *rsm, uint64_t ts, int32_t *lenp, uint32_t add_flag, int segsiz) 8051 { 8052 /* 8053 * We (re-)transmitted starting at rsm->r_start for some length 8054 * (possibly less than r_end. 8055 */ 8056 struct rack_sendmap *nrsm; 8057 int insret __diagused; 8058 uint32_t c_end; 8059 int32_t len; 8060 8061 len = *lenp; 8062 c_end = rsm->r_start + len; 8063 if (SEQ_GEQ(c_end, rsm->r_end)) { 8064 /* 8065 * We retransmitted the whole piece or more than the whole 8066 * slopping into the next rsm. 8067 */ 8068 rack_update_rsm(tp, rack, rsm, ts, add_flag, segsiz); 8069 if (c_end == rsm->r_end) { 8070 *lenp = 0; 8071 return (0); 8072 } else { 8073 int32_t act_len; 8074 8075 /* Hangs over the end return whats left */ 8076 act_len = rsm->r_end - rsm->r_start; 8077 *lenp = (len - act_len); 8078 return (rsm->r_end); 8079 } 8080 /* We don't get out of this block. */ 8081 } 8082 /* 8083 * Here we retransmitted less than the whole thing which means we 8084 * have to split this into what was transmitted and what was not. 8085 */ 8086 nrsm = rack_alloc_full_limit(rack); 8087 if (nrsm == NULL) { 8088 /* 8089 * We can't get memory, so lets not proceed. 8090 */ 8091 *lenp = 0; 8092 return (0); 8093 } 8094 /* 8095 * So here we are going to take the original rsm and make it what we 8096 * retransmitted. nrsm will be the tail portion we did not 8097 * retransmit. For example say the chunk was 1, 11 (10 bytes). And 8098 * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to 8099 * 1, 6 and the new piece will be 6, 11. 8100 */ 8101 rack_clone_rsm(rack, nrsm, rsm, c_end); 8102 nrsm->r_dupack = 0; 8103 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); 8104 #ifndef INVARIANTS 8105 (void)tqhash_insert(rack->r_ctl.tqh, nrsm); 8106 #else 8107 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { 8108 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p", 8109 nrsm, insret, rack, rsm); 8110 } 8111 #endif 8112 if (rsm->r_in_tmap) { 8113 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 8114 nrsm->r_in_tmap = 1; 8115 } 8116 rsm->r_flags &= (~RACK_HAS_FIN); 8117 rack_update_rsm(tp, rack, rsm, ts, add_flag, segsiz); 8118 /* Log a split of rsm into rsm and nrsm */ 8119 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); 8120 *lenp = 0; 8121 return (0); 8122 } 8123 8124 static void 8125 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 8126 uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t cts, 8127 struct rack_sendmap *hintrsm, uint32_t add_flag, struct mbuf *s_mb, 8128 uint32_t s_moff, int hw_tls, int segsiz) 8129 { 8130 struct tcp_rack *rack; 8131 struct rack_sendmap *rsm, *nrsm; 8132 int insret __diagused; 8133 8134 register uint32_t snd_max, snd_una; 8135 8136 /* 8137 * Add to the RACK log of packets in flight or retransmitted. If 8138 * there is a TS option we will use the TS echoed, if not we will 8139 * grab a TS. 8140 * 8141 * Retransmissions will increment the count and move the ts to its 8142 * proper place. Note that if options do not include TS's then we 8143 * won't be able to effectively use the ACK for an RTT on a retran. 8144 * 8145 * Notes about r_start and r_end. Lets consider a send starting at 8146 * sequence 1 for 10 bytes. In such an example the r_start would be 8147 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. 8148 * This means that r_end is actually the first sequence for the next 8149 * slot (11). 8150 * 8151 */ 8152 /* 8153 * If err is set what do we do XXXrrs? should we not add the thing? 8154 * -- i.e. return if err != 0 or should we pretend we sent it? -- 8155 * i.e. proceed with add ** do this for now. 8156 */ 8157 INP_WLOCK_ASSERT(tptoinpcb(tp)); 8158 if (err) 8159 /* 8160 * We don't log errors -- we could but snd_max does not 8161 * advance in this case either. 8162 */ 8163 return; 8164 8165 if (th_flags & TH_RST) { 8166 /* 8167 * We don't log resets and we return immediately from 8168 * sending 8169 */ 8170 return; 8171 } 8172 rack = (struct tcp_rack *)tp->t_fb_ptr; 8173 snd_una = tp->snd_una; 8174 snd_max = tp->snd_max; 8175 if (th_flags & (TH_SYN | TH_FIN)) { 8176 /* 8177 * The call to rack_log_output is made before bumping 8178 * snd_max. This means we can record one extra byte on a SYN 8179 * or FIN if seq_out is adding more on and a FIN is present 8180 * (and we are not resending). 8181 */ 8182 if ((th_flags & TH_SYN) && (seq_out == tp->iss)) 8183 len++; 8184 if (th_flags & TH_FIN) 8185 len++; 8186 } 8187 if (SEQ_LEQ((seq_out + len), snd_una)) { 8188 /* Are sending an old segment to induce an ack (keep-alive)? */ 8189 return; 8190 } 8191 if (SEQ_LT(seq_out, snd_una)) { 8192 /* huh? should we panic? */ 8193 uint32_t end; 8194 8195 end = seq_out + len; 8196 seq_out = snd_una; 8197 if (SEQ_GEQ(end, seq_out)) 8198 len = end - seq_out; 8199 else 8200 len = 0; 8201 } 8202 if (len == 0) { 8203 /* We don't log zero window probes */ 8204 return; 8205 } 8206 if (IN_FASTRECOVERY(tp->t_flags)) { 8207 rack->r_ctl.rc_prr_out += len; 8208 } 8209 /* First question is it a retransmission or new? */ 8210 if (seq_out == snd_max) { 8211 /* Its new */ 8212 rack_chk_req_and_hybrid_on_out(rack, seq_out, len, cts); 8213 again: 8214 rsm = rack_alloc(rack); 8215 if (rsm == NULL) { 8216 /* 8217 * Hmm out of memory and the tcb got destroyed while 8218 * we tried to wait. 8219 */ 8220 return; 8221 } 8222 if (th_flags & TH_FIN) { 8223 rsm->r_flags = RACK_HAS_FIN|add_flag; 8224 } else { 8225 rsm->r_flags = add_flag; 8226 } 8227 if (hw_tls) 8228 rsm->r_hw_tls = 1; 8229 rsm->r_tim_lastsent[0] = cts; 8230 rsm->r_rtr_cnt = 1; 8231 rsm->r_act_rxt_cnt = 0; 8232 rsm->r_rtr_bytes = 0; 8233 if (th_flags & TH_SYN) { 8234 /* The data space is one beyond snd_una */ 8235 rsm->r_flags |= RACK_HAS_SYN; 8236 } 8237 rsm->r_start = seq_out; 8238 rsm->r_end = rsm->r_start + len; 8239 rack_mark_in_gp_win(tp, rsm); 8240 rsm->r_dupack = 0; 8241 /* 8242 * save off the mbuf location that 8243 * sndmbuf_noadv returned (which is 8244 * where we started copying from).. 8245 */ 8246 rsm->m = s_mb; 8247 rsm->soff = s_moff; 8248 /* 8249 * Here we do add in the len of send, since its not yet 8250 * reflected in in snduna <->snd_max 8251 */ 8252 rsm->r_fas = (ctf_flight_size(rack->rc_tp, 8253 rack->r_ctl.rc_sacked) + 8254 (rsm->r_end - rsm->r_start)); 8255 if ((rack->rc_initial_ss_comp == 0) && 8256 (rack->r_ctl.ss_hi_fs < rsm->r_fas)) { 8257 rack->r_ctl.ss_hi_fs = rsm->r_fas; 8258 } 8259 /* rsm->m will be NULL if RACK_HAS_SYN or RACK_HAS_FIN is set */ 8260 if (rsm->m) { 8261 if (rsm->m->m_len <= rsm->soff) { 8262 /* 8263 * XXXrrs Question, will this happen? 8264 * 8265 * If sbsndptr is set at the correct place 8266 * then s_moff should always be somewhere 8267 * within rsm->m. But if the sbsndptr was 8268 * off then that won't be true. If it occurs 8269 * we need to walkout to the correct location. 8270 */ 8271 struct mbuf *lm; 8272 8273 lm = rsm->m; 8274 while (lm->m_len <= rsm->soff) { 8275 rsm->soff -= lm->m_len; 8276 lm = lm->m_next; 8277 KASSERT(lm != NULL, ("%s rack:%p lm goes null orig_off:%u origmb:%p rsm->soff:%u", 8278 __func__, rack, s_moff, s_mb, rsm->soff)); 8279 } 8280 rsm->m = lm; 8281 } 8282 rsm->orig_m_len = rsm->m->m_len; 8283 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 8284 } else { 8285 rsm->orig_m_len = 0; 8286 rsm->orig_t_space = 0; 8287 } 8288 rsm->r_bas = (uint8_t)((len + segsiz - 1) / segsiz); 8289 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 8290 /* Log a new rsm */ 8291 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_NEW, 0, __LINE__); 8292 #ifndef INVARIANTS 8293 (void)tqhash_insert(rack->r_ctl.tqh, rsm); 8294 #else 8295 if ((insret = tqhash_insert(rack->r_ctl.tqh, rsm)) != 0) { 8296 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p", 8297 nrsm, insret, rack, rsm); 8298 } 8299 #endif 8300 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 8301 rsm->r_in_tmap = 1; 8302 if (rsm->r_flags & RACK_IS_PCM) { 8303 rack->r_ctl.pcm_i.send_time = cts; 8304 rack->r_ctl.pcm_i.eseq = rsm->r_end; 8305 /* First time through we set the start too */ 8306 if (rack->pcm_in_progress == 0) 8307 rack->r_ctl.pcm_i.sseq = rsm->r_start; 8308 } 8309 /* 8310 * Special case detection, is there just a single 8311 * packet outstanding when we are not in recovery? 8312 * 8313 * If this is true mark it so. 8314 */ 8315 if ((IN_FASTRECOVERY(tp->t_flags) == 0) && 8316 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) == ctf_fixed_maxseg(tp))) { 8317 struct rack_sendmap *prsm; 8318 8319 prsm = tqhash_prev(rack->r_ctl.tqh, rsm); 8320 if (prsm) 8321 prsm->r_one_out_nr = 1; 8322 } 8323 return; 8324 } 8325 /* 8326 * If we reach here its a retransmission and we need to find it. 8327 */ 8328 more: 8329 if (hintrsm && (hintrsm->r_start == seq_out)) { 8330 rsm = hintrsm; 8331 hintrsm = NULL; 8332 } else { 8333 /* No hints sorry */ 8334 rsm = NULL; 8335 } 8336 if ((rsm) && (rsm->r_start == seq_out)) { 8337 seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag, segsiz); 8338 if (len == 0) { 8339 return; 8340 } else { 8341 goto more; 8342 } 8343 } 8344 /* Ok it was not the last pointer go through it the hard way. */ 8345 refind: 8346 rsm = tqhash_find(rack->r_ctl.tqh, seq_out); 8347 if (rsm) { 8348 if (rsm->r_start == seq_out) { 8349 seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag, segsiz); 8350 if (len == 0) { 8351 return; 8352 } else { 8353 goto refind; 8354 } 8355 } 8356 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { 8357 /* Transmitted within this piece */ 8358 /* 8359 * Ok we must split off the front and then let the 8360 * update do the rest 8361 */ 8362 nrsm = rack_alloc_full_limit(rack); 8363 if (nrsm == NULL) { 8364 rack_update_rsm(tp, rack, rsm, cts, add_flag, segsiz); 8365 return; 8366 } 8367 /* 8368 * copy rsm to nrsm and then trim the front of rsm 8369 * to not include this part. 8370 */ 8371 rack_clone_rsm(rack, nrsm, rsm, seq_out); 8372 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); 8373 #ifndef INVARIANTS 8374 (void)tqhash_insert(rack->r_ctl.tqh, nrsm); 8375 #else 8376 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { 8377 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p", 8378 nrsm, insret, rack, rsm); 8379 } 8380 #endif 8381 if (rsm->r_in_tmap) { 8382 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 8383 nrsm->r_in_tmap = 1; 8384 } 8385 rsm->r_flags &= (~RACK_HAS_FIN); 8386 seq_out = rack_update_entry(tp, rack, nrsm, cts, &len, add_flag, segsiz); 8387 if (len == 0) { 8388 return; 8389 } else if (len > 0) 8390 goto refind; 8391 } 8392 } 8393 /* 8394 * Hmm not found in map did they retransmit both old and on into the 8395 * new? 8396 */ 8397 if (seq_out == tp->snd_max) { 8398 goto again; 8399 } else if (SEQ_LT(seq_out, tp->snd_max)) { 8400 #ifdef INVARIANTS 8401 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", 8402 seq_out, len, tp->snd_una, tp->snd_max); 8403 printf("Starting Dump of all rack entries\n"); 8404 TQHASH_FOREACH(rsm, rack->r_ctl.tqh) { 8405 printf("rsm:%p start:%u end:%u\n", 8406 rsm, rsm->r_start, rsm->r_end); 8407 } 8408 printf("Dump complete\n"); 8409 panic("seq_out not found rack:%p tp:%p", 8410 rack, tp); 8411 #endif 8412 } else { 8413 #ifdef INVARIANTS 8414 /* 8415 * Hmm beyond sndmax? (only if we are using the new rtt-pack 8416 * flag) 8417 */ 8418 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", 8419 seq_out, len, tp->snd_max, tp); 8420 #endif 8421 } 8422 } 8423 8424 /* 8425 * Record one of the RTT updates from an ack into 8426 * our sample structure. 8427 */ 8428 8429 static void 8430 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, uint32_t len, uint32_t us_rtt, 8431 int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt) 8432 { 8433 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 8434 (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) { 8435 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt; 8436 } 8437 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 8438 (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) { 8439 rack->r_ctl.rack_rs.rs_rtt_highest = rtt; 8440 } 8441 if (rack->rc_tp->t_flags & TF_GPUTINPROG) { 8442 if (us_rtt < rack->r_ctl.rc_gp_lowrtt) 8443 rack->r_ctl.rc_gp_lowrtt = us_rtt; 8444 if (rack->rc_tp->snd_wnd > rack->r_ctl.rc_gp_high_rwnd) 8445 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 8446 } 8447 if ((confidence == 1) && 8448 ((rsm == NULL) || 8449 (rsm->r_just_ret) || 8450 (rsm->r_one_out_nr && 8451 len < (ctf_fixed_maxseg(rack->rc_tp) * 2)))) { 8452 /* 8453 * If the rsm had a just return 8454 * hit it then we can't trust the 8455 * rtt measurement for buffer deterimination 8456 * Note that a confidence of 2, indicates 8457 * SACK'd which overrides the r_just_ret or 8458 * the r_one_out_nr. If it was a CUM-ACK and 8459 * we had only two outstanding, but get an 8460 * ack for only 1. Then that also lowers our 8461 * confidence. 8462 */ 8463 confidence = 0; 8464 } 8465 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 8466 (rack->r_ctl.rack_rs.rs_us_rtt > us_rtt)) { 8467 if (rack->r_ctl.rack_rs.confidence == 0) { 8468 /* 8469 * We take anything with no current confidence 8470 * saved. 8471 */ 8472 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; 8473 rack->r_ctl.rack_rs.confidence = confidence; 8474 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; 8475 } else if (confidence != 0) { 8476 /* 8477 * Once we have a confident number, 8478 * we can update it with a smaller 8479 * value since this confident number 8480 * may include the DSACK time until 8481 * the next segment (the second one) arrived. 8482 */ 8483 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; 8484 rack->r_ctl.rack_rs.confidence = confidence; 8485 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; 8486 } 8487 } 8488 rack_log_rtt_upd(rack->rc_tp, rack, us_rtt, len, rsm, confidence); 8489 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID; 8490 rack->r_ctl.rack_rs.rs_rtt_tot += rtt; 8491 rack->r_ctl.rack_rs.rs_rtt_cnt++; 8492 } 8493 8494 /* 8495 * Collect new round-trip time estimate 8496 * and update averages and current timeout. 8497 */ 8498 static void 8499 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) 8500 { 8501 int32_t delta; 8502 int32_t rtt; 8503 8504 if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) 8505 /* No valid sample */ 8506 return; 8507 if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) { 8508 /* We are to use the lowest RTT seen in a single ack */ 8509 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; 8510 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) { 8511 /* We are to use the highest RTT seen in a single ack */ 8512 rtt = rack->r_ctl.rack_rs.rs_rtt_highest; 8513 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) { 8514 /* We are to use the average RTT seen in a single ack */ 8515 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot / 8516 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt); 8517 } else { 8518 #ifdef INVARIANTS 8519 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method); 8520 #endif 8521 return; 8522 } 8523 if (rtt == 0) 8524 rtt = 1; 8525 if (rack->rc_gp_rtt_set == 0) { 8526 /* 8527 * With no RTT we have to accept 8528 * even one we are not confident of. 8529 */ 8530 rack->r_ctl.rc_gp_srtt = rack->r_ctl.rack_rs.rs_us_rtt; 8531 rack->rc_gp_rtt_set = 1; 8532 } else if (rack->r_ctl.rack_rs.confidence) { 8533 /* update the running gp srtt */ 8534 rack->r_ctl.rc_gp_srtt -= (rack->r_ctl.rc_gp_srtt/8); 8535 rack->r_ctl.rc_gp_srtt += rack->r_ctl.rack_rs.rs_us_rtt / 8; 8536 } 8537 if (rack->r_ctl.rack_rs.confidence) { 8538 /* 8539 * record the low and high for highly buffered path computation, 8540 * we only do this if we are confident (not a retransmission). 8541 */ 8542 if (rack->r_ctl.rc_highest_us_rtt < rack->r_ctl.rack_rs.rs_us_rtt) { 8543 rack->r_ctl.rc_highest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 8544 } 8545 if (rack->rc_highly_buffered == 0) { 8546 /* 8547 * Currently once we declare a path has 8548 * highly buffered there is no going 8549 * back, which may be a problem... 8550 */ 8551 if ((rack->r_ctl.rc_highest_us_rtt / rack->r_ctl.rc_lowest_us_rtt) > rack_hbp_thresh) { 8552 rack_log_rtt_shrinks(rack, rack->r_ctl.rack_rs.rs_us_rtt, 8553 rack->r_ctl.rc_highest_us_rtt, 8554 rack->r_ctl.rc_lowest_us_rtt, 8555 RACK_RTTS_SEEHBP); 8556 rack->rc_highly_buffered = 1; 8557 } 8558 } 8559 } 8560 if ((rack->r_ctl.rack_rs.confidence) || 8561 (rack->r_ctl.rack_rs.rs_us_rtrcnt == 1)) { 8562 /* 8563 * If we are highly confident of it <or> it was 8564 * never retransmitted we accept it as the last us_rtt. 8565 */ 8566 rack->r_ctl.rc_last_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 8567 /* The lowest rtt can be set if its was not retransmited */ 8568 if (rack->r_ctl.rc_lowest_us_rtt > rack->r_ctl.rack_rs.rs_us_rtt) { 8569 rack->r_ctl.rc_lowest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 8570 if (rack->r_ctl.rc_lowest_us_rtt == 0) 8571 rack->r_ctl.rc_lowest_us_rtt = 1; 8572 } 8573 } 8574 rack = (struct tcp_rack *)tp->t_fb_ptr; 8575 if (tp->t_srtt != 0) { 8576 /* 8577 * We keep a simple srtt in microseconds, like our rtt 8578 * measurement. We don't need to do any tricks with shifting 8579 * etc. Instead we just add in 1/8th of the new measurement 8580 * and subtract out 1/8 of the old srtt. We do the same with 8581 * the variance after finding the absolute value of the 8582 * difference between this sample and the current srtt. 8583 */ 8584 delta = tp->t_srtt - rtt; 8585 /* Take off 1/8th of the current sRTT */ 8586 tp->t_srtt -= (tp->t_srtt >> 3); 8587 /* Add in 1/8th of the new RTT just measured */ 8588 tp->t_srtt += (rtt >> 3); 8589 if (tp->t_srtt <= 0) 8590 tp->t_srtt = 1; 8591 /* Now lets make the absolute value of the variance */ 8592 if (delta < 0) 8593 delta = -delta; 8594 /* Subtract out 1/8th */ 8595 tp->t_rttvar -= (tp->t_rttvar >> 3); 8596 /* Add in 1/8th of the new variance we just saw */ 8597 tp->t_rttvar += (delta >> 3); 8598 if (tp->t_rttvar <= 0) 8599 tp->t_rttvar = 1; 8600 } else { 8601 /* 8602 * No rtt measurement yet - use the unsmoothed rtt. Set the 8603 * variance to half the rtt (so our first retransmit happens 8604 * at 3*rtt). 8605 */ 8606 tp->t_srtt = rtt; 8607 tp->t_rttvar = rtt >> 1; 8608 } 8609 rack->rc_srtt_measure_made = 1; 8610 KMOD_TCPSTAT_INC(tcps_rttupdated); 8611 if (tp->t_rttupdated < UCHAR_MAX) 8612 tp->t_rttupdated++; 8613 #ifdef STATS 8614 if (rack_stats_gets_ms_rtt == 0) { 8615 /* Send in the microsecond rtt used for rxt timeout purposes */ 8616 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt)); 8617 } else if (rack_stats_gets_ms_rtt == 1) { 8618 /* Send in the millisecond rtt used for rxt timeout purposes */ 8619 int32_t ms_rtt; 8620 8621 /* Round up */ 8622 ms_rtt = (rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC; 8623 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt)); 8624 } else if (rack_stats_gets_ms_rtt == 2) { 8625 /* Send in the millisecond rtt has close to the path RTT as we can get */ 8626 int32_t ms_rtt; 8627 8628 /* Round up */ 8629 ms_rtt = (rack->r_ctl.rack_rs.rs_us_rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC; 8630 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt)); 8631 } else { 8632 /* Send in the microsecond rtt has close to the path RTT as we can get */ 8633 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt)); 8634 } 8635 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_PATHRTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt)); 8636 #endif 8637 rack->r_ctl.last_rcv_tstmp_for_rtt = tcp_tv_to_msec(&rack->r_ctl.act_rcv_time); 8638 /* 8639 * the retransmit should happen at rtt + 4 * rttvar. Because of the 8640 * way we do the smoothing, srtt and rttvar will each average +1/2 8641 * tick of bias. When we compute the retransmit timer, we want 1/2 8642 * tick of rounding and 1 extra tick because of +-1/2 tick 8643 * uncertainty in the firing of the timer. The bias will give us 8644 * exactly the 1.5 tick we need. But, because the bias is 8645 * statistical, we have to test that we don't drop below the minimum 8646 * feasible timer (which is 2 ticks). 8647 */ 8648 tp->t_rxtshift = 0; 8649 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 8650 max(rack_rto_min, rtt + 2), rack_rto_max, rack->r_ctl.timer_slop); 8651 rack_log_rtt_sample(rack, rtt); 8652 tp->t_softerror = 0; 8653 } 8654 8655 8656 static void 8657 rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts) 8658 { 8659 /* 8660 * Apply to filter the inbound us-rtt at us_cts. 8661 */ 8662 uint32_t old_rtt; 8663 8664 old_rtt = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 8665 apply_filter_min_small(&rack->r_ctl.rc_gp_min_rtt, 8666 us_rtt, us_cts); 8667 if (old_rtt > us_rtt) { 8668 /* We just hit a new lower rtt time */ 8669 rack_log_rtt_shrinks(rack, us_cts, old_rtt, 8670 __LINE__, RACK_RTTS_NEWRTT); 8671 /* 8672 * Only count it if its lower than what we saw within our 8673 * calculated range. 8674 */ 8675 if ((old_rtt - us_rtt) > rack_min_rtt_movement) { 8676 if (rack_probertt_lower_within && 8677 rack->rc_gp_dyn_mul && 8678 (rack->use_fixed_rate == 0) && 8679 (rack->rc_always_pace)) { 8680 /* 8681 * We are seeing a new lower rtt very close 8682 * to the time that we would have entered probe-rtt. 8683 * This is probably due to the fact that a peer flow 8684 * has entered probe-rtt. Lets go in now too. 8685 */ 8686 uint32_t val; 8687 8688 val = rack_probertt_lower_within * rack_time_between_probertt; 8689 val /= 100; 8690 if ((rack->in_probe_rtt == 0) && 8691 (rack->rc_skip_timely == 0) && 8692 ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) { 8693 rack_enter_probertt(rack, us_cts); 8694 } 8695 } 8696 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 8697 } 8698 } 8699 } 8700 8701 static int 8702 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 8703 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack) 8704 { 8705 uint32_t us_rtt; 8706 int32_t i, all; 8707 uint32_t t, len_acked; 8708 8709 if ((rsm->r_flags & RACK_ACKED) || 8710 (rsm->r_flags & RACK_WAS_ACKED)) 8711 /* Already done */ 8712 return (0); 8713 if (rsm->r_no_rtt_allowed) { 8714 /* Not allowed */ 8715 return (0); 8716 } 8717 if (ack_type == CUM_ACKED) { 8718 if (SEQ_GT(th_ack, rsm->r_end)) { 8719 len_acked = rsm->r_end - rsm->r_start; 8720 all = 1; 8721 } else { 8722 len_acked = th_ack - rsm->r_start; 8723 all = 0; 8724 } 8725 } else { 8726 len_acked = rsm->r_end - rsm->r_start; 8727 all = 0; 8728 } 8729 if (rsm->r_rtr_cnt == 1) { 8730 8731 t = cts - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 8732 if ((int)t <= 0) 8733 t = 1; 8734 if (!tp->t_rttlow || tp->t_rttlow > t) 8735 tp->t_rttlow = t; 8736 if (!rack->r_ctl.rc_rack_min_rtt || 8737 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 8738 rack->r_ctl.rc_rack_min_rtt = t; 8739 if (rack->r_ctl.rc_rack_min_rtt == 0) { 8740 rack->r_ctl.rc_rack_min_rtt = 1; 8741 } 8742 } 8743 if (TSTMP_GT(tcp_tv_to_usec(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])) 8744 us_rtt = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 8745 else 8746 us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 8747 if (us_rtt == 0) 8748 us_rtt = 1; 8749 if (CC_ALGO(tp)->rttsample != NULL) { 8750 /* Kick the RTT to the CC */ 8751 CC_ALGO(tp)->rttsample(&tp->t_ccv, us_rtt, 1, rsm->r_fas); 8752 } 8753 rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usec(&rack->r_ctl.act_rcv_time)); 8754 if (ack_type == SACKED) { 8755 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 1); 8756 tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 2 , rsm, rsm->r_rtr_cnt); 8757 } else { 8758 /* 8759 * We need to setup what our confidence 8760 * is in this ack. 8761 * 8762 * If the rsm was app limited and it is 8763 * less than a mss in length (the end 8764 * of the send) then we have a gap. If we 8765 * were app limited but say we were sending 8766 * multiple MSS's then we are more confident 8767 * int it. 8768 * 8769 * When we are not app-limited then we see if 8770 * the rsm is being included in the current 8771 * measurement, we tell this by the app_limited_needs_set 8772 * flag. 8773 * 8774 * Note that being cwnd blocked is not applimited 8775 * as well as the pacing delay between packets which 8776 * are sending only 1 or 2 MSS's also will show up 8777 * in the RTT. We probably need to examine this algorithm 8778 * a bit more and enhance it to account for the delay 8779 * between rsm's. We could do that by saving off the 8780 * pacing delay of each rsm (in an rsm) and then 8781 * factoring that in somehow though for now I am 8782 * not sure how :) 8783 */ 8784 int calc_conf = 0; 8785 8786 if (rsm->r_flags & RACK_APP_LIMITED) { 8787 if (all && (len_acked <= ctf_fixed_maxseg(tp))) 8788 calc_conf = 0; 8789 else 8790 calc_conf = 1; 8791 } else if (rack->app_limited_needs_set == 0) { 8792 calc_conf = 1; 8793 } else { 8794 calc_conf = 0; 8795 } 8796 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 2); 8797 tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 8798 calc_conf, rsm, rsm->r_rtr_cnt); 8799 } 8800 if ((rsm->r_flags & RACK_TLP) && 8801 (!IN_FASTRECOVERY(tp->t_flags))) { 8802 /* Segment was a TLP and our retrans matched */ 8803 if (rack->r_ctl.rc_tlp_cwnd_reduce) { 8804 rack_cong_signal(tp, CC_NDUPACK, th_ack, __LINE__); 8805 } 8806 } 8807 if ((rack->r_ctl.rc_rack_tmit_time == 0) || 8808 (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 8809 (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]))) { 8810 /* New more recent rack_tmit_time */ 8811 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 8812 if (rack->r_ctl.rc_rack_tmit_time == 0) 8813 rack->r_ctl.rc_rack_tmit_time = 1; 8814 rack->rc_rack_rtt = t; 8815 } 8816 return (1); 8817 } 8818 /* 8819 * We clear the soft/rxtshift since we got an ack. 8820 * There is no assurance we will call the commit() function 8821 * so we need to clear these to avoid incorrect handling. 8822 */ 8823 tp->t_rxtshift = 0; 8824 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 8825 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 8826 tp->t_softerror = 0; 8827 if (to && (to->to_flags & TOF_TS) && 8828 (ack_type == CUM_ACKED) && 8829 (to->to_tsecr) && 8830 ((rsm->r_flags & RACK_OVERMAX) == 0)) { 8831 /* 8832 * Now which timestamp does it match? In this block the ACK 8833 * must be coming from a previous transmission. 8834 */ 8835 for (i = 0; i < rsm->r_rtr_cnt; i++) { 8836 if (rack_ts_to_msec(rsm->r_tim_lastsent[i]) == to->to_tsecr) { 8837 t = cts - (uint32_t)rsm->r_tim_lastsent[i]; 8838 if ((int)t <= 0) 8839 t = 1; 8840 if (CC_ALGO(tp)->rttsample != NULL) { 8841 /* 8842 * Kick the RTT to the CC, here 8843 * we lie a bit in that we know the 8844 * retransmission is correct even though 8845 * we retransmitted. This is because 8846 * we match the timestamps. 8847 */ 8848 if (TSTMP_GT(tcp_tv_to_usec(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[i])) 8849 us_rtt = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[i]; 8850 else 8851 us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[i]; 8852 CC_ALGO(tp)->rttsample(&tp->t_ccv, us_rtt, 1, rsm->r_fas); 8853 } 8854 if ((i + 1) < rsm->r_rtr_cnt) { 8855 /* 8856 * The peer ack'd from our previous 8857 * transmission. We have a spurious 8858 * retransmission and thus we dont 8859 * want to update our rack_rtt. 8860 * 8861 * Hmm should there be a CC revert here? 8862 * 8863 */ 8864 return (0); 8865 } 8866 if (!tp->t_rttlow || tp->t_rttlow > t) 8867 tp->t_rttlow = t; 8868 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 8869 rack->r_ctl.rc_rack_min_rtt = t; 8870 if (rack->r_ctl.rc_rack_min_rtt == 0) { 8871 rack->r_ctl.rc_rack_min_rtt = 1; 8872 } 8873 } 8874 if ((rack->r_ctl.rc_rack_tmit_time == 0) || 8875 (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 8876 (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]))) { 8877 /* New more recent rack_tmit_time */ 8878 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 8879 if (rack->r_ctl.rc_rack_tmit_time == 0) 8880 rack->r_ctl.rc_rack_tmit_time = 1; 8881 rack->rc_rack_rtt = t; 8882 } 8883 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[i], cts, 3); 8884 tcp_rack_xmit_timer(rack, t + 1, len_acked, t, 0, rsm, 8885 rsm->r_rtr_cnt); 8886 return (1); 8887 } 8888 } 8889 /* If we are logging log out the sendmap */ 8890 if (tcp_bblogging_on(rack->rc_tp)) { 8891 for (i = 0; i < rsm->r_rtr_cnt; i++) { 8892 rack_log_rtt_sendmap(rack, i, rsm->r_tim_lastsent[i], to->to_tsecr); 8893 } 8894 } 8895 goto ts_not_found; 8896 } else { 8897 /* 8898 * Ok its a SACK block that we retransmitted. or a windows 8899 * machine without timestamps. We can tell nothing from the 8900 * time-stamp since its not there or the time the peer last 8901 * received a segment that moved forward its cum-ack point. 8902 */ 8903 ts_not_found: 8904 i = rsm->r_rtr_cnt - 1; 8905 t = cts - (uint32_t)rsm->r_tim_lastsent[i]; 8906 if ((int)t <= 0) 8907 t = 1; 8908 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 8909 /* 8910 * We retransmitted and the ack came back in less 8911 * than the smallest rtt we have observed. We most 8912 * likely did an improper retransmit as outlined in 8913 * 6.2 Step 2 point 2 in the rack-draft so we 8914 * don't want to update our rack_rtt. We in 8915 * theory (in future) might want to think about reverting our 8916 * cwnd state but we won't for now. 8917 */ 8918 return (0); 8919 } else if (rack->r_ctl.rc_rack_min_rtt) { 8920 /* 8921 * We retransmitted it and the retransmit did the 8922 * job. 8923 */ 8924 if (!rack->r_ctl.rc_rack_min_rtt || 8925 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 8926 rack->r_ctl.rc_rack_min_rtt = t; 8927 if (rack->r_ctl.rc_rack_min_rtt == 0) { 8928 rack->r_ctl.rc_rack_min_rtt = 1; 8929 } 8930 } 8931 if ((rack->r_ctl.rc_rack_tmit_time == 0) || 8932 (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 8933 (uint32_t)rsm->r_tim_lastsent[i]))) { 8934 /* New more recent rack_tmit_time */ 8935 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[i]; 8936 if (rack->r_ctl.rc_rack_tmit_time == 0) 8937 rack->r_ctl.rc_rack_tmit_time = 1; 8938 rack->rc_rack_rtt = t; 8939 } 8940 return (1); 8941 } 8942 } 8943 return (0); 8944 } 8945 8946 /* 8947 * Mark the SACK_PASSED flag on all entries prior to rsm send wise. 8948 */ 8949 static void 8950 rack_log_sack_passed(struct tcpcb *tp, 8951 struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t cts) 8952 { 8953 struct rack_sendmap *nrsm; 8954 uint32_t thresh; 8955 8956 /* Get our rxt threshold for lost consideration */ 8957 thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(tp, rack), cts, __LINE__, 0); 8958 /* Now start looking at rsm's */ 8959 nrsm = rsm; 8960 TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap, 8961 rack_head, r_tnext) { 8962 if (nrsm == rsm) { 8963 /* Skip original segment he is acked */ 8964 continue; 8965 } 8966 if (nrsm->r_flags & RACK_ACKED) { 8967 /* 8968 * Skip ack'd segments, though we 8969 * should not see these, since tmap 8970 * should not have ack'd segments. 8971 */ 8972 continue; 8973 } 8974 if (nrsm->r_flags & RACK_RWND_COLLAPSED) { 8975 /* 8976 * If the peer dropped the rwnd on 8977 * these then we don't worry about them. 8978 */ 8979 continue; 8980 } 8981 /* Check lost state */ 8982 if ((nrsm->r_flags & RACK_WAS_LOST) == 0) { 8983 uint32_t exp; 8984 8985 exp = ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]) + thresh; 8986 if (TSTMP_LT(exp, cts) || (exp == cts)) { 8987 /* We consider it lost */ 8988 nrsm->r_flags |= RACK_WAS_LOST; 8989 rack->r_ctl.rc_considered_lost += nrsm->r_end - nrsm->r_start; 8990 } 8991 } 8992 if (nrsm->r_flags & RACK_SACK_PASSED) { 8993 /* 8994 * We found one that is already marked 8995 * passed, we have been here before and 8996 * so all others below this are marked. 8997 */ 8998 break; 8999 } 9000 nrsm->r_flags |= RACK_SACK_PASSED; 9001 nrsm->r_flags &= ~RACK_WAS_SACKPASS; 9002 } 9003 } 9004 9005 static void 9006 rack_need_set_test(struct tcpcb *tp, 9007 struct tcp_rack *rack, 9008 struct rack_sendmap *rsm, 9009 tcp_seq th_ack, 9010 int line, 9011 int use_which) 9012 { 9013 struct rack_sendmap *s_rsm; 9014 9015 if ((tp->t_flags & TF_GPUTINPROG) && 9016 SEQ_GEQ(rsm->r_end, tp->gput_seq)) { 9017 /* 9018 * We were app limited, and this ack 9019 * butts up or goes beyond the point where we want 9020 * to start our next measurement. We need 9021 * to record the new gput_ts as here and 9022 * possibly update the start sequence. 9023 */ 9024 uint32_t seq, ts; 9025 9026 if (rsm->r_rtr_cnt > 1) { 9027 /* 9028 * This is a retransmit, can we 9029 * really make any assessment at this 9030 * point? We are not really sure of 9031 * the timestamp, is it this or the 9032 * previous transmission? 9033 * 9034 * Lets wait for something better that 9035 * is not retransmitted. 9036 */ 9037 return; 9038 } 9039 seq = tp->gput_seq; 9040 ts = tp->gput_ts; 9041 rack->app_limited_needs_set = 0; 9042 tp->gput_ts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time); 9043 /* Do we start at a new end? */ 9044 if ((use_which == RACK_USE_BEG) && 9045 SEQ_GEQ(rsm->r_start, tp->gput_seq)) { 9046 /* 9047 * When we get an ACK that just eats 9048 * up some of the rsm, we set RACK_USE_BEG 9049 * since whats at r_start (i.e. th_ack) 9050 * is left unacked and thats where the 9051 * measurement now starts. 9052 */ 9053 tp->gput_seq = rsm->r_start; 9054 } 9055 if ((use_which == RACK_USE_END) && 9056 SEQ_GEQ(rsm->r_end, tp->gput_seq)) { 9057 /* 9058 * We use the end when the cumack 9059 * is moving forward and completely 9060 * deleting the rsm passed so basically 9061 * r_end holds th_ack. 9062 * 9063 * For SACK's we also want to use the end 9064 * since this piece just got sacked and 9065 * we want to target anything after that 9066 * in our measurement. 9067 */ 9068 tp->gput_seq = rsm->r_end; 9069 } 9070 if (use_which == RACK_USE_END_OR_THACK) { 9071 /* 9072 * special case for ack moving forward, 9073 * not a sack, we need to move all the 9074 * way up to where this ack cum-ack moves 9075 * to. 9076 */ 9077 if (SEQ_GT(th_ack, rsm->r_end)) 9078 tp->gput_seq = th_ack; 9079 else 9080 tp->gput_seq = rsm->r_end; 9081 } 9082 if (SEQ_LT(tp->gput_seq, tp->snd_max)) 9083 s_rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq); 9084 else 9085 s_rsm = NULL; 9086 /* 9087 * Pick up the correct send time if we can the rsm passed in 9088 * may be equal to s_rsm if the RACK_USE_BEG was set. For the other 9089 * two cases (RACK_USE_THACK or RACK_USE_END) most likely we will 9090 * find a different seq i.e. the next send up. 9091 * 9092 * If that has not been sent, s_rsm will be NULL and we must 9093 * arrange it so this function will get called again by setting 9094 * app_limited_needs_set. 9095 */ 9096 if (s_rsm) 9097 rack->r_ctl.rc_gp_output_ts = s_rsm->r_tim_lastsent[0]; 9098 else { 9099 /* If we hit here we have to have *not* sent tp->gput_seq */ 9100 rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[0]; 9101 /* Set it up so we will go through here again */ 9102 rack->app_limited_needs_set = 1; 9103 } 9104 if (SEQ_GT(tp->gput_seq, tp->gput_ack)) { 9105 /* 9106 * We moved beyond this guy's range, re-calculate 9107 * the new end point. 9108 */ 9109 if (rack->rc_gp_filled == 0) { 9110 tp->gput_ack = tp->gput_seq + max(rc_init_window(rack), (MIN_GP_WIN * ctf_fixed_maxseg(tp))); 9111 } else { 9112 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 9113 } 9114 } 9115 /* 9116 * We are moving the goal post, we may be able to clear the 9117 * measure_saw_probe_rtt flag. 9118 */ 9119 if ((rack->in_probe_rtt == 0) && 9120 (rack->measure_saw_probe_rtt) && 9121 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 9122 rack->measure_saw_probe_rtt = 0; 9123 rack_log_pacing_delay_calc(rack, ts, tp->gput_ts, 9124 seq, tp->gput_seq, 9125 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | 9126 (uint64_t)rack->r_ctl.rc_gp_output_ts), 9127 5, line, NULL, 0); 9128 if (rack->rc_gp_filled && 9129 ((tp->gput_ack - tp->gput_seq) < 9130 max(rc_init_window(rack), (MIN_GP_WIN * 9131 ctf_fixed_maxseg(tp))))) { 9132 uint32_t ideal_amount; 9133 9134 ideal_amount = rack_get_measure_window(tp, rack); 9135 if (ideal_amount > sbavail(&tptosocket(tp)->so_snd)) { 9136 /* 9137 * There is no sense of continuing this measurement 9138 * because its too small to gain us anything we 9139 * trust. Skip it and that way we can start a new 9140 * measurement quicker. 9141 */ 9142 tp->t_flags &= ~TF_GPUTINPROG; 9143 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq, 9144 0, 0, 9145 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | 9146 (uint64_t)rack->r_ctl.rc_gp_output_ts), 9147 6, __LINE__, NULL, 0); 9148 } else { 9149 /* 9150 * Reset the window further out. 9151 */ 9152 tp->gput_ack = tp->gput_seq + ideal_amount; 9153 } 9154 } 9155 rack_tend_gp_marks(tp, rack); 9156 rack_log_gpset(rack, tp->gput_ack, 0, 0, line, 2, rsm); 9157 } 9158 } 9159 9160 static inline int 9161 is_rsm_inside_declared_tlp_block(struct tcp_rack *rack, struct rack_sendmap *rsm) 9162 { 9163 if (SEQ_LT(rsm->r_end, rack->r_ctl.last_tlp_acked_start)) { 9164 /* Behind our TLP definition or right at */ 9165 return (0); 9166 } 9167 if (SEQ_GT(rsm->r_start, rack->r_ctl.last_tlp_acked_end)) { 9168 /* The start is beyond or right at our end of TLP definition */ 9169 return (0); 9170 } 9171 /* It has to be a sub-part of the original TLP recorded */ 9172 return (1); 9173 } 9174 9175 static uint32_t 9176 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, 9177 struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, 9178 uint32_t segsiz) 9179 { 9180 uint32_t start, end, changed = 0; 9181 struct rack_sendmap stack_map; 9182 struct rack_sendmap *rsm, *nrsm, *prev, *next; 9183 int insret __diagused; 9184 int32_t used_ref = 1; 9185 int can_use_hookery = 0; 9186 9187 start = sack->start; 9188 end = sack->end; 9189 rsm = *prsm; 9190 9191 do_rest_ofb: 9192 if ((rsm == NULL) || 9193 (SEQ_LT(end, rsm->r_start)) || 9194 (SEQ_GEQ(start, rsm->r_end)) || 9195 (SEQ_LT(start, rsm->r_start))) { 9196 /* 9197 * We are not in the right spot, 9198 * find the correct spot in the tree. 9199 */ 9200 used_ref = 0; 9201 rsm = tqhash_find(rack->r_ctl.tqh, start); 9202 } 9203 if (rsm == NULL) { 9204 /* TSNH */ 9205 goto out; 9206 } 9207 /* Ok we have an ACK for some piece of this rsm */ 9208 if (rsm->r_start != start) { 9209 if ((rsm->r_flags & RACK_ACKED) == 0) { 9210 /* 9211 * Before any splitting or hookery is 9212 * done is it a TLP of interest i.e. rxt? 9213 */ 9214 if ((rsm->r_flags & RACK_TLP) && 9215 (rsm->r_rtr_cnt > 1)) { 9216 /* 9217 * We are splitting a rxt TLP, check 9218 * if we need to save off the start/end 9219 */ 9220 if (rack->rc_last_tlp_acked_set && 9221 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 9222 /* 9223 * We already turned this on since we are inside 9224 * the previous one was a partially sack now we 9225 * are getting another one (maybe all of it). 9226 * 9227 */ 9228 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 9229 /* 9230 * Lets make sure we have all of it though. 9231 */ 9232 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 9233 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 9234 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 9235 rack->r_ctl.last_tlp_acked_end); 9236 } 9237 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 9238 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 9239 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 9240 rack->r_ctl.last_tlp_acked_end); 9241 } 9242 } else { 9243 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 9244 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 9245 rack->rc_last_tlp_past_cumack = 0; 9246 rack->rc_last_tlp_acked_set = 1; 9247 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 9248 } 9249 } 9250 /** 9251 * Need to split this in two pieces the before and after, 9252 * the before remains in the map, the after must be 9253 * added. In other words we have: 9254 * rsm |--------------| 9255 * sackblk |-------> 9256 * rsm will become 9257 * rsm |---| 9258 * and nrsm will be the sacked piece 9259 * nrsm |----------| 9260 * 9261 * But before we start down that path lets 9262 * see if the sack spans over on top of 9263 * the next guy and it is already sacked. 9264 * 9265 */ 9266 /* 9267 * Hookery can only be used if the two entries 9268 * are in the same bucket and neither one of 9269 * them staddle the bucket line. 9270 */ 9271 next = tqhash_next(rack->r_ctl.tqh, rsm); 9272 if (next && 9273 (rsm->bindex == next->bindex) && 9274 ((rsm->r_flags & RACK_STRADDLE) == 0) && 9275 ((next->r_flags & RACK_STRADDLE) == 0) && 9276 ((rsm->r_flags & RACK_IS_PCM) == 0) && 9277 ((next->r_flags & RACK_IS_PCM) == 0) && 9278 (rsm->r_flags & RACK_IN_GP_WIN) && 9279 (next->r_flags & RACK_IN_GP_WIN)) 9280 can_use_hookery = 1; 9281 else 9282 can_use_hookery = 0; 9283 if (next && can_use_hookery && 9284 (next->r_flags & RACK_ACKED) && 9285 SEQ_GEQ(end, next->r_start)) { 9286 /** 9287 * So the next one is already acked, and 9288 * we can thus by hookery use our stack_map 9289 * to reflect the piece being sacked and 9290 * then adjust the two tree entries moving 9291 * the start and ends around. So we start like: 9292 * rsm |------------| (not-acked) 9293 * next |-----------| (acked) 9294 * sackblk |--------> 9295 * We want to end like so: 9296 * rsm |------| (not-acked) 9297 * next |-----------------| (acked) 9298 * nrsm |-----| 9299 * Where nrsm is a temporary stack piece we 9300 * use to update all the gizmos. 9301 */ 9302 /* Copy up our fudge block */ 9303 nrsm = &stack_map; 9304 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 9305 /* Now adjust our tree blocks */ 9306 tqhash_update_end(rack->r_ctl.tqh, rsm, start); 9307 next->r_start = start; 9308 rsm->r_flags |= RACK_SHUFFLED; 9309 next->r_flags |= RACK_SHUFFLED; 9310 /* Now we must adjust back where next->m is */ 9311 rack_setup_offset_for_rsm(rack, rsm, next); 9312 /* 9313 * Which timestamp do we keep? It is rather 9314 * important in GP measurements to have the 9315 * accurate end of the send window. 9316 * 9317 * We keep the largest value, which is the newest 9318 * send. We do this in case a segment that is 9319 * joined together and not part of a GP estimate 9320 * later gets expanded into the GP estimate. 9321 * 9322 * We prohibit the merging of unlike kinds i.e. 9323 * all pieces that are in the GP estimate can be 9324 * merged and all pieces that are not in a GP estimate 9325 * can be merged, but not disimilar pieces. Combine 9326 * this with taking the highest here and we should 9327 * be ok unless of course the client reneges. Then 9328 * all bets are off. 9329 */ 9330 if (next->r_tim_lastsent[(next->r_rtr_cnt-1)] < 9331 nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)]) 9332 next->r_tim_lastsent[(next->r_rtr_cnt-1)] = nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)]; 9333 /* 9334 * And we must keep the newest ack arrival time. 9335 */ 9336 if (next->r_ack_arrival < 9337 rack_to_usec_ts(&rack->r_ctl.act_rcv_time)) 9338 next->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 9339 9340 9341 /* We don't need to adjust rsm, it did not change */ 9342 /* Clear out the dup ack count of the remainder */ 9343 rsm->r_dupack = 0; 9344 rsm->r_just_ret = 0; 9345 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 9346 /* Now lets make sure our fudge block is right */ 9347 nrsm->r_start = start; 9348 /* Now lets update all the stats and such */ 9349 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); 9350 if (rack->app_limited_needs_set) 9351 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); 9352 changed += (nrsm->r_end - nrsm->r_start); 9353 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 9354 if (rsm->r_flags & RACK_WAS_LOST) { 9355 int my_chg; 9356 9357 /* 9358 * Note here we do not use our rack_mark_nolonger_lost() function 9359 * since we are moving our data pointer around and the 9360 * ack'ed side is already not considered lost. 9361 */ 9362 my_chg = (nrsm->r_end - nrsm->r_start); 9363 KASSERT((rack->r_ctl.rc_considered_lost >= my_chg), 9364 ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); 9365 if (my_chg <= rack->r_ctl.rc_considered_lost) 9366 rack->r_ctl.rc_considered_lost -= my_chg; 9367 else 9368 rack->r_ctl.rc_considered_lost = 0; 9369 } 9370 if (nrsm->r_flags & RACK_SACK_PASSED) { 9371 rack->r_ctl.rc_reorder_ts = cts; 9372 if (rack->r_ctl.rc_reorder_ts == 0) 9373 rack->r_ctl.rc_reorder_ts = 1; 9374 } 9375 /* 9376 * Now we want to go up from rsm (the 9377 * one left un-acked) to the next one 9378 * in the tmap. We do this so when 9379 * we walk backwards we include marking 9380 * sack-passed on rsm (The one passed in 9381 * is skipped since it is generally called 9382 * on something sacked before removing it 9383 * from the tmap). 9384 */ 9385 if (rsm->r_in_tmap) { 9386 nrsm = TAILQ_NEXT(rsm, r_tnext); 9387 /* 9388 * Now that we have the next 9389 * one walk backwards from there. 9390 */ 9391 if (nrsm && nrsm->r_in_tmap) 9392 rack_log_sack_passed(tp, rack, nrsm, cts); 9393 } 9394 /* Now are we done? */ 9395 if (SEQ_LT(end, next->r_end) || 9396 (end == next->r_end)) { 9397 /* Done with block */ 9398 goto out; 9399 } 9400 rack_log_map_chg(tp, rack, &stack_map, rsm, next, MAP_SACK_M1, end, __LINE__); 9401 /* Postion for the next block */ 9402 start = next->r_end; 9403 rsm = tqhash_next(rack->r_ctl.tqh, next); 9404 if (rsm == NULL) 9405 goto out; 9406 } else { 9407 /** 9408 * We can't use any hookery here, so we 9409 * need to split the map. We enter like 9410 * so: 9411 * rsm |--------| 9412 * sackblk |-----> 9413 * We will add the new block nrsm and 9414 * that will be the new portion, and then 9415 * fall through after reseting rsm. So we 9416 * split and look like this: 9417 * rsm |----| 9418 * sackblk |-----> 9419 * nrsm |---| 9420 * We then fall through reseting 9421 * rsm to nrsm, so the next block 9422 * picks it up. 9423 */ 9424 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 9425 if (nrsm == NULL) { 9426 /* 9427 * failed XXXrrs what can we do but loose the sack 9428 * info? 9429 */ 9430 goto out; 9431 } 9432 rack_clone_rsm(rack, nrsm, rsm, start); 9433 rsm->r_just_ret = 0; 9434 #ifndef INVARIANTS 9435 (void)tqhash_insert(rack->r_ctl.tqh, nrsm); 9436 #else 9437 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { 9438 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p", 9439 nrsm, insret, rack, rsm); 9440 } 9441 #endif 9442 if (rsm->r_in_tmap) { 9443 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 9444 nrsm->r_in_tmap = 1; 9445 } 9446 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M2, end, __LINE__); 9447 rsm->r_flags &= (~RACK_HAS_FIN); 9448 /* Position us to point to the new nrsm that starts the sack blk */ 9449 rsm = nrsm; 9450 } 9451 } else { 9452 /* Already sacked this piece */ 9453 if (end == rsm->r_end) { 9454 /* Done with block */ 9455 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 9456 goto out; 9457 } else if (SEQ_LT(end, rsm->r_end)) { 9458 /* A partial sack to a already sacked block */ 9459 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 9460 goto out; 9461 } else { 9462 /* 9463 * The end goes beyond this guy 9464 * reposition the start to the 9465 * next block. 9466 */ 9467 start = rsm->r_end; 9468 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 9469 if (rsm == NULL) 9470 goto out; 9471 } 9472 } 9473 } 9474 if (SEQ_GEQ(end, rsm->r_end)) { 9475 /** 9476 * The end of this block is either beyond this guy or right 9477 * at this guy. I.e.: 9478 * rsm --- |-----| 9479 * end |-----| 9480 * <or> 9481 * end |---------| 9482 */ 9483 if ((rsm->r_flags & RACK_ACKED) == 0) { 9484 /* 9485 * Is it a TLP of interest? 9486 */ 9487 if ((rsm->r_flags & RACK_TLP) && 9488 (rsm->r_rtr_cnt > 1)) { 9489 /* 9490 * We are splitting a rxt TLP, check 9491 * if we need to save off the start/end 9492 */ 9493 if (rack->rc_last_tlp_acked_set && 9494 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 9495 /* 9496 * We already turned this on since we are inside 9497 * the previous one was a partially sack now we 9498 * are getting another one (maybe all of it). 9499 */ 9500 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 9501 /* 9502 * Lets make sure we have all of it though. 9503 */ 9504 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 9505 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 9506 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 9507 rack->r_ctl.last_tlp_acked_end); 9508 } 9509 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 9510 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 9511 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 9512 rack->r_ctl.last_tlp_acked_end); 9513 } 9514 } else { 9515 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 9516 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 9517 rack->rc_last_tlp_past_cumack = 0; 9518 rack->rc_last_tlp_acked_set = 1; 9519 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 9520 } 9521 } 9522 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); 9523 changed += (rsm->r_end - rsm->r_start); 9524 /* You get a count for acking a whole segment or more */ 9525 if (rsm->r_flags & RACK_WAS_LOST) { 9526 /* 9527 * Here we can use the inline function since 9528 * the rsm is truly marked lost and now no longer lost. 9529 */ 9530 rack_mark_nolonger_lost(rack, rsm); 9531 } 9532 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 9533 if (rsm->r_in_tmap) /* should be true */ 9534 rack_log_sack_passed(tp, rack, rsm, cts); 9535 /* Is Reordering occuring? */ 9536 if (rsm->r_flags & RACK_SACK_PASSED) { 9537 rsm->r_flags &= ~RACK_SACK_PASSED; 9538 rack->r_ctl.rc_reorder_ts = cts; 9539 if (rack->r_ctl.rc_reorder_ts == 0) 9540 rack->r_ctl.rc_reorder_ts = 1; 9541 } 9542 if (rack->app_limited_needs_set) 9543 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); 9544 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 9545 rsm->r_flags |= RACK_ACKED; 9546 rack_update_pcm_ack(rack, 0, rsm->r_start, rsm->r_end); 9547 if (rsm->r_in_tmap) { 9548 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 9549 rsm->r_in_tmap = 0; 9550 } 9551 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_SACK_M3, end, __LINE__); 9552 } 9553 if (end == rsm->r_end) { 9554 /* This block only - done, setup for next */ 9555 goto out; 9556 } 9557 /* 9558 * There is more not coverend by this rsm move on 9559 * to the next block in the tail queue hash table. 9560 */ 9561 nrsm = tqhash_next(rack->r_ctl.tqh, rsm); 9562 start = rsm->r_end; 9563 rsm = nrsm; 9564 if (rsm == NULL) 9565 goto out; 9566 goto do_rest_ofb; 9567 } 9568 /** 9569 * The end of this sack block is smaller than 9570 * our rsm i.e.: 9571 * rsm --- |-----| 9572 * end |--| 9573 */ 9574 if ((rsm->r_flags & RACK_ACKED) == 0) { 9575 /* 9576 * Is it a TLP of interest? 9577 */ 9578 if ((rsm->r_flags & RACK_TLP) && 9579 (rsm->r_rtr_cnt > 1)) { 9580 /* 9581 * We are splitting a rxt TLP, check 9582 * if we need to save off the start/end 9583 */ 9584 if (rack->rc_last_tlp_acked_set && 9585 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 9586 /* 9587 * We already turned this on since we are inside 9588 * the previous one was a partially sack now we 9589 * are getting another one (maybe all of it). 9590 */ 9591 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 9592 /* 9593 * Lets make sure we have all of it though. 9594 */ 9595 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 9596 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 9597 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 9598 rack->r_ctl.last_tlp_acked_end); 9599 } 9600 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 9601 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 9602 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 9603 rack->r_ctl.last_tlp_acked_end); 9604 } 9605 } else { 9606 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 9607 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 9608 rack->rc_last_tlp_past_cumack = 0; 9609 rack->rc_last_tlp_acked_set = 1; 9610 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 9611 } 9612 } 9613 /* 9614 * Hookery can only be used if the two entries 9615 * are in the same bucket and neither one of 9616 * them staddle the bucket line. 9617 */ 9618 prev = tqhash_prev(rack->r_ctl.tqh, rsm); 9619 if (prev && 9620 (rsm->bindex == prev->bindex) && 9621 ((rsm->r_flags & RACK_STRADDLE) == 0) && 9622 ((prev->r_flags & RACK_STRADDLE) == 0) && 9623 ((rsm->r_flags & RACK_IS_PCM) == 0) && 9624 ((prev->r_flags & RACK_IS_PCM) == 0) && 9625 (rsm->r_flags & RACK_IN_GP_WIN) && 9626 (prev->r_flags & RACK_IN_GP_WIN)) 9627 can_use_hookery = 1; 9628 else 9629 can_use_hookery = 0; 9630 if (prev && can_use_hookery && 9631 (prev->r_flags & RACK_ACKED)) { 9632 /** 9633 * Goal, we want the right remainder of rsm to shrink 9634 * in place and span from (rsm->r_start = end) to rsm->r_end. 9635 * We want to expand prev to go all the way 9636 * to prev->r_end <- end. 9637 * so in the tree we have before: 9638 * prev |--------| (acked) 9639 * rsm |-------| (non-acked) 9640 * sackblk |-| 9641 * We churn it so we end up with 9642 * prev |----------| (acked) 9643 * rsm |-----| (non-acked) 9644 * nrsm |-| (temporary) 9645 * 9646 * Note if either prev/rsm is a TLP we don't 9647 * do this. 9648 */ 9649 nrsm = &stack_map; 9650 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 9651 tqhash_update_end(rack->r_ctl.tqh, prev, end); 9652 rsm->r_start = end; 9653 rsm->r_flags |= RACK_SHUFFLED; 9654 prev->r_flags |= RACK_SHUFFLED; 9655 /* Now adjust nrsm (stack copy) to be 9656 * the one that is the small 9657 * piece that was "sacked". 9658 */ 9659 nrsm->r_end = end; 9660 rsm->r_dupack = 0; 9661 /* 9662 * Which timestamp do we keep? It is rather 9663 * important in GP measurements to have the 9664 * accurate end of the send window. 9665 * 9666 * We keep the largest value, which is the newest 9667 * send. We do this in case a segment that is 9668 * joined together and not part of a GP estimate 9669 * later gets expanded into the GP estimate. 9670 * 9671 * We prohibit the merging of unlike kinds i.e. 9672 * all pieces that are in the GP estimate can be 9673 * merged and all pieces that are not in a GP estimate 9674 * can be merged, but not disimilar pieces. Combine 9675 * this with taking the highest here and we should 9676 * be ok unless of course the client reneges. Then 9677 * all bets are off. 9678 */ 9679 if(prev->r_tim_lastsent[(prev->r_rtr_cnt-1)] < 9680 nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)]) { 9681 prev->r_tim_lastsent[(prev->r_rtr_cnt-1)] = nrsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 9682 } 9683 /* 9684 * And we must keep the newest ack arrival time. 9685 */ 9686 9687 if(prev->r_ack_arrival < 9688 rack_to_usec_ts(&rack->r_ctl.act_rcv_time)) 9689 prev->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 9690 9691 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 9692 /* 9693 * Now that the rsm has had its start moved forward 9694 * lets go ahead and get its new place in the world. 9695 */ 9696 rack_setup_offset_for_rsm(rack, prev, rsm); 9697 /* 9698 * Now nrsm is our new little piece 9699 * that is acked (which was merged 9700 * to prev). Update the rtt and changed 9701 * based on that. Also check for reordering. 9702 */ 9703 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); 9704 if (rack->app_limited_needs_set) 9705 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); 9706 changed += (nrsm->r_end - nrsm->r_start); 9707 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 9708 if (rsm->r_flags & RACK_WAS_LOST) { 9709 int my_chg; 9710 9711 /* 9712 * Note here we are using hookery again so we can't 9713 * use our rack_mark_nolonger_lost() function. 9714 */ 9715 my_chg = (nrsm->r_end - nrsm->r_start); 9716 KASSERT((rack->r_ctl.rc_considered_lost >= my_chg), 9717 ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); 9718 if (my_chg <= rack->r_ctl.rc_considered_lost) 9719 rack->r_ctl.rc_considered_lost -= my_chg; 9720 else 9721 rack->r_ctl.rc_considered_lost = 0; 9722 } 9723 if (nrsm->r_flags & RACK_SACK_PASSED) { 9724 rack->r_ctl.rc_reorder_ts = cts; 9725 if (rack->r_ctl.rc_reorder_ts == 0) 9726 rack->r_ctl.rc_reorder_ts = 1; 9727 } 9728 rack_log_map_chg(tp, rack, prev, &stack_map, rsm, MAP_SACK_M4, end, __LINE__); 9729 rsm = prev; 9730 } else { 9731 /** 9732 * This is the case where our previous 9733 * block is not acked either, so we must 9734 * split the block in two. 9735 */ 9736 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 9737 if (nrsm == NULL) { 9738 /* failed rrs what can we do but loose the sack info? */ 9739 goto out; 9740 } 9741 if ((rsm->r_flags & RACK_TLP) && 9742 (rsm->r_rtr_cnt > 1)) { 9743 /* 9744 * We are splitting a rxt TLP, check 9745 * if we need to save off the start/end 9746 */ 9747 if (rack->rc_last_tlp_acked_set && 9748 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 9749 /* 9750 * We already turned this on since this block is inside 9751 * the previous one was a partially sack now we 9752 * are getting another one (maybe all of it). 9753 */ 9754 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 9755 /* 9756 * Lets make sure we have all of it though. 9757 */ 9758 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 9759 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 9760 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 9761 rack->r_ctl.last_tlp_acked_end); 9762 } 9763 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 9764 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 9765 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 9766 rack->r_ctl.last_tlp_acked_end); 9767 } 9768 } else { 9769 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 9770 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 9771 rack->rc_last_tlp_acked_set = 1; 9772 rack->rc_last_tlp_past_cumack = 0; 9773 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 9774 } 9775 } 9776 /** 9777 * In this case nrsm becomes 9778 * nrsm->r_start = end; 9779 * nrsm->r_end = rsm->r_end; 9780 * which is un-acked. 9781 * <and> 9782 * rsm->r_end = nrsm->r_start; 9783 * i.e. the remaining un-acked 9784 * piece is left on the left 9785 * hand side. 9786 * 9787 * So we start like this 9788 * rsm |----------| (not acked) 9789 * sackblk |---| 9790 * build it so we have 9791 * rsm |---| (acked) 9792 * nrsm |------| (not acked) 9793 */ 9794 rack_clone_rsm(rack, nrsm, rsm, end); 9795 rsm->r_flags &= (~RACK_HAS_FIN); 9796 rsm->r_just_ret = 0; 9797 #ifndef INVARIANTS 9798 (void)tqhash_insert(rack->r_ctl.tqh, nrsm); 9799 #else 9800 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { 9801 panic("Insert in tailq_hash of %p fails ret:% rack:%p rsm:%p", 9802 nrsm, insret, rack, rsm); 9803 } 9804 #endif 9805 if (rsm->r_in_tmap) { 9806 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 9807 nrsm->r_in_tmap = 1; 9808 } 9809 nrsm->r_dupack = 0; 9810 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); 9811 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); 9812 changed += (rsm->r_end - rsm->r_start); 9813 if (rsm->r_flags & RACK_WAS_LOST) { 9814 /* 9815 * Here it is safe to use our function. 9816 */ 9817 rack_mark_nolonger_lost(rack, rsm); 9818 } 9819 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 9820 9821 if (rsm->r_in_tmap) /* should be true */ 9822 rack_log_sack_passed(tp, rack, rsm, cts); 9823 /* Is Reordering occuring? */ 9824 if (rsm->r_flags & RACK_SACK_PASSED) { 9825 rsm->r_flags &= ~RACK_SACK_PASSED; 9826 rack->r_ctl.rc_reorder_ts = cts; 9827 if (rack->r_ctl.rc_reorder_ts == 0) 9828 rack->r_ctl.rc_reorder_ts = 1; 9829 } 9830 if (rack->app_limited_needs_set) 9831 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); 9832 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 9833 rsm->r_flags |= RACK_ACKED; 9834 rack_update_pcm_ack(rack, 0, rsm->r_start, rsm->r_end); 9835 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M5, end, __LINE__); 9836 if (rsm->r_in_tmap) { 9837 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 9838 rsm->r_in_tmap = 0; 9839 } 9840 } 9841 } 9842 out: 9843 if (rsm && 9844 ((rsm->r_flags & RACK_TLP) == 0) && 9845 (rsm->r_flags & RACK_ACKED)) { 9846 /* 9847 * Now can we merge where we worked 9848 * with either the previous or 9849 * next block? 9850 */ 9851 next = tqhash_next(rack->r_ctl.tqh, rsm); 9852 while (next) { 9853 if (next->r_flags & RACK_TLP) 9854 break; 9855 /* Only allow merges between ones in or out of GP window */ 9856 if ((next->r_flags & RACK_IN_GP_WIN) && 9857 ((rsm->r_flags & RACK_IN_GP_WIN) == 0)) { 9858 break; 9859 } 9860 if ((rsm->r_flags & RACK_IN_GP_WIN) && 9861 ((next->r_flags & RACK_IN_GP_WIN) == 0)) { 9862 break; 9863 } 9864 if (rsm->bindex != next->bindex) 9865 break; 9866 if (rsm->r_flags & RACK_STRADDLE) 9867 break; 9868 if (rsm->r_flags & RACK_IS_PCM) 9869 break; 9870 if (next->r_flags & RACK_STRADDLE) 9871 break; 9872 if (next->r_flags & RACK_IS_PCM) 9873 break; 9874 if (next->r_flags & RACK_ACKED) { 9875 /* yep this and next can be merged */ 9876 rsm = rack_merge_rsm(rack, rsm, next); 9877 next = tqhash_next(rack->r_ctl.tqh, rsm); 9878 } else 9879 break; 9880 } 9881 /* Now what about the previous? */ 9882 prev = tqhash_prev(rack->r_ctl.tqh, rsm); 9883 while (prev) { 9884 if (prev->r_flags & RACK_TLP) 9885 break; 9886 /* Only allow merges between ones in or out of GP window */ 9887 if ((prev->r_flags & RACK_IN_GP_WIN) && 9888 ((rsm->r_flags & RACK_IN_GP_WIN) == 0)) { 9889 break; 9890 } 9891 if ((rsm->r_flags & RACK_IN_GP_WIN) && 9892 ((prev->r_flags & RACK_IN_GP_WIN) == 0)) { 9893 break; 9894 } 9895 if (rsm->bindex != prev->bindex) 9896 break; 9897 if (rsm->r_flags & RACK_STRADDLE) 9898 break; 9899 if (rsm->r_flags & RACK_IS_PCM) 9900 break; 9901 if (prev->r_flags & RACK_STRADDLE) 9902 break; 9903 if (prev->r_flags & RACK_IS_PCM) 9904 break; 9905 if (prev->r_flags & RACK_ACKED) { 9906 /* yep the previous and this can be merged */ 9907 rsm = rack_merge_rsm(rack, prev, rsm); 9908 prev = tqhash_prev(rack->r_ctl.tqh, rsm); 9909 } else 9910 break; 9911 } 9912 } 9913 if (used_ref == 0) { 9914 counter_u64_add(rack_sack_proc_all, 1); 9915 } else { 9916 counter_u64_add(rack_sack_proc_short, 1); 9917 } 9918 /* Save off the next one for quick reference. */ 9919 nrsm = tqhash_find(rack->r_ctl.tqh, end); 9920 *prsm = rack->r_ctl.rc_sacklast = nrsm; 9921 return (changed); 9922 } 9923 9924 static void inline 9925 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack) 9926 { 9927 struct rack_sendmap *tmap; 9928 9929 tmap = NULL; 9930 while (rsm && (rsm->r_flags & RACK_ACKED)) { 9931 /* Its no longer sacked, mark it so */ 9932 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 9933 #ifdef INVARIANTS 9934 if (rsm->r_in_tmap) { 9935 panic("rack:%p rsm:%p flags:0x%x in tmap?", 9936 rack, rsm, rsm->r_flags); 9937 } 9938 #endif 9939 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS); 9940 /* Rebuild it into our tmap */ 9941 if (tmap == NULL) { 9942 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 9943 tmap = rsm; 9944 } else { 9945 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext); 9946 tmap = rsm; 9947 } 9948 tmap->r_in_tmap = 1; 9949 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 9950 } 9951 /* 9952 * Now lets possibly clear the sack filter so we start 9953 * recognizing sacks that cover this area. 9954 */ 9955 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack); 9956 9957 } 9958 9959 9960 static void inline 9961 rack_rsm_sender_update(struct tcp_rack *rack, struct tcpcb *tp, struct rack_sendmap *rsm, uint8_t from) 9962 { 9963 /* 9964 * We look at advancing the end send time for our GP 9965 * measurement tracking only as the cumulative acknowledgment 9966 * moves forward. You might wonder about this, why not 9967 * at every transmission or retransmission within the 9968 * GP window update the rc_gp_cumack_ts? Well its rather 9969 * nuanced but basically the GP window *may* expand (as 9970 * it does below) or worse and harder to track it may shrink. 9971 * 9972 * This last makes it impossible to track at the time of 9973 * the send, since you may set forward your rc_gp_cumack_ts 9974 * when you send, because that send *is* in your currently 9975 * "guessed" window, but then it shrinks. Now which was 9976 * the send time of the last bytes in the window, by the 9977 * time you ask that question that part of the sendmap 9978 * is freed. So you don't know and you will have too 9979 * long of send window. Instead by updating the time 9980 * marker only when the cumack advances this assures us 9981 * that we will have only the sends in the window of our 9982 * GP measurement. 9983 * 9984 * Another complication from this is the 9985 * merging of sendmap entries. During SACK processing this 9986 * can happen to conserve the sendmap size. That breaks 9987 * everything down in tracking the send window of the GP 9988 * estimate. So to prevent that and keep it working with 9989 * a tiny bit more limited merging, we only allow like 9990 * types to be merged. I.e. if two sends are in the GP window 9991 * then its ok to merge them together. If two sends are not 9992 * in the GP window its ok to merge them together too. Though 9993 * one send in and one send out cannot be merged. We combine 9994 * this with never allowing the shrinking of the GP window when 9995 * we are in recovery so that we can properly calculate the 9996 * sending times. 9997 * 9998 * This all of course seems complicated, because it is.. :) 9999 * 10000 * The cum-ack is being advanced upon the sendmap. 10001 * If we are not doing a GP estimate don't 10002 * proceed. 10003 */ 10004 uint64_t ts; 10005 10006 if ((tp->t_flags & TF_GPUTINPROG) == 0) 10007 return; 10008 /* 10009 * If this sendmap entry is going 10010 * beyond the measurement window we had picked, 10011 * expand the measurement window by that much. 10012 */ 10013 if (SEQ_GT(rsm->r_end, tp->gput_ack)) { 10014 tp->gput_ack = rsm->r_end; 10015 } 10016 /* 10017 * If we have not setup a ack, then we 10018 * have no idea if the newly acked pieces 10019 * will be "in our seq measurement range". If 10020 * it is when we clear the app_limited_needs_set 10021 * flag the timestamp will be updated. 10022 */ 10023 if (rack->app_limited_needs_set) 10024 return; 10025 /* 10026 * Finally, we grab out the latest timestamp 10027 * that this packet was sent and then see 10028 * if: 10029 * a) The packet touches are newly defined GP range. 10030 * b) The time is greater than (newer) than the 10031 * one we currently have. If so we update 10032 * our sending end time window. 10033 * 10034 * Note we *do not* do this at send time. The reason 10035 * is that if you do you *may* pick up a newer timestamp 10036 * for a range you are not going to measure. We project 10037 * out how far and then sometimes modify that to be 10038 * smaller. If that occurs then you will have a send 10039 * that does not belong to the range included. 10040 */ 10041 if ((ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]) <= 10042 rack->r_ctl.rc_gp_cumack_ts) 10043 return; 10044 if (rack_in_gp_window(tp, rsm)) { 10045 rack->r_ctl.rc_gp_cumack_ts = ts; 10046 rack_log_gpset(rack, tp->gput_ack, (uint32_t)ts, rsm->r_end, 10047 __LINE__, from, rsm); 10048 } 10049 } 10050 10051 static void 10052 rack_process_to_cumack(struct tcpcb *tp, struct tcp_rack *rack, register uint32_t th_ack, uint32_t cts, struct tcpopt *to, uint64_t acktime) 10053 { 10054 struct rack_sendmap *rsm; 10055 /* 10056 * The ACK point is advancing to th_ack, we must drop off 10057 * the packets in the rack log and calculate any eligble 10058 * RTT's. 10059 */ 10060 10061 if (sack_filter_blks_used(&rack->r_ctl.rack_sf)) { 10062 /* 10063 * If we have some sack blocks in the filter 10064 * lets prune them out by calling sfb with no blocks. 10065 */ 10066 sack_filter_blks(tp, &rack->r_ctl.rack_sf, NULL, 0, th_ack); 10067 } 10068 if (SEQ_GT(th_ack, tp->snd_una)) { 10069 /* Clear any app ack remembered settings */ 10070 rack->r_ctl.cleared_app_ack = 0; 10071 } 10072 rack->r_wanted_output = 1; 10073 if (SEQ_GT(th_ack, tp->snd_una)) 10074 rack->r_ctl.last_cumack_advance = acktime; 10075 10076 /* Tend any TLP that has been marked for 1/2 the seq space (its old) */ 10077 if ((rack->rc_last_tlp_acked_set == 1)&& 10078 (rack->rc_last_tlp_past_cumack == 1) && 10079 (SEQ_GT(rack->r_ctl.last_tlp_acked_start, th_ack))) { 10080 /* 10081 * We have reached the point where our last rack 10082 * tlp retransmit sequence is ahead of the cum-ack. 10083 * This can only happen when the cum-ack moves all 10084 * the way around (its been a full 2^^31+1 bytes 10085 * or more since we sent a retransmitted TLP). Lets 10086 * turn off the valid flag since its not really valid. 10087 * 10088 * Note since sack's also turn on this event we have 10089 * a complication, we have to wait to age it out until 10090 * the cum-ack is by the TLP before checking which is 10091 * what the next else clause does. 10092 */ 10093 rack_log_dsack_event(rack, 9, __LINE__, 10094 rack->r_ctl.last_tlp_acked_start, 10095 rack->r_ctl.last_tlp_acked_end); 10096 rack->rc_last_tlp_acked_set = 0; 10097 rack->rc_last_tlp_past_cumack = 0; 10098 } else if ((rack->rc_last_tlp_acked_set == 1) && 10099 (rack->rc_last_tlp_past_cumack == 0) && 10100 (SEQ_GEQ(th_ack, rack->r_ctl.last_tlp_acked_end))) { 10101 /* 10102 * It is safe to start aging TLP's out. 10103 */ 10104 rack->rc_last_tlp_past_cumack = 1; 10105 } 10106 /* We do the same for the tlp send seq as well */ 10107 if ((rack->rc_last_sent_tlp_seq_valid == 1) && 10108 (rack->rc_last_sent_tlp_past_cumack == 1) && 10109 (SEQ_GT(rack->r_ctl.last_sent_tlp_seq, th_ack))) { 10110 rack_log_dsack_event(rack, 9, __LINE__, 10111 rack->r_ctl.last_sent_tlp_seq, 10112 (rack->r_ctl.last_sent_tlp_seq + 10113 rack->r_ctl.last_sent_tlp_len)); 10114 rack->rc_last_sent_tlp_seq_valid = 0; 10115 rack->rc_last_sent_tlp_past_cumack = 0; 10116 } else if ((rack->rc_last_sent_tlp_seq_valid == 1) && 10117 (rack->rc_last_sent_tlp_past_cumack == 0) && 10118 (SEQ_GEQ(th_ack, rack->r_ctl.last_sent_tlp_seq))) { 10119 /* 10120 * It is safe to start aging TLP's send. 10121 */ 10122 rack->rc_last_sent_tlp_past_cumack = 1; 10123 } 10124 more: 10125 rsm = tqhash_min(rack->r_ctl.tqh); 10126 if (rsm == NULL) { 10127 if ((th_ack - 1) == tp->iss) { 10128 /* 10129 * For the SYN incoming case we will not 10130 * have called tcp_output for the sending of 10131 * the SYN, so there will be no map. All 10132 * other cases should probably be a panic. 10133 */ 10134 return; 10135 } 10136 if (tp->t_flags & TF_SENTFIN) { 10137 /* if we sent a FIN we often will not have map */ 10138 return; 10139 } 10140 #ifdef INVARIANTS 10141 panic("No rack map tp:%p for state:%d ack:%u rack:%p snd_una:%u snd_max:%u\n", 10142 tp, 10143 tp->t_state, th_ack, rack, 10144 tp->snd_una, tp->snd_max); 10145 #endif 10146 return; 10147 } 10148 if (SEQ_LT(th_ack, rsm->r_start)) { 10149 /* Huh map is missing this */ 10150 #ifdef INVARIANTS 10151 printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n", 10152 rsm->r_start, 10153 th_ack, tp->t_state, rack->r_state); 10154 #endif 10155 return; 10156 } 10157 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED, th_ack); 10158 10159 /* Now was it a retransmitted TLP? */ 10160 if ((rsm->r_flags & RACK_TLP) && 10161 (rsm->r_rtr_cnt > 1)) { 10162 /* 10163 * Yes, this rsm was a TLP and retransmitted, remember that 10164 * since if a DSACK comes back on this we don't want 10165 * to think of it as a reordered segment. This may 10166 * get updated again with possibly even other TLPs 10167 * in flight, but thats ok. Only when we don't send 10168 * a retransmitted TLP for 1/2 the sequences space 10169 * will it get turned off (above). 10170 */ 10171 if (rack->rc_last_tlp_acked_set && 10172 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 10173 /* 10174 * We already turned this on since the end matches, 10175 * the previous one was a partially ack now we 10176 * are getting another one (maybe all of it). 10177 */ 10178 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 10179 /* 10180 * Lets make sure we have all of it though. 10181 */ 10182 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 10183 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 10184 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 10185 rack->r_ctl.last_tlp_acked_end); 10186 } 10187 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 10188 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 10189 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 10190 rack->r_ctl.last_tlp_acked_end); 10191 } 10192 } else { 10193 rack->rc_last_tlp_past_cumack = 1; 10194 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 10195 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 10196 rack->rc_last_tlp_acked_set = 1; 10197 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 10198 } 10199 } 10200 /* Now do we consume the whole thing? */ 10201 rack->r_ctl.last_tmit_time_acked = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 10202 if (SEQ_GEQ(th_ack, rsm->r_end)) { 10203 /* Its all consumed. */ 10204 uint32_t left; 10205 uint8_t newly_acked; 10206 10207 if (rsm->r_flags & RACK_WAS_LOST) { 10208 /* 10209 * This can happen when we marked it as lost 10210 * and yet before retransmitting we get an ack 10211 * which can happen due to reordering. 10212 */ 10213 rack_mark_nolonger_lost(rack, rsm); 10214 } 10215 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_FREE, rsm->r_end, __LINE__); 10216 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 10217 rsm->r_rtr_bytes = 0; 10218 /* 10219 * Record the time of highest cumack sent if its in our measurement 10220 * window and possibly bump out the end. 10221 */ 10222 rack_rsm_sender_update(rack, tp, rsm, 4); 10223 tqhash_remove(rack->r_ctl.tqh, rsm, REMOVE_TYPE_CUMACK); 10224 if (rsm->r_in_tmap) { 10225 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 10226 rsm->r_in_tmap = 0; 10227 } 10228 newly_acked = 1; 10229 if (rsm->r_flags & RACK_ACKED) { 10230 /* 10231 * It was acked on the scoreboard -- remove 10232 * it from total 10233 */ 10234 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 10235 newly_acked = 0; 10236 } else if (rsm->r_flags & RACK_SACK_PASSED) { 10237 /* 10238 * There are segments ACKED on the 10239 * scoreboard further up. We are seeing 10240 * reordering. 10241 */ 10242 rsm->r_flags &= ~RACK_SACK_PASSED; 10243 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 10244 rsm->r_flags |= RACK_ACKED; 10245 rack->r_ctl.rc_reorder_ts = cts; 10246 if (rack->r_ctl.rc_reorder_ts == 0) 10247 rack->r_ctl.rc_reorder_ts = 1; 10248 if (rack->r_ent_rec_ns) { 10249 /* 10250 * We have sent no more, and we saw an sack 10251 * then ack arrive. 10252 */ 10253 rack->r_might_revert = 1; 10254 } 10255 rack_update_pcm_ack(rack, 1, rsm->r_start, rsm->r_end); 10256 } else { 10257 rack_update_pcm_ack(rack, 1, rsm->r_start, rsm->r_end); 10258 } 10259 if ((rsm->r_flags & RACK_TO_REXT) && 10260 (tp->t_flags & TF_RCVD_TSTMP) && 10261 (to->to_flags & TOF_TS) && 10262 (to->to_tsecr != 0) && 10263 (tp->t_flags & TF_PREVVALID)) { 10264 /* 10265 * We can use the timestamp to see 10266 * if this retransmission was from the 10267 * first transmit. If so we made a mistake. 10268 */ 10269 tp->t_flags &= ~TF_PREVVALID; 10270 if (to->to_tsecr == rack_ts_to_msec(rsm->r_tim_lastsent[0])) { 10271 /* The first transmit is what this ack is for */ 10272 rack_cong_signal(tp, CC_RTO_ERR, th_ack, __LINE__); 10273 } 10274 } 10275 left = th_ack - rsm->r_end; 10276 if (rack->app_limited_needs_set && newly_acked) 10277 rack_need_set_test(tp, rack, rsm, th_ack, __LINE__, RACK_USE_END_OR_THACK); 10278 /* Free back to zone */ 10279 rack_free(rack, rsm); 10280 if (left) { 10281 goto more; 10282 } 10283 /* Check for reneging */ 10284 rsm = tqhash_min(rack->r_ctl.tqh); 10285 if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) { 10286 /* 10287 * The peer has moved snd_una up to 10288 * the edge of this send, i.e. one 10289 * that it had previously acked. The only 10290 * way that can be true if the peer threw 10291 * away data (space issues) that it had 10292 * previously sacked (else it would have 10293 * given us snd_una up to (rsm->r_end). 10294 * We need to undo the acked markings here. 10295 * 10296 * Note we have to look to make sure th_ack is 10297 * our rsm->r_start in case we get an old ack 10298 * where th_ack is behind snd_una. 10299 */ 10300 rack_peer_reneges(rack, rsm, th_ack); 10301 } 10302 return; 10303 } 10304 if (rsm->r_flags & RACK_ACKED) { 10305 /* 10306 * It was acked on the scoreboard -- remove it from 10307 * total for the part being cum-acked. 10308 */ 10309 rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); 10310 } else { 10311 rack_update_pcm_ack(rack, 1, rsm->r_start, th_ack); 10312 } 10313 /* And what about the lost flag? */ 10314 if (rsm->r_flags & RACK_WAS_LOST) { 10315 /* 10316 * This can happen when we marked it as lost 10317 * and yet before retransmitting we get an ack 10318 * which can happen due to reordering. In this 10319 * case its only a partial ack of the send. 10320 */ 10321 rack_mark_nolonger_lost(rack, rsm); 10322 } 10323 /* 10324 * Clear the dup ack count for 10325 * the piece that remains. 10326 */ 10327 rsm->r_dupack = 0; 10328 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 10329 if (rsm->r_rtr_bytes) { 10330 /* 10331 * It was retransmitted adjust the 10332 * sack holes for what was acked. 10333 */ 10334 int ack_am; 10335 10336 ack_am = (th_ack - rsm->r_start); 10337 if (ack_am >= rsm->r_rtr_bytes) { 10338 rack->r_ctl.rc_holes_rxt -= ack_am; 10339 rsm->r_rtr_bytes -= ack_am; 10340 } 10341 } 10342 /* 10343 * Update where the piece starts and record 10344 * the time of send of highest cumack sent if 10345 * its in our GP range. 10346 */ 10347 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_TRIM_HEAD, th_ack, __LINE__); 10348 /* Now we need to move our offset forward too */ 10349 if (rsm->m && 10350 ((rsm->orig_m_len != rsm->m->m_len) || 10351 (M_TRAILINGROOM(rsm->m) != rsm->orig_t_space))) { 10352 /* Fix up the orig_m_len and possibly the mbuf offset */ 10353 rack_adjust_orig_mlen(rsm); 10354 } 10355 rsm->soff += (th_ack - rsm->r_start); 10356 rack_rsm_sender_update(rack, tp, rsm, 5); 10357 /* The trim will move th_ack into r_start for us */ 10358 tqhash_trim(rack->r_ctl.tqh, th_ack); 10359 /* Now do we need to move the mbuf fwd too? */ 10360 { 10361 struct mbuf *m; 10362 uint32_t soff; 10363 10364 m = rsm->m; 10365 soff = rsm->soff; 10366 if (m) { 10367 while (soff >= m->m_len) { 10368 soff -= m->m_len; 10369 KASSERT((m->m_next != NULL), 10370 (" rsm:%p off:%u soff:%u m:%p", 10371 rsm, rsm->soff, soff, m)); 10372 m = m->m_next; 10373 if (m == NULL) { 10374 /* 10375 * This is a fall-back that prevents a panic. In reality 10376 * we should be able to walk the mbuf's and find our place. 10377 * At this point snd_una has not been updated with the sbcut() yet 10378 * but tqhash_trim did update rsm->r_start so the offset calcuation 10379 * should work fine. This is undesirable since we will take cache 10380 * hits to access the socket buffer. And even more puzzling is that 10381 * it happens occasionally. It should not :( 10382 */ 10383 m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 10384 (rsm->r_start - tp->snd_una), 10385 &soff); 10386 break; 10387 } 10388 } 10389 /* 10390 * Now save in our updated values. 10391 */ 10392 rsm->m = m; 10393 rsm->soff = soff; 10394 rsm->orig_m_len = rsm->m->m_len; 10395 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 10396 } 10397 } 10398 if (rack->app_limited_needs_set && 10399 SEQ_GEQ(th_ack, tp->gput_seq)) 10400 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG); 10401 } 10402 10403 static void 10404 rack_handle_might_revert(struct tcpcb *tp, struct tcp_rack *rack) 10405 { 10406 struct rack_sendmap *rsm; 10407 int sack_pass_fnd = 0; 10408 10409 if (rack->r_might_revert) { 10410 /* 10411 * Ok we have reordering, have not sent anything, we 10412 * might want to revert the congestion state if nothing 10413 * further has SACK_PASSED on it. Lets check. 10414 * 10415 * We also get here when we have DSACKs come in for 10416 * all the data that we FR'd. Note that a rxt or tlp 10417 * timer clears this from happening. 10418 */ 10419 10420 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 10421 if (rsm->r_flags & RACK_SACK_PASSED) { 10422 sack_pass_fnd = 1; 10423 break; 10424 } 10425 } 10426 if (sack_pass_fnd == 0) { 10427 /* 10428 * We went into recovery 10429 * incorrectly due to reordering! 10430 */ 10431 int orig_cwnd; 10432 10433 rack->r_ent_rec_ns = 0; 10434 orig_cwnd = tp->snd_cwnd; 10435 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at_erec; 10436 tp->snd_recover = tp->snd_una; 10437 rack_log_to_prr(rack, 14, orig_cwnd, __LINE__); 10438 if (IN_RECOVERY(tp->t_flags)) { 10439 rack_exit_recovery(tp, rack, 3); 10440 if ((rack->rto_from_rec == 1) && (rack_ssthresh_rest_rto_rec != 0) ){ 10441 /* 10442 * We were in recovery, had an RTO 10443 * and then re-entered recovery (more sack's arrived) 10444 * and we have properly recorded the old ssthresh from 10445 * the first recovery. We want to be able to slow-start 10446 * back to this level. The ssthresh from the timeout 10447 * and then back into recovery will end up most likely 10448 * to be min(cwnd=1mss, 2mss). Which makes it basically 10449 * so we get no slow-start after our RTO. 10450 */ 10451 rack->rto_from_rec = 0; 10452 if (rack->r_ctl.rto_ssthresh > tp->snd_ssthresh) 10453 tp->snd_ssthresh = rack->r_ctl.rto_ssthresh; 10454 } 10455 } 10456 } 10457 rack->r_might_revert = 0; 10458 } 10459 } 10460 10461 10462 static int 10463 rack_note_dsack(struct tcp_rack *rack, tcp_seq start, tcp_seq end) 10464 { 10465 10466 uint32_t am, l_end; 10467 int was_tlp = 0; 10468 10469 if (SEQ_GT(end, start)) 10470 am = end - start; 10471 else 10472 am = 0; 10473 if ((rack->rc_last_tlp_acked_set ) && 10474 (SEQ_GEQ(start, rack->r_ctl.last_tlp_acked_start)) && 10475 (SEQ_LEQ(end, rack->r_ctl.last_tlp_acked_end))) { 10476 /* 10477 * The DSACK is because of a TLP which we don't 10478 * do anything with the reordering window over since 10479 * it was not reordering that caused the DSACK but 10480 * our previous retransmit TLP. 10481 */ 10482 rack_log_dsack_event(rack, 7, __LINE__, start, end); 10483 was_tlp = 1; 10484 goto skip_dsack_round; 10485 } 10486 if (rack->rc_last_sent_tlp_seq_valid) { 10487 l_end = rack->r_ctl.last_sent_tlp_seq + rack->r_ctl.last_sent_tlp_len; 10488 if (SEQ_GEQ(start, rack->r_ctl.last_sent_tlp_seq) && 10489 (SEQ_LEQ(end, l_end))) { 10490 /* 10491 * This dsack is from the last sent TLP, ignore it 10492 * for reordering purposes. 10493 */ 10494 rack_log_dsack_event(rack, 7, __LINE__, start, end); 10495 was_tlp = 1; 10496 goto skip_dsack_round; 10497 } 10498 } 10499 if (rack->rc_dsack_round_seen == 0) { 10500 rack->rc_dsack_round_seen = 1; 10501 rack->r_ctl.dsack_round_end = rack->rc_tp->snd_max; 10502 rack->r_ctl.num_dsack++; 10503 rack->r_ctl.dsack_persist = 16; /* 16 is from the standard */ 10504 rack_log_dsack_event(rack, 2, __LINE__, 0, 0); 10505 } 10506 skip_dsack_round: 10507 /* 10508 * We keep track of how many DSACK blocks we get 10509 * after a recovery incident. 10510 */ 10511 rack->r_ctl.dsack_byte_cnt += am; 10512 if (!IN_FASTRECOVERY(rack->rc_tp->t_flags) && 10513 rack->r_ctl.retran_during_recovery && 10514 (rack->r_ctl.dsack_byte_cnt >= rack->r_ctl.retran_during_recovery)) { 10515 /* 10516 * False recovery most likely culprit is reordering. If 10517 * nothing else is missing we need to revert. 10518 */ 10519 rack->r_might_revert = 1; 10520 rack_handle_might_revert(rack->rc_tp, rack); 10521 rack->r_might_revert = 0; 10522 rack->r_ctl.retran_during_recovery = 0; 10523 rack->r_ctl.dsack_byte_cnt = 0; 10524 } 10525 return (was_tlp); 10526 } 10527 10528 static uint32_t 10529 do_rack_compute_pipe(struct tcpcb *tp, struct tcp_rack *rack, uint32_t snd_una) 10530 { 10531 return (((tp->snd_max - snd_una) - 10532 (rack->r_ctl.rc_sacked + rack->r_ctl.rc_considered_lost)) + rack->r_ctl.rc_holes_rxt); 10533 } 10534 10535 static int32_t 10536 rack_compute_pipe(struct tcpcb *tp) 10537 { 10538 return ((int32_t)do_rack_compute_pipe(tp, 10539 (struct tcp_rack *)tp->t_fb_ptr, 10540 tp->snd_una)); 10541 } 10542 10543 static void 10544 rack_update_prr(struct tcpcb *tp, struct tcp_rack *rack, uint32_t changed, tcp_seq th_ack) 10545 { 10546 /* Deal with changed and PRR here (in recovery only) */ 10547 uint32_t pipe, snd_una; 10548 10549 rack->r_ctl.rc_prr_delivered += changed; 10550 10551 if (sbavail(&rack->rc_inp->inp_socket->so_snd) <= (tp->snd_max - tp->snd_una)) { 10552 /* 10553 * It is all outstanding, we are application limited 10554 * and thus we don't need more room to send anything. 10555 * Note we use tp->snd_una here and not th_ack because 10556 * the data as yet not been cut from the sb. 10557 */ 10558 rack->r_ctl.rc_prr_sndcnt = 0; 10559 return; 10560 } 10561 /* Compute prr_sndcnt */ 10562 if (SEQ_GT(tp->snd_una, th_ack)) { 10563 snd_una = tp->snd_una; 10564 } else { 10565 snd_una = th_ack; 10566 } 10567 pipe = do_rack_compute_pipe(tp, rack, snd_una); 10568 if (pipe > tp->snd_ssthresh) { 10569 long sndcnt; 10570 10571 sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh; 10572 if (rack->r_ctl.rc_prr_recovery_fs > 0) 10573 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; 10574 else { 10575 rack->r_ctl.rc_prr_sndcnt = 0; 10576 rack_log_to_prr(rack, 9, 0, __LINE__); 10577 sndcnt = 0; 10578 } 10579 sndcnt++; 10580 if (sndcnt > (long)rack->r_ctl.rc_prr_out) 10581 sndcnt -= rack->r_ctl.rc_prr_out; 10582 else 10583 sndcnt = 0; 10584 rack->r_ctl.rc_prr_sndcnt = sndcnt; 10585 rack_log_to_prr(rack, 10, 0, __LINE__); 10586 } else { 10587 uint32_t limit; 10588 10589 if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out) 10590 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out); 10591 else 10592 limit = 0; 10593 if (changed > limit) 10594 limit = changed; 10595 limit += ctf_fixed_maxseg(tp); 10596 if (tp->snd_ssthresh > pipe) { 10597 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); 10598 rack_log_to_prr(rack, 11, 0, __LINE__); 10599 } else { 10600 rack->r_ctl.rc_prr_sndcnt = min(0, limit); 10601 rack_log_to_prr(rack, 12, 0, __LINE__); 10602 } 10603 } 10604 } 10605 10606 static void 10607 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered_recovery, int dup_ack_struck, 10608 int *dsack_seen, int *sacks_seen) 10609 { 10610 uint32_t changed; 10611 struct tcp_rack *rack; 10612 struct rack_sendmap *rsm; 10613 struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; 10614 register uint32_t th_ack; 10615 int32_t i, j, k, num_sack_blks = 0; 10616 uint32_t cts, acked, ack_point; 10617 int loop_start = 0; 10618 uint32_t tsused; 10619 uint32_t segsiz; 10620 10621 10622 INP_WLOCK_ASSERT(tptoinpcb(tp)); 10623 if (tcp_get_flags(th) & TH_RST) { 10624 /* We don't log resets */ 10625 return; 10626 } 10627 rack = (struct tcp_rack *)tp->t_fb_ptr; 10628 cts = tcp_get_usecs(NULL); 10629 rsm = tqhash_min(rack->r_ctl.tqh); 10630 changed = 0; 10631 th_ack = th->th_ack; 10632 segsiz = ctf_fixed_maxseg(rack->rc_tp); 10633 if (SEQ_GT(th_ack, tp->snd_una)) { 10634 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__); 10635 tp->t_acktime = ticks; 10636 } 10637 if (rsm && SEQ_GT(th_ack, rsm->r_start)) 10638 changed = th_ack - rsm->r_start; 10639 if (changed) { 10640 rack_process_to_cumack(tp, rack, th_ack, cts, to, 10641 tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time)); 10642 } 10643 if ((to->to_flags & TOF_SACK) == 0) { 10644 /* We are done nothing left and no sack. */ 10645 rack_handle_might_revert(tp, rack); 10646 /* 10647 * For cases where we struck a dup-ack 10648 * with no SACK, add to the changes so 10649 * PRR will work right. 10650 */ 10651 if (dup_ack_struck && (changed == 0)) { 10652 changed += ctf_fixed_maxseg(rack->rc_tp); 10653 } 10654 goto out; 10655 } 10656 /* Sack block processing */ 10657 if (SEQ_GT(th_ack, tp->snd_una)) 10658 ack_point = th_ack; 10659 else 10660 ack_point = tp->snd_una; 10661 for (i = 0; i < to->to_nsacks; i++) { 10662 bcopy((to->to_sacks + i * TCPOLEN_SACK), 10663 &sack, sizeof(sack)); 10664 sack.start = ntohl(sack.start); 10665 sack.end = ntohl(sack.end); 10666 if (SEQ_GT(sack.end, sack.start) && 10667 SEQ_GT(sack.start, ack_point) && 10668 SEQ_LT(sack.start, tp->snd_max) && 10669 SEQ_GT(sack.end, ack_point) && 10670 SEQ_LEQ(sack.end, tp->snd_max)) { 10671 sack_blocks[num_sack_blks] = sack; 10672 num_sack_blks++; 10673 } else if (SEQ_LEQ(sack.start, th_ack) && 10674 SEQ_LEQ(sack.end, th_ack)) { 10675 int was_tlp; 10676 10677 if (dsack_seen != NULL) 10678 *dsack_seen = 1; 10679 was_tlp = rack_note_dsack(rack, sack.start, sack.end); 10680 /* 10681 * Its a D-SACK block. 10682 */ 10683 tcp_record_dsack(tp, sack.start, sack.end, was_tlp); 10684 } 10685 } 10686 if (rack->rc_dsack_round_seen) { 10687 /* Is the dsack roound over? */ 10688 if (SEQ_GEQ(th_ack, rack->r_ctl.dsack_round_end)) { 10689 /* Yes it is */ 10690 rack->rc_dsack_round_seen = 0; 10691 rack_log_dsack_event(rack, 3, __LINE__, 0, 0); 10692 } 10693 } 10694 /* 10695 * Sort the SACK blocks so we can update the rack scoreboard with 10696 * just one pass. 10697 */ 10698 num_sack_blks = sack_filter_blks(tp, &rack->r_ctl.rack_sf, sack_blocks, 10699 num_sack_blks, th->th_ack); 10700 ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks); 10701 if (sacks_seen != NULL) 10702 *sacks_seen = num_sack_blks; 10703 if (num_sack_blks == 0) { 10704 /* Nothing to sack */ 10705 goto out; 10706 } 10707 /* Its a sack of some sort */ 10708 if (num_sack_blks < 2) { 10709 /* Only one, we don't need to sort */ 10710 goto do_sack_work; 10711 } 10712 /* Sort the sacks */ 10713 for (i = 0; i < num_sack_blks; i++) { 10714 for (j = i + 1; j < num_sack_blks; j++) { 10715 if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { 10716 sack = sack_blocks[i]; 10717 sack_blocks[i] = sack_blocks[j]; 10718 sack_blocks[j] = sack; 10719 } 10720 } 10721 } 10722 /* 10723 * Now are any of the sack block ends the same (yes some 10724 * implementations send these)? 10725 */ 10726 again: 10727 if (num_sack_blks == 0) 10728 goto out; 10729 if (num_sack_blks > 1) { 10730 for (i = 0; i < num_sack_blks; i++) { 10731 for (j = i + 1; j < num_sack_blks; j++) { 10732 if (sack_blocks[i].end == sack_blocks[j].end) { 10733 /* 10734 * Ok these two have the same end we 10735 * want the smallest end and then 10736 * throw away the larger and start 10737 * again. 10738 */ 10739 if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) { 10740 /* 10741 * The second block covers 10742 * more area use that 10743 */ 10744 sack_blocks[i].start = sack_blocks[j].start; 10745 } 10746 /* 10747 * Now collapse out the dup-sack and 10748 * lower the count 10749 */ 10750 for (k = (j + 1); k < num_sack_blks; k++) { 10751 sack_blocks[j].start = sack_blocks[k].start; 10752 sack_blocks[j].end = sack_blocks[k].end; 10753 j++; 10754 } 10755 num_sack_blks--; 10756 goto again; 10757 } 10758 } 10759 } 10760 } 10761 do_sack_work: 10762 /* 10763 * First lets look to see if 10764 * we have retransmitted and 10765 * can use the transmit next? 10766 */ 10767 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 10768 if (rsm && 10769 SEQ_GT(sack_blocks[0].end, rsm->r_start) && 10770 SEQ_LT(sack_blocks[0].start, rsm->r_end)) { 10771 /* 10772 * We probably did the FR and the next 10773 * SACK in continues as we would expect. 10774 */ 10775 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, segsiz); 10776 if (acked) { 10777 rack->r_wanted_output = 1; 10778 changed += acked; 10779 } 10780 if (num_sack_blks == 1) { 10781 goto out; 10782 } else { 10783 /* 10784 * Start the loop through the 10785 * rest of blocks, past the first block. 10786 */ 10787 loop_start = 1; 10788 } 10789 } 10790 rsm = rack->r_ctl.rc_sacklast; 10791 for (i = loop_start; i < num_sack_blks; i++) { 10792 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, segsiz); 10793 if (acked) { 10794 rack->r_wanted_output = 1; 10795 changed += acked; 10796 } 10797 } 10798 out: 10799 if (changed) { 10800 /* Something changed cancel the rack timer */ 10801 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 10802 } 10803 tsused = tcp_get_usecs(NULL); 10804 rsm = tcp_rack_output(tp, rack, tsused); 10805 if ((!IN_FASTRECOVERY(tp->t_flags)) && 10806 rsm && 10807 ((rsm->r_flags & RACK_MUST_RXT) == 0)) { 10808 /* Enter recovery */ 10809 entered_recovery = 1; 10810 rack_cong_signal(tp, CC_NDUPACK, th_ack, __LINE__); 10811 /* 10812 * When we enter recovery we need to assure we send 10813 * one packet. 10814 */ 10815 if (rack->rack_no_prr == 0) { 10816 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 10817 rack_log_to_prr(rack, 8, 0, __LINE__); 10818 } 10819 rack->r_timer_override = 1; 10820 rack->r_early = 0; 10821 rack->r_ctl.rc_agg_early = 0; 10822 } else if (IN_FASTRECOVERY(tp->t_flags) && 10823 rsm && 10824 (rack->r_rr_config == 3)) { 10825 /* 10826 * Assure we can output and we get no 10827 * remembered pace time except the retransmit. 10828 */ 10829 rack->r_timer_override = 1; 10830 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 10831 rack->r_ctl.rc_resend = rsm; 10832 } 10833 if (IN_FASTRECOVERY(tp->t_flags) && 10834 (rack->rack_no_prr == 0) && 10835 (entered_recovery == 0)) { 10836 rack_update_prr(tp, rack, changed, th_ack); 10837 if ((rsm && (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) && 10838 ((tcp_in_hpts(rack->rc_tp) == 0) && 10839 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)))) { 10840 /* 10841 * If you are pacing output you don't want 10842 * to override. 10843 */ 10844 rack->r_early = 0; 10845 rack->r_ctl.rc_agg_early = 0; 10846 rack->r_timer_override = 1; 10847 } 10848 } 10849 } 10850 10851 static void 10852 rack_strike_dupack(struct tcp_rack *rack, tcp_seq th_ack) 10853 { 10854 struct rack_sendmap *rsm; 10855 10856 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 10857 while (rsm) { 10858 /* 10859 * We need to skip anything already set 10860 * to be retransmitted. 10861 */ 10862 if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || 10863 (rsm->r_flags & RACK_MUST_RXT)) { 10864 rsm = TAILQ_NEXT(rsm, r_tnext); 10865 continue; 10866 } 10867 break; 10868 } 10869 if (rsm && (rsm->r_dupack < 0xff)) { 10870 rsm->r_dupack++; 10871 if (rsm->r_dupack >= DUP_ACK_THRESHOLD) { 10872 struct timeval tv; 10873 uint32_t cts; 10874 /* 10875 * Here we see if we need to retransmit. For 10876 * a SACK type connection if enough time has passed 10877 * we will get a return of the rsm. For a non-sack 10878 * connection we will get the rsm returned if the 10879 * dupack value is 3 or more. 10880 */ 10881 cts = tcp_get_usecs(&tv); 10882 rack->r_ctl.rc_resend = tcp_rack_output(rack->rc_tp, rack, cts); 10883 if (rack->r_ctl.rc_resend != NULL) { 10884 if (!IN_FASTRECOVERY(rack->rc_tp->t_flags)) { 10885 rack_cong_signal(rack->rc_tp, CC_NDUPACK, 10886 th_ack, __LINE__); 10887 } 10888 rack->r_wanted_output = 1; 10889 rack->r_timer_override = 1; 10890 rack_log_retran_reason(rack, rsm, __LINE__, 1, 3); 10891 } 10892 } else { 10893 rack_log_retran_reason(rack, rsm, __LINE__, 0, 3); 10894 } 10895 } 10896 } 10897 10898 static void 10899 rack_check_bottom_drag(struct tcpcb *tp, 10900 struct tcp_rack *rack, 10901 struct socket *so) 10902 { 10903 /* 10904 * So what is dragging bottom? 10905 * 10906 * Dragging bottom means you were under pacing and had a 10907 * delay in processing inbound acks waiting on our pacing 10908 * timer to expire. While you were waiting all of the acknowledgments 10909 * for the packets you sent have arrived. This means we are pacing 10910 * way underneath the bottleneck to the point where our Goodput 10911 * measurements stop working, since they require more than one 10912 * ack (usually at least 8 packets worth with multiple acks so we can 10913 * gauge the inter-ack times). If that occurs we have a real problem 10914 * since we are stuck in a hole that we can't get out of without 10915 * something speeding us up. 10916 * 10917 * We also check to see if we are widdling down to just one segment 10918 * outstanding. If this occurs and we have room to send in our cwnd/rwnd 10919 * then we are adding the delayed ack interval into our measurments and 10920 * we need to speed up slightly. 10921 */ 10922 uint32_t segsiz, minseg; 10923 10924 segsiz = ctf_fixed_maxseg(tp); 10925 minseg = segsiz; 10926 if (tp->snd_max == tp->snd_una) { 10927 /* 10928 * We are doing dynamic pacing and we are way 10929 * under. Basically everything got acked while 10930 * we were still waiting on the pacer to expire. 10931 * 10932 * This means we need to boost the b/w in 10933 * addition to any earlier boosting of 10934 * the multiplier. 10935 */ 10936 uint64_t lt_bw; 10937 10938 tcp_trace_point(rack->rc_tp, TCP_TP_PACED_BOTTOM); 10939 lt_bw = rack_get_lt_bw(rack); 10940 rack->rc_dragged_bottom = 1; 10941 rack_validate_multipliers_at_or_above100(rack); 10942 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_VALID) && 10943 (rack->dis_lt_bw == 0) && 10944 (rack->use_lesser_lt_bw == 0) && 10945 (lt_bw > 0)) { 10946 /* 10947 * Lets use the long-term b/w we have 10948 * been getting as a base. 10949 */ 10950 if (rack->rc_gp_filled == 0) { 10951 if (lt_bw > ONE_POINT_TWO_MEG) { 10952 /* 10953 * If we have no measurement 10954 * don't let us set in more than 10955 * 1.2Mbps. If we are still too 10956 * low after pacing with this we 10957 * will hopefully have a max b/w 10958 * available to sanity check things. 10959 */ 10960 lt_bw = ONE_POINT_TWO_MEG; 10961 } 10962 rack->r_ctl.rc_rtt_diff = 0; 10963 rack->r_ctl.gp_bw = lt_bw; 10964 rack->rc_gp_filled = 1; 10965 if (rack->r_ctl.num_measurements < RACK_REQ_AVG) 10966 rack->r_ctl.num_measurements = RACK_REQ_AVG; 10967 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 10968 } else if (lt_bw > rack->r_ctl.gp_bw) { 10969 rack->r_ctl.rc_rtt_diff = 0; 10970 if (rack->r_ctl.num_measurements < RACK_REQ_AVG) 10971 rack->r_ctl.num_measurements = RACK_REQ_AVG; 10972 rack->r_ctl.gp_bw = lt_bw; 10973 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 10974 } else 10975 rack_increase_bw_mul(rack, -1, 0, 0, 1); 10976 if ((rack->gp_ready == 0) && 10977 (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) { 10978 /* We have enough measurements now */ 10979 rack->gp_ready = 1; 10980 if (rack->dgp_on || 10981 rack->rack_hibeta) 10982 rack_set_cc_pacing(rack); 10983 if (rack->defer_options) 10984 rack_apply_deferred_options(rack); 10985 } 10986 } else { 10987 /* 10988 * zero rtt possibly?, settle for just an old increase. 10989 */ 10990 rack_increase_bw_mul(rack, -1, 0, 0, 1); 10991 } 10992 } else if ((IN_FASTRECOVERY(tp->t_flags) == 0) && 10993 (sbavail(&so->so_snd) > max((segsiz * (4 + rack_req_segs)), 10994 minseg)) && 10995 (rack->r_ctl.cwnd_to_use > max((segsiz * (rack_req_segs + 2)), minseg)) && 10996 (tp->snd_wnd > max((segsiz * (rack_req_segs + 2)), minseg)) && 10997 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) <= 10998 (segsiz * rack_req_segs))) { 10999 /* 11000 * We are doing dynamic GP pacing and 11001 * we have everything except 1MSS or less 11002 * bytes left out. We are still pacing away. 11003 * And there is data that could be sent, This 11004 * means we are inserting delayed ack time in 11005 * our measurements because we are pacing too slow. 11006 */ 11007 rack_validate_multipliers_at_or_above100(rack); 11008 rack->rc_dragged_bottom = 1; 11009 rack_increase_bw_mul(rack, -1, 0, 0, 1); 11010 } 11011 } 11012 11013 #ifdef TCP_REQUEST_TRK 11014 static void 11015 rack_log_hybrid(struct tcp_rack *rack, uint32_t seq, 11016 struct tcp_sendfile_track *cur, uint8_t mod, int line, int err) 11017 { 11018 int do_log; 11019 11020 do_log = tcp_bblogging_on(rack->rc_tp); 11021 if (do_log == 0) { 11022 if ((do_log = tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING) )== 0) 11023 return; 11024 /* We only allow the three below with point logging on */ 11025 if ((mod != HYBRID_LOG_RULES_APP) && 11026 (mod != HYBRID_LOG_RULES_SET) && 11027 (mod != HYBRID_LOG_REQ_COMP)) 11028 return; 11029 11030 } 11031 if (do_log) { 11032 union tcp_log_stackspecific log; 11033 struct timeval tv; 11034 11035 /* Convert our ms to a microsecond */ 11036 memset(&log, 0, sizeof(log)); 11037 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 11038 log.u_bbr.flex1 = seq; 11039 log.u_bbr.cwnd_gain = line; 11040 if (cur != NULL) { 11041 uint64_t off; 11042 11043 log.u_bbr.flex2 = cur->start_seq; 11044 log.u_bbr.flex3 = cur->end_seq; 11045 log.u_bbr.flex4 = (uint32_t)((cur->localtime >> 32) & 0x00000000ffffffff); 11046 log.u_bbr.flex5 = (uint32_t)(cur->localtime & 0x00000000ffffffff); 11047 log.u_bbr.flex6 = cur->flags; 11048 log.u_bbr.pkts_out = cur->hybrid_flags; 11049 log.u_bbr.rttProp = cur->timestamp; 11050 log.u_bbr.cur_del_rate = cur->cspr; 11051 log.u_bbr.bw_inuse = cur->start; 11052 log.u_bbr.applimited = (uint32_t)(cur->end & 0x00000000ffffffff); 11053 log.u_bbr.delivered = (uint32_t)((cur->end >> 32) & 0x00000000ffffffff) ; 11054 log.u_bbr.epoch = (uint32_t)(cur->deadline & 0x00000000ffffffff); 11055 log.u_bbr.lt_epoch = (uint32_t)((cur->deadline >> 32) & 0x00000000ffffffff) ; 11056 log.u_bbr.inhpts = 1; 11057 #ifdef TCP_REQUEST_TRK 11058 off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_tcpreq_info[0]); 11059 log.u_bbr.use_lt_bw = (uint8_t)(off / sizeof(struct tcp_sendfile_track)); 11060 #endif 11061 } else { 11062 log.u_bbr.flex2 = err; 11063 } 11064 /* 11065 * Fill in flex7 to be CHD (catchup|hybrid|DGP) 11066 */ 11067 log.u_bbr.flex7 = rack->rc_catch_up; 11068 log.u_bbr.flex7 <<= 1; 11069 log.u_bbr.flex7 |= rack->rc_hybrid_mode; 11070 log.u_bbr.flex7 <<= 1; 11071 log.u_bbr.flex7 |= rack->dgp_on; 11072 /* 11073 * Compose bbr_state to be a bit wise 0000ADHF 11074 * where A is the always_pace flag 11075 * where D is the dgp_on flag 11076 * where H is the hybrid_mode on flag 11077 * where F is the use_fixed_rate flag. 11078 */ 11079 log.u_bbr.bbr_state = rack->rc_always_pace; 11080 log.u_bbr.bbr_state <<= 1; 11081 log.u_bbr.bbr_state |= rack->dgp_on; 11082 log.u_bbr.bbr_state <<= 1; 11083 log.u_bbr.bbr_state |= rack->rc_hybrid_mode; 11084 log.u_bbr.bbr_state <<= 1; 11085 log.u_bbr.bbr_state |= rack->use_fixed_rate; 11086 log.u_bbr.flex8 = mod; 11087 log.u_bbr.delRate = rack->r_ctl.bw_rate_cap; 11088 log.u_bbr.bbr_substate = rack->r_ctl.client_suggested_maxseg; 11089 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 11090 log.u_bbr.pkt_epoch = rack->rc_tp->tcp_hybrid_start; 11091 log.u_bbr.lost = rack->rc_tp->tcp_hybrid_error; 11092 log.u_bbr.pacing_gain = (uint16_t)rack->rc_tp->tcp_hybrid_stop; 11093 tcp_log_event(rack->rc_tp, NULL, 11094 &rack->rc_inp->inp_socket->so_rcv, 11095 &rack->rc_inp->inp_socket->so_snd, 11096 TCP_HYBRID_PACING_LOG, 0, 11097 0, &log, false, NULL, __func__, __LINE__, &tv); 11098 } 11099 } 11100 #endif 11101 11102 #ifdef TCP_REQUEST_TRK 11103 static void 11104 rack_set_dgp_hybrid_mode(struct tcp_rack *rack, tcp_seq seq, uint32_t len, uint64_t cts) 11105 { 11106 struct tcp_sendfile_track *rc_cur, *orig_ent; 11107 struct tcpcb *tp; 11108 int err = 0; 11109 11110 orig_ent = rack->r_ctl.rc_last_sft; 11111 rc_cur = tcp_req_find_req_for_seq(rack->rc_tp, seq); 11112 if (rc_cur == NULL) { 11113 /* If not in the beginning what about the end piece */ 11114 if (rack->rc_hybrid_mode) 11115 rack_log_hybrid(rack, seq, NULL, HYBRID_LOG_NO_RANGE, __LINE__, err); 11116 rc_cur = tcp_req_find_req_for_seq(rack->rc_tp, (seq + len - 1)); 11117 } else { 11118 err = 12345; 11119 } 11120 /* If we find no parameters we are in straight DGP mode */ 11121 if(rc_cur == NULL) { 11122 /* None found for this seq, just DGP for now */ 11123 if (rack->rc_hybrid_mode) { 11124 rack->r_ctl.client_suggested_maxseg = 0; 11125 rack->rc_catch_up = 0; 11126 if (rack->cspr_is_fcc == 0) 11127 rack->r_ctl.bw_rate_cap = 0; 11128 else 11129 rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap; 11130 } 11131 if (rack->rc_hybrid_mode) { 11132 rack_log_hybrid(rack, (seq + len - 1), NULL, HYBRID_LOG_NO_RANGE, __LINE__, err); 11133 } 11134 if (rack->r_ctl.rc_last_sft) { 11135 rack->r_ctl.rc_last_sft = NULL; 11136 } 11137 return; 11138 } 11139 if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_WASSET) == 0) { 11140 /* This entry was never setup for hybrid pacing on/off etc */ 11141 if (rack->rc_hybrid_mode) { 11142 rack->r_ctl.client_suggested_maxseg = 0; 11143 rack->rc_catch_up = 0; 11144 rack->r_ctl.bw_rate_cap = 0; 11145 } 11146 if (rack->r_ctl.rc_last_sft) { 11147 rack->r_ctl.rc_last_sft = NULL; 11148 } 11149 if ((rc_cur->flags & TCP_TRK_TRACK_FLG_FSND) == 0) { 11150 rc_cur->flags |= TCP_TRK_TRACK_FLG_FSND; 11151 rc_cur->first_send = cts; 11152 rc_cur->sent_at_fs = rack->rc_tp->t_sndbytes; 11153 rc_cur->rxt_at_fs = rack->rc_tp->t_snd_rxt_bytes; 11154 } 11155 return; 11156 } 11157 /* 11158 * Ok if we have a new entry *or* have never 11159 * set up an entry we need to proceed. If 11160 * we have already set it up this entry we 11161 * just continue along with what we already 11162 * setup. 11163 */ 11164 tp = rack->rc_tp; 11165 if ((rack->r_ctl.rc_last_sft != NULL) && 11166 (rack->r_ctl.rc_last_sft == rc_cur)) { 11167 /* Its already in place */ 11168 if (rack->rc_hybrid_mode) 11169 rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_ISSAME, __LINE__, 0); 11170 return; 11171 } 11172 if (rack->rc_hybrid_mode == 0) { 11173 rack->r_ctl.rc_last_sft = rc_cur; 11174 if (orig_ent) { 11175 orig_ent->sent_at_ls = rack->rc_tp->t_sndbytes; 11176 orig_ent->rxt_at_ls = rack->rc_tp->t_snd_rxt_bytes; 11177 orig_ent->flags |= TCP_TRK_TRACK_FLG_LSND; 11178 } 11179 rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_RULES_APP, __LINE__, 0); 11180 return; 11181 } 11182 if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_CSPR) && rc_cur->cspr){ 11183 /* Compensate for all the header overhead's */ 11184 if (rack->cspr_is_fcc == 0) 11185 rack->r_ctl.bw_rate_cap = rack_compensate_for_linerate(rack, rc_cur->cspr); 11186 else 11187 rack->r_ctl.fillcw_cap = rack_compensate_for_linerate(rack, rc_cur->cspr); 11188 } else { 11189 if (rack->rc_hybrid_mode) { 11190 if (rack->cspr_is_fcc == 0) 11191 rack->r_ctl.bw_rate_cap = 0; 11192 else 11193 rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap; 11194 } 11195 } 11196 if (rc_cur->hybrid_flags & TCP_HYBRID_PACING_H_MS) 11197 rack->r_ctl.client_suggested_maxseg = rc_cur->hint_maxseg; 11198 else 11199 rack->r_ctl.client_suggested_maxseg = 0; 11200 if (rc_cur->timestamp == rack->r_ctl.last_tm_mark) { 11201 /* 11202 * It is the same timestamp as the previous one 11203 * add the hybrid flag that will indicate we use 11204 * sendtime not arrival time for catch-up mode. 11205 */ 11206 rc_cur->hybrid_flags |= TCP_HYBRID_PACING_SENDTIME; 11207 } 11208 if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_CU) && 11209 (rc_cur->cspr > 0)) { 11210 uint64_t len; 11211 11212 rack->rc_catch_up = 1; 11213 /* 11214 * Calculate the deadline time, first set the 11215 * time to when the request arrived. 11216 */ 11217 if (rc_cur->hybrid_flags & TCP_HYBRID_PACING_SENDTIME) { 11218 /* 11219 * For cases where its a duplicate tm (we received more 11220 * than one request for a tm) we want to use now, the point 11221 * where we are just sending the first bit of the request. 11222 */ 11223 rc_cur->deadline = cts; 11224 } else { 11225 /* 11226 * Here we have a different tm from the last request 11227 * so we want to use arrival time as our base. 11228 */ 11229 rc_cur->deadline = rc_cur->localtime; 11230 } 11231 /* 11232 * Next calculate the length and compensate for 11233 * TLS if need be. 11234 */ 11235 len = rc_cur->end - rc_cur->start; 11236 if (tp->t_inpcb.inp_socket->so_snd.sb_tls_info) { 11237 /* 11238 * This session is doing TLS. Take a swag guess 11239 * at the overhead. 11240 */ 11241 len += tcp_estimate_tls_overhead(tp->t_inpcb.inp_socket, len); 11242 } 11243 /* 11244 * Now considering the size, and the cspr, what is the time that 11245 * would be required at the cspr rate. Here we use the raw 11246 * cspr value since the client only looks at the raw data. We 11247 * do use len which includes TLS overhead, but not the TCP/IP etc. 11248 * That will get made up for in the CU pacing rate set. 11249 */ 11250 len *= HPTS_USEC_IN_SEC; 11251 len /= rc_cur->cspr; 11252 rc_cur->deadline += len; 11253 } else { 11254 rack->rc_catch_up = 0; 11255 rc_cur->deadline = 0; 11256 } 11257 if (rack->r_ctl.client_suggested_maxseg != 0) { 11258 /* 11259 * We need to reset the max pace segs if we have a 11260 * client_suggested_maxseg. 11261 */ 11262 rack_set_pace_segments(tp, rack, __LINE__, NULL); 11263 } 11264 if (orig_ent) { 11265 orig_ent->sent_at_ls = rack->rc_tp->t_sndbytes; 11266 orig_ent->rxt_at_ls = rack->rc_tp->t_snd_rxt_bytes; 11267 orig_ent->flags |= TCP_TRK_TRACK_FLG_LSND; 11268 } 11269 rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_RULES_APP, __LINE__, 0); 11270 /* Remember it for next time and for CU mode */ 11271 rack->r_ctl.rc_last_sft = rc_cur; 11272 rack->r_ctl.last_tm_mark = rc_cur->timestamp; 11273 } 11274 #endif 11275 11276 static void 11277 rack_chk_req_and_hybrid_on_out(struct tcp_rack *rack, tcp_seq seq, uint32_t len, uint64_t cts) 11278 { 11279 #ifdef TCP_REQUEST_TRK 11280 struct tcp_sendfile_track *ent; 11281 11282 ent = rack->r_ctl.rc_last_sft; 11283 if ((ent == NULL) || 11284 (ent->flags == TCP_TRK_TRACK_FLG_EMPTY) || 11285 (SEQ_GEQ(seq, ent->end_seq))) { 11286 /* Time to update the track. */ 11287 rack_set_dgp_hybrid_mode(rack, seq, len, cts); 11288 ent = rack->r_ctl.rc_last_sft; 11289 } 11290 /* Out of all */ 11291 if (ent == NULL) { 11292 return; 11293 } 11294 if (SEQ_LT(ent->end_seq, (seq + len))) { 11295 /* 11296 * This is the case where our end_seq guess 11297 * was wrong. This is usually due to TLS having 11298 * more bytes then our guess. It could also be the 11299 * case that the client sent in two requests closely 11300 * and the SB is full of both so we are sending part 11301 * of each (end|beg). In such a case lets move this 11302 * guys end to match the end of this send. That 11303 * way it will complete when all of it is acked. 11304 */ 11305 ent->end_seq = (seq + len); 11306 if (rack->rc_hybrid_mode) 11307 rack_log_hybrid_bw(rack, seq, len, 0, 0, HYBRID_LOG_EXTEND, 0, ent, __LINE__); 11308 } 11309 /* Now validate we have set the send time of this one */ 11310 if ((ent->flags & TCP_TRK_TRACK_FLG_FSND) == 0) { 11311 ent->flags |= TCP_TRK_TRACK_FLG_FSND; 11312 ent->first_send = cts; 11313 ent->sent_at_fs = rack->rc_tp->t_sndbytes; 11314 ent->rxt_at_fs = rack->rc_tp->t_snd_rxt_bytes; 11315 } 11316 #endif 11317 } 11318 11319 static void 11320 rack_gain_for_fastoutput(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t acked_amount) 11321 { 11322 /* 11323 * The fast output path is enabled and we 11324 * have moved the cumack forward. Lets see if 11325 * we can expand forward the fast path length by 11326 * that amount. What we would ideally like to 11327 * do is increase the number of bytes in the 11328 * fast path block (left_to_send) by the 11329 * acked amount. However we have to gate that 11330 * by two factors: 11331 * 1) The amount outstanding and the rwnd of the peer 11332 * (i.e. we don't want to exceed the rwnd of the peer). 11333 * <and> 11334 * 2) The amount of data left in the socket buffer (i.e. 11335 * we can't send beyond what is in the buffer). 11336 * 11337 * Note that this does not take into account any increase 11338 * in the cwnd. We will only extend the fast path by 11339 * what was acked. 11340 */ 11341 uint32_t new_total, gating_val; 11342 11343 new_total = acked_amount + rack->r_ctl.fsb.left_to_send; 11344 gating_val = min((sbavail(&so->so_snd) - (tp->snd_max - tp->snd_una)), 11345 (tp->snd_wnd - (tp->snd_max - tp->snd_una))); 11346 if (new_total <= gating_val) { 11347 /* We can increase left_to_send by the acked amount */ 11348 counter_u64_add(rack_extended_rfo, 1); 11349 rack->r_ctl.fsb.left_to_send = new_total; 11350 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(&rack->rc_inp->inp_socket->so_snd) - (tp->snd_max - tp->snd_una))), 11351 ("rack:%p left_to_send:%u sbavail:%u out:%u", 11352 rack, rack->r_ctl.fsb.left_to_send, 11353 sbavail(&rack->rc_inp->inp_socket->so_snd), 11354 (tp->snd_max - tp->snd_una))); 11355 11356 } 11357 } 11358 11359 static void 11360 rack_adjust_sendmap_head(struct tcp_rack *rack, struct sockbuf *sb) 11361 { 11362 /* 11363 * Here any sendmap entry that points to the 11364 * beginning mbuf must be adjusted to the correct 11365 * offset. This must be called with: 11366 * 1) The socket buffer locked 11367 * 2) snd_una adjusted to its new position. 11368 * 11369 * Note that (2) implies rack_ack_received has also 11370 * been called and all the sbcut's have been done. 11371 * 11372 * We grab the first mbuf in the socket buffer and 11373 * then go through the front of the sendmap, recalculating 11374 * the stored offset for any sendmap entry that has 11375 * that mbuf. We must use the sb functions to do this 11376 * since its possible an add was done has well as 11377 * the subtraction we may have just completed. This should 11378 * not be a penalty though, since we just referenced the sb 11379 * to go in and trim off the mbufs that we freed (of course 11380 * there will be a penalty for the sendmap references though). 11381 * 11382 * Note also with INVARIANT on, we validate with a KASSERT 11383 * that the first sendmap entry has a soff of 0. 11384 * 11385 */ 11386 struct mbuf *m; 11387 struct rack_sendmap *rsm; 11388 tcp_seq snd_una; 11389 #ifdef INVARIANTS 11390 int first_processed = 0; 11391 #endif 11392 11393 snd_una = rack->rc_tp->snd_una; 11394 SOCKBUF_LOCK_ASSERT(sb); 11395 m = sb->sb_mb; 11396 rsm = tqhash_min(rack->r_ctl.tqh); 11397 if ((rsm == NULL) || (m == NULL)) { 11398 /* Nothing outstanding */ 11399 return; 11400 } 11401 /* The very first RSM's mbuf must point to the head mbuf in the sb */ 11402 KASSERT((rsm->m == m), 11403 ("Rack:%p sb:%p rsm:%p -- first rsm mbuf not aligned to sb", 11404 rack, sb, rsm)); 11405 while (rsm->m && (rsm->m == m)) { 11406 /* one to adjust */ 11407 #ifdef INVARIANTS 11408 struct mbuf *tm; 11409 uint32_t soff; 11410 11411 tm = sbsndmbuf(sb, (rsm->r_start - snd_una), &soff); 11412 if ((rsm->orig_m_len != m->m_len) || 11413 (rsm->orig_t_space != M_TRAILINGROOM(m))){ 11414 rack_adjust_orig_mlen(rsm); 11415 } 11416 if (first_processed == 0) { 11417 KASSERT((rsm->soff == 0), 11418 ("Rack:%p rsm:%p -- rsm at head but soff not zero", 11419 rack, rsm)); 11420 first_processed = 1; 11421 } 11422 if ((rsm->soff != soff) || (rsm->m != tm)) { 11423 /* 11424 * This is not a fatal error, we anticipate it 11425 * might happen (the else code), so we count it here 11426 * so that under invariant we can see that it really 11427 * does happen. 11428 */ 11429 counter_u64_add(rack_adjust_map_bw, 1); 11430 } 11431 rsm->m = tm; 11432 rsm->soff = soff; 11433 if (tm) { 11434 rsm->orig_m_len = rsm->m->m_len; 11435 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 11436 } else { 11437 rsm->orig_m_len = 0; 11438 rsm->orig_t_space = 0; 11439 } 11440 #else 11441 rsm->m = sbsndmbuf(sb, (rsm->r_start - snd_una), &rsm->soff); 11442 if (rsm->m) { 11443 rsm->orig_m_len = rsm->m->m_len; 11444 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 11445 } else { 11446 rsm->orig_m_len = 0; 11447 rsm->orig_t_space = 0; 11448 } 11449 #endif 11450 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 11451 if (rsm == NULL) 11452 break; 11453 } 11454 } 11455 11456 #ifdef TCP_REQUEST_TRK 11457 static inline void 11458 rack_req_check_for_comp(struct tcp_rack *rack, tcp_seq th_ack) 11459 { 11460 struct tcp_sendfile_track *ent; 11461 int i; 11462 11463 if ((rack->rc_hybrid_mode == 0) && 11464 (tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING) == 0)) { 11465 /* 11466 * Just do normal completions hybrid pacing is not on 11467 * and CLDL is off as well. 11468 */ 11469 tcp_req_check_for_comp(rack->rc_tp, th_ack); 11470 return; 11471 } 11472 /* 11473 * Originally I was just going to find the th_ack associated 11474 * with an entry. But then I realized a large strech ack could 11475 * in theory ack two or more requests at once. So instead we 11476 * need to find all entries that are completed by th_ack not 11477 * just a single entry and do our logging. 11478 */ 11479 ent = tcp_req_find_a_req_that_is_completed_by(rack->rc_tp, th_ack, &i); 11480 while (ent != NULL) { 11481 /* 11482 * We may be doing hybrid pacing or CLDL and need more details possibly 11483 * so we do it manually instead of calling 11484 * tcp_req_check_for_comp() 11485 */ 11486 uint64_t laa, tim, data, cbw, ftim; 11487 11488 /* Ok this ack frees it */ 11489 rack_log_hybrid(rack, th_ack, 11490 ent, HYBRID_LOG_REQ_COMP, __LINE__, 0); 11491 rack_log_hybrid_sends(rack, ent, __LINE__); 11492 /* calculate the time based on the ack arrival */ 11493 data = ent->end - ent->start; 11494 laa = tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time); 11495 if (ent->flags & TCP_TRK_TRACK_FLG_FSND) { 11496 if (ent->first_send > ent->localtime) 11497 ftim = ent->first_send; 11498 else 11499 ftim = ent->localtime; 11500 } else { 11501 /* TSNH */ 11502 ftim = ent->localtime; 11503 } 11504 if (laa > ent->localtime) 11505 tim = laa - ftim; 11506 else 11507 tim = 0; 11508 cbw = data * HPTS_USEC_IN_SEC; 11509 if (tim > 0) 11510 cbw /= tim; 11511 else 11512 cbw = 0; 11513 rack_log_hybrid_bw(rack, th_ack, cbw, tim, data, HYBRID_LOG_BW_MEASURE, 0, ent, __LINE__); 11514 /* 11515 * Check to see if we are freeing what we are pointing to send wise 11516 * if so be sure to NULL the pointer so we know we are no longer 11517 * set to anything. 11518 */ 11519 if (ent == rack->r_ctl.rc_last_sft) { 11520 rack->r_ctl.rc_last_sft = NULL; 11521 if (rack->rc_hybrid_mode) { 11522 rack->rc_catch_up = 0; 11523 if (rack->cspr_is_fcc == 0) 11524 rack->r_ctl.bw_rate_cap = 0; 11525 else 11526 rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap; 11527 rack->r_ctl.client_suggested_maxseg = 0; 11528 } 11529 } 11530 /* Generate the log that the tcp_netflix call would have */ 11531 tcp_req_log_req_info(rack->rc_tp, ent, 11532 i, TCP_TRK_REQ_LOG_FREED, 0, 0); 11533 /* Free it and see if there is another one */ 11534 tcp_req_free_a_slot(rack->rc_tp, ent); 11535 ent = tcp_req_find_a_req_that_is_completed_by(rack->rc_tp, th_ack, &i); 11536 } 11537 } 11538 #endif 11539 11540 11541 /* 11542 * Return value of 1, we do not need to call rack_process_data(). 11543 * return value of 0, rack_process_data can be called. 11544 * For ret_val if its 0 the TCP is locked, if its non-zero 11545 * its unlocked and probably unsafe to touch the TCB. 11546 */ 11547 static int 11548 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, 11549 struct tcpcb *tp, struct tcpopt *to, 11550 uint32_t tiwin, int32_t tlen, 11551 int32_t * ofia, int32_t thflags, int32_t *ret_val, int32_t orig_tlen) 11552 { 11553 int32_t ourfinisacked = 0; 11554 int32_t nsegs, acked_amount; 11555 int32_t acked; 11556 struct mbuf *mfree; 11557 struct tcp_rack *rack; 11558 int32_t under_pacing = 0; 11559 int32_t post_recovery = 0; 11560 uint32_t p_cwnd; 11561 11562 INP_WLOCK_ASSERT(tptoinpcb(tp)); 11563 11564 rack = (struct tcp_rack *)tp->t_fb_ptr; 11565 if (SEQ_GEQ(tp->snd_una, tp->iss + (65535 << tp->snd_scale))) { 11566 /* Checking SEG.ACK against ISS is definitely redundant. */ 11567 tp->t_flags2 |= TF2_NO_ISS_CHECK; 11568 } 11569 if (!V_tcp_insecure_ack) { 11570 tcp_seq seq_min; 11571 bool ghost_ack_check; 11572 11573 if (tp->t_flags2 & TF2_NO_ISS_CHECK) { 11574 /* Check for too old ACKs (RFC 5961, Section 5.2). */ 11575 seq_min = tp->snd_una - tp->max_sndwnd; 11576 ghost_ack_check = false; 11577 } else { 11578 if (SEQ_GT(tp->iss + 1, tp->snd_una - tp->max_sndwnd)) { 11579 /* Checking for ghost ACKs is stricter. */ 11580 seq_min = tp->iss + 1; 11581 ghost_ack_check = true; 11582 } else { 11583 /* 11584 * Checking for too old ACKs (RFC 5961, 11585 * Section 5.2) is stricter. 11586 */ 11587 seq_min = tp->snd_una - tp->max_sndwnd; 11588 ghost_ack_check = false; 11589 } 11590 } 11591 if (SEQ_LT(th->th_ack, seq_min)) { 11592 if (ghost_ack_check) 11593 TCPSTAT_INC(tcps_rcvghostack); 11594 else 11595 TCPSTAT_INC(tcps_rcvacktooold); 11596 /* Send challenge ACK. */ 11597 ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); 11598 rack->r_wanted_output = 1; 11599 return (1); 11600 } 11601 } 11602 if (SEQ_GT(th->th_ack, tp->snd_max)) { 11603 ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); 11604 rack->r_wanted_output = 1; 11605 return (1); 11606 } 11607 if (rack->gp_ready && 11608 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 11609 under_pacing = 1; 11610 } 11611 if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { 11612 int in_rec, dup_ack_struck = 0; 11613 int dsack_seen = 0, sacks_seen = 0; 11614 11615 in_rec = IN_FASTRECOVERY(tp->t_flags); 11616 if (rack->rc_in_persist) { 11617 tp->t_rxtshift = 0; 11618 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 11619 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 11620 } 11621 11622 if ((th->th_ack == tp->snd_una) && 11623 (tiwin == tp->snd_wnd) && 11624 (orig_tlen == 0) && 11625 ((to->to_flags & TOF_SACK) == 0)) { 11626 rack_strike_dupack(rack, th->th_ack); 11627 dup_ack_struck = 1; 11628 } 11629 rack_log_ack(tp, to, th, ((in_rec == 0) && IN_FASTRECOVERY(tp->t_flags)), 11630 dup_ack_struck, &dsack_seen, &sacks_seen); 11631 11632 } 11633 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 11634 /* 11635 * Old ack, behind (or duplicate to) the last one rcv'd 11636 * Note: We mark reordering is occuring if its 11637 * less than and we have not closed our window. 11638 */ 11639 if (SEQ_LT(th->th_ack, tp->snd_una) && (sbspace(&so->so_rcv) > ctf_fixed_maxseg(tp))) { 11640 rack->r_ctl.rc_reorder_ts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time); 11641 if (rack->r_ctl.rc_reorder_ts == 0) 11642 rack->r_ctl.rc_reorder_ts = 1; 11643 } 11644 return (0); 11645 } 11646 /* 11647 * If we reach this point, ACK is not a duplicate, i.e., it ACKs 11648 * something we sent. 11649 */ 11650 if (tp->t_flags & TF_NEEDSYN) { 11651 /* 11652 * T/TCP: Connection was half-synchronized, and our SYN has 11653 * been ACK'd (so connection is now fully synchronized). Go 11654 * to non-starred state, increment snd_una for ACK of SYN, 11655 * and check if we can do window scaling. 11656 */ 11657 tp->t_flags &= ~TF_NEEDSYN; 11658 tp->snd_una++; 11659 /* Do window scaling? */ 11660 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 11661 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 11662 tp->rcv_scale = tp->request_r_scale; 11663 /* Send window already scaled. */ 11664 } 11665 } 11666 nsegs = max(1, m->m_pkthdr.lro_nsegs); 11667 11668 acked = BYTES_THIS_ACK(tp, th); 11669 if (acked) { 11670 /* 11671 * Any time we move the cum-ack forward clear 11672 * keep-alive tied probe-not-answered. The 11673 * persists clears its own on entry. 11674 */ 11675 rack->probe_not_answered = 0; 11676 } 11677 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 11678 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 11679 /* 11680 * If we just performed our first retransmit, and the ACK arrives 11681 * within our recovery window, then it was a mistake to do the 11682 * retransmit in the first place. Recover our original cwnd and 11683 * ssthresh, and proceed to transmit where we left off. 11684 */ 11685 if ((tp->t_flags & TF_PREVVALID) && 11686 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) { 11687 tp->t_flags &= ~TF_PREVVALID; 11688 if (tp->t_rxtshift == 1 && 11689 (int)(ticks - tp->t_badrxtwin) < 0) 11690 rack_cong_signal(tp, CC_RTO_ERR, th->th_ack, __LINE__); 11691 } 11692 if (acked) { 11693 /* assure we are not backed off */ 11694 tp->t_rxtshift = 0; 11695 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 11696 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 11697 rack->rc_tlp_in_progress = 0; 11698 rack->r_ctl.rc_tlp_cnt_out = 0; 11699 /* 11700 * If it is the RXT timer we want to 11701 * stop it, so we can restart a TLP. 11702 */ 11703 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 11704 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 11705 #ifdef TCP_REQUEST_TRK 11706 rack_req_check_for_comp(rack, th->th_ack); 11707 #endif 11708 } 11709 /* 11710 * If we have a timestamp reply, update smoothed round trip time. If 11711 * no timestamp is present but transmit timer is running and timed 11712 * sequence number was acked, update smoothed round trip time. Since 11713 * we now have an rtt measurement, cancel the timer backoff (cf., 11714 * Phil Karn's retransmit alg.). Recompute the initial retransmit 11715 * timer. 11716 * 11717 * Some boxes send broken timestamp replies during the SYN+ACK 11718 * phase, ignore timestamps of 0 or we could calculate a huge RTT 11719 * and blow up the retransmit timer. 11720 */ 11721 /* 11722 * If all outstanding data is acked, stop retransmit timer and 11723 * remember to restart (more output or persist). If there is more 11724 * data to be acked, restart retransmit timer, using current 11725 * (possibly backed-off) value. 11726 */ 11727 if (acked == 0) { 11728 if (ofia) 11729 *ofia = ourfinisacked; 11730 return (0); 11731 } 11732 if (IN_RECOVERY(tp->t_flags)) { 11733 if (SEQ_LT(th->th_ack, tp->snd_recover) && 11734 (SEQ_LT(th->th_ack, tp->snd_max))) { 11735 tcp_rack_partialack(tp); 11736 } else { 11737 rack_post_recovery(tp, th->th_ack); 11738 post_recovery = 1; 11739 /* 11740 * Grab the segsiz, multiply by 2 and add the snd_cwnd 11741 * that is the max the CC should add if we are exiting 11742 * recovery and doing a late add. 11743 */ 11744 p_cwnd = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 11745 p_cwnd <<= 1; 11746 p_cwnd += tp->snd_cwnd; 11747 } 11748 } else if ((rack->rto_from_rec == 1) && 11749 SEQ_GEQ(th->th_ack, tp->snd_recover)) { 11750 /* 11751 * We were in recovery, hit a rxt timeout 11752 * and never re-entered recovery. The timeout(s) 11753 * made up all the lost data. In such a case 11754 * we need to clear the rto_from_rec flag. 11755 */ 11756 rack->rto_from_rec = 0; 11757 } 11758 /* 11759 * Let the congestion control algorithm update congestion control 11760 * related information. This typically means increasing the 11761 * congestion window. 11762 */ 11763 rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, post_recovery); 11764 if (post_recovery && 11765 (tp->snd_cwnd > p_cwnd)) { 11766 /* Must be non-newreno (cubic) getting too ahead of itself */ 11767 tp->snd_cwnd = p_cwnd; 11768 } 11769 SOCK_SENDBUF_LOCK(so); 11770 acked_amount = min(acked, (int)sbavail(&so->so_snd)); 11771 tp->snd_wnd -= acked_amount; 11772 mfree = sbcut_locked(&so->so_snd, acked_amount); 11773 if ((sbused(&so->so_snd) == 0) && 11774 (acked > acked_amount) && 11775 (tp->t_state >= TCPS_FIN_WAIT_1) && 11776 (tp->t_flags & TF_SENTFIN)) { 11777 /* 11778 * We must be sure our fin 11779 * was sent and acked (we can be 11780 * in FIN_WAIT_1 without having 11781 * sent the fin). 11782 */ 11783 ourfinisacked = 1; 11784 } 11785 tp->snd_una = th->th_ack; 11786 /* wakeups? */ 11787 if (acked_amount && sbavail(&so->so_snd)) 11788 rack_adjust_sendmap_head(rack, &so->so_snd); 11789 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); 11790 /* NB: sowwakeup_locked() does an implicit unlock. */ 11791 sowwakeup_locked(so); 11792 m_freem(mfree); 11793 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 11794 tp->snd_recover = tp->snd_una; 11795 11796 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { 11797 tp->snd_nxt = tp->snd_max; 11798 } 11799 if (under_pacing && 11800 (rack->use_fixed_rate == 0) && 11801 (rack->in_probe_rtt == 0) && 11802 rack->rc_gp_dyn_mul && 11803 rack->rc_always_pace) { 11804 /* Check if we are dragging bottom */ 11805 rack_check_bottom_drag(tp, rack, so); 11806 } 11807 if (tp->snd_una == tp->snd_max) { 11808 /* Nothing left outstanding */ 11809 tp->t_flags &= ~TF_PREVVALID; 11810 if (rack->r_ctl.rc_went_idle_time == 0) 11811 rack->r_ctl.rc_went_idle_time = 1; 11812 rack->r_ctl.retran_during_recovery = 0; 11813 rack->r_ctl.dsack_byte_cnt = 0; 11814 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 11815 if (sbavail(&tptosocket(tp)->so_snd) == 0) 11816 tp->t_acktime = 0; 11817 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 11818 rack->rc_suspicious = 0; 11819 /* Set need output so persist might get set */ 11820 rack->r_wanted_output = 1; 11821 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 11822 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 11823 (sbavail(&so->so_snd) == 0) && 11824 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 11825 /* 11826 * The socket was gone and the 11827 * peer sent data (now or in the past), time to 11828 * reset him. 11829 */ 11830 *ret_val = 1; 11831 /* tcp_close will kill the inp pre-log the Reset */ 11832 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 11833 tp = tcp_close(tp); 11834 ctf_do_dropwithreset(m, tp, th, tlen); 11835 return (1); 11836 } 11837 } 11838 if (ofia) 11839 *ofia = ourfinisacked; 11840 return (0); 11841 } 11842 11843 11844 static void 11845 rack_log_collapse(struct tcp_rack *rack, uint32_t cnt, uint32_t split, uint32_t out, int line, 11846 int dir, uint32_t flags, struct rack_sendmap *rsm) 11847 { 11848 if (tcp_bblogging_on(rack->rc_tp)) { 11849 union tcp_log_stackspecific log; 11850 struct timeval tv; 11851 11852 memset(&log, 0, sizeof(log)); 11853 log.u_bbr.flex1 = cnt; 11854 log.u_bbr.flex2 = split; 11855 log.u_bbr.flex3 = out; 11856 log.u_bbr.flex4 = line; 11857 log.u_bbr.flex5 = rack->r_must_retran; 11858 log.u_bbr.flex6 = flags; 11859 log.u_bbr.flex7 = rack->rc_has_collapsed; 11860 log.u_bbr.flex8 = dir; /* 11861 * 1 is collapsed, 0 is uncollapsed, 11862 * 2 is log of a rsm being marked, 3 is a split. 11863 */ 11864 if (rsm == NULL) 11865 log.u_bbr.rttProp = 0; 11866 else 11867 log.u_bbr.rttProp = (uintptr_t)rsm; 11868 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 11869 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 11870 TCP_LOG_EVENTP(rack->rc_tp, NULL, 11871 &rack->rc_inp->inp_socket->so_rcv, 11872 &rack->rc_inp->inp_socket->so_snd, 11873 TCP_RACK_LOG_COLLAPSE, 0, 11874 0, &log, false, &tv); 11875 } 11876 } 11877 11878 static void 11879 rack_collapsed_window(struct tcp_rack *rack, uint32_t out, tcp_seq th_ack, int line) 11880 { 11881 /* 11882 * Here all we do is mark the collapsed point and set the flag. 11883 * This may happen again and again, but there is no 11884 * sense splitting our map until we know where the 11885 * peer finally lands in the collapse. 11886 */ 11887 tcp_trace_point(rack->rc_tp, TCP_TP_COLLAPSED_WND); 11888 if ((rack->rc_has_collapsed == 0) || 11889 (rack->r_ctl.last_collapse_point != (th_ack + rack->rc_tp->snd_wnd))) 11890 counter_u64_add(rack_collapsed_win_seen, 1); 11891 rack->r_ctl.last_collapse_point = th_ack + rack->rc_tp->snd_wnd; 11892 rack->r_ctl.high_collapse_point = rack->rc_tp->snd_max; 11893 rack->rc_has_collapsed = 1; 11894 rack->r_collapse_point_valid = 1; 11895 rack_log_collapse(rack, 0, th_ack, rack->r_ctl.last_collapse_point, line, 1, 0, NULL); 11896 } 11897 11898 static void 11899 rack_un_collapse_window(struct tcp_rack *rack, int line) 11900 { 11901 struct rack_sendmap *nrsm, *rsm; 11902 int cnt = 0, split = 0; 11903 int insret __diagused; 11904 11905 11906 tcp_trace_point(rack->rc_tp, TCP_TP_COLLAPSED_WND); 11907 rack->rc_has_collapsed = 0; 11908 rsm = tqhash_find(rack->r_ctl.tqh, rack->r_ctl.last_collapse_point); 11909 if (rsm == NULL) { 11910 /* Nothing to do maybe the peer ack'ed it all */ 11911 rack_log_collapse(rack, 0, 0, ctf_outstanding(rack->rc_tp), line, 0, 0, NULL); 11912 return; 11913 } 11914 /* Now do we need to split this one? */ 11915 if (SEQ_GT(rack->r_ctl.last_collapse_point, rsm->r_start)) { 11916 rack_log_collapse(rack, rsm->r_start, rsm->r_end, 11917 rack->r_ctl.last_collapse_point, line, 3, rsm->r_flags, rsm); 11918 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 11919 if (nrsm == NULL) { 11920 /* We can't get a rsm, mark all? */ 11921 nrsm = rsm; 11922 goto no_split; 11923 } 11924 /* Clone it */ 11925 split = 1; 11926 rack_clone_rsm(rack, nrsm, rsm, rack->r_ctl.last_collapse_point); 11927 #ifndef INVARIANTS 11928 (void)tqhash_insert(rack->r_ctl.tqh, nrsm); 11929 #else 11930 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { 11931 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p", 11932 nrsm, insret, rack, rsm); 11933 } 11934 #endif 11935 rack_log_map_chg(rack->rc_tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 11936 rack->r_ctl.last_collapse_point, __LINE__); 11937 if (rsm->r_in_tmap) { 11938 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 11939 nrsm->r_in_tmap = 1; 11940 } 11941 /* 11942 * Set in the new RSM as the 11943 * collapsed starting point 11944 */ 11945 rsm = nrsm; 11946 } 11947 11948 no_split: 11949 TQHASH_FOREACH_FROM(nrsm, rack->r_ctl.tqh, rsm) { 11950 cnt++; 11951 nrsm->r_flags |= RACK_RWND_COLLAPSED; 11952 rack_log_collapse(rack, nrsm->r_start, nrsm->r_end, 0, line, 4, nrsm->r_flags, nrsm); 11953 cnt++; 11954 } 11955 if (cnt) { 11956 counter_u64_add(rack_collapsed_win, 1); 11957 } 11958 rack_log_collapse(rack, cnt, split, ctf_outstanding(rack->rc_tp), line, 0, 0, NULL); 11959 } 11960 11961 static void 11962 rack_handle_delayed_ack(struct tcpcb *tp, struct tcp_rack *rack, 11963 int32_t tlen, int32_t tfo_syn) 11964 { 11965 if (DELAY_ACK(tp, tlen) || tfo_syn) { 11966 rack_timer_cancel(tp, rack, 11967 rack->r_ctl.rc_rcvtime, __LINE__); 11968 tp->t_flags |= TF_DELACK; 11969 } else { 11970 rack->r_wanted_output = 1; 11971 tp->t_flags |= TF_ACKNOW; 11972 } 11973 } 11974 11975 static void 11976 rack_validate_fo_sendwin_up(struct tcpcb *tp, struct tcp_rack *rack) 11977 { 11978 /* 11979 * If fast output is in progress, lets validate that 11980 * the new window did not shrink on us and make it 11981 * so fast output should end. 11982 */ 11983 if (rack->r_fast_output) { 11984 uint32_t out; 11985 11986 /* 11987 * Calculate what we will send if left as is 11988 * and compare that to our send window. 11989 */ 11990 out = ctf_outstanding(tp); 11991 if ((out + rack->r_ctl.fsb.left_to_send) > tp->snd_wnd) { 11992 /* ok we have an issue */ 11993 if (out >= tp->snd_wnd) { 11994 /* Turn off fast output the window is met or collapsed */ 11995 rack->r_fast_output = 0; 11996 } else { 11997 /* we have some room left */ 11998 rack->r_ctl.fsb.left_to_send = tp->snd_wnd - out; 11999 if (rack->r_ctl.fsb.left_to_send < ctf_fixed_maxseg(tp)) { 12000 /* If not at least 1 full segment never mind */ 12001 rack->r_fast_output = 0; 12002 } 12003 } 12004 } 12005 } 12006 } 12007 12008 /* 12009 * Return value of 1, the TCB is unlocked and most 12010 * likely gone, return value of 0, the TCP is still 12011 * locked. 12012 */ 12013 static int 12014 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, 12015 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 12016 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 12017 { 12018 /* 12019 * Update window information. Don't look at window if no ACK: TAC's 12020 * send garbage on first SYN. 12021 */ 12022 int32_t nsegs; 12023 int32_t tfo_syn; 12024 struct tcp_rack *rack; 12025 12026 INP_WLOCK_ASSERT(tptoinpcb(tp)); 12027 12028 rack = (struct tcp_rack *)tp->t_fb_ptr; 12029 nsegs = max(1, m->m_pkthdr.lro_nsegs); 12030 if ((thflags & TH_ACK) && 12031 (SEQ_LT(tp->snd_wl1, th->th_seq) || 12032 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 12033 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 12034 /* keep track of pure window updates */ 12035 if (tlen == 0 && 12036 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 12037 KMOD_TCPSTAT_INC(tcps_rcvwinupd); 12038 tp->snd_wnd = tiwin; 12039 rack_validate_fo_sendwin_up(tp, rack); 12040 tp->snd_wl1 = th->th_seq; 12041 tp->snd_wl2 = th->th_ack; 12042 if (tp->snd_wnd > tp->max_sndwnd) 12043 tp->max_sndwnd = tp->snd_wnd; 12044 rack->r_wanted_output = 1; 12045 } else if (thflags & TH_ACK) { 12046 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { 12047 tp->snd_wnd = tiwin; 12048 rack_validate_fo_sendwin_up(tp, rack); 12049 tp->snd_wl1 = th->th_seq; 12050 tp->snd_wl2 = th->th_ack; 12051 } 12052 } 12053 if (tp->snd_wnd < ctf_outstanding(tp)) 12054 /* The peer collapsed the window */ 12055 rack_collapsed_window(rack, ctf_outstanding(tp), th->th_ack, __LINE__); 12056 else if (rack->rc_has_collapsed) 12057 rack_un_collapse_window(rack, __LINE__); 12058 if ((rack->r_collapse_point_valid) && 12059 (SEQ_GT(th->th_ack, rack->r_ctl.high_collapse_point))) 12060 rack->r_collapse_point_valid = 0; 12061 /* Was persist timer active and now we have window space? */ 12062 if ((rack->rc_in_persist != 0) && 12063 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 12064 rack->r_ctl.rc_pace_min_segs))) { 12065 rack_exit_persist(tp, rack, rack->r_ctl.rc_rcvtime); 12066 tp->snd_nxt = tp->snd_max; 12067 /* Make sure we output to start the timer */ 12068 rack->r_wanted_output = 1; 12069 } 12070 /* Do we enter persists? */ 12071 if ((rack->rc_in_persist == 0) && 12072 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 12073 TCPS_HAVEESTABLISHED(tp->t_state) && 12074 ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) && 12075 sbavail(&tptosocket(tp)->so_snd) && 12076 (sbavail(&tptosocket(tp)->so_snd) > tp->snd_wnd)) { 12077 /* 12078 * Here the rwnd is less than 12079 * the pacing size, we are established, 12080 * nothing is outstanding, and there is 12081 * data to send. Enter persists. 12082 */ 12083 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, tp->snd_una); 12084 } 12085 if (tp->t_flags2 & TF2_DROP_AF_DATA) { 12086 m_freem(m); 12087 return (0); 12088 } 12089 /* 12090 * don't process the URG bit, ignore them drag 12091 * along the up. 12092 */ 12093 tp->rcv_up = tp->rcv_nxt; 12094 12095 /* 12096 * Process the segment text, merging it into the TCP sequencing 12097 * queue, and arranging for acknowledgment of receipt if necessary. 12098 * This process logically involves adjusting tp->rcv_wnd as data is 12099 * presented to the user (this happens in tcp_usrreq.c, case 12100 * PRU_RCVD). If a FIN has already been received on this connection 12101 * then we just ignore the text. 12102 */ 12103 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && 12104 (tp->t_flags & TF_FASTOPEN)); 12105 if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) && 12106 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 12107 tcp_seq save_start = th->th_seq; 12108 tcp_seq save_rnxt = tp->rcv_nxt; 12109 int save_tlen = tlen; 12110 12111 m_adj(m, drop_hdrlen); /* delayed header drop */ 12112 /* 12113 * Insert segment which includes th into TCP reassembly 12114 * queue with control block tp. Set thflags to whether 12115 * reassembly now includes a segment with FIN. This handles 12116 * the common case inline (segment is the next to be 12117 * received on an established connection, and the queue is 12118 * empty), avoiding linkage into and removal from the queue 12119 * and repetition of various conversions. Set DELACK for 12120 * segments received in order, but ack immediately when 12121 * segments are out of order (so fast retransmit can work). 12122 */ 12123 if (th->th_seq == tp->rcv_nxt && 12124 SEGQ_EMPTY(tp) && 12125 (TCPS_HAVEESTABLISHED(tp->t_state) || 12126 tfo_syn)) { 12127 #ifdef NETFLIX_SB_LIMITS 12128 u_int mcnt, appended; 12129 12130 if (so->so_rcv.sb_shlim) { 12131 mcnt = m_memcnt(m); 12132 appended = 0; 12133 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 12134 CFO_NOSLEEP, NULL) == false) { 12135 counter_u64_add(tcp_sb_shlim_fails, 1); 12136 m_freem(m); 12137 return (0); 12138 } 12139 } 12140 #endif 12141 rack_handle_delayed_ack(tp, rack, tlen, tfo_syn); 12142 tp->rcv_nxt += tlen; 12143 if (tlen && 12144 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 12145 (tp->t_fbyte_in == 0)) { 12146 tp->t_fbyte_in = ticks; 12147 if (tp->t_fbyte_in == 0) 12148 tp->t_fbyte_in = 1; 12149 if (tp->t_fbyte_out && tp->t_fbyte_in) 12150 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 12151 } 12152 thflags = tcp_get_flags(th) & TH_FIN; 12153 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 12154 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 12155 SOCK_RECVBUF_LOCK(so); 12156 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 12157 m_freem(m); 12158 } else { 12159 int32_t newsize; 12160 12161 if (tlen > 0) { 12162 newsize = tcp_autorcvbuf(m, th, so, tp, tlen); 12163 if (newsize) 12164 if (!sbreserve_locked(so, SO_RCV, newsize, NULL)) 12165 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 12166 } 12167 #ifdef NETFLIX_SB_LIMITS 12168 appended = 12169 #endif 12170 sbappendstream_locked(&so->so_rcv, m, 0); 12171 } 12172 rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1); 12173 /* NB: sorwakeup_locked() does an implicit unlock. */ 12174 sorwakeup_locked(so); 12175 #ifdef NETFLIX_SB_LIMITS 12176 if (so->so_rcv.sb_shlim && appended != mcnt) 12177 counter_fo_release(so->so_rcv.sb_shlim, 12178 mcnt - appended); 12179 #endif 12180 } else { 12181 /* 12182 * XXX: Due to the header drop above "th" is 12183 * theoretically invalid by now. Fortunately 12184 * m_adj() doesn't actually frees any mbufs when 12185 * trimming from the head. 12186 */ 12187 tcp_seq temp = save_start; 12188 12189 thflags = tcp_reass(tp, th, &temp, &tlen, m); 12190 tp->t_flags |= TF_ACKNOW; 12191 if (tp->t_flags & TF_WAKESOR) { 12192 tp->t_flags &= ~TF_WAKESOR; 12193 /* NB: sorwakeup_locked() does an implicit unlock. */ 12194 sorwakeup_locked(so); 12195 } 12196 } 12197 if ((tp->t_flags & TF_SACK_PERMIT) && 12198 (save_tlen > 0) && 12199 TCPS_HAVEESTABLISHED(tp->t_state)) { 12200 if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { 12201 /* 12202 * DSACK actually handled in the fastpath 12203 * above. 12204 */ 12205 tcp_update_sack_list(tp, save_start, 12206 save_start + save_tlen); 12207 } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { 12208 if ((tp->rcv_numsacks >= 1) && 12209 (tp->sackblks[0].end == save_start)) { 12210 /* 12211 * Partial overlap, recorded at todrop 12212 * above. 12213 */ 12214 tcp_update_sack_list(tp, 12215 tp->sackblks[0].start, 12216 tp->sackblks[0].end); 12217 } else { 12218 tcp_update_dsack_list(tp, save_start, 12219 save_start + save_tlen); 12220 } 12221 } else if (tlen >= save_tlen) { 12222 /* Update of sackblks. */ 12223 tcp_update_dsack_list(tp, save_start, 12224 save_start + save_tlen); 12225 } else if (tlen > 0) { 12226 tcp_update_dsack_list(tp, save_start, 12227 save_start + tlen); 12228 } 12229 } 12230 } else { 12231 m_freem(m); 12232 thflags &= ~TH_FIN; 12233 } 12234 12235 /* 12236 * If FIN is received ACK the FIN and let the user know that the 12237 * connection is closing. 12238 */ 12239 if (thflags & TH_FIN) { 12240 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 12241 /* The socket upcall is handled by socantrcvmore. */ 12242 socantrcvmore(so); 12243 /* 12244 * If connection is half-synchronized (ie NEEDSYN 12245 * flag on) then delay ACK, so it may be piggybacked 12246 * when SYN is sent. Otherwise, since we received a 12247 * FIN then no more input can be expected, send ACK 12248 * now. 12249 */ 12250 if (tp->t_flags & TF_NEEDSYN) { 12251 rack_timer_cancel(tp, rack, 12252 rack->r_ctl.rc_rcvtime, __LINE__); 12253 tp->t_flags |= TF_DELACK; 12254 } else { 12255 tp->t_flags |= TF_ACKNOW; 12256 } 12257 tp->rcv_nxt++; 12258 } 12259 switch (tp->t_state) { 12260 /* 12261 * In SYN_RECEIVED and ESTABLISHED STATES enter the 12262 * CLOSE_WAIT state. 12263 */ 12264 case TCPS_SYN_RECEIVED: 12265 tp->t_starttime = ticks; 12266 /* FALLTHROUGH */ 12267 case TCPS_ESTABLISHED: 12268 rack_timer_cancel(tp, rack, 12269 rack->r_ctl.rc_rcvtime, __LINE__); 12270 tcp_state_change(tp, TCPS_CLOSE_WAIT); 12271 break; 12272 12273 /* 12274 * If still in FIN_WAIT_1 STATE FIN has not been 12275 * acked so enter the CLOSING state. 12276 */ 12277 case TCPS_FIN_WAIT_1: 12278 rack_timer_cancel(tp, rack, 12279 rack->r_ctl.rc_rcvtime, __LINE__); 12280 tcp_state_change(tp, TCPS_CLOSING); 12281 break; 12282 12283 /* 12284 * In FIN_WAIT_2 state enter the TIME_WAIT state, 12285 * starting the time-wait timer, turning off the 12286 * other standard timers. 12287 */ 12288 case TCPS_FIN_WAIT_2: 12289 rack_timer_cancel(tp, rack, 12290 rack->r_ctl.rc_rcvtime, __LINE__); 12291 tcp_twstart(tp); 12292 return (1); 12293 } 12294 } 12295 /* 12296 * Return any desired output. 12297 */ 12298 if ((tp->t_flags & TF_ACKNOW) || 12299 (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) { 12300 rack->r_wanted_output = 1; 12301 } 12302 return (0); 12303 } 12304 12305 /* 12306 * Here nothing is really faster, its just that we 12307 * have broken out the fast-data path also just like 12308 * the fast-ack. 12309 */ 12310 static int 12311 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, 12312 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 12313 uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos) 12314 { 12315 int32_t nsegs; 12316 int32_t newsize = 0; /* automatic sockbuf scaling */ 12317 struct tcp_rack *rack; 12318 #ifdef NETFLIX_SB_LIMITS 12319 u_int mcnt, appended; 12320 #endif 12321 12322 /* 12323 * If last ACK falls within this segment's sequence numbers, record 12324 * the timestamp. NOTE that the test is modified according to the 12325 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 12326 */ 12327 if (__predict_false(th->th_seq != tp->rcv_nxt)) { 12328 return (0); 12329 } 12330 if (tiwin && tiwin != tp->snd_wnd) { 12331 return (0); 12332 } 12333 if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) { 12334 return (0); 12335 } 12336 if (__predict_false((to->to_flags & TOF_TS) && 12337 (TSTMP_LT(to->to_tsval, tp->ts_recent)))) { 12338 return (0); 12339 } 12340 if (__predict_false((th->th_ack != tp->snd_una))) { 12341 return (0); 12342 } 12343 if (__predict_false(tlen > sbspace(&so->so_rcv))) { 12344 return (0); 12345 } 12346 if ((to->to_flags & TOF_TS) != 0 && 12347 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 12348 tp->ts_recent_age = tcp_ts_getticks(); 12349 tp->ts_recent = to->to_tsval; 12350 } 12351 rack = (struct tcp_rack *)tp->t_fb_ptr; 12352 /* 12353 * This is a pure, in-sequence data packet with nothing on the 12354 * reassembly queue and we have enough buffer space to take it. 12355 */ 12356 nsegs = max(1, m->m_pkthdr.lro_nsegs); 12357 12358 #ifdef NETFLIX_SB_LIMITS 12359 if (so->so_rcv.sb_shlim) { 12360 mcnt = m_memcnt(m); 12361 appended = 0; 12362 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 12363 CFO_NOSLEEP, NULL) == false) { 12364 counter_u64_add(tcp_sb_shlim_fails, 1); 12365 m_freem(m); 12366 return (1); 12367 } 12368 } 12369 #endif 12370 /* Clean receiver SACK report if present */ 12371 if (tp->rcv_numsacks) 12372 tcp_clean_sackreport(tp); 12373 KMOD_TCPSTAT_INC(tcps_preddat); 12374 tp->rcv_nxt += tlen; 12375 if (tlen && 12376 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 12377 (tp->t_fbyte_in == 0)) { 12378 tp->t_fbyte_in = ticks; 12379 if (tp->t_fbyte_in == 0) 12380 tp->t_fbyte_in = 1; 12381 if (tp->t_fbyte_out && tp->t_fbyte_in) 12382 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 12383 } 12384 /* 12385 * Pull snd_wl1 up to prevent seq wrap relative to th_seq. 12386 */ 12387 tp->snd_wl1 = th->th_seq; 12388 /* 12389 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt. 12390 */ 12391 tp->rcv_up = tp->rcv_nxt; 12392 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 12393 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 12394 newsize = tcp_autorcvbuf(m, th, so, tp, tlen); 12395 12396 /* Add data to socket buffer. */ 12397 SOCK_RECVBUF_LOCK(so); 12398 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 12399 m_freem(m); 12400 } else { 12401 /* 12402 * Set new socket buffer size. Give up when limit is 12403 * reached. 12404 */ 12405 if (newsize) 12406 if (!sbreserve_locked(so, SO_RCV, newsize, NULL)) 12407 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 12408 m_adj(m, drop_hdrlen); /* delayed header drop */ 12409 #ifdef NETFLIX_SB_LIMITS 12410 appended = 12411 #endif 12412 sbappendstream_locked(&so->so_rcv, m, 0); 12413 ctf_calc_rwin(so, tp); 12414 } 12415 rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1); 12416 /* NB: sorwakeup_locked() does an implicit unlock. */ 12417 sorwakeup_locked(so); 12418 #ifdef NETFLIX_SB_LIMITS 12419 if (so->so_rcv.sb_shlim && mcnt != appended) 12420 counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended); 12421 #endif 12422 rack_handle_delayed_ack(tp, rack, tlen, 0); 12423 if (tp->snd_una == tp->snd_max) 12424 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 12425 return (1); 12426 } 12427 12428 /* 12429 * This subfunction is used to try to highly optimize the 12430 * fast path. We again allow window updates that are 12431 * in sequence to remain in the fast-path. We also add 12432 * in the __predict's to attempt to help the compiler. 12433 * Note that if we return a 0, then we can *not* process 12434 * it and the caller should push the packet into the 12435 * slow-path. 12436 */ 12437 static int 12438 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 12439 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 12440 uint32_t tiwin, int32_t nxt_pkt, uint32_t cts) 12441 { 12442 int32_t acked; 12443 int32_t nsegs; 12444 int32_t under_pacing = 0; 12445 struct tcp_rack *rack; 12446 12447 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 12448 /* Old ack, behind (or duplicate to) the last one rcv'd */ 12449 return (0); 12450 } 12451 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { 12452 /* Above what we have sent? */ 12453 return (0); 12454 } 12455 if (__predict_false(tiwin == 0)) { 12456 /* zero window */ 12457 return (0); 12458 } 12459 if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) { 12460 /* We need a SYN or a FIN, unlikely.. */ 12461 return (0); 12462 } 12463 if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { 12464 /* Timestamp is behind .. old ack with seq wrap? */ 12465 return (0); 12466 } 12467 if (__predict_false(IN_RECOVERY(tp->t_flags))) { 12468 /* Still recovering */ 12469 return (0); 12470 } 12471 rack = (struct tcp_rack *)tp->t_fb_ptr; 12472 if (rack->r_ctl.rc_sacked) { 12473 /* We have sack holes on our scoreboard */ 12474 return (0); 12475 } 12476 /* Ok if we reach here, we can process a fast-ack */ 12477 if (rack->gp_ready && 12478 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 12479 under_pacing = 1; 12480 } 12481 nsegs = max(1, m->m_pkthdr.lro_nsegs); 12482 rack_log_ack(tp, to, th, 0, 0, NULL, NULL); 12483 /* Did the window get updated? */ 12484 if (tiwin != tp->snd_wnd) { 12485 tp->snd_wnd = tiwin; 12486 rack_validate_fo_sendwin_up(tp, rack); 12487 tp->snd_wl1 = th->th_seq; 12488 if (tp->snd_wnd > tp->max_sndwnd) 12489 tp->max_sndwnd = tp->snd_wnd; 12490 } 12491 /* Do we exit persists? */ 12492 if ((rack->rc_in_persist != 0) && 12493 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 12494 rack->r_ctl.rc_pace_min_segs))) { 12495 rack_exit_persist(tp, rack, cts); 12496 } 12497 /* Do we enter persists? */ 12498 if ((rack->rc_in_persist == 0) && 12499 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 12500 TCPS_HAVEESTABLISHED(tp->t_state) && 12501 ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) && 12502 sbavail(&tptosocket(tp)->so_snd) && 12503 (sbavail(&tptosocket(tp)->so_snd) > tp->snd_wnd)) { 12504 /* 12505 * Here the rwnd is less than 12506 * the pacing size, we are established, 12507 * nothing is outstanding, and there is 12508 * data to send. Enter persists. 12509 */ 12510 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, th->th_ack); 12511 } 12512 /* 12513 * If last ACK falls within this segment's sequence numbers, record 12514 * the timestamp. NOTE that the test is modified according to the 12515 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 12516 */ 12517 if ((to->to_flags & TOF_TS) != 0 && 12518 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 12519 tp->ts_recent_age = tcp_ts_getticks(); 12520 tp->ts_recent = to->to_tsval; 12521 } 12522 /* 12523 * This is a pure ack for outstanding data. 12524 */ 12525 KMOD_TCPSTAT_INC(tcps_predack); 12526 12527 /* 12528 * "bad retransmit" recovery. 12529 */ 12530 if ((tp->t_flags & TF_PREVVALID) && 12531 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) { 12532 tp->t_flags &= ~TF_PREVVALID; 12533 if (tp->t_rxtshift == 1 && 12534 (int)(ticks - tp->t_badrxtwin) < 0) 12535 rack_cong_signal(tp, CC_RTO_ERR, th->th_ack, __LINE__); 12536 } 12537 /* 12538 * Recalculate the transmit timer / rtt. 12539 * 12540 * Some boxes send broken timestamp replies during the SYN+ACK 12541 * phase, ignore timestamps of 0 or we could calculate a huge RTT 12542 * and blow up the retransmit timer. 12543 */ 12544 acked = BYTES_THIS_ACK(tp, th); 12545 12546 #ifdef TCP_HHOOK 12547 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 12548 hhook_run_tcp_est_in(tp, th, to); 12549 #endif 12550 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 12551 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 12552 if (acked) { 12553 struct mbuf *mfree; 12554 12555 rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, 0); 12556 SOCK_SENDBUF_LOCK(so); 12557 mfree = sbcut_locked(&so->so_snd, acked); 12558 tp->snd_una = th->th_ack; 12559 /* Note we want to hold the sb lock through the sendmap adjust */ 12560 rack_adjust_sendmap_head(rack, &so->so_snd); 12561 /* Wake up the socket if we have room to write more */ 12562 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); 12563 sowwakeup_locked(so); 12564 m_freem(mfree); 12565 tp->t_rxtshift = 0; 12566 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 12567 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 12568 rack->rc_tlp_in_progress = 0; 12569 rack->r_ctl.rc_tlp_cnt_out = 0; 12570 /* 12571 * If it is the RXT timer we want to 12572 * stop it, so we can restart a TLP. 12573 */ 12574 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 12575 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 12576 12577 #ifdef TCP_REQUEST_TRK 12578 rack_req_check_for_comp(rack, th->th_ack); 12579 #endif 12580 } 12581 /* 12582 * Let the congestion control algorithm update congestion control 12583 * related information. This typically means increasing the 12584 * congestion window. 12585 */ 12586 if (tp->snd_wnd < ctf_outstanding(tp)) { 12587 /* The peer collapsed the window */ 12588 rack_collapsed_window(rack, ctf_outstanding(tp), th->th_ack, __LINE__); 12589 } else if (rack->rc_has_collapsed) 12590 rack_un_collapse_window(rack, __LINE__); 12591 if ((rack->r_collapse_point_valid) && 12592 (SEQ_GT(tp->snd_una, rack->r_ctl.high_collapse_point))) 12593 rack->r_collapse_point_valid = 0; 12594 /* 12595 * Pull snd_wl2 up to prevent seq wrap relative to th_ack. 12596 */ 12597 tp->snd_wl2 = th->th_ack; 12598 tp->t_dupacks = 0; 12599 m_freem(m); 12600 /* ND6_HINT(tp); *//* Some progress has been made. */ 12601 12602 /* 12603 * If all outstanding data are acked, stop retransmit timer, 12604 * otherwise restart timer using current (possibly backed-off) 12605 * value. If process is waiting for space, wakeup/selwakeup/signal. 12606 * If data are ready to send, let tcp_output decide between more 12607 * output or persist. 12608 */ 12609 if (under_pacing && 12610 (rack->use_fixed_rate == 0) && 12611 (rack->in_probe_rtt == 0) && 12612 rack->rc_gp_dyn_mul && 12613 rack->rc_always_pace) { 12614 /* Check if we are dragging bottom */ 12615 rack_check_bottom_drag(tp, rack, so); 12616 } 12617 if (tp->snd_una == tp->snd_max) { 12618 tp->t_flags &= ~TF_PREVVALID; 12619 rack->r_ctl.retran_during_recovery = 0; 12620 rack->rc_suspicious = 0; 12621 rack->r_ctl.dsack_byte_cnt = 0; 12622 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 12623 if (rack->r_ctl.rc_went_idle_time == 0) 12624 rack->r_ctl.rc_went_idle_time = 1; 12625 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 12626 if (sbavail(&tptosocket(tp)->so_snd) == 0) 12627 tp->t_acktime = 0; 12628 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 12629 } 12630 if (acked && rack->r_fast_output) 12631 rack_gain_for_fastoutput(rack, tp, so, (uint32_t)acked); 12632 if (sbavail(&so->so_snd)) { 12633 rack->r_wanted_output = 1; 12634 } 12635 return (1); 12636 } 12637 12638 /* 12639 * Return value of 1, the TCB is unlocked and most 12640 * likely gone, return value of 0, the TCP is still 12641 * locked. 12642 */ 12643 static int 12644 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, 12645 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 12646 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 12647 { 12648 int32_t ret_val = 0; 12649 int32_t orig_tlen = tlen; 12650 int32_t todrop; 12651 int32_t ourfinisacked = 0; 12652 struct tcp_rack *rack; 12653 12654 INP_WLOCK_ASSERT(tptoinpcb(tp)); 12655 12656 ctf_calc_rwin(so, tp); 12657 /* 12658 * If the state is SYN_SENT: if seg contains an ACK, but not for our 12659 * SYN, drop the input. if seg contains a RST, then drop the 12660 * connection. if seg does not contain SYN, then drop it. Otherwise 12661 * this is an acceptable SYN segment initialize tp->rcv_nxt and 12662 * tp->irs if seg contains ack then advance tp->snd_una if seg 12663 * contains an ECE and ECN support is enabled, the stream is ECN 12664 * capable. if SYN has been acked change to ESTABLISHED else 12665 * SYN_RCVD state arrange for segment to be acked (eventually) 12666 * continue processing rest of data/controls. 12667 */ 12668 if ((thflags & TH_ACK) && 12669 (SEQ_LEQ(th->th_ack, tp->iss) || 12670 SEQ_GT(th->th_ack, tp->snd_max))) { 12671 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 12672 ctf_do_dropwithreset(m, tp, th, tlen); 12673 return (1); 12674 } 12675 if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { 12676 TCP_PROBE5(connect__refused, NULL, tp, 12677 mtod(m, const char *), tp, th); 12678 tp = tcp_drop(tp, ECONNREFUSED); 12679 ctf_do_drop(m, tp); 12680 return (1); 12681 } 12682 if (thflags & TH_RST) { 12683 ctf_do_drop(m, tp); 12684 return (1); 12685 } 12686 if (!(thflags & TH_SYN)) { 12687 ctf_do_drop(m, tp); 12688 return (1); 12689 } 12690 tp->irs = th->th_seq; 12691 tcp_rcvseqinit(tp); 12692 rack = (struct tcp_rack *)tp->t_fb_ptr; 12693 if (thflags & TH_ACK) { 12694 int tfo_partial = 0; 12695 12696 KMOD_TCPSTAT_INC(tcps_connects); 12697 soisconnected(so); 12698 #ifdef MAC 12699 mac_socketpeer_set_from_mbuf(m, so); 12700 #endif 12701 /* Do window scaling on this connection? */ 12702 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 12703 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 12704 tp->rcv_scale = tp->request_r_scale; 12705 } 12706 tp->rcv_adv += min(tp->rcv_wnd, 12707 TCP_MAXWIN << tp->rcv_scale); 12708 /* 12709 * If not all the data that was sent in the TFO SYN 12710 * has been acked, resend the remainder right away. 12711 */ 12712 if ((tp->t_flags & TF_FASTOPEN) && 12713 (tp->snd_una != tp->snd_max)) { 12714 /* Was it a partial ack? */ 12715 if (SEQ_LT(th->th_ack, tp->snd_max)) 12716 tfo_partial = 1; 12717 } 12718 /* 12719 * If there's data, delay ACK; if there's also a FIN ACKNOW 12720 * will be turned on later. 12721 */ 12722 if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial) { 12723 rack_timer_cancel(tp, rack, 12724 rack->r_ctl.rc_rcvtime, __LINE__); 12725 tp->t_flags |= TF_DELACK; 12726 } else { 12727 rack->r_wanted_output = 1; 12728 tp->t_flags |= TF_ACKNOW; 12729 } 12730 12731 tcp_ecn_input_syn_sent(tp, thflags, iptos); 12732 12733 if (SEQ_GT(th->th_ack, tp->snd_una)) { 12734 /* 12735 * We advance snd_una for the 12736 * fast open case. If th_ack is 12737 * acknowledging data beyond 12738 * snd_una we can't just call 12739 * ack-processing since the 12740 * data stream in our send-map 12741 * will start at snd_una + 1 (one 12742 * beyond the SYN). If its just 12743 * equal we don't need to do that 12744 * and there is no send_map. 12745 */ 12746 tp->snd_una++; 12747 if (tfo_partial && (SEQ_GT(tp->snd_max, tp->snd_una))) { 12748 /* 12749 * We sent a SYN with data, and thus have a 12750 * sendmap entry with a SYN set. Lets find it 12751 * and take off the send bit and the byte and 12752 * set it up to be what we send (send it next). 12753 */ 12754 struct rack_sendmap *rsm; 12755 12756 rsm = tqhash_min(rack->r_ctl.tqh); 12757 if (rsm) { 12758 if (rsm->r_flags & RACK_HAS_SYN) { 12759 rsm->r_flags &= ~RACK_HAS_SYN; 12760 rsm->r_start++; 12761 } 12762 rack->r_ctl.rc_resend = rsm; 12763 } 12764 } 12765 } 12766 /* 12767 * Received <SYN,ACK> in SYN_SENT[*] state. Transitions: 12768 * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 12769 */ 12770 tp->t_starttime = ticks; 12771 if (tp->t_flags & TF_NEEDFIN) { 12772 tcp_state_change(tp, TCPS_FIN_WAIT_1); 12773 tp->t_flags &= ~TF_NEEDFIN; 12774 thflags &= ~TH_SYN; 12775 } else { 12776 tcp_state_change(tp, TCPS_ESTABLISHED); 12777 TCP_PROBE5(connect__established, NULL, tp, 12778 mtod(m, const char *), tp, th); 12779 rack_cc_conn_init(tp); 12780 } 12781 } else { 12782 /* 12783 * Received initial SYN in SYN-SENT[*] state => simultaneous 12784 * open. If segment contains CC option and there is a 12785 * cached CC, apply TAO test. If it succeeds, connection is * 12786 * half-synchronized. Otherwise, do 3-way handshake: 12787 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If 12788 * there was no CC option, clear cached CC value. 12789 */ 12790 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN | TF_SONOTCONN); 12791 tcp_state_change(tp, TCPS_SYN_RECEIVED); 12792 } 12793 /* 12794 * Advance th->th_seq to correspond to first data byte. If data, 12795 * trim to stay within window, dropping FIN if necessary. 12796 */ 12797 th->th_seq++; 12798 if (tlen > tp->rcv_wnd) { 12799 todrop = tlen - tp->rcv_wnd; 12800 m_adj(m, -todrop); 12801 tlen = tp->rcv_wnd; 12802 thflags &= ~TH_FIN; 12803 KMOD_TCPSTAT_INC(tcps_rcvpackafterwin); 12804 KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 12805 } 12806 tp->snd_wl1 = th->th_seq - 1; 12807 tp->rcv_up = th->th_seq; 12808 /* 12809 * Client side of transaction: already sent SYN and data. If the 12810 * remote host used T/TCP to validate the SYN, our data will be 12811 * ACK'd; if so, enter normal data segment processing in the middle 12812 * of step 5, ack processing. Otherwise, goto step 6. 12813 */ 12814 if (thflags & TH_ACK) { 12815 /* For syn-sent we need to possibly update the rtt */ 12816 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 12817 uint32_t t, mcts; 12818 12819 mcts = tcp_ts_getticks(); 12820 t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC; 12821 if (!tp->t_rttlow || tp->t_rttlow > t) 12822 tp->t_rttlow = t; 12823 rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 4); 12824 tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2); 12825 tcp_rack_xmit_timer_commit(rack, tp); 12826 } 12827 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) 12828 return (ret_val); 12829 /* We may have changed to FIN_WAIT_1 above */ 12830 if (tp->t_state == TCPS_FIN_WAIT_1) { 12831 /* 12832 * In FIN_WAIT_1 STATE in addition to the processing 12833 * for the ESTABLISHED state if our FIN is now 12834 * acknowledged then enter FIN_WAIT_2. 12835 */ 12836 if (ourfinisacked) { 12837 /* 12838 * If we can't receive any more data, then 12839 * closing user can proceed. Starting the 12840 * timer is contrary to the specification, 12841 * but if we don't get a FIN we'll hang 12842 * forever. 12843 * 12844 * XXXjl: we should release the tp also, and 12845 * use a compressed state. 12846 */ 12847 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 12848 soisdisconnected(so); 12849 tcp_timer_activate(tp, TT_2MSL, 12850 (tcp_fast_finwait2_recycle ? 12851 tcp_finwait2_timeout : 12852 TP_MAXIDLE(tp))); 12853 } 12854 tcp_state_change(tp, TCPS_FIN_WAIT_2); 12855 } 12856 } 12857 } 12858 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 12859 tiwin, thflags, nxt_pkt)); 12860 } 12861 12862 /* 12863 * Return value of 1, the TCB is unlocked and most 12864 * likely gone, return value of 0, the TCP is still 12865 * locked. 12866 */ 12867 static int 12868 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, 12869 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 12870 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 12871 { 12872 struct tcp_rack *rack; 12873 int32_t orig_tlen = tlen; 12874 int32_t ret_val = 0; 12875 int32_t ourfinisacked = 0; 12876 12877 rack = (struct tcp_rack *)tp->t_fb_ptr; 12878 ctf_calc_rwin(so, tp); 12879 if ((thflags & TH_RST) || 12880 (tp->t_fin_is_rst && (thflags & TH_FIN))) 12881 return (ctf_process_rst(m, th, so, tp)); 12882 if ((thflags & TH_ACK) && 12883 (SEQ_LEQ(th->th_ack, tp->snd_una) || 12884 SEQ_GT(th->th_ack, tp->snd_max))) { 12885 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 12886 ctf_do_dropwithreset(m, tp, th, tlen); 12887 return (1); 12888 } 12889 if (tp->t_flags & TF_FASTOPEN) { 12890 /* 12891 * When a TFO connection is in SYN_RECEIVED, the 12892 * only valid packets are the initial SYN, a 12893 * retransmit/copy of the initial SYN (possibly with 12894 * a subset of the original data), a valid ACK, a 12895 * FIN, or a RST. 12896 */ 12897 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { 12898 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 12899 ctf_do_dropwithreset(m, tp, th, tlen); 12900 return (1); 12901 } else if (thflags & TH_SYN) { 12902 /* non-initial SYN is ignored */ 12903 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || 12904 (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || 12905 (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { 12906 ctf_do_drop(m, NULL); 12907 return (0); 12908 } 12909 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { 12910 ctf_do_drop(m, NULL); 12911 return (0); 12912 } 12913 } 12914 12915 /* 12916 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 12917 * it's less than ts_recent, drop it. 12918 */ 12919 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 12920 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 12921 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 12922 return (ret_val); 12923 } 12924 /* 12925 * In the SYN-RECEIVED state, validate that the packet belongs to 12926 * this connection before trimming the data to fit the receive 12927 * window. Check the sequence number versus IRS since we know the 12928 * sequence numbers haven't wrapped. This is a partial fix for the 12929 * "LAND" DoS attack. 12930 */ 12931 if (SEQ_LT(th->th_seq, tp->irs)) { 12932 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 12933 ctf_do_dropwithreset(m, tp, th, tlen); 12934 return (1); 12935 } 12936 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 12937 return (ret_val); 12938 } 12939 /* 12940 * If last ACK falls within this segment's sequence numbers, record 12941 * its timestamp. NOTE: 1) That the test incorporates suggestions 12942 * from the latest proposal of the tcplw@cray.com list (Braden 12943 * 1993/04/26). 2) That updating only on newer timestamps interferes 12944 * with our earlier PAWS tests, so this check should be solely 12945 * predicated on the sequence space of this segment. 3) That we 12946 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 12947 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 12948 * SEG.Len, This modified check allows us to overcome RFC1323's 12949 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 12950 * p.869. In such cases, we can still calculate the RTT correctly 12951 * when RCV.NXT == Last.ACK.Sent. 12952 */ 12953 if ((to->to_flags & TOF_TS) != 0 && 12954 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 12955 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 12956 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 12957 tp->ts_recent_age = tcp_ts_getticks(); 12958 tp->ts_recent = to->to_tsval; 12959 } 12960 tp->snd_wnd = tiwin; 12961 rack_validate_fo_sendwin_up(tp, rack); 12962 /* 12963 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 12964 * is on (half-synchronized state), then queue data for later 12965 * processing; else drop segment and return. 12966 */ 12967 if ((thflags & TH_ACK) == 0) { 12968 if (tp->t_flags & TF_FASTOPEN) { 12969 rack_cc_conn_init(tp); 12970 } 12971 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 12972 tiwin, thflags, nxt_pkt)); 12973 } 12974 KMOD_TCPSTAT_INC(tcps_connects); 12975 if (tp->t_flags & TF_SONOTCONN) { 12976 tp->t_flags &= ~TF_SONOTCONN; 12977 soisconnected(so); 12978 } 12979 /* Do window scaling? */ 12980 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 12981 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 12982 tp->rcv_scale = tp->request_r_scale; 12983 } 12984 /* 12985 * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> 12986 * FIN-WAIT-1 12987 */ 12988 tp->t_starttime = ticks; 12989 if ((tp->t_flags & TF_FASTOPEN) && tp->t_tfo_pending) { 12990 tcp_fastopen_decrement_counter(tp->t_tfo_pending); 12991 tp->t_tfo_pending = NULL; 12992 } 12993 if (tp->t_flags & TF_NEEDFIN) { 12994 tcp_state_change(tp, TCPS_FIN_WAIT_1); 12995 tp->t_flags &= ~TF_NEEDFIN; 12996 } else { 12997 tcp_state_change(tp, TCPS_ESTABLISHED); 12998 TCP_PROBE5(accept__established, NULL, tp, 12999 mtod(m, const char *), tp, th); 13000 /* 13001 * TFO connections call cc_conn_init() during SYN 13002 * processing. Calling it again here for such connections 13003 * is not harmless as it would undo the snd_cwnd reduction 13004 * that occurs when a TFO SYN|ACK is retransmitted. 13005 */ 13006 if (!(tp->t_flags & TF_FASTOPEN)) 13007 rack_cc_conn_init(tp); 13008 } 13009 /* 13010 * Account for the ACK of our SYN prior to 13011 * regular ACK processing below, except for 13012 * simultaneous SYN, which is handled later. 13013 */ 13014 if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN)) 13015 tp->snd_una++; 13016 /* 13017 * If segment contains data or ACK, will call tcp_reass() later; if 13018 * not, do so now to pass queued data to user. 13019 */ 13020 if (tlen == 0 && (thflags & TH_FIN) == 0) { 13021 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, 13022 (struct mbuf *)0); 13023 if (tp->t_flags & TF_WAKESOR) { 13024 tp->t_flags &= ~TF_WAKESOR; 13025 /* NB: sorwakeup_locked() does an implicit unlock. */ 13026 sorwakeup_locked(so); 13027 } 13028 } 13029 tp->snd_wl1 = th->th_seq - 1; 13030 /* For syn-recv we need to possibly update the rtt */ 13031 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 13032 uint32_t t, mcts; 13033 13034 mcts = tcp_ts_getticks(); 13035 t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC; 13036 if (!tp->t_rttlow || tp->t_rttlow > t) 13037 tp->t_rttlow = t; 13038 rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 5); 13039 tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2); 13040 tcp_rack_xmit_timer_commit(rack, tp); 13041 } 13042 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) { 13043 return (ret_val); 13044 } 13045 if (tp->t_state == TCPS_FIN_WAIT_1) { 13046 /* We could have went to FIN_WAIT_1 (or EST) above */ 13047 /* 13048 * In FIN_WAIT_1 STATE in addition to the processing for the 13049 * ESTABLISHED state if our FIN is now acknowledged then 13050 * enter FIN_WAIT_2. 13051 */ 13052 if (ourfinisacked) { 13053 /* 13054 * If we can't receive any more data, then closing 13055 * user can proceed. Starting the timer is contrary 13056 * to the specification, but if we don't get a FIN 13057 * we'll hang forever. 13058 * 13059 * XXXjl: we should release the tp also, and use a 13060 * compressed state. 13061 */ 13062 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 13063 soisdisconnected(so); 13064 tcp_timer_activate(tp, TT_2MSL, 13065 (tcp_fast_finwait2_recycle ? 13066 tcp_finwait2_timeout : 13067 TP_MAXIDLE(tp))); 13068 } 13069 tcp_state_change(tp, TCPS_FIN_WAIT_2); 13070 } 13071 } 13072 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13073 tiwin, thflags, nxt_pkt)); 13074 } 13075 13076 /* 13077 * Return value of 1, the TCB is unlocked and most 13078 * likely gone, return value of 0, the TCP is still 13079 * locked. 13080 */ 13081 static int 13082 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, 13083 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 13084 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 13085 { 13086 int32_t ret_val = 0; 13087 int32_t orig_tlen = tlen; 13088 struct tcp_rack *rack; 13089 13090 /* 13091 * Header prediction: check for the two common cases of a 13092 * uni-directional data xfer. If the packet has no control flags, 13093 * is in-sequence, the window didn't change and we're not 13094 * retransmitting, it's a candidate. If the length is zero and the 13095 * ack moved forward, we're the sender side of the xfer. Just free 13096 * the data acked & wake any higher level process that was blocked 13097 * waiting for space. If the length is non-zero and the ack didn't 13098 * move, we're the receiver side. If we're getting packets in-order 13099 * (the reassembly queue is empty), add the data toc The socket 13100 * buffer and note that we need a delayed ack. Make sure that the 13101 * hidden state-flags are also off. Since we check for 13102 * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN. 13103 */ 13104 rack = (struct tcp_rack *)tp->t_fb_ptr; 13105 if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && 13106 __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_ACK)) == TH_ACK) && 13107 __predict_true(SEGQ_EMPTY(tp)) && 13108 __predict_true(th->th_seq == tp->rcv_nxt)) { 13109 if (tlen == 0) { 13110 if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen, 13111 tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) { 13112 return (0); 13113 } 13114 } else { 13115 if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, 13116 tiwin, nxt_pkt, iptos)) { 13117 return (0); 13118 } 13119 } 13120 } 13121 ctf_calc_rwin(so, tp); 13122 13123 if ((thflags & TH_RST) || 13124 (tp->t_fin_is_rst && (thflags & TH_FIN))) 13125 return (ctf_process_rst(m, th, so, tp)); 13126 13127 /* 13128 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 13129 * synchronized state. 13130 */ 13131 if (thflags & TH_SYN) { 13132 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 13133 return (ret_val); 13134 } 13135 /* 13136 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 13137 * it's less than ts_recent, drop it. 13138 */ 13139 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 13140 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 13141 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 13142 return (ret_val); 13143 } 13144 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 13145 return (ret_val); 13146 } 13147 /* 13148 * If last ACK falls within this segment's sequence numbers, record 13149 * its timestamp. NOTE: 1) That the test incorporates suggestions 13150 * from the latest proposal of the tcplw@cray.com list (Braden 13151 * 1993/04/26). 2) That updating only on newer timestamps interferes 13152 * with our earlier PAWS tests, so this check should be solely 13153 * predicated on the sequence space of this segment. 3) That we 13154 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 13155 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 13156 * SEG.Len, This modified check allows us to overcome RFC1323's 13157 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 13158 * p.869. In such cases, we can still calculate the RTT correctly 13159 * when RCV.NXT == Last.ACK.Sent. 13160 */ 13161 if ((to->to_flags & TOF_TS) != 0 && 13162 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 13163 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 13164 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 13165 tp->ts_recent_age = tcp_ts_getticks(); 13166 tp->ts_recent = to->to_tsval; 13167 } 13168 /* 13169 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 13170 * is on (half-synchronized state), then queue data for later 13171 * processing; else drop segment and return. 13172 */ 13173 if ((thflags & TH_ACK) == 0) { 13174 if (tp->t_flags & TF_NEEDSYN) { 13175 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13176 tiwin, thflags, nxt_pkt)); 13177 13178 } else if (tp->t_flags & TF_ACKNOW) { 13179 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 13180 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 13181 return (ret_val); 13182 } else { 13183 ctf_do_drop(m, NULL); 13184 return (0); 13185 } 13186 } 13187 /* 13188 * Ack processing. 13189 */ 13190 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val, orig_tlen)) { 13191 return (ret_val); 13192 } 13193 if (sbavail(&so->so_snd)) { 13194 if (ctf_progress_timeout_check(tp, true)) { 13195 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 13196 ctf_do_dropwithreset_conn(m, tp, th, tlen); 13197 return (1); 13198 } 13199 } 13200 /* State changes only happen in rack_process_data() */ 13201 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13202 tiwin, thflags, nxt_pkt)); 13203 } 13204 13205 /* 13206 * Return value of 1, the TCB is unlocked and most 13207 * likely gone, return value of 0, the TCP is still 13208 * locked. 13209 */ 13210 static int 13211 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, 13212 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 13213 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 13214 { 13215 int32_t ret_val = 0; 13216 int32_t orig_tlen = tlen; 13217 13218 ctf_calc_rwin(so, tp); 13219 if ((thflags & TH_RST) || 13220 (tp->t_fin_is_rst && (thflags & TH_FIN))) 13221 return (ctf_process_rst(m, th, so, tp)); 13222 /* 13223 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 13224 * synchronized state. 13225 */ 13226 if (thflags & TH_SYN) { 13227 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 13228 return (ret_val); 13229 } 13230 /* 13231 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 13232 * it's less than ts_recent, drop it. 13233 */ 13234 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 13235 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 13236 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 13237 return (ret_val); 13238 } 13239 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 13240 return (ret_val); 13241 } 13242 /* 13243 * If last ACK falls within this segment's sequence numbers, record 13244 * its timestamp. NOTE: 1) That the test incorporates suggestions 13245 * from the latest proposal of the tcplw@cray.com list (Braden 13246 * 1993/04/26). 2) That updating only on newer timestamps interferes 13247 * with our earlier PAWS tests, so this check should be solely 13248 * predicated on the sequence space of this segment. 3) That we 13249 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 13250 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 13251 * SEG.Len, This modified check allows us to overcome RFC1323's 13252 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 13253 * p.869. In such cases, we can still calculate the RTT correctly 13254 * when RCV.NXT == Last.ACK.Sent. 13255 */ 13256 if ((to->to_flags & TOF_TS) != 0 && 13257 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 13258 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 13259 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 13260 tp->ts_recent_age = tcp_ts_getticks(); 13261 tp->ts_recent = to->to_tsval; 13262 } 13263 /* 13264 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 13265 * is on (half-synchronized state), then queue data for later 13266 * processing; else drop segment and return. 13267 */ 13268 if ((thflags & TH_ACK) == 0) { 13269 if (tp->t_flags & TF_NEEDSYN) { 13270 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13271 tiwin, thflags, nxt_pkt)); 13272 13273 } else if (tp->t_flags & TF_ACKNOW) { 13274 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 13275 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 13276 return (ret_val); 13277 } else { 13278 ctf_do_drop(m, NULL); 13279 return (0); 13280 } 13281 } 13282 /* 13283 * Ack processing. 13284 */ 13285 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val, orig_tlen)) { 13286 return (ret_val); 13287 } 13288 if (sbavail(&so->so_snd)) { 13289 if (ctf_progress_timeout_check(tp, true)) { 13290 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 13291 tp, tick, PROGRESS_DROP, __LINE__); 13292 ctf_do_dropwithreset_conn(m, tp, th, tlen); 13293 return (1); 13294 } 13295 } 13296 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13297 tiwin, thflags, nxt_pkt)); 13298 } 13299 13300 static int 13301 rack_check_data_after_close(struct mbuf *m, 13302 struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so) 13303 { 13304 struct tcp_rack *rack; 13305 13306 rack = (struct tcp_rack *)tp->t_fb_ptr; 13307 if (rack->rc_allow_data_af_clo == 0) { 13308 close_now: 13309 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 13310 /* tcp_close will kill the inp pre-log the Reset */ 13311 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 13312 tp = tcp_close(tp); 13313 KMOD_TCPSTAT_INC(tcps_rcvafterclose); 13314 ctf_do_dropwithreset(m, tp, th, *tlen); 13315 return (1); 13316 } 13317 if (sbavail(&so->so_snd) == 0) 13318 goto close_now; 13319 /* Ok we allow data that is ignored and a followup reset */ 13320 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 13321 tp->rcv_nxt = th->th_seq + *tlen; 13322 tp->t_flags2 |= TF2_DROP_AF_DATA; 13323 rack->r_wanted_output = 1; 13324 *tlen = 0; 13325 return (0); 13326 } 13327 13328 /* 13329 * Return value of 1, the TCB is unlocked and most 13330 * likely gone, return value of 0, the TCP is still 13331 * locked. 13332 */ 13333 static int 13334 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, 13335 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 13336 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 13337 { 13338 int32_t ret_val = 0; 13339 int32_t orig_tlen = tlen; 13340 int32_t ourfinisacked = 0; 13341 13342 ctf_calc_rwin(so, tp); 13343 13344 if ((thflags & TH_RST) || 13345 (tp->t_fin_is_rst && (thflags & TH_FIN))) 13346 return (ctf_process_rst(m, th, so, tp)); 13347 /* 13348 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 13349 * synchronized state. 13350 */ 13351 if (thflags & TH_SYN) { 13352 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 13353 return (ret_val); 13354 } 13355 /* 13356 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 13357 * it's less than ts_recent, drop it. 13358 */ 13359 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 13360 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 13361 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 13362 return (ret_val); 13363 } 13364 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 13365 return (ret_val); 13366 } 13367 /* 13368 * If new data are received on a connection after the user processes 13369 * are gone, then RST the other end. 13370 */ 13371 if ((tp->t_flags & TF_CLOSED) && tlen && 13372 rack_check_data_after_close(m, tp, &tlen, th, so)) 13373 return (1); 13374 /* 13375 * If last ACK falls within this segment's sequence numbers, record 13376 * its timestamp. NOTE: 1) That the test incorporates suggestions 13377 * from the latest proposal of the tcplw@cray.com list (Braden 13378 * 1993/04/26). 2) That updating only on newer timestamps interferes 13379 * with our earlier PAWS tests, so this check should be solely 13380 * predicated on the sequence space of this segment. 3) That we 13381 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 13382 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 13383 * SEG.Len, This modified check allows us to overcome RFC1323's 13384 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 13385 * p.869. In such cases, we can still calculate the RTT correctly 13386 * when RCV.NXT == Last.ACK.Sent. 13387 */ 13388 if ((to->to_flags & TOF_TS) != 0 && 13389 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 13390 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 13391 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 13392 tp->ts_recent_age = tcp_ts_getticks(); 13393 tp->ts_recent = to->to_tsval; 13394 } 13395 /* 13396 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 13397 * is on (half-synchronized state), then queue data for later 13398 * processing; else drop segment and return. 13399 */ 13400 if ((thflags & TH_ACK) == 0) { 13401 if (tp->t_flags & TF_NEEDSYN) { 13402 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13403 tiwin, thflags, nxt_pkt)); 13404 } else if (tp->t_flags & TF_ACKNOW) { 13405 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 13406 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 13407 return (ret_val); 13408 } else { 13409 ctf_do_drop(m, NULL); 13410 return (0); 13411 } 13412 } 13413 /* 13414 * Ack processing. 13415 */ 13416 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) { 13417 return (ret_val); 13418 } 13419 if (ourfinisacked) { 13420 /* 13421 * If we can't receive any more data, then closing user can 13422 * proceed. Starting the timer is contrary to the 13423 * specification, but if we don't get a FIN we'll hang 13424 * forever. 13425 * 13426 * XXXjl: we should release the tp also, and use a 13427 * compressed state. 13428 */ 13429 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 13430 soisdisconnected(so); 13431 tcp_timer_activate(tp, TT_2MSL, 13432 (tcp_fast_finwait2_recycle ? 13433 tcp_finwait2_timeout : 13434 TP_MAXIDLE(tp))); 13435 } 13436 tcp_state_change(tp, TCPS_FIN_WAIT_2); 13437 } 13438 if (sbavail(&so->so_snd)) { 13439 if (ctf_progress_timeout_check(tp, true)) { 13440 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 13441 tp, tick, PROGRESS_DROP, __LINE__); 13442 ctf_do_dropwithreset_conn(m, tp, th, tlen); 13443 return (1); 13444 } 13445 } 13446 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13447 tiwin, thflags, nxt_pkt)); 13448 } 13449 13450 /* 13451 * Return value of 1, the TCB is unlocked and most 13452 * likely gone, return value of 0, the TCP is still 13453 * locked. 13454 */ 13455 static int 13456 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, 13457 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 13458 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 13459 { 13460 int32_t ret_val = 0; 13461 int32_t orig_tlen = tlen; 13462 int32_t ourfinisacked = 0; 13463 13464 ctf_calc_rwin(so, tp); 13465 13466 if ((thflags & TH_RST) || 13467 (tp->t_fin_is_rst && (thflags & TH_FIN))) 13468 return (ctf_process_rst(m, th, so, tp)); 13469 /* 13470 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 13471 * synchronized state. 13472 */ 13473 if (thflags & TH_SYN) { 13474 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 13475 return (ret_val); 13476 } 13477 /* 13478 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 13479 * it's less than ts_recent, drop it. 13480 */ 13481 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 13482 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 13483 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 13484 return (ret_val); 13485 } 13486 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 13487 return (ret_val); 13488 } 13489 /* 13490 * If last ACK falls within this segment's sequence numbers, record 13491 * its timestamp. NOTE: 1) That the test incorporates suggestions 13492 * from the latest proposal of the tcplw@cray.com list (Braden 13493 * 1993/04/26). 2) That updating only on newer timestamps interferes 13494 * with our earlier PAWS tests, so this check should be solely 13495 * predicated on the sequence space of this segment. 3) That we 13496 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 13497 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 13498 * SEG.Len, This modified check allows us to overcome RFC1323's 13499 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 13500 * p.869. In such cases, we can still calculate the RTT correctly 13501 * when RCV.NXT == Last.ACK.Sent. 13502 */ 13503 if ((to->to_flags & TOF_TS) != 0 && 13504 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 13505 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 13506 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 13507 tp->ts_recent_age = tcp_ts_getticks(); 13508 tp->ts_recent = to->to_tsval; 13509 } 13510 /* 13511 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 13512 * is on (half-synchronized state), then queue data for later 13513 * processing; else drop segment and return. 13514 */ 13515 if ((thflags & TH_ACK) == 0) { 13516 if (tp->t_flags & TF_NEEDSYN) { 13517 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13518 tiwin, thflags, nxt_pkt)); 13519 } else if (tp->t_flags & TF_ACKNOW) { 13520 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 13521 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 13522 return (ret_val); 13523 } else { 13524 ctf_do_drop(m, NULL); 13525 return (0); 13526 } 13527 } 13528 /* 13529 * Ack processing. 13530 */ 13531 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) { 13532 return (ret_val); 13533 } 13534 if (ourfinisacked) { 13535 tcp_twstart(tp); 13536 m_freem(m); 13537 return (1); 13538 } 13539 if (sbavail(&so->so_snd)) { 13540 if (ctf_progress_timeout_check(tp, true)) { 13541 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 13542 tp, tick, PROGRESS_DROP, __LINE__); 13543 ctf_do_dropwithreset_conn(m, tp, th, tlen); 13544 return (1); 13545 } 13546 } 13547 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13548 tiwin, thflags, nxt_pkt)); 13549 } 13550 13551 /* 13552 * Return value of 1, the TCB is unlocked and most 13553 * likely gone, return value of 0, the TCP is still 13554 * locked. 13555 */ 13556 static int 13557 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 13558 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 13559 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 13560 { 13561 int32_t ret_val = 0; 13562 int32_t orig_tlen; 13563 int32_t ourfinisacked = 0; 13564 13565 ctf_calc_rwin(so, tp); 13566 13567 if ((thflags & TH_RST) || 13568 (tp->t_fin_is_rst && (thflags & TH_FIN))) 13569 return (ctf_process_rst(m, th, so, tp)); 13570 /* 13571 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 13572 * synchronized state. 13573 */ 13574 if (thflags & TH_SYN) { 13575 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 13576 return (ret_val); 13577 } 13578 /* 13579 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 13580 * it's less than ts_recent, drop it. 13581 */ 13582 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 13583 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 13584 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 13585 return (ret_val); 13586 } 13587 orig_tlen = tlen; 13588 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 13589 return (ret_val); 13590 } 13591 /* 13592 * If last ACK falls within this segment's sequence numbers, record 13593 * its timestamp. NOTE: 1) That the test incorporates suggestions 13594 * from the latest proposal of the tcplw@cray.com list (Braden 13595 * 1993/04/26). 2) That updating only on newer timestamps interferes 13596 * with our earlier PAWS tests, so this check should be solely 13597 * predicated on the sequence space of this segment. 3) That we 13598 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 13599 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 13600 * SEG.Len, This modified check allows us to overcome RFC1323's 13601 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 13602 * p.869. In such cases, we can still calculate the RTT correctly 13603 * when RCV.NXT == Last.ACK.Sent. 13604 */ 13605 if ((to->to_flags & TOF_TS) != 0 && 13606 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 13607 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 13608 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 13609 tp->ts_recent_age = tcp_ts_getticks(); 13610 tp->ts_recent = to->to_tsval; 13611 } 13612 /* 13613 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 13614 * is on (half-synchronized state), then queue data for later 13615 * processing; else drop segment and return. 13616 */ 13617 if ((thflags & TH_ACK) == 0) { 13618 if (tp->t_flags & TF_NEEDSYN) { 13619 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13620 tiwin, thflags, nxt_pkt)); 13621 } else if (tp->t_flags & TF_ACKNOW) { 13622 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 13623 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 13624 return (ret_val); 13625 } else { 13626 ctf_do_drop(m, NULL); 13627 return (0); 13628 } 13629 } 13630 /* 13631 * case TCPS_LAST_ACK: Ack processing. 13632 */ 13633 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) { 13634 return (ret_val); 13635 } 13636 if (ourfinisacked) { 13637 tp = tcp_close(tp); 13638 ctf_do_drop(m, tp); 13639 return (1); 13640 } 13641 if (sbavail(&so->so_snd)) { 13642 if (ctf_progress_timeout_check(tp, true)) { 13643 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 13644 tp, tick, PROGRESS_DROP, __LINE__); 13645 ctf_do_dropwithreset_conn(m, tp, th, tlen); 13646 return (1); 13647 } 13648 } 13649 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13650 tiwin, thflags, nxt_pkt)); 13651 } 13652 13653 /* 13654 * Return value of 1, the TCB is unlocked and most 13655 * likely gone, return value of 0, the TCP is still 13656 * locked. 13657 */ 13658 static int 13659 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, 13660 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 13661 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 13662 { 13663 int32_t ret_val = 0; 13664 int32_t orig_tlen = tlen; 13665 int32_t ourfinisacked = 0; 13666 13667 ctf_calc_rwin(so, tp); 13668 13669 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 13670 if ((thflags & TH_RST) || 13671 (tp->t_fin_is_rst && (thflags & TH_FIN))) 13672 return (ctf_process_rst(m, th, so, tp)); 13673 /* 13674 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 13675 * synchronized state. 13676 */ 13677 if (thflags & TH_SYN) { 13678 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 13679 return (ret_val); 13680 } 13681 /* 13682 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 13683 * it's less than ts_recent, drop it. 13684 */ 13685 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 13686 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 13687 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 13688 return (ret_val); 13689 } 13690 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 13691 return (ret_val); 13692 } 13693 /* 13694 * If new data are received on a connection after the user processes 13695 * are gone, then RST the other end. 13696 */ 13697 if ((tp->t_flags & TF_CLOSED) && tlen && 13698 rack_check_data_after_close(m, tp, &tlen, th, so)) 13699 return (1); 13700 /* 13701 * If last ACK falls within this segment's sequence numbers, record 13702 * its timestamp. NOTE: 1) That the test incorporates suggestions 13703 * from the latest proposal of the tcplw@cray.com list (Braden 13704 * 1993/04/26). 2) That updating only on newer timestamps interferes 13705 * with our earlier PAWS tests, so this check should be solely 13706 * predicated on the sequence space of this segment. 3) That we 13707 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 13708 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 13709 * SEG.Len, This modified check allows us to overcome RFC1323's 13710 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 13711 * p.869. In such cases, we can still calculate the RTT correctly 13712 * when RCV.NXT == Last.ACK.Sent. 13713 */ 13714 if ((to->to_flags & TOF_TS) != 0 && 13715 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 13716 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 13717 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 13718 tp->ts_recent_age = tcp_ts_getticks(); 13719 tp->ts_recent = to->to_tsval; 13720 } 13721 /* 13722 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 13723 * is on (half-synchronized state), then queue data for later 13724 * processing; else drop segment and return. 13725 */ 13726 if ((thflags & TH_ACK) == 0) { 13727 if (tp->t_flags & TF_NEEDSYN) { 13728 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13729 tiwin, thflags, nxt_pkt)); 13730 } else if (tp->t_flags & TF_ACKNOW) { 13731 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 13732 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 13733 return (ret_val); 13734 } else { 13735 ctf_do_drop(m, NULL); 13736 return (0); 13737 } 13738 } 13739 /* 13740 * Ack processing. 13741 */ 13742 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) { 13743 return (ret_val); 13744 } 13745 if (sbavail(&so->so_snd)) { 13746 if (ctf_progress_timeout_check(tp, true)) { 13747 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 13748 tp, tick, PROGRESS_DROP, __LINE__); 13749 ctf_do_dropwithreset_conn(m, tp, th, tlen); 13750 return (1); 13751 } 13752 } 13753 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13754 tiwin, thflags, nxt_pkt)); 13755 } 13756 13757 static void inline 13758 rack_clear_rate_sample(struct tcp_rack *rack) 13759 { 13760 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY; 13761 rack->r_ctl.rack_rs.rs_rtt_cnt = 0; 13762 rack->r_ctl.rack_rs.rs_rtt_tot = 0; 13763 } 13764 13765 static void 13766 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override) 13767 { 13768 uint64_t bw_est, rate_wanted; 13769 int chged = 0; 13770 uint32_t user_max, orig_min, orig_max; 13771 13772 #ifdef TCP_REQUEST_TRK 13773 if (rack->rc_hybrid_mode && 13774 (rack->r_ctl.rc_pace_max_segs != 0) && 13775 (rack_hybrid_allow_set_maxseg == 1) && 13776 (rack->r_ctl.rc_last_sft != NULL)) { 13777 rack->r_ctl.rc_last_sft->hybrid_flags &= ~TCP_HYBRID_PACING_SETMSS; 13778 return; 13779 } 13780 #endif 13781 orig_min = rack->r_ctl.rc_pace_min_segs; 13782 orig_max = rack->r_ctl.rc_pace_max_segs; 13783 user_max = ctf_fixed_maxseg(tp) * rack->rc_user_set_max_segs; 13784 if (ctf_fixed_maxseg(tp) != rack->r_ctl.rc_pace_min_segs) 13785 chged = 1; 13786 rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp); 13787 if (rack->use_fixed_rate || rack->rc_force_max_seg) { 13788 if (user_max != rack->r_ctl.rc_pace_max_segs) 13789 chged = 1; 13790 } 13791 if (rack->rc_force_max_seg) { 13792 rack->r_ctl.rc_pace_max_segs = user_max; 13793 } else if (rack->use_fixed_rate) { 13794 bw_est = rack_get_bw(rack); 13795 if ((rack->r_ctl.crte == NULL) || 13796 (bw_est != rack->r_ctl.crte->rate)) { 13797 rack->r_ctl.rc_pace_max_segs = user_max; 13798 } else { 13799 /* We are pacing right at the hardware rate */ 13800 uint32_t segsiz, pace_one; 13801 13802 if (rack_pace_one_seg || 13803 (rack->r_ctl.rc_user_set_min_segs == 1)) 13804 pace_one = 1; 13805 else 13806 pace_one = 0; 13807 segsiz = min(ctf_fixed_maxseg(tp), 13808 rack->r_ctl.rc_pace_min_segs); 13809 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size_w_divisor( 13810 tp, bw_est, segsiz, pace_one, 13811 rack->r_ctl.crte, NULL, rack->r_ctl.pace_len_divisor); 13812 } 13813 } else if (rack->rc_always_pace) { 13814 if (rack->r_ctl.gp_bw || 13815 rack->r_ctl.init_rate) { 13816 /* We have a rate of some sort set */ 13817 uint32_t orig; 13818 13819 bw_est = rack_get_bw(rack); 13820 orig = rack->r_ctl.rc_pace_max_segs; 13821 if (fill_override) 13822 rate_wanted = *fill_override; 13823 else 13824 rate_wanted = rack_get_gp_est(rack); 13825 if (rate_wanted) { 13826 /* We have something */ 13827 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, 13828 rate_wanted, 13829 ctf_fixed_maxseg(rack->rc_tp)); 13830 } else 13831 rack->r_ctl.rc_pace_max_segs = rack->r_ctl.rc_pace_min_segs; 13832 if (orig != rack->r_ctl.rc_pace_max_segs) 13833 chged = 1; 13834 } else if ((rack->r_ctl.gp_bw == 0) && 13835 (rack->r_ctl.rc_pace_max_segs == 0)) { 13836 /* 13837 * If we have nothing limit us to bursting 13838 * out IW sized pieces. 13839 */ 13840 chged = 1; 13841 rack->r_ctl.rc_pace_max_segs = rc_init_window(rack); 13842 } 13843 } 13844 if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) { 13845 chged = 1; 13846 rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES; 13847 } 13848 if (chged) 13849 rack_log_type_pacing_sizes(tp, rack, orig_min, orig_max, line, 2); 13850 } 13851 13852 13853 static void 13854 rack_init_fsb_block(struct tcpcb *tp, struct tcp_rack *rack, int32_t flags) 13855 { 13856 #ifdef INET6 13857 struct ip6_hdr *ip6 = NULL; 13858 #endif 13859 #ifdef INET 13860 struct ip *ip = NULL; 13861 #endif 13862 struct udphdr *udp = NULL; 13863 13864 /* Ok lets fill in the fast block, it can only be used with no IP options! */ 13865 #ifdef INET6 13866 if (rack->r_is_v6) { 13867 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 13868 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 13869 if (tp->t_port) { 13870 rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr); 13871 udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr)); 13872 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 13873 udp->uh_dport = tp->t_port; 13874 rack->r_ctl.fsb.udp = udp; 13875 rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1); 13876 } else 13877 { 13878 rack->r_ctl.fsb.th = (struct tcphdr *)(ip6 + 1); 13879 rack->r_ctl.fsb.udp = NULL; 13880 } 13881 tcpip_fillheaders(rack->rc_inp, 13882 tp->t_port, 13883 ip6, rack->r_ctl.fsb.th); 13884 rack->r_ctl.fsb.hoplimit = in6_selecthlim(rack->rc_inp, NULL); 13885 } else 13886 #endif /* INET6 */ 13887 #ifdef INET 13888 { 13889 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr); 13890 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 13891 if (tp->t_port) { 13892 rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr); 13893 udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip)); 13894 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 13895 udp->uh_dport = tp->t_port; 13896 rack->r_ctl.fsb.udp = udp; 13897 rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1); 13898 } else 13899 { 13900 rack->r_ctl.fsb.udp = NULL; 13901 rack->r_ctl.fsb.th = (struct tcphdr *)(ip + 1); 13902 } 13903 tcpip_fillheaders(rack->rc_inp, 13904 tp->t_port, 13905 ip, rack->r_ctl.fsb.th); 13906 rack->r_ctl.fsb.hoplimit = tptoinpcb(tp)->inp_ip_ttl; 13907 } 13908 #endif 13909 rack->r_ctl.fsb.recwin = lmin(lmax(sbspace(&tptosocket(tp)->so_rcv), 0), 13910 (long)TCP_MAXWIN << tp->rcv_scale); 13911 rack->r_fsb_inited = 1; 13912 } 13913 13914 static int 13915 rack_init_fsb(struct tcpcb *tp, struct tcp_rack *rack) 13916 { 13917 /* 13918 * Allocate the larger of spaces V6 if available else just 13919 * V4 and include udphdr (overbook) 13920 */ 13921 #ifdef INET6 13922 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + sizeof(struct udphdr); 13923 #else 13924 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr) + sizeof(struct udphdr); 13925 #endif 13926 rack->r_ctl.fsb.tcp_ip_hdr = malloc(rack->r_ctl.fsb.tcp_ip_hdr_len, 13927 M_TCPFSB, M_NOWAIT|M_ZERO); 13928 if (rack->r_ctl.fsb.tcp_ip_hdr == NULL) { 13929 return (ENOMEM); 13930 } 13931 rack->r_fsb_inited = 0; 13932 return (0); 13933 } 13934 13935 static void 13936 rack_log_hystart_event(struct tcp_rack *rack, uint32_t high_seq, uint8_t mod) 13937 { 13938 /* 13939 * Types of logs (mod value) 13940 * 20 - Initial round setup 13941 * 21 - Rack declares a new round. 13942 */ 13943 struct tcpcb *tp; 13944 13945 tp = rack->rc_tp; 13946 if (tcp_bblogging_on(tp)) { 13947 union tcp_log_stackspecific log; 13948 struct timeval tv; 13949 13950 memset(&log, 0, sizeof(log)); 13951 log.u_bbr.flex1 = rack->r_ctl.current_round; 13952 log.u_bbr.flex2 = rack->r_ctl.roundends; 13953 log.u_bbr.flex3 = high_seq; 13954 log.u_bbr.flex4 = tp->snd_max; 13955 log.u_bbr.flex8 = mod; 13956 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 13957 log.u_bbr.cur_del_rate = rack->rc_tp->t_sndbytes; 13958 log.u_bbr.delRate = rack->rc_tp->t_snd_rxt_bytes; 13959 TCP_LOG_EVENTP(tp, NULL, 13960 &tptosocket(tp)->so_rcv, 13961 &tptosocket(tp)->so_snd, 13962 TCP_HYSTART, 0, 13963 0, &log, false, &tv); 13964 } 13965 } 13966 13967 static void 13968 rack_deferred_init(struct tcpcb *tp, struct tcp_rack *rack) 13969 { 13970 rack->rack_deferred_inited = 1; 13971 rack->r_ctl.roundends = tp->snd_max; 13972 rack->r_ctl.rc_high_rwnd = tp->snd_wnd; 13973 rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 13974 } 13975 13976 static void 13977 rack_init_retransmit_value(struct tcp_rack *rack, int ctl) 13978 { 13979 /* Retransmit bit controls. 13980 * 13981 * The setting of these values control one of 13982 * three settings you can have and dictate 13983 * how rack does retransmissions. Note this 13984 * is in *any* mode i.e. pacing on or off DGP 13985 * fixed rate pacing, or just bursting rack. 13986 * 13987 * 1 - Use full sized retransmits i.e. limit 13988 * the size to whatever the pace_max_segments 13989 * size is. 13990 * 13991 * 2 - Use pacer min granularity as a guide to 13992 * the size combined with the current calculated 13993 * goodput b/w measurement. So for example if 13994 * the goodput is measured at 20Mbps we would 13995 * calculate 8125 (pacer minimum 250usec in 13996 * that b/w) and then round it up to the next 13997 * MSS i.e. for 1448 mss 6 MSS or 8688 bytes. 13998 * 13999 * 0 - The rack default 1 MSS (anything not 0/1/2 14000 * fall here too if we are setting via rack_init()). 14001 * 14002 */ 14003 if (ctl == 1) { 14004 rack->full_size_rxt = 1; 14005 rack->shape_rxt_to_pacing_min = 0; 14006 } else if (ctl == 2) { 14007 rack->full_size_rxt = 0; 14008 rack->shape_rxt_to_pacing_min = 1; 14009 } else { 14010 rack->full_size_rxt = 0; 14011 rack->shape_rxt_to_pacing_min = 0; 14012 } 14013 } 14014 14015 static void 14016 rack_log_chg_info(struct tcpcb *tp, struct tcp_rack *rack, uint8_t mod, 14017 uint32_t flex1, 14018 uint32_t flex2, 14019 uint32_t flex3) 14020 { 14021 if (tcp_bblogging_on(rack->rc_tp)) { 14022 union tcp_log_stackspecific log; 14023 struct timeval tv; 14024 14025 memset(&log, 0, sizeof(log)); 14026 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 14027 log.u_bbr.flex8 = mod; 14028 log.u_bbr.flex1 = flex1; 14029 log.u_bbr.flex2 = flex2; 14030 log.u_bbr.flex3 = flex3; 14031 tcp_log_event(tp, NULL, NULL, NULL, TCP_CHG_QUERY, 0, 14032 0, &log, false, NULL, __func__, __LINE__, &tv); 14033 } 14034 } 14035 14036 static int 14037 rack_chg_query(struct tcpcb *tp, struct tcp_query_resp *reqr) 14038 { 14039 struct tcp_rack *rack; 14040 struct rack_sendmap *rsm; 14041 int i; 14042 14043 14044 rack = (struct tcp_rack *)tp->t_fb_ptr; 14045 switch (reqr->req) { 14046 case TCP_QUERY_SENDMAP: 14047 if ((reqr->req_param == tp->snd_max) || 14048 (tp->snd_max == tp->snd_una)){ 14049 /* Unlikely */ 14050 return (0); 14051 } 14052 rsm = tqhash_find(rack->r_ctl.tqh, reqr->req_param); 14053 if (rsm == NULL) { 14054 /* Can't find that seq -- unlikely */ 14055 return (0); 14056 } 14057 reqr->sendmap_start = rsm->r_start; 14058 reqr->sendmap_end = rsm->r_end; 14059 reqr->sendmap_send_cnt = rsm->r_rtr_cnt; 14060 reqr->sendmap_fas = rsm->r_fas; 14061 if (reqr->sendmap_send_cnt > SNDMAP_NRTX) 14062 reqr->sendmap_send_cnt = SNDMAP_NRTX; 14063 for(i=0; i<reqr->sendmap_send_cnt; i++) 14064 reqr->sendmap_time[i] = rsm->r_tim_lastsent[i]; 14065 reqr->sendmap_ack_arrival = rsm->r_ack_arrival; 14066 reqr->sendmap_flags = rsm->r_flags & SNDMAP_MASK; 14067 reqr->sendmap_r_rtr_bytes = rsm->r_rtr_bytes; 14068 reqr->sendmap_dupacks = rsm->r_dupack; 14069 rack_log_chg_info(tp, rack, 1, 14070 rsm->r_start, 14071 rsm->r_end, 14072 rsm->r_flags); 14073 return(1); 14074 break; 14075 case TCP_QUERY_TIMERS_UP: 14076 if (rack->r_ctl.rc_hpts_flags == 0) { 14077 /* no timers up */ 14078 return (0); 14079 } 14080 reqr->timer_hpts_flags = rack->r_ctl.rc_hpts_flags; 14081 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 14082 reqr->timer_pacing_to = rack->r_ctl.rc_last_output_to; 14083 } 14084 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 14085 reqr->timer_timer_exp = rack->r_ctl.rc_timer_exp; 14086 } 14087 rack_log_chg_info(tp, rack, 2, 14088 rack->r_ctl.rc_hpts_flags, 14089 rack->r_ctl.rc_last_output_to, 14090 rack->r_ctl.rc_timer_exp); 14091 return (1); 14092 break; 14093 case TCP_QUERY_RACK_TIMES: 14094 /* Reordering items */ 14095 reqr->rack_num_dsacks = rack->r_ctl.num_dsack; 14096 reqr->rack_reorder_ts = rack->r_ctl.rc_reorder_ts; 14097 /* Timerstamps and timers */ 14098 reqr->rack_rxt_last_time = rack->r_ctl.rc_tlp_rxt_last_time; 14099 reqr->rack_min_rtt = rack->r_ctl.rc_rack_min_rtt; 14100 reqr->rack_rtt = rack->rc_rack_rtt; 14101 reqr->rack_tmit_time = rack->r_ctl.rc_rack_tmit_time; 14102 reqr->rack_srtt_measured = rack->rc_srtt_measure_made; 14103 /* PRR data */ 14104 reqr->rack_sacked = rack->r_ctl.rc_sacked; 14105 reqr->rack_holes_rxt = rack->r_ctl.rc_holes_rxt; 14106 reqr->rack_prr_delivered = rack->r_ctl.rc_prr_delivered; 14107 reqr->rack_prr_recovery_fs = rack->r_ctl.rc_prr_recovery_fs; 14108 reqr->rack_prr_sndcnt = rack->r_ctl.rc_prr_sndcnt; 14109 reqr->rack_prr_out = rack->r_ctl.rc_prr_out; 14110 /* TLP and persists info */ 14111 reqr->rack_tlp_out = rack->rc_tlp_in_progress; 14112 reqr->rack_tlp_cnt_out = rack->r_ctl.rc_tlp_cnt_out; 14113 if (rack->rc_in_persist) { 14114 reqr->rack_time_went_idle = rack->r_ctl.rc_went_idle_time; 14115 reqr->rack_in_persist = 1; 14116 } else { 14117 reqr->rack_time_went_idle = 0; 14118 reqr->rack_in_persist = 0; 14119 } 14120 if (rack->r_wanted_output) 14121 reqr->rack_wanted_output = 1; 14122 else 14123 reqr->rack_wanted_output = 0; 14124 return (1); 14125 break; 14126 default: 14127 return (-EINVAL); 14128 } 14129 } 14130 14131 static void 14132 rack_switch_failed(struct tcpcb *tp) 14133 { 14134 /* 14135 * This method gets called if a stack switch was 14136 * attempted and it failed. We are left 14137 * but our hpts timers were stopped and we 14138 * need to validate time units and t_flags2. 14139 */ 14140 struct tcp_rack *rack; 14141 struct timeval tv; 14142 uint32_t cts; 14143 uint32_t toval; 14144 struct hpts_diag diag; 14145 14146 rack = (struct tcp_rack *)tp->t_fb_ptr; 14147 tcp_change_time_units(tp, TCP_TMR_GRANULARITY_USEC); 14148 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 14149 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 14150 else 14151 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ; 14152 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) 14153 tp->t_flags2 |= TF2_MBUF_ACKCMP; 14154 if (tp->t_in_hpts > IHPTS_NONE) { 14155 /* Strange */ 14156 return; 14157 } 14158 cts = tcp_get_usecs(&tv); 14159 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 14160 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) { 14161 toval = rack->r_ctl.rc_last_output_to - cts; 14162 } else { 14163 /* one slot please */ 14164 toval = HPTS_USECS_PER_SLOT; 14165 } 14166 } else if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 14167 if (TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { 14168 toval = rack->r_ctl.rc_timer_exp - cts; 14169 } else { 14170 /* one slot please */ 14171 toval = HPTS_USECS_PER_SLOT; 14172 } 14173 } else 14174 toval = HPTS_USECS_PER_SLOT; 14175 tcp_hpts_insert(tp, toval, &diag); 14176 rack_log_hpts_diag(rack, cts, &diag, &tv); 14177 } 14178 14179 static int 14180 rack_init_outstanding(struct tcpcb *tp, struct tcp_rack *rack, uint32_t us_cts, void *ptr) 14181 { 14182 struct rack_sendmap *rsm, *ersm; 14183 int insret __diagused; 14184 /* 14185 * When initing outstanding, we must be quite careful 14186 * to not refer to tp->t_fb_ptr. This has the old rack 14187 * pointer in it, not the "new" one (when we are doing 14188 * a stack switch). 14189 */ 14190 14191 14192 if (tp->t_fb->tfb_chg_query == NULL) { 14193 /* Create a send map for the current outstanding data */ 14194 14195 rsm = rack_alloc(rack); 14196 if (rsm == NULL) { 14197 uma_zfree(rack_pcb_zone, ptr); 14198 return (ENOMEM); 14199 } 14200 rsm->r_no_rtt_allowed = 1; 14201 rsm->r_tim_lastsent[0] = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 14202 rsm->r_rtr_cnt = 1; 14203 rsm->r_rtr_bytes = 0; 14204 if (tp->t_flags & TF_SENTFIN) 14205 rsm->r_flags |= RACK_HAS_FIN; 14206 rsm->r_end = tp->snd_max; 14207 if (tp->snd_una == tp->iss) { 14208 /* The data space is one beyond snd_una */ 14209 rsm->r_flags |= RACK_HAS_SYN; 14210 rsm->r_start = tp->iss; 14211 rsm->r_end = rsm->r_start + (tp->snd_max - tp->snd_una); 14212 } else 14213 rsm->r_start = tp->snd_una; 14214 rsm->r_dupack = 0; 14215 if (rack->rc_inp->inp_socket->so_snd.sb_mb != NULL) { 14216 rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 0, &rsm->soff); 14217 if (rsm->m) { 14218 rsm->orig_m_len = rsm->m->m_len; 14219 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 14220 } else { 14221 rsm->orig_m_len = 0; 14222 rsm->orig_t_space = 0; 14223 } 14224 } else { 14225 /* 14226 * This can happen if we have a stand-alone FIN or 14227 * SYN. 14228 */ 14229 rsm->m = NULL; 14230 rsm->orig_m_len = 0; 14231 rsm->orig_t_space = 0; 14232 rsm->soff = 0; 14233 } 14234 #ifdef INVARIANTS 14235 if ((insret = tqhash_insert(rack->r_ctl.tqh, rsm)) != 0) { 14236 panic("Insert in tailq_hash fails ret:%d rack:%p rsm:%p", 14237 insret, rack, rsm); 14238 } 14239 #else 14240 (void)tqhash_insert(rack->r_ctl.tqh, rsm); 14241 #endif 14242 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 14243 rsm->r_in_tmap = 1; 14244 } else { 14245 /* We have a query mechanism, lets use it */ 14246 struct tcp_query_resp qr; 14247 int i; 14248 tcp_seq at; 14249 14250 at = tp->snd_una; 14251 while (at != tp->snd_max) { 14252 memset(&qr, 0, sizeof(qr)); 14253 qr.req = TCP_QUERY_SENDMAP; 14254 qr.req_param = at; 14255 if ((*tp->t_fb->tfb_chg_query)(tp, &qr) == 0) 14256 break; 14257 /* Move forward */ 14258 at = qr.sendmap_end; 14259 /* Now lets build the entry for this one */ 14260 rsm = rack_alloc(rack); 14261 if (rsm == NULL) { 14262 uma_zfree(rack_pcb_zone, ptr); 14263 return (ENOMEM); 14264 } 14265 memset(rsm, 0, sizeof(struct rack_sendmap)); 14266 /* Now configure the rsm and insert it */ 14267 rsm->r_dupack = qr.sendmap_dupacks; 14268 rsm->r_start = qr.sendmap_start; 14269 rsm->r_end = qr.sendmap_end; 14270 if (qr.sendmap_fas) 14271 rsm->r_fas = qr.sendmap_end; 14272 else 14273 rsm->r_fas = rsm->r_start - tp->snd_una; 14274 /* 14275 * We have carefully aligned the bits 14276 * so that all we have to do is copy over 14277 * the bits with the mask. 14278 */ 14279 rsm->r_flags = qr.sendmap_flags & SNDMAP_MASK; 14280 rsm->r_rtr_bytes = qr.sendmap_r_rtr_bytes; 14281 rsm->r_rtr_cnt = qr.sendmap_send_cnt; 14282 rsm->r_ack_arrival = qr.sendmap_ack_arrival; 14283 for (i=0 ; i<rsm->r_rtr_cnt; i++) 14284 rsm->r_tim_lastsent[i] = qr.sendmap_time[i]; 14285 rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 14286 (rsm->r_start - tp->snd_una), &rsm->soff); 14287 if (rsm->m) { 14288 rsm->orig_m_len = rsm->m->m_len; 14289 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 14290 } else { 14291 rsm->orig_m_len = 0; 14292 rsm->orig_t_space = 0; 14293 } 14294 #ifdef INVARIANTS 14295 if ((insret = tqhash_insert(rack->r_ctl.tqh, rsm)) != 0) { 14296 panic("Insert in tailq_hash fails ret:%d rack:%p rsm:%p", 14297 insret, rack, rsm); 14298 } 14299 #else 14300 (void)tqhash_insert(rack->r_ctl.tqh, rsm); 14301 #endif 14302 if ((rsm->r_flags & RACK_ACKED) == 0) { 14303 TAILQ_FOREACH(ersm, &rack->r_ctl.rc_tmap, r_tnext) { 14304 if (ersm->r_tim_lastsent[(ersm->r_rtr_cnt-1)] > 14305 rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]) { 14306 /* 14307 * If the existing ersm was sent at 14308 * a later time than the new one, then 14309 * the new one should appear ahead of this 14310 * ersm. 14311 */ 14312 rsm->r_in_tmap = 1; 14313 TAILQ_INSERT_BEFORE(ersm, rsm, r_tnext); 14314 break; 14315 } 14316 } 14317 if (rsm->r_in_tmap == 0) { 14318 /* 14319 * Not found so shove it on the tail. 14320 */ 14321 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 14322 rsm->r_in_tmap = 1; 14323 } 14324 } else { 14325 if ((rack->r_ctl.rc_sacklast == NULL) || 14326 (SEQ_GT(rsm->r_end, rack->r_ctl.rc_sacklast->r_end))) { 14327 rack->r_ctl.rc_sacklast = rsm; 14328 } 14329 } 14330 rack_log_chg_info(tp, rack, 3, 14331 rsm->r_start, 14332 rsm->r_end, 14333 rsm->r_flags); 14334 } 14335 } 14336 return (0); 14337 } 14338 14339 14340 static int32_t 14341 rack_init(struct tcpcb *tp, void **ptr) 14342 { 14343 struct inpcb *inp = tptoinpcb(tp); 14344 struct tcp_rack *rack = NULL; 14345 uint32_t iwin, snt, us_cts; 14346 size_t sz; 14347 int err, no_query; 14348 14349 tcp_hpts_init(tp); 14350 14351 /* 14352 * First are we the initial or are we a switched stack? 14353 * If we are initing via tcp_newtcppcb the ptr passed 14354 * will be tp->t_fb_ptr. If its a stack switch that 14355 * has a previous stack we can query it will be a local 14356 * var that will in the end be set into t_fb_ptr. 14357 */ 14358 if (ptr == &tp->t_fb_ptr) 14359 no_query = 1; 14360 else 14361 no_query = 0; 14362 *ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT); 14363 if (*ptr == NULL) { 14364 /* 14365 * We need to allocate memory but cant. The INP and INP_INFO 14366 * locks and they are recursive (happens during setup. So a 14367 * scheme to drop the locks fails :( 14368 * 14369 */ 14370 return(ENOMEM); 14371 } 14372 memset(*ptr, 0, sizeof(struct tcp_rack)); 14373 rack = (struct tcp_rack *)*ptr; 14374 rack->r_ctl.tqh = malloc(sizeof(struct tailq_hash), M_TCPFSB, M_NOWAIT); 14375 if (rack->r_ctl.tqh == NULL) { 14376 uma_zfree(rack_pcb_zone, rack); 14377 return(ENOMEM); 14378 } 14379 tqhash_init(rack->r_ctl.tqh); 14380 TAILQ_INIT(&rack->r_ctl.rc_free); 14381 TAILQ_INIT(&rack->r_ctl.rc_tmap); 14382 rack->rc_tp = tp; 14383 rack->rc_inp = inp; 14384 /* Set the flag */ 14385 rack->r_is_v6 = (inp->inp_vflag & INP_IPV6) != 0; 14386 /* Probably not needed but lets be sure */ 14387 rack_clear_rate_sample(rack); 14388 /* 14389 * Save off the default values, socket options will poke 14390 * at these if pacing is not on or we have not yet 14391 * reached where pacing is on (gp_ready/fixed enabled). 14392 * When they get set into the CC module (when gp_ready 14393 * is enabled or we enable fixed) then we will set these 14394 * values into the CC and place in here the old values 14395 * so we have a restoral. Then we will set the flag 14396 * rc_pacing_cc_set. That way whenever we turn off pacing 14397 * or switch off this stack, we will know to go restore 14398 * the saved values. 14399 * 14400 * We specifically put into the beta the ecn value for pacing. 14401 */ 14402 rack->rc_new_rnd_needed = 1; 14403 rack->r_ctl.rc_split_limit = V_tcp_map_split_limit; 14404 /* We want abe like behavior as well */ 14405 14406 rack->r_ctl.rc_reorder_fade = rack_reorder_fade; 14407 rack->rc_allow_data_af_clo = rack_ignore_data_after_close; 14408 rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh; 14409 if (rack_fill_cw_state) 14410 rack->rc_pace_to_cwnd = 1; 14411 if (rack_pacing_min_seg) 14412 rack->r_ctl.rc_user_set_min_segs = rack_pacing_min_seg; 14413 if (use_rack_rr) 14414 rack->use_rack_rr = 1; 14415 if (rack_dnd_default) { 14416 rack->rc_pace_dnd = 1; 14417 } 14418 if (V_tcp_delack_enabled) 14419 tp->t_delayed_ack = 1; 14420 else 14421 tp->t_delayed_ack = 0; 14422 #ifdef TCP_ACCOUNTING 14423 if (rack_tcp_accounting) { 14424 tp->t_flags2 |= TF2_TCP_ACCOUNTING; 14425 } 14426 #endif 14427 rack->r_ctl.pcm_i.cnt_alloc = RACK_DEFAULT_PCM_ARRAY; 14428 sz = (sizeof(struct rack_pcm_stats) * rack->r_ctl.pcm_i.cnt_alloc); 14429 rack->r_ctl.pcm_s = malloc(sz,M_TCPPCM, M_NOWAIT); 14430 if (rack->r_ctl.pcm_s == NULL) { 14431 rack->r_ctl.pcm_i.cnt_alloc = 0; 14432 } 14433 rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_per_upper_bound_ss; 14434 rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_per_upper_bound_ca; 14435 if (rack_enable_shared_cwnd) 14436 rack->rack_enable_scwnd = 1; 14437 rack->r_ctl.pace_len_divisor = rack_default_pacing_divisor; 14438 rack->rc_user_set_max_segs = rack_hptsi_segments; 14439 rack->r_ctl.max_reduction = rack_max_reduce; 14440 rack->rc_force_max_seg = 0; 14441 TAILQ_INIT(&rack->r_ctl.opt_list); 14442 rack->r_ctl.rc_saved_beta = V_newreno_beta_ecn; 14443 rack->r_ctl.rc_saved_beta_ecn = V_newreno_beta_ecn; 14444 if (rack_hibeta_setting) { 14445 rack->rack_hibeta = 1; 14446 if ((rack_hibeta_setting >= 50) && 14447 (rack_hibeta_setting <= 100)) { 14448 rack->r_ctl.rc_saved_beta = rack_hibeta_setting; 14449 rack->r_ctl.saved_hibeta = rack_hibeta_setting; 14450 } 14451 } else { 14452 rack->r_ctl.saved_hibeta = 50; 14453 } 14454 /* 14455 * We initialize to all ones so we never match 0 14456 * just in case the client sends in 0, it hopefully 14457 * will never have all 1's in ms :-) 14458 */ 14459 rack->r_ctl.last_tm_mark = 0xffffffffffffffff; 14460 rack->r_ctl.rc_reorder_shift = rack_reorder_thresh; 14461 rack->r_ctl.rc_pkt_delay = rack_pkt_delay; 14462 rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp; 14463 rack->r_ctl.rc_lowest_us_rtt = 0xffffffff; 14464 rack->r_ctl.rc_highest_us_rtt = 0; 14465 rack->r_ctl.bw_rate_cap = rack_bw_rate_cap; 14466 rack->pcm_enabled = rack_pcm_is_enabled; 14467 if (rack_fillcw_bw_cap) 14468 rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap; 14469 rack->r_ctl.timer_slop = TICKS_2_USEC(tcp_rexmit_slop); 14470 if (rack_use_cmp_acks) 14471 rack->r_use_cmp_ack = 1; 14472 if (rack_disable_prr) 14473 rack->rack_no_prr = 1; 14474 if (rack_gp_no_rec_chg) 14475 rack->rc_gp_no_rec_chg = 1; 14476 if (rack_pace_every_seg && tcp_can_enable_pacing()) { 14477 rack->r_ctl.pacing_method |= RACK_REG_PACING; 14478 rack->rc_always_pace = 1; 14479 if (rack->rack_hibeta) 14480 rack_set_cc_pacing(rack); 14481 } else 14482 rack->rc_always_pace = 0; 14483 if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack) 14484 rack->r_mbuf_queue = 1; 14485 else 14486 rack->r_mbuf_queue = 0; 14487 rack_set_pace_segments(tp, rack, __LINE__, NULL); 14488 if (rack_limits_scwnd) 14489 rack->r_limit_scw = 1; 14490 else 14491 rack->r_limit_scw = 0; 14492 rack_init_retransmit_value(rack, rack_rxt_controls); 14493 rack->rc_labc = V_tcp_abc_l_var; 14494 if (rack_honors_hpts_min_to) 14495 rack->r_use_hpts_min = 1; 14496 if (tp->snd_una != 0) { 14497 rack->rc_sendvars_notset = 0; 14498 /* 14499 * Make sure any TCP timers are not running. 14500 */ 14501 tcp_timer_stop(tp); 14502 } else { 14503 /* 14504 * Server side, we are called from the 14505 * syn-cache. This means none of the 14506 * snd_una/max are set yet so we have 14507 * to defer this until the first send. 14508 */ 14509 rack->rc_sendvars_notset = 1; 14510 } 14511 14512 rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method; 14513 rack->rack_tlp_threshold_use = rack_tlp_threshold_use; 14514 rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; 14515 rack->r_ctl.rc_min_to = rack_min_to; 14516 microuptime(&rack->r_ctl.act_rcv_time); 14517 rack->r_ctl.rack_per_of_gp_ss = rack_per_of_gp_ss; 14518 if (rack_hw_up_only) 14519 rack->r_up_only = 1; 14520 if (rack_do_dyn_mul) { 14521 /* When dynamic adjustment is on CA needs to start at 100% */ 14522 rack->rc_gp_dyn_mul = 1; 14523 if (rack_do_dyn_mul >= 100) 14524 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul; 14525 } else 14526 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca; 14527 rack->r_ctl.rack_per_of_gp_rec = rack_per_of_gp_rec; 14528 if (rack_timely_off) { 14529 rack->rc_skip_timely = 1; 14530 } 14531 if (rack->rc_skip_timely) { 14532 rack->r_ctl.rack_per_of_gp_rec = 90; 14533 rack->r_ctl.rack_per_of_gp_ca = 100; 14534 rack->r_ctl.rack_per_of_gp_ss = 250; 14535 } 14536 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 14537 rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_msec(&rack->r_ctl.act_rcv_time); 14538 rack->r_ctl.last_rcv_tstmp_for_rtt = tcp_tv_to_msec(&rack->r_ctl.act_rcv_time); 14539 14540 setup_time_filter_small(&rack->r_ctl.rc_gp_min_rtt, FILTER_TYPE_MIN, 14541 rack_probertt_filter_life); 14542 us_cts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time); 14543 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 14544 rack->r_ctl.rc_time_of_last_probertt = us_cts; 14545 rack->r_ctl.rc_went_idle_time = us_cts; 14546 rack->r_ctl.rc_time_probertt_starts = 0; 14547 14548 rack->r_ctl.gp_rnd_thresh = rack_rnd_cnt_req & 0xff; 14549 if (rack_rnd_cnt_req & 0x10000) 14550 rack->r_ctl.gate_to_fs = 1; 14551 rack->r_ctl.gp_gain_req = rack_gp_gain_req; 14552 if ((rack_rnd_cnt_req & 0x100) > 0) { 14553 14554 } 14555 if (rack_dsack_std_based & 0x1) { 14556 /* Basically this means all rack timers are at least (srtt + 1/4 srtt) */ 14557 rack->rc_rack_tmr_std_based = 1; 14558 } 14559 if (rack_dsack_std_based & 0x2) { 14560 /* Basically this means rack timers are extended based on dsack by up to (2 * srtt) */ 14561 rack->rc_rack_use_dsack = 1; 14562 } 14563 /* We require at least one measurement, even if the sysctl is 0 */ 14564 if (rack_req_measurements) 14565 rack->r_ctl.req_measurements = rack_req_measurements; 14566 else 14567 rack->r_ctl.req_measurements = 1; 14568 if (rack_enable_hw_pacing) 14569 rack->rack_hdw_pace_ena = 1; 14570 if (rack_hw_rate_caps) 14571 rack->r_rack_hw_rate_caps = 1; 14572 if (rack_non_rxt_use_cr) 14573 rack->rack_rec_nonrxt_use_cr = 1; 14574 /* Lets setup the fsb block */ 14575 err = rack_init_fsb(tp, rack); 14576 if (err) { 14577 uma_zfree(rack_pcb_zone, *ptr); 14578 *ptr = NULL; 14579 return (err); 14580 } 14581 if (rack_do_hystart) { 14582 tp->t_ccv.flags |= CCF_HYSTART_ALLOWED; 14583 if (rack_do_hystart > 1) 14584 tp->t_ccv.flags |= CCF_HYSTART_CAN_SH_CWND; 14585 if (rack_do_hystart > 2) 14586 tp->t_ccv.flags |= CCF_HYSTART_CONS_SSTH; 14587 } 14588 /* Log what we will do with queries */ 14589 rack_log_chg_info(tp, rack, 7, 14590 no_query, 0, 0); 14591 if (rack_def_profile) 14592 rack_set_profile(rack, rack_def_profile); 14593 /* Cancel the GP measurement in progress */ 14594 tp->t_flags &= ~TF_GPUTINPROG; 14595 if ((tp->t_state != TCPS_CLOSED) && 14596 (tp->t_state != TCPS_TIME_WAIT)) { 14597 /* 14598 * We are already open, we may 14599 * need to adjust a few things. 14600 */ 14601 if (SEQ_GT(tp->snd_max, tp->iss)) 14602 snt = tp->snd_max - tp->iss; 14603 else 14604 snt = 0; 14605 iwin = rc_init_window(rack); 14606 if ((snt < iwin) && 14607 (no_query == 1)) { 14608 /* We are not past the initial window 14609 * on the first init (i.e. a stack switch 14610 * has not yet occured) so we need to make 14611 * sure cwnd and ssthresh is correct. 14612 */ 14613 if (tp->snd_cwnd < iwin) 14614 tp->snd_cwnd = iwin; 14615 /* 14616 * If we are within the initial window 14617 * we want ssthresh to be unlimited. Setting 14618 * it to the rwnd (which the default stack does 14619 * and older racks) is not really a good idea 14620 * since we want to be in SS and grow both the 14621 * cwnd and the rwnd (via dynamic rwnd growth). If 14622 * we set it to the rwnd then as the peer grows its 14623 * rwnd we will be stuck in CA and never hit SS. 14624 * 14625 * Its far better to raise it up high (this takes the 14626 * risk that there as been a loss already, probably 14627 * we should have an indicator in all stacks of loss 14628 * but we don't), but considering the normal use this 14629 * is a risk worth taking. The consequences of not 14630 * hitting SS are far worse than going one more time 14631 * into it early on (before we have sent even a IW). 14632 * It is highly unlikely that we will have had a loss 14633 * before getting the IW out. 14634 */ 14635 tp->snd_ssthresh = 0xffffffff; 14636 } 14637 /* 14638 * Any init based on sequence numbers 14639 * should be done in the deferred init path 14640 * since we can be CLOSED and not have them 14641 * inited when rack_init() is called. We 14642 * are not closed so lets call it. 14643 */ 14644 rack_deferred_init(tp, rack); 14645 } 14646 if ((tp->t_state != TCPS_CLOSED) && 14647 (tp->t_state != TCPS_TIME_WAIT) && 14648 (no_query == 0) && 14649 (tp->snd_una != tp->snd_max)) { 14650 err = rack_init_outstanding(tp, rack, us_cts, *ptr); 14651 if (err) { 14652 *ptr = NULL; 14653 return(err); 14654 } 14655 } 14656 rack_stop_all_timers(tp, rack); 14657 /* Setup all the t_flags2 */ 14658 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 14659 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 14660 else 14661 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ; 14662 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) 14663 tp->t_flags2 |= TF2_MBUF_ACKCMP; 14664 /* 14665 * Timers in Rack are kept in microseconds so lets 14666 * convert any initial incoming variables 14667 * from ticks into usecs. Note that we 14668 * also change the values of t_srtt and t_rttvar, if 14669 * they are non-zero. They are kept with a 5 14670 * bit decimal so we have to carefully convert 14671 * these to get the full precision. 14672 */ 14673 rack_convert_rtts(tp); 14674 rack_log_hystart_event(rack, rack->r_ctl.roundends, 20); 14675 if ((tptoinpcb(tp)->inp_flags & INP_DROPPED) == 0) { 14676 /* We do not start any timers on DROPPED connections */ 14677 if (tp->t_fb->tfb_chg_query == NULL) { 14678 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); 14679 } else { 14680 struct tcp_query_resp qr; 14681 int ret; 14682 14683 memset(&qr, 0, sizeof(qr)); 14684 14685 /* Get the misc time stamps and such for rack */ 14686 qr.req = TCP_QUERY_RACK_TIMES; 14687 ret = (*tp->t_fb->tfb_chg_query)(tp, &qr); 14688 if (ret == 1) { 14689 rack->r_ctl.rc_reorder_ts = qr.rack_reorder_ts; 14690 rack->r_ctl.num_dsack = qr.rack_num_dsacks; 14691 rack->r_ctl.rc_tlp_rxt_last_time = qr.rack_rxt_last_time; 14692 rack->r_ctl.rc_rack_min_rtt = qr.rack_min_rtt; 14693 rack->rc_rack_rtt = qr.rack_rtt; 14694 rack->r_ctl.rc_rack_tmit_time = qr.rack_tmit_time; 14695 rack->r_ctl.rc_sacked = qr.rack_sacked; 14696 rack->r_ctl.rc_holes_rxt = qr.rack_holes_rxt; 14697 rack->r_ctl.rc_prr_delivered = qr.rack_prr_delivered; 14698 rack->r_ctl.rc_prr_recovery_fs = qr.rack_prr_recovery_fs; 14699 rack->r_ctl.rc_prr_sndcnt = qr.rack_prr_sndcnt; 14700 rack->r_ctl.rc_prr_out = qr.rack_prr_out; 14701 if (qr.rack_tlp_out) { 14702 rack->rc_tlp_in_progress = 1; 14703 rack->r_ctl.rc_tlp_cnt_out = qr.rack_tlp_cnt_out; 14704 } else { 14705 rack->rc_tlp_in_progress = 0; 14706 rack->r_ctl.rc_tlp_cnt_out = 0; 14707 } 14708 if (qr.rack_srtt_measured) 14709 rack->rc_srtt_measure_made = 1; 14710 if (qr.rack_in_persist == 1) { 14711 rack->r_ctl.rc_went_idle_time = qr.rack_time_went_idle; 14712 #ifdef NETFLIX_SHARED_CWND 14713 if (rack->r_ctl.rc_scw) { 14714 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 14715 rack->rack_scwnd_is_idle = 1; 14716 } 14717 #endif 14718 rack->r_ctl.persist_lost_ends = 0; 14719 rack->probe_not_answered = 0; 14720 rack->forced_ack = 0; 14721 tp->t_rxtshift = 0; 14722 rack->rc_in_persist = 1; 14723 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 14724 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 14725 } 14726 if (qr.rack_wanted_output) 14727 rack->r_wanted_output = 1; 14728 rack_log_chg_info(tp, rack, 6, 14729 qr.rack_min_rtt, 14730 qr.rack_rtt, 14731 qr.rack_reorder_ts); 14732 } 14733 /* Get the old stack timers */ 14734 qr.req_param = 0; 14735 qr.req = TCP_QUERY_TIMERS_UP; 14736 ret = (*tp->t_fb->tfb_chg_query)(tp, &qr); 14737 if (ret) { 14738 /* 14739 * non-zero return means we have a timer('s) 14740 * to start. Zero means no timer (no keepalive 14741 * I suppose). 14742 */ 14743 uint32_t tov = 0; 14744 14745 rack->r_ctl.rc_hpts_flags = qr.timer_hpts_flags; 14746 if (qr.timer_hpts_flags & PACE_PKT_OUTPUT) { 14747 rack->r_ctl.rc_last_output_to = qr.timer_pacing_to; 14748 if (TSTMP_GT(qr.timer_pacing_to, us_cts)) 14749 tov = qr.timer_pacing_to - us_cts; 14750 else 14751 tov = HPTS_USECS_PER_SLOT; 14752 } 14753 if (qr.timer_hpts_flags & PACE_TMR_MASK) { 14754 rack->r_ctl.rc_timer_exp = qr.timer_timer_exp; 14755 if (tov == 0) { 14756 if (TSTMP_GT(qr.timer_timer_exp, us_cts)) 14757 tov = qr.timer_timer_exp - us_cts; 14758 else 14759 tov = HPTS_USECS_PER_SLOT; 14760 } 14761 } 14762 rack_log_chg_info(tp, rack, 4, 14763 rack->r_ctl.rc_hpts_flags, 14764 rack->r_ctl.rc_last_output_to, 14765 rack->r_ctl.rc_timer_exp); 14766 if (tov) { 14767 struct hpts_diag diag; 14768 14769 tcp_hpts_insert(tp, tov, &diag); 14770 rack_log_hpts_diag(rack, us_cts, &diag, &rack->r_ctl.act_rcv_time); 14771 } 14772 } 14773 } 14774 rack_log_rtt_shrinks(rack, us_cts, tp->t_rxtcur, 14775 __LINE__, RACK_RTTS_INIT); 14776 } 14777 return (0); 14778 } 14779 14780 static int 14781 rack_handoff_ok(struct tcpcb *tp) 14782 { 14783 if ((tp->t_state == TCPS_CLOSED) || 14784 (tp->t_state == TCPS_LISTEN)) { 14785 /* Sure no problem though it may not stick */ 14786 return (0); 14787 } 14788 if ((tp->t_state == TCPS_SYN_SENT) || 14789 (tp->t_state == TCPS_SYN_RECEIVED)) { 14790 /* 14791 * We really don't know if you support sack, 14792 * you have to get to ESTAB or beyond to tell. 14793 */ 14794 return (EAGAIN); 14795 } 14796 if ((tp->t_flags & TF_SENTFIN) && ((tp->snd_max - tp->snd_una) > 1)) { 14797 /* 14798 * Rack will only send a FIN after all data is acknowledged. 14799 * So in this case we have more data outstanding. We can't 14800 * switch stacks until either all data and only the FIN 14801 * is left (in which case rack_init() now knows how 14802 * to deal with that) <or> all is acknowledged and we 14803 * are only left with incoming data, though why you 14804 * would want to switch to rack after all data is acknowledged 14805 * I have no idea (rrs)! 14806 */ 14807 return (EAGAIN); 14808 } 14809 if ((tp->t_flags & TF_SACK_PERMIT) || rack_sack_not_required){ 14810 return (0); 14811 } 14812 /* 14813 * If we reach here we don't do SACK on this connection so we can 14814 * never do rack. 14815 */ 14816 return (EINVAL); 14817 } 14818 14819 static void 14820 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) 14821 { 14822 14823 if (tp->t_fb_ptr) { 14824 uint32_t cnt_free = 0; 14825 struct tcp_rack *rack; 14826 struct rack_sendmap *rsm; 14827 14828 tcp_handle_orphaned_packets(tp); 14829 tp->t_flags &= ~TF_FORCEDATA; 14830 rack = (struct tcp_rack *)tp->t_fb_ptr; 14831 rack_log_pacing_delay_calc(rack, 14832 0, 14833 0, 14834 0, 14835 rack_get_gp_est(rack), /* delRate */ 14836 rack_get_lt_bw(rack), /* rttProp */ 14837 20, __LINE__, NULL, 0); 14838 #ifdef NETFLIX_SHARED_CWND 14839 if (rack->r_ctl.rc_scw) { 14840 uint32_t limit; 14841 14842 if (rack->r_limit_scw) 14843 limit = max(1, rack->r_ctl.rc_lowest_us_rtt); 14844 else 14845 limit = 0; 14846 tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw, 14847 rack->r_ctl.rc_scw_index, 14848 limit); 14849 rack->r_ctl.rc_scw = NULL; 14850 } 14851 #endif 14852 if (rack->r_ctl.fsb.tcp_ip_hdr) { 14853 free(rack->r_ctl.fsb.tcp_ip_hdr, M_TCPFSB); 14854 rack->r_ctl.fsb.tcp_ip_hdr = NULL; 14855 rack->r_ctl.fsb.th = NULL; 14856 } 14857 if (rack->rc_always_pace == 1) { 14858 rack_remove_pacing(rack); 14859 } 14860 /* Clean up any options if they were not applied */ 14861 while (!TAILQ_EMPTY(&rack->r_ctl.opt_list)) { 14862 struct deferred_opt_list *dol; 14863 14864 dol = TAILQ_FIRST(&rack->r_ctl.opt_list); 14865 TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next); 14866 free(dol, M_TCPDO); 14867 } 14868 /* rack does not use force data but other stacks may clear it */ 14869 if (rack->r_ctl.crte != NULL) { 14870 tcp_rel_pacing_rate(rack->r_ctl.crte, tp); 14871 rack->rack_hdrw_pacing = 0; 14872 rack->r_ctl.crte = NULL; 14873 } 14874 #ifdef TCP_BLACKBOX 14875 tcp_log_flowend(tp); 14876 #endif 14877 /* 14878 * Lets take a different approach to purging just 14879 * get each one and free it like a cum-ack would and 14880 * not use a foreach loop. 14881 */ 14882 rsm = tqhash_min(rack->r_ctl.tqh); 14883 while (rsm) { 14884 tqhash_remove(rack->r_ctl.tqh, rsm, REMOVE_TYPE_CUMACK); 14885 rack->r_ctl.rc_num_maps_alloced--; 14886 uma_zfree(rack_zone, rsm); 14887 rsm = tqhash_min(rack->r_ctl.tqh); 14888 } 14889 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 14890 while (rsm) { 14891 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 14892 rack->r_ctl.rc_num_maps_alloced--; 14893 rack->rc_free_cnt--; 14894 cnt_free++; 14895 uma_zfree(rack_zone, rsm); 14896 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 14897 } 14898 if (rack->r_ctl.pcm_s != NULL) { 14899 free(rack->r_ctl.pcm_s, M_TCPPCM); 14900 rack->r_ctl.pcm_s = NULL; 14901 rack->r_ctl.pcm_i.cnt_alloc = 0; 14902 rack->r_ctl.pcm_i.cnt = 0; 14903 } 14904 if ((rack->r_ctl.rc_num_maps_alloced > 0) && 14905 (tcp_bblogging_on(tp))) { 14906 union tcp_log_stackspecific log; 14907 struct timeval tv; 14908 14909 memset(&log, 0, sizeof(log)); 14910 log.u_bbr.flex8 = 10; 14911 log.u_bbr.flex1 = rack->r_ctl.rc_num_maps_alloced; 14912 log.u_bbr.flex2 = rack->rc_free_cnt; 14913 log.u_bbr.flex3 = cnt_free; 14914 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 14915 rsm = tqhash_min(rack->r_ctl.tqh); 14916 log.u_bbr.delRate = (uintptr_t)rsm; 14917 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 14918 log.u_bbr.cur_del_rate = (uintptr_t)rsm; 14919 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 14920 log.u_bbr.pkt_epoch = __LINE__; 14921 (void)tcp_log_event(tp, NULL, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, 14922 0, &log, false, NULL, NULL, 0, &tv); 14923 } 14924 KASSERT((rack->r_ctl.rc_num_maps_alloced == 0), 14925 ("rack:%p num_aloc:%u after freeing all?", 14926 rack, 14927 rack->r_ctl.rc_num_maps_alloced)); 14928 rack->rc_free_cnt = 0; 14929 free(rack->r_ctl.tqh, M_TCPFSB); 14930 rack->r_ctl.tqh = NULL; 14931 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 14932 tp->t_fb_ptr = NULL; 14933 } 14934 /* Make sure snd_nxt is correctly set */ 14935 tp->snd_nxt = tp->snd_max; 14936 } 14937 14938 static void 14939 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) 14940 { 14941 if ((rack->r_state == TCPS_CLOSED) && (tp->t_state != TCPS_CLOSED)) { 14942 rack->r_is_v6 = (tptoinpcb(tp)->inp_vflag & INP_IPV6) != 0; 14943 } 14944 switch (tp->t_state) { 14945 case TCPS_SYN_SENT: 14946 rack->r_state = TCPS_SYN_SENT; 14947 rack->r_substate = rack_do_syn_sent; 14948 break; 14949 case TCPS_SYN_RECEIVED: 14950 rack->r_state = TCPS_SYN_RECEIVED; 14951 rack->r_substate = rack_do_syn_recv; 14952 break; 14953 case TCPS_ESTABLISHED: 14954 rack_set_pace_segments(tp, rack, __LINE__, NULL); 14955 rack->r_state = TCPS_ESTABLISHED; 14956 rack->r_substate = rack_do_established; 14957 break; 14958 case TCPS_CLOSE_WAIT: 14959 rack->r_state = TCPS_CLOSE_WAIT; 14960 rack->r_substate = rack_do_close_wait; 14961 break; 14962 case TCPS_FIN_WAIT_1: 14963 rack_set_pace_segments(tp, rack, __LINE__, NULL); 14964 rack->r_state = TCPS_FIN_WAIT_1; 14965 rack->r_substate = rack_do_fin_wait_1; 14966 break; 14967 case TCPS_CLOSING: 14968 rack_set_pace_segments(tp, rack, __LINE__, NULL); 14969 rack->r_state = TCPS_CLOSING; 14970 rack->r_substate = rack_do_closing; 14971 break; 14972 case TCPS_LAST_ACK: 14973 rack_set_pace_segments(tp, rack, __LINE__, NULL); 14974 rack->r_state = TCPS_LAST_ACK; 14975 rack->r_substate = rack_do_lastack; 14976 break; 14977 case TCPS_FIN_WAIT_2: 14978 rack->r_state = TCPS_FIN_WAIT_2; 14979 rack->r_substate = rack_do_fin_wait_2; 14980 break; 14981 case TCPS_LISTEN: 14982 case TCPS_CLOSED: 14983 case TCPS_TIME_WAIT: 14984 default: 14985 break; 14986 }; 14987 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) 14988 rack->rc_tp->t_flags2 |= TF2_MBUF_ACKCMP; 14989 14990 } 14991 14992 static void 14993 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) 14994 { 14995 /* 14996 * We received an ack, and then did not 14997 * call send or were bounced out due to the 14998 * hpts was running. Now a timer is up as well, is 14999 * it the right timer? 15000 */ 15001 struct rack_sendmap *rsm; 15002 int tmr_up; 15003 15004 tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 15005 if (tcp_in_hpts(rack->rc_tp) == 0) { 15006 /* 15007 * Ok we probably need some timer up, but no 15008 * matter what the mask we are not in hpts. We 15009 * may have received an old ack and thus did nothing. 15010 */ 15011 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 15012 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); 15013 return; 15014 } 15015 if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT)) 15016 return; 15017 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 15018 if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) && 15019 (tmr_up == PACE_TMR_RXT)) { 15020 /* Should be an RXT */ 15021 return; 15022 } 15023 if (rsm == NULL) { 15024 /* Nothing outstanding? */ 15025 if (tp->t_flags & TF_DELACK) { 15026 if (tmr_up == PACE_TMR_DELACK) 15027 /* We are supposed to have delayed ack up and we do */ 15028 return; 15029 } else if (((V_tcp_always_keepalive || 15030 rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 15031 (tp->t_state <= TCPS_CLOSING)) && 15032 (tmr_up == PACE_TMR_KEEP) && 15033 (tp->snd_max == tp->snd_una)) { 15034 /* We should have keep alive up and we do */ 15035 return; 15036 } 15037 } 15038 if (SEQ_GT(tp->snd_max, tp->snd_una) && 15039 ((tmr_up == PACE_TMR_TLP) || 15040 (tmr_up == PACE_TMR_RACK) || 15041 (tmr_up == PACE_TMR_RXT))) { 15042 /* 15043 * Either a Rack, TLP or RXT is fine if we 15044 * have outstanding data. 15045 */ 15046 return; 15047 } else if (tmr_up == PACE_TMR_DELACK) { 15048 /* 15049 * If the delayed ack was going to go off 15050 * before the rtx/tlp/rack timer were going to 15051 * expire, then that would be the timer in control. 15052 * Note we don't check the time here trusting the 15053 * code is correct. 15054 */ 15055 return; 15056 } 15057 /* 15058 * Ok the timer originally started is not what we want now. 15059 * We will force the hpts to be stopped if any, and restart 15060 * with the slot set to what was in the saved slot. 15061 */ 15062 if (tcp_in_hpts(rack->rc_tp)) { 15063 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 15064 uint32_t us_cts; 15065 15066 us_cts = tcp_get_usecs(NULL); 15067 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 15068 rack->r_early = 1; 15069 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); 15070 } 15071 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 15072 } 15073 tcp_hpts_remove(rack->rc_tp); 15074 } 15075 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 15076 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); 15077 } 15078 15079 15080 static void 15081 rack_do_win_updates(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tiwin, uint32_t seq, uint32_t ack, uint32_t cts) 15082 { 15083 if ((SEQ_LT(tp->snd_wl1, seq) || 15084 (tp->snd_wl1 == seq && (SEQ_LT(tp->snd_wl2, ack) || 15085 (tp->snd_wl2 == ack && tiwin > tp->snd_wnd))))) { 15086 /* keep track of pure window updates */ 15087 if ((tp->snd_wl2 == ack) && (tiwin > tp->snd_wnd)) 15088 KMOD_TCPSTAT_INC(tcps_rcvwinupd); 15089 tp->snd_wnd = tiwin; 15090 rack_validate_fo_sendwin_up(tp, rack); 15091 tp->snd_wl1 = seq; 15092 tp->snd_wl2 = ack; 15093 if (tp->snd_wnd > tp->max_sndwnd) 15094 tp->max_sndwnd = tp->snd_wnd; 15095 rack->r_wanted_output = 1; 15096 } else if ((tp->snd_wl2 == ack) && (tiwin < tp->snd_wnd)) { 15097 tp->snd_wnd = tiwin; 15098 rack_validate_fo_sendwin_up(tp, rack); 15099 tp->snd_wl1 = seq; 15100 tp->snd_wl2 = ack; 15101 } else { 15102 /* Not a valid win update */ 15103 return; 15104 } 15105 if (tp->snd_wnd > tp->max_sndwnd) 15106 tp->max_sndwnd = tp->snd_wnd; 15107 /* Do we exit persists? */ 15108 if ((rack->rc_in_persist != 0) && 15109 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 15110 rack->r_ctl.rc_pace_min_segs))) { 15111 rack_exit_persist(tp, rack, cts); 15112 } 15113 /* Do we enter persists? */ 15114 if ((rack->rc_in_persist == 0) && 15115 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 15116 TCPS_HAVEESTABLISHED(tp->t_state) && 15117 ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) && 15118 sbavail(&tptosocket(tp)->so_snd) && 15119 (sbavail(&tptosocket(tp)->so_snd) > tp->snd_wnd)) { 15120 /* 15121 * Here the rwnd is less than 15122 * the pacing size, we are established, 15123 * nothing is outstanding, and there is 15124 * data to send. Enter persists. 15125 */ 15126 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, ack); 15127 } 15128 } 15129 15130 static void 15131 rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent *ae, int ackval, uint32_t high_seq) 15132 { 15133 15134 if (tcp_bblogging_on(rack->rc_tp)) { 15135 struct inpcb *inp = tptoinpcb(tp); 15136 union tcp_log_stackspecific log; 15137 struct timeval ltv; 15138 char tcp_hdr_buf[60]; 15139 struct tcphdr *th; 15140 struct timespec ts; 15141 uint32_t orig_snd_una; 15142 uint8_t xx = 0; 15143 15144 #ifdef TCP_REQUEST_TRK 15145 struct tcp_sendfile_track *tcp_req; 15146 15147 if (SEQ_GT(ae->ack, tp->snd_una)) { 15148 tcp_req = tcp_req_find_req_for_seq(tp, (ae->ack-1)); 15149 } else { 15150 tcp_req = tcp_req_find_req_for_seq(tp, ae->ack); 15151 } 15152 #endif 15153 memset(&log, 0, sizeof(log)); 15154 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 15155 if (rack->rack_no_prr == 0) 15156 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 15157 else 15158 log.u_bbr.flex1 = 0; 15159 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 15160 log.u_bbr.use_lt_bw <<= 1; 15161 log.u_bbr.use_lt_bw |= rack->r_might_revert; 15162 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; 15163 log.u_bbr.bbr_state = rack->rc_free_cnt; 15164 log.u_bbr.inflight = ctf_flight_size(tp, rack->r_ctl.rc_sacked); 15165 log.u_bbr.pkts_out = tp->t_maxseg; 15166 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 15167 log.u_bbr.flex7 = 1; 15168 log.u_bbr.lost = ae->flags; 15169 log.u_bbr.cwnd_gain = ackval; 15170 log.u_bbr.pacing_gain = 0x2; 15171 if (ae->flags & TSTMP_HDWR) { 15172 /* Record the hardware timestamp if present */ 15173 log.u_bbr.flex3 = M_TSTMP; 15174 ts.tv_sec = ae->timestamp / 1000000000; 15175 ts.tv_nsec = ae->timestamp % 1000000000; 15176 ltv.tv_sec = ts.tv_sec; 15177 ltv.tv_usec = ts.tv_nsec / 1000; 15178 log.u_bbr.lt_epoch = tcp_tv_to_usec(<v); 15179 } else if (ae->flags & TSTMP_LRO) { 15180 /* Record the LRO the arrival timestamp */ 15181 log.u_bbr.flex3 = M_TSTMP_LRO; 15182 ts.tv_sec = ae->timestamp / 1000000000; 15183 ts.tv_nsec = ae->timestamp % 1000000000; 15184 ltv.tv_sec = ts.tv_sec; 15185 ltv.tv_usec = ts.tv_nsec / 1000; 15186 log.u_bbr.flex5 = tcp_tv_to_usec(<v); 15187 } 15188 log.u_bbr.timeStamp = tcp_get_usecs(<v); 15189 /* Log the rcv time */ 15190 log.u_bbr.delRate = ae->timestamp; 15191 #ifdef TCP_REQUEST_TRK 15192 log.u_bbr.applimited = tp->t_tcpreq_closed; 15193 log.u_bbr.applimited <<= 8; 15194 log.u_bbr.applimited |= tp->t_tcpreq_open; 15195 log.u_bbr.applimited <<= 8; 15196 log.u_bbr.applimited |= tp->t_tcpreq_req; 15197 if (tcp_req) { 15198 /* Copy out any client req info */ 15199 /* seconds */ 15200 log.u_bbr.pkt_epoch = (tcp_req->localtime / HPTS_USEC_IN_SEC); 15201 /* useconds */ 15202 log.u_bbr.delivered = (tcp_req->localtime % HPTS_USEC_IN_SEC); 15203 log.u_bbr.rttProp = tcp_req->timestamp; 15204 log.u_bbr.cur_del_rate = tcp_req->start; 15205 if (tcp_req->flags & TCP_TRK_TRACK_FLG_OPEN) { 15206 log.u_bbr.flex8 |= 1; 15207 } else { 15208 log.u_bbr.flex8 |= 2; 15209 log.u_bbr.bw_inuse = tcp_req->end; 15210 } 15211 log.u_bbr.flex6 = tcp_req->start_seq; 15212 if (tcp_req->flags & TCP_TRK_TRACK_FLG_COMP) { 15213 log.u_bbr.flex8 |= 4; 15214 log.u_bbr.epoch = tcp_req->end_seq; 15215 } 15216 } 15217 #endif 15218 memset(tcp_hdr_buf, 0, sizeof(tcp_hdr_buf)); 15219 th = (struct tcphdr *)tcp_hdr_buf; 15220 th->th_seq = ae->seq; 15221 th->th_ack = ae->ack; 15222 th->th_win = ae->win; 15223 /* Now fill in the ports */ 15224 th->th_sport = inp->inp_fport; 15225 th->th_dport = inp->inp_lport; 15226 tcp_set_flags(th, ae->flags); 15227 /* Now do we have a timestamp option? */ 15228 if (ae->flags & HAS_TSTMP) { 15229 u_char *cp; 15230 uint32_t val; 15231 15232 th->th_off = ((sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA) >> 2); 15233 cp = (u_char *)(th + 1); 15234 *cp = TCPOPT_NOP; 15235 cp++; 15236 *cp = TCPOPT_NOP; 15237 cp++; 15238 *cp = TCPOPT_TIMESTAMP; 15239 cp++; 15240 *cp = TCPOLEN_TIMESTAMP; 15241 cp++; 15242 val = htonl(ae->ts_value); 15243 bcopy((char *)&val, 15244 (char *)cp, sizeof(uint32_t)); 15245 val = htonl(ae->ts_echo); 15246 bcopy((char *)&val, 15247 (char *)(cp + 4), sizeof(uint32_t)); 15248 } else 15249 th->th_off = (sizeof(struct tcphdr) >> 2); 15250 15251 /* 15252 * For sane logging we need to play a little trick. 15253 * If the ack were fully processed we would have moved 15254 * snd_una to high_seq, but since compressed acks are 15255 * processed in two phases, at this point (logging) snd_una 15256 * won't be advanced. So we would see multiple acks showing 15257 * the advancement. We can prevent that by "pretending" that 15258 * snd_una was advanced and then un-advancing it so that the 15259 * logging code has the right value for tlb_snd_una. 15260 */ 15261 if (tp->snd_una != high_seq) { 15262 orig_snd_una = tp->snd_una; 15263 tp->snd_una = high_seq; 15264 xx = 1; 15265 } else 15266 xx = 0; 15267 TCP_LOG_EVENTP(tp, th, 15268 &tptosocket(tp)->so_rcv, 15269 &tptosocket(tp)->so_snd, TCP_LOG_IN, 0, 15270 0, &log, true, <v); 15271 if (xx) { 15272 tp->snd_una = orig_snd_una; 15273 } 15274 } 15275 15276 } 15277 15278 static void 15279 rack_handle_probe_response(struct tcp_rack *rack, uint32_t tiwin, uint32_t us_cts) 15280 { 15281 uint32_t us_rtt; 15282 /* 15283 * A persist or keep-alive was forced out, update our 15284 * min rtt time. Note now worry about lost responses. 15285 * When a subsequent keep-alive or persist times out 15286 * and forced_ack is still on, then the last probe 15287 * was not responded to. In such cases we have a 15288 * sysctl that controls the behavior. Either we apply 15289 * the rtt but with reduced confidence (0). Or we just 15290 * plain don't apply the rtt estimate. Having data flow 15291 * will clear the probe_not_answered flag i.e. cum-ack 15292 * move forward <or> exiting and reentering persists. 15293 */ 15294 15295 rack->forced_ack = 0; 15296 rack->rc_tp->t_rxtshift = 0; 15297 if ((rack->rc_in_persist && 15298 (tiwin == rack->rc_tp->snd_wnd)) || 15299 (rack->rc_in_persist == 0)) { 15300 /* 15301 * In persists only apply the RTT update if this is 15302 * a response to our window probe. And that 15303 * means the rwnd sent must match the current 15304 * snd_wnd. If it does not, then we got a 15305 * window update ack instead. For keepalive 15306 * we allow the answer no matter what the window. 15307 * 15308 * Note that if the probe_not_answered is set then 15309 * the forced_ack_ts is the oldest one i.e. the first 15310 * probe sent that might have been lost. This assures 15311 * us that if we do calculate an RTT it is longer not 15312 * some short thing. 15313 */ 15314 if (rack->rc_in_persist) 15315 counter_u64_add(rack_persists_acks, 1); 15316 us_rtt = us_cts - rack->r_ctl.forced_ack_ts; 15317 if (us_rtt == 0) 15318 us_rtt = 1; 15319 if (rack->probe_not_answered == 0) { 15320 rack_apply_updated_usrtt(rack, us_rtt, us_cts); 15321 tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 3, NULL, 1); 15322 } else { 15323 /* We have a retransmitted probe here too */ 15324 if (rack_apply_rtt_with_reduced_conf) { 15325 rack_apply_updated_usrtt(rack, us_rtt, us_cts); 15326 tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 0, NULL, 1); 15327 } 15328 } 15329 } 15330 } 15331 15332 static void 15333 rack_new_round_starts(struct tcpcb *tp, struct tcp_rack *rack, uint32_t high_seq) 15334 { 15335 /* 15336 * The next send has occurred mark the end of the round 15337 * as when that data gets acknowledged. We can 15338 * also do common things we might need to do when 15339 * a round begins. 15340 */ 15341 rack->r_ctl.roundends = tp->snd_max; 15342 rack->rc_new_rnd_needed = 0; 15343 rack_log_hystart_event(rack, tp->snd_max, 4); 15344 } 15345 15346 15347 static void 15348 rack_log_pcm(struct tcp_rack *rack, uint8_t mod, uint32_t flex1, uint32_t flex2, 15349 uint32_t flex3) 15350 { 15351 if (tcp_bblogging_on(rack->rc_tp)) { 15352 union tcp_log_stackspecific log; 15353 struct timeval tv; 15354 15355 (void)tcp_get_usecs(&tv); 15356 memset(&log, 0, sizeof(log)); 15357 log.u_bbr.timeStamp = tcp_tv_to_usec(&tv); 15358 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 15359 log.u_bbr.flex8 = mod; 15360 log.u_bbr.flex1 = flex1; 15361 log.u_bbr.flex2 = flex2; 15362 log.u_bbr.flex3 = flex3; 15363 log.u_bbr.flex4 = rack_pcm_every_n_rounds; 15364 log.u_bbr.flex5 = rack->r_ctl.pcm_idle_rounds; 15365 log.u_bbr.bbr_substate = rack->pcm_needed; 15366 log.u_bbr.bbr_substate <<= 1; 15367 log.u_bbr.bbr_substate |= rack->pcm_in_progress; 15368 log.u_bbr.bbr_substate <<= 1; 15369 log.u_bbr.bbr_substate |= rack->pcm_enabled; /* bits are NIE for Needed, Inprogress, Enabled */ 15370 (void)tcp_log_event(rack->rc_tp, NULL, NULL, NULL, TCP_PCM_MEASURE, ERRNO_UNK, 15371 0, &log, false, NULL, NULL, 0, &tv); 15372 } 15373 } 15374 15375 static void 15376 rack_new_round_setup(struct tcpcb *tp, struct tcp_rack *rack, uint32_t high_seq) 15377 { 15378 /* 15379 * The round (current_round) has ended. We now 15380 * setup for the next round by incrementing the 15381 * round numnber and doing any round specific 15382 * things. 15383 */ 15384 rack_log_hystart_event(rack, high_seq, 21); 15385 rack->r_ctl.current_round++; 15386 /* New round (current_round) begins at next send */ 15387 rack->rc_new_rnd_needed = 1; 15388 if ((rack->pcm_enabled == 1) && 15389 (rack->pcm_needed == 0) && 15390 (rack->pcm_in_progress == 0)) { 15391 /* 15392 * If we have enabled PCM, then we need to 15393 * check if the round has adanced to the state 15394 * where one is required. 15395 */ 15396 int rnds; 15397 15398 rnds = rack->r_ctl.current_round - rack->r_ctl.last_pcm_round; 15399 if ((rnds + rack->r_ctl.pcm_idle_rounds) >= rack_pcm_every_n_rounds) { 15400 rack->pcm_needed = 1; 15401 rack_log_pcm(rack, 3, rack->r_ctl.last_pcm_round, rack_pcm_every_n_rounds, rack->r_ctl.current_round ); 15402 } else if (rack_verbose_logging) { 15403 rack_log_pcm(rack, 3, rack->r_ctl.last_pcm_round, rack_pcm_every_n_rounds, rack->r_ctl.current_round ); 15404 } 15405 } 15406 if (tp->t_ccv.flags & CCF_HYSTART_ALLOWED) { 15407 /* We have hystart enabled send the round info in */ 15408 if (CC_ALGO(tp)->newround != NULL) { 15409 CC_ALGO(tp)->newround(&tp->t_ccv, rack->r_ctl.current_round); 15410 } 15411 } 15412 /* 15413 * For DGP an initial startup check. We want to validate 15414 * that we are not just pushing on slow-start and just 15415 * not gaining.. i.e. filling buffers without getting any 15416 * boost in b/w during the inital slow-start. 15417 */ 15418 if (rack->dgp_on && 15419 (rack->rc_initial_ss_comp == 0) && 15420 (tp->snd_cwnd < tp->snd_ssthresh) && 15421 (rack->r_ctl.num_measurements >= RACK_REQ_AVG) && 15422 (rack->r_ctl.gp_rnd_thresh > 0) && 15423 ((rack->r_ctl.current_round - rack->r_ctl.last_rnd_of_gp_rise) >= rack->r_ctl.gp_rnd_thresh)) { 15424 15425 /* 15426 * We are in the initial SS and we have hd rack_rnd_cnt_req rounds(def:5) where 15427 * we have not gained the required amount in the gp_est (120.0% aka 1200). Lets 15428 * exit SS. 15429 * 15430 * Pick up the flight size now as we enter slowstart (not the 15431 * cwnd which may be inflated). 15432 */ 15433 rack->rc_initial_ss_comp = 1; 15434 15435 if (tcp_bblogging_on(rack->rc_tp)) { 15436 union tcp_log_stackspecific log; 15437 struct timeval tv; 15438 15439 memset(&log, 0, sizeof(log)); 15440 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 15441 log.u_bbr.flex1 = rack->r_ctl.current_round; 15442 log.u_bbr.flex2 = rack->r_ctl.last_rnd_of_gp_rise; 15443 log.u_bbr.flex3 = rack->r_ctl.gp_rnd_thresh; 15444 log.u_bbr.flex4 = rack->r_ctl.gate_to_fs; 15445 log.u_bbr.flex5 = rack->r_ctl.ss_hi_fs; 15446 log.u_bbr.flex8 = 40; 15447 (void)tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 15448 0, &log, false, NULL, __func__, __LINE__,&tv); 15449 } 15450 if ((rack->r_ctl.gate_to_fs == 1) && 15451 (tp->snd_cwnd > rack->r_ctl.ss_hi_fs)) { 15452 tp->snd_cwnd = rack->r_ctl.ss_hi_fs; 15453 } 15454 tp->snd_ssthresh = tp->snd_cwnd - 1; 15455 /* Turn off any fast output running */ 15456 rack->r_fast_output = 0; 15457 } 15458 } 15459 15460 static int 15461 rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mbuf *m, int nxt_pkt, struct timeval *tv) 15462 { 15463 /* 15464 * Handle a "special" compressed ack mbuf. Each incoming 15465 * ack has only four possible dispositions: 15466 * 15467 * A) It moves the cum-ack forward 15468 * B) It is behind the cum-ack. 15469 * C) It is a window-update ack. 15470 * D) It is a dup-ack. 15471 * 15472 * Note that we can have between 1 -> TCP_COMP_ACK_ENTRIES 15473 * in the incoming mbuf. We also need to still pay attention 15474 * to nxt_pkt since there may be another packet after this 15475 * one. 15476 */ 15477 #ifdef TCP_ACCOUNTING 15478 uint64_t ts_val; 15479 uint64_t rdstc; 15480 #endif 15481 int segsiz; 15482 struct timespec ts; 15483 struct tcp_rack *rack; 15484 struct tcp_ackent *ae; 15485 uint32_t tiwin, ms_cts, cts, acked, acked_amount, high_seq, win_seq, the_win, win_upd_ack; 15486 int cnt, i, did_out, ourfinisacked = 0; 15487 struct tcpopt to_holder, *to = NULL; 15488 #ifdef TCP_ACCOUNTING 15489 int win_up_req = 0; 15490 #endif 15491 int nsegs = 0; 15492 int under_pacing = 0; 15493 int post_recovery = 0; 15494 #ifdef TCP_ACCOUNTING 15495 sched_pin(); 15496 #endif 15497 rack = (struct tcp_rack *)tp->t_fb_ptr; 15498 if (rack->gp_ready && 15499 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) 15500 under_pacing = 1; 15501 15502 if (rack->r_state != tp->t_state) 15503 rack_set_state(tp, rack); 15504 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 15505 (tp->t_flags & TF_GPUTINPROG)) { 15506 /* 15507 * We have a goodput in progress 15508 * and we have entered a late state. 15509 * Do we have enough data in the sb 15510 * to handle the GPUT request? 15511 */ 15512 uint32_t bytes; 15513 15514 bytes = tp->gput_ack - tp->gput_seq; 15515 if (SEQ_GT(tp->gput_seq, tp->snd_una)) 15516 bytes += tp->gput_seq - tp->snd_una; 15517 if (bytes > sbavail(&tptosocket(tp)->so_snd)) { 15518 /* 15519 * There are not enough bytes in the socket 15520 * buffer that have been sent to cover this 15521 * measurement. Cancel it. 15522 */ 15523 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 15524 rack->r_ctl.rc_gp_srtt /*flex1*/, 15525 tp->gput_seq, 15526 0, 0, 18, __LINE__, NULL, 0); 15527 tp->t_flags &= ~TF_GPUTINPROG; 15528 } 15529 } 15530 to = &to_holder; 15531 to->to_flags = 0; 15532 KASSERT((m->m_len >= sizeof(struct tcp_ackent)), 15533 ("tp:%p m_cmpack:%p with invalid len:%u", tp, m, m->m_len)); 15534 cnt = m->m_len / sizeof(struct tcp_ackent); 15535 counter_u64_add(rack_multi_single_eq, cnt); 15536 high_seq = tp->snd_una; 15537 the_win = tp->snd_wnd; 15538 win_seq = tp->snd_wl1; 15539 win_upd_ack = tp->snd_wl2; 15540 cts = tcp_tv_to_usec(tv); 15541 ms_cts = tcp_tv_to_msec(tv); 15542 rack->r_ctl.rc_rcvtime = cts; 15543 segsiz = ctf_fixed_maxseg(tp); 15544 if ((rack->rc_gp_dyn_mul) && 15545 (rack->use_fixed_rate == 0) && 15546 (rack->rc_always_pace)) { 15547 /* Check in on probertt */ 15548 rack_check_probe_rtt(rack, cts); 15549 } 15550 for (i = 0; i < cnt; i++) { 15551 #ifdef TCP_ACCOUNTING 15552 ts_val = get_cyclecount(); 15553 #endif 15554 rack_clear_rate_sample(rack); 15555 ae = ((mtod(m, struct tcp_ackent *)) + i); 15556 if (ae->flags & TH_FIN) 15557 rack_log_pacing_delay_calc(rack, 15558 0, 15559 0, 15560 0, 15561 rack_get_gp_est(rack), /* delRate */ 15562 rack_get_lt_bw(rack), /* rttProp */ 15563 20, __LINE__, NULL, 0); 15564 /* Setup the window */ 15565 tiwin = ae->win << tp->snd_scale; 15566 if (tiwin > rack->r_ctl.rc_high_rwnd) 15567 rack->r_ctl.rc_high_rwnd = tiwin; 15568 /* figure out the type of ack */ 15569 if (SEQ_LT(ae->ack, high_seq)) { 15570 /* Case B*/ 15571 ae->ack_val_set = ACK_BEHIND; 15572 } else if (SEQ_GT(ae->ack, high_seq)) { 15573 /* Case A */ 15574 ae->ack_val_set = ACK_CUMACK; 15575 } else if ((tiwin == the_win) && (rack->rc_in_persist == 0)){ 15576 /* Case D */ 15577 ae->ack_val_set = ACK_DUPACK; 15578 } else { 15579 /* Case C */ 15580 ae->ack_val_set = ACK_RWND; 15581 } 15582 rack_log_type_bbrsnd(rack, 0, 0, cts, tv, __LINE__); 15583 rack_log_input_packet(tp, rack, ae, ae->ack_val_set, high_seq); 15584 /* Validate timestamp */ 15585 if (ae->flags & HAS_TSTMP) { 15586 /* Setup for a timestamp */ 15587 to->to_flags = TOF_TS; 15588 ae->ts_echo -= tp->ts_offset; 15589 to->to_tsecr = ae->ts_echo; 15590 to->to_tsval = ae->ts_value; 15591 /* 15592 * If echoed timestamp is later than the current time, fall back to 15593 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 15594 * were used when this connection was established. 15595 */ 15596 if (TSTMP_GT(ae->ts_echo, ms_cts)) 15597 to->to_tsecr = 0; 15598 if (tp->ts_recent && 15599 TSTMP_LT(ae->ts_value, tp->ts_recent)) { 15600 if (ctf_ts_check_ac(tp, (ae->flags & 0xff))) { 15601 #ifdef TCP_ACCOUNTING 15602 rdstc = get_cyclecount(); 15603 if (rdstc > ts_val) { 15604 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 15605 tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val); 15606 } 15607 } 15608 #endif 15609 continue; 15610 } 15611 } 15612 if (SEQ_LEQ(ae->seq, tp->last_ack_sent) && 15613 SEQ_LEQ(tp->last_ack_sent, ae->seq)) { 15614 tp->ts_recent_age = tcp_ts_getticks(); 15615 tp->ts_recent = ae->ts_value; 15616 } 15617 } else { 15618 /* Setup for a no options */ 15619 to->to_flags = 0; 15620 } 15621 /* Update the rcv time and perform idle reduction possibly */ 15622 if (tp->t_idle_reduce && 15623 (tp->snd_max == tp->snd_una) && 15624 (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { 15625 counter_u64_add(rack_input_idle_reduces, 1); 15626 rack_cc_after_idle(rack, tp); 15627 } 15628 tp->t_rcvtime = ticks; 15629 /* Now what about ECN of a chain of pure ACKs? */ 15630 if (tcp_ecn_input_segment(tp, ae->flags, 0, 15631 tcp_packets_this_ack(tp, ae->ack), 15632 ae->codepoint)) 15633 rack_cong_signal(tp, CC_ECN, ae->ack, __LINE__); 15634 #ifdef TCP_ACCOUNTING 15635 /* Count for the specific type of ack in */ 15636 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 15637 tp->tcp_cnt_counters[ae->ack_val_set]++; 15638 } 15639 #endif 15640 /* 15641 * Note how we could move up these in the determination 15642 * above, but we don't so that way the timestamp checks (and ECN) 15643 * is done first before we do any processing on the ACK. 15644 * The non-compressed path through the code has this 15645 * weakness (noted by @jtl) that it actually does some 15646 * processing before verifying the timestamp information. 15647 * We don't take that path here which is why we set 15648 * the ack_val_set first, do the timestamp and ecn 15649 * processing, and then look at what we have setup. 15650 */ 15651 if (ae->ack_val_set == ACK_BEHIND) { 15652 /* 15653 * Case B flag reordering, if window is not closed 15654 * or it could be a keep-alive or persists 15655 */ 15656 if (SEQ_LT(ae->ack, tp->snd_una) && (sbspace(&so->so_rcv) > segsiz)) { 15657 rack->r_ctl.rc_reorder_ts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time); 15658 if (rack->r_ctl.rc_reorder_ts == 0) 15659 rack->r_ctl.rc_reorder_ts = 1; 15660 } 15661 } else if (ae->ack_val_set == ACK_DUPACK) { 15662 /* Case D */ 15663 rack_strike_dupack(rack, ae->ack); 15664 } else if (ae->ack_val_set == ACK_RWND) { 15665 /* Case C */ 15666 if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) { 15667 ts.tv_sec = ae->timestamp / 1000000000; 15668 ts.tv_nsec = ae->timestamp % 1000000000; 15669 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; 15670 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; 15671 } else { 15672 rack->r_ctl.act_rcv_time = *tv; 15673 } 15674 if (rack->forced_ack) { 15675 rack_handle_probe_response(rack, tiwin, 15676 tcp_tv_to_usec(&rack->r_ctl.act_rcv_time)); 15677 } 15678 #ifdef TCP_ACCOUNTING 15679 win_up_req = 1; 15680 #endif 15681 win_upd_ack = ae->ack; 15682 win_seq = ae->seq; 15683 the_win = tiwin; 15684 rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts); 15685 } else { 15686 /* Case A */ 15687 if (SEQ_GT(ae->ack, tp->snd_max)) { 15688 /* 15689 * We just send an ack since the incoming 15690 * ack is beyond the largest seq we sent. 15691 */ 15692 if ((tp->t_flags & TF_ACKNOW) == 0) { 15693 ctf_ack_war_checks(tp); 15694 if (tp->t_flags && TF_ACKNOW) 15695 rack->r_wanted_output = 1; 15696 } 15697 } else { 15698 nsegs++; 15699 /* If the window changed setup to update */ 15700 if (tiwin != tp->snd_wnd) { 15701 win_upd_ack = ae->ack; 15702 win_seq = ae->seq; 15703 the_win = tiwin; 15704 rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts); 15705 } 15706 #ifdef TCP_ACCOUNTING 15707 /* Account for the acks */ 15708 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 15709 tp->tcp_cnt_counters[CNT_OF_ACKS_IN] += (((ae->ack - high_seq) + segsiz - 1) / segsiz); 15710 } 15711 #endif 15712 high_seq = ae->ack; 15713 /* Setup our act_rcv_time */ 15714 if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) { 15715 ts.tv_sec = ae->timestamp / 1000000000; 15716 ts.tv_nsec = ae->timestamp % 1000000000; 15717 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; 15718 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; 15719 } else { 15720 rack->r_ctl.act_rcv_time = *tv; 15721 } 15722 rack_process_to_cumack(tp, rack, ae->ack, cts, to, 15723 tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time)); 15724 #ifdef TCP_REQUEST_TRK 15725 rack_req_check_for_comp(rack, high_seq); 15726 #endif 15727 if (rack->rc_dsack_round_seen) { 15728 /* Is the dsack round over? */ 15729 if (SEQ_GEQ(ae->ack, rack->r_ctl.dsack_round_end)) { 15730 /* Yes it is */ 15731 rack->rc_dsack_round_seen = 0; 15732 rack_log_dsack_event(rack, 3, __LINE__, 0, 0); 15733 } 15734 } 15735 } 15736 } 15737 /* And lets be sure to commit the rtt measurements for this ack */ 15738 tcp_rack_xmit_timer_commit(rack, tp); 15739 #ifdef TCP_ACCOUNTING 15740 rdstc = get_cyclecount(); 15741 if (rdstc > ts_val) { 15742 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 15743 tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val); 15744 if (ae->ack_val_set == ACK_CUMACK) 15745 tp->tcp_proc_time[CYC_HANDLE_MAP] += (rdstc - ts_val); 15746 } 15747 } 15748 #endif 15749 } 15750 #ifdef TCP_ACCOUNTING 15751 ts_val = get_cyclecount(); 15752 #endif 15753 /* Tend to any collapsed window */ 15754 if (SEQ_GT(tp->snd_max, high_seq) && (tp->snd_wnd < (tp->snd_max - high_seq))) { 15755 /* The peer collapsed the window */ 15756 rack_collapsed_window(rack, (tp->snd_max - high_seq), high_seq, __LINE__); 15757 } else if (rack->rc_has_collapsed) 15758 rack_un_collapse_window(rack, __LINE__); 15759 if ((rack->r_collapse_point_valid) && 15760 (SEQ_GT(high_seq, rack->r_ctl.high_collapse_point))) 15761 rack->r_collapse_point_valid = 0; 15762 acked_amount = acked = (high_seq - tp->snd_una); 15763 if (acked) { 15764 /* 15765 * The draft (v3) calls for us to use SEQ_GEQ, but that 15766 * causes issues when we are just going app limited. Lets 15767 * instead use SEQ_GT <or> where its equal but more data 15768 * is outstanding. 15769 * 15770 * Also make sure we are on the last ack of a series. We 15771 * have to have all the ack's processed in queue to know 15772 * if there is something left outstanding. 15773 * 15774 */ 15775 if (SEQ_GEQ(high_seq, rack->r_ctl.roundends) && 15776 (rack->rc_new_rnd_needed == 0) && 15777 (nxt_pkt == 0)) { 15778 /* 15779 * We have crossed into a new round with 15780 * this th_ack value. 15781 */ 15782 rack_new_round_setup(tp, rack, high_seq); 15783 } 15784 /* 15785 * Clear the probe not answered flag 15786 * since cum-ack moved forward. 15787 */ 15788 rack->probe_not_answered = 0; 15789 if (tp->t_flags & TF_NEEDSYN) { 15790 /* 15791 * T/TCP: Connection was half-synchronized, and our SYN has 15792 * been ACK'd (so connection is now fully synchronized). Go 15793 * to non-starred state, increment snd_una for ACK of SYN, 15794 * and check if we can do window scaling. 15795 */ 15796 tp->t_flags &= ~TF_NEEDSYN; 15797 tp->snd_una++; 15798 acked_amount = acked = (high_seq - tp->snd_una); 15799 } 15800 if (acked > sbavail(&so->so_snd)) 15801 acked_amount = sbavail(&so->so_snd); 15802 if (IN_FASTRECOVERY(tp->t_flags) && 15803 (rack->rack_no_prr == 0)) 15804 rack_update_prr(tp, rack, acked_amount, high_seq); 15805 if (IN_RECOVERY(tp->t_flags)) { 15806 if (SEQ_LT(high_seq, tp->snd_recover) && 15807 (SEQ_LT(high_seq, tp->snd_max))) { 15808 tcp_rack_partialack(tp); 15809 } else { 15810 rack_post_recovery(tp, high_seq); 15811 post_recovery = 1; 15812 } 15813 } else if ((rack->rto_from_rec == 1) && 15814 SEQ_GEQ(high_seq, tp->snd_recover)) { 15815 /* 15816 * We were in recovery, hit a rxt timeout 15817 * and never re-entered recovery. The timeout(s) 15818 * made up all the lost data. In such a case 15819 * we need to clear the rto_from_rec flag. 15820 */ 15821 rack->rto_from_rec = 0; 15822 } 15823 /* Handle the rack-log-ack part (sendmap) */ 15824 if ((sbused(&so->so_snd) == 0) && 15825 (acked > acked_amount) && 15826 (tp->t_state >= TCPS_FIN_WAIT_1) && 15827 (tp->t_flags & TF_SENTFIN)) { 15828 /* 15829 * We must be sure our fin 15830 * was sent and acked (we can be 15831 * in FIN_WAIT_1 without having 15832 * sent the fin). 15833 */ 15834 ourfinisacked = 1; 15835 /* 15836 * Lets make sure snd_una is updated 15837 * since most likely acked_amount = 0 (it 15838 * should be). 15839 */ 15840 tp->snd_una = high_seq; 15841 } 15842 /* Did we make a RTO error? */ 15843 if ((tp->t_flags & TF_PREVVALID) && 15844 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) { 15845 tp->t_flags &= ~TF_PREVVALID; 15846 if (tp->t_rxtshift == 1 && 15847 (int)(ticks - tp->t_badrxtwin) < 0) 15848 rack_cong_signal(tp, CC_RTO_ERR, high_seq, __LINE__); 15849 } 15850 /* Handle the data in the socket buffer */ 15851 KMOD_TCPSTAT_ADD(tcps_rcvackpack, 1); 15852 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 15853 if (acked_amount > 0) { 15854 uint32_t p_cwnd; 15855 struct mbuf *mfree; 15856 15857 if (post_recovery) { 15858 /* 15859 * Grab the segsiz, multiply by 2 and add the snd_cwnd 15860 * that is the max the CC should add if we are exiting 15861 * recovery and doing a late add. 15862 */ 15863 p_cwnd = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 15864 p_cwnd <<= 1; 15865 p_cwnd += tp->snd_cwnd; 15866 } 15867 rack_ack_received(tp, rack, high_seq, nsegs, CC_ACK, post_recovery); 15868 if (post_recovery && (tp->snd_cwnd > p_cwnd)) { 15869 /* Must be non-newreno (cubic) getting too ahead of itself */ 15870 tp->snd_cwnd = p_cwnd; 15871 } 15872 SOCK_SENDBUF_LOCK(so); 15873 mfree = sbcut_locked(&so->so_snd, acked_amount); 15874 tp->snd_una = high_seq; 15875 /* Note we want to hold the sb lock through the sendmap adjust */ 15876 rack_adjust_sendmap_head(rack, &so->so_snd); 15877 /* Wake up the socket if we have room to write more */ 15878 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); 15879 sowwakeup_locked(so); 15880 m_freem(mfree); 15881 } 15882 /* update progress */ 15883 tp->t_acktime = ticks; 15884 rack_log_progress_event(rack, tp, tp->t_acktime, 15885 PROGRESS_UPDATE, __LINE__); 15886 /* Clear out shifts and such */ 15887 tp->t_rxtshift = 0; 15888 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 15889 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 15890 rack->rc_tlp_in_progress = 0; 15891 rack->r_ctl.rc_tlp_cnt_out = 0; 15892 /* Send recover and snd_nxt must be dragged along */ 15893 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 15894 tp->snd_recover = tp->snd_una; 15895 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 15896 tp->snd_nxt = tp->snd_max; 15897 /* 15898 * If the RXT timer is running we want to 15899 * stop it, so we can restart a TLP (or new RXT). 15900 */ 15901 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 15902 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 15903 tp->snd_wl2 = high_seq; 15904 tp->t_dupacks = 0; 15905 if (under_pacing && 15906 (rack->use_fixed_rate == 0) && 15907 (rack->in_probe_rtt == 0) && 15908 rack->rc_gp_dyn_mul && 15909 rack->rc_always_pace) { 15910 /* Check if we are dragging bottom */ 15911 rack_check_bottom_drag(tp, rack, so); 15912 } 15913 if (tp->snd_una == tp->snd_max) { 15914 tp->t_flags &= ~TF_PREVVALID; 15915 rack->r_ctl.retran_during_recovery = 0; 15916 rack->rc_suspicious = 0; 15917 rack->r_ctl.dsack_byte_cnt = 0; 15918 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 15919 if (rack->r_ctl.rc_went_idle_time == 0) 15920 rack->r_ctl.rc_went_idle_time = 1; 15921 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 15922 if (sbavail(&tptosocket(tp)->so_snd) == 0) 15923 tp->t_acktime = 0; 15924 /* Set so we might enter persists... */ 15925 rack->r_wanted_output = 1; 15926 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 15927 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 15928 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 15929 (sbavail(&so->so_snd) == 0) && 15930 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 15931 /* 15932 * The socket was gone and the 15933 * peer sent data (not now in the past), time to 15934 * reset him. 15935 */ 15936 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 15937 /* tcp_close will kill the inp pre-log the Reset */ 15938 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 15939 #ifdef TCP_ACCOUNTING 15940 rdstc = get_cyclecount(); 15941 if (rdstc > ts_val) { 15942 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 15943 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 15944 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 15945 } 15946 } 15947 #endif 15948 m_freem(m); 15949 tp = tcp_close(tp); 15950 if (tp == NULL) { 15951 #ifdef TCP_ACCOUNTING 15952 sched_unpin(); 15953 #endif 15954 return (1); 15955 } 15956 /* 15957 * We would normally do drop-with-reset which would 15958 * send back a reset. We can't since we don't have 15959 * all the needed bits. Instead lets arrange for 15960 * a call to tcp_output(). That way since we 15961 * are in the closed state we will generate a reset. 15962 * 15963 * Note if tcp_accounting is on we don't unpin since 15964 * we do that after the goto label. 15965 */ 15966 goto send_out_a_rst; 15967 } 15968 if ((sbused(&so->so_snd) == 0) && 15969 (tp->t_state >= TCPS_FIN_WAIT_1) && 15970 (tp->t_flags & TF_SENTFIN)) { 15971 /* 15972 * If we can't receive any more data, then closing user can 15973 * proceed. Starting the timer is contrary to the 15974 * specification, but if we don't get a FIN we'll hang 15975 * forever. 15976 * 15977 */ 15978 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 15979 soisdisconnected(so); 15980 tcp_timer_activate(tp, TT_2MSL, 15981 (tcp_fast_finwait2_recycle ? 15982 tcp_finwait2_timeout : 15983 TP_MAXIDLE(tp))); 15984 } 15985 if (ourfinisacked == 0) { 15986 /* 15987 * We don't change to fin-wait-2 if we have our fin acked 15988 * which means we are probably in TCPS_CLOSING. 15989 */ 15990 tcp_state_change(tp, TCPS_FIN_WAIT_2); 15991 } 15992 } 15993 } 15994 /* Wake up the socket if we have room to write more */ 15995 if (sbavail(&so->so_snd)) { 15996 rack->r_wanted_output = 1; 15997 if (ctf_progress_timeout_check(tp, true)) { 15998 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 15999 tp, tick, PROGRESS_DROP, __LINE__); 16000 /* 16001 * We cheat here and don't send a RST, we should send one 16002 * when the pacer drops the connection. 16003 */ 16004 #ifdef TCP_ACCOUNTING 16005 rdstc = get_cyclecount(); 16006 if (rdstc > ts_val) { 16007 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16008 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 16009 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 16010 } 16011 } 16012 sched_unpin(); 16013 #endif 16014 (void)tcp_drop(tp, ETIMEDOUT); 16015 m_freem(m); 16016 return (1); 16017 } 16018 } 16019 if (ourfinisacked) { 16020 switch(tp->t_state) { 16021 case TCPS_CLOSING: 16022 #ifdef TCP_ACCOUNTING 16023 rdstc = get_cyclecount(); 16024 if (rdstc > ts_val) { 16025 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16026 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 16027 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 16028 } 16029 } 16030 sched_unpin(); 16031 #endif 16032 tcp_twstart(tp); 16033 m_freem(m); 16034 return (1); 16035 break; 16036 case TCPS_LAST_ACK: 16037 #ifdef TCP_ACCOUNTING 16038 rdstc = get_cyclecount(); 16039 if (rdstc > ts_val) { 16040 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16041 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 16042 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 16043 } 16044 } 16045 sched_unpin(); 16046 #endif 16047 tp = tcp_close(tp); 16048 ctf_do_drop(m, tp); 16049 return (1); 16050 break; 16051 case TCPS_FIN_WAIT_1: 16052 #ifdef TCP_ACCOUNTING 16053 rdstc = get_cyclecount(); 16054 if (rdstc > ts_val) { 16055 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16056 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 16057 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 16058 } 16059 } 16060 #endif 16061 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 16062 soisdisconnected(so); 16063 tcp_timer_activate(tp, TT_2MSL, 16064 (tcp_fast_finwait2_recycle ? 16065 tcp_finwait2_timeout : 16066 TP_MAXIDLE(tp))); 16067 } 16068 tcp_state_change(tp, TCPS_FIN_WAIT_2); 16069 break; 16070 default: 16071 break; 16072 } 16073 } 16074 if (rack->r_fast_output) { 16075 /* 16076 * We re doing fast output.. can we expand that? 16077 */ 16078 rack_gain_for_fastoutput(rack, tp, so, acked_amount); 16079 } 16080 #ifdef TCP_ACCOUNTING 16081 rdstc = get_cyclecount(); 16082 if (rdstc > ts_val) { 16083 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16084 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 16085 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 16086 } 16087 } 16088 16089 } else if (win_up_req) { 16090 rdstc = get_cyclecount(); 16091 if (rdstc > ts_val) { 16092 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16093 tp->tcp_proc_time[ACK_RWND] += (rdstc - ts_val); 16094 } 16095 } 16096 #endif 16097 } 16098 /* Now is there a next packet, if so we are done */ 16099 m_freem(m); 16100 did_out = 0; 16101 if (nxt_pkt) { 16102 #ifdef TCP_ACCOUNTING 16103 sched_unpin(); 16104 #endif 16105 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 5, nsegs); 16106 return (0); 16107 } 16108 rack_handle_might_revert(tp, rack); 16109 ctf_calc_rwin(so, tp); 16110 if ((rack->r_wanted_output != 0) || 16111 (rack->r_fast_output != 0) || 16112 (tp->t_flags & TF_ACKNOW )) { 16113 send_out_a_rst: 16114 if (tcp_output(tp) < 0) { 16115 #ifdef TCP_ACCOUNTING 16116 sched_unpin(); 16117 #endif 16118 return (1); 16119 } 16120 did_out = 1; 16121 } 16122 if (tp->t_flags2 & TF2_HPTS_CALLS) 16123 tp->t_flags2 &= ~TF2_HPTS_CALLS; 16124 rack_free_trim(rack); 16125 #ifdef TCP_ACCOUNTING 16126 sched_unpin(); 16127 #endif 16128 rack_timer_audit(tp, rack, &so->so_snd); 16129 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 6, nsegs); 16130 return (0); 16131 } 16132 16133 #define TCP_LRO_TS_OPTION \ 16134 ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \ 16135 (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP) 16136 16137 static int 16138 rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, 16139 int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, int32_t nxt_pkt, 16140 struct timeval *tv) 16141 { 16142 struct inpcb *inp = tptoinpcb(tp); 16143 struct socket *so = tptosocket(tp); 16144 #ifdef TCP_ACCOUNTING 16145 uint64_t ts_val; 16146 #endif 16147 int32_t thflags, retval, did_out = 0; 16148 int32_t way_out = 0; 16149 /* 16150 * cts - is the current time from tv (caller gets ts) in microseconds. 16151 * ms_cts - is the current time from tv in milliseconds. 16152 * us_cts - is the time that LRO or hardware actually got the packet in microseconds. 16153 */ 16154 uint32_t cts, us_cts, ms_cts; 16155 uint32_t tiwin; 16156 struct timespec ts; 16157 struct tcpopt to; 16158 struct tcp_rack *rack; 16159 struct rack_sendmap *rsm; 16160 int32_t prev_state = 0; 16161 int no_output = 0; 16162 int time_remaining = 0; 16163 #ifdef TCP_ACCOUNTING 16164 int ack_val_set = 0xf; 16165 #endif 16166 int nsegs; 16167 16168 NET_EPOCH_ASSERT(); 16169 INP_WLOCK_ASSERT(inp); 16170 16171 /* 16172 * tv passed from common code is from either M_TSTMP_LRO or 16173 * tcp_get_usecs() if no LRO m_pkthdr timestamp is present. 16174 */ 16175 rack = (struct tcp_rack *)tp->t_fb_ptr; 16176 if (rack->rack_deferred_inited == 0) { 16177 /* 16178 * If we are the connecting socket we will 16179 * hit rack_init() when no sequence numbers 16180 * are setup. This makes it so we must defer 16181 * some initialization. Call that now. 16182 */ 16183 rack_deferred_init(tp, rack); 16184 } 16185 /* 16186 * Check to see if we need to skip any output plans. This 16187 * can happen in the non-LRO path where we are pacing and 16188 * must process the ack coming in but need to defer sending 16189 * anything becase a pacing timer is running. 16190 */ 16191 us_cts = tcp_tv_to_usec(tv); 16192 if (m->m_flags & M_ACKCMP) { 16193 /* 16194 * All compressed ack's are ack's by definition so 16195 * remove any ack required flag and then do the processing. 16196 */ 16197 rack->rc_ack_required = 0; 16198 return (rack_do_compressed_ack_processing(tp, so, m, nxt_pkt, tv)); 16199 } 16200 thflags = tcp_get_flags(th); 16201 if ((rack->rc_always_pace == 1) && 16202 (rack->rc_ack_can_sendout_data == 0) && 16203 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 16204 (TSTMP_LT(us_cts, rack->r_ctl.rc_last_output_to))) { 16205 /* 16206 * Ok conditions are right for queuing the packets 16207 * but we do have to check the flags in the inp, it 16208 * could be, if a sack is present, we want to be awoken and 16209 * so should process the packets. 16210 */ 16211 time_remaining = rack->r_ctl.rc_last_output_to - us_cts; 16212 if (rack->rc_tp->t_flags2 & TF2_DONT_SACK_QUEUE) { 16213 no_output = 1; 16214 } else { 16215 /* 16216 * If there is no options, or just a 16217 * timestamp option, we will want to queue 16218 * the packets. This is the same that LRO does 16219 * and will need to change with accurate ECN. 16220 */ 16221 uint32_t *ts_ptr; 16222 int optlen; 16223 16224 optlen = (th->th_off << 2) - sizeof(struct tcphdr); 16225 ts_ptr = (uint32_t *)(th + 1); 16226 if ((optlen == 0) || 16227 ((optlen == TCPOLEN_TSTAMP_APPA) && 16228 (*ts_ptr == TCP_LRO_TS_OPTION))) 16229 no_output = 1; 16230 } 16231 if ((no_output == 1) && (time_remaining < tcp_min_hptsi_time)) { 16232 /* 16233 * It is unrealistic to think we can pace in less than 16234 * the minimum granularity of the pacer (def:250usec). So 16235 * if we have less than that time remaining we should go 16236 * ahead and allow output to be "early". We will attempt to 16237 * make up for it in any pacing time we try to apply on 16238 * the outbound packet. 16239 */ 16240 no_output = 0; 16241 } 16242 } 16243 /* 16244 * If there is a RST or FIN lets dump out the bw 16245 * with a FIN the connection may go on but we 16246 * may not. 16247 */ 16248 if ((thflags & TH_FIN) || (thflags & TH_RST)) 16249 rack_log_pacing_delay_calc(rack, 16250 rack->r_ctl.gp_bw, 16251 0, 16252 0, 16253 rack_get_gp_est(rack), /* delRate */ 16254 rack_get_lt_bw(rack), /* rttProp */ 16255 20, __LINE__, NULL, 0); 16256 if (m->m_flags & M_ACKCMP) { 16257 panic("Impossible reach m has ackcmp? m:%p tp:%p", m, tp); 16258 } 16259 cts = tcp_tv_to_usec(tv); 16260 ms_cts = tcp_tv_to_msec(tv); 16261 nsegs = m->m_pkthdr.lro_nsegs; 16262 counter_u64_add(rack_proc_non_comp_ack, 1); 16263 #ifdef TCP_ACCOUNTING 16264 sched_pin(); 16265 if (thflags & TH_ACK) 16266 ts_val = get_cyclecount(); 16267 #endif 16268 if ((m->m_flags & M_TSTMP) || 16269 (m->m_flags & M_TSTMP_LRO)) { 16270 mbuf_tstmp2timespec(m, &ts); 16271 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; 16272 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; 16273 } else 16274 rack->r_ctl.act_rcv_time = *tv; 16275 kern_prefetch(rack, &prev_state); 16276 prev_state = 0; 16277 /* 16278 * Unscale the window into a 32-bit value. For the SYN_SENT state 16279 * the scale is zero. 16280 */ 16281 tiwin = th->th_win << tp->snd_scale; 16282 #ifdef TCP_ACCOUNTING 16283 if (thflags & TH_ACK) { 16284 /* 16285 * We have a tradeoff here. We can either do what we are 16286 * doing i.e. pinning to this CPU and then doing the accounting 16287 * <or> we could do a critical enter, setup the rdtsc and cpu 16288 * as in below, and then validate we are on the same CPU on 16289 * exit. I have choosen to not do the critical enter since 16290 * that often will gain you a context switch, and instead lock 16291 * us (line above this if) to the same CPU with sched_pin(). This 16292 * means we may be context switched out for a higher priority 16293 * interupt but we won't be moved to another CPU. 16294 * 16295 * If this occurs (which it won't very often since we most likely 16296 * are running this code in interupt context and only a higher 16297 * priority will bump us ... clock?) we will falsely add in 16298 * to the time the interupt processing time plus the ack processing 16299 * time. This is ok since its a rare event. 16300 */ 16301 ack_val_set = tcp_do_ack_accounting(tp, th, &to, tiwin, 16302 ctf_fixed_maxseg(tp)); 16303 } 16304 #endif 16305 /* 16306 * Parse options on any incoming segment. 16307 */ 16308 memset(&to, 0, sizeof(to)); 16309 tcp_dooptions(&to, (u_char *)(th + 1), 16310 (th->th_off << 2) - sizeof(struct tcphdr), 16311 (thflags & TH_SYN) ? TO_SYN : 0); 16312 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 16313 __func__)); 16314 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 16315 __func__)); 16316 if (tp->t_flags2 & TF2_PROC_SACK_PROHIBIT) { 16317 /* 16318 * We don't look at sack's from the 16319 * peer because the MSS is too small which 16320 * can subject us to an attack. 16321 */ 16322 to.to_flags &= ~TOF_SACK; 16323 } 16324 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 16325 (tp->t_flags & TF_GPUTINPROG)) { 16326 /* 16327 * We have a goodput in progress 16328 * and we have entered a late state. 16329 * Do we have enough data in the sb 16330 * to handle the GPUT request? 16331 */ 16332 uint32_t bytes; 16333 16334 bytes = tp->gput_ack - tp->gput_seq; 16335 if (SEQ_GT(tp->gput_seq, tp->snd_una)) 16336 bytes += tp->gput_seq - tp->snd_una; 16337 if (bytes > sbavail(&tptosocket(tp)->so_snd)) { 16338 /* 16339 * There are not enough bytes in the socket 16340 * buffer that have been sent to cover this 16341 * measurement. Cancel it. 16342 */ 16343 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 16344 rack->r_ctl.rc_gp_srtt /*flex1*/, 16345 tp->gput_seq, 16346 0, 0, 18, __LINE__, NULL, 0); 16347 tp->t_flags &= ~TF_GPUTINPROG; 16348 } 16349 } 16350 if (tcp_bblogging_on(rack->rc_tp)) { 16351 union tcp_log_stackspecific log; 16352 struct timeval ltv; 16353 #ifdef TCP_REQUEST_TRK 16354 struct tcp_sendfile_track *tcp_req; 16355 16356 if (SEQ_GT(th->th_ack, tp->snd_una)) { 16357 tcp_req = tcp_req_find_req_for_seq(tp, (th->th_ack-1)); 16358 } else { 16359 tcp_req = tcp_req_find_req_for_seq(tp, th->th_ack); 16360 } 16361 #endif 16362 memset(&log, 0, sizeof(log)); 16363 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 16364 if (rack->rack_no_prr == 0) 16365 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 16366 else 16367 log.u_bbr.flex1 = 0; 16368 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 16369 log.u_bbr.use_lt_bw <<= 1; 16370 log.u_bbr.use_lt_bw |= rack->r_might_revert; 16371 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; 16372 log.u_bbr.bbr_state = rack->rc_free_cnt; 16373 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 16374 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; 16375 log.u_bbr.flex3 = m->m_flags; 16376 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 16377 log.u_bbr.lost = thflags; 16378 log.u_bbr.pacing_gain = 0x1; 16379 #ifdef TCP_ACCOUNTING 16380 log.u_bbr.cwnd_gain = ack_val_set; 16381 #endif 16382 log.u_bbr.flex7 = 2; 16383 if (m->m_flags & M_TSTMP) { 16384 /* Record the hardware timestamp if present */ 16385 mbuf_tstmp2timespec(m, &ts); 16386 ltv.tv_sec = ts.tv_sec; 16387 ltv.tv_usec = ts.tv_nsec / 1000; 16388 log.u_bbr.lt_epoch = tcp_tv_to_usec(<v); 16389 } else if (m->m_flags & M_TSTMP_LRO) { 16390 /* Record the LRO the arrival timestamp */ 16391 mbuf_tstmp2timespec(m, &ts); 16392 ltv.tv_sec = ts.tv_sec; 16393 ltv.tv_usec = ts.tv_nsec / 1000; 16394 log.u_bbr.flex5 = tcp_tv_to_usec(<v); 16395 } 16396 log.u_bbr.timeStamp = tcp_get_usecs(<v); 16397 /* Log the rcv time */ 16398 log.u_bbr.delRate = m->m_pkthdr.rcv_tstmp; 16399 #ifdef TCP_REQUEST_TRK 16400 log.u_bbr.applimited = tp->t_tcpreq_closed; 16401 log.u_bbr.applimited <<= 8; 16402 log.u_bbr.applimited |= tp->t_tcpreq_open; 16403 log.u_bbr.applimited <<= 8; 16404 log.u_bbr.applimited |= tp->t_tcpreq_req; 16405 if (tcp_req) { 16406 /* Copy out any client req info */ 16407 /* seconds */ 16408 log.u_bbr.pkt_epoch = (tcp_req->localtime / HPTS_USEC_IN_SEC); 16409 /* useconds */ 16410 log.u_bbr.delivered = (tcp_req->localtime % HPTS_USEC_IN_SEC); 16411 log.u_bbr.rttProp = tcp_req->timestamp; 16412 log.u_bbr.cur_del_rate = tcp_req->start; 16413 if (tcp_req->flags & TCP_TRK_TRACK_FLG_OPEN) { 16414 log.u_bbr.flex8 |= 1; 16415 } else { 16416 log.u_bbr.flex8 |= 2; 16417 log.u_bbr.bw_inuse = tcp_req->end; 16418 } 16419 log.u_bbr.flex6 = tcp_req->start_seq; 16420 if (tcp_req->flags & TCP_TRK_TRACK_FLG_COMP) { 16421 log.u_bbr.flex8 |= 4; 16422 log.u_bbr.epoch = tcp_req->end_seq; 16423 } 16424 } 16425 #endif 16426 TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, 16427 tlen, &log, true, <v); 16428 } 16429 /* Remove ack required flag if set, we have one */ 16430 if (thflags & TH_ACK) 16431 rack->rc_ack_required = 0; 16432 rack_log_type_bbrsnd(rack, 0, 0, cts, tv, __LINE__); 16433 if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { 16434 way_out = 4; 16435 retval = 0; 16436 m_freem(m); 16437 goto done_with_input; 16438 } 16439 /* 16440 * If a segment with the ACK-bit set arrives in the SYN-SENT state 16441 * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9. 16442 */ 16443 if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && 16444 (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { 16445 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 16446 ctf_do_dropwithreset(m, tp, th, tlen); 16447 #ifdef TCP_ACCOUNTING 16448 sched_unpin(); 16449 #endif 16450 return (1); 16451 } 16452 /* 16453 * If timestamps were negotiated during SYN/ACK and a 16454 * segment without a timestamp is received, silently drop 16455 * the segment, unless it is a RST segment or missing timestamps are 16456 * tolerated. 16457 * See section 3.2 of RFC 7323. 16458 */ 16459 if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS) && 16460 ((thflags & TH_RST) == 0) && (V_tcp_tolerate_missing_ts == 0)) { 16461 way_out = 5; 16462 retval = 0; 16463 m_freem(m); 16464 goto done_with_input; 16465 } 16466 /* 16467 * Segment received on connection. Reset idle time and keep-alive 16468 * timer. XXX: This should be done after segment validation to 16469 * ignore broken/spoofed segs. 16470 */ 16471 if (tp->t_idle_reduce && 16472 (tp->snd_max == tp->snd_una) && 16473 (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { 16474 counter_u64_add(rack_input_idle_reduces, 1); 16475 rack_cc_after_idle(rack, tp); 16476 } 16477 tp->t_rcvtime = ticks; 16478 #ifdef STATS 16479 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); 16480 #endif 16481 if (tiwin > rack->r_ctl.rc_high_rwnd) 16482 rack->r_ctl.rc_high_rwnd = tiwin; 16483 /* 16484 * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move 16485 * this to occur after we've validated the segment. 16486 */ 16487 if (tcp_ecn_input_segment(tp, thflags, tlen, 16488 tcp_packets_this_ack(tp, th->th_ack), 16489 iptos)) 16490 rack_cong_signal(tp, CC_ECN, th->th_ack, __LINE__); 16491 16492 /* 16493 * If echoed timestamp is later than the current time, fall back to 16494 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 16495 * were used when this connection was established. 16496 */ 16497 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 16498 to.to_tsecr -= tp->ts_offset; 16499 if (TSTMP_GT(to.to_tsecr, ms_cts)) 16500 to.to_tsecr = 0; 16501 } 16502 if ((rack->r_rcvpath_rtt_up == 1) && 16503 (to.to_flags & TOF_TS) && 16504 (TSTMP_GEQ(to.to_tsecr, rack->r_ctl.last_rcv_tstmp_for_rtt))) { 16505 uint32_t rtt = 0; 16506 16507 /* 16508 * We are receiving only and thus not sending 16509 * data to do an RTT. We set a flag when we first 16510 * sent this TS to the peer. We now have it back 16511 * and have an RTT to share. We log it as a conf 16512 * 4, we are not so sure about it.. since we 16513 * may have lost an ack. 16514 */ 16515 if (TSTMP_GT(cts, rack->r_ctl.last_time_of_arm_rcv)) 16516 rtt = (cts - rack->r_ctl.last_time_of_arm_rcv); 16517 rack->r_rcvpath_rtt_up = 0; 16518 /* Submit and commit the timer */ 16519 if (rtt > 0) { 16520 tcp_rack_xmit_timer(rack, rtt, 0, rtt, 4, NULL, 1); 16521 tcp_rack_xmit_timer_commit(rack, tp); 16522 } 16523 } 16524 /* 16525 * If its the first time in we need to take care of options and 16526 * verify we can do SACK for rack! 16527 */ 16528 if (rack->r_state == 0) { 16529 /* Should be init'd by rack_init() */ 16530 KASSERT(rack->rc_inp != NULL, 16531 ("%s: rack->rc_inp unexpectedly NULL", __func__)); 16532 if (rack->rc_inp == NULL) { 16533 rack->rc_inp = inp; 16534 } 16535 16536 /* 16537 * Process options only when we get SYN/ACK back. The SYN 16538 * case for incoming connections is handled in tcp_syncache. 16539 * According to RFC1323 the window field in a SYN (i.e., a 16540 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX 16541 * this is traditional behavior, may need to be cleaned up. 16542 */ 16543 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 16544 /* Handle parallel SYN for ECN */ 16545 tcp_ecn_input_parallel_syn(tp, thflags, iptos); 16546 if ((to.to_flags & TOF_SCALE) && 16547 (tp->t_flags & TF_REQ_SCALE)) { 16548 tp->t_flags |= TF_RCVD_SCALE; 16549 tp->snd_scale = to.to_wscale; 16550 } else 16551 tp->t_flags &= ~TF_REQ_SCALE; 16552 /* 16553 * Initial send window. It will be updated with the 16554 * next incoming segment to the scaled value. 16555 */ 16556 tp->snd_wnd = th->th_win; 16557 rack_validate_fo_sendwin_up(tp, rack); 16558 if ((to.to_flags & TOF_TS) && 16559 (tp->t_flags & TF_REQ_TSTMP)) { 16560 tp->t_flags |= TF_RCVD_TSTMP; 16561 tp->ts_recent = to.to_tsval; 16562 tp->ts_recent_age = cts; 16563 } else 16564 tp->t_flags &= ~TF_REQ_TSTMP; 16565 if (to.to_flags & TOF_MSS) { 16566 tcp_mss(tp, to.to_mss); 16567 } 16568 if ((tp->t_flags & TF_SACK_PERMIT) && 16569 (to.to_flags & TOF_SACKPERM) == 0) 16570 tp->t_flags &= ~TF_SACK_PERMIT; 16571 if (tp->t_flags & TF_FASTOPEN) { 16572 if (to.to_flags & TOF_FASTOPEN) { 16573 uint16_t mss; 16574 16575 if (to.to_flags & TOF_MSS) 16576 mss = to.to_mss; 16577 else 16578 if ((inp->inp_vflag & INP_IPV6) != 0) 16579 mss = TCP6_MSS; 16580 else 16581 mss = TCP_MSS; 16582 tcp_fastopen_update_cache(tp, mss, 16583 to.to_tfo_len, to.to_tfo_cookie); 16584 } else 16585 tcp_fastopen_disable_path(tp); 16586 } 16587 } 16588 /* 16589 * At this point we are at the initial call. Here we decide 16590 * if we are doing RACK or not. We do this by seeing if 16591 * TF_SACK_PERMIT is set and the sack-not-required is clear. 16592 * The code now does do dup-ack counting so if you don't 16593 * switch back you won't get rack & TLP, but you will still 16594 * get this stack. 16595 */ 16596 16597 if ((rack_sack_not_required == 0) && 16598 ((tp->t_flags & TF_SACK_PERMIT) == 0)) { 16599 tcp_switch_back_to_default(tp); 16600 (*tp->t_fb->tfb_tcp_do_segment)(tp, m, th, drop_hdrlen, 16601 tlen, iptos); 16602 #ifdef TCP_ACCOUNTING 16603 sched_unpin(); 16604 #endif 16605 return (1); 16606 } 16607 tcp_set_hpts(tp); 16608 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); 16609 } 16610 if (thflags & TH_FIN) 16611 tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN); 16612 us_cts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time); 16613 if ((rack->rc_gp_dyn_mul) && 16614 (rack->use_fixed_rate == 0) && 16615 (rack->rc_always_pace)) { 16616 /* Check in on probertt */ 16617 rack_check_probe_rtt(rack, cts); 16618 } 16619 rack_clear_rate_sample(rack); 16620 if ((rack->forced_ack) && 16621 ((tcp_get_flags(th) & TH_RST) == 0)) { 16622 rack_handle_probe_response(rack, tiwin, us_cts); 16623 } 16624 /* 16625 * This is the one exception case where we set the rack state 16626 * always. All other times (timers etc) we must have a rack-state 16627 * set (so we assure we have done the checks above for SACK). 16628 */ 16629 rack->r_ctl.rc_rcvtime = cts; 16630 if (rack->r_state != tp->t_state) 16631 rack_set_state(tp, rack); 16632 if (SEQ_GT(th->th_ack, tp->snd_una) && 16633 (rsm = tqhash_min(rack->r_ctl.tqh)) != NULL) 16634 kern_prefetch(rsm, &prev_state); 16635 prev_state = rack->r_state; 16636 if ((thflags & TH_RST) && 16637 ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && 16638 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || 16639 (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq))) { 16640 /* The connection will be killed by a reset check the tracepoint */ 16641 tcp_trace_point(rack->rc_tp, TCP_TP_RESET_RCV); 16642 } 16643 retval = (*rack->r_substate) (m, th, so, 16644 tp, &to, drop_hdrlen, 16645 tlen, tiwin, thflags, nxt_pkt, iptos); 16646 if (retval == 0) { 16647 /* 16648 * If retval is 1 the tcb is unlocked and most likely the tp 16649 * is gone. 16650 */ 16651 INP_WLOCK_ASSERT(inp); 16652 if ((rack->rc_gp_dyn_mul) && 16653 (rack->rc_always_pace) && 16654 (rack->use_fixed_rate == 0) && 16655 rack->in_probe_rtt && 16656 (rack->r_ctl.rc_time_probertt_starts == 0)) { 16657 /* 16658 * If we are going for target, lets recheck before 16659 * we output. 16660 */ 16661 rack_check_probe_rtt(rack, cts); 16662 } 16663 if (rack->set_pacing_done_a_iw == 0) { 16664 /* How much has been acked? */ 16665 if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) { 16666 /* We have enough to set in the pacing segment size */ 16667 rack->set_pacing_done_a_iw = 1; 16668 rack_set_pace_segments(tp, rack, __LINE__, NULL); 16669 } 16670 } 16671 tcp_rack_xmit_timer_commit(rack, tp); 16672 #ifdef TCP_ACCOUNTING 16673 /* 16674 * If we set the ack_val_se to what ack processing we are doing 16675 * we also want to track how many cycles we burned. Note 16676 * the bits after tcp_output we let be "free". This is because 16677 * we are also tracking the tcp_output times as well. Note the 16678 * use of 0xf here since we only have 11 counter (0 - 0xa) and 16679 * 0xf cannot be returned and is what we initialize it too to 16680 * indicate we are not doing the tabulations. 16681 */ 16682 if (ack_val_set != 0xf) { 16683 uint64_t crtsc; 16684 16685 crtsc = get_cyclecount(); 16686 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16687 tp->tcp_proc_time[ack_val_set] += (crtsc - ts_val); 16688 } 16689 } 16690 #endif 16691 if ((nxt_pkt == 0) && (no_output == 0)) { 16692 if ((rack->r_wanted_output != 0) || 16693 (tp->t_flags & TF_ACKNOW) || 16694 (rack->r_fast_output != 0)) { 16695 16696 do_output_now: 16697 if (tcp_output(tp) < 0) { 16698 #ifdef TCP_ACCOUNTING 16699 sched_unpin(); 16700 #endif 16701 return (1); 16702 } 16703 did_out = 1; 16704 } 16705 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 16706 rack_free_trim(rack); 16707 } else if ((nxt_pkt == 0) && (tp->t_flags & TF_ACKNOW)) { 16708 goto do_output_now; 16709 } else if ((no_output == 1) && 16710 (nxt_pkt == 0) && 16711 (tcp_in_hpts(rack->rc_tp) == 0)) { 16712 /* 16713 * We are not in hpts and we had a pacing timer up. Use 16714 * the remaining time (time_remaining) to restart the timer. 16715 */ 16716 KASSERT ((time_remaining != 0), ("slot remaining is zero for rack:%p tp:%p", rack, tp)); 16717 rack_start_hpts_timer(rack, tp, cts, time_remaining, 0, 0); 16718 rack_free_trim(rack); 16719 } 16720 /* Clear the flag, it may have been cleared by output but we may not have */ 16721 if ((nxt_pkt == 0) && (tp->t_flags2 & TF2_HPTS_CALLS)) 16722 tp->t_flags2 &= ~TF2_HPTS_CALLS; 16723 /* 16724 * The draft (v3) calls for us to use SEQ_GEQ, but that 16725 * causes issues when we are just going app limited. Lets 16726 * instead use SEQ_GT <or> where its equal but more data 16727 * is outstanding. 16728 * 16729 * Also make sure we are on the last ack of a series. We 16730 * have to have all the ack's processed in queue to know 16731 * if there is something left outstanding. 16732 */ 16733 if (SEQ_GEQ(tp->snd_una, rack->r_ctl.roundends) && 16734 (rack->rc_new_rnd_needed == 0) && 16735 (nxt_pkt == 0)) { 16736 /* 16737 * We have crossed into a new round with 16738 * the new snd_unae. 16739 */ 16740 rack_new_round_setup(tp, rack, tp->snd_una); 16741 } 16742 if ((nxt_pkt == 0) && 16743 ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && 16744 (SEQ_GT(tp->snd_max, tp->snd_una) || 16745 (tp->t_flags & TF_DELACK) || 16746 ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 16747 (tp->t_state <= TCPS_CLOSING)))) { 16748 /* We could not send (probably in the hpts but stopped the timer earlier)? */ 16749 if ((tp->snd_max == tp->snd_una) && 16750 ((tp->t_flags & TF_DELACK) == 0) && 16751 (tcp_in_hpts(rack->rc_tp)) && 16752 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 16753 /* keep alive not needed if we are hptsi output yet */ 16754 ; 16755 } else { 16756 int late = 0; 16757 if (tcp_in_hpts(tp)) { 16758 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 16759 us_cts = tcp_get_usecs(NULL); 16760 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 16761 rack->r_early = 1; 16762 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); 16763 } else 16764 late = 1; 16765 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 16766 } 16767 tcp_hpts_remove(tp); 16768 } 16769 if (late && (did_out == 0)) { 16770 /* 16771 * We are late in the sending 16772 * and we did not call the output 16773 * (this probably should not happen). 16774 */ 16775 goto do_output_now; 16776 } 16777 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); 16778 } 16779 way_out = 1; 16780 } else if (nxt_pkt == 0) { 16781 /* Do we have the correct timer running? */ 16782 rack_timer_audit(tp, rack, &so->so_snd); 16783 way_out = 2; 16784 } 16785 done_with_input: 16786 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out, max(1, nsegs)); 16787 if (did_out) 16788 rack->r_wanted_output = 0; 16789 } 16790 16791 #ifdef TCP_ACCOUNTING 16792 sched_unpin(); 16793 #endif 16794 return (retval); 16795 } 16796 16797 static void 16798 rack_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, 16799 int32_t drop_hdrlen, int32_t tlen, uint8_t iptos) 16800 { 16801 struct timeval tv; 16802 16803 /* First lets see if we have old packets */ 16804 if (!STAILQ_EMPTY(&tp->t_inqueue)) { 16805 if (ctf_do_queued_segments(tp, 1)) { 16806 m_freem(m); 16807 return; 16808 } 16809 } 16810 if (m->m_flags & M_TSTMP_LRO) { 16811 mbuf_tstmp2timeval(m, &tv); 16812 } else { 16813 /* Should not be should we kassert instead? */ 16814 tcp_get_usecs(&tv); 16815 } 16816 if (rack_do_segment_nounlock(tp, m, th, drop_hdrlen, tlen, iptos, 0, 16817 &tv) == 0) { 16818 INP_WUNLOCK(tptoinpcb(tp)); 16819 } 16820 } 16821 16822 struct rack_sendmap * 16823 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) 16824 { 16825 struct rack_sendmap *rsm = NULL; 16826 int32_t idx; 16827 uint32_t srtt = 0, thresh = 0, ts_low = 0; 16828 16829 /* Return the next guy to be re-transmitted */ 16830 if (tqhash_empty(rack->r_ctl.tqh)) { 16831 return (NULL); 16832 } 16833 if (tp->t_flags & TF_SENTFIN) { 16834 /* retran the end FIN? */ 16835 return (NULL); 16836 } 16837 /* ok lets look at this one */ 16838 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 16839 if (rack->r_must_retran && rsm && (rsm->r_flags & RACK_MUST_RXT)) { 16840 return (rsm); 16841 } 16842 if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) { 16843 goto check_it; 16844 } 16845 rsm = rack_find_lowest_rsm(rack); 16846 if (rsm == NULL) { 16847 return (NULL); 16848 } 16849 check_it: 16850 if (((rack->rc_tp->t_flags & TF_SACK_PERMIT) == 0) && 16851 (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { 16852 /* 16853 * No sack so we automatically do the 3 strikes and 16854 * retransmit (no rack timer would be started). 16855 */ 16856 return (rsm); 16857 } 16858 if (rsm->r_flags & RACK_ACKED) { 16859 return (NULL); 16860 } 16861 if (((rsm->r_flags & RACK_SACK_PASSED) == 0) && 16862 (rsm->r_dupack < DUP_ACK_THRESHOLD)) { 16863 /* Its not yet ready */ 16864 return (NULL); 16865 } 16866 srtt = rack_grab_rtt(tp, rack); 16867 idx = rsm->r_rtr_cnt - 1; 16868 ts_low = (uint32_t)rsm->r_tim_lastsent[idx]; 16869 thresh = rack_calc_thresh_rack(rack, srtt, tsused, __LINE__, 1); 16870 if ((tsused == ts_low) || 16871 (TSTMP_LT(tsused, ts_low))) { 16872 /* No time since sending */ 16873 return (NULL); 16874 } 16875 if ((tsused - ts_low) < thresh) { 16876 /* It has not been long enough yet */ 16877 return (NULL); 16878 } 16879 if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || 16880 ((rsm->r_flags & RACK_SACK_PASSED))) { 16881 /* 16882 * We have passed the dup-ack threshold <or> 16883 * a SACK has indicated this is missing. 16884 * Note that if you are a declared attacker 16885 * it is only the dup-ack threshold that 16886 * will cause retransmits. 16887 */ 16888 /* log retransmit reason */ 16889 rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1); 16890 rack->r_fast_output = 0; 16891 return (rsm); 16892 } 16893 return (NULL); 16894 } 16895 16896 static void 16897 rack_log_pacing_delay_calc (struct tcp_rack *rack, uint32_t len, uint32_t pacing_delay, 16898 uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, 16899 int line, struct rack_sendmap *rsm, uint8_t quality) 16900 { 16901 if (tcp_bblogging_on(rack->rc_tp)) { 16902 union tcp_log_stackspecific log; 16903 struct timeval tv; 16904 16905 if (rack_verbose_logging == 0) { 16906 /* 16907 * We are not verbose screen out all but 16908 * ones we always want. 16909 */ 16910 if ((method != 2) && 16911 (method != 3) && 16912 (method != 7) && 16913 (method != 89) && 16914 (method != 14) && 16915 (method != 20)) { 16916 return; 16917 } 16918 } 16919 memset(&log, 0, sizeof(log)); 16920 log.u_bbr.flex1 = pacing_delay; 16921 log.u_bbr.flex2 = len; 16922 log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs; 16923 log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs; 16924 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ss; 16925 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_ca; 16926 log.u_bbr.use_lt_bw = rack->rc_ack_can_sendout_data; 16927 log.u_bbr.use_lt_bw <<= 1; 16928 log.u_bbr.use_lt_bw |= rack->r_late; 16929 log.u_bbr.use_lt_bw <<= 1; 16930 log.u_bbr.use_lt_bw |= rack->r_early; 16931 log.u_bbr.use_lt_bw <<= 1; 16932 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set; 16933 log.u_bbr.use_lt_bw <<= 1; 16934 log.u_bbr.use_lt_bw |= rack->rc_gp_filled; 16935 log.u_bbr.use_lt_bw <<= 1; 16936 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; 16937 log.u_bbr.use_lt_bw <<= 1; 16938 log.u_bbr.use_lt_bw |= rack->in_probe_rtt; 16939 log.u_bbr.use_lt_bw <<= 1; 16940 log.u_bbr.use_lt_bw |= rack->gp_ready; 16941 log.u_bbr.pkt_epoch = line; 16942 log.u_bbr.epoch = rack->r_ctl.rc_agg_delayed; 16943 log.u_bbr.lt_epoch = rack->r_ctl.rc_agg_early; 16944 log.u_bbr.applimited = rack->r_ctl.rack_per_of_gp_rec; 16945 log.u_bbr.bw_inuse = bw_est; 16946 log.u_bbr.delRate = bw; 16947 if (rack->r_ctl.gp_bw == 0) 16948 log.u_bbr.cur_del_rate = 0; 16949 else 16950 log.u_bbr.cur_del_rate = rack_get_bw(rack); 16951 log.u_bbr.rttProp = len_time; 16952 log.u_bbr.pkts_out = rack->r_ctl.rc_rack_min_rtt; 16953 log.u_bbr.lost = rack->r_ctl.rc_probertt_sndmax_atexit; 16954 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); 16955 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) { 16956 /* We are in slow start */ 16957 log.u_bbr.flex7 = 1; 16958 } else { 16959 /* we are on congestion avoidance */ 16960 log.u_bbr.flex7 = 0; 16961 } 16962 log.u_bbr.flex8 = method; 16963 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 16964 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 16965 log.u_bbr.cwnd_gain = rack->rc_gp_saw_rec; 16966 log.u_bbr.cwnd_gain <<= 1; 16967 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; 16968 log.u_bbr.cwnd_gain <<= 1; 16969 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; 16970 log.u_bbr.cwnd_gain <<= 1; 16971 log.u_bbr.cwnd_gain |= rack->use_fixed_rate; 16972 log.u_bbr.cwnd_gain <<= 1; 16973 log.u_bbr.cwnd_gain |= rack->rc_always_pace; 16974 log.u_bbr.cwnd_gain <<= 1; 16975 log.u_bbr.cwnd_gain |= rack->gp_ready; 16976 log.u_bbr.bbr_substate = quality; 16977 log.u_bbr.bbr_state = rack->dgp_on; 16978 log.u_bbr.bbr_state <<= 1; 16979 log.u_bbr.bbr_state |= rack->rc_pace_to_cwnd; 16980 log.u_bbr.bbr_state <<= 2; 16981 TCP_LOG_EVENTP(rack->rc_tp, NULL, 16982 &rack->rc_inp->inp_socket->so_rcv, 16983 &rack->rc_inp->inp_socket->so_snd, 16984 BBR_LOG_HPTSI_CALC, 0, 16985 0, &log, false, &tv); 16986 } 16987 } 16988 16989 static uint32_t 16990 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss) 16991 { 16992 uint32_t new_tso, user_max, pace_one; 16993 16994 user_max = rack->rc_user_set_max_segs * mss; 16995 if (rack->rc_force_max_seg) { 16996 return (user_max); 16997 } 16998 if (rack->use_fixed_rate && 16999 ((rack->r_ctl.crte == NULL) || 17000 (bw != rack->r_ctl.crte->rate))) { 17001 /* Use the user mss since we are not exactly matched */ 17002 return (user_max); 17003 } 17004 if (rack_pace_one_seg || 17005 (rack->r_ctl.rc_user_set_min_segs == 1)) 17006 pace_one = 1; 17007 else 17008 pace_one = 0; 17009 17010 new_tso = tcp_get_pacing_burst_size_w_divisor(rack->rc_tp, bw, mss, 17011 pace_one, rack->r_ctl.crte, NULL, rack->r_ctl.pace_len_divisor); 17012 if (new_tso > user_max) 17013 new_tso = user_max; 17014 if (rack->rc_hybrid_mode && rack->r_ctl.client_suggested_maxseg) { 17015 if (((uint32_t)rack->r_ctl.client_suggested_maxseg * mss) > new_tso) 17016 new_tso = (uint32_t)rack->r_ctl.client_suggested_maxseg * mss; 17017 } 17018 if (rack->r_ctl.rc_user_set_min_segs && 17019 ((rack->r_ctl.rc_user_set_min_segs * mss) > new_tso)) 17020 new_tso = rack->r_ctl.rc_user_set_min_segs * mss; 17021 return (new_tso); 17022 } 17023 17024 static uint64_t 17025 rack_arrive_at_discounted_rate(struct tcp_rack *rack, uint64_t window_input, uint32_t *rate_set, uint32_t *gain_b) 17026 { 17027 uint64_t reduced_win; 17028 uint32_t gain; 17029 17030 if (window_input < rc_init_window(rack)) { 17031 /* 17032 * The cwnd is collapsed to 17033 * nearly zero, maybe because of a time-out? 17034 * Lets drop back to the lt-bw. 17035 */ 17036 reduced_win = rack_get_lt_bw(rack); 17037 /* Set the flag so the caller knows its a rate and not a reduced window */ 17038 *rate_set = 1; 17039 gain = 100; 17040 } else if (IN_RECOVERY(rack->rc_tp->t_flags)) { 17041 /* 17042 * If we are in recover our cwnd needs to be less for 17043 * our pacing consideration. 17044 */ 17045 if (rack->rack_hibeta == 0) { 17046 reduced_win = window_input / 2; 17047 gain = 50; 17048 } else { 17049 reduced_win = window_input * rack->r_ctl.saved_hibeta; 17050 reduced_win /= 100; 17051 gain = rack->r_ctl.saved_hibeta; 17052 } 17053 } else { 17054 /* 17055 * Apply Timely factor to increase/decrease the 17056 * amount we are pacing at. 17057 */ 17058 gain = rack_get_output_gain(rack, NULL); 17059 if (gain > rack_gain_p5_ub) { 17060 gain = rack_gain_p5_ub; 17061 } 17062 reduced_win = window_input * gain; 17063 reduced_win /= 100; 17064 } 17065 if (gain_b != NULL) 17066 *gain_b = gain; 17067 /* 17068 * What is being returned here is a trimmed down 17069 * window values in all cases where rate_set is left 17070 * at 0. In one case we actually return the rate (lt_bw). 17071 * the "reduced_win" is returned as a slimmed down cwnd that 17072 * is then calculated by the caller into a rate when rate_set 17073 * is 0. 17074 */ 17075 return (reduced_win); 17076 } 17077 17078 static int32_t 17079 pace_to_fill_cwnd(struct tcp_rack *rack, int32_t pacing_delay, uint32_t len, uint32_t segsiz, int *capped, uint64_t *rate_wanted, uint8_t non_paced) 17080 { 17081 uint64_t lentim, fill_bw; 17082 17083 rack->r_via_fill_cw = 0; 17084 if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use) 17085 return (pacing_delay); 17086 if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd) 17087 return (pacing_delay); 17088 if (rack->r_ctl.rc_last_us_rtt == 0) 17089 return (pacing_delay); 17090 if (rack->rc_pace_fill_if_rttin_range && 17091 (rack->r_ctl.rc_last_us_rtt >= 17092 (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) { 17093 /* The rtt is huge, N * smallest, lets not fill */ 17094 return (pacing_delay); 17095 } 17096 if (rack->r_ctl.fillcw_cap && *rate_wanted >= rack->r_ctl.fillcw_cap) 17097 return (pacing_delay); 17098 /* 17099 * first lets calculate the b/w based on the last us-rtt 17100 * and the the smallest send window. 17101 */ 17102 fill_bw = min(rack->rc_tp->snd_cwnd, rack->r_ctl.cwnd_to_use); 17103 if (rack->rc_fillcw_apply_discount) { 17104 uint32_t rate_set = 0; 17105 17106 fill_bw = rack_arrive_at_discounted_rate(rack, fill_bw, &rate_set, NULL); 17107 if (rate_set) { 17108 goto at_lt_bw; 17109 } 17110 } 17111 /* Take the rwnd if its smaller */ 17112 if (fill_bw > rack->rc_tp->snd_wnd) 17113 fill_bw = rack->rc_tp->snd_wnd; 17114 /* Now lets make it into a b/w */ 17115 fill_bw *= (uint64_t)HPTS_USEC_IN_SEC; 17116 fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt; 17117 /* Adjust to any cap */ 17118 if (rack->r_ctl.fillcw_cap && fill_bw >= rack->r_ctl.fillcw_cap) 17119 fill_bw = rack->r_ctl.fillcw_cap; 17120 17121 at_lt_bw: 17122 if (rack_bw_multipler > 0) { 17123 /* 17124 * We want to limit fill-cw to the some multiplier 17125 * of the max(lt_bw, gp_est). The normal default 17126 * is 0 for off, so a sysctl has enabled it. 17127 */ 17128 uint64_t lt_bw, gp, rate; 17129 17130 gp = rack_get_gp_est(rack); 17131 lt_bw = rack_get_lt_bw(rack); 17132 if (lt_bw > gp) 17133 rate = lt_bw; 17134 else 17135 rate = gp; 17136 rate *= rack_bw_multipler; 17137 rate /= 100; 17138 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 17139 union tcp_log_stackspecific log; 17140 struct timeval tv; 17141 17142 memset(&log, 0, sizeof(log)); 17143 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 17144 log.u_bbr.flex1 = rack_bw_multipler; 17145 log.u_bbr.flex2 = len; 17146 log.u_bbr.cur_del_rate = gp; 17147 log.u_bbr.delRate = lt_bw; 17148 log.u_bbr.bw_inuse = rate; 17149 log.u_bbr.rttProp = fill_bw; 17150 log.u_bbr.flex8 = 44; 17151 tcp_log_event(rack->rc_tp, NULL, NULL, NULL, 17152 BBR_LOG_CWND, 0, 17153 0, &log, false, NULL, 17154 __func__, __LINE__, &tv); 17155 } 17156 if (fill_bw > rate) 17157 fill_bw = rate; 17158 } 17159 /* We are below the min b/w */ 17160 if (non_paced) 17161 *rate_wanted = fill_bw; 17162 if ((fill_bw < RACK_MIN_BW) || (fill_bw < *rate_wanted)) 17163 return (pacing_delay); 17164 rack->r_via_fill_cw = 1; 17165 if (rack->r_rack_hw_rate_caps && 17166 (rack->r_ctl.crte != NULL)) { 17167 uint64_t high_rate; 17168 17169 high_rate = tcp_hw_highest_rate(rack->r_ctl.crte); 17170 if (fill_bw > high_rate) { 17171 /* We are capping bw at the highest rate table entry */ 17172 if (*rate_wanted > high_rate) { 17173 /* The original rate was also capped */ 17174 rack->r_via_fill_cw = 0; 17175 } 17176 rack_log_hdwr_pacing(rack, 17177 fill_bw, high_rate, __LINE__, 17178 0, 3); 17179 fill_bw = high_rate; 17180 if (capped) 17181 *capped = 1; 17182 } 17183 } else if ((rack->r_ctl.crte == NULL) && 17184 (rack->rack_hdrw_pacing == 0) && 17185 (rack->rack_hdw_pace_ena) && 17186 rack->r_rack_hw_rate_caps && 17187 (rack->rack_attempt_hdwr_pace == 0) && 17188 (rack->rc_inp->inp_route.ro_nh != NULL) && 17189 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 17190 /* 17191 * Ok we may have a first attempt that is greater than our top rate 17192 * lets check. 17193 */ 17194 uint64_t high_rate; 17195 17196 high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp); 17197 if (high_rate) { 17198 if (fill_bw > high_rate) { 17199 fill_bw = high_rate; 17200 if (capped) 17201 *capped = 1; 17202 } 17203 } 17204 } 17205 if (rack->r_ctl.bw_rate_cap && (fill_bw > rack->r_ctl.bw_rate_cap)) { 17206 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 17207 fill_bw, 0, 0, HYBRID_LOG_RATE_CAP, 2, NULL, __LINE__); 17208 fill_bw = rack->r_ctl.bw_rate_cap; 17209 } 17210 /* 17211 * Ok fill_bw holds our mythical b/w to fill the cwnd 17212 * in an rtt (unless it was capped), what does that 17213 * time wise equate too? 17214 */ 17215 lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC; 17216 lentim /= fill_bw; 17217 *rate_wanted = fill_bw; 17218 if (non_paced || (lentim < pacing_delay)) { 17219 rack_log_pacing_delay_calc(rack, len, pacing_delay, fill_bw, 17220 0, lentim, 12, __LINE__, NULL, 0); 17221 return ((int32_t)lentim); 17222 } else 17223 return (pacing_delay); 17224 } 17225 17226 static int32_t 17227 rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz, int line) 17228 { 17229 uint64_t srtt; 17230 int32_t pacing_delay = 0; 17231 int can_start_hw_pacing = 1; 17232 int err; 17233 int pace_one; 17234 17235 if (rack_pace_one_seg || 17236 (rack->r_ctl.rc_user_set_min_segs == 1)) 17237 pace_one = 1; 17238 else 17239 pace_one = 0; 17240 if (rack->rc_always_pace == 0) { 17241 /* 17242 * We use the most optimistic possible cwnd/srtt for 17243 * sending calculations. This will make our 17244 * calculation anticipate getting more through 17245 * quicker then possible. But thats ok we don't want 17246 * the peer to have a gap in data sending. 17247 */ 17248 uint64_t cwnd, tr_perms = 0; 17249 int32_t reduce; 17250 17251 old_method: 17252 /* 17253 * We keep no precise pacing with the old method 17254 * instead we use the pacer to mitigate bursts. 17255 */ 17256 if (rack->r_ctl.rc_rack_min_rtt) 17257 srtt = rack->r_ctl.rc_rack_min_rtt; 17258 else 17259 srtt = max(tp->t_srtt, 1); 17260 if (rack->r_ctl.rc_rack_largest_cwnd) 17261 cwnd = rack->r_ctl.rc_rack_largest_cwnd; 17262 else 17263 cwnd = rack->r_ctl.cwnd_to_use; 17264 /* Inflate cwnd by 1000 so srtt of usecs is in ms */ 17265 tr_perms = (cwnd * 1000) / srtt; 17266 if (tr_perms == 0) { 17267 tr_perms = ctf_fixed_maxseg(tp); 17268 } 17269 /* 17270 * Calculate how long this will take to drain, if 17271 * the calculation comes out to zero, thats ok we 17272 * will use send_a_lot to possibly spin around for 17273 * more increasing tot_len_this_send to the point 17274 * that its going to require a pace, or we hit the 17275 * cwnd. Which in that case we are just waiting for 17276 * a ACK. 17277 */ 17278 pacing_delay = len / tr_perms; 17279 /* Now do we reduce the time so we don't run dry? */ 17280 if (pacing_delay && rack_pacing_delay_reduction) { 17281 reduce = (pacing_delay / rack_pacing_delay_reduction); 17282 if (reduce < pacing_delay) { 17283 pacing_delay -= reduce; 17284 } else 17285 pacing_delay = 0; 17286 } else 17287 reduce = 0; 17288 pacing_delay *= HPTS_USEC_IN_MSEC; 17289 if (rack->rc_pace_to_cwnd) { 17290 uint64_t rate_wanted = 0; 17291 17292 pacing_delay = pace_to_fill_cwnd(rack, pacing_delay, len, segsiz, NULL, &rate_wanted, 1); 17293 rack->rc_ack_can_sendout_data = 1; 17294 rack_log_pacing_delay_calc(rack, len, pacing_delay, rate_wanted, 0, 0, 14, __LINE__, NULL, 0); 17295 } else 17296 rack_log_pacing_delay_calc(rack, len, pacing_delay, tr_perms, reduce, 0, 7, __LINE__, NULL, 0); 17297 /*******************************************************/ 17298 /* RRS: We insert non-paced call to stats here for len */ 17299 /*******************************************************/ 17300 } else { 17301 uint64_t bw_est, res, lentim, rate_wanted; 17302 uint32_t segs, oh; 17303 int capped = 0; 17304 int prev_fill; 17305 17306 if ((rack->r_rr_config == 1) && rsm) { 17307 return (rack->r_ctl.rc_min_to); 17308 } 17309 if (rack->use_fixed_rate) { 17310 rate_wanted = bw_est = rack_get_fixed_pacing_bw(rack); 17311 } else if ((rack->r_ctl.init_rate == 0) && 17312 (rack->r_ctl.gp_bw == 0)) { 17313 /* no way to yet do an estimate */ 17314 bw_est = rate_wanted = 0; 17315 } else if (rack->dgp_on) { 17316 bw_est = rack_get_bw(rack); 17317 rate_wanted = rack_get_output_bw(rack, bw_est, rsm, &capped); 17318 } else { 17319 uint32_t gain, rate_set = 0; 17320 17321 rate_wanted = min(rack->rc_tp->snd_cwnd, rack->r_ctl.cwnd_to_use); 17322 rate_wanted = rack_arrive_at_discounted_rate(rack, rate_wanted, &rate_set, &gain); 17323 if (rate_set == 0) { 17324 if (rate_wanted > rack->rc_tp->snd_wnd) 17325 rate_wanted = rack->rc_tp->snd_wnd; 17326 /* Now lets make it into a b/w */ 17327 rate_wanted *= (uint64_t)HPTS_USEC_IN_SEC; 17328 rate_wanted /= (uint64_t)rack->r_ctl.rc_last_us_rtt; 17329 } 17330 bw_est = rate_wanted; 17331 rack_log_pacing_delay_calc(rack, rack->rc_tp->snd_cwnd, 17332 rack->r_ctl.cwnd_to_use, 17333 rate_wanted, bw_est, 17334 rack->r_ctl.rc_last_us_rtt, 17335 88, __LINE__, NULL, gain); 17336 } 17337 if (((bw_est == 0) || (rate_wanted == 0) || (rack->gp_ready == 0)) && 17338 (rack->use_fixed_rate == 0)) { 17339 /* 17340 * No way yet to make a b/w estimate or 17341 * our raise is set incorrectly. 17342 */ 17343 goto old_method; 17344 } 17345 rack_rate_cap_bw(rack, &rate_wanted, &capped); 17346 /* We need to account for all the overheads */ 17347 segs = (len + segsiz - 1) / segsiz; 17348 /* 17349 * We need the diff between 1514 bytes (e-mtu with e-hdr) 17350 * and how much data we put in each packet. Yes this 17351 * means we may be off if we are larger than 1500 bytes 17352 * or smaller. But this just makes us more conservative. 17353 */ 17354 17355 oh = (tp->t_maxseg - segsiz) + sizeof(struct tcphdr); 17356 if (rack->r_is_v6) { 17357 #ifdef INET6 17358 oh += sizeof(struct ip6_hdr); 17359 #endif 17360 } else { 17361 #ifdef INET 17362 oh += sizeof(struct ip); 17363 #endif 17364 } 17365 /* We add a fixed 14 for the ethernet header */ 17366 oh += 14; 17367 segs *= oh; 17368 lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC; 17369 res = lentim / rate_wanted; 17370 pacing_delay = (uint32_t)res; 17371 if (rack_hw_rate_min && 17372 (rate_wanted < rack_hw_rate_min)) { 17373 can_start_hw_pacing = 0; 17374 if (rack->r_ctl.crte) { 17375 /* 17376 * Ok we need to release it, we 17377 * have fallen too low. 17378 */ 17379 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 17380 rack->r_ctl.crte = NULL; 17381 rack->rack_attempt_hdwr_pace = 0; 17382 rack->rack_hdrw_pacing = 0; 17383 } 17384 } 17385 if (rack->r_ctl.crte && 17386 (tcp_hw_highest_rate(rack->r_ctl.crte) < rate_wanted)) { 17387 /* 17388 * We want more than the hardware can give us, 17389 * don't start any hw pacing. 17390 */ 17391 can_start_hw_pacing = 0; 17392 if (rack->r_rack_hw_rate_caps == 0) { 17393 /* 17394 * Ok we need to release it, we 17395 * want more than the card can give us and 17396 * no rate cap is in place. Set it up so 17397 * when we want less we can retry. 17398 */ 17399 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 17400 rack->r_ctl.crte = NULL; 17401 rack->rack_attempt_hdwr_pace = 0; 17402 rack->rack_hdrw_pacing = 0; 17403 } 17404 } 17405 if ((rack->r_ctl.crte != NULL) && (rack->rc_inp->inp_snd_tag == NULL)) { 17406 /* 17407 * We lost our rate somehow, this can happen 17408 * if the interface changed underneath us. 17409 */ 17410 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 17411 rack->r_ctl.crte = NULL; 17412 /* Lets re-allow attempting to setup pacing */ 17413 rack->rack_hdrw_pacing = 0; 17414 rack->rack_attempt_hdwr_pace = 0; 17415 rack_log_hdwr_pacing(rack, 17416 rate_wanted, bw_est, __LINE__, 17417 0, 6); 17418 } 17419 prev_fill = rack->r_via_fill_cw; 17420 if ((rack->rc_pace_to_cwnd) && 17421 (capped == 0) && 17422 (rack->dgp_on == 1) && 17423 (rack->use_fixed_rate == 0) && 17424 (rack->in_probe_rtt == 0) && 17425 (IN_FASTRECOVERY(rack->rc_tp->t_flags) == 0)) { 17426 /* 17427 * We want to pace at our rate *or* faster to 17428 * fill the cwnd to the max if its not full. 17429 */ 17430 pacing_delay = pace_to_fill_cwnd(rack, pacing_delay, (len+segs), segsiz, &capped, &rate_wanted, 0); 17431 /* Re-check to make sure we are not exceeding our max b/w */ 17432 if ((rack->r_ctl.crte != NULL) && 17433 (tcp_hw_highest_rate(rack->r_ctl.crte) < rate_wanted)) { 17434 /* 17435 * We want more than the hardware can give us, 17436 * don't start any hw pacing. 17437 */ 17438 can_start_hw_pacing = 0; 17439 if (rack->r_rack_hw_rate_caps == 0) { 17440 /* 17441 * Ok we need to release it, we 17442 * want more than the card can give us and 17443 * no rate cap is in place. Set it up so 17444 * when we want less we can retry. 17445 */ 17446 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 17447 rack->r_ctl.crte = NULL; 17448 rack->rack_attempt_hdwr_pace = 0; 17449 rack->rack_hdrw_pacing = 0; 17450 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 17451 } 17452 } 17453 } 17454 if ((rack->rc_inp->inp_route.ro_nh != NULL) && 17455 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 17456 if ((rack->rack_hdw_pace_ena) && 17457 (can_start_hw_pacing > 0) && 17458 (rack->rack_hdrw_pacing == 0) && 17459 (rack->rack_attempt_hdwr_pace == 0)) { 17460 /* 17461 * Lets attempt to turn on hardware pacing 17462 * if we can. 17463 */ 17464 rack->rack_attempt_hdwr_pace = 1; 17465 rack->r_ctl.crte = tcp_set_pacing_rate(rack->rc_tp, 17466 rack->rc_inp->inp_route.ro_nh->nh_ifp, 17467 rate_wanted, 17468 RS_PACING_GEQ, 17469 &err, &rack->r_ctl.crte_prev_rate); 17470 if (rack->r_ctl.crte) { 17471 rack->rack_hdrw_pacing = 1; 17472 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size_w_divisor(tp, rate_wanted, segsiz, 17473 pace_one, rack->r_ctl.crte, 17474 NULL, rack->r_ctl.pace_len_divisor); 17475 rack_log_hdwr_pacing(rack, 17476 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 17477 err, 0); 17478 rack->r_ctl.last_hw_bw_req = rate_wanted; 17479 } else { 17480 counter_u64_add(rack_hw_pace_init_fail, 1); 17481 } 17482 } else if (rack->rack_hdrw_pacing && 17483 (rack->r_ctl.last_hw_bw_req != rate_wanted)) { 17484 /* Do we need to adjust our rate? */ 17485 const struct tcp_hwrate_limit_table *nrte; 17486 17487 if (rack->r_up_only && 17488 (rate_wanted < rack->r_ctl.crte->rate)) { 17489 /** 17490 * We have four possible states here 17491 * having to do with the previous time 17492 * and this time. 17493 * previous | this-time 17494 * A) 0 | 0 -- fill_cw not in the picture 17495 * B) 1 | 0 -- we were doing a fill-cw but now are not 17496 * C) 1 | 1 -- all rates from fill_cw 17497 * D) 0 | 1 -- we were doing non-fill and now we are filling 17498 * 17499 * For case A, C and D we don't allow a drop. But for 17500 * case B where we now our on our steady rate we do 17501 * allow a drop. 17502 * 17503 */ 17504 if (!((prev_fill == 1) && (rack->r_via_fill_cw == 0))) 17505 goto done_w_hdwr; 17506 } 17507 if ((rate_wanted > rack->r_ctl.crte->rate) || 17508 (rate_wanted <= rack->r_ctl.crte_prev_rate)) { 17509 if (rack_hw_rate_to_low && 17510 (bw_est < rack_hw_rate_to_low)) { 17511 /* 17512 * The pacing rate is too low for hardware, but 17513 * do allow hardware pacing to be restarted. 17514 */ 17515 rack_log_hdwr_pacing(rack, 17516 bw_est, rack->r_ctl.crte->rate, __LINE__, 17517 0, 5); 17518 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 17519 rack->r_ctl.crte = NULL; 17520 rack->rack_attempt_hdwr_pace = 0; 17521 rack->rack_hdrw_pacing = 0; 17522 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted); 17523 goto done_w_hdwr; 17524 } 17525 nrte = tcp_chg_pacing_rate(rack->r_ctl.crte, 17526 rack->rc_tp, 17527 rack->rc_inp->inp_route.ro_nh->nh_ifp, 17528 rate_wanted, 17529 RS_PACING_GEQ, 17530 &err, &rack->r_ctl.crte_prev_rate); 17531 if (nrte == NULL) { 17532 /* 17533 * Lost the rate, lets drop hardware pacing 17534 * period. 17535 */ 17536 rack->rack_hdrw_pacing = 0; 17537 rack->r_ctl.crte = NULL; 17538 rack_log_hdwr_pacing(rack, 17539 rate_wanted, 0, __LINE__, 17540 err, 1); 17541 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted); 17542 counter_u64_add(rack_hw_pace_lost, 1); 17543 } else if (nrte != rack->r_ctl.crte) { 17544 rack->r_ctl.crte = nrte; 17545 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size_w_divisor(tp, rate_wanted, 17546 segsiz, pace_one, rack->r_ctl.crte, 17547 NULL, rack->r_ctl.pace_len_divisor); 17548 rack_log_hdwr_pacing(rack, 17549 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 17550 err, 2); 17551 rack->r_ctl.last_hw_bw_req = rate_wanted; 17552 } 17553 } else { 17554 /* We just need to adjust the segment size */ 17555 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted); 17556 rack_log_hdwr_pacing(rack, 17557 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 17558 0, 4); 17559 rack->r_ctl.last_hw_bw_req = rate_wanted; 17560 } 17561 } 17562 } 17563 done_w_hdwr: 17564 if (rack_limit_time_with_srtt && 17565 (rack->use_fixed_rate == 0) && 17566 (rack->rack_hdrw_pacing == 0)) { 17567 /* 17568 * Sanity check, we do not allow the pacing delay 17569 * to be longer than the SRTT of the path. If it is 17570 * a slow path, then adding a packet should increase 17571 * the RTT and compensate for this i.e. the srtt will 17572 * be greater so the allowed pacing time will be greater. 17573 * 17574 * Note this restriction is not for where a peak rate 17575 * is set, we are doing fixed pacing or hardware pacing. 17576 */ 17577 if (rack->rc_tp->t_srtt) 17578 srtt = rack->rc_tp->t_srtt; 17579 else 17580 srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC; /* its in ms convert */ 17581 if (srtt < (uint64_t)pacing_delay) { 17582 rack_log_pacing_delay_calc(rack, srtt, pacing_delay, rate_wanted, bw_est, lentim, 99, __LINE__, NULL, 0); 17583 pacing_delay = srtt; 17584 } 17585 } 17586 /*******************************************************************/ 17587 /* RRS: We insert paced call to stats here for len and rate_wanted */ 17588 /*******************************************************************/ 17589 rack_log_pacing_delay_calc(rack, len, pacing_delay, rate_wanted, bw_est, lentim, 2, __LINE__, rsm, 0); 17590 } 17591 if (rack->r_ctl.crte && (rack->r_ctl.crte->rs_num_enobufs > 0)) { 17592 /* 17593 * If this rate is seeing enobufs when it 17594 * goes to send then either the nic is out 17595 * of gas or we are mis-estimating the time 17596 * somehow and not letting the queue empty 17597 * completely. Lets add to the pacing time. 17598 */ 17599 int hw_boost_delay; 17600 17601 hw_boost_delay = rack->r_ctl.crte->time_between * rack_enobuf_hw_boost_mult; 17602 if (hw_boost_delay > rack_enobuf_hw_max) 17603 hw_boost_delay = rack_enobuf_hw_max; 17604 else if (hw_boost_delay < rack_enobuf_hw_min) 17605 hw_boost_delay = rack_enobuf_hw_min; 17606 pacing_delay += hw_boost_delay; 17607 } 17608 return (pacing_delay); 17609 } 17610 17611 static void 17612 rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack, 17613 tcp_seq startseq, uint32_t sb_offset) 17614 { 17615 struct rack_sendmap *my_rsm = NULL; 17616 17617 if (tp->t_state < TCPS_ESTABLISHED) { 17618 /* 17619 * We don't start any measurements if we are 17620 * not at least established. 17621 */ 17622 return; 17623 } 17624 if (tp->t_state >= TCPS_FIN_WAIT_1) { 17625 /* 17626 * We will get no more data into the SB 17627 * this means we need to have the data available 17628 * before we start a measurement. 17629 */ 17630 17631 if (sbavail(&tptosocket(tp)->so_snd) < 17632 max(rc_init_window(rack), 17633 (MIN_GP_WIN * ctf_fixed_maxseg(tp)))) { 17634 /* Nope not enough data */ 17635 return; 17636 } 17637 } 17638 tp->t_flags |= TF_GPUTINPROG; 17639 rack->r_ctl.rc_gp_cumack_ts = 0; 17640 rack->r_ctl.rc_gp_lowrtt = 0xffffffff; 17641 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 17642 tp->gput_seq = startseq; 17643 rack->app_limited_needs_set = 0; 17644 if (rack->in_probe_rtt) 17645 rack->measure_saw_probe_rtt = 1; 17646 else if ((rack->measure_saw_probe_rtt) && 17647 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 17648 rack->measure_saw_probe_rtt = 0; 17649 if (rack->rc_gp_filled) 17650 tp->gput_ts = rack->r_ctl.last_cumack_advance; 17651 else { 17652 /* Special case initial measurement */ 17653 struct timeval tv; 17654 17655 tp->gput_ts = tcp_get_usecs(&tv); 17656 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); 17657 } 17658 /* 17659 * We take a guess out into the future, 17660 * if we have no measurement and no 17661 * initial rate, we measure the first 17662 * initial-windows worth of data to 17663 * speed up getting some GP measurement and 17664 * thus start pacing. 17665 */ 17666 if ((rack->rc_gp_filled == 0) && (rack->r_ctl.init_rate == 0)) { 17667 rack->app_limited_needs_set = 1; 17668 tp->gput_ack = startseq + max(rc_init_window(rack), 17669 (MIN_GP_WIN * ctf_fixed_maxseg(tp))); 17670 rack_log_pacing_delay_calc(rack, 17671 tp->gput_seq, 17672 tp->gput_ack, 17673 0, 17674 tp->gput_ts, 17675 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts), 17676 9, 17677 __LINE__, NULL, 0); 17678 rack_tend_gp_marks(tp, rack); 17679 rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL); 17680 return; 17681 } 17682 if (sb_offset) { 17683 /* 17684 * We are out somewhere in the sb 17685 * can we use the already outstanding data? 17686 */ 17687 17688 if (rack->r_ctl.rc_app_limited_cnt == 0) { 17689 /* 17690 * Yes first one is good and in this case 17691 * the tp->gput_ts is correctly set based on 17692 * the last ack that arrived (no need to 17693 * set things up when an ack comes in). 17694 */ 17695 my_rsm = tqhash_min(rack->r_ctl.tqh); 17696 if ((my_rsm == NULL) || 17697 (my_rsm->r_rtr_cnt != 1)) { 17698 /* retransmission? */ 17699 goto use_latest; 17700 } 17701 } else { 17702 if (rack->r_ctl.rc_first_appl == NULL) { 17703 /* 17704 * If rc_first_appl is NULL 17705 * then the cnt should be 0. 17706 * This is probably an error, maybe 17707 * a KASSERT would be approprate. 17708 */ 17709 goto use_latest; 17710 } 17711 /* 17712 * If we have a marker pointer to the last one that is 17713 * app limited we can use that, but we need to set 17714 * things up so that when it gets ack'ed we record 17715 * the ack time (if its not already acked). 17716 */ 17717 rack->app_limited_needs_set = 1; 17718 /* 17719 * We want to get to the rsm that is either 17720 * next with space i.e. over 1 MSS or the one 17721 * after that (after the app-limited). 17722 */ 17723 my_rsm = tqhash_next(rack->r_ctl.tqh, rack->r_ctl.rc_first_appl); 17724 if (my_rsm) { 17725 if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp)) 17726 /* Have to use the next one */ 17727 my_rsm = tqhash_next(rack->r_ctl.tqh, my_rsm); 17728 else { 17729 /* Use after the first MSS of it is acked */ 17730 tp->gput_seq = my_rsm->r_start + ctf_fixed_maxseg(tp); 17731 goto start_set; 17732 } 17733 } 17734 if ((my_rsm == NULL) || 17735 (my_rsm->r_rtr_cnt != 1)) { 17736 /* 17737 * Either its a retransmit or 17738 * the last is the app-limited one. 17739 */ 17740 goto use_latest; 17741 } 17742 } 17743 tp->gput_seq = my_rsm->r_start; 17744 start_set: 17745 if (my_rsm->r_flags & RACK_ACKED) { 17746 /* 17747 * This one has been acked use the arrival ack time 17748 */ 17749 struct rack_sendmap *nrsm; 17750 17751 tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival; 17752 rack->app_limited_needs_set = 0; 17753 /* 17754 * Ok in this path we need to use the r_end now 17755 * since this guy is the starting ack. 17756 */ 17757 tp->gput_seq = my_rsm->r_end; 17758 /* 17759 * We also need to adjust up the sendtime 17760 * to the send of the next data after my_rsm. 17761 */ 17762 nrsm = tqhash_next(rack->r_ctl.tqh, my_rsm); 17763 if (nrsm != NULL) 17764 my_rsm = nrsm; 17765 else { 17766 /* 17767 * The next as not been sent, thats the 17768 * case for using the latest. 17769 */ 17770 goto use_latest; 17771 } 17772 } 17773 rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[0]; 17774 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 17775 rack->r_ctl.rc_gp_cumack_ts = 0; 17776 if ((rack->r_ctl.cleared_app_ack == 1) && 17777 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.cleared_app_ack_seq))) { 17778 /* 17779 * We just cleared an application limited period 17780 * so the next seq out needs to skip the first 17781 * ack. 17782 */ 17783 rack->app_limited_needs_set = 1; 17784 rack->r_ctl.cleared_app_ack = 0; 17785 } 17786 rack_log_pacing_delay_calc(rack, 17787 tp->gput_seq, 17788 tp->gput_ack, 17789 (uintptr_t)my_rsm, 17790 tp->gput_ts, 17791 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts), 17792 9, 17793 __LINE__, my_rsm, 0); 17794 /* Now lets make sure all are marked as they should be */ 17795 rack_tend_gp_marks(tp, rack); 17796 rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL); 17797 return; 17798 } 17799 17800 use_latest: 17801 /* 17802 * We don't know how long we may have been 17803 * idle or if this is the first-send. Lets 17804 * setup the flag so we will trim off 17805 * the first ack'd data so we get a true 17806 * measurement. 17807 */ 17808 rack->app_limited_needs_set = 1; 17809 tp->gput_ack = startseq + rack_get_measure_window(tp, rack); 17810 rack->r_ctl.rc_gp_cumack_ts = 0; 17811 /* Find this guy so we can pull the send time */ 17812 my_rsm = tqhash_find(rack->r_ctl.tqh, startseq); 17813 if (my_rsm) { 17814 rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[0]; 17815 if (my_rsm->r_flags & RACK_ACKED) { 17816 /* 17817 * Unlikely since its probably what was 17818 * just transmitted (but I am paranoid). 17819 */ 17820 tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival; 17821 rack->app_limited_needs_set = 0; 17822 } 17823 if (SEQ_LT(my_rsm->r_start, tp->gput_seq)) { 17824 /* This also is unlikely */ 17825 tp->gput_seq = my_rsm->r_start; 17826 } 17827 } else { 17828 /* 17829 * TSNH unless we have some send-map limit, 17830 * and even at that it should not be hitting 17831 * that limit (we should have stopped sending). 17832 */ 17833 struct timeval tv; 17834 17835 microuptime(&tv); 17836 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); 17837 } 17838 rack_tend_gp_marks(tp, rack); 17839 rack_log_pacing_delay_calc(rack, 17840 tp->gput_seq, 17841 tp->gput_ack, 17842 (uintptr_t)my_rsm, 17843 tp->gput_ts, 17844 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts), 17845 9, __LINE__, NULL, 0); 17846 rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL); 17847 } 17848 17849 static inline uint32_t 17850 rack_what_can_we_send(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cwnd_to_use, 17851 uint32_t avail, int32_t sb_offset) 17852 { 17853 uint32_t len; 17854 uint32_t sendwin; 17855 17856 if (tp->snd_wnd > cwnd_to_use) 17857 sendwin = cwnd_to_use; 17858 else 17859 sendwin = tp->snd_wnd; 17860 if (ctf_outstanding(tp) >= tp->snd_wnd) { 17861 /* We never want to go over our peers rcv-window */ 17862 len = 0; 17863 } else { 17864 uint32_t flight; 17865 17866 flight = ctf_flight_size(tp, rack->r_ctl.rc_sacked); 17867 if (flight >= sendwin) { 17868 /* 17869 * We have in flight what we are allowed by cwnd (if 17870 * it was rwnd blocking it would have hit above out 17871 * >= tp->snd_wnd). 17872 */ 17873 return (0); 17874 } 17875 len = sendwin - flight; 17876 if ((len + ctf_outstanding(tp)) > tp->snd_wnd) { 17877 /* We would send too much (beyond the rwnd) */ 17878 len = tp->snd_wnd - ctf_outstanding(tp); 17879 } 17880 if ((len + sb_offset) > avail) { 17881 /* 17882 * We don't have that much in the SB, how much is 17883 * there? 17884 */ 17885 len = avail - sb_offset; 17886 } 17887 } 17888 return (len); 17889 } 17890 17891 static void 17892 rack_log_fsb(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t flags, 17893 unsigned ipoptlen, int32_t orig_len, int32_t len, int error, 17894 int rsm_is_null, int optlen, int line, uint16_t mode) 17895 { 17896 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 17897 union tcp_log_stackspecific log; 17898 struct timeval tv; 17899 17900 memset(&log, 0, sizeof(log)); 17901 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 17902 log.u_bbr.flex1 = error; 17903 log.u_bbr.flex2 = flags; 17904 log.u_bbr.flex3 = rsm_is_null; 17905 log.u_bbr.flex4 = ipoptlen; 17906 log.u_bbr.flex5 = tp->rcv_numsacks; 17907 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 17908 log.u_bbr.flex7 = optlen; 17909 log.u_bbr.flex8 = rack->r_fsb_inited; 17910 log.u_bbr.applimited = rack->r_fast_output; 17911 log.u_bbr.bw_inuse = rack_get_bw(rack); 17912 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); 17913 log.u_bbr.cwnd_gain = mode; 17914 log.u_bbr.pkts_out = orig_len; 17915 log.u_bbr.lt_epoch = len; 17916 log.u_bbr.delivered = line; 17917 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 17918 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 17919 tcp_log_event(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_FSB, 0, 17920 len, &log, false, NULL, __func__, __LINE__, &tv); 17921 } 17922 } 17923 17924 17925 static struct mbuf * 17926 rack_fo_base_copym(struct mbuf *the_m, uint32_t the_off, int32_t *plen, 17927 struct rack_fast_send_blk *fsb, 17928 int32_t seglimit, int32_t segsize, int hw_tls) 17929 { 17930 #ifdef KERN_TLS 17931 struct ktls_session *tls, *ntls; 17932 #ifdef INVARIANTS 17933 struct mbuf *start; 17934 #endif 17935 #endif 17936 struct mbuf *m, *n, **np, *smb; 17937 struct mbuf *top; 17938 int32_t off, soff; 17939 int32_t len = *plen; 17940 int32_t fragsize; 17941 int32_t len_cp = 0; 17942 uint32_t mlen, frags; 17943 17944 soff = off = the_off; 17945 smb = m = the_m; 17946 np = ⊤ 17947 top = NULL; 17948 #ifdef KERN_TLS 17949 if (hw_tls && (m->m_flags & M_EXTPG)) 17950 tls = m->m_epg_tls; 17951 else 17952 tls = NULL; 17953 #ifdef INVARIANTS 17954 start = m; 17955 #endif 17956 #endif 17957 while (len > 0) { 17958 if (m == NULL) { 17959 *plen = len_cp; 17960 break; 17961 } 17962 #ifdef KERN_TLS 17963 if (hw_tls) { 17964 if (m->m_flags & M_EXTPG) 17965 ntls = m->m_epg_tls; 17966 else 17967 ntls = NULL; 17968 17969 /* 17970 * Avoid mixing TLS records with handshake 17971 * data or TLS records from different 17972 * sessions. 17973 */ 17974 if (tls != ntls) { 17975 MPASS(m != start); 17976 *plen = len_cp; 17977 break; 17978 } 17979 } 17980 #endif 17981 mlen = min(len, m->m_len - off); 17982 if (seglimit) { 17983 /* 17984 * For M_EXTPG mbufs, add 3 segments 17985 * + 1 in case we are crossing page boundaries 17986 * + 2 in case the TLS hdr/trailer are used 17987 * It is cheaper to just add the segments 17988 * than it is to take the cache miss to look 17989 * at the mbuf ext_pgs state in detail. 17990 */ 17991 if (m->m_flags & M_EXTPG) { 17992 fragsize = min(segsize, PAGE_SIZE); 17993 frags = 3; 17994 } else { 17995 fragsize = segsize; 17996 frags = 0; 17997 } 17998 17999 /* Break if we really can't fit anymore. */ 18000 if ((frags + 1) >= seglimit) { 18001 *plen = len_cp; 18002 break; 18003 } 18004 18005 /* 18006 * Reduce size if you can't copy the whole 18007 * mbuf. If we can't copy the whole mbuf, also 18008 * adjust len so the loop will end after this 18009 * mbuf. 18010 */ 18011 if ((frags + howmany(mlen, fragsize)) >= seglimit) { 18012 mlen = (seglimit - frags - 1) * fragsize; 18013 len = mlen; 18014 *plen = len_cp + len; 18015 } 18016 frags += howmany(mlen, fragsize); 18017 if (frags == 0) 18018 frags++; 18019 seglimit -= frags; 18020 KASSERT(seglimit > 0, 18021 ("%s: seglimit went too low", __func__)); 18022 } 18023 n = m_get(M_NOWAIT, m->m_type); 18024 *np = n; 18025 if (n == NULL) 18026 goto nospace; 18027 n->m_len = mlen; 18028 soff += mlen; 18029 len_cp += n->m_len; 18030 if (m->m_flags & (M_EXT | M_EXTPG)) { 18031 n->m_data = m->m_data + off; 18032 mb_dupcl(n, m); 18033 } else { 18034 bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), 18035 (u_int)n->m_len); 18036 } 18037 len -= n->m_len; 18038 off = 0; 18039 m = m->m_next; 18040 np = &n->m_next; 18041 if (len || (soff == smb->m_len)) { 18042 /* 18043 * We have more so we move forward or 18044 * we have consumed the entire mbuf and 18045 * len has fell to 0. 18046 */ 18047 soff = 0; 18048 smb = m; 18049 } 18050 18051 } 18052 if (fsb != NULL) { 18053 fsb->m = smb; 18054 fsb->off = soff; 18055 if (smb) { 18056 /* 18057 * Save off the size of the mbuf. We do 18058 * this so that we can recognize when it 18059 * has been trimmed by sbcut() as acks 18060 * come in. 18061 */ 18062 fsb->o_m_len = smb->m_len; 18063 fsb->o_t_len = M_TRAILINGROOM(smb); 18064 } else { 18065 /* 18066 * This is the case where the next mbuf went to NULL. This 18067 * means with this copy we have sent everything in the sb. 18068 * In theory we could clear the fast_output flag, but lets 18069 * not since its possible that we could get more added 18070 * and acks that call the extend function which would let 18071 * us send more. 18072 */ 18073 fsb->o_m_len = 0; 18074 fsb->o_t_len = 0; 18075 } 18076 } 18077 return (top); 18078 nospace: 18079 if (top) 18080 m_freem(top); 18081 return (NULL); 18082 18083 } 18084 18085 /* 18086 * This is a copy of m_copym(), taking the TSO segment size/limit 18087 * constraints into account, and advancing the sndptr as it goes. 18088 */ 18089 static struct mbuf * 18090 rack_fo_m_copym(struct tcp_rack *rack, int32_t *plen, 18091 int32_t seglimit, int32_t segsize, struct mbuf **s_mb, int *s_soff) 18092 { 18093 struct mbuf *m, *n; 18094 int32_t soff; 18095 18096 m = rack->r_ctl.fsb.m; 18097 if (M_TRAILINGROOM(m) != rack->r_ctl.fsb.o_t_len) { 18098 /* 18099 * The trailing space changed, mbufs can grow 18100 * at the tail but they can't shrink from 18101 * it, KASSERT that. Adjust the orig_m_len to 18102 * compensate for this change. 18103 */ 18104 KASSERT((rack->r_ctl.fsb.o_t_len > M_TRAILINGROOM(m)), 18105 ("mbuf:%p rack:%p trailing_space:%jd ots:%u oml:%u mlen:%u\n", 18106 m, 18107 rack, 18108 (intmax_t)M_TRAILINGROOM(m), 18109 rack->r_ctl.fsb.o_t_len, 18110 rack->r_ctl.fsb.o_m_len, 18111 m->m_len)); 18112 rack->r_ctl.fsb.o_m_len += (rack->r_ctl.fsb.o_t_len - M_TRAILINGROOM(m)); 18113 rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(m); 18114 } 18115 if (m->m_len < rack->r_ctl.fsb.o_m_len) { 18116 /* 18117 * Mbuf shrank, trimmed off the top by an ack, our 18118 * offset changes. 18119 */ 18120 KASSERT((rack->r_ctl.fsb.off >= (rack->r_ctl.fsb.o_m_len - m->m_len)), 18121 ("mbuf:%p len:%u rack:%p oml:%u soff:%u\n", 18122 m, m->m_len, 18123 rack, rack->r_ctl.fsb.o_m_len, 18124 rack->r_ctl.fsb.off)); 18125 18126 if (rack->r_ctl.fsb.off >= (rack->r_ctl.fsb.o_m_len- m->m_len)) 18127 rack->r_ctl.fsb.off -= (rack->r_ctl.fsb.o_m_len - m->m_len); 18128 else 18129 rack->r_ctl.fsb.off = 0; 18130 rack->r_ctl.fsb.o_m_len = m->m_len; 18131 #ifdef INVARIANTS 18132 } else if (m->m_len > rack->r_ctl.fsb.o_m_len) { 18133 panic("rack:%p m:%p m_len grew outside of t_space compensation", 18134 rack, m); 18135 #endif 18136 } 18137 soff = rack->r_ctl.fsb.off; 18138 KASSERT(soff >= 0, ("%s, negative off %d", __FUNCTION__, soff)); 18139 KASSERT(*plen >= 0, ("%s, negative len %d", __FUNCTION__, *plen)); 18140 KASSERT(soff < m->m_len, ("%s rack:%p len:%u m:%p m->m_len:%u < off?", 18141 __FUNCTION__, 18142 rack, *plen, m, m->m_len)); 18143 /* Save off the right location before we copy and advance */ 18144 *s_soff = soff; 18145 *s_mb = rack->r_ctl.fsb.m; 18146 n = rack_fo_base_copym(m, soff, plen, 18147 &rack->r_ctl.fsb, 18148 seglimit, segsize, rack->r_ctl.fsb.hw_tls); 18149 return (n); 18150 } 18151 18152 /* Log the buffer level */ 18153 static void 18154 rack_log_queue_level(struct tcpcb *tp, struct tcp_rack *rack, 18155 int len, struct timeval *tv, 18156 uint32_t cts) 18157 { 18158 uint32_t p_rate = 0, p_queue = 0, err = 0; 18159 union tcp_log_stackspecific log; 18160 18161 #ifdef RATELIMIT 18162 err = in_pcbquery_txrlevel(rack->rc_inp, &p_queue); 18163 err = in_pcbquery_txrtlmt(rack->rc_inp, &p_rate); 18164 #endif 18165 memset(&log, 0, sizeof(log)); 18166 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 18167 log.u_bbr.flex1 = p_rate; 18168 log.u_bbr.flex2 = p_queue; 18169 log.u_bbr.flex4 = (uint32_t)rack->r_ctl.crte->using; 18170 log.u_bbr.flex5 = (uint32_t)rack->r_ctl.crte->rs_num_enobufs; 18171 log.u_bbr.flex6 = rack->r_ctl.crte->time_between; 18172 log.u_bbr.flex7 = 99; 18173 log.u_bbr.flex8 = 0; 18174 log.u_bbr.pkts_out = err; 18175 log.u_bbr.delRate = rack->r_ctl.crte->rate; 18176 log.u_bbr.timeStamp = cts; 18177 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 18178 tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_HDWR_PACE, 0, 18179 len, &log, false, NULL, __func__, __LINE__, tv); 18180 18181 } 18182 18183 static uint32_t 18184 rack_check_queue_level(struct tcp_rack *rack, struct tcpcb *tp, 18185 struct timeval *tv, uint32_t cts, int len, uint32_t segsiz) 18186 { 18187 uint64_t lentime = 0; 18188 #ifdef RATELIMIT 18189 uint32_t p_rate = 0, p_queue = 0, err; 18190 union tcp_log_stackspecific log; 18191 uint64_t bw; 18192 18193 err = in_pcbquery_txrlevel(rack->rc_inp, &p_queue); 18194 /* Failed or queue is zero */ 18195 if (err || (p_queue == 0)) { 18196 lentime = 0; 18197 goto out; 18198 } 18199 err = in_pcbquery_txrtlmt(rack->rc_inp, &p_rate); 18200 if (err) { 18201 lentime = 0; 18202 goto out; 18203 } 18204 /* 18205 * If we reach here we have some bytes in 18206 * the queue. The number returned is a value 18207 * between 0 and 0xffff where ffff is full 18208 * and 0 is empty. So how best to make this into 18209 * something usable? 18210 * 18211 * The "safer" way is lets take the b/w gotten 18212 * from the query (which should be our b/w rate) 18213 * and pretend that a full send (our rc_pace_max_segs) 18214 * is outstanding. We factor it so its as if a full 18215 * number of our MSS segment is terms of full 18216 * ethernet segments are outstanding. 18217 */ 18218 bw = p_rate / 8; 18219 if (bw) { 18220 lentime = (rack->r_ctl.rc_pace_max_segs / segsiz); 18221 lentime *= ETHERNET_SEGMENT_SIZE; 18222 lentime *= (uint64_t)HPTS_USEC_IN_SEC; 18223 lentime /= bw; 18224 } else { 18225 /* TSNH -- KASSERT? */ 18226 lentime = 0; 18227 } 18228 out: 18229 if (tcp_bblogging_on(tp)) { 18230 memset(&log, 0, sizeof(log)); 18231 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 18232 log.u_bbr.flex1 = p_rate; 18233 log.u_bbr.flex2 = p_queue; 18234 log.u_bbr.flex4 = (uint32_t)rack->r_ctl.crte->using; 18235 log.u_bbr.flex5 = (uint32_t)rack->r_ctl.crte->rs_num_enobufs; 18236 log.u_bbr.flex6 = rack->r_ctl.crte->time_between; 18237 log.u_bbr.flex7 = 99; 18238 log.u_bbr.flex8 = 0; 18239 log.u_bbr.pkts_out = err; 18240 log.u_bbr.delRate = rack->r_ctl.crte->rate; 18241 log.u_bbr.cur_del_rate = lentime; 18242 log.u_bbr.timeStamp = cts; 18243 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 18244 tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_HDWR_PACE, 0, 18245 len, &log, false, NULL, __func__, __LINE__,tv); 18246 } 18247 #endif 18248 return ((uint32_t)lentime); 18249 } 18250 18251 static int 18252 rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, 18253 uint64_t ts_val, uint32_t cts, uint32_t ms_cts, struct timeval *tv, int len, uint8_t doing_tlp) 18254 { 18255 /* 18256 * Enter the fast retransmit path. We are given that a sched_pin is 18257 * in place (if accounting is compliled in) and the cycle count taken 18258 * at the entry is in the ts_val. The concept her is that the rsm 18259 * now holds the mbuf offsets and such so we can directly transmit 18260 * without a lot of overhead, the len field is already set for 18261 * us to prohibit us from sending too much (usually its 1MSS). 18262 */ 18263 struct ip *ip = NULL; 18264 struct udphdr *udp = NULL; 18265 struct tcphdr *th = NULL; 18266 struct mbuf *m = NULL; 18267 struct inpcb *inp; 18268 uint8_t *cpto; 18269 struct tcp_log_buffer *lgb; 18270 #ifdef TCP_ACCOUNTING 18271 uint64_t crtsc; 18272 int cnt_thru = 1; 18273 #endif 18274 struct tcpopt to; 18275 u_char opt[TCP_MAXOLEN]; 18276 uint32_t hdrlen, optlen; 18277 int32_t pacing_delay, segsiz, max_val, tso = 0, error = 0, ulen = 0; 18278 uint16_t flags; 18279 uint32_t if_hw_tsomaxsegcount = 0, startseq; 18280 uint32_t if_hw_tsomaxsegsize; 18281 int32_t ip_sendflag = IP_NO_SND_TAG_RL; 18282 18283 #ifdef INET6 18284 struct ip6_hdr *ip6 = NULL; 18285 18286 if (rack->r_is_v6) { 18287 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 18288 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 18289 } else 18290 #endif /* INET6 */ 18291 { 18292 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 18293 hdrlen = sizeof(struct tcpiphdr); 18294 } 18295 if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) { 18296 goto failed; 18297 } 18298 if (doing_tlp) { 18299 /* Its a TLP add the flag, it may already be there but be sure */ 18300 rsm->r_flags |= RACK_TLP; 18301 } else { 18302 /* If it was a TLP it is not not on this retransmit */ 18303 rsm->r_flags &= ~RACK_TLP; 18304 } 18305 startseq = rsm->r_start; 18306 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 18307 inp = rack->rc_inp; 18308 to.to_flags = 0; 18309 flags = tcp_outflags[tp->t_state]; 18310 if (flags & (TH_SYN|TH_RST)) { 18311 goto failed; 18312 } 18313 if (rsm->r_flags & RACK_HAS_FIN) { 18314 /* We can't send a FIN here */ 18315 goto failed; 18316 } 18317 if (flags & TH_FIN) { 18318 /* We never send a FIN */ 18319 flags &= ~TH_FIN; 18320 } 18321 if (tp->t_flags & TF_RCVD_TSTMP) { 18322 to.to_tsval = ms_cts + tp->ts_offset; 18323 to.to_tsecr = tp->ts_recent; 18324 to.to_flags = TOF_TS; 18325 } 18326 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 18327 /* TCP-MD5 (RFC2385). */ 18328 if (tp->t_flags & TF_SIGNATURE) 18329 to.to_flags |= TOF_SIGNATURE; 18330 #endif 18331 optlen = tcp_addoptions(&to, opt); 18332 hdrlen += optlen; 18333 udp = rack->r_ctl.fsb.udp; 18334 if (udp) 18335 hdrlen += sizeof(struct udphdr); 18336 if (rack->r_ctl.rc_pace_max_segs) 18337 max_val = rack->r_ctl.rc_pace_max_segs; 18338 else if (rack->rc_user_set_max_segs) 18339 max_val = rack->rc_user_set_max_segs * segsiz; 18340 else 18341 max_val = len; 18342 if ((tp->t_flags & TF_TSO) && 18343 V_tcp_do_tso && 18344 (len > segsiz) && 18345 (tp->t_port == 0)) 18346 tso = 1; 18347 #ifdef INET6 18348 if (MHLEN < hdrlen + max_linkhdr) 18349 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 18350 else 18351 #endif 18352 m = m_gethdr(M_NOWAIT, MT_DATA); 18353 if (m == NULL) 18354 goto failed; 18355 m->m_data += max_linkhdr; 18356 m->m_len = hdrlen; 18357 th = rack->r_ctl.fsb.th; 18358 /* Establish the len to send */ 18359 if (len > max_val) 18360 len = max_val; 18361 if ((tso) && (len + optlen > segsiz)) { 18362 uint32_t if_hw_tsomax; 18363 int32_t max_len; 18364 18365 /* extract TSO information */ 18366 if_hw_tsomax = tp->t_tsomax; 18367 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 18368 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 18369 /* 18370 * Check if we should limit by maximum payload 18371 * length: 18372 */ 18373 if (if_hw_tsomax != 0) { 18374 /* compute maximum TSO length */ 18375 max_len = (if_hw_tsomax - hdrlen - 18376 max_linkhdr); 18377 if (max_len <= 0) { 18378 goto failed; 18379 } else if (len > max_len) { 18380 len = max_len; 18381 } 18382 } 18383 if (len <= segsiz) { 18384 /* 18385 * In case there are too many small fragments don't 18386 * use TSO: 18387 */ 18388 tso = 0; 18389 } 18390 } else { 18391 tso = 0; 18392 } 18393 if ((tso == 0) && (len > segsiz)) 18394 len = segsiz; 18395 (void)tcp_get_usecs(tv); 18396 if ((len == 0) || 18397 (len <= MHLEN - hdrlen - max_linkhdr)) { 18398 goto failed; 18399 } 18400 th->th_seq = htonl(rsm->r_start); 18401 th->th_ack = htonl(tp->rcv_nxt); 18402 /* 18403 * The PUSH bit should only be applied 18404 * if the full retransmission is made. If 18405 * we are sending less than this is the 18406 * left hand edge and should not have 18407 * the PUSH bit. 18408 */ 18409 if ((rsm->r_flags & RACK_HAD_PUSH) && 18410 (len == (rsm->r_end - rsm->r_start))) 18411 flags |= TH_PUSH; 18412 th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale)); 18413 if (th->th_win == 0) { 18414 tp->t_sndzerowin++; 18415 tp->t_flags |= TF_RXWIN0SENT; 18416 } else 18417 tp->t_flags &= ~TF_RXWIN0SENT; 18418 if (rsm->r_flags & RACK_TLP) { 18419 /* 18420 * TLP should not count in retran count, but 18421 * in its own bin 18422 */ 18423 counter_u64_add(rack_tlp_retran, 1); 18424 counter_u64_add(rack_tlp_retran_bytes, len); 18425 } else { 18426 tp->t_sndrexmitpack++; 18427 KMOD_TCPSTAT_INC(tcps_sndrexmitpack); 18428 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len); 18429 } 18430 #ifdef STATS 18431 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 18432 len); 18433 #endif 18434 if (rsm->m == NULL) 18435 goto failed; 18436 if (rsm->m && 18437 ((rsm->orig_m_len != rsm->m->m_len) || 18438 (M_TRAILINGROOM(rsm->m) != rsm->orig_t_space))) { 18439 /* Fix up the orig_m_len and possibly the mbuf offset */ 18440 rack_adjust_orig_mlen(rsm); 18441 } 18442 m->m_next = rack_fo_base_copym(rsm->m, rsm->soff, &len, NULL, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, rsm->r_hw_tls); 18443 if (len <= segsiz) { 18444 /* 18445 * Must have ran out of mbufs for the copy 18446 * shorten it to no longer need tso. Lets 18447 * not put on sendalot since we are low on 18448 * mbufs. 18449 */ 18450 tso = 0; 18451 } 18452 if ((m->m_next == NULL) || (len <= 0)){ 18453 goto failed; 18454 } 18455 if (udp) { 18456 if (rack->r_is_v6) 18457 ulen = hdrlen + len - sizeof(struct ip6_hdr); 18458 else 18459 ulen = hdrlen + len - sizeof(struct ip); 18460 udp->uh_ulen = htons(ulen); 18461 } 18462 m->m_pkthdr.rcvif = (struct ifnet *)0; 18463 if (TCPS_HAVERCVDSYN(tp->t_state) && 18464 (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { 18465 int ect = tcp_ecn_output_established(tp, &flags, len, true); 18466 if ((tp->t_state == TCPS_SYN_RECEIVED) && 18467 (tp->t_flags2 & TF2_ECN_SND_ECE)) 18468 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 18469 #ifdef INET6 18470 if (rack->r_is_v6) { 18471 ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20); 18472 ip6->ip6_flow |= htonl(ect << 20); 18473 } 18474 else 18475 #endif 18476 { 18477 ip->ip_tos &= ~IPTOS_ECN_MASK; 18478 ip->ip_tos |= ect; 18479 } 18480 } 18481 if (rack->r_ctl.crte != NULL) { 18482 /* See if we can send via the hw queue */ 18483 pacing_delay = rack_check_queue_level(rack, tp, tv, cts, len, segsiz); 18484 /* If there is nothing in queue (no pacing time) we can send via the hw queue */ 18485 if (pacing_delay == 0) 18486 ip_sendflag = 0; 18487 } 18488 tcp_set_flags(th, flags); 18489 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 18490 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 18491 if (to.to_flags & TOF_SIGNATURE) { 18492 /* 18493 * Calculate MD5 signature and put it into the place 18494 * determined before. 18495 * NOTE: since TCP options buffer doesn't point into 18496 * mbuf's data, calculate offset and use it. 18497 */ 18498 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, 18499 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { 18500 /* 18501 * Do not send segment if the calculation of MD5 18502 * digest has failed. 18503 */ 18504 goto failed; 18505 } 18506 } 18507 #endif 18508 #ifdef INET6 18509 if (rack->r_is_v6) { 18510 if (tp->t_port) { 18511 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 18512 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 18513 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 18514 th->th_sum = htons(0); 18515 UDPSTAT_INC(udps_opackets); 18516 } else { 18517 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 18518 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 18519 th->th_sum = in6_cksum_pseudo(ip6, 18520 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 18521 0); 18522 } 18523 } 18524 #endif 18525 #if defined(INET6) && defined(INET) 18526 else 18527 #endif 18528 #ifdef INET 18529 { 18530 if (tp->t_port) { 18531 m->m_pkthdr.csum_flags = CSUM_UDP; 18532 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 18533 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 18534 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 18535 th->th_sum = htons(0); 18536 UDPSTAT_INC(udps_opackets); 18537 } else { 18538 m->m_pkthdr.csum_flags = CSUM_TCP; 18539 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 18540 th->th_sum = in_pseudo(ip->ip_src.s_addr, 18541 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 18542 IPPROTO_TCP + len + optlen)); 18543 } 18544 /* IP version must be set here for ipv4/ipv6 checking later */ 18545 KASSERT(ip->ip_v == IPVERSION, 18546 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 18547 } 18548 #endif 18549 if (tso) { 18550 /* 18551 * Here we use segsiz since we have no added options besides 18552 * any standard timestamp options (no DSACKs or SACKS are sent 18553 * via either fast-path). 18554 */ 18555 KASSERT(len > segsiz, 18556 ("%s: len <= tso_segsz tp:%p", __func__, tp)); 18557 m->m_pkthdr.csum_flags |= CSUM_TSO; 18558 m->m_pkthdr.tso_segsz = segsiz; 18559 } 18560 #ifdef INET6 18561 if (rack->r_is_v6) { 18562 ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit; 18563 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 18564 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 18565 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 18566 else 18567 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 18568 } 18569 #endif 18570 #if defined(INET) && defined(INET6) 18571 else 18572 #endif 18573 #ifdef INET 18574 { 18575 ip->ip_len = htons(m->m_pkthdr.len); 18576 ip->ip_ttl = rack->r_ctl.fsb.hoplimit; 18577 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 18578 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 18579 if (tp->t_port == 0 || len < V_tcp_minmss) { 18580 ip->ip_off |= htons(IP_DF); 18581 } 18582 } else { 18583 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 18584 } 18585 } 18586 #endif 18587 if (doing_tlp == 0) { 18588 /* Set we retransmitted */ 18589 rack->rc_gp_saw_rec = 1; 18590 } else { 18591 /* Its a TLP set ca or ss */ 18592 if (tp->snd_cwnd > tp->snd_ssthresh) { 18593 /* Set we sent in CA */ 18594 rack->rc_gp_saw_ca = 1; 18595 } else { 18596 /* Set we sent in SS */ 18597 rack->rc_gp_saw_ss = 1; 18598 } 18599 } 18600 /* Time to copy in our header */ 18601 cpto = mtod(m, uint8_t *); 18602 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); 18603 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr)); 18604 if (optlen) { 18605 bcopy(opt, th + 1, optlen); 18606 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 18607 } else { 18608 th->th_off = sizeof(struct tcphdr) >> 2; 18609 } 18610 if (tcp_bblogging_on(rack->rc_tp)) { 18611 union tcp_log_stackspecific log; 18612 18613 if (rsm->r_flags & RACK_RWND_COLLAPSED) { 18614 rack_log_collapse(rack, rsm->r_start, rsm->r_end, 0, __LINE__, 5, rsm->r_flags, rsm); 18615 counter_u64_add(rack_collapsed_win_rxt, 1); 18616 counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start)); 18617 } 18618 memset(&log, 0, sizeof(log)); 18619 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 18620 if (rack->rack_no_prr) 18621 log.u_bbr.flex1 = 0; 18622 else 18623 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 18624 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 18625 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 18626 log.u_bbr.flex4 = max_val; 18627 /* Save off the early/late values */ 18628 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 18629 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 18630 log.u_bbr.bw_inuse = rack_get_bw(rack); 18631 log.u_bbr.cur_del_rate = rack->r_ctl.gp_bw; 18632 if (doing_tlp == 0) 18633 log.u_bbr.flex8 = 1; 18634 else 18635 log.u_bbr.flex8 = 2; 18636 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); 18637 log.u_bbr.flex7 = 55; 18638 log.u_bbr.pkts_out = tp->t_maxseg; 18639 log.u_bbr.timeStamp = cts; 18640 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 18641 if (rsm->r_rtr_cnt > 0) { 18642 /* 18643 * When we have a retransmit we want to log the 18644 * burst at send and flight at send from before. 18645 */ 18646 log.u_bbr.flex5 = rsm->r_fas; 18647 log.u_bbr.bbr_substate = rsm->r_bas; 18648 } else { 18649 /* 18650 * This is currently unlikely until we do the 18651 * packet pair probes but I will add it for completeness. 18652 */ 18653 log.u_bbr.flex5 = log.u_bbr.inflight; 18654 log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz); 18655 } 18656 log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use; 18657 log.u_bbr.delivered = 0; 18658 log.u_bbr.rttProp = (uintptr_t)rsm; 18659 log.u_bbr.delRate = rsm->r_flags; 18660 log.u_bbr.delRate <<= 31; 18661 log.u_bbr.delRate |= rack->r_must_retran; 18662 log.u_bbr.delRate <<= 1; 18663 log.u_bbr.delRate |= 1; 18664 log.u_bbr.pkt_epoch = __LINE__; 18665 lgb = tcp_log_event(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, 18666 len, &log, false, NULL, __func__, __LINE__, tv); 18667 } else 18668 lgb = NULL; 18669 if ((rack->r_ctl.crte != NULL) && 18670 tcp_bblogging_on(tp)) { 18671 rack_log_queue_level(tp, rack, len, tv, cts); 18672 } 18673 #ifdef INET6 18674 if (rack->r_is_v6) { 18675 error = ip6_output(m, inp->in6p_outputopts, 18676 &inp->inp_route6, 18677 ip_sendflag, NULL, NULL, inp); 18678 } 18679 else 18680 #endif 18681 #ifdef INET 18682 { 18683 error = ip_output(m, NULL, 18684 &inp->inp_route, 18685 ip_sendflag, 0, inp); 18686 } 18687 #endif 18688 m = NULL; 18689 if (lgb) { 18690 lgb->tlb_errno = error; 18691 lgb = NULL; 18692 } 18693 /* Move snd_nxt to snd_max so we don't have false retransmissions */ 18694 tp->snd_nxt = tp->snd_max; 18695 if (error) { 18696 goto failed; 18697 } else if (rack->rc_hw_nobuf && (ip_sendflag != IP_NO_SND_TAG_RL)) { 18698 rack->rc_hw_nobuf = 0; 18699 rack->r_ctl.rc_agg_delayed = 0; 18700 rack->r_early = 0; 18701 rack->r_late = 0; 18702 rack->r_ctl.rc_agg_early = 0; 18703 } 18704 rack_log_output(tp, &to, len, rsm->r_start, flags, error, rack_to_usec_ts(tv), 18705 rsm, RACK_SENT_FP, rsm->m, rsm->soff, rsm->r_hw_tls, segsiz); 18706 if (doing_tlp) { 18707 rack->rc_tlp_in_progress = 1; 18708 rack->r_ctl.rc_tlp_cnt_out++; 18709 } 18710 if (error == 0) { 18711 counter_u64_add(rack_total_bytes, len); 18712 tcp_account_for_send(tp, len, 1, doing_tlp, rsm->r_hw_tls); 18713 if (doing_tlp) { 18714 rack->rc_last_sent_tlp_past_cumack = 0; 18715 rack->rc_last_sent_tlp_seq_valid = 1; 18716 rack->r_ctl.last_sent_tlp_seq = rsm->r_start; 18717 rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start; 18718 } 18719 if (rack->r_ctl.rc_prr_sndcnt >= len) 18720 rack->r_ctl.rc_prr_sndcnt -= len; 18721 else 18722 rack->r_ctl.rc_prr_sndcnt = 0; 18723 } 18724 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 18725 rack->forced_ack = 0; /* If we send something zap the FA flag */ 18726 if (IN_FASTRECOVERY(tp->t_flags) && rsm) 18727 rack->r_ctl.retran_during_recovery += len; 18728 { 18729 int idx; 18730 18731 idx = (len / segsiz) + 3; 18732 if (idx >= TCP_MSS_ACCT_ATIMER) 18733 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 18734 else 18735 counter_u64_add(rack_out_size[idx], 1); 18736 } 18737 if (tp->t_rtttime == 0) { 18738 tp->t_rtttime = ticks; 18739 tp->t_rtseq = startseq; 18740 KMOD_TCPSTAT_INC(tcps_segstimed); 18741 } 18742 counter_u64_add(rack_fto_rsm_send, 1); 18743 if (error && (error == ENOBUFS)) { 18744 if (rack->r_ctl.crte != NULL) { 18745 tcp_trace_point(rack->rc_tp, TCP_TP_HWENOBUF); 18746 if (tcp_bblogging_on(rack->rc_tp)) 18747 rack_log_queue_level(tp, rack, len, tv, cts); 18748 } else 18749 tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF); 18750 pacing_delay = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); 18751 if (rack->rc_enobuf < 0x7f) 18752 rack->rc_enobuf++; 18753 if (pacing_delay < (10 * HPTS_USEC_IN_MSEC)) 18754 pacing_delay = 10 * HPTS_USEC_IN_MSEC; 18755 if (rack->r_ctl.crte != NULL) { 18756 counter_u64_add(rack_saw_enobuf_hw, 1); 18757 tcp_rl_log_enobuf(rack->r_ctl.crte); 18758 } 18759 counter_u64_add(rack_saw_enobuf, 1); 18760 } else { 18761 pacing_delay = rack_get_pacing_delay(rack, tp, len, NULL, segsiz, __LINE__); 18762 } 18763 rack_start_hpts_timer(rack, tp, cts, pacing_delay, len, 0); 18764 #ifdef TCP_ACCOUNTING 18765 crtsc = get_cyclecount(); 18766 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18767 tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru; 18768 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); 18769 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((len + segsiz - 1) / segsiz); 18770 } 18771 sched_unpin(); 18772 #endif 18773 return (0); 18774 failed: 18775 if (m) 18776 m_free(m); 18777 return (-1); 18778 } 18779 18780 static void 18781 rack_sndbuf_autoscale(struct tcp_rack *rack) 18782 { 18783 /* 18784 * Automatic sizing of send socket buffer. Often the send buffer 18785 * size is not optimally adjusted to the actual network conditions 18786 * at hand (delay bandwidth product). Setting the buffer size too 18787 * small limits throughput on links with high bandwidth and high 18788 * delay (eg. trans-continental/oceanic links). Setting the 18789 * buffer size too big consumes too much real kernel memory, 18790 * especially with many connections on busy servers. 18791 * 18792 * The criteria to step up the send buffer one notch are: 18793 * 1. receive window of remote host is larger than send buffer 18794 * (with a fudge factor of 5/4th); 18795 * 2. send buffer is filled to 7/8th with data (so we actually 18796 * have data to make use of it); 18797 * 3. send buffer fill has not hit maximal automatic size; 18798 * 4. our send window (slow start and cogestion controlled) is 18799 * larger than sent but unacknowledged data in send buffer. 18800 * 18801 * Note that the rack version moves things much faster since 18802 * we want to avoid hitting cache lines in the rack_fast_output() 18803 * path so this is called much less often and thus moves 18804 * the SB forward by a percentage. 18805 */ 18806 struct socket *so; 18807 struct tcpcb *tp; 18808 uint32_t sendwin, scaleup; 18809 18810 tp = rack->rc_tp; 18811 so = rack->rc_inp->inp_socket; 18812 sendwin = min(rack->r_ctl.cwnd_to_use, tp->snd_wnd); 18813 if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { 18814 if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat && 18815 sbused(&so->so_snd) >= 18816 (so->so_snd.sb_hiwat / 8 * 7) && 18817 sbused(&so->so_snd) < V_tcp_autosndbuf_max && 18818 sendwin >= (sbused(&so->so_snd) - 18819 (tp->snd_max - tp->snd_una))) { 18820 if (rack_autosndbuf_inc) 18821 scaleup = (rack_autosndbuf_inc * so->so_snd.sb_hiwat) / 100; 18822 else 18823 scaleup = V_tcp_autosndbuf_inc; 18824 if (scaleup < V_tcp_autosndbuf_inc) 18825 scaleup = V_tcp_autosndbuf_inc; 18826 scaleup += so->so_snd.sb_hiwat; 18827 if (scaleup > V_tcp_autosndbuf_max) 18828 scaleup = V_tcp_autosndbuf_max; 18829 if (!sbreserve_locked(so, SO_SND, scaleup, curthread)) 18830 so->so_snd.sb_flags &= ~SB_AUTOSIZE; 18831 } 18832 } 18833 } 18834 18835 static int 18836 rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val, 18837 uint32_t cts, uint32_t ms_cts, struct timeval *tv, long *tot_len, int *send_err, int line) 18838 { 18839 /* 18840 * Enter to do fast output. We are given that the sched_pin is 18841 * in place (if accounting is compiled in) and the cycle count taken 18842 * at entry is in place in ts_val. The idea here is that 18843 * we know how many more bytes needs to be sent (presumably either 18844 * during pacing or to fill the cwnd and that was greater than 18845 * the max-burst). We have how much to send and all the info we 18846 * need to just send. 18847 */ 18848 #ifdef INET 18849 struct ip *ip = NULL; 18850 #endif 18851 struct udphdr *udp = NULL; 18852 struct tcphdr *th = NULL; 18853 struct mbuf *m, *s_mb; 18854 struct inpcb *inp; 18855 uint8_t *cpto; 18856 struct tcp_log_buffer *lgb; 18857 #ifdef TCP_ACCOUNTING 18858 uint64_t crtsc; 18859 #endif 18860 struct tcpopt to; 18861 u_char opt[TCP_MAXOLEN]; 18862 uint32_t hdrlen, optlen; 18863 #ifdef TCP_ACCOUNTING 18864 int cnt_thru = 1; 18865 #endif 18866 int32_t pacing_delay, segsiz, len, max_val, tso = 0, sb_offset, error, ulen = 0; 18867 uint16_t flags; 18868 uint32_t s_soff; 18869 uint32_t if_hw_tsomaxsegcount = 0, startseq; 18870 uint32_t if_hw_tsomaxsegsize; 18871 uint32_t add_flag = RACK_SENT_FP; 18872 #ifdef INET6 18873 struct ip6_hdr *ip6 = NULL; 18874 18875 if (rack->r_is_v6) { 18876 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 18877 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 18878 } else 18879 #endif /* INET6 */ 18880 { 18881 #ifdef INET 18882 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 18883 hdrlen = sizeof(struct tcpiphdr); 18884 #endif 18885 } 18886 if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) { 18887 m = NULL; 18888 goto failed; 18889 } 18890 rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 18891 startseq = tp->snd_max; 18892 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 18893 inp = rack->rc_inp; 18894 len = rack->r_ctl.fsb.left_to_send; 18895 to.to_flags = 0; 18896 flags = rack->r_ctl.fsb.tcp_flags; 18897 if (tp->t_flags & TF_RCVD_TSTMP) { 18898 to.to_tsval = ms_cts + tp->ts_offset; 18899 to.to_tsecr = tp->ts_recent; 18900 to.to_flags = TOF_TS; 18901 } 18902 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 18903 /* TCP-MD5 (RFC2385). */ 18904 if (tp->t_flags & TF_SIGNATURE) 18905 to.to_flags |= TOF_SIGNATURE; 18906 #endif 18907 optlen = tcp_addoptions(&to, opt); 18908 hdrlen += optlen; 18909 udp = rack->r_ctl.fsb.udp; 18910 if (udp) 18911 hdrlen += sizeof(struct udphdr); 18912 if (rack->r_ctl.rc_pace_max_segs) 18913 max_val = rack->r_ctl.rc_pace_max_segs; 18914 else if (rack->rc_user_set_max_segs) 18915 max_val = rack->rc_user_set_max_segs * segsiz; 18916 else 18917 max_val = len; 18918 if ((tp->t_flags & TF_TSO) && 18919 V_tcp_do_tso && 18920 (len > segsiz) && 18921 (tp->t_port == 0)) 18922 tso = 1; 18923 again: 18924 #ifdef INET6 18925 if (MHLEN < hdrlen + max_linkhdr) 18926 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 18927 else 18928 #endif 18929 m = m_gethdr(M_NOWAIT, MT_DATA); 18930 if (m == NULL) 18931 goto failed; 18932 m->m_data += max_linkhdr; 18933 m->m_len = hdrlen; 18934 th = rack->r_ctl.fsb.th; 18935 /* Establish the len to send */ 18936 if (len > max_val) 18937 len = max_val; 18938 if ((tso) && (len + optlen > segsiz)) { 18939 uint32_t if_hw_tsomax; 18940 int32_t max_len; 18941 18942 /* extract TSO information */ 18943 if_hw_tsomax = tp->t_tsomax; 18944 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 18945 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 18946 /* 18947 * Check if we should limit by maximum payload 18948 * length: 18949 */ 18950 if (if_hw_tsomax != 0) { 18951 /* compute maximum TSO length */ 18952 max_len = (if_hw_tsomax - hdrlen - 18953 max_linkhdr); 18954 if (max_len <= 0) { 18955 goto failed; 18956 } else if (len > max_len) { 18957 len = max_len; 18958 } 18959 } 18960 if (len <= segsiz) { 18961 /* 18962 * In case there are too many small fragments don't 18963 * use TSO: 18964 */ 18965 tso = 0; 18966 } 18967 } else { 18968 tso = 0; 18969 } 18970 if ((tso == 0) && (len > segsiz)) 18971 len = segsiz; 18972 (void)tcp_get_usecs(tv); 18973 if ((len == 0) || 18974 (len <= MHLEN - hdrlen - max_linkhdr)) { 18975 goto failed; 18976 } 18977 sb_offset = tp->snd_max - tp->snd_una; 18978 th->th_seq = htonl(tp->snd_max); 18979 th->th_ack = htonl(tp->rcv_nxt); 18980 th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale)); 18981 if (th->th_win == 0) { 18982 tp->t_sndzerowin++; 18983 tp->t_flags |= TF_RXWIN0SENT; 18984 } else 18985 tp->t_flags &= ~TF_RXWIN0SENT; 18986 tp->snd_up = tp->snd_una; /* drag it along, its deprecated */ 18987 KMOD_TCPSTAT_INC(tcps_sndpack); 18988 KMOD_TCPSTAT_ADD(tcps_sndbyte, len); 18989 #ifdef STATS 18990 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 18991 len); 18992 #endif 18993 if (rack->r_ctl.fsb.m == NULL) 18994 goto failed; 18995 18996 /* s_mb and s_soff are saved for rack_log_output */ 18997 m->m_next = rack_fo_m_copym(rack, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, 18998 &s_mb, &s_soff); 18999 if (len <= segsiz) { 19000 /* 19001 * Must have ran out of mbufs for the copy 19002 * shorten it to no longer need tso. Lets 19003 * not put on sendalot since we are low on 19004 * mbufs. 19005 */ 19006 tso = 0; 19007 } 19008 if (rack->r_ctl.fsb.rfo_apply_push && 19009 (len == rack->r_ctl.fsb.left_to_send)) { 19010 flags |= TH_PUSH; 19011 add_flag |= RACK_HAD_PUSH; 19012 } 19013 if ((m->m_next == NULL) || (len <= 0)){ 19014 goto failed; 19015 } 19016 if (udp) { 19017 if (rack->r_is_v6) 19018 ulen = hdrlen + len - sizeof(struct ip6_hdr); 19019 else 19020 ulen = hdrlen + len - sizeof(struct ip); 19021 udp->uh_ulen = htons(ulen); 19022 } 19023 m->m_pkthdr.rcvif = (struct ifnet *)0; 19024 if (TCPS_HAVERCVDSYN(tp->t_state) && 19025 (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { 19026 int ect = tcp_ecn_output_established(tp, &flags, len, false); 19027 if ((tp->t_state == TCPS_SYN_RECEIVED) && 19028 (tp->t_flags2 & TF2_ECN_SND_ECE)) 19029 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 19030 #ifdef INET6 19031 if (rack->r_is_v6) { 19032 ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20); 19033 ip6->ip6_flow |= htonl(ect << 20); 19034 } 19035 else 19036 #endif 19037 { 19038 #ifdef INET 19039 ip->ip_tos &= ~IPTOS_ECN_MASK; 19040 ip->ip_tos |= ect; 19041 #endif 19042 } 19043 } 19044 tcp_set_flags(th, flags); 19045 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 19046 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 19047 if (to.to_flags & TOF_SIGNATURE) { 19048 /* 19049 * Calculate MD5 signature and put it into the place 19050 * determined before. 19051 * NOTE: since TCP options buffer doesn't point into 19052 * mbuf's data, calculate offset and use it. 19053 */ 19054 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, 19055 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { 19056 /* 19057 * Do not send segment if the calculation of MD5 19058 * digest has failed. 19059 */ 19060 goto failed; 19061 } 19062 } 19063 #endif 19064 #ifdef INET6 19065 if (rack->r_is_v6) { 19066 if (tp->t_port) { 19067 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 19068 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 19069 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 19070 th->th_sum = htons(0); 19071 UDPSTAT_INC(udps_opackets); 19072 } else { 19073 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 19074 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 19075 th->th_sum = in6_cksum_pseudo(ip6, 19076 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 19077 0); 19078 } 19079 } 19080 #endif 19081 #if defined(INET6) && defined(INET) 19082 else 19083 #endif 19084 #ifdef INET 19085 { 19086 if (tp->t_port) { 19087 m->m_pkthdr.csum_flags = CSUM_UDP; 19088 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 19089 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 19090 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 19091 th->th_sum = htons(0); 19092 UDPSTAT_INC(udps_opackets); 19093 } else { 19094 m->m_pkthdr.csum_flags = CSUM_TCP; 19095 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 19096 th->th_sum = in_pseudo(ip->ip_src.s_addr, 19097 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 19098 IPPROTO_TCP + len + optlen)); 19099 } 19100 /* IP version must be set here for ipv4/ipv6 checking later */ 19101 KASSERT(ip->ip_v == IPVERSION, 19102 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 19103 } 19104 #endif 19105 if (tso) { 19106 /* 19107 * Here we use segsiz since we have no added options besides 19108 * any standard timestamp options (no DSACKs or SACKS are sent 19109 * via either fast-path). 19110 */ 19111 KASSERT(len > segsiz, 19112 ("%s: len <= tso_segsz tp:%p", __func__, tp)); 19113 m->m_pkthdr.csum_flags |= CSUM_TSO; 19114 m->m_pkthdr.tso_segsz = segsiz; 19115 } 19116 #ifdef INET6 19117 if (rack->r_is_v6) { 19118 ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit; 19119 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 19120 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 19121 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 19122 else 19123 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 19124 } 19125 #endif 19126 #if defined(INET) && defined(INET6) 19127 else 19128 #endif 19129 #ifdef INET 19130 { 19131 ip->ip_len = htons(m->m_pkthdr.len); 19132 ip->ip_ttl = rack->r_ctl.fsb.hoplimit; 19133 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 19134 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 19135 if (tp->t_port == 0 || len < V_tcp_minmss) { 19136 ip->ip_off |= htons(IP_DF); 19137 } 19138 } else { 19139 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 19140 } 19141 } 19142 #endif 19143 if (tp->snd_cwnd > tp->snd_ssthresh) { 19144 /* Set we sent in CA */ 19145 rack->rc_gp_saw_ca = 1; 19146 } else { 19147 /* Set we sent in SS */ 19148 rack->rc_gp_saw_ss = 1; 19149 } 19150 /* Time to copy in our header */ 19151 cpto = mtod(m, uint8_t *); 19152 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); 19153 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr)); 19154 if (optlen) { 19155 bcopy(opt, th + 1, optlen); 19156 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 19157 } else { 19158 th->th_off = sizeof(struct tcphdr) >> 2; 19159 } 19160 if ((rack->r_ctl.crte != NULL) && 19161 tcp_bblogging_on(tp)) { 19162 rack_log_queue_level(tp, rack, len, tv, cts); 19163 } 19164 if (tcp_bblogging_on(rack->rc_tp)) { 19165 union tcp_log_stackspecific log; 19166 19167 memset(&log, 0, sizeof(log)); 19168 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 19169 if (rack->rack_no_prr) 19170 log.u_bbr.flex1 = 0; 19171 else 19172 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 19173 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 19174 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 19175 log.u_bbr.flex4 = max_val; 19176 /* Save off the early/late values */ 19177 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 19178 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 19179 log.u_bbr.bw_inuse = rack_get_bw(rack); 19180 log.u_bbr.cur_del_rate = rack->r_ctl.gp_bw; 19181 log.u_bbr.flex8 = 0; 19182 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); 19183 log.u_bbr.flex7 = 44; 19184 log.u_bbr.pkts_out = tp->t_maxseg; 19185 log.u_bbr.timeStamp = cts; 19186 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 19187 log.u_bbr.flex5 = log.u_bbr.inflight; 19188 log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use; 19189 log.u_bbr.delivered = rack->r_ctl.fsb.left_to_send; 19190 log.u_bbr.rttProp = 0; 19191 log.u_bbr.delRate = rack->r_must_retran; 19192 log.u_bbr.delRate <<= 1; 19193 log.u_bbr.pkt_epoch = line; 19194 /* For fast output no retrans so just inflight and how many mss we send */ 19195 log.u_bbr.flex5 = log.u_bbr.inflight; 19196 log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz); 19197 lgb = tcp_log_event(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, 19198 len, &log, false, NULL, __func__, __LINE__, tv); 19199 } else 19200 lgb = NULL; 19201 #ifdef INET6 19202 if (rack->r_is_v6) { 19203 error = ip6_output(m, inp->in6p_outputopts, 19204 &inp->inp_route6, 19205 0, NULL, NULL, inp); 19206 } 19207 #endif 19208 #if defined(INET) && defined(INET6) 19209 else 19210 #endif 19211 #ifdef INET 19212 { 19213 error = ip_output(m, NULL, 19214 &inp->inp_route, 19215 0, 0, inp); 19216 } 19217 #endif 19218 if (lgb) { 19219 lgb->tlb_errno = error; 19220 lgb = NULL; 19221 } 19222 if (error) { 19223 *send_err = error; 19224 m = NULL; 19225 goto failed; 19226 } else if (rack->rc_hw_nobuf) { 19227 rack->rc_hw_nobuf = 0; 19228 rack->r_ctl.rc_agg_delayed = 0; 19229 rack->r_early = 0; 19230 rack->r_late = 0; 19231 rack->r_ctl.rc_agg_early = 0; 19232 } 19233 if ((error == 0) && (rack->lt_bw_up == 0)) { 19234 /* Unlikely */ 19235 rack->r_ctl.lt_timemark = tcp_tv_to_lusec(tv); 19236 rack->r_ctl.lt_seq = tp->snd_una; 19237 rack->lt_bw_up = 1; 19238 } else if ((error == 0) && 19239 (((tp->snd_max + len) - rack->r_ctl.lt_seq) > 0x7fffffff)) { 19240 /* 19241 * Need to record what we have since we are 19242 * approaching seq wrap. 19243 */ 19244 struct timeval tv; 19245 uint64_t tmark; 19246 19247 rack->r_ctl.lt_bw_bytes += (tp->snd_una - rack->r_ctl.lt_seq); 19248 rack->r_ctl.lt_seq = tp->snd_una; 19249 tmark = tcp_get_u64_usecs(&tv); 19250 if (tmark > rack->r_ctl.lt_timemark) { 19251 rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark); 19252 rack->r_ctl.lt_timemark = tmark; 19253 } 19254 } 19255 rack_log_output(tp, &to, len, tp->snd_max, flags, error, rack_to_usec_ts(tv), 19256 NULL, add_flag, s_mb, s_soff, rack->r_ctl.fsb.hw_tls, segsiz); 19257 if (tp->snd_una == tp->snd_max) { 19258 rack->r_ctl.rc_tlp_rxt_last_time = cts; 19259 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 19260 tp->t_acktime = ticks; 19261 } 19262 counter_u64_add(rack_total_bytes, len); 19263 tcp_account_for_send(tp, len, 0, 0, rack->r_ctl.fsb.hw_tls); 19264 19265 rack->forced_ack = 0; /* If we send something zap the FA flag */ 19266 *tot_len += len; 19267 if ((tp->t_flags & TF_GPUTINPROG) == 0) 19268 rack_start_gp_measurement(tp, rack, tp->snd_max, sb_offset); 19269 tp->snd_max += len; 19270 tp->snd_nxt = tp->snd_max; 19271 if (rack->rc_new_rnd_needed) { 19272 rack_new_round_starts(tp, rack, tp->snd_max); 19273 } 19274 { 19275 int idx; 19276 19277 idx = (len / segsiz) + 3; 19278 if (idx >= TCP_MSS_ACCT_ATIMER) 19279 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 19280 else 19281 counter_u64_add(rack_out_size[idx], 1); 19282 } 19283 if (len <= rack->r_ctl.fsb.left_to_send) 19284 rack->r_ctl.fsb.left_to_send -= len; 19285 else 19286 rack->r_ctl.fsb.left_to_send = 0; 19287 if (rack->r_ctl.fsb.left_to_send < segsiz) { 19288 rack->r_fast_output = 0; 19289 rack->r_ctl.fsb.left_to_send = 0; 19290 /* At the end of fast_output scale up the sb */ 19291 SOCK_SENDBUF_LOCK(rack->rc_inp->inp_socket); 19292 rack_sndbuf_autoscale(rack); 19293 SOCK_SENDBUF_UNLOCK(rack->rc_inp->inp_socket); 19294 } 19295 if (tp->t_rtttime == 0) { 19296 tp->t_rtttime = ticks; 19297 tp->t_rtseq = startseq; 19298 KMOD_TCPSTAT_INC(tcps_segstimed); 19299 } 19300 if ((rack->r_ctl.fsb.left_to_send >= segsiz) && 19301 (max_val > len) && 19302 (*tot_len < rack->r_ctl.rc_pace_max_segs) && 19303 (tso == 0)) { 19304 max_val -= len; 19305 len = segsiz; 19306 th = rack->r_ctl.fsb.th; 19307 #ifdef TCP_ACCOUNTING 19308 cnt_thru++; 19309 #endif 19310 goto again; 19311 } 19312 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 19313 counter_u64_add(rack_fto_send, 1); 19314 pacing_delay = rack_get_pacing_delay(rack, tp, *tot_len, NULL, segsiz, __LINE__); 19315 rack_start_hpts_timer(rack, tp, cts, pacing_delay, *tot_len, 0); 19316 #ifdef TCP_ACCOUNTING 19317 crtsc = get_cyclecount(); 19318 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19319 tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru; 19320 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); 19321 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((*tot_len + segsiz - 1) / segsiz); 19322 } 19323 sched_unpin(); 19324 #endif 19325 return (0); 19326 failed: 19327 if (m) 19328 m_free(m); 19329 rack->r_fast_output = 0; 19330 return (-1); 19331 } 19332 19333 static inline void 19334 rack_setup_fast_output(struct tcpcb *tp, struct tcp_rack *rack, 19335 struct sockbuf *sb, 19336 int len, int orig_len, int segsiz, uint32_t pace_max_seg, 19337 bool hw_tls, 19338 uint16_t flags) 19339 { 19340 rack->r_fast_output = 1; 19341 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); 19342 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; 19343 rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(rack->r_ctl.fsb.m); 19344 rack->r_ctl.fsb.tcp_flags = flags; 19345 rack->r_ctl.fsb.left_to_send = orig_len - len; 19346 if (rack->r_ctl.fsb.left_to_send < pace_max_seg) { 19347 /* Less than a full sized pace, lets not */ 19348 rack->r_fast_output = 0; 19349 return; 19350 } else { 19351 /* Round down to the nearest pace_max_seg */ 19352 rack->r_ctl.fsb.left_to_send = rounddown(rack->r_ctl.fsb.left_to_send, pace_max_seg); 19353 } 19354 if (hw_tls) 19355 rack->r_ctl.fsb.hw_tls = 1; 19356 else 19357 rack->r_ctl.fsb.hw_tls = 0; 19358 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))), 19359 ("rack:%p left_to_send:%u sbavail:%u out:%u", 19360 rack, rack->r_ctl.fsb.left_to_send, sbavail(sb), 19361 (tp->snd_max - tp->snd_una))); 19362 if (rack->r_ctl.fsb.left_to_send < segsiz) 19363 rack->r_fast_output = 0; 19364 else { 19365 if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una))) 19366 rack->r_ctl.fsb.rfo_apply_push = 1; 19367 else 19368 rack->r_ctl.fsb.rfo_apply_push = 0; 19369 } 19370 } 19371 19372 static uint32_t 19373 rack_get_hpts_pacing_min_for_bw(struct tcp_rack *rack, int32_t segsiz) 19374 { 19375 uint64_t min_time; 19376 uint32_t maxlen; 19377 19378 min_time = (uint64_t)get_hpts_min_sleep_time(); 19379 maxlen = (uint32_t)((rack->r_ctl.gp_bw * min_time) / (uint64_t)HPTS_USEC_IN_SEC); 19380 maxlen = roundup(maxlen, segsiz); 19381 return (maxlen); 19382 } 19383 19384 static struct rack_sendmap * 19385 rack_check_collapsed(struct tcp_rack *rack, uint32_t cts) 19386 { 19387 struct rack_sendmap *rsm = NULL; 19388 int thresh; 19389 19390 restart: 19391 rsm = tqhash_find(rack->r_ctl.tqh, rack->r_ctl.last_collapse_point); 19392 if ((rsm == NULL) || ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0)) { 19393 /* Nothing, strange turn off validity */ 19394 rack->r_collapse_point_valid = 0; 19395 return (NULL); 19396 } 19397 /* Can we send it yet? */ 19398 if (rsm->r_end > (rack->rc_tp->snd_una + rack->rc_tp->snd_wnd)) { 19399 /* 19400 * Receiver window has not grown enough for 19401 * the segment to be put on the wire. 19402 */ 19403 return (NULL); 19404 } 19405 if (rsm->r_flags & RACK_ACKED) { 19406 /* 19407 * It has been sacked, lets move to the 19408 * next one if possible. 19409 */ 19410 rack->r_ctl.last_collapse_point = rsm->r_end; 19411 /* Are we done? */ 19412 if (SEQ_GEQ(rack->r_ctl.last_collapse_point, 19413 rack->r_ctl.high_collapse_point)) { 19414 rack->r_collapse_point_valid = 0; 19415 return (NULL); 19416 } 19417 goto restart; 19418 } 19419 /* Now has it been long enough ? */ 19420 thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(rack->rc_tp, rack), cts, __LINE__, 1); 19421 if ((cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])) > thresh) { 19422 rack_log_collapse(rack, rsm->r_start, 19423 (cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])), 19424 thresh, __LINE__, 6, rsm->r_flags, rsm); 19425 return (rsm); 19426 } 19427 /* Not enough time */ 19428 rack_log_collapse(rack, rsm->r_start, 19429 (cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])), 19430 thresh, __LINE__, 7, rsm->r_flags, rsm); 19431 return (NULL); 19432 } 19433 19434 static inline void 19435 rack_validate_sizes(struct tcp_rack *rack, int32_t *len, int32_t segsiz, uint32_t pace_max_seg) 19436 { 19437 if ((rack->full_size_rxt == 0) && 19438 (rack->shape_rxt_to_pacing_min == 0) && 19439 (*len >= segsiz)) { 19440 *len = segsiz; 19441 } else if (rack->shape_rxt_to_pacing_min && 19442 rack->gp_ready) { 19443 /* We use pacing min as shaping len req */ 19444 uint32_t maxlen; 19445 19446 maxlen = rack_get_hpts_pacing_min_for_bw(rack, segsiz); 19447 if (*len > maxlen) 19448 *len = maxlen; 19449 } else { 19450 /* 19451 * The else is full_size_rxt is on so send it all 19452 * note we do need to check this for exceeding 19453 * our max segment size due to the fact that 19454 * we do sometimes merge chunks together i.e. 19455 * we cannot just assume that we will never have 19456 * a chunk greater than pace_max_seg 19457 */ 19458 if (*len > pace_max_seg) 19459 *len = pace_max_seg; 19460 } 19461 } 19462 19463 static int 19464 rack_output(struct tcpcb *tp) 19465 { 19466 struct socket *so; 19467 uint32_t recwin; 19468 uint32_t sb_offset, s_moff = 0; 19469 int32_t len, error = 0; 19470 uint16_t flags; 19471 struct mbuf *m, *s_mb = NULL; 19472 struct mbuf *mb; 19473 uint32_t if_hw_tsomaxsegcount = 0; 19474 uint32_t if_hw_tsomaxsegsize; 19475 int32_t segsiz, minseg; 19476 long tot_len_this_send = 0; 19477 #ifdef INET 19478 struct ip *ip = NULL; 19479 #endif 19480 struct udphdr *udp = NULL; 19481 struct tcp_rack *rack; 19482 struct tcphdr *th; 19483 uint8_t pass = 0; 19484 uint8_t mark = 0; 19485 uint8_t check_done = 0; 19486 uint8_t wanted_cookie = 0; 19487 u_char opt[TCP_MAXOLEN]; 19488 unsigned ipoptlen, optlen, hdrlen, ulen=0; 19489 uint32_t rack_seq; 19490 19491 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 19492 unsigned ipsec_optlen = 0; 19493 19494 #endif 19495 int32_t idle, sendalot; 19496 uint32_t tot_idle; 19497 int32_t sub_from_prr = 0; 19498 volatile int32_t sack_rxmit; 19499 struct rack_sendmap *rsm = NULL; 19500 int32_t tso, mtu; 19501 struct tcpopt to; 19502 int32_t pacing_delay = 0; 19503 int32_t sup_rack = 0; 19504 uint32_t cts, ms_cts, delayed, early; 19505 uint32_t add_flag = RACK_SENT_SP; 19506 /* The doing_tlp flag will be set by the actual rack_timeout_tlp() */ 19507 uint8_t doing_tlp = 0; 19508 uint32_t cwnd_to_use, pace_max_seg; 19509 int32_t do_a_prefetch = 0; 19510 int32_t prefetch_rsm = 0; 19511 int32_t orig_len = 0; 19512 struct timeval tv; 19513 int32_t prefetch_so_done = 0; 19514 struct tcp_log_buffer *lgb; 19515 struct inpcb *inp = tptoinpcb(tp); 19516 struct sockbuf *sb; 19517 uint64_t ts_val = 0; 19518 #ifdef TCP_ACCOUNTING 19519 uint64_t crtsc; 19520 #endif 19521 #ifdef INET6 19522 struct ip6_hdr *ip6 = NULL; 19523 int32_t isipv6; 19524 #endif 19525 bool hpts_calling, hw_tls = false; 19526 19527 NET_EPOCH_ASSERT(); 19528 INP_WLOCK_ASSERT(inp); 19529 19530 /* setup and take the cache hits here */ 19531 rack = (struct tcp_rack *)tp->t_fb_ptr; 19532 #ifdef TCP_ACCOUNTING 19533 sched_pin(); 19534 ts_val = get_cyclecount(); 19535 #endif 19536 hpts_calling = !!(tp->t_flags2 & TF2_HPTS_CALLS); 19537 tp->t_flags2 &= ~TF2_HPTS_CALLS; 19538 #ifdef TCP_OFFLOAD 19539 if (tp->t_flags & TF_TOE) { 19540 #ifdef TCP_ACCOUNTING 19541 sched_unpin(); 19542 #endif 19543 return (tcp_offload_output(tp)); 19544 } 19545 #endif 19546 if (rack->rack_deferred_inited == 0) { 19547 /* 19548 * If we are the connecting socket we will 19549 * hit rack_init() when no sequence numbers 19550 * are setup. This makes it so we must defer 19551 * some initialization. Call that now. 19552 */ 19553 rack_deferred_init(tp, rack); 19554 } 19555 /* 19556 * For TFO connections in SYN_RECEIVED, only allow the initial 19557 * SYN|ACK and those sent by the retransmit timer. 19558 */ 19559 if ((tp->t_flags & TF_FASTOPEN) && 19560 (tp->t_state == TCPS_SYN_RECEIVED) && 19561 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */ 19562 (rack->r_ctl.rc_resend == NULL)) { /* not a retransmit */ 19563 #ifdef TCP_ACCOUNTING 19564 sched_unpin(); 19565 #endif 19566 return (0); 19567 } 19568 #ifdef INET6 19569 if (rack->r_state) { 19570 /* Use the cache line loaded if possible */ 19571 isipv6 = rack->r_is_v6; 19572 } else { 19573 isipv6 = (rack->rc_inp->inp_vflag & INP_IPV6) != 0; 19574 } 19575 #endif 19576 early = 0; 19577 cts = tcp_get_usecs(&tv); 19578 ms_cts = tcp_tv_to_msec(&tv); 19579 if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && 19580 tcp_in_hpts(rack->rc_tp)) { 19581 /* 19582 * We are on the hpts for some timer but not hptsi output. 19583 * Remove from the hpts unconditionally. 19584 */ 19585 rack_timer_cancel(tp, rack, cts, __LINE__); 19586 } 19587 /* Are we pacing and late? */ 19588 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 19589 TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) { 19590 /* We are delayed */ 19591 delayed = cts - rack->r_ctl.rc_last_output_to; 19592 } else { 19593 delayed = 0; 19594 } 19595 /* Do the timers, which may override the pacer */ 19596 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 19597 int retval; 19598 19599 retval = rack_process_timers(tp, rack, cts, hpts_calling, 19600 &doing_tlp); 19601 if (retval != 0) { 19602 counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); 19603 #ifdef TCP_ACCOUNTING 19604 sched_unpin(); 19605 #endif 19606 /* 19607 * If timers want tcp_drop(), then pass error out, 19608 * otherwise suppress it. 19609 */ 19610 return (retval < 0 ? retval : 0); 19611 } 19612 } 19613 if (rack->rc_in_persist) { 19614 if (tcp_in_hpts(rack->rc_tp) == 0) { 19615 /* Timer is not running */ 19616 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 19617 } 19618 #ifdef TCP_ACCOUNTING 19619 sched_unpin(); 19620 #endif 19621 return (0); 19622 } 19623 if ((rack->rc_ack_required == 1) && 19624 (rack->r_timer_override == 0)){ 19625 /* A timeout occurred and no ack has arrived */ 19626 if (tcp_in_hpts(rack->rc_tp) == 0) { 19627 /* Timer is not running */ 19628 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 19629 } 19630 #ifdef TCP_ACCOUNTING 19631 sched_unpin(); 19632 #endif 19633 return (0); 19634 } 19635 if ((rack->r_timer_override) || 19636 (rack->rc_ack_can_sendout_data) || 19637 (delayed) || 19638 (tp->t_state < TCPS_ESTABLISHED)) { 19639 rack->rc_ack_can_sendout_data = 0; 19640 if (tcp_in_hpts(rack->rc_tp)) 19641 tcp_hpts_remove(rack->rc_tp); 19642 } else if (tcp_in_hpts(rack->rc_tp)) { 19643 /* 19644 * On the hpts you can't pass even if ACKNOW is on, we will 19645 * when the hpts fires. 19646 */ 19647 #ifdef TCP_ACCOUNTING 19648 crtsc = get_cyclecount(); 19649 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19650 tp->tcp_proc_time[SND_BLOCKED] += (crtsc - ts_val); 19651 tp->tcp_cnt_counters[SND_BLOCKED]++; 19652 } 19653 sched_unpin(); 19654 #endif 19655 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1); 19656 return (0); 19657 } 19658 /* Finish out both pacing early and late accounting */ 19659 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 19660 TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) { 19661 early = rack->r_ctl.rc_last_output_to - cts; 19662 } else 19663 early = 0; 19664 if (delayed && (rack->rc_always_pace == 1)) { 19665 rack->r_ctl.rc_agg_delayed += delayed; 19666 rack->r_late = 1; 19667 } else if (early && (rack->rc_always_pace == 1)) { 19668 rack->r_ctl.rc_agg_early += early; 19669 rack->r_early = 1; 19670 } else if (rack->rc_always_pace == 0) { 19671 /* Non-paced we are not late */ 19672 rack->r_ctl.rc_agg_delayed = rack->r_ctl.rc_agg_early = 0; 19673 rack->r_early = rack->r_late = 0; 19674 } 19675 /* Now that early/late accounting is done turn off the flag */ 19676 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 19677 rack->r_wanted_output = 0; 19678 rack->r_timer_override = 0; 19679 if ((tp->t_state != rack->r_state) && 19680 TCPS_HAVEESTABLISHED(tp->t_state)) { 19681 rack_set_state(tp, rack); 19682 } 19683 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 19684 minseg = segsiz; 19685 if (rack->r_ctl.rc_pace_max_segs == 0) 19686 pace_max_seg = rack->rc_user_set_max_segs * segsiz; 19687 else 19688 pace_max_seg = rack->r_ctl.rc_pace_max_segs; 19689 if ((rack->r_fast_output) && 19690 (doing_tlp == 0) && 19691 (tp->rcv_numsacks == 0)) { 19692 int ret; 19693 19694 error = 0; 19695 ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, &tot_len_this_send, &error, __LINE__); 19696 if (ret > 0) 19697 return(ret); 19698 else if (error) { 19699 inp = rack->rc_inp; 19700 so = inp->inp_socket; 19701 sb = &so->so_snd; 19702 goto nomore; 19703 } else { 19704 /* Return == 0, if there is more we can send tot_len wise fall through and send */ 19705 if (tot_len_this_send >= pace_max_seg) 19706 return (ret); 19707 #ifdef TCP_ACCOUNTING 19708 /* We need to re-pin since fast_output un-pined */ 19709 sched_pin(); 19710 ts_val = get_cyclecount(); 19711 #endif 19712 /* Fall back out so we can send any more that may bring us to pace_max_seg */ 19713 } 19714 } 19715 inp = rack->rc_inp; 19716 /* 19717 * For TFO connections in SYN_SENT or SYN_RECEIVED, 19718 * only allow the initial SYN or SYN|ACK and those sent 19719 * by the retransmit timer. 19720 */ 19721 if ((tp->t_flags & TF_FASTOPEN) && 19722 ((tp->t_state == TCPS_SYN_RECEIVED) || 19723 (tp->t_state == TCPS_SYN_SENT)) && 19724 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ 19725 (tp->t_rxtshift == 0)) { /* not a retransmit */ 19726 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 19727 #ifdef TCP_ACCOUNTING 19728 sched_unpin(); 19729 #endif 19730 return (0); 19731 } 19732 /* 19733 * Determine length of data that should be transmitted, and flags 19734 * that will be used. If there is some data or critical controls 19735 * (SYN, RST) to send, then transmit; otherwise, investigate 19736 * further. 19737 */ 19738 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); 19739 if (tp->t_idle_reduce) { 19740 if (idle && (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur)) 19741 rack_cc_after_idle(rack, tp); 19742 } 19743 tp->t_flags &= ~TF_LASTIDLE; 19744 if (idle) { 19745 if (tp->t_flags & TF_MORETOCOME) { 19746 tp->t_flags |= TF_LASTIDLE; 19747 idle = 0; 19748 } 19749 } 19750 if ((tp->snd_una == tp->snd_max) && 19751 rack->r_ctl.rc_went_idle_time && 19752 (cts > rack->r_ctl.rc_went_idle_time)) { 19753 tot_idle = (cts - rack->r_ctl.rc_went_idle_time); 19754 if (tot_idle > rack_min_probertt_hold) { 19755 /* Count as a probe rtt */ 19756 if (rack->in_probe_rtt == 0) { 19757 rack->r_ctl.rc_lower_rtt_us_cts = cts; 19758 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; 19759 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; 19760 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; 19761 } else { 19762 rack_exit_probertt(rack, cts); 19763 } 19764 } 19765 } else 19766 tot_idle = 0; 19767 if (rack_use_fsb && 19768 (rack->r_ctl.fsb.tcp_ip_hdr) && 19769 (rack->r_fsb_inited == 0) && 19770 (rack->r_state != TCPS_CLOSED)) 19771 rack_init_fsb_block(tp, rack, tcp_outflags[tp->t_state]); 19772 if (rack->rc_sendvars_notset == 1) { 19773 rack->rc_sendvars_notset = 0; 19774 /* 19775 * Make sure any TCP timers (keep-alive) is not running. 19776 */ 19777 tcp_timer_stop(tp); 19778 } 19779 if ((rack->rack_no_prr == 1) && 19780 (rack->rc_always_pace == 0)) { 19781 /* 19782 * Sanity check before sending, if we have 19783 * no-pacing enabled and prr is turned off that 19784 * is a logistics error. Correct this by turnning 19785 * prr back on. A user *must* set some form of 19786 * pacing in order to turn PRR off. We do this 19787 * in the output path so that we can avoid socket 19788 * option ordering issues that would occur if we 19789 * tried to do it while setting rack_no_prr on. 19790 */ 19791 rack->rack_no_prr = 0; 19792 } 19793 if ((rack->pcm_enabled == 1) && 19794 (rack->pcm_needed == 0) && 19795 (tot_idle > 0)) { 19796 /* 19797 * We have been idle some micro seconds. We need 19798 * to factor this in to see if a PCM is needed. 19799 */ 19800 uint32_t rtts_idle, rnds; 19801 19802 if (tp->t_srtt) 19803 rtts_idle = tot_idle / tp->t_srtt; 19804 else 19805 rtts_idle = 0; 19806 rnds = rack->r_ctl.current_round - rack->r_ctl.last_pcm_round; 19807 rack->r_ctl.pcm_idle_rounds += rtts_idle; 19808 if ((rnds + rack->r_ctl.pcm_idle_rounds) >= rack_pcm_every_n_rounds) { 19809 rack->pcm_needed = 1; 19810 rack_log_pcm(rack, 8, rack->r_ctl.last_pcm_round, rtts_idle, rack->r_ctl.current_round ); 19811 } 19812 } 19813 again: 19814 sendalot = 0; 19815 cts = tcp_get_usecs(&tv); 19816 ms_cts = tcp_tv_to_msec(&tv); 19817 tso = 0; 19818 mtu = 0; 19819 if (TCPS_HAVEESTABLISHED(tp->t_state) && 19820 (rack->r_ctl.pcm_max_seg == 0)) { 19821 /* 19822 * We set in our first send so we know that the ctf_fixed_maxseg 19823 * has been fully set. If we do it in rack_init() we most likely 19824 * see 512 bytes so we end up at 5120, not desirable. 19825 */ 19826 rack->r_ctl.pcm_max_seg = rc_init_window(rack); 19827 if (rack->r_ctl.pcm_max_seg < (ctf_fixed_maxseg(tp) * 10)) { 19828 /* 19829 * Assure our initial PCM probe is at least 10 MSS. 19830 */ 19831 rack->r_ctl.pcm_max_seg = ctf_fixed_maxseg(tp) * 10; 19832 } 19833 } 19834 if ((rack->r_ctl.pcm_max_seg != 0) && (rack->pcm_needed == 1)) { 19835 uint32_t rw_avail, cwa; 19836 19837 if (tp->snd_wnd > ctf_outstanding(tp)) 19838 rw_avail = tp->snd_wnd - ctf_outstanding(tp); 19839 else 19840 rw_avail = 0; 19841 if (tp->snd_cwnd > ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked)) 19842 cwa = tp->snd_cwnd -ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 19843 else 19844 cwa = 0; 19845 if ((cwa >= rack->r_ctl.pcm_max_seg) && 19846 (rw_avail > rack->r_ctl.pcm_max_seg)) { 19847 /* Raise up the max seg for this trip through */ 19848 pace_max_seg = rack->r_ctl.pcm_max_seg; 19849 /* Disable any fast output */ 19850 rack->r_fast_output = 0; 19851 } 19852 if (rack_verbose_logging) { 19853 rack_log_pcm(rack, 4, 19854 cwa, rack->r_ctl.pcm_max_seg, rw_avail); 19855 } 19856 } 19857 sb_offset = tp->snd_max - tp->snd_una; 19858 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 19859 flags = tcp_outflags[tp->t_state]; 19860 while (rack->rc_free_cnt < rack_free_cache) { 19861 rsm = rack_alloc(rack); 19862 if (rsm == NULL) { 19863 if (hpts_calling) 19864 /* Retry in a ms */ 19865 pacing_delay = (1 * HPTS_USEC_IN_MSEC); 19866 so = inp->inp_socket; 19867 sb = &so->so_snd; 19868 goto just_return_nolock; 19869 } 19870 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext); 19871 rack->rc_free_cnt++; 19872 rsm = NULL; 19873 } 19874 sack_rxmit = 0; 19875 len = 0; 19876 rsm = NULL; 19877 if (flags & TH_RST) { 19878 SOCK_SENDBUF_LOCK(inp->inp_socket); 19879 so = inp->inp_socket; 19880 sb = &so->so_snd; 19881 goto send; 19882 } 19883 if (rack->r_ctl.rc_resend) { 19884 /* Retransmit timer */ 19885 rsm = rack->r_ctl.rc_resend; 19886 rack->r_ctl.rc_resend = NULL; 19887 len = rsm->r_end - rsm->r_start; 19888 sack_rxmit = 1; 19889 sendalot = 0; 19890 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 19891 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 19892 __func__, __LINE__, 19893 rsm->r_start, tp->snd_una, tp, rack, rsm)); 19894 sb_offset = rsm->r_start - tp->snd_una; 19895 rack_validate_sizes(rack, &len, segsiz, pace_max_seg); 19896 } else if (rack->r_collapse_point_valid && 19897 ((rsm = rack_check_collapsed(rack, cts)) != NULL)) { 19898 /* 19899 * If an RSM is returned then enough time has passed 19900 * for us to retransmit it. Move up the collapse point, 19901 * since this rsm has its chance to retransmit now. 19902 */ 19903 tcp_trace_point(rack->rc_tp, TCP_TP_COLLAPSED_RXT); 19904 rack->r_ctl.last_collapse_point = rsm->r_end; 19905 /* Are we done? */ 19906 if (SEQ_GEQ(rack->r_ctl.last_collapse_point, 19907 rack->r_ctl.high_collapse_point)) 19908 rack->r_collapse_point_valid = 0; 19909 sack_rxmit = 1; 19910 /* We are not doing a TLP */ 19911 doing_tlp = 0; 19912 len = rsm->r_end - rsm->r_start; 19913 sb_offset = rsm->r_start - tp->snd_una; 19914 sendalot = 0; 19915 rack_validate_sizes(rack, &len, segsiz, pace_max_seg); 19916 } else if ((rsm = tcp_rack_output(tp, rack, cts)) != NULL) { 19917 /* We have a retransmit that takes precedence */ 19918 if ((!IN_FASTRECOVERY(tp->t_flags)) && 19919 ((rsm->r_flags & RACK_MUST_RXT) == 0) && 19920 ((tp->t_flags & TF_WASFRECOVERY) == 0)) { 19921 /* Enter recovery if not induced by a time-out */ 19922 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__); 19923 } 19924 #ifdef INVARIANTS 19925 if (SEQ_LT(rsm->r_start, tp->snd_una)) { 19926 panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n", 19927 tp, rack, rsm, rsm->r_start, tp->snd_una); 19928 } 19929 #endif 19930 len = rsm->r_end - rsm->r_start; 19931 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 19932 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 19933 __func__, __LINE__, 19934 rsm->r_start, tp->snd_una, tp, rack, rsm)); 19935 sb_offset = rsm->r_start - tp->snd_una; 19936 sendalot = 0; 19937 rack_validate_sizes(rack, &len, segsiz, pace_max_seg); 19938 if (len > 0) { 19939 sack_rxmit = 1; 19940 KMOD_TCPSTAT_INC(tcps_sack_rexmits); 19941 KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes, 19942 min(len, segsiz)); 19943 } 19944 } else if (rack->r_ctl.rc_tlpsend) { 19945 /* Tail loss probe */ 19946 long cwin; 19947 long tlen; 19948 19949 /* 19950 * Check if we can do a TLP with a RACK'd packet 19951 * this can happen if we are not doing the rack 19952 * cheat and we skipped to a TLP and it 19953 * went off. 19954 */ 19955 rsm = rack->r_ctl.rc_tlpsend; 19956 /* We are doing a TLP make sure the flag is preent */ 19957 rsm->r_flags |= RACK_TLP; 19958 rack->r_ctl.rc_tlpsend = NULL; 19959 sack_rxmit = 1; 19960 tlen = rsm->r_end - rsm->r_start; 19961 if (tlen > segsiz) 19962 tlen = segsiz; 19963 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 19964 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 19965 __func__, __LINE__, 19966 rsm->r_start, tp->snd_una, tp, rack, rsm)); 19967 sb_offset = rsm->r_start - tp->snd_una; 19968 cwin = min(tp->snd_wnd, tlen); 19969 len = cwin; 19970 } 19971 if (rack->r_must_retran && 19972 (doing_tlp == 0) && 19973 (SEQ_GT(tp->snd_max, tp->snd_una)) && 19974 (rsm == NULL)) { 19975 /* 19976 * There are two different ways that we 19977 * can get into this block: 19978 * a) This is a non-sack connection, we had a time-out 19979 * and thus r_must_retran was set and everything 19980 * left outstanding as been marked for retransmit. 19981 * b) The MTU of the path shrank, so that everything 19982 * was marked to be retransmitted with the smaller 19983 * mtu and r_must_retran was set. 19984 * 19985 * This means that we expect the sendmap (outstanding) 19986 * to all be marked must. We can use the tmap to 19987 * look at them. 19988 * 19989 */ 19990 int sendwin, flight; 19991 19992 sendwin = min(tp->snd_wnd, tp->snd_cwnd); 19993 flight = ctf_flight_size(tp, rack->r_ctl.rc_out_at_rto); 19994 if (flight >= sendwin) { 19995 /* 19996 * We can't send yet. 19997 */ 19998 so = inp->inp_socket; 19999 sb = &so->so_snd; 20000 goto just_return_nolock; 20001 } 20002 /* 20003 * This is the case a/b mentioned above. All 20004 * outstanding/not-acked should be marked. 20005 * We can use the tmap to find them. 20006 */ 20007 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 20008 if (rsm == NULL) { 20009 /* TSNH */ 20010 rack->r_must_retran = 0; 20011 rack->r_ctl.rc_out_at_rto = 0; 20012 so = inp->inp_socket; 20013 sb = &so->so_snd; 20014 goto just_return_nolock; 20015 } 20016 if ((rsm->r_flags & RACK_MUST_RXT) == 0) { 20017 /* 20018 * The first one does not have the flag, did we collapse 20019 * further up in our list? 20020 */ 20021 rack->r_must_retran = 0; 20022 rack->r_ctl.rc_out_at_rto = 0; 20023 rsm = NULL; 20024 sack_rxmit = 0; 20025 } else { 20026 sack_rxmit = 1; 20027 len = rsm->r_end - rsm->r_start; 20028 sb_offset = rsm->r_start - tp->snd_una; 20029 sendalot = 0; 20030 if ((rack->full_size_rxt == 0) && 20031 (rack->shape_rxt_to_pacing_min == 0) && 20032 (len >= segsiz)) 20033 len = segsiz; 20034 else if (rack->shape_rxt_to_pacing_min && 20035 rack->gp_ready) { 20036 /* We use pacing min as shaping len req */ 20037 uint32_t maxlen; 20038 20039 maxlen = rack_get_hpts_pacing_min_for_bw(rack, segsiz); 20040 if (len > maxlen) 20041 len = maxlen; 20042 } 20043 /* 20044 * Delay removing the flag RACK_MUST_RXT so 20045 * that the fastpath for retransmit will 20046 * work with this rsm. 20047 */ 20048 } 20049 } 20050 /* 20051 * Enforce a connection sendmap count limit if set 20052 * as long as we are not retransmiting. 20053 */ 20054 if ((rsm == NULL) && 20055 (V_tcp_map_entries_limit > 0) && 20056 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 20057 counter_u64_add(rack_to_alloc_limited, 1); 20058 if (!rack->alloc_limit_reported) { 20059 rack->alloc_limit_reported = 1; 20060 counter_u64_add(rack_alloc_limited_conns, 1); 20061 } 20062 so = inp->inp_socket; 20063 sb = &so->so_snd; 20064 goto just_return_nolock; 20065 } 20066 if (rsm && (rsm->r_flags & RACK_HAS_FIN)) { 20067 /* we are retransmitting the fin */ 20068 len--; 20069 if (len) { 20070 /* 20071 * When retransmitting data do *not* include the 20072 * FIN. This could happen from a TLP probe. 20073 */ 20074 flags &= ~TH_FIN; 20075 } 20076 } 20077 if (rsm && rack->r_fsb_inited && 20078 rack_use_rsm_rfo && 20079 ((rsm->r_flags & RACK_HAS_FIN) == 0)) { 20080 int ret; 20081 20082 ret = rack_fast_rsm_output(tp, rack, rsm, ts_val, cts, ms_cts, &tv, len, doing_tlp); 20083 if (ret == 0) 20084 return (0); 20085 } 20086 so = inp->inp_socket; 20087 sb = &so->so_snd; 20088 if (do_a_prefetch == 0) { 20089 kern_prefetch(sb, &do_a_prefetch); 20090 do_a_prefetch = 1; 20091 } 20092 #ifdef NETFLIX_SHARED_CWND 20093 if ((tp->t_flags2 & TF2_TCP_SCWND_ALLOWED) && 20094 rack->rack_enable_scwnd) { 20095 /* We are doing cwnd sharing */ 20096 if (rack->gp_ready && 20097 (rack->rack_attempted_scwnd == 0) && 20098 (rack->r_ctl.rc_scw == NULL) && 20099 tp->t_lib) { 20100 /* The pcbid is in, lets make an attempt */ 20101 counter_u64_add(rack_try_scwnd, 1); 20102 rack->rack_attempted_scwnd = 1; 20103 rack->r_ctl.rc_scw = tcp_shared_cwnd_alloc(tp, 20104 &rack->r_ctl.rc_scw_index, 20105 segsiz); 20106 } 20107 if (rack->r_ctl.rc_scw && 20108 (rack->rack_scwnd_is_idle == 1) && 20109 sbavail(&so->so_snd)) { 20110 /* we are no longer out of data */ 20111 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 20112 rack->rack_scwnd_is_idle = 0; 20113 } 20114 if (rack->r_ctl.rc_scw) { 20115 /* First lets update and get the cwnd */ 20116 rack->r_ctl.cwnd_to_use = cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw, 20117 rack->r_ctl.rc_scw_index, 20118 tp->snd_cwnd, tp->snd_wnd, segsiz); 20119 } 20120 } 20121 #endif 20122 /* 20123 * Get standard flags, and add SYN or FIN if requested by 'hidden' 20124 * state flags. 20125 */ 20126 if (tp->t_flags & TF_NEEDFIN) 20127 flags |= TH_FIN; 20128 if (tp->t_flags & TF_NEEDSYN) 20129 flags |= TH_SYN; 20130 if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { 20131 void *end_rsm; 20132 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 20133 if (end_rsm) 20134 kern_prefetch(end_rsm, &prefetch_rsm); 20135 prefetch_rsm = 1; 20136 } 20137 SOCK_SENDBUF_LOCK(so); 20138 if ((sack_rxmit == 0) && 20139 (TCPS_HAVEESTABLISHED(tp->t_state) || 20140 (tp->t_flags & TF_FASTOPEN))) { 20141 /* 20142 * We are not retransmitting (sack_rxmit is 0) so we 20143 * are sending new data. This is always based on snd_max. 20144 * Now in theory snd_max may be equal to snd_una, if so 20145 * then nothing is outstanding and the offset would be 0. 20146 */ 20147 uint32_t avail; 20148 20149 avail = sbavail(sb); 20150 if (SEQ_GT(tp->snd_max, tp->snd_una) && avail) 20151 sb_offset = tp->snd_max - tp->snd_una; 20152 else 20153 sb_offset = 0; 20154 if ((IN_FASTRECOVERY(tp->t_flags) == 0) || rack->rack_no_prr) { 20155 if (rack->r_ctl.rc_tlp_new_data) { 20156 /* TLP is forcing out new data */ 20157 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) { 20158 rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset); 20159 } 20160 if ((rack->r_ctl.rc_tlp_new_data + sb_offset) > tp->snd_wnd) { 20161 if (tp->snd_wnd > sb_offset) 20162 len = tp->snd_wnd - sb_offset; 20163 else 20164 len = 0; 20165 } else { 20166 len = rack->r_ctl.rc_tlp_new_data; 20167 } 20168 rack->r_ctl.rc_tlp_new_data = 0; 20169 } else { 20170 len = rack_what_can_we_send(tp, rack, cwnd_to_use, avail, sb_offset); 20171 } 20172 if ((rack->r_ctl.crte == NULL) && 20173 IN_FASTRECOVERY(tp->t_flags) && 20174 (rack->full_size_rxt == 0) && 20175 (rack->shape_rxt_to_pacing_min == 0) && 20176 (len > segsiz)) { 20177 /* 20178 * For prr=off, we need to send only 1 MSS 20179 * at a time. We do this because another sack could 20180 * be arriving that causes us to send retransmits and 20181 * we don't want to be on a long pace due to a larger send 20182 * that keeps us from sending out the retransmit. 20183 */ 20184 len = segsiz; 20185 } else if (rack->shape_rxt_to_pacing_min && 20186 rack->gp_ready) { 20187 /* We use pacing min as shaping len req */ 20188 uint32_t maxlen; 20189 20190 maxlen = rack_get_hpts_pacing_min_for_bw(rack, segsiz); 20191 if (len > maxlen) 20192 len = maxlen; 20193 }/* The else is full_size_rxt is on so send it all */ 20194 } else { 20195 uint32_t outstanding; 20196 /* 20197 * We are inside of a Fast recovery episode, this 20198 * is caused by a SACK or 3 dup acks. At this point 20199 * we have sent all the retransmissions and we rely 20200 * on PRR to dictate what we will send in the form of 20201 * new data. 20202 */ 20203 20204 outstanding = tp->snd_max - tp->snd_una; 20205 if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) { 20206 if (tp->snd_wnd > outstanding) { 20207 len = tp->snd_wnd - outstanding; 20208 /* Check to see if we have the data */ 20209 if ((sb_offset + len) > avail) { 20210 /* It does not all fit */ 20211 if (avail > sb_offset) 20212 len = avail - sb_offset; 20213 else 20214 len = 0; 20215 } 20216 } else { 20217 len = 0; 20218 } 20219 } else if (avail > sb_offset) { 20220 len = avail - sb_offset; 20221 } else { 20222 len = 0; 20223 } 20224 if (len > 0) { 20225 if (len > rack->r_ctl.rc_prr_sndcnt) { 20226 len = rack->r_ctl.rc_prr_sndcnt; 20227 } 20228 if (len > 0) { 20229 sub_from_prr = 1; 20230 } 20231 } 20232 if (len > segsiz) { 20233 /* 20234 * We should never send more than a MSS when 20235 * retransmitting or sending new data in prr 20236 * mode unless the override flag is on. Most 20237 * likely the PRR algorithm is not going to 20238 * let us send a lot as well :-) 20239 */ 20240 if (rack->r_ctl.rc_prr_sendalot == 0) { 20241 len = segsiz; 20242 } 20243 } else if (len < segsiz) { 20244 /* 20245 * Do we send any? The idea here is if the 20246 * send empty's the socket buffer we want to 20247 * do it. However if not then lets just wait 20248 * for our prr_sndcnt to get bigger. 20249 */ 20250 long leftinsb; 20251 20252 leftinsb = sbavail(sb) - sb_offset; 20253 if (leftinsb > len) { 20254 /* This send does not empty the sb */ 20255 len = 0; 20256 } 20257 } 20258 } 20259 } else if (!TCPS_HAVEESTABLISHED(tp->t_state)) { 20260 /* 20261 * If you have not established 20262 * and are not doing FAST OPEN 20263 * no data please. 20264 */ 20265 if ((sack_rxmit == 0) && 20266 !(tp->t_flags & TF_FASTOPEN)) { 20267 len = 0; 20268 sb_offset = 0; 20269 } 20270 } 20271 if (prefetch_so_done == 0) { 20272 kern_prefetch(so, &prefetch_so_done); 20273 prefetch_so_done = 1; 20274 } 20275 orig_len = len; 20276 /* 20277 * Lop off SYN bit if it has already been sent. However, if this is 20278 * SYN-SENT state and if segment contains data and if we don't know 20279 * that foreign host supports TAO, suppress sending segment. 20280 */ 20281 if ((flags & TH_SYN) && 20282 SEQ_GT(tp->snd_max, tp->snd_una) && 20283 ((sack_rxmit == 0) && 20284 (tp->t_rxtshift == 0))) { 20285 /* 20286 * When sending additional segments following a TFO SYN|ACK, 20287 * do not include the SYN bit. 20288 */ 20289 if ((tp->t_flags & TF_FASTOPEN) && 20290 (tp->t_state == TCPS_SYN_RECEIVED)) 20291 flags &= ~TH_SYN; 20292 } 20293 /* 20294 * Be careful not to send data and/or FIN on SYN segments. This 20295 * measure is needed to prevent interoperability problems with not 20296 * fully conformant TCP implementations. 20297 */ 20298 if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { 20299 len = 0; 20300 flags &= ~TH_FIN; 20301 } 20302 /* 20303 * On TFO sockets, ensure no data is sent in the following cases: 20304 * 20305 * - When retransmitting SYN|ACK on a passively-created socket 20306 * 20307 * - When retransmitting SYN on an actively created socket 20308 * 20309 * - When sending a zero-length cookie (cookie request) on an 20310 * actively created socket 20311 * 20312 * - When the socket is in the CLOSED state (RST is being sent) 20313 */ 20314 if ((tp->t_flags & TF_FASTOPEN) && 20315 (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || 20316 ((tp->t_state == TCPS_SYN_SENT) && 20317 (tp->t_tfo_client_cookie_len == 0)) || 20318 (flags & TH_RST))) { 20319 sack_rxmit = 0; 20320 len = 0; 20321 } 20322 /* Without fast-open there should never be data sent on a SYN */ 20323 if ((flags & TH_SYN) && !(tp->t_flags & TF_FASTOPEN)) { 20324 len = 0; 20325 } 20326 if ((len > segsiz) && (tcp_dsack_block_exists(tp))) { 20327 /* We only send 1 MSS if we have a DSACK block */ 20328 add_flag |= RACK_SENT_W_DSACK; 20329 len = segsiz; 20330 } 20331 if (len <= 0) { 20332 /* 20333 * We have nothing to send, or the window shrank, or 20334 * is closed, do we need to go into persists? 20335 */ 20336 len = 0; 20337 if ((tp->snd_wnd == 0) && 20338 (TCPS_HAVEESTABLISHED(tp->t_state)) && 20339 (tp->snd_una == tp->snd_max) && 20340 (sb_offset < (int)sbavail(sb))) { 20341 rack_enter_persist(tp, rack, cts, tp->snd_una); 20342 } 20343 } else if ((rsm == NULL) && 20344 (doing_tlp == 0) && 20345 (len < pace_max_seg)) { 20346 /* 20347 * We are not sending a maximum sized segment for 20348 * some reason. Should we not send anything (think 20349 * sws or persists)? 20350 */ 20351 if ((tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg)) && 20352 (TCPS_HAVEESTABLISHED(tp->t_state)) && 20353 (len < minseg) && 20354 (len < (int)(sbavail(sb) - sb_offset))) { 20355 /* 20356 * Here the rwnd is less than 20357 * the minimum pacing size, this is not a retransmit, 20358 * we are established and 20359 * the send is not the last in the socket buffer 20360 * we send nothing, and we may enter persists 20361 * if nothing is outstanding. 20362 */ 20363 len = 0; 20364 if (tp->snd_max == tp->snd_una) { 20365 /* 20366 * Nothing out we can 20367 * go into persists. 20368 */ 20369 rack_enter_persist(tp, rack, cts, tp->snd_una); 20370 } 20371 } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) && 20372 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && 20373 (len < (int)(sbavail(sb) - sb_offset)) && 20374 (len < minseg)) { 20375 /* 20376 * Here we are not retransmitting, and 20377 * the cwnd is not so small that we could 20378 * not send at least a min size (rxt timer 20379 * not having gone off), We have 2 segments or 20380 * more already in flight, its not the tail end 20381 * of the socket buffer and the cwnd is blocking 20382 * us from sending out a minimum pacing segment size. 20383 * Lets not send anything. 20384 */ 20385 len = 0; 20386 } else if (((tp->snd_wnd - ctf_outstanding(tp)) < 20387 min((rack->r_ctl.rc_high_rwnd/2), minseg)) && 20388 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && 20389 (len < (int)(sbavail(sb) - sb_offset)) && 20390 (TCPS_HAVEESTABLISHED(tp->t_state))) { 20391 /* 20392 * Here we have a send window but we have 20393 * filled it up and we can't send another pacing segment. 20394 * We also have in flight more than 2 segments 20395 * and we are not completing the sb i.e. we allow 20396 * the last bytes of the sb to go out even if 20397 * its not a full pacing segment. 20398 */ 20399 len = 0; 20400 } else if ((rack->r_ctl.crte != NULL) && 20401 (tp->snd_wnd >= (pace_max_seg * max(1, rack_hw_rwnd_factor))) && 20402 (cwnd_to_use >= (pace_max_seg + (4 * segsiz))) && 20403 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) >= (2 * segsiz)) && 20404 (len < (int)(sbavail(sb) - sb_offset))) { 20405 /* 20406 * Here we are doing hardware pacing, this is not a TLP, 20407 * we are not sending a pace max segment size, there is rwnd 20408 * room to send at least N pace_max_seg, the cwnd is greater 20409 * than or equal to a full pacing segments plus 4 mss and we have 2 or 20410 * more segments in flight and its not the tail of the socket buffer. 20411 * 20412 * We don't want to send instead we need to get more ack's in to 20413 * allow us to send a full pacing segment. Normally, if we are pacing 20414 * about the right speed, we should have finished our pacing 20415 * send as most of the acks have come back if we are at the 20416 * right rate. This is a bit fuzzy since return path delay 20417 * can delay the acks, which is why we want to make sure we 20418 * have cwnd space to have a bit more than a max pace segments in flight. 20419 * 20420 * If we have not gotten our acks back we are pacing at too high a 20421 * rate delaying will not hurt and will bring our GP estimate down by 20422 * injecting the delay. If we don't do this we will send 20423 * 2 MSS out in response to the acks being clocked in which 20424 * defeats the point of hw-pacing (i.e. to help us get 20425 * larger TSO's out). 20426 */ 20427 len = 0; 20428 } 20429 20430 } 20431 /* len will be >= 0 after this point. */ 20432 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 20433 rack_sndbuf_autoscale(rack); 20434 /* 20435 * Decide if we can use TCP Segmentation Offloading (if supported by 20436 * hardware). 20437 * 20438 * TSO may only be used if we are in a pure bulk sending state. The 20439 * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP 20440 * options prevent using TSO. With TSO the TCP header is the same 20441 * (except for the sequence number) for all generated packets. This 20442 * makes it impossible to transmit any options which vary per 20443 * generated segment or packet. 20444 * 20445 * IPv4 handling has a clear separation of ip options and ip header 20446 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does 20447 * the right thing below to provide length of just ip options and thus 20448 * checking for ipoptlen is enough to decide if ip options are present. 20449 */ 20450 ipoptlen = 0; 20451 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 20452 /* 20453 * Pre-calculate here as we save another lookup into the darknesses 20454 * of IPsec that way and can actually decide if TSO is ok. 20455 */ 20456 #ifdef INET6 20457 if (isipv6 && IPSEC_ENABLED(ipv6)) 20458 ipsec_optlen = IPSEC_HDRSIZE(ipv6, inp); 20459 #ifdef INET 20460 else 20461 #endif 20462 #endif /* INET6 */ 20463 #ifdef INET 20464 if (IPSEC_ENABLED(ipv4)) 20465 ipsec_optlen = IPSEC_HDRSIZE(ipv4, inp); 20466 #endif /* INET */ 20467 #endif 20468 20469 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 20470 ipoptlen += ipsec_optlen; 20471 #endif 20472 if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > segsiz && 20473 (tp->t_port == 0) && 20474 ((tp->t_flags & TF_SIGNATURE) == 0) && 20475 sack_rxmit == 0 && 20476 ipoptlen == 0) 20477 tso = 1; 20478 { 20479 uint32_t outstanding __unused; 20480 20481 outstanding = tp->snd_max - tp->snd_una; 20482 if (tp->t_flags & TF_SENTFIN) { 20483 /* 20484 * If we sent a fin, snd_max is 1 higher than 20485 * snd_una 20486 */ 20487 outstanding--; 20488 } 20489 if (sack_rxmit) { 20490 if ((rsm->r_flags & RACK_HAS_FIN) == 0) 20491 flags &= ~TH_FIN; 20492 } 20493 } 20494 recwin = lmin(lmax(sbspace(&so->so_rcv), 0), 20495 (long)TCP_MAXWIN << tp->rcv_scale); 20496 20497 /* 20498 * Sender silly window avoidance. We transmit under the following 20499 * conditions when len is non-zero: 20500 * 20501 * - We have a full segment (or more with TSO) - This is the last 20502 * buffer in a write()/send() and we are either idle or running 20503 * NODELAY - we've timed out (e.g. persist timer) - we have more 20504 * then 1/2 the maximum send window's worth of data (receiver may be 20505 * limited the window size) - we need to retransmit 20506 */ 20507 if (len) { 20508 if (len >= segsiz) { 20509 goto send; 20510 } 20511 /* 20512 * NOTE! on localhost connections an 'ack' from the remote 20513 * end may occur synchronously with the output and cause us 20514 * to flush a buffer queued with moretocome. XXX 20515 * 20516 */ 20517 if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ 20518 (idle || (tp->t_flags & TF_NODELAY)) && 20519 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && 20520 (tp->t_flags & TF_NOPUSH) == 0) { 20521 pass = 2; 20522 goto send; 20523 } 20524 if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ 20525 pass = 22; 20526 goto send; 20527 } 20528 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { 20529 pass = 4; 20530 goto send; 20531 } 20532 if (sack_rxmit) { 20533 pass = 6; 20534 goto send; 20535 } 20536 if (((tp->snd_wnd - ctf_outstanding(tp)) < segsiz) && 20537 (ctf_outstanding(tp) < (segsiz * 2))) { 20538 /* 20539 * We have less than two MSS outstanding (delayed ack) 20540 * and our rwnd will not let us send a full sized 20541 * MSS. Lets go ahead and let this small segment 20542 * out because we want to try to have at least two 20543 * packets inflight to not be caught by delayed ack. 20544 */ 20545 pass = 12; 20546 goto send; 20547 } 20548 } 20549 /* 20550 * Sending of standalone window updates. 20551 * 20552 * Window updates are important when we close our window due to a 20553 * full socket buffer and are opening it again after the application 20554 * reads data from it. Once the window has opened again and the 20555 * remote end starts to send again the ACK clock takes over and 20556 * provides the most current window information. 20557 * 20558 * We must avoid the silly window syndrome whereas every read from 20559 * the receive buffer, no matter how small, causes a window update 20560 * to be sent. We also should avoid sending a flurry of window 20561 * updates when the socket buffer had queued a lot of data and the 20562 * application is doing small reads. 20563 * 20564 * Prevent a flurry of pointless window updates by only sending an 20565 * update when we can increase the advertized window by more than 20566 * 1/4th of the socket buffer capacity. When the buffer is getting 20567 * full or is very small be more aggressive and send an update 20568 * whenever we can increase by two mss sized segments. In all other 20569 * situations the ACK's to new incoming data will carry further 20570 * window increases. 20571 * 20572 * Don't send an independent window update if a delayed ACK is 20573 * pending (it will get piggy-backed on it) or the remote side 20574 * already has done a half-close and won't send more data. Skip 20575 * this if the connection is in T/TCP half-open state. 20576 */ 20577 if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && 20578 !(tp->t_flags & TF_DELACK) && 20579 !TCPS_HAVERCVDFIN(tp->t_state)) { 20580 /* 20581 * "adv" is the amount we could increase the window, taking 20582 * into account that we are limited by TCP_MAXWIN << 20583 * tp->rcv_scale. 20584 */ 20585 int32_t adv; 20586 int oldwin; 20587 20588 adv = recwin; 20589 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { 20590 oldwin = (tp->rcv_adv - tp->rcv_nxt); 20591 if (adv > oldwin) 20592 adv -= oldwin; 20593 else { 20594 /* We can't increase the window */ 20595 adv = 0; 20596 } 20597 } else 20598 oldwin = 0; 20599 20600 /* 20601 * If the new window size ends up being the same as or less 20602 * than the old size when it is scaled, then don't force 20603 * a window update. 20604 */ 20605 if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale) 20606 goto dontupdate; 20607 20608 if (adv >= (int32_t)(2 * segsiz) && 20609 (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || 20610 recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) || 20611 so->so_rcv.sb_hiwat <= 8 * segsiz)) { 20612 pass = 7; 20613 goto send; 20614 } 20615 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) { 20616 pass = 23; 20617 goto send; 20618 } 20619 } 20620 dontupdate: 20621 20622 /* 20623 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW 20624 * is also a catch-all for the retransmit timer timeout case. 20625 */ 20626 if (tp->t_flags & TF_ACKNOW) { 20627 pass = 8; 20628 goto send; 20629 } 20630 if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) { 20631 pass = 9; 20632 goto send; 20633 } 20634 /* 20635 * If our state indicates that FIN should be sent and we have not 20636 * yet done so, then we need to send. 20637 */ 20638 if ((flags & TH_FIN) && 20639 (tp->snd_max == tp->snd_una)) { 20640 pass = 11; 20641 goto send; 20642 } 20643 /* 20644 * No reason to send a segment, just return. 20645 */ 20646 just_return: 20647 SOCK_SENDBUF_UNLOCK(so); 20648 just_return_nolock: 20649 { 20650 int app_limited = CTF_JR_SENT_DATA; 20651 20652 if ((tp->t_flags & TF_FASTOPEN) == 0 && 20653 (flags & TH_FIN) && 20654 (len == 0) && 20655 (sbused(sb) == (tp->snd_max - tp->snd_una)) && 20656 ((tp->snd_max - tp->snd_una) <= segsiz)) { 20657 /* 20658 * Ok less than or right at a MSS is 20659 * outstanding. The original FreeBSD stack would 20660 * have sent a FIN, which can speed things up for 20661 * a transactional application doing a MSG_WAITALL. 20662 * To speed things up since we do *not* send a FIN 20663 * if data is outstanding, we send a "challenge ack". 20664 * The idea behind that is instead of having to have 20665 * the peer wait for the delayed-ack timer to run off 20666 * we send an ack that makes the peer send us an ack. 20667 */ 20668 rack_send_ack_challange(rack); 20669 } 20670 if (tot_len_this_send > 0) { 20671 rack->r_ctl.fsb.recwin = recwin; 20672 pacing_delay = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz, __LINE__); 20673 if ((error == 0) && 20674 rack_use_rfo && 20675 ((flags & (TH_SYN|TH_FIN)) == 0) && 20676 (ipoptlen == 0) && 20677 rack->r_fsb_inited && 20678 TCPS_HAVEESTABLISHED(tp->t_state) && 20679 ((IN_RECOVERY(tp->t_flags)) == 0) && 20680 (doing_tlp == 0) && 20681 (rack->r_must_retran == 0) && 20682 ((tp->t_flags & TF_NEEDFIN) == 0) && 20683 (len > 0) && (orig_len > 0) && 20684 (orig_len > len) && 20685 ((orig_len - len) >= segsiz) && 20686 ((optlen == 0) || 20687 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { 20688 /* We can send at least one more MSS using our fsb */ 20689 rack_setup_fast_output(tp, rack, sb, len, orig_len, 20690 segsiz, pace_max_seg, hw_tls, flags); 20691 } else 20692 rack->r_fast_output = 0; 20693 rack_log_fsb(rack, tp, so, flags, 20694 ipoptlen, orig_len, len, 0, 20695 1, optlen, __LINE__, 1); 20696 /* Assure when we leave that snd_nxt will point to top */ 20697 if (SEQ_GT(tp->snd_max, tp->snd_nxt)) 20698 tp->snd_nxt = tp->snd_max; 20699 } else { 20700 int end_window = 0; 20701 uint32_t seq = tp->gput_ack; 20702 20703 rsm = tqhash_max(rack->r_ctl.tqh); 20704 if (rsm) { 20705 /* 20706 * Mark the last sent that we just-returned (hinting 20707 * that delayed ack may play a role in any rtt measurement). 20708 */ 20709 rsm->r_just_ret = 1; 20710 } 20711 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); 20712 rack->r_ctl.rc_agg_delayed = 0; 20713 rack->r_early = 0; 20714 rack->r_late = 0; 20715 rack->r_ctl.rc_agg_early = 0; 20716 if ((ctf_outstanding(tp) + 20717 min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)), 20718 minseg)) >= tp->snd_wnd) { 20719 /* We are limited by the rwnd */ 20720 app_limited = CTF_JR_RWND_LIMITED; 20721 if (IN_FASTRECOVERY(tp->t_flags)) 20722 rack->r_ctl.rc_prr_sndcnt = 0; 20723 } else if (ctf_outstanding(tp) >= sbavail(sb)) { 20724 /* We are limited by whats available -- app limited */ 20725 app_limited = CTF_JR_APP_LIMITED; 20726 if (IN_FASTRECOVERY(tp->t_flags)) 20727 rack->r_ctl.rc_prr_sndcnt = 0; 20728 } else if ((idle == 0) && 20729 ((tp->t_flags & TF_NODELAY) == 0) && 20730 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && 20731 (len < segsiz)) { 20732 /* 20733 * No delay is not on and the 20734 * user is sending less than 1MSS. This 20735 * brings out SWS avoidance so we 20736 * don't send. Another app-limited case. 20737 */ 20738 app_limited = CTF_JR_APP_LIMITED; 20739 } else if (tp->t_flags & TF_NOPUSH) { 20740 /* 20741 * The user has requested no push of 20742 * the last segment and we are 20743 * at the last segment. Another app 20744 * limited case. 20745 */ 20746 app_limited = CTF_JR_APP_LIMITED; 20747 } else if ((ctf_outstanding(tp) + minseg) > cwnd_to_use) { 20748 /* Its the cwnd */ 20749 app_limited = CTF_JR_CWND_LIMITED; 20750 } else if (IN_FASTRECOVERY(tp->t_flags) && 20751 (rack->rack_no_prr == 0) && 20752 (rack->r_ctl.rc_prr_sndcnt < segsiz)) { 20753 app_limited = CTF_JR_PRR; 20754 } else { 20755 /* Now why here are we not sending? */ 20756 #ifdef NOW 20757 #ifdef INVARIANTS 20758 panic("rack:%p hit JR_ASSESSING case cwnd_to_use:%u?", rack, cwnd_to_use); 20759 #endif 20760 #endif 20761 app_limited = CTF_JR_ASSESSING; 20762 } 20763 /* 20764 * App limited in some fashion, for our pacing GP 20765 * measurements we don't want any gap (even cwnd). 20766 * Close down the measurement window. 20767 */ 20768 if (rack_cwnd_block_ends_measure && 20769 ((app_limited == CTF_JR_CWND_LIMITED) || 20770 (app_limited == CTF_JR_PRR))) { 20771 /* 20772 * The reason we are not sending is 20773 * the cwnd (or prr). We have been configured 20774 * to end the measurement window in 20775 * this case. 20776 */ 20777 end_window = 1; 20778 } else if (rack_rwnd_block_ends_measure && 20779 (app_limited == CTF_JR_RWND_LIMITED)) { 20780 /* 20781 * We are rwnd limited and have been 20782 * configured to end the measurement 20783 * window in this case. 20784 */ 20785 end_window = 1; 20786 } else if (app_limited == CTF_JR_APP_LIMITED) { 20787 /* 20788 * A true application limited period, we have 20789 * ran out of data. 20790 */ 20791 end_window = 1; 20792 } else if (app_limited == CTF_JR_ASSESSING) { 20793 /* 20794 * In the assessing case we hit the end of 20795 * the if/else and had no known reason 20796 * This will panic us under invariants.. 20797 * 20798 * If we get this out in logs we need to 20799 * investagate which reason we missed. 20800 */ 20801 end_window = 1; 20802 } 20803 if (end_window) { 20804 uint8_t log = 0; 20805 20806 /* Adjust the Gput measurement */ 20807 if ((tp->t_flags & TF_GPUTINPROG) && 20808 SEQ_GT(tp->gput_ack, tp->snd_max)) { 20809 tp->gput_ack = tp->snd_max; 20810 if ((tp->gput_ack - tp->gput_seq) < (MIN_GP_WIN * segsiz)) { 20811 /* 20812 * There is not enough to measure. 20813 */ 20814 tp->t_flags &= ~TF_GPUTINPROG; 20815 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 20816 rack->r_ctl.rc_gp_srtt /*flex1*/, 20817 tp->gput_seq, 20818 0, 0, 18, __LINE__, NULL, 0); 20819 } else 20820 log = 1; 20821 } 20822 /* Mark the last packet as app limited */ 20823 rsm = tqhash_max(rack->r_ctl.tqh); 20824 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { 20825 if (rack->r_ctl.rc_app_limited_cnt == 0) 20826 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; 20827 else { 20828 /* 20829 * Go out to the end app limited and mark 20830 * this new one as next and move the end_appl up 20831 * to this guy. 20832 */ 20833 if (rack->r_ctl.rc_end_appl) 20834 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; 20835 rack->r_ctl.rc_end_appl = rsm; 20836 } 20837 rsm->r_flags |= RACK_APP_LIMITED; 20838 rack->r_ctl.rc_app_limited_cnt++; 20839 } 20840 if (log) 20841 rack_log_pacing_delay_calc(rack, 20842 rack->r_ctl.rc_app_limited_cnt, seq, 20843 tp->gput_ack, 0, 0, 4, __LINE__, NULL, 0); 20844 } 20845 } 20846 /* Check if we need to go into persists or not */ 20847 if ((tp->snd_max == tp->snd_una) && 20848 TCPS_HAVEESTABLISHED(tp->t_state) && 20849 sbavail(sb) && 20850 (sbavail(sb) > tp->snd_wnd) && 20851 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg))) { 20852 /* Yes lets make sure to move to persist before timer-start */ 20853 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, tp->snd_una); 20854 } 20855 rack_start_hpts_timer(rack, tp, cts, pacing_delay, tot_len_this_send, sup_rack); 20856 rack_log_type_just_return(rack, cts, tot_len_this_send, pacing_delay, hpts_calling, app_limited, cwnd_to_use); 20857 } 20858 #ifdef NETFLIX_SHARED_CWND 20859 if ((sbavail(sb) == 0) && 20860 rack->r_ctl.rc_scw) { 20861 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 20862 rack->rack_scwnd_is_idle = 1; 20863 } 20864 #endif 20865 #ifdef TCP_ACCOUNTING 20866 if (tot_len_this_send > 0) { 20867 crtsc = get_cyclecount(); 20868 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 20869 tp->tcp_cnt_counters[SND_OUT_DATA]++; 20870 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); 20871 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) / segsiz); 20872 } 20873 } else { 20874 crtsc = get_cyclecount(); 20875 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 20876 tp->tcp_cnt_counters[SND_LIMITED]++; 20877 tp->tcp_proc_time[SND_LIMITED] += (crtsc - ts_val); 20878 } 20879 } 20880 sched_unpin(); 20881 #endif 20882 return (0); 20883 20884 send: 20885 if ((rack->r_ctl.crte != NULL) && 20886 (rsm == NULL) && 20887 ((rack->rc_hw_nobuf == 1) || 20888 (rack_hw_check_queue && (check_done == 0)))) { 20889 /* 20890 * We only want to do this once with the hw_check_queue, 20891 * for the enobuf case we would only do it once if 20892 * we come around to again, the flag will be clear. 20893 */ 20894 check_done = 1; 20895 pacing_delay = rack_check_queue_level(rack, tp, &tv, cts, len, segsiz); 20896 if (pacing_delay) { 20897 rack->r_ctl.rc_agg_delayed = 0; 20898 rack->r_ctl.rc_agg_early = 0; 20899 rack->r_early = 0; 20900 rack->r_late = 0; 20901 SOCK_SENDBUF_UNLOCK(so); 20902 goto skip_all_send; 20903 } 20904 } 20905 if (rsm || sack_rxmit) 20906 counter_u64_add(rack_nfto_resend, 1); 20907 else 20908 counter_u64_add(rack_non_fto_send, 1); 20909 if ((flags & TH_FIN) && 20910 sbavail(sb)) { 20911 /* 20912 * We do not transmit a FIN 20913 * with data outstanding. We 20914 * need to make it so all data 20915 * is acked first. 20916 */ 20917 flags &= ~TH_FIN; 20918 if (TCPS_HAVEESTABLISHED(tp->t_state) && 20919 (sbused(sb) == (tp->snd_max - tp->snd_una)) && 20920 ((tp->snd_max - tp->snd_una) <= segsiz)) { 20921 /* 20922 * Ok less than or right at a MSS is 20923 * outstanding. The original FreeBSD stack would 20924 * have sent a FIN, which can speed things up for 20925 * a transactional application doing a MSG_WAITALL. 20926 * To speed things up since we do *not* send a FIN 20927 * if data is outstanding, we send a "challenge ack". 20928 * The idea behind that is instead of having to have 20929 * the peer wait for the delayed-ack timer to run off 20930 * we send an ack that makes the peer send us an ack. 20931 */ 20932 rack_send_ack_challange(rack); 20933 } 20934 } 20935 /* Enforce stack imposed max seg size if we have one */ 20936 if (pace_max_seg && 20937 (len > pace_max_seg)) { 20938 mark = 1; 20939 len = pace_max_seg; 20940 } 20941 if ((rsm == NULL) && 20942 (rack->pcm_in_progress == 0) && 20943 (rack->r_ctl.pcm_max_seg > 0) && 20944 (len >= rack->r_ctl.pcm_max_seg)) { 20945 /* It is large enough for a measurement */ 20946 add_flag |= RACK_IS_PCM; 20947 rack_log_pcm(rack, 5, len, rack->r_ctl.pcm_max_seg, add_flag); 20948 } else if (rack_verbose_logging) { 20949 rack_log_pcm(rack, 6, len, rack->r_ctl.pcm_max_seg, add_flag); 20950 } 20951 20952 SOCKBUF_LOCK_ASSERT(sb); 20953 if (len > 0) { 20954 if (len >= segsiz) 20955 tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; 20956 else 20957 tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; 20958 } 20959 /* 20960 * Before ESTABLISHED, force sending of initial options unless TCP 20961 * set not to do any options. NOTE: we assume that the IP/TCP header 20962 * plus TCP options always fit in a single mbuf, leaving room for a 20963 * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr) 20964 * + optlen <= MCLBYTES 20965 */ 20966 optlen = 0; 20967 #ifdef INET6 20968 if (isipv6) 20969 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 20970 else 20971 #endif 20972 hdrlen = sizeof(struct tcpiphdr); 20973 20974 /* 20975 * Ok what seq are we sending from. If we have 20976 * no rsm to use, then we look at various bits, 20977 * if we are putting out a SYN it will be ISS. 20978 * If we are retransmitting a FIN it will 20979 * be snd_max-1 else its snd_max. 20980 */ 20981 if (rsm == NULL) { 20982 if (flags & TH_SYN) 20983 rack_seq = tp->iss; 20984 else if ((flags & TH_FIN) && 20985 (tp->t_flags & TF_SENTFIN)) 20986 rack_seq = tp->snd_max - 1; 20987 else 20988 rack_seq = tp->snd_max; 20989 } else { 20990 rack_seq = rsm->r_start; 20991 } 20992 /* 20993 * Compute options for segment. We only have to care about SYN and 20994 * established connection segments. Options for SYN-ACK segments 20995 * are handled in TCP syncache. 20996 */ 20997 to.to_flags = 0; 20998 if ((tp->t_flags & TF_NOOPT) == 0) { 20999 /* Maximum segment size. */ 21000 if (flags & TH_SYN) { 21001 to.to_mss = tcp_mssopt(&inp->inp_inc); 21002 if (tp->t_port) 21003 to.to_mss -= V_tcp_udp_tunneling_overhead; 21004 to.to_flags |= TOF_MSS; 21005 21006 /* 21007 * On SYN or SYN|ACK transmits on TFO connections, 21008 * only include the TFO option if it is not a 21009 * retransmit, as the presence of the TFO option may 21010 * have caused the original SYN or SYN|ACK to have 21011 * been dropped by a middlebox. 21012 */ 21013 if ((tp->t_flags & TF_FASTOPEN) && 21014 (tp->t_rxtshift == 0)) { 21015 if (tp->t_state == TCPS_SYN_RECEIVED) { 21016 to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; 21017 to.to_tfo_cookie = 21018 (u_int8_t *)&tp->t_tfo_cookie.server; 21019 to.to_flags |= TOF_FASTOPEN; 21020 wanted_cookie = 1; 21021 } else if (tp->t_state == TCPS_SYN_SENT) { 21022 to.to_tfo_len = 21023 tp->t_tfo_client_cookie_len; 21024 to.to_tfo_cookie = 21025 tp->t_tfo_cookie.client; 21026 to.to_flags |= TOF_FASTOPEN; 21027 wanted_cookie = 1; 21028 /* 21029 * If we wind up having more data to 21030 * send with the SYN than can fit in 21031 * one segment, don't send any more 21032 * until the SYN|ACK comes back from 21033 * the other end. 21034 */ 21035 sendalot = 0; 21036 } 21037 } 21038 } 21039 /* Window scaling. */ 21040 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { 21041 to.to_wscale = tp->request_r_scale; 21042 to.to_flags |= TOF_SCALE; 21043 } 21044 /* Timestamps. */ 21045 if ((tp->t_flags & TF_RCVD_TSTMP) || 21046 ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { 21047 uint32_t ts_to_use; 21048 21049 if ((rack->r_rcvpath_rtt_up == 1) && 21050 (ms_cts == rack->r_ctl.last_rcv_tstmp_for_rtt)) { 21051 /* 21052 * When we are doing a rcv_rtt probe all 21053 * other timestamps use the next msec. This 21054 * is safe since our previous ack is in the 21055 * air and we will just have a few more 21056 * on the next ms. This assures that only 21057 * the one ack has the ms_cts that was on 21058 * our ack-probe. 21059 */ 21060 ts_to_use = ms_cts + 1; 21061 } else { 21062 ts_to_use = ms_cts; 21063 } 21064 to.to_tsval = ts_to_use + tp->ts_offset; 21065 to.to_tsecr = tp->ts_recent; 21066 to.to_flags |= TOF_TS; 21067 if ((len == 0) && 21068 (TCPS_HAVEESTABLISHED(tp->t_state)) && 21069 ((ms_cts - rack->r_ctl.last_rcv_tstmp_for_rtt) > RCV_PATH_RTT_MS) && 21070 (tp->snd_una == tp->snd_max) && 21071 (flags & TH_ACK) && 21072 (sbavail(sb) == 0) && 21073 (rack->r_ctl.current_round != 0) && 21074 ((flags & (TH_SYN|TH_FIN)) == 0) && 21075 (rack->r_rcvpath_rtt_up == 0)) { 21076 rack->r_ctl.last_rcv_tstmp_for_rtt = ms_cts; 21077 rack->r_ctl.last_time_of_arm_rcv = cts; 21078 rack->r_rcvpath_rtt_up = 1; 21079 /* Subtract 1 from seq to force a response */ 21080 rack_seq--; 21081 } 21082 } 21083 /* Set receive buffer autosizing timestamp. */ 21084 if (tp->rfbuf_ts == 0 && 21085 (so->so_rcv.sb_flags & SB_AUTOSIZE)) { 21086 tp->rfbuf_ts = ms_cts; 21087 } 21088 /* Selective ACK's. */ 21089 if (tp->t_flags & TF_SACK_PERMIT) { 21090 if (flags & TH_SYN) 21091 to.to_flags |= TOF_SACKPERM; 21092 else if (TCPS_HAVEESTABLISHED(tp->t_state) && 21093 tp->rcv_numsacks > 0) { 21094 to.to_flags |= TOF_SACK; 21095 to.to_nsacks = tp->rcv_numsacks; 21096 to.to_sacks = (u_char *)tp->sackblks; 21097 } 21098 } 21099 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 21100 /* TCP-MD5 (RFC2385). */ 21101 if (tp->t_flags & TF_SIGNATURE) 21102 to.to_flags |= TOF_SIGNATURE; 21103 #endif 21104 21105 /* Processing the options. */ 21106 hdrlen += optlen = tcp_addoptions(&to, opt); 21107 /* 21108 * If we wanted a TFO option to be added, but it was unable 21109 * to fit, ensure no data is sent. 21110 */ 21111 if ((tp->t_flags & TF_FASTOPEN) && wanted_cookie && 21112 !(to.to_flags & TOF_FASTOPEN)) 21113 len = 0; 21114 } 21115 if (tp->t_port) { 21116 if (V_tcp_udp_tunneling_port == 0) { 21117 /* The port was removed?? */ 21118 SOCK_SENDBUF_UNLOCK(so); 21119 #ifdef TCP_ACCOUNTING 21120 crtsc = get_cyclecount(); 21121 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 21122 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 21123 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 21124 } 21125 sched_unpin(); 21126 #endif 21127 return (EHOSTUNREACH); 21128 } 21129 hdrlen += sizeof(struct udphdr); 21130 } 21131 #ifdef INET6 21132 if (isipv6) 21133 ipoptlen = ip6_optlen(inp); 21134 else 21135 #endif 21136 if (inp->inp_options) 21137 ipoptlen = inp->inp_options->m_len - 21138 offsetof(struct ipoption, ipopt_list); 21139 else 21140 ipoptlen = 0; 21141 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 21142 ipoptlen += ipsec_optlen; 21143 #endif 21144 21145 /* 21146 * Adjust data length if insertion of options will bump the packet 21147 * length beyond the t_maxseg length. Clear the FIN bit because we 21148 * cut off the tail of the segment. 21149 */ 21150 if (len + optlen + ipoptlen > tp->t_maxseg) { 21151 if (tso) { 21152 uint32_t if_hw_tsomax; 21153 uint32_t moff; 21154 int32_t max_len; 21155 21156 /* extract TSO information */ 21157 if_hw_tsomax = tp->t_tsomax; 21158 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 21159 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 21160 KASSERT(ipoptlen == 0, 21161 ("%s: TSO can't do IP options", __func__)); 21162 21163 /* 21164 * Check if we should limit by maximum payload 21165 * length: 21166 */ 21167 if (if_hw_tsomax != 0) { 21168 /* compute maximum TSO length */ 21169 max_len = (if_hw_tsomax - hdrlen - 21170 max_linkhdr); 21171 if (max_len <= 0) { 21172 len = 0; 21173 } else if (len > max_len) { 21174 if (doing_tlp == 0) 21175 sendalot = 1; 21176 len = max_len; 21177 mark = 2; 21178 } 21179 } 21180 /* 21181 * Prevent the last segment from being fractional 21182 * unless the send sockbuf can be emptied: 21183 */ 21184 max_len = (tp->t_maxseg - optlen); 21185 if ((sb_offset + len) < sbavail(sb)) { 21186 moff = len % (u_int)max_len; 21187 if (moff != 0) { 21188 mark = 3; 21189 len -= moff; 21190 } 21191 } 21192 /* 21193 * In case there are too many small fragments don't 21194 * use TSO: 21195 */ 21196 if (len <= max_len) { 21197 mark = 4; 21198 tso = 0; 21199 } 21200 /* 21201 * Send the FIN in a separate segment after the bulk 21202 * sending is done. We don't trust the TSO 21203 * implementations to clear the FIN flag on all but 21204 * the last segment. 21205 */ 21206 if (tp->t_flags & TF_NEEDFIN) { 21207 sendalot = 4; 21208 } 21209 } else { 21210 mark = 5; 21211 if (optlen + ipoptlen >= tp->t_maxseg) { 21212 /* 21213 * Since we don't have enough space to put 21214 * the IP header chain and the TCP header in 21215 * one packet as required by RFC 7112, don't 21216 * send it. Also ensure that at least one 21217 * byte of the payload can be put into the 21218 * TCP segment. 21219 */ 21220 SOCK_SENDBUF_UNLOCK(so); 21221 error = EMSGSIZE; 21222 sack_rxmit = 0; 21223 goto out; 21224 } 21225 len = tp->t_maxseg - optlen - ipoptlen; 21226 sendalot = 5; 21227 } 21228 } else { 21229 tso = 0; 21230 mark = 6; 21231 } 21232 KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, 21233 ("%s: len > IP_MAXPACKET", __func__)); 21234 #ifdef DIAGNOSTIC 21235 #ifdef INET6 21236 if (max_linkhdr + hdrlen > MCLBYTES) 21237 #else 21238 if (max_linkhdr + hdrlen > MHLEN) 21239 #endif 21240 panic("tcphdr too big"); 21241 #endif 21242 21243 /* 21244 * This KASSERT is here to catch edge cases at a well defined place. 21245 * Before, those had triggered (random) panic conditions further 21246 * down. 21247 */ 21248 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 21249 if ((len == 0) && 21250 (flags & TH_FIN) && 21251 (sbused(sb))) { 21252 /* 21253 * We have outstanding data, don't send a fin by itself!. 21254 * 21255 * Check to see if we need to send a challenge ack. 21256 */ 21257 if ((sbused(sb) == (tp->snd_max - tp->snd_una)) && 21258 ((tp->snd_max - tp->snd_una) <= segsiz)) { 21259 /* 21260 * Ok less than or right at a MSS is 21261 * outstanding. The original FreeBSD stack would 21262 * have sent a FIN, which can speed things up for 21263 * a transactional application doing a MSG_WAITALL. 21264 * To speed things up since we do *not* send a FIN 21265 * if data is outstanding, we send a "challenge ack". 21266 * The idea behind that is instead of having to have 21267 * the peer wait for the delayed-ack timer to run off 21268 * we send an ack that makes the peer send us an ack. 21269 */ 21270 rack_send_ack_challange(rack); 21271 } 21272 goto just_return; 21273 } 21274 /* 21275 * Grab a header mbuf, attaching a copy of data to be transmitted, 21276 * and initialize the header from the template for sends on this 21277 * connection. 21278 */ 21279 hw_tls = tp->t_nic_ktls_xmit != 0; 21280 if (len) { 21281 uint32_t max_val; 21282 uint32_t moff; 21283 21284 if (pace_max_seg) 21285 max_val = pace_max_seg; 21286 else 21287 max_val = len; 21288 /* 21289 * We allow a limit on sending with hptsi. 21290 */ 21291 if (len > max_val) { 21292 mark = 7; 21293 len = max_val; 21294 } 21295 #ifdef INET6 21296 if (MHLEN < hdrlen + max_linkhdr) 21297 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 21298 else 21299 #endif 21300 m = m_gethdr(M_NOWAIT, MT_DATA); 21301 21302 if (m == NULL) { 21303 SOCK_SENDBUF_UNLOCK(so); 21304 error = ENOBUFS; 21305 sack_rxmit = 0; 21306 goto out; 21307 } 21308 m->m_data += max_linkhdr; 21309 m->m_len = hdrlen; 21310 21311 /* 21312 * Start the m_copy functions from the closest mbuf to the 21313 * sb_offset in the socket buffer chain. 21314 */ 21315 mb = sbsndptr_noadv(sb, sb_offset, &moff); 21316 s_mb = mb; 21317 s_moff = moff; 21318 if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) { 21319 m_copydata(mb, moff, (int)len, 21320 mtod(m, caddr_t)+hdrlen); 21321 /* 21322 * If we are not retransmitting advance the 21323 * sndptr to help remember the next place in 21324 * the sb. 21325 */ 21326 if (rsm == NULL) 21327 sbsndptr_adv(sb, mb, len); 21328 m->m_len += len; 21329 } else { 21330 struct sockbuf *msb; 21331 21332 /* 21333 * If we are not retransmitting pass in msb so 21334 * the socket buffer can be advanced. Otherwise 21335 * set it to NULL if its a retransmission since 21336 * we don't want to change the sb remembered 21337 * location. 21338 */ 21339 if (rsm == NULL) 21340 msb = sb; 21341 else 21342 msb = NULL; 21343 m->m_next = tcp_m_copym( 21344 mb, moff, &len, 21345 if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, 21346 ((rsm == NULL) ? hw_tls : 0)); 21347 if (len <= (tp->t_maxseg - optlen)) { 21348 /* 21349 * Must have ran out of mbufs for the copy 21350 * shorten it to no longer need tso. Lets 21351 * not put on sendalot since we are low on 21352 * mbufs. 21353 */ 21354 tso = 0; 21355 } 21356 if (m->m_next == NULL) { 21357 SOCK_SENDBUF_UNLOCK(so); 21358 (void)m_free(m); 21359 error = ENOBUFS; 21360 sack_rxmit = 0; 21361 goto out; 21362 } 21363 } 21364 if (sack_rxmit) { 21365 if (rsm && (rsm->r_flags & RACK_TLP)) { 21366 /* 21367 * TLP should not count in retran count, but 21368 * in its own bin 21369 */ 21370 counter_u64_add(rack_tlp_retran, 1); 21371 counter_u64_add(rack_tlp_retran_bytes, len); 21372 } else { 21373 tp->t_sndrexmitpack++; 21374 KMOD_TCPSTAT_INC(tcps_sndrexmitpack); 21375 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len); 21376 } 21377 #ifdef STATS 21378 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 21379 len); 21380 #endif 21381 } else { 21382 KMOD_TCPSTAT_INC(tcps_sndpack); 21383 KMOD_TCPSTAT_ADD(tcps_sndbyte, len); 21384 #ifdef STATS 21385 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 21386 len); 21387 #endif 21388 } 21389 /* 21390 * If we're sending everything we've got, set PUSH. (This 21391 * will keep happy those implementations which only give 21392 * data to the user when a buffer fills or a PUSH comes in.) 21393 */ 21394 if (sb_offset + len == sbused(sb) && 21395 sbused(sb) && 21396 !(flags & TH_SYN)) { 21397 flags |= TH_PUSH; 21398 add_flag |= RACK_HAD_PUSH; 21399 } 21400 SOCK_SENDBUF_UNLOCK(so); 21401 } else { 21402 SOCK_SENDBUF_UNLOCK(so); 21403 if (tp->t_flags & TF_ACKNOW) 21404 KMOD_TCPSTAT_INC(tcps_sndacks); 21405 else if (flags & (TH_SYN | TH_FIN | TH_RST)) 21406 KMOD_TCPSTAT_INC(tcps_sndctrl); 21407 else 21408 KMOD_TCPSTAT_INC(tcps_sndwinup); 21409 21410 m = m_gethdr(M_NOWAIT, MT_DATA); 21411 if (m == NULL) { 21412 error = ENOBUFS; 21413 sack_rxmit = 0; 21414 goto out; 21415 } 21416 #ifdef INET6 21417 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && 21418 MHLEN >= hdrlen) { 21419 M_ALIGN(m, hdrlen); 21420 } else 21421 #endif 21422 m->m_data += max_linkhdr; 21423 m->m_len = hdrlen; 21424 } 21425 SOCK_SENDBUF_UNLOCK_ASSERT(so); 21426 m->m_pkthdr.rcvif = (struct ifnet *)0; 21427 #ifdef MAC 21428 mac_inpcb_create_mbuf(inp, m); 21429 #endif 21430 if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) { 21431 #ifdef INET6 21432 if (isipv6) 21433 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 21434 else 21435 #endif /* INET6 */ 21436 #ifdef INET 21437 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 21438 #endif 21439 th = rack->r_ctl.fsb.th; 21440 udp = rack->r_ctl.fsb.udp; 21441 if (udp) { 21442 #ifdef INET6 21443 if (isipv6) 21444 ulen = hdrlen + len - sizeof(struct ip6_hdr); 21445 else 21446 #endif /* INET6 */ 21447 ulen = hdrlen + len - sizeof(struct ip); 21448 udp->uh_ulen = htons(ulen); 21449 } 21450 } else { 21451 #ifdef INET6 21452 if (isipv6) { 21453 ip6 = mtod(m, struct ip6_hdr *); 21454 if (tp->t_port) { 21455 udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr)); 21456 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 21457 udp->uh_dport = tp->t_port; 21458 ulen = hdrlen + len - sizeof(struct ip6_hdr); 21459 udp->uh_ulen = htons(ulen); 21460 th = (struct tcphdr *)(udp + 1); 21461 } else 21462 th = (struct tcphdr *)(ip6 + 1); 21463 tcpip_fillheaders(inp, tp->t_port, ip6, th); 21464 } else 21465 #endif /* INET6 */ 21466 { 21467 #ifdef INET 21468 ip = mtod(m, struct ip *); 21469 if (tp->t_port) { 21470 udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip)); 21471 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 21472 udp->uh_dport = tp->t_port; 21473 ulen = hdrlen + len - sizeof(struct ip); 21474 udp->uh_ulen = htons(ulen); 21475 th = (struct tcphdr *)(udp + 1); 21476 } else 21477 th = (struct tcphdr *)(ip + 1); 21478 tcpip_fillheaders(inp, tp->t_port, ip, th); 21479 #endif 21480 } 21481 } 21482 /* 21483 * If we are starting a connection, send ECN setup SYN packet. If we 21484 * are on a retransmit, we may resend those bits a number of times 21485 * as per RFC 3168. 21486 */ 21487 if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) { 21488 flags |= tcp_ecn_output_syn_sent(tp); 21489 } 21490 /* Also handle parallel SYN for ECN */ 21491 if (TCPS_HAVERCVDSYN(tp->t_state) && 21492 (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { 21493 int ect = tcp_ecn_output_established(tp, &flags, len, sack_rxmit); 21494 if ((tp->t_state == TCPS_SYN_RECEIVED) && 21495 (tp->t_flags2 & TF2_ECN_SND_ECE)) 21496 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 21497 #ifdef INET6 21498 if (isipv6) { 21499 ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20); 21500 ip6->ip6_flow |= htonl(ect << 20); 21501 } 21502 else 21503 #endif 21504 { 21505 #ifdef INET 21506 ip->ip_tos &= ~IPTOS_ECN_MASK; 21507 ip->ip_tos |= ect; 21508 #endif 21509 } 21510 } 21511 th->th_seq = htonl(rack_seq); 21512 th->th_ack = htonl(tp->rcv_nxt); 21513 tcp_set_flags(th, flags); 21514 /* 21515 * Calculate receive window. Don't shrink window, but avoid silly 21516 * window syndrome. 21517 * If a RST segment is sent, advertise a window of zero. 21518 */ 21519 if (flags & TH_RST) { 21520 recwin = 0; 21521 } else { 21522 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && 21523 recwin < (long)segsiz) { 21524 recwin = 0; 21525 } 21526 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && 21527 recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) 21528 recwin = (long)(tp->rcv_adv - tp->rcv_nxt); 21529 } 21530 21531 /* 21532 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or 21533 * <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> case is 21534 * handled in syncache. 21535 */ 21536 if (flags & TH_SYN) 21537 th->th_win = htons((u_short) 21538 (min(sbspace(&so->so_rcv), TCP_MAXWIN))); 21539 else { 21540 /* Avoid shrinking window with window scaling. */ 21541 recwin = roundup2(recwin, 1 << tp->rcv_scale); 21542 th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); 21543 } 21544 /* 21545 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 21546 * window. This may cause the remote transmitter to stall. This 21547 * flag tells soreceive() to disable delayed acknowledgements when 21548 * draining the buffer. This can occur if the receiver is 21549 * attempting to read more data than can be buffered prior to 21550 * transmitting on the connection. 21551 */ 21552 if (th->th_win == 0) { 21553 tp->t_sndzerowin++; 21554 tp->t_flags |= TF_RXWIN0SENT; 21555 } else 21556 tp->t_flags &= ~TF_RXWIN0SENT; 21557 tp->snd_up = tp->snd_una; /* drag it along, its deprecated */ 21558 /* Now are we using fsb?, if so copy the template data to the mbuf */ 21559 if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) { 21560 uint8_t *cpto; 21561 21562 cpto = mtod(m, uint8_t *); 21563 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); 21564 /* 21565 * We have just copied in: 21566 * IP/IP6 21567 * <optional udphdr> 21568 * tcphdr (no options) 21569 * 21570 * We need to grab the correct pointers into the mbuf 21571 * for both the tcp header, and possibly the udp header (if tunneling). 21572 * We do this by using the offset in the copy buffer and adding it 21573 * to the mbuf base pointer (cpto). 21574 */ 21575 #ifdef INET6 21576 if (isipv6) 21577 ip6 = mtod(m, struct ip6_hdr *); 21578 else 21579 #endif /* INET6 */ 21580 #ifdef INET 21581 ip = mtod(m, struct ip *); 21582 #endif 21583 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr)); 21584 /* If we have a udp header lets set it into the mbuf as well */ 21585 if (udp) 21586 udp = (struct udphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.udp - rack->r_ctl.fsb.tcp_ip_hdr)); 21587 } 21588 if (optlen) { 21589 bcopy(opt, th + 1, optlen); 21590 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 21591 } 21592 /* 21593 * Put TCP length in extended header, and then checksum extended 21594 * header and data. 21595 */ 21596 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 21597 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 21598 if (to.to_flags & TOF_SIGNATURE) { 21599 /* 21600 * Calculate MD5 signature and put it into the place 21601 * determined before. 21602 * NOTE: since TCP options buffer doesn't point into 21603 * mbuf's data, calculate offset and use it. 21604 */ 21605 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, 21606 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { 21607 /* 21608 * Do not send segment if the calculation of MD5 21609 * digest has failed. 21610 */ 21611 goto out; 21612 } 21613 } 21614 #endif 21615 #ifdef INET6 21616 if (isipv6) { 21617 /* 21618 * ip6_plen is not need to be filled now, and will be filled 21619 * in ip6_output. 21620 */ 21621 if (tp->t_port) { 21622 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 21623 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 21624 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 21625 th->th_sum = htons(0); 21626 UDPSTAT_INC(udps_opackets); 21627 } else { 21628 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 21629 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 21630 th->th_sum = in6_cksum_pseudo(ip6, 21631 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 21632 0); 21633 } 21634 } 21635 #endif 21636 #if defined(INET6) && defined(INET) 21637 else 21638 #endif 21639 #ifdef INET 21640 { 21641 if (tp->t_port) { 21642 m->m_pkthdr.csum_flags = CSUM_UDP; 21643 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 21644 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 21645 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 21646 th->th_sum = htons(0); 21647 UDPSTAT_INC(udps_opackets); 21648 } else { 21649 m->m_pkthdr.csum_flags = CSUM_TCP; 21650 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 21651 th->th_sum = in_pseudo(ip->ip_src.s_addr, 21652 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 21653 IPPROTO_TCP + len + optlen)); 21654 } 21655 /* IP version must be set here for ipv4/ipv6 checking later */ 21656 KASSERT(ip->ip_v == IPVERSION, 21657 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 21658 } 21659 #endif 21660 /* 21661 * Enable TSO and specify the size of the segments. The TCP pseudo 21662 * header checksum is always provided. XXX: Fixme: This is currently 21663 * not the case for IPv6. 21664 */ 21665 if (tso) { 21666 /* 21667 * Here we must use t_maxseg and the optlen since 21668 * the optlen may include SACK's (or DSACK). 21669 */ 21670 KASSERT(len > tp->t_maxseg - optlen, 21671 ("%s: len <= tso_segsz", __func__)); 21672 m->m_pkthdr.csum_flags |= CSUM_TSO; 21673 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; 21674 } 21675 KASSERT(len + hdrlen == m_length(m, NULL), 21676 ("%s: mbuf chain different than expected: %d + %u != %u", 21677 __func__, len, hdrlen, m_length(m, NULL))); 21678 21679 #ifdef TCP_HHOOK 21680 /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ 21681 hhook_run_tcp_est_out(tp, th, &to, len, tso); 21682 #endif 21683 if ((rack->r_ctl.crte != NULL) && 21684 (rack->rc_hw_nobuf == 0) && 21685 tcp_bblogging_on(tp)) { 21686 rack_log_queue_level(tp, rack, len, &tv, cts); 21687 } 21688 /* We're getting ready to send; log now. */ 21689 if (tcp_bblogging_on(rack->rc_tp)) { 21690 union tcp_log_stackspecific log; 21691 21692 memset(&log, 0, sizeof(log)); 21693 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 21694 if (rack->rack_no_prr) 21695 log.u_bbr.flex1 = 0; 21696 else 21697 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 21698 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 21699 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 21700 log.u_bbr.flex4 = orig_len; 21701 /* Save off the early/late values */ 21702 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 21703 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 21704 log.u_bbr.bw_inuse = rack_get_bw(rack); 21705 log.u_bbr.cur_del_rate = rack->r_ctl.gp_bw; 21706 log.u_bbr.flex8 = 0; 21707 if (rsm) { 21708 if (rsm->r_flags & RACK_RWND_COLLAPSED) { 21709 rack_log_collapse(rack, rsm->r_start, rsm->r_end, 0, __LINE__, 5, rsm->r_flags, rsm); 21710 counter_u64_add(rack_collapsed_win_rxt, 1); 21711 counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start)); 21712 } 21713 if (doing_tlp) 21714 log.u_bbr.flex8 = 2; 21715 else 21716 log.u_bbr.flex8 = 1; 21717 } else { 21718 if (doing_tlp) 21719 log.u_bbr.flex8 = 3; 21720 } 21721 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); 21722 log.u_bbr.flex7 = mark; 21723 log.u_bbr.flex7 <<= 8; 21724 log.u_bbr.flex7 |= pass; 21725 log.u_bbr.pkts_out = tp->t_maxseg; 21726 log.u_bbr.timeStamp = cts; 21727 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 21728 if (rsm && (rsm->r_rtr_cnt > 0)) { 21729 /* 21730 * When we have a retransmit we want to log the 21731 * burst at send and flight at send from before. 21732 */ 21733 log.u_bbr.flex5 = rsm->r_fas; 21734 log.u_bbr.bbr_substate = rsm->r_bas; 21735 } else { 21736 /* 21737 * New transmits we log in flex5 the inflight again as 21738 * well as the number of segments in our send in the 21739 * substate field. 21740 */ 21741 log.u_bbr.flex5 = log.u_bbr.inflight; 21742 log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz); 21743 } 21744 log.u_bbr.lt_epoch = cwnd_to_use; 21745 log.u_bbr.delivered = sendalot; 21746 log.u_bbr.rttProp = (uintptr_t)rsm; 21747 log.u_bbr.pkt_epoch = __LINE__; 21748 if (rsm) { 21749 log.u_bbr.delRate = rsm->r_flags; 21750 log.u_bbr.delRate <<= 31; 21751 log.u_bbr.delRate |= rack->r_must_retran; 21752 log.u_bbr.delRate <<= 1; 21753 log.u_bbr.delRate |= (sack_rxmit & 0x00000001); 21754 } else { 21755 log.u_bbr.delRate = rack->r_must_retran; 21756 log.u_bbr.delRate <<= 1; 21757 log.u_bbr.delRate |= (sack_rxmit & 0x00000001); 21758 } 21759 lgb = tcp_log_event(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, 21760 len, &log, false, NULL, __func__, __LINE__, &tv); 21761 } else 21762 lgb = NULL; 21763 21764 /* 21765 * Fill in IP length and desired time to live and send to IP level. 21766 * There should be a better way to handle ttl and tos; we could keep 21767 * them in the template, but need a way to checksum without them. 21768 */ 21769 /* 21770 * m->m_pkthdr.len should have been set before cksum calcuration, 21771 * because in6_cksum() need it. 21772 */ 21773 #ifdef INET6 21774 if (isipv6) { 21775 /* 21776 * we separately set hoplimit for every segment, since the 21777 * user might want to change the value via setsockopt. Also, 21778 * desired default hop limit might be changed via Neighbor 21779 * Discovery. 21780 */ 21781 rack->r_ctl.fsb.hoplimit = ip6->ip6_hlim = in6_selecthlim(inp, NULL); 21782 21783 /* 21784 * Set the packet size here for the benefit of DTrace 21785 * probes. ip6_output() will set it properly; it's supposed 21786 * to include the option header lengths as well. 21787 */ 21788 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 21789 21790 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 21791 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 21792 else 21793 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 21794 21795 if (tp->t_state == TCPS_SYN_SENT) 21796 TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); 21797 21798 TCP_PROBE5(send, NULL, tp, ip6, tp, th); 21799 /* TODO: IPv6 IP6TOS_ECT bit on */ 21800 error = ip6_output(m, 21801 inp->in6p_outputopts, 21802 &inp->inp_route6, 21803 ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 21804 NULL, NULL, inp); 21805 21806 if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL) 21807 mtu = inp->inp_route6.ro_nh->nh_mtu; 21808 } 21809 #endif /* INET6 */ 21810 #if defined(INET) && defined(INET6) 21811 else 21812 #endif 21813 #ifdef INET 21814 { 21815 ip->ip_len = htons(m->m_pkthdr.len); 21816 #ifdef INET6 21817 if (inp->inp_vflag & INP_IPV6PROTO) 21818 ip->ip_ttl = in6_selecthlim(inp, NULL); 21819 #endif /* INET6 */ 21820 rack->r_ctl.fsb.hoplimit = ip->ip_ttl; 21821 /* 21822 * If we do path MTU discovery, then we set DF on every 21823 * packet. This might not be the best thing to do according 21824 * to RFC3390 Section 2. However the tcp hostcache migitates 21825 * the problem so it affects only the first tcp connection 21826 * with a host. 21827 * 21828 * NB: Don't set DF on small MTU/MSS to have a safe 21829 * fallback. 21830 */ 21831 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 21832 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 21833 if (tp->t_port == 0 || len < V_tcp_minmss) { 21834 ip->ip_off |= htons(IP_DF); 21835 } 21836 } else { 21837 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 21838 } 21839 21840 if (tp->t_state == TCPS_SYN_SENT) 21841 TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); 21842 21843 TCP_PROBE5(send, NULL, tp, ip, tp, th); 21844 21845 error = ip_output(m, 21846 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 21847 inp->inp_options, 21848 #else 21849 NULL, 21850 #endif 21851 &inp->inp_route, 21852 ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0, 21853 inp); 21854 if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL) 21855 mtu = inp->inp_route.ro_nh->nh_mtu; 21856 } 21857 #endif /* INET */ 21858 if (lgb) { 21859 lgb->tlb_errno = error; 21860 lgb = NULL; 21861 } 21862 21863 out: 21864 /* 21865 * In transmit state, time the transmission and arrange for the 21866 * retransmit. In persist state, just set snd_max. 21867 */ 21868 if ((rsm == NULL) && doing_tlp) 21869 add_flag |= RACK_TLP; 21870 rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, 21871 rack_to_usec_ts(&tv), 21872 rsm, add_flag, s_mb, s_moff, hw_tls, segsiz); 21873 if (error == 0) { 21874 if (add_flag & RACK_IS_PCM) { 21875 /* We just launched a PCM */ 21876 /* rrs here log */ 21877 rack->pcm_in_progress = 1; 21878 rack->pcm_needed = 0; 21879 rack_log_pcm(rack, 7, len, rack->r_ctl.pcm_max_seg, add_flag); 21880 } 21881 if (rsm == NULL) { 21882 if (rack->lt_bw_up == 0) { 21883 rack->r_ctl.lt_timemark = tcp_tv_to_lusec(&tv); 21884 rack->r_ctl.lt_seq = tp->snd_una; 21885 rack->lt_bw_up = 1; 21886 } else if (((rack_seq + len) - rack->r_ctl.lt_seq) > 0x7fffffff) { 21887 /* 21888 * Need to record what we have since we are 21889 * approaching seq wrap. 21890 */ 21891 uint64_t tmark; 21892 21893 rack->r_ctl.lt_bw_bytes += (tp->snd_una - rack->r_ctl.lt_seq); 21894 rack->r_ctl.lt_seq = tp->snd_una; 21895 tmark = tcp_get_u64_usecs(&tv); 21896 if (tmark > rack->r_ctl.lt_timemark) { 21897 rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark); 21898 rack->r_ctl.lt_timemark = tmark; 21899 } 21900 } 21901 } 21902 rack->forced_ack = 0; /* If we send something zap the FA flag */ 21903 counter_u64_add(rack_total_bytes, len); 21904 tcp_account_for_send(tp, len, (rsm != NULL), doing_tlp, hw_tls); 21905 if (rsm && doing_tlp) { 21906 rack->rc_last_sent_tlp_past_cumack = 0; 21907 rack->rc_last_sent_tlp_seq_valid = 1; 21908 rack->r_ctl.last_sent_tlp_seq = rsm->r_start; 21909 rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start; 21910 } 21911 if (rack->rc_hw_nobuf) { 21912 rack->rc_hw_nobuf = 0; 21913 rack->r_ctl.rc_agg_delayed = 0; 21914 rack->r_early = 0; 21915 rack->r_late = 0; 21916 rack->r_ctl.rc_agg_early = 0; 21917 } 21918 if (rsm && (doing_tlp == 0)) { 21919 /* Set we retransmitted */ 21920 rack->rc_gp_saw_rec = 1; 21921 } else { 21922 if (cwnd_to_use > tp->snd_ssthresh) { 21923 /* Set we sent in CA */ 21924 rack->rc_gp_saw_ca = 1; 21925 } else { 21926 /* Set we sent in SS */ 21927 rack->rc_gp_saw_ss = 1; 21928 } 21929 } 21930 if (TCPS_HAVEESTABLISHED(tp->t_state) && 21931 (tp->t_flags & TF_SACK_PERMIT) && 21932 tp->rcv_numsacks > 0) 21933 tcp_clean_dsack_blocks(tp); 21934 tot_len_this_send += len; 21935 if (len == 0) { 21936 counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); 21937 } else { 21938 int idx; 21939 21940 idx = (len / segsiz) + 3; 21941 if (idx >= TCP_MSS_ACCT_ATIMER) 21942 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 21943 else 21944 counter_u64_add(rack_out_size[idx], 1); 21945 } 21946 } 21947 if ((rack->rack_no_prr == 0) && 21948 sub_from_prr && 21949 (error == 0)) { 21950 if (rack->r_ctl.rc_prr_sndcnt >= len) 21951 rack->r_ctl.rc_prr_sndcnt -= len; 21952 else 21953 rack->r_ctl.rc_prr_sndcnt = 0; 21954 } 21955 sub_from_prr = 0; 21956 if (rsm != NULL) { 21957 if (doing_tlp) 21958 /* Make sure the TLP is added */ 21959 rsm->r_flags |= RACK_TLP; 21960 else 21961 /* If its a resend without TLP then it must not have the flag */ 21962 rsm->r_flags &= ~RACK_TLP; 21963 } 21964 if ((error == 0) && 21965 (len > 0) && 21966 (tp->snd_una == tp->snd_max)) 21967 rack->r_ctl.rc_tlp_rxt_last_time = cts; 21968 21969 { 21970 /* 21971 * This block is not associated with the above error == 0 test. 21972 * It is used to advance snd_max if we have a new transmit. 21973 */ 21974 tcp_seq startseq = tp->snd_max; 21975 21976 21977 if (rsm && (doing_tlp == 0)) 21978 rack->r_ctl.rc_loss_count += rsm->r_end - rsm->r_start; 21979 if (error) 21980 /* We don't log or do anything with errors */ 21981 goto nomore; 21982 if (doing_tlp == 0) { 21983 if (rsm == NULL) { 21984 /* 21985 * Not a retransmission of some 21986 * sort, new data is going out so 21987 * clear our TLP count and flag. 21988 */ 21989 rack->rc_tlp_in_progress = 0; 21990 rack->r_ctl.rc_tlp_cnt_out = 0; 21991 } 21992 } else { 21993 /* 21994 * We have just sent a TLP, mark that it is true 21995 * and make sure our in progress is set so we 21996 * continue to check the count. 21997 */ 21998 rack->rc_tlp_in_progress = 1; 21999 rack->r_ctl.rc_tlp_cnt_out++; 22000 } 22001 /* 22002 * If we are retransmitting we are done, snd_max 22003 * does not get updated. 22004 */ 22005 if (sack_rxmit) 22006 goto nomore; 22007 if ((tp->snd_una == tp->snd_max) && (len > 0)) { 22008 /* 22009 * Update the time we just added data since 22010 * nothing was outstanding. 22011 */ 22012 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 22013 tp->t_acktime = ticks; 22014 } 22015 /* 22016 * Now for special SYN/FIN handling. 22017 */ 22018 if (flags & (TH_SYN | TH_FIN)) { 22019 if ((flags & TH_SYN) && 22020 ((tp->t_flags & TF_SENTSYN) == 0)) { 22021 tp->snd_max++; 22022 tp->t_flags |= TF_SENTSYN; 22023 } 22024 if ((flags & TH_FIN) && 22025 ((tp->t_flags & TF_SENTFIN) == 0)) { 22026 tp->snd_max++; 22027 tp->t_flags |= TF_SENTFIN; 22028 } 22029 } 22030 tp->snd_max += len; 22031 if (rack->rc_new_rnd_needed) { 22032 rack_new_round_starts(tp, rack, tp->snd_max); 22033 } 22034 /* 22035 * Time this transmission if not a retransmission and 22036 * not currently timing anything. 22037 * This is only relevant in case of switching back to 22038 * the base stack. 22039 */ 22040 if (tp->t_rtttime == 0) { 22041 tp->t_rtttime = ticks; 22042 tp->t_rtseq = startseq; 22043 KMOD_TCPSTAT_INC(tcps_segstimed); 22044 } 22045 if (len && 22046 ((tp->t_flags & TF_GPUTINPROG) == 0)) 22047 rack_start_gp_measurement(tp, rack, startseq, sb_offset); 22048 /* 22049 * If we are doing FO we need to update the mbuf position and subtract 22050 * this happens when the peer sends us duplicate information and 22051 * we thus want to send a DSACK. 22052 * 22053 * XXXRRS: This brings to mind a ?, when we send a DSACK block is TSO 22054 * turned off? If not then we are going to echo multiple DSACK blocks 22055 * out (with the TSO), which we should not be doing. 22056 */ 22057 if (rack->r_fast_output && len) { 22058 if (rack->r_ctl.fsb.left_to_send > len) 22059 rack->r_ctl.fsb.left_to_send -= len; 22060 else 22061 rack->r_ctl.fsb.left_to_send = 0; 22062 if (rack->r_ctl.fsb.left_to_send < segsiz) 22063 rack->r_fast_output = 0; 22064 if (rack->r_fast_output) { 22065 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); 22066 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; 22067 rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(rack->r_ctl.fsb.m); 22068 } 22069 } 22070 if (rack_pcm_blast == 0) { 22071 if ((orig_len > len) && 22072 (add_flag & RACK_IS_PCM) && 22073 (len < pace_max_seg) && 22074 ((pace_max_seg - len) > segsiz)) { 22075 /* 22076 * We are doing a PCM measurement and we did 22077 * not get enough data in the TSO to meet the 22078 * burst requirement. 22079 */ 22080 uint32_t n_len; 22081 22082 n_len = (orig_len - len); 22083 orig_len -= len; 22084 pace_max_seg -= len; 22085 len = n_len; 22086 sb_offset = tp->snd_max - tp->snd_una; 22087 /* Re-lock for the next spin */ 22088 SOCK_SENDBUF_LOCK(so); 22089 goto send; 22090 } 22091 } else { 22092 if ((orig_len > len) && 22093 (add_flag & RACK_IS_PCM) && 22094 ((orig_len - len) > segsiz)) { 22095 /* 22096 * We are doing a PCM measurement and we did 22097 * not get enough data in the TSO to meet the 22098 * burst requirement. 22099 */ 22100 uint32_t n_len; 22101 22102 n_len = (orig_len - len); 22103 orig_len -= len; 22104 len = n_len; 22105 sb_offset = tp->snd_max - tp->snd_una; 22106 /* Re-lock for the next spin */ 22107 SOCK_SENDBUF_LOCK(so); 22108 goto send; 22109 } 22110 } 22111 } 22112 nomore: 22113 if (error) { 22114 rack->r_ctl.rc_agg_delayed = 0; 22115 rack->r_early = 0; 22116 rack->r_late = 0; 22117 rack->r_ctl.rc_agg_early = 0; 22118 SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */ 22119 /* 22120 * Failures do not advance the seq counter above. For the 22121 * case of ENOBUFS we will fall out and retry in 1ms with 22122 * the hpts. Everything else will just have to retransmit 22123 * with the timer. 22124 * 22125 * In any case, we do not want to loop around for another 22126 * send without a good reason. 22127 */ 22128 sendalot = 0; 22129 switch (error) { 22130 case EPERM: 22131 case EACCES: 22132 tp->t_softerror = error; 22133 #ifdef TCP_ACCOUNTING 22134 crtsc = get_cyclecount(); 22135 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22136 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 22137 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 22138 } 22139 sched_unpin(); 22140 #endif 22141 return (error); 22142 case ENOBUFS: 22143 /* 22144 * Pace us right away to retry in a some 22145 * time 22146 */ 22147 if (rack->r_ctl.crte != NULL) { 22148 tcp_trace_point(rack->rc_tp, TCP_TP_HWENOBUF); 22149 if (tcp_bblogging_on(rack->rc_tp)) 22150 rack_log_queue_level(tp, rack, len, &tv, cts); 22151 } else 22152 tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF); 22153 pacing_delay = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); 22154 if (rack->rc_enobuf < 0x7f) 22155 rack->rc_enobuf++; 22156 if (pacing_delay < (10 * HPTS_USEC_IN_MSEC)) 22157 pacing_delay = 10 * HPTS_USEC_IN_MSEC; 22158 if (rack->r_ctl.crte != NULL) { 22159 counter_u64_add(rack_saw_enobuf_hw, 1); 22160 tcp_rl_log_enobuf(rack->r_ctl.crte); 22161 } 22162 counter_u64_add(rack_saw_enobuf, 1); 22163 goto enobufs; 22164 case EMSGSIZE: 22165 /* 22166 * For some reason the interface we used initially 22167 * to send segments changed to another or lowered 22168 * its MTU. If TSO was active we either got an 22169 * interface without TSO capabilits or TSO was 22170 * turned off. If we obtained mtu from ip_output() 22171 * then update it and try again. 22172 */ 22173 if (tso) 22174 tp->t_flags &= ~TF_TSO; 22175 if (mtu != 0) { 22176 int saved_mtu; 22177 22178 saved_mtu = tp->t_maxseg; 22179 tcp_mss_update(tp, -1, mtu, NULL, NULL); 22180 if (saved_mtu > tp->t_maxseg) { 22181 goto again; 22182 } 22183 } 22184 pacing_delay = 10 * HPTS_USEC_IN_MSEC; 22185 rack_start_hpts_timer(rack, tp, cts, pacing_delay, 0, 0); 22186 #ifdef TCP_ACCOUNTING 22187 crtsc = get_cyclecount(); 22188 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22189 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 22190 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 22191 } 22192 sched_unpin(); 22193 #endif 22194 return (error); 22195 case ENETUNREACH: 22196 counter_u64_add(rack_saw_enetunreach, 1); 22197 /* FALLTHROUGH */ 22198 case EHOSTDOWN: 22199 case EHOSTUNREACH: 22200 case ENETDOWN: 22201 if (TCPS_HAVERCVDSYN(tp->t_state)) { 22202 tp->t_softerror = error; 22203 error = 0; 22204 } 22205 /* FALLTHROUGH */ 22206 default: 22207 pacing_delay = 10 * HPTS_USEC_IN_MSEC; 22208 rack_start_hpts_timer(rack, tp, cts, pacing_delay, 0, 0); 22209 #ifdef TCP_ACCOUNTING 22210 crtsc = get_cyclecount(); 22211 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22212 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 22213 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 22214 } 22215 sched_unpin(); 22216 #endif 22217 return (error); 22218 } 22219 } else { 22220 rack->rc_enobuf = 0; 22221 if (IN_FASTRECOVERY(tp->t_flags) && rsm) 22222 rack->r_ctl.retran_during_recovery += len; 22223 } 22224 KMOD_TCPSTAT_INC(tcps_sndtotal); 22225 22226 /* 22227 * Data sent (as far as we can tell). If this advertises a larger 22228 * window than any other segment, then remember the size of the 22229 * advertised window. Any pending ACK has now been sent. 22230 */ 22231 if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) 22232 tp->rcv_adv = tp->rcv_nxt + recwin; 22233 22234 tp->last_ack_sent = tp->rcv_nxt; 22235 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 22236 enobufs: 22237 if (sendalot) { 22238 /* Do we need to turn off sendalot? */ 22239 if (pace_max_seg && 22240 (tot_len_this_send >= pace_max_seg)) { 22241 /* We hit our max. */ 22242 sendalot = 0; 22243 } 22244 } 22245 if ((error == 0) && (flags & TH_FIN)) 22246 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN); 22247 if (flags & TH_RST) { 22248 /* 22249 * We don't send again after sending a RST. 22250 */ 22251 pacing_delay = 0; 22252 sendalot = 0; 22253 if (error == 0) 22254 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 22255 } else if ((pacing_delay == 0) && (sendalot == 0) && tot_len_this_send) { 22256 /* 22257 * Get our pacing rate, if an error 22258 * occurred in sending (ENOBUF) we would 22259 * hit the else if with slot preset. Other 22260 * errors return. 22261 */ 22262 pacing_delay = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz, __LINE__); 22263 } 22264 /* We have sent clear the flag */ 22265 rack->r_ent_rec_ns = 0; 22266 if (rack->r_must_retran) { 22267 if (rsm) { 22268 rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start); 22269 if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) { 22270 /* 22271 * We have retransmitted all. 22272 */ 22273 rack->r_must_retran = 0; 22274 rack->r_ctl.rc_out_at_rto = 0; 22275 } 22276 } else if (SEQ_GEQ(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) { 22277 /* 22278 * Sending new data will also kill 22279 * the loop. 22280 */ 22281 rack->r_must_retran = 0; 22282 rack->r_ctl.rc_out_at_rto = 0; 22283 } 22284 } 22285 rack->r_ctl.fsb.recwin = recwin; 22286 if ((tp->t_flags & (TF_WASCRECOVERY|TF_WASFRECOVERY)) && 22287 SEQ_GT(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) { 22288 /* 22289 * We hit an RTO and now have past snd_max at the RTO 22290 * clear all the WAS flags. 22291 */ 22292 tp->t_flags &= ~(TF_WASCRECOVERY|TF_WASFRECOVERY); 22293 } 22294 if (pacing_delay) { 22295 /* set the rack tcb into the slot N */ 22296 if ((error == 0) && 22297 rack_use_rfo && 22298 ((flags & (TH_SYN|TH_FIN)) == 0) && 22299 (rsm == NULL) && 22300 (ipoptlen == 0) && 22301 (doing_tlp == 0) && 22302 rack->r_fsb_inited && 22303 TCPS_HAVEESTABLISHED(tp->t_state) && 22304 ((IN_RECOVERY(tp->t_flags)) == 0) && 22305 (rack->r_must_retran == 0) && 22306 ((tp->t_flags & TF_NEEDFIN) == 0) && 22307 (len > 0) && (orig_len > 0) && 22308 (orig_len > len) && 22309 ((orig_len - len) >= segsiz) && 22310 ((optlen == 0) || 22311 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { 22312 /* We can send at least one more MSS using our fsb */ 22313 rack_setup_fast_output(tp, rack, sb, len, orig_len, 22314 segsiz, pace_max_seg, hw_tls, flags); 22315 } else 22316 rack->r_fast_output = 0; 22317 rack_log_fsb(rack, tp, so, flags, 22318 ipoptlen, orig_len, len, error, 22319 (rsm == NULL), optlen, __LINE__, 2); 22320 } else if (sendalot) { 22321 int ret; 22322 22323 sack_rxmit = 0; 22324 if ((error == 0) && 22325 rack_use_rfo && 22326 ((flags & (TH_SYN|TH_FIN)) == 0) && 22327 (rsm == NULL) && 22328 (doing_tlp == 0) && 22329 (ipoptlen == 0) && 22330 (rack->r_must_retran == 0) && 22331 rack->r_fsb_inited && 22332 TCPS_HAVEESTABLISHED(tp->t_state) && 22333 ((IN_RECOVERY(tp->t_flags)) == 0) && 22334 ((tp->t_flags & TF_NEEDFIN) == 0) && 22335 (len > 0) && (orig_len > 0) && 22336 (orig_len > len) && 22337 ((orig_len - len) >= segsiz) && 22338 ((optlen == 0) || 22339 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { 22340 /* we can use fast_output for more */ 22341 rack_setup_fast_output(tp, rack, sb, len, orig_len, 22342 segsiz, pace_max_seg, hw_tls, flags); 22343 if (rack->r_fast_output) { 22344 error = 0; 22345 ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, &tot_len_this_send, &error, __LINE__); 22346 if (ret >= 0) 22347 return (ret); 22348 else if (error) 22349 goto nomore; 22350 22351 } 22352 } 22353 goto again; 22354 } 22355 skip_all_send: 22356 /* Assure when we leave that snd_nxt will point to top */ 22357 if (SEQ_GT(tp->snd_max, tp->snd_nxt)) 22358 tp->snd_nxt = tp->snd_max; 22359 rack_start_hpts_timer(rack, tp, cts, pacing_delay, tot_len_this_send, 0); 22360 #ifdef TCP_ACCOUNTING 22361 crtsc = get_cyclecount() - ts_val; 22362 if (tot_len_this_send) { 22363 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22364 tp->tcp_cnt_counters[SND_OUT_DATA]++; 22365 tp->tcp_proc_time[SND_OUT_DATA] += crtsc; 22366 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) /segsiz); 22367 } 22368 } else { 22369 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22370 tp->tcp_cnt_counters[SND_OUT_ACK]++; 22371 tp->tcp_proc_time[SND_OUT_ACK] += crtsc; 22372 } 22373 } 22374 sched_unpin(); 22375 #endif 22376 if (error == ENOBUFS) 22377 error = 0; 22378 return (error); 22379 } 22380 22381 static void 22382 rack_update_seg(struct tcp_rack *rack) 22383 { 22384 uint32_t orig_val; 22385 22386 orig_val = rack->r_ctl.rc_pace_max_segs; 22387 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 22388 if (orig_val != rack->r_ctl.rc_pace_max_segs) 22389 rack_log_pacing_delay_calc(rack, 0, 0, orig_val, 0, 0, 15, __LINE__, NULL, 0); 22390 } 22391 22392 static void 22393 rack_mtu_change(struct tcpcb *tp) 22394 { 22395 /* 22396 * The MSS may have changed 22397 */ 22398 struct tcp_rack *rack; 22399 struct rack_sendmap *rsm; 22400 22401 rack = (struct tcp_rack *)tp->t_fb_ptr; 22402 if (rack->r_ctl.rc_pace_min_segs != ctf_fixed_maxseg(tp)) { 22403 /* 22404 * The MTU has changed we need to resend everything 22405 * since all we have sent is lost. We first fix 22406 * up the mtu though. 22407 */ 22408 rack_set_pace_segments(tp, rack, __LINE__, NULL); 22409 /* We treat this like a full retransmit timeout without the cwnd adjustment */ 22410 rack_remxt_tmr(tp); 22411 rack->r_fast_output = 0; 22412 rack->r_ctl.rc_out_at_rto = ctf_flight_size(tp, 22413 rack->r_ctl.rc_sacked); 22414 rack->r_ctl.rc_snd_max_at_rto = tp->snd_max; 22415 rack->r_must_retran = 1; 22416 /* Mark all inflight to needing to be rxt'd */ 22417 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 22418 rsm->r_flags |= (RACK_MUST_RXT|RACK_PMTU_CHG); 22419 } 22420 } 22421 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 22422 /* We don't use snd_nxt to retransmit */ 22423 tp->snd_nxt = tp->snd_max; 22424 } 22425 22426 static int 22427 rack_set_dgp(struct tcp_rack *rack) 22428 { 22429 if (rack->dgp_on == 1) 22430 return(0); 22431 if ((rack->use_fixed_rate == 1) && 22432 (rack->rc_always_pace == 1)) { 22433 /* 22434 * We are already pacing another 22435 * way. 22436 */ 22437 return (EBUSY); 22438 } 22439 if (rack->rc_always_pace == 1) { 22440 rack_remove_pacing(rack); 22441 } 22442 if (tcp_incr_dgp_pacing_cnt() == 0) 22443 return (ENOSPC); 22444 rack->r_ctl.pacing_method |= RACK_DGP_PACING; 22445 rack->rc_fillcw_apply_discount = 0; 22446 rack->dgp_on = 1; 22447 rack->rc_always_pace = 1; 22448 rack->rc_pace_dnd = 1; 22449 rack->use_fixed_rate = 0; 22450 if (rack->gp_ready) 22451 rack_set_cc_pacing(rack); 22452 rack->rc_tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 22453 rack->rack_attempt_hdwr_pace = 0; 22454 /* rxt settings */ 22455 rack->full_size_rxt = 1; 22456 rack->shape_rxt_to_pacing_min = 0; 22457 /* cmpack=1 */ 22458 rack->r_use_cmp_ack = 1; 22459 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) && 22460 rack->r_use_cmp_ack) 22461 rack->rc_tp->t_flags2 |= TF2_MBUF_ACKCMP; 22462 /* scwnd=1 */ 22463 rack->rack_enable_scwnd = 1; 22464 /* dynamic=100 */ 22465 rack->rc_gp_dyn_mul = 1; 22466 /* gp_inc_ca */ 22467 rack->r_ctl.rack_per_of_gp_ca = 100; 22468 /* rrr_conf=3 */ 22469 rack->r_rr_config = 3; 22470 /* npush=2 */ 22471 rack->r_ctl.rc_no_push_at_mrtt = 2; 22472 /* fillcw=1 */ 22473 rack->rc_pace_to_cwnd = 1; 22474 rack->rc_pace_fill_if_rttin_range = 0; 22475 rack->rtt_limit_mul = 0; 22476 /* noprr=1 */ 22477 rack->rack_no_prr = 1; 22478 /* lscwnd=1 */ 22479 rack->r_limit_scw = 1; 22480 /* gp_inc_rec */ 22481 rack->r_ctl.rack_per_of_gp_rec = 90; 22482 return (0); 22483 } 22484 22485 static int 22486 rack_set_profile(struct tcp_rack *rack, int prof) 22487 { 22488 int err = EINVAL; 22489 if (prof == 1) { 22490 /* 22491 * Profile 1 is "standard" DGP. It ignores 22492 * client buffer level. 22493 */ 22494 err = rack_set_dgp(rack); 22495 if (err) 22496 return (err); 22497 } else if (prof == 6) { 22498 err = rack_set_dgp(rack); 22499 if (err) 22500 return (err); 22501 /* 22502 * Profile 6 tweaks DGP so that it will apply to 22503 * fill-cw the same settings that profile5 does 22504 * to replace DGP. It gets then the max(dgp-rate, fillcw(discounted). 22505 */ 22506 rack->rc_fillcw_apply_discount = 1; 22507 } else if (prof == 0) { 22508 /* This changes things back to the default settings */ 22509 if (rack->rc_always_pace == 1) { 22510 rack_remove_pacing(rack); 22511 } else { 22512 /* Make sure any stray flags are off */ 22513 rack->dgp_on = 0; 22514 rack->rc_hybrid_mode = 0; 22515 rack->use_fixed_rate = 0; 22516 } 22517 err = 0; 22518 if (rack_fill_cw_state) 22519 rack->rc_pace_to_cwnd = 1; 22520 else 22521 rack->rc_pace_to_cwnd = 0; 22522 22523 if (rack_pace_every_seg && tcp_can_enable_pacing()) { 22524 rack->r_ctl.pacing_method |= RACK_REG_PACING; 22525 rack->rc_always_pace = 1; 22526 if (rack->rack_hibeta) 22527 rack_set_cc_pacing(rack); 22528 } else 22529 rack->rc_always_pace = 0; 22530 if (rack_dsack_std_based & 0x1) { 22531 /* Basically this means all rack timers are at least (srtt + 1/4 srtt) */ 22532 rack->rc_rack_tmr_std_based = 1; 22533 } 22534 if (rack_dsack_std_based & 0x2) { 22535 /* Basically this means rack timers are extended based on dsack by up to (2 * srtt) */ 22536 rack->rc_rack_use_dsack = 1; 22537 } 22538 if (rack_use_cmp_acks) 22539 rack->r_use_cmp_ack = 1; 22540 else 22541 rack->r_use_cmp_ack = 0; 22542 if (rack_disable_prr) 22543 rack->rack_no_prr = 1; 22544 else 22545 rack->rack_no_prr = 0; 22546 if (rack_gp_no_rec_chg) 22547 rack->rc_gp_no_rec_chg = 1; 22548 else 22549 rack->rc_gp_no_rec_chg = 0; 22550 if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack) { 22551 rack->r_mbuf_queue = 1; 22552 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state)) 22553 rack->rc_tp->t_flags2 |= TF2_MBUF_ACKCMP; 22554 rack->rc_tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 22555 } else { 22556 rack->r_mbuf_queue = 0; 22557 rack->rc_tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ; 22558 } 22559 if (rack_enable_shared_cwnd) 22560 rack->rack_enable_scwnd = 1; 22561 else 22562 rack->rack_enable_scwnd = 0; 22563 if (rack_do_dyn_mul) { 22564 /* When dynamic adjustment is on CA needs to start at 100% */ 22565 rack->rc_gp_dyn_mul = 1; 22566 if (rack_do_dyn_mul >= 100) 22567 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul; 22568 } else { 22569 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca; 22570 rack->rc_gp_dyn_mul = 0; 22571 } 22572 rack->r_rr_config = 0; 22573 rack->r_ctl.rc_no_push_at_mrtt = 0; 22574 rack->rc_pace_fill_if_rttin_range = 0; 22575 rack->rtt_limit_mul = 0; 22576 22577 if (rack_enable_hw_pacing) 22578 rack->rack_hdw_pace_ena = 1; 22579 else 22580 rack->rack_hdw_pace_ena = 0; 22581 if (rack_disable_prr) 22582 rack->rack_no_prr = 1; 22583 else 22584 rack->rack_no_prr = 0; 22585 if (rack_limits_scwnd) 22586 rack->r_limit_scw = 1; 22587 else 22588 rack->r_limit_scw = 0; 22589 rack_init_retransmit_value(rack, rack_rxt_controls); 22590 err = 0; 22591 } 22592 return (err); 22593 } 22594 22595 static int 22596 rack_add_deferred_option(struct tcp_rack *rack, int sopt_name, uint64_t loptval) 22597 { 22598 struct deferred_opt_list *dol; 22599 22600 dol = malloc(sizeof(struct deferred_opt_list), 22601 M_TCPDO, M_NOWAIT|M_ZERO); 22602 if (dol == NULL) { 22603 /* 22604 * No space yikes -- fail out.. 22605 */ 22606 return (0); 22607 } 22608 dol->optname = sopt_name; 22609 dol->optval = loptval; 22610 TAILQ_INSERT_TAIL(&rack->r_ctl.opt_list, dol, next); 22611 return (1); 22612 } 22613 22614 static int 22615 process_hybrid_pacing(struct tcp_rack *rack, struct tcp_hybrid_req *hybrid) 22616 { 22617 #ifdef TCP_REQUEST_TRK 22618 struct tcp_sendfile_track *sft; 22619 struct timeval tv; 22620 tcp_seq seq; 22621 int err; 22622 22623 microuptime(&tv); 22624 22625 /* Make sure no fixed rate is on */ 22626 rack->use_fixed_rate = 0; 22627 rack->r_ctl.rc_fixed_pacing_rate_rec = 0; 22628 rack->r_ctl.rc_fixed_pacing_rate_ca = 0; 22629 rack->r_ctl.rc_fixed_pacing_rate_ss = 0; 22630 /* Now allocate or find our entry that will have these settings */ 22631 sft = tcp_req_alloc_req_full(rack->rc_tp, &hybrid->req, tcp_tv_to_lusec(&tv), 0); 22632 if (sft == NULL) { 22633 rack->rc_tp->tcp_hybrid_error++; 22634 /* no space, where would it have gone? */ 22635 seq = rack->rc_tp->snd_una + rack->rc_tp->t_inpcb.inp_socket->so_snd.sb_ccc; 22636 rack_log_hybrid(rack, seq, NULL, HYBRID_LOG_NO_ROOM, __LINE__, 0); 22637 return (ENOSPC); 22638 } 22639 /* mask our internal flags */ 22640 hybrid->hybrid_flags &= TCP_HYBRID_PACING_USER_MASK; 22641 /* The seq will be snd_una + everything in the buffer */ 22642 seq = sft->start_seq; 22643 if ((hybrid->hybrid_flags & TCP_HYBRID_PACING_ENABLE) == 0) { 22644 /* Disabling hybrid pacing */ 22645 if (rack->rc_hybrid_mode) { 22646 rack_set_profile(rack, 0); 22647 rack->rc_tp->tcp_hybrid_stop++; 22648 } 22649 rack_log_hybrid(rack, seq, sft, HYBRID_LOG_TURNED_OFF, __LINE__, 0); 22650 return (0); 22651 } 22652 if (rack->dgp_on == 0) { 22653 /* 22654 * If we have not yet turned DGP on, do so 22655 * now setting pure DGP mode, no buffer level 22656 * response. 22657 */ 22658 if ((err = rack_set_profile(rack, 1)) != 0){ 22659 /* Failed to turn pacing on */ 22660 rack->rc_tp->tcp_hybrid_error++; 22661 rack_log_hybrid(rack, seq, sft, HYBRID_LOG_NO_PACING, __LINE__, 0); 22662 return (err); 22663 } 22664 } 22665 /* 22666 * Now we must switch to hybrid mode as well which also 22667 * means moving to regular pacing. 22668 */ 22669 if (rack->rc_hybrid_mode == 0) { 22670 /* First time */ 22671 if (tcp_can_enable_pacing()) { 22672 rack->r_ctl.pacing_method |= RACK_REG_PACING; 22673 rack->rc_hybrid_mode = 1; 22674 } else { 22675 return (ENOSPC); 22676 } 22677 if (rack->r_ctl.pacing_method & RACK_DGP_PACING) { 22678 /* 22679 * This should be true. 22680 */ 22681 tcp_dec_dgp_pacing_cnt(); 22682 rack->r_ctl.pacing_method &= ~RACK_DGP_PACING; 22683 } 22684 } 22685 /* Now set in our flags */ 22686 sft->hybrid_flags = hybrid->hybrid_flags | TCP_HYBRID_PACING_WASSET; 22687 if (hybrid->hybrid_flags & TCP_HYBRID_PACING_CSPR) 22688 sft->cspr = hybrid->cspr; 22689 else 22690 sft->cspr = 0; 22691 if (hybrid->hybrid_flags & TCP_HYBRID_PACING_H_MS) 22692 sft->hint_maxseg = hybrid->hint_maxseg; 22693 else 22694 sft->hint_maxseg = 0; 22695 rack->rc_tp->tcp_hybrid_start++; 22696 rack_log_hybrid(rack, seq, sft, HYBRID_LOG_RULES_SET, __LINE__,0); 22697 return (0); 22698 #else 22699 return (ENOTSUP); 22700 #endif 22701 } 22702 22703 static int 22704 rack_stack_information(struct tcpcb *tp, struct stack_specific_info *si) 22705 { 22706 /* We pulled a SSI info log out what was there */ 22707 si->bytes_transmitted = tp->t_sndbytes; 22708 si->bytes_retransmitted = tp->t_snd_rxt_bytes; 22709 return (0); 22710 } 22711 22712 static int 22713 rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, 22714 uint32_t optval, uint64_t loptval, struct tcp_hybrid_req *hybrid) 22715 22716 { 22717 struct epoch_tracker et; 22718 struct sockopt sopt; 22719 struct cc_newreno_opts opt; 22720 uint64_t val; 22721 int error = 0; 22722 uint16_t ca, ss; 22723 22724 switch (sopt_name) { 22725 case TCP_RACK_SET_RXT_OPTIONS: 22726 if (optval <= 2) { 22727 rack_init_retransmit_value(rack, optval); 22728 } else { 22729 /* 22730 * You must send in 0, 1 or 2 all else is 22731 * invalid. 22732 */ 22733 error = EINVAL; 22734 } 22735 break; 22736 case TCP_RACK_DSACK_OPT: 22737 RACK_OPTS_INC(tcp_rack_dsack_opt); 22738 if (optval & 0x1) { 22739 rack->rc_rack_tmr_std_based = 1; 22740 } else { 22741 rack->rc_rack_tmr_std_based = 0; 22742 } 22743 if (optval & 0x2) { 22744 rack->rc_rack_use_dsack = 1; 22745 } else { 22746 rack->rc_rack_use_dsack = 0; 22747 } 22748 rack_log_dsack_event(rack, 5, __LINE__, 0, 0); 22749 break; 22750 case TCP_RACK_PACING_DIVISOR: 22751 RACK_OPTS_INC(tcp_rack_pacing_divisor); 22752 if (optval == 0) { 22753 rack->r_ctl.pace_len_divisor = rack_default_pacing_divisor; 22754 } else { 22755 if (optval < RL_MIN_DIVISOR) 22756 rack->r_ctl.pace_len_divisor = RL_MIN_DIVISOR; 22757 else 22758 rack->r_ctl.pace_len_divisor = optval; 22759 } 22760 break; 22761 case TCP_RACK_HI_BETA: 22762 RACK_OPTS_INC(tcp_rack_hi_beta); 22763 if (optval > 0) { 22764 rack->rack_hibeta = 1; 22765 if ((optval >= 50) && 22766 (optval <= 100)) { 22767 /* 22768 * User wants to set a custom beta. 22769 */ 22770 rack->r_ctl.saved_hibeta = optval; 22771 if (rack->rc_pacing_cc_set) 22772 rack_undo_cc_pacing(rack); 22773 rack->r_ctl.rc_saved_beta = optval; 22774 } 22775 if (rack->rc_pacing_cc_set == 0) 22776 rack_set_cc_pacing(rack); 22777 } else { 22778 rack->rack_hibeta = 0; 22779 if (rack->rc_pacing_cc_set) 22780 rack_undo_cc_pacing(rack); 22781 } 22782 break; 22783 case TCP_RACK_PACING_BETA: 22784 error = EINVAL; 22785 break; 22786 case TCP_RACK_TIMER_SLOP: 22787 RACK_OPTS_INC(tcp_rack_timer_slop); 22788 rack->r_ctl.timer_slop = optval; 22789 if (rack->rc_tp->t_srtt) { 22790 /* 22791 * If we have an SRTT lets update t_rxtcur 22792 * to have the new slop. 22793 */ 22794 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 22795 rack_rto_min, rack_rto_max, 22796 rack->r_ctl.timer_slop); 22797 } 22798 break; 22799 case TCP_RACK_PACING_BETA_ECN: 22800 RACK_OPTS_INC(tcp_rack_beta_ecn); 22801 if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) { 22802 /* This only works for newreno. */ 22803 error = EINVAL; 22804 break; 22805 } 22806 if (rack->rc_pacing_cc_set) { 22807 /* 22808 * Set them into the real CC module 22809 * whats in the rack pcb is the old values 22810 * to be used on restoral/ 22811 */ 22812 sopt.sopt_dir = SOPT_SET; 22813 opt.name = CC_NEWRENO_BETA_ECN; 22814 opt.val = optval; 22815 if (CC_ALGO(tp)->ctl_output != NULL) 22816 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt); 22817 else 22818 error = ENOENT; 22819 } else { 22820 /* 22821 * Not pacing yet so set it into our local 22822 * rack pcb storage. 22823 */ 22824 rack->r_ctl.rc_saved_beta_ecn = optval; 22825 } 22826 break; 22827 case TCP_DEFER_OPTIONS: 22828 RACK_OPTS_INC(tcp_defer_opt); 22829 if (optval) { 22830 if (rack->gp_ready) { 22831 /* Too late */ 22832 error = EINVAL; 22833 break; 22834 } 22835 rack->defer_options = 1; 22836 } else 22837 rack->defer_options = 0; 22838 break; 22839 case TCP_RACK_MEASURE_CNT: 22840 RACK_OPTS_INC(tcp_rack_measure_cnt); 22841 if (optval && (optval <= 0xff)) { 22842 rack->r_ctl.req_measurements = optval; 22843 } else 22844 error = EINVAL; 22845 break; 22846 case TCP_REC_ABC_VAL: 22847 RACK_OPTS_INC(tcp_rec_abc_val); 22848 if (optval > 0) 22849 rack->r_use_labc_for_rec = 1; 22850 else 22851 rack->r_use_labc_for_rec = 0; 22852 break; 22853 case TCP_RACK_ABC_VAL: 22854 RACK_OPTS_INC(tcp_rack_abc_val); 22855 if ((optval > 0) && (optval < 255)) 22856 rack->rc_labc = optval; 22857 else 22858 error = EINVAL; 22859 break; 22860 case TCP_HDWR_UP_ONLY: 22861 RACK_OPTS_INC(tcp_pacing_up_only); 22862 if (optval) 22863 rack->r_up_only = 1; 22864 else 22865 rack->r_up_only = 0; 22866 break; 22867 case TCP_FILLCW_RATE_CAP: /* URL:fillcw_cap */ 22868 RACK_OPTS_INC(tcp_fillcw_rate_cap); 22869 rack->r_ctl.fillcw_cap = loptval; 22870 break; 22871 case TCP_PACING_RATE_CAP: 22872 RACK_OPTS_INC(tcp_pacing_rate_cap); 22873 if ((rack->dgp_on == 1) && 22874 (rack->r_ctl.pacing_method & RACK_DGP_PACING)) { 22875 /* 22876 * If we are doing DGP we need to switch 22877 * to using the pacing limit. 22878 */ 22879 if (tcp_can_enable_pacing() == 0) { 22880 error = ENOSPC; 22881 break; 22882 } 22883 /* 22884 * Now change up the flags and counts to be correct. 22885 */ 22886 rack->r_ctl.pacing_method |= RACK_REG_PACING; 22887 tcp_dec_dgp_pacing_cnt(); 22888 rack->r_ctl.pacing_method &= ~RACK_DGP_PACING; 22889 } 22890 rack->r_ctl.bw_rate_cap = loptval; 22891 break; 22892 case TCP_HYBRID_PACING: 22893 if (hybrid == NULL) { 22894 error = EINVAL; 22895 break; 22896 } 22897 if (rack->r_ctl.side_chan_dis_mask & HYBRID_DIS_MASK) { 22898 error = EPERM; 22899 break; 22900 } 22901 error = process_hybrid_pacing(rack, hybrid); 22902 break; 22903 case TCP_SIDECHAN_DIS: /* URL:scodm */ 22904 if (optval) 22905 rack->r_ctl.side_chan_dis_mask = optval; 22906 else 22907 rack->r_ctl.side_chan_dis_mask = 0; 22908 break; 22909 case TCP_RACK_PROFILE: 22910 RACK_OPTS_INC(tcp_profile); 22911 error = rack_set_profile(rack, optval); 22912 break; 22913 case TCP_USE_CMP_ACKS: 22914 RACK_OPTS_INC(tcp_use_cmp_acks); 22915 if ((optval == 0) && (tp->t_flags2 & TF2_MBUF_ACKCMP)) { 22916 /* You can't turn it off once its on! */ 22917 error = EINVAL; 22918 } else if ((optval == 1) && (rack->r_use_cmp_ack == 0)) { 22919 rack->r_use_cmp_ack = 1; 22920 rack->r_mbuf_queue = 1; 22921 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 22922 } 22923 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) 22924 tp->t_flags2 |= TF2_MBUF_ACKCMP; 22925 break; 22926 case TCP_SHARED_CWND_TIME_LIMIT: 22927 RACK_OPTS_INC(tcp_lscwnd); 22928 if (optval) 22929 rack->r_limit_scw = 1; 22930 else 22931 rack->r_limit_scw = 0; 22932 break; 22933 case TCP_RACK_DGP_IN_REC: 22934 error = EINVAL; 22935 break; 22936 case TCP_RACK_PACE_TO_FILL: 22937 RACK_OPTS_INC(tcp_fillcw); 22938 if (optval == 0) 22939 rack->rc_pace_to_cwnd = 0; 22940 else { 22941 rack->rc_pace_to_cwnd = 1; 22942 } 22943 if ((optval >= rack_gp_rtt_maxmul) && 22944 rack_gp_rtt_maxmul && 22945 (optval < 0xf)) { 22946 rack->rc_pace_fill_if_rttin_range = 1; 22947 rack->rtt_limit_mul = optval; 22948 } else { 22949 rack->rc_pace_fill_if_rttin_range = 0; 22950 rack->rtt_limit_mul = 0; 22951 } 22952 break; 22953 case TCP_RACK_NO_PUSH_AT_MAX: 22954 RACK_OPTS_INC(tcp_npush); 22955 if (optval == 0) 22956 rack->r_ctl.rc_no_push_at_mrtt = 0; 22957 else if (optval < 0xff) 22958 rack->r_ctl.rc_no_push_at_mrtt = optval; 22959 else 22960 error = EINVAL; 22961 break; 22962 case TCP_SHARED_CWND_ENABLE: 22963 RACK_OPTS_INC(tcp_rack_scwnd); 22964 if (optval == 0) 22965 rack->rack_enable_scwnd = 0; 22966 else 22967 rack->rack_enable_scwnd = 1; 22968 break; 22969 case TCP_RACK_MBUF_QUEUE: 22970 /* Now do we use the LRO mbuf-queue feature */ 22971 RACK_OPTS_INC(tcp_rack_mbufq); 22972 if (optval || rack->r_use_cmp_ack) 22973 rack->r_mbuf_queue = 1; 22974 else 22975 rack->r_mbuf_queue = 0; 22976 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 22977 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 22978 else 22979 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ; 22980 break; 22981 case TCP_RACK_NONRXT_CFG_RATE: 22982 RACK_OPTS_INC(tcp_rack_cfg_rate); 22983 if (optval == 0) 22984 rack->rack_rec_nonrxt_use_cr = 0; 22985 else 22986 rack->rack_rec_nonrxt_use_cr = 1; 22987 break; 22988 case TCP_NO_PRR: 22989 RACK_OPTS_INC(tcp_rack_noprr); 22990 if (optval == 0) 22991 rack->rack_no_prr = 0; 22992 else if (optval == 1) 22993 rack->rack_no_prr = 1; 22994 else if (optval == 2) 22995 rack->no_prr_addback = 1; 22996 else 22997 error = EINVAL; 22998 break; 22999 case RACK_CSPR_IS_FCC: /* URL:csprisfcc */ 23000 if (optval > 0) 23001 rack->cspr_is_fcc = 1; 23002 else 23003 rack->cspr_is_fcc = 0; 23004 break; 23005 case TCP_TIMELY_DYN_ADJ: 23006 RACK_OPTS_INC(tcp_timely_dyn); 23007 if (optval == 0) 23008 rack->rc_gp_dyn_mul = 0; 23009 else { 23010 rack->rc_gp_dyn_mul = 1; 23011 if (optval >= 100) { 23012 /* 23013 * If the user sets something 100 or more 23014 * its the gp_ca value. 23015 */ 23016 rack->r_ctl.rack_per_of_gp_ca = optval; 23017 } 23018 } 23019 break; 23020 case TCP_RACK_DO_DETECTION: 23021 error = EINVAL; 23022 break; 23023 case TCP_RACK_TLP_USE: 23024 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) { 23025 error = EINVAL; 23026 break; 23027 } 23028 RACK_OPTS_INC(tcp_tlp_use); 23029 rack->rack_tlp_threshold_use = optval; 23030 break; 23031 case TCP_RACK_TLP_REDUCE: 23032 /* RACK TLP cwnd reduction (bool) */ 23033 RACK_OPTS_INC(tcp_rack_tlp_reduce); 23034 rack->r_ctl.rc_tlp_cwnd_reduce = optval; 23035 break; 23036 /* Pacing related ones */ 23037 case TCP_RACK_PACE_ALWAYS: 23038 /* 23039 * zero is old rack method, 1 is new 23040 * method using a pacing rate. 23041 */ 23042 RACK_OPTS_INC(tcp_rack_pace_always); 23043 if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) { 23044 error = EPERM; 23045 break; 23046 } 23047 if (optval > 0) { 23048 if (rack->rc_always_pace) { 23049 error = EALREADY; 23050 break; 23051 } else if (tcp_can_enable_pacing()) { 23052 rack->r_ctl.pacing_method |= RACK_REG_PACING; 23053 rack->rc_always_pace = 1; 23054 if (rack->rack_hibeta) 23055 rack_set_cc_pacing(rack); 23056 } 23057 else { 23058 error = ENOSPC; 23059 break; 23060 } 23061 } else { 23062 if (rack->rc_always_pace == 1) { 23063 rack_remove_pacing(rack); 23064 } 23065 } 23066 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 23067 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 23068 else 23069 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ; 23070 /* A rate may be set irate or other, if so set seg size */ 23071 rack_update_seg(rack); 23072 break; 23073 case TCP_BBR_RACK_INIT_RATE: 23074 RACK_OPTS_INC(tcp_initial_rate); 23075 val = optval; 23076 /* Change from kbits per second to bytes per second */ 23077 val *= 1000; 23078 val /= 8; 23079 rack->r_ctl.init_rate = val; 23080 if (rack->rc_always_pace) 23081 rack_update_seg(rack); 23082 break; 23083 case TCP_BBR_IWINTSO: 23084 error = EINVAL; 23085 break; 23086 case TCP_RACK_FORCE_MSEG: 23087 RACK_OPTS_INC(tcp_rack_force_max_seg); 23088 if (optval) 23089 rack->rc_force_max_seg = 1; 23090 else 23091 rack->rc_force_max_seg = 0; 23092 break; 23093 case TCP_RACK_PACE_MIN_SEG: 23094 RACK_OPTS_INC(tcp_rack_min_seg); 23095 rack->r_ctl.rc_user_set_min_segs = (0x0000ffff & optval); 23096 rack_set_pace_segments(tp, rack, __LINE__, NULL); 23097 break; 23098 case TCP_RACK_PACE_MAX_SEG: 23099 /* Max segments size in a pace in bytes */ 23100 RACK_OPTS_INC(tcp_rack_max_seg); 23101 if ((rack->dgp_on == 1) && 23102 (rack->r_ctl.pacing_method & RACK_DGP_PACING)) { 23103 /* 23104 * If we set a max-seg and are doing DGP then 23105 * we now fall under the pacing limits not the 23106 * DGP ones. 23107 */ 23108 if (tcp_can_enable_pacing() == 0) { 23109 error = ENOSPC; 23110 break; 23111 } 23112 /* 23113 * Now change up the flags and counts to be correct. 23114 */ 23115 rack->r_ctl.pacing_method |= RACK_REG_PACING; 23116 tcp_dec_dgp_pacing_cnt(); 23117 rack->r_ctl.pacing_method &= ~RACK_DGP_PACING; 23118 } 23119 if (optval <= MAX_USER_SET_SEG) 23120 rack->rc_user_set_max_segs = optval; 23121 else 23122 rack->rc_user_set_max_segs = MAX_USER_SET_SEG; 23123 rack_set_pace_segments(tp, rack, __LINE__, NULL); 23124 break; 23125 case TCP_RACK_PACE_RATE_REC: 23126 /* Set the fixed pacing rate in Bytes per second ca */ 23127 RACK_OPTS_INC(tcp_rack_pace_rate_rec); 23128 if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) { 23129 error = EPERM; 23130 break; 23131 } 23132 if (rack->dgp_on) { 23133 /* 23134 * We are already pacing another 23135 * way. 23136 */ 23137 error = EBUSY; 23138 break; 23139 } 23140 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 23141 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) 23142 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 23143 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) 23144 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 23145 rack->use_fixed_rate = 1; 23146 if (rack->rack_hibeta) 23147 rack_set_cc_pacing(rack); 23148 rack_log_pacing_delay_calc(rack, 23149 rack->r_ctl.rc_fixed_pacing_rate_ss, 23150 rack->r_ctl.rc_fixed_pacing_rate_ca, 23151 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 23152 __LINE__, NULL,0); 23153 break; 23154 23155 case TCP_RACK_PACE_RATE_SS: 23156 /* Set the fixed pacing rate in Bytes per second ca */ 23157 RACK_OPTS_INC(tcp_rack_pace_rate_ss); 23158 if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) { 23159 error = EPERM; 23160 break; 23161 } 23162 if (rack->dgp_on) { 23163 /* 23164 * We are already pacing another 23165 * way. 23166 */ 23167 error = EBUSY; 23168 break; 23169 } 23170 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 23171 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) 23172 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 23173 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) 23174 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 23175 rack->use_fixed_rate = 1; 23176 if (rack->rack_hibeta) 23177 rack_set_cc_pacing(rack); 23178 rack_log_pacing_delay_calc(rack, 23179 rack->r_ctl.rc_fixed_pacing_rate_ss, 23180 rack->r_ctl.rc_fixed_pacing_rate_ca, 23181 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 23182 __LINE__, NULL, 0); 23183 break; 23184 23185 case TCP_RACK_PACE_RATE_CA: 23186 /* Set the fixed pacing rate in Bytes per second ca */ 23187 RACK_OPTS_INC(tcp_rack_pace_rate_ca); 23188 if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) { 23189 error = EPERM; 23190 break; 23191 } 23192 if (rack->dgp_on) { 23193 /* 23194 * We are already pacing another 23195 * way. 23196 */ 23197 error = EBUSY; 23198 break; 23199 } 23200 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 23201 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) 23202 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 23203 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) 23204 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 23205 rack->use_fixed_rate = 1; 23206 if (rack->rack_hibeta) 23207 rack_set_cc_pacing(rack); 23208 rack_log_pacing_delay_calc(rack, 23209 rack->r_ctl.rc_fixed_pacing_rate_ss, 23210 rack->r_ctl.rc_fixed_pacing_rate_ca, 23211 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 23212 __LINE__, NULL, 0); 23213 break; 23214 case TCP_RACK_GP_INCREASE_REC: 23215 RACK_OPTS_INC(tcp_gp_inc_rec); 23216 rack->r_ctl.rack_per_of_gp_rec = optval; 23217 rack_log_pacing_delay_calc(rack, 23218 rack->r_ctl.rack_per_of_gp_ss, 23219 rack->r_ctl.rack_per_of_gp_ca, 23220 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 23221 __LINE__, NULL, 0); 23222 break; 23223 case TCP_RACK_GP_INCREASE_CA: 23224 RACK_OPTS_INC(tcp_gp_inc_ca); 23225 ca = optval; 23226 if (ca < 100) { 23227 /* 23228 * We don't allow any reduction 23229 * over the GP b/w. 23230 */ 23231 error = EINVAL; 23232 break; 23233 } 23234 rack->r_ctl.rack_per_of_gp_ca = ca; 23235 rack_log_pacing_delay_calc(rack, 23236 rack->r_ctl.rack_per_of_gp_ss, 23237 rack->r_ctl.rack_per_of_gp_ca, 23238 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 23239 __LINE__, NULL, 0); 23240 break; 23241 case TCP_RACK_GP_INCREASE_SS: 23242 RACK_OPTS_INC(tcp_gp_inc_ss); 23243 ss = optval; 23244 if (ss < 100) { 23245 /* 23246 * We don't allow any reduction 23247 * over the GP b/w. 23248 */ 23249 error = EINVAL; 23250 break; 23251 } 23252 rack->r_ctl.rack_per_of_gp_ss = ss; 23253 rack_log_pacing_delay_calc(rack, 23254 rack->r_ctl.rack_per_of_gp_ss, 23255 rack->r_ctl.rack_per_of_gp_ca, 23256 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 23257 __LINE__, NULL, 0); 23258 break; 23259 case TCP_RACK_RR_CONF: 23260 RACK_OPTS_INC(tcp_rack_rrr_no_conf_rate); 23261 if (optval && optval <= 3) 23262 rack->r_rr_config = optval; 23263 else 23264 rack->r_rr_config = 0; 23265 break; 23266 case TCP_PACING_DND: /* URL:dnd */ 23267 if (optval > 0) 23268 rack->rc_pace_dnd = 1; 23269 else 23270 rack->rc_pace_dnd = 0; 23271 break; 23272 case TCP_HDWR_RATE_CAP: 23273 RACK_OPTS_INC(tcp_hdwr_rate_cap); 23274 if (optval) { 23275 if (rack->r_rack_hw_rate_caps == 0) 23276 rack->r_rack_hw_rate_caps = 1; 23277 else 23278 error = EALREADY; 23279 } else { 23280 rack->r_rack_hw_rate_caps = 0; 23281 } 23282 break; 23283 case TCP_DGP_UPPER_BOUNDS: 23284 { 23285 uint8_t val; 23286 val = optval & 0x0000ff; 23287 rack->r_ctl.rack_per_upper_bound_ca = val; 23288 val = (optval >> 16) & 0x0000ff; 23289 rack->r_ctl.rack_per_upper_bound_ss = val; 23290 break; 23291 } 23292 case TCP_SS_EEXIT: /* URL:eexit */ 23293 if (optval > 0) { 23294 rack->r_ctl.gp_rnd_thresh = optval & 0x0ff; 23295 if (optval & 0x10000) { 23296 rack->r_ctl.gate_to_fs = 1; 23297 } else { 23298 rack->r_ctl.gate_to_fs = 0; 23299 } 23300 if (optval & 0x20000) { 23301 rack->r_ctl.use_gp_not_last = 1; 23302 } else { 23303 rack->r_ctl.use_gp_not_last = 0; 23304 } 23305 if (optval & 0xfffc0000) { 23306 uint32_t v; 23307 23308 v = (optval >> 18) & 0x00003fff; 23309 if (v >= 1000) 23310 rack->r_ctl.gp_gain_req = v; 23311 } 23312 } else { 23313 /* We do not do ss early exit at all */ 23314 rack->rc_initial_ss_comp = 1; 23315 rack->r_ctl.gp_rnd_thresh = 0; 23316 } 23317 break; 23318 case TCP_RACK_SPLIT_LIMIT: 23319 RACK_OPTS_INC(tcp_split_limit); 23320 rack->r_ctl.rc_split_limit = optval; 23321 break; 23322 case TCP_BBR_HDWR_PACE: 23323 RACK_OPTS_INC(tcp_hdwr_pacing); 23324 if (optval){ 23325 if (rack->rack_hdrw_pacing == 0) { 23326 rack->rack_hdw_pace_ena = 1; 23327 rack->rack_attempt_hdwr_pace = 0; 23328 } else 23329 error = EALREADY; 23330 } else { 23331 rack->rack_hdw_pace_ena = 0; 23332 #ifdef RATELIMIT 23333 if (rack->r_ctl.crte != NULL) { 23334 rack->rack_hdrw_pacing = 0; 23335 rack->rack_attempt_hdwr_pace = 0; 23336 tcp_rel_pacing_rate(rack->r_ctl.crte, tp); 23337 rack->r_ctl.crte = NULL; 23338 } 23339 #endif 23340 } 23341 break; 23342 /* End Pacing related ones */ 23343 case TCP_RACK_PRR_SENDALOT: 23344 /* Allow PRR to send more than one seg */ 23345 RACK_OPTS_INC(tcp_rack_prr_sendalot); 23346 rack->r_ctl.rc_prr_sendalot = optval; 23347 break; 23348 case TCP_RACK_MIN_TO: 23349 /* Minimum time between rack t-o's in ms */ 23350 RACK_OPTS_INC(tcp_rack_min_to); 23351 rack->r_ctl.rc_min_to = optval; 23352 break; 23353 case TCP_RACK_EARLY_SEG: 23354 /* If early recovery max segments */ 23355 RACK_OPTS_INC(tcp_rack_early_seg); 23356 rack->r_ctl.rc_early_recovery_segs = optval; 23357 break; 23358 case TCP_RACK_ENABLE_HYSTART: 23359 { 23360 if (optval) { 23361 tp->t_ccv.flags |= CCF_HYSTART_ALLOWED; 23362 if (rack_do_hystart > RACK_HYSTART_ON) 23363 tp->t_ccv.flags |= CCF_HYSTART_CAN_SH_CWND; 23364 if (rack_do_hystart > RACK_HYSTART_ON_W_SC) 23365 tp->t_ccv.flags |= CCF_HYSTART_CONS_SSTH; 23366 } else { 23367 tp->t_ccv.flags &= ~(CCF_HYSTART_ALLOWED|CCF_HYSTART_CAN_SH_CWND|CCF_HYSTART_CONS_SSTH); 23368 } 23369 } 23370 break; 23371 case TCP_RACK_REORD_THRESH: 23372 /* RACK reorder threshold (shift amount) */ 23373 RACK_OPTS_INC(tcp_rack_reord_thresh); 23374 if ((optval > 0) && (optval < 31)) 23375 rack->r_ctl.rc_reorder_shift = optval; 23376 else 23377 error = EINVAL; 23378 break; 23379 case TCP_RACK_REORD_FADE: 23380 /* Does reordering fade after ms time */ 23381 RACK_OPTS_INC(tcp_rack_reord_fade); 23382 rack->r_ctl.rc_reorder_fade = optval; 23383 break; 23384 case TCP_RACK_TLP_THRESH: 23385 /* RACK TLP theshold i.e. srtt+(srtt/N) */ 23386 RACK_OPTS_INC(tcp_rack_tlp_thresh); 23387 if (optval) 23388 rack->r_ctl.rc_tlp_threshold = optval; 23389 else 23390 error = EINVAL; 23391 break; 23392 case TCP_BBR_USE_RACK_RR: 23393 RACK_OPTS_INC(tcp_rack_rr); 23394 if (optval) 23395 rack->use_rack_rr = 1; 23396 else 23397 rack->use_rack_rr = 0; 23398 break; 23399 case TCP_RACK_PKT_DELAY: 23400 /* RACK added ms i.e. rack-rtt + reord + N */ 23401 RACK_OPTS_INC(tcp_rack_pkt_delay); 23402 rack->r_ctl.rc_pkt_delay = optval; 23403 break; 23404 case TCP_DELACK: 23405 RACK_OPTS_INC(tcp_rack_delayed_ack); 23406 if (optval == 0) 23407 tp->t_delayed_ack = 0; 23408 else 23409 tp->t_delayed_ack = 1; 23410 if (tp->t_flags & TF_DELACK) { 23411 tp->t_flags &= ~TF_DELACK; 23412 tp->t_flags |= TF_ACKNOW; 23413 NET_EPOCH_ENTER(et); 23414 rack_output(tp); 23415 NET_EPOCH_EXIT(et); 23416 } 23417 break; 23418 23419 case TCP_BBR_RACK_RTT_USE: 23420 RACK_OPTS_INC(tcp_rack_rtt_use); 23421 if ((optval != USE_RTT_HIGH) && 23422 (optval != USE_RTT_LOW) && 23423 (optval != USE_RTT_AVG)) 23424 error = EINVAL; 23425 else 23426 rack->r_ctl.rc_rate_sample_method = optval; 23427 break; 23428 case TCP_HONOR_HPTS_MIN: 23429 RACK_OPTS_INC(tcp_honor_hpts); 23430 if (optval) { 23431 rack->r_use_hpts_min = 1; 23432 /* 23433 * Must be between 2 - 80% to be a reduction else 23434 * we keep the default (10%). 23435 */ 23436 if ((optval > 1) && (optval <= 80)) { 23437 rack->r_ctl.max_reduction = optval; 23438 } 23439 } else 23440 rack->r_use_hpts_min = 0; 23441 break; 23442 case TCP_REC_IS_DYN: /* URL:dynrec */ 23443 RACK_OPTS_INC(tcp_dyn_rec); 23444 if (optval) 23445 rack->rc_gp_no_rec_chg = 1; 23446 else 23447 rack->rc_gp_no_rec_chg = 0; 23448 break; 23449 case TCP_NO_TIMELY: 23450 RACK_OPTS_INC(tcp_notimely); 23451 if (optval) { 23452 rack->rc_skip_timely = 1; 23453 rack->r_ctl.rack_per_of_gp_rec = 90; 23454 rack->r_ctl.rack_per_of_gp_ca = 100; 23455 rack->r_ctl.rack_per_of_gp_ss = 250; 23456 } else { 23457 rack->rc_skip_timely = 0; 23458 } 23459 break; 23460 case TCP_GP_USE_LTBW: 23461 if (optval == 0) { 23462 rack->use_lesser_lt_bw = 0; 23463 rack->dis_lt_bw = 1; 23464 } else if (optval == 1) { 23465 rack->use_lesser_lt_bw = 1; 23466 rack->dis_lt_bw = 0; 23467 } else if (optval == 2) { 23468 rack->use_lesser_lt_bw = 0; 23469 rack->dis_lt_bw = 0; 23470 } 23471 break; 23472 case TCP_DATA_AFTER_CLOSE: 23473 RACK_OPTS_INC(tcp_data_after_close); 23474 if (optval) 23475 rack->rc_allow_data_af_clo = 1; 23476 else 23477 rack->rc_allow_data_af_clo = 0; 23478 break; 23479 default: 23480 break; 23481 } 23482 tcp_log_socket_option(tp, sopt_name, optval, error); 23483 return (error); 23484 } 23485 23486 static void 23487 rack_inherit(struct tcpcb *tp, struct inpcb *parent) 23488 { 23489 /* 23490 * A new connection has been created (tp) and 23491 * the parent is the inpcb given. We want to 23492 * apply a read-lock to the parent (we are already 23493 * holding a write lock on the tp) and copy anything 23494 * out of the rack specific data as long as its tfb is 23495 * the same as ours i.e. we are the same stack. Otherwise 23496 * we just return. 23497 */ 23498 struct tcpcb *par; 23499 struct tcp_rack *dest, *src; 23500 int cnt = 0; 23501 23502 par = intotcpcb(parent); 23503 if (par->t_fb != tp->t_fb) { 23504 /* Not the same stack */ 23505 tcp_log_socket_option(tp, 0, 0, 1); 23506 return; 23507 } 23508 /* Ok if we reach here lets setup the two rack pointers */ 23509 dest = (struct tcp_rack *)tp->t_fb_ptr; 23510 src = (struct tcp_rack *)par->t_fb_ptr; 23511 if ((src == NULL) || (dest == NULL)) { 23512 /* Huh? */ 23513 tcp_log_socket_option(tp, 0, 0, 2); 23514 return; 23515 } 23516 /* Now copy out anything we wish to inherit i.e. things in socket-options */ 23517 /* TCP_RACK_PROFILE we can't know but we can set DGP if its on */ 23518 if ((src->dgp_on) && (dest->dgp_on == 0)) { 23519 /* Profile 1 had to be set via sock opt */ 23520 rack_set_dgp(dest); 23521 cnt++; 23522 } 23523 /* TCP_RACK_SET_RXT_OPTIONS */ 23524 if (dest->full_size_rxt != src->full_size_rxt) { 23525 dest->full_size_rxt = src->full_size_rxt; 23526 cnt++; 23527 } 23528 if (dest->shape_rxt_to_pacing_min != src->shape_rxt_to_pacing_min) { 23529 dest->shape_rxt_to_pacing_min = src->shape_rxt_to_pacing_min; 23530 cnt++; 23531 } 23532 /* TCP_RACK_DSACK_OPT */ 23533 if (dest->rc_rack_tmr_std_based != src->rc_rack_tmr_std_based) { 23534 dest->rc_rack_tmr_std_based = src->rc_rack_tmr_std_based; 23535 cnt++; 23536 } 23537 if (dest->rc_rack_use_dsack != src->rc_rack_use_dsack) { 23538 dest->rc_rack_use_dsack = src->rc_rack_use_dsack; 23539 cnt++; 23540 } 23541 /* TCP_RACK_PACING_DIVISOR */ 23542 if (dest->r_ctl.pace_len_divisor != src->r_ctl.pace_len_divisor) { 23543 dest->r_ctl.pace_len_divisor = src->r_ctl.pace_len_divisor; 23544 cnt++; 23545 } 23546 /* TCP_RACK_HI_BETA */ 23547 if (src->rack_hibeta != dest->rack_hibeta) { 23548 cnt++; 23549 if (src->rack_hibeta) { 23550 dest->r_ctl.rc_saved_beta = src->r_ctl.rc_saved_beta; 23551 dest->rack_hibeta = 1; 23552 } else { 23553 dest->rack_hibeta = 0; 23554 } 23555 } 23556 /* TCP_RACK_TIMER_SLOP */ 23557 if (dest->r_ctl.timer_slop != src->r_ctl.timer_slop) { 23558 dest->r_ctl.timer_slop = src->r_ctl.timer_slop; 23559 cnt++; 23560 } 23561 /* TCP_RACK_PACING_BETA_ECN */ 23562 if (dest->r_ctl.rc_saved_beta_ecn != src->r_ctl.rc_saved_beta_ecn) { 23563 dest->r_ctl.rc_saved_beta_ecn = src->r_ctl.rc_saved_beta_ecn; 23564 cnt++; 23565 } 23566 /* We do not do TCP_DEFER_OPTIONS */ 23567 /* TCP_RACK_MEASURE_CNT */ 23568 if (dest->r_ctl.req_measurements != src->r_ctl.req_measurements) { 23569 dest->r_ctl.req_measurements = src->r_ctl.req_measurements; 23570 cnt++; 23571 } 23572 /* TCP_HDWR_UP_ONLY */ 23573 if (dest->r_up_only != src->r_up_only) { 23574 dest->r_up_only = src->r_up_only; 23575 cnt++; 23576 } 23577 /* TCP_FILLCW_RATE_CAP */ 23578 if (dest->r_ctl.fillcw_cap != src->r_ctl.fillcw_cap) { 23579 dest->r_ctl.fillcw_cap = src->r_ctl.fillcw_cap; 23580 cnt++; 23581 } 23582 /* TCP_PACING_RATE_CAP */ 23583 if (dest->r_ctl.bw_rate_cap != src->r_ctl.bw_rate_cap) { 23584 dest->r_ctl.bw_rate_cap = src->r_ctl.bw_rate_cap; 23585 cnt++; 23586 } 23587 /* A listener can't set TCP_HYBRID_PACING */ 23588 /* TCP_SIDECHAN_DIS */ 23589 if (dest->r_ctl.side_chan_dis_mask != src->r_ctl.side_chan_dis_mask) { 23590 dest->r_ctl.side_chan_dis_mask = src->r_ctl.side_chan_dis_mask; 23591 cnt++; 23592 } 23593 /* TCP_SHARED_CWND_TIME_LIMIT */ 23594 if (dest->r_limit_scw != src->r_limit_scw) { 23595 dest->r_limit_scw = src->r_limit_scw; 23596 cnt++; 23597 } 23598 /* TCP_RACK_PACE_TO_FILL */ 23599 if (dest->rc_pace_to_cwnd != src->rc_pace_to_cwnd) { 23600 dest->rc_pace_to_cwnd = src->rc_pace_to_cwnd; 23601 cnt++; 23602 } 23603 if (dest->rc_pace_fill_if_rttin_range != src->rc_pace_fill_if_rttin_range) { 23604 dest->rc_pace_fill_if_rttin_range = src->rc_pace_fill_if_rttin_range; 23605 cnt++; 23606 } 23607 if (dest->rtt_limit_mul != src->rtt_limit_mul) { 23608 dest->rtt_limit_mul = src->rtt_limit_mul; 23609 cnt++; 23610 } 23611 /* TCP_RACK_NO_PUSH_AT_MAX */ 23612 if (dest->r_ctl.rc_no_push_at_mrtt != src->r_ctl.rc_no_push_at_mrtt) { 23613 dest->r_ctl.rc_no_push_at_mrtt = src->r_ctl.rc_no_push_at_mrtt; 23614 cnt++; 23615 } 23616 /* TCP_SHARED_CWND_ENABLE */ 23617 if (dest->rack_enable_scwnd != src->rack_enable_scwnd) { 23618 dest->rack_enable_scwnd = src->rack_enable_scwnd; 23619 cnt++; 23620 } 23621 /* TCP_USE_CMP_ACKS */ 23622 if (dest->r_use_cmp_ack != src->r_use_cmp_ack) { 23623 dest->r_use_cmp_ack = src->r_use_cmp_ack; 23624 cnt++; 23625 } 23626 23627 if (dest->r_mbuf_queue != src->r_mbuf_queue) { 23628 dest->r_mbuf_queue = src->r_mbuf_queue; 23629 cnt++; 23630 } 23631 /* TCP_RACK_MBUF_QUEUE */ 23632 if (dest->r_mbuf_queue != src->r_mbuf_queue) { 23633 dest->r_mbuf_queue = src->r_mbuf_queue; 23634 cnt++; 23635 } 23636 if (dest->r_mbuf_queue || dest->rc_always_pace || dest->r_use_cmp_ack) { 23637 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 23638 } else { 23639 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ; 23640 } 23641 if (dest->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) { 23642 tp->t_flags2 |= TF2_MBUF_ACKCMP; 23643 } 23644 /* TCP_RACK_NONRXT_CFG_RATE */ 23645 if (dest->rack_rec_nonrxt_use_cr != src->rack_rec_nonrxt_use_cr) { 23646 dest->rack_rec_nonrxt_use_cr = src->rack_rec_nonrxt_use_cr; 23647 cnt++; 23648 } 23649 /* TCP_NO_PRR */ 23650 if (dest->rack_no_prr != src->rack_no_prr) { 23651 dest->rack_no_prr = src->rack_no_prr; 23652 cnt++; 23653 } 23654 if (dest->no_prr_addback != src->no_prr_addback) { 23655 dest->no_prr_addback = src->no_prr_addback; 23656 cnt++; 23657 } 23658 /* RACK_CSPR_IS_FCC */ 23659 if (dest->cspr_is_fcc != src->cspr_is_fcc) { 23660 dest->cspr_is_fcc = src->cspr_is_fcc; 23661 cnt++; 23662 } 23663 /* TCP_TIMELY_DYN_ADJ */ 23664 if (dest->rc_gp_dyn_mul != src->rc_gp_dyn_mul) { 23665 dest->rc_gp_dyn_mul = src->rc_gp_dyn_mul; 23666 cnt++; 23667 } 23668 if (dest->r_ctl.rack_per_of_gp_ca != src->r_ctl.rack_per_of_gp_ca) { 23669 dest->r_ctl.rack_per_of_gp_ca = src->r_ctl.rack_per_of_gp_ca; 23670 cnt++; 23671 } 23672 /* TCP_RACK_TLP_USE */ 23673 if (dest->rack_tlp_threshold_use != src->rack_tlp_threshold_use) { 23674 dest->rack_tlp_threshold_use = src->rack_tlp_threshold_use; 23675 cnt++; 23676 } 23677 /* we don't allow inheritence of TCP_RACK_PACE_ALWAYS */ 23678 /* TCP_BBR_RACK_INIT_RATE */ 23679 if (dest->r_ctl.init_rate != src->r_ctl.init_rate) { 23680 dest->r_ctl.init_rate = src->r_ctl.init_rate; 23681 cnt++; 23682 } 23683 /* TCP_RACK_FORCE_MSEG */ 23684 if (dest->rc_force_max_seg != src->rc_force_max_seg) { 23685 dest->rc_force_max_seg = src->rc_force_max_seg; 23686 cnt++; 23687 } 23688 /* TCP_RACK_PACE_MIN_SEG */ 23689 if (dest->r_ctl.rc_user_set_min_segs != src->r_ctl.rc_user_set_min_segs) { 23690 dest->r_ctl.rc_user_set_min_segs = src->r_ctl.rc_user_set_min_segs; 23691 cnt++; 23692 } 23693 /* we don't allow TCP_RACK_PACE_MAX_SEG */ 23694 /* TCP_RACK_PACE_RATE_REC, TCP_RACK_PACE_RATE_SS, TCP_RACK_PACE_RATE_CA */ 23695 if (dest->r_ctl.rc_fixed_pacing_rate_ca != src->r_ctl.rc_fixed_pacing_rate_ca) { 23696 dest->r_ctl.rc_fixed_pacing_rate_ca = src->r_ctl.rc_fixed_pacing_rate_ca; 23697 cnt++; 23698 } 23699 if (dest->r_ctl.rc_fixed_pacing_rate_ss != src->r_ctl.rc_fixed_pacing_rate_ss) { 23700 dest->r_ctl.rc_fixed_pacing_rate_ss = src->r_ctl.rc_fixed_pacing_rate_ss; 23701 cnt++; 23702 } 23703 if (dest->r_ctl.rc_fixed_pacing_rate_rec != src->r_ctl.rc_fixed_pacing_rate_rec) { 23704 dest->r_ctl.rc_fixed_pacing_rate_rec = src->r_ctl.rc_fixed_pacing_rate_rec; 23705 cnt++; 23706 } 23707 /* TCP_RACK_GP_INCREASE_REC, TCP_RACK_GP_INCREASE_CA, TCP_RACK_GP_INCREASE_SS */ 23708 if (dest->r_ctl.rack_per_of_gp_rec != src->r_ctl.rack_per_of_gp_rec) { 23709 dest->r_ctl.rack_per_of_gp_rec = src->r_ctl.rack_per_of_gp_rec; 23710 cnt++; 23711 } 23712 if (dest->r_ctl.rack_per_of_gp_ca != src->r_ctl.rack_per_of_gp_ca) { 23713 dest->r_ctl.rack_per_of_gp_ca = src->r_ctl.rack_per_of_gp_ca; 23714 cnt++; 23715 } 23716 23717 if (dest->r_ctl.rack_per_of_gp_ss != src->r_ctl.rack_per_of_gp_ss) { 23718 dest->r_ctl.rack_per_of_gp_ss = src->r_ctl.rack_per_of_gp_ss; 23719 cnt++; 23720 } 23721 /* TCP_RACK_RR_CONF */ 23722 if (dest->r_rr_config != src->r_rr_config) { 23723 dest->r_rr_config = src->r_rr_config; 23724 cnt++; 23725 } 23726 /* TCP_PACING_DND */ 23727 if (dest->rc_pace_dnd != src->rc_pace_dnd) { 23728 dest->rc_pace_dnd = src->rc_pace_dnd; 23729 cnt++; 23730 } 23731 /* TCP_HDWR_RATE_CAP */ 23732 if (dest->r_rack_hw_rate_caps != src->r_rack_hw_rate_caps) { 23733 dest->r_rack_hw_rate_caps = src->r_rack_hw_rate_caps; 23734 cnt++; 23735 } 23736 /* TCP_DGP_UPPER_BOUNDS */ 23737 if (dest->r_ctl.rack_per_upper_bound_ca != src->r_ctl.rack_per_upper_bound_ca) { 23738 dest->r_ctl.rack_per_upper_bound_ca = src->r_ctl.rack_per_upper_bound_ca; 23739 cnt++; 23740 } 23741 if (dest->r_ctl.rack_per_upper_bound_ss != src->r_ctl.rack_per_upper_bound_ss) { 23742 dest->r_ctl.rack_per_upper_bound_ss = src->r_ctl.rack_per_upper_bound_ss; 23743 cnt++; 23744 } 23745 /* TCP_SS_EEXIT */ 23746 if (dest->r_ctl.gp_rnd_thresh != src->r_ctl.gp_rnd_thresh) { 23747 dest->r_ctl.gp_rnd_thresh = src->r_ctl.gp_rnd_thresh; 23748 cnt++; 23749 } 23750 if (dest->r_ctl.gate_to_fs != src->r_ctl.gate_to_fs) { 23751 dest->r_ctl.gate_to_fs = src->r_ctl.gate_to_fs; 23752 cnt++; 23753 } 23754 if (dest->r_ctl.use_gp_not_last != src->r_ctl.use_gp_not_last) { 23755 dest->r_ctl.use_gp_not_last = src->r_ctl.use_gp_not_last; 23756 cnt++; 23757 } 23758 if (dest->r_ctl.gp_gain_req != src->r_ctl.gp_gain_req) { 23759 dest->r_ctl.gp_gain_req = src->r_ctl.gp_gain_req; 23760 cnt++; 23761 } 23762 /* TCP_BBR_HDWR_PACE */ 23763 if (dest->rack_hdw_pace_ena != src->rack_hdw_pace_ena) { 23764 dest->rack_hdw_pace_ena = src->rack_hdw_pace_ena; 23765 cnt++; 23766 } 23767 if (dest->rack_attempt_hdwr_pace != src->rack_attempt_hdwr_pace) { 23768 dest->rack_attempt_hdwr_pace = src->rack_attempt_hdwr_pace; 23769 cnt++; 23770 } 23771 /* TCP_RACK_PRR_SENDALOT */ 23772 if (dest->r_ctl.rc_prr_sendalot != src->r_ctl.rc_prr_sendalot) { 23773 dest->r_ctl.rc_prr_sendalot = src->r_ctl.rc_prr_sendalot; 23774 cnt++; 23775 } 23776 /* TCP_RACK_MIN_TO */ 23777 if (dest->r_ctl.rc_min_to != src->r_ctl.rc_min_to) { 23778 dest->r_ctl.rc_min_to = src->r_ctl.rc_min_to; 23779 cnt++; 23780 } 23781 /* TCP_RACK_EARLY_SEG */ 23782 if (dest->r_ctl.rc_early_recovery_segs != src->r_ctl.rc_early_recovery_segs) { 23783 dest->r_ctl.rc_early_recovery_segs = src->r_ctl.rc_early_recovery_segs; 23784 cnt++; 23785 } 23786 /* TCP_RACK_ENABLE_HYSTART */ 23787 if (par->t_ccv.flags != tp->t_ccv.flags) { 23788 cnt++; 23789 if (par->t_ccv.flags & CCF_HYSTART_ALLOWED) { 23790 tp->t_ccv.flags |= CCF_HYSTART_ALLOWED; 23791 if (rack_do_hystart > RACK_HYSTART_ON) 23792 tp->t_ccv.flags |= CCF_HYSTART_CAN_SH_CWND; 23793 if (rack_do_hystart > RACK_HYSTART_ON_W_SC) 23794 tp->t_ccv.flags |= CCF_HYSTART_CONS_SSTH; 23795 } else { 23796 tp->t_ccv.flags &= ~(CCF_HYSTART_ALLOWED|CCF_HYSTART_CAN_SH_CWND|CCF_HYSTART_CONS_SSTH); 23797 } 23798 } 23799 /* TCP_RACK_REORD_THRESH */ 23800 if (dest->r_ctl.rc_reorder_shift != src->r_ctl.rc_reorder_shift) { 23801 dest->r_ctl.rc_reorder_shift = src->r_ctl.rc_reorder_shift; 23802 cnt++; 23803 } 23804 /* TCP_RACK_REORD_FADE */ 23805 if (dest->r_ctl.rc_reorder_fade != src->r_ctl.rc_reorder_fade) { 23806 dest->r_ctl.rc_reorder_fade = src->r_ctl.rc_reorder_fade; 23807 cnt++; 23808 } 23809 /* TCP_RACK_TLP_THRESH */ 23810 if (dest->r_ctl.rc_tlp_threshold != src->r_ctl.rc_tlp_threshold) { 23811 dest->r_ctl.rc_tlp_threshold = src->r_ctl.rc_tlp_threshold; 23812 cnt++; 23813 } 23814 /* TCP_BBR_USE_RACK_RR */ 23815 if (dest->use_rack_rr != src->use_rack_rr) { 23816 dest->use_rack_rr = src->use_rack_rr; 23817 cnt++; 23818 } 23819 /* TCP_RACK_PKT_DELAY */ 23820 if (dest->r_ctl.rc_pkt_delay != src->r_ctl.rc_pkt_delay) { 23821 dest->r_ctl.rc_pkt_delay = src->r_ctl.rc_pkt_delay; 23822 cnt++; 23823 } 23824 /* TCP_DELACK will get copied via the main code if applicable */ 23825 /* TCP_BBR_RACK_RTT_USE */ 23826 if (dest->r_ctl.rc_rate_sample_method != src->r_ctl.rc_rate_sample_method) { 23827 dest->r_ctl.rc_rate_sample_method = src->r_ctl.rc_rate_sample_method; 23828 cnt++; 23829 } 23830 /* TCP_HONOR_HPTS_MIN */ 23831 if (dest->r_use_hpts_min != src->r_use_hpts_min) { 23832 dest->r_use_hpts_min = src->r_use_hpts_min; 23833 cnt++; 23834 } 23835 if (dest->r_ctl.max_reduction != src->r_ctl.max_reduction) { 23836 dest->r_ctl.max_reduction = src->r_ctl.max_reduction; 23837 cnt++; 23838 } 23839 /* TCP_REC_IS_DYN */ 23840 if (dest->rc_gp_no_rec_chg != src->rc_gp_no_rec_chg) { 23841 dest->rc_gp_no_rec_chg = src->rc_gp_no_rec_chg; 23842 cnt++; 23843 } 23844 if (dest->rc_skip_timely != src->rc_skip_timely) { 23845 dest->rc_skip_timely = src->rc_skip_timely; 23846 cnt++; 23847 } 23848 /* TCP_DATA_AFTER_CLOSE */ 23849 if (dest->rc_allow_data_af_clo != src->rc_allow_data_af_clo) { 23850 dest->rc_allow_data_af_clo = src->rc_allow_data_af_clo; 23851 cnt++; 23852 } 23853 /* TCP_GP_USE_LTBW */ 23854 if (src->use_lesser_lt_bw != dest->use_lesser_lt_bw) { 23855 dest->use_lesser_lt_bw = src->use_lesser_lt_bw; 23856 cnt++; 23857 } 23858 if (dest->dis_lt_bw != src->dis_lt_bw) { 23859 dest->dis_lt_bw = src->dis_lt_bw; 23860 cnt++; 23861 } 23862 tcp_log_socket_option(tp, 0, cnt, 0); 23863 } 23864 23865 23866 static void 23867 rack_apply_deferred_options(struct tcp_rack *rack) 23868 { 23869 struct deferred_opt_list *dol, *sdol; 23870 uint32_t s_optval; 23871 23872 TAILQ_FOREACH_SAFE(dol, &rack->r_ctl.opt_list, next, sdol) { 23873 TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next); 23874 /* Disadvantage of deferal is you loose the error return */ 23875 s_optval = (uint32_t)dol->optval; 23876 (void)rack_process_option(rack->rc_tp, rack, dol->optname, s_optval, dol->optval, NULL); 23877 free(dol, M_TCPDO); 23878 } 23879 } 23880 23881 static void 23882 rack_hw_tls_change(struct tcpcb *tp, int chg) 23883 { 23884 /* Update HW tls state */ 23885 struct tcp_rack *rack; 23886 23887 rack = (struct tcp_rack *)tp->t_fb_ptr; 23888 if (chg) 23889 rack->r_ctl.fsb.hw_tls = 1; 23890 else 23891 rack->r_ctl.fsb.hw_tls = 0; 23892 } 23893 23894 static int 23895 rack_pru_options(struct tcpcb *tp, int flags) 23896 { 23897 if (flags & PRUS_OOB) 23898 return (EOPNOTSUPP); 23899 return (0); 23900 } 23901 23902 static bool 23903 rack_wake_check(struct tcpcb *tp) 23904 { 23905 struct tcp_rack *rack; 23906 struct timeval tv; 23907 uint32_t cts; 23908 23909 rack = (struct tcp_rack *)tp->t_fb_ptr; 23910 if (rack->r_ctl.rc_hpts_flags) { 23911 cts = tcp_get_usecs(&tv); 23912 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == PACE_PKT_OUTPUT){ 23913 /* 23914 * Pacing timer is up, check if we are ready. 23915 */ 23916 if (TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) 23917 return (true); 23918 } else if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) != 0) { 23919 /* 23920 * A timer is up, check if we are ready. 23921 */ 23922 if (TSTMP_GEQ(cts, rack->r_ctl.rc_timer_exp)) 23923 return (true); 23924 } 23925 } 23926 return (false); 23927 } 23928 23929 static struct tcp_function_block __tcp_rack = { 23930 .tfb_tcp_block_name = __XSTRING(STACKNAME), 23931 .tfb_tcp_output = rack_output, 23932 .tfb_do_queued_segments = ctf_do_queued_segments, 23933 .tfb_do_segment_nounlock = rack_do_segment_nounlock, 23934 .tfb_tcp_do_segment = rack_do_segment, 23935 .tfb_tcp_ctloutput = rack_ctloutput, 23936 .tfb_tcp_fb_init = rack_init, 23937 .tfb_tcp_fb_fini = rack_fini, 23938 .tfb_tcp_timer_stop_all = rack_stopall, 23939 .tfb_tcp_rexmit_tmr = rack_remxt_tmr, 23940 .tfb_tcp_handoff_ok = rack_handoff_ok, 23941 .tfb_tcp_mtu_chg = rack_mtu_change, 23942 .tfb_pru_options = rack_pru_options, 23943 .tfb_hwtls_change = rack_hw_tls_change, 23944 .tfb_chg_query = rack_chg_query, 23945 .tfb_switch_failed = rack_switch_failed, 23946 .tfb_early_wake_check = rack_wake_check, 23947 .tfb_compute_pipe = rack_compute_pipe, 23948 .tfb_stack_info = rack_stack_information, 23949 .tfb_inherit = rack_inherit, 23950 .tfb_flags = TCP_FUNC_OUTPUT_CANDROP | TCP_FUNC_DEFAULT_OK, 23951 23952 }; 23953 23954 /* 23955 * rack_ctloutput() must drop the inpcb lock before performing copyin on 23956 * socket option arguments. When it re-acquires the lock after the copy, it 23957 * has to revalidate that the connection is still valid for the socket 23958 * option. 23959 */ 23960 static int 23961 rack_set_sockopt(struct tcpcb *tp, struct sockopt *sopt) 23962 { 23963 struct inpcb *inp = tptoinpcb(tp); 23964 #ifdef INET 23965 struct ip *ip; 23966 #endif 23967 struct tcp_rack *rack; 23968 struct tcp_hybrid_req hybrid; 23969 uint64_t loptval; 23970 int32_t error = 0, optval; 23971 23972 rack = (struct tcp_rack *)tp->t_fb_ptr; 23973 if (rack == NULL) { 23974 INP_WUNLOCK(inp); 23975 return (EINVAL); 23976 } 23977 #ifdef INET 23978 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 23979 #endif 23980 23981 switch (sopt->sopt_level) { 23982 #ifdef INET6 23983 case IPPROTO_IPV6: 23984 MPASS(inp->inp_vflag & INP_IPV6PROTO); 23985 switch (sopt->sopt_name) { 23986 case IPV6_USE_MIN_MTU: 23987 tcp6_use_min_mtu(tp); 23988 break; 23989 } 23990 INP_WUNLOCK(inp); 23991 return (0); 23992 #endif 23993 #ifdef INET 23994 case IPPROTO_IP: 23995 switch (sopt->sopt_name) { 23996 case IP_TOS: 23997 /* 23998 * The DSCP codepoint has changed, update the fsb. 23999 */ 24000 ip->ip_tos = rack->rc_inp->inp_ip_tos; 24001 break; 24002 case IP_TTL: 24003 /* 24004 * The TTL has changed, update the fsb. 24005 */ 24006 ip->ip_ttl = rack->rc_inp->inp_ip_ttl; 24007 break; 24008 } 24009 INP_WUNLOCK(inp); 24010 return (0); 24011 #endif 24012 #ifdef SO_PEERPRIO 24013 case SOL_SOCKET: 24014 switch (sopt->sopt_name) { 24015 case SO_PEERPRIO: /* SC-URL:bs */ 24016 /* Already read in and sanity checked in sosetopt(). */ 24017 if (inp->inp_socket) { 24018 rack->client_bufferlvl = inp->inp_socket->so_peerprio; 24019 } 24020 break; 24021 } 24022 INP_WUNLOCK(inp); 24023 return (0); 24024 #endif 24025 case IPPROTO_TCP: 24026 switch (sopt->sopt_name) { 24027 case TCP_RACK_TLP_REDUCE: /* URL:tlp_reduce */ 24028 /* Pacing related ones */ 24029 case TCP_RACK_PACE_ALWAYS: /* URL:pace_always */ 24030 case TCP_BBR_RACK_INIT_RATE: /* URL:irate */ 24031 case TCP_RACK_PACE_MIN_SEG: /* URL:pace_min_seg */ 24032 case TCP_RACK_PACE_MAX_SEG: /* URL:pace_max_seg */ 24033 case TCP_RACK_FORCE_MSEG: /* URL:force_max_seg */ 24034 case TCP_RACK_PACE_RATE_CA: /* URL:pr_ca */ 24035 case TCP_RACK_PACE_RATE_SS: /* URL:pr_ss*/ 24036 case TCP_RACK_PACE_RATE_REC: /* URL:pr_rec */ 24037 case TCP_RACK_GP_INCREASE_CA: /* URL:gp_inc_ca */ 24038 case TCP_RACK_GP_INCREASE_SS: /* URL:gp_inc_ss */ 24039 case TCP_RACK_GP_INCREASE_REC: /* URL:gp_inc_rec */ 24040 case TCP_RACK_RR_CONF: /* URL:rrr_conf */ 24041 case TCP_BBR_HDWR_PACE: /* URL:hdwrpace */ 24042 case TCP_HDWR_RATE_CAP: /* URL:hdwrcap boolean */ 24043 case TCP_PACING_RATE_CAP: /* URL:cap -- used by side-channel */ 24044 case TCP_HDWR_UP_ONLY: /* URL:uponly -- hardware pacing boolean */ 24045 case TCP_FILLCW_RATE_CAP: /* URL:fillcw_cap */ 24046 case TCP_RACK_PACING_BETA_ECN: /* URL:pacing_beta_ecn */ 24047 case TCP_RACK_PACE_TO_FILL: /* URL:fillcw */ 24048 /* End pacing related */ 24049 case TCP_DELACK: /* URL:delack (in base TCP i.e. tcp_hints along with cc etc ) */ 24050 case TCP_RACK_PRR_SENDALOT: /* URL:prr_sendalot */ 24051 case TCP_RACK_MIN_TO: /* URL:min_to */ 24052 case TCP_RACK_EARLY_SEG: /* URL:early_seg */ 24053 case TCP_RACK_REORD_THRESH: /* URL:reord_thresh */ 24054 case TCP_RACK_REORD_FADE: /* URL:reord_fade */ 24055 case TCP_RACK_TLP_THRESH: /* URL:tlp_thresh */ 24056 case TCP_RACK_PKT_DELAY: /* URL:pkt_delay */ 24057 case TCP_RACK_TLP_USE: /* URL:tlp_use */ 24058 case TCP_BBR_RACK_RTT_USE: /* URL:rttuse */ 24059 case TCP_BBR_USE_RACK_RR: /* URL:rackrr */ 24060 case TCP_NO_PRR: /* URL:noprr */ 24061 case TCP_TIMELY_DYN_ADJ: /* URL:dynamic */ 24062 case TCP_DATA_AFTER_CLOSE: /* no URL */ 24063 case TCP_RACK_NONRXT_CFG_RATE: /* URL:nonrxtcr */ 24064 case TCP_SHARED_CWND_ENABLE: /* URL:scwnd */ 24065 case TCP_RACK_MBUF_QUEUE: /* URL:mqueue */ 24066 case TCP_RACK_NO_PUSH_AT_MAX: /* URL:npush */ 24067 case TCP_SHARED_CWND_TIME_LIMIT: /* URL:lscwnd */ 24068 case TCP_RACK_PROFILE: /* URL:profile */ 24069 case TCP_SIDECHAN_DIS: /* URL:scodm */ 24070 case TCP_HYBRID_PACING: /* URL:pacing=hybrid */ 24071 case TCP_USE_CMP_ACKS: /* URL:cmpack */ 24072 case TCP_RACK_ABC_VAL: /* URL:labc */ 24073 case TCP_REC_ABC_VAL: /* URL:reclabc */ 24074 case TCP_RACK_MEASURE_CNT: /* URL:measurecnt */ 24075 case TCP_DEFER_OPTIONS: /* URL:defer */ 24076 case TCP_RACK_DSACK_OPT: /* URL:dsack */ 24077 case TCP_RACK_TIMER_SLOP: /* URL:timer_slop */ 24078 case TCP_RACK_ENABLE_HYSTART: /* URL:hystart */ 24079 case TCP_RACK_SET_RXT_OPTIONS: /* URL:rxtsz */ 24080 case TCP_RACK_HI_BETA: /* URL:hibeta */ 24081 case TCP_RACK_SPLIT_LIMIT: /* URL:split */ 24082 case TCP_SS_EEXIT: /* URL:eexit */ 24083 case TCP_DGP_UPPER_BOUNDS: /* URL:upper */ 24084 case TCP_RACK_PACING_DIVISOR: /* URL:divisor */ 24085 case TCP_PACING_DND: /* URL:dnd */ 24086 case TCP_NO_TIMELY: /* URL:notimely */ 24087 case RACK_CSPR_IS_FCC: /* URL:csprisfcc */ 24088 case TCP_HONOR_HPTS_MIN: /* URL:hptsmin */ 24089 case TCP_REC_IS_DYN: /* URL:dynrec */ 24090 case TCP_GP_USE_LTBW: /* URL:useltbw */ 24091 goto process_opt; 24092 break; 24093 default: 24094 /* Filter off all unknown options to the base stack */ 24095 return (tcp_default_ctloutput(tp, sopt)); 24096 break; 24097 } 24098 default: 24099 INP_WUNLOCK(inp); 24100 return (0); 24101 } 24102 process_opt: 24103 INP_WUNLOCK(inp); 24104 if ((sopt->sopt_name == TCP_PACING_RATE_CAP) || 24105 (sopt->sopt_name == TCP_FILLCW_RATE_CAP)) { 24106 error = sooptcopyin(sopt, &loptval, sizeof(loptval), sizeof(loptval)); 24107 /* 24108 * We truncate it down to 32 bits for the socket-option trace this 24109 * means rates > 34Gbps won't show right, but thats probably ok. 24110 */ 24111 optval = (uint32_t)loptval; 24112 } else if (sopt->sopt_name == TCP_HYBRID_PACING) { 24113 error = sooptcopyin(sopt, &hybrid, sizeof(hybrid), sizeof(hybrid)); 24114 } else { 24115 error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); 24116 /* Save it in 64 bit form too */ 24117 loptval = optval; 24118 } 24119 if (error) 24120 return (error); 24121 INP_WLOCK(inp); 24122 if (tp->t_fb != &__tcp_rack) { 24123 INP_WUNLOCK(inp); 24124 return (ENOPROTOOPT); 24125 } 24126 if (rack->defer_options && (rack->gp_ready == 0) && 24127 (sopt->sopt_name != TCP_DEFER_OPTIONS) && 24128 (sopt->sopt_name != TCP_HYBRID_PACING) && 24129 (sopt->sopt_name != TCP_RACK_SET_RXT_OPTIONS) && 24130 (sopt->sopt_name != TCP_RACK_PACING_BETA_ECN) && 24131 (sopt->sopt_name != TCP_RACK_MEASURE_CNT)) { 24132 /* Options are being deferred */ 24133 if (rack_add_deferred_option(rack, sopt->sopt_name, loptval)) { 24134 INP_WUNLOCK(inp); 24135 return (0); 24136 } else { 24137 /* No memory to defer, fail */ 24138 INP_WUNLOCK(inp); 24139 return (ENOMEM); 24140 } 24141 } 24142 error = rack_process_option(tp, rack, sopt->sopt_name, optval, loptval, &hybrid); 24143 INP_WUNLOCK(inp); 24144 return (error); 24145 } 24146 24147 static void 24148 rack_fill_info(struct tcpcb *tp, struct tcp_info *ti) 24149 { 24150 24151 INP_WLOCK_ASSERT(tptoinpcb(tp)); 24152 bzero(ti, sizeof(*ti)); 24153 24154 ti->tcpi_state = tp->t_state; 24155 if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP)) 24156 ti->tcpi_options |= TCPI_OPT_TIMESTAMPS; 24157 if (tp->t_flags & TF_SACK_PERMIT) 24158 ti->tcpi_options |= TCPI_OPT_SACK; 24159 if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) { 24160 ti->tcpi_options |= TCPI_OPT_WSCALE; 24161 ti->tcpi_snd_wscale = tp->snd_scale; 24162 ti->tcpi_rcv_wscale = tp->rcv_scale; 24163 } 24164 if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) 24165 ti->tcpi_options |= TCPI_OPT_ECN; 24166 if (tp->t_flags & TF_FASTOPEN) 24167 ti->tcpi_options |= TCPI_OPT_TFO; 24168 /* still kept in ticks is t_rcvtime */ 24169 ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick; 24170 /* Since we hold everything in precise useconds this is easy */ 24171 ti->tcpi_rtt = tp->t_srtt; 24172 ti->tcpi_rttvar = tp->t_rttvar; 24173 ti->tcpi_rto = tp->t_rxtcur; 24174 ti->tcpi_snd_ssthresh = tp->snd_ssthresh; 24175 ti->tcpi_snd_cwnd = tp->snd_cwnd; 24176 /* 24177 * FreeBSD-specific extension fields for tcp_info. 24178 */ 24179 ti->tcpi_rcv_space = tp->rcv_wnd; 24180 ti->tcpi_rcv_nxt = tp->rcv_nxt; 24181 ti->tcpi_snd_wnd = tp->snd_wnd; 24182 ti->tcpi_snd_bwnd = 0; /* Unused, kept for compat. */ 24183 ti->tcpi_snd_nxt = tp->snd_nxt; 24184 ti->tcpi_snd_mss = tp->t_maxseg; 24185 ti->tcpi_rcv_mss = tp->t_maxseg; 24186 ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack; 24187 ti->tcpi_rcv_ooopack = tp->t_rcvoopack; 24188 ti->tcpi_snd_zerowin = tp->t_sndzerowin; 24189 ti->tcpi_total_tlp = tp->t_sndtlppack; 24190 ti->tcpi_total_tlp_bytes = tp->t_sndtlpbyte; 24191 ti->tcpi_rttmin = tp->t_rttlow; 24192 #ifdef NETFLIX_STATS 24193 memcpy(&ti->tcpi_rxsyninfo, &tp->t_rxsyninfo, sizeof(struct tcpsyninfo)); 24194 #endif 24195 #ifdef TCP_OFFLOAD 24196 if (tp->t_flags & TF_TOE) { 24197 ti->tcpi_options |= TCPI_OPT_TOE; 24198 tcp_offload_tcp_info(tp, ti); 24199 } 24200 #endif 24201 } 24202 24203 static int 24204 rack_get_sockopt(struct tcpcb *tp, struct sockopt *sopt) 24205 { 24206 struct inpcb *inp = tptoinpcb(tp); 24207 struct tcp_rack *rack; 24208 int32_t error, optval; 24209 uint64_t val, loptval; 24210 struct tcp_info ti; 24211 /* 24212 * Because all our options are either boolean or an int, we can just 24213 * pull everything into optval and then unlock and copy. If we ever 24214 * add a option that is not a int, then this will have quite an 24215 * impact to this routine. 24216 */ 24217 error = 0; 24218 rack = (struct tcp_rack *)tp->t_fb_ptr; 24219 if (rack == NULL) { 24220 INP_WUNLOCK(inp); 24221 return (EINVAL); 24222 } 24223 switch (sopt->sopt_name) { 24224 case TCP_INFO: 24225 /* First get the info filled */ 24226 rack_fill_info(tp, &ti); 24227 /* Fix up the rtt related fields if needed */ 24228 INP_WUNLOCK(inp); 24229 error = sooptcopyout(sopt, &ti, sizeof ti); 24230 return (error); 24231 /* 24232 * Beta is the congestion control value for NewReno that influences how 24233 * much of a backoff happens when loss is detected. It is normally set 24234 * to 50 for 50% i.e. the cwnd is reduced to 50% of its previous value 24235 * when you exit recovery. 24236 */ 24237 case TCP_RACK_PACING_BETA: 24238 if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) 24239 error = EINVAL; 24240 else if (rack->rc_pacing_cc_set == 0) 24241 optval = rack->r_ctl.rc_saved_beta; 24242 else { 24243 /* 24244 * Reach out into the CC data and report back what 24245 * I have previously set. Yeah it looks hackish but 24246 * we don't want to report the saved values. 24247 */ 24248 if (tp->t_ccv.cc_data) 24249 optval = ((struct newreno *)tp->t_ccv.cc_data)->beta; 24250 else 24251 error = EINVAL; 24252 } 24253 break; 24254 /* 24255 * Beta_ecn is the congestion control value for NewReno that influences how 24256 * much of a backoff happens when a ECN mark is detected. It is normally set 24257 * to 80 for 80% i.e. the cwnd is reduced by 20% of its previous value when 24258 * you exit recovery. Note that classic ECN has a beta of 50, it is only 24259 * ABE Ecn that uses this "less" value, but we do too with pacing :) 24260 */ 24261 case TCP_RACK_PACING_BETA_ECN: 24262 if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) 24263 error = EINVAL; 24264 else if (rack->rc_pacing_cc_set == 0) 24265 optval = rack->r_ctl.rc_saved_beta_ecn; 24266 else { 24267 /* 24268 * Reach out into the CC data and report back what 24269 * I have previously set. Yeah it looks hackish but 24270 * we don't want to report the saved values. 24271 */ 24272 if (tp->t_ccv.cc_data) 24273 optval = ((struct newreno *)tp->t_ccv.cc_data)->beta_ecn; 24274 else 24275 error = EINVAL; 24276 } 24277 break; 24278 case TCP_RACK_DSACK_OPT: 24279 optval = 0; 24280 if (rack->rc_rack_tmr_std_based) { 24281 optval |= 1; 24282 } 24283 if (rack->rc_rack_use_dsack) { 24284 optval |= 2; 24285 } 24286 break; 24287 case TCP_RACK_ENABLE_HYSTART: 24288 { 24289 if (tp->t_ccv.flags & CCF_HYSTART_ALLOWED) { 24290 optval = RACK_HYSTART_ON; 24291 if (tp->t_ccv.flags & CCF_HYSTART_CAN_SH_CWND) 24292 optval = RACK_HYSTART_ON_W_SC; 24293 if (tp->t_ccv.flags & CCF_HYSTART_CONS_SSTH) 24294 optval = RACK_HYSTART_ON_W_SC_C; 24295 } else { 24296 optval = RACK_HYSTART_OFF; 24297 } 24298 } 24299 break; 24300 case TCP_RACK_DGP_IN_REC: 24301 error = EINVAL; 24302 break; 24303 case TCP_RACK_HI_BETA: 24304 optval = rack->rack_hibeta; 24305 break; 24306 case TCP_DEFER_OPTIONS: 24307 optval = rack->defer_options; 24308 break; 24309 case TCP_RACK_MEASURE_CNT: 24310 optval = rack->r_ctl.req_measurements; 24311 break; 24312 case TCP_REC_ABC_VAL: 24313 optval = rack->r_use_labc_for_rec; 24314 break; 24315 case TCP_RACK_ABC_VAL: 24316 optval = rack->rc_labc; 24317 break; 24318 case TCP_HDWR_UP_ONLY: 24319 optval= rack->r_up_only; 24320 break; 24321 case TCP_FILLCW_RATE_CAP: 24322 loptval = rack->r_ctl.fillcw_cap; 24323 break; 24324 case TCP_PACING_RATE_CAP: 24325 loptval = rack->r_ctl.bw_rate_cap; 24326 break; 24327 case TCP_RACK_PROFILE: 24328 /* You cannot retrieve a profile, its write only */ 24329 error = EINVAL; 24330 break; 24331 case TCP_SIDECHAN_DIS: 24332 optval = rack->r_ctl.side_chan_dis_mask; 24333 break; 24334 case TCP_HYBRID_PACING: 24335 /* You cannot retrieve hybrid pacing information, its write only */ 24336 error = EINVAL; 24337 break; 24338 case TCP_USE_CMP_ACKS: 24339 optval = rack->r_use_cmp_ack; 24340 break; 24341 case TCP_RACK_PACE_TO_FILL: 24342 optval = rack->rc_pace_to_cwnd; 24343 break; 24344 case TCP_RACK_NO_PUSH_AT_MAX: 24345 optval = rack->r_ctl.rc_no_push_at_mrtt; 24346 break; 24347 case TCP_SHARED_CWND_ENABLE: 24348 optval = rack->rack_enable_scwnd; 24349 break; 24350 case TCP_RACK_NONRXT_CFG_RATE: 24351 optval = rack->rack_rec_nonrxt_use_cr; 24352 break; 24353 case TCP_NO_PRR: 24354 if (rack->rack_no_prr == 1) 24355 optval = 1; 24356 else if (rack->no_prr_addback == 1) 24357 optval = 2; 24358 else 24359 optval = 0; 24360 break; 24361 case TCP_GP_USE_LTBW: 24362 if (rack->dis_lt_bw) { 24363 /* It is not used */ 24364 optval = 0; 24365 } else if (rack->use_lesser_lt_bw) { 24366 /* we use min() */ 24367 optval = 1; 24368 } else { 24369 /* we use max() */ 24370 optval = 2; 24371 } 24372 break; 24373 case TCP_RACK_DO_DETECTION: 24374 error = EINVAL; 24375 break; 24376 case TCP_RACK_MBUF_QUEUE: 24377 /* Now do we use the LRO mbuf-queue feature */ 24378 optval = rack->r_mbuf_queue; 24379 break; 24380 case RACK_CSPR_IS_FCC: 24381 optval = rack->cspr_is_fcc; 24382 break; 24383 case TCP_TIMELY_DYN_ADJ: 24384 optval = rack->rc_gp_dyn_mul; 24385 break; 24386 case TCP_BBR_IWINTSO: 24387 error = EINVAL; 24388 break; 24389 case TCP_RACK_TLP_REDUCE: 24390 /* RACK TLP cwnd reduction (bool) */ 24391 optval = rack->r_ctl.rc_tlp_cwnd_reduce; 24392 break; 24393 case TCP_BBR_RACK_INIT_RATE: 24394 val = rack->r_ctl.init_rate; 24395 /* convert to kbits per sec */ 24396 val *= 8; 24397 val /= 1000; 24398 optval = (uint32_t)val; 24399 break; 24400 case TCP_RACK_FORCE_MSEG: 24401 optval = rack->rc_force_max_seg; 24402 break; 24403 case TCP_RACK_PACE_MIN_SEG: 24404 optval = rack->r_ctl.rc_user_set_min_segs; 24405 break; 24406 case TCP_RACK_PACE_MAX_SEG: 24407 /* Max segments in a pace */ 24408 optval = rack->rc_user_set_max_segs; 24409 break; 24410 case TCP_RACK_PACE_ALWAYS: 24411 /* Use the always pace method */ 24412 optval = rack->rc_always_pace; 24413 break; 24414 case TCP_RACK_PRR_SENDALOT: 24415 /* Allow PRR to send more than one seg */ 24416 optval = rack->r_ctl.rc_prr_sendalot; 24417 break; 24418 case TCP_RACK_MIN_TO: 24419 /* Minimum time between rack t-o's in ms */ 24420 optval = rack->r_ctl.rc_min_to; 24421 break; 24422 case TCP_RACK_SPLIT_LIMIT: 24423 optval = rack->r_ctl.rc_split_limit; 24424 break; 24425 case TCP_RACK_EARLY_SEG: 24426 /* If early recovery max segments */ 24427 optval = rack->r_ctl.rc_early_recovery_segs; 24428 break; 24429 case TCP_RACK_REORD_THRESH: 24430 /* RACK reorder threshold (shift amount) */ 24431 optval = rack->r_ctl.rc_reorder_shift; 24432 break; 24433 case TCP_SS_EEXIT: 24434 if (rack->r_ctl.gp_rnd_thresh) { 24435 uint32_t v; 24436 24437 v = rack->r_ctl.gp_gain_req; 24438 v <<= 17; 24439 optval = v | (rack->r_ctl.gp_rnd_thresh & 0xff); 24440 if (rack->r_ctl.gate_to_fs == 1) 24441 optval |= 0x10000; 24442 } else 24443 optval = 0; 24444 break; 24445 case TCP_RACK_REORD_FADE: 24446 /* Does reordering fade after ms time */ 24447 optval = rack->r_ctl.rc_reorder_fade; 24448 break; 24449 case TCP_BBR_USE_RACK_RR: 24450 /* Do we use the rack cheat for rxt */ 24451 optval = rack->use_rack_rr; 24452 break; 24453 case TCP_RACK_RR_CONF: 24454 optval = rack->r_rr_config; 24455 break; 24456 case TCP_HDWR_RATE_CAP: 24457 optval = rack->r_rack_hw_rate_caps; 24458 break; 24459 case TCP_BBR_HDWR_PACE: 24460 optval = rack->rack_hdw_pace_ena; 24461 break; 24462 case TCP_RACK_TLP_THRESH: 24463 /* RACK TLP theshold i.e. srtt+(srtt/N) */ 24464 optval = rack->r_ctl.rc_tlp_threshold; 24465 break; 24466 case TCP_RACK_PKT_DELAY: 24467 /* RACK added ms i.e. rack-rtt + reord + N */ 24468 optval = rack->r_ctl.rc_pkt_delay; 24469 break; 24470 case TCP_RACK_TLP_USE: 24471 optval = rack->rack_tlp_threshold_use; 24472 break; 24473 case TCP_PACING_DND: 24474 optval = rack->rc_pace_dnd; 24475 break; 24476 case TCP_RACK_PACE_RATE_CA: 24477 optval = rack->r_ctl.rc_fixed_pacing_rate_ca; 24478 break; 24479 case TCP_RACK_PACE_RATE_SS: 24480 optval = rack->r_ctl.rc_fixed_pacing_rate_ss; 24481 break; 24482 case TCP_RACK_PACE_RATE_REC: 24483 optval = rack->r_ctl.rc_fixed_pacing_rate_rec; 24484 break; 24485 case TCP_DGP_UPPER_BOUNDS: 24486 optval = rack->r_ctl.rack_per_upper_bound_ss; 24487 optval <<= 16; 24488 optval |= rack->r_ctl.rack_per_upper_bound_ca; 24489 break; 24490 case TCP_RACK_GP_INCREASE_SS: 24491 optval = rack->r_ctl.rack_per_of_gp_ca; 24492 break; 24493 case TCP_RACK_GP_INCREASE_CA: 24494 optval = rack->r_ctl.rack_per_of_gp_ss; 24495 break; 24496 case TCP_RACK_PACING_DIVISOR: 24497 optval = rack->r_ctl.pace_len_divisor; 24498 break; 24499 case TCP_BBR_RACK_RTT_USE: 24500 optval = rack->r_ctl.rc_rate_sample_method; 24501 break; 24502 case TCP_DELACK: 24503 optval = tp->t_delayed_ack; 24504 break; 24505 case TCP_DATA_AFTER_CLOSE: 24506 optval = rack->rc_allow_data_af_clo; 24507 break; 24508 case TCP_SHARED_CWND_TIME_LIMIT: 24509 optval = rack->r_limit_scw; 24510 break; 24511 case TCP_HONOR_HPTS_MIN: 24512 if (rack->r_use_hpts_min) 24513 optval = rack->r_ctl.max_reduction; 24514 else 24515 optval = 0; 24516 break; 24517 case TCP_REC_IS_DYN: 24518 optval = rack->rc_gp_no_rec_chg; 24519 break; 24520 case TCP_NO_TIMELY: 24521 optval = rack->rc_skip_timely; 24522 break; 24523 case TCP_RACK_TIMER_SLOP: 24524 optval = rack->r_ctl.timer_slop; 24525 break; 24526 default: 24527 return (tcp_default_ctloutput(tp, sopt)); 24528 break; 24529 } 24530 INP_WUNLOCK(inp); 24531 if (error == 0) { 24532 if ((sopt->sopt_name == TCP_PACING_RATE_CAP) || 24533 (sopt->sopt_name == TCP_FILLCW_RATE_CAP)) 24534 error = sooptcopyout(sopt, &loptval, sizeof loptval); 24535 else 24536 error = sooptcopyout(sopt, &optval, sizeof optval); 24537 } 24538 return (error); 24539 } 24540 24541 static int 24542 rack_ctloutput(struct tcpcb *tp, struct sockopt *sopt) 24543 { 24544 if (sopt->sopt_dir == SOPT_SET) { 24545 return (rack_set_sockopt(tp, sopt)); 24546 } else if (sopt->sopt_dir == SOPT_GET) { 24547 return (rack_get_sockopt(tp, sopt)); 24548 } else { 24549 panic("%s: sopt_dir $%d", __func__, sopt->sopt_dir); 24550 } 24551 } 24552 24553 static const char *rack_stack_names[] = { 24554 __XSTRING(STACKNAME), 24555 #ifdef STACKALIAS 24556 __XSTRING(STACKALIAS), 24557 #endif 24558 }; 24559 24560 static int 24561 rack_ctor(void *mem, int32_t size, void *arg, int32_t how) 24562 { 24563 memset(mem, 0, size); 24564 return (0); 24565 } 24566 24567 static void 24568 rack_dtor(void *mem, int32_t size, void *arg) 24569 { 24570 24571 } 24572 24573 static bool rack_mod_inited = false; 24574 24575 static int 24576 tcp_addrack(module_t mod, int32_t type, void *data) 24577 { 24578 int32_t err = 0; 24579 int num_stacks; 24580 24581 switch (type) { 24582 case MOD_LOAD: 24583 rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map", 24584 sizeof(struct rack_sendmap), 24585 rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); 24586 24587 rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb", 24588 sizeof(struct tcp_rack), 24589 rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); 24590 24591 sysctl_ctx_init(&rack_sysctl_ctx); 24592 rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 24593 SYSCTL_STATIC_CHILDREN(_net_inet_tcp), 24594 OID_AUTO, 24595 #ifdef STACKALIAS 24596 __XSTRING(STACKALIAS), 24597 #else 24598 __XSTRING(STACKNAME), 24599 #endif 24600 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 24601 ""); 24602 if (rack_sysctl_root == NULL) { 24603 printf("Failed to add sysctl node\n"); 24604 err = EFAULT; 24605 goto free_uma; 24606 } 24607 rack_init_sysctls(); 24608 num_stacks = nitems(rack_stack_names); 24609 err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK, 24610 rack_stack_names, &num_stacks); 24611 if (err) { 24612 printf("Failed to register %s stack name for " 24613 "%s module\n", rack_stack_names[num_stacks], 24614 __XSTRING(MODNAME)); 24615 sysctl_ctx_free(&rack_sysctl_ctx); 24616 free_uma: 24617 uma_zdestroy(rack_zone); 24618 uma_zdestroy(rack_pcb_zone); 24619 rack_counter_destroy(); 24620 printf("Failed to register rack module -- err:%d\n", err); 24621 return (err); 24622 } 24623 tcp_lro_reg_mbufq(); 24624 rack_mod_inited = true; 24625 break; 24626 case MOD_QUIESCE: 24627 err = deregister_tcp_functions(&__tcp_rack, true, false); 24628 break; 24629 case MOD_UNLOAD: 24630 err = deregister_tcp_functions(&__tcp_rack, false, true); 24631 if (err == EBUSY) 24632 break; 24633 if (rack_mod_inited) { 24634 uma_zdestroy(rack_zone); 24635 uma_zdestroy(rack_pcb_zone); 24636 sysctl_ctx_free(&rack_sysctl_ctx); 24637 rack_counter_destroy(); 24638 rack_mod_inited = false; 24639 } 24640 tcp_lro_dereg_mbufq(); 24641 err = 0; 24642 break; 24643 default: 24644 return (EOPNOTSUPP); 24645 } 24646 return (err); 24647 } 24648 24649 static moduledata_t tcp_rack = { 24650 .name = __XSTRING(MODNAME), 24651 .evhand = tcp_addrack, 24652 .priv = 0 24653 }; 24654 24655 MODULE_VERSION(MODNAME, 1); 24656 DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); 24657 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1); 24658 24659 #endif /* #if !defined(INET) && !defined(INET6) */ 24660