/*-
 * Copyright (c) 2016-2020 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_ratelimit.h"
#include "opt_kern_tls.h"
#if defined(INET) || defined(INET6)
#include <sys/param.h>
#include <sys/arb.h>
#include <sys/module.h>
#include <sys/kernel.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#ifdef STATS
#include <sys/qmath.h>
#include <sys/tree.h>
#include <sys/stats.h>		/* Must come after qmath.h and tree.h */
#else
#include <sys/tree.h>
#endif
#include <sys/refcount.h>
#include <sys/queue.h>
#include <sys/tim_filter.h>
#include <sys/smp.h>
#include <sys/kthread.h>
#include <sys/kern_prefetch.h>
#include <sys/protosw.h>
#ifdef TCP_ACCOUNTING
#include <sys/sched.h>
#include <machine/cpu.h>
#endif
#include <vm/uma.h>

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/vnet.h>

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#define TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_ratelimit.h>
#include <netinet/tcp_accounting.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/cc/cc_newreno.h>
#include <netinet/tcp_fastopen.h>
#include <netinet/tcp_lro.h>
#ifdef NETFLIX_SHARED_CWND
#include <netinet/tcp_shared_cwnd.h>
#endif
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcp_ecn.h>

#include <netipsec/ipsec_support.h>

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif				/* IPSEC */

#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif
#include "sack_filter.h"
#include "tcp_rack.h"
#include "tailq_hash.h"
#include "rack_bbr_common.h"

uma_zone_t rack_zone;
uma_zone_t rack_pcb_zone;

#ifndef TICKS2SBT
#define TICKS2SBT(__t) (tick_sbt * ((sbintime_t)(__t)))
#endif

VNET_DECLARE(uint32_t, newreno_beta);
VNET_DECLARE(uint32_t, newreno_beta_ecn);
#define V_newreno_beta VNET(newreno_beta)
#define V_newreno_beta_ecn VNET(newreno_beta_ecn)

#define M_TCPFSB	__CONCAT(M_TCPFSB, STACKNAME)
#define M_TCPDO		__CONCAT(M_TCPDO, STACKNAME)

MALLOC_DEFINE(M_TCPFSB, "tcp_fsb_" __XSTRING(STACKNAME), "TCP fast send block");
MALLOC_DEFINE(M_TCPDO, "tcp_do_" __XSTRING(STACKNAME), "TCP deferred options");
MALLOC_DEFINE(M_TCPPCM, "tcp_pcm_" __XSTRING(STACKNAME), "TCP PCM measurement information");

struct sysctl_ctx_list rack_sysctl_ctx;
struct sysctl_oid *rack_sysctl_root;

#define CUM_ACKED 1
#define SACKED 2

/*
 * The RACK module incorporates a number of
 * TCP ideas that have been put out into the IETF
 * over the last few years:
 * - Matt Mathis's Rate Halving, which slowly drops
 *   the congestion window so that the ack clock can
 *   be maintained during a recovery.
 * - Yuchung Cheng's RACK TCP (for which this stack is named), which
 *   stops us from using the number of dup acks and instead
 *   uses time as the gauge of when we retransmit.
 * - Reorder Detection of RFC4737 and the Tail-Loss Probe draft
 *   of Dukkipati et al.
 * RACK depends on SACK, so if an endpoint arrives that
 * cannot do SACK the state machine below will shuttle the
 * connection back to using the "default" TCP stack that is
 * in FreeBSD.
 *
 * To implement RACK the original TCP stack was first decomposed
 * into a functional state machine with individual states
 * for each of the possible TCP connection states. The do_segment
 * function's role in life is to mandate that the connection supports SACK
 * initially and then assure that the RACK state matches the connection
 * state before calling the state's do_segment function. Each
 * state is simplified due to the fact that the original do_segment
 * has been decomposed and we *know* what state we are in (no
 * switches on the state) and all tests for SACK are gone. This
 * greatly simplifies what each state does.
 *
 * TCP output is also overwritten with a new version since it
 * must maintain the new rack scoreboard.
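 *
 * For example, instead of one monolithic handler this file declares a
 * per-state handler for each TCP connection state (rack_do_syn_sent,
 * rack_do_syn_recv, rack_do_established, rack_do_fastnewdata,
 * rack_do_fin_wait_1, rack_do_fin_wait_2, rack_do_closing,
 * rack_do_close_wait and rack_do_lastack below); rack_set_state()
 * keeps the installed handler in step with the connection state so
 * that rack_do_segment() can hand the segment straight to it.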
 *
 */
static int32_t rack_tlp_thresh = 1;
static int32_t rack_tlp_limit = 2;	/* No more than 2 TLPs w-out new data */
static int32_t rack_tlp_use_greater = 1;
static int32_t rack_reorder_thresh = 2;
static int32_t rack_reorder_fade = 60000000;	/* 0 - never fade, def 60,000,000
						 * - 60 seconds */
static uint16_t rack_policer_rxt_thresh = 0;	/* 499 = 49.9%, 0 is off */
static uint8_t rack_policer_avg_thresh = 0;	/* 3.2 */
static uint8_t rack_policer_med_thresh = 0;	/* 1 - 16 */
static uint16_t rack_policer_bucket_reserve = 20; /* How much % is reserved in the bucket */
static uint64_t rack_pol_min_bw = 125000;	/* 1mbps in Bytes per sec */
static uint32_t rack_policer_data_thresh = 64000; /* 64,000 bytes must be sent before we engage */
static uint32_t rack_policing_do_bw_comp = 1;
static uint32_t rack_pcm_every_n_rounds = 100;
static uint32_t rack_pcm_blast = 0;
static uint32_t rack_pcm_is_enabled = 1;
static uint8_t rack_req_del_mss = 18;	/* How many segments need to be sent in a recovery episode to do policer_detection */
static uint8_t rack_ssthresh_rest_rto_rec = 0;	/* Do we restore ssthresh when we have rec -> rto -> rec */

static uint32_t rack_gp_gain_req = 1200;	/* Amount, percent wise, required to gain to record a round as "gaining" */
static uint32_t rack_rnd_cnt_req = 0x10005;	/* Default number of rounds if we are below rack_gp_gain_req where we exit ss */


static int32_t rack_rxt_scoreboard_clear_thresh = 2;
static int32_t rack_dnd_default = 0;	/* For rr_conf = 3, what is the default for dnd */
static int32_t rack_rxt_controls = 0;
static int32_t rack_fill_cw_state = 0;
static uint8_t rack_req_measurements = 1;
/* Attack threshold detections */
static uint32_t rack_highest_sack_thresh_seen = 0;
static uint32_t rack_highest_move_thresh_seen = 0;
static uint32_t rack_merge_out_sacks_on_attack = 0;
static int32_t rack_enable_hw_pacing = 0;	/* Due to CCSP keep it off by default */
static int32_t rack_hw_pace_extra_slots = 0;	/* 2 extra MSS time betweens */
static int32_t rack_hw_rate_caps = 0;		/* 1; */
static int32_t rack_hw_rate_cap_per = 0;	/* 0 -- off */
static int32_t rack_hw_rate_min = 0;		/* 1500000; */
static int32_t rack_hw_rate_to_low = 0;		/* 1200000; */
static int32_t rack_hw_up_only = 0;
static int32_t rack_stats_gets_ms_rtt = 1;
static int32_t rack_prr_addbackmax = 2;
static int32_t rack_do_hystart = 0;
static int32_t rack_apply_rtt_with_reduced_conf = 0;
static int32_t rack_hibeta_setting = 0;
static int32_t rack_default_pacing_divisor = 250;
static uint16_t rack_pacing_min_seg = 0;
static int32_t rack_timely_off = 0;

static uint32_t sad_seg_size_per = 800;	/* 80.0 % */
static int32_t rack_pkt_delay = 1000;
static int32_t rack_send_a_lot_in_prr = 1;
static int32_t rack_min_to = 1000;	/* Number of microsecond min timeout */
static int32_t rack_verbose_logging = 0;
static int32_t rack_ignore_data_after_close = 1;
static int32_t rack_enable_shared_cwnd = 1;
static int32_t rack_use_cmp_acks = 1;
static int32_t rack_use_fsb = 1;
static int32_t rack_use_rfo = 1;
static int32_t rack_use_rsm_rfo = 1;
static int32_t rack_max_abc_post_recovery = 2;
static int32_t rack_client_low_buf = 0;
static int32_t rack_dsack_std_based = 0x3;	/* bit field: bit 1 sets rc_rack_tmr_std_based and bit 2 sets rc_rack_use_dsack */
static int32_t
rack_bw_multipler = 0;	/* Limit on fill cw's jump up to be this x gp_est */
#ifdef TCP_ACCOUNTING
static int32_t rack_tcp_accounting = 0;
#endif
static int32_t rack_limits_scwnd = 1;
static int32_t rack_enable_mqueue_for_nonpaced = 0;
static int32_t rack_hybrid_allow_set_maxseg = 0;
static int32_t rack_disable_prr = 0;
static int32_t use_rack_rr = 1;
static int32_t rack_non_rxt_use_cr = 0;	/* does a non-rxt in recovery use the configured rate (ss/ca)? */
static int32_t rack_persist_min = 250000;	/* 250ms in usecs */
static int32_t rack_persist_max = 2000000;	/* 2 seconds in usecs */
static int32_t rack_honors_hpts_min_to = 1;	/* Do we honor the hpts minimum time out for pacing timers */
static uint32_t rack_max_reduce = 10;		/* Percent we can reduce slot by */
static int32_t rack_sack_not_required = 1;	/* set to one to allow non-sack to use rack */
static int32_t rack_limit_time_with_srtt = 0;
static int32_t rack_autosndbuf_inc = 20;	/* In percentage form */
static int32_t rack_enobuf_hw_boost_mult = 0;	/* How many times the hw rate we boost slot using time_between */
static int32_t rack_enobuf_hw_max = 12000;	/* 12 ms in usecs */
static int32_t rack_enobuf_hw_min = 10000;	/* 10 ms in usecs */
static int32_t rack_hw_rwnd_factor = 2;		/* How many max_segs the rwnd must be before we hold off sending */
static int32_t rack_hw_check_queue = 0;		/* Do we always pre-check queue depth of a hw queue */
static int32_t rack_full_buffer_discount = 10;
/*
 * Currently regular TCP has an rto_min of 30ms and
 * the backoff goes 12 times, so that ends up being a
 * total of about 122.850 seconds (30 ms * (2^12 - 1))
 * before a connection is killed.
 */
static uint32_t rack_def_data_window = 20;
static uint32_t rack_goal_bdp = 2;
static uint32_t rack_min_srtts = 1;
static uint32_t rack_min_measure_usec = 0;
static int32_t rack_tlp_min = 10000;	/* 10ms */
static int32_t rack_rto_min = 30000;	/* 30,000 usec same as main freebsd */
static int32_t rack_rto_max = 4000000;	/* 4 seconds in usecs */
static const int32_t rack_free_cache = 2;
static int32_t rack_hptsi_segments = 40;
static int32_t rack_rate_sample_method = USE_RTT_LOW;
static int32_t rack_pace_every_seg = 0;
static int32_t rack_delayed_ack_time = 40000;	/* 40ms in usecs */
static int32_t rack_slot_reduction = 4;
static int32_t rack_wma_divisor = 8;		/* For WMA calculation */
static int32_t rack_cwnd_block_ends_measure = 0;
static int32_t rack_rwnd_block_ends_measure = 0;
static int32_t rack_def_profile = 0;

static int32_t rack_lower_cwnd_at_tlp = 0;
static int32_t rack_always_send_oldest = 0;
static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;

static uint16_t rack_per_of_gp_ss = 250;	/* 250% slow-start */
static uint16_t rack_per_of_gp_ca = 200;	/* 200% congestion-avoidance */
static uint16_t rack_per_of_gp_rec = 200;	/* 200% of bw */

/* Probertt */
static uint16_t rack_per_of_gp_probertt = 60;	/* 60% of bw */
static uint16_t rack_per_of_gp_lowthresh = 40;	/* 40% is bottom */
static uint16_t rack_per_of_gp_probertt_reduce = 10; /* 10% reduction */
static uint16_t rack_atexit_prtt_hbp = 130;	/* Clamp to 130% on exit prtt if highly buffered path */
static uint16_t rack_atexit_prtt = 130;	/* Clamp to 100% on exit prtt if non highly buffered path */

static uint32_t rack_max_drain_wait = 2;	/* How many gp srtt's before we give up draining */
static uint32_t rack_must_drain = 1;		/* How many GP srtt's we *must* wait */
static uint32_t rack_probertt_use_min_rtt_entry = 1;	/* Use the min to calculate the goal else gp_srtt */
static uint32_t rack_probertt_use_min_rtt_exit = 0;
static uint32_t rack_probe_rtt_sets_cwnd = 0;
static uint32_t rack_probe_rtt_safety_val = 2000000;	/* No more than 2 sec in probe-rtt */
static uint32_t rack_time_between_probertt = 9600000;	/* 9.6 sec in usecs */
static uint32_t rack_probertt_gpsrtt_cnt_mul = 0;	/* How many srtt periods does probe-rtt last, top fraction */
static uint32_t rack_probertt_gpsrtt_cnt_div = 0;	/* How many srtt periods does probe-rtt last, bottom fraction */
static uint32_t rack_min_probertt_hold = 40000;	/* Equal to delayed ack time */
static uint32_t rack_probertt_filter_life = 10000000;
static uint32_t rack_probertt_lower_within = 10;
static uint32_t rack_min_rtt_movement = 250000;	/* Must move at least 250ms (in microseconds) to count as a lowering */
static int32_t rack_pace_one_seg = 0;		/* Shall we pace for less than 1.4Meg 1MSS at a time */
static int32_t rack_probertt_clear_is = 1;
static int32_t rack_max_drain_hbp = 1;		/* Extra drain times gpsrtt for highly buffered paths */
static int32_t rack_hbp_thresh = 3;		/* what is the divisor max_rtt/min_rtt to decide a hbp */

/* Part of pacing */
static int32_t rack_max_per_above = 30;		/* When we go to increment stop if above 100+this% */

/* Timely information:
 *
 * Here we have various control parameters on how
 * timely may change the multiplier. rack_gain_p5_ub
 * is associated with timely but not directly influencing
 * the rate decision like the other variables. It controls
 * the way fill-cw interacts with timely and caps how much
 * timely can boost the fill-cw b/w.
 *
 * The other values are various boost/shrink numbers as well
 * as potential caps when adjustments are made to the timely
 * gain (returned by rack_get_output_gain()). Remember too that
 * the gain returned can be overridden by other factors such as
 * probeRTT as well as fixed-rate-pacing.
 */
static int32_t rack_gain_p5_ub = 250;
static int32_t rack_gp_per_bw_mul_up = 2;	/* 2% */
static int32_t rack_gp_per_bw_mul_down = 4;	/* 4% */
static int32_t rack_gp_rtt_maxmul = 3;		/* 3 x maxmin */
static int32_t rack_gp_rtt_minmul = 1;		/* minrtt + (minrtt/mindiv) is lower rtt */
static int32_t rack_gp_rtt_mindiv = 4;		/* minrtt + (minrtt * minmul/mindiv) is lower rtt */
static int32_t rack_gp_decrease_per = 80;	/* Beta value of timely decrease (.8) = 80 */
static int32_t rack_gp_increase_per = 2;	/* 2% increase in multiplier */
static int32_t rack_per_lower_bound = 50;	/* Don't allow to drop below this multiplier */
static int32_t rack_per_upper_bound_ss = 0;	/* Don't allow SS to grow above this */
static int32_t rack_per_upper_bound_ca = 0;	/* Don't allow CA to grow above this */
static int32_t rack_do_dyn_mul = 0;		/* Are the rack gp multipliers dynamic */
static int32_t rack_gp_no_rec_chg = 1;		/* Prohibit recovery from reducing its multiplier */
static int32_t rack_timely_dec_clear = 6;	/* Do we clear decrement count at a value (6)?
*/ 364 static int32_t rack_timely_max_push_rise = 3; /* One round of pushing */ 365 static int32_t rack_timely_max_push_drop = 3; /* Three round of pushing */ 366 static int32_t rack_timely_min_segs = 4; /* 4 segment minimum */ 367 static int32_t rack_use_max_for_nobackoff = 0; 368 static int32_t rack_timely_int_timely_only = 0; /* do interim timely's only use the timely algo (no b/w changes)? */ 369 static int32_t rack_timely_no_stopping = 0; 370 static int32_t rack_down_raise_thresh = 100; 371 static int32_t rack_req_segs = 1; 372 static uint64_t rack_bw_rate_cap = 0; 373 static uint64_t rack_fillcw_bw_cap = 3750000; /* Cap fillcw at 30Mbps */ 374 375 376 /* Rack specific counters */ 377 counter_u64_t rack_saw_enobuf; 378 counter_u64_t rack_saw_enobuf_hw; 379 counter_u64_t rack_saw_enetunreach; 380 counter_u64_t rack_persists_sends; 381 counter_u64_t rack_persists_acks; 382 counter_u64_t rack_persists_loss; 383 counter_u64_t rack_persists_lost_ends; 384 counter_u64_t rack_total_bytes; 385 #ifdef INVARIANTS 386 counter_u64_t rack_adjust_map_bw; 387 #endif 388 /* Tail loss probe counters */ 389 counter_u64_t rack_tlp_tot; 390 counter_u64_t rack_tlp_newdata; 391 counter_u64_t rack_tlp_retran; 392 counter_u64_t rack_tlp_retran_bytes; 393 counter_u64_t rack_to_tot; 394 counter_u64_t rack_hot_alloc; 395 counter_u64_t tcp_policer_detected; 396 counter_u64_t rack_to_alloc; 397 counter_u64_t rack_to_alloc_hard; 398 counter_u64_t rack_to_alloc_emerg; 399 counter_u64_t rack_to_alloc_limited; 400 counter_u64_t rack_alloc_limited_conns; 401 counter_u64_t rack_split_limited; 402 counter_u64_t rack_rxt_clamps_cwnd; 403 counter_u64_t rack_rxt_clamps_cwnd_uniq; 404 405 counter_u64_t rack_multi_single_eq; 406 counter_u64_t rack_proc_non_comp_ack; 407 408 counter_u64_t rack_fto_send; 409 counter_u64_t rack_fto_rsm_send; 410 counter_u64_t rack_nfto_resend; 411 counter_u64_t rack_non_fto_send; 412 counter_u64_t rack_extended_rfo; 413 414 counter_u64_t rack_sack_proc_all; 415 counter_u64_t rack_sack_proc_short; 416 counter_u64_t rack_sack_proc_restart; 417 counter_u64_t rack_sack_attacks_detected; 418 counter_u64_t rack_sack_attacks_reversed; 419 counter_u64_t rack_sack_attacks_suspect; 420 counter_u64_t rack_sack_used_next_merge; 421 counter_u64_t rack_sack_splits; 422 counter_u64_t rack_sack_used_prev_merge; 423 counter_u64_t rack_sack_skipped_acked; 424 counter_u64_t rack_ack_total; 425 counter_u64_t rack_express_sack; 426 counter_u64_t rack_sack_total; 427 counter_u64_t rack_move_none; 428 counter_u64_t rack_move_some; 429 430 counter_u64_t rack_input_idle_reduces; 431 counter_u64_t rack_collapsed_win; 432 counter_u64_t rack_collapsed_win_seen; 433 counter_u64_t rack_collapsed_win_rxt; 434 counter_u64_t rack_collapsed_win_rxt_bytes; 435 counter_u64_t rack_try_scwnd; 436 counter_u64_t rack_hw_pace_init_fail; 437 counter_u64_t rack_hw_pace_lost; 438 439 counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE]; 440 counter_u64_t rack_opts_arry[RACK_OPTS_SIZE]; 441 442 443 #define RACK_REXMTVAL(tp) max(rack_rto_min, ((tp)->t_srtt + ((tp)->t_rttvar << 2))) 444 445 #define RACK_TCPT_RANGESET(tv, value, tvmin, tvmax, slop) do { \ 446 (tv) = (value) + slop; \ 447 if ((u_long)(tv) < (u_long)(tvmin)) \ 448 (tv) = (tvmin); \ 449 if ((u_long)(tv) > (u_long)(tvmax)) \ 450 (tv) = (tvmax); \ 451 } while (0) 452 453 static void 454 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line); 455 456 static int 457 rack_process_ack(struct mbuf *m, struct tcphdr *th, 458 struct socket *so, 
struct tcpcb *tp, struct tcpopt *to, 459 uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val, int32_t orig_tlen); 460 static int 461 rack_process_data(struct mbuf *m, struct tcphdr *th, 462 struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 463 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 464 static void 465 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, 466 uint32_t th_ack, uint16_t nsegs, uint16_t type, int32_t recovery); 467 static struct rack_sendmap *rack_alloc(struct tcp_rack *rack); 468 static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack, 469 uint8_t limit_type); 470 static struct rack_sendmap * 471 rack_check_recovery_mode(struct tcpcb *tp, 472 uint32_t tsused); 473 static uint32_t 474 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack); 475 static void 476 rack_cong_signal(struct tcpcb *tp, 477 uint32_t type, uint32_t ack, int ); 478 static void rack_counter_destroy(void); 479 static int 480 rack_ctloutput(struct tcpcb *tp, struct sockopt *sopt); 481 static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how); 482 static void 483 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override); 484 static void 485 rack_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, 486 int32_t drop_hdrlen, int32_t tlen, uint8_t iptos); 487 static void rack_dtor(void *mem, int32_t size, void *arg); 488 static void 489 rack_log_alt_to_to_cancel(struct tcp_rack *rack, 490 uint32_t flex1, uint32_t flex2, 491 uint32_t flex3, uint32_t flex4, 492 uint32_t flex5, uint32_t flex6, 493 uint16_t flex7, uint8_t mod); 494 495 static void 496 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, 497 uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line, 498 struct rack_sendmap *rsm, uint8_t quality); 499 static struct rack_sendmap * 500 rack_find_high_nonack(struct tcp_rack *rack, 501 struct rack_sendmap *rsm); 502 static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack); 503 static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm); 504 static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged); 505 static int rack_get_sockopt(struct tcpcb *tp, struct sockopt *sopt); 506 static void 507 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, 508 tcp_seq th_ack, int line, uint8_t quality); 509 static void 510 rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint8_t frm); 511 512 static uint32_t 513 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss); 514 static int32_t rack_handoff_ok(struct tcpcb *tp); 515 static int32_t rack_init(struct tcpcb *tp, void **ptr); 516 static void rack_init_sysctls(void); 517 518 static void 519 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, 520 struct tcphdr *th, int entered_rec, int dup_ack_struck, 521 int *dsack_seen, int *sacks_seen); 522 static void 523 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 524 uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t ts, 525 struct rack_sendmap *hintrsm, uint32_t add_flags, struct mbuf *s_mb, uint32_t s_moff, int hw_tls, int segsiz); 526 527 static uint64_t rack_get_gp_est(struct tcp_rack *rack); 528 529 530 static void 531 rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack, 532 struct rack_sendmap *rsm, uint32_t cts); 533 static void rack_log_to_event(struct tcp_rack *rack, int32_t 
to_num, struct rack_sendmap *rsm); 534 static int32_t rack_output(struct tcpcb *tp); 535 536 static uint32_t 537 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, 538 struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm, 539 uint32_t cts, uint32_t segsiz); 540 static void rack_post_recovery(struct tcpcb *tp, uint32_t th_seq); 541 static void rack_remxt_tmr(struct tcpcb *tp); 542 static int rack_set_sockopt(struct tcpcb *tp, struct sockopt *sopt); 543 static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack); 544 static int32_t rack_stopall(struct tcpcb *tp); 545 static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line); 546 static uint32_t 547 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 548 struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint32_t add_flag, int segsiz); 549 static void 550 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, 551 struct rack_sendmap *rsm, uint64_t ts, uint32_t add_flag, int segsiz); 552 static int 553 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 554 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack); 555 static int32_t tcp_addrack(module_t mod, int32_t type, void *data); 556 static int 557 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, 558 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 559 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 560 561 static void 562 rack_peg_rxt(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t segsiz); 563 564 static int 565 rack_do_closing(struct mbuf *m, struct tcphdr *th, 566 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 567 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 568 static int 569 rack_do_established(struct mbuf *m, struct tcphdr *th, 570 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 571 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 572 static int 573 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, 574 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 575 int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos); 576 static int 577 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, 578 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 579 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 580 static int 581 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, 582 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 583 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 584 static int 585 rack_do_lastack(struct mbuf *m, struct tcphdr *th, 586 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 587 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 588 static int 589 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, 590 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 591 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 592 static int 593 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, 594 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 595 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 596 static void rack_chk_req_and_hybrid_on_out(struct 
tcp_rack *rack, tcp_seq seq, uint32_t len, uint64_t cts); 597 struct rack_sendmap * 598 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, 599 uint32_t tsused); 600 static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, 601 uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt); 602 static void 603 tcp_rack_partialack(struct tcpcb *tp); 604 static int 605 rack_set_profile(struct tcp_rack *rack, int prof); 606 static void 607 rack_apply_deferred_options(struct tcp_rack *rack); 608 609 int32_t rack_clear_counter=0; 610 611 static uint64_t 612 rack_get_lt_bw(struct tcp_rack *rack) 613 { 614 struct timeval tv; 615 uint64_t tim, bytes; 616 617 tim = rack->r_ctl.lt_bw_time; 618 bytes = rack->r_ctl.lt_bw_bytes; 619 if (rack->lt_bw_up) { 620 /* Include all the current bytes too */ 621 microuptime(&tv); 622 bytes += (rack->rc_tp->snd_una - rack->r_ctl.lt_seq); 623 tim += (tcp_tv_to_lusectick(&tv) - rack->r_ctl.lt_timemark); 624 } 625 if ((bytes != 0) && (tim != 0)) 626 return ((bytes * (uint64_t)1000000) / tim); 627 else 628 return (0); 629 } 630 631 static void 632 rack_swap_beta_values(struct tcp_rack *rack, uint8_t flex8) 633 { 634 struct sockopt sopt; 635 struct cc_newreno_opts opt; 636 struct newreno old; 637 struct tcpcb *tp; 638 int error, failed = 0; 639 640 tp = rack->rc_tp; 641 if (tp->t_cc == NULL) { 642 /* Tcb is leaving */ 643 return; 644 } 645 rack->rc_pacing_cc_set = 1; 646 if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) { 647 /* Not new-reno we can't play games with beta! */ 648 failed = 1; 649 goto out; 650 651 } 652 if (CC_ALGO(tp)->ctl_output == NULL) { 653 /* Huh, not using new-reno so no swaps.? */ 654 failed = 2; 655 goto out; 656 } 657 /* Get the current values out */ 658 sopt.sopt_valsize = sizeof(struct cc_newreno_opts); 659 sopt.sopt_dir = SOPT_GET; 660 opt.name = CC_NEWRENO_BETA; 661 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt); 662 if (error) { 663 failed = 3; 664 goto out; 665 } 666 old.beta = opt.val; 667 opt.name = CC_NEWRENO_BETA_ECN; 668 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt); 669 if (error) { 670 failed = 4; 671 goto out; 672 } 673 old.beta_ecn = opt.val; 674 675 /* Now lets set in the values we have stored */ 676 sopt.sopt_dir = SOPT_SET; 677 opt.name = CC_NEWRENO_BETA; 678 opt.val = rack->r_ctl.rc_saved_beta.beta; 679 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt); 680 if (error) { 681 failed = 5; 682 goto out; 683 } 684 opt.name = CC_NEWRENO_BETA_ECN; 685 opt.val = rack->r_ctl.rc_saved_beta.beta_ecn; 686 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt); 687 if (error) { 688 failed = 6; 689 goto out; 690 } 691 /* Save off the values for restoral */ 692 memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno)); 693 out: 694 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 695 union tcp_log_stackspecific log; 696 struct timeval tv; 697 struct newreno *ptr; 698 699 ptr = ((struct newreno *)tp->t_ccv.cc_data); 700 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 701 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 702 log.u_bbr.flex1 = ptr->beta; 703 log.u_bbr.flex2 = ptr->beta_ecn; 704 log.u_bbr.flex3 = ptr->newreno_flags; 705 log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta; 706 log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn; 707 log.u_bbr.flex6 = failed; 708 log.u_bbr.flex7 = rack->gp_ready; 709 log.u_bbr.flex7 <<= 1; 710 log.u_bbr.flex7 |= rack->use_fixed_rate; 711 log.u_bbr.flex7 <<= 1; 712 log.u_bbr.flex7 |= rack->rc_pacing_cc_set; 713 
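		/*
		 * Note: flex7 above is a small bit field: bit 2 = gp_ready,
		 * bit 1 = use_fixed_rate, bit 0 = rc_pacing_cc_set.
		 */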
log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 714 log.u_bbr.flex8 = flex8; 715 tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, error, 716 0, &log, false, NULL, NULL, 0, &tv); 717 } 718 } 719 720 static void 721 rack_set_cc_pacing(struct tcp_rack *rack) 722 { 723 if (rack->rc_pacing_cc_set) 724 return; 725 /* 726 * Use the swap utility placing in 3 for flex8 to id a 727 * set of a new set of values. 728 */ 729 rack->rc_pacing_cc_set = 1; 730 rack_swap_beta_values(rack, 3); 731 } 732 733 static void 734 rack_undo_cc_pacing(struct tcp_rack *rack) 735 { 736 if (rack->rc_pacing_cc_set == 0) 737 return; 738 /* 739 * Use the swap utility placing in 4 for flex8 to id a 740 * restoral of the old values. 741 */ 742 rack->rc_pacing_cc_set = 0; 743 rack_swap_beta_values(rack, 4); 744 } 745 746 static void 747 rack_remove_pacing(struct tcp_rack *rack) 748 { 749 if (rack->rc_pacing_cc_set) 750 rack_undo_cc_pacing(rack); 751 if (rack->r_ctl.pacing_method & RACK_REG_PACING) 752 tcp_decrement_paced_conn(); 753 if (rack->r_ctl.pacing_method & RACK_DGP_PACING) 754 tcp_dec_dgp_pacing_cnt(); 755 rack->rc_always_pace = 0; 756 rack->r_ctl.pacing_method = RACK_PACING_NONE; 757 rack->dgp_on = 0; 758 rack->rc_hybrid_mode = 0; 759 rack->use_fixed_rate = 0; 760 } 761 762 static void 763 rack_log_gpset(struct tcp_rack *rack, uint32_t seq_end, uint32_t ack_end_t, 764 uint32_t send_end_t, int line, uint8_t mode, struct rack_sendmap *rsm) 765 { 766 if (tcp_bblogging_on(rack->rc_tp) && (rack_verbose_logging != 0)) { 767 union tcp_log_stackspecific log; 768 struct timeval tv; 769 770 memset(&log, 0, sizeof(log)); 771 log.u_bbr.flex1 = seq_end; 772 log.u_bbr.flex2 = rack->rc_tp->gput_seq; 773 log.u_bbr.flex3 = ack_end_t; 774 log.u_bbr.flex4 = rack->rc_tp->gput_ts; 775 log.u_bbr.flex5 = send_end_t; 776 log.u_bbr.flex6 = rack->rc_tp->gput_ack; 777 log.u_bbr.flex7 = mode; 778 log.u_bbr.flex8 = 69; 779 log.u_bbr.rttProp = rack->r_ctl.rc_gp_cumack_ts; 780 log.u_bbr.delRate = rack->r_ctl.rc_gp_output_ts; 781 log.u_bbr.pkts_out = line; 782 log.u_bbr.cwnd_gain = rack->app_limited_needs_set; 783 log.u_bbr.pkt_epoch = rack->r_ctl.rc_app_limited_cnt; 784 log.u_bbr.epoch = rack->r_ctl.current_round; 785 log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost; 786 if (rsm != NULL) { 787 log.u_bbr.applimited = rsm->r_start; 788 log.u_bbr.delivered = rsm->r_end; 789 log.u_bbr.epoch = rsm->r_flags; 790 } 791 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 792 TCP_LOG_EVENTP(rack->rc_tp, NULL, 793 &rack->rc_inp->inp_socket->so_rcv, 794 &rack->rc_inp->inp_socket->so_snd, 795 BBR_LOG_HPTSI_CALC, 0, 796 0, &log, false, &tv); 797 } 798 } 799 800 static int 801 sysctl_rack_clear(SYSCTL_HANDLER_ARGS) 802 { 803 uint32_t stat; 804 int32_t error; 805 806 error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t)); 807 if (error || req->newptr == NULL) 808 return error; 809 810 error = SYSCTL_IN(req, &stat, sizeof(uint32_t)); 811 if (error) 812 return (error); 813 if (stat == 1) { 814 #ifdef INVARIANTS 815 printf("Clearing RACK counters\n"); 816 #endif 817 counter_u64_zero(rack_tlp_tot); 818 counter_u64_zero(rack_tlp_newdata); 819 counter_u64_zero(rack_tlp_retran); 820 counter_u64_zero(rack_tlp_retran_bytes); 821 counter_u64_zero(rack_to_tot); 822 counter_u64_zero(rack_saw_enobuf); 823 counter_u64_zero(rack_saw_enobuf_hw); 824 counter_u64_zero(rack_saw_enetunreach); 825 counter_u64_zero(rack_persists_sends); 826 counter_u64_zero(rack_total_bytes); 827 counter_u64_zero(rack_persists_acks); 828 counter_u64_zero(rack_persists_loss); 829 
counter_u64_zero(rack_persists_lost_ends); 830 #ifdef INVARIANTS 831 counter_u64_zero(rack_adjust_map_bw); 832 #endif 833 counter_u64_zero(rack_to_alloc_hard); 834 counter_u64_zero(rack_to_alloc_emerg); 835 counter_u64_zero(rack_sack_proc_all); 836 counter_u64_zero(rack_fto_send); 837 counter_u64_zero(rack_fto_rsm_send); 838 counter_u64_zero(rack_extended_rfo); 839 counter_u64_zero(rack_hw_pace_init_fail); 840 counter_u64_zero(rack_hw_pace_lost); 841 counter_u64_zero(rack_non_fto_send); 842 counter_u64_zero(rack_nfto_resend); 843 counter_u64_zero(rack_sack_proc_short); 844 counter_u64_zero(rack_sack_proc_restart); 845 counter_u64_zero(rack_to_alloc); 846 counter_u64_zero(rack_to_alloc_limited); 847 counter_u64_zero(rack_alloc_limited_conns); 848 counter_u64_zero(rack_split_limited); 849 counter_u64_zero(rack_rxt_clamps_cwnd); 850 counter_u64_zero(rack_rxt_clamps_cwnd_uniq); 851 counter_u64_zero(rack_multi_single_eq); 852 counter_u64_zero(rack_proc_non_comp_ack); 853 counter_u64_zero(rack_sack_attacks_detected); 854 counter_u64_zero(rack_sack_attacks_reversed); 855 counter_u64_zero(rack_sack_attacks_suspect); 856 counter_u64_zero(rack_sack_used_next_merge); 857 counter_u64_zero(rack_sack_used_prev_merge); 858 counter_u64_zero(rack_sack_splits); 859 counter_u64_zero(rack_sack_skipped_acked); 860 counter_u64_zero(rack_ack_total); 861 counter_u64_zero(rack_express_sack); 862 counter_u64_zero(rack_sack_total); 863 counter_u64_zero(rack_move_none); 864 counter_u64_zero(rack_move_some); 865 counter_u64_zero(rack_try_scwnd); 866 counter_u64_zero(rack_collapsed_win); 867 counter_u64_zero(rack_collapsed_win_rxt); 868 counter_u64_zero(rack_collapsed_win_seen); 869 counter_u64_zero(rack_collapsed_win_rxt_bytes); 870 } else if (stat == 2) { 871 #ifdef INVARIANTS 872 printf("Clearing RACK option array\n"); 873 #endif 874 COUNTER_ARRAY_ZERO(rack_opts_arry, RACK_OPTS_SIZE); 875 } else if (stat == 3) { 876 printf("Rack has no stats counters to clear (use 1 to clear all stats in sysctl node)\n"); 877 } else if (stat == 4) { 878 #ifdef INVARIANTS 879 printf("Clearing RACK out size array\n"); 880 #endif 881 COUNTER_ARRAY_ZERO(rack_out_size, TCP_MSS_ACCT_SIZE); 882 } 883 rack_clear_counter = 0; 884 return (0); 885 } 886 887 static void 888 rack_init_sysctls(void) 889 { 890 struct sysctl_oid *rack_counters; 891 struct sysctl_oid *rack_attack; 892 struct sysctl_oid *rack_pacing; 893 struct sysctl_oid *rack_timely; 894 struct sysctl_oid *rack_timers; 895 struct sysctl_oid *rack_tlp; 896 struct sysctl_oid *rack_misc; 897 struct sysctl_oid *rack_features; 898 struct sysctl_oid *rack_measure; 899 struct sysctl_oid *rack_probertt; 900 struct sysctl_oid *rack_hw_pacing; 901 struct sysctl_oid *rack_policing; 902 903 rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 904 SYSCTL_CHILDREN(rack_sysctl_root), 905 OID_AUTO, 906 "sack_attack", 907 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 908 "Rack Sack Attack Counters and Controls"); 909 rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 910 SYSCTL_CHILDREN(rack_sysctl_root), 911 OID_AUTO, 912 "stats", 913 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 914 "Rack Counters"); 915 SYSCTL_ADD_S32(&rack_sysctl_ctx, 916 SYSCTL_CHILDREN(rack_sysctl_root), 917 OID_AUTO, "rate_sample_method", CTLFLAG_RW, 918 &rack_rate_sample_method , USE_RTT_LOW, 919 "What method should we use for rate sampling 0=high, 1=low "); 920 /* Probe rtt related controls */ 921 rack_probertt = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 922 SYSCTL_CHILDREN(rack_sysctl_root), 923 OID_AUTO, 924 "probertt", 925 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 926 
"ProbeRTT related Controls"); 927 SYSCTL_ADD_U16(&rack_sysctl_ctx, 928 SYSCTL_CHILDREN(rack_probertt), 929 OID_AUTO, "exit_per_hpb", CTLFLAG_RW, 930 &rack_atexit_prtt_hbp, 130, 931 "What percentage above goodput do we clamp CA/SS to at exit on high-BDP path 110%"); 932 SYSCTL_ADD_U16(&rack_sysctl_ctx, 933 SYSCTL_CHILDREN(rack_probertt), 934 OID_AUTO, "exit_per_nonhpb", CTLFLAG_RW, 935 &rack_atexit_prtt, 130, 936 "What percentage above goodput do we clamp CA/SS to at exit on a non high-BDP path 100%"); 937 SYSCTL_ADD_U16(&rack_sysctl_ctx, 938 SYSCTL_CHILDREN(rack_probertt), 939 OID_AUTO, "gp_per_mul", CTLFLAG_RW, 940 &rack_per_of_gp_probertt, 60, 941 "What percentage of goodput do we pace at in probertt"); 942 SYSCTL_ADD_U16(&rack_sysctl_ctx, 943 SYSCTL_CHILDREN(rack_probertt), 944 OID_AUTO, "gp_per_reduce", CTLFLAG_RW, 945 &rack_per_of_gp_probertt_reduce, 10, 946 "What percentage of goodput do we reduce every gp_srtt"); 947 SYSCTL_ADD_U16(&rack_sysctl_ctx, 948 SYSCTL_CHILDREN(rack_probertt), 949 OID_AUTO, "gp_per_low", CTLFLAG_RW, 950 &rack_per_of_gp_lowthresh, 40, 951 "What percentage of goodput do we allow the multiplier to fall to"); 952 SYSCTL_ADD_U32(&rack_sysctl_ctx, 953 SYSCTL_CHILDREN(rack_probertt), 954 OID_AUTO, "time_between", CTLFLAG_RW, 955 & rack_time_between_probertt, 96000000, 956 "How many useconds between the lowest rtt falling must past before we enter probertt"); 957 SYSCTL_ADD_U32(&rack_sysctl_ctx, 958 SYSCTL_CHILDREN(rack_probertt), 959 OID_AUTO, "safety", CTLFLAG_RW, 960 &rack_probe_rtt_safety_val, 2000000, 961 "If not zero, provides a maximum usecond that you can stay in probertt (2sec = 2000000)"); 962 SYSCTL_ADD_U32(&rack_sysctl_ctx, 963 SYSCTL_CHILDREN(rack_probertt), 964 OID_AUTO, "sets_cwnd", CTLFLAG_RW, 965 &rack_probe_rtt_sets_cwnd, 0, 966 "Do we set the cwnd too (if always_lower is on)"); 967 SYSCTL_ADD_U32(&rack_sysctl_ctx, 968 SYSCTL_CHILDREN(rack_probertt), 969 OID_AUTO, "maxdrainsrtts", CTLFLAG_RW, 970 &rack_max_drain_wait, 2, 971 "Maximum number of gp_srtt's to hold in drain waiting for flight to reach goal"); 972 SYSCTL_ADD_U32(&rack_sysctl_ctx, 973 SYSCTL_CHILDREN(rack_probertt), 974 OID_AUTO, "mustdrainsrtts", CTLFLAG_RW, 975 &rack_must_drain, 1, 976 "We must drain this many gp_srtt's waiting for flight to reach goal"); 977 SYSCTL_ADD_U32(&rack_sysctl_ctx, 978 SYSCTL_CHILDREN(rack_probertt), 979 OID_AUTO, "goal_use_min_entry", CTLFLAG_RW, 980 &rack_probertt_use_min_rtt_entry, 1, 981 "Should we use the min-rtt to calculate the goal rtt (else gp_srtt) at entry"); 982 SYSCTL_ADD_U32(&rack_sysctl_ctx, 983 SYSCTL_CHILDREN(rack_probertt), 984 OID_AUTO, "goal_use_min_exit", CTLFLAG_RW, 985 &rack_probertt_use_min_rtt_exit, 0, 986 "How to set cwnd at exit, 0 - dynamic, 1 - use min-rtt, 2 - use curgprtt, 3 - entry gp-rtt"); 987 SYSCTL_ADD_U32(&rack_sysctl_ctx, 988 SYSCTL_CHILDREN(rack_probertt), 989 OID_AUTO, "length_div", CTLFLAG_RW, 990 &rack_probertt_gpsrtt_cnt_div, 0, 991 "How many recent goodput srtt periods plus hold tim does probertt last (bottom of fraction)"); 992 SYSCTL_ADD_U32(&rack_sysctl_ctx, 993 SYSCTL_CHILDREN(rack_probertt), 994 OID_AUTO, "length_mul", CTLFLAG_RW, 995 &rack_probertt_gpsrtt_cnt_mul, 0, 996 "How many recent goodput srtt periods plus hold tim does probertt last (top of fraction)"); 997 SYSCTL_ADD_U32(&rack_sysctl_ctx, 998 SYSCTL_CHILDREN(rack_probertt), 999 OID_AUTO, "holdtim_at_target", CTLFLAG_RW, 1000 &rack_min_probertt_hold, 200000, 1001 "What is the minimum time we hold probertt at target"); 1002 
SYSCTL_ADD_U32(&rack_sysctl_ctx, 1003 SYSCTL_CHILDREN(rack_probertt), 1004 OID_AUTO, "filter_life", CTLFLAG_RW, 1005 &rack_probertt_filter_life, 10000000, 1006 "What is the time for the filters life in useconds"); 1007 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1008 SYSCTL_CHILDREN(rack_probertt), 1009 OID_AUTO, "lower_within", CTLFLAG_RW, 1010 &rack_probertt_lower_within, 10, 1011 "If the rtt goes lower within this percentage of the time, go into probe-rtt"); 1012 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1013 SYSCTL_CHILDREN(rack_probertt), 1014 OID_AUTO, "must_move", CTLFLAG_RW, 1015 &rack_min_rtt_movement, 250, 1016 "How much is the minimum movement in rtt to count as a drop for probertt purposes"); 1017 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1018 SYSCTL_CHILDREN(rack_probertt), 1019 OID_AUTO, "clear_is_cnts", CTLFLAG_RW, 1020 &rack_probertt_clear_is, 1, 1021 "Do we clear I/S counts on exiting probe-rtt"); 1022 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1023 SYSCTL_CHILDREN(rack_probertt), 1024 OID_AUTO, "hbp_extra_drain", CTLFLAG_RW, 1025 &rack_max_drain_hbp, 1, 1026 "How many extra drain gpsrtt's do we get in highly buffered paths"); 1027 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1028 SYSCTL_CHILDREN(rack_probertt), 1029 OID_AUTO, "hbp_threshold", CTLFLAG_RW, 1030 &rack_hbp_thresh, 3, 1031 "We are highly buffered if min_rtt_seen / max_rtt_seen > this-threshold"); 1032 /* Pacing related sysctls */ 1033 rack_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1034 SYSCTL_CHILDREN(rack_sysctl_root), 1035 OID_AUTO, 1036 "pacing", 1037 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1038 "Pacing related Controls"); 1039 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1040 SYSCTL_CHILDREN(rack_pacing), 1041 OID_AUTO, "pcm_enabled", CTLFLAG_RW, 1042 &rack_pcm_is_enabled, 1, 1043 "Do we by default do PCM measurements?"); 1044 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1045 SYSCTL_CHILDREN(rack_pacing), 1046 OID_AUTO, "pcm_rnds", CTLFLAG_RW, 1047 &rack_pcm_every_n_rounds, 100, 1048 "How many rounds before we need to do a PCM measurement"); 1049 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1050 SYSCTL_CHILDREN(rack_pacing), 1051 OID_AUTO, "pcm_blast", CTLFLAG_RW, 1052 &rack_pcm_blast, 0, 1053 "Blast out the full cwnd/rwnd when doing a PCM measurement"); 1054 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1055 SYSCTL_CHILDREN(rack_pacing), 1056 OID_AUTO, "rnd_gp_gain", CTLFLAG_RW, 1057 &rack_gp_gain_req, 1200, 1058 "How much do we have to increase the GP to record the round 1200 = 120.0"); 1059 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1060 SYSCTL_CHILDREN(rack_pacing), 1061 OID_AUTO, "dgp_out_of_ss_at", CTLFLAG_RW, 1062 &rack_rnd_cnt_req, 0x10005, 1063 "How many rounds less than rnd_gp_gain will drop us out of SS"); 1064 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1065 SYSCTL_CHILDREN(rack_pacing), 1066 OID_AUTO, "no_timely", CTLFLAG_RW, 1067 &rack_timely_off, 0, 1068 "Do we not use timely in DGP?"); 1069 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1070 SYSCTL_CHILDREN(rack_pacing), 1071 OID_AUTO, "fullbufdisc", CTLFLAG_RW, 1072 &rack_full_buffer_discount, 10, 1073 "What percentage b/w reduction over the GP estimate for a full buffer (default=0 off)?"); 1074 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1075 SYSCTL_CHILDREN(rack_pacing), 1076 OID_AUTO, "fillcw", CTLFLAG_RW, 1077 &rack_fill_cw_state, 0, 1078 "Enable fillcw on new connections (default=0 off)?"); 1079 SYSCTL_ADD_U16(&rack_sysctl_ctx, 1080 SYSCTL_CHILDREN(rack_pacing), 1081 OID_AUTO, "min_burst", CTLFLAG_RW, 1082 &rack_pacing_min_seg, 0, 1083 "What is the min burst size for pacing (0 disables)?"); 1084 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1085 SYSCTL_CHILDREN(rack_pacing), 1086 OID_AUTO, 
"divisor", CTLFLAG_RW, 1087 &rack_default_pacing_divisor, 250, 1088 "What is the default divisor given to the rl code?"); 1089 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1090 SYSCTL_CHILDREN(rack_pacing), 1091 OID_AUTO, "fillcw_max_mult", CTLFLAG_RW, 1092 &rack_bw_multipler, 0, 1093 "What is the limit multiplier of the current gp_est that fillcw can increase the b/w too, 200 == 200% (0 = off)?"); 1094 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1095 SYSCTL_CHILDREN(rack_pacing), 1096 OID_AUTO, "max_pace_over", CTLFLAG_RW, 1097 &rack_max_per_above, 30, 1098 "What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)"); 1099 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1100 SYSCTL_CHILDREN(rack_pacing), 1101 OID_AUTO, "allow1mss", CTLFLAG_RW, 1102 &rack_pace_one_seg, 0, 1103 "Do we allow low b/w pacing of 1MSS instead of two (1.2Meg and less)?"); 1104 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1105 SYSCTL_CHILDREN(rack_pacing), 1106 OID_AUTO, "limit_wsrtt", CTLFLAG_RW, 1107 &rack_limit_time_with_srtt, 0, 1108 "Do we limit pacing time based on srtt"); 1109 SYSCTL_ADD_U16(&rack_sysctl_ctx, 1110 SYSCTL_CHILDREN(rack_pacing), 1111 OID_AUTO, "gp_per_ss", CTLFLAG_RW, 1112 &rack_per_of_gp_ss, 250, 1113 "If non zero, what percentage of goodput to pace at in slow start"); 1114 SYSCTL_ADD_U16(&rack_sysctl_ctx, 1115 SYSCTL_CHILDREN(rack_pacing), 1116 OID_AUTO, "gp_per_ca", CTLFLAG_RW, 1117 &rack_per_of_gp_ca, 150, 1118 "If non zero, what percentage of goodput to pace at in congestion avoidance"); 1119 SYSCTL_ADD_U16(&rack_sysctl_ctx, 1120 SYSCTL_CHILDREN(rack_pacing), 1121 OID_AUTO, "gp_per_rec", CTLFLAG_RW, 1122 &rack_per_of_gp_rec, 200, 1123 "If non zero, what percentage of goodput to pace at in recovery"); 1124 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1125 SYSCTL_CHILDREN(rack_pacing), 1126 OID_AUTO, "pace_max_seg", CTLFLAG_RW, 1127 &rack_hptsi_segments, 40, 1128 "What size is the max for TSO segments in pacing and burst mitigation"); 1129 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1130 SYSCTL_CHILDREN(rack_pacing), 1131 OID_AUTO, "burst_reduces", CTLFLAG_RW, 1132 &rack_slot_reduction, 4, 1133 "When doing only burst mitigation what is the reduce divisor"); 1134 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1135 SYSCTL_CHILDREN(rack_sysctl_root), 1136 OID_AUTO, "use_pacing", CTLFLAG_RW, 1137 &rack_pace_every_seg, 0, 1138 "If set we use pacing, if clear we use only the original burst mitigation"); 1139 SYSCTL_ADD_U64(&rack_sysctl_ctx, 1140 SYSCTL_CHILDREN(rack_pacing), 1141 OID_AUTO, "rate_cap", CTLFLAG_RW, 1142 &rack_bw_rate_cap, 0, 1143 "If set we apply this value to the absolute rate cap used by pacing"); 1144 SYSCTL_ADD_U64(&rack_sysctl_ctx, 1145 SYSCTL_CHILDREN(rack_pacing), 1146 OID_AUTO, "fillcw_cap", CTLFLAG_RW, 1147 &rack_fillcw_bw_cap, 3750000, 1148 "Do we have an absolute cap on the amount of b/w fillcw can specify (0 = no)?"); 1149 SYSCTL_ADD_U8(&rack_sysctl_ctx, 1150 SYSCTL_CHILDREN(rack_sysctl_root), 1151 OID_AUTO, "req_measure_cnt", CTLFLAG_RW, 1152 &rack_req_measurements, 1, 1153 "If doing dynamic pacing, how many measurements must be in before we start pacing?"); 1154 /* Hardware pacing */ 1155 rack_hw_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1156 SYSCTL_CHILDREN(rack_sysctl_root), 1157 OID_AUTO, 1158 "hdwr_pacing", 1159 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1160 "Pacing related Controls"); 1161 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1162 SYSCTL_CHILDREN(rack_hw_pacing), 1163 OID_AUTO, "rwnd_factor", CTLFLAG_RW, 1164 &rack_hw_rwnd_factor, 2, 1165 "How many times does snd_wnd need to be bigger than pace_max_seg so we will hold off and 
get more acks?"); 1166 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1167 SYSCTL_CHILDREN(rack_hw_pacing), 1168 OID_AUTO, "precheck", CTLFLAG_RW, 1169 &rack_hw_check_queue, 0, 1170 "Do we always precheck the hdwr pacing queue to avoid ENOBUF's?"); 1171 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1172 SYSCTL_CHILDREN(rack_hw_pacing), 1173 OID_AUTO, "pace_enobuf_mult", CTLFLAG_RW, 1174 &rack_enobuf_hw_boost_mult, 0, 1175 "By how many time_betweens should we boost the pacing time if we see a ENOBUFS?"); 1176 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1177 SYSCTL_CHILDREN(rack_hw_pacing), 1178 OID_AUTO, "pace_enobuf_max", CTLFLAG_RW, 1179 &rack_enobuf_hw_max, 2, 1180 "What is the max boost the pacing time if we see a ENOBUFS?"); 1181 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1182 SYSCTL_CHILDREN(rack_hw_pacing), 1183 OID_AUTO, "pace_enobuf_min", CTLFLAG_RW, 1184 &rack_enobuf_hw_min, 2, 1185 "What is the min boost the pacing time if we see a ENOBUFS?"); 1186 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1187 SYSCTL_CHILDREN(rack_hw_pacing), 1188 OID_AUTO, "enable", CTLFLAG_RW, 1189 &rack_enable_hw_pacing, 0, 1190 "Should RACK attempt to use hw pacing?"); 1191 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1192 SYSCTL_CHILDREN(rack_hw_pacing), 1193 OID_AUTO, "rate_cap", CTLFLAG_RW, 1194 &rack_hw_rate_caps, 0, 1195 "Does the highest hardware pacing rate cap the rate we will send at??"); 1196 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1197 SYSCTL_CHILDREN(rack_hw_pacing), 1198 OID_AUTO, "uncap_per", CTLFLAG_RW, 1199 &rack_hw_rate_cap_per, 0, 1200 "If you go over b/w by this amount you will be uncapped (0 = never)"); 1201 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1202 SYSCTL_CHILDREN(rack_hw_pacing), 1203 OID_AUTO, "rate_min", CTLFLAG_RW, 1204 &rack_hw_rate_min, 0, 1205 "Do we need a minimum estimate of this many bytes per second in order to engage hw pacing?"); 1206 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1207 SYSCTL_CHILDREN(rack_hw_pacing), 1208 OID_AUTO, "rate_to_low", CTLFLAG_RW, 1209 &rack_hw_rate_to_low, 0, 1210 "If we fall below this rate, dis-engage hw pacing?"); 1211 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1212 SYSCTL_CHILDREN(rack_hw_pacing), 1213 OID_AUTO, "up_only", CTLFLAG_RW, 1214 &rack_hw_up_only, 0, 1215 "Do we allow hw pacing to lower the rate selected?"); 1216 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1217 SYSCTL_CHILDREN(rack_hw_pacing), 1218 OID_AUTO, "extra_mss_precise", CTLFLAG_RW, 1219 &rack_hw_pace_extra_slots, 0, 1220 "If the rates between software and hardware match precisely how many extra time_betweens do we get?"); 1221 rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1222 SYSCTL_CHILDREN(rack_sysctl_root), 1223 OID_AUTO, 1224 "timely", 1225 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1226 "Rack Timely RTT Controls"); 1227 /* Timely based GP dynmics */ 1228 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1229 SYSCTL_CHILDREN(rack_timely), 1230 OID_AUTO, "upper", CTLFLAG_RW, 1231 &rack_gp_per_bw_mul_up, 2, 1232 "Rack timely upper range for equal b/w (in percentage)"); 1233 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1234 SYSCTL_CHILDREN(rack_timely), 1235 OID_AUTO, "lower", CTLFLAG_RW, 1236 &rack_gp_per_bw_mul_down, 4, 1237 "Rack timely lower range for equal b/w (in percentage)"); 1238 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1239 SYSCTL_CHILDREN(rack_timely), 1240 OID_AUTO, "rtt_max_mul", CTLFLAG_RW, 1241 &rack_gp_rtt_maxmul, 3, 1242 "Rack timely multiplier of lowest rtt for rtt_max"); 1243 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1244 SYSCTL_CHILDREN(rack_timely), 1245 OID_AUTO, "rtt_min_div", CTLFLAG_RW, 1246 &rack_gp_rtt_mindiv, 4, 1247 "Rack timely divisor used for rtt + (rtt * mul/divisor) for check for lower rtt"); 1248 
SYSCTL_ADD_S32(&rack_sysctl_ctx, 1249 SYSCTL_CHILDREN(rack_timely), 1250 OID_AUTO, "rtt_min_mul", CTLFLAG_RW, 1251 &rack_gp_rtt_minmul, 1, 1252 "Rack timely multiplier used for rtt + (rtt * mul/divisor) for check for lower rtt"); 1253 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1254 SYSCTL_CHILDREN(rack_timely), 1255 OID_AUTO, "decrease", CTLFLAG_RW, 1256 &rack_gp_decrease_per, 80, 1257 "Rack timely Beta value 80 = .8 (scaled by 100)"); 1258 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1259 SYSCTL_CHILDREN(rack_timely), 1260 OID_AUTO, "increase", CTLFLAG_RW, 1261 &rack_gp_increase_per, 2, 1262 "Rack timely increase perentage of our GP multiplication factor"); 1263 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1264 SYSCTL_CHILDREN(rack_timely), 1265 OID_AUTO, "lowerbound", CTLFLAG_RW, 1266 &rack_per_lower_bound, 50, 1267 "Rack timely lowest percentage we allow GP multiplier to fall to"); 1268 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1269 SYSCTL_CHILDREN(rack_timely), 1270 OID_AUTO, "p5_upper", CTLFLAG_RW, 1271 &rack_gain_p5_ub, 250, 1272 "Profile 5 upper bound to timely gain"); 1273 1274 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1275 SYSCTL_CHILDREN(rack_timely), 1276 OID_AUTO, "upperboundss", CTLFLAG_RW, 1277 &rack_per_upper_bound_ss, 0, 1278 "Rack timely highest percentage we allow GP multiplier in SS to raise to (0 is no upperbound)"); 1279 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1280 SYSCTL_CHILDREN(rack_timely), 1281 OID_AUTO, "upperboundca", CTLFLAG_RW, 1282 &rack_per_upper_bound_ca, 0, 1283 "Rack timely highest percentage we allow GP multiplier to CA raise to (0 is no upperbound)"); 1284 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1285 SYSCTL_CHILDREN(rack_timely), 1286 OID_AUTO, "dynamicgp", CTLFLAG_RW, 1287 &rack_do_dyn_mul, 0, 1288 "Rack timely do we enable dynmaic timely goodput by default"); 1289 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1290 SYSCTL_CHILDREN(rack_timely), 1291 OID_AUTO, "no_rec_red", CTLFLAG_RW, 1292 &rack_gp_no_rec_chg, 1, 1293 "Rack timely do we prohibit the recovery multiplier from being lowered"); 1294 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1295 SYSCTL_CHILDREN(rack_timely), 1296 OID_AUTO, "red_clear_cnt", CTLFLAG_RW, 1297 &rack_timely_dec_clear, 6, 1298 "Rack timely what threshold do we count to before another boost during b/w decent"); 1299 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1300 SYSCTL_CHILDREN(rack_timely), 1301 OID_AUTO, "max_push_rise", CTLFLAG_RW, 1302 &rack_timely_max_push_rise, 3, 1303 "Rack timely how many times do we push up with b/w increase"); 1304 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1305 SYSCTL_CHILDREN(rack_timely), 1306 OID_AUTO, "max_push_drop", CTLFLAG_RW, 1307 &rack_timely_max_push_drop, 3, 1308 "Rack timely how many times do we push back on b/w decent"); 1309 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1310 SYSCTL_CHILDREN(rack_timely), 1311 OID_AUTO, "min_segs", CTLFLAG_RW, 1312 &rack_timely_min_segs, 4, 1313 "Rack timely when setting the cwnd what is the min num segments"); 1314 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1315 SYSCTL_CHILDREN(rack_timely), 1316 OID_AUTO, "noback_max", CTLFLAG_RW, 1317 &rack_use_max_for_nobackoff, 0, 1318 "Rack timely when deciding if to backoff on a loss, do we use under max rtt else min"); 1319 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1320 SYSCTL_CHILDREN(rack_timely), 1321 OID_AUTO, "interim_timely_only", CTLFLAG_RW, 1322 &rack_timely_int_timely_only, 0, 1323 "Rack timely when doing interim timely's do we only do timely (no b/w consideration)"); 1324 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1325 SYSCTL_CHILDREN(rack_timely), 1326 OID_AUTO, "nonstop", CTLFLAG_RW, 1327 &rack_timely_no_stopping, 0, 1328 "Rack timely don't stop 
increase"); 1329 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1330 SYSCTL_CHILDREN(rack_timely), 1331 OID_AUTO, "dec_raise_thresh", CTLFLAG_RW, 1332 &rack_down_raise_thresh, 100, 1333 "If the CA or SS is below this threshold raise on the first 3 b/w lowers (0=always)"); 1334 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1335 SYSCTL_CHILDREN(rack_timely), 1336 OID_AUTO, "bottom_drag_segs", CTLFLAG_RW, 1337 &rack_req_segs, 1, 1338 "Bottom dragging if not these many segments outstanding and room"); 1339 1340 /* TLP and Rack related parameters */ 1341 rack_tlp = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1342 SYSCTL_CHILDREN(rack_sysctl_root), 1343 OID_AUTO, 1344 "tlp", 1345 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1346 "TLP and Rack related Controls"); 1347 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1348 SYSCTL_CHILDREN(rack_tlp), 1349 OID_AUTO, "use_rrr", CTLFLAG_RW, 1350 &use_rack_rr, 1, 1351 "Do we use Rack Rapid Recovery"); 1352 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1353 SYSCTL_CHILDREN(rack_tlp), 1354 OID_AUTO, "post_rec_labc", CTLFLAG_RW, 1355 &rack_max_abc_post_recovery, 2, 1356 "Since we do early recovery, do we override the l_abc to a value, if so what?"); 1357 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1358 SYSCTL_CHILDREN(rack_tlp), 1359 OID_AUTO, "nonrxt_use_cr", CTLFLAG_RW, 1360 &rack_non_rxt_use_cr, 0, 1361 "Do we use ss/ca rate if in recovery we are transmitting a new data chunk"); 1362 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1363 SYSCTL_CHILDREN(rack_tlp), 1364 OID_AUTO, "tlpmethod", CTLFLAG_RW, 1365 &rack_tlp_threshold_use, TLP_USE_TWO_ONE, 1366 "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2"); 1367 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1368 SYSCTL_CHILDREN(rack_tlp), 1369 OID_AUTO, "limit", CTLFLAG_RW, 1370 &rack_tlp_limit, 2, 1371 "How many TLP's can be sent without sending new data"); 1372 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1373 SYSCTL_CHILDREN(rack_tlp), 1374 OID_AUTO, "use_greater", CTLFLAG_RW, 1375 &rack_tlp_use_greater, 1, 1376 "Should we use the rack_rtt time if its greater than srtt"); 1377 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1378 SYSCTL_CHILDREN(rack_tlp), 1379 OID_AUTO, "tlpminto", CTLFLAG_RW, 1380 &rack_tlp_min, 10000, 1381 "TLP minimum timeout per the specification (in microseconds)"); 1382 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1383 SYSCTL_CHILDREN(rack_tlp), 1384 OID_AUTO, "send_oldest", CTLFLAG_RW, 1385 &rack_always_send_oldest, 0, 1386 "Should we always send the oldest TLP and RACK-TLP"); 1387 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1388 SYSCTL_CHILDREN(rack_tlp), 1389 OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW, 1390 &rack_lower_cwnd_at_tlp, 0, 1391 "When a TLP completes a retran should we enter recovery"); 1392 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1393 SYSCTL_CHILDREN(rack_tlp), 1394 OID_AUTO, "reorder_thresh", CTLFLAG_RW, 1395 &rack_reorder_thresh, 2, 1396 "What factor for rack will be added when seeing reordering (shift right)"); 1397 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1398 SYSCTL_CHILDREN(rack_tlp), 1399 OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW, 1400 &rack_tlp_thresh, 1, 1401 "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)"); 1402 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1403 SYSCTL_CHILDREN(rack_tlp), 1404 OID_AUTO, "reorder_fade", CTLFLAG_RW, 1405 &rack_reorder_fade, 60000000, 1406 "Does reorder detection fade, if so how many microseconds (0 means never)"); 1407 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1408 SYSCTL_CHILDREN(rack_tlp), 1409 OID_AUTO, "pktdelay", CTLFLAG_RW, 1410 &rack_pkt_delay, 1000, 1411 "Extra RACK time (in microseconds) besides reordering thresh"); 1412 1413 /* Timer related controls */ 1414 rack_timers = 
SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1415 SYSCTL_CHILDREN(rack_sysctl_root), 1416 OID_AUTO, 1417 "timers", 1418 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1419 "Timer related controls"); 1420 SYSCTL_ADD_U8(&rack_sysctl_ctx, 1421 SYSCTL_CHILDREN(rack_timers), 1422 OID_AUTO, "reset_ssth_rec_rto", CTLFLAG_RW, 1423 &rack_ssthresh_rest_rto_rec, 0, 1424 "When doing recovery -> rto -> recovery do we reset SSthresh?"); 1425 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1426 SYSCTL_CHILDREN(rack_timers), 1427 OID_AUTO, "scoreboard_thresh", CTLFLAG_RW, 1428 &rack_rxt_scoreboard_clear_thresh, 2, 1429 "How many RTO's are allowed before we clear the scoreboard"); 1430 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1431 SYSCTL_CHILDREN(rack_timers), 1432 OID_AUTO, "honor_hpts_min", CTLFLAG_RW, 1433 &rack_honors_hpts_min_to, 1, 1434 "Do rack pacing timers honor hpts min timeout"); 1435 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1436 SYSCTL_CHILDREN(rack_timers), 1437 OID_AUTO, "hpts_max_reduce", CTLFLAG_RW, 1438 &rack_max_reduce, 10, 1439 "Max percentage we will reduce slot by for pacing when we are behind"); 1440 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1441 SYSCTL_CHILDREN(rack_timers), 1442 OID_AUTO, "persmin", CTLFLAG_RW, 1443 &rack_persist_min, 250000, 1444 "What is the minimum time in microseconds between persists"); 1445 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1446 SYSCTL_CHILDREN(rack_timers), 1447 OID_AUTO, "persmax", CTLFLAG_RW, 1448 &rack_persist_max, 2000000, 1449 "What is the largest delay in microseconds between persists"); 1450 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1451 SYSCTL_CHILDREN(rack_timers), 1452 OID_AUTO, "delayed_ack", CTLFLAG_RW, 1453 &rack_delayed_ack_time, 40000, 1454 "Delayed ack time (40ms in microseconds)"); 1455 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1456 SYSCTL_CHILDREN(rack_timers), 1457 OID_AUTO, "minrto", CTLFLAG_RW, 1458 &rack_rto_min, 30000, 1459 "Minimum RTO in microseconds -- set with caution below 1000 due to TLP"); 1460 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1461 SYSCTL_CHILDREN(rack_timers), 1462 OID_AUTO, "maxrto", CTLFLAG_RW, 1463 &rack_rto_max, 4000000, 1464 "Maximum RTO in microseconds -- should be at least as large as min_rto"); 1465 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1466 SYSCTL_CHILDREN(rack_timers), 1467 OID_AUTO, "minto", CTLFLAG_RW, 1468 &rack_min_to, 1000, 1469 "Minimum rack timeout in microseconds"); 1470 /* Measure controls */ 1471 rack_measure = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1472 SYSCTL_CHILDREN(rack_sysctl_root), 1473 OID_AUTO, 1474 "measure", 1475 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1476 "Measure related controls"); 1477 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1478 SYSCTL_CHILDREN(rack_measure), 1479 OID_AUTO, "wma_divisor", CTLFLAG_RW, 1480 &rack_wma_divisor, 8, 1481 "When doing b/w calculation what is the divisor for the WMA"); 1482 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1483 SYSCTL_CHILDREN(rack_measure), 1484 OID_AUTO, "end_cwnd", CTLFLAG_RW, 1485 &rack_cwnd_block_ends_measure, 0, 1486 "Does a cwnd just-return end the measurement window (app limited)"); 1487 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1488 SYSCTL_CHILDREN(rack_measure), 1489 OID_AUTO, "end_rwnd", CTLFLAG_RW, 1490 &rack_rwnd_block_ends_measure, 0, 1491 "Does an rwnd just-return end the measurement window (app limited -- not persists)"); 1492 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1493 SYSCTL_CHILDREN(rack_measure), 1494 OID_AUTO, "min_target", CTLFLAG_RW, 1495 &rack_def_data_window, 20, 1496 "What is the minimum target window (in mss) for a GP measurements"); 1497 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1498 SYSCTL_CHILDREN(rack_measure), 1499 OID_AUTO, "goal_bdp", CTLFLAG_RW, 1500 
&rack_goal_bdp, 2, 1501 "What is the goal BDP to measure"); 1502 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1503 SYSCTL_CHILDREN(rack_measure), 1504 OID_AUTO, "min_srtts", CTLFLAG_RW, 1505 &rack_min_srtts, 1, 1506 "What is the minimum number of SRTT's a GP measurement must span"); 1507 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1508 SYSCTL_CHILDREN(rack_measure), 1509 OID_AUTO, "min_measure_tim", CTLFLAG_RW, 1510 &rack_min_measure_usec, 0, 1511 "What is the minimum time (in microseconds) for a measurement (0 means this check is off)"); 1512 /* Features */ 1513 rack_features = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1514 SYSCTL_CHILDREN(rack_sysctl_root), 1515 OID_AUTO, 1516 "features", 1517 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1518 "Feature controls"); 1519 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1520 SYSCTL_CHILDREN(rack_features), 1521 OID_AUTO, "hybrid_set_maxseg", CTLFLAG_RW, 1522 &rack_hybrid_allow_set_maxseg, 0, 1523 "Should hybrid pacing allow the setmss command"); 1524 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1525 SYSCTL_CHILDREN(rack_features), 1526 OID_AUTO, "cmpack", CTLFLAG_RW, 1527 &rack_use_cmp_acks, 1, 1528 "Should RACK have LRO send compressed acks"); 1529 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1530 SYSCTL_CHILDREN(rack_features), 1531 OID_AUTO, "fsb", CTLFLAG_RW, 1532 &rack_use_fsb, 1, 1533 "Should RACK use the fast send block?"); 1534 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1535 SYSCTL_CHILDREN(rack_features), 1536 OID_AUTO, "rfo", CTLFLAG_RW, 1537 &rack_use_rfo, 1, 1538 "Should RACK use rack_fast_output()?"); 1539 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1540 SYSCTL_CHILDREN(rack_features), 1541 OID_AUTO, "rsmrfo", CTLFLAG_RW, 1542 &rack_use_rsm_rfo, 1, 1543 "Should RACK use rack_fast_rsm_output()?"); 1544 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1545 SYSCTL_CHILDREN(rack_features), 1546 OID_AUTO, "non_paced_lro_queue", CTLFLAG_RW, 1547 &rack_enable_mqueue_for_nonpaced, 0, 1548 "Should RACK use mbuf queuing for non-paced connections"); 1549 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1550 SYSCTL_CHILDREN(rack_features), 1551 OID_AUTO, "hystartplusplus", CTLFLAG_RW, 1552 &rack_do_hystart, 0, 1553 "Should RACK enable HyStart++ on connections?"); 1554 /* Policer detection */ 1555 rack_policing = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1556 SYSCTL_CHILDREN(rack_sysctl_root), 1557 OID_AUTO, 1558 "policing", 1559 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1560 "policer detection"); 1561 SYSCTL_ADD_U16(&rack_sysctl_ctx, 1562 SYSCTL_CHILDREN(rack_policing), 1563 OID_AUTO, "rxt_thresh", CTLFLAG_RW, 1564 &rack_policer_rxt_thresh, 0, 1565 "Percentage of retransmits we need to be a possible policer (499 = 49.9 percent)"); 1566 SYSCTL_ADD_U8(&rack_sysctl_ctx, 1567 SYSCTL_CHILDREN(rack_policing), 1568 OID_AUTO, "avg_thresh", CTLFLAG_RW, 1569 &rack_policer_avg_thresh, 0, 1570 "What threshold of average retransmits needed to recover a lost packet (1 - 169 aka 21 = 2.1)?"); 1571 SYSCTL_ADD_U8(&rack_sysctl_ctx, 1572 SYSCTL_CHILDREN(rack_policing), 1573 OID_AUTO, "med_thresh", CTLFLAG_RW, 1574 &rack_policer_med_thresh, 0, 1575 "What threshold of Median retransmits needed to recover a lost packet (1 - 16)?"); 1576 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1577 SYSCTL_CHILDREN(rack_policing), 1578 OID_AUTO, "data_thresh", CTLFLAG_RW, 1579 &rack_policer_data_thresh, 64000, 1580 "How many bytes must have gotten through before we can start doing policer detection?"); 1581 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1582 SYSCTL_CHILDREN(rack_policing), 1583 OID_AUTO, "bwcomp", CTLFLAG_RW, 1584 &rack_policing_do_bw_comp, 1, 1585 "Do we raise up low b/w so that at least pace_max_seg can be sent in the srtt?"); 1586 SYSCTL_ADD_U8(&rack_sysctl_ctx, 1587
SYSCTL_CHILDREN(rack_policing), 1588 OID_AUTO, "recmss", CTLFLAG_RW, 1589 &rack_req_del_mss, 18, 1590 "How many MSS must be delivered during recovery to engage policer detection?"); 1591 SYSCTL_ADD_U16(&rack_sysctl_ctx, 1592 SYSCTL_CHILDREN(rack_policing), 1593 OID_AUTO, "res_div", CTLFLAG_RW, 1594 &rack_policer_bucket_reserve, 20, 1595 "What percentage is reserved in the policer bucket?"); 1596 SYSCTL_ADD_U64(&rack_sysctl_ctx, 1597 SYSCTL_CHILDREN(rack_policing), 1598 OID_AUTO, "min_comp_bw", CTLFLAG_RW, 1599 &rack_pol_min_bw, 125000, 1600 "Do we have a min b/w for b/w compensation (0 = no)?"); 1601 /* Misc rack controls */ 1602 rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1603 SYSCTL_CHILDREN(rack_sysctl_root), 1604 OID_AUTO, 1605 "misc", 1606 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1607 "Misc related controls"); 1608 #ifdef TCP_ACCOUNTING 1609 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1610 SYSCTL_CHILDREN(rack_misc), 1611 OID_AUTO, "tcp_acct", CTLFLAG_RW, 1612 &rack_tcp_accounting, 0, 1613 "Should we turn on TCP accounting for all rack sessions?"); 1614 #endif 1615 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1616 SYSCTL_CHILDREN(rack_misc), 1617 OID_AUTO, "dnd", CTLFLAG_RW, 1618 &rack_dnd_default, 0, 1619 "Do not disturb default for rack_rrr = 3"); 1620 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1621 SYSCTL_CHILDREN(rack_misc), 1622 OID_AUTO, "sad_seg_per", CTLFLAG_RW, 1623 &sad_seg_size_per, 800, 1624 "Percentage of segment size needed in a sack (800 = 80.0)?"); 1625 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1626 SYSCTL_CHILDREN(rack_misc), 1627 OID_AUTO, "rxt_controls", CTLFLAG_RW, 1628 &rack_rxt_controls, 0, 1629 "Retransmit sending size controls (valid values 0, 1, 2 default=1)?"); 1630 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1631 SYSCTL_CHILDREN(rack_misc), 1632 OID_AUTO, "rack_hibeta", CTLFLAG_RW, 1633 &rack_hibeta_setting, 0, 1634 "Do we use a high beta (80 instead of 50)?"); 1635 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1636 SYSCTL_CHILDREN(rack_misc), 1637 OID_AUTO, "apply_rtt_with_low_conf", CTLFLAG_RW, 1638 &rack_apply_rtt_with_reduced_conf, 0, 1639 "When a persist or keep-alive probe is not answered do we calculate rtt on subsequent answers?"); 1640 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1641 SYSCTL_CHILDREN(rack_misc), 1642 OID_AUTO, "rack_dsack_ctl", CTLFLAG_RW, 1643 &rack_dsack_std_based, 3, 1644 "How do we process dsack with respect to rack timers, bit field, 3 is standards based?"); 1645 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1646 SYSCTL_CHILDREN(rack_misc), 1647 OID_AUTO, "prr_addback_max", CTLFLAG_RW, 1648 &rack_prr_addbackmax, 2, 1649 "What is the maximum number of MSS we allow to be added back if prr can't send all its data?"); 1650 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1651 SYSCTL_CHILDREN(rack_misc), 1652 OID_AUTO, "stats_gets_ms", CTLFLAG_RW, 1653 &rack_stats_gets_ms_rtt, 1, 1654 "What do we feed the stats framework (1 = ms_rtt, 0 = us_rtt, 2 = ms_rtt from hdwr, > 2 usec rtt from hdwr)?"); 1655 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1656 SYSCTL_CHILDREN(rack_misc), 1657 OID_AUTO, "clientlowbuf", CTLFLAG_RW, 1658 &rack_client_low_buf, 0, 1659 "Client low buffer level (below this we are more aggressive in DGP exiting recovery) (0 = off)?"); 1660 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1661 SYSCTL_CHILDREN(rack_misc), 1662 OID_AUTO, "defprofile", CTLFLAG_RW, 1663 &rack_def_profile, 0, 1664 "Should RACK use a default profile (0=no, num == profile num)?"); 1665 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1666 SYSCTL_CHILDREN(rack_misc), 1667 OID_AUTO, "shared_cwnd", CTLFLAG_RW, 1668 &rack_enable_shared_cwnd, 1, 1669 "Should RACK try to use the shared cwnd on connections
where allowed"); 1670 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1671 SYSCTL_CHILDREN(rack_misc), 1672 OID_AUTO, "limits_on_scwnd", CTLFLAG_RW, 1673 &rack_limits_scwnd, 1, 1674 "Should RACK place low end time limits on the shared cwnd feature"); 1675 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1676 SYSCTL_CHILDREN(rack_misc), 1677 OID_AUTO, "no_prr", CTLFLAG_RW, 1678 &rack_disable_prr, 0, 1679 "Should RACK not use prr and only pace (must have pacing on)"); 1680 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1681 SYSCTL_CHILDREN(rack_misc), 1682 OID_AUTO, "bb_verbose", CTLFLAG_RW, 1683 &rack_verbose_logging, 0, 1684 "Should RACK black box logging be verbose"); 1685 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1686 SYSCTL_CHILDREN(rack_misc), 1687 OID_AUTO, "data_after_close", CTLFLAG_RW, 1688 &rack_ignore_data_after_close, 1, 1689 "Do we hold off sending a RST until all pending data is ack'd"); 1690 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1691 SYSCTL_CHILDREN(rack_misc), 1692 OID_AUTO, "no_sack_needed", CTLFLAG_RW, 1693 &rack_sack_not_required, 1, 1694 "Do we allow rack to run on connections not supporting SACK"); 1695 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1696 SYSCTL_CHILDREN(rack_misc), 1697 OID_AUTO, "prr_sendalot", CTLFLAG_RW, 1698 &rack_send_a_lot_in_prr, 1, 1699 "Send a lot in prr"); 1700 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1701 SYSCTL_CHILDREN(rack_misc), 1702 OID_AUTO, "autoscale", CTLFLAG_RW, 1703 &rack_autosndbuf_inc, 20, 1704 "What percentage should rack scale up its snd buffer by?"); 1705 1706 1707 /* Sack Attacker detection stuff */ 1708 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1709 SYSCTL_CHILDREN(rack_attack), 1710 OID_AUTO, "merge_out", CTLFLAG_RW, 1711 &rack_merge_out_sacks_on_attack, 0, 1712 "Do we merge the sendmap when we decide we are being attacked?"); 1713 1714 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1715 SYSCTL_CHILDREN(rack_attack), 1716 OID_AUTO, "detect_highsackratio", CTLFLAG_RW, 1717 &rack_highest_sack_thresh_seen, 0, 1718 "Highest sack to ack ratio seen"); 1719 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1720 SYSCTL_CHILDREN(rack_attack), 1721 OID_AUTO, "detect_highmoveratio", CTLFLAG_RW, 1722 &rack_highest_move_thresh_seen, 0, 1723 "Highest move to non-move ratio seen"); 1724 rack_ack_total = counter_u64_alloc(M_WAITOK); 1725 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1726 SYSCTL_CHILDREN(rack_attack), 1727 OID_AUTO, "acktotal", CTLFLAG_RD, 1728 &rack_ack_total, 1729 "Total number of Ack's"); 1730 rack_express_sack = counter_u64_alloc(M_WAITOK); 1731 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1732 SYSCTL_CHILDREN(rack_attack), 1733 OID_AUTO, "exp_sacktotal", CTLFLAG_RD, 1734 &rack_express_sack, 1735 "Total expresss number of Sack's"); 1736 rack_sack_total = counter_u64_alloc(M_WAITOK); 1737 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1738 SYSCTL_CHILDREN(rack_attack), 1739 OID_AUTO, "sacktotal", CTLFLAG_RD, 1740 &rack_sack_total, 1741 "Total number of SACKs"); 1742 rack_move_none = counter_u64_alloc(M_WAITOK); 1743 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1744 SYSCTL_CHILDREN(rack_attack), 1745 OID_AUTO, "move_none", CTLFLAG_RD, 1746 &rack_move_none, 1747 "Total number of SACK index reuse of positions under threshold"); 1748 rack_move_some = counter_u64_alloc(M_WAITOK); 1749 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1750 SYSCTL_CHILDREN(rack_attack), 1751 OID_AUTO, "move_some", CTLFLAG_RD, 1752 &rack_move_some, 1753 "Total number of SACK index reuse of positions over threshold"); 1754 rack_sack_attacks_detected = counter_u64_alloc(M_WAITOK); 1755 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1756 SYSCTL_CHILDREN(rack_attack), 1757 OID_AUTO, "attacks", 
CTLFLAG_RD, 1758 &rack_sack_attacks_detected, 1759 "Total number of SACK attackers that had sack disabled"); 1760 rack_sack_attacks_reversed = counter_u64_alloc(M_WAITOK); 1761 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1762 SYSCTL_CHILDREN(rack_attack), 1763 OID_AUTO, "reversed", CTLFLAG_RD, 1764 &rack_sack_attacks_reversed, 1765 "Total number of SACK attackers that were later determined false positive"); 1766 rack_sack_attacks_suspect = counter_u64_alloc(M_WAITOK); 1767 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1768 SYSCTL_CHILDREN(rack_attack), 1769 OID_AUTO, "suspect", CTLFLAG_RD, 1770 &rack_sack_attacks_suspect, 1771 "Total number of SACKs that triggered early detection"); 1772 1773 rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK); 1774 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1775 SYSCTL_CHILDREN(rack_attack), 1776 OID_AUTO, "nextmerge", CTLFLAG_RD, 1777 &rack_sack_used_next_merge, 1778 "Total number of times we used the next merge"); 1779 rack_sack_used_prev_merge = counter_u64_alloc(M_WAITOK); 1780 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1781 SYSCTL_CHILDREN(rack_attack), 1782 OID_AUTO, "prevmerge", CTLFLAG_RD, 1783 &rack_sack_used_prev_merge, 1784 "Total number of times we used the prev merge"); 1785 /* Counters */ 1786 rack_total_bytes = counter_u64_alloc(M_WAITOK); 1787 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1788 SYSCTL_CHILDREN(rack_counters), 1789 OID_AUTO, "totalbytes", CTLFLAG_RD, 1790 &rack_total_bytes, 1791 "Total number of bytes sent"); 1792 rack_fto_send = counter_u64_alloc(M_WAITOK); 1793 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1794 SYSCTL_CHILDREN(rack_counters), 1795 OID_AUTO, "fto_send", CTLFLAG_RD, 1796 &rack_fto_send, "Total number of rack_fast_output sends"); 1797 rack_fto_rsm_send = counter_u64_alloc(M_WAITOK); 1798 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1799 SYSCTL_CHILDREN(rack_counters), 1800 OID_AUTO, "fto_rsm_send", CTLFLAG_RD, 1801 &rack_fto_rsm_send, "Total number of rack_fast_rsm_output sends"); 1802 rack_nfto_resend = counter_u64_alloc(M_WAITOK); 1803 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1804 SYSCTL_CHILDREN(rack_counters), 1805 OID_AUTO, "nfto_resend", CTLFLAG_RD, 1806 &rack_nfto_resend, "Total number of rack_output retransmissions"); 1807 rack_non_fto_send = counter_u64_alloc(M_WAITOK); 1808 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1809 SYSCTL_CHILDREN(rack_counters), 1810 OID_AUTO, "nfto_send", CTLFLAG_RD, 1811 &rack_non_fto_send, "Total number of rack_output first sends"); 1812 rack_extended_rfo = counter_u64_alloc(M_WAITOK); 1813 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1814 SYSCTL_CHILDREN(rack_counters), 1815 OID_AUTO, "rfo_extended", CTLFLAG_RD, 1816 &rack_extended_rfo, "Total number of times we extended rfo"); 1817 1818 rack_hw_pace_init_fail = counter_u64_alloc(M_WAITOK); 1819 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1820 SYSCTL_CHILDREN(rack_counters), 1821 OID_AUTO, "hwpace_init_fail", CTLFLAG_RD, 1822 &rack_hw_pace_init_fail, "Total number of times we failed to initialize hw pacing"); 1823 rack_hw_pace_lost = counter_u64_alloc(M_WAITOK); 1824 1825 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1826 SYSCTL_CHILDREN(rack_counters), 1827 OID_AUTO, "hwpace_lost", CTLFLAG_RD, 1828 &rack_hw_pace_lost, "Total number of times we failed to initialize hw pacing"); 1829 rack_tlp_tot = counter_u64_alloc(M_WAITOK); 1830 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1831 SYSCTL_CHILDREN(rack_counters), 1832 OID_AUTO, "tlp_to_total", CTLFLAG_RD, 1833 &rack_tlp_tot, 1834 "Total number of tail loss probe expirations"); 1835 rack_tlp_newdata = 
counter_u64_alloc(M_WAITOK); 1836 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1837 SYSCTL_CHILDREN(rack_counters), 1838 OID_AUTO, "tlp_new", CTLFLAG_RD, 1839 &rack_tlp_newdata, 1840 "Total number of tail loss probe sending new data"); 1841 rack_tlp_retran = counter_u64_alloc(M_WAITOK); 1842 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1843 SYSCTL_CHILDREN(rack_counters), 1844 OID_AUTO, "tlp_retran", CTLFLAG_RD, 1845 &rack_tlp_retran, 1846 "Total number of tail loss probe sending retransmitted data"); 1847 rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK); 1848 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1849 SYSCTL_CHILDREN(rack_counters), 1850 OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD, 1851 &rack_tlp_retran_bytes, 1852 "Total bytes of tail loss probe sending retransmitted data"); 1853 rack_to_tot = counter_u64_alloc(M_WAITOK); 1854 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1855 SYSCTL_CHILDREN(rack_counters), 1856 OID_AUTO, "rack_to_tot", CTLFLAG_RD, 1857 &rack_to_tot, 1858 "Total number of times the rack timeout expired"); 1859 rack_saw_enobuf = counter_u64_alloc(M_WAITOK); 1860 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1861 SYSCTL_CHILDREN(rack_counters), 1862 OID_AUTO, "saw_enobufs", CTLFLAG_RD, 1863 &rack_saw_enobuf, 1864 "Total number of times a send returned enobuf for non-hdwr paced connections"); 1865 rack_saw_enobuf_hw = counter_u64_alloc(M_WAITOK); 1866 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1867 SYSCTL_CHILDREN(rack_counters), 1868 OID_AUTO, "saw_enobufs_hw", CTLFLAG_RD, 1869 &rack_saw_enobuf_hw, 1870 "Total number of times a send returned enobuf for hdwr paced connections"); 1871 rack_saw_enetunreach = counter_u64_alloc(M_WAITOK); 1872 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1873 SYSCTL_CHILDREN(rack_counters), 1874 OID_AUTO, "saw_enetunreach", CTLFLAG_RD, 1875 &rack_saw_enetunreach, 1876 "Total number of times a send received an enetunreachable"); 1877 rack_hot_alloc = counter_u64_alloc(M_WAITOK); 1878 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1879 SYSCTL_CHILDREN(rack_counters), 1880 OID_AUTO, "alloc_hot", CTLFLAG_RD, 1881 &rack_hot_alloc, 1882 "Total allocations from the top of our list"); 1883 tcp_policer_detected = counter_u64_alloc(M_WAITOK); 1884 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1885 SYSCTL_CHILDREN(rack_counters), 1886 OID_AUTO, "policer_detected", CTLFLAG_RD, 1887 &tcp_policer_detected, 1888 "Total number of policer detections"); 1889 1890 rack_to_alloc = counter_u64_alloc(M_WAITOK); 1891 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1892 SYSCTL_CHILDREN(rack_counters), 1893 OID_AUTO, "allocs", CTLFLAG_RD, 1894 &rack_to_alloc, 1895 "Total allocations of tracking structures"); 1896 rack_to_alloc_hard = counter_u64_alloc(M_WAITOK); 1897 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1898 SYSCTL_CHILDREN(rack_counters), 1899 OID_AUTO, "allochard", CTLFLAG_RD, 1900 &rack_to_alloc_hard, 1901 "Total allocations done with sleeping the hard way"); 1902 rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK); 1903 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1904 SYSCTL_CHILDREN(rack_counters), 1905 OID_AUTO, "allocemerg", CTLFLAG_RD, 1906 &rack_to_alloc_emerg, 1907 "Total allocations done from emergency cache"); 1908 rack_to_alloc_limited = counter_u64_alloc(M_WAITOK); 1909 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1910 SYSCTL_CHILDREN(rack_counters), 1911 OID_AUTO, "alloc_limited", CTLFLAG_RD, 1912 &rack_to_alloc_limited, 1913 "Total allocations dropped due to limit"); 1914 rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK); 1915 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1916 SYSCTL_CHILDREN(rack_counters),
1917 OID_AUTO, "alloc_limited_conns", CTLFLAG_RD, 1918 &rack_alloc_limited_conns, 1919 "Connections with allocations dropped due to limit"); 1920 rack_split_limited = counter_u64_alloc(M_WAITOK); 1921 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1922 SYSCTL_CHILDREN(rack_counters), 1923 OID_AUTO, "split_limited", CTLFLAG_RD, 1924 &rack_split_limited, 1925 "Split allocations dropped due to limit"); 1926 rack_rxt_clamps_cwnd = counter_u64_alloc(M_WAITOK); 1927 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1928 SYSCTL_CHILDREN(rack_counters), 1929 OID_AUTO, "rxt_clamps_cwnd", CTLFLAG_RD, 1930 &rack_rxt_clamps_cwnd, 1931 "Number of times that excessive rxt clamped the cwnd down"); 1932 rack_rxt_clamps_cwnd_uniq = counter_u64_alloc(M_WAITOK); 1933 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1934 SYSCTL_CHILDREN(rack_counters), 1935 OID_AUTO, "rxt_clamps_cwnd_uniq", CTLFLAG_RD, 1936 &rack_rxt_clamps_cwnd_uniq, 1937 "Number of connections that have had the cwnd clamped down due to excessive rxt"); 1938 rack_persists_sends = counter_u64_alloc(M_WAITOK); 1939 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1940 SYSCTL_CHILDREN(rack_counters), 1941 OID_AUTO, "persist_sends", CTLFLAG_RD, 1942 &rack_persists_sends, 1943 "Number of times we sent a persist probe"); 1944 rack_persists_acks = counter_u64_alloc(M_WAITOK); 1945 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1946 SYSCTL_CHILDREN(rack_counters), 1947 OID_AUTO, "persist_acks", CTLFLAG_RD, 1948 &rack_persists_acks, 1949 "Number of times a persist probe was acked"); 1950 rack_persists_loss = counter_u64_alloc(M_WAITOK); 1951 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1952 SYSCTL_CHILDREN(rack_counters), 1953 OID_AUTO, "persist_loss", CTLFLAG_RD, 1954 &rack_persists_loss, 1955 "Number of times we detected a lost persist probe (no ack)"); 1956 rack_persists_lost_ends = counter_u64_alloc(M_WAITOK); 1957 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1958 SYSCTL_CHILDREN(rack_counters), 1959 OID_AUTO, "persist_loss_ends", CTLFLAG_RD, 1960 &rack_persists_lost_ends, 1961 "Number of lost persist probes (no ack) where the run ended with a PERSIST abort"); 1962 #ifdef INVARIANTS 1963 rack_adjust_map_bw = counter_u64_alloc(M_WAITOK); 1964 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1965 SYSCTL_CHILDREN(rack_counters), 1966 OID_AUTO, "map_adjust_req", CTLFLAG_RD, 1967 &rack_adjust_map_bw, 1968 "Number of times we hit the case where the sb went up and down on a sendmap entry"); 1969 #endif 1970 rack_multi_single_eq = counter_u64_alloc(M_WAITOK); 1971 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1972 SYSCTL_CHILDREN(rack_counters), 1973 OID_AUTO, "cmp_ack_equiv", CTLFLAG_RD, 1974 &rack_multi_single_eq, 1975 "Total number of acks represented by compressed acks"); 1976 rack_proc_non_comp_ack = counter_u64_alloc(M_WAITOK); 1977 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1978 SYSCTL_CHILDREN(rack_counters), 1979 OID_AUTO, "cmp_ack_not", CTLFLAG_RD, 1980 &rack_proc_non_comp_ack, 1981 "Number of non-compressed acks that we processed"); 1982 1983 1984 rack_sack_proc_all = counter_u64_alloc(M_WAITOK); 1985 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1986 SYSCTL_CHILDREN(rack_counters), 1987 OID_AUTO, "sack_long", CTLFLAG_RD, 1988 &rack_sack_proc_all, 1989 "Total times we had to walk the whole list for sack processing"); 1990 rack_sack_proc_restart = counter_u64_alloc(M_WAITOK); 1991 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1992 SYSCTL_CHILDREN(rack_counters), 1993 OID_AUTO, "sack_restart", CTLFLAG_RD, 1994 &rack_sack_proc_restart, 1995 "Total times we had to walk the whole list due to a restart"); 1996 rack_sack_proc_short =
counter_u64_alloc(M_WAITOK); 1997 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1998 SYSCTL_CHILDREN(rack_counters), 1999 OID_AUTO, "sack_short", CTLFLAG_RD, 2000 &rack_sack_proc_short, 2001 "Total times we took shortcut for sack processing"); 2002 rack_sack_skipped_acked = counter_u64_alloc(M_WAITOK); 2003 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 2004 SYSCTL_CHILDREN(rack_attack), 2005 OID_AUTO, "skipacked", CTLFLAG_RD, 2006 &rack_sack_skipped_acked, 2007 "Total number of times we skipped previously sacked"); 2008 rack_sack_splits = counter_u64_alloc(M_WAITOK); 2009 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 2010 SYSCTL_CHILDREN(rack_attack), 2011 OID_AUTO, "ofsplit", CTLFLAG_RD, 2012 &rack_sack_splits, 2013 "Total number of times we did the old fashion tree split"); 2014 rack_input_idle_reduces = counter_u64_alloc(M_WAITOK); 2015 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 2016 SYSCTL_CHILDREN(rack_counters), 2017 OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD, 2018 &rack_input_idle_reduces, 2019 "Total number of idle reductions on input"); 2020 rack_collapsed_win_seen = counter_u64_alloc(M_WAITOK); 2021 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 2022 SYSCTL_CHILDREN(rack_counters), 2023 OID_AUTO, "collapsed_win_seen", CTLFLAG_RD, 2024 &rack_collapsed_win_seen, 2025 "Total number of collapsed window events seen (where our window shrinks)"); 2026 2027 rack_collapsed_win = counter_u64_alloc(M_WAITOK); 2028 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 2029 SYSCTL_CHILDREN(rack_counters), 2030 OID_AUTO, "collapsed_win", CTLFLAG_RD, 2031 &rack_collapsed_win, 2032 "Total number of collapsed window events where we mark packets"); 2033 rack_collapsed_win_rxt = counter_u64_alloc(M_WAITOK); 2034 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 2035 SYSCTL_CHILDREN(rack_counters), 2036 OID_AUTO, "collapsed_win_rxt", CTLFLAG_RD, 2037 &rack_collapsed_win_rxt, 2038 "Total number of packets that were retransmitted"); 2039 rack_collapsed_win_rxt_bytes = counter_u64_alloc(M_WAITOK); 2040 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 2041 SYSCTL_CHILDREN(rack_counters), 2042 OID_AUTO, "collapsed_win_bytes", CTLFLAG_RD, 2043 &rack_collapsed_win_rxt_bytes, 2044 "Total number of bytes that were retransmitted"); 2045 rack_try_scwnd = counter_u64_alloc(M_WAITOK); 2046 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 2047 SYSCTL_CHILDREN(rack_counters), 2048 OID_AUTO, "tried_scwnd", CTLFLAG_RD, 2049 &rack_try_scwnd, 2050 "Total number of scwnd attempts"); 2051 COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); 2052 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 2053 OID_AUTO, "outsize", CTLFLAG_RD, 2054 rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes"); 2055 COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK); 2056 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 2057 OID_AUTO, "opts", CTLFLAG_RD, 2058 rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats"); 2059 SYSCTL_ADD_PROC(&rack_sysctl_ctx, 2060 SYSCTL_CHILDREN(rack_sysctl_root), 2061 OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 2062 &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters"); 2063 } 2064 2065 static uint32_t 2066 rc_init_window(struct tcp_rack *rack) 2067 { 2068 return (tcp_compute_initwnd(tcp_maxseg(rack->rc_tp))); 2069 2070 } 2071 2072 static uint64_t 2073 rack_get_fixed_pacing_bw(struct tcp_rack *rack) 2074 { 2075 if (IN_FASTRECOVERY(rack->rc_tp->t_flags)) 2076 return (rack->r_ctl.rc_fixed_pacing_rate_rec); 2077 else if (rack->r_ctl.cwnd_to_use < 
rack->rc_tp->snd_ssthresh) 2078 return (rack->r_ctl.rc_fixed_pacing_rate_ss); 2079 else 2080 return (rack->r_ctl.rc_fixed_pacing_rate_ca); 2081 } 2082 2083 static void 2084 rack_log_hybrid_bw(struct tcp_rack *rack, uint32_t seq, uint64_t cbw, uint64_t tim, 2085 uint64_t data, uint8_t mod, uint16_t aux, 2086 struct tcp_sendfile_track *cur, int line) 2087 { 2088 #ifdef TCP_REQUEST_TRK 2089 int do_log = 0; 2090 2091 /* 2092 * The rate cap one is noisy and only should come out when normal BB logging 2093 * is enabled, the other logs (not RATE_CAP and NOT CAP_CALC) only come out 2094 * once per chunk and make up the BBpoint that can be turned on by the client. 2095 */ 2096 if ((mod == HYBRID_LOG_RATE_CAP) || (mod == HYBRID_LOG_CAP_CALC)) { 2097 /* 2098 * The very noisy two need to only come out when 2099 * we have verbose logging on. 2100 */ 2101 if (rack_verbose_logging != 0) 2102 do_log = tcp_bblogging_on(rack->rc_tp); 2103 else 2104 do_log = 0; 2105 } else if (mod != HYBRID_LOG_BW_MEASURE) { 2106 /* 2107 * All other less noisy logs here except the measure which 2108 * also needs to come out on the point and the log. 2109 */ 2110 do_log = tcp_bblogging_on(rack->rc_tp); 2111 } else { 2112 do_log = tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING); 2113 } 2114 2115 if (do_log) { 2116 union tcp_log_stackspecific log; 2117 struct timeval tv; 2118 uint64_t lt_bw; 2119 2120 /* Convert our ms to a microsecond */ 2121 memset(&log, 0, sizeof(log)); 2122 2123 log.u_bbr.cwnd_gain = line; 2124 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2125 log.u_bbr.rttProp = tim; 2126 log.u_bbr.bw_inuse = cbw; 2127 log.u_bbr.delRate = rack_get_gp_est(rack); 2128 lt_bw = rack_get_lt_bw(rack); 2129 log.u_bbr.flex1 = seq; 2130 log.u_bbr.pacing_gain = aux; 2131 /* lt_bw = < flex3 | flex2 > */ 2132 log.u_bbr.flex2 = (uint32_t)(lt_bw & 0x00000000ffffffff); 2133 log.u_bbr.flex3 = (uint32_t)((lt_bw >> 32) & 0x00000000ffffffff); 2134 /* Record the last obtained us rtt in inflight */ 2135 if (cur == NULL) { 2136 /* Make sure we are looking at the right log if an overide comes in */ 2137 cur = rack->r_ctl.rc_last_sft; 2138 } 2139 if (rack->r_ctl.rack_rs.rs_flags != RACK_RTT_EMPTY) 2140 log.u_bbr.inflight = rack->r_ctl.rack_rs.rs_us_rtt; 2141 else { 2142 /* Use the last known rtt i.e. 
the rack-rtt */ 2143 log.u_bbr.inflight = rack->rc_rack_rtt; 2144 } 2145 if (cur != NULL) { 2146 uint64_t off; 2147 2148 log.u_bbr.cur_del_rate = cur->deadline; 2149 if ((mod == HYBRID_LOG_RATE_CAP) || (mod == HYBRID_LOG_CAP_CALC)) { 2150 /* start = < lost | pkt_epoch > */ 2151 log.u_bbr.pkt_epoch = (uint32_t)(cur->start & 0x00000000ffffffff); 2152 log.u_bbr.lost = (uint32_t)((cur->start >> 32) & 0x00000000ffffffff); 2153 log.u_bbr.flex6 = cur->start_seq; 2154 log.u_bbr.pkts_out = cur->end_seq; 2155 } else { 2156 /* start = < lost | pkt_epoch > */ 2157 log.u_bbr.pkt_epoch = (uint32_t)(cur->start & 0x00000000ffffffff); 2158 log.u_bbr.lost = (uint32_t)((cur->start >> 32) & 0x00000000ffffffff); 2159 /* end = < pkts_out | flex6 > */ 2160 log.u_bbr.flex6 = (uint32_t)(cur->end & 0x00000000ffffffff); 2161 log.u_bbr.pkts_out = (uint32_t)((cur->end >> 32) & 0x00000000ffffffff); 2162 } 2163 /* first_send = <lt_epoch | epoch> */ 2164 log.u_bbr.epoch = (uint32_t)(cur->first_send & 0x00000000ffffffff); 2165 log.u_bbr.lt_epoch = (uint32_t)((cur->first_send >> 32) & 0x00000000ffffffff); 2166 /* localtime = <delivered | applimited>*/ 2167 log.u_bbr.applimited = (uint32_t)(cur->localtime & 0x00000000ffffffff); 2168 log.u_bbr.delivered = (uint32_t)((cur->localtime >> 32) & 0x00000000ffffffff); 2169 #ifdef TCP_REQUEST_TRK 2170 off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_tcpreq_info[0]); 2171 log.u_bbr.bbr_substate = (uint8_t)(off / sizeof(struct tcp_sendfile_track)); 2172 #endif 2173 log.u_bbr.inhpts = 1; 2174 log.u_bbr.flex4 = (uint32_t)(rack->rc_tp->t_sndbytes - cur->sent_at_fs); 2175 log.u_bbr.flex5 = (uint32_t)(rack->rc_tp->t_snd_rxt_bytes - cur->rxt_at_fs); 2176 log.u_bbr.flex7 = (uint16_t)cur->hybrid_flags; 2177 } else { 2178 log.u_bbr.flex7 = 0xffff; 2179 log.u_bbr.cur_del_rate = 0xffffffffffffffff; 2180 } 2181 /* 2182 * Compose bbr_state to be a bit wise 0000ADHF 2183 * where A is the always_pace flag 2184 * where D is the dgp_on flag 2185 * where H is the hybrid_mode on flag 2186 * where F is the use_fixed_rate flag. 2187 */ 2188 log.u_bbr.bbr_state = rack->rc_always_pace; 2189 log.u_bbr.bbr_state <<= 1; 2190 log.u_bbr.bbr_state |= rack->dgp_on; 2191 log.u_bbr.bbr_state <<= 1; 2192 log.u_bbr.bbr_state |= rack->rc_hybrid_mode; 2193 log.u_bbr.bbr_state <<= 1; 2194 log.u_bbr.bbr_state |= rack->use_fixed_rate; 2195 log.u_bbr.flex8 = mod; 2196 tcp_log_event(rack->rc_tp, NULL, 2197 &rack->rc_inp->inp_socket->so_rcv, 2198 &rack->rc_inp->inp_socket->so_snd, 2199 TCP_HYBRID_PACING_LOG, 0, 2200 0, &log, false, NULL, __func__, __LINE__, &tv); 2201 2202 } 2203 #endif 2204 } 2205 2206 #ifdef TCP_REQUEST_TRK 2207 static void 2208 rack_log_hybrid_sends(struct tcp_rack *rack, struct tcp_sendfile_track *cur, int line) 2209 { 2210 if (tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING)) { 2211 union tcp_log_stackspecific log; 2212 struct timeval tv; 2213 uint64_t off; 2214 2215 /* Convert our ms to a microsecond */ 2216 memset(&log, 0, sizeof(log)); 2217 2218 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2219 log.u_bbr.delRate = cur->sent_at_fs; 2220 2221 if ((cur->flags & TCP_TRK_TRACK_FLG_LSND) == 0) { 2222 /* 2223 * We did not get a new Rules Applied to set so 2224 * no overlapping send occured, this means the 2225 * current byte counts are correct. 2226 */ 2227 log.u_bbr.cur_del_rate = rack->rc_tp->t_sndbytes; 2228 log.u_bbr.rttProp = rack->rc_tp->t_snd_rxt_bytes; 2229 } else { 2230 /* 2231 * Overlapping send case, we switched to a new 2232 * send and did a rules applied. 
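 * In that case sent_at_ls / rxt_at_ls appear to hold the byte counters
 * snapshotted when we switched to this send (an assumption based on the
 * field names), so they are logged instead of the connection-wide totals.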
2233 */ 2234 log.u_bbr.cur_del_rate = cur->sent_at_ls; 2235 log.u_bbr.rttProp = cur->rxt_at_ls; 2236 } 2237 log.u_bbr.bw_inuse = cur->rxt_at_fs; 2238 log.u_bbr.cwnd_gain = line; 2239 off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_tcpreq_info[0]); 2240 log.u_bbr.bbr_substate = (uint8_t)(off / sizeof(struct tcp_sendfile_track)); 2241 /* start = < flex1 | flex2 > */ 2242 log.u_bbr.flex2 = (uint32_t)(cur->start & 0x00000000ffffffff); 2243 log.u_bbr.flex1 = (uint32_t)((cur->start >> 32) & 0x00000000ffffffff); 2244 /* end = < flex3 | flex4 > */ 2245 log.u_bbr.flex4 = (uint32_t)(cur->end & 0x00000000ffffffff); 2246 log.u_bbr.flex3 = (uint32_t)((cur->end >> 32) & 0x00000000ffffffff); 2247 2248 /* localtime = <delivered | applimited>*/ 2249 log.u_bbr.applimited = (uint32_t)(cur->localtime & 0x00000000ffffffff); 2250 log.u_bbr.delivered = (uint32_t)((cur->localtime >> 32) & 0x00000000ffffffff); 2251 /* client timestamp = <lt_epoch | epoch>*/ 2252 log.u_bbr.epoch = (uint32_t)(cur->timestamp & 0x00000000ffffffff); 2253 log.u_bbr.lt_epoch = (uint32_t)((cur->timestamp >> 32) & 0x00000000ffffffff); 2254 /* now set all the flags in */ 2255 log.u_bbr.pkts_out = cur->hybrid_flags; 2256 log.u_bbr.lost = cur->playout_ms; 2257 log.u_bbr.flex6 = cur->flags; 2258 /* 2259 * Last send time = <flex5 | pkt_epoch> note we do not distinguish cases 2260 * where a false retransmit occurred so first_send <-> lastsend may 2261 * include longer time then it actually took if we have a false rxt. 2262 */ 2263 log.u_bbr.pkt_epoch = (uint32_t)(rack->r_ctl.last_tmit_time_acked & 0x00000000ffffffff); 2264 log.u_bbr.flex5 = (uint32_t)((rack->r_ctl.last_tmit_time_acked >> 32) & 0x00000000ffffffff); 2265 /* 2266 * Compose bbr_state to be a bit wise 0000ADHF 2267 * where A is the always_pace flag 2268 * where D is the dgp_on flag 2269 * where H is the hybrid_mode on flag 2270 * where F is the use_fixed_rate flag. 
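 * Illustrative example: always_pace=1, dgp_on=0, rc_hybrid_mode=1 and
 * use_fixed_rate=0 compose, via the shift/or steps below, to
 * bbr_state = 0b1010 (0xa).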
2271 */ 2272 log.u_bbr.bbr_state = rack->rc_always_pace; 2273 log.u_bbr.bbr_state <<= 1; 2274 log.u_bbr.bbr_state |= rack->dgp_on; 2275 log.u_bbr.bbr_state <<= 1; 2276 log.u_bbr.bbr_state |= rack->rc_hybrid_mode; 2277 log.u_bbr.bbr_state <<= 1; 2278 log.u_bbr.bbr_state |= rack->use_fixed_rate; 2279 2280 log.u_bbr.flex8 = HYBRID_LOG_SENT_LOST; 2281 tcp_log_event(rack->rc_tp, NULL, 2282 &rack->rc_inp->inp_socket->so_rcv, 2283 &rack->rc_inp->inp_socket->so_snd, 2284 TCP_HYBRID_PACING_LOG, 0, 2285 0, &log, false, NULL, __func__, __LINE__, &tv); 2286 } 2287 } 2288 #endif 2289 2290 static inline uint64_t 2291 rack_compensate_for_linerate(struct tcp_rack *rack, uint64_t bw) 2292 { 2293 uint64_t ret_bw, ether; 2294 uint64_t u_segsiz; 2295 2296 ether = rack->rc_tp->t_maxseg + sizeof(struct tcphdr); 2297 if (rack->r_is_v6){ 2298 #ifdef INET6 2299 ether += sizeof(struct ip6_hdr); 2300 #endif 2301 ether += 14; /* eheader size 6+6+2 */ 2302 } else { 2303 #ifdef INET 2304 ether += sizeof(struct ip); 2305 #endif 2306 ether += 14; /* eheader size 6+6+2 */ 2307 } 2308 u_segsiz = (uint64_t)min(ctf_fixed_maxseg(rack->rc_tp), rack->r_ctl.rc_pace_min_segs); 2309 ret_bw = bw; 2310 ret_bw *= ether; 2311 ret_bw /= u_segsiz; 2312 return (ret_bw); 2313 } 2314 2315 static void 2316 rack_rate_cap_bw(struct tcp_rack *rack, uint64_t *bw, int *capped) 2317 { 2318 #ifdef TCP_REQUEST_TRK 2319 struct timeval tv; 2320 uint64_t timenow, timeleft, lenleft, lengone, calcbw; 2321 #endif 2322 2323 if (rack->r_ctl.bw_rate_cap == 0) 2324 return; 2325 #ifdef TCP_REQUEST_TRK 2326 if (rack->rc_catch_up && rack->rc_hybrid_mode && 2327 (rack->r_ctl.rc_last_sft != NULL)) { 2328 /* 2329 * We have a dynamic cap. The original target 2330 * is in bw_rate_cap, but we need to look at 2331 * how long it is until we hit the deadline. 2332 */ 2333 struct tcp_sendfile_track *ent; 2334 2335 ent = rack->r_ctl.rc_last_sft; 2336 microuptime(&tv); 2337 timenow = tcp_tv_to_lusectick(&tv); 2338 if (timenow >= ent->deadline) { 2339 /* No time left we do DGP only */ 2340 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2341 0, 0, 0, HYBRID_LOG_OUTOFTIME, 0, ent, __LINE__); 2342 rack->r_ctl.bw_rate_cap = 0; 2343 return; 2344 } 2345 /* We have the time */ 2346 timeleft = rack->r_ctl.rc_last_sft->deadline - timenow; 2347 if (timeleft < HPTS_MSEC_IN_SEC) { 2348 /* If there is less than a ms left just use DGPs rate */ 2349 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2350 0, timeleft, 0, HYBRID_LOG_OUTOFTIME, 0, ent, __LINE__); 2351 rack->r_ctl.bw_rate_cap = 0; 2352 return; 2353 } 2354 /* 2355 * Now lets find the amount of data left to send. 2356 * 2357 * Now ideally we want to use the end_seq to figure out how much more 2358 * but it might not be possible (only if we have the TRACK_FG_COMP on the entry.. 2359 */ 2360 if (ent->flags & TCP_TRK_TRACK_FLG_COMP) { 2361 if (SEQ_GT(ent->end_seq, rack->rc_tp->snd_una)) 2362 lenleft = ent->end_seq - rack->rc_tp->snd_una; 2363 else { 2364 /* TSNH, we should catch it at the send */ 2365 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2366 0, timeleft, 0, HYBRID_LOG_CAPERROR, 0, ent, __LINE__); 2367 rack->r_ctl.bw_rate_cap = 0; 2368 return; 2369 } 2370 } else { 2371 /* 2372 * The hard way, figure out how much is gone and then 2373 * take that away from the total the client asked for 2374 * (thats off by tls overhead if this is tls). 
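 * Illustrative numbers: if snd_una has advanced 4000 bytes past
 * start_seq (lengone = 4000) and the request spans end - start = 64000
 * bytes, then lenleft = 60000 bytes remain to be sent.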
2375 */ 2376 if (SEQ_GT(rack->rc_tp->snd_una, ent->start_seq)) 2377 lengone = rack->rc_tp->snd_una - ent->start_seq; 2378 else 2379 lengone = 0; 2380 if (lengone < (ent->end - ent->start)) 2381 lenleft = (ent->end - ent->start) - lengone; 2382 else { 2383 /* TSNH, we should catch it at the send */ 2384 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2385 0, timeleft, lengone, HYBRID_LOG_CAPERROR, 0, ent, __LINE__); 2386 rack->r_ctl.bw_rate_cap = 0; 2387 return; 2388 } 2389 } 2390 if (lenleft == 0) { 2391 /* We have it all sent */ 2392 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2393 0, timeleft, lenleft, HYBRID_LOG_ALLSENT, 0, ent, __LINE__); 2394 if (rack->r_ctl.bw_rate_cap) 2395 goto normal_ratecap; 2396 else 2397 return; 2398 } 2399 calcbw = lenleft * HPTS_USEC_IN_SEC; 2400 calcbw /= timeleft; 2401 /* Now we must compensate for IP/TCP overhead */ 2402 calcbw = rack_compensate_for_linerate(rack, calcbw); 2403 /* Update the bit rate cap */ 2404 rack->r_ctl.bw_rate_cap = calcbw; 2405 if ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_S_MSS) && 2406 (rack_hybrid_allow_set_maxseg == 1) && 2407 ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_SETMSS) == 0)) { 2408 /* Lets set in a smaller mss possibly here to match our rate-cap */ 2409 uint32_t orig_max; 2410 2411 orig_max = rack->r_ctl.rc_pace_max_segs; 2412 rack->r_ctl.rc_last_sft->hybrid_flags |= TCP_HYBRID_PACING_SETMSS; 2413 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, calcbw, ctf_fixed_maxseg(rack->rc_tp)); 2414 rack_log_type_pacing_sizes(rack->rc_tp, rack, rack->r_ctl.client_suggested_maxseg, orig_max, __LINE__, 5); 2415 } 2416 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2417 calcbw, timeleft, lenleft, HYBRID_LOG_CAP_CALC, 0, ent, __LINE__); 2418 if ((calcbw > 0) && (*bw > calcbw)) { 2419 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2420 *bw, ent->deadline, lenleft, HYBRID_LOG_RATE_CAP, 0, ent, __LINE__); 2421 *capped = 1; 2422 *bw = calcbw; 2423 } 2424 return; 2425 } 2426 normal_ratecap: 2427 #endif 2428 if ((rack->r_ctl.bw_rate_cap > 0) && (*bw > rack->r_ctl.bw_rate_cap)) { 2429 #ifdef TCP_REQUEST_TRK 2430 if (rack->rc_hybrid_mode && 2431 rack->rc_catch_up && 2432 (rack->r_ctl.rc_last_sft != NULL) && 2433 (rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_S_MSS) && 2434 (rack_hybrid_allow_set_maxseg == 1) && 2435 ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_SETMSS) == 0)) { 2436 /* Lets set in a smaller mss possibly here to match our rate-cap */ 2437 uint32_t orig_max; 2438 2439 orig_max = rack->r_ctl.rc_pace_max_segs; 2440 rack->r_ctl.rc_last_sft->hybrid_flags |= TCP_HYBRID_PACING_SETMSS; 2441 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, rack->r_ctl.bw_rate_cap, ctf_fixed_maxseg(rack->rc_tp)); 2442 rack_log_type_pacing_sizes(rack->rc_tp, rack, rack->r_ctl.client_suggested_maxseg, orig_max, __LINE__, 5); 2443 } 2444 #endif 2445 *capped = 1; 2446 *bw = rack->r_ctl.bw_rate_cap; 2447 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2448 *bw, 0, 0, 2449 HYBRID_LOG_RATE_CAP, 1, NULL, __LINE__); 2450 } 2451 } 2452 2453 static uint64_t 2454 rack_get_gp_est(struct tcp_rack *rack) 2455 { 2456 uint64_t bw, lt_bw, ret_bw; 2457 2458 if (rack->rc_gp_filled == 0) { 2459 /* 2460 * We have yet no b/w measurement, 2461 * if we have a user set initial bw 2462 * return it. If we don't have that and 2463 * we have an srtt, use the tcp IW (10) to 2464 * calculate a fictional b/w over the SRTT 2465 * which is more or less a guess. 
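 * (Rough illustration: a 14480 byte initial window paced over a
 * 20000 usec srtt gives 14480 * 1000000 / 20000 = 724000 bytes/sec.)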
Note 2466 * we don't use our IW from rack on purpose 2467 * so if we have like IW=30, we are not 2468 * calculating a "huge" b/w. 2469 */ 2470 uint64_t srtt; 2471 2472 if (rack->dis_lt_bw == 1) 2473 lt_bw = 0; 2474 else 2475 lt_bw = rack_get_lt_bw(rack); 2476 if (lt_bw) { 2477 /* 2478 * No goodput bw but a long-term b/w does exist 2479 * lets use that. 2480 */ 2481 ret_bw = lt_bw; 2482 goto compensate; 2483 } 2484 if (rack->r_ctl.init_rate) 2485 return (rack->r_ctl.init_rate); 2486 2487 /* Ok lets come up with the IW guess, if we have a srtt */ 2488 if (rack->rc_tp->t_srtt == 0) { 2489 /* 2490 * Go with old pacing method 2491 * i.e. burst mitigation only. 2492 */ 2493 return (0); 2494 } 2495 /* Ok lets get the initial TCP win (not racks) */ 2496 bw = tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)); 2497 srtt = (uint64_t)rack->rc_tp->t_srtt; 2498 bw *= (uint64_t)USECS_IN_SECOND; 2499 bw /= srtt; 2500 ret_bw = bw; 2501 goto compensate; 2502 2503 } 2504 if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) { 2505 /* Averaging is done, we can return the value */ 2506 bw = rack->r_ctl.gp_bw; 2507 } else { 2508 /* Still doing initial average must calculate */ 2509 bw = rack->r_ctl.gp_bw / max(rack->r_ctl.num_measurements, 1); 2510 } 2511 if (rack->dis_lt_bw) { 2512 /* We are not using lt-bw */ 2513 ret_bw = bw; 2514 goto compensate; 2515 } 2516 lt_bw = rack_get_lt_bw(rack); 2517 if (lt_bw == 0) { 2518 /* If we don't have one then equate it to the gp_bw */ 2519 lt_bw = rack->r_ctl.gp_bw; 2520 } 2521 if (rack->use_lesser_lt_bw) { 2522 if (lt_bw < bw) 2523 ret_bw = lt_bw; 2524 else 2525 ret_bw = bw; 2526 } else { 2527 if (lt_bw > bw) 2528 ret_bw = lt_bw; 2529 else 2530 ret_bw = bw; 2531 } 2532 /* 2533 * Now lets compensate based on the TCP/IP overhead. Our 2534 * Goodput estimate does not include this so we must pace out 2535 * a bit faster since our pacing calculations do. The pacing 2536 * calculations use the base ETHERNET_SEGMENT_SIZE and the segsiz 2537 * we are using to do this, so we do that here in the opposite 2538 * direction as well. This means that if we are tunneled and the 2539 * segsiz is say 1200 bytes we will get quite a boost, but its 2540 * compensated for in the pacing time the opposite way. 2541 */ 2542 compensate: 2543 ret_bw = rack_compensate_for_linerate(rack, ret_bw); 2544 return(ret_bw); 2545 } 2546 2547 2548 static uint64_t 2549 rack_get_bw(struct tcp_rack *rack) 2550 { 2551 uint64_t bw; 2552 2553 if (rack->use_fixed_rate) { 2554 /* Return the fixed pacing rate */ 2555 return (rack_get_fixed_pacing_bw(rack)); 2556 } 2557 bw = rack_get_gp_est(rack); 2558 return (bw); 2559 } 2560 2561 static uint16_t 2562 rack_get_output_gain(struct tcp_rack *rack, struct rack_sendmap *rsm) 2563 { 2564 if (rack->use_fixed_rate) { 2565 return (100); 2566 } else if (rack->in_probe_rtt && (rsm == NULL)) 2567 return (rack->r_ctl.rack_per_of_gp_probertt); 2568 else if ((IN_FASTRECOVERY(rack->rc_tp->t_flags) && 2569 rack->r_ctl.rack_per_of_gp_rec)) { 2570 if (rsm) { 2571 /* a retransmission always use the recovery rate */ 2572 return (rack->r_ctl.rack_per_of_gp_rec); 2573 } else if (rack->rack_rec_nonrxt_use_cr) { 2574 /* Directed to use the configured rate */ 2575 goto configured_rate; 2576 } else if (rack->rack_no_prr && 2577 (rack->r_ctl.rack_per_of_gp_rec > 100)) { 2578 /* No PRR, lets just use the b/w estimate only */ 2579 return (100); 2580 } else { 2581 /* 2582 * Here we may have a non-retransmit but we 2583 * have no overrides, so just use the recovery 2584 * rate (prr is in effect). 
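 * Whatever percentage is returned here is applied multiplicatively by
 * the caller, rack_get_output_bw(), as bw_est = bw * gain / 100, so a
 * value of 150 paces at 1.5 times the estimated b/w.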
2585 */ 2586 return (rack->r_ctl.rack_per_of_gp_rec); 2587 } 2588 } 2589 configured_rate: 2590 /* For the configured rate we look at our cwnd vs the ssthresh */ 2591 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) 2592 return (rack->r_ctl.rack_per_of_gp_ss); 2593 else 2594 return (rack->r_ctl.rack_per_of_gp_ca); 2595 } 2596 2597 static void 2598 rack_log_dsack_event(struct tcp_rack *rack, uint8_t mod, uint32_t flex4, uint32_t flex5, uint32_t flex6) 2599 { 2600 /* 2601 * Types of logs (mod value) 2602 * 1 = dsack_persists reduced by 1 via T-O or fast recovery exit. 2603 * 2 = a dsack round begins, persist is reset to 16. 2604 * 3 = a dsack round ends 2605 * 4 = Dsack option increases rack rtt flex5 is the srtt input, flex6 is thresh 2606 * 5 = Socket option set changing the control flags rc_rack_tmr_std_based, rc_rack_use_dsack 2607 * 6 = Final rack rtt, flex4 is srtt and flex6 is final limited thresh. 2608 */ 2609 if (tcp_bblogging_on(rack->rc_tp)) { 2610 union tcp_log_stackspecific log; 2611 struct timeval tv; 2612 2613 memset(&log, 0, sizeof(log)); 2614 log.u_bbr.flex1 = rack->rc_rack_tmr_std_based; 2615 log.u_bbr.flex1 <<= 1; 2616 log.u_bbr.flex1 |= rack->rc_rack_use_dsack; 2617 log.u_bbr.flex1 <<= 1; 2618 log.u_bbr.flex1 |= rack->rc_dsack_round_seen; 2619 log.u_bbr.flex2 = rack->r_ctl.dsack_round_end; 2620 log.u_bbr.flex3 = rack->r_ctl.num_dsack; 2621 log.u_bbr.flex4 = flex4; 2622 log.u_bbr.flex5 = flex5; 2623 log.u_bbr.flex6 = flex6; 2624 log.u_bbr.flex7 = rack->r_ctl.dsack_persist; 2625 log.u_bbr.flex8 = mod; 2626 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2627 log.u_bbr.epoch = rack->r_ctl.current_round; 2628 log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost; 2629 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2630 &rack->rc_inp->inp_socket->so_rcv, 2631 &rack->rc_inp->inp_socket->so_snd, 2632 RACK_DSACK_HANDLING, 0, 2633 0, &log, false, &tv); 2634 } 2635 } 2636 2637 static void 2638 rack_log_hdwr_pacing(struct tcp_rack *rack, 2639 uint64_t rate, uint64_t hw_rate, int line, 2640 int error, uint16_t mod) 2641 { 2642 if (tcp_bblogging_on(rack->rc_tp)) { 2643 union tcp_log_stackspecific log; 2644 struct timeval tv; 2645 const struct ifnet *ifp; 2646 2647 memset(&log, 0, sizeof(log)); 2648 log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff); 2649 log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff); 2650 if (rack->r_ctl.crte) { 2651 ifp = rack->r_ctl.crte->ptbl->rs_ifp; 2652 } else if (rack->rc_inp->inp_route.ro_nh && 2653 rack->rc_inp->inp_route.ro_nh->nh_ifp) { 2654 ifp = rack->rc_inp->inp_route.ro_nh->nh_ifp; 2655 } else 2656 ifp = NULL; 2657 if (ifp) { 2658 log.u_bbr.flex3 = (((uint64_t)ifp >> 32) & 0x00000000ffffffff); 2659 log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff); 2660 } 2661 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2662 log.u_bbr.bw_inuse = rate; 2663 log.u_bbr.flex5 = line; 2664 log.u_bbr.flex6 = error; 2665 log.u_bbr.flex7 = mod; 2666 log.u_bbr.applimited = rack->r_ctl.rc_pace_max_segs; 2667 log.u_bbr.flex8 = rack->use_fixed_rate; 2668 log.u_bbr.flex8 <<= 1; 2669 log.u_bbr.flex8 |= rack->rack_hdrw_pacing; 2670 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; 2671 log.u_bbr.delRate = rack->r_ctl.crte_prev_rate; 2672 if (rack->r_ctl.crte) 2673 log.u_bbr.cur_del_rate = rack->r_ctl.crte->rate; 2674 else 2675 log.u_bbr.cur_del_rate = 0; 2676 log.u_bbr.rttProp = rack->r_ctl.last_hw_bw_req; 2677 log.u_bbr.epoch = rack->r_ctl.current_round; 2678 log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost; 2679 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2680 &rack->rc_inp->inp_socket->so_rcv, 
2681 &rack->rc_inp->inp_socket->so_snd, 2682 BBR_LOG_HDWR_PACE, 0, 2683 0, &log, false, &tv); 2684 } 2685 } 2686 2687 static uint64_t 2688 rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm, int *capped) 2689 { 2690 /* 2691 * We allow rack_per_of_gp_xx to dictate our bw rate we want. 2692 */ 2693 uint64_t bw_est, high_rate; 2694 uint64_t gain; 2695 2696 gain = (uint64_t)rack_get_output_gain(rack, rsm); 2697 bw_est = bw * gain; 2698 bw_est /= (uint64_t)100; 2699 /* Never fall below the minimum (def 64kbps) */ 2700 if (bw_est < RACK_MIN_BW) 2701 bw_est = RACK_MIN_BW; 2702 if (rack->r_rack_hw_rate_caps) { 2703 /* Rate caps are in place */ 2704 if (rack->r_ctl.crte != NULL) { 2705 /* We have a hdwr rate already */ 2706 high_rate = tcp_hw_highest_rate(rack->r_ctl.crte); 2707 if (bw_est >= high_rate) { 2708 /* We are capping bw at the highest rate table entry */ 2709 if (rack_hw_rate_cap_per && 2710 (((high_rate * (100 + rack_hw_rate_cap_per)) / 100) < bw_est)) { 2711 rack->r_rack_hw_rate_caps = 0; 2712 goto done; 2713 } 2714 rack_log_hdwr_pacing(rack, 2715 bw_est, high_rate, __LINE__, 2716 0, 3); 2717 bw_est = high_rate; 2718 if (capped) 2719 *capped = 1; 2720 } 2721 } else if ((rack->rack_hdrw_pacing == 0) && 2722 (rack->rack_hdw_pace_ena) && 2723 (rack->rack_attempt_hdwr_pace == 0) && 2724 (rack->rc_inp->inp_route.ro_nh != NULL) && 2725 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 2726 /* 2727 * Special case, we have not yet attempted hardware 2728 * pacing, and yet we may, when we do, find out if we are 2729 * above the highest rate. We need to know the maxbw for the interface 2730 * in question (if it supports ratelimiting). We get back 2731 * a 0, if the interface is not found in the RL lists. 2732 */ 2733 high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp); 2734 if (high_rate) { 2735 /* Yep, we have a rate is it above this rate? */ 2736 if (bw_est > high_rate) { 2737 bw_est = high_rate; 2738 if (capped) 2739 *capped = 1; 2740 } 2741 } 2742 } 2743 } 2744 done: 2745 return (bw_est); 2746 } 2747 2748 static void 2749 rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod) 2750 { 2751 if (tcp_bblogging_on(rack->rc_tp)) { 2752 union tcp_log_stackspecific log; 2753 struct timeval tv; 2754 2755 if ((mod != 1) && (rack_verbose_logging == 0)) { 2756 /* 2757 * We get 3 values currently for mod 2758 * 1 - We are retransmitting and this tells the reason. 2759 * 2 - We are clearing a dup-ack count. 2760 * 3 - We are incrementing a dup-ack count. 2761 * 2762 * The clear/increment are only logged 2763 * if you have BBverbose on. 
2764 */ 2765 return; 2766 } 2767 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2768 log.u_bbr.flex1 = tsused; 2769 log.u_bbr.flex2 = thresh; 2770 log.u_bbr.flex3 = rsm->r_flags; 2771 log.u_bbr.flex4 = rsm->r_dupack; 2772 log.u_bbr.flex5 = rsm->r_start; 2773 log.u_bbr.flex6 = rsm->r_end; 2774 log.u_bbr.flex8 = mod; 2775 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2776 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2777 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2778 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2779 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2780 log.u_bbr.pacing_gain = rack->r_must_retran; 2781 log.u_bbr.epoch = rack->r_ctl.current_round; 2782 log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost; 2783 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2784 &rack->rc_inp->inp_socket->so_rcv, 2785 &rack->rc_inp->inp_socket->so_snd, 2786 BBR_LOG_SETTINGS_CHG, 0, 2787 0, &log, false, &tv); 2788 } 2789 } 2790 2791 static void 2792 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) 2793 { 2794 if (tcp_bblogging_on(rack->rc_tp)) { 2795 union tcp_log_stackspecific log; 2796 struct timeval tv; 2797 2798 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2799 log.u_bbr.flex1 = rack->rc_tp->t_srtt; 2800 log.u_bbr.flex2 = to; 2801 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; 2802 log.u_bbr.flex4 = slot; 2803 log.u_bbr.flex5 = rack->rc_tp->t_hpts_slot; 2804 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 2805 log.u_bbr.flex7 = rack->rc_in_persist; 2806 log.u_bbr.flex8 = which; 2807 if (rack->rack_no_prr) 2808 log.u_bbr.pkts_out = 0; 2809 else 2810 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 2811 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2812 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2813 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2814 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2815 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2816 log.u_bbr.pacing_gain = rack->r_must_retran; 2817 log.u_bbr.cwnd_gain = rack->rack_deferred_inited; 2818 log.u_bbr.pkt_epoch = rack->rc_has_collapsed; 2819 log.u_bbr.lt_epoch = rack->rc_tp->t_rxtshift; 2820 log.u_bbr.lost = rack_rto_min; 2821 log.u_bbr.epoch = rack->r_ctl.roundends; 2822 log.u_bbr.bw_inuse = rack->r_ctl.current_round; 2823 log.u_bbr.bw_inuse <<= 32; 2824 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; 2825 log.u_bbr.applimited = rack->rc_tp->t_flags2; 2826 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2827 &rack->rc_inp->inp_socket->so_rcv, 2828 &rack->rc_inp->inp_socket->so_snd, 2829 BBR_LOG_TIMERSTAR, 0, 2830 0, &log, false, &tv); 2831 } 2832 } 2833 2834 static void 2835 rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm) 2836 { 2837 if (tcp_bblogging_on(rack->rc_tp)) { 2838 union tcp_log_stackspecific log; 2839 struct timeval tv; 2840 2841 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2842 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2843 log.u_bbr.flex8 = to_num; 2844 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt; 2845 log.u_bbr.flex2 = rack->rc_rack_rtt; 2846 if (rsm == NULL) 2847 log.u_bbr.flex3 = 0; 2848 else 2849 log.u_bbr.flex3 = rsm->r_end - rsm->r_start; 2850 if (rack->rack_no_prr) 2851 log.u_bbr.flex5 = 0; 2852 else 2853 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 2854 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2855 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2856 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2857 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2858 
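/*
 * The bw_inuse assignments below pack current_round into the upper 32 bits
 * and rc_considered_lost into the lower 32 bits of one 64-bit log field.
 */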
log.u_bbr.pacing_gain = rack->r_must_retran; 2859 log.u_bbr.bw_inuse = rack->r_ctl.current_round; 2860 log.u_bbr.bw_inuse <<= 32; 2861 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; 2862 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2863 &rack->rc_inp->inp_socket->so_rcv, 2864 &rack->rc_inp->inp_socket->so_snd, 2865 BBR_LOG_RTO, 0, 2866 0, &log, false, &tv); 2867 } 2868 } 2869 2870 static void 2871 rack_log_map_chg(struct tcpcb *tp, struct tcp_rack *rack, 2872 struct rack_sendmap *prev, 2873 struct rack_sendmap *rsm, 2874 struct rack_sendmap *next, 2875 int flag, uint32_t th_ack, int line) 2876 { 2877 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 2878 union tcp_log_stackspecific log; 2879 struct timeval tv; 2880 2881 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2882 log.u_bbr.flex8 = flag; 2883 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2884 log.u_bbr.cur_del_rate = (uint64_t)prev; 2885 log.u_bbr.delRate = (uint64_t)rsm; 2886 log.u_bbr.rttProp = (uint64_t)next; 2887 log.u_bbr.flex7 = 0; 2888 if (prev) { 2889 log.u_bbr.flex1 = prev->r_start; 2890 log.u_bbr.flex2 = prev->r_end; 2891 log.u_bbr.flex7 |= 0x4; 2892 } 2893 if (rsm) { 2894 log.u_bbr.flex3 = rsm->r_start; 2895 log.u_bbr.flex4 = rsm->r_end; 2896 log.u_bbr.flex7 |= 0x2; 2897 } 2898 if (next) { 2899 log.u_bbr.flex5 = next->r_start; 2900 log.u_bbr.flex6 = next->r_end; 2901 log.u_bbr.flex7 |= 0x1; 2902 } 2903 log.u_bbr.applimited = line; 2904 log.u_bbr.pkts_out = th_ack; 2905 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2906 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2907 if (rack->rack_no_prr) 2908 log.u_bbr.lost = 0; 2909 else 2910 log.u_bbr.lost = rack->r_ctl.rc_prr_sndcnt; 2911 log.u_bbr.bw_inuse = rack->r_ctl.current_round; 2912 log.u_bbr.bw_inuse <<= 32; 2913 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; 2914 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2915 &rack->rc_inp->inp_socket->so_rcv, 2916 &rack->rc_inp->inp_socket->so_snd, 2917 TCP_LOG_MAPCHG, 0, 2918 0, &log, false, &tv); 2919 } 2920 } 2921 2922 static void 2923 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t len, 2924 struct rack_sendmap *rsm, int conf) 2925 { 2926 if (tcp_bblogging_on(tp)) { 2927 union tcp_log_stackspecific log; 2928 struct timeval tv; 2929 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2930 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2931 log.u_bbr.flex1 = t; 2932 log.u_bbr.flex2 = len; 2933 log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt; 2934 log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest; 2935 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest; 2936 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_us_rtrcnt; 2937 log.u_bbr.flex7 = conf; 2938 log.u_bbr.rttProp = (uint64_t)rack->r_ctl.rack_rs.rs_rtt_tot; 2939 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method; 2940 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2941 log.u_bbr.delivered = rack->r_ctl.rack_rs.rs_us_rtrcnt; 2942 log.u_bbr.pkts_out = rack->r_ctl.rack_rs.rs_flags; 2943 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2944 if (rsm) { 2945 log.u_bbr.pkt_epoch = rsm->r_start; 2946 log.u_bbr.lost = rsm->r_end; 2947 log.u_bbr.cwnd_gain = rsm->r_rtr_cnt; 2948 /* We loose any upper of the 24 bits */ 2949 log.u_bbr.pacing_gain = (uint16_t)rsm->r_flags; 2950 } else { 2951 /* Its a SYN */ 2952 log.u_bbr.pkt_epoch = rack->rc_tp->iss; 2953 log.u_bbr.lost = 0; 2954 log.u_bbr.cwnd_gain = 0; 2955 log.u_bbr.pacing_gain = 0; 2956 } 2957 /* Write out general bits of interest rrs here */ 2958 log.u_bbr.use_lt_bw = 
rack->rc_highly_buffered; 2959 log.u_bbr.use_lt_bw <<= 1; 2960 log.u_bbr.use_lt_bw |= rack->forced_ack; 2961 log.u_bbr.use_lt_bw <<= 1; 2962 log.u_bbr.use_lt_bw |= rack->rc_gp_dyn_mul; 2963 log.u_bbr.use_lt_bw <<= 1; 2964 log.u_bbr.use_lt_bw |= rack->in_probe_rtt; 2965 log.u_bbr.use_lt_bw <<= 1; 2966 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; 2967 log.u_bbr.use_lt_bw <<= 1; 2968 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set; 2969 log.u_bbr.use_lt_bw <<= 1; 2970 log.u_bbr.use_lt_bw |= rack->rc_gp_filled; 2971 log.u_bbr.use_lt_bw <<= 1; 2972 log.u_bbr.use_lt_bw |= rack->rc_dragged_bottom; 2973 log.u_bbr.applimited = rack->r_ctl.rc_target_probertt_flight; 2974 log.u_bbr.epoch = rack->r_ctl.rc_time_probertt_starts; 2975 log.u_bbr.lt_epoch = rack->r_ctl.rc_time_probertt_entered; 2976 log.u_bbr.cur_del_rate = rack->r_ctl.rc_lower_rtt_us_cts; 2977 log.u_bbr.delRate = rack->r_ctl.rc_gp_srtt; 2978 log.u_bbr.bw_inuse = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 2979 log.u_bbr.bw_inuse <<= 32; 2980 if (rsm) 2981 log.u_bbr.bw_inuse |= ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]); 2982 TCP_LOG_EVENTP(tp, NULL, 2983 &rack->rc_inp->inp_socket->so_rcv, 2984 &rack->rc_inp->inp_socket->so_snd, 2985 BBR_LOG_BBRRTT, 0, 2986 0, &log, false, &tv); 2987 2988 2989 } 2990 } 2991 2992 static void 2993 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt) 2994 { 2995 /* 2996 * Log the rtt sample we are 2997 * applying to the srtt algorithm in 2998 * useconds. 2999 */ 3000 if (tcp_bblogging_on(rack->rc_tp)) { 3001 union tcp_log_stackspecific log; 3002 struct timeval tv; 3003 3004 /* Convert our ms to a microsecond */ 3005 memset(&log, 0, sizeof(log)); 3006 log.u_bbr.flex1 = rtt; 3007 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 3008 log.u_bbr.flex7 = 1; 3009 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3010 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3011 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 3012 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 3013 log.u_bbr.pacing_gain = rack->r_must_retran; 3014 /* 3015 * We capture in delRate the upper 32 bits as 3016 * the confidence level we had declared, and the 3017 * lower 32 bits as the actual RTT using the arrival 3018 * timestamp. 
3019 */ 3020 log.u_bbr.delRate = rack->r_ctl.rack_rs.confidence; 3021 log.u_bbr.delRate <<= 32; 3022 log.u_bbr.delRate |= rack->r_ctl.rack_rs.rs_us_rtt; 3023 /* Lets capture all the things that make up t_rtxcur */ 3024 log.u_bbr.applimited = rack_rto_min; 3025 log.u_bbr.epoch = rack_rto_max; 3026 log.u_bbr.lt_epoch = rack->r_ctl.timer_slop; 3027 log.u_bbr.lost = rack_rto_min; 3028 log.u_bbr.pkt_epoch = TICKS_2_USEC(tcp_rexmit_slop); 3029 log.u_bbr.rttProp = RACK_REXMTVAL(rack->rc_tp); 3030 log.u_bbr.bw_inuse = rack->r_ctl.act_rcv_time.tv_sec; 3031 log.u_bbr.bw_inuse *= HPTS_USEC_IN_SEC; 3032 log.u_bbr.bw_inuse += rack->r_ctl.act_rcv_time.tv_usec; 3033 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3034 &rack->rc_inp->inp_socket->so_rcv, 3035 &rack->rc_inp->inp_socket->so_snd, 3036 TCP_LOG_RTT, 0, 3037 0, &log, false, &tv); 3038 } 3039 } 3040 3041 static void 3042 rack_log_rtt_sample_calc(struct tcp_rack *rack, uint32_t rtt, uint32_t send_time, uint32_t ack_time, int where) 3043 { 3044 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 3045 union tcp_log_stackspecific log; 3046 struct timeval tv; 3047 3048 /* Convert our ms to a microsecond */ 3049 memset(&log, 0, sizeof(log)); 3050 log.u_bbr.flex1 = rtt; 3051 log.u_bbr.flex2 = send_time; 3052 log.u_bbr.flex3 = ack_time; 3053 log.u_bbr.flex4 = where; 3054 log.u_bbr.flex7 = 2; 3055 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3056 log.u_bbr.bw_inuse = rack->r_ctl.current_round; 3057 log.u_bbr.bw_inuse <<= 32; 3058 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; 3059 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3060 &rack->rc_inp->inp_socket->so_rcv, 3061 &rack->rc_inp->inp_socket->so_snd, 3062 TCP_LOG_RTT, 0, 3063 0, &log, false, &tv); 3064 } 3065 } 3066 3067 3068 static void 3069 rack_log_rtt_sendmap(struct tcp_rack *rack, uint32_t idx, uint64_t tsv, uint32_t tsecho) 3070 { 3071 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 3072 union tcp_log_stackspecific log; 3073 struct timeval tv; 3074 3075 /* Convert our ms to a microsecond */ 3076 memset(&log, 0, sizeof(log)); 3077 log.u_bbr.flex1 = idx; 3078 log.u_bbr.flex2 = rack_ts_to_msec(tsv); 3079 log.u_bbr.flex3 = tsecho; 3080 log.u_bbr.flex7 = 3; 3081 log.u_bbr.rttProp = tsv; 3082 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3083 log.u_bbr.bw_inuse = rack->r_ctl.current_round; 3084 log.u_bbr.bw_inuse <<= 32; 3085 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; 3086 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3087 &rack->rc_inp->inp_socket->so_rcv, 3088 &rack->rc_inp->inp_socket->so_snd, 3089 TCP_LOG_RTT, 0, 3090 0, &log, false, &tv); 3091 } 3092 } 3093 3094 3095 static inline void 3096 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line) 3097 { 3098 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 3099 union tcp_log_stackspecific log; 3100 struct timeval tv; 3101 3102 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 3103 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 3104 log.u_bbr.flex1 = line; 3105 log.u_bbr.flex2 = tick; 3106 log.u_bbr.flex3 = tp->t_maxunacktime; 3107 log.u_bbr.flex4 = tp->t_acktime; 3108 log.u_bbr.flex8 = event; 3109 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3110 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3111 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 3112 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 3113 log.u_bbr.pacing_gain = rack->r_must_retran; 3114 log.u_bbr.bw_inuse = rack->r_ctl.current_round; 3115 log.u_bbr.bw_inuse <<= 32; 3116 log.u_bbr.bw_inuse |= 
rack->r_ctl.rc_considered_lost; 3117 TCP_LOG_EVENTP(tp, NULL, 3118 &rack->rc_inp->inp_socket->so_rcv, 3119 &rack->rc_inp->inp_socket->so_snd, 3120 BBR_LOG_PROGRESS, 0, 3121 0, &log, false, &tv); 3122 } 3123 } 3124 3125 static void 3126 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv, int line) 3127 { 3128 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 3129 union tcp_log_stackspecific log; 3130 3131 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 3132 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 3133 log.u_bbr.flex1 = slot; 3134 if (rack->rack_no_prr) 3135 log.u_bbr.flex2 = 0; 3136 else 3137 log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt; 3138 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 3139 log.u_bbr.flex6 = line; 3140 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags); 3141 log.u_bbr.flex8 = rack->rc_in_persist; 3142 log.u_bbr.timeStamp = cts; 3143 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3144 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 3145 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 3146 log.u_bbr.pacing_gain = rack->r_must_retran; 3147 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3148 &rack->rc_inp->inp_socket->so_rcv, 3149 &rack->rc_inp->inp_socket->so_snd, 3150 BBR_LOG_BBRSND, 0, 3151 0, &log, false, tv); 3152 } 3153 } 3154 3155 static void 3156 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out, int nsegs) 3157 { 3158 if (tcp_bblogging_on(rack->rc_tp)) { 3159 union tcp_log_stackspecific log; 3160 struct timeval tv; 3161 3162 memset(&log, 0, sizeof(log)); 3163 log.u_bbr.flex1 = did_out; 3164 log.u_bbr.flex2 = nxt_pkt; 3165 log.u_bbr.flex3 = way_out; 3166 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 3167 if (rack->rack_no_prr) 3168 log.u_bbr.flex5 = 0; 3169 else 3170 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 3171 log.u_bbr.flex6 = nsegs; 3172 log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs; 3173 log.u_bbr.flex7 = rack->rc_ack_can_sendout_data; /* Do we have ack-can-send set */ 3174 log.u_bbr.flex7 <<= 1; 3175 log.u_bbr.flex7 |= rack->r_fast_output; /* is fast output primed */ 3176 log.u_bbr.flex7 <<= 1; 3177 log.u_bbr.flex7 |= rack->r_wanted_output; /* Do we want output */ 3178 log.u_bbr.flex8 = rack->rc_in_persist; 3179 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 3180 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3181 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3182 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 3183 log.u_bbr.use_lt_bw <<= 1; 3184 log.u_bbr.use_lt_bw |= rack->r_might_revert; 3185 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 3186 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 3187 log.u_bbr.pacing_gain = rack->r_must_retran; 3188 log.u_bbr.bw_inuse = rack->r_ctl.current_round; 3189 log.u_bbr.bw_inuse <<= 32; 3190 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; 3191 log.u_bbr.epoch = rack->rc_inp->inp_socket->so_snd.sb_hiwat; 3192 log.u_bbr.lt_epoch = rack->rc_inp->inp_socket->so_rcv.sb_hiwat; 3193 log.u_bbr.lost = rack->rc_tp->t_srtt; 3194 log.u_bbr.pkt_epoch = rack->rc_tp->rfbuf_cnt; 3195 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3196 &rack->rc_inp->inp_socket->so_rcv, 3197 &rack->rc_inp->inp_socket->so_snd, 3198 BBR_LOG_DOSEG_DONE, 0, 3199 0, &log, false, &tv); 3200 } 3201 } 3202 3203 static void 3204 rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint8_t frm) 3205 { 3206 if (tcp_bblogging_on(rack->rc_tp)) { 
3207 union tcp_log_stackspecific log; 3208 struct timeval tv; 3209 3210 memset(&log, 0, sizeof(log)); 3211 log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs; 3212 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 3213 log.u_bbr.flex4 = arg1; 3214 log.u_bbr.flex5 = arg2; 3215 log.u_bbr.flex7 = rack->r_ctl.rc_user_set_min_segs; 3216 log.u_bbr.flex6 = arg3; 3217 log.u_bbr.flex8 = frm; 3218 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3219 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3220 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 3221 log.u_bbr.applimited = rack->r_ctl.rc_sacked; 3222 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 3223 log.u_bbr.pacing_gain = rack->r_must_retran; 3224 TCP_LOG_EVENTP(tp, NULL, &tptosocket(tp)->so_rcv, 3225 &tptosocket(tp)->so_snd, 3226 TCP_HDWR_PACE_SIZE, 0, 0, &log, false, &tv); 3227 } 3228 } 3229 3230 static void 3231 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, 3232 uint8_t hpts_calling, int reason, uint32_t cwnd_to_use) 3233 { 3234 if (tcp_bblogging_on(rack->rc_tp)) { 3235 union tcp_log_stackspecific log; 3236 struct timeval tv; 3237 3238 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 3239 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 3240 log.u_bbr.flex1 = slot; 3241 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; 3242 log.u_bbr.flex4 = reason; 3243 if (rack->rack_no_prr) 3244 log.u_bbr.flex5 = 0; 3245 else 3246 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 3247 log.u_bbr.flex7 = hpts_calling; 3248 log.u_bbr.flex8 = rack->rc_in_persist; 3249 log.u_bbr.lt_epoch = cwnd_to_use; 3250 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3251 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3252 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 3253 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 3254 log.u_bbr.pacing_gain = rack->r_must_retran; 3255 log.u_bbr.cwnd_gain = rack->rc_has_collapsed; 3256 log.u_bbr.bw_inuse = rack->r_ctl.current_round; 3257 log.u_bbr.bw_inuse <<= 32; 3258 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; 3259 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3260 &rack->rc_inp->inp_socket->so_rcv, 3261 &rack->rc_inp->inp_socket->so_snd, 3262 BBR_LOG_JUSTRET, 0, 3263 tlen, &log, false, &tv); 3264 } 3265 } 3266 3267 static void 3268 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32_t us_cts, 3269 struct timeval *tv, uint32_t flags_on_entry) 3270 { 3271 if (tcp_bblogging_on(rack->rc_tp)) { 3272 union tcp_log_stackspecific log; 3273 3274 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 3275 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 3276 log.u_bbr.flex1 = line; 3277 log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to; 3278 log.u_bbr.flex3 = flags_on_entry; 3279 log.u_bbr.flex4 = us_cts; 3280 if (rack->rack_no_prr) 3281 log.u_bbr.flex5 = 0; 3282 else 3283 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 3284 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 3285 log.u_bbr.flex7 = hpts_removed; 3286 log.u_bbr.flex8 = 1; 3287 log.u_bbr.applimited = rack->r_ctl.rc_hpts_flags; 3288 log.u_bbr.timeStamp = us_cts; 3289 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3290 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 3291 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 3292 log.u_bbr.pacing_gain = rack->r_must_retran; 3293 log.u_bbr.bw_inuse = rack->r_ctl.current_round; 3294 log.u_bbr.bw_inuse <<= 32; 3295 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; 3296 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3297 
&rack->rc_inp->inp_socket->so_rcv, 3298 &rack->rc_inp->inp_socket->so_snd, 3299 BBR_LOG_TIMERCANC, 0, 3300 0, &log, false, tv); 3301 } 3302 } 3303 3304 static void 3305 rack_log_alt_to_to_cancel(struct tcp_rack *rack, 3306 uint32_t flex1, uint32_t flex2, 3307 uint32_t flex3, uint32_t flex4, 3308 uint32_t flex5, uint32_t flex6, 3309 uint16_t flex7, uint8_t mod) 3310 { 3311 if (tcp_bblogging_on(rack->rc_tp)) { 3312 union tcp_log_stackspecific log; 3313 struct timeval tv; 3314 3315 if (mod == 1) { 3316 /* No you can't use 1, its for the real to cancel */ 3317 return; 3318 } 3319 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 3320 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3321 log.u_bbr.flex1 = flex1; 3322 log.u_bbr.flex2 = flex2; 3323 log.u_bbr.flex3 = flex3; 3324 log.u_bbr.flex4 = flex4; 3325 log.u_bbr.flex5 = flex5; 3326 log.u_bbr.flex6 = flex6; 3327 log.u_bbr.flex7 = flex7; 3328 log.u_bbr.flex8 = mod; 3329 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3330 &rack->rc_inp->inp_socket->so_rcv, 3331 &rack->rc_inp->inp_socket->so_snd, 3332 BBR_LOG_TIMERCANC, 0, 3333 0, &log, false, &tv); 3334 } 3335 } 3336 3337 static void 3338 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers) 3339 { 3340 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 3341 union tcp_log_stackspecific log; 3342 struct timeval tv; 3343 3344 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 3345 log.u_bbr.flex1 = timers; 3346 log.u_bbr.flex2 = ret; 3347 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp; 3348 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 3349 log.u_bbr.flex5 = cts; 3350 if (rack->rack_no_prr) 3351 log.u_bbr.flex6 = 0; 3352 else 3353 log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt; 3354 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 3355 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 3356 log.u_bbr.pacing_gain = rack->r_must_retran; 3357 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3358 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3359 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3360 &rack->rc_inp->inp_socket->so_rcv, 3361 &rack->rc_inp->inp_socket->so_snd, 3362 BBR_LOG_TO_PROCESS, 0, 3363 0, &log, false, &tv); 3364 } 3365 } 3366 3367 static void 3368 rack_log_to_prr(struct tcp_rack *rack, int frm, int orig_cwnd, int line) 3369 { 3370 if (tcp_bblogging_on(rack->rc_tp)) { 3371 union tcp_log_stackspecific log; 3372 struct timeval tv; 3373 3374 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 3375 log.u_bbr.flex1 = rack->r_ctl.rc_prr_out; 3376 log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs; 3377 if (rack->rack_no_prr) 3378 log.u_bbr.flex3 = 0; 3379 else 3380 log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt; 3381 log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered; 3382 log.u_bbr.flex5 = rack->r_ctl.rc_sacked; 3383 log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt; 3384 log.u_bbr.flex7 = line; 3385 log.u_bbr.flex8 = frm; 3386 log.u_bbr.pkts_out = orig_cwnd; 3387 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3388 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3389 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 3390 log.u_bbr.use_lt_bw <<= 1; 3391 log.u_bbr.use_lt_bw |= rack->r_might_revert; 3392 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3393 &rack->rc_inp->inp_socket->so_rcv, 3394 &rack->rc_inp->inp_socket->so_snd, 3395 BBR_LOG_BBRUPD, 0, 3396 0, &log, false, &tv); 3397 } 3398 } 3399 3400 static void 3401 rack_counter_destroy(void) 3402 { 3403 counter_u64_free(rack_total_bytes); 3404 counter_u64_free(rack_fto_send); 3405 counter_u64_free(rack_fto_rsm_send); 3406 
counter_u64_free(rack_nfto_resend); 3407 counter_u64_free(rack_hw_pace_init_fail); 3408 counter_u64_free(rack_hw_pace_lost); 3409 counter_u64_free(rack_non_fto_send); 3410 counter_u64_free(rack_extended_rfo); 3411 counter_u64_free(rack_ack_total); 3412 counter_u64_free(rack_express_sack); 3413 counter_u64_free(rack_sack_total); 3414 counter_u64_free(rack_move_none); 3415 counter_u64_free(rack_move_some); 3416 counter_u64_free(rack_sack_attacks_detected); 3417 counter_u64_free(rack_sack_attacks_reversed); 3418 counter_u64_free(rack_sack_attacks_suspect); 3419 counter_u64_free(rack_sack_used_next_merge); 3420 counter_u64_free(rack_sack_used_prev_merge); 3421 counter_u64_free(rack_tlp_tot); 3422 counter_u64_free(rack_tlp_newdata); 3423 counter_u64_free(rack_tlp_retran); 3424 counter_u64_free(rack_tlp_retran_bytes); 3425 counter_u64_free(rack_to_tot); 3426 counter_u64_free(rack_saw_enobuf); 3427 counter_u64_free(rack_saw_enobuf_hw); 3428 counter_u64_free(rack_saw_enetunreach); 3429 counter_u64_free(rack_hot_alloc); 3430 counter_u64_free(tcp_policer_detected); 3431 counter_u64_free(rack_to_alloc); 3432 counter_u64_free(rack_to_alloc_hard); 3433 counter_u64_free(rack_to_alloc_emerg); 3434 counter_u64_free(rack_to_alloc_limited); 3435 counter_u64_free(rack_alloc_limited_conns); 3436 counter_u64_free(rack_split_limited); 3437 counter_u64_free(rack_multi_single_eq); 3438 counter_u64_free(rack_rxt_clamps_cwnd); 3439 counter_u64_free(rack_rxt_clamps_cwnd_uniq); 3440 counter_u64_free(rack_proc_non_comp_ack); 3441 counter_u64_free(rack_sack_proc_all); 3442 counter_u64_free(rack_sack_proc_restart); 3443 counter_u64_free(rack_sack_proc_short); 3444 counter_u64_free(rack_sack_skipped_acked); 3445 counter_u64_free(rack_sack_splits); 3446 counter_u64_free(rack_input_idle_reduces); 3447 counter_u64_free(rack_collapsed_win); 3448 counter_u64_free(rack_collapsed_win_rxt); 3449 counter_u64_free(rack_collapsed_win_rxt_bytes); 3450 counter_u64_free(rack_collapsed_win_seen); 3451 counter_u64_free(rack_try_scwnd); 3452 counter_u64_free(rack_persists_sends); 3453 counter_u64_free(rack_persists_acks); 3454 counter_u64_free(rack_persists_loss); 3455 counter_u64_free(rack_persists_lost_ends); 3456 #ifdef INVARIANTS 3457 counter_u64_free(rack_adjust_map_bw); 3458 #endif 3459 COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE); 3460 COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE); 3461 } 3462 3463 static struct rack_sendmap * 3464 rack_alloc(struct tcp_rack *rack) 3465 { 3466 struct rack_sendmap *rsm; 3467 3468 /* 3469 * First get the top of the list it in 3470 * theory is the "hottest" rsm we have, 3471 * possibly just freed by ack processing. 3472 */ 3473 if (rack->rc_free_cnt > rack_free_cache) { 3474 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 3475 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 3476 counter_u64_add(rack_hot_alloc, 1); 3477 rack->rc_free_cnt--; 3478 return (rsm); 3479 } 3480 /* 3481 * Once we get under our free cache we probably 3482 * no longer have a "hot" one available. Lets 3483 * get one from UMA. 3484 */ 3485 rsm = uma_zalloc(rack_zone, M_NOWAIT); 3486 if (rsm) { 3487 rack->r_ctl.rc_num_maps_alloced++; 3488 counter_u64_add(rack_to_alloc, 1); 3489 return (rsm); 3490 } 3491 /* 3492 * Dig in to our aux rsm's (the last two) since 3493 * UMA failed to get us one. 
3494 */ 3495 if (rack->rc_free_cnt) { 3496 counter_u64_add(rack_to_alloc_emerg, 1); 3497 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 3498 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 3499 rack->rc_free_cnt--; 3500 return (rsm); 3501 } 3502 return (NULL); 3503 } 3504 3505 static struct rack_sendmap * 3506 rack_alloc_full_limit(struct tcp_rack *rack) 3507 { 3508 if ((V_tcp_map_entries_limit > 0) && 3509 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 3510 counter_u64_add(rack_to_alloc_limited, 1); 3511 if (!rack->alloc_limit_reported) { 3512 rack->alloc_limit_reported = 1; 3513 counter_u64_add(rack_alloc_limited_conns, 1); 3514 } 3515 return (NULL); 3516 } 3517 return (rack_alloc(rack)); 3518 } 3519 3520 /* wrapper to allocate a sendmap entry, subject to a specific limit */ 3521 static struct rack_sendmap * 3522 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type) 3523 { 3524 struct rack_sendmap *rsm; 3525 3526 if (limit_type) { 3527 /* currently there is only one limit type */ 3528 if (rack->r_ctl.rc_split_limit > 0 && 3529 rack->r_ctl.rc_num_split_allocs >= rack->r_ctl.rc_split_limit) { 3530 counter_u64_add(rack_split_limited, 1); 3531 if (!rack->alloc_limit_reported) { 3532 rack->alloc_limit_reported = 1; 3533 counter_u64_add(rack_alloc_limited_conns, 1); 3534 } 3535 return (NULL); 3536 } 3537 } 3538 3539 /* allocate and mark in the limit type, if set */ 3540 rsm = rack_alloc(rack); 3541 if (rsm != NULL && limit_type) { 3542 rsm->r_limit_type = limit_type; 3543 rack->r_ctl.rc_num_split_allocs++; 3544 } 3545 return (rsm); 3546 } 3547 3548 static void 3549 rack_free_trim(struct tcp_rack *rack) 3550 { 3551 struct rack_sendmap *rsm; 3552 3553 /* 3554 * Free up all the tail entries until 3555 * we get our list down to the limit. 
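	 * (The entries we do keep act as the hot cache that rack_alloc()
	 * draws from first; only what exceeds the rack_free_cache
	 * threshold is handed back to UMA here.)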
3556 */ 3557 while (rack->rc_free_cnt > rack_free_cache) { 3558 rsm = TAILQ_LAST(&rack->r_ctl.rc_free, rack_head); 3559 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 3560 rack->rc_free_cnt--; 3561 rack->r_ctl.rc_num_maps_alloced--; 3562 uma_zfree(rack_zone, rsm); 3563 } 3564 } 3565 3566 static void 3567 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) 3568 { 3569 if (rsm->r_flags & RACK_APP_LIMITED) { 3570 if (rack->r_ctl.rc_app_limited_cnt > 0) { 3571 rack->r_ctl.rc_app_limited_cnt--; 3572 } 3573 } 3574 if (rsm->r_limit_type) { 3575 /* currently there is only one limit type */ 3576 rack->r_ctl.rc_num_split_allocs--; 3577 } 3578 if (rsm == rack->r_ctl.rc_first_appl) { 3579 rack->r_ctl.cleared_app_ack_seq = rsm->r_start + (rsm->r_end - rsm->r_start); 3580 rack->r_ctl.cleared_app_ack = 1; 3581 if (rack->r_ctl.rc_app_limited_cnt == 0) 3582 rack->r_ctl.rc_first_appl = NULL; 3583 else 3584 rack->r_ctl.rc_first_appl = tqhash_find(rack->r_ctl.tqh, rsm->r_nseq_appl); 3585 } 3586 if (rsm == rack->r_ctl.rc_resend) 3587 rack->r_ctl.rc_resend = NULL; 3588 if (rsm == rack->r_ctl.rc_end_appl) 3589 rack->r_ctl.rc_end_appl = NULL; 3590 if (rack->r_ctl.rc_tlpsend == rsm) 3591 rack->r_ctl.rc_tlpsend = NULL; 3592 if (rack->r_ctl.rc_sacklast == rsm) 3593 rack->r_ctl.rc_sacklast = NULL; 3594 memset(rsm, 0, sizeof(struct rack_sendmap)); 3595 /* Make sure we are not going to overrun our count limit of 0xff */ 3596 if ((rack->rc_free_cnt + 1) > RACK_FREE_CNT_MAX) { 3597 rack_free_trim(rack); 3598 } 3599 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_free, rsm, r_tnext); 3600 rack->rc_free_cnt++; 3601 } 3602 3603 static uint32_t 3604 rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack) 3605 { 3606 uint64_t srtt, bw, len, tim; 3607 uint32_t segsiz, def_len, minl; 3608 3609 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 3610 def_len = rack_def_data_window * segsiz; 3611 if (rack->rc_gp_filled == 0) { 3612 /* 3613 * We have no measurement (IW is in flight?) so 3614 * we can only guess using our data_window sysctl 3615 * value (usually 20MSS). 3616 */ 3617 return (def_len); 3618 } 3619 /* 3620 * Now we have a number of factors to consider. 3621 * 3622 * 1) We have a desired BDP which is usually 3623 * at least 2. 3624 * 2) We have a minimum number of rtt's usually 1 SRTT 3625 * but we allow it too to be more. 3626 * 3) We want to make sure a measurement last N useconds (if 3627 * we have set rack_min_measure_usec. 3628 * 3629 * We handle the first concern here by trying to create a data 3630 * window of max(rack_def_data_window, DesiredBDP). The 3631 * second concern we handle in not letting the measurement 3632 * window end normally until at least the required SRTT's 3633 * have gone by which is done further below in 3634 * rack_enough_for_measurement(). Finally the third concern 3635 * we also handle here by calculating how long that time 3636 * would take at the current BW and then return the 3637 * max of our first calculation and that length. Note 3638 * that if rack_min_measure_usec is 0, we don't deal 3639 * with concern 3. Also for both Concern 1 and 3 an 3640 * application limited period could end the measurement 3641 * earlier. 3642 * 3643 * So lets calculate the BDP with the "known" b/w using 3644 * the SRTT has our rtt and then multiply it by the 3645 * goal. 
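	 *
	 * As a rough illustration (hypothetical numbers): with a measured
	 * b/w of 1,000,000 bytes/sec and a SRTT of 50,000 usecs the BDP is
	 * 50,000 bytes; with the usual goal of 2 that becomes 100,000 bytes,
	 * which is then rounded up to a multiple of the segment size.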
	 */
	bw = rack_get_bw(rack);
	srtt = (uint64_t)tp->t_srtt;
	len = bw * srtt;
	len /= (uint64_t)HPTS_USEC_IN_SEC;
	len *= max(1, rack_goal_bdp);
	/* Now we need to round up to the nearest MSS */
	len = roundup(len, segsiz);
	if (rack_min_measure_usec) {
		/* Now calculate our min length for this b/w */
		tim = rack_min_measure_usec;
		minl = (tim * bw) / (uint64_t)HPTS_USEC_IN_SEC;
		if (minl == 0)
			minl = 1;
		minl = roundup(minl, segsiz);
		if (len < minl)
			len = minl;
	}
	/*
	 * Now if we have a very small window we want
	 * to attempt to get the window that is
	 * as small as possible. This happens on
	 * low b/w connections and we don't want to
	 * span huge numbers of rtt's between measurements.
	 *
	 * We basically include 2 over our "MIN window" so
	 * that the measurement can be shortened (possibly) by
	 * an ack'ed packet.
	 */
	if (len < def_len)
		return (max((uint32_t)len, ((MIN_GP_WIN+2) * segsiz)));
	else
		return (max((uint32_t)len, def_len));

}

static int
rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack, uint8_t *quality)
{
	uint32_t tim, srtts, segsiz;

	/*
	 * Has enough time passed for the GP measurement to be valid?
	 */
	if (SEQ_LT(th_ack, tp->gput_seq)) {
		/* Not enough bytes yet */
		return (0);
	}
	if ((tp->snd_max == tp->snd_una) ||
	    (th_ack == tp->snd_max)){
		/*
		 * All is acked. The quality of an all-acked measurement
		 * is usually low or medium, but we in theory could split
		 * all acked into two cases, where you got
		 * a significant amount of your window and
		 * where you did not. For now we leave it
		 * but it is something to contemplate in the
		 * future. The danger here is that delayed ack
		 * is affecting the last byte (which is a 50:50 chance).
		 */
		*quality = RACK_QUALITY_ALLACKED;
		return (1);
	}
	if (SEQ_GEQ(th_ack, tp->gput_ack)) {
		/*
		 * We obtained our entire window of data we wanted,
		 * no matter if we are in recovery or not, so it's ok
		 * since expanding the window does not
		 * make things fuzzy (or at least not as much).
		 */
		*quality = RACK_QUALITY_HIGH;
		return (1);
	}
	segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
	if (SEQ_LT(th_ack, tp->gput_ack) &&
	    ((th_ack - tp->gput_seq) < max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) {
		/* Not enough bytes yet */
		return (0);
	}
	if (rack->r_ctl.rc_first_appl &&
	    (SEQ_GEQ(th_ack, rack->r_ctl.rc_first_appl->r_end))) {
		/*
		 * We are up to the app limited send point;
		 * we have to measure irrespective of the time.
		 */
		*quality = RACK_QUALITY_APPLIMITED;
		return (1);
	}
	/* Now what about time? */
	srtts = (rack->r_ctl.rc_gp_srtt * rack_min_srtts);
	tim = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - tp->gput_ts;
	if ((tim >= srtts) && (IN_RECOVERY(rack->rc_tp->t_flags) == 0)) {
		/*
		 * We do not allow a measurement if we are in recovery
		 * that would shrink the goodput window we wanted.
		 * This is to prevent cloudiness of when the last send
		 * was actually made.
3743 */ 3744 *quality = RACK_QUALITY_HIGH; 3745 return (1); 3746 } 3747 /* Nope not even a full SRTT has passed */ 3748 return (0); 3749 } 3750 3751 static void 3752 rack_log_timely(struct tcp_rack *rack, 3753 uint32_t logged, uint64_t cur_bw, uint64_t low_bnd, 3754 uint64_t up_bnd, int line, uint8_t method) 3755 { 3756 if (tcp_bblogging_on(rack->rc_tp)) { 3757 union tcp_log_stackspecific log; 3758 struct timeval tv; 3759 3760 memset(&log, 0, sizeof(log)); 3761 log.u_bbr.flex1 = logged; 3762 log.u_bbr.flex2 = rack->rc_gp_timely_inc_cnt; 3763 log.u_bbr.flex2 <<= 4; 3764 log.u_bbr.flex2 |= rack->rc_gp_timely_dec_cnt; 3765 log.u_bbr.flex2 <<= 4; 3766 log.u_bbr.flex2 |= rack->rc_gp_incr; 3767 log.u_bbr.flex2 <<= 4; 3768 log.u_bbr.flex2 |= rack->rc_gp_bwred; 3769 log.u_bbr.flex3 = rack->rc_gp_incr; 3770 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; 3771 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ca; 3772 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_rec; 3773 log.u_bbr.flex7 = rack->rc_gp_bwred; 3774 log.u_bbr.flex8 = method; 3775 log.u_bbr.cur_del_rate = cur_bw; 3776 log.u_bbr.delRate = low_bnd; 3777 log.u_bbr.bw_inuse = up_bnd; 3778 log.u_bbr.rttProp = rack_get_bw(rack); 3779 log.u_bbr.pkt_epoch = line; 3780 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; 3781 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3782 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3783 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; 3784 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; 3785 log.u_bbr.cwnd_gain = rack->rc_dragged_bottom; 3786 log.u_bbr.cwnd_gain <<= 1; 3787 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_rec; 3788 log.u_bbr.cwnd_gain <<= 1; 3789 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; 3790 log.u_bbr.cwnd_gain <<= 1; 3791 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; 3792 log.u_bbr.lost = rack->r_ctl.rc_loss_count; 3793 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3794 &rack->rc_inp->inp_socket->so_rcv, 3795 &rack->rc_inp->inp_socket->so_snd, 3796 TCP_TIMELY_WORK, 0, 3797 0, &log, false, &tv); 3798 } 3799 } 3800 3801 static int 3802 rack_bw_can_be_raised(struct tcp_rack *rack, uint64_t cur_bw, uint64_t last_bw_est, uint16_t mult) 3803 { 3804 /* 3805 * Before we increase we need to know if 3806 * the estimate just made was less than 3807 * our pacing goal (i.e. (cur_bw * mult) > last_bw_est) 3808 * 3809 * If we already are pacing at a fast enough 3810 * rate to push us faster there is no sense of 3811 * increasing. 3812 * 3813 * We first caculate our actual pacing rate (ss or ca multiplier 3814 * times our cur_bw). 3815 * 3816 * Then we take the last measured rate and multipy by our 3817 * maximum pacing overage to give us a max allowable rate. 3818 * 3819 * If our act_rate is smaller than our max_allowable rate 3820 * then we should increase. Else we should hold steady. 3821 * 3822 */ 3823 uint64_t act_rate, max_allow_rate; 3824 3825 if (rack_timely_no_stopping) 3826 return (1); 3827 3828 if ((cur_bw == 0) || (last_bw_est == 0)) { 3829 /* 3830 * Initial startup case or 3831 * everything is acked case. 3832 */ 3833 rack_log_timely(rack, mult, cur_bw, 0, 0, 3834 __LINE__, 9); 3835 return (1); 3836 } 3837 if (mult <= 100) { 3838 /* 3839 * We can always pace at or slightly above our rate. 
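		 *
		 * (For the mult > 100 check below, with hypothetical numbers:
		 * cur_bw = 1,000,000 bytes/sec and mult = 150 give an act_rate
		 * of 1,500,000; last_bw_est = 1,400,000 with the default 10%
		 * overage gives a max_allow_rate of 1,540,000, so we would
		 * still return 1 and allow the raise.)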
		 */
		rack_log_timely(rack, mult, cur_bw, 0, 0,
		    __LINE__, 9);
		return (1);
	}
	act_rate = cur_bw * (uint64_t)mult;
	act_rate /= 100;
	max_allow_rate = last_bw_est * ((uint64_t)rack_max_per_above + (uint64_t)100);
	max_allow_rate /= 100;
	if (act_rate < max_allow_rate) {
		/*
		 * Here the rate we are actually pacing at
		 * is smaller than 10% above our last measurement.
		 * This means we are pacing below what we would
		 * like to try to achieve (plus some wiggle room).
		 */
		rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate,
		    __LINE__, 9);
		return (1);
	} else {
		/*
		 * Here we are already pacing at least rack_max_per_above (10%)
		 * above what we are getting back. This indicates most likely
		 * that we are being limited (cwnd/rwnd/app) and can't
		 * get any more b/w. There is no sense in trying to
		 * raise the pacing rate; it's not speeding us up
		 * and we are already pacing faster than we are getting.
		 */
		rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate,
		    __LINE__, 8);
		return (0);
	}
}

static void
rack_validate_multipliers_at_or_above100(struct tcp_rack *rack)
{
	/*
	 * When we drag bottom, we want to assure
	 * that no multiplier is below 1.0; if one is,
	 * we want to restore it to at least that.
	 */
	if (rack->r_ctl.rack_per_of_gp_rec < 100) {
		/* This is unlikely; we usually do not touch recovery */
		rack->r_ctl.rack_per_of_gp_rec = 100;
	}
	if (rack->r_ctl.rack_per_of_gp_ca < 100) {
		rack->r_ctl.rack_per_of_gp_ca = 100;
	}
	if (rack->r_ctl.rack_per_of_gp_ss < 100) {
		rack->r_ctl.rack_per_of_gp_ss = 100;
	}
}

static void
rack_validate_multipliers_at_or_below_100(struct tcp_rack *rack)
{
	if (rack->r_ctl.rack_per_of_gp_ca > 100) {
		rack->r_ctl.rack_per_of_gp_ca = 100;
	}
	if (rack->r_ctl.rack_per_of_gp_ss > 100) {
		rack->r_ctl.rack_per_of_gp_ss = 100;
	}
}

static void
rack_increase_bw_mul(struct tcp_rack *rack, int timely_says, uint64_t cur_bw, uint64_t last_bw_est, int override)
{
	int32_t calc, logged, plus;

	logged = 0;

	if (rack->rc_skip_timely)
		return;
	if (override) {
		/*
		 * override is passed when we are
		 * losing b/w and making one last
		 * gasp at trying not to lose out
		 * to a new-reno flow.
		 */
		goto extra_boost;
	}
	/* In classic timely we boost by 5x if we have 5 increases in a row, let's not */
	if (rack->rc_gp_incr &&
	    ((rack->rc_gp_timely_inc_cnt + 1) >= RACK_TIMELY_CNT_BOOST)) {
		/*
		 * Reset and get 5 strokes more before the boost. Note
		 * that the count is 0 based so we have to add one.
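		 * (When the boost does fire, the step below becomes
		 * rack_gp_increase_per times RACK_TIMELY_CNT_BOOST instead of
		 * the usual single rack_gp_increase_per step.)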
3929 */ 3930 extra_boost: 3931 plus = (uint32_t)rack_gp_increase_per * RACK_TIMELY_CNT_BOOST; 3932 rack->rc_gp_timely_inc_cnt = 0; 3933 } else 3934 plus = (uint32_t)rack_gp_increase_per; 3935 /* Must be at least 1% increase for true timely increases */ 3936 if ((plus < 1) && 3937 ((rack->r_ctl.rc_rtt_diff <= 0) || (timely_says <= 0))) 3938 plus = 1; 3939 if (rack->rc_gp_saw_rec && 3940 (rack->rc_gp_no_rec_chg == 0) && 3941 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 3942 rack->r_ctl.rack_per_of_gp_rec)) { 3943 /* We have been in recovery ding it too */ 3944 calc = rack->r_ctl.rack_per_of_gp_rec + plus; 3945 if (calc > 0xffff) 3946 calc = 0xffff; 3947 logged |= 1; 3948 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)calc; 3949 if (rack->r_ctl.rack_per_upper_bound_ca && 3950 (rack->rc_dragged_bottom == 0) && 3951 (rack->r_ctl.rack_per_of_gp_rec > rack->r_ctl.rack_per_upper_bound_ca)) 3952 rack->r_ctl.rack_per_of_gp_rec = rack->r_ctl.rack_per_upper_bound_ca; 3953 } 3954 if (rack->rc_gp_saw_ca && 3955 (rack->rc_gp_saw_ss == 0) && 3956 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 3957 rack->r_ctl.rack_per_of_gp_ca)) { 3958 /* In CA */ 3959 calc = rack->r_ctl.rack_per_of_gp_ca + plus; 3960 if (calc > 0xffff) 3961 calc = 0xffff; 3962 logged |= 2; 3963 rack->r_ctl.rack_per_of_gp_ca = (uint16_t)calc; 3964 if (rack->r_ctl.rack_per_upper_bound_ca && 3965 (rack->rc_dragged_bottom == 0) && 3966 (rack->r_ctl.rack_per_of_gp_ca > rack->r_ctl.rack_per_upper_bound_ca)) 3967 rack->r_ctl.rack_per_of_gp_ca = rack->r_ctl.rack_per_upper_bound_ca; 3968 } 3969 if (rack->rc_gp_saw_ss && 3970 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 3971 rack->r_ctl.rack_per_of_gp_ss)) { 3972 /* In SS */ 3973 calc = rack->r_ctl.rack_per_of_gp_ss + plus; 3974 if (calc > 0xffff) 3975 calc = 0xffff; 3976 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)calc; 3977 if (rack->r_ctl.rack_per_upper_bound_ss && 3978 (rack->rc_dragged_bottom == 0) && 3979 (rack->r_ctl.rack_per_of_gp_ss > rack->r_ctl.rack_per_upper_bound_ss)) 3980 rack->r_ctl.rack_per_of_gp_ss = rack->r_ctl.rack_per_upper_bound_ss; 3981 logged |= 4; 3982 } 3983 if (logged && 3984 (rack->rc_gp_incr == 0)){ 3985 /* Go into increment mode */ 3986 rack->rc_gp_incr = 1; 3987 rack->rc_gp_timely_inc_cnt = 0; 3988 } 3989 if (rack->rc_gp_incr && 3990 logged && 3991 (rack->rc_gp_timely_inc_cnt < RACK_TIMELY_CNT_BOOST)) { 3992 rack->rc_gp_timely_inc_cnt++; 3993 } 3994 rack_log_timely(rack, logged, plus, 0, 0, 3995 __LINE__, 1); 3996 } 3997 3998 static uint32_t 3999 rack_get_decrease(struct tcp_rack *rack, uint32_t curper, int32_t rtt_diff) 4000 { 4001 /*- 4002 * norm_grad = rtt_diff / minrtt; 4003 * new_per = curper * (1 - B * norm_grad) 4004 * 4005 * B = rack_gp_decrease_per (default 80%) 4006 * rtt_dif = input var current rtt-diff 4007 * curper = input var current percentage 4008 * minrtt = from rack filter 4009 * 4010 * In order to do the floating point calculations above we 4011 * do an integer conversion. 
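	 * As a rough sanity check (hypothetical numbers): rtt_diff = 5,000
	 * usecs against a min rtt of 50,000 usecs gives norm_grad = 0.1;
	 * with the default B of 80% the result is curper * (1 - 0.8 * 0.1),
	 * i.e. 92% of the previous percentage.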
The code looks confusing so let me 4012 * translate it into something that use more variables and 4013 * is clearer for us humans :) 4014 * 4015 * uint64_t norm_grad, inverse, reduce_by, final_result; 4016 * uint32_t perf; 4017 * 4018 * norm_grad = (((uint64_t)rtt_diff * 1000000) / 4019 * (uint64_t)get_filter_small(&rack->r_ctl.rc_gp_min_rtt)); 4020 * inverse = ((uint64_t)rack_gp_decrease * (uint64_t)1000000) * norm_grad; 4021 * inverse /= 1000000; 4022 * reduce_by = (1000000 - inverse); 4023 * final_result = (cur_per * reduce_by) / 1000000; 4024 * perf = (uint32_t)final_result; 4025 */ 4026 uint64_t perf; 4027 4028 perf = (((uint64_t)curper * ((uint64_t)1000000 - 4029 ((uint64_t)rack_gp_decrease_per * (uint64_t)10000 * 4030 (((uint64_t)rtt_diff * (uint64_t)1000000)/ 4031 (uint64_t)get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)))/ 4032 (uint64_t)1000000)) / 4033 (uint64_t)1000000); 4034 if (perf > curper) { 4035 /* TSNH */ 4036 perf = curper - 1; 4037 } 4038 return ((uint32_t)perf); 4039 } 4040 4041 static uint32_t 4042 rack_decrease_highrtt(struct tcp_rack *rack, uint32_t curper, uint32_t rtt) 4043 { 4044 /* 4045 * highrttthresh 4046 * result = curper * (1 - (B * ( 1 - ------ )) 4047 * gp_srtt 4048 * 4049 * B = rack_gp_decrease_per (default .8 i.e. 80) 4050 * highrttthresh = filter_min * rack_gp_rtt_maxmul 4051 */ 4052 uint64_t perf; 4053 uint32_t highrttthresh; 4054 4055 highrttthresh = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; 4056 4057 perf = (((uint64_t)curper * ((uint64_t)1000000 - 4058 ((uint64_t)rack_gp_decrease_per * ((uint64_t)1000000 - 4059 ((uint64_t)highrttthresh * (uint64_t)1000000) / 4060 (uint64_t)rtt)) / 100)) /(uint64_t)1000000); 4061 if (tcp_bblogging_on(rack->rc_tp)) { 4062 uint64_t log1; 4063 4064 log1 = rtt; 4065 log1 <<= 32; 4066 log1 |= highrttthresh; 4067 rack_log_timely(rack, 4068 rack_gp_decrease_per, 4069 (uint64_t)curper, 4070 log1, 4071 perf, 4072 __LINE__, 4073 15); 4074 } 4075 return (perf); 4076 } 4077 4078 static void 4079 rack_decrease_bw_mul(struct tcp_rack *rack, int timely_says, uint32_t rtt, int32_t rtt_diff) 4080 { 4081 uint64_t logvar, logvar2, logvar3; 4082 uint32_t logged, new_per, ss_red, ca_red, rec_red, alt, val; 4083 4084 if (rack->rc_skip_timely) 4085 return; 4086 if (rack->rc_gp_incr) { 4087 /* Turn off increment counting */ 4088 rack->rc_gp_incr = 0; 4089 rack->rc_gp_timely_inc_cnt = 0; 4090 } 4091 ss_red = ca_red = rec_red = 0; 4092 logged = 0; 4093 /* Calculate the reduction value */ 4094 if (rtt_diff < 0) { 4095 rtt_diff *= -1; 4096 } 4097 /* Must be at least 1% reduction */ 4098 if (rack->rc_gp_saw_rec && (rack->rc_gp_no_rec_chg == 0)) { 4099 /* We have been in recovery ding it too */ 4100 if (timely_says == 2) { 4101 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_rec, rtt); 4102 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 4103 if (alt < new_per) 4104 val = alt; 4105 else 4106 val = new_per; 4107 } else 4108 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 4109 if (rack->r_ctl.rack_per_of_gp_rec > val) { 4110 rec_red = (rack->r_ctl.rack_per_of_gp_rec - val); 4111 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)val; 4112 } else { 4113 rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; 4114 rec_red = 0; 4115 } 4116 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_rec) 4117 rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; 4118 logged |= 1; 4119 } 4120 if (rack->rc_gp_saw_ss) { 4121 /* Sent in SS */ 4122 if 
(timely_says == 2) { 4123 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ss, rtt); 4124 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff); 4125 if (alt < new_per) 4126 val = alt; 4127 else 4128 val = new_per; 4129 } else 4130 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff); 4131 if (rack->r_ctl.rack_per_of_gp_ss > new_per) { 4132 ss_red = rack->r_ctl.rack_per_of_gp_ss - val; 4133 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)val; 4134 } else { 4135 ss_red = new_per; 4136 rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; 4137 logvar = new_per; 4138 logvar <<= 32; 4139 logvar |= alt; 4140 logvar2 = (uint32_t)rtt; 4141 logvar2 <<= 32; 4142 logvar2 |= (uint32_t)rtt_diff; 4143 logvar3 = rack_gp_rtt_maxmul; 4144 logvar3 <<= 32; 4145 logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 4146 rack_log_timely(rack, timely_says, 4147 logvar2, logvar3, 4148 logvar, __LINE__, 10); 4149 } 4150 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ss) 4151 rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; 4152 logged |= 4; 4153 } else if (rack->rc_gp_saw_ca) { 4154 /* Sent in CA */ 4155 if (timely_says == 2) { 4156 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ca, rtt); 4157 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff); 4158 if (alt < new_per) 4159 val = alt; 4160 else 4161 val = new_per; 4162 } else 4163 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff); 4164 if (rack->r_ctl.rack_per_of_gp_ca > val) { 4165 ca_red = rack->r_ctl.rack_per_of_gp_ca - val; 4166 rack->r_ctl.rack_per_of_gp_ca = (uint16_t)val; 4167 } else { 4168 rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; 4169 ca_red = 0; 4170 logvar = new_per; 4171 logvar <<= 32; 4172 logvar |= alt; 4173 logvar2 = (uint32_t)rtt; 4174 logvar2 <<= 32; 4175 logvar2 |= (uint32_t)rtt_diff; 4176 logvar3 = rack_gp_rtt_maxmul; 4177 logvar3 <<= 32; 4178 logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 4179 rack_log_timely(rack, timely_says, 4180 logvar2, logvar3, 4181 logvar, __LINE__, 10); 4182 } 4183 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ca) 4184 rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; 4185 logged |= 2; 4186 } 4187 if (rack->rc_gp_timely_dec_cnt < 0x7) { 4188 rack->rc_gp_timely_dec_cnt++; 4189 if (rack_timely_dec_clear && 4190 (rack->rc_gp_timely_dec_cnt == rack_timely_dec_clear)) 4191 rack->rc_gp_timely_dec_cnt = 0; 4192 } 4193 logvar = ss_red; 4194 logvar <<= 32; 4195 logvar |= ca_red; 4196 rack_log_timely(rack, logged, rec_red, rack_per_lower_bound, logvar, 4197 __LINE__, 2); 4198 } 4199 4200 static void 4201 rack_log_rtt_shrinks(struct tcp_rack *rack, uint32_t us_cts, 4202 uint32_t rtt, uint32_t line, uint8_t reas) 4203 { 4204 if (tcp_bblogging_on(rack->rc_tp)) { 4205 union tcp_log_stackspecific log; 4206 struct timeval tv; 4207 4208 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 4209 log.u_bbr.flex1 = line; 4210 log.u_bbr.flex2 = rack->r_ctl.rc_time_probertt_starts; 4211 log.u_bbr.flex3 = rack->r_ctl.rc_lower_rtt_us_cts; 4212 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; 4213 log.u_bbr.flex5 = rtt; 4214 log.u_bbr.flex6 = rack->rc_highly_buffered; 4215 log.u_bbr.flex6 <<= 1; 4216 log.u_bbr.flex6 |= rack->forced_ack; 4217 log.u_bbr.flex6 <<= 1; 4218 log.u_bbr.flex6 |= rack->rc_gp_dyn_mul; 4219 log.u_bbr.flex6 <<= 1; 4220 log.u_bbr.flex6 |= rack->in_probe_rtt; 4221 log.u_bbr.flex6 <<= 1; 4222 log.u_bbr.flex6 |= rack->measure_saw_probe_rtt; 
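		/*
		 * flex6 now carries, from its high packed bit down to bit 0:
		 * rc_highly_buffered, forced_ack, rc_gp_dyn_mul, in_probe_rtt
		 * and measure_saw_probe_rtt.
		 */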
4223 log.u_bbr.flex7 = rack->r_ctl.rack_per_of_gp_probertt; 4224 log.u_bbr.pacing_gain = rack->r_ctl.rack_per_of_gp_ca; 4225 log.u_bbr.cwnd_gain = rack->r_ctl.rack_per_of_gp_rec; 4226 log.u_bbr.flex8 = reas; 4227 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 4228 log.u_bbr.delRate = rack_get_bw(rack); 4229 log.u_bbr.cur_del_rate = rack->r_ctl.rc_highest_us_rtt; 4230 log.u_bbr.cur_del_rate <<= 32; 4231 log.u_bbr.cur_del_rate |= rack->r_ctl.rc_lowest_us_rtt; 4232 log.u_bbr.applimited = rack->r_ctl.rc_time_probertt_entered; 4233 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; 4234 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 4235 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; 4236 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; 4237 log.u_bbr.pkt_epoch = rack->r_ctl.rc_lower_rtt_us_cts; 4238 log.u_bbr.delivered = rack->r_ctl.rc_target_probertt_flight; 4239 log.u_bbr.lost = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 4240 log.u_bbr.rttProp = us_cts; 4241 log.u_bbr.rttProp <<= 32; 4242 log.u_bbr.rttProp |= rack->r_ctl.rc_entry_gp_rtt; 4243 TCP_LOG_EVENTP(rack->rc_tp, NULL, 4244 &rack->rc_inp->inp_socket->so_rcv, 4245 &rack->rc_inp->inp_socket->so_snd, 4246 BBR_LOG_RTT_SHRINKS, 0, 4247 0, &log, false, &rack->r_ctl.act_rcv_time); 4248 } 4249 } 4250 4251 static void 4252 rack_set_prtt_target(struct tcp_rack *rack, uint32_t segsiz, uint32_t rtt) 4253 { 4254 uint64_t bwdp; 4255 4256 bwdp = rack_get_bw(rack); 4257 bwdp *= (uint64_t)rtt; 4258 bwdp /= (uint64_t)HPTS_USEC_IN_SEC; 4259 rack->r_ctl.rc_target_probertt_flight = roundup((uint32_t)bwdp, segsiz); 4260 if (rack->r_ctl.rc_target_probertt_flight < (segsiz * rack_timely_min_segs)) { 4261 /* 4262 * A window protocol must be able to have 4 packets 4263 * outstanding as the floor in order to function 4264 * (especially considering delayed ack :D). 4265 */ 4266 rack->r_ctl.rc_target_probertt_flight = (segsiz * rack_timely_min_segs); 4267 } 4268 } 4269 4270 static void 4271 rack_enter_probertt(struct tcp_rack *rack, uint32_t us_cts) 4272 { 4273 /** 4274 * ProbeRTT is a bit different in rack_pacing than in 4275 * BBR. It is like BBR in that it uses the lowering of 4276 * the RTT as a signal that we saw something new and 4277 * counts from there for how long between. But it is 4278 * different in that its quite simple. It does not 4279 * play with the cwnd and wait until we get down 4280 * to N segments outstanding and hold that for 4281 * 200ms. Instead it just sets the pacing reduction 4282 * rate to a set percentage (70 by default) and hold 4283 * that for a number of recent GP Srtt's. 4284 */ 4285 uint32_t segsiz; 4286 4287 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 4288 if (rack->rc_gp_dyn_mul == 0) 4289 return; 4290 4291 if (rack->rc_tp->snd_max == rack->rc_tp->snd_una) { 4292 /* We are idle */ 4293 return; 4294 } 4295 if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && 4296 SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { 4297 /* 4298 * Stop the goodput now, the idea here is 4299 * that future measurements with in_probe_rtt 4300 * won't register if they are not greater so 4301 * we want to get what info (if any) is available 4302 * now. 
4303 */ 4304 rack_do_goodput_measurement(rack->rc_tp, rack, 4305 rack->rc_tp->snd_una, __LINE__, 4306 RACK_QUALITY_PROBERTT); 4307 } 4308 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 4309 rack->r_ctl.rc_time_probertt_entered = us_cts; 4310 segsiz = min(ctf_fixed_maxseg(rack->rc_tp), 4311 rack->r_ctl.rc_pace_min_segs); 4312 rack->in_probe_rtt = 1; 4313 rack->measure_saw_probe_rtt = 1; 4314 rack->r_ctl.rc_time_probertt_starts = 0; 4315 rack->r_ctl.rc_entry_gp_rtt = rack->r_ctl.rc_gp_srtt; 4316 if (rack_probertt_use_min_rtt_entry) 4317 rack_set_prtt_target(rack, segsiz, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); 4318 else 4319 rack_set_prtt_target(rack, segsiz, rack->r_ctl.rc_gp_srtt); 4320 rack_log_rtt_shrinks(rack, us_cts, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4321 __LINE__, RACK_RTTS_ENTERPROBE); 4322 } 4323 4324 static void 4325 rack_exit_probertt(struct tcp_rack *rack, uint32_t us_cts) 4326 { 4327 struct rack_sendmap *rsm; 4328 uint32_t segsiz; 4329 4330 segsiz = min(ctf_fixed_maxseg(rack->rc_tp), 4331 rack->r_ctl.rc_pace_min_segs); 4332 rack->in_probe_rtt = 0; 4333 if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && 4334 SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { 4335 /* 4336 * Stop the goodput now, the idea here is 4337 * that future measurements with in_probe_rtt 4338 * won't register if they are not greater so 4339 * we want to get what info (if any) is available 4340 * now. 4341 */ 4342 rack_do_goodput_measurement(rack->rc_tp, rack, 4343 rack->rc_tp->snd_una, __LINE__, 4344 RACK_QUALITY_PROBERTT); 4345 } else if (rack->rc_tp->t_flags & TF_GPUTINPROG) { 4346 /* 4347 * We don't have enough data to make a measurement. 4348 * So lets just stop and start here after exiting 4349 * probe-rtt. We probably are not interested in 4350 * the results anyway. 4351 */ 4352 rack->rc_tp->t_flags &= ~TF_GPUTINPROG; 4353 } 4354 /* 4355 * Measurements through the current snd_max are going 4356 * to be limited by the slower pacing rate. 4357 * 4358 * We need to mark these as app-limited so we 4359 * don't collapse the b/w. 4360 */ 4361 rsm = tqhash_max(rack->r_ctl.tqh); 4362 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { 4363 if (rack->r_ctl.rc_app_limited_cnt == 0) 4364 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; 4365 else { 4366 /* 4367 * Go out to the end app limited and mark 4368 * this new one as next and move the end_appl up 4369 * to this guy. 4370 */ 4371 if (rack->r_ctl.rc_end_appl) 4372 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; 4373 rack->r_ctl.rc_end_appl = rsm; 4374 } 4375 rsm->r_flags |= RACK_APP_LIMITED; 4376 rack->r_ctl.rc_app_limited_cnt++; 4377 } 4378 /* 4379 * Now, we need to examine our pacing rate multipliers. 4380 * If its under 100%, we need to kick it back up to 4381 * 100%. We also don't let it be over our "max" above 4382 * the actual rate i.e. 100% + rack_clamp_atexit_prtt. 4383 * Note setting clamp_atexit_prtt to 0 has the effect 4384 * of setting CA/SS to 100% always at exit (which is 4385 * the default behavior). 4386 */ 4387 if (rack_probertt_clear_is) { 4388 rack->rc_gp_incr = 0; 4389 rack->rc_gp_bwred = 0; 4390 rack->rc_gp_timely_inc_cnt = 0; 4391 rack->rc_gp_timely_dec_cnt = 0; 4392 } 4393 /* Do we do any clamping at exit? 
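	 * If the path looked highly buffered we pin CA/SS to
	 * rack_atexit_prtt_hbp, otherwise to rack_atexit_prtt, whenever
	 * the respective knob is non-zero.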
*/ 4394 if (rack->rc_highly_buffered && rack_atexit_prtt_hbp) { 4395 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt_hbp; 4396 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt_hbp; 4397 } 4398 if ((rack->rc_highly_buffered == 0) && rack_atexit_prtt) { 4399 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt; 4400 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt; 4401 } 4402 /* 4403 * Lets set rtt_diff to 0, so that we will get a "boost" 4404 * after exiting. 4405 */ 4406 rack->r_ctl.rc_rtt_diff = 0; 4407 4408 /* Clear all flags so we start fresh */ 4409 rack->rc_tp->t_bytes_acked = 0; 4410 rack->rc_tp->t_ccv.flags &= ~CCF_ABC_SENTAWND; 4411 /* 4412 * If configured to, set the cwnd and ssthresh to 4413 * our targets. 4414 */ 4415 if (rack_probe_rtt_sets_cwnd) { 4416 uint64_t ebdp; 4417 uint32_t setto; 4418 4419 /* Set ssthresh so we get into CA once we hit our target */ 4420 if (rack_probertt_use_min_rtt_exit == 1) { 4421 /* Set to min rtt */ 4422 rack_set_prtt_target(rack, segsiz, 4423 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); 4424 } else if (rack_probertt_use_min_rtt_exit == 2) { 4425 /* Set to current gp rtt */ 4426 rack_set_prtt_target(rack, segsiz, 4427 rack->r_ctl.rc_gp_srtt); 4428 } else if (rack_probertt_use_min_rtt_exit == 3) { 4429 /* Set to entry gp rtt */ 4430 rack_set_prtt_target(rack, segsiz, 4431 rack->r_ctl.rc_entry_gp_rtt); 4432 } else { 4433 uint64_t sum; 4434 uint32_t setval; 4435 4436 sum = rack->r_ctl.rc_entry_gp_rtt; 4437 sum *= 10; 4438 sum /= (uint64_t)(max(1, rack->r_ctl.rc_gp_srtt)); 4439 if (sum >= 20) { 4440 /* 4441 * A highly buffered path needs 4442 * cwnd space for timely to work. 4443 * Lets set things up as if 4444 * we are heading back here again. 4445 */ 4446 setval = rack->r_ctl.rc_entry_gp_rtt; 4447 } else if (sum >= 15) { 4448 /* 4449 * Lets take the smaller of the 4450 * two since we are just somewhat 4451 * buffered. 4452 */ 4453 setval = rack->r_ctl.rc_gp_srtt; 4454 if (setval > rack->r_ctl.rc_entry_gp_rtt) 4455 setval = rack->r_ctl.rc_entry_gp_rtt; 4456 } else { 4457 /* 4458 * Here we are not highly buffered 4459 * and should pick the min we can to 4460 * keep from causing loss. 
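			 * (For example, an entry gp rtt of 12 ms against a current
			 * gp srtt of 10 ms gives sum = 12, which lands us here
			 * rather than in the two buffered cases above.)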
4461 */ 4462 setval = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 4463 } 4464 rack_set_prtt_target(rack, segsiz, 4465 setval); 4466 } 4467 if (rack_probe_rtt_sets_cwnd > 1) { 4468 /* There is a percentage here to boost */ 4469 ebdp = rack->r_ctl.rc_target_probertt_flight; 4470 ebdp *= rack_probe_rtt_sets_cwnd; 4471 ebdp /= 100; 4472 setto = rack->r_ctl.rc_target_probertt_flight + ebdp; 4473 } else 4474 setto = rack->r_ctl.rc_target_probertt_flight; 4475 rack->rc_tp->snd_cwnd = roundup(setto, segsiz); 4476 if (rack->rc_tp->snd_cwnd < (segsiz * rack_timely_min_segs)) { 4477 /* Enforce a min */ 4478 rack->rc_tp->snd_cwnd = segsiz * rack_timely_min_segs; 4479 } 4480 /* If we set in the cwnd also set the ssthresh point so we are in CA */ 4481 rack->rc_tp->snd_ssthresh = (rack->rc_tp->snd_cwnd - 1); 4482 } 4483 rack_log_rtt_shrinks(rack, us_cts, 4484 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4485 __LINE__, RACK_RTTS_EXITPROBE); 4486 /* Clear times last so log has all the info */ 4487 rack->r_ctl.rc_probertt_sndmax_atexit = rack->rc_tp->snd_max; 4488 rack->r_ctl.rc_time_probertt_entered = us_cts; 4489 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 4490 rack->r_ctl.rc_time_of_last_probertt = us_cts; 4491 } 4492 4493 static void 4494 rack_check_probe_rtt(struct tcp_rack *rack, uint32_t us_cts) 4495 { 4496 /* Check in on probe-rtt */ 4497 4498 if (rack->rc_gp_filled == 0) { 4499 /* We do not do p-rtt unless we have gp measurements */ 4500 return; 4501 } 4502 if (rack->in_probe_rtt) { 4503 uint64_t no_overflow; 4504 uint32_t endtime, must_stay; 4505 4506 if (rack->r_ctl.rc_went_idle_time && 4507 ((us_cts - rack->r_ctl.rc_went_idle_time) > rack_min_probertt_hold)) { 4508 /* 4509 * We went idle during prtt, just exit now. 4510 */ 4511 rack_exit_probertt(rack, us_cts); 4512 } else if (rack_probe_rtt_safety_val && 4513 TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered) && 4514 ((us_cts - rack->r_ctl.rc_time_probertt_entered) > rack_probe_rtt_safety_val)) { 4515 /* 4516 * Probe RTT safety value triggered! 4517 */ 4518 rack_log_rtt_shrinks(rack, us_cts, 4519 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4520 __LINE__, RACK_RTTS_SAFETY); 4521 rack_exit_probertt(rack, us_cts); 4522 } 4523 /* Calculate the max we will wait */ 4524 endtime = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_max_drain_wait); 4525 if (rack->rc_highly_buffered) 4526 endtime += (rack->r_ctl.rc_gp_srtt * rack_max_drain_hbp); 4527 /* Calculate the min we must wait */ 4528 must_stay = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_must_drain); 4529 if ((ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.rc_target_probertt_flight) && 4530 TSTMP_LT(us_cts, endtime)) { 4531 uint32_t calc; 4532 /* Do we lower more? 
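			 * Every full gp_srtt we have already spent in probe-rtt
			 * shaves another rack_per_of_gp_probertt_reduce percent off
			 * the probe-rtt pacing percentage, never dropping below
			 * rack_per_of_gp_lowthresh.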
*/ 4533 no_exit: 4534 if (TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered)) 4535 calc = us_cts - rack->r_ctl.rc_time_probertt_entered; 4536 else 4537 calc = 0; 4538 calc /= max(rack->r_ctl.rc_gp_srtt, 1); 4539 if (calc) { 4540 /* Maybe */ 4541 calc *= rack_per_of_gp_probertt_reduce; 4542 if (calc > rack_per_of_gp_probertt) 4543 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh; 4544 else 4545 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc; 4546 /* Limit it too */ 4547 if (rack->r_ctl.rack_per_of_gp_probertt < rack_per_of_gp_lowthresh) 4548 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh; 4549 } 4550 /* We must reach target or the time set */ 4551 return; 4552 } 4553 if (rack->r_ctl.rc_time_probertt_starts == 0) { 4554 if ((TSTMP_LT(us_cts, must_stay) && 4555 rack->rc_highly_buffered) || 4556 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > 4557 rack->r_ctl.rc_target_probertt_flight)) { 4558 /* We are not past the must_stay time */ 4559 goto no_exit; 4560 } 4561 rack_log_rtt_shrinks(rack, us_cts, 4562 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4563 __LINE__, RACK_RTTS_REACHTARGET); 4564 rack->r_ctl.rc_time_probertt_starts = us_cts; 4565 if (rack->r_ctl.rc_time_probertt_starts == 0) 4566 rack->r_ctl.rc_time_probertt_starts = 1; 4567 /* Restore back to our rate we want to pace at in prtt */ 4568 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 4569 } 4570 /* 4571 * Setup our end time, some number of gp_srtts plus 200ms. 4572 */ 4573 no_overflow = ((uint64_t)rack->r_ctl.rc_gp_srtt * 4574 (uint64_t)rack_probertt_gpsrtt_cnt_mul); 4575 if (rack_probertt_gpsrtt_cnt_div) 4576 endtime = (uint32_t)(no_overflow / (uint64_t)rack_probertt_gpsrtt_cnt_div); 4577 else 4578 endtime = 0; 4579 endtime += rack_min_probertt_hold; 4580 endtime += rack->r_ctl.rc_time_probertt_starts; 4581 if (TSTMP_GEQ(us_cts, endtime)) { 4582 /* yes, exit probertt */ 4583 rack_exit_probertt(rack, us_cts); 4584 } 4585 4586 } else if ((rack->rc_skip_timely == 0) && 4587 (TSTMP_GT(us_cts, rack->r_ctl.rc_lower_rtt_us_cts)) && 4588 ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt)) { 4589 /* Go into probertt, its been too long since we went lower */ 4590 rack_enter_probertt(rack, us_cts); 4591 } 4592 } 4593 4594 static void 4595 rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last_bw_est, 4596 uint32_t rtt, int32_t rtt_diff) 4597 { 4598 uint64_t cur_bw, up_bnd, low_bnd, subfr; 4599 uint32_t losses; 4600 4601 if ((rack->rc_gp_dyn_mul == 0) || 4602 (rack->use_fixed_rate) || 4603 (rack->in_probe_rtt) || 4604 (rack->rc_always_pace == 0)) { 4605 /* No dynamic GP multiplier in play */ 4606 return; 4607 } 4608 losses = rack->r_ctl.rc_loss_count - rack->r_ctl.rc_loss_at_start; 4609 cur_bw = rack_get_bw(rack); 4610 /* Calculate our up and down range */ 4611 up_bnd = rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_up; 4612 up_bnd /= 100; 4613 up_bnd += rack->r_ctl.last_gp_comp_bw; 4614 4615 subfr = (uint64_t)rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_down; 4616 subfr /= 100; 4617 low_bnd = rack->r_ctl.last_gp_comp_bw - subfr; 4618 if ((timely_says == 2) && (rack->r_ctl.rc_no_push_at_mrtt)) { 4619 /* 4620 * This is the case where our RTT is above 4621 * the max target and we have been configured 4622 * to just do timely no bonus up stuff in that case. 4623 * 4624 * There are two configurations, set to 1, and we 4625 * just do timely if we are over our max. 
If its 4626 * set above 1 then we slam the multipliers down 4627 * to 100 and then decrement per timely. 4628 */ 4629 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 4630 __LINE__, 3); 4631 if (rack->r_ctl.rc_no_push_at_mrtt > 1) 4632 rack_validate_multipliers_at_or_below_100(rack); 4633 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); 4634 } else if ((timely_says != 0) && (last_bw_est < low_bnd) && !losses) { 4635 /* 4636 * We are decreasing this is a bit complicated this 4637 * means we are loosing ground. This could be 4638 * because another flow entered and we are competing 4639 * for b/w with it. This will push the RTT up which 4640 * makes timely unusable unless we want to get shoved 4641 * into a corner and just be backed off (the age 4642 * old problem with delay based CC). 4643 * 4644 * On the other hand if it was a route change we 4645 * would like to stay somewhat contained and not 4646 * blow out the buffers. 4647 */ 4648 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 4649 __LINE__, 3); 4650 rack->r_ctl.last_gp_comp_bw = cur_bw; 4651 if (rack->rc_gp_bwred == 0) { 4652 /* Go into reduction counting */ 4653 rack->rc_gp_bwred = 1; 4654 rack->rc_gp_timely_dec_cnt = 0; 4655 } 4656 if (rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) { 4657 /* 4658 * Push another time with a faster pacing 4659 * to try to gain back (we include override to 4660 * get a full raise factor). 4661 */ 4662 if ((rack->rc_gp_saw_ca && rack->r_ctl.rack_per_of_gp_ca <= rack_down_raise_thresh) || 4663 (rack->rc_gp_saw_ss && rack->r_ctl.rack_per_of_gp_ss <= rack_down_raise_thresh) || 4664 (timely_says == 0) || 4665 (rack_down_raise_thresh == 0)) { 4666 /* 4667 * Do an override up in b/w if we were 4668 * below the threshold or if the threshold 4669 * is zero we always do the raise. 4670 */ 4671 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 1); 4672 } else { 4673 /* Log it stays the same */ 4674 rack_log_timely(rack, 0, last_bw_est, low_bnd, 0, 4675 __LINE__, 11); 4676 } 4677 rack->rc_gp_timely_dec_cnt++; 4678 /* We are not incrementing really no-count */ 4679 rack->rc_gp_incr = 0; 4680 rack->rc_gp_timely_inc_cnt = 0; 4681 } else { 4682 /* 4683 * Lets just use the RTT 4684 * information and give up 4685 * pushing. 4686 */ 4687 goto use_timely; 4688 } 4689 } else if ((timely_says != 2) && 4690 !losses && 4691 (last_bw_est > up_bnd)) { 4692 /* 4693 * We are increasing b/w lets keep going, updating 4694 * our b/w and ignoring any timely input, unless 4695 * of course we are at our max raise (if there is one). 4696 */ 4697 4698 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 4699 __LINE__, 3); 4700 rack->r_ctl.last_gp_comp_bw = cur_bw; 4701 if (rack->rc_gp_saw_ss && 4702 rack->r_ctl.rack_per_upper_bound_ss && 4703 (rack->r_ctl.rack_per_of_gp_ss == rack->r_ctl.rack_per_upper_bound_ss)) { 4704 /* 4705 * In cases where we can't go higher 4706 * we should just use timely. 4707 */ 4708 goto use_timely; 4709 } 4710 if (rack->rc_gp_saw_ca && 4711 rack->r_ctl.rack_per_upper_bound_ca && 4712 (rack->r_ctl.rack_per_of_gp_ca == rack->r_ctl.rack_per_upper_bound_ca)) { 4713 /* 4714 * In cases where we can't go higher 4715 * we should just use timely. 
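 * (Here "can't go higher" means rack_per_of_gp_ca has already reached the
 * configured rack_per_upper_bound_ca, so b/w driven raises are capped and
 * only the timely RTT signal can still move the multiplier, downward.)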
4716 */ 4717 goto use_timely; 4718 } 4719 rack->rc_gp_bwred = 0; 4720 rack->rc_gp_timely_dec_cnt = 0; 4721 /* You get a set number of pushes if timely is trying to reduce */ 4722 if ((rack->rc_gp_incr < rack_timely_max_push_rise) || (timely_says == 0)) { 4723 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 4724 } else { 4725 /* Log it stays the same */ 4726 rack_log_timely(rack, 0, last_bw_est, up_bnd, 0, 4727 __LINE__, 12); 4728 } 4729 return; 4730 } else { 4731 /* 4732 * We are staying between the lower and upper range bounds 4733 * so use timely to decide. 4734 */ 4735 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 4736 __LINE__, 3); 4737 use_timely: 4738 if (timely_says) { 4739 rack->rc_gp_incr = 0; 4740 rack->rc_gp_timely_inc_cnt = 0; 4741 if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) && 4742 !losses && 4743 (last_bw_est < low_bnd)) { 4744 /* We are loosing ground */ 4745 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 4746 rack->rc_gp_timely_dec_cnt++; 4747 /* We are not incrementing really no-count */ 4748 rack->rc_gp_incr = 0; 4749 rack->rc_gp_timely_inc_cnt = 0; 4750 } else 4751 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); 4752 } else { 4753 rack->rc_gp_bwred = 0; 4754 rack->rc_gp_timely_dec_cnt = 0; 4755 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 4756 } 4757 } 4758 } 4759 4760 static int32_t 4761 rack_make_timely_judgement(struct tcp_rack *rack, uint32_t rtt, int32_t rtt_diff, uint32_t prev_rtt) 4762 { 4763 int32_t timely_says; 4764 uint64_t log_mult, log_rtt_a_diff; 4765 4766 log_rtt_a_diff = rtt; 4767 log_rtt_a_diff <<= 32; 4768 log_rtt_a_diff |= (uint32_t)rtt_diff; 4769 if (rtt >= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * 4770 rack_gp_rtt_maxmul)) { 4771 /* Reduce the b/w multiplier */ 4772 timely_says = 2; 4773 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; 4774 log_mult <<= 32; 4775 log_mult |= prev_rtt; 4776 rack_log_timely(rack, timely_says, log_mult, 4777 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4778 log_rtt_a_diff, __LINE__, 4); 4779 } else if (rtt <= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) + 4780 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) / 4781 max(rack_gp_rtt_mindiv , 1)))) { 4782 /* Increase the b/w multiplier */ 4783 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) + 4784 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) / 4785 max(rack_gp_rtt_mindiv , 1)); 4786 log_mult <<= 32; 4787 log_mult |= prev_rtt; 4788 timely_says = 0; 4789 rack_log_timely(rack, timely_says, log_mult , 4790 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4791 log_rtt_a_diff, __LINE__, 5); 4792 } else { 4793 /* 4794 * Use a gradient to find it the timely gradient 4795 * is: 4796 * grad = rc_rtt_diff / min_rtt; 4797 * 4798 * anything below or equal to 0 will be 4799 * a increase indication. Anything above 4800 * zero is a decrease. Note we take care 4801 * of the actual gradient calculation 4802 * in the reduction (its not needed for 4803 * increase). 
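 * A numeric sketch with made-up values: if min_rtt is 10000 usec and
 * rc_rtt_diff is -500 the gradient is negative, so timely_says = 0 and we
 * lean toward raising the multiplier; with rc_rtt_diff at +2000 the
 * gradient is positive, timely_says = 1, and the decrease path (which does
 * the actual gradient arithmetic) is taken.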
4804 */ 4805 log_mult = prev_rtt; 4806 if (rtt_diff <= 0) { 4807 /* 4808 * Rttdiff is less than zero, increase the 4809 * b/w multiplier (its 0 or negative) 4810 */ 4811 timely_says = 0; 4812 rack_log_timely(rack, timely_says, log_mult, 4813 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 6); 4814 } else { 4815 /* Reduce the b/w multiplier */ 4816 timely_says = 1; 4817 rack_log_timely(rack, timely_says, log_mult, 4818 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 7); 4819 } 4820 } 4821 return (timely_says); 4822 } 4823 4824 static __inline int 4825 rack_in_gp_window(struct tcpcb *tp, struct rack_sendmap *rsm) 4826 { 4827 if (SEQ_GEQ(rsm->r_start, tp->gput_seq) && 4828 SEQ_LEQ(rsm->r_end, tp->gput_ack)) { 4829 /** 4830 * This covers the case that the 4831 * resent is completely inside 4832 * the gp range or up to it. 4833 * |----------------| 4834 * |-----| <or> 4835 * |----| 4836 * <or> |---| 4837 */ 4838 return (1); 4839 } else if (SEQ_LT(rsm->r_start, tp->gput_seq) && 4840 SEQ_GT(rsm->r_end, tp->gput_seq)){ 4841 /** 4842 * This covers the case of 4843 * |--------------| 4844 * |-------->| 4845 */ 4846 return (1); 4847 } else if (SEQ_GEQ(rsm->r_start, tp->gput_seq) && 4848 SEQ_LT(rsm->r_start, tp->gput_ack) && 4849 SEQ_GEQ(rsm->r_end, tp->gput_ack)) { 4850 4851 /** 4852 * This covers the case of 4853 * |--------------| 4854 * |-------->| 4855 */ 4856 return (1); 4857 } 4858 return (0); 4859 } 4860 4861 static __inline void 4862 rack_mark_in_gp_win(struct tcpcb *tp, struct rack_sendmap *rsm) 4863 { 4864 4865 if ((tp->t_flags & TF_GPUTINPROG) == 0) 4866 return; 4867 /* 4868 * We have a Goodput measurement in progress. Mark 4869 * the send if its within the window. If its not 4870 * in the window make sure it does not have the mark. 4871 */ 4872 if (rack_in_gp_window(tp, rsm)) 4873 rsm->r_flags |= RACK_IN_GP_WIN; 4874 else 4875 rsm->r_flags &= ~RACK_IN_GP_WIN; 4876 } 4877 4878 static __inline void 4879 rack_clear_gp_marks(struct tcpcb *tp, struct tcp_rack *rack) 4880 { 4881 /* A GP measurement is ending, clear all marks on the send map*/ 4882 struct rack_sendmap *rsm = NULL; 4883 4884 rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq); 4885 if (rsm == NULL) { 4886 rsm = tqhash_min(rack->r_ctl.tqh); 4887 } 4888 /* Nothing left? */ 4889 while ((rsm != NULL) && (SEQ_GEQ(tp->gput_ack, rsm->r_start))){ 4890 rsm->r_flags &= ~RACK_IN_GP_WIN; 4891 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 4892 } 4893 } 4894 4895 4896 static __inline void 4897 rack_tend_gp_marks(struct tcpcb *tp, struct tcp_rack *rack) 4898 { 4899 struct rack_sendmap *rsm = NULL; 4900 4901 if (tp->snd_una == tp->snd_max) { 4902 /* Nothing outstanding yet, nothing to do here */ 4903 return; 4904 } 4905 if (SEQ_GT(tp->gput_seq, tp->snd_una)) { 4906 /* 4907 * We are measuring ahead of some outstanding 4908 * data. We need to walk through up until we get 4909 * to gp_seq marking so that no rsm is set incorrectly 4910 * with RACK_IN_GP_WIN. 4911 */ 4912 rsm = tqhash_min(rack->r_ctl.tqh); 4913 while (rsm != NULL) { 4914 rack_mark_in_gp_win(tp, rsm); 4915 if (SEQ_GEQ(rsm->r_end, tp->gput_seq)) 4916 break; 4917 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 4918 } 4919 } 4920 if (rsm == NULL) { 4921 /* 4922 * Need to find the GP seq, if rsm is 4923 * set we stopped as we hit it. 
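 * (rsm can be NULL here either because the walk above was skipped, when
 * gput_seq is at or below snd_una, or because the walk ran off the end of
 * the send map; either way look gput_seq up directly, and if that lookup
 * also misses there is nothing in the map to mark.)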
4924 */ 4925 rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq); 4926 if (rsm == NULL) 4927 return; 4928 rack_mark_in_gp_win(tp, rsm); 4929 } 4930 /* 4931 * Now we may need to mark already sent rsm, ahead of 4932 * gput_seq in the window since they may have been sent 4933 * *before* we started our measurment. The rsm, if non-null 4934 * has been marked (note if rsm would have been NULL we would have 4935 * returned in the previous block). So we go to the next, and continue 4936 * until we run out of entries or we exceed the gp_ack value. 4937 */ 4938 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 4939 while (rsm) { 4940 rack_mark_in_gp_win(tp, rsm); 4941 if (SEQ_GT(rsm->r_end, tp->gput_ack)) 4942 break; 4943 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 4944 } 4945 } 4946 4947 static void 4948 rack_log_gp_calc(struct tcp_rack *rack, uint32_t add_part, uint32_t sub_part, uint32_t srtt, uint64_t meas_bw, uint64_t utim, uint8_t meth, uint32_t line) 4949 { 4950 if (tcp_bblogging_on(rack->rc_tp)) { 4951 union tcp_log_stackspecific log; 4952 struct timeval tv; 4953 4954 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 4955 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 4956 log.u_bbr.flex1 = add_part; 4957 log.u_bbr.flex2 = sub_part; 4958 log.u_bbr.flex3 = rack_wma_divisor; 4959 log.u_bbr.flex4 = srtt; 4960 log.u_bbr.flex7 = (uint16_t)line; 4961 log.u_bbr.flex8 = meth; 4962 log.u_bbr.delRate = rack->r_ctl.gp_bw; 4963 log.u_bbr.cur_del_rate = meas_bw; 4964 log.u_bbr.rttProp = utim; 4965 TCP_LOG_EVENTP(rack->rc_tp, NULL, 4966 &rack->rc_inp->inp_socket->so_rcv, 4967 &rack->rc_inp->inp_socket->so_snd, 4968 BBR_LOG_THRESH_CALC, 0, 4969 0, &log, false, &rack->r_ctl.act_rcv_time); 4970 } 4971 } 4972 4973 static void 4974 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, 4975 tcp_seq th_ack, int line, uint8_t quality) 4976 { 4977 uint64_t tim, bytes_ps, stim, utim; 4978 uint32_t segsiz, bytes, reqbytes, us_cts; 4979 int32_t gput, new_rtt_diff, timely_says; 4980 uint64_t resid_bw, subpart = 0, addpart = 0, srtt; 4981 int did_add = 0; 4982 4983 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 4984 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 4985 if (TSTMP_GEQ(us_cts, tp->gput_ts)) 4986 tim = us_cts - tp->gput_ts; 4987 else 4988 tim = 0; 4989 if (rack->r_ctl.rc_gp_cumack_ts > rack->r_ctl.rc_gp_output_ts) 4990 stim = rack->r_ctl.rc_gp_cumack_ts - rack->r_ctl.rc_gp_output_ts; 4991 else 4992 stim = 0; 4993 /* 4994 * Use the larger of the send time or ack time. This prevents us 4995 * from being influenced by ack artifacts to come up with too 4996 * high of measurement. Note that since we are spanning over many more 4997 * bytes in most of our measurements hopefully that is less likely to 4998 * occur. 4999 */ 5000 if (tim > stim) 5001 utim = max(tim, 1); 5002 else 5003 utim = max(stim, 1); 5004 reqbytes = min(rc_init_window(rack), (MIN_GP_WIN * segsiz)); 5005 rack_log_gpset(rack, th_ack, us_cts, rack->r_ctl.rc_gp_cumack_ts, __LINE__, 3, NULL); 5006 if ((tim == 0) && (stim == 0)) { 5007 /* 5008 * Invalid measurement time, maybe 5009 * all on one ack/one send? 5010 */ 5011 bytes = 0; 5012 bytes_ps = 0; 5013 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 5014 0, 0, 0, 10, __LINE__, NULL, quality); 5015 goto skip_measurement; 5016 } 5017 if (rack->r_ctl.rc_gp_lowrtt == 0xffffffff) { 5018 /* We never made a us_rtt measurement? 
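 * (rc_gp_lowrtt starts every measurement window at 0xffffffff; seeing it
 * still unchanged means no us_rtt sample landed during the window, so the
 * BDP style cap computed below would be meaningless and the measurement is
 * skipped instead.)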
*/ 5019 bytes = 0; 5020 bytes_ps = 0; 5021 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 5022 0, 0, 0, 10, __LINE__, NULL, quality); 5023 goto skip_measurement; 5024 } 5025 /* 5026 * Calculate the maximum possible b/w this connection 5027 * could have. We base our calculation on the lowest 5028 * rtt we have seen during the measurement and the 5029 * largest rwnd the client has given us in that time. This 5030 * forms a BDP that is the maximum that we could ever 5031 * get to the client. Anything larger is not valid. 5032 * 5033 * I originally had code here that rejected measurements 5034 * where the time was less than 1/2 the latest us_rtt. 5035 * But after thinking on that I realized its wrong since 5036 * say you had a 150Mbps or even 1Gbps link, and you 5037 * were a long way away.. example I am in Europe (100ms rtt) 5038 * talking to my 1Gbps link in S.C. Now measuring say 150,000 5039 * bytes my time would be 1.2ms, and yet my rtt would say 5040 * the measurement was invalid the time was < 50ms. The 5041 * same thing is true for 150Mb (8ms of time). 5042 * 5043 * A better way I realized is to look at what the maximum 5044 * the connection could possibly do. This is gated on 5045 * the lowest RTT we have seen and the highest rwnd. 5046 * We should in theory never exceed that, if we are 5047 * then something on the path is storing up packets 5048 * and then feeding them all at once to our endpoint 5049 * messing up our measurement. 5050 */ 5051 rack->r_ctl.last_max_bw = rack->r_ctl.rc_gp_high_rwnd; 5052 rack->r_ctl.last_max_bw *= HPTS_USEC_IN_SEC; 5053 rack->r_ctl.last_max_bw /= rack->r_ctl.rc_gp_lowrtt; 5054 if (SEQ_LT(th_ack, tp->gput_seq)) { 5055 /* No measurement can be made */ 5056 bytes = 0; 5057 bytes_ps = 0; 5058 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 5059 0, 0, 0, 10, __LINE__, NULL, quality); 5060 goto skip_measurement; 5061 } else 5062 bytes = (th_ack - tp->gput_seq); 5063 bytes_ps = (uint64_t)bytes; 5064 /* 5065 * Don't measure a b/w for pacing unless we have gotten at least 5066 * an initial windows worth of data in this measurement interval. 5067 * 5068 * Small numbers of bytes get badly influenced by delayed ack and 5069 * other artifacts. Note we take the initial window or our 5070 * defined minimum GP (defaulting to 10 which hopefully is the 5071 * IW). 5072 */ 5073 if (rack->rc_gp_filled == 0) { 5074 /* 5075 * The initial estimate is special. We 5076 * have blasted out an IW worth of packets 5077 * without a real valid ack ts results. We 5078 * then setup the app_limited_needs_set flag, 5079 * this should get the first ack in (probably 2 5080 * MSS worth) to be recorded as the timestamp. 5081 * We thus allow a smaller number of bytes i.e. 5082 * IW - 2MSS. 5083 */ 5084 reqbytes -= (2 * segsiz); 5085 /* Also lets fill previous for our first measurement to be neutral */ 5086 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; 5087 } 5088 if ((bytes_ps < reqbytes) || rack->app_limited_needs_set) { 5089 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 5090 rack->r_ctl.rc_app_limited_cnt, 5091 0, 0, 10, __LINE__, NULL, quality); 5092 goto skip_measurement; 5093 } 5094 /* 5095 * We now need to calculate the Timely like status so 5096 * we can update (possibly) the b/w multipliers. 
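 * The rtt-diff below is kept as a running estimate: outside of probe-rtt,
 * each measurement replaces one eighth of the old rc_rtt_diff with one
 * eighth of the new gp_srtt minus prev_gp_srtt delta, which keeps a single
 * noisy sample from swinging the timely decision.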
5097 */ 5098 new_rtt_diff = (int32_t)rack->r_ctl.rc_gp_srtt - (int32_t)rack->r_ctl.rc_prev_gp_srtt; 5099 if (rack->rc_gp_filled == 0) { 5100 /* No previous reading */ 5101 rack->r_ctl.rc_rtt_diff = new_rtt_diff; 5102 } else { 5103 if (rack->measure_saw_probe_rtt == 0) { 5104 /* 5105 * We don't want a probertt to be counted 5106 * since it will be negative incorrectly. We 5107 * expect to be reducing the RTT when we 5108 * pace at a slower rate. 5109 */ 5110 rack->r_ctl.rc_rtt_diff -= (rack->r_ctl.rc_rtt_diff / 8); 5111 rack->r_ctl.rc_rtt_diff += (new_rtt_diff / 8); 5112 } 5113 } 5114 timely_says = rack_make_timely_judgement(rack, 5115 rack->r_ctl.rc_gp_srtt, 5116 rack->r_ctl.rc_rtt_diff, 5117 rack->r_ctl.rc_prev_gp_srtt 5118 ); 5119 bytes_ps *= HPTS_USEC_IN_SEC; 5120 bytes_ps /= utim; 5121 if (bytes_ps > rack->r_ctl.last_max_bw) { 5122 /* 5123 * Something is on path playing 5124 * since this b/w is not possible based 5125 * on our BDP (highest rwnd and lowest rtt 5126 * we saw in the measurement window). 5127 * 5128 * Another option here would be to 5129 * instead skip the measurement. 5130 */ 5131 rack_log_pacing_delay_calc(rack, bytes, reqbytes, 5132 bytes_ps, rack->r_ctl.last_max_bw, 0, 5133 11, __LINE__, NULL, quality); 5134 bytes_ps = rack->r_ctl.last_max_bw; 5135 } 5136 /* We store gp for b/w in bytes per second */ 5137 if (rack->rc_gp_filled == 0) { 5138 /* Initial measurement */ 5139 if (bytes_ps) { 5140 rack->r_ctl.gp_bw = bytes_ps; 5141 rack->rc_gp_filled = 1; 5142 rack->r_ctl.num_measurements = 1; 5143 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 5144 } else { 5145 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 5146 rack->r_ctl.rc_app_limited_cnt, 5147 0, 0, 10, __LINE__, NULL, quality); 5148 } 5149 if (tcp_in_hpts(rack->rc_tp) && 5150 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 5151 /* 5152 * Ok we can't trust the pacer in this case 5153 * where we transition from un-paced to paced. 5154 * Or for that matter when the burst mitigation 5155 * was making a wild guess and got it wrong. 5156 * Stop the pacer and clear up all the aggregate 5157 * delays etc. 5158 */ 5159 tcp_hpts_remove(rack->rc_tp); 5160 rack->r_ctl.rc_hpts_flags = 0; 5161 rack->r_ctl.rc_last_output_to = 0; 5162 } 5163 did_add = 2; 5164 } else if (rack->r_ctl.num_measurements < RACK_REQ_AVG) { 5165 /* Still a small number run an average */ 5166 rack->r_ctl.gp_bw += bytes_ps; 5167 addpart = rack->r_ctl.num_measurements; 5168 rack->r_ctl.num_measurements++; 5169 if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) { 5170 /* We have collected enough to move forward */ 5171 rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_measurements; 5172 } 5173 rack_set_pace_segments(tp, rack, __LINE__, NULL); 5174 did_add = 3; 5175 } else { 5176 /* 5177 * We want to take 1/wma of the goodput and add in to 7/8th 5178 * of the old value weighted by the srtt. So if your measurement 5179 * period is say 2 SRTT's long you would get 1/4 as the 5180 * value, if it was like 1/2 SRTT then you would get 1/16th. 5181 * 5182 * But we must be careful not to take too much i.e. if the 5183 * srtt is say 20ms and the measurement is taken over 5184 * 400ms our weight would be 400/20 i.e. 20. On the 5185 * other hand if we get a measurement over 1ms with a 5186 * 10ms rtt we only want to take a much smaller portion. 
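 * A numeric sketch for the dynamic path below, taking rack_wma_divisor as
 * 8: a 400 ms measurement against a 20 ms srtt has utim well past one
 * srtt, so the sample simply contributes its 1/8th; a 1 ms measurement
 * against a 10 ms srtt takes the scaled branch and is weighted by utim
 * over (srtt * 8), roughly 1/80th of the WMA.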
5187 */ 5188 uint8_t meth; 5189 5190 if (rack->r_ctl.num_measurements < 0xff) { 5191 rack->r_ctl.num_measurements++; 5192 } 5193 srtt = (uint64_t)tp->t_srtt; 5194 if (srtt == 0) { 5195 /* 5196 * Strange why did t_srtt go back to zero? 5197 */ 5198 if (rack->r_ctl.rc_rack_min_rtt) 5199 srtt = rack->r_ctl.rc_rack_min_rtt; 5200 else 5201 srtt = HPTS_USEC_IN_MSEC; 5202 } 5203 /* 5204 * XXXrrs: Note for reviewers, in playing with 5205 * dynamic pacing I discovered this GP calculation 5206 * as done originally leads to some undesired results. 5207 * Basically you can get longer measurements contributing 5208 * too much to the WMA. Thus I changed it if you are doing 5209 * dynamic adjustments to only do the aportioned adjustment 5210 * if we have a very small (time wise) measurement. Longer 5211 * measurements just get there weight (defaulting to 1/8) 5212 * add to the WMA. We may want to think about changing 5213 * this to always do that for both sides i.e. dynamic 5214 * and non-dynamic... but considering lots of folks 5215 * were playing with this I did not want to change the 5216 * calculation per.se. without your thoughts.. Lawerence? 5217 * Peter?? 5218 */ 5219 if (rack->rc_gp_dyn_mul == 0) { 5220 subpart = rack->r_ctl.gp_bw * utim; 5221 subpart /= (srtt * 8); 5222 if (subpart < (rack->r_ctl.gp_bw / 2)) { 5223 /* 5224 * The b/w update takes no more 5225 * away then 1/2 our running total 5226 * so factor it in. 5227 */ 5228 addpart = bytes_ps * utim; 5229 addpart /= (srtt * 8); 5230 meth = 1; 5231 } else { 5232 /* 5233 * Don't allow a single measurement 5234 * to account for more than 1/2 of the 5235 * WMA. This could happen on a retransmission 5236 * where utim becomes huge compared to 5237 * srtt (multiple retransmissions when using 5238 * the sending rate which factors in all the 5239 * transmissions from the first one). 5240 */ 5241 subpart = rack->r_ctl.gp_bw / 2; 5242 addpart = bytes_ps / 2; 5243 meth = 2; 5244 } 5245 rack_log_gp_calc(rack, addpart, subpart, srtt, bytes_ps, utim, meth, __LINE__); 5246 resid_bw = rack->r_ctl.gp_bw - subpart; 5247 rack->r_ctl.gp_bw = resid_bw + addpart; 5248 did_add = 1; 5249 } else { 5250 if ((utim / srtt) <= 1) { 5251 /* 5252 * The b/w update was over a small period 5253 * of time. The idea here is to prevent a small 5254 * measurement time period from counting 5255 * too much. So we scale it based on the 5256 * time so it attributes less than 1/rack_wma_divisor 5257 * of its measurement. 5258 */ 5259 subpart = rack->r_ctl.gp_bw * utim; 5260 subpart /= (srtt * rack_wma_divisor); 5261 addpart = bytes_ps * utim; 5262 addpart /= (srtt * rack_wma_divisor); 5263 meth = 3; 5264 } else { 5265 /* 5266 * The scaled measurement was long 5267 * enough so lets just add in the 5268 * portion of the measurement i.e. 1/rack_wma_divisor 5269 */ 5270 subpart = rack->r_ctl.gp_bw / rack_wma_divisor; 5271 addpart = bytes_ps / rack_wma_divisor; 5272 meth = 4; 5273 } 5274 if ((rack->measure_saw_probe_rtt == 0) || 5275 (bytes_ps > rack->r_ctl.gp_bw)) { 5276 /* 5277 * For probe-rtt we only add it in 5278 * if its larger, all others we just 5279 * add in. 5280 */ 5281 did_add = 1; 5282 rack_log_gp_calc(rack, addpart, subpart, srtt, bytes_ps, utim, meth, __LINE__); 5283 resid_bw = rack->r_ctl.gp_bw - subpart; 5284 rack->r_ctl.gp_bw = resid_bw + addpart; 5285 } 5286 } 5287 rack_set_pace_segments(tp, rack, __LINE__, NULL); 5288 } 5289 /* 5290 * We only watch the growth of the GP during the initial startup 5291 * or first-slowstart that ensues. 
If we ever needed to watch 5292 * growth of gp outside of that period all we need to do is 5293 * remove the first clause of this if (rc_initial_ss_comp). 5294 */ 5295 if ((rack->rc_initial_ss_comp == 0) && 5296 (rack->r_ctl.num_measurements >= RACK_REQ_AVG)) { 5297 uint64_t gp_est; 5298 5299 gp_est = bytes_ps; 5300 if (tcp_bblogging_on(rack->rc_tp)) { 5301 union tcp_log_stackspecific log; 5302 struct timeval tv; 5303 5304 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 5305 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 5306 log.u_bbr.flex1 = rack->r_ctl.current_round; 5307 log.u_bbr.flex2 = rack->r_ctl.last_rnd_of_gp_rise; 5308 log.u_bbr.delRate = gp_est; 5309 log.u_bbr.cur_del_rate = rack->r_ctl.last_gpest; 5310 log.u_bbr.flex8 = 41; 5311 (void)tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 5312 0, &log, false, NULL, __func__, __LINE__,&tv); 5313 } 5314 if ((rack->r_ctl.num_measurements == RACK_REQ_AVG) || 5315 (rack->r_ctl.last_gpest == 0)) { 5316 /* 5317 * The round we get our measurement averaging going 5318 * is the base round so it always is the source point 5319 * for when we had our first increment. From there on 5320 * we only record the round that had a rise. 5321 */ 5322 rack->r_ctl.last_rnd_of_gp_rise = rack->r_ctl.current_round; 5323 rack->r_ctl.last_gpest = rack->r_ctl.gp_bw; 5324 } else if (gp_est >= rack->r_ctl.last_gpest) { 5325 /* 5326 * Test to see if its gone up enough 5327 * to set the round count up to now. Note 5328 * that on the seeding of the 4th measurement we 5329 */ 5330 gp_est *= 1000; 5331 gp_est /= rack->r_ctl.last_gpest; 5332 if ((uint32_t)gp_est > rack->r_ctl.gp_gain_req) { 5333 /* 5334 * We went up enough to record the round. 5335 */ 5336 if (tcp_bblogging_on(rack->rc_tp)) { 5337 union tcp_log_stackspecific log; 5338 struct timeval tv; 5339 5340 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 5341 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 5342 log.u_bbr.flex1 = rack->r_ctl.current_round; 5343 log.u_bbr.flex2 = (uint32_t)gp_est; 5344 log.u_bbr.flex3 = rack->r_ctl.gp_gain_req; 5345 log.u_bbr.delRate = gp_est; 5346 log.u_bbr.cur_del_rate = rack->r_ctl.last_gpest; 5347 log.u_bbr.flex8 = 42; 5348 (void)tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 5349 0, &log, false, NULL, __func__, __LINE__,&tv); 5350 } 5351 rack->r_ctl.last_rnd_of_gp_rise = rack->r_ctl.current_round; 5352 if (rack->r_ctl.use_gp_not_last == 1) 5353 rack->r_ctl.last_gpest = rack->r_ctl.gp_bw; 5354 else 5355 rack->r_ctl.last_gpest = bytes_ps; 5356 } 5357 } 5358 } 5359 if ((rack->gp_ready == 0) && 5360 (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) { 5361 /* We have enough measurements now */ 5362 rack->gp_ready = 1; 5363 if (rack->dgp_on || 5364 rack->rack_hibeta) 5365 rack_set_cc_pacing(rack); 5366 if (rack->defer_options) 5367 rack_apply_deferred_options(rack); 5368 } 5369 rack_log_pacing_delay_calc(rack, subpart, addpart, bytes_ps, stim, 5370 rack_get_bw(rack), 22, did_add, NULL, quality); 5371 /* We do not update any multipliers if we are in or have seen a probe-rtt */ 5372 5373 if ((rack->measure_saw_probe_rtt == 0) && 5374 rack->rc_gp_rtt_set) { 5375 if (rack->rc_skip_timely == 0) { 5376 rack_update_multiplier(rack, timely_says, bytes_ps, 5377 rack->r_ctl.rc_gp_srtt, 5378 rack->r_ctl.rc_rtt_diff); 5379 } 5380 } 5381 rack_log_pacing_delay_calc(rack, bytes, tim, bytes_ps, stim, 5382 rack_get_bw(rack), 3, line, NULL, quality); 5383 rack_log_pacing_delay_calc(rack, 5384 bytes, /* flex2 */ 5385 tim, /* flex1 */ 5386 bytes_ps, /* bw_inuse */ 5387 rack->r_ctl.gp_bw, /* delRate */ 
5388 rack_get_lt_bw(rack), /* rttProp */ 5389 20, line, NULL, 0); 5390 /* reset the gp srtt and setup the new prev */ 5391 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; 5392 /* Record the lost count for the next measurement */ 5393 rack->r_ctl.rc_loss_at_start = rack->r_ctl.rc_loss_count; 5394 skip_measurement: 5395 /* 5396 * We restart our diffs based on the gpsrtt in the 5397 * measurement window. 5398 */ 5399 rack->rc_gp_rtt_set = 0; 5400 rack->rc_gp_saw_rec = 0; 5401 rack->rc_gp_saw_ca = 0; 5402 rack->rc_gp_saw_ss = 0; 5403 rack->rc_dragged_bottom = 0; 5404 if (quality == RACK_QUALITY_HIGH) { 5405 /* 5406 * Gput in the stats world is in kbps where bytes_ps is 5407 * bytes per second so we do ((x * 8)/ 1000). 5408 */ 5409 gput = (int32_t)((bytes_ps << 3) / (uint64_t)1000); 5410 #ifdef STATS 5411 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, 5412 gput); 5413 /* 5414 * XXXLAS: This is a temporary hack, and should be 5415 * chained off VOI_TCP_GPUT when stats(9) grows an 5416 * API to deal with chained VOIs. 5417 */ 5418 if (tp->t_stats_gput_prev > 0) 5419 stats_voi_update_abs_s32(tp->t_stats, 5420 VOI_TCP_GPUT_ND, 5421 ((gput - tp->t_stats_gput_prev) * 100) / 5422 tp->t_stats_gput_prev); 5423 #endif 5424 tp->t_stats_gput_prev = gput; 5425 } 5426 tp->t_flags &= ~TF_GPUTINPROG; 5427 /* 5428 * Now are we app limited now and there is space from where we 5429 * were to where we want to go? 5430 * 5431 * We don't do the other case i.e. non-applimited here since 5432 * the next send will trigger us picking up the missing data. 5433 */ 5434 if (rack->r_ctl.rc_first_appl && 5435 TCPS_HAVEESTABLISHED(tp->t_state) && 5436 rack->r_ctl.rc_app_limited_cnt && 5437 (SEQ_GT(rack->r_ctl.rc_first_appl->r_start, th_ack)) && 5438 ((rack->r_ctl.rc_first_appl->r_end - th_ack) > 5439 max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { 5440 /* 5441 * Yep there is enough outstanding to make a measurement here. 5442 */ 5443 struct rack_sendmap *rsm; 5444 5445 rack->r_ctl.rc_gp_lowrtt = 0xffffffff; 5446 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 5447 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 5448 rack->app_limited_needs_set = 0; 5449 tp->gput_seq = th_ack; 5450 if (rack->in_probe_rtt) 5451 rack->measure_saw_probe_rtt = 1; 5452 else if ((rack->measure_saw_probe_rtt) && 5453 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 5454 rack->measure_saw_probe_rtt = 0; 5455 if ((rack->r_ctl.rc_first_appl->r_end - th_ack) >= rack_get_measure_window(tp, rack)) { 5456 /* There is a full window to gain info from */ 5457 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 5458 } else { 5459 /* We can only measure up to the applimited point */ 5460 tp->gput_ack = tp->gput_seq + (rack->r_ctl.rc_first_appl->r_end - th_ack); 5461 if ((tp->gput_ack - tp->gput_seq) < (MIN_GP_WIN * segsiz)) { 5462 /* 5463 * We don't have enough to make a measurement. 5464 */ 5465 tp->t_flags &= ~TF_GPUTINPROG; 5466 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq, 5467 0, 0, 0, 6, __LINE__, NULL, quality); 5468 return; 5469 } 5470 } 5471 if (tp->t_state >= TCPS_FIN_WAIT_1) { 5472 /* 5473 * We will get no more data into the SB 5474 * this means we need to have the data available 5475 * before we start a measurement. 5476 */ 5477 if (sbavail(&tptosocket(tp)->so_snd) < (tp->gput_ack - tp->gput_seq)) { 5478 /* Nope not enough data. 
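 * (Past FIN_WAIT_1 the application can queue nothing new, so if the socket
 * buffer cannot already cover gput_ack minus gput_seq the measurement
 * window would never fill; better to abandon it before it starts.)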
*/ 5479 return; 5480 } 5481 } 5482 tp->t_flags |= TF_GPUTINPROG; 5483 /* 5484 * Now we need to find the timestamp of the send at tp->gput_seq 5485 * for the send based measurement. 5486 */ 5487 rack->r_ctl.rc_gp_cumack_ts = 0; 5488 rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq); 5489 if (rsm) { 5490 /* Ok send-based limit is set */ 5491 if (SEQ_LT(rsm->r_start, tp->gput_seq)) { 5492 /* 5493 * Move back to include the earlier part 5494 * so our ack time lines up right (this may 5495 * make an overlapping measurement but thats 5496 * ok). 5497 */ 5498 tp->gput_seq = rsm->r_start; 5499 } 5500 if (rsm->r_flags & RACK_ACKED) { 5501 struct rack_sendmap *nrsm; 5502 5503 tp->gput_ts = (uint32_t)rsm->r_ack_arrival; 5504 tp->gput_seq = rsm->r_end; 5505 nrsm = tqhash_next(rack->r_ctl.tqh, rsm); 5506 if (nrsm) 5507 rsm = nrsm; 5508 else { 5509 rack->app_limited_needs_set = 1; 5510 } 5511 } else 5512 rack->app_limited_needs_set = 1; 5513 /* We always go from the first send */ 5514 rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[0]; 5515 } else { 5516 /* 5517 * If we don't find the rsm due to some 5518 * send-limit set the current time, which 5519 * basically disables the send-limit. 5520 */ 5521 struct timeval tv; 5522 5523 microuptime(&tv); 5524 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); 5525 } 5526 rack_tend_gp_marks(tp, rack); 5527 rack_log_pacing_delay_calc(rack, 5528 tp->gput_seq, 5529 tp->gput_ack, 5530 (uint64_t)rsm, 5531 tp->gput_ts, 5532 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts), 5533 9, 5534 __LINE__, rsm, quality); 5535 rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL); 5536 } else { 5537 /* 5538 * To make sure proper timestamp merging occurs, we need to clear 5539 * all GP marks if we don't start a measurement. 5540 */ 5541 rack_clear_gp_marks(tp, rack); 5542 } 5543 } 5544 5545 /* 5546 * CC wrapper hook functions 5547 */ 5548 static void 5549 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint16_t nsegs, 5550 uint16_t type, int32_t post_recovery) 5551 { 5552 uint32_t prior_cwnd, acked; 5553 struct tcp_log_buffer *lgb = NULL; 5554 uint8_t labc_to_use, quality; 5555 5556 INP_WLOCK_ASSERT(tptoinpcb(tp)); 5557 tp->t_ccv.nsegs = nsegs; 5558 acked = tp->t_ccv.bytes_this_ack = (th_ack - tp->snd_una); 5559 if ((post_recovery) && (rack->r_ctl.rc_early_recovery_segs)) { 5560 uint32_t max; 5561 5562 max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp); 5563 if (tp->t_ccv.bytes_this_ack > max) { 5564 tp->t_ccv.bytes_this_ack = max; 5565 } 5566 } 5567 #ifdef STATS 5568 stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, 5569 ((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd); 5570 #endif 5571 if ((th_ack == tp->snd_max) && rack->lt_bw_up) { 5572 /* 5573 * We will ack all the data, time to end any 5574 * lt_bw_up we have running until something 5575 * new is sent. Note we need to use the actual 5576 * ack_rcv_time which with pacing may be different. 
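 * A sketch of the bookkeeping below: lt_bw_bytes picks up everything from
 * lt_seq to snd_max, lt_bw_time picks up the time elapsed since
 * lt_timemark, and the long-term b/w estimate is later formed from the
 * ratio of those two accumulators.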
5577 */ 5578 uint64_t tmark; 5579 5580 rack->r_ctl.lt_bw_bytes += (tp->snd_max - rack->r_ctl.lt_seq); 5581 rack->r_ctl.lt_seq = tp->snd_max; 5582 tmark = tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time); 5583 if (tmark >= rack->r_ctl.lt_timemark) { 5584 rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark); 5585 } 5586 rack->r_ctl.lt_timemark = tmark; 5587 rack->lt_bw_up = 0; 5588 } 5589 quality = RACK_QUALITY_NONE; 5590 if ((tp->t_flags & TF_GPUTINPROG) && 5591 rack_enough_for_measurement(tp, rack, th_ack, &quality)) { 5592 /* Measure the Goodput */ 5593 rack_do_goodput_measurement(tp, rack, th_ack, __LINE__, quality); 5594 } 5595 /* Which way our we limited, if not cwnd limited no advance in CA */ 5596 if (tp->snd_cwnd <= tp->snd_wnd) 5597 tp->t_ccv.flags |= CCF_CWND_LIMITED; 5598 else 5599 tp->t_ccv.flags &= ~CCF_CWND_LIMITED; 5600 if (tp->snd_cwnd > tp->snd_ssthresh) { 5601 tp->t_bytes_acked += min(tp->t_ccv.bytes_this_ack, 5602 nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp)); 5603 /* For the setting of a window past use the actual scwnd we are using */ 5604 if (tp->t_bytes_acked >= rack->r_ctl.cwnd_to_use) { 5605 tp->t_bytes_acked -= rack->r_ctl.cwnd_to_use; 5606 tp->t_ccv.flags |= CCF_ABC_SENTAWND; 5607 } 5608 } else { 5609 tp->t_ccv.flags &= ~CCF_ABC_SENTAWND; 5610 tp->t_bytes_acked = 0; 5611 } 5612 prior_cwnd = tp->snd_cwnd; 5613 if ((post_recovery == 0) || (rack_max_abc_post_recovery == 0) || rack->r_use_labc_for_rec || 5614 (rack_client_low_buf && rack->client_bufferlvl && 5615 (rack->client_bufferlvl < rack_client_low_buf))) 5616 labc_to_use = rack->rc_labc; 5617 else 5618 labc_to_use = rack_max_abc_post_recovery; 5619 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 5620 union tcp_log_stackspecific log; 5621 struct timeval tv; 5622 5623 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 5624 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 5625 log.u_bbr.flex1 = th_ack; 5626 log.u_bbr.flex2 = tp->t_ccv.flags; 5627 log.u_bbr.flex3 = tp->t_ccv.bytes_this_ack; 5628 log.u_bbr.flex4 = tp->t_ccv.nsegs; 5629 log.u_bbr.flex5 = labc_to_use; 5630 log.u_bbr.flex6 = prior_cwnd; 5631 log.u_bbr.flex7 = V_tcp_do_newsack; 5632 log.u_bbr.flex8 = 1; 5633 lgb = tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 5634 0, &log, false, NULL, __func__, __LINE__,&tv); 5635 } 5636 if (CC_ALGO(tp)->ack_received != NULL) { 5637 /* XXXLAS: Find a way to live without this */ 5638 tp->t_ccv.curack = th_ack; 5639 tp->t_ccv.labc = labc_to_use; 5640 tp->t_ccv.flags |= CCF_USE_LOCAL_ABC; 5641 CC_ALGO(tp)->ack_received(&tp->t_ccv, type); 5642 } 5643 if (lgb) { 5644 lgb->tlb_stackinfo.u_bbr.flex6 = tp->snd_cwnd; 5645 } 5646 if (rack->r_must_retran) { 5647 if (SEQ_GEQ(th_ack, rack->r_ctl.rc_snd_max_at_rto)) { 5648 /* 5649 * We now are beyond the rxt point so lets disable 5650 * the flag. 5651 */ 5652 rack->r_ctl.rc_out_at_rto = 0; 5653 rack->r_must_retran = 0; 5654 } else if ((prior_cwnd + ctf_fixed_maxseg(tp)) <= tp->snd_cwnd) { 5655 /* 5656 * Only decrement the rc_out_at_rto if the cwnd advances 5657 * at least a whole segment. Otherwise next time the peer 5658 * acks, we won't be able to send this generaly happens 5659 * when we are in Congestion Avoidance. 
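 * (Concretely: the acked bytes drain rc_out_at_rto only when the cwnd grew
 * by at least one full segment on this ack; r_must_retran itself is only
 * cleared in the branch above, once th_ack reaches rc_snd_max_at_rto.)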
5660 */ 5661 if (acked <= rack->r_ctl.rc_out_at_rto){ 5662 rack->r_ctl.rc_out_at_rto -= acked; 5663 } else { 5664 rack->r_ctl.rc_out_at_rto = 0; 5665 } 5666 } 5667 } 5668 #ifdef STATS 5669 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, rack->r_ctl.cwnd_to_use); 5670 #endif 5671 if (rack->r_ctl.rc_rack_largest_cwnd < rack->r_ctl.cwnd_to_use) { 5672 rack->r_ctl.rc_rack_largest_cwnd = rack->r_ctl.cwnd_to_use; 5673 } 5674 if ((rack->rc_initial_ss_comp == 0) && 5675 (tp->snd_cwnd >= tp->snd_ssthresh)) { 5676 /* 5677 * The cwnd has grown beyond ssthresh we have 5678 * entered ca and completed our first Slowstart. 5679 */ 5680 rack->rc_initial_ss_comp = 1; 5681 } 5682 } 5683 5684 static void 5685 tcp_rack_partialack(struct tcpcb *tp) 5686 { 5687 struct tcp_rack *rack; 5688 5689 rack = (struct tcp_rack *)tp->t_fb_ptr; 5690 INP_WLOCK_ASSERT(tptoinpcb(tp)); 5691 /* 5692 * If we are doing PRR and have enough 5693 * room to send <or> we are pacing and prr 5694 * is disabled we will want to see if we 5695 * can send data (by setting r_wanted_output to 5696 * true). 5697 */ 5698 if ((rack->r_ctl.rc_prr_sndcnt > 0) || 5699 rack->rack_no_prr) 5700 rack->r_wanted_output = 1; 5701 } 5702 5703 static inline uint64_t 5704 rack_get_rxt_per(uint64_t snds, uint64_t rxts) 5705 { 5706 uint64_t rxt_per; 5707 5708 if (snds > 0) { 5709 rxt_per = rxts * 1000; 5710 rxt_per /= snds; 5711 } else { 5712 /* This is an unlikely path */ 5713 if (rxts) { 5714 /* Its the max it was all re-transmits */ 5715 rxt_per = 0xffffffffffffffff; 5716 } else { 5717 rxt_per = 0; 5718 } 5719 } 5720 return (rxt_per); 5721 } 5722 5723 static void 5724 policer_detection_log(struct tcp_rack *rack, uint32_t flex1, uint32_t flex2, uint32_t flex3, uint32_t flex4, uint8_t flex8) 5725 { 5726 if (tcp_bblogging_on(rack->rc_tp)) { 5727 union tcp_log_stackspecific log; 5728 struct timeval tv; 5729 5730 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 5731 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 5732 log.u_bbr.flex1 = flex1; 5733 log.u_bbr.flex2 = flex2; 5734 log.u_bbr.flex3 = flex3; 5735 log.u_bbr.flex4 = flex4; 5736 log.u_bbr.flex5 = rack->r_ctl.current_policer_bucket; 5737 log.u_bbr.flex6 = rack->r_ctl.policer_bucket_size; 5738 log.u_bbr.flex7 = 0; 5739 log.u_bbr.flex8 = flex8; 5740 log.u_bbr.bw_inuse = rack->r_ctl.policer_bw; 5741 log.u_bbr.applimited = rack->r_ctl.current_round; 5742 log.u_bbr.epoch = rack->r_ctl.policer_max_seg; 5743 log.u_bbr.delivered = (uint32_t)rack->r_ctl.bytes_acked_in_recovery; 5744 log.u_bbr.cur_del_rate = rack->rc_tp->t_sndbytes; 5745 log.u_bbr.delRate = rack->rc_tp->t_snd_rxt_bytes; 5746 log.u_bbr.rttProp = rack->r_ctl.gp_bw; 5747 log.u_bbr.bbr_state = rack->rc_policer_detected; 5748 log.u_bbr.bbr_substate = 0; 5749 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 5750 log.u_bbr.use_lt_bw = rack->policer_detect_on; 5751 log.u_bbr.lt_epoch = 0; 5752 log.u_bbr.pkts_out = 0; 5753 tcp_log_event(rack->rc_tp, NULL, NULL, NULL, TCP_POLICER_DET, 0, 5754 0, &log, false, NULL, NULL, 0, &tv); 5755 } 5756 5757 } 5758 5759 static void 5760 policer_detection(struct tcpcb *tp, struct tcp_rack *rack, int post_recovery) 5761 { 5762 /* 5763 * Rack excess rxt accounting is turned on. If we 5764 * are above a threshold of rxt's in at least N 5765 * rounds, then back off the cwnd and ssthresh 5766 * to fit into the long-term b/w. 
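 * A sketch of the gate used further down: the retransmit ratio rxt_per
 * (retransmitted bytes per 1000 sent), the average retransmit count avg
 * (scaled by 10) and the median retransmit count med must all reach
 * policer_rxt_threshold, policer_avg_threshold and policer_med_threshold
 * respectively before rc_policer_detected is set and pacing switches to
 * the policer model.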
5767 */ 5768 5769 uint32_t pkts, mid, med, alt_med, avg, segsiz, tot_retran_pkt_count = 0; 5770 uint32_t cnt_of_mape_rxt = 0; 5771 uint64_t snds, rxts, rxt_per, tim, del, del_bw; 5772 int i; 5773 struct timeval tv; 5774 5775 5776 /* 5777 * First is there enough packets delivered during recovery to make 5778 * a determiniation of b/w? 5779 */ 5780 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 5781 if ((rack->rc_policer_detected == 0) && 5782 (rack->r_ctl.policer_del_mss > 0) && 5783 ((uint32_t)rack->r_ctl.policer_del_mss > ((rack->r_ctl.bytes_acked_in_recovery + segsiz - 1)/segsiz))) { 5784 /* 5785 * Not enough data sent in recovery for initial detection. Once 5786 * we have deteced a policer we allow less than the threshold (polcer_del_mss) 5787 * amount of data in a recovery to let us fall through and double check 5788 * our policer settings and possibly expand or collapse the bucket size and 5789 * the polcier b/w. 5790 * 5791 * Once you are declared to be policed. this block of code cannot be 5792 * reached, instead blocks further down will re-check the policer detection 5793 * triggers and possibly reset the measurements if somehow we have let the 5794 * policer bucket size grow too large. 5795 */ 5796 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 5797 policer_detection_log(rack, rack->r_ctl.policer_del_mss, 5798 ((rack->r_ctl.bytes_acked_in_recovery + segsiz - 1)/segsiz), 5799 rack->r_ctl.bytes_acked_in_recovery, segsiz, 18); 5800 } 5801 return; 5802 } 5803 tcp_get_usecs(&tv); 5804 tim = tcp_tv_to_lusectick(&tv) - rack->r_ctl.time_entered_recovery; 5805 del = rack->r_ctl.bytes_acked_in_recovery; 5806 if (tim > 0) 5807 del_bw = (del * (uint64_t)1000000) / tim; 5808 else 5809 del_bw = 0; 5810 /* B/W compensation? */ 5811 5812 if (rack->r_ctl.pol_bw_comp && ((rack->r_ctl.policer_bw > 0) || 5813 (del_bw > 0))) { 5814 /* 5815 * Sanity check now that the data is in. How long does it 5816 * take for us to pace out two of our policer_max_seg's? 5817 * 5818 * If it is longer than the RTT then we are set 5819 * too slow, maybe because of not enough data 5820 * sent during recovery. 5821 */ 5822 uint64_t lentime, res, srtt, max_delbw, alt_bw; 5823 5824 srtt = (uint64_t)rack_grab_rtt(tp, rack); 5825 if ((tp->t_srtt > 0) && (srtt > tp->t_srtt)) 5826 srtt = tp->t_srtt; 5827 lentime = rack->r_ctl.policer_max_seg * (uint64_t)HPTS_USEC_IN_SEC * 2; 5828 if (del_bw > rack->r_ctl.policer_bw) { 5829 max_delbw = del_bw; 5830 } else { 5831 max_delbw = rack->r_ctl.policer_bw; 5832 } 5833 res = lentime / max_delbw; 5834 if ((srtt > 0) && (res > srtt)) { 5835 /* 5836 * At this rate we can not get two policer_maxsegs 5837 * out before the ack arrives back. 5838 * 5839 * Lets at least get it raised up so that 5840 * we can be a bit faster than that if possible. 5841 */ 5842 lentime = (rack->r_ctl.policer_max_seg * 2); 5843 tim = srtt; 5844 alt_bw = (lentime * (uint64_t)HPTS_USEC_IN_SEC) / tim; 5845 if (alt_bw > max_delbw) { 5846 uint64_t cap_alt_bw; 5847 5848 cap_alt_bw = (max_delbw + (max_delbw * rack->r_ctl.pol_bw_comp)); 5849 if ((rack_pol_min_bw > 0) && (cap_alt_bw < rack_pol_min_bw)) { 5850 /* We place a min on the cap which defaults to 1Mbps */ 5851 cap_alt_bw = rack_pol_min_bw; 5852 } 5853 if (alt_bw <= cap_alt_bw) { 5854 /* It should be */ 5855 del_bw = alt_bw; 5856 policer_detection_log(rack, 5857 (uint32_t)tim, 5858 rack->r_ctl.policer_max_seg, 5859 0, 5860 0, 5861 16); 5862 } else { 5863 /* 5864 * This is an odd case where likely the RTT is very very 5865 * low. 
And yet it is still being policed. We don't want 5866 * to get more than (rack_policing_do_bw_comp+1) x del-rate 5867 * where del-rate is what we got in recovery for either the 5868 * first Policer Detection(PD) or this PD we are on now. 5869 */ 5870 del_bw = cap_alt_bw; 5871 policer_detection_log(rack, 5872 (uint32_t)tim, 5873 rack->r_ctl.policer_max_seg, 5874 (uint32_t)max_delbw, 5875 (rack->r_ctl.pol_bw_comp + 1), 5876 16); 5877 } 5878 } 5879 } 5880 } 5881 snds = tp->t_sndbytes - rack->r_ctl.last_policer_sndbytes; 5882 rxts = tp->t_snd_rxt_bytes - rack->r_ctl.last_policer_snd_rxt_bytes; 5883 rxt_per = rack_get_rxt_per(snds, rxts); 5884 /* Figure up the average and median */ 5885 for(i = 0; i < RETRAN_CNT_SIZE; i++) { 5886 if (rack->r_ctl.rc_cnt_of_retran[i] > 0) { 5887 tot_retran_pkt_count += (i + 1) * rack->r_ctl.rc_cnt_of_retran[i]; 5888 cnt_of_mape_rxt += rack->r_ctl.rc_cnt_of_retran[i]; 5889 } 5890 } 5891 if (cnt_of_mape_rxt) 5892 avg = (tot_retran_pkt_count * 10)/cnt_of_mape_rxt; 5893 else 5894 avg = 0; 5895 alt_med = med = 0; 5896 mid = tot_retran_pkt_count/2; 5897 for(i = 0; i < RETRAN_CNT_SIZE; i++) { 5898 pkts = (i + 1) * rack->r_ctl.rc_cnt_of_retran[i]; 5899 if (mid > pkts) { 5900 mid -= pkts; 5901 continue; 5902 } 5903 med = (i + 1); 5904 break; 5905 } 5906 mid = cnt_of_mape_rxt / 2; 5907 for(i = 0; i < RETRAN_CNT_SIZE; i++) { 5908 if (mid > rack->r_ctl.rc_cnt_of_retran[i]) { 5909 mid -= rack->r_ctl.rc_cnt_of_retran[i]; 5910 continue; 5911 } 5912 alt_med = (i + 1); 5913 break; 5914 } 5915 if (rack->r_ctl.policer_alt_median) { 5916 /* Swap the medians */ 5917 uint32_t swap; 5918 5919 swap = med; 5920 med = alt_med; 5921 alt_med = swap; 5922 } 5923 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 5924 union tcp_log_stackspecific log; 5925 struct timeval tv; 5926 5927 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 5928 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 5929 log.u_bbr.flex1 = avg; 5930 log.u_bbr.flex2 = med; 5931 log.u_bbr.flex3 = (uint32_t)rxt_per; 5932 log.u_bbr.flex4 = rack->r_ctl.policer_avg_threshold; 5933 log.u_bbr.flex5 = rack->r_ctl.policer_med_threshold; 5934 log.u_bbr.flex6 = rack->r_ctl.policer_rxt_threshold; 5935 log.u_bbr.flex7 = rack->r_ctl.policer_alt_median; 5936 log.u_bbr.flex8 = 1; 5937 log.u_bbr.delivered = rack->r_ctl.policer_bucket_size; 5938 log.u_bbr.applimited = rack->r_ctl.current_round; 5939 log.u_bbr.epoch = rack->r_ctl.policer_max_seg; 5940 log.u_bbr.bw_inuse = del_bw; 5941 log.u_bbr.cur_del_rate = rxts; 5942 log.u_bbr.delRate = snds; 5943 log.u_bbr.rttProp = rack->r_ctl.gp_bw; 5944 log.u_bbr.bbr_state = rack->rc_policer_detected; 5945 log.u_bbr.bbr_substate = 0; 5946 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 5947 log.u_bbr.use_lt_bw = rack->policer_detect_on; 5948 log.u_bbr.lt_epoch = (uint32_t)tim; 5949 log.u_bbr.pkts_out = rack->r_ctl.bytes_acked_in_recovery; 5950 tcp_log_event(tp, NULL, NULL, NULL, TCP_POLICER_DET, 0, 5951 0, &log, false, NULL, NULL, 0, &tv); 5952 } 5953 if (med == RETRAN_CNT_SIZE) { 5954 /* 5955 * If the median is the maximum, then what we 5956 * likely have here is a network breakage. Either that 5957 * or we are so unlucky that all of our traffic is being 5958 * dropped and having to be retransmitted the maximum times 5959 * and this just is not how a policer works. 5960 * 5961 * If it is truely a policer eventually we will come 5962 * through and it won't be the maximum. 5963 */ 5964 return; 5965 } 5966 /* Has enough rounds progressed for us to re-measure? 
*/ 5967 if ((rxt_per >= (uint64_t)rack->r_ctl.policer_rxt_threshold) && 5968 (avg >= rack->r_ctl.policer_avg_threshold) && 5969 (med >= rack->r_ctl.policer_med_threshold)) { 5970 /* 5971 * We hit all thresholds that indicate we are 5972 * being policed. Now we may be doing this from a rack timeout 5973 * which then means the rest of recovery will hopefully go 5974 * smoother as we pace. At the end of recovery we will 5975 * fall back in here and reset the values using the 5976 * results of the entire recovery episode (we could also 5977 * hit this as we exit recovery as well which means only 5978 * one time in here). 5979 * 5980 * This is done explicitly that if we hit the thresholds 5981 * again in a second recovery we overwrite the values. We do 5982 * that because over time, as we pace the policer_bucket_size may 5983 * continue to grow. This then provides more and more times when 5984 * we are not pacing to the policer rate. This lets us compensate 5985 * for when we hit a false positive and those flows continue to 5986 * increase. However if its a real policer we will then get over its 5987 * limit, over time, again and thus end up back here hitting the 5988 * thresholds again. 5989 * 5990 * The alternative to this is to instead whenever we pace due to 5991 * policing in rack_policed_sending we could add the amount len paced to the 5992 * idle_snd_una value (which decreases the amount in last_amount_before_rec 5993 * since that is always [th_ack - idle_snd_una]). This would then prevent 5994 * the polcier_bucket_size from growing in additional recovery episodes 5995 * Which would then mean false postives would be pretty much stuck 5996 * after things got back to normal (assuming that what caused the 5997 * false positive was a small network outage). 5998 * 5999 */ 6000 tcp_trace_point(rack->rc_tp, TCP_TP_POLICER_DET); 6001 if (rack->rc_policer_detected == 0) { 6002 /* 6003 * Increment the stat that tells us we identified 6004 * a policer only once. Note that if we ever allow 6005 * the flag to be cleared (reverted) then we need 6006 * to adjust this to not do multi-counting. 
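 * (The block below then seeds the model: policer_bw becomes the delivery
 * rate measured during this recovery, policer_max_seg is re-derived from
 * that rate at the connection's pacing segment size, and the bucket size
 * starts as the amount delivered between idle and the start of recovery,
 * raised to two max-segs if that would be too small to send a full burst.)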
6007 */ 6008 counter_u64_add(tcp_policer_detected, 1); 6009 } 6010 rack->r_ctl.last_policer_sndbytes = tp->t_sndbytes; 6011 rack->r_ctl.last_policer_snd_rxt_bytes = tp->t_snd_rxt_bytes; 6012 rack->r_ctl.policer_bw = del_bw; 6013 rack->r_ctl.policer_max_seg = tcp_get_pacing_burst_size_w_divisor(rack->rc_tp, 6014 rack->r_ctl.policer_bw, 6015 min(ctf_fixed_maxseg(rack->rc_tp), 6016 rack->r_ctl.rc_pace_min_segs), 6017 0, NULL, 6018 NULL, rack->r_ctl.pace_len_divisor); 6019 /* Now what about the policer bucket size */ 6020 rack->r_ctl.policer_bucket_size = rack->r_ctl.last_amount_before_rec; 6021 if (rack->r_ctl.policer_bucket_size < rack->r_ctl.policer_max_seg) { 6022 /* We must be able to send our max-seg or else chaos ensues */ 6023 rack->r_ctl.policer_bucket_size = rack->r_ctl.policer_max_seg * 2; 6024 } 6025 if (rack->rc_policer_detected == 0) 6026 rack->r_ctl.current_policer_bucket = 0; 6027 if (tcp_bblogging_on(rack->rc_tp)) { 6028 union tcp_log_stackspecific log; 6029 struct timeval tv; 6030 6031 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 6032 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 6033 log.u_bbr.flex1 = avg; 6034 log.u_bbr.flex2 = med; 6035 log.u_bbr.flex3 = rxt_per; 6036 log.u_bbr.flex4 = rack->r_ctl.policer_avg_threshold; 6037 log.u_bbr.flex5 = rack->r_ctl.policer_med_threshold; 6038 log.u_bbr.flex6 = rack->r_ctl.policer_rxt_threshold; 6039 log.u_bbr.flex7 = rack->r_ctl.policer_alt_median; 6040 log.u_bbr.flex8 = 2; 6041 log.u_bbr.applimited = rack->r_ctl.current_round; 6042 log.u_bbr.bw_inuse = del_bw; 6043 log.u_bbr.delivered = rack->r_ctl.policer_bucket_size; 6044 log.u_bbr.cur_del_rate = rxts; 6045 log.u_bbr.delRate = snds; 6046 log.u_bbr.rttProp = rack->r_ctl.gp_bw; 6047 log.u_bbr.bbr_state = rack->rc_policer_detected; 6048 log.u_bbr.bbr_substate = 0; 6049 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 6050 log.u_bbr.use_lt_bw = rack->policer_detect_on; 6051 log.u_bbr.epoch = rack->r_ctl.policer_max_seg; 6052 log.u_bbr.lt_epoch = (uint32_t)tim; 6053 log.u_bbr.pkts_out = rack->r_ctl.bytes_acked_in_recovery; 6054 tcp_log_event(tp, NULL, NULL, NULL, TCP_POLICER_DET, 0, 6055 0, &log, false, NULL, NULL, 0, &tv); 6056 /* 6057 * Put out an added log, 19, for the sole purpose 6058 * of getting the txt/rxt so that we can benchmark 6059 * in read-bbrlog the ongoing rxt rate after our 6060 * policer invocation in the HYSTART announcments. 6061 */ 6062 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 6063 log.u_bbr.timeStamp = tcp_tv_to_usectick(&tv); 6064 log.u_bbr.flex1 = alt_med; 6065 log.u_bbr.flex8 = 19; 6066 log.u_bbr.cur_del_rate = tp->t_sndbytes; 6067 log.u_bbr.delRate = tp->t_snd_rxt_bytes; 6068 tcp_log_event(tp, NULL, NULL, NULL, TCP_POLICER_DET, 0, 6069 0, &log, false, NULL, NULL, 0, &tv); 6070 } 6071 /* Turn off any fast output, thats ended */ 6072 rack->r_fast_output = 0; 6073 /* Mark the time for credits */ 6074 rack->r_ctl.last_sendtime = tcp_get_u64_usecs(NULL); 6075 if (rack->r_rr_config < 2) { 6076 /* 6077 * We need to be stricter on the RR config so 6078 * the pacing has priority. 6079 */ 6080 rack->r_rr_config = 2; 6081 } 6082 policer_detection_log(rack, 6083 rack->r_ctl.idle_snd_una, 6084 rack->r_ctl.ack_for_idle, 6085 0, 6086 (uint32_t)tim, 6087 14); 6088 rack->rc_policer_detected = 1; 6089 } else if ((rack->rc_policer_detected == 1) && 6090 (post_recovery == 1)) { 6091 /* 6092 * If we are exiting recovery and have already detected 6093 * we need to possibly update the values. 6094 * 6095 * First: Update the idle -> recovery sent value. 
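 * Second: require that at least one srtt of recovery time has elapsed, so
 * the delivered b/w sample actually means something. Third: if the new
 * sample came in higher, raise policer_bw and re-derive policer_max_seg;
 * the estimate is never lowered on this path.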
6096 */ 6097 uint32_t srtt; 6098 6099 if (rack->r_ctl.last_amount_before_rec > rack->r_ctl.policer_bucket_size) { 6100 rack->r_ctl.policer_bucket_size = rack->r_ctl.last_amount_before_rec; 6101 } 6102 srtt = (uint64_t)rack_grab_rtt(tp, rack); 6103 if ((tp->t_srtt > 0) && (srtt > tp->t_srtt)) 6104 srtt = tp->t_srtt; 6105 if ((srtt != 0) && 6106 (tim < (uint64_t)srtt)) { 6107 /* 6108 * Not long enough. 6109 */ 6110 if (rack_verbose_logging) 6111 policer_detection_log(rack, 6112 (uint32_t)tim, 6113 0, 6114 0, 6115 0, 6116 15); 6117 return; 6118 } 6119 /* 6120 * Finally update the b/w if its grown. 6121 */ 6122 if (del_bw > rack->r_ctl.policer_bw) { 6123 rack->r_ctl.policer_bw = del_bw; 6124 rack->r_ctl.policer_max_seg = tcp_get_pacing_burst_size_w_divisor(rack->rc_tp, 6125 rack->r_ctl.policer_bw, 6126 min(ctf_fixed_maxseg(rack->rc_tp), 6127 rack->r_ctl.rc_pace_min_segs), 6128 0, NULL, 6129 NULL, rack->r_ctl.pace_len_divisor); 6130 if (rack->r_ctl.policer_bucket_size < rack->r_ctl.policer_max_seg) { 6131 /* We must be able to send our max-seg or else chaos ensues */ 6132 rack->r_ctl.policer_bucket_size = rack->r_ctl.policer_max_seg * 2; 6133 } 6134 } 6135 policer_detection_log(rack, 6136 rack->r_ctl.idle_snd_una, 6137 rack->r_ctl.ack_for_idle, 6138 0, 6139 (uint32_t)tim, 6140 3); 6141 } 6142 } 6143 6144 static void 6145 rack_exit_recovery(struct tcpcb *tp, struct tcp_rack *rack, int how) 6146 { 6147 /* now check with the policer if on */ 6148 if (rack->policer_detect_on == 1) { 6149 policer_detection(tp, rack, 1); 6150 } 6151 /* 6152 * Now exit recovery, note we must do the idle set after the policer_detection 6153 * to get the amount acked prior to recovery correct. 6154 */ 6155 rack->r_ctl.idle_snd_una = tp->snd_una; 6156 EXIT_RECOVERY(tp->t_flags); 6157 } 6158 6159 static void 6160 rack_post_recovery(struct tcpcb *tp, uint32_t th_ack) 6161 { 6162 struct tcp_rack *rack; 6163 uint32_t orig_cwnd; 6164 6165 orig_cwnd = tp->snd_cwnd; 6166 INP_WLOCK_ASSERT(tptoinpcb(tp)); 6167 rack = (struct tcp_rack *)tp->t_fb_ptr; 6168 /* only alert CC if we alerted when we entered */ 6169 if (CC_ALGO(tp)->post_recovery != NULL) { 6170 tp->t_ccv.curack = th_ack; 6171 CC_ALGO(tp)->post_recovery(&tp->t_ccv); 6172 if (tp->snd_cwnd < tp->snd_ssthresh) { 6173 /* 6174 * Rack has burst control and pacing 6175 * so lets not set this any lower than 6176 * snd_ssthresh per RFC-6582 (option 2). 6177 */ 6178 tp->snd_cwnd = tp->snd_ssthresh; 6179 } 6180 } 6181 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 6182 union tcp_log_stackspecific log; 6183 struct timeval tv; 6184 6185 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 6186 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 6187 log.u_bbr.flex1 = th_ack; 6188 log.u_bbr.flex2 = tp->t_ccv.flags; 6189 log.u_bbr.flex3 = tp->t_ccv.bytes_this_ack; 6190 log.u_bbr.flex4 = tp->t_ccv.nsegs; 6191 log.u_bbr.flex5 = V_tcp_abc_l_var; 6192 log.u_bbr.flex6 = orig_cwnd; 6193 log.u_bbr.flex7 = V_tcp_do_newsack; 6194 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 6195 log.u_bbr.flex8 = 2; 6196 tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 6197 0, &log, false, NULL, __func__, __LINE__, &tv); 6198 } 6199 if ((rack->rack_no_prr == 0) && 6200 (rack->no_prr_addback == 0) && 6201 (rack->r_ctl.rc_prr_sndcnt > 0)) { 6202 /* 6203 * Suck the next prr cnt back into cwnd, but 6204 * only do that if we are not application limited. 
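 * (Not application limited here means the socket buffer still holds at
 * least as much data as is outstanding; in that case the unused PRR credit
 * is added back to cwnd, capped at rack_prr_addbackmax segments, otherwise
 * it is simply discarded when rc_prr_sndcnt is zeroed below.)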
6205 */ 6206 if (ctf_outstanding(tp) <= sbavail(&tptosocket(tp)->so_snd)) { 6207 /* 6208 * We are allowed to add back to the cwnd the amount we did 6209 * not get out if: 6210 * a) no_prr_addback is off. 6211 * b) we are not app limited 6212 * c) we are doing prr 6213 * <and> 6214 * d) it is bounded by rack_prr_addbackmax (if addback is 0, then none). 6215 */ 6216 tp->snd_cwnd += min((ctf_fixed_maxseg(tp) * rack_prr_addbackmax), 6217 rack->r_ctl.rc_prr_sndcnt); 6218 } 6219 rack->r_ctl.rc_prr_sndcnt = 0; 6220 rack_log_to_prr(rack, 1, 0, __LINE__); 6221 } 6222 rack_log_to_prr(rack, 14, orig_cwnd, __LINE__); 6223 tp->snd_recover = tp->snd_una; 6224 if (rack->r_ctl.dsack_persist) { 6225 rack->r_ctl.dsack_persist--; 6226 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) { 6227 rack->r_ctl.num_dsack = 0; 6228 } 6229 rack_log_dsack_event(rack, 1, __LINE__, 0, 0); 6230 } 6231 if (rack->rto_from_rec == 1) { 6232 rack->rto_from_rec = 0; 6233 if (rack->r_ctl.rto_ssthresh > tp->snd_ssthresh) 6234 tp->snd_ssthresh = rack->r_ctl.rto_ssthresh; 6235 } 6236 rack_exit_recovery(tp, rack, 1); 6237 } 6238 6239 static void 6240 rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack, int line) 6241 { 6242 struct tcp_rack *rack; 6243 uint32_t ssthresh_enter, cwnd_enter, in_rec_at_entry, orig_cwnd; 6244 6245 INP_WLOCK_ASSERT(tptoinpcb(tp)); 6246 #ifdef STATS 6247 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type); 6248 #endif 6249 if (IN_RECOVERY(tp->t_flags) == 0) { 6250 in_rec_at_entry = 0; 6251 ssthresh_enter = tp->snd_ssthresh; 6252 cwnd_enter = tp->snd_cwnd; 6253 } else 6254 in_rec_at_entry = 1; 6255 rack = (struct tcp_rack *)tp->t_fb_ptr; 6256 switch (type) { 6257 case CC_NDUPACK: 6258 tp->t_flags &= ~TF_WASFRECOVERY; 6259 tp->t_flags &= ~TF_WASCRECOVERY; 6260 if (!IN_FASTRECOVERY(tp->t_flags)) { 6261 struct rack_sendmap *rsm; 6262 struct timeval tv; 6263 uint32_t segsiz; 6264 6265 /* Check if this is the end of the initial Start-up i.e. initial slow-start */ 6266 if (rack->rc_initial_ss_comp == 0) { 6267 /* Yep it is the end of the initial slowstart */ 6268 rack->rc_initial_ss_comp = 1; 6269 } 6270 microuptime(&tv); 6271 rack->r_ctl.time_entered_recovery = tcp_tv_to_lusectick(&tv); 6272 if (SEQ_GEQ(ack, tp->snd_una)) { 6273 /* 6274 * The ack is above snd_una. Lets see 6275 * if we can establish a postive distance from 6276 * our idle mark. 6277 */ 6278 rack->r_ctl.ack_for_idle = ack; 6279 if (SEQ_GT(ack, rack->r_ctl.idle_snd_una)) { 6280 rack->r_ctl.last_amount_before_rec = ack - rack->r_ctl.idle_snd_una; 6281 } else { 6282 /* No data thru yet */ 6283 rack->r_ctl.last_amount_before_rec = 0; 6284 } 6285 } else if (SEQ_GT(tp->snd_una, rack->r_ctl.idle_snd_una)) { 6286 /* 6287 * The ack is out of order and behind the snd_una. It may 6288 * have contained SACK information which we processed else 6289 * we would have rejected it. 6290 */ 6291 rack->r_ctl.ack_for_idle = tp->snd_una; 6292 rack->r_ctl.last_amount_before_rec = tp->snd_una - rack->r_ctl.idle_snd_una; 6293 } else { 6294 rack->r_ctl.ack_for_idle = ack; 6295 rack->r_ctl.last_amount_before_rec = 0; 6296 } 6297 if (rack->rc_policer_detected) { 6298 /* 6299 * If we are being policed and we have a loss, it 6300 * means our bucket is now empty. This can happen 6301 * where some other flow on the same host sends 6302 * that this connection is not aware of. 
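 * (So the token bucket model is reset to empty, and if the burst we got
 * out since going idle turned out larger than the bucket size we believed
 * in, the bucket is grown to match that burst.)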
6303 */ 6304 rack->r_ctl.current_policer_bucket = 0; 6305 if (rack_verbose_logging) 6306 policer_detection_log(rack, rack->r_ctl.last_amount_before_rec, 0, 0, 0, 4); 6307 if (rack->r_ctl.last_amount_before_rec > rack->r_ctl.policer_bucket_size) { 6308 rack->r_ctl.policer_bucket_size = rack->r_ctl.last_amount_before_rec; 6309 } 6310 } 6311 memset(rack->r_ctl.rc_cnt_of_retran, 0, sizeof(rack->r_ctl.rc_cnt_of_retran)); 6312 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 6313 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 6314 /* 6315 * Go through the outstanding and re-peg 6316 * any that should have been left in the 6317 * retransmit list (on a double recovery). 6318 */ 6319 if (rsm->r_act_rxt_cnt > 0) { 6320 rack_peg_rxt(rack, rsm, segsiz); 6321 } 6322 } 6323 rack->r_ctl.bytes_acked_in_recovery = 0; 6324 rack->r_ctl.rc_prr_delivered = 0; 6325 rack->r_ctl.rc_prr_out = 0; 6326 rack->r_fast_output = 0; 6327 if (rack->rack_no_prr == 0) { 6328 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 6329 rack_log_to_prr(rack, 2, in_rec_at_entry, line); 6330 } 6331 rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una; 6332 tp->snd_recover = tp->snd_max; 6333 if (tp->t_flags2 & TF2_ECN_PERMIT) 6334 tp->t_flags2 |= TF2_ECN_SND_CWR; 6335 } 6336 break; 6337 case CC_ECN: 6338 if (!IN_CONGRECOVERY(tp->t_flags) || 6339 /* 6340 * Allow ECN reaction on ACK to CWR, if 6341 * that data segment was also CE marked. 6342 */ 6343 SEQ_GEQ(ack, tp->snd_recover)) { 6344 EXIT_CONGRECOVERY(tp->t_flags); 6345 KMOD_TCPSTAT_INC(tcps_ecn_rcwnd); 6346 rack->r_fast_output = 0; 6347 tp->snd_recover = tp->snd_max + 1; 6348 if (tp->t_flags2 & TF2_ECN_PERMIT) 6349 tp->t_flags2 |= TF2_ECN_SND_CWR; 6350 } 6351 break; 6352 case CC_RTO: 6353 tp->t_dupacks = 0; 6354 tp->t_bytes_acked = 0; 6355 rack->r_fast_output = 0; 6356 if (IN_RECOVERY(tp->t_flags)) 6357 rack_exit_recovery(tp, rack, 2); 6358 rack->r_ctl.bytes_acked_in_recovery = 0; 6359 rack->r_ctl.time_entered_recovery = 0; 6360 orig_cwnd = tp->snd_cwnd; 6361 rack_log_to_prr(rack, 16, orig_cwnd, line); 6362 if (CC_ALGO(tp)->cong_signal == NULL) { 6363 /* TSNH */ 6364 tp->snd_ssthresh = max(2, 6365 min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 / 6366 ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp); 6367 tp->snd_cwnd = ctf_fixed_maxseg(tp); 6368 } 6369 if (tp->t_flags2 & TF2_ECN_PERMIT) 6370 tp->t_flags2 |= TF2_ECN_SND_CWR; 6371 break; 6372 case CC_RTO_ERR: 6373 KMOD_TCPSTAT_INC(tcps_sndrexmitbad); 6374 /* RTO was unnecessary, so reset everything. 
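 * (snd_cwnd, snd_ssthresh and snd_recover are restored below from
 * their saved *_prev values, and any fast/congestion recovery state
 * that was in effect before the spurious RTO is re-entered.)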
*/ 6375 tp->snd_cwnd = tp->snd_cwnd_prev; 6376 tp->snd_ssthresh = tp->snd_ssthresh_prev; 6377 tp->snd_recover = tp->snd_recover_prev; 6378 if (tp->t_flags & TF_WASFRECOVERY) { 6379 ENTER_FASTRECOVERY(tp->t_flags); 6380 tp->t_flags &= ~TF_WASFRECOVERY; 6381 } 6382 if (tp->t_flags & TF_WASCRECOVERY) { 6383 ENTER_CONGRECOVERY(tp->t_flags); 6384 tp->t_flags &= ~TF_WASCRECOVERY; 6385 } 6386 tp->snd_nxt = tp->snd_max; 6387 tp->t_badrxtwin = 0; 6388 break; 6389 } 6390 if ((CC_ALGO(tp)->cong_signal != NULL) && 6391 (type != CC_RTO)){ 6392 tp->t_ccv.curack = ack; 6393 CC_ALGO(tp)->cong_signal(&tp->t_ccv, type); 6394 } 6395 if ((in_rec_at_entry == 0) && IN_RECOVERY(tp->t_flags)) { 6396 rack_log_to_prr(rack, 15, cwnd_enter, line); 6397 rack->r_ctl.dsack_byte_cnt = 0; 6398 rack->r_ctl.retran_during_recovery = 0; 6399 rack->r_ctl.rc_cwnd_at_erec = cwnd_enter; 6400 rack->r_ctl.rc_ssthresh_at_erec = ssthresh_enter; 6401 rack->r_ent_rec_ns = 1; 6402 } 6403 } 6404 6405 static inline void 6406 rack_cc_after_idle(struct tcp_rack *rack, struct tcpcb *tp) 6407 { 6408 uint32_t i_cwnd; 6409 6410 INP_WLOCK_ASSERT(tptoinpcb(tp)); 6411 6412 if (CC_ALGO(tp)->after_idle != NULL) 6413 CC_ALGO(tp)->after_idle(&tp->t_ccv); 6414 6415 if (tp->snd_cwnd == 1) 6416 i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ 6417 else 6418 i_cwnd = rc_init_window(rack); 6419 6420 /* 6421 * Being idle is no different than the initial window. If the cc 6422 * clamps it down below the initial window raise it to the initial 6423 * window. 6424 */ 6425 if (tp->snd_cwnd < i_cwnd) { 6426 tp->snd_cwnd = i_cwnd; 6427 } 6428 } 6429 6430 /* 6431 * Indicate whether this ack should be delayed. We can delay the ack if 6432 * following conditions are met: 6433 * - There is no delayed ack timer in progress. 6434 * - Our last ack wasn't a 0-sized window. We never want to delay 6435 * the ack that opens up a 0-sized window. 6436 * - LRO wasn't used for this segment. We make sure by checking that the 6437 * segment size is not larger than the MSS. 6438 * - Delayed acks are enabled or this is a half-synchronized T/TCP 6439 * connection. 6440 */ 6441 #define DELAY_ACK(tp, tlen) \ 6442 (((tp->t_flags & TF_RXWIN0SENT) == 0) && \ 6443 ((tp->t_flags & TF_DELACK) == 0) && \ 6444 (tlen <= tp->t_maxseg) && \ 6445 (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN))) 6446 6447 static struct rack_sendmap * 6448 rack_find_lowest_rsm(struct tcp_rack *rack) 6449 { 6450 struct rack_sendmap *rsm; 6451 6452 /* 6453 * Walk the time-order transmitted list looking for an rsm that is 6454 * not acked. This will be the one that was sent the longest time 6455 * ago that is still outstanding. 6456 */ 6457 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 6458 if (rsm->r_flags & RACK_ACKED) { 6459 continue; 6460 } 6461 goto finish; 6462 } 6463 finish: 6464 return (rsm); 6465 } 6466 6467 static struct rack_sendmap * 6468 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm) 6469 { 6470 struct rack_sendmap *prsm; 6471 6472 /* 6473 * Walk the sequence order list backward until we hit and arrive at 6474 * the highest seq not acked. In theory when this is called it 6475 * should be the last segment (which it was not). 
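 *
 * Concretely: we walk backward (right to left) from rsm, skipping
 * anything marked RACK_ACKED or RACK_HAS_FIN, and return the first
 * entry that is neither, or NULL if every entry going back is acked.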
 */
    prsm = rsm;

    TQHASH_FOREACH_REVERSE_FROM(prsm, rack->r_ctl.tqh) {
        if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) {
            continue;
        }
        return (prsm);
    }
    return (NULL);
}

static uint32_t
rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts, int line, int log_allowed)
{
    int32_t lro;
    uint32_t thresh;

    /*
     * lro is the flag we use to determine if we have seen reordering.
     * If it gets set we have seen reordering. The reorder logic either
     * works in one of two ways:
     *
     * If reorder-fade is configured, then we track the last time we saw
     * re-ordering occur. If we reach the point where enough time has
     * passed we no longer consider reordering to be occurring.
     *
     * Or if reorder-fade is 0, then once we see reordering we consider
     * the connection to always be subject to reordering and just set lro
     * to 1.
     *
     * In the end if lro is non-zero we add the extra time for
     * reordering in.
     */
    if (srtt == 0)
        srtt = 1;
    if (rack->r_ctl.rc_reorder_ts) {
        if (rack->r_ctl.rc_reorder_fade) {
            if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) {
                lro = cts - rack->r_ctl.rc_reorder_ts;
                if (lro == 0) {
                    /*
                     * No time has passed since the last
                     * reorder, mark it as reordering.
                     */
                    lro = 1;
                }
            } else {
                /* Negative time? */
                lro = 0;
            }
            if (lro > rack->r_ctl.rc_reorder_fade) {
                /* Turn off reordering seen too */
                rack->r_ctl.rc_reorder_ts = 0;
                lro = 0;
            }
        } else {
            /* Reordering does not fade */
            lro = 1;
        }
    } else {
        lro = 0;
    }
    if (rack->rc_rack_tmr_std_based == 0) {
        thresh = srtt + rack->r_ctl.rc_pkt_delay;
    } else {
        /* Standards based pkt-delay is 1/4 srtt */
        thresh = srtt + (srtt >> 2);
    }
    if (lro && (rack->rc_rack_tmr_std_based == 0)) {
        /* It must be set, if not you get 1/4 rtt */
        if (rack->r_ctl.rc_reorder_shift)
            thresh += (srtt >> rack->r_ctl.rc_reorder_shift);
        else
            thresh += (srtt >> 2);
    }
    if (rack->rc_rack_use_dsack &&
        lro &&
        (rack->r_ctl.num_dsack > 0)) {
        /*
         * We only increase the reordering window if we
         * have seen reordering <and> we have a DSACK count.
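 *
 * For illustration (hypothetical numbers): with srtt = 40000 usecs
 * and num_dsack = 2, the threshold below grows by
 * 2 * (40000 >> 2) = 20000 usecs, still subject to the srtt * 2
 * ceiling and the rack_rto_max clamp that follow.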
6558 */ 6559 thresh += rack->r_ctl.num_dsack * (srtt >> 2); 6560 if (log_allowed) 6561 rack_log_dsack_event(rack, 4, line, srtt, thresh); 6562 } 6563 /* SRTT * 2 is the ceiling */ 6564 if (thresh > (srtt * 2)) { 6565 thresh = srtt * 2; 6566 } 6567 /* And we don't want it above the RTO max either */ 6568 if (thresh > rack_rto_max) { 6569 thresh = rack_rto_max; 6570 } 6571 if (log_allowed) 6572 rack_log_dsack_event(rack, 6, line, srtt, thresh); 6573 return (thresh); 6574 } 6575 6576 static uint32_t 6577 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack, 6578 struct rack_sendmap *rsm, uint32_t srtt) 6579 { 6580 struct rack_sendmap *prsm; 6581 uint32_t thresh, len; 6582 int segsiz; 6583 6584 if (srtt == 0) 6585 srtt = 1; 6586 if (rack->r_ctl.rc_tlp_threshold) 6587 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold); 6588 else 6589 thresh = (srtt * 2); 6590 6591 /* Get the previous sent packet, if any */ 6592 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 6593 len = rsm->r_end - rsm->r_start; 6594 if (rack->rack_tlp_threshold_use == TLP_USE_ID) { 6595 /* Exactly like the ID */ 6596 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= segsiz) { 6597 uint32_t alt_thresh; 6598 /* 6599 * Compensate for delayed-ack with the d-ack time. 6600 */ 6601 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 6602 if (alt_thresh > thresh) 6603 thresh = alt_thresh; 6604 } 6605 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) { 6606 /* 2.1 behavior */ 6607 prsm = TAILQ_PREV(rsm, rack_head, r_tnext); 6608 if (prsm && (len <= segsiz)) { 6609 /* 6610 * Two packets outstanding, thresh should be (2*srtt) + 6611 * possible inter-packet delay (if any). 6612 */ 6613 uint32_t inter_gap = 0; 6614 int idx, nidx; 6615 6616 idx = rsm->r_rtr_cnt - 1; 6617 nidx = prsm->r_rtr_cnt - 1; 6618 if (rsm->r_tim_lastsent[nidx] >= prsm->r_tim_lastsent[idx]) { 6619 /* Yes it was sent later (or at the same time) */ 6620 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; 6621 } 6622 thresh += inter_gap; 6623 } else if (len <= segsiz) { 6624 /* 6625 * Possibly compensate for delayed-ack. 6626 */ 6627 uint32_t alt_thresh; 6628 6629 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 6630 if (alt_thresh > thresh) 6631 thresh = alt_thresh; 6632 } 6633 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) { 6634 /* 2.2 behavior */ 6635 if (len <= segsiz) { 6636 uint32_t alt_thresh; 6637 /* 6638 * Compensate for delayed-ack with the d-ack time. 6639 */ 6640 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 6641 if (alt_thresh > thresh) 6642 thresh = alt_thresh; 6643 } 6644 } 6645 /* Not above an RTO */ 6646 if (thresh > tp->t_rxtcur) { 6647 thresh = tp->t_rxtcur; 6648 } 6649 /* Not above a RTO max */ 6650 if (thresh > rack_rto_max) { 6651 thresh = rack_rto_max; 6652 } 6653 /* Apply user supplied min TLP */ 6654 if (thresh < rack_tlp_min) { 6655 thresh = rack_tlp_min; 6656 } 6657 return (thresh); 6658 } 6659 6660 static uint32_t 6661 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack) 6662 { 6663 /* 6664 * We want the rack_rtt which is the 6665 * last rtt we measured. However if that 6666 * does not exist we fallback to the srtt (which 6667 * we probably will never do) and then as a last 6668 * resort we use RACK_INITIAL_RTO if no srtt is 6669 * yet set. 
 */
    if (rack->rc_rack_rtt)
        return (rack->rc_rack_rtt);
    else if (tp->t_srtt == 0)
        return (RACK_INITIAL_RTO);
    return (tp->t_srtt);
}

static struct rack_sendmap *
rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused)
{
    /*
     * Check to see that we don't need to fall into recovery. We will
     * need to do so if our oldest transmit is past the time we should
     * have had an ack.
     */
    struct tcp_rack *rack;
    struct rack_sendmap *rsm;
    int32_t idx;
    uint32_t srtt, thresh;

    rack = (struct tcp_rack *)tp->t_fb_ptr;
    if (tqhash_empty(rack->r_ctl.tqh)) {
        return (NULL);
    }
    rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
    if (rsm == NULL)
        return (NULL);

    if (rsm->r_flags & RACK_ACKED) {
        rsm = rack_find_lowest_rsm(rack);
        if (rsm == NULL)
            return (NULL);
    }
    idx = rsm->r_rtr_cnt - 1;
    srtt = rack_grab_rtt(tp, rack);
    thresh = rack_calc_thresh_rack(rack, srtt, tsused, __LINE__, 1);
    if (TSTMP_LT(tsused, ((uint32_t)rsm->r_tim_lastsent[idx]))) {
        return (NULL);
    }
    if ((tsused - ((uint32_t)rsm->r_tim_lastsent[idx])) < thresh) {
        return (NULL);
    }
    /* Ok, if we reach here we are over-due and this guy can be sent */
    rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__);
    return (rsm);
}

static uint32_t
rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack)
{
    int32_t t;
    int32_t tt;
    uint32_t ret_val;

    t = (tp->t_srtt + (tp->t_rttvar << 2));
    RACK_TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
        rack_persist_min, rack_persist_max, rack->r_ctl.timer_slop);
    rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT;
    ret_val = (uint32_t)tt;
    return (ret_val);
}

static uint32_t
rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack)
{
    /*
     * Start the FR timer, we do this based on getting the first one in
     * the rc_tmap. Note that if it's NULL we must stop the timer. In all
     * events we need to stop the running timer (if it's running) before
     * starting the new one.
     */
    uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse;
    uint32_t srtt_cur;
    int32_t idx;
    int32_t is_tlp_timer = 0;
    struct rack_sendmap *rsm;

    if (rack->t_timers_stopped) {
        /* All timers have been stopped, none are to run */
        return (0);
    }
    if (rack->rc_in_persist) {
        /* We can't start any timer in persists */
        return (rack_get_persists_timer_val(tp, rack));
    }
    rack->rc_on_min_to = 0;
    if ((tp->t_state < TCPS_ESTABLISHED) ||
        ((tp->t_flags & TF_SACK_PERMIT) == 0)) {
        goto activate_rxt;
    }
    rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
    if ((rsm == NULL) || sup_rack) {
        /* Nothing on the send map or no rack */
activate_rxt:
        time_since_sent = 0;
        rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
        if (rsm) {
            /*
             * Should we discount the RTX timer any?
             *
             * We want to discount it the smallest amount.
             * If a timer (Rack/TLP or RXT) has gone off more
             * recently, that's the discount we want to use (now - timer time).
             * If the retransmit of the oldest packet was more recent, then
             * we want to use that (now - oldest-packet-last_transmit_time).
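 * For example (hypothetical numbers): with t_rxtcur at 400000 usecs
 * and the more recent of those two events 150000 usecs ago, the RXT
 * timer below is armed for 250000 usecs rather than the full value
 * (never less than rc_min_to).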
6777 * 6778 */ 6779 idx = rsm->r_rtr_cnt - 1; 6780 if (TSTMP_GEQ(rack->r_ctl.rc_tlp_rxt_last_time, ((uint32_t)rsm->r_tim_lastsent[idx]))) 6781 tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time; 6782 else 6783 tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx]; 6784 if (TSTMP_GT(cts, tstmp_touse)) 6785 time_since_sent = cts - tstmp_touse; 6786 } 6787 if (SEQ_LT(tp->snd_una, tp->snd_max) || 6788 sbavail(&tptosocket(tp)->so_snd)) { 6789 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; 6790 to = tp->t_rxtcur; 6791 if (to > time_since_sent) 6792 to -= time_since_sent; 6793 else 6794 to = rack->r_ctl.rc_min_to; 6795 if (to == 0) 6796 to = 1; 6797 /* Special case for KEEPINIT */ 6798 if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) && 6799 (TP_KEEPINIT(tp) != 0) && 6800 rsm) { 6801 /* 6802 * We have to put a ceiling on the rxt timer 6803 * of the keep-init timeout. 6804 */ 6805 uint32_t max_time, red; 6806 6807 max_time = TICKS_2_USEC(TP_KEEPINIT(tp)); 6808 if (TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) { 6809 red = (cts - (uint32_t)rsm->r_tim_lastsent[0]); 6810 if (red < max_time) 6811 max_time -= red; 6812 else 6813 max_time = 1; 6814 } 6815 /* Reduce timeout to the keep value if needed */ 6816 if (max_time < to) 6817 to = max_time; 6818 } 6819 return (to); 6820 } 6821 return (0); 6822 } 6823 if (rsm->r_flags & RACK_ACKED) { 6824 rsm = rack_find_lowest_rsm(rack); 6825 if (rsm == NULL) { 6826 /* No lowest? */ 6827 goto activate_rxt; 6828 } 6829 } 6830 /* Convert from ms to usecs */ 6831 if ((rsm->r_flags & RACK_SACK_PASSED) || 6832 (rsm->r_flags & RACK_RWND_COLLAPSED) || 6833 (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { 6834 if ((tp->t_flags & TF_SENTFIN) && 6835 ((tp->snd_max - tp->snd_una) == 1) && 6836 (rsm->r_flags & RACK_HAS_FIN)) { 6837 /* 6838 * We don't start a rack timer if all we have is a 6839 * FIN outstanding. 6840 */ 6841 goto activate_rxt; 6842 } 6843 if ((rack->use_rack_rr == 0) && 6844 (IN_FASTRECOVERY(tp->t_flags)) && 6845 (rack->rack_no_prr == 0) && 6846 (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { 6847 /* 6848 * We are not cheating, in recovery and 6849 * not enough ack's to yet get our next 6850 * retransmission out. 6851 * 6852 * Note that classified attackers do not 6853 * get to use the rack-cheat. 6854 */ 6855 goto activate_tlp; 6856 } 6857 srtt = rack_grab_rtt(tp, rack); 6858 thresh = rack_calc_thresh_rack(rack, srtt, cts, __LINE__, 1); 6859 idx = rsm->r_rtr_cnt - 1; 6860 exp = ((uint32_t)rsm->r_tim_lastsent[idx]) + thresh; 6861 if (SEQ_GEQ(exp, cts)) { 6862 to = exp - cts; 6863 if (to < rack->r_ctl.rc_min_to) { 6864 to = rack->r_ctl.rc_min_to; 6865 if (rack->r_rr_config == 3) 6866 rack->rc_on_min_to = 1; 6867 } 6868 } else { 6869 to = rack->r_ctl.rc_min_to; 6870 if (rack->r_rr_config == 3) 6871 rack->rc_on_min_to = 1; 6872 } 6873 } else { 6874 /* Ok we need to do a TLP not RACK */ 6875 activate_tlp: 6876 if ((rack->rc_tlp_in_progress != 0) && 6877 (rack->r_ctl.rc_tlp_cnt_out >= rack_tlp_limit)) { 6878 /* 6879 * The previous send was a TLP and we have sent 6880 * N TLP's without sending new data. 6881 */ 6882 goto activate_rxt; 6883 } 6884 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 6885 if (rsm == NULL) { 6886 /* We found no rsm to TLP with. 
*/ 6887 goto activate_rxt; 6888 } 6889 if (rsm->r_flags & RACK_HAS_FIN) { 6890 /* If its a FIN we dont do TLP */ 6891 rsm = NULL; 6892 goto activate_rxt; 6893 } 6894 idx = rsm->r_rtr_cnt - 1; 6895 time_since_sent = 0; 6896 if (TSTMP_GEQ(((uint32_t)rsm->r_tim_lastsent[idx]), rack->r_ctl.rc_tlp_rxt_last_time)) 6897 tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx]; 6898 else 6899 tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time; 6900 if (TSTMP_GT(cts, tstmp_touse)) 6901 time_since_sent = cts - tstmp_touse; 6902 is_tlp_timer = 1; 6903 if (tp->t_srtt) { 6904 if ((rack->rc_srtt_measure_made == 0) && 6905 (tp->t_srtt == 1)) { 6906 /* 6907 * If another stack as run and set srtt to 1, 6908 * then the srtt was 0, so lets use the initial. 6909 */ 6910 srtt = RACK_INITIAL_RTO; 6911 } else { 6912 srtt_cur = tp->t_srtt; 6913 srtt = srtt_cur; 6914 } 6915 } else 6916 srtt = RACK_INITIAL_RTO; 6917 /* 6918 * If the SRTT is not keeping up and the 6919 * rack RTT has spiked we want to use 6920 * the last RTT not the smoothed one. 6921 */ 6922 if (rack_tlp_use_greater && 6923 tp->t_srtt && 6924 (srtt < rack_grab_rtt(tp, rack))) { 6925 srtt = rack_grab_rtt(tp, rack); 6926 } 6927 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt); 6928 if (thresh > time_since_sent) { 6929 to = thresh - time_since_sent; 6930 } else { 6931 to = rack->r_ctl.rc_min_to; 6932 rack_log_alt_to_to_cancel(rack, 6933 thresh, /* flex1 */ 6934 time_since_sent, /* flex2 */ 6935 tstmp_touse, /* flex3 */ 6936 rack->r_ctl.rc_tlp_rxt_last_time, /* flex4 */ 6937 (uint32_t)rsm->r_tim_lastsent[idx], 6938 srtt, 6939 idx, 99); 6940 } 6941 if (to < rack_tlp_min) { 6942 to = rack_tlp_min; 6943 } 6944 if (to > TICKS_2_USEC(TCPTV_REXMTMAX)) { 6945 /* 6946 * If the TLP time works out to larger than the max 6947 * RTO lets not do TLP.. just RTO. 6948 */ 6949 goto activate_rxt; 6950 } 6951 } 6952 if (is_tlp_timer == 0) { 6953 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; 6954 } else { 6955 rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; 6956 } 6957 if (to == 0) 6958 to = 1; 6959 return (to); 6960 } 6961 6962 static void 6963 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, tcp_seq snd_una) 6964 { 6965 if (rack->rc_in_persist == 0) { 6966 if (tp->t_flags & TF_GPUTINPROG) { 6967 /* 6968 * Stop the goodput now, the calling of the 6969 * measurement function clears the flag. 
6970 */ 6971 rack_do_goodput_measurement(tp, rack, tp->snd_una, __LINE__, 6972 RACK_QUALITY_PERSIST); 6973 } 6974 #ifdef NETFLIX_SHARED_CWND 6975 if (rack->r_ctl.rc_scw) { 6976 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 6977 rack->rack_scwnd_is_idle = 1; 6978 } 6979 #endif 6980 rack->r_ctl.rc_went_idle_time = cts; 6981 if (rack->r_ctl.rc_went_idle_time == 0) 6982 rack->r_ctl.rc_went_idle_time = 1; 6983 if (rack->lt_bw_up) { 6984 /* Suspend our LT BW measurement */ 6985 uint64_t tmark; 6986 6987 rack->r_ctl.lt_bw_bytes += (snd_una - rack->r_ctl.lt_seq); 6988 rack->r_ctl.lt_seq = snd_una; 6989 tmark = tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time); 6990 if (tmark >= rack->r_ctl.lt_timemark) { 6991 rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark); 6992 } 6993 rack->r_ctl.lt_timemark = tmark; 6994 rack->lt_bw_up = 0; 6995 rack->r_persist_lt_bw_off = 1; 6996 } 6997 rack_timer_cancel(tp, rack, cts, __LINE__); 6998 rack->r_ctl.persist_lost_ends = 0; 6999 rack->probe_not_answered = 0; 7000 rack->forced_ack = 0; 7001 tp->t_rxtshift = 0; 7002 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 7003 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 7004 rack->rc_in_persist = 1; 7005 } 7006 } 7007 7008 static void 7009 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 7010 { 7011 if (tcp_in_hpts(rack->rc_tp)) { 7012 tcp_hpts_remove(rack->rc_tp); 7013 rack->r_ctl.rc_hpts_flags = 0; 7014 } 7015 #ifdef NETFLIX_SHARED_CWND 7016 if (rack->r_ctl.rc_scw) { 7017 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 7018 rack->rack_scwnd_is_idle = 0; 7019 } 7020 #endif 7021 if (rack->rc_gp_dyn_mul && 7022 (rack->use_fixed_rate == 0) && 7023 (rack->rc_always_pace)) { 7024 /* 7025 * Do we count this as if a probe-rtt just 7026 * finished? 7027 */ 7028 uint32_t time_idle, idle_min; 7029 7030 time_idle = cts - rack->r_ctl.rc_went_idle_time; 7031 idle_min = rack_min_probertt_hold; 7032 if (rack_probertt_gpsrtt_cnt_div) { 7033 uint64_t extra; 7034 extra = (uint64_t)rack->r_ctl.rc_gp_srtt * 7035 (uint64_t)rack_probertt_gpsrtt_cnt_mul; 7036 extra /= (uint64_t)rack_probertt_gpsrtt_cnt_div; 7037 idle_min += (uint32_t)extra; 7038 } 7039 if (time_idle >= idle_min) { 7040 /* Yes, we count it as a probe-rtt. 
*/ 7041 uint32_t us_cts; 7042 7043 us_cts = tcp_get_usecs(NULL); 7044 if (rack->in_probe_rtt == 0) { 7045 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 7046 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; 7047 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; 7048 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; 7049 } else { 7050 rack_exit_probertt(rack, us_cts); 7051 } 7052 } 7053 } 7054 if (rack->r_persist_lt_bw_off) { 7055 /* Continue where we left off */ 7056 rack->r_ctl.lt_timemark = tcp_get_u64_usecs(NULL); 7057 rack->lt_bw_up = 1; 7058 rack->r_persist_lt_bw_off = 0; 7059 } 7060 rack->r_ctl.idle_snd_una = tp->snd_una; 7061 rack->rc_in_persist = 0; 7062 rack->r_ctl.rc_went_idle_time = 0; 7063 tp->t_rxtshift = 0; 7064 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 7065 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 7066 rack->r_ctl.rc_agg_delayed = 0; 7067 rack->r_early = 0; 7068 rack->r_late = 0; 7069 rack->r_ctl.rc_agg_early = 0; 7070 } 7071 7072 static void 7073 rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts, 7074 struct hpts_diag *diag, struct timeval *tv) 7075 { 7076 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 7077 union tcp_log_stackspecific log; 7078 7079 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 7080 log.u_bbr.flex1 = diag->p_nxt_slot; 7081 log.u_bbr.flex2 = diag->p_cur_slot; 7082 log.u_bbr.flex3 = diag->slot_req; 7083 log.u_bbr.flex4 = diag->inp_hptsslot; 7084 log.u_bbr.flex5 = diag->slot_remaining; 7085 log.u_bbr.flex6 = diag->need_new_to; 7086 log.u_bbr.flex7 = diag->p_hpts_active; 7087 log.u_bbr.flex8 = diag->p_on_min_sleep; 7088 /* Hijack other fields as needed */ 7089 log.u_bbr.epoch = diag->have_slept; 7090 log.u_bbr.lt_epoch = diag->yet_to_sleep; 7091 log.u_bbr.pkts_out = diag->co_ret; 7092 log.u_bbr.applimited = diag->hpts_sleep_time; 7093 log.u_bbr.delivered = diag->p_prev_slot; 7094 log.u_bbr.inflight = diag->p_runningslot; 7095 log.u_bbr.bw_inuse = diag->wheel_slot; 7096 log.u_bbr.rttProp = diag->wheel_cts; 7097 log.u_bbr.timeStamp = cts; 7098 log.u_bbr.delRate = diag->maxslots; 7099 log.u_bbr.cur_del_rate = diag->p_curtick; 7100 log.u_bbr.cur_del_rate <<= 32; 7101 log.u_bbr.cur_del_rate |= diag->p_lasttick; 7102 TCP_LOG_EVENTP(rack->rc_tp, NULL, 7103 &rack->rc_inp->inp_socket->so_rcv, 7104 &rack->rc_inp->inp_socket->so_snd, 7105 BBR_LOG_HPTSDIAG, 0, 7106 0, &log, false, tv); 7107 } 7108 7109 } 7110 7111 static void 7112 rack_log_wakeup(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb, uint32_t len, int type) 7113 { 7114 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 7115 union tcp_log_stackspecific log; 7116 struct timeval tv; 7117 7118 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 7119 log.u_bbr.flex1 = sb->sb_flags; 7120 log.u_bbr.flex2 = len; 7121 log.u_bbr.flex3 = sb->sb_state; 7122 log.u_bbr.flex8 = type; 7123 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 7124 TCP_LOG_EVENTP(rack->rc_tp, NULL, 7125 &rack->rc_inp->inp_socket->so_rcv, 7126 &rack->rc_inp->inp_socket->so_snd, 7127 TCP_LOG_SB_WAKE, 0, 7128 len, &log, false, &tv); 7129 } 7130 } 7131 7132 static void 7133 rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, 7134 int32_t slot, uint32_t tot_len_this_send, int sup_rack) 7135 { 7136 struct hpts_diag diag; 7137 struct inpcb *inp = tptoinpcb(tp); 7138 struct timeval tv; 7139 uint32_t delayed_ack = 0; 7140 uint32_t hpts_timeout; 7141 uint32_t entry_slot = slot; 7142 uint8_t stopped; 7143 uint32_t left = 0; 7144 
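    /*
     * Note: slot is the requested pacing delay in usecs (an assumption
     * drawn from the HPTS_USEC_TO_SLOTS() conversions below), while
     * hpts_timeout will hold whichever protocol timer (RXT/TLP/RACK,
     * persist, keep-alive or delayed-ack) we may also need to honor.
     */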
uint32_t us_cts; 7145 7146 if ((tp->t_state == TCPS_CLOSED) || 7147 (tp->t_state == TCPS_LISTEN)) { 7148 return; 7149 } 7150 if (tcp_in_hpts(tp)) { 7151 /* Already on the pacer */ 7152 return; 7153 } 7154 stopped = rack->rc_tmr_stopped; 7155 if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { 7156 left = rack->r_ctl.rc_timer_exp - cts; 7157 } 7158 rack->r_ctl.rc_timer_exp = 0; 7159 rack->r_ctl.rc_hpts_flags = 0; 7160 us_cts = tcp_get_usecs(&tv); 7161 /* Now early/late accounting */ 7162 rack_log_pacing_delay_calc(rack, entry_slot, slot, 0, 0, 0, 26, __LINE__, NULL, 0); 7163 if (rack->r_early && (rack->rc_ack_can_sendout_data == 0)) { 7164 /* 7165 * We have a early carry over set, 7166 * we can always add more time so we 7167 * can always make this compensation. 7168 * 7169 * Note if ack's are allowed to wake us do not 7170 * penalize the next timer for being awoke 7171 * by an ack aka the rc_agg_early (non-paced mode). 7172 */ 7173 slot += rack->r_ctl.rc_agg_early; 7174 rack->r_early = 0; 7175 rack->r_ctl.rc_agg_early = 0; 7176 } 7177 if ((rack->r_late) && 7178 ((rack->r_use_hpts_min == 0) || (rack->dgp_on == 0))) { 7179 /* 7180 * This is harder, we can 7181 * compensate some but it 7182 * really depends on what 7183 * the current pacing time is. 7184 */ 7185 if (rack->r_ctl.rc_agg_delayed >= slot) { 7186 /* 7187 * We can't compensate for it all. 7188 * And we have to have some time 7189 * on the clock. We always have a min 7190 * 10 slots (10 x 10 i.e. 100 usecs). 7191 */ 7192 if (slot <= HPTS_TICKS_PER_SLOT) { 7193 /* We gain delay */ 7194 rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_SLOT - slot); 7195 slot = HPTS_TICKS_PER_SLOT; 7196 } else { 7197 /* We take off some */ 7198 rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_SLOT); 7199 slot = HPTS_TICKS_PER_SLOT; 7200 } 7201 } else { 7202 slot -= rack->r_ctl.rc_agg_delayed; 7203 rack->r_ctl.rc_agg_delayed = 0; 7204 /* Make sure we have 100 useconds at minimum */ 7205 if (slot < HPTS_TICKS_PER_SLOT) { 7206 rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_SLOT - slot; 7207 slot = HPTS_TICKS_PER_SLOT; 7208 } 7209 if (rack->r_ctl.rc_agg_delayed == 0) 7210 rack->r_late = 0; 7211 } 7212 } else if (rack->r_late) { 7213 /* r_use_hpts_min is on and so is DGP */ 7214 uint32_t max_red; 7215 7216 max_red = (slot * rack->r_ctl.max_reduction) / 100; 7217 if (max_red >= rack->r_ctl.rc_agg_delayed) { 7218 slot -= rack->r_ctl.rc_agg_delayed; 7219 rack->r_ctl.rc_agg_delayed = 0; 7220 } else { 7221 slot -= max_red; 7222 rack->r_ctl.rc_agg_delayed -= max_red; 7223 } 7224 } 7225 if ((rack->r_use_hpts_min == 1) && 7226 (slot > 0) && 7227 (rack->dgp_on == 1)) { 7228 /* 7229 * We are enforcing a min pacing timer 7230 * based on our hpts min timeout. 7231 */ 7232 uint32_t min; 7233 7234 min = get_hpts_min_sleep_time(); 7235 if (min > slot) { 7236 slot = min; 7237 } 7238 } 7239 hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack); 7240 if (tp->t_flags & TF_DELACK) { 7241 delayed_ack = TICKS_2_USEC(tcp_delacktime); 7242 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; 7243 } 7244 if (delayed_ack && ((hpts_timeout == 0) || 7245 (delayed_ack < hpts_timeout))) 7246 hpts_timeout = delayed_ack; 7247 else 7248 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 7249 /* 7250 * If no timers are going to run and we will fall off the hptsi 7251 * wheel, we resort to a keep-alive timer if its configured. 
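 *
 * e.g. an established connection with SO_KEEPALIVE set, no data to
 * pace and no other timer pending will be queued for TP_KEEPIDLE(tp)
 * (converted to usecs), or for rack_min_probertt_hold when we are in
 * probe-rtt and want to wake near its exit instead.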
7252 */ 7253 if ((hpts_timeout == 0) && 7254 (slot == 0)) { 7255 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 7256 (tp->t_state <= TCPS_CLOSING)) { 7257 /* 7258 * Ok we have no timer (persists, rack, tlp, rxt or 7259 * del-ack), we don't have segments being paced. So 7260 * all that is left is the keepalive timer. 7261 */ 7262 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 7263 /* Get the established keep-alive time */ 7264 hpts_timeout = TICKS_2_USEC(TP_KEEPIDLE(tp)); 7265 } else { 7266 /* 7267 * Get the initial setup keep-alive time, 7268 * note that this is probably not going to 7269 * happen, since rack will be running a rxt timer 7270 * if a SYN of some sort is outstanding. It is 7271 * actually handled in rack_timeout_rxt(). 7272 */ 7273 hpts_timeout = TICKS_2_USEC(TP_KEEPINIT(tp)); 7274 } 7275 rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; 7276 if (rack->in_probe_rtt) { 7277 /* 7278 * We want to instead not wake up a long time from 7279 * now but to wake up about the time we would 7280 * exit probe-rtt and initiate a keep-alive ack. 7281 * This will get us out of probe-rtt and update 7282 * our min-rtt. 7283 */ 7284 hpts_timeout = rack_min_probertt_hold; 7285 } 7286 } 7287 } 7288 if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) == 7289 (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) { 7290 /* 7291 * RACK, TLP, persists and RXT timers all are restartable 7292 * based on actions input .. i.e we received a packet (ack 7293 * or sack) and that changes things (rw, or snd_una etc). 7294 * Thus we can restart them with a new value. For 7295 * keep-alive, delayed_ack we keep track of what was left 7296 * and restart the timer with a smaller value. 7297 */ 7298 if (left < hpts_timeout) 7299 hpts_timeout = left; 7300 } 7301 if (hpts_timeout) { 7302 /* 7303 * Hack alert for now we can't time-out over 2,147,483 7304 * seconds (a bit more than 596 hours), which is probably ok 7305 * :). 7306 */ 7307 if (hpts_timeout > 0x7ffffffe) 7308 hpts_timeout = 0x7ffffffe; 7309 rack->r_ctl.rc_timer_exp = cts + hpts_timeout; 7310 } 7311 rack_log_pacing_delay_calc(rack, entry_slot, slot, hpts_timeout, 0, 0, 27, __LINE__, NULL, 0); 7312 if ((rack->gp_ready == 0) && 7313 (rack->use_fixed_rate == 0) && 7314 (hpts_timeout < slot) && 7315 (rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) { 7316 /* 7317 * We have no good estimate yet for the 7318 * old clunky burst mitigation or the 7319 * real pacing. And the tlp or rxt is smaller 7320 * than the pacing calculation. Lets not 7321 * pace that long since we know the calculation 7322 * so far is not accurate. 7323 */ 7324 slot = hpts_timeout; 7325 } 7326 /** 7327 * Turn off all the flags for queuing by default. The 7328 * flags have important meanings to what happens when 7329 * LRO interacts with the transport. Most likely (by default now) 7330 * mbuf_queueing and ack compression are on. So the transport 7331 * has a couple of flags that control what happens (if those 7332 * are not on then these flags won't have any effect since it 7333 * won't go through the queuing LRO path). 7334 * 7335 * TF2_MBUF_QUEUE_READY - This flags says that I am busy 7336 * pacing output, so don't disturb. But 7337 * it also means LRO can wake me if there 7338 * is a SACK arrival. 7339 * 7340 * TF2_DONT_SACK_QUEUE - This flag is used in conjunction 7341 * with the above flag (QUEUE_READY) and 7342 * when present it says don't even wake me 7343 * if a SACK arrives. 
7344 * 7345 * The idea behind these flags is that if we are pacing we 7346 * set the MBUF_QUEUE_READY and only get woken up if 7347 * a SACK arrives (which could change things) or if 7348 * our pacing timer expires. If, however, we have a rack 7349 * timer running, then we don't even want a sack to wake 7350 * us since the rack timer has to expire before we can send. 7351 * 7352 * Other cases should usually have none of the flags set 7353 * so LRO can call into us. 7354 */ 7355 tp->t_flags2 &= ~(TF2_DONT_SACK_QUEUE|TF2_MBUF_QUEUE_READY); 7356 if (slot) { 7357 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; 7358 rack->r_ctl.rc_last_output_to = us_cts + slot; 7359 /* 7360 * A pacing timer (slot) is being set, in 7361 * such a case we cannot send (we are blocked by 7362 * the timer). So lets tell LRO that it should not 7363 * wake us unless there is a SACK. Note this only 7364 * will be effective if mbuf queueing is on or 7365 * compressed acks are being processed. 7366 */ 7367 tp->t_flags2 |= TF2_MBUF_QUEUE_READY; 7368 /* 7369 * But wait if we have a Rack timer running 7370 * even a SACK should not disturb us (with 7371 * the exception of r_rr_config 3). 7372 */ 7373 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) || 7374 (IN_RECOVERY(tp->t_flags))) { 7375 if (rack->r_rr_config != 3) 7376 tp->t_flags2 |= TF2_DONT_SACK_QUEUE; 7377 else if (rack->rc_pace_dnd) { 7378 /* 7379 * When DND is on, we only let a sack 7380 * interrupt us if we are not in recovery. 7381 * 7382 * If DND is off, then we never hit here 7383 * and let all sacks wake us up. 7384 * 7385 */ 7386 tp->t_flags2 |= TF2_DONT_SACK_QUEUE; 7387 } 7388 } 7389 if (rack->rc_ack_can_sendout_data) { 7390 /* 7391 * Ahh but wait, this is that special case 7392 * where the pacing timer can be disturbed 7393 * backout the changes (used for non-paced 7394 * burst limiting). 7395 */ 7396 tp->t_flags2 &= ~(TF2_DONT_SACK_QUEUE | 7397 TF2_MBUF_QUEUE_READY); 7398 } 7399 if ((rack->use_rack_rr) && 7400 (rack->r_rr_config < 2) && 7401 ((hpts_timeout) && (hpts_timeout < slot))) { 7402 /* 7403 * Arrange for the hpts to kick back in after the 7404 * t-o if the t-o does not cause a send. 7405 */ 7406 (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout), 7407 __LINE__, &diag); 7408 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 7409 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 7410 } else { 7411 (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(slot), 7412 __LINE__, &diag); 7413 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 7414 rack_log_to_start(rack, cts, hpts_timeout, slot, 1); 7415 } 7416 } else if (hpts_timeout) { 7417 /* 7418 * With respect to t_flags2(?) here, lets let any new acks wake 7419 * us up here. Since we are not pacing (no pacing timer), output 7420 * can happen so we should let it. If its a Rack timer, then any inbound 7421 * packet probably won't change the sending (we will be blocked) 7422 * but it may change the prr stats so letting it in (the set defaults 7423 * at the start of this block) are good enough. 
7424 */ 7425 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 7426 (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout), 7427 __LINE__, &diag); 7428 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 7429 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 7430 } else { 7431 /* No timer starting */ 7432 #ifdef INVARIANTS 7433 if (SEQ_GT(tp->snd_max, tp->snd_una)) { 7434 panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?", 7435 tp, rack, tot_len_this_send, cts, slot, hpts_timeout); 7436 } 7437 #endif 7438 } 7439 rack->rc_tmr_stopped = 0; 7440 if (slot) 7441 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv, __LINE__); 7442 } 7443 7444 static void 7445 rack_mark_lost(struct tcpcb *tp, 7446 struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t cts) 7447 { 7448 struct rack_sendmap *nrsm; 7449 uint32_t thresh, exp; 7450 7451 thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(tp, rack), cts, __LINE__, 0); 7452 nrsm = rsm; 7453 TAILQ_FOREACH_FROM(nrsm, &rack->r_ctl.rc_tmap, r_tnext) { 7454 if ((nrsm->r_flags & RACK_SACK_PASSED) == 0) { 7455 /* Got up to all that were marked sack-passed */ 7456 break; 7457 } 7458 if ((nrsm->r_flags & RACK_WAS_LOST) == 0) { 7459 exp = ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]) + thresh; 7460 if (TSTMP_LT(exp, cts) || (exp == cts)) { 7461 /* We now consider it lost */ 7462 nrsm->r_flags |= RACK_WAS_LOST; 7463 rack->r_ctl.rc_considered_lost += nrsm->r_end - nrsm->r_start; 7464 } else { 7465 /* Past here it won't be lost so stop */ 7466 break; 7467 } 7468 } 7469 } 7470 } 7471 7472 /* 7473 * RACK Timer, here we simply do logging and house keeping. 7474 * the normal rack_output() function will call the 7475 * appropriate thing to check if we need to do a RACK retransmit. 7476 * We return 1, saying don't proceed with rack_output only 7477 * when all timers have been stopped (destroyed PCB?). 7478 */ 7479 static int 7480 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 7481 { 7482 /* 7483 * This timer simply provides an internal trigger to send out data. 7484 * The check_recovery_mode call will see if there are needed 7485 * retransmissions, if so we will enter fast-recovery. The output 7486 * call may or may not do the same thing depending on sysctl 7487 * settings. 7488 */ 7489 struct rack_sendmap *rsm; 7490 7491 counter_u64_add(rack_to_tot, 1); 7492 if (rack->r_state && (rack->r_state != tp->t_state)) 7493 rack_set_state(tp, rack); 7494 rack->rc_on_min_to = 0; 7495 rsm = rack_check_recovery_mode(tp, cts); 7496 rack_log_to_event(rack, RACK_TO_FRM_RACK, rsm); 7497 if (rsm) { 7498 /* We need to stroke any lost that are now declared as lost */ 7499 rack_mark_lost(tp, rack, rsm, cts); 7500 rack->r_ctl.rc_resend = rsm; 7501 rack->r_timer_override = 1; 7502 if (rack->use_rack_rr) { 7503 /* 7504 * Don't accumulate extra pacing delay 7505 * we are allowing the rack timer to 7506 * over-ride pacing i.e. rrr takes precedence 7507 * if the pacing interval is longer than the rrr 7508 * time (in other words we get the min pacing 7509 * time versus rrr pacing time). 
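 *
 * e.g. if the pacer still had a 12000 usec slot outstanding
 * (hypothetical number), clearing PACE_PKT_OUTPUT below, together
 * with r_timer_override above, lets the rack-rr retransmission go
 * out now instead of waiting out that pacing delay.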
7510 */ 7511 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 7512 } 7513 } 7514 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; 7515 if (rsm == NULL) { 7516 /* restart a timer and return 1 */ 7517 rack_start_hpts_timer(rack, tp, cts, 7518 0, 0, 0); 7519 return (1); 7520 } 7521 if ((rack->policer_detect_on == 1) && 7522 (rack->rc_policer_detected == 0)) { 7523 /* 7524 * We do this early if we have not 7525 * deteceted to attempt to detect 7526 * quicker. Normally we want to do this 7527 * as recovery exits (and we will again). 7528 */ 7529 policer_detection(tp, rack, 0); 7530 } 7531 return (0); 7532 } 7533 7534 7535 7536 static void 7537 rack_adjust_orig_mlen(struct rack_sendmap *rsm) 7538 { 7539 7540 if ((M_TRAILINGROOM(rsm->m) != rsm->orig_t_space)) { 7541 /* 7542 * The trailing space changed, mbufs can grow 7543 * at the tail but they can't shrink from 7544 * it, KASSERT that. Adjust the orig_m_len to 7545 * compensate for this change. 7546 */ 7547 KASSERT((rsm->orig_t_space > M_TRAILINGROOM(rsm->m)), 7548 ("mbuf:%p rsm:%p trailing_space:%jd ots:%u oml:%u mlen:%u\n", 7549 rsm->m, 7550 rsm, 7551 (intmax_t)M_TRAILINGROOM(rsm->m), 7552 rsm->orig_t_space, 7553 rsm->orig_m_len, 7554 rsm->m->m_len)); 7555 rsm->orig_m_len += (rsm->orig_t_space - M_TRAILINGROOM(rsm->m)); 7556 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 7557 } 7558 if (rsm->m->m_len < rsm->orig_m_len) { 7559 /* 7560 * Mbuf shrank, trimmed off the top by an ack, our 7561 * offset changes. 7562 */ 7563 KASSERT((rsm->soff >= (rsm->orig_m_len - rsm->m->m_len)), 7564 ("mbuf:%p len:%u rsm:%p oml:%u soff:%u\n", 7565 rsm->m, rsm->m->m_len, 7566 rsm, rsm->orig_m_len, 7567 rsm->soff)); 7568 if (rsm->soff >= (rsm->orig_m_len - rsm->m->m_len)) 7569 rsm->soff -= (rsm->orig_m_len - rsm->m->m_len); 7570 else 7571 rsm->soff = 0; 7572 rsm->orig_m_len = rsm->m->m_len; 7573 #ifdef INVARIANTS 7574 } else if (rsm->m->m_len > rsm->orig_m_len) { 7575 panic("rsm:%p m:%p m_len grew outside of t_space compensation", 7576 rsm, rsm->m); 7577 #endif 7578 } 7579 } 7580 7581 static void 7582 rack_setup_offset_for_rsm(struct tcp_rack *rack, struct rack_sendmap *src_rsm, struct rack_sendmap *rsm) 7583 { 7584 struct mbuf *m; 7585 uint32_t soff; 7586 7587 if (src_rsm->m && 7588 ((src_rsm->orig_m_len != src_rsm->m->m_len) || 7589 (M_TRAILINGROOM(src_rsm->m) != src_rsm->orig_t_space))) { 7590 /* Fix up the orig_m_len and possibly the mbuf offset */ 7591 rack_adjust_orig_mlen(src_rsm); 7592 } 7593 m = src_rsm->m; 7594 soff = src_rsm->soff + (src_rsm->r_end - src_rsm->r_start); 7595 while (soff >= m->m_len) { 7596 /* Move out past this mbuf */ 7597 soff -= m->m_len; 7598 m = m->m_next; 7599 KASSERT((m != NULL), 7600 ("rsm:%p nrsm:%p hit at soff:%u null m", 7601 src_rsm, rsm, soff)); 7602 if (m == NULL) { 7603 /* This should *not* happen which is why there is a kassert */ 7604 src_rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 7605 (src_rsm->r_start - rack->rc_tp->snd_una), 7606 &src_rsm->soff); 7607 src_rsm->orig_m_len = src_rsm->m->m_len; 7608 src_rsm->orig_t_space = M_TRAILINGROOM(src_rsm->m); 7609 rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 7610 (rsm->r_start - rack->rc_tp->snd_una), 7611 &rsm->soff); 7612 rsm->orig_m_len = rsm->m->m_len; 7613 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 7614 return; 7615 } 7616 } 7617 rsm->m = m; 7618 rsm->soff = soff; 7619 rsm->orig_m_len = m->m_len; 7620 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 7621 } 7622 7623 static __inline void 7624 rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm, 7625 struct 
rack_sendmap *rsm, uint32_t start) 7626 { 7627 int idx; 7628 7629 nrsm->r_start = start; 7630 nrsm->r_end = rsm->r_end; 7631 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 7632 nrsm->r_act_rxt_cnt = rsm->r_act_rxt_cnt; 7633 nrsm->r_flags = rsm->r_flags; 7634 nrsm->r_dupack = rsm->r_dupack; 7635 nrsm->r_no_rtt_allowed = rsm->r_no_rtt_allowed; 7636 nrsm->r_rtr_bytes = 0; 7637 nrsm->r_fas = rsm->r_fas; 7638 nrsm->r_bas = rsm->r_bas; 7639 tqhash_update_end(rack->r_ctl.tqh, rsm, nrsm->r_start); 7640 nrsm->r_just_ret = rsm->r_just_ret; 7641 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 7642 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 7643 } 7644 /* Now if we have SYN flag we keep it on the left edge */ 7645 if (nrsm->r_flags & RACK_HAS_SYN) 7646 nrsm->r_flags &= ~RACK_HAS_SYN; 7647 /* Now if we have a FIN flag we keep it on the right edge */ 7648 if (rsm->r_flags & RACK_HAS_FIN) 7649 rsm->r_flags &= ~RACK_HAS_FIN; 7650 /* Push bit must go to the right edge as well */ 7651 if (rsm->r_flags & RACK_HAD_PUSH) 7652 rsm->r_flags &= ~RACK_HAD_PUSH; 7653 /* Clone over the state of the hw_tls flag */ 7654 nrsm->r_hw_tls = rsm->r_hw_tls; 7655 /* 7656 * Now we need to find nrsm's new location in the mbuf chain 7657 * we basically calculate a new offset, which is soff + 7658 * how much is left in original rsm. Then we walk out the mbuf 7659 * chain to find the righ position, it may be the same mbuf 7660 * or maybe not. 7661 */ 7662 KASSERT(((rsm->m != NULL) || 7663 (rsm->r_flags & (RACK_HAS_SYN|RACK_HAS_FIN))), 7664 ("rsm:%p nrsm:%p rack:%p -- rsm->m is NULL?", rsm, nrsm, rack)); 7665 if (rsm->m) 7666 rack_setup_offset_for_rsm(rack, rsm, nrsm); 7667 } 7668 7669 static struct rack_sendmap * 7670 rack_merge_rsm(struct tcp_rack *rack, 7671 struct rack_sendmap *l_rsm, 7672 struct rack_sendmap *r_rsm) 7673 { 7674 /* 7675 * We are merging two ack'd RSM's, 7676 * the l_rsm is on the left (lower seq 7677 * values) and the r_rsm is on the right 7678 * (higher seq value). The simplest way 7679 * to merge these is to move the right 7680 * one into the left. I don't think there 7681 * is any reason we need to try to find 7682 * the oldest (or last oldest retransmitted). 7683 */ 7684 rack_log_map_chg(rack->rc_tp, rack, NULL, 7685 l_rsm, r_rsm, MAP_MERGE, r_rsm->r_end, __LINE__); 7686 tqhash_update_end(rack->r_ctl.tqh, l_rsm, r_rsm->r_end); 7687 if (l_rsm->r_dupack < r_rsm->r_dupack) 7688 l_rsm->r_dupack = r_rsm->r_dupack; 7689 if (r_rsm->r_rtr_bytes) 7690 l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes; 7691 if (r_rsm->r_in_tmap) { 7692 /* This really should not happen */ 7693 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext); 7694 r_rsm->r_in_tmap = 0; 7695 } 7696 7697 /* Now the flags */ 7698 if (r_rsm->r_flags & RACK_HAS_FIN) 7699 l_rsm->r_flags |= RACK_HAS_FIN; 7700 if (r_rsm->r_flags & RACK_TLP) 7701 l_rsm->r_flags |= RACK_TLP; 7702 if (r_rsm->r_flags & RACK_RWND_COLLAPSED) 7703 l_rsm->r_flags |= RACK_RWND_COLLAPSED; 7704 if ((r_rsm->r_flags & RACK_APP_LIMITED) && 7705 ((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) { 7706 /* 7707 * If both are app-limited then let the 7708 * free lower the count. If right is app 7709 * limited and left is not, transfer. 7710 */ 7711 l_rsm->r_flags |= RACK_APP_LIMITED; 7712 r_rsm->r_flags &= ~RACK_APP_LIMITED; 7713 if (r_rsm == rack->r_ctl.rc_first_appl) 7714 rack->r_ctl.rc_first_appl = l_rsm; 7715 } 7716 tqhash_remove(rack->r_ctl.tqh, r_rsm, REMOVE_TYPE_MERGE); 7717 /* 7718 * We keep the largest value, which is the newest 7719 * send. 
We do this in case a segment that is
     * joined together and not part of a GP estimate
     * later gets expanded into the GP estimate.
     *
     * We prohibit the merging of unlike kinds i.e.
     * all pieces that are in the GP estimate can be
     * merged and all pieces that are not in a GP estimate
     * can be merged, but not dissimilar pieces. Combine
     * this with taking the highest here and we should
     * be ok unless of course the client reneges. Then
     * all bets are off.
     */
    if (l_rsm->r_tim_lastsent[(l_rsm->r_rtr_cnt-1)] <
        r_rsm->r_tim_lastsent[(r_rsm->r_rtr_cnt-1)]) {
        l_rsm->r_tim_lastsent[(l_rsm->r_rtr_cnt-1)] = r_rsm->r_tim_lastsent[(r_rsm->r_rtr_cnt-1)];
    }
    /*
     * When merging two RSM's we also need to consider the ack time and keep
     * the newest. If the ack gets merged into a measurement then that is the
     * one we will want to be using.
     */
    if (l_rsm->r_ack_arrival < r_rsm->r_ack_arrival)
        l_rsm->r_ack_arrival = r_rsm->r_ack_arrival;

    if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) {
        /* Transfer the split limit to the map we free */
        r_rsm->r_limit_type = l_rsm->r_limit_type;
        l_rsm->r_limit_type = 0;
    }
    rack_free(rack, r_rsm);
    l_rsm->r_flags |= RACK_MERGED;
    return (l_rsm);
}

/*
 * TLP Timer, here we simply set up what segment we want to
 * have the TLP expire on, the normal rack_output() will then
 * send it out.
 *
 * We return 1, saying don't proceed with rack_output only
 * when all timers have been stopped (destroyed PCB?).
 */
static int
rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t *doing_tlp)
{
    /*
     * Tail Loss Probe.
     */
    struct rack_sendmap *rsm = NULL;
    int insret __diagused;
    struct socket *so = tptosocket(tp);
    uint32_t amm;
    uint32_t out, avail;
    int collapsed_win = 0;

    if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) {
        /* It's not time yet */
        return (0);
    }
    if (ctf_progress_timeout_check(tp, true)) {
        rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__);
        return (-ETIMEDOUT);	/* tcp_drop() */
    }
    /*
     * A TLP timer has expired. We have been idle for 2 rtts. So we now
     * need to figure out how to force a full MSS segment out.
     */
    rack_log_to_event(rack, RACK_TO_FRM_TLP, NULL);
    rack->r_ctl.retran_during_recovery = 0;
    rack->r_might_revert = 0;
    rack->r_ctl.dsack_byte_cnt = 0;
    counter_u64_add(rack_tlp_tot, 1);
    if (rack->r_state && (rack->r_state != tp->t_state))
        rack_set_state(tp, rack);
    avail = sbavail(&so->so_snd);
    out = tp->snd_max - tp->snd_una;
    if ((out > tp->snd_wnd) || rack->rc_has_collapsed) {
        /* special case, we need a retransmission */
        collapsed_win = 1;
        goto need_retran;
    }
    if (rack->r_ctl.dsack_persist && (rack->r_ctl.rc_tlp_cnt_out >= 1)) {
        rack->r_ctl.dsack_persist--;
        if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) {
            rack->r_ctl.num_dsack = 0;
        }
        rack_log_dsack_event(rack, 1, __LINE__, 0, 0);
    }
    if ((tp->t_flags & TF_GPUTINPROG) &&
        (rack->r_ctl.rc_tlp_cnt_out == 1)) {
        /*
         * If this is the second TLP in a row
         * and we are doing a measurement,
         * it's time to abandon the measurement.
         * Something is likely broken on
         * the client's network and measuring a
         * broken network does us no good.
7816 */ 7817 tp->t_flags &= ~TF_GPUTINPROG; 7818 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 7819 rack->r_ctl.rc_gp_srtt /*flex1*/, 7820 tp->gput_seq, 7821 0, 0, 18, __LINE__, NULL, 0); 7822 } 7823 /* 7824 * Check our send oldest always settings, and if 7825 * there is an oldest to send jump to the need_retran. 7826 */ 7827 if (rack_always_send_oldest && (TAILQ_EMPTY(&rack->r_ctl.rc_tmap) == 0)) 7828 goto need_retran; 7829 7830 if (avail > out) { 7831 /* New data is available */ 7832 amm = avail - out; 7833 if (amm > ctf_fixed_maxseg(tp)) { 7834 amm = ctf_fixed_maxseg(tp); 7835 if ((amm + out) > tp->snd_wnd) { 7836 /* We are rwnd limited */ 7837 goto need_retran; 7838 } 7839 } else if (amm < ctf_fixed_maxseg(tp)) { 7840 /* not enough to fill a MTU */ 7841 goto need_retran; 7842 } 7843 if (IN_FASTRECOVERY(tp->t_flags)) { 7844 /* Unlikely */ 7845 if (rack->rack_no_prr == 0) { 7846 if (out + amm <= tp->snd_wnd) { 7847 rack->r_ctl.rc_prr_sndcnt = amm; 7848 rack->r_ctl.rc_tlp_new_data = amm; 7849 rack_log_to_prr(rack, 4, 0, __LINE__); 7850 } 7851 } else 7852 goto need_retran; 7853 } else { 7854 /* Set the send-new override */ 7855 if (out + amm <= tp->snd_wnd) 7856 rack->r_ctl.rc_tlp_new_data = amm; 7857 else 7858 goto need_retran; 7859 } 7860 rack->r_ctl.rc_tlpsend = NULL; 7861 counter_u64_add(rack_tlp_newdata, 1); 7862 goto send; 7863 } 7864 need_retran: 7865 /* 7866 * Ok we need to arrange the last un-acked segment to be re-sent, or 7867 * optionally the first un-acked segment. 7868 */ 7869 if (collapsed_win == 0) { 7870 if (rack_always_send_oldest) 7871 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 7872 else { 7873 rsm = tqhash_max(rack->r_ctl.tqh); 7874 if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { 7875 rsm = rack_find_high_nonack(rack, rsm); 7876 } 7877 } 7878 if (rsm == NULL) { 7879 #ifdef TCP_BLACKBOX 7880 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 7881 #endif 7882 goto out; 7883 } 7884 } else { 7885 /* 7886 * We had a collapsed window, lets find 7887 * the point before the collapse. 7888 */ 7889 if (SEQ_GT((rack->r_ctl.last_collapse_point - 1), rack->rc_tp->snd_una)) 7890 rsm = tqhash_find(rack->r_ctl.tqh, (rack->r_ctl.last_collapse_point - 1)); 7891 else { 7892 rsm = tqhash_min(rack->r_ctl.tqh); 7893 } 7894 if (rsm == NULL) { 7895 /* Huh */ 7896 goto out; 7897 } 7898 } 7899 if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) { 7900 /* 7901 * We need to split this the last segment in two. 7902 */ 7903 struct rack_sendmap *nrsm; 7904 7905 nrsm = rack_alloc_full_limit(rack); 7906 if (nrsm == NULL) { 7907 /* 7908 * No memory to split, we will just exit and punt 7909 * off to the RXT timer. 
7910 */ 7911 goto out; 7912 } 7913 rack_clone_rsm(rack, nrsm, rsm, 7914 (rsm->r_end - ctf_fixed_maxseg(tp))); 7915 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); 7916 #ifndef INVARIANTS 7917 (void)tqhash_insert(rack->r_ctl.tqh, nrsm); 7918 #else 7919 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { 7920 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p", 7921 nrsm, insret, rack, rsm); 7922 } 7923 #endif 7924 if (rsm->r_in_tmap) { 7925 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 7926 nrsm->r_in_tmap = 1; 7927 } 7928 rsm = nrsm; 7929 } 7930 rack->r_ctl.rc_tlpsend = rsm; 7931 send: 7932 /* Make sure output path knows we are doing a TLP */ 7933 *doing_tlp = 1; 7934 rack->r_timer_override = 1; 7935 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 7936 return (0); 7937 out: 7938 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 7939 return (0); 7940 } 7941 7942 /* 7943 * Delayed ack Timer, here we simply need to setup the 7944 * ACK_NOW flag and remove the DELACK flag. From there 7945 * the output routine will send the ack out. 7946 * 7947 * We only return 1, saying don't proceed, if all timers 7948 * are stopped (destroyed PCB?). 7949 */ 7950 static int 7951 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 7952 { 7953 7954 rack_log_to_event(rack, RACK_TO_FRM_DELACK, NULL); 7955 tp->t_flags &= ~TF_DELACK; 7956 tp->t_flags |= TF_ACKNOW; 7957 KMOD_TCPSTAT_INC(tcps_delack); 7958 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 7959 return (0); 7960 } 7961 7962 static inline int 7963 rack_send_ack_challange(struct tcp_rack *rack) 7964 { 7965 struct tcptemp *t_template; 7966 7967 t_template = tcpip_maketemplate(rack->rc_inp); 7968 if (t_template) { 7969 if (rack->forced_ack == 0) { 7970 rack->forced_ack = 1; 7971 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); 7972 } else { 7973 rack->probe_not_answered = 1; 7974 } 7975 tcp_respond(rack->rc_tp, t_template->tt_ipgen, 7976 &t_template->tt_t, (struct mbuf *)NULL, 7977 rack->rc_tp->rcv_nxt, rack->rc_tp->snd_una - 1, 0); 7978 free(t_template, M_TEMP); 7979 /* This does send an ack so kill any D-ack timer */ 7980 if (rack->rc_tp->t_flags & TF_DELACK) 7981 rack->rc_tp->t_flags &= ~TF_DELACK; 7982 return(1); 7983 } else 7984 return (0); 7985 7986 } 7987 7988 /* 7989 * Persists timer, here we simply send the 7990 * same thing as a keepalive will. 7991 * the one byte send. 7992 * 7993 * We only return 1, saying don't proceed, if all timers 7994 * are stopped (destroyed PCB?). 7995 */ 7996 static int 7997 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 7998 { 7999 int32_t retval = 1; 8000 8001 if (rack->rc_in_persist == 0) 8002 return (0); 8003 if (ctf_progress_timeout_check(tp, false)) { 8004 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 8005 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 8006 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends); 8007 return (-ETIMEDOUT); /* tcp_drop() */ 8008 } 8009 /* 8010 * Persistence timer into zero window. Force a byte to be output, if 8011 * possible. 8012 */ 8013 KMOD_TCPSTAT_INC(tcps_persisttimeo); 8014 /* 8015 * Hack: if the peer is dead/unreachable, we do not time out if the 8016 * window is closed. After a full backoff, drop the connection if 8017 * the idle time (no responses to probes) reaches the maximum 8018 * backoff that we would use if retransmitting. 
8019 */ 8020 if (tp->t_rxtshift >= V_tcp_retries && 8021 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 8022 TICKS_2_USEC(ticks - tp->t_rcvtime) >= RACK_REXMTVAL(tp) * tcp_totbackoff)) { 8023 KMOD_TCPSTAT_INC(tcps_persistdrop); 8024 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 8025 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends); 8026 retval = -ETIMEDOUT; /* tcp_drop() */ 8027 goto out; 8028 } 8029 if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) && 8030 tp->snd_una == tp->snd_max) 8031 rack_exit_persist(tp, rack, cts); 8032 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; 8033 /* 8034 * If the user has closed the socket then drop a persisting 8035 * connection after a much reduced timeout. 8036 */ 8037 if (tp->t_state > TCPS_CLOSE_WAIT && 8038 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 8039 KMOD_TCPSTAT_INC(tcps_persistdrop); 8040 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 8041 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends); 8042 retval = -ETIMEDOUT; /* tcp_drop() */ 8043 goto out; 8044 } 8045 if (rack_send_ack_challange(rack)) { 8046 /* only set it if we were answered */ 8047 if (rack->probe_not_answered) { 8048 counter_u64_add(rack_persists_loss, 1); 8049 rack->r_ctl.persist_lost_ends++; 8050 } 8051 counter_u64_add(rack_persists_sends, 1); 8052 counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1); 8053 } 8054 if (tp->t_rxtshift < V_tcp_retries) 8055 tp->t_rxtshift++; 8056 out: 8057 rack_log_to_event(rack, RACK_TO_FRM_PERSIST, NULL); 8058 rack_start_hpts_timer(rack, tp, cts, 8059 0, 0, 0); 8060 return (retval); 8061 } 8062 8063 /* 8064 * If a keepalive goes off, we had no other timers 8065 * happening. We always return 1 here since this 8066 * routine either drops the connection or sends 8067 * out a segment with respond. 8068 */ 8069 static int 8070 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 8071 { 8072 struct inpcb *inp = tptoinpcb(tp); 8073 8074 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; 8075 rack_log_to_event(rack, RACK_TO_FRM_KEEP, NULL); 8076 /* 8077 * Keep-alive timer went off; send something or drop connection if 8078 * idle for too long. 8079 */ 8080 KMOD_TCPSTAT_INC(tcps_keeptimeo); 8081 if (tp->t_state < TCPS_ESTABLISHED) 8082 goto dropit; 8083 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 8084 tp->t_state <= TCPS_CLOSING) { 8085 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 8086 goto dropit; 8087 /* 8088 * Send a packet designed to force a response if the peer is 8089 * up and reachable: either an ACK if the connection is 8090 * still alive, or an RST if the peer has closed the 8091 * connection due to timeout or reboot. Using sequence 8092 * number tp->snd_una-1 causes the transmitted zero-length 8093 * segment to lie outside the receive window; by the 8094 * protocol spec, this requires the correspondent TCP to 8095 * respond. 8096 */ 8097 KMOD_TCPSTAT_INC(tcps_keepprobe); 8098 rack_send_ack_challange(rack); 8099 } 8100 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 8101 return (1); 8102 dropit: 8103 KMOD_TCPSTAT_INC(tcps_keepdrops); 8104 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); 8105 return (-ETIMEDOUT); /* tcp_drop() */ 8106 } 8107 8108 /* 8109 * Retransmit helper function, clear up all the ack 8110 * flags and take care of important book keeping. 8111 */ 8112 static void 8113 rack_remxt_tmr(struct tcpcb *tp) 8114 { 8115 /* 8116 * The retransmit timer went off, all sack'd blocks must be 8117 * un-acked. 
8118 */ 8119 struct rack_sendmap *rsm, *trsm = NULL; 8120 struct tcp_rack *rack; 8121 8122 rack = (struct tcp_rack *)tp->t_fb_ptr; 8123 rack_timer_cancel(tp, rack, tcp_get_usecs(NULL), __LINE__); 8124 rack_log_to_event(rack, RACK_TO_FRM_TMR, NULL); 8125 rack->r_timer_override = 1; 8126 rack->r_ctl.rc_snd_max_at_rto = tp->snd_max; 8127 rack->r_ctl.rc_last_timeout_snduna = tp->snd_una; 8128 rack->r_late = 0; 8129 rack->r_early = 0; 8130 rack->r_ctl.rc_agg_delayed = 0; 8131 rack->r_ctl.rc_agg_early = 0; 8132 if (rack->r_state && (rack->r_state != tp->t_state)) 8133 rack_set_state(tp, rack); 8134 if (tp->t_rxtshift <= rack_rxt_scoreboard_clear_thresh) { 8135 /* 8136 * We do not clear the scoreboard until we have had 8137 * more than rack_rxt_scoreboard_clear_thresh time-outs. 8138 */ 8139 rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 8140 if (rack->r_ctl.rc_resend != NULL) 8141 rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT; 8142 8143 return; 8144 } 8145 /* 8146 * Ideally we would like to be able to 8147 * mark SACK-PASS on anything not acked here. 8148 * 8149 * However, if we do that we would burst out 8150 * all that data 1ms apart. This would be unwise, 8151 * so for now we will just let the normal rxt timer 8152 * and tlp timer take care of it. 8153 * 8154 * Also we really need to stick them back in sequence 8155 * order. This way we send in the proper order and any 8156 * sacks that come floating in will "re-ack" the data. 8157 * To do this we zap the tmap with an INIT and then 8158 * walk through and place every rsm in the tail queue 8159 * hash table back in its seq ordered place. 8160 */ 8161 TAILQ_INIT(&rack->r_ctl.rc_tmap); 8162 8163 TQHASH_FOREACH(rsm, rack->r_ctl.tqh) { 8164 rsm->r_dupack = 0; 8165 if (rack_verbose_logging) 8166 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 8167 /* We must re-add it back to the tlist */ 8168 if (trsm == NULL) { 8169 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 8170 } else { 8171 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); 8172 } 8173 rsm->r_in_tmap = 1; 8174 trsm = rsm; 8175 if (rsm->r_flags & RACK_ACKED) 8176 rsm->r_flags |= RACK_WAS_ACKED; 8177 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS | RACK_RWND_COLLAPSED | RACK_WAS_LOST); 8178 rsm->r_flags |= RACK_MUST_RXT; 8179 } 8180 /* zero the lost since it's all gone */ 8181 rack->r_ctl.rc_considered_lost = 0; 8182 /* Clear the count (we just un-acked them) */ 8183 rack->r_ctl.rc_sacked = 0; 8184 rack->r_ctl.rc_sacklast = NULL; 8185 /* Clear the tlp rtx mark */ 8186 rack->r_ctl.rc_resend = tqhash_min(rack->r_ctl.tqh); 8187 if (rack->r_ctl.rc_resend != NULL) 8188 rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT; 8189 rack->r_ctl.rc_prr_sndcnt = 0; 8190 rack_log_to_prr(rack, 6, 0, __LINE__); 8191 rack->r_ctl.rc_resend = tqhash_min(rack->r_ctl.tqh); 8192 if (rack->r_ctl.rc_resend != NULL) 8193 rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT; 8194 if (((tp->t_flags & TF_SACK_PERMIT) == 0) && 8195 ((tp->t_flags & TF_SENTFIN) == 0)) { 8196 /* 8197 * For non-sack customers new data 8198 * needs to go out as retransmits until 8199 * we retransmit up to snd_max. 
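 *
 * Illustration (hypothetical numbers): with 10 * MSS outstanding and
 * nothing SACKed, rc_out_at_rto starts at 10 * MSS; rack_update_rsm()
 * then walks it back down as each RACK_MUST_RXT piece is resent, and
 * r_must_retran is cleared once we have covered rc_snd_max_at_rto.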
8200 */ 8201 rack->r_must_retran = 1; 8202 rack->r_ctl.rc_out_at_rto = ctf_flight_size(rack->rc_tp, 8203 rack->r_ctl.rc_sacked); 8204 } 8205 } 8206 8207 static void 8208 rack_convert_rtts(struct tcpcb *tp) 8209 { 8210 tcp_change_time_units(tp, TCP_TMR_GRANULARITY_USEC); 8211 tp->t_rxtcur = RACK_REXMTVAL(tp); 8212 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 8213 tp->t_rxtcur += TICKS_2_USEC(tcp_rexmit_slop); 8214 } 8215 if (tp->t_rxtcur > rack_rto_max) { 8216 tp->t_rxtcur = rack_rto_max; 8217 } 8218 } 8219 8220 static void 8221 rack_cc_conn_init(struct tcpcb *tp) 8222 { 8223 struct tcp_rack *rack; 8224 uint32_t srtt; 8225 8226 rack = (struct tcp_rack *)tp->t_fb_ptr; 8227 srtt = tp->t_srtt; 8228 cc_conn_init(tp); 8229 /* 8230 * Now convert to rack's internal format, 8231 * if required. 8232 */ 8233 if ((srtt == 0) && (tp->t_srtt != 0)) 8234 rack_convert_rtts(tp); 8235 /* 8236 * We want a chance to stay in slowstart as 8237 * we create a connection. TCP spec says that 8238 * initially ssthresh is infinite. For our 8239 * purposes that is the snd_wnd. 8240 */ 8241 if (tp->snd_ssthresh < tp->snd_wnd) { 8242 tp->snd_ssthresh = tp->snd_wnd; 8243 } 8244 /* 8245 * We also want to assure a IW worth of 8246 * data can get inflight. 8247 */ 8248 if (rc_init_window(rack) < tp->snd_cwnd) 8249 tp->snd_cwnd = rc_init_window(rack); 8250 } 8251 8252 /* 8253 * Re-transmit timeout! If we drop the PCB we will return 1, otherwise 8254 * we will setup to retransmit the lowest seq number outstanding. 8255 */ 8256 static int 8257 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 8258 { 8259 struct inpcb *inp = tptoinpcb(tp); 8260 int32_t rexmt; 8261 int32_t retval = 0; 8262 bool isipv6; 8263 8264 if ((tp->t_flags & TF_GPUTINPROG) && 8265 (tp->t_rxtshift)) { 8266 /* 8267 * We have had a second timeout 8268 * measurements on successive rxt's are not profitable. 8269 * It is unlikely to be of any use (the network is 8270 * broken or the client went away). 8271 */ 8272 tp->t_flags &= ~TF_GPUTINPROG; 8273 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 8274 rack->r_ctl.rc_gp_srtt /*flex1*/, 8275 tp->gput_seq, 8276 0, 0, 18, __LINE__, NULL, 0); 8277 } 8278 if (ctf_progress_timeout_check(tp, false)) { 8279 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 8280 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 8281 return (-ETIMEDOUT); /* tcp_drop() */ 8282 } 8283 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; 8284 rack->r_ctl.retran_during_recovery = 0; 8285 rack->rc_ack_required = 1; 8286 rack->r_ctl.dsack_byte_cnt = 0; 8287 if (IN_RECOVERY(tp->t_flags) && 8288 (rack->rto_from_rec == 0)) { 8289 /* 8290 * Mark that we had a rto while in recovery 8291 * and save the ssthresh so if we go back 8292 * into recovery we will have a chance 8293 * to slowstart back to the level. 8294 */ 8295 rack->rto_from_rec = 1; 8296 rack->r_ctl.rto_ssthresh = tp->snd_ssthresh; 8297 } 8298 if (IN_FASTRECOVERY(tp->t_flags)) 8299 tp->t_flags |= TF_WASFRECOVERY; 8300 else 8301 tp->t_flags &= ~TF_WASFRECOVERY; 8302 if (IN_CONGRECOVERY(tp->t_flags)) 8303 tp->t_flags |= TF_WASCRECOVERY; 8304 else 8305 tp->t_flags &= ~TF_WASCRECOVERY; 8306 if (TCPS_HAVEESTABLISHED(tp->t_state) && 8307 (tp->snd_una == tp->snd_max)) { 8308 /* Nothing outstanding .. 
nothing to do */ 8309 return (0); 8310 } 8311 if (rack->r_ctl.dsack_persist) { 8312 rack->r_ctl.dsack_persist--; 8313 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) { 8314 rack->r_ctl.num_dsack = 0; 8315 } 8316 rack_log_dsack_event(rack, 1, __LINE__, 0, 0); 8317 } 8318 /* 8319 * Rack can only run one timer at a time, so we cannot 8320 * run a KEEPINIT (gating SYN sending) and a retransmit 8321 * timer for the SYN. So if we are in a front state and 8322 * have a KEEPINIT timer we need to check the first transmit 8323 * against now to see if we have exceeded the KEEPINIT time 8324 * (if one is set). 8325 */ 8326 if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) && 8327 (TP_KEEPINIT(tp) != 0)) { 8328 struct rack_sendmap *rsm; 8329 8330 rsm = tqhash_min(rack->r_ctl.tqh); 8331 if (rsm) { 8332 /* Ok we have something outstanding to test keepinit with */ 8333 if ((TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) && 8334 ((cts - (uint32_t)rsm->r_tim_lastsent[0]) >= TICKS_2_USEC(TP_KEEPINIT(tp)))) { 8335 /* We have exceeded the KEEPINIT time */ 8336 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); 8337 goto drop_it; 8338 } 8339 } 8340 } 8341 /* 8342 * Retransmission timer went off. Message has not been acked within 8343 * retransmit interval. Back off to a longer retransmit interval 8344 * and retransmit one segment. 8345 */ 8346 if ((rack->r_ctl.rc_resend == NULL) || 8347 ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) { 8348 /* 8349 * If the rwnd collapsed on 8350 * the one we are retransmitting 8351 * it does not count against the 8352 * rxt count. 8353 */ 8354 tp->t_rxtshift++; 8355 } 8356 rack_remxt_tmr(tp); 8357 if (tp->t_rxtshift > V_tcp_retries) { 8358 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 8359 drop_it: 8360 tp->t_rxtshift = V_tcp_retries; 8361 KMOD_TCPSTAT_INC(tcps_timeoutdrop); 8362 /* XXXGL: previously t_softerror was casted to uint16_t */ 8363 MPASS(tp->t_softerror >= 0); 8364 retval = tp->t_softerror ? -tp->t_softerror : -ETIMEDOUT; 8365 goto out; /* tcp_drop() */ 8366 } 8367 if (tp->t_state == TCPS_SYN_SENT) { 8368 /* 8369 * If the SYN was retransmitted, indicate CWND to be limited 8370 * to 1 segment in cc_conn_init(). 8371 */ 8372 tp->snd_cwnd = 1; 8373 } else if (tp->t_rxtshift == 1) { 8374 /* 8375 * first retransmit; record ssthresh and cwnd so they can be 8376 * recovered if this turns out to be a "bad" retransmit. A 8377 * retransmit is considered "bad" if an ACK for this segment 8378 * is received within RTT/2 interval; the assumption here is 8379 * that the ACK was already in flight. See "On Estimating 8380 * End-to-End Network Path Properties" by Allman and Paxson 8381 * for more details. 
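 *
 * Concretely (made-up numbers): with a smoothed RTT of 50 ms the code
 * below sets t_badrxtwin = ticks + USEC_2_TICKS(50000) / 2, so an ACK
 * covering this data within ~25 ms is taken as evidence the original
 * was not lost and the saved cwnd/ssthresh values can be restored.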
8382 */ 8383 tp->snd_cwnd_prev = tp->snd_cwnd; 8384 tp->snd_ssthresh_prev = tp->snd_ssthresh; 8385 tp->snd_recover_prev = tp->snd_recover; 8386 tp->t_badrxtwin = ticks + (USEC_2_TICKS(tp->t_srtt)/2); 8387 tp->t_flags |= TF_PREVVALID; 8388 } else if ((tp->t_flags & TF_RCVD_TSTMP) == 0) 8389 tp->t_flags &= ~TF_PREVVALID; 8390 KMOD_TCPSTAT_INC(tcps_rexmttimeo); 8391 if ((tp->t_state == TCPS_SYN_SENT) || 8392 (tp->t_state == TCPS_SYN_RECEIVED)) 8393 rexmt = RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]; 8394 else 8395 rexmt = max(rack_rto_min, (tp->t_srtt + (tp->t_rttvar << 2))) * tcp_backoff[tp->t_rxtshift]; 8396 8397 RACK_TCPT_RANGESET(tp->t_rxtcur, rexmt, 8398 max(rack_rto_min, rexmt), rack_rto_max, rack->r_ctl.timer_slop); 8399 /* 8400 * We enter the path for PLMTUD if connection is established or, if 8401 * connection is FIN_WAIT_1 status, reason for the last is that if 8402 * amount of data we send is very small, we could send it in couple 8403 * of packets and process straight to FIN. In that case we won't 8404 * catch ESTABLISHED state. 8405 */ 8406 #ifdef INET6 8407 isipv6 = (inp->inp_vflag & INP_IPV6) ? true : false; 8408 #else 8409 isipv6 = false; 8410 #endif 8411 if (((V_tcp_pmtud_blackhole_detect == 1) || 8412 (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) || 8413 (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) && 8414 ((tp->t_state == TCPS_ESTABLISHED) || 8415 (tp->t_state == TCPS_FIN_WAIT_1))) { 8416 /* 8417 * Idea here is that at each stage of mtu probe (usually, 8418 * 1448 -> 1188 -> 524) should be given 2 chances to recover 8419 * before further clamping down. 'tp->t_rxtshift % 2 == 0' 8420 * should take care of that. 8421 */ 8422 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == 8423 (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && 8424 (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && 8425 tp->t_rxtshift % 2 == 0)) { 8426 /* 8427 * Enter Path MTU Black-hole Detection mechanism: - 8428 * Disable Path MTU Discovery (IP "DF" bit). - 8429 * Reduce MTU to lower value than what we negotiated 8430 * with peer. 8431 */ 8432 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { 8433 /* Record that we may have found a black hole. */ 8434 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 8435 /* Keep track of previous MSS. */ 8436 tp->t_pmtud_saved_maxseg = tp->t_maxseg; 8437 } 8438 8439 /* 8440 * Reduce the MSS to blackhole value or to the 8441 * default in an attempt to retransmit. 8442 */ 8443 #ifdef INET6 8444 if (isipv6 && 8445 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { 8446 /* Use the sysctl tuneable blackhole MSS. */ 8447 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; 8448 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 8449 } else if (isipv6) { 8450 /* Use the default MSS. */ 8451 tp->t_maxseg = V_tcp_v6mssdflt; 8452 /* 8453 * Disable Path MTU Discovery when we switch 8454 * to minmss. 8455 */ 8456 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 8457 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 8458 } 8459 #endif 8460 #if defined(INET6) && defined(INET) 8461 else 8462 #endif 8463 #ifdef INET 8464 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { 8465 /* Use the sysctl tuneable blackhole MSS. */ 8466 tp->t_maxseg = V_tcp_pmtud_blackhole_mss; 8467 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 8468 } else { 8469 /* Use the default MSS. */ 8470 tp->t_maxseg = V_tcp_mssdflt; 8471 /* 8472 * Disable Path MTU Discovery when we switch 8473 * to minmss. 
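 *
 * (Taken together with the gate above, the clamp is attempted only at
 * t_rxtshift 2 and 4, the even values in [2, 6); once t_rxtshift
 * reaches 6 the else-branch further below restores
 * t_pmtud_saved_maxseg and re-enables PMTUD, treating this as not a
 * blackhole after all.)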
8474 */ 8475 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 8476 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 8477 } 8478 #endif 8479 } else { 8480 /* 8481 * If further retransmissions are still unsuccessful 8482 * with a lowered MTU, maybe this isn't a blackhole 8483 * and we restore the previous MSS and blackhole 8484 * detection flags. The limit '6' is determined by 8485 * giving each probe stage (1448, 1188, 524) 2 8486 * chances to recover. 8487 */ 8488 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 8489 (tp->t_rxtshift >= 6)) { 8490 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 8491 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 8492 tp->t_maxseg = tp->t_pmtud_saved_maxseg; 8493 if (tp->t_maxseg < V_tcp_mssdflt) { 8494 /* 8495 * The MSS is so small we should not 8496 * process incoming SACK's since we are 8497 * subject to attack in such a case. 8498 */ 8499 tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT; 8500 } else { 8501 tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT; 8502 } 8503 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed); 8504 } 8505 } 8506 } 8507 /* 8508 * Disable RFC1323 and SACK if we haven't got any response to 8509 * our third SYN to work-around some broken terminal servers 8510 * (most of which have hopefully been retired) that have bad VJ 8511 * header compression code which trashes TCP segments containing 8512 * unknown-to-them TCP options. 8513 */ 8514 if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && 8515 (tp->t_rxtshift == 3)) 8516 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); 8517 /* 8518 * If we backed off this far, our srtt estimate is probably bogus. 8519 * Clobber it so we'll take the next rtt measurement as our srtt; 8520 * move the current srtt into rttvar to keep the current retransmit 8521 * times until then. 8522 */ 8523 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 8524 #ifdef INET6 8525 if ((inp->inp_vflag & INP_IPV6) != 0) 8526 in6_losing(inp); 8527 else 8528 #endif 8529 in_losing(inp); 8530 tp->t_rttvar += tp->t_srtt; 8531 tp->t_srtt = 0; 8532 } 8533 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 8534 tp->snd_recover = tp->snd_max; 8535 tp->t_flags |= TF_ACKNOW; 8536 tp->t_rtttime = 0; 8537 rack_cong_signal(tp, CC_RTO, tp->snd_una, __LINE__); 8538 out: 8539 return (retval); 8540 } 8541 8542 static int 8543 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling, uint8_t *doing_tlp) 8544 { 8545 int32_t ret = 0; 8546 int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK); 8547 8548 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 8549 (tp->t_flags & TF_GPUTINPROG)) { 8550 /* 8551 * We have a goodput in progress 8552 * and we have entered a late state. 8553 * Do we have enough data in the sb 8554 * to handle the GPUT request? 8555 */ 8556 uint32_t bytes; 8557 8558 bytes = tp->gput_ack - tp->gput_seq; 8559 if (SEQ_GT(tp->gput_seq, tp->snd_una)) 8560 bytes += tp->gput_seq - tp->snd_una; 8561 if (bytes > sbavail(&tptosocket(tp)->so_snd)) { 8562 /* 8563 * There are not enough bytes in the socket 8564 * buffer that have been sent to cover this 8565 * measurement. Cancel it. 
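 *
 * Example (hypothetical sequence numbers): gput_seq = 2000,
 * gput_ack = 6000, snd_una = 1000 gives bytes = 4000 + 1000 = 5000;
 * if sbavail() on the send buffer is only 3000 the measurement can
 * never be satisfied, so it is abandoned here.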
8566 */ 8567 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 8568 rack->r_ctl.rc_gp_srtt /*flex1*/, 8569 tp->gput_seq, 8570 0, 0, 18, __LINE__, NULL, 0); 8571 tp->t_flags &= ~TF_GPUTINPROG; 8572 } 8573 } 8574 if (timers == 0) { 8575 return (0); 8576 } 8577 if (tp->t_state == TCPS_LISTEN) { 8578 /* no timers on listen sockets */ 8579 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) 8580 return (0); 8581 return (1); 8582 } 8583 if ((timers & PACE_TMR_RACK) && 8584 rack->rc_on_min_to) { 8585 /* 8586 * For the rack timer when we 8587 * are on a min-timeout (which means rrr_conf = 3) 8588 * we don't want to check the timer. It may 8589 * be going off for a pace and thats ok we 8590 * want to send the retransmit (if its ready). 8591 * 8592 * If its on a normal rack timer (non-min) then 8593 * we will check if its expired. 8594 */ 8595 goto skip_time_check; 8596 } 8597 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 8598 uint32_t left; 8599 8600 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 8601 ret = -1; 8602 rack_log_to_processing(rack, cts, ret, 0); 8603 return (0); 8604 } 8605 if (hpts_calling == 0) { 8606 /* 8607 * A user send or queued mbuf (sack) has called us? We 8608 * return 0 and let the pacing guards 8609 * deal with it if they should or 8610 * should not cause a send. 8611 */ 8612 ret = -2; 8613 rack_log_to_processing(rack, cts, ret, 0); 8614 return (0); 8615 } 8616 /* 8617 * Ok our timer went off early and we are not paced false 8618 * alarm, go back to sleep. We make sure we don't have 8619 * no-sack wakeup on since we no longer have a PKT_OUTPUT 8620 * flag in place. 8621 */ 8622 rack->rc_tp->t_flags2 &= ~TF2_DONT_SACK_QUEUE; 8623 ret = -3; 8624 left = rack->r_ctl.rc_timer_exp - cts; 8625 tcp_hpts_insert(tp, HPTS_MS_TO_SLOTS(left)); 8626 rack_log_to_processing(rack, cts, ret, left); 8627 return (1); 8628 } 8629 skip_time_check: 8630 rack->rc_tmr_stopped = 0; 8631 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; 8632 if (timers & PACE_TMR_DELACK) { 8633 ret = rack_timeout_delack(tp, rack, cts); 8634 } else if (timers & PACE_TMR_RACK) { 8635 rack->r_ctl.rc_tlp_rxt_last_time = cts; 8636 rack->r_fast_output = 0; 8637 ret = rack_timeout_rack(tp, rack, cts); 8638 } else if (timers & PACE_TMR_TLP) { 8639 rack->r_ctl.rc_tlp_rxt_last_time = cts; 8640 ret = rack_timeout_tlp(tp, rack, cts, doing_tlp); 8641 } else if (timers & PACE_TMR_RXT) { 8642 rack->r_ctl.rc_tlp_rxt_last_time = cts; 8643 rack->r_fast_output = 0; 8644 ret = rack_timeout_rxt(tp, rack, cts); 8645 } else if (timers & PACE_TMR_PERSIT) { 8646 ret = rack_timeout_persist(tp, rack, cts); 8647 } else if (timers & PACE_TMR_KEEP) { 8648 ret = rack_timeout_keepalive(tp, rack, cts); 8649 } 8650 rack_log_to_processing(rack, cts, ret, timers); 8651 return (ret); 8652 } 8653 8654 static void 8655 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line) 8656 { 8657 struct timeval tv; 8658 uint32_t us_cts, flags_on_entry; 8659 uint8_t hpts_removed = 0; 8660 8661 flags_on_entry = rack->r_ctl.rc_hpts_flags; 8662 us_cts = tcp_get_usecs(&tv); 8663 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 8664 ((TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) || 8665 ((tp->snd_max - tp->snd_una) == 0))) { 8666 tcp_hpts_remove(rack->rc_tp); 8667 hpts_removed = 1; 8668 /* If we were not delayed cancel out the flag. 
*/
8669 if ((tp->snd_max - tp->snd_una) == 0)
8670 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
8671 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry);
8672 }
8673 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
8674 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
8675 if (tcp_in_hpts(rack->rc_tp) &&
8676 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) {
8677 /*
8678 * Canceling timers when we have no output being
8679 * paced. We also must remove ourselves from the
8680 * hpts.
8681 */
8682 tcp_hpts_remove(rack->rc_tp);
8683 hpts_removed = 1;
8684 }
8685 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK);
8686 }
8687 if (hpts_removed == 0)
8688 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry);
8689 }
8690
8691 static int
8692 rack_stopall(struct tcpcb *tp)
8693 {
8694 struct tcp_rack *rack;
8695
8696 rack = (struct tcp_rack *)tp->t_fb_ptr;
8697 rack->t_timers_stopped = 1;
8698
8699 tcp_hpts_remove(tp);
8700
8701 return (0);
8702 }
8703
8704 static void
8705 rack_stop_all_timers(struct tcpcb *tp, struct tcp_rack *rack)
8706 {
8707 /*
8708 * Assure no timers are running.
8709 */
8710 if (tcp_timer_active(tp, TT_PERSIST)) {
8711 /* We are entering persists, set the flag appropriately */
8712 rack->rc_in_persist = 1;
8713 }
8714 if (tcp_in_hpts(rack->rc_tp)) {
8715 tcp_hpts_remove(rack->rc_tp);
8716 }
8717 }
8718
8719 /*
8720 * We maintain an array of 16 (RETRAN_CNT_SIZE) entries. This
8721 * array is zeroed at the start of recovery. Each time a segment
8722 * is retransmitted, we translate that into a number of packets
8723 * (based on segsiz) and, based on how many times it has been
8724 * retransmitted, increment by that number of packets the counter
8725 * that represents "retransmitted N times". Index 0 is retransmitted
8726 * 1 time, index 1 is retransmitted 2 times etc.
8727 *
8728 * So for example when we send a 4344 byte transmission with a 1448
8729 * byte segsize, and it is the third time we have retransmitted this
8730 * segment, we would add to rc_cnt_of_retran[2] the value of
8731 * 3. That represents 3 MSS retransmitted 3 times (the index is
8732 * the number of times retransmitted minus 1).
8733 */
8734 static void
8735 rack_peg_rxt(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t segsiz)
8736 {
8737 int idx;
8738 uint32_t peg;
8739
8740 peg = ((rsm->r_end - rsm->r_start) + segsiz) - 1;
8741 peg /= segsiz;
8742 idx = rsm->r_act_rxt_cnt - 1;
8743 if (idx >= RETRAN_CNT_SIZE)
8744 idx = RETRAN_CNT_SIZE - 1;
8745 /* Max of a uint16_t retransmits in a bucket */
8746 if ((rack->r_ctl.rc_cnt_of_retran[idx] + peg) < 0xffff)
8747 rack->r_ctl.rc_cnt_of_retran[idx] += peg;
8748 else
8749 rack->r_ctl.rc_cnt_of_retran[idx] = 0xffff;
8750 }
8751
8752 /*
8753 * We maintain an array of 16 (RETRAN_CNT_SIZE) entries. This
8754 * array is zeroed at the start of recovery. Each time a segment
8755 * is retransmitted, we translate that into a number of packets
8756 * (based on segsiz) and, based on how many times it has been
8757 * retransmitted, increment by that number of packets the counter
8758 * that represents "retransmitted N times". Index 0 is retransmitted
8759 * 1 time, index 1 is retransmitted 2 times etc.
8760 *
8761 * The rack_unpeg_rxt is used when we go to retransmit a segment
8762 * again.
Basically, if the segment is now being retransmitted for, say, the
8763 * 3rd time (as in the example in the comment above
8764 * rack_peg_rxt()), then prior to calling rack_peg_rxt() and
8765 * incrementing r_act_rxt_cnt we would have called rack_unpeg_rxt(),
8766 * which subtracts back the previous add for its last rxt (in this
8767 * example r_act_rxt_cnt would have been 2 for 2 retransmissions).
8768 * So we would have subtracted 3 from rc_cnt_of_retran[1] to remove
8769 * those 3 segments. You will see this in the rack_update_rsm()
8770 * below where we do:
8771 * if (rsm->r_act_rxt_cnt > 0) {
8772 * rack_unpeg_rxt(rack, rsm, segsiz);
8773 * }
8774 * rsm->r_act_rxt_cnt++;
8775 * rack_peg_rxt(rack, rsm, segsiz);
8776 *
8777 * This effectively moves the count from rc_cnt_of_retran[1] to
8778 * rc_cnt_of_retran[2].
8779 */
8780 static void
8781 rack_unpeg_rxt(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t segsiz)
8782 {
8783 int idx;
8784 uint32_t peg;
8785
8786 idx = rsm->r_act_rxt_cnt - 1;
8787 if (idx >= RETRAN_CNT_SIZE)
8788 idx = RETRAN_CNT_SIZE - 1;
8789 peg = ((rsm->r_end - rsm->r_start) + segsiz) - 1;
8790 peg /= segsiz;
8791 if (peg < rack->r_ctl.rc_cnt_of_retran[idx])
8792 rack->r_ctl.rc_cnt_of_retran[idx] -= peg;
8793 else {
8794 /* TSNH */
8795 rack->r_ctl.rc_cnt_of_retran[idx] = 0;
8796 }
8797 }
8798
8799 static void
8800 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
8801 struct rack_sendmap *rsm, uint64_t ts, uint32_t add_flag, int segsiz)
8802 {
8803 int32_t idx;
8804
8805 rsm->r_rtr_cnt++;
8806 if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) {
8807 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS;
8808 rsm->r_flags |= RACK_OVERMAX;
8809 }
8810 if (rsm->r_act_rxt_cnt > 0) {
8811 /* Drop the count back for this, it is being retransmitted again */
8812 rack_unpeg_rxt(rack, rsm, segsiz);
8813 }
8814 rsm->r_act_rxt_cnt++;
8815 /* Peg the count/index */
8816 rack_peg_rxt(rack, rsm, segsiz);
8817 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
8818 rsm->r_dupack = 0;
8819 if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & RACK_TLP) == 0)) {
8820 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start);
8821 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start);
8822 }
8823 if (rsm->r_flags & RACK_WAS_LOST) {
8824 /*
8825 * We retransmitted it, putting it back in flight;
8826 * remove the lost designation and reduce the
8827 * bytes considered lost.
8828 */
8829 rsm->r_flags &= ~RACK_WAS_LOST;
8830 KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)),
8831 ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack));
8832 if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start))
8833 rack->r_ctl.rc_considered_lost -= rsm->r_end - rsm->r_start;
8834 else
8835 rack->r_ctl.rc_considered_lost = 0;
8836 }
8837 idx = rsm->r_rtr_cnt - 1;
8838 rsm->r_tim_lastsent[idx] = ts;
8839 /*
8840 * Here we don't add in the len of send, since it is already
8841 * in snd_una <-> snd_max.
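 *
 * (Contrast this with the new-send path in rack_log_output(), where
 * the length being sent is added on top of ctf_flight_size() because
 * snd_max has not yet advanced to cover it.)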
8842 */ 8843 rsm->r_fas = ctf_flight_size(rack->rc_tp, 8844 rack->r_ctl.rc_sacked); 8845 if (rsm->r_flags & RACK_ACKED) { 8846 /* Problably MTU discovery messing with us */ 8847 rsm->r_flags &= ~RACK_ACKED; 8848 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 8849 } 8850 if (rsm->r_in_tmap) { 8851 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 8852 rsm->r_in_tmap = 0; 8853 } 8854 /* Lets make sure it really is in or not the GP window */ 8855 rack_mark_in_gp_win(tp, rsm); 8856 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 8857 rsm->r_in_tmap = 1; 8858 rsm->r_bas = (uint8_t)(((rsm->r_end - rsm->r_start) + segsiz - 1) / segsiz); 8859 /* Take off the must retransmit flag, if its on */ 8860 if (rsm->r_flags & RACK_MUST_RXT) { 8861 if (rack->r_must_retran) 8862 rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start); 8863 if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) { 8864 /* 8865 * We have retransmitted all we need. Clear 8866 * any must retransmit flags. 8867 */ 8868 rack->r_must_retran = 0; 8869 rack->r_ctl.rc_out_at_rto = 0; 8870 } 8871 rsm->r_flags &= ~RACK_MUST_RXT; 8872 } 8873 /* Remove any collapsed flag */ 8874 rsm->r_flags &= ~RACK_RWND_COLLAPSED; 8875 if (rsm->r_flags & RACK_SACK_PASSED) { 8876 /* We have retransmitted due to the SACK pass */ 8877 rsm->r_flags &= ~RACK_SACK_PASSED; 8878 rsm->r_flags |= RACK_WAS_SACKPASS; 8879 } 8880 } 8881 8882 static uint32_t 8883 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 8884 struct rack_sendmap *rsm, uint64_t ts, int32_t *lenp, uint32_t add_flag, int segsiz) 8885 { 8886 /* 8887 * We (re-)transmitted starting at rsm->r_start for some length 8888 * (possibly less than r_end. 8889 */ 8890 struct rack_sendmap *nrsm; 8891 int insret __diagused; 8892 uint32_t c_end; 8893 int32_t len; 8894 8895 len = *lenp; 8896 c_end = rsm->r_start + len; 8897 if (SEQ_GEQ(c_end, rsm->r_end)) { 8898 /* 8899 * We retransmitted the whole piece or more than the whole 8900 * slopping into the next rsm. 8901 */ 8902 rack_update_rsm(tp, rack, rsm, ts, add_flag, segsiz); 8903 if (c_end == rsm->r_end) { 8904 *lenp = 0; 8905 return (0); 8906 } else { 8907 int32_t act_len; 8908 8909 /* Hangs over the end return whats left */ 8910 act_len = rsm->r_end - rsm->r_start; 8911 *lenp = (len - act_len); 8912 return (rsm->r_end); 8913 } 8914 /* We don't get out of this block. */ 8915 } 8916 /* 8917 * Here we retransmitted less than the whole thing which means we 8918 * have to split this into what was transmitted and what was not. 8919 */ 8920 nrsm = rack_alloc_full_limit(rack); 8921 if (nrsm == NULL) { 8922 /* 8923 * We can't get memory, so lets not proceed. 8924 */ 8925 *lenp = 0; 8926 return (0); 8927 } 8928 /* 8929 * So here we are going to take the original rsm and make it what we 8930 * retransmitted. nrsm will be the tail portion we did not 8931 * retransmit. For example say the chunk was 1, 11 (10 bytes). And 8932 * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to 8933 * 1, 6 and the new piece will be 6, 11. 
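 *
 * In that example c_end == 6: rack_clone_rsm() below leaves nrsm
 * covering [6, 11) and rsm trimmed to [1, 6), and only rsm is then
 * marked as retransmitted via rack_update_rsm().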
8934 */ 8935 rack_clone_rsm(rack, nrsm, rsm, c_end); 8936 nrsm->r_dupack = 0; 8937 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); 8938 #ifndef INVARIANTS 8939 (void)tqhash_insert(rack->r_ctl.tqh, nrsm); 8940 #else 8941 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { 8942 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p", 8943 nrsm, insret, rack, rsm); 8944 } 8945 #endif 8946 if (rsm->r_in_tmap) { 8947 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 8948 nrsm->r_in_tmap = 1; 8949 } 8950 rsm->r_flags &= (~RACK_HAS_FIN); 8951 rack_update_rsm(tp, rack, rsm, ts, add_flag, segsiz); 8952 /* Log a split of rsm into rsm and nrsm */ 8953 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); 8954 *lenp = 0; 8955 return (0); 8956 } 8957 8958 static void 8959 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 8960 uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t cts, 8961 struct rack_sendmap *hintrsm, uint32_t add_flag, struct mbuf *s_mb, 8962 uint32_t s_moff, int hw_tls, int segsiz) 8963 { 8964 struct tcp_rack *rack; 8965 struct rack_sendmap *rsm, *nrsm; 8966 int insret __diagused; 8967 8968 register uint32_t snd_max, snd_una; 8969 8970 /* 8971 * Add to the RACK log of packets in flight or retransmitted. If 8972 * there is a TS option we will use the TS echoed, if not we will 8973 * grab a TS. 8974 * 8975 * Retransmissions will increment the count and move the ts to its 8976 * proper place. Note that if options do not include TS's then we 8977 * won't be able to effectively use the ACK for an RTT on a retran. 8978 * 8979 * Notes about r_start and r_end. Lets consider a send starting at 8980 * sequence 1 for 10 bytes. In such an example the r_start would be 8981 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. 8982 * This means that r_end is actually the first sequence for the next 8983 * slot (11). 8984 * 8985 */ 8986 /* 8987 * If err is set what do we do XXXrrs? should we not add the thing? 8988 * -- i.e. return if err != 0 or should we pretend we sent it? -- 8989 * i.e. proceed with add ** do this for now. 8990 */ 8991 INP_WLOCK_ASSERT(tptoinpcb(tp)); 8992 if (err) 8993 /* 8994 * We don't log errors -- we could but snd_max does not 8995 * advance in this case either. 8996 */ 8997 return; 8998 8999 if (th_flags & TH_RST) { 9000 /* 9001 * We don't log resets and we return immediately from 9002 * sending 9003 */ 9004 return; 9005 } 9006 rack = (struct tcp_rack *)tp->t_fb_ptr; 9007 snd_una = tp->snd_una; 9008 snd_max = tp->snd_max; 9009 if (th_flags & (TH_SYN | TH_FIN)) { 9010 /* 9011 * The call to rack_log_output is made before bumping 9012 * snd_max. This means we can record one extra byte on a SYN 9013 * or FIN if seq_out is adding more on and a FIN is present 9014 * (and we are not resending). 9015 */ 9016 if ((th_flags & TH_SYN) && (seq_out == tp->iss)) 9017 len++; 9018 if (th_flags & TH_FIN) 9019 len++; 9020 } 9021 if (SEQ_LEQ((seq_out + len), snd_una)) { 9022 /* Are sending an old segment to induce an ack (keep-alive)? */ 9023 return; 9024 } 9025 if (SEQ_LT(seq_out, snd_una)) { 9026 /* huh? should we panic? */ 9027 uint32_t end; 9028 9029 end = seq_out + len; 9030 seq_out = snd_una; 9031 if (SEQ_GEQ(end, seq_out)) 9032 len = end - seq_out; 9033 else 9034 len = 0; 9035 } 9036 if (len == 0) { 9037 /* We don't log zero window probes */ 9038 return; 9039 } 9040 if (IN_FASTRECOVERY(tp->t_flags)) { 9041 rack->r_ctl.rc_prr_out += len; 9042 } 9043 /* First question is it a retransmission or new? 
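 *
 * Illustration (hypothetical numbers): with snd_una = 100 and
 * snd_max = 200, a send starting at seq_out == 200 is new data and
 * gets a fresh rsm below, while seq_out == 150 must already be
 * covered by an existing rsm and is handled as a retransmission.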
*/ 9044 if (seq_out == snd_max) { 9045 /* Its new */ 9046 rack_chk_req_and_hybrid_on_out(rack, seq_out, len, cts); 9047 again: 9048 rsm = rack_alloc(rack); 9049 if (rsm == NULL) { 9050 /* 9051 * Hmm out of memory and the tcb got destroyed while 9052 * we tried to wait. 9053 */ 9054 return; 9055 } 9056 if (th_flags & TH_FIN) { 9057 rsm->r_flags = RACK_HAS_FIN|add_flag; 9058 } else { 9059 rsm->r_flags = add_flag; 9060 } 9061 if (hw_tls) 9062 rsm->r_hw_tls = 1; 9063 rsm->r_tim_lastsent[0] = cts; 9064 rsm->r_rtr_cnt = 1; 9065 rsm->r_act_rxt_cnt = 0; 9066 rsm->r_rtr_bytes = 0; 9067 if (th_flags & TH_SYN) { 9068 /* The data space is one beyond snd_una */ 9069 rsm->r_flags |= RACK_HAS_SYN; 9070 } 9071 rsm->r_start = seq_out; 9072 rsm->r_end = rsm->r_start + len; 9073 rack_mark_in_gp_win(tp, rsm); 9074 rsm->r_dupack = 0; 9075 /* 9076 * save off the mbuf location that 9077 * sndmbuf_noadv returned (which is 9078 * where we started copying from).. 9079 */ 9080 rsm->m = s_mb; 9081 rsm->soff = s_moff; 9082 /* 9083 * Here we do add in the len of send, since its not yet 9084 * reflected in in snduna <->snd_max 9085 */ 9086 rsm->r_fas = (ctf_flight_size(rack->rc_tp, 9087 rack->r_ctl.rc_sacked) + 9088 (rsm->r_end - rsm->r_start)); 9089 if ((rack->rc_initial_ss_comp == 0) && 9090 (rack->r_ctl.ss_hi_fs < rsm->r_fas)) { 9091 rack->r_ctl.ss_hi_fs = rsm->r_fas; 9092 } 9093 /* rsm->m will be NULL if RACK_HAS_SYN or RACK_HAS_FIN is set */ 9094 if (rsm->m) { 9095 if (rsm->m->m_len <= rsm->soff) { 9096 /* 9097 * XXXrrs Question, will this happen? 9098 * 9099 * If sbsndptr is set at the correct place 9100 * then s_moff should always be somewhere 9101 * within rsm->m. But if the sbsndptr was 9102 * off then that won't be true. If it occurs 9103 * we need to walkout to the correct location. 9104 */ 9105 struct mbuf *lm; 9106 9107 lm = rsm->m; 9108 while (lm->m_len <= rsm->soff) { 9109 rsm->soff -= lm->m_len; 9110 lm = lm->m_next; 9111 KASSERT(lm != NULL, ("%s rack:%p lm goes null orig_off:%u origmb:%p rsm->soff:%u", 9112 __func__, rack, s_moff, s_mb, rsm->soff)); 9113 } 9114 rsm->m = lm; 9115 } 9116 rsm->orig_m_len = rsm->m->m_len; 9117 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 9118 } else { 9119 rsm->orig_m_len = 0; 9120 rsm->orig_t_space = 0; 9121 } 9122 rsm->r_bas = (uint8_t)((len + segsiz - 1) / segsiz); 9123 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 9124 /* Log a new rsm */ 9125 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_NEW, 0, __LINE__); 9126 #ifndef INVARIANTS 9127 (void)tqhash_insert(rack->r_ctl.tqh, rsm); 9128 #else 9129 if ((insret = tqhash_insert(rack->r_ctl.tqh, rsm)) != 0) { 9130 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p", 9131 nrsm, insret, rack, rsm); 9132 } 9133 #endif 9134 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 9135 rsm->r_in_tmap = 1; 9136 if (rsm->r_flags & RACK_IS_PCM) { 9137 rack->r_ctl.pcm_i.send_time = cts; 9138 rack->r_ctl.pcm_i.eseq = rsm->r_end; 9139 /* First time through we set the start too */ 9140 if (rack->pcm_in_progress == 0) 9141 rack->r_ctl.pcm_i.sseq = rsm->r_start; 9142 } 9143 /* 9144 * Special case detection, is there just a single 9145 * packet outstanding when we are not in recovery? 9146 * 9147 * If this is true mark it so. 
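 *
 * (The r_one_out_nr mark is consulted later in tcp_rack_xmit_timer();
 * an RTT sample taken off such a lone outstanding send is given zero
 * confidence for buffer-level determination.)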
9148 */ 9149 if ((IN_FASTRECOVERY(tp->t_flags) == 0) && 9150 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) == ctf_fixed_maxseg(tp))) { 9151 struct rack_sendmap *prsm; 9152 9153 prsm = tqhash_prev(rack->r_ctl.tqh, rsm); 9154 if (prsm) 9155 prsm->r_one_out_nr = 1; 9156 } 9157 return; 9158 } 9159 /* 9160 * If we reach here its a retransmission and we need to find it. 9161 */ 9162 more: 9163 if (hintrsm && (hintrsm->r_start == seq_out)) { 9164 rsm = hintrsm; 9165 hintrsm = NULL; 9166 } else { 9167 /* No hints sorry */ 9168 rsm = NULL; 9169 } 9170 if ((rsm) && (rsm->r_start == seq_out)) { 9171 seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag, segsiz); 9172 if (len == 0) { 9173 return; 9174 } else { 9175 goto more; 9176 } 9177 } 9178 /* Ok it was not the last pointer go through it the hard way. */ 9179 refind: 9180 rsm = tqhash_find(rack->r_ctl.tqh, seq_out); 9181 if (rsm) { 9182 if (rsm->r_start == seq_out) { 9183 seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag, segsiz); 9184 if (len == 0) { 9185 return; 9186 } else { 9187 goto refind; 9188 } 9189 } 9190 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { 9191 /* Transmitted within this piece */ 9192 /* 9193 * Ok we must split off the front and then let the 9194 * update do the rest 9195 */ 9196 nrsm = rack_alloc_full_limit(rack); 9197 if (nrsm == NULL) { 9198 rack_update_rsm(tp, rack, rsm, cts, add_flag, segsiz); 9199 return; 9200 } 9201 /* 9202 * copy rsm to nrsm and then trim the front of rsm 9203 * to not include this part. 9204 */ 9205 rack_clone_rsm(rack, nrsm, rsm, seq_out); 9206 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); 9207 #ifndef INVARIANTS 9208 (void)tqhash_insert(rack->r_ctl.tqh, nrsm); 9209 #else 9210 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { 9211 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p", 9212 nrsm, insret, rack, rsm); 9213 } 9214 #endif 9215 if (rsm->r_in_tmap) { 9216 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 9217 nrsm->r_in_tmap = 1; 9218 } 9219 rsm->r_flags &= (~RACK_HAS_FIN); 9220 seq_out = rack_update_entry(tp, rack, nrsm, cts, &len, add_flag, segsiz); 9221 if (len == 0) { 9222 return; 9223 } else if (len > 0) 9224 goto refind; 9225 } 9226 } 9227 /* 9228 * Hmm not found in map did they retransmit both old and on into the 9229 * new? 9230 */ 9231 if (seq_out == tp->snd_max) { 9232 goto again; 9233 } else if (SEQ_LT(seq_out, tp->snd_max)) { 9234 #ifdef INVARIANTS 9235 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", 9236 seq_out, len, tp->snd_una, tp->snd_max); 9237 printf("Starting Dump of all rack entries\n"); 9238 TQHASH_FOREACH(rsm, rack->r_ctl.tqh) { 9239 printf("rsm:%p start:%u end:%u\n", 9240 rsm, rsm->r_start, rsm->r_end); 9241 } 9242 printf("Dump complete\n"); 9243 panic("seq_out not found rack:%p tp:%p", 9244 rack, tp); 9245 #endif 9246 } else { 9247 #ifdef INVARIANTS 9248 /* 9249 * Hmm beyond sndmax? (only if we are using the new rtt-pack 9250 * flag) 9251 */ 9252 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", 9253 seq_out, len, tp->snd_max, tp); 9254 #endif 9255 } 9256 } 9257 9258 /* 9259 * Record one of the RTT updates from an ack into 9260 * our sample structure. 
9261 */ 9262 9263 static void 9264 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, uint32_t len, uint32_t us_rtt, 9265 int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt) 9266 { 9267 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 9268 (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) { 9269 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt; 9270 } 9271 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 9272 (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) { 9273 rack->r_ctl.rack_rs.rs_rtt_highest = rtt; 9274 } 9275 if (rack->rc_tp->t_flags & TF_GPUTINPROG) { 9276 if (us_rtt < rack->r_ctl.rc_gp_lowrtt) 9277 rack->r_ctl.rc_gp_lowrtt = us_rtt; 9278 if (rack->rc_tp->snd_wnd > rack->r_ctl.rc_gp_high_rwnd) 9279 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 9280 } 9281 if ((confidence == 1) && 9282 ((rsm == NULL) || 9283 (rsm->r_just_ret) || 9284 (rsm->r_one_out_nr && 9285 len < (ctf_fixed_maxseg(rack->rc_tp) * 2)))) { 9286 /* 9287 * If the rsm had a just return 9288 * hit it then we can't trust the 9289 * rtt measurement for buffer deterimination 9290 * Note that a confidence of 2, indicates 9291 * SACK'd which overrides the r_just_ret or 9292 * the r_one_out_nr. If it was a CUM-ACK and 9293 * we had only two outstanding, but get an 9294 * ack for only 1. Then that also lowers our 9295 * confidence. 9296 */ 9297 confidence = 0; 9298 } 9299 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 9300 (rack->r_ctl.rack_rs.rs_us_rtt > us_rtt)) { 9301 if (rack->r_ctl.rack_rs.confidence == 0) { 9302 /* 9303 * We take anything with no current confidence 9304 * saved. 9305 */ 9306 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; 9307 rack->r_ctl.rack_rs.confidence = confidence; 9308 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; 9309 } else if (confidence != 0) { 9310 /* 9311 * Once we have a confident number, 9312 * we can update it with a smaller 9313 * value since this confident number 9314 * may include the DSACK time until 9315 * the next segment (the second one) arrived. 9316 */ 9317 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; 9318 rack->r_ctl.rack_rs.confidence = confidence; 9319 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; 9320 } 9321 } 9322 rack_log_rtt_upd(rack->rc_tp, rack, us_rtt, len, rsm, confidence); 9323 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID; 9324 rack->r_ctl.rack_rs.rs_rtt_tot += rtt; 9325 rack->r_ctl.rack_rs.rs_rtt_cnt++; 9326 } 9327 9328 /* 9329 * Collect new round-trip time estimate 9330 * and update averages and current timeout. 
9331 */ 9332 static void 9333 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) 9334 { 9335 int32_t delta; 9336 int32_t rtt; 9337 9338 if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) 9339 /* No valid sample */ 9340 return; 9341 if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) { 9342 /* We are to use the lowest RTT seen in a single ack */ 9343 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; 9344 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) { 9345 /* We are to use the highest RTT seen in a single ack */ 9346 rtt = rack->r_ctl.rack_rs.rs_rtt_highest; 9347 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) { 9348 /* We are to use the average RTT seen in a single ack */ 9349 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot / 9350 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt); 9351 } else { 9352 #ifdef INVARIANTS 9353 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method); 9354 #endif 9355 return; 9356 } 9357 if (rtt == 0) 9358 rtt = 1; 9359 if (rack->rc_gp_rtt_set == 0) { 9360 /* 9361 * With no RTT we have to accept 9362 * even one we are not confident of. 9363 */ 9364 rack->r_ctl.rc_gp_srtt = rack->r_ctl.rack_rs.rs_us_rtt; 9365 rack->rc_gp_rtt_set = 1; 9366 } else if (rack->r_ctl.rack_rs.confidence) { 9367 /* update the running gp srtt */ 9368 rack->r_ctl.rc_gp_srtt -= (rack->r_ctl.rc_gp_srtt/8); 9369 rack->r_ctl.rc_gp_srtt += rack->r_ctl.rack_rs.rs_us_rtt / 8; 9370 } 9371 if (rack->r_ctl.rack_rs.confidence) { 9372 /* 9373 * record the low and high for highly buffered path computation, 9374 * we only do this if we are confident (not a retransmission). 9375 */ 9376 if (rack->r_ctl.rc_highest_us_rtt < rack->r_ctl.rack_rs.rs_us_rtt) { 9377 rack->r_ctl.rc_highest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 9378 } 9379 if (rack->rc_highly_buffered == 0) { 9380 /* 9381 * Currently once we declare a path has 9382 * highly buffered there is no going 9383 * back, which may be a problem... 9384 */ 9385 if ((rack->r_ctl.rc_highest_us_rtt / rack->r_ctl.rc_lowest_us_rtt) > rack_hbp_thresh) { 9386 rack_log_rtt_shrinks(rack, rack->r_ctl.rack_rs.rs_us_rtt, 9387 rack->r_ctl.rc_highest_us_rtt, 9388 rack->r_ctl.rc_lowest_us_rtt, 9389 RACK_RTTS_SEEHBP); 9390 rack->rc_highly_buffered = 1; 9391 } 9392 } 9393 } 9394 if ((rack->r_ctl.rack_rs.confidence) || 9395 (rack->r_ctl.rack_rs.rs_us_rtrcnt == 1)) { 9396 /* 9397 * If we are highly confident of it <or> it was 9398 * never retransmitted we accept it as the last us_rtt. 9399 */ 9400 rack->r_ctl.rc_last_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 9401 /* The lowest rtt can be set if its was not retransmited */ 9402 if (rack->r_ctl.rc_lowest_us_rtt > rack->r_ctl.rack_rs.rs_us_rtt) { 9403 rack->r_ctl.rc_lowest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 9404 if (rack->r_ctl.rc_lowest_us_rtt == 0) 9405 rack->r_ctl.rc_lowest_us_rtt = 1; 9406 } 9407 } 9408 rack = (struct tcp_rack *)tp->t_fb_ptr; 9409 if (tp->t_srtt != 0) { 9410 /* 9411 * We keep a simple srtt in microseconds, like our rtt 9412 * measurement. We don't need to do any tricks with shifting 9413 * etc. Instead we just add in 1/8th of the new measurement 9414 * and subtract out 1/8 of the old srtt. We do the same with 9415 * the variance after finding the absolute value of the 9416 * difference between this sample and the current srtt. 
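 *
 * Worked example (made-up values, all in microseconds): with
 * t_srtt = 100000 and a new rtt = 80000,
 *
 *	delta    = 100000 - 80000         = 20000
 *	t_srtt   = 100000 - 12500 + 10000 = 97500
 *	t_rttvar = t_rttvar - (t_rttvar / 8) + (20000 / 8)
 *
 * i.e. a plain 1/8 EWMA on both the mean and the mean deviation, with
 * no fixed-point scaling shifts.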
9417 */ 9418 delta = tp->t_srtt - rtt; 9419 /* Take off 1/8th of the current sRTT */ 9420 tp->t_srtt -= (tp->t_srtt >> 3); 9421 /* Add in 1/8th of the new RTT just measured */ 9422 tp->t_srtt += (rtt >> 3); 9423 if (tp->t_srtt <= 0) 9424 tp->t_srtt = 1; 9425 /* Now lets make the absolute value of the variance */ 9426 if (delta < 0) 9427 delta = -delta; 9428 /* Subtract out 1/8th */ 9429 tp->t_rttvar -= (tp->t_rttvar >> 3); 9430 /* Add in 1/8th of the new variance we just saw */ 9431 tp->t_rttvar += (delta >> 3); 9432 if (tp->t_rttvar <= 0) 9433 tp->t_rttvar = 1; 9434 } else { 9435 /* 9436 * No rtt measurement yet - use the unsmoothed rtt. Set the 9437 * variance to half the rtt (so our first retransmit happens 9438 * at 3*rtt). 9439 */ 9440 tp->t_srtt = rtt; 9441 tp->t_rttvar = rtt >> 1; 9442 } 9443 rack->rc_srtt_measure_made = 1; 9444 KMOD_TCPSTAT_INC(tcps_rttupdated); 9445 if (tp->t_rttupdated < UCHAR_MAX) 9446 tp->t_rttupdated++; 9447 #ifdef STATS 9448 if (rack_stats_gets_ms_rtt == 0) { 9449 /* Send in the microsecond rtt used for rxt timeout purposes */ 9450 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt)); 9451 } else if (rack_stats_gets_ms_rtt == 1) { 9452 /* Send in the millisecond rtt used for rxt timeout purposes */ 9453 int32_t ms_rtt; 9454 9455 /* Round up */ 9456 ms_rtt = (rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC; 9457 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt)); 9458 } else if (rack_stats_gets_ms_rtt == 2) { 9459 /* Send in the millisecond rtt has close to the path RTT as we can get */ 9460 int32_t ms_rtt; 9461 9462 /* Round up */ 9463 ms_rtt = (rack->r_ctl.rack_rs.rs_us_rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC; 9464 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt)); 9465 } else { 9466 /* Send in the microsecond rtt has close to the path RTT as we can get */ 9467 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt)); 9468 } 9469 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_PATHRTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt)); 9470 #endif 9471 rack->r_ctl.last_rcv_tstmp_for_rtt = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time); 9472 /* 9473 * the retransmit should happen at rtt + 4 * rttvar. Because of the 9474 * way we do the smoothing, srtt and rttvar will each average +1/2 9475 * tick of bias. When we compute the retransmit timer, we want 1/2 9476 * tick of rounding and 1 extra tick because of +-1/2 tick 9477 * uncertainty in the firing of the timer. The bias will give us 9478 * exactly the 1.5 tick we need. But, because the bias is 9479 * statistical, we have to test that we don't drop below the minimum 9480 * feasible timer (which is 2 ticks). 9481 */ 9482 tp->t_rxtshift = 0; 9483 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 9484 max(rack_rto_min, rtt + 2), rack_rto_max, rack->r_ctl.timer_slop); 9485 rack_log_rtt_sample(rack, rtt); 9486 tp->t_softerror = 0; 9487 } 9488 9489 9490 static void 9491 rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts) 9492 { 9493 /* 9494 * Apply to filter the inbound us-rtt at us_cts. 
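 *
 * In short: feed us_rtt into the running-minimum filter rc_gp_min_rtt
 * and, if that yields a new minimum lower by more than
 * rack_min_rtt_movement, consider entering probe-rtt early (up to
 * rack_probertt_lower_within percent of rack_time_between_probertt
 * ahead of schedule), on the theory that a peer flow just entered
 * probe-rtt and momentarily drained the queue.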
9495 */ 9496 uint32_t old_rtt; 9497 9498 old_rtt = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 9499 apply_filter_min_small(&rack->r_ctl.rc_gp_min_rtt, 9500 us_rtt, us_cts); 9501 if (old_rtt > us_rtt) { 9502 /* We just hit a new lower rtt time */ 9503 rack_log_rtt_shrinks(rack, us_cts, old_rtt, 9504 __LINE__, RACK_RTTS_NEWRTT); 9505 /* 9506 * Only count it if its lower than what we saw within our 9507 * calculated range. 9508 */ 9509 if ((old_rtt - us_rtt) > rack_min_rtt_movement) { 9510 if (rack_probertt_lower_within && 9511 rack->rc_gp_dyn_mul && 9512 (rack->use_fixed_rate == 0) && 9513 (rack->rc_always_pace)) { 9514 /* 9515 * We are seeing a new lower rtt very close 9516 * to the time that we would have entered probe-rtt. 9517 * This is probably due to the fact that a peer flow 9518 * has entered probe-rtt. Lets go in now too. 9519 */ 9520 uint32_t val; 9521 9522 val = rack_probertt_lower_within * rack_time_between_probertt; 9523 val /= 100; 9524 if ((rack->in_probe_rtt == 0) && 9525 (rack->rc_skip_timely == 0) && 9526 ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) { 9527 rack_enter_probertt(rack, us_cts); 9528 } 9529 } 9530 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 9531 } 9532 } 9533 } 9534 9535 static int 9536 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 9537 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack) 9538 { 9539 uint32_t us_rtt; 9540 int32_t i, all; 9541 uint32_t t, len_acked; 9542 9543 if ((rsm->r_flags & RACK_ACKED) || 9544 (rsm->r_flags & RACK_WAS_ACKED)) 9545 /* Already done */ 9546 return (0); 9547 if (rsm->r_no_rtt_allowed) { 9548 /* Not allowed */ 9549 return (0); 9550 } 9551 if (ack_type == CUM_ACKED) { 9552 if (SEQ_GT(th_ack, rsm->r_end)) { 9553 len_acked = rsm->r_end - rsm->r_start; 9554 all = 1; 9555 } else { 9556 len_acked = th_ack - rsm->r_start; 9557 all = 0; 9558 } 9559 } else { 9560 len_acked = rsm->r_end - rsm->r_start; 9561 all = 0; 9562 } 9563 if (rsm->r_rtr_cnt == 1) { 9564 9565 t = cts - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 9566 if ((int)t <= 0) 9567 t = 1; 9568 if (!tp->t_rttlow || tp->t_rttlow > t) 9569 tp->t_rttlow = t; 9570 if (!rack->r_ctl.rc_rack_min_rtt || 9571 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 9572 rack->r_ctl.rc_rack_min_rtt = t; 9573 if (rack->r_ctl.rc_rack_min_rtt == 0) { 9574 rack->r_ctl.rc_rack_min_rtt = 1; 9575 } 9576 } 9577 if (TSTMP_GT(tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])) 9578 us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 9579 else 9580 us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 9581 if (us_rtt == 0) 9582 us_rtt = 1; 9583 if (CC_ALGO(tp)->rttsample != NULL) { 9584 /* Kick the RTT to the CC */ 9585 CC_ALGO(tp)->rttsample(&tp->t_ccv, us_rtt, 1, rsm->r_fas); 9586 } 9587 rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time)); 9588 if (ack_type == SACKED) { 9589 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 1); 9590 tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 2 , rsm, rsm->r_rtr_cnt); 9591 } else { 9592 /* 9593 * We need to setup what our confidence 9594 * is in this ack. 9595 * 9596 * If the rsm was app limited and it is 9597 * less than a mss in length (the end 9598 * of the send) then we have a gap. 
If we 9599 * were app limited but say we were sending 9600 * multiple MSS's then we are more confident 9601 * int it. 9602 * 9603 * When we are not app-limited then we see if 9604 * the rsm is being included in the current 9605 * measurement, we tell this by the app_limited_needs_set 9606 * flag. 9607 * 9608 * Note that being cwnd blocked is not applimited 9609 * as well as the pacing delay between packets which 9610 * are sending only 1 or 2 MSS's also will show up 9611 * in the RTT. We probably need to examine this algorithm 9612 * a bit more and enhance it to account for the delay 9613 * between rsm's. We could do that by saving off the 9614 * pacing delay of each rsm (in an rsm) and then 9615 * factoring that in somehow though for now I am 9616 * not sure how :) 9617 */ 9618 int calc_conf = 0; 9619 9620 if (rsm->r_flags & RACK_APP_LIMITED) { 9621 if (all && (len_acked <= ctf_fixed_maxseg(tp))) 9622 calc_conf = 0; 9623 else 9624 calc_conf = 1; 9625 } else if (rack->app_limited_needs_set == 0) { 9626 calc_conf = 1; 9627 } else { 9628 calc_conf = 0; 9629 } 9630 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 2); 9631 tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 9632 calc_conf, rsm, rsm->r_rtr_cnt); 9633 } 9634 if ((rsm->r_flags & RACK_TLP) && 9635 (!IN_FASTRECOVERY(tp->t_flags))) { 9636 /* Segment was a TLP and our retrans matched */ 9637 if (rack->r_ctl.rc_tlp_cwnd_reduce) { 9638 rack_cong_signal(tp, CC_NDUPACK, th_ack, __LINE__); 9639 } 9640 } 9641 if ((rack->r_ctl.rc_rack_tmit_time == 0) || 9642 (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 9643 (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]))) { 9644 /* New more recent rack_tmit_time */ 9645 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 9646 if (rack->r_ctl.rc_rack_tmit_time == 0) 9647 rack->r_ctl.rc_rack_tmit_time = 1; 9648 rack->rc_rack_rtt = t; 9649 } 9650 return (1); 9651 } 9652 /* 9653 * We clear the soft/rxtshift since we got an ack. 9654 * There is no assurance we will call the commit() function 9655 * so we need to clear these to avoid incorrect handling. 9656 */ 9657 tp->t_rxtshift = 0; 9658 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 9659 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 9660 tp->t_softerror = 0; 9661 if (to && (to->to_flags & TOF_TS) && 9662 (ack_type == CUM_ACKED) && 9663 (to->to_tsecr) && 9664 ((rsm->r_flags & RACK_OVERMAX) == 0)) { 9665 /* 9666 * Now which timestamp does it match? In this block the ACK 9667 * must be coming from a previous transmission. 9668 */ 9669 for (i = 0; i < rsm->r_rtr_cnt; i++) { 9670 if (rack_ts_to_msec(rsm->r_tim_lastsent[i]) == to->to_tsecr) { 9671 t = cts - (uint32_t)rsm->r_tim_lastsent[i]; 9672 if ((int)t <= 0) 9673 t = 1; 9674 if (CC_ALGO(tp)->rttsample != NULL) { 9675 /* 9676 * Kick the RTT to the CC, here 9677 * we lie a bit in that we know the 9678 * retransmission is correct even though 9679 * we retransmitted. This is because 9680 * we match the timestamps. 9681 */ 9682 if (TSTMP_GT(tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[i])) 9683 us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[i]; 9684 else 9685 us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[i]; 9686 CC_ALGO(tp)->rttsample(&tp->t_ccv, us_rtt, 1, rsm->r_fas); 9687 } 9688 if ((i + 1) < rsm->r_rtr_cnt) { 9689 /* 9690 * The peer ack'd from our previous 9691 * transmission. 
We have a spurious 9692 * retransmission and thus we dont 9693 * want to update our rack_rtt. 9694 * 9695 * Hmm should there be a CC revert here? 9696 * 9697 */ 9698 return (0); 9699 } 9700 if (!tp->t_rttlow || tp->t_rttlow > t) 9701 tp->t_rttlow = t; 9702 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 9703 rack->r_ctl.rc_rack_min_rtt = t; 9704 if (rack->r_ctl.rc_rack_min_rtt == 0) { 9705 rack->r_ctl.rc_rack_min_rtt = 1; 9706 } 9707 } 9708 if ((rack->r_ctl.rc_rack_tmit_time == 0) || 9709 (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 9710 (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]))) { 9711 /* New more recent rack_tmit_time */ 9712 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 9713 if (rack->r_ctl.rc_rack_tmit_time == 0) 9714 rack->r_ctl.rc_rack_tmit_time = 1; 9715 rack->rc_rack_rtt = t; 9716 } 9717 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[i], cts, 3); 9718 tcp_rack_xmit_timer(rack, t + 1, len_acked, t, 0, rsm, 9719 rsm->r_rtr_cnt); 9720 return (1); 9721 } 9722 } 9723 /* If we are logging log out the sendmap */ 9724 if (tcp_bblogging_on(rack->rc_tp)) { 9725 for (i = 0; i < rsm->r_rtr_cnt; i++) { 9726 rack_log_rtt_sendmap(rack, i, rsm->r_tim_lastsent[i], to->to_tsecr); 9727 } 9728 } 9729 goto ts_not_found; 9730 } else { 9731 /* 9732 * Ok its a SACK block that we retransmitted. or a windows 9733 * machine without timestamps. We can tell nothing from the 9734 * time-stamp since its not there or the time the peer last 9735 * received a segment that moved forward its cum-ack point. 9736 */ 9737 ts_not_found: 9738 i = rsm->r_rtr_cnt - 1; 9739 t = cts - (uint32_t)rsm->r_tim_lastsent[i]; 9740 if ((int)t <= 0) 9741 t = 1; 9742 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 9743 /* 9744 * We retransmitted and the ack came back in less 9745 * than the smallest rtt we have observed. We most 9746 * likely did an improper retransmit as outlined in 9747 * 6.2 Step 2 point 2 in the rack-draft so we 9748 * don't want to update our rack_rtt. We in 9749 * theory (in future) might want to think about reverting our 9750 * cwnd state but we won't for now. 9751 */ 9752 return (0); 9753 } else if (rack->r_ctl.rc_rack_min_rtt) { 9754 /* 9755 * We retransmitted it and the retransmit did the 9756 * job. 9757 */ 9758 if (!rack->r_ctl.rc_rack_min_rtt || 9759 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 9760 rack->r_ctl.rc_rack_min_rtt = t; 9761 if (rack->r_ctl.rc_rack_min_rtt == 0) { 9762 rack->r_ctl.rc_rack_min_rtt = 1; 9763 } 9764 } 9765 if ((rack->r_ctl.rc_rack_tmit_time == 0) || 9766 (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 9767 (uint32_t)rsm->r_tim_lastsent[i]))) { 9768 /* New more recent rack_tmit_time */ 9769 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[i]; 9770 if (rack->r_ctl.rc_rack_tmit_time == 0) 9771 rack->r_ctl.rc_rack_tmit_time = 1; 9772 rack->rc_rack_rtt = t; 9773 } 9774 return (1); 9775 } 9776 } 9777 return (0); 9778 } 9779 9780 /* 9781 * Mark the SACK_PASSED flag on all entries prior to rsm send wise. 
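 *
 * "Send wise" means we walk rc_tmap (transmit-time order) backwards
 * from rsm: every earlier-sent, still-unacked entry is marked
 * RACK_SACK_PASSED and, when the SACKed segment's last send time plus
 * the RACK threshold (thresh below) has already elapsed, is also
 * charged to rc_considered_lost via RACK_WAS_LOST. The walk stops at
 * the first entry already marked, since everything older was handled
 * by a previous call.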
9782 */ 9783 static void 9784 rack_log_sack_passed(struct tcpcb *tp, 9785 struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t cts) 9786 { 9787 struct rack_sendmap *nrsm; 9788 uint32_t thresh; 9789 9790 /* Get our rxt threshold for lost consideration */ 9791 thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(tp, rack), cts, __LINE__, 0); 9792 /* Now start looking at rsm's */ 9793 nrsm = rsm; 9794 TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap, 9795 rack_head, r_tnext) { 9796 if (nrsm == rsm) { 9797 /* Skip original segment he is acked */ 9798 continue; 9799 } 9800 if (nrsm->r_flags & RACK_ACKED) { 9801 /* 9802 * Skip ack'd segments, though we 9803 * should not see these, since tmap 9804 * should not have ack'd segments. 9805 */ 9806 continue; 9807 } 9808 if (nrsm->r_flags & RACK_RWND_COLLAPSED) { 9809 /* 9810 * If the peer dropped the rwnd on 9811 * these then we don't worry about them. 9812 */ 9813 continue; 9814 } 9815 /* Check lost state */ 9816 if ((nrsm->r_flags & RACK_WAS_LOST) == 0) { 9817 uint32_t exp; 9818 9819 exp = ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]) + thresh; 9820 if (TSTMP_LT(exp, cts) || (exp == cts)) { 9821 /* We consider it lost */ 9822 nrsm->r_flags |= RACK_WAS_LOST; 9823 rack->r_ctl.rc_considered_lost += nrsm->r_end - nrsm->r_start; 9824 } 9825 } 9826 if (nrsm->r_flags & RACK_SACK_PASSED) { 9827 /* 9828 * We found one that is already marked 9829 * passed, we have been here before and 9830 * so all others below this are marked. 9831 */ 9832 break; 9833 } 9834 nrsm->r_flags |= RACK_SACK_PASSED; 9835 nrsm->r_flags &= ~RACK_WAS_SACKPASS; 9836 } 9837 } 9838 9839 static void 9840 rack_need_set_test(struct tcpcb *tp, 9841 struct tcp_rack *rack, 9842 struct rack_sendmap *rsm, 9843 tcp_seq th_ack, 9844 int line, 9845 int use_which) 9846 { 9847 struct rack_sendmap *s_rsm; 9848 9849 if ((tp->t_flags & TF_GPUTINPROG) && 9850 SEQ_GEQ(rsm->r_end, tp->gput_seq)) { 9851 /* 9852 * We were app limited, and this ack 9853 * butts up or goes beyond the point where we want 9854 * to start our next measurement. We need 9855 * to record the new gput_ts as here and 9856 * possibly update the start sequence. 9857 */ 9858 uint32_t seq, ts; 9859 9860 if (rsm->r_rtr_cnt > 1) { 9861 /* 9862 * This is a retransmit, can we 9863 * really make any assessment at this 9864 * point? We are not really sure of 9865 * the timestamp, is it this or the 9866 * previous transmission? 9867 * 9868 * Lets wait for something better that 9869 * is not retransmitted. 9870 */ 9871 return; 9872 } 9873 seq = tp->gput_seq; 9874 ts = tp->gput_ts; 9875 rack->app_limited_needs_set = 0; 9876 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 9877 /* Do we start at a new end? */ 9878 if ((use_which == RACK_USE_BEG) && 9879 SEQ_GEQ(rsm->r_start, tp->gput_seq)) { 9880 /* 9881 * When we get an ACK that just eats 9882 * up some of the rsm, we set RACK_USE_BEG 9883 * since whats at r_start (i.e. th_ack) 9884 * is left unacked and thats where the 9885 * measurement now starts. 9886 */ 9887 tp->gput_seq = rsm->r_start; 9888 } 9889 if ((use_which == RACK_USE_END) && 9890 SEQ_GEQ(rsm->r_end, tp->gput_seq)) { 9891 /* 9892 * We use the end when the cumack 9893 * is moving forward and completely 9894 * deleting the rsm passed so basically 9895 * r_end holds th_ack. 9896 * 9897 * For SACK's we also want to use the end 9898 * since this piece just got sacked and 9899 * we want to target anything after that 9900 * in our measurement. 
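* For example (illustrative numbers): if this rsm covers 1000-2000 and
* that whole range was just sacked, gput_seq is moved to 2000 below so
* the in-progress goodput measurement only counts data beyond the
* sacked piece.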
9901 */ 9902 tp->gput_seq = rsm->r_end; 9903 } 9904 if (use_which == RACK_USE_END_OR_THACK) { 9905 /* 9906 * special case for ack moving forward, 9907 * not a sack, we need to move all the 9908 * way up to where this ack cum-ack moves 9909 * to. 9910 */ 9911 if (SEQ_GT(th_ack, rsm->r_end)) 9912 tp->gput_seq = th_ack; 9913 else 9914 tp->gput_seq = rsm->r_end; 9915 } 9916 if (SEQ_LT(tp->gput_seq, tp->snd_max)) 9917 s_rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq); 9918 else 9919 s_rsm = NULL; 9920 /* 9921 * Pick up the correct send time if we can the rsm passed in 9922 * may be equal to s_rsm if the RACK_USE_BEG was set. For the other 9923 * two cases (RACK_USE_THACK or RACK_USE_END) most likely we will 9924 * find a different seq i.e. the next send up. 9925 * 9926 * If that has not been sent, s_rsm will be NULL and we must 9927 * arrange it so this function will get called again by setting 9928 * app_limited_needs_set. 9929 */ 9930 if (s_rsm) 9931 rack->r_ctl.rc_gp_output_ts = s_rsm->r_tim_lastsent[0]; 9932 else { 9933 /* If we hit here we have to have *not* sent tp->gput_seq */ 9934 rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[0]; 9935 /* Set it up so we will go through here again */ 9936 rack->app_limited_needs_set = 1; 9937 } 9938 if (SEQ_GT(tp->gput_seq, tp->gput_ack)) { 9939 /* 9940 * We moved beyond this guy's range, re-calculate 9941 * the new end point. 9942 */ 9943 if (rack->rc_gp_filled == 0) { 9944 tp->gput_ack = tp->gput_seq + max(rc_init_window(rack), (MIN_GP_WIN * ctf_fixed_maxseg(tp))); 9945 } else { 9946 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 9947 } 9948 } 9949 /* 9950 * We are moving the goal post, we may be able to clear the 9951 * measure_saw_probe_rtt flag. 9952 */ 9953 if ((rack->in_probe_rtt == 0) && 9954 (rack->measure_saw_probe_rtt) && 9955 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 9956 rack->measure_saw_probe_rtt = 0; 9957 rack_log_pacing_delay_calc(rack, ts, tp->gput_ts, 9958 seq, tp->gput_seq, 9959 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | 9960 (uint64_t)rack->r_ctl.rc_gp_output_ts), 9961 5, line, NULL, 0); 9962 if (rack->rc_gp_filled && 9963 ((tp->gput_ack - tp->gput_seq) < 9964 max(rc_init_window(rack), (MIN_GP_WIN * 9965 ctf_fixed_maxseg(tp))))) { 9966 uint32_t ideal_amount; 9967 9968 ideal_amount = rack_get_measure_window(tp, rack); 9969 if (ideal_amount > sbavail(&tptosocket(tp)->so_snd)) { 9970 /* 9971 * There is no sense of continuing this measurement 9972 * because its too small to gain us anything we 9973 * trust. Skip it and that way we can start a new 9974 * measurement quicker. 9975 */ 9976 tp->t_flags &= ~TF_GPUTINPROG; 9977 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq, 9978 0, 0, 9979 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | 9980 (uint64_t)rack->r_ctl.rc_gp_output_ts), 9981 6, __LINE__, NULL, 0); 9982 } else { 9983 /* 9984 * Reset the window further out. 
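* (The socket buffer still holds at least ideal_amount, so we can
* stretch gput_ack out to the full measurement size that
* rack_get_measure_window() asked for.)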
9985 */ 9986 tp->gput_ack = tp->gput_seq + ideal_amount; 9987 } 9988 } 9989 rack_tend_gp_marks(tp, rack); 9990 rack_log_gpset(rack, tp->gput_ack, 0, 0, line, 2, rsm); 9991 } 9992 } 9993 9994 static inline int 9995 is_rsm_inside_declared_tlp_block(struct tcp_rack *rack, struct rack_sendmap *rsm) 9996 { 9997 if (SEQ_LT(rsm->r_end, rack->r_ctl.last_tlp_acked_start)) { 9998 /* Behind our TLP definition or right at */ 9999 return (0); 10000 } 10001 if (SEQ_GT(rsm->r_start, rack->r_ctl.last_tlp_acked_end)) { 10002 /* The start is beyond or right at our end of TLP definition */ 10003 return (0); 10004 } 10005 /* It has to be a sub-part of the original TLP recorded */ 10006 return (1); 10007 } 10008 10009 static uint32_t 10010 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, 10011 struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, 10012 uint32_t segsiz) 10013 { 10014 uint32_t start, end, changed = 0; 10015 struct rack_sendmap stack_map; 10016 struct rack_sendmap *rsm, *nrsm, *prev, *next; 10017 int insret __diagused; 10018 int32_t used_ref = 1; 10019 int can_use_hookery = 0; 10020 10021 start = sack->start; 10022 end = sack->end; 10023 rsm = *prsm; 10024 10025 do_rest_ofb: 10026 if ((rsm == NULL) || 10027 (SEQ_LT(end, rsm->r_start)) || 10028 (SEQ_GEQ(start, rsm->r_end)) || 10029 (SEQ_LT(start, rsm->r_start))) { 10030 /* 10031 * We are not in the right spot, 10032 * find the correct spot in the tree. 10033 */ 10034 used_ref = 0; 10035 rsm = tqhash_find(rack->r_ctl.tqh, start); 10036 } 10037 if (rsm == NULL) { 10038 /* TSNH */ 10039 goto out; 10040 } 10041 /* Ok we have an ACK for some piece of this rsm */ 10042 if (rsm->r_start != start) { 10043 if ((rsm->r_flags & RACK_ACKED) == 0) { 10044 /* 10045 * Before any splitting or hookery is 10046 * done is it a TLP of interest i.e. rxt? 10047 */ 10048 if ((rsm->r_flags & RACK_TLP) && 10049 (rsm->r_rtr_cnt > 1)) { 10050 /* 10051 * We are splitting a rxt TLP, check 10052 * if we need to save off the start/end 10053 */ 10054 if (rack->rc_last_tlp_acked_set && 10055 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 10056 /* 10057 * We already turned this on since we are inside 10058 * the previous one was a partially sack now we 10059 * are getting another one (maybe all of it). 10060 * 10061 */ 10062 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 10063 /* 10064 * Lets make sure we have all of it though. 10065 */ 10066 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 10067 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 10068 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 10069 rack->r_ctl.last_tlp_acked_end); 10070 } 10071 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 10072 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 10073 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 10074 rack->r_ctl.last_tlp_acked_end); 10075 } 10076 } else { 10077 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 10078 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 10079 rack->rc_last_tlp_past_cumack = 0; 10080 rack->rc_last_tlp_acked_set = 1; 10081 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 10082 } 10083 } 10084 /** 10085 * Need to split this in two pieces the before and after, 10086 * the before remains in the map, the after must be 10087 * added. 
In other words we have: 10088 * rsm |--------------| 10089 * sackblk |-------> 10090 * rsm will become 10091 * rsm |---| 10092 * and nrsm will be the sacked piece 10093 * nrsm |----------| 10094 * 10095 * But before we start down that path lets 10096 * see if the sack spans over on top of 10097 * the next guy and it is already sacked. 10098 * 10099 */ 10100 /* 10101 * Hookery can only be used if the two entries 10102 * are in the same bucket and neither one of 10103 * them staddle the bucket line. 10104 */ 10105 next = tqhash_next(rack->r_ctl.tqh, rsm); 10106 if (next && 10107 (rsm->bindex == next->bindex) && 10108 ((rsm->r_flags & RACK_STRADDLE) == 0) && 10109 ((next->r_flags & RACK_STRADDLE) == 0) && 10110 ((rsm->r_flags & RACK_IS_PCM) == 0) && 10111 ((next->r_flags & RACK_IS_PCM) == 0) && 10112 (rsm->r_flags & RACK_IN_GP_WIN) && 10113 (next->r_flags & RACK_IN_GP_WIN)) 10114 can_use_hookery = 1; 10115 else 10116 can_use_hookery = 0; 10117 if (next && can_use_hookery && 10118 (next->r_flags & RACK_ACKED) && 10119 SEQ_GEQ(end, next->r_start)) { 10120 /** 10121 * So the next one is already acked, and 10122 * we can thus by hookery use our stack_map 10123 * to reflect the piece being sacked and 10124 * then adjust the two tree entries moving 10125 * the start and ends around. So we start like: 10126 * rsm |------------| (not-acked) 10127 * next |-----------| (acked) 10128 * sackblk |--------> 10129 * We want to end like so: 10130 * rsm |------| (not-acked) 10131 * next |-----------------| (acked) 10132 * nrsm |-----| 10133 * Where nrsm is a temporary stack piece we 10134 * use to update all the gizmos. 10135 */ 10136 /* Copy up our fudge block */ 10137 nrsm = &stack_map; 10138 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 10139 /* Now adjust our tree blocks */ 10140 tqhash_update_end(rack->r_ctl.tqh, rsm, start); 10141 next->r_start = start; 10142 rsm->r_flags |= RACK_SHUFFLED; 10143 next->r_flags |= RACK_SHUFFLED; 10144 /* Now we must adjust back where next->m is */ 10145 rack_setup_offset_for_rsm(rack, rsm, next); 10146 /* 10147 * Which timestamp do we keep? It is rather 10148 * important in GP measurements to have the 10149 * accurate end of the send window. 10150 * 10151 * We keep the largest value, which is the newest 10152 * send. We do this in case a segment that is 10153 * joined together and not part of a GP estimate 10154 * later gets expanded into the GP estimate. 10155 * 10156 * We prohibit the merging of unlike kinds i.e. 10157 * all pieces that are in the GP estimate can be 10158 * merged and all pieces that are not in a GP estimate 10159 * can be merged, but not disimilar pieces. Combine 10160 * this with taking the highest here and we should 10161 * be ok unless of course the client reneges. Then 10162 * all bets are off. 10163 */ 10164 if (next->r_tim_lastsent[(next->r_rtr_cnt-1)] < 10165 nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)]) 10166 next->r_tim_lastsent[(next->r_rtr_cnt-1)] = nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)]; 10167 /* 10168 * And we must keep the newest ack arrival time. 
10169 */ 10170 if (next->r_ack_arrival < 10171 rack_to_usec_ts(&rack->r_ctl.act_rcv_time)) 10172 next->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 10173 10174 10175 /* We don't need to adjust rsm, it did not change */ 10176 /* Clear out the dup ack count of the remainder */ 10177 rsm->r_dupack = 0; 10178 rsm->r_just_ret = 0; 10179 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 10180 /* Now lets make sure our fudge block is right */ 10181 nrsm->r_start = start; 10182 /* Now lets update all the stats and such */ 10183 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); 10184 if (rack->app_limited_needs_set) 10185 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); 10186 changed += (nrsm->r_end - nrsm->r_start); 10187 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 10188 if (rsm->r_flags & RACK_WAS_LOST) { 10189 int my_chg; 10190 10191 my_chg = (nrsm->r_end - nrsm->r_start); 10192 KASSERT((rack->r_ctl.rc_considered_lost >= my_chg), 10193 ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); 10194 if (my_chg <= rack->r_ctl.rc_considered_lost) 10195 rack->r_ctl.rc_considered_lost -= my_chg; 10196 else 10197 rack->r_ctl.rc_considered_lost = 0; 10198 } 10199 if (nrsm->r_flags & RACK_SACK_PASSED) { 10200 rack->r_ctl.rc_reorder_ts = cts; 10201 if (rack->r_ctl.rc_reorder_ts == 0) 10202 rack->r_ctl.rc_reorder_ts = 1; 10203 } 10204 /* 10205 * Now we want to go up from rsm (the 10206 * one left un-acked) to the next one 10207 * in the tmap. We do this so when 10208 * we walk backwards we include marking 10209 * sack-passed on rsm (The one passed in 10210 * is skipped since it is generally called 10211 * on something sacked before removing it 10212 * from the tmap). 10213 */ 10214 if (rsm->r_in_tmap) { 10215 nrsm = TAILQ_NEXT(rsm, r_tnext); 10216 /* 10217 * Now that we have the next 10218 * one walk backwards from there. 10219 */ 10220 if (nrsm && nrsm->r_in_tmap) 10221 rack_log_sack_passed(tp, rack, nrsm, cts); 10222 } 10223 /* Now are we done? */ 10224 if (SEQ_LT(end, next->r_end) || 10225 (end == next->r_end)) { 10226 /* Done with block */ 10227 goto out; 10228 } 10229 rack_log_map_chg(tp, rack, &stack_map, rsm, next, MAP_SACK_M1, end, __LINE__); 10230 counter_u64_add(rack_sack_used_next_merge, 1); 10231 /* Postion for the next block */ 10232 start = next->r_end; 10233 rsm = tqhash_next(rack->r_ctl.tqh, next); 10234 if (rsm == NULL) 10235 goto out; 10236 } else { 10237 /** 10238 * We can't use any hookery here, so we 10239 * need to split the map. We enter like 10240 * so: 10241 * rsm |--------| 10242 * sackblk |-----> 10243 * We will add the new block nrsm and 10244 * that will be the new portion, and then 10245 * fall through after reseting rsm. So we 10246 * split and look like this: 10247 * rsm |----| 10248 * sackblk |-----> 10249 * nrsm |---| 10250 * We then fall through reseting 10251 * rsm to nrsm, so the next block 10252 * picks it up. 10253 */ 10254 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 10255 if (nrsm == NULL) { 10256 /* 10257 * failed XXXrrs what can we do but loose the sack 10258 * info? 
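* (rack_alloc_limit() could not give us a split entry, so we drop this
* SACK block's information and rely on later ACKs to fill it in.)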
10259 */ 10260 goto out; 10261 } 10262 counter_u64_add(rack_sack_splits, 1); 10263 rack_clone_rsm(rack, nrsm, rsm, start); 10264 rsm->r_just_ret = 0; 10265 #ifndef INVARIANTS 10266 (void)tqhash_insert(rack->r_ctl.tqh, nrsm); 10267 #else 10268 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { 10269 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p", 10270 nrsm, insret, rack, rsm); 10271 } 10272 #endif 10273 if (rsm->r_in_tmap) { 10274 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 10275 nrsm->r_in_tmap = 1; 10276 } 10277 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M2, end, __LINE__); 10278 rsm->r_flags &= (~RACK_HAS_FIN); 10279 /* Position us to point to the new nrsm that starts the sack blk */ 10280 rsm = nrsm; 10281 } 10282 } else { 10283 /* Already sacked this piece */ 10284 counter_u64_add(rack_sack_skipped_acked, 1); 10285 if (end == rsm->r_end) { 10286 /* Done with block */ 10287 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 10288 goto out; 10289 } else if (SEQ_LT(end, rsm->r_end)) { 10290 /* A partial sack to a already sacked block */ 10291 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 10292 goto out; 10293 } else { 10294 /* 10295 * The end goes beyond this guy 10296 * reposition the start to the 10297 * next block. 10298 */ 10299 start = rsm->r_end; 10300 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 10301 if (rsm == NULL) 10302 goto out; 10303 } 10304 } 10305 } 10306 if (SEQ_GEQ(end, rsm->r_end)) { 10307 /** 10308 * The end of this block is either beyond this guy or right 10309 * at this guy. I.e.: 10310 * rsm --- |-----| 10311 * end |-----| 10312 * <or> 10313 * end |---------| 10314 */ 10315 if ((rsm->r_flags & RACK_ACKED) == 0) { 10316 /* 10317 * Is it a TLP of interest? 10318 */ 10319 if ((rsm->r_flags & RACK_TLP) && 10320 (rsm->r_rtr_cnt > 1)) { 10321 /* 10322 * We are splitting a rxt TLP, check 10323 * if we need to save off the start/end 10324 */ 10325 if (rack->rc_last_tlp_acked_set && 10326 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 10327 /* 10328 * We already turned this on since we are inside 10329 * the previous one was a partially sack now we 10330 * are getting another one (maybe all of it). 10331 */ 10332 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 10333 /* 10334 * Lets make sure we have all of it though. 
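* For example (illustrative numbers): if the recorded TLP block is
* 4000-5000 and this rsm spans 3500-5200, the two checks below widen
* the saved range to 3500-5200.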
10335 */ 10336 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 10337 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 10338 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 10339 rack->r_ctl.last_tlp_acked_end); 10340 } 10341 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 10342 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 10343 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 10344 rack->r_ctl.last_tlp_acked_end); 10345 } 10346 } else { 10347 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 10348 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 10349 rack->rc_last_tlp_past_cumack = 0; 10350 rack->rc_last_tlp_acked_set = 1; 10351 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 10352 } 10353 } 10354 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); 10355 changed += (rsm->r_end - rsm->r_start); 10356 /* You get a count for acking a whole segment or more */ 10357 if (rsm->r_flags & RACK_WAS_LOST) { 10358 int my_chg; 10359 10360 my_chg = (rsm->r_end - rsm->r_start); 10361 rsm->r_flags &= ~RACK_WAS_LOST; 10362 KASSERT((rack->r_ctl.rc_considered_lost >= my_chg), 10363 ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); 10364 if (my_chg <= rack->r_ctl.rc_considered_lost) 10365 rack->r_ctl.rc_considered_lost -= my_chg; 10366 else 10367 rack->r_ctl.rc_considered_lost = 0; 10368 } 10369 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 10370 if (rsm->r_in_tmap) /* should be true */ 10371 rack_log_sack_passed(tp, rack, rsm, cts); 10372 /* Is Reordering occuring? */ 10373 if (rsm->r_flags & RACK_SACK_PASSED) { 10374 rsm->r_flags &= ~RACK_SACK_PASSED; 10375 rack->r_ctl.rc_reorder_ts = cts; 10376 if (rack->r_ctl.rc_reorder_ts == 0) 10377 rack->r_ctl.rc_reorder_ts = 1; 10378 } 10379 if (rack->app_limited_needs_set) 10380 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); 10381 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 10382 rsm->r_flags |= RACK_ACKED; 10383 rack_update_pcm_ack(rack, 0, rsm->r_start, rsm->r_end); 10384 if (rsm->r_in_tmap) { 10385 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 10386 rsm->r_in_tmap = 0; 10387 } 10388 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_SACK_M3, end, __LINE__); 10389 } else { 10390 counter_u64_add(rack_sack_skipped_acked, 1); 10391 } 10392 if (end == rsm->r_end) { 10393 /* This block only - done, setup for next */ 10394 goto out; 10395 } 10396 /* 10397 * There is more not coverend by this rsm move on 10398 * to the next block in the tail queue hash table. 10399 */ 10400 nrsm = tqhash_next(rack->r_ctl.tqh, rsm); 10401 start = rsm->r_end; 10402 rsm = nrsm; 10403 if (rsm == NULL) 10404 goto out; 10405 goto do_rest_ofb; 10406 } 10407 /** 10408 * The end of this sack block is smaller than 10409 * our rsm i.e.: 10410 * rsm --- |-----| 10411 * end |--| 10412 */ 10413 if ((rsm->r_flags & RACK_ACKED) == 0) { 10414 /* 10415 * Is it a TLP of interest? 10416 */ 10417 if ((rsm->r_flags & RACK_TLP) && 10418 (rsm->r_rtr_cnt > 1)) { 10419 /* 10420 * We are splitting a rxt TLP, check 10421 * if we need to save off the start/end 10422 */ 10423 if (rack->rc_last_tlp_acked_set && 10424 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 10425 /* 10426 * We already turned this on since we are inside 10427 * the previous one was a partially sack now we 10428 * are getting another one (maybe all of it). 
10429 */ 10430 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 10431 /* 10432 * Lets make sure we have all of it though. 10433 */ 10434 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 10435 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 10436 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 10437 rack->r_ctl.last_tlp_acked_end); 10438 } 10439 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 10440 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 10441 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 10442 rack->r_ctl.last_tlp_acked_end); 10443 } 10444 } else { 10445 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 10446 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 10447 rack->rc_last_tlp_past_cumack = 0; 10448 rack->rc_last_tlp_acked_set = 1; 10449 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 10450 } 10451 } 10452 /* 10453 * Hookery can only be used if the two entries 10454 * are in the same bucket and neither one of 10455 * them staddle the bucket line. 10456 */ 10457 prev = tqhash_prev(rack->r_ctl.tqh, rsm); 10458 if (prev && 10459 (rsm->bindex == prev->bindex) && 10460 ((rsm->r_flags & RACK_STRADDLE) == 0) && 10461 ((prev->r_flags & RACK_STRADDLE) == 0) && 10462 ((rsm->r_flags & RACK_IS_PCM) == 0) && 10463 ((prev->r_flags & RACK_IS_PCM) == 0) && 10464 (rsm->r_flags & RACK_IN_GP_WIN) && 10465 (prev->r_flags & RACK_IN_GP_WIN)) 10466 can_use_hookery = 1; 10467 else 10468 can_use_hookery = 0; 10469 if (prev && can_use_hookery && 10470 (prev->r_flags & RACK_ACKED)) { 10471 /** 10472 * Goal, we want the right remainder of rsm to shrink 10473 * in place and span from (rsm->r_start = end) to rsm->r_end. 10474 * We want to expand prev to go all the way 10475 * to prev->r_end <- end. 10476 * so in the tree we have before: 10477 * prev |--------| (acked) 10478 * rsm |-------| (non-acked) 10479 * sackblk |-| 10480 * We churn it so we end up with 10481 * prev |----------| (acked) 10482 * rsm |-----| (non-acked) 10483 * nrsm |-| (temporary) 10484 * 10485 * Note if either prev/rsm is a TLP we don't 10486 * do this. 10487 */ 10488 nrsm = &stack_map; 10489 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 10490 tqhash_update_end(rack->r_ctl.tqh, prev, end); 10491 rsm->r_start = end; 10492 rsm->r_flags |= RACK_SHUFFLED; 10493 prev->r_flags |= RACK_SHUFFLED; 10494 /* Now adjust nrsm (stack copy) to be 10495 * the one that is the small 10496 * piece that was "sacked". 10497 */ 10498 nrsm->r_end = end; 10499 rsm->r_dupack = 0; 10500 /* 10501 * Which timestamp do we keep? It is rather 10502 * important in GP measurements to have the 10503 * accurate end of the send window. 10504 * 10505 * We keep the largest value, which is the newest 10506 * send. We do this in case a segment that is 10507 * joined together and not part of a GP estimate 10508 * later gets expanded into the GP estimate. 10509 * 10510 * We prohibit the merging of unlike kinds i.e. 10511 * all pieces that are in the GP estimate can be 10512 * merged and all pieces that are not in a GP estimate 10513 * can be merged, but not disimilar pieces. Combine 10514 * this with taking the highest here and we should 10515 * be ok unless of course the client reneges. Then 10516 * all bets are off. 
10517 */ 10518 if(prev->r_tim_lastsent[(prev->r_rtr_cnt-1)] < 10519 nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)]) { 10520 prev->r_tim_lastsent[(prev->r_rtr_cnt-1)] = nrsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 10521 } 10522 /* 10523 * And we must keep the newest ack arrival time. 10524 */ 10525 10526 if(prev->r_ack_arrival < 10527 rack_to_usec_ts(&rack->r_ctl.act_rcv_time)) 10528 prev->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 10529 10530 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 10531 /* 10532 * Now that the rsm has had its start moved forward 10533 * lets go ahead and get its new place in the world. 10534 */ 10535 rack_setup_offset_for_rsm(rack, prev, rsm); 10536 /* 10537 * Now nrsm is our new little piece 10538 * that is acked (which was merged 10539 * to prev). Update the rtt and changed 10540 * based on that. Also check for reordering. 10541 */ 10542 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); 10543 if (rack->app_limited_needs_set) 10544 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); 10545 changed += (nrsm->r_end - nrsm->r_start); 10546 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 10547 if (rsm->r_flags & RACK_WAS_LOST) { 10548 int my_chg; 10549 10550 my_chg = (nrsm->r_end - nrsm->r_start); 10551 KASSERT((rack->r_ctl.rc_considered_lost >= my_chg), 10552 ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); 10553 if (my_chg <= rack->r_ctl.rc_considered_lost) 10554 rack->r_ctl.rc_considered_lost -= my_chg; 10555 else 10556 rack->r_ctl.rc_considered_lost = 0; 10557 } 10558 if (nrsm->r_flags & RACK_SACK_PASSED) { 10559 rack->r_ctl.rc_reorder_ts = cts; 10560 if (rack->r_ctl.rc_reorder_ts == 0) 10561 rack->r_ctl.rc_reorder_ts = 1; 10562 } 10563 rack_log_map_chg(tp, rack, prev, &stack_map, rsm, MAP_SACK_M4, end, __LINE__); 10564 rsm = prev; 10565 counter_u64_add(rack_sack_used_prev_merge, 1); 10566 } else { 10567 /** 10568 * This is the case where our previous 10569 * block is not acked either, so we must 10570 * split the block in two. 10571 */ 10572 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 10573 if (nrsm == NULL) { 10574 /* failed rrs what can we do but loose the sack info? */ 10575 goto out; 10576 } 10577 if ((rsm->r_flags & RACK_TLP) && 10578 (rsm->r_rtr_cnt > 1)) { 10579 /* 10580 * We are splitting a rxt TLP, check 10581 * if we need to save off the start/end 10582 */ 10583 if (rack->rc_last_tlp_acked_set && 10584 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 10585 /* 10586 * We already turned this on since this block is inside 10587 * the previous one was a partially sack now we 10588 * are getting another one (maybe all of it). 10589 */ 10590 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 10591 /* 10592 * Lets make sure we have all of it though. 
10593 */ 10594 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 10595 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 10596 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 10597 rack->r_ctl.last_tlp_acked_end); 10598 } 10599 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 10600 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 10601 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 10602 rack->r_ctl.last_tlp_acked_end); 10603 } 10604 } else { 10605 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 10606 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 10607 rack->rc_last_tlp_acked_set = 1; 10608 rack->rc_last_tlp_past_cumack = 0; 10609 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 10610 } 10611 } 10612 /** 10613 * In this case nrsm becomes 10614 * nrsm->r_start = end; 10615 * nrsm->r_end = rsm->r_end; 10616 * which is un-acked. 10617 * <and> 10618 * rsm->r_end = nrsm->r_start; 10619 * i.e. the remaining un-acked 10620 * piece is left on the left 10621 * hand side. 10622 * 10623 * So we start like this 10624 * rsm |----------| (not acked) 10625 * sackblk |---| 10626 * build it so we have 10627 * rsm |---| (acked) 10628 * nrsm |------| (not acked) 10629 */ 10630 counter_u64_add(rack_sack_splits, 1); 10631 rack_clone_rsm(rack, nrsm, rsm, end); 10632 rsm->r_flags &= (~RACK_HAS_FIN); 10633 rsm->r_just_ret = 0; 10634 #ifndef INVARIANTS 10635 (void)tqhash_insert(rack->r_ctl.tqh, nrsm); 10636 #else 10637 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { 10638 panic("Insert in tailq_hash of %p fails ret:% rack:%p rsm:%p", 10639 nrsm, insret, rack, rsm); 10640 } 10641 #endif 10642 if (rsm->r_in_tmap) { 10643 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 10644 nrsm->r_in_tmap = 1; 10645 } 10646 nrsm->r_dupack = 0; 10647 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); 10648 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); 10649 changed += (rsm->r_end - rsm->r_start); 10650 if (rsm->r_flags & RACK_WAS_LOST) { 10651 int my_chg; 10652 10653 my_chg = (rsm->r_end - rsm->r_start); 10654 rsm->r_flags &= ~RACK_WAS_LOST; 10655 KASSERT((rack->r_ctl.rc_considered_lost >= my_chg), 10656 ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); 10657 if (my_chg <= rack->r_ctl.rc_considered_lost) 10658 rack->r_ctl.rc_considered_lost -= my_chg; 10659 else 10660 rack->r_ctl.rc_considered_lost = 0; 10661 } 10662 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 10663 10664 if (rsm->r_in_tmap) /* should be true */ 10665 rack_log_sack_passed(tp, rack, rsm, cts); 10666 /* Is Reordering occuring? */ 10667 if (rsm->r_flags & RACK_SACK_PASSED) { 10668 rsm->r_flags &= ~RACK_SACK_PASSED; 10669 rack->r_ctl.rc_reorder_ts = cts; 10670 if (rack->r_ctl.rc_reorder_ts == 0) 10671 rack->r_ctl.rc_reorder_ts = 1; 10672 } 10673 if (rack->app_limited_needs_set) 10674 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); 10675 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 10676 rsm->r_flags |= RACK_ACKED; 10677 rack_update_pcm_ack(rack, 0, rsm->r_start, rsm->r_end); 10678 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M5, end, __LINE__); 10679 if (rsm->r_in_tmap) { 10680 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 10681 rsm->r_in_tmap = 0; 10682 } 10683 } 10684 } else if (start != end){ 10685 /* 10686 * The block was already acked. 
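* (rsm already carries RACK_ACKED and the remaining block is non-empty,
* so there is nothing new to mark; we only bump the skip counter.)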
10687 */ 10688 counter_u64_add(rack_sack_skipped_acked, 1); 10689 } 10690 out: 10691 if (rsm && 10692 ((rsm->r_flags & RACK_TLP) == 0) && 10693 (rsm->r_flags & RACK_ACKED)) { 10694 /* 10695 * Now can we merge where we worked 10696 * with either the previous or 10697 * next block? 10698 */ 10699 next = tqhash_next(rack->r_ctl.tqh, rsm); 10700 while (next) { 10701 if (next->r_flags & RACK_TLP) 10702 break; 10703 /* Only allow merges between ones in or out of GP window */ 10704 if ((next->r_flags & RACK_IN_GP_WIN) && 10705 ((rsm->r_flags & RACK_IN_GP_WIN) == 0)) { 10706 break; 10707 } 10708 if ((rsm->r_flags & RACK_IN_GP_WIN) && 10709 ((next->r_flags & RACK_IN_GP_WIN) == 0)) { 10710 break; 10711 } 10712 if (rsm->bindex != next->bindex) 10713 break; 10714 if (rsm->r_flags & RACK_STRADDLE) 10715 break; 10716 if (rsm->r_flags & RACK_IS_PCM) 10717 break; 10718 if (next->r_flags & RACK_STRADDLE) 10719 break; 10720 if (next->r_flags & RACK_IS_PCM) 10721 break; 10722 if (next->r_flags & RACK_ACKED) { 10723 /* yep this and next can be merged */ 10724 rsm = rack_merge_rsm(rack, rsm, next); 10725 next = tqhash_next(rack->r_ctl.tqh, rsm); 10726 } else 10727 break; 10728 } 10729 /* Now what about the previous? */ 10730 prev = tqhash_prev(rack->r_ctl.tqh, rsm); 10731 while (prev) { 10732 if (prev->r_flags & RACK_TLP) 10733 break; 10734 /* Only allow merges between ones in or out of GP window */ 10735 if ((prev->r_flags & RACK_IN_GP_WIN) && 10736 ((rsm->r_flags & RACK_IN_GP_WIN) == 0)) { 10737 break; 10738 } 10739 if ((rsm->r_flags & RACK_IN_GP_WIN) && 10740 ((prev->r_flags & RACK_IN_GP_WIN) == 0)) { 10741 break; 10742 } 10743 if (rsm->bindex != prev->bindex) 10744 break; 10745 if (rsm->r_flags & RACK_STRADDLE) 10746 break; 10747 if (rsm->r_flags & RACK_IS_PCM) 10748 break; 10749 if (prev->r_flags & RACK_STRADDLE) 10750 break; 10751 if (prev->r_flags & RACK_IS_PCM) 10752 break; 10753 if (prev->r_flags & RACK_ACKED) { 10754 /* yep the previous and this can be merged */ 10755 rsm = rack_merge_rsm(rack, prev, rsm); 10756 prev = tqhash_prev(rack->r_ctl.tqh, rsm); 10757 } else 10758 break; 10759 } 10760 } 10761 if (used_ref == 0) { 10762 counter_u64_add(rack_sack_proc_all, 1); 10763 } else { 10764 counter_u64_add(rack_sack_proc_short, 1); 10765 } 10766 /* Save off the next one for quick reference. 
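Caching it in rc_sacklast (and handing it back via *prsm) lets the next call to rack_proc_sack_blk() start from this rsm instead of doing a fresh tqhash lookup; that fast path is what the rack_sack_proc_short counter above tracks.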
*/ 10767 nrsm = tqhash_find(rack->r_ctl.tqh, end); 10768 *prsm = rack->r_ctl.rc_sacklast = nrsm; 10769 if (IN_RECOVERY(tp->t_flags)) { 10770 rack->r_ctl.bytes_acked_in_recovery += changed; 10771 } 10772 return (changed); 10773 } 10774 10775 static void inline 10776 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack) 10777 { 10778 struct rack_sendmap *tmap; 10779 10780 tmap = NULL; 10781 while (rsm && (rsm->r_flags & RACK_ACKED)) { 10782 /* Its no longer sacked, mark it so */ 10783 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 10784 #ifdef INVARIANTS 10785 if (rsm->r_in_tmap) { 10786 panic("rack:%p rsm:%p flags:0x%x in tmap?", 10787 rack, rsm, rsm->r_flags); 10788 } 10789 #endif 10790 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS); 10791 /* Rebuild it into our tmap */ 10792 if (tmap == NULL) { 10793 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 10794 tmap = rsm; 10795 } else { 10796 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext); 10797 tmap = rsm; 10798 } 10799 tmap->r_in_tmap = 1; 10800 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 10801 } 10802 /* 10803 * Now lets possibly clear the sack filter so we start 10804 * recognizing sacks that cover this area. 10805 */ 10806 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack); 10807 10808 } 10809 10810 10811 static void inline 10812 rack_rsm_sender_update(struct tcp_rack *rack, struct tcpcb *tp, struct rack_sendmap *rsm, uint8_t from) 10813 { 10814 /* 10815 * We look at advancing the end send time for our GP 10816 * measurement tracking only as the cumulative acknowledgment 10817 * moves forward. You might wonder about this, why not 10818 * at every transmission or retransmission within the 10819 * GP window update the rc_gp_cumack_ts? Well its rather 10820 * nuanced but basically the GP window *may* expand (as 10821 * it does below) or worse and harder to track it may shrink. 10822 * 10823 * This last makes it impossible to track at the time of 10824 * the send, since you may set forward your rc_gp_cumack_ts 10825 * when you send, because that send *is* in your currently 10826 * "guessed" window, but then it shrinks. Now which was 10827 * the send time of the last bytes in the window, by the 10828 * time you ask that question that part of the sendmap 10829 * is freed. So you don't know and you will have too 10830 * long of send window. Instead by updating the time 10831 * marker only when the cumack advances this assures us 10832 * that we will have only the sends in the window of our 10833 * GP measurement. 10834 * 10835 * Another complication from this is the 10836 * merging of sendmap entries. During SACK processing this 10837 * can happen to conserve the sendmap size. That breaks 10838 * everything down in tracking the send window of the GP 10839 * estimate. So to prevent that and keep it working with 10840 * a tiny bit more limited merging, we only allow like 10841 * types to be merged. I.e. if two sends are in the GP window 10842 * then its ok to merge them together. If two sends are not 10843 * in the GP window its ok to merge them together too. Though 10844 * one send in and one send out cannot be merged. We combine 10845 * this with never allowing the shrinking of the GP window when 10846 * we are in recovery so that we can properly calculate the 10847 * sending times. 10848 * 10849 * This all of course seems complicated, because it is.. :) 10850 * 10851 * The cum-ack is being advanced upon the sendmap. 
10852 * If we are not doing a GP estimate don't 10853 * proceed. 10854 */ 10855 uint64_t ts; 10856 10857 if ((tp->t_flags & TF_GPUTINPROG) == 0) 10858 return; 10859 /* 10860 * If this sendmap entry is going 10861 * beyond the measurement window we had picked, 10862 * expand the measurement window by that much. 10863 */ 10864 if (SEQ_GT(rsm->r_end, tp->gput_ack)) { 10865 tp->gput_ack = rsm->r_end; 10866 } 10867 /* 10868 * If we have not setup a ack, then we 10869 * have no idea if the newly acked pieces 10870 * will be "in our seq measurement range". If 10871 * it is when we clear the app_limited_needs_set 10872 * flag the timestamp will be updated. 10873 */ 10874 if (rack->app_limited_needs_set) 10875 return; 10876 /* 10877 * Finally, we grab out the latest timestamp 10878 * that this packet was sent and then see 10879 * if: 10880 * a) The packet touches are newly defined GP range. 10881 * b) The time is greater than (newer) than the 10882 * one we currently have. If so we update 10883 * our sending end time window. 10884 * 10885 * Note we *do not* do this at send time. The reason 10886 * is that if you do you *may* pick up a newer timestamp 10887 * for a range you are not going to measure. We project 10888 * out how far and then sometimes modify that to be 10889 * smaller. If that occurs then you will have a send 10890 * that does not belong to the range included. 10891 */ 10892 if ((ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]) <= 10893 rack->r_ctl.rc_gp_cumack_ts) 10894 return; 10895 if (rack_in_gp_window(tp, rsm)) { 10896 rack->r_ctl.rc_gp_cumack_ts = ts; 10897 rack_log_gpset(rack, tp->gput_ack, (uint32_t)ts, rsm->r_end, 10898 __LINE__, from, rsm); 10899 } 10900 } 10901 10902 static void 10903 rack_process_to_cumack(struct tcpcb *tp, struct tcp_rack *rack, register uint32_t th_ack, uint32_t cts, struct tcpopt *to, uint64_t acktime) 10904 { 10905 struct rack_sendmap *rsm; 10906 /* 10907 * The ACK point is advancing to th_ack, we must drop off 10908 * the packets in the rack log and calculate any eligble 10909 * RTT's. 10910 */ 10911 10912 if (sack_filter_blks_used(&rack->r_ctl.rack_sf)) { 10913 /* 10914 * If we have some sack blocks in the filter 10915 * lets prune them out by calling sfb with no blocks. 10916 */ 10917 sack_filter_blks(tp, &rack->r_ctl.rack_sf, NULL, 0, th_ack); 10918 } 10919 if (SEQ_GT(th_ack, tp->snd_una)) { 10920 /* Clear any app ack remembered settings */ 10921 rack->r_ctl.cleared_app_ack = 0; 10922 } 10923 rack->r_wanted_output = 1; 10924 if (SEQ_GT(th_ack, tp->snd_una)) 10925 rack->r_ctl.last_cumack_advance = acktime; 10926 10927 /* Tend any TLP that has been marked for 1/2 the seq space (its old) */ 10928 if ((rack->rc_last_tlp_acked_set == 1)&& 10929 (rack->rc_last_tlp_past_cumack == 1) && 10930 (SEQ_GT(rack->r_ctl.last_tlp_acked_start, th_ack))) { 10931 /* 10932 * We have reached the point where our last rack 10933 * tlp retransmit sequence is ahead of the cum-ack. 10934 * This can only happen when the cum-ack moves all 10935 * the way around (its been a full 2^^31+1 bytes 10936 * or more since we sent a retransmitted TLP). Lets 10937 * turn off the valid flag since its not really valid. 10938 * 10939 * Note since sack's also turn on this event we have 10940 * a complication, we have to wait to age it out until 10941 * the cum-ack is by the TLP before checking which is 10942 * what the next else clause does. 
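* For example (illustrative sequence numbers): say the last retransmitted
* TLP covered 5000-6000. Once the cum-ack reaches 6000, the else clause
* below sets rc_last_tlp_past_cumack. Only if th_ack later appears to be
* behind 5000 again -- which takes roughly 2^31 bytes of sequence space --
* do we declare the saved block stale and clear it here.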
10943 */ 10944 rack_log_dsack_event(rack, 9, __LINE__, 10945 rack->r_ctl.last_tlp_acked_start, 10946 rack->r_ctl.last_tlp_acked_end); 10947 rack->rc_last_tlp_acked_set = 0; 10948 rack->rc_last_tlp_past_cumack = 0; 10949 } else if ((rack->rc_last_tlp_acked_set == 1) && 10950 (rack->rc_last_tlp_past_cumack == 0) && 10951 (SEQ_GEQ(th_ack, rack->r_ctl.last_tlp_acked_end))) { 10952 /* 10953 * It is safe to start aging TLP's out. 10954 */ 10955 rack->rc_last_tlp_past_cumack = 1; 10956 } 10957 /* We do the same for the tlp send seq as well */ 10958 if ((rack->rc_last_sent_tlp_seq_valid == 1) && 10959 (rack->rc_last_sent_tlp_past_cumack == 1) && 10960 (SEQ_GT(rack->r_ctl.last_sent_tlp_seq, th_ack))) { 10961 rack_log_dsack_event(rack, 9, __LINE__, 10962 rack->r_ctl.last_sent_tlp_seq, 10963 (rack->r_ctl.last_sent_tlp_seq + 10964 rack->r_ctl.last_sent_tlp_len)); 10965 rack->rc_last_sent_tlp_seq_valid = 0; 10966 rack->rc_last_sent_tlp_past_cumack = 0; 10967 } else if ((rack->rc_last_sent_tlp_seq_valid == 1) && 10968 (rack->rc_last_sent_tlp_past_cumack == 0) && 10969 (SEQ_GEQ(th_ack, rack->r_ctl.last_sent_tlp_seq))) { 10970 /* 10971 * It is safe to start aging TLP's send. 10972 */ 10973 rack->rc_last_sent_tlp_past_cumack = 1; 10974 } 10975 more: 10976 rsm = tqhash_min(rack->r_ctl.tqh); 10977 if (rsm == NULL) { 10978 if ((th_ack - 1) == tp->iss) { 10979 /* 10980 * For the SYN incoming case we will not 10981 * have called tcp_output for the sending of 10982 * the SYN, so there will be no map. All 10983 * other cases should probably be a panic. 10984 */ 10985 return; 10986 } 10987 if (tp->t_flags & TF_SENTFIN) { 10988 /* if we sent a FIN we often will not have map */ 10989 return; 10990 } 10991 #ifdef INVARIANTS 10992 panic("No rack map tp:%p for state:%d ack:%u rack:%p snd_una:%u snd_max:%u\n", 10993 tp, 10994 tp->t_state, th_ack, rack, 10995 tp->snd_una, tp->snd_max); 10996 #endif 10997 return; 10998 } 10999 if (SEQ_LT(th_ack, rsm->r_start)) { 11000 /* Huh map is missing this */ 11001 #ifdef INVARIANTS 11002 printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n", 11003 rsm->r_start, 11004 th_ack, tp->t_state, rack->r_state); 11005 #endif 11006 return; 11007 } 11008 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED, th_ack); 11009 11010 /* Now was it a retransmitted TLP? */ 11011 if ((rsm->r_flags & RACK_TLP) && 11012 (rsm->r_rtr_cnt > 1)) { 11013 /* 11014 * Yes, this rsm was a TLP and retransmitted, remember that 11015 * since if a DSACK comes back on this we don't want 11016 * to think of it as a reordered segment. This may 11017 * get updated again with possibly even other TLPs 11018 * in flight, but thats ok. Only when we don't send 11019 * a retransmitted TLP for 1/2 the sequences space 11020 * will it get turned off (above). 11021 */ 11022 if (rack->rc_last_tlp_acked_set && 11023 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 11024 /* 11025 * We already turned this on since the end matches, 11026 * the previous one was a partially ack now we 11027 * are getting another one (maybe all of it). 11028 */ 11029 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 11030 /* 11031 * Lets make sure we have all of it though. 
11032 */ 11033 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 11034 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 11035 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 11036 rack->r_ctl.last_tlp_acked_end); 11037 } 11038 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 11039 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 11040 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 11041 rack->r_ctl.last_tlp_acked_end); 11042 } 11043 } else { 11044 rack->rc_last_tlp_past_cumack = 1; 11045 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 11046 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 11047 rack->rc_last_tlp_acked_set = 1; 11048 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 11049 } 11050 } 11051 /* Now do we consume the whole thing? */ 11052 rack->r_ctl.last_tmit_time_acked = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 11053 if (SEQ_GEQ(th_ack, rsm->r_end)) { 11054 /* Its all consumed. */ 11055 uint32_t left; 11056 uint8_t newly_acked; 11057 11058 if (rsm->r_flags & RACK_WAS_LOST) { 11059 /* 11060 * This can happen when we marked it as lost 11061 * and yet before retransmitting we get an ack 11062 * which can happen due to reordering. 11063 */ 11064 rsm->r_flags &= ~RACK_WAS_LOST; 11065 KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)), 11066 ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); 11067 if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)) 11068 rack->r_ctl.rc_considered_lost -= rsm->r_end - rsm->r_start; 11069 else 11070 rack->r_ctl.rc_considered_lost = 0; 11071 } 11072 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_FREE, rsm->r_end, __LINE__); 11073 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 11074 rsm->r_rtr_bytes = 0; 11075 /* 11076 * Record the time of highest cumack sent if its in our measurement 11077 * window and possibly bump out the end. 11078 */ 11079 rack_rsm_sender_update(rack, tp, rsm, 4); 11080 tqhash_remove(rack->r_ctl.tqh, rsm, REMOVE_TYPE_CUMACK); 11081 if (rsm->r_in_tmap) { 11082 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 11083 rsm->r_in_tmap = 0; 11084 } 11085 newly_acked = 1; 11086 if (((rsm->r_flags & RACK_ACKED) == 0) && 11087 (IN_RECOVERY(tp->t_flags))) { 11088 rack->r_ctl.bytes_acked_in_recovery += (rsm->r_end - rsm->r_start); 11089 } 11090 if (rsm->r_flags & RACK_ACKED) { 11091 /* 11092 * It was acked on the scoreboard -- remove 11093 * it from total 11094 */ 11095 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 11096 newly_acked = 0; 11097 } else if (rsm->r_flags & RACK_SACK_PASSED) { 11098 /* 11099 * There are segments ACKED on the 11100 * scoreboard further up. We are seeing 11101 * reordering. 11102 */ 11103 rsm->r_flags &= ~RACK_SACK_PASSED; 11104 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 11105 rsm->r_flags |= RACK_ACKED; 11106 rack->r_ctl.rc_reorder_ts = cts; 11107 if (rack->r_ctl.rc_reorder_ts == 0) 11108 rack->r_ctl.rc_reorder_ts = 1; 11109 if (rack->r_ent_rec_ns) { 11110 /* 11111 * We have sent no more, and we saw an sack 11112 * then ack arrive. 
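* That pattern (no new sends, and the hole that earlier SACKs skipped
* over now arriving via cum-ack) points to reordering rather than loss,
* so rack_handle_might_revert() may later undo the congestion-state
* changes made when we entered recovery.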
11113 */ 11114 rack->r_might_revert = 1; 11115 } 11116 rack_update_pcm_ack(rack, 1, rsm->r_start, rsm->r_end); 11117 } else { 11118 rack_update_pcm_ack(rack, 1, rsm->r_start, rsm->r_end); 11119 } 11120 if ((rsm->r_flags & RACK_TO_REXT) && 11121 (tp->t_flags & TF_RCVD_TSTMP) && 11122 (to->to_flags & TOF_TS) && 11123 (to->to_tsecr != 0) && 11124 (tp->t_flags & TF_PREVVALID)) { 11125 /* 11126 * We can use the timestamp to see 11127 * if this retransmission was from the 11128 * first transmit. If so we made a mistake. 11129 */ 11130 tp->t_flags &= ~TF_PREVVALID; 11131 if (to->to_tsecr == rack_ts_to_msec(rsm->r_tim_lastsent[0])) { 11132 /* The first transmit is what this ack is for */ 11133 rack_cong_signal(tp, CC_RTO_ERR, th_ack, __LINE__); 11134 } 11135 } 11136 left = th_ack - rsm->r_end; 11137 if (rack->app_limited_needs_set && newly_acked) 11138 rack_need_set_test(tp, rack, rsm, th_ack, __LINE__, RACK_USE_END_OR_THACK); 11139 /* Free back to zone */ 11140 rack_free(rack, rsm); 11141 if (left) { 11142 goto more; 11143 } 11144 /* Check for reneging */ 11145 rsm = tqhash_min(rack->r_ctl.tqh); 11146 if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) { 11147 /* 11148 * The peer has moved snd_una up to 11149 * the edge of this send, i.e. one 11150 * that it had previously acked. The only 11151 * way that can be true if the peer threw 11152 * away data (space issues) that it had 11153 * previously sacked (else it would have 11154 * given us snd_una up to (rsm->r_end). 11155 * We need to undo the acked markings here. 11156 * 11157 * Note we have to look to make sure th_ack is 11158 * our rsm->r_start in case we get an old ack 11159 * where th_ack is behind snd_una. 11160 */ 11161 rack_peer_reneges(rack, rsm, th_ack); 11162 } 11163 return; 11164 } 11165 if (rsm->r_flags & RACK_ACKED) { 11166 /* 11167 * It was acked on the scoreboard -- remove it from 11168 * total for the part being cum-acked. 11169 */ 11170 rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); 11171 } else { 11172 if (((rsm->r_flags & RACK_ACKED) == 0) && 11173 (IN_RECOVERY(tp->t_flags))) { 11174 rack->r_ctl.bytes_acked_in_recovery += (th_ack - rsm->r_start); 11175 } 11176 rack_update_pcm_ack(rack, 1, rsm->r_start, th_ack); 11177 } 11178 /* And what about the lost flag? */ 11179 if (rsm->r_flags & RACK_WAS_LOST) { 11180 /* 11181 * This can happen when we marked it as lost 11182 * and yet before retransmitting we get an ack 11183 * which can happen due to reordering. In this 11184 * case its only a partial ack of the send. 11185 */ 11186 KASSERT((rack->r_ctl.rc_considered_lost >= (th_ack - rsm->r_start)), 11187 ("rsm:%p rack:%p rc_considered_lost goes negative th_ack:%u", rsm, rack, th_ack)); 11188 if (rack->r_ctl.rc_considered_lost >= (th_ack - rsm->r_start)) 11189 rack->r_ctl.rc_considered_lost -= th_ack - rsm->r_start; 11190 else 11191 rack->r_ctl.rc_considered_lost = 0; 11192 } 11193 /* 11194 * Clear the dup ack count for 11195 * the piece that remains. 11196 */ 11197 rsm->r_dupack = 0; 11198 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 11199 if (rsm->r_rtr_bytes) { 11200 /* 11201 * It was retransmitted adjust the 11202 * sack holes for what was acked. 11203 */ 11204 int ack_am; 11205 11206 ack_am = (th_ack - rsm->r_start); 11207 if (ack_am >= rsm->r_rtr_bytes) { 11208 rack->r_ctl.rc_holes_rxt -= ack_am; 11209 rsm->r_rtr_bytes -= ack_am; 11210 } 11211 } 11212 /* 11213 * Update where the piece starts and record 11214 * the time of send of highest cumack sent if 11215 * its in our GP range. 
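* (rack_rsm_sender_update() below does the GP bookkeeping, tqhash_trim()
* moves r_start up to th_ack, and the mbuf walk that follows advances
* rsm->m/rsm->soff to match the new start.)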
11216 */ 11217 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_TRIM_HEAD, th_ack, __LINE__); 11218 /* Now we need to move our offset forward too */ 11219 if (rsm->m && 11220 ((rsm->orig_m_len != rsm->m->m_len) || 11221 (M_TRAILINGROOM(rsm->m) != rsm->orig_t_space))) { 11222 /* Fix up the orig_m_len and possibly the mbuf offset */ 11223 rack_adjust_orig_mlen(rsm); 11224 } 11225 rsm->soff += (th_ack - rsm->r_start); 11226 rack_rsm_sender_update(rack, tp, rsm, 5); 11227 /* The trim will move th_ack into r_start for us */ 11228 tqhash_trim(rack->r_ctl.tqh, th_ack); 11229 /* Now do we need to move the mbuf fwd too? */ 11230 { 11231 struct mbuf *m; 11232 uint32_t soff; 11233 11234 m = rsm->m; 11235 soff = rsm->soff; 11236 if (m) { 11237 while (soff >= m->m_len) { 11238 soff -= m->m_len; 11239 KASSERT((m->m_next != NULL), 11240 (" rsm:%p off:%u soff:%u m:%p", 11241 rsm, rsm->soff, soff, m)); 11242 m = m->m_next; 11243 if (m == NULL) { 11244 /* 11245 * This is a fall-back that prevents a panic. In reality 11246 * we should be able to walk the mbuf's and find our place. 11247 * At this point snd_una has not been updated with the sbcut() yet 11248 * but tqhash_trim did update rsm->r_start so the offset calcuation 11249 * should work fine. This is undesirable since we will take cache 11250 * hits to access the socket buffer. And even more puzzling is that 11251 * it happens occasionally. It should not :( 11252 */ 11253 m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 11254 (rsm->r_start - tp->snd_una), 11255 &soff); 11256 break; 11257 } 11258 } 11259 /* 11260 * Now save in our updated values. 11261 */ 11262 rsm->m = m; 11263 rsm->soff = soff; 11264 rsm->orig_m_len = rsm->m->m_len; 11265 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 11266 } 11267 } 11268 if (rack->app_limited_needs_set && 11269 SEQ_GEQ(th_ack, tp->gput_seq)) 11270 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG); 11271 } 11272 11273 static void 11274 rack_handle_might_revert(struct tcpcb *tp, struct tcp_rack *rack) 11275 { 11276 struct rack_sendmap *rsm; 11277 int sack_pass_fnd = 0; 11278 11279 if (rack->r_might_revert) { 11280 /* 11281 * Ok we have reordering, have not sent anything, we 11282 * might want to revert the congestion state if nothing 11283 * further has SACK_PASSED on it. Lets check. 11284 * 11285 * We also get here when we have DSACKs come in for 11286 * all the data that we FR'd. Note that a rxt or tlp 11287 * timer clears this from happening. 11288 */ 11289 11290 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 11291 if (rsm->r_flags & RACK_SACK_PASSED) { 11292 sack_pass_fnd = 1; 11293 break; 11294 } 11295 } 11296 if (sack_pass_fnd == 0) { 11297 /* 11298 * We went into recovery 11299 * incorrectly due to reordering! 11300 */ 11301 int orig_cwnd; 11302 11303 rack->r_ent_rec_ns = 0; 11304 orig_cwnd = tp->snd_cwnd; 11305 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at_erec; 11306 tp->snd_recover = tp->snd_una; 11307 rack_log_to_prr(rack, 14, orig_cwnd, __LINE__); 11308 if (IN_RECOVERY(tp->t_flags)) { 11309 rack_exit_recovery(tp, rack, 3); 11310 if ((rack->rto_from_rec == 1) && (rack_ssthresh_rest_rto_rec != 0) ){ 11311 /* 11312 * We were in recovery, had an RTO 11313 * and then re-entered recovery (more sack's arrived) 11314 * and we have properly recorded the old ssthresh from 11315 * the first recovery. We want to be able to slow-start 11316 * back to this level. The ssthresh from the timeout 11317 * and then back into recovery will end up most likely 11318 * to be min(cwnd=1mss, 2mss). 
Which makes it basically 11319 * so we get no slow-start after our RTO. 11320 */ 11321 rack->rto_from_rec = 0; 11322 if (rack->r_ctl.rto_ssthresh > tp->snd_ssthresh) 11323 tp->snd_ssthresh = rack->r_ctl.rto_ssthresh; 11324 } 11325 } 11326 rack->r_ctl.bytes_acked_in_recovery = 0; 11327 rack->r_ctl.time_entered_recovery = 0; 11328 } 11329 rack->r_might_revert = 0; 11330 } 11331 } 11332 11333 11334 static int 11335 rack_note_dsack(struct tcp_rack *rack, tcp_seq start, tcp_seq end) 11336 { 11337 11338 uint32_t am, l_end; 11339 int was_tlp = 0; 11340 11341 if (SEQ_GT(end, start)) 11342 am = end - start; 11343 else 11344 am = 0; 11345 if ((rack->rc_last_tlp_acked_set ) && 11346 (SEQ_GEQ(start, rack->r_ctl.last_tlp_acked_start)) && 11347 (SEQ_LEQ(end, rack->r_ctl.last_tlp_acked_end))) { 11348 /* 11349 * The DSACK is because of a TLP which we don't 11350 * do anything with the reordering window over since 11351 * it was not reordering that caused the DSACK but 11352 * our previous retransmit TLP. 11353 */ 11354 rack_log_dsack_event(rack, 7, __LINE__, start, end); 11355 was_tlp = 1; 11356 goto skip_dsack_round; 11357 } 11358 if (rack->rc_last_sent_tlp_seq_valid) { 11359 l_end = rack->r_ctl.last_sent_tlp_seq + rack->r_ctl.last_sent_tlp_len; 11360 if (SEQ_GEQ(start, rack->r_ctl.last_sent_tlp_seq) && 11361 (SEQ_LEQ(end, l_end))) { 11362 /* 11363 * This dsack is from the last sent TLP, ignore it 11364 * for reordering purposes. 11365 */ 11366 rack_log_dsack_event(rack, 7, __LINE__, start, end); 11367 was_tlp = 1; 11368 goto skip_dsack_round; 11369 } 11370 } 11371 if (rack->rc_dsack_round_seen == 0) { 11372 rack->rc_dsack_round_seen = 1; 11373 rack->r_ctl.dsack_round_end = rack->rc_tp->snd_max; 11374 rack->r_ctl.num_dsack++; 11375 rack->r_ctl.dsack_persist = 16; /* 16 is from the standard */ 11376 rack_log_dsack_event(rack, 2, __LINE__, 0, 0); 11377 } 11378 skip_dsack_round: 11379 /* 11380 * We keep track of how many DSACK blocks we get 11381 * after a recovery incident. 11382 */ 11383 rack->r_ctl.dsack_byte_cnt += am; 11384 if (!IN_FASTRECOVERY(rack->rc_tp->t_flags) && 11385 rack->r_ctl.retran_during_recovery && 11386 (rack->r_ctl.dsack_byte_cnt >= rack->r_ctl.retran_during_recovery)) { 11387 /* 11388 * False recovery most likely culprit is reordering. If 11389 * nothing else is missing we need to revert. 11390 */ 11391 rack->r_might_revert = 1; 11392 rack_handle_might_revert(rack->rc_tp, rack); 11393 rack->r_might_revert = 0; 11394 rack->r_ctl.retran_during_recovery = 0; 11395 rack->r_ctl.dsack_byte_cnt = 0; 11396 } 11397 return (was_tlp); 11398 } 11399 11400 static uint32_t 11401 do_rack_compute_pipe(struct tcpcb *tp, struct tcp_rack *rack, uint32_t snd_una) 11402 { 11403 return (((tp->snd_max - snd_una) - 11404 (rack->r_ctl.rc_sacked + rack->r_ctl.rc_considered_lost)) + rack->r_ctl.rc_holes_rxt); 11405 } 11406 11407 static int32_t 11408 rack_compute_pipe(struct tcpcb *tp) 11409 { 11410 return ((int32_t)do_rack_compute_pipe(tp, 11411 (struct tcp_rack *)tp->t_fb_ptr, 11412 tp->snd_una)); 11413 } 11414 11415 static void 11416 rack_update_prr(struct tcpcb *tp, struct tcp_rack *rack, uint32_t changed, tcp_seq th_ack) 11417 { 11418 /* Deal with changed and PRR here (in recovery only) */ 11419 uint32_t pipe, snd_una; 11420 11421 rack->r_ctl.rc_prr_delivered += changed; 11422 11423 if (sbavail(&rack->rc_inp->inp_socket->so_snd) <= (tp->snd_max - tp->snd_una)) { 11424 /* 11425 * It is all outstanding, we are application limited 11426 * and thus we don't need more room to send anything. 
11427 * Note we use tp->snd_una here and not th_ack because 11428 * the data as yet not been cut from the sb. 11429 */ 11430 rack->r_ctl.rc_prr_sndcnt = 0; 11431 return; 11432 } 11433 /* Compute prr_sndcnt */ 11434 if (SEQ_GT(tp->snd_una, th_ack)) { 11435 snd_una = tp->snd_una; 11436 } else { 11437 snd_una = th_ack; 11438 } 11439 pipe = do_rack_compute_pipe(tp, rack, snd_una); 11440 if (pipe > tp->snd_ssthresh) { 11441 long sndcnt; 11442 11443 sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh; 11444 if (rack->r_ctl.rc_prr_recovery_fs > 0) 11445 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; 11446 else { 11447 rack->r_ctl.rc_prr_sndcnt = 0; 11448 rack_log_to_prr(rack, 9, 0, __LINE__); 11449 sndcnt = 0; 11450 } 11451 sndcnt++; 11452 if (sndcnt > (long)rack->r_ctl.rc_prr_out) 11453 sndcnt -= rack->r_ctl.rc_prr_out; 11454 else 11455 sndcnt = 0; 11456 rack->r_ctl.rc_prr_sndcnt = sndcnt; 11457 rack_log_to_prr(rack, 10, 0, __LINE__); 11458 } else { 11459 uint32_t limit; 11460 11461 if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out) 11462 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out); 11463 else 11464 limit = 0; 11465 if (changed > limit) 11466 limit = changed; 11467 limit += ctf_fixed_maxseg(tp); 11468 if (tp->snd_ssthresh > pipe) { 11469 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); 11470 rack_log_to_prr(rack, 11, 0, __LINE__); 11471 } else { 11472 rack->r_ctl.rc_prr_sndcnt = min(0, limit); 11473 rack_log_to_prr(rack, 12, 0, __LINE__); 11474 } 11475 } 11476 } 11477 11478 static void 11479 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered_recovery, int dup_ack_struck, 11480 int *dsack_seen, int *sacks_seen) 11481 { 11482 uint32_t changed; 11483 struct tcp_rack *rack; 11484 struct rack_sendmap *rsm; 11485 struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; 11486 register uint32_t th_ack; 11487 int32_t i, j, k, num_sack_blks = 0; 11488 uint32_t cts, acked, ack_point; 11489 int loop_start = 0; 11490 uint32_t tsused; 11491 uint32_t segsiz; 11492 11493 11494 INP_WLOCK_ASSERT(tptoinpcb(tp)); 11495 if (tcp_get_flags(th) & TH_RST) { 11496 /* We don't log resets */ 11497 return; 11498 } 11499 rack = (struct tcp_rack *)tp->t_fb_ptr; 11500 cts = tcp_get_usecs(NULL); 11501 rsm = tqhash_min(rack->r_ctl.tqh); 11502 changed = 0; 11503 th_ack = th->th_ack; 11504 segsiz = ctf_fixed_maxseg(rack->rc_tp); 11505 if (BYTES_THIS_ACK(tp, th) >= segsiz) { 11506 /* 11507 * You only get credit for 11508 * MSS and greater (and you get extra 11509 * credit for larger cum-ack moves). 11510 */ 11511 int ac; 11512 11513 ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp); 11514 counter_u64_add(rack_ack_total, ac); 11515 } 11516 if (SEQ_GT(th_ack, tp->snd_una)) { 11517 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__); 11518 tp->t_acktime = ticks; 11519 } 11520 if (rsm && SEQ_GT(th_ack, rsm->r_start)) 11521 changed = th_ack - rsm->r_start; 11522 if (changed) { 11523 rack_process_to_cumack(tp, rack, th_ack, cts, to, 11524 tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time)); 11525 } 11526 if ((to->to_flags & TOF_SACK) == 0) { 11527 /* We are done nothing left and no sack. */ 11528 rack_handle_might_revert(tp, rack); 11529 /* 11530 * For cases where we struck a dup-ack 11531 * with no SACK, add to the changes so 11532 * PRR will work right. 
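 * Crediting a single MSS here mimics the classic duplicate-ack accounting for peers that give us no SACK information.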
11533 */ 11534 if (dup_ack_struck && (changed == 0)) { 11535 changed += ctf_fixed_maxseg(rack->rc_tp); 11536 } 11537 goto out; 11538 } 11539 /* Sack block processing */ 11540 if (SEQ_GT(th_ack, tp->snd_una)) 11541 ack_point = th_ack; 11542 else 11543 ack_point = tp->snd_una; 11544 for (i = 0; i < to->to_nsacks; i++) { 11545 bcopy((to->to_sacks + i * TCPOLEN_SACK), 11546 &sack, sizeof(sack)); 11547 sack.start = ntohl(sack.start); 11548 sack.end = ntohl(sack.end); 11549 if (SEQ_GT(sack.end, sack.start) && 11550 SEQ_GT(sack.start, ack_point) && 11551 SEQ_LT(sack.start, tp->snd_max) && 11552 SEQ_GT(sack.end, ack_point) && 11553 SEQ_LEQ(sack.end, tp->snd_max)) { 11554 sack_blocks[num_sack_blks] = sack; 11555 num_sack_blks++; 11556 } else if (SEQ_LEQ(sack.start, th_ack) && 11557 SEQ_LEQ(sack.end, th_ack)) { 11558 int was_tlp; 11559 11560 if (dsack_seen != NULL) 11561 *dsack_seen = 1; 11562 was_tlp = rack_note_dsack(rack, sack.start, sack.end); 11563 /* 11564 * Its a D-SACK block. 11565 */ 11566 tcp_record_dsack(tp, sack.start, sack.end, was_tlp); 11567 } 11568 } 11569 if (rack->rc_dsack_round_seen) { 11570 /* Is the dsack roound over? */ 11571 if (SEQ_GEQ(th_ack, rack->r_ctl.dsack_round_end)) { 11572 /* Yes it is */ 11573 rack->rc_dsack_round_seen = 0; 11574 rack_log_dsack_event(rack, 3, __LINE__, 0, 0); 11575 } 11576 } 11577 /* 11578 * Sort the SACK blocks so we can update the rack scoreboard with 11579 * just one pass. 11580 */ 11581 num_sack_blks = sack_filter_blks(tp, &rack->r_ctl.rack_sf, sack_blocks, 11582 num_sack_blks, th->th_ack); 11583 ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks); 11584 if (sacks_seen != NULL) 11585 *sacks_seen = num_sack_blks; 11586 if (num_sack_blks == 0) { 11587 /* Nothing to sack, but we need to update counts */ 11588 goto out_with_totals; 11589 } 11590 /* Its a sack of some sort */ 11591 if (num_sack_blks < 2) { 11592 /* Only one, we don't need to sort */ 11593 goto do_sack_work; 11594 } 11595 /* Sort the sacks */ 11596 for (i = 0; i < num_sack_blks; i++) { 11597 for (j = i + 1; j < num_sack_blks; j++) { 11598 if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { 11599 sack = sack_blocks[i]; 11600 sack_blocks[i] = sack_blocks[j]; 11601 sack_blocks[j] = sack; 11602 } 11603 } 11604 } 11605 /* 11606 * Now are any of the sack block ends the same (yes some 11607 * implementations send these)? 11608 */ 11609 again: 11610 if (num_sack_blks == 0) 11611 goto out_with_totals; 11612 if (num_sack_blks > 1) { 11613 for (i = 0; i < num_sack_blks; i++) { 11614 for (j = i + 1; j < num_sack_blks; j++) { 11615 if (sack_blocks[i].end == sack_blocks[j].end) { 11616 /* 11617 * Ok these two have the same end we 11618 * want the smallest end and then 11619 * throw away the larger and start 11620 * again. 11621 */ 11622 if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) { 11623 /* 11624 * The second block covers 11625 * more area use that 11626 */ 11627 sack_blocks[i].start = sack_blocks[j].start; 11628 } 11629 /* 11630 * Now collapse out the dup-sack and 11631 * lower the count 11632 */ 11633 for (k = (j + 1); k < num_sack_blks; k++) { 11634 sack_blocks[j].start = sack_blocks[k].start; 11635 sack_blocks[j].end = sack_blocks[k].end; 11636 j++; 11637 } 11638 num_sack_blks--; 11639 goto again; 11640 } 11641 } 11642 } 11643 } 11644 do_sack_work: 11645 /* 11646 * First lets look to see if 11647 * we have retransmitted and 11648 * can use the transmit next? 
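 * The head of rc_tmap is the oldest transmit still outstanding; if the lowest sack block overlaps it, that block is most likely the SACK answering our retransmission, so it is processed first before the remaining blocks are walked from rc_sacklast.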
11649 */ 11650 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 11651 if (rsm && 11652 SEQ_GT(sack_blocks[0].end, rsm->r_start) && 11653 SEQ_LT(sack_blocks[0].start, rsm->r_end)) { 11654 /* 11655 * We probably did the FR and the next 11656 * SACK in continues as we would expect. 11657 */ 11658 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, segsiz); 11659 if (acked) { 11660 rack->r_wanted_output = 1; 11661 changed += acked; 11662 } 11663 if (num_sack_blks == 1) { 11664 /* 11665 * This is what we would expect from 11666 * a normal implementation to happen 11667 * after we have retransmitted the FR, 11668 * i.e the sack-filter pushes down 11669 * to 1 block and the next to be retransmitted 11670 * is the sequence in the sack block (has more 11671 * are acked). Count this as ACK'd data to boost 11672 * up the chances of recovering any false positives. 11673 */ 11674 counter_u64_add(rack_ack_total, (acked / ctf_fixed_maxseg(rack->rc_tp))); 11675 counter_u64_add(rack_express_sack, 1); 11676 goto out_with_totals; 11677 } else { 11678 /* 11679 * Start the loop through the 11680 * rest of blocks, past the first block. 11681 */ 11682 loop_start = 1; 11683 } 11684 } 11685 counter_u64_add(rack_sack_total, 1); 11686 rsm = rack->r_ctl.rc_sacklast; 11687 for (i = loop_start; i < num_sack_blks; i++) { 11688 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, segsiz); 11689 if (acked) { 11690 rack->r_wanted_output = 1; 11691 changed += acked; 11692 } 11693 } 11694 out_with_totals: 11695 if (num_sack_blks > 1) { 11696 /* 11697 * You get an extra stroke if 11698 * you have more than one sack-blk, this 11699 * could be where we are skipping forward 11700 * and the sack-filter is still working, or 11701 * it could be an attacker constantly 11702 * moving us. 11703 */ 11704 counter_u64_add(rack_move_some, 1); 11705 } 11706 out: 11707 if (changed) { 11708 /* Something changed cancel the rack timer */ 11709 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 11710 } 11711 tsused = tcp_get_usecs(NULL); 11712 rsm = tcp_rack_output(tp, rack, tsused); 11713 if ((!IN_FASTRECOVERY(tp->t_flags)) && 11714 rsm && 11715 ((rsm->r_flags & RACK_MUST_RXT) == 0)) { 11716 /* Enter recovery */ 11717 entered_recovery = 1; 11718 rack_cong_signal(tp, CC_NDUPACK, th_ack, __LINE__); 11719 /* 11720 * When we enter recovery we need to assure we send 11721 * one packet. 11722 */ 11723 if (rack->rack_no_prr == 0) { 11724 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 11725 rack_log_to_prr(rack, 8, 0, __LINE__); 11726 } 11727 rack->r_timer_override = 1; 11728 rack->r_early = 0; 11729 rack->r_ctl.rc_agg_early = 0; 11730 } else if (IN_FASTRECOVERY(tp->t_flags) && 11731 rsm && 11732 (rack->r_rr_config == 3)) { 11733 /* 11734 * Assure we can output and we get no 11735 * remembered pace time except the retransmit. 11736 */ 11737 rack->r_timer_override = 1; 11738 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 11739 rack->r_ctl.rc_resend = rsm; 11740 } 11741 if (IN_FASTRECOVERY(tp->t_flags) && 11742 (rack->rack_no_prr == 0) && 11743 (entered_recovery == 0)) { 11744 rack_update_prr(tp, rack, changed, th_ack); 11745 if ((rsm && (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) && 11746 ((tcp_in_hpts(rack->rc_tp) == 0) && 11747 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)))) { 11748 /* 11749 * If you are pacing output you don't want 11750 * to override. 
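 * Otherwise, with at least a full segment of PRR credit and no pacer timer pending, force an immediate output pass.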
11751 */ 11752 rack->r_early = 0; 11753 rack->r_ctl.rc_agg_early = 0; 11754 rack->r_timer_override = 1; 11755 } 11756 } 11757 } 11758 11759 static void 11760 rack_strike_dupack(struct tcp_rack *rack, tcp_seq th_ack) 11761 { 11762 struct rack_sendmap *rsm; 11763 11764 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 11765 while (rsm) { 11766 /* 11767 * We need to skip anything already set 11768 * to be retransmitted. 11769 */ 11770 if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || 11771 (rsm->r_flags & RACK_MUST_RXT)) { 11772 rsm = TAILQ_NEXT(rsm, r_tnext); 11773 continue; 11774 } 11775 break; 11776 } 11777 if (rsm && (rsm->r_dupack < 0xff)) { 11778 rsm->r_dupack++; 11779 if (rsm->r_dupack >= DUP_ACK_THRESHOLD) { 11780 struct timeval tv; 11781 uint32_t cts; 11782 /* 11783 * Here we see if we need to retransmit. For 11784 * a SACK type connection if enough time has passed 11785 * we will get a return of the rsm. For a non-sack 11786 * connection we will get the rsm returned if the 11787 * dupack value is 3 or more. 11788 */ 11789 cts = tcp_get_usecs(&tv); 11790 rack->r_ctl.rc_resend = tcp_rack_output(rack->rc_tp, rack, cts); 11791 if (rack->r_ctl.rc_resend != NULL) { 11792 if (!IN_FASTRECOVERY(rack->rc_tp->t_flags)) { 11793 rack_cong_signal(rack->rc_tp, CC_NDUPACK, 11794 th_ack, __LINE__); 11795 } 11796 rack->r_wanted_output = 1; 11797 rack->r_timer_override = 1; 11798 rack_log_retran_reason(rack, rsm, __LINE__, 1, 3); 11799 } 11800 } else { 11801 rack_log_retran_reason(rack, rsm, __LINE__, 0, 3); 11802 } 11803 } 11804 } 11805 11806 static void 11807 rack_check_bottom_drag(struct tcpcb *tp, 11808 struct tcp_rack *rack, 11809 struct socket *so) 11810 { 11811 /* 11812 * So what is dragging bottom? 11813 * 11814 * Dragging bottom means you were under pacing and had a 11815 * delay in processing inbound acks waiting on our pacing 11816 * timer to expire. While you were waiting all of the acknowledgments 11817 * for the packets you sent have arrived. This means we are pacing 11818 * way underneath the bottleneck to the point where our Goodput 11819 * measurements stop working, since they require more than one 11820 * ack (usually at least 8 packets worth with multiple acks so we can 11821 * gauge the inter-ack times). If that occurs we have a real problem 11822 * since we are stuck in a hole that we can't get out of without 11823 * something speeding us up. 11824 * 11825 * We also check to see if we are widdling down to just one segment 11826 * outstanding. If this occurs and we have room to send in our cwnd/rwnd 11827 * then we are adding the delayed ack interval into our measurments and 11828 * we need to speed up slightly. 11829 */ 11830 uint32_t segsiz, minseg; 11831 11832 segsiz = ctf_fixed_maxseg(tp); 11833 minseg = segsiz; 11834 if (tp->snd_max == tp->snd_una) { 11835 /* 11836 * We are doing dynamic pacing and we are way 11837 * under. Basically everything got acked while 11838 * we were still waiting on the pacer to expire. 11839 * 11840 * This means we need to boost the b/w in 11841 * addition to any earlier boosting of 11842 * the multiplier. 11843 */ 11844 uint64_t lt_bw; 11845 11846 tcp_trace_point(rack->rc_tp, TCP_TP_PACED_BOTTOM); 11847 lt_bw = rack_get_lt_bw(rack); 11848 rack->rc_dragged_bottom = 1; 11849 rack_validate_multipliers_at_or_above100(rack); 11850 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_VALID) && 11851 (rack->dis_lt_bw == 0) && 11852 (rack->use_lesser_lt_bw == 0) && 11853 (lt_bw > 0)) { 11854 /* 11855 * Lets use the long-term b/w we have 11856 * been getting as a base. 
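 * If no goodput estimate exists yet we seed gp_bw from it (capped as described below); if the long-term rate beats the current estimate we adopt it; otherwise we fall back to a plain multiplier increase.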
11857 */ 11858 if (rack->rc_gp_filled == 0) { 11859 if (lt_bw > ONE_POINT_TWO_MEG) { 11860 /* 11861 * If we have no measurement 11862 * don't let us set in more than 11863 * 1.2Mbps. If we are still too 11864 * low after pacing with this we 11865 * will hopefully have a max b/w 11866 * available to sanity check things. 11867 */ 11868 lt_bw = ONE_POINT_TWO_MEG; 11869 } 11870 rack->r_ctl.rc_rtt_diff = 0; 11871 rack->r_ctl.gp_bw = lt_bw; 11872 rack->rc_gp_filled = 1; 11873 if (rack->r_ctl.num_measurements < RACK_REQ_AVG) 11874 rack->r_ctl.num_measurements = RACK_REQ_AVG; 11875 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 11876 } else if (lt_bw > rack->r_ctl.gp_bw) { 11877 rack->r_ctl.rc_rtt_diff = 0; 11878 if (rack->r_ctl.num_measurements < RACK_REQ_AVG) 11879 rack->r_ctl.num_measurements = RACK_REQ_AVG; 11880 rack->r_ctl.gp_bw = lt_bw; 11881 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 11882 } else 11883 rack_increase_bw_mul(rack, -1, 0, 0, 1); 11884 if ((rack->gp_ready == 0) && 11885 (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) { 11886 /* We have enough measurements now */ 11887 rack->gp_ready = 1; 11888 if (rack->dgp_on || 11889 rack->rack_hibeta) 11890 rack_set_cc_pacing(rack); 11891 if (rack->defer_options) 11892 rack_apply_deferred_options(rack); 11893 } 11894 } else { 11895 /* 11896 * Zero RTT possibly? Settle for just the old-style increase. 11897 */ 11898 rack_increase_bw_mul(rack, -1, 0, 0, 1); 11899 } 11900 } else if ((IN_FASTRECOVERY(tp->t_flags) == 0) && 11901 (sbavail(&so->so_snd) > max((segsiz * (4 + rack_req_segs)), 11902 minseg)) && 11903 (rack->r_ctl.cwnd_to_use > max((segsiz * (rack_req_segs + 2)), minseg)) && 11904 (tp->snd_wnd > max((segsiz * (rack_req_segs + 2)), minseg)) && 11905 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) <= 11906 (segsiz * rack_req_segs))) { 11907 /* 11908 * We are doing dynamic GP pacing and 11909 * everything has been acked except 1MSS 11910 * or less that is still in flight. We are 11911 * still pacing away, and there is data that 11912 * could be sent. This means we are inserting 11913 * delayed-ack time into our measurements because we are pacing too slow.
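 * Count this as dragging bottom as well and nudge the bandwidth multiplier up so the pacer closes the gap.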
11914 */ 11915 rack_validate_multipliers_at_or_above100(rack); 11916 rack->rc_dragged_bottom = 1; 11917 rack_increase_bw_mul(rack, -1, 0, 0, 1); 11918 } 11919 } 11920 11921 #ifdef TCP_REQUEST_TRK 11922 static void 11923 rack_log_hybrid(struct tcp_rack *rack, uint32_t seq, 11924 struct tcp_sendfile_track *cur, uint8_t mod, int line, int err) 11925 { 11926 int do_log; 11927 11928 do_log = tcp_bblogging_on(rack->rc_tp); 11929 if (do_log == 0) { 11930 if ((do_log = tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING) )== 0) 11931 return; 11932 /* We only allow the three below with point logging on */ 11933 if ((mod != HYBRID_LOG_RULES_APP) && 11934 (mod != HYBRID_LOG_RULES_SET) && 11935 (mod != HYBRID_LOG_REQ_COMP)) 11936 return; 11937 11938 } 11939 if (do_log) { 11940 union tcp_log_stackspecific log; 11941 struct timeval tv; 11942 11943 /* Convert our ms to a microsecond */ 11944 memset(&log, 0, sizeof(log)); 11945 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 11946 log.u_bbr.flex1 = seq; 11947 log.u_bbr.cwnd_gain = line; 11948 if (cur != NULL) { 11949 uint64_t off; 11950 11951 log.u_bbr.flex2 = cur->start_seq; 11952 log.u_bbr.flex3 = cur->end_seq; 11953 log.u_bbr.flex4 = (uint32_t)((cur->localtime >> 32) & 0x00000000ffffffff); 11954 log.u_bbr.flex5 = (uint32_t)(cur->localtime & 0x00000000ffffffff); 11955 log.u_bbr.flex6 = cur->flags; 11956 log.u_bbr.pkts_out = cur->hybrid_flags; 11957 log.u_bbr.rttProp = cur->timestamp; 11958 log.u_bbr.cur_del_rate = cur->cspr; 11959 log.u_bbr.bw_inuse = cur->start; 11960 log.u_bbr.applimited = (uint32_t)(cur->end & 0x00000000ffffffff); 11961 log.u_bbr.delivered = (uint32_t)((cur->end >> 32) & 0x00000000ffffffff) ; 11962 log.u_bbr.epoch = (uint32_t)(cur->deadline & 0x00000000ffffffff); 11963 log.u_bbr.lt_epoch = (uint32_t)((cur->deadline >> 32) & 0x00000000ffffffff) ; 11964 log.u_bbr.inhpts = 1; 11965 #ifdef TCP_REQUEST_TRK 11966 off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_tcpreq_info[0]); 11967 log.u_bbr.use_lt_bw = (uint8_t)(off / sizeof(struct tcp_sendfile_track)); 11968 #endif 11969 } else { 11970 log.u_bbr.flex2 = err; 11971 } 11972 /* 11973 * Fill in flex7 to be CHD (catchup|hybrid|DGP) 11974 */ 11975 log.u_bbr.flex7 = rack->rc_catch_up; 11976 log.u_bbr.flex7 <<= 1; 11977 log.u_bbr.flex7 |= rack->rc_hybrid_mode; 11978 log.u_bbr.flex7 <<= 1; 11979 log.u_bbr.flex7 |= rack->dgp_on; 11980 /* 11981 * Compose bbr_state to be a bit wise 0000ADHF 11982 * where A is the always_pace flag 11983 * where D is the dgp_on flag 11984 * where H is the hybrid_mode on flag 11985 * where F is the use_fixed_rate flag. 
11986 */ 11987 log.u_bbr.bbr_state = rack->rc_always_pace; 11988 log.u_bbr.bbr_state <<= 1; 11989 log.u_bbr.bbr_state |= rack->dgp_on; 11990 log.u_bbr.bbr_state <<= 1; 11991 log.u_bbr.bbr_state |= rack->rc_hybrid_mode; 11992 log.u_bbr.bbr_state <<= 1; 11993 log.u_bbr.bbr_state |= rack->use_fixed_rate; 11994 log.u_bbr.flex8 = mod; 11995 log.u_bbr.delRate = rack->r_ctl.bw_rate_cap; 11996 log.u_bbr.bbr_substate = rack->r_ctl.client_suggested_maxseg; 11997 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 11998 log.u_bbr.pkt_epoch = rack->rc_tp->tcp_hybrid_start; 11999 log.u_bbr.lost = rack->rc_tp->tcp_hybrid_error; 12000 log.u_bbr.pacing_gain = (uint16_t)rack->rc_tp->tcp_hybrid_stop; 12001 tcp_log_event(rack->rc_tp, NULL, 12002 &rack->rc_inp->inp_socket->so_rcv, 12003 &rack->rc_inp->inp_socket->so_snd, 12004 TCP_HYBRID_PACING_LOG, 0, 12005 0, &log, false, NULL, __func__, __LINE__, &tv); 12006 } 12007 } 12008 #endif 12009 12010 #ifdef TCP_REQUEST_TRK 12011 static void 12012 rack_set_dgp_hybrid_mode(struct tcp_rack *rack, tcp_seq seq, uint32_t len, uint64_t cts) 12013 { 12014 struct tcp_sendfile_track *rc_cur, *orig_ent; 12015 struct tcpcb *tp; 12016 int err = 0; 12017 12018 orig_ent = rack->r_ctl.rc_last_sft; 12019 rc_cur = tcp_req_find_req_for_seq(rack->rc_tp, seq); 12020 if (rc_cur == NULL) { 12021 /* If not in the beginning what about the end piece */ 12022 if (rack->rc_hybrid_mode) 12023 rack_log_hybrid(rack, seq, NULL, HYBRID_LOG_NO_RANGE, __LINE__, err); 12024 rc_cur = tcp_req_find_req_for_seq(rack->rc_tp, (seq + len - 1)); 12025 } else { 12026 err = 12345; 12027 } 12028 /* If we find no parameters we are in straight DGP mode */ 12029 if(rc_cur == NULL) { 12030 /* None found for this seq, just DGP for now */ 12031 if (rack->rc_hybrid_mode) { 12032 rack->r_ctl.client_suggested_maxseg = 0; 12033 rack->rc_catch_up = 0; 12034 if (rack->cspr_is_fcc == 0) 12035 rack->r_ctl.bw_rate_cap = 0; 12036 else 12037 rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap; 12038 } 12039 if (rack->rc_hybrid_mode) { 12040 rack_log_hybrid(rack, (seq + len - 1), NULL, HYBRID_LOG_NO_RANGE, __LINE__, err); 12041 } 12042 if (rack->r_ctl.rc_last_sft) { 12043 rack->r_ctl.rc_last_sft = NULL; 12044 } 12045 return; 12046 } 12047 if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_WASSET) == 0) { 12048 /* This entry was never setup for hybrid pacing on/off etc */ 12049 if (rack->rc_hybrid_mode) { 12050 rack->r_ctl.client_suggested_maxseg = 0; 12051 rack->rc_catch_up = 0; 12052 rack->r_ctl.bw_rate_cap = 0; 12053 } 12054 if (rack->r_ctl.rc_last_sft) { 12055 rack->r_ctl.rc_last_sft = NULL; 12056 } 12057 if ((rc_cur->flags & TCP_TRK_TRACK_FLG_FSND) == 0) { 12058 rc_cur->flags |= TCP_TRK_TRACK_FLG_FSND; 12059 rc_cur->first_send = cts; 12060 rc_cur->sent_at_fs = rack->rc_tp->t_sndbytes; 12061 rc_cur->rxt_at_fs = rack->rc_tp->t_snd_rxt_bytes; 12062 } 12063 return; 12064 } 12065 /* 12066 * Ok if we have a new entry *or* have never 12067 * set up an entry we need to proceed. If 12068 * we have already set it up this entry we 12069 * just continue along with what we already 12070 * setup. 
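 * Setting up means caching the entry in rc_last_sft, turning any client-supplied pacing rate (cspr) into a bandwidth or fill-cw cap, honoring a suggested maximum segment size and, for catch-up mode, computing a deadline from the request length and rate.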
12071 */ 12072 tp = rack->rc_tp; 12073 if ((rack->r_ctl.rc_last_sft != NULL) && 12074 (rack->r_ctl.rc_last_sft == rc_cur)) { 12075 /* Its already in place */ 12076 if (rack->rc_hybrid_mode) 12077 rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_ISSAME, __LINE__, 0); 12078 return; 12079 } 12080 if (rack->rc_hybrid_mode == 0) { 12081 rack->r_ctl.rc_last_sft = rc_cur; 12082 if (orig_ent) { 12083 orig_ent->sent_at_ls = rack->rc_tp->t_sndbytes; 12084 orig_ent->rxt_at_ls = rack->rc_tp->t_snd_rxt_bytes; 12085 orig_ent->flags |= TCP_TRK_TRACK_FLG_LSND; 12086 } 12087 rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_RULES_APP, __LINE__, 0); 12088 return; 12089 } 12090 if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_CSPR) && rc_cur->cspr){ 12091 /* Compensate for all the header overhead's */ 12092 if (rack->cspr_is_fcc == 0) 12093 rack->r_ctl.bw_rate_cap = rack_compensate_for_linerate(rack, rc_cur->cspr); 12094 else 12095 rack->r_ctl.fillcw_cap = rack_compensate_for_linerate(rack, rc_cur->cspr); 12096 } else { 12097 if (rack->rc_hybrid_mode) { 12098 if (rack->cspr_is_fcc == 0) 12099 rack->r_ctl.bw_rate_cap = 0; 12100 else 12101 rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap; 12102 } 12103 } 12104 if (rc_cur->hybrid_flags & TCP_HYBRID_PACING_H_MS) 12105 rack->r_ctl.client_suggested_maxseg = rc_cur->hint_maxseg; 12106 else 12107 rack->r_ctl.client_suggested_maxseg = 0; 12108 if (rc_cur->timestamp == rack->r_ctl.last_tm_mark) { 12109 /* 12110 * It is the same timestamp as the previous one 12111 * add the hybrid flag that will indicate we use 12112 * sendtime not arrival time for catch-up mode. 12113 */ 12114 rc_cur->hybrid_flags |= TCP_HYBRID_PACING_SENDTIME; 12115 } 12116 if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_CU) && 12117 (rc_cur->cspr > 0)) { 12118 uint64_t len; 12119 12120 rack->rc_catch_up = 1; 12121 /* 12122 * Calculate the deadline time, first set the 12123 * time to when the request arrived. 12124 */ 12125 if (rc_cur->hybrid_flags & TCP_HYBRID_PACING_SENDTIME) { 12126 /* 12127 * For cases where its a duplicate tm (we received more 12128 * than one request for a tm) we want to use now, the point 12129 * where we are just sending the first bit of the request. 12130 */ 12131 rc_cur->deadline = cts; 12132 } else { 12133 /* 12134 * Here we have a different tm from the last request 12135 * so we want to use arrival time as our base. 12136 */ 12137 rc_cur->deadline = rc_cur->localtime; 12138 } 12139 /* 12140 * Next calculate the length and compensate for 12141 * TLS if need be. 12142 */ 12143 len = rc_cur->end - rc_cur->start; 12144 if (tp->t_inpcb.inp_socket->so_snd.sb_tls_info) { 12145 /* 12146 * This session is doing TLS. Take a swag guess 12147 * at the overhead. 12148 */ 12149 len += tcp_estimate_tls_overhead(tp->t_inpcb.inp_socket, len); 12150 } 12151 /* 12152 * Now considering the size, and the cspr, what is the time that 12153 * would be required at the cspr rate. Here we use the raw 12154 * cspr value since the client only looks at the raw data. We 12155 * do use len which includes TLS overhead, but not the TCP/IP etc. 12156 * That will get made up for in the CU pacing rate set. 12157 */ 12158 len *= HPTS_USEC_IN_SEC; 12159 len /= rc_cur->cspr; 12160 rc_cur->deadline += len; 12161 } else { 12162 rack->rc_catch_up = 0; 12163 rc_cur->deadline = 0; 12164 } 12165 if (rack->r_ctl.client_suggested_maxseg != 0) { 12166 /* 12167 * We need to reset the max pace segs if we have a 12168 * client_suggested_maxseg. 
12169 */ 12170 rack_set_pace_segments(tp, rack, __LINE__, NULL); 12171 } 12172 if (orig_ent) { 12173 orig_ent->sent_at_ls = rack->rc_tp->t_sndbytes; 12174 orig_ent->rxt_at_ls = rack->rc_tp->t_snd_rxt_bytes; 12175 orig_ent->flags |= TCP_TRK_TRACK_FLG_LSND; 12176 } 12177 rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_RULES_APP, __LINE__, 0); 12178 /* Remember it for next time and for CU mode */ 12179 rack->r_ctl.rc_last_sft = rc_cur; 12180 rack->r_ctl.last_tm_mark = rc_cur->timestamp; 12181 } 12182 #endif 12183 12184 static void 12185 rack_chk_req_and_hybrid_on_out(struct tcp_rack *rack, tcp_seq seq, uint32_t len, uint64_t cts) 12186 { 12187 #ifdef TCP_REQUEST_TRK 12188 struct tcp_sendfile_track *ent; 12189 12190 ent = rack->r_ctl.rc_last_sft; 12191 if ((ent == NULL) || 12192 (ent->flags == TCP_TRK_TRACK_FLG_EMPTY) || 12193 (SEQ_GEQ(seq, ent->end_seq))) { 12194 /* Time to update the track. */ 12195 rack_set_dgp_hybrid_mode(rack, seq, len, cts); 12196 ent = rack->r_ctl.rc_last_sft; 12197 } 12198 /* Out of all entries, nothing to track */ 12199 if (ent == NULL) { 12200 return; 12201 } 12202 if (SEQ_LT(ent->end_seq, (seq + len))) { 12203 /* 12204 * This is the case where our end_seq guess 12205 * was wrong. This is usually due to TLS having 12206 * more bytes than our guess. It could also be the 12207 * case that the client sent two requests close 12208 * together and the SB is full of both, so we are sending part 12209 * of each (end|beg). In such a case let's move this 12210 * entry's end to match the end of this send. That 12211 * way it will complete when all of it is acked. 12212 */ 12213 ent->end_seq = (seq + len); 12214 if (rack->rc_hybrid_mode) 12215 rack_log_hybrid_bw(rack, seq, len, 0, 0, HYBRID_LOG_EXTEND, 0, ent, __LINE__); 12216 } 12217 /* Now validate we have set the send time of this one */ 12218 if ((ent->flags & TCP_TRK_TRACK_FLG_FSND) == 0) { 12219 ent->flags |= TCP_TRK_TRACK_FLG_FSND; 12220 ent->first_send = cts; 12221 ent->sent_at_fs = rack->rc_tp->t_sndbytes; 12222 ent->rxt_at_fs = rack->rc_tp->t_snd_rxt_bytes; 12223 } 12224 #endif 12225 } 12226 12227 static void 12228 rack_gain_for_fastoutput(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t acked_amount) 12229 { 12230 /* 12231 * The fast output path is enabled and we 12232 * have moved the cumack forward. Let's see if 12233 * we can expand the fast path length forward by 12234 * that amount. What we would ideally like to 12235 * do is increase the number of bytes in the 12236 * fast path block (left_to_send) by the 12237 * acked amount. However we have to gate that 12238 * by two factors: 12239 * 1) The amount outstanding and the rwnd of the peer 12240 * (i.e. we don't want to exceed the rwnd of the peer). 12241 * <and> 12242 * 2) The amount of data left in the socket buffer (i.e. 12243 * we can't send beyond what is in the buffer). 12244 * 12245 * Note that this does not take into account any increase 12246 * in the cwnd. We will only extend the fast path by 12247 * what was acked.
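 * In other words, left_to_send may grow by acked_amount only while it stays within both the peer's remaining advertised window and the not-yet-sent data in the socket buffer.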
12248 */ 12249 uint32_t new_total, gating_val; 12250 12251 new_total = acked_amount + rack->r_ctl.fsb.left_to_send; 12252 gating_val = min((sbavail(&so->so_snd) - (tp->snd_max - tp->snd_una)), 12253 (tp->snd_wnd - (tp->snd_max - tp->snd_una))); 12254 if (new_total <= gating_val) { 12255 /* We can increase left_to_send by the acked amount */ 12256 counter_u64_add(rack_extended_rfo, 1); 12257 rack->r_ctl.fsb.left_to_send = new_total; 12258 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(&rack->rc_inp->inp_socket->so_snd) - (tp->snd_max - tp->snd_una))), 12259 ("rack:%p left_to_send:%u sbavail:%u out:%u", 12260 rack, rack->r_ctl.fsb.left_to_send, 12261 sbavail(&rack->rc_inp->inp_socket->so_snd), 12262 (tp->snd_max - tp->snd_una))); 12263 12264 } 12265 } 12266 12267 static void 12268 rack_adjust_sendmap_head(struct tcp_rack *rack, struct sockbuf *sb) 12269 { 12270 /* 12271 * Here any sendmap entry that points to the 12272 * beginning mbuf must be adjusted to the correct 12273 * offset. This must be called with: 12274 * 1) The socket buffer locked 12275 * 2) snd_una adjusted to its new position. 12276 * 12277 * Note that (2) implies rack_ack_received has also 12278 * been called and all the sbcut's have been done. 12279 * 12280 * We grab the first mbuf in the socket buffer and 12281 * then go through the front of the sendmap, recalculating 12282 * the stored offset for any sendmap entry that has 12283 * that mbuf. We must use the sb functions to do this 12284 * since its possible an add was done has well as 12285 * the subtraction we may have just completed. This should 12286 * not be a penalty though, since we just referenced the sb 12287 * to go in and trim off the mbufs that we freed (of course 12288 * there will be a penalty for the sendmap references though). 12289 * 12290 * Note also with INVARIANT on, we validate with a KASSERT 12291 * that the first sendmap entry has a soff of 0. 12292 * 12293 */ 12294 struct mbuf *m; 12295 struct rack_sendmap *rsm; 12296 tcp_seq snd_una; 12297 #ifdef INVARIANTS 12298 int first_processed = 0; 12299 #endif 12300 12301 snd_una = rack->rc_tp->snd_una; 12302 SOCKBUF_LOCK_ASSERT(sb); 12303 m = sb->sb_mb; 12304 rsm = tqhash_min(rack->r_ctl.tqh); 12305 if ((rsm == NULL) || (m == NULL)) { 12306 /* Nothing outstanding */ 12307 return; 12308 } 12309 /* The very first RSM's mbuf must point to the head mbuf in the sb */ 12310 KASSERT((rsm->m == m), 12311 ("Rack:%p sb:%p rsm:%p -- first rsm mbuf not aligned to sb", 12312 rack, sb, rsm)); 12313 while (rsm->m && (rsm->m == m)) { 12314 /* one to adjust */ 12315 #ifdef INVARIANTS 12316 struct mbuf *tm; 12317 uint32_t soff; 12318 12319 tm = sbsndmbuf(sb, (rsm->r_start - snd_una), &soff); 12320 if ((rsm->orig_m_len != m->m_len) || 12321 (rsm->orig_t_space != M_TRAILINGROOM(m))){ 12322 rack_adjust_orig_mlen(rsm); 12323 } 12324 if (first_processed == 0) { 12325 KASSERT((rsm->soff == 0), 12326 ("Rack:%p rsm:%p -- rsm at head but soff not zero", 12327 rack, rsm)); 12328 first_processed = 1; 12329 } 12330 if ((rsm->soff != soff) || (rsm->m != tm)) { 12331 /* 12332 * This is not a fatal error, we anticipate it 12333 * might happen (the else code), so we count it here 12334 * so that under invariant we can see that it really 12335 * does happen. 
12336 */ 12337 counter_u64_add(rack_adjust_map_bw, 1); 12338 } 12339 rsm->m = tm; 12340 rsm->soff = soff; 12341 if (tm) { 12342 rsm->orig_m_len = rsm->m->m_len; 12343 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 12344 } else { 12345 rsm->orig_m_len = 0; 12346 rsm->orig_t_space = 0; 12347 } 12348 #else 12349 rsm->m = sbsndmbuf(sb, (rsm->r_start - snd_una), &rsm->soff); 12350 if (rsm->m) { 12351 rsm->orig_m_len = rsm->m->m_len; 12352 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 12353 } else { 12354 rsm->orig_m_len = 0; 12355 rsm->orig_t_space = 0; 12356 } 12357 #endif 12358 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 12359 if (rsm == NULL) 12360 break; 12361 } 12362 } 12363 12364 #ifdef TCP_REQUEST_TRK 12365 static inline void 12366 rack_req_check_for_comp(struct tcp_rack *rack, tcp_seq th_ack) 12367 { 12368 struct tcp_sendfile_track *ent; 12369 int i; 12370 12371 if ((rack->rc_hybrid_mode == 0) && 12372 (tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING) == 0)) { 12373 /* 12374 * Just do normal completions hybrid pacing is not on 12375 * and CLDL is off as well. 12376 */ 12377 tcp_req_check_for_comp(rack->rc_tp, th_ack); 12378 return; 12379 } 12380 /* 12381 * Originally I was just going to find the th_ack associated 12382 * with an entry. But then I realized a large strech ack could 12383 * in theory ack two or more requests at once. So instead we 12384 * need to find all entries that are completed by th_ack not 12385 * just a single entry and do our logging. 12386 */ 12387 ent = tcp_req_find_a_req_that_is_completed_by(rack->rc_tp, th_ack, &i); 12388 while (ent != NULL) { 12389 /* 12390 * We may be doing hybrid pacing or CLDL and need more details possibly 12391 * so we do it manually instead of calling 12392 * tcp_req_check_for_comp() 12393 */ 12394 uint64_t laa, tim, data, cbw, ftim; 12395 12396 /* Ok this ack frees it */ 12397 rack_log_hybrid(rack, th_ack, 12398 ent, HYBRID_LOG_REQ_COMP, __LINE__, 0); 12399 rack_log_hybrid_sends(rack, ent, __LINE__); 12400 /* calculate the time based on the ack arrival */ 12401 data = ent->end - ent->start; 12402 laa = tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time); 12403 if (ent->flags & TCP_TRK_TRACK_FLG_FSND) { 12404 if (ent->first_send > ent->localtime) 12405 ftim = ent->first_send; 12406 else 12407 ftim = ent->localtime; 12408 } else { 12409 /* TSNH */ 12410 ftim = ent->localtime; 12411 } 12412 if (laa > ent->localtime) 12413 tim = laa - ftim; 12414 else 12415 tim = 0; 12416 cbw = data * HPTS_USEC_IN_SEC; 12417 if (tim > 0) 12418 cbw /= tim; 12419 else 12420 cbw = 0; 12421 rack_log_hybrid_bw(rack, th_ack, cbw, tim, data, HYBRID_LOG_BW_MEASURE, 0, ent, __LINE__); 12422 /* 12423 * Check to see if we are freeing what we are pointing to send wise 12424 * if so be sure to NULL the pointer so we know we are no longer 12425 * set to anything. 
12426 */ 12427 if (ent == rack->r_ctl.rc_last_sft) { 12428 rack->r_ctl.rc_last_sft = NULL; 12429 if (rack->rc_hybrid_mode) { 12430 rack->rc_catch_up = 0; 12431 if (rack->cspr_is_fcc == 0) 12432 rack->r_ctl.bw_rate_cap = 0; 12433 else 12434 rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap; 12435 rack->r_ctl.client_suggested_maxseg = 0; 12436 } 12437 } 12438 /* Generate the log that the tcp_netflix call would have */ 12439 tcp_req_log_req_info(rack->rc_tp, ent, 12440 i, TCP_TRK_REQ_LOG_FREED, 0, 0); 12441 /* Free it and see if there is another one */ 12442 tcp_req_free_a_slot(rack->rc_tp, ent); 12443 ent = tcp_req_find_a_req_that_is_completed_by(rack->rc_tp, th_ack, &i); 12444 } 12445 } 12446 #endif 12447 12448 12449 /* 12450 * Return value of 1, we do not need to call rack_process_data(). 12451 * return value of 0, rack_process_data can be called. 12452 * For ret_val if its 0 the TCP is locked, if its non-zero 12453 * its unlocked and probably unsafe to touch the TCB. 12454 */ 12455 static int 12456 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, 12457 struct tcpcb *tp, struct tcpopt *to, 12458 uint32_t tiwin, int32_t tlen, 12459 int32_t * ofia, int32_t thflags, int32_t *ret_val, int32_t orig_tlen) 12460 { 12461 int32_t ourfinisacked = 0; 12462 int32_t nsegs, acked_amount; 12463 int32_t acked; 12464 struct mbuf *mfree; 12465 struct tcp_rack *rack; 12466 int32_t under_pacing = 0; 12467 int32_t post_recovery = 0; 12468 uint32_t p_cwnd; 12469 12470 INP_WLOCK_ASSERT(tptoinpcb(tp)); 12471 12472 rack = (struct tcp_rack *)tp->t_fb_ptr; 12473 if (SEQ_GT(th->th_ack, tp->snd_max)) { 12474 __ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val, 12475 &rack->r_ctl.challenge_ack_ts, 12476 &rack->r_ctl.challenge_ack_cnt); 12477 rack->r_wanted_output = 1; 12478 return (1); 12479 } 12480 if (rack->gp_ready && 12481 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 12482 under_pacing = 1; 12483 } 12484 if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { 12485 int in_rec, dup_ack_struck = 0; 12486 int dsack_seen = 0, sacks_seen = 0; 12487 12488 in_rec = IN_FASTRECOVERY(tp->t_flags); 12489 if (rack->rc_in_persist) { 12490 tp->t_rxtshift = 0; 12491 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 12492 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 12493 } 12494 12495 if ((th->th_ack == tp->snd_una) && 12496 (tiwin == tp->snd_wnd) && 12497 (orig_tlen == 0) && 12498 ((to->to_flags & TOF_SACK) == 0)) { 12499 rack_strike_dupack(rack, th->th_ack); 12500 dup_ack_struck = 1; 12501 } 12502 rack_log_ack(tp, to, th, ((in_rec == 0) && IN_FASTRECOVERY(tp->t_flags)), 12503 dup_ack_struck, &dsack_seen, &sacks_seen); 12504 12505 } 12506 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 12507 /* 12508 * Old ack, behind (or duplicate to) the last one rcv'd 12509 * Note: We mark reordering is occuring if its 12510 * less than and we have not closed our window. 12511 */ 12512 if (SEQ_LT(th->th_ack, tp->snd_una) && (sbspace(&so->so_rcv) > ctf_fixed_maxseg(tp))) { 12513 rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 12514 if (rack->r_ctl.rc_reorder_ts == 0) 12515 rack->r_ctl.rc_reorder_ts = 1; 12516 } 12517 return (0); 12518 } 12519 /* 12520 * If we reach this point, ACK is not a duplicate, i.e., it ACKs 12521 * something we sent. 12522 */ 12523 if (tp->t_flags & TF_NEEDSYN) { 12524 /* 12525 * T/TCP: Connection was half-synchronized, and our SYN has 12526 * been ACK'd (so connection is now fully synchronized). 
Go 12527 * to non-starred state, increment snd_una for ACK of SYN, 12528 * and check if we can do window scaling. 12529 */ 12530 tp->t_flags &= ~TF_NEEDSYN; 12531 tp->snd_una++; 12532 /* Do window scaling? */ 12533 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 12534 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 12535 tp->rcv_scale = tp->request_r_scale; 12536 /* Send window already scaled. */ 12537 } 12538 } 12539 nsegs = max(1, m->m_pkthdr.lro_nsegs); 12540 12541 acked = BYTES_THIS_ACK(tp, th); 12542 if (acked) { 12543 /* 12544 * Any time we move the cum-ack forward clear 12545 * keep-alive tied probe-not-answered. The 12546 * persists clears its own on entry. 12547 */ 12548 rack->probe_not_answered = 0; 12549 } 12550 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 12551 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 12552 /* 12553 * If we just performed our first retransmit, and the ACK arrives 12554 * within our recovery window, then it was a mistake to do the 12555 * retransmit in the first place. Recover our original cwnd and 12556 * ssthresh, and proceed to transmit where we left off. 12557 */ 12558 if ((tp->t_flags & TF_PREVVALID) && 12559 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) { 12560 tp->t_flags &= ~TF_PREVVALID; 12561 if (tp->t_rxtshift == 1 && 12562 (int)(ticks - tp->t_badrxtwin) < 0) 12563 rack_cong_signal(tp, CC_RTO_ERR, th->th_ack, __LINE__); 12564 } 12565 if (acked) { 12566 /* assure we are not backed off */ 12567 tp->t_rxtshift = 0; 12568 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 12569 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 12570 rack->rc_tlp_in_progress = 0; 12571 rack->r_ctl.rc_tlp_cnt_out = 0; 12572 /* 12573 * If it is the RXT timer we want to 12574 * stop it, so we can restart a TLP. 12575 */ 12576 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 12577 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 12578 #ifdef TCP_REQUEST_TRK 12579 rack_req_check_for_comp(rack, th->th_ack); 12580 #endif 12581 } 12582 /* 12583 * If we have a timestamp reply, update smoothed round trip time. If 12584 * no timestamp is present but transmit timer is running and timed 12585 * sequence number was acked, update smoothed round trip time. Since 12586 * we now have an rtt measurement, cancel the timer backoff (cf., 12587 * Phil Karn's retransmit alg.). Recompute the initial retransmit 12588 * timer. 12589 * 12590 * Some boxes send broken timestamp replies during the SYN+ACK 12591 * phase, ignore timestamps of 0 or we could calculate a huge RTT 12592 * and blow up the retransmit timer. 12593 */ 12594 /* 12595 * If all outstanding data is acked, stop retransmit timer and 12596 * remember to restart (more output or persist). If there is more 12597 * data to be acked, restart retransmit timer, using current 12598 * (possibly backed-off) value. 12599 */ 12600 if (acked == 0) { 12601 if (ofia) 12602 *ofia = ourfinisacked; 12603 return (0); 12604 } 12605 if (IN_RECOVERY(tp->t_flags)) { 12606 if (SEQ_LT(th->th_ack, tp->snd_recover) && 12607 (SEQ_LT(th->th_ack, tp->snd_max))) { 12608 tcp_rack_partialack(tp); 12609 } else { 12610 rack_post_recovery(tp, th->th_ack); 12611 post_recovery = 1; 12612 /* 12613 * Grab the segsiz, multiply by 2 and add the snd_cwnd 12614 * that is the max the CC should add if we are exiting 12615 * recovery and doing a late add. 
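 * That is, p_cwnd = snd_cwnd + 2 * min(maxseg, pace_min_segs); once rack_ack_received() has run, a cwnd that grew beyond p_cwnd is clamped back to it.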
12616 */ 12617 p_cwnd = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 12618 p_cwnd <<= 1; 12619 p_cwnd += tp->snd_cwnd; 12620 } 12621 } else if ((rack->rto_from_rec == 1) && 12622 SEQ_GEQ(th->th_ack, tp->snd_recover)) { 12623 /* 12624 * We were in recovery, hit a rxt timeout 12625 * and never re-entered recovery. The timeout(s) 12626 * made up all the lost data. In such a case 12627 * we need to clear the rto_from_rec flag. 12628 */ 12629 rack->rto_from_rec = 0; 12630 } 12631 /* 12632 * Let the congestion control algorithm update congestion control 12633 * related information. This typically means increasing the 12634 * congestion window. 12635 */ 12636 rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, post_recovery); 12637 if (post_recovery && 12638 (tp->snd_cwnd > p_cwnd)) { 12639 /* Must be non-newreno (cubic) getting too ahead of itself */ 12640 tp->snd_cwnd = p_cwnd; 12641 } 12642 SOCKBUF_LOCK(&so->so_snd); 12643 acked_amount = min(acked, (int)sbavail(&so->so_snd)); 12644 tp->snd_wnd -= acked_amount; 12645 mfree = sbcut_locked(&so->so_snd, acked_amount); 12646 if ((sbused(&so->so_snd) == 0) && 12647 (acked > acked_amount) && 12648 (tp->t_state >= TCPS_FIN_WAIT_1) && 12649 (tp->t_flags & TF_SENTFIN)) { 12650 /* 12651 * We must be sure our fin 12652 * was sent and acked (we can be 12653 * in FIN_WAIT_1 without having 12654 * sent the fin). 12655 */ 12656 ourfinisacked = 1; 12657 } 12658 tp->snd_una = th->th_ack; 12659 /* wakeups? */ 12660 if (acked_amount && sbavail(&so->so_snd)) 12661 rack_adjust_sendmap_head(rack, &so->so_snd); 12662 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); 12663 /* NB: sowwakeup_locked() does an implicit unlock. */ 12664 sowwakeup_locked(so); 12665 m_freem(mfree); 12666 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 12667 tp->snd_recover = tp->snd_una; 12668 12669 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { 12670 tp->snd_nxt = tp->snd_max; 12671 } 12672 if (under_pacing && 12673 (rack->use_fixed_rate == 0) && 12674 (rack->in_probe_rtt == 0) && 12675 rack->rc_gp_dyn_mul && 12676 rack->rc_always_pace) { 12677 /* Check if we are dragging bottom */ 12678 rack_check_bottom_drag(tp, rack, so); 12679 } 12680 if (tp->snd_una == tp->snd_max) { 12681 /* Nothing left outstanding */ 12682 tp->t_flags &= ~TF_PREVVALID; 12683 rack->r_ctl.idle_snd_una = tp->snd_una; 12684 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 12685 if (rack->r_ctl.rc_went_idle_time == 0) 12686 rack->r_ctl.rc_went_idle_time = 1; 12687 rack->r_ctl.retran_during_recovery = 0; 12688 rack->r_ctl.dsack_byte_cnt = 0; 12689 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 12690 if (sbavail(&tptosocket(tp)->so_snd) == 0) 12691 tp->t_acktime = 0; 12692 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 12693 rack->rc_suspicious = 0; 12694 /* Set need output so persist might get set */ 12695 rack->r_wanted_output = 1; 12696 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 12697 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 12698 (sbavail(&so->so_snd) == 0) && 12699 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 12700 /* 12701 * The socket was gone and the 12702 * peer sent data (now or in the past), time to 12703 * reset him. 
12704 */ 12705 *ret_val = 1; 12706 /* tcp_close will kill the inp pre-log the Reset */ 12707 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 12708 tp = tcp_close(tp); 12709 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); 12710 return (1); 12711 } 12712 } 12713 if (ofia) 12714 *ofia = ourfinisacked; 12715 return (0); 12716 } 12717 12718 12719 static void 12720 rack_log_collapse(struct tcp_rack *rack, uint32_t cnt, uint32_t split, uint32_t out, int line, 12721 int dir, uint32_t flags, struct rack_sendmap *rsm) 12722 { 12723 if (tcp_bblogging_on(rack->rc_tp)) { 12724 union tcp_log_stackspecific log; 12725 struct timeval tv; 12726 12727 memset(&log, 0, sizeof(log)); 12728 log.u_bbr.flex1 = cnt; 12729 log.u_bbr.flex2 = split; 12730 log.u_bbr.flex3 = out; 12731 log.u_bbr.flex4 = line; 12732 log.u_bbr.flex5 = rack->r_must_retran; 12733 log.u_bbr.flex6 = flags; 12734 log.u_bbr.flex7 = rack->rc_has_collapsed; 12735 log.u_bbr.flex8 = dir; /* 12736 * 1 is collapsed, 0 is uncollapsed, 12737 * 2 is log of a rsm being marked, 3 is a split. 12738 */ 12739 if (rsm == NULL) 12740 log.u_bbr.rttProp = 0; 12741 else 12742 log.u_bbr.rttProp = (uint64_t)rsm; 12743 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 12744 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 12745 TCP_LOG_EVENTP(rack->rc_tp, NULL, 12746 &rack->rc_inp->inp_socket->so_rcv, 12747 &rack->rc_inp->inp_socket->so_snd, 12748 TCP_RACK_LOG_COLLAPSE, 0, 12749 0, &log, false, &tv); 12750 } 12751 } 12752 12753 static void 12754 rack_collapsed_window(struct tcp_rack *rack, uint32_t out, tcp_seq th_ack, int line) 12755 { 12756 /* 12757 * Here all we do is mark the collapsed point and set the flag. 12758 * This may happen again and again, but there is no 12759 * sense splitting our map until we know where the 12760 * peer finally lands in the collapse. 12761 */ 12762 tcp_trace_point(rack->rc_tp, TCP_TP_COLLAPSED_WND); 12763 if ((rack->rc_has_collapsed == 0) || 12764 (rack->r_ctl.last_collapse_point != (th_ack + rack->rc_tp->snd_wnd))) 12765 counter_u64_add(rack_collapsed_win_seen, 1); 12766 rack->r_ctl.last_collapse_point = th_ack + rack->rc_tp->snd_wnd; 12767 rack->r_ctl.high_collapse_point = rack->rc_tp->snd_max; 12768 rack->rc_has_collapsed = 1; 12769 rack->r_collapse_point_valid = 1; 12770 rack_log_collapse(rack, 0, th_ack, rack->r_ctl.last_collapse_point, line, 1, 0, NULL); 12771 } 12772 12773 static void 12774 rack_un_collapse_window(struct tcp_rack *rack, int line) 12775 { 12776 struct rack_sendmap *nrsm, *rsm; 12777 int cnt = 0, split = 0; 12778 int insret __diagused; 12779 12780 12781 tcp_trace_point(rack->rc_tp, TCP_TP_COLLAPSED_WND); 12782 rack->rc_has_collapsed = 0; 12783 rsm = tqhash_find(rack->r_ctl.tqh, rack->r_ctl.last_collapse_point); 12784 if (rsm == NULL) { 12785 /* Nothing to do maybe the peer ack'ed it all */ 12786 rack_log_collapse(rack, 0, 0, ctf_outstanding(rack->rc_tp), line, 0, 0, NULL); 12787 return; 12788 } 12789 /* Now do we need to split this one? */ 12790 if (SEQ_GT(rack->r_ctl.last_collapse_point, rsm->r_start)) { 12791 rack_log_collapse(rack, rsm->r_start, rsm->r_end, 12792 rack->r_ctl.last_collapse_point, line, 3, rsm->r_flags, rsm); 12793 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 12794 if (nrsm == NULL) { 12795 /* We can't get a rsm, mark all? 
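Without a spare entry we cannot split at the collapse point, so we mark from the start of this rsm instead, which may flag a few in-window bytes as collapsed.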
*/ 12796 nrsm = rsm; 12797 goto no_split; 12798 } 12799 /* Clone it */ 12800 split = 1; 12801 rack_clone_rsm(rack, nrsm, rsm, rack->r_ctl.last_collapse_point); 12802 #ifndef INVARIANTS 12803 (void)tqhash_insert(rack->r_ctl.tqh, nrsm); 12804 #else 12805 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { 12806 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p", 12807 nrsm, insret, rack, rsm); 12808 } 12809 #endif 12810 rack_log_map_chg(rack->rc_tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 12811 rack->r_ctl.last_collapse_point, __LINE__); 12812 if (rsm->r_in_tmap) { 12813 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 12814 nrsm->r_in_tmap = 1; 12815 } 12816 /* 12817 * Set in the new RSM as the 12818 * collapsed starting point 12819 */ 12820 rsm = nrsm; 12821 } 12822 12823 no_split: 12824 TQHASH_FOREACH_FROM(nrsm, rack->r_ctl.tqh, rsm) { 12825 cnt++; 12826 nrsm->r_flags |= RACK_RWND_COLLAPSED; 12827 rack_log_collapse(rack, nrsm->r_start, nrsm->r_end, 0, line, 4, nrsm->r_flags, nrsm); 12828 cnt++; 12829 } 12830 if (cnt) { 12831 counter_u64_add(rack_collapsed_win, 1); 12832 } 12833 rack_log_collapse(rack, cnt, split, ctf_outstanding(rack->rc_tp), line, 0, 0, NULL); 12834 } 12835 12836 static void 12837 rack_handle_delayed_ack(struct tcpcb *tp, struct tcp_rack *rack, 12838 int32_t tlen, int32_t tfo_syn) 12839 { 12840 if (DELAY_ACK(tp, tlen) || tfo_syn) { 12841 rack_timer_cancel(tp, rack, 12842 rack->r_ctl.rc_rcvtime, __LINE__); 12843 tp->t_flags |= TF_DELACK; 12844 } else { 12845 rack->r_wanted_output = 1; 12846 tp->t_flags |= TF_ACKNOW; 12847 } 12848 } 12849 12850 static void 12851 rack_validate_fo_sendwin_up(struct tcpcb *tp, struct tcp_rack *rack) 12852 { 12853 /* 12854 * If fast output is in progress, lets validate that 12855 * the new window did not shrink on us and make it 12856 * so fast output should end. 12857 */ 12858 if (rack->r_fast_output) { 12859 uint32_t out; 12860 12861 /* 12862 * Calculate what we will send if left as is 12863 * and compare that to our send window. 12864 */ 12865 out = ctf_outstanding(tp); 12866 if ((out + rack->r_ctl.fsb.left_to_send) > tp->snd_wnd) { 12867 /* ok we have an issue */ 12868 if (out >= tp->snd_wnd) { 12869 /* Turn off fast output the window is met or collapsed */ 12870 rack->r_fast_output = 0; 12871 } else { 12872 /* we have some room left */ 12873 rack->r_ctl.fsb.left_to_send = tp->snd_wnd - out; 12874 if (rack->r_ctl.fsb.left_to_send < ctf_fixed_maxseg(tp)) { 12875 /* If not at least 1 full segment never mind */ 12876 rack->r_fast_output = 0; 12877 } 12878 } 12879 } 12880 } 12881 } 12882 12883 /* 12884 * Return value of 1, the TCB is unlocked and most 12885 * likely gone, return value of 0, the TCP is still 12886 * locked. 12887 */ 12888 static int 12889 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, 12890 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 12891 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 12892 { 12893 /* 12894 * Update window information. Don't look at window if no ACK: TAC's 12895 * send garbage on first SYN. 
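 * A window update is only taken from a segment that is newer by sequence (snd_wl1), or equal in sequence and newer by ack (snd_wl2), or equal in both but advertising a larger window.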
12896 */ 12897 int32_t nsegs; 12898 int32_t tfo_syn; 12899 struct tcp_rack *rack; 12900 12901 INP_WLOCK_ASSERT(tptoinpcb(tp)); 12902 12903 rack = (struct tcp_rack *)tp->t_fb_ptr; 12904 nsegs = max(1, m->m_pkthdr.lro_nsegs); 12905 if ((thflags & TH_ACK) && 12906 (SEQ_LT(tp->snd_wl1, th->th_seq) || 12907 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 12908 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 12909 /* keep track of pure window updates */ 12910 if (tlen == 0 && 12911 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 12912 KMOD_TCPSTAT_INC(tcps_rcvwinupd); 12913 tp->snd_wnd = tiwin; 12914 rack_validate_fo_sendwin_up(tp, rack); 12915 tp->snd_wl1 = th->th_seq; 12916 tp->snd_wl2 = th->th_ack; 12917 if (tp->snd_wnd > tp->max_sndwnd) 12918 tp->max_sndwnd = tp->snd_wnd; 12919 rack->r_wanted_output = 1; 12920 } else if (thflags & TH_ACK) { 12921 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { 12922 tp->snd_wnd = tiwin; 12923 rack_validate_fo_sendwin_up(tp, rack); 12924 tp->snd_wl1 = th->th_seq; 12925 tp->snd_wl2 = th->th_ack; 12926 } 12927 } 12928 if (tp->snd_wnd < ctf_outstanding(tp)) 12929 /* The peer collapsed the window */ 12930 rack_collapsed_window(rack, ctf_outstanding(tp), th->th_ack, __LINE__); 12931 else if (rack->rc_has_collapsed) 12932 rack_un_collapse_window(rack, __LINE__); 12933 if ((rack->r_collapse_point_valid) && 12934 (SEQ_GT(th->th_ack, rack->r_ctl.high_collapse_point))) 12935 rack->r_collapse_point_valid = 0; 12936 /* Was persist timer active and now we have window space? */ 12937 if ((rack->rc_in_persist != 0) && 12938 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 12939 rack->r_ctl.rc_pace_min_segs))) { 12940 rack_exit_persist(tp, rack, rack->r_ctl.rc_rcvtime); 12941 tp->snd_nxt = tp->snd_max; 12942 /* Make sure we output to start the timer */ 12943 rack->r_wanted_output = 1; 12944 } 12945 /* Do we enter persists? */ 12946 if ((rack->rc_in_persist == 0) && 12947 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 12948 TCPS_HAVEESTABLISHED(tp->t_state) && 12949 ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) && 12950 sbavail(&tptosocket(tp)->so_snd) && 12951 (sbavail(&tptosocket(tp)->so_snd) > tp->snd_wnd)) { 12952 /* 12953 * Here the rwnd is less than 12954 * the pacing size, we are established, 12955 * nothing is outstanding, and there is 12956 * data to send. Enter persists. 12957 */ 12958 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, tp->snd_una); 12959 } 12960 if (tp->t_flags2 & TF2_DROP_AF_DATA) { 12961 m_freem(m); 12962 return (0); 12963 } 12964 /* 12965 * don't process the URG bit, ignore them drag 12966 * along the up. 12967 */ 12968 tp->rcv_up = tp->rcv_nxt; 12969 12970 /* 12971 * Process the segment text, merging it into the TCP sequencing 12972 * queue, and arranging for acknowledgment of receipt if necessary. 12973 * This process logically involves adjusting tp->rcv_wnd as data is 12974 * presented to the user (this happens in tcp_usrreq.c, case 12975 * PRU_RCVD). If a FIN has already been received on this connection 12976 * then we just ignore the text. 
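 * In-order data arriving with an empty reassembly queue is appended straight to the receive buffer; everything else is handed to tcp_reass().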
12977 */ 12978 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && 12979 (tp->t_flags & TF_FASTOPEN)); 12980 if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) && 12981 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 12982 tcp_seq save_start = th->th_seq; 12983 tcp_seq save_rnxt = tp->rcv_nxt; 12984 int save_tlen = tlen; 12985 12986 m_adj(m, drop_hdrlen); /* delayed header drop */ 12987 /* 12988 * Insert segment which includes th into TCP reassembly 12989 * queue with control block tp. Set thflags to whether 12990 * reassembly now includes a segment with FIN. This handles 12991 * the common case inline (segment is the next to be 12992 * received on an established connection, and the queue is 12993 * empty), avoiding linkage into and removal from the queue 12994 * and repetition of various conversions. Set DELACK for 12995 * segments received in order, but ack immediately when 12996 * segments are out of order (so fast retransmit can work). 12997 */ 12998 if (th->th_seq == tp->rcv_nxt && 12999 SEGQ_EMPTY(tp) && 13000 (TCPS_HAVEESTABLISHED(tp->t_state) || 13001 tfo_syn)) { 13002 #ifdef NETFLIX_SB_LIMITS 13003 u_int mcnt, appended; 13004 13005 if (so->so_rcv.sb_shlim) { 13006 mcnt = m_memcnt(m); 13007 appended = 0; 13008 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 13009 CFO_NOSLEEP, NULL) == false) { 13010 counter_u64_add(tcp_sb_shlim_fails, 1); 13011 m_freem(m); 13012 return (0); 13013 } 13014 } 13015 #endif 13016 rack_handle_delayed_ack(tp, rack, tlen, tfo_syn); 13017 tp->rcv_nxt += tlen; 13018 if (tlen && 13019 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 13020 (tp->t_fbyte_in == 0)) { 13021 tp->t_fbyte_in = ticks; 13022 if (tp->t_fbyte_in == 0) 13023 tp->t_fbyte_in = 1; 13024 if (tp->t_fbyte_out && tp->t_fbyte_in) 13025 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 13026 } 13027 thflags = tcp_get_flags(th) & TH_FIN; 13028 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 13029 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 13030 SOCKBUF_LOCK(&so->so_rcv); 13031 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 13032 m_freem(m); 13033 } else { 13034 int32_t newsize; 13035 13036 if (tlen > 0) { 13037 newsize = tcp_autorcvbuf(m, th, so, tp, tlen); 13038 if (newsize) 13039 if (!sbreserve_locked(so, SO_RCV, newsize, NULL)) 13040 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 13041 } 13042 #ifdef NETFLIX_SB_LIMITS 13043 appended = 13044 #endif 13045 sbappendstream_locked(&so->so_rcv, m, 0); 13046 } 13047 rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1); 13048 /* NB: sorwakeup_locked() does an implicit unlock. */ 13049 sorwakeup_locked(so); 13050 #ifdef NETFLIX_SB_LIMITS 13051 if (so->so_rcv.sb_shlim && appended != mcnt) 13052 counter_fo_release(so->so_rcv.sb_shlim, 13053 mcnt - appended); 13054 #endif 13055 } else { 13056 /* 13057 * XXX: Due to the header drop above "th" is 13058 * theoretically invalid by now. Fortunately 13059 * m_adj() doesn't actually frees any mbufs when 13060 * trimming from the head. 13061 */ 13062 tcp_seq temp = save_start; 13063 13064 thflags = tcp_reass(tp, th, &temp, &tlen, m); 13065 tp->t_flags |= TF_ACKNOW; 13066 if (tp->t_flags & TF_WAKESOR) { 13067 tp->t_flags &= ~TF_WAKESOR; 13068 /* NB: sorwakeup_locked() does an implicit unlock. */ 13069 sorwakeup_locked(so); 13070 } 13071 } 13072 if ((tp->t_flags & TF_SACK_PERMIT) && 13073 (save_tlen > 0) && 13074 TCPS_HAVEESTABLISHED(tp->t_state)) { 13075 if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { 13076 /* 13077 * DSACK actually handled in the fastpath 13078 * above. 
13079 */ 13080 tcp_update_sack_list(tp, save_start, 13081 save_start + save_tlen); 13082 } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { 13083 if ((tp->rcv_numsacks >= 1) && 13084 (tp->sackblks[0].end == save_start)) { 13085 /* 13086 * Partial overlap, recorded at todrop 13087 * above. 13088 */ 13089 tcp_update_sack_list(tp, 13090 tp->sackblks[0].start, 13091 tp->sackblks[0].end); 13092 } else { 13093 tcp_update_dsack_list(tp, save_start, 13094 save_start + save_tlen); 13095 } 13096 } else if (tlen >= save_tlen) { 13097 /* Update of sackblks. */ 13098 tcp_update_dsack_list(tp, save_start, 13099 save_start + save_tlen); 13100 } else if (tlen > 0) { 13101 tcp_update_dsack_list(tp, save_start, 13102 save_start + tlen); 13103 } 13104 } 13105 } else { 13106 m_freem(m); 13107 thflags &= ~TH_FIN; 13108 } 13109 13110 /* 13111 * If FIN is received ACK the FIN and let the user know that the 13112 * connection is closing. 13113 */ 13114 if (thflags & TH_FIN) { 13115 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 13116 /* The socket upcall is handled by socantrcvmore. */ 13117 socantrcvmore(so); 13118 /* 13119 * If connection is half-synchronized (ie NEEDSYN 13120 * flag on) then delay ACK, so it may be piggybacked 13121 * when SYN is sent. Otherwise, since we received a 13122 * FIN then no more input can be expected, send ACK 13123 * now. 13124 */ 13125 if (tp->t_flags & TF_NEEDSYN) { 13126 rack_timer_cancel(tp, rack, 13127 rack->r_ctl.rc_rcvtime, __LINE__); 13128 tp->t_flags |= TF_DELACK; 13129 } else { 13130 tp->t_flags |= TF_ACKNOW; 13131 } 13132 tp->rcv_nxt++; 13133 } 13134 switch (tp->t_state) { 13135 /* 13136 * In SYN_RECEIVED and ESTABLISHED STATES enter the 13137 * CLOSE_WAIT state. 13138 */ 13139 case TCPS_SYN_RECEIVED: 13140 tp->t_starttime = ticks; 13141 /* FALLTHROUGH */ 13142 case TCPS_ESTABLISHED: 13143 rack_timer_cancel(tp, rack, 13144 rack->r_ctl.rc_rcvtime, __LINE__); 13145 tcp_state_change(tp, TCPS_CLOSE_WAIT); 13146 break; 13147 13148 /* 13149 * If still in FIN_WAIT_1 STATE FIN has not been 13150 * acked so enter the CLOSING state. 13151 */ 13152 case TCPS_FIN_WAIT_1: 13153 rack_timer_cancel(tp, rack, 13154 rack->r_ctl.rc_rcvtime, __LINE__); 13155 tcp_state_change(tp, TCPS_CLOSING); 13156 break; 13157 13158 /* 13159 * In FIN_WAIT_2 state enter the TIME_WAIT state, 13160 * starting the time-wait timer, turning off the 13161 * other standard timers. 13162 */ 13163 case TCPS_FIN_WAIT_2: 13164 rack_timer_cancel(tp, rack, 13165 rack->r_ctl.rc_rcvtime, __LINE__); 13166 tcp_twstart(tp); 13167 return (1); 13168 } 13169 } 13170 /* 13171 * Return any desired output. 13172 */ 13173 if ((tp->t_flags & TF_ACKNOW) || 13174 (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) { 13175 rack->r_wanted_output = 1; 13176 } 13177 return (0); 13178 } 13179 13180 /* 13181 * Here nothing is really faster, its just that we 13182 * have broken out the fast-data path also just like 13183 * the fast-ack. 13184 */ 13185 static int 13186 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, 13187 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 13188 uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos) 13189 { 13190 int32_t nsegs; 13191 int32_t newsize = 0; /* automatic sockbuf scaling */ 13192 struct tcp_rack *rack; 13193 #ifdef NETFLIX_SB_LIMITS 13194 u_int mcnt, appended; 13195 #endif 13196 13197 /* 13198 * If last ACK falls within this segment's sequence numbers, record 13199 * the timestamp. 
NOTE that the test is modified according to the 13200 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 13201 */ 13202 if (__predict_false(th->th_seq != tp->rcv_nxt)) { 13203 return (0); 13204 } 13205 if (tiwin && tiwin != tp->snd_wnd) { 13206 return (0); 13207 } 13208 if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) { 13209 return (0); 13210 } 13211 if (__predict_false((to->to_flags & TOF_TS) && 13212 (TSTMP_LT(to->to_tsval, tp->ts_recent)))) { 13213 return (0); 13214 } 13215 if (__predict_false((th->th_ack != tp->snd_una))) { 13216 return (0); 13217 } 13218 if (__predict_false(tlen > sbspace(&so->so_rcv))) { 13219 return (0); 13220 } 13221 if ((to->to_flags & TOF_TS) != 0 && 13222 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 13223 tp->ts_recent_age = tcp_ts_getticks(); 13224 tp->ts_recent = to->to_tsval; 13225 } 13226 rack = (struct tcp_rack *)tp->t_fb_ptr; 13227 /* 13228 * This is a pure, in-sequence data packet with nothing on the 13229 * reassembly queue and we have enough buffer space to take it. 13230 */ 13231 nsegs = max(1, m->m_pkthdr.lro_nsegs); 13232 13233 #ifdef NETFLIX_SB_LIMITS 13234 if (so->so_rcv.sb_shlim) { 13235 mcnt = m_memcnt(m); 13236 appended = 0; 13237 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 13238 CFO_NOSLEEP, NULL) == false) { 13239 counter_u64_add(tcp_sb_shlim_fails, 1); 13240 m_freem(m); 13241 return (1); 13242 } 13243 } 13244 #endif 13245 /* Clean receiver SACK report if present */ 13246 if (tp->rcv_numsacks) 13247 tcp_clean_sackreport(tp); 13248 KMOD_TCPSTAT_INC(tcps_preddat); 13249 tp->rcv_nxt += tlen; 13250 if (tlen && 13251 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 13252 (tp->t_fbyte_in == 0)) { 13253 tp->t_fbyte_in = ticks; 13254 if (tp->t_fbyte_in == 0) 13255 tp->t_fbyte_in = 1; 13256 if (tp->t_fbyte_out && tp->t_fbyte_in) 13257 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 13258 } 13259 /* 13260 * Pull snd_wl1 up to prevent seq wrap relative to th_seq. 13261 */ 13262 tp->snd_wl1 = th->th_seq; 13263 /* 13264 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt. 13265 */ 13266 tp->rcv_up = tp->rcv_nxt; 13267 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 13268 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 13269 newsize = tcp_autorcvbuf(m, th, so, tp, tlen); 13270 13271 /* Add data to socket buffer. */ 13272 SOCKBUF_LOCK(&so->so_rcv); 13273 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 13274 m_freem(m); 13275 } else { 13276 /* 13277 * Set new socket buffer size. Give up when limit is 13278 * reached. 13279 */ 13280 if (newsize) 13281 if (!sbreserve_locked(so, SO_RCV, newsize, NULL)) 13282 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 13283 m_adj(m, drop_hdrlen); /* delayed header drop */ 13284 #ifdef NETFLIX_SB_LIMITS 13285 appended = 13286 #endif 13287 sbappendstream_locked(&so->so_rcv, m, 0); 13288 ctf_calc_rwin(so, tp); 13289 } 13290 rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1); 13291 /* NB: sorwakeup_locked() does an implicit unlock. */ 13292 sorwakeup_locked(so); 13293 #ifdef NETFLIX_SB_LIMITS 13294 if (so->so_rcv.sb_shlim && mcnt != appended) 13295 counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended); 13296 #endif 13297 rack_handle_delayed_ack(tp, rack, tlen, 0); 13298 if (tp->snd_una == tp->snd_max) 13299 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 13300 return (1); 13301 } 13302 13303 /* 13304 * This subfunction is used to try to highly optimize the 13305 * fast path. We again allow window updates that are 13306 * in sequence to remain in the fast-path. We also add 13307 * in the __predict's to attempt to help the compiler. 
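 *
 * Any one of the following pushes us to the slow path: an old or
 * too-far-forward ACK, a zero window, pending TF_NEEDSYN/TF_NEEDFIN,
 * a PAWS failure, an active recovery episode, or SACKed holes still
 * sitting on the scoreboard.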
13308 * Note that if we return a 0, then we can *not* process 13309 * it and the caller should push the packet into the 13310 * slow-path. 13311 */ 13312 static int 13313 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 13314 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 13315 uint32_t tiwin, int32_t nxt_pkt, uint32_t cts) 13316 { 13317 int32_t acked; 13318 int32_t nsegs; 13319 int32_t under_pacing = 0; 13320 struct tcp_rack *rack; 13321 13322 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 13323 /* Old ack, behind (or duplicate to) the last one rcv'd */ 13324 return (0); 13325 } 13326 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { 13327 /* Above what we have sent? */ 13328 return (0); 13329 } 13330 if (__predict_false(tiwin == 0)) { 13331 /* zero window */ 13332 return (0); 13333 } 13334 if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) { 13335 /* We need a SYN or a FIN, unlikely.. */ 13336 return (0); 13337 } 13338 if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { 13339 /* Timestamp is behind .. old ack with seq wrap? */ 13340 return (0); 13341 } 13342 if (__predict_false(IN_RECOVERY(tp->t_flags))) { 13343 /* Still recovering */ 13344 return (0); 13345 } 13346 rack = (struct tcp_rack *)tp->t_fb_ptr; 13347 if (rack->r_ctl.rc_sacked) { 13348 /* We have sack holes on our scoreboard */ 13349 return (0); 13350 } 13351 /* Ok if we reach here, we can process a fast-ack */ 13352 if (rack->gp_ready && 13353 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 13354 under_pacing = 1; 13355 } 13356 nsegs = max(1, m->m_pkthdr.lro_nsegs); 13357 rack_log_ack(tp, to, th, 0, 0, NULL, NULL); 13358 /* Did the window get updated? */ 13359 if (tiwin != tp->snd_wnd) { 13360 tp->snd_wnd = tiwin; 13361 rack_validate_fo_sendwin_up(tp, rack); 13362 tp->snd_wl1 = th->th_seq; 13363 if (tp->snd_wnd > tp->max_sndwnd) 13364 tp->max_sndwnd = tp->snd_wnd; 13365 } 13366 /* Do we exit persists? */ 13367 if ((rack->rc_in_persist != 0) && 13368 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 13369 rack->r_ctl.rc_pace_min_segs))) { 13370 rack_exit_persist(tp, rack, cts); 13371 } 13372 /* Do we enter persists? */ 13373 if ((rack->rc_in_persist == 0) && 13374 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 13375 TCPS_HAVEESTABLISHED(tp->t_state) && 13376 ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) && 13377 sbavail(&tptosocket(tp)->so_snd) && 13378 (sbavail(&tptosocket(tp)->so_snd) > tp->snd_wnd)) { 13379 /* 13380 * Here the rwnd is less than 13381 * the pacing size, we are established, 13382 * nothing is outstanding, and there is 13383 * data to send. Enter persists. 13384 */ 13385 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, th->th_ack); 13386 } 13387 /* 13388 * If last ACK falls within this segment's sequence numbers, record 13389 * the timestamp. NOTE that the test is modified according to the 13390 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 13391 */ 13392 if ((to->to_flags & TOF_TS) != 0 && 13393 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 13394 tp->ts_recent_age = tcp_ts_getticks(); 13395 tp->ts_recent = to->to_tsval; 13396 } 13397 /* 13398 * This is a pure ack for outstanding data. 13399 */ 13400 KMOD_TCPSTAT_INC(tcps_predack); 13401 13402 /* 13403 * "bad retransmit" recovery. 
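 *
 * For connections without timestamps (TF_RCVD_TSTMP clear): if the
 * only retransmission so far was a single RTO and this ACK arrives
 * inside the bad-retransmit window (t_badrxtwin), the RTO is presumed
 * spurious and rack_cong_signal(CC_RTO_ERR) restores the congestion
 * state the timeout clobbered.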
13404 */ 13405 if ((tp->t_flags & TF_PREVVALID) && 13406 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) { 13407 tp->t_flags &= ~TF_PREVVALID; 13408 if (tp->t_rxtshift == 1 && 13409 (int)(ticks - tp->t_badrxtwin) < 0) 13410 rack_cong_signal(tp, CC_RTO_ERR, th->th_ack, __LINE__); 13411 } 13412 /* 13413 * Recalculate the transmit timer / rtt. 13414 * 13415 * Some boxes send broken timestamp replies during the SYN+ACK 13416 * phase, ignore timestamps of 0 or we could calculate a huge RTT 13417 * and blow up the retransmit timer. 13418 */ 13419 acked = BYTES_THIS_ACK(tp, th); 13420 13421 #ifdef TCP_HHOOK 13422 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 13423 hhook_run_tcp_est_in(tp, th, to); 13424 #endif 13425 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 13426 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 13427 if (acked) { 13428 struct mbuf *mfree; 13429 13430 rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, 0); 13431 SOCKBUF_LOCK(&so->so_snd); 13432 mfree = sbcut_locked(&so->so_snd, acked); 13433 tp->snd_una = th->th_ack; 13434 /* Note we want to hold the sb lock through the sendmap adjust */ 13435 rack_adjust_sendmap_head(rack, &so->so_snd); 13436 /* Wake up the socket if we have room to write more */ 13437 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); 13438 sowwakeup_locked(so); 13439 m_freem(mfree); 13440 tp->t_rxtshift = 0; 13441 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 13442 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 13443 rack->rc_tlp_in_progress = 0; 13444 rack->r_ctl.rc_tlp_cnt_out = 0; 13445 /* 13446 * If it is the RXT timer we want to 13447 * stop it, so we can restart a TLP. 13448 */ 13449 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 13450 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 13451 13452 #ifdef TCP_REQUEST_TRK 13453 rack_req_check_for_comp(rack, th->th_ack); 13454 #endif 13455 } 13456 /* 13457 * Let the congestion control algorithm update congestion control 13458 * related information. This typically means increasing the 13459 * congestion window. 13460 */ 13461 if (tp->snd_wnd < ctf_outstanding(tp)) { 13462 /* The peer collapsed the window */ 13463 rack_collapsed_window(rack, ctf_outstanding(tp), th->th_ack, __LINE__); 13464 } else if (rack->rc_has_collapsed) 13465 rack_un_collapse_window(rack, __LINE__); 13466 if ((rack->r_collapse_point_valid) && 13467 (SEQ_GT(tp->snd_una, rack->r_ctl.high_collapse_point))) 13468 rack->r_collapse_point_valid = 0; 13469 /* 13470 * Pull snd_wl2 up to prevent seq wrap relative to th_ack. 13471 */ 13472 tp->snd_wl2 = th->th_ack; 13473 tp->t_dupacks = 0; 13474 m_freem(m); 13475 /* ND6_HINT(tp); *//* Some progress has been made. */ 13476 13477 /* 13478 * If all outstanding data are acked, stop retransmit timer, 13479 * otherwise restart timer using current (possibly backed-off) 13480 * value. If process is waiting for space, wakeup/selwakeup/signal. 13481 * If data are ready to send, let tcp_output decide between more 13482 * output or persist. 
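 *
 * When everything is acked we also record the time we went idle and
 * clear the progress/recovery bookkeeping, so a later burst does not
 * inherit stale retransmit or TLP state.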
13483 */ 13484 if (under_pacing && 13485 (rack->use_fixed_rate == 0) && 13486 (rack->in_probe_rtt == 0) && 13487 rack->rc_gp_dyn_mul && 13488 rack->rc_always_pace) { 13489 /* Check if we are dragging bottom */ 13490 rack_check_bottom_drag(tp, rack, so); 13491 } 13492 if (tp->snd_una == tp->snd_max) { 13493 tp->t_flags &= ~TF_PREVVALID; 13494 rack->r_ctl.retran_during_recovery = 0; 13495 rack->rc_suspicious = 0; 13496 rack->r_ctl.dsack_byte_cnt = 0; 13497 rack->r_ctl.idle_snd_una = tp->snd_una; 13498 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 13499 if (rack->r_ctl.rc_went_idle_time == 0) 13500 rack->r_ctl.rc_went_idle_time = 1; 13501 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 13502 if (sbavail(&tptosocket(tp)->so_snd) == 0) 13503 tp->t_acktime = 0; 13504 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 13505 } 13506 if (acked && rack->r_fast_output) 13507 rack_gain_for_fastoutput(rack, tp, so, (uint32_t)acked); 13508 if (sbavail(&so->so_snd)) { 13509 rack->r_wanted_output = 1; 13510 } 13511 return (1); 13512 } 13513 13514 /* 13515 * Return value of 1, the TCB is unlocked and most 13516 * likely gone, return value of 0, the TCP is still 13517 * locked. 13518 */ 13519 static int 13520 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, 13521 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 13522 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 13523 { 13524 int32_t ret_val = 0; 13525 int32_t orig_tlen = tlen; 13526 int32_t todrop; 13527 int32_t ourfinisacked = 0; 13528 struct tcp_rack *rack; 13529 13530 INP_WLOCK_ASSERT(tptoinpcb(tp)); 13531 13532 ctf_calc_rwin(so, tp); 13533 /* 13534 * If the state is SYN_SENT: if seg contains an ACK, but not for our 13535 * SYN, drop the input. if seg contains a RST, then drop the 13536 * connection. if seg does not contain SYN, then drop it. Otherwise 13537 * this is an acceptable SYN segment initialize tp->rcv_nxt and 13538 * tp->irs if seg contains ack then advance tp->snd_una if seg 13539 * contains an ECE and ECN support is enabled, the stream is ECN 13540 * capable. if SYN has been acked change to ESTABLISHED else 13541 * SYN_RCVD state arrange for segment to be acked (eventually) 13542 * continue processing rest of data/controls. 13543 */ 13544 if ((thflags & TH_ACK) && 13545 (SEQ_LEQ(th->th_ack, tp->iss) || 13546 SEQ_GT(th->th_ack, tp->snd_max))) { 13547 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 13548 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 13549 return (1); 13550 } 13551 if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { 13552 TCP_PROBE5(connect__refused, NULL, tp, 13553 mtod(m, const char *), tp, th); 13554 tp = tcp_drop(tp, ECONNREFUSED); 13555 ctf_do_drop(m, tp); 13556 return (1); 13557 } 13558 if (thflags & TH_RST) { 13559 ctf_do_drop(m, tp); 13560 return (1); 13561 } 13562 if (!(thflags & TH_SYN)) { 13563 ctf_do_drop(m, tp); 13564 return (1); 13565 } 13566 tp->irs = th->th_seq; 13567 tcp_rcvseqinit(tp); 13568 rack = (struct tcp_rack *)tp->t_fb_ptr; 13569 if (thflags & TH_ACK) { 13570 int tfo_partial = 0; 13571 13572 KMOD_TCPSTAT_INC(tcps_connects); 13573 soisconnected(so); 13574 #ifdef MAC 13575 mac_socketpeer_set_from_mbuf(m, so); 13576 #endif 13577 /* Do window scaling on this connection? 
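 * Window scaling only takes effect if we asked for it on our SYN and
 * the peer's SYN|ACK carried the option as well, i.e. both
 * TF_REQ_SCALE and TF_RCVD_SCALE must be set before rcv_scale is
 * applied.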
*/ 13578 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 13579 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 13580 tp->rcv_scale = tp->request_r_scale; 13581 } 13582 tp->rcv_adv += min(tp->rcv_wnd, 13583 TCP_MAXWIN << tp->rcv_scale); 13584 /* 13585 * If not all the data that was sent in the TFO SYN 13586 * has been acked, resend the remainder right away. 13587 */ 13588 if ((tp->t_flags & TF_FASTOPEN) && 13589 (tp->snd_una != tp->snd_max)) { 13590 /* Was it a partial ack? */ 13591 if (SEQ_LT(th->th_ack, tp->snd_max)) 13592 tfo_partial = 1; 13593 } 13594 /* 13595 * If there's data, delay ACK; if there's also a FIN ACKNOW 13596 * will be turned on later. 13597 */ 13598 if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial) { 13599 rack_timer_cancel(tp, rack, 13600 rack->r_ctl.rc_rcvtime, __LINE__); 13601 tp->t_flags |= TF_DELACK; 13602 } else { 13603 rack->r_wanted_output = 1; 13604 tp->t_flags |= TF_ACKNOW; 13605 } 13606 13607 tcp_ecn_input_syn_sent(tp, thflags, iptos); 13608 13609 if (SEQ_GT(th->th_ack, tp->snd_una)) { 13610 /* 13611 * We advance snd_una for the 13612 * fast open case. If th_ack is 13613 * acknowledging data beyond 13614 * snd_una we can't just call 13615 * ack-processing since the 13616 * data stream in our send-map 13617 * will start at snd_una + 1 (one 13618 * beyond the SYN). If its just 13619 * equal we don't need to do that 13620 * and there is no send_map. 13621 */ 13622 tp->snd_una++; 13623 if (tfo_partial && (SEQ_GT(tp->snd_max, tp->snd_una))) { 13624 /* 13625 * We sent a SYN with data, and thus have a 13626 * sendmap entry with a SYN set. Lets find it 13627 * and take off the send bit and the byte and 13628 * set it up to be what we send (send it next). 13629 */ 13630 struct rack_sendmap *rsm; 13631 13632 rsm = tqhash_min(rack->r_ctl.tqh); 13633 if (rsm) { 13634 if (rsm->r_flags & RACK_HAS_SYN) { 13635 rsm->r_flags &= ~RACK_HAS_SYN; 13636 rsm->r_start++; 13637 } 13638 rack->r_ctl.rc_resend = rsm; 13639 } 13640 } 13641 } 13642 /* 13643 * Received <SYN,ACK> in SYN_SENT[*] state. Transitions: 13644 * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 13645 */ 13646 tp->t_starttime = ticks; 13647 if (tp->t_flags & TF_NEEDFIN) { 13648 tcp_state_change(tp, TCPS_FIN_WAIT_1); 13649 tp->t_flags &= ~TF_NEEDFIN; 13650 thflags &= ~TH_SYN; 13651 } else { 13652 tcp_state_change(tp, TCPS_ESTABLISHED); 13653 TCP_PROBE5(connect__established, NULL, tp, 13654 mtod(m, const char *), tp, th); 13655 rack_cc_conn_init(tp); 13656 } 13657 } else { 13658 /* 13659 * Received initial SYN in SYN-SENT[*] state => simultaneous 13660 * open. If segment contains CC option and there is a 13661 * cached CC, apply TAO test. If it succeeds, connection is * 13662 * half-synchronized. Otherwise, do 3-way handshake: 13663 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If 13664 * there was no CC option, clear cached CC value. 13665 */ 13666 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN | TF_SONOTCONN); 13667 tcp_state_change(tp, TCPS_SYN_RECEIVED); 13668 } 13669 /* 13670 * Advance th->th_seq to correspond to first data byte. If data, 13671 * trim to stay within window, dropping FIN if necessary. 13672 */ 13673 th->th_seq++; 13674 if (tlen > tp->rcv_wnd) { 13675 todrop = tlen - tp->rcv_wnd; 13676 m_adj(m, -todrop); 13677 tlen = tp->rcv_wnd; 13678 thflags &= ~TH_FIN; 13679 KMOD_TCPSTAT_INC(tcps_rcvpackafterwin); 13680 KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 13681 } 13682 tp->snd_wl1 = th->th_seq - 1; 13683 tp->rcv_up = th->th_seq; 13684 /* 13685 * Client side of transaction: already sent SYN and data. 
If the 13686 * remote host used T/TCP to validate the SYN, our data will be 13687 * ACK'd; if so, enter normal data segment processing in the middle 13688 * of step 5, ack processing. Otherwise, goto step 6. 13689 */ 13690 if (thflags & TH_ACK) { 13691 /* For syn-sent we need to possibly update the rtt */ 13692 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 13693 uint32_t t, mcts; 13694 13695 mcts = tcp_ts_getticks(); 13696 t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC; 13697 if (!tp->t_rttlow || tp->t_rttlow > t) 13698 tp->t_rttlow = t; 13699 rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 4); 13700 tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2); 13701 tcp_rack_xmit_timer_commit(rack, tp); 13702 } 13703 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) 13704 return (ret_val); 13705 /* We may have changed to FIN_WAIT_1 above */ 13706 if (tp->t_state == TCPS_FIN_WAIT_1) { 13707 /* 13708 * In FIN_WAIT_1 STATE in addition to the processing 13709 * for the ESTABLISHED state if our FIN is now 13710 * acknowledged then enter FIN_WAIT_2. 13711 */ 13712 if (ourfinisacked) { 13713 /* 13714 * If we can't receive any more data, then 13715 * closing user can proceed. Starting the 13716 * timer is contrary to the specification, 13717 * but if we don't get a FIN we'll hang 13718 * forever. 13719 * 13720 * XXXjl: we should release the tp also, and 13721 * use a compressed state. 13722 */ 13723 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 13724 soisdisconnected(so); 13725 tcp_timer_activate(tp, TT_2MSL, 13726 (tcp_fast_finwait2_recycle ? 13727 tcp_finwait2_timeout : 13728 TP_MAXIDLE(tp))); 13729 } 13730 tcp_state_change(tp, TCPS_FIN_WAIT_2); 13731 } 13732 } 13733 } 13734 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13735 tiwin, thflags, nxt_pkt)); 13736 } 13737 13738 /* 13739 * Return value of 1, the TCB is unlocked and most 13740 * likely gone, return value of 0, the TCP is still 13741 * locked. 13742 */ 13743 static int 13744 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, 13745 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 13746 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 13747 { 13748 struct tcp_rack *rack; 13749 int32_t orig_tlen = tlen; 13750 int32_t ret_val = 0; 13751 int32_t ourfinisacked = 0; 13752 13753 rack = (struct tcp_rack *)tp->t_fb_ptr; 13754 ctf_calc_rwin(so, tp); 13755 if ((thflags & TH_RST) || 13756 (tp->t_fin_is_rst && (thflags & TH_FIN))) 13757 return (__ctf_process_rst(m, th, so, tp, 13758 &rack->r_ctl.challenge_ack_ts, 13759 &rack->r_ctl.challenge_ack_cnt)); 13760 if ((thflags & TH_ACK) && 13761 (SEQ_LEQ(th->th_ack, tp->snd_una) || 13762 SEQ_GT(th->th_ack, tp->snd_max))) { 13763 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 13764 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 13765 return (1); 13766 } 13767 if (tp->t_flags & TF_FASTOPEN) { 13768 /* 13769 * When a TFO connection is in SYN_RECEIVED, the 13770 * only valid packets are the initial SYN, a 13771 * retransmit/copy of the initial SYN (possibly with 13772 * a subset of the original data), a valid ACK, a 13773 * FIN, or a RST. 
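 *
 * A retransmitted SYN, for example, is only dropped here while a
 * retransmit, TLP or RACK timer is pending; otherwise it falls
 * through to the normal checks below.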
13774 */ 13775 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { 13776 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 13777 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 13778 return (1); 13779 } else if (thflags & TH_SYN) { 13780 /* non-initial SYN is ignored */ 13781 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || 13782 (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || 13783 (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { 13784 ctf_do_drop(m, NULL); 13785 return (0); 13786 } 13787 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { 13788 ctf_do_drop(m, NULL); 13789 return (0); 13790 } 13791 } 13792 13793 /* 13794 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 13795 * it's less than ts_recent, drop it. 13796 */ 13797 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 13798 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 13799 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 13800 return (ret_val); 13801 } 13802 /* 13803 * In the SYN-RECEIVED state, validate that the packet belongs to 13804 * this connection before trimming the data to fit the receive 13805 * window. Check the sequence number versus IRS since we know the 13806 * sequence numbers haven't wrapped. This is a partial fix for the 13807 * "LAND" DoS attack. 13808 */ 13809 if (SEQ_LT(th->th_seq, tp->irs)) { 13810 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 13811 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 13812 return (1); 13813 } 13814 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 13815 &rack->r_ctl.challenge_ack_ts, 13816 &rack->r_ctl.challenge_ack_cnt)) { 13817 return (ret_val); 13818 } 13819 /* 13820 * If last ACK falls within this segment's sequence numbers, record 13821 * its timestamp. NOTE: 1) That the test incorporates suggestions 13822 * from the latest proposal of the tcplw@cray.com list (Braden 13823 * 1993/04/26). 2) That updating only on newer timestamps interferes 13824 * with our earlier PAWS tests, so this check should be solely 13825 * predicated on the sequence space of this segment. 3) That we 13826 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 13827 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 13828 * SEG.Len, This modified check allows us to overcome RFC1323's 13829 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 13830 * p.869. In such cases, we can still calculate the RTT correctly 13831 * when RCV.NXT == Last.ACK.Sent. 13832 */ 13833 if ((to->to_flags & TOF_TS) != 0 && 13834 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 13835 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 13836 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 13837 tp->ts_recent_age = tcp_ts_getticks(); 13838 tp->ts_recent = to->to_tsval; 13839 } 13840 tp->snd_wnd = tiwin; 13841 rack_validate_fo_sendwin_up(tp, rack); 13842 /* 13843 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 13844 * is on (half-synchronized state), then queue data for later 13845 * processing; else drop segment and return. 13846 */ 13847 if ((thflags & TH_ACK) == 0) { 13848 if (tp->t_flags & TF_FASTOPEN) { 13849 rack_cc_conn_init(tp); 13850 } 13851 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13852 tiwin, thflags, nxt_pkt)); 13853 } 13854 KMOD_TCPSTAT_INC(tcps_connects); 13855 if (tp->t_flags & TF_SONOTCONN) { 13856 tp->t_flags &= ~TF_SONOTCONN; 13857 soisconnected(so); 13858 } 13859 /* Do window scaling? 
*/ 13860 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 13861 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 13862 tp->rcv_scale = tp->request_r_scale; 13863 } 13864 /* 13865 * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> 13866 * FIN-WAIT-1 13867 */ 13868 tp->t_starttime = ticks; 13869 if ((tp->t_flags & TF_FASTOPEN) && tp->t_tfo_pending) { 13870 tcp_fastopen_decrement_counter(tp->t_tfo_pending); 13871 tp->t_tfo_pending = NULL; 13872 } 13873 if (tp->t_flags & TF_NEEDFIN) { 13874 tcp_state_change(tp, TCPS_FIN_WAIT_1); 13875 tp->t_flags &= ~TF_NEEDFIN; 13876 } else { 13877 tcp_state_change(tp, TCPS_ESTABLISHED); 13878 TCP_PROBE5(accept__established, NULL, tp, 13879 mtod(m, const char *), tp, th); 13880 /* 13881 * TFO connections call cc_conn_init() during SYN 13882 * processing. Calling it again here for such connections 13883 * is not harmless as it would undo the snd_cwnd reduction 13884 * that occurs when a TFO SYN|ACK is retransmitted. 13885 */ 13886 if (!(tp->t_flags & TF_FASTOPEN)) 13887 rack_cc_conn_init(tp); 13888 } 13889 /* 13890 * Account for the ACK of our SYN prior to 13891 * regular ACK processing below, except for 13892 * simultaneous SYN, which is handled later. 13893 */ 13894 if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN)) 13895 tp->snd_una++; 13896 /* 13897 * If segment contains data or ACK, will call tcp_reass() later; if 13898 * not, do so now to pass queued data to user. 13899 */ 13900 if (tlen == 0 && (thflags & TH_FIN) == 0) { 13901 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, 13902 (struct mbuf *)0); 13903 if (tp->t_flags & TF_WAKESOR) { 13904 tp->t_flags &= ~TF_WAKESOR; 13905 /* NB: sorwakeup_locked() does an implicit unlock. */ 13906 sorwakeup_locked(so); 13907 } 13908 } 13909 tp->snd_wl1 = th->th_seq - 1; 13910 /* For syn-recv we need to possibly update the rtt */ 13911 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 13912 uint32_t t, mcts; 13913 13914 mcts = tcp_ts_getticks(); 13915 t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC; 13916 if (!tp->t_rttlow || tp->t_rttlow > t) 13917 tp->t_rttlow = t; 13918 rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 5); 13919 tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2); 13920 tcp_rack_xmit_timer_commit(rack, tp); 13921 } 13922 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) { 13923 return (ret_val); 13924 } 13925 if (tp->t_state == TCPS_FIN_WAIT_1) { 13926 /* We could have went to FIN_WAIT_1 (or EST) above */ 13927 /* 13928 * In FIN_WAIT_1 STATE in addition to the processing for the 13929 * ESTABLISHED state if our FIN is now acknowledged then 13930 * enter FIN_WAIT_2. 13931 */ 13932 if (ourfinisacked) { 13933 /* 13934 * If we can't receive any more data, then closing 13935 * user can proceed. Starting the timer is contrary 13936 * to the specification, but if we don't get a FIN 13937 * we'll hang forever. 13938 * 13939 * XXXjl: we should release the tp also, and use a 13940 * compressed state. 13941 */ 13942 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 13943 soisdisconnected(so); 13944 tcp_timer_activate(tp, TT_2MSL, 13945 (tcp_fast_finwait2_recycle ? 
13946 tcp_finwait2_timeout : 13947 TP_MAXIDLE(tp))); 13948 } 13949 tcp_state_change(tp, TCPS_FIN_WAIT_2); 13950 } 13951 } 13952 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13953 tiwin, thflags, nxt_pkt)); 13954 } 13955 13956 /* 13957 * Return value of 1, the TCB is unlocked and most 13958 * likely gone, return value of 0, the TCP is still 13959 * locked. 13960 */ 13961 static int 13962 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, 13963 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 13964 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 13965 { 13966 int32_t ret_val = 0; 13967 int32_t orig_tlen = tlen; 13968 struct tcp_rack *rack; 13969 13970 /* 13971 * Header prediction: check for the two common cases of a 13972 * uni-directional data xfer. If the packet has no control flags, 13973 * is in-sequence, the window didn't change and we're not 13974 * retransmitting, it's a candidate. If the length is zero and the 13975 * ack moved forward, we're the sender side of the xfer. Just free 13976 * the data acked & wake any higher level process that was blocked 13977 * waiting for space. If the length is non-zero and the ack didn't 13978 * move, we're the receiver side. If we're getting packets in-order 13979 * (the reassembly queue is empty), add the data to the socket 13980 * buffer and note that we need a delayed ack. Make sure that the 13981 * hidden state-flags are also off. Since we check for 13982 * TCPS_ESTABLISHED first, it can only be TF_NEEDSYN. 13983 */ 13984 rack = (struct tcp_rack *)tp->t_fb_ptr; 13985 if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && 13986 __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_ACK)) == TH_ACK) && 13987 __predict_true(SEGQ_EMPTY(tp)) && 13988 __predict_true(th->th_seq == tp->rcv_nxt)) { 13989 if (tlen == 0) { 13990 if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen, 13991 tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) { 13992 return (0); 13993 } 13994 } else { 13995 if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, 13996 tiwin, nxt_pkt, iptos)) { 13997 return (0); 13998 } 13999 } 14000 } 14001 ctf_calc_rwin(so, tp); 14002 14003 if ((thflags & TH_RST) || 14004 (tp->t_fin_is_rst && (thflags & TH_FIN))) 14005 return (__ctf_process_rst(m, th, so, tp, 14006 &rack->r_ctl.challenge_ack_ts, 14007 &rack->r_ctl.challenge_ack_cnt)); 14008 14009 /* 14010 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 14011 * synchronized state. 14012 */ 14013 if (thflags & TH_SYN) { 14014 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 14015 return (ret_val); 14016 } 14017 /* 14018 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 14019 * it's less than ts_recent, drop it. 14020 */ 14021 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 14022 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 14023 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 14024 return (ret_val); 14025 } 14026 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 14027 &rack->r_ctl.challenge_ack_ts, 14028 &rack->r_ctl.challenge_ack_cnt)) { 14029 return (ret_val); 14030 } 14031 /* 14032 * If last ACK falls within this segment's sequence numbers, record 14033 * its timestamp. NOTE: 1) That the test incorporates suggestions 14034 * from the latest proposal of the tcplw@cray.com list (Braden 14035 * 1993/04/26).
2) That updating only on newer timestamps interferes 14036 * with our earlier PAWS tests, so this check should be solely 14037 * predicated on the sequence space of this segment. 3) That we 14038 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 14039 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 14040 * SEG.Len, This modified check allows us to overcome RFC1323's 14041 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 14042 * p.869. In such cases, we can still calculate the RTT correctly 14043 * when RCV.NXT == Last.ACK.Sent. 14044 */ 14045 if ((to->to_flags & TOF_TS) != 0 && 14046 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 14047 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 14048 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 14049 tp->ts_recent_age = tcp_ts_getticks(); 14050 tp->ts_recent = to->to_tsval; 14051 } 14052 /* 14053 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 14054 * is on (half-synchronized state), then queue data for later 14055 * processing; else drop segment and return. 14056 */ 14057 if ((thflags & TH_ACK) == 0) { 14058 if (tp->t_flags & TF_NEEDSYN) { 14059 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 14060 tiwin, thflags, nxt_pkt)); 14061 14062 } else if (tp->t_flags & TF_ACKNOW) { 14063 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 14064 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 14065 return (ret_val); 14066 } else { 14067 ctf_do_drop(m, NULL); 14068 return (0); 14069 } 14070 } 14071 /* 14072 * Ack processing. 14073 */ 14074 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val, orig_tlen)) { 14075 return (ret_val); 14076 } 14077 if (sbavail(&so->so_snd)) { 14078 if (ctf_progress_timeout_check(tp, true)) { 14079 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 14080 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 14081 return (1); 14082 } 14083 } 14084 /* State changes only happen in rack_process_data() */ 14085 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 14086 tiwin, thflags, nxt_pkt)); 14087 } 14088 14089 /* 14090 * Return value of 1, the TCB is unlocked and most 14091 * likely gone, return value of 0, the TCP is still 14092 * locked. 14093 */ 14094 static int 14095 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, 14096 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 14097 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 14098 { 14099 int32_t ret_val = 0; 14100 int32_t orig_tlen = tlen; 14101 struct tcp_rack *rack; 14102 14103 rack = (struct tcp_rack *)tp->t_fb_ptr; 14104 ctf_calc_rwin(so, tp); 14105 if ((thflags & TH_RST) || 14106 (tp->t_fin_is_rst && (thflags & TH_FIN))) 14107 return (__ctf_process_rst(m, th, so, tp, 14108 &rack->r_ctl.challenge_ack_ts, 14109 &rack->r_ctl.challenge_ack_cnt)); 14110 /* 14111 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 14112 * synchronized state. 14113 */ 14114 if (thflags & TH_SYN) { 14115 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 14116 return (ret_val); 14117 } 14118 /* 14119 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 14120 * it's less than ts_recent, drop it. 
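 *
 * For example (illustrative values): with ts_recent == 100, a segment
 * carrying tsval 99 is presumed to be an old duplicate; ctf_ts_check()
 * then drops it (after acking) unless ts_recent itself is ancient, in
 * which case the stored timestamp is invalidated and the segment is
 * allowed to continue.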
14121 */ 14122 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 14123 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 14124 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 14125 return (ret_val); 14126 } 14127 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 14128 &rack->r_ctl.challenge_ack_ts, 14129 &rack->r_ctl.challenge_ack_cnt)) { 14130 return (ret_val); 14131 } 14132 /* 14133 * If last ACK falls within this segment's sequence numbers, record 14134 * its timestamp. NOTE: 1) That the test incorporates suggestions 14135 * from the latest proposal of the tcplw@cray.com list (Braden 14136 * 1993/04/26). 2) That updating only on newer timestamps interferes 14137 * with our earlier PAWS tests, so this check should be solely 14138 * predicated on the sequence space of this segment. 3) That we 14139 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 14140 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 14141 * SEG.Len, This modified check allows us to overcome RFC1323's 14142 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 14143 * p.869. In such cases, we can still calculate the RTT correctly 14144 * when RCV.NXT == Last.ACK.Sent. 14145 */ 14146 if ((to->to_flags & TOF_TS) != 0 && 14147 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 14148 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 14149 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 14150 tp->ts_recent_age = tcp_ts_getticks(); 14151 tp->ts_recent = to->to_tsval; 14152 } 14153 /* 14154 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 14155 * is on (half-synchronized state), then queue data for later 14156 * processing; else drop segment and return. 14157 */ 14158 if ((thflags & TH_ACK) == 0) { 14159 if (tp->t_flags & TF_NEEDSYN) { 14160 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 14161 tiwin, thflags, nxt_pkt)); 14162 14163 } else if (tp->t_flags & TF_ACKNOW) { 14164 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 14165 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 14166 return (ret_val); 14167 } else { 14168 ctf_do_drop(m, NULL); 14169 return (0); 14170 } 14171 } 14172 /* 14173 * Ack processing. 
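 *
 * If the ACK still leaves data sitting in so_snd and no forward
 * progress has been made within the progress-timeout window, the
 * connection is torn down with a reset (ctf_progress_timeout_check()
 * below).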
14174 */ 14175 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val, orig_tlen)) { 14176 return (ret_val); 14177 } 14178 if (sbavail(&so->so_snd)) { 14179 if (ctf_progress_timeout_check(tp, true)) { 14180 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 14181 tp, tick, PROGRESS_DROP, __LINE__); 14182 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 14183 return (1); 14184 } 14185 } 14186 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 14187 tiwin, thflags, nxt_pkt)); 14188 } 14189 14190 static int 14191 rack_check_data_after_close(struct mbuf *m, 14192 struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so) 14193 { 14194 struct tcp_rack *rack; 14195 14196 rack = (struct tcp_rack *)tp->t_fb_ptr; 14197 if (rack->rc_allow_data_af_clo == 0) { 14198 close_now: 14199 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 14200 /* tcp_close will kill the inp pre-log the Reset */ 14201 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 14202 tp = tcp_close(tp); 14203 KMOD_TCPSTAT_INC(tcps_rcvafterclose); 14204 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); 14205 return (1); 14206 } 14207 if (sbavail(&so->so_snd) == 0) 14208 goto close_now; 14209 /* Ok we allow data that is ignored and a followup reset */ 14210 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 14211 tp->rcv_nxt = th->th_seq + *tlen; 14212 tp->t_flags2 |= TF2_DROP_AF_DATA; 14213 rack->r_wanted_output = 1; 14214 *tlen = 0; 14215 return (0); 14216 } 14217 14218 /* 14219 * Return value of 1, the TCB is unlocked and most 14220 * likely gone, return value of 0, the TCP is still 14221 * locked. 14222 */ 14223 static int 14224 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, 14225 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 14226 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 14227 { 14228 int32_t ret_val = 0; 14229 int32_t orig_tlen = tlen; 14230 int32_t ourfinisacked = 0; 14231 struct tcp_rack *rack; 14232 14233 rack = (struct tcp_rack *)tp->t_fb_ptr; 14234 ctf_calc_rwin(so, tp); 14235 14236 if ((thflags & TH_RST) || 14237 (tp->t_fin_is_rst && (thflags & TH_FIN))) 14238 return (__ctf_process_rst(m, th, so, tp, 14239 &rack->r_ctl.challenge_ack_ts, 14240 &rack->r_ctl.challenge_ack_cnt)); 14241 /* 14242 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 14243 * synchronized state. 14244 */ 14245 if (thflags & TH_SYN) { 14246 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 14247 return (ret_val); 14248 } 14249 /* 14250 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 14251 * it's less than ts_recent, drop it. 14252 */ 14253 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 14254 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 14255 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 14256 return (ret_val); 14257 } 14258 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 14259 &rack->r_ctl.challenge_ack_ts, 14260 &rack->r_ctl.challenge_ack_cnt)) { 14261 return (ret_val); 14262 } 14263 /* 14264 * If new data are received on a connection after the user processes 14265 * are gone, then RST the other end. 14266 */ 14267 if ((tp->t_flags & TF_CLOSED) && tlen && 14268 rack_check_data_after_close(m, tp, &tlen, th, so)) 14269 return (1); 14270 /* 14271 * If last ACK falls within this segment's sequence numbers, record 14272 * its timestamp. 
NOTE: 1) That the test incorporates suggestions 14273 * from the latest proposal of the tcplw@cray.com list (Braden 14274 * 1993/04/26). 2) That updating only on newer timestamps interferes 14275 * with our earlier PAWS tests, so this check should be solely 14276 * predicated on the sequence space of this segment. 3) That we 14277 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 14278 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 14279 * SEG.Len, This modified check allows us to overcome RFC1323's 14280 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 14281 * p.869. In such cases, we can still calculate the RTT correctly 14282 * when RCV.NXT == Last.ACK.Sent. 14283 */ 14284 if ((to->to_flags & TOF_TS) != 0 && 14285 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 14286 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 14287 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 14288 tp->ts_recent_age = tcp_ts_getticks(); 14289 tp->ts_recent = to->to_tsval; 14290 } 14291 /* 14292 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 14293 * is on (half-synchronized state), then queue data for later 14294 * processing; else drop segment and return. 14295 */ 14296 if ((thflags & TH_ACK) == 0) { 14297 if (tp->t_flags & TF_NEEDSYN) { 14298 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 14299 tiwin, thflags, nxt_pkt)); 14300 } else if (tp->t_flags & TF_ACKNOW) { 14301 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 14302 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 14303 return (ret_val); 14304 } else { 14305 ctf_do_drop(m, NULL); 14306 return (0); 14307 } 14308 } 14309 /* 14310 * Ack processing. 14311 */ 14312 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) { 14313 return (ret_val); 14314 } 14315 if (ourfinisacked) { 14316 /* 14317 * If we can't receive any more data, then closing user can 14318 * proceed. Starting the timer is contrary to the 14319 * specification, but if we don't get a FIN we'll hang 14320 * forever. 14321 * 14322 * XXXjl: we should release the tp also, and use a 14323 * compressed state. 14324 */ 14325 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 14326 soisdisconnected(so); 14327 tcp_timer_activate(tp, TT_2MSL, 14328 (tcp_fast_finwait2_recycle ? 14329 tcp_finwait2_timeout : 14330 TP_MAXIDLE(tp))); 14331 } 14332 tcp_state_change(tp, TCPS_FIN_WAIT_2); 14333 } 14334 if (sbavail(&so->so_snd)) { 14335 if (ctf_progress_timeout_check(tp, true)) { 14336 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 14337 tp, tick, PROGRESS_DROP, __LINE__); 14338 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 14339 return (1); 14340 } 14341 } 14342 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 14343 tiwin, thflags, nxt_pkt)); 14344 } 14345 14346 /* 14347 * Return value of 1, the TCB is unlocked and most 14348 * likely gone, return value of 0, the TCP is still 14349 * locked. 
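 *
 * Callers therefore must not touch tp after a non-zero return; the
 * reference may already be gone.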
14350 */ 14351 static int 14352 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, 14353 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 14354 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 14355 { 14356 int32_t ret_val = 0; 14357 int32_t orig_tlen = tlen; 14358 int32_t ourfinisacked = 0; 14359 struct tcp_rack *rack; 14360 14361 rack = (struct tcp_rack *)tp->t_fb_ptr; 14362 ctf_calc_rwin(so, tp); 14363 14364 if ((thflags & TH_RST) || 14365 (tp->t_fin_is_rst && (thflags & TH_FIN))) 14366 return (__ctf_process_rst(m, th, so, tp, 14367 &rack->r_ctl.challenge_ack_ts, 14368 &rack->r_ctl.challenge_ack_cnt)); 14369 /* 14370 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 14371 * synchronized state. 14372 */ 14373 if (thflags & TH_SYN) { 14374 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 14375 return (ret_val); 14376 } 14377 /* 14378 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 14379 * it's less than ts_recent, drop it. 14380 */ 14381 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 14382 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 14383 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 14384 return (ret_val); 14385 } 14386 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 14387 &rack->r_ctl.challenge_ack_ts, 14388 &rack->r_ctl.challenge_ack_cnt)) { 14389 return (ret_val); 14390 } 14391 /* 14392 * If last ACK falls within this segment's sequence numbers, record 14393 * its timestamp. NOTE: 1) That the test incorporates suggestions 14394 * from the latest proposal of the tcplw@cray.com list (Braden 14395 * 1993/04/26). 2) That updating only on newer timestamps interferes 14396 * with our earlier PAWS tests, so this check should be solely 14397 * predicated on the sequence space of this segment. 3) That we 14398 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 14399 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 14400 * SEG.Len, This modified check allows us to overcome RFC1323's 14401 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 14402 * p.869. In such cases, we can still calculate the RTT correctly 14403 * when RCV.NXT == Last.ACK.Sent. 14404 */ 14405 if ((to->to_flags & TOF_TS) != 0 && 14406 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 14407 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 14408 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 14409 tp->ts_recent_age = tcp_ts_getticks(); 14410 tp->ts_recent = to->to_tsval; 14411 } 14412 /* 14413 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 14414 * is on (half-synchronized state), then queue data for later 14415 * processing; else drop segment and return. 14416 */ 14417 if ((thflags & TH_ACK) == 0) { 14418 if (tp->t_flags & TF_NEEDSYN) { 14419 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 14420 tiwin, thflags, nxt_pkt)); 14421 } else if (tp->t_flags & TF_ACKNOW) { 14422 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 14423 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 14424 return (ret_val); 14425 } else { 14426 ctf_do_drop(m, NULL); 14427 return (0); 14428 } 14429 } 14430 /* 14431 * Ack processing. 
14432 */ 14433 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) { 14434 return (ret_val); 14435 } 14436 if (ourfinisacked) { 14437 tcp_twstart(tp); 14438 m_freem(m); 14439 return (1); 14440 } 14441 if (sbavail(&so->so_snd)) { 14442 if (ctf_progress_timeout_check(tp, true)) { 14443 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 14444 tp, tick, PROGRESS_DROP, __LINE__); 14445 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 14446 return (1); 14447 } 14448 } 14449 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 14450 tiwin, thflags, nxt_pkt)); 14451 } 14452 14453 /* 14454 * Return value of 1, the TCB is unlocked and most 14455 * likely gone, return value of 0, the TCP is still 14456 * locked. 14457 */ 14458 static int 14459 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 14460 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 14461 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 14462 { 14463 int32_t ret_val = 0; 14464 int32_t orig_tlen; 14465 int32_t ourfinisacked = 0; 14466 struct tcp_rack *rack; 14467 14468 rack = (struct tcp_rack *)tp->t_fb_ptr; 14469 ctf_calc_rwin(so, tp); 14470 14471 if ((thflags & TH_RST) || 14472 (tp->t_fin_is_rst && (thflags & TH_FIN))) 14473 return (__ctf_process_rst(m, th, so, tp, 14474 &rack->r_ctl.challenge_ack_ts, 14475 &rack->r_ctl.challenge_ack_cnt)); 14476 /* 14477 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 14478 * synchronized state. 14479 */ 14480 if (thflags & TH_SYN) { 14481 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 14482 return (ret_val); 14483 } 14484 /* 14485 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 14486 * it's less than ts_recent, drop it. 14487 */ 14488 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 14489 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 14490 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 14491 return (ret_val); 14492 } 14493 orig_tlen = tlen; 14494 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 14495 &rack->r_ctl.challenge_ack_ts, 14496 &rack->r_ctl.challenge_ack_cnt)) { 14497 return (ret_val); 14498 } 14499 /* 14500 * If last ACK falls within this segment's sequence numbers, record 14501 * its timestamp. NOTE: 1) That the test incorporates suggestions 14502 * from the latest proposal of the tcplw@cray.com list (Braden 14503 * 1993/04/26). 2) That updating only on newer timestamps interferes 14504 * with our earlier PAWS tests, so this check should be solely 14505 * predicated on the sequence space of this segment. 3) That we 14506 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 14507 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 14508 * SEG.Len, This modified check allows us to overcome RFC1323's 14509 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 14510 * p.869. In such cases, we can still calculate the RTT correctly 14511 * when RCV.NXT == Last.ACK.Sent. 14512 */ 14513 if ((to->to_flags & TOF_TS) != 0 && 14514 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 14515 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 14516 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 14517 tp->ts_recent_age = tcp_ts_getticks(); 14518 tp->ts_recent = to->to_tsval; 14519 } 14520 /* 14521 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 14522 * is on (half-synchronized state), then queue data for later 14523 * processing; else drop segment and return. 
14524 */ 14525 if ((thflags & TH_ACK) == 0) { 14526 if (tp->t_flags & TF_NEEDSYN) { 14527 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 14528 tiwin, thflags, nxt_pkt)); 14529 } else if (tp->t_flags & TF_ACKNOW) { 14530 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 14531 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 14532 return (ret_val); 14533 } else { 14534 ctf_do_drop(m, NULL); 14535 return (0); 14536 } 14537 } 14538 /* 14539 * case TCPS_LAST_ACK: Ack processing. 14540 */ 14541 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) { 14542 return (ret_val); 14543 } 14544 if (ourfinisacked) { 14545 tp = tcp_close(tp); 14546 ctf_do_drop(m, tp); 14547 return (1); 14548 } 14549 if (sbavail(&so->so_snd)) { 14550 if (ctf_progress_timeout_check(tp, true)) { 14551 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 14552 tp, tick, PROGRESS_DROP, __LINE__); 14553 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 14554 return (1); 14555 } 14556 } 14557 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 14558 tiwin, thflags, nxt_pkt)); 14559 } 14560 14561 /* 14562 * Return value of 1, the TCB is unlocked and most 14563 * likely gone, return value of 0, the TCP is still 14564 * locked. 14565 */ 14566 static int 14567 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, 14568 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 14569 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 14570 { 14571 int32_t ret_val = 0; 14572 int32_t orig_tlen = tlen; 14573 int32_t ourfinisacked = 0; 14574 struct tcp_rack *rack; 14575 14576 rack = (struct tcp_rack *)tp->t_fb_ptr; 14577 ctf_calc_rwin(so, tp); 14578 14579 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 14580 if ((thflags & TH_RST) || 14581 (tp->t_fin_is_rst && (thflags & TH_FIN))) 14582 return (__ctf_process_rst(m, th, so, tp, 14583 &rack->r_ctl.challenge_ack_ts, 14584 &rack->r_ctl.challenge_ack_cnt)); 14585 /* 14586 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 14587 * synchronized state. 14588 */ 14589 if (thflags & TH_SYN) { 14590 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 14591 return (ret_val); 14592 } 14593 /* 14594 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 14595 * it's less than ts_recent, drop it. 14596 */ 14597 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 14598 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 14599 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 14600 return (ret_val); 14601 } 14602 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 14603 &rack->r_ctl.challenge_ack_ts, 14604 &rack->r_ctl.challenge_ack_cnt)) { 14605 return (ret_val); 14606 } 14607 /* 14608 * If new data are received on a connection after the user processes 14609 * are gone, then RST the other end. 14610 */ 14611 if ((tp->t_flags & TF_CLOSED) && tlen && 14612 rack_check_data_after_close(m, tp, &tlen, th, so)) 14613 return (1); 14614 /* 14615 * If last ACK falls within this segment's sequence numbers, record 14616 * its timestamp. NOTE: 1) That the test incorporates suggestions 14617 * from the latest proposal of the tcplw@cray.com list (Braden 14618 * 1993/04/26). 2) That updating only on newer timestamps interferes 14619 * with our earlier PAWS tests, so this check should be solely 14620 * predicated on the sequence space of this segment. 
3) That we 14621 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 14622 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 14623 * SEG.Len, This modified check allows us to overcome RFC1323's 14624 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 14625 * p.869. In such cases, we can still calculate the RTT correctly 14626 * when RCV.NXT == Last.ACK.Sent. 14627 */ 14628 if ((to->to_flags & TOF_TS) != 0 && 14629 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 14630 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 14631 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 14632 tp->ts_recent_age = tcp_ts_getticks(); 14633 tp->ts_recent = to->to_tsval; 14634 } 14635 /* 14636 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 14637 * is on (half-synchronized state), then queue data for later 14638 * processing; else drop segment and return. 14639 */ 14640 if ((thflags & TH_ACK) == 0) { 14641 if (tp->t_flags & TF_NEEDSYN) { 14642 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 14643 tiwin, thflags, nxt_pkt)); 14644 } else if (tp->t_flags & TF_ACKNOW) { 14645 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 14646 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 14647 return (ret_val); 14648 } else { 14649 ctf_do_drop(m, NULL); 14650 return (0); 14651 } 14652 } 14653 /* 14654 * Ack processing. 14655 */ 14656 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) { 14657 return (ret_val); 14658 } 14659 if (sbavail(&so->so_snd)) { 14660 if (ctf_progress_timeout_check(tp, true)) { 14661 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 14662 tp, tick, PROGRESS_DROP, __LINE__); 14663 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 14664 return (1); 14665 } 14666 } 14667 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 14668 tiwin, thflags, nxt_pkt)); 14669 } 14670 14671 static void inline 14672 rack_clear_rate_sample(struct tcp_rack *rack) 14673 { 14674 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY; 14675 rack->r_ctl.rack_rs.rs_rtt_cnt = 0; 14676 rack->r_ctl.rack_rs.rs_rtt_tot = 0; 14677 } 14678 14679 static void 14680 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override) 14681 { 14682 uint64_t bw_est, rate_wanted; 14683 int chged = 0; 14684 uint32_t user_max, orig_min, orig_max; 14685 14686 #ifdef TCP_REQUEST_TRK 14687 if (rack->rc_hybrid_mode && 14688 (rack->r_ctl.rc_pace_max_segs != 0) && 14689 (rack_hybrid_allow_set_maxseg == 1) && 14690 (rack->r_ctl.rc_last_sft != NULL)) { 14691 rack->r_ctl.rc_last_sft->hybrid_flags &= ~TCP_HYBRID_PACING_SETMSS; 14692 return; 14693 } 14694 #endif 14695 orig_min = rack->r_ctl.rc_pace_min_segs; 14696 orig_max = rack->r_ctl.rc_pace_max_segs; 14697 user_max = ctf_fixed_maxseg(tp) * rack->rc_user_set_max_segs; 14698 if (ctf_fixed_maxseg(tp) != rack->r_ctl.rc_pace_min_segs) 14699 chged = 1; 14700 rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp); 14701 if (rack->use_fixed_rate || rack->rc_force_max_seg) { 14702 if (user_max != rack->r_ctl.rc_pace_max_segs) 14703 chged = 1; 14704 } 14705 if (rack->rc_force_max_seg) { 14706 rack->r_ctl.rc_pace_max_segs = user_max; 14707 } else if (rack->use_fixed_rate) { 14708 bw_est = rack_get_bw(rack); 14709 if ((rack->r_ctl.crte == NULL) || 14710 (bw_est != rack->r_ctl.crte->rate)) { 14711 rack->r_ctl.rc_pace_max_segs = user_max; 14712 } else { 14713 /* We are pacing right at the hardware rate */ 14714 uint32_t segsiz, pace_one; 14715 14716 
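/*
 * Size the burst from the established hardware rate: pick the pacing
 * segment size, honor the pace-one-segment overrides, and let
 * tcp_get_pacing_burst_size_w_divisor() convert the rate into a byte
 * cap for rc_pace_max_segs.
 */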
if (rack_pace_one_seg || 14717 (rack->r_ctl.rc_user_set_min_segs == 1)) 14718 pace_one = 1; 14719 else 14720 pace_one = 0; 14721 segsiz = min(ctf_fixed_maxseg(tp), 14722 rack->r_ctl.rc_pace_min_segs); 14723 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size_w_divisor( 14724 tp, bw_est, segsiz, pace_one, 14725 rack->r_ctl.crte, NULL, rack->r_ctl.pace_len_divisor); 14726 } 14727 } else if (rack->rc_always_pace) { 14728 if (rack->r_ctl.gp_bw || 14729 rack->r_ctl.init_rate) { 14730 /* We have a rate of some sort set */ 14731 uint32_t orig; 14732 14733 bw_est = rack_get_bw(rack); 14734 orig = rack->r_ctl.rc_pace_max_segs; 14735 if (fill_override) 14736 rate_wanted = *fill_override; 14737 else 14738 rate_wanted = rack_get_gp_est(rack); 14739 if (rate_wanted) { 14740 /* We have something */ 14741 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, 14742 rate_wanted, 14743 ctf_fixed_maxseg(rack->rc_tp)); 14744 } else 14745 rack->r_ctl.rc_pace_max_segs = rack->r_ctl.rc_pace_min_segs; 14746 if (orig != rack->r_ctl.rc_pace_max_segs) 14747 chged = 1; 14748 } else if ((rack->r_ctl.gp_bw == 0) && 14749 (rack->r_ctl.rc_pace_max_segs == 0)) { 14750 /* 14751 * If we have nothing limit us to bursting 14752 * out IW sized pieces. 14753 */ 14754 chged = 1; 14755 rack->r_ctl.rc_pace_max_segs = rc_init_window(rack); 14756 } 14757 } 14758 if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) { 14759 chged = 1; 14760 rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES; 14761 } 14762 if (chged) 14763 rack_log_type_pacing_sizes(tp, rack, orig_min, orig_max, line, 2); 14764 } 14765 14766 14767 static void 14768 rack_init_fsb_block(struct tcpcb *tp, struct tcp_rack *rack, int32_t flags) 14769 { 14770 #ifdef INET6 14771 struct ip6_hdr *ip6 = NULL; 14772 #endif 14773 #ifdef INET 14774 struct ip *ip = NULL; 14775 #endif 14776 struct udphdr *udp = NULL; 14777 14778 /* Ok lets fill in the fast block, it can only be used with no IP options! 
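 * (The fast send block caches a pre-built IP or IPv6 header, an optional
 * UDP tunneling header, and the TCP header template, presumably so the
 * fast output path can reuse them rather than rebuilding headers per send.)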
*/ 14779 #ifdef INET6 14780 if (rack->r_is_v6) { 14781 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 14782 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 14783 if (tp->t_port) { 14784 rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr); 14785 udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr)); 14786 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 14787 udp->uh_dport = tp->t_port; 14788 rack->r_ctl.fsb.udp = udp; 14789 rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1); 14790 } else 14791 { 14792 rack->r_ctl.fsb.th = (struct tcphdr *)(ip6 + 1); 14793 rack->r_ctl.fsb.udp = NULL; 14794 } 14795 tcpip_fillheaders(rack->rc_inp, 14796 tp->t_port, 14797 ip6, rack->r_ctl.fsb.th); 14798 rack->r_ctl.fsb.hoplimit = in6_selecthlim(rack->rc_inp, NULL); 14799 } else 14800 #endif /* INET6 */ 14801 #ifdef INET 14802 { 14803 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr); 14804 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 14805 if (tp->t_port) { 14806 rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr); 14807 udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip)); 14808 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 14809 udp->uh_dport = tp->t_port; 14810 rack->r_ctl.fsb.udp = udp; 14811 rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1); 14812 } else 14813 { 14814 rack->r_ctl.fsb.udp = NULL; 14815 rack->r_ctl.fsb.th = (struct tcphdr *)(ip + 1); 14816 } 14817 tcpip_fillheaders(rack->rc_inp, 14818 tp->t_port, 14819 ip, rack->r_ctl.fsb.th); 14820 rack->r_ctl.fsb.hoplimit = tptoinpcb(tp)->inp_ip_ttl; 14821 } 14822 #endif 14823 rack->r_ctl.fsb.recwin = lmin(lmax(sbspace(&tptosocket(tp)->so_rcv), 0), 14824 (long)TCP_MAXWIN << tp->rcv_scale); 14825 rack->r_fsb_inited = 1; 14826 } 14827 14828 static int 14829 rack_init_fsb(struct tcpcb *tp, struct tcp_rack *rack) 14830 { 14831 /* 14832 * Allocate the larger of spaces V6 if available else just 14833 * V4 and include udphdr (overbook) 14834 */ 14835 #ifdef INET6 14836 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + sizeof(struct udphdr); 14837 #else 14838 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr) + sizeof(struct udphdr); 14839 #endif 14840 rack->r_ctl.fsb.tcp_ip_hdr = malloc(rack->r_ctl.fsb.tcp_ip_hdr_len, 14841 M_TCPFSB, M_NOWAIT|M_ZERO); 14842 if (rack->r_ctl.fsb.tcp_ip_hdr == NULL) { 14843 return (ENOMEM); 14844 } 14845 rack->r_fsb_inited = 0; 14846 return (0); 14847 } 14848 14849 static void 14850 rack_log_hystart_event(struct tcp_rack *rack, uint32_t high_seq, uint8_t mod) 14851 { 14852 /* 14853 * Types of logs (mod value) 14854 * 20 - Initial round setup 14855 * 21 - Rack declares a new round. 
14856 */ 14857 struct tcpcb *tp; 14858 14859 tp = rack->rc_tp; 14860 if (tcp_bblogging_on(tp)) { 14861 union tcp_log_stackspecific log; 14862 struct timeval tv; 14863 14864 memset(&log, 0, sizeof(log)); 14865 log.u_bbr.flex1 = rack->r_ctl.current_round; 14866 log.u_bbr.flex2 = rack->r_ctl.roundends; 14867 log.u_bbr.flex3 = high_seq; 14868 log.u_bbr.flex4 = tp->snd_max; 14869 log.u_bbr.flex8 = mod; 14870 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 14871 log.u_bbr.cur_del_rate = rack->rc_tp->t_sndbytes; 14872 log.u_bbr.delRate = rack->rc_tp->t_snd_rxt_bytes; 14873 TCP_LOG_EVENTP(tp, NULL, 14874 &tptosocket(tp)->so_rcv, 14875 &tptosocket(tp)->so_snd, 14876 TCP_HYSTART, 0, 14877 0, &log, false, &tv); 14878 } 14879 } 14880 14881 static void 14882 rack_deferred_init(struct tcpcb *tp, struct tcp_rack *rack) 14883 { 14884 rack->rack_deferred_inited = 1; 14885 rack->r_ctl.roundends = tp->snd_max; 14886 rack->r_ctl.rc_high_rwnd = tp->snd_wnd; 14887 rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 14888 } 14889 14890 static void 14891 rack_init_retransmit_value(struct tcp_rack *rack, int ctl) 14892 { 14893 /* Retransmit bit controls. 14894 * 14895 * The setting of these values controls one of 14896 * three modes you can have and dictates 14897 * how rack does retransmissions. Note this 14898 * applies in *any* mode, i.e. pacing on or off, DGP, 14899 * fixed rate pacing, or just bursting rack. 14900 * 14901 * 1 - Use full sized retransmits i.e. limit 14902 * the size to whatever the pace_max_segments 14903 * size is. 14904 * 14905 * 2 - Use pacer min granularity as a guide to 14906 * the size combined with the current calculated 14907 * goodput b/w measurement. So for example if 14908 * the goodput is measured at 20Mbps we would 14909 * calculate 8125 (pacer minimum 250usec in 14910 * that b/w) and then round it up to the next 14911 * MSS i.e. for 1448 mss 6 MSS or 8688 bytes. 14912 * 14913 * 0 - The rack default 1 MSS (anything not 0/1/2 14914 * falls here too if we are setting via rack_init()).
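 * In short: ctl == 1 sets full_size_rxt, ctl == 2 sets
 * shape_rxt_to_pacing_min, and any other value clears both,
 * giving the default single-MSS behavior.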
14915 * 14916 */ 14917 if (ctl == 1) { 14918 rack->full_size_rxt = 1; 14919 rack->shape_rxt_to_pacing_min = 0; 14920 } else if (ctl == 2) { 14921 rack->full_size_rxt = 0; 14922 rack->shape_rxt_to_pacing_min = 1; 14923 } else { 14924 rack->full_size_rxt = 0; 14925 rack->shape_rxt_to_pacing_min = 0; 14926 } 14927 } 14928 14929 static void 14930 rack_log_chg_info(struct tcpcb *tp, struct tcp_rack *rack, uint8_t mod, 14931 uint32_t flex1, 14932 uint32_t flex2, 14933 uint32_t flex3) 14934 { 14935 if (tcp_bblogging_on(rack->rc_tp)) { 14936 union tcp_log_stackspecific log; 14937 struct timeval tv; 14938 14939 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 14940 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 14941 log.u_bbr.flex8 = mod; 14942 log.u_bbr.flex1 = flex1; 14943 log.u_bbr.flex2 = flex2; 14944 log.u_bbr.flex3 = flex3; 14945 tcp_log_event(tp, NULL, NULL, NULL, TCP_CHG_QUERY, 0, 14946 0, &log, false, NULL, __func__, __LINE__, &tv); 14947 } 14948 } 14949 14950 static int 14951 rack_chg_query(struct tcpcb *tp, struct tcp_query_resp *reqr) 14952 { 14953 struct tcp_rack *rack; 14954 struct rack_sendmap *rsm; 14955 int i; 14956 14957 14958 rack = (struct tcp_rack *)tp->t_fb_ptr; 14959 switch (reqr->req) { 14960 case TCP_QUERY_SENDMAP: 14961 if ((reqr->req_param == tp->snd_max) || 14962 (tp->snd_max == tp->snd_una)){ 14963 /* Unlikely */ 14964 return (0); 14965 } 14966 rsm = tqhash_find(rack->r_ctl.tqh, reqr->req_param); 14967 if (rsm == NULL) { 14968 /* Can't find that seq -- unlikely */ 14969 return (0); 14970 } 14971 reqr->sendmap_start = rsm->r_start; 14972 reqr->sendmap_end = rsm->r_end; 14973 reqr->sendmap_send_cnt = rsm->r_rtr_cnt; 14974 reqr->sendmap_fas = rsm->r_fas; 14975 if (reqr->sendmap_send_cnt > SNDMAP_NRTX) 14976 reqr->sendmap_send_cnt = SNDMAP_NRTX; 14977 for(i=0; i<reqr->sendmap_send_cnt; i++) 14978 reqr->sendmap_time[i] = rsm->r_tim_lastsent[i]; 14979 reqr->sendmap_ack_arrival = rsm->r_ack_arrival; 14980 reqr->sendmap_flags = rsm->r_flags & SNDMAP_MASK; 14981 reqr->sendmap_r_rtr_bytes = rsm->r_rtr_bytes; 14982 reqr->sendmap_dupacks = rsm->r_dupack; 14983 rack_log_chg_info(tp, rack, 1, 14984 rsm->r_start, 14985 rsm->r_end, 14986 rsm->r_flags); 14987 return(1); 14988 break; 14989 case TCP_QUERY_TIMERS_UP: 14990 if (rack->r_ctl.rc_hpts_flags == 0) { 14991 /* no timers up */ 14992 return (0); 14993 } 14994 reqr->timer_hpts_flags = rack->r_ctl.rc_hpts_flags; 14995 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 14996 reqr->timer_pacing_to = rack->r_ctl.rc_last_output_to; 14997 } 14998 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 14999 reqr->timer_timer_exp = rack->r_ctl.rc_timer_exp; 15000 } 15001 rack_log_chg_info(tp, rack, 2, 15002 rack->r_ctl.rc_hpts_flags, 15003 rack->r_ctl.rc_last_output_to, 15004 rack->r_ctl.rc_timer_exp); 15005 return (1); 15006 break; 15007 case TCP_QUERY_RACK_TIMES: 15008 /* Reordering items */ 15009 reqr->rack_num_dsacks = rack->r_ctl.num_dsack; 15010 reqr->rack_reorder_ts = rack->r_ctl.rc_reorder_ts; 15011 /* Timestamps and timers */ 15012 reqr->rack_rxt_last_time = rack->r_ctl.rc_tlp_rxt_last_time; 15013 reqr->rack_min_rtt = rack->r_ctl.rc_rack_min_rtt; 15014 reqr->rack_rtt = rack->rc_rack_rtt; 15015 reqr->rack_tmit_time = rack->r_ctl.rc_rack_tmit_time; 15016 reqr->rack_srtt_measured = rack->rc_srtt_measure_made; 15017 /* PRR data */ 15018 reqr->rack_sacked = rack->r_ctl.rc_sacked; 15019 reqr->rack_holes_rxt = rack->r_ctl.rc_holes_rxt; 15020 reqr->rack_prr_delivered = rack->r_ctl.rc_prr_delivered; 15021 reqr->rack_prr_recovery_fs =
rack->r_ctl.rc_prr_recovery_fs; 15022 reqr->rack_prr_sndcnt = rack->r_ctl.rc_prr_sndcnt; 15023 reqr->rack_prr_out = rack->r_ctl.rc_prr_out; 15024 /* TLP and persists info */ 15025 reqr->rack_tlp_out = rack->rc_tlp_in_progress; 15026 reqr->rack_tlp_cnt_out = rack->r_ctl.rc_tlp_cnt_out; 15027 if (rack->rc_in_persist) { 15028 reqr->rack_time_went_idle = rack->r_ctl.rc_went_idle_time; 15029 reqr->rack_in_persist = 1; 15030 } else { 15031 reqr->rack_time_went_idle = 0; 15032 reqr->rack_in_persist = 0; 15033 } 15034 if (rack->r_wanted_output) 15035 reqr->rack_wanted_output = 1; 15036 else 15037 reqr->rack_wanted_output = 0; 15038 return (1); 15039 break; 15040 default: 15041 return (-EINVAL); 15042 } 15043 } 15044 15045 static void 15046 rack_switch_failed(struct tcpcb *tp) 15047 { 15048 /* 15049 * This method gets called if a stack switch was 15050 * attempted and it failed. We are left as we were, 15051 * but our hpts timers were stopped and we 15052 * need to validate time units and t_flags2. 15053 */ 15054 struct tcp_rack *rack; 15055 struct timeval tv; 15056 uint32_t cts; 15057 uint32_t toval; 15058 struct hpts_diag diag; 15059 15060 rack = (struct tcp_rack *)tp->t_fb_ptr; 15061 tcp_change_time_units(tp, TCP_TMR_GRANULARITY_USEC); 15062 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 15063 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 15064 else 15065 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ; 15066 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) 15067 tp->t_flags2 |= TF2_MBUF_ACKCMP; 15068 if (tp->t_in_hpts > IHPTS_NONE) { 15069 /* Strange */ 15070 return; 15071 } 15072 cts = tcp_get_usecs(&tv); 15073 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 15074 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) { 15075 toval = rack->r_ctl.rc_last_output_to - cts; 15076 } else { 15077 /* one slot please */ 15078 toval = HPTS_TICKS_PER_SLOT; 15079 } 15080 } else if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 15081 if (TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { 15082 toval = rack->r_ctl.rc_timer_exp - cts; 15083 } else { 15084 /* one slot please */ 15085 toval = HPTS_TICKS_PER_SLOT; 15086 } 15087 } else 15088 toval = HPTS_TICKS_PER_SLOT; 15089 (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(toval), 15090 __LINE__, &diag); 15091 rack_log_hpts_diag(rack, cts, &diag, &tv); 15092 } 15093 15094 static int 15095 rack_init_outstanding(struct tcpcb *tp, struct tcp_rack *rack, uint32_t us_cts, void *ptr) 15096 { 15097 struct rack_sendmap *rsm, *ersm; 15098 int insret __diagused; 15099 /* 15100 * When initing outstanding, we must be quite careful 15101 * to not refer to tp->t_fb_ptr. This has the old rack 15102 * pointer in it, not the "new" one (when we are doing 15103 * a stack switch).
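 * The ptr argument is the not-yet-installed rack control block being
 * built by rack_init(); on an allocation failure it is freed here and
 * ENOMEM is returned so rack_init() can fail cleanly.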
15104 */ 15105 15106 15107 if (tp->t_fb->tfb_chg_query == NULL) { 15108 /* Create a send map for the current outstanding data */ 15109 15110 rsm = rack_alloc(rack); 15111 if (rsm == NULL) { 15112 uma_zfree(rack_pcb_zone, ptr); 15113 return (ENOMEM); 15114 } 15115 rsm->r_no_rtt_allowed = 1; 15116 rsm->r_tim_lastsent[0] = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 15117 rsm->r_rtr_cnt = 1; 15118 rsm->r_rtr_bytes = 0; 15119 if (tp->t_flags & TF_SENTFIN) 15120 rsm->r_flags |= RACK_HAS_FIN; 15121 rsm->r_end = tp->snd_max; 15122 if (tp->snd_una == tp->iss) { 15123 /* The data space is one beyond snd_una */ 15124 rsm->r_flags |= RACK_HAS_SYN; 15125 rsm->r_start = tp->iss; 15126 rsm->r_end = rsm->r_start + (tp->snd_max - tp->snd_una); 15127 } else 15128 rsm->r_start = tp->snd_una; 15129 rsm->r_dupack = 0; 15130 if (rack->rc_inp->inp_socket->so_snd.sb_mb != NULL) { 15131 rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 0, &rsm->soff); 15132 if (rsm->m) { 15133 rsm->orig_m_len = rsm->m->m_len; 15134 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 15135 } else { 15136 rsm->orig_m_len = 0; 15137 rsm->orig_t_space = 0; 15138 } 15139 } else { 15140 /* 15141 * This can happen if we have a stand-alone FIN or 15142 * SYN. 15143 */ 15144 rsm->m = NULL; 15145 rsm->orig_m_len = 0; 15146 rsm->orig_t_space = 0; 15147 rsm->soff = 0; 15148 } 15149 #ifdef INVARIANTS 15150 if ((insret = tqhash_insert(rack->r_ctl.tqh, rsm)) != 0) { 15151 panic("Insert in tailq_hash fails ret:%d rack:%p rsm:%p", 15152 insret, rack, rsm); 15153 } 15154 #else 15155 (void)tqhash_insert(rack->r_ctl.tqh, rsm); 15156 #endif 15157 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 15158 rsm->r_in_tmap = 1; 15159 } else { 15160 /* We have a query mechanism, lets use it */ 15161 struct tcp_query_resp qr; 15162 int i; 15163 tcp_seq at; 15164 15165 at = tp->snd_una; 15166 while (at != tp->snd_max) { 15167 memset(&qr, 0, sizeof(qr)); 15168 qr.req = TCP_QUERY_SENDMAP; 15169 qr.req_param = at; 15170 if ((*tp->t_fb->tfb_chg_query)(tp, &qr) == 0) 15171 break; 15172 /* Move forward */ 15173 at = qr.sendmap_end; 15174 /* Now lets build the entry for this one */ 15175 rsm = rack_alloc(rack); 15176 if (rsm == NULL) { 15177 uma_zfree(rack_pcb_zone, ptr); 15178 return (ENOMEM); 15179 } 15180 memset(rsm, 0, sizeof(struct rack_sendmap)); 15181 /* Now configure the rsm and insert it */ 15182 rsm->r_dupack = qr.sendmap_dupacks; 15183 rsm->r_start = qr.sendmap_start; 15184 rsm->r_end = qr.sendmap_end; 15185 if (qr.sendmap_fas) 15186 rsm->r_fas = qr.sendmap_end; 15187 else 15188 rsm->r_fas = rsm->r_start - tp->snd_una; 15189 /* 15190 * We have carefully aligned the bits 15191 * so that all we have to do is copy over 15192 * the bits with the mask. 
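 * Entries that are not yet ACKED are also threaded onto the
 * transmit-time ordered tmap (before the first entry sent later than
 * this one, else at the tail); ACKED entries at most update rc_sacklast.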
15193 */ 15194 rsm->r_flags = qr.sendmap_flags & SNDMAP_MASK; 15195 rsm->r_rtr_bytes = qr.sendmap_r_rtr_bytes; 15196 rsm->r_rtr_cnt = qr.sendmap_send_cnt; 15197 rsm->r_ack_arrival = qr.sendmap_ack_arrival; 15198 for (i=0 ; i<rsm->r_rtr_cnt; i++) 15199 rsm->r_tim_lastsent[i] = qr.sendmap_time[i]; 15200 rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 15201 (rsm->r_start - tp->snd_una), &rsm->soff); 15202 if (rsm->m) { 15203 rsm->orig_m_len = rsm->m->m_len; 15204 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 15205 } else { 15206 rsm->orig_m_len = 0; 15207 rsm->orig_t_space = 0; 15208 } 15209 #ifdef INVARIANTS 15210 if ((insret = tqhash_insert(rack->r_ctl.tqh, rsm)) != 0) { 15211 panic("Insert in tailq_hash fails ret:%d rack:%p rsm:%p", 15212 insret, rack, rsm); 15213 } 15214 #else 15215 (void)tqhash_insert(rack->r_ctl.tqh, rsm); 15216 #endif 15217 if ((rsm->r_flags & RACK_ACKED) == 0) { 15218 TAILQ_FOREACH(ersm, &rack->r_ctl.rc_tmap, r_tnext) { 15219 if (ersm->r_tim_lastsent[(ersm->r_rtr_cnt-1)] > 15220 rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]) { 15221 /* 15222 * If the existing ersm was sent at 15223 * a later time than the new one, then 15224 * the new one should appear ahead of this 15225 * ersm. 15226 */ 15227 rsm->r_in_tmap = 1; 15228 TAILQ_INSERT_BEFORE(ersm, rsm, r_tnext); 15229 break; 15230 } 15231 } 15232 if (rsm->r_in_tmap == 0) { 15233 /* 15234 * Not found so shove it on the tail. 15235 */ 15236 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 15237 rsm->r_in_tmap = 1; 15238 } 15239 } else { 15240 if ((rack->r_ctl.rc_sacklast == NULL) || 15241 (SEQ_GT(rsm->r_end, rack->r_ctl.rc_sacklast->r_end))) { 15242 rack->r_ctl.rc_sacklast = rsm; 15243 } 15244 } 15245 rack_log_chg_info(tp, rack, 3, 15246 rsm->r_start, 15247 rsm->r_end, 15248 rsm->r_flags); 15249 } 15250 } 15251 return (0); 15252 } 15253 15254 static void 15255 rack_translate_policer_detect(struct tcp_rack *rack, uint32_t optval) 15256 { 15257 /* 15258 * P = Percent of retransmits 499 = 49.9% 15259 * A = Average number 1 (.1%) -> 169 (16.9%) 15260 * M = Median number of retrans 1 - 16 15261 * MMMM MMMM AAAA AAAA PPPP PPPP PPPP PPPP 15262 * 15263 */ 15264 uint16_t per, upp; 15265 15266 per = optval & 0x0000ffff; 15267 rack->r_ctl.policer_rxt_threshold = (uint32_t)(per & 0xffff); 15268 upp = ((optval & 0xffff0000) >> 16); 15269 rack->r_ctl.policer_avg_threshold = (0x00ff & upp); 15270 rack->r_ctl.policer_med_threshold = ((upp >> 8) & 0x00ff); 15271 if ((rack->r_ctl.policer_rxt_threshold > 0) && 15272 (rack->r_ctl.policer_avg_threshold > 0) && 15273 (rack->r_ctl.policer_med_threshold > 0)) { 15274 rack->policer_detect_on = 1; 15275 } else { 15276 rack->policer_detect_on = 0; 15277 } 15278 rack->r_ctl.saved_policer_val = optval; 15279 policer_detection_log(rack, optval, 15280 rack->r_ctl.policer_avg_threshold, 15281 rack->r_ctl.policer_med_threshold, 15282 rack->r_ctl.policer_rxt_threshold, 11); 15283 } 15284 15285 static int32_t 15286 rack_init(struct tcpcb *tp, void **ptr) 15287 { 15288 struct inpcb *inp = tptoinpcb(tp); 15289 struct tcp_rack *rack = NULL; 15290 uint32_t iwin, snt, us_cts; 15291 size_t sz; 15292 int err, no_query; 15293 15294 tcp_hpts_init(tp); 15295 15296 /* 15297 * First are we the initial or are we a switched stack? 15298 * If we are initing via tcp_newtcppcb the ptr passed 15299 * will be tp->t_fb_ptr. If its a stack switch that 15300 * has a previous stack we can query it will be a local 15301 * var that will in the end be set into t_fb_ptr. 
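 * We record which case we are in via no_query below: 1 means a brand
 * new connection with nothing to query, 0 means a stack switch where
 * the previous stack may be queried for its sendmap and timers.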
*/ 15303 if (ptr == &tp->t_fb_ptr) 15304 no_query = 1; 15305 else 15306 no_query = 0; 15307 *ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT); 15308 if (*ptr == NULL) { 15309 /* 15310 * We need to allocate memory but can't. The INP and INP_INFO 15311 * locks are held and they are recursive (this happens during setup), so a 15312 * scheme to drop the locks fails :( 15313 * 15314 */ 15315 return(ENOMEM); 15316 } 15317 memset(*ptr, 0, sizeof(struct tcp_rack)); 15318 rack = (struct tcp_rack *)*ptr; 15319 rack->r_ctl.tqh = malloc(sizeof(struct tailq_hash), M_TCPFSB, M_NOWAIT); 15320 if (rack->r_ctl.tqh == NULL) { 15321 uma_zfree(rack_pcb_zone, rack); 15322 return(ENOMEM); 15323 } 15324 tqhash_init(rack->r_ctl.tqh); 15325 TAILQ_INIT(&rack->r_ctl.rc_free); 15326 TAILQ_INIT(&rack->r_ctl.rc_tmap); 15327 rack->rc_tp = tp; 15328 rack->rc_inp = inp; 15329 /* Set the flag */ 15330 rack->r_is_v6 = (inp->inp_vflag & INP_IPV6) != 0; 15331 /* Probably not needed but let's be sure */ 15332 rack_clear_rate_sample(rack); 15333 /* 15334 * Save off the default values, socket options will poke 15335 * at these if pacing is not on or we have not yet 15336 * reached where pacing is on (gp_ready/fixed enabled). 15337 * When they get set into the CC module (when gp_ready 15338 * is enabled or we enable fixed) then we will set these 15339 * values into the CC and place in here the old values 15340 * so we have a restoral. Then we will set the flag 15341 * rc_pacing_cc_set. That way whenever we turn off pacing 15342 * or switch off this stack, we will know to go restore 15343 * the saved values. 15344 * 15345 * We specifically put into the beta the ecn value for pacing. 15346 */ 15347 rack->rc_new_rnd_needed = 1; 15348 rack->r_ctl.rc_split_limit = V_tcp_map_split_limit; 15349 /* We want ABE-like behavior as well */ 15350 15351 rack->r_ctl.rc_saved_beta.newreno_flags |= CC_NEWRENO_BETA_ECN_ENABLED; 15352 rack->r_ctl.rc_reorder_fade = rack_reorder_fade; 15353 rack->rc_allow_data_af_clo = rack_ignore_data_after_close; 15354 rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh; 15355 rack->r_ctl.policer_del_mss = rack_req_del_mss; 15356 if ((rack_policer_rxt_thresh > 0) && 15357 (rack_policer_avg_thresh > 0) && 15358 (rack_policer_med_thresh > 0)) { 15359 rack->r_ctl.policer_rxt_threshold = rack_policer_rxt_thresh; 15360 rack->r_ctl.policer_avg_threshold = rack_policer_avg_thresh; 15361 rack->r_ctl.policer_med_threshold = rack_policer_med_thresh; 15362 rack->policer_detect_on = 1; 15363 } else { 15364 rack->policer_detect_on = 0; 15365 } 15366 if (rack_fill_cw_state) 15367 rack->rc_pace_to_cwnd = 1; 15368 if (rack_pacing_min_seg) 15369 rack->r_ctl.rc_user_set_min_segs = rack_pacing_min_seg; 15370 if (use_rack_rr) 15371 rack->use_rack_rr = 1; 15372 if (rack_dnd_default) { 15373 rack->rc_pace_dnd = 1; 15374 } 15375 if (V_tcp_delack_enabled) 15376 tp->t_delayed_ack = 1; 15377 else 15378 tp->t_delayed_ack = 0; 15379 #ifdef TCP_ACCOUNTING 15380 if (rack_tcp_accounting) { 15381 tp->t_flags2 |= TF2_TCP_ACCOUNTING; 15382 } 15383 #endif 15384 rack->r_ctl.pcm_i.cnt_alloc = RACK_DEFAULT_PCM_ARRAY; 15385 sz = (sizeof(struct rack_pcm_stats) * rack->r_ctl.pcm_i.cnt_alloc); 15386 rack->r_ctl.pcm_s = malloc(sz,M_TCPPCM, M_NOWAIT); 15387 if (rack->r_ctl.pcm_s == NULL) { 15388 rack->r_ctl.pcm_i.cnt_alloc = 0; 15389 } 15390 #ifdef NETFLIX_STATS 15391 rack->r_ctl.side_chan_dis_mask = tcp_sidechannel_disable_mask; 15392 #endif 15393 rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_per_upper_bound_ss; 15394 rack->r_ctl.rack_per_upper_bound_ca =
(uint8_t)rack_per_upper_bound_ca; 15395 if (rack_enable_shared_cwnd) 15396 rack->rack_enable_scwnd = 1; 15397 rack->r_ctl.pace_len_divisor = rack_default_pacing_divisor; 15398 rack->rc_user_set_max_segs = rack_hptsi_segments; 15399 rack->r_ctl.max_reduction = rack_max_reduce; 15400 rack->rc_force_max_seg = 0; 15401 TAILQ_INIT(&rack->r_ctl.opt_list); 15402 rack->r_ctl.rc_saved_beta.beta = V_newreno_beta_ecn; 15403 rack->r_ctl.rc_saved_beta.beta_ecn = V_newreno_beta_ecn; 15404 if (rack_hibeta_setting) { 15405 rack->rack_hibeta = 1; 15406 if ((rack_hibeta_setting >= 50) && 15407 (rack_hibeta_setting <= 100)) { 15408 rack->r_ctl.rc_saved_beta.beta = rack_hibeta_setting; 15409 rack->r_ctl.saved_hibeta = rack_hibeta_setting; 15410 } 15411 } else { 15412 rack->r_ctl.saved_hibeta = 50; 15413 } 15414 /* 15415 * We initialize to all ones so we never match 0 15416 * just in case the client sends in 0, it hopefully 15417 * will never have all 1's in ms :-) 15418 */ 15419 rack->r_ctl.last_tm_mark = 0xffffffffffffffff; 15420 rack->r_ctl.rc_reorder_shift = rack_reorder_thresh; 15421 rack->r_ctl.rc_pkt_delay = rack_pkt_delay; 15422 rack->r_ctl.pol_bw_comp = rack_policing_do_bw_comp; 15423 rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp; 15424 rack->r_ctl.rc_lowest_us_rtt = 0xffffffff; 15425 rack->r_ctl.rc_highest_us_rtt = 0; 15426 rack->r_ctl.bw_rate_cap = rack_bw_rate_cap; 15427 rack->pcm_enabled = rack_pcm_is_enabled; 15428 if (rack_fillcw_bw_cap) 15429 rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap; 15430 rack->r_ctl.timer_slop = TICKS_2_USEC(tcp_rexmit_slop); 15431 if (rack_use_cmp_acks) 15432 rack->r_use_cmp_ack = 1; 15433 if (rack_disable_prr) 15434 rack->rack_no_prr = 1; 15435 if (rack_gp_no_rec_chg) 15436 rack->rc_gp_no_rec_chg = 1; 15437 if (rack_pace_every_seg && tcp_can_enable_pacing()) { 15438 rack->r_ctl.pacing_method |= RACK_REG_PACING; 15439 rack->rc_always_pace = 1; 15440 if (rack->rack_hibeta) 15441 rack_set_cc_pacing(rack); 15442 } else 15443 rack->rc_always_pace = 0; 15444 if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack) 15445 rack->r_mbuf_queue = 1; 15446 else 15447 rack->r_mbuf_queue = 0; 15448 rack_set_pace_segments(tp, rack, __LINE__, NULL); 15449 if (rack_limits_scwnd) 15450 rack->r_limit_scw = 1; 15451 else 15452 rack->r_limit_scw = 0; 15453 rack_init_retransmit_value(rack, rack_rxt_controls); 15454 rack->rc_labc = V_tcp_abc_l_var; 15455 if (rack_honors_hpts_min_to) 15456 rack->r_use_hpts_min = 1; 15457 if (tp->snd_una != 0) { 15458 rack->r_ctl.idle_snd_una = tp->snd_una; 15459 rack->rc_sendvars_notset = 0; 15460 /* 15461 * Make sure any TCP timers are not running. 15462 */ 15463 tcp_timer_stop(tp); 15464 } else { 15465 /* 15466 * Server side, we are called from the 15467 * syn-cache. This means none of the 15468 * snd_una/max are set yet so we have 15469 * to defer this until the first send. 
15470 */ 15471 rack->rc_sendvars_notset = 1; 15472 } 15473 15474 rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method; 15475 rack->rack_tlp_threshold_use = rack_tlp_threshold_use; 15476 rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; 15477 rack->r_ctl.rc_min_to = rack_min_to; 15478 microuptime(&rack->r_ctl.act_rcv_time); 15479 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; 15480 rack->r_ctl.rack_per_of_gp_ss = rack_per_of_gp_ss; 15481 if (rack_hw_up_only) 15482 rack->r_up_only = 1; 15483 if (rack_do_dyn_mul) { 15484 /* When dynamic adjustment is on CA needs to start at 100% */ 15485 rack->rc_gp_dyn_mul = 1; 15486 if (rack_do_dyn_mul >= 100) 15487 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul; 15488 } else 15489 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca; 15490 rack->r_ctl.rack_per_of_gp_rec = rack_per_of_gp_rec; 15491 if (rack_timely_off) { 15492 rack->rc_skip_timely = 1; 15493 } 15494 if (rack->rc_skip_timely) { 15495 rack->r_ctl.rack_per_of_gp_rec = 90; 15496 rack->r_ctl.rack_per_of_gp_ca = 100; 15497 rack->r_ctl.rack_per_of_gp_ss = 250; 15498 } 15499 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 15500 rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time); 15501 rack->r_ctl.last_rcv_tstmp_for_rtt = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time); 15502 15503 setup_time_filter_small(&rack->r_ctl.rc_gp_min_rtt, FILTER_TYPE_MIN, 15504 rack_probertt_filter_life); 15505 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 15506 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 15507 rack->r_ctl.rc_time_of_last_probertt = us_cts; 15508 rack->r_ctl.rc_went_idle_time = us_cts; 15509 rack->r_ctl.challenge_ack_ts = tcp_ts_getticks() - (tcp_ack_war_time_window + 1); 15510 rack->r_ctl.rc_time_probertt_starts = 0; 15511 15512 rack->r_ctl.gp_rnd_thresh = rack_rnd_cnt_req & 0xff; 15513 if (rack_rnd_cnt_req & 0x10000) 15514 rack->r_ctl.gate_to_fs = 1; 15515 rack->r_ctl.gp_gain_req = rack_gp_gain_req; 15516 if ((rack_rnd_cnt_req & 0x100) > 0) { 15517 15518 } 15519 if (rack_dsack_std_based & 0x1) { 15520 /* Basically this means all rack timers are at least (srtt + 1/4 srtt) */ 15521 rack->rc_rack_tmr_std_based = 1; 15522 } 15523 if (rack_dsack_std_based & 0x2) { 15524 /* Basically this means rack timers are extended based on dsack by up to (2 * srtt) */ 15525 rack->rc_rack_use_dsack = 1; 15526 } 15527 /* We require at least one measurement, even if the sysctl is 0 */ 15528 if (rack_req_measurements) 15529 rack->r_ctl.req_measurements = rack_req_measurements; 15530 else 15531 rack->r_ctl.req_measurements = 1; 15532 if (rack_enable_hw_pacing) 15533 rack->rack_hdw_pace_ena = 1; 15534 if (rack_hw_rate_caps) 15535 rack->r_rack_hw_rate_caps = 1; 15536 if (rack_non_rxt_use_cr) 15537 rack->rack_rec_nonrxt_use_cr = 1; 15538 /* Lets setup the fsb block */ 15539 err = rack_init_fsb(tp, rack); 15540 if (err) { 15541 uma_zfree(rack_pcb_zone, *ptr); 15542 *ptr = NULL; 15543 return (err); 15544 } 15545 if (rack_do_hystart) { 15546 tp->t_ccv.flags |= CCF_HYSTART_ALLOWED; 15547 if (rack_do_hystart > 1) 15548 tp->t_ccv.flags |= CCF_HYSTART_CAN_SH_CWND; 15549 if (rack_do_hystart > 2) 15550 tp->t_ccv.flags |= CCF_HYSTART_CONS_SSTH; 15551 } 15552 /* Log what we will do with queries */ 15553 rack_log_chg_info(tp, rack, 7, 15554 no_query, 0, 0); 15555 if (rack_def_profile) 15556 rack_set_profile(rack, rack_def_profile); 15557 /* Cancel the GP measurement in progress */ 15558 tp->t_flags &= ~TF_GPUTINPROG; 15559 if ((tp->t_state != TCPS_CLOSED) && 15560 
(tp->t_state != TCPS_TIME_WAIT)) { 15561 /* 15562 * We are already open, we may 15563 * need to adjust a few things. 15564 */ 15565 if (SEQ_GT(tp->snd_max, tp->iss)) 15566 snt = tp->snd_max - tp->iss; 15567 else 15568 snt = 0; 15569 iwin = rc_init_window(rack); 15570 if ((snt < iwin) && 15571 (no_query == 1)) { 15572 /* We are not past the initial window 15573 * on the first init (i.e. a stack switch 15574 * has not yet occurred) so we need to make 15575 * sure cwnd and ssthresh are correct. 15576 */ 15577 if (tp->snd_cwnd < iwin) 15578 tp->snd_cwnd = iwin; 15579 /* 15580 * If we are within the initial window 15581 * we want ssthresh to be unlimited. Setting 15582 * it to the rwnd (which the default stack does 15583 * and older racks) is not really a good idea 15584 * since we want to be in SS and grow both the 15585 * cwnd and the rwnd (via dynamic rwnd growth). If 15586 * we set it to the rwnd then as the peer grows its 15587 * rwnd we will be stuck in CA and never hit SS. 15588 * 15589 * It's far better to raise it up high (this takes the 15590 * risk that there has been a loss already, probably 15591 * we should have an indicator in all stacks of loss 15592 * but we don't), but considering the normal use this 15593 * is a risk worth taking. The consequences of not 15594 * hitting SS are far worse than going one more time 15595 * into it early on (before we have sent even an IW). 15596 * It is highly unlikely that we will have had a loss 15597 * before getting the IW out. 15598 */ 15599 tp->snd_ssthresh = 0xffffffff; 15600 } 15601 /* 15602 * Any init based on sequence numbers 15603 * should be done in the deferred init path 15604 * since we can be CLOSED and not have them 15605 * inited when rack_init() is called. We 15606 * are not closed so let's call it. 15607 */ 15608 rack_deferred_init(tp, rack); 15609 } 15610 if ((tp->t_state != TCPS_CLOSED) && 15611 (tp->t_state != TCPS_TIME_WAIT) && 15612 (no_query == 0) && 15613 (tp->snd_una != tp->snd_max)) { 15614 err = rack_init_outstanding(tp, rack, us_cts, *ptr); 15615 if (err) { 15616 *ptr = NULL; 15617 return(err); 15618 } 15619 } 15620 rack_stop_all_timers(tp, rack); 15621 /* Setup all the t_flags2 */ 15622 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 15623 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 15624 else 15625 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ; 15626 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) 15627 tp->t_flags2 |= TF2_MBUF_ACKCMP; 15628 /* 15629 * Timers in Rack are kept in microseconds so let's 15630 * convert any initial incoming variables 15631 * from ticks into usecs. Note that we 15632 * also change the values of t_srtt and t_rttvar, if 15633 * they are non-zero. They are kept with a 5 15634 * bit decimal so we have to carefully convert 15635 * these to get the full precision.
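 * As an illustrative example (assuming hz=1000, i.e. one tick is 1ms):
 * a stored t_srtt of 224 with 5 fractional bits represents 224/32 = 7
 * ticks, which converts to roughly 7000 microseconds.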
15636 */ 15637 rack_convert_rtts(tp); 15638 rack_log_hystart_event(rack, rack->r_ctl.roundends, 20); 15639 if ((tptoinpcb(tp)->inp_flags & INP_DROPPED) == 0) { 15640 /* We do not start any timers on DROPPED connections */ 15641 if (tp->t_fb->tfb_chg_query == NULL) { 15642 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); 15643 } else { 15644 struct tcp_query_resp qr; 15645 int ret; 15646 15647 memset(&qr, 0, sizeof(qr)); 15648 15649 /* Get the misc time stamps and such for rack */ 15650 qr.req = TCP_QUERY_RACK_TIMES; 15651 ret = (*tp->t_fb->tfb_chg_query)(tp, &qr); 15652 if (ret == 1) { 15653 rack->r_ctl.rc_reorder_ts = qr.rack_reorder_ts; 15654 rack->r_ctl.num_dsack = qr.rack_num_dsacks; 15655 rack->r_ctl.rc_tlp_rxt_last_time = qr.rack_rxt_last_time; 15656 rack->r_ctl.rc_rack_min_rtt = qr.rack_min_rtt; 15657 rack->rc_rack_rtt = qr.rack_rtt; 15658 rack->r_ctl.rc_rack_tmit_time = qr.rack_tmit_time; 15659 rack->r_ctl.rc_sacked = qr.rack_sacked; 15660 rack->r_ctl.rc_holes_rxt = qr.rack_holes_rxt; 15661 rack->r_ctl.rc_prr_delivered = qr.rack_prr_delivered; 15662 rack->r_ctl.rc_prr_recovery_fs = qr.rack_prr_recovery_fs; 15663 rack->r_ctl.rc_prr_sndcnt = qr.rack_prr_sndcnt; 15664 rack->r_ctl.rc_prr_out = qr.rack_prr_out; 15665 if (qr.rack_tlp_out) { 15666 rack->rc_tlp_in_progress = 1; 15667 rack->r_ctl.rc_tlp_cnt_out = qr.rack_tlp_cnt_out; 15668 } else { 15669 rack->rc_tlp_in_progress = 0; 15670 rack->r_ctl.rc_tlp_cnt_out = 0; 15671 } 15672 if (qr.rack_srtt_measured) 15673 rack->rc_srtt_measure_made = 1; 15674 if (qr.rack_in_persist == 1) { 15675 rack->r_ctl.rc_went_idle_time = qr.rack_time_went_idle; 15676 #ifdef NETFLIX_SHARED_CWND 15677 if (rack->r_ctl.rc_scw) { 15678 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 15679 rack->rack_scwnd_is_idle = 1; 15680 } 15681 #endif 15682 rack->r_ctl.persist_lost_ends = 0; 15683 rack->probe_not_answered = 0; 15684 rack->forced_ack = 0; 15685 tp->t_rxtshift = 0; 15686 rack->rc_in_persist = 1; 15687 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 15688 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 15689 } 15690 if (qr.rack_wanted_output) 15691 rack->r_wanted_output = 1; 15692 rack_log_chg_info(tp, rack, 6, 15693 qr.rack_min_rtt, 15694 qr.rack_rtt, 15695 qr.rack_reorder_ts); 15696 } 15697 /* Get the old stack timers */ 15698 qr.req_param = 0; 15699 qr.req = TCP_QUERY_TIMERS_UP; 15700 ret = (*tp->t_fb->tfb_chg_query)(tp, &qr); 15701 if (ret) { 15702 /* 15703 * non-zero return means we have a timer('s) 15704 * to start. Zero means no timer (no keepalive 15705 * I suppose). 
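 * Below we turn whichever deadline(s) the old stack had (a pacing
 * deadline and/or a protocol timer) into a relative timeout, fall back
 * to a single hpts slot if the deadline has already passed, and
 * re-insert ourselves into hpts for that amount of time.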
15706 */ 15707 uint32_t tov = 0; 15708 15709 rack->r_ctl.rc_hpts_flags = qr.timer_hpts_flags; 15710 if (qr.timer_hpts_flags & PACE_PKT_OUTPUT) { 15711 rack->r_ctl.rc_last_output_to = qr.timer_pacing_to; 15712 if (TSTMP_GT(qr.timer_pacing_to, us_cts)) 15713 tov = qr.timer_pacing_to - us_cts; 15714 else 15715 tov = HPTS_TICKS_PER_SLOT; 15716 } 15717 if (qr.timer_hpts_flags & PACE_TMR_MASK) { 15718 rack->r_ctl.rc_timer_exp = qr.timer_timer_exp; 15719 if (tov == 0) { 15720 if (TSTMP_GT(qr.timer_timer_exp, us_cts)) 15721 tov = qr.timer_timer_exp - us_cts; 15722 else 15723 tov = HPTS_TICKS_PER_SLOT; 15724 } 15725 } 15726 rack_log_chg_info(tp, rack, 4, 15727 rack->r_ctl.rc_hpts_flags, 15728 rack->r_ctl.rc_last_output_to, 15729 rack->r_ctl.rc_timer_exp); 15730 if (tov) { 15731 struct hpts_diag diag; 15732 15733 (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(tov), 15734 __LINE__, &diag); 15735 rack_log_hpts_diag(rack, us_cts, &diag, &rack->r_ctl.act_rcv_time); 15736 } 15737 } 15738 } 15739 rack_log_rtt_shrinks(rack, us_cts, tp->t_rxtcur, 15740 __LINE__, RACK_RTTS_INIT); 15741 } 15742 return (0); 15743 } 15744 15745 static int 15746 rack_handoff_ok(struct tcpcb *tp) 15747 { 15748 if ((tp->t_state == TCPS_CLOSED) || 15749 (tp->t_state == TCPS_LISTEN)) { 15750 /* Sure no problem though it may not stick */ 15751 return (0); 15752 } 15753 if ((tp->t_state == TCPS_SYN_SENT) || 15754 (tp->t_state == TCPS_SYN_RECEIVED)) { 15755 /* 15756 * We really don't know if you support sack, 15757 * you have to get to ESTAB or beyond to tell. 15758 */ 15759 return (EAGAIN); 15760 } 15761 if ((tp->t_flags & TF_SENTFIN) && ((tp->snd_max - tp->snd_una) > 1)) { 15762 /* 15763 * Rack will only send a FIN after all data is acknowledged. 15764 * So in this case we have more data outstanding. We can't 15765 * switch stacks until either all data and only the FIN 15766 * is left (in which case rack_init() now knows how 15767 * to deal with that) <or> all is acknowledged and we 15768 * are only left with incoming data, though why you 15769 * would want to switch to rack after all data is acknowledged 15770 * I have no idea (rrs)! 15771 */ 15772 return (EAGAIN); 15773 } 15774 if ((tp->t_flags & TF_SACK_PERMIT) || rack_sack_not_required){ 15775 return (0); 15776 } 15777 /* 15778 * If we reach here we don't do SACK on this connection so we can 15779 * never do rack. 
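 * To summarize the return values: 0 means the handoff can proceed,
 * EAGAIN means try again later (SYN states, or data still outstanding
 * beyond the FIN), and EINVAL means this connection can never use rack.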
15780 */ 15781 return (EINVAL); 15782 } 15783 15784 static void 15785 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) 15786 { 15787 15788 if (tp->t_fb_ptr) { 15789 uint32_t cnt_free = 0; 15790 struct tcp_rack *rack; 15791 struct rack_sendmap *rsm; 15792 15793 tcp_handle_orphaned_packets(tp); 15794 tp->t_flags &= ~TF_FORCEDATA; 15795 rack = (struct tcp_rack *)tp->t_fb_ptr; 15796 rack_log_pacing_delay_calc(rack, 15797 0, 15798 0, 15799 0, 15800 rack_get_gp_est(rack), /* delRate */ 15801 rack_get_lt_bw(rack), /* rttProp */ 15802 20, __LINE__, NULL, 0); 15803 #ifdef NETFLIX_SHARED_CWND 15804 if (rack->r_ctl.rc_scw) { 15805 uint32_t limit; 15806 15807 if (rack->r_limit_scw) 15808 limit = max(1, rack->r_ctl.rc_lowest_us_rtt); 15809 else 15810 limit = 0; 15811 tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw, 15812 rack->r_ctl.rc_scw_index, 15813 limit); 15814 rack->r_ctl.rc_scw = NULL; 15815 } 15816 #endif 15817 if (rack->r_ctl.fsb.tcp_ip_hdr) { 15818 free(rack->r_ctl.fsb.tcp_ip_hdr, M_TCPFSB); 15819 rack->r_ctl.fsb.tcp_ip_hdr = NULL; 15820 rack->r_ctl.fsb.th = NULL; 15821 } 15822 if (rack->rc_always_pace == 1) { 15823 rack_remove_pacing(rack); 15824 } 15825 /* Clean up any options if they were not applied */ 15826 while (!TAILQ_EMPTY(&rack->r_ctl.opt_list)) { 15827 struct deferred_opt_list *dol; 15828 15829 dol = TAILQ_FIRST(&rack->r_ctl.opt_list); 15830 TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next); 15831 free(dol, M_TCPDO); 15832 } 15833 /* rack does not use force data but other stacks may clear it */ 15834 if (rack->r_ctl.crte != NULL) { 15835 tcp_rel_pacing_rate(rack->r_ctl.crte, tp); 15836 rack->rack_hdrw_pacing = 0; 15837 rack->r_ctl.crte = NULL; 15838 } 15839 #ifdef TCP_BLACKBOX 15840 tcp_log_flowend(tp); 15841 #endif 15842 /* 15843 * Lets take a different approach to purging just 15844 * get each one and free it like a cum-ack would and 15845 * not use a foreach loop. 
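 * Both the tqhash sendmap and the rc_free list are drained one entry
 * at a time, decrementing rc_num_maps_alloced as we go; the KASSERT
 * below checks that every allocated map was accounted for.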
15846 */ 15847 rsm = tqhash_min(rack->r_ctl.tqh); 15848 while (rsm) { 15849 tqhash_remove(rack->r_ctl.tqh, rsm, REMOVE_TYPE_CUMACK); 15850 rack->r_ctl.rc_num_maps_alloced--; 15851 uma_zfree(rack_zone, rsm); 15852 rsm = tqhash_min(rack->r_ctl.tqh); 15853 } 15854 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 15855 while (rsm) { 15856 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 15857 rack->r_ctl.rc_num_maps_alloced--; 15858 rack->rc_free_cnt--; 15859 cnt_free++; 15860 uma_zfree(rack_zone, rsm); 15861 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 15862 } 15863 if (rack->r_ctl.pcm_s != NULL) { 15864 free(rack->r_ctl.pcm_s, M_TCPPCM); 15865 rack->r_ctl.pcm_s = NULL; 15866 rack->r_ctl.pcm_i.cnt_alloc = 0; 15867 rack->r_ctl.pcm_i.cnt = 0; 15868 } 15869 if ((rack->r_ctl.rc_num_maps_alloced > 0) && 15870 (tcp_bblogging_on(tp))) { 15871 union tcp_log_stackspecific log; 15872 struct timeval tv; 15873 15874 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 15875 log.u_bbr.flex8 = 10; 15876 log.u_bbr.flex1 = rack->r_ctl.rc_num_maps_alloced; 15877 log.u_bbr.flex2 = rack->rc_free_cnt; 15878 log.u_bbr.flex3 = cnt_free; 15879 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 15880 rsm = tqhash_min(rack->r_ctl.tqh); 15881 log.u_bbr.delRate = (uint64_t)rsm; 15882 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 15883 log.u_bbr.cur_del_rate = (uint64_t)rsm; 15884 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 15885 log.u_bbr.pkt_epoch = __LINE__; 15886 (void)tcp_log_event(tp, NULL, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, 15887 0, &log, false, NULL, NULL, 0, &tv); 15888 } 15889 KASSERT((rack->r_ctl.rc_num_maps_alloced == 0), 15890 ("rack:%p num_aloc:%u after freeing all?", 15891 rack, 15892 rack->r_ctl.rc_num_maps_alloced)); 15893 rack->rc_free_cnt = 0; 15894 free(rack->r_ctl.tqh, M_TCPFSB); 15895 rack->r_ctl.tqh = NULL; 15896 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 15897 tp->t_fb_ptr = NULL; 15898 } 15899 /* Make sure snd_nxt is correctly set */ 15900 tp->snd_nxt = tp->snd_max; 15901 } 15902 15903 static void 15904 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) 15905 { 15906 if ((rack->r_state == TCPS_CLOSED) && (tp->t_state != TCPS_CLOSED)) { 15907 rack->r_is_v6 = (tptoinpcb(tp)->inp_vflag & INP_IPV6) != 0; 15908 } 15909 switch (tp->t_state) { 15910 case TCPS_SYN_SENT: 15911 rack->r_state = TCPS_SYN_SENT; 15912 rack->r_substate = rack_do_syn_sent; 15913 break; 15914 case TCPS_SYN_RECEIVED: 15915 rack->r_state = TCPS_SYN_RECEIVED; 15916 rack->r_substate = rack_do_syn_recv; 15917 break; 15918 case TCPS_ESTABLISHED: 15919 rack_set_pace_segments(tp, rack, __LINE__, NULL); 15920 rack->r_state = TCPS_ESTABLISHED; 15921 rack->r_substate = rack_do_established; 15922 break; 15923 case TCPS_CLOSE_WAIT: 15924 rack->r_state = TCPS_CLOSE_WAIT; 15925 rack->r_substate = rack_do_close_wait; 15926 break; 15927 case TCPS_FIN_WAIT_1: 15928 rack_set_pace_segments(tp, rack, __LINE__, NULL); 15929 rack->r_state = TCPS_FIN_WAIT_1; 15930 rack->r_substate = rack_do_fin_wait_1; 15931 break; 15932 case TCPS_CLOSING: 15933 rack_set_pace_segments(tp, rack, __LINE__, NULL); 15934 rack->r_state = TCPS_CLOSING; 15935 rack->r_substate = rack_do_closing; 15936 break; 15937 case TCPS_LAST_ACK: 15938 rack_set_pace_segments(tp, rack, __LINE__, NULL); 15939 rack->r_state = TCPS_LAST_ACK; 15940 rack->r_substate = rack_do_lastack; 15941 break; 15942 case TCPS_FIN_WAIT_2: 15943 rack->r_state = TCPS_FIN_WAIT_2; 15944 rack->r_substate = rack_do_fin_wait_2; 15945 break; 15946 case TCPS_LISTEN: 15947 case TCPS_CLOSED: 15948 case TCPS_TIME_WAIT: 
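	/*
	 * LISTEN, CLOSED and TIME_WAIT have no rack substate handler;
	 * fall through to default and leave r_state/r_substate untouched.
	 */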
15949 default: 15950 break; 15951 }; 15952 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) 15953 rack->rc_tp->t_flags2 |= TF2_MBUF_ACKCMP; 15954 15955 } 15956 15957 static void 15958 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) 15959 { 15960 /* 15961 * We received an ack, and then did not 15962 * call send or were bounced out due to the 15963 * hpts was running. Now a timer is up as well, is 15964 * it the right timer? 15965 */ 15966 struct rack_sendmap *rsm; 15967 int tmr_up; 15968 15969 tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 15970 if (tcp_in_hpts(rack->rc_tp) == 0) { 15971 /* 15972 * Ok we probably need some timer up, but no 15973 * matter what the mask we are not in hpts. We 15974 * may have received an old ack and thus did nothing. 15975 */ 15976 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 15977 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); 15978 return; 15979 } 15980 if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT)) 15981 return; 15982 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 15983 if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) && 15984 (tmr_up == PACE_TMR_RXT)) { 15985 /* Should be an RXT */ 15986 return; 15987 } 15988 if (rsm == NULL) { 15989 /* Nothing outstanding? */ 15990 if (tp->t_flags & TF_DELACK) { 15991 if (tmr_up == PACE_TMR_DELACK) 15992 /* We are supposed to have delayed ack up and we do */ 15993 return; 15994 } else if (sbavail(&tptosocket(tp)->so_snd) && (tmr_up == PACE_TMR_RXT)) { 15995 /* 15996 * if we hit enobufs then we would expect the possibility 15997 * of nothing outstanding and the RXT up (and the hptsi timer). 15998 */ 15999 return; 16000 } else if (((V_tcp_always_keepalive || 16001 rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 16002 (tp->t_state <= TCPS_CLOSING)) && 16003 (tmr_up == PACE_TMR_KEEP) && 16004 (tp->snd_max == tp->snd_una)) { 16005 /* We should have keep alive up and we do */ 16006 return; 16007 } 16008 } 16009 if (SEQ_GT(tp->snd_max, tp->snd_una) && 16010 ((tmr_up == PACE_TMR_TLP) || 16011 (tmr_up == PACE_TMR_RACK) || 16012 (tmr_up == PACE_TMR_RXT))) { 16013 /* 16014 * Either a Rack, TLP or RXT is fine if we 16015 * have outstanding data. 16016 */ 16017 return; 16018 } else if (tmr_up == PACE_TMR_DELACK) { 16019 /* 16020 * If the delayed ack was going to go off 16021 * before the rtx/tlp/rack timer were going to 16022 * expire, then that would be the timer in control. 16023 * Note we don't check the time here trusting the 16024 * code is correct. 16025 */ 16026 return; 16027 } 16028 /* 16029 * Ok the timer originally started is not what we want now. 16030 * We will force the hpts to be stopped if any, and restart 16031 * with the slot set to what was in the saved slot. 
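 * If a pacing deadline was pending and has not yet expired, the time
 * remaining is credited to rc_agg_early (and r_early is set) before
 * the pacer flag is cleared and the connection is pulled from hpts.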
16032 */ 16033 if (tcp_in_hpts(rack->rc_tp)) { 16034 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 16035 uint32_t us_cts; 16036 16037 us_cts = tcp_get_usecs(NULL); 16038 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 16039 rack->r_early = 1; 16040 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); 16041 } 16042 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 16043 } 16044 tcp_hpts_remove(rack->rc_tp); 16045 } 16046 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 16047 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); 16048 } 16049 16050 16051 static void 16052 rack_do_win_updates(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tiwin, uint32_t seq, uint32_t ack, uint32_t cts) 16053 { 16054 if ((SEQ_LT(tp->snd_wl1, seq) || 16055 (tp->snd_wl1 == seq && (SEQ_LT(tp->snd_wl2, ack) || 16056 (tp->snd_wl2 == ack && tiwin > tp->snd_wnd))))) { 16057 /* keep track of pure window updates */ 16058 if ((tp->snd_wl2 == ack) && (tiwin > tp->snd_wnd)) 16059 KMOD_TCPSTAT_INC(tcps_rcvwinupd); 16060 tp->snd_wnd = tiwin; 16061 rack_validate_fo_sendwin_up(tp, rack); 16062 tp->snd_wl1 = seq; 16063 tp->snd_wl2 = ack; 16064 if (tp->snd_wnd > tp->max_sndwnd) 16065 tp->max_sndwnd = tp->snd_wnd; 16066 rack->r_wanted_output = 1; 16067 } else if ((tp->snd_wl2 == ack) && (tiwin < tp->snd_wnd)) { 16068 tp->snd_wnd = tiwin; 16069 rack_validate_fo_sendwin_up(tp, rack); 16070 tp->snd_wl1 = seq; 16071 tp->snd_wl2 = ack; 16072 } else { 16073 /* Not a valid win update */ 16074 return; 16075 } 16076 if (tp->snd_wnd > tp->max_sndwnd) 16077 tp->max_sndwnd = tp->snd_wnd; 16078 /* Do we exit persists? */ 16079 if ((rack->rc_in_persist != 0) && 16080 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 16081 rack->r_ctl.rc_pace_min_segs))) { 16082 rack_exit_persist(tp, rack, cts); 16083 } 16084 /* Do we enter persists? */ 16085 if ((rack->rc_in_persist == 0) && 16086 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 16087 TCPS_HAVEESTABLISHED(tp->t_state) && 16088 ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) && 16089 sbavail(&tptosocket(tp)->so_snd) && 16090 (sbavail(&tptosocket(tp)->so_snd) > tp->snd_wnd)) { 16091 /* 16092 * Here the rwnd is less than 16093 * the pacing size, we are established, 16094 * nothing is outstanding, and there is 16095 * data to send. Enter persists. 
16096 */ 16097 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, ack); 16098 } 16099 } 16100 16101 static void 16102 rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent *ae, int ackval, uint32_t high_seq) 16103 { 16104 16105 if (tcp_bblogging_on(rack->rc_tp)) { 16106 struct inpcb *inp = tptoinpcb(tp); 16107 union tcp_log_stackspecific log; 16108 struct timeval ltv; 16109 char tcp_hdr_buf[60]; 16110 struct tcphdr *th; 16111 struct timespec ts; 16112 uint32_t orig_snd_una; 16113 uint8_t xx = 0; 16114 16115 #ifdef TCP_REQUEST_TRK 16116 struct tcp_sendfile_track *tcp_req; 16117 16118 if (SEQ_GT(ae->ack, tp->snd_una)) { 16119 tcp_req = tcp_req_find_req_for_seq(tp, (ae->ack-1)); 16120 } else { 16121 tcp_req = tcp_req_find_req_for_seq(tp, ae->ack); 16122 } 16123 #endif 16124 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 16125 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 16126 if (rack->rack_no_prr == 0) 16127 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 16128 else 16129 log.u_bbr.flex1 = 0; 16130 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 16131 log.u_bbr.use_lt_bw <<= 1; 16132 log.u_bbr.use_lt_bw |= rack->r_might_revert; 16133 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; 16134 log.u_bbr.bbr_state = rack->rc_free_cnt; 16135 log.u_bbr.inflight = ctf_flight_size(tp, rack->r_ctl.rc_sacked); 16136 log.u_bbr.pkts_out = tp->t_maxseg; 16137 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 16138 log.u_bbr.flex7 = 1; 16139 log.u_bbr.lost = ae->flags; 16140 log.u_bbr.cwnd_gain = ackval; 16141 log.u_bbr.pacing_gain = 0x2; 16142 if (ae->flags & TSTMP_HDWR) { 16143 /* Record the hardware timestamp if present */ 16144 log.u_bbr.flex3 = M_TSTMP; 16145 ts.tv_sec = ae->timestamp / 1000000000; 16146 ts.tv_nsec = ae->timestamp % 1000000000; 16147 ltv.tv_sec = ts.tv_sec; 16148 ltv.tv_usec = ts.tv_nsec / 1000; 16149 log.u_bbr.lt_epoch = tcp_tv_to_usectick(&ltv); 16150 } else if (ae->flags & TSTMP_LRO) { 16151 /* Record the LRO arrival timestamp */ 16152 log.u_bbr.flex3 = M_TSTMP_LRO; 16153 ts.tv_sec = ae->timestamp / 1000000000; 16154 ts.tv_nsec = ae->timestamp % 1000000000; 16155 ltv.tv_sec = ts.tv_sec; 16156 ltv.tv_usec = ts.tv_nsec / 1000; 16157 log.u_bbr.flex5 = tcp_tv_to_usectick(&ltv); 16158 } 16159 log.u_bbr.timeStamp = tcp_get_usecs(&ltv); 16160 /* Log the rcv time */ 16161 log.u_bbr.delRate = ae->timestamp; 16162 #ifdef TCP_REQUEST_TRK 16163 log.u_bbr.applimited = tp->t_tcpreq_closed; 16164 log.u_bbr.applimited <<= 8; 16165 log.u_bbr.applimited |= tp->t_tcpreq_open; 16166 log.u_bbr.applimited <<= 8; 16167 log.u_bbr.applimited |= tp->t_tcpreq_req; 16168 if (tcp_req) { 16169 /* Copy out any client req info */ 16170 /* seconds */ 16171 log.u_bbr.pkt_epoch = (tcp_req->localtime / HPTS_USEC_IN_SEC); 16172 /* useconds */ 16173 log.u_bbr.delivered = (tcp_req->localtime % HPTS_USEC_IN_SEC); 16174 log.u_bbr.rttProp = tcp_req->timestamp; 16175 log.u_bbr.cur_del_rate = tcp_req->start; 16176 if (tcp_req->flags & TCP_TRK_TRACK_FLG_OPEN) { 16177 log.u_bbr.flex8 |= 1; 16178 } else { 16179 log.u_bbr.flex8 |= 2; 16180 log.u_bbr.bw_inuse = tcp_req->end; 16181 } 16182 log.u_bbr.flex6 = tcp_req->start_seq; 16183 if (tcp_req->flags & TCP_TRK_TRACK_FLG_COMP) { 16184 log.u_bbr.flex8 |= 4; 16185 log.u_bbr.epoch = tcp_req->end_seq; 16186 } 16187 } 16188 #endif 16189 memset(tcp_hdr_buf, 0, sizeof(tcp_hdr_buf)); 16190 th = (struct tcphdr *)tcp_hdr_buf; 16191 th->th_seq = ae->seq; 16192 th->th_ack = ae->ack; 16193 th->th_win = ae->win; 16194 /* Now fill in the ports */ 16195 th->th_sport = inp->inp_fport;
16196 th->th_dport = inp->inp_lport; 16197 tcp_set_flags(th, ae->flags); 16198 /* Now do we have a timestamp option? */ 16199 if (ae->flags & HAS_TSTMP) { 16200 u_char *cp; 16201 uint32_t val; 16202 16203 th->th_off = ((sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA) >> 2); 16204 cp = (u_char *)(th + 1); 16205 *cp = TCPOPT_NOP; 16206 cp++; 16207 *cp = TCPOPT_NOP; 16208 cp++; 16209 *cp = TCPOPT_TIMESTAMP; 16210 cp++; 16211 *cp = TCPOLEN_TIMESTAMP; 16212 cp++; 16213 val = htonl(ae->ts_value); 16214 bcopy((char *)&val, 16215 (char *)cp, sizeof(uint32_t)); 16216 val = htonl(ae->ts_echo); 16217 bcopy((char *)&val, 16218 (char *)(cp + 4), sizeof(uint32_t)); 16219 } else 16220 th->th_off = (sizeof(struct tcphdr) >> 2); 16221 16222 /* 16223 * For sane logging we need to play a little trick. 16224 * If the ack were fully processed we would have moved 16225 * snd_una to high_seq, but since compressed acks are 16226 * processed in two phases, at this point (logging) snd_una 16227 * won't be advanced. So we would see multiple acks showing 16228 * the advancement. We can prevent that by "pretending" that 16229 * snd_una was advanced and then un-advancing it so that the 16230 * logging code has the right value for tlb_snd_una. 16231 */ 16232 if (tp->snd_una != high_seq) { 16233 orig_snd_una = tp->snd_una; 16234 tp->snd_una = high_seq; 16235 xx = 1; 16236 } else 16237 xx = 0; 16238 TCP_LOG_EVENTP(tp, th, 16239 &tptosocket(tp)->so_rcv, 16240 &tptosocket(tp)->so_snd, TCP_LOG_IN, 0, 16241 0, &log, true, &ltv); 16242 if (xx) { 16243 tp->snd_una = orig_snd_una; 16244 } 16245 } 16246 16247 } 16248 16249 static void 16250 rack_handle_probe_response(struct tcp_rack *rack, uint32_t tiwin, uint32_t us_cts) 16251 { 16252 uint32_t us_rtt; 16253 /* 16254 * A persist or keep-alive was forced out, update our 16255 * min rtt time. Note that we now worry about lost responses. 16256 * When a subsequent keep-alive or persist times out 16257 * and forced_ack is still on, then the last probe 16258 * was not responded to. In such cases we have a 16259 * sysctl that controls the behavior. Either we apply 16260 * the rtt but with reduced confidence (0). Or we just 16261 * plain don't apply the rtt estimate. Having data flow 16262 * will clear the probe_not_answered flag i.e. cum-ack 16263 * move forward <or> exiting and reentering persists. 16264 */ 16265 16266 rack->forced_ack = 0; 16267 rack->rc_tp->t_rxtshift = 0; 16268 if ((rack->rc_in_persist && 16269 (tiwin == rack->rc_tp->snd_wnd)) || 16270 (rack->rc_in_persist == 0)) { 16271 /* 16272 * In persists only apply the RTT update if this is 16273 * a response to our window probe. And that 16274 * means the rwnd sent must match the current 16275 * snd_wnd. If it does not, then we got a 16276 * window update ack instead. For keepalive 16277 * we allow the answer no matter what the window. 16278 * 16279 * Note that if the probe_not_answered is set then 16280 * the forced_ack_ts is the oldest one i.e. the first 16281 * probe sent that might have been lost. This assures 16282 * us that if we do calculate an RTT it is longer, not 16283 * some short value.
16284 */ 16285 if (rack->rc_in_persist) 16286 counter_u64_add(rack_persists_acks, 1); 16287 us_rtt = us_cts - rack->r_ctl.forced_ack_ts; 16288 if (us_rtt == 0) 16289 us_rtt = 1; 16290 if (rack->probe_not_answered == 0) { 16291 rack_apply_updated_usrtt(rack, us_rtt, us_cts); 16292 tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 3, NULL, 1); 16293 } else { 16294 /* We have a retransmitted probe here too */ 16295 if (rack_apply_rtt_with_reduced_conf) { 16296 rack_apply_updated_usrtt(rack, us_rtt, us_cts); 16297 tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 0, NULL, 1); 16298 } 16299 } 16300 } 16301 } 16302 16303 static void 16304 rack_new_round_starts(struct tcpcb *tp, struct tcp_rack *rack, uint32_t high_seq) 16305 { 16306 /* 16307 * The next send has occurred; mark the end of the round 16308 * as when that data gets acknowledged. We can 16309 * also do common things we might need to do when 16310 * a round begins. 16311 */ 16312 rack->r_ctl.roundends = tp->snd_max; 16313 rack->rc_new_rnd_needed = 0; 16314 rack_log_hystart_event(rack, tp->snd_max, 4); 16315 } 16316 16317 16318 static void 16319 rack_log_pcm(struct tcp_rack *rack, uint8_t mod, uint32_t flex1, uint32_t flex2, 16320 uint32_t flex3) 16321 { 16322 if (tcp_bblogging_on(rack->rc_tp)) { 16323 union tcp_log_stackspecific log; 16324 struct timeval tv; 16325 16326 (void)tcp_get_usecs(&tv); 16327 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 16328 log.u_bbr.timeStamp = tcp_tv_to_usectick(&tv); 16329 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 16330 log.u_bbr.flex8 = mod; 16331 log.u_bbr.flex1 = flex1; 16332 log.u_bbr.flex2 = flex2; 16333 log.u_bbr.flex3 = flex3; 16334 log.u_bbr.flex4 = rack_pcm_every_n_rounds; 16335 log.u_bbr.flex5 = rack->r_ctl.pcm_idle_rounds; 16336 log.u_bbr.bbr_substate = rack->pcm_needed; 16337 log.u_bbr.bbr_substate <<= 1; 16338 log.u_bbr.bbr_substate |= rack->pcm_in_progress; 16339 log.u_bbr.bbr_substate <<= 1; 16340 log.u_bbr.bbr_substate |= rack->pcm_enabled; /* bits are NIE for Needed, Inprogress, Enabled */ 16341 (void)tcp_log_event(rack->rc_tp, NULL, NULL, NULL, TCP_PCM_MEASURE, ERRNO_UNK, 16342 0, &log, false, NULL, NULL, 0, &tv); 16343 } 16344 } 16345 16346 static void 16347 rack_new_round_setup(struct tcpcb *tp, struct tcp_rack *rack, uint32_t high_seq) 16348 { 16349 /* 16350 * The round (current_round) has ended. We now 16351 * set up for the next round by incrementing the 16352 * round number and doing any round specific 16353 * things. 16354 */ 16355 rack_log_hystart_event(rack, high_seq, 21); 16356 rack->r_ctl.current_round++; 16357 /* New round (current_round) begins at next send */ 16358 rack->rc_new_rnd_needed = 1; 16359 if ((rack->pcm_enabled == 1) && 16360 (rack->pcm_needed == 0) && 16361 (rack->pcm_in_progress == 0)) { 16362 /* 16363 * If we have enabled PCM, then we need to 16364 * check if the round has advanced to the state 16365 * where one is required.
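 * i.e. pcm_needed gets set once (current_round - last_pcm_round) plus
 * pcm_idle_rounds reaches rack_pcm_every_n_rounds; otherwise we only
 * emit a verbose log entry.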
16366 */ 16367 int rnds; 16368 16369 rnds = rack->r_ctl.current_round - rack->r_ctl.last_pcm_round; 16370 if ((rnds + rack->r_ctl.pcm_idle_rounds) >= rack_pcm_every_n_rounds) { 16371 rack->pcm_needed = 1; 16372 rack_log_pcm(rack, 3, rack->r_ctl.last_pcm_round, rack_pcm_every_n_rounds, rack->r_ctl.current_round); 16373 } else if (rack_verbose_logging) { 16374 rack_log_pcm(rack, 3, rack->r_ctl.last_pcm_round, rack_pcm_every_n_rounds, rack->r_ctl.current_round); 16375 } 16376 } 16377 if (tp->t_ccv.flags & CCF_HYSTART_ALLOWED) { 16378 /* We have hystart enabled send the round info in */ 16379 if (CC_ALGO(tp)->newround != NULL) { 16380 CC_ALGO(tp)->newround(&tp->t_ccv, rack->r_ctl.current_round); 16381 } 16382 } 16383 /* 16384 * For DGP an initial startup check. We want to validate 16385 * that we are not just pushing on slow-start and just 16386 * not gaining.. i.e. filling buffers without getting any 16387 * boost in b/w during the initial slow-start. 16388 */ 16389 if (rack->dgp_on && 16390 (rack->rc_initial_ss_comp == 0) && 16391 (tp->snd_cwnd < tp->snd_ssthresh) && 16392 (rack->r_ctl.num_measurements >= RACK_REQ_AVG) && 16393 (rack->r_ctl.gp_rnd_thresh > 0) && 16394 ((rack->r_ctl.current_round - rack->r_ctl.last_rnd_of_gp_rise) >= rack->r_ctl.gp_rnd_thresh)) { 16395 16396 /* 16397 * We are in the initial SS and we have had rack_rnd_cnt_req rounds (def:5) where 16398 * we have not gained the required amount in the gp_est (120.0% aka 1200). Lets 16399 * exit SS. 16400 * 16401 * Pick up the flight size now as we enter slowstart (not the 16402 * cwnd which may be inflated). 16403 */ 16404 rack->rc_initial_ss_comp = 1; 16405 16406 if (tcp_bblogging_on(rack->rc_tp)) { 16407 union tcp_log_stackspecific log; 16408 struct timeval tv; 16409 16410 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 16411 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 16412 log.u_bbr.flex1 = rack->r_ctl.current_round; 16413 log.u_bbr.flex2 = rack->r_ctl.last_rnd_of_gp_rise; 16414 log.u_bbr.flex3 = rack->r_ctl.gp_rnd_thresh; 16415 log.u_bbr.flex4 = rack->r_ctl.gate_to_fs; 16416 log.u_bbr.flex5 = rack->r_ctl.ss_hi_fs; 16417 log.u_bbr.flex8 = 40; 16418 (void)tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 16419 0, &log, false, NULL, __func__, __LINE__, &tv); 16420 } 16421 if ((rack->r_ctl.gate_to_fs == 1) && 16422 (tp->snd_cwnd > rack->r_ctl.ss_hi_fs)) { 16423 tp->snd_cwnd = rack->r_ctl.ss_hi_fs; 16424 } 16425 tp->snd_ssthresh = tp->snd_cwnd - 1; 16426 /* Turn off any fast output running */ 16427 rack->r_fast_output = 0; 16428 } 16429 } 16430 16431 static int 16432 rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mbuf *m, int nxt_pkt, struct timeval *tv) 16433 { 16434 /* 16435 * Handle a "special" compressed ack mbuf. Each incoming 16436 * ack has only four possible dispositions: 16437 * 16438 * A) It moves the cum-ack forward 16439 * B) It is behind the cum-ack. 16440 * C) It is a window-update ack. 16441 * D) It is a dup-ack. 16442 * 16443 * Note that we can have between 1 -> TCP_COMP_ACK_ENTRIES 16444 * in the incoming mbuf. We also need to still pay attention 16445 * to nxt_pkt since there may be another packet after this 16446 * one.
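 *
 * The per-entry classification done in the loop below boils down to:
 *
 *   if (SEQ_LT(ae->ack, high_seq))                 ACK_BEHIND  (B)
 *   else if (SEQ_GT(ae->ack, high_seq))            ACK_CUMACK  (A)
 *   else if (tiwin == the_win && !rc_in_persist)   ACK_DUPACK  (D)
 *   else                                           ACK_RWND    (C)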
16447 */ 16448 #ifdef TCP_ACCOUNTING 16449 uint64_t ts_val; 16450 uint64_t rdstc; 16451 #endif 16452 int segsiz; 16453 struct timespec ts; 16454 struct tcp_rack *rack; 16455 struct tcp_ackent *ae; 16456 uint32_t tiwin, ms_cts, cts, acked, acked_amount, high_seq, win_seq, the_win, win_upd_ack; 16457 int cnt, i, did_out, ourfinisacked = 0; 16458 struct tcpopt to_holder, *to = NULL; 16459 #ifdef TCP_ACCOUNTING 16460 int win_up_req = 0; 16461 #endif 16462 int nsegs = 0; 16463 int under_pacing = 0; 16464 int post_recovery = 0; 16465 #ifdef TCP_ACCOUNTING 16466 sched_pin(); 16467 #endif 16468 rack = (struct tcp_rack *)tp->t_fb_ptr; 16469 if (rack->gp_ready && 16470 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) 16471 under_pacing = 1; 16472 16473 if (rack->r_state != tp->t_state) 16474 rack_set_state(tp, rack); 16475 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 16476 (tp->t_flags & TF_GPUTINPROG)) { 16477 /* 16478 * We have a goodput in progress 16479 * and we have entered a late state. 16480 * Do we have enough data in the sb 16481 * to handle the GPUT request? 16482 */ 16483 uint32_t bytes; 16484 16485 bytes = tp->gput_ack - tp->gput_seq; 16486 if (SEQ_GT(tp->gput_seq, tp->snd_una)) 16487 bytes += tp->gput_seq - tp->snd_una; 16488 if (bytes > sbavail(&tptosocket(tp)->so_snd)) { 16489 /* 16490 * There are not enough bytes in the socket 16491 * buffer that have been sent to cover this 16492 * measurement. Cancel it. 16493 */ 16494 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 16495 rack->r_ctl.rc_gp_srtt /*flex1*/, 16496 tp->gput_seq, 16497 0, 0, 18, __LINE__, NULL, 0); 16498 tp->t_flags &= ~TF_GPUTINPROG; 16499 } 16500 } 16501 to = &to_holder; 16502 to->to_flags = 0; 16503 KASSERT((m->m_len >= sizeof(struct tcp_ackent)), 16504 ("tp:%p m_cmpack:%p with invalid len:%u", tp, m, m->m_len)); 16505 cnt = m->m_len / sizeof(struct tcp_ackent); 16506 counter_u64_add(rack_multi_single_eq, cnt); 16507 high_seq = tp->snd_una; 16508 the_win = tp->snd_wnd; 16509 win_seq = tp->snd_wl1; 16510 win_upd_ack = tp->snd_wl2; 16511 cts = tcp_tv_to_usectick(tv); 16512 ms_cts = tcp_tv_to_mssectick(tv); 16513 rack->r_ctl.rc_rcvtime = cts; 16514 segsiz = ctf_fixed_maxseg(tp); 16515 if ((rack->rc_gp_dyn_mul) && 16516 (rack->use_fixed_rate == 0) && 16517 (rack->rc_always_pace)) { 16518 /* Check in on probertt */ 16519 rack_check_probe_rtt(rack, cts); 16520 } 16521 for (i = 0; i < cnt; i++) { 16522 #ifdef TCP_ACCOUNTING 16523 ts_val = get_cyclecount(); 16524 #endif 16525 rack_clear_rate_sample(rack); 16526 ae = ((mtod(m, struct tcp_ackent *)) + i); 16527 if (ae->flags & TH_FIN) 16528 rack_log_pacing_delay_calc(rack, 16529 0, 16530 0, 16531 0, 16532 rack_get_gp_est(rack), /* delRate */ 16533 rack_get_lt_bw(rack), /* rttProp */ 16534 20, __LINE__, NULL, 0); 16535 /* Setup the window */ 16536 tiwin = ae->win << tp->snd_scale; 16537 if (tiwin > rack->r_ctl.rc_high_rwnd) 16538 rack->r_ctl.rc_high_rwnd = tiwin; 16539 /* figure out the type of ack */ 16540 if (SEQ_LT(ae->ack, high_seq)) { 16541 /* Case B*/ 16542 ae->ack_val_set = ACK_BEHIND; 16543 } else if (SEQ_GT(ae->ack, high_seq)) { 16544 /* Case A */ 16545 ae->ack_val_set = ACK_CUMACK; 16546 } else if ((tiwin == the_win) && (rack->rc_in_persist == 0)){ 16547 /* Case D */ 16548 ae->ack_val_set = ACK_DUPACK; 16549 } else { 16550 /* Case C */ 16551 ae->ack_val_set = ACK_RWND; 16552 } 16553 rack_log_type_bbrsnd(rack, 0, 0, cts, tv, __LINE__); 16554 rack_log_input_packet(tp, rack, ae, ae->ack_val_set, high_seq); 16555 /* Validate timestamp */ 16556 if 
(ae->flags & HAS_TSTMP) { 16557 /* Setup for a timestamp */ 16558 to->to_flags = TOF_TS; 16559 ae->ts_echo -= tp->ts_offset; 16560 to->to_tsecr = ae->ts_echo; 16561 to->to_tsval = ae->ts_value; 16562 /* 16563 * If echoed timestamp is later than the current time, fall back to 16564 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 16565 * were used when this connection was established. 16566 */ 16567 if (TSTMP_GT(ae->ts_echo, ms_cts)) 16568 to->to_tsecr = 0; 16569 if (tp->ts_recent && 16570 TSTMP_LT(ae->ts_value, tp->ts_recent)) { 16571 if (ctf_ts_check_ac(tp, (ae->flags & 0xff))) { 16572 #ifdef TCP_ACCOUNTING 16573 rdstc = get_cyclecount(); 16574 if (rdstc > ts_val) { 16575 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16576 tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val); 16577 } 16578 } 16579 #endif 16580 continue; 16581 } 16582 } 16583 if (SEQ_LEQ(ae->seq, tp->last_ack_sent) && 16584 SEQ_LEQ(tp->last_ack_sent, ae->seq)) { 16585 tp->ts_recent_age = tcp_ts_getticks(); 16586 tp->ts_recent = ae->ts_value; 16587 } 16588 } else { 16589 /* Setup for a no options */ 16590 to->to_flags = 0; 16591 } 16592 /* Update the rcv time and perform idle reduction possibly */ 16593 if (tp->t_idle_reduce && 16594 (tp->snd_max == tp->snd_una) && 16595 (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { 16596 counter_u64_add(rack_input_idle_reduces, 1); 16597 rack_cc_after_idle(rack, tp); 16598 } 16599 tp->t_rcvtime = ticks; 16600 /* Now what about ECN of a chain of pure ACKs? */ 16601 if (tcp_ecn_input_segment(tp, ae->flags, 0, 16602 tcp_packets_this_ack(tp, ae->ack), 16603 ae->codepoint)) 16604 rack_cong_signal(tp, CC_ECN, ae->ack, __LINE__); 16605 #ifdef TCP_ACCOUNTING 16606 /* Count for the specific type of ack in */ 16607 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16608 tp->tcp_cnt_counters[ae->ack_val_set]++; 16609 } 16610 #endif 16611 /* 16612 * Note how we could move up these in the determination 16613 * above, but we don't so that way the timestamp checks (and ECN) 16614 * is done first before we do any processing on the ACK. 16615 * The non-compressed path through the code has this 16616 * weakness (noted by @jtl) that it actually does some 16617 * processing before verifying the timestamp information. 16618 * We don't take that path here which is why we set 16619 * the ack_val_set first, do the timestamp and ecn 16620 * processing, and then look at what we have setup. 
16621 */ 16622 if (ae->ack_val_set == ACK_BEHIND) { 16623 /* 16624 * Case B: flag reordering if the window is not closed, 16625 * otherwise it could be a keep-alive or persists probe. 16626 */ 16627 if (SEQ_LT(ae->ack, tp->snd_una) && (sbspace(&so->so_rcv) > segsiz)) { 16628 rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 16629 if (rack->r_ctl.rc_reorder_ts == 0) 16630 rack->r_ctl.rc_reorder_ts = 1; 16631 } 16632 } else if (ae->ack_val_set == ACK_DUPACK) { 16633 /* Case D */ 16634 rack_strike_dupack(rack, ae->ack); 16635 } else if (ae->ack_val_set == ACK_RWND) { 16636 /* Case C */ 16637 if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) { 16638 ts.tv_sec = ae->timestamp / 1000000000; 16639 ts.tv_nsec = ae->timestamp % 1000000000; 16640 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; 16641 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; 16642 } else { 16643 rack->r_ctl.act_rcv_time = *tv; 16644 } 16645 if (rack->forced_ack) { 16646 rack_handle_probe_response(rack, tiwin, 16647 tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time)); 16648 } 16649 #ifdef TCP_ACCOUNTING 16650 win_up_req = 1; 16651 #endif 16652 win_upd_ack = ae->ack; 16653 win_seq = ae->seq; 16654 the_win = tiwin; 16655 rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts); 16656 } else { 16657 /* Case A */ 16658 if (SEQ_GT(ae->ack, tp->snd_max)) { 16659 /* 16660 * We just send an ack since the incoming 16661 * ack is beyond the largest seq we sent. 16662 */ 16663 if ((tp->t_flags & TF_ACKNOW) == 0) { 16664 ctf_ack_war_checks(tp, &rack->r_ctl.challenge_ack_ts, &rack->r_ctl.challenge_ack_cnt); 16665 if (tp->t_flags & TF_ACKNOW) 16666 rack->r_wanted_output = 1; 16667 } 16668 } else { 16669 nsegs++; 16670 /* If the window changed setup to update */ 16671 if (tiwin != tp->snd_wnd) { 16672 win_upd_ack = ae->ack; 16673 win_seq = ae->seq; 16674 the_win = tiwin; 16675 rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts); 16676 } 16677 #ifdef TCP_ACCOUNTING 16678 /* Account for the acks */ 16679 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16680 tp->tcp_cnt_counters[CNT_OF_ACKS_IN] += (((ae->ack - high_seq) + segsiz - 1) / segsiz); 16681 } 16682 #endif 16683 high_seq = ae->ack; 16684 /* Setup our act_rcv_time */ 16685 if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) { 16686 ts.tv_sec = ae->timestamp / 1000000000; 16687 ts.tv_nsec = ae->timestamp % 1000000000; 16688 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; 16689 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; 16690 } else { 16691 rack->r_ctl.act_rcv_time = *tv; 16692 } 16693 rack_process_to_cumack(tp, rack, ae->ack, cts, to, 16694 tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time)); 16695 #ifdef TCP_REQUEST_TRK 16696 rack_req_check_for_comp(rack, high_seq); 16697 #endif 16698 if (rack->rc_dsack_round_seen) { 16699 /* Is the dsack round over?
*/ 16700 if (SEQ_GEQ(ae->ack, rack->r_ctl.dsack_round_end)) { 16701 /* Yes it is */ 16702 rack->rc_dsack_round_seen = 0; 16703 rack_log_dsack_event(rack, 3, __LINE__, 0, 0); 16704 } 16705 } 16706 } 16707 } 16708 /* And lets be sure to commit the rtt measurements for this ack */ 16709 tcp_rack_xmit_timer_commit(rack, tp); 16710 #ifdef TCP_ACCOUNTING 16711 rdstc = get_cyclecount(); 16712 if (rdstc > ts_val) { 16713 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16714 tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val); 16715 if (ae->ack_val_set == ACK_CUMACK) 16716 tp->tcp_proc_time[CYC_HANDLE_MAP] += (rdstc - ts_val); 16717 } 16718 } 16719 #endif 16720 } 16721 #ifdef TCP_ACCOUNTING 16722 ts_val = get_cyclecount(); 16723 #endif 16724 /* Tend to any collapsed window */ 16725 if (SEQ_GT(tp->snd_max, high_seq) && (tp->snd_wnd < (tp->snd_max - high_seq))) { 16726 /* The peer collapsed the window */ 16727 rack_collapsed_window(rack, (tp->snd_max - high_seq), high_seq, __LINE__); 16728 } else if (rack->rc_has_collapsed) 16729 rack_un_collapse_window(rack, __LINE__); 16730 if ((rack->r_collapse_point_valid) && 16731 (SEQ_GT(high_seq, rack->r_ctl.high_collapse_point))) 16732 rack->r_collapse_point_valid = 0; 16733 acked_amount = acked = (high_seq - tp->snd_una); 16734 if (acked) { 16735 /* 16736 * The draft (v3) calls for us to use SEQ_GEQ, but that 16737 * causes issues when we are just going app limited. Lets 16738 * instead use SEQ_GT <or> where its equal but more data 16739 * is outstanding. 16740 * 16741 * Also make sure we are on the last ack of a series. We 16742 * have to have all the ack's processed in queue to know 16743 * if there is something left outstanding. 16744 * 16745 */ 16746 if (SEQ_GEQ(high_seq, rack->r_ctl.roundends) && 16747 (rack->rc_new_rnd_needed == 0) && 16748 (nxt_pkt == 0)) { 16749 /* 16750 * We have crossed into a new round with 16751 * this th_ack value. 16752 */ 16753 rack_new_round_setup(tp, rack, high_seq); 16754 } 16755 /* 16756 * Clear the probe not answered flag 16757 * since cum-ack moved forward. 16758 */ 16759 rack->probe_not_answered = 0; 16760 if (tp->t_flags & TF_NEEDSYN) { 16761 /* 16762 * T/TCP: Connection was half-synchronized, and our SYN has 16763 * been ACK'd (so connection is now fully synchronized). Go 16764 * to non-starred state, increment snd_una for ACK of SYN, 16765 * and check if we can do window scaling. 16766 */ 16767 tp->t_flags &= ~TF_NEEDSYN; 16768 tp->snd_una++; 16769 acked_amount = acked = (high_seq - tp->snd_una); 16770 } 16771 if (acked > sbavail(&so->so_snd)) 16772 acked_amount = sbavail(&so->so_snd); 16773 if (IN_FASTRECOVERY(tp->t_flags) && 16774 (rack->rack_no_prr == 0)) 16775 rack_update_prr(tp, rack, acked_amount, high_seq); 16776 if (IN_RECOVERY(tp->t_flags)) { 16777 if (SEQ_LT(high_seq, tp->snd_recover) && 16778 (SEQ_LT(high_seq, tp->snd_max))) { 16779 tcp_rack_partialack(tp); 16780 } else { 16781 rack_post_recovery(tp, high_seq); 16782 post_recovery = 1; 16783 } 16784 } else if ((rack->rto_from_rec == 1) && 16785 SEQ_GEQ(high_seq, tp->snd_recover)) { 16786 /* 16787 * We were in recovery, hit a rxt timeout 16788 * and never re-entered recovery. The timeout(s) 16789 * made up all the lost data. In such a case 16790 * we need to clear the rto_from_rec flag. 
16791 */ 16792 rack->rto_from_rec = 0; 16793 } 16794 /* Handle the rack-log-ack part (sendmap) */ 16795 if ((sbused(&so->so_snd) == 0) && 16796 (acked > acked_amount) && 16797 (tp->t_state >= TCPS_FIN_WAIT_1) && 16798 (tp->t_flags & TF_SENTFIN)) { 16799 /* 16800 * We must be sure our fin 16801 * was sent and acked (we can be 16802 * in FIN_WAIT_1 without having 16803 * sent the fin). 16804 */ 16805 ourfinisacked = 1; 16806 /* 16807 * Lets make sure snd_una is updated 16808 * since most likely acked_amount = 0 (it 16809 * should be). 16810 */ 16811 tp->snd_una = high_seq; 16812 } 16813 /* Did we make a RTO error? */ 16814 if ((tp->t_flags & TF_PREVVALID) && 16815 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) { 16816 tp->t_flags &= ~TF_PREVVALID; 16817 if (tp->t_rxtshift == 1 && 16818 (int)(ticks - tp->t_badrxtwin) < 0) 16819 rack_cong_signal(tp, CC_RTO_ERR, high_seq, __LINE__); 16820 } 16821 /* Handle the data in the socket buffer */ 16822 KMOD_TCPSTAT_ADD(tcps_rcvackpack, 1); 16823 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 16824 if (acked_amount > 0) { 16825 uint32_t p_cwnd; 16826 struct mbuf *mfree; 16827 16828 if (post_recovery) { 16829 /* 16830 * Grab the segsiz, multiply by 2 and add the snd_cwnd 16831 * that is the max the CC should add if we are exiting 16832 * recovery and doing a late add. 16833 */ 16834 p_cwnd = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 16835 p_cwnd <<= 1; 16836 p_cwnd += tp->snd_cwnd; 16837 } 16838 rack_ack_received(tp, rack, high_seq, nsegs, CC_ACK, post_recovery); 16839 if (post_recovery && (tp->snd_cwnd > p_cwnd)) { 16840 /* Must be non-newreno (cubic) getting too ahead of itself */ 16841 tp->snd_cwnd = p_cwnd; 16842 } 16843 SOCKBUF_LOCK(&so->so_snd); 16844 mfree = sbcut_locked(&so->so_snd, acked_amount); 16845 tp->snd_una = high_seq; 16846 /* Note we want to hold the sb lock through the sendmap adjust */ 16847 rack_adjust_sendmap_head(rack, &so->so_snd); 16848 /* Wake up the socket if we have room to write more */ 16849 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); 16850 sowwakeup_locked(so); 16851 m_freem(mfree); 16852 } 16853 /* update progress */ 16854 tp->t_acktime = ticks; 16855 rack_log_progress_event(rack, tp, tp->t_acktime, 16856 PROGRESS_UPDATE, __LINE__); 16857 /* Clear out shifts and such */ 16858 tp->t_rxtshift = 0; 16859 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 16860 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 16861 rack->rc_tlp_in_progress = 0; 16862 rack->r_ctl.rc_tlp_cnt_out = 0; 16863 /* Send recover and snd_nxt must be dragged along */ 16864 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 16865 tp->snd_recover = tp->snd_una; 16866 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 16867 tp->snd_nxt = tp->snd_max; 16868 /* 16869 * If the RXT timer is running we want to 16870 * stop it, so we can restart a TLP (or new RXT). 
16871 */ 16872 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 16873 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 16874 tp->snd_wl2 = high_seq; 16875 tp->t_dupacks = 0; 16876 if (under_pacing && 16877 (rack->use_fixed_rate == 0) && 16878 (rack->in_probe_rtt == 0) && 16879 rack->rc_gp_dyn_mul && 16880 rack->rc_always_pace) { 16881 /* Check if we are dragging bottom */ 16882 rack_check_bottom_drag(tp, rack, so); 16883 } 16884 if (tp->snd_una == tp->snd_max) { 16885 tp->t_flags &= ~TF_PREVVALID; 16886 rack->r_ctl.retran_during_recovery = 0; 16887 rack->rc_suspicious = 0; 16888 rack->r_ctl.dsack_byte_cnt = 0; 16889 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 16890 if (rack->r_ctl.rc_went_idle_time == 0) 16891 rack->r_ctl.rc_went_idle_time = 1; 16892 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 16893 if (sbavail(&tptosocket(tp)->so_snd) == 0) 16894 tp->t_acktime = 0; 16895 /* Set so we might enter persists... */ 16896 rack->r_wanted_output = 1; 16897 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 16898 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 16899 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 16900 (sbavail(&so->so_snd) == 0) && 16901 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 16902 /* 16903 * The socket was gone and the 16904 * peer sent data (not now in the past), time to 16905 * reset him. 16906 */ 16907 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 16908 /* tcp_close will kill the inp pre-log the Reset */ 16909 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 16910 #ifdef TCP_ACCOUNTING 16911 rdstc = get_cyclecount(); 16912 if (rdstc > ts_val) { 16913 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16914 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 16915 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 16916 } 16917 } 16918 #endif 16919 m_freem(m); 16920 tp = tcp_close(tp); 16921 if (tp == NULL) { 16922 #ifdef TCP_ACCOUNTING 16923 sched_unpin(); 16924 #endif 16925 return (1); 16926 } 16927 /* 16928 * We would normally do drop-with-reset which would 16929 * send back a reset. We can't since we don't have 16930 * all the needed bits. Instead lets arrange for 16931 * a call to tcp_output(). That way since we 16932 * are in the closed state we will generate a reset. 16933 * 16934 * Note if tcp_accounting is on we don't unpin since 16935 * we do that after the goto label. 16936 */ 16937 goto send_out_a_rst; 16938 } 16939 if ((sbused(&so->so_snd) == 0) && 16940 (tp->t_state >= TCPS_FIN_WAIT_1) && 16941 (tp->t_flags & TF_SENTFIN)) { 16942 /* 16943 * If we can't receive any more data, then closing user can 16944 * proceed. Starting the timer is contrary to the 16945 * specification, but if we don't get a FIN we'll hang 16946 * forever. 16947 * 16948 */ 16949 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 16950 soisdisconnected(so); 16951 tcp_timer_activate(tp, TT_2MSL, 16952 (tcp_fast_finwait2_recycle ? 16953 tcp_finwait2_timeout : 16954 TP_MAXIDLE(tp))); 16955 } 16956 if (ourfinisacked == 0) { 16957 /* 16958 * We don't change to fin-wait-2 if we have our fin acked 16959 * which means we are probably in TCPS_CLOSING. 
16960 */ 16961 tcp_state_change(tp, TCPS_FIN_WAIT_2); 16962 } 16963 } 16964 } 16965 /* Wake up the socket if we have room to write more */ 16966 if (sbavail(&so->so_snd)) { 16967 rack->r_wanted_output = 1; 16968 if (ctf_progress_timeout_check(tp, true)) { 16969 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 16970 tp, tick, PROGRESS_DROP, __LINE__); 16971 /* 16972 * We cheat here and don't send a RST, we should send one 16973 * when the pacer drops the connection. 16974 */ 16975 #ifdef TCP_ACCOUNTING 16976 rdstc = get_cyclecount(); 16977 if (rdstc > ts_val) { 16978 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16979 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 16980 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 16981 } 16982 } 16983 sched_unpin(); 16984 #endif 16985 (void)tcp_drop(tp, ETIMEDOUT); 16986 m_freem(m); 16987 return (1); 16988 } 16989 } 16990 if (ourfinisacked) { 16991 switch(tp->t_state) { 16992 case TCPS_CLOSING: 16993 #ifdef TCP_ACCOUNTING 16994 rdstc = get_cyclecount(); 16995 if (rdstc > ts_val) { 16996 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16997 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 16998 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 16999 } 17000 } 17001 sched_unpin(); 17002 #endif 17003 tcp_twstart(tp); 17004 m_freem(m); 17005 return (1); 17006 break; 17007 case TCPS_LAST_ACK: 17008 #ifdef TCP_ACCOUNTING 17009 rdstc = get_cyclecount(); 17010 if (rdstc > ts_val) { 17011 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 17012 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 17013 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 17014 } 17015 } 17016 sched_unpin(); 17017 #endif 17018 tp = tcp_close(tp); 17019 ctf_do_drop(m, tp); 17020 return (1); 17021 break; 17022 case TCPS_FIN_WAIT_1: 17023 #ifdef TCP_ACCOUNTING 17024 rdstc = get_cyclecount(); 17025 if (rdstc > ts_val) { 17026 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 17027 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 17028 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 17029 } 17030 } 17031 #endif 17032 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 17033 soisdisconnected(so); 17034 tcp_timer_activate(tp, TT_2MSL, 17035 (tcp_fast_finwait2_recycle ? 17036 tcp_finwait2_timeout : 17037 TP_MAXIDLE(tp))); 17038 } 17039 tcp_state_change(tp, TCPS_FIN_WAIT_2); 17040 break; 17041 default: 17042 break; 17043 } 17044 } 17045 if (rack->r_fast_output) { 17046 /* 17047 * We re doing fast output.. can we expand that? 
17048 */ 17049 rack_gain_for_fastoutput(rack, tp, so, acked_amount); 17050 } 17051 #ifdef TCP_ACCOUNTING 17052 rdstc = get_cyclecount(); 17053 if (rdstc > ts_val) { 17054 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 17055 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 17056 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 17057 } 17058 } 17059 17060 } else if (win_up_req) { 17061 rdstc = get_cyclecount(); 17062 if (rdstc > ts_val) { 17063 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 17064 tp->tcp_proc_time[ACK_RWND] += (rdstc - ts_val); 17065 } 17066 } 17067 #endif 17068 } 17069 /* Now is there a next packet, if so we are done */ 17070 m_freem(m); 17071 did_out = 0; 17072 if (nxt_pkt) { 17073 #ifdef TCP_ACCOUNTING 17074 sched_unpin(); 17075 #endif 17076 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 5, nsegs); 17077 return (0); 17078 } 17079 rack_handle_might_revert(tp, rack); 17080 ctf_calc_rwin(so, tp); 17081 if ((rack->r_wanted_output != 0) || 17082 (rack->r_fast_output != 0) || 17083 (tp->t_flags & TF_ACKNOW )) { 17084 send_out_a_rst: 17085 if (tcp_output(tp) < 0) { 17086 #ifdef TCP_ACCOUNTING 17087 sched_unpin(); 17088 #endif 17089 return (1); 17090 } 17091 did_out = 1; 17092 } 17093 if (tp->t_flags2 & TF2_HPTS_CALLS) 17094 tp->t_flags2 &= ~TF2_HPTS_CALLS; 17095 rack_free_trim(rack); 17096 #ifdef TCP_ACCOUNTING 17097 sched_unpin(); 17098 #endif 17099 rack_timer_audit(tp, rack, &so->so_snd); 17100 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 6, nsegs); 17101 return (0); 17102 } 17103 17104 #define TCP_LRO_TS_OPTION \ 17105 ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \ 17106 (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP) 17107 17108 static int 17109 rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, 17110 int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, int32_t nxt_pkt, 17111 struct timeval *tv) 17112 { 17113 struct inpcb *inp = tptoinpcb(tp); 17114 struct socket *so = tptosocket(tp); 17115 #ifdef TCP_ACCOUNTING 17116 uint64_t ts_val; 17117 #endif 17118 int32_t thflags, retval, did_out = 0; 17119 int32_t way_out = 0; 17120 /* 17121 * cts - is the current time from tv (caller gets ts) in microseconds. 17122 * ms_cts - is the current time from tv in milliseconds. 17123 * us_cts - is the time that LRO or hardware actually got the packet in microseconds. 17124 */ 17125 uint32_t cts, us_cts, ms_cts; 17126 uint32_t tiwin; 17127 struct timespec ts; 17128 struct tcpopt to; 17129 struct tcp_rack *rack; 17130 struct rack_sendmap *rsm; 17131 int32_t prev_state = 0; 17132 int no_output = 0; 17133 int slot_remaining = 0; 17134 #ifdef TCP_ACCOUNTING 17135 int ack_val_set = 0xf; 17136 #endif 17137 int nsegs; 17138 17139 NET_EPOCH_ASSERT(); 17140 INP_WLOCK_ASSERT(inp); 17141 17142 /* 17143 * tv passed from common code is from either M_TSTMP_LRO or 17144 * tcp_get_usecs() if no LRO m_pkthdr timestamp is present. 17145 */ 17146 rack = (struct tcp_rack *)tp->t_fb_ptr; 17147 if (rack->rack_deferred_inited == 0) { 17148 /* 17149 * If we are the connecting socket we will 17150 * hit rack_init() when no sequence numbers 17151 * are setup. This makes it so we must defer 17152 * some initialization. Call that now. 17153 */ 17154 rack_deferred_init(tp, rack); 17155 } 17156 /* 17157 * Check to see if we need to skip any output plans. This 17158 * can happen in the non-LRO path where we are pacing and 17159 * must process the ack coming in but need to defer sending 17160 * anything becase a pacing timer is running. 
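 *
 * The queue-only path below is taken when the segment carries either
 * no TCP options at all or exactly the 12-byte timestamp block that
 * LRO would also queue, i.e. leading option bytes of
 * 0x01 0x01 0x08 0x0a (NOP, NOP, TIMESTAMP, TCPOLEN_TIMESTAMP), which
 * is the pattern TCP_LRO_TS_OPTION encodes.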
17161 */ 17162 us_cts = tcp_tv_to_usectick(tv); 17163 if (m->m_flags & M_ACKCMP) { 17164 /* 17165 * All compressed ack's are ack's by definition so 17166 * remove any ack required flag and then do the processing. 17167 */ 17168 rack->rc_ack_required = 0; 17169 return (rack_do_compressed_ack_processing(tp, so, m, nxt_pkt, tv)); 17170 } 17171 thflags = tcp_get_flags(th); 17172 if ((rack->rc_always_pace == 1) && 17173 (rack->rc_ack_can_sendout_data == 0) && 17174 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 17175 (TSTMP_LT(us_cts, rack->r_ctl.rc_last_output_to))) { 17176 /* 17177 * Ok conditions are right for queuing the packets 17178 * but we do have to check the flags in the inp, it 17179 * could be, if a sack is present, we want to be awoken and 17180 * so should process the packets. 17181 */ 17182 slot_remaining = rack->r_ctl.rc_last_output_to - us_cts; 17183 if (rack->rc_tp->t_flags2 & TF2_DONT_SACK_QUEUE) { 17184 no_output = 1; 17185 } else { 17186 /* 17187 * If there is no options, or just a 17188 * timestamp option, we will want to queue 17189 * the packets. This is the same that LRO does 17190 * and will need to change with accurate ECN. 17191 */ 17192 uint32_t *ts_ptr; 17193 int optlen; 17194 17195 optlen = (th->th_off << 2) - sizeof(struct tcphdr); 17196 ts_ptr = (uint32_t *)(th + 1); 17197 if ((optlen == 0) || 17198 ((optlen == TCPOLEN_TSTAMP_APPA) && 17199 (*ts_ptr == TCP_LRO_TS_OPTION))) 17200 no_output = 1; 17201 } 17202 if ((no_output == 1) && (slot_remaining < tcp_min_hptsi_time)) { 17203 /* 17204 * It is unrealistic to think we can pace in less than 17205 * the minimum granularity of the pacer (def:250usec). So 17206 * if we have less than that time remaining we should go 17207 * ahead and allow output to be "early". We will attempt to 17208 * make up for it in any pacing time we try to apply on 17209 * the outbound packet. 17210 */ 17211 no_output = 0; 17212 } 17213 } 17214 /* 17215 * If there is a RST or FIN lets dump out the bw 17216 * with a FIN the connection may go on but we 17217 * may not. 17218 */ 17219 if ((thflags & TH_FIN) || (thflags & TH_RST)) 17220 rack_log_pacing_delay_calc(rack, 17221 rack->r_ctl.gp_bw, 17222 0, 17223 0, 17224 rack_get_gp_est(rack), /* delRate */ 17225 rack_get_lt_bw(rack), /* rttProp */ 17226 20, __LINE__, NULL, 0); 17227 if (m->m_flags & M_ACKCMP) { 17228 panic("Impossible reach m has ackcmp? m:%p tp:%p", m, tp); 17229 } 17230 cts = tcp_tv_to_usectick(tv); 17231 ms_cts = tcp_tv_to_mssectick(tv); 17232 nsegs = m->m_pkthdr.lro_nsegs; 17233 counter_u64_add(rack_proc_non_comp_ack, 1); 17234 #ifdef TCP_ACCOUNTING 17235 sched_pin(); 17236 if (thflags & TH_ACK) 17237 ts_val = get_cyclecount(); 17238 #endif 17239 if ((m->m_flags & M_TSTMP) || 17240 (m->m_flags & M_TSTMP_LRO)) { 17241 mbuf_tstmp2timespec(m, &ts); 17242 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; 17243 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; 17244 } else 17245 rack->r_ctl.act_rcv_time = *tv; 17246 kern_prefetch(rack, &prev_state); 17247 prev_state = 0; 17248 /* 17249 * Unscale the window into a 32-bit value. For the SYN_SENT state 17250 * the scale is zero. 17251 */ 17252 tiwin = th->th_win << tp->snd_scale; 17253 #ifdef TCP_ACCOUNTING 17254 if (thflags & TH_ACK) { 17255 /* 17256 * We have a tradeoff here. We can either do what we are 17257 * doing i.e. pinning to this CPU and then doing the accounting 17258 * <or> we could do a critical enter, setup the rdtsc and cpu 17259 * as in below, and then validate we are on the same CPU on 17260 * exit. 
I have chosen not to do the critical enter since 17261 * that often will gain you a context switch, and instead lock 17262 * us (line above this if) to the same CPU with sched_pin(). This 17263 * means we may be context switched out for a higher priority 17264 * interrupt but we won't be moved to another CPU. 17265 * 17266 * If this occurs (which it won't very often since we most likely 17267 * are running this code in interrupt context and only a higher 17268 * priority will bump us ... clock?) we will falsely add the 17269 * interrupt processing time on top of the ack processing 17270 * time. This is ok since it's a rare event. 17271 */ 17272 ack_val_set = tcp_do_ack_accounting(tp, th, &to, tiwin, 17273 ctf_fixed_maxseg(tp)); 17274 } 17275 #endif 17276 /* 17277 * Parse options on any incoming segment. 17278 */ 17279 memset(&to, 0, sizeof(to)); 17280 tcp_dooptions(&to, (u_char *)(th + 1), 17281 (th->th_off << 2) - sizeof(struct tcphdr), 17282 (thflags & TH_SYN) ? TO_SYN : 0); 17283 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 17284 __func__)); 17285 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 17286 __func__)); 17287 if (tp->t_flags2 & TF2_PROC_SACK_PROHIBIT) { 17288 /* 17289 * We don't look at SACKs from the 17290 * peer because the MSS is too small which 17291 * can subject us to an attack. 17292 */ 17293 to.to_flags &= ~TOF_SACK; 17294 } 17295 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 17296 (tp->t_flags & TF_GPUTINPROG)) { 17297 /* 17298 * We have a goodput in progress 17299 * and we have entered a late state. 17300 * Do we have enough data in the sb 17301 * to handle the GPUT request? 17302 */ 17303 uint32_t bytes; 17304 17305 bytes = tp->gput_ack - tp->gput_seq; 17306 if (SEQ_GT(tp->gput_seq, tp->snd_una)) 17307 bytes += tp->gput_seq - tp->snd_una; 17308 if (bytes > sbavail(&tptosocket(tp)->so_snd)) { 17309 /* 17310 * There are not enough bytes in the socket 17311 * buffer that have been sent to cover this 17312 * measurement. Cancel it.
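 *
 * Hypothetical example: with gput_seq = 1000, gput_ack = 9000 and
 * snd_una = 500, the measurement still needs 8000 + 500 = 8500 bytes
 * to be present in the socket buffer; if sbavail() reports less than
 * that the sample can never complete, so it is cancelled below.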
17313 */ 17314 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 17315 rack->r_ctl.rc_gp_srtt /*flex1*/, 17316 tp->gput_seq, 17317 0, 0, 18, __LINE__, NULL, 0); 17318 tp->t_flags &= ~TF_GPUTINPROG; 17319 } 17320 } 17321 if (tcp_bblogging_on(rack->rc_tp)) { 17322 union tcp_log_stackspecific log; 17323 struct timeval ltv; 17324 #ifdef TCP_REQUEST_TRK 17325 struct tcp_sendfile_track *tcp_req; 17326 17327 if (SEQ_GT(th->th_ack, tp->snd_una)) { 17328 tcp_req = tcp_req_find_req_for_seq(tp, (th->th_ack-1)); 17329 } else { 17330 tcp_req = tcp_req_find_req_for_seq(tp, th->th_ack); 17331 } 17332 #endif 17333 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 17334 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 17335 if (rack->rack_no_prr == 0) 17336 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 17337 else 17338 log.u_bbr.flex1 = 0; 17339 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 17340 log.u_bbr.use_lt_bw <<= 1; 17341 log.u_bbr.use_lt_bw |= rack->r_might_revert; 17342 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; 17343 log.u_bbr.bbr_state = rack->rc_free_cnt; 17344 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 17345 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; 17346 log.u_bbr.flex3 = m->m_flags; 17347 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 17348 log.u_bbr.lost = thflags; 17349 log.u_bbr.pacing_gain = 0x1; 17350 #ifdef TCP_ACCOUNTING 17351 log.u_bbr.cwnd_gain = ack_val_set; 17352 #endif 17353 log.u_bbr.flex7 = 2; 17354 if (m->m_flags & M_TSTMP) { 17355 /* Record the hardware timestamp if present */ 17356 mbuf_tstmp2timespec(m, &ts); 17357 ltv.tv_sec = ts.tv_sec; 17358 ltv.tv_usec = ts.tv_nsec / 1000; 17359 log.u_bbr.lt_epoch = tcp_tv_to_usectick(&ltv); 17360 } else if (m->m_flags & M_TSTMP_LRO) { 17361 /* Record the LRO arrival timestamp */ 17362 mbuf_tstmp2timespec(m, &ts); 17363 ltv.tv_sec = ts.tv_sec; 17364 ltv.tv_usec = ts.tv_nsec / 1000; 17365 log.u_bbr.flex5 = tcp_tv_to_usectick(&ltv); 17366 } 17367 log.u_bbr.timeStamp = tcp_get_usecs(&ltv); 17368 /* Log the rcv time */ 17369 log.u_bbr.delRate = m->m_pkthdr.rcv_tstmp; 17370 #ifdef TCP_REQUEST_TRK 17371 log.u_bbr.applimited = tp->t_tcpreq_closed; 17372 log.u_bbr.applimited <<= 8; 17373 log.u_bbr.applimited |= tp->t_tcpreq_open; 17374 log.u_bbr.applimited <<= 8; 17375 log.u_bbr.applimited |= tp->t_tcpreq_req; 17376 if (tcp_req) { 17377 /* Copy out any client req info */ 17378 /* seconds */ 17379 log.u_bbr.pkt_epoch = (tcp_req->localtime / HPTS_USEC_IN_SEC); 17380 /* useconds */ 17381 log.u_bbr.delivered = (tcp_req->localtime % HPTS_USEC_IN_SEC); 17382 log.u_bbr.rttProp = tcp_req->timestamp; 17383 log.u_bbr.cur_del_rate = tcp_req->start; 17384 if (tcp_req->flags & TCP_TRK_TRACK_FLG_OPEN) { 17385 log.u_bbr.flex8 |= 1; 17386 } else { 17387 log.u_bbr.flex8 |= 2; 17388 log.u_bbr.bw_inuse = tcp_req->end; 17389 } 17390 log.u_bbr.flex6 = tcp_req->start_seq; 17391 if (tcp_req->flags & TCP_TRK_TRACK_FLG_COMP) { 17392 log.u_bbr.flex8 |= 4; 17393 log.u_bbr.epoch = tcp_req->end_seq; 17394 } 17395 } 17396 #endif 17397 TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, 17398 tlen, &log, true, &ltv); 17399 } 17400 /* Remove ack required flag if set, we have one */ 17401 if (thflags & TH_ACK) 17402 rack->rc_ack_required = 0; 17403 rack_log_type_bbrsnd(rack, 0, 0, cts, tv, __LINE__); 17404 if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { 17405 way_out = 4; 17406 retval = 0; 17407 m_freem(m); 17408 goto done_with_input; 17409 } 17410 /* 17411 * If a segment with the ACK-bit set arrives in the
SYN-SENT state 17412 * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9. 17413 */ 17414 if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && 17415 (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { 17416 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 17417 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 17418 #ifdef TCP_ACCOUNTING 17419 sched_unpin(); 17420 #endif 17421 return (1); 17422 } 17423 /* 17424 * If timestamps were negotiated during SYN/ACK and a 17425 * segment without a timestamp is received, silently drop 17426 * the segment, unless it is a RST segment or missing timestamps are 17427 * tolerated. 17428 * See section 3.2 of RFC 7323. 17429 */ 17430 if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS) && 17431 ((thflags & TH_RST) == 0) && (V_tcp_tolerate_missing_ts == 0)) { 17432 way_out = 5; 17433 retval = 0; 17434 m_freem(m); 17435 goto done_with_input; 17436 } 17437 /* 17438 * Segment received on connection. Reset idle time and keep-alive 17439 * timer. XXX: This should be done after segment validation to 17440 * ignore broken/spoofed segs. 17441 */ 17442 if (tp->t_idle_reduce && 17443 (tp->snd_max == tp->snd_una) && 17444 (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { 17445 counter_u64_add(rack_input_idle_reduces, 1); 17446 rack_cc_after_idle(rack, tp); 17447 } 17448 tp->t_rcvtime = ticks; 17449 #ifdef STATS 17450 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); 17451 #endif 17452 if (tiwin > rack->r_ctl.rc_high_rwnd) 17453 rack->r_ctl.rc_high_rwnd = tiwin; 17454 /* 17455 * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move 17456 * this to occur after we've validated the segment. 17457 */ 17458 if (tcp_ecn_input_segment(tp, thflags, tlen, 17459 tcp_packets_this_ack(tp, th->th_ack), 17460 iptos)) 17461 rack_cong_signal(tp, CC_ECN, th->th_ack, __LINE__); 17462 17463 /* 17464 * If echoed timestamp is later than the current time, fall back to 17465 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 17466 * were used when this connection was established. 17467 */ 17468 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 17469 to.to_tsecr -= tp->ts_offset; 17470 if (TSTMP_GT(to.to_tsecr, ms_cts)) 17471 to.to_tsecr = 0; 17472 } 17473 if ((rack->r_rcvpath_rtt_up == 1) && 17474 (to.to_flags & TOF_TS) && 17475 (TSTMP_GEQ(to.to_tsecr, rack->r_ctl.last_rcv_tstmp_for_rtt))) { 17476 uint32_t rtt = 0; 17477 17478 /* 17479 * We are receiving only and thus not sending 17480 * data to do an RTT. We set a flag when we first 17481 * sent this TS to the peer. We now have it back 17482 * and have an RTT to share. We log it as a conf 17483 * 4, we are not so sure about it.. since we 17484 * may have lost an ack. 17485 */ 17486 if (TSTMP_GT(cts, rack->r_ctl.last_time_of_arm_rcv)) 17487 rtt = (cts - rack->r_ctl.last_time_of_arm_rcv); 17488 rack->r_rcvpath_rtt_up = 0; 17489 /* Submit and commit the timer */ 17490 if (rtt > 0) { 17491 tcp_rack_xmit_timer(rack, rtt, 0, rtt, 4, NULL, 1); 17492 tcp_rack_xmit_timer_commit(rack, tp); 17493 } 17494 } 17495 /* 17496 * If its the first time in we need to take care of options and 17497 * verify we can do SACK for rack! 17498 */ 17499 if (rack->r_state == 0) { 17500 /* Should be init'd by rack_init() */ 17501 KASSERT(rack->rc_inp != NULL, 17502 ("%s: rack->rc_inp unexpectedly NULL", __func__)); 17503 if (rack->rc_inp == NULL) { 17504 rack->rc_inp = inp; 17505 } 17506 17507 /* 17508 * Process options only when we get SYN/ACK back. 
The SYN 17509 * case for incoming connections is handled in tcp_syncache. 17510 * According to RFC1323 the window field in a SYN (i.e., a 17511 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX 17512 * this is traditional behavior, may need to be cleaned up. 17513 */ 17514 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 17515 /* Handle parallel SYN for ECN */ 17516 tcp_ecn_input_parallel_syn(tp, thflags, iptos); 17517 if ((to.to_flags & TOF_SCALE) && 17518 (tp->t_flags & TF_REQ_SCALE)) { 17519 tp->t_flags |= TF_RCVD_SCALE; 17520 tp->snd_scale = to.to_wscale; 17521 } else 17522 tp->t_flags &= ~TF_REQ_SCALE; 17523 /* 17524 * Initial send window. It will be updated with the 17525 * next incoming segment to the scaled value. 17526 */ 17527 tp->snd_wnd = th->th_win; 17528 rack_validate_fo_sendwin_up(tp, rack); 17529 if ((to.to_flags & TOF_TS) && 17530 (tp->t_flags & TF_REQ_TSTMP)) { 17531 tp->t_flags |= TF_RCVD_TSTMP; 17532 tp->ts_recent = to.to_tsval; 17533 tp->ts_recent_age = cts; 17534 } else 17535 tp->t_flags &= ~TF_REQ_TSTMP; 17536 if (to.to_flags & TOF_MSS) { 17537 tcp_mss(tp, to.to_mss); 17538 } 17539 if ((tp->t_flags & TF_SACK_PERMIT) && 17540 (to.to_flags & TOF_SACKPERM) == 0) 17541 tp->t_flags &= ~TF_SACK_PERMIT; 17542 if (tp->t_flags & TF_FASTOPEN) { 17543 if (to.to_flags & TOF_FASTOPEN) { 17544 uint16_t mss; 17545 17546 if (to.to_flags & TOF_MSS) 17547 mss = to.to_mss; 17548 else 17549 if ((inp->inp_vflag & INP_IPV6) != 0) 17550 mss = TCP6_MSS; 17551 else 17552 mss = TCP_MSS; 17553 tcp_fastopen_update_cache(tp, mss, 17554 to.to_tfo_len, to.to_tfo_cookie); 17555 } else 17556 tcp_fastopen_disable_path(tp); 17557 } 17558 } 17559 /* 17560 * At this point we are at the initial call. Here we decide 17561 * if we are doing RACK or not. We do this by seeing if 17562 * TF_SACK_PERMIT is set and the sack-not-required is clear. 17563 * The code now does do dup-ack counting so if you don't 17564 * switch back you won't get rack & TLP, but you will still 17565 * get this stack. 17566 */ 17567 17568 if ((rack_sack_not_required == 0) && 17569 ((tp->t_flags & TF_SACK_PERMIT) == 0)) { 17570 tcp_switch_back_to_default(tp); 17571 (*tp->t_fb->tfb_tcp_do_segment)(tp, m, th, drop_hdrlen, 17572 tlen, iptos); 17573 #ifdef TCP_ACCOUNTING 17574 sched_unpin(); 17575 #endif 17576 return (1); 17577 } 17578 tcp_set_hpts(tp); 17579 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); 17580 } 17581 if (thflags & TH_FIN) 17582 tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN); 17583 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 17584 if ((rack->rc_gp_dyn_mul) && 17585 (rack->use_fixed_rate == 0) && 17586 (rack->rc_always_pace)) { 17587 /* Check in on probertt */ 17588 rack_check_probe_rtt(rack, cts); 17589 } 17590 rack_clear_rate_sample(rack); 17591 if ((rack->forced_ack) && 17592 ((tcp_get_flags(th) & TH_RST) == 0)) { 17593 rack_handle_probe_response(rack, tiwin, us_cts); 17594 } 17595 /* 17596 * This is the one exception case where we set the rack state 17597 * always. All other times (timers etc) we must have a rack-state 17598 * set (so we assure we have done the checks above for SACK). 
17599 */ 17600 rack->r_ctl.rc_rcvtime = cts; 17601 if (rack->r_state != tp->t_state) 17602 rack_set_state(tp, rack); 17603 if (SEQ_GT(th->th_ack, tp->snd_una) && 17604 (rsm = tqhash_min(rack->r_ctl.tqh)) != NULL) 17605 kern_prefetch(rsm, &prev_state); 17606 prev_state = rack->r_state; 17607 if ((thflags & TH_RST) && 17608 ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && 17609 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || 17610 (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq))) { 17611 /* The connection will be killed by a reset check the tracepoint */ 17612 tcp_trace_point(rack->rc_tp, TCP_TP_RESET_RCV); 17613 } 17614 retval = (*rack->r_substate) (m, th, so, 17615 tp, &to, drop_hdrlen, 17616 tlen, tiwin, thflags, nxt_pkt, iptos); 17617 if (retval == 0) { 17618 /* 17619 * If retval is 1 the tcb is unlocked and most likely the tp 17620 * is gone. 17621 */ 17622 INP_WLOCK_ASSERT(inp); 17623 if ((rack->rc_gp_dyn_mul) && 17624 (rack->rc_always_pace) && 17625 (rack->use_fixed_rate == 0) && 17626 rack->in_probe_rtt && 17627 (rack->r_ctl.rc_time_probertt_starts == 0)) { 17628 /* 17629 * If we are going for target, lets recheck before 17630 * we output. 17631 */ 17632 rack_check_probe_rtt(rack, cts); 17633 } 17634 if (rack->set_pacing_done_a_iw == 0) { 17635 /* How much has been acked? */ 17636 if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) { 17637 /* We have enough to set in the pacing segment size */ 17638 rack->set_pacing_done_a_iw = 1; 17639 rack_set_pace_segments(tp, rack, __LINE__, NULL); 17640 } 17641 } 17642 tcp_rack_xmit_timer_commit(rack, tp); 17643 #ifdef TCP_ACCOUNTING 17644 /* 17645 * If we set the ack_val_se to what ack processing we are doing 17646 * we also want to track how many cycles we burned. Note 17647 * the bits after tcp_output we let be "free". This is because 17648 * we are also tracking the tcp_output times as well. Note the 17649 * use of 0xf here since we only have 11 counter (0 - 0xa) and 17650 * 0xf cannot be returned and is what we initialize it too to 17651 * indicate we are not doing the tabulations. 17652 */ 17653 if (ack_val_set != 0xf) { 17654 uint64_t crtsc; 17655 17656 crtsc = get_cyclecount(); 17657 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 17658 tp->tcp_proc_time[ack_val_set] += (crtsc - ts_val); 17659 } 17660 } 17661 #endif 17662 if ((nxt_pkt == 0) && (no_output == 0)) { 17663 if ((rack->r_wanted_output != 0) || 17664 (tp->t_flags & TF_ACKNOW) || 17665 (rack->r_fast_output != 0)) { 17666 17667 do_output_now: 17668 if (tcp_output(tp) < 0) { 17669 #ifdef TCP_ACCOUNTING 17670 sched_unpin(); 17671 #endif 17672 return (1); 17673 } 17674 did_out = 1; 17675 } 17676 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 17677 rack_free_trim(rack); 17678 } else if ((nxt_pkt == 0) && (tp->t_flags & TF_ACKNOW)) { 17679 goto do_output_now; 17680 } else if ((no_output == 1) && 17681 (nxt_pkt == 0) && 17682 (tcp_in_hpts(rack->rc_tp) == 0)) { 17683 /* 17684 * We are not in hpts and we had a pacing timer up. Use 17685 * the remaining time (slot_remaining) to restart the timer. 
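 *
 * slot_remaining was computed on entry as
 * rc_last_output_to - us_cts, so an ack that arrives, say, 3000 us
 * before the pacing deadline simply re-arms hpts for those 3000 us.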
17686 */ 17687 KASSERT ((slot_remaining != 0), ("slot remaining is zero for rack:%p tp:%p", rack, tp)); 17688 rack_start_hpts_timer(rack, tp, cts, slot_remaining, 0, 0); 17689 rack_free_trim(rack); 17690 } 17691 /* Clear the flag, it may have been cleared by output but we may not have */ 17692 if ((nxt_pkt == 0) && (tp->t_flags2 & TF2_HPTS_CALLS)) 17693 tp->t_flags2 &= ~TF2_HPTS_CALLS; 17694 /* 17695 * The draft (v3) calls for us to use SEQ_GEQ, but that 17696 * causes issues when we are just going app limited. Lets 17697 * instead use SEQ_GT <or> where its equal but more data 17698 * is outstanding. 17699 * 17700 * Also make sure we are on the last ack of a series. We 17701 * have to have all the ack's processed in queue to know 17702 * if there is something left outstanding. 17703 */ 17704 if (SEQ_GEQ(tp->snd_una, rack->r_ctl.roundends) && 17705 (rack->rc_new_rnd_needed == 0) && 17706 (nxt_pkt == 0)) { 17707 /* 17708 * We have crossed into a new round with 17709 * the new snd_unae. 17710 */ 17711 rack_new_round_setup(tp, rack, tp->snd_una); 17712 } 17713 if ((nxt_pkt == 0) && 17714 ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && 17715 (SEQ_GT(tp->snd_max, tp->snd_una) || 17716 (tp->t_flags & TF_DELACK) || 17717 ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 17718 (tp->t_state <= TCPS_CLOSING)))) { 17719 /* We could not send (probably in the hpts but stopped the timer earlier)? */ 17720 if ((tp->snd_max == tp->snd_una) && 17721 ((tp->t_flags & TF_DELACK) == 0) && 17722 (tcp_in_hpts(rack->rc_tp)) && 17723 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 17724 /* keep alive not needed if we are hptsi output yet */ 17725 ; 17726 } else { 17727 int late = 0; 17728 if (tcp_in_hpts(tp)) { 17729 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 17730 us_cts = tcp_get_usecs(NULL); 17731 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 17732 rack->r_early = 1; 17733 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); 17734 } else 17735 late = 1; 17736 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 17737 } 17738 tcp_hpts_remove(tp); 17739 } 17740 if (late && (did_out == 0)) { 17741 /* 17742 * We are late in the sending 17743 * and we did not call the output 17744 * (this probably should not happen). 17745 */ 17746 goto do_output_now; 17747 } 17748 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); 17749 } 17750 way_out = 1; 17751 } else if (nxt_pkt == 0) { 17752 /* Do we have the correct timer running? */ 17753 rack_timer_audit(tp, rack, &so->so_snd); 17754 way_out = 2; 17755 } 17756 done_with_input: 17757 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out, max(1, nsegs)); 17758 if (did_out) 17759 rack->r_wanted_output = 0; 17760 } 17761 17762 #ifdef TCP_ACCOUNTING 17763 sched_unpin(); 17764 #endif 17765 return (retval); 17766 } 17767 17768 static void 17769 rack_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, 17770 int32_t drop_hdrlen, int32_t tlen, uint8_t iptos) 17771 { 17772 struct timeval tv; 17773 17774 /* First lets see if we have old packets */ 17775 if (!STAILQ_EMPTY(&tp->t_inqueue)) { 17776 if (ctf_do_queued_segments(tp, 1)) { 17777 m_freem(m); 17778 return; 17779 } 17780 } 17781 if (m->m_flags & M_TSTMP_LRO) { 17782 mbuf_tstmp2timeval(m, &tv); 17783 } else { 17784 /* Should not be should we kassert instead? 
*/ 17785 tcp_get_usecs(&tv); 17786 } 17787 if (rack_do_segment_nounlock(tp, m, th, drop_hdrlen, tlen, iptos, 0, 17788 &tv) == 0) { 17789 INP_WUNLOCK(tptoinpcb(tp)); 17790 } 17791 } 17792 17793 struct rack_sendmap * 17794 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) 17795 { 17796 struct rack_sendmap *rsm = NULL; 17797 int32_t idx; 17798 uint32_t srtt = 0, thresh = 0, ts_low = 0; 17799 17800 /* Return the next guy to be re-transmitted */ 17801 if (tqhash_empty(rack->r_ctl.tqh)) { 17802 return (NULL); 17803 } 17804 if (tp->t_flags & TF_SENTFIN) { 17805 /* retran the end FIN? */ 17806 return (NULL); 17807 } 17808 /* ok lets look at this one */ 17809 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 17810 if (rack->r_must_retran && rsm && (rsm->r_flags & RACK_MUST_RXT)) { 17811 return (rsm); 17812 } 17813 if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) { 17814 goto check_it; 17815 } 17816 rsm = rack_find_lowest_rsm(rack); 17817 if (rsm == NULL) { 17818 return (NULL); 17819 } 17820 check_it: 17821 if (((rack->rc_tp->t_flags & TF_SACK_PERMIT) == 0) && 17822 (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { 17823 /* 17824 * No sack so we automatically do the 3 strikes and 17825 * retransmit (no rack timer would be started). 17826 */ 17827 return (rsm); 17828 } 17829 if (rsm->r_flags & RACK_ACKED) { 17830 return (NULL); 17831 } 17832 if (((rsm->r_flags & RACK_SACK_PASSED) == 0) && 17833 (rsm->r_dupack < DUP_ACK_THRESHOLD)) { 17834 /* Its not yet ready */ 17835 return (NULL); 17836 } 17837 srtt = rack_grab_rtt(tp, rack); 17838 idx = rsm->r_rtr_cnt - 1; 17839 ts_low = (uint32_t)rsm->r_tim_lastsent[idx]; 17840 thresh = rack_calc_thresh_rack(rack, srtt, tsused, __LINE__, 1); 17841 if ((tsused == ts_low) || 17842 (TSTMP_LT(tsused, ts_low))) { 17843 /* No time since sending */ 17844 return (NULL); 17845 } 17846 if ((tsused - ts_low) < thresh) { 17847 /* It has not been long enough yet */ 17848 return (NULL); 17849 } 17850 if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || 17851 ((rsm->r_flags & RACK_SACK_PASSED))) { 17852 /* 17853 * We have passed the dup-ack threshold <or> 17854 * a SACK has indicated this is missing. 17855 * Note that if you are a declared attacker 17856 * it is only the dup-ack threshold that 17857 * will cause retransmits. 17858 */ 17859 /* log retransmit reason */ 17860 rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1); 17861 rack->r_fast_output = 0; 17862 return (rsm); 17863 } 17864 return (NULL); 17865 } 17866 17867 static void 17868 rack_log_pacing_delay_calc (struct tcp_rack *rack, uint32_t len, uint32_t slot, 17869 uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, 17870 int line, struct rack_sendmap *rsm, uint8_t quality) 17871 { 17872 if (tcp_bblogging_on(rack->rc_tp)) { 17873 union tcp_log_stackspecific log; 17874 struct timeval tv; 17875 17876 if (rack_verbose_logging == 0) { 17877 /* 17878 * We are not verbose screen out all but 17879 * ones we always want. 
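 *
 * (Methods 2, 3, 7, 14, 20 and 89 are always logged; anything else
 * is dropped here unless rack_verbose_logging is enabled.)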
17880 */ 17881 if ((method != 2) && 17882 (method != 3) && 17883 (method != 7) && 17884 (method != 89) && 17885 (method != 14) && 17886 (method != 20)) { 17887 return; 17888 } 17889 } 17890 memset(&log, 0, sizeof(log)); 17891 log.u_bbr.flex1 = slot; 17892 log.u_bbr.flex2 = len; 17893 log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs; 17894 log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs; 17895 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ss; 17896 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_ca; 17897 log.u_bbr.use_lt_bw = rack->rc_ack_can_sendout_data; 17898 log.u_bbr.use_lt_bw <<= 1; 17899 log.u_bbr.use_lt_bw |= rack->r_late; 17900 log.u_bbr.use_lt_bw <<= 1; 17901 log.u_bbr.use_lt_bw |= rack->r_early; 17902 log.u_bbr.use_lt_bw <<= 1; 17903 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set; 17904 log.u_bbr.use_lt_bw <<= 1; 17905 log.u_bbr.use_lt_bw |= rack->rc_gp_filled; 17906 log.u_bbr.use_lt_bw <<= 1; 17907 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; 17908 log.u_bbr.use_lt_bw <<= 1; 17909 log.u_bbr.use_lt_bw |= rack->in_probe_rtt; 17910 log.u_bbr.use_lt_bw <<= 1; 17911 log.u_bbr.use_lt_bw |= rack->gp_ready; 17912 log.u_bbr.pkt_epoch = line; 17913 log.u_bbr.epoch = rack->r_ctl.rc_agg_delayed; 17914 log.u_bbr.lt_epoch = rack->r_ctl.rc_agg_early; 17915 log.u_bbr.applimited = rack->r_ctl.rack_per_of_gp_rec; 17916 log.u_bbr.bw_inuse = bw_est; 17917 log.u_bbr.delRate = bw; 17918 if (rack->r_ctl.gp_bw == 0) 17919 log.u_bbr.cur_del_rate = 0; 17920 else 17921 log.u_bbr.cur_del_rate = rack_get_bw(rack); 17922 log.u_bbr.rttProp = len_time; 17923 log.u_bbr.pkts_out = rack->r_ctl.rc_rack_min_rtt; 17924 log.u_bbr.lost = rack->r_ctl.rc_probertt_sndmax_atexit; 17925 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); 17926 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) { 17927 /* We are in slow start */ 17928 log.u_bbr.flex7 = 1; 17929 } else { 17930 /* we are on congestion avoidance */ 17931 log.u_bbr.flex7 = 0; 17932 } 17933 log.u_bbr.flex8 = method; 17934 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 17935 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 17936 log.u_bbr.cwnd_gain = rack->rc_gp_saw_rec; 17937 log.u_bbr.cwnd_gain <<= 1; 17938 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; 17939 log.u_bbr.cwnd_gain <<= 1; 17940 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; 17941 log.u_bbr.bbr_substate = quality; 17942 log.u_bbr.bbr_state = rack->dgp_on; 17943 log.u_bbr.bbr_state <<= 1; 17944 log.u_bbr.bbr_state |= rack->rc_pace_to_cwnd; 17945 log.u_bbr.bbr_state <<= 2; 17946 TCP_LOG_EVENTP(rack->rc_tp, NULL, 17947 &rack->rc_inp->inp_socket->so_rcv, 17948 &rack->rc_inp->inp_socket->so_snd, 17949 BBR_LOG_HPTSI_CALC, 0, 17950 0, &log, false, &tv); 17951 } 17952 } 17953 17954 static uint32_t 17955 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss) 17956 { 17957 uint32_t new_tso, user_max, pace_one; 17958 17959 user_max = rack->rc_user_set_max_segs * mss; 17960 if (rack->rc_force_max_seg) { 17961 return (user_max); 17962 } 17963 if (rack->use_fixed_rate && 17964 ((rack->r_ctl.crte == NULL) || 17965 (bw != rack->r_ctl.crte->rate))) { 17966 /* Use the user mss since we are not exactly matched */ 17967 return (user_max); 17968 } 17969 if (rack_pace_one_seg || 17970 (rack->r_ctl.rc_user_set_min_segs == 1)) 17971 pace_one = 1; 17972 else 17973 pace_one = 0; 17974 17975 new_tso = tcp_get_pacing_burst_size_w_divisor(rack->rc_tp, bw, mss, 17976 pace_one, rack->r_ctl.crte, NULL, rack->r_ctl.pace_len_divisor); 17977 if (new_tso > user_max) 17978 new_tso = user_max; 
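/*
 * Illustrative sizing (made-up numbers): with mss = 1448 and
 * rc_user_set_max_segs = 64, user_max is 92672 bytes. If the burst
 * size computed above comes back as 40 segments worth (57920 bytes)
 * it is kept as-is; the hybrid-mode and min-segment checks below can
 * only raise new_tso, never shrink it.
 */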
17979 if (rack->rc_hybrid_mode && rack->r_ctl.client_suggested_maxseg) { 17980 if (((uint32_t)rack->r_ctl.client_suggested_maxseg * mss) > new_tso) 17981 new_tso = (uint32_t)rack->r_ctl.client_suggested_maxseg * mss; 17982 } 17983 if (rack->r_ctl.rc_user_set_min_segs && 17984 ((rack->r_ctl.rc_user_set_min_segs * mss) > new_tso)) 17985 new_tso = rack->r_ctl.rc_user_set_min_segs * mss; 17986 return (new_tso); 17987 } 17988 17989 static uint64_t 17990 rack_arrive_at_discounted_rate(struct tcp_rack *rack, uint64_t window_input, uint32_t *rate_set, uint32_t *gain_b) 17991 { 17992 uint64_t reduced_win; 17993 uint32_t gain; 17994 17995 if (window_input < rc_init_window(rack)) { 17996 /* 17997 * The cwnd is collapsed to 17998 * nearly zero, maybe because of a time-out? 17999 * Lets drop back to the lt-bw. 18000 */ 18001 reduced_win = rack_get_lt_bw(rack); 18002 /* Set the flag so the caller knows its a rate and not a reduced window */ 18003 *rate_set = 1; 18004 gain = 100; 18005 } else if (IN_RECOVERY(rack->rc_tp->t_flags)) { 18006 /* 18007 * If we are in recover our cwnd needs to be less for 18008 * our pacing consideration. 18009 */ 18010 if (rack->rack_hibeta == 0) { 18011 reduced_win = window_input / 2; 18012 gain = 50; 18013 } else { 18014 reduced_win = window_input * rack->r_ctl.saved_hibeta; 18015 reduced_win /= 100; 18016 gain = rack->r_ctl.saved_hibeta; 18017 } 18018 } else { 18019 /* 18020 * Apply Timely factor to increase/decrease the 18021 * amount we are pacing at. 18022 */ 18023 gain = rack_get_output_gain(rack, NULL); 18024 if (gain > rack_gain_p5_ub) { 18025 gain = rack_gain_p5_ub; 18026 } 18027 reduced_win = window_input * gain; 18028 reduced_win /= 100; 18029 } 18030 if (gain_b != NULL) 18031 *gain_b = gain; 18032 /* 18033 * What is being returned here is a trimmed down 18034 * window values in all cases where rate_set is left 18035 * at 0. In one case we actually return the rate (lt_bw). 18036 * the "reduced_win" is returned as a slimmed down cwnd that 18037 * is then calculated by the caller into a rate when rate_set 18038 * is 0. 18039 */ 18040 return (reduced_win); 18041 } 18042 18043 static int32_t 18044 pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz, int *capped, uint64_t *rate_wanted, uint8_t non_paced) 18045 { 18046 uint64_t lentim, fill_bw; 18047 18048 rack->r_via_fill_cw = 0; 18049 if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use) 18050 return (slot); 18051 if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd) 18052 return (slot); 18053 if (rack->r_ctl.rc_last_us_rtt == 0) 18054 return (slot); 18055 if (rack->rc_pace_fill_if_rttin_range && 18056 (rack->r_ctl.rc_last_us_rtt >= 18057 (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) { 18058 /* The rtt is huge, N * smallest, lets not fill */ 18059 return (slot); 18060 } 18061 if (rack->r_ctl.fillcw_cap && *rate_wanted >= rack->r_ctl.fillcw_cap) 18062 return (slot); 18063 /* 18064 * first lets calculate the b/w based on the last us-rtt 18065 * and the the smallest send window. 
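 *
 * Worked example (hypothetical numbers): with an effective window of
 * 64000 bytes and rc_last_us_rtt = 10000 us, fill_bw becomes
 * 64000 * 1000000 / 10000 = 6,400,000 bytes/sec (about 51 Mbit/s),
 * ignoring the optional discount, the rwnd clamp and fillcw_cap.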
18066 */ 18067 fill_bw = min(rack->rc_tp->snd_cwnd, rack->r_ctl.cwnd_to_use); 18068 if (rack->rc_fillcw_apply_discount) { 18069 uint32_t rate_set = 0; 18070 18071 fill_bw = rack_arrive_at_discounted_rate(rack, fill_bw, &rate_set, NULL); 18072 if (rate_set) { 18073 goto at_lt_bw; 18074 } 18075 } 18076 /* Take the rwnd if its smaller */ 18077 if (fill_bw > rack->rc_tp->snd_wnd) 18078 fill_bw = rack->rc_tp->snd_wnd; 18079 /* Now lets make it into a b/w */ 18080 fill_bw *= (uint64_t)HPTS_USEC_IN_SEC; 18081 fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt; 18082 /* Adjust to any cap */ 18083 if (rack->r_ctl.fillcw_cap && fill_bw >= rack->r_ctl.fillcw_cap) 18084 fill_bw = rack->r_ctl.fillcw_cap; 18085 18086 at_lt_bw: 18087 if (rack_bw_multipler > 0) { 18088 /* 18089 * We want to limit fill-cw to the some multiplier 18090 * of the max(lt_bw, gp_est). The normal default 18091 * is 0 for off, so a sysctl has enabled it. 18092 */ 18093 uint64_t lt_bw, gp, rate; 18094 18095 gp = rack_get_gp_est(rack); 18096 lt_bw = rack_get_lt_bw(rack); 18097 if (lt_bw > gp) 18098 rate = lt_bw; 18099 else 18100 rate = gp; 18101 rate *= rack_bw_multipler; 18102 rate /= 100; 18103 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 18104 union tcp_log_stackspecific log; 18105 struct timeval tv; 18106 18107 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 18108 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 18109 log.u_bbr.flex1 = rack_bw_multipler; 18110 log.u_bbr.flex2 = len; 18111 log.u_bbr.cur_del_rate = gp; 18112 log.u_bbr.delRate = lt_bw; 18113 log.u_bbr.bw_inuse = rate; 18114 log.u_bbr.rttProp = fill_bw; 18115 log.u_bbr.flex8 = 44; 18116 tcp_log_event(rack->rc_tp, NULL, NULL, NULL, 18117 BBR_LOG_CWND, 0, 18118 0, &log, false, NULL, 18119 __func__, __LINE__, &tv); 18120 } 18121 if (fill_bw > rate) 18122 fill_bw = rate; 18123 } 18124 /* We are below the min b/w */ 18125 if (non_paced) 18126 *rate_wanted = fill_bw; 18127 if ((fill_bw < RACK_MIN_BW) || (fill_bw < *rate_wanted)) 18128 return (slot); 18129 rack->r_via_fill_cw = 1; 18130 if (rack->r_rack_hw_rate_caps && 18131 (rack->r_ctl.crte != NULL)) { 18132 uint64_t high_rate; 18133 18134 high_rate = tcp_hw_highest_rate(rack->r_ctl.crte); 18135 if (fill_bw > high_rate) { 18136 /* We are capping bw at the highest rate table entry */ 18137 if (*rate_wanted > high_rate) { 18138 /* The original rate was also capped */ 18139 rack->r_via_fill_cw = 0; 18140 } 18141 rack_log_hdwr_pacing(rack, 18142 fill_bw, high_rate, __LINE__, 18143 0, 3); 18144 fill_bw = high_rate; 18145 if (capped) 18146 *capped = 1; 18147 } 18148 } else if ((rack->r_ctl.crte == NULL) && 18149 (rack->rack_hdrw_pacing == 0) && 18150 (rack->rack_hdw_pace_ena) && 18151 rack->r_rack_hw_rate_caps && 18152 (rack->rack_attempt_hdwr_pace == 0) && 18153 (rack->rc_inp->inp_route.ro_nh != NULL) && 18154 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 18155 /* 18156 * Ok we may have a first attempt that is greater than our top rate 18157 * lets check. 
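 * Added note (not in the original): tcp_hw_highest_rate_ifp()
 * returning zero below is taken to mean the interface has no usable
 * hardware rate table, in which case fill_bw is left uncapped by
 * this path.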
18158 */ 18159 uint64_t high_rate; 18160 18161 high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp); 18162 if (high_rate) { 18163 if (fill_bw > high_rate) { 18164 fill_bw = high_rate; 18165 if (capped) 18166 *capped = 1; 18167 } 18168 } 18169 } 18170 if (rack->r_ctl.bw_rate_cap && (fill_bw > rack->r_ctl.bw_rate_cap)) { 18171 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 18172 fill_bw, 0, 0, HYBRID_LOG_RATE_CAP, 2, NULL, __LINE__); 18173 fill_bw = rack->r_ctl.bw_rate_cap; 18174 } 18175 /* 18176 * Ok fill_bw holds our mythical b/w to fill the cwnd 18177 * in an rtt (unless it was capped), what does that 18178 * time wise equate too? 18179 */ 18180 lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC; 18181 lentim /= fill_bw; 18182 *rate_wanted = fill_bw; 18183 if (non_paced || (lentim < slot)) { 18184 rack_log_pacing_delay_calc(rack, len, slot, fill_bw, 18185 0, lentim, 12, __LINE__, NULL, 0); 18186 return ((int32_t)lentim); 18187 } else 18188 return (slot); 18189 } 18190 18191 static uint32_t 18192 rack_policer_check_send(struct tcp_rack *rack, uint32_t len, uint32_t segsiz, uint32_t *needs) 18193 { 18194 uint64_t calc; 18195 18196 rack->rc_policer_should_pace = 0; 18197 calc = rack_policer_bucket_reserve * rack->r_ctl.policer_bucket_size; 18198 calc /= 100; 18199 /* 18200 * Now lets look at if we want more than is in the bucket <or> 18201 * we want more than is reserved in the bucket. 18202 */ 18203 if (rack_verbose_logging > 0) 18204 policer_detection_log(rack, len, segsiz, calc, rack->r_ctl.current_policer_bucket, 8); 18205 if ((calc > rack->r_ctl.current_policer_bucket) || 18206 (len >= (rack->r_ctl.current_policer_bucket - calc))) { 18207 /* 18208 * We may want to pace depending on if we are going 18209 * into the reserve or not. 18210 */ 18211 uint32_t newlen; 18212 18213 if (calc > rack->r_ctl.current_policer_bucket) { 18214 /* 18215 * This will eat into the reserve if we 18216 * don't have room at all some lines 18217 * below will catch it. 18218 */ 18219 newlen = rack->r_ctl.policer_max_seg; 18220 rack->rc_policer_should_pace = 1; 18221 } else { 18222 /* 18223 * We have all of the reserve plus something in the bucket 18224 * that we can give out. 18225 */ 18226 newlen = rack->r_ctl.current_policer_bucket - calc; 18227 if (newlen < rack->r_ctl.policer_max_seg) { 18228 /* 18229 * Into the reserve to get a full policer_max_seg 18230 * so we set the len to that and eat into 18231 * the reserve. If we go over the code 18232 * below will make us wait. 18233 */ 18234 newlen = rack->r_ctl.policer_max_seg; 18235 rack->rc_policer_should_pace = 1; 18236 } 18237 } 18238 if (newlen > rack->r_ctl.current_policer_bucket) { 18239 /* We have to wait some */ 18240 *needs = newlen - rack->r_ctl.current_policer_bucket; 18241 return (0); 18242 } 18243 if (rack_verbose_logging > 0) 18244 policer_detection_log(rack, len, segsiz, newlen, 0, 9); 18245 len = newlen; 18246 } /* else we have all len available above the reserve */ 18247 if (rack_verbose_logging > 0) 18248 policer_detection_log(rack, len, segsiz, calc, 0, 10); 18249 return (len); 18250 } 18251 18252 static uint32_t 18253 rack_policed_sending(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, uint32_t segsiz, int call_line) 18254 { 18255 /* 18256 * Given a send of len, and a token bucket set at current_policer_bucket_size 18257 * are we close enough to the end of the bucket that we need to pace? If so 18258 * calculate out a time and return it. Otherwise subtract the tokens from 18259 * the bucket. 
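 * Illustrative example (added, not in the original): with a detected
 * policer_bw of 1,250,000 bytes/sec (roughly 10 Mbps) and a 15,000
 * byte send that trips the reserve check, the pacing delay works out
 * to 15000 * HPTS_USEC_IN_SEC / 1250000, or 12,000 usecs.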
18260 */ 18261 uint64_t calc; 18262 18263 if ((rack->r_ctl.policer_bw == 0) || 18264 (rack->r_ctl.policer_bucket_size < segsiz)) { 18265 /* 18266 * We should have an estimate here... 18267 */ 18268 return (0); 18269 } 18270 calc = (uint64_t)rack_policer_bucket_reserve * (uint64_t)rack->r_ctl.policer_bucket_size; 18271 calc /= 100; 18272 if ((rack->r_ctl.current_policer_bucket < len) || 18273 (rack->rc_policer_should_pace == 1) || 18274 ((rack->r_ctl.current_policer_bucket - len) <= (uint32_t)calc)) { 18275 /* we need to pace */ 18276 uint64_t lentim, res; 18277 uint32_t slot; 18278 18279 lentim = (uint64_t)len * (uint64_t)HPTS_USEC_IN_SEC; 18280 res = lentim / rack->r_ctl.policer_bw; 18281 slot = (uint32_t)res; 18282 if (rack->r_ctl.current_policer_bucket > len) 18283 rack->r_ctl.current_policer_bucket -= len; 18284 else 18285 rack->r_ctl.current_policer_bucket = 0; 18286 policer_detection_log(rack, len, slot, (uint32_t)rack_policer_bucket_reserve, call_line, 5); 18287 rack->rc_policer_should_pace = 0; 18288 return(slot); 18289 } 18290 /* Just take tokens out of the bucket and let rack do whatever it would have */ 18291 policer_detection_log(rack, len, 0, (uint32_t)rack_policer_bucket_reserve, call_line, 6); 18292 if (len < rack->r_ctl.current_policer_bucket) { 18293 rack->r_ctl.current_policer_bucket -= len; 18294 } else { 18295 rack->r_ctl.current_policer_bucket = 0; 18296 } 18297 return (0); 18298 } 18299 18300 18301 static int32_t 18302 rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz, int line) 18303 { 18304 uint64_t srtt; 18305 int32_t slot = 0; 18306 int32_t minslot = 0; 18307 int can_start_hw_pacing = 1; 18308 int err; 18309 int pace_one; 18310 18311 if (rack_pace_one_seg || 18312 (rack->r_ctl.rc_user_set_min_segs == 1)) 18313 pace_one = 1; 18314 else 18315 pace_one = 0; 18316 if (rack->rc_policer_detected == 1) { 18317 /* 18318 * A policer has been detected and we 18319 * have all of our data (policer-bw and 18320 * policer bucket size) calculated. Call 18321 * into the function to find out if we are 18322 * overriding the time. 18323 */ 18324 slot = rack_policed_sending(rack, tp, len, segsiz, line); 18325 if (slot) { 18326 uint64_t logbw; 18327 18328 logbw = rack->r_ctl.current_policer_bucket; 18329 logbw <<= 32; 18330 logbw |= rack->r_ctl.policer_bucket_size; 18331 rack_log_pacing_delay_calc(rack, len, slot, rack->r_ctl.policer_bw, logbw, 0, 89, __LINE__, NULL, 0); 18332 return(slot); 18333 } 18334 } 18335 if (rack->rc_always_pace == 0) { 18336 /* 18337 * We use the most optimistic possible cwnd/srtt for 18338 * sending calculations. This will make our 18339 * calculation anticipate getting more through 18340 * quicker then possible. But thats ok we don't want 18341 * the peer to have a gap in data sending. 18342 */ 18343 uint64_t cwnd, tr_perms = 0; 18344 int32_t reduce = 0; 18345 18346 old_method: 18347 /* 18348 * We keep no precise pacing with the old method 18349 * instead we use the pacer to mitigate bursts. 
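 * Illustrative example (added, not in the original): with a cwnd of
 * 64,000 bytes and an srtt of 20,000 usecs, tr_perms is
 * 64000 * 1000 / 20000 = 3200 bytes per msec, so a 16,000 byte send
 * yields a 5 msec slot (before the rack_slot_reduction trim and the
 * conversion to usecs).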
18350 */ 18351 if (rack->r_ctl.rc_rack_min_rtt) 18352 srtt = rack->r_ctl.rc_rack_min_rtt; 18353 else 18354 srtt = max(tp->t_srtt, 1); 18355 if (rack->r_ctl.rc_rack_largest_cwnd) 18356 cwnd = rack->r_ctl.rc_rack_largest_cwnd; 18357 else 18358 cwnd = rack->r_ctl.cwnd_to_use; 18359 /* Inflate cwnd by 1000 so srtt of usecs is in ms */ 18360 tr_perms = (cwnd * 1000) / srtt; 18361 if (tr_perms == 0) { 18362 tr_perms = ctf_fixed_maxseg(tp); 18363 } 18364 /* 18365 * Calculate how long this will take to drain, if 18366 * the calculation comes out to zero, thats ok we 18367 * will use send_a_lot to possibly spin around for 18368 * more increasing tot_len_this_send to the point 18369 * that its going to require a pace, or we hit the 18370 * cwnd. Which in that case we are just waiting for 18371 * a ACK. 18372 */ 18373 slot = len / tr_perms; 18374 /* Now do we reduce the time so we don't run dry? */ 18375 if (slot && rack_slot_reduction) { 18376 reduce = (slot / rack_slot_reduction); 18377 if (reduce < slot) { 18378 slot -= reduce; 18379 } else 18380 slot = 0; 18381 } 18382 slot *= HPTS_USEC_IN_MSEC; 18383 if (rack->rc_pace_to_cwnd) { 18384 uint64_t rate_wanted = 0; 18385 18386 slot = pace_to_fill_cwnd(rack, slot, len, segsiz, NULL, &rate_wanted, 1); 18387 rack->rc_ack_can_sendout_data = 1; 18388 rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, 0, 0, 14, __LINE__, NULL, 0); 18389 } else 18390 rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL, 0); 18391 /*******************************************************/ 18392 /* RRS: We insert non-paced call to stats here for len */ 18393 /*******************************************************/ 18394 } else { 18395 uint64_t bw_est, res, lentim, rate_wanted; 18396 uint32_t segs, oh; 18397 int capped = 0; 18398 int prev_fill; 18399 18400 if ((rack->r_rr_config == 1) && rsm) { 18401 return (rack->r_ctl.rc_min_to); 18402 } 18403 if (rack->use_fixed_rate) { 18404 rate_wanted = bw_est = rack_get_fixed_pacing_bw(rack); 18405 } else if ((rack->r_ctl.init_rate == 0) && 18406 (rack->r_ctl.gp_bw == 0)) { 18407 /* no way to yet do an estimate */ 18408 bw_est = rate_wanted = 0; 18409 } else if (rack->dgp_on) { 18410 bw_est = rack_get_bw(rack); 18411 rate_wanted = rack_get_output_bw(rack, bw_est, rsm, &capped); 18412 } else { 18413 uint32_t gain, rate_set = 0; 18414 18415 rate_wanted = min(rack->rc_tp->snd_cwnd, rack->r_ctl.cwnd_to_use); 18416 rate_wanted = rack_arrive_at_discounted_rate(rack, rate_wanted, &rate_set, &gain); 18417 if (rate_set == 0) { 18418 if (rate_wanted > rack->rc_tp->snd_wnd) 18419 rate_wanted = rack->rc_tp->snd_wnd; 18420 /* Now lets make it into a b/w */ 18421 rate_wanted *= (uint64_t)HPTS_USEC_IN_SEC; 18422 rate_wanted /= (uint64_t)rack->r_ctl.rc_last_us_rtt; 18423 } 18424 bw_est = rate_wanted; 18425 rack_log_pacing_delay_calc(rack, rack->rc_tp->snd_cwnd, 18426 rack->r_ctl.cwnd_to_use, 18427 rate_wanted, bw_est, 18428 rack->r_ctl.rc_last_us_rtt, 18429 88, __LINE__, NULL, gain); 18430 } 18431 if ((bw_est == 0) || (rate_wanted == 0) || 18432 ((rack->gp_ready == 0) && (rack->use_fixed_rate == 0))) { 18433 /* 18434 * No way yet to make a b/w estimate or 18435 * our raise is set incorrectly. 18436 */ 18437 goto old_method; 18438 } 18439 rack_rate_cap_bw(rack, &rate_wanted, &capped); 18440 /* We need to account for all the overheads */ 18441 segs = (len + segsiz - 1) / segsiz; 18442 /* 18443 * We need the diff between 1514 bytes (e-mtu with e-hdr) 18444 * and how much data we put in each packet. 
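 * Illustrative example (added, not in the original): on a typical
 * 1500 byte MTU IPv4 path with timestamps, t_maxseg is 1460 and
 * segsiz 1448, so oh = 12 + 20 (tcphdr) + 20 (ip) + 14 (ether) = 66,
 * and each segment costs about 1448 + 66 = 1514 wire bytes.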
Yes this 18445 * means we may be off if we are larger than 1500 bytes 18446 * or smaller. But this just makes us more conservative. 18447 */ 18448 18449 oh = (tp->t_maxseg - segsiz) + sizeof(struct tcphdr); 18450 if (rack->r_is_v6) { 18451 #ifdef INET6 18452 oh += sizeof(struct ip6_hdr); 18453 #endif 18454 } else { 18455 #ifdef INET 18456 oh += sizeof(struct ip); 18457 #endif 18458 } 18459 /* We add a fixed 14 for the ethernet header */ 18460 oh += 14; 18461 segs *= oh; 18462 lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC; 18463 res = lentim / rate_wanted; 18464 slot = (uint32_t)res; 18465 if (rack_hw_rate_min && 18466 (rate_wanted < rack_hw_rate_min)) { 18467 can_start_hw_pacing = 0; 18468 if (rack->r_ctl.crte) { 18469 /* 18470 * Ok we need to release it, we 18471 * have fallen too low. 18472 */ 18473 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 18474 rack->r_ctl.crte = NULL; 18475 rack->rack_attempt_hdwr_pace = 0; 18476 rack->rack_hdrw_pacing = 0; 18477 } 18478 } 18479 if (rack->r_ctl.crte && 18480 (tcp_hw_highest_rate(rack->r_ctl.crte) < rate_wanted)) { 18481 /* 18482 * We want more than the hardware can give us, 18483 * don't start any hw pacing. 18484 */ 18485 can_start_hw_pacing = 0; 18486 if (rack->r_rack_hw_rate_caps == 0) { 18487 /* 18488 * Ok we need to release it, we 18489 * want more than the card can give us and 18490 * no rate cap is in place. Set it up so 18491 * when we want less we can retry. 18492 */ 18493 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 18494 rack->r_ctl.crte = NULL; 18495 rack->rack_attempt_hdwr_pace = 0; 18496 rack->rack_hdrw_pacing = 0; 18497 } 18498 } 18499 if ((rack->r_ctl.crte != NULL) && (rack->rc_inp->inp_snd_tag == NULL)) { 18500 /* 18501 * We lost our rate somehow, this can happen 18502 * if the interface changed underneath us. 18503 */ 18504 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 18505 rack->r_ctl.crte = NULL; 18506 /* Lets re-allow attempting to setup pacing */ 18507 rack->rack_hdrw_pacing = 0; 18508 rack->rack_attempt_hdwr_pace = 0; 18509 rack_log_hdwr_pacing(rack, 18510 rate_wanted, bw_est, __LINE__, 18511 0, 6); 18512 } 18513 prev_fill = rack->r_via_fill_cw; 18514 if ((rack->rc_pace_to_cwnd) && 18515 (capped == 0) && 18516 (rack->dgp_on == 1) && 18517 (rack->use_fixed_rate == 0) && 18518 (rack->in_probe_rtt == 0) && 18519 (IN_FASTRECOVERY(rack->rc_tp->t_flags) == 0)) { 18520 /* 18521 * We want to pace at our rate *or* faster to 18522 * fill the cwnd to the max if its not full. 18523 */ 18524 slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz, &capped, &rate_wanted, 0); 18525 /* Re-check to make sure we are not exceeding our max b/w */ 18526 if ((rack->r_ctl.crte != NULL) && 18527 (tcp_hw_highest_rate(rack->r_ctl.crte) < rate_wanted)) { 18528 /* 18529 * We want more than the hardware can give us, 18530 * don't start any hw pacing. 18531 */ 18532 can_start_hw_pacing = 0; 18533 if (rack->r_rack_hw_rate_caps == 0) { 18534 /* 18535 * Ok we need to release it, we 18536 * want more than the card can give us and 18537 * no rate cap is in place. Set it up so 18538 * when we want less we can retry. 
18539 */ 18540 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 18541 rack->r_ctl.crte = NULL; 18542 rack->rack_attempt_hdwr_pace = 0; 18543 rack->rack_hdrw_pacing = 0; 18544 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 18545 } 18546 } 18547 } 18548 if ((rack->rc_inp->inp_route.ro_nh != NULL) && 18549 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 18550 if ((rack->rack_hdw_pace_ena) && 18551 (can_start_hw_pacing > 0) && 18552 (rack->rack_hdrw_pacing == 0) && 18553 (rack->rack_attempt_hdwr_pace == 0)) { 18554 /* 18555 * Lets attempt to turn on hardware pacing 18556 * if we can. 18557 */ 18558 rack->rack_attempt_hdwr_pace = 1; 18559 rack->r_ctl.crte = tcp_set_pacing_rate(rack->rc_tp, 18560 rack->rc_inp->inp_route.ro_nh->nh_ifp, 18561 rate_wanted, 18562 RS_PACING_GEQ, 18563 &err, &rack->r_ctl.crte_prev_rate); 18564 if (rack->r_ctl.crte) { 18565 rack->rack_hdrw_pacing = 1; 18566 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size_w_divisor(tp, rate_wanted, segsiz, 18567 pace_one, rack->r_ctl.crte, 18568 NULL, rack->r_ctl.pace_len_divisor); 18569 rack_log_hdwr_pacing(rack, 18570 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 18571 err, 0); 18572 rack->r_ctl.last_hw_bw_req = rate_wanted; 18573 } else { 18574 counter_u64_add(rack_hw_pace_init_fail, 1); 18575 } 18576 } else if (rack->rack_hdrw_pacing && 18577 (rack->r_ctl.last_hw_bw_req != rate_wanted)) { 18578 /* Do we need to adjust our rate? */ 18579 const struct tcp_hwrate_limit_table *nrte; 18580 18581 if (rack->r_up_only && 18582 (rate_wanted < rack->r_ctl.crte->rate)) { 18583 /** 18584 * We have four possible states here 18585 * having to do with the previous time 18586 * and this time. 18587 * previous | this-time 18588 * A) 0 | 0 -- fill_cw not in the picture 18589 * B) 1 | 0 -- we were doing a fill-cw but now are not 18590 * C) 1 | 1 -- all rates from fill_cw 18591 * D) 0 | 1 -- we were doing non-fill and now we are filling 18592 * 18593 * For case A, C and D we don't allow a drop. But for 18594 * case B where we now our on our steady rate we do 18595 * allow a drop. 18596 * 18597 */ 18598 if (!((prev_fill == 1) && (rack->r_via_fill_cw == 0))) 18599 goto done_w_hdwr; 18600 } 18601 if ((rate_wanted > rack->r_ctl.crte->rate) || 18602 (rate_wanted <= rack->r_ctl.crte_prev_rate)) { 18603 if (rack_hw_rate_to_low && 18604 (bw_est < rack_hw_rate_to_low)) { 18605 /* 18606 * The pacing rate is too low for hardware, but 18607 * do allow hardware pacing to be restarted. 18608 */ 18609 rack_log_hdwr_pacing(rack, 18610 bw_est, rack->r_ctl.crte->rate, __LINE__, 18611 0, 5); 18612 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 18613 rack->r_ctl.crte = NULL; 18614 rack->rack_attempt_hdwr_pace = 0; 18615 rack->rack_hdrw_pacing = 0; 18616 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted); 18617 goto done_w_hdwr; 18618 } 18619 nrte = tcp_chg_pacing_rate(rack->r_ctl.crte, 18620 rack->rc_tp, 18621 rack->rc_inp->inp_route.ro_nh->nh_ifp, 18622 rate_wanted, 18623 RS_PACING_GEQ, 18624 &err, &rack->r_ctl.crte_prev_rate); 18625 if (nrte == NULL) { 18626 /* 18627 * Lost the rate, lets drop hardware pacing 18628 * period. 
18629 */ 18630 rack->rack_hdrw_pacing = 0; 18631 rack->r_ctl.crte = NULL; 18632 rack_log_hdwr_pacing(rack, 18633 rate_wanted, 0, __LINE__, 18634 err, 1); 18635 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted); 18636 counter_u64_add(rack_hw_pace_lost, 1); 18637 } else if (nrte != rack->r_ctl.crte) { 18638 rack->r_ctl.crte = nrte; 18639 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size_w_divisor(tp, rate_wanted, 18640 segsiz, pace_one, rack->r_ctl.crte, 18641 NULL, rack->r_ctl.pace_len_divisor); 18642 rack_log_hdwr_pacing(rack, 18643 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 18644 err, 2); 18645 rack->r_ctl.last_hw_bw_req = rate_wanted; 18646 } 18647 } else { 18648 /* We just need to adjust the segment size */ 18649 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted); 18650 rack_log_hdwr_pacing(rack, 18651 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 18652 0, 4); 18653 rack->r_ctl.last_hw_bw_req = rate_wanted; 18654 } 18655 } 18656 } 18657 if (minslot && (minslot > slot)) { 18658 rack_log_pacing_delay_calc(rack, minslot, slot, rack->r_ctl.crte->rate, bw_est, lentim, 18659 98, __LINE__, NULL, 0); 18660 slot = minslot; 18661 } 18662 done_w_hdwr: 18663 if (rack_limit_time_with_srtt && 18664 (rack->use_fixed_rate == 0) && 18665 (rack->rack_hdrw_pacing == 0)) { 18666 /* 18667 * Sanity check, we do not allow the pacing delay 18668 * to be longer than the SRTT of the path. If it is 18669 * a slow path, then adding a packet should increase 18670 * the RTT and compensate for this i.e. the srtt will 18671 * be greater so the allowed pacing time will be greater. 18672 * 18673 * Note this restriction is not for where a peak rate 18674 * is set, we are doing fixed pacing or hardware pacing. 18675 */ 18676 if (rack->rc_tp->t_srtt) 18677 srtt = rack->rc_tp->t_srtt; 18678 else 18679 srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC; /* its in ms convert */ 18680 if (srtt < (uint64_t)slot) { 18681 rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL, 0); 18682 slot = srtt; 18683 } 18684 } 18685 /*******************************************************************/ 18686 /* RRS: We insert paced call to stats here for len and rate_wanted */ 18687 /*******************************************************************/ 18688 rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm, 0); 18689 } 18690 if (rack->r_ctl.crte && (rack->r_ctl.crte->rs_num_enobufs > 0)) { 18691 /* 18692 * If this rate is seeing enobufs when it 18693 * goes to send then either the nic is out 18694 * of gas or we are mis-estimating the time 18695 * somehow and not letting the queue empty 18696 * completely. Lets add to the pacing time. 18697 */ 18698 int hw_boost_delay; 18699 18700 hw_boost_delay = rack->r_ctl.crte->time_between * rack_enobuf_hw_boost_mult; 18701 if (hw_boost_delay > rack_enobuf_hw_max) 18702 hw_boost_delay = rack_enobuf_hw_max; 18703 else if (hw_boost_delay < rack_enobuf_hw_min) 18704 hw_boost_delay = rack_enobuf_hw_min; 18705 slot += hw_boost_delay; 18706 } 18707 return (slot); 18708 } 18709 18710 static void 18711 rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack, 18712 tcp_seq startseq, uint32_t sb_offset) 18713 { 18714 struct rack_sendmap *my_rsm = NULL; 18715 18716 if (tp->t_state < TCPS_ESTABLISHED) { 18717 /* 18718 * We don't start any measurements if we are 18719 * not at least established. 
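 * Added note (not in the original): a goodput measurement set up by
 * this function spans the byte range from gput_seq to gput_ack;
 * gput_ts records the start time, and once the range is cumulatively
 * acked the resulting bytes-over-time sample feeds the gp_bw estimate
 * used for pacing.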
18720 */ 18721 return; 18722 } 18723 if (tp->t_state >= TCPS_FIN_WAIT_1) { 18724 /* 18725 * We will get no more data into the SB 18726 * this means we need to have the data available 18727 * before we start a measurement. 18728 */ 18729 18730 if (sbavail(&tptosocket(tp)->so_snd) < 18731 max(rc_init_window(rack), 18732 (MIN_GP_WIN * ctf_fixed_maxseg(tp)))) { 18733 /* Nope not enough data */ 18734 return; 18735 } 18736 } 18737 tp->t_flags |= TF_GPUTINPROG; 18738 rack->r_ctl.rc_gp_cumack_ts = 0; 18739 rack->r_ctl.rc_gp_lowrtt = 0xffffffff; 18740 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 18741 tp->gput_seq = startseq; 18742 rack->app_limited_needs_set = 0; 18743 if (rack->in_probe_rtt) 18744 rack->measure_saw_probe_rtt = 1; 18745 else if ((rack->measure_saw_probe_rtt) && 18746 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 18747 rack->measure_saw_probe_rtt = 0; 18748 if (rack->rc_gp_filled) 18749 tp->gput_ts = rack->r_ctl.last_cumack_advance; 18750 else { 18751 /* Special case initial measurement */ 18752 struct timeval tv; 18753 18754 tp->gput_ts = tcp_get_usecs(&tv); 18755 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); 18756 } 18757 /* 18758 * We take a guess out into the future, 18759 * if we have no measurement and no 18760 * initial rate, we measure the first 18761 * initial-windows worth of data to 18762 * speed up getting some GP measurement and 18763 * thus start pacing. 18764 */ 18765 if ((rack->rc_gp_filled == 0) && (rack->r_ctl.init_rate == 0)) { 18766 rack->app_limited_needs_set = 1; 18767 tp->gput_ack = startseq + max(rc_init_window(rack), 18768 (MIN_GP_WIN * ctf_fixed_maxseg(tp))); 18769 rack_log_pacing_delay_calc(rack, 18770 tp->gput_seq, 18771 tp->gput_ack, 18772 0, 18773 tp->gput_ts, 18774 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts), 18775 9, 18776 __LINE__, NULL, 0); 18777 rack_tend_gp_marks(tp, rack); 18778 rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL); 18779 return; 18780 } 18781 if (sb_offset) { 18782 /* 18783 * We are out somewhere in the sb 18784 * can we use the already outstanding data? 18785 */ 18786 18787 if (rack->r_ctl.rc_app_limited_cnt == 0) { 18788 /* 18789 * Yes first one is good and in this case 18790 * the tp->gput_ts is correctly set based on 18791 * the last ack that arrived (no need to 18792 * set things up when an ack comes in). 18793 */ 18794 my_rsm = tqhash_min(rack->r_ctl.tqh); 18795 if ((my_rsm == NULL) || 18796 (my_rsm->r_rtr_cnt != 1)) { 18797 /* retransmission? */ 18798 goto use_latest; 18799 } 18800 } else { 18801 if (rack->r_ctl.rc_first_appl == NULL) { 18802 /* 18803 * If rc_first_appl is NULL 18804 * then the cnt should be 0. 18805 * This is probably an error, maybe 18806 * a KASSERT would be approprate. 18807 */ 18808 goto use_latest; 18809 } 18810 /* 18811 * If we have a marker pointer to the last one that is 18812 * app limited we can use that, but we need to set 18813 * things up so that when it gets ack'ed we record 18814 * the ack time (if its not already acked). 18815 */ 18816 rack->app_limited_needs_set = 1; 18817 /* 18818 * We want to get to the rsm that is either 18819 * next with space i.e. over 1 MSS or the one 18820 * after that (after the app-limited). 
18821 */ 18822 my_rsm = tqhash_next(rack->r_ctl.tqh, rack->r_ctl.rc_first_appl); 18823 if (my_rsm) { 18824 if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp)) 18825 /* Have to use the next one */ 18826 my_rsm = tqhash_next(rack->r_ctl.tqh, my_rsm); 18827 else { 18828 /* Use after the first MSS of it is acked */ 18829 tp->gput_seq = my_rsm->r_start + ctf_fixed_maxseg(tp); 18830 goto start_set; 18831 } 18832 } 18833 if ((my_rsm == NULL) || 18834 (my_rsm->r_rtr_cnt != 1)) { 18835 /* 18836 * Either its a retransmit or 18837 * the last is the app-limited one. 18838 */ 18839 goto use_latest; 18840 } 18841 } 18842 tp->gput_seq = my_rsm->r_start; 18843 start_set: 18844 if (my_rsm->r_flags & RACK_ACKED) { 18845 /* 18846 * This one has been acked use the arrival ack time 18847 */ 18848 struct rack_sendmap *nrsm; 18849 18850 tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival; 18851 rack->app_limited_needs_set = 0; 18852 /* 18853 * Ok in this path we need to use the r_end now 18854 * since this guy is the starting ack. 18855 */ 18856 tp->gput_seq = my_rsm->r_end; 18857 /* 18858 * We also need to adjust up the sendtime 18859 * to the send of the next data after my_rsm. 18860 */ 18861 nrsm = tqhash_next(rack->r_ctl.tqh, my_rsm); 18862 if (nrsm != NULL) 18863 my_rsm = nrsm; 18864 else { 18865 /* 18866 * The next as not been sent, thats the 18867 * case for using the latest. 18868 */ 18869 goto use_latest; 18870 } 18871 } 18872 rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[0]; 18873 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 18874 rack->r_ctl.rc_gp_cumack_ts = 0; 18875 if ((rack->r_ctl.cleared_app_ack == 1) && 18876 (SEQ_GEQ(rack->r_ctl.cleared_app_ack, tp->gput_seq))) { 18877 /* 18878 * We just cleared an application limited period 18879 * so the next seq out needs to skip the first 18880 * ack. 18881 */ 18882 rack->app_limited_needs_set = 1; 18883 rack->r_ctl.cleared_app_ack = 0; 18884 } 18885 rack_log_pacing_delay_calc(rack, 18886 tp->gput_seq, 18887 tp->gput_ack, 18888 (uint64_t)my_rsm, 18889 tp->gput_ts, 18890 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts), 18891 9, 18892 __LINE__, my_rsm, 0); 18893 /* Now lets make sure all are marked as they should be */ 18894 rack_tend_gp_marks(tp, rack); 18895 rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL); 18896 return; 18897 } 18898 18899 use_latest: 18900 /* 18901 * We don't know how long we may have been 18902 * idle or if this is the first-send. Lets 18903 * setup the flag so we will trim off 18904 * the first ack'd data so we get a true 18905 * measurement. 18906 */ 18907 rack->app_limited_needs_set = 1; 18908 tp->gput_ack = startseq + rack_get_measure_window(tp, rack); 18909 rack->r_ctl.rc_gp_cumack_ts = 0; 18910 /* Find this guy so we can pull the send time */ 18911 my_rsm = tqhash_find(rack->r_ctl.tqh, startseq); 18912 if (my_rsm) { 18913 rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[0]; 18914 if (my_rsm->r_flags & RACK_ACKED) { 18915 /* 18916 * Unlikely since its probably what was 18917 * just transmitted (but I am paranoid). 18918 */ 18919 tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival; 18920 rack->app_limited_needs_set = 0; 18921 } 18922 if (SEQ_LT(my_rsm->r_start, tp->gput_seq)) { 18923 /* This also is unlikely */ 18924 tp->gput_seq = my_rsm->r_start; 18925 } 18926 } else { 18927 /* 18928 * TSNH unless we have some send-map limit, 18929 * and even at that it should not be hitting 18930 * that limit (we should have stopped sending). 
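 * (Added note, not in the original: TSNH above is the usual FreeBSD
 * shorthand for "this should not happen".)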
18931 */ 18932 struct timeval tv; 18933 18934 microuptime(&tv); 18935 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); 18936 } 18937 rack_tend_gp_marks(tp, rack); 18938 rack_log_pacing_delay_calc(rack, 18939 tp->gput_seq, 18940 tp->gput_ack, 18941 (uint64_t)my_rsm, 18942 tp->gput_ts, 18943 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts), 18944 9, __LINE__, NULL, 0); 18945 rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL); 18946 } 18947 18948 static inline uint32_t 18949 rack_what_can_we_send(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cwnd_to_use, 18950 uint32_t avail, int32_t sb_offset) 18951 { 18952 uint32_t len; 18953 uint32_t sendwin; 18954 18955 if (tp->snd_wnd > cwnd_to_use) 18956 sendwin = cwnd_to_use; 18957 else 18958 sendwin = tp->snd_wnd; 18959 if (ctf_outstanding(tp) >= tp->snd_wnd) { 18960 /* We never want to go over our peers rcv-window */ 18961 len = 0; 18962 } else { 18963 uint32_t flight; 18964 18965 flight = ctf_flight_size(tp, rack->r_ctl.rc_sacked); 18966 if (flight >= sendwin) { 18967 /* 18968 * We have in flight what we are allowed by cwnd (if 18969 * it was rwnd blocking it would have hit above out 18970 * >= tp->snd_wnd). 18971 */ 18972 return (0); 18973 } 18974 len = sendwin - flight; 18975 if ((len + ctf_outstanding(tp)) > tp->snd_wnd) { 18976 /* We would send too much (beyond the rwnd) */ 18977 len = tp->snd_wnd - ctf_outstanding(tp); 18978 } 18979 if ((len + sb_offset) > avail) { 18980 /* 18981 * We don't have that much in the SB, how much is 18982 * there? 18983 */ 18984 len = avail - sb_offset; 18985 } 18986 } 18987 return (len); 18988 } 18989 18990 static void 18991 rack_log_fsb(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t flags, 18992 unsigned ipoptlen, int32_t orig_len, int32_t len, int error, 18993 int rsm_is_null, int optlen, int line, uint16_t mode) 18994 { 18995 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 18996 union tcp_log_stackspecific log; 18997 struct timeval tv; 18998 18999 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 19000 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 19001 log.u_bbr.flex1 = error; 19002 log.u_bbr.flex2 = flags; 19003 log.u_bbr.flex3 = rsm_is_null; 19004 log.u_bbr.flex4 = ipoptlen; 19005 log.u_bbr.flex5 = tp->rcv_numsacks; 19006 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 19007 log.u_bbr.flex7 = optlen; 19008 log.u_bbr.flex8 = rack->r_fsb_inited; 19009 log.u_bbr.applimited = rack->r_fast_output; 19010 log.u_bbr.bw_inuse = rack_get_bw(rack); 19011 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); 19012 log.u_bbr.cwnd_gain = mode; 19013 log.u_bbr.pkts_out = orig_len; 19014 log.u_bbr.lt_epoch = len; 19015 log.u_bbr.delivered = line; 19016 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 19017 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 19018 tcp_log_event(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_FSB, 0, 19019 len, &log, false, NULL, __func__, __LINE__, &tv); 19020 } 19021 } 19022 19023 19024 static struct mbuf * 19025 rack_fo_base_copym(struct mbuf *the_m, uint32_t the_off, int32_t *plen, 19026 struct rack_fast_send_blk *fsb, 19027 int32_t seglimit, int32_t segsize, int hw_tls) 19028 { 19029 #ifdef KERN_TLS 19030 struct ktls_session *tls, *ntls; 19031 #ifdef INVARIANTS 19032 struct mbuf *start; 19033 #endif 19034 #endif 19035 struct mbuf *m, *n, **np, *smb; 19036 struct mbuf *top; 19037 int32_t off, soff; 19038 int32_t len = *plen; 19039 int32_t fragsize; 19040 int32_t len_cp = 0; 19041 uint32_t mlen, 
frags; 19042 19043 soff = off = the_off; 19044 smb = m = the_m; 19045 np = ⊤ 19046 top = NULL; 19047 #ifdef KERN_TLS 19048 if (hw_tls && (m->m_flags & M_EXTPG)) 19049 tls = m->m_epg_tls; 19050 else 19051 tls = NULL; 19052 #ifdef INVARIANTS 19053 start = m; 19054 #endif 19055 #endif 19056 while (len > 0) { 19057 if (m == NULL) { 19058 *plen = len_cp; 19059 break; 19060 } 19061 #ifdef KERN_TLS 19062 if (hw_tls) { 19063 if (m->m_flags & M_EXTPG) 19064 ntls = m->m_epg_tls; 19065 else 19066 ntls = NULL; 19067 19068 /* 19069 * Avoid mixing TLS records with handshake 19070 * data or TLS records from different 19071 * sessions. 19072 */ 19073 if (tls != ntls) { 19074 MPASS(m != start); 19075 *plen = len_cp; 19076 break; 19077 } 19078 } 19079 #endif 19080 mlen = min(len, m->m_len - off); 19081 if (seglimit) { 19082 /* 19083 * For M_EXTPG mbufs, add 3 segments 19084 * + 1 in case we are crossing page boundaries 19085 * + 2 in case the TLS hdr/trailer are used 19086 * It is cheaper to just add the segments 19087 * than it is to take the cache miss to look 19088 * at the mbuf ext_pgs state in detail. 19089 */ 19090 if (m->m_flags & M_EXTPG) { 19091 fragsize = min(segsize, PAGE_SIZE); 19092 frags = 3; 19093 } else { 19094 fragsize = segsize; 19095 frags = 0; 19096 } 19097 19098 /* Break if we really can't fit anymore. */ 19099 if ((frags + 1) >= seglimit) { 19100 *plen = len_cp; 19101 break; 19102 } 19103 19104 /* 19105 * Reduce size if you can't copy the whole 19106 * mbuf. If we can't copy the whole mbuf, also 19107 * adjust len so the loop will end after this 19108 * mbuf. 19109 */ 19110 if ((frags + howmany(mlen, fragsize)) >= seglimit) { 19111 mlen = (seglimit - frags - 1) * fragsize; 19112 len = mlen; 19113 *plen = len_cp + len; 19114 } 19115 frags += howmany(mlen, fragsize); 19116 if (frags == 0) 19117 frags++; 19118 seglimit -= frags; 19119 KASSERT(seglimit > 0, 19120 ("%s: seglimit went too low", __func__)); 19121 } 19122 n = m_get(M_NOWAIT, m->m_type); 19123 *np = n; 19124 if (n == NULL) 19125 goto nospace; 19126 n->m_len = mlen; 19127 soff += mlen; 19128 len_cp += n->m_len; 19129 if (m->m_flags & (M_EXT | M_EXTPG)) { 19130 n->m_data = m->m_data + off; 19131 mb_dupcl(n, m); 19132 } else { 19133 bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), 19134 (u_int)n->m_len); 19135 } 19136 len -= n->m_len; 19137 off = 0; 19138 m = m->m_next; 19139 np = &n->m_next; 19140 if (len || (soff == smb->m_len)) { 19141 /* 19142 * We have more so we move forward or 19143 * we have consumed the entire mbuf and 19144 * len has fell to 0. 19145 */ 19146 soff = 0; 19147 smb = m; 19148 } 19149 19150 } 19151 if (fsb != NULL) { 19152 fsb->m = smb; 19153 fsb->off = soff; 19154 if (smb) { 19155 /* 19156 * Save off the size of the mbuf. We do 19157 * this so that we can recognize when it 19158 * has been trimmed by sbcut() as acks 19159 * come in. 19160 */ 19161 fsb->o_m_len = smb->m_len; 19162 fsb->o_t_len = M_TRAILINGROOM(smb); 19163 } else { 19164 /* 19165 * This is the case where the next mbuf went to NULL. This 19166 * means with this copy we have sent everything in the sb. 19167 * In theory we could clear the fast_output flag, but lets 19168 * not since its possible that we could get more added 19169 * and acks that call the extend function which would let 19170 * us send more. 
19171 */ 19172 fsb->o_m_len = 0; 19173 fsb->o_t_len = 0; 19174 } 19175 } 19176 return (top); 19177 nospace: 19178 if (top) 19179 m_freem(top); 19180 return (NULL); 19181 19182 } 19183 19184 /* 19185 * This is a copy of m_copym(), taking the TSO segment size/limit 19186 * constraints into account, and advancing the sndptr as it goes. 19187 */ 19188 static struct mbuf * 19189 rack_fo_m_copym(struct tcp_rack *rack, int32_t *plen, 19190 int32_t seglimit, int32_t segsize, struct mbuf **s_mb, int *s_soff) 19191 { 19192 struct mbuf *m, *n; 19193 int32_t soff; 19194 19195 m = rack->r_ctl.fsb.m; 19196 if (M_TRAILINGROOM(m) != rack->r_ctl.fsb.o_t_len) { 19197 /* 19198 * The trailing space changed, mbufs can grow 19199 * at the tail but they can't shrink from 19200 * it, KASSERT that. Adjust the orig_m_len to 19201 * compensate for this change. 19202 */ 19203 KASSERT((rack->r_ctl.fsb.o_t_len > M_TRAILINGROOM(m)), 19204 ("mbuf:%p rack:%p trailing_space:%jd ots:%u oml:%u mlen:%u\n", 19205 m, 19206 rack, 19207 (intmax_t)M_TRAILINGROOM(m), 19208 rack->r_ctl.fsb.o_t_len, 19209 rack->r_ctl.fsb.o_m_len, 19210 m->m_len)); 19211 rack->r_ctl.fsb.o_m_len += (rack->r_ctl.fsb.o_t_len - M_TRAILINGROOM(m)); 19212 rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(m); 19213 } 19214 if (m->m_len < rack->r_ctl.fsb.o_m_len) { 19215 /* 19216 * Mbuf shrank, trimmed off the top by an ack, our 19217 * offset changes. 19218 */ 19219 KASSERT((rack->r_ctl.fsb.off >= (rack->r_ctl.fsb.o_m_len - m->m_len)), 19220 ("mbuf:%p len:%u rack:%p oml:%u soff:%u\n", 19221 m, m->m_len, 19222 rack, rack->r_ctl.fsb.o_m_len, 19223 rack->r_ctl.fsb.off)); 19224 19225 if (rack->r_ctl.fsb.off >= (rack->r_ctl.fsb.o_m_len- m->m_len)) 19226 rack->r_ctl.fsb.off -= (rack->r_ctl.fsb.o_m_len - m->m_len); 19227 else 19228 rack->r_ctl.fsb.off = 0; 19229 rack->r_ctl.fsb.o_m_len = m->m_len; 19230 #ifdef INVARIANTS 19231 } else if (m->m_len > rack->r_ctl.fsb.o_m_len) { 19232 panic("rack:%p m:%p m_len grew outside of t_space compensation", 19233 rack, m); 19234 #endif 19235 } 19236 soff = rack->r_ctl.fsb.off; 19237 KASSERT(soff >= 0, ("%s, negative off %d", __FUNCTION__, soff)); 19238 KASSERT(*plen >= 0, ("%s, negative len %d", __FUNCTION__, *plen)); 19239 KASSERT(soff < m->m_len, ("%s rack:%p len:%u m:%p m->m_len:%u < off?", 19240 __FUNCTION__, 19241 rack, *plen, m, m->m_len)); 19242 /* Save off the right location before we copy and advance */ 19243 *s_soff = soff; 19244 *s_mb = rack->r_ctl.fsb.m; 19245 n = rack_fo_base_copym(m, soff, plen, 19246 &rack->r_ctl.fsb, 19247 seglimit, segsize, rack->r_ctl.fsb.hw_tls); 19248 return (n); 19249 } 19250 19251 /* Log the buffer level */ 19252 static void 19253 rack_log_queue_level(struct tcpcb *tp, struct tcp_rack *rack, 19254 int len, struct timeval *tv, 19255 uint32_t cts) 19256 { 19257 uint32_t p_rate = 0, p_queue = 0, err = 0; 19258 union tcp_log_stackspecific log; 19259 19260 #ifdef RATELIMIT 19261 err = in_pcbquery_txrlevel(rack->rc_inp, &p_queue); 19262 err = in_pcbquery_txrtlmt(rack->rc_inp, &p_rate); 19263 #endif 19264 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 19265 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 19266 log.u_bbr.flex1 = p_rate; 19267 log.u_bbr.flex2 = p_queue; 19268 log.u_bbr.flex4 = (uint32_t)rack->r_ctl.crte->using; 19269 log.u_bbr.flex5 = (uint32_t)rack->r_ctl.crte->rs_num_enobufs; 19270 log.u_bbr.flex6 = rack->r_ctl.crte->time_between; 19271 log.u_bbr.flex7 = 99; 19272 log.u_bbr.flex8 = 0; 19273 log.u_bbr.pkts_out = err; 19274 log.u_bbr.delRate = rack->r_ctl.crte->rate; 19275 log.u_bbr.timeStamp = 
cts; 19276 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 19277 tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_HDWR_PACE, 0, 19278 len, &log, false, NULL, __func__, __LINE__, tv); 19279 19280 } 19281 19282 static uint32_t 19283 rack_check_queue_level(struct tcp_rack *rack, struct tcpcb *tp, 19284 struct timeval *tv, uint32_t cts, int len, uint32_t segsiz) 19285 { 19286 uint64_t lentime = 0; 19287 #ifdef RATELIMIT 19288 uint32_t p_rate = 0, p_queue = 0, err; 19289 union tcp_log_stackspecific log; 19290 uint64_t bw; 19291 19292 err = in_pcbquery_txrlevel(rack->rc_inp, &p_queue); 19293 /* Failed or queue is zero */ 19294 if (err || (p_queue == 0)) { 19295 lentime = 0; 19296 goto out; 19297 } 19298 err = in_pcbquery_txrtlmt(rack->rc_inp, &p_rate); 19299 if (err) { 19300 lentime = 0; 19301 goto out; 19302 } 19303 /* 19304 * If we reach here we have some bytes in 19305 * the queue. The number returned is a value 19306 * between 0 and 0xffff where ffff is full 19307 * and 0 is empty. So how best to make this into 19308 * something usable? 19309 * 19310 * The "safer" way is lets take the b/w gotten 19311 * from the query (which should be our b/w rate) 19312 * and pretend that a full send (our rc_pace_max_segs) 19313 * is outstanding. We factor it so its as if a full 19314 * number of our MSS segment is terms of full 19315 * ethernet segments are outstanding. 19316 */ 19317 bw = p_rate / 8; 19318 if (bw) { 19319 lentime = (rack->r_ctl.rc_pace_max_segs / segsiz); 19320 lentime *= ETHERNET_SEGMENT_SIZE; 19321 lentime *= (uint64_t)HPTS_USEC_IN_SEC; 19322 lentime /= bw; 19323 } else { 19324 /* TSNH -- KASSERT? */ 19325 lentime = 0; 19326 } 19327 out: 19328 if (tcp_bblogging_on(tp)) { 19329 memset(&log, 0, sizeof(log)); 19330 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 19331 log.u_bbr.flex1 = p_rate; 19332 log.u_bbr.flex2 = p_queue; 19333 log.u_bbr.flex4 = (uint32_t)rack->r_ctl.crte->using; 19334 log.u_bbr.flex5 = (uint32_t)rack->r_ctl.crte->rs_num_enobufs; 19335 log.u_bbr.flex6 = rack->r_ctl.crte->time_between; 19336 log.u_bbr.flex7 = 99; 19337 log.u_bbr.flex8 = 0; 19338 log.u_bbr.pkts_out = err; 19339 log.u_bbr.delRate = rack->r_ctl.crte->rate; 19340 log.u_bbr.cur_del_rate = lentime; 19341 log.u_bbr.timeStamp = cts; 19342 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 19343 tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_HDWR_PACE, 0, 19344 len, &log, false, NULL, __func__, __LINE__,tv); 19345 } 19346 #endif 19347 return ((uint32_t)lentime); 19348 } 19349 19350 static int 19351 rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, 19352 uint64_t ts_val, uint32_t cts, uint32_t ms_cts, struct timeval *tv, int len, uint8_t doing_tlp) 19353 { 19354 /* 19355 * Enter the fast retransmit path. We are given that a sched_pin is 19356 * in place (if accounting is compliled in) and the cycle count taken 19357 * at the entry is in the ts_val. The concept her is that the rsm 19358 * now holds the mbuf offsets and such so we can directly transmit 19359 * without a lot of overhead, the len field is already set for 19360 * us to prohibit us from sending too much (usually its 1MSS). 
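 * Added note (not in the original): anything the prebuilt fast send
 * block cannot handle here (SYN/RST flags, a FIN-marked rsm, a missing
 * mbuf, allocation failures, etc.) jumps to the failed: label and
 * returns -1, presumably so the caller can fall back to the regular
 * output path.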
19361 */ 19362 struct ip *ip = NULL; 19363 struct udphdr *udp = NULL; 19364 struct tcphdr *th = NULL; 19365 struct mbuf *m = NULL; 19366 struct inpcb *inp; 19367 uint8_t *cpto; 19368 struct tcp_log_buffer *lgb; 19369 #ifdef TCP_ACCOUNTING 19370 uint64_t crtsc; 19371 int cnt_thru = 1; 19372 #endif 19373 struct tcpopt to; 19374 u_char opt[TCP_MAXOLEN]; 19375 uint32_t hdrlen, optlen; 19376 int32_t slot, segsiz, max_val, tso = 0, error = 0, ulen = 0; 19377 uint16_t flags; 19378 uint32_t if_hw_tsomaxsegcount = 0, startseq; 19379 uint32_t if_hw_tsomaxsegsize; 19380 int32_t ip_sendflag = IP_NO_SND_TAG_RL; 19381 19382 #ifdef INET6 19383 struct ip6_hdr *ip6 = NULL; 19384 19385 if (rack->r_is_v6) { 19386 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 19387 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 19388 } else 19389 #endif /* INET6 */ 19390 { 19391 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 19392 hdrlen = sizeof(struct tcpiphdr); 19393 } 19394 if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) { 19395 goto failed; 19396 } 19397 if (doing_tlp) { 19398 /* Its a TLP add the flag, it may already be there but be sure */ 19399 rsm->r_flags |= RACK_TLP; 19400 } else { 19401 /* If it was a TLP it is not not on this retransmit */ 19402 rsm->r_flags &= ~RACK_TLP; 19403 } 19404 startseq = rsm->r_start; 19405 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 19406 inp = rack->rc_inp; 19407 to.to_flags = 0; 19408 flags = tcp_outflags[tp->t_state]; 19409 if (flags & (TH_SYN|TH_RST)) { 19410 goto failed; 19411 } 19412 if (rsm->r_flags & RACK_HAS_FIN) { 19413 /* We can't send a FIN here */ 19414 goto failed; 19415 } 19416 if (flags & TH_FIN) { 19417 /* We never send a FIN */ 19418 flags &= ~TH_FIN; 19419 } 19420 if (tp->t_flags & TF_RCVD_TSTMP) { 19421 to.to_tsval = ms_cts + tp->ts_offset; 19422 to.to_tsecr = tp->ts_recent; 19423 to.to_flags = TOF_TS; 19424 } 19425 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 19426 /* TCP-MD5 (RFC2385). 
*/ 19427 if (tp->t_flags & TF_SIGNATURE) 19428 to.to_flags |= TOF_SIGNATURE; 19429 #endif 19430 optlen = tcp_addoptions(&to, opt); 19431 hdrlen += optlen; 19432 udp = rack->r_ctl.fsb.udp; 19433 if (udp) 19434 hdrlen += sizeof(struct udphdr); 19435 if (rack->r_ctl.rc_pace_max_segs) 19436 max_val = rack->r_ctl.rc_pace_max_segs; 19437 else if (rack->rc_user_set_max_segs) 19438 max_val = rack->rc_user_set_max_segs * segsiz; 19439 else 19440 max_val = len; 19441 if ((tp->t_flags & TF_TSO) && 19442 V_tcp_do_tso && 19443 (len > segsiz) && 19444 (tp->t_port == 0)) 19445 tso = 1; 19446 #ifdef INET6 19447 if (MHLEN < hdrlen + max_linkhdr) 19448 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 19449 else 19450 #endif 19451 m = m_gethdr(M_NOWAIT, MT_DATA); 19452 if (m == NULL) 19453 goto failed; 19454 m->m_data += max_linkhdr; 19455 m->m_len = hdrlen; 19456 th = rack->r_ctl.fsb.th; 19457 /* Establish the len to send */ 19458 if (len > max_val) 19459 len = max_val; 19460 if ((tso) && (len + optlen > segsiz)) { 19461 uint32_t if_hw_tsomax; 19462 int32_t max_len; 19463 19464 /* extract TSO information */ 19465 if_hw_tsomax = tp->t_tsomax; 19466 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 19467 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 19468 /* 19469 * Check if we should limit by maximum payload 19470 * length: 19471 */ 19472 if (if_hw_tsomax != 0) { 19473 /* compute maximum TSO length */ 19474 max_len = (if_hw_tsomax - hdrlen - 19475 max_linkhdr); 19476 if (max_len <= 0) { 19477 goto failed; 19478 } else if (len > max_len) { 19479 len = max_len; 19480 } 19481 } 19482 if (len <= segsiz) { 19483 /* 19484 * In case there are too many small fragments don't 19485 * use TSO: 19486 */ 19487 tso = 0; 19488 } 19489 } else { 19490 tso = 0; 19491 } 19492 if ((tso == 0) && (len > segsiz)) 19493 len = segsiz; 19494 (void)tcp_get_usecs(tv); 19495 if ((len == 0) || 19496 (len <= MHLEN - hdrlen - max_linkhdr)) { 19497 goto failed; 19498 } 19499 th->th_seq = htonl(rsm->r_start); 19500 th->th_ack = htonl(tp->rcv_nxt); 19501 /* 19502 * The PUSH bit should only be applied 19503 * if the full retransmission is made. If 19504 * we are sending less than this is the 19505 * left hand edge and should not have 19506 * the PUSH bit. 
19507 */ 19508 if ((rsm->r_flags & RACK_HAD_PUSH) && 19509 (len == (rsm->r_end - rsm->r_start))) 19510 flags |= TH_PUSH; 19511 th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale)); 19512 if (th->th_win == 0) { 19513 tp->t_sndzerowin++; 19514 tp->t_flags |= TF_RXWIN0SENT; 19515 } else 19516 tp->t_flags &= ~TF_RXWIN0SENT; 19517 if (rsm->r_flags & RACK_TLP) { 19518 /* 19519 * TLP should not count in retran count, but 19520 * in its own bin 19521 */ 19522 counter_u64_add(rack_tlp_retran, 1); 19523 counter_u64_add(rack_tlp_retran_bytes, len); 19524 } else { 19525 tp->t_sndrexmitpack++; 19526 KMOD_TCPSTAT_INC(tcps_sndrexmitpack); 19527 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len); 19528 } 19529 #ifdef STATS 19530 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 19531 len); 19532 #endif 19533 if (rsm->m == NULL) 19534 goto failed; 19535 if (rsm->m && 19536 ((rsm->orig_m_len != rsm->m->m_len) || 19537 (M_TRAILINGROOM(rsm->m) != rsm->orig_t_space))) { 19538 /* Fix up the orig_m_len and possibly the mbuf offset */ 19539 rack_adjust_orig_mlen(rsm); 19540 } 19541 m->m_next = rack_fo_base_copym(rsm->m, rsm->soff, &len, NULL, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, rsm->r_hw_tls); 19542 if (len <= segsiz) { 19543 /* 19544 * Must have ran out of mbufs for the copy 19545 * shorten it to no longer need tso. Lets 19546 * not put on sendalot since we are low on 19547 * mbufs. 19548 */ 19549 tso = 0; 19550 } 19551 if ((m->m_next == NULL) || (len <= 0)){ 19552 goto failed; 19553 } 19554 if (udp) { 19555 if (rack->r_is_v6) 19556 ulen = hdrlen + len - sizeof(struct ip6_hdr); 19557 else 19558 ulen = hdrlen + len - sizeof(struct ip); 19559 udp->uh_ulen = htons(ulen); 19560 } 19561 m->m_pkthdr.rcvif = (struct ifnet *)0; 19562 if (TCPS_HAVERCVDSYN(tp->t_state) && 19563 (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { 19564 int ect = tcp_ecn_output_established(tp, &flags, len, true); 19565 if ((tp->t_state == TCPS_SYN_RECEIVED) && 19566 (tp->t_flags2 & TF2_ECN_SND_ECE)) 19567 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 19568 #ifdef INET6 19569 if (rack->r_is_v6) { 19570 ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20); 19571 ip6->ip6_flow |= htonl(ect << 20); 19572 } 19573 else 19574 #endif 19575 { 19576 ip->ip_tos &= ~IPTOS_ECN_MASK; 19577 ip->ip_tos |= ect; 19578 } 19579 } 19580 if (rack->r_ctl.crte != NULL) { 19581 /* See if we can send via the hw queue */ 19582 slot = rack_check_queue_level(rack, tp, tv, cts, len, segsiz); 19583 /* If there is nothing in queue (no pacing time) we can send via the hw queue */ 19584 if (slot == 0) 19585 ip_sendflag = 0; 19586 } 19587 tcp_set_flags(th, flags); 19588 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 19589 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 19590 if (to.to_flags & TOF_SIGNATURE) { 19591 /* 19592 * Calculate MD5 signature and put it into the place 19593 * determined before. 19594 * NOTE: since TCP options buffer doesn't point into 19595 * mbuf's data, calculate offset and use it. 19596 */ 19597 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, 19598 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { 19599 /* 19600 * Do not send segment if the calculation of MD5 19601 * digest has failed. 
19602 */ 19603 goto failed; 19604 } 19605 } 19606 #endif 19607 #ifdef INET6 19608 if (rack->r_is_v6) { 19609 if (tp->t_port) { 19610 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 19611 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 19612 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 19613 th->th_sum = htons(0); 19614 UDPSTAT_INC(udps_opackets); 19615 } else { 19616 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 19617 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 19618 th->th_sum = in6_cksum_pseudo(ip6, 19619 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 19620 0); 19621 } 19622 } 19623 #endif 19624 #if defined(INET6) && defined(INET) 19625 else 19626 #endif 19627 #ifdef INET 19628 { 19629 if (tp->t_port) { 19630 m->m_pkthdr.csum_flags = CSUM_UDP; 19631 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 19632 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 19633 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 19634 th->th_sum = htons(0); 19635 UDPSTAT_INC(udps_opackets); 19636 } else { 19637 m->m_pkthdr.csum_flags = CSUM_TCP; 19638 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 19639 th->th_sum = in_pseudo(ip->ip_src.s_addr, 19640 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 19641 IPPROTO_TCP + len + optlen)); 19642 } 19643 /* IP version must be set here for ipv4/ipv6 checking later */ 19644 KASSERT(ip->ip_v == IPVERSION, 19645 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 19646 } 19647 #endif 19648 if (tso) { 19649 /* 19650 * Here we use segsiz since we have no added options besides 19651 * any standard timestamp options (no DSACKs or SACKS are sent 19652 * via either fast-path). 19653 */ 19654 KASSERT(len > segsiz, 19655 ("%s: len <= tso_segsz tp:%p", __func__, tp)); 19656 m->m_pkthdr.csum_flags |= CSUM_TSO; 19657 m->m_pkthdr.tso_segsz = segsiz; 19658 } 19659 #ifdef INET6 19660 if (rack->r_is_v6) { 19661 ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit; 19662 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 19663 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 19664 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 19665 else 19666 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 19667 } 19668 #endif 19669 #if defined(INET) && defined(INET6) 19670 else 19671 #endif 19672 #ifdef INET 19673 { 19674 ip->ip_len = htons(m->m_pkthdr.len); 19675 ip->ip_ttl = rack->r_ctl.fsb.hoplimit; 19676 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 19677 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 19678 if (tp->t_port == 0 || len < V_tcp_minmss) { 19679 ip->ip_off |= htons(IP_DF); 19680 } 19681 } else { 19682 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 19683 } 19684 } 19685 #endif 19686 if (doing_tlp == 0) { 19687 /* Set we retransmitted */ 19688 rack->rc_gp_saw_rec = 1; 19689 } else { 19690 /* Its a TLP set ca or ss */ 19691 if (tp->snd_cwnd > tp->snd_ssthresh) { 19692 /* Set we sent in CA */ 19693 rack->rc_gp_saw_ca = 1; 19694 } else { 19695 /* Set we sent in SS */ 19696 rack->rc_gp_saw_ss = 1; 19697 } 19698 } 19699 /* Time to copy in our header */ 19700 cpto = mtod(m, uint8_t *); 19701 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); 19702 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr)); 19703 if (optlen) { 19704 bcopy(opt, th + 1, optlen); 19705 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 19706 } else { 19707 th->th_off = sizeof(struct tcphdr) >> 2; 19708 } 19709 if (tcp_bblogging_on(rack->rc_tp)) { 19710 union tcp_log_stackspecific log; 19711 19712 if (rsm->r_flags & RACK_RWND_COLLAPSED) { 19713 
rack_log_collapse(rack, rsm->r_start, rsm->r_end, 0, __LINE__, 5, rsm->r_flags, rsm); 19714 counter_u64_add(rack_collapsed_win_rxt, 1); 19715 counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start)); 19716 } 19717 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 19718 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 19719 if (rack->rack_no_prr) 19720 log.u_bbr.flex1 = 0; 19721 else 19722 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 19723 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 19724 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 19725 log.u_bbr.flex4 = max_val; 19726 /* Save off the early/late values */ 19727 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 19728 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 19729 log.u_bbr.bw_inuse = rack_get_bw(rack); 19730 log.u_bbr.cur_del_rate = rack->r_ctl.gp_bw; 19731 if (doing_tlp == 0) 19732 log.u_bbr.flex8 = 1; 19733 else 19734 log.u_bbr.flex8 = 2; 19735 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); 19736 log.u_bbr.flex7 = 55; 19737 log.u_bbr.pkts_out = tp->t_maxseg; 19738 log.u_bbr.timeStamp = cts; 19739 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 19740 if (rsm && (rsm->r_rtr_cnt > 0)) { 19741 /* 19742 * When we have a retransmit we want to log the 19743 * burst at send and flight at send from before. 19744 */ 19745 log.u_bbr.flex5 = rsm->r_fas; 19746 log.u_bbr.bbr_substate = rsm->r_bas; 19747 } else { 19748 /* 19749 * This is currently unlikely until we do the 19750 * packet pair probes but I will add it for completeness. 19751 */ 19752 log.u_bbr.flex5 = log.u_bbr.inflight; 19753 log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz); 19754 } 19755 log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use; 19756 log.u_bbr.delivered = 0; 19757 log.u_bbr.rttProp = (uint64_t)rsm; 19758 log.u_bbr.delRate = rsm->r_flags; 19759 log.u_bbr.delRate <<= 31; 19760 log.u_bbr.delRate |= rack->r_must_retran; 19761 log.u_bbr.delRate <<= 1; 19762 log.u_bbr.delRate |= 1; 19763 log.u_bbr.pkt_epoch = __LINE__; 19764 lgb = tcp_log_event(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, 19765 len, &log, false, NULL, __func__, __LINE__, tv); 19766 } else 19767 lgb = NULL; 19768 if ((rack->r_ctl.crte != NULL) && 19769 tcp_bblogging_on(tp)) { 19770 rack_log_queue_level(tp, rack, len, tv, cts); 19771 } 19772 #ifdef INET6 19773 if (rack->r_is_v6) { 19774 error = ip6_output(m, inp->in6p_outputopts, 19775 &inp->inp_route6, 19776 ip_sendflag, NULL, NULL, inp); 19777 } 19778 else 19779 #endif 19780 #ifdef INET 19781 { 19782 error = ip_output(m, NULL, 19783 &inp->inp_route, 19784 ip_sendflag, 0, inp); 19785 } 19786 #endif 19787 m = NULL; 19788 if (lgb) { 19789 lgb->tlb_errno = error; 19790 lgb = NULL; 19791 } 19792 /* Move snd_nxt to snd_max so we don't have false retransmissions */ 19793 tp->snd_nxt = tp->snd_max; 19794 if (error) { 19795 goto failed; 19796 } else if (rack->rc_hw_nobuf && (ip_sendflag != IP_NO_SND_TAG_RL)) { 19797 rack->rc_hw_nobuf = 0; 19798 rack->r_ctl.rc_agg_delayed = 0; 19799 rack->r_early = 0; 19800 rack->r_late = 0; 19801 rack->r_ctl.rc_agg_early = 0; 19802 } 19803 rack_log_output(tp, &to, len, rsm->r_start, flags, error, rack_to_usec_ts(tv), 19804 rsm, RACK_SENT_FP, rsm->m, rsm->soff, rsm->r_hw_tls, segsiz); 19805 if (doing_tlp) { 19806 rack->rc_tlp_in_progress = 1; 19807 rack->r_ctl.rc_tlp_cnt_out++; 19808 } 19809 if (error == 0) { 19810 counter_u64_add(rack_total_bytes, len); 19811 tcp_account_for_send(tp, len, 1, doing_tlp, rsm->r_hw_tls); 19812 if (doing_tlp) { 19813 rack->rc_last_sent_tlp_past_cumack 
= 0; 19814 rack->rc_last_sent_tlp_seq_valid = 1; 19815 rack->r_ctl.last_sent_tlp_seq = rsm->r_start; 19816 rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start; 19817 } 19818 if (rack->r_ctl.rc_prr_sndcnt >= len) 19819 rack->r_ctl.rc_prr_sndcnt -= len; 19820 else 19821 rack->r_ctl.rc_prr_sndcnt = 0; 19822 } 19823 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 19824 rack->forced_ack = 0; /* If we send something zap the FA flag */ 19825 if (IN_FASTRECOVERY(tp->t_flags) && rsm) 19826 rack->r_ctl.retran_during_recovery += len; 19827 { 19828 int idx; 19829 19830 idx = (len / segsiz) + 3; 19831 if (idx >= TCP_MSS_ACCT_ATIMER) 19832 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 19833 else 19834 counter_u64_add(rack_out_size[idx], 1); 19835 } 19836 if (tp->t_rtttime == 0) { 19837 tp->t_rtttime = ticks; 19838 tp->t_rtseq = startseq; 19839 KMOD_TCPSTAT_INC(tcps_segstimed); 19840 } 19841 counter_u64_add(rack_fto_rsm_send, 1); 19842 if (error && (error == ENOBUFS)) { 19843 if (rack->r_ctl.crte != NULL) { 19844 tcp_trace_point(rack->rc_tp, TCP_TP_HWENOBUF); 19845 if (tcp_bblogging_on(rack->rc_tp)) 19846 rack_log_queue_level(tp, rack, len, tv, cts); 19847 } else 19848 tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF); 19849 slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); 19850 if (rack->rc_enobuf < 0x7f) 19851 rack->rc_enobuf++; 19852 if (slot < (10 * HPTS_USEC_IN_MSEC)) 19853 slot = 10 * HPTS_USEC_IN_MSEC; 19854 if (rack->r_ctl.crte != NULL) { 19855 counter_u64_add(rack_saw_enobuf_hw, 1); 19856 tcp_rl_log_enobuf(rack->r_ctl.crte); 19857 } 19858 counter_u64_add(rack_saw_enobuf, 1); 19859 } else { 19860 slot = rack_get_pacing_delay(rack, tp, len, NULL, segsiz, __LINE__); 19861 } 19862 rack_start_hpts_timer(rack, tp, cts, slot, len, 0); 19863 #ifdef TCP_ACCOUNTING 19864 crtsc = get_cyclecount(); 19865 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19866 tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru; 19867 } 19868 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19869 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); 19870 } 19871 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19872 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((len + segsiz - 1) / segsiz); 19873 } 19874 sched_unpin(); 19875 #endif 19876 return (0); 19877 failed: 19878 if (m) 19879 m_free(m); 19880 return (-1); 19881 } 19882 19883 static void 19884 rack_sndbuf_autoscale(struct tcp_rack *rack) 19885 { 19886 /* 19887 * Automatic sizing of send socket buffer. Often the send buffer 19888 * size is not optimally adjusted to the actual network conditions 19889 * at hand (delay bandwidth product). Setting the buffer size too 19890 * small limits throughput on links with high bandwidth and high 19891 * delay (eg. trans-continental/oceanic links). Setting the 19892 * buffer size too big consumes too much real kernel memory, 19893 * especially with many connections on busy servers. 19894 * 19895 * The criteria to step up the send buffer one notch are: 19896 * 1. receive window of remote host is larger than send buffer 19897 * (with a fudge factor of 5/4th); 19898 * 2. send buffer is filled to 7/8th with data (so we actually 19899 * have data to make use of it); 19900 * 3. send buffer fill has not hit maximal automatic size; 19901 * 4. our send window (slow start and cogestion controlled) is 19902 * larger than sent but unacknowledged data in send buffer. 
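 *
 *    For example (illustrative numbers only): with sb_hiwat at 64KB and
 *    rack_autosndbuf_inc set to 20 (percent), one step grows the buffer by
 *    max(20% of 64KB, V_tcp_autosndbuf_inc), roughly 13KB, clamped to
 *    V_tcp_autosndbuf_max, instead of the base stack's fixed per-step increment.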
19903 * 19904 * Note that the rack version moves things much faster since 19905 * we want to avoid hitting cache lines in the rack_fast_output() 19906 * path so this is called much less often and thus moves 19907 * the SB forward by a percentage. 19908 */ 19909 struct socket *so; 19910 struct tcpcb *tp; 19911 uint32_t sendwin, scaleup; 19912 19913 tp = rack->rc_tp; 19914 so = rack->rc_inp->inp_socket; 19915 sendwin = min(rack->r_ctl.cwnd_to_use, tp->snd_wnd); 19916 if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { 19917 if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat && 19918 sbused(&so->so_snd) >= 19919 (so->so_snd.sb_hiwat / 8 * 7) && 19920 sbused(&so->so_snd) < V_tcp_autosndbuf_max && 19921 sendwin >= (sbused(&so->so_snd) - 19922 (tp->snd_max - tp->snd_una))) { 19923 if (rack_autosndbuf_inc) 19924 scaleup = (rack_autosndbuf_inc * so->so_snd.sb_hiwat) / 100; 19925 else 19926 scaleup = V_tcp_autosndbuf_inc; 19927 if (scaleup < V_tcp_autosndbuf_inc) 19928 scaleup = V_tcp_autosndbuf_inc; 19929 scaleup += so->so_snd.sb_hiwat; 19930 if (scaleup > V_tcp_autosndbuf_max) 19931 scaleup = V_tcp_autosndbuf_max; 19932 if (!sbreserve_locked(so, SO_SND, scaleup, curthread)) 19933 so->so_snd.sb_flags &= ~SB_AUTOSIZE; 19934 } 19935 } 19936 } 19937 19938 static int 19939 rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val, 19940 uint32_t cts, uint32_t ms_cts, struct timeval *tv, long tot_len, int *send_err) 19941 { 19942 /* 19943 * Enter to do fast output. We are given that the sched_pin is 19944 * in place (if accounting is compiled in) and the cycle count taken 19945 * at entry is in place in ts_val. The idea here is that 19946 * we know how many more bytes needs to be sent (presumably either 19947 * during pacing or to fill the cwnd and that was greater than 19948 * the max-burst). We have how much to send and all the info we 19949 * need to just send. 
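 *
 * (Sketch of the contract, as read from the code below: the fast send
 * block, r_ctl.fsb, already holds the prebuilt TCP/IP header template,
 * the TCP flags and left_to_send; on any surprise we goto failed, clear
 * r_fast_output and return -1 so the caller can fall back to the normal
 * output path, otherwise we return 0.)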
19950 */ 19951 #ifdef INET 19952 struct ip *ip = NULL; 19953 #endif 19954 struct udphdr *udp = NULL; 19955 struct tcphdr *th = NULL; 19956 struct mbuf *m, *s_mb; 19957 struct inpcb *inp; 19958 uint8_t *cpto; 19959 struct tcp_log_buffer *lgb; 19960 #ifdef TCP_ACCOUNTING 19961 uint64_t crtsc; 19962 #endif 19963 struct tcpopt to; 19964 u_char opt[TCP_MAXOLEN]; 19965 uint32_t hdrlen, optlen; 19966 #ifdef TCP_ACCOUNTING 19967 int cnt_thru = 1; 19968 #endif 19969 int32_t slot, segsiz, len, max_val, tso = 0, sb_offset, error, ulen = 0; 19970 uint16_t flags; 19971 uint32_t s_soff; 19972 uint32_t if_hw_tsomaxsegcount = 0, startseq; 19973 uint32_t if_hw_tsomaxsegsize; 19974 uint32_t add_flag = RACK_SENT_FP; 19975 #ifdef INET6 19976 struct ip6_hdr *ip6 = NULL; 19977 19978 if (rack->r_is_v6) { 19979 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 19980 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 19981 } else 19982 #endif /* INET6 */ 19983 { 19984 #ifdef INET 19985 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 19986 hdrlen = sizeof(struct tcpiphdr); 19987 #endif 19988 } 19989 if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) { 19990 m = NULL; 19991 goto failed; 19992 } 19993 rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 19994 startseq = tp->snd_max; 19995 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 19996 inp = rack->rc_inp; 19997 len = rack->r_ctl.fsb.left_to_send; 19998 to.to_flags = 0; 19999 flags = rack->r_ctl.fsb.tcp_flags; 20000 if (tp->t_flags & TF_RCVD_TSTMP) { 20001 to.to_tsval = ms_cts + tp->ts_offset; 20002 to.to_tsecr = tp->ts_recent; 20003 to.to_flags = TOF_TS; 20004 } 20005 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 20006 /* TCP-MD5 (RFC2385). */ 20007 if (tp->t_flags & TF_SIGNATURE) 20008 to.to_flags |= TOF_SIGNATURE; 20009 #endif 20010 optlen = tcp_addoptions(&to, opt); 20011 hdrlen += optlen; 20012 udp = rack->r_ctl.fsb.udp; 20013 if (udp) 20014 hdrlen += sizeof(struct udphdr); 20015 if (rack->r_ctl.rc_pace_max_segs) 20016 max_val = rack->r_ctl.rc_pace_max_segs; 20017 else if (rack->rc_user_set_max_segs) 20018 max_val = rack->rc_user_set_max_segs * segsiz; 20019 else 20020 max_val = len; 20021 if ((tp->t_flags & TF_TSO) && 20022 V_tcp_do_tso && 20023 (len > segsiz) && 20024 (tp->t_port == 0)) 20025 tso = 1; 20026 again: 20027 #ifdef INET6 20028 if (MHLEN < hdrlen + max_linkhdr) 20029 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 20030 else 20031 #endif 20032 m = m_gethdr(M_NOWAIT, MT_DATA); 20033 if (m == NULL) 20034 goto failed; 20035 m->m_data += max_linkhdr; 20036 m->m_len = hdrlen; 20037 th = rack->r_ctl.fsb.th; 20038 /* Establish the len to send */ 20039 if (len > max_val) 20040 len = max_val; 20041 if ((tso) && (len + optlen > segsiz)) { 20042 uint32_t if_hw_tsomax; 20043 int32_t max_len; 20044 20045 /* extract TSO information */ 20046 if_hw_tsomax = tp->t_tsomax; 20047 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 20048 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 20049 /* 20050 * Check if we should limit by maximum payload 20051 * length: 20052 */ 20053 if (if_hw_tsomax != 0) { 20054 /* compute maximum TSO length */ 20055 max_len = (if_hw_tsomax - hdrlen - 20056 max_linkhdr); 20057 if (max_len <= 0) { 20058 goto failed; 20059 } else if (len > max_len) { 20060 len = max_len; 20061 } 20062 } 20063 if (len <= segsiz) { 20064 /* 20065 * In case there are too many small fragments don't 20066 * use TSO: 20067 */ 20068 tso = 0; 20069 } 20070 } else { 20071 tso = 0; 20072 } 20073 if ((tso == 0) && (len > segsiz)) 20074 len = segsiz; 20075 
(void)tcp_get_usecs(tv); 20076 if ((len == 0) || 20077 (len <= MHLEN - hdrlen - max_linkhdr)) { 20078 goto failed; 20079 } 20080 sb_offset = tp->snd_max - tp->snd_una; 20081 th->th_seq = htonl(tp->snd_max); 20082 th->th_ack = htonl(tp->rcv_nxt); 20083 th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale)); 20084 if (th->th_win == 0) { 20085 tp->t_sndzerowin++; 20086 tp->t_flags |= TF_RXWIN0SENT; 20087 } else 20088 tp->t_flags &= ~TF_RXWIN0SENT; 20089 tp->snd_up = tp->snd_una; /* drag it along, its deprecated */ 20090 KMOD_TCPSTAT_INC(tcps_sndpack); 20091 KMOD_TCPSTAT_ADD(tcps_sndbyte, len); 20092 #ifdef STATS 20093 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 20094 len); 20095 #endif 20096 if (rack->r_ctl.fsb.m == NULL) 20097 goto failed; 20098 20099 /* s_mb and s_soff are saved for rack_log_output */ 20100 m->m_next = rack_fo_m_copym(rack, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, 20101 &s_mb, &s_soff); 20102 if (len <= segsiz) { 20103 /* 20104 * Must have ran out of mbufs for the copy 20105 * shorten it to no longer need tso. Lets 20106 * not put on sendalot since we are low on 20107 * mbufs. 20108 */ 20109 tso = 0; 20110 } 20111 if (rack->r_ctl.fsb.rfo_apply_push && 20112 (len == rack->r_ctl.fsb.left_to_send)) { 20113 tcp_set_flags(th, flags | TH_PUSH); 20114 add_flag |= RACK_HAD_PUSH; 20115 } 20116 if ((m->m_next == NULL) || (len <= 0)){ 20117 goto failed; 20118 } 20119 if (udp) { 20120 if (rack->r_is_v6) 20121 ulen = hdrlen + len - sizeof(struct ip6_hdr); 20122 else 20123 ulen = hdrlen + len - sizeof(struct ip); 20124 udp->uh_ulen = htons(ulen); 20125 } 20126 m->m_pkthdr.rcvif = (struct ifnet *)0; 20127 if (TCPS_HAVERCVDSYN(tp->t_state) && 20128 (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { 20129 int ect = tcp_ecn_output_established(tp, &flags, len, false); 20130 if ((tp->t_state == TCPS_SYN_RECEIVED) && 20131 (tp->t_flags2 & TF2_ECN_SND_ECE)) 20132 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 20133 #ifdef INET6 20134 if (rack->r_is_v6) { 20135 ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20); 20136 ip6->ip6_flow |= htonl(ect << 20); 20137 } 20138 else 20139 #endif 20140 { 20141 #ifdef INET 20142 ip->ip_tos &= ~IPTOS_ECN_MASK; 20143 ip->ip_tos |= ect; 20144 #endif 20145 } 20146 } 20147 tcp_set_flags(th, flags); 20148 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 20149 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 20150 if (to.to_flags & TOF_SIGNATURE) { 20151 /* 20152 * Calculate MD5 signature and put it into the place 20153 * determined before. 20154 * NOTE: since TCP options buffer doesn't point into 20155 * mbuf's data, calculate offset and use it. 20156 */ 20157 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, 20158 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { 20159 /* 20160 * Do not send segment if the calculation of MD5 20161 * digest has failed. 
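 * (TCPMD5_ENABLED() is false when TCP-MD5 support is not available; in
 * that case, as on a signing failure, we do not send and the fast path
 * bails out via failed.)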
20162 */ 20163 goto failed; 20164 } 20165 } 20166 #endif 20167 #ifdef INET6 20168 if (rack->r_is_v6) { 20169 if (tp->t_port) { 20170 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 20171 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 20172 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 20173 th->th_sum = htons(0); 20174 UDPSTAT_INC(udps_opackets); 20175 } else { 20176 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 20177 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 20178 th->th_sum = in6_cksum_pseudo(ip6, 20179 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 20180 0); 20181 } 20182 } 20183 #endif 20184 #if defined(INET6) && defined(INET) 20185 else 20186 #endif 20187 #ifdef INET 20188 { 20189 if (tp->t_port) { 20190 m->m_pkthdr.csum_flags = CSUM_UDP; 20191 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 20192 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 20193 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 20194 th->th_sum = htons(0); 20195 UDPSTAT_INC(udps_opackets); 20196 } else { 20197 m->m_pkthdr.csum_flags = CSUM_TCP; 20198 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 20199 th->th_sum = in_pseudo(ip->ip_src.s_addr, 20200 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 20201 IPPROTO_TCP + len + optlen)); 20202 } 20203 /* IP version must be set here for ipv4/ipv6 checking later */ 20204 KASSERT(ip->ip_v == IPVERSION, 20205 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 20206 } 20207 #endif 20208 if (tso) { 20209 /* 20210 * Here we use segsiz since we have no added options besides 20211 * any standard timestamp options (no DSACKs or SACKS are sent 20212 * via either fast-path). 20213 */ 20214 KASSERT(len > segsiz, 20215 ("%s: len <= tso_segsz tp:%p", __func__, tp)); 20216 m->m_pkthdr.csum_flags |= CSUM_TSO; 20217 m->m_pkthdr.tso_segsz = segsiz; 20218 } 20219 #ifdef INET6 20220 if (rack->r_is_v6) { 20221 ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit; 20222 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 20223 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 20224 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 20225 else 20226 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 20227 } 20228 #endif 20229 #if defined(INET) && defined(INET6) 20230 else 20231 #endif 20232 #ifdef INET 20233 { 20234 ip->ip_len = htons(m->m_pkthdr.len); 20235 ip->ip_ttl = rack->r_ctl.fsb.hoplimit; 20236 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 20237 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 20238 if (tp->t_port == 0 || len < V_tcp_minmss) { 20239 ip->ip_off |= htons(IP_DF); 20240 } 20241 } else { 20242 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 20243 } 20244 } 20245 #endif 20246 if (tp->snd_cwnd > tp->snd_ssthresh) { 20247 /* Set we sent in CA */ 20248 rack->rc_gp_saw_ca = 1; 20249 } else { 20250 /* Set we sent in SS */ 20251 rack->rc_gp_saw_ss = 1; 20252 } 20253 /* Time to copy in our header */ 20254 cpto = mtod(m, uint8_t *); 20255 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); 20256 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr)); 20257 if (optlen) { 20258 bcopy(opt, th + 1, optlen); 20259 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 20260 } else { 20261 th->th_off = sizeof(struct tcphdr) >> 2; 20262 } 20263 if ((rack->r_ctl.crte != NULL) && 20264 tcp_bblogging_on(tp)) { 20265 rack_log_queue_level(tp, rack, len, tv, cts); 20266 } 20267 if (tcp_bblogging_on(rack->rc_tp)) { 20268 union tcp_log_stackspecific log; 20269 20270 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 20271 log.u_bbr.inhpts = 
tcp_in_hpts(rack->rc_tp); 20272 if (rack->rack_no_prr) 20273 log.u_bbr.flex1 = 0; 20274 else 20275 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 20276 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 20277 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 20278 log.u_bbr.flex4 = max_val; 20279 /* Save off the early/late values */ 20280 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 20281 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 20282 log.u_bbr.bw_inuse = rack_get_bw(rack); 20283 log.u_bbr.cur_del_rate = rack->r_ctl.gp_bw; 20284 log.u_bbr.flex8 = 0; 20285 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); 20286 log.u_bbr.flex7 = 44; 20287 log.u_bbr.pkts_out = tp->t_maxseg; 20288 log.u_bbr.timeStamp = cts; 20289 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 20290 log.u_bbr.flex5 = log.u_bbr.inflight; 20291 log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use; 20292 log.u_bbr.delivered = 0; 20293 log.u_bbr.rttProp = 0; 20294 log.u_bbr.delRate = rack->r_must_retran; 20295 log.u_bbr.delRate <<= 1; 20296 log.u_bbr.pkt_epoch = __LINE__; 20297 /* For fast output no retrans so just inflight and how many mss we send */ 20298 log.u_bbr.flex5 = log.u_bbr.inflight; 20299 log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz); 20300 lgb = tcp_log_event(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, 20301 len, &log, false, NULL, __func__, __LINE__, tv); 20302 } else 20303 lgb = NULL; 20304 #ifdef INET6 20305 if (rack->r_is_v6) { 20306 error = ip6_output(m, inp->in6p_outputopts, 20307 &inp->inp_route6, 20308 0, NULL, NULL, inp); 20309 } 20310 #endif 20311 #if defined(INET) && defined(INET6) 20312 else 20313 #endif 20314 #ifdef INET 20315 { 20316 error = ip_output(m, NULL, 20317 &inp->inp_route, 20318 0, 0, inp); 20319 } 20320 #endif 20321 if (lgb) { 20322 lgb->tlb_errno = error; 20323 lgb = NULL; 20324 } 20325 if (error) { 20326 *send_err = error; 20327 m = NULL; 20328 goto failed; 20329 } else if (rack->rc_hw_nobuf) { 20330 rack->rc_hw_nobuf = 0; 20331 rack->r_ctl.rc_agg_delayed = 0; 20332 rack->r_early = 0; 20333 rack->r_late = 0; 20334 rack->r_ctl.rc_agg_early = 0; 20335 } 20336 if ((error == 0) && (rack->lt_bw_up == 0)) { 20337 /* Unlikely */ 20338 rack->r_ctl.lt_timemark = tcp_tv_to_lusectick(tv); 20339 rack->r_ctl.lt_seq = tp->snd_una; 20340 rack->lt_bw_up = 1; 20341 } else if ((error == 0) && 20342 (((tp->snd_max + len) - rack->r_ctl.lt_seq) > 0x7fffffff)) { 20343 /* 20344 * Need to record what we have since we are 20345 * approaching seq wrap. 
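 * (The long-term bandwidth sample is effectively lt_bw_bytes over
 * lt_bw_time, so fold what has accumulated since lt_seq into those
 * totals now; once the un-sampled span nears 0x7fffffff, half the
 * 32-bit sequence space, the unsigned sequence differences become
 * ambiguous.)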
20346 */ 20347 struct timeval tv; 20348 uint64_t tmark; 20349 20350 rack->r_ctl.lt_bw_bytes += (tp->snd_una - rack->r_ctl.lt_seq); 20351 rack->r_ctl.lt_seq = tp->snd_una; 20352 tmark = tcp_get_u64_usecs(&tv); 20353 if (tmark > rack->r_ctl.lt_timemark) { 20354 rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark); 20355 rack->r_ctl.lt_timemark = tmark; 20356 } 20357 } 20358 rack_log_output(tp, &to, len, tp->snd_max, flags, error, rack_to_usec_ts(tv), 20359 NULL, add_flag, s_mb, s_soff, rack->r_ctl.fsb.hw_tls, segsiz); 20360 m = NULL; 20361 if (tp->snd_una == tp->snd_max) { 20362 rack->r_ctl.rc_tlp_rxt_last_time = cts; 20363 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 20364 tp->t_acktime = ticks; 20365 } 20366 counter_u64_add(rack_total_bytes, len); 20367 tcp_account_for_send(tp, len, 0, 0, rack->r_ctl.fsb.hw_tls); 20368 20369 rack->forced_ack = 0; /* If we send something zap the FA flag */ 20370 tot_len += len; 20371 if ((tp->t_flags & TF_GPUTINPROG) == 0) 20372 rack_start_gp_measurement(tp, rack, tp->snd_max, sb_offset); 20373 tp->snd_max += len; 20374 tp->snd_nxt = tp->snd_max; 20375 if (rack->rc_new_rnd_needed) { 20376 rack_new_round_starts(tp, rack, tp->snd_max); 20377 } 20378 { 20379 int idx; 20380 20381 idx = (len / segsiz) + 3; 20382 if (idx >= TCP_MSS_ACCT_ATIMER) 20383 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 20384 else 20385 counter_u64_add(rack_out_size[idx], 1); 20386 } 20387 if (len <= rack->r_ctl.fsb.left_to_send) 20388 rack->r_ctl.fsb.left_to_send -= len; 20389 else 20390 rack->r_ctl.fsb.left_to_send = 0; 20391 if (rack->r_ctl.fsb.left_to_send < segsiz) { 20392 rack->r_fast_output = 0; 20393 rack->r_ctl.fsb.left_to_send = 0; 20394 /* At the end of fast_output scale up the sb */ 20395 SOCKBUF_LOCK(&rack->rc_inp->inp_socket->so_snd); 20396 rack_sndbuf_autoscale(rack); 20397 SOCKBUF_UNLOCK(&rack->rc_inp->inp_socket->so_snd); 20398 } 20399 if (tp->t_rtttime == 0) { 20400 tp->t_rtttime = ticks; 20401 tp->t_rtseq = startseq; 20402 KMOD_TCPSTAT_INC(tcps_segstimed); 20403 } 20404 if ((rack->r_ctl.fsb.left_to_send >= segsiz) && 20405 (max_val > len) && 20406 (tso == 0)) { 20407 max_val -= len; 20408 len = segsiz; 20409 th = rack->r_ctl.fsb.th; 20410 #ifdef TCP_ACCOUNTING 20411 cnt_thru++; 20412 #endif 20413 goto again; 20414 } 20415 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 20416 counter_u64_add(rack_fto_send, 1); 20417 slot = rack_get_pacing_delay(rack, tp, tot_len, NULL, segsiz, __LINE__); 20418 rack_start_hpts_timer(rack, tp, cts, slot, tot_len, 0); 20419 #ifdef TCP_ACCOUNTING 20420 crtsc = get_cyclecount(); 20421 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 20422 tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru; 20423 } 20424 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 20425 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); 20426 } 20427 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 20428 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len + segsiz - 1) / segsiz); 20429 } 20430 sched_unpin(); 20431 #endif 20432 return (0); 20433 failed: 20434 if (m) 20435 m_free(m); 20436 rack->r_fast_output = 0; 20437 return (-1); 20438 } 20439 20440 static inline void 20441 rack_setup_fast_output(struct tcpcb *tp, struct tcp_rack *rack, 20442 struct sockbuf *sb, 20443 int len, int orig_len, int segsiz, uint32_t pace_max_seg, 20444 bool hw_tls, 20445 uint16_t flags) 20446 { 20447 rack->r_fast_output = 1; 20448 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); 20449 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; 20450 
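	/*
	 * The mbuf's current length and trailing space are recorded here so
	 * that the fast-path copy can detect whether this mbuf changed (e.g.
	 * more data was appended to it) after the fast send block was set up.
	 */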
rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(rack->r_ctl.fsb.m); 20451 rack->r_ctl.fsb.tcp_flags = flags; 20452 rack->r_ctl.fsb.left_to_send = orig_len - len; 20453 if (rack->r_ctl.fsb.left_to_send < pace_max_seg) { 20454 /* Less than a full sized pace, lets not */ 20455 rack->r_fast_output = 0; 20456 return; 20457 } else { 20458 /* Round down to the nearest pace_max_seg */ 20459 rack->r_ctl.fsb.left_to_send = rounddown(rack->r_ctl.fsb.left_to_send, pace_max_seg); 20460 } 20461 if (hw_tls) 20462 rack->r_ctl.fsb.hw_tls = 1; 20463 else 20464 rack->r_ctl.fsb.hw_tls = 0; 20465 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))), 20466 ("rack:%p left_to_send:%u sbavail:%u out:%u", 20467 rack, rack->r_ctl.fsb.left_to_send, sbavail(sb), 20468 (tp->snd_max - tp->snd_una))); 20469 if (rack->r_ctl.fsb.left_to_send < segsiz) 20470 rack->r_fast_output = 0; 20471 else { 20472 if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una))) 20473 rack->r_ctl.fsb.rfo_apply_push = 1; 20474 else 20475 rack->r_ctl.fsb.rfo_apply_push = 0; 20476 } 20477 } 20478 20479 static uint32_t 20480 rack_get_hpts_pacing_min_for_bw(struct tcp_rack *rack, int32_t segsiz) 20481 { 20482 uint64_t min_time; 20483 uint32_t maxlen; 20484 20485 min_time = (uint64_t)get_hpts_min_sleep_time(); 20486 maxlen = (uint32_t)((rack->r_ctl.gp_bw * min_time) / (uint64_t)HPTS_USEC_IN_SEC); 20487 maxlen = roundup(maxlen, segsiz); 20488 return (maxlen); 20489 } 20490 20491 static struct rack_sendmap * 20492 rack_check_collapsed(struct tcp_rack *rack, uint32_t cts) 20493 { 20494 struct rack_sendmap *rsm = NULL; 20495 int thresh; 20496 20497 restart: 20498 rsm = tqhash_find(rack->r_ctl.tqh, rack->r_ctl.last_collapse_point); 20499 if ((rsm == NULL) || ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0)) { 20500 /* Nothing, strange turn off validity */ 20501 rack->r_collapse_point_valid = 0; 20502 return (NULL); 20503 } 20504 /* Can we send it yet? */ 20505 if (rsm->r_end > (rack->rc_tp->snd_una + rack->rc_tp->snd_wnd)) { 20506 /* 20507 * Receiver window has not grown enough for 20508 * the segment to be put on the wire. 20509 */ 20510 return (NULL); 20511 } 20512 if (rsm->r_flags & RACK_ACKED) { 20513 /* 20514 * It has been sacked, lets move to the 20515 * next one if possible. 20516 */ 20517 rack->r_ctl.last_collapse_point = rsm->r_end; 20518 /* Are we done? */ 20519 if (SEQ_GEQ(rack->r_ctl.last_collapse_point, 20520 rack->r_ctl.high_collapse_point)) { 20521 rack->r_collapse_point_valid = 0; 20522 return (NULL); 20523 } 20524 goto restart; 20525 } 20526 /* Now has it been long enough ? */ 20527 thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(rack->rc_tp, rack), cts, __LINE__, 1); 20528 if ((cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])) > thresh) { 20529 rack_log_collapse(rack, rsm->r_start, 20530 (cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])), 20531 thresh, __LINE__, 6, rsm->r_flags, rsm); 20532 return (rsm); 20533 } 20534 /* Not enough time */ 20535 rack_log_collapse(rack, rsm->r_start, 20536 (cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])), 20537 thresh, __LINE__, 7, rsm->r_flags, rsm); 20538 return (NULL); 20539 } 20540 20541 static void 20542 rack_credit_back_policer_idle_time(struct tcp_rack *rack, uint64_t idle_t, int line) 20543 { 20544 /* 20545 * We were idle some time (idle_t) and so our policer bucket 20546 * needs to grow. It can go no higher than policer_bucket_size. 
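 * The credit is idle_t (microseconds) * policer_bw (bytes/second)
 * divided by HPTS_USEC_IN_SEC.  With illustrative numbers only: 50 ms
 * of idle against a detected 1.25 MB/s policer rate adds back about
 * 62500 bytes, and the result is clamped to policer_bucket_size.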
20547 */ 20548 uint64_t len; 20549 20550 len = idle_t * rack->r_ctl.policer_bw; 20551 len /= HPTS_USEC_IN_SEC; 20552 rack->r_ctl.current_policer_bucket += (uint32_t)len; 20553 if (rack->r_ctl.policer_bucket_size < rack->r_ctl.current_policer_bucket) { 20554 rack->r_ctl.current_policer_bucket = rack->r_ctl.policer_bucket_size; 20555 } 20556 if (rack_verbose_logging > 0) 20557 policer_detection_log(rack, (uint32_t)len, line, (uint32_t)idle_t, 0, 7); 20558 } 20559 20560 static inline void 20561 rack_validate_sizes(struct tcp_rack *rack, int32_t *len, int32_t segsiz, uint32_t pace_max_seg) 20562 { 20563 if ((rack->full_size_rxt == 0) && 20564 (rack->shape_rxt_to_pacing_min == 0) && 20565 (*len >= segsiz)) { 20566 *len = segsiz; 20567 } else if (rack->shape_rxt_to_pacing_min && 20568 rack->gp_ready) { 20569 /* We use pacing min as shaping len req */ 20570 uint32_t maxlen; 20571 20572 maxlen = rack_get_hpts_pacing_min_for_bw(rack, segsiz); 20573 if (*len > maxlen) 20574 *len = maxlen; 20575 } else { 20576 /* 20577 * The else is full_size_rxt is on so send it all 20578 * note we do need to check this for exceeding 20579 * our max segment size due to the fact that 20580 * we do sometimes merge chunks together i.e. 20581 * we cannot just assume that we will never have 20582 * a chunk greater than pace_max_seg 20583 */ 20584 if (*len > pace_max_seg) 20585 *len = pace_max_seg; 20586 } 20587 } 20588 20589 static int 20590 rack_output(struct tcpcb *tp) 20591 { 20592 struct socket *so; 20593 uint32_t recwin; 20594 uint32_t sb_offset, s_moff = 0; 20595 int32_t len, error = 0; 20596 uint16_t flags; 20597 struct mbuf *m, *s_mb = NULL; 20598 struct mbuf *mb; 20599 uint32_t if_hw_tsomaxsegcount = 0; 20600 uint32_t if_hw_tsomaxsegsize; 20601 int32_t segsiz, minseg; 20602 long tot_len_this_send = 0; 20603 #ifdef INET 20604 struct ip *ip = NULL; 20605 #endif 20606 struct udphdr *udp = NULL; 20607 struct tcp_rack *rack; 20608 struct tcphdr *th; 20609 uint8_t pass = 0; 20610 uint8_t mark = 0; 20611 uint8_t check_done = 0; 20612 uint8_t wanted_cookie = 0; 20613 u_char opt[TCP_MAXOLEN]; 20614 unsigned ipoptlen, optlen, hdrlen, ulen=0; 20615 uint32_t rack_seq; 20616 20617 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 20618 unsigned ipsec_optlen = 0; 20619 20620 #endif 20621 int32_t idle, sendalot; 20622 uint32_t tot_idle; 20623 int32_t sub_from_prr = 0; 20624 volatile int32_t sack_rxmit; 20625 struct rack_sendmap *rsm = NULL; 20626 int32_t tso, mtu; 20627 struct tcpopt to; 20628 int32_t slot = 0; 20629 int32_t sup_rack = 0; 20630 uint32_t cts, ms_cts, delayed, early; 20631 uint32_t add_flag = RACK_SENT_SP; 20632 /* The doing_tlp flag will be set by the actual rack_timeout_tlp() */ 20633 uint8_t doing_tlp = 0; 20634 uint32_t cwnd_to_use, pace_max_seg; 20635 int32_t do_a_prefetch = 0; 20636 int32_t prefetch_rsm = 0; 20637 int32_t orig_len = 0; 20638 struct timeval tv; 20639 int32_t prefetch_so_done = 0; 20640 struct tcp_log_buffer *lgb; 20641 struct inpcb *inp = tptoinpcb(tp); 20642 struct sockbuf *sb; 20643 uint64_t ts_val = 0; 20644 #ifdef TCP_ACCOUNTING 20645 uint64_t crtsc; 20646 #endif 20647 #ifdef INET6 20648 struct ip6_hdr *ip6 = NULL; 20649 int32_t isipv6; 20650 #endif 20651 bool hpts_calling, hw_tls = false; 20652 20653 NET_EPOCH_ASSERT(); 20654 INP_WLOCK_ASSERT(inp); 20655 20656 /* setup and take the cache hits here */ 20657 rack = (struct tcp_rack *)tp->t_fb_ptr; 20658 #ifdef TCP_ACCOUNTING 20659 sched_pin(); 20660 ts_val = get_cyclecount(); 20661 #endif 20662 hpts_calling = !!(tp->t_flags2 & 
TF2_HPTS_CALLS); 20663 tp->t_flags2 &= ~TF2_HPTS_CALLS; 20664 #ifdef TCP_OFFLOAD 20665 if (tp->t_flags & TF_TOE) { 20666 #ifdef TCP_ACCOUNTING 20667 sched_unpin(); 20668 #endif 20669 return (tcp_offload_output(tp)); 20670 } 20671 #endif 20672 if (rack->rack_deferred_inited == 0) { 20673 /* 20674 * If we are the connecting socket we will 20675 * hit rack_init() when no sequence numbers 20676 * are setup. This makes it so we must defer 20677 * some initialization. Call that now. 20678 */ 20679 rack_deferred_init(tp, rack); 20680 } 20681 /* 20682 * For TFO connections in SYN_RECEIVED, only allow the initial 20683 * SYN|ACK and those sent by the retransmit timer. 20684 */ 20685 if ((tp->t_flags & TF_FASTOPEN) && 20686 (tp->t_state == TCPS_SYN_RECEIVED) && 20687 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */ 20688 (rack->r_ctl.rc_resend == NULL)) { /* not a retransmit */ 20689 #ifdef TCP_ACCOUNTING 20690 sched_unpin(); 20691 #endif 20692 return (0); 20693 } 20694 #ifdef INET6 20695 if (rack->r_state) { 20696 /* Use the cache line loaded if possible */ 20697 isipv6 = rack->r_is_v6; 20698 } else { 20699 isipv6 = (rack->rc_inp->inp_vflag & INP_IPV6) != 0; 20700 } 20701 #endif 20702 early = 0; 20703 cts = tcp_get_usecs(&tv); 20704 ms_cts = tcp_tv_to_mssectick(&tv); 20705 if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && 20706 tcp_in_hpts(rack->rc_tp)) { 20707 /* 20708 * We are on the hpts for some timer but not hptsi output. 20709 * Remove from the hpts unconditionally. 20710 */ 20711 rack_timer_cancel(tp, rack, cts, __LINE__); 20712 } 20713 /* Are we pacing and late? */ 20714 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 20715 TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) { 20716 /* We are delayed */ 20717 delayed = cts - rack->r_ctl.rc_last_output_to; 20718 } else { 20719 delayed = 0; 20720 } 20721 /* Do the timers, which may override the pacer */ 20722 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 20723 int retval; 20724 20725 retval = rack_process_timers(tp, rack, cts, hpts_calling, 20726 &doing_tlp); 20727 if (retval != 0) { 20728 counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); 20729 #ifdef TCP_ACCOUNTING 20730 sched_unpin(); 20731 #endif 20732 /* 20733 * If timers want tcp_drop(), then pass error out, 20734 * otherwise suppress it. 20735 */ 20736 return (retval < 0 ? retval : 0); 20737 } 20738 } 20739 if (rack->rc_in_persist) { 20740 if (tcp_in_hpts(rack->rc_tp) == 0) { 20741 /* Timer is not running */ 20742 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 20743 } 20744 #ifdef TCP_ACCOUNTING 20745 sched_unpin(); 20746 #endif 20747 return (0); 20748 } 20749 if ((rack->rc_ack_required == 1) && 20750 (rack->r_timer_override == 0)){ 20751 /* A timeout occurred and no ack has arrived */ 20752 if (tcp_in_hpts(rack->rc_tp) == 0) { 20753 /* Timer is not running */ 20754 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 20755 } 20756 #ifdef TCP_ACCOUNTING 20757 sched_unpin(); 20758 #endif 20759 return (0); 20760 } 20761 if ((rack->r_timer_override) || 20762 (rack->rc_ack_can_sendout_data) || 20763 (delayed) || 20764 (tp->t_state < TCPS_ESTABLISHED)) { 20765 rack->rc_ack_can_sendout_data = 0; 20766 if (tcp_in_hpts(rack->rc_tp)) 20767 tcp_hpts_remove(rack->rc_tp); 20768 } else if (tcp_in_hpts(rack->rc_tp)) { 20769 /* 20770 * On the hpts you can't pass even if ACKNOW is on, we will 20771 * when the hpts fires. 
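 * (That is, the pacer slot is still pending; output will resume when
 * the hpts timer fires, and this attempt is only counted against
 * SND_BLOCKED / TCP_MSS_ACCT_INPACE below.)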
20772 */ 20773 #ifdef TCP_ACCOUNTING 20774 crtsc = get_cyclecount(); 20775 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 20776 tp->tcp_proc_time[SND_BLOCKED] += (crtsc - ts_val); 20777 } 20778 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 20779 tp->tcp_cnt_counters[SND_BLOCKED]++; 20780 } 20781 sched_unpin(); 20782 #endif 20783 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1); 20784 return (0); 20785 } 20786 /* Finish out both pacing early and late accounting */ 20787 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 20788 TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) { 20789 early = rack->r_ctl.rc_last_output_to - cts; 20790 } else 20791 early = 0; 20792 if (delayed && (rack->rc_always_pace == 1)) { 20793 rack->r_ctl.rc_agg_delayed += delayed; 20794 rack->r_late = 1; 20795 } else if (early && (rack->rc_always_pace == 1)) { 20796 rack->r_ctl.rc_agg_early += early; 20797 rack->r_early = 1; 20798 } else if (rack->rc_always_pace == 0) { 20799 /* Non-paced we are not late */ 20800 rack->r_ctl.rc_agg_delayed = rack->r_ctl.rc_agg_early = 0; 20801 rack->r_early = rack->r_late = 0; 20802 } 20803 /* Now that early/late accounting is done turn off the flag */ 20804 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 20805 rack->r_wanted_output = 0; 20806 rack->r_timer_override = 0; 20807 if ((tp->t_state != rack->r_state) && 20808 TCPS_HAVEESTABLISHED(tp->t_state)) { 20809 rack_set_state(tp, rack); 20810 } 20811 if ((rack->r_fast_output) && 20812 (doing_tlp == 0) && 20813 (tp->rcv_numsacks == 0)) { 20814 int ret; 20815 20816 error = 0; 20817 ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error); 20818 if (ret >= 0) 20819 return(ret); 20820 else if (error) { 20821 inp = rack->rc_inp; 20822 so = inp->inp_socket; 20823 sb = &so->so_snd; 20824 goto nomore; 20825 } 20826 } 20827 inp = rack->rc_inp; 20828 /* 20829 * For TFO connections in SYN_SENT or SYN_RECEIVED, 20830 * only allow the initial SYN or SYN|ACK and those sent 20831 * by the retransmit timer. 20832 */ 20833 if ((tp->t_flags & TF_FASTOPEN) && 20834 ((tp->t_state == TCPS_SYN_RECEIVED) || 20835 (tp->t_state == TCPS_SYN_SENT)) && 20836 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ 20837 (tp->t_rxtshift == 0)) { /* not a retransmit */ 20838 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 20839 so = inp->inp_socket; 20840 sb = &so->so_snd; 20841 goto just_return_nolock; 20842 } 20843 /* 20844 * Determine length of data that should be transmitted, and flags 20845 * that will be used. If there is some data or critical controls 20846 * (SYN, RST) to send, then transmit; otherwise, investigate 20847 * further. 
20848	 */
20849	idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
20850	if (tp->t_idle_reduce) {
20851		if (idle && (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur))
20852			rack_cc_after_idle(rack, tp);
20853	}
20854	tp->t_flags &= ~TF_LASTIDLE;
20855	if (idle) {
20856		if (tp->t_flags & TF_MORETOCOME) {
20857			tp->t_flags |= TF_LASTIDLE;
20858			idle = 0;
20859		}
20860	}
20861	if ((tp->snd_una == tp->snd_max) &&
20862	    rack->r_ctl.rc_went_idle_time &&
20863	    (cts > rack->r_ctl.rc_went_idle_time)) {
20864		tot_idle = (cts - rack->r_ctl.rc_went_idle_time);
20865		if (tot_idle > rack_min_probertt_hold) {
20866			/* Count as a probe rtt */
20867			if (rack->in_probe_rtt == 0) {
20868				rack->r_ctl.rc_lower_rtt_us_cts = cts;
20869				rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts;
20870				rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts;
20871				rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts;
20872			} else {
20873				rack_exit_probertt(rack, cts);
20874			}
20875		}
20876	}
20877	if (rack->policer_detect_on) {
20878		/*
20879		 * If we are doing policer detection we at a minimum
20880		 * record the time, but if possible add back to
20881		 * the bucket based on the idle time.
20882		 */
20883		uint64_t idle_t, u64_cts;
20884
20885		segsiz = min(ctf_fixed_maxseg(tp),
20886			     rack->r_ctl.rc_pace_min_segs);
20887		u64_cts = tcp_tv_to_lusectick(&tv);
20888		if ((rack->rc_policer_detected == 1) &&
20889		    (rack->r_ctl.policer_bucket_size > segsiz) &&
20890		    (rack->r_ctl.policer_bw > 0) &&
20891		    (u64_cts > rack->r_ctl.last_sendtime)) {
20892			/* We are being policed, add back the time */
20893			idle_t = u64_cts - rack->r_ctl.last_sendtime;
20894			rack_credit_back_policer_idle_time(rack, idle_t, __LINE__);
20895		}
20896		rack->r_ctl.last_sendtime = u64_cts;
20897	}
20898	if (rack_use_fsb &&
20899	    (rack->r_ctl.fsb.tcp_ip_hdr) &&
20900	    (rack->r_fsb_inited == 0) &&
20901	    (rack->r_state != TCPS_CLOSED))
20902		rack_init_fsb_block(tp, rack, tcp_outflags[tp->t_state]);
20903	if (rack->rc_sendvars_notset == 1) {
20904		rack->r_ctl.idle_snd_una = tp->snd_una;
20905		rack->rc_sendvars_notset = 0;
20906		/*
20907		 * Make sure no TCP timers (keep-alive) are running.
20908		 */
20909		tcp_timer_stop(tp);
20910	}
20911	if ((rack->rack_no_prr == 1) &&
20912	    (rack->rc_always_pace == 0)) {
20913		/*
20914		 * Sanity check before sending: if we have
20915		 * no-pacing enabled and PRR is turned off, that
20916		 * is a configuration error. Correct this by turning
20917		 * PRR back on. A user *must* set some form of
20918		 * pacing in order to turn PRR off. We do this
20919		 * in the output path so that we can avoid socket
20920		 * option ordering issues that would occur if we
20921		 * tried to do it while setting rack_no_prr on.
20922		 */
20923		rack->rack_no_prr = 0;
20924	}
20925	if ((rack->pcm_enabled == 1) &&
20926	    (rack->pcm_needed == 0) &&
20927	    (tot_idle > 0)) {
20928		/*
20929		 * We have been idle some microseconds. We need
20930		 * to factor this in to see if a PCM is needed.
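		 * Roughly: the idle time is converted into equivalent rounds
		 * (tot_idle / srtt) and accumulated in pcm_idle_rounds; once
		 * that, plus the rounds actually elapsed since the last PCM,
		 * reaches rack_pcm_every_n_rounds, pcm_needed is set and a
		 * new PCM probe is scheduled.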
20931 */ 20932 uint32_t rtts_idle, rnds; 20933 20934 if (tp->t_srtt) 20935 rtts_idle = tot_idle / tp->t_srtt; 20936 else 20937 rtts_idle = 0; 20938 rnds = rack->r_ctl.current_round - rack->r_ctl.last_pcm_round; 20939 rack->r_ctl.pcm_idle_rounds += rtts_idle; 20940 if ((rnds + rack->r_ctl.pcm_idle_rounds) >= rack_pcm_every_n_rounds) { 20941 rack->pcm_needed = 1; 20942 rack_log_pcm(rack, 8, rack->r_ctl.last_pcm_round, rtts_idle, rack->r_ctl.current_round ); 20943 } 20944 } 20945 again: 20946 sendalot = 0; 20947 cts = tcp_get_usecs(&tv); 20948 ms_cts = tcp_tv_to_mssectick(&tv); 20949 tso = 0; 20950 mtu = 0; 20951 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 20952 minseg = segsiz; 20953 if (rack->r_ctl.rc_pace_max_segs == 0) 20954 pace_max_seg = rack->rc_user_set_max_segs * segsiz; 20955 else 20956 pace_max_seg = rack->r_ctl.rc_pace_max_segs; 20957 if (TCPS_HAVEESTABLISHED(tp->t_state) && 20958 (rack->r_ctl.pcm_max_seg == 0)) { 20959 /* 20960 * We set in our first send so we know that the ctf_fixed_maxseg 20961 * has been fully set. If we do it in rack_init() we most likely 20962 * see 512 bytes so we end up at 5120, not desirable. 20963 */ 20964 rack->r_ctl.pcm_max_seg = rc_init_window(rack); 20965 if (rack->r_ctl.pcm_max_seg < (ctf_fixed_maxseg(tp) * 10)) { 20966 /* 20967 * Assure our initial PCM probe is at least 10 MSS. 20968 */ 20969 rack->r_ctl.pcm_max_seg = ctf_fixed_maxseg(tp) * 10; 20970 } 20971 } 20972 if ((rack->r_ctl.pcm_max_seg != 0) && (rack->pcm_needed == 1)) { 20973 uint32_t rw_avail, cwa; 20974 20975 if (tp->snd_wnd > ctf_outstanding(tp)) 20976 rw_avail = tp->snd_wnd - ctf_outstanding(tp); 20977 else 20978 rw_avail = 0; 20979 if (tp->snd_cwnd > ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked)) 20980 cwa = tp->snd_cwnd -ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 20981 else 20982 cwa = 0; 20983 if ((cwa >= rack->r_ctl.pcm_max_seg) && 20984 (rw_avail > rack->r_ctl.pcm_max_seg)) { 20985 /* Raise up the max seg for this trip through */ 20986 pace_max_seg = rack->r_ctl.pcm_max_seg; 20987 /* Disable any fast output */ 20988 rack->r_fast_output = 0; 20989 } 20990 if (rack_verbose_logging) { 20991 rack_log_pcm(rack, 4, 20992 cwa, rack->r_ctl.pcm_max_seg, rw_avail); 20993 } 20994 } 20995 sb_offset = tp->snd_max - tp->snd_una; 20996 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 20997 flags = tcp_outflags[tp->t_state]; 20998 while (rack->rc_free_cnt < rack_free_cache) { 20999 rsm = rack_alloc(rack); 21000 if (rsm == NULL) { 21001 if (hpts_calling) 21002 /* Retry in a ms */ 21003 slot = (1 * HPTS_USEC_IN_MSEC); 21004 so = inp->inp_socket; 21005 sb = &so->so_snd; 21006 goto just_return_nolock; 21007 } 21008 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext); 21009 rack->rc_free_cnt++; 21010 rsm = NULL; 21011 } 21012 sack_rxmit = 0; 21013 len = 0; 21014 rsm = NULL; 21015 if (flags & TH_RST) { 21016 SOCKBUF_LOCK(&inp->inp_socket->so_snd); 21017 so = inp->inp_socket; 21018 sb = &so->so_snd; 21019 goto send; 21020 } 21021 if (rack->r_ctl.rc_resend) { 21022 /* Retransmit timer */ 21023 rsm = rack->r_ctl.rc_resend; 21024 rack->r_ctl.rc_resend = NULL; 21025 len = rsm->r_end - rsm->r_start; 21026 sack_rxmit = 1; 21027 sendalot = 0; 21028 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 21029 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 21030 __func__, __LINE__, 21031 rsm->r_start, tp->snd_una, tp, rack, rsm)); 21032 sb_offset = rsm->r_start - tp->snd_una; 21033 rack_validate_sizes(rack, &len, segsiz, pace_max_seg); 21034 } else if 
(rack->r_collapse_point_valid && 21035 ((rsm = rack_check_collapsed(rack, cts)) != NULL)) { 21036 /* 21037 * If an RSM is returned then enough time has passed 21038 * for us to retransmit it. Move up the collapse point, 21039 * since this rsm has its chance to retransmit now. 21040 */ 21041 tcp_trace_point(rack->rc_tp, TCP_TP_COLLAPSED_RXT); 21042 rack->r_ctl.last_collapse_point = rsm->r_end; 21043 /* Are we done? */ 21044 if (SEQ_GEQ(rack->r_ctl.last_collapse_point, 21045 rack->r_ctl.high_collapse_point)) 21046 rack->r_collapse_point_valid = 0; 21047 sack_rxmit = 1; 21048 /* We are not doing a TLP */ 21049 doing_tlp = 0; 21050 len = rsm->r_end - rsm->r_start; 21051 sb_offset = rsm->r_start - tp->snd_una; 21052 sendalot = 0; 21053 rack_validate_sizes(rack, &len, segsiz, pace_max_seg); 21054 } else if ((rsm = tcp_rack_output(tp, rack, cts)) != NULL) { 21055 /* We have a retransmit that takes precedence */ 21056 if ((!IN_FASTRECOVERY(tp->t_flags)) && 21057 ((rsm->r_flags & RACK_MUST_RXT) == 0) && 21058 ((tp->t_flags & TF_WASFRECOVERY) == 0)) { 21059 /* Enter recovery if not induced by a time-out */ 21060 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__); 21061 } 21062 #ifdef INVARIANTS 21063 if (SEQ_LT(rsm->r_start, tp->snd_una)) { 21064 panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n", 21065 tp, rack, rsm, rsm->r_start, tp->snd_una); 21066 } 21067 #endif 21068 len = rsm->r_end - rsm->r_start; 21069 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 21070 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 21071 __func__, __LINE__, 21072 rsm->r_start, tp->snd_una, tp, rack, rsm)); 21073 sb_offset = rsm->r_start - tp->snd_una; 21074 sendalot = 0; 21075 rack_validate_sizes(rack, &len, segsiz, pace_max_seg); 21076 if (len > 0) { 21077 sack_rxmit = 1; 21078 KMOD_TCPSTAT_INC(tcps_sack_rexmits); 21079 KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes, 21080 min(len, segsiz)); 21081 } 21082 } else if (rack->r_ctl.rc_tlpsend) { 21083 /* Tail loss probe */ 21084 long cwin; 21085 long tlen; 21086 21087 /* 21088 * Check if we can do a TLP with a RACK'd packet 21089 * this can happen if we are not doing the rack 21090 * cheat and we skipped to a TLP and it 21091 * went off. 21092 */ 21093 rsm = rack->r_ctl.rc_tlpsend; 21094 /* We are doing a TLP make sure the flag is preent */ 21095 rsm->r_flags |= RACK_TLP; 21096 rack->r_ctl.rc_tlpsend = NULL; 21097 sack_rxmit = 1; 21098 tlen = rsm->r_end - rsm->r_start; 21099 if (tlen > segsiz) 21100 tlen = segsiz; 21101 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 21102 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 21103 __func__, __LINE__, 21104 rsm->r_start, tp->snd_una, tp, rack, rsm)); 21105 sb_offset = rsm->r_start - tp->snd_una; 21106 cwin = min(tp->snd_wnd, tlen); 21107 len = cwin; 21108 } 21109 if (rack->r_must_retran && 21110 (doing_tlp == 0) && 21111 (SEQ_GT(tp->snd_max, tp->snd_una)) && 21112 (rsm == NULL)) { 21113 /* 21114 * There are two different ways that we 21115 * can get into this block: 21116 * a) This is a non-sack connection, we had a time-out 21117 * and thus r_must_retran was set and everything 21118 * left outstanding as been marked for retransmit. 21119 * b) The MTU of the path shrank, so that everything 21120 * was marked to be retransmitted with the smaller 21121 * mtu and r_must_retran was set. 21122 * 21123 * This means that we expect the sendmap (outstanding) 21124 * to all be marked must. We can use the tmap to 21125 * look at them. 
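 * (The tmap, r_ctl.rc_tmap, holds the sendmap entries that are still
 * outstanding in transmit order, so walking it from the head visits
 * the oldest unacked data first.)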
21126 * 21127 */ 21128 int sendwin, flight; 21129 21130 sendwin = min(tp->snd_wnd, tp->snd_cwnd); 21131 flight = ctf_flight_size(tp, rack->r_ctl.rc_out_at_rto); 21132 if (flight >= sendwin) { 21133 /* 21134 * We can't send yet. 21135 */ 21136 so = inp->inp_socket; 21137 sb = &so->so_snd; 21138 goto just_return_nolock; 21139 } 21140 /* 21141 * This is the case a/b mentioned above. All 21142 * outstanding/not-acked should be marked. 21143 * We can use the tmap to find them. 21144 */ 21145 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 21146 if (rsm == NULL) { 21147 /* TSNH */ 21148 rack->r_must_retran = 0; 21149 rack->r_ctl.rc_out_at_rto = 0; 21150 so = inp->inp_socket; 21151 sb = &so->so_snd; 21152 goto just_return_nolock; 21153 } 21154 if ((rsm->r_flags & RACK_MUST_RXT) == 0) { 21155 /* 21156 * The first one does not have the flag, did we collapse 21157 * further up in our list? 21158 */ 21159 rack->r_must_retran = 0; 21160 rack->r_ctl.rc_out_at_rto = 0; 21161 rsm = NULL; 21162 sack_rxmit = 0; 21163 } else { 21164 sack_rxmit = 1; 21165 len = rsm->r_end - rsm->r_start; 21166 sb_offset = rsm->r_start - tp->snd_una; 21167 sendalot = 0; 21168 if ((rack->full_size_rxt == 0) && 21169 (rack->shape_rxt_to_pacing_min == 0) && 21170 (len >= segsiz)) 21171 len = segsiz; 21172 else if (rack->shape_rxt_to_pacing_min && 21173 rack->gp_ready) { 21174 /* We use pacing min as shaping len req */ 21175 uint32_t maxlen; 21176 21177 maxlen = rack_get_hpts_pacing_min_for_bw(rack, segsiz); 21178 if (len > maxlen) 21179 len = maxlen; 21180 } 21181 /* 21182 * Delay removing the flag RACK_MUST_RXT so 21183 * that the fastpath for retransmit will 21184 * work with this rsm. 21185 */ 21186 } 21187 } 21188 /* 21189 * Enforce a connection sendmap count limit if set 21190 * as long as we are not retransmiting. 21191 */ 21192 if ((rsm == NULL) && 21193 (V_tcp_map_entries_limit > 0) && 21194 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 21195 counter_u64_add(rack_to_alloc_limited, 1); 21196 if (!rack->alloc_limit_reported) { 21197 rack->alloc_limit_reported = 1; 21198 counter_u64_add(rack_alloc_limited_conns, 1); 21199 } 21200 so = inp->inp_socket; 21201 sb = &so->so_snd; 21202 goto just_return_nolock; 21203 } 21204 if (rsm && (rsm->r_flags & RACK_HAS_FIN)) { 21205 /* we are retransmitting the fin */ 21206 len--; 21207 if (len) { 21208 /* 21209 * When retransmitting data do *not* include the 21210 * FIN. This could happen from a TLP probe. 
21211 */ 21212 flags &= ~TH_FIN; 21213 } 21214 } 21215 if (rsm && rack->r_fsb_inited && 21216 rack_use_rsm_rfo && 21217 ((rsm->r_flags & RACK_HAS_FIN) == 0)) { 21218 int ret; 21219 21220 if ((rack->rc_policer_detected == 1) && 21221 (rack->r_ctl.policer_bucket_size > segsiz) && 21222 (rack->r_ctl.policer_bw > 0)) { 21223 /* Check to see if there is room */ 21224 if (rack->r_ctl.current_policer_bucket < len) { 21225 goto skip_fast_output; 21226 } 21227 } 21228 ret = rack_fast_rsm_output(tp, rack, rsm, ts_val, cts, ms_cts, &tv, len, doing_tlp); 21229 if (ret == 0) 21230 return (0); 21231 } 21232 skip_fast_output: 21233 so = inp->inp_socket; 21234 sb = &so->so_snd; 21235 if (do_a_prefetch == 0) { 21236 kern_prefetch(sb, &do_a_prefetch); 21237 do_a_prefetch = 1; 21238 } 21239 #ifdef NETFLIX_SHARED_CWND 21240 if ((tp->t_flags2 & TF2_TCP_SCWND_ALLOWED) && 21241 rack->rack_enable_scwnd) { 21242 /* We are doing cwnd sharing */ 21243 if (rack->gp_ready && 21244 (rack->rack_attempted_scwnd == 0) && 21245 (rack->r_ctl.rc_scw == NULL) && 21246 tp->t_lib) { 21247 /* The pcbid is in, lets make an attempt */ 21248 counter_u64_add(rack_try_scwnd, 1); 21249 rack->rack_attempted_scwnd = 1; 21250 rack->r_ctl.rc_scw = tcp_shared_cwnd_alloc(tp, 21251 &rack->r_ctl.rc_scw_index, 21252 segsiz); 21253 } 21254 if (rack->r_ctl.rc_scw && 21255 (rack->rack_scwnd_is_idle == 1) && 21256 sbavail(&so->so_snd)) { 21257 /* we are no longer out of data */ 21258 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 21259 rack->rack_scwnd_is_idle = 0; 21260 } 21261 if (rack->r_ctl.rc_scw) { 21262 /* First lets update and get the cwnd */ 21263 rack->r_ctl.cwnd_to_use = cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw, 21264 rack->r_ctl.rc_scw_index, 21265 tp->snd_cwnd, tp->snd_wnd, segsiz); 21266 } 21267 } 21268 #endif 21269 /* 21270 * Get standard flags, and add SYN or FIN if requested by 'hidden' 21271 * state flags. 21272 */ 21273 if (tp->t_flags & TF_NEEDFIN) 21274 flags |= TH_FIN; 21275 if (tp->t_flags & TF_NEEDSYN) 21276 flags |= TH_SYN; 21277 if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { 21278 void *end_rsm; 21279 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 21280 if (end_rsm) 21281 kern_prefetch(end_rsm, &prefetch_rsm); 21282 prefetch_rsm = 1; 21283 } 21284 SOCKBUF_LOCK(sb); 21285 if ((sack_rxmit == 0) && 21286 (TCPS_HAVEESTABLISHED(tp->t_state) || 21287 (tp->t_flags & TF_FASTOPEN))) { 21288 /* 21289 * We are not retransmitting (sack_rxmit is 0) so we 21290 * are sending new data. This is always based on snd_max. 21291 * Now in theory snd_max may be equal to snd_una, if so 21292 * then nothing is outstanding and the offset would be 0. 
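 * For example (illustrative numbers only): with snd_una = 1000,
 * snd_max = 4000 and 10000 bytes in the socket buffer, sb_offset is
 * 3000 and at most avail - sb_offset = 7000 new bytes are eligible,
 * further limited by cwnd/rwnd (and by PRR while in recovery).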
21293 */ 21294 uint32_t avail; 21295 21296 avail = sbavail(sb); 21297 if (SEQ_GT(tp->snd_max, tp->snd_una) && avail) 21298 sb_offset = tp->snd_max - tp->snd_una; 21299 else 21300 sb_offset = 0; 21301 if ((IN_FASTRECOVERY(tp->t_flags) == 0) || rack->rack_no_prr) { 21302 if (rack->r_ctl.rc_tlp_new_data) { 21303 /* TLP is forcing out new data */ 21304 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) { 21305 rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset); 21306 } 21307 if ((rack->r_ctl.rc_tlp_new_data + sb_offset) > tp->snd_wnd) { 21308 if (tp->snd_wnd > sb_offset) 21309 len = tp->snd_wnd - sb_offset; 21310 else 21311 len = 0; 21312 } else { 21313 len = rack->r_ctl.rc_tlp_new_data; 21314 } 21315 rack->r_ctl.rc_tlp_new_data = 0; 21316 } else { 21317 len = rack_what_can_we_send(tp, rack, cwnd_to_use, avail, sb_offset); 21318 } 21319 if ((rack->r_ctl.crte == NULL) && 21320 IN_FASTRECOVERY(tp->t_flags) && 21321 (rack->full_size_rxt == 0) && 21322 (rack->shape_rxt_to_pacing_min == 0) && 21323 (len > segsiz)) { 21324 /* 21325 * For prr=off, we need to send only 1 MSS 21326 * at a time. We do this because another sack could 21327 * be arriving that causes us to send retransmits and 21328 * we don't want to be on a long pace due to a larger send 21329 * that keeps us from sending out the retransmit. 21330 */ 21331 len = segsiz; 21332 } else if (rack->shape_rxt_to_pacing_min && 21333 rack->gp_ready) { 21334 /* We use pacing min as shaping len req */ 21335 uint32_t maxlen; 21336 21337 maxlen = rack_get_hpts_pacing_min_for_bw(rack, segsiz); 21338 if (len > maxlen) 21339 len = maxlen; 21340 }/* The else is full_size_rxt is on so send it all */ 21341 } else { 21342 uint32_t outstanding; 21343 /* 21344 * We are inside of a Fast recovery episode, this 21345 * is caused by a SACK or 3 dup acks. At this point 21346 * we have sent all the retransmissions and we rely 21347 * on PRR to dictate what we will send in the form of 21348 * new data. 21349 */ 21350 21351 outstanding = tp->snd_max - tp->snd_una; 21352 if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) { 21353 if (tp->snd_wnd > outstanding) { 21354 len = tp->snd_wnd - outstanding; 21355 /* Check to see if we have the data */ 21356 if ((sb_offset + len) > avail) { 21357 /* It does not all fit */ 21358 if (avail > sb_offset) 21359 len = avail - sb_offset; 21360 else 21361 len = 0; 21362 } 21363 } else { 21364 len = 0; 21365 } 21366 } else if (avail > sb_offset) { 21367 len = avail - sb_offset; 21368 } else { 21369 len = 0; 21370 } 21371 if (len > 0) { 21372 if (len > rack->r_ctl.rc_prr_sndcnt) { 21373 len = rack->r_ctl.rc_prr_sndcnt; 21374 } 21375 if (len > 0) { 21376 sub_from_prr = 1; 21377 } 21378 } 21379 if (len > segsiz) { 21380 /* 21381 * We should never send more than a MSS when 21382 * retransmitting or sending new data in prr 21383 * mode unless the override flag is on. Most 21384 * likely the PRR algorithm is not going to 21385 * let us send a lot as well :-) 21386 */ 21387 if (rack->r_ctl.rc_prr_sendalot == 0) { 21388 len = segsiz; 21389 } 21390 } else if (len < segsiz) { 21391 /* 21392 * Do we send any? The idea here is if the 21393 * send empty's the socket buffer we want to 21394 * do it. However if not then lets just wait 21395 * for our prr_sndcnt to get bigger. 
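 * For example (illustrative numbers only): with segsiz = 1448,
 * prr_sndcnt = 800 and only 600 unsent bytes left in the sb, the
 * 600-byte send drains the buffer and goes out; with 5000 bytes left
 * we send nothing and wait for prr_sndcnt to cover a full segment.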
21396 */ 21397 long leftinsb; 21398 21399 leftinsb = sbavail(sb) - sb_offset; 21400 if (leftinsb > len) { 21401 /* This send does not empty the sb */ 21402 len = 0; 21403 } 21404 } 21405 } 21406 } else if (!TCPS_HAVEESTABLISHED(tp->t_state)) { 21407 /* 21408 * If you have not established 21409 * and are not doing FAST OPEN 21410 * no data please. 21411 */ 21412 if ((sack_rxmit == 0) && 21413 !(tp->t_flags & TF_FASTOPEN)) { 21414 len = 0; 21415 sb_offset = 0; 21416 } 21417 } 21418 if (prefetch_so_done == 0) { 21419 kern_prefetch(so, &prefetch_so_done); 21420 prefetch_so_done = 1; 21421 } 21422 orig_len = len; 21423 if ((rack->rc_policer_detected == 1) && 21424 (rack->r_ctl.policer_bucket_size > segsiz) && 21425 (rack->r_ctl.policer_bw > 0) && 21426 (len > 0)) { 21427 /* 21428 * Ok we believe we have a policer watching 21429 * what we send, can we send len? If not can 21430 * we tune it down to a smaller value? 21431 */ 21432 uint32_t plen, buck_needs; 21433 21434 plen = rack_policer_check_send(rack, len, segsiz, &buck_needs); 21435 if (plen == 0) { 21436 /* 21437 * We are not allowed to send. How long 21438 * do we need to pace for i.e. how long 21439 * before len is available to send? 21440 */ 21441 uint64_t lentime; 21442 21443 lentime = buck_needs; 21444 lentime *= HPTS_USEC_IN_SEC; 21445 lentime /= rack->r_ctl.policer_bw; 21446 slot = (uint32_t)lentime; 21447 tot_len_this_send = 0; 21448 SOCKBUF_UNLOCK(sb); 21449 if (rack_verbose_logging > 0) 21450 policer_detection_log(rack, len, slot, buck_needs, 0, 12); 21451 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); 21452 rack_log_type_just_return(rack, cts, 0, slot, hpts_calling, 0, cwnd_to_use); 21453 goto just_return_clean; 21454 } 21455 if (plen < len) { 21456 sendalot = 0; 21457 len = plen; 21458 } 21459 } 21460 /* 21461 * Lop off SYN bit if it has already been sent. However, if this is 21462 * SYN-SENT state and if segment contains data and if we don't know 21463 * that foreign host supports TAO, suppress sending segment. 21464 */ 21465 if ((flags & TH_SYN) && 21466 SEQ_GT(tp->snd_max, tp->snd_una) && 21467 ((sack_rxmit == 0) && 21468 (tp->t_rxtshift == 0))) { 21469 /* 21470 * When sending additional segments following a TFO SYN|ACK, 21471 * do not include the SYN bit. 21472 */ 21473 if ((tp->t_flags & TF_FASTOPEN) && 21474 (tp->t_state == TCPS_SYN_RECEIVED)) 21475 flags &= ~TH_SYN; 21476 } 21477 /* 21478 * Be careful not to send data and/or FIN on SYN segments. This 21479 * measure is needed to prevent interoperability problems with not 21480 * fully conformant TCP implementations. 
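 * (TCP Fast Open is the deliberate exception: a SYN may carry data
 * when a valid TFO cookie is in play, which is why the surrounding
 * checks special-case TF_FASTOPEN.)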
21481 */ 21482 if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { 21483 len = 0; 21484 flags &= ~TH_FIN; 21485 } 21486 /* 21487 * On TFO sockets, ensure no data is sent in the following cases: 21488 * 21489 * - When retransmitting SYN|ACK on a passively-created socket 21490 * 21491 * - When retransmitting SYN on an actively created socket 21492 * 21493 * - When sending a zero-length cookie (cookie request) on an 21494 * actively created socket 21495 * 21496 * - When the socket is in the CLOSED state (RST is being sent) 21497 */ 21498 if ((tp->t_flags & TF_FASTOPEN) && 21499 (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || 21500 ((tp->t_state == TCPS_SYN_SENT) && 21501 (tp->t_tfo_client_cookie_len == 0)) || 21502 (flags & TH_RST))) { 21503 sack_rxmit = 0; 21504 len = 0; 21505 } 21506 /* Without fast-open there should never be data sent on a SYN */ 21507 if ((flags & TH_SYN) && !(tp->t_flags & TF_FASTOPEN)) { 21508 len = 0; 21509 } 21510 if ((len > segsiz) && (tcp_dsack_block_exists(tp))) { 21511 /* We only send 1 MSS if we have a DSACK block */ 21512 add_flag |= RACK_SENT_W_DSACK; 21513 len = segsiz; 21514 } 21515 if (len <= 0) { 21516 /* 21517 * We have nothing to send, or the window shrank, or 21518 * is closed, do we need to go into persists? 21519 */ 21520 len = 0; 21521 if ((tp->snd_wnd == 0) && 21522 (TCPS_HAVEESTABLISHED(tp->t_state)) && 21523 (tp->snd_una == tp->snd_max) && 21524 (sb_offset < (int)sbavail(sb))) { 21525 rack_enter_persist(tp, rack, cts, tp->snd_una); 21526 } 21527 } else if ((rsm == NULL) && 21528 (doing_tlp == 0) && 21529 (len < pace_max_seg)) { 21530 /* 21531 * We are not sending a maximum sized segment for 21532 * some reason. Should we not send anything (think 21533 * sws or persists)? 21534 */ 21535 if ((tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg)) && 21536 (TCPS_HAVEESTABLISHED(tp->t_state)) && 21537 (len < minseg) && 21538 (len < (int)(sbavail(sb) - sb_offset))) { 21539 /* 21540 * Here the rwnd is less than 21541 * the minimum pacing size, this is not a retransmit, 21542 * we are established and 21543 * the send is not the last in the socket buffer 21544 * we send nothing, and we may enter persists 21545 * if nothing is outstanding. 21546 */ 21547 len = 0; 21548 if (tp->snd_max == tp->snd_una) { 21549 /* 21550 * Nothing out we can 21551 * go into persists. 21552 */ 21553 rack_enter_persist(tp, rack, cts, tp->snd_una); 21554 } 21555 } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) && 21556 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && 21557 (len < (int)(sbavail(sb) - sb_offset)) && 21558 (len < minseg)) { 21559 /* 21560 * Here we are not retransmitting, and 21561 * the cwnd is not so small that we could 21562 * not send at least a min size (rxt timer 21563 * not having gone off), We have 2 segments or 21564 * more already in flight, its not the tail end 21565 * of the socket buffer and the cwnd is blocking 21566 * us from sending out a minimum pacing segment size. 21567 * Lets not send anything. 21568 */ 21569 len = 0; 21570 } else if (((tp->snd_wnd - ctf_outstanding(tp)) < 21571 min((rack->r_ctl.rc_high_rwnd/2), minseg)) && 21572 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && 21573 (len < (int)(sbavail(sb) - sb_offset)) && 21574 (TCPS_HAVEESTABLISHED(tp->t_state))) { 21575 /* 21576 * Here we have a send window but we have 21577 * filled it up and we can't send another pacing segment. 21578 * We also have in flight more than 2 segments 21579 * and we are not completing the sb i.e. 
we allow 21580 * the last bytes of the sb to go out even if 21581 * it's not a full pacing segment. 21582 */ 21583 len = 0; 21584 } else if ((rack->r_ctl.crte != NULL) && 21585 (tp->snd_wnd >= (pace_max_seg * max(1, rack_hw_rwnd_factor))) && 21586 (cwnd_to_use >= (pace_max_seg + (4 * segsiz))) && 21587 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) >= (2 * segsiz)) && 21588 (len < (int)(sbavail(sb) - sb_offset))) { 21589 /* 21590 * Here we are doing hardware pacing; this is not a TLP, 21591 * we are not sending a pace max segment size, there is rwnd 21592 * room to send at least N pace_max_seg, the cwnd is greater 21593 * than or equal to a full pacing segment plus 4 MSS and we have 2 or 21594 * more segments in flight and it's not the tail of the socket buffer. 21595 * 21596 * We don't want to send; instead we need to get more acks in to 21597 * allow us to send a full pacing segment. Normally, if we are pacing 21598 * about the right speed, we should have finished our pacing 21599 * send as most of the acks have come back if we are at the 21600 * right rate. This is a bit fuzzy since return path delay 21601 * can delay the acks, which is why we want to make sure we 21602 * have cwnd space to have a bit more than a max pace segment in flight. 21603 * 21604 * If we have not gotten our acks back we are pacing at too high a 21605 * rate, delaying will not hurt and will bring our GP estimate down by 21606 * injecting the delay. If we don't do this we will send 21607 * 2 MSS out in response to the acks being clocked in which 21608 * defeats the point of hw-pacing (i.e. to help us get 21609 * larger TSO's out). 21610 */ 21611 len = 0; 21612 } 21613 21614 } 21615 /* len will be >= 0 after this point. */ 21616 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 21617 rack_sndbuf_autoscale(rack); 21618 /* 21619 * Decide if we can use TCP Segmentation Offloading (if supported by 21620 * hardware). 21621 * 21622 * TSO may only be used if we are in a pure bulk sending state. The 21623 * presence of TCP-MD5, SACK retransmits, SACK advertisements and IP 21624 * options prevent using TSO. With TSO the TCP header is the same 21625 * (except for the sequence number) for all generated packets. This 21626 * makes it impossible to transmit any options which vary per 21627 * generated segment or packet. 21628 * 21629 * IPv4 handling has a clear separation of ip options and ip header 21630 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does 21631 * the right thing below to provide length of just ip options and thus 21632 * checking for ipoptlen is enough to decide if ip options are present. 21633 */ 21634 ipoptlen = 0; 21635 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 21636 /* 21637 * Pre-calculate here as we save another lookup into the darknesses 21638 * of IPsec that way and can actually decide if TSO is ok.
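 *
 * With ipsec_optlen folded into ipoptlen, the TSO decision made just
 * below boils down to (condensed restatement of the code, not a
 * substitute for it):
 *
 *   tso = (tp->t_flags & TF_TSO) && V_tcp_do_tso &&
 *         (len > segsiz) &&                       // more than one segment
 *         (tp->t_port == 0) &&                    // no UDP tunneling
 *         ((tp->t_flags & TF_SIGNATURE) == 0) &&  // no TCP-MD5
 *         (tp->rcv_numsacks == 0) &&              // no SACK blocks to send
 *         (sack_rxmit == 0) &&                    // not a SACK retransmit
 *         (ipoptlen == 0);                        // no IP or IPsec options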
21639 */ 21640 #ifdef INET6 21641 if (isipv6 && IPSEC_ENABLED(ipv6)) 21642 ipsec_optlen = IPSEC_HDRSIZE(ipv6, inp); 21643 #ifdef INET 21644 else 21645 #endif 21646 #endif /* INET6 */ 21647 #ifdef INET 21648 if (IPSEC_ENABLED(ipv4)) 21649 ipsec_optlen = IPSEC_HDRSIZE(ipv4, inp); 21650 #endif /* INET */ 21651 #endif 21652 21653 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 21654 ipoptlen += ipsec_optlen; 21655 #endif 21656 if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > segsiz && 21657 (tp->t_port == 0) && 21658 ((tp->t_flags & TF_SIGNATURE) == 0) && 21659 tp->rcv_numsacks == 0 && sack_rxmit == 0 && 21660 ipoptlen == 0) 21661 tso = 1; 21662 { 21663 uint32_t outstanding __unused; 21664 21665 outstanding = tp->snd_max - tp->snd_una; 21666 if (tp->t_flags & TF_SENTFIN) { 21667 /* 21668 * If we sent a fin, snd_max is 1 higher than 21669 * snd_una 21670 */ 21671 outstanding--; 21672 } 21673 if (sack_rxmit) { 21674 if ((rsm->r_flags & RACK_HAS_FIN) == 0) 21675 flags &= ~TH_FIN; 21676 } 21677 } 21678 recwin = lmin(lmax(sbspace(&so->so_rcv), 0), 21679 (long)TCP_MAXWIN << tp->rcv_scale); 21680 21681 /* 21682 * Sender silly window avoidance. We transmit under the following 21683 * conditions when len is non-zero: 21684 * 21685 * - We have a full segment (or more with TSO) - This is the last 21686 * buffer in a write()/send() and we are either idle or running 21687 * NODELAY - we've timed out (e.g. persist timer) - we have more 21688 * then 1/2 the maximum send window's worth of data (receiver may be 21689 * limited the window size) - we need to retransmit 21690 */ 21691 if (len) { 21692 if (len >= segsiz) { 21693 goto send; 21694 } 21695 /* 21696 * NOTE! on localhost connections an 'ack' from the remote 21697 * end may occur synchronously with the output and cause us 21698 * to flush a buffer queued with moretocome. XXX 21699 * 21700 */ 21701 if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ 21702 (idle || (tp->t_flags & TF_NODELAY)) && 21703 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && 21704 (tp->t_flags & TF_NOPUSH) == 0) { 21705 pass = 2; 21706 goto send; 21707 } 21708 if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ 21709 pass = 22; 21710 goto send; 21711 } 21712 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { 21713 pass = 4; 21714 goto send; 21715 } 21716 if (sack_rxmit) { 21717 pass = 6; 21718 goto send; 21719 } 21720 if (((tp->snd_wnd - ctf_outstanding(tp)) < segsiz) && 21721 (ctf_outstanding(tp) < (segsiz * 2))) { 21722 /* 21723 * We have less than two MSS outstanding (delayed ack) 21724 * and our rwnd will not let us send a full sized 21725 * MSS. Lets go ahead and let this small segment 21726 * out because we want to try to have at least two 21727 * packets inflight to not be caught by delayed ack. 21728 */ 21729 pass = 12; 21730 goto send; 21731 } 21732 } 21733 /* 21734 * Sending of standalone window updates. 21735 * 21736 * Window updates are important when we close our window due to a 21737 * full socket buffer and are opening it again after the application 21738 * reads data from it. Once the window has opened again and the 21739 * remote end starts to send again the ACK clock takes over and 21740 * provides the most current window information. 21741 * 21742 * We must avoid the silly window syndrome whereas every read from 21743 * the receive buffer, no matter how small, causes a window update 21744 * to be sent. 
We also should avoid sending a flurry of window 21745 * updates when the socket buffer had queued a lot of data and the 21746 * application is doing small reads. 21747 * 21748 * Prevent a flurry of pointless window updates by only sending an 21749 * update when we can increase the advertized window by more than 21750 * 1/4th of the socket buffer capacity. When the buffer is getting 21751 * full or is very small be more aggressive and send an update 21752 * whenever we can increase by two mss sized segments. In all other 21753 * situations the ACK's to new incoming data will carry further 21754 * window increases. 21755 * 21756 * Don't send an independent window update if a delayed ACK is 21757 * pending (it will get piggy-backed on it) or the remote side 21758 * already has done a half-close and won't send more data. Skip 21759 * this if the connection is in T/TCP half-open state. 21760 */ 21761 if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && 21762 !(tp->t_flags & TF_DELACK) && 21763 !TCPS_HAVERCVDFIN(tp->t_state)) { 21764 /* 21765 * "adv" is the amount we could increase the window, taking 21766 * into account that we are limited by TCP_MAXWIN << 21767 * tp->rcv_scale. 21768 */ 21769 int32_t adv; 21770 int oldwin; 21771 21772 adv = recwin; 21773 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { 21774 oldwin = (tp->rcv_adv - tp->rcv_nxt); 21775 if (adv > oldwin) 21776 adv -= oldwin; 21777 else { 21778 /* We can't increase the window */ 21779 adv = 0; 21780 } 21781 } else 21782 oldwin = 0; 21783 21784 /* 21785 * If the new window size ends up being the same as or less 21786 * than the old size when it is scaled, then don't force 21787 * a window update. 21788 */ 21789 if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale) 21790 goto dontupdate; 21791 21792 if (adv >= (int32_t)(2 * segsiz) && 21793 (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || 21794 recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) || 21795 so->so_rcv.sb_hiwat <= 8 * segsiz)) { 21796 pass = 7; 21797 goto send; 21798 } 21799 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) { 21800 pass = 23; 21801 goto send; 21802 } 21803 } 21804 dontupdate: 21805 21806 /* 21807 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW 21808 * is also a catch-all for the retransmit timer timeout case. 21809 */ 21810 if (tp->t_flags & TF_ACKNOW) { 21811 pass = 8; 21812 goto send; 21813 } 21814 if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) { 21815 pass = 9; 21816 goto send; 21817 } 21818 /* 21819 * If our state indicates that FIN should be sent and we have not 21820 * yet done so, then we need to send. 21821 */ 21822 if ((flags & TH_FIN) && 21823 (tp->snd_max == tp->snd_una)) { 21824 pass = 11; 21825 goto send; 21826 } 21827 /* 21828 * No reason to send a segment, just return. 21829 */ 21830 just_return: 21831 SOCKBUF_UNLOCK(sb); 21832 just_return_nolock: 21833 { 21834 int app_limited = CTF_JR_SENT_DATA; 21835 21836 if ((tp->t_flags & TF_FASTOPEN) == 0 && 21837 (flags & TH_FIN) && 21838 (len == 0) && 21839 (sbused(sb) == (tp->snd_max - tp->snd_una)) && 21840 ((tp->snd_max - tp->snd_una) <= segsiz)) { 21841 /* 21842 * Ok less than or right at a MSS is 21843 * outstanding. The original FreeBSD stack would 21844 * have sent a FIN, which can speed things up for 21845 * a transactional application doing a MSG_WAITALL. 21846 * To speed things up since we do *not* send a FIN 21847 * if data is outstanding, we send a "challenge ack". 
21848 * The idea behind that is instead of having to have 21849 * the peer wait for the delayed-ack timer to run off 21850 * we send an ack that makes the peer send us an ack. 21851 */ 21852 rack_send_ack_challange(rack); 21853 } 21854 if (tot_len_this_send > 0) { 21855 rack->r_ctl.fsb.recwin = recwin; 21856 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz, __LINE__); 21857 if ((error == 0) && 21858 (rack->rc_policer_detected == 0) && 21859 rack_use_rfo && 21860 ((flags & (TH_SYN|TH_FIN)) == 0) && 21861 (ipoptlen == 0) && 21862 (tp->rcv_numsacks == 0) && 21863 rack->r_fsb_inited && 21864 TCPS_HAVEESTABLISHED(tp->t_state) && 21865 ((IN_RECOVERY(tp->t_flags)) == 0) && 21866 (rack->r_must_retran == 0) && 21867 ((tp->t_flags & TF_NEEDFIN) == 0) && 21868 (len > 0) && (orig_len > 0) && 21869 (orig_len > len) && 21870 ((orig_len - len) >= segsiz) && 21871 ((optlen == 0) || 21872 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { 21873 /* We can send at least one more MSS using our fsb */ 21874 rack_setup_fast_output(tp, rack, sb, len, orig_len, 21875 segsiz, pace_max_seg, hw_tls, flags); 21876 } else 21877 rack->r_fast_output = 0; 21878 rack_log_fsb(rack, tp, so, flags, 21879 ipoptlen, orig_len, len, 0, 21880 1, optlen, __LINE__, 1); 21881 /* Assure when we leave that snd_nxt will point to top */ 21882 if (SEQ_GT(tp->snd_max, tp->snd_nxt)) 21883 tp->snd_nxt = tp->snd_max; 21884 } else { 21885 int end_window = 0; 21886 uint32_t seq = tp->gput_ack; 21887 21888 rsm = tqhash_max(rack->r_ctl.tqh); 21889 if (rsm) { 21890 /* 21891 * Mark the last sent that we just-returned (hinting 21892 * that delayed ack may play a role in any rtt measurement). 21893 */ 21894 rsm->r_just_ret = 1; 21895 } 21896 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); 21897 rack->r_ctl.rc_agg_delayed = 0; 21898 rack->r_early = 0; 21899 rack->r_late = 0; 21900 rack->r_ctl.rc_agg_early = 0; 21901 if ((ctf_outstanding(tp) + 21902 min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)), 21903 minseg)) >= tp->snd_wnd) { 21904 /* We are limited by the rwnd */ 21905 app_limited = CTF_JR_RWND_LIMITED; 21906 if (IN_FASTRECOVERY(tp->t_flags)) 21907 rack->r_ctl.rc_prr_sndcnt = 0; 21908 } else if (ctf_outstanding(tp) >= sbavail(sb)) { 21909 /* We are limited by whats available -- app limited */ 21910 app_limited = CTF_JR_APP_LIMITED; 21911 if (IN_FASTRECOVERY(tp->t_flags)) 21912 rack->r_ctl.rc_prr_sndcnt = 0; 21913 } else if ((idle == 0) && 21914 ((tp->t_flags & TF_NODELAY) == 0) && 21915 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && 21916 (len < segsiz)) { 21917 /* 21918 * No delay is not on and the 21919 * user is sending less than 1MSS. This 21920 * brings out SWS avoidance so we 21921 * don't send. Another app-limited case. 21922 */ 21923 app_limited = CTF_JR_APP_LIMITED; 21924 } else if (tp->t_flags & TF_NOPUSH) { 21925 /* 21926 * The user has requested no push of 21927 * the last segment and we are 21928 * at the last segment. Another app 21929 * limited case. 21930 */ 21931 app_limited = CTF_JR_APP_LIMITED; 21932 } else if ((ctf_outstanding(tp) + minseg) > cwnd_to_use) { 21933 /* Its the cwnd */ 21934 app_limited = CTF_JR_CWND_LIMITED; 21935 } else if (IN_FASTRECOVERY(tp->t_flags) && 21936 (rack->rack_no_prr == 0) && 21937 (rack->r_ctl.rc_prr_sndcnt < segsiz)) { 21938 app_limited = CTF_JR_PRR; 21939 } else { 21940 /* Now why here are we not sending? 
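 *
 * None of the specific reasons above matched.  Roughly, the
 * classification just attempted was (condensed, not the exact code):
 *
 *   out + min(max(segsiz, high_rwnd/2), minseg) >= snd_wnd  -> RWND_LIMITED
 *   out >= sbavail(sb), SWS avoidance, or TF_NOPUSH         -> APP_LIMITED
 *   out + minseg > cwnd_to_use                              -> CWND_LIMITED
 *   fast recovery with prr_sndcnt < segsiz (PRR enabled)    -> PRR
 *   anything else                                           -> ASSESSING (this case)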
*/ 21941 #ifdef NOW 21942 #ifdef INVARIANTS 21943 panic("rack:%p hit JR_ASSESSING case cwnd_to_use:%u?", rack, cwnd_to_use); 21944 #endif 21945 #endif 21946 app_limited = CTF_JR_ASSESSING; 21947 } 21948 /* 21949 * App limited in some fashion, for our pacing GP 21950 * measurements we don't want any gap (even cwnd). 21951 * Close down the measurement window. 21952 */ 21953 if (rack_cwnd_block_ends_measure && 21954 ((app_limited == CTF_JR_CWND_LIMITED) || 21955 (app_limited == CTF_JR_PRR))) { 21956 /* 21957 * The reason we are not sending is 21958 * the cwnd (or prr). We have been configured 21959 * to end the measurement window in 21960 * this case. 21961 */ 21962 end_window = 1; 21963 } else if (rack_rwnd_block_ends_measure && 21964 (app_limited == CTF_JR_RWND_LIMITED)) { 21965 /* 21966 * We are rwnd limited and have been 21967 * configured to end the measurement 21968 * window in this case. 21969 */ 21970 end_window = 1; 21971 } else if (app_limited == CTF_JR_APP_LIMITED) { 21972 /* 21973 * A true application limited period, we have 21974 * ran out of data. 21975 */ 21976 end_window = 1; 21977 } else if (app_limited == CTF_JR_ASSESSING) { 21978 /* 21979 * In the assessing case we hit the end of 21980 * the if/else and had no known reason 21981 * This will panic us under invariants.. 21982 * 21983 * If we get this out in logs we need to 21984 * investagate which reason we missed. 21985 */ 21986 end_window = 1; 21987 } 21988 if (end_window) { 21989 uint8_t log = 0; 21990 21991 /* Adjust the Gput measurement */ 21992 if ((tp->t_flags & TF_GPUTINPROG) && 21993 SEQ_GT(tp->gput_ack, tp->snd_max)) { 21994 tp->gput_ack = tp->snd_max; 21995 if ((tp->gput_ack - tp->gput_seq) < (MIN_GP_WIN * segsiz)) { 21996 /* 21997 * There is not enough to measure. 21998 */ 21999 tp->t_flags &= ~TF_GPUTINPROG; 22000 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 22001 rack->r_ctl.rc_gp_srtt /*flex1*/, 22002 tp->gput_seq, 22003 0, 0, 18, __LINE__, NULL, 0); 22004 } else 22005 log = 1; 22006 } 22007 /* Mark the last packet has app limited */ 22008 rsm = tqhash_max(rack->r_ctl.tqh); 22009 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { 22010 if (rack->r_ctl.rc_app_limited_cnt == 0) 22011 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; 22012 else { 22013 /* 22014 * Go out to the end app limited and mark 22015 * this new one as next and move the end_appl up 22016 * to this guy. 
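 * (Mechanically, just below: the old rc_end_appl gets its r_nseq_appl
 * set to this rsm's r_start and rc_end_appl is advanced to the rsm,
 * in effect chaining the app-limited marks through the scoreboard;
 * the rsm is then flagged RACK_APP_LIMITED and the count is bumped.)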
22017 */ 22018 if (rack->r_ctl.rc_end_appl) 22019 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; 22020 rack->r_ctl.rc_end_appl = rsm; 22021 } 22022 rsm->r_flags |= RACK_APP_LIMITED; 22023 rack->r_ctl.rc_app_limited_cnt++; 22024 } 22025 if (log) 22026 rack_log_pacing_delay_calc(rack, 22027 rack->r_ctl.rc_app_limited_cnt, seq, 22028 tp->gput_ack, 0, 0, 4, __LINE__, NULL, 0); 22029 } 22030 } 22031 /* Check if we need to go into persists or not */ 22032 if ((tp->snd_max == tp->snd_una) && 22033 TCPS_HAVEESTABLISHED(tp->t_state) && 22034 sbavail(sb) && 22035 (sbavail(sb) > tp->snd_wnd) && 22036 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg))) { 22037 /* Yes lets make sure to move to persist before timer-start */ 22038 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, tp->snd_una); 22039 } 22040 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack); 22041 rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use); 22042 } 22043 just_return_clean: 22044 #ifdef NETFLIX_SHARED_CWND 22045 if ((sbavail(sb) == 0) && 22046 rack->r_ctl.rc_scw) { 22047 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 22048 rack->rack_scwnd_is_idle = 1; 22049 } 22050 #endif 22051 #ifdef TCP_ACCOUNTING 22052 if (tot_len_this_send > 0) { 22053 crtsc = get_cyclecount(); 22054 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22055 tp->tcp_cnt_counters[SND_OUT_DATA]++; 22056 } 22057 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22058 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); 22059 } 22060 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22061 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) / segsiz); 22062 } 22063 } else { 22064 crtsc = get_cyclecount(); 22065 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22066 tp->tcp_cnt_counters[SND_LIMITED]++; 22067 } 22068 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22069 tp->tcp_proc_time[SND_LIMITED] += (crtsc - ts_val); 22070 } 22071 } 22072 sched_unpin(); 22073 #endif 22074 return (0); 22075 22076 send: 22077 if ((rack->r_ctl.crte != NULL) && 22078 (rsm == NULL) && 22079 ((rack->rc_hw_nobuf == 1) || 22080 (rack_hw_check_queue && (check_done == 0)))) { 22081 /* 22082 * We only want to do this once with the hw_check_queue, 22083 * for the enobuf case we would only do it once if 22084 * we come around to again, the flag will be clear. 22085 */ 22086 check_done = 1; 22087 slot = rack_check_queue_level(rack, tp, &tv, cts, len, segsiz); 22088 if (slot) { 22089 rack->r_ctl.rc_agg_delayed = 0; 22090 rack->r_ctl.rc_agg_early = 0; 22091 rack->r_early = 0; 22092 rack->r_late = 0; 22093 SOCKBUF_UNLOCK(&so->so_snd); 22094 goto skip_all_send; 22095 } 22096 } 22097 if (rsm || sack_rxmit) 22098 counter_u64_add(rack_nfto_resend, 1); 22099 else 22100 counter_u64_add(rack_non_fto_send, 1); 22101 if ((flags & TH_FIN) && 22102 sbavail(sb)) { 22103 /* 22104 * We do not transmit a FIN 22105 * with data outstanding. We 22106 * need to make it so all data 22107 * is acked first. 22108 */ 22109 flags &= ~TH_FIN; 22110 if (TCPS_HAVEESTABLISHED(tp->t_state) && 22111 (sbused(sb) == (tp->snd_max - tp->snd_una)) && 22112 ((tp->snd_max - tp->snd_una) <= segsiz)) { 22113 /* 22114 * Ok less than or right at a MSS is 22115 * outstanding. The original FreeBSD stack would 22116 * have sent a FIN, which can speed things up for 22117 * a transactional application doing a MSG_WAITALL. 22118 * To speed things up since we do *not* send a FIN 22119 * if data is outstanding, we send a "challenge ack". 
22120 * The idea behind that is instead of having to have 22121 * the peer wait for the delayed-ack timer to run off 22122 * we send an ack that makes the peer send us an ack. 22123 */ 22124 rack_send_ack_challange(rack); 22125 } 22126 } 22127 /* Enforce stack imposed max seg size if we have one */ 22128 if (pace_max_seg && 22129 (len > pace_max_seg)) { 22130 mark = 1; 22131 len = pace_max_seg; 22132 } 22133 if ((rsm == NULL) && 22134 (rack->pcm_in_progress == 0) && 22135 (rack->r_ctl.pcm_max_seg > 0) && 22136 (len >= rack->r_ctl.pcm_max_seg)) { 22137 /* It is large enough for a measurement */ 22138 add_flag |= RACK_IS_PCM; 22139 rack_log_pcm(rack, 5, len, rack->r_ctl.pcm_max_seg, add_flag); 22140 } else if (rack_verbose_logging) { 22141 rack_log_pcm(rack, 6, len, rack->r_ctl.pcm_max_seg, add_flag); 22142 } 22143 22144 SOCKBUF_LOCK_ASSERT(sb); 22145 if (len > 0) { 22146 if (len >= segsiz) 22147 tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; 22148 else 22149 tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; 22150 } 22151 /* 22152 * Before ESTABLISHED, force sending of initial options unless TCP 22153 * set not to do any options. NOTE: we assume that the IP/TCP header 22154 * plus TCP options always fit in a single mbuf, leaving room for a 22155 * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr) 22156 * + optlen <= MCLBYTES 22157 */ 22158 optlen = 0; 22159 #ifdef INET6 22160 if (isipv6) 22161 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 22162 else 22163 #endif 22164 hdrlen = sizeof(struct tcpiphdr); 22165 22166 /* 22167 * Ok what seq are we sending from. If we have 22168 * no rsm to use, then we look at various bits, 22169 * if we are putting out a SYN it will be ISS. 22170 * If we are retransmitting a FIN it will 22171 * be snd_max-1 else its snd_max. 22172 */ 22173 if (rsm == NULL) { 22174 if (flags & TH_SYN) 22175 rack_seq = tp->iss; 22176 else if ((flags & TH_FIN) && 22177 (tp->t_flags & TF_SENTFIN)) 22178 rack_seq = tp->snd_max - 1; 22179 else 22180 rack_seq = tp->snd_max; 22181 } else { 22182 rack_seq = rsm->r_start; 22183 } 22184 /* 22185 * Compute options for segment. We only have to care about SYN and 22186 * established connection segments. Options for SYN-ACK segments 22187 * are handled in TCP syncache. 22188 */ 22189 to.to_flags = 0; 22190 if ((tp->t_flags & TF_NOOPT) == 0) { 22191 /* Maximum segment size. */ 22192 if (flags & TH_SYN) { 22193 to.to_mss = tcp_mssopt(&inp->inp_inc); 22194 if (tp->t_port) 22195 to.to_mss -= V_tcp_udp_tunneling_overhead; 22196 to.to_flags |= TOF_MSS; 22197 22198 /* 22199 * On SYN or SYN|ACK transmits on TFO connections, 22200 * only include the TFO option if it is not a 22201 * retransmit, as the presence of the TFO option may 22202 * have caused the original SYN or SYN|ACK to have 22203 * been dropped by a middlebox. 22204 */ 22205 if ((tp->t_flags & TF_FASTOPEN) && 22206 (tp->t_rxtshift == 0)) { 22207 if (tp->t_state == TCPS_SYN_RECEIVED) { 22208 to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; 22209 to.to_tfo_cookie = 22210 (u_int8_t *)&tp->t_tfo_cookie.server; 22211 to.to_flags |= TOF_FASTOPEN; 22212 wanted_cookie = 1; 22213 } else if (tp->t_state == TCPS_SYN_SENT) { 22214 to.to_tfo_len = 22215 tp->t_tfo_client_cookie_len; 22216 to.to_tfo_cookie = 22217 tp->t_tfo_cookie.client; 22218 to.to_flags |= TOF_FASTOPEN; 22219 wanted_cookie = 1; 22220 /* 22221 * If we wind up having more data to 22222 * send with the SYN than can fit in 22223 * one segment, don't send any more 22224 * until the SYN|ACK comes back from 22225 * the other end. 
22226 */ 22227 sendalot = 0; 22228 } 22229 } 22230 } 22231 /* Window scaling. */ 22232 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { 22233 to.to_wscale = tp->request_r_scale; 22234 to.to_flags |= TOF_SCALE; 22235 } 22236 /* Timestamps. */ 22237 if ((tp->t_flags & TF_RCVD_TSTMP) || 22238 ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { 22239 uint32_t ts_to_use; 22240 22241 if ((rack->r_rcvpath_rtt_up == 1) && 22242 (ms_cts == rack->r_ctl.last_rcv_tstmp_for_rtt)) { 22243 /* 22244 * When we are doing a rcv_rtt probe all 22245 * other timestamps use the next msec. This 22246 * is safe since our previous ack is in the 22247 * air and we will just have a few more 22248 * on the next ms. This assures that only 22249 * the one ack has the ms_cts that was on 22250 * our ack-probe. 22251 */ 22252 ts_to_use = ms_cts + 1; 22253 } else { 22254 ts_to_use = ms_cts; 22255 } 22256 to.to_tsval = ts_to_use + tp->ts_offset; 22257 to.to_tsecr = tp->ts_recent; 22258 to.to_flags |= TOF_TS; 22259 if ((len == 0) && 22260 (TCPS_HAVEESTABLISHED(tp->t_state)) && 22261 ((ms_cts - rack->r_ctl.last_rcv_tstmp_for_rtt) > RCV_PATH_RTT_MS) && 22262 (tp->snd_una == tp->snd_max) && 22263 (flags & TH_ACK) && 22264 (sbavail(sb) == 0) && 22265 (rack->r_ctl.current_round != 0) && 22266 ((flags & (TH_SYN|TH_FIN)) == 0) && 22267 (rack->r_rcvpath_rtt_up == 0)) { 22268 rack->r_ctl.last_rcv_tstmp_for_rtt = ms_cts; 22269 rack->r_ctl.last_time_of_arm_rcv = cts; 22270 rack->r_rcvpath_rtt_up = 1; 22271 /* Subtract 1 from seq to force a response */ 22272 rack_seq--; 22273 } 22274 } 22275 /* Set receive buffer autosizing timestamp. */ 22276 if (tp->rfbuf_ts == 0 && 22277 (so->so_rcv.sb_flags & SB_AUTOSIZE)) { 22278 tp->rfbuf_ts = ms_cts; 22279 } 22280 /* Selective ACK's. */ 22281 if (tp->t_flags & TF_SACK_PERMIT) { 22282 if (flags & TH_SYN) 22283 to.to_flags |= TOF_SACKPERM; 22284 else if (TCPS_HAVEESTABLISHED(tp->t_state) && 22285 tp->rcv_numsacks > 0) { 22286 to.to_flags |= TOF_SACK; 22287 to.to_nsacks = tp->rcv_numsacks; 22288 to.to_sacks = (u_char *)tp->sackblks; 22289 } 22290 } 22291 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 22292 /* TCP-MD5 (RFC2385). */ 22293 if (tp->t_flags & TF_SIGNATURE) 22294 to.to_flags |= TOF_SIGNATURE; 22295 #endif 22296 22297 /* Processing the options. */ 22298 hdrlen += optlen = tcp_addoptions(&to, opt); 22299 /* 22300 * If we wanted a TFO option to be added, but it was unable 22301 * to fit, ensure no data is sent. 22302 */ 22303 if ((tp->t_flags & TF_FASTOPEN) && wanted_cookie && 22304 !(to.to_flags & TOF_FASTOPEN)) 22305 len = 0; 22306 } 22307 if (tp->t_port) { 22308 if (V_tcp_udp_tunneling_port == 0) { 22309 /* The port was removed?? 
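 * (The global UDP tunneling port, presumably cleared via its sysctl,
 * went to zero while this connection still has t_port set.  The code
 * below unlocks the send buffer, charges the failure to the TCP
 * accounting counters when enabled, and returns EHOSTUNREACH.)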
*/ 22310 SOCKBUF_UNLOCK(&so->so_snd); 22311 #ifdef TCP_ACCOUNTING 22312 crtsc = get_cyclecount(); 22313 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22314 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 22315 } 22316 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22317 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 22318 } 22319 sched_unpin(); 22320 #endif 22321 return (EHOSTUNREACH); 22322 } 22323 hdrlen += sizeof(struct udphdr); 22324 } 22325 #ifdef INET6 22326 if (isipv6) 22327 ipoptlen = ip6_optlen(inp); 22328 else 22329 #endif 22330 if (inp->inp_options) 22331 ipoptlen = inp->inp_options->m_len - 22332 offsetof(struct ipoption, ipopt_list); 22333 else 22334 ipoptlen = 0; 22335 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 22336 ipoptlen += ipsec_optlen; 22337 #endif 22338 22339 /* 22340 * Adjust data length if insertion of options will bump the packet 22341 * length beyond the t_maxseg length. Clear the FIN bit because we 22342 * cut off the tail of the segment. 22343 */ 22344 if (len + optlen + ipoptlen > tp->t_maxseg) { 22345 if (tso) { 22346 uint32_t if_hw_tsomax; 22347 uint32_t moff; 22348 int32_t max_len; 22349 22350 /* extract TSO information */ 22351 if_hw_tsomax = tp->t_tsomax; 22352 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 22353 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 22354 KASSERT(ipoptlen == 0, 22355 ("%s: TSO can't do IP options", __func__)); 22356 22357 /* 22358 * Check if we should limit by maximum payload 22359 * length: 22360 */ 22361 if (if_hw_tsomax != 0) { 22362 /* compute maximum TSO length */ 22363 max_len = (if_hw_tsomax - hdrlen - 22364 max_linkhdr); 22365 if (max_len <= 0) { 22366 len = 0; 22367 } else if (len > max_len) { 22368 sendalot = 1; 22369 len = max_len; 22370 mark = 2; 22371 } 22372 } 22373 /* 22374 * Prevent the last segment from being fractional 22375 * unless the send sockbuf can be emptied: 22376 */ 22377 max_len = (tp->t_maxseg - optlen); 22378 if ((sb_offset + len) < sbavail(sb)) { 22379 moff = len % (u_int)max_len; 22380 if (moff != 0) { 22381 mark = 3; 22382 len -= moff; 22383 } 22384 } 22385 /* 22386 * In case there are too many small fragments don't 22387 * use TSO: 22388 */ 22389 if (len <= max_len) { 22390 mark = 4; 22391 tso = 0; 22392 } 22393 /* 22394 * Send the FIN in a separate segment after the bulk 22395 * sending is done. We don't trust the TSO 22396 * implementations to clear the FIN flag on all but 22397 * the last segment. 22398 */ 22399 if (tp->t_flags & TF_NEEDFIN) { 22400 sendalot = 4; 22401 } 22402 } else { 22403 mark = 5; 22404 if (optlen + ipoptlen >= tp->t_maxseg) { 22405 /* 22406 * Since we don't have enough space to put 22407 * the IP header chain and the TCP header in 22408 * one packet as required by RFC 7112, don't 22409 * send it. Also ensure that at least one 22410 * byte of the payload can be put into the 22411 * TCP segment. 22412 */ 22413 SOCKBUF_UNLOCK(&so->so_snd); 22414 error = EMSGSIZE; 22415 sack_rxmit = 0; 22416 goto out; 22417 } 22418 len = tp->t_maxseg - optlen - ipoptlen; 22419 sendalot = 5; 22420 } 22421 } else { 22422 tso = 0; 22423 mark = 6; 22424 } 22425 KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, 22426 ("%s: len > IP_MAXPACKET", __func__)); 22427 #ifdef DIAGNOSTIC 22428 #ifdef INET6 22429 if (max_linkhdr + hdrlen > MCLBYTES) 22430 #else 22431 if (max_linkhdr + hdrlen > MHLEN) 22432 #endif 22433 panic("tcphdr too big"); 22434 #endif 22435 22436 /* 22437 * This KASSERT is here to catch edge cases at a well defined place. 22438 * Before, those had triggered (random) panic conditions further 22439 * down. 
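 *
 * A worked pass through the TSO trimming above, with illustrative
 * numbers only: if_hw_tsomax = 65535, hdrlen = 52 (IPv4 + TCP +
 * timestamps) and max_linkhdr = 16 give max_len = 65467, so a larger
 * len is cut to 65467 and sendalot is set.  With t_maxseg = 1460 and
 * optlen = 12 the second pass uses max_len = 1448; if more data
 * remains in the sockbuf, the 65467 % 1448 = 307 byte tail is dropped,
 * leaving 65160 bytes (45 full segments) for this TSO burst.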
22440 */ 22441 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 22442 if ((len == 0) && 22443 (flags & TH_FIN) && 22444 (sbused(sb))) { 22445 /* 22446 * We have outstanding data, don't send a fin by itself!. 22447 * 22448 * Check to see if we need to send a challenge ack. 22449 */ 22450 if ((sbused(sb) == (tp->snd_max - tp->snd_una)) && 22451 ((tp->snd_max - tp->snd_una) <= segsiz)) { 22452 /* 22453 * Ok less than or right at a MSS is 22454 * outstanding. The original FreeBSD stack would 22455 * have sent a FIN, which can speed things up for 22456 * a transactional application doing a MSG_WAITALL. 22457 * To speed things up since we do *not* send a FIN 22458 * if data is outstanding, we send a "challenge ack". 22459 * The idea behind that is instead of having to have 22460 * the peer wait for the delayed-ack timer to run off 22461 * we send an ack that makes the peer send us an ack. 22462 */ 22463 rack_send_ack_challange(rack); 22464 } 22465 goto just_return; 22466 } 22467 /* 22468 * Grab a header mbuf, attaching a copy of data to be transmitted, 22469 * and initialize the header from the template for sends on this 22470 * connection. 22471 */ 22472 hw_tls = tp->t_nic_ktls_xmit != 0; 22473 if (len) { 22474 uint32_t max_val; 22475 uint32_t moff; 22476 22477 if (pace_max_seg) 22478 max_val = pace_max_seg; 22479 else 22480 max_val = len; 22481 /* 22482 * We allow a limit on sending with hptsi. 22483 */ 22484 if (len > max_val) { 22485 mark = 7; 22486 len = max_val; 22487 } 22488 #ifdef INET6 22489 if (MHLEN < hdrlen + max_linkhdr) 22490 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 22491 else 22492 #endif 22493 m = m_gethdr(M_NOWAIT, MT_DATA); 22494 22495 if (m == NULL) { 22496 SOCKBUF_UNLOCK(sb); 22497 error = ENOBUFS; 22498 sack_rxmit = 0; 22499 goto out; 22500 } 22501 m->m_data += max_linkhdr; 22502 m->m_len = hdrlen; 22503 22504 /* 22505 * Start the m_copy functions from the closest mbuf to the 22506 * sb_offset in the socket buffer chain. 22507 */ 22508 mb = sbsndptr_noadv(sb, sb_offset, &moff); 22509 s_mb = mb; 22510 s_moff = moff; 22511 if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) { 22512 m_copydata(mb, moff, (int)len, 22513 mtod(m, caddr_t)+hdrlen); 22514 /* 22515 * If we are not retransmitting advance the 22516 * sndptr to help remember the next place in 22517 * the sb. 22518 */ 22519 if (rsm == NULL) 22520 sbsndptr_adv(sb, mb, len); 22521 m->m_len += len; 22522 } else { 22523 struct sockbuf *msb; 22524 22525 /* 22526 * If we are not retransmitting pass in msb so 22527 * the socket buffer can be advanced. Otherwise 22528 * set it to NULL if its a retransmission since 22529 * we don't want to change the sb remembered 22530 * location. 22531 */ 22532 if (rsm == NULL) 22533 msb = sb; 22534 else 22535 msb = NULL; 22536 m->m_next = tcp_m_copym( 22537 mb, moff, &len, 22538 if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, 22539 ((rsm == NULL) ? hw_tls : 0) 22540 #ifdef NETFLIX_COPY_ARGS 22541 , &s_mb, &s_moff 22542 #endif 22543 ); 22544 if (len <= (tp->t_maxseg - optlen)) { 22545 /* 22546 * Must have ran out of mbufs for the copy 22547 * shorten it to no longer need tso. Lets 22548 * not put on sendalot since we are low on 22549 * mbufs. 
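 * (That is, tcp_m_copym() handed back a len of at most
 * t_maxseg - optlen, so the send now fits in one ordinary segment;
 * tso is simply cleared just below instead of looping for more.)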
22550 */ 22551 tso = 0; 22552 } 22553 if (m->m_next == NULL) { 22554 SOCKBUF_UNLOCK(sb); 22555 (void)m_free(m); 22556 error = ENOBUFS; 22557 sack_rxmit = 0; 22558 goto out; 22559 } 22560 } 22561 if (sack_rxmit) { 22562 if (rsm && (rsm->r_flags & RACK_TLP)) { 22563 /* 22564 * TLP should not count in retran count, but 22565 * in its own bin 22566 */ 22567 counter_u64_add(rack_tlp_retran, 1); 22568 counter_u64_add(rack_tlp_retran_bytes, len); 22569 } else { 22570 tp->t_sndrexmitpack++; 22571 KMOD_TCPSTAT_INC(tcps_sndrexmitpack); 22572 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len); 22573 } 22574 #ifdef STATS 22575 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 22576 len); 22577 #endif 22578 } else { 22579 KMOD_TCPSTAT_INC(tcps_sndpack); 22580 KMOD_TCPSTAT_ADD(tcps_sndbyte, len); 22581 #ifdef STATS 22582 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 22583 len); 22584 #endif 22585 } 22586 /* 22587 * If we're sending everything we've got, set PUSH. (This 22588 * will keep happy those implementations which only give 22589 * data to the user when a buffer fills or a PUSH comes in.) 22590 */ 22591 if (sb_offset + len == sbused(sb) && 22592 sbused(sb) && 22593 !(flags & TH_SYN)) { 22594 flags |= TH_PUSH; 22595 add_flag |= RACK_HAD_PUSH; 22596 } 22597 22598 SOCKBUF_UNLOCK(sb); 22599 } else { 22600 SOCKBUF_UNLOCK(sb); 22601 if (tp->t_flags & TF_ACKNOW) 22602 KMOD_TCPSTAT_INC(tcps_sndacks); 22603 else if (flags & (TH_SYN | TH_FIN | TH_RST)) 22604 KMOD_TCPSTAT_INC(tcps_sndctrl); 22605 else 22606 KMOD_TCPSTAT_INC(tcps_sndwinup); 22607 22608 m = m_gethdr(M_NOWAIT, MT_DATA); 22609 if (m == NULL) { 22610 error = ENOBUFS; 22611 sack_rxmit = 0; 22612 goto out; 22613 } 22614 #ifdef INET6 22615 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && 22616 MHLEN >= hdrlen) { 22617 M_ALIGN(m, hdrlen); 22618 } else 22619 #endif 22620 m->m_data += max_linkhdr; 22621 m->m_len = hdrlen; 22622 } 22623 SOCKBUF_UNLOCK_ASSERT(sb); 22624 m->m_pkthdr.rcvif = (struct ifnet *)0; 22625 #ifdef MAC 22626 mac_inpcb_create_mbuf(inp, m); 22627 #endif 22628 if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) { 22629 #ifdef INET6 22630 if (isipv6) 22631 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 22632 else 22633 #endif /* INET6 */ 22634 #ifdef INET 22635 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 22636 #endif 22637 th = rack->r_ctl.fsb.th; 22638 udp = rack->r_ctl.fsb.udp; 22639 if (udp) { 22640 #ifdef INET6 22641 if (isipv6) 22642 ulen = hdrlen + len - sizeof(struct ip6_hdr); 22643 else 22644 #endif /* INET6 */ 22645 ulen = hdrlen + len - sizeof(struct ip); 22646 udp->uh_ulen = htons(ulen); 22647 } 22648 } else { 22649 #ifdef INET6 22650 if (isipv6) { 22651 ip6 = mtod(m, struct ip6_hdr *); 22652 if (tp->t_port) { 22653 udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr)); 22654 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 22655 udp->uh_dport = tp->t_port; 22656 ulen = hdrlen + len - sizeof(struct ip6_hdr); 22657 udp->uh_ulen = htons(ulen); 22658 th = (struct tcphdr *)(udp + 1); 22659 } else 22660 th = (struct tcphdr *)(ip6 + 1); 22661 tcpip_fillheaders(inp, tp->t_port, ip6, th); 22662 } else 22663 #endif /* INET6 */ 22664 { 22665 #ifdef INET 22666 ip = mtod(m, struct ip *); 22667 if (tp->t_port) { 22668 udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip)); 22669 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 22670 udp->uh_dport = tp->t_port; 22671 ulen = hdrlen + len - sizeof(struct ip); 22672 udp->uh_ulen = htons(ulen); 22673 th = (struct tcphdr *)(udp + 1); 22674 } else 22675 
th = (struct tcphdr *)(ip + 1); 22676 tcpip_fillheaders(inp, tp->t_port, ip, th); 22677 #endif 22678 } 22679 } 22680 /* 22681 * If we are starting a connection, send ECN setup SYN packet. If we 22682 * are on a retransmit, we may resend those bits a number of times 22683 * as per RFC 3168. 22684 */ 22685 if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) { 22686 flags |= tcp_ecn_output_syn_sent(tp); 22687 } 22688 /* Also handle parallel SYN for ECN */ 22689 if (TCPS_HAVERCVDSYN(tp->t_state) && 22690 (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { 22691 int ect = tcp_ecn_output_established(tp, &flags, len, sack_rxmit); 22692 if ((tp->t_state == TCPS_SYN_RECEIVED) && 22693 (tp->t_flags2 & TF2_ECN_SND_ECE)) 22694 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 22695 #ifdef INET6 22696 if (isipv6) { 22697 ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20); 22698 ip6->ip6_flow |= htonl(ect << 20); 22699 } 22700 else 22701 #endif 22702 { 22703 #ifdef INET 22704 ip->ip_tos &= ~IPTOS_ECN_MASK; 22705 ip->ip_tos |= ect; 22706 #endif 22707 } 22708 } 22709 th->th_seq = htonl(rack_seq); 22710 th->th_ack = htonl(tp->rcv_nxt); 22711 tcp_set_flags(th, flags); 22712 /* 22713 * Calculate receive window. Don't shrink window, but avoid silly 22714 * window syndrome. 22715 * If a RST segment is sent, advertise a window of zero. 22716 */ 22717 if (flags & TH_RST) { 22718 recwin = 0; 22719 } else { 22720 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && 22721 recwin < (long)segsiz) { 22722 recwin = 0; 22723 } 22724 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && 22725 recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) 22726 recwin = (long)(tp->rcv_adv - tp->rcv_nxt); 22727 } 22728 22729 /* 22730 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or 22731 * <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> case is 22732 * handled in syncache. 22733 */ 22734 if (flags & TH_SYN) 22735 th->th_win = htons((u_short) 22736 (min(sbspace(&so->so_rcv), TCP_MAXWIN))); 22737 else { 22738 /* Avoid shrinking window with window scaling. */ 22739 recwin = roundup2(recwin, 1 << tp->rcv_scale); 22740 th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); 22741 } 22742 /* 22743 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 22744 * window. This may cause the remote transmitter to stall. This 22745 * flag tells soreceive() to disable delayed acknowledgements when 22746 * draining the buffer. This can occur if the receiver is 22747 * attempting to read more data than can be buffered prior to 22748 * transmitting on the connection. 22749 */ 22750 if (th->th_win == 0) { 22751 tp->t_sndzerowin++; 22752 tp->t_flags |= TF_RXWIN0SENT; 22753 } else 22754 tp->t_flags &= ~TF_RXWIN0SENT; 22755 tp->snd_up = tp->snd_una; /* drag it along, its deprecated */ 22756 /* Now are we using fsb?, if so copy the template data to the mbuf */ 22757 if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) { 22758 uint8_t *cpto; 22759 22760 cpto = mtod(m, uint8_t *); 22761 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); 22762 /* 22763 * We have just copied in: 22764 * IP/IP6 22765 * <optional udphdr> 22766 * tcphdr (no options) 22767 * 22768 * We need to grab the correct pointers into the mbuf 22769 * for both the tcp header, and possibly the udp header (if tunneling). 22770 * We do this by using the offset in the copy buffer and adding it 22771 * to the mbuf base pointer (cpto). 
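 * As an illustrative layout (not taken from the code): for a plain
 * IPv4 connection the template is a 20 byte IP header followed by the
 * TCP header, so fsb.th - fsb.tcp_ip_hdr is 20 and th lands at
 * cpto + 20; with UDP tunneling an 8 byte udphdr sits in between,
 * putting udp at cpto + 20 and th at cpto + 28.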
22772 */ 22773 #ifdef INET6 22774 if (isipv6) 22775 ip6 = mtod(m, struct ip6_hdr *); 22776 else 22777 #endif /* INET6 */ 22778 #ifdef INET 22779 ip = mtod(m, struct ip *); 22780 #endif 22781 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr)); 22782 /* If we have a udp header lets set it into the mbuf as well */ 22783 if (udp) 22784 udp = (struct udphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.udp - rack->r_ctl.fsb.tcp_ip_hdr)); 22785 } 22786 if (optlen) { 22787 bcopy(opt, th + 1, optlen); 22788 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 22789 } 22790 /* 22791 * Put TCP length in extended header, and then checksum extended 22792 * header and data. 22793 */ 22794 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 22795 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 22796 if (to.to_flags & TOF_SIGNATURE) { 22797 /* 22798 * Calculate MD5 signature and put it into the place 22799 * determined before. 22800 * NOTE: since TCP options buffer doesn't point into 22801 * mbuf's data, calculate offset and use it. 22802 */ 22803 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, 22804 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { 22805 /* 22806 * Do not send segment if the calculation of MD5 22807 * digest has failed. 22808 */ 22809 goto out; 22810 } 22811 } 22812 #endif 22813 #ifdef INET6 22814 if (isipv6) { 22815 /* 22816 * ip6_plen is not need to be filled now, and will be filled 22817 * in ip6_output. 22818 */ 22819 if (tp->t_port) { 22820 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 22821 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 22822 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 22823 th->th_sum = htons(0); 22824 UDPSTAT_INC(udps_opackets); 22825 } else { 22826 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 22827 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 22828 th->th_sum = in6_cksum_pseudo(ip6, 22829 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 22830 0); 22831 } 22832 } 22833 #endif 22834 #if defined(INET6) && defined(INET) 22835 else 22836 #endif 22837 #ifdef INET 22838 { 22839 if (tp->t_port) { 22840 m->m_pkthdr.csum_flags = CSUM_UDP; 22841 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 22842 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 22843 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 22844 th->th_sum = htons(0); 22845 UDPSTAT_INC(udps_opackets); 22846 } else { 22847 m->m_pkthdr.csum_flags = CSUM_TCP; 22848 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 22849 th->th_sum = in_pseudo(ip->ip_src.s_addr, 22850 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 22851 IPPROTO_TCP + len + optlen)); 22852 } 22853 /* IP version must be set here for ipv4/ipv6 checking later */ 22854 KASSERT(ip->ip_v == IPVERSION, 22855 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 22856 } 22857 #endif 22858 /* 22859 * Enable TSO and specify the size of the segments. The TCP pseudo 22860 * header checksum is always provided. XXX: Fixme: This is currently 22861 * not the case for IPv6. 22862 */ 22863 if (tso) { 22864 /* 22865 * Here we must use t_maxseg and the optlen since 22866 * the optlen may include SACK's (or DSACK). 
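 * For example (illustrative numbers): with t_maxseg = 1460, a 12 byte
 * timestamp option and one SACK/DSACK block (10 bytes plus 2 bytes of
 * padding) give optlen = 24, so tso_segsz below becomes 1436 and the
 * hardware slices the payload on 1436 byte boundaries.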
22867 */ 22868 KASSERT(len > tp->t_maxseg - optlen, 22869 ("%s: len <= tso_segsz", __func__)); 22870 m->m_pkthdr.csum_flags |= CSUM_TSO; 22871 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; 22872 } 22873 KASSERT(len + hdrlen == m_length(m, NULL), 22874 ("%s: mbuf chain different than expected: %d + %u != %u", 22875 __func__, len, hdrlen, m_length(m, NULL))); 22876 22877 #ifdef TCP_HHOOK 22878 /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ 22879 hhook_run_tcp_est_out(tp, th, &to, len, tso); 22880 #endif 22881 if ((rack->r_ctl.crte != NULL) && 22882 (rack->rc_hw_nobuf == 0) && 22883 tcp_bblogging_on(tp)) { 22884 rack_log_queue_level(tp, rack, len, &tv, cts); 22885 } 22886 /* We're getting ready to send; log now. */ 22887 if (tcp_bblogging_on(rack->rc_tp)) { 22888 union tcp_log_stackspecific log; 22889 22890 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 22891 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 22892 if (rack->rack_no_prr) 22893 log.u_bbr.flex1 = 0; 22894 else 22895 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 22896 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 22897 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 22898 log.u_bbr.flex4 = orig_len; 22899 /* Save off the early/late values */ 22900 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 22901 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 22902 log.u_bbr.bw_inuse = rack_get_bw(rack); 22903 log.u_bbr.cur_del_rate = rack->r_ctl.gp_bw; 22904 log.u_bbr.flex8 = 0; 22905 if (rsm) { 22906 if (rsm->r_flags & RACK_RWND_COLLAPSED) { 22907 rack_log_collapse(rack, rsm->r_start, rsm->r_end, 0, __LINE__, 5, rsm->r_flags, rsm); 22908 counter_u64_add(rack_collapsed_win_rxt, 1); 22909 counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start)); 22910 } 22911 if (doing_tlp) 22912 log.u_bbr.flex8 = 2; 22913 else 22914 log.u_bbr.flex8 = 1; 22915 } else { 22916 if (doing_tlp) 22917 log.u_bbr.flex8 = 3; 22918 } 22919 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); 22920 log.u_bbr.flex7 = mark; 22921 log.u_bbr.flex7 <<= 8; 22922 log.u_bbr.flex7 |= pass; 22923 log.u_bbr.pkts_out = tp->t_maxseg; 22924 log.u_bbr.timeStamp = cts; 22925 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 22926 if (rsm && (rsm->r_rtr_cnt > 0)) { 22927 /* 22928 * When we have a retransmit we want to log the 22929 * burst at send and flight at send from before. 22930 */ 22931 log.u_bbr.flex5 = rsm->r_fas; 22932 log.u_bbr.bbr_substate = rsm->r_bas; 22933 } else { 22934 /* 22935 * New transmits we log in flex5 the inflight again as 22936 * well as the number of segments in our send in the 22937 * substate field. 22938 */ 22939 log.u_bbr.flex5 = log.u_bbr.inflight; 22940 log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz); 22941 } 22942 log.u_bbr.lt_epoch = cwnd_to_use; 22943 log.u_bbr.delivered = sendalot; 22944 log.u_bbr.rttProp = (uint64_t)rsm; 22945 log.u_bbr.pkt_epoch = __LINE__; 22946 if (rsm) { 22947 log.u_bbr.delRate = rsm->r_flags; 22948 log.u_bbr.delRate <<= 31; 22949 log.u_bbr.delRate |= rack->r_must_retran; 22950 log.u_bbr.delRate <<= 1; 22951 log.u_bbr.delRate |= (sack_rxmit & 0x00000001); 22952 } else { 22953 log.u_bbr.delRate = rack->r_must_retran; 22954 log.u_bbr.delRate <<= 1; 22955 log.u_bbr.delRate |= (sack_rxmit & 0x00000001); 22956 } 22957 lgb = tcp_log_event(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, 22958 len, &log, false, NULL, __func__, __LINE__, &tv); 22959 } else 22960 lgb = NULL; 22961 22962 /* 22963 * Fill in IP length and desired time to live and send to IP level. 
22964 * There should be a better way to handle ttl and tos; we could keep 22965 * them in the template, but need a way to checksum without them. 22966 */ 22967 /* 22968 * m->m_pkthdr.len should have been set before cksum calcuration, 22969 * because in6_cksum() need it. 22970 */ 22971 #ifdef INET6 22972 if (isipv6) { 22973 /* 22974 * we separately set hoplimit for every segment, since the 22975 * user might want to change the value via setsockopt. Also, 22976 * desired default hop limit might be changed via Neighbor 22977 * Discovery. 22978 */ 22979 rack->r_ctl.fsb.hoplimit = ip6->ip6_hlim = in6_selecthlim(inp, NULL); 22980 22981 /* 22982 * Set the packet size here for the benefit of DTrace 22983 * probes. ip6_output() will set it properly; it's supposed 22984 * to include the option header lengths as well. 22985 */ 22986 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 22987 22988 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 22989 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 22990 else 22991 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 22992 22993 if (tp->t_state == TCPS_SYN_SENT) 22994 TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); 22995 22996 TCP_PROBE5(send, NULL, tp, ip6, tp, th); 22997 /* TODO: IPv6 IP6TOS_ECT bit on */ 22998 error = ip6_output(m, 22999 inp->in6p_outputopts, 23000 &inp->inp_route6, 23001 ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 23002 NULL, NULL, inp); 23003 23004 if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL) 23005 mtu = inp->inp_route6.ro_nh->nh_mtu; 23006 } 23007 #endif /* INET6 */ 23008 #if defined(INET) && defined(INET6) 23009 else 23010 #endif 23011 #ifdef INET 23012 { 23013 ip->ip_len = htons(m->m_pkthdr.len); 23014 #ifdef INET6 23015 if (inp->inp_vflag & INP_IPV6PROTO) 23016 ip->ip_ttl = in6_selecthlim(inp, NULL); 23017 #endif /* INET6 */ 23018 rack->r_ctl.fsb.hoplimit = ip->ip_ttl; 23019 /* 23020 * If we do path MTU discovery, then we set DF on every 23021 * packet. This might not be the best thing to do according 23022 * to RFC3390 Section 2. However the tcp hostcache migitates 23023 * the problem so it affects only the first tcp connection 23024 * with a host. 23025 * 23026 * NB: Don't set DF on small MTU/MSS to have a safe 23027 * fallback. 23028 */ 23029 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 23030 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 23031 if (tp->t_port == 0 || len < V_tcp_minmss) { 23032 ip->ip_off |= htons(IP_DF); 23033 } 23034 } else { 23035 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 23036 } 23037 23038 if (tp->t_state == TCPS_SYN_SENT) 23039 TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); 23040 23041 TCP_PROBE5(send, NULL, tp, ip, tp, th); 23042 23043 error = ip_output(m, 23044 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 23045 inp->inp_options, 23046 #else 23047 NULL, 23048 #endif 23049 &inp->inp_route, 23050 ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0, 23051 inp); 23052 if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL) 23053 mtu = inp->inp_route.ro_nh->nh_mtu; 23054 } 23055 #endif /* INET */ 23056 if (lgb) { 23057 lgb->tlb_errno = error; 23058 lgb = NULL; 23059 } 23060 23061 out: 23062 /* 23063 * In transmit state, time the transmission and arrange for the 23064 * retransmit. In persist state, just set snd_max. 
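 * (As a side effect of a successful send, the long-term bandwidth
 * sample below is maintained roughly as lt_bw_bytes += snd_una - lt_seq
 * and lt_bw_time += now - lt_timemark, folded in before the covered
 * sequence span can reach 2^31, so that lt_bw works out to about
 * lt_bw_bytes / lt_bw_time; the exact bookkeeping is in the
 * error == 0 path that follows.)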
23065 */ 23066 rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, 23067 rack_to_usec_ts(&tv), 23068 rsm, add_flag, s_mb, s_moff, hw_tls, segsiz); 23069 if (error == 0) { 23070 if (add_flag & RACK_IS_PCM) { 23071 /* We just launched a PCM */ 23072 /* rrs here log */ 23073 rack->pcm_in_progress = 1; 23074 rack->pcm_needed = 0; 23075 rack_log_pcm(rack, 7, len, rack->r_ctl.pcm_max_seg, add_flag); 23076 } 23077 if (rsm == NULL) { 23078 if (rack->lt_bw_up == 0) { 23079 rack->r_ctl.lt_timemark = tcp_tv_to_lusectick(&tv); 23080 rack->r_ctl.lt_seq = tp->snd_una; 23081 rack->lt_bw_up = 1; 23082 } else if (((rack_seq + len) - rack->r_ctl.lt_seq) > 0x7fffffff) { 23083 /* 23084 * Need to record what we have since we are 23085 * approaching seq wrap. 23086 */ 23087 uint64_t tmark; 23088 23089 rack->r_ctl.lt_bw_bytes += (tp->snd_una - rack->r_ctl.lt_seq); 23090 rack->r_ctl.lt_seq = tp->snd_una; 23091 tmark = tcp_get_u64_usecs(&tv); 23092 if (tmark > rack->r_ctl.lt_timemark) { 23093 rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark); 23094 rack->r_ctl.lt_timemark = tmark; 23095 } 23096 } 23097 } 23098 rack->forced_ack = 0; /* If we send something zap the FA flag */ 23099 counter_u64_add(rack_total_bytes, len); 23100 tcp_account_for_send(tp, len, (rsm != NULL), doing_tlp, hw_tls); 23101 if (rsm && doing_tlp) { 23102 rack->rc_last_sent_tlp_past_cumack = 0; 23103 rack->rc_last_sent_tlp_seq_valid = 1; 23104 rack->r_ctl.last_sent_tlp_seq = rsm->r_start; 23105 rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start; 23106 } 23107 if (rack->rc_hw_nobuf) { 23108 rack->rc_hw_nobuf = 0; 23109 rack->r_ctl.rc_agg_delayed = 0; 23110 rack->r_early = 0; 23111 rack->r_late = 0; 23112 rack->r_ctl.rc_agg_early = 0; 23113 } 23114 if (rsm && (doing_tlp == 0)) { 23115 /* Set we retransmitted */ 23116 rack->rc_gp_saw_rec = 1; 23117 } else { 23118 if (cwnd_to_use > tp->snd_ssthresh) { 23119 /* Set we sent in CA */ 23120 rack->rc_gp_saw_ca = 1; 23121 } else { 23122 /* Set we sent in SS */ 23123 rack->rc_gp_saw_ss = 1; 23124 } 23125 } 23126 if (TCPS_HAVEESTABLISHED(tp->t_state) && 23127 (tp->t_flags & TF_SACK_PERMIT) && 23128 tp->rcv_numsacks > 0) 23129 tcp_clean_dsack_blocks(tp); 23130 tot_len_this_send += len; 23131 if (len == 0) { 23132 counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); 23133 } else { 23134 int idx; 23135 23136 idx = (len / segsiz) + 3; 23137 if (idx >= TCP_MSS_ACCT_ATIMER) 23138 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 23139 else 23140 counter_u64_add(rack_out_size[idx], 1); 23141 } 23142 } 23143 if ((rack->rack_no_prr == 0) && 23144 sub_from_prr && 23145 (error == 0)) { 23146 if (rack->r_ctl.rc_prr_sndcnt >= len) 23147 rack->r_ctl.rc_prr_sndcnt -= len; 23148 else 23149 rack->r_ctl.rc_prr_sndcnt = 0; 23150 } 23151 sub_from_prr = 0; 23152 if (doing_tlp) { 23153 /* Make sure the TLP is added */ 23154 add_flag |= RACK_TLP; 23155 } else if (rsm) { 23156 /* If its a resend without TLP then it must not have the flag */ 23157 rsm->r_flags &= ~RACK_TLP; 23158 } 23159 23160 23161 if ((error == 0) && 23162 (len > 0) && 23163 (tp->snd_una == tp->snd_max)) 23164 rack->r_ctl.rc_tlp_rxt_last_time = cts; 23165 23166 { 23167 /* 23168 * This block is not associated with the above error == 0 test. 23169 * It is used to advance snd_max if we have a new transmit. 
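 * In outline (a sketch of the code below, not a substitute for it):
 * startseq = snd_max is captured; errors and SACK retransmissions
 * bail to 'nomore' without touching snd_max; a newly sent SYN or FIN
 * each add one to snd_max and latch TF_SENTSYN/TF_SENTFIN; then
 * snd_max += len, t_rtttime/t_rtseq are armed if no RTT sample is in
 * progress, and a goodput measurement may be started.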
23170 */ 23171 tcp_seq startseq = tp->snd_max; 23172 23173 23174 if (rsm && (doing_tlp == 0)) 23175 rack->r_ctl.rc_loss_count += rsm->r_end - rsm->r_start; 23176 if (error) 23177 /* We don't log or do anything with errors */ 23178 goto nomore; 23179 if (doing_tlp == 0) { 23180 if (rsm == NULL) { 23181 /* 23182 * Not a retransmission of some 23183 * sort, new data is going out so 23184 * clear our TLP count and flag. 23185 */ 23186 rack->rc_tlp_in_progress = 0; 23187 rack->r_ctl.rc_tlp_cnt_out = 0; 23188 } 23189 } else { 23190 /* 23191 * We have just sent a TLP, mark that it is true 23192 * and make sure our in progress is set so we 23193 * continue to check the count. 23194 */ 23195 rack->rc_tlp_in_progress = 1; 23196 rack->r_ctl.rc_tlp_cnt_out++; 23197 } 23198 /* 23199 * If we are retransmitting we are done, snd_max 23200 * does not get updated. 23201 */ 23202 if (sack_rxmit) 23203 goto nomore; 23204 if ((tp->snd_una == tp->snd_max) && (len > 0)) { 23205 /* 23206 * Update the time we just added data since 23207 * nothing was outstanding. 23208 */ 23209 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 23210 tp->t_acktime = ticks; 23211 } 23212 /* 23213 * Now for special SYN/FIN handling. 23214 */ 23215 if (flags & (TH_SYN | TH_FIN)) { 23216 if ((flags & TH_SYN) && 23217 ((tp->t_flags & TF_SENTSYN) == 0)) { 23218 tp->snd_max++; 23219 tp->t_flags |= TF_SENTSYN; 23220 } 23221 if ((flags & TH_FIN) && 23222 ((tp->t_flags & TF_SENTFIN) == 0)) { 23223 tp->snd_max++; 23224 tp->t_flags |= TF_SENTFIN; 23225 } 23226 } 23227 tp->snd_max += len; 23228 if (rack->rc_new_rnd_needed) { 23229 rack_new_round_starts(tp, rack, tp->snd_max); 23230 } 23231 /* 23232 * Time this transmission if not a retransmission and 23233 * not currently timing anything. 23234 * This is only relevant in case of switching back to 23235 * the base stack. 23236 */ 23237 if (tp->t_rtttime == 0) { 23238 tp->t_rtttime = ticks; 23239 tp->t_rtseq = startseq; 23240 KMOD_TCPSTAT_INC(tcps_segstimed); 23241 } 23242 if (len && 23243 ((tp->t_flags & TF_GPUTINPROG) == 0)) 23244 rack_start_gp_measurement(tp, rack, startseq, sb_offset); 23245 /* 23246 * If we are doing FO we need to update the mbuf position and subtract 23247 * this happens when the peer sends us duplicate information and 23248 * we thus want to send a DSACK. 23249 * 23250 * XXXRRS: This brings to mind a ?, when we send a DSACK block is TSO 23251 * turned off? If not then we are going to echo multiple DSACK blocks 23252 * out (with the TSO), which we should not be doing. 23253 */ 23254 if (rack->r_fast_output && len) { 23255 if (rack->r_ctl.fsb.left_to_send > len) 23256 rack->r_ctl.fsb.left_to_send -= len; 23257 else 23258 rack->r_ctl.fsb.left_to_send = 0; 23259 if (rack->r_ctl.fsb.left_to_send < segsiz) 23260 rack->r_fast_output = 0; 23261 if (rack->r_fast_output) { 23262 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); 23263 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; 23264 rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(rack->r_ctl.fsb.m); 23265 } 23266 } 23267 if (rack_pcm_blast == 0) { 23268 if ((orig_len > len) && 23269 (add_flag & RACK_IS_PCM) && 23270 (len < pace_max_seg) && 23271 ((pace_max_seg - len) > segsiz)) { 23272 /* 23273 * We are doing a PCM measurement and we did 23274 * not get enough data in the TSO to meet the 23275 * burst requirement. 
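 * (The remainder is sent immediately: n_len = orig_len - len is what
 * is still owed, pace_max_seg and orig_len are reduced by what just
 * went out, sb_offset is moved up to snd_max - snd_una, the sockbuf
 * is re-locked and we jump back to 'send' for another pass.)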
23276 */ 23277 uint32_t n_len; 23278 23279 n_len = (orig_len - len); 23280 orig_len -= len; 23281 pace_max_seg -= len; 23282 len = n_len; 23283 sb_offset = tp->snd_max - tp->snd_una; 23284 /* Re-lock for the next spin */ 23285 SOCKBUF_LOCK(sb); 23286 goto send; 23287 } 23288 } else { 23289 if ((orig_len > len) && 23290 (add_flag & RACK_IS_PCM) && 23291 ((orig_len - len) > segsiz)) { 23292 /* 23293 * We are doing a PCM measurement and we did 23294 * not get enough data in the TSO to meet the 23295 * burst requirement. 23296 */ 23297 uint32_t n_len; 23298 23299 n_len = (orig_len - len); 23300 orig_len -= len; 23301 len = n_len; 23302 sb_offset = tp->snd_max - tp->snd_una; 23303 /* Re-lock for the next spin */ 23304 SOCKBUF_LOCK(sb); 23305 goto send; 23306 } 23307 } 23308 } 23309 nomore: 23310 if (error) { 23311 rack->r_ctl.rc_agg_delayed = 0; 23312 rack->r_early = 0; 23313 rack->r_late = 0; 23314 rack->r_ctl.rc_agg_early = 0; 23315 SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */ 23316 /* 23317 * Failures do not advance the seq counter above. For the 23318 * case of ENOBUFS we will fall out and retry in 1ms with 23319 * the hpts. Everything else will just have to retransmit 23320 * with the timer. 23321 * 23322 * In any case, we do not want to loop around for another 23323 * send without a good reason. 23324 */ 23325 sendalot = 0; 23326 switch (error) { 23327 case EPERM: 23328 case EACCES: 23329 tp->t_softerror = error; 23330 #ifdef TCP_ACCOUNTING 23331 crtsc = get_cyclecount(); 23332 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 23333 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 23334 } 23335 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 23336 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 23337 } 23338 sched_unpin(); 23339 #endif 23340 return (error); 23341 case ENOBUFS: 23342 /* 23343 * Pace us right away to retry in a some 23344 * time 23345 */ 23346 if (rack->r_ctl.crte != NULL) { 23347 tcp_trace_point(rack->rc_tp, TCP_TP_HWENOBUF); 23348 if (tcp_bblogging_on(rack->rc_tp)) 23349 rack_log_queue_level(tp, rack, len, &tv, cts); 23350 } else 23351 tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF); 23352 slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); 23353 if (rack->rc_enobuf < 0x7f) 23354 rack->rc_enobuf++; 23355 if (slot < (10 * HPTS_USEC_IN_MSEC)) 23356 slot = 10 * HPTS_USEC_IN_MSEC; 23357 if (rack->r_ctl.crte != NULL) { 23358 counter_u64_add(rack_saw_enobuf_hw, 1); 23359 tcp_rl_log_enobuf(rack->r_ctl.crte); 23360 } 23361 counter_u64_add(rack_saw_enobuf, 1); 23362 goto enobufs; 23363 case EMSGSIZE: 23364 /* 23365 * For some reason the interface we used initially 23366 * to send segments changed to another or lowered 23367 * its MTU. If TSO was active we either got an 23368 * interface without TSO capabilits or TSO was 23369 * turned off. If we obtained mtu from ip_output() 23370 * then update it and try again. 
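 * A sketch of the recovery below: TF_TSO is cleared if TSO was in
 * use; if ip_output() handed back an mtu, tcp_mss_update() recomputes
 * t_maxseg and, when it actually shrank, we 'goto again' right away
 * with the smaller segment size; otherwise a 10ms hpts timer is armed
 * and the error is returned to the caller.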
23371 */ 23372 if (tso) 23373 tp->t_flags &= ~TF_TSO; 23374 if (mtu != 0) { 23375 int saved_mtu; 23376 23377 saved_mtu = tp->t_maxseg; 23378 tcp_mss_update(tp, -1, mtu, NULL, NULL); 23379 if (saved_mtu > tp->t_maxseg) { 23380 goto again; 23381 } 23382 } 23383 slot = 10 * HPTS_USEC_IN_MSEC; 23384 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); 23385 #ifdef TCP_ACCOUNTING 23386 crtsc = get_cyclecount(); 23387 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 23388 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 23389 } 23390 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 23391 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 23392 } 23393 sched_unpin(); 23394 #endif 23395 return (error); 23396 case ENETUNREACH: 23397 counter_u64_add(rack_saw_enetunreach, 1); 23398 case EHOSTDOWN: 23399 case EHOSTUNREACH: 23400 case ENETDOWN: 23401 if (TCPS_HAVERCVDSYN(tp->t_state)) { 23402 tp->t_softerror = error; 23403 } 23404 /* FALLTHROUGH */ 23405 default: 23406 slot = 10 * HPTS_USEC_IN_MSEC; 23407 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); 23408 #ifdef TCP_ACCOUNTING 23409 crtsc = get_cyclecount(); 23410 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 23411 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 23412 } 23413 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 23414 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 23415 } 23416 sched_unpin(); 23417 #endif 23418 return (error); 23419 } 23420 } else { 23421 rack->rc_enobuf = 0; 23422 if (IN_FASTRECOVERY(tp->t_flags) && rsm) 23423 rack->r_ctl.retran_during_recovery += len; 23424 } 23425 KMOD_TCPSTAT_INC(tcps_sndtotal); 23426 23427 /* 23428 * Data sent (as far as we can tell). If this advertises a larger 23429 * window than any other segment, then remember the size of the 23430 * advertised window. Any pending ACK has now been sent. 23431 */ 23432 if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) 23433 tp->rcv_adv = tp->rcv_nxt + recwin; 23434 23435 tp->last_ack_sent = tp->rcv_nxt; 23436 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 23437 enobufs: 23438 if (sendalot) { 23439 /* Do we need to turn off sendalot? */ 23440 if (pace_max_seg && 23441 (tot_len_this_send >= pace_max_seg)) { 23442 /* We hit our max. */ 23443 sendalot = 0; 23444 } 23445 } 23446 if ((error == 0) && (flags & TH_FIN)) 23447 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN); 23448 if (flags & TH_RST) { 23449 /* 23450 * We don't send again after sending a RST. 23451 */ 23452 slot = 0; 23453 sendalot = 0; 23454 if (error == 0) 23455 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 23456 } else if ((slot == 0) && (sendalot == 0) && tot_len_this_send) { 23457 /* 23458 * Get our pacing rate, if an error 23459 * occurred in sending (ENOBUF) we would 23460 * hit the else if with slot preset. Other 23461 * errors return. 23462 */ 23463 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz, __LINE__); 23464 } 23465 /* We have sent clear the flag */ 23466 rack->r_ent_rec_ns = 0; 23467 if (rack->r_must_retran) { 23468 if (rsm) { 23469 rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start); 23470 if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) { 23471 /* 23472 * We have retransmitted all. 23473 */ 23474 rack->r_must_retran = 0; 23475 rack->r_ctl.rc_out_at_rto = 0; 23476 } 23477 } else if (SEQ_GEQ(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) { 23478 /* 23479 * Sending new data will also kill 23480 * the loop. 
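 * (That is, once snd_max passes the snd_max recorded at the RTO,
 * everything that was outstanding when the RTO fired has been
 * covered either by retransmissions or by new data, so the
 * must-retransmit state and rc_out_at_rto are cleared just as in
 * the rsm branch above.)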
23481 */ 23482 rack->r_must_retran = 0; 23483 rack->r_ctl.rc_out_at_rto = 0; 23484 } 23485 } 23486 rack->r_ctl.fsb.recwin = recwin; 23487 if ((tp->t_flags & (TF_WASCRECOVERY|TF_WASFRECOVERY)) && 23488 SEQ_GT(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) { 23489 /* 23490 * We hit an RTO and now have past snd_max at the RTO 23491 * clear all the WAS flags. 23492 */ 23493 tp->t_flags &= ~(TF_WASCRECOVERY|TF_WASFRECOVERY); 23494 } 23495 if (slot) { 23496 /* set the rack tcb into the slot N */ 23497 if ((error == 0) && 23498 rack_use_rfo && 23499 ((flags & (TH_SYN|TH_FIN)) == 0) && 23500 (rsm == NULL) && 23501 (ipoptlen == 0) && 23502 (tp->rcv_numsacks == 0) && 23503 (rack->rc_policer_detected == 0) && 23504 rack->r_fsb_inited && 23505 TCPS_HAVEESTABLISHED(tp->t_state) && 23506 ((IN_RECOVERY(tp->t_flags)) == 0) && 23507 (rack->r_must_retran == 0) && 23508 ((tp->t_flags & TF_NEEDFIN) == 0) && 23509 (len > 0) && (orig_len > 0) && 23510 (orig_len > len) && 23511 ((orig_len - len) >= segsiz) && 23512 ((optlen == 0) || 23513 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { 23514 /* We can send at least one more MSS using our fsb */ 23515 rack_setup_fast_output(tp, rack, sb, len, orig_len, 23516 segsiz, pace_max_seg, hw_tls, flags); 23517 } else 23518 rack->r_fast_output = 0; 23519 rack_log_fsb(rack, tp, so, flags, 23520 ipoptlen, orig_len, len, error, 23521 (rsm == NULL), optlen, __LINE__, 2); 23522 } else if (sendalot) { 23523 int ret; 23524 23525 sack_rxmit = 0; 23526 if ((error == 0) && 23527 rack_use_rfo && 23528 ((flags & (TH_SYN|TH_FIN)) == 0) && 23529 (rsm == NULL) && 23530 (ipoptlen == 0) && 23531 (tp->rcv_numsacks == 0) && 23532 (rack->r_must_retran == 0) && 23533 rack->r_fsb_inited && 23534 TCPS_HAVEESTABLISHED(tp->t_state) && 23535 ((IN_RECOVERY(tp->t_flags)) == 0) && 23536 ((tp->t_flags & TF_NEEDFIN) == 0) && 23537 (len > 0) && (orig_len > 0) && 23538 (orig_len > len) && 23539 ((orig_len - len) >= segsiz) && 23540 ((optlen == 0) || 23541 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { 23542 /* we can use fast_output for more */ 23543 rack_setup_fast_output(tp, rack, sb, len, orig_len, 23544 segsiz, pace_max_seg, hw_tls, flags); 23545 if (rack->r_fast_output) { 23546 error = 0; 23547 ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error); 23548 if (ret >= 0) 23549 return (ret); 23550 else if (error) 23551 goto nomore; 23552 23553 } 23554 } 23555 goto again; 23556 } 23557 skip_all_send: 23558 /* Assure when we leave that snd_nxt will point to top */ 23559 if (SEQ_GT(tp->snd_max, tp->snd_nxt)) 23560 tp->snd_nxt = tp->snd_max; 23561 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0); 23562 #ifdef TCP_ACCOUNTING 23563 crtsc = get_cyclecount() - ts_val; 23564 if (tot_len_this_send) { 23565 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 23566 tp->tcp_cnt_counters[SND_OUT_DATA]++; 23567 } 23568 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 23569 tp->tcp_proc_time[SND_OUT_DATA] += crtsc; 23570 } 23571 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 23572 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) /segsiz); 23573 } 23574 } else { 23575 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 23576 tp->tcp_cnt_counters[SND_OUT_ACK]++; 23577 } 23578 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 23579 tp->tcp_proc_time[SND_OUT_ACK] += crtsc; 23580 } 23581 } 23582 sched_unpin(); 23583 #endif 23584 if (error == ENOBUFS) 23585 error = 0; 23586 return (error); 23587 } 23588 23589 static void 23590 rack_update_seg(struct tcp_rack *rack) 
23591 { 23592 uint32_t orig_val; 23593 23594 orig_val = rack->r_ctl.rc_pace_max_segs; 23595 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 23596 if (orig_val != rack->r_ctl.rc_pace_max_segs) 23597 rack_log_pacing_delay_calc(rack, 0, 0, orig_val, 0, 0, 15, __LINE__, NULL, 0); 23598 } 23599 23600 static void 23601 rack_mtu_change(struct tcpcb *tp) 23602 { 23603 /* 23604 * The MSS may have changed 23605 */ 23606 struct tcp_rack *rack; 23607 struct rack_sendmap *rsm; 23608 23609 rack = (struct tcp_rack *)tp->t_fb_ptr; 23610 if (rack->r_ctl.rc_pace_min_segs != ctf_fixed_maxseg(tp)) { 23611 /* 23612 * The MTU has changed we need to resend everything 23613 * since all we have sent is lost. We first fix 23614 * up the mtu though. 23615 */ 23616 rack_set_pace_segments(tp, rack, __LINE__, NULL); 23617 /* We treat this like a full retransmit timeout without the cwnd adjustment */ 23618 rack_remxt_tmr(tp); 23619 rack->r_fast_output = 0; 23620 rack->r_ctl.rc_out_at_rto = ctf_flight_size(tp, 23621 rack->r_ctl.rc_sacked); 23622 rack->r_ctl.rc_snd_max_at_rto = tp->snd_max; 23623 rack->r_must_retran = 1; 23624 /* Mark all inflight to needing to be rxt'd */ 23625 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 23626 rsm->r_flags |= (RACK_MUST_RXT|RACK_PMTU_CHG); 23627 } 23628 } 23629 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 23630 /* We don't use snd_nxt to retransmit */ 23631 tp->snd_nxt = tp->snd_max; 23632 } 23633 23634 static int 23635 rack_set_dgp(struct tcp_rack *rack) 23636 { 23637 if (rack->dgp_on == 1) 23638 return(0); 23639 if ((rack->use_fixed_rate == 1) && 23640 (rack->rc_always_pace == 1)) { 23641 /* 23642 * We are already pacing another 23643 * way. 23644 */ 23645 return (EBUSY); 23646 } 23647 if (rack->rc_always_pace == 1) { 23648 rack_remove_pacing(rack); 23649 } 23650 if (tcp_incr_dgp_pacing_cnt() == 0) 23651 return (ENOSPC); 23652 rack->r_ctl.pacing_method |= RACK_DGP_PACING; 23653 rack->rc_fillcw_apply_discount = 0; 23654 rack->dgp_on = 1; 23655 rack->rc_always_pace = 1; 23656 rack->rc_pace_dnd = 1; 23657 rack->use_fixed_rate = 0; 23658 if (rack->gp_ready) 23659 rack_set_cc_pacing(rack); 23660 rack->rc_tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 23661 rack->rack_attempt_hdwr_pace = 0; 23662 /* rxt settings */ 23663 rack->full_size_rxt = 1; 23664 rack->shape_rxt_to_pacing_min = 0; 23665 /* cmpack=1 */ 23666 rack->r_use_cmp_ack = 1; 23667 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) && 23668 rack->r_use_cmp_ack) 23669 rack->rc_tp->t_flags2 |= TF2_MBUF_ACKCMP; 23670 /* scwnd=1 */ 23671 rack->rack_enable_scwnd = 1; 23672 /* dynamic=100 */ 23673 rack->rc_gp_dyn_mul = 1; 23674 /* gp_inc_ca */ 23675 rack->r_ctl.rack_per_of_gp_ca = 100; 23676 /* rrr_conf=3 */ 23677 rack->r_rr_config = 3; 23678 /* npush=2 */ 23679 rack->r_ctl.rc_no_push_at_mrtt = 2; 23680 /* fillcw=1 */ 23681 rack->rc_pace_to_cwnd = 1; 23682 rack->rc_pace_fill_if_rttin_range = 0; 23683 rack->rtt_limit_mul = 0; 23684 /* noprr=1 */ 23685 rack->rack_no_prr = 1; 23686 /* lscwnd=1 */ 23687 rack->r_limit_scw = 1; 23688 /* gp_inc_rec */ 23689 rack->r_ctl.rack_per_of_gp_rec = 90; 23690 return (0); 23691 } 23692 23693 static int 23694 rack_set_profile(struct tcp_rack *rack, int prof) 23695 { 23696 int err = EINVAL; 23697 if (prof == 1) { 23698 /* 23699 * Profile 1 is "standard" DGP. It ignores 23700 * client buffer level. 
23701 */ 23702 err = rack_set_dgp(rack); 23703 if (err) 23704 return (err); 23705 } else if (prof == 6) { 23706 err = rack_set_dgp(rack); 23707 if (err) 23708 return (err); 23709 /* 23710 * Profile 6 tweaks DGP so that it will apply to 23711 * fill-cw the same settings that profile5 does 23712 * to replace DGP. It gets then the max(dgp-rate, fillcw(discounted). 23713 */ 23714 rack->rc_fillcw_apply_discount = 1; 23715 } else if (prof == 0) { 23716 /* This changes things back to the default settings */ 23717 if (rack->rc_always_pace == 1) { 23718 rack_remove_pacing(rack); 23719 } else { 23720 /* Make sure any stray flags are off */ 23721 rack->dgp_on = 0; 23722 rack->rc_hybrid_mode = 0; 23723 rack->use_fixed_rate = 0; 23724 } 23725 err = 0; 23726 if (rack_fill_cw_state) 23727 rack->rc_pace_to_cwnd = 1; 23728 else 23729 rack->rc_pace_to_cwnd = 0; 23730 23731 if (rack_pace_every_seg && tcp_can_enable_pacing()) { 23732 rack->r_ctl.pacing_method |= RACK_REG_PACING; 23733 rack->rc_always_pace = 1; 23734 if (rack->rack_hibeta) 23735 rack_set_cc_pacing(rack); 23736 } else 23737 rack->rc_always_pace = 0; 23738 if (rack_dsack_std_based & 0x1) { 23739 /* Basically this means all rack timers are at least (srtt + 1/4 srtt) */ 23740 rack->rc_rack_tmr_std_based = 1; 23741 } 23742 if (rack_dsack_std_based & 0x2) { 23743 /* Basically this means rack timers are extended based on dsack by up to (2 * srtt) */ 23744 rack->rc_rack_use_dsack = 1; 23745 } 23746 if (rack_use_cmp_acks) 23747 rack->r_use_cmp_ack = 1; 23748 else 23749 rack->r_use_cmp_ack = 0; 23750 if (rack_disable_prr) 23751 rack->rack_no_prr = 1; 23752 else 23753 rack->rack_no_prr = 0; 23754 if (rack_gp_no_rec_chg) 23755 rack->rc_gp_no_rec_chg = 1; 23756 else 23757 rack->rc_gp_no_rec_chg = 0; 23758 if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack) { 23759 rack->r_mbuf_queue = 1; 23760 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state)) 23761 rack->rc_tp->t_flags2 |= TF2_MBUF_ACKCMP; 23762 rack->rc_tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 23763 } else { 23764 rack->r_mbuf_queue = 0; 23765 rack->rc_tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ; 23766 } 23767 if (rack_enable_shared_cwnd) 23768 rack->rack_enable_scwnd = 1; 23769 else 23770 rack->rack_enable_scwnd = 0; 23771 if (rack_do_dyn_mul) { 23772 /* When dynamic adjustment is on CA needs to start at 100% */ 23773 rack->rc_gp_dyn_mul = 1; 23774 if (rack_do_dyn_mul >= 100) 23775 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul; 23776 } else { 23777 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca; 23778 rack->rc_gp_dyn_mul = 0; 23779 } 23780 rack->r_rr_config = 0; 23781 rack->r_ctl.rc_no_push_at_mrtt = 0; 23782 rack->rc_pace_fill_if_rttin_range = 0; 23783 rack->rtt_limit_mul = 0; 23784 23785 if (rack_enable_hw_pacing) 23786 rack->rack_hdw_pace_ena = 1; 23787 else 23788 rack->rack_hdw_pace_ena = 0; 23789 if (rack_disable_prr) 23790 rack->rack_no_prr = 1; 23791 else 23792 rack->rack_no_prr = 0; 23793 if (rack_limits_scwnd) 23794 rack->r_limit_scw = 1; 23795 else 23796 rack->r_limit_scw = 0; 23797 rack_init_retransmit_value(rack, rack_rxt_controls); 23798 err = 0; 23799 } 23800 return (err); 23801 } 23802 23803 static int 23804 rack_add_deferred_option(struct tcp_rack *rack, int sopt_name, uint64_t loptval) 23805 { 23806 struct deferred_opt_list *dol; 23807 23808 dol = malloc(sizeof(struct deferred_opt_list), 23809 M_TCPDO, M_NOWAIT|M_ZERO); 23810 if (dol == NULL) { 23811 /* 23812 * No space yikes -- fail out.. 
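 * (A 0 return tells the caller the option could not be queued
 * because the M_NOWAIT allocation failed, so the deferred option is
 * simply dropped; the 1 returned below means it was queued on
 * r_ctl.opt_list for rack_apply_deferred_options() to replay later.)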
23813 */ 23814 return (0); 23815 } 23816 dol->optname = sopt_name; 23817 dol->optval = loptval; 23818 TAILQ_INSERT_TAIL(&rack->r_ctl.opt_list, dol, next); 23819 return (1); 23820 } 23821 23822 static int 23823 process_hybrid_pacing(struct tcp_rack *rack, struct tcp_hybrid_req *hybrid) 23824 { 23825 #ifdef TCP_REQUEST_TRK 23826 struct tcp_sendfile_track *sft; 23827 struct timeval tv; 23828 tcp_seq seq; 23829 int err; 23830 23831 microuptime(&tv); 23832 23833 /* Make sure no fixed rate is on */ 23834 rack->use_fixed_rate = 0; 23835 rack->r_ctl.rc_fixed_pacing_rate_rec = 0; 23836 rack->r_ctl.rc_fixed_pacing_rate_ca = 0; 23837 rack->r_ctl.rc_fixed_pacing_rate_ss = 0; 23838 /* Now allocate or find our entry that will have these settings */ 23839 sft = tcp_req_alloc_req_full(rack->rc_tp, &hybrid->req, tcp_tv_to_lusectick(&tv), 0); 23840 if (sft == NULL) { 23841 rack->rc_tp->tcp_hybrid_error++; 23842 /* no space, where would it have gone? */ 23843 seq = rack->rc_tp->snd_una + rack->rc_tp->t_inpcb.inp_socket->so_snd.sb_ccc; 23844 rack_log_hybrid(rack, seq, NULL, HYBRID_LOG_NO_ROOM, __LINE__, 0); 23845 return (ENOSPC); 23846 } 23847 /* mask our internal flags */ 23848 hybrid->hybrid_flags &= TCP_HYBRID_PACING_USER_MASK; 23849 /* The seq will be snd_una + everything in the buffer */ 23850 seq = sft->start_seq; 23851 if ((hybrid->hybrid_flags & TCP_HYBRID_PACING_ENABLE) == 0) { 23852 /* Disabling hybrid pacing */ 23853 if (rack->rc_hybrid_mode) { 23854 rack_set_profile(rack, 0); 23855 rack->rc_tp->tcp_hybrid_stop++; 23856 } 23857 rack_log_hybrid(rack, seq, sft, HYBRID_LOG_TURNED_OFF, __LINE__, 0); 23858 return (0); 23859 } 23860 if (rack->dgp_on == 0) { 23861 /* 23862 * If we have not yet turned DGP on, do so 23863 * now setting pure DGP mode, no buffer level 23864 * response. 23865 */ 23866 if ((err = rack_set_profile(rack, 1)) != 0){ 23867 /* Failed to turn pacing on */ 23868 rack->rc_tp->tcp_hybrid_error++; 23869 rack_log_hybrid(rack, seq, sft, HYBRID_LOG_NO_PACING, __LINE__, 0); 23870 return (err); 23871 } 23872 } 23873 /* 23874 * Now we must switch to hybrid mode as well which also 23875 * means moving to regular pacing. 23876 */ 23877 if (rack->rc_hybrid_mode == 0) { 23878 /* First time */ 23879 if (tcp_can_enable_pacing()) { 23880 rack->r_ctl.pacing_method |= RACK_REG_PACING; 23881 rack->rc_hybrid_mode = 1; 23882 } else { 23883 return (ENOSPC); 23884 } 23885 if (rack->r_ctl.pacing_method & RACK_DGP_PACING) { 23886 /* 23887 * This should be true. 23888 */ 23889 tcp_dec_dgp_pacing_cnt(); 23890 rack->r_ctl.pacing_method &= ~RACK_DGP_PACING; 23891 } 23892 } 23893 /* Now set in our flags */ 23894 sft->hybrid_flags = hybrid->hybrid_flags | TCP_HYBRID_PACING_WASSET; 23895 if (hybrid->hybrid_flags & TCP_HYBRID_PACING_CSPR) 23896 sft->cspr = hybrid->cspr; 23897 else 23898 sft->cspr = 0; 23899 if (hybrid->hybrid_flags & TCP_HYBRID_PACING_H_MS) 23900 sft->hint_maxseg = hybrid->hint_maxseg; 23901 else 23902 sft->hint_maxseg = 0; 23903 rack->rc_tp->tcp_hybrid_start++; 23904 rack_log_hybrid(rack, seq, sft, HYBRID_LOG_RULES_SET, __LINE__,0); 23905 return (0); 23906 #else 23907 return (ENOTSUP); 23908 #endif 23909 } 23910 23911 static int 23912 rack_stack_information(struct tcpcb *tp, struct stack_specific_info *si) 23913 { 23914 /* 23915 * Gather rack specific information. 
23916 */ 23917 struct tcp_rack *rack; 23918 23919 rack = (struct tcp_rack *)tp->t_fb_ptr; 23920 /* We pulled a SSI info log out what was there */ 23921 policer_detection_log(rack, rack->rc_highly_buffered, 0, 0, 0, 20); 23922 if (rack->policer_detect_on) { 23923 si->policer_detection_enabled = 1; 23924 if (rack->rc_policer_detected) { 23925 si->policer_detected = 1; 23926 si->policer_bucket_size = rack->r_ctl.policer_bucket_size; 23927 si->policer_last_bw = rack->r_ctl.policer_bw; 23928 } else { 23929 si->policer_detected = 0; 23930 si->policer_bucket_size = 0; 23931 si->policer_last_bw = 0; 23932 } 23933 si->current_round = rack->r_ctl.current_round; 23934 si->highly_buffered = rack->rc_highly_buffered; 23935 } 23936 si->bytes_transmitted = tp->t_sndbytes; 23937 si->bytes_retransmitted = tp->t_snd_rxt_bytes; 23938 return (0); 23939 } 23940 23941 static int 23942 rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, 23943 uint32_t optval, uint64_t loptval, struct tcp_hybrid_req *hybrid) 23944 23945 { 23946 struct epoch_tracker et; 23947 struct sockopt sopt; 23948 struct cc_newreno_opts opt; 23949 uint64_t val; 23950 int error = 0; 23951 uint16_t ca, ss; 23952 23953 switch (sopt_name) { 23954 case TCP_RACK_SET_RXT_OPTIONS: 23955 if ((optval >= 0) && (optval <= 2)) { 23956 rack_init_retransmit_value(rack, optval); 23957 } else { 23958 /* 23959 * You must send in 0, 1 or 2 all else is 23960 * invalid. 23961 */ 23962 error = EINVAL; 23963 } 23964 break; 23965 case TCP_RACK_DSACK_OPT: 23966 RACK_OPTS_INC(tcp_rack_dsack_opt); 23967 if (optval & 0x1) { 23968 rack->rc_rack_tmr_std_based = 1; 23969 } else { 23970 rack->rc_rack_tmr_std_based = 0; 23971 } 23972 if (optval & 0x2) { 23973 rack->rc_rack_use_dsack = 1; 23974 } else { 23975 rack->rc_rack_use_dsack = 0; 23976 } 23977 rack_log_dsack_event(rack, 5, __LINE__, 0, 0); 23978 break; 23979 case TCP_RACK_PACING_DIVISOR: 23980 RACK_OPTS_INC(tcp_rack_pacing_divisor); 23981 if (optval == 0) { 23982 rack->r_ctl.pace_len_divisor = rack_default_pacing_divisor; 23983 } else { 23984 if (optval < RL_MIN_DIVISOR) 23985 rack->r_ctl.pace_len_divisor = RL_MIN_DIVISOR; 23986 else 23987 rack->r_ctl.pace_len_divisor = optval; 23988 } 23989 break; 23990 case TCP_RACK_HI_BETA: 23991 RACK_OPTS_INC(tcp_rack_hi_beta); 23992 if (optval > 0) { 23993 rack->rack_hibeta = 1; 23994 if ((optval >= 50) && 23995 (optval <= 100)) { 23996 /* 23997 * User wants to set a custom beta. 23998 */ 23999 rack->r_ctl.saved_hibeta = optval; 24000 if (rack->rc_pacing_cc_set) 24001 rack_undo_cc_pacing(rack); 24002 rack->r_ctl.rc_saved_beta.beta = optval; 24003 } 24004 if (rack->rc_pacing_cc_set == 0) 24005 rack_set_cc_pacing(rack); 24006 } else { 24007 rack->rack_hibeta = 0; 24008 if (rack->rc_pacing_cc_set) 24009 rack_undo_cc_pacing(rack); 24010 } 24011 break; 24012 case TCP_RACK_PACING_BETA: 24013 error = EINVAL; 24014 break; 24015 case TCP_RACK_TIMER_SLOP: 24016 RACK_OPTS_INC(tcp_rack_timer_slop); 24017 rack->r_ctl.timer_slop = optval; 24018 if (rack->rc_tp->t_srtt) { 24019 /* 24020 * If we have an SRTT lets update t_rxtcur 24021 * to have the new slop. 24022 */ 24023 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 24024 rack_rto_min, rack_rto_max, 24025 rack->r_ctl.timer_slop); 24026 } 24027 break; 24028 case TCP_RACK_PACING_BETA_ECN: 24029 RACK_OPTS_INC(tcp_rack_beta_ecn); 24030 if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) { 24031 /* This only works for newreno. 
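 * (tp->t_cc->name has to literally be "newreno"; the optval is a
 * beta_ecn percentage which is either pushed straight into the CC
 * module through its ctl_output() hook below, or parked in
 * rc_saved_beta until the pacing CC override is enabled.)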
*/ 24032 error = EINVAL; 24033 break; 24034 } 24035 if (rack->rc_pacing_cc_set) { 24036 /* 24037 * Set them into the real CC module 24038 * whats in the rack pcb is the old values 24039 * to be used on restoral/ 24040 */ 24041 sopt.sopt_dir = SOPT_SET; 24042 opt.name = CC_NEWRENO_BETA_ECN; 24043 opt.val = optval; 24044 if (CC_ALGO(tp)->ctl_output != NULL) 24045 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt); 24046 else 24047 error = ENOENT; 24048 } else { 24049 /* 24050 * Not pacing yet so set it into our local 24051 * rack pcb storage. 24052 */ 24053 rack->r_ctl.rc_saved_beta.beta_ecn = optval; 24054 rack->r_ctl.rc_saved_beta.newreno_flags = CC_NEWRENO_BETA_ECN_ENABLED; 24055 } 24056 break; 24057 case TCP_DEFER_OPTIONS: 24058 RACK_OPTS_INC(tcp_defer_opt); 24059 if (optval) { 24060 if (rack->gp_ready) { 24061 /* Too late */ 24062 error = EINVAL; 24063 break; 24064 } 24065 rack->defer_options = 1; 24066 } else 24067 rack->defer_options = 0; 24068 break; 24069 case TCP_RACK_MEASURE_CNT: 24070 RACK_OPTS_INC(tcp_rack_measure_cnt); 24071 if (optval && (optval <= 0xff)) { 24072 rack->r_ctl.req_measurements = optval; 24073 } else 24074 error = EINVAL; 24075 break; 24076 case TCP_REC_ABC_VAL: 24077 RACK_OPTS_INC(tcp_rec_abc_val); 24078 if (optval > 0) 24079 rack->r_use_labc_for_rec = 1; 24080 else 24081 rack->r_use_labc_for_rec = 0; 24082 break; 24083 case TCP_RACK_ABC_VAL: 24084 RACK_OPTS_INC(tcp_rack_abc_val); 24085 if ((optval > 0) && (optval < 255)) 24086 rack->rc_labc = optval; 24087 else 24088 error = EINVAL; 24089 break; 24090 case TCP_HDWR_UP_ONLY: 24091 RACK_OPTS_INC(tcp_pacing_up_only); 24092 if (optval) 24093 rack->r_up_only = 1; 24094 else 24095 rack->r_up_only = 0; 24096 break; 24097 case TCP_FILLCW_RATE_CAP: /* URL:fillcw_cap */ 24098 RACK_OPTS_INC(tcp_fillcw_rate_cap); 24099 rack->r_ctl.fillcw_cap = loptval; 24100 break; 24101 case TCP_PACING_RATE_CAP: 24102 RACK_OPTS_INC(tcp_pacing_rate_cap); 24103 if ((rack->dgp_on == 1) && 24104 (rack->r_ctl.pacing_method & RACK_DGP_PACING)) { 24105 /* 24106 * If we are doing DGP we need to switch 24107 * to using the pacing limit. 24108 */ 24109 if (tcp_can_enable_pacing() == 0) { 24110 error = ENOSPC; 24111 break; 24112 } 24113 /* 24114 * Now change up the flags and counts to be correct. 24115 */ 24116 rack->r_ctl.pacing_method |= RACK_REG_PACING; 24117 tcp_dec_dgp_pacing_cnt(); 24118 rack->r_ctl.pacing_method &= ~RACK_DGP_PACING; 24119 } 24120 rack->r_ctl.bw_rate_cap = loptval; 24121 break; 24122 case TCP_HYBRID_PACING: 24123 if (hybrid == NULL) { 24124 error = EINVAL; 24125 break; 24126 } 24127 if (rack->r_ctl.side_chan_dis_mask & HYBRID_DIS_MASK) { 24128 error = EPERM; 24129 break; 24130 } 24131 error = process_hybrid_pacing(rack, hybrid); 24132 break; 24133 case TCP_SIDECHAN_DIS: /* URL:scodm */ 24134 if (optval) 24135 rack->r_ctl.side_chan_dis_mask = optval; 24136 else 24137 rack->r_ctl.side_chan_dis_mask = 0; 24138 break; 24139 case TCP_RACK_PROFILE: 24140 RACK_OPTS_INC(tcp_profile); 24141 error = rack_set_profile(rack, optval); 24142 break; 24143 case TCP_USE_CMP_ACKS: 24144 RACK_OPTS_INC(tcp_use_cmp_acks); 24145 if ((optval == 0) && (tp->t_flags2 & TF2_MBUF_ACKCMP)) { 24146 /* You can't turn it off once its on! 
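 * (Presumably because TF2_MBUF_ACKCMP means the input path may
 * already be queuing compressed-ack entries for this connection,
 * only the off-to-on transition is honored; asking for 0 once the
 * flag is set is rejected with EINVAL below.)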
*/ 24147 error = EINVAL; 24148 } else if ((optval == 1) && (rack->r_use_cmp_ack == 0)) { 24149 rack->r_use_cmp_ack = 1; 24150 rack->r_mbuf_queue = 1; 24151 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 24152 } 24153 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) 24154 tp->t_flags2 |= TF2_MBUF_ACKCMP; 24155 break; 24156 case TCP_SHARED_CWND_TIME_LIMIT: 24157 RACK_OPTS_INC(tcp_lscwnd); 24158 if (optval) 24159 rack->r_limit_scw = 1; 24160 else 24161 rack->r_limit_scw = 0; 24162 break; 24163 case TCP_RACK_DGP_IN_REC: 24164 error = EINVAL; 24165 break; 24166 case TCP_POLICER_DETECT: /* URL:pol_det */ 24167 RACK_OPTS_INC(tcp_pol_detect); 24168 rack_translate_policer_detect(rack, optval); 24169 break; 24170 case TCP_POLICER_MSS: 24171 RACK_OPTS_INC(tcp_pol_mss); 24172 rack->r_ctl.policer_del_mss = (uint8_t)optval; 24173 if (optval & 0x00000100) { 24174 /* 24175 * Value is setup like so: 24176 * VVVV VVVV VVVV VVVV VVVV VVAI MMMM MMMM 24177 * Where MMMM MMMM is MSS setting 24178 * I (9th bit) is the Postive value that 24179 * says it is being set (if its 0 then the 24180 * upper bits 11 - 32 have no meaning. 24181 * This allows setting it off with 24182 * 0x000001MM. 24183 * 24184 * The 10th bit is used to turn on the 24185 * alternate median (not the expanded one). 24186 * 24187 */ 24188 rack->r_ctl.pol_bw_comp = (optval >> 10); 24189 } 24190 if (optval & 0x00000200) { 24191 rack->r_ctl.policer_alt_median = 1; 24192 } else { 24193 rack->r_ctl.policer_alt_median = 0; 24194 } 24195 break; 24196 case TCP_RACK_PACE_TO_FILL: 24197 RACK_OPTS_INC(tcp_fillcw); 24198 if (optval == 0) 24199 rack->rc_pace_to_cwnd = 0; 24200 else { 24201 rack->rc_pace_to_cwnd = 1; 24202 } 24203 if ((optval >= rack_gp_rtt_maxmul) && 24204 rack_gp_rtt_maxmul && 24205 (optval < 0xf)) { 24206 rack->rc_pace_fill_if_rttin_range = 1; 24207 rack->rtt_limit_mul = optval; 24208 } else { 24209 rack->rc_pace_fill_if_rttin_range = 0; 24210 rack->rtt_limit_mul = 0; 24211 } 24212 break; 24213 case TCP_RACK_NO_PUSH_AT_MAX: 24214 RACK_OPTS_INC(tcp_npush); 24215 if (optval == 0) 24216 rack->r_ctl.rc_no_push_at_mrtt = 0; 24217 else if (optval < 0xff) 24218 rack->r_ctl.rc_no_push_at_mrtt = optval; 24219 else 24220 error = EINVAL; 24221 break; 24222 case TCP_SHARED_CWND_ENABLE: 24223 RACK_OPTS_INC(tcp_rack_scwnd); 24224 if (optval == 0) 24225 rack->rack_enable_scwnd = 0; 24226 else 24227 rack->rack_enable_scwnd = 1; 24228 break; 24229 case TCP_RACK_MBUF_QUEUE: 24230 /* Now do we use the LRO mbuf-queue feature */ 24231 RACK_OPTS_INC(tcp_rack_mbufq); 24232 if (optval || rack->r_use_cmp_ack) 24233 rack->r_mbuf_queue = 1; 24234 else 24235 rack->r_mbuf_queue = 0; 24236 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 24237 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 24238 else 24239 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ; 24240 break; 24241 case TCP_RACK_NONRXT_CFG_RATE: 24242 RACK_OPTS_INC(tcp_rack_cfg_rate); 24243 if (optval == 0) 24244 rack->rack_rec_nonrxt_use_cr = 0; 24245 else 24246 rack->rack_rec_nonrxt_use_cr = 1; 24247 break; 24248 case TCP_NO_PRR: 24249 RACK_OPTS_INC(tcp_rack_noprr); 24250 if (optval == 0) 24251 rack->rack_no_prr = 0; 24252 else if (optval == 1) 24253 rack->rack_no_prr = 1; 24254 else if (optval == 2) 24255 rack->no_prr_addback = 1; 24256 else 24257 error = EINVAL; 24258 break; 24259 case RACK_CSPR_IS_FCC: /* URL:csprisfcc */ 24260 if (optval > 0) 24261 rack->cspr_is_fcc = 1; 24262 else 24263 rack->cspr_is_fcc = 0; 24264 break; 24265 case TCP_TIMELY_DYN_ADJ: 24266 RACK_OPTS_INC(tcp_timely_dyn); 24267 if 
(optval == 0) 24268 rack->rc_gp_dyn_mul = 0; 24269 else { 24270 rack->rc_gp_dyn_mul = 1; 24271 if (optval >= 100) { 24272 /* 24273 * If the user sets something 100 or more 24274 * its the gp_ca value. 24275 */ 24276 rack->r_ctl.rack_per_of_gp_ca = optval; 24277 } 24278 } 24279 break; 24280 case TCP_RACK_DO_DETECTION: 24281 error = EINVAL; 24282 break; 24283 case TCP_RACK_TLP_USE: 24284 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) { 24285 error = EINVAL; 24286 break; 24287 } 24288 RACK_OPTS_INC(tcp_tlp_use); 24289 rack->rack_tlp_threshold_use = optval; 24290 break; 24291 case TCP_RACK_TLP_REDUCE: 24292 /* RACK TLP cwnd reduction (bool) */ 24293 RACK_OPTS_INC(tcp_rack_tlp_reduce); 24294 rack->r_ctl.rc_tlp_cwnd_reduce = optval; 24295 break; 24296 /* Pacing related ones */ 24297 case TCP_RACK_PACE_ALWAYS: 24298 /* 24299 * zero is old rack method, 1 is new 24300 * method using a pacing rate. 24301 */ 24302 RACK_OPTS_INC(tcp_rack_pace_always); 24303 if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) { 24304 error = EPERM; 24305 break; 24306 } 24307 if (optval > 0) { 24308 if (rack->rc_always_pace) { 24309 error = EALREADY; 24310 break; 24311 } else if (tcp_can_enable_pacing()) { 24312 rack->r_ctl.pacing_method |= RACK_REG_PACING; 24313 rack->rc_always_pace = 1; 24314 if (rack->rack_hibeta) 24315 rack_set_cc_pacing(rack); 24316 } 24317 else { 24318 error = ENOSPC; 24319 break; 24320 } 24321 } else { 24322 if (rack->rc_always_pace == 1) { 24323 rack_remove_pacing(rack); 24324 } 24325 } 24326 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 24327 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 24328 else 24329 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ; 24330 /* A rate may be set irate or other, if so set seg size */ 24331 rack_update_seg(rack); 24332 break; 24333 case TCP_BBR_RACK_INIT_RATE: 24334 RACK_OPTS_INC(tcp_initial_rate); 24335 val = optval; 24336 /* Change from kbits per second to bytes per second */ 24337 val *= 1000; 24338 val /= 8; 24339 rack->r_ctl.init_rate = val; 24340 if (rack->rc_always_pace) 24341 rack_update_seg(rack); 24342 break; 24343 case TCP_BBR_IWINTSO: 24344 error = EINVAL; 24345 break; 24346 case TCP_RACK_FORCE_MSEG: 24347 RACK_OPTS_INC(tcp_rack_force_max_seg); 24348 if (optval) 24349 rack->rc_force_max_seg = 1; 24350 else 24351 rack->rc_force_max_seg = 0; 24352 break; 24353 case TCP_RACK_PACE_MIN_SEG: 24354 RACK_OPTS_INC(tcp_rack_min_seg); 24355 rack->r_ctl.rc_user_set_min_segs = (0x0000ffff & optval); 24356 rack_set_pace_segments(tp, rack, __LINE__, NULL); 24357 break; 24358 case TCP_RACK_PACE_MAX_SEG: 24359 /* Max segments size in a pace in bytes */ 24360 RACK_OPTS_INC(tcp_rack_max_seg); 24361 if ((rack->dgp_on == 1) && 24362 (rack->r_ctl.pacing_method & RACK_DGP_PACING)) { 24363 /* 24364 * If we set a max-seg and are doing DGP then 24365 * we now fall under the pacing limits not the 24366 * DGP ones. 24367 */ 24368 if (tcp_can_enable_pacing() == 0) { 24369 error = ENOSPC; 24370 break; 24371 } 24372 /* 24373 * Now change up the flags and counts to be correct. 
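 * That is: count this connection as a regularly paced one
 * (RACK_REG_PACING) and give its DGP slot back with
 * tcp_dec_dgp_pacing_cnt(), the same hand-off TCP_PACING_RATE_CAP
 * performs above when a rate cap is applied on top of DGP.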
24374 */ 24375 rack->r_ctl.pacing_method |= RACK_REG_PACING; 24376 tcp_dec_dgp_pacing_cnt(); 24377 rack->r_ctl.pacing_method &= ~RACK_DGP_PACING; 24378 } 24379 if (optval <= MAX_USER_SET_SEG) 24380 rack->rc_user_set_max_segs = optval; 24381 else 24382 rack->rc_user_set_max_segs = MAX_USER_SET_SEG; 24383 rack_set_pace_segments(tp, rack, __LINE__, NULL); 24384 break; 24385 case TCP_RACK_PACE_RATE_REC: 24386 /* Set the fixed pacing rate in Bytes per second ca */ 24387 RACK_OPTS_INC(tcp_rack_pace_rate_rec); 24388 if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) { 24389 error = EPERM; 24390 break; 24391 } 24392 if (rack->dgp_on) { 24393 /* 24394 * We are already pacing another 24395 * way. 24396 */ 24397 error = EBUSY; 24398 break; 24399 } 24400 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 24401 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) 24402 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 24403 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) 24404 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 24405 rack->use_fixed_rate = 1; 24406 if (rack->rack_hibeta) 24407 rack_set_cc_pacing(rack); 24408 rack_log_pacing_delay_calc(rack, 24409 rack->r_ctl.rc_fixed_pacing_rate_ss, 24410 rack->r_ctl.rc_fixed_pacing_rate_ca, 24411 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 24412 __LINE__, NULL,0); 24413 break; 24414 24415 case TCP_RACK_PACE_RATE_SS: 24416 /* Set the fixed pacing rate in Bytes per second ca */ 24417 RACK_OPTS_INC(tcp_rack_pace_rate_ss); 24418 if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) { 24419 error = EPERM; 24420 break; 24421 } 24422 if (rack->dgp_on) { 24423 /* 24424 * We are already pacing another 24425 * way. 24426 */ 24427 error = EBUSY; 24428 break; 24429 } 24430 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 24431 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) 24432 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 24433 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) 24434 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 24435 rack->use_fixed_rate = 1; 24436 if (rack->rack_hibeta) 24437 rack_set_cc_pacing(rack); 24438 rack_log_pacing_delay_calc(rack, 24439 rack->r_ctl.rc_fixed_pacing_rate_ss, 24440 rack->r_ctl.rc_fixed_pacing_rate_ca, 24441 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 24442 __LINE__, NULL, 0); 24443 break; 24444 24445 case TCP_RACK_PACE_RATE_CA: 24446 /* Set the fixed pacing rate in Bytes per second ca */ 24447 RACK_OPTS_INC(tcp_rack_pace_rate_ca); 24448 if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) { 24449 error = EPERM; 24450 break; 24451 } 24452 if (rack->dgp_on) { 24453 /* 24454 * We are already pacing another 24455 * way. 
24456 */ 24457 error = EBUSY; 24458 break; 24459 } 24460 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 24461 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) 24462 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 24463 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) 24464 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 24465 rack->use_fixed_rate = 1; 24466 if (rack->rack_hibeta) 24467 rack_set_cc_pacing(rack); 24468 rack_log_pacing_delay_calc(rack, 24469 rack->r_ctl.rc_fixed_pacing_rate_ss, 24470 rack->r_ctl.rc_fixed_pacing_rate_ca, 24471 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 24472 __LINE__, NULL, 0); 24473 break; 24474 case TCP_RACK_GP_INCREASE_REC: 24475 RACK_OPTS_INC(tcp_gp_inc_rec); 24476 rack->r_ctl.rack_per_of_gp_rec = optval; 24477 rack_log_pacing_delay_calc(rack, 24478 rack->r_ctl.rack_per_of_gp_ss, 24479 rack->r_ctl.rack_per_of_gp_ca, 24480 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 24481 __LINE__, NULL, 0); 24482 break; 24483 case TCP_RACK_GP_INCREASE_CA: 24484 RACK_OPTS_INC(tcp_gp_inc_ca); 24485 ca = optval; 24486 if (ca < 100) { 24487 /* 24488 * We don't allow any reduction 24489 * over the GP b/w. 24490 */ 24491 error = EINVAL; 24492 break; 24493 } 24494 rack->r_ctl.rack_per_of_gp_ca = ca; 24495 rack_log_pacing_delay_calc(rack, 24496 rack->r_ctl.rack_per_of_gp_ss, 24497 rack->r_ctl.rack_per_of_gp_ca, 24498 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 24499 __LINE__, NULL, 0); 24500 break; 24501 case TCP_RACK_GP_INCREASE_SS: 24502 RACK_OPTS_INC(tcp_gp_inc_ss); 24503 ss = optval; 24504 if (ss < 100) { 24505 /* 24506 * We don't allow any reduction 24507 * over the GP b/w. 24508 */ 24509 error = EINVAL; 24510 break; 24511 } 24512 rack->r_ctl.rack_per_of_gp_ss = ss; 24513 rack_log_pacing_delay_calc(rack, 24514 rack->r_ctl.rack_per_of_gp_ss, 24515 rack->r_ctl.rack_per_of_gp_ca, 24516 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 24517 __LINE__, NULL, 0); 24518 break; 24519 case TCP_RACK_RR_CONF: 24520 RACK_OPTS_INC(tcp_rack_rrr_no_conf_rate); 24521 if (optval && optval <= 3) 24522 rack->r_rr_config = optval; 24523 else 24524 rack->r_rr_config = 0; 24525 break; 24526 case TCP_PACING_DND: /* URL:dnd */ 24527 if (optval > 0) 24528 rack->rc_pace_dnd = 1; 24529 else 24530 rack->rc_pace_dnd = 0; 24531 break; 24532 case TCP_HDWR_RATE_CAP: 24533 RACK_OPTS_INC(tcp_hdwr_rate_cap); 24534 if (optval) { 24535 if (rack->r_rack_hw_rate_caps == 0) 24536 rack->r_rack_hw_rate_caps = 1; 24537 else 24538 error = EALREADY; 24539 } else { 24540 rack->r_rack_hw_rate_caps = 0; 24541 } 24542 break; 24543 case TCP_DGP_UPPER_BOUNDS: 24544 { 24545 uint8_t val; 24546 val = optval & 0x0000ff; 24547 rack->r_ctl.rack_per_upper_bound_ca = val; 24548 val = (optval >> 16) & 0x0000ff; 24549 rack->r_ctl.rack_per_upper_bound_ss = val; 24550 break; 24551 } 24552 case TCP_SS_EEXIT: /* URL:eexit */ 24553 if (optval > 0) { 24554 rack->r_ctl.gp_rnd_thresh = optval & 0x0ff; 24555 if (optval & 0x10000) { 24556 rack->r_ctl.gate_to_fs = 1; 24557 } else { 24558 rack->r_ctl.gate_to_fs = 0; 24559 } 24560 if (optval & 0x20000) { 24561 rack->r_ctl.use_gp_not_last = 1; 24562 } else { 24563 rack->r_ctl.use_gp_not_last = 0; 24564 } 24565 if (optval & 0xfffc0000) { 24566 uint32_t v; 24567 24568 v = (optval >> 18) & 0x00003fff; 24569 if (v >= 1000) 24570 rack->r_ctl.gp_gain_req = v; 24571 } 24572 } else { 24573 /* We do not do ss early exit at all */ 24574 rack->rc_initial_ss_comp = 1; 24575 rack->r_ctl.gp_rnd_thresh = 0; 24576 } 24577 break; 24578 case TCP_RACK_SPLIT_LIMIT: 24579 RACK_OPTS_INC(tcp_split_limit); 24580 rack->r_ctl.rc_split_limit = 
optval; 24581 break; 24582 case TCP_BBR_HDWR_PACE: 24583 RACK_OPTS_INC(tcp_hdwr_pacing); 24584 if (optval){ 24585 if (rack->rack_hdrw_pacing == 0) { 24586 rack->rack_hdw_pace_ena = 1; 24587 rack->rack_attempt_hdwr_pace = 0; 24588 } else 24589 error = EALREADY; 24590 } else { 24591 rack->rack_hdw_pace_ena = 0; 24592 #ifdef RATELIMIT 24593 if (rack->r_ctl.crte != NULL) { 24594 rack->rack_hdrw_pacing = 0; 24595 rack->rack_attempt_hdwr_pace = 0; 24596 tcp_rel_pacing_rate(rack->r_ctl.crte, tp); 24597 rack->r_ctl.crte = NULL; 24598 } 24599 #endif 24600 } 24601 break; 24602 /* End Pacing related ones */ 24603 case TCP_RACK_PRR_SENDALOT: 24604 /* Allow PRR to send more than one seg */ 24605 RACK_OPTS_INC(tcp_rack_prr_sendalot); 24606 rack->r_ctl.rc_prr_sendalot = optval; 24607 break; 24608 case TCP_RACK_MIN_TO: 24609 /* Minimum time between rack t-o's in ms */ 24610 RACK_OPTS_INC(tcp_rack_min_to); 24611 rack->r_ctl.rc_min_to = optval; 24612 break; 24613 case TCP_RACK_EARLY_SEG: 24614 /* If early recovery max segments */ 24615 RACK_OPTS_INC(tcp_rack_early_seg); 24616 rack->r_ctl.rc_early_recovery_segs = optval; 24617 break; 24618 case TCP_RACK_ENABLE_HYSTART: 24619 { 24620 if (optval) { 24621 tp->t_ccv.flags |= CCF_HYSTART_ALLOWED; 24622 if (rack_do_hystart > RACK_HYSTART_ON) 24623 tp->t_ccv.flags |= CCF_HYSTART_CAN_SH_CWND; 24624 if (rack_do_hystart > RACK_HYSTART_ON_W_SC) 24625 tp->t_ccv.flags |= CCF_HYSTART_CONS_SSTH; 24626 } else { 24627 tp->t_ccv.flags &= ~(CCF_HYSTART_ALLOWED|CCF_HYSTART_CAN_SH_CWND|CCF_HYSTART_CONS_SSTH); 24628 } 24629 } 24630 break; 24631 case TCP_RACK_REORD_THRESH: 24632 /* RACK reorder threshold (shift amount) */ 24633 RACK_OPTS_INC(tcp_rack_reord_thresh); 24634 if ((optval > 0) && (optval < 31)) 24635 rack->r_ctl.rc_reorder_shift = optval; 24636 else 24637 error = EINVAL; 24638 break; 24639 case TCP_RACK_REORD_FADE: 24640 /* Does reordering fade after ms time */ 24641 RACK_OPTS_INC(tcp_rack_reord_fade); 24642 rack->r_ctl.rc_reorder_fade = optval; 24643 break; 24644 case TCP_RACK_TLP_THRESH: 24645 /* RACK TLP theshold i.e. srtt+(srtt/N) */ 24646 RACK_OPTS_INC(tcp_rack_tlp_thresh); 24647 if (optval) 24648 rack->r_ctl.rc_tlp_threshold = optval; 24649 else 24650 error = EINVAL; 24651 break; 24652 case TCP_BBR_USE_RACK_RR: 24653 RACK_OPTS_INC(tcp_rack_rr); 24654 if (optval) 24655 rack->use_rack_rr = 1; 24656 else 24657 rack->use_rack_rr = 0; 24658 break; 24659 case TCP_RACK_PKT_DELAY: 24660 /* RACK added ms i.e. rack-rtt + reord + N */ 24661 RACK_OPTS_INC(tcp_rack_pkt_delay); 24662 rack->r_ctl.rc_pkt_delay = optval; 24663 break; 24664 case TCP_DELACK: 24665 RACK_OPTS_INC(tcp_rack_delayed_ack); 24666 if (optval == 0) 24667 tp->t_delayed_ack = 0; 24668 else 24669 tp->t_delayed_ack = 1; 24670 if (tp->t_flags & TF_DELACK) { 24671 tp->t_flags &= ~TF_DELACK; 24672 tp->t_flags |= TF_ACKNOW; 24673 NET_EPOCH_ENTER(et); 24674 rack_output(tp); 24675 NET_EPOCH_EXIT(et); 24676 } 24677 break; 24678 24679 case TCP_BBR_RACK_RTT_USE: 24680 RACK_OPTS_INC(tcp_rack_rtt_use); 24681 if ((optval != USE_RTT_HIGH) && 24682 (optval != USE_RTT_LOW) && 24683 (optval != USE_RTT_AVG)) 24684 error = EINVAL; 24685 else 24686 rack->r_ctl.rc_rate_sample_method = optval; 24687 break; 24688 case TCP_HONOR_HPTS_MIN: 24689 RACK_OPTS_INC(tcp_honor_hpts); 24690 if (optval) { 24691 rack->r_use_hpts_min = 1; 24692 /* 24693 * Must be between 2 - 80% to be a reduction else 24694 * we keep the default (10%). 
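 * Illustrative values: optval = 25 stores a 25% max_reduction,
 * while optval = 1 or optval = 90 falls outside the accepted
 * 2..80 window and leaves the 10% default untouched.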
24695 */ 24696 if ((optval > 1) && (optval <= 80)) { 24697 rack->r_ctl.max_reduction = optval; 24698 } 24699 } else 24700 rack->r_use_hpts_min = 0; 24701 break; 24702 case TCP_REC_IS_DYN: /* URL:dynrec */ 24703 RACK_OPTS_INC(tcp_dyn_rec); 24704 if (optval) 24705 rack->rc_gp_no_rec_chg = 1; 24706 else 24707 rack->rc_gp_no_rec_chg = 0; 24708 break; 24709 case TCP_NO_TIMELY: 24710 RACK_OPTS_INC(tcp_notimely); 24711 if (optval) { 24712 rack->rc_skip_timely = 1; 24713 rack->r_ctl.rack_per_of_gp_rec = 90; 24714 rack->r_ctl.rack_per_of_gp_ca = 100; 24715 rack->r_ctl.rack_per_of_gp_ss = 250; 24716 } else { 24717 rack->rc_skip_timely = 0; 24718 } 24719 break; 24720 case TCP_GP_USE_LTBW: 24721 if (optval == 0) { 24722 rack->use_lesser_lt_bw = 0; 24723 rack->dis_lt_bw = 1; 24724 } else if (optval == 1) { 24725 rack->use_lesser_lt_bw = 1; 24726 rack->dis_lt_bw = 0; 24727 } else if (optval == 2) { 24728 rack->use_lesser_lt_bw = 0; 24729 rack->dis_lt_bw = 0; 24730 } 24731 break; 24732 case TCP_DATA_AFTER_CLOSE: 24733 RACK_OPTS_INC(tcp_data_after_close); 24734 if (optval) 24735 rack->rc_allow_data_af_clo = 1; 24736 else 24737 rack->rc_allow_data_af_clo = 0; 24738 break; 24739 default: 24740 break; 24741 } 24742 tcp_log_socket_option(tp, sopt_name, optval, error); 24743 return (error); 24744 } 24745 24746 static void 24747 rack_inherit(struct tcpcb *tp, struct inpcb *parent) 24748 { 24749 /* 24750 * A new connection has been created (tp) and 24751 * the parent is the inpcb given. We want to 24752 * apply a read-lock to the parent (we are already 24753 * holding a write lock on the tp) and copy anything 24754 * out of the rack specific data as long as its tfb is 24755 * the same as ours i.e. we are the same stack. Otherwise 24756 * we just return. 24757 */ 24758 struct tcpcb *par; 24759 struct tcp_rack *dest, *src; 24760 int cnt = 0; 24761 24762 par = intotcpcb(parent); 24763 if (par->t_fb != tp->t_fb) { 24764 /* Not the same stack */ 24765 tcp_log_socket_option(tp, 0, 0, 1); 24766 return; 24767 } 24768 /* Ok if we reach here lets setup the two rack pointers */ 24769 dest = (struct tcp_rack *)tp->t_fb_ptr; 24770 src = (struct tcp_rack *)par->t_fb_ptr; 24771 if ((src == NULL) || (dest == NULL)) { 24772 /* Huh? */ 24773 tcp_log_socket_option(tp, 0, 0, 2); 24774 return; 24775 } 24776 /* Now copy out anything we wish to inherit i.e. 
things in socket-options */ 24777 /* TCP_RACK_PROFILE we can't know but we can set DGP if its on */ 24778 if ((src->dgp_on) && (dest->dgp_on == 0)) { 24779 /* Profile 1 had to be set via sock opt */ 24780 rack_set_dgp(dest); 24781 cnt++; 24782 } 24783 /* TCP_RACK_SET_RXT_OPTIONS */ 24784 if (dest->full_size_rxt != src->full_size_rxt) { 24785 dest->full_size_rxt = src->full_size_rxt; 24786 cnt++; 24787 } 24788 if (dest->shape_rxt_to_pacing_min != src->shape_rxt_to_pacing_min) { 24789 dest->shape_rxt_to_pacing_min = src->shape_rxt_to_pacing_min; 24790 cnt++; 24791 } 24792 /* TCP_RACK_DSACK_OPT */ 24793 if (dest->rc_rack_tmr_std_based != src->rc_rack_tmr_std_based) { 24794 dest->rc_rack_tmr_std_based = src->rc_rack_tmr_std_based; 24795 cnt++; 24796 } 24797 if (dest->rc_rack_use_dsack != src->rc_rack_use_dsack) { 24798 dest->rc_rack_use_dsack = src->rc_rack_use_dsack; 24799 cnt++; 24800 } 24801 /* TCP_RACK_PACING_DIVISOR */ 24802 if (dest->r_ctl.pace_len_divisor != src->r_ctl.pace_len_divisor) { 24803 dest->r_ctl.pace_len_divisor = src->r_ctl.pace_len_divisor; 24804 cnt++; 24805 } 24806 /* TCP_RACK_HI_BETA */ 24807 if (src->rack_hibeta != dest->rack_hibeta) { 24808 cnt++; 24809 if (src->rack_hibeta) { 24810 dest->r_ctl.rc_saved_beta.beta = src->r_ctl.rc_saved_beta.beta; 24811 dest->rack_hibeta = 1; 24812 } else { 24813 dest->rack_hibeta = 0; 24814 } 24815 } 24816 /* TCP_RACK_TIMER_SLOP */ 24817 if (dest->r_ctl.timer_slop != src->r_ctl.timer_slop) { 24818 dest->r_ctl.timer_slop = src->r_ctl.timer_slop; 24819 cnt++; 24820 } 24821 /* TCP_RACK_PACING_BETA_ECN */ 24822 if (dest->r_ctl.rc_saved_beta.beta_ecn != src->r_ctl.rc_saved_beta.beta_ecn) { 24823 dest->r_ctl.rc_saved_beta.beta_ecn = src->r_ctl.rc_saved_beta.beta_ecn; 24824 cnt++; 24825 } 24826 if (dest->r_ctl.rc_saved_beta.newreno_flags != src->r_ctl.rc_saved_beta.newreno_flags) { 24827 dest->r_ctl.rc_saved_beta.newreno_flags = src->r_ctl.rc_saved_beta.newreno_flags; 24828 cnt++; 24829 } 24830 /* We do not do TCP_DEFER_OPTIONS */ 24831 /* TCP_RACK_MEASURE_CNT */ 24832 if (dest->r_ctl.req_measurements != src->r_ctl.req_measurements) { 24833 dest->r_ctl.req_measurements = src->r_ctl.req_measurements; 24834 cnt++; 24835 } 24836 /* TCP_HDWR_UP_ONLY */ 24837 if (dest->r_up_only != src->r_up_only) { 24838 dest->r_up_only = src->r_up_only; 24839 cnt++; 24840 } 24841 /* TCP_FILLCW_RATE_CAP */ 24842 if (dest->r_ctl.fillcw_cap != src->r_ctl.fillcw_cap) { 24843 dest->r_ctl.fillcw_cap = src->r_ctl.fillcw_cap; 24844 cnt++; 24845 } 24846 /* TCP_PACING_RATE_CAP */ 24847 if (dest->r_ctl.bw_rate_cap != src->r_ctl.bw_rate_cap) { 24848 dest->r_ctl.bw_rate_cap = src->r_ctl.bw_rate_cap; 24849 cnt++; 24850 } 24851 /* A listener can't set TCP_HYBRID_PACING */ 24852 /* TCP_SIDECHAN_DIS */ 24853 if (dest->r_ctl.side_chan_dis_mask != src->r_ctl.side_chan_dis_mask) { 24854 dest->r_ctl.side_chan_dis_mask = src->r_ctl.side_chan_dis_mask; 24855 cnt++; 24856 } 24857 /* TCP_SHARED_CWND_TIME_LIMIT */ 24858 if (dest->r_limit_scw != src->r_limit_scw) { 24859 dest->r_limit_scw = src->r_limit_scw; 24860 cnt++; 24861 } 24862 /* TCP_POLICER_DETECT */ 24863 if (dest->r_ctl.policer_rxt_threshold != src->r_ctl.policer_rxt_threshold) { 24864 dest->r_ctl.policer_rxt_threshold = src->r_ctl.policer_rxt_threshold; 24865 cnt++; 24866 } 24867 if (dest->r_ctl.policer_avg_threshold != src->r_ctl.policer_avg_threshold) { 24868 dest->r_ctl.policer_avg_threshold = src->r_ctl.policer_avg_threshold; 24869 cnt++; 24870 } 24871 if (dest->r_ctl.policer_med_threshold != 
src->r_ctl.policer_med_threshold) { 24872 dest->r_ctl.policer_med_threshold = src->r_ctl.policer_med_threshold; 24873 cnt++; 24874 } 24875 if (dest->policer_detect_on != src->policer_detect_on) { 24876 dest->policer_detect_on = src->policer_detect_on; 24877 cnt++; 24878 } 24879 24880 if (dest->r_ctl.saved_policer_val != src->r_ctl.saved_policer_val) { 24881 dest->r_ctl.saved_policer_val = src->r_ctl.saved_policer_val; 24882 cnt++; 24883 } 24884 /* TCP_POLICER_MSS */ 24885 if (dest->r_ctl.policer_del_mss != src->r_ctl.policer_del_mss) { 24886 dest->r_ctl.policer_del_mss = src->r_ctl.policer_del_mss; 24887 cnt++; 24888 } 24889 24890 if (dest->r_ctl.pol_bw_comp != src->r_ctl.pol_bw_comp) { 24891 dest->r_ctl.pol_bw_comp = src->r_ctl.pol_bw_comp; 24892 cnt++; 24893 } 24894 24895 if (dest->r_ctl.policer_alt_median != src->r_ctl.policer_alt_median) { 24896 dest->r_ctl.policer_alt_median = src->r_ctl.policer_alt_median; 24897 cnt++; 24898 } 24899 /* TCP_RACK_PACE_TO_FILL */ 24900 if (dest->rc_pace_to_cwnd != src->rc_pace_to_cwnd) { 24901 dest->rc_pace_to_cwnd = src->rc_pace_to_cwnd; 24902 cnt++; 24903 } 24904 if (dest->rc_pace_fill_if_rttin_range != src->rc_pace_fill_if_rttin_range) { 24905 dest->rc_pace_fill_if_rttin_range = src->rc_pace_fill_if_rttin_range; 24906 cnt++; 24907 } 24908 if (dest->rtt_limit_mul != src->rtt_limit_mul) { 24909 dest->rtt_limit_mul = src->rtt_limit_mul; 24910 cnt++; 24911 } 24912 /* TCP_RACK_NO_PUSH_AT_MAX */ 24913 if (dest->r_ctl.rc_no_push_at_mrtt != src->r_ctl.rc_no_push_at_mrtt) { 24914 dest->r_ctl.rc_no_push_at_mrtt = src->r_ctl.rc_no_push_at_mrtt; 24915 cnt++; 24916 } 24917 /* TCP_SHARED_CWND_ENABLE */ 24918 if (dest->rack_enable_scwnd != src->rack_enable_scwnd) { 24919 dest->rack_enable_scwnd = src->rack_enable_scwnd; 24920 cnt++; 24921 } 24922 /* TCP_USE_CMP_ACKS */ 24923 if (dest->r_use_cmp_ack != src->r_use_cmp_ack) { 24924 dest->r_use_cmp_ack = src->r_use_cmp_ack; 24925 cnt++; 24926 } 24927 24928 if (dest->r_mbuf_queue != src->r_mbuf_queue) { 24929 dest->r_mbuf_queue = src->r_mbuf_queue; 24930 cnt++; 24931 } 24932 /* TCP_RACK_MBUF_QUEUE */ 24933 if (dest->r_mbuf_queue != src->r_mbuf_queue) { 24934 dest->r_mbuf_queue = src->r_mbuf_queue; 24935 cnt++; 24936 } 24937 if (dest->r_mbuf_queue || dest->rc_always_pace || dest->r_use_cmp_ack) { 24938 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 24939 } else { 24940 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ; 24941 } 24942 if (dest->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) { 24943 tp->t_flags2 |= TF2_MBUF_ACKCMP; 24944 } 24945 /* TCP_RACK_NONRXT_CFG_RATE */ 24946 if (dest->rack_rec_nonrxt_use_cr != src->rack_rec_nonrxt_use_cr) { 24947 dest->rack_rec_nonrxt_use_cr = src->rack_rec_nonrxt_use_cr; 24948 cnt++; 24949 } 24950 /* TCP_NO_PRR */ 24951 if (dest->rack_no_prr != src->rack_no_prr) { 24952 dest->rack_no_prr = src->rack_no_prr; 24953 cnt++; 24954 } 24955 if (dest->no_prr_addback != src->no_prr_addback) { 24956 dest->no_prr_addback = src->no_prr_addback; 24957 cnt++; 24958 } 24959 /* RACK_CSPR_IS_FCC */ 24960 if (dest->cspr_is_fcc != src->cspr_is_fcc) { 24961 dest->cspr_is_fcc = src->cspr_is_fcc; 24962 cnt++; 24963 } 24964 /* TCP_TIMELY_DYN_ADJ */ 24965 if (dest->rc_gp_dyn_mul != src->rc_gp_dyn_mul) { 24966 dest->rc_gp_dyn_mul = src->rc_gp_dyn_mul; 24967 cnt++; 24968 } 24969 if (dest->r_ctl.rack_per_of_gp_ca != src->r_ctl.rack_per_of_gp_ca) { 24970 dest->r_ctl.rack_per_of_gp_ca = src->r_ctl.rack_per_of_gp_ca; 24971 cnt++; 24972 } 24973 /* TCP_RACK_TLP_USE */ 24974 if (dest->rack_tlp_threshold_use != 
src->rack_tlp_threshold_use) { 24975 dest->rack_tlp_threshold_use = src->rack_tlp_threshold_use; 24976 cnt++; 24977 } 24978 /* we don't allow inheritence of TCP_RACK_PACE_ALWAYS */ 24979 /* TCP_BBR_RACK_INIT_RATE */ 24980 if (dest->r_ctl.init_rate != src->r_ctl.init_rate) { 24981 dest->r_ctl.init_rate = src->r_ctl.init_rate; 24982 cnt++; 24983 } 24984 /* TCP_RACK_FORCE_MSEG */ 24985 if (dest->rc_force_max_seg != src->rc_force_max_seg) { 24986 dest->rc_force_max_seg = src->rc_force_max_seg; 24987 cnt++; 24988 } 24989 /* TCP_RACK_PACE_MIN_SEG */ 24990 if (dest->r_ctl.rc_user_set_min_segs != src->r_ctl.rc_user_set_min_segs) { 24991 dest->r_ctl.rc_user_set_min_segs = src->r_ctl.rc_user_set_min_segs; 24992 cnt++; 24993 } 24994 /* we don't allow TCP_RACK_PACE_MAX_SEG */ 24995 /* TCP_RACK_PACE_RATE_REC, TCP_RACK_PACE_RATE_SS, TCP_RACK_PACE_RATE_CA */ 24996 if (dest->r_ctl.rc_fixed_pacing_rate_ca != src->r_ctl.rc_fixed_pacing_rate_ca) { 24997 dest->r_ctl.rc_fixed_pacing_rate_ca = src->r_ctl.rc_fixed_pacing_rate_ca; 24998 cnt++; 24999 } 25000 if (dest->r_ctl.rc_fixed_pacing_rate_ss != src->r_ctl.rc_fixed_pacing_rate_ss) { 25001 dest->r_ctl.rc_fixed_pacing_rate_ss = src->r_ctl.rc_fixed_pacing_rate_ss; 25002 cnt++; 25003 } 25004 if (dest->r_ctl.rc_fixed_pacing_rate_rec != src->r_ctl.rc_fixed_pacing_rate_rec) { 25005 dest->r_ctl.rc_fixed_pacing_rate_rec = src->r_ctl.rc_fixed_pacing_rate_rec; 25006 cnt++; 25007 } 25008 /* TCP_RACK_GP_INCREASE_REC, TCP_RACK_GP_INCREASE_CA, TCP_RACK_GP_INCREASE_SS */ 25009 if (dest->r_ctl.rack_per_of_gp_rec != src->r_ctl.rack_per_of_gp_rec) { 25010 dest->r_ctl.rack_per_of_gp_rec = src->r_ctl.rack_per_of_gp_rec; 25011 cnt++; 25012 } 25013 if (dest->r_ctl.rack_per_of_gp_ca != src->r_ctl.rack_per_of_gp_ca) { 25014 dest->r_ctl.rack_per_of_gp_ca = src->r_ctl.rack_per_of_gp_ca; 25015 cnt++; 25016 } 25017 25018 if (dest->r_ctl.rack_per_of_gp_ss != src->r_ctl.rack_per_of_gp_ss) { 25019 dest->r_ctl.rack_per_of_gp_ss = src->r_ctl.rack_per_of_gp_ss; 25020 cnt++; 25021 } 25022 /* TCP_RACK_RR_CONF */ 25023 if (dest->r_rr_config != src->r_rr_config) { 25024 dest->r_rr_config = src->r_rr_config; 25025 cnt++; 25026 } 25027 /* TCP_PACING_DND */ 25028 if (dest->rc_pace_dnd != src->rc_pace_dnd) { 25029 dest->rc_pace_dnd = src->rc_pace_dnd; 25030 cnt++; 25031 } 25032 /* TCP_HDWR_RATE_CAP */ 25033 if (dest->r_rack_hw_rate_caps != src->r_rack_hw_rate_caps) { 25034 dest->r_rack_hw_rate_caps = src->r_rack_hw_rate_caps; 25035 cnt++; 25036 } 25037 /* TCP_DGP_UPPER_BOUNDS */ 25038 if (dest->r_ctl.rack_per_upper_bound_ca != src->r_ctl.rack_per_upper_bound_ca) { 25039 dest->r_ctl.rack_per_upper_bound_ca = src->r_ctl.rack_per_upper_bound_ca; 25040 cnt++; 25041 } 25042 if (dest->r_ctl.rack_per_upper_bound_ss != src->r_ctl.rack_per_upper_bound_ss) { 25043 dest->r_ctl.rack_per_upper_bound_ss = src->r_ctl.rack_per_upper_bound_ss; 25044 cnt++; 25045 } 25046 /* TCP_SS_EEXIT */ 25047 if (dest->r_ctl.gp_rnd_thresh != src->r_ctl.gp_rnd_thresh) { 25048 dest->r_ctl.gp_rnd_thresh = src->r_ctl.gp_rnd_thresh; 25049 cnt++; 25050 } 25051 if (dest->r_ctl.gate_to_fs != src->r_ctl.gate_to_fs) { 25052 dest->r_ctl.gate_to_fs = src->r_ctl.gate_to_fs; 25053 cnt++; 25054 } 25055 if (dest->r_ctl.use_gp_not_last != src->r_ctl.use_gp_not_last) { 25056 dest->r_ctl.use_gp_not_last = src->r_ctl.use_gp_not_last; 25057 cnt++; 25058 } 25059 if (dest->r_ctl.gp_gain_req != src->r_ctl.gp_gain_req) { 25060 dest->r_ctl.gp_gain_req = src->r_ctl.gp_gain_req; 25061 cnt++; 25062 } 25063 /* TCP_BBR_HDWR_PACE */ 25064 if 
(dest->rack_hdw_pace_ena != src->rack_hdw_pace_ena) { 25065 dest->rack_hdw_pace_ena = src->rack_hdw_pace_ena; 25066 cnt++; 25067 } 25068 if (dest->rack_attempt_hdwr_pace != src->rack_attempt_hdwr_pace) { 25069 dest->rack_attempt_hdwr_pace = src->rack_attempt_hdwr_pace; 25070 cnt++; 25071 } 25072 /* TCP_RACK_PRR_SENDALOT */ 25073 if (dest->r_ctl.rc_prr_sendalot != src->r_ctl.rc_prr_sendalot) { 25074 dest->r_ctl.rc_prr_sendalot = src->r_ctl.rc_prr_sendalot; 25075 cnt++; 25076 } 25077 /* TCP_RACK_MIN_TO */ 25078 if (dest->r_ctl.rc_min_to != src->r_ctl.rc_min_to) { 25079 dest->r_ctl.rc_min_to = src->r_ctl.rc_min_to; 25080 cnt++; 25081 } 25082 /* TCP_RACK_EARLY_SEG */ 25083 if (dest->r_ctl.rc_early_recovery_segs != src->r_ctl.rc_early_recovery_segs) { 25084 dest->r_ctl.rc_early_recovery_segs = src->r_ctl.rc_early_recovery_segs; 25085 cnt++; 25086 } 25087 /* TCP_RACK_ENABLE_HYSTART */ 25088 if (par->t_ccv.flags != tp->t_ccv.flags) { 25089 cnt++; 25090 if (par->t_ccv.flags & CCF_HYSTART_ALLOWED) { 25091 tp->t_ccv.flags |= CCF_HYSTART_ALLOWED; 25092 if (rack_do_hystart > RACK_HYSTART_ON) 25093 tp->t_ccv.flags |= CCF_HYSTART_CAN_SH_CWND; 25094 if (rack_do_hystart > RACK_HYSTART_ON_W_SC) 25095 tp->t_ccv.flags |= CCF_HYSTART_CONS_SSTH; 25096 } else { 25097 tp->t_ccv.flags &= ~(CCF_HYSTART_ALLOWED|CCF_HYSTART_CAN_SH_CWND|CCF_HYSTART_CONS_SSTH); 25098 } 25099 } 25100 /* TCP_RACK_REORD_THRESH */ 25101 if (dest->r_ctl.rc_reorder_shift != src->r_ctl.rc_reorder_shift) { 25102 dest->r_ctl.rc_reorder_shift = src->r_ctl.rc_reorder_shift; 25103 cnt++; 25104 } 25105 /* TCP_RACK_REORD_FADE */ 25106 if (dest->r_ctl.rc_reorder_fade != src->r_ctl.rc_reorder_fade) { 25107 dest->r_ctl.rc_reorder_fade = src->r_ctl.rc_reorder_fade; 25108 cnt++; 25109 } 25110 /* TCP_RACK_TLP_THRESH */ 25111 if (dest->r_ctl.rc_tlp_threshold != src->r_ctl.rc_tlp_threshold) { 25112 dest->r_ctl.rc_tlp_threshold = src->r_ctl.rc_tlp_threshold; 25113 cnt++; 25114 } 25115 /* TCP_BBR_USE_RACK_RR */ 25116 if (dest->use_rack_rr != src->use_rack_rr) { 25117 dest->use_rack_rr = src->use_rack_rr; 25118 cnt++; 25119 } 25120 /* TCP_RACK_PKT_DELAY */ 25121 if (dest->r_ctl.rc_pkt_delay != src->r_ctl.rc_pkt_delay) { 25122 dest->r_ctl.rc_pkt_delay = src->r_ctl.rc_pkt_delay; 25123 cnt++; 25124 } 25125 /* TCP_DELACK will get copied via the main code if applicable */ 25126 /* TCP_BBR_RACK_RTT_USE */ 25127 if (dest->r_ctl.rc_rate_sample_method != src->r_ctl.rc_rate_sample_method) { 25128 dest->r_ctl.rc_rate_sample_method = src->r_ctl.rc_rate_sample_method; 25129 cnt++; 25130 } 25131 /* TCP_HONOR_HPTS_MIN */ 25132 if (dest->r_use_hpts_min != src->r_use_hpts_min) { 25133 dest->r_use_hpts_min = src->r_use_hpts_min; 25134 cnt++; 25135 } 25136 if (dest->r_ctl.max_reduction != src->r_ctl.max_reduction) { 25137 dest->r_ctl.max_reduction = src->r_ctl.max_reduction; 25138 cnt++; 25139 } 25140 /* TCP_REC_IS_DYN */ 25141 if (dest->rc_gp_no_rec_chg != src->rc_gp_no_rec_chg) { 25142 dest->rc_gp_no_rec_chg = src->rc_gp_no_rec_chg; 25143 cnt++; 25144 } 25145 if (dest->rc_skip_timely != src->rc_skip_timely) { 25146 dest->rc_skip_timely = src->rc_skip_timely; 25147 cnt++; 25148 } 25149 /* TCP_DATA_AFTER_CLOSE */ 25150 if (dest->rc_allow_data_af_clo != src->rc_allow_data_af_clo) { 25151 dest->rc_allow_data_af_clo = src->rc_allow_data_af_clo; 25152 cnt++; 25153 } 25154 /* TCP_GP_USE_LTBW */ 25155 if (src->use_lesser_lt_bw != dest->use_lesser_lt_bw) { 25156 dest->use_lesser_lt_bw = src->use_lesser_lt_bw; 25157 cnt++; 25158 } 25159 if (dest->dis_lt_bw != src->dis_lt_bw) { 25160 
dest->dis_lt_bw = src->dis_lt_bw; 25161 cnt++; 25162 } 25163 tcp_log_socket_option(tp, 0, cnt, 0); 25164 } 25165 25166 25167 static void 25168 rack_apply_deferred_options(struct tcp_rack *rack) 25169 { 25170 struct deferred_opt_list *dol, *sdol; 25171 uint32_t s_optval; 25172 25173 TAILQ_FOREACH_SAFE(dol, &rack->r_ctl.opt_list, next, sdol) { 25174 TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next); 25175 /* Disadvantage of deferal is you loose the error return */ 25176 s_optval = (uint32_t)dol->optval; 25177 (void)rack_process_option(rack->rc_tp, rack, dol->optname, s_optval, dol->optval, NULL); 25178 free(dol, M_TCPDO); 25179 } 25180 } 25181 25182 static void 25183 rack_hw_tls_change(struct tcpcb *tp, int chg) 25184 { 25185 /* Update HW tls state */ 25186 struct tcp_rack *rack; 25187 25188 rack = (struct tcp_rack *)tp->t_fb_ptr; 25189 if (chg) 25190 rack->r_ctl.fsb.hw_tls = 1; 25191 else 25192 rack->r_ctl.fsb.hw_tls = 0; 25193 } 25194 25195 static int 25196 rack_pru_options(struct tcpcb *tp, int flags) 25197 { 25198 if (flags & PRUS_OOB) 25199 return (EOPNOTSUPP); 25200 return (0); 25201 } 25202 25203 static bool 25204 rack_wake_check(struct tcpcb *tp) 25205 { 25206 struct tcp_rack *rack; 25207 struct timeval tv; 25208 uint32_t cts; 25209 25210 rack = (struct tcp_rack *)tp->t_fb_ptr; 25211 if (rack->r_ctl.rc_hpts_flags) { 25212 cts = tcp_get_usecs(&tv); 25213 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == PACE_PKT_OUTPUT){ 25214 /* 25215 * Pacing timer is up, check if we are ready. 25216 */ 25217 if (TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) 25218 return (true); 25219 } else if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) != 0) { 25220 /* 25221 * A timer is up, check if we are ready. 25222 */ 25223 if (TSTMP_GEQ(cts, rack->r_ctl.rc_timer_exp)) 25224 return (true); 25225 } 25226 } 25227 return (false); 25228 } 25229 25230 static struct tcp_function_block __tcp_rack = { 25231 .tfb_tcp_block_name = __XSTRING(STACKNAME), 25232 .tfb_tcp_output = rack_output, 25233 .tfb_do_queued_segments = ctf_do_queued_segments, 25234 .tfb_do_segment_nounlock = rack_do_segment_nounlock, 25235 .tfb_tcp_do_segment = rack_do_segment, 25236 .tfb_tcp_ctloutput = rack_ctloutput, 25237 .tfb_tcp_fb_init = rack_init, 25238 .tfb_tcp_fb_fini = rack_fini, 25239 .tfb_tcp_timer_stop_all = rack_stopall, 25240 .tfb_tcp_rexmit_tmr = rack_remxt_tmr, 25241 .tfb_tcp_handoff_ok = rack_handoff_ok, 25242 .tfb_tcp_mtu_chg = rack_mtu_change, 25243 .tfb_pru_options = rack_pru_options, 25244 .tfb_hwtls_change = rack_hw_tls_change, 25245 .tfb_chg_query = rack_chg_query, 25246 .tfb_switch_failed = rack_switch_failed, 25247 .tfb_early_wake_check = rack_wake_check, 25248 .tfb_compute_pipe = rack_compute_pipe, 25249 .tfb_stack_info = rack_stack_information, 25250 .tfb_inherit = rack_inherit, 25251 .tfb_flags = TCP_FUNC_OUTPUT_CANDROP, 25252 25253 }; 25254 25255 /* 25256 * rack_ctloutput() must drop the inpcb lock before performing copyin on 25257 * socket option arguments. When it re-acquires the lock after the copy, it 25258 * has to revalidate that the connection is still valid for the socket 25259 * option. 
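 * A minimal sketch of that pattern (paraphrased, not the literal
 * code in rack_set_sockopt() below):
 *
 *	INP_WUNLOCK(inp);
 *	error = sooptcopyin(sopt, &optval, sizeof(optval),
 *	    sizeof(optval));
 *	...
 *	INP_WLOCK(inp);
 *	if (inp->inp_flags & INP_DROPPED) {
 *		INP_WUNLOCK(inp);
 *		return (ECONNRESET);
 *	}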
/*
 * rack_ctloutput() must drop the inpcb lock before performing copyin on
 * socket option arguments.  When it re-acquires the lock after the copy, it
 * has to revalidate that the connection is still valid for the socket
 * option.
 */
static int
rack_set_sockopt(struct tcpcb *tp, struct sockopt *sopt)
{
	struct inpcb *inp = tptoinpcb(tp);
#ifdef INET
	struct ip *ip;
#endif
	struct tcp_rack *rack;
	struct tcp_hybrid_req hybrid;
	uint64_t loptval;
	int32_t error = 0, optval;

	rack = (struct tcp_rack *)tp->t_fb_ptr;
	if (rack == NULL) {
		INP_WUNLOCK(inp);
		return (EINVAL);
	}
#ifdef INET
	ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
#endif

	switch (sopt->sopt_level) {
#ifdef INET6
	case IPPROTO_IPV6:
		MPASS(inp->inp_vflag & INP_IPV6PROTO);
		switch (sopt->sopt_name) {
		case IPV6_USE_MIN_MTU:
			tcp6_use_min_mtu(tp);
			break;
		}
		INP_WUNLOCK(inp);
		return (0);
#endif
#ifdef INET
	case IPPROTO_IP:
		switch (sopt->sopt_name) {
		case IP_TOS:
			/*
			 * The DSCP codepoint has changed, update the fsb.
			 */
			ip->ip_tos = rack->rc_inp->inp_ip_tos;
			break;
		case IP_TTL:
			/*
			 * The TTL has changed, update the fsb.
			 */
			ip->ip_ttl = rack->rc_inp->inp_ip_ttl;
			break;
		}
		INP_WUNLOCK(inp);
		return (0);
#endif
#ifdef SO_PEERPRIO
	case SOL_SOCKET:
		switch (sopt->sopt_name) {
		case SO_PEERPRIO:	/* SC-URL:bs */
			/* Already read in and sanity checked in sosetopt(). */
			if (inp->inp_socket) {
				rack->client_bufferlvl = inp->inp_socket->so_peerprio;
			}
			break;
		}
		INP_WUNLOCK(inp);
		return (0);
#endif
	case IPPROTO_TCP:
		switch (sopt->sopt_name) {
		case TCP_RACK_TLP_REDUCE:	/* URL:tlp_reduce */
		/* Pacing related ones */
		case TCP_RACK_PACE_ALWAYS:	/* URL:pace_always */
		case TCP_BBR_RACK_INIT_RATE:	/* URL:irate */
		case TCP_RACK_PACE_MIN_SEG:	/* URL:pace_min_seg */
		case TCP_RACK_PACE_MAX_SEG:	/* URL:pace_max_seg */
		case TCP_RACK_FORCE_MSEG:	/* URL:force_max_seg */
		case TCP_RACK_PACE_RATE_CA:	/* URL:pr_ca */
		case TCP_RACK_PACE_RATE_SS:	/* URL:pr_ss */
		case TCP_RACK_PACE_RATE_REC:	/* URL:pr_rec */
		case TCP_RACK_GP_INCREASE_CA:	/* URL:gp_inc_ca */
		case TCP_RACK_GP_INCREASE_SS:	/* URL:gp_inc_ss */
		case TCP_RACK_GP_INCREASE_REC:	/* URL:gp_inc_rec */
		case TCP_RACK_RR_CONF:	/* URL:rrr_conf */
		case TCP_BBR_HDWR_PACE:	/* URL:hdwrpace */
		case TCP_HDWR_RATE_CAP:	/* URL:hdwrcap boolean */
		case TCP_PACING_RATE_CAP:	/* URL:cap -- used by side-channel */
		case TCP_HDWR_UP_ONLY:	/* URL:uponly -- hardware pacing boolean */
		case TCP_FILLCW_RATE_CAP:	/* URL:fillcw_cap */
		case TCP_RACK_PACING_BETA_ECN:	/* URL:pacing_beta_ecn */
		case TCP_RACK_PACE_TO_FILL:	/* URL:fillcw */
		/* End pacing related */
		case TCP_POLICER_DETECT:	/* URL:pol_det */
		case TCP_POLICER_MSS:	/* URL:pol_mss */
		case TCP_DELACK:	/* URL:delack (in base TCP i.e. tcp_hints along with cc etc.) */
		case TCP_RACK_PRR_SENDALOT:	/* URL:prr_sendalot */
		case TCP_RACK_MIN_TO:	/* URL:min_to */
		case TCP_RACK_EARLY_SEG:	/* URL:early_seg */
		case TCP_RACK_REORD_THRESH:	/* URL:reord_thresh */
		case TCP_RACK_REORD_FADE:	/* URL:reord_fade */
		case TCP_RACK_TLP_THRESH:	/* URL:tlp_thresh */
		case TCP_RACK_PKT_DELAY:	/* URL:pkt_delay */
		case TCP_RACK_TLP_USE:	/* URL:tlp_use */
		case TCP_BBR_RACK_RTT_USE:	/* URL:rttuse */
		case TCP_BBR_USE_RACK_RR:	/* URL:rackrr */
		case TCP_NO_PRR:	/* URL:noprr */
		case TCP_TIMELY_DYN_ADJ:	/* URL:dynamic */
		case TCP_DATA_AFTER_CLOSE:	/* no URL */
		case TCP_RACK_NONRXT_CFG_RATE:	/* URL:nonrxtcr */
		case TCP_SHARED_CWND_ENABLE:	/* URL:scwnd */
		case TCP_RACK_MBUF_QUEUE:	/* URL:mqueue */
		case TCP_RACK_NO_PUSH_AT_MAX:	/* URL:npush */
		case TCP_SHARED_CWND_TIME_LIMIT:	/* URL:lscwnd */
		case TCP_RACK_PROFILE:	/* URL:profile */
		case TCP_SIDECHAN_DIS:	/* URL:scodm */
		case TCP_HYBRID_PACING:	/* URL:pacing=hybrid */
		case TCP_USE_CMP_ACKS:	/* URL:cmpack */
		case TCP_RACK_ABC_VAL:	/* URL:labc */
		case TCP_REC_ABC_VAL:	/* URL:reclabc */
		case TCP_RACK_MEASURE_CNT:	/* URL:measurecnt */
		case TCP_DEFER_OPTIONS:	/* URL:defer */
		case TCP_RACK_DSACK_OPT:	/* URL:dsack */
		case TCP_RACK_TIMER_SLOP:	/* URL:timer_slop */
		case TCP_RACK_ENABLE_HYSTART:	/* URL:hystart */
		case TCP_RACK_SET_RXT_OPTIONS:	/* URL:rxtsz */
		case TCP_RACK_HI_BETA:	/* URL:hibeta */
		case TCP_RACK_SPLIT_LIMIT:	/* URL:split */
		case TCP_SS_EEXIT:	/* URL:eexit */
		case TCP_DGP_UPPER_BOUNDS:	/* URL:upper */
		case TCP_RACK_PACING_DIVISOR:	/* URL:divisor */
		case TCP_PACING_DND:	/* URL:dnd */
		case TCP_NO_TIMELY:	/* URL:notimely */
		case RACK_CSPR_IS_FCC:	/* URL:csprisfcc */
		case TCP_HONOR_HPTS_MIN:	/* URL:hptsmin */
		case TCP_REC_IS_DYN:	/* URL:dynrec */
		case TCP_GP_USE_LTBW:	/* URL:useltbw */
			goto process_opt;
			break;
		default:
			/* Filter off all unknown options to the base stack */
			return (tcp_default_ctloutput(tp, sopt));
			break;
		}
	default:
		INP_WUNLOCK(inp);
		return (0);
	}
process_opt:
	INP_WUNLOCK(inp);
	if ((sopt->sopt_name == TCP_PACING_RATE_CAP) ||
	    (sopt->sopt_name == TCP_FILLCW_RATE_CAP)) {
		error = sooptcopyin(sopt, &loptval, sizeof(loptval), sizeof(loptval));
		/*
		 * We truncate it down to 32 bits for the socket-option trace;
		 * this means rates > 34Gbps won't show right, but that's
		 * probably ok.
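		 * (For reference, a 32-bit count of bytes/sec tops out at
		 * 2^32 bytes/sec, which is roughly 34 Gbit/s; that is where
		 * the figure above comes from.)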
		 */
		optval = (uint32_t)loptval;
	} else if (sopt->sopt_name == TCP_HYBRID_PACING) {
		error = sooptcopyin(sopt, &hybrid, sizeof(hybrid), sizeof(hybrid));
	} else {
		error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
		/* Save it in 64 bit form too */
		loptval = optval;
	}
	if (error)
		return (error);
	INP_WLOCK(inp);
	if (tp->t_fb != &__tcp_rack) {
		INP_WUNLOCK(inp);
		return (ENOPROTOOPT);
	}
	if (rack->defer_options && (rack->gp_ready == 0) &&
	    (sopt->sopt_name != TCP_DEFER_OPTIONS) &&
	    (sopt->sopt_name != TCP_HYBRID_PACING) &&
	    (sopt->sopt_name != TCP_RACK_SET_RXT_OPTIONS) &&
	    (sopt->sopt_name != TCP_RACK_PACING_BETA_ECN) &&
	    (sopt->sopt_name != TCP_RACK_MEASURE_CNT)) {
		/* Options are being deferred */
		if (rack_add_deferred_option(rack, sopt->sopt_name, loptval)) {
			INP_WUNLOCK(inp);
			return (0);
		} else {
			/* No memory to defer, fail */
			INP_WUNLOCK(inp);
			return (ENOMEM);
		}
	}
	error = rack_process_option(tp, rack, sopt->sopt_name, optval, loptval, &hybrid);
	INP_WUNLOCK(inp);
	return (error);
}

static void
rack_fill_info(struct tcpcb *tp, struct tcp_info *ti)
{

	INP_WLOCK_ASSERT(tptoinpcb(tp));
	bzero(ti, sizeof(*ti));

	ti->tcpi_state = tp->t_state;
	if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
		ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
	if (tp->t_flags & TF_SACK_PERMIT)
		ti->tcpi_options |= TCPI_OPT_SACK;
	if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
		ti->tcpi_options |= TCPI_OPT_WSCALE;
		ti->tcpi_snd_wscale = tp->snd_scale;
		ti->tcpi_rcv_wscale = tp->rcv_scale;
	}
	if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))
		ti->tcpi_options |= TCPI_OPT_ECN;
	if (tp->t_flags & TF_FASTOPEN)
		ti->tcpi_options |= TCPI_OPT_TFO;
	/* t_rcvtime is still kept in ticks */
	ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick;
	/* Since we hold everything in precise useconds this is easy */
	ti->tcpi_rtt = tp->t_srtt;
	ti->tcpi_rttvar = tp->t_rttvar;
	ti->tcpi_rto = tp->t_rxtcur;
	ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
	ti->tcpi_snd_cwnd = tp->snd_cwnd;
	/*
	 * FreeBSD-specific extension fields for tcp_info.
	 */
	ti->tcpi_rcv_space = tp->rcv_wnd;
	ti->tcpi_rcv_nxt = tp->rcv_nxt;
	ti->tcpi_snd_wnd = tp->snd_wnd;
	ti->tcpi_snd_bwnd = 0;	/* Unused, kept for compat. */
	ti->tcpi_snd_nxt = tp->snd_nxt;
	ti->tcpi_snd_mss = tp->t_maxseg;
	ti->tcpi_rcv_mss = tp->t_maxseg;
	ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
	ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
	ti->tcpi_snd_zerowin = tp->t_sndzerowin;
	ti->tcpi_total_tlp = tp->t_sndtlppack;
	ti->tcpi_total_tlp_bytes = tp->t_sndtlpbyte;
	ti->tcpi_rttmin = tp->t_rttlow;
#ifdef NETFLIX_STATS
	memcpy(&ti->tcpi_rxsyninfo, &tp->t_rxsyninfo, sizeof(struct tcpsyninfo));
#endif
#ifdef TCP_OFFLOAD
	if (tp->t_flags & TF_TOE) {
		ti->tcpi_options |= TCPI_OPT_TOE;
		tcp_offload_tcp_info(tp, ti);
	}
#endif
}

static int
rack_get_sockopt(struct tcpcb *tp, struct sockopt *sopt)
{
	struct inpcb *inp = tptoinpcb(tp);
	struct tcp_rack *rack;
	int32_t error, optval;
	uint64_t val, loptval;
	struct tcp_info ti;
	/*
	 * Because all of our options are either a boolean or an int, we can
	 * just pull everything into optval and then unlock and copy. If we
	 * ever add an option that is not an int, then this will have quite an
	 * impact on this routine.
	 */
	error = 0;
	rack = (struct tcp_rack *)tp->t_fb_ptr;
	if (rack == NULL) {
		INP_WUNLOCK(inp);
		return (EINVAL);
	}
	switch (sopt->sopt_name) {
	case TCP_INFO:
		/* First get the info filled */
		rack_fill_info(tp, &ti);
		/* Fix up the rtt related fields if needed */
		INP_WUNLOCK(inp);
		error = sooptcopyout(sopt, &ti, sizeof ti);
		return (error);
	/*
	 * Beta is the congestion control value for NewReno that influences how
	 * much of a backoff happens when loss is detected. It is normally set
	 * to 50 for 50%, i.e. the cwnd is reduced to 50% of its previous value
	 * when you exit recovery.
	 */
	case TCP_RACK_PACING_BETA:
		break;
		/*
		 * Beta_ecn is the congestion control value for NewReno that influences how
		 * much of a backoff happens when an ECN mark is detected. It is normally set
		 * to 80 for 80%, i.e. the cwnd is reduced to 80% of its previous value when
		 * you exit recovery. Note that classic ECN has a beta of 50; only
		 * ABE ECN uses this "less" value, but we do too with pacing :)
		 */

	case TCP_RACK_PACING_BETA_ECN:
		if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0)
			error = EINVAL;
		else if (rack->rc_pacing_cc_set == 0)
			optval = rack->r_ctl.rc_saved_beta.beta_ecn;
		else {
			/*
			 * Reach out into the CC data and report back what
			 * I have previously set. Yeah it looks hackish but
			 * we don't want to report the saved values.
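			 * (The reported value is a percentage: e.g. a
			 * beta_ecn of 80 means an ECN-signalled backoff
			 * leaves 80% of the prior cwnd, versus 50% with the
			 * classic beta.)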
			 */
			if (tp->t_ccv.cc_data)
				optval = ((struct newreno *)tp->t_ccv.cc_data)->beta_ecn;
			else
				error = EINVAL;
		}
		break;
	case TCP_RACK_DSACK_OPT:
		optval = 0;
		if (rack->rc_rack_tmr_std_based) {
			optval |= 1;
		}
		if (rack->rc_rack_use_dsack) {
			optval |= 2;
		}
		break;
	case TCP_RACK_ENABLE_HYSTART:
	{
		if (tp->t_ccv.flags & CCF_HYSTART_ALLOWED) {
			optval = RACK_HYSTART_ON;
			if (tp->t_ccv.flags & CCF_HYSTART_CAN_SH_CWND)
				optval = RACK_HYSTART_ON_W_SC;
			if (tp->t_ccv.flags & CCF_HYSTART_CONS_SSTH)
				optval = RACK_HYSTART_ON_W_SC_C;
		} else {
			optval = RACK_HYSTART_OFF;
		}
	}
		break;
	case TCP_RACK_DGP_IN_REC:
		error = EINVAL;
		break;
	case TCP_RACK_HI_BETA:
		optval = rack->rack_hibeta;
		break;
	case TCP_POLICER_MSS:
		optval = rack->r_ctl.policer_del_mss;
		break;
	case TCP_POLICER_DETECT:
		optval = rack->r_ctl.saved_policer_val;
		break;
	case TCP_DEFER_OPTIONS:
		optval = rack->defer_options;
		break;
	case TCP_RACK_MEASURE_CNT:
		optval = rack->r_ctl.req_measurements;
		break;
	case TCP_REC_ABC_VAL:
		optval = rack->r_use_labc_for_rec;
		break;
	case TCP_RACK_ABC_VAL:
		optval = rack->rc_labc;
		break;
	case TCP_HDWR_UP_ONLY:
		optval = rack->r_up_only;
		break;
	case TCP_FILLCW_RATE_CAP:
		loptval = rack->r_ctl.fillcw_cap;
		break;
	case TCP_PACING_RATE_CAP:
		loptval = rack->r_ctl.bw_rate_cap;
		break;
	case TCP_RACK_PROFILE:
		/* You cannot retrieve a profile, it's write only */
		error = EINVAL;
		break;
	case TCP_SIDECHAN_DIS:
		optval = rack->r_ctl.side_chan_dis_mask;
		break;
	case TCP_HYBRID_PACING:
		/* You cannot retrieve hybrid pacing information, it's write only */
		error = EINVAL;
		break;
	case TCP_USE_CMP_ACKS:
		optval = rack->r_use_cmp_ack;
		break;
	case TCP_RACK_PACE_TO_FILL:
		optval = rack->rc_pace_to_cwnd;
		break;
	case TCP_RACK_NO_PUSH_AT_MAX:
		optval = rack->r_ctl.rc_no_push_at_mrtt;
		break;
	case TCP_SHARED_CWND_ENABLE:
		optval = rack->rack_enable_scwnd;
		break;
	case TCP_RACK_NONRXT_CFG_RATE:
		optval = rack->rack_rec_nonrxt_use_cr;
		break;
	case TCP_NO_PRR:
		if (rack->rack_no_prr == 1)
			optval = 1;
		else if (rack->no_prr_addback == 1)
			optval = 2;
		else
			optval = 0;
		break;
	case TCP_GP_USE_LTBW:
		if (rack->dis_lt_bw) {
			/* It is not used */
			optval = 0;
		} else if (rack->use_lesser_lt_bw) {
			/* we use min() */
			optval = 1;
		} else {
			/* we use max() */
			optval = 2;
		}
		break;
	case TCP_RACK_DO_DETECTION:
		error = EINVAL;
		break;
	case TCP_RACK_MBUF_QUEUE:
		/* Now do we use the LRO mbuf-queue feature */
		optval = rack->r_mbuf_queue;
		break;
	case RACK_CSPR_IS_FCC:
		optval = rack->cspr_is_fcc;
		break;
	case TCP_TIMELY_DYN_ADJ:
		optval = rack->rc_gp_dyn_mul;
		break;
	case TCP_BBR_IWINTSO:
		error = EINVAL;
		break;
	case TCP_RACK_TLP_REDUCE:
		/* RACK TLP cwnd reduction (bool) */
		optval = rack->r_ctl.rc_tlp_cwnd_reduce;
		break;
	case TCP_BBR_RACK_INIT_RATE:
		val = rack->r_ctl.init_rate;
		/* convert to kbits per sec */
		val *= 8;
		val /= 1000;
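		/*
		 * For illustration: with init_rate in bytes/sec (per the
		 * conversion above), an init_rate of 12500000 is reported
		 * as 100000, i.e. 100 Mbit/s.
		 */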
		optval = (uint32_t)val;
		break;
	case TCP_RACK_FORCE_MSEG:
		optval = rack->rc_force_max_seg;
		break;
	case TCP_RACK_PACE_MIN_SEG:
		optval = rack->r_ctl.rc_user_set_min_segs;
		break;
	case TCP_RACK_PACE_MAX_SEG:
		/* Max segments in a pace */
		optval = rack->rc_user_set_max_segs;
		break;
	case TCP_RACK_PACE_ALWAYS:
		/* Use the always pace method */
		optval = rack->rc_always_pace;
		break;
	case TCP_RACK_PRR_SENDALOT:
		/* Allow PRR to send more than one seg */
		optval = rack->r_ctl.rc_prr_sendalot;
		break;
	case TCP_RACK_MIN_TO:
		/* Minimum time between rack t-o's in ms */
		optval = rack->r_ctl.rc_min_to;
		break;
	case TCP_RACK_SPLIT_LIMIT:
		optval = rack->r_ctl.rc_split_limit;
		break;
	case TCP_RACK_EARLY_SEG:
		/* Max segments sent in early recovery */
		optval = rack->r_ctl.rc_early_recovery_segs;
		break;
	case TCP_RACK_REORD_THRESH:
		/* RACK reorder threshold (shift amount) */
		optval = rack->r_ctl.rc_reorder_shift;
		break;
	case TCP_SS_EEXIT:
		if (rack->r_ctl.gp_rnd_thresh) {
			uint32_t v;

			v = rack->r_ctl.gp_gain_req;
			v <<= 17;
			optval = v | (rack->r_ctl.gp_rnd_thresh & 0xff);
			if (rack->r_ctl.gate_to_fs == 1)
				optval |= 0x10000;
		} else
			optval = 0;
		break;
	case TCP_RACK_REORD_FADE:
		/* Does reordering fade after ms time */
		optval = rack->r_ctl.rc_reorder_fade;
		break;
	case TCP_BBR_USE_RACK_RR:
		/* Do we use the rack cheat for rxt */
		optval = rack->use_rack_rr;
		break;
	case TCP_RACK_RR_CONF:
		optval = rack->r_rr_config;
		break;
	case TCP_HDWR_RATE_CAP:
		optval = rack->r_rack_hw_rate_caps;
		break;
	case TCP_BBR_HDWR_PACE:
		optval = rack->rack_hdw_pace_ena;
		break;
	case TCP_RACK_TLP_THRESH:
		/* RACK TLP threshold, i.e. srtt+(srtt/N) */
		optval = rack->r_ctl.rc_tlp_threshold;
		break;
	case TCP_RACK_PKT_DELAY:
		/* RACK added ms, i.e. rack-rtt + reord + N */
		optval = rack->r_ctl.rc_pkt_delay;
		break;
	case TCP_RACK_TLP_USE:
		optval = rack->rack_tlp_threshold_use;
		break;
	case TCP_PACING_DND:
		optval = rack->rc_pace_dnd;
		break;
	case TCP_RACK_PACE_RATE_CA:
		optval = rack->r_ctl.rc_fixed_pacing_rate_ca;
		break;
	case TCP_RACK_PACE_RATE_SS:
		optval = rack->r_ctl.rc_fixed_pacing_rate_ss;
		break;
	case TCP_RACK_PACE_RATE_REC:
		optval = rack->r_ctl.rc_fixed_pacing_rate_rec;
		break;
	case TCP_DGP_UPPER_BOUNDS:
		optval = rack->r_ctl.rack_per_upper_bound_ss;
		optval <<= 16;
		optval |= rack->r_ctl.rack_per_upper_bound_ca;
		break;
	case TCP_RACK_GP_INCREASE_SS:
		optval = rack->r_ctl.rack_per_of_gp_ss;
		break;
	case TCP_RACK_GP_INCREASE_CA:
		optval = rack->r_ctl.rack_per_of_gp_ca;
		break;
	case TCP_RACK_PACING_DIVISOR:
		optval = rack->r_ctl.pace_len_divisor;
		break;
	case TCP_BBR_RACK_RTT_USE:
		optval = rack->r_ctl.rc_rate_sample_method;
		break;
	case TCP_DELACK:
		optval = tp->t_delayed_ack;
		break;
	case TCP_DATA_AFTER_CLOSE:
		optval = rack->rc_allow_data_af_clo;
		break;
	case TCP_SHARED_CWND_TIME_LIMIT:
		optval = rack->r_limit_scw;
		break;
	case TCP_HONOR_HPTS_MIN:
		if (rack->r_use_hpts_min)
			optval = rack->r_ctl.max_reduction;
		else
			optval = 0;
		break;
	case TCP_REC_IS_DYN:
		optval = rack->rc_gp_no_rec_chg;
		break;
	case TCP_NO_TIMELY:
		optval = rack->rc_skip_timely;
		break;
	case TCP_RACK_TIMER_SLOP:
		optval = rack->r_ctl.timer_slop;
		break;
	default:
		return (tcp_default_ctloutput(tp, sopt));
		break;
	}
	INP_WUNLOCK(inp);
	if (error == 0) {
		if ((sopt->sopt_name == TCP_PACING_RATE_CAP) ||
		    (sopt->sopt_name == TCP_FILLCW_RATE_CAP))
			error = sooptcopyout(sopt, &loptval, sizeof loptval);
		else
			error = sooptcopyout(sopt, &optval, sizeof optval);
	}
	return (error);
}

static int
rack_ctloutput(struct tcpcb *tp, struct sockopt *sopt)
{
	if (sopt->sopt_dir == SOPT_SET) {
		return (rack_set_sockopt(tp, sopt));
	} else if (sopt->sopt_dir == SOPT_GET) {
		return (rack_get_sockopt(tp, sopt));
	} else {
		panic("%s: sopt_dir %d", __func__, sopt->sopt_dir);
	}
}

static const char *rack_stack_names[] = {
	__XSTRING(STACKNAME),
#ifdef STACKALIAS
	__XSTRING(STACKALIAS),
#endif
};

static int
rack_ctor(void *mem, int32_t size, void *arg, int32_t how)
{
	memset(mem, 0, size);
	return (0);
}

static void
rack_dtor(void *mem, int32_t size, void *arg)
{

}

static bool rack_mod_inited = false;

static int
tcp_addrack(module_t mod, int32_t type, void *data)
{
	int32_t err = 0;
	int num_stacks;

	switch (type) {
	case MOD_LOAD:
		rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map",
		    sizeof(struct rack_sendmap),
		    rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);

		rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb",
		    sizeof(struct tcp_rack),
		    rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);

		sysctl_ctx_init(&rack_sysctl_ctx);
		rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
		    OID_AUTO,
#ifdef STACKALIAS
		    __XSTRING(STACKALIAS),
#else
		    __XSTRING(STACKNAME),
#endif
		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
		    "");
		if (rack_sysctl_root == NULL) {
			printf("Failed to add sysctl node\n");
			err = EFAULT;
			goto free_uma;
		}
		rack_init_sysctls();
		num_stacks = nitems(rack_stack_names);
		err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK,
		    rack_stack_names, &num_stacks);
		if (err) {
			printf("Failed to register %s stack name for "
			    "%s module\n", rack_stack_names[num_stacks],
			    __XSTRING(MODNAME));
			sysctl_ctx_free(&rack_sysctl_ctx);
free_uma:
			uma_zdestroy(rack_zone);
			uma_zdestroy(rack_pcb_zone);
			rack_counter_destroy();
			printf("Failed to register rack module -- err:%d\n", err);
			return (err);
		}
		tcp_lro_reg_mbufq();
		rack_mod_inited = true;
		break;
	case MOD_QUIESCE:
		err = deregister_tcp_functions(&__tcp_rack, true, false);
		break;
	case MOD_UNLOAD:
		err = deregister_tcp_functions(&__tcp_rack, false, true);
		if (err == EBUSY)
			break;
		if (rack_mod_inited) {
			uma_zdestroy(rack_zone);
			uma_zdestroy(rack_pcb_zone);
			sysctl_ctx_free(&rack_sysctl_ctx);
			rack_counter_destroy();
			rack_mod_inited = false;
		}
		tcp_lro_dereg_mbufq();
		err = 0;
		break;
	default:
		return (EOPNOTSUPP);
	}
	return (err);
}

static moduledata_t tcp_rack = {
	.name = __XSTRING(MODNAME),
	.evhand = tcp_addrack,
	.priv = 0
};

MODULE_VERSION(MODNAME, 1);
DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1);

#endif /* defined(INET) || defined(INET6) */
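/*
 * Usage note (a sketch; the exact module and stack names depend on how
 * MODNAME/STACKNAME are set in the module build):
 *
 *	kldload tcp_rack
 *	sysctl net.inet.tcp.functions_available	  (the new stack is listed)
 *	sysctl net.inet.tcp.functions_default=rack
 *
 * The sysctl tree created in MOD_LOAD above appears under
 * net.inet.tcp.<stack name>.  Unloading goes through the
 * MOD_QUIESCE/MOD_UNLOAD paths and is refused with EBUSY while connections
 * are still using the stack.
 */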