/*-
 * Copyright (c) 2016-2020 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_ratelimit.h"
#include "opt_kern_tls.h"
#if defined(INET) || defined(INET6)
#include <sys/param.h>
#include <sys/arb.h>
#include <sys/module.h>
#include <sys/kernel.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#ifdef STATS
#include <sys/qmath.h>
#include <sys/tree.h>
#include <sys/stats.h>		/* Must come after qmath.h and tree.h */
#else
#include <sys/tree.h>
#endif
#include <sys/refcount.h>
#include <sys/queue.h>
#include <sys/tim_filter.h>
#include <sys/smp.h>
#include <sys/kthread.h>
#include <sys/kern_prefetch.h>
#include <sys/protosw.h>
#ifdef TCP_ACCOUNTING
#include <sys/sched.h>
#include <machine/cpu.h>
#endif
#include <vm/uma.h>

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/vnet.h>

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#define TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_ratelimit.h>
#include <netinet/tcp_accounting.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/cc/cc_newreno.h>
#include <netinet/tcp_fastopen.h>
#include <netinet/tcp_lro.h>
#ifdef NETFLIX_SHARED_CWND
#include <netinet/tcp_shared_cwnd.h>
#endif
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcp_ecn.h>

#include <netipsec/ipsec_support.h>

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif				/* IPSEC */

#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif
#include "sack_filter.h"
#include "tcp_rack.h"
#include "tailq_hash.h"
#include "rack_bbr_common.h"

uma_zone_t rack_zone;
uma_zone_t rack_pcb_zone;

#ifndef TICKS2SBT
#define	TICKS2SBT(__t)	(tick_sbt * ((sbintime_t)(__t)))
#endif

VNET_DECLARE(uint32_t, newreno_beta);
VNET_DECLARE(uint32_t, newreno_beta_ecn);
#define V_newreno_beta VNET(newreno_beta)
#define V_newreno_beta_ecn VNET(newreno_beta_ecn)


MALLOC_DEFINE(M_TCPFSB, "tcp_fsb", "TCP fast send block");
MALLOC_DEFINE(M_TCPDO, "tcp_do", "TCP deferred options");

struct sysctl_ctx_list rack_sysctl_ctx;
struct sysctl_oid *rack_sysctl_root;

#define CUM_ACKED 1
#define SACKED 2

/*
 * The RACK module incorporates a number of
 * TCP ideas that have been put out into the IETF
 * over the last few years:
 * - Matt Mathis's Rate Halving which slowly drops
 *   the congestion window so that the ack clock can
 *   be maintained during a recovery.
 * - Yuchung Cheng's RACK TCP (for which it is named) that
 *   will stop us using the number of dup acks and instead
 *   use time as the gauge of when we retransmit.
 * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
 *   of Dukkipati et al.
 * RACK depends on SACK, so if an endpoint arrives that
 * cannot do SACK the state machine below will shuttle the
 * connection back to using the "default" TCP stack that is
 * in FreeBSD.
 *
 * To implement RACK the original TCP stack was first decomposed
 * into a functional state machine with individual states
 * for each of the possible TCP connection states. The do_segment
 * function's role in life is to mandate that the connection supports SACK
 * initially and then assure that the RACK state matches the connection
 * state before calling the state's do_segment function. Each
 * state is simplified due to the fact that the original do_segment
 * has been decomposed and we *know* what state we are in (no
 * switches on the state) and all tests for SACK are gone. This
 * greatly simplifies what each state does.
 *
 * TCP output is also overwritten with a new version since it
 * must maintain the new rack scoreboard.
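 *
 * (To illustrate the decomposition described above, a sketch only and
 * not the literal dispatch code in this file: conceptually each
 * arriving segment is steered to a per-state handler, e.g.
 *
 *	if (tp->t_state == TCPS_ESTABLISHED)
 *		rack_do_established(m, th, so, tp, &to, ...);
 *	else if (tp->t_state == TCPS_FIN_WAIT_1)
 *		rack_do_fin_wait_1(m, th, so, tp, &to, ...);
 *
 * so each handler can assume SACK has been negotiated and that it is
 * only ever entered in its own connection state.)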
185 * 186 */ 187 static int32_t rack_tlp_thresh = 1; 188 static int32_t rack_tlp_limit = 2; /* No more than 2 TLPs w-out new data */ 189 static int32_t rack_tlp_use_greater = 1; 190 static int32_t rack_reorder_thresh = 2; 191 static int32_t rack_reorder_fade = 60000000; /* 0 - never fade, def 60,000,000 192 * - 60 seconds */ 193 static uint32_t rack_clamp_ss_upper = 110; 194 static uint32_t rack_clamp_ca_upper = 105; 195 static uint32_t rack_rxt_min_rnds = 10; /* Min rounds if drastic rxt clamp is in place */ 196 static uint32_t rack_unclamp_round_thresh = 100; /* number of perfect rounds before we unclamp */ 197 static uint32_t rack_unclamp_rxt_thresh = 5; /* .5% and under */ 198 static uint64_t rack_rxt_clamp_thresh = 0; /* Do we do the rxt clamp thing */ 199 static int32_t rack_dnd_default = 0; /* For rr_conf = 3, what is the default for dnd */ 200 static int32_t rack_rxt_controls = 0; 201 static int32_t rack_fill_cw_state = 0; 202 static uint8_t rack_req_measurements = 1; 203 /* Attack threshold detections */ 204 static uint32_t rack_highest_sack_thresh_seen = 0; 205 static uint32_t rack_highest_move_thresh_seen = 0; 206 static uint32_t rack_merge_out_sacks_on_attack = 0; 207 static int32_t rack_enable_hw_pacing = 0; /* Due to CCSP keep it off by default */ 208 static int32_t rack_hw_pace_extra_slots = 0; /* 2 extra MSS time betweens */ 209 static int32_t rack_hw_rate_caps = 0; /* 1; */ 210 static int32_t rack_hw_rate_cap_per = 0; /* 0 -- off */ 211 static int32_t rack_hw_rate_min = 0; /* 1500000;*/ 212 static int32_t rack_hw_rate_to_low = 0; /* 1200000; */ 213 static int32_t rack_hw_up_only = 0; 214 static int32_t rack_stats_gets_ms_rtt = 1; 215 static int32_t rack_prr_addbackmax = 2; 216 static int32_t rack_do_hystart = 0; 217 static int32_t rack_apply_rtt_with_reduced_conf = 0; 218 static int32_t rack_hibeta_setting = 0; 219 static int32_t rack_default_pacing_divisor = 250; 220 static int32_t rack_uses_full_dgp_in_rec = 1; 221 static uint16_t rack_pacing_min_seg = 0; 222 223 224 static uint32_t sad_seg_size_per = 800; /* 80.0 % */ 225 static int32_t rack_pkt_delay = 1000; 226 static int32_t rack_send_a_lot_in_prr = 1; 227 static int32_t rack_min_to = 1000; /* Number of microsecond min timeout */ 228 static int32_t rack_verbose_logging = 0; 229 static int32_t rack_ignore_data_after_close = 1; 230 static int32_t rack_enable_shared_cwnd = 1; 231 static int32_t rack_use_cmp_acks = 1; 232 static int32_t rack_use_fsb = 1; 233 static int32_t rack_use_rfo = 1; 234 static int32_t rack_use_rsm_rfo = 1; 235 static int32_t rack_max_abc_post_recovery = 2; 236 static int32_t rack_client_low_buf = 0; 237 static int32_t rack_dsack_std_based = 0x3; /* bit field bit 1 sets rc_rack_tmr_std_based and bit 2 sets rc_rack_use_dsack */ 238 static int32_t rack_bw_multipler = 2; /* Limit on fill cw's jump up to be this x gp_est */ 239 #ifdef TCP_ACCOUNTING 240 static int32_t rack_tcp_accounting = 0; 241 #endif 242 static int32_t rack_limits_scwnd = 1; 243 static int32_t rack_enable_mqueue_for_nonpaced = 0; 244 static int32_t rack_hybrid_allow_set_maxseg = 0; 245 static int32_t rack_disable_prr = 0; 246 static int32_t use_rack_rr = 1; 247 static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the configured rate (ss/ca)? 
*/ 248 static int32_t rack_persist_min = 250000; /* 250usec */ 249 static int32_t rack_persist_max = 2000000; /* 2 Second in usec's */ 250 static int32_t rack_sack_not_required = 1; /* set to one to allow non-sack to use rack */ 251 static int32_t rack_default_init_window = 0; /* Use system default */ 252 static int32_t rack_limit_time_with_srtt = 0; 253 static int32_t rack_autosndbuf_inc = 20; /* In percentage form */ 254 static int32_t rack_enobuf_hw_boost_mult = 0; /* How many times the hw rate we boost slot using time_between */ 255 static int32_t rack_enobuf_hw_max = 12000; /* 12 ms in usecs */ 256 static int32_t rack_enobuf_hw_min = 10000; /* 10 ms in usecs */ 257 static int32_t rack_hw_rwnd_factor = 2; /* How many max_segs the rwnd must be before we hold off sending */ 258 static int32_t rack_hw_check_queue = 0; /* Do we always pre-check queue depth of a hw queue */ 259 static int32_t rack_full_buffer_discount = 10; 260 /* 261 * Currently regular tcp has a rto_min of 30ms 262 * the backoff goes 12 times so that ends up 263 * being a total of 122.850 seconds before a 264 * connection is killed. 265 */ 266 static uint32_t rack_def_data_window = 20; 267 static uint32_t rack_goal_bdp = 2; 268 static uint32_t rack_min_srtts = 1; 269 static uint32_t rack_min_measure_usec = 0; 270 static int32_t rack_tlp_min = 10000; /* 10ms */ 271 static int32_t rack_rto_min = 30000; /* 30,000 usec same as main freebsd */ 272 static int32_t rack_rto_max = 4000000; /* 4 seconds in usec's */ 273 static const int32_t rack_free_cache = 2; 274 static int32_t rack_hptsi_segments = 40; 275 static int32_t rack_rate_sample_method = USE_RTT_LOW; 276 static int32_t rack_pace_every_seg = 0; 277 static int32_t rack_delayed_ack_time = 40000; /* 40ms in usecs */ 278 static int32_t rack_slot_reduction = 4; 279 static int32_t rack_wma_divisor = 8; /* For WMA calculation */ 280 static int32_t rack_cwnd_block_ends_measure = 0; 281 static int32_t rack_rwnd_block_ends_measure = 0; 282 static int32_t rack_def_profile = 0; 283 284 static int32_t rack_lower_cwnd_at_tlp = 0; 285 static int32_t rack_limited_retran = 0; 286 static int32_t rack_always_send_oldest = 0; 287 static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE; 288 289 static uint16_t rack_per_of_gp_ss = 250; /* 250 % slow-start */ 290 static uint16_t rack_per_of_gp_ca = 200; /* 200 % congestion-avoidance */ 291 static uint16_t rack_per_of_gp_rec = 200; /* 200 % of bw */ 292 293 /* Probertt */ 294 static uint16_t rack_per_of_gp_probertt = 60; /* 60% of bw */ 295 static uint16_t rack_per_of_gp_lowthresh = 40; /* 40% is bottom */ 296 static uint16_t rack_per_of_gp_probertt_reduce = 10; /* 10% reduction */ 297 static uint16_t rack_atexit_prtt_hbp = 130; /* Clamp to 130% on exit prtt if highly buffered path */ 298 static uint16_t rack_atexit_prtt = 130; /* Clamp to 100% on exit prtt if non highly buffered path */ 299 300 static uint32_t rack_max_drain_wait = 2; /* How man gp srtt's before we give up draining */ 301 static uint32_t rack_must_drain = 1; /* How many GP srtt's we *must* wait */ 302 static uint32_t rack_probertt_use_min_rtt_entry = 1; /* Use the min to calculate the goal else gp_srtt */ 303 static uint32_t rack_probertt_use_min_rtt_exit = 0; 304 static uint32_t rack_probe_rtt_sets_cwnd = 0; 305 static uint32_t rack_probe_rtt_safety_val = 2000000; /* No more than 2 sec in probe-rtt */ 306 static uint32_t rack_time_between_probertt = 9600000; /* 9.6 sec in usecs */ 307 static uint32_t rack_probertt_gpsrtt_cnt_mul = 0; /* How many srtt periods does probe-rtt 
last top fraction */
static uint32_t rack_probertt_gpsrtt_cnt_div = 0;	/* How many srtt periods does probe-rtt last bottom fraction */
static uint32_t rack_min_probertt_hold = 40000;	/* Equal to delayed ack time */
static uint32_t rack_probertt_filter_life = 10000000;
static uint32_t rack_probertt_lower_within = 10;
static uint32_t rack_min_rtt_movement = 250000;	/* Must move at least 250ms (in microseconds) to count as a lowering */
static int32_t rack_pace_one_seg = 0;		/* Shall we pace for less than 1.4Meg 1MSS at a time */
static int32_t rack_probertt_clear_is = 1;
static int32_t rack_max_drain_hbp = 1;		/* Extra drain times gpsrtt for highly buffered paths */
static int32_t rack_hbp_thresh = 3;		/* what is the divisor max_rtt/min_rtt to decide a hbp */

/* Part of pacing */
static int32_t rack_max_per_above = 30;		/* When we go to increment stop if above 100+this% */

/* Timely information:
 *
 * Here we have various control parameters on how
 * timely may change the multiplier. rack_gain_p5_ub
 * is associated with timely but not directly influencing
 * the rate decision like the other variables. It controls
 * the way fill-cw interacts with timely and caps how much
 * timely can boost the fill-cw b/w.
 *
 * The other values are various boost/shrink numbers as well
 * as potential caps when adjustments are made to the timely
 * gain (returned by rack_get_output_gain()). Remember too that
 * the gain returned can be overridden by other factors such as
 * probeRTT as well as fixed-rate-pacing.
 */
static int32_t rack_gain_p5_ub = 250;
static int32_t rack_gp_per_bw_mul_up = 2;	/* 2% */
static int32_t rack_gp_per_bw_mul_down = 4;	/* 4% */
static int32_t rack_gp_rtt_maxmul = 3;		/* 3 x maxmin */
static int32_t rack_gp_rtt_minmul = 1;		/* minrtt + (minrtt/mindiv) is lower rtt */
static int32_t rack_gp_rtt_mindiv = 4;		/* minrtt + (minrtt * minmul/mindiv) is lower rtt */
static int32_t rack_gp_decrease_per = 80;	/* Beta value of timely decrease (.8) = 80 */
static int32_t rack_gp_increase_per = 2;	/* 2% increase in multiplier */
static int32_t rack_per_lower_bound = 50;	/* Don't allow to drop below this multiplier */
static int32_t rack_per_upper_bound_ss = 0;	/* Don't allow SS to grow above this */
static int32_t rack_per_upper_bound_ca = 0;	/* Don't allow CA to grow above this */
static int32_t rack_do_dyn_mul = 0;		/* Are the rack gp multipliers dynamic */
static int32_t rack_gp_no_rec_chg = 1;		/* Prohibit recovery from reducing its multiplier */
static int32_t rack_timely_dec_clear = 6;	/* Do we clear decrement count at a value (6)? */
static int32_t rack_timely_max_push_rise = 3;	/* One round of pushing */
static int32_t rack_timely_max_push_drop = 3;	/* Three rounds of pushing */
static int32_t rack_timely_min_segs = 4;	/* 4 segment minimum */
static int32_t rack_use_max_for_nobackoff = 0;
static int32_t rack_timely_int_timely_only = 0;	/* do interim timely's only use the timely algo (no b/w changes)?
*/ 355 static int32_t rack_timely_no_stopping = 0; 356 static int32_t rack_down_raise_thresh = 100; 357 static int32_t rack_req_segs = 1; 358 static uint64_t rack_bw_rate_cap = 0; 359 360 361 /* Rack specific counters */ 362 counter_u64_t rack_saw_enobuf; 363 counter_u64_t rack_saw_enobuf_hw; 364 counter_u64_t rack_saw_enetunreach; 365 counter_u64_t rack_persists_sends; 366 counter_u64_t rack_persists_acks; 367 counter_u64_t rack_persists_loss; 368 counter_u64_t rack_persists_lost_ends; 369 counter_u64_t rack_total_bytes; 370 #ifdef INVARIANTS 371 counter_u64_t rack_adjust_map_bw; 372 #endif 373 /* Tail loss probe counters */ 374 counter_u64_t rack_tlp_tot; 375 counter_u64_t rack_tlp_newdata; 376 counter_u64_t rack_tlp_retran; 377 counter_u64_t rack_tlp_retran_bytes; 378 counter_u64_t rack_to_tot; 379 counter_u64_t rack_hot_alloc; 380 counter_u64_t rack_to_alloc; 381 counter_u64_t rack_to_alloc_hard; 382 counter_u64_t rack_to_alloc_emerg; 383 counter_u64_t rack_to_alloc_limited; 384 counter_u64_t rack_alloc_limited_conns; 385 counter_u64_t rack_split_limited; 386 counter_u64_t rack_rxt_clamps_cwnd; 387 counter_u64_t rack_rxt_clamps_cwnd_uniq; 388 389 counter_u64_t rack_multi_single_eq; 390 counter_u64_t rack_proc_non_comp_ack; 391 392 counter_u64_t rack_fto_send; 393 counter_u64_t rack_fto_rsm_send; 394 counter_u64_t rack_nfto_resend; 395 counter_u64_t rack_non_fto_send; 396 counter_u64_t rack_extended_rfo; 397 398 counter_u64_t rack_sack_proc_all; 399 counter_u64_t rack_sack_proc_short; 400 counter_u64_t rack_sack_proc_restart; 401 counter_u64_t rack_sack_attacks_detected; 402 counter_u64_t rack_sack_attacks_reversed; 403 counter_u64_t rack_sack_attacks_suspect; 404 counter_u64_t rack_sack_used_next_merge; 405 counter_u64_t rack_sack_splits; 406 counter_u64_t rack_sack_used_prev_merge; 407 counter_u64_t rack_sack_skipped_acked; 408 counter_u64_t rack_ack_total; 409 counter_u64_t rack_express_sack; 410 counter_u64_t rack_sack_total; 411 counter_u64_t rack_move_none; 412 counter_u64_t rack_move_some; 413 414 counter_u64_t rack_input_idle_reduces; 415 counter_u64_t rack_collapsed_win; 416 counter_u64_t rack_collapsed_win_seen; 417 counter_u64_t rack_collapsed_win_rxt; 418 counter_u64_t rack_collapsed_win_rxt_bytes; 419 counter_u64_t rack_try_scwnd; 420 counter_u64_t rack_hw_pace_init_fail; 421 counter_u64_t rack_hw_pace_lost; 422 423 counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE]; 424 counter_u64_t rack_opts_arry[RACK_OPTS_SIZE]; 425 426 427 #define RACK_REXMTVAL(tp) max(rack_rto_min, ((tp)->t_srtt + ((tp)->t_rttvar << 2))) 428 429 #define RACK_TCPT_RANGESET(tv, value, tvmin, tvmax, slop) do { \ 430 (tv) = (value) + slop; \ 431 if ((u_long)(tv) < (u_long)(tvmin)) \ 432 (tv) = (tvmin); \ 433 if ((u_long)(tv) > (u_long)(tvmax)) \ 434 (tv) = (tvmax); \ 435 } while (0) 436 437 static void 438 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line); 439 440 static int 441 rack_process_ack(struct mbuf *m, struct tcphdr *th, 442 struct socket *so, struct tcpcb *tp, struct tcpopt *to, 443 uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val); 444 static int 445 rack_process_data(struct mbuf *m, struct tcphdr *th, 446 struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 447 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 448 static void 449 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, 450 uint32_t th_ack, uint16_t nsegs, uint16_t type, int32_t recovery); 451 static struct rack_sendmap 
*rack_alloc(struct tcp_rack *rack); 452 static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack, 453 uint8_t limit_type); 454 static struct rack_sendmap * 455 rack_check_recovery_mode(struct tcpcb *tp, 456 uint32_t tsused); 457 static void 458 rack_cong_signal(struct tcpcb *tp, 459 uint32_t type, uint32_t ack, int ); 460 static void rack_counter_destroy(void); 461 static int 462 rack_ctloutput(struct tcpcb *tp, struct sockopt *sopt); 463 static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how); 464 static void 465 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override); 466 static void 467 rack_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, 468 int32_t drop_hdrlen, int32_t tlen, uint8_t iptos); 469 static void rack_dtor(void *mem, int32_t size, void *arg); 470 static void 471 rack_log_alt_to_to_cancel(struct tcp_rack *rack, 472 uint32_t flex1, uint32_t flex2, 473 uint32_t flex3, uint32_t flex4, 474 uint32_t flex5, uint32_t flex6, 475 uint16_t flex7, uint8_t mod); 476 477 static void 478 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, 479 uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line, 480 struct rack_sendmap *rsm, uint8_t quality); 481 static struct rack_sendmap * 482 rack_find_high_nonack(struct tcp_rack *rack, 483 struct rack_sendmap *rsm); 484 static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack); 485 static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm); 486 static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged); 487 static int rack_get_sockopt(struct tcpcb *tp, struct sockopt *sopt); 488 static void 489 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, 490 tcp_seq th_ack, int line, uint8_t quality); 491 static void 492 rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint8_t frm); 493 494 static uint32_t 495 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss); 496 static int32_t rack_handoff_ok(struct tcpcb *tp); 497 static int32_t rack_init(struct tcpcb *tp, void **ptr); 498 static void rack_init_sysctls(void); 499 500 static void 501 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, 502 struct tcphdr *th, int entered_rec, int dup_ack_struck, 503 int *dsack_seen, int *sacks_seen); 504 static void 505 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 506 uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t ts, 507 struct rack_sendmap *hintrsm, uint16_t add_flags, struct mbuf *s_mb, uint32_t s_moff, int hw_tls, int segsiz); 508 509 static uint64_t rack_get_gp_est(struct tcp_rack *rack); 510 511 static void 512 rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack, 513 struct rack_sendmap *rsm); 514 static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm); 515 static int32_t rack_output(struct tcpcb *tp); 516 517 static uint32_t 518 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, 519 struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm, 520 uint32_t cts, int *no_extra, int *moved_two, uint32_t segsiz); 521 static void rack_post_recovery(struct tcpcb *tp, uint32_t th_seq); 522 static void rack_remxt_tmr(struct tcpcb *tp); 523 static int rack_set_sockopt(struct tcpcb *tp, struct sockopt *sopt); 524 static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack); 525 static int32_t rack_stopall(struct tcpcb 
*tp); 526 static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line); 527 static uint32_t 528 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 529 struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint16_t add_flag, int segsiz); 530 static void 531 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, 532 struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag, int segsiz); 533 static int 534 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 535 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack); 536 static int32_t tcp_addrack(module_t mod, int32_t type, void *data); 537 static int 538 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, 539 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 540 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 541 static int 542 rack_do_closing(struct mbuf *m, struct tcphdr *th, 543 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 544 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 545 static int 546 rack_do_established(struct mbuf *m, struct tcphdr *th, 547 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 548 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 549 static int 550 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, 551 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 552 int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos); 553 static int 554 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, 555 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 556 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 557 static int 558 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, 559 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 560 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 561 static int 562 rack_do_lastack(struct mbuf *m, struct tcphdr *th, 563 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 564 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 565 static int 566 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, 567 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 568 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 569 static int 570 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, 571 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 572 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 573 static void rack_chk_req_and_hybrid_on_out(struct tcp_rack *rack, tcp_seq seq, uint32_t len, uint64_t cts); 574 struct rack_sendmap * 575 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, 576 uint32_t tsused); 577 static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, 578 uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt); 579 static void 580 tcp_rack_partialack(struct tcpcb *tp); 581 static int 582 rack_set_profile(struct tcp_rack *rack, int prof); 583 static void 584 rack_apply_deferred_options(struct tcp_rack *rack); 585 586 int32_t rack_clear_counter=0; 587 588 static uint64_t 589 rack_get_lt_bw(struct tcp_rack *rack) 590 { 591 struct timeval tv; 592 uint64_t tim, bytes; 593 594 tim 
= rack->r_ctl.lt_bw_time; 595 bytes = rack->r_ctl.lt_bw_bytes; 596 if (rack->lt_bw_up) { 597 /* Include all the current bytes too */ 598 microuptime(&tv); 599 bytes += (rack->rc_tp->snd_una - rack->r_ctl.lt_seq); 600 tim += (tcp_tv_to_lusectick(&tv) - rack->r_ctl.lt_timemark); 601 } 602 if ((bytes != 0) && (tim != 0)) 603 return ((bytes * (uint64_t)1000000) / tim); 604 else 605 return (0); 606 } 607 608 static void 609 rack_swap_beta_values(struct tcp_rack *rack, uint8_t flex8) 610 { 611 struct sockopt sopt; 612 struct cc_newreno_opts opt; 613 struct newreno old; 614 struct tcpcb *tp; 615 int error, failed = 0; 616 617 tp = rack->rc_tp; 618 if (tp->t_cc == NULL) { 619 /* Tcb is leaving */ 620 return; 621 } 622 rack->rc_pacing_cc_set = 1; 623 if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) { 624 /* Not new-reno we can't play games with beta! */ 625 failed = 1; 626 goto out; 627 628 } 629 if (CC_ALGO(tp)->ctl_output == NULL) { 630 /* Huh, not using new-reno so no swaps.? */ 631 failed = 2; 632 goto out; 633 } 634 /* Get the current values out */ 635 sopt.sopt_valsize = sizeof(struct cc_newreno_opts); 636 sopt.sopt_dir = SOPT_GET; 637 opt.name = CC_NEWRENO_BETA; 638 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt); 639 if (error) { 640 failed = 3; 641 goto out; 642 } 643 old.beta = opt.val; 644 opt.name = CC_NEWRENO_BETA_ECN; 645 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt); 646 if (error) { 647 failed = 4; 648 goto out; 649 } 650 old.beta_ecn = opt.val; 651 652 /* Now lets set in the values we have stored */ 653 sopt.sopt_dir = SOPT_SET; 654 opt.name = CC_NEWRENO_BETA; 655 opt.val = rack->r_ctl.rc_saved_beta.beta; 656 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt); 657 if (error) { 658 failed = 5; 659 goto out; 660 } 661 opt.name = CC_NEWRENO_BETA_ECN; 662 opt.val = rack->r_ctl.rc_saved_beta.beta_ecn; 663 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt); 664 if (error) { 665 failed = 6; 666 goto out; 667 } 668 /* Save off the values for restoral */ 669 memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno)); 670 out: 671 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 672 union tcp_log_stackspecific log; 673 struct timeval tv; 674 struct newreno *ptr; 675 676 ptr = ((struct newreno *)tp->t_ccv.cc_data); 677 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 678 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 679 log.u_bbr.flex1 = ptr->beta; 680 log.u_bbr.flex2 = ptr->beta_ecn; 681 log.u_bbr.flex3 = ptr->newreno_flags; 682 log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta; 683 log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn; 684 log.u_bbr.flex6 = failed; 685 log.u_bbr.flex7 = rack->gp_ready; 686 log.u_bbr.flex7 <<= 1; 687 log.u_bbr.flex7 |= rack->use_fixed_rate; 688 log.u_bbr.flex7 <<= 1; 689 log.u_bbr.flex7 |= rack->rc_pacing_cc_set; 690 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 691 log.u_bbr.flex8 = flex8; 692 tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, error, 693 0, &log, false, NULL, NULL, 0, &tv); 694 } 695 } 696 697 static void 698 rack_set_cc_pacing(struct tcp_rack *rack) 699 { 700 if (rack->rc_pacing_cc_set) 701 return; 702 /* 703 * Use the swap utility placing in 3 for flex8 to id a 704 * set of a new set of values. 705 */ 706 rack->rc_pacing_cc_set = 1; 707 rack_swap_beta_values(rack, 3); 708 } 709 710 static void 711 rack_undo_cc_pacing(struct tcp_rack *rack) 712 { 713 if (rack->rc_pacing_cc_set == 0) 714 return; 715 /* 716 * Use the swap utility placing in 4 for flex8 to id a 717 * restoral of the old values. 
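 * Because rack_swap_beta_values() both installs the values from
 * r_ctl.rc_saved_beta and stashes the previously active beta and
 * beta_ecn back into r_ctl.rc_saved_beta, calling it a second time
 * here simply swaps the connection's original new-reno settings back
 * in. (Illustrative trace, values assumed for the example only: if
 * the connection started with beta 50 and rack's saved copy held 80,
 * the set in rack_set_cc_pacing() left 80 active with 50 stashed;
 * this undo restores 50 and re-stashes 80 for a later set.)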
718 */ 719 rack->rc_pacing_cc_set = 0; 720 rack_swap_beta_values(rack, 4); 721 } 722 723 static void 724 rack_log_gpset(struct tcp_rack *rack, uint32_t seq_end, uint32_t ack_end_t, 725 uint32_t send_end_t, int line, uint8_t mode, struct rack_sendmap *rsm) 726 { 727 if (tcp_bblogging_on(rack->rc_tp) && (rack_verbose_logging != 0)) { 728 union tcp_log_stackspecific log; 729 struct timeval tv; 730 731 memset(&log, 0, sizeof(log)); 732 log.u_bbr.flex1 = seq_end; 733 log.u_bbr.flex2 = rack->rc_tp->gput_seq; 734 log.u_bbr.flex3 = ack_end_t; 735 log.u_bbr.flex4 = rack->rc_tp->gput_ts; 736 log.u_bbr.flex5 = send_end_t; 737 log.u_bbr.flex6 = rack->rc_tp->gput_ack; 738 log.u_bbr.flex7 = mode; 739 log.u_bbr.flex8 = 69; 740 log.u_bbr.rttProp = rack->r_ctl.rc_gp_cumack_ts; 741 log.u_bbr.delRate = rack->r_ctl.rc_gp_output_ts; 742 log.u_bbr.pkts_out = line; 743 log.u_bbr.cwnd_gain = rack->app_limited_needs_set; 744 log.u_bbr.pkt_epoch = rack->r_ctl.rc_app_limited_cnt; 745 if (rsm != NULL) { 746 log.u_bbr.applimited = rsm->r_start; 747 log.u_bbr.delivered = rsm->r_end; 748 log.u_bbr.epoch = rsm->r_flags; 749 } 750 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 751 TCP_LOG_EVENTP(rack->rc_tp, NULL, 752 &rack->rc_inp->inp_socket->so_rcv, 753 &rack->rc_inp->inp_socket->so_snd, 754 BBR_LOG_HPTSI_CALC, 0, 755 0, &log, false, &tv); 756 } 757 } 758 759 static int 760 sysctl_rack_clear(SYSCTL_HANDLER_ARGS) 761 { 762 uint32_t stat; 763 int32_t error; 764 765 error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t)); 766 if (error || req->newptr == NULL) 767 return error; 768 769 error = SYSCTL_IN(req, &stat, sizeof(uint32_t)); 770 if (error) 771 return (error); 772 if (stat == 1) { 773 #ifdef INVARIANTS 774 printf("Clearing RACK counters\n"); 775 #endif 776 counter_u64_zero(rack_tlp_tot); 777 counter_u64_zero(rack_tlp_newdata); 778 counter_u64_zero(rack_tlp_retran); 779 counter_u64_zero(rack_tlp_retran_bytes); 780 counter_u64_zero(rack_to_tot); 781 counter_u64_zero(rack_saw_enobuf); 782 counter_u64_zero(rack_saw_enobuf_hw); 783 counter_u64_zero(rack_saw_enetunreach); 784 counter_u64_zero(rack_persists_sends); 785 counter_u64_zero(rack_total_bytes); 786 counter_u64_zero(rack_persists_acks); 787 counter_u64_zero(rack_persists_loss); 788 counter_u64_zero(rack_persists_lost_ends); 789 #ifdef INVARIANTS 790 counter_u64_zero(rack_adjust_map_bw); 791 #endif 792 counter_u64_zero(rack_to_alloc_hard); 793 counter_u64_zero(rack_to_alloc_emerg); 794 counter_u64_zero(rack_sack_proc_all); 795 counter_u64_zero(rack_fto_send); 796 counter_u64_zero(rack_fto_rsm_send); 797 counter_u64_zero(rack_extended_rfo); 798 counter_u64_zero(rack_hw_pace_init_fail); 799 counter_u64_zero(rack_hw_pace_lost); 800 counter_u64_zero(rack_non_fto_send); 801 counter_u64_zero(rack_nfto_resend); 802 counter_u64_zero(rack_sack_proc_short); 803 counter_u64_zero(rack_sack_proc_restart); 804 counter_u64_zero(rack_to_alloc); 805 counter_u64_zero(rack_to_alloc_limited); 806 counter_u64_zero(rack_alloc_limited_conns); 807 counter_u64_zero(rack_split_limited); 808 counter_u64_zero(rack_rxt_clamps_cwnd); 809 counter_u64_zero(rack_rxt_clamps_cwnd_uniq); 810 counter_u64_zero(rack_multi_single_eq); 811 counter_u64_zero(rack_proc_non_comp_ack); 812 counter_u64_zero(rack_sack_attacks_detected); 813 counter_u64_zero(rack_sack_attacks_reversed); 814 counter_u64_zero(rack_sack_attacks_suspect); 815 counter_u64_zero(rack_sack_used_next_merge); 816 counter_u64_zero(rack_sack_used_prev_merge); 817 counter_u64_zero(rack_sack_splits); 818 
counter_u64_zero(rack_sack_skipped_acked); 819 counter_u64_zero(rack_ack_total); 820 counter_u64_zero(rack_express_sack); 821 counter_u64_zero(rack_sack_total); 822 counter_u64_zero(rack_move_none); 823 counter_u64_zero(rack_move_some); 824 counter_u64_zero(rack_try_scwnd); 825 counter_u64_zero(rack_collapsed_win); 826 counter_u64_zero(rack_collapsed_win_rxt); 827 counter_u64_zero(rack_collapsed_win_seen); 828 counter_u64_zero(rack_collapsed_win_rxt_bytes); 829 } else if (stat == 2) { 830 #ifdef INVARIANTS 831 printf("Clearing RACK option array\n"); 832 #endif 833 COUNTER_ARRAY_ZERO(rack_opts_arry, RACK_OPTS_SIZE); 834 } else if (stat == 3) { 835 printf("Rack has no stats counters to clear (use 1 to clear all stats in sysctl node)\n"); 836 } else if (stat == 4) { 837 #ifdef INVARIANTS 838 printf("Clearing RACK out size array\n"); 839 #endif 840 COUNTER_ARRAY_ZERO(rack_out_size, TCP_MSS_ACCT_SIZE); 841 } 842 rack_clear_counter = 0; 843 return (0); 844 } 845 846 static void 847 rack_init_sysctls(void) 848 { 849 struct sysctl_oid *rack_counters; 850 struct sysctl_oid *rack_attack; 851 struct sysctl_oid *rack_pacing; 852 struct sysctl_oid *rack_timely; 853 struct sysctl_oid *rack_timers; 854 struct sysctl_oid *rack_tlp; 855 struct sysctl_oid *rack_misc; 856 struct sysctl_oid *rack_features; 857 struct sysctl_oid *rack_measure; 858 struct sysctl_oid *rack_probertt; 859 struct sysctl_oid *rack_hw_pacing; 860 861 rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 862 SYSCTL_CHILDREN(rack_sysctl_root), 863 OID_AUTO, 864 "sack_attack", 865 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 866 "Rack Sack Attack Counters and Controls"); 867 rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 868 SYSCTL_CHILDREN(rack_sysctl_root), 869 OID_AUTO, 870 "stats", 871 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 872 "Rack Counters"); 873 SYSCTL_ADD_S32(&rack_sysctl_ctx, 874 SYSCTL_CHILDREN(rack_sysctl_root), 875 OID_AUTO, "rate_sample_method", CTLFLAG_RW, 876 &rack_rate_sample_method , USE_RTT_LOW, 877 "What method should we use for rate sampling 0=high, 1=low "); 878 /* Probe rtt related controls */ 879 rack_probertt = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 880 SYSCTL_CHILDREN(rack_sysctl_root), 881 OID_AUTO, 882 "probertt", 883 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 884 "ProbeRTT related Controls"); 885 SYSCTL_ADD_U16(&rack_sysctl_ctx, 886 SYSCTL_CHILDREN(rack_probertt), 887 OID_AUTO, "exit_per_hpb", CTLFLAG_RW, 888 &rack_atexit_prtt_hbp, 130, 889 "What percentage above goodput do we clamp CA/SS to at exit on high-BDP path 110%"); 890 SYSCTL_ADD_U16(&rack_sysctl_ctx, 891 SYSCTL_CHILDREN(rack_probertt), 892 OID_AUTO, "exit_per_nonhpb", CTLFLAG_RW, 893 &rack_atexit_prtt, 130, 894 "What percentage above goodput do we clamp CA/SS to at exit on a non high-BDP path 100%"); 895 SYSCTL_ADD_U16(&rack_sysctl_ctx, 896 SYSCTL_CHILDREN(rack_probertt), 897 OID_AUTO, "gp_per_mul", CTLFLAG_RW, 898 &rack_per_of_gp_probertt, 60, 899 "What percentage of goodput do we pace at in probertt"); 900 SYSCTL_ADD_U16(&rack_sysctl_ctx, 901 SYSCTL_CHILDREN(rack_probertt), 902 OID_AUTO, "gp_per_reduce", CTLFLAG_RW, 903 &rack_per_of_gp_probertt_reduce, 10, 904 "What percentage of goodput do we reduce every gp_srtt"); 905 SYSCTL_ADD_U16(&rack_sysctl_ctx, 906 SYSCTL_CHILDREN(rack_probertt), 907 OID_AUTO, "gp_per_low", CTLFLAG_RW, 908 &rack_per_of_gp_lowthresh, 40, 909 "What percentage of goodput do we allow the multiplier to fall to"); 910 SYSCTL_ADD_U32(&rack_sysctl_ctx, 911 SYSCTL_CHILDREN(rack_probertt), 912 OID_AUTO, "time_between", CTLFLAG_RW, 913 & rack_time_between_probertt, 
96000000,
	    "How many useconds between the lowest rtt falling must pass before we enter probertt");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "safety", CTLFLAG_RW,
	    &rack_probe_rtt_safety_val, 2000000,
	    "If not zero, provides a maximum usecond that you can stay in probertt (2sec = 2000000)");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "sets_cwnd", CTLFLAG_RW,
	    &rack_probe_rtt_sets_cwnd, 0,
	    "Do we set the cwnd too (if always_lower is on)");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "maxdrainsrtts", CTLFLAG_RW,
	    &rack_max_drain_wait, 2,
	    "Maximum number of gp_srtt's to hold in drain waiting for flight to reach goal");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "mustdrainsrtts", CTLFLAG_RW,
	    &rack_must_drain, 1,
	    "We must drain this many gp_srtt's waiting for flight to reach goal");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "goal_use_min_entry", CTLFLAG_RW,
	    &rack_probertt_use_min_rtt_entry, 1,
	    "Should we use the min-rtt to calculate the goal rtt (else gp_srtt) at entry");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "goal_use_min_exit", CTLFLAG_RW,
	    &rack_probertt_use_min_rtt_exit, 0,
	    "How to set cwnd at exit, 0 - dynamic, 1 - use min-rtt, 2 - use curgprtt, 3 - entry gp-rtt");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "length_div", CTLFLAG_RW,
	    &rack_probertt_gpsrtt_cnt_div, 0,
	    "How many recent goodput srtt periods plus hold time does probertt last (bottom of fraction)");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "length_mul", CTLFLAG_RW,
	    &rack_probertt_gpsrtt_cnt_mul, 0,
	    "How many recent goodput srtt periods plus hold time does probertt last (top of fraction)");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "holdtim_at_target", CTLFLAG_RW,
	    &rack_min_probertt_hold, 200000,
	    "What is the minimum time we hold probertt at target");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "filter_life", CTLFLAG_RW,
	    &rack_probertt_filter_life, 10000000,
	    "What is the time for the filter's life in useconds");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "lower_within", CTLFLAG_RW,
	    &rack_probertt_lower_within, 10,
	    "If the rtt goes lower within this percentage of the time, go into probe-rtt");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "must_move", CTLFLAG_RW,
	    &rack_min_rtt_movement, 250,
	    "How much is the minimum movement in rtt to count as a drop for probertt purposes");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "clear_is_cnts", CTLFLAG_RW,
	    &rack_probertt_clear_is, 1,
	    "Do we clear I/S counts on exiting probe-rtt");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "hbp_extra_drain", CTLFLAG_RW,
	    &rack_max_drain_hbp, 1,
	    "How many extra drain gpsrtt's do we get in highly buffered paths");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "hbp_threshold", CTLFLAG_RW,
	    &rack_hbp_thresh, 3,
	    "We are highly buffered if max_rtt_seen / min_rtt_seen > this-threshold");
	/* Pacing
related sysctls */ 991 rack_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 992 SYSCTL_CHILDREN(rack_sysctl_root), 993 OID_AUTO, 994 "pacing", 995 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 996 "Pacing related Controls"); 997 SYSCTL_ADD_S32(&rack_sysctl_ctx, 998 SYSCTL_CHILDREN(rack_pacing), 999 OID_AUTO, "fulldgpinrec", CTLFLAG_RW, 1000 &rack_uses_full_dgp_in_rec, 1, 1001 "Do we use all DGP features in recovery (fillcw, timely et.al.)?"); 1002 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1003 SYSCTL_CHILDREN(rack_pacing), 1004 OID_AUTO, "fullbufdisc", CTLFLAG_RW, 1005 &rack_full_buffer_discount, 10, 1006 "What percentage b/w reduction over the GP estimate for a full buffer (default=0 off)?"); 1007 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1008 SYSCTL_CHILDREN(rack_pacing), 1009 OID_AUTO, "fillcw", CTLFLAG_RW, 1010 &rack_fill_cw_state, 0, 1011 "Enable fillcw on new connections (default=0 off)?"); 1012 SYSCTL_ADD_U16(&rack_sysctl_ctx, 1013 SYSCTL_CHILDREN(rack_pacing), 1014 OID_AUTO, "min_burst", CTLFLAG_RW, 1015 &rack_pacing_min_seg, 0, 1016 "What is the min burst size for pacing (0 disables)?"); 1017 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1018 SYSCTL_CHILDREN(rack_pacing), 1019 OID_AUTO, "divisor", CTLFLAG_RW, 1020 &rack_default_pacing_divisor, 4, 1021 "What is the default divisor given to the rl code?"); 1022 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1023 SYSCTL_CHILDREN(rack_pacing), 1024 OID_AUTO, "fillcw_max_mult", CTLFLAG_RW, 1025 &rack_bw_multipler, 2, 1026 "What is the multiplier of the current gp_est that fillcw can increase the b/w too?"); 1027 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1028 SYSCTL_CHILDREN(rack_pacing), 1029 OID_AUTO, "max_pace_over", CTLFLAG_RW, 1030 &rack_max_per_above, 30, 1031 "What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)"); 1032 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1033 SYSCTL_CHILDREN(rack_pacing), 1034 OID_AUTO, "allow1mss", CTLFLAG_RW, 1035 &rack_pace_one_seg, 0, 1036 "Do we allow low b/w pacing of 1MSS instead of two (1.2Meg and less)?"); 1037 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1038 SYSCTL_CHILDREN(rack_pacing), 1039 OID_AUTO, "limit_wsrtt", CTLFLAG_RW, 1040 &rack_limit_time_with_srtt, 0, 1041 "Do we limit pacing time based on srtt"); 1042 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1043 SYSCTL_CHILDREN(rack_pacing), 1044 OID_AUTO, "init_win", CTLFLAG_RW, 1045 &rack_default_init_window, 0, 1046 "Do we have a rack initial window 0 = system default"); 1047 SYSCTL_ADD_U16(&rack_sysctl_ctx, 1048 SYSCTL_CHILDREN(rack_pacing), 1049 OID_AUTO, "gp_per_ss", CTLFLAG_RW, 1050 &rack_per_of_gp_ss, 250, 1051 "If non zero, what percentage of goodput to pace at in slow start"); 1052 SYSCTL_ADD_U16(&rack_sysctl_ctx, 1053 SYSCTL_CHILDREN(rack_pacing), 1054 OID_AUTO, "gp_per_ca", CTLFLAG_RW, 1055 &rack_per_of_gp_ca, 150, 1056 "If non zero, what percentage of goodput to pace at in congestion avoidance"); 1057 SYSCTL_ADD_U16(&rack_sysctl_ctx, 1058 SYSCTL_CHILDREN(rack_pacing), 1059 OID_AUTO, "gp_per_rec", CTLFLAG_RW, 1060 &rack_per_of_gp_rec, 200, 1061 "If non zero, what percentage of goodput to pace at in recovery"); 1062 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1063 SYSCTL_CHILDREN(rack_pacing), 1064 OID_AUTO, "pace_max_seg", CTLFLAG_RW, 1065 &rack_hptsi_segments, 40, 1066 "What size is the max for TSO segments in pacing and burst mitigation"); 1067 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1068 SYSCTL_CHILDREN(rack_pacing), 1069 OID_AUTO, "burst_reduces", CTLFLAG_RW, 1070 &rack_slot_reduction, 4, 1071 "When doing only burst mitigation what is the reduce divisor"); 1072 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1073 
SYSCTL_CHILDREN(rack_sysctl_root), 1074 OID_AUTO, "use_pacing", CTLFLAG_RW, 1075 &rack_pace_every_seg, 0, 1076 "If set we use pacing, if clear we use only the original burst mitigation"); 1077 SYSCTL_ADD_U64(&rack_sysctl_ctx, 1078 SYSCTL_CHILDREN(rack_pacing), 1079 OID_AUTO, "rate_cap", CTLFLAG_RW, 1080 &rack_bw_rate_cap, 0, 1081 "If set we apply this value to the absolute rate cap used by pacing"); 1082 SYSCTL_ADD_U8(&rack_sysctl_ctx, 1083 SYSCTL_CHILDREN(rack_sysctl_root), 1084 OID_AUTO, "req_measure_cnt", CTLFLAG_RW, 1085 &rack_req_measurements, 1, 1086 "If doing dynamic pacing, how many measurements must be in before we start pacing?"); 1087 /* Hardware pacing */ 1088 rack_hw_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1089 SYSCTL_CHILDREN(rack_sysctl_root), 1090 OID_AUTO, 1091 "hdwr_pacing", 1092 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1093 "Pacing related Controls"); 1094 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1095 SYSCTL_CHILDREN(rack_hw_pacing), 1096 OID_AUTO, "rwnd_factor", CTLFLAG_RW, 1097 &rack_hw_rwnd_factor, 2, 1098 "How many times does snd_wnd need to be bigger than pace_max_seg so we will hold off and get more acks?"); 1099 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1100 SYSCTL_CHILDREN(rack_hw_pacing), 1101 OID_AUTO, "precheck", CTLFLAG_RW, 1102 &rack_hw_check_queue, 0, 1103 "Do we always precheck the hdwr pacing queue to avoid ENOBUF's?"); 1104 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1105 SYSCTL_CHILDREN(rack_hw_pacing), 1106 OID_AUTO, "pace_enobuf_mult", CTLFLAG_RW, 1107 &rack_enobuf_hw_boost_mult, 0, 1108 "By how many time_betweens should we boost the pacing time if we see a ENOBUFS?"); 1109 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1110 SYSCTL_CHILDREN(rack_hw_pacing), 1111 OID_AUTO, "pace_enobuf_max", CTLFLAG_RW, 1112 &rack_enobuf_hw_max, 2, 1113 "What is the max boost the pacing time if we see a ENOBUFS?"); 1114 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1115 SYSCTL_CHILDREN(rack_hw_pacing), 1116 OID_AUTO, "pace_enobuf_min", CTLFLAG_RW, 1117 &rack_enobuf_hw_min, 2, 1118 "What is the min boost the pacing time if we see a ENOBUFS?"); 1119 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1120 SYSCTL_CHILDREN(rack_hw_pacing), 1121 OID_AUTO, "enable", CTLFLAG_RW, 1122 &rack_enable_hw_pacing, 0, 1123 "Should RACK attempt to use hw pacing?"); 1124 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1125 SYSCTL_CHILDREN(rack_hw_pacing), 1126 OID_AUTO, "rate_cap", CTLFLAG_RW, 1127 &rack_hw_rate_caps, 0, 1128 "Does the highest hardware pacing rate cap the rate we will send at??"); 1129 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1130 SYSCTL_CHILDREN(rack_hw_pacing), 1131 OID_AUTO, "uncap_per", CTLFLAG_RW, 1132 &rack_hw_rate_cap_per, 0, 1133 "If you go over b/w by this amount you will be uncapped (0 = never)"); 1134 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1135 SYSCTL_CHILDREN(rack_hw_pacing), 1136 OID_AUTO, "rate_min", CTLFLAG_RW, 1137 &rack_hw_rate_min, 0, 1138 "Do we need a minimum estimate of this many bytes per second in order to engage hw pacing?"); 1139 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1140 SYSCTL_CHILDREN(rack_hw_pacing), 1141 OID_AUTO, "rate_to_low", CTLFLAG_RW, 1142 &rack_hw_rate_to_low, 0, 1143 "If we fall below this rate, dis-engage hw pacing?"); 1144 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1145 SYSCTL_CHILDREN(rack_hw_pacing), 1146 OID_AUTO, "up_only", CTLFLAG_RW, 1147 &rack_hw_up_only, 0, 1148 "Do we allow hw pacing to lower the rate selected?"); 1149 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1150 SYSCTL_CHILDREN(rack_hw_pacing), 1151 OID_AUTO, "extra_mss_precise", CTLFLAG_RW, 1152 &rack_hw_pace_extra_slots, 0, 1153 "If the rates between software and hardware match precisely how many 
extra time_betweens do we get?");
	rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "timely",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Rack Timely RTT Controls");
	/* Timely based GP dynamics */
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "upper", CTLFLAG_RW,
	    &rack_gp_per_bw_mul_up, 2,
	    "Rack timely upper range for equal b/w (in percentage)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "lower", CTLFLAG_RW,
	    &rack_gp_per_bw_mul_down, 4,
	    "Rack timely lower range for equal b/w (in percentage)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "rtt_max_mul", CTLFLAG_RW,
	    &rack_gp_rtt_maxmul, 3,
	    "Rack timely multiplier of lowest rtt for rtt_max");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "rtt_min_div", CTLFLAG_RW,
	    &rack_gp_rtt_mindiv, 4,
	    "Rack timely divisor used for rtt + (rtt * mul/divisor) for check for lower rtt");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "rtt_min_mul", CTLFLAG_RW,
	    &rack_gp_rtt_minmul, 1,
	    "Rack timely multiplier used for rtt + (rtt * mul/divisor) for check for lower rtt");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "decrease", CTLFLAG_RW,
	    &rack_gp_decrease_per, 80,
	    "Rack timely Beta value 80 = .8 (scaled by 100)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "increase", CTLFLAG_RW,
	    &rack_gp_increase_per, 2,
	    "Rack timely increase percentage of our GP multiplication factor");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "lowerbound", CTLFLAG_RW,
	    &rack_per_lower_bound, 50,
	    "Rack timely lowest percentage we allow GP multiplier to fall to");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "p5_upper", CTLFLAG_RW,
	    &rack_gain_p5_ub, 250,
	    "Profile 5 upper bound to timely gain");

	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "upperboundss", CTLFLAG_RW,
	    &rack_per_upper_bound_ss, 0,
	    "Rack timely highest percentage we allow GP multiplier in SS to raise to (0 is no upperbound)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "upperboundca", CTLFLAG_RW,
	    &rack_per_upper_bound_ca, 0,
	    "Rack timely highest percentage we allow GP multiplier in CA to raise to (0 is no upperbound)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "dynamicgp", CTLFLAG_RW,
	    &rack_do_dyn_mul, 0,
	    "Rack timely do we enable dynamic timely goodput by default");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "no_rec_red", CTLFLAG_RW,
	    &rack_gp_no_rec_chg, 1,
	    "Rack timely do we prohibit the recovery multiplier from being lowered");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "red_clear_cnt", CTLFLAG_RW,
	    &rack_timely_dec_clear, 6,
	    "Rack timely what threshold do we count to before another boost during b/w descent");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "max_push_rise", CTLFLAG_RW,
	    &rack_timely_max_push_rise, 3,
	    "Rack timely how many times do we push up with b/w
increase"); 1237 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1238 SYSCTL_CHILDREN(rack_timely), 1239 OID_AUTO, "max_push_drop", CTLFLAG_RW, 1240 &rack_timely_max_push_drop, 3, 1241 "Rack timely how many times do we push back on b/w decent"); 1242 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1243 SYSCTL_CHILDREN(rack_timely), 1244 OID_AUTO, "min_segs", CTLFLAG_RW, 1245 &rack_timely_min_segs, 4, 1246 "Rack timely when setting the cwnd what is the min num segments"); 1247 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1248 SYSCTL_CHILDREN(rack_timely), 1249 OID_AUTO, "noback_max", CTLFLAG_RW, 1250 &rack_use_max_for_nobackoff, 0, 1251 "Rack timely when deciding if to backoff on a loss, do we use under max rtt else min"); 1252 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1253 SYSCTL_CHILDREN(rack_timely), 1254 OID_AUTO, "interim_timely_only", CTLFLAG_RW, 1255 &rack_timely_int_timely_only, 0, 1256 "Rack timely when doing interim timely's do we only do timely (no b/w consideration)"); 1257 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1258 SYSCTL_CHILDREN(rack_timely), 1259 OID_AUTO, "nonstop", CTLFLAG_RW, 1260 &rack_timely_no_stopping, 0, 1261 "Rack timely don't stop increase"); 1262 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1263 SYSCTL_CHILDREN(rack_timely), 1264 OID_AUTO, "dec_raise_thresh", CTLFLAG_RW, 1265 &rack_down_raise_thresh, 100, 1266 "If the CA or SS is below this threshold raise on the first 3 b/w lowers (0=always)"); 1267 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1268 SYSCTL_CHILDREN(rack_timely), 1269 OID_AUTO, "bottom_drag_segs", CTLFLAG_RW, 1270 &rack_req_segs, 1, 1271 "Bottom dragging if not these many segments outstanding and room"); 1272 1273 /* TLP and Rack related parameters */ 1274 rack_tlp = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1275 SYSCTL_CHILDREN(rack_sysctl_root), 1276 OID_AUTO, 1277 "tlp", 1278 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1279 "TLP and Rack related Controls"); 1280 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1281 SYSCTL_CHILDREN(rack_tlp), 1282 OID_AUTO, "use_rrr", CTLFLAG_RW, 1283 &use_rack_rr, 1, 1284 "Do we use Rack Rapid Recovery"); 1285 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1286 SYSCTL_CHILDREN(rack_tlp), 1287 OID_AUTO, "post_rec_labc", CTLFLAG_RW, 1288 &rack_max_abc_post_recovery, 2, 1289 "Since we do early recovery, do we override the l_abc to a value, if so what?"); 1290 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1291 SYSCTL_CHILDREN(rack_tlp), 1292 OID_AUTO, "nonrxt_use_cr", CTLFLAG_RW, 1293 &rack_non_rxt_use_cr, 0, 1294 "Do we use ss/ca rate if in recovery we are transmitting a new data chunk"); 1295 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1296 SYSCTL_CHILDREN(rack_tlp), 1297 OID_AUTO, "tlpmethod", CTLFLAG_RW, 1298 &rack_tlp_threshold_use, TLP_USE_TWO_ONE, 1299 "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2"); 1300 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1301 SYSCTL_CHILDREN(rack_tlp), 1302 OID_AUTO, "limit", CTLFLAG_RW, 1303 &rack_tlp_limit, 2, 1304 "How many TLP's can be sent without sending new data"); 1305 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1306 SYSCTL_CHILDREN(rack_tlp), 1307 OID_AUTO, "use_greater", CTLFLAG_RW, 1308 &rack_tlp_use_greater, 1, 1309 "Should we use the rack_rtt time if its greater than srtt"); 1310 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1311 SYSCTL_CHILDREN(rack_tlp), 1312 OID_AUTO, "tlpminto", CTLFLAG_RW, 1313 &rack_tlp_min, 10000, 1314 "TLP minimum timeout per the specification (in microseconds)"); 1315 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1316 SYSCTL_CHILDREN(rack_tlp), 1317 OID_AUTO, "send_oldest", CTLFLAG_RW, 1318 &rack_always_send_oldest, 0, 1319 "Should we always send the oldest TLP and RACK-TLP"); 1320 
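	/*
	 * A usage sketch (not part of this file): assuming the stack's
	 * root node is registered under the usual "rack" branch, the OIDs
	 * built above show up as net.inet.tcp.rack.<node>.<name>, so the
	 * TLP knobs in this group could be read or tuned from userland
	 * with, e.g.:
	 *
	 *	sysctl net.inet.tcp.rack.tlp.use_greater
	 *	sysctl net.inet.tcp.rack.tlp.tlpminto=10000
	 *
	 * The exact branch name depends on how rack_sysctl_root is
	 * created elsewhere in the stack.
	 */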
SYSCTL_ADD_S32(&rack_sysctl_ctx, 1321 SYSCTL_CHILDREN(rack_tlp), 1322 OID_AUTO, "rack_tlimit", CTLFLAG_RW, 1323 &rack_limited_retran, 0, 1324 "How many times can a rack timeout drive out sends"); 1325 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1326 SYSCTL_CHILDREN(rack_tlp), 1327 OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW, 1328 &rack_lower_cwnd_at_tlp, 0, 1329 "When a TLP completes a retran should we enter recovery"); 1330 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1331 SYSCTL_CHILDREN(rack_tlp), 1332 OID_AUTO, "reorder_thresh", CTLFLAG_RW, 1333 &rack_reorder_thresh, 2, 1334 "What factor for rack will be added when seeing reordering (shift right)"); 1335 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1336 SYSCTL_CHILDREN(rack_tlp), 1337 OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW, 1338 &rack_tlp_thresh, 1, 1339 "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)"); 1340 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1341 SYSCTL_CHILDREN(rack_tlp), 1342 OID_AUTO, "reorder_fade", CTLFLAG_RW, 1343 &rack_reorder_fade, 60000000, 1344 "Does reorder detection fade, if so how many microseconds (0 means never)"); 1345 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1346 SYSCTL_CHILDREN(rack_tlp), 1347 OID_AUTO, "pktdelay", CTLFLAG_RW, 1348 &rack_pkt_delay, 1000, 1349 "Extra RACK time (in microseconds) besides reordering thresh"); 1350 1351 /* Timer related controls */ 1352 rack_timers = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1353 SYSCTL_CHILDREN(rack_sysctl_root), 1354 OID_AUTO, 1355 "timers", 1356 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1357 "Timer related controls"); 1358 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1359 SYSCTL_CHILDREN(rack_timers), 1360 OID_AUTO, "persmin", CTLFLAG_RW, 1361 &rack_persist_min, 250000, 1362 "What is the minimum time in microseconds between persists"); 1363 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1364 SYSCTL_CHILDREN(rack_timers), 1365 OID_AUTO, "persmax", CTLFLAG_RW, 1366 &rack_persist_max, 2000000, 1367 "What is the largest delay in microseconds between persists"); 1368 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1369 SYSCTL_CHILDREN(rack_timers), 1370 OID_AUTO, "delayed_ack", CTLFLAG_RW, 1371 &rack_delayed_ack_time, 40000, 1372 "Delayed ack time (40ms in microseconds)"); 1373 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1374 SYSCTL_CHILDREN(rack_timers), 1375 OID_AUTO, "minrto", CTLFLAG_RW, 1376 &rack_rto_min, 30000, 1377 "Minimum RTO in microseconds -- set with caution below 1000 due to TLP"); 1378 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1379 SYSCTL_CHILDREN(rack_timers), 1380 OID_AUTO, "maxrto", CTLFLAG_RW, 1381 &rack_rto_max, 4000000, 1382 "Maximum RTO in microseconds -- should be at least as large as min_rto"); 1383 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1384 SYSCTL_CHILDREN(rack_timers), 1385 OID_AUTO, "minto", CTLFLAG_RW, 1386 &rack_min_to, 1000, 1387 "Minimum rack timeout in microseconds"); 1388 /* Measure controls */ 1389 rack_measure = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1390 SYSCTL_CHILDREN(rack_sysctl_root), 1391 OID_AUTO, 1392 "measure", 1393 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1394 "Measure related controls"); 1395 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1396 SYSCTL_CHILDREN(rack_measure), 1397 OID_AUTO, "wma_divisor", CTLFLAG_RW, 1398 &rack_wma_divisor, 8, 1399 "When doing b/w calculation what is the divisor for the WMA"); 1400 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1401 SYSCTL_CHILDREN(rack_measure), 1402 OID_AUTO, "end_cwnd", CTLFLAG_RW, 1403 &rack_cwnd_block_ends_measure, 0, 1404 "Does a cwnd just-return end the measurement window (app limited)"); 1405 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1406 SYSCTL_CHILDREN(rack_measure), 1407 OID_AUTO, "end_rwnd", CTLFLAG_RW, 1408 
&rack_rwnd_block_ends_measure, 0, 1409 "Does an rwnd just-return end the measurement window (app limited -- not persists)"); 1410 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1411 SYSCTL_CHILDREN(rack_measure), 1412 OID_AUTO, "min_target", CTLFLAG_RW, 1413 &rack_def_data_window, 20, 1414 "What is the minimum target window (in mss) for GP measurements"); 1415 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1416 SYSCTL_CHILDREN(rack_measure), 1417 OID_AUTO, "goal_bdp", CTLFLAG_RW, 1418 &rack_goal_bdp, 2, 1419 "What is the goal BDP to measure"); 1420 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1421 SYSCTL_CHILDREN(rack_measure), 1422 OID_AUTO, "min_srtts", CTLFLAG_RW, 1423 &rack_min_srtts, 1, 1424 "What is the minimum number of SRTT's required for a GP measurement"); 1425 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1426 SYSCTL_CHILDREN(rack_measure), 1427 OID_AUTO, "min_measure_tim", CTLFLAG_RW, 1428 &rack_min_measure_usec, 0, 1429 "What is the minimum time for a measurement (if 0, this is off)"); 1430 /* Features */ 1431 rack_features = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1432 SYSCTL_CHILDREN(rack_sysctl_root), 1433 OID_AUTO, 1434 "features", 1435 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1436 "Feature controls"); 1437 SYSCTL_ADD_U64(&rack_sysctl_ctx, 1438 SYSCTL_CHILDREN(rack_features), 1439 OID_AUTO, "rxt_clamp_thresh", CTLFLAG_RW, 1440 &rack_rxt_clamp_thresh, 0, 1441 "Bit encoded clamping setup bits CCCC CCCCC UUUU UULF PPPP PPPP PPPP PPPP"); 1442 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1443 SYSCTL_CHILDREN(rack_features), 1444 OID_AUTO, "hybrid_set_maxseg", CTLFLAG_RW, 1445 &rack_hybrid_allow_set_maxseg, 0, 1446 "Should hybrid pacing allow the setmss command"); 1447 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1448 SYSCTL_CHILDREN(rack_features), 1449 OID_AUTO, "cmpack", CTLFLAG_RW, 1450 &rack_use_cmp_acks, 1, 1451 "Should RACK have LRO send compressed acks"); 1452 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1453 SYSCTL_CHILDREN(rack_features), 1454 OID_AUTO, "fsb", CTLFLAG_RW, 1455 &rack_use_fsb, 1, 1456 "Should RACK use the fast send block?"); 1457 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1458 SYSCTL_CHILDREN(rack_features), 1459 OID_AUTO, "rfo", CTLFLAG_RW, 1460 &rack_use_rfo, 1, 1461 "Should RACK use rack_fast_output()?"); 1462 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1463 SYSCTL_CHILDREN(rack_features), 1464 OID_AUTO, "rsmrfo", CTLFLAG_RW, 1465 &rack_use_rsm_rfo, 1, 1466 "Should RACK use rack_fast_rsm_output()?"); 1467 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1468 SYSCTL_CHILDREN(rack_features), 1469 OID_AUTO, "non_paced_lro_queue", CTLFLAG_RW, 1470 &rack_enable_mqueue_for_nonpaced, 0, 1471 "Should RACK use mbuf queuing for non-paced connections"); 1472 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1473 SYSCTL_CHILDREN(rack_features), 1474 OID_AUTO, "hystartplusplus", CTLFLAG_RW, 1475 &rack_do_hystart, 0, 1476 "Should RACK enable HyStart++ on connections?"); 1477 /* Misc rack controls */ 1478 rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1479 SYSCTL_CHILDREN(rack_sysctl_root), 1480 OID_AUTO, 1481 "misc", 1482 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1483 "Misc related controls"); 1484 #ifdef TCP_ACCOUNTING 1485 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1486 SYSCTL_CHILDREN(rack_misc), 1487 OID_AUTO, "tcp_acct", CTLFLAG_RW, 1488 &rack_tcp_accounting, 0, 1489 "Should we turn on TCP accounting for all rack sessions?"); 1490 #endif 1491 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1492 SYSCTL_CHILDREN(rack_misc), 1493 OID_AUTO, "dnd", CTLFLAG_RW, 1494 &rack_dnd_default, 0, 1495 "Do not disturb default for rack_rrr = 3"); 1496 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1497 SYSCTL_CHILDREN(rack_misc), 1498 OID_AUTO, "sad_seg_per", CTLFLAG_RW, 1499 &sad_seg_size_per, 800, 1500 
"Percentage of segment size needed in a sack 800 = 80.0?"); 1501 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1502 SYSCTL_CHILDREN(rack_misc), 1503 OID_AUTO, "rxt_controls", CTLFLAG_RW, 1504 &rack_rxt_controls, 0, 1505 "Retransmit sending size controls (valid values 0, 1, 2 default=1)?"); 1506 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1507 SYSCTL_CHILDREN(rack_misc), 1508 OID_AUTO, "rack_hibeta", CTLFLAG_RW, 1509 &rack_hibeta_setting, 0, 1510 "Do we ue a high beta (80 instead of 50)?"); 1511 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1512 SYSCTL_CHILDREN(rack_misc), 1513 OID_AUTO, "apply_rtt_with_low_conf", CTLFLAG_RW, 1514 &rack_apply_rtt_with_reduced_conf, 0, 1515 "When a persist or keep-alive probe is not answered do we calculate rtt on subsequent answers?"); 1516 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1517 SYSCTL_CHILDREN(rack_misc), 1518 OID_AUTO, "rack_dsack_ctl", CTLFLAG_RW, 1519 &rack_dsack_std_based, 3, 1520 "How do we process dsack with respect to rack timers, bit field, 3 is standards based?"); 1521 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1522 SYSCTL_CHILDREN(rack_misc), 1523 OID_AUTO, "prr_addback_max", CTLFLAG_RW, 1524 &rack_prr_addbackmax, 2, 1525 "What is the maximum number of MSS we allow to be added back if prr can't send all its data?"); 1526 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1527 SYSCTL_CHILDREN(rack_misc), 1528 OID_AUTO, "stats_gets_ms", CTLFLAG_RW, 1529 &rack_stats_gets_ms_rtt, 1, 1530 "What do we feed the stats framework (1 = ms_rtt, 0 = us_rtt, 2 = ms_rtt from hdwr, > 2 usec rtt from hdwr)?"); 1531 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1532 SYSCTL_CHILDREN(rack_misc), 1533 OID_AUTO, "clientlowbuf", CTLFLAG_RW, 1534 &rack_client_low_buf, 0, 1535 "Client low buffer level (below this we are more aggressive in DGP exiting recovery (0 = off)?"); 1536 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1537 SYSCTL_CHILDREN(rack_misc), 1538 OID_AUTO, "defprofile", CTLFLAG_RW, 1539 &rack_def_profile, 0, 1540 "Should RACK use a default profile (0=no, num == profile num)?"); 1541 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1542 SYSCTL_CHILDREN(rack_misc), 1543 OID_AUTO, "shared_cwnd", CTLFLAG_RW, 1544 &rack_enable_shared_cwnd, 1, 1545 "Should RACK try to use the shared cwnd on connections where allowed"); 1546 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1547 SYSCTL_CHILDREN(rack_misc), 1548 OID_AUTO, "limits_on_scwnd", CTLFLAG_RW, 1549 &rack_limits_scwnd, 1, 1550 "Should RACK place low end time limits on the shared cwnd feature"); 1551 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1552 SYSCTL_CHILDREN(rack_misc), 1553 OID_AUTO, "no_prr", CTLFLAG_RW, 1554 &rack_disable_prr, 0, 1555 "Should RACK not use prr and only pace (must have pacing on)"); 1556 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1557 SYSCTL_CHILDREN(rack_misc), 1558 OID_AUTO, "bb_verbose", CTLFLAG_RW, 1559 &rack_verbose_logging, 0, 1560 "Should RACK black box logging be verbose"); 1561 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1562 SYSCTL_CHILDREN(rack_misc), 1563 OID_AUTO, "data_after_close", CTLFLAG_RW, 1564 &rack_ignore_data_after_close, 1, 1565 "Do we hold off sending a RST until all pending data is ack'd"); 1566 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1567 SYSCTL_CHILDREN(rack_misc), 1568 OID_AUTO, "no_sack_needed", CTLFLAG_RW, 1569 &rack_sack_not_required, 1, 1570 "Do we allow rack to run on connections not supporting SACK"); 1571 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1572 SYSCTL_CHILDREN(rack_misc), 1573 OID_AUTO, "prr_sendalot", CTLFLAG_RW, 1574 &rack_send_a_lot_in_prr, 1, 1575 "Send a lot in prr"); 1576 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1577 SYSCTL_CHILDREN(rack_misc), 1578 OID_AUTO, "autoscale", CTLFLAG_RW, 1579 &rack_autosndbuf_inc, 20, 
1580 "What percentage should rack scale up its snd buffer by?"); 1581 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1582 SYSCTL_CHILDREN(rack_misc), 1583 OID_AUTO, "rnds_for_rxt_clamp", CTLFLAG_RW, 1584 &rack_rxt_min_rnds, 10, 1585 "Number of rounds needed between RTT clamps due to high loss rates"); 1586 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1587 SYSCTL_CHILDREN(rack_misc), 1588 OID_AUTO, "rnds_for_unclamp", CTLFLAG_RW, 1589 &rack_unclamp_round_thresh, 100, 1590 "Number of rounds needed with no loss to unclamp"); 1591 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1592 SYSCTL_CHILDREN(rack_misc), 1593 OID_AUTO, "rxt_threshs_for_unclamp", CTLFLAG_RW, 1594 &rack_unclamp_rxt_thresh, 5, 1595 "Percentage of retransmits we need to be under to unclamp (5 = .5 percent)\n"); 1596 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1597 SYSCTL_CHILDREN(rack_misc), 1598 OID_AUTO, "clamp_ss_upper", CTLFLAG_RW, 1599 &rack_clamp_ss_upper, 110, 1600 "Clamp percentage ceiling in SS?"); 1601 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1602 SYSCTL_CHILDREN(rack_misc), 1603 OID_AUTO, "clamp_ca_upper", CTLFLAG_RW, 1604 &rack_clamp_ca_upper, 110, 1605 "Clamp percentage ceiling in CA?"); 1606 /* Sack Attacker detection stuff */ 1607 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1608 SYSCTL_CHILDREN(rack_attack), 1609 OID_AUTO, "merge_out", CTLFLAG_RW, 1610 &rack_merge_out_sacks_on_attack, 0, 1611 "Do we merge the sendmap when we decide we are being attacked?"); 1612 1613 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1614 SYSCTL_CHILDREN(rack_attack), 1615 OID_AUTO, "detect_highsackratio", CTLFLAG_RW, 1616 &rack_highest_sack_thresh_seen, 0, 1617 "Highest sack to ack ratio seen"); 1618 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1619 SYSCTL_CHILDREN(rack_attack), 1620 OID_AUTO, "detect_highmoveratio", CTLFLAG_RW, 1621 &rack_highest_move_thresh_seen, 0, 1622 "Highest move to non-move ratio seen"); 1623 rack_ack_total = counter_u64_alloc(M_WAITOK); 1624 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1625 SYSCTL_CHILDREN(rack_attack), 1626 OID_AUTO, "acktotal", CTLFLAG_RD, 1627 &rack_ack_total, 1628 "Total number of Ack's"); 1629 rack_express_sack = counter_u64_alloc(M_WAITOK); 1630 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1631 SYSCTL_CHILDREN(rack_attack), 1632 OID_AUTO, "exp_sacktotal", CTLFLAG_RD, 1633 &rack_express_sack, 1634 "Total expresss number of Sack's"); 1635 rack_sack_total = counter_u64_alloc(M_WAITOK); 1636 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1637 SYSCTL_CHILDREN(rack_attack), 1638 OID_AUTO, "sacktotal", CTLFLAG_RD, 1639 &rack_sack_total, 1640 "Total number of SACKs"); 1641 rack_move_none = counter_u64_alloc(M_WAITOK); 1642 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1643 SYSCTL_CHILDREN(rack_attack), 1644 OID_AUTO, "move_none", CTLFLAG_RD, 1645 &rack_move_none, 1646 "Total number of SACK index reuse of positions under threshold"); 1647 rack_move_some = counter_u64_alloc(M_WAITOK); 1648 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1649 SYSCTL_CHILDREN(rack_attack), 1650 OID_AUTO, "move_some", CTLFLAG_RD, 1651 &rack_move_some, 1652 "Total number of SACK index reuse of positions over threshold"); 1653 rack_sack_attacks_detected = counter_u64_alloc(M_WAITOK); 1654 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1655 SYSCTL_CHILDREN(rack_attack), 1656 OID_AUTO, "attacks", CTLFLAG_RD, 1657 &rack_sack_attacks_detected, 1658 "Total number of SACK attackers that had sack disabled"); 1659 rack_sack_attacks_reversed = counter_u64_alloc(M_WAITOK); 1660 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1661 SYSCTL_CHILDREN(rack_attack), 1662 OID_AUTO, "reversed", CTLFLAG_RD, 1663 &rack_sack_attacks_reversed, 1664 "Total number of 
SACK attackers that were later determined false positive"); 1665 rack_sack_attacks_suspect = counter_u64_alloc(M_WAITOK); 1666 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1667 SYSCTL_CHILDREN(rack_attack), 1668 OID_AUTO, "suspect", CTLFLAG_RD, 1669 &rack_sack_attacks_suspect, 1670 "Total number of SACKs that triggered early detection"); 1671 1672 rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK); 1673 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1674 SYSCTL_CHILDREN(rack_attack), 1675 OID_AUTO, "nextmerge", CTLFLAG_RD, 1676 &rack_sack_used_next_merge, 1677 "Total number of times we used the next merge"); 1678 rack_sack_used_prev_merge = counter_u64_alloc(M_WAITOK); 1679 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1680 SYSCTL_CHILDREN(rack_attack), 1681 OID_AUTO, "prevmerge", CTLFLAG_RD, 1682 &rack_sack_used_prev_merge, 1683 "Total number of times we used the prev merge"); 1684 /* Counters */ 1685 rack_total_bytes = counter_u64_alloc(M_WAITOK); 1686 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1687 SYSCTL_CHILDREN(rack_counters), 1688 OID_AUTO, "totalbytes", CTLFLAG_RD, 1689 &rack_total_bytes, 1690 "Total number of bytes sent"); 1691 rack_fto_send = counter_u64_alloc(M_WAITOK); 1692 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1693 SYSCTL_CHILDREN(rack_counters), 1694 OID_AUTO, "fto_send", CTLFLAG_RD, 1695 &rack_fto_send, "Total number of rack_fast_output sends"); 1696 rack_fto_rsm_send = counter_u64_alloc(M_WAITOK); 1697 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1698 SYSCTL_CHILDREN(rack_counters), 1699 OID_AUTO, "fto_rsm_send", CTLFLAG_RD, 1700 &rack_fto_rsm_send, "Total number of rack_fast_rsm_output sends"); 1701 rack_nfto_resend = counter_u64_alloc(M_WAITOK); 1702 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1703 SYSCTL_CHILDREN(rack_counters), 1704 OID_AUTO, "nfto_resend", CTLFLAG_RD, 1705 &rack_nfto_resend, "Total number of rack_output retransmissions"); 1706 rack_non_fto_send = counter_u64_alloc(M_WAITOK); 1707 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1708 SYSCTL_CHILDREN(rack_counters), 1709 OID_AUTO, "nfto_send", CTLFLAG_RD, 1710 &rack_non_fto_send, "Total number of rack_output first sends"); 1711 rack_extended_rfo = counter_u64_alloc(M_WAITOK); 1712 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1713 SYSCTL_CHILDREN(rack_counters), 1714 OID_AUTO, "rfo_extended", CTLFLAG_RD, 1715 &rack_extended_rfo, "Total number of times we extended rfo"); 1716 1717 rack_hw_pace_init_fail = counter_u64_alloc(M_WAITOK); 1718 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1719 SYSCTL_CHILDREN(rack_counters), 1720 OID_AUTO, "hwpace_init_fail", CTLFLAG_RD, 1721 &rack_hw_pace_init_fail, "Total number of times we failed to initialize hw pacing"); 1722 rack_hw_pace_lost = counter_u64_alloc(M_WAITOK); 1723 1724 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1725 SYSCTL_CHILDREN(rack_counters), 1726 OID_AUTO, "hwpace_lost", CTLFLAG_RD, 1727 &rack_hw_pace_lost, "Total number of times we lost hw pacing"); 1728 rack_tlp_tot = counter_u64_alloc(M_WAITOK); 1729 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1730 SYSCTL_CHILDREN(rack_counters), 1731 OID_AUTO, "tlp_to_total", CTLFLAG_RD, 1732 &rack_tlp_tot, 1733 "Total number of tail loss probe expirations"); 1734 rack_tlp_newdata = counter_u64_alloc(M_WAITOK); 1735 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1736 SYSCTL_CHILDREN(rack_counters), 1737 OID_AUTO, "tlp_new", CTLFLAG_RD, 1738 &rack_tlp_newdata, 1739 "Total number of tail loss probe sending new data"); 1740 rack_tlp_retran = counter_u64_alloc(M_WAITOK); 1741 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1742 
SYSCTL_CHILDREN(rack_counters), 1743 OID_AUTO, "tlp_retran", CTLFLAG_RD, 1744 &rack_tlp_retran, 1745 "Total number of tail loss probe sending retransmitted data"); 1746 rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK); 1747 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1748 SYSCTL_CHILDREN(rack_counters), 1749 OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD, 1750 &rack_tlp_retran_bytes, 1751 "Total bytes of tail loss probe sending retransmitted data"); 1752 rack_to_tot = counter_u64_alloc(M_WAITOK); 1753 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1754 SYSCTL_CHILDREN(rack_counters), 1755 OID_AUTO, "rack_to_tot", CTLFLAG_RD, 1756 &rack_to_tot, 1757 "Total number of times the rack timeout expired"); 1758 rack_saw_enobuf = counter_u64_alloc(M_WAITOK); 1759 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1760 SYSCTL_CHILDREN(rack_counters), 1761 OID_AUTO, "saw_enobufs", CTLFLAG_RD, 1762 &rack_saw_enobuf, 1763 "Total number of times a send returned enobuf for non-hdwr paced connections"); 1764 rack_saw_enobuf_hw = counter_u64_alloc(M_WAITOK); 1765 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1766 SYSCTL_CHILDREN(rack_counters), 1767 OID_AUTO, "saw_enobufs_hw", CTLFLAG_RD, 1768 &rack_saw_enobuf_hw, 1769 "Total number of times a send returned enobuf for hdwr paced connections"); 1770 rack_saw_enetunreach = counter_u64_alloc(M_WAITOK); 1771 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1772 SYSCTL_CHILDREN(rack_counters), 1773 OID_AUTO, "saw_enetunreach", CTLFLAG_RD, 1774 &rack_saw_enetunreach, 1775 "Total number of times a send received an enetunreachable"); 1776 rack_hot_alloc = counter_u64_alloc(M_WAITOK); 1777 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1778 SYSCTL_CHILDREN(rack_counters), 1779 OID_AUTO, "alloc_hot", CTLFLAG_RD, 1780 &rack_hot_alloc, 1781 "Total allocations from the top of our list"); 1782 rack_to_alloc = counter_u64_alloc(M_WAITOK); 1783 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1784 SYSCTL_CHILDREN(rack_counters), 1785 OID_AUTO, "allocs", CTLFLAG_RD, 1786 &rack_to_alloc, 1787 "Total allocations of tracking structures"); 1788 rack_to_alloc_hard = counter_u64_alloc(M_WAITOK); 1789 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1790 SYSCTL_CHILDREN(rack_counters), 1791 OID_AUTO, "allochard", CTLFLAG_RD, 1792 &rack_to_alloc_hard, 1793 "Total allocations done with sleeping the hard way"); 1794 rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK); 1795 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1796 SYSCTL_CHILDREN(rack_counters), 1797 OID_AUTO, "allocemerg", CTLFLAG_RD, 1798 &rack_to_alloc_emerg, 1799 "Total allocations done from emergency cache"); 1800 rack_to_alloc_limited = counter_u64_alloc(M_WAITOK); 1801 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1802 SYSCTL_CHILDREN(rack_counters), 1803 OID_AUTO, "alloc_limited", CTLFLAG_RD, 1804 &rack_to_alloc_limited, 1805 "Total allocations dropped due to limit"); 1806 rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK); 1807 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1808 SYSCTL_CHILDREN(rack_counters), 1809 OID_AUTO, "alloc_limited_conns", CTLFLAG_RD, 1810 &rack_alloc_limited_conns, 1811 "Connections with allocations dropped due to limit"); 1812 rack_split_limited = counter_u64_alloc(M_WAITOK); 1813 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1814 SYSCTL_CHILDREN(rack_counters), 1815 OID_AUTO, "split_limited", CTLFLAG_RD, 1816 &rack_split_limited, 1817 "Split allocations dropped due to limit"); 1818 rack_rxt_clamps_cwnd = counter_u64_alloc(M_WAITOK); 1819 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1820 SYSCTL_CHILDREN(rack_counters), 1821 OID_AUTO, "rxt_clamps_cwnd", CTLFLAG_RD, 1822 
&rack_rxt_clamps_cwnd, 1823 "Number of times that excessive rxt clamped the cwnd down"); 1824 rack_rxt_clamps_cwnd_uniq = counter_u64_alloc(M_WAITOK); 1825 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1826 SYSCTL_CHILDREN(rack_counters), 1827 OID_AUTO, "rxt_clamps_cwnd_uniq", CTLFLAG_RD, 1828 &rack_rxt_clamps_cwnd_uniq, 1829 "Number of connections that have had excessive rxt clamp the cwnd down"); 1830 rack_persists_sends = counter_u64_alloc(M_WAITOK); 1831 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1832 SYSCTL_CHILDREN(rack_counters), 1833 OID_AUTO, "persist_sends", CTLFLAG_RD, 1834 &rack_persists_sends, 1835 "Number of times we sent a persist probe"); 1836 rack_persists_acks = counter_u64_alloc(M_WAITOK); 1837 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1838 SYSCTL_CHILDREN(rack_counters), 1839 OID_AUTO, "persist_acks", CTLFLAG_RD, 1840 &rack_persists_acks, 1841 "Number of times a persist probe was acked"); 1842 rack_persists_loss = counter_u64_alloc(M_WAITOK); 1843 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1844 SYSCTL_CHILDREN(rack_counters), 1845 OID_AUTO, "persist_loss", CTLFLAG_RD, 1846 &rack_persists_loss, 1847 "Number of times we detected a lost persist probe (no ack)"); 1848 rack_persists_lost_ends = counter_u64_alloc(M_WAITOK); 1849 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1850 SYSCTL_CHILDREN(rack_counters), 1851 OID_AUTO, "persist_loss_ends", CTLFLAG_RD, 1852 &rack_persists_lost_ends, 1853 "Number of runs of lost persist probes (no ack) that ended with a PERSIST abort"); 1854 #ifdef INVARIANTS 1855 rack_adjust_map_bw = counter_u64_alloc(M_WAITOK); 1856 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1857 SYSCTL_CHILDREN(rack_counters), 1858 OID_AUTO, "map_adjust_req", CTLFLAG_RD, 1859 &rack_adjust_map_bw, 1860 "Number of times we hit the case where the sb went up and down on a sendmap entry"); 1861 #endif 1862 rack_multi_single_eq = counter_u64_alloc(M_WAITOK); 1863 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1864 SYSCTL_CHILDREN(rack_counters), 1865 OID_AUTO, "cmp_ack_equiv", CTLFLAG_RD, 1866 &rack_multi_single_eq, 1867 "Total number of acks represented by compressed acks"); 1868 rack_proc_non_comp_ack = counter_u64_alloc(M_WAITOK); 1869 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1870 SYSCTL_CHILDREN(rack_counters), 1871 OID_AUTO, "cmp_ack_not", CTLFLAG_RD, 1872 &rack_proc_non_comp_ack, 1873 "Number of non-compressed acks that we processed"); 1874 1875 1876 rack_sack_proc_all = counter_u64_alloc(M_WAITOK); 1877 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1878 SYSCTL_CHILDREN(rack_counters), 1879 OID_AUTO, "sack_long", CTLFLAG_RD, 1880 &rack_sack_proc_all, 1881 "Total times we had to walk whole list for sack processing"); 1882 rack_sack_proc_restart = counter_u64_alloc(M_WAITOK); 1883 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1884 SYSCTL_CHILDREN(rack_counters), 1885 OID_AUTO, "sack_restart", CTLFLAG_RD, 1886 &rack_sack_proc_restart, 1887 "Total times we had to walk whole list due to a restart"); 1888 rack_sack_proc_short = counter_u64_alloc(M_WAITOK); 1889 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1890 SYSCTL_CHILDREN(rack_counters), 1891 OID_AUTO, "sack_short", CTLFLAG_RD, 1892 &rack_sack_proc_short, 1893 "Total times we took shortcut for sack processing"); 1894 rack_sack_skipped_acked = counter_u64_alloc(M_WAITOK); 1895 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1896 SYSCTL_CHILDREN(rack_attack), 1897 OID_AUTO, "skipacked", CTLFLAG_RD, 1898 &rack_sack_skipped_acked, 1899 "Total number of times we skipped previously sacked"); 1900 rack_sack_splits = counter_u64_alloc(M_WAITOK); 1901 
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1902 SYSCTL_CHILDREN(rack_attack), 1903 OID_AUTO, "ofsplit", CTLFLAG_RD, 1904 &rack_sack_splits, 1905 "Total number of times we did the old fashion tree split"); 1906 rack_input_idle_reduces = counter_u64_alloc(M_WAITOK); 1907 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1908 SYSCTL_CHILDREN(rack_counters), 1909 OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD, 1910 &rack_input_idle_reduces, 1911 "Total number of idle reductions on input"); 1912 rack_collapsed_win_seen = counter_u64_alloc(M_WAITOK); 1913 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1914 SYSCTL_CHILDREN(rack_counters), 1915 OID_AUTO, "collapsed_win_seen", CTLFLAG_RD, 1916 &rack_collapsed_win_seen, 1917 "Total number of collapsed window events seen (where our window shrinks)"); 1918 1919 rack_collapsed_win = counter_u64_alloc(M_WAITOK); 1920 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1921 SYSCTL_CHILDREN(rack_counters), 1922 OID_AUTO, "collapsed_win", CTLFLAG_RD, 1923 &rack_collapsed_win, 1924 "Total number of collapsed window events where we mark packets"); 1925 rack_collapsed_win_rxt = counter_u64_alloc(M_WAITOK); 1926 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1927 SYSCTL_CHILDREN(rack_counters), 1928 OID_AUTO, "collapsed_win_rxt", CTLFLAG_RD, 1929 &rack_collapsed_win_rxt, 1930 "Total number of packets that were retransmitted"); 1931 rack_collapsed_win_rxt_bytes = counter_u64_alloc(M_WAITOK); 1932 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1933 SYSCTL_CHILDREN(rack_counters), 1934 OID_AUTO, "collapsed_win_bytes", CTLFLAG_RD, 1935 &rack_collapsed_win_rxt_bytes, 1936 "Total number of bytes that were retransmitted"); 1937 rack_try_scwnd = counter_u64_alloc(M_WAITOK); 1938 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1939 SYSCTL_CHILDREN(rack_counters), 1940 OID_AUTO, "tried_scwnd", CTLFLAG_RD, 1941 &rack_try_scwnd, 1942 "Total number of scwnd attempts"); 1943 COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); 1944 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 1945 OID_AUTO, "outsize", CTLFLAG_RD, 1946 rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes"); 1947 COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK); 1948 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 1949 OID_AUTO, "opts", CTLFLAG_RD, 1950 rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats"); 1951 SYSCTL_ADD_PROC(&rack_sysctl_ctx, 1952 SYSCTL_CHILDREN(rack_sysctl_root), 1953 OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 1954 &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters"); 1955 } 1956 1957 static uint32_t 1958 rc_init_window(struct tcp_rack *rack) 1959 { 1960 uint32_t win; 1961 1962 if (rack->rc_init_win == 0) { 1963 /* 1964 * Nothing set by the user, use the system stack 1965 * default. 
1966 */ 1967 return (tcp_compute_initwnd(tcp_maxseg(rack->rc_tp))); 1968 } 1969 win = ctf_fixed_maxseg(rack->rc_tp) * rack->rc_init_win; 1970 return (win); 1971 } 1972 1973 static uint64_t 1974 rack_get_fixed_pacing_bw(struct tcp_rack *rack) 1975 { 1976 if (IN_FASTRECOVERY(rack->rc_tp->t_flags)) 1977 return (rack->r_ctl.rc_fixed_pacing_rate_rec); 1978 else if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) 1979 return (rack->r_ctl.rc_fixed_pacing_rate_ss); 1980 else 1981 return (rack->r_ctl.rc_fixed_pacing_rate_ca); 1982 } 1983 1984 static void 1985 rack_log_hybrid_bw(struct tcp_rack *rack, uint32_t seq, uint64_t cbw, uint64_t tim, 1986 uint64_t data, uint8_t mod, uint16_t aux, 1987 struct tcp_sendfile_track *cur, int line) 1988 { 1989 #ifdef TCP_REQUEST_TRK 1990 int do_log = 0; 1991 1992 /* 1993 * The rate cap one is noisy and only should come out when normal BB logging 1994 * is enabled, the other logs (not RATE_CAP and NOT CAP_CALC) only come out 1995 * once per chunk and make up the BBpoint that can be turned on by the client. 1996 */ 1997 if ((mod == HYBRID_LOG_RATE_CAP) || (mod == HYBRID_LOG_CAP_CALC)) { 1998 /* 1999 * The very noisy two need to only come out when 2000 * we have verbose logging on. 2001 */ 2002 if (rack_verbose_logging != 0) 2003 do_log = tcp_bblogging_on(rack->rc_tp); 2004 else 2005 do_log = 0; 2006 } else if (mod != HYBRID_LOG_BW_MEASURE) { 2007 /* 2008 * All other less noisy logs here except the measure which 2009 * also needs to come out on the point and the log. 2010 */ 2011 do_log = tcp_bblogging_on(rack->rc_tp); 2012 } else { 2013 do_log = tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING); 2014 } 2015 2016 if (do_log) { 2017 union tcp_log_stackspecific log; 2018 struct timeval tv; 2019 uint64_t lt_bw; 2020 2021 /* Convert our ms to a microsecond */ 2022 memset(&log, 0, sizeof(log)); 2023 2024 log.u_bbr.cwnd_gain = line; 2025 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2026 log.u_bbr.rttProp = tim; 2027 log.u_bbr.bw_inuse = cbw; 2028 log.u_bbr.delRate = rack_get_gp_est(rack); 2029 lt_bw = rack_get_lt_bw(rack); 2030 log.u_bbr.flex1 = seq; 2031 log.u_bbr.pacing_gain = aux; 2032 /* lt_bw = < flex3 | flex2 > */ 2033 log.u_bbr.flex2 = (uint32_t)(lt_bw & 0x00000000ffffffff); 2034 log.u_bbr.flex3 = (uint32_t)((lt_bw >> 32) & 0x00000000ffffffff); 2035 /* Record the last obtained us rtt in inflight */ 2036 if (cur == NULL) { 2037 /* Make sure we are looking at the right log if an overide comes in */ 2038 cur = rack->r_ctl.rc_last_sft; 2039 } 2040 if (rack->r_ctl.rack_rs.rs_flags != RACK_RTT_EMPTY) 2041 log.u_bbr.inflight = rack->r_ctl.rack_rs.rs_us_rtt; 2042 else { 2043 /* Use the last known rtt i.e. 
the rack-rtt */ 2044 log.u_bbr.inflight = rack->rc_rack_rtt; 2045 } 2046 if (cur != NULL) { 2047 uint64_t off; 2048 2049 log.u_bbr.cur_del_rate = cur->deadline; 2050 if ((mod == HYBRID_LOG_RATE_CAP) || (mod == HYBRID_LOG_CAP_CALC)) { 2051 /* start = < lost | pkt_epoch > */ 2052 log.u_bbr.pkt_epoch = (uint32_t)(cur->start & 0x00000000ffffffff); 2053 log.u_bbr.lost = (uint32_t)((cur->start >> 32) & 0x00000000ffffffff); 2054 log.u_bbr.flex6 = cur->start_seq; 2055 log.u_bbr.pkts_out = cur->end_seq; 2056 } else { 2057 /* start = < lost | pkt_epoch > */ 2058 log.u_bbr.pkt_epoch = (uint32_t)(cur->start & 0x00000000ffffffff); 2059 log.u_bbr.lost = (uint32_t)((cur->start >> 32) & 0x00000000ffffffff); 2060 /* end = < pkts_out | flex6 > */ 2061 log.u_bbr.flex6 = (uint32_t)(cur->end & 0x00000000ffffffff); 2062 log.u_bbr.pkts_out = (uint32_t)((cur->end >> 32) & 0x00000000ffffffff); 2063 } 2064 /* first_send = <lt_epoch | epoch> */ 2065 log.u_bbr.epoch = (uint32_t)(cur->first_send & 0x00000000ffffffff); 2066 log.u_bbr.lt_epoch = (uint32_t)((cur->first_send >> 32) & 0x00000000ffffffff); 2067 /* localtime = <delivered | applimited>*/ 2068 log.u_bbr.applimited = (uint32_t)(cur->localtime & 0x00000000ffffffff); 2069 log.u_bbr.delivered = (uint32_t)((cur->localtime >> 32) & 0x00000000ffffffff); 2070 #ifdef TCP_REQUEST_TRK 2071 off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_tcpreq_info[0]); 2072 log.u_bbr.bbr_substate = (uint8_t)(off / sizeof(struct tcp_sendfile_track)); 2073 #endif 2074 log.u_bbr.flex4 = (uint32_t)(rack->rc_tp->t_sndbytes - cur->sent_at_fs); 2075 log.u_bbr.flex5 = (uint32_t)(rack->rc_tp->t_snd_rxt_bytes - cur->rxt_at_fs); 2076 log.u_bbr.flex7 = (uint16_t)cur->hybrid_flags; 2077 } else { 2078 log.u_bbr.flex7 = 0xffff; 2079 log.u_bbr.cur_del_rate = 0xffffffffffffffff; 2080 } 2081 /* 2082 * Compose bbr_state to be a bit wise 0000ADHF 2083 * where A is the always_pace flag 2084 * where D is the dgp_on flag 2085 * where H is the hybrid_mode on flag 2086 * where F is the use_fixed_rate flag. 
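	 * For example, a connection with always_pace and dgp_on set but with
	 * hybrid_mode and use_fixed_rate clear is logged as bbr_state = 0xc
	 * (binary 1100).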
*/ 2088 log.u_bbr.bbr_state = rack->rc_always_pace; 2089 log.u_bbr.bbr_state <<= 1; 2090 log.u_bbr.bbr_state |= rack->dgp_on; 2091 log.u_bbr.bbr_state <<= 1; 2092 log.u_bbr.bbr_state |= rack->rc_hybrid_mode; 2093 log.u_bbr.bbr_state <<= 1; 2094 log.u_bbr.bbr_state |= rack->use_fixed_rate; 2095 log.u_bbr.flex8 = mod; 2096 tcp_log_event(rack->rc_tp, NULL, 2097 &rack->rc_inp->inp_socket->so_rcv, 2098 &rack->rc_inp->inp_socket->so_snd, 2099 TCP_HYBRID_PACING_LOG, 0, 2100 0, &log, false, NULL, __func__, __LINE__, &tv); 2101 2102 } 2103 #endif 2104 } 2105 2106 #ifdef TCP_REQUEST_TRK 2107 static void 2108 rack_log_hybrid_sends(struct tcp_rack *rack, struct tcp_sendfile_track *cur, int line) 2109 { 2110 if (tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING)) { 2111 union tcp_log_stackspecific log; 2112 struct timeval tv; 2113 uint64_t off; 2114 2115 /* Convert our ms to a microsecond */ 2116 memset(&log, 0, sizeof(log)); 2117 2118 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2119 log.u_bbr.cur_del_rate = rack->rc_tp->t_sndbytes; 2120 log.u_bbr.delRate = cur->sent_at_fs; 2121 log.u_bbr.rttProp = rack->rc_tp->t_snd_rxt_bytes; 2122 log.u_bbr.bw_inuse = cur->rxt_at_fs; 2123 log.u_bbr.cwnd_gain = line; 2124 off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_tcpreq_info[0]); 2125 log.u_bbr.bbr_substate = (uint8_t)(off / sizeof(struct tcp_sendfile_track)); 2126 /* start = < flex1 | flex2 > */ 2127 log.u_bbr.flex2 = (uint32_t)(cur->start & 0x00000000ffffffff); 2128 log.u_bbr.flex1 = (uint32_t)((cur->start >> 32) & 0x00000000ffffffff); 2129 /* end = < flex3 | flex4 > */ 2130 log.u_bbr.flex4 = (uint32_t)(cur->end & 0x00000000ffffffff); 2131 log.u_bbr.flex3 = (uint32_t)((cur->end >> 32) & 0x00000000ffffffff); 2132 2133 /* localtime = <delivered | applimited>*/ 2134 log.u_bbr.applimited = (uint32_t)(cur->localtime & 0x00000000ffffffff); 2135 log.u_bbr.delivered = (uint32_t)((cur->localtime >> 32) & 0x00000000ffffffff); 2136 /* client timestamp = <lt_epoch | epoch>*/ 2137 log.u_bbr.epoch = (uint32_t)(cur->timestamp & 0x00000000ffffffff); 2138 log.u_bbr.lt_epoch = (uint32_t)((cur->timestamp >> 32) & 0x00000000ffffffff); 2139 /* now set all the flags in */ 2140 log.u_bbr.pkts_out = cur->hybrid_flags; 2141 log.u_bbr.flex6 = cur->flags; 2142 /* 2143 * Last send time = <flex5 | pkt_epoch> note we do not distinguish cases 2144 * where a false retransmit occurred so first_send <-> lastsend may 2145 * include a longer time than it actually took if we have a false rxt. 
2146 */ 2147 log.u_bbr.pkt_epoch = (uint32_t)(rack->r_ctl.last_tmit_time_acked & 0x00000000ffffffff); 2148 log.u_bbr.flex5 = (uint32_t)((rack->r_ctl.last_tmit_time_acked >> 32) & 0x00000000ffffffff); 2149 2150 log.u_bbr.flex8 = HYBRID_LOG_SENT_LOST; 2151 tcp_log_event(rack->rc_tp, NULL, 2152 &rack->rc_inp->inp_socket->so_rcv, 2153 &rack->rc_inp->inp_socket->so_snd, 2154 TCP_HYBRID_PACING_LOG, 0, 2155 0, &log, false, NULL, __func__, __LINE__, &tv); 2156 } 2157 } 2158 #endif 2159 2160 static inline uint64_t 2161 rack_compensate_for_linerate(struct tcp_rack *rack, uint64_t bw) 2162 { 2163 uint64_t ret_bw, ether; 2164 uint64_t u_segsiz; 2165 2166 ether = rack->rc_tp->t_maxseg + sizeof(struct tcphdr); 2167 if (rack->r_is_v6){ 2168 #ifdef INET6 2169 ether += sizeof(struct ip6_hdr); 2170 #endif 2171 ether += 14; /* eheader size 6+6+2 */ 2172 } else { 2173 #ifdef INET 2174 ether += sizeof(struct ip); 2175 #endif 2176 ether += 14; /* eheader size 6+6+2 */ 2177 } 2178 u_segsiz = (uint64_t)min(ctf_fixed_maxseg(rack->rc_tp), rack->r_ctl.rc_pace_min_segs); 2179 ret_bw = bw; 2180 ret_bw *= ether; 2181 ret_bw /= u_segsiz; 2182 return (ret_bw); 2183 } 2184 2185 static void 2186 rack_rate_cap_bw(struct tcp_rack *rack, uint64_t *bw, int *capped) 2187 { 2188 #ifdef TCP_REQUEST_TRK 2189 struct timeval tv; 2190 uint64_t timenow, timeleft, lenleft, lengone, calcbw; 2191 #endif 2192 2193 if (rack->r_ctl.bw_rate_cap == 0) 2194 return; 2195 #ifdef TCP_REQUEST_TRK 2196 if (rack->rc_catch_up && rack->rc_hybrid_mode && 2197 (rack->r_ctl.rc_last_sft != NULL)) { 2198 /* 2199 * We have a dynamic cap. The original target 2200 * is in bw_rate_cap, but we need to look at 2201 * how long it is until we hit the deadline. 2202 */ 2203 struct tcp_sendfile_track *ent; 2204 2205 ent = rack->r_ctl.rc_last_sft; 2206 microuptime(&tv); 2207 timenow = tcp_tv_to_lusectick(&tv); 2208 if (timenow >= ent->deadline) { 2209 /* No time left we do DGP only */ 2210 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2211 0, 0, 0, HYBRID_LOG_OUTOFTIME, 0, ent, __LINE__); 2212 rack->r_ctl.bw_rate_cap = 0; 2213 return; 2214 } 2215 /* We have the time */ 2216 timeleft = rack->r_ctl.rc_last_sft->deadline - timenow; 2217 if (timeleft < HPTS_MSEC_IN_SEC) { 2218 /* If there is less than a ms left just use DGPs rate */ 2219 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2220 0, timeleft, 0, HYBRID_LOG_OUTOFTIME, 0, ent, __LINE__); 2221 rack->r_ctl.bw_rate_cap = 0; 2222 return; 2223 } 2224 /* 2225 * Now lets find the amount of data left to send. 2226 * 2227 * Now ideally we want to use the end_seq to figure out how much more 2228 * but it might not be possible (only if we have the TRACK_FG_COMP on the entry.. 2229 */ 2230 if (ent->flags & TCP_TRK_TRACK_FLG_COMP) { 2231 if (SEQ_GT(ent->end_seq, rack->rc_tp->snd_una)) 2232 lenleft = ent->end_seq - rack->rc_tp->snd_una; 2233 else { 2234 /* TSNH, we should catch it at the send */ 2235 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2236 0, timeleft, 0, HYBRID_LOG_CAPERROR, 0, ent, __LINE__); 2237 rack->r_ctl.bw_rate_cap = 0; 2238 return; 2239 } 2240 } else { 2241 /* 2242 * The hard way, figure out how much is gone and then 2243 * take that away from the total the client asked for 2244 * (thats off by tls overhead if this is tls). 
2245 */ 2246 if (SEQ_GT(rack->rc_tp->snd_una, ent->start_seq)) 2247 lengone = rack->rc_tp->snd_una - ent->start_seq; 2248 else 2249 lengone = 0; 2250 if (lengone < (ent->end - ent->start)) 2251 lenleft = (ent->end - ent->start) - lengone; 2252 else { 2253 /* TSNH, we should catch it at the send */ 2254 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2255 0, timeleft, lengone, HYBRID_LOG_CAPERROR, 0, ent, __LINE__); 2256 rack->r_ctl.bw_rate_cap = 0; 2257 return; 2258 } 2259 } 2260 if (lenleft == 0) { 2261 /* We have it all sent */ 2262 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2263 0, timeleft, lenleft, HYBRID_LOG_ALLSENT, 0, ent, __LINE__); 2264 if (rack->r_ctl.bw_rate_cap) 2265 goto normal_ratecap; 2266 else 2267 return; 2268 } 2269 calcbw = lenleft * HPTS_USEC_IN_SEC; 2270 calcbw /= timeleft; 2271 /* Now we must compensate for IP/TCP overhead */ 2272 calcbw = rack_compensate_for_linerate(rack, calcbw); 2273 /* Update the bit rate cap */ 2274 rack->r_ctl.bw_rate_cap = calcbw; 2275 if ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_S_MSS) && 2276 (rack_hybrid_allow_set_maxseg == 1) && 2277 ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_SETMSS) == 0)) { 2278 /* Lets set in a smaller mss possibly here to match our rate-cap */ 2279 uint32_t orig_max; 2280 2281 orig_max = rack->r_ctl.rc_pace_max_segs; 2282 rack->r_ctl.rc_last_sft->hybrid_flags |= TCP_HYBRID_PACING_SETMSS; 2283 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, calcbw, ctf_fixed_maxseg(rack->rc_tp)); 2284 rack_log_type_pacing_sizes(rack->rc_tp, rack, rack->r_ctl.client_suggested_maxseg, orig_max, __LINE__, 5); 2285 } 2286 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2287 calcbw, timeleft, lenleft, HYBRID_LOG_CAP_CALC, 0, ent, __LINE__); 2288 if ((calcbw > 0) && (*bw > calcbw)) { 2289 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2290 *bw, ent->deadline, lenleft, HYBRID_LOG_RATE_CAP, 0, ent, __LINE__); 2291 *capped = 1; 2292 *bw = calcbw; 2293 } 2294 return; 2295 } 2296 normal_ratecap: 2297 #endif 2298 if ((rack->r_ctl.bw_rate_cap > 0) && (*bw > rack->r_ctl.bw_rate_cap)) { 2299 #ifdef TCP_REQUEST_TRK 2300 if (rack->rc_hybrid_mode && 2301 rack->rc_catch_up && 2302 (rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_S_MSS) && 2303 (rack_hybrid_allow_set_maxseg == 1) && 2304 ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_SETMSS) == 0)) { 2305 /* Lets set in a smaller mss possibly here to match our rate-cap */ 2306 uint32_t orig_max; 2307 2308 orig_max = rack->r_ctl.rc_pace_max_segs; 2309 rack->r_ctl.rc_last_sft->hybrid_flags |= TCP_HYBRID_PACING_SETMSS; 2310 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, rack->r_ctl.bw_rate_cap, ctf_fixed_maxseg(rack->rc_tp)); 2311 rack_log_type_pacing_sizes(rack->rc_tp, rack, rack->r_ctl.client_suggested_maxseg, orig_max, __LINE__, 5); 2312 } 2313 #endif 2314 *capped = 1; 2315 *bw = rack->r_ctl.bw_rate_cap; 2316 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2317 *bw, 0, 0, 2318 HYBRID_LOG_RATE_CAP, 1, NULL, __LINE__); 2319 } 2320 } 2321 2322 static uint64_t 2323 rack_get_gp_est(struct tcp_rack *rack) 2324 { 2325 uint64_t bw, lt_bw, ret_bw; 2326 2327 if (rack->rc_gp_filled == 0) { 2328 /* 2329 * We have yet no b/w measurement, 2330 * if we have a user set initial bw 2331 * return it. If we don't have that and 2332 * we have an srtt, use the tcp IW (10) to 2333 * calculate a fictional b/w over the SRTT 2334 * which is more or less a guess. 
Note 2335 * we don't use our IW from rack on purpose 2336 * so if we have like IW=30, we are not 2337 * calculating a "huge" b/w. 2338 */ 2339 uint64_t srtt; 2340 2341 lt_bw = rack_get_lt_bw(rack); 2342 if (lt_bw) { 2343 /* 2344 * No goodput bw but a long-term b/w does exist 2345 * lets use that. 2346 */ 2347 ret_bw = lt_bw; 2348 goto compensate; 2349 } 2350 if (rack->r_ctl.init_rate) 2351 return (rack->r_ctl.init_rate); 2352 2353 /* Ok lets come up with the IW guess, if we have a srtt */ 2354 if (rack->rc_tp->t_srtt == 0) { 2355 /* 2356 * Go with old pacing method 2357 * i.e. burst mitigation only. 2358 */ 2359 return (0); 2360 } 2361 /* Ok lets get the initial TCP win (not racks) */ 2362 bw = tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)); 2363 srtt = (uint64_t)rack->rc_tp->t_srtt; 2364 bw *= (uint64_t)USECS_IN_SECOND; 2365 bw /= srtt; 2366 ret_bw = bw; 2367 goto compensate; 2368 2369 } 2370 if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) { 2371 /* Averaging is done, we can return the value */ 2372 bw = rack->r_ctl.gp_bw; 2373 } else { 2374 /* Still doing initial average must calculate */ 2375 bw = rack->r_ctl.gp_bw / max(rack->r_ctl.num_measurements, 1); 2376 } 2377 lt_bw = rack_get_lt_bw(rack); 2378 if (lt_bw == 0) { 2379 /* If we don't have one then equate it to the gp_bw */ 2380 lt_bw = rack->r_ctl.gp_bw; 2381 } 2382 if ((rack->r_cwnd_was_clamped == 1) && (rack->r_clamped_gets_lower > 0)){ 2383 /* if clamped take the lowest */ 2384 if (lt_bw < bw) 2385 ret_bw = lt_bw; 2386 else 2387 ret_bw = bw; 2388 } else { 2389 /* If not set for clamped to get lowest, take the highest */ 2390 if (lt_bw > bw) 2391 ret_bw = lt_bw; 2392 else 2393 ret_bw = bw; 2394 } 2395 /* 2396 * Now lets compensate based on the TCP/IP overhead. Our 2397 * Goodput estimate does not include this so we must pace out 2398 * a bit faster since our pacing calculations do. The pacing 2399 * calculations use the base ETHERNET_SEGMENT_SIZE and the segsiz 2400 * we are using to do this, so we do that here in the opposite 2401 * direction as well. This means that if we are tunneled and the 2402 * segsiz is say 1200 bytes we will get quite a boost, but its 2403 * compensated for in the pacing time the opposite way. 2404 */ 2405 compensate: 2406 ret_bw = rack_compensate_for_linerate(rack, ret_bw); 2407 return(ret_bw); 2408 } 2409 2410 2411 static uint64_t 2412 rack_get_bw(struct tcp_rack *rack) 2413 { 2414 uint64_t bw; 2415 2416 if (rack->use_fixed_rate) { 2417 /* Return the fixed pacing rate */ 2418 return (rack_get_fixed_pacing_bw(rack)); 2419 } 2420 bw = rack_get_gp_est(rack); 2421 return (bw); 2422 } 2423 2424 static uint16_t 2425 rack_get_output_gain(struct tcp_rack *rack, struct rack_sendmap *rsm) 2426 { 2427 if (rack->use_fixed_rate) { 2428 return (100); 2429 } else if (rack->in_probe_rtt && (rsm == NULL)) 2430 return (rack->r_ctl.rack_per_of_gp_probertt); 2431 else if ((IN_FASTRECOVERY(rack->rc_tp->t_flags) && 2432 rack->r_ctl.rack_per_of_gp_rec)) { 2433 if (rsm) { 2434 /* a retransmission always use the recovery rate */ 2435 return (rack->r_ctl.rack_per_of_gp_rec); 2436 } else if (rack->rack_rec_nonrxt_use_cr) { 2437 /* Directed to use the configured rate */ 2438 goto configured_rate; 2439 } else if (rack->rack_no_prr && 2440 (rack->r_ctl.rack_per_of_gp_rec > 100)) { 2441 /* No PRR, lets just use the b/w estimate only */ 2442 return (100); 2443 } else { 2444 /* 2445 * Here we may have a non-retransmit but we 2446 * have no overrides, so just use the recovery 2447 * rate (prr is in effect). 
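			 * Note: whatever percentage is returned here is applied later in
			 * rack_get_output_bw() as bw * gain / 100, so e.g. a return of 200
			 * paces at twice the estimated b/w.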
2448 */ 2449 return (rack->r_ctl.rack_per_of_gp_rec); 2450 } 2451 } 2452 configured_rate: 2453 /* For the configured rate we look at our cwnd vs the ssthresh */ 2454 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) 2455 return (rack->r_ctl.rack_per_of_gp_ss); 2456 else 2457 return (rack->r_ctl.rack_per_of_gp_ca); 2458 } 2459 2460 static void 2461 rack_log_dsack_event(struct tcp_rack *rack, uint8_t mod, uint32_t flex4, uint32_t flex5, uint32_t flex6) 2462 { 2463 /* 2464 * Types of logs (mod value) 2465 * 1 = dsack_persists reduced by 1 via T-O or fast recovery exit. 2466 * 2 = a dsack round begins, persist is reset to 16. 2467 * 3 = a dsack round ends 2468 * 4 = Dsack option increases rack rtt flex5 is the srtt input, flex6 is thresh 2469 * 5 = Socket option set changing the control flags rc_rack_tmr_std_based, rc_rack_use_dsack 2470 * 6 = Final rack rtt, flex4 is srtt and flex6 is final limited thresh. 2471 */ 2472 if (tcp_bblogging_on(rack->rc_tp)) { 2473 union tcp_log_stackspecific log; 2474 struct timeval tv; 2475 2476 memset(&log, 0, sizeof(log)); 2477 log.u_bbr.flex1 = rack->rc_rack_tmr_std_based; 2478 log.u_bbr.flex1 <<= 1; 2479 log.u_bbr.flex1 |= rack->rc_rack_use_dsack; 2480 log.u_bbr.flex1 <<= 1; 2481 log.u_bbr.flex1 |= rack->rc_dsack_round_seen; 2482 log.u_bbr.flex2 = rack->r_ctl.dsack_round_end; 2483 log.u_bbr.flex3 = rack->r_ctl.num_dsack; 2484 log.u_bbr.flex4 = flex4; 2485 log.u_bbr.flex5 = flex5; 2486 log.u_bbr.flex6 = flex6; 2487 log.u_bbr.flex7 = rack->r_ctl.dsack_persist; 2488 log.u_bbr.flex8 = mod; 2489 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2490 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2491 &rack->rc_inp->inp_socket->so_rcv, 2492 &rack->rc_inp->inp_socket->so_snd, 2493 RACK_DSACK_HANDLING, 0, 2494 0, &log, false, &tv); 2495 } 2496 } 2497 2498 static void 2499 rack_log_hdwr_pacing(struct tcp_rack *rack, 2500 uint64_t rate, uint64_t hw_rate, int line, 2501 int error, uint16_t mod) 2502 { 2503 if (tcp_bblogging_on(rack->rc_tp)) { 2504 union tcp_log_stackspecific log; 2505 struct timeval tv; 2506 const struct ifnet *ifp; 2507 2508 memset(&log, 0, sizeof(log)); 2509 log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff); 2510 log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff); 2511 if (rack->r_ctl.crte) { 2512 ifp = rack->r_ctl.crte->ptbl->rs_ifp; 2513 } else if (rack->rc_inp->inp_route.ro_nh && 2514 rack->rc_inp->inp_route.ro_nh->nh_ifp) { 2515 ifp = rack->rc_inp->inp_route.ro_nh->nh_ifp; 2516 } else 2517 ifp = NULL; 2518 if (ifp) { 2519 log.u_bbr.flex3 = (((uint64_t)ifp >> 32) & 0x00000000ffffffff); 2520 log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff); 2521 } 2522 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2523 log.u_bbr.bw_inuse = rate; 2524 log.u_bbr.flex5 = line; 2525 log.u_bbr.flex6 = error; 2526 log.u_bbr.flex7 = mod; 2527 log.u_bbr.applimited = rack->r_ctl.rc_pace_max_segs; 2528 log.u_bbr.flex8 = rack->use_fixed_rate; 2529 log.u_bbr.flex8 <<= 1; 2530 log.u_bbr.flex8 |= rack->rack_hdrw_pacing; 2531 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; 2532 log.u_bbr.delRate = rack->r_ctl.crte_prev_rate; 2533 if (rack->r_ctl.crte) 2534 log.u_bbr.cur_del_rate = rack->r_ctl.crte->rate; 2535 else 2536 log.u_bbr.cur_del_rate = 0; 2537 log.u_bbr.rttProp = rack->r_ctl.last_hw_bw_req; 2538 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2539 &rack->rc_inp->inp_socket->so_rcv, 2540 &rack->rc_inp->inp_socket->so_snd, 2541 BBR_LOG_HDWR_PACE, 0, 2542 0, &log, false, &tv); 2543 } 2544 } 2545 2546 static uint64_t 2547 rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap 
*rsm, int *capped) 2548 { 2549 /* 2550 * We allow rack_per_of_gp_xx to dictate our bw rate we want. 2551 */ 2552 uint64_t bw_est, high_rate; 2553 uint64_t gain; 2554 2555 if ((rack->r_pacing_discount == 0) || 2556 (rack_full_buffer_discount == 0)) { 2557 /* 2558 * No buffer level based discount from client buffer 2559 * level is enabled or the feature is disabled. 2560 */ 2561 gain = (uint64_t)rack_get_output_gain(rack, rsm); 2562 bw_est = bw * gain; 2563 bw_est /= (uint64_t)100; 2564 } else { 2565 /* 2566 * We have a discount in place apply it with 2567 * just a 100% gain (we get no boost if the buffer 2568 * is full). 2569 */ 2570 uint64_t discount; 2571 2572 discount = bw * (uint64_t)(rack_full_buffer_discount * rack->r_ctl.pacing_discount_amm); 2573 discount /= 100; 2574 /* What %% of the b/w do we discount */ 2575 bw_est = bw - discount; 2576 } 2577 /* Never fall below the minimum (def 64kbps) */ 2578 if (bw_est < RACK_MIN_BW) 2579 bw_est = RACK_MIN_BW; 2580 if (rack->r_rack_hw_rate_caps) { 2581 /* Rate caps are in place */ 2582 if (rack->r_ctl.crte != NULL) { 2583 /* We have a hdwr rate already */ 2584 high_rate = tcp_hw_highest_rate(rack->r_ctl.crte); 2585 if (bw_est >= high_rate) { 2586 /* We are capping bw at the highest rate table entry */ 2587 if (rack_hw_rate_cap_per && 2588 (((high_rate * (100 + rack_hw_rate_cap_per)) / 100) < bw_est)) { 2589 rack->r_rack_hw_rate_caps = 0; 2590 goto done; 2591 } 2592 rack_log_hdwr_pacing(rack, 2593 bw_est, high_rate, __LINE__, 2594 0, 3); 2595 bw_est = high_rate; 2596 if (capped) 2597 *capped = 1; 2598 } 2599 } else if ((rack->rack_hdrw_pacing == 0) && 2600 (rack->rack_hdw_pace_ena) && 2601 (rack->rack_attempt_hdwr_pace == 0) && 2602 (rack->rc_inp->inp_route.ro_nh != NULL) && 2603 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 2604 /* 2605 * Special case, we have not yet attempted hardware 2606 * pacing, and yet we may, when we do, find out if we are 2607 * above the highest rate. We need to know the maxbw for the interface 2608 * in question (if it supports ratelimiting). We get back 2609 * a 0, if the interface is not found in the RL lists. 2610 */ 2611 high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp); 2612 if (high_rate) { 2613 /* Yep, we have a rate is it above this rate? */ 2614 if (bw_est > high_rate) { 2615 bw_est = high_rate; 2616 if (capped) 2617 *capped = 1; 2618 } 2619 } 2620 } 2621 } 2622 done: 2623 return (bw_est); 2624 } 2625 2626 static void 2627 rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod) 2628 { 2629 if (tcp_bblogging_on(rack->rc_tp)) { 2630 union tcp_log_stackspecific log; 2631 struct timeval tv; 2632 2633 if (rack->sack_attack_disable > 0) 2634 goto log_anyway; 2635 if ((mod != 1) && (rack_verbose_logging == 0)) { 2636 /* 2637 * We get 3 values currently for mod 2638 * 1 - We are retransmitting and this tells the reason. 2639 * 2 - We are clearing a dup-ack count. 2640 * 3 - We are incrementing a dup-ack count. 2641 * 2642 * The clear/increment are only logged 2643 * if you have BBverbose on. 
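		 * (The mod value itself is stored in flex8 below, so the reason can
		 * still be recovered from the BB log record.)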
2644 */ 2645 return; 2646 } 2647 log_anyway: 2648 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2649 log.u_bbr.flex1 = tsused; 2650 log.u_bbr.flex2 = thresh; 2651 log.u_bbr.flex3 = rsm->r_flags; 2652 log.u_bbr.flex4 = rsm->r_dupack; 2653 log.u_bbr.flex5 = rsm->r_start; 2654 log.u_bbr.flex6 = rsm->r_end; 2655 log.u_bbr.flex8 = mod; 2656 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2657 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2658 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2659 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2660 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2661 log.u_bbr.pacing_gain = rack->r_must_retran; 2662 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2663 &rack->rc_inp->inp_socket->so_rcv, 2664 &rack->rc_inp->inp_socket->so_snd, 2665 BBR_LOG_SETTINGS_CHG, 0, 2666 0, &log, false, &tv); 2667 } 2668 } 2669 2670 static void 2671 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) 2672 { 2673 if (tcp_bblogging_on(rack->rc_tp)) { 2674 union tcp_log_stackspecific log; 2675 struct timeval tv; 2676 2677 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2678 log.u_bbr.flex1 = rack->rc_tp->t_srtt; 2679 log.u_bbr.flex2 = to; 2680 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; 2681 log.u_bbr.flex4 = slot; 2682 log.u_bbr.flex5 = rack->rc_tp->t_hpts_slot; 2683 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 2684 log.u_bbr.flex7 = rack->rc_in_persist; 2685 log.u_bbr.flex8 = which; 2686 if (rack->rack_no_prr) 2687 log.u_bbr.pkts_out = 0; 2688 else 2689 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 2690 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2691 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2692 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2693 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2694 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2695 log.u_bbr.pacing_gain = rack->r_must_retran; 2696 log.u_bbr.cwnd_gain = rack->rack_deferred_inited; 2697 log.u_bbr.pkt_epoch = rack->rc_has_collapsed; 2698 log.u_bbr.lt_epoch = rack->rc_tp->t_rxtshift; 2699 log.u_bbr.lost = rack_rto_min; 2700 log.u_bbr.epoch = rack->r_ctl.roundends; 2701 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2702 &rack->rc_inp->inp_socket->so_rcv, 2703 &rack->rc_inp->inp_socket->so_snd, 2704 BBR_LOG_TIMERSTAR, 0, 2705 0, &log, false, &tv); 2706 } 2707 } 2708 2709 static void 2710 rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm) 2711 { 2712 if (tcp_bblogging_on(rack->rc_tp)) { 2713 union tcp_log_stackspecific log; 2714 struct timeval tv; 2715 2716 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2717 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2718 log.u_bbr.flex8 = to_num; 2719 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt; 2720 log.u_bbr.flex2 = rack->rc_rack_rtt; 2721 if (rsm == NULL) 2722 log.u_bbr.flex3 = 0; 2723 else 2724 log.u_bbr.flex3 = rsm->r_end - rsm->r_start; 2725 if (rack->rack_no_prr) 2726 log.u_bbr.flex5 = 0; 2727 else 2728 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 2729 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2730 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2731 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2732 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2733 log.u_bbr.pacing_gain = rack->r_must_retran; 2734 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2735 &rack->rc_inp->inp_socket->so_rcv, 2736 &rack->rc_inp->inp_socket->so_snd, 2737 BBR_LOG_RTO, 0, 2738 0, &log, false, &tv); 2739 } 2740 } 2741 2742 static void 2743 rack_log_map_chg(struct tcpcb *tp, struct 
tcp_rack *rack, 2744 struct rack_sendmap *prev, 2745 struct rack_sendmap *rsm, 2746 struct rack_sendmap *next, 2747 int flag, uint32_t th_ack, int line) 2748 { 2749 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 2750 union tcp_log_stackspecific log; 2751 struct timeval tv; 2752 2753 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2754 log.u_bbr.flex8 = flag; 2755 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2756 log.u_bbr.cur_del_rate = (uint64_t)prev; 2757 log.u_bbr.delRate = (uint64_t)rsm; 2758 log.u_bbr.rttProp = (uint64_t)next; 2759 log.u_bbr.flex7 = 0; 2760 if (prev) { 2761 log.u_bbr.flex1 = prev->r_start; 2762 log.u_bbr.flex2 = prev->r_end; 2763 log.u_bbr.flex7 |= 0x4; 2764 } 2765 if (rsm) { 2766 log.u_bbr.flex3 = rsm->r_start; 2767 log.u_bbr.flex4 = rsm->r_end; 2768 log.u_bbr.flex7 |= 0x2; 2769 } 2770 if (next) { 2771 log.u_bbr.flex5 = next->r_start; 2772 log.u_bbr.flex6 = next->r_end; 2773 log.u_bbr.flex7 |= 0x1; 2774 } 2775 log.u_bbr.applimited = line; 2776 log.u_bbr.pkts_out = th_ack; 2777 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2778 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2779 if (rack->rack_no_prr) 2780 log.u_bbr.lost = 0; 2781 else 2782 log.u_bbr.lost = rack->r_ctl.rc_prr_sndcnt; 2783 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2784 &rack->rc_inp->inp_socket->so_rcv, 2785 &rack->rc_inp->inp_socket->so_snd, 2786 TCP_LOG_MAPCHG, 0, 2787 0, &log, false, &tv); 2788 } 2789 } 2790 2791 static void 2792 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t len, 2793 struct rack_sendmap *rsm, int conf) 2794 { 2795 if (tcp_bblogging_on(tp)) { 2796 union tcp_log_stackspecific log; 2797 struct timeval tv; 2798 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2799 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2800 log.u_bbr.flex1 = t; 2801 log.u_bbr.flex2 = len; 2802 log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt; 2803 log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest; 2804 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest; 2805 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_us_rtrcnt; 2806 log.u_bbr.flex7 = conf; 2807 log.u_bbr.rttProp = (uint64_t)rack->r_ctl.rack_rs.rs_rtt_tot; 2808 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method; 2809 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2810 log.u_bbr.delivered = rack->r_ctl.rack_rs.rs_us_rtrcnt; 2811 log.u_bbr.pkts_out = rack->r_ctl.rack_rs.rs_flags; 2812 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2813 if (rsm) { 2814 log.u_bbr.pkt_epoch = rsm->r_start; 2815 log.u_bbr.lost = rsm->r_end; 2816 log.u_bbr.cwnd_gain = rsm->r_rtr_cnt; 2817 /* We loose any upper of the 24 bits */ 2818 log.u_bbr.pacing_gain = (uint16_t)rsm->r_flags; 2819 } else { 2820 /* Its a SYN */ 2821 log.u_bbr.pkt_epoch = rack->rc_tp->iss; 2822 log.u_bbr.lost = 0; 2823 log.u_bbr.cwnd_gain = 0; 2824 log.u_bbr.pacing_gain = 0; 2825 } 2826 /* Write out general bits of interest rrs here */ 2827 log.u_bbr.use_lt_bw = rack->rc_highly_buffered; 2828 log.u_bbr.use_lt_bw <<= 1; 2829 log.u_bbr.use_lt_bw |= rack->forced_ack; 2830 log.u_bbr.use_lt_bw <<= 1; 2831 log.u_bbr.use_lt_bw |= rack->rc_gp_dyn_mul; 2832 log.u_bbr.use_lt_bw <<= 1; 2833 log.u_bbr.use_lt_bw |= rack->in_probe_rtt; 2834 log.u_bbr.use_lt_bw <<= 1; 2835 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; 2836 log.u_bbr.use_lt_bw <<= 1; 2837 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set; 2838 log.u_bbr.use_lt_bw <<= 1; 2839 log.u_bbr.use_lt_bw |= rack->rc_gp_filled; 2840 log.u_bbr.use_lt_bw <<= 1; 2841 log.u_bbr.use_lt_bw |= 
rack->rc_dragged_bottom; 2842 log.u_bbr.applimited = rack->r_ctl.rc_target_probertt_flight; 2843 log.u_bbr.epoch = rack->r_ctl.rc_time_probertt_starts; 2844 log.u_bbr.lt_epoch = rack->r_ctl.rc_time_probertt_entered; 2845 log.u_bbr.cur_del_rate = rack->r_ctl.rc_lower_rtt_us_cts; 2846 log.u_bbr.delRate = rack->r_ctl.rc_gp_srtt; 2847 log.u_bbr.bw_inuse = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 2848 log.u_bbr.bw_inuse <<= 32; 2849 if (rsm) 2850 log.u_bbr.bw_inuse |= ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]); 2851 TCP_LOG_EVENTP(tp, NULL, 2852 &rack->rc_inp->inp_socket->so_rcv, 2853 &rack->rc_inp->inp_socket->so_snd, 2854 BBR_LOG_BBRRTT, 0, 2855 0, &log, false, &tv); 2856 2857 2858 } 2859 } 2860 2861 static void 2862 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt) 2863 { 2864 /* 2865 * Log the rtt sample we are 2866 * applying to the srtt algorithm in 2867 * useconds. 2868 */ 2869 if (tcp_bblogging_on(rack->rc_tp)) { 2870 union tcp_log_stackspecific log; 2871 struct timeval tv; 2872 2873 /* Convert our ms to a microsecond */ 2874 memset(&log, 0, sizeof(log)); 2875 log.u_bbr.flex1 = rtt; 2876 log.u_bbr.flex2 = rack->r_ctl.ack_count; 2877 log.u_bbr.flex3 = rack->r_ctl.sack_count; 2878 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; 2879 log.u_bbr.flex5 = rack->r_ctl.sack_moved_extra; 2880 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 2881 log.u_bbr.flex7 = 1; 2882 log.u_bbr.flex8 = rack->sack_attack_disable; 2883 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2884 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2885 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2886 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2887 log.u_bbr.pacing_gain = rack->r_must_retran; 2888 /* 2889 * We capture in delRate the upper 32 bits as 2890 * the confidence level we had declared, and the 2891 * lower 32 bits as the actual RTT using the arrival 2892 * timestamp. 
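		 * For example, a confidence of 1 with a measured us rtt of 30000 is
		 * stored as ((uint64_t)1 << 32) | 30000.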
2893 */ 2894 log.u_bbr.delRate = rack->r_ctl.rack_rs.confidence; 2895 log.u_bbr.delRate <<= 32; 2896 log.u_bbr.delRate |= rack->r_ctl.rack_rs.rs_us_rtt; 2897 /* Lets capture all the things that make up t_rtxcur */ 2898 log.u_bbr.applimited = rack_rto_min; 2899 log.u_bbr.epoch = rack_rto_max; 2900 log.u_bbr.lt_epoch = rack->r_ctl.timer_slop; 2901 log.u_bbr.lost = rack_rto_min; 2902 log.u_bbr.pkt_epoch = TICKS_2_USEC(tcp_rexmit_slop); 2903 log.u_bbr.rttProp = RACK_REXMTVAL(rack->rc_tp); 2904 log.u_bbr.bw_inuse = rack->r_ctl.act_rcv_time.tv_sec; 2905 log.u_bbr.bw_inuse *= HPTS_USEC_IN_SEC; 2906 log.u_bbr.bw_inuse += rack->r_ctl.act_rcv_time.tv_usec; 2907 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2908 &rack->rc_inp->inp_socket->so_rcv, 2909 &rack->rc_inp->inp_socket->so_snd, 2910 TCP_LOG_RTT, 0, 2911 0, &log, false, &tv); 2912 } 2913 } 2914 2915 static void 2916 rack_log_rtt_sample_calc(struct tcp_rack *rack, uint32_t rtt, uint32_t send_time, uint32_t ack_time, int where) 2917 { 2918 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 2919 union tcp_log_stackspecific log; 2920 struct timeval tv; 2921 2922 /* Convert our ms to a microsecond */ 2923 memset(&log, 0, sizeof(log)); 2924 log.u_bbr.flex1 = rtt; 2925 log.u_bbr.flex2 = send_time; 2926 log.u_bbr.flex3 = ack_time; 2927 log.u_bbr.flex4 = where; 2928 log.u_bbr.flex7 = 2; 2929 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2930 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2931 &rack->rc_inp->inp_socket->so_rcv, 2932 &rack->rc_inp->inp_socket->so_snd, 2933 TCP_LOG_RTT, 0, 2934 0, &log, false, &tv); 2935 } 2936 } 2937 2938 2939 static void 2940 rack_log_rtt_sendmap(struct tcp_rack *rack, uint32_t idx, uint64_t tsv, uint32_t tsecho) 2941 { 2942 if (tcp_bblogging_on(rack->rc_tp)) { 2943 union tcp_log_stackspecific log; 2944 struct timeval tv; 2945 2946 /* Convert our ms to a microsecond */ 2947 memset(&log, 0, sizeof(log)); 2948 log.u_bbr.flex1 = idx; 2949 log.u_bbr.flex2 = rack_ts_to_msec(tsv); 2950 log.u_bbr.flex3 = tsecho; 2951 log.u_bbr.flex7 = 3; 2952 log.u_bbr.rttProp = tsv; 2953 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2954 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2955 &rack->rc_inp->inp_socket->so_rcv, 2956 &rack->rc_inp->inp_socket->so_snd, 2957 TCP_LOG_RTT, 0, 2958 0, &log, false, &tv); 2959 } 2960 } 2961 2962 2963 static inline void 2964 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line) 2965 { 2966 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 2967 union tcp_log_stackspecific log; 2968 struct timeval tv; 2969 2970 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2971 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2972 log.u_bbr.flex1 = line; 2973 log.u_bbr.flex2 = tick; 2974 log.u_bbr.flex3 = tp->t_maxunacktime; 2975 log.u_bbr.flex4 = tp->t_acktime; 2976 log.u_bbr.flex8 = event; 2977 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2978 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2979 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2980 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2981 log.u_bbr.pacing_gain = rack->r_must_retran; 2982 TCP_LOG_EVENTP(tp, NULL, 2983 &rack->rc_inp->inp_socket->so_rcv, 2984 &rack->rc_inp->inp_socket->so_snd, 2985 BBR_LOG_PROGRESS, 0, 2986 0, &log, false, &tv); 2987 } 2988 } 2989 2990 static void 2991 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv, int line) 2992 { 2993 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 2994 union tcp_log_stackspecific log; 2995 2996 
memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2997 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2998 log.u_bbr.flex1 = slot; 2999 if (rack->rack_no_prr) 3000 log.u_bbr.flex2 = 0; 3001 else 3002 log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt; 3003 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 3004 log.u_bbr.flex5 = rack->r_ctl.ack_during_sd; 3005 log.u_bbr.flex6 = line; 3006 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags); 3007 log.u_bbr.flex8 = rack->rc_in_persist; 3008 log.u_bbr.timeStamp = cts; 3009 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3010 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 3011 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 3012 log.u_bbr.pacing_gain = rack->r_must_retran; 3013 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3014 &rack->rc_inp->inp_socket->so_rcv, 3015 &rack->rc_inp->inp_socket->so_snd, 3016 BBR_LOG_BBRSND, 0, 3017 0, &log, false, tv); 3018 } 3019 } 3020 3021 static void 3022 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out, int nsegs) 3023 { 3024 if (tcp_bblogging_on(rack->rc_tp)) { 3025 union tcp_log_stackspecific log; 3026 struct timeval tv; 3027 3028 memset(&log, 0, sizeof(log)); 3029 log.u_bbr.flex1 = did_out; 3030 log.u_bbr.flex2 = nxt_pkt; 3031 log.u_bbr.flex3 = way_out; 3032 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 3033 if (rack->rack_no_prr) 3034 log.u_bbr.flex5 = 0; 3035 else 3036 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 3037 log.u_bbr.flex6 = nsegs; 3038 log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs; 3039 log.u_bbr.flex7 = rack->rc_ack_can_sendout_data; /* Do we have ack-can-send set */ 3040 log.u_bbr.flex7 <<= 1; 3041 log.u_bbr.flex7 |= rack->r_fast_output; /* is fast output primed */ 3042 log.u_bbr.flex7 <<= 1; 3043 log.u_bbr.flex7 |= rack->r_wanted_output; /* Do we want output */ 3044 log.u_bbr.flex8 = rack->rc_in_persist; 3045 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 3046 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3047 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3048 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 3049 log.u_bbr.use_lt_bw <<= 1; 3050 log.u_bbr.use_lt_bw |= rack->r_might_revert; 3051 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 3052 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 3053 log.u_bbr.pacing_gain = rack->r_must_retran; 3054 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3055 &rack->rc_inp->inp_socket->so_rcv, 3056 &rack->rc_inp->inp_socket->so_snd, 3057 BBR_LOG_DOSEG_DONE, 0, 3058 0, &log, false, &tv); 3059 } 3060 } 3061 3062 static void 3063 rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint8_t frm) 3064 { 3065 if (tcp_bblogging_on(rack->rc_tp)) { 3066 union tcp_log_stackspecific log; 3067 struct timeval tv; 3068 3069 memset(&log, 0, sizeof(log)); 3070 log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs; 3071 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 3072 log.u_bbr.flex4 = arg1; 3073 log.u_bbr.flex5 = arg2; 3074 log.u_bbr.flex7 = rack->r_ctl.rc_user_set_min_segs; 3075 log.u_bbr.flex6 = arg3; 3076 log.u_bbr.flex8 = frm; 3077 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3078 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3079 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 3080 log.u_bbr.applimited = rack->r_ctl.rc_sacked; 3081 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 3082 log.u_bbr.pacing_gain = rack->r_must_retran; 3083 TCP_LOG_EVENTP(tp, NULL, &tptosocket(tp)->so_rcv, 3084 
&tptosocket(tp)->so_snd, 3085 TCP_HDWR_PACE_SIZE, 0, 0, &log, false, &tv); 3086 } 3087 } 3088 3089 static void 3090 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, 3091 uint8_t hpts_calling, int reason, uint32_t cwnd_to_use) 3092 { 3093 if (tcp_bblogging_on(rack->rc_tp)) { 3094 union tcp_log_stackspecific log; 3095 struct timeval tv; 3096 3097 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 3098 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 3099 log.u_bbr.flex1 = slot; 3100 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; 3101 log.u_bbr.flex4 = reason; 3102 if (rack->rack_no_prr) 3103 log.u_bbr.flex5 = 0; 3104 else 3105 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 3106 log.u_bbr.flex7 = hpts_calling; 3107 log.u_bbr.flex8 = rack->rc_in_persist; 3108 log.u_bbr.lt_epoch = cwnd_to_use; 3109 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3110 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3111 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 3112 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 3113 log.u_bbr.pacing_gain = rack->r_must_retran; 3114 log.u_bbr.cwnd_gain = rack->rc_has_collapsed; 3115 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3116 &rack->rc_inp->inp_socket->so_rcv, 3117 &rack->rc_inp->inp_socket->so_snd, 3118 BBR_LOG_JUSTRET, 0, 3119 tlen, &log, false, &tv); 3120 } 3121 } 3122 3123 static void 3124 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32_t us_cts, 3125 struct timeval *tv, uint32_t flags_on_entry) 3126 { 3127 if (tcp_bblogging_on(rack->rc_tp)) { 3128 union tcp_log_stackspecific log; 3129 3130 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 3131 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 3132 log.u_bbr.flex1 = line; 3133 log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to; 3134 log.u_bbr.flex3 = flags_on_entry; 3135 log.u_bbr.flex4 = us_cts; 3136 if (rack->rack_no_prr) 3137 log.u_bbr.flex5 = 0; 3138 else 3139 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 3140 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 3141 log.u_bbr.flex7 = hpts_removed; 3142 log.u_bbr.flex8 = 1; 3143 log.u_bbr.applimited = rack->r_ctl.rc_hpts_flags; 3144 log.u_bbr.timeStamp = us_cts; 3145 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3146 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 3147 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 3148 log.u_bbr.pacing_gain = rack->r_must_retran; 3149 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3150 &rack->rc_inp->inp_socket->so_rcv, 3151 &rack->rc_inp->inp_socket->so_snd, 3152 BBR_LOG_TIMERCANC, 0, 3153 0, &log, false, tv); 3154 } 3155 } 3156 3157 static void 3158 rack_log_alt_to_to_cancel(struct tcp_rack *rack, 3159 uint32_t flex1, uint32_t flex2, 3160 uint32_t flex3, uint32_t flex4, 3161 uint32_t flex5, uint32_t flex6, 3162 uint16_t flex7, uint8_t mod) 3163 { 3164 if (tcp_bblogging_on(rack->rc_tp)) { 3165 union tcp_log_stackspecific log; 3166 struct timeval tv; 3167 3168 if (mod == 1) { 3169 /* No you can't use 1, its for the real to cancel */ 3170 return; 3171 } 3172 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 3173 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3174 log.u_bbr.flex1 = flex1; 3175 log.u_bbr.flex2 = flex2; 3176 log.u_bbr.flex3 = flex3; 3177 log.u_bbr.flex4 = flex4; 3178 log.u_bbr.flex5 = flex5; 3179 log.u_bbr.flex6 = flex6; 3180 log.u_bbr.flex7 = flex7; 3181 log.u_bbr.flex8 = mod; 3182 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3183 &rack->rc_inp->inp_socket->so_rcv, 3184 &rack->rc_inp->inp_socket->so_snd, 3185 BBR_LOG_TIMERCANC, 0, 3186 0, &log, false, &tv); 3187 } 3188 
} 3189 3190 static void 3191 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers) 3192 { 3193 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 3194 union tcp_log_stackspecific log; 3195 struct timeval tv; 3196 3197 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 3198 log.u_bbr.flex1 = timers; 3199 log.u_bbr.flex2 = ret; 3200 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp; 3201 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 3202 log.u_bbr.flex5 = cts; 3203 if (rack->rack_no_prr) 3204 log.u_bbr.flex6 = 0; 3205 else 3206 log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt; 3207 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 3208 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 3209 log.u_bbr.pacing_gain = rack->r_must_retran; 3210 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3211 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3212 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3213 &rack->rc_inp->inp_socket->so_rcv, 3214 &rack->rc_inp->inp_socket->so_snd, 3215 BBR_LOG_TO_PROCESS, 0, 3216 0, &log, false, &tv); 3217 } 3218 } 3219 3220 static void 3221 rack_log_to_prr(struct tcp_rack *rack, int frm, int orig_cwnd, int line) 3222 { 3223 if (tcp_bblogging_on(rack->rc_tp)) { 3224 union tcp_log_stackspecific log; 3225 struct timeval tv; 3226 3227 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 3228 log.u_bbr.flex1 = rack->r_ctl.rc_prr_out; 3229 log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs; 3230 if (rack->rack_no_prr) 3231 log.u_bbr.flex3 = 0; 3232 else 3233 log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt; 3234 log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered; 3235 log.u_bbr.flex5 = rack->r_ctl.rc_sacked; 3236 log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt; 3237 log.u_bbr.flex7 = line; 3238 log.u_bbr.flex8 = frm; 3239 log.u_bbr.pkts_out = orig_cwnd; 3240 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3241 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3242 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 3243 log.u_bbr.use_lt_bw <<= 1; 3244 log.u_bbr.use_lt_bw |= rack->r_might_revert; 3245 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3246 &rack->rc_inp->inp_socket->so_rcv, 3247 &rack->rc_inp->inp_socket->so_snd, 3248 BBR_LOG_BBRUPD, 0, 3249 0, &log, false, &tv); 3250 } 3251 } 3252 3253 #ifdef TCP_SAD_DETECTION 3254 static void 3255 rack_log_sad(struct tcp_rack *rack, int event) 3256 { 3257 if (tcp_bblogging_on(rack->rc_tp)) { 3258 union tcp_log_stackspecific log; 3259 struct timeval tv; 3260 3261 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 3262 log.u_bbr.flex1 = rack->r_ctl.sack_count; 3263 log.u_bbr.flex2 = rack->r_ctl.ack_count; 3264 log.u_bbr.flex3 = rack->r_ctl.sack_moved_extra; 3265 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; 3266 log.u_bbr.flex5 = rack->r_ctl.rc_num_maps_alloced; 3267 log.u_bbr.flex6 = tcp_sack_to_ack_thresh; 3268 log.u_bbr.pkts_out = tcp_sack_to_move_thresh; 3269 log.u_bbr.lt_epoch = (tcp_force_detection << 8); 3270 log.u_bbr.lt_epoch |= rack->do_detection; 3271 log.u_bbr.applimited = tcp_map_minimum; 3272 log.u_bbr.flex7 = rack->sack_attack_disable; 3273 log.u_bbr.flex8 = event; 3274 log.u_bbr.bbr_state = rack->rc_suspicious; 3275 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3276 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3277 log.u_bbr.delivered = tcp_sad_decay_val; 3278 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3279 &rack->rc_inp->inp_socket->so_rcv, 3280 &rack->rc_inp->inp_socket->so_snd, 3281 TCP_SAD_DETECT, 0, 3282 0, &log, false, &tv); 3283 } 3284 } 3285 #endif 3286 3287 static void 3288 
rack_counter_destroy(void) 3289 { 3290 counter_u64_free(rack_total_bytes); 3291 counter_u64_free(rack_fto_send); 3292 counter_u64_free(rack_fto_rsm_send); 3293 counter_u64_free(rack_nfto_resend); 3294 counter_u64_free(rack_hw_pace_init_fail); 3295 counter_u64_free(rack_hw_pace_lost); 3296 counter_u64_free(rack_non_fto_send); 3297 counter_u64_free(rack_extended_rfo); 3298 counter_u64_free(rack_ack_total); 3299 counter_u64_free(rack_express_sack); 3300 counter_u64_free(rack_sack_total); 3301 counter_u64_free(rack_move_none); 3302 counter_u64_free(rack_move_some); 3303 counter_u64_free(rack_sack_attacks_detected); 3304 counter_u64_free(rack_sack_attacks_reversed); 3305 counter_u64_free(rack_sack_attacks_suspect); 3306 counter_u64_free(rack_sack_used_next_merge); 3307 counter_u64_free(rack_sack_used_prev_merge); 3308 counter_u64_free(rack_tlp_tot); 3309 counter_u64_free(rack_tlp_newdata); 3310 counter_u64_free(rack_tlp_retran); 3311 counter_u64_free(rack_tlp_retran_bytes); 3312 counter_u64_free(rack_to_tot); 3313 counter_u64_free(rack_saw_enobuf); 3314 counter_u64_free(rack_saw_enobuf_hw); 3315 counter_u64_free(rack_saw_enetunreach); 3316 counter_u64_free(rack_hot_alloc); 3317 counter_u64_free(rack_to_alloc); 3318 counter_u64_free(rack_to_alloc_hard); 3319 counter_u64_free(rack_to_alloc_emerg); 3320 counter_u64_free(rack_to_alloc_limited); 3321 counter_u64_free(rack_alloc_limited_conns); 3322 counter_u64_free(rack_split_limited); 3323 counter_u64_free(rack_multi_single_eq); 3324 counter_u64_free(rack_rxt_clamps_cwnd); 3325 counter_u64_free(rack_rxt_clamps_cwnd_uniq); 3326 counter_u64_free(rack_proc_non_comp_ack); 3327 counter_u64_free(rack_sack_proc_all); 3328 counter_u64_free(rack_sack_proc_restart); 3329 counter_u64_free(rack_sack_proc_short); 3330 counter_u64_free(rack_sack_skipped_acked); 3331 counter_u64_free(rack_sack_splits); 3332 counter_u64_free(rack_input_idle_reduces); 3333 counter_u64_free(rack_collapsed_win); 3334 counter_u64_free(rack_collapsed_win_rxt); 3335 counter_u64_free(rack_collapsed_win_rxt_bytes); 3336 counter_u64_free(rack_collapsed_win_seen); 3337 counter_u64_free(rack_try_scwnd); 3338 counter_u64_free(rack_persists_sends); 3339 counter_u64_free(rack_persists_acks); 3340 counter_u64_free(rack_persists_loss); 3341 counter_u64_free(rack_persists_lost_ends); 3342 #ifdef INVARIANTS 3343 counter_u64_free(rack_adjust_map_bw); 3344 #endif 3345 COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE); 3346 COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE); 3347 } 3348 3349 static struct rack_sendmap * 3350 rack_alloc(struct tcp_rack *rack) 3351 { 3352 struct rack_sendmap *rsm; 3353 3354 /* 3355 * First get the top of the list it in 3356 * theory is the "hottest" rsm we have, 3357 * possibly just freed by ack processing. 3358 */ 3359 if (rack->rc_free_cnt > rack_free_cache) { 3360 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 3361 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 3362 counter_u64_add(rack_hot_alloc, 1); 3363 rack->rc_free_cnt--; 3364 return (rsm); 3365 } 3366 /* 3367 * Once we get under our free cache we probably 3368 * no longer have a "hot" one available. Lets 3369 * get one from UMA. 3370 */ 3371 rsm = uma_zalloc(rack_zone, M_NOWAIT); 3372 if (rsm) { 3373 rack->r_ctl.rc_num_maps_alloced++; 3374 counter_u64_add(rack_to_alloc, 1); 3375 return (rsm); 3376 } 3377 /* 3378 * Dig in to our aux rsm's (the last two) since 3379 * UMA failed to get us one. 
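 * (This is also why the fast path above only hands entries out
 * while rc_free_cnt is above rack_free_cache; the few entries we
 * keep in reserve are what this emergency path falls back on.)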
3380 */ 3381 if (rack->rc_free_cnt) { 3382 counter_u64_add(rack_to_alloc_emerg, 1); 3383 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 3384 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 3385 rack->rc_free_cnt--; 3386 return (rsm); 3387 } 3388 return (NULL); 3389 } 3390 3391 static struct rack_sendmap * 3392 rack_alloc_full_limit(struct tcp_rack *rack) 3393 { 3394 if ((V_tcp_map_entries_limit > 0) && 3395 (rack->do_detection == 0) && 3396 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 3397 counter_u64_add(rack_to_alloc_limited, 1); 3398 if (!rack->alloc_limit_reported) { 3399 rack->alloc_limit_reported = 1; 3400 counter_u64_add(rack_alloc_limited_conns, 1); 3401 } 3402 return (NULL); 3403 } 3404 return (rack_alloc(rack)); 3405 } 3406 3407 /* wrapper to allocate a sendmap entry, subject to a specific limit */ 3408 static struct rack_sendmap * 3409 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type) 3410 { 3411 struct rack_sendmap *rsm; 3412 3413 if (limit_type) { 3414 /* currently there is only one limit type */ 3415 if (rack->r_ctl.rc_split_limit > 0 && 3416 (rack->do_detection == 0) && 3417 rack->r_ctl.rc_num_split_allocs >= rack->r_ctl.rc_split_limit) { 3418 counter_u64_add(rack_split_limited, 1); 3419 if (!rack->alloc_limit_reported) { 3420 rack->alloc_limit_reported = 1; 3421 counter_u64_add(rack_alloc_limited_conns, 1); 3422 } 3423 return (NULL); 3424 #ifdef TCP_SAD_DETECTION 3425 } else if ((tcp_sad_limit != 0) && 3426 (rack->do_detection == 1) && 3427 (rack->r_ctl.rc_num_split_allocs >= tcp_sad_limit)) { 3428 counter_u64_add(rack_split_limited, 1); 3429 if (!rack->alloc_limit_reported) { 3430 rack->alloc_limit_reported = 1; 3431 counter_u64_add(rack_alloc_limited_conns, 1); 3432 } 3433 return (NULL); 3434 #endif 3435 } 3436 } 3437 3438 /* allocate and mark in the limit type, if set */ 3439 rsm = rack_alloc(rack); 3440 if (rsm != NULL && limit_type) { 3441 rsm->r_limit_type = limit_type; 3442 rack->r_ctl.rc_num_split_allocs++; 3443 } 3444 return (rsm); 3445 } 3446 3447 static void 3448 rack_free_trim(struct tcp_rack *rack) 3449 { 3450 struct rack_sendmap *rsm; 3451 3452 /* 3453 * Free up all the tail entries until 3454 * we get our list down to the limit. 
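 * (Each trimmed entry comes off the cold tail of rc_free, is
 * returned to the UMA zone and is no longer counted in
 * rc_num_maps_alloced; we stop once only rack_free_cache entries
 * remain cached.)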
3455 */ 3456 while (rack->rc_free_cnt > rack_free_cache) { 3457 rsm = TAILQ_LAST(&rack->r_ctl.rc_free, rack_head); 3458 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 3459 rack->rc_free_cnt--; 3460 rack->r_ctl.rc_num_maps_alloced--; 3461 uma_zfree(rack_zone, rsm); 3462 } 3463 } 3464 3465 static void 3466 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) 3467 { 3468 if (rsm->r_flags & RACK_APP_LIMITED) { 3469 if (rack->r_ctl.rc_app_limited_cnt > 0) { 3470 rack->r_ctl.rc_app_limited_cnt--; 3471 } 3472 } 3473 if (rsm->r_limit_type) { 3474 /* currently there is only one limit type */ 3475 rack->r_ctl.rc_num_split_allocs--; 3476 } 3477 if (rsm == rack->r_ctl.rc_first_appl) { 3478 if (rack->r_ctl.rc_app_limited_cnt == 0) 3479 rack->r_ctl.rc_first_appl = NULL; 3480 else 3481 rack->r_ctl.rc_first_appl = tqhash_find(rack->r_ctl.tqh, rsm->r_nseq_appl); 3482 } 3483 if (rsm == rack->r_ctl.rc_resend) 3484 rack->r_ctl.rc_resend = NULL; 3485 if (rsm == rack->r_ctl.rc_end_appl) 3486 rack->r_ctl.rc_end_appl = NULL; 3487 if (rack->r_ctl.rc_tlpsend == rsm) 3488 rack->r_ctl.rc_tlpsend = NULL; 3489 if (rack->r_ctl.rc_sacklast == rsm) 3490 rack->r_ctl.rc_sacklast = NULL; 3491 memset(rsm, 0, sizeof(struct rack_sendmap)); 3492 /* Make sure we are not going to overrun our count limit of 0xff */ 3493 if ((rack->rc_free_cnt + 1) > 0xff) { 3494 rack_free_trim(rack); 3495 } 3496 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_free, rsm, r_tnext); 3497 rack->rc_free_cnt++; 3498 } 3499 3500 static uint32_t 3501 rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack) 3502 { 3503 uint64_t srtt, bw, len, tim; 3504 uint32_t segsiz, def_len, minl; 3505 3506 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 3507 def_len = rack_def_data_window * segsiz; 3508 if (rack->rc_gp_filled == 0) { 3509 /* 3510 * We have no measurement (IW is in flight?) so 3511 * we can only guess using our data_window sysctl 3512 * value (usually 20MSS). 3513 */ 3514 return (def_len); 3515 } 3516 /* 3517 * Now we have a number of factors to consider. 3518 * 3519 * 1) We have a desired BDP which is usually 3520 * at least 2. 3521 * 2) We have a minimum number of rtt's usually 1 SRTT 3522 * but we allow it too to be more. 3523 * 3) We want to make sure a measurement last N useconds (if 3524 * we have set rack_min_measure_usec. 3525 * 3526 * We handle the first concern here by trying to create a data 3527 * window of max(rack_def_data_window, DesiredBDP). The 3528 * second concern we handle in not letting the measurement 3529 * window end normally until at least the required SRTT's 3530 * have gone by which is done further below in 3531 * rack_enough_for_measurement(). Finally the third concern 3532 * we also handle here by calculating how long that time 3533 * would take at the current BW and then return the 3534 * max of our first calculation and that length. Note 3535 * that if rack_min_measure_usec is 0, we don't deal 3536 * with concern 3. Also for both Concern 1 and 3 an 3537 * application limited period could end the measurement 3538 * earlier. 3539 * 3540 * So lets calculate the BDP with the "known" b/w using 3541 * the SRTT has our rtt and then multiply it by the 3542 * goal. 
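 *
 * As a worked example (illustrative numbers only): with a measured
 * b/w of 1,250,000 bytes/sec (10 Mbps), an SRTT of 40,000 usec and
 * a goal BDP of 2, len = 1,250,000 * 40,000 / 1,000,000 = 50,000
 * bytes, doubled to 100,000 bytes, and rounded up to a 1448 byte
 * segsiz that is 101,360 bytes (70 segments) of measurement window.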
3543 */ 3544 bw = rack_get_bw(rack); 3545 srtt = (uint64_t)tp->t_srtt; 3546 len = bw * srtt; 3547 len /= (uint64_t)HPTS_USEC_IN_SEC; 3548 len *= max(1, rack_goal_bdp); 3549 /* Now we need to round up to the nearest MSS */ 3550 len = roundup(len, segsiz); 3551 if (rack_min_measure_usec) { 3552 /* Now calculate our min length for this b/w */ 3553 tim = rack_min_measure_usec; 3554 minl = (tim * bw) / (uint64_t)HPTS_USEC_IN_SEC; 3555 if (minl == 0) 3556 minl = 1; 3557 minl = roundup(minl, segsiz); 3558 if (len < minl) 3559 len = minl; 3560 } 3561 /* 3562 * Now if we have a very small window we want 3563 * to attempt to get the window that is 3564 * as small as possible. This happens on 3565 * low b/w connections and we don't want to 3566 * span huge numbers of rtt's between measurements. 3567 * 3568 * We basically include 2 over our "MIN window" so 3569 * that the measurement can be shortened (possibly) by 3570 * an ack'ed packet. 3571 */ 3572 if (len < def_len) 3573 return (max((uint32_t)len, ((MIN_GP_WIN+2) * segsiz))); 3574 else 3575 return (max((uint32_t)len, def_len)); 3576 3577 } 3578 3579 static int 3580 rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack, uint8_t *quality) 3581 { 3582 uint32_t tim, srtts, segsiz; 3583 3584 /* 3585 * Has enough time passed for the GP measurement to be valid? 3586 */ 3587 if (SEQ_LT(th_ack, tp->gput_seq)) { 3588 /* Not enough bytes yet */ 3589 return (0); 3590 } 3591 if ((tp->snd_max == tp->snd_una) || 3592 (th_ack == tp->snd_max)){ 3593 /* 3594 * All is acked quality of all acked is 3595 * usually low or medium, but we in theory could split 3596 * all acked into two cases, where you got 3597 * a signifigant amount of your window and 3598 * where you did not. For now we leave it 3599 * but it is something to contemplate in the 3600 * future. The danger here is that delayed ack 3601 * is effecting the last byte (which is a 50:50 chance). 3602 */ 3603 *quality = RACK_QUALITY_ALLACKED; 3604 return (1); 3605 } 3606 if (SEQ_GEQ(th_ack, tp->gput_ack)) { 3607 /* 3608 * We obtained our entire window of data we wanted 3609 * no matter if we are in recovery or not then 3610 * its ok since expanding the window does not 3611 * make things fuzzy (or at least not as much). 3612 */ 3613 *quality = RACK_QUALITY_HIGH; 3614 return (1); 3615 } 3616 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 3617 if (SEQ_LT(th_ack, tp->gput_ack) && 3618 ((th_ack - tp->gput_seq) < max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { 3619 /* Not enough bytes yet */ 3620 return (0); 3621 } 3622 if (rack->r_ctl.rc_first_appl && 3623 (SEQ_GEQ(th_ack, rack->r_ctl.rc_first_appl->r_end))) { 3624 /* 3625 * We are up to the app limited send point 3626 * we have to measure irrespective of the time.. 3627 */ 3628 *quality = RACK_QUALITY_APPLIMITED; 3629 return (1); 3630 } 3631 /* Now what about time? */ 3632 srtts = (rack->r_ctl.rc_gp_srtt * rack_min_srtts); 3633 tim = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - tp->gput_ts; 3634 if ((tim >= srtts) && (IN_RECOVERY(rack->rc_tp->t_flags) == 0)) { 3635 /* 3636 * We do not allow a measurement if we are in recovery 3637 * that would shrink the goodput window we wanted. 3638 * This is to prevent cloudyness of when the last send 3639 * was actually made. 
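 *
 * As an example of the time check (hypothetical values): with
 * rc_gp_srtt = 30,000 usec and rack_min_srtts = 1, the measurement
 * can only complete on time once 30,000 usec have elapsed since
 * gput_ts, and even then only while we are not in recovery.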
3640 */ 3641 *quality = RACK_QUALITY_HIGH; 3642 return (1); 3643 } 3644 /* Nope not even a full SRTT has passed */ 3645 return (0); 3646 } 3647 3648 static void 3649 rack_log_timely(struct tcp_rack *rack, 3650 uint32_t logged, uint64_t cur_bw, uint64_t low_bnd, 3651 uint64_t up_bnd, int line, uint8_t method) 3652 { 3653 if (tcp_bblogging_on(rack->rc_tp)) { 3654 union tcp_log_stackspecific log; 3655 struct timeval tv; 3656 3657 memset(&log, 0, sizeof(log)); 3658 log.u_bbr.flex1 = logged; 3659 log.u_bbr.flex2 = rack->rc_gp_timely_inc_cnt; 3660 log.u_bbr.flex2 <<= 4; 3661 log.u_bbr.flex2 |= rack->rc_gp_timely_dec_cnt; 3662 log.u_bbr.flex2 <<= 4; 3663 log.u_bbr.flex2 |= rack->rc_gp_incr; 3664 log.u_bbr.flex2 <<= 4; 3665 log.u_bbr.flex2 |= rack->rc_gp_bwred; 3666 log.u_bbr.flex3 = rack->rc_gp_incr; 3667 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; 3668 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ca; 3669 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_rec; 3670 log.u_bbr.flex7 = rack->rc_gp_bwred; 3671 log.u_bbr.flex8 = method; 3672 log.u_bbr.cur_del_rate = cur_bw; 3673 log.u_bbr.delRate = low_bnd; 3674 log.u_bbr.bw_inuse = up_bnd; 3675 log.u_bbr.rttProp = rack_get_bw(rack); 3676 log.u_bbr.pkt_epoch = line; 3677 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; 3678 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3679 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3680 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; 3681 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; 3682 log.u_bbr.cwnd_gain = rack->rc_dragged_bottom; 3683 log.u_bbr.cwnd_gain <<= 1; 3684 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_rec; 3685 log.u_bbr.cwnd_gain <<= 1; 3686 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; 3687 log.u_bbr.cwnd_gain <<= 1; 3688 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; 3689 log.u_bbr.lost = rack->r_ctl.rc_loss_count; 3690 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3691 &rack->rc_inp->inp_socket->so_rcv, 3692 &rack->rc_inp->inp_socket->so_snd, 3693 TCP_TIMELY_WORK, 0, 3694 0, &log, false, &tv); 3695 } 3696 } 3697 3698 static int 3699 rack_bw_can_be_raised(struct tcp_rack *rack, uint64_t cur_bw, uint64_t last_bw_est, uint16_t mult) 3700 { 3701 /* 3702 * Before we increase we need to know if 3703 * the estimate just made was less than 3704 * our pacing goal (i.e. (cur_bw * mult) > last_bw_est) 3705 * 3706 * If we already are pacing at a fast enough 3707 * rate to push us faster there is no sense of 3708 * increasing. 3709 * 3710 * We first caculate our actual pacing rate (ss or ca multiplier 3711 * times our cur_bw). 3712 * 3713 * Then we take the last measured rate and multipy by our 3714 * maximum pacing overage to give us a max allowable rate. 3715 * 3716 * If our act_rate is smaller than our max_allowable rate 3717 * then we should increase. Else we should hold steady. 3718 * 3719 */ 3720 uint64_t act_rate, max_allow_rate; 3721 3722 if (rack_timely_no_stopping) 3723 return (1); 3724 3725 if ((cur_bw == 0) || (last_bw_est == 0)) { 3726 /* 3727 * Initial startup case or 3728 * everything is acked case. 3729 */ 3730 rack_log_timely(rack, mult, cur_bw, 0, 0, 3731 __LINE__, 9); 3732 return (1); 3733 } 3734 if (mult <= 100) { 3735 /* 3736 * We can always pace at or slightly above our rate. 
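 *
 * For mult > 100 the comparison further below works out to, with
 * hypothetical numbers: cur_bw = 1,000,000 B/s and mult = 120 give
 * an actual pacing rate of 1,200,000 B/s; with last_bw_est =
 * 1,150,000 B/s and rack_max_per_above = 10 the allowable ceiling
 * is 1,265,000 B/s, so a further raise would still be permitted.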
3737 */ 3738 rack_log_timely(rack, mult, cur_bw, 0, 0, 3739 __LINE__, 9); 3740 return (1); 3741 } 3742 act_rate = cur_bw * (uint64_t)mult; 3743 act_rate /= 100; 3744 max_allow_rate = last_bw_est * ((uint64_t)rack_max_per_above + (uint64_t)100); 3745 max_allow_rate /= 100; 3746 if (act_rate < max_allow_rate) { 3747 /* 3748 * Here the rate we are actually pacing at 3749 * is smaller than 10% above our last measurement. 3750 * This means we are pacing below what we would 3751 * like to try to achieve (plus some wiggle room). 3752 */ 3753 rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate, 3754 __LINE__, 9); 3755 return (1); 3756 } else { 3757 /* 3758 * Here we are already pacing at least rack_max_per_above(10%) 3759 * what we are getting back. This indicates most likely 3760 * that we are being limited (cwnd/rwnd/app) and can't 3761 * get any more b/w. There is no sense of trying to 3762 * raise up the pacing rate its not speeding us up 3763 * and we already are pacing faster than we are getting. 3764 */ 3765 rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate, 3766 __LINE__, 8); 3767 return (0); 3768 } 3769 } 3770 3771 static void 3772 rack_validate_multipliers_at_or_above100(struct tcp_rack *rack) 3773 { 3774 /* 3775 * When we drag bottom, we want to assure 3776 * that no multiplier is below 1.0, if so 3777 * we want to restore it to at least that. 3778 */ 3779 if (rack->r_ctl.rack_per_of_gp_rec < 100) { 3780 /* This is unlikely we usually do not touch recovery */ 3781 rack->r_ctl.rack_per_of_gp_rec = 100; 3782 } 3783 if (rack->r_ctl.rack_per_of_gp_ca < 100) { 3784 rack->r_ctl.rack_per_of_gp_ca = 100; 3785 } 3786 if (rack->r_ctl.rack_per_of_gp_ss < 100) { 3787 rack->r_ctl.rack_per_of_gp_ss = 100; 3788 } 3789 } 3790 3791 static void 3792 rack_validate_multipliers_at_or_below_100(struct tcp_rack *rack) 3793 { 3794 if (rack->r_ctl.rack_per_of_gp_ca > 100) { 3795 rack->r_ctl.rack_per_of_gp_ca = 100; 3796 } 3797 if (rack->r_ctl.rack_per_of_gp_ss > 100) { 3798 rack->r_ctl.rack_per_of_gp_ss = 100; 3799 } 3800 } 3801 3802 static void 3803 rack_increase_bw_mul(struct tcp_rack *rack, int timely_says, uint64_t cur_bw, uint64_t last_bw_est, int override) 3804 { 3805 int32_t calc, logged, plus; 3806 3807 logged = 0; 3808 3809 if (override) { 3810 /* 3811 * override is passed when we are 3812 * loosing b/w and making one last 3813 * gasp at trying to not loose out 3814 * to a new-reno flow. 3815 */ 3816 goto extra_boost; 3817 } 3818 /* In classic timely we boost by 5x if we have 5 increases in a row, lets not */ 3819 if (rack->rc_gp_incr && 3820 ((rack->rc_gp_timely_inc_cnt + 1) >= RACK_TIMELY_CNT_BOOST)) { 3821 /* 3822 * Reset and get 5 strokes more before the boost. Note 3823 * that the count is 0 based so we have to add one. 
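 *
 * For instance, if rack_gp_increase_per were set to 2, the boosted
 * step would add 2 * RACK_TIMELY_CNT_BOOST (i.e. 10, with the
 * 5-stroke count described above) percentage points to the
 * affected multiplier in one shot instead of 2.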
3824 */ 3825 extra_boost: 3826 plus = (uint32_t)rack_gp_increase_per * RACK_TIMELY_CNT_BOOST; 3827 rack->rc_gp_timely_inc_cnt = 0; 3828 } else 3829 plus = (uint32_t)rack_gp_increase_per; 3830 /* Must be at least 1% increase for true timely increases */ 3831 if ((plus < 1) && 3832 ((rack->r_ctl.rc_rtt_diff <= 0) || (timely_says <= 0))) 3833 plus = 1; 3834 if (rack->rc_gp_saw_rec && 3835 (rack->rc_gp_no_rec_chg == 0) && 3836 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 3837 rack->r_ctl.rack_per_of_gp_rec)) { 3838 /* We have been in recovery ding it too */ 3839 calc = rack->r_ctl.rack_per_of_gp_rec + plus; 3840 if (calc > 0xffff) 3841 calc = 0xffff; 3842 logged |= 1; 3843 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)calc; 3844 if (rack->r_ctl.rack_per_upper_bound_ca && 3845 (rack->rc_dragged_bottom == 0) && 3846 (rack->r_ctl.rack_per_of_gp_rec > rack->r_ctl.rack_per_upper_bound_ca)) 3847 rack->r_ctl.rack_per_of_gp_rec = rack->r_ctl.rack_per_upper_bound_ca; 3848 } 3849 if (rack->rc_gp_saw_ca && 3850 (rack->rc_gp_saw_ss == 0) && 3851 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 3852 rack->r_ctl.rack_per_of_gp_ca)) { 3853 /* In CA */ 3854 calc = rack->r_ctl.rack_per_of_gp_ca + plus; 3855 if (calc > 0xffff) 3856 calc = 0xffff; 3857 logged |= 2; 3858 rack->r_ctl.rack_per_of_gp_ca = (uint16_t)calc; 3859 if (rack->r_ctl.rack_per_upper_bound_ca && 3860 (rack->rc_dragged_bottom == 0) && 3861 (rack->r_ctl.rack_per_of_gp_ca > rack->r_ctl.rack_per_upper_bound_ca)) 3862 rack->r_ctl.rack_per_of_gp_ca = rack->r_ctl.rack_per_upper_bound_ca; 3863 } 3864 if (rack->rc_gp_saw_ss && 3865 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 3866 rack->r_ctl.rack_per_of_gp_ss)) { 3867 /* In SS */ 3868 calc = rack->r_ctl.rack_per_of_gp_ss + plus; 3869 if (calc > 0xffff) 3870 calc = 0xffff; 3871 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)calc; 3872 if (rack->r_ctl.rack_per_upper_bound_ss && 3873 (rack->rc_dragged_bottom == 0) && 3874 (rack->r_ctl.rack_per_of_gp_ss > rack->r_ctl.rack_per_upper_bound_ss)) 3875 rack->r_ctl.rack_per_of_gp_ss = rack->r_ctl.rack_per_upper_bound_ss; 3876 logged |= 4; 3877 } 3878 if (logged && 3879 (rack->rc_gp_incr == 0)){ 3880 /* Go into increment mode */ 3881 rack->rc_gp_incr = 1; 3882 rack->rc_gp_timely_inc_cnt = 0; 3883 } 3884 if (rack->rc_gp_incr && 3885 logged && 3886 (rack->rc_gp_timely_inc_cnt < RACK_TIMELY_CNT_BOOST)) { 3887 rack->rc_gp_timely_inc_cnt++; 3888 } 3889 rack_log_timely(rack, logged, plus, 0, 0, 3890 __LINE__, 1); 3891 } 3892 3893 static uint32_t 3894 rack_get_decrease(struct tcp_rack *rack, uint32_t curper, int32_t rtt_diff) 3895 { 3896 /*- 3897 * norm_grad = rtt_diff / minrtt; 3898 * new_per = curper * (1 - B * norm_grad) 3899 * 3900 * B = rack_gp_decrease_per (default 80%) 3901 * rtt_dif = input var current rtt-diff 3902 * curper = input var current percentage 3903 * minrtt = from rack filter 3904 * 3905 * In order to do the floating point calculations above we 3906 * do an integer conversion. 
The code looks confusing so let me 3907 * translate it into something that use more variables and 3908 * is clearer for us humans :) 3909 * 3910 * uint64_t norm_grad, inverse, reduce_by, final_result; 3911 * uint32_t perf; 3912 * 3913 * norm_grad = (((uint64_t)rtt_diff * 1000000) / 3914 * (uint64_t)get_filter_small(&rack->r_ctl.rc_gp_min_rtt)); 3915 * inverse = ((uint64_t)rack_gp_decrease * (uint64_t)1000000) * norm_grad; 3916 * inverse /= 1000000; 3917 * reduce_by = (1000000 - inverse); 3918 * final_result = (cur_per * reduce_by) / 1000000; 3919 * perf = (uint32_t)final_result; 3920 */ 3921 uint64_t perf; 3922 3923 perf = (((uint64_t)curper * ((uint64_t)1000000 - 3924 ((uint64_t)rack_gp_decrease_per * (uint64_t)10000 * 3925 (((uint64_t)rtt_diff * (uint64_t)1000000)/ 3926 (uint64_t)get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)))/ 3927 (uint64_t)1000000)) / 3928 (uint64_t)1000000); 3929 if (perf > curper) { 3930 /* TSNH */ 3931 perf = curper - 1; 3932 } 3933 return ((uint32_t)perf); 3934 } 3935 3936 static uint32_t 3937 rack_decrease_highrtt(struct tcp_rack *rack, uint32_t curper, uint32_t rtt) 3938 { 3939 /* 3940 * highrttthresh 3941 * result = curper * (1 - (B * ( 1 - ------ )) 3942 * gp_srtt 3943 * 3944 * B = rack_gp_decrease_per (default .8 i.e. 80) 3945 * highrttthresh = filter_min * rack_gp_rtt_maxmul 3946 */ 3947 uint64_t perf; 3948 uint32_t highrttthresh; 3949 3950 highrttthresh = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; 3951 3952 perf = (((uint64_t)curper * ((uint64_t)1000000 - 3953 ((uint64_t)rack_gp_decrease_per * ((uint64_t)1000000 - 3954 ((uint64_t)highrttthresh * (uint64_t)1000000) / 3955 (uint64_t)rtt)) / 100)) /(uint64_t)1000000); 3956 if (tcp_bblogging_on(rack->rc_tp)) { 3957 uint64_t log1; 3958 3959 log1 = rtt; 3960 log1 <<= 32; 3961 log1 |= highrttthresh; 3962 rack_log_timely(rack, 3963 rack_gp_decrease_per, 3964 (uint64_t)curper, 3965 log1, 3966 perf, 3967 __LINE__, 3968 15); 3969 } 3970 return (perf); 3971 } 3972 3973 static void 3974 rack_decrease_bw_mul(struct tcp_rack *rack, int timely_says, uint32_t rtt, int32_t rtt_diff) 3975 { 3976 uint64_t logvar, logvar2, logvar3; 3977 uint32_t logged, new_per, ss_red, ca_red, rec_red, alt, val; 3978 3979 if (rack->rc_gp_incr) { 3980 /* Turn off increment counting */ 3981 rack->rc_gp_incr = 0; 3982 rack->rc_gp_timely_inc_cnt = 0; 3983 } 3984 ss_red = ca_red = rec_red = 0; 3985 logged = 0; 3986 /* Calculate the reduction value */ 3987 if (rtt_diff < 0) { 3988 rtt_diff *= -1; 3989 } 3990 /* Must be at least 1% reduction */ 3991 if (rack->rc_gp_saw_rec && (rack->rc_gp_no_rec_chg == 0)) { 3992 /* We have been in recovery ding it too */ 3993 if (timely_says == 2) { 3994 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_rec, rtt); 3995 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 3996 if (alt < new_per) 3997 val = alt; 3998 else 3999 val = new_per; 4000 } else 4001 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 4002 if (rack->r_ctl.rack_per_of_gp_rec > val) { 4003 rec_red = (rack->r_ctl.rack_per_of_gp_rec - val); 4004 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)val; 4005 } else { 4006 rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; 4007 rec_red = 0; 4008 } 4009 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_rec) 4010 rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; 4011 logged |= 1; 4012 } 4013 if (rack->rc_gp_saw_ss) { 4014 /* Sent in SS */ 4015 if (timely_says == 2) { 4016 new_per = 
rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ss, rtt); 4017 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff); 4018 if (alt < new_per) 4019 val = alt; 4020 else 4021 val = new_per; 4022 } else 4023 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff); 4024 if (rack->r_ctl.rack_per_of_gp_ss > new_per) { 4025 ss_red = rack->r_ctl.rack_per_of_gp_ss - val; 4026 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)val; 4027 } else { 4028 ss_red = new_per; 4029 rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; 4030 logvar = new_per; 4031 logvar <<= 32; 4032 logvar |= alt; 4033 logvar2 = (uint32_t)rtt; 4034 logvar2 <<= 32; 4035 logvar2 |= (uint32_t)rtt_diff; 4036 logvar3 = rack_gp_rtt_maxmul; 4037 logvar3 <<= 32; 4038 logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 4039 rack_log_timely(rack, timely_says, 4040 logvar2, logvar3, 4041 logvar, __LINE__, 10); 4042 } 4043 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ss) 4044 rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; 4045 logged |= 4; 4046 } else if (rack->rc_gp_saw_ca) { 4047 /* Sent in CA */ 4048 if (timely_says == 2) { 4049 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ca, rtt); 4050 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff); 4051 if (alt < new_per) 4052 val = alt; 4053 else 4054 val = new_per; 4055 } else 4056 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff); 4057 if (rack->r_ctl.rack_per_of_gp_ca > val) { 4058 ca_red = rack->r_ctl.rack_per_of_gp_ca - val; 4059 rack->r_ctl.rack_per_of_gp_ca = (uint16_t)val; 4060 } else { 4061 rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; 4062 ca_red = 0; 4063 logvar = new_per; 4064 logvar <<= 32; 4065 logvar |= alt; 4066 logvar2 = (uint32_t)rtt; 4067 logvar2 <<= 32; 4068 logvar2 |= (uint32_t)rtt_diff; 4069 logvar3 = rack_gp_rtt_maxmul; 4070 logvar3 <<= 32; 4071 logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 4072 rack_log_timely(rack, timely_says, 4073 logvar2, logvar3, 4074 logvar, __LINE__, 10); 4075 } 4076 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ca) 4077 rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; 4078 logged |= 2; 4079 } 4080 if (rack->rc_gp_timely_dec_cnt < 0x7) { 4081 rack->rc_gp_timely_dec_cnt++; 4082 if (rack_timely_dec_clear && 4083 (rack->rc_gp_timely_dec_cnt == rack_timely_dec_clear)) 4084 rack->rc_gp_timely_dec_cnt = 0; 4085 } 4086 logvar = ss_red; 4087 logvar <<= 32; 4088 logvar |= ca_red; 4089 rack_log_timely(rack, logged, rec_red, rack_per_lower_bound, logvar, 4090 __LINE__, 2); 4091 } 4092 4093 static void 4094 rack_log_rtt_shrinks(struct tcp_rack *rack, uint32_t us_cts, 4095 uint32_t rtt, uint32_t line, uint8_t reas) 4096 { 4097 if (tcp_bblogging_on(rack->rc_tp)) { 4098 union tcp_log_stackspecific log; 4099 struct timeval tv; 4100 4101 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 4102 log.u_bbr.flex1 = line; 4103 log.u_bbr.flex2 = rack->r_ctl.rc_time_probertt_starts; 4104 log.u_bbr.flex3 = rack->r_ctl.rc_lower_rtt_us_cts; 4105 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; 4106 log.u_bbr.flex5 = rtt; 4107 log.u_bbr.flex6 = rack->rc_highly_buffered; 4108 log.u_bbr.flex6 <<= 1; 4109 log.u_bbr.flex6 |= rack->forced_ack; 4110 log.u_bbr.flex6 <<= 1; 4111 log.u_bbr.flex6 |= rack->rc_gp_dyn_mul; 4112 log.u_bbr.flex6 <<= 1; 4113 log.u_bbr.flex6 |= rack->in_probe_rtt; 4114 log.u_bbr.flex6 <<= 1; 4115 log.u_bbr.flex6 |= rack->measure_saw_probe_rtt; 4116 log.u_bbr.flex7 = 
rack->r_ctl.rack_per_of_gp_probertt; 4117 log.u_bbr.pacing_gain = rack->r_ctl.rack_per_of_gp_ca; 4118 log.u_bbr.cwnd_gain = rack->r_ctl.rack_per_of_gp_rec; 4119 log.u_bbr.flex8 = reas; 4120 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 4121 log.u_bbr.delRate = rack_get_bw(rack); 4122 log.u_bbr.cur_del_rate = rack->r_ctl.rc_highest_us_rtt; 4123 log.u_bbr.cur_del_rate <<= 32; 4124 log.u_bbr.cur_del_rate |= rack->r_ctl.rc_lowest_us_rtt; 4125 log.u_bbr.applimited = rack->r_ctl.rc_time_probertt_entered; 4126 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; 4127 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 4128 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; 4129 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; 4130 log.u_bbr.pkt_epoch = rack->r_ctl.rc_lower_rtt_us_cts; 4131 log.u_bbr.delivered = rack->r_ctl.rc_target_probertt_flight; 4132 log.u_bbr.lost = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 4133 log.u_bbr.rttProp = us_cts; 4134 log.u_bbr.rttProp <<= 32; 4135 log.u_bbr.rttProp |= rack->r_ctl.rc_entry_gp_rtt; 4136 TCP_LOG_EVENTP(rack->rc_tp, NULL, 4137 &rack->rc_inp->inp_socket->so_rcv, 4138 &rack->rc_inp->inp_socket->so_snd, 4139 BBR_LOG_RTT_SHRINKS, 0, 4140 0, &log, false, &rack->r_ctl.act_rcv_time); 4141 } 4142 } 4143 4144 static void 4145 rack_set_prtt_target(struct tcp_rack *rack, uint32_t segsiz, uint32_t rtt) 4146 { 4147 uint64_t bwdp; 4148 4149 bwdp = rack_get_bw(rack); 4150 bwdp *= (uint64_t)rtt; 4151 bwdp /= (uint64_t)HPTS_USEC_IN_SEC; 4152 rack->r_ctl.rc_target_probertt_flight = roundup((uint32_t)bwdp, segsiz); 4153 if (rack->r_ctl.rc_target_probertt_flight < (segsiz * rack_timely_min_segs)) { 4154 /* 4155 * A window protocol must be able to have 4 packets 4156 * outstanding as the floor in order to function 4157 * (especially considering delayed ack :D). 4158 */ 4159 rack->r_ctl.rc_target_probertt_flight = (segsiz * rack_timely_min_segs); 4160 } 4161 } 4162 4163 static void 4164 rack_enter_probertt(struct tcp_rack *rack, uint32_t us_cts) 4165 { 4166 /** 4167 * ProbeRTT is a bit different in rack_pacing than in 4168 * BBR. It is like BBR in that it uses the lowering of 4169 * the RTT as a signal that we saw something new and 4170 * counts from there for how long between. But it is 4171 * different in that its quite simple. It does not 4172 * play with the cwnd and wait until we get down 4173 * to N segments outstanding and hold that for 4174 * 200ms. Instead it just sets the pacing reduction 4175 * rate to a set percentage (70 by default) and hold 4176 * that for a number of recent GP Srtt's. 4177 */ 4178 uint32_t segsiz; 4179 4180 if (rack->rc_gp_dyn_mul == 0) 4181 return; 4182 4183 if (rack->rc_tp->snd_max == rack->rc_tp->snd_una) { 4184 /* We are idle */ 4185 return; 4186 } 4187 if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && 4188 SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { 4189 /* 4190 * Stop the goodput now, the idea here is 4191 * that future measurements with in_probe_rtt 4192 * won't register if they are not greater so 4193 * we want to get what info (if any) is available 4194 * now. 
4195 */ 4196 rack_do_goodput_measurement(rack->rc_tp, rack, 4197 rack->rc_tp->snd_una, __LINE__, 4198 RACK_QUALITY_PROBERTT); 4199 } 4200 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 4201 rack->r_ctl.rc_time_probertt_entered = us_cts; 4202 segsiz = min(ctf_fixed_maxseg(rack->rc_tp), 4203 rack->r_ctl.rc_pace_min_segs); 4204 rack->in_probe_rtt = 1; 4205 rack->measure_saw_probe_rtt = 1; 4206 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 4207 rack->r_ctl.rc_time_probertt_starts = 0; 4208 rack->r_ctl.rc_entry_gp_rtt = rack->r_ctl.rc_gp_srtt; 4209 if (rack_probertt_use_min_rtt_entry) 4210 rack_set_prtt_target(rack, segsiz, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); 4211 else 4212 rack_set_prtt_target(rack, segsiz, rack->r_ctl.rc_gp_srtt); 4213 rack_log_rtt_shrinks(rack, us_cts, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4214 __LINE__, RACK_RTTS_ENTERPROBE); 4215 } 4216 4217 static void 4218 rack_exit_probertt(struct tcp_rack *rack, uint32_t us_cts) 4219 { 4220 struct rack_sendmap *rsm; 4221 uint32_t segsiz; 4222 4223 segsiz = min(ctf_fixed_maxseg(rack->rc_tp), 4224 rack->r_ctl.rc_pace_min_segs); 4225 rack->in_probe_rtt = 0; 4226 if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && 4227 SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { 4228 /* 4229 * Stop the goodput now, the idea here is 4230 * that future measurements with in_probe_rtt 4231 * won't register if they are not greater so 4232 * we want to get what info (if any) is available 4233 * now. 4234 */ 4235 rack_do_goodput_measurement(rack->rc_tp, rack, 4236 rack->rc_tp->snd_una, __LINE__, 4237 RACK_QUALITY_PROBERTT); 4238 } else if (rack->rc_tp->t_flags & TF_GPUTINPROG) { 4239 /* 4240 * We don't have enough data to make a measurement. 4241 * So lets just stop and start here after exiting 4242 * probe-rtt. We probably are not interested in 4243 * the results anyway. 4244 */ 4245 rack->rc_tp->t_flags &= ~TF_GPUTINPROG; 4246 } 4247 /* 4248 * Measurements through the current snd_max are going 4249 * to be limited by the slower pacing rate. 4250 * 4251 * We need to mark these as app-limited so we 4252 * don't collapse the b/w. 4253 */ 4254 rsm = tqhash_max(rack->r_ctl.tqh); 4255 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { 4256 if (rack->r_ctl.rc_app_limited_cnt == 0) 4257 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; 4258 else { 4259 /* 4260 * Go out to the end app limited and mark 4261 * this new one as next and move the end_appl up 4262 * to this guy. 4263 */ 4264 if (rack->r_ctl.rc_end_appl) 4265 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; 4266 rack->r_ctl.rc_end_appl = rsm; 4267 } 4268 rsm->r_flags |= RACK_APP_LIMITED; 4269 rack->r_ctl.rc_app_limited_cnt++; 4270 } 4271 /* 4272 * Now, we need to examine our pacing rate multipliers. 4273 * If its under 100%, we need to kick it back up to 4274 * 100%. We also don't let it be over our "max" above 4275 * the actual rate i.e. 100% + rack_clamp_atexit_prtt. 4276 * Note setting clamp_atexit_prtt to 0 has the effect 4277 * of setting CA/SS to 100% always at exit (which is 4278 * the default behavior). 4279 */ 4280 if (rack_probertt_clear_is) { 4281 rack->rc_gp_incr = 0; 4282 rack->rc_gp_bwred = 0; 4283 rack->rc_gp_timely_inc_cnt = 0; 4284 rack->rc_gp_timely_dec_cnt = 0; 4285 } 4286 /* Do we do any clamping at exit? 
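 * (rack_atexit_prtt_hbp is the clamp applied on highly buffered
 * paths, rack_atexit_prtt the one applied otherwise; leaving
 * either at 0 skips that clamp.)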
*/ 4287 if (rack->rc_highly_buffered && rack_atexit_prtt_hbp) { 4288 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt_hbp; 4289 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt_hbp; 4290 } 4291 if ((rack->rc_highly_buffered == 0) && rack_atexit_prtt) { 4292 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt; 4293 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt; 4294 } 4295 /* 4296 * Lets set rtt_diff to 0, so that we will get a "boost" 4297 * after exiting. 4298 */ 4299 rack->r_ctl.rc_rtt_diff = 0; 4300 4301 /* Clear all flags so we start fresh */ 4302 rack->rc_tp->t_bytes_acked = 0; 4303 rack->rc_tp->t_ccv.flags &= ~CCF_ABC_SENTAWND; 4304 /* 4305 * If configured to, set the cwnd and ssthresh to 4306 * our targets. 4307 */ 4308 if (rack_probe_rtt_sets_cwnd) { 4309 uint64_t ebdp; 4310 uint32_t setto; 4311 4312 /* Set ssthresh so we get into CA once we hit our target */ 4313 if (rack_probertt_use_min_rtt_exit == 1) { 4314 /* Set to min rtt */ 4315 rack_set_prtt_target(rack, segsiz, 4316 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); 4317 } else if (rack_probertt_use_min_rtt_exit == 2) { 4318 /* Set to current gp rtt */ 4319 rack_set_prtt_target(rack, segsiz, 4320 rack->r_ctl.rc_gp_srtt); 4321 } else if (rack_probertt_use_min_rtt_exit == 3) { 4322 /* Set to entry gp rtt */ 4323 rack_set_prtt_target(rack, segsiz, 4324 rack->r_ctl.rc_entry_gp_rtt); 4325 } else { 4326 uint64_t sum; 4327 uint32_t setval; 4328 4329 sum = rack->r_ctl.rc_entry_gp_rtt; 4330 sum *= 10; 4331 sum /= (uint64_t)(max(1, rack->r_ctl.rc_gp_srtt)); 4332 if (sum >= 20) { 4333 /* 4334 * A highly buffered path needs 4335 * cwnd space for timely to work. 4336 * Lets set things up as if 4337 * we are heading back here again. 4338 */ 4339 setval = rack->r_ctl.rc_entry_gp_rtt; 4340 } else if (sum >= 15) { 4341 /* 4342 * Lets take the smaller of the 4343 * two since we are just somewhat 4344 * buffered. 4345 */ 4346 setval = rack->r_ctl.rc_gp_srtt; 4347 if (setval > rack->r_ctl.rc_entry_gp_rtt) 4348 setval = rack->r_ctl.rc_entry_gp_rtt; 4349 } else { 4350 /* 4351 * Here we are not highly buffered 4352 * and should pick the min we can to 4353 * keep from causing loss. 
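 *
 * (Since sum = rc_entry_gp_rtt * 10 / rc_gp_srtt, the three cases
 * above amount to: an entry RTT of at least 2x the current gp_srtt
 * is treated as highly buffered, at least 1.5x as somewhat
 * buffered, and anything smaller lands here.)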
4354 */ 4355 setval = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 4356 } 4357 rack_set_prtt_target(rack, segsiz, 4358 setval); 4359 } 4360 if (rack_probe_rtt_sets_cwnd > 1) { 4361 /* There is a percentage here to boost */ 4362 ebdp = rack->r_ctl.rc_target_probertt_flight; 4363 ebdp *= rack_probe_rtt_sets_cwnd; 4364 ebdp /= 100; 4365 setto = rack->r_ctl.rc_target_probertt_flight + ebdp; 4366 } else 4367 setto = rack->r_ctl.rc_target_probertt_flight; 4368 rack->rc_tp->snd_cwnd = roundup(setto, segsiz); 4369 if (rack->rc_tp->snd_cwnd < (segsiz * rack_timely_min_segs)) { 4370 /* Enforce a min */ 4371 rack->rc_tp->snd_cwnd = segsiz * rack_timely_min_segs; 4372 } 4373 /* If we set in the cwnd also set the ssthresh point so we are in CA */ 4374 rack->rc_tp->snd_ssthresh = (rack->rc_tp->snd_cwnd - 1); 4375 } 4376 rack_log_rtt_shrinks(rack, us_cts, 4377 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4378 __LINE__, RACK_RTTS_EXITPROBE); 4379 /* Clear times last so log has all the info */ 4380 rack->r_ctl.rc_probertt_sndmax_atexit = rack->rc_tp->snd_max; 4381 rack->r_ctl.rc_time_probertt_entered = us_cts; 4382 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 4383 rack->r_ctl.rc_time_of_last_probertt = us_cts; 4384 } 4385 4386 static void 4387 rack_check_probe_rtt(struct tcp_rack *rack, uint32_t us_cts) 4388 { 4389 /* Check in on probe-rtt */ 4390 if (rack->rc_gp_filled == 0) { 4391 /* We do not do p-rtt unless we have gp measurements */ 4392 return; 4393 } 4394 if (rack->in_probe_rtt) { 4395 uint64_t no_overflow; 4396 uint32_t endtime, must_stay; 4397 4398 if (rack->r_ctl.rc_went_idle_time && 4399 ((us_cts - rack->r_ctl.rc_went_idle_time) > rack_min_probertt_hold)) { 4400 /* 4401 * We went idle during prtt, just exit now. 4402 */ 4403 rack_exit_probertt(rack, us_cts); 4404 } else if (rack_probe_rtt_safety_val && 4405 TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered) && 4406 ((us_cts - rack->r_ctl.rc_time_probertt_entered) > rack_probe_rtt_safety_val)) { 4407 /* 4408 * Probe RTT safety value triggered! 4409 */ 4410 rack_log_rtt_shrinks(rack, us_cts, 4411 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4412 __LINE__, RACK_RTTS_SAFETY); 4413 rack_exit_probertt(rack, us_cts); 4414 } 4415 /* Calculate the max we will wait */ 4416 endtime = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_max_drain_wait); 4417 if (rack->rc_highly_buffered) 4418 endtime += (rack->r_ctl.rc_gp_srtt * rack_max_drain_hbp); 4419 /* Calculate the min we must wait */ 4420 must_stay = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_must_drain); 4421 if ((ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.rc_target_probertt_flight) && 4422 TSTMP_LT(us_cts, endtime)) { 4423 uint32_t calc; 4424 /* Do we lower more? 
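 * (Each full gp_srtt already spent in probe-rtt lowers the
 * probe-rtt pacing percentage by another
 * rack_per_of_gp_probertt_reduce, but never below
 * rack_per_of_gp_lowthresh.)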
*/ 4425 no_exit: 4426 if (TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered)) 4427 calc = us_cts - rack->r_ctl.rc_time_probertt_entered; 4428 else 4429 calc = 0; 4430 calc /= max(rack->r_ctl.rc_gp_srtt, 1); 4431 if (calc) { 4432 /* Maybe */ 4433 calc *= rack_per_of_gp_probertt_reduce; 4434 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc; 4435 /* Limit it too */ 4436 if (rack->r_ctl.rack_per_of_gp_probertt < rack_per_of_gp_lowthresh) 4437 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh; 4438 } 4439 /* We must reach target or the time set */ 4440 return; 4441 } 4442 if (rack->r_ctl.rc_time_probertt_starts == 0) { 4443 if ((TSTMP_LT(us_cts, must_stay) && 4444 rack->rc_highly_buffered) || 4445 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > 4446 rack->r_ctl.rc_target_probertt_flight)) { 4447 /* We are not past the must_stay time */ 4448 goto no_exit; 4449 } 4450 rack_log_rtt_shrinks(rack, us_cts, 4451 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4452 __LINE__, RACK_RTTS_REACHTARGET); 4453 rack->r_ctl.rc_time_probertt_starts = us_cts; 4454 if (rack->r_ctl.rc_time_probertt_starts == 0) 4455 rack->r_ctl.rc_time_probertt_starts = 1; 4456 /* Restore back to our rate we want to pace at in prtt */ 4457 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 4458 } 4459 /* 4460 * Setup our end time, some number of gp_srtts plus 200ms. 4461 */ 4462 no_overflow = ((uint64_t)rack->r_ctl.rc_gp_srtt * 4463 (uint64_t)rack_probertt_gpsrtt_cnt_mul); 4464 if (rack_probertt_gpsrtt_cnt_div) 4465 endtime = (uint32_t)(no_overflow / (uint64_t)rack_probertt_gpsrtt_cnt_div); 4466 else 4467 endtime = 0; 4468 endtime += rack_min_probertt_hold; 4469 endtime += rack->r_ctl.rc_time_probertt_starts; 4470 if (TSTMP_GEQ(us_cts, endtime)) { 4471 /* yes, exit probertt */ 4472 rack_exit_probertt(rack, us_cts); 4473 } 4474 4475 } else if ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt) { 4476 /* Go into probertt, its been too long since we went lower */ 4477 rack_enter_probertt(rack, us_cts); 4478 } 4479 } 4480 4481 static void 4482 rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last_bw_est, 4483 uint32_t rtt, int32_t rtt_diff) 4484 { 4485 uint64_t cur_bw, up_bnd, low_bnd, subfr; 4486 uint32_t losses; 4487 4488 if ((rack->rc_gp_dyn_mul == 0) || 4489 (rack->use_fixed_rate) || 4490 (rack->in_probe_rtt) || 4491 (rack->rc_always_pace == 0)) { 4492 /* No dynamic GP multiplier in play */ 4493 return; 4494 } 4495 losses = rack->r_ctl.rc_loss_count - rack->r_ctl.rc_loss_at_start; 4496 cur_bw = rack_get_bw(rack); 4497 /* Calculate our up and down range */ 4498 up_bnd = rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_up; 4499 up_bnd /= 100; 4500 up_bnd += rack->r_ctl.last_gp_comp_bw; 4501 4502 subfr = (uint64_t)rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_down; 4503 subfr /= 100; 4504 low_bnd = rack->r_ctl.last_gp_comp_bw - subfr; 4505 if ((timely_says == 2) && (rack->r_ctl.rc_no_push_at_mrtt)) { 4506 /* 4507 * This is the case where our RTT is above 4508 * the max target and we have been configured 4509 * to just do timely no bonus up stuff in that case. 4510 * 4511 * There are two configurations, set to 1, and we 4512 * just do timely if we are over our max. If its 4513 * set above 1 then we slam the multipliers down 4514 * to 100 and then decrement per timely. 
4515 */ 4516 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 4517 __LINE__, 3); 4518 if (rack->r_ctl.rc_no_push_at_mrtt > 1) 4519 rack_validate_multipliers_at_or_below_100(rack); 4520 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); 4521 } else if ((timely_says != 0) && (last_bw_est < low_bnd) && !losses) { 4522 /* 4523 * We are decreasing this is a bit complicated this 4524 * means we are loosing ground. This could be 4525 * because another flow entered and we are competing 4526 * for b/w with it. This will push the RTT up which 4527 * makes timely unusable unless we want to get shoved 4528 * into a corner and just be backed off (the age 4529 * old problem with delay based CC). 4530 * 4531 * On the other hand if it was a route change we 4532 * would like to stay somewhat contained and not 4533 * blow out the buffers. 4534 */ 4535 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 4536 __LINE__, 3); 4537 rack->r_ctl.last_gp_comp_bw = cur_bw; 4538 if (rack->rc_gp_bwred == 0) { 4539 /* Go into reduction counting */ 4540 rack->rc_gp_bwred = 1; 4541 rack->rc_gp_timely_dec_cnt = 0; 4542 } 4543 if (rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) { 4544 /* 4545 * Push another time with a faster pacing 4546 * to try to gain back (we include override to 4547 * get a full raise factor). 4548 */ 4549 if ((rack->rc_gp_saw_ca && rack->r_ctl.rack_per_of_gp_ca <= rack_down_raise_thresh) || 4550 (rack->rc_gp_saw_ss && rack->r_ctl.rack_per_of_gp_ss <= rack_down_raise_thresh) || 4551 (timely_says == 0) || 4552 (rack_down_raise_thresh == 0)) { 4553 /* 4554 * Do an override up in b/w if we were 4555 * below the threshold or if the threshold 4556 * is zero we always do the raise. 4557 */ 4558 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 1); 4559 } else { 4560 /* Log it stays the same */ 4561 rack_log_timely(rack, 0, last_bw_est, low_bnd, 0, 4562 __LINE__, 11); 4563 } 4564 rack->rc_gp_timely_dec_cnt++; 4565 /* We are not incrementing really no-count */ 4566 rack->rc_gp_incr = 0; 4567 rack->rc_gp_timely_inc_cnt = 0; 4568 } else { 4569 /* 4570 * Lets just use the RTT 4571 * information and give up 4572 * pushing. 4573 */ 4574 goto use_timely; 4575 } 4576 } else if ((timely_says != 2) && 4577 !losses && 4578 (last_bw_est > up_bnd)) { 4579 /* 4580 * We are increasing b/w lets keep going, updating 4581 * our b/w and ignoring any timely input, unless 4582 * of course we are at our max raise (if there is one). 4583 */ 4584 4585 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 4586 __LINE__, 3); 4587 rack->r_ctl.last_gp_comp_bw = cur_bw; 4588 if (rack->rc_gp_saw_ss && 4589 rack->r_ctl.rack_per_upper_bound_ss && 4590 (rack->r_ctl.rack_per_of_gp_ss == rack->r_ctl.rack_per_upper_bound_ss)) { 4591 /* 4592 * In cases where we can't go higher 4593 * we should just use timely. 4594 */ 4595 goto use_timely; 4596 } 4597 if (rack->rc_gp_saw_ca && 4598 rack->r_ctl.rack_per_upper_bound_ca && 4599 (rack->r_ctl.rack_per_of_gp_ca == rack->r_ctl.rack_per_upper_bound_ca)) { 4600 /* 4601 * In cases where we can't go higher 4602 * we should just use timely. 
4603 */ 4604 goto use_timely; 4605 } 4606 rack->rc_gp_bwred = 0; 4607 rack->rc_gp_timely_dec_cnt = 0; 4608 /* You get a set number of pushes if timely is trying to reduce */ 4609 if ((rack->rc_gp_incr < rack_timely_max_push_rise) || (timely_says == 0)) { 4610 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 4611 } else { 4612 /* Log it stays the same */ 4613 rack_log_timely(rack, 0, last_bw_est, up_bnd, 0, 4614 __LINE__, 12); 4615 } 4616 return; 4617 } else { 4618 /* 4619 * We are staying between the lower and upper range bounds 4620 * so use timely to decide. 4621 */ 4622 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 4623 __LINE__, 3); 4624 use_timely: 4625 if (timely_says) { 4626 rack->rc_gp_incr = 0; 4627 rack->rc_gp_timely_inc_cnt = 0; 4628 if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) && 4629 !losses && 4630 (last_bw_est < low_bnd)) { 4631 /* We are loosing ground */ 4632 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 4633 rack->rc_gp_timely_dec_cnt++; 4634 /* We are not incrementing really no-count */ 4635 rack->rc_gp_incr = 0; 4636 rack->rc_gp_timely_inc_cnt = 0; 4637 } else 4638 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); 4639 } else { 4640 rack->rc_gp_bwred = 0; 4641 rack->rc_gp_timely_dec_cnt = 0; 4642 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 4643 } 4644 } 4645 } 4646 4647 static int32_t 4648 rack_make_timely_judgement(struct tcp_rack *rack, uint32_t rtt, int32_t rtt_diff, uint32_t prev_rtt) 4649 { 4650 int32_t timely_says; 4651 uint64_t log_mult, log_rtt_a_diff; 4652 4653 log_rtt_a_diff = rtt; 4654 log_rtt_a_diff <<= 32; 4655 log_rtt_a_diff |= (uint32_t)rtt_diff; 4656 if (rtt >= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * 4657 rack_gp_rtt_maxmul)) { 4658 /* Reduce the b/w multiplier */ 4659 timely_says = 2; 4660 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; 4661 log_mult <<= 32; 4662 log_mult |= prev_rtt; 4663 rack_log_timely(rack, timely_says, log_mult, 4664 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4665 log_rtt_a_diff, __LINE__, 4); 4666 } else if (rtt <= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) + 4667 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) / 4668 max(rack_gp_rtt_mindiv , 1)))) { 4669 /* Increase the b/w multiplier */ 4670 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) + 4671 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) / 4672 max(rack_gp_rtt_mindiv , 1)); 4673 log_mult <<= 32; 4674 log_mult |= prev_rtt; 4675 timely_says = 0; 4676 rack_log_timely(rack, timely_says, log_mult , 4677 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4678 log_rtt_a_diff, __LINE__, 5); 4679 } else { 4680 /* 4681 * Use a gradient to find it the timely gradient 4682 * is: 4683 * grad = rc_rtt_diff / min_rtt; 4684 * 4685 * anything below or equal to 0 will be 4686 * a increase indication. Anything above 4687 * zero is a decrease. Note we take care 4688 * of the actual gradient calculation 4689 * in the reduction (its not needed for 4690 * increase). 
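 *
 * (Illustrative numbers, not from a trace: with a filtered min_rtt
 *  of 10000 usecs, an rtt_diff of -500 means the RTT is shrinking
 *  and we signal an increase (timely_says = 0), while an rtt_diff of
 *  +2000 signals a decrease (timely_says = 1); the gradient itself
 *  is only computed later by the decrease path.)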
4691 */ 4692 log_mult = prev_rtt; 4693 if (rtt_diff <= 0) { 4694 /* 4695 * Rttdiff is less than zero, increase the 4696 * b/w multiplier (its 0 or negative) 4697 */ 4698 timely_says = 0; 4699 rack_log_timely(rack, timely_says, log_mult, 4700 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 6); 4701 } else { 4702 /* Reduce the b/w multiplier */ 4703 timely_says = 1; 4704 rack_log_timely(rack, timely_says, log_mult, 4705 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 7); 4706 } 4707 } 4708 return (timely_says); 4709 } 4710 4711 static __inline int 4712 rack_in_gp_window(struct tcpcb *tp, struct rack_sendmap *rsm) 4713 { 4714 if (SEQ_GEQ(rsm->r_start, tp->gput_seq) && 4715 SEQ_LEQ(rsm->r_end, tp->gput_ack)) { 4716 /** 4717 * This covers the case that the 4718 * resent is completely inside 4719 * the gp range or up to it. 4720 * |----------------| 4721 * |-----| <or> 4722 * |----| 4723 * <or> |---| 4724 */ 4725 return (1); 4726 } else if (SEQ_LT(rsm->r_start, tp->gput_seq) && 4727 SEQ_GT(rsm->r_end, tp->gput_seq)){ 4728 /** 4729 * This covers the case of 4730 * |--------------| 4731 * |-------->| 4732 */ 4733 return (1); 4734 } else if (SEQ_GEQ(rsm->r_start, tp->gput_seq) && 4735 SEQ_LT(rsm->r_start, tp->gput_ack) && 4736 SEQ_GEQ(rsm->r_end, tp->gput_ack)) { 4737 4738 /** 4739 * This covers the case of 4740 * |--------------| 4741 * |-------->| 4742 */ 4743 return (1); 4744 } 4745 return (0); 4746 } 4747 4748 static __inline void 4749 rack_mark_in_gp_win(struct tcpcb *tp, struct rack_sendmap *rsm) 4750 { 4751 4752 if ((tp->t_flags & TF_GPUTINPROG) == 0) 4753 return; 4754 /* 4755 * We have a Goodput measurement in progress. Mark 4756 * the send if its within the window. If its not 4757 * in the window make sure it does not have the mark. 4758 */ 4759 if (rack_in_gp_window(tp, rsm)) 4760 rsm->r_flags |= RACK_IN_GP_WIN; 4761 else 4762 rsm->r_flags &= ~RACK_IN_GP_WIN; 4763 } 4764 4765 static __inline void 4766 rack_clear_gp_marks(struct tcpcb *tp, struct tcp_rack *rack) 4767 { 4768 /* A GP measurement is ending, clear all marks on the send map*/ 4769 struct rack_sendmap *rsm = NULL; 4770 4771 rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq); 4772 if (rsm == NULL) { 4773 rsm = tqhash_min(rack->r_ctl.tqh); 4774 } 4775 /* Nothing left? */ 4776 while ((rsm != NULL) && (SEQ_GEQ(tp->gput_ack, rsm->r_start))){ 4777 rsm->r_flags &= ~RACK_IN_GP_WIN; 4778 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 4779 } 4780 } 4781 4782 4783 static __inline void 4784 rack_tend_gp_marks(struct tcpcb *tp, struct tcp_rack *rack) 4785 { 4786 struct rack_sendmap *rsm = NULL; 4787 4788 if (tp->snd_una == tp->snd_max) { 4789 /* Nothing outstanding yet, nothing to do here */ 4790 return; 4791 } 4792 if (SEQ_GT(tp->gput_seq, tp->snd_una)) { 4793 /* 4794 * We are measuring ahead of some outstanding 4795 * data. We need to walk through up until we get 4796 * to gp_seq marking so that no rsm is set incorrectly 4797 * with RACK_IN_GP_WIN. 4798 */ 4799 rsm = tqhash_min(rack->r_ctl.tqh); 4800 while (rsm != NULL) { 4801 rack_mark_in_gp_win(tp, rsm); 4802 if (SEQ_GEQ(rsm->r_end, tp->gput_seq)) 4803 break; 4804 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 4805 } 4806 } 4807 if (rsm == NULL) { 4808 /* 4809 * Need to find the GP seq, if rsm is 4810 * set we stopped as we hit it. 
4811 */ 4812 rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq); 4813 if (rsm == NULL) 4814 return; 4815 rack_mark_in_gp_win(tp, rsm); 4816 } 4817 /* 4818 * Now we may need to mark already sent rsm, ahead of 4819 * gput_seq in the window since they may have been sent 4820 * *before* we started our measurment. The rsm, if non-null 4821 * has been marked (note if rsm would have been NULL we would have 4822 * returned in the previous block). So we go to the next, and continue 4823 * until we run out of entries or we exceed the gp_ack value. 4824 */ 4825 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 4826 while (rsm) { 4827 rack_mark_in_gp_win(tp, rsm); 4828 if (SEQ_GT(rsm->r_end, tp->gput_ack)) 4829 break; 4830 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 4831 } 4832 } 4833 4834 static void 4835 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, 4836 tcp_seq th_ack, int line, uint8_t quality) 4837 { 4838 uint64_t tim, bytes_ps, stim, utim; 4839 uint32_t segsiz, bytes, reqbytes, us_cts; 4840 int32_t gput, new_rtt_diff, timely_says; 4841 uint64_t resid_bw, subpart = 0, addpart = 0, srtt; 4842 int did_add = 0; 4843 4844 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 4845 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 4846 if (TSTMP_GEQ(us_cts, tp->gput_ts)) 4847 tim = us_cts - tp->gput_ts; 4848 else 4849 tim = 0; 4850 if (rack->r_ctl.rc_gp_cumack_ts > rack->r_ctl.rc_gp_output_ts) 4851 stim = rack->r_ctl.rc_gp_cumack_ts - rack->r_ctl.rc_gp_output_ts; 4852 else 4853 stim = 0; 4854 /* 4855 * Use the larger of the send time or ack time. This prevents us 4856 * from being influenced by ack artifacts to come up with too 4857 * high of measurement. Note that since we are spanning over many more 4858 * bytes in most of our measurements hopefully that is less likely to 4859 * occur. 4860 */ 4861 if (tim > stim) 4862 utim = max(tim, 1); 4863 else 4864 utim = max(stim, 1); 4865 reqbytes = min(rc_init_window(rack), (MIN_GP_WIN * segsiz)); 4866 rack_log_gpset(rack, th_ack, us_cts, rack->r_ctl.rc_gp_cumack_ts, __LINE__, 3, NULL); 4867 if ((tim == 0) && (stim == 0)) { 4868 /* 4869 * Invalid measurement time, maybe 4870 * all on one ack/one send? 4871 */ 4872 bytes = 0; 4873 bytes_ps = 0; 4874 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4875 0, 0, 0, 10, __LINE__, NULL, quality); 4876 goto skip_measurement; 4877 } 4878 if (rack->r_ctl.rc_gp_lowrtt == 0xffffffff) { 4879 /* We never made a us_rtt measurement? */ 4880 bytes = 0; 4881 bytes_ps = 0; 4882 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4883 0, 0, 0, 10, __LINE__, NULL, quality); 4884 goto skip_measurement; 4885 } 4886 /* 4887 * Calculate the maximum possible b/w this connection 4888 * could have. We base our calculation on the lowest 4889 * rtt we have seen during the measurement and the 4890 * largest rwnd the client has given us in that time. This 4891 * forms a BDP that is the maximum that we could ever 4892 * get to the client. Anything larger is not valid. 4893 * 4894 * I originally had code here that rejected measurements 4895 * where the time was less than 1/2 the latest us_rtt. 4896 * But after thinking on that I realized its wrong since 4897 * say you had a 150Mbps or even 1Gbps link, and you 4898 * were a long way away.. example I am in Europe (100ms rtt) 4899 * talking to my 1Gbps link in S.C. Now measuring say 150,000 4900 * bytes my time would be 1.2ms, and yet my rtt would say 4901 * the measurement was invalid the time was < 50ms. The 4902 * same thing is true for 150Mb (8ms of time). 
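 *
 * (For scale, using the cap derived below from the lowest rtt and
 *  largest rwnd, with assumed numbers: 12,500,000 bytes of rwnd over
 *  a 100,000 usec lowest rtt allows at most
 *  12,500,000 * 1,000,000 / 100,000 = 125,000,000 bytes per second,
 *  roughly 1 Gbps; any goodput sample above that is treated as the
 *  path bunching packets together.)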
4903 * 4904 * A better way I realized is to look at what the maximum 4905 * the connection could possibly do. This is gated on 4906 * the lowest RTT we have seen and the highest rwnd. 4907 * We should in theory never exceed that, if we are 4908 * then something on the path is storing up packets 4909 * and then feeding them all at once to our endpoint 4910 * messing up our measurement. 4911 */ 4912 rack->r_ctl.last_max_bw = rack->r_ctl.rc_gp_high_rwnd; 4913 rack->r_ctl.last_max_bw *= HPTS_USEC_IN_SEC; 4914 rack->r_ctl.last_max_bw /= rack->r_ctl.rc_gp_lowrtt; 4915 if (SEQ_LT(th_ack, tp->gput_seq)) { 4916 /* No measurement can be made */ 4917 bytes = 0; 4918 bytes_ps = 0; 4919 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4920 0, 0, 0, 10, __LINE__, NULL, quality); 4921 goto skip_measurement; 4922 } else 4923 bytes = (th_ack - tp->gput_seq); 4924 bytes_ps = (uint64_t)bytes; 4925 /* 4926 * Don't measure a b/w for pacing unless we have gotten at least 4927 * an initial windows worth of data in this measurement interval. 4928 * 4929 * Small numbers of bytes get badly influenced by delayed ack and 4930 * other artifacts. Note we take the initial window or our 4931 * defined minimum GP (defaulting to 10 which hopefully is the 4932 * IW). 4933 */ 4934 if (rack->rc_gp_filled == 0) { 4935 /* 4936 * The initial estimate is special. We 4937 * have blasted out an IW worth of packets 4938 * without a real valid ack ts results. We 4939 * then setup the app_limited_needs_set flag, 4940 * this should get the first ack in (probably 2 4941 * MSS worth) to be recorded as the timestamp. 4942 * We thus allow a smaller number of bytes i.e. 4943 * IW - 2MSS. 4944 */ 4945 reqbytes -= (2 * segsiz); 4946 /* Also lets fill previous for our first measurement to be neutral */ 4947 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; 4948 } 4949 if ((bytes_ps < reqbytes) || rack->app_limited_needs_set) { 4950 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4951 rack->r_ctl.rc_app_limited_cnt, 4952 0, 0, 10, __LINE__, NULL, quality); 4953 goto skip_measurement; 4954 } 4955 /* 4956 * We now need to calculate the Timely like status so 4957 * we can update (possibly) the b/w multipliers. 4958 */ 4959 new_rtt_diff = (int32_t)rack->r_ctl.rc_gp_srtt - (int32_t)rack->r_ctl.rc_prev_gp_srtt; 4960 if (rack->rc_gp_filled == 0) { 4961 /* No previous reading */ 4962 rack->r_ctl.rc_rtt_diff = new_rtt_diff; 4963 } else { 4964 if (rack->measure_saw_probe_rtt == 0) { 4965 /* 4966 * We don't want a probertt to be counted 4967 * since it will be negative incorrectly. We 4968 * expect to be reducing the RTT when we 4969 * pace at a slower rate. 4970 */ 4971 rack->r_ctl.rc_rtt_diff -= (rack->r_ctl.rc_rtt_diff / 8); 4972 rack->r_ctl.rc_rtt_diff += (new_rtt_diff / 8); 4973 } 4974 } 4975 timely_says = rack_make_timely_judgement(rack, 4976 rack->r_ctl.rc_gp_srtt, 4977 rack->r_ctl.rc_rtt_diff, 4978 rack->r_ctl.rc_prev_gp_srtt 4979 ); 4980 bytes_ps *= HPTS_USEC_IN_SEC; 4981 bytes_ps /= utim; 4982 if (bytes_ps > rack->r_ctl.last_max_bw) { 4983 /* 4984 * Something is on path playing 4985 * since this b/w is not possible based 4986 * on our BDP (highest rwnd and lowest rtt 4987 * we saw in the measurement window). 4988 * 4989 * Another option here would be to 4990 * instead skip the measurement. 
4991 */ 4992 rack_log_pacing_delay_calc(rack, bytes, reqbytes, 4993 bytes_ps, rack->r_ctl.last_max_bw, 0, 4994 11, __LINE__, NULL, quality); 4995 bytes_ps = rack->r_ctl.last_max_bw; 4996 } 4997 /* We store gp for b/w in bytes per second */ 4998 if (rack->rc_gp_filled == 0) { 4999 /* Initial measurement */ 5000 if (bytes_ps) { 5001 rack->r_ctl.gp_bw = bytes_ps; 5002 rack->rc_gp_filled = 1; 5003 rack->r_ctl.num_measurements = 1; 5004 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 5005 } else { 5006 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 5007 rack->r_ctl.rc_app_limited_cnt, 5008 0, 0, 10, __LINE__, NULL, quality); 5009 } 5010 if (tcp_in_hpts(rack->rc_tp) && 5011 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 5012 /* 5013 * Ok we can't trust the pacer in this case 5014 * where we transition from un-paced to paced. 5015 * Or for that matter when the burst mitigation 5016 * was making a wild guess and got it wrong. 5017 * Stop the pacer and clear up all the aggregate 5018 * delays etc. 5019 */ 5020 tcp_hpts_remove(rack->rc_tp); 5021 rack->r_ctl.rc_hpts_flags = 0; 5022 rack->r_ctl.rc_last_output_to = 0; 5023 } 5024 did_add = 2; 5025 } else if (rack->r_ctl.num_measurements < RACK_REQ_AVG) { 5026 /* Still a small number run an average */ 5027 rack->r_ctl.gp_bw += bytes_ps; 5028 addpart = rack->r_ctl.num_measurements; 5029 rack->r_ctl.num_measurements++; 5030 if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) { 5031 /* We have collected enough to move forward */ 5032 rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_measurements; 5033 } 5034 rack_set_pace_segments(tp, rack, __LINE__, NULL); 5035 did_add = 3; 5036 } else { 5037 /* 5038 * We want to take 1/wma of the goodput and add in to 7/8th 5039 * of the old value weighted by the srtt. So if your measurement 5040 * period is say 2 SRTT's long you would get 1/4 as the 5041 * value, if it was like 1/2 SRTT then you would get 1/16th. 5042 * 5043 * But we must be careful not to take too much i.e. if the 5044 * srtt is say 20ms and the measurement is taken over 5045 * 400ms our weight would be 400/20 i.e. 20. On the 5046 * other hand if we get a measurement over 1ms with a 5047 * 10ms rtt we only want to take a much smaller portion. 5048 */ 5049 if (rack->r_ctl.num_measurements < 0xff) { 5050 rack->r_ctl.num_measurements++; 5051 } 5052 srtt = (uint64_t)tp->t_srtt; 5053 if (srtt == 0) { 5054 /* 5055 * Strange why did t_srtt go back to zero? 5056 */ 5057 if (rack->r_ctl.rc_rack_min_rtt) 5058 srtt = rack->r_ctl.rc_rack_min_rtt; 5059 else 5060 srtt = HPTS_USEC_IN_MSEC; 5061 } 5062 /* 5063 * XXXrrs: Note for reviewers, in playing with 5064 * dynamic pacing I discovered this GP calculation 5065 * as done originally leads to some undesired results. 5066 * Basically you can get longer measurements contributing 5067 * too much to the WMA. Thus I changed it if you are doing 5068 * dynamic adjustments to only do the aportioned adjustment 5069 * if we have a very small (time wise) measurement. Longer 5070 * measurements just get there weight (defaulting to 1/8) 5071 * add to the WMA. We may want to think about changing 5072 * this to always do that for both sides i.e. dynamic 5073 * and non-dynamic... but considering lots of folks 5074 * were playing with this I did not want to change the 5075 * calculation per.se. without your thoughts.. Lawerence? 5076 * Peter?? 
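 *
 * (Worked example of the weighting below, using the 1/8 default
 *  mentioned above and made-up times: with an srtt of 20 ms, a 40 ms
 *  measurement on the dynamic path contributes the full 1/8 of the
 *  new sample, while a 2 ms measurement only contributes
 *  2 / (20 * 8) = 1/80 of it; the non-dynamic path instead scales by
 *  utim / (srtt * 8) but never lets a single sample replace more
 *  than half of the running average.)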
5077 */ 5078 if (rack->rc_gp_dyn_mul == 0) { 5079 subpart = rack->r_ctl.gp_bw * utim; 5080 subpart /= (srtt * 8); 5081 if (subpart < (rack->r_ctl.gp_bw / 2)) { 5082 /* 5083 * The b/w update takes no more 5084 * away then 1/2 our running total 5085 * so factor it in. 5086 */ 5087 addpart = bytes_ps * utim; 5088 addpart /= (srtt * 8); 5089 } else { 5090 /* 5091 * Don't allow a single measurement 5092 * to account for more than 1/2 of the 5093 * WMA. This could happen on a retransmission 5094 * where utim becomes huge compared to 5095 * srtt (multiple retransmissions when using 5096 * the sending rate which factors in all the 5097 * transmissions from the first one). 5098 */ 5099 subpart = rack->r_ctl.gp_bw / 2; 5100 addpart = bytes_ps / 2; 5101 } 5102 resid_bw = rack->r_ctl.gp_bw - subpart; 5103 rack->r_ctl.gp_bw = resid_bw + addpart; 5104 did_add = 1; 5105 } else { 5106 if ((utim / srtt) <= 1) { 5107 /* 5108 * The b/w update was over a small period 5109 * of time. The idea here is to prevent a small 5110 * measurement time period from counting 5111 * too much. So we scale it based on the 5112 * time so it attributes less than 1/rack_wma_divisor 5113 * of its measurement. 5114 */ 5115 subpart = rack->r_ctl.gp_bw * utim; 5116 subpart /= (srtt * rack_wma_divisor); 5117 addpart = bytes_ps * utim; 5118 addpart /= (srtt * rack_wma_divisor); 5119 } else { 5120 /* 5121 * The scaled measurement was long 5122 * enough so lets just add in the 5123 * portion of the measurement i.e. 1/rack_wma_divisor 5124 */ 5125 subpart = rack->r_ctl.gp_bw / rack_wma_divisor; 5126 addpart = bytes_ps / rack_wma_divisor; 5127 } 5128 if ((rack->measure_saw_probe_rtt == 0) || 5129 (bytes_ps > rack->r_ctl.gp_bw)) { 5130 /* 5131 * For probe-rtt we only add it in 5132 * if its larger, all others we just 5133 * add in. 5134 */ 5135 did_add = 1; 5136 resid_bw = rack->r_ctl.gp_bw - subpart; 5137 rack->r_ctl.gp_bw = resid_bw + addpart; 5138 } 5139 } 5140 rack_set_pace_segments(tp, rack, __LINE__, NULL); 5141 } 5142 if ((rack->gp_ready == 0) && 5143 (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) { 5144 /* We have enough measurements now */ 5145 rack->gp_ready = 1; 5146 if (rack->dgp_on || 5147 rack->rack_hibeta) 5148 rack_set_cc_pacing(rack); 5149 if (rack->defer_options) 5150 rack_apply_deferred_options(rack); 5151 } 5152 rack_log_pacing_delay_calc(rack, subpart, addpart, bytes_ps, stim, 5153 rack_get_bw(rack), 22, did_add, NULL, quality); 5154 /* We do not update any multipliers if we are in or have seen a probe-rtt */ 5155 if ((rack->measure_saw_probe_rtt == 0) && rack->rc_gp_rtt_set) 5156 rack_update_multiplier(rack, timely_says, bytes_ps, 5157 rack->r_ctl.rc_gp_srtt, 5158 rack->r_ctl.rc_rtt_diff); 5159 rack_log_pacing_delay_calc(rack, bytes, tim, bytes_ps, stim, 5160 rack_get_bw(rack), 3, line, NULL, quality); 5161 rack_log_pacing_delay_calc(rack, 5162 bytes, /* flex2 */ 5163 tim, /* flex1 */ 5164 bytes_ps, /* bw_inuse */ 5165 rack->r_ctl.gp_bw, /* delRate */ 5166 rack_get_lt_bw(rack), /* rttProp */ 5167 20, line, NULL, 0); 5168 /* reset the gp srtt and setup the new prev */ 5169 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; 5170 /* Record the lost count for the next measurement */ 5171 rack->r_ctl.rc_loss_at_start = rack->r_ctl.rc_loss_count; 5172 skip_measurement: 5173 /* 5174 * We restart our diffs based on the gpsrtt in the 5175 * measurement window. 
5176 */ 5177 rack->rc_gp_rtt_set = 0; 5178 rack->rc_gp_saw_rec = 0; 5179 rack->rc_gp_saw_ca = 0; 5180 rack->rc_gp_saw_ss = 0; 5181 rack->rc_dragged_bottom = 0; 5182 5183 if (quality == RACK_QUALITY_HIGH) { 5184 /* 5185 * Gput in the stats world is in kbps where bytes_ps is 5186 * bytes per second so we do ((x * 8)/ 1000). 5187 */ 5188 gput = (int32_t)((bytes_ps << 3) / (uint64_t)1000); 5189 #ifdef STATS 5190 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, 5191 gput); 5192 /* 5193 * XXXLAS: This is a temporary hack, and should be 5194 * chained off VOI_TCP_GPUT when stats(9) grows an 5195 * API to deal with chained VOIs. 5196 */ 5197 if (tp->t_stats_gput_prev > 0) 5198 stats_voi_update_abs_s32(tp->t_stats, 5199 VOI_TCP_GPUT_ND, 5200 ((gput - tp->t_stats_gput_prev) * 100) / 5201 tp->t_stats_gput_prev); 5202 #endif 5203 tp->t_stats_gput_prev = gput; 5204 } 5205 tp->t_flags &= ~TF_GPUTINPROG; 5206 /* 5207 * Now are we app limited now and there is space from where we 5208 * were to where we want to go? 5209 * 5210 * We don't do the other case i.e. non-applimited here since 5211 * the next send will trigger us picking up the missing data. 5212 */ 5213 if (rack->r_ctl.rc_first_appl && 5214 TCPS_HAVEESTABLISHED(tp->t_state) && 5215 rack->r_ctl.rc_app_limited_cnt && 5216 (SEQ_GT(rack->r_ctl.rc_first_appl->r_start, th_ack)) && 5217 ((rack->r_ctl.rc_first_appl->r_end - th_ack) > 5218 max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { 5219 /* 5220 * Yep there is enough outstanding to make a measurement here. 5221 */ 5222 struct rack_sendmap *rsm; 5223 5224 rack->r_ctl.rc_gp_lowrtt = 0xffffffff; 5225 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 5226 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 5227 rack->app_limited_needs_set = 0; 5228 tp->gput_seq = th_ack; 5229 if (rack->in_probe_rtt) 5230 rack->measure_saw_probe_rtt = 1; 5231 else if ((rack->measure_saw_probe_rtt) && 5232 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 5233 rack->measure_saw_probe_rtt = 0; 5234 if ((rack->r_ctl.rc_first_appl->r_end - th_ack) >= rack_get_measure_window(tp, rack)) { 5235 /* There is a full window to gain info from */ 5236 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 5237 } else { 5238 /* We can only measure up to the applimited point */ 5239 tp->gput_ack = tp->gput_seq + (rack->r_ctl.rc_first_appl->r_end - th_ack); 5240 if ((tp->gput_ack - tp->gput_seq) < (MIN_GP_WIN * segsiz)) { 5241 /* 5242 * We don't have enough to make a measurement. 5243 */ 5244 tp->t_flags &= ~TF_GPUTINPROG; 5245 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq, 5246 0, 0, 0, 6, __LINE__, NULL, quality); 5247 return; 5248 } 5249 } 5250 if (tp->t_state >= TCPS_FIN_WAIT_1) { 5251 /* 5252 * We will get no more data into the SB 5253 * this means we need to have the data available 5254 * before we start a measurement. 5255 */ 5256 if (sbavail(&tptosocket(tp)->so_snd) < (tp->gput_ack - tp->gput_seq)) { 5257 /* Nope not enough data. */ 5258 return; 5259 } 5260 } 5261 tp->t_flags |= TF_GPUTINPROG; 5262 /* 5263 * Now we need to find the timestamp of the send at tp->gput_seq 5264 * for the send based measurement. 5265 */ 5266 rack->r_ctl.rc_gp_cumack_ts = 0; 5267 rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq); 5268 if (rsm) { 5269 /* Ok send-based limit is set */ 5270 if (SEQ_LT(rsm->r_start, tp->gput_seq)) { 5271 /* 5272 * Move back to include the earlier part 5273 * so our ack time lines up right (this may 5274 * make an overlapping measurement but thats 5275 * ok). 
5276 */ 5277 tp->gput_seq = rsm->r_start; 5278 } 5279 if (rsm->r_flags & RACK_ACKED) { 5280 struct rack_sendmap *nrsm; 5281 5282 tp->gput_ts = (uint32_t)rsm->r_ack_arrival; 5283 tp->gput_seq = rsm->r_end; 5284 nrsm = tqhash_next(rack->r_ctl.tqh, rsm); 5285 if (nrsm) 5286 rsm = nrsm; 5287 else { 5288 rack->app_limited_needs_set = 1; 5289 } 5290 } else 5291 rack->app_limited_needs_set = 1; 5292 /* We always go from the first send */ 5293 rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[0]; 5294 } else { 5295 /* 5296 * If we don't find the rsm due to some 5297 * send-limit set the current time, which 5298 * basically disables the send-limit. 5299 */ 5300 struct timeval tv; 5301 5302 microuptime(&tv); 5303 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); 5304 } 5305 rack_tend_gp_marks(tp, rack); 5306 rack_log_pacing_delay_calc(rack, 5307 tp->gput_seq, 5308 tp->gput_ack, 5309 (uint64_t)rsm, 5310 tp->gput_ts, 5311 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts), 5312 9, 5313 __LINE__, rsm, quality); 5314 rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL); 5315 } else { 5316 /* 5317 * To make sure proper timestamp merging occurs, we need to clear 5318 * all GP marks if we don't start a measurement. 5319 */ 5320 rack_clear_gp_marks(tp, rack); 5321 } 5322 } 5323 5324 /* 5325 * CC wrapper hook functions 5326 */ 5327 static void 5328 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint16_t nsegs, 5329 uint16_t type, int32_t recovery) 5330 { 5331 uint32_t prior_cwnd, acked; 5332 struct tcp_log_buffer *lgb = NULL; 5333 uint8_t labc_to_use, quality; 5334 5335 INP_WLOCK_ASSERT(tptoinpcb(tp)); 5336 tp->t_ccv.nsegs = nsegs; 5337 acked = tp->t_ccv.bytes_this_ack = (th_ack - tp->snd_una); 5338 if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) { 5339 uint32_t max; 5340 5341 max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp); 5342 if (tp->t_ccv.bytes_this_ack > max) { 5343 tp->t_ccv.bytes_this_ack = max; 5344 } 5345 } 5346 #ifdef STATS 5347 stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, 5348 ((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd); 5349 #endif 5350 if ((th_ack == tp->snd_max) && rack->lt_bw_up) { 5351 /* We will ack all, time 5352 * to end any lt_bw_up we 5353 * have running until something 5354 * new is sent. 
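 *
 * (The bookkeeping below just accumulates bytes sent and the elapsed
 *  tracked time; with made-up numbers, 3,000,000 bytes over 2 seconds
 *  of tracked time corresponds to a long-term rate of 1.5 MB/sec.)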
5355 */ 5356 struct timeval tv; 5357 5358 rack->r_ctl.lt_bw_bytes += (tp->snd_max - rack->r_ctl.lt_seq); 5359 rack->r_ctl.lt_seq = tp->snd_max; 5360 (void)tcp_get_usecs(&tv); 5361 rack->r_ctl.lt_bw_time += (tcp_tv_to_lusectick(&tv) - rack->r_ctl.lt_timemark); 5362 rack->lt_bw_up = 0; 5363 } 5364 quality = RACK_QUALITY_NONE; 5365 if ((tp->t_flags & TF_GPUTINPROG) && 5366 rack_enough_for_measurement(tp, rack, th_ack, &quality)) { 5367 /* Measure the Goodput */ 5368 rack_do_goodput_measurement(tp, rack, th_ack, __LINE__, quality); 5369 } 5370 /* Which way our we limited, if not cwnd limited no advance in CA */ 5371 if (tp->snd_cwnd <= tp->snd_wnd) 5372 tp->t_ccv.flags |= CCF_CWND_LIMITED; 5373 else 5374 tp->t_ccv.flags &= ~CCF_CWND_LIMITED; 5375 if (tp->snd_cwnd > tp->snd_ssthresh) { 5376 tp->t_bytes_acked += min(tp->t_ccv.bytes_this_ack, 5377 nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp)); 5378 /* For the setting of a window past use the actual scwnd we are using */ 5379 if (tp->t_bytes_acked >= rack->r_ctl.cwnd_to_use) { 5380 tp->t_bytes_acked -= rack->r_ctl.cwnd_to_use; 5381 tp->t_ccv.flags |= CCF_ABC_SENTAWND; 5382 } 5383 } else { 5384 tp->t_ccv.flags &= ~CCF_ABC_SENTAWND; 5385 tp->t_bytes_acked = 0; 5386 } 5387 prior_cwnd = tp->snd_cwnd; 5388 if ((recovery == 0) || (rack_max_abc_post_recovery == 0) || rack->r_use_labc_for_rec || 5389 (rack_client_low_buf && rack->client_bufferlvl && 5390 (rack->client_bufferlvl < rack_client_low_buf))) 5391 labc_to_use = rack->rc_labc; 5392 else 5393 labc_to_use = rack_max_abc_post_recovery; 5394 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 5395 union tcp_log_stackspecific log; 5396 struct timeval tv; 5397 5398 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 5399 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 5400 log.u_bbr.flex1 = th_ack; 5401 log.u_bbr.flex2 = tp->t_ccv.flags; 5402 log.u_bbr.flex3 = tp->t_ccv.bytes_this_ack; 5403 log.u_bbr.flex4 = tp->t_ccv.nsegs; 5404 log.u_bbr.flex5 = labc_to_use; 5405 log.u_bbr.flex6 = prior_cwnd; 5406 log.u_bbr.flex7 = V_tcp_do_newsack; 5407 log.u_bbr.flex8 = 1; 5408 lgb = tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 5409 0, &log, false, NULL, __func__, __LINE__,&tv); 5410 } 5411 if (CC_ALGO(tp)->ack_received != NULL) { 5412 /* XXXLAS: Find a way to live without this */ 5413 tp->t_ccv.curack = th_ack; 5414 tp->t_ccv.labc = labc_to_use; 5415 tp->t_ccv.flags |= CCF_USE_LOCAL_ABC; 5416 CC_ALGO(tp)->ack_received(&tp->t_ccv, type); 5417 } 5418 if (lgb) { 5419 lgb->tlb_stackinfo.u_bbr.flex6 = tp->snd_cwnd; 5420 } 5421 if (rack->r_must_retran) { 5422 if (SEQ_GEQ(th_ack, rack->r_ctl.rc_snd_max_at_rto)) { 5423 /* 5424 * We now are beyond the rxt point so lets disable 5425 * the flag. 5426 */ 5427 rack->r_ctl.rc_out_at_rto = 0; 5428 rack->r_must_retran = 0; 5429 } else if ((prior_cwnd + ctf_fixed_maxseg(tp)) <= tp->snd_cwnd) { 5430 /* 5431 * Only decrement the rc_out_at_rto if the cwnd advances 5432 * at least a whole segment. Otherwise next time the peer 5433 * acks, we won't be able to send this generaly happens 5434 * when we are in Congestion Avoidance. 
5435 */ 5436 if (acked <= rack->r_ctl.rc_out_at_rto){ 5437 rack->r_ctl.rc_out_at_rto -= acked; 5438 } else { 5439 rack->r_ctl.rc_out_at_rto = 0; 5440 } 5441 } 5442 } 5443 #ifdef STATS 5444 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, rack->r_ctl.cwnd_to_use); 5445 #endif 5446 if (rack->r_ctl.rc_rack_largest_cwnd < rack->r_ctl.cwnd_to_use) { 5447 rack->r_ctl.rc_rack_largest_cwnd = rack->r_ctl.cwnd_to_use; 5448 } 5449 } 5450 5451 static void 5452 tcp_rack_partialack(struct tcpcb *tp) 5453 { 5454 struct tcp_rack *rack; 5455 5456 rack = (struct tcp_rack *)tp->t_fb_ptr; 5457 INP_WLOCK_ASSERT(tptoinpcb(tp)); 5458 /* 5459 * If we are doing PRR and have enough 5460 * room to send <or> we are pacing and prr 5461 * is disabled we will want to see if we 5462 * can send data (by setting r_wanted_output to 5463 * true). 5464 */ 5465 if ((rack->r_ctl.rc_prr_sndcnt > 0) || 5466 rack->rack_no_prr) 5467 rack->r_wanted_output = 1; 5468 } 5469 5470 static inline void 5471 rack_set_most_aggr(struct tcp_rack *rack) 5472 { 5473 rack->r_fill_less_agg = 0; 5474 /* Once the cwnd has been clamped we don't do fill_cw */ 5475 if (rack->r_cwnd_was_clamped == 0) 5476 rack->rc_pace_to_cwnd = 1; 5477 rack->r_pacing_discount = 0; 5478 } 5479 5480 static inline void 5481 rack_limit_fillcw(struct tcp_rack *rack) 5482 { 5483 rack->r_fill_less_agg = 1; 5484 /* Once the cwnd has been clamped we don't do fill_cw */ 5485 if (rack->r_cwnd_was_clamped == 0) 5486 rack->rc_pace_to_cwnd = 1; 5487 rack->r_pacing_discount = 0; 5488 } 5489 5490 static inline void 5491 rack_disable_fillcw(struct tcp_rack *rack) 5492 { 5493 rack->r_fill_less_agg = 1; 5494 rack->rc_pace_to_cwnd = 0; 5495 rack->r_pacing_discount = 0; 5496 } 5497 5498 static void 5499 rack_client_buffer_level_set(struct tcp_rack *rack) 5500 { 5501 /* 5502 * Only if DGP is on do we do anything that 5503 * changes stack behavior. If DGP is off all 5504 * we will do is issue a BB log (if BB logging is 5505 * on) and return. 5506 */ 5507 if (rack->dgp_on == 0) { 5508 rack_log_pacing_delay_calc(rack, 0, rack->client_bufferlvl, 5509 0, 0, 0, 30, __LINE__, NULL, 0); 5510 return; 5511 } 5512 if (IN_RECOVERY(rack->rc_tp->t_flags) && rack->r_ctl.full_dgp_in_rec) { 5513 goto set_most_agg; 5514 } 5515 /* 5516 * We are in DGP so what setting should we 5517 * apply based on where the client is?
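 *
 * (Summary of the switch below, derived from the code: LEVEL0 and
 *  the default always pick the most aggressive fill-cw setting;
 *  LEVEL1 limits fill-cw at buffer level 4 and disables it at 5;
 *  LEVEL2 limits at 3, disables at 4, and at 5 also applies a pacing
 *  discount of 1; LEVEL3 limits at 2, disables at 3, and applies
 *  discounts of 1 and 2 at levels 4 and 5; every other buffer level
 *  falls back to the most aggressive setting.)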
5518 */ 5519 switch(rack->r_ctl.rc_dgp_bl_agg) { 5520 default: 5521 case DGP_LEVEL0: 5522 set_most_agg: 5523 rack_set_most_aggr(rack); 5524 break; 5525 case DGP_LEVEL1: 5526 if (rack->client_bufferlvl == 4) 5527 rack_limit_fillcw(rack); 5528 else if (rack->client_bufferlvl == 5) 5529 rack_disable_fillcw(rack); 5530 else 5531 rack_set_most_aggr(rack); 5532 break; 5533 case DGP_LEVEL2: 5534 if (rack->client_bufferlvl == 3) 5535 rack_limit_fillcw(rack); 5536 else if (rack->client_bufferlvl == 4) 5537 rack_disable_fillcw(rack); 5538 else if (rack->client_bufferlvl == 5) { 5539 rack_disable_fillcw(rack); 5540 rack->r_pacing_discount = 1; 5541 rack->r_ctl.pacing_discount_amm = 1; 5542 } else 5543 rack_set_most_aggr(rack); 5544 break; 5545 case DGP_LEVEL3: 5546 if (rack->client_bufferlvl == 2) 5547 rack_limit_fillcw(rack); 5548 else if (rack->client_bufferlvl == 3) 5549 rack_disable_fillcw(rack); 5550 else if (rack->client_bufferlvl == 4) { 5551 rack_disable_fillcw(rack); 5552 rack->r_pacing_discount = 1; 5553 rack->r_ctl.pacing_discount_amm = 1; 5554 } else if (rack->client_bufferlvl == 5) { 5555 rack_disable_fillcw(rack); 5556 rack->r_pacing_discount = 1; 5557 rack->r_ctl.pacing_discount_amm = 2; 5558 } else 5559 rack_set_most_aggr(rack); 5560 break; 5561 } 5562 rack_log_pacing_delay_calc(rack, rack->r_ctl.rc_dgp_bl_agg, rack->client_bufferlvl, 0, 5563 0, 0, 30, __LINE__, NULL, 0); 5564 } 5565 5566 static void 5567 do_rack_check_for_unclamp(struct tcpcb *tp, struct tcp_rack *rack) 5568 { 5569 /* 5570 * Can we unclamp. We unclamp if more than 5571 * N rounds have transpired with no loss. 5572 */ 5573 uint64_t snds, rxts, rxt_per; 5574 uint32_t rnds; 5575 5576 rnds = rack->r_ctl.current_round - rack->r_ctl.last_rnd_rxt_clamped; 5577 if ((rack_unclamp_round_thresh > 0) && 5578 (rnds >= rack_unclamp_round_thresh)) { 5579 snds = tp->t_sndbytes - rack->r_ctl.last_sndbytes; 5580 KASSERT ((snds > 0), ("rack:%p tp:%p snds:%ju is 0", rack, tp, 5581 (uintmax_t)snds)); 5582 rxts = tp->t_snd_rxt_bytes - rack->r_ctl.last_snd_rxt_bytes; 5583 rxt_per = rxts * 1000; 5584 rxt_per /= snds; 5585 if ((uint32_t)rxt_per <= rack_unclamp_rxt_thresh) { 5586 /* Unclamp */ 5587 if (tcp_bblogging_on(rack->rc_tp)) { 5588 union tcp_log_stackspecific log; 5589 struct timeval tv; 5590 5591 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 5592 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 5593 log.u_bbr.flex3 = rnds; 5594 log.u_bbr.flex4 = rack_unclamp_round_thresh; 5595 log.u_bbr.flex5 = (uint32_t)rxt_per; 5596 log.u_bbr.flex8 = 6; 5597 log.u_bbr.pkt_epoch = rack->r_ctl.rc_pace_max_segs; 5598 log.u_bbr.bbr_state = rack->rc_pace_to_cwnd; 5599 log.u_bbr.delivered = rack->r_ctl.num_of_clamps_applied; 5600 log.u_bbr.applimited = rack->r_ctl.max_clamps; 5601 log.u_bbr.epoch = rack->r_ctl.clamp_options; 5602 log.u_bbr.cur_del_rate = rxts; 5603 log.u_bbr.bw_inuse = rack_get_lt_bw(rack); 5604 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 5605 log.u_bbr.lt_epoch = (uint32_t)((rack->r_ctl.gp_bw >> 32) & 0x00000000ffffffff); 5606 log.u_bbr.pkts_out = (uint32_t)(rack->r_ctl.gp_bw & 0x00000000ffffffff); 5607 tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 5608 0, &log, false, NULL, NULL, 0, &tv); 5609 } 5610 rack->r_ctl.num_of_clamps_applied = 0; 5611 rack->r_cwnd_was_clamped = 0; 5612 rack->excess_rxt_on = 1; 5613 if (rack->r_ctl.clamp_options) { 5614 /* 5615 * We only allow fillcw to be toggled 5616 * if you are setting a max seg too. 
5617 */ 5618 if (rack->r_ctl.clamp_options & 0x1) { 5619 if ((rack->rc_pace_to_cwnd == 0) && (rack->dgp_on == 0)) { 5620 /* turn on fill cw for non-dgp*/ 5621 rack->rc_pace_to_cwnd = 0; 5622 } else if ((rack->dgp_on == 1) && (rack->rc_pace_to_cwnd == 1)) { 5623 /* For DGP we want it off */ 5624 rack->rc_pace_to_cwnd = 1; 5625 } 5626 } 5627 } 5628 if (rack->dgp_on) { 5629 /* Reset all multipliers to 100.0 so just the measured bw */ 5630 /* Crash any per boosts down to 100% */ 5631 rack->r_ctl.rack_per_of_gp_rec = 100; 5632 rack->r_ctl.rack_per_of_gp_ss = 100; 5633 rack->r_ctl.rack_per_of_gp_ca = 100; 5634 /* Set in an upper bound for ss/ca % increase */ 5635 rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_per_upper_bound_ss; 5636 rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_per_upper_bound_ca; 5637 } 5638 } 5639 } 5640 } 5641 5642 static void 5643 do_rack_excess_rxt(struct tcpcb *tp, struct tcp_rack *rack) 5644 { 5645 /* 5646 * Rack excess rxt accounting is turned on. If we 5647 * are above a threshold of rxt's in at least N 5648 * rounds, then back off the cwnd and ssthresh 5649 * to fit into the long-term b/w. 5650 */ 5651 uint64_t snds, rxts, rxt_per, lt_bw, bdp; 5652 uint32_t rnds, new_cwnd, new_ssthresh, rtt, shared_cwnd_was_enabled = 0; 5653 5654 /* Is it shut off by 0 rounds? */ 5655 if (rack_rxt_min_rnds == 0) 5656 return; 5657 if ((rack->r_ctl.max_clamps > 0) && 5658 (rack->r_ctl.num_of_clamps_applied >= rack->r_ctl.max_clamps)) { 5659 /* 5660 * The idea, if max_clamps is set, is that if clamping it 5661 * N times did not work again, then there is no sense 5662 * clamping it again. The link is just a lossy link and 5663 * our clamps are doing no good. Turn it off so we don't come 5664 * back here again. 5665 */ 5666 rack->excess_rxt_on = 0; 5667 rack->r_cwnd_was_clamped = 0; 5668 rack->r_ctl.num_of_clamps_applied = 0; 5669 return; 5670 } 5671 snds = tp->t_sndbytes - rack->r_ctl.last_sndbytes; 5672 rxts = tp->t_snd_rxt_bytes - rack->r_ctl.last_snd_rxt_bytes; 5673 rnds = rack->r_ctl.current_round - rack->r_ctl.last_rnd_rxt_clamped; 5674 /* Has enough rounds progressed for us to re-measure? */ 5675 if ((rnds >= rack_rxt_min_rnds) && 5676 (rack->r_ctl.rxt_threshold > 0)){ 5677 rxt_per = rxts * 1000; 5678 rxt_per /= snds; 5679 if (rxt_per >= rack->r_ctl.rxt_threshold) { 5680 /* 5681 * Action required: 5682 * We are above our excess retransmit level, lets 5683 * cut down the cwnd and ssthresh to match the long-term 5684 * b/w we are getting. 5685 */ 5686 /* First disable scwnd if enabled */ 5687 #ifdef NETFLIX_SHARED_CWND 5688 rack->rack_enable_scwnd = 0; 5689 if (rack->r_ctl.rc_scw) { 5690 uint32_t limit; 5691 5692 shared_cwnd_was_enabled = 1; 5693 if (rack->r_limit_scw) 5694 limit = max(1, rack->r_ctl.rc_lowest_us_rtt); 5695 else 5696 limit = 0; 5697 tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw, 5698 rack->r_ctl.rc_scw_index, 5699 limit); 5700 rack->r_ctl.rc_scw = NULL; 5701 } 5702 5703 #endif 5704 /* Calculate what the cwnd and ssthresh should be */ 5705 tcp_trace_point(rack->rc_tp, TCP_TP_EXCESS_RXT); 5706 lt_bw = rack_get_lt_bw(rack); 5707 if (lt_bw == 0) { 5708 /* 5709 * No lt_bw, lets chop things to one MSS 5710 * and the ssthresh to the iwnd. 
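 *
 * (When lt_bw is available the clamp below is just a BDP;
 *  illustratively, an lt_bw of 1,250,000 bytes/sec and a rack_rtt of
 *  40,000 usecs give 1,250,000 * 40,000 / 1,000,000 = 50,000 bytes
 *  for the new cwnd, with ssthresh one byte under it, falling back
 *  to the IW settings if that is less than a single segment.)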
5711 */ 5712 reset_to_iw: 5713 new_cwnd = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 5714 new_ssthresh = tcp_compute_initwnd(tcp_maxseg(tp)); 5715 } else { 5716 rtt = rack->rc_rack_rtt; 5717 if (rtt == 0) { 5718 /* If we have no rack_rtt drop to the IW situation */ 5719 goto reset_to_iw; 5720 } 5721 bdp = lt_bw * (uint64_t)rtt; 5722 bdp /= HPTS_USEC_IN_SEC; 5723 new_cwnd = (uint32_t)bdp; 5724 new_ssthresh = new_cwnd - 1; 5725 if (new_cwnd < ctf_fixed_maxseg(tp)) { 5726 /* Rock bottom, goto IW settings */ 5727 goto reset_to_iw; 5728 } 5729 } 5730 rack->r_cwnd_was_clamped = 1; 5731 rack->r_ctl.num_of_clamps_applied++; 5732 /* Reset the counter fromn now */ 5733 tp->t_bytes_acked = 0; 5734 /* 5735 * Now what about options? 5736 * We look at the bottom 8 bits: 5737 * F = fill cw bit (toggle it if set) 5738 * S = Segment bits 5739 * M = set max segment bit 5740 * 5741 * SSSS SSMF 5742 */ 5743 if (rack->r_ctl.clamp_options) { 5744 if (rack->r_ctl.clamp_options & 0x1) { 5745 if ((rack->rc_pace_to_cwnd == 0) && (rack->dgp_on == 0)) { 5746 /* turn on fill cw for non-dgp*/ 5747 rack->rc_pace_to_cwnd = 1; 5748 } else if ((rack->dgp_on == 1) && (rack->rc_pace_to_cwnd == 1)) { 5749 /* For DGP we want it off */ 5750 rack->rc_pace_to_cwnd = 0; 5751 } 5752 } 5753 } 5754 if (rack->dgp_on) { 5755 /* Reset all multipliers to 100.0 so just the measured bw */ 5756 /* Crash any per boosts down to 100% */ 5757 rack->r_ctl.rack_per_of_gp_rec = 100; 5758 rack->r_ctl.rack_per_of_gp_ss = 100; 5759 rack->r_ctl.rack_per_of_gp_ca = 100; 5760 /* Set in an upper bound for ss/ca % increase */ 5761 rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_clamp_ss_upper; 5762 rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_clamp_ca_upper; 5763 /* Now move to the lt_bw */ 5764 rack->r_ctl.gp_bw = lt_bw; 5765 rack->rc_gp_filled = 1; 5766 rack->r_ctl.num_measurements = RACK_REQ_AVG; 5767 } 5768 if (tcp_bblogging_on(rack->rc_tp)) { 5769 union tcp_log_stackspecific log; 5770 struct timeval tv; 5771 5772 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 5773 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 5774 log.u_bbr.flex1 = new_cwnd; 5775 log.u_bbr.flex2 = new_ssthresh; 5776 log.u_bbr.flex3 = rnds; 5777 log.u_bbr.flex4 = rack_rxt_min_rnds; 5778 log.u_bbr.flex5 = rtt; 5779 log.u_bbr.flex6 = shared_cwnd_was_enabled; 5780 log.u_bbr.flex8 = 5; 5781 log.u_bbr.pkt_epoch = rack->r_ctl.rc_pace_max_segs; 5782 log.u_bbr.bbr_state = rack->rc_pace_to_cwnd; 5783 log.u_bbr.delivered = rack->r_ctl.num_of_clamps_applied; 5784 log.u_bbr.applimited = rack->r_ctl.max_clamps; 5785 log.u_bbr.epoch = rack->r_ctl.clamp_options; 5786 log.u_bbr.cur_del_rate = rxts; 5787 log.u_bbr.delRate = snds; 5788 log.u_bbr.rttProp = rack->r_ctl.rxt_threshold; 5789 log.u_bbr.bw_inuse = lt_bw; 5790 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 5791 log.u_bbr.lt_epoch = (uint32_t)((rack->r_ctl.gp_bw >> 32) & 0x00000000ffffffff); 5792 log.u_bbr.pkts_out = (uint32_t)(rack->r_ctl.gp_bw & 0x00000000ffffffff); 5793 tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 5794 0, &log, false, NULL, NULL, 0, &tv); 5795 } 5796 /* Update our point where we did it */ 5797 if (rack->r_ctl.already_had_a_excess == 0) { 5798 rack->r_ctl.already_had_a_excess = 1; 5799 counter_u64_add(rack_rxt_clamps_cwnd_uniq, 1); 5800 } 5801 counter_u64_add(rack_rxt_clamps_cwnd, 1); 5802 rack->r_ctl.last_sndbytes = tp->t_sndbytes; 5803 rack->r_ctl.last_snd_rxt_bytes = tp->t_snd_rxt_bytes; 5804 rack->r_ctl.last_rnd_rxt_clamped = rack->r_ctl.current_round; 5805 if (new_cwnd < 
tp->snd_cwnd) 5806 tp->snd_cwnd = new_cwnd; 5807 if (new_ssthresh < tp->snd_ssthresh) 5808 tp->snd_ssthresh = new_ssthresh; 5809 } 5810 } 5811 } 5812 5813 static void 5814 rack_post_recovery(struct tcpcb *tp, uint32_t th_ack) 5815 { 5816 struct tcp_rack *rack; 5817 uint32_t orig_cwnd; 5818 5819 orig_cwnd = tp->snd_cwnd; 5820 INP_WLOCK_ASSERT(tptoinpcb(tp)); 5821 rack = (struct tcp_rack *)tp->t_fb_ptr; 5822 /* only alert CC if we alerted when we entered */ 5823 if (CC_ALGO(tp)->post_recovery != NULL) { 5824 tp->t_ccv.curack = th_ack; 5825 CC_ALGO(tp)->post_recovery(&tp->t_ccv); 5826 if (tp->snd_cwnd < tp->snd_ssthresh) { 5827 /* 5828 * Rack has burst control and pacing 5829 * so lets not set this any lower than 5830 * snd_ssthresh per RFC-6582 (option 2). 5831 */ 5832 tp->snd_cwnd = tp->snd_ssthresh; 5833 } 5834 } 5835 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 5836 union tcp_log_stackspecific log; 5837 struct timeval tv; 5838 5839 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 5840 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 5841 log.u_bbr.flex1 = th_ack; 5842 log.u_bbr.flex2 = tp->t_ccv.flags; 5843 log.u_bbr.flex3 = tp->t_ccv.bytes_this_ack; 5844 log.u_bbr.flex4 = tp->t_ccv.nsegs; 5845 log.u_bbr.flex5 = V_tcp_abc_l_var; 5846 log.u_bbr.flex6 = orig_cwnd; 5847 log.u_bbr.flex7 = V_tcp_do_newsack; 5848 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 5849 log.u_bbr.flex8 = 2; 5850 tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 5851 0, &log, false, NULL, __func__, __LINE__, &tv); 5852 } 5853 if ((rack->rack_no_prr == 0) && 5854 (rack->no_prr_addback == 0) && 5855 (rack->r_ctl.rc_prr_sndcnt > 0)) { 5856 /* 5857 * Suck the next prr cnt back into cwnd, but 5858 * only do that if we are not application limited. 5859 */ 5860 if (ctf_outstanding(tp) <= sbavail(&tptosocket(tp)->so_snd)) { 5861 /* 5862 * We are allowed to add back to the cwnd the amount we did 5863 * not get out if: 5864 * a) no_prr_addback is off. 5865 * b) we are not app limited 5866 * c) we are doing prr 5867 * <and> 5868 * d) it is bounded by rack_prr_addbackmax (if addback is 0, then none). 
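 *
 * (Illustrative numbers for the min() below: with a 1448 byte
 *  maxseg, rack_prr_addbackmax set to 2 and 5000 bytes of unspent
 *  prr send count, only min(2 * 1448, 5000) = 2896 bytes get added
 *  back to the cwnd.)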
5869 */ 5870 tp->snd_cwnd += min((ctf_fixed_maxseg(tp) * rack_prr_addbackmax), 5871 rack->r_ctl.rc_prr_sndcnt); 5872 } 5873 rack->r_ctl.rc_prr_sndcnt = 0; 5874 rack_log_to_prr(rack, 1, 0, __LINE__); 5875 } 5876 rack_log_to_prr(rack, 14, orig_cwnd, __LINE__); 5877 tp->snd_recover = tp->snd_una; 5878 if (rack->r_ctl.dsack_persist) { 5879 rack->r_ctl.dsack_persist--; 5880 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) { 5881 rack->r_ctl.num_dsack = 0; 5882 } 5883 rack_log_dsack_event(rack, 1, __LINE__, 0, 0); 5884 } 5885 EXIT_RECOVERY(tp->t_flags); 5886 if (rack->r_ctl.full_dgp_in_rec) 5887 rack_client_buffer_level_set(rack); 5888 } 5889 5890 static void 5891 rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack, int line) 5892 { 5893 struct tcp_rack *rack; 5894 uint32_t ssthresh_enter, cwnd_enter, in_rec_at_entry, orig_cwnd; 5895 5896 INP_WLOCK_ASSERT(tptoinpcb(tp)); 5897 #ifdef STATS 5898 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type); 5899 #endif 5900 if (IN_RECOVERY(tp->t_flags) == 0) { 5901 in_rec_at_entry = 0; 5902 ssthresh_enter = tp->snd_ssthresh; 5903 cwnd_enter = tp->snd_cwnd; 5904 } else 5905 in_rec_at_entry = 1; 5906 rack = (struct tcp_rack *)tp->t_fb_ptr; 5907 switch (type) { 5908 case CC_NDUPACK: 5909 tp->t_flags &= ~TF_WASFRECOVERY; 5910 tp->t_flags &= ~TF_WASCRECOVERY; 5911 if (!IN_FASTRECOVERY(tp->t_flags)) { 5912 if (rack->dgp_on && rack->r_cwnd_was_clamped) { 5913 /* Reset the gains so that on exit we will be softer longer */ 5914 rack->r_ctl.rack_per_of_gp_rec = 100; 5915 rack->r_ctl.rack_per_of_gp_ss = 98; 5916 rack->r_ctl.rack_per_of_gp_ca = 98; 5917 } 5918 rack->r_ctl.rc_prr_delivered = 0; 5919 rack->r_ctl.rc_prr_out = 0; 5920 rack->r_fast_output = 0; 5921 if (rack->rack_no_prr == 0) { 5922 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 5923 rack_log_to_prr(rack, 2, in_rec_at_entry, line); 5924 } 5925 rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una; 5926 tp->snd_recover = tp->snd_max; 5927 if (tp->t_flags2 & TF2_ECN_PERMIT) 5928 tp->t_flags2 |= TF2_ECN_SND_CWR; 5929 } 5930 break; 5931 case CC_ECN: 5932 if (!IN_CONGRECOVERY(tp->t_flags) || 5933 /* 5934 * Allow ECN reaction on ACK to CWR, if 5935 * that data segment was also CE marked. 5936 */ 5937 SEQ_GEQ(ack, tp->snd_recover)) { 5938 EXIT_CONGRECOVERY(tp->t_flags); 5939 KMOD_TCPSTAT_INC(tcps_ecn_rcwnd); 5940 rack->r_fast_output = 0; 5941 tp->snd_recover = tp->snd_max + 1; 5942 if (tp->t_flags2 & TF2_ECN_PERMIT) 5943 tp->t_flags2 |= TF2_ECN_SND_CWR; 5944 } 5945 break; 5946 case CC_RTO: 5947 tp->t_dupacks = 0; 5948 tp->t_bytes_acked = 0; 5949 rack->r_fast_output = 0; 5950 EXIT_RECOVERY(tp->t_flags); 5951 if (tp->t_rxtshift == 1) { 5952 tp->snd_ssthresh = max(2, 5953 min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 / 5954 ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp); 5955 } 5956 orig_cwnd = tp->snd_cwnd; 5957 tp->snd_cwnd = ctf_fixed_maxseg(tp); 5958 rack_log_to_prr(rack, 16, orig_cwnd, line); 5959 if (tp->t_flags2 & TF2_ECN_PERMIT) 5960 tp->t_flags2 |= TF2_ECN_SND_CWR; 5961 break; 5962 case CC_RTO_ERR: 5963 KMOD_TCPSTAT_INC(tcps_sndrexmitbad); 5964 /* RTO was unnecessary, so reset everything. 
*/ 5965 tp->snd_cwnd = tp->snd_cwnd_prev; 5966 tp->snd_ssthresh = tp->snd_ssthresh_prev; 5967 tp->snd_recover = tp->snd_recover_prev; 5968 if (tp->t_flags & TF_WASFRECOVERY) { 5969 ENTER_FASTRECOVERY(tp->t_flags); 5970 tp->t_flags &= ~TF_WASFRECOVERY; 5971 } 5972 if (tp->t_flags & TF_WASCRECOVERY) { 5973 ENTER_CONGRECOVERY(tp->t_flags); 5974 tp->t_flags &= ~TF_WASCRECOVERY; 5975 } 5976 tp->snd_nxt = tp->snd_max; 5977 tp->t_badrxtwin = 0; 5978 break; 5979 } 5980 if ((CC_ALGO(tp)->cong_signal != NULL) && 5981 (type != CC_RTO)){ 5982 tp->t_ccv.curack = ack; 5983 CC_ALGO(tp)->cong_signal(&tp->t_ccv, type); 5984 } 5985 if ((in_rec_at_entry == 0) && IN_RECOVERY(tp->t_flags)) { 5986 rack_log_to_prr(rack, 15, cwnd_enter, line); 5987 if (rack->r_ctl.full_dgp_in_rec) 5988 rack_client_buffer_level_set(rack); 5989 rack->r_ctl.dsack_byte_cnt = 0; 5990 rack->r_ctl.retran_during_recovery = 0; 5991 rack->r_ctl.rc_cwnd_at_erec = cwnd_enter; 5992 rack->r_ctl.rc_ssthresh_at_erec = ssthresh_enter; 5993 rack->r_ent_rec_ns = 1; 5994 } 5995 } 5996 5997 static inline void 5998 rack_cc_after_idle(struct tcp_rack *rack, struct tcpcb *tp) 5999 { 6000 uint32_t i_cwnd; 6001 6002 INP_WLOCK_ASSERT(tptoinpcb(tp)); 6003 6004 if (CC_ALGO(tp)->after_idle != NULL) 6005 CC_ALGO(tp)->after_idle(&tp->t_ccv); 6006 6007 if (tp->snd_cwnd == 1) 6008 i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ 6009 else 6010 i_cwnd = rc_init_window(rack); 6011 6012 /* 6013 * Being idle is no different than the initial window. If the cc 6014 * clamps it down below the initial window raise it to the initial 6015 * window. 6016 */ 6017 if (tp->snd_cwnd < i_cwnd) { 6018 tp->snd_cwnd = i_cwnd; 6019 } 6020 } 6021 6022 /* 6023 * Indicate whether this ack should be delayed. We can delay the ack if 6024 * following conditions are met: 6025 * - There is no delayed ack timer in progress. 6026 * - Our last ack wasn't a 0-sized window. We never want to delay 6027 * the ack that opens up a 0-sized window. 6028 * - LRO wasn't used for this segment. We make sure by checking that the 6029 * segment size is not larger than the MSS. 6030 * - Delayed acks are enabled or this is a half-synchronized T/TCP 6031 * connection. 6032 */ 6033 #define DELAY_ACK(tp, tlen) \ 6034 (((tp->t_flags & TF_RXWIN0SENT) == 0) && \ 6035 ((tp->t_flags & TF_DELACK) == 0) && \ 6036 (tlen <= tp->t_maxseg) && \ 6037 (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN))) 6038 6039 static struct rack_sendmap * 6040 rack_find_lowest_rsm(struct tcp_rack *rack) 6041 { 6042 struct rack_sendmap *rsm; 6043 6044 /* 6045 * Walk the time-order transmitted list looking for an rsm that is 6046 * not acked. This will be the one that was sent the longest time 6047 * ago that is still outstanding. 6048 */ 6049 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 6050 if (rsm->r_flags & RACK_ACKED) { 6051 continue; 6052 } 6053 goto finish; 6054 } 6055 finish: 6056 return (rsm); 6057 } 6058 6059 static struct rack_sendmap * 6060 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm) 6061 { 6062 struct rack_sendmap *prsm; 6063 6064 /* 6065 * Walk the sequence order list backward until we hit and arrive at 6066 * the highest seq not acked. In theory when this is called it 6067 * should be the last segment (which it was not). 
6068 */ 6069 prsm = rsm; 6070 6071 TQHASH_FOREACH_REVERSE_FROM(prsm, rack->r_ctl.tqh) { 6072 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) { 6073 continue; 6074 } 6075 return (prsm); 6076 } 6077 return (NULL); 6078 } 6079 6080 static uint32_t 6081 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts) 6082 { 6083 int32_t lro; 6084 uint32_t thresh; 6085 6086 /* 6087 * lro is the flag we use to determine if we have seen reordering. 6088 * If it gets set we have seen reordering. The reorder logic either 6089 * works in one of two ways: 6090 * 6091 * If reorder-fade is configured, then we track the last time we saw 6092 * re-ordering occur. If we reach the point where enough time has 6093 * passed we no longer consider reordering as occurring. 6094 * 6095 * Or if reorder-fade is 0, then once we see reordering we consider 6096 * the connection to always be subject to reordering and just set lro 6097 * to 1. 6098 * 6099 * In the end if lro is non-zero we add the extra time for 6100 * reordering in. 6101 */ 6102 if (srtt == 0) 6103 srtt = 1; 6104 if (rack->r_ctl.rc_reorder_ts) { 6105 if (rack->r_ctl.rc_reorder_fade) { 6106 if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) { 6107 lro = cts - rack->r_ctl.rc_reorder_ts; 6108 if (lro == 0) { 6109 /* 6110 * No time has passed since the last 6111 * reorder, mark it as reordering. 6112 */ 6113 lro = 1; 6114 } 6115 } else { 6116 /* Negative time? */ 6117 lro = 0; 6118 } 6119 if (lro > rack->r_ctl.rc_reorder_fade) { 6120 /* Turn off reordering seen too */ 6121 rack->r_ctl.rc_reorder_ts = 0; 6122 lro = 0; 6123 } 6124 } else { 6125 /* Reordering does not fade */ 6126 lro = 1; 6127 } 6128 } else { 6129 lro = 0; 6130 } 6131 if (rack->rc_rack_tmr_std_based == 0) { 6132 thresh = srtt + rack->r_ctl.rc_pkt_delay; 6133 } else { 6134 /* Standards based pkt-delay is 1/4 srtt */ 6135 thresh = srtt + (srtt >> 2); 6136 } 6137 if (lro && (rack->rc_rack_tmr_std_based == 0)) { 6138 /* It must be set, if not you get 1/4 rtt */ 6139 if (rack->r_ctl.rc_reorder_shift) 6140 thresh += (srtt >> rack->r_ctl.rc_reorder_shift); 6141 else 6142 thresh += (srtt >> 2); 6143 } 6144 if (rack->rc_rack_use_dsack && 6145 lro && 6146 (rack->r_ctl.num_dsack > 0)) { 6147 /* 6148 * We only increase the reordering window if we 6149 * have seen reordering <and> we have a DSACK count.
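 *
 * (Illustrative: with an srtt of 40,000 usecs and two DSACKs seen,
 *  the threshold grows by 2 * (40,000 >> 2) = 20,000 usecs, still
 *  subject to the srtt * 2 ceiling and the RTO max applied just
 *  below.)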
6150 */ 6151 thresh += rack->r_ctl.num_dsack * (srtt >> 2); 6152 rack_log_dsack_event(rack, 4, __LINE__, srtt, thresh); 6153 } 6154 /* SRTT * 2 is the ceiling */ 6155 if (thresh > (srtt * 2)) { 6156 thresh = srtt * 2; 6157 } 6158 /* And we don't want it above the RTO max either */ 6159 if (thresh > rack_rto_max) { 6160 thresh = rack_rto_max; 6161 } 6162 rack_log_dsack_event(rack, 6, __LINE__, srtt, thresh); 6163 return (thresh); 6164 } 6165 6166 static uint32_t 6167 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack, 6168 struct rack_sendmap *rsm, uint32_t srtt) 6169 { 6170 struct rack_sendmap *prsm; 6171 uint32_t thresh, len; 6172 int segsiz; 6173 6174 if (srtt == 0) 6175 srtt = 1; 6176 if (rack->r_ctl.rc_tlp_threshold) 6177 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold); 6178 else 6179 thresh = (srtt * 2); 6180 6181 /* Get the previous sent packet, if any */ 6182 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 6183 len = rsm->r_end - rsm->r_start; 6184 if (rack->rack_tlp_threshold_use == TLP_USE_ID) { 6185 /* Exactly like the ID */ 6186 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= segsiz) { 6187 uint32_t alt_thresh; 6188 /* 6189 * Compensate for delayed-ack with the d-ack time. 6190 */ 6191 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 6192 if (alt_thresh > thresh) 6193 thresh = alt_thresh; 6194 } 6195 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) { 6196 /* 2.1 behavior */ 6197 prsm = TAILQ_PREV(rsm, rack_head, r_tnext); 6198 if (prsm && (len <= segsiz)) { 6199 /* 6200 * Two packets outstanding, thresh should be (2*srtt) + 6201 * possible inter-packet delay (if any). 6202 */ 6203 uint32_t inter_gap = 0; 6204 int idx, nidx; 6205 6206 idx = rsm->r_rtr_cnt - 1; 6207 nidx = prsm->r_rtr_cnt - 1; 6208 if (rsm->r_tim_lastsent[nidx] >= prsm->r_tim_lastsent[idx]) { 6209 /* Yes it was sent later (or at the same time) */ 6210 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; 6211 } 6212 thresh += inter_gap; 6213 } else if (len <= segsiz) { 6214 /* 6215 * Possibly compensate for delayed-ack. 6216 */ 6217 uint32_t alt_thresh; 6218 6219 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 6220 if (alt_thresh > thresh) 6221 thresh = alt_thresh; 6222 } 6223 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) { 6224 /* 2.2 behavior */ 6225 if (len <= segsiz) { 6226 uint32_t alt_thresh; 6227 /* 6228 * Compensate for delayed-ack with the d-ack time. 6229 */ 6230 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 6231 if (alt_thresh > thresh) 6232 thresh = alt_thresh; 6233 } 6234 } 6235 /* Not above an RTO */ 6236 if (thresh > tp->t_rxtcur) { 6237 thresh = tp->t_rxtcur; 6238 } 6239 /* Not above a RTO max */ 6240 if (thresh > rack_rto_max) { 6241 thresh = rack_rto_max; 6242 } 6243 /* Apply user supplied min TLP */ 6244 if (thresh < rack_tlp_min) { 6245 thresh = rack_tlp_min; 6246 } 6247 return (thresh); 6248 } 6249 6250 static uint32_t 6251 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack) 6252 { 6253 /* 6254 * We want the rack_rtt which is the 6255 * last rtt we measured. However if that 6256 * does not exist we fallback to the srtt (which 6257 * we probably will never do) and then as a last 6258 * resort we use RACK_INITIAL_RTO if no srtt is 6259 * yet set. 
6260 */ 6261 if (rack->rc_rack_rtt) 6262 return (rack->rc_rack_rtt); 6263 else if (tp->t_srtt == 0) 6264 return (RACK_INITIAL_RTO); 6265 return (tp->t_srtt); 6266 } 6267 6268 static struct rack_sendmap * 6269 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) 6270 { 6271 /* 6272 * Check to see that we don't need to fall into recovery. We will 6273 * need to do so if our oldest transmit is past the time we should 6274 * have had an ack. 6275 */ 6276 struct tcp_rack *rack; 6277 struct rack_sendmap *rsm; 6278 int32_t idx; 6279 uint32_t srtt, thresh; 6280 6281 rack = (struct tcp_rack *)tp->t_fb_ptr; 6282 if (tqhash_empty(rack->r_ctl.tqh)) { 6283 return (NULL); 6284 } 6285 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 6286 if (rsm == NULL) 6287 return (NULL); 6288 6289 6290 if (rsm->r_flags & RACK_ACKED) { 6291 rsm = rack_find_lowest_rsm(rack); 6292 if (rsm == NULL) 6293 return (NULL); 6294 } 6295 idx = rsm->r_rtr_cnt - 1; 6296 srtt = rack_grab_rtt(tp, rack); 6297 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 6298 if (TSTMP_LT(tsused, ((uint32_t)rsm->r_tim_lastsent[idx]))) { 6299 return (NULL); 6300 } 6301 if ((tsused - ((uint32_t)rsm->r_tim_lastsent[idx])) < thresh) { 6302 return (NULL); 6303 } 6304 /* Ok if we reach here we are over-due and this guy can be sent */ 6305 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__); 6306 return (rsm); 6307 } 6308 6309 static uint32_t 6310 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack) 6311 { 6312 int32_t t; 6313 int32_t tt; 6314 uint32_t ret_val; 6315 6316 t = (tp->t_srtt + (tp->t_rttvar << 2)); 6317 RACK_TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], 6318 rack_persist_min, rack_persist_max, rack->r_ctl.timer_slop); 6319 rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT; 6320 ret_val = (uint32_t)tt; 6321 return (ret_val); 6322 } 6323 6324 static uint32_t 6325 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack) 6326 { 6327 /* 6328 * Start the FR timer, we do this based on getting the first one in 6329 * the rc_tmap. Note that if its NULL we must stop the timer. in all 6330 * events we need to stop the running timer (if its running) before 6331 * starting the new one. 6332 */ 6333 uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse; 6334 uint32_t srtt_cur; 6335 int32_t idx; 6336 int32_t is_tlp_timer = 0; 6337 struct rack_sendmap *rsm; 6338 6339 if (rack->t_timers_stopped) { 6340 /* All timers have been stopped none are to run */ 6341 return (0); 6342 } 6343 if (rack->rc_in_persist) { 6344 /* We can't start any timer in persists */ 6345 return (rack_get_persists_timer_val(tp, rack)); 6346 } 6347 rack->rc_on_min_to = 0; 6348 if ((tp->t_state < TCPS_ESTABLISHED) || 6349 (rack->sack_attack_disable > 0) || 6350 ((tp->t_flags & TF_SACK_PERMIT) == 0)) { 6351 goto activate_rxt; 6352 } 6353 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 6354 if ((rsm == NULL) || sup_rack) { 6355 /* Nothing on the send map or no rack */ 6356 activate_rxt: 6357 time_since_sent = 0; 6358 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 6359 if (rsm) { 6360 /* 6361 * Should we discount the RTX timer any? 6362 * 6363 * We want to discount it the smallest amount. 6364 * If a timer (Rack/TLP or RXT) has gone off more 6365 * recently thats the discount we want to use (now - timer time). 6366 * If the retransmit of the oldest packet was more recent then 6367 * we want to use that (now - oldest-packet-last_transmit_time). 
6368 * 6369 */ 6370 idx = rsm->r_rtr_cnt - 1; 6371 if (TSTMP_GEQ(rack->r_ctl.rc_tlp_rxt_last_time, ((uint32_t)rsm->r_tim_lastsent[idx]))) 6372 tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time; 6373 else 6374 tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx]; 6375 if (TSTMP_GT(cts, tstmp_touse)) 6376 time_since_sent = cts - tstmp_touse; 6377 } 6378 if (SEQ_LT(tp->snd_una, tp->snd_max) || 6379 sbavail(&tptosocket(tp)->so_snd)) { 6380 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; 6381 to = tp->t_rxtcur; 6382 if (to > time_since_sent) 6383 to -= time_since_sent; 6384 else 6385 to = rack->r_ctl.rc_min_to; 6386 if (to == 0) 6387 to = 1; 6388 /* Special case for KEEPINIT */ 6389 if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) && 6390 (TP_KEEPINIT(tp) != 0) && 6391 rsm) { 6392 /* 6393 * We have to put a ceiling on the rxt timer 6394 * of the keep-init timeout. 6395 */ 6396 uint32_t max_time, red; 6397 6398 max_time = TICKS_2_USEC(TP_KEEPINIT(tp)); 6399 if (TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) { 6400 red = (cts - (uint32_t)rsm->r_tim_lastsent[0]); 6401 if (red < max_time) 6402 max_time -= red; 6403 else 6404 max_time = 1; 6405 } 6406 /* Reduce timeout to the keep value if needed */ 6407 if (max_time < to) 6408 to = max_time; 6409 } 6410 return (to); 6411 } 6412 return (0); 6413 } 6414 if (rsm->r_flags & RACK_ACKED) { 6415 rsm = rack_find_lowest_rsm(rack); 6416 if (rsm == NULL) { 6417 /* No lowest? */ 6418 goto activate_rxt; 6419 } 6420 } 6421 if (rack->sack_attack_disable) { 6422 /* 6423 * We don't want to do 6424 * any TLP's if you are an attacker. 6425 * Though if you are doing what 6426 * is expected you may still have 6427 * SACK-PASSED marks. 6428 */ 6429 goto activate_rxt; 6430 } 6431 /* Convert from ms to usecs */ 6432 if ((rsm->r_flags & RACK_SACK_PASSED) || 6433 (rsm->r_flags & RACK_RWND_COLLAPSED) || 6434 (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { 6435 if ((tp->t_flags & TF_SENTFIN) && 6436 ((tp->snd_max - tp->snd_una) == 1) && 6437 (rsm->r_flags & RACK_HAS_FIN)) { 6438 /* 6439 * We don't start a rack timer if all we have is a 6440 * FIN outstanding. 6441 */ 6442 goto activate_rxt; 6443 } 6444 if ((rack->use_rack_rr == 0) && 6445 (IN_FASTRECOVERY(tp->t_flags)) && 6446 (rack->rack_no_prr == 0) && 6447 (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { 6448 /* 6449 * We are not cheating, in recovery and 6450 * not enough ack's to yet get our next 6451 * retransmission out. 6452 * 6453 * Note that classified attackers do not 6454 * get to use the rack-cheat. 6455 */ 6456 goto activate_tlp; 6457 } 6458 srtt = rack_grab_rtt(tp, rack); 6459 thresh = rack_calc_thresh_rack(rack, srtt, cts); 6460 idx = rsm->r_rtr_cnt - 1; 6461 exp = ((uint32_t)rsm->r_tim_lastsent[idx]) + thresh; 6462 if (SEQ_GEQ(exp, cts)) { 6463 to = exp - cts; 6464 if (to < rack->r_ctl.rc_min_to) { 6465 to = rack->r_ctl.rc_min_to; 6466 if (rack->r_rr_config == 3) 6467 rack->rc_on_min_to = 1; 6468 } 6469 } else { 6470 to = rack->r_ctl.rc_min_to; 6471 if (rack->r_rr_config == 3) 6472 rack->rc_on_min_to = 1; 6473 } 6474 } else { 6475 /* Ok we need to do a TLP not RACK */ 6476 activate_tlp: 6477 if ((rack->rc_tlp_in_progress != 0) && 6478 (rack->r_ctl.rc_tlp_cnt_out >= rack_tlp_limit)) { 6479 /* 6480 * The previous send was a TLP and we have sent 6481 * N TLP's without sending new data. 6482 */ 6483 goto activate_rxt; 6484 } 6485 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 6486 if (rsm == NULL) { 6487 /* We found no rsm to TLP with. 
*/ 6488 goto activate_rxt; 6489 } 6490 if (rsm->r_flags & RACK_HAS_FIN) { 6491 /* If it's a FIN we don't do TLP */ 6492 rsm = NULL; 6493 goto activate_rxt; 6494 } 6495 idx = rsm->r_rtr_cnt - 1; 6496 time_since_sent = 0; 6497 if (TSTMP_GEQ(((uint32_t)rsm->r_tim_lastsent[idx]), rack->r_ctl.rc_tlp_rxt_last_time)) 6498 tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx]; 6499 else 6500 tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time; 6501 if (TSTMP_GT(cts, tstmp_touse)) 6502 time_since_sent = cts - tstmp_touse; 6503 is_tlp_timer = 1; 6504 if (tp->t_srtt) { 6505 if ((rack->rc_srtt_measure_made == 0) && 6506 (tp->t_srtt == 1)) { 6507 /* 6508 * If another stack has run and set srtt to 1, 6509 * then the srtt was 0, so let's use the initial. 6510 */ 6511 srtt = RACK_INITIAL_RTO; 6512 } else { 6513 srtt_cur = tp->t_srtt; 6514 srtt = srtt_cur; 6515 } 6516 } else 6517 srtt = RACK_INITIAL_RTO; 6518 /* 6519 * If the SRTT is not keeping up and the 6520 * rack RTT has spiked we want to use 6521 * the last RTT not the smoothed one. 6522 */ 6523 if (rack_tlp_use_greater && 6524 tp->t_srtt && 6525 (srtt < rack_grab_rtt(tp, rack))) { 6526 srtt = rack_grab_rtt(tp, rack); 6527 } 6528 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt); 6529 if (thresh > time_since_sent) { 6530 to = thresh - time_since_sent; 6531 } else { 6532 to = rack->r_ctl.rc_min_to; 6533 rack_log_alt_to_to_cancel(rack, 6534 thresh, /* flex1 */ 6535 time_since_sent, /* flex2 */ 6536 tstmp_touse, /* flex3 */ 6537 rack->r_ctl.rc_tlp_rxt_last_time, /* flex4 */ 6538 (uint32_t)rsm->r_tim_lastsent[idx], 6539 srtt, 6540 idx, 99); 6541 } 6542 if (to < rack_tlp_min) { 6543 to = rack_tlp_min; 6544 } 6545 if (to > TICKS_2_USEC(TCPTV_REXMTMAX)) { 6546 /* 6547 * If the TLP time works out to be larger than the max 6548 * RTO, let's not do TLP; just RTO. 6549 */ 6550 goto activate_rxt; 6551 } 6552 } 6553 if (is_tlp_timer == 0) { 6554 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; 6555 } else { 6556 rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; 6557 } 6558 if (to == 0) 6559 to = 1; 6560 return (to); 6561 } 6562 6563 static void 6564 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, tcp_seq snd_una) 6565 { 6566 struct timeval tv; 6567 6568 if (rack->rc_in_persist == 0) { 6569 if (tp->t_flags & TF_GPUTINPROG) { 6570 /* 6571 * Stop the goodput now; the call to the 6572 * measurement function clears the flag.
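* (The measurement is tagged RACK_QUALITY_PERSIST when it is
* ended by the call below.)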
6573 */ 6574 rack_do_goodput_measurement(tp, rack, tp->snd_una, __LINE__, 6575 RACK_QUALITY_PERSIST); 6576 } 6577 #ifdef NETFLIX_SHARED_CWND 6578 if (rack->r_ctl.rc_scw) { 6579 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 6580 rack->rack_scwnd_is_idle = 1; 6581 } 6582 #endif 6583 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(&tv); 6584 if (rack->lt_bw_up) { 6585 /* Suspend our LT BW measurement */ 6586 uint64_t tmark; 6587 6588 rack->r_ctl.lt_bw_bytes += (snd_una - rack->r_ctl.lt_seq); 6589 rack->r_ctl.lt_seq = snd_una; 6590 tmark = tcp_tv_to_lusectick(&tv); 6591 rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark); 6592 rack->r_ctl.lt_timemark = tmark; 6593 rack->lt_bw_up = 0; 6594 rack->r_persist_lt_bw_off = 1; 6595 } 6596 if (rack->r_ctl.rc_went_idle_time == 0) 6597 rack->r_ctl.rc_went_idle_time = 1; 6598 rack_timer_cancel(tp, rack, cts, __LINE__); 6599 rack->r_ctl.persist_lost_ends = 0; 6600 rack->probe_not_answered = 0; 6601 rack->forced_ack = 0; 6602 tp->t_rxtshift = 0; 6603 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 6604 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 6605 rack->rc_in_persist = 1; 6606 } 6607 } 6608 6609 static void 6610 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 6611 { 6612 struct timeval tv; 6613 uint32_t t_time; 6614 6615 if (tcp_in_hpts(rack->rc_tp)) { 6616 tcp_hpts_remove(rack->rc_tp); 6617 rack->r_ctl.rc_hpts_flags = 0; 6618 } 6619 #ifdef NETFLIX_SHARED_CWND 6620 if (rack->r_ctl.rc_scw) { 6621 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 6622 rack->rack_scwnd_is_idle = 0; 6623 } 6624 #endif 6625 t_time = tcp_get_usecs(&tv); 6626 if (rack->rc_gp_dyn_mul && 6627 (rack->use_fixed_rate == 0) && 6628 (rack->rc_always_pace)) { 6629 /* 6630 * Do we count this as if a probe-rtt just 6631 * finished? 6632 */ 6633 uint32_t time_idle, idle_min; 6634 6635 time_idle = t_time - rack->r_ctl.rc_went_idle_time; 6636 idle_min = rack_min_probertt_hold; 6637 if (rack_probertt_gpsrtt_cnt_div) { 6638 uint64_t extra; 6639 extra = (uint64_t)rack->r_ctl.rc_gp_srtt * 6640 (uint64_t)rack_probertt_gpsrtt_cnt_mul; 6641 extra /= (uint64_t)rack_probertt_gpsrtt_cnt_div; 6642 idle_min += (uint32_t)extra; 6643 } 6644 if (time_idle >= idle_min) { 6645 /* Yes, we count it as a probe-rtt. 
*/ 6646 uint32_t us_cts; 6647 6648 us_cts = tcp_get_usecs(NULL); 6649 if (rack->in_probe_rtt == 0) { 6650 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 6651 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; 6652 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; 6653 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; 6654 } else { 6655 rack_exit_probertt(rack, us_cts); 6656 } 6657 } 6658 } 6659 if (rack->r_persist_lt_bw_off) { 6660 /* Continue where we left off */ 6661 rack->r_ctl.lt_timemark = tcp_tv_to_lusectick(&tv); 6662 rack->lt_bw_up = 1; 6663 rack->r_persist_lt_bw_off = 0; 6664 } 6665 rack->rc_in_persist = 0; 6666 rack->r_ctl.rc_went_idle_time = 0; 6667 tp->t_rxtshift = 0; 6668 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 6669 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 6670 rack->r_ctl.rc_agg_delayed = 0; 6671 rack->r_early = 0; 6672 rack->r_late = 0; 6673 rack->r_ctl.rc_agg_early = 0; 6674 } 6675 6676 static void 6677 rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts, 6678 struct hpts_diag *diag, struct timeval *tv) 6679 { 6680 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 6681 union tcp_log_stackspecific log; 6682 6683 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 6684 log.u_bbr.flex1 = diag->p_nxt_slot; 6685 log.u_bbr.flex2 = diag->p_cur_slot; 6686 log.u_bbr.flex3 = diag->slot_req; 6687 log.u_bbr.flex4 = diag->inp_hptsslot; 6688 log.u_bbr.flex5 = diag->slot_remaining; 6689 log.u_bbr.flex6 = diag->need_new_to; 6690 log.u_bbr.flex7 = diag->p_hpts_active; 6691 log.u_bbr.flex8 = diag->p_on_min_sleep; 6692 /* Hijack other fields as needed */ 6693 log.u_bbr.epoch = diag->have_slept; 6694 log.u_bbr.lt_epoch = diag->yet_to_sleep; 6695 log.u_bbr.pkts_out = diag->co_ret; 6696 log.u_bbr.applimited = diag->hpts_sleep_time; 6697 log.u_bbr.delivered = diag->p_prev_slot; 6698 log.u_bbr.inflight = diag->p_runningslot; 6699 log.u_bbr.bw_inuse = diag->wheel_slot; 6700 log.u_bbr.rttProp = diag->wheel_cts; 6701 log.u_bbr.timeStamp = cts; 6702 log.u_bbr.delRate = diag->maxslots; 6703 log.u_bbr.cur_del_rate = diag->p_curtick; 6704 log.u_bbr.cur_del_rate <<= 32; 6705 log.u_bbr.cur_del_rate |= diag->p_lasttick; 6706 TCP_LOG_EVENTP(rack->rc_tp, NULL, 6707 &rack->rc_inp->inp_socket->so_rcv, 6708 &rack->rc_inp->inp_socket->so_snd, 6709 BBR_LOG_HPTSDIAG, 0, 6710 0, &log, false, tv); 6711 } 6712 6713 } 6714 6715 static void 6716 rack_log_wakeup(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb, uint32_t len, int type) 6717 { 6718 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 6719 union tcp_log_stackspecific log; 6720 struct timeval tv; 6721 6722 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 6723 log.u_bbr.flex1 = sb->sb_flags; 6724 log.u_bbr.flex2 = len; 6725 log.u_bbr.flex3 = sb->sb_state; 6726 log.u_bbr.flex8 = type; 6727 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 6728 TCP_LOG_EVENTP(rack->rc_tp, NULL, 6729 &rack->rc_inp->inp_socket->so_rcv, 6730 &rack->rc_inp->inp_socket->so_snd, 6731 TCP_LOG_SB_WAKE, 0, 6732 len, &log, false, &tv); 6733 } 6734 } 6735 6736 static void 6737 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, 6738 int32_t slot, uint32_t tot_len_this_send, int sup_rack) 6739 { 6740 struct hpts_diag diag; 6741 struct inpcb *inp = tptoinpcb(tp); 6742 struct timeval tv; 6743 uint32_t delayed_ack = 0; 6744 uint32_t hpts_timeout; 6745 uint32_t entry_slot = slot; 6746 uint8_t stopped; 6747 uint32_t left = 0; 6748 uint32_t us_cts; 6749 6750 if ((tp->t_state == 
TCPS_CLOSED) || 6751 (tp->t_state == TCPS_LISTEN)) { 6752 return; 6753 } 6754 if (tcp_in_hpts(tp)) { 6755 /* Already on the pacer */ 6756 return; 6757 } 6758 stopped = rack->rc_tmr_stopped; 6759 if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { 6760 left = rack->r_ctl.rc_timer_exp - cts; 6761 } 6762 rack->r_ctl.rc_timer_exp = 0; 6763 rack->r_ctl.rc_hpts_flags = 0; 6764 us_cts = tcp_get_usecs(&tv); 6765 /* Now early/late accounting */ 6766 rack_log_pacing_delay_calc(rack, entry_slot, slot, 0, 0, 0, 26, __LINE__, NULL, 0); 6767 if (rack->r_early && (rack->rc_ack_can_sendout_data == 0)) { 6768 /* 6769 * We have an early carry-over set; 6770 * we can always add more time so we 6771 * can always make this compensation. 6772 * 6773 * Note that if acks are allowed to wake us we do not 6774 * penalize the next timer for being awakened 6775 * by an ack, aka the rc_agg_early (non-paced mode). 6776 */ 6777 slot += rack->r_ctl.rc_agg_early; 6778 rack->r_early = 0; 6779 rack->r_ctl.rc_agg_early = 0; 6780 } 6781 if (rack->r_late) { 6782 /* 6783 * This is harder, we can 6784 * compensate some but it 6785 * really depends on what 6786 * the current pacing time is. 6787 */ 6788 if (rack->r_ctl.rc_agg_delayed >= slot) { 6789 /* 6790 * We can't compensate for it all. 6791 * And we have to have some time 6792 * on the clock. We always have a min 6793 * 10 slots (10 x 10 i.e. 100 usecs). 6794 */ 6795 if (slot <= HPTS_TICKS_PER_SLOT) { 6796 /* We gain delay */ 6797 rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_SLOT - slot); 6798 slot = HPTS_TICKS_PER_SLOT; 6799 } else { 6800 /* We take off some */ 6801 rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_SLOT); 6802 slot = HPTS_TICKS_PER_SLOT; 6803 } 6804 } else { 6805 slot -= rack->r_ctl.rc_agg_delayed; 6806 rack->r_ctl.rc_agg_delayed = 0; 6807 /* Make sure we have 100 useconds at minimum */ 6808 if (slot < HPTS_TICKS_PER_SLOT) { 6809 rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_SLOT - slot; 6810 slot = HPTS_TICKS_PER_SLOT; 6811 } 6812 if (rack->r_ctl.rc_agg_delayed == 0) 6813 rack->r_late = 0; 6814 } 6815 } 6816 hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack); 6817 #ifdef TCP_SAD_DETECTION 6818 if (rack->sack_attack_disable && 6819 (rack->r_ctl.ack_during_sd > 0) && 6820 (slot < tcp_sad_pacing_interval)) { 6821 /* 6822 * We have a potential attacker on 6823 * the line. We have possibly some 6824 * (or no) pacing time set. We want to 6825 * slow down the processing of sacks by some 6826 * amount (if it is an attacker). Set the default 6827 * slot for attackers in place (unless the original 6828 * interval is longer). It's stored in 6829 * micro-seconds, so let's convert to msecs. 6830 */ 6831 slot = tcp_sad_pacing_interval; 6832 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv, __LINE__); 6833 rack->r_ctl.ack_during_sd = 0; 6834 } 6835 #endif 6836 if (tp->t_flags & TF_DELACK) { 6837 delayed_ack = TICKS_2_USEC(tcp_delacktime); 6838 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; 6839 } 6840 if (delayed_ack && ((hpts_timeout == 0) || 6841 (delayed_ack < hpts_timeout))) 6842 hpts_timeout = delayed_ack; 6843 else 6844 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 6845 /* 6846 * If no timers are going to run and we will fall off the hptsi 6847 * wheel, we resort to a keep-alive timer if it's configured.
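* (That is either the established keep-idle time or the keep-init
* time, converted from ticks to usecs.)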
6848 */ 6849 if ((hpts_timeout == 0) && 6850 (slot == 0)) { 6851 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 6852 (tp->t_state <= TCPS_CLOSING)) { 6853 /* 6854 * Ok we have no timer (persists, rack, tlp, rxt or 6855 * del-ack), we don't have segments being paced. So 6856 * all that is left is the keepalive timer. 6857 */ 6858 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 6859 /* Get the established keep-alive time */ 6860 hpts_timeout = TICKS_2_USEC(TP_KEEPIDLE(tp)); 6861 } else { 6862 /* 6863 * Get the initial setup keep-alive time, 6864 * note that this is probably not going to 6865 * happen, since rack will be running a rxt timer 6866 * if a SYN of some sort is outstanding. It is 6867 * actually handled in rack_timeout_rxt(). 6868 */ 6869 hpts_timeout = TICKS_2_USEC(TP_KEEPINIT(tp)); 6870 } 6871 rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; 6872 if (rack->in_probe_rtt) { 6873 /* 6874 * We want to instead not wake up a long time from 6875 * now but to wake up about the time we would 6876 * exit probe-rtt and initiate a keep-alive ack. 6877 * This will get us out of probe-rtt and update 6878 * our min-rtt. 6879 */ 6880 hpts_timeout = rack_min_probertt_hold; 6881 } 6882 } 6883 } 6884 if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) == 6885 (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) { 6886 /* 6887 * RACK, TLP, persists and RXT timers all are restartable 6888 * based on actions input .. i.e we received a packet (ack 6889 * or sack) and that changes things (rw, or snd_una etc). 6890 * Thus we can restart them with a new value. For 6891 * keep-alive, delayed_ack we keep track of what was left 6892 * and restart the timer with a smaller value. 6893 */ 6894 if (left < hpts_timeout) 6895 hpts_timeout = left; 6896 } 6897 if (hpts_timeout) { 6898 /* 6899 * Hack alert for now we can't time-out over 2,147,483 6900 * seconds (a bit more than 596 hours), which is probably ok 6901 * :). 6902 */ 6903 if (hpts_timeout > 0x7ffffffe) 6904 hpts_timeout = 0x7ffffffe; 6905 rack->r_ctl.rc_timer_exp = cts + hpts_timeout; 6906 } 6907 rack_log_pacing_delay_calc(rack, entry_slot, slot, hpts_timeout, 0, 0, 27, __LINE__, NULL, 0); 6908 if ((rack->gp_ready == 0) && 6909 (rack->use_fixed_rate == 0) && 6910 (hpts_timeout < slot) && 6911 (rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) { 6912 /* 6913 * We have no good estimate yet for the 6914 * old clunky burst mitigation or the 6915 * real pacing. And the tlp or rxt is smaller 6916 * than the pacing calculation. Lets not 6917 * pace that long since we know the calculation 6918 * so far is not accurate. 6919 */ 6920 slot = hpts_timeout; 6921 } 6922 /** 6923 * Turn off all the flags for queuing by default. The 6924 * flags have important meanings to what happens when 6925 * LRO interacts with the transport. Most likely (by default now) 6926 * mbuf_queueing and ack compression are on. So the transport 6927 * has a couple of flags that control what happens (if those 6928 * are not on then these flags won't have any effect since it 6929 * won't go through the queuing LRO path). 6930 * 6931 * TF2_MBUF_QUEUE_READY - This flags says that I am busy 6932 * pacing output, so don't disturb. But 6933 * it also means LRO can wake me if there 6934 * is a SACK arrival. 6935 * 6936 * TF2_DONT_SACK_QUEUE - This flag is used in conjunction 6937 * with the above flag (QUEUE_READY) and 6938 * when present it says don't even wake me 6939 * if a SACK arrives. 
6940 * 6941 * The idea behind these flags is that if we are pacing we 6942 * set the MBUF_QUEUE_READY and only get woken up if 6943 * a SACK arrives (which could change things) or if 6944 * our pacing timer expires. If, however, we have a rack 6945 * timer running, then we don't even want a sack to wake 6946 * us since the rack timer has to expire before we can send. 6947 * 6948 * Other cases should usually have none of the flags set 6949 * so LRO can call into us. 6950 */ 6951 tp->t_flags2 &= ~(TF2_DONT_SACK_QUEUE|TF2_MBUF_QUEUE_READY); 6952 if (slot) { 6953 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; 6954 rack->r_ctl.rc_last_output_to = us_cts + slot; 6955 /* 6956 * A pacing timer (slot) is being set, in 6957 * such a case we cannot send (we are blocked by 6958 * the timer). So lets tell LRO that it should not 6959 * wake us unless there is a SACK. Note this only 6960 * will be effective if mbuf queueing is on or 6961 * compressed acks are being processed. 6962 */ 6963 tp->t_flags2 |= TF2_MBUF_QUEUE_READY; 6964 /* 6965 * But wait if we have a Rack timer running 6966 * even a SACK should not disturb us (with 6967 * the exception of r_rr_config 3). 6968 */ 6969 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) || 6970 (IN_RECOVERY(tp->t_flags))) { 6971 if (rack->r_rr_config != 3) 6972 tp->t_flags2 |= TF2_DONT_SACK_QUEUE; 6973 else if (rack->rc_pace_dnd) { 6974 /* 6975 * When DND is on, we only let a sack 6976 * interrupt us if we are not in recovery. 6977 * 6978 * If DND is off, then we never hit here 6979 * and let all sacks wake us up. 6980 * 6981 */ 6982 tp->t_flags2 |= TF2_DONT_SACK_QUEUE; 6983 } 6984 } 6985 /* For sack attackers we want to ignore sack */ 6986 if (rack->sack_attack_disable == 1) { 6987 tp->t_flags2 |= (TF2_DONT_SACK_QUEUE | 6988 TF2_MBUF_QUEUE_READY); 6989 } else if (rack->rc_ack_can_sendout_data) { 6990 /* 6991 * Ahh but wait, this is that special case 6992 * where the pacing timer can be disturbed 6993 * backout the changes (used for non-paced 6994 * burst limiting). 6995 */ 6996 tp->t_flags2 &= ~(TF2_DONT_SACK_QUEUE | 6997 TF2_MBUF_QUEUE_READY); 6998 } 6999 if ((rack->use_rack_rr) && 7000 (rack->r_rr_config < 2) && 7001 ((hpts_timeout) && (hpts_timeout < slot))) { 7002 /* 7003 * Arrange for the hpts to kick back in after the 7004 * t-o if the t-o does not cause a send. 7005 */ 7006 (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout), 7007 __LINE__, &diag); 7008 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 7009 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 7010 } else { 7011 (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(slot), 7012 __LINE__, &diag); 7013 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 7014 rack_log_to_start(rack, cts, hpts_timeout, slot, 1); 7015 } 7016 } else if (hpts_timeout) { 7017 /* 7018 * With respect to t_flags2(?) here, lets let any new acks wake 7019 * us up here. Since we are not pacing (no pacing timer), output 7020 * can happen so we should let it. If its a Rack timer, then any inbound 7021 * packet probably won't change the sending (we will be blocked) 7022 * but it may change the prr stats so letting it in (the set defaults 7023 * at the start of this block) are good enough. 
7024 */ 7025 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 7026 (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout), 7027 __LINE__, &diag); 7028 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 7029 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 7030 } else { 7031 /* No timer starting */ 7032 #ifdef INVARIANTS 7033 if (SEQ_GT(tp->snd_max, tp->snd_una)) { 7034 panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?", 7035 tp, rack, tot_len_this_send, cts, slot, hpts_timeout); 7036 } 7037 #endif 7038 } 7039 rack->rc_tmr_stopped = 0; 7040 if (slot) 7041 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv, __LINE__); 7042 } 7043 7044 /* 7045 * RACK Timer, here we simply do logging and house keeping. 7046 * the normal rack_output() function will call the 7047 * appropriate thing to check if we need to do a RACK retransmit. 7048 * We return 1, saying don't proceed with rack_output only 7049 * when all timers have been stopped (destroyed PCB?). 7050 */ 7051 static int 7052 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 7053 { 7054 /* 7055 * This timer simply provides an internal trigger to send out data. 7056 * The check_recovery_mode call will see if there are needed 7057 * retransmissions, if so we will enter fast-recovery. The output 7058 * call may or may not do the same thing depending on sysctl 7059 * settings. 7060 */ 7061 struct rack_sendmap *rsm; 7062 7063 counter_u64_add(rack_to_tot, 1); 7064 if (rack->r_state && (rack->r_state != tp->t_state)) 7065 rack_set_state(tp, rack); 7066 rack->rc_on_min_to = 0; 7067 rsm = rack_check_recovery_mode(tp, cts); 7068 rack_log_to_event(rack, RACK_TO_FRM_RACK, rsm); 7069 if (rsm) { 7070 rack->r_ctl.rc_resend = rsm; 7071 rack->r_timer_override = 1; 7072 if (rack->use_rack_rr) { 7073 /* 7074 * Don't accumulate extra pacing delay 7075 * we are allowing the rack timer to 7076 * over-ride pacing i.e. rrr takes precedence 7077 * if the pacing interval is longer than the rrr 7078 * time (in other words we get the min pacing 7079 * time versus rrr pacing time). 7080 */ 7081 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 7082 } 7083 } 7084 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; 7085 if (rsm == NULL) { 7086 /* restart a timer and return 1 */ 7087 rack_start_hpts_timer(rack, tp, cts, 7088 0, 0, 0); 7089 return (1); 7090 } 7091 return (0); 7092 } 7093 7094 7095 7096 static void 7097 rack_adjust_orig_mlen(struct rack_sendmap *rsm) 7098 { 7099 7100 if ((M_TRAILINGROOM(rsm->m) != rsm->orig_t_space)) { 7101 /* 7102 * The trailing space changed, mbufs can grow 7103 * at the tail but they can't shrink from 7104 * it, KASSERT that. Adjust the orig_m_len to 7105 * compensate for this change. 7106 */ 7107 KASSERT((rsm->orig_t_space > M_TRAILINGROOM(rsm->m)), 7108 ("mbuf:%p rsm:%p trailing_space:%jd ots:%u oml:%u mlen:%u\n", 7109 rsm->m, 7110 rsm, 7111 (intmax_t)M_TRAILINGROOM(rsm->m), 7112 rsm->orig_t_space, 7113 rsm->orig_m_len, 7114 rsm->m->m_len)); 7115 rsm->orig_m_len += (rsm->orig_t_space - M_TRAILINGROOM(rsm->m)); 7116 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 7117 } 7118 if (rsm->m->m_len < rsm->orig_m_len) { 7119 /* 7120 * Mbuf shrank, trimmed off the top by an ack, our 7121 * offset changes. 
7122 */ 7123 KASSERT((rsm->soff >= (rsm->orig_m_len - rsm->m->m_len)), 7124 ("mbuf:%p len:%u rsm:%p oml:%u soff:%u\n", 7125 rsm->m, rsm->m->m_len, 7126 rsm, rsm->orig_m_len, 7127 rsm->soff)); 7128 if (rsm->soff >= (rsm->orig_m_len - rsm->m->m_len)) 7129 rsm->soff -= (rsm->orig_m_len - rsm->m->m_len); 7130 else 7131 rsm->soff = 0; 7132 rsm->orig_m_len = rsm->m->m_len; 7133 #ifdef INVARIANTS 7134 } else if (rsm->m->m_len > rsm->orig_m_len) { 7135 panic("rsm:%p m:%p m_len grew outside of t_space compensation", 7136 rsm, rsm->m); 7137 #endif 7138 } 7139 } 7140 7141 static void 7142 rack_setup_offset_for_rsm(struct tcp_rack *rack, struct rack_sendmap *src_rsm, struct rack_sendmap *rsm) 7143 { 7144 struct mbuf *m; 7145 uint32_t soff; 7146 7147 if (src_rsm->m && 7148 ((src_rsm->orig_m_len != src_rsm->m->m_len) || 7149 (M_TRAILINGROOM(src_rsm->m) != src_rsm->orig_t_space))) { 7150 /* Fix up the orig_m_len and possibly the mbuf offset */ 7151 rack_adjust_orig_mlen(src_rsm); 7152 } 7153 m = src_rsm->m; 7154 soff = src_rsm->soff + (src_rsm->r_end - src_rsm->r_start); 7155 while (soff >= m->m_len) { 7156 /* Move out past this mbuf */ 7157 soff -= m->m_len; 7158 m = m->m_next; 7159 KASSERT((m != NULL), 7160 ("rsm:%p nrsm:%p hit at soff:%u null m", 7161 src_rsm, rsm, soff)); 7162 if (m == NULL) { 7163 /* This should *not* happen which is why there is a kassert */ 7164 src_rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 7165 (src_rsm->r_start - rack->rc_tp->snd_una), 7166 &src_rsm->soff); 7167 src_rsm->orig_m_len = src_rsm->m->m_len; 7168 src_rsm->orig_t_space = M_TRAILINGROOM(src_rsm->m); 7169 rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 7170 (rsm->r_start - rack->rc_tp->snd_una), 7171 &rsm->soff); 7172 rsm->orig_m_len = rsm->m->m_len; 7173 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 7174 return; 7175 } 7176 } 7177 rsm->m = m; 7178 rsm->soff = soff; 7179 rsm->orig_m_len = m->m_len; 7180 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 7181 } 7182 7183 static __inline void 7184 rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm, 7185 struct rack_sendmap *rsm, uint32_t start) 7186 { 7187 int idx; 7188 7189 nrsm->r_start = start; 7190 nrsm->r_end = rsm->r_end; 7191 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 7192 nrsm->r_flags = rsm->r_flags; 7193 nrsm->r_dupack = rsm->r_dupack; 7194 nrsm->r_no_rtt_allowed = rsm->r_no_rtt_allowed; 7195 nrsm->r_rtr_bytes = 0; 7196 nrsm->r_fas = rsm->r_fas; 7197 nrsm->r_bas = rsm->r_bas; 7198 rsm->r_end = nrsm->r_start; 7199 nrsm->r_just_ret = rsm->r_just_ret; 7200 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 7201 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 7202 } 7203 /* Now if we have SYN flag we keep it on the left edge */ 7204 if (nrsm->r_flags & RACK_HAS_SYN) 7205 nrsm->r_flags &= ~RACK_HAS_SYN; 7206 /* Now if we have a FIN flag we keep it on the right edge */ 7207 if (rsm->r_flags & RACK_HAS_FIN) 7208 rsm->r_flags &= ~RACK_HAS_FIN; 7209 /* Push bit must go to the right edge as well */ 7210 if (rsm->r_flags & RACK_HAD_PUSH) 7211 rsm->r_flags &= ~RACK_HAD_PUSH; 7212 /* Clone over the state of the hw_tls flag */ 7213 nrsm->r_hw_tls = rsm->r_hw_tls; 7214 /* 7215 * Now we need to find nrsm's new location in the mbuf chain 7216 * we basically calculate a new offset, which is soff + 7217 * how much is left in original rsm. Then we walk out the mbuf 7218 * chain to find the righ position, it may be the same mbuf 7219 * or maybe not. 
7220 */ 7221 KASSERT(((rsm->m != NULL) || 7222 (rsm->r_flags & (RACK_HAS_SYN|RACK_HAS_FIN))), 7223 ("rsm:%p nrsm:%p rack:%p -- rsm->m is NULL?", rsm, nrsm, rack)); 7224 if (rsm->m) 7225 rack_setup_offset_for_rsm(rack, rsm, nrsm); 7226 } 7227 7228 static struct rack_sendmap * 7229 rack_merge_rsm(struct tcp_rack *rack, 7230 struct rack_sendmap *l_rsm, 7231 struct rack_sendmap *r_rsm) 7232 { 7233 /* 7234 * We are merging two ack'd RSM's, 7235 * the l_rsm is on the left (lower seq 7236 * values) and the r_rsm is on the right 7237 * (higher seq value). The simplest way 7238 * to merge these is to move the right 7239 * one into the left. I don't think there 7240 * is any reason we need to try to find 7241 * the oldest (or last oldest retransmitted). 7242 */ 7243 rack_log_map_chg(rack->rc_tp, rack, NULL, 7244 l_rsm, r_rsm, MAP_MERGE, r_rsm->r_end, __LINE__); 7245 l_rsm->r_end = r_rsm->r_end; 7246 if (l_rsm->r_dupack < r_rsm->r_dupack) 7247 l_rsm->r_dupack = r_rsm->r_dupack; 7248 if (r_rsm->r_rtr_bytes) 7249 l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes; 7250 if (r_rsm->r_in_tmap) { 7251 /* This really should not happen */ 7252 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext); 7253 r_rsm->r_in_tmap = 0; 7254 } 7255 7256 /* Now the flags */ 7257 if (r_rsm->r_flags & RACK_HAS_FIN) 7258 l_rsm->r_flags |= RACK_HAS_FIN; 7259 if (r_rsm->r_flags & RACK_TLP) 7260 l_rsm->r_flags |= RACK_TLP; 7261 if (r_rsm->r_flags & RACK_RWND_COLLAPSED) 7262 l_rsm->r_flags |= RACK_RWND_COLLAPSED; 7263 if ((r_rsm->r_flags & RACK_APP_LIMITED) && 7264 ((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) { 7265 /* 7266 * If both are app-limited then let the 7267 * free lower the count. If right is app 7268 * limited and left is not, transfer. 7269 */ 7270 l_rsm->r_flags |= RACK_APP_LIMITED; 7271 r_rsm->r_flags &= ~RACK_APP_LIMITED; 7272 if (r_rsm == rack->r_ctl.rc_first_appl) 7273 rack->r_ctl.rc_first_appl = l_rsm; 7274 } 7275 tqhash_remove(rack->r_ctl.tqh, r_rsm, REMOVE_TYPE_MERGE); 7276 /* 7277 * We keep the largest value, which is the newest 7278 * send. We do this in case a segment that is 7279 * joined together and not part of a GP estimate 7280 * later gets expanded into the GP estimate. 7281 * 7282 * We prohibit the merging of unlike kinds i.e. 7283 * all pieces that are in the GP estimate can be 7284 * merged and all pieces that are not in a GP estimate 7285 * can be merged, but not disimilar pieces. Combine 7286 * this with taking the highest here and we should 7287 * be ok unless of course the client reneges. Then 7288 * all bets are off. 7289 */ 7290 if(l_rsm->r_tim_lastsent[(l_rsm->r_rtr_cnt-1)] < 7291 r_rsm->r_tim_lastsent[(r_rsm->r_rtr_cnt-1)]) { 7292 l_rsm->r_tim_lastsent[(l_rsm->r_rtr_cnt-1)] = r_rsm->r_tim_lastsent[(r_rsm->r_rtr_cnt-1)]; 7293 } 7294 /* 7295 * When merging two RSM's we also need to consider the ack time and keep 7296 * newest. If the ack gets merged into a measurement then that is the 7297 * one we will want to be using. 7298 */ 7299 if(l_rsm->r_ack_arrival < r_rsm->r_ack_arrival) 7300 l_rsm->r_ack_arrival = r_rsm->r_ack_arrival; 7301 7302 if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) { 7303 /* Transfer the split limit to the map we free */ 7304 r_rsm->r_limit_type = l_rsm->r_limit_type; 7305 l_rsm->r_limit_type = 0; 7306 } 7307 rack_free(rack, r_rsm); 7308 l_rsm->r_flags |= RACK_MERGED; 7309 return (l_rsm); 7310 } 7311 7312 /* 7313 * TLP Timer, here we simply setup what segment we want to 7314 * have the TLP expire on, the normal rack_output() will then 7315 * send it out. 
7316 * 7317 * We return 1, saying don't proceed with rack_output only 7318 * when all timers have been stopped (destroyed PCB?). 7319 */ 7320 static int 7321 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t *doing_tlp) 7322 { 7323 /* 7324 * Tail Loss Probe. 7325 */ 7326 struct rack_sendmap *rsm = NULL; 7327 int insret __diagused; 7328 struct socket *so = tptosocket(tp); 7329 uint32_t amm; 7330 uint32_t out, avail; 7331 int collapsed_win = 0; 7332 7333 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 7334 /* Its not time yet */ 7335 return (0); 7336 } 7337 if (ctf_progress_timeout_check(tp, true)) { 7338 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 7339 return (-ETIMEDOUT); /* tcp_drop() */ 7340 } 7341 /* 7342 * A TLP timer has expired. We have been idle for 2 rtts. So we now 7343 * need to figure out how to force a full MSS segment out. 7344 */ 7345 rack_log_to_event(rack, RACK_TO_FRM_TLP, NULL); 7346 rack->r_ctl.retran_during_recovery = 0; 7347 rack->r_ctl.dsack_byte_cnt = 0; 7348 counter_u64_add(rack_tlp_tot, 1); 7349 if (rack->r_state && (rack->r_state != tp->t_state)) 7350 rack_set_state(tp, rack); 7351 avail = sbavail(&so->so_snd); 7352 out = tp->snd_max - tp->snd_una; 7353 if ((out > tp->snd_wnd) || rack->rc_has_collapsed) { 7354 /* special case, we need a retransmission */ 7355 collapsed_win = 1; 7356 goto need_retran; 7357 } 7358 if (rack->r_ctl.dsack_persist && (rack->r_ctl.rc_tlp_cnt_out >= 1)) { 7359 rack->r_ctl.dsack_persist--; 7360 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) { 7361 rack->r_ctl.num_dsack = 0; 7362 } 7363 rack_log_dsack_event(rack, 1, __LINE__, 0, 0); 7364 } 7365 if ((tp->t_flags & TF_GPUTINPROG) && 7366 (rack->r_ctl.rc_tlp_cnt_out == 1)) { 7367 /* 7368 * If this is the second in a row 7369 * TLP and we are doing a measurement 7370 * its time to abandon the measurement. 7371 * Something is likely broken on 7372 * the clients network and measuring a 7373 * broken network does us no good. 7374 */ 7375 tp->t_flags &= ~TF_GPUTINPROG; 7376 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 7377 rack->r_ctl.rc_gp_srtt /*flex1*/, 7378 tp->gput_seq, 7379 0, 0, 18, __LINE__, NULL, 0); 7380 } 7381 /* 7382 * Check our send oldest always settings, and if 7383 * there is an oldest to send jump to the need_retran. 7384 */ 7385 if (rack_always_send_oldest && (TAILQ_EMPTY(&rack->r_ctl.rc_tmap) == 0)) 7386 goto need_retran; 7387 7388 if (avail > out) { 7389 /* New data is available */ 7390 amm = avail - out; 7391 if (amm > ctf_fixed_maxseg(tp)) { 7392 amm = ctf_fixed_maxseg(tp); 7393 if ((amm + out) > tp->snd_wnd) { 7394 /* We are rwnd limited */ 7395 goto need_retran; 7396 } 7397 } else if (amm < ctf_fixed_maxseg(tp)) { 7398 /* not enough to fill a MTU */ 7399 goto need_retran; 7400 } 7401 if (IN_FASTRECOVERY(tp->t_flags)) { 7402 /* Unlikely */ 7403 if (rack->rack_no_prr == 0) { 7404 if (out + amm <= tp->snd_wnd) { 7405 rack->r_ctl.rc_prr_sndcnt = amm; 7406 rack->r_ctl.rc_tlp_new_data = amm; 7407 rack_log_to_prr(rack, 4, 0, __LINE__); 7408 } 7409 } else 7410 goto need_retran; 7411 } else { 7412 /* Set the send-new override */ 7413 if (out + amm <= tp->snd_wnd) 7414 rack->r_ctl.rc_tlp_new_data = amm; 7415 else 7416 goto need_retran; 7417 } 7418 rack->r_ctl.rc_tlpsend = NULL; 7419 counter_u64_add(rack_tlp_newdata, 1); 7420 goto send; 7421 } 7422 need_retran: 7423 /* 7424 * Ok we need to arrange the last un-acked segment to be re-sent, or 7425 * optionally the first un-acked segment. 
7426 */ 7427 if (collapsed_win == 0) { 7428 if (rack_always_send_oldest) 7429 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 7430 else { 7431 rsm = tqhash_max(rack->r_ctl.tqh); 7432 if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { 7433 rsm = rack_find_high_nonack(rack, rsm); 7434 } 7435 } 7436 if (rsm == NULL) { 7437 #ifdef TCP_BLACKBOX 7438 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 7439 #endif 7440 goto out; 7441 } 7442 } else { 7443 /* 7444 * We had a collapsed window, lets find 7445 * the point before the collapse. 7446 */ 7447 if (SEQ_GT((rack->r_ctl.last_collapse_point - 1), rack->rc_tp->snd_una)) 7448 rsm = tqhash_find(rack->r_ctl.tqh, (rack->r_ctl.last_collapse_point - 1)); 7449 else { 7450 rsm = tqhash_min(rack->r_ctl.tqh); 7451 } 7452 if (rsm == NULL) { 7453 /* Huh */ 7454 goto out; 7455 } 7456 } 7457 if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) { 7458 /* 7459 * We need to split this the last segment in two. 7460 */ 7461 struct rack_sendmap *nrsm; 7462 7463 nrsm = rack_alloc_full_limit(rack); 7464 if (nrsm == NULL) { 7465 /* 7466 * No memory to split, we will just exit and punt 7467 * off to the RXT timer. 7468 */ 7469 goto out; 7470 } 7471 rack_clone_rsm(rack, nrsm, rsm, 7472 (rsm->r_end - ctf_fixed_maxseg(tp))); 7473 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); 7474 #ifndef INVARIANTS 7475 (void)tqhash_insert(rack->r_ctl.tqh, nrsm); 7476 #else 7477 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { 7478 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p", 7479 nrsm, insret, rack, rsm); 7480 } 7481 #endif 7482 if (rsm->r_in_tmap) { 7483 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 7484 nrsm->r_in_tmap = 1; 7485 } 7486 rsm = nrsm; 7487 } 7488 rack->r_ctl.rc_tlpsend = rsm; 7489 send: 7490 /* Make sure output path knows we are doing a TLP */ 7491 *doing_tlp = 1; 7492 rack->r_timer_override = 1; 7493 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 7494 return (0); 7495 out: 7496 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 7497 return (0); 7498 } 7499 7500 /* 7501 * Delayed ack Timer, here we simply need to setup the 7502 * ACK_NOW flag and remove the DELACK flag. From there 7503 * the output routine will send the ack out. 7504 * 7505 * We only return 1, saying don't proceed, if all timers 7506 * are stopped (destroyed PCB?). 7507 */ 7508 static int 7509 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 7510 { 7511 7512 rack_log_to_event(rack, RACK_TO_FRM_DELACK, NULL); 7513 tp->t_flags &= ~TF_DELACK; 7514 tp->t_flags |= TF_ACKNOW; 7515 KMOD_TCPSTAT_INC(tcps_delack); 7516 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 7517 return (0); 7518 } 7519 7520 /* 7521 * Persists timer, here we simply send the 7522 * same thing as a keepalive will. 7523 * the one byte send. 7524 * 7525 * We only return 1, saying don't proceed, if all timers 7526 * are stopped (destroyed PCB?). 7527 */ 7528 static int 7529 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 7530 { 7531 struct tcptemp *t_template; 7532 int32_t retval = 1; 7533 7534 if (rack->rc_in_persist == 0) 7535 return (0); 7536 if (ctf_progress_timeout_check(tp, false)) { 7537 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 7538 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 7539 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends); 7540 return (-ETIMEDOUT); /* tcp_drop() */ 7541 } 7542 /* 7543 * Persistence timer into zero window. 
Force a byte to be output, if 7544 * possible. 7545 */ 7546 KMOD_TCPSTAT_INC(tcps_persisttimeo); 7547 /* 7548 * Hack: if the peer is dead/unreachable, we do not time out if the 7549 * window is closed. After a full backoff, drop the connection if 7550 * the idle time (no responses to probes) reaches the maximum 7551 * backoff that we would use if retransmitting. 7552 */ 7553 if (tp->t_rxtshift >= V_tcp_retries && 7554 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 7555 TICKS_2_USEC(ticks - tp->t_rcvtime) >= RACK_REXMTVAL(tp) * tcp_totbackoff)) { 7556 KMOD_TCPSTAT_INC(tcps_persistdrop); 7557 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 7558 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends); 7559 retval = -ETIMEDOUT; /* tcp_drop() */ 7560 goto out; 7561 } 7562 if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) && 7563 tp->snd_una == tp->snd_max) 7564 rack_exit_persist(tp, rack, cts); 7565 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; 7566 /* 7567 * If the user has closed the socket then drop a persisting 7568 * connection after a much reduced timeout. 7569 */ 7570 if (tp->t_state > TCPS_CLOSE_WAIT && 7571 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 7572 KMOD_TCPSTAT_INC(tcps_persistdrop); 7573 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 7574 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends); 7575 retval = -ETIMEDOUT; /* tcp_drop() */ 7576 goto out; 7577 } 7578 t_template = tcpip_maketemplate(rack->rc_inp); 7579 if (t_template) { 7580 /* only set it if we were answered */ 7581 if (rack->forced_ack == 0) { 7582 rack->forced_ack = 1; 7583 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); 7584 } else { 7585 rack->probe_not_answered = 1; 7586 counter_u64_add(rack_persists_loss, 1); 7587 rack->r_ctl.persist_lost_ends++; 7588 } 7589 counter_u64_add(rack_persists_sends, 1); 7590 counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1); 7591 tcp_respond(tp, t_template->tt_ipgen, 7592 &t_template->tt_t, (struct mbuf *)NULL, 7593 tp->rcv_nxt, tp->snd_una - 1, 0); 7594 /* This sends an ack */ 7595 if (tp->t_flags & TF_DELACK) 7596 tp->t_flags &= ~TF_DELACK; 7597 free(t_template, M_TEMP); 7598 } 7599 if (tp->t_rxtshift < V_tcp_retries) 7600 tp->t_rxtshift++; 7601 out: 7602 rack_log_to_event(rack, RACK_TO_FRM_PERSIST, NULL); 7603 rack_start_hpts_timer(rack, tp, cts, 7604 0, 0, 0); 7605 return (retval); 7606 } 7607 7608 /* 7609 * If a keepalive goes off, we had no other timers 7610 * happening. We always return 1 here since this 7611 * routine either drops the connection or sends 7612 * out a segment with respond. 7613 */ 7614 static int 7615 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 7616 { 7617 struct tcptemp *t_template; 7618 struct inpcb *inp = tptoinpcb(tp); 7619 7620 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; 7621 rack_log_to_event(rack, RACK_TO_FRM_KEEP, NULL); 7622 /* 7623 * Keep-alive timer went off; send something or drop connection if 7624 * idle for too long. 7625 */ 7626 KMOD_TCPSTAT_INC(tcps_keeptimeo); 7627 if (tp->t_state < TCPS_ESTABLISHED) 7628 goto dropit; 7629 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 7630 tp->t_state <= TCPS_CLOSING) { 7631 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 7632 goto dropit; 7633 /* 7634 * Send a packet designed to force a response if the peer is 7635 * up and reachable: either an ACK if the connection is 7636 * still alive, or an RST if the peer has closed the 7637 * connection due to timeout or reboot. 
Using sequence 7638 * number tp->snd_una-1 causes the transmitted zero-length 7639 * segment to lie outside the receive window; by the 7640 * protocol spec, this requires the correspondent TCP to 7641 * respond. 7642 */ 7643 KMOD_TCPSTAT_INC(tcps_keepprobe); 7644 t_template = tcpip_maketemplate(inp); 7645 if (t_template) { 7646 if (rack->forced_ack == 0) { 7647 rack->forced_ack = 1; 7648 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); 7649 } else { 7650 rack->probe_not_answered = 1; 7651 } 7652 tcp_respond(tp, t_template->tt_ipgen, 7653 &t_template->tt_t, (struct mbuf *)NULL, 7654 tp->rcv_nxt, tp->snd_una - 1, 0); 7655 free(t_template, M_TEMP); 7656 } 7657 } 7658 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 7659 return (1); 7660 dropit: 7661 KMOD_TCPSTAT_INC(tcps_keepdrops); 7662 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); 7663 return (-ETIMEDOUT); /* tcp_drop() */ 7664 } 7665 7666 /* 7667 * Retransmit helper function, clear up all the ack 7668 * flags and take care of important book keeping. 7669 */ 7670 static void 7671 rack_remxt_tmr(struct tcpcb *tp) 7672 { 7673 /* 7674 * The retransmit timer went off, all sack'd blocks must be 7675 * un-acked. 7676 */ 7677 struct rack_sendmap *rsm, *trsm = NULL; 7678 struct tcp_rack *rack; 7679 7680 rack = (struct tcp_rack *)tp->t_fb_ptr; 7681 rack_timer_cancel(tp, rack, tcp_get_usecs(NULL), __LINE__); 7682 rack_log_to_event(rack, RACK_TO_FRM_TMR, NULL); 7683 if (rack->r_state && (rack->r_state != tp->t_state)) 7684 rack_set_state(tp, rack); 7685 /* 7686 * Ideally we would like to be able to 7687 * mark SACK-PASS on anything not acked here. 7688 * 7689 * However, if we do that we would burst out 7690 * all that data 1ms apart. This would be unwise, 7691 * so for now we will just let the normal rxt timer 7692 * and tlp timer take care of it. 7693 * 7694 * Also we really need to stick them back in sequence 7695 * order. This way we send in the proper order and any 7696 * sacks that come floating in will "re-ack" the data. 7697 * To do this we zap the tmap with an INIT and then 7698 * walk through and place every rsm in the tail queue 7699 * hash table back in its seq ordered place. 
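* While re-threading the tmap we also clear the ACKED and
* SACK-PASSED bits and mark every rsm RACK_MUST_RXT.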
7700 */ 7701 TAILQ_INIT(&rack->r_ctl.rc_tmap); 7702 7703 TQHASH_FOREACH(rsm, rack->r_ctl.tqh) { 7704 rsm->r_dupack = 0; 7705 if (rack_verbose_logging) 7706 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7707 /* We must re-add it back to the tlist */ 7708 if (trsm == NULL) { 7709 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7710 } else { 7711 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); 7712 } 7713 rsm->r_in_tmap = 1; 7714 trsm = rsm; 7715 if (rsm->r_flags & RACK_ACKED) 7716 rsm->r_flags |= RACK_WAS_ACKED; 7717 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS | RACK_RWND_COLLAPSED); 7718 rsm->r_flags |= RACK_MUST_RXT; 7719 } 7720 /* Clear the count (we just un-acked them) */ 7721 rack->r_ctl.rc_last_timeout_snduna = tp->snd_una; 7722 rack->r_ctl.rc_sacked = 0; 7723 rack->r_ctl.rc_sacklast = NULL; 7724 rack->r_ctl.rc_agg_delayed = 0; 7725 rack->r_early = 0; 7726 rack->r_ctl.rc_agg_early = 0; 7727 rack->r_late = 0; 7728 /* Clear the tlp rtx mark */ 7729 rack->r_ctl.rc_resend = tqhash_min(rack->r_ctl.tqh); 7730 if (rack->r_ctl.rc_resend != NULL) 7731 rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT; 7732 rack->r_ctl.rc_prr_sndcnt = 0; 7733 rack_log_to_prr(rack, 6, 0, __LINE__); 7734 rack->r_timer_override = 1; 7735 if ((((tp->t_flags & TF_SACK_PERMIT) == 0) 7736 #ifdef TCP_SAD_DETECTION 7737 || (rack->sack_attack_disable != 0) 7738 #endif 7739 ) && ((tp->t_flags & TF_SENTFIN) == 0)) { 7740 /* 7741 * For non-sack customers new data 7742 * needs to go out as retransmits until 7743 * we retransmit up to snd_max. 7744 */ 7745 rack->r_must_retran = 1; 7746 rack->r_ctl.rc_out_at_rto = ctf_flight_size(rack->rc_tp, 7747 rack->r_ctl.rc_sacked); 7748 } 7749 rack->r_ctl.rc_snd_max_at_rto = tp->snd_max; 7750 } 7751 7752 static void 7753 rack_convert_rtts(struct tcpcb *tp) 7754 { 7755 tcp_change_time_units(tp, TCP_TMR_GRANULARITY_USEC); 7756 tp->t_rxtcur = RACK_REXMTVAL(tp); 7757 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 7758 tp->t_rxtcur += TICKS_2_USEC(tcp_rexmit_slop); 7759 } 7760 if (tp->t_rxtcur > rack_rto_max) { 7761 tp->t_rxtcur = rack_rto_max; 7762 } 7763 } 7764 7765 static void 7766 rack_cc_conn_init(struct tcpcb *tp) 7767 { 7768 struct tcp_rack *rack; 7769 uint32_t srtt; 7770 7771 rack = (struct tcp_rack *)tp->t_fb_ptr; 7772 srtt = tp->t_srtt; 7773 cc_conn_init(tp); 7774 /* 7775 * Now convert to rack's internal format, 7776 * if required. 7777 */ 7778 if ((srtt == 0) && (tp->t_srtt != 0)) 7779 rack_convert_rtts(tp); 7780 /* 7781 * We want a chance to stay in slowstart as 7782 * we create a connection. TCP spec says that 7783 * initially ssthresh is infinite. For our 7784 * purposes that is the snd_wnd. 7785 */ 7786 if (tp->snd_ssthresh < tp->snd_wnd) { 7787 tp->snd_ssthresh = tp->snd_wnd; 7788 } 7789 /* 7790 * We also want to assure a IW worth of 7791 * data can get inflight. 7792 */ 7793 if (rc_init_window(rack) < tp->snd_cwnd) 7794 tp->snd_cwnd = rc_init_window(rack); 7795 } 7796 7797 /* 7798 * Re-transmit timeout! If we drop the PCB we will return 1, otherwise 7799 * we will setup to retransmit the lowest seq number outstanding. 7800 */ 7801 static int 7802 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 7803 { 7804 struct inpcb *inp = tptoinpcb(tp); 7805 int32_t rexmt; 7806 int32_t retval = 0; 7807 bool isipv6; 7808 7809 if ((tp->t_flags & TF_GPUTINPROG) && 7810 (tp->t_rxtshift)) { 7811 /* 7812 * We have had a second timeout 7813 * measurements on successive rxt's are not profitable. 
7814 * It is unlikely to be of any use (the network is 7815 * broken or the client went away). 7816 */ 7817 tp->t_flags &= ~TF_GPUTINPROG; 7818 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 7819 rack->r_ctl.rc_gp_srtt /*flex1*/, 7820 tp->gput_seq, 7821 0, 0, 18, __LINE__, NULL, 0); 7822 } 7823 if (ctf_progress_timeout_check(tp, false)) { 7824 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 7825 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 7826 return (-ETIMEDOUT); /* tcp_drop() */ 7827 } 7828 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; 7829 rack->r_ctl.retran_during_recovery = 0; 7830 rack->rc_ack_required = 1; 7831 rack->r_ctl.dsack_byte_cnt = 0; 7832 if (IN_FASTRECOVERY(tp->t_flags)) 7833 tp->t_flags |= TF_WASFRECOVERY; 7834 else 7835 tp->t_flags &= ~TF_WASFRECOVERY; 7836 if (IN_CONGRECOVERY(tp->t_flags)) 7837 tp->t_flags |= TF_WASCRECOVERY; 7838 else 7839 tp->t_flags &= ~TF_WASCRECOVERY; 7840 if (TCPS_HAVEESTABLISHED(tp->t_state) && 7841 (tp->snd_una == tp->snd_max)) { 7842 /* Nothing outstanding .. nothing to do */ 7843 return (0); 7844 } 7845 if (rack->r_ctl.dsack_persist) { 7846 rack->r_ctl.dsack_persist--; 7847 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) { 7848 rack->r_ctl.num_dsack = 0; 7849 } 7850 rack_log_dsack_event(rack, 1, __LINE__, 0, 0); 7851 } 7852 /* 7853 * Rack can only run one timer at a time, so we cannot 7854 * run a KEEPINIT (gating SYN sending) and a retransmit 7855 * timer for the SYN. So if we are in a front state and 7856 * have a KEEPINIT timer we need to check the first transmit 7857 * against now to see if we have exceeded the KEEPINIT time 7858 * (if one is set). 7859 */ 7860 if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) && 7861 (TP_KEEPINIT(tp) != 0)) { 7862 struct rack_sendmap *rsm; 7863 7864 rsm = tqhash_min(rack->r_ctl.tqh); 7865 if (rsm) { 7866 /* Ok we have something outstanding to test keepinit with */ 7867 if ((TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) && 7868 ((cts - (uint32_t)rsm->r_tim_lastsent[0]) >= TICKS_2_USEC(TP_KEEPINIT(tp)))) { 7869 /* We have exceeded the KEEPINIT time */ 7870 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); 7871 goto drop_it; 7872 } 7873 } 7874 } 7875 /* 7876 * Retransmission timer went off. Message has not been acked within 7877 * retransmit interval. Back off to a longer retransmit interval 7878 * and retransmit one segment. 7879 */ 7880 rack_remxt_tmr(tp); 7881 if ((rack->r_ctl.rc_resend == NULL) || 7882 ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) { 7883 /* 7884 * If the rwnd collapsed on 7885 * the one we are retransmitting 7886 * it does not count against the 7887 * rxt count. 7888 */ 7889 tp->t_rxtshift++; 7890 } 7891 if (tp->t_rxtshift > V_tcp_retries) { 7892 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 7893 drop_it: 7894 tp->t_rxtshift = V_tcp_retries; 7895 KMOD_TCPSTAT_INC(tcps_timeoutdrop); 7896 /* XXXGL: previously t_softerror was casted to uint16_t */ 7897 MPASS(tp->t_softerror >= 0); 7898 retval = tp->t_softerror ? -tp->t_softerror : -ETIMEDOUT; 7899 goto out; /* tcp_drop() */ 7900 } 7901 if (tp->t_state == TCPS_SYN_SENT) { 7902 /* 7903 * If the SYN was retransmitted, indicate CWND to be limited 7904 * to 1 segment in cc_conn_init(). 7905 */ 7906 tp->snd_cwnd = 1; 7907 } else if (tp->t_rxtshift == 1) { 7908 /* 7909 * first retransmit; record ssthresh and cwnd so they can be 7910 * recovered if this turns out to be a "bad" retransmit. 
A 7911 * retransmit is considered "bad" if an ACK for this segment 7912 * is received within RTT/2 interval; the assumption here is 7913 * that the ACK was already in flight. See "On Estimating 7914 * End-to-End Network Path Properties" by Allman and Paxson 7915 * for more details. 7916 */ 7917 tp->snd_cwnd_prev = tp->snd_cwnd; 7918 tp->snd_ssthresh_prev = tp->snd_ssthresh; 7919 tp->snd_recover_prev = tp->snd_recover; 7920 tp->t_badrxtwin = ticks + (USEC_2_TICKS(tp->t_srtt)/2); 7921 tp->t_flags |= TF_PREVVALID; 7922 } else if ((tp->t_flags & TF_RCVD_TSTMP) == 0) 7923 tp->t_flags &= ~TF_PREVVALID; 7924 KMOD_TCPSTAT_INC(tcps_rexmttimeo); 7925 if ((tp->t_state == TCPS_SYN_SENT) || 7926 (tp->t_state == TCPS_SYN_RECEIVED)) 7927 rexmt = RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]; 7928 else 7929 rexmt = max(rack_rto_min, (tp->t_srtt + (tp->t_rttvar << 2))) * tcp_backoff[tp->t_rxtshift]; 7930 7931 RACK_TCPT_RANGESET(tp->t_rxtcur, rexmt, 7932 max(rack_rto_min, rexmt), rack_rto_max, rack->r_ctl.timer_slop); 7933 /* 7934 * We enter the path for PLMTUD if connection is established or, if 7935 * connection is FIN_WAIT_1 status, reason for the last is that if 7936 * amount of data we send is very small, we could send it in couple 7937 * of packets and process straight to FIN. In that case we won't 7938 * catch ESTABLISHED state. 7939 */ 7940 #ifdef INET6 7941 isipv6 = (inp->inp_vflag & INP_IPV6) ? true : false; 7942 #else 7943 isipv6 = false; 7944 #endif 7945 if (((V_tcp_pmtud_blackhole_detect == 1) || 7946 (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) || 7947 (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) && 7948 ((tp->t_state == TCPS_ESTABLISHED) || 7949 (tp->t_state == TCPS_FIN_WAIT_1))) { 7950 /* 7951 * Idea here is that at each stage of mtu probe (usually, 7952 * 1448 -> 1188 -> 524) should be given 2 chances to recover 7953 * before further clamping down. 'tp->t_rxtshift % 2 == 0' 7954 * should take care of that. 7955 */ 7956 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == 7957 (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && 7958 (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && 7959 tp->t_rxtshift % 2 == 0)) { 7960 /* 7961 * Enter Path MTU Black-hole Detection mechanism: - 7962 * Disable Path MTU Discovery (IP "DF" bit). - 7963 * Reduce MTU to lower value than what we negotiated 7964 * with peer. 7965 */ 7966 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { 7967 /* Record that we may have found a black hole. */ 7968 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 7969 /* Keep track of previous MSS. */ 7970 tp->t_pmtud_saved_maxseg = tp->t_maxseg; 7971 } 7972 7973 /* 7974 * Reduce the MSS to blackhole value or to the 7975 * default in an attempt to retransmit. 7976 */ 7977 #ifdef INET6 7978 if (isipv6 && 7979 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { 7980 /* Use the sysctl tuneable blackhole MSS. */ 7981 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; 7982 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 7983 } else if (isipv6) { 7984 /* Use the default MSS. */ 7985 tp->t_maxseg = V_tcp_v6mssdflt; 7986 /* 7987 * Disable Path MTU Discovery when we switch 7988 * to minmss. 7989 */ 7990 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 7991 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 7992 } 7993 #endif 7994 #if defined(INET6) && defined(INET) 7995 else 7996 #endif 7997 #ifdef INET 7998 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { 7999 /* Use the sysctl tuneable blackhole MSS. 
*/ 8000 tp->t_maxseg = V_tcp_pmtud_blackhole_mss; 8001 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 8002 } else { 8003 /* Use the default MSS. */ 8004 tp->t_maxseg = V_tcp_mssdflt; 8005 /* 8006 * Disable Path MTU Discovery when we switch 8007 * to minmss. 8008 */ 8009 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 8010 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 8011 } 8012 #endif 8013 } else { 8014 /* 8015 * If further retransmissions are still unsuccessful 8016 * with a lowered MTU, maybe this isn't a blackhole 8017 * and we restore the previous MSS and blackhole 8018 * detection flags. The limit '6' is determined by 8019 * giving each probe stage (1448, 1188, 524) 2 8020 * chances to recover. 8021 */ 8022 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 8023 (tp->t_rxtshift >= 6)) { 8024 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 8025 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 8026 tp->t_maxseg = tp->t_pmtud_saved_maxseg; 8027 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed); 8028 } 8029 } 8030 } 8031 /* 8032 * Disable RFC1323 and SACK if we haven't got any response to 8033 * our third SYN to work-around some broken terminal servers 8034 * (most of which have hopefully been retired) that have bad VJ 8035 * header compression code which trashes TCP segments containing 8036 * unknown-to-them TCP options. 8037 */ 8038 if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && 8039 (tp->t_rxtshift == 3)) 8040 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); 8041 /* 8042 * If we backed off this far, our srtt estimate is probably bogus. 8043 * Clobber it so we'll take the next rtt measurement as our srtt; 8044 * move the current srtt into rttvar to keep the current retransmit 8045 * times until then. 8046 */ 8047 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 8048 #ifdef INET6 8049 if ((inp->inp_vflag & INP_IPV6) != 0) 8050 in6_losing(inp); 8051 else 8052 #endif 8053 in_losing(inp); 8054 tp->t_rttvar += tp->t_srtt; 8055 tp->t_srtt = 0; 8056 } 8057 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 8058 tp->snd_recover = tp->snd_max; 8059 tp->t_flags |= TF_ACKNOW; 8060 tp->t_rtttime = 0; 8061 rack_cong_signal(tp, CC_RTO, tp->snd_una, __LINE__); 8062 out: 8063 return (retval); 8064 } 8065 8066 static int 8067 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling, uint8_t *doing_tlp) 8068 { 8069 int32_t ret = 0; 8070 int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK); 8071 8072 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 8073 (tp->t_flags & TF_GPUTINPROG)) { 8074 /* 8075 * We have a goodput in progress 8076 * and we have entered a late state. 8077 * Do we have enough data in the sb 8078 * to handle the GPUT request? 8079 */ 8080 uint32_t bytes; 8081 8082 bytes = tp->gput_ack - tp->gput_seq; 8083 if (SEQ_GT(tp->gput_seq, tp->snd_una)) 8084 bytes += tp->gput_seq - tp->snd_una; 8085 if (bytes > sbavail(&tptosocket(tp)->so_snd)) { 8086 /* 8087 * There are not enough bytes in the socket 8088 * buffer that have been sent to cover this 8089 * measurement. Cancel it. 
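 *
 * A worked example with made-up numbers: if gput_seq is 1000,
 * gput_ack is 11000 and snd_una is 500, the sample still needs
 * the 10000 measured bytes plus the 500 bytes not yet cum-acked,
 * i.e. 10500 bytes starting at snd_una. If sbavail() on the send
 * buffer reports less than that, the goodput sample can never
 * complete, so it is abandoned here rather than left to linger.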
8090 */ 8091 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 8092 rack->r_ctl.rc_gp_srtt /*flex1*/, 8093 tp->gput_seq, 8094 0, 0, 18, __LINE__, NULL, 0); 8095 tp->t_flags &= ~TF_GPUTINPROG; 8096 } 8097 } 8098 if (timers == 0) { 8099 return (0); 8100 } 8101 if (tp->t_state == TCPS_LISTEN) { 8102 /* no timers on listen sockets */ 8103 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) 8104 return (0); 8105 return (1); 8106 } 8107 if ((timers & PACE_TMR_RACK) && 8108 rack->rc_on_min_to) { 8109 /* 8110 * For the rack timer when we 8111 * are on a min-timeout (which means rrr_conf = 3) 8112 * we don't want to check the timer. It may 8113 * be going off for a pace and thats ok we 8114 * want to send the retransmit (if its ready). 8115 * 8116 * If its on a normal rack timer (non-min) then 8117 * we will check if its expired. 8118 */ 8119 goto skip_time_check; 8120 } 8121 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 8122 uint32_t left; 8123 8124 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 8125 ret = -1; 8126 rack_log_to_processing(rack, cts, ret, 0); 8127 return (0); 8128 } 8129 if (hpts_calling == 0) { 8130 /* 8131 * A user send or queued mbuf (sack) has called us? We 8132 * return 0 and let the pacing guards 8133 * deal with it if they should or 8134 * should not cause a send. 8135 */ 8136 ret = -2; 8137 rack_log_to_processing(rack, cts, ret, 0); 8138 return (0); 8139 } 8140 /* 8141 * Ok our timer went off early and we are not paced false 8142 * alarm, go back to sleep. We make sure we don't have 8143 * no-sack wakeup on since we no longer have a PKT_OUTPUT 8144 * flag in place. 8145 */ 8146 rack->rc_tp->t_flags2 &= ~TF2_DONT_SACK_QUEUE; 8147 ret = -3; 8148 left = rack->r_ctl.rc_timer_exp - cts; 8149 tcp_hpts_insert(tp, HPTS_MS_TO_SLOTS(left)); 8150 rack_log_to_processing(rack, cts, ret, left); 8151 return (1); 8152 } 8153 skip_time_check: 8154 rack->rc_tmr_stopped = 0; 8155 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; 8156 if (timers & PACE_TMR_DELACK) { 8157 ret = rack_timeout_delack(tp, rack, cts); 8158 } else if (timers & PACE_TMR_RACK) { 8159 rack->r_ctl.rc_tlp_rxt_last_time = cts; 8160 rack->r_fast_output = 0; 8161 ret = rack_timeout_rack(tp, rack, cts); 8162 } else if (timers & PACE_TMR_TLP) { 8163 rack->r_ctl.rc_tlp_rxt_last_time = cts; 8164 ret = rack_timeout_tlp(tp, rack, cts, doing_tlp); 8165 } else if (timers & PACE_TMR_RXT) { 8166 rack->r_ctl.rc_tlp_rxt_last_time = cts; 8167 rack->r_fast_output = 0; 8168 ret = rack_timeout_rxt(tp, rack, cts); 8169 } else if (timers & PACE_TMR_PERSIT) { 8170 ret = rack_timeout_persist(tp, rack, cts); 8171 } else if (timers & PACE_TMR_KEEP) { 8172 ret = rack_timeout_keepalive(tp, rack, cts); 8173 } 8174 rack_log_to_processing(rack, cts, ret, timers); 8175 return (ret); 8176 } 8177 8178 static void 8179 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line) 8180 { 8181 struct timeval tv; 8182 uint32_t us_cts, flags_on_entry; 8183 uint8_t hpts_removed = 0; 8184 8185 flags_on_entry = rack->r_ctl.rc_hpts_flags; 8186 us_cts = tcp_get_usecs(&tv); 8187 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 8188 ((TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) || 8189 ((tp->snd_max - tp->snd_una) == 0))) { 8190 tcp_hpts_remove(rack->rc_tp); 8191 hpts_removed = 1; 8192 /* If we were not delayed cancel out the flag. 
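 * "Not delayed" here appears to mean that we did not get here
 * because a pacing deadline was missed, but because there is
 * simply nothing outstanding (snd_max == snd_una); in that case
 * the PACE_PKT_OUTPUT bit no longer describes a pending paced
 * send and can be cleared.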
*/ 8193 if ((tp->snd_max - tp->snd_una) == 0) 8194 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 8195 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry); 8196 } 8197 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 8198 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 8199 if (tcp_in_hpts(rack->rc_tp) && 8200 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) { 8201 /* 8202 * Canceling timer's when we have no output being 8203 * paced. We also must remove ourselves from the 8204 * hpts. 8205 */ 8206 tcp_hpts_remove(rack->rc_tp); 8207 hpts_removed = 1; 8208 } 8209 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK); 8210 } 8211 if (hpts_removed == 0) 8212 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry); 8213 } 8214 8215 static int 8216 rack_stopall(struct tcpcb *tp) 8217 { 8218 struct tcp_rack *rack; 8219 8220 rack = (struct tcp_rack *)tp->t_fb_ptr; 8221 rack->t_timers_stopped = 1; 8222 8223 tcp_hpts_remove(tp); 8224 8225 return (0); 8226 } 8227 8228 static void 8229 rack_stop_all_timers(struct tcpcb *tp, struct tcp_rack *rack) 8230 { 8231 /* 8232 * Assure no timers are running. 8233 */ 8234 if (tcp_timer_active(tp, TT_PERSIST)) { 8235 /* We enter in persists, set the flag appropriately */ 8236 rack->rc_in_persist = 1; 8237 } 8238 if (tcp_in_hpts(rack->rc_tp)) { 8239 tcp_hpts_remove(rack->rc_tp); 8240 } 8241 } 8242 8243 static void 8244 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, 8245 struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag, int segsiz) 8246 { 8247 int32_t idx; 8248 8249 rsm->r_rtr_cnt++; 8250 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 8251 rsm->r_dupack = 0; 8252 if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) { 8253 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS; 8254 rsm->r_flags |= RACK_OVERMAX; 8255 } 8256 if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & RACK_TLP) == 0)) { 8257 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start); 8258 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start); 8259 } 8260 idx = rsm->r_rtr_cnt - 1; 8261 rsm->r_tim_lastsent[idx] = ts; 8262 /* 8263 * Here we don't add in the len of send, since its already 8264 * in snduna <->snd_max. 8265 */ 8266 rsm->r_fas = ctf_flight_size(rack->rc_tp, 8267 rack->r_ctl.rc_sacked); 8268 if (rsm->r_flags & RACK_ACKED) { 8269 /* Problably MTU discovery messing with us */ 8270 rsm->r_flags &= ~RACK_ACKED; 8271 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 8272 } 8273 if (rsm->r_in_tmap) { 8274 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 8275 rsm->r_in_tmap = 0; 8276 } 8277 /* Lets make sure it really is in or not the GP window */ 8278 rack_mark_in_gp_win(tp, rsm); 8279 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 8280 rsm->r_in_tmap = 1; 8281 rsm->r_bas = (uint8_t)(((rsm->r_end - rsm->r_start) + segsiz - 1) / segsiz); 8282 /* Take off the must retransmit flag, if its on */ 8283 if (rsm->r_flags & RACK_MUST_RXT) { 8284 if (rack->r_must_retran) 8285 rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start); 8286 if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) { 8287 /* 8288 * We have retransmitted all we need. Clear 8289 * any must retransmit flags. 
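 *
 * Put differently: this retransmission reached (or passed)
 * rc_snd_max_at_rto, the snd_max recorded when the
 * must-retransmit state was established, so presumably nothing
 * sent before that event is still waiting to be resent and the
 * bookkeeping can be reset.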
8290 */ 8291 rack->r_must_retran = 0; 8292 rack->r_ctl.rc_out_at_rto = 0; 8293 } 8294 rsm->r_flags &= ~RACK_MUST_RXT; 8295 } 8296 /* Remove any collapsed flag */ 8297 rsm->r_flags &= ~RACK_RWND_COLLAPSED; 8298 if (rsm->r_flags & RACK_SACK_PASSED) { 8299 /* We have retransmitted due to the SACK pass */ 8300 rsm->r_flags &= ~RACK_SACK_PASSED; 8301 rsm->r_flags |= RACK_WAS_SACKPASS; 8302 } 8303 } 8304 8305 static uint32_t 8306 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 8307 struct rack_sendmap *rsm, uint64_t ts, int32_t *lenp, uint16_t add_flag, int segsiz) 8308 { 8309 /* 8310 * We (re-)transmitted starting at rsm->r_start for some length 8311 * (possibly less than r_end. 8312 */ 8313 struct rack_sendmap *nrsm; 8314 int insret __diagused; 8315 uint32_t c_end; 8316 int32_t len; 8317 8318 len = *lenp; 8319 c_end = rsm->r_start + len; 8320 if (SEQ_GEQ(c_end, rsm->r_end)) { 8321 /* 8322 * We retransmitted the whole piece or more than the whole 8323 * slopping into the next rsm. 8324 */ 8325 rack_update_rsm(tp, rack, rsm, ts, add_flag, segsiz); 8326 if (c_end == rsm->r_end) { 8327 *lenp = 0; 8328 return (0); 8329 } else { 8330 int32_t act_len; 8331 8332 /* Hangs over the end return whats left */ 8333 act_len = rsm->r_end - rsm->r_start; 8334 *lenp = (len - act_len); 8335 return (rsm->r_end); 8336 } 8337 /* We don't get out of this block. */ 8338 } 8339 /* 8340 * Here we retransmitted less than the whole thing which means we 8341 * have to split this into what was transmitted and what was not. 8342 */ 8343 nrsm = rack_alloc_full_limit(rack); 8344 if (nrsm == NULL) { 8345 /* 8346 * We can't get memory, so lets not proceed. 8347 */ 8348 *lenp = 0; 8349 return (0); 8350 } 8351 /* 8352 * So here we are going to take the original rsm and make it what we 8353 * retransmitted. nrsm will be the tail portion we did not 8354 * retransmit. For example say the chunk was 1, 11 (10 bytes). And 8355 * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to 8356 * 1, 6 and the new piece will be 6, 11. 8357 */ 8358 rack_clone_rsm(rack, nrsm, rsm, c_end); 8359 nrsm->r_dupack = 0; 8360 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); 8361 #ifndef INVARIANTS 8362 (void)tqhash_insert(rack->r_ctl.tqh, nrsm); 8363 #else 8364 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { 8365 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p", 8366 nrsm, insret, rack, rsm); 8367 } 8368 #endif 8369 if (rsm->r_in_tmap) { 8370 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 8371 nrsm->r_in_tmap = 1; 8372 } 8373 rsm->r_flags &= (~RACK_HAS_FIN); 8374 rack_update_rsm(tp, rack, rsm, ts, add_flag, segsiz); 8375 /* Log a split of rsm into rsm and nrsm */ 8376 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); 8377 *lenp = 0; 8378 return (0); 8379 } 8380 8381 static void 8382 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 8383 uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t cts, 8384 struct rack_sendmap *hintrsm, uint16_t add_flag, struct mbuf *s_mb, 8385 uint32_t s_moff, int hw_tls, int segsiz) 8386 { 8387 struct tcp_rack *rack; 8388 struct rack_sendmap *rsm, *nrsm; 8389 int insret __diagused; 8390 8391 register uint32_t snd_max, snd_una; 8392 8393 /* 8394 * Add to the RACK log of packets in flight or retransmitted. If 8395 * there is a TS option we will use the TS echoed, if not we will 8396 * grab a TS. 8397 * 8398 * Retransmissions will increment the count and move the ts to its 8399 * proper place. 
Note that if options do not include TS's then we 8400 * won't be able to effectively use the ACK for an RTT on a retran. 8401 * 8402 * Notes about r_start and r_end. Lets consider a send starting at 8403 * sequence 1 for 10 bytes. In such an example the r_start would be 8404 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. 8405 * This means that r_end is actually the first sequence for the next 8406 * slot (11). 8407 * 8408 */ 8409 /* 8410 * If err is set what do we do XXXrrs? should we not add the thing? 8411 * -- i.e. return if err != 0 or should we pretend we sent it? -- 8412 * i.e. proceed with add ** do this for now. 8413 */ 8414 INP_WLOCK_ASSERT(tptoinpcb(tp)); 8415 if (err) 8416 /* 8417 * We don't log errors -- we could but snd_max does not 8418 * advance in this case either. 8419 */ 8420 return; 8421 8422 if (th_flags & TH_RST) { 8423 /* 8424 * We don't log resets and we return immediately from 8425 * sending 8426 */ 8427 return; 8428 } 8429 rack = (struct tcp_rack *)tp->t_fb_ptr; 8430 snd_una = tp->snd_una; 8431 snd_max = tp->snd_max; 8432 if (th_flags & (TH_SYN | TH_FIN)) { 8433 /* 8434 * The call to rack_log_output is made before bumping 8435 * snd_max. This means we can record one extra byte on a SYN 8436 * or FIN if seq_out is adding more on and a FIN is present 8437 * (and we are not resending). 8438 */ 8439 if ((th_flags & TH_SYN) && (seq_out == tp->iss)) 8440 len++; 8441 if (th_flags & TH_FIN) 8442 len++; 8443 if (SEQ_LT(snd_max, tp->snd_nxt)) { 8444 /* 8445 * The add/update as not been done for the FIN/SYN 8446 * yet. 8447 */ 8448 snd_max = tp->snd_nxt; 8449 } 8450 } 8451 if (SEQ_LEQ((seq_out + len), snd_una)) { 8452 /* Are sending an old segment to induce an ack (keep-alive)? */ 8453 return; 8454 } 8455 if (SEQ_LT(seq_out, snd_una)) { 8456 /* huh? should we panic? */ 8457 uint32_t end; 8458 8459 end = seq_out + len; 8460 seq_out = snd_una; 8461 if (SEQ_GEQ(end, seq_out)) 8462 len = end - seq_out; 8463 else 8464 len = 0; 8465 } 8466 if (len == 0) { 8467 /* We don't log zero window probes */ 8468 return; 8469 } 8470 if (IN_FASTRECOVERY(tp->t_flags)) { 8471 rack->r_ctl.rc_prr_out += len; 8472 } 8473 /* First question is it a retransmission or new? */ 8474 if (seq_out == snd_max) { 8475 /* Its new */ 8476 rack_chk_req_and_hybrid_on_out(rack, seq_out, len, cts); 8477 again: 8478 rsm = rack_alloc(rack); 8479 if (rsm == NULL) { 8480 /* 8481 * Hmm out of memory and the tcb got destroyed while 8482 * we tried to wait. 8483 */ 8484 return; 8485 } 8486 if (th_flags & TH_FIN) { 8487 rsm->r_flags = RACK_HAS_FIN|add_flag; 8488 } else { 8489 rsm->r_flags = add_flag; 8490 } 8491 if (hw_tls) 8492 rsm->r_hw_tls = 1; 8493 rsm->r_tim_lastsent[0] = cts; 8494 rsm->r_rtr_cnt = 1; 8495 rsm->r_rtr_bytes = 0; 8496 if (th_flags & TH_SYN) { 8497 /* The data space is one beyond snd_una */ 8498 rsm->r_flags |= RACK_HAS_SYN; 8499 } 8500 rsm->r_start = seq_out; 8501 rsm->r_end = rsm->r_start + len; 8502 rack_mark_in_gp_win(tp, rsm); 8503 rsm->r_dupack = 0; 8504 /* 8505 * save off the mbuf location that 8506 * sndmbuf_noadv returned (which is 8507 * where we started copying from).. 
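 *
 * The original m_len and trailing space of that mbuf are also
 * captured just below (orig_m_len / orig_t_space); the intent,
 * as far as can be told from the users of those fields, is to
 * let later code detect that the socket-buffer mbuf has since
 * grown or been compressed and re-derive a valid (mbuf, offset)
 * pair before the data is used for a retransmission.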
8508 */ 8509 rsm->m = s_mb; 8510 rsm->soff = s_moff; 8511 /* 8512 * Here we do add in the len of send, since its not yet 8513 * reflected in in snduna <->snd_max 8514 */ 8515 rsm->r_fas = (ctf_flight_size(rack->rc_tp, 8516 rack->r_ctl.rc_sacked) + 8517 (rsm->r_end - rsm->r_start)); 8518 /* rsm->m will be NULL if RACK_HAS_SYN or RACK_HAS_FIN is set */ 8519 if (rsm->m) { 8520 if (rsm->m->m_len <= rsm->soff) { 8521 /* 8522 * XXXrrs Question, will this happen? 8523 * 8524 * If sbsndptr is set at the correct place 8525 * then s_moff should always be somewhere 8526 * within rsm->m. But if the sbsndptr was 8527 * off then that won't be true. If it occurs 8528 * we need to walkout to the correct location. 8529 */ 8530 struct mbuf *lm; 8531 8532 lm = rsm->m; 8533 while (lm->m_len <= rsm->soff) { 8534 rsm->soff -= lm->m_len; 8535 lm = lm->m_next; 8536 KASSERT(lm != NULL, ("%s rack:%p lm goes null orig_off:%u origmb:%p rsm->soff:%u", 8537 __func__, rack, s_moff, s_mb, rsm->soff)); 8538 } 8539 rsm->m = lm; 8540 } 8541 rsm->orig_m_len = rsm->m->m_len; 8542 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 8543 } else { 8544 rsm->orig_m_len = 0; 8545 rsm->orig_t_space = 0; 8546 } 8547 rsm->r_bas = (uint8_t)((len + segsiz - 1) / segsiz); 8548 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 8549 /* Log a new rsm */ 8550 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_NEW, 0, __LINE__); 8551 #ifndef INVARIANTS 8552 (void)tqhash_insert(rack->r_ctl.tqh, rsm); 8553 #else 8554 if ((insret = tqhash_insert(rack->r_ctl.tqh, rsm)) != 0) { 8555 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p", 8556 nrsm, insret, rack, rsm); 8557 } 8558 #endif 8559 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 8560 rsm->r_in_tmap = 1; 8561 /* 8562 * Special case detection, is there just a single 8563 * packet outstanding when we are not in recovery? 8564 * 8565 * If this is true mark it so. 8566 */ 8567 if ((IN_FASTRECOVERY(tp->t_flags) == 0) && 8568 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) == ctf_fixed_maxseg(tp))) { 8569 struct rack_sendmap *prsm; 8570 8571 prsm = tqhash_prev(rack->r_ctl.tqh, rsm); 8572 if (prsm) 8573 prsm->r_one_out_nr = 1; 8574 } 8575 return; 8576 } 8577 /* 8578 * If we reach here its a retransmission and we need to find it. 8579 */ 8580 more: 8581 if (hintrsm && (hintrsm->r_start == seq_out)) { 8582 rsm = hintrsm; 8583 hintrsm = NULL; 8584 } else { 8585 /* No hints sorry */ 8586 rsm = NULL; 8587 } 8588 if ((rsm) && (rsm->r_start == seq_out)) { 8589 seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag, segsiz); 8590 if (len == 0) { 8591 return; 8592 } else { 8593 goto more; 8594 } 8595 } 8596 /* Ok it was not the last pointer go through it the hard way. */ 8597 refind: 8598 rsm = tqhash_find(rack->r_ctl.tqh, seq_out); 8599 if (rsm) { 8600 if (rsm->r_start == seq_out) { 8601 seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag, segsiz); 8602 if (len == 0) { 8603 return; 8604 } else { 8605 goto refind; 8606 } 8607 } 8608 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { 8609 /* Transmitted within this piece */ 8610 /* 8611 * Ok we must split off the front and then let the 8612 * update do the rest 8613 */ 8614 nrsm = rack_alloc_full_limit(rack); 8615 if (nrsm == NULL) { 8616 rack_update_rsm(tp, rack, rsm, cts, add_flag, segsiz); 8617 return; 8618 } 8619 /* 8620 * copy rsm to nrsm and then trim the front of rsm 8621 * to not include this part. 
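 *
 * Illustrative example (made-up sequence numbers): if rsm
 * covers [1000, 2000) and the retransmission began at seq_out
 * 1600, the clone below leaves rsm as [1000, 1600) and nrsm as
 * [1600, 2000); rack_update_entry() is then run on nrsm, which
 * now starts exactly where the retransmission started.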
8622 */ 8623 rack_clone_rsm(rack, nrsm, rsm, seq_out); 8624 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); 8625 #ifndef INVARIANTS 8626 (void)tqhash_insert(rack->r_ctl.tqh, nrsm); 8627 #else 8628 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { 8629 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p", 8630 nrsm, insret, rack, rsm); 8631 } 8632 #endif 8633 if (rsm->r_in_tmap) { 8634 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 8635 nrsm->r_in_tmap = 1; 8636 } 8637 rsm->r_flags &= (~RACK_HAS_FIN); 8638 seq_out = rack_update_entry(tp, rack, nrsm, cts, &len, add_flag, segsiz); 8639 if (len == 0) { 8640 return; 8641 } else if (len > 0) 8642 goto refind; 8643 } 8644 } 8645 /* 8646 * Hmm not found in map did they retransmit both old and on into the 8647 * new? 8648 */ 8649 if (seq_out == tp->snd_max) { 8650 goto again; 8651 } else if (SEQ_LT(seq_out, tp->snd_max)) { 8652 #ifdef INVARIANTS 8653 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", 8654 seq_out, len, tp->snd_una, tp->snd_max); 8655 printf("Starting Dump of all rack entries\n"); 8656 TQHASH_FOREACH(rsm, rack->r_ctl.tqh) { 8657 printf("rsm:%p start:%u end:%u\n", 8658 rsm, rsm->r_start, rsm->r_end); 8659 } 8660 printf("Dump complete\n"); 8661 panic("seq_out not found rack:%p tp:%p", 8662 rack, tp); 8663 #endif 8664 } else { 8665 #ifdef INVARIANTS 8666 /* 8667 * Hmm beyond sndmax? (only if we are using the new rtt-pack 8668 * flag) 8669 */ 8670 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", 8671 seq_out, len, tp->snd_max, tp); 8672 #endif 8673 } 8674 } 8675 8676 /* 8677 * Record one of the RTT updates from an ack into 8678 * our sample structure. 8679 */ 8680 8681 static void 8682 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, uint32_t len, uint32_t us_rtt, 8683 int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt) 8684 { 8685 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 8686 (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) { 8687 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt; 8688 } 8689 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 8690 (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) { 8691 rack->r_ctl.rack_rs.rs_rtt_highest = rtt; 8692 } 8693 if (rack->rc_tp->t_flags & TF_GPUTINPROG) { 8694 if (us_rtt < rack->r_ctl.rc_gp_lowrtt) 8695 rack->r_ctl.rc_gp_lowrtt = us_rtt; 8696 if (rack->rc_tp->snd_wnd > rack->r_ctl.rc_gp_high_rwnd) 8697 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 8698 } 8699 if ((confidence == 1) && 8700 ((rsm == NULL) || 8701 (rsm->r_just_ret) || 8702 (rsm->r_one_out_nr && 8703 len < (ctf_fixed_maxseg(rack->rc_tp) * 2)))) { 8704 /* 8705 * If the rsm had a just return 8706 * hit it then we can't trust the 8707 * rtt measurement for buffer deterimination 8708 * Note that a confidence of 2, indicates 8709 * SACK'd which overrides the r_just_ret or 8710 * the r_one_out_nr. If it was a CUM-ACK and 8711 * we had only two outstanding, but get an 8712 * ack for only 1. Then that also lowers our 8713 * confidence. 8714 */ 8715 confidence = 0; 8716 } 8717 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 8718 (rack->r_ctl.rack_rs.rs_us_rtt > us_rtt)) { 8719 if (rack->r_ctl.rack_rs.confidence == 0) { 8720 /* 8721 * We take anything with no current confidence 8722 * saved. 
8723 */ 8724 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; 8725 rack->r_ctl.rack_rs.confidence = confidence; 8726 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; 8727 } else if (confidence != 0) { 8728 /* 8729 * Once we have a confident number, 8730 * we can update it with a smaller 8731 * value since this confident number 8732 * may include the DSACK time until 8733 * the next segment (the second one) arrived. 8734 */ 8735 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; 8736 rack->r_ctl.rack_rs.confidence = confidence; 8737 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; 8738 } 8739 } 8740 rack_log_rtt_upd(rack->rc_tp, rack, us_rtt, len, rsm, confidence); 8741 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID; 8742 rack->r_ctl.rack_rs.rs_rtt_tot += rtt; 8743 rack->r_ctl.rack_rs.rs_rtt_cnt++; 8744 } 8745 8746 /* 8747 * Collect new round-trip time estimate 8748 * and update averages and current timeout. 8749 */ 8750 static void 8751 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) 8752 { 8753 int32_t delta; 8754 int32_t rtt; 8755 8756 if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) 8757 /* No valid sample */ 8758 return; 8759 if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) { 8760 /* We are to use the lowest RTT seen in a single ack */ 8761 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; 8762 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) { 8763 /* We are to use the highest RTT seen in a single ack */ 8764 rtt = rack->r_ctl.rack_rs.rs_rtt_highest; 8765 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) { 8766 /* We are to use the average RTT seen in a single ack */ 8767 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot / 8768 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt); 8769 } else { 8770 #ifdef INVARIANTS 8771 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method); 8772 #endif 8773 return; 8774 } 8775 if (rtt == 0) 8776 rtt = 1; 8777 if (rack->rc_gp_rtt_set == 0) { 8778 /* 8779 * With no RTT we have to accept 8780 * even one we are not confident of. 8781 */ 8782 rack->r_ctl.rc_gp_srtt = rack->r_ctl.rack_rs.rs_us_rtt; 8783 rack->rc_gp_rtt_set = 1; 8784 } else if (rack->r_ctl.rack_rs.confidence) { 8785 /* update the running gp srtt */ 8786 rack->r_ctl.rc_gp_srtt -= (rack->r_ctl.rc_gp_srtt/8); 8787 rack->r_ctl.rc_gp_srtt += rack->r_ctl.rack_rs.rs_us_rtt / 8; 8788 } 8789 if (rack->r_ctl.rack_rs.confidence) { 8790 /* 8791 * record the low and high for highly buffered path computation, 8792 * we only do this if we are confident (not a retransmission). 8793 */ 8794 if (rack->r_ctl.rc_highest_us_rtt < rack->r_ctl.rack_rs.rs_us_rtt) { 8795 rack->r_ctl.rc_highest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 8796 } 8797 if (rack->rc_highly_buffered == 0) { 8798 /* 8799 * Currently once we declare a path has 8800 * highly buffered there is no going 8801 * back, which may be a problem... 8802 */ 8803 if ((rack->r_ctl.rc_highest_us_rtt / rack->r_ctl.rc_lowest_us_rtt) > rack_hbp_thresh) { 8804 rack_log_rtt_shrinks(rack, rack->r_ctl.rack_rs.rs_us_rtt, 8805 rack->r_ctl.rc_highest_us_rtt, 8806 rack->r_ctl.rc_lowest_us_rtt, 8807 RACK_RTTS_SEEHBP); 8808 rack->rc_highly_buffered = 1; 8809 } 8810 } 8811 } 8812 if ((rack->r_ctl.rack_rs.confidence) || 8813 (rack->r_ctl.rack_rs.rs_us_rtrcnt == 1)) { 8814 /* 8815 * If we are highly confident of it <or> it was 8816 * never retransmitted we accept it as the last us_rtt. 
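 *
 * (A sample whose segment was transmitted exactly once has no
 * retransmission ambiguity -- the classic Karn problem -- which
 * is presumably why rs_us_rtrcnt == 1 is accepted here even
 * without the confidence flag.)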
8817 */ 8818 rack->r_ctl.rc_last_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 8819 /* The lowest rtt can be set if it was not retransmitted */ 8820 if (rack->r_ctl.rc_lowest_us_rtt > rack->r_ctl.rack_rs.rs_us_rtt) { 8821 rack->r_ctl.rc_lowest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 8822 if (rack->r_ctl.rc_lowest_us_rtt == 0) 8823 rack->r_ctl.rc_lowest_us_rtt = 1; 8824 } 8825 } 8826 rack = (struct tcp_rack *)tp->t_fb_ptr; 8827 if (tp->t_srtt != 0) { 8828 /* 8829 * We keep a simple srtt in microseconds, like our rtt 8830 * measurement. We don't need to do any tricks with shifting 8831 * etc. Instead we just add in 1/8th of the new measurement 8832 * and subtract out 1/8 of the old srtt. We do the same with 8833 * the variance after finding the absolute value of the 8834 * difference between this sample and the current srtt. 8835 */ 8836 delta = tp->t_srtt - rtt; 8837 /* Take off 1/8th of the current sRTT */ 8838 tp->t_srtt -= (tp->t_srtt >> 3); 8839 /* Add in 1/8th of the new RTT just measured */ 8840 tp->t_srtt += (rtt >> 3); 8841 if (tp->t_srtt <= 0) 8842 tp->t_srtt = 1; 8843 /* Now take the absolute value of the difference for the variance */ 8844 if (delta < 0) 8845 delta = -delta; 8846 /* Subtract out 1/8th */ 8847 tp->t_rttvar -= (tp->t_rttvar >> 3); 8848 /* Add in 1/8th of the new variance we just saw */ 8849 tp->t_rttvar += (delta >> 3); 8850 if (tp->t_rttvar <= 0) 8851 tp->t_rttvar = 1; 8852 } else { 8853 /* 8854 * No rtt measurement yet - use the unsmoothed rtt. Set the 8855 * variance to half the rtt (so our first retransmit happens 8856 * at 3*rtt). 8857 */ 8858 tp->t_srtt = rtt; 8859 tp->t_rttvar = rtt >> 1; 8860 } 8861 rack->rc_srtt_measure_made = 1; 8862 KMOD_TCPSTAT_INC(tcps_rttupdated); 8863 if (tp->t_rttupdated < UCHAR_MAX) 8864 tp->t_rttupdated++; 8865 #ifdef STATS 8866 if (rack_stats_gets_ms_rtt == 0) { 8867 /* Send in the microsecond rtt used for rxt timeout purposes */ 8868 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt)); 8869 } else if (rack_stats_gets_ms_rtt == 1) { 8870 /* Send in the millisecond rtt used for rxt timeout purposes */ 8871 int32_t ms_rtt; 8872 8873 /* Round up */ 8874 ms_rtt = (rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC; 8875 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt)); 8876 } else if (rack_stats_gets_ms_rtt == 2) { 8877 /* Send in the millisecond rtt as close to the path RTT as we can get */ 8878 int32_t ms_rtt; 8879 8880 /* Round up */ 8881 ms_rtt = (rack->r_ctl.rack_rs.rs_us_rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC; 8882 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt)); 8883 } else { 8884 /* Send in the microsecond rtt as close to the path RTT as we can get */ 8885 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt)); 8886 } 8887 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_PATHRTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt)); 8888 #endif 8889 /* 8890 * the retransmit should happen at rtt + 4 * rttvar. Because of the 8891 * way we do the smoothing, srtt and rttvar will each average +1/2 8892 * tick of bias. When we compute the retransmit timer, we want 1/2 8893 * tick of rounding and 1 extra tick because of +-1/2 tick 8894 * uncertainty in the firing of the timer. The bias will give us 8895 * exactly the 1.5 tick we need. But, because the bias is 8896 * statistical, we have to test that we don't drop below the minimum 8897 * feasible timer (which is 2 ticks).
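 *
 * A rough numeric sketch (made-up values, and note that rack
 * keeps these in microseconds rather than ticks): with a
 * smoothed srtt of 40000 and rttvar of 5000, the rtt + 4 * rttvar
 * rule works out to about 60000, which RACK_TCPT_RANGESET()
 * below then clamps between max(rack_rto_min, rtt + 2) and
 * rack_rto_max, allowing for the configured timer_slop.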
8898 */ 8899 tp->t_rxtshift = 0; 8900 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 8901 max(rack_rto_min, rtt + 2), rack_rto_max, rack->r_ctl.timer_slop); 8902 rack_log_rtt_sample(rack, rtt); 8903 tp->t_softerror = 0; 8904 } 8905 8906 8907 static void 8908 rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts) 8909 { 8910 /* 8911 * Apply to filter the inbound us-rtt at us_cts. 8912 */ 8913 uint32_t old_rtt; 8914 8915 old_rtt = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 8916 apply_filter_min_small(&rack->r_ctl.rc_gp_min_rtt, 8917 us_rtt, us_cts); 8918 if (old_rtt > us_rtt) { 8919 /* We just hit a new lower rtt time */ 8920 rack_log_rtt_shrinks(rack, us_cts, old_rtt, 8921 __LINE__, RACK_RTTS_NEWRTT); 8922 /* 8923 * Only count it if its lower than what we saw within our 8924 * calculated range. 8925 */ 8926 if ((old_rtt - us_rtt) > rack_min_rtt_movement) { 8927 if (rack_probertt_lower_within && 8928 rack->rc_gp_dyn_mul && 8929 (rack->use_fixed_rate == 0) && 8930 (rack->rc_always_pace)) { 8931 /* 8932 * We are seeing a new lower rtt very close 8933 * to the time that we would have entered probe-rtt. 8934 * This is probably due to the fact that a peer flow 8935 * has entered probe-rtt. Lets go in now too. 8936 */ 8937 uint32_t val; 8938 8939 val = rack_probertt_lower_within * rack_time_between_probertt; 8940 val /= 100; 8941 if ((rack->in_probe_rtt == 0) && 8942 ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) { 8943 rack_enter_probertt(rack, us_cts); 8944 } 8945 } 8946 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 8947 } 8948 } 8949 } 8950 8951 static int 8952 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 8953 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack) 8954 { 8955 uint32_t us_rtt; 8956 int32_t i, all; 8957 uint32_t t, len_acked; 8958 8959 if ((rsm->r_flags & RACK_ACKED) || 8960 (rsm->r_flags & RACK_WAS_ACKED)) 8961 /* Already done */ 8962 return (0); 8963 if (rsm->r_no_rtt_allowed) { 8964 /* Not allowed */ 8965 return (0); 8966 } 8967 if (ack_type == CUM_ACKED) { 8968 if (SEQ_GT(th_ack, rsm->r_end)) { 8969 len_acked = rsm->r_end - rsm->r_start; 8970 all = 1; 8971 } else { 8972 len_acked = th_ack - rsm->r_start; 8973 all = 0; 8974 } 8975 } else { 8976 len_acked = rsm->r_end - rsm->r_start; 8977 all = 0; 8978 } 8979 if (rsm->r_rtr_cnt == 1) { 8980 8981 t = cts - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 8982 if ((int)t <= 0) 8983 t = 1; 8984 if (!tp->t_rttlow || tp->t_rttlow > t) 8985 tp->t_rttlow = t; 8986 if (!rack->r_ctl.rc_rack_min_rtt || 8987 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 8988 rack->r_ctl.rc_rack_min_rtt = t; 8989 if (rack->r_ctl.rc_rack_min_rtt == 0) { 8990 rack->r_ctl.rc_rack_min_rtt = 1; 8991 } 8992 } 8993 if (TSTMP_GT(tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])) 8994 us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 8995 else 8996 us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 8997 if (us_rtt == 0) 8998 us_rtt = 1; 8999 if (CC_ALGO(tp)->rttsample != NULL) { 9000 /* Kick the RTT to the CC */ 9001 CC_ALGO(tp)->rttsample(&tp->t_ccv, us_rtt, 1, rsm->r_fas); 9002 } 9003 rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time)); 9004 if (ack_type == SACKED) { 9005 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 1); 9006 
tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 2 , rsm, rsm->r_rtr_cnt); 9007 } else { 9008 /* 9009 * We need to setup what our confidence 9010 * is in this ack. 9011 * 9012 * If the rsm was app limited and it is 9013 * less than a mss in length (the end 9014 * of the send) then we have a gap. If we 9015 * were app limited but say we were sending 9016 * multiple MSS's then we are more confident 9017 * int it. 9018 * 9019 * When we are not app-limited then we see if 9020 * the rsm is being included in the current 9021 * measurement, we tell this by the app_limited_needs_set 9022 * flag. 9023 * 9024 * Note that being cwnd blocked is not applimited 9025 * as well as the pacing delay between packets which 9026 * are sending only 1 or 2 MSS's also will show up 9027 * in the RTT. We probably need to examine this algorithm 9028 * a bit more and enhance it to account for the delay 9029 * between rsm's. We could do that by saving off the 9030 * pacing delay of each rsm (in an rsm) and then 9031 * factoring that in somehow though for now I am 9032 * not sure how :) 9033 */ 9034 int calc_conf = 0; 9035 9036 if (rsm->r_flags & RACK_APP_LIMITED) { 9037 if (all && (len_acked <= ctf_fixed_maxseg(tp))) 9038 calc_conf = 0; 9039 else 9040 calc_conf = 1; 9041 } else if (rack->app_limited_needs_set == 0) { 9042 calc_conf = 1; 9043 } else { 9044 calc_conf = 0; 9045 } 9046 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 2); 9047 tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 9048 calc_conf, rsm, rsm->r_rtr_cnt); 9049 } 9050 if ((rsm->r_flags & RACK_TLP) && 9051 (!IN_FASTRECOVERY(tp->t_flags))) { 9052 /* Segment was a TLP and our retrans matched */ 9053 if (rack->r_ctl.rc_tlp_cwnd_reduce) { 9054 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__); 9055 } 9056 } 9057 if ((rack->r_ctl.rc_rack_tmit_time == 0) || 9058 (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 9059 (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]))) { 9060 /* New more recent rack_tmit_time */ 9061 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 9062 if (rack->r_ctl.rc_rack_tmit_time == 0) 9063 rack->r_ctl.rc_rack_tmit_time = 1; 9064 rack->rc_rack_rtt = t; 9065 } 9066 return (1); 9067 } 9068 /* 9069 * We clear the soft/rxtshift since we got an ack. 9070 * There is no assurance we will call the commit() function 9071 * so we need to clear these to avoid incorrect handling. 9072 */ 9073 tp->t_rxtshift = 0; 9074 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 9075 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 9076 tp->t_softerror = 0; 9077 if (to && (to->to_flags & TOF_TS) && 9078 (ack_type == CUM_ACKED) && 9079 (to->to_tsecr) && 9080 ((rsm->r_flags & RACK_OVERMAX) == 0)) { 9081 /* 9082 * Now which timestamp does it match? In this block the ACK 9083 * must be coming from a previous transmission. 9084 */ 9085 for (i = 0; i < rsm->r_rtr_cnt; i++) { 9086 if (rack_ts_to_msec(rsm->r_tim_lastsent[i]) == to->to_tsecr) { 9087 t = cts - (uint32_t)rsm->r_tim_lastsent[i]; 9088 if ((int)t <= 0) 9089 t = 1; 9090 if (CC_ALGO(tp)->rttsample != NULL) { 9091 /* 9092 * Kick the RTT to the CC, here 9093 * we lie a bit in that we know the 9094 * retransmission is correct even though 9095 * we retransmitted. This is because 9096 * we match the timestamps. 
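 *
 * That is, the echoed timestamp (to->to_tsecr) ties this ACK to
 * one specific transmission of the data, so the usual Karn
 * ambiguity that makes RTT samples from retransmitted segments
 * unusable does not apply here.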
9097 */ 9098 if (TSTMP_GT(tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[i])) 9099 us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[i]; 9100 else 9101 us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[i]; 9102 CC_ALGO(tp)->rttsample(&tp->t_ccv, us_rtt, 1, rsm->r_fas); 9103 } 9104 if ((i + 1) < rsm->r_rtr_cnt) { 9105 /* 9106 * The peer ack'd from our previous 9107 * transmission. We have a spurious 9108 * retransmission and thus we dont 9109 * want to update our rack_rtt. 9110 * 9111 * Hmm should there be a CC revert here? 9112 * 9113 */ 9114 return (0); 9115 } 9116 if (!tp->t_rttlow || tp->t_rttlow > t) 9117 tp->t_rttlow = t; 9118 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 9119 rack->r_ctl.rc_rack_min_rtt = t; 9120 if (rack->r_ctl.rc_rack_min_rtt == 0) { 9121 rack->r_ctl.rc_rack_min_rtt = 1; 9122 } 9123 } 9124 if ((rack->r_ctl.rc_rack_tmit_time == 0) || 9125 (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 9126 (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]))) { 9127 /* New more recent rack_tmit_time */ 9128 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 9129 if (rack->r_ctl.rc_rack_tmit_time == 0) 9130 rack->r_ctl.rc_rack_tmit_time = 1; 9131 rack->rc_rack_rtt = t; 9132 } 9133 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[i], cts, 3); 9134 tcp_rack_xmit_timer(rack, t + 1, len_acked, t, 0, rsm, 9135 rsm->r_rtr_cnt); 9136 return (1); 9137 } 9138 } 9139 /* If we are logging log out the sendmap */ 9140 if (tcp_bblogging_on(rack->rc_tp)) { 9141 for (i = 0; i < rsm->r_rtr_cnt; i++) { 9142 rack_log_rtt_sendmap(rack, i, rsm->r_tim_lastsent[i], to->to_tsecr); 9143 } 9144 } 9145 goto ts_not_found; 9146 } else { 9147 /* 9148 * Ok its a SACK block that we retransmitted. or a windows 9149 * machine without timestamps. We can tell nothing from the 9150 * time-stamp since its not there or the time the peer last 9151 * received a segment that moved forward its cum-ack point. 9152 */ 9153 ts_not_found: 9154 i = rsm->r_rtr_cnt - 1; 9155 t = cts - (uint32_t)rsm->r_tim_lastsent[i]; 9156 if ((int)t <= 0) 9157 t = 1; 9158 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 9159 /* 9160 * We retransmitted and the ack came back in less 9161 * than the smallest rtt we have observed. We most 9162 * likely did an improper retransmit as outlined in 9163 * 6.2 Step 2 point 2 in the rack-draft so we 9164 * don't want to update our rack_rtt. We in 9165 * theory (in future) might want to think about reverting our 9166 * cwnd state but we won't for now. 9167 */ 9168 return (0); 9169 } else if (rack->r_ctl.rc_rack_min_rtt) { 9170 /* 9171 * We retransmitted it and the retransmit did the 9172 * job. 9173 */ 9174 if (!rack->r_ctl.rc_rack_min_rtt || 9175 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 9176 rack->r_ctl.rc_rack_min_rtt = t; 9177 if (rack->r_ctl.rc_rack_min_rtt == 0) { 9178 rack->r_ctl.rc_rack_min_rtt = 1; 9179 } 9180 } 9181 if ((rack->r_ctl.rc_rack_tmit_time == 0) || 9182 (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 9183 (uint32_t)rsm->r_tim_lastsent[i]))) { 9184 /* New more recent rack_tmit_time */ 9185 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[i]; 9186 if (rack->r_ctl.rc_rack_tmit_time == 0) 9187 rack->r_ctl.rc_rack_tmit_time = 1; 9188 rack->rc_rack_rtt = t; 9189 } 9190 return (1); 9191 } 9192 } 9193 return (0); 9194 } 9195 9196 /* 9197 * Mark the SACK_PASSED flag on all entries prior to rsm send wise. 
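 *
 * Illustrative example: if blocks A, B, C and D sit in the tmap
 * in send order and C is the block just SACKed, the walk below
 * starts at C and moves backwards, marking B and then A with
 * RACK_SACK_PASSED; those are the blocks that were sent before
 * something the peer has now reported receiving, which the RACK
 * loss-detection logic later uses as its hint that they may have
 * been lost (subject to the reordering window).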
9198 */ 9199 static void 9200 rack_log_sack_passed(struct tcpcb *tp, 9201 struct tcp_rack *rack, struct rack_sendmap *rsm) 9202 { 9203 struct rack_sendmap *nrsm; 9204 9205 nrsm = rsm; 9206 TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap, 9207 rack_head, r_tnext) { 9208 if (nrsm == rsm) { 9209 /* Skip original segment he is acked */ 9210 continue; 9211 } 9212 if (nrsm->r_flags & RACK_ACKED) { 9213 /* 9214 * Skip ack'd segments, though we 9215 * should not see these, since tmap 9216 * should not have ack'd segments. 9217 */ 9218 continue; 9219 } 9220 if (nrsm->r_flags & RACK_RWND_COLLAPSED) { 9221 /* 9222 * If the peer dropped the rwnd on 9223 * these then we don't worry about them. 9224 */ 9225 continue; 9226 } 9227 if (nrsm->r_flags & RACK_SACK_PASSED) { 9228 /* 9229 * We found one that is already marked 9230 * passed, we have been here before and 9231 * so all others below this are marked. 9232 */ 9233 break; 9234 } 9235 nrsm->r_flags |= RACK_SACK_PASSED; 9236 nrsm->r_flags &= ~RACK_WAS_SACKPASS; 9237 } 9238 } 9239 9240 static void 9241 rack_need_set_test(struct tcpcb *tp, 9242 struct tcp_rack *rack, 9243 struct rack_sendmap *rsm, 9244 tcp_seq th_ack, 9245 int line, 9246 int use_which) 9247 { 9248 struct rack_sendmap *s_rsm; 9249 9250 if ((tp->t_flags & TF_GPUTINPROG) && 9251 SEQ_GEQ(rsm->r_end, tp->gput_seq)) { 9252 /* 9253 * We were app limited, and this ack 9254 * butts up or goes beyond the point where we want 9255 * to start our next measurement. We need 9256 * to record the new gput_ts as here and 9257 * possibly update the start sequence. 9258 */ 9259 uint32_t seq, ts; 9260 9261 if (rsm->r_rtr_cnt > 1) { 9262 /* 9263 * This is a retransmit, can we 9264 * really make any assessment at this 9265 * point? We are not really sure of 9266 * the timestamp, is it this or the 9267 * previous transmission? 9268 * 9269 * Lets wait for something better that 9270 * is not retransmitted. 9271 */ 9272 return; 9273 } 9274 seq = tp->gput_seq; 9275 ts = tp->gput_ts; 9276 rack->app_limited_needs_set = 0; 9277 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 9278 /* Do we start at a new end? */ 9279 if ((use_which == RACK_USE_BEG) && 9280 SEQ_GEQ(rsm->r_start, tp->gput_seq)) { 9281 /* 9282 * When we get an ACK that just eats 9283 * up some of the rsm, we set RACK_USE_BEG 9284 * since whats at r_start (i.e. th_ack) 9285 * is left unacked and thats where the 9286 * measurement now starts. 9287 */ 9288 tp->gput_seq = rsm->r_start; 9289 } 9290 if ((use_which == RACK_USE_END) && 9291 SEQ_GEQ(rsm->r_end, tp->gput_seq)) { 9292 /* 9293 * We use the end when the cumack 9294 * is moving forward and completely 9295 * deleting the rsm passed so basically 9296 * r_end holds th_ack. 9297 * 9298 * For SACK's we also want to use the end 9299 * since this piece just got sacked and 9300 * we want to target anything after that 9301 * in our measurement. 9302 */ 9303 tp->gput_seq = rsm->r_end; 9304 } 9305 if (use_which == RACK_USE_END_OR_THACK) { 9306 /* 9307 * special case for ack moving forward, 9308 * not a sack, we need to move all the 9309 * way up to where this ack cum-ack moves 9310 * to. 9311 */ 9312 if (SEQ_GT(th_ack, rsm->r_end)) 9313 tp->gput_seq = th_ack; 9314 else 9315 tp->gput_seq = rsm->r_end; 9316 } 9317 if (SEQ_LT(tp->gput_seq, tp->snd_max)) 9318 s_rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq); 9319 else 9320 s_rsm = NULL; 9321 /* 9322 * Pick up the correct send time if we can the rsm passed in 9323 * may be equal to s_rsm if the RACK_USE_BEG was set. 
For the other 9324 * two cases (RACK_USE_THACK or RACK_USE_END) most likely we will 9325 * find a different seq i.e. the next send up. 9326 * 9327 * If that has not been sent, s_rsm will be NULL and we must 9328 * arrange it so this function will get called again by setting 9329 * app_limited_needs_set. 9330 */ 9331 if (s_rsm) 9332 rack->r_ctl.rc_gp_output_ts = s_rsm->r_tim_lastsent[0]; 9333 else { 9334 /* If we hit here we have to have *not* sent tp->gput_seq */ 9335 rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[0]; 9336 /* Set it up so we will go through here again */ 9337 rack->app_limited_needs_set = 1; 9338 } 9339 if (SEQ_GT(tp->gput_seq, tp->gput_ack)) { 9340 /* 9341 * We moved beyond this guy's range, re-calculate 9342 * the new end point. 9343 */ 9344 if (rack->rc_gp_filled == 0) { 9345 tp->gput_ack = tp->gput_seq + max(rc_init_window(rack), (MIN_GP_WIN * ctf_fixed_maxseg(tp))); 9346 } else { 9347 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 9348 } 9349 } 9350 /* 9351 * We are moving the goal post, we may be able to clear the 9352 * measure_saw_probe_rtt flag. 9353 */ 9354 if ((rack->in_probe_rtt == 0) && 9355 (rack->measure_saw_probe_rtt) && 9356 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 9357 rack->measure_saw_probe_rtt = 0; 9358 rack_log_pacing_delay_calc(rack, ts, tp->gput_ts, 9359 seq, tp->gput_seq, 9360 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | 9361 (uint64_t)rack->r_ctl.rc_gp_output_ts), 9362 5, line, NULL, 0); 9363 if (rack->rc_gp_filled && 9364 ((tp->gput_ack - tp->gput_seq) < 9365 max(rc_init_window(rack), (MIN_GP_WIN * 9366 ctf_fixed_maxseg(tp))))) { 9367 uint32_t ideal_amount; 9368 9369 ideal_amount = rack_get_measure_window(tp, rack); 9370 if (ideal_amount > sbavail(&tptosocket(tp)->so_snd)) { 9371 /* 9372 * There is no sense of continuing this measurement 9373 * because its too small to gain us anything we 9374 * trust. Skip it and that way we can start a new 9375 * measurement quicker. 9376 */ 9377 tp->t_flags &= ~TF_GPUTINPROG; 9378 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq, 9379 0, 0, 9380 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | 9381 (uint64_t)rack->r_ctl.rc_gp_output_ts), 9382 6, __LINE__, NULL, 0); 9383 } else { 9384 /* 9385 * Reset the window further out. 
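 *
 * That is, keep the measurement alive but stretch gput_ack so
 * the sample again spans a full rack_get_measure_window() worth
 * of data instead of the too-small window detected above.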
9386 */ 9387 tp->gput_ack = tp->gput_seq + ideal_amount; 9388 } 9389 } 9390 rack_tend_gp_marks(tp, rack); 9391 rack_log_gpset(rack, tp->gput_ack, 0, 0, line, 2, rsm); 9392 } 9393 } 9394 9395 static inline int 9396 is_rsm_inside_declared_tlp_block(struct tcp_rack *rack, struct rack_sendmap *rsm) 9397 { 9398 if (SEQ_LT(rsm->r_end, rack->r_ctl.last_tlp_acked_start)) { 9399 /* Behind our TLP definition or right at */ 9400 return (0); 9401 } 9402 if (SEQ_GT(rsm->r_start, rack->r_ctl.last_tlp_acked_end)) { 9403 /* The start is beyond or right at our end of TLP definition */ 9404 return (0); 9405 } 9406 /* It has to be a sub-part of the original TLP recorded */ 9407 return (1); 9408 } 9409 9410 9411 9412 static uint32_t 9413 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, 9414 struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, 9415 int *no_extra, 9416 int *moved_two, uint32_t segsiz) 9417 { 9418 uint32_t start, end, changed = 0; 9419 struct rack_sendmap stack_map; 9420 struct rack_sendmap *rsm, *nrsm, *prev, *next; 9421 int insret __diagused; 9422 int32_t used_ref = 1; 9423 int moved = 0; 9424 #ifdef TCP_SAD_DETECTION 9425 int allow_segsiz; 9426 int first_time_through = 1; 9427 #endif 9428 int noextra = 0; 9429 int can_use_hookery = 0; 9430 9431 start = sack->start; 9432 end = sack->end; 9433 rsm = *prsm; 9434 9435 #ifdef TCP_SAD_DETECTION 9436 /* 9437 * There are a strange number of proxys and meddle boxes in the world 9438 * that seem to cut up segments on different boundaries. This gets us 9439 * smaller sacks that are still ok in terms of it being an attacker. 9440 * We use the base segsiz to calculate an allowable smallness but 9441 * also enforce a min on the segsiz in case it is an attacker playing 9442 * games with MSS. So basically if the sack arrives and it is 9443 * larger than a worse case 960 bytes, we don't classify the guy 9444 * as supicious. 9445 */ 9446 allow_segsiz = max(segsiz, 1200) * sad_seg_size_per; 9447 allow_segsiz /= 1000; 9448 #endif 9449 do_rest_ofb: 9450 if ((rsm == NULL) || 9451 (SEQ_LT(end, rsm->r_start)) || 9452 (SEQ_GEQ(start, rsm->r_end)) || 9453 (SEQ_LT(start, rsm->r_start))) { 9454 /* 9455 * We are not in the right spot, 9456 * find the correct spot in the tree. 9457 */ 9458 used_ref = 0; 9459 rsm = tqhash_find(rack->r_ctl.tqh, start); 9460 moved++; 9461 } 9462 if (rsm == NULL) { 9463 /* TSNH */ 9464 goto out; 9465 } 9466 #ifdef TCP_SAD_DETECTION 9467 /* Now we must check for suspicous activity */ 9468 if ((first_time_through == 1) && 9469 ((end - start) < min((rsm->r_end - rsm->r_start), allow_segsiz)) && 9470 ((rsm->r_flags & RACK_PMTU_CHG) == 0) && 9471 ((rsm->r_flags & RACK_TLP) == 0)) { 9472 /* 9473 * Its less than a full MSS or the segment being acked 9474 * this should only happen if the rsm in question had the 9475 * r_just_ret flag set <and> the end matches the end of 9476 * the rsm block. 9477 * 9478 * Note we do not look at segments that have had TLP's on 9479 * them since we can get un-reported rwnd collapses that 9480 * basically we TLP on and then we get back a sack block 9481 * that goes from the start to only a small way. 
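 *
 * As a worked example with an assumed setting: if segsiz is 1448
 * and sad_seg_size_per is 800 (80%, which lines up with the
 * 960-byte worst case mentioned above, since 1200 * 800 / 1000
 * == 960), allow_segsiz comes out to 1448 * 800 / 1000 = 1158,
 * so only SACKed chunks smaller than both that and the rsm
 * itself start the suspicion checks below.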
9482 * 9483 */ 9484 int loss, ok; 9485 9486 ok = 0; 9487 if (SEQ_GEQ(end, rsm->r_end)) { 9488 if (rsm->r_just_ret == 1) { 9489 /* This was at the end of a send which is ok */ 9490 ok = 1; 9491 } else { 9492 /* A bit harder was it the end of our segment */ 9493 int segs, len; 9494 9495 len = (rsm->r_end - rsm->r_start); 9496 segs = len / segsiz; 9497 segs *= segsiz; 9498 if ((segs + (rsm->r_end - start)) == len) { 9499 /* 9500 * So this last bit was the 9501 * end of our send if we cut it 9502 * up into segsiz pieces so its ok. 9503 */ 9504 ok = 1; 9505 } 9506 } 9507 } 9508 if (ok == 0) { 9509 /* 9510 * This guy is doing something suspicious 9511 * lets start detection. 9512 */ 9513 if (rack->rc_suspicious == 0) { 9514 tcp_trace_point(rack->rc_tp, TCP_TP_SAD_SUSPECT); 9515 counter_u64_add(rack_sack_attacks_suspect, 1); 9516 rack->rc_suspicious = 1; 9517 rack_log_sad(rack, 4); 9518 if (tcp_bblogging_on(rack->rc_tp)) { 9519 union tcp_log_stackspecific log; 9520 struct timeval tv; 9521 9522 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 9523 log.u_bbr.flex1 = end; 9524 log.u_bbr.flex2 = start; 9525 log.u_bbr.flex3 = rsm->r_end; 9526 log.u_bbr.flex4 = rsm->r_start; 9527 log.u_bbr.flex5 = segsiz; 9528 log.u_bbr.flex6 = rsm->r_fas; 9529 log.u_bbr.flex7 = rsm->r_bas; 9530 log.u_bbr.flex8 = 5; 9531 log.u_bbr.pkts_out = rsm->r_flags; 9532 log.u_bbr.bbr_state = rack->rc_suspicious; 9533 log.u_bbr.bbr_substate = rsm->r_just_ret; 9534 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 9535 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 9536 TCP_LOG_EVENTP(rack->rc_tp, NULL, 9537 &rack->rc_inp->inp_socket->so_rcv, 9538 &rack->rc_inp->inp_socket->so_snd, 9539 TCP_SAD_DETECTION, 0, 9540 0, &log, false, &tv); 9541 } 9542 } 9543 /* You loose some ack count every time you sack 9544 * a small bit that is not butting to the end of 9545 * what we have sent. This is because we never 9546 * send small bits unless its the end of the sb. 9547 * Anyone sending a sack that is not at the end 9548 * is thus very very suspicious. 9549 */ 9550 loss = (segsiz/2) / (end - start); 9551 if (loss < rack->r_ctl.ack_count) 9552 rack->r_ctl.ack_count -= loss; 9553 else 9554 rack->r_ctl.ack_count = 0; 9555 } 9556 } 9557 first_time_through = 0; 9558 #endif 9559 /* Ok we have an ACK for some piece of this rsm */ 9560 if (rsm->r_start != start) { 9561 if ((rsm->r_flags & RACK_ACKED) == 0) { 9562 /* 9563 * Before any splitting or hookery is 9564 * done is it a TLP of interest i.e. rxt? 9565 */ 9566 if ((rsm->r_flags & RACK_TLP) && 9567 (rsm->r_rtr_cnt > 1)) { 9568 /* 9569 * We are splitting a rxt TLP, check 9570 * if we need to save off the start/end 9571 */ 9572 if (rack->rc_last_tlp_acked_set && 9573 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 9574 /* 9575 * We already turned this on since we are inside 9576 * the previous one was a partially sack now we 9577 * are getting another one (maybe all of it). 9578 * 9579 */ 9580 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 9581 /* 9582 * Lets make sure we have all of it though. 
9583 */ 9584 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 9585 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 9586 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 9587 rack->r_ctl.last_tlp_acked_end); 9588 } 9589 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 9590 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 9591 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 9592 rack->r_ctl.last_tlp_acked_end); 9593 } 9594 } else { 9595 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 9596 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 9597 rack->rc_last_tlp_past_cumack = 0; 9598 rack->rc_last_tlp_acked_set = 1; 9599 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 9600 } 9601 } 9602 /** 9603 * Need to split this in two pieces the before and after, 9604 * the before remains in the map, the after must be 9605 * added. In other words we have: 9606 * rsm |--------------| 9607 * sackblk |-------> 9608 * rsm will become 9609 * rsm |---| 9610 * and nrsm will be the sacked piece 9611 * nrsm |----------| 9612 * 9613 * But before we start down that path lets 9614 * see if the sack spans over on top of 9615 * the next guy and it is already sacked. 9616 * 9617 */ 9618 /* 9619 * Hookery can only be used if the two entries 9620 * are in the same bucket and neither one of 9621 * them staddle the bucket line. 9622 */ 9623 next = tqhash_next(rack->r_ctl.tqh, rsm); 9624 if (next && 9625 (rsm->bindex == next->bindex) && 9626 ((rsm->r_flags & RACK_STRADDLE) == 0) && 9627 ((next->r_flags & RACK_STRADDLE) == 0) && 9628 (rsm->r_flags & RACK_IN_GP_WIN) && 9629 (next->r_flags & RACK_IN_GP_WIN)) 9630 can_use_hookery = 1; 9631 else if (next && 9632 (rsm->bindex == next->bindex) && 9633 ((rsm->r_flags & RACK_STRADDLE) == 0) && 9634 ((next->r_flags & RACK_STRADDLE) == 0) && 9635 ((rsm->r_flags & RACK_IN_GP_WIN) == 0) && 9636 ((next->r_flags & RACK_IN_GP_WIN) == 0)) 9637 can_use_hookery = 1; 9638 else 9639 can_use_hookery = 0; 9640 if (next && can_use_hookery && 9641 (next->r_flags & RACK_ACKED) && 9642 SEQ_GEQ(end, next->r_start)) { 9643 /** 9644 * So the next one is already acked, and 9645 * we can thus by hookery use our stack_map 9646 * to reflect the piece being sacked and 9647 * then adjust the two tree entries moving 9648 * the start and ends around. So we start like: 9649 * rsm |------------| (not-acked) 9650 * next |-----------| (acked) 9651 * sackblk |--------> 9652 * We want to end like so: 9653 * rsm |------| (not-acked) 9654 * next |-----------------| (acked) 9655 * nrsm |-----| 9656 * Where nrsm is a temporary stack piece we 9657 * use to update all the gizmos. 9658 */ 9659 /* Copy up our fudge block */ 9660 noextra++; 9661 nrsm = &stack_map; 9662 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 9663 /* Now adjust our tree blocks */ 9664 rsm->r_end = start; 9665 next->r_start = start; 9666 rsm->r_flags |= RACK_SHUFFLED; 9667 next->r_flags |= RACK_SHUFFLED; 9668 /* Now we must adjust back where next->m is */ 9669 rack_setup_offset_for_rsm(rack, rsm, next); 9670 /* 9671 * Which timestamp do we keep? It is rather 9672 * important in GP measurements to have the 9673 * accurate end of the send window. 9674 * 9675 * We keep the largest value, which is the newest 9676 * send. We do this in case a segment that is 9677 * joined together and not part of a GP estimate 9678 * later gets expanded into the GP estimate. 9679 * 9680 * We prohibit the merging of unlike kinds i.e. 
9681 * all pieces that are in the GP estimate can be 9682 * merged and all pieces that are not in a GP estimate 9683 * can be merged, but not disimilar pieces. Combine 9684 * this with taking the highest here and we should 9685 * be ok unless of course the client reneges. Then 9686 * all bets are off. 9687 */ 9688 if (next->r_tim_lastsent[(next->r_rtr_cnt-1)] < 9689 nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)]) 9690 next->r_tim_lastsent[(next->r_rtr_cnt-1)] = nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)]; 9691 /* 9692 * And we must keep the newest ack arrival time. 9693 */ 9694 if (next->r_ack_arrival < 9695 rack_to_usec_ts(&rack->r_ctl.act_rcv_time)) 9696 next->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 9697 9698 9699 /* We don't need to adjust rsm, it did not change */ 9700 /* Clear out the dup ack count of the remainder */ 9701 rsm->r_dupack = 0; 9702 rsm->r_just_ret = 0; 9703 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 9704 /* Now lets make sure our fudge block is right */ 9705 nrsm->r_start = start; 9706 /* Now lets update all the stats and such */ 9707 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); 9708 if (rack->app_limited_needs_set) 9709 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); 9710 changed += (nrsm->r_end - nrsm->r_start); 9711 /* You get a count for acking a whole segment or more */ 9712 if ((nrsm->r_end - nrsm->r_start) >= segsiz) 9713 rack->r_ctl.ack_count += ((nrsm->r_end - nrsm->r_start) / segsiz); 9714 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 9715 if (nrsm->r_flags & RACK_SACK_PASSED) { 9716 rack->r_ctl.rc_reorder_ts = cts; 9717 if (rack->r_ctl.rc_reorder_ts == 0) 9718 rack->r_ctl.rc_reorder_ts = 1; 9719 } 9720 /* 9721 * Now we want to go up from rsm (the 9722 * one left un-acked) to the next one 9723 * in the tmap. We do this so when 9724 * we walk backwards we include marking 9725 * sack-passed on rsm (The one passed in 9726 * is skipped since it is generally called 9727 * on something sacked before removing it 9728 * from the tmap). 9729 */ 9730 if (rsm->r_in_tmap) { 9731 nrsm = TAILQ_NEXT(rsm, r_tnext); 9732 /* 9733 * Now that we have the next 9734 * one walk backwards from there. 9735 */ 9736 if (nrsm && nrsm->r_in_tmap) 9737 rack_log_sack_passed(tp, rack, nrsm); 9738 } 9739 /* Now are we done? */ 9740 if (SEQ_LT(end, next->r_end) || 9741 (end == next->r_end)) { 9742 /* Done with block */ 9743 goto out; 9744 } 9745 rack_log_map_chg(tp, rack, &stack_map, rsm, next, MAP_SACK_M1, end, __LINE__); 9746 counter_u64_add(rack_sack_used_next_merge, 1); 9747 /* Postion for the next block */ 9748 start = next->r_end; 9749 rsm = tqhash_next(rack->r_ctl.tqh, next); 9750 if (rsm == NULL) 9751 goto out; 9752 } else { 9753 /** 9754 * We can't use any hookery here, so we 9755 * need to split the map. We enter like 9756 * so: 9757 * rsm |--------| 9758 * sackblk |-----> 9759 * We will add the new block nrsm and 9760 * that will be the new portion, and then 9761 * fall through after reseting rsm. So we 9762 * split and look like this: 9763 * rsm |----| 9764 * sackblk |-----> 9765 * nrsm |---| 9766 * We then fall through reseting 9767 * rsm to nrsm, so the next block 9768 * picks it up. 9769 */ 9770 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 9771 if (nrsm == NULL) { 9772 /* 9773 * failed XXXrrs what can we do but loose the sack 9774 * info? 
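 *
 * Dropping the SACK information on an allocation failure should
 * be safe, just wasteful: the range simply stays un-SACKed in
 * the map, so at worst it gets retransmitted later even though
 * the peer already holds it.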
9775 */ 9776 goto out; 9777 } 9778 counter_u64_add(rack_sack_splits, 1); 9779 rack_clone_rsm(rack, nrsm, rsm, start); 9780 moved++; 9781 rsm->r_just_ret = 0; 9782 #ifndef INVARIANTS 9783 (void)tqhash_insert(rack->r_ctl.tqh, nrsm); 9784 #else 9785 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { 9786 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p", 9787 nrsm, insret, rack, rsm); 9788 } 9789 #endif 9790 if (rsm->r_in_tmap) { 9791 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 9792 nrsm->r_in_tmap = 1; 9793 } 9794 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M2, end, __LINE__); 9795 rsm->r_flags &= (~RACK_HAS_FIN); 9796 /* Position us to point to the new nrsm that starts the sack blk */ 9797 rsm = nrsm; 9798 } 9799 } else { 9800 /* Already sacked this piece */ 9801 counter_u64_add(rack_sack_skipped_acked, 1); 9802 moved++; 9803 if (end == rsm->r_end) { 9804 /* Done with block */ 9805 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 9806 goto out; 9807 } else if (SEQ_LT(end, rsm->r_end)) { 9808 /* A partial sack to a already sacked block */ 9809 moved++; 9810 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 9811 goto out; 9812 } else { 9813 /* 9814 * The end goes beyond this guy 9815 * reposition the start to the 9816 * next block. 9817 */ 9818 start = rsm->r_end; 9819 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 9820 if (rsm == NULL) 9821 goto out; 9822 } 9823 } 9824 } 9825 if (SEQ_GEQ(end, rsm->r_end)) { 9826 /** 9827 * The end of this block is either beyond this guy or right 9828 * at this guy. I.e.: 9829 * rsm --- |-----| 9830 * end |-----| 9831 * <or> 9832 * end |---------| 9833 */ 9834 if ((rsm->r_flags & RACK_ACKED) == 0) { 9835 /* 9836 * Is it a TLP of interest? 9837 */ 9838 if ((rsm->r_flags & RACK_TLP) && 9839 (rsm->r_rtr_cnt > 1)) { 9840 /* 9841 * We are splitting a rxt TLP, check 9842 * if we need to save off the start/end 9843 */ 9844 if (rack->rc_last_tlp_acked_set && 9845 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 9846 /* 9847 * We already turned this on since we are inside 9848 * the previous one was a partially sack now we 9849 * are getting another one (maybe all of it). 9850 */ 9851 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 9852 /* 9853 * Lets make sure we have all of it though. 9854 */ 9855 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 9856 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 9857 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 9858 rack->r_ctl.last_tlp_acked_end); 9859 } 9860 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 9861 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 9862 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 9863 rack->r_ctl.last_tlp_acked_end); 9864 } 9865 } else { 9866 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 9867 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 9868 rack->rc_last_tlp_past_cumack = 0; 9869 rack->rc_last_tlp_acked_set = 1; 9870 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 9871 } 9872 } 9873 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); 9874 changed += (rsm->r_end - rsm->r_start); 9875 /* You get a count for acking a whole segment or more */ 9876 if ((rsm->r_end - rsm->r_start) >= segsiz) 9877 rack->r_ctl.ack_count += ((rsm->r_end - rsm->r_start) / segsiz); 9878 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 9879 if (rsm->r_in_tmap) /* should be true */ 9880 rack_log_sack_passed(tp, rack, rsm); 9881 /* Is Reordering occuring? 
*/ 9882 if (rsm->r_flags & RACK_SACK_PASSED) { 9883 rsm->r_flags &= ~RACK_SACK_PASSED; 9884 rack->r_ctl.rc_reorder_ts = cts; 9885 if (rack->r_ctl.rc_reorder_ts == 0) 9886 rack->r_ctl.rc_reorder_ts = 1; 9887 } 9888 if (rack->app_limited_needs_set) 9889 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); 9890 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 9891 rsm->r_flags |= RACK_ACKED; 9892 if (rsm->r_in_tmap) { 9893 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 9894 rsm->r_in_tmap = 0; 9895 } 9896 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_SACK_M3, end, __LINE__); 9897 } else { 9898 counter_u64_add(rack_sack_skipped_acked, 1); 9899 moved++; 9900 } 9901 if (end == rsm->r_end) { 9902 /* This block only - done, setup for next */ 9903 goto out; 9904 } 9905 /* 9906 * There is more not coverend by this rsm move on 9907 * to the next block in the tail queue hash table. 9908 */ 9909 nrsm = tqhash_next(rack->r_ctl.tqh, rsm); 9910 start = rsm->r_end; 9911 rsm = nrsm; 9912 if (rsm == NULL) 9913 goto out; 9914 goto do_rest_ofb; 9915 } 9916 /** 9917 * The end of this sack block is smaller than 9918 * our rsm i.e.: 9919 * rsm --- |-----| 9920 * end |--| 9921 */ 9922 if ((rsm->r_flags & RACK_ACKED) == 0) { 9923 /* 9924 * Is it a TLP of interest? 9925 */ 9926 if ((rsm->r_flags & RACK_TLP) && 9927 (rsm->r_rtr_cnt > 1)) { 9928 /* 9929 * We are splitting a rxt TLP, check 9930 * if we need to save off the start/end 9931 */ 9932 if (rack->rc_last_tlp_acked_set && 9933 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 9934 /* 9935 * We already turned this on since we are inside 9936 * the previous one was a partially sack now we 9937 * are getting another one (maybe all of it). 9938 */ 9939 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 9940 /* 9941 * Lets make sure we have all of it though. 9942 */ 9943 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 9944 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 9945 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 9946 rack->r_ctl.last_tlp_acked_end); 9947 } 9948 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 9949 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 9950 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 9951 rack->r_ctl.last_tlp_acked_end); 9952 } 9953 } else { 9954 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 9955 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 9956 rack->rc_last_tlp_past_cumack = 0; 9957 rack->rc_last_tlp_acked_set = 1; 9958 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 9959 } 9960 } 9961 /* 9962 * Hookery can only be used if the two entries 9963 * are in the same bucket and neither one of 9964 * them staddle the bucket line. 
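 * In addition both entries must agree on RACK_IN_GP_WIN, i.e.
 * both lie inside the goodput measurement window or both lie
 * outside it, so that a later merge can never mix GP and non-GP
 * data (see the checks on prev below).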
9965 */ 9966 prev = tqhash_prev(rack->r_ctl.tqh, rsm); 9967 if (prev && 9968 (rsm->bindex == prev->bindex) && 9969 ((rsm->r_flags & RACK_STRADDLE) == 0) && 9970 ((prev->r_flags & RACK_STRADDLE) == 0) && 9971 (rsm->r_flags & RACK_IN_GP_WIN) && 9972 (prev->r_flags & RACK_IN_GP_WIN)) 9973 can_use_hookery = 1; 9974 else if (prev && 9975 (rsm->bindex == prev->bindex) && 9976 ((rsm->r_flags & RACK_STRADDLE) == 0) && 9977 ((prev->r_flags & RACK_STRADDLE) == 0) && 9978 ((rsm->r_flags & RACK_IN_GP_WIN) == 0) && 9979 ((prev->r_flags & RACK_IN_GP_WIN) == 0)) 9980 can_use_hookery = 1; 9981 else 9982 can_use_hookery = 0; 9983 9984 if (prev && can_use_hookery && 9985 (prev->r_flags & RACK_ACKED)) { 9986 /** 9987 * Goal, we want the right remainder of rsm to shrink 9988 * in place and span from (rsm->r_start = end) to rsm->r_end. 9989 * We want to expand prev to go all the way 9990 * to prev->r_end <- end. 9991 * so in the tree we have before: 9992 * prev |--------| (acked) 9993 * rsm |-------| (non-acked) 9994 * sackblk |-| 9995 * We churn it so we end up with 9996 * prev |----------| (acked) 9997 * rsm |-----| (non-acked) 9998 * nrsm |-| (temporary) 9999 * 10000 * Note if either prev/rsm is a TLP we don't 10001 * do this. 10002 */ 10003 noextra++; 10004 nrsm = &stack_map; 10005 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 10006 prev->r_end = end; 10007 rsm->r_start = end; 10008 rsm->r_flags |= RACK_SHUFFLED; 10009 prev->r_flags |= RACK_SHUFFLED; 10010 /* Now adjust nrsm (stack copy) to be 10011 * the one that is the small 10012 * piece that was "sacked". 10013 */ 10014 nrsm->r_end = end; 10015 rsm->r_dupack = 0; 10016 /* 10017 * Which timestamp do we keep? It is rather 10018 * important in GP measurements to have the 10019 * accurate end of the send window. 10020 * 10021 * We keep the largest value, which is the newest 10022 * send. We do this in case a segment that is 10023 * joined together and not part of a GP estimate 10024 * later gets expanded into the GP estimate. 10025 * 10026 * We prohibit the merging of unlike kinds i.e. 10027 * all pieces that are in the GP estimate can be 10028 * merged and all pieces that are not in a GP estimate 10029 * can be merged, but not disimilar pieces. Combine 10030 * this with taking the highest here and we should 10031 * be ok unless of course the client reneges. Then 10032 * all bets are off. 10033 */ 10034 if(prev->r_tim_lastsent[(prev->r_rtr_cnt-1)] < 10035 nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)]) { 10036 prev->r_tim_lastsent[(prev->r_rtr_cnt-1)] = nrsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 10037 } 10038 /* 10039 * And we must keep the newest ack arrival time. 10040 */ 10041 10042 if(prev->r_ack_arrival < 10043 rack_to_usec_ts(&rack->r_ctl.act_rcv_time)) 10044 prev->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 10045 10046 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 10047 /* 10048 * Now that the rsm has had its start moved forward 10049 * lets go ahead and get its new place in the world. 10050 */ 10051 rack_setup_offset_for_rsm(rack, prev, rsm); 10052 /* 10053 * Now nrsm is our new little piece 10054 * that is acked (which was merged 10055 * to prev). Update the rtt and changed 10056 * based on that. Also check for reordering. 
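 * For example, with prev covering [100, 200) (acked), rsm
 * covering [200, 300) and a SACK block ending at 250: prev
 * grew to [100, 250), rsm shrank to [250, 300), and the stack
 * copy nrsm describes the newly sacked piece [200, 250) that we
 * account for here.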
10057 */ 10058 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); 10059 if (rack->app_limited_needs_set) 10060 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); 10061 changed += (nrsm->r_end - nrsm->r_start); 10062 /* You get a count for acking a whole segment or more */ 10063 if ((nrsm->r_end - nrsm->r_start) >= segsiz) 10064 rack->r_ctl.ack_count += ((nrsm->r_end - nrsm->r_start) / segsiz); 10065 10066 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 10067 if (nrsm->r_flags & RACK_SACK_PASSED) { 10068 rack->r_ctl.rc_reorder_ts = cts; 10069 if (rack->r_ctl.rc_reorder_ts == 0) 10070 rack->r_ctl.rc_reorder_ts = 1; 10071 } 10072 rack_log_map_chg(tp, rack, prev, &stack_map, rsm, MAP_SACK_M4, end, __LINE__); 10073 rsm = prev; 10074 counter_u64_add(rack_sack_used_prev_merge, 1); 10075 } else { 10076 /** 10077 * This is the case where our previous 10078 * block is not acked either, so we must 10079 * split the block in two. 10080 */ 10081 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 10082 if (nrsm == NULL) { 10083 /* failed rrs what can we do but loose the sack info? */ 10084 goto out; 10085 } 10086 if ((rsm->r_flags & RACK_TLP) && 10087 (rsm->r_rtr_cnt > 1)) { 10088 /* 10089 * We are splitting a rxt TLP, check 10090 * if we need to save off the start/end 10091 */ 10092 if (rack->rc_last_tlp_acked_set && 10093 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 10094 /* 10095 * We already turned this on since this block is inside 10096 * the previous one was a partially sack now we 10097 * are getting another one (maybe all of it). 10098 */ 10099 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 10100 /* 10101 * Lets make sure we have all of it though. 10102 */ 10103 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 10104 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 10105 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 10106 rack->r_ctl.last_tlp_acked_end); 10107 } 10108 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 10109 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 10110 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 10111 rack->r_ctl.last_tlp_acked_end); 10112 } 10113 } else { 10114 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 10115 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 10116 rack->rc_last_tlp_acked_set = 1; 10117 rack->rc_last_tlp_past_cumack = 0; 10118 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 10119 } 10120 } 10121 /** 10122 * In this case nrsm becomes 10123 * nrsm->r_start = end; 10124 * nrsm->r_end = rsm->r_end; 10125 * which is un-acked. 10126 * <and> 10127 * rsm->r_end = nrsm->r_start; 10128 * i.e. the remaining un-acked 10129 * piece is left on the left 10130 * hand side. 
10131 *
10132 * So we start like this
10133 * rsm |----------| (not acked)
10134 * sackblk |---|
10135 * build it so we have
10136 * rsm |---| (acked)
10137 * nrsm |------| (not acked)
10138 */
10139 counter_u64_add(rack_sack_splits, 1);
10140 rack_clone_rsm(rack, nrsm, rsm, end);
10141 moved++;
10142 rsm->r_flags &= (~RACK_HAS_FIN);
10143 rsm->r_just_ret = 0;
10144 #ifndef INVARIANTS
10145 (void)tqhash_insert(rack->r_ctl.tqh, nrsm);
10146 #else
10147 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) {
10148 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p",
10149 nrsm, insret, rack, rsm);
10150 }
10151 #endif
10152 if (rsm->r_in_tmap) {
10153 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
10154 nrsm->r_in_tmap = 1;
10155 }
10156 nrsm->r_dupack = 0;
10157 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2);
10158 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0);
10159 changed += (rsm->r_end - rsm->r_start);
10160 /* You get a count for acking a whole segment or more */
10161 if ((rsm->r_end - rsm->r_start) >= segsiz)
10162 rack->r_ctl.ack_count += ((rsm->r_end - rsm->r_start) / segsiz);
10163
10164 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
10165 if (rsm->r_in_tmap) /* should be true */
10166 rack_log_sack_passed(tp, rack, rsm);
10167 /* Is reordering occurring? */
10168 if (rsm->r_flags & RACK_SACK_PASSED) {
10169 rsm->r_flags &= ~RACK_SACK_PASSED;
10170 rack->r_ctl.rc_reorder_ts = cts;
10171 if (rack->r_ctl.rc_reorder_ts == 0)
10172 rack->r_ctl.rc_reorder_ts = 1;
10173 }
10174 if (rack->app_limited_needs_set)
10175 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END);
10176 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
10177 rsm->r_flags |= RACK_ACKED;
10178 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M5, end, __LINE__);
10179 if (rsm->r_in_tmap) {
10180 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
10181 rsm->r_in_tmap = 0;
10182 }
10183 }
10184 } else if (start != end) {
10185 /*
10186 * The block was already acked.
10187 */
10188 counter_u64_add(rack_sack_skipped_acked, 1);
10189 moved++;
10190 }
10191 out:
10192 if (rsm &&
10193 ((rsm->r_flags & RACK_TLP) == 0) &&
10194 (rsm->r_flags & RACK_ACKED)) {
10195 /*
10196 * Now can we merge where we worked
10197 * with either the previous or
10198 * next block?
10199 */
10200 next = tqhash_next(rack->r_ctl.tqh, rsm);
10201 while (next) {
10202 if (next->r_flags & RACK_TLP)
10203 break;
10204 /* Only allow merges between ones in or out of GP window */
10205 if ((next->r_flags & RACK_IN_GP_WIN) &&
10206 ((rsm->r_flags & RACK_IN_GP_WIN) == 0)) {
10207 break;
10208 }
10209 if ((rsm->r_flags & RACK_IN_GP_WIN) &&
10210 ((next->r_flags & RACK_IN_GP_WIN) == 0)) {
10211 break;
10212 }
10213 if (rsm->bindex != next->bindex)
10214 break;
10215 if (rsm->r_flags & RACK_STRADDLE)
10216 break;
10217 if (next->r_flags & RACK_STRADDLE)
10218 break;
10219 if (next->r_flags & RACK_ACKED) {
10220 /* yep this and next can be merged */
10221 rsm = rack_merge_rsm(rack, rsm, next);
10222 noextra++;
10223 next = tqhash_next(rack->r_ctl.tqh, rsm);
10224 } else
10225 break;
10226 }
10227 /* Now what about the previous?
*/ 10228 prev = tqhash_prev(rack->r_ctl.tqh, rsm); 10229 while (prev) { 10230 if (prev->r_flags & RACK_TLP) 10231 break; 10232 /* Only allow merges between ones in or out of GP window */ 10233 if ((prev->r_flags & RACK_IN_GP_WIN) && 10234 ((rsm->r_flags & RACK_IN_GP_WIN) == 0)) { 10235 break; 10236 } 10237 if ((rsm->r_flags & RACK_IN_GP_WIN) && 10238 ((prev->r_flags & RACK_IN_GP_WIN) == 0)) { 10239 break; 10240 } 10241 if (rsm->bindex != prev->bindex) 10242 break; 10243 if (rsm->r_flags & RACK_STRADDLE) 10244 break; 10245 if (prev->r_flags & RACK_STRADDLE) 10246 break; 10247 if (prev->r_flags & RACK_ACKED) { 10248 /* yep the previous and this can be merged */ 10249 rsm = rack_merge_rsm(rack, prev, rsm); 10250 noextra++; 10251 prev = tqhash_prev(rack->r_ctl.tqh, rsm); 10252 } else 10253 break; 10254 } 10255 } 10256 if (used_ref == 0) { 10257 counter_u64_add(rack_sack_proc_all, 1); 10258 } else { 10259 counter_u64_add(rack_sack_proc_short, 1); 10260 } 10261 /* Save off the next one for quick reference. */ 10262 nrsm = tqhash_find(rack->r_ctl.tqh, end); 10263 *prsm = rack->r_ctl.rc_sacklast = nrsm; 10264 /* Pass back the moved. */ 10265 *moved_two = moved; 10266 *no_extra = noextra; 10267 return (changed); 10268 } 10269 10270 static void inline 10271 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack) 10272 { 10273 struct rack_sendmap *tmap; 10274 10275 tmap = NULL; 10276 while (rsm && (rsm->r_flags & RACK_ACKED)) { 10277 /* Its no longer sacked, mark it so */ 10278 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 10279 #ifdef INVARIANTS 10280 if (rsm->r_in_tmap) { 10281 panic("rack:%p rsm:%p flags:0x%x in tmap?", 10282 rack, rsm, rsm->r_flags); 10283 } 10284 #endif 10285 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS); 10286 /* Rebuild it into our tmap */ 10287 if (tmap == NULL) { 10288 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 10289 tmap = rsm; 10290 } else { 10291 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext); 10292 tmap = rsm; 10293 } 10294 tmap->r_in_tmap = 1; 10295 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 10296 } 10297 /* 10298 * Now lets possibly clear the sack filter so we start 10299 * recognizing sacks that cover this area. 10300 */ 10301 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack); 10302 10303 } 10304 10305 static void 10306 rack_do_decay(struct tcp_rack *rack) 10307 { 10308 struct timeval res; 10309 10310 #define timersub(tvp, uvp, vvp) \ 10311 do { \ 10312 (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ 10313 (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ 10314 if ((vvp)->tv_usec < 0) { \ 10315 (vvp)->tv_sec--; \ 10316 (vvp)->tv_usec += 1000000; \ 10317 } \ 10318 } while (0) 10319 10320 timersub(&rack->r_ctl.act_rcv_time, &rack->r_ctl.rc_last_time_decay, &res); 10321 #undef timersub 10322 10323 rack->r_ctl.input_pkt++; 10324 if ((rack->rc_in_persist) || 10325 (res.tv_sec >= 1) || 10326 (rack->rc_tp->snd_max == rack->rc_tp->snd_una)) { 10327 /* 10328 * Check for decay of non-SAD, 10329 * we want all SAD detection metrics to 10330 * decay 1/4 per second (or more) passed. 10331 * Current default is 800 so it decays 10332 * 80% every second. 
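 * (Illustration, assuming ctf_decay_count() scales a counter by
 * val/1000: with the default of 800 an ack_count of 1000 becomes
 * 800 after one interval, i.e. the counters decay to roughly 80%
 * of their previous value each second.)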
10333 */ 10334 #ifdef TCP_SAD_DETECTION 10335 uint32_t pkt_delta; 10336 10337 pkt_delta = rack->r_ctl.input_pkt - rack->r_ctl.saved_input_pkt; 10338 #endif 10339 /* Update our saved tracking values */ 10340 rack->r_ctl.saved_input_pkt = rack->r_ctl.input_pkt; 10341 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; 10342 /* Now do we escape without decay? */ 10343 #ifdef TCP_SAD_DETECTION 10344 if (rack->rc_in_persist || 10345 (rack->rc_tp->snd_max == rack->rc_tp->snd_una) || 10346 (pkt_delta < tcp_sad_low_pps)){ 10347 /* 10348 * We don't decay idle connections 10349 * or ones that have a low input pps. 10350 */ 10351 return; 10352 } 10353 /* Decay the counters */ 10354 rack->r_ctl.ack_count = ctf_decay_count(rack->r_ctl.ack_count, 10355 tcp_sad_decay_val); 10356 rack->r_ctl.sack_count = ctf_decay_count(rack->r_ctl.sack_count, 10357 tcp_sad_decay_val); 10358 rack->r_ctl.sack_moved_extra = ctf_decay_count(rack->r_ctl.sack_moved_extra, 10359 tcp_sad_decay_val); 10360 rack->r_ctl.sack_noextra_move = ctf_decay_count(rack->r_ctl.sack_noextra_move, 10361 tcp_sad_decay_val); 10362 #endif 10363 } 10364 } 10365 10366 static void inline 10367 rack_rsm_sender_update(struct tcp_rack *rack, struct tcpcb *tp, struct rack_sendmap *rsm, uint8_t from) 10368 { 10369 /* 10370 * We look at advancing the end send time for our GP 10371 * measurement tracking only as the cumulative acknowledgment 10372 * moves forward. You might wonder about this, why not 10373 * at every transmission or retransmission within the 10374 * GP window update the rc_gp_cumack_ts? Well its rather 10375 * nuanced but basically the GP window *may* expand (as 10376 * it does below) or worse and harder to track it may shrink. 10377 * 10378 * This last makes it impossible to track at the time of 10379 * the send, since you may set forward your rc_gp_cumack_ts 10380 * when you send, because that send *is* in your currently 10381 * "guessed" window, but then it shrinks. Now which was 10382 * the send time of the last bytes in the window, by the 10383 * time you ask that question that part of the sendmap 10384 * is freed. So you don't know and you will have too 10385 * long of send window. Instead by updating the time 10386 * marker only when the cumack advances this assures us 10387 * that we will have only the sends in the window of our 10388 * GP measurement. 10389 * 10390 * Another complication from this is the 10391 * merging of sendmap entries. During SACK processing this 10392 * can happen to conserve the sendmap size. That breaks 10393 * everything down in tracking the send window of the GP 10394 * estimate. So to prevent that and keep it working with 10395 * a tiny bit more limited merging, we only allow like 10396 * types to be merged. I.e. if two sends are in the GP window 10397 * then its ok to merge them together. If two sends are not 10398 * in the GP window its ok to merge them together too. Though 10399 * one send in and one send out cannot be merged. We combine 10400 * this with never allowing the shrinking of the GP window when 10401 * we are in recovery so that we can properly calculate the 10402 * sending times. 10403 * 10404 * This all of course seems complicated, because it is.. :) 10405 * 10406 * The cum-ack is being advanced upon the sendmap. 10407 * If we are not doing a GP estimate don't 10408 * proceed. 
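 * Concretely: the rsm being cum-acked below may push tp->gput_ack
 * (the end of the measurement range) forward, and if its last
 * send time is newer than rc_gp_cumack_ts and it lies inside the
 * GP window, that send time becomes the new end-of-send marker
 * for the measurement.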
10409 */ 10410 uint64_t ts; 10411 10412 if ((tp->t_flags & TF_GPUTINPROG) == 0) 10413 return; 10414 /* 10415 * If this sendmap entry is going 10416 * beyond the measurement window we had picked, 10417 * expand the measurement window by that much. 10418 */ 10419 if (SEQ_GT(rsm->r_end, tp->gput_ack)) { 10420 tp->gput_ack = rsm->r_end; 10421 } 10422 /* 10423 * If we have not setup a ack, then we 10424 * have no idea if the newly acked pieces 10425 * will be "in our seq measurement range". If 10426 * it is when we clear the app_limited_needs_set 10427 * flag the timestamp will be updated. 10428 */ 10429 if (rack->app_limited_needs_set) 10430 return; 10431 /* 10432 * Finally, we grab out the latest timestamp 10433 * that this packet was sent and then see 10434 * if: 10435 * a) The packet touches are newly defined GP range. 10436 * b) The time is greater than (newer) than the 10437 * one we currently have. If so we update 10438 * our sending end time window. 10439 * 10440 * Note we *do not* do this at send time. The reason 10441 * is that if you do you *may* pick up a newer timestamp 10442 * for a range you are not going to measure. We project 10443 * out how far and then sometimes modify that to be 10444 * smaller. If that occurs then you will have a send 10445 * that does not belong to the range included. 10446 */ 10447 if ((ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]) <= 10448 rack->r_ctl.rc_gp_cumack_ts) 10449 return; 10450 if (rack_in_gp_window(tp, rsm)) { 10451 rack->r_ctl.rc_gp_cumack_ts = ts; 10452 rack_log_gpset(rack, tp->gput_ack, (uint32_t)ts, rsm->r_end, 10453 __LINE__, from, rsm); 10454 } 10455 } 10456 10457 static void 10458 rack_process_to_cumack(struct tcpcb *tp, struct tcp_rack *rack, register uint32_t th_ack, uint32_t cts, struct tcpopt *to, uint64_t acktime) 10459 { 10460 struct rack_sendmap *rsm; 10461 /* 10462 * The ACK point is advancing to th_ack, we must drop off 10463 * the packets in the rack log and calculate any eligble 10464 * RTT's. 10465 */ 10466 10467 rack->r_wanted_output = 1; 10468 if (SEQ_GT(th_ack, tp->snd_una)) 10469 rack->r_ctl.last_cumack_advance = acktime; 10470 10471 /* Tend any TLP that has been marked for 1/2 the seq space (its old) */ 10472 if ((rack->rc_last_tlp_acked_set == 1)&& 10473 (rack->rc_last_tlp_past_cumack == 1) && 10474 (SEQ_GT(rack->r_ctl.last_tlp_acked_start, th_ack))) { 10475 /* 10476 * We have reached the point where our last rack 10477 * tlp retransmit sequence is ahead of the cum-ack. 10478 * This can only happen when the cum-ack moves all 10479 * the way around (its been a full 2^^31+1 bytes 10480 * or more since we sent a retransmitted TLP). Lets 10481 * turn off the valid flag since its not really valid. 10482 * 10483 * Note since sack's also turn on this event we have 10484 * a complication, we have to wait to age it out until 10485 * the cum-ack is by the TLP before checking which is 10486 * what the next else clause does. 10487 */ 10488 rack_log_dsack_event(rack, 9, __LINE__, 10489 rack->r_ctl.last_tlp_acked_start, 10490 rack->r_ctl.last_tlp_acked_end); 10491 rack->rc_last_tlp_acked_set = 0; 10492 rack->rc_last_tlp_past_cumack = 0; 10493 } else if ((rack->rc_last_tlp_acked_set == 1) && 10494 (rack->rc_last_tlp_past_cumack == 0) && 10495 (SEQ_GEQ(th_ack, rack->r_ctl.last_tlp_acked_end))) { 10496 /* 10497 * It is safe to start aging TLP's out. 
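 * For example, with last_tlp_acked covering [5000, 6000): once
 * th_ack reaches 6000 (SEQ_GEQ above) we set the past-cumack flag
 * here, and only if the cum-ack later advances so far that
 * SEQ_GT(5000, th_ack) becomes true (a wrap of more than 2^31
 * bytes) does the earlier clause discard the record.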
10498 */ 10499 rack->rc_last_tlp_past_cumack = 1; 10500 } 10501 /* We do the same for the tlp send seq as well */ 10502 if ((rack->rc_last_sent_tlp_seq_valid == 1) && 10503 (rack->rc_last_sent_tlp_past_cumack == 1) && 10504 (SEQ_GT(rack->r_ctl.last_sent_tlp_seq, th_ack))) { 10505 rack_log_dsack_event(rack, 9, __LINE__, 10506 rack->r_ctl.last_sent_tlp_seq, 10507 (rack->r_ctl.last_sent_tlp_seq + 10508 rack->r_ctl.last_sent_tlp_len)); 10509 rack->rc_last_sent_tlp_seq_valid = 0; 10510 rack->rc_last_sent_tlp_past_cumack = 0; 10511 } else if ((rack->rc_last_sent_tlp_seq_valid == 1) && 10512 (rack->rc_last_sent_tlp_past_cumack == 0) && 10513 (SEQ_GEQ(th_ack, rack->r_ctl.last_sent_tlp_seq))) { 10514 /* 10515 * It is safe to start aging TLP's send. 10516 */ 10517 rack->rc_last_sent_tlp_past_cumack = 1; 10518 } 10519 more: 10520 rsm = tqhash_min(rack->r_ctl.tqh); 10521 if (rsm == NULL) { 10522 if ((th_ack - 1) == tp->iss) { 10523 /* 10524 * For the SYN incoming case we will not 10525 * have called tcp_output for the sending of 10526 * the SYN, so there will be no map. All 10527 * other cases should probably be a panic. 10528 */ 10529 return; 10530 } 10531 if (tp->t_flags & TF_SENTFIN) { 10532 /* if we sent a FIN we often will not have map */ 10533 return; 10534 } 10535 #ifdef INVARIANTS 10536 panic("No rack map tp:%p for state:%d ack:%u rack:%p snd_una:%u snd_max:%u snd_nxt:%u\n", 10537 tp, 10538 tp->t_state, th_ack, rack, 10539 tp->snd_una, tp->snd_max, tp->snd_nxt); 10540 #endif 10541 return; 10542 } 10543 if (SEQ_LT(th_ack, rsm->r_start)) { 10544 /* Huh map is missing this */ 10545 #ifdef INVARIANTS 10546 printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n", 10547 rsm->r_start, 10548 th_ack, tp->t_state, rack->r_state); 10549 #endif 10550 return; 10551 } 10552 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED, th_ack); 10553 10554 /* Now was it a retransmitted TLP? */ 10555 if ((rsm->r_flags & RACK_TLP) && 10556 (rsm->r_rtr_cnt > 1)) { 10557 /* 10558 * Yes, this rsm was a TLP and retransmitted, remember that 10559 * since if a DSACK comes back on this we don't want 10560 * to think of it as a reordered segment. This may 10561 * get updated again with possibly even other TLPs 10562 * in flight, but thats ok. Only when we don't send 10563 * a retransmitted TLP for 1/2 the sequences space 10564 * will it get turned off (above). 10565 */ 10566 if (rack->rc_last_tlp_acked_set && 10567 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 10568 /* 10569 * We already turned this on since the end matches, 10570 * the previous one was a partially ack now we 10571 * are getting another one (maybe all of it). 10572 */ 10573 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 10574 /* 10575 * Lets make sure we have all of it though. 
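 * (e.g. if the recorded block is [5000, 6000) and this rsm spans
 * [4800, 6200), the tracked range widens to [4800, 6200).)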
10576 */ 10577 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 10578 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 10579 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 10580 rack->r_ctl.last_tlp_acked_end); 10581 } 10582 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 10583 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 10584 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 10585 rack->r_ctl.last_tlp_acked_end); 10586 } 10587 } else { 10588 rack->rc_last_tlp_past_cumack = 1; 10589 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 10590 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 10591 rack->rc_last_tlp_acked_set = 1; 10592 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 10593 } 10594 } 10595 /* Now do we consume the whole thing? */ 10596 rack->r_ctl.last_tmit_time_acked = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 10597 if (SEQ_GEQ(th_ack, rsm->r_end)) { 10598 /* Its all consumed. */ 10599 uint32_t left; 10600 uint8_t newly_acked; 10601 10602 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_FREE, rsm->r_end, __LINE__); 10603 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 10604 rsm->r_rtr_bytes = 0; 10605 /* 10606 * Record the time of highest cumack sent if its in our measurement 10607 * window and possibly bump out the end. 10608 */ 10609 rack_rsm_sender_update(rack, tp, rsm, 4); 10610 tqhash_remove(rack->r_ctl.tqh, rsm, REMOVE_TYPE_CUMACK); 10611 if (rsm->r_in_tmap) { 10612 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 10613 rsm->r_in_tmap = 0; 10614 } 10615 newly_acked = 1; 10616 if (rsm->r_flags & RACK_ACKED) { 10617 /* 10618 * It was acked on the scoreboard -- remove 10619 * it from total 10620 */ 10621 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 10622 newly_acked = 0; 10623 } else if (rsm->r_flags & RACK_SACK_PASSED) { 10624 /* 10625 * There are segments ACKED on the 10626 * scoreboard further up. We are seeing 10627 * reordering. 10628 */ 10629 rsm->r_flags &= ~RACK_SACK_PASSED; 10630 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 10631 rsm->r_flags |= RACK_ACKED; 10632 rack->r_ctl.rc_reorder_ts = cts; 10633 if (rack->r_ctl.rc_reorder_ts == 0) 10634 rack->r_ctl.rc_reorder_ts = 1; 10635 if (rack->r_ent_rec_ns) { 10636 /* 10637 * We have sent no more, and we saw an sack 10638 * then ack arrive. 10639 */ 10640 rack->r_might_revert = 1; 10641 } 10642 } 10643 if ((rsm->r_flags & RACK_TO_REXT) && 10644 (tp->t_flags & TF_RCVD_TSTMP) && 10645 (to->to_flags & TOF_TS) && 10646 (to->to_tsecr != 0) && 10647 (tp->t_flags & TF_PREVVALID)) { 10648 /* 10649 * We can use the timestamp to see 10650 * if this retransmission was from the 10651 * first transmit. If so we made a mistake. 10652 */ 10653 tp->t_flags &= ~TF_PREVVALID; 10654 if (to->to_tsecr == rack_ts_to_msec(rsm->r_tim_lastsent[0])) { 10655 /* The first transmit is what this ack is for */ 10656 rack_cong_signal(tp, CC_RTO_ERR, th_ack, __LINE__); 10657 } 10658 } 10659 left = th_ack - rsm->r_end; 10660 if (rack->app_limited_needs_set && newly_acked) 10661 rack_need_set_test(tp, rack, rsm, th_ack, __LINE__, RACK_USE_END_OR_THACK); 10662 /* Free back to zone */ 10663 rack_free(rack, rsm); 10664 if (left) { 10665 goto more; 10666 } 10667 /* Check for reneging */ 10668 rsm = tqhash_min(rack->r_ctl.tqh); 10669 if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) { 10670 /* 10671 * The peer has moved snd_una up to 10672 * the edge of this send, i.e. one 10673 * that it had previously acked. 
The only 10674 * way that can be true if the peer threw 10675 * away data (space issues) that it had 10676 * previously sacked (else it would have 10677 * given us snd_una up to (rsm->r_end). 10678 * We need to undo the acked markings here. 10679 * 10680 * Note we have to look to make sure th_ack is 10681 * our rsm->r_start in case we get an old ack 10682 * where th_ack is behind snd_una. 10683 */ 10684 rack_peer_reneges(rack, rsm, th_ack); 10685 } 10686 return; 10687 } 10688 if (rsm->r_flags & RACK_ACKED) { 10689 /* 10690 * It was acked on the scoreboard -- remove it from 10691 * total for the part being cum-acked. 10692 */ 10693 rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); 10694 } 10695 /* 10696 * Clear the dup ack count for 10697 * the piece that remains. 10698 */ 10699 rsm->r_dupack = 0; 10700 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 10701 if (rsm->r_rtr_bytes) { 10702 /* 10703 * It was retransmitted adjust the 10704 * sack holes for what was acked. 10705 */ 10706 int ack_am; 10707 10708 ack_am = (th_ack - rsm->r_start); 10709 if (ack_am >= rsm->r_rtr_bytes) { 10710 rack->r_ctl.rc_holes_rxt -= ack_am; 10711 rsm->r_rtr_bytes -= ack_am; 10712 } 10713 } 10714 /* 10715 * Update where the piece starts and record 10716 * the time of send of highest cumack sent if 10717 * its in our GP range. 10718 */ 10719 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_TRIM_HEAD, th_ack, __LINE__); 10720 /* Now we need to move our offset forward too */ 10721 if (rsm->m && 10722 ((rsm->orig_m_len != rsm->m->m_len) || 10723 (M_TRAILINGROOM(rsm->m) != rsm->orig_t_space))) { 10724 /* Fix up the orig_m_len and possibly the mbuf offset */ 10725 rack_adjust_orig_mlen(rsm); 10726 } 10727 rsm->soff += (th_ack - rsm->r_start); 10728 rack_rsm_sender_update(rack, tp, rsm, 5); 10729 /* The trim will move th_ack into r_start for us */ 10730 tqhash_trim(rack->r_ctl.tqh, th_ack); 10731 /* Now do we need to move the mbuf fwd too? */ 10732 { 10733 struct mbuf *m; 10734 uint32_t soff; 10735 10736 m = rsm->m; 10737 soff = rsm->soff; 10738 if (m) { 10739 while (soff >= m->m_len) { 10740 soff -= m->m_len; 10741 KASSERT((m->m_next != NULL), 10742 (" rsm:%p off:%u soff:%u m:%p", 10743 rsm, rsm->soff, soff, m)); 10744 m = m->m_next; 10745 if (m == NULL) { 10746 /* 10747 * This is a fall-back that prevents a panic. In reality 10748 * we should be able to walk the mbuf's and find our place. 10749 * At this point snd_una has not been updated with the sbcut() yet 10750 * but tqhash_trim did update rsm->r_start so the offset calcuation 10751 * should work fine. This is undesirable since we will take cache 10752 * hits to access the socket buffer. And even more puzzling is that 10753 * it happens occasionally. It should not :( 10754 */ 10755 m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 10756 (rsm->r_start - tp->snd_una), 10757 &soff); 10758 break; 10759 } 10760 } 10761 /* 10762 * Now save in our updated values. 
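 * (m/soff now name the mbuf and offset of the new r_start: e.g.
 * if soff was 3000 and the first mbuf held 2048 bytes, we have
 * advanced to the next mbuf with soff = 952.)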
10763 */ 10764 rsm->m = m; 10765 rsm->soff = soff; 10766 rsm->orig_m_len = rsm->m->m_len; 10767 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 10768 } 10769 } 10770 if (rack->app_limited_needs_set && 10771 SEQ_GEQ(th_ack, tp->gput_seq)) 10772 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG); 10773 } 10774 10775 static void 10776 rack_handle_might_revert(struct tcpcb *tp, struct tcp_rack *rack) 10777 { 10778 struct rack_sendmap *rsm; 10779 int sack_pass_fnd = 0; 10780 10781 if (rack->r_might_revert) { 10782 /* 10783 * Ok we have reordering, have not sent anything, we 10784 * might want to revert the congestion state if nothing 10785 * further has SACK_PASSED on it. Lets check. 10786 * 10787 * We also get here when we have DSACKs come in for 10788 * all the data that we FR'd. Note that a rxt or tlp 10789 * timer clears this from happening. 10790 */ 10791 10792 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 10793 if (rsm->r_flags & RACK_SACK_PASSED) { 10794 sack_pass_fnd = 1; 10795 break; 10796 } 10797 } 10798 if (sack_pass_fnd == 0) { 10799 /* 10800 * We went into recovery 10801 * incorrectly due to reordering! 10802 */ 10803 int orig_cwnd; 10804 10805 rack->r_ent_rec_ns = 0; 10806 orig_cwnd = tp->snd_cwnd; 10807 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at_erec; 10808 tp->snd_recover = tp->snd_una; 10809 rack_log_to_prr(rack, 14, orig_cwnd, __LINE__); 10810 EXIT_RECOVERY(tp->t_flags); 10811 } 10812 rack->r_might_revert = 0; 10813 } 10814 } 10815 10816 #ifdef TCP_SAD_DETECTION 10817 10818 static void 10819 rack_merge_out_sacks(struct tcp_rack *rack) 10820 { 10821 struct rack_sendmap *cur, *next, *rsm, *trsm = NULL; 10822 10823 cur = tqhash_min(rack->r_ctl.tqh); 10824 while(cur) { 10825 next = tqhash_next(rack->r_ctl.tqh, cur); 10826 /* 10827 * The idea is to go through all and merge back 10828 * together the pieces sent together, 10829 */ 10830 if ((next != NULL) && 10831 (cur->r_tim_lastsent[0] == next->r_tim_lastsent[0])) { 10832 rack_merge_rsm(rack, cur, next); 10833 } else { 10834 cur = next; 10835 } 10836 } 10837 /* 10838 * now treat it like a rxt event, everything is outstanding 10839 * and sent nothing acvked and dupacks are all zero. If this 10840 * is not an attacker it will have to dupack its way through 10841 * it all. 10842 */ 10843 TAILQ_INIT(&rack->r_ctl.rc_tmap); 10844 TQHASH_FOREACH(rsm, rack->r_ctl.tqh) { 10845 rsm->r_dupack = 0; 10846 /* We must re-add it back to the tlist */ 10847 if (trsm == NULL) { 10848 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 10849 } else { 10850 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); 10851 } 10852 rsm->r_in_tmap = 1; 10853 trsm = rsm; 10854 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS | RACK_RWND_COLLAPSED); 10855 } 10856 sack_filter_clear(&rack->r_ctl.rack_sf, rack->rc_tp->snd_una); 10857 } 10858 10859 static void 10860 rack_do_detection(struct tcpcb *tp, struct tcp_rack *rack, uint32_t bytes_this_ack, uint32_t segsiz) 10861 { 10862 int do_detection = 0; 10863 10864 if (rack->sack_attack_disable || rack->rc_suspicious) { 10865 /* 10866 * If we have been disabled we must detect 10867 * to possibly reverse it. Or if the guy has 10868 * sent in suspicious sacks we want to do detection too. 
10869 */ 10870 do_detection = 1; 10871 10872 } else if ((rack->do_detection || tcp_force_detection) && 10873 (tcp_sack_to_ack_thresh > 0) && 10874 (tcp_sack_to_move_thresh > 0) && 10875 (rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum)) { 10876 /* 10877 * We only detect here if: 10878 * 1) System wide forcing is on <or> do_detection is on 10879 * <and> 10880 * 2) We have thresholds for move and ack (set one to 0 and we are off) 10881 * <and> 10882 * 3) We have maps allocated larger than our min (500). 10883 */ 10884 do_detection = 1; 10885 } 10886 if (do_detection > 0) { 10887 /* 10888 * We have thresholds set to find 10889 * possible attackers and disable sack. 10890 * Check them. 10891 */ 10892 uint64_t ackratio, moveratio, movetotal; 10893 10894 /* Log detecting */ 10895 rack_log_sad(rack, 1); 10896 /* Do we establish a ack ratio */ 10897 if ((rack->r_ctl.sack_count > tcp_map_minimum) || 10898 (rack->rc_suspicious == 1) || 10899 (rack->sack_attack_disable > 0)) { 10900 ackratio = (uint64_t)(rack->r_ctl.sack_count); 10901 ackratio *= (uint64_t)(1000); 10902 if (rack->r_ctl.ack_count) 10903 ackratio /= (uint64_t)(rack->r_ctl.ack_count); 10904 else { 10905 /* We can hit this due to ack totals degregation (via small sacks) */ 10906 ackratio = 1000; 10907 } 10908 } else { 10909 /* 10910 * No ack ratio needed if we have not 10911 * seen more sacks then the number of map entries. 10912 * The exception to that is if we have disabled sack then 10913 * we need to find a ratio. 10914 */ 10915 ackratio = 0; 10916 } 10917 10918 if ((rack->sack_attack_disable == 0) && 10919 (ackratio > rack_highest_sack_thresh_seen)) 10920 rack_highest_sack_thresh_seen = (uint32_t)ackratio; 10921 /* Do we establish a move ratio? */ 10922 if ((rack->r_ctl.sack_moved_extra > tcp_map_minimum) || 10923 (rack->rc_suspicious == 1) || 10924 (rack->sack_attack_disable > 0)) { 10925 /* 10926 * We need to have more sack moves than maps 10927 * allocated to have a move ratio considered. 10928 */ 10929 movetotal = rack->r_ctl.sack_moved_extra; 10930 movetotal += rack->r_ctl.sack_noextra_move; 10931 moveratio = rack->r_ctl.sack_moved_extra; 10932 moveratio *= (uint64_t)1000; 10933 if (movetotal) 10934 moveratio /= movetotal; 10935 else { 10936 /* No moves, thats pretty good */ 10937 moveratio = 0; 10938 } 10939 } else { 10940 /* 10941 * Not enough moves have occured to consider 10942 * if we are out of whack in that ratio. 10943 * The exception to that is if we have disabled sack then 10944 * we need to find a ratio. 10945 */ 10946 moveratio = 0; 10947 } 10948 if ((rack->sack_attack_disable == 0) && 10949 (moveratio > rack_highest_move_thresh_seen)) 10950 rack_highest_move_thresh_seen = (uint32_t)moveratio; 10951 /* Now the tests */ 10952 if (rack->sack_attack_disable == 0) { 10953 /* Not disabled, do we need to disable? 
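 * Both ratios were scaled by 1000 above; e.g. 40 SACK blocks seen
 * against 50 ACKs gives an ackratio of 800, which would trip a
 * (hypothetical) tcp_sack_to_ack_thresh of 700 only if the move
 * ratio is also above its threshold.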
*/ 10954 if ((ackratio > tcp_sack_to_ack_thresh) && 10955 (moveratio > tcp_sack_to_move_thresh)) { 10956 /* Disable sack processing */ 10957 tcp_trace_point(rack->rc_tp, TCP_TP_SAD_TRIGGERED); 10958 rack->sack_attack_disable = 1; 10959 /* set it so we have the built in delay */ 10960 rack->r_ctl.ack_during_sd = 1; 10961 if (rack_merge_out_sacks_on_attack) 10962 rack_merge_out_sacks(rack); 10963 counter_u64_add(rack_sack_attacks_detected, 1); 10964 tcp_trace_point(rack->rc_tp, TCP_TP_SAD_TRIGGERED); 10965 /* Clamp the cwnd at flight size */ 10966 rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd; 10967 rack->rc_tp->snd_cwnd = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 10968 rack_log_sad(rack, 2); 10969 } 10970 } else { 10971 /* We are sack-disabled check for false positives */ 10972 if ((ackratio <= tcp_restoral_thresh) || 10973 ((rack_merge_out_sacks_on_attack == 0) && 10974 (rack->rc_suspicious == 0) && 10975 (rack->r_ctl.rc_num_maps_alloced <= (tcp_map_minimum/2)))) { 10976 rack->sack_attack_disable = 0; 10977 rack_log_sad(rack, 3); 10978 /* Restart counting */ 10979 rack->r_ctl.sack_count = 0; 10980 rack->r_ctl.sack_moved_extra = 0; 10981 rack->r_ctl.sack_noextra_move = 1; 10982 rack->rc_suspicious = 0; 10983 rack->r_ctl.ack_count = max(1, 10984 (bytes_this_ack / segsiz)); 10985 10986 counter_u64_add(rack_sack_attacks_reversed, 1); 10987 /* Restore the cwnd */ 10988 if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd) 10989 rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd; 10990 } 10991 } 10992 } 10993 } 10994 #endif 10995 10996 static int 10997 rack_note_dsack(struct tcp_rack *rack, tcp_seq start, tcp_seq end) 10998 { 10999 11000 uint32_t am, l_end; 11001 int was_tlp = 0; 11002 11003 if (SEQ_GT(end, start)) 11004 am = end - start; 11005 else 11006 am = 0; 11007 if ((rack->rc_last_tlp_acked_set ) && 11008 (SEQ_GEQ(start, rack->r_ctl.last_tlp_acked_start)) && 11009 (SEQ_LEQ(end, rack->r_ctl.last_tlp_acked_end))) { 11010 /* 11011 * The DSACK is because of a TLP which we don't 11012 * do anything with the reordering window over since 11013 * it was not reordering that caused the DSACK but 11014 * our previous retransmit TLP. 11015 */ 11016 rack_log_dsack_event(rack, 7, __LINE__, start, end); 11017 was_tlp = 1; 11018 goto skip_dsack_round; 11019 } 11020 if (rack->rc_last_sent_tlp_seq_valid) { 11021 l_end = rack->r_ctl.last_sent_tlp_seq + rack->r_ctl.last_sent_tlp_len; 11022 if (SEQ_GEQ(start, rack->r_ctl.last_sent_tlp_seq) && 11023 (SEQ_LEQ(end, l_end))) { 11024 /* 11025 * This dsack is from the last sent TLP, ignore it 11026 * for reordering purposes. 11027 */ 11028 rack_log_dsack_event(rack, 7, __LINE__, start, end); 11029 was_tlp = 1; 11030 goto skip_dsack_round; 11031 } 11032 } 11033 if (rack->rc_dsack_round_seen == 0) { 11034 rack->rc_dsack_round_seen = 1; 11035 rack->r_ctl.dsack_round_end = rack->rc_tp->snd_max; 11036 rack->r_ctl.num_dsack++; 11037 rack->r_ctl.dsack_persist = 16; /* 16 is from the standard */ 11038 rack_log_dsack_event(rack, 2, __LINE__, 0, 0); 11039 } 11040 skip_dsack_round: 11041 /* 11042 * We keep track of how many DSACK blocks we get 11043 * after a recovery incident. 11044 */ 11045 rack->r_ctl.dsack_byte_cnt += am; 11046 if (!IN_FASTRECOVERY(rack->rc_tp->t_flags) && 11047 rack->r_ctl.retran_during_recovery && 11048 (rack->r_ctl.dsack_byte_cnt >= rack->r_ctl.retran_during_recovery)) { 11049 /* 11050 * False recovery most likely culprit is reordering. If 11051 * nothing else is missing we need to revert. 
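 * For example, if 3000 bytes were retransmitted during recovery
 * and DSACKs have since reported at least 3000 duplicate bytes,
 * every retransmission duplicated data the peer already held, so
 * the recovery was spurious and the congestion state is reverted.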
11052 */ 11053 rack->r_might_revert = 1; 11054 rack_handle_might_revert(rack->rc_tp, rack); 11055 rack->r_might_revert = 0; 11056 rack->r_ctl.retran_during_recovery = 0; 11057 rack->r_ctl.dsack_byte_cnt = 0; 11058 } 11059 return (was_tlp); 11060 } 11061 11062 static uint32_t 11063 do_rack_compute_pipe(struct tcpcb *tp, struct tcp_rack *rack, uint32_t snd_una) 11064 { 11065 return (((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt); 11066 } 11067 11068 static int32_t 11069 rack_compute_pipe(struct tcpcb *tp) 11070 { 11071 return ((int32_t)do_rack_compute_pipe(tp, 11072 (struct tcp_rack *)tp->t_fb_ptr, 11073 tp->snd_una)); 11074 } 11075 11076 static void 11077 rack_update_prr(struct tcpcb *tp, struct tcp_rack *rack, uint32_t changed, tcp_seq th_ack) 11078 { 11079 /* Deal with changed and PRR here (in recovery only) */ 11080 uint32_t pipe, snd_una; 11081 11082 rack->r_ctl.rc_prr_delivered += changed; 11083 11084 if (sbavail(&rack->rc_inp->inp_socket->so_snd) <= (tp->snd_max - tp->snd_una)) { 11085 /* 11086 * It is all outstanding, we are application limited 11087 * and thus we don't need more room to send anything. 11088 * Note we use tp->snd_una here and not th_ack because 11089 * the data as yet not been cut from the sb. 11090 */ 11091 rack->r_ctl.rc_prr_sndcnt = 0; 11092 return; 11093 } 11094 /* Compute prr_sndcnt */ 11095 if (SEQ_GT(tp->snd_una, th_ack)) { 11096 snd_una = tp->snd_una; 11097 } else { 11098 snd_una = th_ack; 11099 } 11100 pipe = do_rack_compute_pipe(tp, rack, snd_una); 11101 if (pipe > tp->snd_ssthresh) { 11102 long sndcnt; 11103 11104 sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh; 11105 if (rack->r_ctl.rc_prr_recovery_fs > 0) 11106 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; 11107 else { 11108 rack->r_ctl.rc_prr_sndcnt = 0; 11109 rack_log_to_prr(rack, 9, 0, __LINE__); 11110 sndcnt = 0; 11111 } 11112 sndcnt++; 11113 if (sndcnt > (long)rack->r_ctl.rc_prr_out) 11114 sndcnt -= rack->r_ctl.rc_prr_out; 11115 else 11116 sndcnt = 0; 11117 rack->r_ctl.rc_prr_sndcnt = sndcnt; 11118 rack_log_to_prr(rack, 10, 0, __LINE__); 11119 } else { 11120 uint32_t limit; 11121 11122 if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out) 11123 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out); 11124 else 11125 limit = 0; 11126 if (changed > limit) 11127 limit = changed; 11128 limit += ctf_fixed_maxseg(tp); 11129 if (tp->snd_ssthresh > pipe) { 11130 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); 11131 rack_log_to_prr(rack, 11, 0, __LINE__); 11132 } else { 11133 rack->r_ctl.rc_prr_sndcnt = min(0, limit); 11134 rack_log_to_prr(rack, 12, 0, __LINE__); 11135 } 11136 } 11137 } 11138 11139 static void 11140 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered_recovery, int dup_ack_struck, 11141 int *dsack_seen, int *sacks_seen) 11142 { 11143 uint32_t changed; 11144 struct tcp_rack *rack; 11145 struct rack_sendmap *rsm; 11146 struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; 11147 register uint32_t th_ack; 11148 int32_t i, j, k, num_sack_blks = 0; 11149 uint32_t cts, acked, ack_point; 11150 int loop_start = 0, moved_two = 0, no_extra = 0; 11151 uint32_t tsused; 11152 uint32_t segsiz, o_cnt; 11153 11154 11155 INP_WLOCK_ASSERT(tptoinpcb(tp)); 11156 if (tcp_get_flags(th) & TH_RST) { 11157 /* We don't log resets */ 11158 return; 11159 } 11160 rack = (struct tcp_rack *)tp->t_fb_ptr; 11161 cts = tcp_get_usecs(NULL); 11162 rsm = tqhash_min(rack->r_ctl.tqh); 11163 changed = 0; 11164 th_ack = th->th_ack; 
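	/*
	 * From here on: decay the SAD counters, credit whole-MSS
	 * cum-ack advances toward ack_count, walk the scoreboard up
	 * to th_ack, and then validate the SACK blocks (recording
	 * any D-SACKs) before filtering, sorting and processing them.
	 */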
11165 if (rack->sack_attack_disable == 0) 11166 rack_do_decay(rack); 11167 segsiz = ctf_fixed_maxseg(rack->rc_tp); 11168 if (BYTES_THIS_ACK(tp, th) >= segsiz) { 11169 /* 11170 * You only get credit for 11171 * MSS and greater (and you get extra 11172 * credit for larger cum-ack moves). 11173 */ 11174 int ac; 11175 11176 ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp); 11177 rack->r_ctl.ack_count += ac; 11178 counter_u64_add(rack_ack_total, ac); 11179 } 11180 if (rack->r_ctl.ack_count > 0xfff00000) { 11181 /* 11182 * reduce the number to keep us under 11183 * a uint32_t. 11184 */ 11185 rack->r_ctl.ack_count /= 2; 11186 rack->r_ctl.sack_count /= 2; 11187 } 11188 if (SEQ_GT(th_ack, tp->snd_una)) { 11189 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__); 11190 tp->t_acktime = ticks; 11191 } 11192 if (rsm && SEQ_GT(th_ack, rsm->r_start)) 11193 changed = th_ack - rsm->r_start; 11194 if (changed) { 11195 rack_process_to_cumack(tp, rack, th_ack, cts, to, 11196 tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time)); 11197 } 11198 if ((to->to_flags & TOF_SACK) == 0) { 11199 /* We are done nothing left and no sack. */ 11200 rack_handle_might_revert(tp, rack); 11201 /* 11202 * For cases where we struck a dup-ack 11203 * with no SACK, add to the changes so 11204 * PRR will work right. 11205 */ 11206 if (dup_ack_struck && (changed == 0)) { 11207 changed += ctf_fixed_maxseg(rack->rc_tp); 11208 } 11209 goto out; 11210 } 11211 /* Sack block processing */ 11212 if (SEQ_GT(th_ack, tp->snd_una)) 11213 ack_point = th_ack; 11214 else 11215 ack_point = tp->snd_una; 11216 for (i = 0; i < to->to_nsacks; i++) { 11217 bcopy((to->to_sacks + i * TCPOLEN_SACK), 11218 &sack, sizeof(sack)); 11219 sack.start = ntohl(sack.start); 11220 sack.end = ntohl(sack.end); 11221 if (SEQ_GT(sack.end, sack.start) && 11222 SEQ_GT(sack.start, ack_point) && 11223 SEQ_LT(sack.start, tp->snd_max) && 11224 SEQ_GT(sack.end, ack_point) && 11225 SEQ_LEQ(sack.end, tp->snd_max)) { 11226 sack_blocks[num_sack_blks] = sack; 11227 num_sack_blks++; 11228 } else if (SEQ_LEQ(sack.start, th_ack) && 11229 SEQ_LEQ(sack.end, th_ack)) { 11230 int was_tlp; 11231 11232 if (dsack_seen != NULL) 11233 *dsack_seen = 1; 11234 was_tlp = rack_note_dsack(rack, sack.start, sack.end); 11235 /* 11236 * Its a D-SACK block. 11237 */ 11238 tcp_record_dsack(tp, sack.start, sack.end, was_tlp); 11239 } 11240 } 11241 if (rack->rc_dsack_round_seen) { 11242 /* Is the dsack roound over? */ 11243 if (SEQ_GEQ(th_ack, rack->r_ctl.dsack_round_end)) { 11244 /* Yes it is */ 11245 rack->rc_dsack_round_seen = 0; 11246 rack_log_dsack_event(rack, 3, __LINE__, 0, 0); 11247 } 11248 } 11249 /* 11250 * Sort the SACK blocks so we can update the rack scoreboard with 11251 * just one pass. 11252 */ 11253 o_cnt = num_sack_blks; 11254 num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, 11255 num_sack_blks, th->th_ack); 11256 ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks); 11257 if (sacks_seen != NULL) 11258 *sacks_seen = num_sack_blks; 11259 if (num_sack_blks == 0) { 11260 /* Nothing to sack, but we need to update counts */ 11261 if ((o_cnt == 1) && 11262 (*dsack_seen != 1)) 11263 rack->r_ctl.sack_count++; 11264 else if (o_cnt > 1) 11265 rack->r_ctl.sack_count++; 11266 goto out_with_totals; 11267 } 11268 if (rack->sack_attack_disable) { 11269 /* 11270 * An attacker disablement is in place, for 11271 * every sack block that is not at least a full MSS 11272 * count up sack_count. 
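 * (e.g. with a 1448 byte MSS a block covering 800 bytes bumps
 * sack_count, while a block covering 2896 bytes does not.)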
11273 */ 11274 for (i = 0; i < num_sack_blks; i++) { 11275 if ((sack_blocks[i].end - sack_blocks[i].start) < segsiz) { 11276 rack->r_ctl.sack_count++; 11277 } 11278 if (rack->r_ctl.sack_count > 0xfff00000) { 11279 /* 11280 * reduce the number to keep us under 11281 * a uint32_t. 11282 */ 11283 rack->r_ctl.ack_count /= 2; 11284 rack->r_ctl.sack_count /= 2; 11285 } 11286 } 11287 goto out; 11288 } 11289 /* Its a sack of some sort */ 11290 rack->r_ctl.sack_count += num_sack_blks; 11291 if (rack->r_ctl.sack_count > 0xfff00000) { 11292 /* 11293 * reduce the number to keep us under 11294 * a uint32_t. 11295 */ 11296 rack->r_ctl.ack_count /= 2; 11297 rack->r_ctl.sack_count /= 2; 11298 } 11299 if (num_sack_blks < 2) { 11300 /* Only one, we don't need to sort */ 11301 goto do_sack_work; 11302 } 11303 /* Sort the sacks */ 11304 for (i = 0; i < num_sack_blks; i++) { 11305 for (j = i + 1; j < num_sack_blks; j++) { 11306 if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { 11307 sack = sack_blocks[i]; 11308 sack_blocks[i] = sack_blocks[j]; 11309 sack_blocks[j] = sack; 11310 } 11311 } 11312 } 11313 /* 11314 * Now are any of the sack block ends the same (yes some 11315 * implementations send these)? 11316 */ 11317 again: 11318 if (num_sack_blks == 0) 11319 goto out_with_totals; 11320 if (num_sack_blks > 1) { 11321 for (i = 0; i < num_sack_blks; i++) { 11322 for (j = i + 1; j < num_sack_blks; j++) { 11323 if (sack_blocks[i].end == sack_blocks[j].end) { 11324 /* 11325 * Ok these two have the same end we 11326 * want the smallest end and then 11327 * throw away the larger and start 11328 * again. 11329 */ 11330 if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) { 11331 /* 11332 * The second block covers 11333 * more area use that 11334 */ 11335 sack_blocks[i].start = sack_blocks[j].start; 11336 } 11337 /* 11338 * Now collapse out the dup-sack and 11339 * lower the count 11340 */ 11341 for (k = (j + 1); k < num_sack_blks; k++) { 11342 sack_blocks[j].start = sack_blocks[k].start; 11343 sack_blocks[j].end = sack_blocks[k].end; 11344 j++; 11345 } 11346 num_sack_blks--; 11347 goto again; 11348 } 11349 } 11350 } 11351 } 11352 do_sack_work: 11353 /* 11354 * First lets look to see if 11355 * we have retransmitted and 11356 * can use the transmit next? 11357 */ 11358 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 11359 if (rsm && 11360 SEQ_GT(sack_blocks[0].end, rsm->r_start) && 11361 SEQ_LT(sack_blocks[0].start, rsm->r_end)) { 11362 /* 11363 * We probably did the FR and the next 11364 * SACK in continues as we would expect. 11365 */ 11366 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &no_extra, &moved_two, segsiz); 11367 if (acked) { 11368 rack->r_wanted_output = 1; 11369 changed += acked; 11370 } 11371 if (num_sack_blks == 1) { 11372 /* 11373 * This is what we would expect from 11374 * a normal implementation to happen 11375 * after we have retransmitted the FR, 11376 * i.e the sack-filter pushes down 11377 * to 1 block and the next to be retransmitted 11378 * is the sequence in the sack block (has more 11379 * are acked). Count this as ACK'd data to boost 11380 * up the chances of recovering any false positives. 11381 */ 11382 rack->r_ctl.ack_count += (acked / ctf_fixed_maxseg(rack->rc_tp)); 11383 counter_u64_add(rack_ack_total, (acked / ctf_fixed_maxseg(rack->rc_tp))); 11384 counter_u64_add(rack_express_sack, 1); 11385 if (rack->r_ctl.ack_count > 0xfff00000) { 11386 /* 11387 * reduce the number to keep us under 11388 * a uint32_t. 
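 * Halving both counters at once keeps the sack/ack ratio used by
 * the SAD logic intact.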
11389 */ 11390 rack->r_ctl.ack_count /= 2; 11391 rack->r_ctl.sack_count /= 2; 11392 } 11393 if (moved_two) { 11394 /* 11395 * If we did not get a SACK for at least a MSS and 11396 * had to move at all, or if we moved more than our 11397 * threshold, it counts against the "extra" move. 11398 */ 11399 rack->r_ctl.sack_moved_extra += moved_two; 11400 rack->r_ctl.sack_noextra_move += no_extra; 11401 counter_u64_add(rack_move_some, 1); 11402 } else { 11403 /* 11404 * else we did not have to move 11405 * any more than we would expect. 11406 */ 11407 rack->r_ctl.sack_noextra_move += no_extra; 11408 rack->r_ctl.sack_noextra_move++; 11409 counter_u64_add(rack_move_none, 1); 11410 } 11411 if ((rack->r_ctl.sack_moved_extra > 0xfff00000) || 11412 (rack->r_ctl.sack_noextra_move > 0xfff00000)) { 11413 rack->r_ctl.sack_moved_extra /= 2; 11414 rack->r_ctl.sack_noextra_move /= 2; 11415 } 11416 goto out_with_totals; 11417 } else { 11418 /* 11419 * Start the loop through the 11420 * rest of blocks, past the first block. 11421 */ 11422 loop_start = 1; 11423 } 11424 } 11425 counter_u64_add(rack_sack_total, 1); 11426 rsm = rack->r_ctl.rc_sacklast; 11427 for (i = loop_start; i < num_sack_blks; i++) { 11428 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &no_extra, &moved_two, segsiz); 11429 if (acked) { 11430 rack->r_wanted_output = 1; 11431 changed += acked; 11432 } 11433 if (moved_two) { 11434 /* 11435 * If we did not get a SACK for at least a MSS and 11436 * had to move at all, or if we moved more than our 11437 * threshold, it counts against the "extra" move. 11438 */ 11439 rack->r_ctl.sack_moved_extra += moved_two; 11440 rack->r_ctl.sack_noextra_move += no_extra; 11441 counter_u64_add(rack_move_some, 1); 11442 } else { 11443 /* 11444 * else we did not have to move 11445 * any more than we would expect. 11446 */ 11447 rack->r_ctl.sack_noextra_move += no_extra; 11448 rack->r_ctl.sack_noextra_move++; 11449 counter_u64_add(rack_move_none, 1); 11450 } 11451 if ((rack->r_ctl.sack_moved_extra > 0xfff00000) || 11452 (rack->r_ctl.sack_noextra_move > 0xfff00000)) { 11453 rack->r_ctl.sack_moved_extra /= 2; 11454 rack->r_ctl.sack_noextra_move /= 2; 11455 } 11456 if (moved_two && (acked < ctf_fixed_maxseg(rack->rc_tp))) { 11457 /* 11458 * If the SACK was not a full MSS then 11459 * we add to sack_count the number of 11460 * MSS's (or possibly more than 11461 * a MSS if its a TSO send) we had to skip by. 11462 */ 11463 rack->r_ctl.sack_count += moved_two; 11464 if (rack->r_ctl.sack_count > 0xfff00000) { 11465 rack->r_ctl.ack_count /= 2; 11466 rack->r_ctl.sack_count /= 2; 11467 } 11468 counter_u64_add(rack_sack_total, moved_two); 11469 } 11470 /* 11471 * Now we need to setup for the next 11472 * round. First we make sure we won't 11473 * exceed the size of our uint32_t on 11474 * the various counts, and then clear out 11475 * moved_two. 11476 */ 11477 moved_two = 0; 11478 no_extra = 0; 11479 } 11480 out_with_totals: 11481 if (num_sack_blks > 1) { 11482 /* 11483 * You get an extra stroke if 11484 * you have more than one sack-blk, this 11485 * could be where we are skipping forward 11486 * and the sack-filter is still working, or 11487 * it could be an attacker constantly 11488 * moving us. 
11489 */ 11490 rack->r_ctl.sack_moved_extra++; 11491 counter_u64_add(rack_move_some, 1); 11492 } 11493 out: 11494 #ifdef TCP_SAD_DETECTION 11495 rack_do_detection(tp, rack, BYTES_THIS_ACK(tp, th), ctf_fixed_maxseg(rack->rc_tp)); 11496 #endif 11497 if (changed) { 11498 /* Something changed cancel the rack timer */ 11499 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 11500 } 11501 tsused = tcp_get_usecs(NULL); 11502 rsm = tcp_rack_output(tp, rack, tsused); 11503 if ((!IN_FASTRECOVERY(tp->t_flags)) && 11504 rsm && 11505 ((rsm->r_flags & RACK_MUST_RXT) == 0)) { 11506 /* Enter recovery */ 11507 entered_recovery = 1; 11508 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__); 11509 /* 11510 * When we enter recovery we need to assure we send 11511 * one packet. 11512 */ 11513 if (rack->rack_no_prr == 0) { 11514 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 11515 rack_log_to_prr(rack, 8, 0, __LINE__); 11516 } 11517 rack->r_timer_override = 1; 11518 rack->r_early = 0; 11519 rack->r_ctl.rc_agg_early = 0; 11520 } else if (IN_FASTRECOVERY(tp->t_flags) && 11521 rsm && 11522 (rack->r_rr_config == 3)) { 11523 /* 11524 * Assure we can output and we get no 11525 * remembered pace time except the retransmit. 11526 */ 11527 rack->r_timer_override = 1; 11528 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 11529 rack->r_ctl.rc_resend = rsm; 11530 } 11531 if (IN_FASTRECOVERY(tp->t_flags) && 11532 (rack->rack_no_prr == 0) && 11533 (entered_recovery == 0)) { 11534 rack_update_prr(tp, rack, changed, th_ack); 11535 if ((rsm && (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) && 11536 ((tcp_in_hpts(rack->rc_tp) == 0) && 11537 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)))) { 11538 /* 11539 * If you are pacing output you don't want 11540 * to override. 11541 */ 11542 rack->r_early = 0; 11543 rack->r_ctl.rc_agg_early = 0; 11544 rack->r_timer_override = 1; 11545 } 11546 } 11547 } 11548 11549 static void 11550 rack_strike_dupack(struct tcp_rack *rack) 11551 { 11552 struct rack_sendmap *rsm; 11553 11554 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 11555 while (rsm) { 11556 /* 11557 * We need to skip anything already set 11558 * to be retransmitted. 11559 */ 11560 if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || 11561 (rsm->r_flags & RACK_MUST_RXT)) { 11562 rsm = TAILQ_NEXT(rsm, r_tnext); 11563 continue; 11564 } 11565 break; 11566 } 11567 if (rsm && (rsm->r_dupack < 0xff)) { 11568 rsm->r_dupack++; 11569 if (rsm->r_dupack >= DUP_ACK_THRESHOLD) { 11570 struct timeval tv; 11571 uint32_t cts; 11572 /* 11573 * Here we see if we need to retransmit. For 11574 * a SACK type connection if enough time has passed 11575 * we will get a return of the rsm. For a non-sack 11576 * connection we will get the rsm returned if the 11577 * dupack value is 3 or more. 
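 * In both cases the strike itself is gated by DUP_ACK_THRESHOLD
 * above; the difference is whether tcp_rack_output() additionally
 * requires the RACK reorder window to have elapsed before it hands
 * the rsm back for retransmission.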
11578 */ 11579 cts = tcp_get_usecs(&tv); 11580 rack->r_ctl.rc_resend = tcp_rack_output(rack->rc_tp, rack, cts); 11581 if (rack->r_ctl.rc_resend != NULL) { 11582 if (!IN_FASTRECOVERY(rack->rc_tp->t_flags)) { 11583 rack_cong_signal(rack->rc_tp, CC_NDUPACK, 11584 rack->rc_tp->snd_una, __LINE__); 11585 } 11586 rack->r_wanted_output = 1; 11587 rack->r_timer_override = 1; 11588 rack_log_retran_reason(rack, rsm, __LINE__, 1, 3); 11589 } 11590 } else { 11591 rack_log_retran_reason(rack, rsm, __LINE__, 0, 3); 11592 } 11593 } 11594 } 11595 11596 static void 11597 rack_check_bottom_drag(struct tcpcb *tp, 11598 struct tcp_rack *rack, 11599 struct socket *so) 11600 { 11601 uint32_t segsiz, minseg; 11602 11603 segsiz = ctf_fixed_maxseg(tp); 11604 minseg = segsiz; 11605 if (tp->snd_max == tp->snd_una) { 11606 /* 11607 * We are doing dynamic pacing and we are way 11608 * under. Basically everything got acked while 11609 * we were still waiting on the pacer to expire. 11610 * 11611 * This means we need to boost the b/w in 11612 * addition to any earlier boosting of 11613 * the multiplier. 11614 */ 11615 uint64_t lt_bw; 11616 11617 lt_bw = rack_get_lt_bw(rack); 11618 rack->rc_dragged_bottom = 1; 11619 rack_validate_multipliers_at_or_above100(rack); 11620 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_VALID) && 11621 (lt_bw > 0)) { 11622 /* 11623 * Lets use the long-term b/w we have 11624 * been getting as a base. 11625 */ 11626 if (rack->rc_gp_filled == 0) { 11627 if (lt_bw > ONE_POINT_TWO_MEG) { 11628 /* 11629 * If we have no measurement 11630 * don't let us set in more than 11631 * 1.2Mbps. If we are still too 11632 * low after pacing with this we 11633 * will hopefully have a max b/w 11634 * available to sanity check things. 11635 */ 11636 lt_bw = ONE_POINT_TWO_MEG; 11637 } 11638 rack->r_ctl.rc_rtt_diff = 0; 11639 rack->r_ctl.gp_bw = lt_bw; 11640 rack->rc_gp_filled = 1; 11641 if (rack->r_ctl.num_measurements < RACK_REQ_AVG) 11642 rack->r_ctl.num_measurements = RACK_REQ_AVG; 11643 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 11644 } else if (lt_bw > rack->r_ctl.gp_bw) { 11645 rack->r_ctl.rc_rtt_diff = 0; 11646 if (rack->r_ctl.num_measurements < RACK_REQ_AVG) 11647 rack->r_ctl.num_measurements = RACK_REQ_AVG; 11648 rack->r_ctl.gp_bw = lt_bw; 11649 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 11650 } else 11651 rack_increase_bw_mul(rack, -1, 0, 0, 1); 11652 if ((rack->gp_ready == 0) && 11653 (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) { 11654 /* We have enough measurements now */ 11655 rack->gp_ready = 1; 11656 if (rack->dgp_on || 11657 rack->rack_hibeta) 11658 rack_set_cc_pacing(rack); 11659 if (rack->defer_options) 11660 rack_apply_deferred_options(rack); 11661 } 11662 } else { 11663 /* 11664 * zero rtt possibly?, settle for just an old increase. 11665 */ 11666 rack_increase_bw_mul(rack, -1, 0, 0, 1); 11667 } 11668 } else if ((IN_FASTRECOVERY(tp->t_flags) == 0) && 11669 (sbavail(&so->so_snd) > max((segsiz * (4 + rack_req_segs)), 11670 minseg)) && 11671 (rack->r_ctl.cwnd_to_use > max((segsiz * (rack_req_segs + 2)), minseg)) && 11672 (tp->snd_wnd > max((segsiz * (rack_req_segs + 2)), minseg)) && 11673 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) <= 11674 (segsiz * rack_req_segs))) { 11675 /* 11676 * We are doing dynamic GP pacing and 11677 * we have everything except 1MSS or less 11678 * bytes left out. We are still pacing away. 
11679 * And there is data that could be sent, This 11680 * means we are inserting delayed ack time in 11681 * our measurements because we are pacing too slow. 11682 */ 11683 rack_validate_multipliers_at_or_above100(rack); 11684 rack->rc_dragged_bottom = 1; 11685 rack_increase_bw_mul(rack, -1, 0, 0, 1); 11686 } 11687 } 11688 11689 #ifdef TCP_REQUEST_TRK 11690 static void 11691 rack_log_hybrid(struct tcp_rack *rack, uint32_t seq, 11692 struct tcp_sendfile_track *cur, uint8_t mod, int line, int err) 11693 { 11694 int do_log; 11695 11696 do_log = tcp_bblogging_on(rack->rc_tp); 11697 if (do_log == 0) { 11698 if ((do_log = tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING) )== 0) 11699 return; 11700 /* We only allow the three below with point logging on */ 11701 if ((mod != HYBRID_LOG_RULES_APP) && 11702 (mod != HYBRID_LOG_RULES_SET) && 11703 (mod != HYBRID_LOG_REQ_COMP)) 11704 return; 11705 11706 } 11707 if (do_log) { 11708 union tcp_log_stackspecific log; 11709 struct timeval tv; 11710 11711 /* Convert our ms to a microsecond */ 11712 memset(&log, 0, sizeof(log)); 11713 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 11714 log.u_bbr.flex1 = seq; 11715 log.u_bbr.cwnd_gain = line; 11716 if (cur != NULL) { 11717 uint64_t off; 11718 11719 log.u_bbr.flex2 = cur->start_seq; 11720 log.u_bbr.flex3 = cur->end_seq; 11721 log.u_bbr.flex4 = (uint32_t)((cur->localtime >> 32) & 0x00000000ffffffff); 11722 log.u_bbr.flex5 = (uint32_t)(cur->localtime & 0x00000000ffffffff); 11723 log.u_bbr.flex6 = cur->flags; 11724 log.u_bbr.pkts_out = cur->hybrid_flags; 11725 log.u_bbr.rttProp = cur->timestamp; 11726 log.u_bbr.cur_del_rate = cur->cspr; 11727 log.u_bbr.bw_inuse = cur->start; 11728 log.u_bbr.applimited = (uint32_t)(cur->end & 0x00000000ffffffff); 11729 log.u_bbr.delivered = (uint32_t)((cur->end >> 32) & 0x00000000ffffffff) ; 11730 log.u_bbr.epoch = (uint32_t)(cur->deadline & 0x00000000ffffffff); 11731 log.u_bbr.lt_epoch = (uint32_t)((cur->deadline >> 32) & 0x00000000ffffffff) ; 11732 log.u_bbr.bbr_state = 1; 11733 #ifdef TCP_REQUEST_TRK 11734 off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_tcpreq_info[0]); 11735 log.u_bbr.use_lt_bw = (uint8_t)(off / sizeof(struct tcp_sendfile_track)); 11736 #endif 11737 } else { 11738 log.u_bbr.flex2 = err; 11739 } 11740 /* 11741 * Fill in flex7 to be CHD (catchup|hybrid|DGP) 11742 */ 11743 log.u_bbr.flex7 = rack->rc_catch_up; 11744 log.u_bbr.flex7 <<= 1; 11745 log.u_bbr.flex7 |= rack->rc_hybrid_mode; 11746 log.u_bbr.flex7 <<= 1; 11747 log.u_bbr.flex7 |= rack->dgp_on; 11748 log.u_bbr.flex8 = mod; 11749 log.u_bbr.delRate = rack->r_ctl.bw_rate_cap; 11750 log.u_bbr.bbr_substate = rack->r_ctl.client_suggested_maxseg; 11751 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 11752 log.u_bbr.pkt_epoch = rack->rc_tp->tcp_hybrid_start; 11753 log.u_bbr.lost = rack->rc_tp->tcp_hybrid_error; 11754 log.u_bbr.pacing_gain = (uint16_t)rack->rc_tp->tcp_hybrid_stop; 11755 tcp_log_event(rack->rc_tp, NULL, 11756 &rack->rc_inp->inp_socket->so_rcv, 11757 &rack->rc_inp->inp_socket->so_snd, 11758 TCP_HYBRID_PACING_LOG, 0, 11759 0, &log, false, NULL, __func__, __LINE__, &tv); 11760 } 11761 } 11762 #endif 11763 11764 #ifdef TCP_REQUEST_TRK 11765 static void 11766 rack_set_dgp_hybrid_mode(struct tcp_rack *rack, tcp_seq seq, uint32_t len) 11767 { 11768 struct tcp_sendfile_track *rc_cur; 11769 struct tcpcb *tp; 11770 int err = 0; 11771 11772 rc_cur = tcp_req_find_req_for_seq(rack->rc_tp, seq); 11773 if (rc_cur == NULL) { 11774 /* If not in the beginning what about the end 
piece */ 11775 if (rack->rc_hybrid_mode) 11776 rack_log_hybrid(rack, seq, NULL, HYBRID_LOG_NO_RANGE, __LINE__, err); 11777 rc_cur = tcp_req_find_req_for_seq(rack->rc_tp, (seq + len - 1)); 11778 } else { 11779 err = 12345; 11780 } 11781 /* If we find no parameters we are in straight DGP mode */ 11782 if(rc_cur == NULL) { 11783 /* None found for this seq, just DGP for now */ 11784 rack->r_ctl.client_suggested_maxseg = 0; 11785 rack->rc_catch_up = 0; 11786 rack->r_ctl.bw_rate_cap = 0; 11787 if (rack->rc_hybrid_mode) 11788 rack_log_hybrid(rack, (seq + len - 1), NULL, HYBRID_LOG_NO_RANGE, __LINE__, err); 11789 if (rack->r_ctl.rc_last_sft) { 11790 rack->r_ctl.rc_last_sft = NULL; 11791 } 11792 return; 11793 } 11794 if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_WASSET) == 0) { 11795 /* This entry was never setup for hybrid pacing on/off etc */ 11796 return; 11797 } 11798 /* 11799 * Ok if we have a new entry *or* have never 11800 * set up an entry we need to proceed. If 11801 * we have already set it up this entry we 11802 * just continue along with what we already 11803 * setup. 11804 */ 11805 tp = rack->rc_tp; 11806 if ((rack->r_ctl.rc_last_sft != NULL) && 11807 (rack->r_ctl.rc_last_sft == rc_cur)) { 11808 /* Its already in place */ 11809 if (rack->rc_hybrid_mode) 11810 rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_ISSAME, __LINE__, 0); 11811 return; 11812 } 11813 if (rack->rc_hybrid_mode == 0) { 11814 rack->r_ctl.rc_last_sft = rc_cur; 11815 rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_RULES_APP, __LINE__, 0); 11816 return; 11817 } 11818 if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_CSPR) && rc_cur->cspr){ 11819 /* Compensate for all the header overhead's */ 11820 rack->r_ctl.bw_rate_cap = rack_compensate_for_linerate(rack, rc_cur->cspr); 11821 } else 11822 rack->r_ctl.bw_rate_cap = 0; 11823 if (rc_cur->hybrid_flags & TCP_HYBRID_PACING_H_MS) 11824 rack->r_ctl.client_suggested_maxseg = rc_cur->hint_maxseg; 11825 else 11826 rack->r_ctl.client_suggested_maxseg = 0; 11827 if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_CU) && 11828 (rc_cur->cspr > 0)) { 11829 uint64_t len; 11830 11831 rack->rc_catch_up = 1; 11832 /* 11833 * Calculate the deadline time, first set the 11834 * time to when the request arrived. 11835 */ 11836 rc_cur->deadline = rc_cur->localtime; 11837 /* 11838 * Next calculate the length and compensate for 11839 * TLS if need be. 11840 */ 11841 len = rc_cur->end - rc_cur->start; 11842 if (tp->t_inpcb.inp_socket->so_snd.sb_tls_info) { 11843 /* 11844 * This session is doing TLS. Take a swag guess 11845 * at the overhead. 11846 */ 11847 len += tcp_estimate_tls_overhead(tp->t_inpcb.inp_socket, len); 11848 } 11849 /* 11850 * Now considering the size, and the cspr, what is the time that 11851 * would be required at the cspr rate. Here we use the raw 11852 * cspr value since the client only looks at the raw data. We 11853 * do use len which includes TLS overhead, but not the TCP/IP etc. 11854 * That will get made up for in the CU pacing rate set. 11855 */ 11856 len *= HPTS_USEC_IN_SEC; 11857 len /= rc_cur->cspr; 11858 rc_cur->deadline += len; 11859 } else { 11860 rack->rc_catch_up = 0; 11861 rc_cur->deadline = 0; 11862 } 11863 if (rack->r_ctl.client_suggested_maxseg != 0) { 11864 /* 11865 * We need to reset the max pace segs if we have a 11866 * client_suggested_maxseg. 
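 * Re-deriving the pacing burst sizes here lets a maxseg hint that
 * arrived with the request take effect right away for subsequent
 * sends on this entry.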
11867 */ 11868 rack_set_pace_segments(tp, rack, __LINE__, NULL); 11869 } 11870 rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_RULES_APP, __LINE__, 0); 11871 /* Remember it for next time and for CU mode */ 11872 rack->r_ctl.rc_last_sft = rc_cur; 11873 } 11874 #endif 11875 11876 static void 11877 rack_chk_req_and_hybrid_on_out(struct tcp_rack *rack, tcp_seq seq, uint32_t len, uint64_t cts) 11878 { 11879 #ifdef TCP_REQUEST_TRK 11880 struct tcp_sendfile_track *ent; 11881 11882 ent = rack->r_ctl.rc_last_sft; 11883 if ((ent == NULL) || 11884 (ent->flags == TCP_TRK_TRACK_FLG_EMPTY) || 11885 (SEQ_GEQ(seq, ent->end_seq))) { 11886 /* Time to update the track. */ 11887 rack_set_dgp_hybrid_mode(rack, seq, len); 11888 ent = rack->r_ctl.rc_last_sft; 11889 } 11890 /* Out of all tracking entries */ 11891 if (ent == NULL) { 11892 return; 11893 } 11894 if (SEQ_LT(ent->end_seq, (seq + len))) { 11895 /* 11896 * This is the case where our end_seq guess 11897 * was wrong. This is usually due to TLS having 11898 * more bytes than our guess. It could also be the 11899 * case that the client sent two requests close together 11900 * and the SB is full of both so we are sending part 11901 * of each (end|beg). In such a case let's move this 11902 * guy's end to match the end of this send. That 11903 * way it will complete when all of it is acked. 11904 */ 11905 ent->end_seq = (seq + len); 11906 if (rack->rc_hybrid_mode) 11907 rack_log_hybrid_bw(rack, seq, len, 0, 0, HYBRID_LOG_EXTEND, 0, ent, __LINE__); 11908 } 11909 /* Now validate we have set the send time of this one */ 11910 if ((ent->flags & TCP_TRK_TRACK_FLG_FSND) == 0) { 11911 ent->flags |= TCP_TRK_TRACK_FLG_FSND; 11912 ent->first_send = cts; 11913 ent->sent_at_fs = rack->rc_tp->t_sndbytes; 11914 ent->rxt_at_fs = rack->rc_tp->t_snd_rxt_bytes; 11915 } 11916 #endif 11917 } 11918 11919 static void 11920 rack_gain_for_fastoutput(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t acked_amount) 11921 { 11922 /* 11923 * The fast output path is enabled and we 11924 * have moved the cumack forward. Let's see if 11925 * we can expand the fast path length forward by 11926 * that amount. What we would ideally like to 11927 * do is increase the number of bytes in the 11928 * fast path block (left_to_send) by the 11929 * acked amount. However we have to gate that 11930 * by two factors: 11931 * 1) The amount outstanding and the rwnd of the peer 11932 * (i.e. we don't want to exceed the rwnd of the peer). 11933 * <and> 11934 * 2) The amount of data left in the socket buffer (i.e. 11935 * we can't send beyond what is in the buffer). 11936 * 11937 * Note that this does not take into account any increase 11938 * in the cwnd. We will only extend the fast path by 11939 * what was acked.
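 * Worked example (illustrative numbers only): with left_to_send of
 * 5000 bytes and an acked_amount of 10000, the proposed new total is
 * 15000; if the socket buffer holds 20000 un-sent bytes and the peer
 * window has 15000 bytes of headroom beyond what is outstanding, the
 * gate is min(20000, 15000) = 15000 and the extension is taken. Had
 * the window headroom been only 12000 the extension would be skipped
 * entirely; the growth is all-or-nothing, never clipped.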
11940 */ 11941 uint32_t new_total, gating_val; 11942 11943 new_total = acked_amount + rack->r_ctl.fsb.left_to_send; 11944 gating_val = min((sbavail(&so->so_snd) - (tp->snd_max - tp->snd_una)), 11945 (tp->snd_wnd - (tp->snd_max - tp->snd_una))); 11946 if (new_total <= gating_val) { 11947 /* We can increase left_to_send by the acked amount */ 11948 counter_u64_add(rack_extended_rfo, 1); 11949 rack->r_ctl.fsb.left_to_send = new_total; 11950 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(&rack->rc_inp->inp_socket->so_snd) - (tp->snd_max - tp->snd_una))), 11951 ("rack:%p left_to_send:%u sbavail:%u out:%u", 11952 rack, rack->r_ctl.fsb.left_to_send, 11953 sbavail(&rack->rc_inp->inp_socket->so_snd), 11954 (tp->snd_max - tp->snd_una))); 11955 11956 } 11957 } 11958 11959 static void 11960 rack_adjust_sendmap_head(struct tcp_rack *rack, struct sockbuf *sb) 11961 { 11962 /* 11963 * Here any sendmap entry that points to the 11964 * beginning mbuf must be adjusted to the correct 11965 * offset. This must be called with: 11966 * 1) The socket buffer locked 11967 * 2) snd_una adjusted to its new position. 11968 * 11969 * Note that (2) implies rack_ack_received has also 11970 * been called and all the sbcut's have been done. 11971 * 11972 * We grab the first mbuf in the socket buffer and 11973 * then go through the front of the sendmap, recalculating 11974 * the stored offset for any sendmap entry that has 11975 * that mbuf. We must use the sb functions to do this 11976 * since its possible an add was done has well as 11977 * the subtraction we may have just completed. This should 11978 * not be a penalty though, since we just referenced the sb 11979 * to go in and trim off the mbufs that we freed (of course 11980 * there will be a penalty for the sendmap references though). 11981 * 11982 * Note also with INVARIANT on, we validate with a KASSERT 11983 * that the first sendmap entry has a soff of 0. 11984 * 11985 */ 11986 struct mbuf *m; 11987 struct rack_sendmap *rsm; 11988 tcp_seq snd_una; 11989 #ifdef INVARIANTS 11990 int first_processed = 0; 11991 #endif 11992 11993 snd_una = rack->rc_tp->snd_una; 11994 SOCKBUF_LOCK_ASSERT(sb); 11995 m = sb->sb_mb; 11996 rsm = tqhash_min(rack->r_ctl.tqh); 11997 if ((rsm == NULL) || (m == NULL)) { 11998 /* Nothing outstanding */ 11999 return; 12000 } 12001 /* The very first RSM's mbuf must point to the head mbuf in the sb */ 12002 KASSERT((rsm->m == m), 12003 ("Rack:%p sb:%p rsm:%p -- first rsm mbuf not aligned to sb", 12004 rack, sb, rsm)); 12005 while (rsm->m && (rsm->m == m)) { 12006 /* one to adjust */ 12007 #ifdef INVARIANTS 12008 struct mbuf *tm; 12009 uint32_t soff; 12010 12011 tm = sbsndmbuf(sb, (rsm->r_start - snd_una), &soff); 12012 if ((rsm->orig_m_len != m->m_len) || 12013 (rsm->orig_t_space != M_TRAILINGROOM(m))){ 12014 rack_adjust_orig_mlen(rsm); 12015 } 12016 if (first_processed == 0) { 12017 KASSERT((rsm->soff == 0), 12018 ("Rack:%p rsm:%p -- rsm at head but soff not zero", 12019 rack, rsm)); 12020 first_processed = 1; 12021 } 12022 if ((rsm->soff != soff) || (rsm->m != tm)) { 12023 /* 12024 * This is not a fatal error, we anticipate it 12025 * might happen (the else code), so we count it here 12026 * so that under invariant we can see that it really 12027 * does happen. 
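 * A mismatch typically shows up when new data was appended to the
 * socket buffer (and possibly compressed into an existing mbuf)
 * while we were also trimming acked data off the front, which is
 * exactly the add-plus-subtract case called out above.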
12028 */ 12029 counter_u64_add(rack_adjust_map_bw, 1); 12030 } 12031 rsm->m = tm; 12032 rsm->soff = soff; 12033 if (tm) { 12034 rsm->orig_m_len = rsm->m->m_len; 12035 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 12036 } else { 12037 rsm->orig_m_len = 0; 12038 rsm->orig_t_space = 0; 12039 } 12040 #else 12041 rsm->m = sbsndmbuf(sb, (rsm->r_start - snd_una), &rsm->soff); 12042 if (rsm->m) { 12043 rsm->orig_m_len = rsm->m->m_len; 12044 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 12045 } else { 12046 rsm->orig_m_len = 0; 12047 rsm->orig_t_space = 0; 12048 } 12049 #endif 12050 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 12051 if (rsm == NULL) 12052 break; 12053 } 12054 } 12055 12056 #ifdef TCP_REQUEST_TRK 12057 static inline void 12058 rack_req_check_for_comp(struct tcp_rack *rack, tcp_seq th_ack) 12059 { 12060 struct tcp_sendfile_track *ent; 12061 int i; 12062 12063 if ((rack->rc_hybrid_mode == 0) && 12064 (tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING) == 0)) { 12065 /* 12066 * Just do normal completions hybrid pacing is not on 12067 * and CLDL is off as well. 12068 */ 12069 tcp_req_check_for_comp(rack->rc_tp, th_ack); 12070 return; 12071 } 12072 /* 12073 * Originally I was just going to find the th_ack associated 12074 * with an entry. But then I realized a large strech ack could 12075 * in theory ack two or more requests at once. So instead we 12076 * need to find all entries that are completed by th_ack not 12077 * just a single entry and do our logging. 12078 */ 12079 ent = tcp_req_find_a_req_that_is_completed_by(rack->rc_tp, th_ack, &i); 12080 while (ent != NULL) { 12081 /* 12082 * We may be doing hybrid pacing or CLDL and need more details possibly 12083 * so we do it manually instead of calling 12084 * tcp_req_check_for_comp() 12085 */ 12086 uint64_t laa, tim, data, cbw, ftim; 12087 12088 /* Ok this ack frees it */ 12089 rack_log_hybrid(rack, th_ack, 12090 ent, HYBRID_LOG_REQ_COMP, __LINE__, 0); 12091 rack_log_hybrid_sends(rack, ent, __LINE__); 12092 /* calculate the time based on the ack arrival */ 12093 data = ent->end - ent->start; 12094 laa = tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time); 12095 if (ent->flags & TCP_TRK_TRACK_FLG_FSND) { 12096 if (ent->first_send > ent->localtime) 12097 ftim = ent->first_send; 12098 else 12099 ftim = ent->localtime; 12100 } else { 12101 /* TSNH */ 12102 ftim = ent->localtime; 12103 } 12104 if (laa > ent->localtime) 12105 tim = laa - ftim; 12106 else 12107 tim = 0; 12108 cbw = data * HPTS_USEC_IN_SEC; 12109 if (tim > 0) 12110 cbw /= tim; 12111 else 12112 cbw = 0; 12113 rack_log_hybrid_bw(rack, th_ack, cbw, tim, data, HYBRID_LOG_BW_MEASURE, 0, ent, __LINE__); 12114 /* 12115 * Check to see if we are freeing what we are pointing to send wise 12116 * if so be sure to NULL the pointer so we know we are no longer 12117 * set to anything. 12118 */ 12119 if (ent == rack->r_ctl.rc_last_sft) 12120 rack->r_ctl.rc_last_sft = NULL; 12121 /* Generate the log that the tcp_netflix call would have */ 12122 tcp_req_log_req_info(rack->rc_tp, ent, 12123 i, TCP_TRK_REQ_LOG_FREED, 0, 0); 12124 /* Free it and see if there is another one */ 12125 tcp_req_free_a_slot(rack->rc_tp, ent); 12126 ent = tcp_req_find_a_req_that_is_completed_by(rack->rc_tp, th_ack, &i); 12127 } 12128 } 12129 #endif 12130 12131 12132 /* 12133 * Return value of 1, we do not need to call rack_process_data(). 12134 * return value of 0, rack_process_data can be called. 
12135 * For ret_val if its 0 the TCP is locked, if its non-zero 12136 * its unlocked and probably unsafe to touch the TCB. 12137 */ 12138 static int 12139 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, 12140 struct tcpcb *tp, struct tcpopt *to, 12141 uint32_t tiwin, int32_t tlen, 12142 int32_t * ofia, int32_t thflags, int32_t *ret_val) 12143 { 12144 int32_t ourfinisacked = 0; 12145 int32_t nsegs, acked_amount; 12146 int32_t acked; 12147 struct mbuf *mfree; 12148 struct tcp_rack *rack; 12149 int32_t under_pacing = 0; 12150 int32_t recovery = 0; 12151 12152 INP_WLOCK_ASSERT(tptoinpcb(tp)); 12153 12154 rack = (struct tcp_rack *)tp->t_fb_ptr; 12155 if (SEQ_GT(th->th_ack, tp->snd_max)) { 12156 __ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val, 12157 &rack->r_ctl.challenge_ack_ts, 12158 &rack->r_ctl.challenge_ack_cnt); 12159 rack->r_wanted_output = 1; 12160 return (1); 12161 } 12162 if (rack->gp_ready && 12163 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 12164 under_pacing = 1; 12165 } 12166 if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { 12167 int in_rec, dup_ack_struck = 0; 12168 int dsack_seen = 0, sacks_seen = 0; 12169 12170 in_rec = IN_FASTRECOVERY(tp->t_flags); 12171 if (rack->rc_in_persist) { 12172 tp->t_rxtshift = 0; 12173 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 12174 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 12175 } 12176 12177 if ((th->th_ack == tp->snd_una) && 12178 (tiwin == tp->snd_wnd) && 12179 ((to->to_flags & TOF_SACK) == 0)) { 12180 rack_strike_dupack(rack); 12181 dup_ack_struck = 1; 12182 } 12183 rack_log_ack(tp, to, th, ((in_rec == 0) && IN_FASTRECOVERY(tp->t_flags)), 12184 dup_ack_struck, &dsack_seen, &sacks_seen); 12185 if ((rack->sack_attack_disable > 0) && 12186 (th->th_ack == tp->snd_una) && 12187 (tiwin == tp->snd_wnd) && 12188 (dsack_seen == 0) && 12189 (sacks_seen > 0)) { 12190 /* 12191 * If sacks have been disabled we may 12192 * want to strike a dup-ack "ignoring" the 12193 * sack as long as the sack was not a "dsack". Note 12194 * that if no sack is sent (TOF_SACK is off) then the 12195 * normal dsack code above rack_log_ack() would have 12196 * already struck. So this is just to catch the case 12197 * were we are ignoring sacks from this guy due to 12198 * it being a suspected attacker. 12199 */ 12200 rack_strike_dupack(rack); 12201 } 12202 12203 } 12204 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 12205 /* 12206 * Old ack, behind (or duplicate to) the last one rcv'd 12207 * Note: We mark reordering is occuring if its 12208 * less than and we have not closed our window. 12209 */ 12210 if (SEQ_LT(th->th_ack, tp->snd_una) && (sbspace(&so->so_rcv) > ctf_fixed_maxseg(tp))) { 12211 rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 12212 if (rack->r_ctl.rc_reorder_ts == 0) 12213 rack->r_ctl.rc_reorder_ts = 1; 12214 } 12215 return (0); 12216 } 12217 /* 12218 * If we reach this point, ACK is not a duplicate, i.e., it ACKs 12219 * something we sent. 12220 */ 12221 if (tp->t_flags & TF_NEEDSYN) { 12222 /* 12223 * T/TCP: Connection was half-synchronized, and our SYN has 12224 * been ACK'd (so connection is now fully synchronized). Go 12225 * to non-starred state, increment snd_una for ACK of SYN, 12226 * and check if we can do window scaling. 12227 */ 12228 tp->t_flags &= ~TF_NEEDSYN; 12229 tp->snd_una++; 12230 /* Do window scaling? 
*/ 12231 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 12232 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 12233 tp->rcv_scale = tp->request_r_scale; 12234 /* Send window already scaled. */ 12235 } 12236 } 12237 nsegs = max(1, m->m_pkthdr.lro_nsegs); 12238 12239 acked = BYTES_THIS_ACK(tp, th); 12240 if (acked) { 12241 /* 12242 * Any time we move the cum-ack forward clear 12243 * keep-alive tied probe-not-answered. The 12244 * persists clears its own on entry. 12245 */ 12246 rack->probe_not_answered = 0; 12247 } 12248 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 12249 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 12250 /* 12251 * If we just performed our first retransmit, and the ACK arrives 12252 * within our recovery window, then it was a mistake to do the 12253 * retransmit in the first place. Recover our original cwnd and 12254 * ssthresh, and proceed to transmit where we left off. 12255 */ 12256 if ((tp->t_flags & TF_PREVVALID) && 12257 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) { 12258 tp->t_flags &= ~TF_PREVVALID; 12259 if (tp->t_rxtshift == 1 && 12260 (int)(ticks - tp->t_badrxtwin) < 0) 12261 rack_cong_signal(tp, CC_RTO_ERR, th->th_ack, __LINE__); 12262 } 12263 if (acked) { 12264 /* assure we are not backed off */ 12265 tp->t_rxtshift = 0; 12266 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 12267 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 12268 rack->rc_tlp_in_progress = 0; 12269 rack->r_ctl.rc_tlp_cnt_out = 0; 12270 /* 12271 * If it is the RXT timer we want to 12272 * stop it, so we can restart a TLP. 12273 */ 12274 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 12275 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 12276 #ifdef TCP_REQUEST_TRK 12277 rack_req_check_for_comp(rack, th->th_ack); 12278 #endif 12279 } 12280 /* 12281 * If we have a timestamp reply, update smoothed round trip time. If 12282 * no timestamp is present but transmit timer is running and timed 12283 * sequence number was acked, update smoothed round trip time. Since 12284 * we now have an rtt measurement, cancel the timer backoff (cf., 12285 * Phil Karn's retransmit alg.). Recompute the initial retransmit 12286 * timer. 12287 * 12288 * Some boxes send broken timestamp replies during the SYN+ACK 12289 * phase, ignore timestamps of 0 or we could calculate a huge RTT 12290 * and blow up the retransmit timer. 12291 */ 12292 /* 12293 * If all outstanding data is acked, stop retransmit timer and 12294 * remember to restart (more output or persist). If there is more 12295 * data to be acked, restart retransmit timer, using current 12296 * (possibly backed-off) value. 12297 */ 12298 if (acked == 0) { 12299 if (ofia) 12300 *ofia = ourfinisacked; 12301 return (0); 12302 } 12303 if (IN_RECOVERY(tp->t_flags)) { 12304 if (SEQ_LT(th->th_ack, tp->snd_recover) && 12305 (SEQ_LT(th->th_ack, tp->snd_max))) { 12306 tcp_rack_partialack(tp); 12307 } else { 12308 rack_post_recovery(tp, th->th_ack); 12309 recovery = 1; 12310 } 12311 } 12312 /* 12313 * Let the congestion control algorithm update congestion control 12314 * related information. This typically means increasing the 12315 * congestion window. 
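 * As a rough illustration, a newreno style module grows cwnd by
 * about one maxseg per ACK while in slow start and by roughly
 * maxseg * maxseg / cwnd once in congestion avoidance (subject to
 * ABC and any module specific behavior).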
12316 */ 12317 rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, recovery); 12318 SOCKBUF_LOCK(&so->so_snd); 12319 acked_amount = min(acked, (int)sbavail(&so->so_snd)); 12320 tp->snd_wnd -= acked_amount; 12321 mfree = sbcut_locked(&so->so_snd, acked_amount); 12322 if ((sbused(&so->so_snd) == 0) && 12323 (acked > acked_amount) && 12324 (tp->t_state >= TCPS_FIN_WAIT_1) && 12325 (tp->t_flags & TF_SENTFIN)) { 12326 /* 12327 * We must be sure our fin 12328 * was sent and acked (we can be 12329 * in FIN_WAIT_1 without having 12330 * sent the fin). 12331 */ 12332 ourfinisacked = 1; 12333 } 12334 tp->snd_una = th->th_ack; 12335 /* wakeups? */ 12336 if (acked_amount && sbavail(&so->so_snd)) 12337 rack_adjust_sendmap_head(rack, &so->so_snd); 12338 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); 12339 /* NB: sowwakeup_locked() does an implicit unlock. */ 12340 sowwakeup_locked(so); 12341 /* now check the rxt clamps */ 12342 if ((recovery == 1) && 12343 (rack->excess_rxt_on) && 12344 (rack->r_cwnd_was_clamped == 0)) { 12345 do_rack_excess_rxt(tp, rack); 12346 } else if (rack->r_cwnd_was_clamped) 12347 do_rack_check_for_unclamp(tp, rack); 12348 m_freem(mfree); 12349 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 12350 tp->snd_recover = tp->snd_una; 12351 12352 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { 12353 tp->snd_nxt = tp->snd_max; 12354 } 12355 if (under_pacing && 12356 (rack->use_fixed_rate == 0) && 12357 (rack->in_probe_rtt == 0) && 12358 rack->rc_gp_dyn_mul && 12359 rack->rc_always_pace) { 12360 /* Check if we are dragging bottom */ 12361 rack_check_bottom_drag(tp, rack, so); 12362 } 12363 if (tp->snd_una == tp->snd_max) { 12364 /* Nothing left outstanding */ 12365 tp->t_flags &= ~TF_PREVVALID; 12366 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 12367 rack->r_ctl.retran_during_recovery = 0; 12368 rack->r_ctl.dsack_byte_cnt = 0; 12369 if (rack->r_ctl.rc_went_idle_time == 0) 12370 rack->r_ctl.rc_went_idle_time = 1; 12371 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 12372 if (sbavail(&tptosocket(tp)->so_snd) == 0) 12373 tp->t_acktime = 0; 12374 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 12375 rack->rc_suspicious = 0; 12376 /* Set need output so persist might get set */ 12377 rack->r_wanted_output = 1; 12378 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 12379 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 12380 (sbavail(&so->so_snd) == 0) && 12381 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 12382 /* 12383 * The socket was gone and the 12384 * peer sent data (now or in the past), time to 12385 * reset him. 
12386 */ 12387 *ret_val = 1; 12388 /* tcp_close will kill the inp pre-log the Reset */ 12389 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 12390 tp = tcp_close(tp); 12391 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); 12392 return (1); 12393 } 12394 } 12395 if (ofia) 12396 *ofia = ourfinisacked; 12397 return (0); 12398 } 12399 12400 12401 static void 12402 rack_log_collapse(struct tcp_rack *rack, uint32_t cnt, uint32_t split, uint32_t out, int line, 12403 int dir, uint32_t flags, struct rack_sendmap *rsm) 12404 { 12405 if (tcp_bblogging_on(rack->rc_tp)) { 12406 union tcp_log_stackspecific log; 12407 struct timeval tv; 12408 12409 memset(&log, 0, sizeof(log)); 12410 log.u_bbr.flex1 = cnt; 12411 log.u_bbr.flex2 = split; 12412 log.u_bbr.flex3 = out; 12413 log.u_bbr.flex4 = line; 12414 log.u_bbr.flex5 = rack->r_must_retran; 12415 log.u_bbr.flex6 = flags; 12416 log.u_bbr.flex7 = rack->rc_has_collapsed; 12417 log.u_bbr.flex8 = dir; /* 12418 * 1 is collapsed, 0 is uncollapsed, 12419 * 2 is log of a rsm being marked, 3 is a split. 12420 */ 12421 if (rsm == NULL) 12422 log.u_bbr.rttProp = 0; 12423 else 12424 log.u_bbr.rttProp = (uint64_t)rsm; 12425 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 12426 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 12427 TCP_LOG_EVENTP(rack->rc_tp, NULL, 12428 &rack->rc_inp->inp_socket->so_rcv, 12429 &rack->rc_inp->inp_socket->so_snd, 12430 TCP_RACK_LOG_COLLAPSE, 0, 12431 0, &log, false, &tv); 12432 } 12433 } 12434 12435 static void 12436 rack_collapsed_window(struct tcp_rack *rack, uint32_t out, tcp_seq th_ack, int line) 12437 { 12438 /* 12439 * Here all we do is mark the collapsed point and set the flag. 12440 * This may happen again and again, but there is no 12441 * sense splitting our map until we know where the 12442 * peer finally lands in the collapse. 12443 */ 12444 tcp_trace_point(rack->rc_tp, TCP_TP_COLLAPSED_WND); 12445 if ((rack->rc_has_collapsed == 0) || 12446 (rack->r_ctl.last_collapse_point != (th_ack + rack->rc_tp->snd_wnd))) 12447 counter_u64_add(rack_collapsed_win_seen, 1); 12448 rack->r_ctl.last_collapse_point = th_ack + rack->rc_tp->snd_wnd; 12449 rack->r_ctl.high_collapse_point = rack->rc_tp->snd_max; 12450 rack->rc_has_collapsed = 1; 12451 rack->r_collapse_point_valid = 1; 12452 rack_log_collapse(rack, 0, th_ack, rack->r_ctl.last_collapse_point, line, 1, 0, NULL); 12453 } 12454 12455 static void 12456 rack_un_collapse_window(struct tcp_rack *rack, int line) 12457 { 12458 struct rack_sendmap *nrsm, *rsm; 12459 int cnt = 0, split = 0; 12460 int insret __diagused; 12461 12462 12463 tcp_trace_point(rack->rc_tp, TCP_TP_COLLAPSED_WND); 12464 rack->rc_has_collapsed = 0; 12465 rsm = tqhash_find(rack->r_ctl.tqh, rack->r_ctl.last_collapse_point); 12466 if (rsm == NULL) { 12467 /* Nothing to do maybe the peer ack'ed it all */ 12468 rack_log_collapse(rack, 0, 0, ctf_outstanding(rack->rc_tp), line, 0, 0, NULL); 12469 return; 12470 } 12471 /* Now do we need to split this one? */ 12472 if (SEQ_GT(rack->r_ctl.last_collapse_point, rsm->r_start)) { 12473 rack_log_collapse(rack, rsm->r_start, rsm->r_end, 12474 rack->r_ctl.last_collapse_point, line, 3, rsm->r_flags, rsm); 12475 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 12476 if (nrsm == NULL) { 12477 /* We can't get a rsm, mark all? 
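 * If the allocation fails we skip the split and simply mark from
 * the start of this rsm; a few bytes that are still inside the
 * window may pick up the collapsed flag too, which at worst costs
 * a little extra retransmission work later.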
*/ 12478 nrsm = rsm; 12479 goto no_split; 12480 } 12481 /* Clone it */ 12482 split = 1; 12483 rack_clone_rsm(rack, nrsm, rsm, rack->r_ctl.last_collapse_point); 12484 #ifndef INVARIANTS 12485 (void)tqhash_insert(rack->r_ctl.tqh, nrsm); 12486 #else 12487 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { 12488 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p", 12489 nrsm, insret, rack, rsm); 12490 } 12491 #endif 12492 rack_log_map_chg(rack->rc_tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 12493 rack->r_ctl.last_collapse_point, __LINE__); 12494 if (rsm->r_in_tmap) { 12495 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 12496 nrsm->r_in_tmap = 1; 12497 } 12498 /* 12499 * Set in the new RSM as the 12500 * collapsed starting point 12501 */ 12502 rsm = nrsm; 12503 } 12504 12505 no_split: 12506 TQHASH_FOREACH_FROM(nrsm, rack->r_ctl.tqh, rsm) { 12507 cnt++; 12508 nrsm->r_flags |= RACK_RWND_COLLAPSED; 12509 rack_log_collapse(rack, nrsm->r_start, nrsm->r_end, 0, line, 4, nrsm->r_flags, nrsm); 12510 cnt++; 12511 } 12512 if (cnt) { 12513 counter_u64_add(rack_collapsed_win, 1); 12514 } 12515 rack_log_collapse(rack, cnt, split, ctf_outstanding(rack->rc_tp), line, 0, 0, NULL); 12516 } 12517 12518 static void 12519 rack_handle_delayed_ack(struct tcpcb *tp, struct tcp_rack *rack, 12520 int32_t tlen, int32_t tfo_syn) 12521 { 12522 if (DELAY_ACK(tp, tlen) || tfo_syn) { 12523 rack_timer_cancel(tp, rack, 12524 rack->r_ctl.rc_rcvtime, __LINE__); 12525 tp->t_flags |= TF_DELACK; 12526 } else { 12527 rack->r_wanted_output = 1; 12528 tp->t_flags |= TF_ACKNOW; 12529 } 12530 } 12531 12532 static void 12533 rack_validate_fo_sendwin_up(struct tcpcb *tp, struct tcp_rack *rack) 12534 { 12535 /* 12536 * If fast output is in progress, lets validate that 12537 * the new window did not shrink on us and make it 12538 * so fast output should end. 12539 */ 12540 if (rack->r_fast_output) { 12541 uint32_t out; 12542 12543 /* 12544 * Calculate what we will send if left as is 12545 * and compare that to our send window. 12546 */ 12547 out = ctf_outstanding(tp); 12548 if ((out + rack->r_ctl.fsb.left_to_send) > tp->snd_wnd) { 12549 /* ok we have an issue */ 12550 if (out >= tp->snd_wnd) { 12551 /* Turn off fast output the window is met or collapsed */ 12552 rack->r_fast_output = 0; 12553 } else { 12554 /* we have some room left */ 12555 rack->r_ctl.fsb.left_to_send = tp->snd_wnd - out; 12556 if (rack->r_ctl.fsb.left_to_send < ctf_fixed_maxseg(tp)) { 12557 /* If not at least 1 full segment never mind */ 12558 rack->r_fast_output = 0; 12559 } 12560 } 12561 } 12562 } 12563 } 12564 12565 12566 /* 12567 * Return value of 1, the TCB is unlocked and most 12568 * likely gone, return value of 0, the TCP is still 12569 * locked. 12570 */ 12571 static int 12572 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, 12573 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 12574 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 12575 { 12576 /* 12577 * Update window information. Don't look at window if no ACK: TAC's 12578 * send garbage on first SYN. 
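 * The test below is the classic window-update acceptance rule: take
 * the advertised window when the segment carries a newer sequence
 * (snd_wl1), the same sequence with a newer ack (snd_wl2), or the
 * same sequence and ack with a larger window, so that stale segments
 * do not overwrite snd_wnd with out-of-date information.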
12579 */ 12580 int32_t nsegs; 12581 int32_t tfo_syn; 12582 struct tcp_rack *rack; 12583 12584 INP_WLOCK_ASSERT(tptoinpcb(tp)); 12585 12586 rack = (struct tcp_rack *)tp->t_fb_ptr; 12587 nsegs = max(1, m->m_pkthdr.lro_nsegs); 12588 if ((thflags & TH_ACK) && 12589 (SEQ_LT(tp->snd_wl1, th->th_seq) || 12590 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 12591 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 12592 /* keep track of pure window updates */ 12593 if (tlen == 0 && 12594 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 12595 KMOD_TCPSTAT_INC(tcps_rcvwinupd); 12596 tp->snd_wnd = tiwin; 12597 rack_validate_fo_sendwin_up(tp, rack); 12598 tp->snd_wl1 = th->th_seq; 12599 tp->snd_wl2 = th->th_ack; 12600 if (tp->snd_wnd > tp->max_sndwnd) 12601 tp->max_sndwnd = tp->snd_wnd; 12602 rack->r_wanted_output = 1; 12603 } else if (thflags & TH_ACK) { 12604 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { 12605 tp->snd_wnd = tiwin; 12606 rack_validate_fo_sendwin_up(tp, rack); 12607 tp->snd_wl1 = th->th_seq; 12608 tp->snd_wl2 = th->th_ack; 12609 } 12610 } 12611 if (tp->snd_wnd < ctf_outstanding(tp)) 12612 /* The peer collapsed the window */ 12613 rack_collapsed_window(rack, ctf_outstanding(tp), th->th_ack, __LINE__); 12614 else if (rack->rc_has_collapsed) 12615 rack_un_collapse_window(rack, __LINE__); 12616 if ((rack->r_collapse_point_valid) && 12617 (SEQ_GT(th->th_ack, rack->r_ctl.high_collapse_point))) 12618 rack->r_collapse_point_valid = 0; 12619 /* Was persist timer active and now we have window space? */ 12620 if ((rack->rc_in_persist != 0) && 12621 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 12622 rack->r_ctl.rc_pace_min_segs))) { 12623 rack_exit_persist(tp, rack, rack->r_ctl.rc_rcvtime); 12624 tp->snd_nxt = tp->snd_max; 12625 /* Make sure we output to start the timer */ 12626 rack->r_wanted_output = 1; 12627 } 12628 /* Do we enter persists? */ 12629 if ((rack->rc_in_persist == 0) && 12630 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 12631 TCPS_HAVEESTABLISHED(tp->t_state) && 12632 ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) && 12633 sbavail(&tptosocket(tp)->so_snd) && 12634 (sbavail(&tptosocket(tp)->so_snd) > tp->snd_wnd)) { 12635 /* 12636 * Here the rwnd is less than 12637 * the pacing size, we are established, 12638 * nothing is outstanding, and there is 12639 * data to send. Enter persists. 12640 */ 12641 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, tp->snd_una); 12642 } 12643 if (tp->t_flags2 & TF2_DROP_AF_DATA) { 12644 m_freem(m); 12645 return (0); 12646 } 12647 /* 12648 * don't process the URG bit, ignore them drag 12649 * along the up. 12650 */ 12651 tp->rcv_up = tp->rcv_nxt; 12652 12653 /* 12654 * Process the segment text, merging it into the TCP sequencing 12655 * queue, and arranging for acknowledgment of receipt if necessary. 12656 * This process logically involves adjusting tp->rcv_wnd as data is 12657 * presented to the user (this happens in tcp_usrreq.c, case 12658 * PRU_RCVD). If a FIN has already been received on this connection 12659 * then we just ignore the text. 
12660 */ 12661 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && 12662 IS_FASTOPEN(tp->t_flags)); 12663 if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) && 12664 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 12665 tcp_seq save_start = th->th_seq; 12666 tcp_seq save_rnxt = tp->rcv_nxt; 12667 int save_tlen = tlen; 12668 12669 m_adj(m, drop_hdrlen); /* delayed header drop */ 12670 /* 12671 * Insert segment which includes th into TCP reassembly 12672 * queue with control block tp. Set thflags to whether 12673 * reassembly now includes a segment with FIN. This handles 12674 * the common case inline (segment is the next to be 12675 * received on an established connection, and the queue is 12676 * empty), avoiding linkage into and removal from the queue 12677 * and repetition of various conversions. Set DELACK for 12678 * segments received in order, but ack immediately when 12679 * segments are out of order (so fast retransmit can work). 12680 */ 12681 if (th->th_seq == tp->rcv_nxt && 12682 SEGQ_EMPTY(tp) && 12683 (TCPS_HAVEESTABLISHED(tp->t_state) || 12684 tfo_syn)) { 12685 #ifdef NETFLIX_SB_LIMITS 12686 u_int mcnt, appended; 12687 12688 if (so->so_rcv.sb_shlim) { 12689 mcnt = m_memcnt(m); 12690 appended = 0; 12691 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 12692 CFO_NOSLEEP, NULL) == false) { 12693 counter_u64_add(tcp_sb_shlim_fails, 1); 12694 m_freem(m); 12695 return (0); 12696 } 12697 } 12698 #endif 12699 rack_handle_delayed_ack(tp, rack, tlen, tfo_syn); 12700 tp->rcv_nxt += tlen; 12701 if (tlen && 12702 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 12703 (tp->t_fbyte_in == 0)) { 12704 tp->t_fbyte_in = ticks; 12705 if (tp->t_fbyte_in == 0) 12706 tp->t_fbyte_in = 1; 12707 if (tp->t_fbyte_out && tp->t_fbyte_in) 12708 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 12709 } 12710 thflags = tcp_get_flags(th) & TH_FIN; 12711 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 12712 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 12713 SOCKBUF_LOCK(&so->so_rcv); 12714 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 12715 m_freem(m); 12716 } else 12717 #ifdef NETFLIX_SB_LIMITS 12718 appended = 12719 #endif 12720 sbappendstream_locked(&so->so_rcv, m, 0); 12721 12722 rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1); 12723 /* NB: sorwakeup_locked() does an implicit unlock. */ 12724 sorwakeup_locked(so); 12725 #ifdef NETFLIX_SB_LIMITS 12726 if (so->so_rcv.sb_shlim && appended != mcnt) 12727 counter_fo_release(so->so_rcv.sb_shlim, 12728 mcnt - appended); 12729 #endif 12730 } else { 12731 /* 12732 * XXX: Due to the header drop above "th" is 12733 * theoretically invalid by now. Fortunately 12734 * m_adj() doesn't actually frees any mbufs when 12735 * trimming from the head. 12736 */ 12737 tcp_seq temp = save_start; 12738 12739 thflags = tcp_reass(tp, th, &temp, &tlen, m); 12740 tp->t_flags |= TF_ACKNOW; 12741 if (tp->t_flags & TF_WAKESOR) { 12742 tp->t_flags &= ~TF_WAKESOR; 12743 /* NB: sorwakeup_locked() does an implicit unlock. */ 12744 sorwakeup_locked(so); 12745 } 12746 } 12747 if ((tp->t_flags & TF_SACK_PERMIT) && 12748 (save_tlen > 0) && 12749 TCPS_HAVEESTABLISHED(tp->t_state)) { 12750 if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { 12751 /* 12752 * DSACK actually handled in the fastpath 12753 * above. 12754 */ 12755 tcp_update_sack_list(tp, save_start, 12756 save_start + save_tlen); 12757 } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { 12758 if ((tp->rcv_numsacks >= 1) && 12759 (tp->sackblks[0].end == save_start)) { 12760 /* 12761 * Partial overlap, recorded at todrop 12762 * above. 
12763 */ 12764 tcp_update_sack_list(tp, 12765 tp->sackblks[0].start, 12766 tp->sackblks[0].end); 12767 } else { 12768 tcp_update_dsack_list(tp, save_start, 12769 save_start + save_tlen); 12770 } 12771 } else if (tlen >= save_tlen) { 12772 /* Update of sackblks. */ 12773 tcp_update_dsack_list(tp, save_start, 12774 save_start + save_tlen); 12775 } else if (tlen > 0) { 12776 tcp_update_dsack_list(tp, save_start, 12777 save_start + tlen); 12778 } 12779 } 12780 } else { 12781 m_freem(m); 12782 thflags &= ~TH_FIN; 12783 } 12784 12785 /* 12786 * If FIN is received ACK the FIN and let the user know that the 12787 * connection is closing. 12788 */ 12789 if (thflags & TH_FIN) { 12790 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 12791 /* The socket upcall is handled by socantrcvmore. */ 12792 socantrcvmore(so); 12793 /* 12794 * If connection is half-synchronized (ie NEEDSYN 12795 * flag on) then delay ACK, so it may be piggybacked 12796 * when SYN is sent. Otherwise, since we received a 12797 * FIN then no more input can be expected, send ACK 12798 * now. 12799 */ 12800 if (tp->t_flags & TF_NEEDSYN) { 12801 rack_timer_cancel(tp, rack, 12802 rack->r_ctl.rc_rcvtime, __LINE__); 12803 tp->t_flags |= TF_DELACK; 12804 } else { 12805 tp->t_flags |= TF_ACKNOW; 12806 } 12807 tp->rcv_nxt++; 12808 } 12809 switch (tp->t_state) { 12810 /* 12811 * In SYN_RECEIVED and ESTABLISHED STATES enter the 12812 * CLOSE_WAIT state. 12813 */ 12814 case TCPS_SYN_RECEIVED: 12815 tp->t_starttime = ticks; 12816 /* FALLTHROUGH */ 12817 case TCPS_ESTABLISHED: 12818 rack_timer_cancel(tp, rack, 12819 rack->r_ctl.rc_rcvtime, __LINE__); 12820 tcp_state_change(tp, TCPS_CLOSE_WAIT); 12821 break; 12822 12823 /* 12824 * If still in FIN_WAIT_1 STATE FIN has not been 12825 * acked so enter the CLOSING state. 12826 */ 12827 case TCPS_FIN_WAIT_1: 12828 rack_timer_cancel(tp, rack, 12829 rack->r_ctl.rc_rcvtime, __LINE__); 12830 tcp_state_change(tp, TCPS_CLOSING); 12831 break; 12832 12833 /* 12834 * In FIN_WAIT_2 state enter the TIME_WAIT state, 12835 * starting the time-wait timer, turning off the 12836 * other standard timers. 12837 */ 12838 case TCPS_FIN_WAIT_2: 12839 rack_timer_cancel(tp, rack, 12840 rack->r_ctl.rc_rcvtime, __LINE__); 12841 tcp_twstart(tp); 12842 return (1); 12843 } 12844 } 12845 /* 12846 * Return any desired output. 12847 */ 12848 if ((tp->t_flags & TF_ACKNOW) || 12849 (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) { 12850 rack->r_wanted_output = 1; 12851 } 12852 return (0); 12853 } 12854 12855 /* 12856 * Here nothing is really faster, its just that we 12857 * have broken out the fast-data path also just like 12858 * the fast-ack. 12859 */ 12860 static int 12861 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, 12862 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 12863 uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos) 12864 { 12865 int32_t nsegs; 12866 int32_t newsize = 0; /* automatic sockbuf scaling */ 12867 struct tcp_rack *rack; 12868 #ifdef NETFLIX_SB_LIMITS 12869 u_int mcnt, appended; 12870 #endif 12871 12872 /* 12873 * If last ACK falls within this segment's sequence numbers, record 12874 * the timestamp. NOTE that the test is modified according to the 12875 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 
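 * In practice this means ts_recent is only refreshed from segments
 * whose sequence number is at or before last_ack_sent, so a segment
 * from further ahead in the window cannot advance the echoed
 * timestamp prematurely.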
12876 */ 12877 if (__predict_false(th->th_seq != tp->rcv_nxt)) { 12878 return (0); 12879 } 12880 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 12881 return (0); 12882 } 12883 if (tiwin && tiwin != tp->snd_wnd) { 12884 return (0); 12885 } 12886 if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) { 12887 return (0); 12888 } 12889 if (__predict_false((to->to_flags & TOF_TS) && 12890 (TSTMP_LT(to->to_tsval, tp->ts_recent)))) { 12891 return (0); 12892 } 12893 if (__predict_false((th->th_ack != tp->snd_una))) { 12894 return (0); 12895 } 12896 if (__predict_false(tlen > sbspace(&so->so_rcv))) { 12897 return (0); 12898 } 12899 if ((to->to_flags & TOF_TS) != 0 && 12900 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 12901 tp->ts_recent_age = tcp_ts_getticks(); 12902 tp->ts_recent = to->to_tsval; 12903 } 12904 rack = (struct tcp_rack *)tp->t_fb_ptr; 12905 /* 12906 * This is a pure, in-sequence data packet with nothing on the 12907 * reassembly queue and we have enough buffer space to take it. 12908 */ 12909 nsegs = max(1, m->m_pkthdr.lro_nsegs); 12910 12911 #ifdef NETFLIX_SB_LIMITS 12912 if (so->so_rcv.sb_shlim) { 12913 mcnt = m_memcnt(m); 12914 appended = 0; 12915 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 12916 CFO_NOSLEEP, NULL) == false) { 12917 counter_u64_add(tcp_sb_shlim_fails, 1); 12918 m_freem(m); 12919 return (1); 12920 } 12921 } 12922 #endif 12923 /* Clean receiver SACK report if present */ 12924 if (tp->rcv_numsacks) 12925 tcp_clean_sackreport(tp); 12926 KMOD_TCPSTAT_INC(tcps_preddat); 12927 tp->rcv_nxt += tlen; 12928 if (tlen && 12929 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 12930 (tp->t_fbyte_in == 0)) { 12931 tp->t_fbyte_in = ticks; 12932 if (tp->t_fbyte_in == 0) 12933 tp->t_fbyte_in = 1; 12934 if (tp->t_fbyte_out && tp->t_fbyte_in) 12935 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 12936 } 12937 /* 12938 * Pull snd_wl1 up to prevent seq wrap relative to th_seq. 12939 */ 12940 tp->snd_wl1 = th->th_seq; 12941 /* 12942 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt. 12943 */ 12944 tp->rcv_up = tp->rcv_nxt; 12945 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 12946 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 12947 newsize = tcp_autorcvbuf(m, th, so, tp, tlen); 12948 12949 /* Add data to socket buffer. */ 12950 SOCKBUF_LOCK(&so->so_rcv); 12951 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 12952 m_freem(m); 12953 } else { 12954 /* 12955 * Set new socket buffer size. Give up when limit is 12956 * reached. 12957 */ 12958 if (newsize) 12959 if (!sbreserve_locked(so, SO_RCV, newsize, NULL)) 12960 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 12961 m_adj(m, drop_hdrlen); /* delayed header drop */ 12962 #ifdef NETFLIX_SB_LIMITS 12963 appended = 12964 #endif 12965 sbappendstream_locked(&so->so_rcv, m, 0); 12966 ctf_calc_rwin(so, tp); 12967 } 12968 rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1); 12969 /* NB: sorwakeup_locked() does an implicit unlock. */ 12970 sorwakeup_locked(so); 12971 #ifdef NETFLIX_SB_LIMITS 12972 if (so->so_rcv.sb_shlim && mcnt != appended) 12973 counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended); 12974 #endif 12975 rack_handle_delayed_ack(tp, rack, tlen, 0); 12976 if (tp->snd_una == tp->snd_max) 12977 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 12978 return (1); 12979 } 12980 12981 /* 12982 * This subfunction is used to try to highly optimize the 12983 * fast path. We again allow window updates that are 12984 * in sequence to remain in the fast-path. We also add 12985 * in the __predict's to attempt to help the compiler. 
12986 * Note that if we return a 0, then we can *not* process 12987 * it and the caller should push the packet into the 12988 * slow-path. 12989 */ 12990 static int 12991 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 12992 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 12993 uint32_t tiwin, int32_t nxt_pkt, uint32_t cts) 12994 { 12995 int32_t acked; 12996 int32_t nsegs; 12997 int32_t under_pacing = 0; 12998 struct tcp_rack *rack; 12999 13000 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 13001 /* Old ack, behind (or duplicate to) the last one rcv'd */ 13002 return (0); 13003 } 13004 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { 13005 /* Above what we have sent? */ 13006 return (0); 13007 } 13008 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 13009 /* We are retransmitting */ 13010 return (0); 13011 } 13012 if (__predict_false(tiwin == 0)) { 13013 /* zero window */ 13014 return (0); 13015 } 13016 if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) { 13017 /* We need a SYN or a FIN, unlikely.. */ 13018 return (0); 13019 } 13020 if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { 13021 /* Timestamp is behind .. old ack with seq wrap? */ 13022 return (0); 13023 } 13024 if (__predict_false(IN_RECOVERY(tp->t_flags))) { 13025 /* Still recovering */ 13026 return (0); 13027 } 13028 rack = (struct tcp_rack *)tp->t_fb_ptr; 13029 if (rack->r_ctl.rc_sacked) { 13030 /* We have sack holes on our scoreboard */ 13031 return (0); 13032 } 13033 /* Ok if we reach here, we can process a fast-ack */ 13034 if (rack->gp_ready && 13035 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 13036 under_pacing = 1; 13037 } 13038 nsegs = max(1, m->m_pkthdr.lro_nsegs); 13039 rack_log_ack(tp, to, th, 0, 0, NULL, NULL); 13040 /* Did the window get updated? */ 13041 if (tiwin != tp->snd_wnd) { 13042 tp->snd_wnd = tiwin; 13043 rack_validate_fo_sendwin_up(tp, rack); 13044 tp->snd_wl1 = th->th_seq; 13045 if (tp->snd_wnd > tp->max_sndwnd) 13046 tp->max_sndwnd = tp->snd_wnd; 13047 } 13048 /* Do we exit persists? */ 13049 if ((rack->rc_in_persist != 0) && 13050 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 13051 rack->r_ctl.rc_pace_min_segs))) { 13052 rack_exit_persist(tp, rack, cts); 13053 } 13054 /* Do we enter persists? */ 13055 if ((rack->rc_in_persist == 0) && 13056 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 13057 TCPS_HAVEESTABLISHED(tp->t_state) && 13058 ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) && 13059 sbavail(&tptosocket(tp)->so_snd) && 13060 (sbavail(&tptosocket(tp)->so_snd) > tp->snd_wnd)) { 13061 /* 13062 * Here the rwnd is less than 13063 * the pacing size, we are established, 13064 * nothing is outstanding, and there is 13065 * data to send. Enter persists. 13066 */ 13067 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, th->th_ack); 13068 } 13069 /* 13070 * If last ACK falls within this segment's sequence numbers, record 13071 * the timestamp. NOTE that the test is modified according to the 13072 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 13073 */ 13074 if ((to->to_flags & TOF_TS) != 0 && 13075 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 13076 tp->ts_recent_age = tcp_ts_getticks(); 13077 tp->ts_recent = to->to_tsval; 13078 } 13079 /* 13080 * This is a pure ack for outstanding data. 13081 */ 13082 KMOD_TCPSTAT_INC(tcps_predack); 13083 13084 /* 13085 * "bad retransmit" recovery. 
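 * If the very first RTO retransmission is acked while ticks is still
 * inside t_badrxtwin, the timeout was almost certainly spurious; the
 * CC_RTO_ERR signal below lets the stack restore the cwnd/ssthresh
 * state saved before the retransmit. The timestamp based detection
 * is handled separately, which is why this path requires
 * TF_RCVD_TSTMP to be clear.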
13086 */ 13087 if ((tp->t_flags & TF_PREVVALID) && 13088 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) { 13089 tp->t_flags &= ~TF_PREVVALID; 13090 if (tp->t_rxtshift == 1 && 13091 (int)(ticks - tp->t_badrxtwin) < 0) 13092 rack_cong_signal(tp, CC_RTO_ERR, th->th_ack, __LINE__); 13093 } 13094 /* 13095 * Recalculate the transmit timer / rtt. 13096 * 13097 * Some boxes send broken timestamp replies during the SYN+ACK 13098 * phase, ignore timestamps of 0 or we could calculate a huge RTT 13099 * and blow up the retransmit timer. 13100 */ 13101 acked = BYTES_THIS_ACK(tp, th); 13102 13103 #ifdef TCP_HHOOK 13104 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 13105 hhook_run_tcp_est_in(tp, th, to); 13106 #endif 13107 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 13108 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 13109 if (acked) { 13110 struct mbuf *mfree; 13111 13112 rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, 0); 13113 SOCKBUF_LOCK(&so->so_snd); 13114 mfree = sbcut_locked(&so->so_snd, acked); 13115 tp->snd_una = th->th_ack; 13116 /* Note we want to hold the sb lock through the sendmap adjust */ 13117 rack_adjust_sendmap_head(rack, &so->so_snd); 13118 /* Wake up the socket if we have room to write more */ 13119 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); 13120 sowwakeup_locked(so); 13121 m_freem(mfree); 13122 tp->t_rxtshift = 0; 13123 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 13124 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 13125 rack->rc_tlp_in_progress = 0; 13126 rack->r_ctl.rc_tlp_cnt_out = 0; 13127 /* 13128 * If it is the RXT timer we want to 13129 * stop it, so we can restart a TLP. 13130 */ 13131 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 13132 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 13133 13134 #ifdef TCP_REQUEST_TRK 13135 rack_req_check_for_comp(rack, th->th_ack); 13136 #endif 13137 } 13138 /* 13139 * Let the congestion control algorithm update congestion control 13140 * related information. This typically means increasing the 13141 * congestion window. 13142 */ 13143 if (tp->snd_wnd < ctf_outstanding(tp)) { 13144 /* The peer collapsed the window */ 13145 rack_collapsed_window(rack, ctf_outstanding(tp), th->th_ack, __LINE__); 13146 } else if (rack->rc_has_collapsed) 13147 rack_un_collapse_window(rack, __LINE__); 13148 if ((rack->r_collapse_point_valid) && 13149 (SEQ_GT(tp->snd_una, rack->r_ctl.high_collapse_point))) 13150 rack->r_collapse_point_valid = 0; 13151 /* 13152 * Pull snd_wl2 up to prevent seq wrap relative to th_ack. 13153 */ 13154 tp->snd_wl2 = th->th_ack; 13155 tp->t_dupacks = 0; 13156 m_freem(m); 13157 /* ND6_HINT(tp); *//* Some progress has been made. */ 13158 13159 /* 13160 * If all outstanding data are acked, stop retransmit timer, 13161 * otherwise restart timer using current (possibly backed-off) 13162 * value. If process is waiting for space, wakeup/selwakeup/signal. 13163 * If data are ready to send, let tcp_output decide between more 13164 * output or persist. 
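 * (In this stack the timer handling described above is done through
 * rack_timer_cancel() and the hpts pacer below rather than the
 * legacy callout based tcp timers this inherited comment refers to.)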
13165 */ 13166 if (under_pacing && 13167 (rack->use_fixed_rate == 0) && 13168 (rack->in_probe_rtt == 0) && 13169 rack->rc_gp_dyn_mul && 13170 rack->rc_always_pace) { 13171 /* Check if we are dragging bottom */ 13172 rack_check_bottom_drag(tp, rack, so); 13173 } 13174 if (tp->snd_una == tp->snd_max) { 13175 tp->t_flags &= ~TF_PREVVALID; 13176 rack->r_ctl.retran_during_recovery = 0; 13177 rack->rc_suspicious = 0; 13178 rack->r_ctl.dsack_byte_cnt = 0; 13179 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 13180 if (rack->r_ctl.rc_went_idle_time == 0) 13181 rack->r_ctl.rc_went_idle_time = 1; 13182 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 13183 if (sbavail(&tptosocket(tp)->so_snd) == 0) 13184 tp->t_acktime = 0; 13185 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 13186 } 13187 if (acked && rack->r_fast_output) 13188 rack_gain_for_fastoutput(rack, tp, so, (uint32_t)acked); 13189 if (sbavail(&so->so_snd)) { 13190 rack->r_wanted_output = 1; 13191 } 13192 return (1); 13193 } 13194 13195 /* 13196 * Return value of 1, the TCB is unlocked and most 13197 * likely gone, return value of 0, the TCP is still 13198 * locked. 13199 */ 13200 static int 13201 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, 13202 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 13203 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 13204 { 13205 int32_t ret_val = 0; 13206 int32_t todrop; 13207 int32_t ourfinisacked = 0; 13208 struct tcp_rack *rack; 13209 13210 INP_WLOCK_ASSERT(tptoinpcb(tp)); 13211 13212 ctf_calc_rwin(so, tp); 13213 /* 13214 * If the state is SYN_SENT: if seg contains an ACK, but not for our 13215 * SYN, drop the input. if seg contains a RST, then drop the 13216 * connection. if seg does not contain SYN, then drop it. Otherwise 13217 * this is an acceptable SYN segment initialize tp->rcv_nxt and 13218 * tp->irs if seg contains ack then advance tp->snd_una if seg 13219 * contains an ECE and ECN support is enabled, the stream is ECN 13220 * capable. if SYN has been acked change to ESTABLISHED else 13221 * SYN_RCVD state arrange for segment to be acked (eventually) 13222 * continue processing rest of data/controls. 13223 */ 13224 if ((thflags & TH_ACK) && 13225 (SEQ_LEQ(th->th_ack, tp->iss) || 13226 SEQ_GT(th->th_ack, tp->snd_max))) { 13227 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 13228 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 13229 return (1); 13230 } 13231 if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { 13232 TCP_PROBE5(connect__refused, NULL, tp, 13233 mtod(m, const char *), tp, th); 13234 tp = tcp_drop(tp, ECONNREFUSED); 13235 ctf_do_drop(m, tp); 13236 return (1); 13237 } 13238 if (thflags & TH_RST) { 13239 ctf_do_drop(m, tp); 13240 return (1); 13241 } 13242 if (!(thflags & TH_SYN)) { 13243 ctf_do_drop(m, tp); 13244 return (1); 13245 } 13246 tp->irs = th->th_seq; 13247 tcp_rcvseqinit(tp); 13248 rack = (struct tcp_rack *)tp->t_fb_ptr; 13249 if (thflags & TH_ACK) { 13250 int tfo_partial = 0; 13251 13252 KMOD_TCPSTAT_INC(tcps_connects); 13253 soisconnected(so); 13254 #ifdef MAC 13255 mac_socketpeer_set_from_mbuf(m, so); 13256 #endif 13257 /* Do window scaling on this connection? 
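 *
 * (Worked example, figures invented): scaling only takes effect when
 * we requested it (TF_REQ_SCALE) and the peer echoed it
 * (TF_RCVD_SCALE).  With request_r_scale = 7 a full 16-bit window
 * field of 65535 advertises
 *
 *	65535 << 7 = 8388480 bytes (about 8 MB)
 *
 * and rcv_adv below is likewise bounded by TCP_MAXWIN << rcv_scale.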
*/ 13258 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 13259 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 13260 tp->rcv_scale = tp->request_r_scale; 13261 } 13262 tp->rcv_adv += min(tp->rcv_wnd, 13263 TCP_MAXWIN << tp->rcv_scale); 13264 /* 13265 * If not all the data that was sent in the TFO SYN 13266 * has been acked, resend the remainder right away. 13267 */ 13268 if (IS_FASTOPEN(tp->t_flags) && 13269 (tp->snd_una != tp->snd_max)) { 13270 tp->snd_nxt = th->th_ack; 13271 tfo_partial = 1; 13272 } 13273 /* 13274 * If there's data, delay ACK; if there's also a FIN ACKNOW 13275 * will be turned on later. 13276 */ 13277 if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial) { 13278 rack_timer_cancel(tp, rack, 13279 rack->r_ctl.rc_rcvtime, __LINE__); 13280 tp->t_flags |= TF_DELACK; 13281 } else { 13282 rack->r_wanted_output = 1; 13283 tp->t_flags |= TF_ACKNOW; 13284 } 13285 13286 tcp_ecn_input_syn_sent(tp, thflags, iptos); 13287 13288 if (SEQ_GT(th->th_ack, tp->snd_una)) { 13289 /* 13290 * We advance snd_una for the 13291 * fast open case. If th_ack is 13292 * acknowledging data beyond 13293 * snd_una we can't just call 13294 * ack-processing since the 13295 * data stream in our send-map 13296 * will start at snd_una + 1 (one 13297 * beyond the SYN). If its just 13298 * equal we don't need to do that 13299 * and there is no send_map. 13300 */ 13301 tp->snd_una++; 13302 } 13303 /* 13304 * Received <SYN,ACK> in SYN_SENT[*] state. Transitions: 13305 * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 13306 */ 13307 tp->t_starttime = ticks; 13308 if (tp->t_flags & TF_NEEDFIN) { 13309 tcp_state_change(tp, TCPS_FIN_WAIT_1); 13310 tp->t_flags &= ~TF_NEEDFIN; 13311 thflags &= ~TH_SYN; 13312 } else { 13313 tcp_state_change(tp, TCPS_ESTABLISHED); 13314 TCP_PROBE5(connect__established, NULL, tp, 13315 mtod(m, const char *), tp, th); 13316 rack_cc_conn_init(tp); 13317 } 13318 } else { 13319 /* 13320 * Received initial SYN in SYN-SENT[*] state => simultaneous 13321 * open. If segment contains CC option and there is a 13322 * cached CC, apply TAO test. If it succeeds, connection is * 13323 * half-synchronized. Otherwise, do 3-way handshake: 13324 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If 13325 * there was no CC option, clear cached CC value. 13326 */ 13327 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN | TF_SONOTCONN); 13328 tcp_state_change(tp, TCPS_SYN_RECEIVED); 13329 } 13330 /* 13331 * Advance th->th_seq to correspond to first data byte. If data, 13332 * trim to stay within window, dropping FIN if necessary. 13333 */ 13334 th->th_seq++; 13335 if (tlen > tp->rcv_wnd) { 13336 todrop = tlen - tp->rcv_wnd; 13337 m_adj(m, -todrop); 13338 tlen = tp->rcv_wnd; 13339 thflags &= ~TH_FIN; 13340 KMOD_TCPSTAT_INC(tcps_rcvpackafterwin); 13341 KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 13342 } 13343 tp->snd_wl1 = th->th_seq - 1; 13344 tp->rcv_up = th->th_seq; 13345 /* 13346 * Client side of transaction: already sent SYN and data. If the 13347 * remote host used T/TCP to validate the SYN, our data will be 13348 * ACK'd; if so, enter normal data segment processing in the middle 13349 * of step 5, ack processing. Otherwise, goto step 6. 
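 *
 * Illustrative arithmetic for the timestamp RTT sample taken just
 * below (the numbers are invented): with the millisecond timestamp
 * clock, if tcp_ts_getticks() reads 50120 while the echoed to_tsecr
 * is 50100, then
 *
 *	t = (50120 - 50100) * HPTS_USEC_IN_MSEC = 20 * 1000 = 20000 usec
 *
 * i.e. a 20 ms sample, which seeds t_rttlow and the rack xmit timer.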
13350 */ 13351 if (thflags & TH_ACK) { 13352 /* For syn-sent we need to possibly update the rtt */ 13353 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 13354 uint32_t t, mcts; 13355 13356 mcts = tcp_ts_getticks(); 13357 t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC; 13358 if (!tp->t_rttlow || tp->t_rttlow > t) 13359 tp->t_rttlow = t; 13360 rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 4); 13361 tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2); 13362 tcp_rack_xmit_timer_commit(rack, tp); 13363 } 13364 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) 13365 return (ret_val); 13366 /* We may have changed to FIN_WAIT_1 above */ 13367 if (tp->t_state == TCPS_FIN_WAIT_1) { 13368 /* 13369 * In FIN_WAIT_1 STATE in addition to the processing 13370 * for the ESTABLISHED state if our FIN is now 13371 * acknowledged then enter FIN_WAIT_2. 13372 */ 13373 if (ourfinisacked) { 13374 /* 13375 * If we can't receive any more data, then 13376 * closing user can proceed. Starting the 13377 * timer is contrary to the specification, 13378 * but if we don't get a FIN we'll hang 13379 * forever. 13380 * 13381 * XXXjl: we should release the tp also, and 13382 * use a compressed state. 13383 */ 13384 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 13385 soisdisconnected(so); 13386 tcp_timer_activate(tp, TT_2MSL, 13387 (tcp_fast_finwait2_recycle ? 13388 tcp_finwait2_timeout : 13389 TP_MAXIDLE(tp))); 13390 } 13391 tcp_state_change(tp, TCPS_FIN_WAIT_2); 13392 } 13393 } 13394 } 13395 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13396 tiwin, thflags, nxt_pkt)); 13397 } 13398 13399 /* 13400 * Return value of 1, the TCB is unlocked and most 13401 * likely gone, return value of 0, the TCP is still 13402 * locked. 13403 */ 13404 static int 13405 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, 13406 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 13407 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 13408 { 13409 struct tcp_rack *rack; 13410 int32_t ret_val = 0; 13411 int32_t ourfinisacked = 0; 13412 13413 rack = (struct tcp_rack *)tp->t_fb_ptr; 13414 ctf_calc_rwin(so, tp); 13415 if ((thflags & TH_RST) || 13416 (tp->t_fin_is_rst && (thflags & TH_FIN))) 13417 return (__ctf_process_rst(m, th, so, tp, 13418 &rack->r_ctl.challenge_ack_ts, 13419 &rack->r_ctl.challenge_ack_cnt)); 13420 if ((thflags & TH_ACK) && 13421 (SEQ_LEQ(th->th_ack, tp->snd_una) || 13422 SEQ_GT(th->th_ack, tp->snd_max))) { 13423 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 13424 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 13425 return (1); 13426 } 13427 if (IS_FASTOPEN(tp->t_flags)) { 13428 /* 13429 * When a TFO connection is in SYN_RECEIVED, the 13430 * only valid packets are the initial SYN, a 13431 * retransmit/copy of the initial SYN (possibly with 13432 * a subset of the original data), a valid ACK, a 13433 * FIN, or a RST. 
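 *
 * Summarizing the handling that follows: a SYN|ACK in this state is
 * answered with a reset, a bare SYN that arrives while an RXT, TLP or
 * RACK timer is pending is dropped as a duplicate of the original,
 * and a segment carrying none of ACK, FIN or RST is dropped as well;
 * only the remaining cases continue into normal SYN_RECEIVED
 * processing.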
13434 */ 13435 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { 13436 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 13437 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 13438 return (1); 13439 } else if (thflags & TH_SYN) { 13440 /* non-initial SYN is ignored */ 13441 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || 13442 (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || 13443 (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { 13444 ctf_do_drop(m, NULL); 13445 return (0); 13446 } 13447 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { 13448 ctf_do_drop(m, NULL); 13449 return (0); 13450 } 13451 } 13452 13453 /* 13454 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 13455 * it's less than ts_recent, drop it. 13456 */ 13457 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 13458 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 13459 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 13460 return (ret_val); 13461 } 13462 /* 13463 * In the SYN-RECEIVED state, validate that the packet belongs to 13464 * this connection before trimming the data to fit the receive 13465 * window. Check the sequence number versus IRS since we know the 13466 * sequence numbers haven't wrapped. This is a partial fix for the 13467 * "LAND" DoS attack. 13468 */ 13469 if (SEQ_LT(th->th_seq, tp->irs)) { 13470 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 13471 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 13472 return (1); 13473 } 13474 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 13475 &rack->r_ctl.challenge_ack_ts, 13476 &rack->r_ctl.challenge_ack_cnt)) { 13477 return (ret_val); 13478 } 13479 /* 13480 * If last ACK falls within this segment's sequence numbers, record 13481 * its timestamp. NOTE: 1) That the test incorporates suggestions 13482 * from the latest proposal of the tcplw@cray.com list (Braden 13483 * 1993/04/26). 2) That updating only on newer timestamps interferes 13484 * with our earlier PAWS tests, so this check should be solely 13485 * predicated on the sequence space of this segment. 3) That we 13486 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 13487 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 13488 * SEG.Len, This modified check allows us to overcome RFC1323's 13489 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 13490 * p.869. In such cases, we can still calculate the RTT correctly 13491 * when RCV.NXT == Last.ACK.Sent. 13492 */ 13493 if ((to->to_flags & TOF_TS) != 0 && 13494 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 13495 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 13496 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 13497 tp->ts_recent_age = tcp_ts_getticks(); 13498 tp->ts_recent = to->to_tsval; 13499 } 13500 tp->snd_wnd = tiwin; 13501 rack_validate_fo_sendwin_up(tp, rack); 13502 /* 13503 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 13504 * is on (half-synchronized state), then queue data for later 13505 * processing; else drop segment and return. 13506 */ 13507 if ((thflags & TH_ACK) == 0) { 13508 if (IS_FASTOPEN(tp->t_flags)) { 13509 rack_cc_conn_init(tp); 13510 } 13511 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13512 tiwin, thflags, nxt_pkt)); 13513 } 13514 KMOD_TCPSTAT_INC(tcps_connects); 13515 if (tp->t_flags & TF_SONOTCONN) { 13516 tp->t_flags &= ~TF_SONOTCONN; 13517 soisconnected(so); 13518 } 13519 /* Do window scaling? 
*/ 13520 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 13521 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 13522 tp->rcv_scale = tp->request_r_scale; 13523 } 13524 /* 13525 * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> 13526 * FIN-WAIT-1 13527 */ 13528 tp->t_starttime = ticks; 13529 if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { 13530 tcp_fastopen_decrement_counter(tp->t_tfo_pending); 13531 tp->t_tfo_pending = NULL; 13532 } 13533 if (tp->t_flags & TF_NEEDFIN) { 13534 tcp_state_change(tp, TCPS_FIN_WAIT_1); 13535 tp->t_flags &= ~TF_NEEDFIN; 13536 } else { 13537 tcp_state_change(tp, TCPS_ESTABLISHED); 13538 TCP_PROBE5(accept__established, NULL, tp, 13539 mtod(m, const char *), tp, th); 13540 /* 13541 * TFO connections call cc_conn_init() during SYN 13542 * processing. Calling it again here for such connections 13543 * is not harmless as it would undo the snd_cwnd reduction 13544 * that occurs when a TFO SYN|ACK is retransmitted. 13545 */ 13546 if (!IS_FASTOPEN(tp->t_flags)) 13547 rack_cc_conn_init(tp); 13548 } 13549 /* 13550 * Account for the ACK of our SYN prior to 13551 * regular ACK processing below, except for 13552 * simultaneous SYN, which is handled later. 13553 */ 13554 if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN)) 13555 tp->snd_una++; 13556 /* 13557 * If segment contains data or ACK, will call tcp_reass() later; if 13558 * not, do so now to pass queued data to user. 13559 */ 13560 if (tlen == 0 && (thflags & TH_FIN) == 0) { 13561 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, 13562 (struct mbuf *)0); 13563 if (tp->t_flags & TF_WAKESOR) { 13564 tp->t_flags &= ~TF_WAKESOR; 13565 /* NB: sorwakeup_locked() does an implicit unlock. */ 13566 sorwakeup_locked(so); 13567 } 13568 } 13569 tp->snd_wl1 = th->th_seq - 1; 13570 /* For syn-recv we need to possibly update the rtt */ 13571 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 13572 uint32_t t, mcts; 13573 13574 mcts = tcp_ts_getticks(); 13575 t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC; 13576 if (!tp->t_rttlow || tp->t_rttlow > t) 13577 tp->t_rttlow = t; 13578 rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 5); 13579 tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2); 13580 tcp_rack_xmit_timer_commit(rack, tp); 13581 } 13582 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 13583 return (ret_val); 13584 } 13585 if (tp->t_state == TCPS_FIN_WAIT_1) { 13586 /* We could have went to FIN_WAIT_1 (or EST) above */ 13587 /* 13588 * In FIN_WAIT_1 STATE in addition to the processing for the 13589 * ESTABLISHED state if our FIN is now acknowledged then 13590 * enter FIN_WAIT_2. 13591 */ 13592 if (ourfinisacked) { 13593 /* 13594 * If we can't receive any more data, then closing 13595 * user can proceed. Starting the timer is contrary 13596 * to the specification, but if we don't get a FIN 13597 * we'll hang forever. 13598 * 13599 * XXXjl: we should release the tp also, and use a 13600 * compressed state. 13601 */ 13602 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 13603 soisdisconnected(so); 13604 tcp_timer_activate(tp, TT_2MSL, 13605 (tcp_fast_finwait2_recycle ? 13606 tcp_finwait2_timeout : 13607 TP_MAXIDLE(tp))); 13608 } 13609 tcp_state_change(tp, TCPS_FIN_WAIT_2); 13610 } 13611 } 13612 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13613 tiwin, thflags, nxt_pkt)); 13614 } 13615 13616 /* 13617 * Return value of 1, the TCB is unlocked and most 13618 * likely gone, return value of 0, the TCP is still 13619 * locked. 
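 *
 * A caller-side sketch of this locking contract (illustrative only;
 * "state_handler" is a stand-in, not the real dispatch code): the
 * handlers are entered with the inpcb write-locked, and a return of 1
 * means the handler already consumed the segment and released or
 * destroyed the tcb:
 *
 *	INP_WLOCK_ASSERT(tptoinpcb(tp));
 *	if ((*state_handler)(m, th, so, tp, to, ...) == 1)
 *		return;			(tcb gone, lock dropped)
 *	...				(tp still locked and usable)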
13620 */ 13621 static int 13622 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, 13623 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 13624 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 13625 { 13626 int32_t ret_val = 0; 13627 struct tcp_rack *rack; 13628 13629 /* 13630 * Header prediction: check for the two common cases of a 13631 * uni-directional data xfer. If the packet has no control flags, 13632 * is in-sequence, the window didn't change and we're not 13633 * retransmitting, it's a candidate. If the length is zero and the 13634 * ack moved forward, we're the sender side of the xfer. Just free 13635 * the data acked & wake any higher level process that was blocked 13636 * waiting for space. If the length is non-zero and the ack didn't 13637 * move, we're the receiver side. If we're getting packets in-order 13638 * (the reassembly queue is empty), add the data toc The socket 13639 * buffer and note that we need a delayed ack. Make sure that the 13640 * hidden state-flags are also off. Since we check for 13641 * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN. 13642 */ 13643 rack = (struct tcp_rack *)tp->t_fb_ptr; 13644 if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && 13645 __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_ACK)) == TH_ACK) && 13646 __predict_true(SEGQ_EMPTY(tp)) && 13647 __predict_true(th->th_seq == tp->rcv_nxt)) { 13648 if (tlen == 0) { 13649 if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen, 13650 tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) { 13651 return (0); 13652 } 13653 } else { 13654 if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, 13655 tiwin, nxt_pkt, iptos)) { 13656 return (0); 13657 } 13658 } 13659 } 13660 ctf_calc_rwin(so, tp); 13661 13662 if ((thflags & TH_RST) || 13663 (tp->t_fin_is_rst && (thflags & TH_FIN))) 13664 return (__ctf_process_rst(m, th, so, tp, 13665 &rack->r_ctl.challenge_ack_ts, 13666 &rack->r_ctl.challenge_ack_cnt)); 13667 13668 /* 13669 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 13670 * synchronized state. 13671 */ 13672 if (thflags & TH_SYN) { 13673 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 13674 return (ret_val); 13675 } 13676 /* 13677 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 13678 * it's less than ts_recent, drop it. 13679 */ 13680 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 13681 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 13682 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 13683 return (ret_val); 13684 } 13685 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 13686 &rack->r_ctl.challenge_ack_ts, 13687 &rack->r_ctl.challenge_ack_cnt)) { 13688 return (ret_val); 13689 } 13690 /* 13691 * If last ACK falls within this segment's sequence numbers, record 13692 * its timestamp. NOTE: 1) That the test incorporates suggestions 13693 * from the latest proposal of the tcplw@cray.com list (Braden 13694 * 1993/04/26). 2) That updating only on newer timestamps interferes 13695 * with our earlier PAWS tests, so this check should be solely 13696 * predicated on the sequence space of this segment. 3) That we 13697 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 13698 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 13699 * SEG.Len, This modified check allows us to overcome RFC1323's 13700 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 13701 * p.869. 
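 *
 * Worked example of the relaxed boundary test (figures invented):
 * take Last.ACK.Sent = 1000 and a pure ACK with SEG.SEQ = 1000 and
 * SEG.Len = 0.  RFC1323's strict form asks for 1000 < 1000 + 0 and
 * would skip the update, while the "<=" form used here accepts
 * 1000 <= 1000 + 0, so ts_recent keeps tracking the peer's newest
 * clock even on zero-length segments.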
In such cases, we can still calculate the RTT correctly 13702 * when RCV.NXT == Last.ACK.Sent. 13703 */ 13704 if ((to->to_flags & TOF_TS) != 0 && 13705 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 13706 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 13707 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 13708 tp->ts_recent_age = tcp_ts_getticks(); 13709 tp->ts_recent = to->to_tsval; 13710 } 13711 /* 13712 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 13713 * is on (half-synchronized state), then queue data for later 13714 * processing; else drop segment and return. 13715 */ 13716 if ((thflags & TH_ACK) == 0) { 13717 if (tp->t_flags & TF_NEEDSYN) { 13718 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13719 tiwin, thflags, nxt_pkt)); 13720 13721 } else if (tp->t_flags & TF_ACKNOW) { 13722 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 13723 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 13724 return (ret_val); 13725 } else { 13726 ctf_do_drop(m, NULL); 13727 return (0); 13728 } 13729 } 13730 /* 13731 * Ack processing. 13732 */ 13733 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 13734 return (ret_val); 13735 } 13736 if (sbavail(&so->so_snd)) { 13737 if (ctf_progress_timeout_check(tp, true)) { 13738 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 13739 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 13740 return (1); 13741 } 13742 } 13743 /* State changes only happen in rack_process_data() */ 13744 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13745 tiwin, thflags, nxt_pkt)); 13746 } 13747 13748 /* 13749 * Return value of 1, the TCB is unlocked and most 13750 * likely gone, return value of 0, the TCP is still 13751 * locked. 13752 */ 13753 static int 13754 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, 13755 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 13756 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 13757 { 13758 int32_t ret_val = 0; 13759 struct tcp_rack *rack; 13760 13761 rack = (struct tcp_rack *)tp->t_fb_ptr; 13762 ctf_calc_rwin(so, tp); 13763 if ((thflags & TH_RST) || 13764 (tp->t_fin_is_rst && (thflags & TH_FIN))) 13765 return (__ctf_process_rst(m, th, so, tp, 13766 &rack->r_ctl.challenge_ack_ts, 13767 &rack->r_ctl.challenge_ack_cnt)); 13768 /* 13769 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 13770 * synchronized state. 13771 */ 13772 if (thflags & TH_SYN) { 13773 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 13774 return (ret_val); 13775 } 13776 /* 13777 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 13778 * it's less than ts_recent, drop it. 13779 */ 13780 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 13781 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 13782 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 13783 return (ret_val); 13784 } 13785 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 13786 &rack->r_ctl.challenge_ack_ts, 13787 &rack->r_ctl.challenge_ack_cnt)) { 13788 return (ret_val); 13789 } 13790 /* 13791 * If last ACK falls within this segment's sequence numbers, record 13792 * its timestamp. NOTE: 1) That the test incorporates suggestions 13793 * from the latest proposal of the tcplw@cray.com list (Braden 13794 * 1993/04/26). 2) That updating only on newer timestamps interferes 13795 * with our earlier PAWS tests, so this check should be solely 13796 * predicated on the sequence space of this segment. 
3) That we 13797 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 13798 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 13799 * SEG.Len, This modified check allows us to overcome RFC1323's 13800 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 13801 * p.869. In such cases, we can still calculate the RTT correctly 13802 * when RCV.NXT == Last.ACK.Sent. 13803 */ 13804 if ((to->to_flags & TOF_TS) != 0 && 13805 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 13806 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 13807 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 13808 tp->ts_recent_age = tcp_ts_getticks(); 13809 tp->ts_recent = to->to_tsval; 13810 } 13811 /* 13812 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 13813 * is on (half-synchronized state), then queue data for later 13814 * processing; else drop segment and return. 13815 */ 13816 if ((thflags & TH_ACK) == 0) { 13817 if (tp->t_flags & TF_NEEDSYN) { 13818 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13819 tiwin, thflags, nxt_pkt)); 13820 13821 } else if (tp->t_flags & TF_ACKNOW) { 13822 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 13823 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 13824 return (ret_val); 13825 } else { 13826 ctf_do_drop(m, NULL); 13827 return (0); 13828 } 13829 } 13830 /* 13831 * Ack processing. 13832 */ 13833 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 13834 return (ret_val); 13835 } 13836 if (sbavail(&so->so_snd)) { 13837 if (ctf_progress_timeout_check(tp, true)) { 13838 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 13839 tp, tick, PROGRESS_DROP, __LINE__); 13840 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 13841 return (1); 13842 } 13843 } 13844 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13845 tiwin, thflags, nxt_pkt)); 13846 } 13847 13848 static int 13849 rack_check_data_after_close(struct mbuf *m, 13850 struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so) 13851 { 13852 struct tcp_rack *rack; 13853 13854 rack = (struct tcp_rack *)tp->t_fb_ptr; 13855 if (rack->rc_allow_data_af_clo == 0) { 13856 close_now: 13857 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 13858 /* tcp_close will kill the inp pre-log the Reset */ 13859 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 13860 tp = tcp_close(tp); 13861 KMOD_TCPSTAT_INC(tcps_rcvafterclose); 13862 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); 13863 return (1); 13864 } 13865 if (sbavail(&so->so_snd) == 0) 13866 goto close_now; 13867 /* Ok we allow data that is ignored and a followup reset */ 13868 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 13869 tp->rcv_nxt = th->th_seq + *tlen; 13870 tp->t_flags2 |= TF2_DROP_AF_DATA; 13871 rack->r_wanted_output = 1; 13872 *tlen = 0; 13873 return (0); 13874 } 13875 13876 /* 13877 * Return value of 1, the TCB is unlocked and most 13878 * likely gone, return value of 0, the TCP is still 13879 * locked. 
13880 */ 13881 static int 13882 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, 13883 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 13884 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 13885 { 13886 int32_t ret_val = 0; 13887 int32_t ourfinisacked = 0; 13888 struct tcp_rack *rack; 13889 13890 rack = (struct tcp_rack *)tp->t_fb_ptr; 13891 ctf_calc_rwin(so, tp); 13892 13893 if ((thflags & TH_RST) || 13894 (tp->t_fin_is_rst && (thflags & TH_FIN))) 13895 return (__ctf_process_rst(m, th, so, tp, 13896 &rack->r_ctl.challenge_ack_ts, 13897 &rack->r_ctl.challenge_ack_cnt)); 13898 /* 13899 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 13900 * synchronized state. 13901 */ 13902 if (thflags & TH_SYN) { 13903 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 13904 return (ret_val); 13905 } 13906 /* 13907 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 13908 * it's less than ts_recent, drop it. 13909 */ 13910 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 13911 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 13912 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 13913 return (ret_val); 13914 } 13915 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 13916 &rack->r_ctl.challenge_ack_ts, 13917 &rack->r_ctl.challenge_ack_cnt)) { 13918 return (ret_val); 13919 } 13920 /* 13921 * If new data are received on a connection after the user processes 13922 * are gone, then RST the other end. 13923 */ 13924 if ((tp->t_flags & TF_CLOSED) && tlen && 13925 rack_check_data_after_close(m, tp, &tlen, th, so)) 13926 return (1); 13927 /* 13928 * If last ACK falls within this segment's sequence numbers, record 13929 * its timestamp. NOTE: 1) That the test incorporates suggestions 13930 * from the latest proposal of the tcplw@cray.com list (Braden 13931 * 1993/04/26). 2) That updating only on newer timestamps interferes 13932 * with our earlier PAWS tests, so this check should be solely 13933 * predicated on the sequence space of this segment. 3) That we 13934 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 13935 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 13936 * SEG.Len, This modified check allows us to overcome RFC1323's 13937 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 13938 * p.869. In such cases, we can still calculate the RTT correctly 13939 * when RCV.NXT == Last.ACK.Sent. 13940 */ 13941 if ((to->to_flags & TOF_TS) != 0 && 13942 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 13943 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 13944 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 13945 tp->ts_recent_age = tcp_ts_getticks(); 13946 tp->ts_recent = to->to_tsval; 13947 } 13948 /* 13949 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 13950 * is on (half-synchronized state), then queue data for later 13951 * processing; else drop segment and return. 13952 */ 13953 if ((thflags & TH_ACK) == 0) { 13954 if (tp->t_flags & TF_NEEDSYN) { 13955 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13956 tiwin, thflags, nxt_pkt)); 13957 } else if (tp->t_flags & TF_ACKNOW) { 13958 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 13959 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 13960 return (ret_val); 13961 } else { 13962 ctf_do_drop(m, NULL); 13963 return (0); 13964 } 13965 } 13966 /* 13967 * Ack processing. 
13968 */ 13969 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 13970 return (ret_val); 13971 } 13972 if (ourfinisacked) { 13973 /* 13974 * If we can't receive any more data, then closing user can 13975 * proceed. Starting the timer is contrary to the 13976 * specification, but if we don't get a FIN we'll hang 13977 * forever. 13978 * 13979 * XXXjl: we should release the tp also, and use a 13980 * compressed state. 13981 */ 13982 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 13983 soisdisconnected(so); 13984 tcp_timer_activate(tp, TT_2MSL, 13985 (tcp_fast_finwait2_recycle ? 13986 tcp_finwait2_timeout : 13987 TP_MAXIDLE(tp))); 13988 } 13989 tcp_state_change(tp, TCPS_FIN_WAIT_2); 13990 } 13991 if (sbavail(&so->so_snd)) { 13992 if (ctf_progress_timeout_check(tp, true)) { 13993 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 13994 tp, tick, PROGRESS_DROP, __LINE__); 13995 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 13996 return (1); 13997 } 13998 } 13999 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 14000 tiwin, thflags, nxt_pkt)); 14001 } 14002 14003 /* 14004 * Return value of 1, the TCB is unlocked and most 14005 * likely gone, return value of 0, the TCP is still 14006 * locked. 14007 */ 14008 static int 14009 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, 14010 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 14011 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 14012 { 14013 int32_t ret_val = 0; 14014 int32_t ourfinisacked = 0; 14015 struct tcp_rack *rack; 14016 14017 rack = (struct tcp_rack *)tp->t_fb_ptr; 14018 ctf_calc_rwin(so, tp); 14019 14020 if ((thflags & TH_RST) || 14021 (tp->t_fin_is_rst && (thflags & TH_FIN))) 14022 return (__ctf_process_rst(m, th, so, tp, 14023 &rack->r_ctl.challenge_ack_ts, 14024 &rack->r_ctl.challenge_ack_cnt)); 14025 /* 14026 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 14027 * synchronized state. 14028 */ 14029 if (thflags & TH_SYN) { 14030 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 14031 return (ret_val); 14032 } 14033 /* 14034 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 14035 * it's less than ts_recent, drop it. 14036 */ 14037 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 14038 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 14039 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 14040 return (ret_val); 14041 } 14042 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 14043 &rack->r_ctl.challenge_ack_ts, 14044 &rack->r_ctl.challenge_ack_cnt)) { 14045 return (ret_val); 14046 } 14047 /* 14048 * If new data are received on a connection after the user processes 14049 * are gone, then RST the other end. 14050 */ 14051 if ((tp->t_flags & TF_CLOSED) && tlen && 14052 rack_check_data_after_close(m, tp, &tlen, th, so)) 14053 return (1); 14054 /* 14055 * If last ACK falls within this segment's sequence numbers, record 14056 * its timestamp. NOTE: 1) That the test incorporates suggestions 14057 * from the latest proposal of the tcplw@cray.com list (Braden 14058 * 1993/04/26). 2) That updating only on newer timestamps interferes 14059 * with our earlier PAWS tests, so this check should be solely 14060 * predicated on the sequence space of this segment. 
3) That we 14061 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 14062 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 14063 * SEG.Len, This modified check allows us to overcome RFC1323's 14064 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 14065 * p.869. In such cases, we can still calculate the RTT correctly 14066 * when RCV.NXT == Last.ACK.Sent. 14067 */ 14068 if ((to->to_flags & TOF_TS) != 0 && 14069 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 14070 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 14071 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 14072 tp->ts_recent_age = tcp_ts_getticks(); 14073 tp->ts_recent = to->to_tsval; 14074 } 14075 /* 14076 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 14077 * is on (half-synchronized state), then queue data for later 14078 * processing; else drop segment and return. 14079 */ 14080 if ((thflags & TH_ACK) == 0) { 14081 if (tp->t_flags & TF_NEEDSYN) { 14082 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 14083 tiwin, thflags, nxt_pkt)); 14084 } else if (tp->t_flags & TF_ACKNOW) { 14085 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 14086 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 14087 return (ret_val); 14088 } else { 14089 ctf_do_drop(m, NULL); 14090 return (0); 14091 } 14092 } 14093 /* 14094 * Ack processing. 14095 */ 14096 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 14097 return (ret_val); 14098 } 14099 if (ourfinisacked) { 14100 tcp_twstart(tp); 14101 m_freem(m); 14102 return (1); 14103 } 14104 if (sbavail(&so->so_snd)) { 14105 if (ctf_progress_timeout_check(tp, true)) { 14106 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 14107 tp, tick, PROGRESS_DROP, __LINE__); 14108 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 14109 return (1); 14110 } 14111 } 14112 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 14113 tiwin, thflags, nxt_pkt)); 14114 } 14115 14116 /* 14117 * Return value of 1, the TCB is unlocked and most 14118 * likely gone, return value of 0, the TCP is still 14119 * locked. 14120 */ 14121 static int 14122 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 14123 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 14124 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 14125 { 14126 int32_t ret_val = 0; 14127 int32_t ourfinisacked = 0; 14128 struct tcp_rack *rack; 14129 14130 rack = (struct tcp_rack *)tp->t_fb_ptr; 14131 ctf_calc_rwin(so, tp); 14132 14133 if ((thflags & TH_RST) || 14134 (tp->t_fin_is_rst && (thflags & TH_FIN))) 14135 return (__ctf_process_rst(m, th, so, tp, 14136 &rack->r_ctl.challenge_ack_ts, 14137 &rack->r_ctl.challenge_ack_cnt)); 14138 /* 14139 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 14140 * synchronized state. 14141 */ 14142 if (thflags & TH_SYN) { 14143 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 14144 return (ret_val); 14145 } 14146 /* 14147 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 14148 * it's less than ts_recent, drop it. 
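 *
 * A note on the comparison used below (assuming the usual tcp_seq.h
 * definition): TSTMP_LT() is a modular signed 32-bit compare, so
 * timestamp wrap is tolerated.  With ts_recent = 0xfffffff0 an
 * incoming tsval of 0x00000010 still counts as newer, since
 * (int)(0x00000010 - 0xfffffff0) = 0x20 > 0, while a genuinely stale
 * tsval such as 0xffffff00 compares older and is handed to
 * ctf_ts_check() here.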
14149 */ 14150 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 14151 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 14152 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 14153 return (ret_val); 14154 } 14155 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 14156 &rack->r_ctl.challenge_ack_ts, 14157 &rack->r_ctl.challenge_ack_cnt)) { 14158 return (ret_val); 14159 } 14160 /* 14161 * If new data are received on a connection after the user processes 14162 * are gone, then RST the other end. 14163 */ 14164 if ((tp->t_flags & TF_CLOSED) && tlen && 14165 rack_check_data_after_close(m, tp, &tlen, th, so)) 14166 return (1); 14167 /* 14168 * If last ACK falls within this segment's sequence numbers, record 14169 * its timestamp. NOTE: 1) That the test incorporates suggestions 14170 * from the latest proposal of the tcplw@cray.com list (Braden 14171 * 1993/04/26). 2) That updating only on newer timestamps interferes 14172 * with our earlier PAWS tests, so this check should be solely 14173 * predicated on the sequence space of this segment. 3) That we 14174 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 14175 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 14176 * SEG.Len, This modified check allows us to overcome RFC1323's 14177 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 14178 * p.869. In such cases, we can still calculate the RTT correctly 14179 * when RCV.NXT == Last.ACK.Sent. 14180 */ 14181 if ((to->to_flags & TOF_TS) != 0 && 14182 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 14183 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 14184 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 14185 tp->ts_recent_age = tcp_ts_getticks(); 14186 tp->ts_recent = to->to_tsval; 14187 } 14188 /* 14189 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 14190 * is on (half-synchronized state), then queue data for later 14191 * processing; else drop segment and return. 14192 */ 14193 if ((thflags & TH_ACK) == 0) { 14194 if (tp->t_flags & TF_NEEDSYN) { 14195 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 14196 tiwin, thflags, nxt_pkt)); 14197 } else if (tp->t_flags & TF_ACKNOW) { 14198 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 14199 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 14200 return (ret_val); 14201 } else { 14202 ctf_do_drop(m, NULL); 14203 return (0); 14204 } 14205 } 14206 /* 14207 * case TCPS_LAST_ACK: Ack processing. 14208 */ 14209 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 14210 return (ret_val); 14211 } 14212 if (ourfinisacked) { 14213 tp = tcp_close(tp); 14214 ctf_do_drop(m, tp); 14215 return (1); 14216 } 14217 if (sbavail(&so->so_snd)) { 14218 if (ctf_progress_timeout_check(tp, true)) { 14219 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 14220 tp, tick, PROGRESS_DROP, __LINE__); 14221 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 14222 return (1); 14223 } 14224 } 14225 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 14226 tiwin, thflags, nxt_pkt)); 14227 } 14228 14229 /* 14230 * Return value of 1, the TCB is unlocked and most 14231 * likely gone, return value of 0, the TCP is still 14232 * locked. 
14233 */ 14234 static int 14235 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, 14236 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 14237 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 14238 { 14239 int32_t ret_val = 0; 14240 int32_t ourfinisacked = 0; 14241 struct tcp_rack *rack; 14242 14243 rack = (struct tcp_rack *)tp->t_fb_ptr; 14244 ctf_calc_rwin(so, tp); 14245 14246 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 14247 if ((thflags & TH_RST) || 14248 (tp->t_fin_is_rst && (thflags & TH_FIN))) 14249 return (__ctf_process_rst(m, th, so, tp, 14250 &rack->r_ctl.challenge_ack_ts, 14251 &rack->r_ctl.challenge_ack_cnt)); 14252 /* 14253 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 14254 * synchronized state. 14255 */ 14256 if (thflags & TH_SYN) { 14257 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 14258 return (ret_val); 14259 } 14260 /* 14261 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 14262 * it's less than ts_recent, drop it. 14263 */ 14264 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 14265 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 14266 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 14267 return (ret_val); 14268 } 14269 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 14270 &rack->r_ctl.challenge_ack_ts, 14271 &rack->r_ctl.challenge_ack_cnt)) { 14272 return (ret_val); 14273 } 14274 /* 14275 * If new data are received on a connection after the user processes 14276 * are gone, then RST the other end. 14277 */ 14278 if ((tp->t_flags & TF_CLOSED) && tlen && 14279 rack_check_data_after_close(m, tp, &tlen, th, so)) 14280 return (1); 14281 /* 14282 * If last ACK falls within this segment's sequence numbers, record 14283 * its timestamp. NOTE: 1) That the test incorporates suggestions 14284 * from the latest proposal of the tcplw@cray.com list (Braden 14285 * 1993/04/26). 2) That updating only on newer timestamps interferes 14286 * with our earlier PAWS tests, so this check should be solely 14287 * predicated on the sequence space of this segment. 3) That we 14288 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 14289 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 14290 * SEG.Len, This modified check allows us to overcome RFC1323's 14291 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 14292 * p.869. In such cases, we can still calculate the RTT correctly 14293 * when RCV.NXT == Last.ACK.Sent. 14294 */ 14295 if ((to->to_flags & TOF_TS) != 0 && 14296 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 14297 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 14298 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 14299 tp->ts_recent_age = tcp_ts_getticks(); 14300 tp->ts_recent = to->to_tsval; 14301 } 14302 /* 14303 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 14304 * is on (half-synchronized state), then queue data for later 14305 * processing; else drop segment and return. 14306 */ 14307 if ((thflags & TH_ACK) == 0) { 14308 if (tp->t_flags & TF_NEEDSYN) { 14309 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 14310 tiwin, thflags, nxt_pkt)); 14311 } else if (tp->t_flags & TF_ACKNOW) { 14312 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 14313 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 14314 return (ret_val); 14315 } else { 14316 ctf_do_drop(m, NULL); 14317 return (0); 14318 } 14319 } 14320 /* 14321 * Ack processing. 
14322 */ 14323 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 14324 return (ret_val); 14325 } 14326 if (sbavail(&so->so_snd)) { 14327 if (ctf_progress_timeout_check(tp, true)) { 14328 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 14329 tp, tick, PROGRESS_DROP, __LINE__); 14330 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 14331 return (1); 14332 } 14333 } 14334 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 14335 tiwin, thflags, nxt_pkt)); 14336 } 14337 14338 static void inline 14339 rack_clear_rate_sample(struct tcp_rack *rack) 14340 { 14341 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY; 14342 rack->r_ctl.rack_rs.rs_rtt_cnt = 0; 14343 rack->r_ctl.rack_rs.rs_rtt_tot = 0; 14344 } 14345 14346 static void 14347 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override) 14348 { 14349 uint64_t bw_est, rate_wanted; 14350 int chged = 0; 14351 uint32_t user_max, orig_min, orig_max; 14352 14353 #ifdef TCP_REQUEST_TRK 14354 if (rack->rc_hybrid_mode && 14355 (rack->r_ctl.rc_pace_max_segs != 0) && 14356 (rack_hybrid_allow_set_maxseg == 1) && 14357 (rack->r_ctl.rc_last_sft != NULL)) { 14358 rack->r_ctl.rc_last_sft->hybrid_flags &= ~TCP_HYBRID_PACING_SETMSS; 14359 return; 14360 } 14361 #endif 14362 orig_min = rack->r_ctl.rc_pace_min_segs; 14363 orig_max = rack->r_ctl.rc_pace_max_segs; 14364 user_max = ctf_fixed_maxseg(tp) * rack->rc_user_set_max_segs; 14365 if (ctf_fixed_maxseg(tp) != rack->r_ctl.rc_pace_min_segs) 14366 chged = 1; 14367 rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp); 14368 if (rack->use_fixed_rate || rack->rc_force_max_seg) { 14369 if (user_max != rack->r_ctl.rc_pace_max_segs) 14370 chged = 1; 14371 } 14372 if (rack->rc_force_max_seg) { 14373 rack->r_ctl.rc_pace_max_segs = user_max; 14374 } else if (rack->use_fixed_rate) { 14375 bw_est = rack_get_bw(rack); 14376 if ((rack->r_ctl.crte == NULL) || 14377 (bw_est != rack->r_ctl.crte->rate)) { 14378 rack->r_ctl.rc_pace_max_segs = user_max; 14379 } else { 14380 /* We are pacing right at the hardware rate */ 14381 uint32_t segsiz, pace_one; 14382 14383 if (rack_pace_one_seg || 14384 (rack->r_ctl.rc_user_set_min_segs == 1)) 14385 pace_one = 1; 14386 else 14387 pace_one = 0; 14388 segsiz = min(ctf_fixed_maxseg(tp), 14389 rack->r_ctl.rc_pace_min_segs); 14390 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size_w_divisor( 14391 tp, bw_est, segsiz, pace_one, 14392 rack->r_ctl.crte, NULL, rack->r_ctl.pace_len_divisor); 14393 } 14394 } else if (rack->rc_always_pace) { 14395 if (rack->r_ctl.gp_bw || 14396 rack->r_ctl.init_rate) { 14397 /* We have a rate of some sort set */ 14398 uint32_t orig; 14399 14400 bw_est = rack_get_bw(rack); 14401 orig = rack->r_ctl.rc_pace_max_segs; 14402 if (fill_override) 14403 rate_wanted = *fill_override; 14404 else 14405 rate_wanted = rack_get_gp_est(rack); 14406 if (rate_wanted) { 14407 /* We have something */ 14408 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, 14409 rate_wanted, 14410 ctf_fixed_maxseg(rack->rc_tp)); 14411 } else 14412 rack->r_ctl.rc_pace_max_segs = rack->r_ctl.rc_pace_min_segs; 14413 if (orig != rack->r_ctl.rc_pace_max_segs) 14414 chged = 1; 14415 } else if ((rack->r_ctl.gp_bw == 0) && 14416 (rack->r_ctl.rc_pace_max_segs == 0)) { 14417 /* 14418 * If we have nothing limit us to bursting 14419 * out IW sized pieces. 
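 *
 * (Illustrative figures; the exact IW computation is left to
 * rc_init_window()): with neither a goodput estimate (gp_bw) nor a
 * configured init_rate, the burst cap falls back to the connection's
 * initial window, so e.g. a 10-segment IW with a 1448-byte MSS would
 * hold rc_pace_max_segs near 14480 bytes until a real bandwidth
 * measurement arrives.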
14420 */ 14421 chged = 1; 14422 rack->r_ctl.rc_pace_max_segs = rc_init_window(rack); 14423 } 14424 } 14425 if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) { 14426 chged = 1; 14427 rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES; 14428 } 14429 if (chged) 14430 rack_log_type_pacing_sizes(tp, rack, orig_min, orig_max, line, 2); 14431 } 14432 14433 14434 static void 14435 rack_init_fsb_block(struct tcpcb *tp, struct tcp_rack *rack, int32_t flags) 14436 { 14437 #ifdef INET6 14438 struct ip6_hdr *ip6 = NULL; 14439 #endif 14440 #ifdef INET 14441 struct ip *ip = NULL; 14442 #endif 14443 struct udphdr *udp = NULL; 14444 14445 /* Ok lets fill in the fast block, it can only be used with no IP options! */ 14446 #ifdef INET6 14447 if (rack->r_is_v6) { 14448 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 14449 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 14450 if (tp->t_port) { 14451 rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr); 14452 udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr)); 14453 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 14454 udp->uh_dport = tp->t_port; 14455 rack->r_ctl.fsb.udp = udp; 14456 rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1); 14457 } else 14458 { 14459 rack->r_ctl.fsb.th = (struct tcphdr *)(ip6 + 1); 14460 rack->r_ctl.fsb.udp = NULL; 14461 } 14462 tcpip_fillheaders(rack->rc_inp, 14463 tp->t_port, 14464 ip6, rack->r_ctl.fsb.th); 14465 rack->r_ctl.fsb.hoplimit = in6_selecthlim(rack->rc_inp, NULL); 14466 } else 14467 #endif /* INET6 */ 14468 #ifdef INET 14469 { 14470 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr); 14471 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 14472 if (tp->t_port) { 14473 rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr); 14474 udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip)); 14475 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 14476 udp->uh_dport = tp->t_port; 14477 rack->r_ctl.fsb.udp = udp; 14478 rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1); 14479 } else 14480 { 14481 rack->r_ctl.fsb.udp = NULL; 14482 rack->r_ctl.fsb.th = (struct tcphdr *)(ip + 1); 14483 } 14484 tcpip_fillheaders(rack->rc_inp, 14485 tp->t_port, 14486 ip, rack->r_ctl.fsb.th); 14487 rack->r_ctl.fsb.hoplimit = tptoinpcb(tp)->inp_ip_ttl; 14488 } 14489 #endif 14490 rack->r_ctl.fsb.recwin = lmin(lmax(sbspace(&tptosocket(tp)->so_rcv), 0), 14491 (long)TCP_MAXWIN << tp->rcv_scale); 14492 rack->r_fsb_inited = 1; 14493 } 14494 14495 static int 14496 rack_init_fsb(struct tcpcb *tp, struct tcp_rack *rack) 14497 { 14498 /* 14499 * Allocate the larger of spaces V6 if available else just 14500 * V4 and include udphdr (overbook) 14501 */ 14502 #ifdef INET6 14503 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + sizeof(struct udphdr); 14504 #else 14505 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr) + sizeof(struct udphdr); 14506 #endif 14507 rack->r_ctl.fsb.tcp_ip_hdr = malloc(rack->r_ctl.fsb.tcp_ip_hdr_len, 14508 M_TCPFSB, M_NOWAIT|M_ZERO); 14509 if (rack->r_ctl.fsb.tcp_ip_hdr == NULL) { 14510 return (ENOMEM); 14511 } 14512 rack->r_fsb_inited = 0; 14513 return (0); 14514 } 14515 14516 static void 14517 rack_log_hystart_event(struct tcp_rack *rack, uint32_t high_seq, uint8_t mod) 14518 { 14519 /* 14520 * Types of logs (mod value) 14521 * 20 - Initial round setup 14522 * 21 - Rack declares a new round. 
14523 */ 14524 struct tcpcb *tp; 14525 14526 tp = rack->rc_tp; 14527 if (tcp_bblogging_on(tp)) { 14528 union tcp_log_stackspecific log; 14529 struct timeval tv; 14530 14531 memset(&log, 0, sizeof(log)); 14532 log.u_bbr.flex1 = rack->r_ctl.current_round; 14533 log.u_bbr.flex2 = rack->r_ctl.roundends; 14534 log.u_bbr.flex3 = high_seq; 14535 log.u_bbr.flex4 = tp->snd_max; 14536 log.u_bbr.flex8 = mod; 14537 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 14538 log.u_bbr.cur_del_rate = rack->rc_tp->t_sndbytes; 14539 log.u_bbr.delRate = rack->rc_tp->t_snd_rxt_bytes; 14540 TCP_LOG_EVENTP(tp, NULL, 14541 &tptosocket(tp)->so_rcv, 14542 &tptosocket(tp)->so_snd, 14543 TCP_HYSTART, 0, 14544 0, &log, false, &tv); 14545 } 14546 } 14547 14548 static void 14549 rack_deferred_init(struct tcpcb *tp, struct tcp_rack *rack) 14550 { 14551 rack->rack_deferred_inited = 1; 14552 rack->r_ctl.roundends = tp->snd_max; 14553 rack->r_ctl.rc_high_rwnd = tp->snd_wnd; 14554 rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 14555 } 14556 14557 static void 14558 rack_init_retransmit_value(struct tcp_rack *rack, int ctl) 14559 { 14560 /* Retransmit bit controls. 14561 * 14562 * This value selects one of three modes that 14563 * dictate how rack sizes its retransmissions. 14564 * Note this applies in *any* mode, i.e. pacing 14565 * on or off, DGP, fixed-rate pacing, or just 14566 * bursting rack. 14567 * 14568 * 1 - Use full sized retransmits i.e. limit 14569 * the size to whatever the pace_max_segments 14570 * size is. 14571 * 14572 * 2 - Use pacer min granularity as a guide to 14573 * the size combined with the current calculated 14574 * goodput b/w measurement. So for example if 14575 * the goodput is measured at 20Mbps we would 14576 * calculate 8125 (pacer minimum 250usec in 14577 * that b/w) and then round it up to the next 14578 * MSS i.e. for 1448 mss 6 MSS or 8688 bytes. 14579 * 14580 * 0 - The rack default 1 MSS (anything not 0/1/2 14581 * falls here too if we are setting via rack_init()).
14582 * 14583 */ 14584 if (ctl == 1) { 14585 rack->full_size_rxt = 1; 14586 rack->shape_rxt_to_pacing_min = 0; 14587 } else if (ctl == 2) { 14588 rack->full_size_rxt = 0; 14589 rack->shape_rxt_to_pacing_min = 1; 14590 } else { 14591 rack->full_size_rxt = 0; 14592 rack->shape_rxt_to_pacing_min = 0; 14593 } 14594 } 14595 14596 static void 14597 rack_log_chg_info(struct tcpcb *tp, struct tcp_rack *rack, uint8_t mod, 14598 uint32_t flex1, 14599 uint32_t flex2, 14600 uint32_t flex3) 14601 { 14602 if (tcp_bblogging_on(rack->rc_tp)) { 14603 union tcp_log_stackspecific log; 14604 struct timeval tv; 14605 14606 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 14607 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 14608 log.u_bbr.flex8 = mod; 14609 log.u_bbr.flex1 = flex1; 14610 log.u_bbr.flex2 = flex2; 14611 log.u_bbr.flex3 = flex3; 14612 tcp_log_event(tp, NULL, NULL, NULL, TCP_CHG_QUERY, 0, 14613 0, &log, false, NULL, __func__, __LINE__, &tv); 14614 } 14615 } 14616 14617 static int 14618 rack_chg_query(struct tcpcb *tp, struct tcp_query_resp *reqr) 14619 { 14620 struct tcp_rack *rack; 14621 struct rack_sendmap *rsm; 14622 int i; 14623 14624 14625 rack = (struct tcp_rack *)tp->t_fb_ptr; 14626 switch (reqr->req) { 14627 case TCP_QUERY_SENDMAP: 14628 if ((reqr->req_param == tp->snd_max) || 14629 (tp->snd_max == tp->snd_una)){ 14630 /* Unlikely */ 14631 return (0); 14632 } 14633 rsm = tqhash_find(rack->r_ctl.tqh, reqr->req_param); 14634 if (rsm == NULL) { 14635 /* Can't find that seq -- unlikely */ 14636 return (0); 14637 } 14638 reqr->sendmap_start = rsm->r_start; 14639 reqr->sendmap_end = rsm->r_end; 14640 reqr->sendmap_send_cnt = rsm->r_rtr_cnt; 14641 reqr->sendmap_fas = rsm->r_fas; 14642 if (reqr->sendmap_send_cnt > SNDMAP_NRTX) 14643 reqr->sendmap_send_cnt = SNDMAP_NRTX; 14644 for(i=0; i<reqr->sendmap_send_cnt; i++) 14645 reqr->sendmap_time[i] = rsm->r_tim_lastsent[i]; 14646 reqr->sendmap_ack_arrival = rsm->r_ack_arrival; 14647 reqr->sendmap_flags = rsm->r_flags & SNDMAP_MASK; 14648 reqr->sendmap_r_rtr_bytes = rsm->r_rtr_bytes; 14649 reqr->sendmap_dupacks = rsm->r_dupack; 14650 rack_log_chg_info(tp, rack, 1, 14651 rsm->r_start, 14652 rsm->r_end, 14653 rsm->r_flags); 14654 return(1); 14655 break; 14656 case TCP_QUERY_TIMERS_UP: 14657 if (rack->r_ctl.rc_hpts_flags == 0) { 14658 /* no timers up */ 14659 return (0); 14660 } 14661 reqr->timer_hpts_flags = rack->r_ctl.rc_hpts_flags; 14662 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 14663 reqr->timer_pacing_to = rack->r_ctl.rc_last_output_to; 14664 } 14665 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 14666 reqr->timer_timer_exp = rack->r_ctl.rc_timer_exp; 14667 } 14668 rack_log_chg_info(tp, rack, 2, 14669 rack->r_ctl.rc_hpts_flags, 14670 rack->r_ctl.rc_last_output_to, 14671 rack->r_ctl.rc_timer_exp); 14672 return (1); 14673 break; 14674 case TCP_QUERY_RACK_TIMES: 14675 /* Reordering items */ 14676 reqr->rack_num_dsacks = rack->r_ctl.num_dsack; 14677 reqr->rack_reorder_ts = rack->r_ctl.rc_reorder_ts; 14678 /* Timerstamps and timers */ 14679 reqr->rack_rxt_last_time = rack->r_ctl.rc_tlp_rxt_last_time; 14680 reqr->rack_min_rtt = rack->r_ctl.rc_rack_min_rtt; 14681 reqr->rack_rtt = rack->rc_rack_rtt; 14682 reqr->rack_tmit_time = rack->r_ctl.rc_rack_tmit_time; 14683 reqr->rack_srtt_measured = rack->rc_srtt_measure_made; 14684 /* PRR data */ 14685 reqr->rack_sacked = rack->r_ctl.rc_sacked; 14686 reqr->rack_holes_rxt = rack->r_ctl.rc_holes_rxt; 14687 reqr->rack_prr_delivered = rack->r_ctl.rc_prr_delivered; 14688 reqr->rack_prr_recovery_fs = 
rack->r_ctl.rc_prr_recovery_fs; 14689 reqr->rack_prr_sndcnt = rack->r_ctl.rc_prr_sndcnt; 14690 reqr->rack_prr_out = rack->r_ctl.rc_prr_out; 14691 /* TLP and persists info */ 14692 reqr->rack_tlp_out = rack->rc_tlp_in_progress; 14693 reqr->rack_tlp_cnt_out = rack->r_ctl.rc_tlp_cnt_out; 14694 if (rack->rc_in_persist) { 14695 reqr->rack_time_went_idle = rack->r_ctl.rc_went_idle_time; 14696 reqr->rack_in_persist = 1; 14697 } else { 14698 reqr->rack_time_went_idle = 0; 14699 reqr->rack_in_persist = 0; 14700 } 14701 if (rack->r_wanted_output) 14702 reqr->rack_wanted_output = 1; 14703 else 14704 reqr->rack_wanted_output = 0; 14705 return (1); 14706 break; 14707 default: 14708 return (-EINVAL); 14709 } 14710 } 14711 14712 static void 14713 rack_switch_failed(struct tcpcb *tp) 14714 { 14715 /* 14716 * This method gets called if a stack switch was 14717 * attempted and it failed. We are left 14718 * but our hpts timers were stopped and we 14719 * need to validate time units and t_flags2. 14720 */ 14721 struct tcp_rack *rack; 14722 struct timeval tv; 14723 uint32_t cts; 14724 uint32_t toval; 14725 struct hpts_diag diag; 14726 14727 rack = (struct tcp_rack *)tp->t_fb_ptr; 14728 tcp_change_time_units(tp, TCP_TMR_GRANULARITY_USEC); 14729 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 14730 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 14731 else 14732 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ; 14733 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) 14734 tp->t_flags2 |= TF2_MBUF_ACKCMP; 14735 if (tp->t_in_hpts > IHPTS_NONE) { 14736 /* Strange */ 14737 return; 14738 } 14739 cts = tcp_get_usecs(&tv); 14740 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 14741 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) { 14742 toval = rack->r_ctl.rc_last_output_to - cts; 14743 } else { 14744 /* one slot please */ 14745 toval = HPTS_TICKS_PER_SLOT; 14746 } 14747 } else if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 14748 if (TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { 14749 toval = rack->r_ctl.rc_timer_exp - cts; 14750 } else { 14751 /* one slot please */ 14752 toval = HPTS_TICKS_PER_SLOT; 14753 } 14754 } else 14755 toval = HPTS_TICKS_PER_SLOT; 14756 (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(toval), 14757 __LINE__, &diag); 14758 rack_log_hpts_diag(rack, cts, &diag, &tv); 14759 } 14760 14761 static int 14762 rack_init_outstanding(struct tcpcb *tp, struct tcp_rack *rack, uint32_t us_cts, void *ptr) 14763 { 14764 struct rack_sendmap *rsm, *ersm; 14765 int insret __diagused; 14766 /* 14767 * When initing outstanding, we must be quite careful 14768 * to not refer to tp->t_fb_ptr. This has the old rack 14769 * pointer in it, not the "new" one (when we are doing 14770 * a stack switch). 
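 *
 * In outline, the two paths below: when the previous stack exports no
 * tfb_chg_query method we synthesize a single sendmap entry covering
 * snd_una through snd_max; otherwise we walk its TCP_QUERY_SENDMAP
 * answers forward from snd_una and rebuild one rack_sendmap entry per
 * response, carrying over retransmit counts, send times and SACK
 * state as reported.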
	 */


	if (tp->t_fb->tfb_chg_query == NULL) {
		/* Create a send map for the current outstanding data */

		rsm = rack_alloc(rack);
		if (rsm == NULL) {
			uma_zfree(rack_pcb_zone, ptr);
			return (ENOMEM);
		}
		rsm->r_no_rtt_allowed = 1;
		rsm->r_tim_lastsent[0] = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
		rsm->r_rtr_cnt = 1;
		rsm->r_rtr_bytes = 0;
		if (tp->t_flags & TF_SENTFIN)
			rsm->r_flags |= RACK_HAS_FIN;
		rsm->r_end = tp->snd_max;
		if (tp->snd_una == tp->iss) {
			/* The data space is one beyond snd_una */
			rsm->r_flags |= RACK_HAS_SYN;
			rsm->r_start = tp->iss;
			rsm->r_end = rsm->r_start + (tp->snd_max - tp->snd_una);
		} else
			rsm->r_start = tp->snd_una;
		rsm->r_dupack = 0;
		if (rack->rc_inp->inp_socket->so_snd.sb_mb != NULL) {
			rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 0, &rsm->soff);
			if (rsm->m) {
				rsm->orig_m_len = rsm->m->m_len;
				rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
			} else {
				rsm->orig_m_len = 0;
				rsm->orig_t_space = 0;
			}
		} else {
			/*
			 * This can happen if we have a stand-alone FIN or
			 * SYN.
			 */
			rsm->m = NULL;
			rsm->orig_m_len = 0;
			rsm->orig_t_space = 0;
			rsm->soff = 0;
		}
#ifdef INVARIANTS
		if ((insret = tqhash_insert(rack->r_ctl.tqh, rsm)) != 0) {
			panic("Insert in tailq_hash fails ret:%d rack:%p rsm:%p",
			      insret, rack, rsm);
		}
#else
		(void)tqhash_insert(rack->r_ctl.tqh, rsm);
#endif
		TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
		rsm->r_in_tmap = 1;
	} else {
		/* We have a query mechanism, lets use it */
		struct tcp_query_resp qr;
		int i;
		tcp_seq at;

		at = tp->snd_una;
		while (at != tp->snd_max) {
			memset(&qr, 0, sizeof(qr));
			qr.req = TCP_QUERY_SENDMAP;
			qr.req_param = at;
			if ((*tp->t_fb->tfb_chg_query)(tp, &qr) == 0)
				break;
			/* Move forward */
			at = qr.sendmap_end;
			/* Now lets build the entry for this one */
			rsm = rack_alloc(rack);
			if (rsm == NULL) {
				uma_zfree(rack_pcb_zone, ptr);
				return (ENOMEM);
			}
			memset(rsm, 0, sizeof(struct rack_sendmap));
			/* Now configure the rsm and insert it */
			rsm->r_dupack = qr.sendmap_dupacks;
			rsm->r_start = qr.sendmap_start;
			rsm->r_end = qr.sendmap_end;
			if (qr.sendmap_fas)
				rsm->r_fas = qr.sendmap_fas;
			else
				rsm->r_fas = rsm->r_start - tp->snd_una;
			/*
			 * We have carefully aligned the bits
			 * so that all we have to do is copy over
			 * the bits with the mask.
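			 * (rack_chg_query() applies the same SNDMAP_MASK when
			 * exporting, so both sides agree on which flag bits
			 * are carried across the query.)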
			 */
			rsm->r_flags = qr.sendmap_flags & SNDMAP_MASK;
			rsm->r_rtr_bytes = qr.sendmap_r_rtr_bytes;
			rsm->r_rtr_cnt = qr.sendmap_send_cnt;
			rsm->r_ack_arrival = qr.sendmap_ack_arrival;
			for (i=0 ; i<rsm->r_rtr_cnt; i++)
				rsm->r_tim_lastsent[i] = qr.sendmap_time[i];
			rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd,
					   (rsm->r_start - tp->snd_una), &rsm->soff);
			if (rsm->m) {
				rsm->orig_m_len = rsm->m->m_len;
				rsm->orig_t_space = M_TRAILINGROOM(rsm->m);
			} else {
				rsm->orig_m_len = 0;
				rsm->orig_t_space = 0;
			}
#ifdef INVARIANTS
			if ((insret = tqhash_insert(rack->r_ctl.tqh, rsm)) != 0) {
				panic("Insert in tailq_hash fails ret:%d rack:%p rsm:%p",
				      insret, rack, rsm);
			}
#else
			(void)tqhash_insert(rack->r_ctl.tqh, rsm);
#endif
			if ((rsm->r_flags & RACK_ACKED) == 0) {
				TAILQ_FOREACH(ersm, &rack->r_ctl.rc_tmap, r_tnext) {
					if (ersm->r_tim_lastsent[(ersm->r_rtr_cnt-1)] >
					    rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]) {
						/*
						 * If the existing ersm was sent at
						 * a later time than the new one, then
						 * the new one should appear ahead of this
						 * ersm.
						 */
						rsm->r_in_tmap = 1;
						TAILQ_INSERT_BEFORE(ersm, rsm, r_tnext);
						break;
					}
				}
				if (rsm->r_in_tmap == 0) {
					/*
					 * Not found so shove it on the tail.
					 */
					TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
					rsm->r_in_tmap = 1;
				}
			} else {
				if ((rack->r_ctl.rc_sacklast == NULL) ||
				    (SEQ_GT(rsm->r_end, rack->r_ctl.rc_sacklast->r_end))) {
					rack->r_ctl.rc_sacklast = rsm;
				}
			}
			rack_log_chg_info(tp, rack, 3,
					  rsm->r_start,
					  rsm->r_end,
					  rsm->r_flags);
		}
	}
	return (0);
}

static void
rack_translate_clamp_value(struct tcp_rack *rack, uint32_t optval)
{
	/*
	 * P = percent bits
	 * F = fill cw bit -- Toggle fillcw if this bit is set.
	 * S = Segment bits
	 * M = set max segment bit
	 * U = Unused (unclaimed) bits
	 * C = If set to non-zero override the max number of clamps.
	 * L = Bit to indicate if clamped gets lower.
	 *
	 * CCCC CCCC UUUU UULF PPPP PPPP PPPP PPPP
	 *
	 * The lowest 16 bits hold the percentage in tenths of a
	 * percent, .1 - 6553.5% (i.e. 10.1% = 101, max 6553.5).
	 * The upper 16 bits hold the options.
	 * The F bit will turn fill-cw on if you are
	 * not pacing, it will turn it off if dgp is on.
	 * The L bit will change it so when clamped we get
	 * the min(gp, lt-bw) for dgp.
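	 *
	 * Illustrative example (hypothetical value): optval 0x050204e7
	 * decodes as percentage 0x04e7 = 1255 -> 125.5%, max clamps
	 * 0x05 = 5, option bits 0x02 -> L set (clamped gets lower)
	 * and F clear (fillcw untouched).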
14942 */ 14943 uint16_t per; 14944 14945 rack->r_ctl.saved_rxt_clamp_val = optval; 14946 per = optval & 0x0000ffff; 14947 rack->r_ctl.rxt_threshold = (uint64_t)(per & 0xffff); 14948 if (optval > 0) { 14949 uint16_t clamp_opt; 14950 14951 rack->excess_rxt_on = 1; 14952 clamp_opt = ((optval & 0xffff0000) >> 16); 14953 rack->r_ctl.clamp_options = clamp_opt & 0x00ff; 14954 if (clamp_opt & 0xff00) { 14955 /* A max clamps is also present */ 14956 rack->r_ctl.max_clamps = (clamp_opt >> 8); 14957 } else { 14958 /* No specified clamps means no limit */ 14959 rack->r_ctl.max_clamps = 0; 14960 } 14961 if (rack->r_ctl.clamp_options & 0x0002) { 14962 rack->r_clamped_gets_lower = 1; 14963 } else { 14964 rack->r_clamped_gets_lower = 0; 14965 } 14966 } else { 14967 /* Turn it off back to default */ 14968 rack->excess_rxt_on = 0; 14969 rack->r_clamped_gets_lower = 0; 14970 } 14971 14972 } 14973 14974 14975 static int32_t 14976 rack_init(struct tcpcb *tp, void **ptr) 14977 { 14978 struct inpcb *inp = tptoinpcb(tp); 14979 struct tcp_rack *rack = NULL; 14980 uint32_t iwin, snt, us_cts; 14981 int err, no_query; 14982 14983 tcp_hpts_init(tp); 14984 14985 /* 14986 * First are we the initial or are we a switched stack? 14987 * If we are initing via tcp_newtcppcb the ptr passed 14988 * will be tp->t_fb_ptr. If its a stack switch that 14989 * has a previous stack we can query it will be a local 14990 * var that will in the end be set into t_fb_ptr. 14991 */ 14992 if (ptr == &tp->t_fb_ptr) 14993 no_query = 1; 14994 else 14995 no_query = 0; 14996 *ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT); 14997 if (*ptr == NULL) { 14998 /* 14999 * We need to allocate memory but cant. The INP and INP_INFO 15000 * locks and they are recursive (happens during setup. So a 15001 * scheme to drop the locks fails :( 15002 * 15003 */ 15004 return(ENOMEM); 15005 } 15006 memset(*ptr, 0, sizeof(struct tcp_rack)); 15007 rack = (struct tcp_rack *)*ptr; 15008 rack->r_ctl.tqh = malloc(sizeof(struct tailq_hash), M_TCPFSB, M_NOWAIT); 15009 if (rack->r_ctl.tqh == NULL) { 15010 uma_zfree(rack_pcb_zone, rack); 15011 return(ENOMEM); 15012 } 15013 tqhash_init(rack->r_ctl.tqh); 15014 TAILQ_INIT(&rack->r_ctl.rc_free); 15015 TAILQ_INIT(&rack->r_ctl.rc_tmap); 15016 rack->rc_tp = tp; 15017 rack->rc_inp = inp; 15018 /* Set the flag */ 15019 rack->r_is_v6 = (inp->inp_vflag & INP_IPV6) != 0; 15020 /* Probably not needed but lets be sure */ 15021 rack_clear_rate_sample(rack); 15022 /* 15023 * Save off the default values, socket options will poke 15024 * at these if pacing is not on or we have not yet 15025 * reached where pacing is on (gp_ready/fixed enabled). 15026 * When they get set into the CC module (when gp_ready 15027 * is enabled or we enable fixed) then we will set these 15028 * values into the CC and place in here the old values 15029 * so we have a restoral. Then we will set the flag 15030 * rc_pacing_cc_set. That way whenever we turn off pacing 15031 * or switch off this stack, we will know to go restore 15032 * the saved values. 15033 * 15034 * We specifically put into the beta the ecn value for pacing. 
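	 * (rack_set_cc_pacing() is what pushes these into the CC module
	 * and rack_undo_cc_pacing() is what restores the saved values.)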
15035 */ 15036 rack->rc_new_rnd_needed = 1; 15037 rack->r_ctl.rc_split_limit = V_tcp_map_split_limit; 15038 /* We want abe like behavior as well */ 15039 rack->r_ctl.rc_saved_beta.newreno_flags |= CC_NEWRENO_BETA_ECN_ENABLED; 15040 rack->r_ctl.rc_reorder_fade = rack_reorder_fade; 15041 rack->rc_allow_data_af_clo = rack_ignore_data_after_close; 15042 rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh; 15043 if (rack_rxt_clamp_thresh) { 15044 rack_translate_clamp_value(rack, rack_rxt_clamp_thresh); 15045 rack->excess_rxt_on = 1; 15046 } 15047 if (rack_uses_full_dgp_in_rec) 15048 rack->r_ctl.full_dgp_in_rec = 1; 15049 if (rack_fill_cw_state) 15050 rack->rc_pace_to_cwnd = 1; 15051 if (rack_pacing_min_seg) 15052 rack->r_ctl.rc_user_set_min_segs = rack_pacing_min_seg; 15053 if (use_rack_rr) 15054 rack->use_rack_rr = 1; 15055 if (rack_dnd_default) { 15056 rack->rc_pace_dnd = 1; 15057 } 15058 if (V_tcp_delack_enabled) 15059 tp->t_delayed_ack = 1; 15060 else 15061 tp->t_delayed_ack = 0; 15062 #ifdef TCP_ACCOUNTING 15063 if (rack_tcp_accounting) { 15064 tp->t_flags2 |= TF2_TCP_ACCOUNTING; 15065 } 15066 #endif 15067 rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_per_upper_bound_ss; 15068 rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_per_upper_bound_ca; 15069 if (rack_enable_shared_cwnd) 15070 rack->rack_enable_scwnd = 1; 15071 rack->r_ctl.pace_len_divisor = rack_default_pacing_divisor; 15072 rack->rc_user_set_max_segs = rack_hptsi_segments; 15073 rack->rc_force_max_seg = 0; 15074 TAILQ_INIT(&rack->r_ctl.opt_list); 15075 rack->r_ctl.rc_saved_beta.beta = V_newreno_beta_ecn; 15076 rack->r_ctl.rc_saved_beta.beta_ecn = V_newreno_beta_ecn; 15077 if (rack_hibeta_setting) { 15078 rack->rack_hibeta = 1; 15079 if ((rack_hibeta_setting >= 50) && 15080 (rack_hibeta_setting <= 100)) { 15081 rack->r_ctl.rc_saved_beta.beta = rack_hibeta_setting; 15082 rack->r_ctl.saved_hibeta = rack_hibeta_setting; 15083 } 15084 } else { 15085 rack->r_ctl.saved_hibeta = 50; 15086 } 15087 rack->r_ctl.rc_reorder_shift = rack_reorder_thresh; 15088 rack->r_ctl.rc_pkt_delay = rack_pkt_delay; 15089 rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp; 15090 rack->r_ctl.rc_lowest_us_rtt = 0xffffffff; 15091 rack->r_ctl.rc_highest_us_rtt = 0; 15092 rack->r_ctl.bw_rate_cap = rack_bw_rate_cap; 15093 rack->r_ctl.timer_slop = TICKS_2_USEC(tcp_rexmit_slop); 15094 if (rack_use_cmp_acks) 15095 rack->r_use_cmp_ack = 1; 15096 if (rack_disable_prr) 15097 rack->rack_no_prr = 1; 15098 if (rack_gp_no_rec_chg) 15099 rack->rc_gp_no_rec_chg = 1; 15100 if (rack_pace_every_seg && tcp_can_enable_pacing()) { 15101 rack->rc_always_pace = 1; 15102 if (rack->rack_hibeta) 15103 rack_set_cc_pacing(rack); 15104 } else 15105 rack->rc_always_pace = 0; 15106 if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack) 15107 rack->r_mbuf_queue = 1; 15108 else 15109 rack->r_mbuf_queue = 0; 15110 rack_set_pace_segments(tp, rack, __LINE__, NULL); 15111 if (rack_limits_scwnd) 15112 rack->r_limit_scw = 1; 15113 else 15114 rack->r_limit_scw = 0; 15115 rack_init_retransmit_value(rack, rack_rxt_controls); 15116 rack->rc_labc = V_tcp_abc_l_var; 15117 rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method; 15118 rack->rack_tlp_threshold_use = rack_tlp_threshold_use; 15119 rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; 15120 rack->r_ctl.rc_min_to = rack_min_to; 15121 microuptime(&rack->r_ctl.act_rcv_time); 15122 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; 15123 rack->rc_init_win = rack_default_init_window; 15124 rack->r_ctl.rack_per_of_gp_ss = 
rack_per_of_gp_ss; 15125 if (rack_hw_up_only) 15126 rack->r_up_only = 1; 15127 if (rack_do_dyn_mul) { 15128 /* When dynamic adjustment is on CA needs to start at 100% */ 15129 rack->rc_gp_dyn_mul = 1; 15130 if (rack_do_dyn_mul >= 100) 15131 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul; 15132 } else 15133 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca; 15134 rack->r_ctl.rack_per_of_gp_rec = rack_per_of_gp_rec; 15135 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 15136 rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time); 15137 setup_time_filter_small(&rack->r_ctl.rc_gp_min_rtt, FILTER_TYPE_MIN, 15138 rack_probertt_filter_life); 15139 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 15140 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 15141 rack->r_ctl.rc_time_of_last_probertt = us_cts; 15142 rack->r_ctl.challenge_ack_ts = tcp_ts_getticks(); 15143 rack->r_ctl.rc_time_probertt_starts = 0; 15144 if (rack_dsack_std_based & 0x1) { 15145 /* Basically this means all rack timers are at least (srtt + 1/4 srtt) */ 15146 rack->rc_rack_tmr_std_based = 1; 15147 } 15148 if (rack_dsack_std_based & 0x2) { 15149 /* Basically this means rack timers are extended based on dsack by up to (2 * srtt) */ 15150 rack->rc_rack_use_dsack = 1; 15151 } 15152 /* We require at least one measurement, even if the sysctl is 0 */ 15153 if (rack_req_measurements) 15154 rack->r_ctl.req_measurements = rack_req_measurements; 15155 else 15156 rack->r_ctl.req_measurements = 1; 15157 if (rack_enable_hw_pacing) 15158 rack->rack_hdw_pace_ena = 1; 15159 if (rack_hw_rate_caps) 15160 rack->r_rack_hw_rate_caps = 1; 15161 #ifdef TCP_SAD_DETECTION 15162 rack->do_detection = 1; 15163 #else 15164 rack->do_detection = 0; 15165 #endif 15166 if (rack_non_rxt_use_cr) 15167 rack->rack_rec_nonrxt_use_cr = 1; 15168 /* Lets setup the fsb block */ 15169 err = rack_init_fsb(tp, rack); 15170 if (err) { 15171 uma_zfree(rack_pcb_zone, *ptr); 15172 *ptr = NULL; 15173 return (err); 15174 } 15175 if (rack_do_hystart) { 15176 tp->t_ccv.flags |= CCF_HYSTART_ALLOWED; 15177 if (rack_do_hystart > 1) 15178 tp->t_ccv.flags |= CCF_HYSTART_CAN_SH_CWND; 15179 if (rack_do_hystart > 2) 15180 tp->t_ccv.flags |= CCF_HYSTART_CONS_SSTH; 15181 } 15182 /* Log what we will do with queries */ 15183 rack_log_chg_info(tp, rack, 7, 15184 no_query, 0, 0); 15185 if (rack_def_profile) 15186 rack_set_profile(rack, rack_def_profile); 15187 /* Cancel the GP measurement in progress */ 15188 tp->t_flags &= ~TF_GPUTINPROG; 15189 if ((tp->t_state != TCPS_CLOSED) && 15190 (tp->t_state != TCPS_TIME_WAIT)) { 15191 /* 15192 * We are already open, we may 15193 * need to adjust a few things. 15194 */ 15195 if (SEQ_GT(tp->snd_max, tp->iss)) 15196 snt = tp->snd_max - tp->iss; 15197 else 15198 snt = 0; 15199 iwin = rc_init_window(rack); 15200 if ((snt < iwin) && 15201 (no_query == 1)) { 15202 /* We are not past the initial window 15203 * on the first init (i.e. a stack switch 15204 * has not yet occured) so we need to make 15205 * sure cwnd and ssthresh is correct. 15206 */ 15207 if (tp->snd_cwnd < iwin) 15208 tp->snd_cwnd = iwin; 15209 /* 15210 * If we are within the initial window 15211 * we want ssthresh to be unlimited. Setting 15212 * it to the rwnd (which the default stack does 15213 * and older racks) is not really a good idea 15214 * since we want to be in SS and grow both the 15215 * cwnd and the rwnd (via dynamic rwnd growth). 
If 15216 * we set it to the rwnd then as the peer grows its 15217 * rwnd we will be stuck in CA and never hit SS. 15218 * 15219 * Its far better to raise it up high (this takes the 15220 * risk that there as been a loss already, probably 15221 * we should have an indicator in all stacks of loss 15222 * but we don't), but considering the normal use this 15223 * is a risk worth taking. The consequences of not 15224 * hitting SS are far worse than going one more time 15225 * into it early on (before we have sent even a IW). 15226 * It is highly unlikely that we will have had a loss 15227 * before getting the IW out. 15228 */ 15229 tp->snd_ssthresh = 0xffffffff; 15230 } 15231 /* 15232 * Any init based on sequence numbers 15233 * should be done in the deferred init path 15234 * since we can be CLOSED and not have them 15235 * inited when rack_init() is called. We 15236 * are not closed so lets call it. 15237 */ 15238 rack_deferred_init(tp, rack); 15239 } 15240 if ((tp->t_state != TCPS_CLOSED) && 15241 (tp->t_state != TCPS_TIME_WAIT) && 15242 (no_query == 0) && 15243 (tp->snd_una != tp->snd_max)) { 15244 err = rack_init_outstanding(tp, rack, us_cts, *ptr); 15245 if (err) { 15246 *ptr = NULL; 15247 return(err); 15248 } 15249 } 15250 rack_stop_all_timers(tp, rack); 15251 /* Setup all the t_flags2 */ 15252 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 15253 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 15254 else 15255 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ; 15256 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) 15257 tp->t_flags2 |= TF2_MBUF_ACKCMP; 15258 /* 15259 * Timers in Rack are kept in microseconds so lets 15260 * convert any initial incoming variables 15261 * from ticks into usecs. Note that we 15262 * also change the values of t_srtt and t_rttvar, if 15263 * they are non-zero. They are kept with a 5 15264 * bit decimal so we have to carefully convert 15265 * these to get the full precision. 
15266 */ 15267 rack_convert_rtts(tp); 15268 rack_log_hystart_event(rack, rack->r_ctl.roundends, 20); 15269 if ((tptoinpcb(tp)->inp_flags & INP_DROPPED) == 0) { 15270 /* We do not start any timers on DROPPED connections */ 15271 if (tp->t_fb->tfb_chg_query == NULL) { 15272 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); 15273 } else { 15274 struct tcp_query_resp qr; 15275 int ret; 15276 15277 memset(&qr, 0, sizeof(qr)); 15278 15279 /* Get the misc time stamps and such for rack */ 15280 qr.req = TCP_QUERY_RACK_TIMES; 15281 ret = (*tp->t_fb->tfb_chg_query)(tp, &qr); 15282 if (ret == 1) { 15283 rack->r_ctl.rc_reorder_ts = qr.rack_reorder_ts; 15284 rack->r_ctl.num_dsack = qr.rack_num_dsacks; 15285 rack->r_ctl.rc_tlp_rxt_last_time = qr.rack_rxt_last_time; 15286 rack->r_ctl.rc_rack_min_rtt = qr.rack_min_rtt; 15287 rack->rc_rack_rtt = qr.rack_rtt; 15288 rack->r_ctl.rc_rack_tmit_time = qr.rack_tmit_time; 15289 rack->r_ctl.rc_sacked = qr.rack_sacked; 15290 rack->r_ctl.rc_holes_rxt = qr.rack_holes_rxt; 15291 rack->r_ctl.rc_prr_delivered = qr.rack_prr_delivered; 15292 rack->r_ctl.rc_prr_recovery_fs = qr.rack_prr_recovery_fs; 15293 rack->r_ctl.rc_prr_sndcnt = qr.rack_prr_sndcnt; 15294 rack->r_ctl.rc_prr_out = qr.rack_prr_out; 15295 if (qr.rack_tlp_out) { 15296 rack->rc_tlp_in_progress = 1; 15297 rack->r_ctl.rc_tlp_cnt_out = qr.rack_tlp_cnt_out; 15298 } else { 15299 rack->rc_tlp_in_progress = 0; 15300 rack->r_ctl.rc_tlp_cnt_out = 0; 15301 } 15302 if (qr.rack_srtt_measured) 15303 rack->rc_srtt_measure_made = 1; 15304 if (qr.rack_in_persist == 1) { 15305 rack->r_ctl.rc_went_idle_time = qr.rack_time_went_idle; 15306 #ifdef NETFLIX_SHARED_CWND 15307 if (rack->r_ctl.rc_scw) { 15308 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 15309 rack->rack_scwnd_is_idle = 1; 15310 } 15311 #endif 15312 rack->r_ctl.persist_lost_ends = 0; 15313 rack->probe_not_answered = 0; 15314 rack->forced_ack = 0; 15315 tp->t_rxtshift = 0; 15316 rack->rc_in_persist = 1; 15317 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 15318 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 15319 } 15320 if (qr.rack_wanted_output) 15321 rack->r_wanted_output = 1; 15322 rack_log_chg_info(tp, rack, 6, 15323 qr.rack_min_rtt, 15324 qr.rack_rtt, 15325 qr.rack_reorder_ts); 15326 } 15327 /* Get the old stack timers */ 15328 qr.req_param = 0; 15329 qr.req = TCP_QUERY_TIMERS_UP; 15330 ret = (*tp->t_fb->tfb_chg_query)(tp, &qr); 15331 if (ret) { 15332 /* 15333 * non-zero return means we have a timer('s) 15334 * to start. Zero means no timer (no keepalive 15335 * I suppose). 
15336 */ 15337 uint32_t tov = 0; 15338 15339 rack->r_ctl.rc_hpts_flags = qr.timer_hpts_flags; 15340 if (qr.timer_hpts_flags & PACE_PKT_OUTPUT) { 15341 rack->r_ctl.rc_last_output_to = qr.timer_pacing_to; 15342 if (TSTMP_GT(qr.timer_pacing_to, us_cts)) 15343 tov = qr.timer_pacing_to - us_cts; 15344 else 15345 tov = HPTS_TICKS_PER_SLOT; 15346 } 15347 if (qr.timer_hpts_flags & PACE_TMR_MASK) { 15348 rack->r_ctl.rc_timer_exp = qr.timer_timer_exp; 15349 if (tov == 0) { 15350 if (TSTMP_GT(qr.timer_timer_exp, us_cts)) 15351 tov = qr.timer_timer_exp - us_cts; 15352 else 15353 tov = HPTS_TICKS_PER_SLOT; 15354 } 15355 } 15356 rack_log_chg_info(tp, rack, 4, 15357 rack->r_ctl.rc_hpts_flags, 15358 rack->r_ctl.rc_last_output_to, 15359 rack->r_ctl.rc_timer_exp); 15360 if (tov) { 15361 struct hpts_diag diag; 15362 15363 (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(tov), 15364 __LINE__, &diag); 15365 rack_log_hpts_diag(rack, us_cts, &diag, &rack->r_ctl.act_rcv_time); 15366 } 15367 } 15368 } 15369 rack_log_rtt_shrinks(rack, us_cts, tp->t_rxtcur, 15370 __LINE__, RACK_RTTS_INIT); 15371 } 15372 return (0); 15373 } 15374 15375 static int 15376 rack_handoff_ok(struct tcpcb *tp) 15377 { 15378 if ((tp->t_state == TCPS_CLOSED) || 15379 (tp->t_state == TCPS_LISTEN)) { 15380 /* Sure no problem though it may not stick */ 15381 return (0); 15382 } 15383 if ((tp->t_state == TCPS_SYN_SENT) || 15384 (tp->t_state == TCPS_SYN_RECEIVED)) { 15385 /* 15386 * We really don't know if you support sack, 15387 * you have to get to ESTAB or beyond to tell. 15388 */ 15389 return (EAGAIN); 15390 } 15391 if ((tp->t_flags & TF_SENTFIN) && ((tp->snd_max - tp->snd_una) > 1)) { 15392 /* 15393 * Rack will only send a FIN after all data is acknowledged. 15394 * So in this case we have more data outstanding. We can't 15395 * switch stacks until either all data and only the FIN 15396 * is left (in which case rack_init() now knows how 15397 * to deal with that) <or> all is acknowledged and we 15398 * are only left with incoming data, though why you 15399 * would want to switch to rack after all data is acknowledged 15400 * I have no idea (rrs)! 15401 */ 15402 return (EAGAIN); 15403 } 15404 if ((tp->t_flags & TF_SACK_PERMIT) || rack_sack_not_required){ 15405 return (0); 15406 } 15407 /* 15408 * If we reach here we don't do SACK on this connection so we can 15409 * never do rack. 
15410 */ 15411 return (EINVAL); 15412 } 15413 15414 static void 15415 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) 15416 { 15417 15418 if (tp->t_fb_ptr) { 15419 uint32_t cnt_free = 0; 15420 struct tcp_rack *rack; 15421 struct rack_sendmap *rsm; 15422 15423 tcp_handle_orphaned_packets(tp); 15424 tp->t_flags &= ~TF_FORCEDATA; 15425 rack = (struct tcp_rack *)tp->t_fb_ptr; 15426 rack_log_pacing_delay_calc(rack, 15427 0, 15428 0, 15429 0, 15430 rack_get_gp_est(rack), /* delRate */ 15431 rack_get_lt_bw(rack), /* rttProp */ 15432 20, __LINE__, NULL, 0); 15433 #ifdef NETFLIX_SHARED_CWND 15434 if (rack->r_ctl.rc_scw) { 15435 uint32_t limit; 15436 15437 if (rack->r_limit_scw) 15438 limit = max(1, rack->r_ctl.rc_lowest_us_rtt); 15439 else 15440 limit = 0; 15441 tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw, 15442 rack->r_ctl.rc_scw_index, 15443 limit); 15444 rack->r_ctl.rc_scw = NULL; 15445 } 15446 #endif 15447 if (rack->r_ctl.fsb.tcp_ip_hdr) { 15448 free(rack->r_ctl.fsb.tcp_ip_hdr, M_TCPFSB); 15449 rack->r_ctl.fsb.tcp_ip_hdr = NULL; 15450 rack->r_ctl.fsb.th = NULL; 15451 } 15452 if (rack->rc_always_pace) { 15453 tcp_decrement_paced_conn(); 15454 rack_undo_cc_pacing(rack); 15455 rack->rc_always_pace = 0; 15456 } 15457 /* Clean up any options if they were not applied */ 15458 while (!TAILQ_EMPTY(&rack->r_ctl.opt_list)) { 15459 struct deferred_opt_list *dol; 15460 15461 dol = TAILQ_FIRST(&rack->r_ctl.opt_list); 15462 TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next); 15463 free(dol, M_TCPDO); 15464 } 15465 /* rack does not use force data but other stacks may clear it */ 15466 if (rack->r_ctl.crte != NULL) { 15467 tcp_rel_pacing_rate(rack->r_ctl.crte, tp); 15468 rack->rack_hdrw_pacing = 0; 15469 rack->r_ctl.crte = NULL; 15470 } 15471 #ifdef TCP_BLACKBOX 15472 tcp_log_flowend(tp); 15473 #endif 15474 /* 15475 * Lets take a different approach to purging just 15476 * get each one and free it like a cum-ack would and 15477 * not use a foreach loop. 
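		 * (Re-fetching tqhash_min() after every removal means we
		 * never walk a structure while we are tearing it down.)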
15478 */ 15479 rsm = tqhash_min(rack->r_ctl.tqh); 15480 while (rsm) { 15481 tqhash_remove(rack->r_ctl.tqh, rsm, REMOVE_TYPE_CUMACK); 15482 rack->r_ctl.rc_num_maps_alloced--; 15483 uma_zfree(rack_zone, rsm); 15484 rsm = tqhash_min(rack->r_ctl.tqh); 15485 } 15486 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 15487 while (rsm) { 15488 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 15489 rack->r_ctl.rc_num_maps_alloced--; 15490 rack->rc_free_cnt--; 15491 cnt_free++; 15492 uma_zfree(rack_zone, rsm); 15493 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 15494 } 15495 if ((rack->r_ctl.rc_num_maps_alloced > 0) && 15496 (tcp_bblogging_on(tp))) { 15497 union tcp_log_stackspecific log; 15498 struct timeval tv; 15499 15500 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 15501 log.u_bbr.flex8 = 10; 15502 log.u_bbr.flex1 = rack->r_ctl.rc_num_maps_alloced; 15503 log.u_bbr.flex2 = rack->rc_free_cnt; 15504 log.u_bbr.flex3 = cnt_free; 15505 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 15506 rsm = tqhash_min(rack->r_ctl.tqh); 15507 log.u_bbr.delRate = (uint64_t)rsm; 15508 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 15509 log.u_bbr.cur_del_rate = (uint64_t)rsm; 15510 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 15511 log.u_bbr.pkt_epoch = __LINE__; 15512 (void)tcp_log_event(tp, NULL, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, 15513 0, &log, false, NULL, NULL, 0, &tv); 15514 } 15515 KASSERT((rack->r_ctl.rc_num_maps_alloced == 0), 15516 ("rack:%p num_aloc:%u after freeing all?", 15517 rack, 15518 rack->r_ctl.rc_num_maps_alloced)); 15519 rack->rc_free_cnt = 0; 15520 free(rack->r_ctl.tqh, M_TCPFSB); 15521 rack->r_ctl.tqh = NULL; 15522 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 15523 tp->t_fb_ptr = NULL; 15524 } 15525 /* Make sure snd_nxt is correctly set */ 15526 tp->snd_nxt = tp->snd_max; 15527 } 15528 15529 static void 15530 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) 15531 { 15532 if ((rack->r_state == TCPS_CLOSED) && (tp->t_state != TCPS_CLOSED)) { 15533 rack->r_is_v6 = (tptoinpcb(tp)->inp_vflag & INP_IPV6) != 0; 15534 } 15535 switch (tp->t_state) { 15536 case TCPS_SYN_SENT: 15537 rack->r_state = TCPS_SYN_SENT; 15538 rack->r_substate = rack_do_syn_sent; 15539 break; 15540 case TCPS_SYN_RECEIVED: 15541 rack->r_state = TCPS_SYN_RECEIVED; 15542 rack->r_substate = rack_do_syn_recv; 15543 break; 15544 case TCPS_ESTABLISHED: 15545 rack_set_pace_segments(tp, rack, __LINE__, NULL); 15546 rack->r_state = TCPS_ESTABLISHED; 15547 rack->r_substate = rack_do_established; 15548 break; 15549 case TCPS_CLOSE_WAIT: 15550 rack->r_state = TCPS_CLOSE_WAIT; 15551 rack->r_substate = rack_do_close_wait; 15552 break; 15553 case TCPS_FIN_WAIT_1: 15554 rack_set_pace_segments(tp, rack, __LINE__, NULL); 15555 rack->r_state = TCPS_FIN_WAIT_1; 15556 rack->r_substate = rack_do_fin_wait_1; 15557 break; 15558 case TCPS_CLOSING: 15559 rack_set_pace_segments(tp, rack, __LINE__, NULL); 15560 rack->r_state = TCPS_CLOSING; 15561 rack->r_substate = rack_do_closing; 15562 break; 15563 case TCPS_LAST_ACK: 15564 rack_set_pace_segments(tp, rack, __LINE__, NULL); 15565 rack->r_state = TCPS_LAST_ACK; 15566 rack->r_substate = rack_do_lastack; 15567 break; 15568 case TCPS_FIN_WAIT_2: 15569 rack->r_state = TCPS_FIN_WAIT_2; 15570 rack->r_substate = rack_do_fin_wait_2; 15571 break; 15572 case TCPS_LISTEN: 15573 case TCPS_CLOSED: 15574 case TCPS_TIME_WAIT: 15575 default: 15576 break; 15577 }; 15578 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) 15579 rack->rc_tp->t_flags2 |= TF2_MBUF_ACKCMP; 15580 15581 } 15582 15583 static void 15584 
rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) 15585 { 15586 /* 15587 * We received an ack, and then did not 15588 * call send or were bounced out due to the 15589 * hpts was running. Now a timer is up as well, is 15590 * it the right timer? 15591 */ 15592 struct rack_sendmap *rsm; 15593 int tmr_up; 15594 15595 tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 15596 if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT)) 15597 return; 15598 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 15599 if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) && 15600 (tmr_up == PACE_TMR_RXT)) { 15601 /* Should be an RXT */ 15602 return; 15603 } 15604 if (rsm == NULL) { 15605 /* Nothing outstanding? */ 15606 if (tp->t_flags & TF_DELACK) { 15607 if (tmr_up == PACE_TMR_DELACK) 15608 /* We are supposed to have delayed ack up and we do */ 15609 return; 15610 } else if (sbavail(&tptosocket(tp)->so_snd) && (tmr_up == PACE_TMR_RXT)) { 15611 /* 15612 * if we hit enobufs then we would expect the possibility 15613 * of nothing outstanding and the RXT up (and the hptsi timer). 15614 */ 15615 return; 15616 } else if (((V_tcp_always_keepalive || 15617 rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 15618 (tp->t_state <= TCPS_CLOSING)) && 15619 (tmr_up == PACE_TMR_KEEP) && 15620 (tp->snd_max == tp->snd_una)) { 15621 /* We should have keep alive up and we do */ 15622 return; 15623 } 15624 } 15625 if (SEQ_GT(tp->snd_max, tp->snd_una) && 15626 ((tmr_up == PACE_TMR_TLP) || 15627 (tmr_up == PACE_TMR_RACK) || 15628 (tmr_up == PACE_TMR_RXT))) { 15629 /* 15630 * Either a Rack, TLP or RXT is fine if we 15631 * have outstanding data. 15632 */ 15633 return; 15634 } else if (tmr_up == PACE_TMR_DELACK) { 15635 /* 15636 * If the delayed ack was going to go off 15637 * before the rtx/tlp/rack timer were going to 15638 * expire, then that would be the timer in control. 15639 * Note we don't check the time here trusting the 15640 * code is correct. 15641 */ 15642 return; 15643 } 15644 /* 15645 * Ok the timer originally started is not what we want now. 15646 * We will force the hpts to be stopped if any, and restart 15647 * with the slot set to what was in the saved slot. 
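	 * (If a pacing timer was pending and has not yet expired, the
	 * unexpired portion is first credited back as "early" time below.)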
15648 */ 15649 if (tcp_in_hpts(rack->rc_tp)) { 15650 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 15651 uint32_t us_cts; 15652 15653 us_cts = tcp_get_usecs(NULL); 15654 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 15655 rack->r_early = 1; 15656 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); 15657 } 15658 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 15659 } 15660 tcp_hpts_remove(rack->rc_tp); 15661 } 15662 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 15663 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); 15664 } 15665 15666 15667 static void 15668 rack_do_win_updates(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tiwin, uint32_t seq, uint32_t ack, uint32_t cts) 15669 { 15670 if ((SEQ_LT(tp->snd_wl1, seq) || 15671 (tp->snd_wl1 == seq && (SEQ_LT(tp->snd_wl2, ack) || 15672 (tp->snd_wl2 == ack && tiwin > tp->snd_wnd))))) { 15673 /* keep track of pure window updates */ 15674 if ((tp->snd_wl2 == ack) && (tiwin > tp->snd_wnd)) 15675 KMOD_TCPSTAT_INC(tcps_rcvwinupd); 15676 tp->snd_wnd = tiwin; 15677 rack_validate_fo_sendwin_up(tp, rack); 15678 tp->snd_wl1 = seq; 15679 tp->snd_wl2 = ack; 15680 if (tp->snd_wnd > tp->max_sndwnd) 15681 tp->max_sndwnd = tp->snd_wnd; 15682 rack->r_wanted_output = 1; 15683 } else if ((tp->snd_wl2 == ack) && (tiwin < tp->snd_wnd)) { 15684 tp->snd_wnd = tiwin; 15685 rack_validate_fo_sendwin_up(tp, rack); 15686 tp->snd_wl1 = seq; 15687 tp->snd_wl2 = ack; 15688 } else { 15689 /* Not a valid win update */ 15690 return; 15691 } 15692 if (tp->snd_wnd > tp->max_sndwnd) 15693 tp->max_sndwnd = tp->snd_wnd; 15694 /* Do we exit persists? */ 15695 if ((rack->rc_in_persist != 0) && 15696 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 15697 rack->r_ctl.rc_pace_min_segs))) { 15698 rack_exit_persist(tp, rack, cts); 15699 } 15700 /* Do we enter persists? */ 15701 if ((rack->rc_in_persist == 0) && 15702 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 15703 TCPS_HAVEESTABLISHED(tp->t_state) && 15704 ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) && 15705 sbavail(&tptosocket(tp)->so_snd) && 15706 (sbavail(&tptosocket(tp)->so_snd) > tp->snd_wnd)) { 15707 /* 15708 * Here the rwnd is less than 15709 * the pacing size, we are established, 15710 * nothing is outstanding, and there is 15711 * data to send. Enter persists. 
15712 */ 15713 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, ack); 15714 } 15715 } 15716 15717 static void 15718 rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent *ae, int ackval, uint32_t high_seq) 15719 { 15720 15721 if (tcp_bblogging_on(rack->rc_tp)) { 15722 struct inpcb *inp = tptoinpcb(tp); 15723 union tcp_log_stackspecific log; 15724 struct timeval ltv; 15725 char tcp_hdr_buf[60]; 15726 struct tcphdr *th; 15727 struct timespec ts; 15728 uint32_t orig_snd_una; 15729 uint8_t xx = 0; 15730 15731 #ifdef TCP_REQUEST_TRK 15732 struct tcp_sendfile_track *tcp_req; 15733 15734 if (SEQ_GT(ae->ack, tp->snd_una)) { 15735 tcp_req = tcp_req_find_req_for_seq(tp, (ae->ack-1)); 15736 } else { 15737 tcp_req = tcp_req_find_req_for_seq(tp, ae->ack); 15738 } 15739 #endif 15740 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 15741 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 15742 if (rack->rack_no_prr == 0) 15743 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 15744 else 15745 log.u_bbr.flex1 = 0; 15746 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 15747 log.u_bbr.use_lt_bw <<= 1; 15748 log.u_bbr.use_lt_bw |= rack->r_might_revert; 15749 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; 15750 log.u_bbr.bbr_state = rack->rc_free_cnt; 15751 log.u_bbr.inflight = ctf_flight_size(tp, rack->r_ctl.rc_sacked); 15752 log.u_bbr.pkts_out = tp->t_maxseg; 15753 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 15754 log.u_bbr.flex7 = 1; 15755 log.u_bbr.lost = ae->flags; 15756 log.u_bbr.cwnd_gain = ackval; 15757 log.u_bbr.pacing_gain = 0x2; 15758 if (ae->flags & TSTMP_HDWR) { 15759 /* Record the hardware timestamp if present */ 15760 log.u_bbr.flex3 = M_TSTMP; 15761 ts.tv_sec = ae->timestamp / 1000000000; 15762 ts.tv_nsec = ae->timestamp % 1000000000; 15763 ltv.tv_sec = ts.tv_sec; 15764 ltv.tv_usec = ts.tv_nsec / 1000; 15765 log.u_bbr.lt_epoch = tcp_tv_to_usectick(<v); 15766 } else if (ae->flags & TSTMP_LRO) { 15767 /* Record the LRO the arrival timestamp */ 15768 log.u_bbr.flex3 = M_TSTMP_LRO; 15769 ts.tv_sec = ae->timestamp / 1000000000; 15770 ts.tv_nsec = ae->timestamp % 1000000000; 15771 ltv.tv_sec = ts.tv_sec; 15772 ltv.tv_usec = ts.tv_nsec / 1000; 15773 log.u_bbr.flex5 = tcp_tv_to_usectick(<v); 15774 } 15775 log.u_bbr.timeStamp = tcp_get_usecs(<v); 15776 /* Log the rcv time */ 15777 log.u_bbr.delRate = ae->timestamp; 15778 #ifdef TCP_REQUEST_TRK 15779 log.u_bbr.applimited = tp->t_tcpreq_closed; 15780 log.u_bbr.applimited <<= 8; 15781 log.u_bbr.applimited |= tp->t_tcpreq_open; 15782 log.u_bbr.applimited <<= 8; 15783 log.u_bbr.applimited |= tp->t_tcpreq_req; 15784 if (tcp_req) { 15785 /* Copy out any client req info */ 15786 /* seconds */ 15787 log.u_bbr.pkt_epoch = (tcp_req->localtime / HPTS_USEC_IN_SEC); 15788 /* useconds */ 15789 log.u_bbr.delivered = (tcp_req->localtime % HPTS_USEC_IN_SEC); 15790 log.u_bbr.rttProp = tcp_req->timestamp; 15791 log.u_bbr.cur_del_rate = tcp_req->start; 15792 if (tcp_req->flags & TCP_TRK_TRACK_FLG_OPEN) { 15793 log.u_bbr.flex8 |= 1; 15794 } else { 15795 log.u_bbr.flex8 |= 2; 15796 log.u_bbr.bw_inuse = tcp_req->end; 15797 } 15798 log.u_bbr.flex6 = tcp_req->start_seq; 15799 if (tcp_req->flags & TCP_TRK_TRACK_FLG_COMP) { 15800 log.u_bbr.flex8 |= 4; 15801 log.u_bbr.epoch = tcp_req->end_seq; 15802 } 15803 } 15804 #endif 15805 memset(tcp_hdr_buf, 0, sizeof(tcp_hdr_buf)); 15806 th = (struct tcphdr *)tcp_hdr_buf; 15807 th->th_seq = ae->seq; 15808 th->th_ack = ae->ack; 15809 th->th_win = ae->win; 15810 /* Now fill in the ports */ 15811 th->th_sport = inp->inp_fport; 
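		/* (Inbound segment: the peer's foreign port is the source,
		 * our local port the destination.) */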
15812 th->th_dport = inp->inp_lport; 15813 tcp_set_flags(th, ae->flags); 15814 /* Now do we have a timestamp option? */ 15815 if (ae->flags & HAS_TSTMP) { 15816 u_char *cp; 15817 uint32_t val; 15818 15819 th->th_off = ((sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA) >> 2); 15820 cp = (u_char *)(th + 1); 15821 *cp = TCPOPT_NOP; 15822 cp++; 15823 *cp = TCPOPT_NOP; 15824 cp++; 15825 *cp = TCPOPT_TIMESTAMP; 15826 cp++; 15827 *cp = TCPOLEN_TIMESTAMP; 15828 cp++; 15829 val = htonl(ae->ts_value); 15830 bcopy((char *)&val, 15831 (char *)cp, sizeof(uint32_t)); 15832 val = htonl(ae->ts_echo); 15833 bcopy((char *)&val, 15834 (char *)(cp + 4), sizeof(uint32_t)); 15835 } else 15836 th->th_off = (sizeof(struct tcphdr) >> 2); 15837 15838 /* 15839 * For sane logging we need to play a little trick. 15840 * If the ack were fully processed we would have moved 15841 * snd_una to high_seq, but since compressed acks are 15842 * processed in two phases, at this point (logging) snd_una 15843 * won't be advanced. So we would see multiple acks showing 15844 * the advancement. We can prevent that by "pretending" that 15845 * snd_una was advanced and then un-advancing it so that the 15846 * logging code has the right value for tlb_snd_una. 15847 */ 15848 if (tp->snd_una != high_seq) { 15849 orig_snd_una = tp->snd_una; 15850 tp->snd_una = high_seq; 15851 xx = 1; 15852 } else 15853 xx = 0; 15854 TCP_LOG_EVENTP(tp, th, 15855 &tptosocket(tp)->so_rcv, 15856 &tptosocket(tp)->so_snd, TCP_LOG_IN, 0, 15857 0, &log, true, <v); 15858 if (xx) { 15859 tp->snd_una = orig_snd_una; 15860 } 15861 } 15862 15863 } 15864 15865 static void 15866 rack_handle_probe_response(struct tcp_rack *rack, uint32_t tiwin, uint32_t us_cts) 15867 { 15868 uint32_t us_rtt; 15869 /* 15870 * A persist or keep-alive was forced out, update our 15871 * min rtt time. Note now worry about lost responses. 15872 * When a subsequent keep-alive or persist times out 15873 * and forced_ack is still on, then the last probe 15874 * was not responded to. In such cases we have a 15875 * sysctl that controls the behavior. Either we apply 15876 * the rtt but with reduced confidence (0). Or we just 15877 * plain don't apply the rtt estimate. Having data flow 15878 * will clear the probe_not_answered flag i.e. cum-ack 15879 * move forward <or> exiting and reentering persists. 15880 */ 15881 15882 rack->forced_ack = 0; 15883 rack->rc_tp->t_rxtshift = 0; 15884 if ((rack->rc_in_persist && 15885 (tiwin == rack->rc_tp->snd_wnd)) || 15886 (rack->rc_in_persist == 0)) { 15887 /* 15888 * In persists only apply the RTT update if this is 15889 * a response to our window probe. And that 15890 * means the rwnd sent must match the current 15891 * snd_wnd. If it does not, then we got a 15892 * window update ack instead. For keepalive 15893 * we allow the answer no matter what the window. 15894 * 15895 * Note that if the probe_not_answered is set then 15896 * the forced_ack_ts is the oldest one i.e. the first 15897 * probe sent that might have been lost. This assures 15898 * us that if we do calculate an RTT it is longer not 15899 * some short thing. 
15900 */ 15901 if (rack->rc_in_persist) 15902 counter_u64_add(rack_persists_acks, 1); 15903 us_rtt = us_cts - rack->r_ctl.forced_ack_ts; 15904 if (us_rtt == 0) 15905 us_rtt = 1; 15906 if (rack->probe_not_answered == 0) { 15907 rack_apply_updated_usrtt(rack, us_rtt, us_cts); 15908 tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 3, NULL, 1); 15909 } else { 15910 /* We have a retransmitted probe here too */ 15911 if (rack_apply_rtt_with_reduced_conf) { 15912 rack_apply_updated_usrtt(rack, us_rtt, us_cts); 15913 tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 0, NULL, 1); 15914 } 15915 } 15916 } 15917 } 15918 15919 static int 15920 rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mbuf *m, int nxt_pkt, struct timeval *tv) 15921 { 15922 /* 15923 * Handle a "special" compressed ack mbuf. Each incoming 15924 * ack has only four possible dispositions: 15925 * 15926 * A) It moves the cum-ack forward 15927 * B) It is behind the cum-ack. 15928 * C) It is a window-update ack. 15929 * D) It is a dup-ack. 15930 * 15931 * Note that we can have between 1 -> TCP_COMP_ACK_ENTRIES 15932 * in the incoming mbuf. We also need to still pay attention 15933 * to nxt_pkt since there may be another packet after this 15934 * one. 15935 */ 15936 #ifdef TCP_ACCOUNTING 15937 uint64_t ts_val; 15938 uint64_t rdstc; 15939 #endif 15940 int segsiz; 15941 struct timespec ts; 15942 struct tcp_rack *rack; 15943 struct tcp_ackent *ae; 15944 uint32_t tiwin, ms_cts, cts, acked, acked_amount, high_seq, win_seq, the_win, win_upd_ack; 15945 int cnt, i, did_out, ourfinisacked = 0; 15946 struct tcpopt to_holder, *to = NULL; 15947 #ifdef TCP_ACCOUNTING 15948 int win_up_req = 0; 15949 #endif 15950 int nsegs = 0; 15951 int under_pacing = 0; 15952 int recovery = 0; 15953 #ifdef TCP_ACCOUNTING 15954 sched_pin(); 15955 #endif 15956 rack = (struct tcp_rack *)tp->t_fb_ptr; 15957 if (rack->gp_ready && 15958 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) 15959 under_pacing = 1; 15960 15961 if (rack->r_state != tp->t_state) 15962 rack_set_state(tp, rack); 15963 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 15964 (tp->t_flags & TF_GPUTINPROG)) { 15965 /* 15966 * We have a goodput in progress 15967 * and we have entered a late state. 15968 * Do we have enough data in the sb 15969 * to handle the GPUT request? 15970 */ 15971 uint32_t bytes; 15972 15973 bytes = tp->gput_ack - tp->gput_seq; 15974 if (SEQ_GT(tp->gput_seq, tp->snd_una)) 15975 bytes += tp->gput_seq - tp->snd_una; 15976 if (bytes > sbavail(&tptosocket(tp)->so_snd)) { 15977 /* 15978 * There are not enough bytes in the socket 15979 * buffer that have been sent to cover this 15980 * measurement. Cancel it. 
15981 */ 15982 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 15983 rack->r_ctl.rc_gp_srtt /*flex1*/, 15984 tp->gput_seq, 15985 0, 0, 18, __LINE__, NULL, 0); 15986 tp->t_flags &= ~TF_GPUTINPROG; 15987 } 15988 } 15989 to = &to_holder; 15990 to->to_flags = 0; 15991 KASSERT((m->m_len >= sizeof(struct tcp_ackent)), 15992 ("tp:%p m_cmpack:%p with invalid len:%u", tp, m, m->m_len)); 15993 cnt = m->m_len / sizeof(struct tcp_ackent); 15994 counter_u64_add(rack_multi_single_eq, cnt); 15995 high_seq = tp->snd_una; 15996 the_win = tp->snd_wnd; 15997 win_seq = tp->snd_wl1; 15998 win_upd_ack = tp->snd_wl2; 15999 cts = tcp_tv_to_usectick(tv); 16000 ms_cts = tcp_tv_to_mssectick(tv); 16001 rack->r_ctl.rc_rcvtime = cts; 16002 segsiz = ctf_fixed_maxseg(tp); 16003 if ((rack->rc_gp_dyn_mul) && 16004 (rack->use_fixed_rate == 0) && 16005 (rack->rc_always_pace)) { 16006 /* Check in on probertt */ 16007 rack_check_probe_rtt(rack, cts); 16008 } 16009 for (i = 0; i < cnt; i++) { 16010 #ifdef TCP_ACCOUNTING 16011 ts_val = get_cyclecount(); 16012 #endif 16013 rack_clear_rate_sample(rack); 16014 ae = ((mtod(m, struct tcp_ackent *)) + i); 16015 if (ae->flags & TH_FIN) 16016 rack_log_pacing_delay_calc(rack, 16017 0, 16018 0, 16019 0, 16020 rack_get_gp_est(rack), /* delRate */ 16021 rack_get_lt_bw(rack), /* rttProp */ 16022 20, __LINE__, NULL, 0); 16023 /* Setup the window */ 16024 tiwin = ae->win << tp->snd_scale; 16025 if (tiwin > rack->r_ctl.rc_high_rwnd) 16026 rack->r_ctl.rc_high_rwnd = tiwin; 16027 /* figure out the type of ack */ 16028 if (SEQ_LT(ae->ack, high_seq)) { 16029 /* Case B*/ 16030 ae->ack_val_set = ACK_BEHIND; 16031 } else if (SEQ_GT(ae->ack, high_seq)) { 16032 /* Case A */ 16033 ae->ack_val_set = ACK_CUMACK; 16034 } else if ((tiwin == the_win) && (rack->rc_in_persist == 0)){ 16035 /* Case D */ 16036 ae->ack_val_set = ACK_DUPACK; 16037 } else { 16038 /* Case C */ 16039 ae->ack_val_set = ACK_RWND; 16040 } 16041 if (rack->sack_attack_disable > 0) { 16042 rack_log_type_bbrsnd(rack, 0, 0, cts, tv, __LINE__); 16043 rack->r_ctl.ack_during_sd++; 16044 } 16045 rack_log_input_packet(tp, rack, ae, ae->ack_val_set, high_seq); 16046 /* Validate timestamp */ 16047 if (ae->flags & HAS_TSTMP) { 16048 /* Setup for a timestamp */ 16049 to->to_flags = TOF_TS; 16050 ae->ts_echo -= tp->ts_offset; 16051 to->to_tsecr = ae->ts_echo; 16052 to->to_tsval = ae->ts_value; 16053 /* 16054 * If echoed timestamp is later than the current time, fall back to 16055 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 16056 * were used when this connection was established. 
16057 */ 16058 if (TSTMP_GT(ae->ts_echo, ms_cts)) 16059 to->to_tsecr = 0; 16060 if (tp->ts_recent && 16061 TSTMP_LT(ae->ts_value, tp->ts_recent)) { 16062 if (ctf_ts_check_ac(tp, (ae->flags & 0xff))) { 16063 #ifdef TCP_ACCOUNTING 16064 rdstc = get_cyclecount(); 16065 if (rdstc > ts_val) { 16066 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16067 tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val); 16068 } 16069 } 16070 #endif 16071 continue; 16072 } 16073 } 16074 if (SEQ_LEQ(ae->seq, tp->last_ack_sent) && 16075 SEQ_LEQ(tp->last_ack_sent, ae->seq)) { 16076 tp->ts_recent_age = tcp_ts_getticks(); 16077 tp->ts_recent = ae->ts_value; 16078 } 16079 } else { 16080 /* Setup for a no options */ 16081 to->to_flags = 0; 16082 } 16083 /* Update the rcv time and perform idle reduction possibly */ 16084 if (tp->t_idle_reduce && 16085 (tp->snd_max == tp->snd_una) && 16086 (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { 16087 counter_u64_add(rack_input_idle_reduces, 1); 16088 rack_cc_after_idle(rack, tp); 16089 } 16090 tp->t_rcvtime = ticks; 16091 /* Now what about ECN of a chain of pure ACKs? */ 16092 if (tcp_ecn_input_segment(tp, ae->flags, 0, 16093 tcp_packets_this_ack(tp, ae->ack), 16094 ae->codepoint)) 16095 rack_cong_signal(tp, CC_ECN, ae->ack, __LINE__); 16096 #ifdef TCP_ACCOUNTING 16097 /* Count for the specific type of ack in */ 16098 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16099 tp->tcp_cnt_counters[ae->ack_val_set]++; 16100 } 16101 #endif 16102 /* 16103 * Note how we could move up these in the determination 16104 * above, but we don't so that way the timestamp checks (and ECN) 16105 * is done first before we do any processing on the ACK. 16106 * The non-compressed path through the code has this 16107 * weakness (noted by @jtl) that it actually does some 16108 * processing before verifying the timestamp information. 16109 * We don't take that path here which is why we set 16110 * the ack_val_set first, do the timestamp and ecn 16111 * processing, and then look at what we have setup. 16112 */ 16113 if (ae->ack_val_set == ACK_BEHIND) { 16114 /* 16115 * Case B flag reordering, if window is not closed 16116 * or it could be a keep-alive or persists 16117 */ 16118 if (SEQ_LT(ae->ack, tp->snd_una) && (sbspace(&so->so_rcv) > segsiz)) { 16119 rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 16120 if (rack->r_ctl.rc_reorder_ts == 0) 16121 rack->r_ctl.rc_reorder_ts = 1; 16122 } 16123 } else if (ae->ack_val_set == ACK_DUPACK) { 16124 /* Case D */ 16125 rack_strike_dupack(rack); 16126 } else if (ae->ack_val_set == ACK_RWND) { 16127 /* Case C */ 16128 if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) { 16129 ts.tv_sec = ae->timestamp / 1000000000; 16130 ts.tv_nsec = ae->timestamp % 1000000000; 16131 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; 16132 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; 16133 } else { 16134 rack->r_ctl.act_rcv_time = *tv; 16135 } 16136 if (rack->forced_ack) { 16137 rack_handle_probe_response(rack, tiwin, 16138 tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time)); 16139 } 16140 #ifdef TCP_ACCOUNTING 16141 win_up_req = 1; 16142 #endif 16143 win_upd_ack = ae->ack; 16144 win_seq = ae->seq; 16145 the_win = tiwin; 16146 rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts); 16147 } else { 16148 /* Case A */ 16149 if (SEQ_GT(ae->ack, tp->snd_max)) { 16150 /* 16151 * We just send an ack since the incoming 16152 * ack is beyond the largest seq we sent. 
				 */
				if ((tp->t_flags & TF_ACKNOW) == 0) {
					ctf_ack_war_checks(tp, &rack->r_ctl.challenge_ack_ts, &rack->r_ctl.challenge_ack_cnt);
					if (tp->t_flags & TF_ACKNOW)
						rack->r_wanted_output = 1;
				}
			} else {
				nsegs++;
				/* If the window changed setup to update */
				if (tiwin != tp->snd_wnd) {
					win_upd_ack = ae->ack;
					win_seq = ae->seq;
					the_win = tiwin;
					rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts);
				}
#ifdef TCP_ACCOUNTING
				/* Account for the acks */
				if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
					tp->tcp_cnt_counters[CNT_OF_ACKS_IN] += (((ae->ack - high_seq) + segsiz - 1) / segsiz);
				}
#endif
				high_seq = ae->ack;
				if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp))
					rack_log_hystart_event(rack, high_seq, 8);
				/* Setup our act_rcv_time */
				if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) {
					ts.tv_sec = ae->timestamp / 1000000000;
					ts.tv_nsec = ae->timestamp % 1000000000;
					rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
					rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
				} else {
					rack->r_ctl.act_rcv_time = *tv;
				}
				rack_process_to_cumack(tp, rack, ae->ack, cts, to,
				    tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time));
#ifdef TCP_REQUEST_TRK
				rack_req_check_for_comp(rack, high_seq);
#endif
				if (rack->rc_dsack_round_seen) {
					/* Is the dsack round over? */
					if (SEQ_GEQ(ae->ack, rack->r_ctl.dsack_round_end)) {
						/* Yes it is */
						rack->rc_dsack_round_seen = 0;
						rack_log_dsack_event(rack, 3, __LINE__, 0, 0);
					}
				}
			}
		}
		/* And lets be sure to commit the rtt measurements for this ack */
		tcp_rack_xmit_timer_commit(rack, tp);
#ifdef TCP_ACCOUNTING
		rdstc = get_cyclecount();
		if (rdstc > ts_val) {
			if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
				tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val);
				if (ae->ack_val_set == ACK_CUMACK)
					tp->tcp_proc_time[CYC_HANDLE_MAP] += (rdstc - ts_val);
			}
		}
#endif
	}
#ifdef TCP_ACCOUNTING
	ts_val = get_cyclecount();
#endif
	/* Tend to any collapsed window */
	if (SEQ_GT(tp->snd_max, high_seq) && (tp->snd_wnd < (tp->snd_max - high_seq))) {
		/* The peer collapsed the window */
		rack_collapsed_window(rack, (tp->snd_max - high_seq), high_seq, __LINE__);
	} else if (rack->rc_has_collapsed)
		rack_un_collapse_window(rack, __LINE__);
	if ((rack->r_collapse_point_valid) &&
	    (SEQ_GT(high_seq, rack->r_ctl.high_collapse_point)))
		rack->r_collapse_point_valid = 0;
	acked_amount = acked = (high_seq - tp->snd_una);
	if (acked) {
		/*
		 * The draft (v3) calls for us to use SEQ_GEQ, but that
		 * causes issues when we are just going app limited. Lets
		 * instead use SEQ_GT <or> where its equal but more data
		 * is outstanding.
		 *
		 * Also make sure we are on the last ack of a series. We
		 * have to have all the ack's processed in queue to know
		 * if there is something left outstanding.
16237 * 16238 */ 16239 if (SEQ_GEQ(high_seq, rack->r_ctl.roundends) && 16240 (rack->rc_new_rnd_needed == 0) && 16241 (nxt_pkt == 0)) { 16242 rack_log_hystart_event(rack, high_seq, 21); 16243 rack->r_ctl.current_round++; 16244 /* Force the next send to setup the next round */ 16245 rack->rc_new_rnd_needed = 1; 16246 if (CC_ALGO(tp)->newround != NULL) { 16247 CC_ALGO(tp)->newround(&tp->t_ccv, rack->r_ctl.current_round); 16248 } 16249 } 16250 /* 16251 * Clear the probe not answered flag 16252 * since cum-ack moved forward. 16253 */ 16254 rack->probe_not_answered = 0; 16255 if (rack->sack_attack_disable == 0) 16256 rack_do_decay(rack); 16257 if (acked >= segsiz) { 16258 /* 16259 * You only get credit for 16260 * MSS and greater (and you get extra 16261 * credit for larger cum-ack moves). 16262 */ 16263 int ac; 16264 16265 ac = acked / segsiz; 16266 rack->r_ctl.ack_count += ac; 16267 counter_u64_add(rack_ack_total, ac); 16268 } 16269 if (rack->r_ctl.ack_count > 0xfff00000) { 16270 /* 16271 * reduce the number to keep us under 16272 * a uint32_t. 16273 */ 16274 rack->r_ctl.ack_count /= 2; 16275 rack->r_ctl.sack_count /= 2; 16276 } 16277 if (tp->t_flags & TF_NEEDSYN) { 16278 /* 16279 * T/TCP: Connection was half-synchronized, and our SYN has 16280 * been ACK'd (so connection is now fully synchronized). Go 16281 * to non-starred state, increment snd_una for ACK of SYN, 16282 * and check if we can do window scaling. 16283 */ 16284 tp->t_flags &= ~TF_NEEDSYN; 16285 tp->snd_una++; 16286 acked_amount = acked = (high_seq - tp->snd_una); 16287 } 16288 if (acked > sbavail(&so->so_snd)) 16289 acked_amount = sbavail(&so->so_snd); 16290 #ifdef TCP_SAD_DETECTION 16291 /* 16292 * We only care on a cum-ack move if we are in a sack-disabled 16293 * state. We have already added in to the ack_count, and we never 16294 * would disable on a cum-ack move, so we only care to do the 16295 * detection if it may "undo" it, i.e. we were in disabled already. 16296 */ 16297 if (rack->sack_attack_disable) 16298 rack_do_detection(tp, rack, acked_amount, segsiz); 16299 #endif 16300 if (IN_FASTRECOVERY(tp->t_flags) && 16301 (rack->rack_no_prr == 0)) 16302 rack_update_prr(tp, rack, acked_amount, high_seq); 16303 if (IN_RECOVERY(tp->t_flags)) { 16304 if (SEQ_LT(high_seq, tp->snd_recover) && 16305 (SEQ_LT(high_seq, tp->snd_max))) { 16306 tcp_rack_partialack(tp); 16307 } else { 16308 rack_post_recovery(tp, high_seq); 16309 recovery = 1; 16310 } 16311 } 16312 /* Handle the rack-log-ack part (sendmap) */ 16313 if ((sbused(&so->so_snd) == 0) && 16314 (acked > acked_amount) && 16315 (tp->t_state >= TCPS_FIN_WAIT_1) && 16316 (tp->t_flags & TF_SENTFIN)) { 16317 /* 16318 * We must be sure our fin 16319 * was sent and acked (we can be 16320 * in FIN_WAIT_1 without having 16321 * sent the fin). 16322 */ 16323 ourfinisacked = 1; 16324 /* 16325 * Lets make sure snd_una is updated 16326 * since most likely acked_amount = 0 (it 16327 * should be). 16328 */ 16329 tp->snd_una = high_seq; 16330 } 16331 /* Did we make a RTO error? 
*/ 16332 if ((tp->t_flags & TF_PREVVALID) && 16333 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) { 16334 tp->t_flags &= ~TF_PREVVALID; 16335 if (tp->t_rxtshift == 1 && 16336 (int)(ticks - tp->t_badrxtwin) < 0) 16337 rack_cong_signal(tp, CC_RTO_ERR, high_seq, __LINE__); 16338 } 16339 /* Handle the data in the socket buffer */ 16340 KMOD_TCPSTAT_ADD(tcps_rcvackpack, 1); 16341 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 16342 if (acked_amount > 0) { 16343 struct mbuf *mfree; 16344 16345 rack_ack_received(tp, rack, high_seq, nsegs, CC_ACK, recovery); 16346 SOCKBUF_LOCK(&so->so_snd); 16347 mfree = sbcut_locked(&so->so_snd, acked_amount); 16348 tp->snd_una = high_seq; 16349 /* Note we want to hold the sb lock through the sendmap adjust */ 16350 rack_adjust_sendmap_head(rack, &so->so_snd); 16351 /* Wake up the socket if we have room to write more */ 16352 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); 16353 sowwakeup_locked(so); 16354 if ((recovery == 1) && 16355 (rack->excess_rxt_on) && 16356 (rack->r_cwnd_was_clamped == 0)) { 16357 do_rack_excess_rxt(tp, rack); 16358 } else if (rack->r_cwnd_was_clamped) 16359 do_rack_check_for_unclamp(tp, rack); 16360 m_freem(mfree); 16361 } 16362 /* update progress */ 16363 tp->t_acktime = ticks; 16364 rack_log_progress_event(rack, tp, tp->t_acktime, 16365 PROGRESS_UPDATE, __LINE__); 16366 /* Clear out shifts and such */ 16367 tp->t_rxtshift = 0; 16368 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 16369 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 16370 rack->rc_tlp_in_progress = 0; 16371 rack->r_ctl.rc_tlp_cnt_out = 0; 16372 /* Send recover and snd_nxt must be dragged along */ 16373 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 16374 tp->snd_recover = tp->snd_una; 16375 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 16376 tp->snd_nxt = tp->snd_max; 16377 /* 16378 * If the RXT timer is running we want to 16379 * stop it, so we can restart a TLP (or new RXT). 16380 */ 16381 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 16382 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 16383 tp->snd_wl2 = high_seq; 16384 tp->t_dupacks = 0; 16385 if (under_pacing && 16386 (rack->use_fixed_rate == 0) && 16387 (rack->in_probe_rtt == 0) && 16388 rack->rc_gp_dyn_mul && 16389 rack->rc_always_pace) { 16390 /* Check if we are dragging bottom */ 16391 rack_check_bottom_drag(tp, rack, so); 16392 } 16393 if (tp->snd_una == tp->snd_max) { 16394 tp->t_flags &= ~TF_PREVVALID; 16395 rack->r_ctl.retran_during_recovery = 0; 16396 rack->rc_suspicious = 0; 16397 rack->r_ctl.dsack_byte_cnt = 0; 16398 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 16399 if (rack->r_ctl.rc_went_idle_time == 0) 16400 rack->r_ctl.rc_went_idle_time = 1; 16401 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 16402 if (sbavail(&tptosocket(tp)->so_snd) == 0) 16403 tp->t_acktime = 0; 16404 /* Set so we might enter persists... */ 16405 rack->r_wanted_output = 1; 16406 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 16407 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 16408 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 16409 (sbavail(&so->so_snd) == 0) && 16410 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 16411 /* 16412 * The socket was gone and the 16413 * peer sent data (not now in the past), time to 16414 * reset him. 
16415 */ 16416 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 16417 /* tcp_close will kill the inp pre-log the Reset */ 16418 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 16419 #ifdef TCP_ACCOUNTING 16420 rdstc = get_cyclecount(); 16421 if (rdstc > ts_val) { 16422 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16423 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 16424 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 16425 } 16426 } 16427 #endif 16428 m_freem(m); 16429 tp = tcp_close(tp); 16430 if (tp == NULL) { 16431 #ifdef TCP_ACCOUNTING 16432 sched_unpin(); 16433 #endif 16434 return (1); 16435 } 16436 /* 16437 * We would normally do drop-with-reset which would 16438 * send back a reset. We can't since we don't have 16439 * all the needed bits. Instead lets arrange for 16440 * a call to tcp_output(). That way since we 16441 * are in the closed state we will generate a reset. 16442 * 16443 * Note if tcp_accounting is on we don't unpin since 16444 * we do that after the goto label. 16445 */ 16446 goto send_out_a_rst; 16447 } 16448 if ((sbused(&so->so_snd) == 0) && 16449 (tp->t_state >= TCPS_FIN_WAIT_1) && 16450 (tp->t_flags & TF_SENTFIN)) { 16451 /* 16452 * If we can't receive any more data, then closing user can 16453 * proceed. Starting the timer is contrary to the 16454 * specification, but if we don't get a FIN we'll hang 16455 * forever. 16456 * 16457 */ 16458 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 16459 soisdisconnected(so); 16460 tcp_timer_activate(tp, TT_2MSL, 16461 (tcp_fast_finwait2_recycle ? 16462 tcp_finwait2_timeout : 16463 TP_MAXIDLE(tp))); 16464 } 16465 if (ourfinisacked == 0) { 16466 /* 16467 * We don't change to fin-wait-2 if we have our fin acked 16468 * which means we are probably in TCPS_CLOSING. 16469 */ 16470 tcp_state_change(tp, TCPS_FIN_WAIT_2); 16471 } 16472 } 16473 } 16474 /* Wake up the socket if we have room to write more */ 16475 if (sbavail(&so->so_snd)) { 16476 rack->r_wanted_output = 1; 16477 if (ctf_progress_timeout_check(tp, true)) { 16478 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 16479 tp, tick, PROGRESS_DROP, __LINE__); 16480 /* 16481 * We cheat here and don't send a RST, we should send one 16482 * when the pacer drops the connection. 
16483 */ 16484 #ifdef TCP_ACCOUNTING 16485 rdstc = get_cyclecount(); 16486 if (rdstc > ts_val) { 16487 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16488 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 16489 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 16490 } 16491 } 16492 sched_unpin(); 16493 #endif 16494 (void)tcp_drop(tp, ETIMEDOUT); 16495 m_freem(m); 16496 return (1); 16497 } 16498 } 16499 if (ourfinisacked) { 16500 switch(tp->t_state) { 16501 case TCPS_CLOSING: 16502 #ifdef TCP_ACCOUNTING 16503 rdstc = get_cyclecount(); 16504 if (rdstc > ts_val) { 16505 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16506 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 16507 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 16508 } 16509 } 16510 sched_unpin(); 16511 #endif 16512 tcp_twstart(tp); 16513 m_freem(m); 16514 return (1); 16515 break; 16516 case TCPS_LAST_ACK: 16517 #ifdef TCP_ACCOUNTING 16518 rdstc = get_cyclecount(); 16519 if (rdstc > ts_val) { 16520 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16521 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 16522 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 16523 } 16524 } 16525 sched_unpin(); 16526 #endif 16527 tp = tcp_close(tp); 16528 ctf_do_drop(m, tp); 16529 return (1); 16530 break; 16531 case TCPS_FIN_WAIT_1: 16532 #ifdef TCP_ACCOUNTING 16533 rdstc = get_cyclecount(); 16534 if (rdstc > ts_val) { 16535 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16536 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 16537 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 16538 } 16539 } 16540 #endif 16541 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 16542 soisdisconnected(so); 16543 tcp_timer_activate(tp, TT_2MSL, 16544 (tcp_fast_finwait2_recycle ? 16545 tcp_finwait2_timeout : 16546 TP_MAXIDLE(tp))); 16547 } 16548 tcp_state_change(tp, TCPS_FIN_WAIT_2); 16549 break; 16550 default: 16551 break; 16552 } 16553 } 16554 if (rack->r_fast_output) { 16555 /* 16556 * We re doing fast output.. can we expand that? 
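		 * (The cum-ack freed up acked_amount bytes of window/socket
		 *  buffer space, so rack_gain_for_fastoutput() below may be
		 *  able to grow the prepared fast-send block by that much.)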
16557 */ 16558 rack_gain_for_fastoutput(rack, tp, so, acked_amount); 16559 } 16560 #ifdef TCP_ACCOUNTING 16561 rdstc = get_cyclecount(); 16562 if (rdstc > ts_val) { 16563 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16564 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 16565 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 16566 } 16567 } 16568 16569 } else if (win_up_req) { 16570 rdstc = get_cyclecount(); 16571 if (rdstc > ts_val) { 16572 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16573 tp->tcp_proc_time[ACK_RWND] += (rdstc - ts_val); 16574 } 16575 } 16576 #endif 16577 } 16578 /* Now is there a next packet, if so we are done */ 16579 m_freem(m); 16580 did_out = 0; 16581 if (nxt_pkt) { 16582 #ifdef TCP_ACCOUNTING 16583 sched_unpin(); 16584 #endif 16585 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 5, nsegs); 16586 return (0); 16587 } 16588 rack_handle_might_revert(tp, rack); 16589 ctf_calc_rwin(so, tp); 16590 if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) { 16591 send_out_a_rst: 16592 if (tcp_output(tp) < 0) { 16593 #ifdef TCP_ACCOUNTING 16594 sched_unpin(); 16595 #endif 16596 return (1); 16597 } 16598 did_out = 1; 16599 } 16600 if (tp->t_flags2 & TF2_HPTS_CALLS) 16601 tp->t_flags2 &= ~TF2_HPTS_CALLS; 16602 rack_free_trim(rack); 16603 #ifdef TCP_ACCOUNTING 16604 sched_unpin(); 16605 #endif 16606 rack_timer_audit(tp, rack, &so->so_snd); 16607 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 6, nsegs); 16608 return (0); 16609 } 16610 16611 #define TCP_LRO_TS_OPTION \ 16612 ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \ 16613 (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP) 16614 16615 static int 16616 rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, 16617 int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, int32_t nxt_pkt, 16618 struct timeval *tv) 16619 { 16620 struct inpcb *inp = tptoinpcb(tp); 16621 struct socket *so = tptosocket(tp); 16622 #ifdef TCP_ACCOUNTING 16623 uint64_t ts_val; 16624 #endif 16625 int32_t thflags, retval, did_out = 0; 16626 int32_t way_out = 0; 16627 /* 16628 * cts - is the current time from tv (caller gets ts) in microseconds. 16629 * ms_cts - is the current time from tv in milliseconds. 16630 * us_cts - is the time that LRO or hardware actually got the packet in microseconds. 16631 */ 16632 uint32_t cts, us_cts, ms_cts; 16633 uint32_t tiwin, high_seq; 16634 struct timespec ts; 16635 struct tcpopt to; 16636 struct tcp_rack *rack; 16637 struct rack_sendmap *rsm; 16638 int32_t prev_state = 0; 16639 int no_output = 0; 16640 int slot_remaining = 0; 16641 #ifdef TCP_ACCOUNTING 16642 int ack_val_set = 0xf; 16643 #endif 16644 int nsegs; 16645 16646 NET_EPOCH_ASSERT(); 16647 INP_WLOCK_ASSERT(inp); 16648 16649 /* 16650 * tv passed from common code is from either M_TSTMP_LRO or 16651 * tcp_get_usecs() if no LRO m_pkthdr timestamp is present. 16652 */ 16653 rack = (struct tcp_rack *)tp->t_fb_ptr; 16654 if (rack->rack_deferred_inited == 0) { 16655 /* 16656 * If we are the connecting socket we will 16657 * hit rack_init() when no sequence numbers 16658 * are setup. This makes it so we must defer 16659 * some initialization. Call that now. 16660 */ 16661 rack_deferred_init(tp, rack); 16662 } 16663 /* 16664 * Check to see if we need to skip any output plans. This 16665 * can happen in the non-LRO path where we are pacing and 16666 * must process the ack coming in but need to defer sending 16667 * anything becase a pacing timer is running. 
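	 * Roughly: if we are pacing and a timer is already queued to fire
	 * at rc_last_output_to, we prefer to just absorb this ack (when it
	 * carries at most a timestamp option, or when TF2_DONT_SACK_QUEUE
	 * says we never want to be woken for it) and let the pending pacing
	 * timer drive the next send.  The exception is when less than the
	 * pacer's minimum granularity remains; then we allow output to run
	 * "early".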
16668 */ 16669 us_cts = tcp_tv_to_usectick(tv); 16670 if (m->m_flags & M_ACKCMP) { 16671 /* 16672 * All compressed ack's are ack's by definition so 16673 * remove any ack required flag and then do the processing. 16674 */ 16675 rack->rc_ack_required = 0; 16676 return (rack_do_compressed_ack_processing(tp, so, m, nxt_pkt, tv)); 16677 } 16678 thflags = tcp_get_flags(th); 16679 if ((rack->rc_always_pace == 1) && 16680 (rack->rc_ack_can_sendout_data == 0) && 16681 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 16682 (TSTMP_LT(us_cts, rack->r_ctl.rc_last_output_to))) { 16683 /* 16684 * Ok conditions are right for queuing the packets 16685 * but we do have to check the flags in the inp, it 16686 * could be, if a sack is present, we want to be awoken and 16687 * so should process the packets. 16688 */ 16689 slot_remaining = rack->r_ctl.rc_last_output_to - us_cts; 16690 if (rack->rc_tp->t_flags2 & TF2_DONT_SACK_QUEUE) { 16691 no_output = 1; 16692 } else { 16693 /* 16694 * If there is no options, or just a 16695 * timestamp option, we will want to queue 16696 * the packets. This is the same that LRO does 16697 * and will need to change with accurate ECN. 16698 */ 16699 uint32_t *ts_ptr; 16700 int optlen; 16701 16702 optlen = (th->th_off << 2) - sizeof(struct tcphdr); 16703 ts_ptr = (uint32_t *)(th + 1); 16704 if ((optlen == 0) || 16705 ((optlen == TCPOLEN_TSTAMP_APPA) && 16706 (*ts_ptr == TCP_LRO_TS_OPTION))) 16707 no_output = 1; 16708 } 16709 if ((no_output == 1) && (slot_remaining < tcp_min_hptsi_time)) { 16710 /* 16711 * It is unrealistic to think we can pace in less than 16712 * the minimum granularity of the pacer (def:250usec). So 16713 * if we have less than that time remaining we should go 16714 * ahead and allow output to be "early". We will attempt to 16715 * make up for it in any pacing time we try to apply on 16716 * the outbound packet. 16717 */ 16718 no_output = 0; 16719 } 16720 } 16721 /* 16722 * If there is a RST or FIN lets dump out the bw 16723 * with a FIN the connection may go on but we 16724 * may not. 16725 */ 16726 if ((thflags & TH_FIN) || (thflags & TH_RST)) 16727 rack_log_pacing_delay_calc(rack, 16728 rack->r_ctl.gp_bw, 16729 0, 16730 0, 16731 rack_get_gp_est(rack), /* delRate */ 16732 rack_get_lt_bw(rack), /* rttProp */ 16733 20, __LINE__, NULL, 0); 16734 if (m->m_flags & M_ACKCMP) { 16735 panic("Impossible reach m has ackcmp? m:%p tp:%p", m, tp); 16736 } 16737 cts = tcp_tv_to_usectick(tv); 16738 ms_cts = tcp_tv_to_mssectick(tv); 16739 nsegs = m->m_pkthdr.lro_nsegs; 16740 counter_u64_add(rack_proc_non_comp_ack, 1); 16741 #ifdef TCP_ACCOUNTING 16742 sched_pin(); 16743 if (thflags & TH_ACK) 16744 ts_val = get_cyclecount(); 16745 #endif 16746 if ((m->m_flags & M_TSTMP) || 16747 (m->m_flags & M_TSTMP_LRO)) { 16748 mbuf_tstmp2timespec(m, &ts); 16749 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; 16750 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; 16751 } else 16752 rack->r_ctl.act_rcv_time = *tv; 16753 kern_prefetch(rack, &prev_state); 16754 prev_state = 0; 16755 /* 16756 * Unscale the window into a 32-bit value. For the SYN_SENT state 16757 * the scale is zero. 16758 */ 16759 tiwin = th->th_win << tp->snd_scale; 16760 #ifdef TCP_ACCOUNTING 16761 if (thflags & TH_ACK) { 16762 /* 16763 * We have a tradeoff here. We can either do what we are 16764 * doing i.e. pinning to this CPU and then doing the accounting 16765 * <or> we could do a critical enter, setup the rdtsc and cpu 16766 * as in below, and then validate we are on the same CPU on 16767 * exit. 
		 * I have chosen not to do the critical enter since
		 * that often will gain you a context switch, and instead lock
		 * us (line above this if) to the same CPU with sched_pin(). This
		 * means we may be context switched out for a higher priority
		 * interrupt but we won't be moved to another CPU.
		 *
		 * If this occurs (which it won't very often since we most likely
		 * are running this code in interrupt context and only a higher
		 * priority will bump us ... clock?) we will falsely add the
		 * interrupt processing time in with the ack processing
		 * time. This is ok since it's a rare event.
		 */
		ack_val_set = tcp_do_ack_accounting(tp, th, &to, tiwin,
						    ctf_fixed_maxseg(tp));
	}
#endif
	/*
	 * Parse options on any incoming segment.
	 */
	memset(&to, 0, sizeof(to));
	tcp_dooptions(&to, (u_char *)(th + 1),
	    (th->th_off << 2) - sizeof(struct tcphdr),
	    (thflags & TH_SYN) ? TO_SYN : 0);
	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
	    __func__));
	KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
	    __func__));

	if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
	    (tp->t_flags & TF_GPUTINPROG)) {
		/*
		 * We have a goodput in progress
		 * and we have entered a late state.
		 * Do we have enough data in the sb
		 * to handle the GPUT request?
		 */
		uint32_t bytes;

		bytes = tp->gput_ack - tp->gput_seq;
		if (SEQ_GT(tp->gput_seq, tp->snd_una))
			bytes += tp->gput_seq - tp->snd_una;
		if (bytes > sbavail(&tptosocket(tp)->so_snd)) {
			/*
			 * There are not enough bytes in the socket
			 * buffer that have been sent to cover this
			 * measurement. Cancel it.
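			 * For illustration (made-up numbers): with gput_seq = 1000,
			 * gput_ack = 51000 and snd_una = 1000, we still need 50000
			 * bytes sitting in the send buffer (sent or not) to complete
			 * the measurement.  In these late states no new application
			 * data will arrive to make up a shortfall, so we abandon the
			 * measurement rather than let it produce a bogus, app-limited
			 * goodput sample.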
16813 */ 16814 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 16815 rack->r_ctl.rc_gp_srtt /*flex1*/, 16816 tp->gput_seq, 16817 0, 0, 18, __LINE__, NULL, 0); 16818 tp->t_flags &= ~TF_GPUTINPROG; 16819 } 16820 } 16821 high_seq = th->th_ack; 16822 if (tcp_bblogging_on(rack->rc_tp)) { 16823 union tcp_log_stackspecific log; 16824 struct timeval ltv; 16825 #ifdef TCP_REQUEST_TRK 16826 struct tcp_sendfile_track *tcp_req; 16827 16828 if (SEQ_GT(th->th_ack, tp->snd_una)) { 16829 tcp_req = tcp_req_find_req_for_seq(tp, (th->th_ack-1)); 16830 } else { 16831 tcp_req = tcp_req_find_req_for_seq(tp, th->th_ack); 16832 } 16833 #endif 16834 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 16835 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 16836 if (rack->rack_no_prr == 0) 16837 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 16838 else 16839 log.u_bbr.flex1 = 0; 16840 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 16841 log.u_bbr.use_lt_bw <<= 1; 16842 log.u_bbr.use_lt_bw |= rack->r_might_revert; 16843 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; 16844 log.u_bbr.bbr_state = rack->rc_free_cnt; 16845 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 16846 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; 16847 log.u_bbr.flex3 = m->m_flags; 16848 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 16849 log.u_bbr.lost = thflags; 16850 log.u_bbr.pacing_gain = 0x1; 16851 #ifdef TCP_ACCOUNTING 16852 log.u_bbr.cwnd_gain = ack_val_set; 16853 #endif 16854 log.u_bbr.flex7 = 2; 16855 if (m->m_flags & M_TSTMP) { 16856 /* Record the hardware timestamp if present */ 16857 mbuf_tstmp2timespec(m, &ts); 16858 ltv.tv_sec = ts.tv_sec; 16859 ltv.tv_usec = ts.tv_nsec / 1000; 16860 log.u_bbr.lt_epoch = tcp_tv_to_usectick(<v); 16861 } else if (m->m_flags & M_TSTMP_LRO) { 16862 /* Record the LRO the arrival timestamp */ 16863 mbuf_tstmp2timespec(m, &ts); 16864 ltv.tv_sec = ts.tv_sec; 16865 ltv.tv_usec = ts.tv_nsec / 1000; 16866 log.u_bbr.flex5 = tcp_tv_to_usectick(<v); 16867 } 16868 log.u_bbr.timeStamp = tcp_get_usecs(<v); 16869 /* Log the rcv time */ 16870 log.u_bbr.delRate = m->m_pkthdr.rcv_tstmp; 16871 #ifdef TCP_REQUEST_TRK 16872 log.u_bbr.applimited = tp->t_tcpreq_closed; 16873 log.u_bbr.applimited <<= 8; 16874 log.u_bbr.applimited |= tp->t_tcpreq_open; 16875 log.u_bbr.applimited <<= 8; 16876 log.u_bbr.applimited |= tp->t_tcpreq_req; 16877 if (tcp_req) { 16878 /* Copy out any client req info */ 16879 /* seconds */ 16880 log.u_bbr.pkt_epoch = (tcp_req->localtime / HPTS_USEC_IN_SEC); 16881 /* useconds */ 16882 log.u_bbr.delivered = (tcp_req->localtime % HPTS_USEC_IN_SEC); 16883 log.u_bbr.rttProp = tcp_req->timestamp; 16884 log.u_bbr.cur_del_rate = tcp_req->start; 16885 if (tcp_req->flags & TCP_TRK_TRACK_FLG_OPEN) { 16886 log.u_bbr.flex8 |= 1; 16887 } else { 16888 log.u_bbr.flex8 |= 2; 16889 log.u_bbr.bw_inuse = tcp_req->end; 16890 } 16891 log.u_bbr.flex6 = tcp_req->start_seq; 16892 if (tcp_req->flags & TCP_TRK_TRACK_FLG_COMP) { 16893 log.u_bbr.flex8 |= 4; 16894 log.u_bbr.epoch = tcp_req->end_seq; 16895 } 16896 } 16897 #endif 16898 TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, 16899 tlen, &log, true, <v); 16900 } 16901 /* Remove ack required flag if set, we have one */ 16902 if (thflags & TH_ACK) 16903 rack->rc_ack_required = 0; 16904 if (rack->sack_attack_disable > 0) { 16905 rack->r_ctl.ack_during_sd++; 16906 rack_log_type_bbrsnd(rack, 0, 0, cts, tv, __LINE__); 16907 } 16908 if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { 16909 way_out = 4; 16910 retval = 0; 
16911 m_freem(m); 16912 goto done_with_input; 16913 } 16914 /* 16915 * If a segment with the ACK-bit set arrives in the SYN-SENT state 16916 * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9. 16917 */ 16918 if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && 16919 (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { 16920 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 16921 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 16922 #ifdef TCP_ACCOUNTING 16923 sched_unpin(); 16924 #endif 16925 return (1); 16926 } 16927 /* 16928 * If timestamps were negotiated during SYN/ACK and a 16929 * segment without a timestamp is received, silently drop 16930 * the segment, unless it is a RST segment or missing timestamps are 16931 * tolerated. 16932 * See section 3.2 of RFC 7323. 16933 */ 16934 if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS) && 16935 ((thflags & TH_RST) == 0) && (V_tcp_tolerate_missing_ts == 0)) { 16936 way_out = 5; 16937 retval = 0; 16938 m_freem(m); 16939 goto done_with_input; 16940 } 16941 16942 /* 16943 * Segment received on connection. Reset idle time and keep-alive 16944 * timer. XXX: This should be done after segment validation to 16945 * ignore broken/spoofed segs. 16946 */ 16947 if (tp->t_idle_reduce && 16948 (tp->snd_max == tp->snd_una) && 16949 (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { 16950 counter_u64_add(rack_input_idle_reduces, 1); 16951 rack_cc_after_idle(rack, tp); 16952 } 16953 tp->t_rcvtime = ticks; 16954 #ifdef STATS 16955 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); 16956 #endif 16957 if (tiwin > rack->r_ctl.rc_high_rwnd) 16958 rack->r_ctl.rc_high_rwnd = tiwin; 16959 /* 16960 * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move 16961 * this to occur after we've validated the segment. 16962 */ 16963 if (tcp_ecn_input_segment(tp, thflags, tlen, 16964 tcp_packets_this_ack(tp, th->th_ack), 16965 iptos)) 16966 rack_cong_signal(tp, CC_ECN, th->th_ack, __LINE__); 16967 16968 /* 16969 * If echoed timestamp is later than the current time, fall back to 16970 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 16971 * were used when this connection was established. 16972 */ 16973 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 16974 to.to_tsecr -= tp->ts_offset; 16975 if (TSTMP_GT(to.to_tsecr, ms_cts)) 16976 to.to_tsecr = 0; 16977 } 16978 16979 /* 16980 * If its the first time in we need to take care of options and 16981 * verify we can do SACK for rack! 16982 */ 16983 if (rack->r_state == 0) { 16984 /* Should be init'd by rack_init() */ 16985 KASSERT(rack->rc_inp != NULL, 16986 ("%s: rack->rc_inp unexpectedly NULL", __func__)); 16987 if (rack->rc_inp == NULL) { 16988 rack->rc_inp = inp; 16989 } 16990 16991 /* 16992 * Process options only when we get SYN/ACK back. The SYN 16993 * case for incoming connections is handled in tcp_syncache. 16994 * According to RFC1323 the window field in a SYN (i.e., a 16995 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX 16996 * this is traditional behavior, may need to be cleaned up. 16997 */ 16998 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 16999 /* Handle parallel SYN for ECN */ 17000 tcp_ecn_input_parallel_syn(tp, thflags, iptos); 17001 if ((to.to_flags & TOF_SCALE) && 17002 (tp->t_flags & TF_REQ_SCALE)) { 17003 tp->t_flags |= TF_RCVD_SCALE; 17004 tp->snd_scale = to.to_wscale; 17005 } else 17006 tp->t_flags &= ~TF_REQ_SCALE; 17007 /* 17008 * Initial send window. 
It will be updated with the 17009 * next incoming segment to the scaled value. 17010 */ 17011 tp->snd_wnd = th->th_win; 17012 rack_validate_fo_sendwin_up(tp, rack); 17013 if ((to.to_flags & TOF_TS) && 17014 (tp->t_flags & TF_REQ_TSTMP)) { 17015 tp->t_flags |= TF_RCVD_TSTMP; 17016 tp->ts_recent = to.to_tsval; 17017 tp->ts_recent_age = cts; 17018 } else 17019 tp->t_flags &= ~TF_REQ_TSTMP; 17020 if (to.to_flags & TOF_MSS) { 17021 tcp_mss(tp, to.to_mss); 17022 } 17023 if ((tp->t_flags & TF_SACK_PERMIT) && 17024 (to.to_flags & TOF_SACKPERM) == 0) 17025 tp->t_flags &= ~TF_SACK_PERMIT; 17026 if (IS_FASTOPEN(tp->t_flags)) { 17027 if (to.to_flags & TOF_FASTOPEN) { 17028 uint16_t mss; 17029 17030 if (to.to_flags & TOF_MSS) 17031 mss = to.to_mss; 17032 else 17033 if ((inp->inp_vflag & INP_IPV6) != 0) 17034 mss = TCP6_MSS; 17035 else 17036 mss = TCP_MSS; 17037 tcp_fastopen_update_cache(tp, mss, 17038 to.to_tfo_len, to.to_tfo_cookie); 17039 } else 17040 tcp_fastopen_disable_path(tp); 17041 } 17042 } 17043 /* 17044 * At this point we are at the initial call. Here we decide 17045 * if we are doing RACK or not. We do this by seeing if 17046 * TF_SACK_PERMIT is set and the sack-not-required is clear. 17047 * The code now does do dup-ack counting so if you don't 17048 * switch back you won't get rack & TLP, but you will still 17049 * get this stack. 17050 */ 17051 17052 if ((rack_sack_not_required == 0) && 17053 ((tp->t_flags & TF_SACK_PERMIT) == 0)) { 17054 tcp_switch_back_to_default(tp); 17055 (*tp->t_fb->tfb_tcp_do_segment)(tp, m, th, drop_hdrlen, 17056 tlen, iptos); 17057 #ifdef TCP_ACCOUNTING 17058 sched_unpin(); 17059 #endif 17060 return (1); 17061 } 17062 tcp_set_hpts(tp); 17063 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); 17064 } 17065 if (thflags & TH_FIN) 17066 tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN); 17067 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 17068 if ((rack->rc_gp_dyn_mul) && 17069 (rack->use_fixed_rate == 0) && 17070 (rack->rc_always_pace)) { 17071 /* Check in on probertt */ 17072 rack_check_probe_rtt(rack, us_cts); 17073 } 17074 rack_clear_rate_sample(rack); 17075 if ((rack->forced_ack) && 17076 ((tcp_get_flags(th) & TH_RST) == 0)) { 17077 rack_handle_probe_response(rack, tiwin, us_cts); 17078 } 17079 /* 17080 * This is the one exception case where we set the rack state 17081 * always. All other times (timers etc) we must have a rack-state 17082 * set (so we assure we have done the checks above for SACK). 17083 */ 17084 rack->r_ctl.rc_rcvtime = cts; 17085 if (rack->r_state != tp->t_state) 17086 rack_set_state(tp, rack); 17087 if (SEQ_GT(th->th_ack, tp->snd_una) && 17088 (rsm = tqhash_min(rack->r_ctl.tqh)) != NULL) 17089 kern_prefetch(rsm, &prev_state); 17090 prev_state = rack->r_state; 17091 if ((thflags & TH_RST) && 17092 ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && 17093 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || 17094 (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq))) { 17095 /* The connection will be killed by a reset check the tracepoint */ 17096 tcp_trace_point(rack->rc_tp, TCP_TP_RESET_RCV); 17097 } 17098 retval = (*rack->r_substate) (m, th, so, 17099 tp, &to, drop_hdrlen, 17100 tlen, tiwin, thflags, nxt_pkt, iptos); 17101 if (retval == 0) { 17102 /* 17103 * If retval is 1 the tcb is unlocked and most likely the tp 17104 * is gone. 
		 */
		INP_WLOCK_ASSERT(inp);
		if ((rack->rc_gp_dyn_mul) &&
		    (rack->rc_always_pace) &&
		    (rack->use_fixed_rate == 0) &&
		    rack->in_probe_rtt &&
		    (rack->r_ctl.rc_time_probertt_starts == 0)) {
			/*
			 * If we are going for target, let's recheck before
			 * we output.
			 */
			rack_check_probe_rtt(rack, us_cts);
		}
		if (rack->set_pacing_done_a_iw == 0) {
			/* How much has been acked? */
			if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) {
				/* We have enough to set in the pacing segment size */
				rack->set_pacing_done_a_iw = 1;
				rack_set_pace_segments(tp, rack, __LINE__, NULL);
			}
		}
		tcp_rack_xmit_timer_commit(rack, tp);
#ifdef TCP_ACCOUNTING
		/*
		 * If we set ack_val_set to what ack processing we are doing
		 * we also want to track how many cycles we burned. Note
		 * the bits after tcp_output we let be "free". This is because
		 * we are also tracking the tcp_output times as well. Note the
		 * use of 0xf here since we only have 11 counters (0 - 0xa) and
		 * 0xf cannot be returned and is what we initialize it to, to
		 * indicate we are not doing the tabulations.
		 */
		if (ack_val_set != 0xf) {
			uint64_t crtsc;

			crtsc = get_cyclecount();
			if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
				tp->tcp_proc_time[ack_val_set] += (crtsc - ts_val);
			}
		}
#endif
		if ((nxt_pkt == 0) && (no_output == 0)) {
			if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) {
do_output_now:
				if (tcp_output(tp) < 0) {
#ifdef TCP_ACCOUNTING
					sched_unpin();
#endif
					return (1);
				}
				did_out = 1;
			}
			rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
			rack_free_trim(rack);
		} else if ((no_output == 1) &&
			   (nxt_pkt == 0) &&
			   (tcp_in_hpts(rack->rc_tp) == 0)) {
			/*
			 * We are not in hpts and we had a pacing timer up. Use
			 * the remaining time (slot_remaining) to restart the timer.
			 */
			KASSERT((slot_remaining != 0), ("slot remaining is zero for rack:%p tp:%p", rack, tp));
			rack_start_hpts_timer(rack, tp, cts, slot_remaining, 0, 0);
			rack_free_trim(rack);
		}
		/* Clear the flag, it may have been cleared by output but we may not have */
		if ((nxt_pkt == 0) && (tp->t_flags2 & TF2_HPTS_CALLS))
			tp->t_flags2 &= ~TF2_HPTS_CALLS;
		/* Update any rounds needed */
		if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp))
			rack_log_hystart_event(rack, high_seq, 8);
		/*
		 * The draft (v3) calls for us to use SEQ_GEQ, but that
		 * causes issues when we are just going app limited. Let's
		 * instead use SEQ_GT <or> where it's equal but more data
		 * is outstanding.
		 *
		 * Also make sure we are on the last ack of a series. We
		 * have to have all the acks processed in queue to know
		 * if there is something left outstanding.
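		 * A "round" here is roughly one pass of the data that was
		 * outstanding when the round began: when the cum-ack reaches
		 * r_ctl.roundends we bump current_round, set rc_new_rnd_needed
		 * so the next send records the new round's end point, and give
		 * the CC module a newround() callback (used, e.g., by
		 * HyStart++ style logic in cc_newreno).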
17185 */ 17186 if (SEQ_GEQ(tp->snd_una, rack->r_ctl.roundends) && 17187 (rack->rc_new_rnd_needed == 0) && 17188 (nxt_pkt == 0)) { 17189 rack_log_hystart_event(rack, tp->snd_una, 21); 17190 rack->r_ctl.current_round++; 17191 /* Force the next send to setup the next round */ 17192 rack->rc_new_rnd_needed = 1; 17193 if (CC_ALGO(tp)->newround != NULL) { 17194 CC_ALGO(tp)->newround(&tp->t_ccv, rack->r_ctl.current_round); 17195 } 17196 } 17197 if ((nxt_pkt == 0) && 17198 ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && 17199 (SEQ_GT(tp->snd_max, tp->snd_una) || 17200 (tp->t_flags & TF_DELACK) || 17201 ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 17202 (tp->t_state <= TCPS_CLOSING)))) { 17203 /* We could not send (probably in the hpts but stopped the timer earlier)? */ 17204 if ((tp->snd_max == tp->snd_una) && 17205 ((tp->t_flags & TF_DELACK) == 0) && 17206 (tcp_in_hpts(rack->rc_tp)) && 17207 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 17208 /* keep alive not needed if we are hptsi output yet */ 17209 ; 17210 } else { 17211 int late = 0; 17212 if (tcp_in_hpts(tp)) { 17213 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 17214 us_cts = tcp_get_usecs(NULL); 17215 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 17216 rack->r_early = 1; 17217 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); 17218 } else 17219 late = 1; 17220 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 17221 } 17222 tcp_hpts_remove(tp); 17223 } 17224 if (late && (did_out == 0)) { 17225 /* 17226 * We are late in the sending 17227 * and we did not call the output 17228 * (this probably should not happen). 17229 */ 17230 goto do_output_now; 17231 } 17232 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); 17233 } 17234 way_out = 1; 17235 } else if (nxt_pkt == 0) { 17236 /* Do we have the correct timer running? */ 17237 rack_timer_audit(tp, rack, &so->so_snd); 17238 way_out = 2; 17239 } 17240 done_with_input: 17241 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out, max(1, nsegs)); 17242 if (did_out) 17243 rack->r_wanted_output = 0; 17244 } 17245 #ifdef TCP_ACCOUNTING 17246 sched_unpin(); 17247 #endif 17248 return (retval); 17249 } 17250 17251 static void 17252 rack_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, 17253 int32_t drop_hdrlen, int32_t tlen, uint8_t iptos) 17254 { 17255 struct timeval tv; 17256 17257 /* First lets see if we have old packets */ 17258 if (!STAILQ_EMPTY(&tp->t_inqueue)) { 17259 if (ctf_do_queued_segments(tp, 1)) { 17260 m_freem(m); 17261 return; 17262 } 17263 } 17264 if (m->m_flags & M_TSTMP_LRO) { 17265 mbuf_tstmp2timeval(m, &tv); 17266 } else { 17267 /* Should not be should we kassert instead? */ 17268 tcp_get_usecs(&tv); 17269 } 17270 if (rack_do_segment_nounlock(tp, m, th, drop_hdrlen, tlen, iptos, 0, 17271 &tv) == 0) { 17272 INP_WUNLOCK(tptoinpcb(tp)); 17273 } 17274 } 17275 17276 struct rack_sendmap * 17277 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) 17278 { 17279 struct rack_sendmap *rsm = NULL; 17280 int32_t idx; 17281 uint32_t srtt = 0, thresh = 0, ts_low = 0; 17282 int no_sack = 0; 17283 17284 /* Return the next guy to be re-transmitted */ 17285 if (tqhash_empty(rack->r_ctl.tqh)) { 17286 return (NULL); 17287 } 17288 if (tp->t_flags & TF_SENTFIN) { 17289 /* retran the end FIN? 
*/ 17290 return (NULL); 17291 } 17292 /* ok lets look at this one */ 17293 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 17294 if (rack->r_must_retran && rsm && (rsm->r_flags & RACK_MUST_RXT)) { 17295 return (rsm); 17296 } 17297 if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) { 17298 goto check_it; 17299 } 17300 rsm = rack_find_lowest_rsm(rack); 17301 if (rsm == NULL) { 17302 return (NULL); 17303 } 17304 check_it: 17305 if (((rack->rc_tp->t_flags & TF_SACK_PERMIT) == 0) || 17306 (rack->sack_attack_disable > 0)) { 17307 no_sack = 1; 17308 } 17309 if ((no_sack > 0) && 17310 (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { 17311 /* 17312 * No sack so we automatically do the 3 strikes and 17313 * retransmit (no rack timer would be started). 17314 */ 17315 return (rsm); 17316 } 17317 if (rsm->r_flags & RACK_ACKED) { 17318 return (NULL); 17319 } 17320 if (((rsm->r_flags & RACK_SACK_PASSED) == 0) && 17321 (rsm->r_dupack < DUP_ACK_THRESHOLD)) { 17322 /* Its not yet ready */ 17323 return (NULL); 17324 } 17325 srtt = rack_grab_rtt(tp, rack); 17326 idx = rsm->r_rtr_cnt - 1; 17327 ts_low = (uint32_t)rsm->r_tim_lastsent[idx]; 17328 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 17329 if ((tsused == ts_low) || 17330 (TSTMP_LT(tsused, ts_low))) { 17331 /* No time since sending */ 17332 return (NULL); 17333 } 17334 if ((tsused - ts_low) < thresh) { 17335 /* It has not been long enough yet */ 17336 return (NULL); 17337 } 17338 if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || 17339 ((rsm->r_flags & RACK_SACK_PASSED) && 17340 (rack->sack_attack_disable == 0))) { 17341 /* 17342 * We have passed the dup-ack threshold <or> 17343 * a SACK has indicated this is missing. 17344 * Note that if you are a declared attacker 17345 * it is only the dup-ack threshold that 17346 * will cause retransmits. 17347 */ 17348 /* log retransmit reason */ 17349 rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1); 17350 rack->r_fast_output = 0; 17351 return (rsm); 17352 } 17353 return (NULL); 17354 } 17355 17356 static void 17357 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, 17358 uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, 17359 int line, struct rack_sendmap *rsm, uint8_t quality) 17360 { 17361 if (tcp_bblogging_on(rack->rc_tp)) { 17362 union tcp_log_stackspecific log; 17363 struct timeval tv; 17364 17365 if (rack_verbose_logging == 0) { 17366 /* 17367 * We are not verbose screen out all but 17368 * ones we always want. 
17369 */ 17370 if ((method != 2) && 17371 (method != 3) && 17372 (method != 7) && 17373 (method != 14) && 17374 (method != 20)) { 17375 return; 17376 } 17377 } 17378 memset(&log, 0, sizeof(log)); 17379 log.u_bbr.flex1 = slot; 17380 log.u_bbr.flex2 = len; 17381 log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs; 17382 log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs; 17383 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ss; 17384 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_ca; 17385 log.u_bbr.use_lt_bw = rack->rc_ack_can_sendout_data; 17386 log.u_bbr.use_lt_bw <<= 1; 17387 log.u_bbr.use_lt_bw |= rack->r_late; 17388 log.u_bbr.use_lt_bw <<= 1; 17389 log.u_bbr.use_lt_bw |= rack->r_early; 17390 log.u_bbr.use_lt_bw <<= 1; 17391 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set; 17392 log.u_bbr.use_lt_bw <<= 1; 17393 log.u_bbr.use_lt_bw |= rack->rc_gp_filled; 17394 log.u_bbr.use_lt_bw <<= 1; 17395 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; 17396 log.u_bbr.use_lt_bw <<= 1; 17397 log.u_bbr.use_lt_bw |= rack->in_probe_rtt; 17398 log.u_bbr.use_lt_bw <<= 1; 17399 log.u_bbr.use_lt_bw |= rack->gp_ready; 17400 log.u_bbr.pkt_epoch = line; 17401 log.u_bbr.epoch = rack->r_ctl.rc_agg_delayed; 17402 log.u_bbr.lt_epoch = rack->r_ctl.rc_agg_early; 17403 log.u_bbr.applimited = rack->r_ctl.rack_per_of_gp_rec; 17404 log.u_bbr.bw_inuse = bw_est; 17405 log.u_bbr.delRate = bw; 17406 if (rack->r_ctl.gp_bw == 0) 17407 log.u_bbr.cur_del_rate = 0; 17408 else 17409 log.u_bbr.cur_del_rate = rack_get_bw(rack); 17410 log.u_bbr.rttProp = len_time; 17411 log.u_bbr.pkts_out = rack->r_ctl.rc_rack_min_rtt; 17412 log.u_bbr.lost = rack->r_ctl.rc_probertt_sndmax_atexit; 17413 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); 17414 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) { 17415 /* We are in slow start */ 17416 log.u_bbr.flex7 = 1; 17417 } else { 17418 /* we are on congestion avoidance */ 17419 log.u_bbr.flex7 = 0; 17420 } 17421 log.u_bbr.flex8 = method; 17422 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 17423 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 17424 log.u_bbr.cwnd_gain = rack->rc_gp_saw_rec; 17425 log.u_bbr.cwnd_gain <<= 1; 17426 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; 17427 log.u_bbr.cwnd_gain <<= 1; 17428 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; 17429 log.u_bbr.bbr_substate = quality; 17430 log.u_bbr.bbr_state = rack->dgp_on; 17431 log.u_bbr.bbr_state <<= 1; 17432 log.u_bbr.bbr_state |= rack->r_fill_less_agg; 17433 log.u_bbr.bbr_state <<= 1; 17434 log.u_bbr.bbr_state |= rack->rc_pace_to_cwnd; 17435 log.u_bbr.bbr_state <<= 2; 17436 log.u_bbr.bbr_state |= rack->r_pacing_discount; 17437 log.u_bbr.flex7 = ((rack->r_ctl.pacing_discount_amm << 1) | log.u_bbr.flex7); 17438 TCP_LOG_EVENTP(rack->rc_tp, NULL, 17439 &rack->rc_inp->inp_socket->so_rcv, 17440 &rack->rc_inp->inp_socket->so_snd, 17441 BBR_LOG_HPTSI_CALC, 0, 17442 0, &log, false, &tv); 17443 } 17444 } 17445 17446 static uint32_t 17447 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss) 17448 { 17449 uint32_t new_tso, user_max, pace_one; 17450 17451 user_max = rack->rc_user_set_max_segs * mss; 17452 if (rack->rc_force_max_seg) { 17453 return (user_max); 17454 } 17455 if (rack->use_fixed_rate && 17456 ((rack->r_ctl.crte == NULL) || 17457 (bw != rack->r_ctl.crte->rate))) { 17458 /* Use the user mss since we are not exactly matched */ 17459 return (user_max); 17460 } 17461 if (rack_pace_one_seg || 17462 (rack->r_ctl.rc_user_set_min_segs == 1)) 17463 pace_one = 1; 17464 else 17465 pace_one = 0; 17466 
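	/*
	 * A rough sketch of what follows (the details live in
	 * tcp_get_pacing_burst_size_w_divisor()): pick a burst size from
	 * the pacing bandwidth and divisor, then clamp it into the range
	 * the user (and, in hybrid mode, the client) asked for, roughly:
	 *
	 *	new_tso = burst(bw, mss, divisor);
	 *	new_tso = min(new_tso, rc_user_set_max_segs * mss);
	 *	new_tso = max(new_tso, client_suggested_maxseg * mss);  (hybrid only)
	 *	new_tso = max(new_tso, rc_user_set_min_segs * mss);
	 *
	 * where burst() is just shorthand for the helper call below.
	 */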
	new_tso = tcp_get_pacing_burst_size_w_divisor(rack->rc_tp, bw, mss,
		     pace_one, rack->r_ctl.crte, NULL, rack->r_ctl.pace_len_divisor);
	if (new_tso > user_max)
		new_tso = user_max;
	if (rack->rc_hybrid_mode && rack->r_ctl.client_suggested_maxseg) {
		if (((uint32_t)rack->r_ctl.client_suggested_maxseg * mss) > new_tso)
			new_tso = (uint32_t)rack->r_ctl.client_suggested_maxseg * mss;
	}
	if (rack->r_ctl.rc_user_set_min_segs &&
	    ((rack->r_ctl.rc_user_set_min_segs * mss) > new_tso))
		new_tso = rack->r_ctl.rc_user_set_min_segs * mss;
	return (new_tso);
}

static uint64_t
rack_arrive_at_discounted_rate(struct tcp_rack *rack, uint64_t window_input, uint32_t *rate_set, uint32_t *gain_b)
{
	uint64_t reduced_win;
	uint32_t gain;

	if (window_input < rc_init_window(rack)) {
		/*
		 * The cwnd is collapsed to
		 * nearly zero, maybe because of a time-out?
		 * Let's drop back to the lt-bw.
		 */
		reduced_win = rack_get_lt_bw(rack);
		/* Set the flag so the caller knows it's a rate and not a reduced window */
		*rate_set = 1;
		gain = 100;
	} else if (IN_RECOVERY(rack->rc_tp->t_flags)) {
		/*
		 * If we are in recovery our cwnd needs to be less for
		 * our pacing consideration.
		 */
		if (rack->rack_hibeta == 0) {
			reduced_win = window_input / 2;
			gain = 50;
		} else {
			reduced_win = window_input * rack->r_ctl.saved_hibeta;
			reduced_win /= 100;
			gain = rack->r_ctl.saved_hibeta;
		}
	} else {
		/*
		 * Apply Timely factor to increase/decrease the
		 * amount we are pacing at.
		 */
		gain = rack_get_output_gain(rack, NULL);
		if (gain > rack_gain_p5_ub) {
			gain = rack_gain_p5_ub;
		}
		reduced_win = window_input * gain;
		reduced_win /= 100;
	}
	if (gain_b != NULL)
		*gain_b = gain;
	/*
	 * What is being returned here is a trimmed down
	 * window value in all cases where rate_set is left
	 * at 0. In one case we actually return a rate (the lt_bw).
	 * The "reduced_win" is returned as a slimmed down cwnd that
	 * is then converted by the caller into a rate when rate_set
	 * is 0.
	 */
	return (reduced_win);
}

static int32_t
pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz, int *capped, uint64_t *rate_wanted, uint8_t non_paced)
{
	uint64_t lentim, fill_bw;

	/* Let's first see if we are full, if so continue with the normal rate */
	rack->r_via_fill_cw = 0;
	if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use)
		return (slot);
	if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd)
		return (slot);
	if (rack->r_ctl.rc_last_us_rtt == 0)
		return (slot);
	if (rack->rc_pace_fill_if_rttin_range &&
	    (rack->r_ctl.rc_last_us_rtt >=
	     (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) {
		/* The rtt is huge, N * smallest, let's not fill */
		return (slot);
	}
	/*
	 * First let's calculate the b/w based on the last us-rtt
	 * and the smallest send window.
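	 * As an illustration (made-up numbers): with a 64KB usable window
	 * and a last measured RTT of 10ms, fill_bw ends up as
	 * 65536 * HPTS_USEC_IN_SEC / 10000 ~= 6.5MB/s, i.e. the rate that
	 * would drain exactly one window per RTT.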
17557 */ 17558 fill_bw = min(rack->rc_tp->snd_cwnd, rack->r_ctl.cwnd_to_use); 17559 if (rack->rc_fillcw_apply_discount) { 17560 uint32_t rate_set = 0; 17561 17562 fill_bw = rack_arrive_at_discounted_rate(rack, fill_bw, &rate_set, NULL); 17563 if (rate_set) { 17564 goto at_lt_bw; 17565 } 17566 } 17567 /* Take the rwnd if its smaller */ 17568 if (fill_bw > rack->rc_tp->snd_wnd) 17569 fill_bw = rack->rc_tp->snd_wnd; 17570 /* Now lets make it into a b/w */ 17571 fill_bw *= (uint64_t)HPTS_USEC_IN_SEC; 17572 fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt; 17573 at_lt_bw: 17574 if (rack->r_fill_less_agg) { 17575 /* 17576 * We want the average of the rate_wanted 17577 * and our fill-cw calculated bw. We also want 17578 * to cap any increase to be no more than 17579 * X times the lt_bw (where X is the rack_bw_multipler). 17580 */ 17581 uint64_t lt_bw, rate; 17582 17583 lt_bw = rack_get_lt_bw(rack); 17584 if (lt_bw > *rate_wanted) 17585 rate = lt_bw; 17586 else 17587 rate = *rate_wanted; 17588 fill_bw += rate; 17589 fill_bw /= 2; 17590 if (rack_bw_multipler && (fill_bw > (rate * rack_bw_multipler))) { 17591 fill_bw = rate * rack_bw_multipler; 17592 } 17593 } 17594 /* We are below the min b/w */ 17595 if (non_paced) 17596 *rate_wanted = fill_bw; 17597 if ((fill_bw < RACK_MIN_BW) || (fill_bw < *rate_wanted)) 17598 return (slot); 17599 rack->r_via_fill_cw = 1; 17600 if (rack->r_rack_hw_rate_caps && 17601 (rack->r_ctl.crte != NULL)) { 17602 uint64_t high_rate; 17603 17604 high_rate = tcp_hw_highest_rate(rack->r_ctl.crte); 17605 if (fill_bw > high_rate) { 17606 /* We are capping bw at the highest rate table entry */ 17607 if (*rate_wanted > high_rate) { 17608 /* The original rate was also capped */ 17609 rack->r_via_fill_cw = 0; 17610 } 17611 rack_log_hdwr_pacing(rack, 17612 fill_bw, high_rate, __LINE__, 17613 0, 3); 17614 fill_bw = high_rate; 17615 if (capped) 17616 *capped = 1; 17617 } 17618 } else if ((rack->r_ctl.crte == NULL) && 17619 (rack->rack_hdrw_pacing == 0) && 17620 (rack->rack_hdw_pace_ena) && 17621 rack->r_rack_hw_rate_caps && 17622 (rack->rack_attempt_hdwr_pace == 0) && 17623 (rack->rc_inp->inp_route.ro_nh != NULL) && 17624 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 17625 /* 17626 * Ok we may have a first attempt that is greater than our top rate 17627 * lets check. 17628 */ 17629 uint64_t high_rate; 17630 17631 high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp); 17632 if (high_rate) { 17633 if (fill_bw > high_rate) { 17634 fill_bw = high_rate; 17635 if (capped) 17636 *capped = 1; 17637 } 17638 } 17639 } 17640 if (rack->r_ctl.bw_rate_cap && (fill_bw > rack->r_ctl.bw_rate_cap)) { 17641 if (rack->rc_hybrid_mode) 17642 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 17643 fill_bw, 0, 0, HYBRID_LOG_RATE_CAP, 2, NULL, __LINE__); 17644 fill_bw = rack->r_ctl.bw_rate_cap; 17645 } 17646 /* 17647 * Ok fill_bw holds our mythical b/w to fill the cwnd 17648 * in an rtt (unless it was capped), what does that 17649 * time wise equate too? 
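	 * Continuing the illustration above: pacing 14480 bytes (ten 1448
	 * byte segments) at ~6.5MB/s gives a lentim of roughly 2200 usec.
	 * Note we only switch to lentim when it is *shorter* than the slot
	 * we already had (fill-cw can only speed us up), or when the caller
	 * was not pacing at all (non_paced).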
	 */
	lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC;
	lentim /= fill_bw;
	*rate_wanted = fill_bw;
	if (non_paced || (lentim < slot)) {
		rack_log_pacing_delay_calc(rack, len, slot, fill_bw,
					   0, lentim, 12, __LINE__, NULL, 0);
		return ((int32_t)lentim);
	} else
		return (slot);
}

static int32_t
rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz)
{
	uint64_t srtt;
	int32_t slot = 0;
	int32_t minslot = 0;
	int can_start_hw_pacing = 1;
	int err;
	int pace_one;

	if (rack_pace_one_seg ||
	    (rack->r_ctl.rc_user_set_min_segs == 1))
		pace_one = 1;
	else
		pace_one = 0;
	if (rack->rc_always_pace == 0) {
		/*
		 * We use the most optimistic possible cwnd/srtt for
		 * sending calculations. This will make our
		 * calculation anticipate getting more through
		 * quicker than possible. But that's ok; we don't want
		 * the peer to have a gap in data sending.
		 */
		uint64_t cwnd, tr_perms = 0;
		int32_t reduce = 0;

old_method:
		/*
		 * We keep no precise pacing with the old method;
		 * instead we use the pacer to mitigate bursts.
		 */
		if (rack->r_ctl.rc_rack_min_rtt)
			srtt = rack->r_ctl.rc_rack_min_rtt;
		else
			srtt = max(tp->t_srtt, 1);
		if (rack->r_ctl.rc_rack_largest_cwnd)
			cwnd = rack->r_ctl.rc_rack_largest_cwnd;
		else
			cwnd = rack->r_ctl.cwnd_to_use;
		/* Inflate cwnd by 1000 so srtt of usecs is in ms */
		tr_perms = (cwnd * 1000) / srtt;
		if (tr_perms == 0) {
			tr_perms = ctf_fixed_maxseg(tp);
		}
		/*
		 * Calculate how long this will take to drain. If the
		 * calculation comes out to zero, that's ok; we will use
		 * send_a_lot to possibly spin around for more, increasing
		 * tot_len_this_send to the point that it's going to require
		 * a pace, or we hit the cwnd. In that case we are just
		 * waiting for an ACK.
		 */
		slot = len / tr_perms;
		/* Now do we reduce the time so we don't run dry?
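		 * The idea is to hand data to the wire a bit earlier than
		 * the strict drain time so the peer never sees a gap.  For
		 * illustration: with slot = 8 (still in msec at this point)
		 * and rack_slot_reduction = 4 we shave off 8/4 = 2 and pace
		 * after 6 msec, converting to usec just below.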
*/ 17717 if (slot && rack_slot_reduction) { 17718 reduce = (slot / rack_slot_reduction); 17719 if (reduce < slot) { 17720 slot -= reduce; 17721 } else 17722 slot = 0; 17723 } 17724 slot *= HPTS_USEC_IN_MSEC; 17725 if (rack->rc_pace_to_cwnd) { 17726 uint64_t rate_wanted = 0; 17727 17728 slot = pace_to_fill_cwnd(rack, slot, len, segsiz, NULL, &rate_wanted, 1); 17729 rack->rc_ack_can_sendout_data = 1; 17730 rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, 0, 0, 14, __LINE__, NULL, 0); 17731 } else 17732 rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL, 0); 17733 /*******************************************************/ 17734 /* RRS: We insert non-paced call to stats here for len */ 17735 /*******************************************************/ 17736 } else { 17737 uint64_t bw_est, res, lentim, rate_wanted; 17738 uint32_t segs, oh; 17739 int capped = 0; 17740 int prev_fill; 17741 17742 if ((rack->r_rr_config == 1) && rsm) { 17743 return (rack->r_ctl.rc_min_to); 17744 } 17745 if (rack->use_fixed_rate) { 17746 rate_wanted = bw_est = rack_get_fixed_pacing_bw(rack); 17747 } else if ((rack->r_ctl.init_rate == 0) && 17748 (rack->r_ctl.gp_bw == 0)) { 17749 /* no way to yet do an estimate */ 17750 bw_est = rate_wanted = 0; 17751 } else if (rack->dgp_on) { 17752 bw_est = rack_get_bw(rack); 17753 rate_wanted = rack_get_output_bw(rack, bw_est, rsm, &capped); 17754 } else { 17755 uint32_t gain, rate_set = 0; 17756 17757 rate_wanted = min(rack->rc_tp->snd_cwnd, rack->r_ctl.cwnd_to_use); 17758 rate_wanted = rack_arrive_at_discounted_rate(rack, rate_wanted, &rate_set, &gain); 17759 if (rate_set == 0) { 17760 if (rate_wanted > rack->rc_tp->snd_wnd) 17761 rate_wanted = rack->rc_tp->snd_wnd; 17762 /* Now lets make it into a b/w */ 17763 rate_wanted *= (uint64_t)HPTS_USEC_IN_SEC; 17764 rate_wanted /= (uint64_t)rack->r_ctl.rc_last_us_rtt; 17765 } 17766 bw_est = rate_wanted; 17767 rack_log_pacing_delay_calc(rack, rack->rc_tp->snd_cwnd, 17768 rack->r_ctl.cwnd_to_use, 17769 rate_wanted, bw_est, 17770 rack->r_ctl.rc_last_us_rtt, 17771 88, __LINE__, NULL, gain); 17772 } 17773 if ((bw_est == 0) || (rate_wanted == 0) || 17774 ((rack->gp_ready == 0) && (rack->use_fixed_rate == 0))) { 17775 /* 17776 * No way yet to make a b/w estimate or 17777 * our raise is set incorrectly. 17778 */ 17779 goto old_method; 17780 } 17781 rack_rate_cap_bw(rack, &rate_wanted, &capped); 17782 /* We need to account for all the overheads */ 17783 segs = (len + segsiz - 1) / segsiz; 17784 /* 17785 * We need the diff between 1514 bytes (e-mtu with e-hdr) 17786 * and how much data we put in each packet. Yes this 17787 * means we may be off if we are larger than 1500 bytes 17788 * or smaller. But this just makes us more conservative. 17789 */ 17790 17791 oh = (tp->t_maxseg - segsiz) + sizeof(struct tcphdr); 17792 if (rack->r_is_v6) { 17793 #ifdef INET6 17794 oh += sizeof(struct ip6_hdr); 17795 #endif 17796 } else { 17797 #ifdef INET 17798 oh += sizeof(struct ip); 17799 #endif 17800 } 17801 /* We add a fixed 14 for the ethernet header */ 17802 oh += 14; 17803 segs *= oh; 17804 lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC; 17805 res = lentim / rate_wanted; 17806 slot = (uint32_t)res; 17807 if (rack_hw_rate_min && 17808 (rate_wanted < rack_hw_rate_min)) { 17809 can_start_hw_pacing = 0; 17810 if (rack->r_ctl.crte) { 17811 /* 17812 * Ok we need to release it, we 17813 * have fallen too low. 
17814 */ 17815 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 17816 rack->r_ctl.crte = NULL; 17817 rack->rack_attempt_hdwr_pace = 0; 17818 rack->rack_hdrw_pacing = 0; 17819 } 17820 } 17821 if (rack->r_ctl.crte && 17822 (tcp_hw_highest_rate(rack->r_ctl.crte) < rate_wanted)) { 17823 /* 17824 * We want more than the hardware can give us, 17825 * don't start any hw pacing. 17826 */ 17827 can_start_hw_pacing = 0; 17828 if (rack->r_rack_hw_rate_caps == 0) { 17829 /* 17830 * Ok we need to release it, we 17831 * want more than the card can give us and 17832 * no rate cap is in place. Set it up so 17833 * when we want less we can retry. 17834 */ 17835 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 17836 rack->r_ctl.crte = NULL; 17837 rack->rack_attempt_hdwr_pace = 0; 17838 rack->rack_hdrw_pacing = 0; 17839 } 17840 } 17841 if ((rack->r_ctl.crte != NULL) && (rack->rc_inp->inp_snd_tag == NULL)) { 17842 /* 17843 * We lost our rate somehow, this can happen 17844 * if the interface changed underneath us. 17845 */ 17846 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 17847 rack->r_ctl.crte = NULL; 17848 /* Lets re-allow attempting to setup pacing */ 17849 rack->rack_hdrw_pacing = 0; 17850 rack->rack_attempt_hdwr_pace = 0; 17851 rack_log_hdwr_pacing(rack, 17852 rate_wanted, bw_est, __LINE__, 17853 0, 6); 17854 } 17855 prev_fill = rack->r_via_fill_cw; 17856 if ((rack->rc_pace_to_cwnd) && 17857 (capped == 0) && 17858 (rack->dgp_on == 1) && 17859 (rack->use_fixed_rate == 0) && 17860 (rack->in_probe_rtt == 0) && 17861 (IN_FASTRECOVERY(rack->rc_tp->t_flags) == 0)) { 17862 /* 17863 * We want to pace at our rate *or* faster to 17864 * fill the cwnd to the max if its not full. 17865 */ 17866 slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz, &capped, &rate_wanted, 0); 17867 /* Re-check to make sure we are not exceeding our max b/w */ 17868 if ((rack->r_ctl.crte != NULL) && 17869 (tcp_hw_highest_rate(rack->r_ctl.crte) < rate_wanted)) { 17870 /* 17871 * We want more than the hardware can give us, 17872 * don't start any hw pacing. 17873 */ 17874 can_start_hw_pacing = 0; 17875 if (rack->r_rack_hw_rate_caps == 0) { 17876 /* 17877 * Ok we need to release it, we 17878 * want more than the card can give us and 17879 * no rate cap is in place. Set it up so 17880 * when we want less we can retry. 17881 */ 17882 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 17883 rack->r_ctl.crte = NULL; 17884 rack->rack_attempt_hdwr_pace = 0; 17885 rack->rack_hdrw_pacing = 0; 17886 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 17887 } 17888 } 17889 } 17890 if ((rack->rc_inp->inp_route.ro_nh != NULL) && 17891 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 17892 if ((rack->rack_hdw_pace_ena) && 17893 (can_start_hw_pacing > 0) && 17894 (rack->rack_hdrw_pacing == 0) && 17895 (rack->rack_attempt_hdwr_pace == 0)) { 17896 /* 17897 * Lets attempt to turn on hardware pacing 17898 * if we can. 
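				 * RS_PACING_GEQ asks the ratelimit code for a
				 * hardware rate at least as fast as rate_wanted,
				 * so a successful crte may pace slightly faster
				 * than requested; failures are counted in
				 * rack_hw_pace_init_fail below.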
				 */
				rack->rack_attempt_hdwr_pace = 1;
				rack->r_ctl.crte = tcp_set_pacing_rate(rack->rc_tp,
								       rack->rc_inp->inp_route.ro_nh->nh_ifp,
								       rate_wanted,
								       RS_PACING_GEQ,
								       &err, &rack->r_ctl.crte_prev_rate);
				if (rack->r_ctl.crte) {
					rack->rack_hdrw_pacing = 1;
					rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size_w_divisor(tp, rate_wanted, segsiz,
									    pace_one, rack->r_ctl.crte,
									    NULL, rack->r_ctl.pace_len_divisor);
					rack_log_hdwr_pacing(rack,
					    rate_wanted, rack->r_ctl.crte->rate, __LINE__,
					    err, 0);
					rack->r_ctl.last_hw_bw_req = rate_wanted;
				} else {
					counter_u64_add(rack_hw_pace_init_fail, 1);
				}
			} else if (rack->rack_hdrw_pacing &&
				   (rack->r_ctl.last_hw_bw_req != rate_wanted)) {
				/* Do we need to adjust our rate? */
				const struct tcp_hwrate_limit_table *nrte;

				if (rack->r_up_only &&
				    (rate_wanted < rack->r_ctl.crte->rate)) {
					/**
					 * We have four possible states here
					 * having to do with the previous time
					 * and this time.
					 *   previous  |  this-time
					 * A)    0     |     0   -- fill_cw not in the picture
					 * B)    1     |     0   -- we were doing a fill-cw but now are not
					 * C)    1     |     1   -- all rates from fill_cw
					 * D)    0     |     1   -- we were doing non-fill and now we are filling
					 *
					 * For cases A, C and D we don't allow a drop. But for
					 * case B, where we are now on our steady rate, we do
					 * allow a drop.
					 *
					 */
					if (!((prev_fill == 1) && (rack->r_via_fill_cw == 0)))
						goto done_w_hdwr;
				}
				if ((rate_wanted > rack->r_ctl.crte->rate) ||
				    (rate_wanted <= rack->r_ctl.crte_prev_rate)) {
					if (rack_hw_rate_to_low &&
					    (bw_est < rack_hw_rate_to_low)) {
						/*
						 * The pacing rate is too low for hardware, but
						 * do allow hardware pacing to be restarted.
						 */
						rack_log_hdwr_pacing(rack,
						    bw_est, rack->r_ctl.crte->rate, __LINE__,
						    0, 5);
						tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp);
						rack->r_ctl.crte = NULL;
						rack->rack_attempt_hdwr_pace = 0;
						rack->rack_hdrw_pacing = 0;
						rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted);
						goto done_w_hdwr;
					}
					nrte = tcp_chg_pacing_rate(rack->r_ctl.crte,
								   rack->rc_tp,
								   rack->rc_inp->inp_route.ro_nh->nh_ifp,
								   rate_wanted,
								   RS_PACING_GEQ,
								   &err, &rack->r_ctl.crte_prev_rate);
					if (nrte == NULL) {
						/*
						 * Lost the rate, let's drop hardware pacing
						 * period.
17971 */ 17972 rack->rack_hdrw_pacing = 0; 17973 rack->r_ctl.crte = NULL; 17974 rack_log_hdwr_pacing(rack, 17975 rate_wanted, 0, __LINE__, 17976 err, 1); 17977 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted); 17978 counter_u64_add(rack_hw_pace_lost, 1); 17979 } else if (nrte != rack->r_ctl.crte) { 17980 rack->r_ctl.crte = nrte; 17981 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size_w_divisor(tp, rate_wanted, 17982 segsiz, pace_one, rack->r_ctl.crte, 17983 NULL, rack->r_ctl.pace_len_divisor); 17984 rack_log_hdwr_pacing(rack, 17985 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 17986 err, 2); 17987 rack->r_ctl.last_hw_bw_req = rate_wanted; 17988 } 17989 } else { 17990 /* We just need to adjust the segment size */ 17991 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted); 17992 rack_log_hdwr_pacing(rack, 17993 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 17994 0, 4); 17995 rack->r_ctl.last_hw_bw_req = rate_wanted; 17996 } 17997 } 17998 } 17999 if (minslot && (minslot > slot)) { 18000 rack_log_pacing_delay_calc(rack, minslot, slot, rack->r_ctl.crte->rate, bw_est, lentim, 18001 98, __LINE__, NULL, 0); 18002 slot = minslot; 18003 } 18004 done_w_hdwr: 18005 if (rack_limit_time_with_srtt && 18006 (rack->use_fixed_rate == 0) && 18007 (rack->rack_hdrw_pacing == 0)) { 18008 /* 18009 * Sanity check, we do not allow the pacing delay 18010 * to be longer than the SRTT of the path. If it is 18011 * a slow path, then adding a packet should increase 18012 * the RTT and compensate for this i.e. the srtt will 18013 * be greater so the allowed pacing time will be greater. 18014 * 18015 * Note this restriction is not for where a peak rate 18016 * is set, we are doing fixed pacing or hardware pacing. 18017 */ 18018 if (rack->rc_tp->t_srtt) 18019 srtt = rack->rc_tp->t_srtt; 18020 else 18021 srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC; /* its in ms convert */ 18022 if (srtt < (uint64_t)slot) { 18023 rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL, 0); 18024 slot = srtt; 18025 } 18026 } 18027 /*******************************************************************/ 18028 /* RRS: We insert paced call to stats here for len and rate_wanted */ 18029 /*******************************************************************/ 18030 rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm, 0); 18031 } 18032 if (rack->r_ctl.crte && (rack->r_ctl.crte->rs_num_enobufs > 0)) { 18033 /* 18034 * If this rate is seeing enobufs when it 18035 * goes to send then either the nic is out 18036 * of gas or we are mis-estimating the time 18037 * somehow and not letting the queue empty 18038 * completely. Lets add to the pacing time. 18039 */ 18040 int hw_boost_delay; 18041 18042 hw_boost_delay = rack->r_ctl.crte->time_between * rack_enobuf_hw_boost_mult; 18043 if (hw_boost_delay > rack_enobuf_hw_max) 18044 hw_boost_delay = rack_enobuf_hw_max; 18045 else if (hw_boost_delay < rack_enobuf_hw_min) 18046 hw_boost_delay = rack_enobuf_hw_min; 18047 slot += hw_boost_delay; 18048 } 18049 return (slot); 18050 } 18051 18052 static void 18053 rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack, 18054 tcp_seq startseq, uint32_t sb_offset) 18055 { 18056 struct rack_sendmap *my_rsm = NULL; 18057 18058 if (tp->t_state < TCPS_ESTABLISHED) { 18059 /* 18060 * We don't start any measurements if we are 18061 * not at least established. 
18062 */ 18063 return; 18064 } 18065 if (tp->t_state >= TCPS_FIN_WAIT_1) { 18066 /* 18067 * We will get no more data into the SB 18068 * this means we need to have the data available 18069 * before we start a measurement. 18070 */ 18071 18072 if (sbavail(&tptosocket(tp)->so_snd) < 18073 max(rc_init_window(rack), 18074 (MIN_GP_WIN * ctf_fixed_maxseg(tp)))) { 18075 /* Nope not enough data */ 18076 return; 18077 } 18078 } 18079 tp->t_flags |= TF_GPUTINPROG; 18080 rack->r_ctl.rc_gp_cumack_ts = 0; 18081 rack->r_ctl.rc_gp_lowrtt = 0xffffffff; 18082 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 18083 tp->gput_seq = startseq; 18084 rack->app_limited_needs_set = 0; 18085 if (rack->in_probe_rtt) 18086 rack->measure_saw_probe_rtt = 1; 18087 else if ((rack->measure_saw_probe_rtt) && 18088 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 18089 rack->measure_saw_probe_rtt = 0; 18090 if (rack->rc_gp_filled) 18091 tp->gput_ts = rack->r_ctl.last_cumack_advance; 18092 else { 18093 /* Special case initial measurement */ 18094 struct timeval tv; 18095 18096 tp->gput_ts = tcp_get_usecs(&tv); 18097 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); 18098 } 18099 /* 18100 * We take a guess out into the future, 18101 * if we have no measurement and no 18102 * initial rate, we measure the first 18103 * initial-windows worth of data to 18104 * speed up getting some GP measurement and 18105 * thus start pacing. 18106 */ 18107 if ((rack->rc_gp_filled == 0) && (rack->r_ctl.init_rate == 0)) { 18108 rack->app_limited_needs_set = 1; 18109 tp->gput_ack = startseq + max(rc_init_window(rack), 18110 (MIN_GP_WIN * ctf_fixed_maxseg(tp))); 18111 rack_log_pacing_delay_calc(rack, 18112 tp->gput_seq, 18113 tp->gput_ack, 18114 0, 18115 tp->gput_ts, 18116 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts), 18117 9, 18118 __LINE__, NULL, 0); 18119 rack_tend_gp_marks(tp, rack); 18120 rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL); 18121 return; 18122 } 18123 if (sb_offset) { 18124 /* 18125 * We are out somewhere in the sb 18126 * can we use the already outstanding data? 18127 */ 18128 18129 if (rack->r_ctl.rc_app_limited_cnt == 0) { 18130 /* 18131 * Yes first one is good and in this case 18132 * the tp->gput_ts is correctly set based on 18133 * the last ack that arrived (no need to 18134 * set things up when an ack comes in). 18135 */ 18136 my_rsm = tqhash_min(rack->r_ctl.tqh); 18137 if ((my_rsm == NULL) || 18138 (my_rsm->r_rtr_cnt != 1)) { 18139 /* retransmission? */ 18140 goto use_latest; 18141 } 18142 } else { 18143 if (rack->r_ctl.rc_first_appl == NULL) { 18144 /* 18145 * If rc_first_appl is NULL 18146 * then the cnt should be 0. 18147 * This is probably an error, maybe 18148 * a KASSERT would be approprate. 18149 */ 18150 goto use_latest; 18151 } 18152 /* 18153 * If we have a marker pointer to the last one that is 18154 * app limited we can use that, but we need to set 18155 * things up so that when it gets ack'ed we record 18156 * the ack time (if its not already acked). 18157 */ 18158 rack->app_limited_needs_set = 1; 18159 /* 18160 * We want to get to the rsm that is either 18161 * next with space i.e. over 1 MSS or the one 18162 * after that (after the app-limited). 
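			 * (If the block right after the app-limited one is no
			 *  more than one MSS we skip over it and use the one
			 *  after that; otherwise we start the measurement one
			 *  MSS into it, see start_set below.)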
18163 */ 18164 my_rsm = tqhash_next(rack->r_ctl.tqh, rack->r_ctl.rc_first_appl); 18165 if (my_rsm) { 18166 if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp)) 18167 /* Have to use the next one */ 18168 my_rsm = tqhash_next(rack->r_ctl.tqh, my_rsm); 18169 else { 18170 /* Use after the first MSS of it is acked */ 18171 tp->gput_seq = my_rsm->r_start + ctf_fixed_maxseg(tp); 18172 goto start_set; 18173 } 18174 } 18175 if ((my_rsm == NULL) || 18176 (my_rsm->r_rtr_cnt != 1)) { 18177 /* 18178 * Either its a retransmit or 18179 * the last is the app-limited one. 18180 */ 18181 goto use_latest; 18182 } 18183 } 18184 tp->gput_seq = my_rsm->r_start; 18185 start_set: 18186 if (my_rsm->r_flags & RACK_ACKED) { 18187 /* 18188 * This one has been acked use the arrival ack time 18189 */ 18190 struct rack_sendmap *nrsm; 18191 18192 tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival; 18193 rack->app_limited_needs_set = 0; 18194 /* 18195 * Ok in this path we need to use the r_end now 18196 * since this guy is the starting ack. 18197 */ 18198 tp->gput_seq = my_rsm->r_end; 18199 /* 18200 * We also need to adjust up the sendtime 18201 * to the send of the next data after my_rsm. 18202 */ 18203 nrsm = tqhash_next(rack->r_ctl.tqh, my_rsm); 18204 if (nrsm != NULL) 18205 my_rsm = nrsm; 18206 else { 18207 /* 18208 * The next as not been sent, thats the 18209 * case for using the latest. 18210 */ 18211 goto use_latest; 18212 } 18213 } 18214 rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[0]; 18215 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 18216 rack->r_ctl.rc_gp_cumack_ts = 0; 18217 rack_log_pacing_delay_calc(rack, 18218 tp->gput_seq, 18219 tp->gput_ack, 18220 (uint64_t)my_rsm, 18221 tp->gput_ts, 18222 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts), 18223 9, 18224 __LINE__, my_rsm, 0); 18225 /* Now lets make sure all are marked as they should be */ 18226 rack_tend_gp_marks(tp, rack); 18227 rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL); 18228 return; 18229 } 18230 18231 use_latest: 18232 /* 18233 * We don't know how long we may have been 18234 * idle or if this is the first-send. Lets 18235 * setup the flag so we will trim off 18236 * the first ack'd data so we get a true 18237 * measurement. 18238 */ 18239 rack->app_limited_needs_set = 1; 18240 tp->gput_ack = startseq + rack_get_measure_window(tp, rack); 18241 rack->r_ctl.rc_gp_cumack_ts = 0; 18242 /* Find this guy so we can pull the send time */ 18243 my_rsm = tqhash_find(rack->r_ctl.tqh, startseq); 18244 if (my_rsm) { 18245 rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[0]; 18246 if (my_rsm->r_flags & RACK_ACKED) { 18247 /* 18248 * Unlikely since its probably what was 18249 * just transmitted (but I am paranoid). 18250 */ 18251 tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival; 18252 rack->app_limited_needs_set = 0; 18253 } 18254 if (SEQ_LT(my_rsm->r_start, tp->gput_seq)) { 18255 /* This also is unlikely */ 18256 tp->gput_seq = my_rsm->r_start; 18257 } 18258 } else { 18259 /* 18260 * TSNH unless we have some send-map limit, 18261 * and even at that it should not be hitting 18262 * that limit (we should have stopped sending). 
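		 * (TSNH == "this should not happen".)  With no rsm to pull a
		 * send time from, the code below falls back to the wall clock
		 * for the output timestamp.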
18263 */ 18264 struct timeval tv; 18265 18266 microuptime(&tv); 18267 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); 18268 } 18269 rack_tend_gp_marks(tp, rack); 18270 rack_log_pacing_delay_calc(rack, 18271 tp->gput_seq, 18272 tp->gput_ack, 18273 (uint64_t)my_rsm, 18274 tp->gput_ts, 18275 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts), 18276 9, __LINE__, NULL, 0); 18277 rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL); 18278 } 18279 18280 static inline uint32_t 18281 rack_what_can_we_send(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cwnd_to_use, 18282 uint32_t avail, int32_t sb_offset) 18283 { 18284 uint32_t len; 18285 uint32_t sendwin; 18286 18287 if (tp->snd_wnd > cwnd_to_use) 18288 sendwin = cwnd_to_use; 18289 else 18290 sendwin = tp->snd_wnd; 18291 if (ctf_outstanding(tp) >= tp->snd_wnd) { 18292 /* We never want to go over our peers rcv-window */ 18293 len = 0; 18294 } else { 18295 uint32_t flight; 18296 18297 flight = ctf_flight_size(tp, rack->r_ctl.rc_sacked); 18298 if (flight >= sendwin) { 18299 /* 18300 * We have in flight what we are allowed by cwnd (if 18301 * it was rwnd blocking it would have hit above out 18302 * >= tp->snd_wnd). 18303 */ 18304 return (0); 18305 } 18306 len = sendwin - flight; 18307 if ((len + ctf_outstanding(tp)) > tp->snd_wnd) { 18308 /* We would send too much (beyond the rwnd) */ 18309 len = tp->snd_wnd - ctf_outstanding(tp); 18310 } 18311 if ((len + sb_offset) > avail) { 18312 /* 18313 * We don't have that much in the SB, how much is 18314 * there? 18315 */ 18316 len = avail - sb_offset; 18317 } 18318 } 18319 return (len); 18320 } 18321 18322 static void 18323 rack_log_fsb(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t flags, 18324 unsigned ipoptlen, int32_t orig_len, int32_t len, int error, 18325 int rsm_is_null, int optlen, int line, uint16_t mode) 18326 { 18327 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 18328 union tcp_log_stackspecific log; 18329 struct timeval tv; 18330 18331 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 18332 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 18333 log.u_bbr.flex1 = error; 18334 log.u_bbr.flex2 = flags; 18335 log.u_bbr.flex3 = rsm_is_null; 18336 log.u_bbr.flex4 = ipoptlen; 18337 log.u_bbr.flex5 = tp->rcv_numsacks; 18338 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 18339 log.u_bbr.flex7 = optlen; 18340 log.u_bbr.flex8 = rack->r_fsb_inited; 18341 log.u_bbr.applimited = rack->r_fast_output; 18342 log.u_bbr.bw_inuse = rack_get_bw(rack); 18343 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); 18344 log.u_bbr.cwnd_gain = mode; 18345 log.u_bbr.pkts_out = orig_len; 18346 log.u_bbr.lt_epoch = len; 18347 log.u_bbr.delivered = line; 18348 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 18349 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 18350 tcp_log_event(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_FSB, 0, 18351 len, &log, false, NULL, __func__, __LINE__, &tv); 18352 } 18353 } 18354 18355 18356 static struct mbuf * 18357 rack_fo_base_copym(struct mbuf *the_m, uint32_t the_off, int32_t *plen, 18358 struct rack_fast_send_blk *fsb, 18359 int32_t seglimit, int32_t segsize, int hw_tls) 18360 { 18361 #ifdef KERN_TLS 18362 struct ktls_session *tls, *ntls; 18363 #ifdef INVARIANTS 18364 struct mbuf *start; 18365 #endif 18366 #endif 18367 struct mbuf *m, *n, **np, *smb; 18368 struct mbuf *top; 18369 int32_t off, soff; 18370 int32_t len = *plen; 18371 int32_t fragsize; 18372 int32_t len_cp = 0; 18373 uint32_t mlen, 
frags; 18374 18375 soff = off = the_off; 18376 smb = m = the_m; 18377 np = ⊤ 18378 top = NULL; 18379 #ifdef KERN_TLS 18380 if (hw_tls && (m->m_flags & M_EXTPG)) 18381 tls = m->m_epg_tls; 18382 else 18383 tls = NULL; 18384 #ifdef INVARIANTS 18385 start = m; 18386 #endif 18387 #endif 18388 while (len > 0) { 18389 if (m == NULL) { 18390 *plen = len_cp; 18391 break; 18392 } 18393 #ifdef KERN_TLS 18394 if (hw_tls) { 18395 if (m->m_flags & M_EXTPG) 18396 ntls = m->m_epg_tls; 18397 else 18398 ntls = NULL; 18399 18400 /* 18401 * Avoid mixing TLS records with handshake 18402 * data or TLS records from different 18403 * sessions. 18404 */ 18405 if (tls != ntls) { 18406 MPASS(m != start); 18407 *plen = len_cp; 18408 break; 18409 } 18410 } 18411 #endif 18412 mlen = min(len, m->m_len - off); 18413 if (seglimit) { 18414 /* 18415 * For M_EXTPG mbufs, add 3 segments 18416 * + 1 in case we are crossing page boundaries 18417 * + 2 in case the TLS hdr/trailer are used 18418 * It is cheaper to just add the segments 18419 * than it is to take the cache miss to look 18420 * at the mbuf ext_pgs state in detail. 18421 */ 18422 if (m->m_flags & M_EXTPG) { 18423 fragsize = min(segsize, PAGE_SIZE); 18424 frags = 3; 18425 } else { 18426 fragsize = segsize; 18427 frags = 0; 18428 } 18429 18430 /* Break if we really can't fit anymore. */ 18431 if ((frags + 1) >= seglimit) { 18432 *plen = len_cp; 18433 break; 18434 } 18435 18436 /* 18437 * Reduce size if you can't copy the whole 18438 * mbuf. If we can't copy the whole mbuf, also 18439 * adjust len so the loop will end after this 18440 * mbuf. 18441 */ 18442 if ((frags + howmany(mlen, fragsize)) >= seglimit) { 18443 mlen = (seglimit - frags - 1) * fragsize; 18444 len = mlen; 18445 *plen = len_cp + len; 18446 } 18447 frags += howmany(mlen, fragsize); 18448 if (frags == 0) 18449 frags++; 18450 seglimit -= frags; 18451 KASSERT(seglimit > 0, 18452 ("%s: seglimit went too low", __func__)); 18453 } 18454 n = m_get(M_NOWAIT, m->m_type); 18455 *np = n; 18456 if (n == NULL) 18457 goto nospace; 18458 n->m_len = mlen; 18459 soff += mlen; 18460 len_cp += n->m_len; 18461 if (m->m_flags & (M_EXT | M_EXTPG)) { 18462 n->m_data = m->m_data + off; 18463 mb_dupcl(n, m); 18464 } else { 18465 bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), 18466 (u_int)n->m_len); 18467 } 18468 len -= n->m_len; 18469 off = 0; 18470 m = m->m_next; 18471 np = &n->m_next; 18472 if (len || (soff == smb->m_len)) { 18473 /* 18474 * We have more so we move forward or 18475 * we have consumed the entire mbuf and 18476 * len has fell to 0. 18477 */ 18478 soff = 0; 18479 smb = m; 18480 } 18481 18482 } 18483 if (fsb != NULL) { 18484 fsb->m = smb; 18485 fsb->off = soff; 18486 if (smb) { 18487 /* 18488 * Save off the size of the mbuf. We do 18489 * this so that we can recognize when it 18490 * has been trimmed by sbcut() as acks 18491 * come in. 18492 */ 18493 fsb->o_m_len = smb->m_len; 18494 fsb->o_t_len = M_TRAILINGROOM(smb); 18495 } else { 18496 /* 18497 * This is the case where the next mbuf went to NULL. This 18498 * means with this copy we have sent everything in the sb. 18499 * In theory we could clear the fast_output flag, but lets 18500 * not since its possible that we could get more added 18501 * and acks that call the extend function which would let 18502 * us send more. 
18503 */ 18504 fsb->o_m_len = 0; 18505 fsb->o_t_len = 0; 18506 } 18507 } 18508 return (top); 18509 nospace: 18510 if (top) 18511 m_freem(top); 18512 return (NULL); 18513 18514 } 18515 18516 /* 18517 * This is a copy of m_copym(), taking the TSO segment size/limit 18518 * constraints into account, and advancing the sndptr as it goes. 18519 */ 18520 static struct mbuf * 18521 rack_fo_m_copym(struct tcp_rack *rack, int32_t *plen, 18522 int32_t seglimit, int32_t segsize, struct mbuf **s_mb, int *s_soff) 18523 { 18524 struct mbuf *m, *n; 18525 int32_t soff; 18526 18527 m = rack->r_ctl.fsb.m; 18528 if (M_TRAILINGROOM(m) != rack->r_ctl.fsb.o_t_len) { 18529 /* 18530 * The trailing space changed, mbufs can grow 18531 * at the tail but they can't shrink from 18532 * it, KASSERT that. Adjust the orig_m_len to 18533 * compensate for this change. 18534 */ 18535 KASSERT((rack->r_ctl.fsb.o_t_len > M_TRAILINGROOM(m)), 18536 ("mbuf:%p rack:%p trailing_space:%jd ots:%u oml:%u mlen:%u\n", 18537 m, 18538 rack, 18539 (intmax_t)M_TRAILINGROOM(m), 18540 rack->r_ctl.fsb.o_t_len, 18541 rack->r_ctl.fsb.o_m_len, 18542 m->m_len)); 18543 rack->r_ctl.fsb.o_m_len += (rack->r_ctl.fsb.o_t_len - M_TRAILINGROOM(m)); 18544 rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(m); 18545 } 18546 if (m->m_len < rack->r_ctl.fsb.o_m_len) { 18547 /* 18548 * Mbuf shrank, trimmed off the top by an ack, our 18549 * offset changes. 18550 */ 18551 KASSERT((rack->r_ctl.fsb.off >= (rack->r_ctl.fsb.o_m_len - m->m_len)), 18552 ("mbuf:%p len:%u rack:%p oml:%u soff:%u\n", 18553 m, m->m_len, 18554 rack, rack->r_ctl.fsb.o_m_len, 18555 rack->r_ctl.fsb.off)); 18556 18557 if (rack->r_ctl.fsb.off >= (rack->r_ctl.fsb.o_m_len- m->m_len)) 18558 rack->r_ctl.fsb.off -= (rack->r_ctl.fsb.o_m_len - m->m_len); 18559 else 18560 rack->r_ctl.fsb.off = 0; 18561 rack->r_ctl.fsb.o_m_len = m->m_len; 18562 #ifdef INVARIANTS 18563 } else if (m->m_len > rack->r_ctl.fsb.o_m_len) { 18564 panic("rack:%p m:%p m_len grew outside of t_space compensation", 18565 rack, m); 18566 #endif 18567 } 18568 soff = rack->r_ctl.fsb.off; 18569 KASSERT(soff >= 0, ("%s, negative off %d", __FUNCTION__, soff)); 18570 KASSERT(*plen >= 0, ("%s, negative len %d", __FUNCTION__, *plen)); 18571 KASSERT(soff < m->m_len, ("%s rack:%p len:%u m:%p m->m_len:%u < off?", 18572 __FUNCTION__, 18573 rack, *plen, m, m->m_len)); 18574 /* Save off the right location before we copy and advance */ 18575 *s_soff = soff; 18576 *s_mb = rack->r_ctl.fsb.m; 18577 n = rack_fo_base_copym(m, soff, plen, 18578 &rack->r_ctl.fsb, 18579 seglimit, segsize, rack->r_ctl.fsb.hw_tls); 18580 return (n); 18581 } 18582 18583 /* Log the buffer level */ 18584 static void 18585 rack_log_queue_level(struct tcpcb *tp, struct tcp_rack *rack, 18586 int len, struct timeval *tv, 18587 uint32_t cts) 18588 { 18589 uint32_t p_rate = 0, p_queue = 0, err = 0; 18590 union tcp_log_stackspecific log; 18591 18592 #ifdef RATELIMIT 18593 err = in_pcbquery_txrlevel(rack->rc_inp, &p_queue); 18594 err = in_pcbquery_txrtlmt(rack->rc_inp, &p_rate); 18595 #endif 18596 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 18597 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 18598 log.u_bbr.flex1 = p_rate; 18599 log.u_bbr.flex2 = p_queue; 18600 log.u_bbr.flex4 = (uint32_t)rack->r_ctl.crte->using; 18601 log.u_bbr.flex5 = (uint32_t)rack->r_ctl.crte->rs_num_enobufs; 18602 log.u_bbr.flex6 = rack->r_ctl.crte->time_between; 18603 log.u_bbr.flex7 = 99; 18604 log.u_bbr.flex8 = 0; 18605 log.u_bbr.pkts_out = err; 18606 log.u_bbr.delRate = rack->r_ctl.crte->rate; 18607 log.u_bbr.timeStamp = 
cts;
18608 	log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
18609 	tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_HDWR_PACE, 0,
18610 	    len, &log, false, NULL, __func__, __LINE__, tv);
18611 
18612 }
18613 
18614 static uint32_t
18615 rack_check_queue_level(struct tcp_rack *rack, struct tcpcb *tp,
18616     struct timeval *tv, uint32_t cts, int len, uint32_t segsiz)
18617 {
18618 	uint64_t lentime = 0;
18619 #ifdef RATELIMIT
18620 	uint32_t p_rate = 0, p_queue = 0, err;
18621 	union tcp_log_stackspecific log;
18622 	uint64_t bw;
18623 
18624 	err = in_pcbquery_txrlevel(rack->rc_inp, &p_queue);
18625 	/* Failed or queue is zero */
18626 	if (err || (p_queue == 0)) {
18627 		lentime = 0;
18628 		goto out;
18629 	}
18630 	err = in_pcbquery_txrtlmt(rack->rc_inp, &p_rate);
18631 	if (err) {
18632 		lentime = 0;
18633 		goto out;
18634 	}
18635 	/*
18636 	 * If we reach here we have some bytes in
18637 	 * the queue. The number returned is a value
18638 	 * between 0 and 0xffff where ffff is full
18639 	 * and 0 is empty. So how best to make this into
18640 	 * something usable?
18641 	 *
18642 	 * The "safer" way is to take the b/w gotten
18643 	 * from the query (which should be our b/w rate)
18644 	 * and pretend that a full send (our rc_pace_max_segs)
18645 	 * is outstanding. We convert that full send into a count
18646 	 * of MSS-sized segments, charge each as a full ethernet
18647 	 * segment, and compute how long that takes to drain at that rate.
18648 	 */
18649 	bw = p_rate / 8;
18650 	if (bw) {
18651 		lentime = (rack->r_ctl.rc_pace_max_segs / segsiz);
18652 		lentime *= ETHERNET_SEGMENT_SIZE;
18653 		lentime *= (uint64_t)HPTS_USEC_IN_SEC;
18654 		lentime /= bw;
18655 	} else {
18656 		/* TSNH -- KASSERT? */
18657 		lentime = 0;
18658 	}
18659 out:
18660 	if (tcp_bblogging_on(tp)) {
18661 		memset(&log, 0, sizeof(log));
18662 		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
18663 		log.u_bbr.flex1 = p_rate;
18664 		log.u_bbr.flex2 = p_queue;
18665 		log.u_bbr.flex4 = (uint32_t)rack->r_ctl.crte->using;
18666 		log.u_bbr.flex5 = (uint32_t)rack->r_ctl.crte->rs_num_enobufs;
18667 		log.u_bbr.flex6 = rack->r_ctl.crte->time_between;
18668 		log.u_bbr.flex7 = 99;
18669 		log.u_bbr.flex8 = 0;
18670 		log.u_bbr.pkts_out = err;
18671 		log.u_bbr.delRate = rack->r_ctl.crte->rate;
18672 		log.u_bbr.cur_del_rate = lentime;
18673 		log.u_bbr.timeStamp = cts;
18674 		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
18675 		tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_HDWR_PACE, 0,
18676 		    len, &log, false, NULL, __func__, __LINE__, tv);
18677 	}
18678 #endif
18679 	return ((uint32_t)lentime);
18680 }
18681 
18682 static int
18683 rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm,
18684     uint64_t ts_val, uint32_t cts, uint32_t ms_cts, struct timeval *tv, int len, uint8_t doing_tlp)
18685 {
18686 	/*
18687 	 * Enter the fast retransmit path. We are given that a sched_pin is
18688 	 * in place (if accounting is compiled in) and the cycle count taken
18689 	 * at the entry is in ts_val. The concept here is that the rsm
18690 	 * now holds the mbuf offsets and such so we can directly transmit
18691 	 * without a lot of overhead; the len field is already set for
18692 	 * us to prohibit us from sending too much (usually it is one MSS).
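	 * The body below rebuilds the headers from the pre-formed fast send
	 * block (r_ctl.fsb), clones the payload from the rsm's saved
	 * mbuf/offset, fills in the checksums and hands the packet straight
	 * to ip_output()/ip6_output(), then restarts the pacing timer.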
18693 	 */
18694 	struct ip *ip = NULL;
18695 	struct udphdr *udp = NULL;
18696 	struct tcphdr *th = NULL;
18697 	struct mbuf *m = NULL;
18698 	struct inpcb *inp;
18699 	uint8_t *cpto;
18700 	struct tcp_log_buffer *lgb;
18701 #ifdef TCP_ACCOUNTING
18702 	uint64_t crtsc;
18703 	int cnt_thru = 1;
18704 #endif
18705 	struct tcpopt to;
18706 	u_char opt[TCP_MAXOLEN];
18707 	uint32_t hdrlen, optlen;
18708 	int32_t slot, segsiz, max_val, tso = 0, error = 0, ulen = 0;
18709 	uint16_t flags;
18710 	uint32_t if_hw_tsomaxsegcount = 0, startseq;
18711 	uint32_t if_hw_tsomaxsegsize;
18712 	int32_t ip_sendflag = IP_NO_SND_TAG_RL;
18713 
18714 #ifdef INET6
18715 	struct ip6_hdr *ip6 = NULL;
18716 
18717 	if (rack->r_is_v6) {
18718 		ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
18719 		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
18720 	} else
18721 #endif /* INET6 */
18722 	{
18723 		ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
18724 		hdrlen = sizeof(struct tcpiphdr);
18725 	}
18726 	if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) {
18727 		goto failed;
18728 	}
18729 	if (doing_tlp) {
18730 		/* It's a TLP, add the flag; it may already be there but be sure */
18731 		rsm->r_flags |= RACK_TLP;
18732 	} else {
18733 		/* If it was a TLP it is not one on this retransmit */
18734 		rsm->r_flags &= ~RACK_TLP;
18735 	}
18736 	startseq = rsm->r_start;
18737 	segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
18738 	inp = rack->rc_inp;
18739 	to.to_flags = 0;
18740 	flags = tcp_outflags[tp->t_state];
18741 	if (flags & (TH_SYN|TH_RST)) {
18742 		goto failed;
18743 	}
18744 	if (rsm->r_flags & RACK_HAS_FIN) {
18745 		/* We can't send a FIN here */
18746 		goto failed;
18747 	}
18748 	if (flags & TH_FIN) {
18749 		/* We never send a FIN */
18750 		flags &= ~TH_FIN;
18751 	}
18752 	if (tp->t_flags & TF_RCVD_TSTMP) {
18753 		to.to_tsval = ms_cts + tp->ts_offset;
18754 		to.to_tsecr = tp->ts_recent;
18755 		to.to_flags = TOF_TS;
18756 	}
18757 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
18758 	/* TCP-MD5 (RFC2385).
*/ 18759 if (tp->t_flags & TF_SIGNATURE) 18760 to.to_flags |= TOF_SIGNATURE; 18761 #endif 18762 optlen = tcp_addoptions(&to, opt); 18763 hdrlen += optlen; 18764 udp = rack->r_ctl.fsb.udp; 18765 if (udp) 18766 hdrlen += sizeof(struct udphdr); 18767 if (rack->r_ctl.rc_pace_max_segs) 18768 max_val = rack->r_ctl.rc_pace_max_segs; 18769 else if (rack->rc_user_set_max_segs) 18770 max_val = rack->rc_user_set_max_segs * segsiz; 18771 else 18772 max_val = len; 18773 if ((tp->t_flags & TF_TSO) && 18774 V_tcp_do_tso && 18775 (len > segsiz) && 18776 (tp->t_port == 0)) 18777 tso = 1; 18778 #ifdef INET6 18779 if (MHLEN < hdrlen + max_linkhdr) 18780 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 18781 else 18782 #endif 18783 m = m_gethdr(M_NOWAIT, MT_DATA); 18784 if (m == NULL) 18785 goto failed; 18786 m->m_data += max_linkhdr; 18787 m->m_len = hdrlen; 18788 th = rack->r_ctl.fsb.th; 18789 /* Establish the len to send */ 18790 if (len > max_val) 18791 len = max_val; 18792 if ((tso) && (len + optlen > segsiz)) { 18793 uint32_t if_hw_tsomax; 18794 int32_t max_len; 18795 18796 /* extract TSO information */ 18797 if_hw_tsomax = tp->t_tsomax; 18798 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 18799 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 18800 /* 18801 * Check if we should limit by maximum payload 18802 * length: 18803 */ 18804 if (if_hw_tsomax != 0) { 18805 /* compute maximum TSO length */ 18806 max_len = (if_hw_tsomax - hdrlen - 18807 max_linkhdr); 18808 if (max_len <= 0) { 18809 goto failed; 18810 } else if (len > max_len) { 18811 len = max_len; 18812 } 18813 } 18814 if (len <= segsiz) { 18815 /* 18816 * In case there are too many small fragments don't 18817 * use TSO: 18818 */ 18819 tso = 0; 18820 } 18821 } else { 18822 tso = 0; 18823 } 18824 if ((tso == 0) && (len > segsiz)) 18825 len = segsiz; 18826 (void)tcp_get_usecs(tv); 18827 if ((len == 0) || 18828 (len <= MHLEN - hdrlen - max_linkhdr)) { 18829 goto failed; 18830 } 18831 th->th_seq = htonl(rsm->r_start); 18832 th->th_ack = htonl(tp->rcv_nxt); 18833 /* 18834 * The PUSH bit should only be applied 18835 * if the full retransmission is made. If 18836 * we are sending less than this is the 18837 * left hand edge and should not have 18838 * the PUSH bit. 
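	 * In other words, a partial retransmit covers only the left hand
	 * edge of the original send, so it goes out without PUSH.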
18839 */ 18840 if ((rsm->r_flags & RACK_HAD_PUSH) && 18841 (len == (rsm->r_end - rsm->r_start))) 18842 flags |= TH_PUSH; 18843 th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale)); 18844 if (th->th_win == 0) { 18845 tp->t_sndzerowin++; 18846 tp->t_flags |= TF_RXWIN0SENT; 18847 } else 18848 tp->t_flags &= ~TF_RXWIN0SENT; 18849 if (rsm->r_flags & RACK_TLP) { 18850 /* 18851 * TLP should not count in retran count, but 18852 * in its own bin 18853 */ 18854 counter_u64_add(rack_tlp_retran, 1); 18855 counter_u64_add(rack_tlp_retran_bytes, len); 18856 } else { 18857 tp->t_sndrexmitpack++; 18858 KMOD_TCPSTAT_INC(tcps_sndrexmitpack); 18859 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len); 18860 } 18861 #ifdef STATS 18862 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 18863 len); 18864 #endif 18865 if (rsm->m == NULL) 18866 goto failed; 18867 if (rsm->m && 18868 ((rsm->orig_m_len != rsm->m->m_len) || 18869 (M_TRAILINGROOM(rsm->m) != rsm->orig_t_space))) { 18870 /* Fix up the orig_m_len and possibly the mbuf offset */ 18871 rack_adjust_orig_mlen(rsm); 18872 } 18873 m->m_next = rack_fo_base_copym(rsm->m, rsm->soff, &len, NULL, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, rsm->r_hw_tls); 18874 if (len <= segsiz) { 18875 /* 18876 * Must have ran out of mbufs for the copy 18877 * shorten it to no longer need tso. Lets 18878 * not put on sendalot since we are low on 18879 * mbufs. 18880 */ 18881 tso = 0; 18882 } 18883 if ((m->m_next == NULL) || (len <= 0)){ 18884 goto failed; 18885 } 18886 if (udp) { 18887 if (rack->r_is_v6) 18888 ulen = hdrlen + len - sizeof(struct ip6_hdr); 18889 else 18890 ulen = hdrlen + len - sizeof(struct ip); 18891 udp->uh_ulen = htons(ulen); 18892 } 18893 m->m_pkthdr.rcvif = (struct ifnet *)0; 18894 if (TCPS_HAVERCVDSYN(tp->t_state) && 18895 (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { 18896 int ect = tcp_ecn_output_established(tp, &flags, len, true); 18897 if ((tp->t_state == TCPS_SYN_RECEIVED) && 18898 (tp->t_flags2 & TF2_ECN_SND_ECE)) 18899 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 18900 #ifdef INET6 18901 if (rack->r_is_v6) { 18902 ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20); 18903 ip6->ip6_flow |= htonl(ect << 20); 18904 } 18905 else 18906 #endif 18907 { 18908 ip->ip_tos &= ~IPTOS_ECN_MASK; 18909 ip->ip_tos |= ect; 18910 } 18911 } 18912 if (rack->r_ctl.crte != NULL) { 18913 /* See if we can send via the hw queue */ 18914 slot = rack_check_queue_level(rack, tp, tv, cts, len, segsiz); 18915 /* If there is nothing in queue (no pacing time) we can send via the hw queue */ 18916 if (slot == 0) 18917 ip_sendflag = 0; 18918 } 18919 tcp_set_flags(th, flags); 18920 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 18921 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 18922 if (to.to_flags & TOF_SIGNATURE) { 18923 /* 18924 * Calculate MD5 signature and put it into the place 18925 * determined before. 18926 * NOTE: since TCP options buffer doesn't point into 18927 * mbuf's data, calculate offset and use it. 18928 */ 18929 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, 18930 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { 18931 /* 18932 * Do not send segment if the calculation of MD5 18933 * digest has failed. 
18934 */ 18935 goto failed; 18936 } 18937 } 18938 #endif 18939 #ifdef INET6 18940 if (rack->r_is_v6) { 18941 if (tp->t_port) { 18942 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 18943 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 18944 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 18945 th->th_sum = htons(0); 18946 UDPSTAT_INC(udps_opackets); 18947 } else { 18948 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 18949 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 18950 th->th_sum = in6_cksum_pseudo(ip6, 18951 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 18952 0); 18953 } 18954 } 18955 #endif 18956 #if defined(INET6) && defined(INET) 18957 else 18958 #endif 18959 #ifdef INET 18960 { 18961 if (tp->t_port) { 18962 m->m_pkthdr.csum_flags = CSUM_UDP; 18963 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 18964 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 18965 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 18966 th->th_sum = htons(0); 18967 UDPSTAT_INC(udps_opackets); 18968 } else { 18969 m->m_pkthdr.csum_flags = CSUM_TCP; 18970 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 18971 th->th_sum = in_pseudo(ip->ip_src.s_addr, 18972 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 18973 IPPROTO_TCP + len + optlen)); 18974 } 18975 /* IP version must be set here for ipv4/ipv6 checking later */ 18976 KASSERT(ip->ip_v == IPVERSION, 18977 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 18978 } 18979 #endif 18980 if (tso) { 18981 /* 18982 * Here we use segsiz since we have no added options besides 18983 * any standard timestamp options (no DSACKs or SACKS are sent 18984 * via either fast-path). 18985 */ 18986 KASSERT(len > segsiz, 18987 ("%s: len <= tso_segsz tp:%p", __func__, tp)); 18988 m->m_pkthdr.csum_flags |= CSUM_TSO; 18989 m->m_pkthdr.tso_segsz = segsiz; 18990 } 18991 #ifdef INET6 18992 if (rack->r_is_v6) { 18993 ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit; 18994 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 18995 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 18996 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 18997 else 18998 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 18999 } 19000 #endif 19001 #if defined(INET) && defined(INET6) 19002 else 19003 #endif 19004 #ifdef INET 19005 { 19006 ip->ip_len = htons(m->m_pkthdr.len); 19007 ip->ip_ttl = rack->r_ctl.fsb.hoplimit; 19008 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 19009 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 19010 if (tp->t_port == 0 || len < V_tcp_minmss) { 19011 ip->ip_off |= htons(IP_DF); 19012 } 19013 } else { 19014 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 19015 } 19016 } 19017 #endif 19018 if (doing_tlp == 0) { 19019 /* Set we retransmitted */ 19020 rack->rc_gp_saw_rec = 1; 19021 } else { 19022 /* Its a TLP set ca or ss */ 19023 if (tp->snd_cwnd > tp->snd_ssthresh) { 19024 /* Set we sent in CA */ 19025 rack->rc_gp_saw_ca = 1; 19026 } else { 19027 /* Set we sent in SS */ 19028 rack->rc_gp_saw_ss = 1; 19029 } 19030 } 19031 /* Time to copy in our header */ 19032 cpto = mtod(m, uint8_t *); 19033 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); 19034 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr)); 19035 if (optlen) { 19036 bcopy(opt, th + 1, optlen); 19037 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 19038 } else { 19039 th->th_off = sizeof(struct tcphdr) >> 2; 19040 } 19041 if (tcp_bblogging_on(rack->rc_tp)) { 19042 union tcp_log_stackspecific log; 19043 19044 if (rsm->r_flags & RACK_RWND_COLLAPSED) { 19045 
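			/*
			 * This retransmission covers data that sat beyond the
			 * peer's collapsed window; log it and bump the
			 * collapsed-window retransmit counters.
			 */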
rack_log_collapse(rack, rsm->r_start, rsm->r_end, 0, __LINE__, 5, rsm->r_flags, rsm); 19046 counter_u64_add(rack_collapsed_win_rxt, 1); 19047 counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start)); 19048 } 19049 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 19050 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 19051 if (rack->rack_no_prr) 19052 log.u_bbr.flex1 = 0; 19053 else 19054 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 19055 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 19056 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 19057 log.u_bbr.flex4 = max_val; 19058 /* Save off the early/late values */ 19059 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 19060 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 19061 log.u_bbr.bw_inuse = rack_get_bw(rack); 19062 log.u_bbr.cur_del_rate = rack->r_ctl.gp_bw; 19063 if (doing_tlp == 0) 19064 log.u_bbr.flex8 = 1; 19065 else 19066 log.u_bbr.flex8 = 2; 19067 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); 19068 log.u_bbr.flex7 = 55; 19069 log.u_bbr.pkts_out = tp->t_maxseg; 19070 log.u_bbr.timeStamp = cts; 19071 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 19072 if (rsm && (rsm->r_rtr_cnt > 0)) { 19073 /* 19074 * When we have a retransmit we want to log the 19075 * burst at send and flight at send from before. 19076 */ 19077 log.u_bbr.flex5 = rsm->r_fas; 19078 log.u_bbr.bbr_substate = rsm->r_bas; 19079 } else { 19080 /* 19081 * This is currently unlikely until we do the 19082 * packet pair probes but I will add it for completeness. 19083 */ 19084 log.u_bbr.flex5 = log.u_bbr.inflight; 19085 log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz); 19086 } 19087 log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use; 19088 log.u_bbr.delivered = 0; 19089 log.u_bbr.rttProp = (uint64_t)rsm; 19090 log.u_bbr.delRate = rsm->r_flags; 19091 log.u_bbr.delRate <<= 31; 19092 log.u_bbr.delRate |= rack->r_must_retran; 19093 log.u_bbr.delRate <<= 1; 19094 log.u_bbr.delRate |= 1; 19095 log.u_bbr.pkt_epoch = __LINE__; 19096 lgb = tcp_log_event(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, 19097 len, &log, false, NULL, __func__, __LINE__, tv); 19098 } else 19099 lgb = NULL; 19100 if ((rack->r_ctl.crte != NULL) && 19101 tcp_bblogging_on(tp)) { 19102 rack_log_queue_level(tp, rack, len, tv, cts); 19103 } 19104 #ifdef INET6 19105 if (rack->r_is_v6) { 19106 error = ip6_output(m, inp->in6p_outputopts, 19107 &inp->inp_route6, 19108 ip_sendflag, NULL, NULL, inp); 19109 } 19110 else 19111 #endif 19112 #ifdef INET 19113 { 19114 error = ip_output(m, NULL, 19115 &inp->inp_route, 19116 ip_sendflag, 0, inp); 19117 } 19118 #endif 19119 m = NULL; 19120 if (lgb) { 19121 lgb->tlb_errno = error; 19122 lgb = NULL; 19123 } 19124 /* Move snd_nxt to snd_max so we don't have false retransmissions */ 19125 tp->snd_nxt = tp->snd_max; 19126 if (error) { 19127 goto failed; 19128 } else if (rack->rc_hw_nobuf && (ip_sendflag != IP_NO_SND_TAG_RL)) { 19129 rack->rc_hw_nobuf = 0; 19130 rack->r_ctl.rc_agg_delayed = 0; 19131 rack->r_early = 0; 19132 rack->r_late = 0; 19133 rack->r_ctl.rc_agg_early = 0; 19134 } 19135 19136 rack_log_output(tp, &to, len, rsm->r_start, flags, error, rack_to_usec_ts(tv), 19137 rsm, RACK_SENT_FP, rsm->m, rsm->soff, rsm->r_hw_tls, segsiz); 19138 if (doing_tlp) { 19139 rack->rc_tlp_in_progress = 1; 19140 rack->r_ctl.rc_tlp_cnt_out++; 19141 } 19142 if (error == 0) { 19143 counter_u64_add(rack_total_bytes, len); 19144 tcp_account_for_send(tp, len, 1, doing_tlp, rsm->r_hw_tls); 19145 if (doing_tlp) { 19146 
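		/*
		 * Record the range this TLP covered; the ack-processing side
		 * consults these fields when classifying later acks and
		 * DSACKs for the probe.
		 */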
rack->rc_last_sent_tlp_past_cumack = 0; 19147 rack->rc_last_sent_tlp_seq_valid = 1; 19148 rack->r_ctl.last_sent_tlp_seq = rsm->r_start; 19149 rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start; 19150 } 19151 if (rack->r_ctl.rc_prr_sndcnt >= len) 19152 rack->r_ctl.rc_prr_sndcnt -= len; 19153 else 19154 rack->r_ctl.rc_prr_sndcnt = 0; 19155 } 19156 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 19157 rack->forced_ack = 0; /* If we send something zap the FA flag */ 19158 if (IN_FASTRECOVERY(tp->t_flags) && rsm) 19159 rack->r_ctl.retran_during_recovery += len; 19160 { 19161 int idx; 19162 19163 idx = (len / segsiz) + 3; 19164 if (idx >= TCP_MSS_ACCT_ATIMER) 19165 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 19166 else 19167 counter_u64_add(rack_out_size[idx], 1); 19168 } 19169 if (tp->t_rtttime == 0) { 19170 tp->t_rtttime = ticks; 19171 tp->t_rtseq = startseq; 19172 KMOD_TCPSTAT_INC(tcps_segstimed); 19173 } 19174 counter_u64_add(rack_fto_rsm_send, 1); 19175 if (error && (error == ENOBUFS)) { 19176 if (rack->r_ctl.crte != NULL) { 19177 tcp_trace_point(rack->rc_tp, TCP_TP_HWENOBUF); 19178 if (tcp_bblogging_on(rack->rc_tp)) 19179 rack_log_queue_level(tp, rack, len, tv, cts); 19180 } else 19181 tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF); 19182 slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); 19183 if (rack->rc_enobuf < 0x7f) 19184 rack->rc_enobuf++; 19185 if (slot < (10 * HPTS_USEC_IN_MSEC)) 19186 slot = 10 * HPTS_USEC_IN_MSEC; 19187 if (rack->r_ctl.crte != NULL) { 19188 counter_u64_add(rack_saw_enobuf_hw, 1); 19189 tcp_rl_log_enobuf(rack->r_ctl.crte); 19190 } 19191 counter_u64_add(rack_saw_enobuf, 1); 19192 } else 19193 slot = rack_get_pacing_delay(rack, tp, len, NULL, segsiz); 19194 if ((slot == 0) || 19195 (rack->rc_always_pace == 0) || 19196 (rack->r_rr_config == 1)) { 19197 /* 19198 * We have no pacing set or we 19199 * are using old-style rack or 19200 * we are overridden to use the old 1ms pacing. 19201 */ 19202 slot = rack->r_ctl.rc_min_to; 19203 } 19204 rack_start_hpts_timer(rack, tp, cts, slot, len, 0); 19205 #ifdef TCP_ACCOUNTING 19206 crtsc = get_cyclecount(); 19207 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19208 tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru; 19209 } 19210 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19211 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); 19212 } 19213 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19214 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((len + segsiz - 1) / segsiz); 19215 } 19216 sched_unpin(); 19217 #endif 19218 return (0); 19219 failed: 19220 if (m) 19221 m_free(m); 19222 return (-1); 19223 } 19224 19225 static void 19226 rack_sndbuf_autoscale(struct tcp_rack *rack) 19227 { 19228 /* 19229 * Automatic sizing of send socket buffer. Often the send buffer 19230 * size is not optimally adjusted to the actual network conditions 19231 * at hand (delay bandwidth product). Setting the buffer size too 19232 * small limits throughput on links with high bandwidth and high 19233 * delay (eg. trans-continental/oceanic links). Setting the 19234 * buffer size too big consumes too much real kernel memory, 19235 * especially with many connections on busy servers. 19236 * 19237 * The criteria to step up the send buffer one notch are: 19238 * 1. receive window of remote host is larger than send buffer 19239 * (with a fudge factor of 5/4th); 19240 * 2. send buffer is filled to 7/8th with data (so we actually 19241 * have data to make use of it); 19242 * 3. send buffer fill has not hit maximal automatic size; 19243 * 4. 
our send window (slow start and congestion controlled) is
19244 	 *    larger than sent but unacknowledged data in send buffer.
19245 	 *
19246 	 * Note that the rack version moves things much faster since
19247 	 * we want to avoid hitting cache lines in the rack_fast_output()
19248 	 * path so this is called much less often and thus moves
19249 	 * the SB forward by a percentage.
19250 	 */
19251 	struct socket *so;
19252 	struct tcpcb *tp;
19253 	uint32_t sendwin, scaleup;
19254 
19255 	tp = rack->rc_tp;
19256 	so = rack->rc_inp->inp_socket;
19257 	sendwin = min(rack->r_ctl.cwnd_to_use, tp->snd_wnd);
19258 	if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
19259 		if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
19260 		    sbused(&so->so_snd) >=
19261 		    (so->so_snd.sb_hiwat / 8 * 7) &&
19262 		    sbused(&so->so_snd) < V_tcp_autosndbuf_max &&
19263 		    sendwin >= (sbused(&so->so_snd) -
19264 		    (tp->snd_nxt - tp->snd_una))) {
19265 			if (rack_autosndbuf_inc)
19266 				scaleup = (rack_autosndbuf_inc * so->so_snd.sb_hiwat) / 100;
19267 			else
19268 				scaleup = V_tcp_autosndbuf_inc;
19269 			if (scaleup < V_tcp_autosndbuf_inc)
19270 				scaleup = V_tcp_autosndbuf_inc;
19271 			scaleup += so->so_snd.sb_hiwat;
19272 			if (scaleup > V_tcp_autosndbuf_max)
19273 				scaleup = V_tcp_autosndbuf_max;
19274 			if (!sbreserve_locked(so, SO_SND, scaleup, curthread))
19275 				so->so_snd.sb_flags &= ~SB_AUTOSIZE;
19276 		}
19277 	}
19278 }
19279 
19280 static int
19281 rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val,
19282     uint32_t cts, uint32_t ms_cts, struct timeval *tv, long tot_len, int *send_err)
19283 {
19284 	/*
19285 	 * Enter to do fast output. We are given that the sched_pin is
19286 	 * in place (if accounting is compiled in) and the cycle count taken
19287 	 * at entry is in place in ts_val. The idea here is that
19288 	 * we know how many more bytes need to be sent (presumably either
19289 	 * during pacing or to fill the cwnd and that was greater than
19290 	 * the max-burst). We have how much to send and all the info we
19291 	 * need to just send.
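	 * Unlike rack_fast_rsm_output() this path transmits new data from
	 * the socket buffer: it copies via the fast send block, advances
	 * snd_max, and loops back to "again" while left_to_send and the
	 * burst limit allow another full segment.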
19292 */ 19293 #ifdef INET 19294 struct ip *ip = NULL; 19295 #endif 19296 struct udphdr *udp = NULL; 19297 struct tcphdr *th = NULL; 19298 struct mbuf *m, *s_mb; 19299 struct inpcb *inp; 19300 uint8_t *cpto; 19301 struct tcp_log_buffer *lgb; 19302 #ifdef TCP_ACCOUNTING 19303 uint64_t crtsc; 19304 #endif 19305 struct tcpopt to; 19306 u_char opt[TCP_MAXOLEN]; 19307 uint32_t hdrlen, optlen; 19308 #ifdef TCP_ACCOUNTING 19309 int cnt_thru = 1; 19310 #endif 19311 int32_t slot, segsiz, len, max_val, tso = 0, sb_offset, error, ulen = 0; 19312 uint16_t flags; 19313 uint32_t s_soff; 19314 uint32_t if_hw_tsomaxsegcount = 0, startseq; 19315 uint32_t if_hw_tsomaxsegsize; 19316 uint16_t add_flag = RACK_SENT_FP; 19317 #ifdef INET6 19318 struct ip6_hdr *ip6 = NULL; 19319 19320 if (rack->r_is_v6) { 19321 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 19322 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 19323 } else 19324 #endif /* INET6 */ 19325 { 19326 #ifdef INET 19327 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 19328 hdrlen = sizeof(struct tcpiphdr); 19329 #endif 19330 } 19331 if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) { 19332 m = NULL; 19333 goto failed; 19334 } 19335 rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 19336 startseq = tp->snd_max; 19337 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 19338 inp = rack->rc_inp; 19339 len = rack->r_ctl.fsb.left_to_send; 19340 to.to_flags = 0; 19341 flags = rack->r_ctl.fsb.tcp_flags; 19342 if (tp->t_flags & TF_RCVD_TSTMP) { 19343 to.to_tsval = ms_cts + tp->ts_offset; 19344 to.to_tsecr = tp->ts_recent; 19345 to.to_flags = TOF_TS; 19346 } 19347 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 19348 /* TCP-MD5 (RFC2385). */ 19349 if (tp->t_flags & TF_SIGNATURE) 19350 to.to_flags |= TOF_SIGNATURE; 19351 #endif 19352 optlen = tcp_addoptions(&to, opt); 19353 hdrlen += optlen; 19354 udp = rack->r_ctl.fsb.udp; 19355 if (udp) 19356 hdrlen += sizeof(struct udphdr); 19357 if (rack->r_ctl.rc_pace_max_segs) 19358 max_val = rack->r_ctl.rc_pace_max_segs; 19359 else if (rack->rc_user_set_max_segs) 19360 max_val = rack->rc_user_set_max_segs * segsiz; 19361 else 19362 max_val = len; 19363 if ((tp->t_flags & TF_TSO) && 19364 V_tcp_do_tso && 19365 (len > segsiz) && 19366 (tp->t_port == 0)) 19367 tso = 1; 19368 again: 19369 #ifdef INET6 19370 if (MHLEN < hdrlen + max_linkhdr) 19371 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 19372 else 19373 #endif 19374 m = m_gethdr(M_NOWAIT, MT_DATA); 19375 if (m == NULL) 19376 goto failed; 19377 m->m_data += max_linkhdr; 19378 m->m_len = hdrlen; 19379 th = rack->r_ctl.fsb.th; 19380 /* Establish the len to send */ 19381 if (len > max_val) 19382 len = max_val; 19383 if ((tso) && (len + optlen > segsiz)) { 19384 uint32_t if_hw_tsomax; 19385 int32_t max_len; 19386 19387 /* extract TSO information */ 19388 if_hw_tsomax = tp->t_tsomax; 19389 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 19390 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 19391 /* 19392 * Check if we should limit by maximum payload 19393 * length: 19394 */ 19395 if (if_hw_tsomax != 0) { 19396 /* compute maximum TSO length */ 19397 max_len = (if_hw_tsomax - hdrlen - 19398 max_linkhdr); 19399 if (max_len <= 0) { 19400 goto failed; 19401 } else if (len > max_len) { 19402 len = max_len; 19403 } 19404 } 19405 if (len <= segsiz) { 19406 /* 19407 * In case there are too many small fragments don't 19408 * use TSO: 19409 */ 19410 tso = 0; 19411 } 19412 } else { 19413 tso = 0; 19414 } 19415 if ((tso == 0) && (len > segsiz)) 19416 len = segsiz; 19417 
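	/*
	 * Refresh tv; it stamps the BB log below and the send time that
	 * rack_log_output() records for this transmission.
	 */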
(void)tcp_get_usecs(tv); 19418 if ((len == 0) || 19419 (len <= MHLEN - hdrlen - max_linkhdr)) { 19420 goto failed; 19421 } 19422 sb_offset = tp->snd_max - tp->snd_una; 19423 th->th_seq = htonl(tp->snd_max); 19424 th->th_ack = htonl(tp->rcv_nxt); 19425 th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale)); 19426 if (th->th_win == 0) { 19427 tp->t_sndzerowin++; 19428 tp->t_flags |= TF_RXWIN0SENT; 19429 } else 19430 tp->t_flags &= ~TF_RXWIN0SENT; 19431 tp->snd_up = tp->snd_una; /* drag it along, its deprecated */ 19432 KMOD_TCPSTAT_INC(tcps_sndpack); 19433 KMOD_TCPSTAT_ADD(tcps_sndbyte, len); 19434 #ifdef STATS 19435 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 19436 len); 19437 #endif 19438 if (rack->r_ctl.fsb.m == NULL) 19439 goto failed; 19440 19441 /* s_mb and s_soff are saved for rack_log_output */ 19442 m->m_next = rack_fo_m_copym(rack, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, 19443 &s_mb, &s_soff); 19444 if (len <= segsiz) { 19445 /* 19446 * Must have ran out of mbufs for the copy 19447 * shorten it to no longer need tso. Lets 19448 * not put on sendalot since we are low on 19449 * mbufs. 19450 */ 19451 tso = 0; 19452 } 19453 if (rack->r_ctl.fsb.rfo_apply_push && 19454 (len == rack->r_ctl.fsb.left_to_send)) { 19455 tcp_set_flags(th, flags | TH_PUSH); 19456 add_flag |= RACK_HAD_PUSH; 19457 } 19458 if ((m->m_next == NULL) || (len <= 0)){ 19459 goto failed; 19460 } 19461 if (udp) { 19462 if (rack->r_is_v6) 19463 ulen = hdrlen + len - sizeof(struct ip6_hdr); 19464 else 19465 ulen = hdrlen + len - sizeof(struct ip); 19466 udp->uh_ulen = htons(ulen); 19467 } 19468 m->m_pkthdr.rcvif = (struct ifnet *)0; 19469 if (TCPS_HAVERCVDSYN(tp->t_state) && 19470 (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { 19471 int ect = tcp_ecn_output_established(tp, &flags, len, false); 19472 if ((tp->t_state == TCPS_SYN_RECEIVED) && 19473 (tp->t_flags2 & TF2_ECN_SND_ECE)) 19474 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 19475 #ifdef INET6 19476 if (rack->r_is_v6) { 19477 ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20); 19478 ip6->ip6_flow |= htonl(ect << 20); 19479 } 19480 else 19481 #endif 19482 { 19483 #ifdef INET 19484 ip->ip_tos &= ~IPTOS_ECN_MASK; 19485 ip->ip_tos |= ect; 19486 #endif 19487 } 19488 } 19489 tcp_set_flags(th, flags); 19490 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 19491 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 19492 if (to.to_flags & TOF_SIGNATURE) { 19493 /* 19494 * Calculate MD5 signature and put it into the place 19495 * determined before. 19496 * NOTE: since TCP options buffer doesn't point into 19497 * mbuf's data, calculate offset and use it. 19498 */ 19499 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, 19500 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { 19501 /* 19502 * Do not send segment if the calculation of MD5 19503 * digest has failed. 
19504 */ 19505 goto failed; 19506 } 19507 } 19508 #endif 19509 #ifdef INET6 19510 if (rack->r_is_v6) { 19511 if (tp->t_port) { 19512 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 19513 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 19514 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 19515 th->th_sum = htons(0); 19516 UDPSTAT_INC(udps_opackets); 19517 } else { 19518 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 19519 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 19520 th->th_sum = in6_cksum_pseudo(ip6, 19521 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 19522 0); 19523 } 19524 } 19525 #endif 19526 #if defined(INET6) && defined(INET) 19527 else 19528 #endif 19529 #ifdef INET 19530 { 19531 if (tp->t_port) { 19532 m->m_pkthdr.csum_flags = CSUM_UDP; 19533 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 19534 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 19535 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 19536 th->th_sum = htons(0); 19537 UDPSTAT_INC(udps_opackets); 19538 } else { 19539 m->m_pkthdr.csum_flags = CSUM_TCP; 19540 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 19541 th->th_sum = in_pseudo(ip->ip_src.s_addr, 19542 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 19543 IPPROTO_TCP + len + optlen)); 19544 } 19545 /* IP version must be set here for ipv4/ipv6 checking later */ 19546 KASSERT(ip->ip_v == IPVERSION, 19547 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 19548 } 19549 #endif 19550 if (tso) { 19551 /* 19552 * Here we use segsiz since we have no added options besides 19553 * any standard timestamp options (no DSACKs or SACKS are sent 19554 * via either fast-path). 19555 */ 19556 KASSERT(len > segsiz, 19557 ("%s: len <= tso_segsz tp:%p", __func__, tp)); 19558 m->m_pkthdr.csum_flags |= CSUM_TSO; 19559 m->m_pkthdr.tso_segsz = segsiz; 19560 } 19561 #ifdef INET6 19562 if (rack->r_is_v6) { 19563 ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit; 19564 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 19565 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 19566 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 19567 else 19568 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 19569 } 19570 #endif 19571 #if defined(INET) && defined(INET6) 19572 else 19573 #endif 19574 #ifdef INET 19575 { 19576 ip->ip_len = htons(m->m_pkthdr.len); 19577 ip->ip_ttl = rack->r_ctl.fsb.hoplimit; 19578 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 19579 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 19580 if (tp->t_port == 0 || len < V_tcp_minmss) { 19581 ip->ip_off |= htons(IP_DF); 19582 } 19583 } else { 19584 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 19585 } 19586 } 19587 #endif 19588 if (tp->snd_cwnd > tp->snd_ssthresh) { 19589 /* Set we sent in CA */ 19590 rack->rc_gp_saw_ca = 1; 19591 } else { 19592 /* Set we sent in SS */ 19593 rack->rc_gp_saw_ss = 1; 19594 } 19595 /* Time to copy in our header */ 19596 cpto = mtod(m, uint8_t *); 19597 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); 19598 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr)); 19599 if (optlen) { 19600 bcopy(opt, th + 1, optlen); 19601 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 19602 } else { 19603 th->th_off = sizeof(struct tcphdr) >> 2; 19604 } 19605 if ((rack->r_ctl.crte != NULL) && 19606 tcp_bblogging_on(tp)) { 19607 rack_log_queue_level(tp, rack, len, tv, cts); 19608 } 19609 if (tcp_bblogging_on(rack->rc_tp)) { 19610 union tcp_log_stackspecific log; 19611 19612 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 19613 log.u_bbr.inhpts = 
tcp_in_hpts(rack->rc_tp); 19614 if (rack->rack_no_prr) 19615 log.u_bbr.flex1 = 0; 19616 else 19617 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 19618 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 19619 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 19620 log.u_bbr.flex4 = max_val; 19621 /* Save off the early/late values */ 19622 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 19623 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 19624 log.u_bbr.bw_inuse = rack_get_bw(rack); 19625 log.u_bbr.cur_del_rate = rack->r_ctl.gp_bw; 19626 log.u_bbr.flex8 = 0; 19627 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); 19628 log.u_bbr.flex7 = 44; 19629 log.u_bbr.pkts_out = tp->t_maxseg; 19630 log.u_bbr.timeStamp = cts; 19631 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 19632 log.u_bbr.flex5 = log.u_bbr.inflight; 19633 log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use; 19634 log.u_bbr.delivered = 0; 19635 log.u_bbr.rttProp = 0; 19636 log.u_bbr.delRate = rack->r_must_retran; 19637 log.u_bbr.delRate <<= 1; 19638 log.u_bbr.pkt_epoch = __LINE__; 19639 /* For fast output no retrans so just inflight and how many mss we send */ 19640 log.u_bbr.flex5 = log.u_bbr.inflight; 19641 log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz); 19642 lgb = tcp_log_event(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, 19643 len, &log, false, NULL, __func__, __LINE__, tv); 19644 } else 19645 lgb = NULL; 19646 #ifdef INET6 19647 if (rack->r_is_v6) { 19648 error = ip6_output(m, inp->in6p_outputopts, 19649 &inp->inp_route6, 19650 0, NULL, NULL, inp); 19651 } 19652 #endif 19653 #if defined(INET) && defined(INET6) 19654 else 19655 #endif 19656 #ifdef INET 19657 { 19658 error = ip_output(m, NULL, 19659 &inp->inp_route, 19660 0, 0, inp); 19661 } 19662 #endif 19663 if (lgb) { 19664 lgb->tlb_errno = error; 19665 lgb = NULL; 19666 } 19667 if (error) { 19668 *send_err = error; 19669 m = NULL; 19670 goto failed; 19671 } else if (rack->rc_hw_nobuf) { 19672 rack->rc_hw_nobuf = 0; 19673 rack->r_ctl.rc_agg_delayed = 0; 19674 rack->r_early = 0; 19675 rack->r_late = 0; 19676 rack->r_ctl.rc_agg_early = 0; 19677 } 19678 if ((error == 0) && (rack->lt_bw_up == 0)) { 19679 /* Unlikely */ 19680 rack->r_ctl.lt_timemark = tcp_tv_to_lusectick(tv); 19681 rack->r_ctl.lt_seq = tp->snd_una; 19682 rack->lt_bw_up = 1; 19683 } 19684 rack_log_output(tp, &to, len, tp->snd_max, flags, error, rack_to_usec_ts(tv), 19685 NULL, add_flag, s_mb, s_soff, rack->r_ctl.fsb.hw_tls, segsiz); 19686 m = NULL; 19687 if (tp->snd_una == tp->snd_max) { 19688 rack->r_ctl.rc_tlp_rxt_last_time = cts; 19689 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 19690 tp->t_acktime = ticks; 19691 } 19692 counter_u64_add(rack_total_bytes, len); 19693 tcp_account_for_send(tp, len, 0, 0, rack->r_ctl.fsb.hw_tls); 19694 19695 rack->forced_ack = 0; /* If we send something zap the FA flag */ 19696 tot_len += len; 19697 if ((tp->t_flags & TF_GPUTINPROG) == 0) 19698 rack_start_gp_measurement(tp, rack, tp->snd_max, sb_offset); 19699 tp->snd_max += len; 19700 tp->snd_nxt = tp->snd_max; 19701 if (rack->rc_new_rnd_needed) { 19702 /* 19703 * Update the rnd to start ticking not 19704 * that from a time perspective all of 19705 * the preceding idle time is "in the round" 19706 */ 19707 rack->rc_new_rnd_needed = 0; 19708 rack->r_ctl.roundends = tp->snd_max; 19709 } 19710 { 19711 int idx; 19712 19713 idx = (len / segsiz) + 3; 19714 if (idx >= TCP_MSS_ACCT_ATIMER) 19715 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 19716 else 19717 
counter_u64_add(rack_out_size[idx], 1); 19718 } 19719 if (len <= rack->r_ctl.fsb.left_to_send) 19720 rack->r_ctl.fsb.left_to_send -= len; 19721 else 19722 rack->r_ctl.fsb.left_to_send = 0; 19723 if (rack->r_ctl.fsb.left_to_send < segsiz) { 19724 rack->r_fast_output = 0; 19725 rack->r_ctl.fsb.left_to_send = 0; 19726 /* At the end of fast_output scale up the sb */ 19727 SOCKBUF_LOCK(&rack->rc_inp->inp_socket->so_snd); 19728 rack_sndbuf_autoscale(rack); 19729 SOCKBUF_UNLOCK(&rack->rc_inp->inp_socket->so_snd); 19730 } 19731 if (tp->t_rtttime == 0) { 19732 tp->t_rtttime = ticks; 19733 tp->t_rtseq = startseq; 19734 KMOD_TCPSTAT_INC(tcps_segstimed); 19735 } 19736 if ((rack->r_ctl.fsb.left_to_send >= segsiz) && 19737 (max_val > len) && 19738 (tso == 0)) { 19739 max_val -= len; 19740 len = segsiz; 19741 th = rack->r_ctl.fsb.th; 19742 #ifdef TCP_ACCOUNTING 19743 cnt_thru++; 19744 #endif 19745 goto again; 19746 } 19747 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 19748 counter_u64_add(rack_fto_send, 1); 19749 slot = rack_get_pacing_delay(rack, tp, tot_len, NULL, segsiz); 19750 rack_start_hpts_timer(rack, tp, cts, slot, tot_len, 0); 19751 #ifdef TCP_ACCOUNTING 19752 crtsc = get_cyclecount(); 19753 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19754 tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru; 19755 } 19756 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19757 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); 19758 } 19759 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19760 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len + segsiz - 1) / segsiz); 19761 } 19762 sched_unpin(); 19763 #endif 19764 return (0); 19765 failed: 19766 if (m) 19767 m_free(m); 19768 rack->r_fast_output = 0; 19769 return (-1); 19770 } 19771 19772 static inline void 19773 rack_setup_fast_output(struct tcpcb *tp, struct tcp_rack *rack, 19774 struct sockbuf *sb, 19775 int len, int orig_len, int segsiz, uint32_t pace_max_seg, 19776 bool hw_tls, 19777 uint16_t flags) 19778 { 19779 rack->r_fast_output = 1; 19780 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); 19781 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; 19782 rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(rack->r_ctl.fsb.m); 19783 rack->r_ctl.fsb.tcp_flags = flags; 19784 rack->r_ctl.fsb.left_to_send = orig_len - len; 19785 if (rack->r_ctl.fsb.left_to_send < pace_max_seg) { 19786 /* Less than a full sized pace, lets not */ 19787 rack->r_fast_output = 0; 19788 return; 19789 } else { 19790 /* Round down to the nearest pace_max_seg */ 19791 rack->r_ctl.fsb.left_to_send = rounddown(rack->r_ctl.fsb.left_to_send, pace_max_seg); 19792 } 19793 if (hw_tls) 19794 rack->r_ctl.fsb.hw_tls = 1; 19795 else 19796 rack->r_ctl.fsb.hw_tls = 0; 19797 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))), 19798 ("rack:%p left_to_send:%u sbavail:%u out:%u", 19799 rack, rack->r_ctl.fsb.left_to_send, sbavail(sb), 19800 (tp->snd_max - tp->snd_una))); 19801 if (rack->r_ctl.fsb.left_to_send < segsiz) 19802 rack->r_fast_output = 0; 19803 else { 19804 if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una))) 19805 rack->r_ctl.fsb.rfo_apply_push = 1; 19806 else 19807 rack->r_ctl.fsb.rfo_apply_push = 0; 19808 } 19809 } 19810 19811 static uint32_t 19812 rack_get_hpts_pacing_min_for_bw(struct tcp_rack *rack, int32_t segsiz) 19813 { 19814 uint64_t min_time; 19815 uint32_t maxlen; 19816 19817 min_time = (uint64_t)get_hpts_min_sleep_time(); 19818 maxlen = (uint32_t)((rack->r_ctl.gp_bw * min_time) / (uint64_t)HPTS_USEC_IN_SEC); 19819 maxlen 
= roundup(maxlen, segsiz); 19820 return (maxlen); 19821 } 19822 19823 static struct rack_sendmap * 19824 rack_check_collapsed(struct tcp_rack *rack, uint32_t cts) 19825 { 19826 struct rack_sendmap *rsm = NULL; 19827 int thresh; 19828 19829 restart: 19830 rsm = tqhash_find(rack->r_ctl.tqh, rack->r_ctl.last_collapse_point); 19831 if ((rsm == NULL) || ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0)) { 19832 /* Nothing, strange turn off validity */ 19833 rack->r_collapse_point_valid = 0; 19834 return (NULL); 19835 } 19836 /* Can we send it yet? */ 19837 if (rsm->r_end > (rack->rc_tp->snd_una + rack->rc_tp->snd_wnd)) { 19838 /* 19839 * Receiver window has not grown enough for 19840 * the segment to be put on the wire. 19841 */ 19842 return (NULL); 19843 } 19844 if (rsm->r_flags & RACK_ACKED) { 19845 /* 19846 * It has been sacked, lets move to the 19847 * next one if possible. 19848 */ 19849 rack->r_ctl.last_collapse_point = rsm->r_end; 19850 /* Are we done? */ 19851 if (SEQ_GEQ(rack->r_ctl.last_collapse_point, 19852 rack->r_ctl.high_collapse_point)) { 19853 rack->r_collapse_point_valid = 0; 19854 return (NULL); 19855 } 19856 goto restart; 19857 } 19858 /* Now has it been long enough ? */ 19859 thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(rack->rc_tp, rack), cts); 19860 if ((cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])) > thresh) { 19861 rack_log_collapse(rack, rsm->r_start, 19862 (cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])), 19863 thresh, __LINE__, 6, rsm->r_flags, rsm); 19864 return (rsm); 19865 } 19866 /* Not enough time */ 19867 rack_log_collapse(rack, rsm->r_start, 19868 (cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])), 19869 thresh, __LINE__, 7, rsm->r_flags, rsm); 19870 return (NULL); 19871 } 19872 19873 static inline void 19874 rack_validate_sizes(struct tcp_rack *rack, int32_t *len, int32_t segsiz, uint32_t pace_max_seg) 19875 { 19876 if ((rack->full_size_rxt == 0) && 19877 (rack->shape_rxt_to_pacing_min == 0) && 19878 (*len >= segsiz)) { 19879 *len = segsiz; 19880 } else if (rack->shape_rxt_to_pacing_min && 19881 rack->gp_ready) { 19882 /* We use pacing min as shaping len req */ 19883 uint32_t maxlen; 19884 19885 maxlen = rack_get_hpts_pacing_min_for_bw(rack, segsiz); 19886 if (*len > maxlen) 19887 *len = maxlen; 19888 } else { 19889 /* 19890 * The else is full_size_rxt is on so send it all 19891 * note we do need to check this for exceeding 19892 * our max segment size due to the fact that 19893 * we do sometimes merge chunks together i.e. 
19894 * we cannot just assume that we will never have 19895 * a chunk greater than pace_max_seg 19896 */ 19897 if (*len > pace_max_seg) 19898 *len = pace_max_seg; 19899 } 19900 } 19901 19902 static int 19903 rack_output(struct tcpcb *tp) 19904 { 19905 struct socket *so; 19906 uint32_t recwin; 19907 uint32_t sb_offset, s_moff = 0; 19908 int32_t len, error = 0; 19909 uint16_t flags; 19910 struct mbuf *m, *s_mb = NULL; 19911 struct mbuf *mb; 19912 uint32_t if_hw_tsomaxsegcount = 0; 19913 uint32_t if_hw_tsomaxsegsize; 19914 int32_t segsiz, minseg; 19915 long tot_len_this_send = 0; 19916 #ifdef INET 19917 struct ip *ip = NULL; 19918 #endif 19919 struct udphdr *udp = NULL; 19920 struct tcp_rack *rack; 19921 struct tcphdr *th; 19922 uint8_t pass = 0; 19923 uint8_t mark = 0; 19924 uint8_t check_done = 0; 19925 uint8_t wanted_cookie = 0; 19926 u_char opt[TCP_MAXOLEN]; 19927 unsigned ipoptlen, optlen, hdrlen, ulen=0; 19928 uint32_t rack_seq; 19929 19930 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 19931 unsigned ipsec_optlen = 0; 19932 19933 #endif 19934 int32_t idle, sendalot; 19935 int32_t sub_from_prr = 0; 19936 volatile int32_t sack_rxmit; 19937 struct rack_sendmap *rsm = NULL; 19938 int32_t tso, mtu; 19939 struct tcpopt to; 19940 int32_t slot = 0; 19941 int32_t sup_rack = 0; 19942 uint32_t cts, ms_cts, delayed, early; 19943 uint16_t add_flag = RACK_SENT_SP; 19944 /* The doing_tlp flag will be set by the actual rack_timeout_tlp() */ 19945 uint8_t doing_tlp = 0; 19946 uint32_t cwnd_to_use, pace_max_seg; 19947 int32_t do_a_prefetch = 0; 19948 int32_t prefetch_rsm = 0; 19949 int32_t orig_len = 0; 19950 struct timeval tv; 19951 int32_t prefetch_so_done = 0; 19952 struct tcp_log_buffer *lgb; 19953 struct inpcb *inp = tptoinpcb(tp); 19954 struct sockbuf *sb; 19955 uint64_t ts_val = 0; 19956 #ifdef TCP_ACCOUNTING 19957 uint64_t crtsc; 19958 #endif 19959 #ifdef INET6 19960 struct ip6_hdr *ip6 = NULL; 19961 int32_t isipv6; 19962 #endif 19963 bool hpts_calling, hw_tls = false; 19964 19965 NET_EPOCH_ASSERT(); 19966 INP_WLOCK_ASSERT(inp); 19967 19968 /* setup and take the cache hits here */ 19969 rack = (struct tcp_rack *)tp->t_fb_ptr; 19970 #ifdef TCP_ACCOUNTING 19971 sched_pin(); 19972 ts_val = get_cyclecount(); 19973 #endif 19974 hpts_calling = !!(tp->t_flags2 & TF2_HPTS_CALLS); 19975 tp->t_flags2 &= ~TF2_HPTS_CALLS; 19976 #ifdef TCP_OFFLOAD 19977 if (tp->t_flags & TF_TOE) { 19978 #ifdef TCP_ACCOUNTING 19979 sched_unpin(); 19980 #endif 19981 return (tcp_offload_output(tp)); 19982 } 19983 #endif 19984 if (rack->rack_deferred_inited == 0) { 19985 /* 19986 * If we are the connecting socket we will 19987 * hit rack_init() when no sequence numbers 19988 * are setup. This makes it so we must defer 19989 * some initialization. Call that now. 19990 */ 19991 rack_deferred_init(tp, rack); 19992 } 19993 /* 19994 * For TFO connections in SYN_RECEIVED, only allow the initial 19995 * SYN|ACK and those sent by the retransmit timer. 
19996 */ 19997 if (IS_FASTOPEN(tp->t_flags) && 19998 (tp->t_state == TCPS_SYN_RECEIVED) && 19999 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */ 20000 (rack->r_ctl.rc_resend == NULL)) { /* not a retransmit */ 20001 #ifdef TCP_ACCOUNTING 20002 sched_unpin(); 20003 #endif 20004 return (0); 20005 } 20006 #ifdef INET6 20007 if (rack->r_state) { 20008 /* Use the cache line loaded if possible */ 20009 isipv6 = rack->r_is_v6; 20010 } else { 20011 isipv6 = (rack->rc_inp->inp_vflag & INP_IPV6) != 0; 20012 } 20013 #endif 20014 early = 0; 20015 cts = tcp_get_usecs(&tv); 20016 ms_cts = tcp_tv_to_mssectick(&tv); 20017 if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && 20018 tcp_in_hpts(rack->rc_tp)) { 20019 /* 20020 * We are on the hpts for some timer but not hptsi output. 20021 * Remove from the hpts unconditionally. 20022 */ 20023 rack_timer_cancel(tp, rack, cts, __LINE__); 20024 } 20025 /* Are we pacing and late? */ 20026 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 20027 TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) { 20028 /* We are delayed */ 20029 delayed = cts - rack->r_ctl.rc_last_output_to; 20030 } else { 20031 delayed = 0; 20032 } 20033 /* Do the timers, which may override the pacer */ 20034 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 20035 int retval; 20036 20037 retval = rack_process_timers(tp, rack, cts, hpts_calling, 20038 &doing_tlp); 20039 if (retval != 0) { 20040 counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); 20041 #ifdef TCP_ACCOUNTING 20042 sched_unpin(); 20043 #endif 20044 /* 20045 * If timers want tcp_drop(), then pass error out, 20046 * otherwise suppress it. 20047 */ 20048 return (retval < 0 ? retval : 0); 20049 } 20050 } 20051 if (rack->rc_in_persist) { 20052 if (tcp_in_hpts(rack->rc_tp) == 0) { 20053 /* Timer is not running */ 20054 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 20055 } 20056 #ifdef TCP_ACCOUNTING 20057 sched_unpin(); 20058 #endif 20059 return (0); 20060 } 20061 if ((rack->rc_ack_required == 1) && 20062 (rack->r_timer_override == 0)){ 20063 /* A timeout occurred and no ack has arrived */ 20064 if (tcp_in_hpts(rack->rc_tp) == 0) { 20065 /* Timer is not running */ 20066 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 20067 } 20068 #ifdef TCP_ACCOUNTING 20069 sched_unpin(); 20070 #endif 20071 return (0); 20072 } 20073 if ((rack->r_timer_override) || 20074 (rack->rc_ack_can_sendout_data) || 20075 (delayed) || 20076 (tp->t_state < TCPS_ESTABLISHED)) { 20077 rack->rc_ack_can_sendout_data = 0; 20078 if (tcp_in_hpts(rack->rc_tp)) 20079 tcp_hpts_remove(rack->rc_tp); 20080 } else if (tcp_in_hpts(rack->rc_tp)) { 20081 /* 20082 * On the hpts you can't pass even if ACKNOW is on, we will 20083 * when the hpts fires. 
20084 */ 20085 #ifdef TCP_ACCOUNTING 20086 crtsc = get_cyclecount(); 20087 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 20088 tp->tcp_proc_time[SND_BLOCKED] += (crtsc - ts_val); 20089 } 20090 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 20091 tp->tcp_cnt_counters[SND_BLOCKED]++; 20092 } 20093 sched_unpin(); 20094 #endif 20095 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1); 20096 return (0); 20097 } 20098 /* Finish out both pacing early and late accounting */ 20099 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 20100 TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) { 20101 early = rack->r_ctl.rc_last_output_to - cts; 20102 } else 20103 early = 0; 20104 if (delayed) { 20105 rack->r_ctl.rc_agg_delayed += delayed; 20106 rack->r_late = 1; 20107 } else if (early) { 20108 rack->r_ctl.rc_agg_early += early; 20109 rack->r_early = 1; 20110 } 20111 /* Now that early/late accounting is done turn off the flag */ 20112 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 20113 rack->r_wanted_output = 0; 20114 rack->r_timer_override = 0; 20115 if ((tp->t_state != rack->r_state) && 20116 TCPS_HAVEESTABLISHED(tp->t_state)) { 20117 rack_set_state(tp, rack); 20118 } 20119 if ((rack->r_fast_output) && 20120 (doing_tlp == 0) && 20121 (tp->rcv_numsacks == 0)) { 20122 int ret; 20123 20124 error = 0; 20125 ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error); 20126 if (ret >= 0) 20127 return(ret); 20128 else if (error) { 20129 inp = rack->rc_inp; 20130 so = inp->inp_socket; 20131 sb = &so->so_snd; 20132 goto nomore; 20133 } 20134 } 20135 inp = rack->rc_inp; 20136 /* 20137 * For TFO connections in SYN_SENT or SYN_RECEIVED, 20138 * only allow the initial SYN or SYN|ACK and those sent 20139 * by the retransmit timer. 20140 */ 20141 if (IS_FASTOPEN(tp->t_flags) && 20142 ((tp->t_state == TCPS_SYN_RECEIVED) || 20143 (tp->t_state == TCPS_SYN_SENT)) && 20144 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ 20145 (tp->t_rxtshift == 0)) { /* not a retransmit */ 20146 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 20147 so = inp->inp_socket; 20148 sb = &so->so_snd; 20149 goto just_return_nolock; 20150 } 20151 /* 20152 * Determine length of data that should be transmitted, and flags 20153 * that will be used. If there is some data or critical controls 20154 * (SYN, RST) to send, then transmit; otherwise, investigate 20155 * further. 
20156 */ 20157 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); 20158 if (tp->t_idle_reduce) { 20159 if (idle && (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur)) 20160 rack_cc_after_idle(rack, tp); 20161 } 20162 tp->t_flags &= ~TF_LASTIDLE; 20163 if (idle) { 20164 if (tp->t_flags & TF_MORETOCOME) { 20165 tp->t_flags |= TF_LASTIDLE; 20166 idle = 0; 20167 } 20168 } 20169 if ((tp->snd_una == tp->snd_max) && 20170 rack->r_ctl.rc_went_idle_time && 20171 TSTMP_GT(cts, rack->r_ctl.rc_went_idle_time)) { 20172 idle = cts - rack->r_ctl.rc_went_idle_time; 20173 if (idle > rack_min_probertt_hold) { 20174 /* Count as a probe rtt */ 20175 if (rack->in_probe_rtt == 0) { 20176 rack->r_ctl.rc_lower_rtt_us_cts = cts; 20177 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; 20178 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; 20179 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; 20180 } else { 20181 rack_exit_probertt(rack, cts); 20182 } 20183 } 20184 idle = 0; 20185 } 20186 if (rack_use_fsb && 20187 (rack->r_ctl.fsb.tcp_ip_hdr) && 20188 (rack->r_fsb_inited == 0) && 20189 (rack->r_state != TCPS_CLOSED)) 20190 rack_init_fsb_block(tp, rack, tcp_outflags[tp->t_state]); 20191 again: 20192 /* 20193 * If we've recently taken a timeout, snd_max will be greater than 20194 * snd_nxt. There may be SACK information that allows us to avoid 20195 * resending already delivered data. Adjust snd_nxt accordingly. 20196 */ 20197 sendalot = 0; 20198 cts = tcp_get_usecs(&tv); 20199 ms_cts = tcp_tv_to_mssectick(&tv); 20200 tso = 0; 20201 mtu = 0; 20202 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 20203 minseg = segsiz; 20204 if (rack->r_ctl.rc_pace_max_segs == 0) 20205 pace_max_seg = rack->rc_user_set_max_segs * segsiz; 20206 else 20207 pace_max_seg = rack->r_ctl.rc_pace_max_segs; 20208 sb_offset = tp->snd_max - tp->snd_una; 20209 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 20210 flags = tcp_outflags[tp->t_state]; 20211 while (rack->rc_free_cnt < rack_free_cache) { 20212 rsm = rack_alloc(rack); 20213 if (rsm == NULL) { 20214 if (hpts_calling) 20215 /* Retry in a ms */ 20216 slot = (1 * HPTS_USEC_IN_MSEC); 20217 so = inp->inp_socket; 20218 sb = &so->so_snd; 20219 goto just_return_nolock; 20220 } 20221 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext); 20222 rack->rc_free_cnt++; 20223 rsm = NULL; 20224 } 20225 sack_rxmit = 0; 20226 len = 0; 20227 rsm = NULL; 20228 if (flags & TH_RST) { 20229 SOCKBUF_LOCK(&inp->inp_socket->so_snd); 20230 so = inp->inp_socket; 20231 sb = &so->so_snd; 20232 goto send; 20233 } 20234 if (rack->r_ctl.rc_resend) { 20235 /* Retransmit timer */ 20236 rsm = rack->r_ctl.rc_resend; 20237 rack->r_ctl.rc_resend = NULL; 20238 len = rsm->r_end - rsm->r_start; 20239 sack_rxmit = 1; 20240 sendalot = 0; 20241 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 20242 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 20243 __func__, __LINE__, 20244 rsm->r_start, tp->snd_una, tp, rack, rsm)); 20245 sb_offset = rsm->r_start - tp->snd_una; 20246 rack_validate_sizes(rack, &len, segsiz, pace_max_seg); 20247 } else if (rack->r_collapse_point_valid && 20248 ((rsm = rack_check_collapsed(rack, cts)) != NULL)) { 20249 /* 20250 * If an RSM is returned then enough time has passed 20251 * for us to retransmit it. Move up the collapse point, 20252 * since this rsm has its chance to retransmit now. 
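 * Note that rack_check_collapsed() above only hands back an rsm once
 * the peer's window has re-grown enough to cover it, it has not been
 * SACKed, and the RACK threshold (rack_calc_thresh_rack()) has
 * elapsed since its last (re)transmission, roughly:
 *     elapsed = cts - rsm->r_tim_lastsent[rsm->r_rtr_cnt - 1];
 *     send only if elapsed > thresh
 * so collapsed data is not blasted out the instant the window reopens.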
20253 */ 20254 tcp_trace_point(rack->rc_tp, TCP_TP_COLLAPSED_RXT); 20255 rack->r_ctl.last_collapse_point = rsm->r_end; 20256 /* Are we done? */ 20257 if (SEQ_GEQ(rack->r_ctl.last_collapse_point, 20258 rack->r_ctl.high_collapse_point)) 20259 rack->r_collapse_point_valid = 0; 20260 sack_rxmit = 1; 20261 /* We are not doing a TLP */ 20262 doing_tlp = 0; 20263 len = rsm->r_end - rsm->r_start; 20264 sb_offset = rsm->r_start - tp->snd_una; 20265 sendalot = 0; 20266 rack_validate_sizes(rack, &len, segsiz, pace_max_seg); 20267 } else if ((rsm = tcp_rack_output(tp, rack, cts)) != NULL) { 20268 /* We have a retransmit that takes precedence */ 20269 if ((!IN_FASTRECOVERY(tp->t_flags)) && 20270 ((rsm->r_flags & RACK_MUST_RXT) == 0) && 20271 ((tp->t_flags & TF_WASFRECOVERY) == 0)) { 20272 /* Enter recovery if not induced by a time-out */ 20273 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__); 20274 } 20275 #ifdef INVARIANTS 20276 if (SEQ_LT(rsm->r_start, tp->snd_una)) { 20277 panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n", 20278 tp, rack, rsm, rsm->r_start, tp->snd_una); 20279 } 20280 #endif 20281 len = rsm->r_end - rsm->r_start; 20282 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 20283 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 20284 __func__, __LINE__, 20285 rsm->r_start, tp->snd_una, tp, rack, rsm)); 20286 sb_offset = rsm->r_start - tp->snd_una; 20287 sendalot = 0; 20288 rack_validate_sizes(rack, &len, segsiz, pace_max_seg); 20289 if (len > 0) { 20290 sack_rxmit = 1; 20291 KMOD_TCPSTAT_INC(tcps_sack_rexmits); 20292 KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes, 20293 min(len, segsiz)); 20294 } 20295 } else if (rack->r_ctl.rc_tlpsend) { 20296 /* Tail loss probe */ 20297 long cwin; 20298 long tlen; 20299 20300 /* 20301 * Check if we can do a TLP with a RACK'd packet 20302 * this can happen if we are not doing the rack 20303 * cheat and we skipped to a TLP and it 20304 * went off. 20305 */ 20306 rsm = rack->r_ctl.rc_tlpsend; 20307 /* We are doing a TLP make sure the flag is preent */ 20308 rsm->r_flags |= RACK_TLP; 20309 rack->r_ctl.rc_tlpsend = NULL; 20310 sack_rxmit = 1; 20311 tlen = rsm->r_end - rsm->r_start; 20312 if (tlen > segsiz) 20313 tlen = segsiz; 20314 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 20315 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 20316 __func__, __LINE__, 20317 rsm->r_start, tp->snd_una, tp, rack, rsm)); 20318 sb_offset = rsm->r_start - tp->snd_una; 20319 cwin = min(tp->snd_wnd, tlen); 20320 len = cwin; 20321 } 20322 if (rack->r_must_retran && 20323 (doing_tlp == 0) && 20324 (SEQ_GT(tp->snd_max, tp->snd_una)) && 20325 (rsm == NULL)) { 20326 /* 20327 * There are two different ways that we 20328 * can get into this block: 20329 * a) This is a non-sack connection, we had a time-out 20330 * and thus r_must_retran was set and everything 20331 * left outstanding as been marked for retransmit. 20332 * b) The MTU of the path shrank, so that everything 20333 * was marked to be retransmitted with the smaller 20334 * mtu and r_must_retran was set. 20335 * 20336 * This means that we expect the sendmap (outstanding) 20337 * to all be marked must. We can use the tmap to 20338 * look at them. 20339 * 20340 */ 20341 int sendwin, flight; 20342 20343 sendwin = min(tp->snd_wnd, tp->snd_cwnd); 20344 flight = ctf_flight_size(tp, rack->r_ctl.rc_out_at_rto); 20345 if (flight >= sendwin) { 20346 /* 20347 * We can't send yet. 
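 * For example (illustrative numbers only): with snd_wnd = 64k but
 * snd_cwnd = 20k, sendwin is 20k; if ctf_flight_size() already
 * reports 20k or more in flight we bail here and wait for acks to
 * open room before any of the must-retransmit data goes out.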
20348 */ 20349 so = inp->inp_socket; 20350 sb = &so->so_snd; 20351 goto just_return_nolock; 20352 } 20353 /* 20354 * This is the case a/b mentioned above. All 20355 * outstanding/not-acked should be marked. 20356 * We can use the tmap to find them. 20357 */ 20358 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 20359 if (rsm == NULL) { 20360 /* TSNH */ 20361 rack->r_must_retran = 0; 20362 rack->r_ctl.rc_out_at_rto = 0; 20363 so = inp->inp_socket; 20364 sb = &so->so_snd; 20365 goto just_return_nolock; 20366 } 20367 if ((rsm->r_flags & RACK_MUST_RXT) == 0) { 20368 /* 20369 * The first one does not have the flag, did we collapse 20370 * further up in our list? 20371 */ 20372 rack->r_must_retran = 0; 20373 rack->r_ctl.rc_out_at_rto = 0; 20374 rsm = NULL; 20375 sack_rxmit = 0; 20376 } else { 20377 sack_rxmit = 1; 20378 len = rsm->r_end - rsm->r_start; 20379 sb_offset = rsm->r_start - tp->snd_una; 20380 sendalot = 0; 20381 if ((rack->full_size_rxt == 0) && 20382 (rack->shape_rxt_to_pacing_min == 0) && 20383 (len >= segsiz)) 20384 len = segsiz; 20385 else if (rack->shape_rxt_to_pacing_min && 20386 rack->gp_ready) { 20387 /* We use pacing min as shaping len req */ 20388 uint32_t maxlen; 20389 20390 maxlen = rack_get_hpts_pacing_min_for_bw(rack, segsiz); 20391 if (len > maxlen) 20392 len = maxlen; 20393 } 20394 /* 20395 * Delay removing the flag RACK_MUST_RXT so 20396 * that the fastpath for retransmit will 20397 * work with this rsm. 20398 */ 20399 } 20400 } 20401 /* 20402 * Enforce a connection sendmap count limit if set 20403 * as long as we are not retransmiting. 20404 */ 20405 if ((rsm == NULL) && 20406 (rack->do_detection == 0) && 20407 (V_tcp_map_entries_limit > 0) && 20408 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 20409 counter_u64_add(rack_to_alloc_limited, 1); 20410 if (!rack->alloc_limit_reported) { 20411 rack->alloc_limit_reported = 1; 20412 counter_u64_add(rack_alloc_limited_conns, 1); 20413 } 20414 so = inp->inp_socket; 20415 sb = &so->so_snd; 20416 goto just_return_nolock; 20417 } 20418 if (rsm && (rsm->r_flags & RACK_HAS_FIN)) { 20419 /* we are retransmitting the fin */ 20420 len--; 20421 if (len) { 20422 /* 20423 * When retransmitting data do *not* include the 20424 * FIN. This could happen from a TLP probe. 
20425 */ 20426 flags &= ~TH_FIN; 20427 } 20428 } 20429 if (rsm && rack->r_fsb_inited && 20430 rack_use_rsm_rfo && 20431 ((rsm->r_flags & RACK_HAS_FIN) == 0)) { 20432 int ret; 20433 20434 ret = rack_fast_rsm_output(tp, rack, rsm, ts_val, cts, ms_cts, &tv, len, doing_tlp); 20435 if (ret == 0) 20436 return (0); 20437 } 20438 so = inp->inp_socket; 20439 sb = &so->so_snd; 20440 if (do_a_prefetch == 0) { 20441 kern_prefetch(sb, &do_a_prefetch); 20442 do_a_prefetch = 1; 20443 } 20444 #ifdef NETFLIX_SHARED_CWND 20445 if ((tp->t_flags2 & TF2_TCP_SCWND_ALLOWED) && 20446 rack->rack_enable_scwnd) { 20447 /* We are doing cwnd sharing */ 20448 if (rack->gp_ready && 20449 (rack->rack_attempted_scwnd == 0) && 20450 (rack->r_ctl.rc_scw == NULL) && 20451 tp->t_lib) { 20452 /* The pcbid is in, lets make an attempt */ 20453 counter_u64_add(rack_try_scwnd, 1); 20454 rack->rack_attempted_scwnd = 1; 20455 rack->r_ctl.rc_scw = tcp_shared_cwnd_alloc(tp, 20456 &rack->r_ctl.rc_scw_index, 20457 segsiz); 20458 } 20459 if (rack->r_ctl.rc_scw && 20460 (rack->rack_scwnd_is_idle == 1) && 20461 sbavail(&so->so_snd)) { 20462 /* we are no longer out of data */ 20463 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 20464 rack->rack_scwnd_is_idle = 0; 20465 } 20466 if (rack->r_ctl.rc_scw) { 20467 /* First lets update and get the cwnd */ 20468 rack->r_ctl.cwnd_to_use = cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw, 20469 rack->r_ctl.rc_scw_index, 20470 tp->snd_cwnd, tp->snd_wnd, segsiz); 20471 } 20472 } 20473 #endif 20474 /* 20475 * Get standard flags, and add SYN or FIN if requested by 'hidden' 20476 * state flags. 20477 */ 20478 if (tp->t_flags & TF_NEEDFIN) 20479 flags |= TH_FIN; 20480 if (tp->t_flags & TF_NEEDSYN) 20481 flags |= TH_SYN; 20482 if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { 20483 void *end_rsm; 20484 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 20485 if (end_rsm) 20486 kern_prefetch(end_rsm, &prefetch_rsm); 20487 prefetch_rsm = 1; 20488 } 20489 SOCKBUF_LOCK(sb); 20490 /* 20491 * If snd_nxt == snd_max and we have transmitted a FIN, the 20492 * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a 20493 * negative length. This can also occur when TCP opens up its 20494 * congestion window while receiving additional duplicate acks after 20495 * fast-retransmit because TCP will reset snd_nxt to snd_max after 20496 * the fast-retransmit. 20497 * 20498 * In the normal retransmit-FIN-only case, however, snd_nxt will be 20499 * set to snd_una, the sb_offset will be 0, and the length may wind 20500 * up 0. 20501 * 20502 * If sack_rxmit is true we are retransmitting from the scoreboard 20503 * in which case len is already set. 
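 * As a concrete (made up) example: with snd_una = 1000,
 * snd_nxt = 3000 and 5000 bytes sitting in the socket buffer,
 * sb_offset becomes 2000 and the new-data paths below may send at
 * most avail - sb_offset = 3000 bytes, further limited by the
 * cwnd/rwnd and, in recovery, by PRR.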
20504 */ 20505 if ((sack_rxmit == 0) && 20506 (TCPS_HAVEESTABLISHED(tp->t_state) || IS_FASTOPEN(tp->t_flags))) { 20507 uint32_t avail; 20508 20509 avail = sbavail(sb); 20510 if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail) 20511 sb_offset = tp->snd_nxt - tp->snd_una; 20512 else 20513 sb_offset = 0; 20514 if ((IN_FASTRECOVERY(tp->t_flags) == 0) || rack->rack_no_prr) { 20515 if (rack->r_ctl.rc_tlp_new_data) { 20516 /* TLP is forcing out new data */ 20517 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) { 20518 rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset); 20519 } 20520 if ((rack->r_ctl.rc_tlp_new_data + sb_offset) > tp->snd_wnd) { 20521 if (tp->snd_wnd > sb_offset) 20522 len = tp->snd_wnd - sb_offset; 20523 else 20524 len = 0; 20525 } else { 20526 len = rack->r_ctl.rc_tlp_new_data; 20527 } 20528 rack->r_ctl.rc_tlp_new_data = 0; 20529 } else { 20530 len = rack_what_can_we_send(tp, rack, cwnd_to_use, avail, sb_offset); 20531 } 20532 if ((rack->r_ctl.crte == NULL) && 20533 IN_FASTRECOVERY(tp->t_flags) && 20534 (rack->full_size_rxt == 0) && 20535 (rack->shape_rxt_to_pacing_min == 0) && 20536 (len > segsiz)) { 20537 /* 20538 * For prr=off, we need to send only 1 MSS 20539 * at a time. We do this because another sack could 20540 * be arriving that causes us to send retransmits and 20541 * we don't want to be on a long pace due to a larger send 20542 * that keeps us from sending out the retransmit. 20543 */ 20544 len = segsiz; 20545 } else if (rack->shape_rxt_to_pacing_min && 20546 rack->gp_ready) { 20547 /* We use pacing min as shaping len req */ 20548 uint32_t maxlen; 20549 20550 maxlen = rack_get_hpts_pacing_min_for_bw(rack, segsiz); 20551 if (len > maxlen) 20552 len = maxlen; 20553 }/* The else is full_size_rxt is on so send it all */ 20554 } else { 20555 uint32_t outstanding; 20556 /* 20557 * We are inside of a Fast recovery episode, this 20558 * is caused by a SACK or 3 dup acks. At this point 20559 * we have sent all the retransmissions and we rely 20560 * on PRR to dictate what we will send in the form of 20561 * new data. 20562 */ 20563 20564 outstanding = tp->snd_max - tp->snd_una; 20565 if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) { 20566 if (tp->snd_wnd > outstanding) { 20567 len = tp->snd_wnd - outstanding; 20568 /* Check to see if we have the data */ 20569 if ((sb_offset + len) > avail) { 20570 /* It does not all fit */ 20571 if (avail > sb_offset) 20572 len = avail - sb_offset; 20573 else 20574 len = 0; 20575 } 20576 } else { 20577 len = 0; 20578 } 20579 } else if (avail > sb_offset) { 20580 len = avail - sb_offset; 20581 } else { 20582 len = 0; 20583 } 20584 if (len > 0) { 20585 if (len > rack->r_ctl.rc_prr_sndcnt) { 20586 len = rack->r_ctl.rc_prr_sndcnt; 20587 } 20588 if (len > 0) { 20589 sub_from_prr = 1; 20590 } 20591 } 20592 if (len > segsiz) { 20593 /* 20594 * We should never send more than a MSS when 20595 * retransmitting or sending new data in prr 20596 * mode unless the override flag is on. Most 20597 * likely the PRR algorithm is not going to 20598 * let us send a lot as well :-) 20599 */ 20600 if (rack->r_ctl.rc_prr_sendalot == 0) { 20601 len = segsiz; 20602 } 20603 } else if (len < segsiz) { 20604 /* 20605 * Do we send any? The idea here is if the 20606 * send empty's the socket buffer we want to 20607 * do it. However if not then lets just wait 20608 * for our prr_sndcnt to get bigger. 
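 * In other words a sub-MSS PRR send is only worth doing when it
 * drains the socket buffer. E.g. (hypothetical numbers) with
 * rc_prr_sndcnt = 400, segsiz = 1448 and 2000 bytes still waiting
 * past sb_offset, leftinsb (2000) > len (400), so we send nothing
 * and wait for prr_sndcnt to grow.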
20609 */ 20610 long leftinsb; 20611 20612 leftinsb = sbavail(sb) - sb_offset; 20613 if (leftinsb > len) { 20614 /* This send does not empty the sb */ 20615 len = 0; 20616 } 20617 } 20618 } 20619 } else if (!TCPS_HAVEESTABLISHED(tp->t_state)) { 20620 /* 20621 * If you have not established 20622 * and are not doing FAST OPEN 20623 * no data please. 20624 */ 20625 if ((sack_rxmit == 0) && 20626 (!IS_FASTOPEN(tp->t_flags))){ 20627 len = 0; 20628 sb_offset = 0; 20629 } 20630 } 20631 if (prefetch_so_done == 0) { 20632 kern_prefetch(so, &prefetch_so_done); 20633 prefetch_so_done = 1; 20634 } 20635 /* 20636 * Lop off SYN bit if it has already been sent. However, if this is 20637 * SYN-SENT state and if segment contains data and if we don't know 20638 * that foreign host supports TAO, suppress sending segment. 20639 */ 20640 if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) && 20641 ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) { 20642 /* 20643 * When sending additional segments following a TFO SYN|ACK, 20644 * do not include the SYN bit. 20645 */ 20646 if (IS_FASTOPEN(tp->t_flags) && 20647 (tp->t_state == TCPS_SYN_RECEIVED)) 20648 flags &= ~TH_SYN; 20649 } 20650 /* 20651 * Be careful not to send data and/or FIN on SYN segments. This 20652 * measure is needed to prevent interoperability problems with not 20653 * fully conformant TCP implementations. 20654 */ 20655 if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { 20656 len = 0; 20657 flags &= ~TH_FIN; 20658 } 20659 /* 20660 * On TFO sockets, ensure no data is sent in the following cases: 20661 * 20662 * - When retransmitting SYN|ACK on a passively-created socket 20663 * 20664 * - When retransmitting SYN on an actively created socket 20665 * 20666 * - When sending a zero-length cookie (cookie request) on an 20667 * actively created socket 20668 * 20669 * - When the socket is in the CLOSED state (RST is being sent) 20670 */ 20671 if (IS_FASTOPEN(tp->t_flags) && 20672 (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || 20673 ((tp->t_state == TCPS_SYN_SENT) && 20674 (tp->t_tfo_client_cookie_len == 0)) || 20675 (flags & TH_RST))) { 20676 sack_rxmit = 0; 20677 len = 0; 20678 } 20679 /* Without fast-open there should never be data sent on a SYN */ 20680 if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) { 20681 tp->snd_nxt = tp->iss; 20682 len = 0; 20683 } 20684 if ((len > segsiz) && (tcp_dsack_block_exists(tp))) { 20685 /* We only send 1 MSS if we have a DSACK block */ 20686 add_flag |= RACK_SENT_W_DSACK; 20687 len = segsiz; 20688 } 20689 orig_len = len; 20690 if (len <= 0) { 20691 /* 20692 * If FIN has been sent but not acked, but we haven't been 20693 * called to retransmit, len will be < 0. Otherwise, window 20694 * shrank after we sent into it. If window shrank to 0, 20695 * cancel pending retransmit, pull snd_nxt back to (closed) 20696 * window, and set the persist timer if it isn't already 20697 * going. If the window didn't close completely, just wait 20698 * for an ACK. 20699 * 20700 * We also do a general check here to ensure that we will 20701 * set the persist timer when we have data to send, but a 20702 * 0-byte window. This makes sure the persist timer is set 20703 * even if the packet hits one of the "goto send" lines 20704 * below. 
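 * Concretely, the check just below arms persist only when all of
 * the following hold: the peer advertises a zero window, the
 * connection is established, nothing is outstanding
 * (snd_una == snd_max) and unsent data is still waiting past
 * sb_offset in the socket buffer.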
20705 */ 20706 len = 0; 20707 if ((tp->snd_wnd == 0) && 20708 (TCPS_HAVEESTABLISHED(tp->t_state)) && 20709 (tp->snd_una == tp->snd_max) && 20710 (sb_offset < (int)sbavail(sb))) { 20711 rack_enter_persist(tp, rack, cts, tp->snd_una); 20712 } 20713 } else if ((rsm == NULL) && 20714 (doing_tlp == 0) && 20715 (len < pace_max_seg)) { 20716 /* 20717 * We are not sending a maximum sized segment for 20718 * some reason. Should we not send anything (think 20719 * sws or persists)? 20720 */ 20721 if ((tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg)) && 20722 (TCPS_HAVEESTABLISHED(tp->t_state)) && 20723 (len < minseg) && 20724 (len < (int)(sbavail(sb) - sb_offset))) { 20725 /* 20726 * Here the rwnd is less than 20727 * the minimum pacing size, this is not a retransmit, 20728 * we are established and 20729 * the send is not the last in the socket buffer 20730 * we send nothing, and we may enter persists 20731 * if nothing is outstanding. 20732 */ 20733 len = 0; 20734 if (tp->snd_max == tp->snd_una) { 20735 /* 20736 * Nothing out we can 20737 * go into persists. 20738 */ 20739 rack_enter_persist(tp, rack, cts, tp->snd_una); 20740 } 20741 } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) && 20742 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && 20743 (len < (int)(sbavail(sb) - sb_offset)) && 20744 (len < minseg)) { 20745 /* 20746 * Here we are not retransmitting, and 20747 * the cwnd is not so small that we could 20748 * not send at least a min size (rxt timer 20749 * not having gone off), We have 2 segments or 20750 * more already in flight, its not the tail end 20751 * of the socket buffer and the cwnd is blocking 20752 * us from sending out a minimum pacing segment size. 20753 * Lets not send anything. 20754 */ 20755 len = 0; 20756 } else if (((tp->snd_wnd - ctf_outstanding(tp)) < 20757 min((rack->r_ctl.rc_high_rwnd/2), minseg)) && 20758 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && 20759 (len < (int)(sbavail(sb) - sb_offset)) && 20760 (TCPS_HAVEESTABLISHED(tp->t_state))) { 20761 /* 20762 * Here we have a send window but we have 20763 * filled it up and we can't send another pacing segment. 20764 * We also have in flight more than 2 segments 20765 * and we are not completing the sb i.e. we allow 20766 * the last bytes of the sb to go out even if 20767 * its not a full pacing segment. 20768 */ 20769 len = 0; 20770 } else if ((rack->r_ctl.crte != NULL) && 20771 (tp->snd_wnd >= (pace_max_seg * max(1, rack_hw_rwnd_factor))) && 20772 (cwnd_to_use >= (pace_max_seg + (4 * segsiz))) && 20773 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) >= (2 * segsiz)) && 20774 (len < (int)(sbavail(sb) - sb_offset))) { 20775 /* 20776 * Here we are doing hardware pacing, this is not a TLP, 20777 * we are not sending a pace max segment size, there is rwnd 20778 * room to send at least N pace_max_seg, the cwnd is greater 20779 * than or equal to a full pacing segments plus 4 mss and we have 2 or 20780 * more segments in flight and its not the tail of the socket buffer. 20781 * 20782 * We don't want to send instead we need to get more ack's in to 20783 * allow us to send a full pacing segment. Normally, if we are pacing 20784 * about the right speed, we should have finished our pacing 20785 * send as most of the acks have come back if we are at the 20786 * right rate. This is a bit fuzzy since return path delay 20787 * can delay the acks, which is why we want to make sure we 20788 * have cwnd space to have a bit more than a max pace segments in flight. 
20789 * 20790 * If we have not gotten our acks back we are pacing at too high a 20791 * rate delaying will not hurt and will bring our GP estimate down by 20792 * injecting the delay. If we don't do this we will send 20793 * 2 MSS out in response to the acks being clocked in which 20794 * defeats the point of hw-pacing (i.e. to help us get 20795 * larger TSO's out). 20796 */ 20797 len = 0; 20798 } 20799 20800 } 20801 /* len will be >= 0 after this point. */ 20802 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 20803 rack_sndbuf_autoscale(rack); 20804 /* 20805 * Decide if we can use TCP Segmentation Offloading (if supported by 20806 * hardware). 20807 * 20808 * TSO may only be used if we are in a pure bulk sending state. The 20809 * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP 20810 * options prevent using TSO. With TSO the TCP header is the same 20811 * (except for the sequence number) for all generated packets. This 20812 * makes it impossible to transmit any options which vary per 20813 * generated segment or packet. 20814 * 20815 * IPv4 handling has a clear separation of ip options and ip header 20816 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does 20817 * the right thing below to provide length of just ip options and thus 20818 * checking for ipoptlen is enough to decide if ip options are present. 20819 */ 20820 ipoptlen = 0; 20821 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 20822 /* 20823 * Pre-calculate here as we save another lookup into the darknesses 20824 * of IPsec that way and can actually decide if TSO is ok. 20825 */ 20826 #ifdef INET6 20827 if (isipv6 && IPSEC_ENABLED(ipv6)) 20828 ipsec_optlen = IPSEC_HDRSIZE(ipv6, inp); 20829 #ifdef INET 20830 else 20831 #endif 20832 #endif /* INET6 */ 20833 #ifdef INET 20834 if (IPSEC_ENABLED(ipv4)) 20835 ipsec_optlen = IPSEC_HDRSIZE(ipv4, inp); 20836 #endif /* INET */ 20837 #endif 20838 20839 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 20840 ipoptlen += ipsec_optlen; 20841 #endif 20842 if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > segsiz && 20843 (tp->t_port == 0) && 20844 ((tp->t_flags & TF_SIGNATURE) == 0) && 20845 tp->rcv_numsacks == 0 && sack_rxmit == 0 && 20846 ipoptlen == 0) 20847 tso = 1; 20848 { 20849 uint32_t outstanding __unused; 20850 20851 outstanding = tp->snd_max - tp->snd_una; 20852 if (tp->t_flags & TF_SENTFIN) { 20853 /* 20854 * If we sent a fin, snd_max is 1 higher than 20855 * snd_una 20856 */ 20857 outstanding--; 20858 } 20859 if (sack_rxmit) { 20860 if ((rsm->r_flags & RACK_HAS_FIN) == 0) 20861 flags &= ~TH_FIN; 20862 } else { 20863 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + 20864 sbused(sb))) 20865 flags &= ~TH_FIN; 20866 } 20867 } 20868 recwin = lmin(lmax(sbspace(&so->so_rcv), 0), 20869 (long)TCP_MAXWIN << tp->rcv_scale); 20870 20871 /* 20872 * Sender silly window avoidance. We transmit under the following 20873 * conditions when len is non-zero: 20874 * 20875 * - We have a full segment (or more with TSO) - This is the last 20876 * buffer in a write()/send() and we are either idle or running 20877 * NODELAY - we've timed out (e.g. persist timer) - we have more 20878 * then 1/2 the maximum send window's worth of data (receiver may be 20879 * limited the window size) - we need to retransmit 20880 */ 20881 if (len) { 20882 if (len >= segsiz) { 20883 goto send; 20884 } 20885 /* 20886 * NOTE! on localhost connections an 'ack' from the remote 20887 * end may occur synchronously with the output and cause us 20888 * to flush a buffer queued with moretocome. 
XXX 20889 * 20890 */ 20891 if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ 20892 (idle || (tp->t_flags & TF_NODELAY)) && 20893 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && 20894 (tp->t_flags & TF_NOPUSH) == 0) { 20895 pass = 2; 20896 goto send; 20897 } 20898 if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ 20899 pass = 22; 20900 goto send; 20901 } 20902 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { 20903 pass = 4; 20904 goto send; 20905 } 20906 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */ 20907 pass = 5; 20908 goto send; 20909 } 20910 if (sack_rxmit) { 20911 pass = 6; 20912 goto send; 20913 } 20914 if (((tp->snd_wnd - ctf_outstanding(tp)) < segsiz) && 20915 (ctf_outstanding(tp) < (segsiz * 2))) { 20916 /* 20917 * We have less than two MSS outstanding (delayed ack) 20918 * and our rwnd will not let us send a full sized 20919 * MSS. Lets go ahead and let this small segment 20920 * out because we want to try to have at least two 20921 * packets inflight to not be caught by delayed ack. 20922 */ 20923 pass = 12; 20924 goto send; 20925 } 20926 } 20927 /* 20928 * Sending of standalone window updates. 20929 * 20930 * Window updates are important when we close our window due to a 20931 * full socket buffer and are opening it again after the application 20932 * reads data from it. Once the window has opened again and the 20933 * remote end starts to send again the ACK clock takes over and 20934 * provides the most current window information. 20935 * 20936 * We must avoid the silly window syndrome whereas every read from 20937 * the receive buffer, no matter how small, causes a window update 20938 * to be sent. We also should avoid sending a flurry of window 20939 * updates when the socket buffer had queued a lot of data and the 20940 * application is doing small reads. 20941 * 20942 * Prevent a flurry of pointless window updates by only sending an 20943 * update when we can increase the advertized window by more than 20944 * 1/4th of the socket buffer capacity. When the buffer is getting 20945 * full or is very small be more aggressive and send an update 20946 * whenever we can increase by two mss sized segments. In all other 20947 * situations the ACK's to new incoming data will carry further 20948 * window increases. 20949 * 20950 * Don't send an independent window update if a delayed ACK is 20951 * pending (it will get piggy-backed on it) or the remote side 20952 * already has done a half-close and won't send more data. Skip 20953 * this if the connection is in T/TCP half-open state. 20954 */ 20955 if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && 20956 !(tp->t_flags & TF_DELACK) && 20957 !TCPS_HAVERCVDFIN(tp->t_state)) { 20958 /* 20959 * "adv" is the amount we could increase the window, taking 20960 * into account that we are limited by TCP_MAXWIN << 20961 * tp->rcv_scale. 20962 */ 20963 int32_t adv; 20964 int oldwin; 20965 20966 adv = recwin; 20967 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { 20968 oldwin = (tp->rcv_adv - tp->rcv_nxt); 20969 if (adv > oldwin) 20970 adv -= oldwin; 20971 else { 20972 /* We can't increase the window */ 20973 adv = 0; 20974 } 20975 } else 20976 oldwin = 0; 20977 20978 /* 20979 * If the new window size ends up being the same as or less 20980 * than the old size when it is scaled, then don't force 20981 * a window update. 
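 * Putting rough numbers on the checks below (illustrative only):
 * with segsiz = 1448 and so_rcv.sb_hiwat = 64k, a standalone window
 * update goes out once we can open the window by at least two
 * segments and either the increase is >= 16k (1/4 of the buffer),
 * the advertisable window has fallen to <= 8k (1/8 of the buffer),
 * or the buffer itself is small (<= 8 MSS); an increase of half the
 * buffer or more always triggers an update.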
20982 */ 20983 if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale) 20984 goto dontupdate; 20985 20986 if (adv >= (int32_t)(2 * segsiz) && 20987 (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || 20988 recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) || 20989 so->so_rcv.sb_hiwat <= 8 * segsiz)) { 20990 pass = 7; 20991 goto send; 20992 } 20993 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) { 20994 pass = 23; 20995 goto send; 20996 } 20997 } 20998 dontupdate: 20999 21000 /* 21001 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW 21002 * is also a catch-all for the retransmit timer timeout case. 21003 */ 21004 if (tp->t_flags & TF_ACKNOW) { 21005 pass = 8; 21006 goto send; 21007 } 21008 if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) { 21009 pass = 9; 21010 goto send; 21011 } 21012 /* 21013 * If our state indicates that FIN should be sent and we have not 21014 * yet done so, then we need to send. 21015 */ 21016 if ((flags & TH_FIN) && 21017 (tp->snd_nxt == tp->snd_una)) { 21018 pass = 11; 21019 goto send; 21020 } 21021 /* 21022 * No reason to send a segment, just return. 21023 */ 21024 just_return: 21025 SOCKBUF_UNLOCK(sb); 21026 just_return_nolock: 21027 { 21028 int app_limited = CTF_JR_SENT_DATA; 21029 21030 if (tot_len_this_send > 0) { 21031 /* Make sure snd_nxt is up to max */ 21032 rack->r_ctl.fsb.recwin = recwin; 21033 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz); 21034 if ((error == 0) && 21035 rack_use_rfo && 21036 ((flags & (TH_SYN|TH_FIN)) == 0) && 21037 (ipoptlen == 0) && 21038 (tp->snd_nxt == tp->snd_max) && 21039 (tp->rcv_numsacks == 0) && 21040 rack->r_fsb_inited && 21041 TCPS_HAVEESTABLISHED(tp->t_state) && 21042 ((IN_RECOVERY(tp->t_flags)) == 0) && 21043 (rack->r_must_retran == 0) && 21044 ((tp->t_flags & TF_NEEDFIN) == 0) && 21045 (len > 0) && (orig_len > 0) && 21046 (orig_len > len) && 21047 ((orig_len - len) >= segsiz) && 21048 ((optlen == 0) || 21049 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { 21050 /* We can send at least one more MSS using our fsb */ 21051 rack_setup_fast_output(tp, rack, sb, len, orig_len, 21052 segsiz, pace_max_seg, hw_tls, flags); 21053 } else 21054 rack->r_fast_output = 0; 21055 21056 21057 rack_log_fsb(rack, tp, so, flags, 21058 ipoptlen, orig_len, len, 0, 21059 1, optlen, __LINE__, 1); 21060 if (SEQ_GT(tp->snd_max, tp->snd_nxt)) 21061 tp->snd_nxt = tp->snd_max; 21062 } else { 21063 int end_window = 0; 21064 uint32_t seq = tp->gput_ack; 21065 21066 rsm = tqhash_max(rack->r_ctl.tqh); 21067 if (rsm) { 21068 /* 21069 * Mark the last sent that we just-returned (hinting 21070 * that delayed ack may play a role in any rtt measurement). 
21071 */ 21072 rsm->r_just_ret = 1; 21073 } 21074 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); 21075 rack->r_ctl.rc_agg_delayed = 0; 21076 rack->r_early = 0; 21077 rack->r_late = 0; 21078 rack->r_ctl.rc_agg_early = 0; 21079 if ((ctf_outstanding(tp) + 21080 min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)), 21081 minseg)) >= tp->snd_wnd) { 21082 /* We are limited by the rwnd */ 21083 app_limited = CTF_JR_RWND_LIMITED; 21084 if (IN_FASTRECOVERY(tp->t_flags)) 21085 rack->r_ctl.rc_prr_sndcnt = 0; 21086 } else if (ctf_outstanding(tp) >= sbavail(sb)) { 21087 /* We are limited by whats available -- app limited */ 21088 app_limited = CTF_JR_APP_LIMITED; 21089 if (IN_FASTRECOVERY(tp->t_flags)) 21090 rack->r_ctl.rc_prr_sndcnt = 0; 21091 } else if ((idle == 0) && 21092 ((tp->t_flags & TF_NODELAY) == 0) && 21093 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && 21094 (len < segsiz)) { 21095 /* 21096 * No delay is not on and the 21097 * user is sending less than 1MSS. This 21098 * brings out SWS avoidance so we 21099 * don't send. Another app-limited case. 21100 */ 21101 app_limited = CTF_JR_APP_LIMITED; 21102 } else if (tp->t_flags & TF_NOPUSH) { 21103 /* 21104 * The user has requested no push of 21105 * the last segment and we are 21106 * at the last segment. Another app 21107 * limited case. 21108 */ 21109 app_limited = CTF_JR_APP_LIMITED; 21110 } else if ((ctf_outstanding(tp) + minseg) > cwnd_to_use) { 21111 /* Its the cwnd */ 21112 app_limited = CTF_JR_CWND_LIMITED; 21113 } else if (IN_FASTRECOVERY(tp->t_flags) && 21114 (rack->rack_no_prr == 0) && 21115 (rack->r_ctl.rc_prr_sndcnt < segsiz)) { 21116 app_limited = CTF_JR_PRR; 21117 } else { 21118 /* Now why here are we not sending? */ 21119 #ifdef NOW 21120 #ifdef INVARIANTS 21121 panic("rack:%p hit JR_ASSESSING case cwnd_to_use:%u?", rack, cwnd_to_use); 21122 #endif 21123 #endif 21124 app_limited = CTF_JR_ASSESSING; 21125 } 21126 /* 21127 * App limited in some fashion, for our pacing GP 21128 * measurements we don't want any gap (even cwnd). 21129 * Close down the measurement window. 21130 */ 21131 if (rack_cwnd_block_ends_measure && 21132 ((app_limited == CTF_JR_CWND_LIMITED) || 21133 (app_limited == CTF_JR_PRR))) { 21134 /* 21135 * The reason we are not sending is 21136 * the cwnd (or prr). We have been configured 21137 * to end the measurement window in 21138 * this case. 21139 */ 21140 end_window = 1; 21141 } else if (rack_rwnd_block_ends_measure && 21142 (app_limited == CTF_JR_RWND_LIMITED)) { 21143 /* 21144 * We are rwnd limited and have been 21145 * configured to end the measurement 21146 * window in this case. 21147 */ 21148 end_window = 1; 21149 } else if (app_limited == CTF_JR_APP_LIMITED) { 21150 /* 21151 * A true application limited period, we have 21152 * ran out of data. 21153 */ 21154 end_window = 1; 21155 } else if (app_limited == CTF_JR_ASSESSING) { 21156 /* 21157 * In the assessing case we hit the end of 21158 * the if/else and had no known reason 21159 * This will panic us under invariants.. 21160 * 21161 * If we get this out in logs we need to 21162 * investagate which reason we missed. 21163 */ 21164 end_window = 1; 21165 } 21166 if (end_window) { 21167 uint8_t log = 0; 21168 21169 /* Adjust the Gput measurement */ 21170 if ((tp->t_flags & TF_GPUTINPROG) && 21171 SEQ_GT(tp->gput_ack, tp->snd_max)) { 21172 tp->gput_ack = tp->snd_max; 21173 if ((tp->gput_ack - tp->gput_seq) < (MIN_GP_WIN * segsiz)) { 21174 /* 21175 * There is not enough to measure. 
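 * That is, once gput_ack has been clipped to snd_max, the goodput
 * sample is abandoned whenever the remaining span
 * (gput_ack - gput_seq) covers fewer than MIN_GP_WIN full segments,
 * since that little data cannot produce a usable bandwidth estimate.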
21176 */ 21177 tp->t_flags &= ~TF_GPUTINPROG; 21178 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 21179 rack->r_ctl.rc_gp_srtt /*flex1*/, 21180 tp->gput_seq, 21181 0, 0, 18, __LINE__, NULL, 0); 21182 } else 21183 log = 1; 21184 } 21185 /* Mark the last packet has app limited */ 21186 rsm = tqhash_max(rack->r_ctl.tqh); 21187 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { 21188 if (rack->r_ctl.rc_app_limited_cnt == 0) 21189 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; 21190 else { 21191 /* 21192 * Go out to the end app limited and mark 21193 * this new one as next and move the end_appl up 21194 * to this guy. 21195 */ 21196 if (rack->r_ctl.rc_end_appl) 21197 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; 21198 rack->r_ctl.rc_end_appl = rsm; 21199 } 21200 rsm->r_flags |= RACK_APP_LIMITED; 21201 rack->r_ctl.rc_app_limited_cnt++; 21202 } 21203 if (log) 21204 rack_log_pacing_delay_calc(rack, 21205 rack->r_ctl.rc_app_limited_cnt, seq, 21206 tp->gput_ack, 0, 0, 4, __LINE__, NULL, 0); 21207 } 21208 } 21209 /* Check if we need to go into persists or not */ 21210 if ((tp->snd_max == tp->snd_una) && 21211 TCPS_HAVEESTABLISHED(tp->t_state) && 21212 sbavail(sb) && 21213 (sbavail(sb) > tp->snd_wnd) && 21214 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg))) { 21215 /* Yes lets make sure to move to persist before timer-start */ 21216 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, tp->snd_una); 21217 } 21218 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack); 21219 rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use); 21220 } 21221 #ifdef NETFLIX_SHARED_CWND 21222 if ((sbavail(sb) == 0) && 21223 rack->r_ctl.rc_scw) { 21224 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 21225 rack->rack_scwnd_is_idle = 1; 21226 } 21227 #endif 21228 #ifdef TCP_ACCOUNTING 21229 if (tot_len_this_send > 0) { 21230 crtsc = get_cyclecount(); 21231 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 21232 tp->tcp_cnt_counters[SND_OUT_DATA]++; 21233 } 21234 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 21235 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); 21236 } 21237 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 21238 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) / segsiz); 21239 } 21240 } else { 21241 crtsc = get_cyclecount(); 21242 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 21243 tp->tcp_cnt_counters[SND_LIMITED]++; 21244 } 21245 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 21246 tp->tcp_proc_time[SND_LIMITED] += (crtsc - ts_val); 21247 } 21248 } 21249 sched_unpin(); 21250 #endif 21251 return (0); 21252 21253 send: 21254 if ((rack->r_ctl.crte != NULL) && 21255 (rsm == NULL) && 21256 ((rack->rc_hw_nobuf == 1) || 21257 (rack_hw_check_queue && (check_done == 0)))) { 21258 /* 21259 * We only want to do this once with the hw_check_queue, 21260 * for the enobuf case we would only do it once if 21261 * we come around to again, the flag will be clear. 
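 * If rack_check_queue_level() finds the hardware queue still backed
 * up it returns a non-zero pacing slot and we jump to skip_all_send,
 * deferring this send rather than stacking more data behind an
 * already full interface queue.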
21262 */ 21263 check_done = 1; 21264 slot = rack_check_queue_level(rack, tp, &tv, cts, len, segsiz); 21265 if (slot) { 21266 rack->r_ctl.rc_agg_delayed = 0; 21267 rack->r_ctl.rc_agg_early = 0; 21268 rack->r_early = 0; 21269 rack->r_late = 0; 21270 SOCKBUF_UNLOCK(&so->so_snd); 21271 goto skip_all_send; 21272 } 21273 } 21274 if (rsm || sack_rxmit) 21275 counter_u64_add(rack_nfto_resend, 1); 21276 else 21277 counter_u64_add(rack_non_fto_send, 1); 21278 if ((flags & TH_FIN) && 21279 sbavail(sb)) { 21280 /* 21281 * We do not transmit a FIN 21282 * with data outstanding. We 21283 * need to make it so all data 21284 * is acked first. 21285 */ 21286 flags &= ~TH_FIN; 21287 } 21288 /* Enforce stack imposed max seg size if we have one */ 21289 if (rack->r_ctl.rc_pace_max_segs && 21290 (len > rack->r_ctl.rc_pace_max_segs)) { 21291 mark = 1; 21292 len = rack->r_ctl.rc_pace_max_segs; 21293 } 21294 SOCKBUF_LOCK_ASSERT(sb); 21295 if (len > 0) { 21296 if (len >= segsiz) 21297 tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; 21298 else 21299 tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; 21300 } 21301 /* 21302 * Before ESTABLISHED, force sending of initial options unless TCP 21303 * set not to do any options. NOTE: we assume that the IP/TCP header 21304 * plus TCP options always fit in a single mbuf, leaving room for a 21305 * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr) 21306 * + optlen <= MCLBYTES 21307 */ 21308 optlen = 0; 21309 #ifdef INET6 21310 if (isipv6) 21311 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 21312 else 21313 #endif 21314 hdrlen = sizeof(struct tcpiphdr); 21315 21316 /* 21317 * Compute options for segment. We only have to care about SYN and 21318 * established connection segments. Options for SYN-ACK segments 21319 * are handled in TCP syncache. 21320 */ 21321 to.to_flags = 0; 21322 if ((tp->t_flags & TF_NOOPT) == 0) { 21323 /* Maximum segment size. */ 21324 if (flags & TH_SYN) { 21325 tp->snd_nxt = tp->iss; 21326 to.to_mss = tcp_mssopt(&inp->inp_inc); 21327 if (tp->t_port) 21328 to.to_mss -= V_tcp_udp_tunneling_overhead; 21329 to.to_flags |= TOF_MSS; 21330 21331 /* 21332 * On SYN or SYN|ACK transmits on TFO connections, 21333 * only include the TFO option if it is not a 21334 * retransmit, as the presence of the TFO option may 21335 * have caused the original SYN or SYN|ACK to have 21336 * been dropped by a middlebox. 21337 */ 21338 if (IS_FASTOPEN(tp->t_flags) && 21339 (tp->t_rxtshift == 0)) { 21340 if (tp->t_state == TCPS_SYN_RECEIVED) { 21341 to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; 21342 to.to_tfo_cookie = 21343 (u_int8_t *)&tp->t_tfo_cookie.server; 21344 to.to_flags |= TOF_FASTOPEN; 21345 wanted_cookie = 1; 21346 } else if (tp->t_state == TCPS_SYN_SENT) { 21347 to.to_tfo_len = 21348 tp->t_tfo_client_cookie_len; 21349 to.to_tfo_cookie = 21350 tp->t_tfo_cookie.client; 21351 to.to_flags |= TOF_FASTOPEN; 21352 wanted_cookie = 1; 21353 /* 21354 * If we wind up having more data to 21355 * send with the SYN than can fit in 21356 * one segment, don't send any more 21357 * until the SYN|ACK comes back from 21358 * the other end. 21359 */ 21360 sendalot = 0; 21361 } 21362 } 21363 } 21364 /* Window scaling. */ 21365 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { 21366 to.to_wscale = tp->request_r_scale; 21367 to.to_flags |= TOF_SCALE; 21368 } 21369 /* Timestamps. 
*/ 21370 if ((tp->t_flags & TF_RCVD_TSTMP) || 21371 ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { 21372 to.to_tsval = ms_cts + tp->ts_offset; 21373 to.to_tsecr = tp->ts_recent; 21374 to.to_flags |= TOF_TS; 21375 } 21376 /* Set receive buffer autosizing timestamp. */ 21377 if (tp->rfbuf_ts == 0 && 21378 (so->so_rcv.sb_flags & SB_AUTOSIZE)) 21379 tp->rfbuf_ts = tcp_ts_getticks(); 21380 /* Selective ACK's. */ 21381 if (tp->t_flags & TF_SACK_PERMIT) { 21382 if (flags & TH_SYN) 21383 to.to_flags |= TOF_SACKPERM; 21384 else if (TCPS_HAVEESTABLISHED(tp->t_state) && 21385 tp->rcv_numsacks > 0) { 21386 to.to_flags |= TOF_SACK; 21387 to.to_nsacks = tp->rcv_numsacks; 21388 to.to_sacks = (u_char *)tp->sackblks; 21389 } 21390 } 21391 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 21392 /* TCP-MD5 (RFC2385). */ 21393 if (tp->t_flags & TF_SIGNATURE) 21394 to.to_flags |= TOF_SIGNATURE; 21395 #endif 21396 21397 /* Processing the options. */ 21398 hdrlen += optlen = tcp_addoptions(&to, opt); 21399 /* 21400 * If we wanted a TFO option to be added, but it was unable 21401 * to fit, ensure no data is sent. 21402 */ 21403 if (IS_FASTOPEN(tp->t_flags) && wanted_cookie && 21404 !(to.to_flags & TOF_FASTOPEN)) 21405 len = 0; 21406 } 21407 if (tp->t_port) { 21408 if (V_tcp_udp_tunneling_port == 0) { 21409 /* The port was removed?? */ 21410 SOCKBUF_UNLOCK(&so->so_snd); 21411 #ifdef TCP_ACCOUNTING 21412 crtsc = get_cyclecount(); 21413 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 21414 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 21415 } 21416 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 21417 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 21418 } 21419 sched_unpin(); 21420 #endif 21421 return (EHOSTUNREACH); 21422 } 21423 hdrlen += sizeof(struct udphdr); 21424 } 21425 #ifdef INET6 21426 if (isipv6) 21427 ipoptlen = ip6_optlen(inp); 21428 else 21429 #endif 21430 if (inp->inp_options) 21431 ipoptlen = inp->inp_options->m_len - 21432 offsetof(struct ipoption, ipopt_list); 21433 else 21434 ipoptlen = 0; 21435 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 21436 ipoptlen += ipsec_optlen; 21437 #endif 21438 21439 /* 21440 * Adjust data length if insertion of options will bump the packet 21441 * length beyond the t_maxseg length. Clear the FIN bit because we 21442 * cut off the tail of the segment. 
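 * For a rough feel of the TSO clamp below (numbers are illustrative
 * only): with if_hw_tsomax = 65535, hdrlen = 52 and max_linkhdr = 16
 * a burst is first capped near 65467 bytes and then trimmed back to
 * a multiple of (t_maxseg - optlen) unless this send reaches the end
 * of the socket buffer, so only the final segment of the burst may
 * be fractional.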
21443 */ 21444 if (len + optlen + ipoptlen > tp->t_maxseg) { 21445 if (tso) { 21446 uint32_t if_hw_tsomax; 21447 uint32_t moff; 21448 int32_t max_len; 21449 21450 /* extract TSO information */ 21451 if_hw_tsomax = tp->t_tsomax; 21452 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 21453 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 21454 KASSERT(ipoptlen == 0, 21455 ("%s: TSO can't do IP options", __func__)); 21456 21457 /* 21458 * Check if we should limit by maximum payload 21459 * length: 21460 */ 21461 if (if_hw_tsomax != 0) { 21462 /* compute maximum TSO length */ 21463 max_len = (if_hw_tsomax - hdrlen - 21464 max_linkhdr); 21465 if (max_len <= 0) { 21466 len = 0; 21467 } else if (len > max_len) { 21468 sendalot = 1; 21469 len = max_len; 21470 mark = 2; 21471 } 21472 } 21473 /* 21474 * Prevent the last segment from being fractional 21475 * unless the send sockbuf can be emptied: 21476 */ 21477 max_len = (tp->t_maxseg - optlen); 21478 if ((sb_offset + len) < sbavail(sb)) { 21479 moff = len % (u_int)max_len; 21480 if (moff != 0) { 21481 mark = 3; 21482 len -= moff; 21483 } 21484 } 21485 /* 21486 * In case there are too many small fragments don't 21487 * use TSO: 21488 */ 21489 if (len <= max_len) { 21490 mark = 4; 21491 tso = 0; 21492 } 21493 /* 21494 * Send the FIN in a separate segment after the bulk 21495 * sending is done. We don't trust the TSO 21496 * implementations to clear the FIN flag on all but 21497 * the last segment. 21498 */ 21499 if (tp->t_flags & TF_NEEDFIN) { 21500 sendalot = 4; 21501 } 21502 } else { 21503 mark = 5; 21504 if (optlen + ipoptlen >= tp->t_maxseg) { 21505 /* 21506 * Since we don't have enough space to put 21507 * the IP header chain and the TCP header in 21508 * one packet as required by RFC 7112, don't 21509 * send it. Also ensure that at least one 21510 * byte of the payload can be put into the 21511 * TCP segment. 21512 */ 21513 SOCKBUF_UNLOCK(&so->so_snd); 21514 error = EMSGSIZE; 21515 sack_rxmit = 0; 21516 goto out; 21517 } 21518 len = tp->t_maxseg - optlen - ipoptlen; 21519 sendalot = 5; 21520 } 21521 } else { 21522 tso = 0; 21523 mark = 6; 21524 } 21525 KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, 21526 ("%s: len > IP_MAXPACKET", __func__)); 21527 #ifdef DIAGNOSTIC 21528 #ifdef INET6 21529 if (max_linkhdr + hdrlen > MCLBYTES) 21530 #else 21531 if (max_linkhdr + hdrlen > MHLEN) 21532 #endif 21533 panic("tcphdr too big"); 21534 #endif 21535 21536 /* 21537 * This KASSERT is here to catch edge cases at a well defined place. 21538 * Before, those had triggered (random) panic conditions further 21539 * down. 21540 */ 21541 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 21542 if ((len == 0) && 21543 (flags & TH_FIN) && 21544 (sbused(sb))) { 21545 /* 21546 * We have outstanding data, don't send a fin by itself!. 21547 */ 21548 goto just_return; 21549 } 21550 /* 21551 * Grab a header mbuf, attaching a copy of data to be transmitted, 21552 * and initialize the header from the template for sends on this 21553 * connection. 21554 */ 21555 hw_tls = tp->t_nic_ktls_xmit != 0; 21556 if (len) { 21557 uint32_t max_val; 21558 uint32_t moff; 21559 21560 if (rack->r_ctl.rc_pace_max_segs) 21561 max_val = rack->r_ctl.rc_pace_max_segs; 21562 else if (rack->rc_user_set_max_segs) 21563 max_val = rack->rc_user_set_max_segs * segsiz; 21564 else 21565 max_val = len; 21566 /* 21567 * We allow a limit on sending with hptsi. 
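 * For example (values are illustrative): with no explicit
 * rc_pace_max_segs, rc_user_set_max_segs = 40 and segsiz = 1448,
 * max_val works out to 57920 bytes, so one pacing slot never puts
 * more than 40 segments' worth of data on the wire at once.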
21568 */ 21569 if (len > max_val) { 21570 mark = 7; 21571 len = max_val; 21572 } 21573 #ifdef INET6 21574 if (MHLEN < hdrlen + max_linkhdr) 21575 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 21576 else 21577 #endif 21578 m = m_gethdr(M_NOWAIT, MT_DATA); 21579 21580 if (m == NULL) { 21581 SOCKBUF_UNLOCK(sb); 21582 error = ENOBUFS; 21583 sack_rxmit = 0; 21584 goto out; 21585 } 21586 m->m_data += max_linkhdr; 21587 m->m_len = hdrlen; 21588 21589 /* 21590 * Start the m_copy functions from the closest mbuf to the 21591 * sb_offset in the socket buffer chain. 21592 */ 21593 mb = sbsndptr_noadv(sb, sb_offset, &moff); 21594 s_mb = mb; 21595 s_moff = moff; 21596 if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) { 21597 m_copydata(mb, moff, (int)len, 21598 mtod(m, caddr_t)+hdrlen); 21599 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 21600 sbsndptr_adv(sb, mb, len); 21601 m->m_len += len; 21602 } else { 21603 struct sockbuf *msb; 21604 21605 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 21606 msb = NULL; 21607 else 21608 msb = sb; 21609 m->m_next = tcp_m_copym( 21610 mb, moff, &len, 21611 if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, 21612 ((rsm == NULL) ? hw_tls : 0) 21613 #ifdef NETFLIX_COPY_ARGS 21614 , &s_mb, &s_moff 21615 #endif 21616 ); 21617 if (len <= (tp->t_maxseg - optlen)) { 21618 /* 21619 * Must have ran out of mbufs for the copy 21620 * shorten it to no longer need tso. Lets 21621 * not put on sendalot since we are low on 21622 * mbufs. 21623 */ 21624 tso = 0; 21625 } 21626 if (m->m_next == NULL) { 21627 SOCKBUF_UNLOCK(sb); 21628 (void)m_free(m); 21629 error = ENOBUFS; 21630 sack_rxmit = 0; 21631 goto out; 21632 } 21633 } 21634 if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { 21635 if (rsm && (rsm->r_flags & RACK_TLP)) { 21636 /* 21637 * TLP should not count in retran count, but 21638 * in its own bin 21639 */ 21640 counter_u64_add(rack_tlp_retran, 1); 21641 counter_u64_add(rack_tlp_retran_bytes, len); 21642 } else { 21643 tp->t_sndrexmitpack++; 21644 KMOD_TCPSTAT_INC(tcps_sndrexmitpack); 21645 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len); 21646 } 21647 #ifdef STATS 21648 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 21649 len); 21650 #endif 21651 } else { 21652 KMOD_TCPSTAT_INC(tcps_sndpack); 21653 KMOD_TCPSTAT_ADD(tcps_sndbyte, len); 21654 #ifdef STATS 21655 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 21656 len); 21657 #endif 21658 } 21659 /* 21660 * If we're sending everything we've got, set PUSH. (This 21661 * will keep happy those implementations which only give 21662 * data to the user when a buffer fills or a PUSH comes in.) 
21663 */ 21664 if (sb_offset + len == sbused(sb) && 21665 sbused(sb) && 21666 !(flags & TH_SYN)) { 21667 flags |= TH_PUSH; 21668 add_flag |= RACK_HAD_PUSH; 21669 } 21670 21671 SOCKBUF_UNLOCK(sb); 21672 } else { 21673 SOCKBUF_UNLOCK(sb); 21674 if (tp->t_flags & TF_ACKNOW) 21675 KMOD_TCPSTAT_INC(tcps_sndacks); 21676 else if (flags & (TH_SYN | TH_FIN | TH_RST)) 21677 KMOD_TCPSTAT_INC(tcps_sndctrl); 21678 else 21679 KMOD_TCPSTAT_INC(tcps_sndwinup); 21680 21681 m = m_gethdr(M_NOWAIT, MT_DATA); 21682 if (m == NULL) { 21683 error = ENOBUFS; 21684 sack_rxmit = 0; 21685 goto out; 21686 } 21687 #ifdef INET6 21688 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && 21689 MHLEN >= hdrlen) { 21690 M_ALIGN(m, hdrlen); 21691 } else 21692 #endif 21693 m->m_data += max_linkhdr; 21694 m->m_len = hdrlen; 21695 } 21696 SOCKBUF_UNLOCK_ASSERT(sb); 21697 m->m_pkthdr.rcvif = (struct ifnet *)0; 21698 #ifdef MAC 21699 mac_inpcb_create_mbuf(inp, m); 21700 #endif 21701 if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) { 21702 #ifdef INET6 21703 if (isipv6) 21704 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 21705 else 21706 #endif /* INET6 */ 21707 #ifdef INET 21708 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 21709 #endif 21710 th = rack->r_ctl.fsb.th; 21711 udp = rack->r_ctl.fsb.udp; 21712 if (udp) { 21713 #ifdef INET6 21714 if (isipv6) 21715 ulen = hdrlen + len - sizeof(struct ip6_hdr); 21716 else 21717 #endif /* INET6 */ 21718 ulen = hdrlen + len - sizeof(struct ip); 21719 udp->uh_ulen = htons(ulen); 21720 } 21721 } else { 21722 #ifdef INET6 21723 if (isipv6) { 21724 ip6 = mtod(m, struct ip6_hdr *); 21725 if (tp->t_port) { 21726 udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr)); 21727 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 21728 udp->uh_dport = tp->t_port; 21729 ulen = hdrlen + len - sizeof(struct ip6_hdr); 21730 udp->uh_ulen = htons(ulen); 21731 th = (struct tcphdr *)(udp + 1); 21732 } else 21733 th = (struct tcphdr *)(ip6 + 1); 21734 tcpip_fillheaders(inp, tp->t_port, ip6, th); 21735 } else 21736 #endif /* INET6 */ 21737 { 21738 #ifdef INET 21739 ip = mtod(m, struct ip *); 21740 if (tp->t_port) { 21741 udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip)); 21742 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 21743 udp->uh_dport = tp->t_port; 21744 ulen = hdrlen + len - sizeof(struct ip); 21745 udp->uh_ulen = htons(ulen); 21746 th = (struct tcphdr *)(udp + 1); 21747 } else 21748 th = (struct tcphdr *)(ip + 1); 21749 tcpip_fillheaders(inp, tp->t_port, ip, th); 21750 #endif 21751 } 21752 } 21753 /* 21754 * Fill in fields, remembering maximum advertised window for use in 21755 * delaying messages about window sizes. If resending a FIN, be sure 21756 * not to use a new sequence number. 21757 */ 21758 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 21759 tp->snd_nxt == tp->snd_max) 21760 tp->snd_nxt--; 21761 /* 21762 * If we are starting a connection, send ECN setup SYN packet. If we 21763 * are on a retransmit, we may resend those bits a number of times 21764 * as per RFC 3168. 
21765 */ 21766 if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) { 21767 flags |= tcp_ecn_output_syn_sent(tp); 21768 } 21769 /* Also handle parallel SYN for ECN */ 21770 if (TCPS_HAVERCVDSYN(tp->t_state) && 21771 (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { 21772 int ect = tcp_ecn_output_established(tp, &flags, len, sack_rxmit); 21773 if ((tp->t_state == TCPS_SYN_RECEIVED) && 21774 (tp->t_flags2 & TF2_ECN_SND_ECE)) 21775 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 21776 #ifdef INET6 21777 if (isipv6) { 21778 ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20); 21779 ip6->ip6_flow |= htonl(ect << 20); 21780 } 21781 else 21782 #endif 21783 { 21784 #ifdef INET 21785 ip->ip_tos &= ~IPTOS_ECN_MASK; 21786 ip->ip_tos |= ect; 21787 #endif 21788 } 21789 } 21790 /* 21791 * If we are doing retransmissions, then snd_nxt will not reflect 21792 * the first unsent octet. For ACK only packets, we do not want the 21793 * sequence number of the retransmitted packet, we want the sequence 21794 * number of the next unsent octet. So, if there is no data (and no 21795 * SYN or FIN), use snd_max instead of snd_nxt when filling in 21796 * ti_seq. But if we are in persist state, snd_max might reflect 21797 * one byte beyond the right edge of the window, so use snd_nxt in 21798 * that case, since we know we aren't doing a retransmission. 21799 * (retransmit and persist are mutually exclusive...) 21800 */ 21801 if (sack_rxmit == 0) { 21802 if (len || (flags & (TH_SYN | TH_FIN))) { 21803 th->th_seq = htonl(tp->snd_nxt); 21804 rack_seq = tp->snd_nxt; 21805 } else { 21806 th->th_seq = htonl(tp->snd_max); 21807 rack_seq = tp->snd_max; 21808 } 21809 } else { 21810 th->th_seq = htonl(rsm->r_start); 21811 rack_seq = rsm->r_start; 21812 } 21813 th->th_ack = htonl(tp->rcv_nxt); 21814 tcp_set_flags(th, flags); 21815 /* 21816 * Calculate receive window. Don't shrink window, but avoid silly 21817 * window syndrome. 21818 * If a RST segment is sent, advertise a window of zero. 21819 */ 21820 if (flags & TH_RST) { 21821 recwin = 0; 21822 } else { 21823 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && 21824 recwin < (long)segsiz) { 21825 recwin = 0; 21826 } 21827 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && 21828 recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) 21829 recwin = (long)(tp->rcv_adv - tp->rcv_nxt); 21830 } 21831 21832 /* 21833 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or 21834 * <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> case is 21835 * handled in syncache. 21836 */ 21837 if (flags & TH_SYN) 21838 th->th_win = htons((u_short) 21839 (min(sbspace(&so->so_rcv), TCP_MAXWIN))); 21840 else { 21841 /* Avoid shrinking window with window scaling. */ 21842 recwin = roundup2(recwin, 1 << tp->rcv_scale); 21843 th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); 21844 } 21845 /* 21846 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 21847 * window. This may cause the remote transmitter to stall. This 21848 * flag tells soreceive() to disable delayed acknowledgements when 21849 * draining the buffer. This can occur if the receiver is 21850 * attempting to read more data than can be buffered prior to 21851 * transmitting on the connection. 
21852  */
21853 	if (th->th_win == 0) {
21854 		tp->t_sndzerowin++;
21855 		tp->t_flags |= TF_RXWIN0SENT;
21856 	} else
21857 		tp->t_flags &= ~TF_RXWIN0SENT;
21858 	tp->snd_up = tp->snd_una;	/* drag it along, it's deprecated */
21859 	/* Now, are we using the fsb? If so, copy the template data to the mbuf. */
21860 	if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) {
21861 		uint8_t *cpto;
21862 
21863 		cpto = mtod(m, uint8_t *);
21864 		memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len);
21865 		/*
21866 		 * We have just copied in:
21867 		 * IP/IP6
21868 		 * <optional udphdr>
21869 		 * tcphdr (no options)
21870 		 *
21871 		 * We need to grab the correct pointers into the mbuf
21872 		 * for both the tcp header, and possibly the udp header (if tunneling).
21873 		 * We do this by using the offset in the copy buffer and adding it
21874 		 * to the mbuf base pointer (cpto).
21875 		 */
21876 #ifdef INET6
21877 		if (isipv6)
21878 			ip6 = mtod(m, struct ip6_hdr *);
21879 		else
21880 #endif				/* INET6 */
21881 #ifdef INET
21882 			ip = mtod(m, struct ip *);
21883 #endif
21884 		th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr));
21885 		/* If we have a udp header, let's set it into the mbuf as well. */
21886 		if (udp)
21887 			udp = (struct udphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.udp - rack->r_ctl.fsb.tcp_ip_hdr));
21888 	}
21889 	if (optlen) {
21890 		bcopy(opt, th + 1, optlen);
21891 		th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
21892 	}
21893 	/*
21894 	 * Put TCP length in extended header, and then checksum extended
21895 	 * header and data.
21896 	 */
21897 	m->m_pkthdr.len = hdrlen + len;	/* in6_cksum() needs this */
21898 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
21899 	if (to.to_flags & TOF_SIGNATURE) {
21900 		/*
21901 		 * Calculate MD5 signature and put it into the place
21902 		 * determined before.
21903 		 * NOTE: since TCP options buffer doesn't point into
21904 		 * mbuf's data, calculate offset and use it.
21905 		 */
21906 		if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th,
21907 		    (u_char *)(th + 1) + (to.to_signature - opt)) != 0) {
21908 			/*
21909 			 * Do not send segment if the calculation of MD5
21910 			 * digest has failed.
21911 			 */
21912 			goto out;
21913 		}
21914 	}
21915 #endif
21916 #ifdef INET6
21917 	if (isipv6) {
21918 		/*
21919 		 * ip6_plen does not need to be filled now, and will be filled
21920 		 * in ip6_output.
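		 * Only the pseudo-header portion of the checksum is computed
		 * here; the csum_flags/csum_data set below tell the rest of
		 * the stack (or the NIC, with offload) where to finish it.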
21921 */ 21922 if (tp->t_port) { 21923 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 21924 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 21925 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 21926 th->th_sum = htons(0); 21927 UDPSTAT_INC(udps_opackets); 21928 } else { 21929 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 21930 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 21931 th->th_sum = in6_cksum_pseudo(ip6, 21932 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 21933 0); 21934 } 21935 } 21936 #endif 21937 #if defined(INET6) && defined(INET) 21938 else 21939 #endif 21940 #ifdef INET 21941 { 21942 if (tp->t_port) { 21943 m->m_pkthdr.csum_flags = CSUM_UDP; 21944 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 21945 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 21946 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 21947 th->th_sum = htons(0); 21948 UDPSTAT_INC(udps_opackets); 21949 } else { 21950 m->m_pkthdr.csum_flags = CSUM_TCP; 21951 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 21952 th->th_sum = in_pseudo(ip->ip_src.s_addr, 21953 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 21954 IPPROTO_TCP + len + optlen)); 21955 } 21956 /* IP version must be set here for ipv4/ipv6 checking later */ 21957 KASSERT(ip->ip_v == IPVERSION, 21958 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 21959 } 21960 #endif 21961 /* 21962 * Enable TSO and specify the size of the segments. The TCP pseudo 21963 * header checksum is always provided. XXX: Fixme: This is currently 21964 * not the case for IPv6. 21965 */ 21966 if (tso) { 21967 /* 21968 * Here we must use t_maxseg and the optlen since 21969 * the optlen may include SACK's (or DSACK). 21970 */ 21971 KASSERT(len > tp->t_maxseg - optlen, 21972 ("%s: len <= tso_segsz", __func__)); 21973 m->m_pkthdr.csum_flags |= CSUM_TSO; 21974 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; 21975 } 21976 KASSERT(len + hdrlen == m_length(m, NULL), 21977 ("%s: mbuf chain different than expected: %d + %u != %u", 21978 __func__, len, hdrlen, m_length(m, NULL))); 21979 21980 #ifdef TCP_HHOOK 21981 /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ 21982 hhook_run_tcp_est_out(tp, th, &to, len, tso); 21983 #endif 21984 if ((rack->r_ctl.crte != NULL) && 21985 (rack->rc_hw_nobuf == 0) && 21986 tcp_bblogging_on(tp)) { 21987 rack_log_queue_level(tp, rack, len, &tv, cts); 21988 } 21989 /* We're getting ready to send; log now. 
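	 * The BB log entry below captures the pacing state (PRR send count,
	 * min/max pace segments), the early/late aggregates, and the
	 * mark/pass debug values in flex7.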
*/ 21990 if (tcp_bblogging_on(rack->rc_tp)) { 21991 union tcp_log_stackspecific log; 21992 21993 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 21994 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 21995 if (rack->rack_no_prr) 21996 log.u_bbr.flex1 = 0; 21997 else 21998 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 21999 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 22000 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 22001 log.u_bbr.flex4 = orig_len; 22002 /* Save off the early/late values */ 22003 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 22004 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 22005 log.u_bbr.bw_inuse = rack_get_bw(rack); 22006 log.u_bbr.cur_del_rate = rack->r_ctl.gp_bw; 22007 log.u_bbr.flex8 = 0; 22008 if (rsm) { 22009 if (rsm->r_flags & RACK_RWND_COLLAPSED) { 22010 rack_log_collapse(rack, rsm->r_start, rsm->r_end, 0, __LINE__, 5, rsm->r_flags, rsm); 22011 counter_u64_add(rack_collapsed_win_rxt, 1); 22012 counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start)); 22013 } 22014 if (doing_tlp) 22015 log.u_bbr.flex8 = 2; 22016 else 22017 log.u_bbr.flex8 = 1; 22018 } else { 22019 if (doing_tlp) 22020 log.u_bbr.flex8 = 3; 22021 } 22022 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); 22023 log.u_bbr.flex7 = mark; 22024 log.u_bbr.flex7 <<= 8; 22025 log.u_bbr.flex7 |= pass; 22026 log.u_bbr.pkts_out = tp->t_maxseg; 22027 log.u_bbr.timeStamp = cts; 22028 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 22029 if (rsm && (rsm->r_rtr_cnt > 0)) { 22030 /* 22031 * When we have a retransmit we want to log the 22032 * burst at send and flight at send from before. 22033 */ 22034 log.u_bbr.flex5 = rsm->r_fas; 22035 log.u_bbr.bbr_substate = rsm->r_bas; 22036 } else { 22037 /* 22038 * New transmits we log in flex5 the inflight again as 22039 * well as the number of segments in our send in the 22040 * substate field. 22041 */ 22042 log.u_bbr.flex5 = log.u_bbr.inflight; 22043 log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz); 22044 } 22045 log.u_bbr.lt_epoch = cwnd_to_use; 22046 log.u_bbr.delivered = sendalot; 22047 log.u_bbr.rttProp = (uint64_t)rsm; 22048 log.u_bbr.pkt_epoch = __LINE__; 22049 if (rsm) { 22050 log.u_bbr.delRate = rsm->r_flags; 22051 log.u_bbr.delRate <<= 31; 22052 log.u_bbr.delRate |= rack->r_must_retran; 22053 log.u_bbr.delRate <<= 1; 22054 log.u_bbr.delRate |= (sack_rxmit & 0x00000001); 22055 } else { 22056 log.u_bbr.delRate = rack->r_must_retran; 22057 log.u_bbr.delRate <<= 1; 22058 log.u_bbr.delRate |= (sack_rxmit & 0x00000001); 22059 } 22060 lgb = tcp_log_event(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, 22061 len, &log, false, NULL, __func__, __LINE__, &tv); 22062 } else 22063 lgb = NULL; 22064 22065 /* 22066 * Fill in IP length and desired time to live and send to IP level. 22067 * There should be a better way to handle ttl and tos; we could keep 22068 * them in the template, but need a way to checksum without them. 22069 */ 22070 /* 22071 * m->m_pkthdr.len should have been set before cksum calcuration, 22072 * because in6_cksum() need it. 22073 */ 22074 #ifdef INET6 22075 if (isipv6) { 22076 /* 22077 * we separately set hoplimit for every segment, since the 22078 * user might want to change the value via setsockopt. Also, 22079 * desired default hop limit might be changed via Neighbor 22080 * Discovery. 22081 */ 22082 rack->r_ctl.fsb.hoplimit = ip6->ip6_hlim = in6_selecthlim(inp, NULL); 22083 22084 /* 22085 * Set the packet size here for the benefit of DTrace 22086 * probes. 
ip6_output() will set it properly; it's supposed 22087 * to include the option header lengths as well. 22088 */ 22089 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 22090 22091 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 22092 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 22093 else 22094 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 22095 22096 if (tp->t_state == TCPS_SYN_SENT) 22097 TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); 22098 22099 TCP_PROBE5(send, NULL, tp, ip6, tp, th); 22100 /* TODO: IPv6 IP6TOS_ECT bit on */ 22101 error = ip6_output(m, 22102 inp->in6p_outputopts, 22103 &inp->inp_route6, 22104 ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 22105 NULL, NULL, inp); 22106 22107 if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL) 22108 mtu = inp->inp_route6.ro_nh->nh_mtu; 22109 } 22110 #endif /* INET6 */ 22111 #if defined(INET) && defined(INET6) 22112 else 22113 #endif 22114 #ifdef INET 22115 { 22116 ip->ip_len = htons(m->m_pkthdr.len); 22117 #ifdef INET6 22118 if (inp->inp_vflag & INP_IPV6PROTO) 22119 ip->ip_ttl = in6_selecthlim(inp, NULL); 22120 #endif /* INET6 */ 22121 rack->r_ctl.fsb.hoplimit = ip->ip_ttl; 22122 /* 22123 * If we do path MTU discovery, then we set DF on every 22124 * packet. This might not be the best thing to do according 22125 * to RFC3390 Section 2. However the tcp hostcache migitates 22126 * the problem so it affects only the first tcp connection 22127 * with a host. 22128 * 22129 * NB: Don't set DF on small MTU/MSS to have a safe 22130 * fallback. 22131 */ 22132 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 22133 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 22134 if (tp->t_port == 0 || len < V_tcp_minmss) { 22135 ip->ip_off |= htons(IP_DF); 22136 } 22137 } else { 22138 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 22139 } 22140 22141 if (tp->t_state == TCPS_SYN_SENT) 22142 TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); 22143 22144 TCP_PROBE5(send, NULL, tp, ip, tp, th); 22145 22146 error = ip_output(m, 22147 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 22148 inp->inp_options, 22149 #else 22150 NULL, 22151 #endif 22152 &inp->inp_route, 22153 ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0, 22154 inp); 22155 if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL) 22156 mtu = inp->inp_route.ro_nh->nh_mtu; 22157 } 22158 #endif /* INET */ 22159 22160 out: 22161 if (lgb) { 22162 lgb->tlb_errno = error; 22163 lgb = NULL; 22164 } 22165 /* 22166 * In transmit state, time the transmission and arrange for the 22167 * retransmit. In persist state, just set snd_max. 22168 */ 22169 rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, 22170 rack_to_usec_ts(&tv), 22171 rsm, add_flag, s_mb, s_moff, hw_tls, segsiz); 22172 if (error == 0) { 22173 if (rsm == NULL) { 22174 if (rack->lt_bw_up == 0) { 22175 rack->r_ctl.lt_timemark = tcp_tv_to_lusectick(&tv); 22176 rack->r_ctl.lt_seq = tp->snd_una; 22177 rack->lt_bw_up = 1; 22178 } else if (((rack_seq + len) - rack->r_ctl.lt_seq) > 0x7fffffff) { 22179 /* 22180 * Need to record what we have since we are 22181 * approaching seq wrap. 
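				 * The lt_ fields feed the long-term b/w estimate
				 * (roughly lt_bw_bytes / lt_bw_time), so fold in
				 * what we have before resetting lt_seq and
				 * lt_timemark.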
22182 */ 22183 uint64_t tmark; 22184 22185 rack->r_ctl.lt_bw_bytes += (tp->snd_una - rack->r_ctl.lt_seq); 22186 rack->r_ctl.lt_seq = tp->snd_una; 22187 tmark = tcp_tv_to_lusectick(&tv); 22188 rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark); 22189 rack->r_ctl.lt_timemark = tmark; 22190 } 22191 } 22192 rack->forced_ack = 0; /* If we send something zap the FA flag */ 22193 counter_u64_add(rack_total_bytes, len); 22194 tcp_account_for_send(tp, len, (rsm != NULL), doing_tlp, hw_tls); 22195 if (rsm && doing_tlp) { 22196 rack->rc_last_sent_tlp_past_cumack = 0; 22197 rack->rc_last_sent_tlp_seq_valid = 1; 22198 rack->r_ctl.last_sent_tlp_seq = rsm->r_start; 22199 rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start; 22200 } 22201 if (rack->rc_hw_nobuf) { 22202 rack->rc_hw_nobuf = 0; 22203 rack->r_ctl.rc_agg_delayed = 0; 22204 rack->r_early = 0; 22205 rack->r_late = 0; 22206 rack->r_ctl.rc_agg_early = 0; 22207 } 22208 if (rsm && (doing_tlp == 0)) { 22209 /* Set we retransmitted */ 22210 rack->rc_gp_saw_rec = 1; 22211 } else { 22212 if (cwnd_to_use > tp->snd_ssthresh) { 22213 /* Set we sent in CA */ 22214 rack->rc_gp_saw_ca = 1; 22215 } else { 22216 /* Set we sent in SS */ 22217 rack->rc_gp_saw_ss = 1; 22218 } 22219 } 22220 if (TCPS_HAVEESTABLISHED(tp->t_state) && 22221 (tp->t_flags & TF_SACK_PERMIT) && 22222 tp->rcv_numsacks > 0) 22223 tcp_clean_dsack_blocks(tp); 22224 tot_len_this_send += len; 22225 if (len == 0) { 22226 counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); 22227 } else { 22228 int idx; 22229 22230 idx = (len / segsiz) + 3; 22231 if (idx >= TCP_MSS_ACCT_ATIMER) 22232 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 22233 else 22234 counter_u64_add(rack_out_size[idx], 1); 22235 } 22236 } 22237 if ((rack->rack_no_prr == 0) && 22238 sub_from_prr && 22239 (error == 0)) { 22240 if (rack->r_ctl.rc_prr_sndcnt >= len) 22241 rack->r_ctl.rc_prr_sndcnt -= len; 22242 else 22243 rack->r_ctl.rc_prr_sndcnt = 0; 22244 } 22245 sub_from_prr = 0; 22246 if (doing_tlp) { 22247 /* Make sure the TLP is added */ 22248 add_flag |= RACK_TLP; 22249 } else if (rsm) { 22250 /* If its a resend without TLP then it must not have the flag */ 22251 rsm->r_flags &= ~RACK_TLP; 22252 } 22253 22254 22255 if ((error == 0) && 22256 (len > 0) && 22257 (tp->snd_una == tp->snd_max)) 22258 rack->r_ctl.rc_tlp_rxt_last_time = cts; 22259 { 22260 tcp_seq startseq = tp->snd_nxt; 22261 22262 /* Track our lost count */ 22263 if (rsm && (doing_tlp == 0)) 22264 rack->r_ctl.rc_loss_count += rsm->r_end - rsm->r_start; 22265 /* 22266 * Advance snd_nxt over sequence space of this segment. 22267 */ 22268 if (error) 22269 /* We don't log or do anything with errors */ 22270 goto nomore; 22271 if (doing_tlp == 0) { 22272 if (rsm == NULL) { 22273 /* 22274 * Not a retransmission of some 22275 * sort, new data is going out so 22276 * clear our TLP count and flag. 22277 */ 22278 rack->rc_tlp_in_progress = 0; 22279 rack->r_ctl.rc_tlp_cnt_out = 0; 22280 } 22281 } else { 22282 /* 22283 * We have just sent a TLP, mark that it is true 22284 * and make sure our in progress is set so we 22285 * continue to check the count. 
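			 * rc_tlp_cnt_out counts how many TLPs we have sent
			 * back to back; it is zeroed above when new data
			 * (not a retransmission) goes out.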
22286 			 */
22287 			rack->rc_tlp_in_progress = 1;
22288 			rack->r_ctl.rc_tlp_cnt_out++;
22289 		}
22290 		if (flags & (TH_SYN | TH_FIN)) {
22291 			if (flags & TH_SYN)
22292 				tp->snd_nxt++;
22293 			if (flags & TH_FIN) {
22294 				tp->snd_nxt++;
22295 				tp->t_flags |= TF_SENTFIN;
22296 			}
22297 		}
22298 		/* In the ENOBUFS case we do *not* update snd_max */
22299 		if (sack_rxmit)
22300 			goto nomore;
22301 
22302 		tp->snd_nxt += len;
22303 		if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
22304 			if (tp->snd_una == tp->snd_max) {
22305 				/*
22306 				 * Update the time we just added data since
22307 				 * none was outstanding.
22308 				 */
22309 				rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
22310 				tp->t_acktime = ticks;
22311 			}
22312 			tp->snd_max = tp->snd_nxt;
22313 			if (rack->rc_new_rnd_needed) {
22314 				/*
22315 				 * Update the rnd to start ticking; note
22316 				 * that from a time perspective all of
22317 				 * the preceding idle time is "in the round".
22318 				 */
22319 				rack->rc_new_rnd_needed = 0;
22320 				rack->r_ctl.roundends = tp->snd_max;
22321 			}
22322 			/*
22323 			 * Time this transmission if not a retransmission and
22324 			 * not currently timing anything.
22325 			 * This is only relevant in case of switching back to
22326 			 * the base stack.
22327 			 */
22328 			if (tp->t_rtttime == 0) {
22329 				tp->t_rtttime = ticks;
22330 				tp->t_rtseq = startseq;
22331 				KMOD_TCPSTAT_INC(tcps_segstimed);
22332 			}
22333 			if (len &&
22334 			    ((tp->t_flags & TF_GPUTINPROG) == 0))
22335 				rack_start_gp_measurement(tp, rack, startseq, sb_offset);
22336 		}
22337 		/*
22338 		 * If we are doing FO we need to update the mbuf position and subtract
22339 		 * what was sent; this happens when the peer sends us duplicate
22340 		 * information and we thus want to send a DSACK.
22341 		 *
22342 		 * XXXRRS: This brings to mind a question: when we send a DSACK block
22343 		 * is TSO turned off? If not then we are going to echo multiple DSACK
22344 		 * blocks out (with the TSO), which we should not be doing.
22345 		 */
22346 		if (rack->r_fast_output && len) {
22347 			if (rack->r_ctl.fsb.left_to_send > len)
22348 				rack->r_ctl.fsb.left_to_send -= len;
22349 			else
22350 				rack->r_ctl.fsb.left_to_send = 0;
22351 			if (rack->r_ctl.fsb.left_to_send < segsiz)
22352 				rack->r_fast_output = 0;
22353 			if (rack->r_fast_output) {
22354 				rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off);
22355 				rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len;
22356 				rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(rack->r_ctl.fsb.m);
22357 			}
22358 		}
22359 	}
22360 nomore:
22361 	if (error) {
22362 		rack->r_ctl.rc_agg_delayed = 0;
22363 		rack->r_early = 0;
22364 		rack->r_late = 0;
22365 		rack->r_ctl.rc_agg_early = 0;
22366 		SOCKBUF_UNLOCK_ASSERT(sb);	/* Check gotos. */
22367 		/*
22368 		 * Failures do not advance the seq counter above. For the
22369 		 * case of ENOBUFS we will fall out and retry, paced by
22370 		 * the hpts. Everything else will just have to retransmit
22371 		 * with the timer.
22372 		 *
22373 		 * In any case, we do not want to loop around for another
22374 		 * send without a good reason.
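		 * Below: EPERM/EACCES record a soft error and return, ENOBUFS
		 * backs off and lets the pacer retry, EMSGSIZE retries right
		 * away if a smaller MTU was learned, and anything else re-arms
		 * the hpts timer before returning.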
22375 */ 22376 sendalot = 0; 22377 switch (error) { 22378 case EPERM: 22379 case EACCES: 22380 tp->t_softerror = error; 22381 #ifdef TCP_ACCOUNTING 22382 crtsc = get_cyclecount(); 22383 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22384 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 22385 } 22386 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22387 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 22388 } 22389 sched_unpin(); 22390 #endif 22391 return (error); 22392 case ENOBUFS: 22393 /* 22394 * Pace us right away to retry in a some 22395 * time 22396 */ 22397 if (rack->r_ctl.crte != NULL) { 22398 tcp_trace_point(rack->rc_tp, TCP_TP_HWENOBUF); 22399 if (tcp_bblogging_on(rack->rc_tp)) 22400 rack_log_queue_level(tp, rack, len, &tv, cts); 22401 } else 22402 tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF); 22403 slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); 22404 if (rack->rc_enobuf < 0x7f) 22405 rack->rc_enobuf++; 22406 if (slot < (10 * HPTS_USEC_IN_MSEC)) 22407 slot = 10 * HPTS_USEC_IN_MSEC; 22408 if (rack->r_ctl.crte != NULL) { 22409 counter_u64_add(rack_saw_enobuf_hw, 1); 22410 tcp_rl_log_enobuf(rack->r_ctl.crte); 22411 } 22412 counter_u64_add(rack_saw_enobuf, 1); 22413 goto enobufs; 22414 case EMSGSIZE: 22415 /* 22416 * For some reason the interface we used initially 22417 * to send segments changed to another or lowered 22418 * its MTU. If TSO was active we either got an 22419 * interface without TSO capabilits or TSO was 22420 * turned off. If we obtained mtu from ip_output() 22421 * then update it and try again. 22422 */ 22423 if (tso) 22424 tp->t_flags &= ~TF_TSO; 22425 if (mtu != 0) { 22426 int saved_mtu; 22427 22428 saved_mtu = tp->t_maxseg; 22429 tcp_mss_update(tp, -1, mtu, NULL, NULL); 22430 if (saved_mtu > tp->t_maxseg) { 22431 goto again; 22432 } 22433 } 22434 slot = 10 * HPTS_USEC_IN_MSEC; 22435 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); 22436 #ifdef TCP_ACCOUNTING 22437 crtsc = get_cyclecount(); 22438 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22439 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 22440 } 22441 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22442 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 22443 } 22444 sched_unpin(); 22445 #endif 22446 return (error); 22447 case ENETUNREACH: 22448 counter_u64_add(rack_saw_enetunreach, 1); 22449 case EHOSTDOWN: 22450 case EHOSTUNREACH: 22451 case ENETDOWN: 22452 if (TCPS_HAVERCVDSYN(tp->t_state)) { 22453 tp->t_softerror = error; 22454 } 22455 /* FALLTHROUGH */ 22456 default: 22457 slot = 10 * HPTS_USEC_IN_MSEC; 22458 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); 22459 #ifdef TCP_ACCOUNTING 22460 crtsc = get_cyclecount(); 22461 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22462 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 22463 } 22464 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22465 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 22466 } 22467 sched_unpin(); 22468 #endif 22469 return (error); 22470 } 22471 } else { 22472 rack->rc_enobuf = 0; 22473 if (IN_FASTRECOVERY(tp->t_flags) && rsm) 22474 rack->r_ctl.retran_during_recovery += len; 22475 } 22476 KMOD_TCPSTAT_INC(tcps_sndtotal); 22477 22478 /* 22479 * Data sent (as far as we can tell). If this advertises a larger 22480 * window than any other segment, then remember the size of the 22481 * advertised window. Any pending ACK has now been sent. 
22482 */ 22483 if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) 22484 tp->rcv_adv = tp->rcv_nxt + recwin; 22485 22486 tp->last_ack_sent = tp->rcv_nxt; 22487 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 22488 enobufs: 22489 if (sendalot) { 22490 /* Do we need to turn off sendalot? */ 22491 if (rack->r_ctl.rc_pace_max_segs && 22492 (tot_len_this_send >= rack->r_ctl.rc_pace_max_segs)) { 22493 /* We hit our max. */ 22494 sendalot = 0; 22495 } else if ((rack->rc_user_set_max_segs) && 22496 (tot_len_this_send >= (rack->rc_user_set_max_segs * segsiz))) { 22497 /* We hit the user defined max */ 22498 sendalot = 0; 22499 } 22500 } 22501 if ((error == 0) && (flags & TH_FIN)) 22502 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN); 22503 if (flags & TH_RST) { 22504 /* 22505 * We don't send again after sending a RST. 22506 */ 22507 slot = 0; 22508 sendalot = 0; 22509 if (error == 0) 22510 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 22511 } else if ((slot == 0) && (sendalot == 0) && tot_len_this_send) { 22512 /* 22513 * Get our pacing rate, if an error 22514 * occurred in sending (ENOBUF) we would 22515 * hit the else if with slot preset. Other 22516 * errors return. 22517 */ 22518 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz); 22519 } 22520 if (rsm && 22521 (rsm->r_flags & RACK_HAS_SYN) == 0 && 22522 rack->use_rack_rr) { 22523 /* Its a retransmit and we use the rack cheat? */ 22524 if ((slot == 0) || 22525 (rack->rc_always_pace == 0) || 22526 (rack->r_rr_config == 1)) { 22527 /* 22528 * We have no pacing set or we 22529 * are using old-style rack or 22530 * we are overridden to use the old 1ms pacing. 22531 */ 22532 slot = rack->r_ctl.rc_min_to; 22533 } 22534 } 22535 /* We have sent clear the flag */ 22536 rack->r_ent_rec_ns = 0; 22537 if (rack->r_must_retran) { 22538 if (rsm) { 22539 rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start); 22540 if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) { 22541 /* 22542 * We have retransmitted all. 22543 */ 22544 rack->r_must_retran = 0; 22545 rack->r_ctl.rc_out_at_rto = 0; 22546 } 22547 } else if (SEQ_GEQ(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) { 22548 /* 22549 * Sending new data will also kill 22550 * the loop. 22551 */ 22552 rack->r_must_retran = 0; 22553 rack->r_ctl.rc_out_at_rto = 0; 22554 } 22555 } 22556 rack->r_ctl.fsb.recwin = recwin; 22557 if ((tp->t_flags & (TF_WASCRECOVERY|TF_WASFRECOVERY)) && 22558 SEQ_GT(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) { 22559 /* 22560 * We hit an RTO and now have past snd_max at the RTO 22561 * clear all the WAS flags. 
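		 * (The "WAS" flags record that recovery, fast or congestion
		 * based, was in progress when the RTO fired; once snd_max
		 * passes the point recorded at the RTO they no longer apply.)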
22562 */ 22563 tp->t_flags &= ~(TF_WASCRECOVERY|TF_WASFRECOVERY); 22564 } 22565 if (slot) { 22566 /* set the rack tcb into the slot N */ 22567 if ((error == 0) && 22568 rack_use_rfo && 22569 ((flags & (TH_SYN|TH_FIN)) == 0) && 22570 (rsm == NULL) && 22571 (tp->snd_nxt == tp->snd_max) && 22572 (ipoptlen == 0) && 22573 (tp->rcv_numsacks == 0) && 22574 rack->r_fsb_inited && 22575 TCPS_HAVEESTABLISHED(tp->t_state) && 22576 ((IN_RECOVERY(tp->t_flags)) == 0) && 22577 (rack->r_must_retran == 0) && 22578 ((tp->t_flags & TF_NEEDFIN) == 0) && 22579 (len > 0) && (orig_len > 0) && 22580 (orig_len > len) && 22581 ((orig_len - len) >= segsiz) && 22582 ((optlen == 0) || 22583 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { 22584 /* We can send at least one more MSS using our fsb */ 22585 rack_setup_fast_output(tp, rack, sb, len, orig_len, 22586 segsiz, pace_max_seg, hw_tls, flags); 22587 } else 22588 rack->r_fast_output = 0; 22589 rack_log_fsb(rack, tp, so, flags, 22590 ipoptlen, orig_len, len, error, 22591 (rsm == NULL), optlen, __LINE__, 2); 22592 } else if (sendalot) { 22593 int ret; 22594 22595 sack_rxmit = 0; 22596 if ((error == 0) && 22597 rack_use_rfo && 22598 ((flags & (TH_SYN|TH_FIN)) == 0) && 22599 (rsm == NULL) && 22600 (ipoptlen == 0) && 22601 (tp->rcv_numsacks == 0) && 22602 (tp->snd_nxt == tp->snd_max) && 22603 (rack->r_must_retran == 0) && 22604 rack->r_fsb_inited && 22605 TCPS_HAVEESTABLISHED(tp->t_state) && 22606 ((IN_RECOVERY(tp->t_flags)) == 0) && 22607 ((tp->t_flags & TF_NEEDFIN) == 0) && 22608 (len > 0) && (orig_len > 0) && 22609 (orig_len > len) && 22610 ((orig_len - len) >= segsiz) && 22611 ((optlen == 0) || 22612 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { 22613 /* we can use fast_output for more */ 22614 rack_setup_fast_output(tp, rack, sb, len, orig_len, 22615 segsiz, pace_max_seg, hw_tls, flags); 22616 if (rack->r_fast_output) { 22617 error = 0; 22618 ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error); 22619 if (ret >= 0) 22620 return (ret); 22621 else if (error) 22622 goto nomore; 22623 22624 } 22625 } 22626 goto again; 22627 } 22628 /* Assure when we leave that snd_nxt will point to top */ 22629 skip_all_send: 22630 if (SEQ_GT(tp->snd_max, tp->snd_nxt)) 22631 tp->snd_nxt = tp->snd_max; 22632 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0); 22633 #ifdef TCP_ACCOUNTING 22634 crtsc = get_cyclecount() - ts_val; 22635 if (tot_len_this_send) { 22636 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22637 tp->tcp_cnt_counters[SND_OUT_DATA]++; 22638 } 22639 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22640 tp->tcp_proc_time[SND_OUT_DATA] += crtsc; 22641 } 22642 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22643 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) /segsiz); 22644 } 22645 } else { 22646 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22647 tp->tcp_cnt_counters[SND_OUT_ACK]++; 22648 } 22649 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22650 tp->tcp_proc_time[SND_OUT_ACK] += crtsc; 22651 } 22652 } 22653 sched_unpin(); 22654 #endif 22655 if (error == ENOBUFS) 22656 error = 0; 22657 return (error); 22658 } 22659 22660 static void 22661 rack_update_seg(struct tcp_rack *rack) 22662 { 22663 uint32_t orig_val; 22664 22665 orig_val = rack->r_ctl.rc_pace_max_segs; 22666 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 22667 if (orig_val != rack->r_ctl.rc_pace_max_segs) 22668 rack_log_pacing_delay_calc(rack, 0, 0, orig_val, 0, 0, 15, __LINE__, NULL, 0); 22669 } 22670 22671 static void 
22672 rack_mtu_change(struct tcpcb *tp) 22673 { 22674 /* 22675 * The MSS may have changed 22676 */ 22677 struct tcp_rack *rack; 22678 struct rack_sendmap *rsm; 22679 22680 rack = (struct tcp_rack *)tp->t_fb_ptr; 22681 if (rack->r_ctl.rc_pace_min_segs != ctf_fixed_maxseg(tp)) { 22682 /* 22683 * The MTU has changed we need to resend everything 22684 * since all we have sent is lost. We first fix 22685 * up the mtu though. 22686 */ 22687 rack_set_pace_segments(tp, rack, __LINE__, NULL); 22688 /* We treat this like a full retransmit timeout without the cwnd adjustment */ 22689 rack_remxt_tmr(tp); 22690 rack->r_fast_output = 0; 22691 rack->r_ctl.rc_out_at_rto = ctf_flight_size(tp, 22692 rack->r_ctl.rc_sacked); 22693 rack->r_ctl.rc_snd_max_at_rto = tp->snd_max; 22694 rack->r_must_retran = 1; 22695 /* Mark all inflight to needing to be rxt'd */ 22696 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 22697 rsm->r_flags |= (RACK_MUST_RXT|RACK_PMTU_CHG); 22698 } 22699 } 22700 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 22701 /* We don't use snd_nxt to retransmit */ 22702 tp->snd_nxt = tp->snd_max; 22703 } 22704 22705 static int 22706 rack_set_dgp(struct tcp_rack *rack) 22707 { 22708 /* pace_always=1 */ 22709 if (rack->rc_always_pace == 0) { 22710 if (tcp_can_enable_pacing() == 0) 22711 return (EBUSY); 22712 } 22713 rack->rc_fillcw_apply_discount = 0; 22714 rack->dgp_on = 1; 22715 rack->rc_always_pace = 1; 22716 rack->use_fixed_rate = 0; 22717 if (rack->gp_ready) 22718 rack_set_cc_pacing(rack); 22719 rack->rc_tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 22720 rack->rack_attempt_hdwr_pace = 0; 22721 /* rxt settings */ 22722 rack->full_size_rxt = 1; 22723 rack->shape_rxt_to_pacing_min = 0; 22724 /* cmpack=1 */ 22725 rack->r_use_cmp_ack = 1; 22726 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) && 22727 rack->r_use_cmp_ack) 22728 rack->rc_tp->t_flags2 |= TF2_MBUF_ACKCMP; 22729 /* scwnd=1 */ 22730 rack->rack_enable_scwnd = 1; 22731 /* dynamic=100 */ 22732 rack->rc_gp_dyn_mul = 1; 22733 /* gp_inc_ca */ 22734 rack->r_ctl.rack_per_of_gp_ca = 100; 22735 /* rrr_conf=3 */ 22736 rack->r_rr_config = 3; 22737 /* npush=2 */ 22738 rack->r_ctl.rc_no_push_at_mrtt = 2; 22739 /* fillcw=1 */ 22740 if (rack->r_cwnd_was_clamped == 0) { 22741 rack->rc_pace_to_cwnd = 1; 22742 } else { 22743 rack->rc_pace_to_cwnd = 0; 22744 /* Reset all multipliers to 100.0 so just the measured bw */ 22745 rack->r_ctl.rack_per_of_gp_ss = 100; 22746 rack->r_ctl.rack_per_of_gp_ca = 100; 22747 } 22748 rack->rc_pace_fill_if_rttin_range = 0; 22749 rack->rtt_limit_mul = 0; 22750 /* noprr=1 */ 22751 rack->rack_no_prr = 1; 22752 /* lscwnd=1 */ 22753 rack->r_limit_scw = 1; 22754 /* gp_inc_rec */ 22755 rack->r_ctl.rack_per_of_gp_rec = 90; 22756 rack_client_buffer_level_set(rack); 22757 return (0); 22758 } 22759 22760 22761 22762 static int 22763 rack_set_profile(struct tcp_rack *rack, int prof) 22764 { 22765 int err = EINVAL; 22766 if (prof == 1) { 22767 /* 22768 * Profile 1 is "standard" DGP. It ignores 22769 * client buffer level. 22770 */ 22771 rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL0; 22772 err = rack_set_dgp(rack); 22773 if (err) 22774 return (err); 22775 } else if (prof == 2) { 22776 /* 22777 * Profile 2 is DGP. Less aggressive with 22778 * respect to client buffer level. 22779 */ 22780 rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL1; 22781 err = rack_set_dgp(rack); 22782 if (err) 22783 return (err); 22784 } else if (prof == 3) { 22785 /* 22786 * Profile 3 is DGP. Even Less aggressive with 22787 * respect to client buffer level. 
22788 */ 22789 rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL2; 22790 err = rack_set_dgp(rack); 22791 if (err) 22792 return (err); 22793 } else if (prof == 4) { 22794 /* 22795 * Profile 4 is DGP with the most responsiveness 22796 * to client buffer level. 22797 */ 22798 rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL3; 22799 err = rack_set_dgp(rack); 22800 if (err) 22801 return (err); 22802 } else if (prof == 5) { 22803 err = rack_set_dgp(rack); 22804 if (err) 22805 return (err); 22806 /* 22807 * By turning DGP off we change the rate 22808 * picked to be only the one the cwnd and rtt 22809 * get us. 22810 */ 22811 rack->dgp_on = 0; 22812 } else if (prof == 6) { 22813 err = rack_set_dgp(rack); 22814 if (err) 22815 return (err); 22816 /* 22817 * Profile 6 tweaks DGP so that it will apply to 22818 * fill-cw the same settings that profile5 does 22819 * to replace DGP. It gets then the max(dgp-rate, fillcw(discounted). 22820 */ 22821 rack->rc_fillcw_apply_discount = 1; 22822 } else if (prof == 0) { 22823 /* This changes things back to the default settings */ 22824 rack->dgp_on = 0; 22825 rack->rc_hybrid_mode = 0; 22826 err = 0; 22827 if (rack_fill_cw_state) 22828 rack->rc_pace_to_cwnd = 1; 22829 else 22830 rack->rc_pace_to_cwnd = 0; 22831 if (rack->rc_always_pace) { 22832 tcp_decrement_paced_conn(); 22833 rack_undo_cc_pacing(rack); 22834 rack->rc_always_pace = 0; 22835 } 22836 if (rack_pace_every_seg && tcp_can_enable_pacing()) { 22837 rack->rc_always_pace = 1; 22838 if (rack->rack_hibeta) 22839 rack_set_cc_pacing(rack); 22840 } else 22841 rack->rc_always_pace = 0; 22842 if (rack_dsack_std_based & 0x1) { 22843 /* Basically this means all rack timers are at least (srtt + 1/4 srtt) */ 22844 rack->rc_rack_tmr_std_based = 1; 22845 } 22846 if (rack_dsack_std_based & 0x2) { 22847 /* Basically this means rack timers are extended based on dsack by up to (2 * srtt) */ 22848 rack->rc_rack_use_dsack = 1; 22849 } 22850 if (rack_use_cmp_acks) 22851 rack->r_use_cmp_ack = 1; 22852 else 22853 rack->r_use_cmp_ack = 0; 22854 if (rack_disable_prr) 22855 rack->rack_no_prr = 1; 22856 else 22857 rack->rack_no_prr = 0; 22858 if (rack_gp_no_rec_chg) 22859 rack->rc_gp_no_rec_chg = 1; 22860 else 22861 rack->rc_gp_no_rec_chg = 0; 22862 if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack) { 22863 rack->r_mbuf_queue = 1; 22864 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state)) 22865 rack->rc_tp->t_flags2 |= TF2_MBUF_ACKCMP; 22866 rack->rc_tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 22867 } else { 22868 rack->r_mbuf_queue = 0; 22869 rack->rc_tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ; 22870 } 22871 if (rack_enable_shared_cwnd) 22872 rack->rack_enable_scwnd = 1; 22873 else 22874 rack->rack_enable_scwnd = 0; 22875 if (rack_do_dyn_mul) { 22876 /* When dynamic adjustment is on CA needs to start at 100% */ 22877 rack->rc_gp_dyn_mul = 1; 22878 if (rack_do_dyn_mul >= 100) 22879 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul; 22880 } else { 22881 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca; 22882 rack->rc_gp_dyn_mul = 0; 22883 } 22884 rack->r_rr_config = 0; 22885 rack->r_ctl.rc_no_push_at_mrtt = 0; 22886 rack->rc_pace_to_cwnd = 0; 22887 rack->rc_pace_fill_if_rttin_range = 0; 22888 rack->rtt_limit_mul = 0; 22889 22890 if (rack_enable_hw_pacing) 22891 rack->rack_hdw_pace_ena = 1; 22892 else 22893 rack->rack_hdw_pace_ena = 0; 22894 if (rack_disable_prr) 22895 rack->rack_no_prr = 1; 22896 else 22897 rack->rack_no_prr = 0; 22898 if (rack_limits_scwnd) 22899 rack->r_limit_scw = 1; 22900 else 22901 rack->r_limit_scw = 0; 22902 rack_init_retransmit_value(rack, 
rack_rxt_controls); 22903 err = 0; 22904 } 22905 return (err); 22906 } 22907 22908 static int 22909 rack_add_deferred_option(struct tcp_rack *rack, int sopt_name, uint64_t loptval) 22910 { 22911 struct deferred_opt_list *dol; 22912 22913 dol = malloc(sizeof(struct deferred_opt_list), 22914 M_TCPFSB, M_NOWAIT|M_ZERO); 22915 if (dol == NULL) { 22916 /* 22917 * No space yikes -- fail out.. 22918 */ 22919 return (0); 22920 } 22921 dol->optname = sopt_name; 22922 dol->optval = loptval; 22923 TAILQ_INSERT_TAIL(&rack->r_ctl.opt_list, dol, next); 22924 return (1); 22925 } 22926 22927 static int 22928 process_hybrid_pacing(struct tcp_rack *rack, struct tcp_hybrid_req *hybrid) 22929 { 22930 #ifdef TCP_REQUEST_TRK 22931 struct tcp_sendfile_track *sft; 22932 struct timeval tv; 22933 tcp_seq seq; 22934 int err; 22935 22936 microuptime(&tv); 22937 22938 /* 22939 * If BB logging is not on we need to look at the DTL flag. 22940 * If its on already then those reasons override the DTL input. 22941 * We do this with any request, you can turn DTL on, but it does 22942 * not turn off at least from hybrid pacing requests. 22943 */ 22944 if (tcp_bblogging_on(rack->rc_tp) == 0) { 22945 if (hybrid->hybrid_flags & TCP_HYBRID_PACING_DTL) { 22946 /* Turn on BB point logging */ 22947 tcp_set_bblog_state(rack->rc_tp, TCP_LOG_VIA_BBPOINTS, 22948 TCP_BBPOINT_REQ_LEVEL_LOGGING); 22949 } 22950 } 22951 /* Make sure no fixed rate is on */ 22952 rack->use_fixed_rate = 0; 22953 rack->r_ctl.rc_fixed_pacing_rate_rec = 0; 22954 rack->r_ctl.rc_fixed_pacing_rate_ca = 0; 22955 rack->r_ctl.rc_fixed_pacing_rate_ss = 0; 22956 /* Now allocate or find our entry that will have these settings */ 22957 sft = tcp_req_alloc_req_full(rack->rc_tp, &hybrid->req, tcp_tv_to_lusectick(&tv), 0); 22958 if (sft == NULL) { 22959 rack->rc_tp->tcp_hybrid_error++; 22960 /* no space, where would it have gone? */ 22961 seq = rack->rc_tp->snd_una + rack->rc_tp->t_inpcb.inp_socket->so_snd.sb_ccc; 22962 rack_log_hybrid(rack, seq, NULL, HYBRID_LOG_NO_ROOM, __LINE__, 0); 22963 return (ENOSPC); 22964 } 22965 /* The seq will be snd_una + everything in the buffer */ 22966 seq = sft->start_seq; 22967 if ((hybrid->hybrid_flags & TCP_HYBRID_PACING_ENABLE) == 0) { 22968 /* Disabling hybrid pacing */ 22969 if (rack->rc_hybrid_mode) { 22970 rack_set_profile(rack, 0); 22971 rack->rc_tp->tcp_hybrid_stop++; 22972 } 22973 rack_log_hybrid(rack, seq, sft, HYBRID_LOG_TURNED_OFF, __LINE__, 0); 22974 return (0); 22975 } 22976 if (rack->dgp_on == 0) { 22977 /* 22978 * If we have not yet turned DGP on, do so 22979 * now setting pure DGP mode, no buffer level 22980 * response. 
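		 * (rack_set_profile(rack, 1) below is the "standard" DGP
		 * profile: it sets rc_dgp_bl_agg to DGP_LEVEL0, i.e. the
		 * client buffer level is ignored.)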
22981 */ 22982 if ((err = rack_set_profile(rack, 1)) != 0){ 22983 /* Failed to turn pacing on */ 22984 rack->rc_tp->tcp_hybrid_error++; 22985 rack_log_hybrid(rack, seq, sft, HYBRID_LOG_NO_PACING, __LINE__, 0); 22986 return (err); 22987 } 22988 } 22989 /* Now set in our flags */ 22990 sft->hybrid_flags = hybrid->hybrid_flags | TCP_HYBRID_PACING_WASSET; 22991 if (hybrid->hybrid_flags & TCP_HYBRID_PACING_CSPR) 22992 sft->cspr = hybrid->cspr; 22993 else 22994 sft->cspr = 0; 22995 if (hybrid->hybrid_flags & TCP_HYBRID_PACING_H_MS) 22996 sft->hint_maxseg = hybrid->hint_maxseg; 22997 else 22998 sft->hint_maxseg = 0; 22999 rack->rc_hybrid_mode = 1; 23000 rack->rc_tp->tcp_hybrid_start++; 23001 rack_log_hybrid(rack, seq, sft, HYBRID_LOG_RULES_SET, __LINE__,0); 23002 return (0); 23003 #else 23004 return (ENOTSUP); 23005 #endif 23006 } 23007 23008 static int 23009 rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, 23010 uint32_t optval, uint64_t loptval, struct tcp_hybrid_req *hybrid) 23011 23012 { 23013 struct epoch_tracker et; 23014 struct sockopt sopt; 23015 struct cc_newreno_opts opt; 23016 uint64_t val; 23017 int error = 0; 23018 uint16_t ca, ss; 23019 23020 switch (sopt_name) { 23021 case TCP_RACK_SET_RXT_OPTIONS: 23022 if ((optval >= 0) && (optval <= 2)) { 23023 rack_init_retransmit_value(rack, optval); 23024 } else { 23025 /* 23026 * You must send in 0, 1 or 2 all else is 23027 * invalid. 23028 */ 23029 error = EINVAL; 23030 } 23031 break; 23032 case TCP_RACK_DSACK_OPT: 23033 RACK_OPTS_INC(tcp_rack_dsack_opt); 23034 if (optval & 0x1) { 23035 rack->rc_rack_tmr_std_based = 1; 23036 } else { 23037 rack->rc_rack_tmr_std_based = 0; 23038 } 23039 if (optval & 0x2) { 23040 rack->rc_rack_use_dsack = 1; 23041 } else { 23042 rack->rc_rack_use_dsack = 0; 23043 } 23044 rack_log_dsack_event(rack, 5, __LINE__, 0, 0); 23045 break; 23046 case TCP_RACK_PACING_DIVISOR: 23047 RACK_OPTS_INC(tcp_rack_pacing_divisor); 23048 if (optval == 0) { 23049 rack->r_ctl.pace_len_divisor = rack_default_pacing_divisor; 23050 } else { 23051 if (optval < RL_MIN_DIVISOR) 23052 rack->r_ctl.pace_len_divisor = RL_MIN_DIVISOR; 23053 else 23054 rack->r_ctl.pace_len_divisor = optval; 23055 } 23056 break; 23057 case TCP_RACK_HI_BETA: 23058 RACK_OPTS_INC(tcp_rack_hi_beta); 23059 if (optval > 0) { 23060 rack->rack_hibeta = 1; 23061 if ((optval >= 50) && 23062 (optval <= 100)) { 23063 /* 23064 * User wants to set a custom beta. 23065 */ 23066 rack->r_ctl.saved_hibeta = optval; 23067 if (rack->rc_pacing_cc_set) 23068 rack_undo_cc_pacing(rack); 23069 rack->r_ctl.rc_saved_beta.beta = optval; 23070 } 23071 if (rack->rc_pacing_cc_set == 0) 23072 rack_set_cc_pacing(rack); 23073 } else { 23074 rack->rack_hibeta = 0; 23075 if (rack->rc_pacing_cc_set) 23076 rack_undo_cc_pacing(rack); 23077 } 23078 break; 23079 case TCP_RACK_PACING_BETA: 23080 RACK_OPTS_INC(tcp_rack_beta); 23081 if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) { 23082 /* This only works for newreno. 
*/ 23083 error = EINVAL; 23084 break; 23085 } 23086 if (rack->rc_pacing_cc_set) { 23087 /* 23088 * Set them into the real CC module 23089 * whats in the rack pcb is the old values 23090 * to be used on restoral/ 23091 */ 23092 sopt.sopt_dir = SOPT_SET; 23093 opt.name = CC_NEWRENO_BETA; 23094 opt.val = optval; 23095 if (CC_ALGO(tp)->ctl_output != NULL) 23096 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt); 23097 else { 23098 error = ENOENT; 23099 break; 23100 } 23101 } else { 23102 /* 23103 * Not pacing yet so set it into our local 23104 * rack pcb storage. 23105 */ 23106 rack->r_ctl.rc_saved_beta.beta = optval; 23107 } 23108 break; 23109 case TCP_RACK_TIMER_SLOP: 23110 RACK_OPTS_INC(tcp_rack_timer_slop); 23111 rack->r_ctl.timer_slop = optval; 23112 if (rack->rc_tp->t_srtt) { 23113 /* 23114 * If we have an SRTT lets update t_rxtcur 23115 * to have the new slop. 23116 */ 23117 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 23118 rack_rto_min, rack_rto_max, 23119 rack->r_ctl.timer_slop); 23120 } 23121 break; 23122 case TCP_RACK_PACING_BETA_ECN: 23123 RACK_OPTS_INC(tcp_rack_beta_ecn); 23124 if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) { 23125 /* This only works for newreno. */ 23126 error = EINVAL; 23127 break; 23128 } 23129 if (rack->rc_pacing_cc_set) { 23130 /* 23131 * Set them into the real CC module 23132 * whats in the rack pcb is the old values 23133 * to be used on restoral/ 23134 */ 23135 sopt.sopt_dir = SOPT_SET; 23136 opt.name = CC_NEWRENO_BETA_ECN; 23137 opt.val = optval; 23138 if (CC_ALGO(tp)->ctl_output != NULL) 23139 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt); 23140 else 23141 error = ENOENT; 23142 } else { 23143 /* 23144 * Not pacing yet so set it into our local 23145 * rack pcb storage. 23146 */ 23147 rack->r_ctl.rc_saved_beta.beta_ecn = optval; 23148 rack->r_ctl.rc_saved_beta.newreno_flags = CC_NEWRENO_BETA_ECN_ENABLED; 23149 } 23150 break; 23151 case TCP_DEFER_OPTIONS: 23152 RACK_OPTS_INC(tcp_defer_opt); 23153 if (optval) { 23154 if (rack->gp_ready) { 23155 /* Too late */ 23156 error = EINVAL; 23157 break; 23158 } 23159 rack->defer_options = 1; 23160 } else 23161 rack->defer_options = 0; 23162 break; 23163 case TCP_RACK_MEASURE_CNT: 23164 RACK_OPTS_INC(tcp_rack_measure_cnt); 23165 if (optval && (optval <= 0xff)) { 23166 rack->r_ctl.req_measurements = optval; 23167 } else 23168 error = EINVAL; 23169 break; 23170 case TCP_REC_ABC_VAL: 23171 RACK_OPTS_INC(tcp_rec_abc_val); 23172 if (optval > 0) 23173 rack->r_use_labc_for_rec = 1; 23174 else 23175 rack->r_use_labc_for_rec = 0; 23176 break; 23177 case TCP_RACK_ABC_VAL: 23178 RACK_OPTS_INC(tcp_rack_abc_val); 23179 if ((optval > 0) && (optval < 255)) 23180 rack->rc_labc = optval; 23181 else 23182 error = EINVAL; 23183 break; 23184 case TCP_HDWR_UP_ONLY: 23185 RACK_OPTS_INC(tcp_pacing_up_only); 23186 if (optval) 23187 rack->r_up_only = 1; 23188 else 23189 rack->r_up_only = 0; 23190 break; 23191 case TCP_PACING_RATE_CAP: 23192 RACK_OPTS_INC(tcp_pacing_rate_cap); 23193 rack->r_ctl.bw_rate_cap = loptval; 23194 break; 23195 case TCP_HYBRID_PACING: 23196 if (hybrid == NULL) { 23197 error = EINVAL; 23198 break; 23199 } 23200 error = process_hybrid_pacing(rack, hybrid); 23201 break; 23202 case TCP_RACK_PROFILE: 23203 RACK_OPTS_INC(tcp_profile); 23204 error = rack_set_profile(rack, optval); 23205 break; 23206 case TCP_USE_CMP_ACKS: 23207 RACK_OPTS_INC(tcp_use_cmp_acks); 23208 if ((optval == 0) && (tp->t_flags2 & TF2_MBUF_ACKCMP)) { 23209 /* You can't turn it off once its on! 
*/ 23210 error = EINVAL; 23211 } else if ((optval == 1) && (rack->r_use_cmp_ack == 0)) { 23212 rack->r_use_cmp_ack = 1; 23213 rack->r_mbuf_queue = 1; 23214 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 23215 } 23216 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) 23217 tp->t_flags2 |= TF2_MBUF_ACKCMP; 23218 break; 23219 case TCP_SHARED_CWND_TIME_LIMIT: 23220 RACK_OPTS_INC(tcp_lscwnd); 23221 if (optval) 23222 rack->r_limit_scw = 1; 23223 else 23224 rack->r_limit_scw = 0; 23225 break; 23226 case TCP_RACK_DGP_IN_REC: 23227 RACK_OPTS_INC(tcp_dgp_in_rec); 23228 if (optval) 23229 rack->r_ctl.full_dgp_in_rec = 1; 23230 else 23231 rack->r_ctl.full_dgp_in_rec = 0; 23232 break; 23233 case TCP_RXT_CLAMP: 23234 RACK_OPTS_INC(tcp_rxt_clamp); 23235 rack_translate_clamp_value(rack, optval); 23236 break; 23237 case TCP_RACK_PACE_TO_FILL: 23238 RACK_OPTS_INC(tcp_fillcw); 23239 if (optval == 0) 23240 rack->rc_pace_to_cwnd = 0; 23241 else { 23242 rack->rc_pace_to_cwnd = 1; 23243 if (optval > 1) 23244 rack->r_fill_less_agg = 1; 23245 } 23246 if ((optval >= rack_gp_rtt_maxmul) && 23247 rack_gp_rtt_maxmul && 23248 (optval < 0xf)) { 23249 rack->rc_pace_fill_if_rttin_range = 1; 23250 rack->rtt_limit_mul = optval; 23251 } else { 23252 rack->rc_pace_fill_if_rttin_range = 0; 23253 rack->rtt_limit_mul = 0; 23254 } 23255 break; 23256 case TCP_RACK_NO_PUSH_AT_MAX: 23257 RACK_OPTS_INC(tcp_npush); 23258 if (optval == 0) 23259 rack->r_ctl.rc_no_push_at_mrtt = 0; 23260 else if (optval < 0xff) 23261 rack->r_ctl.rc_no_push_at_mrtt = optval; 23262 else 23263 error = EINVAL; 23264 break; 23265 case TCP_SHARED_CWND_ENABLE: 23266 RACK_OPTS_INC(tcp_rack_scwnd); 23267 if (optval == 0) 23268 rack->rack_enable_scwnd = 0; 23269 else 23270 rack->rack_enable_scwnd = 1; 23271 break; 23272 case TCP_RACK_MBUF_QUEUE: 23273 /* Now do we use the LRO mbuf-queue feature */ 23274 RACK_OPTS_INC(tcp_rack_mbufq); 23275 if (optval || rack->r_use_cmp_ack) 23276 rack->r_mbuf_queue = 1; 23277 else 23278 rack->r_mbuf_queue = 0; 23279 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 23280 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 23281 else 23282 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ; 23283 break; 23284 case TCP_RACK_NONRXT_CFG_RATE: 23285 RACK_OPTS_INC(tcp_rack_cfg_rate); 23286 if (optval == 0) 23287 rack->rack_rec_nonrxt_use_cr = 0; 23288 else 23289 rack->rack_rec_nonrxt_use_cr = 1; 23290 break; 23291 case TCP_NO_PRR: 23292 RACK_OPTS_INC(tcp_rack_noprr); 23293 if (optval == 0) 23294 rack->rack_no_prr = 0; 23295 else if (optval == 1) 23296 rack->rack_no_prr = 1; 23297 else if (optval == 2) 23298 rack->no_prr_addback = 1; 23299 else 23300 error = EINVAL; 23301 break; 23302 case TCP_TIMELY_DYN_ADJ: 23303 RACK_OPTS_INC(tcp_timely_dyn); 23304 if (optval == 0) 23305 rack->rc_gp_dyn_mul = 0; 23306 else { 23307 rack->rc_gp_dyn_mul = 1; 23308 if (optval >= 100) { 23309 /* 23310 * If the user sets something 100 or more 23311 * its the gp_ca value. 
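			 * Values from 1 to 99 just turn the dynamic
			 * multipliers on and leave rack_per_of_gp_ca at its
			 * current setting.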
23312 */ 23313 rack->r_ctl.rack_per_of_gp_ca = optval; 23314 } 23315 } 23316 break; 23317 case TCP_RACK_DO_DETECTION: 23318 RACK_OPTS_INC(tcp_rack_do_detection); 23319 if (optval == 0) 23320 rack->do_detection = 0; 23321 else 23322 rack->do_detection = 1; 23323 break; 23324 case TCP_RACK_TLP_USE: 23325 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) { 23326 error = EINVAL; 23327 break; 23328 } 23329 RACK_OPTS_INC(tcp_tlp_use); 23330 rack->rack_tlp_threshold_use = optval; 23331 break; 23332 case TCP_RACK_TLP_REDUCE: 23333 /* RACK TLP cwnd reduction (bool) */ 23334 RACK_OPTS_INC(tcp_rack_tlp_reduce); 23335 rack->r_ctl.rc_tlp_cwnd_reduce = optval; 23336 break; 23337 /* Pacing related ones */ 23338 case TCP_RACK_PACE_ALWAYS: 23339 /* 23340 * zero is old rack method, 1 is new 23341 * method using a pacing rate. 23342 */ 23343 RACK_OPTS_INC(tcp_rack_pace_always); 23344 if (optval > 0) { 23345 if (rack->rc_always_pace) { 23346 error = EALREADY; 23347 break; 23348 } else if (tcp_can_enable_pacing()) { 23349 rack->rc_always_pace = 1; 23350 if (rack->rack_hibeta) 23351 rack_set_cc_pacing(rack); 23352 } 23353 else { 23354 error = ENOSPC; 23355 break; 23356 } 23357 } else { 23358 if (rack->rc_always_pace) { 23359 tcp_decrement_paced_conn(); 23360 rack->rc_always_pace = 0; 23361 rack_undo_cc_pacing(rack); 23362 } 23363 } 23364 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 23365 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 23366 else 23367 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ; 23368 /* A rate may be set irate or other, if so set seg size */ 23369 rack_update_seg(rack); 23370 break; 23371 case TCP_BBR_RACK_INIT_RATE: 23372 RACK_OPTS_INC(tcp_initial_rate); 23373 val = optval; 23374 /* Change from kbits per second to bytes per second */ 23375 val *= 1000; 23376 val /= 8; 23377 rack->r_ctl.init_rate = val; 23378 if (rack->rc_init_win != rack_default_init_window) { 23379 uint32_t win, snt; 23380 23381 /* 23382 * Options don't always get applied 23383 * in the order you think. So in order 23384 * to assure we update a cwnd we need 23385 * to check and see if we are still 23386 * where we should raise the cwnd. 23387 */ 23388 win = rc_init_window(rack); 23389 if (SEQ_GT(tp->snd_max, tp->iss)) 23390 snt = tp->snd_max - tp->iss; 23391 else 23392 snt = 0; 23393 if ((snt < win) && 23394 (tp->snd_cwnd < win)) 23395 tp->snd_cwnd = win; 23396 } 23397 if (rack->rc_always_pace) 23398 rack_update_seg(rack); 23399 break; 23400 case TCP_BBR_IWINTSO: 23401 RACK_OPTS_INC(tcp_initial_win); 23402 if (optval && (optval <= 0xff)) { 23403 uint32_t win, snt; 23404 23405 rack->rc_init_win = optval; 23406 win = rc_init_window(rack); 23407 if (SEQ_GT(tp->snd_max, tp->iss)) 23408 snt = tp->snd_max - tp->iss; 23409 else 23410 snt = 0; 23411 if ((snt < win) && 23412 (tp->t_srtt | 23413 rack->r_ctl.init_rate)) { 23414 /* 23415 * We are not past the initial window 23416 * and we have some bases for pacing, 23417 * so we need to possibly adjust up 23418 * the cwnd. Note even if we don't set 23419 * the cwnd, its still ok to raise the rc_init_win 23420 * which can be used coming out of idle when we 23421 * would have a rate. 
23422 */ 23423 if (tp->snd_cwnd < win) 23424 tp->snd_cwnd = win; 23425 } 23426 if (rack->rc_always_pace) 23427 rack_update_seg(rack); 23428 } else 23429 error = EINVAL; 23430 break; 23431 case TCP_RACK_FORCE_MSEG: 23432 RACK_OPTS_INC(tcp_rack_force_max_seg); 23433 if (optval) 23434 rack->rc_force_max_seg = 1; 23435 else 23436 rack->rc_force_max_seg = 0; 23437 break; 23438 case TCP_RACK_PACE_MIN_SEG: 23439 RACK_OPTS_INC(tcp_rack_min_seg); 23440 rack->r_ctl.rc_user_set_min_segs = (0x0000ffff & optval); 23441 rack_set_pace_segments(tp, rack, __LINE__, NULL); 23442 break; 23443 case TCP_RACK_PACE_MAX_SEG: 23444 /* Max segments size in a pace in bytes */ 23445 RACK_OPTS_INC(tcp_rack_max_seg); 23446 if (optval <= MAX_USER_SET_SEG) 23447 rack->rc_user_set_max_segs = optval; 23448 else 23449 rack->rc_user_set_max_segs = MAX_USER_SET_SEG; 23450 rack_set_pace_segments(tp, rack, __LINE__, NULL); 23451 break; 23452 case TCP_RACK_PACE_RATE_REC: 23453 /* Set the fixed pacing rate in Bytes per second ca */ 23454 RACK_OPTS_INC(tcp_rack_pace_rate_rec); 23455 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 23456 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) 23457 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 23458 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) 23459 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 23460 rack->use_fixed_rate = 1; 23461 if (rack->rack_hibeta) 23462 rack_set_cc_pacing(rack); 23463 rack_log_pacing_delay_calc(rack, 23464 rack->r_ctl.rc_fixed_pacing_rate_ss, 23465 rack->r_ctl.rc_fixed_pacing_rate_ca, 23466 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 23467 __LINE__, NULL,0); 23468 break; 23469 23470 case TCP_RACK_PACE_RATE_SS: 23471 /* Set the fixed pacing rate in Bytes per second ca */ 23472 RACK_OPTS_INC(tcp_rack_pace_rate_ss); 23473 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 23474 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) 23475 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 23476 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) 23477 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 23478 rack->use_fixed_rate = 1; 23479 if (rack->rack_hibeta) 23480 rack_set_cc_pacing(rack); 23481 rack_log_pacing_delay_calc(rack, 23482 rack->r_ctl.rc_fixed_pacing_rate_ss, 23483 rack->r_ctl.rc_fixed_pacing_rate_ca, 23484 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 23485 __LINE__, NULL, 0); 23486 break; 23487 23488 case TCP_RACK_PACE_RATE_CA: 23489 /* Set the fixed pacing rate in Bytes per second ca */ 23490 RACK_OPTS_INC(tcp_rack_pace_rate_ca); 23491 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 23492 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) 23493 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 23494 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) 23495 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 23496 rack->use_fixed_rate = 1; 23497 if (rack->rack_hibeta) 23498 rack_set_cc_pacing(rack); 23499 rack_log_pacing_delay_calc(rack, 23500 rack->r_ctl.rc_fixed_pacing_rate_ss, 23501 rack->r_ctl.rc_fixed_pacing_rate_ca, 23502 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 23503 __LINE__, NULL, 0); 23504 break; 23505 case TCP_RACK_GP_INCREASE_REC: 23506 RACK_OPTS_INC(tcp_gp_inc_rec); 23507 rack->r_ctl.rack_per_of_gp_rec = optval; 23508 rack_log_pacing_delay_calc(rack, 23509 rack->r_ctl.rack_per_of_gp_ss, 23510 rack->r_ctl.rack_per_of_gp_ca, 23511 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 23512 __LINE__, NULL, 0); 23513 break; 23514 case TCP_RACK_GP_INCREASE_CA: 23515 RACK_OPTS_INC(tcp_gp_inc_ca); 23516 ca = optval; 23517 if (ca < 100) { 23518 /* 23519 * We don't allow any reduction 23520 * over the GP 
b/w. 23521 */ 23522 error = EINVAL; 23523 break; 23524 } 23525 rack->r_ctl.rack_per_of_gp_ca = ca; 23526 rack_log_pacing_delay_calc(rack, 23527 rack->r_ctl.rack_per_of_gp_ss, 23528 rack->r_ctl.rack_per_of_gp_ca, 23529 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 23530 __LINE__, NULL, 0); 23531 break; 23532 case TCP_RACK_GP_INCREASE_SS: 23533 RACK_OPTS_INC(tcp_gp_inc_ss); 23534 ss = optval; 23535 if (ss < 100) { 23536 /* 23537 * We don't allow any reduction 23538 * over the GP b/w. 23539 */ 23540 error = EINVAL; 23541 break; 23542 } 23543 rack->r_ctl.rack_per_of_gp_ss = ss; 23544 rack_log_pacing_delay_calc(rack, 23545 rack->r_ctl.rack_per_of_gp_ss, 23546 rack->r_ctl.rack_per_of_gp_ca, 23547 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 23548 __LINE__, NULL, 0); 23549 break; 23550 case TCP_RACK_RR_CONF: 23551 RACK_OPTS_INC(tcp_rack_rrr_no_conf_rate); 23552 if (optval && optval <= 3) 23553 rack->r_rr_config = optval; 23554 else 23555 rack->r_rr_config = 0; 23556 break; 23557 case TCP_PACING_DND: /* URL:dnd */ 23558 if (optval > 0) 23559 rack->rc_pace_dnd = 1; 23560 else 23561 rack->rc_pace_dnd = 0; 23562 break; 23563 case TCP_HDWR_RATE_CAP: 23564 RACK_OPTS_INC(tcp_hdwr_rate_cap); 23565 if (optval) { 23566 if (rack->r_rack_hw_rate_caps == 0) 23567 rack->r_rack_hw_rate_caps = 1; 23568 else 23569 error = EALREADY; 23570 } else { 23571 rack->r_rack_hw_rate_caps = 0; 23572 } 23573 break; 23574 case TCP_RACK_SPLIT_LIMIT: 23575 RACK_OPTS_INC(tcp_split_limit); 23576 rack->r_ctl.rc_split_limit = optval; 23577 break; 23578 case TCP_BBR_HDWR_PACE: 23579 RACK_OPTS_INC(tcp_hdwr_pacing); 23580 if (optval){ 23581 if (rack->rack_hdrw_pacing == 0) { 23582 rack->rack_hdw_pace_ena = 1; 23583 rack->rack_attempt_hdwr_pace = 0; 23584 } else 23585 error = EALREADY; 23586 } else { 23587 rack->rack_hdw_pace_ena = 0; 23588 #ifdef RATELIMIT 23589 if (rack->r_ctl.crte != NULL) { 23590 rack->rack_hdrw_pacing = 0; 23591 rack->rack_attempt_hdwr_pace = 0; 23592 tcp_rel_pacing_rate(rack->r_ctl.crte, tp); 23593 rack->r_ctl.crte = NULL; 23594 } 23595 #endif 23596 } 23597 break; 23598 /* End Pacing related ones */ 23599 case TCP_RACK_PRR_SENDALOT: 23600 /* Allow PRR to send more than one seg */ 23601 RACK_OPTS_INC(tcp_rack_prr_sendalot); 23602 rack->r_ctl.rc_prr_sendalot = optval; 23603 break; 23604 case TCP_RACK_MIN_TO: 23605 /* Minimum time between rack t-o's in ms */ 23606 RACK_OPTS_INC(tcp_rack_min_to); 23607 rack->r_ctl.rc_min_to = optval; 23608 break; 23609 case TCP_RACK_EARLY_SEG: 23610 /* If early recovery max segments */ 23611 RACK_OPTS_INC(tcp_rack_early_seg); 23612 rack->r_ctl.rc_early_recovery_segs = optval; 23613 break; 23614 case TCP_RACK_ENABLE_HYSTART: 23615 { 23616 if (optval) { 23617 tp->t_ccv.flags |= CCF_HYSTART_ALLOWED; 23618 if (rack_do_hystart > RACK_HYSTART_ON) 23619 tp->t_ccv.flags |= CCF_HYSTART_CAN_SH_CWND; 23620 if (rack_do_hystart > RACK_HYSTART_ON_W_SC) 23621 tp->t_ccv.flags |= CCF_HYSTART_CONS_SSTH; 23622 } else { 23623 tp->t_ccv.flags &= ~(CCF_HYSTART_ALLOWED|CCF_HYSTART_CAN_SH_CWND|CCF_HYSTART_CONS_SSTH); 23624 } 23625 } 23626 break; 23627 case TCP_RACK_REORD_THRESH: 23628 /* RACK reorder threshold (shift amount) */ 23629 RACK_OPTS_INC(tcp_rack_reord_thresh); 23630 if ((optval > 0) && (optval < 31)) 23631 rack->r_ctl.rc_reorder_shift = optval; 23632 else 23633 error = EINVAL; 23634 break; 23635 case TCP_RACK_REORD_FADE: 23636 /* Does reordering fade after ms time */ 23637 RACK_OPTS_INC(tcp_rack_reord_fade); 23638 rack->r_ctl.rc_reorder_fade = optval; 23639 break; 23640 case TCP_RACK_TLP_THRESH: 23641 /* 
RACK TLP threshold i.e. srtt+(srtt/N) */ 23642 RACK_OPTS_INC(tcp_rack_tlp_thresh); 23643 if (optval) 23644 rack->r_ctl.rc_tlp_threshold = optval; 23645 else 23646 error = EINVAL; 23647 break; 23648 case TCP_BBR_USE_RACK_RR: 23649 RACK_OPTS_INC(tcp_rack_rr); 23650 if (optval) 23651 rack->use_rack_rr = 1; 23652 else 23653 rack->use_rack_rr = 0; 23654 break; 23655 case TCP_RACK_PKT_DELAY: 23656 /* RACK added ms i.e. rack-rtt + reord + N */ 23657 RACK_OPTS_INC(tcp_rack_pkt_delay); 23658 rack->r_ctl.rc_pkt_delay = optval; 23659 break; 23660 case TCP_DELACK: 23661 RACK_OPTS_INC(tcp_rack_delayed_ack); 23662 if (optval == 0) 23663 tp->t_delayed_ack = 0; 23664 else 23665 tp->t_delayed_ack = 1; 23666 if (tp->t_flags & TF_DELACK) { 23667 tp->t_flags &= ~TF_DELACK; 23668 tp->t_flags |= TF_ACKNOW; 23669 NET_EPOCH_ENTER(et); 23670 rack_output(tp); 23671 NET_EPOCH_EXIT(et); 23672 } 23673 break; 23674 23675 case TCP_BBR_RACK_RTT_USE: 23676 RACK_OPTS_INC(tcp_rack_rtt_use); 23677 if ((optval != USE_RTT_HIGH) && 23678 (optval != USE_RTT_LOW) && 23679 (optval != USE_RTT_AVG)) 23680 error = EINVAL; 23681 else 23682 rack->r_ctl.rc_rate_sample_method = optval; 23683 break; 23684 case TCP_DATA_AFTER_CLOSE: 23685 RACK_OPTS_INC(tcp_data_after_close); 23686 if (optval) 23687 rack->rc_allow_data_af_clo = 1; 23688 else 23689 rack->rc_allow_data_af_clo = 0; 23690 break; 23691 default: 23692 break; 23693 } 23694 tcp_log_socket_option(tp, sopt_name, optval, error); 23695 return (error); 23696 } 23697 23698 23699 static void 23700 rack_apply_deferred_options(struct tcp_rack *rack) 23701 { 23702 struct deferred_opt_list *dol, *sdol; 23703 uint32_t s_optval; 23704 23705 TAILQ_FOREACH_SAFE(dol, &rack->r_ctl.opt_list, next, sdol) { 23706 TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next); 23707 /* Disadvantage of deferral is you lose the error return */ 23708 s_optval = (uint32_t)dol->optval; 23709 (void)rack_process_option(rack->rc_tp, rack, dol->optname, s_optval, dol->optval, NULL); 23710 free(dol, M_TCPDO); 23711 } 23712 } 23713 23714 static void 23715 rack_hw_tls_change(struct tcpcb *tp, int chg) 23716 { 23717 /* Update HW TLS state */ 23718 struct tcp_rack *rack; 23719 23720 rack = (struct tcp_rack *)tp->t_fb_ptr; 23721 if (chg) 23722 rack->r_ctl.fsb.hw_tls = 1; 23723 else 23724 rack->r_ctl.fsb.hw_tls = 0; 23725 } 23726 23727 static int 23728 rack_pru_options(struct tcpcb *tp, int flags) 23729 { 23730 if (flags & PRUS_OOB) 23731 return (EOPNOTSUPP); 23732 return (0); 23733 } 23734 23735 static bool 23736 rack_wake_check(struct tcpcb *tp) 23737 { 23738 struct tcp_rack *rack; 23739 struct timeval tv; 23740 uint32_t cts; 23741 23742 rack = (struct tcp_rack *)tp->t_fb_ptr; 23743 if (rack->r_ctl.rc_hpts_flags) { 23744 cts = tcp_get_usecs(&tv); 23745 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == PACE_PKT_OUTPUT){ 23746 /* 23747 * Pacing timer is up, check if we are ready. 23748 */ 23749 if (TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) 23750 return (true); 23751 } else if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) != 0) { 23752 /* 23753 * A timer is up, check if we are ready. 
23754 */ 23755 if (TSTMP_GEQ(cts, rack->r_ctl.rc_timer_exp)) 23756 return (true); 23757 } 23758 } 23759 return (false); 23760 } 23761 23762 static struct tcp_function_block __tcp_rack = { 23763 .tfb_tcp_block_name = __XSTRING(STACKNAME), 23764 .tfb_tcp_output = rack_output, 23765 .tfb_do_queued_segments = ctf_do_queued_segments, 23766 .tfb_do_segment_nounlock = rack_do_segment_nounlock, 23767 .tfb_tcp_do_segment = rack_do_segment, 23768 .tfb_tcp_ctloutput = rack_ctloutput, 23769 .tfb_tcp_fb_init = rack_init, 23770 .tfb_tcp_fb_fini = rack_fini, 23771 .tfb_tcp_timer_stop_all = rack_stopall, 23772 .tfb_tcp_rexmit_tmr = rack_remxt_tmr, 23773 .tfb_tcp_handoff_ok = rack_handoff_ok, 23774 .tfb_tcp_mtu_chg = rack_mtu_change, 23775 .tfb_pru_options = rack_pru_options, 23776 .tfb_hwtls_change = rack_hw_tls_change, 23777 .tfb_chg_query = rack_chg_query, 23778 .tfb_switch_failed = rack_switch_failed, 23779 .tfb_early_wake_check = rack_wake_check, 23780 .tfb_compute_pipe = rack_compute_pipe, 23781 .tfb_flags = TCP_FUNC_OUTPUT_CANDROP, 23782 }; 23783 23784 /* 23785 * rack_ctloutput() must drop the inpcb lock before performing copyin on 23786 * socket option arguments. When it re-acquires the lock after the copy, it 23787 * has to revalidate that the connection is still valid for the socket 23788 * option. 23789 */ 23790 static int 23791 rack_set_sockopt(struct tcpcb *tp, struct sockopt *sopt) 23792 { 23793 struct inpcb *inp = tptoinpcb(tp); 23794 #ifdef INET 23795 struct ip *ip; 23796 #endif 23797 struct tcp_rack *rack; 23798 struct tcp_hybrid_req hybrid; 23799 uint64_t loptval; 23800 int32_t error = 0, optval; 23801 23802 rack = (struct tcp_rack *)tp->t_fb_ptr; 23803 if (rack == NULL) { 23804 INP_WUNLOCK(inp); 23805 return (EINVAL); 23806 } 23807 #ifdef INET 23808 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 23809 #endif 23810 23811 switch (sopt->sopt_level) { 23812 #ifdef INET6 23813 case IPPROTO_IPV6: 23814 MPASS(inp->inp_vflag & INP_IPV6PROTO); 23815 switch (sopt->sopt_name) { 23816 case IPV6_USE_MIN_MTU: 23817 tcp6_use_min_mtu(tp); 23818 break; 23819 } 23820 INP_WUNLOCK(inp); 23821 return (0); 23822 #endif 23823 #ifdef INET 23824 case IPPROTO_IP: 23825 switch (sopt->sopt_name) { 23826 case IP_TOS: 23827 /* 23828 * The DSCP codepoint has changed, update the fsb. 23829 */ 23830 ip->ip_tos = rack->rc_inp->inp_ip_tos; 23831 break; 23832 case IP_TTL: 23833 /* 23834 * The TTL has changed, update the fsb. 23835 */ 23836 ip->ip_ttl = rack->rc_inp->inp_ip_ttl; 23837 break; 23838 } 23839 INP_WUNLOCK(inp); 23840 return (0); 23841 #endif 23842 #ifdef SO_PEERPRIO 23843 case SOL_SOCKET: 23844 switch (sopt->sopt_name) { 23845 case SO_PEERPRIO: /* SC-URL:bs */ 23846 /* Already read in and sanity checked in sosetopt(). 
*/ 23847 if (inp->inp_socket) { 23848 rack->client_bufferlvl = inp->inp_socket->so_peerprio; 23849 rack_client_buffer_level_set(rack); 23850 } 23851 break; 23852 } 23853 INP_WUNLOCK(inp); 23854 return (0); 23855 #endif 23856 case IPPROTO_TCP: 23857 switch (sopt->sopt_name) { 23858 case TCP_RACK_TLP_REDUCE: /* URL:tlp_reduce */ 23859 /* Pacing related ones */ 23860 case TCP_RACK_PACE_ALWAYS: /* URL:pace_always */ 23861 case TCP_BBR_RACK_INIT_RATE: /* URL:irate */ 23862 case TCP_BBR_IWINTSO: /* URL:tso_iwin */ 23863 case TCP_RACK_PACE_MIN_SEG: /* URL:pace_min_seg */ 23864 case TCP_RACK_PACE_MAX_SEG: /* URL:pace_max_seg */ 23865 case TCP_RACK_FORCE_MSEG: /* URL:force_max_seg */ 23866 case TCP_RACK_PACE_RATE_CA: /* URL:pr_ca */ 23867 case TCP_RACK_PACE_RATE_SS: /* URL:pr_ss*/ 23868 case TCP_RACK_PACE_RATE_REC: /* URL:pr_rec */ 23869 case TCP_RACK_GP_INCREASE_CA: /* URL:gp_inc_ca */ 23870 case TCP_RACK_GP_INCREASE_SS: /* URL:gp_inc_ss */ 23871 case TCP_RACK_GP_INCREASE_REC: /* URL:gp_inc_rec */ 23872 case TCP_RACK_RR_CONF: /* URL:rrr_conf */ 23873 case TCP_BBR_HDWR_PACE: /* URL:hdwrpace */ 23874 case TCP_HDWR_RATE_CAP: /* URL:hdwrcap boolean */ 23875 case TCP_PACING_RATE_CAP: /* URL:cap -- used by side-channel */ 23876 case TCP_HDWR_UP_ONLY: /* URL:uponly -- hardware pacing boolean */ 23877 case TCP_RACK_PACING_BETA: /* URL:pacing_beta */ 23878 case TCP_RACK_PACING_BETA_ECN: /* URL:pacing_beta_ecn */ 23879 case TCP_RACK_PACE_TO_FILL: /* URL:fillcw */ 23880 case TCP_RACK_DGP_IN_REC: /* URL:dgpinrec */ 23881 /* End pacing related */ 23882 case TCP_RXT_CLAMP: /* URL:rxtclamp */ 23883 case TCP_DELACK: /* URL:delack (in base TCP i.e. tcp_hints along with cc etc ) */ 23884 case TCP_RACK_PRR_SENDALOT: /* URL:prr_sendalot */ 23885 case TCP_RACK_MIN_TO: /* URL:min_to */ 23886 case TCP_RACK_EARLY_SEG: /* URL:early_seg */ 23887 case TCP_RACK_REORD_THRESH: /* URL:reord_thresh */ 23888 case TCP_RACK_REORD_FADE: /* URL:reord_fade */ 23889 case TCP_RACK_TLP_THRESH: /* URL:tlp_thresh */ 23890 case TCP_RACK_PKT_DELAY: /* URL:pkt_delay */ 23891 case TCP_RACK_TLP_USE: /* URL:tlp_use */ 23892 case TCP_BBR_RACK_RTT_USE: /* URL:rttuse */ 23893 case TCP_BBR_USE_RACK_RR: /* URL:rackrr */ 23894 case TCP_RACK_DO_DETECTION: /* URL:detect */ 23895 case TCP_NO_PRR: /* URL:noprr */ 23896 case TCP_TIMELY_DYN_ADJ: /* URL:dynamic */ 23897 case TCP_DATA_AFTER_CLOSE: /* no URL */ 23898 case TCP_RACK_NONRXT_CFG_RATE: /* URL:nonrxtcr */ 23899 case TCP_SHARED_CWND_ENABLE: /* URL:scwnd */ 23900 case TCP_RACK_MBUF_QUEUE: /* URL:mqueue */ 23901 case TCP_RACK_NO_PUSH_AT_MAX: /* URL:npush */ 23902 case TCP_SHARED_CWND_TIME_LIMIT: /* URL:lscwnd */ 23903 case TCP_RACK_PROFILE: /* URL:profile */ 23904 case TCP_HYBRID_PACING: /* URL:hybrid */ 23905 case TCP_USE_CMP_ACKS: /* URL:cmpack */ 23906 case TCP_RACK_ABC_VAL: /* URL:labc */ 23907 case TCP_REC_ABC_VAL: /* URL:reclabc */ 23908 case TCP_RACK_MEASURE_CNT: /* URL:measurecnt */ 23909 case TCP_DEFER_OPTIONS: /* URL:defer */ 23910 case TCP_RACK_DSACK_OPT: /* URL:dsack */ 23911 case TCP_RACK_TIMER_SLOP: /* URL:timer_slop */ 23912 case TCP_RACK_ENABLE_HYSTART: /* URL:hystart */ 23913 case TCP_RACK_SET_RXT_OPTIONS: /* URL:rxtsz */ 23914 case TCP_RACK_HI_BETA: /* URL:hibeta */ 23915 case TCP_RACK_SPLIT_LIMIT: /* URL:split */ 23916 case TCP_RACK_PACING_DIVISOR: /* URL:divisor */ 23917 case TCP_PACING_DND: /* URL:dnd */ 23918 goto process_opt; 23919 break; 23920 default: 23921 /* Filter off all unknown options to the base stack */ 23922 return (tcp_default_ctloutput(tp, sopt)); 23923 break; 23924 
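	/*
	 * Illustrative sketch only (not part of the kernel build): a
	 * userspace application that wants one of the options above
	 * would typically switch the connection to this stack and then
	 * issue setsockopt(2) calls against IPPROTO_TCP. Assuming a
	 * connected socket "fd", the default stack name "rack", and
	 * <sys/socket.h>, <netinet/tcp.h>, <string.h> and <err.h>
	 * included, something along these lines would enable
	 * always-on pacing:
	 *
	 *	struct tcp_function_set tfs;
	 *	int one = 1;
	 *
	 *	memset(&tfs, 0, sizeof(tfs));
	 *	strlcpy(tfs.function_set_name, "rack",
	 *	    sizeof(tfs.function_set_name));
	 *	if (setsockopt(fd, IPPROTO_TCP, TCP_FUNCTION_BLK,
	 *	    &tfs, sizeof(tfs)) == -1)
	 *		err(1, "TCP_FUNCTION_BLK");
	 *	if (setsockopt(fd, IPPROTO_TCP, TCP_RACK_PACE_ALWAYS,
	 *	    &one, sizeof(one)) == -1)
	 *		err(1, "TCP_RACK_PACE_ALWAYS");
	 *
	 * The stack name depends on how STACKNAME/STACKALIAS were set
	 * at build time; error handling beyond err(3) is omitted for
	 * brevity.
	 */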
} 23925 23926 default: 23927 INP_WUNLOCK(inp); 23928 return (0); 23929 } 23930 process_opt: 23931 INP_WUNLOCK(inp); 23932 if (sopt->sopt_name == TCP_PACING_RATE_CAP) { 23933 error = sooptcopyin(sopt, &loptval, sizeof(loptval), sizeof(loptval)); 23934 /* 23935 * We truncate it down to 32 bits for the socket-option trace; this 23936 * means rates > 34Gbps won't show right, but that's probably ok. 23937 */ 23938 optval = (uint32_t)loptval; 23939 } else if (sopt->sopt_name == TCP_HYBRID_PACING) { 23940 error = sooptcopyin(sopt, &hybrid, sizeof(hybrid), sizeof(hybrid)); 23941 } else { 23942 error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); 23943 /* Save it in 64 bit form too */ 23944 loptval = optval; 23945 } 23946 if (error) 23947 return (error); 23948 INP_WLOCK(inp); 23949 if (tp->t_fb != &__tcp_rack) { 23950 INP_WUNLOCK(inp); 23951 return (ENOPROTOOPT); 23952 } 23953 if (rack->defer_options && (rack->gp_ready == 0) && 23954 (sopt->sopt_name != TCP_DEFER_OPTIONS) && 23955 (sopt->sopt_name != TCP_HYBRID_PACING) && 23956 (sopt->sopt_name != TCP_RACK_PACING_BETA) && 23957 (sopt->sopt_name != TCP_RACK_SET_RXT_OPTIONS) && 23958 (sopt->sopt_name != TCP_RACK_PACING_BETA_ECN) && 23959 (sopt->sopt_name != TCP_RACK_MEASURE_CNT)) { 23960 /* Options are being deferred */ 23961 if (rack_add_deferred_option(rack, sopt->sopt_name, loptval)) { 23962 INP_WUNLOCK(inp); 23963 return (0); 23964 } else { 23965 /* No memory to defer, fail */ 23966 INP_WUNLOCK(inp); 23967 return (ENOMEM); 23968 } 23969 } 23970 error = rack_process_option(tp, rack, sopt->sopt_name, optval, loptval, &hybrid); 23971 INP_WUNLOCK(inp); 23972 return (error); 23973 } 23974 23975 static void 23976 rack_fill_info(struct tcpcb *tp, struct tcp_info *ti) 23977 { 23978 23979 INP_WLOCK_ASSERT(tptoinpcb(tp)); 23980 bzero(ti, sizeof(*ti)); 23981 23982 ti->tcpi_state = tp->t_state; 23983 if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP)) 23984 ti->tcpi_options |= TCPI_OPT_TIMESTAMPS; 23985 if (tp->t_flags & TF_SACK_PERMIT) 23986 ti->tcpi_options |= TCPI_OPT_SACK; 23987 if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) { 23988 ti->tcpi_options |= TCPI_OPT_WSCALE; 23989 ti->tcpi_snd_wscale = tp->snd_scale; 23990 ti->tcpi_rcv_wscale = tp->rcv_scale; 23991 } 23992 if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) 23993 ti->tcpi_options |= TCPI_OPT_ECN; 23994 if (tp->t_flags & TF_FASTOPEN) 23995 ti->tcpi_options |= TCPI_OPT_TFO; 23996 /* t_rcvtime is still kept in ticks */ 23997 ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick; 23998 /* Since we hold everything in precise useconds this is easy */ 23999 ti->tcpi_rtt = tp->t_srtt; 24000 ti->tcpi_rttvar = tp->t_rttvar; 24001 ti->tcpi_rto = tp->t_rxtcur; 24002 ti->tcpi_snd_ssthresh = tp->snd_ssthresh; 24003 ti->tcpi_snd_cwnd = tp->snd_cwnd; 24004 /* 24005 * FreeBSD-specific extension fields for tcp_info. 24006 */ 24007 ti->tcpi_rcv_space = tp->rcv_wnd; 24008 ti->tcpi_rcv_nxt = tp->rcv_nxt; 24009 ti->tcpi_snd_wnd = tp->snd_wnd; 24010 ti->tcpi_snd_bwnd = 0; /* Unused, kept for compat. 
*/ 24011 ti->tcpi_snd_nxt = tp->snd_nxt; 24012 ti->tcpi_snd_mss = tp->t_maxseg; 24013 ti->tcpi_rcv_mss = tp->t_maxseg; 24014 ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack; 24015 ti->tcpi_rcv_ooopack = tp->t_rcvoopack; 24016 ti->tcpi_snd_zerowin = tp->t_sndzerowin; 24017 ti->tcpi_total_tlp = tp->t_sndtlppack; 24018 ti->tcpi_total_tlp_bytes = tp->t_sndtlpbyte; 24019 #ifdef NETFLIX_STATS 24020 memcpy(&ti->tcpi_rxsyninfo, &tp->t_rxsyninfo, sizeof(struct tcpsyninfo)); 24021 #endif 24022 #ifdef TCP_OFFLOAD 24023 if (tp->t_flags & TF_TOE) { 24024 ti->tcpi_options |= TCPI_OPT_TOE; 24025 tcp_offload_tcp_info(tp, ti); 24026 } 24027 #endif 24028 } 24029 24030 static int 24031 rack_get_sockopt(struct tcpcb *tp, struct sockopt *sopt) 24032 { 24033 struct inpcb *inp = tptoinpcb(tp); 24034 struct tcp_rack *rack; 24035 int32_t error, optval; 24036 uint64_t val, loptval; 24037 struct tcp_info ti; 24038 /* 24039 * Because all our options are either boolean or an int, we can just 24040 * pull everything into optval and then unlock and copy. If we ever 24041 * add an option that is not an int, then this will have quite an 24042 * impact on this routine. 24043 */ 24044 error = 0; 24045 rack = (struct tcp_rack *)tp->t_fb_ptr; 24046 if (rack == NULL) { 24047 INP_WUNLOCK(inp); 24048 return (EINVAL); 24049 } 24050 switch (sopt->sopt_name) { 24051 case TCP_INFO: 24052 /* First get the info filled */ 24053 rack_fill_info(tp, &ti); 24054 /* Fix up the rtt related fields if needed */ 24055 INP_WUNLOCK(inp); 24056 error = sooptcopyout(sopt, &ti, sizeof ti); 24057 return (error); 24058 /* 24059 * Beta is the congestion control value for NewReno that influences how 24060 * much of a backoff happens when loss is detected. It is normally set 24061 * to 50 for 50% i.e. the cwnd is reduced to 50% of its previous value 24062 * when you exit recovery. 24063 */ 24064 case TCP_RACK_PACING_BETA: 24065 if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) 24066 error = EINVAL; 24067 else if (rack->rc_pacing_cc_set == 0) 24068 optval = rack->r_ctl.rc_saved_beta.beta; 24069 else { 24070 /* 24071 * Reach out into the CC data and report back what 24072 * I have previously set. Yeah it looks hackish but 24073 * we don't want to report the saved values. 24074 */ 24075 if (tp->t_ccv.cc_data) 24076 optval = ((struct newreno *)tp->t_ccv.cc_data)->beta; 24077 else 24078 error = EINVAL; 24079 } 24080 break; 24081 /* 24082 * Beta_ecn is the congestion control value for NewReno that influences how 24083 * much of a backoff happens when an ECN mark is detected. It is normally set 24084 * to 80 for 80% i.e. the cwnd is reduced by 20% of its previous value when 24085 * you exit recovery. Note that classic ECN has a beta of 50; it is only 24086 * ABE ECN that uses this "less" value, but we do too with pacing :) 24087 */ 24088 24089 case TCP_RACK_PACING_BETA_ECN: 24090 if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) 24091 error = EINVAL; 24092 else if (rack->rc_pacing_cc_set == 0) 24093 optval = rack->r_ctl.rc_saved_beta.beta_ecn; 24094 else { 24095 /* 24096 * Reach out into the CC data and report back what 24097 * I have previously set. Yeah it looks hackish but 24098 * we don't want to report the saved values. 
24099 */ 24100 if (tp->t_ccv.cc_data) 24101 optval = ((struct newreno *)tp->t_ccv.cc_data)->beta_ecn; 24102 else 24103 error = EINVAL; 24104 } 24105 break; 24106 case TCP_RACK_DSACK_OPT: 24107 optval = 0; 24108 if (rack->rc_rack_tmr_std_based) { 24109 optval |= 1; 24110 } 24111 if (rack->rc_rack_use_dsack) { 24112 optval |= 2; 24113 } 24114 break; 24115 case TCP_RACK_ENABLE_HYSTART: 24116 { 24117 if (tp->t_ccv.flags & CCF_HYSTART_ALLOWED) { 24118 optval = RACK_HYSTART_ON; 24119 if (tp->t_ccv.flags & CCF_HYSTART_CAN_SH_CWND) 24120 optval = RACK_HYSTART_ON_W_SC; 24121 if (tp->t_ccv.flags & CCF_HYSTART_CONS_SSTH) 24122 optval = RACK_HYSTART_ON_W_SC_C; 24123 } else { 24124 optval = RACK_HYSTART_OFF; 24125 } 24126 } 24127 break; 24128 case TCP_RACK_DGP_IN_REC: 24129 optval = rack->r_ctl.full_dgp_in_rec; 24130 break; 24131 case TCP_RACK_HI_BETA: 24132 optval = rack->rack_hibeta; 24133 break; 24134 case TCP_RXT_CLAMP: 24135 optval = rack->r_ctl.saved_rxt_clamp_val; 24136 break; 24137 case TCP_DEFER_OPTIONS: 24138 optval = rack->defer_options; 24139 break; 24140 case TCP_RACK_MEASURE_CNT: 24141 optval = rack->r_ctl.req_measurements; 24142 break; 24143 case TCP_REC_ABC_VAL: 24144 optval = rack->r_use_labc_for_rec; 24145 break; 24146 case TCP_RACK_ABC_VAL: 24147 optval = rack->rc_labc; 24148 break; 24149 case TCP_HDWR_UP_ONLY: 24150 optval = rack->r_up_only; 24151 break; 24152 case TCP_PACING_RATE_CAP: 24153 loptval = rack->r_ctl.bw_rate_cap; 24154 break; 24155 case TCP_RACK_PROFILE: 24156 /* You cannot retrieve a profile, it's write-only */ 24157 error = EINVAL; 24158 break; 24159 case TCP_HYBRID_PACING: 24160 /* You cannot retrieve hybrid pacing information, it's write-only */ 24161 error = EINVAL; 24162 break; 24163 case TCP_USE_CMP_ACKS: 24164 optval = rack->r_use_cmp_ack; 24165 break; 24166 case TCP_RACK_PACE_TO_FILL: 24167 optval = rack->rc_pace_to_cwnd; 24168 if (optval && rack->r_fill_less_agg) 24169 optval++; 24170 break; 24171 case TCP_RACK_NO_PUSH_AT_MAX: 24172 optval = rack->r_ctl.rc_no_push_at_mrtt; 24173 break; 24174 case TCP_SHARED_CWND_ENABLE: 24175 optval = rack->rack_enable_scwnd; 24176 break; 24177 case TCP_RACK_NONRXT_CFG_RATE: 24178 optval = rack->rack_rec_nonrxt_use_cr; 24179 break; 24180 case TCP_NO_PRR: 24181 if (rack->rack_no_prr == 1) 24182 optval = 1; 24183 else if (rack->no_prr_addback == 1) 24184 optval = 2; 24185 else 24186 optval = 0; 24187 break; 24188 case TCP_RACK_DO_DETECTION: 24189 optval = rack->do_detection; 24190 break; 24191 case TCP_RACK_MBUF_QUEUE: 24192 /* Now do we use the LRO mbuf-queue feature */ 24193 optval = rack->r_mbuf_queue; 24194 break; 24195 case TCP_TIMELY_DYN_ADJ: 24196 optval = rack->rc_gp_dyn_mul; 24197 break; 24198 case TCP_BBR_IWINTSO: 24199 optval = rack->rc_init_win; 24200 break; 24201 case TCP_RACK_TLP_REDUCE: 24202 /* RACK TLP cwnd reduction (bool) */ 24203 optval = rack->r_ctl.rc_tlp_cwnd_reduce; 24204 break; 24205 case TCP_BBR_RACK_INIT_RATE: 24206 val = rack->r_ctl.init_rate; 24207 /* convert to kbits per sec */ 24208 val *= 8; 24209 val /= 1000; 24210 optval = (uint32_t)val; 24211 break; 24212 case TCP_RACK_FORCE_MSEG: 24213 optval = rack->rc_force_max_seg; 24214 break; 24215 case TCP_RACK_PACE_MIN_SEG: 24216 optval = rack->r_ctl.rc_user_set_min_segs; 24217 break; 24218 case TCP_RACK_PACE_MAX_SEG: 24219 /* Max segments in a pace */ 24220 optval = rack->rc_user_set_max_segs; 24221 break; 24222 case TCP_RACK_PACE_ALWAYS: 24223 /* Use the always pace method */ 24224 optval = rack->rc_always_pace; 24225 break; 24226 case 
TCP_RACK_PRR_SENDALOT: 24227 /* Allow PRR to send more than one seg */ 24228 optval = rack->r_ctl.rc_prr_sendalot; 24229 break; 24230 case TCP_RACK_MIN_TO: 24231 /* Minimum time between rack t-o's in ms */ 24232 optval = rack->r_ctl.rc_min_to; 24233 break; 24234 case TCP_RACK_SPLIT_LIMIT: 24235 optval = rack->r_ctl.rc_split_limit; 24236 break; 24237 case TCP_RACK_EARLY_SEG: 24238 /* If early recovery max segments */ 24239 optval = rack->r_ctl.rc_early_recovery_segs; 24240 break; 24241 case TCP_RACK_REORD_THRESH: 24242 /* RACK reorder threshold (shift amount) */ 24243 optval = rack->r_ctl.rc_reorder_shift; 24244 break; 24245 case TCP_RACK_REORD_FADE: 24246 /* Does reordering fade after ms time */ 24247 optval = rack->r_ctl.rc_reorder_fade; 24248 break; 24249 case TCP_BBR_USE_RACK_RR: 24250 /* Do we use the rack cheat for rxt */ 24251 optval = rack->use_rack_rr; 24252 break; 24253 case TCP_RACK_RR_CONF: 24254 optval = rack->r_rr_config; 24255 break; 24256 case TCP_HDWR_RATE_CAP: 24257 optval = rack->r_rack_hw_rate_caps; 24258 break; 24259 case TCP_BBR_HDWR_PACE: 24260 optval = rack->rack_hdw_pace_ena; 24261 break; 24262 case TCP_RACK_TLP_THRESH: 24263 /* RACK TLP threshold i.e. srtt+(srtt/N) */ 24264 optval = rack->r_ctl.rc_tlp_threshold; 24265 break; 24266 case TCP_RACK_PKT_DELAY: 24267 /* RACK added ms i.e. rack-rtt + reord + N */ 24268 optval = rack->r_ctl.rc_pkt_delay; 24269 break; 24270 case TCP_RACK_TLP_USE: 24271 optval = rack->rack_tlp_threshold_use; 24272 break; 24273 case TCP_PACING_DND: 24274 optval = rack->rc_pace_dnd; 24275 break; 24276 case TCP_RACK_PACE_RATE_CA: 24277 optval = rack->r_ctl.rc_fixed_pacing_rate_ca; 24278 break; 24279 case TCP_RACK_PACE_RATE_SS: 24280 optval = rack->r_ctl.rc_fixed_pacing_rate_ss; 24281 break; 24282 case TCP_RACK_PACE_RATE_REC: 24283 optval = rack->r_ctl.rc_fixed_pacing_rate_rec; 24284 break; 24285 case TCP_RACK_GP_INCREASE_SS: 24286 optval = rack->r_ctl.rack_per_of_gp_ss; 24287 break; 24288 case TCP_RACK_GP_INCREASE_CA: 24289 optval = rack->r_ctl.rack_per_of_gp_ca; 24290 break; 24291 case TCP_RACK_PACING_DIVISOR: 24292 optval = rack->r_ctl.pace_len_divisor; 24293 break; 24294 case TCP_BBR_RACK_RTT_USE: 24295 optval = rack->r_ctl.rc_rate_sample_method; 24296 break; 24297 case TCP_DELACK: 24298 optval = tp->t_delayed_ack; 24299 break; 24300 case TCP_DATA_AFTER_CLOSE: 24301 optval = rack->rc_allow_data_af_clo; 24302 break; 24303 case TCP_SHARED_CWND_TIME_LIMIT: 24304 optval = rack->r_limit_scw; 24305 break; 24306 case TCP_RACK_TIMER_SLOP: 24307 optval = rack->r_ctl.timer_slop; 24308 break; 24309 default: 24310 return (tcp_default_ctloutput(tp, sopt)); 24311 break; 24312 } 24313 INP_WUNLOCK(inp); 24314 if (error == 0) { 24315 if (sopt->sopt_name == TCP_PACING_RATE_CAP) 24316 error = sooptcopyout(sopt, &loptval, sizeof loptval); 24317 else 24318 error = sooptcopyout(sopt, &optval, sizeof optval); 24319 } 24320 return (error); 24321 } 24322 24323 static int 24324 rack_ctloutput(struct tcpcb *tp, struct sockopt *sopt) 24325 { 24326 if (sopt->sopt_dir == SOPT_SET) { 24327 return (rack_set_sockopt(tp, sopt)); 24328 } else if (sopt->sopt_dir == SOPT_GET) { 24329 return (rack_get_sockopt(tp, sopt)); 24330 } else { 24331 panic("%s: sopt_dir $%d", __func__, sopt->sopt_dir); 24332 } 24333 } 24334 24335 static const char *rack_stack_names[] = { 24336 __XSTRING(STACKNAME), 24337 #ifdef STACKALIAS 24338 __XSTRING(STACKALIAS), 24339 #endif 24340 }; 24341 24342 static int 24343 rack_ctor(void *mem, int32_t size, void *arg, int32_t how) 24344 { 24345 memset(mem, 0, size); 24346 return 
(0); 24347 } 24348 24349 static void 24350 rack_dtor(void *mem, int32_t size, void *arg) 24351 { 24352 24353 } 24354 24355 static bool rack_mod_inited = false; 24356 24357 static int 24358 tcp_addrack(module_t mod, int32_t type, void *data) 24359 { 24360 int32_t err = 0; 24361 int num_stacks; 24362 24363 switch (type) { 24364 case MOD_LOAD: 24365 rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map", 24366 sizeof(struct rack_sendmap), 24367 rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); 24368 24369 rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb", 24370 sizeof(struct tcp_rack), 24371 rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); 24372 24373 sysctl_ctx_init(&rack_sysctl_ctx); 24374 rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 24375 SYSCTL_STATIC_CHILDREN(_net_inet_tcp), 24376 OID_AUTO, 24377 #ifdef STACKALIAS 24378 __XSTRING(STACKALIAS), 24379 #else 24380 __XSTRING(STACKNAME), 24381 #endif 24382 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 24383 ""); 24384 if (rack_sysctl_root == NULL) { 24385 printf("Failed to add sysctl node\n"); 24386 err = EFAULT; 24387 goto free_uma; 24388 } 24389 rack_init_sysctls(); 24390 num_stacks = nitems(rack_stack_names); 24391 err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK, 24392 rack_stack_names, &num_stacks); 24393 if (err) { 24394 printf("Failed to register %s stack name for " 24395 "%s module\n", rack_stack_names[num_stacks], 24396 __XSTRING(MODNAME)); 24397 sysctl_ctx_free(&rack_sysctl_ctx); 24398 free_uma: 24399 uma_zdestroy(rack_zone); 24400 uma_zdestroy(rack_pcb_zone); 24401 rack_counter_destroy(); 24402 printf("Failed to register rack module -- err:%d\n", err); 24403 return (err); 24404 } 24405 tcp_lro_reg_mbufq(); 24406 rack_mod_inited = true; 24407 break; 24408 case MOD_QUIESCE: 24409 err = deregister_tcp_functions(&__tcp_rack, true, false); 24410 break; 24411 case MOD_UNLOAD: 24412 err = deregister_tcp_functions(&__tcp_rack, false, true); 24413 if (err == EBUSY) 24414 break; 24415 if (rack_mod_inited) { 24416 uma_zdestroy(rack_zone); 24417 uma_zdestroy(rack_pcb_zone); 24418 sysctl_ctx_free(&rack_sysctl_ctx); 24419 rack_counter_destroy(); 24420 rack_mod_inited = false; 24421 } 24422 tcp_lro_dereg_mbufq(); 24423 err = 0; 24424 break; 24425 default: 24426 return (EOPNOTSUPP); 24427 } 24428 return (err); 24429 } 24430 24431 static moduledata_t tcp_rack = { 24432 .name = __XSTRING(MODNAME), 24433 .evhand = tcp_addrack, 24434 .priv = 0 24435 }; 24436 24437 MODULE_VERSION(MODNAME, 1); 24438 DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); 24439 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1); 24440 24441 #endif /* #if !defined(INET) && !defined(INET6) */ 24442
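/*
 * Usage sketch (illustrative assumptions, not part of the code above): once
 * this module is loaded, e.g. via "kldload tcp_rack", the stack is listed in
 * the net.inet.tcp.functions_available sysctl and can be selected either
 * per-connection with the TCP_FUNCTION_BLK socket option (see the example
 * comment in rack_set_sockopt() above) or system-wide:
 *
 *	sysctl net.inet.tcp.functions_default=rack
 *
 * The exact stack and alias names depend on how MODNAME, STACKNAME and
 * STACKALIAS were defined at build time; "rack" is the in-tree default.
 */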