1 /*- 2 * Copyright (c) 2016-2020 Netflix, Inc. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 * SUCH DAMAGE. 24 * 25 */ 26 27 #include <sys/cdefs.h> 28 #include "opt_inet.h" 29 #include "opt_inet6.h" 30 #include "opt_ipsec.h" 31 #include "opt_ratelimit.h" 32 #include "opt_kern_tls.h" 33 #if defined(INET) || defined(INET6) 34 #include <sys/param.h> 35 #include <sys/arb.h> 36 #include <sys/module.h> 37 #include <sys/kernel.h> 38 #ifdef TCP_HHOOK 39 #include <sys/hhook.h> 40 #endif 41 #include <sys/lock.h> 42 #include <sys/malloc.h> 43 #include <sys/mutex.h> 44 #include <sys/mbuf.h> 45 #include <sys/proc.h> /* for proc0 declaration */ 46 #include <sys/socket.h> 47 #include <sys/socketvar.h> 48 #include <sys/sysctl.h> 49 #include <sys/systm.h> 50 #ifdef STATS 51 #include <sys/qmath.h> 52 #include <sys/tree.h> 53 #include <sys/stats.h> /* Must come after qmath.h and tree.h */ 54 #else 55 #include <sys/tree.h> 56 #endif 57 #include <sys/refcount.h> 58 #include <sys/queue.h> 59 #include <sys/tim_filter.h> 60 #include <sys/smp.h> 61 #include <sys/kthread.h> 62 #include <sys/kern_prefetch.h> 63 #include <sys/protosw.h> 64 #ifdef TCP_ACCOUNTING 65 #include <sys/sched.h> 66 #include <machine/cpu.h> 67 #endif 68 #include <vm/uma.h> 69 70 #include <net/route.h> 71 #include <net/route/nhop.h> 72 #include <net/vnet.h> 73 74 #define TCPSTATES /* for logging */ 75 76 #include <netinet/in.h> 77 #include <netinet/in_kdtrace.h> 78 #include <netinet/in_pcb.h> 79 #include <netinet/ip.h> 80 #include <netinet/ip_var.h> 81 #include <netinet/ip6.h> 82 #include <netinet6/in6_pcb.h> 83 #include <netinet6/ip6_var.h> 84 #include <netinet/tcp.h> 85 #define TCPOUTFLAGS 86 #include <netinet/tcp_fsm.h> 87 #include <netinet/tcp_seq.h> 88 #include <netinet/tcp_timer.h> 89 #include <netinet/tcp_var.h> 90 #include <netinet/tcp_log_buf.h> 91 #include <netinet/tcp_syncache.h> 92 #include <netinet/tcp_hpts.h> 93 #include <netinet/tcp_ratelimit.h> 94 #include <netinet/tcp_accounting.h> 95 #include <netinet/tcpip.h> 96 #include <netinet/cc/cc.h> 97 #include <netinet/cc/cc_newreno.h> 98 #include <netinet/tcp_fastopen.h> 99 #include <netinet/tcp_lro.h> 100 #ifdef NETFLIX_SHARED_CWND 101 #include <netinet/tcp_shared_cwnd.h> 102 #endif 103 #ifdef TCP_OFFLOAD 104 #include <netinet/tcp_offload.h> 105 #endif 
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcp_ecn.h>

#include <netipsec/ipsec_support.h>

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif				/* IPSEC */

#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif
#include "sack_filter.h"
#include "tcp_rack.h"
#include "tailq_hash.h"
#include "rack_bbr_common.h"

uma_zone_t rack_zone;
uma_zone_t rack_pcb_zone;

#ifndef TICKS2SBT
#define TICKS2SBT(__t) (tick_sbt * ((sbintime_t)(__t)))
#endif

VNET_DECLARE(uint32_t, newreno_beta);
VNET_DECLARE(uint32_t, newreno_beta_ecn);
#define V_newreno_beta VNET(newreno_beta)
#define V_newreno_beta_ecn VNET(newreno_beta_ecn)

#define M_TCPFSB	__CONCAT(M_TCPFSB, STACKNAME)
#define M_TCPDO		__CONCAT(M_TCPDO, STACKNAME)

MALLOC_DEFINE(M_TCPFSB, "tcp_fsb_" __XSTRING(STACKNAME), "TCP fast send block");
MALLOC_DEFINE(M_TCPDO, "tcp_do_" __XSTRING(STACKNAME), "TCP deferred options");
MALLOC_DEFINE(M_TCPPCM, "tcp_pcm_" __XSTRING(STACKNAME), "TCP PCM measurement information");

struct sysctl_ctx_list rack_sysctl_ctx;
struct sysctl_oid *rack_sysctl_root;

#define CUM_ACKED 1
#define SACKED 2

/*
 * The RACK module incorporates a number of
 * TCP ideas that have been put out into the IETF
 * over the last few years:
 * - Matt Mathis's Rate Halving which slowly drops
 *   the congestion window so that the ack clock can
 *   be maintained during a recovery.
 * - Yuchung Cheng's RACK TCP (for which it is named) that
 *   will stop us using the number of dup acks and instead
 *   use time as the gauge of when we retransmit.
 * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
 *   of Dukkipati et al.
 * RACK depends on SACK, so if an endpoint arrives that
 * cannot do SACK the state machine below will shuttle the
 * connection back to using the "default" TCP stack that is
 * in FreeBSD.
 *
 * To implement RACK the original TCP stack was first decomposed
 * into a functional state machine with individual states
 * for each of the possible TCP connection states. The do_segment
 * function's role in life is to mandate that the connection supports
 * SACK initially and then assure that the RACK state matches the
 * connection state before calling the state's do_segment function. Each
 * state is simplified due to the fact that the original do_segment
 * has been decomposed and we *know* what state we are in (no
 * switches on the state) and all tests for SACK are gone. This
 * greatly simplifies what each state does.
 *
 * TCP output is also over-written with a new version since it
 * must maintain the new rack scoreboard.
 *
 */
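/*
 * Illustrative sketch only (editor's addition, not this stack's actual
 * dispatch code): the decomposition described above means a common entry
 * point validates SACK support once and then jumps straight to a per-state
 * handler such as rack_do_established() or rack_do_fin_wait_1() (declared
 * further below), so no handler needs a switch on t_state or any SACK
 * checks.  The table below is hypothetical.
 */
#if 0	/* example sketch, not compiled */
static int (* const rack_state_handler[TCP_NSTATES])(struct mbuf *,
    struct tcphdr *, struct socket *, struct tcpcb *, struct tcpopt *,
    int32_t, int32_t, uint32_t, int32_t, int32_t, uint8_t) = {
	[TCPS_ESTABLISHED] = rack_do_established,
	[TCPS_FIN_WAIT_1]  = rack_do_fin_wait_1,
	/* ... one entry per TCP state ... */
};
#endif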
static int32_t rack_tlp_thresh = 1;
static int32_t rack_tlp_limit = 2;	/* No more than 2 TLPs w-out new data */
static int32_t rack_tlp_use_greater = 1;
static int32_t rack_reorder_thresh = 2;
static int32_t rack_reorder_fade = 60000000;	/* 0 - never fade, def 60,000,000
						 * - 60 seconds */
static uint32_t rack_pcm_every_n_rounds = 100;
static uint32_t rack_pcm_blast = 0;
static uint32_t rack_pcm_is_enabled = 1;
static uint8_t rack_ssthresh_rest_rto_rec = 0;	/* Do we restore ssthresh when we have rec -> rto -> rec */

static uint32_t rack_gp_gain_req = 1200;	/* Amount percent wise required to gain to record a round as "gaining" */
static uint32_t rack_rnd_cnt_req = 0x10005;	/* Default number of rounds if we are below rack_gp_gain_req where we exit ss */


static int32_t rack_rxt_scoreboard_clear_thresh = 2;
static int32_t rack_dnd_default = 0;		/* For rr_conf = 3, what is the default for dnd */
static int32_t rack_rxt_controls = 0;
static int32_t rack_fill_cw_state = 0;
static uint8_t rack_req_measurements = 1;
/* Attack threshold detections */
static uint32_t rack_highest_sack_thresh_seen = 0;
static uint32_t rack_highest_move_thresh_seen = 0;
static uint32_t rack_merge_out_sacks_on_attack = 0;
static int32_t rack_enable_hw_pacing = 0;	/* Due to CCSP keep it off by default */
static int32_t rack_hw_rate_caps = 0;		/* 1; */
static int32_t rack_hw_rate_cap_per = 0;	/* 0 -- off */
static int32_t rack_hw_rate_min = 0;		/* 1500000;*/
static int32_t rack_hw_rate_to_low = 0;		/* 1200000; */
static int32_t rack_hw_up_only = 0;
static int32_t rack_stats_gets_ms_rtt = 1;
static int32_t rack_prr_addbackmax = 2;
static int32_t rack_do_hystart = 0;
static int32_t rack_apply_rtt_with_reduced_conf = 0;
static int32_t rack_hibeta_setting = 0;
static int32_t rack_default_pacing_divisor = 250;
static uint16_t rack_pacing_min_seg = 0;
static int32_t rack_timely_off = 0;

static uint32_t sad_seg_size_per = 800;		/* 80.0 % */
static int32_t rack_pkt_delay = 1000;
static int32_t rack_send_a_lot_in_prr = 1;
static int32_t rack_min_to = 1000;		/* Number of microsecond min timeout */
static int32_t rack_verbose_logging = 0;
static int32_t rack_ignore_data_after_close = 1;
static int32_t rack_enable_shared_cwnd = 1;
static int32_t rack_use_cmp_acks = 1;
static int32_t rack_use_fsb = 1;
static int32_t rack_use_rfo = 1;
static int32_t rack_use_rsm_rfo = 1;
static int32_t rack_max_abc_post_recovery = 2;
static int32_t rack_client_low_buf = 0;
static int32_t rack_dsack_std_based = 0x3;	/* bit field bit 1 sets rc_rack_tmr_std_based and bit 2 sets rc_rack_use_dsack */
static int32_t rack_bw_multipler = 0;		/* Limit on fill cw's jump up to be this x gp_est */
#ifdef TCP_ACCOUNTING
static int32_t rack_tcp_accounting = 0;
#endif
static int32_t rack_limits_scwnd = 1;
static int32_t rack_enable_mqueue_for_nonpaced = 0;
static int32_t rack_hybrid_allow_set_maxseg = 0;
static int32_t rack_disable_prr = 0;
static int32_t use_rack_rr = 1;
static int32_t rack_non_rxt_use_cr = 0;		/* does a non-rxt in recovery use the configured rate (ss/ca)? */
static int32_t rack_persist_min = 250000;	/* 250usec */
static int32_t rack_persist_max = 2000000;	/* 2 Second in usec's */
static int32_t rack_honors_hpts_min_to = 1;	/* Do we honor the hpts minimum time out for pacing timers */
static uint32_t rack_max_reduce = 10;		/* Percent we can reduce slot by */
static int32_t rack_sack_not_required = 1;	/* set to one to allow non-sack to use rack */
static int32_t rack_limit_time_with_srtt = 0;
static int32_t rack_autosndbuf_inc = 20;	/* In percentage form */
static int32_t rack_enobuf_hw_boost_mult = 0;	/* How many times the hw rate we boost slot using time_between */
static int32_t rack_enobuf_hw_max = 12000;	/* 12 ms in usecs */
static int32_t rack_enobuf_hw_min = 10000;	/* 10 ms in usecs */
static int32_t rack_hw_rwnd_factor = 2;		/* How many max_segs the rwnd must be before we hold off sending */
static int32_t rack_hw_check_queue = 0;		/* Do we always pre-check queue depth of a hw queue */

/*
 * Currently regular tcp has a rto_min of 30ms;
 * the backoff goes 12 times, so that ends up
 * being a total of 122.850 seconds before a
 * connection is killed.
 */
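/*
 * Worked example of the figure above (editor's illustration, assuming the
 * RTO simply doubles on each of the 12 backoffs with no cap):
 *
 *	30 ms * (1 + 2 + 4 + ... + 2048) = 30 ms * (2^12 - 1)
 *	                                 = 30 ms * 4095
 *	                                 = 122,850 ms ~= 122.85 seconds
 */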
static uint32_t rack_def_data_window = 20;
static uint32_t rack_goal_bdp = 2;
static uint32_t rack_min_srtts = 1;
static uint32_t rack_min_measure_usec = 0;
static int32_t rack_tlp_min = 10000;		/* 10ms */
static int32_t rack_rto_min = 30000;		/* 30,000 usec same as main freebsd */
static int32_t rack_rto_max = 4000000;		/* 4 seconds in usec's */
static const int32_t rack_free_cache = 2;
static int32_t rack_hptsi_segments = 40;
static int32_t rack_rate_sample_method = USE_RTT_LOW;
static int32_t rack_pace_every_seg = 0;
static int32_t rack_delayed_ack_time = 40000;	/* 40ms in usecs */
static int32_t rack_slot_reduction = 4;
static int32_t rack_wma_divisor = 8;		/* For WMA calculation */
static int32_t rack_cwnd_block_ends_measure = 0;
static int32_t rack_rwnd_block_ends_measure = 0;
static int32_t rack_def_profile = 0;

static int32_t rack_lower_cwnd_at_tlp = 0;
static int32_t rack_always_send_oldest = 0;
static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;

static uint16_t rack_per_of_gp_ss = 250;	/* 250 % slow-start */
static uint16_t rack_per_of_gp_ca = 200;	/* 200 % congestion-avoidance */
static uint16_t rack_per_of_gp_rec = 200;	/* 200 % of bw */

/* Probertt */
static uint16_t rack_per_of_gp_probertt = 60;	/* 60% of bw */
static uint16_t rack_per_of_gp_lowthresh = 40;	/* 40% is bottom */
static uint16_t rack_per_of_gp_probertt_reduce = 10; /* 10% reduction */
static uint16_t rack_atexit_prtt_hbp = 130;	/* Clamp to 130% on exit prtt if highly buffered path */
static uint16_t rack_atexit_prtt = 130;		/* Clamp to 100% on exit prtt if non highly buffered path */

static uint32_t rack_max_drain_wait = 2;	/* How many gp srtt's before we give up draining */
static uint32_t rack_must_drain = 1;		/* How many GP srtt's we *must* wait */
static uint32_t rack_probertt_use_min_rtt_entry = 1;	/* Use the min to calculate the goal else gp_srtt */
static uint32_t rack_probertt_use_min_rtt_exit = 0;
static uint32_t rack_probe_rtt_sets_cwnd = 0;
static uint32_t rack_probe_rtt_safety_val = 2000000;	/* No more than 2 sec in probe-rtt */
static uint32_t rack_time_between_probertt = 9600000;	/* 9.6 sec in usecs */
static uint32_t rack_probertt_gpsrtt_cnt_mul = 0;	/* How many srtt periods does probe-rtt last top fraction */
static uint32_t rack_probertt_gpsrtt_cnt_div = 0;	/* How many srtt periods does probe-rtt last bottom fraction */
static uint32_t rack_min_probertt_hold = 40000;		/* Equal to delayed ack time */
static uint32_t rack_probertt_filter_life = 10000000;
static uint32_t rack_probertt_lower_within = 10;
static uint32_t rack_min_rtt_movement = 250000;	/* Must move at least 250ms (in microseconds) to count as a lowering */
static int32_t rack_pace_one_seg = 0;		/* Shall we pace for less than 1.4Meg 1MSS at a time */
static int32_t rack_probertt_clear_is = 1;
static int32_t rack_max_drain_hbp = 1;		/* Extra drain times gpsrtt for highly buffered paths */
static int32_t rack_hbp_thresh = 3;		/* what is the divisor max_rtt/min_rtt to decide a hbp */

/* Part of pacing */
static int32_t rack_max_per_above = 30;		/* When we go to increment stop if above 100+this% */

/* Timely information:
 *
 * Here we have various control parameters on how
 * timely may change the multiplier. rack_gain_p5_ub
 * is associated with timely but not directly influencing
 * the rate decision like the other variables. It controls
 * the way fill-cw interacts with timely and caps how much
 * timely can boost the fill-cw b/w.
 *
 * The other values are various boost/shrink numbers as well
 * as potential caps when adjustments are made to the timely
 * gain (returned by rack_get_output_gain()). Remember too that
 * the gain returned can be overridden by other factors such as
 * probeRTT as well as fixed-rate-pacing.
 */
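/*
 * Minimal sketch of the percentage-gain idea described above (editor's
 * illustration only; the helper name and its use here are hypothetical and
 * not this stack's actual code).  A gain of 250 means "pace at 250% of the
 * goodput estimate", bounded below by rack_per_lower_bound and, when
 * non-zero, above by the rack_per_upper_bound_* values declared just below.
 */
#if 0	/* example sketch, not compiled */
static uint64_t
example_apply_gain(uint64_t gp_bw_est, uint16_t gain_per)
{
	/* e.g. 10 Mbps goodput with gain_per = 250 -> 25 Mbps pacing rate */
	return ((gp_bw_est * (uint64_t)gain_per) / 100);
}
#endif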
static int32_t rack_gain_p5_ub = 250;
static int32_t rack_gp_per_bw_mul_up = 2;	/* 2% */
static int32_t rack_gp_per_bw_mul_down = 4;	/* 4% */
static int32_t rack_gp_rtt_maxmul = 3;		/* 3 x maxmin */
static int32_t rack_gp_rtt_minmul = 1;		/* minrtt + (minrtt/mindiv) is lower rtt */
static int32_t rack_gp_rtt_mindiv = 4;		/* minrtt + (minrtt * minmul/mindiv) is lower rtt */
static int32_t rack_gp_decrease_per = 80;	/* Beta value of timely decrease (.8) = 80 */
static int32_t rack_gp_increase_per = 2;	/* 2% increase in multiplier */
static int32_t rack_per_lower_bound = 50;	/* Don't allow to drop below this multiplier */
static int32_t rack_per_upper_bound_ss = 0;	/* Don't allow SS to grow above this */
static int32_t rack_per_upper_bound_ca = 0;	/* Don't allow CA to grow above this */
static int32_t rack_do_dyn_mul = 0;		/* Are the rack gp multipliers dynamic */
static int32_t rack_gp_no_rec_chg = 1;		/* Prohibit recovery from reducing its multiplier */
static int32_t rack_timely_dec_clear = 6;	/* Do we clear decrement count at a value (6)? */
static int32_t rack_timely_max_push_rise = 3;	/* One round of pushing */
static int32_t rack_timely_max_push_drop = 3;	/* Three rounds of pushing */
static int32_t rack_timely_min_segs = 4;	/* 4 segment minimum */
static int32_t rack_timely_no_stopping = 0;
static int32_t rack_down_raise_thresh = 100;
static int32_t rack_req_segs = 1;
static uint64_t rack_bw_rate_cap = 0;
static uint64_t rack_fillcw_bw_cap = 3750000;	/* Cap fillcw at 30Mbps */


/* Rack specific counters */
counter_u64_t rack_saw_enobuf;
counter_u64_t rack_saw_enobuf_hw;
counter_u64_t rack_saw_enetunreach;
counter_u64_t rack_persists_sends;
counter_u64_t rack_persists_acks;
counter_u64_t rack_persists_loss;
counter_u64_t rack_persists_lost_ends;
counter_u64_t rack_total_bytes;
#ifdef INVARIANTS
counter_u64_t rack_adjust_map_bw;
#endif
/* Tail loss probe counters */
counter_u64_t rack_tlp_tot;
counter_u64_t rack_tlp_newdata;
counter_u64_t rack_tlp_retran;
counter_u64_t rack_tlp_retran_bytes;
counter_u64_t rack_to_tot;
counter_u64_t rack_hot_alloc;
counter_u64_t rack_to_alloc;
counter_u64_t rack_to_alloc_hard;
counter_u64_t rack_to_alloc_emerg;
counter_u64_t rack_to_alloc_limited;
counter_u64_t rack_alloc_limited_conns;
counter_u64_t rack_split_limited;
counter_u64_t rack_rxt_clamps_cwnd;
counter_u64_t rack_rxt_clamps_cwnd_uniq;

counter_u64_t rack_multi_single_eq;
counter_u64_t rack_proc_non_comp_ack;

counter_u64_t rack_fto_send;
counter_u64_t rack_fto_rsm_send;
counter_u64_t rack_nfto_resend;
counter_u64_t rack_non_fto_send;
counter_u64_t rack_extended_rfo;

counter_u64_t rack_sack_proc_all;
counter_u64_t rack_sack_proc_short;
counter_u64_t rack_sack_proc_restart;
counter_u64_t rack_sack_attacks_detected;
counter_u64_t rack_sack_attacks_reversed;
counter_u64_t rack_sack_attacks_suspect;
counter_u64_t rack_sack_used_next_merge;
counter_u64_t rack_sack_splits;
counter_u64_t rack_sack_used_prev_merge;
counter_u64_t rack_sack_skipped_acked;
counter_u64_t rack_ack_total;
counter_u64_t rack_express_sack;
counter_u64_t rack_sack_total;
counter_u64_t rack_move_none;
counter_u64_t rack_move_some;

counter_u64_t rack_input_idle_reduces;
counter_u64_t rack_collapsed_win;
counter_u64_t rack_collapsed_win_seen;
counter_u64_t rack_collapsed_win_rxt;
counter_u64_t rack_collapsed_win_rxt_bytes;
counter_u64_t rack_try_scwnd;
counter_u64_t rack_hw_pace_init_fail;
counter_u64_t rack_hw_pace_lost;

counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];


#define RACK_REXMTVAL(tp) max(rack_rto_min, ((tp)->t_srtt + ((tp)->t_rttvar << 2)))

#define RACK_TCPT_RANGESET(tv, value, tvmin, tvmax, slop) do {	\
	(tv) = (value) + slop;					\
	if ((u_long)(tv) < (u_long)(tvmin))			\
		(tv) = (tvmin);					\
	if ((u_long)(tv) > (u_long)(tvmax))			\
		(tv) = (tvmax);					\
} while (0)

static void
rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line);

static int
rack_process_ack(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to,
    uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val, int32_t orig_tlen);
static int
rack_process_data(struct mbuf *m, struct tcphdr *th,
447 struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 448 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 449 static void 450 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, 451 uint32_t th_ack, uint16_t nsegs, uint16_t type, int32_t recovery); 452 static struct rack_sendmap *rack_alloc(struct tcp_rack *rack); 453 static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack, 454 uint8_t limit_type); 455 static struct rack_sendmap * 456 rack_check_recovery_mode(struct tcpcb *tp, 457 uint32_t tsused); 458 static uint32_t 459 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack); 460 static void 461 rack_cong_signal(struct tcpcb *tp, 462 uint32_t type, uint32_t ack, int ); 463 static void rack_counter_destroy(void); 464 static int 465 rack_ctloutput(struct tcpcb *tp, struct sockopt *sopt); 466 static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how); 467 static void 468 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override); 469 static void 470 rack_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, 471 int32_t drop_hdrlen, int32_t tlen, uint8_t iptos); 472 static void rack_dtor(void *mem, int32_t size, void *arg); 473 static void 474 rack_log_alt_to_to_cancel(struct tcp_rack *rack, 475 uint32_t flex1, uint32_t flex2, 476 uint32_t flex3, uint32_t flex4, 477 uint32_t flex5, uint32_t flex6, 478 uint16_t flex7, uint8_t mod); 479 480 static void 481 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, 482 uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line, 483 struct rack_sendmap *rsm, uint8_t quality); 484 static struct rack_sendmap * 485 rack_find_high_nonack(struct tcp_rack *rack, 486 struct rack_sendmap *rsm); 487 static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack); 488 static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm); 489 static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged); 490 static int rack_get_sockopt(struct tcpcb *tp, struct sockopt *sopt); 491 static void 492 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, 493 tcp_seq th_ack, int line, uint8_t quality); 494 static void 495 rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint8_t frm); 496 497 static uint32_t 498 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss); 499 static int32_t rack_handoff_ok(struct tcpcb *tp); 500 static int32_t rack_init(struct tcpcb *tp, void **ptr); 501 static void rack_init_sysctls(void); 502 503 static void 504 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, 505 struct tcphdr *th, int entered_rec, int dup_ack_struck, 506 int *dsack_seen, int *sacks_seen); 507 static void 508 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 509 uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t ts, 510 struct rack_sendmap *hintrsm, uint32_t add_flags, struct mbuf *s_mb, uint32_t s_moff, int hw_tls, int segsiz); 511 512 static uint64_t rack_get_gp_est(struct tcp_rack *rack); 513 514 515 static void 516 rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack, 517 struct rack_sendmap *rsm, uint32_t cts); 518 static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm); 519 static int32_t rack_output(struct tcpcb *tp); 520 521 static uint32_t 522 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, 523 struct sackblk *sack, struct tcpopt 
*to, struct rack_sendmap **prsm, 524 uint32_t cts, uint32_t segsiz); 525 static void rack_post_recovery(struct tcpcb *tp, uint32_t th_seq); 526 static void rack_remxt_tmr(struct tcpcb *tp); 527 static int rack_set_sockopt(struct tcpcb *tp, struct sockopt *sopt); 528 static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack); 529 static int32_t rack_stopall(struct tcpcb *tp); 530 static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line); 531 static uint32_t 532 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 533 struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint32_t add_flag, int segsiz); 534 static void 535 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, 536 struct rack_sendmap *rsm, uint64_t ts, uint32_t add_flag, int segsiz); 537 static int 538 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 539 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack); 540 static int32_t tcp_addrack(module_t mod, int32_t type, void *data); 541 static int 542 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, 543 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 544 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 545 546 static int 547 rack_do_closing(struct mbuf *m, struct tcphdr *th, 548 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 549 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 550 static int 551 rack_do_established(struct mbuf *m, struct tcphdr *th, 552 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 553 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 554 static int 555 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, 556 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 557 int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos); 558 static int 559 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, 560 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 561 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 562 static int 563 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, 564 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 565 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 566 static int 567 rack_do_lastack(struct mbuf *m, struct tcphdr *th, 568 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 569 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 570 static int 571 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, 572 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 573 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 574 static int 575 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, 576 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 577 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 578 static void rack_chk_req_and_hybrid_on_out(struct tcp_rack *rack, tcp_seq seq, uint32_t len, uint64_t cts); 579 struct rack_sendmap * 580 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, 581 uint32_t tsused); 582 static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, 583 uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, 
    uint16_t rtrcnt);
static void
tcp_rack_partialack(struct tcpcb *tp);
static int
rack_set_profile(struct tcp_rack *rack, int prof);
static void
rack_apply_deferred_options(struct tcp_rack *rack);

int32_t rack_clear_counter=0;

static uint64_t
rack_get_lt_bw(struct tcp_rack *rack)
{
	struct timeval tv;
	uint64_t tim, bytes;

	tim = rack->r_ctl.lt_bw_time;
	bytes = rack->r_ctl.lt_bw_bytes;
	if (rack->lt_bw_up) {
		/* Include all the current bytes too */
		microuptime(&tv);
		bytes += (rack->rc_tp->snd_una - rack->r_ctl.lt_seq);
		tim += (tcp_tv_to_lusec(&tv) - rack->r_ctl.lt_timemark);
	}
	if ((bytes != 0) && (tim != 0))
		return ((bytes * (uint64_t)1000000) / tim);
	else
		return (0);
}
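/*
 * Worked example for rack_get_lt_bw() above (editor's illustration; the
 * numbers are made up): with lt_bw_bytes = 1,500,000 bytes accumulated over
 * lt_bw_time = 1,200,000 usec, plus a live interval of 300,000 usec since
 * lt_timemark in which another 500,000 bytes past lt_seq were cumulatively
 * acked, the long-term bandwidth estimate is
 *
 *	(1,500,000 + 500,000) * 1,000,000 / (1,200,000 + 300,000)
 *	    = 2,000,000,000,000 / 1,500,000 ~= 1,333,333 bytes/sec,
 *
 * i.e. bytes * 1e6 / usec yields bytes per second.
 */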
static void
rack_swap_beta_values(struct tcp_rack *rack, uint8_t flex8)
{
	struct sockopt sopt;
	struct cc_newreno_opts opt;
	struct tcpcb *tp;
	uint32_t old_beta;
	uint32_t old_beta_ecn;
	int error = 0, failed = 0;

	tp = rack->rc_tp;
	if (tp->t_cc == NULL) {
		/* Tcb is leaving */
		return;
	}
	rack->rc_pacing_cc_set = 1;
	if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) {
		/* Not new-reno we can't play games with beta! */
		failed = 1;
		goto out;

	}
	if (CC_ALGO(tp)->ctl_output == NULL) {
		/* Huh, new-reno but no ctl_output? Then no swaps. */
		failed = 2;
		goto out;
	}
	/* Get the current values out */
	sopt.sopt_valsize = sizeof(struct cc_newreno_opts);
	sopt.sopt_dir = SOPT_GET;
	opt.name = CC_NEWRENO_BETA;
	error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt);
	if (error) {
		failed = 3;
		goto out;
	}
	old_beta = opt.val;
	opt.name = CC_NEWRENO_BETA_ECN;
	error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt);
	if (error) {
		failed = 4;
		goto out;
	}
	old_beta_ecn = opt.val;

	/* Now let's set in the values we have stored */
	sopt.sopt_dir = SOPT_SET;
	opt.name = CC_NEWRENO_BETA;
	opt.val = rack->r_ctl.rc_saved_beta;
	error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt);
	if (error) {
		failed = 5;
		goto out;
	}
	opt.name = CC_NEWRENO_BETA_ECN;
	opt.val = rack->r_ctl.rc_saved_beta_ecn;
	error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt);
	if (error) {
		failed = 6;
		goto out;
	}
	/* Save off the values for restoral */
	rack->r_ctl.rc_saved_beta = old_beta;
	rack->r_ctl.rc_saved_beta_ecn = old_beta_ecn;
out:
	if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
		union tcp_log_stackspecific log;
		struct timeval tv;
		struct newreno *ptr;

		ptr = ((struct newreno *)tp->t_ccv.cc_data);
		memset(&log, 0, sizeof(log));
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		log.u_bbr.flex1 = ptr->beta;
		log.u_bbr.flex2 = ptr->beta_ecn;
		log.u_bbr.flex3 = ptr->newreno_flags;
		log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta;
		log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta_ecn;
		log.u_bbr.flex6 = failed;
		log.u_bbr.flex7 = rack->gp_ready;
		log.u_bbr.flex7 <<= 1;
		log.u_bbr.flex7 |= rack->use_fixed_rate;
		log.u_bbr.flex7 <<= 1;
		log.u_bbr.flex7 |= rack->rc_pacing_cc_set;
		log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
		log.u_bbr.flex8 = flex8;
		tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, error,
			      0, &log, false, NULL, NULL, 0, &tv);
	}
}

static void
rack_set_cc_pacing(struct tcp_rack *rack)
{
	if (rack->rc_pacing_cc_set)
		return;
	/*
	 * Use the swap utility, placing 3 in flex8 to identify the
	 * setting of a new set of values.
	 */
	rack->rc_pacing_cc_set = 1;
	rack_swap_beta_values(rack, 3);
}

static void
rack_undo_cc_pacing(struct tcp_rack *rack)
{
	if (rack->rc_pacing_cc_set == 0)
		return;
	/*
	 * Use the swap utility, placing 4 in flex8 to identify a
	 * restoral of the old values.
	 */
	rack->rc_pacing_cc_set = 0;
	rack_swap_beta_values(rack, 4);
}

static void
rack_remove_pacing(struct tcp_rack *rack)
{
	if (rack->rc_pacing_cc_set)
		rack_undo_cc_pacing(rack);
	if (rack->r_ctl.pacing_method & RACK_REG_PACING)
		tcp_decrement_paced_conn();
	if (rack->r_ctl.pacing_method & RACK_DGP_PACING)
		tcp_dec_dgp_pacing_cnt();
	rack->rc_always_pace = 0;
	rack->r_ctl.pacing_method = RACK_PACING_NONE;
	rack->dgp_on = 0;
	rack->rc_hybrid_mode = 0;
	rack->use_fixed_rate = 0;
}

static void
rack_log_gpset(struct tcp_rack *rack, uint32_t seq_end, uint32_t ack_end_t,
	       uint32_t send_end_t, int line, uint8_t mode, struct rack_sendmap *rsm)
{
	if (tcp_bblogging_on(rack->rc_tp) && (rack_verbose_logging != 0)) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		memset(&log, 0, sizeof(log));
		log.u_bbr.flex1 = seq_end;
		log.u_bbr.flex2 = rack->rc_tp->gput_seq;
		log.u_bbr.flex3 = ack_end_t;
		log.u_bbr.flex4 = rack->rc_tp->gput_ts;
		log.u_bbr.flex5 = send_end_t;
		log.u_bbr.flex6 = rack->rc_tp->gput_ack;
		log.u_bbr.flex7 = mode;
		log.u_bbr.flex8 = 69;
		log.u_bbr.rttProp = rack->r_ctl.rc_gp_cumack_ts;
		log.u_bbr.delRate = rack->r_ctl.rc_gp_output_ts;
		log.u_bbr.pkts_out = line;
		log.u_bbr.cwnd_gain = rack->app_limited_needs_set;
		log.u_bbr.pkt_epoch = rack->r_ctl.rc_app_limited_cnt;
		log.u_bbr.epoch = rack->r_ctl.current_round;
		log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost;
		if (rsm != NULL) {
			log.u_bbr.applimited = rsm->r_start;
			log.u_bbr.delivered = rsm->r_end;
			log.u_bbr.epoch = rsm->r_flags;
		}
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		TCP_LOG_EVENTP(rack->rc_tp, NULL,
		    &rack->rc_inp->inp_socket->so_rcv,
		    &rack->rc_inp->inp_socket->so_snd,
		    BBR_LOG_HPTSI_CALC, 0,
		    0, &log, false, &tv);
	}
}

static int
sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
{
	uint32_t stat;
	int32_t error;

	error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
	if (error || req->newptr == NULL)
		return error;

	error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
	if (error)
		return (error);
	if (stat == 1) {
#ifdef INVARIANTS
		printf("Clearing RACK counters\n");
#endif
		counter_u64_zero(rack_tlp_tot);
		counter_u64_zero(rack_tlp_newdata);
		counter_u64_zero(rack_tlp_retran);
		counter_u64_zero(rack_tlp_retran_bytes);
		counter_u64_zero(rack_to_tot);
		counter_u64_zero(rack_saw_enobuf);
		counter_u64_zero(rack_saw_enobuf_hw);
		counter_u64_zero(rack_saw_enetunreach);
		counter_u64_zero(rack_persists_sends);
		counter_u64_zero(rack_total_bytes);
		counter_u64_zero(rack_persists_acks);
		counter_u64_zero(rack_persists_loss);
		counter_u64_zero(rack_persists_lost_ends);
#ifdef INVARIANTS
		counter_u64_zero(rack_adjust_map_bw);
#endif
		counter_u64_zero(rack_to_alloc_hard);
		counter_u64_zero(rack_to_alloc_emerg);
		counter_u64_zero(rack_sack_proc_all);
		counter_u64_zero(rack_fto_send);
		counter_u64_zero(rack_fto_rsm_send);
counter_u64_zero(rack_extended_rfo); 823 counter_u64_zero(rack_hw_pace_init_fail); 824 counter_u64_zero(rack_hw_pace_lost); 825 counter_u64_zero(rack_non_fto_send); 826 counter_u64_zero(rack_nfto_resend); 827 counter_u64_zero(rack_sack_proc_short); 828 counter_u64_zero(rack_sack_proc_restart); 829 counter_u64_zero(rack_to_alloc); 830 counter_u64_zero(rack_to_alloc_limited); 831 counter_u64_zero(rack_alloc_limited_conns); 832 counter_u64_zero(rack_split_limited); 833 counter_u64_zero(rack_rxt_clamps_cwnd); 834 counter_u64_zero(rack_rxt_clamps_cwnd_uniq); 835 counter_u64_zero(rack_multi_single_eq); 836 counter_u64_zero(rack_proc_non_comp_ack); 837 counter_u64_zero(rack_sack_attacks_detected); 838 counter_u64_zero(rack_sack_attacks_reversed); 839 counter_u64_zero(rack_sack_attacks_suspect); 840 counter_u64_zero(rack_sack_used_next_merge); 841 counter_u64_zero(rack_sack_used_prev_merge); 842 counter_u64_zero(rack_sack_splits); 843 counter_u64_zero(rack_sack_skipped_acked); 844 counter_u64_zero(rack_ack_total); 845 counter_u64_zero(rack_express_sack); 846 counter_u64_zero(rack_sack_total); 847 counter_u64_zero(rack_move_none); 848 counter_u64_zero(rack_move_some); 849 counter_u64_zero(rack_try_scwnd); 850 counter_u64_zero(rack_collapsed_win); 851 counter_u64_zero(rack_collapsed_win_rxt); 852 counter_u64_zero(rack_collapsed_win_seen); 853 counter_u64_zero(rack_collapsed_win_rxt_bytes); 854 } else if (stat == 2) { 855 #ifdef INVARIANTS 856 printf("Clearing RACK option array\n"); 857 #endif 858 COUNTER_ARRAY_ZERO(rack_opts_arry, RACK_OPTS_SIZE); 859 } else if (stat == 3) { 860 printf("Rack has no stats counters to clear (use 1 to clear all stats in sysctl node)\n"); 861 } else if (stat == 4) { 862 #ifdef INVARIANTS 863 printf("Clearing RACK out size array\n"); 864 #endif 865 COUNTER_ARRAY_ZERO(rack_out_size, TCP_MSS_ACCT_SIZE); 866 } 867 rack_clear_counter = 0; 868 return (0); 869 } 870 871 static void 872 rack_init_sysctls(void) 873 { 874 struct sysctl_oid *rack_counters; 875 struct sysctl_oid *rack_attack; 876 struct sysctl_oid *rack_pacing; 877 struct sysctl_oid *rack_timely; 878 struct sysctl_oid *rack_timers; 879 struct sysctl_oid *rack_tlp; 880 struct sysctl_oid *rack_misc; 881 struct sysctl_oid *rack_features; 882 struct sysctl_oid *rack_measure; 883 struct sysctl_oid *rack_probertt; 884 struct sysctl_oid *rack_hw_pacing; 885 886 rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 887 SYSCTL_CHILDREN(rack_sysctl_root), 888 OID_AUTO, 889 "sack_attack", 890 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 891 "Rack Sack Attack Counters and Controls"); 892 rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 893 SYSCTL_CHILDREN(rack_sysctl_root), 894 OID_AUTO, 895 "stats", 896 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 897 "Rack Counters"); 898 SYSCTL_ADD_S32(&rack_sysctl_ctx, 899 SYSCTL_CHILDREN(rack_sysctl_root), 900 OID_AUTO, "rate_sample_method", CTLFLAG_RW, 901 &rack_rate_sample_method , USE_RTT_LOW, 902 "What method should we use for rate sampling 0=high, 1=low "); 903 /* Probe rtt related controls */ 904 rack_probertt = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 905 SYSCTL_CHILDREN(rack_sysctl_root), 906 OID_AUTO, 907 "probertt", 908 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 909 "ProbeRTT related Controls"); 910 SYSCTL_ADD_U16(&rack_sysctl_ctx, 911 SYSCTL_CHILDREN(rack_probertt), 912 OID_AUTO, "exit_per_hpb", CTLFLAG_RW, 913 &rack_atexit_prtt_hbp, 130, 914 "What percentage above goodput do we clamp CA/SS to at exit on high-BDP path 110%"); 915 SYSCTL_ADD_U16(&rack_sysctl_ctx, 916 SYSCTL_CHILDREN(rack_probertt), 917 OID_AUTO, 
"exit_per_nonhpb", CTLFLAG_RW, 918 &rack_atexit_prtt, 130, 919 "What percentage above goodput do we clamp CA/SS to at exit on a non high-BDP path 100%"); 920 SYSCTL_ADD_U16(&rack_sysctl_ctx, 921 SYSCTL_CHILDREN(rack_probertt), 922 OID_AUTO, "gp_per_mul", CTLFLAG_RW, 923 &rack_per_of_gp_probertt, 60, 924 "What percentage of goodput do we pace at in probertt"); 925 SYSCTL_ADD_U16(&rack_sysctl_ctx, 926 SYSCTL_CHILDREN(rack_probertt), 927 OID_AUTO, "gp_per_reduce", CTLFLAG_RW, 928 &rack_per_of_gp_probertt_reduce, 10, 929 "What percentage of goodput do we reduce every gp_srtt"); 930 SYSCTL_ADD_U16(&rack_sysctl_ctx, 931 SYSCTL_CHILDREN(rack_probertt), 932 OID_AUTO, "gp_per_low", CTLFLAG_RW, 933 &rack_per_of_gp_lowthresh, 40, 934 "What percentage of goodput do we allow the multiplier to fall to"); 935 SYSCTL_ADD_U32(&rack_sysctl_ctx, 936 SYSCTL_CHILDREN(rack_probertt), 937 OID_AUTO, "time_between", CTLFLAG_RW, 938 &rack_time_between_probertt, 96000000, 939 "How many useconds between the lowest rtt falling must past before we enter probertt"); 940 SYSCTL_ADD_U32(&rack_sysctl_ctx, 941 SYSCTL_CHILDREN(rack_probertt), 942 OID_AUTO, "safety", CTLFLAG_RW, 943 &rack_probe_rtt_safety_val, 2000000, 944 "If not zero, provides a maximum usecond that you can stay in probertt (2sec = 2000000)"); 945 SYSCTL_ADD_U32(&rack_sysctl_ctx, 946 SYSCTL_CHILDREN(rack_probertt), 947 OID_AUTO, "sets_cwnd", CTLFLAG_RW, 948 &rack_probe_rtt_sets_cwnd, 0, 949 "Do we set the cwnd too (if always_lower is on)"); 950 SYSCTL_ADD_U32(&rack_sysctl_ctx, 951 SYSCTL_CHILDREN(rack_probertt), 952 OID_AUTO, "maxdrainsrtts", CTLFLAG_RW, 953 &rack_max_drain_wait, 2, 954 "Maximum number of gp_srtt's to hold in drain waiting for flight to reach goal"); 955 SYSCTL_ADD_U32(&rack_sysctl_ctx, 956 SYSCTL_CHILDREN(rack_probertt), 957 OID_AUTO, "mustdrainsrtts", CTLFLAG_RW, 958 &rack_must_drain, 1, 959 "We must drain this many gp_srtt's waiting for flight to reach goal"); 960 SYSCTL_ADD_U32(&rack_sysctl_ctx, 961 SYSCTL_CHILDREN(rack_probertt), 962 OID_AUTO, "goal_use_min_entry", CTLFLAG_RW, 963 &rack_probertt_use_min_rtt_entry, 1, 964 "Should we use the min-rtt to calculate the goal rtt (else gp_srtt) at entry"); 965 SYSCTL_ADD_U32(&rack_sysctl_ctx, 966 SYSCTL_CHILDREN(rack_probertt), 967 OID_AUTO, "goal_use_min_exit", CTLFLAG_RW, 968 &rack_probertt_use_min_rtt_exit, 0, 969 "How to set cwnd at exit, 0 - dynamic, 1 - use min-rtt, 2 - use curgprtt, 3 - entry gp-rtt"); 970 SYSCTL_ADD_U32(&rack_sysctl_ctx, 971 SYSCTL_CHILDREN(rack_probertt), 972 OID_AUTO, "length_div", CTLFLAG_RW, 973 &rack_probertt_gpsrtt_cnt_div, 0, 974 "How many recent goodput srtt periods plus hold tim does probertt last (bottom of fraction)"); 975 SYSCTL_ADD_U32(&rack_sysctl_ctx, 976 SYSCTL_CHILDREN(rack_probertt), 977 OID_AUTO, "length_mul", CTLFLAG_RW, 978 &rack_probertt_gpsrtt_cnt_mul, 0, 979 "How many recent goodput srtt periods plus hold tim does probertt last (top of fraction)"); 980 SYSCTL_ADD_U32(&rack_sysctl_ctx, 981 SYSCTL_CHILDREN(rack_probertt), 982 OID_AUTO, "holdtim_at_target", CTLFLAG_RW, 983 &rack_min_probertt_hold, 200000, 984 "What is the minimum time we hold probertt at target"); 985 SYSCTL_ADD_U32(&rack_sysctl_ctx, 986 SYSCTL_CHILDREN(rack_probertt), 987 OID_AUTO, "filter_life", CTLFLAG_RW, 988 &rack_probertt_filter_life, 10000000, 989 "What is the time for the filters life in useconds"); 990 SYSCTL_ADD_U32(&rack_sysctl_ctx, 991 SYSCTL_CHILDREN(rack_probertt), 992 OID_AUTO, "lower_within", CTLFLAG_RW, 993 &rack_probertt_lower_within, 10, 994 "If the rtt goes 
lower within this percentage of the time, go into probe-rtt"); 995 SYSCTL_ADD_U32(&rack_sysctl_ctx, 996 SYSCTL_CHILDREN(rack_probertt), 997 OID_AUTO, "must_move", CTLFLAG_RW, 998 &rack_min_rtt_movement, 250, 999 "How much is the minimum movement in rtt to count as a drop for probertt purposes"); 1000 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1001 SYSCTL_CHILDREN(rack_probertt), 1002 OID_AUTO, "clear_is_cnts", CTLFLAG_RW, 1003 &rack_probertt_clear_is, 1, 1004 "Do we clear I/S counts on exiting probe-rtt"); 1005 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1006 SYSCTL_CHILDREN(rack_probertt), 1007 OID_AUTO, "hbp_extra_drain", CTLFLAG_RW, 1008 &rack_max_drain_hbp, 1, 1009 "How many extra drain gpsrtt's do we get in highly buffered paths"); 1010 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1011 SYSCTL_CHILDREN(rack_probertt), 1012 OID_AUTO, "hbp_threshold", CTLFLAG_RW, 1013 &rack_hbp_thresh, 3, 1014 "We are highly buffered if min_rtt_seen / max_rtt_seen > this-threshold"); 1015 /* Pacing related sysctls */ 1016 rack_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1017 SYSCTL_CHILDREN(rack_sysctl_root), 1018 OID_AUTO, 1019 "pacing", 1020 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1021 "Pacing related Controls"); 1022 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1023 SYSCTL_CHILDREN(rack_pacing), 1024 OID_AUTO, "pcm_enabled", CTLFLAG_RW, 1025 &rack_pcm_is_enabled, 1, 1026 "Do we by default do PCM measurements?"); 1027 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1028 SYSCTL_CHILDREN(rack_pacing), 1029 OID_AUTO, "pcm_rnds", CTLFLAG_RW, 1030 &rack_pcm_every_n_rounds, 100, 1031 "How many rounds before we need to do a PCM measurement"); 1032 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1033 SYSCTL_CHILDREN(rack_pacing), 1034 OID_AUTO, "pcm_blast", CTLFLAG_RW, 1035 &rack_pcm_blast, 0, 1036 "Blast out the full cwnd/rwnd when doing a PCM measurement"); 1037 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1038 SYSCTL_CHILDREN(rack_pacing), 1039 OID_AUTO, "rnd_gp_gain", CTLFLAG_RW, 1040 &rack_gp_gain_req, 1200, 1041 "How much do we have to increase the GP to record the round 1200 = 120.0"); 1042 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1043 SYSCTL_CHILDREN(rack_pacing), 1044 OID_AUTO, "dgp_out_of_ss_at", CTLFLAG_RW, 1045 &rack_rnd_cnt_req, 0x10005, 1046 "How many rounds less than rnd_gp_gain will drop us out of SS"); 1047 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1048 SYSCTL_CHILDREN(rack_pacing), 1049 OID_AUTO, "no_timely", CTLFLAG_RW, 1050 &rack_timely_off, 0, 1051 "Do we not use timely in DGP?"); 1052 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1053 SYSCTL_CHILDREN(rack_pacing), 1054 OID_AUTO, "fillcw", CTLFLAG_RW, 1055 &rack_fill_cw_state, 0, 1056 "Enable fillcw on new connections (default=0 off)?"); 1057 SYSCTL_ADD_U16(&rack_sysctl_ctx, 1058 SYSCTL_CHILDREN(rack_pacing), 1059 OID_AUTO, "min_burst", CTLFLAG_RW, 1060 &rack_pacing_min_seg, 0, 1061 "What is the min burst size for pacing (0 disables)?"); 1062 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1063 SYSCTL_CHILDREN(rack_pacing), 1064 OID_AUTO, "divisor", CTLFLAG_RW, 1065 &rack_default_pacing_divisor, 250, 1066 "What is the default divisor given to the rl code?"); 1067 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1068 SYSCTL_CHILDREN(rack_pacing), 1069 OID_AUTO, "fillcw_max_mult", CTLFLAG_RW, 1070 &rack_bw_multipler, 0, 1071 "What is the limit multiplier of the current gp_est that fillcw can increase the b/w too, 200 == 200% (0 = off)?"); 1072 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1073 SYSCTL_CHILDREN(rack_pacing), 1074 OID_AUTO, "max_pace_over", CTLFLAG_RW, 1075 &rack_max_per_above, 30, 1076 "What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)"); 1077 
SYSCTL_ADD_S32(&rack_sysctl_ctx, 1078 SYSCTL_CHILDREN(rack_pacing), 1079 OID_AUTO, "allow1mss", CTLFLAG_RW, 1080 &rack_pace_one_seg, 0, 1081 "Do we allow low b/w pacing of 1MSS instead of two (1.2Meg and less)?"); 1082 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1083 SYSCTL_CHILDREN(rack_pacing), 1084 OID_AUTO, "limit_wsrtt", CTLFLAG_RW, 1085 &rack_limit_time_with_srtt, 0, 1086 "Do we limit pacing time based on srtt"); 1087 SYSCTL_ADD_U16(&rack_sysctl_ctx, 1088 SYSCTL_CHILDREN(rack_pacing), 1089 OID_AUTO, "gp_per_ss", CTLFLAG_RW, 1090 &rack_per_of_gp_ss, 250, 1091 "If non zero, what percentage of goodput to pace at in slow start"); 1092 SYSCTL_ADD_U16(&rack_sysctl_ctx, 1093 SYSCTL_CHILDREN(rack_pacing), 1094 OID_AUTO, "gp_per_ca", CTLFLAG_RW, 1095 &rack_per_of_gp_ca, 150, 1096 "If non zero, what percentage of goodput to pace at in congestion avoidance"); 1097 SYSCTL_ADD_U16(&rack_sysctl_ctx, 1098 SYSCTL_CHILDREN(rack_pacing), 1099 OID_AUTO, "gp_per_rec", CTLFLAG_RW, 1100 &rack_per_of_gp_rec, 200, 1101 "If non zero, what percentage of goodput to pace at in recovery"); 1102 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1103 SYSCTL_CHILDREN(rack_pacing), 1104 OID_AUTO, "pace_max_seg", CTLFLAG_RW, 1105 &rack_hptsi_segments, 40, 1106 "What size is the max for TSO segments in pacing and burst mitigation"); 1107 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1108 SYSCTL_CHILDREN(rack_pacing), 1109 OID_AUTO, "burst_reduces", CTLFLAG_RW, 1110 &rack_slot_reduction, 4, 1111 "When doing only burst mitigation what is the reduce divisor"); 1112 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1113 SYSCTL_CHILDREN(rack_sysctl_root), 1114 OID_AUTO, "use_pacing", CTLFLAG_RW, 1115 &rack_pace_every_seg, 0, 1116 "If set we use pacing, if clear we use only the original burst mitigation"); 1117 SYSCTL_ADD_U64(&rack_sysctl_ctx, 1118 SYSCTL_CHILDREN(rack_pacing), 1119 OID_AUTO, "rate_cap", CTLFLAG_RW, 1120 &rack_bw_rate_cap, 0, 1121 "If set we apply this value to the absolute rate cap used by pacing"); 1122 SYSCTL_ADD_U64(&rack_sysctl_ctx, 1123 SYSCTL_CHILDREN(rack_pacing), 1124 OID_AUTO, "fillcw_cap", CTLFLAG_RW, 1125 &rack_fillcw_bw_cap, 3750000, 1126 "Do we have an absolute cap on the amount of b/w fillcw can specify (0 = no)?"); 1127 SYSCTL_ADD_U8(&rack_sysctl_ctx, 1128 SYSCTL_CHILDREN(rack_sysctl_root), 1129 OID_AUTO, "req_measure_cnt", CTLFLAG_RW, 1130 &rack_req_measurements, 1, 1131 "If doing dynamic pacing, how many measurements must be in before we start pacing?"); 1132 /* Hardware pacing */ 1133 rack_hw_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1134 SYSCTL_CHILDREN(rack_sysctl_root), 1135 OID_AUTO, 1136 "hdwr_pacing", 1137 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1138 "Pacing related Controls"); 1139 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1140 SYSCTL_CHILDREN(rack_hw_pacing), 1141 OID_AUTO, "rwnd_factor", CTLFLAG_RW, 1142 &rack_hw_rwnd_factor, 2, 1143 "How many times does snd_wnd need to be bigger than pace_max_seg so we will hold off and get more acks?"); 1144 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1145 SYSCTL_CHILDREN(rack_hw_pacing), 1146 OID_AUTO, "precheck", CTLFLAG_RW, 1147 &rack_hw_check_queue, 0, 1148 "Do we always precheck the hdwr pacing queue to avoid ENOBUF's?"); 1149 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1150 SYSCTL_CHILDREN(rack_hw_pacing), 1151 OID_AUTO, "pace_enobuf_mult", CTLFLAG_RW, 1152 &rack_enobuf_hw_boost_mult, 0, 1153 "By how many time_betweens should we boost the pacing time if we see a ENOBUFS?"); 1154 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1155 SYSCTL_CHILDREN(rack_hw_pacing), 1156 OID_AUTO, "pace_enobuf_max", CTLFLAG_RW, 1157 &rack_enobuf_hw_max, 2, 1158 "What 
is the max boost the pacing time if we see a ENOBUFS?"); 1159 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1160 SYSCTL_CHILDREN(rack_hw_pacing), 1161 OID_AUTO, "pace_enobuf_min", CTLFLAG_RW, 1162 &rack_enobuf_hw_min, 2, 1163 "What is the min boost the pacing time if we see a ENOBUFS?"); 1164 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1165 SYSCTL_CHILDREN(rack_hw_pacing), 1166 OID_AUTO, "enable", CTLFLAG_RW, 1167 &rack_enable_hw_pacing, 0, 1168 "Should RACK attempt to use hw pacing?"); 1169 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1170 SYSCTL_CHILDREN(rack_hw_pacing), 1171 OID_AUTO, "rate_cap", CTLFLAG_RW, 1172 &rack_hw_rate_caps, 0, 1173 "Does the highest hardware pacing rate cap the rate we will send at??"); 1174 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1175 SYSCTL_CHILDREN(rack_hw_pacing), 1176 OID_AUTO, "uncap_per", CTLFLAG_RW, 1177 &rack_hw_rate_cap_per, 0, 1178 "If you go over b/w by this amount you will be uncapped (0 = never)"); 1179 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1180 SYSCTL_CHILDREN(rack_hw_pacing), 1181 OID_AUTO, "rate_min", CTLFLAG_RW, 1182 &rack_hw_rate_min, 0, 1183 "Do we need a minimum estimate of this many bytes per second in order to engage hw pacing?"); 1184 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1185 SYSCTL_CHILDREN(rack_hw_pacing), 1186 OID_AUTO, "rate_to_low", CTLFLAG_RW, 1187 &rack_hw_rate_to_low, 0, 1188 "If we fall below this rate, dis-engage hw pacing?"); 1189 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1190 SYSCTL_CHILDREN(rack_hw_pacing), 1191 OID_AUTO, "up_only", CTLFLAG_RW, 1192 &rack_hw_up_only, 0, 1193 "Do we allow hw pacing to lower the rate selected?"); 1194 rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1195 SYSCTL_CHILDREN(rack_sysctl_root), 1196 OID_AUTO, 1197 "timely", 1198 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1199 "Rack Timely RTT Controls"); 1200 /* Timely based GP dynmics */ 1201 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1202 SYSCTL_CHILDREN(rack_timely), 1203 OID_AUTO, "upper", CTLFLAG_RW, 1204 &rack_gp_per_bw_mul_up, 2, 1205 "Rack timely upper range for equal b/w (in percentage)"); 1206 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1207 SYSCTL_CHILDREN(rack_timely), 1208 OID_AUTO, "lower", CTLFLAG_RW, 1209 &rack_gp_per_bw_mul_down, 4, 1210 "Rack timely lower range for equal b/w (in percentage)"); 1211 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1212 SYSCTL_CHILDREN(rack_timely), 1213 OID_AUTO, "rtt_max_mul", CTLFLAG_RW, 1214 &rack_gp_rtt_maxmul, 3, 1215 "Rack timely multiplier of lowest rtt for rtt_max"); 1216 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1217 SYSCTL_CHILDREN(rack_timely), 1218 OID_AUTO, "rtt_min_div", CTLFLAG_RW, 1219 &rack_gp_rtt_mindiv, 4, 1220 "Rack timely divisor used for rtt + (rtt * mul/divisor) for check for lower rtt"); 1221 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1222 SYSCTL_CHILDREN(rack_timely), 1223 OID_AUTO, "rtt_min_mul", CTLFLAG_RW, 1224 &rack_gp_rtt_minmul, 1, 1225 "Rack timely multiplier used for rtt + (rtt * mul/divisor) for check for lower rtt"); 1226 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1227 SYSCTL_CHILDREN(rack_timely), 1228 OID_AUTO, "decrease", CTLFLAG_RW, 1229 &rack_gp_decrease_per, 80, 1230 "Rack timely Beta value 80 = .8 (scaled by 100)"); 1231 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1232 SYSCTL_CHILDREN(rack_timely), 1233 OID_AUTO, "increase", CTLFLAG_RW, 1234 &rack_gp_increase_per, 2, 1235 "Rack timely increase perentage of our GP multiplication factor"); 1236 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1237 SYSCTL_CHILDREN(rack_timely), 1238 OID_AUTO, "lowerbound", CTLFLAG_RW, 1239 &rack_per_lower_bound, 50, 1240 "Rack timely lowest percentage we allow GP multiplier to fall to"); 1241 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1242 
SYSCTL_CHILDREN(rack_timely), 1243 OID_AUTO, "p5_upper", CTLFLAG_RW, 1244 &rack_gain_p5_ub, 250, 1245 "Profile 5 upper bound to timely gain"); 1246 1247 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1248 SYSCTL_CHILDREN(rack_timely), 1249 OID_AUTO, "upperboundss", CTLFLAG_RW, 1250 &rack_per_upper_bound_ss, 0, 1251 "Rack timely highest percentage we allow GP multiplier in SS to raise to (0 is no upperbound)"); 1252 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1253 SYSCTL_CHILDREN(rack_timely), 1254 OID_AUTO, "upperboundca", CTLFLAG_RW, 1255 &rack_per_upper_bound_ca, 0, 1256 "Rack timely highest percentage we allow GP multiplier to CA raise to (0 is no upperbound)"); 1257 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1258 SYSCTL_CHILDREN(rack_timely), 1259 OID_AUTO, "dynamicgp", CTLFLAG_RW, 1260 &rack_do_dyn_mul, 0, 1261 "Rack timely do we enable dynmaic timely goodput by default"); 1262 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1263 SYSCTL_CHILDREN(rack_timely), 1264 OID_AUTO, "no_rec_red", CTLFLAG_RW, 1265 &rack_gp_no_rec_chg, 1, 1266 "Rack timely do we prohibit the recovery multiplier from being lowered"); 1267 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1268 SYSCTL_CHILDREN(rack_timely), 1269 OID_AUTO, "red_clear_cnt", CTLFLAG_RW, 1270 &rack_timely_dec_clear, 6, 1271 "Rack timely what threshold do we count to before another boost during b/w decent"); 1272 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1273 SYSCTL_CHILDREN(rack_timely), 1274 OID_AUTO, "max_push_rise", CTLFLAG_RW, 1275 &rack_timely_max_push_rise, 3, 1276 "Rack timely how many times do we push up with b/w increase"); 1277 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1278 SYSCTL_CHILDREN(rack_timely), 1279 OID_AUTO, "max_push_drop", CTLFLAG_RW, 1280 &rack_timely_max_push_drop, 3, 1281 "Rack timely how many times do we push back on b/w decent"); 1282 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1283 SYSCTL_CHILDREN(rack_timely), 1284 OID_AUTO, "min_segs", CTLFLAG_RW, 1285 &rack_timely_min_segs, 4, 1286 "Rack timely when setting the cwnd what is the min num segments"); 1287 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1288 SYSCTL_CHILDREN(rack_timely), 1289 OID_AUTO, "nonstop", CTLFLAG_RW, 1290 &rack_timely_no_stopping, 0, 1291 "Rack timely don't stop increase"); 1292 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1293 SYSCTL_CHILDREN(rack_timely), 1294 OID_AUTO, "dec_raise_thresh", CTLFLAG_RW, 1295 &rack_down_raise_thresh, 100, 1296 "If the CA or SS is below this threshold raise on the first 3 b/w lowers (0=always)"); 1297 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1298 SYSCTL_CHILDREN(rack_timely), 1299 OID_AUTO, "bottom_drag_segs", CTLFLAG_RW, 1300 &rack_req_segs, 1, 1301 "Bottom dragging if not these many segments outstanding and room"); 1302 1303 /* TLP and Rack related parameters */ 1304 rack_tlp = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1305 SYSCTL_CHILDREN(rack_sysctl_root), 1306 OID_AUTO, 1307 "tlp", 1308 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1309 "TLP and Rack related Controls"); 1310 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1311 SYSCTL_CHILDREN(rack_tlp), 1312 OID_AUTO, "use_rrr", CTLFLAG_RW, 1313 &use_rack_rr, 1, 1314 "Do we use Rack Rapid Recovery"); 1315 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1316 SYSCTL_CHILDREN(rack_tlp), 1317 OID_AUTO, "post_rec_labc", CTLFLAG_RW, 1318 &rack_max_abc_post_recovery, 2, 1319 "Since we do early recovery, do we override the l_abc to a value, if so what?"); 1320 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1321 SYSCTL_CHILDREN(rack_tlp), 1322 OID_AUTO, "nonrxt_use_cr", CTLFLAG_RW, 1323 &rack_non_rxt_use_cr, 0, 1324 "Do we use ss/ca rate if in recovery we are transmitting a new data chunk"); 1325 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1326 
SYSCTL_CHILDREN(rack_tlp), 1327 OID_AUTO, "tlpmethod", CTLFLAG_RW, 1328 &rack_tlp_threshold_use, TLP_USE_TWO_ONE, 1329 "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2"); 1330 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1331 SYSCTL_CHILDREN(rack_tlp), 1332 OID_AUTO, "limit", CTLFLAG_RW, 1333 &rack_tlp_limit, 2, 1334 "How many TLP's can be sent without sending new data"); 1335 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1336 SYSCTL_CHILDREN(rack_tlp), 1337 OID_AUTO, "use_greater", CTLFLAG_RW, 1338 &rack_tlp_use_greater, 1, 1339 "Should we use the rack_rtt time if its greater than srtt"); 1340 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1341 SYSCTL_CHILDREN(rack_tlp), 1342 OID_AUTO, "tlpminto", CTLFLAG_RW, 1343 &rack_tlp_min, 10000, 1344 "TLP minimum timeout per the specification (in microseconds)"); 1345 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1346 SYSCTL_CHILDREN(rack_tlp), 1347 OID_AUTO, "send_oldest", CTLFLAG_RW, 1348 &rack_always_send_oldest, 0, 1349 "Should we always send the oldest TLP and RACK-TLP"); 1350 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1351 SYSCTL_CHILDREN(rack_tlp), 1352 OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW, 1353 &rack_lower_cwnd_at_tlp, 0, 1354 "When a TLP completes a retran should we enter recovery"); 1355 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1356 SYSCTL_CHILDREN(rack_tlp), 1357 OID_AUTO, "reorder_thresh", CTLFLAG_RW, 1358 &rack_reorder_thresh, 2, 1359 "What factor for rack will be added when seeing reordering (shift right)"); 1360 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1361 SYSCTL_CHILDREN(rack_tlp), 1362 OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW, 1363 &rack_tlp_thresh, 1, 1364 "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)"); 1365 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1366 SYSCTL_CHILDREN(rack_tlp), 1367 OID_AUTO, "reorder_fade", CTLFLAG_RW, 1368 &rack_reorder_fade, 60000000, 1369 "Does reorder detection fade, if so how many microseconds (0 means never)"); 1370 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1371 SYSCTL_CHILDREN(rack_tlp), 1372 OID_AUTO, "pktdelay", CTLFLAG_RW, 1373 &rack_pkt_delay, 1000, 1374 "Extra RACK time (in microseconds) besides reordering thresh"); 1375 1376 /* Timer related controls */ 1377 rack_timers = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1378 SYSCTL_CHILDREN(rack_sysctl_root), 1379 OID_AUTO, 1380 "timers", 1381 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1382 "Timer related controls"); 1383 SYSCTL_ADD_U8(&rack_sysctl_ctx, 1384 SYSCTL_CHILDREN(rack_timers), 1385 OID_AUTO, "reset_ssth_rec_rto", CTLFLAG_RW, 1386 &rack_ssthresh_rest_rto_rec, 0, 1387 "When doing recovery -> rto -> recovery do we reset SSthresh?"); 1388 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1389 SYSCTL_CHILDREN(rack_timers), 1390 OID_AUTO, "scoreboard_thresh", CTLFLAG_RW, 1391 &rack_rxt_scoreboard_clear_thresh, 2, 1392 "How many RTO's are allowed before we clear the scoreboard"); 1393 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1394 SYSCTL_CHILDREN(rack_timers), 1395 OID_AUTO, "honor_hpts_min", CTLFLAG_RW, 1396 &rack_honors_hpts_min_to, 1, 1397 "Do rack pacing timers honor hpts min timeout"); 1398 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1399 SYSCTL_CHILDREN(rack_timers), 1400 OID_AUTO, "hpts_max_reduce", CTLFLAG_RW, 1401 &rack_max_reduce, 10, 1402 "Max percentage we will reduce slot by for pacing when we are behind"); 1403 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1404 SYSCTL_CHILDREN(rack_timers), 1405 OID_AUTO, "persmin", CTLFLAG_RW, 1406 &rack_persist_min, 250000, 1407 "What is the minimum time in microseconds between persists"); 1408 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1409 SYSCTL_CHILDREN(rack_timers), 1410 OID_AUTO, "persmax", CTLFLAG_RW, 1411 
&rack_persist_max, 2000000, 1412 "What is the largest delay in microseconds between persists"); 1413 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1414 SYSCTL_CHILDREN(rack_timers), 1415 OID_AUTO, "delayed_ack", CTLFLAG_RW, 1416 &rack_delayed_ack_time, 40000, 1417 "Delayed ack time (40ms in microseconds)"); 1418 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1419 SYSCTL_CHILDREN(rack_timers), 1420 OID_AUTO, "minrto", CTLFLAG_RW, 1421 &rack_rto_min, 30000, 1422 "Minimum RTO in microseconds -- set with caution below 1000 due to TLP"); 1423 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1424 SYSCTL_CHILDREN(rack_timers), 1425 OID_AUTO, "maxrto", CTLFLAG_RW, 1426 &rack_rto_max, 4000000, 1427 "Maximum RTO in microseconds -- should be at least as large as min_rto"); 1428 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1429 SYSCTL_CHILDREN(rack_timers), 1430 OID_AUTO, "minto", CTLFLAG_RW, 1431 &rack_min_to, 1000, 1432 "Minimum rack timeout in microseconds"); 1433 /* Measure controls */ 1434 rack_measure = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1435 SYSCTL_CHILDREN(rack_sysctl_root), 1436 OID_AUTO, 1437 "measure", 1438 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1439 "Measure related controls"); 1440 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1441 SYSCTL_CHILDREN(rack_measure), 1442 OID_AUTO, "wma_divisor", CTLFLAG_RW, 1443 &rack_wma_divisor, 8, 1444 "When doing b/w calculation what is the divisor for the WMA"); 1445 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1446 SYSCTL_CHILDREN(rack_measure), 1447 OID_AUTO, "end_cwnd", CTLFLAG_RW, 1448 &rack_cwnd_block_ends_measure, 0, 1449 "Does a cwnd just-return end the measurement window (app limited)"); 1450 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1451 SYSCTL_CHILDREN(rack_measure), 1452 OID_AUTO, "end_rwnd", CTLFLAG_RW, 1453 &rack_rwnd_block_ends_measure, 0, 1454 "Does an rwnd just-return end the measurement window (app limited -- not persists)"); 1455 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1456 SYSCTL_CHILDREN(rack_measure), 1457 OID_AUTO, "min_target", CTLFLAG_RW, 1458 &rack_def_data_window, 20, 1459 "What is the minimum target window (in mss) for a GP measurement"); 1460 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1461 SYSCTL_CHILDREN(rack_measure), 1462 OID_AUTO, "goal_bdp", CTLFLAG_RW, 1463 &rack_goal_bdp, 2, 1464 "What is the goal BDP to measure"); 1465 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1466 SYSCTL_CHILDREN(rack_measure), 1467 OID_AUTO, "min_srtts", CTLFLAG_RW, 1468 &rack_min_srtts, 1, 1469 "What is the minimum number of SRTTs a GP measurement must span"); 1470 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1471 SYSCTL_CHILDREN(rack_measure), 1472 OID_AUTO, "min_measure_tim", CTLFLAG_RW, 1473 &rack_min_measure_usec, 0, 1474 "What is the minimum time (in microseconds) for a measurement; 0 means this check is off"); 1475 /* Features */ 1476 rack_features = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1477 SYSCTL_CHILDREN(rack_sysctl_root), 1478 OID_AUTO, 1479 "features", 1480 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1481 "Feature controls"); 1482 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1483 SYSCTL_CHILDREN(rack_features), 1484 OID_AUTO, "hybrid_set_maxseg", CTLFLAG_RW, 1485 &rack_hybrid_allow_set_maxseg, 0, 1486 "Should hybrid pacing allow the setmss command"); 1487 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1488 SYSCTL_CHILDREN(rack_features), 1489 OID_AUTO, "cmpack", CTLFLAG_RW, 1490 &rack_use_cmp_acks, 1, 1491 "Should RACK have LRO send compressed acks"); 1492 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1493 SYSCTL_CHILDREN(rack_features), 1494 OID_AUTO, "fsb", CTLFLAG_RW, 1495 &rack_use_fsb, 1, 1496 "Should RACK use the fast send block?"); 1497 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1498 SYSCTL_CHILDREN(rack_features), 1499 OID_AUTO, "rfo", CTLFLAG_RW, 1500 &rack_use_rfo, 1, 1501
"Should RACK use rack_fast_output()?"); 1502 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1503 SYSCTL_CHILDREN(rack_features), 1504 OID_AUTO, "rsmrfo", CTLFLAG_RW, 1505 &rack_use_rsm_rfo, 1, 1506 "Should RACK use rack_fast_rsm_output()?"); 1507 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1508 SYSCTL_CHILDREN(rack_features), 1509 OID_AUTO, "non_paced_lro_queue", CTLFLAG_RW, 1510 &rack_enable_mqueue_for_nonpaced, 0, 1511 "Should RACK use mbuf queuing for non-paced connections"); 1512 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1513 SYSCTL_CHILDREN(rack_features), 1514 OID_AUTO, "hystartplusplus", CTLFLAG_RW, 1515 &rack_do_hystart, 0, 1516 "Should RACK enable HyStart++ on connections?"); 1517 /* Misc rack controls */ 1518 rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1519 SYSCTL_CHILDREN(rack_sysctl_root), 1520 OID_AUTO, 1521 "misc", 1522 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1523 "Misc related controls"); 1524 #ifdef TCP_ACCOUNTING 1525 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1526 SYSCTL_CHILDREN(rack_misc), 1527 OID_AUTO, "tcp_acct", CTLFLAG_RW, 1528 &rack_tcp_accounting, 0, 1529 "Should we turn on TCP accounting for all rack sessions?"); 1530 #endif 1531 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1532 SYSCTL_CHILDREN(rack_misc), 1533 OID_AUTO, "dnd", CTLFLAG_RW, 1534 &rack_dnd_default, 0, 1535 "Do not disturb default for rack_rrr = 3"); 1536 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1537 SYSCTL_CHILDREN(rack_misc), 1538 OID_AUTO, "sad_seg_per", CTLFLAG_RW, 1539 &sad_seg_size_per, 800, 1540 "Percentage of segment size needed in a sack 800 = 80.0?"); 1541 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1542 SYSCTL_CHILDREN(rack_misc), 1543 OID_AUTO, "rxt_controls", CTLFLAG_RW, 1544 &rack_rxt_controls, 0, 1545 "Retransmit sending size controls (valid values 0, 1, 2 default=1)?"); 1546 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1547 SYSCTL_CHILDREN(rack_misc), 1548 OID_AUTO, "rack_hibeta", CTLFLAG_RW, 1549 &rack_hibeta_setting, 0, 1550 "Do we ue a high beta (80 instead of 50)?"); 1551 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1552 SYSCTL_CHILDREN(rack_misc), 1553 OID_AUTO, "apply_rtt_with_low_conf", CTLFLAG_RW, 1554 &rack_apply_rtt_with_reduced_conf, 0, 1555 "When a persist or keep-alive probe is not answered do we calculate rtt on subsequent answers?"); 1556 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1557 SYSCTL_CHILDREN(rack_misc), 1558 OID_AUTO, "rack_dsack_ctl", CTLFLAG_RW, 1559 &rack_dsack_std_based, 3, 1560 "How do we process dsack with respect to rack timers, bit field, 3 is standards based?"); 1561 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1562 SYSCTL_CHILDREN(rack_misc), 1563 OID_AUTO, "prr_addback_max", CTLFLAG_RW, 1564 &rack_prr_addbackmax, 2, 1565 "What is the maximum number of MSS we allow to be added back if prr can't send all its data?"); 1566 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1567 SYSCTL_CHILDREN(rack_misc), 1568 OID_AUTO, "stats_gets_ms", CTLFLAG_RW, 1569 &rack_stats_gets_ms_rtt, 1, 1570 "What do we feed the stats framework (1 = ms_rtt, 0 = us_rtt, 2 = ms_rtt from hdwr, > 2 usec rtt from hdwr)?"); 1571 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1572 SYSCTL_CHILDREN(rack_misc), 1573 OID_AUTO, "clientlowbuf", CTLFLAG_RW, 1574 &rack_client_low_buf, 0, 1575 "Client low buffer level (below this we are more aggressive in DGP exiting recovery (0 = off)?"); 1576 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1577 SYSCTL_CHILDREN(rack_misc), 1578 OID_AUTO, "defprofile", CTLFLAG_RW, 1579 &rack_def_profile, 0, 1580 "Should RACK use a default profile (0=no, num == profile num)?"); 1581 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1582 SYSCTL_CHILDREN(rack_misc), 1583 OID_AUTO, "shared_cwnd", CTLFLAG_RW, 1584 &rack_enable_shared_cwnd, 1, 1585 
"Should RACK try to use the shared cwnd on connections where allowed"); 1586 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1587 SYSCTL_CHILDREN(rack_misc), 1588 OID_AUTO, "limits_on_scwnd", CTLFLAG_RW, 1589 &rack_limits_scwnd, 1, 1590 "Should RACK place low end time limits on the shared cwnd feature"); 1591 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1592 SYSCTL_CHILDREN(rack_misc), 1593 OID_AUTO, "no_prr", CTLFLAG_RW, 1594 &rack_disable_prr, 0, 1595 "Should RACK not use prr and only pace (must have pacing on)"); 1596 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1597 SYSCTL_CHILDREN(rack_misc), 1598 OID_AUTO, "bb_verbose", CTLFLAG_RW, 1599 &rack_verbose_logging, 0, 1600 "Should RACK black box logging be verbose"); 1601 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1602 SYSCTL_CHILDREN(rack_misc), 1603 OID_AUTO, "data_after_close", CTLFLAG_RW, 1604 &rack_ignore_data_after_close, 1, 1605 "Do we hold off sending a RST until all pending data is ack'd"); 1606 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1607 SYSCTL_CHILDREN(rack_misc), 1608 OID_AUTO, "no_sack_needed", CTLFLAG_RW, 1609 &rack_sack_not_required, 1, 1610 "Do we allow rack to run on connections not supporting SACK"); 1611 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1612 SYSCTL_CHILDREN(rack_misc), 1613 OID_AUTO, "prr_sendalot", CTLFLAG_RW, 1614 &rack_send_a_lot_in_prr, 1, 1615 "Send a lot in prr"); 1616 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1617 SYSCTL_CHILDREN(rack_misc), 1618 OID_AUTO, "autoscale", CTLFLAG_RW, 1619 &rack_autosndbuf_inc, 20, 1620 "What percentage should rack scale up its snd buffer by?"); 1621 1622 1623 /* Sack Attacker detection stuff */ 1624 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1625 SYSCTL_CHILDREN(rack_attack), 1626 OID_AUTO, "merge_out", CTLFLAG_RW, 1627 &rack_merge_out_sacks_on_attack, 0, 1628 "Do we merge the sendmap when we decide we are being attacked?"); 1629 1630 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1631 SYSCTL_CHILDREN(rack_attack), 1632 OID_AUTO, "detect_highsackratio", CTLFLAG_RW, 1633 &rack_highest_sack_thresh_seen, 0, 1634 "Highest sack to ack ratio seen"); 1635 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1636 SYSCTL_CHILDREN(rack_attack), 1637 OID_AUTO, "detect_highmoveratio", CTLFLAG_RW, 1638 &rack_highest_move_thresh_seen, 0, 1639 "Highest move to non-move ratio seen"); 1640 rack_ack_total = counter_u64_alloc(M_WAITOK); 1641 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1642 SYSCTL_CHILDREN(rack_attack), 1643 OID_AUTO, "acktotal", CTLFLAG_RD, 1644 &rack_ack_total, 1645 "Total number of Ack's"); 1646 rack_express_sack = counter_u64_alloc(M_WAITOK); 1647 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1648 SYSCTL_CHILDREN(rack_attack), 1649 OID_AUTO, "exp_sacktotal", CTLFLAG_RD, 1650 &rack_express_sack, 1651 "Total expresss number of Sack's"); 1652 rack_sack_total = counter_u64_alloc(M_WAITOK); 1653 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1654 SYSCTL_CHILDREN(rack_attack), 1655 OID_AUTO, "sacktotal", CTLFLAG_RD, 1656 &rack_sack_total, 1657 "Total number of SACKs"); 1658 rack_move_none = counter_u64_alloc(M_WAITOK); 1659 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1660 SYSCTL_CHILDREN(rack_attack), 1661 OID_AUTO, "move_none", CTLFLAG_RD, 1662 &rack_move_none, 1663 "Total number of SACK index reuse of positions under threshold"); 1664 rack_move_some = counter_u64_alloc(M_WAITOK); 1665 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1666 SYSCTL_CHILDREN(rack_attack), 1667 OID_AUTO, "move_some", CTLFLAG_RD, 1668 &rack_move_some, 1669 "Total number of SACK index reuse of positions over threshold"); 1670 rack_sack_attacks_detected = counter_u64_alloc(M_WAITOK); 1671 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1672 
SYSCTL_CHILDREN(rack_attack), 1673 OID_AUTO, "attacks", CTLFLAG_RD, 1674 &rack_sack_attacks_detected, 1675 "Total number of SACK attackers that had sack disabled"); 1676 rack_sack_attacks_reversed = counter_u64_alloc(M_WAITOK); 1677 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1678 SYSCTL_CHILDREN(rack_attack), 1679 OID_AUTO, "reversed", CTLFLAG_RD, 1680 &rack_sack_attacks_reversed, 1681 "Total number of SACK attackers that were later determined false positive"); 1682 rack_sack_attacks_suspect = counter_u64_alloc(M_WAITOK); 1683 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1684 SYSCTL_CHILDREN(rack_attack), 1685 OID_AUTO, "suspect", CTLFLAG_RD, 1686 &rack_sack_attacks_suspect, 1687 "Total number of SACKs that triggered early detection"); 1688 1689 rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK); 1690 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1691 SYSCTL_CHILDREN(rack_attack), 1692 OID_AUTO, "nextmerge", CTLFLAG_RD, 1693 &rack_sack_used_next_merge, 1694 "Total number of times we used the next merge"); 1695 rack_sack_used_prev_merge = counter_u64_alloc(M_WAITOK); 1696 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1697 SYSCTL_CHILDREN(rack_attack), 1698 OID_AUTO, "prevmerge", CTLFLAG_RD, 1699 &rack_sack_used_prev_merge, 1700 "Total number of times we used the prev merge"); 1701 /* Counters */ 1702 rack_total_bytes = counter_u64_alloc(M_WAITOK); 1703 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1704 SYSCTL_CHILDREN(rack_counters), 1705 OID_AUTO, "totalbytes", CTLFLAG_RD, 1706 &rack_total_bytes, 1707 "Total number of bytes sent"); 1708 rack_fto_send = counter_u64_alloc(M_WAITOK); 1709 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1710 SYSCTL_CHILDREN(rack_counters), 1711 OID_AUTO, "fto_send", CTLFLAG_RD, 1712 &rack_fto_send, "Total number of rack_fast_output sends"); 1713 rack_fto_rsm_send = counter_u64_alloc(M_WAITOK); 1714 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1715 SYSCTL_CHILDREN(rack_counters), 1716 OID_AUTO, "fto_rsm_send", CTLFLAG_RD, 1717 &rack_fto_rsm_send, "Total number of rack_fast_rsm_output sends"); 1718 rack_nfto_resend = counter_u64_alloc(M_WAITOK); 1719 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1720 SYSCTL_CHILDREN(rack_counters), 1721 OID_AUTO, "nfto_resend", CTLFLAG_RD, 1722 &rack_nfto_resend, "Total number of rack_output retransmissions"); 1723 rack_non_fto_send = counter_u64_alloc(M_WAITOK); 1724 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1725 SYSCTL_CHILDREN(rack_counters), 1726 OID_AUTO, "nfto_send", CTLFLAG_RD, 1727 &rack_non_fto_send, "Total number of rack_output first sends"); 1728 rack_extended_rfo = counter_u64_alloc(M_WAITOK); 1729 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1730 SYSCTL_CHILDREN(rack_counters), 1731 OID_AUTO, "rfo_extended", CTLFLAG_RD, 1732 &rack_extended_rfo, "Total number of times we extended rfo"); 1733 1734 rack_hw_pace_init_fail = counter_u64_alloc(M_WAITOK); 1735 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1736 SYSCTL_CHILDREN(rack_counters), 1737 OID_AUTO, "hwpace_init_fail", CTLFLAG_RD, 1738 &rack_hw_pace_init_fail, "Total number of times we failed to initialize hw pacing"); 1739 rack_hw_pace_lost = counter_u64_alloc(M_WAITOK); 1740 1741 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1742 SYSCTL_CHILDREN(rack_counters), 1743 OID_AUTO, "hwpace_lost", CTLFLAG_RD, 1744 &rack_hw_pace_lost, "Total number of times we lost the use of hw pacing after it was set up"); 1745 rack_tlp_tot = counter_u64_alloc(M_WAITOK); 1746 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1747 SYSCTL_CHILDREN(rack_counters), 1748 OID_AUTO, "tlp_to_total", CTLFLAG_RD, 1749 &rack_tlp_tot, 1750 "Total number of tail loss
probe expirations"); 1751 rack_tlp_newdata = counter_u64_alloc(M_WAITOK); 1752 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1753 SYSCTL_CHILDREN(rack_counters), 1754 OID_AUTO, "tlp_new", CTLFLAG_RD, 1755 &rack_tlp_newdata, 1756 "Total number of tail loss probe sending new data"); 1757 rack_tlp_retran = counter_u64_alloc(M_WAITOK); 1758 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1759 SYSCTL_CHILDREN(rack_counters), 1760 OID_AUTO, "tlp_retran", CTLFLAG_RD, 1761 &rack_tlp_retran, 1762 "Total number of tail loss probe sending retransmitted data"); 1763 rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK); 1764 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1765 SYSCTL_CHILDREN(rack_counters), 1766 OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD, 1767 &rack_tlp_retran_bytes, 1768 "Total bytes of tail loss probe sending retransmitted data"); 1769 rack_to_tot = counter_u64_alloc(M_WAITOK); 1770 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1771 SYSCTL_CHILDREN(rack_counters), 1772 OID_AUTO, "rack_to_tot", CTLFLAG_RD, 1773 &rack_to_tot, 1774 "Total number of times the rack to expired"); 1775 rack_saw_enobuf = counter_u64_alloc(M_WAITOK); 1776 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1777 SYSCTL_CHILDREN(rack_counters), 1778 OID_AUTO, "saw_enobufs", CTLFLAG_RD, 1779 &rack_saw_enobuf, 1780 "Total number of times a sends returned enobuf for non-hdwr paced connections"); 1781 rack_saw_enobuf_hw = counter_u64_alloc(M_WAITOK); 1782 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1783 SYSCTL_CHILDREN(rack_counters), 1784 OID_AUTO, "saw_enobufs_hw", CTLFLAG_RD, 1785 &rack_saw_enobuf_hw, 1786 "Total number of times a send returned enobuf for hdwr paced connections"); 1787 rack_saw_enetunreach = counter_u64_alloc(M_WAITOK); 1788 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1789 SYSCTL_CHILDREN(rack_counters), 1790 OID_AUTO, "saw_enetunreach", CTLFLAG_RD, 1791 &rack_saw_enetunreach, 1792 "Total number of times a send received a enetunreachable"); 1793 rack_hot_alloc = counter_u64_alloc(M_WAITOK); 1794 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1795 SYSCTL_CHILDREN(rack_counters), 1796 OID_AUTO, "alloc_hot", CTLFLAG_RD, 1797 &rack_hot_alloc, 1798 "Total allocations from the top of our list"); 1799 rack_to_alloc = counter_u64_alloc(M_WAITOK); 1800 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1801 SYSCTL_CHILDREN(rack_counters), 1802 OID_AUTO, "allocs", CTLFLAG_RD, 1803 &rack_to_alloc, 1804 "Total allocations of tracking structures"); 1805 rack_to_alloc_hard = counter_u64_alloc(M_WAITOK); 1806 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1807 SYSCTL_CHILDREN(rack_counters), 1808 OID_AUTO, "allochard", CTLFLAG_RD, 1809 &rack_to_alloc_hard, 1810 "Total allocations done with sleeping the hard way"); 1811 rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK); 1812 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1813 SYSCTL_CHILDREN(rack_counters), 1814 OID_AUTO, "allocemerg", CTLFLAG_RD, 1815 &rack_to_alloc_emerg, 1816 "Total allocations done from emergency cache"); 1817 rack_to_alloc_limited = counter_u64_alloc(M_WAITOK); 1818 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1819 SYSCTL_CHILDREN(rack_counters), 1820 OID_AUTO, "alloc_limited", CTLFLAG_RD, 1821 &rack_to_alloc_limited, 1822 "Total allocations dropped due to limit"); 1823 rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK); 1824 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1825 SYSCTL_CHILDREN(rack_counters), 1826 OID_AUTO, "alloc_limited_conns", CTLFLAG_RD, 1827 &rack_alloc_limited_conns, 1828 "Connections with allocations dropped due to limit"); 1829 rack_split_limited = counter_u64_alloc(M_WAITOK); 1830 
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1831 SYSCTL_CHILDREN(rack_counters), 1832 OID_AUTO, "split_limited", CTLFLAG_RD, 1833 &rack_split_limited, 1834 "Split allocations dropped due to limit"); 1835 rack_rxt_clamps_cwnd = counter_u64_alloc(M_WAITOK); 1836 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1837 SYSCTL_CHILDREN(rack_counters), 1838 OID_AUTO, "rxt_clamps_cwnd", CTLFLAG_RD, 1839 &rack_rxt_clamps_cwnd, 1840 "Number of times that excessive rxt clamped the cwnd down"); 1841 rack_rxt_clamps_cwnd_uniq = counter_u64_alloc(M_WAITOK); 1842 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1843 SYSCTL_CHILDREN(rack_counters), 1844 OID_AUTO, "rxt_clamps_cwnd_uniq", CTLFLAG_RD, 1845 &rack_rxt_clamps_cwnd_uniq, 1846 "Number of connections that have had excessive rxt clamped the cwnd down"); 1847 rack_persists_sends = counter_u64_alloc(M_WAITOK); 1848 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1849 SYSCTL_CHILDREN(rack_counters), 1850 OID_AUTO, "persist_sends", CTLFLAG_RD, 1851 &rack_persists_sends, 1852 "Number of times we sent a persist probe"); 1853 rack_persists_acks = counter_u64_alloc(M_WAITOK); 1854 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1855 SYSCTL_CHILDREN(rack_counters), 1856 OID_AUTO, "persist_acks", CTLFLAG_RD, 1857 &rack_persists_acks, 1858 "Number of times a persist probe was acked"); 1859 rack_persists_loss = counter_u64_alloc(M_WAITOK); 1860 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1861 SYSCTL_CHILDREN(rack_counters), 1862 OID_AUTO, "persist_loss", CTLFLAG_RD, 1863 &rack_persists_loss, 1864 "Number of times we detected a lost persist probe (no ack)"); 1865 rack_persists_lost_ends = counter_u64_alloc(M_WAITOK); 1866 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1867 SYSCTL_CHILDREN(rack_counters), 1868 OID_AUTO, "persist_loss_ends", CTLFLAG_RD, 1869 &rack_persists_lost_ends, 1870 "Number of lost persist probe (no ack) that the run ended with a PERSIST abort"); 1871 #ifdef INVARIANTS 1872 rack_adjust_map_bw = counter_u64_alloc(M_WAITOK); 1873 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1874 SYSCTL_CHILDREN(rack_counters), 1875 OID_AUTO, "map_adjust_req", CTLFLAG_RD, 1876 &rack_adjust_map_bw, 1877 "Number of times we hit the case where the sb went up and down on a sendmap entry"); 1878 #endif 1879 rack_multi_single_eq = counter_u64_alloc(M_WAITOK); 1880 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1881 SYSCTL_CHILDREN(rack_counters), 1882 OID_AUTO, "cmp_ack_equiv", CTLFLAG_RD, 1883 &rack_multi_single_eq, 1884 "Number of compressed acks total represented"); 1885 rack_proc_non_comp_ack = counter_u64_alloc(M_WAITOK); 1886 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1887 SYSCTL_CHILDREN(rack_counters), 1888 OID_AUTO, "cmp_ack_not", CTLFLAG_RD, 1889 &rack_proc_non_comp_ack, 1890 "Number of non compresseds acks that we processed"); 1891 1892 1893 rack_sack_proc_all = counter_u64_alloc(M_WAITOK); 1894 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1895 SYSCTL_CHILDREN(rack_counters), 1896 OID_AUTO, "sack_long", CTLFLAG_RD, 1897 &rack_sack_proc_all, 1898 "Total times we had to walk whole list for sack processing"); 1899 rack_sack_proc_restart = counter_u64_alloc(M_WAITOK); 1900 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1901 SYSCTL_CHILDREN(rack_counters), 1902 OID_AUTO, "sack_restart", CTLFLAG_RD, 1903 &rack_sack_proc_restart, 1904 "Total times we had to walk whole list due to a restart"); 1905 rack_sack_proc_short = counter_u64_alloc(M_WAITOK); 1906 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1907 SYSCTL_CHILDREN(rack_counters), 1908 OID_AUTO, "sack_short", CTLFLAG_RD, 1909 &rack_sack_proc_short, 1910 "Total times we took 
shortcut for sack processing"); 1911 rack_sack_skipped_acked = counter_u64_alloc(M_WAITOK); 1912 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1913 SYSCTL_CHILDREN(rack_attack), 1914 OID_AUTO, "skipacked", CTLFLAG_RD, 1915 &rack_sack_skipped_acked, 1916 "Total number of times we skipped previously sacked"); 1917 rack_sack_splits = counter_u64_alloc(M_WAITOK); 1918 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1919 SYSCTL_CHILDREN(rack_attack), 1920 OID_AUTO, "ofsplit", CTLFLAG_RD, 1921 &rack_sack_splits, 1922 "Total number of times we did the old fashion tree split"); 1923 rack_input_idle_reduces = counter_u64_alloc(M_WAITOK); 1924 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1925 SYSCTL_CHILDREN(rack_counters), 1926 OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD, 1927 &rack_input_idle_reduces, 1928 "Total number of idle reductions on input"); 1929 rack_collapsed_win_seen = counter_u64_alloc(M_WAITOK); 1930 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1931 SYSCTL_CHILDREN(rack_counters), 1932 OID_AUTO, "collapsed_win_seen", CTLFLAG_RD, 1933 &rack_collapsed_win_seen, 1934 "Total number of collapsed window events seen (where our window shrinks)"); 1935 1936 rack_collapsed_win = counter_u64_alloc(M_WAITOK); 1937 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1938 SYSCTL_CHILDREN(rack_counters), 1939 OID_AUTO, "collapsed_win", CTLFLAG_RD, 1940 &rack_collapsed_win, 1941 "Total number of collapsed window events where we mark packets"); 1942 rack_collapsed_win_rxt = counter_u64_alloc(M_WAITOK); 1943 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1944 SYSCTL_CHILDREN(rack_counters), 1945 OID_AUTO, "collapsed_win_rxt", CTLFLAG_RD, 1946 &rack_collapsed_win_rxt, 1947 "Total number of packets that were retransmitted"); 1948 rack_collapsed_win_rxt_bytes = counter_u64_alloc(M_WAITOK); 1949 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1950 SYSCTL_CHILDREN(rack_counters), 1951 OID_AUTO, "collapsed_win_bytes", CTLFLAG_RD, 1952 &rack_collapsed_win_rxt_bytes, 1953 "Total number of bytes that were retransmitted"); 1954 rack_try_scwnd = counter_u64_alloc(M_WAITOK); 1955 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1956 SYSCTL_CHILDREN(rack_counters), 1957 OID_AUTO, "tried_scwnd", CTLFLAG_RD, 1958 &rack_try_scwnd, 1959 "Total number of scwnd attempts"); 1960 COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); 1961 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 1962 OID_AUTO, "outsize", CTLFLAG_RD, 1963 rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes"); 1964 COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK); 1965 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 1966 OID_AUTO, "opts", CTLFLAG_RD, 1967 rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats"); 1968 SYSCTL_ADD_PROC(&rack_sysctl_ctx, 1969 SYSCTL_CHILDREN(rack_sysctl_root), 1970 OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 1971 &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters"); 1972 } 1973 1974 static uint32_t 1975 rc_init_window(struct tcp_rack *rack) 1976 { 1977 return (tcp_compute_initwnd(tcp_maxseg(rack->rc_tp))); 1978 1979 } 1980 1981 static uint64_t 1982 rack_get_fixed_pacing_bw(struct tcp_rack *rack) 1983 { 1984 if (IN_FASTRECOVERY(rack->rc_tp->t_flags)) 1985 return (rack->r_ctl.rc_fixed_pacing_rate_rec); 1986 else if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) 1987 return (rack->r_ctl.rc_fixed_pacing_rate_ss); 1988 else 1989 return (rack->r_ctl.rc_fixed_pacing_rate_ca); 1990 } 1991 1992 static void 1993 rack_log_hybrid_bw(struct tcp_rack *rack, 
uint32_t seq, uint64_t cbw, uint64_t tim, 1994 uint64_t data, uint8_t mod, uint16_t aux, 1995 struct tcp_sendfile_track *cur, int line) 1996 { 1997 #ifdef TCP_REQUEST_TRK 1998 int do_log = 0; 1999 2000 /* 2001 * The rate cap one is noisy and only should come out when normal BB logging 2002 * is enabled, the other logs (not RATE_CAP and NOT CAP_CALC) only come out 2003 * once per chunk and make up the BBpoint that can be turned on by the client. 2004 */ 2005 if ((mod == HYBRID_LOG_RATE_CAP) || (mod == HYBRID_LOG_CAP_CALC)) { 2006 /* 2007 * The very noisy two need to only come out when 2008 * we have verbose logging on. 2009 */ 2010 if (rack_verbose_logging != 0) 2011 do_log = tcp_bblogging_on(rack->rc_tp); 2012 else 2013 do_log = 0; 2014 } else if (mod != HYBRID_LOG_BW_MEASURE) { 2015 /* 2016 * All other less noisy logs here except the measure which 2017 * also needs to come out on the point and the log. 2018 */ 2019 do_log = tcp_bblogging_on(rack->rc_tp); 2020 } else { 2021 do_log = tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING); 2022 } 2023 2024 if (do_log) { 2025 union tcp_log_stackspecific log; 2026 struct timeval tv; 2027 uint64_t lt_bw; 2028 2029 /* Convert our ms to a microsecond */ 2030 memset(&log, 0, sizeof(log)); 2031 2032 log.u_bbr.cwnd_gain = line; 2033 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2034 log.u_bbr.rttProp = tim; 2035 log.u_bbr.bw_inuse = cbw; 2036 log.u_bbr.delRate = rack_get_gp_est(rack); 2037 lt_bw = rack_get_lt_bw(rack); 2038 log.u_bbr.flex1 = seq; 2039 log.u_bbr.pacing_gain = aux; 2040 /* lt_bw = < flex3 | flex2 > */ 2041 log.u_bbr.flex2 = (uint32_t)(lt_bw & 0x00000000ffffffff); 2042 log.u_bbr.flex3 = (uint32_t)((lt_bw >> 32) & 0x00000000ffffffff); 2043 /* Record the last obtained us rtt in inflight */ 2044 if (cur == NULL) { 2045 /* Make sure we are looking at the right log if an overide comes in */ 2046 cur = rack->r_ctl.rc_last_sft; 2047 } 2048 if (rack->r_ctl.rack_rs.rs_flags != RACK_RTT_EMPTY) 2049 log.u_bbr.inflight = rack->r_ctl.rack_rs.rs_us_rtt; 2050 else { 2051 /* Use the last known rtt i.e. 
the rack-rtt */ 2052 log.u_bbr.inflight = rack->rc_rack_rtt; 2053 } 2054 if (cur != NULL) { 2055 uint64_t off; 2056 2057 log.u_bbr.cur_del_rate = cur->deadline; 2058 if ((mod == HYBRID_LOG_RATE_CAP) || (mod == HYBRID_LOG_CAP_CALC)) { 2059 /* start = < lost | pkt_epoch > */ 2060 log.u_bbr.pkt_epoch = (uint32_t)(cur->start & 0x00000000ffffffff); 2061 log.u_bbr.lost = (uint32_t)((cur->start >> 32) & 0x00000000ffffffff); 2062 log.u_bbr.flex6 = cur->start_seq; 2063 log.u_bbr.pkts_out = cur->end_seq; 2064 } else { 2065 /* start = < lost | pkt_epoch > */ 2066 log.u_bbr.pkt_epoch = (uint32_t)(cur->start & 0x00000000ffffffff); 2067 log.u_bbr.lost = (uint32_t)((cur->start >> 32) & 0x00000000ffffffff); 2068 /* end = < pkts_out | flex6 > */ 2069 log.u_bbr.flex6 = (uint32_t)(cur->end & 0x00000000ffffffff); 2070 log.u_bbr.pkts_out = (uint32_t)((cur->end >> 32) & 0x00000000ffffffff); 2071 } 2072 /* first_send = <lt_epoch | epoch> */ 2073 log.u_bbr.epoch = (uint32_t)(cur->first_send & 0x00000000ffffffff); 2074 log.u_bbr.lt_epoch = (uint32_t)((cur->first_send >> 32) & 0x00000000ffffffff); 2075 /* localtime = <delivered | applimited>*/ 2076 log.u_bbr.applimited = (uint32_t)(cur->localtime & 0x00000000ffffffff); 2077 log.u_bbr.delivered = (uint32_t)((cur->localtime >> 32) & 0x00000000ffffffff); 2078 #ifdef TCP_REQUEST_TRK 2079 off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_tcpreq_info[0]); 2080 log.u_bbr.bbr_substate = (uint8_t)(off / sizeof(struct tcp_sendfile_track)); 2081 #endif 2082 log.u_bbr.inhpts = 1; 2083 log.u_bbr.flex4 = (uint32_t)(rack->rc_tp->t_sndbytes - cur->sent_at_fs); 2084 log.u_bbr.flex5 = (uint32_t)(rack->rc_tp->t_snd_rxt_bytes - cur->rxt_at_fs); 2085 log.u_bbr.flex7 = (uint16_t)cur->hybrid_flags; 2086 } else { 2087 log.u_bbr.flex7 = 0xffff; 2088 log.u_bbr.cur_del_rate = 0xffffffffffffffff; 2089 } 2090 /* 2091 * Compose bbr_state to be a bit wise 0000ADHF 2092 * where A is the always_pace flag 2093 * where D is the dgp_on flag 2094 * where H is the hybrid_mode on flag 2095 * where F is the use_fixed_rate flag. 2096 */ 2097 log.u_bbr.bbr_state = rack->rc_always_pace; 2098 log.u_bbr.bbr_state <<= 1; 2099 log.u_bbr.bbr_state |= rack->dgp_on; 2100 log.u_bbr.bbr_state <<= 1; 2101 log.u_bbr.bbr_state |= rack->rc_hybrid_mode; 2102 log.u_bbr.bbr_state <<= 1; 2103 log.u_bbr.bbr_state |= rack->use_fixed_rate; 2104 log.u_bbr.flex8 = mod; 2105 tcp_log_event(rack->rc_tp, NULL, 2106 &rack->rc_inp->inp_socket->so_rcv, 2107 &rack->rc_inp->inp_socket->so_snd, 2108 TCP_HYBRID_PACING_LOG, 0, 2109 0, &log, false, NULL, __func__, __LINE__, &tv); 2110 2111 } 2112 #endif 2113 } 2114 2115 #ifdef TCP_REQUEST_TRK 2116 static void 2117 rack_log_hybrid_sends(struct tcp_rack *rack, struct tcp_sendfile_track *cur, int line) 2118 { 2119 if (tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING)) { 2120 union tcp_log_stackspecific log; 2121 struct timeval tv; 2122 uint64_t off; 2123 2124 /* Convert our ms to a microsecond */ 2125 memset(&log, 0, sizeof(log)); 2126 2127 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2128 log.u_bbr.delRate = cur->sent_at_fs; 2129 2130 if ((cur->flags & TCP_TRK_TRACK_FLG_LSND) == 0) { 2131 /* 2132 * We did not get a new Rules Applied to set so 2133 * no overlapping send occured, this means the 2134 * current byte counts are correct. 2135 */ 2136 log.u_bbr.cur_del_rate = rack->rc_tp->t_sndbytes; 2137 log.u_bbr.rttProp = rack->rc_tp->t_snd_rxt_bytes; 2138 } else { 2139 /* 2140 * Overlapping send case, we switched to a new 2141 * send and did a rules applied. 
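			 * (Clarifying note added here, based on the assignments
			 * just below: in this overlapping-send case the
			 * cur->sent_at_ls and cur->rxt_at_ls snapshots are
			 * logged rather than the connection-wide t_sndbytes
			 * and t_snd_rxt_bytes used in the other branch above.)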
2142 */ 2143 log.u_bbr.cur_del_rate = cur->sent_at_ls; 2144 log.u_bbr.rttProp = cur->rxt_at_ls; 2145 } 2146 log.u_bbr.bw_inuse = cur->rxt_at_fs; 2147 log.u_bbr.cwnd_gain = line; 2148 off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_tcpreq_info[0]); 2149 log.u_bbr.bbr_substate = (uint8_t)(off / sizeof(struct tcp_sendfile_track)); 2150 /* start = < flex1 | flex2 > */ 2151 log.u_bbr.flex2 = (uint32_t)(cur->start & 0x00000000ffffffff); 2152 log.u_bbr.flex1 = (uint32_t)((cur->start >> 32) & 0x00000000ffffffff); 2153 /* end = < flex3 | flex4 > */ 2154 log.u_bbr.flex4 = (uint32_t)(cur->end & 0x00000000ffffffff); 2155 log.u_bbr.flex3 = (uint32_t)((cur->end >> 32) & 0x00000000ffffffff); 2156 2157 /* localtime = <delivered | applimited>*/ 2158 log.u_bbr.applimited = (uint32_t)(cur->localtime & 0x00000000ffffffff); 2159 log.u_bbr.delivered = (uint32_t)((cur->localtime >> 32) & 0x00000000ffffffff); 2160 /* client timestamp = <lt_epoch | epoch>*/ 2161 log.u_bbr.epoch = (uint32_t)(cur->timestamp & 0x00000000ffffffff); 2162 log.u_bbr.lt_epoch = (uint32_t)((cur->timestamp >> 32) & 0x00000000ffffffff); 2163 /* now set all the flags in */ 2164 log.u_bbr.pkts_out = cur->hybrid_flags; 2165 log.u_bbr.lost = cur->playout_ms; 2166 log.u_bbr.flex6 = cur->flags; 2167 /* 2168 * Last send time = <flex5 | pkt_epoch> note we do not distinguish cases 2169 * where a false retransmit occurred so first_send <-> lastsend may 2170 * include longer time then it actually took if we have a false rxt. 2171 */ 2172 log.u_bbr.pkt_epoch = (uint32_t)(rack->r_ctl.last_tmit_time_acked & 0x00000000ffffffff); 2173 log.u_bbr.flex5 = (uint32_t)((rack->r_ctl.last_tmit_time_acked >> 32) & 0x00000000ffffffff); 2174 /* 2175 * Compose bbr_state to be a bit wise 0000ADHF 2176 * where A is the always_pace flag 2177 * where D is the dgp_on flag 2178 * where H is the hybrid_mode on flag 2179 * where F is the use_fixed_rate flag. 
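		 * (Illustrative example, not part of the original comment:
		 * with always_pace=1, dgp_on=0, hybrid_mode=1 and
		 * use_fixed_rate=0 the shifts below yield
		 * bbr_state = 0b1010, i.e. 0xa.)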
2180 */ 2181 log.u_bbr.bbr_state = rack->rc_always_pace; 2182 log.u_bbr.bbr_state <<= 1; 2183 log.u_bbr.bbr_state |= rack->dgp_on; 2184 log.u_bbr.bbr_state <<= 1; 2185 log.u_bbr.bbr_state |= rack->rc_hybrid_mode; 2186 log.u_bbr.bbr_state <<= 1; 2187 log.u_bbr.bbr_state |= rack->use_fixed_rate; 2188 2189 log.u_bbr.flex8 = HYBRID_LOG_SENT_LOST; 2190 tcp_log_event(rack->rc_tp, NULL, 2191 &rack->rc_inp->inp_socket->so_rcv, 2192 &rack->rc_inp->inp_socket->so_snd, 2193 TCP_HYBRID_PACING_LOG, 0, 2194 0, &log, false, NULL, __func__, __LINE__, &tv); 2195 } 2196 } 2197 #endif 2198 2199 static inline uint64_t 2200 rack_compensate_for_linerate(struct tcp_rack *rack, uint64_t bw) 2201 { 2202 uint64_t ret_bw, ether; 2203 uint64_t u_segsiz; 2204 2205 ether = rack->rc_tp->t_maxseg + sizeof(struct tcphdr); 2206 if (rack->r_is_v6){ 2207 #ifdef INET6 2208 ether += sizeof(struct ip6_hdr); 2209 #endif 2210 ether += 14; /* eheader size 6+6+2 */ 2211 } else { 2212 #ifdef INET 2213 ether += sizeof(struct ip); 2214 #endif 2215 ether += 14; /* eheader size 6+6+2 */ 2216 } 2217 u_segsiz = (uint64_t)min(ctf_fixed_maxseg(rack->rc_tp), rack->r_ctl.rc_pace_min_segs); 2218 ret_bw = bw; 2219 ret_bw *= ether; 2220 ret_bw /= u_segsiz; 2221 return (ret_bw); 2222 } 2223 2224 static void 2225 rack_rate_cap_bw(struct tcp_rack *rack, uint64_t *bw, int *capped) 2226 { 2227 #ifdef TCP_REQUEST_TRK 2228 struct timeval tv; 2229 uint64_t timenow, timeleft, lenleft, lengone, calcbw; 2230 #endif 2231 2232 if (rack->r_ctl.bw_rate_cap == 0) 2233 return; 2234 #ifdef TCP_REQUEST_TRK 2235 if (rack->rc_catch_up && rack->rc_hybrid_mode && 2236 (rack->r_ctl.rc_last_sft != NULL)) { 2237 /* 2238 * We have a dynamic cap. The original target 2239 * is in bw_rate_cap, but we need to look at 2240 * how long it is until we hit the deadline. 2241 */ 2242 struct tcp_sendfile_track *ent; 2243 2244 ent = rack->r_ctl.rc_last_sft; 2245 microuptime(&tv); 2246 timenow = tcp_tv_to_lusec(&tv); 2247 if (timenow >= ent->deadline) { 2248 /* No time left we do DGP only */ 2249 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2250 0, 0, 0, HYBRID_LOG_OUTOFTIME, 0, ent, __LINE__); 2251 rack->r_ctl.bw_rate_cap = 0; 2252 return; 2253 } 2254 /* We have the time */ 2255 timeleft = rack->r_ctl.rc_last_sft->deadline - timenow; 2256 if (timeleft < HPTS_MSEC_IN_SEC) { 2257 /* If there is less than a ms left just use DGPs rate */ 2258 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2259 0, timeleft, 0, HYBRID_LOG_OUTOFTIME, 0, ent, __LINE__); 2260 rack->r_ctl.bw_rate_cap = 0; 2261 return; 2262 } 2263 /* 2264 * Now lets find the amount of data left to send. 2265 * 2266 * Now ideally we want to use the end_seq to figure out how much more 2267 * but it might not be possible (only if we have the TRACK_FG_COMP on the entry.. 2268 */ 2269 if (ent->flags & TCP_TRK_TRACK_FLG_COMP) { 2270 if (SEQ_GT(ent->end_seq, rack->rc_tp->snd_una)) 2271 lenleft = ent->end_seq - rack->rc_tp->snd_una; 2272 else { 2273 /* TSNH, we should catch it at the send */ 2274 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2275 0, timeleft, 0, HYBRID_LOG_CAPERROR, 0, ent, __LINE__); 2276 rack->r_ctl.bw_rate_cap = 0; 2277 return; 2278 } 2279 } else { 2280 /* 2281 * The hard way, figure out how much is gone and then 2282 * take that away from the total the client asked for 2283 * (thats off by tls overhead if this is tls). 
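			 * (Worked example added for clarity: if the request
			 * spans end - start = 1000000 bytes and snd_una has
			 * advanced 400000 bytes past start_seq, then
			 * lengone = 400000 and
			 * lenleft = 1000000 - 400000 = 600000 bytes remain
			 * to be sent.)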
2284 */ 2285 if (SEQ_GT(rack->rc_tp->snd_una, ent->start_seq)) 2286 lengone = rack->rc_tp->snd_una - ent->start_seq; 2287 else 2288 lengone = 0; 2289 if (lengone < (ent->end - ent->start)) 2290 lenleft = (ent->end - ent->start) - lengone; 2291 else { 2292 /* TSNH, we should catch it at the send */ 2293 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2294 0, timeleft, lengone, HYBRID_LOG_CAPERROR, 0, ent, __LINE__); 2295 rack->r_ctl.bw_rate_cap = 0; 2296 return; 2297 } 2298 } 2299 if (lenleft == 0) { 2300 /* We have it all sent */ 2301 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2302 0, timeleft, lenleft, HYBRID_LOG_ALLSENT, 0, ent, __LINE__); 2303 if (rack->r_ctl.bw_rate_cap) 2304 goto normal_ratecap; 2305 else 2306 return; 2307 } 2308 calcbw = lenleft * HPTS_USEC_IN_SEC; 2309 calcbw /= timeleft; 2310 /* Now we must compensate for IP/TCP overhead */ 2311 calcbw = rack_compensate_for_linerate(rack, calcbw); 2312 /* Update the bit rate cap */ 2313 rack->r_ctl.bw_rate_cap = calcbw; 2314 if ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_S_MSS) && 2315 (rack_hybrid_allow_set_maxseg == 1) && 2316 ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_SETMSS) == 0)) { 2317 /* Lets set in a smaller mss possibly here to match our rate-cap */ 2318 uint32_t orig_max; 2319 2320 orig_max = rack->r_ctl.rc_pace_max_segs; 2321 rack->r_ctl.rc_last_sft->hybrid_flags |= TCP_HYBRID_PACING_SETMSS; 2322 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, calcbw, ctf_fixed_maxseg(rack->rc_tp)); 2323 rack_log_type_pacing_sizes(rack->rc_tp, rack, rack->r_ctl.client_suggested_maxseg, orig_max, __LINE__, 5); 2324 } 2325 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2326 calcbw, timeleft, lenleft, HYBRID_LOG_CAP_CALC, 0, ent, __LINE__); 2327 if ((calcbw > 0) && (*bw > calcbw)) { 2328 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2329 *bw, ent->deadline, lenleft, HYBRID_LOG_RATE_CAP, 0, ent, __LINE__); 2330 *capped = 1; 2331 *bw = calcbw; 2332 } 2333 return; 2334 } 2335 normal_ratecap: 2336 #endif 2337 if ((rack->r_ctl.bw_rate_cap > 0) && (*bw > rack->r_ctl.bw_rate_cap)) { 2338 #ifdef TCP_REQUEST_TRK 2339 if (rack->rc_hybrid_mode && 2340 rack->rc_catch_up && 2341 (rack->r_ctl.rc_last_sft != NULL) && 2342 (rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_S_MSS) && 2343 (rack_hybrid_allow_set_maxseg == 1) && 2344 ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_SETMSS) == 0)) { 2345 /* Lets set in a smaller mss possibly here to match our rate-cap */ 2346 uint32_t orig_max; 2347 2348 orig_max = rack->r_ctl.rc_pace_max_segs; 2349 rack->r_ctl.rc_last_sft->hybrid_flags |= TCP_HYBRID_PACING_SETMSS; 2350 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, rack->r_ctl.bw_rate_cap, ctf_fixed_maxseg(rack->rc_tp)); 2351 rack_log_type_pacing_sizes(rack->rc_tp, rack, rack->r_ctl.client_suggested_maxseg, orig_max, __LINE__, 5); 2352 } 2353 #endif 2354 *capped = 1; 2355 *bw = rack->r_ctl.bw_rate_cap; 2356 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2357 *bw, 0, 0, 2358 HYBRID_LOG_RATE_CAP, 1, NULL, __LINE__); 2359 } 2360 } 2361 2362 static uint64_t 2363 rack_get_gp_est(struct tcp_rack *rack) 2364 { 2365 uint64_t bw, lt_bw, ret_bw; 2366 2367 if (rack->rc_gp_filled == 0) { 2368 /* 2369 * We have yet no b/w measurement, 2370 * if we have a user set initial bw 2371 * return it. If we don't have that and 2372 * we have an srtt, use the tcp IW (10) to 2373 * calculate a fictional b/w over the SRTT 2374 * which is more or less a guess. 
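		 * (Illustrative numbers, not in the original comment: with a
		 * 1448 byte maxseg the initial window of 10 segments is about
		 * 14480 bytes, so a t_srtt of 50000 -- treated as
		 * microseconds by the math below -- gives roughly
		 * 14480 * 1000000 / 50000 ~= 289600 bytes/sec.)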
Note 2375 * we don't use our IW from rack on purpose 2376 * so if we have like IW=30, we are not 2377 * calculating a "huge" b/w. 2378 */ 2379 uint64_t srtt; 2380 2381 if (rack->dis_lt_bw == 1) 2382 lt_bw = 0; 2383 else 2384 lt_bw = rack_get_lt_bw(rack); 2385 if (lt_bw) { 2386 /* 2387 * No goodput bw but a long-term b/w does exist 2388 * lets use that. 2389 */ 2390 ret_bw = lt_bw; 2391 goto compensate; 2392 } 2393 if (rack->r_ctl.init_rate) 2394 return (rack->r_ctl.init_rate); 2395 2396 /* Ok lets come up with the IW guess, if we have a srtt */ 2397 if (rack->rc_tp->t_srtt == 0) { 2398 /* 2399 * Go with old pacing method 2400 * i.e. burst mitigation only. 2401 */ 2402 return (0); 2403 } 2404 /* Ok lets get the initial TCP win (not racks) */ 2405 bw = tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)); 2406 srtt = (uint64_t)rack->rc_tp->t_srtt; 2407 bw *= (uint64_t)USECS_IN_SECOND; 2408 bw /= srtt; 2409 ret_bw = bw; 2410 goto compensate; 2411 2412 } 2413 if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) { 2414 /* Averaging is done, we can return the value */ 2415 bw = rack->r_ctl.gp_bw; 2416 } else { 2417 /* Still doing initial average must calculate */ 2418 bw = rack->r_ctl.gp_bw / max(rack->r_ctl.num_measurements, 1); 2419 } 2420 if (rack->dis_lt_bw) { 2421 /* We are not using lt-bw */ 2422 ret_bw = bw; 2423 goto compensate; 2424 } 2425 lt_bw = rack_get_lt_bw(rack); 2426 if (lt_bw == 0) { 2427 /* If we don't have one then equate it to the gp_bw */ 2428 lt_bw = rack->r_ctl.gp_bw; 2429 } 2430 if (rack->use_lesser_lt_bw) { 2431 if (lt_bw < bw) 2432 ret_bw = lt_bw; 2433 else 2434 ret_bw = bw; 2435 } else { 2436 if (lt_bw > bw) 2437 ret_bw = lt_bw; 2438 else 2439 ret_bw = bw; 2440 } 2441 /* 2442 * Now lets compensate based on the TCP/IP overhead. Our 2443 * Goodput estimate does not include this so we must pace out 2444 * a bit faster since our pacing calculations do. The pacing 2445 * calculations use the base ETHERNET_SEGMENT_SIZE and the segsiz 2446 * we are using to do this, so we do that here in the opposite 2447 * direction as well. This means that if we are tunneled and the 2448 * segsiz is say 1200 bytes we will get quite a boost, but its 2449 * compensated for in the pacing time the opposite way. 2450 */ 2451 compensate: 2452 ret_bw = rack_compensate_for_linerate(rack, ret_bw); 2453 return(ret_bw); 2454 } 2455 2456 2457 static uint64_t 2458 rack_get_bw(struct tcp_rack *rack) 2459 { 2460 uint64_t bw; 2461 2462 if (rack->use_fixed_rate) { 2463 /* Return the fixed pacing rate */ 2464 return (rack_get_fixed_pacing_bw(rack)); 2465 } 2466 bw = rack_get_gp_est(rack); 2467 return (bw); 2468 } 2469 2470 static uint16_t 2471 rack_get_output_gain(struct tcp_rack *rack, struct rack_sendmap *rsm) 2472 { 2473 if (rack->use_fixed_rate) { 2474 return (100); 2475 } else if (rack->in_probe_rtt && (rsm == NULL)) 2476 return (rack->r_ctl.rack_per_of_gp_probertt); 2477 else if ((IN_FASTRECOVERY(rack->rc_tp->t_flags) && 2478 rack->r_ctl.rack_per_of_gp_rec)) { 2479 if (rsm) { 2480 /* a retransmission always use the recovery rate */ 2481 return (rack->r_ctl.rack_per_of_gp_rec); 2482 } else if (rack->rack_rec_nonrxt_use_cr) { 2483 /* Directed to use the configured rate */ 2484 goto configured_rate; 2485 } else if (rack->rack_no_prr && 2486 (rack->r_ctl.rack_per_of_gp_rec > 100)) { 2487 /* No PRR, lets just use the b/w estimate only */ 2488 return (100); 2489 } else { 2490 /* 2491 * Here we may have a non-retransmit but we 2492 * have no overrides, so just use the recovery 2493 * rate (prr is in effect). 
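			 * (Added note: the value returned here is a
			 * percentage; rack_get_output_bw() multiplies the b/w
			 * estimate by it and divides by 100, so for example a
			 * recovery setting of 150 paces at 1.5 times the
			 * estimate.)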
2494 */ 2495 return (rack->r_ctl.rack_per_of_gp_rec); 2496 } 2497 } 2498 configured_rate: 2499 /* For the configured rate we look at our cwnd vs the ssthresh */ 2500 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) 2501 return (rack->r_ctl.rack_per_of_gp_ss); 2502 else 2503 return (rack->r_ctl.rack_per_of_gp_ca); 2504 } 2505 2506 static void 2507 rack_log_dsack_event(struct tcp_rack *rack, uint8_t mod, uint32_t flex4, uint32_t flex5, uint32_t flex6) 2508 { 2509 /* 2510 * Types of logs (mod value) 2511 * 1 = dsack_persists reduced by 1 via T-O or fast recovery exit. 2512 * 2 = a dsack round begins, persist is reset to 16. 2513 * 3 = a dsack round ends 2514 * 4 = Dsack option increases rack rtt flex5 is the srtt input, flex6 is thresh 2515 * 5 = Socket option set changing the control flags rc_rack_tmr_std_based, rc_rack_use_dsack 2516 * 6 = Final rack rtt, flex4 is srtt and flex6 is final limited thresh. 2517 */ 2518 if (tcp_bblogging_on(rack->rc_tp)) { 2519 union tcp_log_stackspecific log; 2520 struct timeval tv; 2521 2522 memset(&log, 0, sizeof(log)); 2523 log.u_bbr.flex1 = rack->rc_rack_tmr_std_based; 2524 log.u_bbr.flex1 <<= 1; 2525 log.u_bbr.flex1 |= rack->rc_rack_use_dsack; 2526 log.u_bbr.flex1 <<= 1; 2527 log.u_bbr.flex1 |= rack->rc_dsack_round_seen; 2528 log.u_bbr.flex2 = rack->r_ctl.dsack_round_end; 2529 log.u_bbr.flex3 = rack->r_ctl.num_dsack; 2530 log.u_bbr.flex4 = flex4; 2531 log.u_bbr.flex5 = flex5; 2532 log.u_bbr.flex6 = flex6; 2533 log.u_bbr.flex7 = rack->r_ctl.dsack_persist; 2534 log.u_bbr.flex8 = mod; 2535 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2536 log.u_bbr.epoch = rack->r_ctl.current_round; 2537 log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost; 2538 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2539 &rack->rc_inp->inp_socket->so_rcv, 2540 &rack->rc_inp->inp_socket->so_snd, 2541 RACK_DSACK_HANDLING, 0, 2542 0, &log, false, &tv); 2543 } 2544 } 2545 2546 static void 2547 rack_log_hdwr_pacing(struct tcp_rack *rack, 2548 uint64_t rate, uint64_t hw_rate, int line, 2549 int error, uint16_t mod) 2550 { 2551 if (tcp_bblogging_on(rack->rc_tp)) { 2552 union tcp_log_stackspecific log; 2553 struct timeval tv; 2554 const struct ifnet *ifp; 2555 uint64_t ifp64; 2556 2557 memset(&log, 0, sizeof(log)); 2558 log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff); 2559 log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff); 2560 if (rack->r_ctl.crte) { 2561 ifp = rack->r_ctl.crte->ptbl->rs_ifp; 2562 } else if (rack->rc_inp->inp_route.ro_nh && 2563 rack->rc_inp->inp_route.ro_nh->nh_ifp) { 2564 ifp = rack->rc_inp->inp_route.ro_nh->nh_ifp; 2565 } else 2566 ifp = NULL; 2567 if (ifp) { 2568 ifp64 = (uintptr_t)ifp; 2569 log.u_bbr.flex3 = ((ifp64 >> 32) & 0x00000000ffffffff); 2570 log.u_bbr.flex4 = (ifp64 & 0x00000000ffffffff); 2571 } 2572 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2573 log.u_bbr.bw_inuse = rate; 2574 log.u_bbr.flex5 = line; 2575 log.u_bbr.flex6 = error; 2576 log.u_bbr.flex7 = mod; 2577 log.u_bbr.applimited = rack->r_ctl.rc_pace_max_segs; 2578 log.u_bbr.flex8 = rack->use_fixed_rate; 2579 log.u_bbr.flex8 <<= 1; 2580 log.u_bbr.flex8 |= rack->rack_hdrw_pacing; 2581 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; 2582 log.u_bbr.delRate = rack->r_ctl.crte_prev_rate; 2583 if (rack->r_ctl.crte) 2584 log.u_bbr.cur_del_rate = rack->r_ctl.crte->rate; 2585 else 2586 log.u_bbr.cur_del_rate = 0; 2587 log.u_bbr.rttProp = rack->r_ctl.last_hw_bw_req; 2588 log.u_bbr.epoch = rack->r_ctl.current_round; 2589 log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost; 2590 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2591 
&rack->rc_inp->inp_socket->so_rcv, 2592 &rack->rc_inp->inp_socket->so_snd, 2593 BBR_LOG_HDWR_PACE, 0, 2594 0, &log, false, &tv); 2595 } 2596 } 2597 2598 static uint64_t 2599 rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm, int *capped) 2600 { 2601 /* 2602 * We allow rack_per_of_gp_xx to dictate our bw rate we want. 2603 */ 2604 uint64_t bw_est, high_rate; 2605 uint64_t gain; 2606 2607 gain = (uint64_t)rack_get_output_gain(rack, rsm); 2608 bw_est = bw * gain; 2609 bw_est /= (uint64_t)100; 2610 /* Never fall below the minimum (def 64kbps) */ 2611 if (bw_est < RACK_MIN_BW) 2612 bw_est = RACK_MIN_BW; 2613 if (rack->r_rack_hw_rate_caps) { 2614 /* Rate caps are in place */ 2615 if (rack->r_ctl.crte != NULL) { 2616 /* We have a hdwr rate already */ 2617 high_rate = tcp_hw_highest_rate(rack->r_ctl.crte); 2618 if (bw_est >= high_rate) { 2619 /* We are capping bw at the highest rate table entry */ 2620 if (rack_hw_rate_cap_per && 2621 (((high_rate * (100 + rack_hw_rate_cap_per)) / 100) < bw_est)) { 2622 rack->r_rack_hw_rate_caps = 0; 2623 goto done; 2624 } 2625 rack_log_hdwr_pacing(rack, 2626 bw_est, high_rate, __LINE__, 2627 0, 3); 2628 bw_est = high_rate; 2629 if (capped) 2630 *capped = 1; 2631 } 2632 } else if ((rack->rack_hdrw_pacing == 0) && 2633 (rack->rack_hdw_pace_ena) && 2634 (rack->rack_attempt_hdwr_pace == 0) && 2635 (rack->rc_inp->inp_route.ro_nh != NULL) && 2636 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 2637 /* 2638 * Special case, we have not yet attempted hardware 2639 * pacing, and yet we may, when we do, find out if we are 2640 * above the highest rate. We need to know the maxbw for the interface 2641 * in question (if it supports ratelimiting). We get back 2642 * a 0, if the interface is not found in the RL lists. 2643 */ 2644 high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp); 2645 if (high_rate) { 2646 /* Yep, we have a rate is it above this rate? */ 2647 if (bw_est > high_rate) { 2648 bw_est = high_rate; 2649 if (capped) 2650 *capped = 1; 2651 } 2652 } 2653 } 2654 } 2655 done: 2656 return (bw_est); 2657 } 2658 2659 static void 2660 rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod) 2661 { 2662 if (tcp_bblogging_on(rack->rc_tp)) { 2663 union tcp_log_stackspecific log; 2664 struct timeval tv; 2665 2666 if ((mod != 1) && (rack_verbose_logging == 0)) { 2667 /* 2668 * We get 3 values currently for mod 2669 * 1 - We are retransmitting and this tells the reason. 2670 * 2 - We are clearing a dup-ack count. 2671 * 3 - We are incrementing a dup-ack count. 2672 * 2673 * The clear/increment are only logged 2674 * if you have BBverbose on. 
2675 */ 2676 return; 2677 } 2678 memset(&log, 0, sizeof(log)); 2679 log.u_bbr.flex1 = tsused; 2680 log.u_bbr.flex2 = thresh; 2681 log.u_bbr.flex3 = rsm->r_flags; 2682 log.u_bbr.flex4 = rsm->r_dupack; 2683 log.u_bbr.flex5 = rsm->r_start; 2684 log.u_bbr.flex6 = rsm->r_end; 2685 log.u_bbr.flex8 = mod; 2686 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2687 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2688 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2689 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2690 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2691 log.u_bbr.pacing_gain = rack->r_must_retran; 2692 log.u_bbr.epoch = rack->r_ctl.current_round; 2693 log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost; 2694 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2695 &rack->rc_inp->inp_socket->so_rcv, 2696 &rack->rc_inp->inp_socket->so_snd, 2697 BBR_LOG_SETTINGS_CHG, 0, 2698 0, &log, false, &tv); 2699 } 2700 } 2701 2702 static void 2703 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) 2704 { 2705 if (tcp_bblogging_on(rack->rc_tp)) { 2706 union tcp_log_stackspecific log; 2707 struct timeval tv; 2708 2709 memset(&log, 0, sizeof(log)); 2710 log.u_bbr.flex1 = rack->rc_tp->t_srtt; 2711 log.u_bbr.flex2 = to; 2712 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; 2713 log.u_bbr.flex4 = slot; 2714 log.u_bbr.flex5 = rack->rc_tp->t_hpts_slot; 2715 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 2716 log.u_bbr.flex7 = rack->rc_in_persist; 2717 log.u_bbr.flex8 = which; 2718 if (rack->rack_no_prr) 2719 log.u_bbr.pkts_out = 0; 2720 else 2721 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 2722 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2723 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2724 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2725 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2726 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2727 log.u_bbr.pacing_gain = rack->r_must_retran; 2728 log.u_bbr.cwnd_gain = rack->rack_deferred_inited; 2729 log.u_bbr.pkt_epoch = rack->rc_has_collapsed; 2730 log.u_bbr.lt_epoch = rack->rc_tp->t_rxtshift; 2731 log.u_bbr.lost = rack_rto_min; 2732 log.u_bbr.epoch = rack->r_ctl.roundends; 2733 log.u_bbr.bw_inuse = rack->r_ctl.current_round; 2734 log.u_bbr.bw_inuse <<= 32; 2735 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; 2736 log.u_bbr.applimited = rack->rc_tp->t_flags2; 2737 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2738 &rack->rc_inp->inp_socket->so_rcv, 2739 &rack->rc_inp->inp_socket->so_snd, 2740 BBR_LOG_TIMERSTAR, 0, 2741 0, &log, false, &tv); 2742 } 2743 } 2744 2745 static void 2746 rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm) 2747 { 2748 if (tcp_bblogging_on(rack->rc_tp)) { 2749 union tcp_log_stackspecific log; 2750 struct timeval tv; 2751 2752 memset(&log, 0, sizeof(log)); 2753 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2754 log.u_bbr.flex8 = to_num; 2755 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt; 2756 log.u_bbr.flex2 = rack->rc_rack_rtt; 2757 if (rsm == NULL) 2758 log.u_bbr.flex3 = 0; 2759 else 2760 log.u_bbr.flex3 = rsm->r_end - rsm->r_start; 2761 if (rack->rack_no_prr) 2762 log.u_bbr.flex5 = 0; 2763 else 2764 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 2765 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2766 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2767 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2768 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2769 log.u_bbr.pacing_gain = rack->r_must_retran; 
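		/*
		 * (Descriptive comment added: the next three statements pack
		 * <current_round | rc_considered_lost> into the single 64-bit
		 * bw_inuse log field, with the round in the upper 32 bits.)
		 */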
2770 log.u_bbr.bw_inuse = rack->r_ctl.current_round; 2771 log.u_bbr.bw_inuse <<= 32; 2772 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; 2773 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2774 &rack->rc_inp->inp_socket->so_rcv, 2775 &rack->rc_inp->inp_socket->so_snd, 2776 BBR_LOG_RTO, 0, 2777 0, &log, false, &tv); 2778 } 2779 } 2780 2781 static void 2782 rack_log_map_chg(struct tcpcb *tp, struct tcp_rack *rack, 2783 struct rack_sendmap *prev, 2784 struct rack_sendmap *rsm, 2785 struct rack_sendmap *next, 2786 int flag, uint32_t th_ack, int line) 2787 { 2788 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 2789 union tcp_log_stackspecific log; 2790 struct timeval tv; 2791 2792 memset(&log, 0, sizeof(log)); 2793 log.u_bbr.flex8 = flag; 2794 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2795 log.u_bbr.cur_del_rate = (uintptr_t)prev; 2796 log.u_bbr.delRate = (uintptr_t)rsm; 2797 log.u_bbr.rttProp = (uintptr_t)next; 2798 log.u_bbr.flex7 = 0; 2799 if (prev) { 2800 log.u_bbr.flex1 = prev->r_start; 2801 log.u_bbr.flex2 = prev->r_end; 2802 log.u_bbr.flex7 |= 0x4; 2803 } 2804 if (rsm) { 2805 log.u_bbr.flex3 = rsm->r_start; 2806 log.u_bbr.flex4 = rsm->r_end; 2807 log.u_bbr.flex7 |= 0x2; 2808 } 2809 if (next) { 2810 log.u_bbr.flex5 = next->r_start; 2811 log.u_bbr.flex6 = next->r_end; 2812 log.u_bbr.flex7 |= 0x1; 2813 } 2814 log.u_bbr.applimited = line; 2815 log.u_bbr.pkts_out = th_ack; 2816 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2817 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2818 if (rack->rack_no_prr) 2819 log.u_bbr.lost = 0; 2820 else 2821 log.u_bbr.lost = rack->r_ctl.rc_prr_sndcnt; 2822 log.u_bbr.bw_inuse = rack->r_ctl.current_round; 2823 log.u_bbr.bw_inuse <<= 32; 2824 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; 2825 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2826 &rack->rc_inp->inp_socket->so_rcv, 2827 &rack->rc_inp->inp_socket->so_snd, 2828 TCP_LOG_MAPCHG, 0, 2829 0, &log, false, &tv); 2830 } 2831 } 2832 2833 static void 2834 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t len, 2835 struct rack_sendmap *rsm, int conf) 2836 { 2837 if (tcp_bblogging_on(tp)) { 2838 union tcp_log_stackspecific log; 2839 struct timeval tv; 2840 memset(&log, 0, sizeof(log)); 2841 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2842 log.u_bbr.flex1 = t; 2843 log.u_bbr.flex2 = len; 2844 log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt; 2845 log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest; 2846 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest; 2847 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_us_rtrcnt; 2848 log.u_bbr.flex7 = conf; 2849 log.u_bbr.rttProp = (uint64_t)rack->r_ctl.rack_rs.rs_rtt_tot; 2850 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method; 2851 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2852 log.u_bbr.delivered = rack->r_ctl.rack_rs.rs_us_rtrcnt; 2853 log.u_bbr.pkts_out = rack->r_ctl.rack_rs.rs_flags; 2854 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2855 if (rsm) { 2856 log.u_bbr.pkt_epoch = rsm->r_start; 2857 log.u_bbr.lost = rsm->r_end; 2858 log.u_bbr.cwnd_gain = rsm->r_rtr_cnt; 2859 /* We loose any upper of the 24 bits */ 2860 log.u_bbr.pacing_gain = (uint16_t)rsm->r_flags; 2861 } else { 2862 /* Its a SYN */ 2863 log.u_bbr.pkt_epoch = rack->rc_tp->iss; 2864 log.u_bbr.lost = 0; 2865 log.u_bbr.cwnd_gain = 0; 2866 log.u_bbr.pacing_gain = 0; 2867 } 2868 /* Write out general bits of interest rrs here */ 2869 log.u_bbr.use_lt_bw = rack->rc_highly_buffered; 2870 log.u_bbr.use_lt_bw <<= 1; 2871 
log.u_bbr.use_lt_bw |= rack->forced_ack; 2872 log.u_bbr.use_lt_bw <<= 1; 2873 log.u_bbr.use_lt_bw |= rack->rc_gp_dyn_mul; 2874 log.u_bbr.use_lt_bw <<= 1; 2875 log.u_bbr.use_lt_bw |= rack->in_probe_rtt; 2876 log.u_bbr.use_lt_bw <<= 1; 2877 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; 2878 log.u_bbr.use_lt_bw <<= 1; 2879 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set; 2880 log.u_bbr.use_lt_bw <<= 1; 2881 log.u_bbr.use_lt_bw |= rack->rc_gp_filled; 2882 log.u_bbr.use_lt_bw <<= 1; 2883 log.u_bbr.use_lt_bw |= rack->rc_dragged_bottom; 2884 log.u_bbr.applimited = rack->r_ctl.rc_target_probertt_flight; 2885 log.u_bbr.epoch = rack->r_ctl.rc_time_probertt_starts; 2886 log.u_bbr.lt_epoch = rack->r_ctl.rc_time_probertt_entered; 2887 log.u_bbr.cur_del_rate = rack->r_ctl.rc_lower_rtt_us_cts; 2888 log.u_bbr.delRate = rack->r_ctl.rc_gp_srtt; 2889 log.u_bbr.bw_inuse = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time); 2890 log.u_bbr.bw_inuse <<= 32; 2891 if (rsm) 2892 log.u_bbr.bw_inuse |= ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]); 2893 TCP_LOG_EVENTP(tp, NULL, 2894 &rack->rc_inp->inp_socket->so_rcv, 2895 &rack->rc_inp->inp_socket->so_snd, 2896 BBR_LOG_BBRRTT, 0, 2897 0, &log, false, &tv); 2898 2899 2900 } 2901 } 2902 2903 static void 2904 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt) 2905 { 2906 /* 2907 * Log the rtt sample we are 2908 * applying to the srtt algorithm in 2909 * useconds. 2910 */ 2911 if (tcp_bblogging_on(rack->rc_tp)) { 2912 union tcp_log_stackspecific log; 2913 struct timeval tv; 2914 2915 /* Convert our ms to a microsecond */ 2916 memset(&log, 0, sizeof(log)); 2917 log.u_bbr.flex1 = rtt; 2918 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 2919 log.u_bbr.flex7 = 1; 2920 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2921 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2922 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2923 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2924 log.u_bbr.pacing_gain = rack->r_must_retran; 2925 /* 2926 * We capture in delRate the upper 32 bits as 2927 * the confidence level we had declared, and the 2928 * lower 32 bits as the actual RTT using the arrival 2929 * timestamp. 
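		 * In other words (equivalent expression, added for clarity):
		 * delRate = ((uint64_t)confidence << 32) | rs_us_rtt.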
2930 */ 2931 log.u_bbr.delRate = rack->r_ctl.rack_rs.confidence; 2932 log.u_bbr.delRate <<= 32; 2933 log.u_bbr.delRate |= rack->r_ctl.rack_rs.rs_us_rtt; 2934 /* Lets capture all the things that make up t_rtxcur */ 2935 log.u_bbr.applimited = rack_rto_min; 2936 log.u_bbr.epoch = rack_rto_max; 2937 log.u_bbr.lt_epoch = rack->r_ctl.timer_slop; 2938 log.u_bbr.lost = rack_rto_min; 2939 log.u_bbr.pkt_epoch = TICKS_2_USEC(tcp_rexmit_slop); 2940 log.u_bbr.rttProp = RACK_REXMTVAL(rack->rc_tp); 2941 log.u_bbr.bw_inuse = rack->r_ctl.act_rcv_time.tv_sec; 2942 log.u_bbr.bw_inuse *= HPTS_USEC_IN_SEC; 2943 log.u_bbr.bw_inuse += rack->r_ctl.act_rcv_time.tv_usec; 2944 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2945 &rack->rc_inp->inp_socket->so_rcv, 2946 &rack->rc_inp->inp_socket->so_snd, 2947 TCP_LOG_RTT, 0, 2948 0, &log, false, &tv); 2949 } 2950 } 2951 2952 static void 2953 rack_log_rtt_sample_calc(struct tcp_rack *rack, uint32_t rtt, uint32_t send_time, uint32_t ack_time, int where) 2954 { 2955 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 2956 union tcp_log_stackspecific log; 2957 struct timeval tv; 2958 2959 /* Convert our ms to a microsecond */ 2960 memset(&log, 0, sizeof(log)); 2961 log.u_bbr.flex1 = rtt; 2962 log.u_bbr.flex2 = send_time; 2963 log.u_bbr.flex3 = ack_time; 2964 log.u_bbr.flex4 = where; 2965 log.u_bbr.flex7 = 2; 2966 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2967 log.u_bbr.bw_inuse = rack->r_ctl.current_round; 2968 log.u_bbr.bw_inuse <<= 32; 2969 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; 2970 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2971 &rack->rc_inp->inp_socket->so_rcv, 2972 &rack->rc_inp->inp_socket->so_snd, 2973 TCP_LOG_RTT, 0, 2974 0, &log, false, &tv); 2975 } 2976 } 2977 2978 2979 static void 2980 rack_log_rtt_sendmap(struct tcp_rack *rack, uint32_t idx, uint64_t tsv, uint32_t tsecho) 2981 { 2982 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 2983 union tcp_log_stackspecific log; 2984 struct timeval tv; 2985 2986 /* Convert our ms to a microsecond */ 2987 memset(&log, 0, sizeof(log)); 2988 log.u_bbr.flex1 = idx; 2989 log.u_bbr.flex2 = rack_ts_to_msec(tsv); 2990 log.u_bbr.flex3 = tsecho; 2991 log.u_bbr.flex7 = 3; 2992 log.u_bbr.rttProp = tsv; 2993 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2994 log.u_bbr.bw_inuse = rack->r_ctl.current_round; 2995 log.u_bbr.bw_inuse <<= 32; 2996 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; 2997 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2998 &rack->rc_inp->inp_socket->so_rcv, 2999 &rack->rc_inp->inp_socket->so_snd, 3000 TCP_LOG_RTT, 0, 3001 0, &log, false, &tv); 3002 } 3003 } 3004 3005 3006 static inline void 3007 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line) 3008 { 3009 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 3010 union tcp_log_stackspecific log; 3011 struct timeval tv; 3012 3013 memset(&log, 0, sizeof(log)); 3014 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 3015 log.u_bbr.flex1 = line; 3016 log.u_bbr.flex2 = tick; 3017 log.u_bbr.flex3 = tp->t_maxunacktime; 3018 log.u_bbr.flex4 = tp->t_acktime; 3019 log.u_bbr.flex8 = event; 3020 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3021 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3022 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 3023 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 3024 log.u_bbr.pacing_gain = rack->r_must_retran; 3025 log.u_bbr.bw_inuse = rack->r_ctl.current_round; 3026 log.u_bbr.bw_inuse <<= 32; 3027 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; 
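/* As at the other log points, bw_inuse carries current_round in its upper 32 bits and rc_considered_lost in the lower 32. */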
3028 TCP_LOG_EVENTP(tp, NULL, 3029 &rack->rc_inp->inp_socket->so_rcv, 3030 &rack->rc_inp->inp_socket->so_snd, 3031 BBR_LOG_PROGRESS, 0, 3032 0, &log, false, &tv); 3033 } 3034 } 3035 3036 static void 3037 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv, int line) 3038 { 3039 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 3040 union tcp_log_stackspecific log; 3041 3042 memset(&log, 0, sizeof(log)); 3043 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 3044 log.u_bbr.flex1 = slot; 3045 if (rack->rack_no_prr) 3046 log.u_bbr.flex2 = 0; 3047 else 3048 log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt; 3049 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 3050 log.u_bbr.flex6 = line; 3051 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags); 3052 log.u_bbr.flex8 = rack->rc_in_persist; 3053 log.u_bbr.timeStamp = cts; 3054 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3055 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 3056 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 3057 log.u_bbr.pacing_gain = rack->r_must_retran; 3058 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3059 &rack->rc_inp->inp_socket->so_rcv, 3060 &rack->rc_inp->inp_socket->so_snd, 3061 BBR_LOG_BBRSND, 0, 3062 0, &log, false, tv); 3063 } 3064 } 3065 3066 static void 3067 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out, int nsegs) 3068 { 3069 if (tcp_bblogging_on(rack->rc_tp)) { 3070 union tcp_log_stackspecific log; 3071 struct timeval tv; 3072 3073 memset(&log, 0, sizeof(log)); 3074 log.u_bbr.flex1 = did_out; 3075 log.u_bbr.flex2 = nxt_pkt; 3076 log.u_bbr.flex3 = way_out; 3077 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 3078 if (rack->rack_no_prr) 3079 log.u_bbr.flex5 = 0; 3080 else 3081 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 3082 log.u_bbr.flex6 = nsegs; 3083 log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs; 3084 log.u_bbr.flex7 = rack->rc_ack_can_sendout_data; /* Do we have ack-can-send set */ 3085 log.u_bbr.flex7 <<= 1; 3086 log.u_bbr.flex7 |= rack->r_fast_output; /* is fast output primed */ 3087 log.u_bbr.flex7 <<= 1; 3088 log.u_bbr.flex7 |= rack->r_wanted_output; /* Do we want output */ 3089 log.u_bbr.flex8 = rack->rc_in_persist; 3090 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 3091 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3092 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3093 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 3094 log.u_bbr.use_lt_bw <<= 1; 3095 log.u_bbr.use_lt_bw |= rack->r_might_revert; 3096 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 3097 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 3098 log.u_bbr.pacing_gain = rack->r_must_retran; 3099 log.u_bbr.bw_inuse = rack->r_ctl.current_round; 3100 log.u_bbr.bw_inuse <<= 32; 3101 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; 3102 log.u_bbr.epoch = rack->rc_inp->inp_socket->so_snd.sb_hiwat; 3103 log.u_bbr.lt_epoch = rack->rc_inp->inp_socket->so_rcv.sb_hiwat; 3104 log.u_bbr.lost = rack->rc_tp->t_srtt; 3105 log.u_bbr.pkt_epoch = rack->rc_tp->rfbuf_cnt; 3106 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3107 &rack->rc_inp->inp_socket->so_rcv, 3108 &rack->rc_inp->inp_socket->so_snd, 3109 BBR_LOG_DOSEG_DONE, 0, 3110 0, &log, false, &tv); 3111 } 3112 } 3113 3114 static void 3115 rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint8_t frm) 3116 { 3117 if (tcp_bblogging_on(rack->rc_tp)) { 3118 union tcp_log_stackspecific log; 3119 
struct timeval tv; 3120 3121 memset(&log, 0, sizeof(log)); 3122 log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs; 3123 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 3124 log.u_bbr.flex4 = arg1; 3125 log.u_bbr.flex5 = arg2; 3126 log.u_bbr.flex7 = rack->r_ctl.rc_user_set_min_segs; 3127 log.u_bbr.flex6 = arg3; 3128 log.u_bbr.flex8 = frm; 3129 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3130 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3131 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 3132 log.u_bbr.applimited = rack->r_ctl.rc_sacked; 3133 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 3134 log.u_bbr.pacing_gain = rack->r_must_retran; 3135 TCP_LOG_EVENTP(tp, NULL, &tptosocket(tp)->so_rcv, 3136 &tptosocket(tp)->so_snd, 3137 TCP_HDWR_PACE_SIZE, 0, 0, &log, false, &tv); 3138 } 3139 } 3140 3141 static void 3142 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, 3143 uint8_t hpts_calling, int reason, uint32_t cwnd_to_use) 3144 { 3145 if (tcp_bblogging_on(rack->rc_tp)) { 3146 union tcp_log_stackspecific log; 3147 struct timeval tv; 3148 3149 memset(&log, 0, sizeof(log)); 3150 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 3151 log.u_bbr.flex1 = slot; 3152 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; 3153 log.u_bbr.flex4 = reason; 3154 if (rack->rack_no_prr) 3155 log.u_bbr.flex5 = 0; 3156 else 3157 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 3158 log.u_bbr.flex7 = hpts_calling; 3159 log.u_bbr.flex8 = rack->rc_in_persist; 3160 log.u_bbr.lt_epoch = cwnd_to_use; 3161 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3162 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3163 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 3164 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 3165 log.u_bbr.pacing_gain = rack->r_must_retran; 3166 log.u_bbr.cwnd_gain = rack->rc_has_collapsed; 3167 log.u_bbr.bw_inuse = rack->r_ctl.current_round; 3168 log.u_bbr.bw_inuse <<= 32; 3169 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; 3170 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3171 &rack->rc_inp->inp_socket->so_rcv, 3172 &rack->rc_inp->inp_socket->so_snd, 3173 BBR_LOG_JUSTRET, 0, 3174 tlen, &log, false, &tv); 3175 } 3176 } 3177 3178 static void 3179 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32_t us_cts, 3180 struct timeval *tv, uint32_t flags_on_entry) 3181 { 3182 if (tcp_bblogging_on(rack->rc_tp)) { 3183 union tcp_log_stackspecific log; 3184 3185 memset(&log, 0, sizeof(log)); 3186 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 3187 log.u_bbr.flex1 = line; 3188 log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to; 3189 log.u_bbr.flex3 = flags_on_entry; 3190 log.u_bbr.flex4 = us_cts; 3191 if (rack->rack_no_prr) 3192 log.u_bbr.flex5 = 0; 3193 else 3194 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 3195 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 3196 log.u_bbr.flex7 = hpts_removed; 3197 log.u_bbr.flex8 = 1; 3198 log.u_bbr.applimited = rack->r_ctl.rc_hpts_flags; 3199 log.u_bbr.timeStamp = us_cts; 3200 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3201 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 3202 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 3203 log.u_bbr.pacing_gain = rack->r_must_retran; 3204 log.u_bbr.bw_inuse = rack->r_ctl.current_round; 3205 log.u_bbr.bw_inuse <<= 32; 3206 log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; 3207 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3208 &rack->rc_inp->inp_socket->so_rcv, 3209 &rack->rc_inp->inp_socket->so_snd, 3210 
BBR_LOG_TIMERCANC, 0, 3211 0, &log, false, tv); 3212 } 3213 } 3214 3215 static void 3216 rack_log_alt_to_to_cancel(struct tcp_rack *rack, 3217 uint32_t flex1, uint32_t flex2, 3218 uint32_t flex3, uint32_t flex4, 3219 uint32_t flex5, uint32_t flex6, 3220 uint16_t flex7, uint8_t mod) 3221 { 3222 if (tcp_bblogging_on(rack->rc_tp)) { 3223 union tcp_log_stackspecific log; 3224 struct timeval tv; 3225 3226 if (mod == 1) { 3227 /* No you can't use 1, its for the real to cancel */ 3228 return; 3229 } 3230 memset(&log, 0, sizeof(log)); 3231 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3232 log.u_bbr.flex1 = flex1; 3233 log.u_bbr.flex2 = flex2; 3234 log.u_bbr.flex3 = flex3; 3235 log.u_bbr.flex4 = flex4; 3236 log.u_bbr.flex5 = flex5; 3237 log.u_bbr.flex6 = flex6; 3238 log.u_bbr.flex7 = flex7; 3239 log.u_bbr.flex8 = mod; 3240 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3241 &rack->rc_inp->inp_socket->so_rcv, 3242 &rack->rc_inp->inp_socket->so_snd, 3243 BBR_LOG_TIMERCANC, 0, 3244 0, &log, false, &tv); 3245 } 3246 } 3247 3248 static void 3249 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers) 3250 { 3251 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 3252 union tcp_log_stackspecific log; 3253 struct timeval tv; 3254 3255 memset(&log, 0, sizeof(log)); 3256 log.u_bbr.flex1 = timers; 3257 log.u_bbr.flex2 = ret; 3258 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp; 3259 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 3260 log.u_bbr.flex5 = cts; 3261 if (rack->rack_no_prr) 3262 log.u_bbr.flex6 = 0; 3263 else 3264 log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt; 3265 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 3266 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 3267 log.u_bbr.pacing_gain = rack->r_must_retran; 3268 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3269 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3270 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3271 &rack->rc_inp->inp_socket->so_rcv, 3272 &rack->rc_inp->inp_socket->so_snd, 3273 BBR_LOG_TO_PROCESS, 0, 3274 0, &log, false, &tv); 3275 } 3276 } 3277 3278 static void 3279 rack_log_to_prr(struct tcp_rack *rack, int frm, int orig_cwnd, int line) 3280 { 3281 if (tcp_bblogging_on(rack->rc_tp)) { 3282 union tcp_log_stackspecific log; 3283 struct timeval tv; 3284 3285 memset(&log, 0, sizeof(log)); 3286 log.u_bbr.flex1 = rack->r_ctl.rc_prr_out; 3287 log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs; 3288 if (rack->rack_no_prr) 3289 log.u_bbr.flex3 = 0; 3290 else 3291 log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt; 3292 log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered; 3293 log.u_bbr.flex5 = rack->r_ctl.rc_sacked; 3294 log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt; 3295 log.u_bbr.flex7 = line; 3296 log.u_bbr.flex8 = frm; 3297 log.u_bbr.pkts_out = orig_cwnd; 3298 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3299 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3300 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 3301 log.u_bbr.use_lt_bw <<= 1; 3302 log.u_bbr.use_lt_bw |= rack->r_might_revert; 3303 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3304 &rack->rc_inp->inp_socket->so_rcv, 3305 &rack->rc_inp->inp_socket->so_snd, 3306 BBR_LOG_BBRUPD, 0, 3307 0, &log, false, &tv); 3308 } 3309 } 3310 3311 static void 3312 rack_counter_destroy(void) 3313 { 3314 counter_u64_free(rack_total_bytes); 3315 counter_u64_free(rack_fto_send); 3316 counter_u64_free(rack_fto_rsm_send); 3317 counter_u64_free(rack_nfto_resend); 3318 counter_u64_free(rack_hw_pace_init_fail); 3319 counter_u64_free(rack_hw_pace_lost); 3320 
counter_u64_free(rack_non_fto_send); 3321 counter_u64_free(rack_extended_rfo); 3322 counter_u64_free(rack_ack_total); 3323 counter_u64_free(rack_express_sack); 3324 counter_u64_free(rack_sack_total); 3325 counter_u64_free(rack_move_none); 3326 counter_u64_free(rack_move_some); 3327 counter_u64_free(rack_sack_attacks_detected); 3328 counter_u64_free(rack_sack_attacks_reversed); 3329 counter_u64_free(rack_sack_attacks_suspect); 3330 counter_u64_free(rack_sack_used_next_merge); 3331 counter_u64_free(rack_sack_used_prev_merge); 3332 counter_u64_free(rack_tlp_tot); 3333 counter_u64_free(rack_tlp_newdata); 3334 counter_u64_free(rack_tlp_retran); 3335 counter_u64_free(rack_tlp_retran_bytes); 3336 counter_u64_free(rack_to_tot); 3337 counter_u64_free(rack_saw_enobuf); 3338 counter_u64_free(rack_saw_enobuf_hw); 3339 counter_u64_free(rack_saw_enetunreach); 3340 counter_u64_free(rack_hot_alloc); 3341 counter_u64_free(rack_to_alloc); 3342 counter_u64_free(rack_to_alloc_hard); 3343 counter_u64_free(rack_to_alloc_emerg); 3344 counter_u64_free(rack_to_alloc_limited); 3345 counter_u64_free(rack_alloc_limited_conns); 3346 counter_u64_free(rack_split_limited); 3347 counter_u64_free(rack_multi_single_eq); 3348 counter_u64_free(rack_rxt_clamps_cwnd); 3349 counter_u64_free(rack_rxt_clamps_cwnd_uniq); 3350 counter_u64_free(rack_proc_non_comp_ack); 3351 counter_u64_free(rack_sack_proc_all); 3352 counter_u64_free(rack_sack_proc_restart); 3353 counter_u64_free(rack_sack_proc_short); 3354 counter_u64_free(rack_sack_skipped_acked); 3355 counter_u64_free(rack_sack_splits); 3356 counter_u64_free(rack_input_idle_reduces); 3357 counter_u64_free(rack_collapsed_win); 3358 counter_u64_free(rack_collapsed_win_rxt); 3359 counter_u64_free(rack_collapsed_win_rxt_bytes); 3360 counter_u64_free(rack_collapsed_win_seen); 3361 counter_u64_free(rack_try_scwnd); 3362 counter_u64_free(rack_persists_sends); 3363 counter_u64_free(rack_persists_acks); 3364 counter_u64_free(rack_persists_loss); 3365 counter_u64_free(rack_persists_lost_ends); 3366 #ifdef INVARIANTS 3367 counter_u64_free(rack_adjust_map_bw); 3368 #endif 3369 COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE); 3370 COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE); 3371 } 3372 3373 static struct rack_sendmap * 3374 rack_alloc(struct tcp_rack *rack) 3375 { 3376 struct rack_sendmap *rsm; 3377 3378 /* 3379 * First get the top of the list it in 3380 * theory is the "hottest" rsm we have, 3381 * possibly just freed by ack processing. 3382 */ 3383 if (rack->rc_free_cnt > rack_free_cache) { 3384 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 3385 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 3386 counter_u64_add(rack_hot_alloc, 1); 3387 rack->rc_free_cnt--; 3388 return (rsm); 3389 } 3390 /* 3391 * Once we get under our free cache we probably 3392 * no longer have a "hot" one available. Lets 3393 * get one from UMA. 3394 */ 3395 rsm = uma_zalloc(rack_zone, M_NOWAIT); 3396 if (rsm) { 3397 rack->r_ctl.rc_num_maps_alloced++; 3398 counter_u64_add(rack_to_alloc, 1); 3399 return (rsm); 3400 } 3401 /* 3402 * Dig in to our aux rsm's (the last two) since 3403 * UMA failed to get us one. 
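* These are the reserve entries normally held back by the
* rack_free_cache check above; handing one out here is an
* emergency allocation and is counted as such.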
3404 */ 3405 if (rack->rc_free_cnt) { 3406 counter_u64_add(rack_to_alloc_emerg, 1); 3407 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 3408 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 3409 rack->rc_free_cnt--; 3410 return (rsm); 3411 } 3412 return (NULL); 3413 } 3414 3415 static struct rack_sendmap * 3416 rack_alloc_full_limit(struct tcp_rack *rack) 3417 { 3418 if ((V_tcp_map_entries_limit > 0) && 3419 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 3420 counter_u64_add(rack_to_alloc_limited, 1); 3421 if (!rack->alloc_limit_reported) { 3422 rack->alloc_limit_reported = 1; 3423 counter_u64_add(rack_alloc_limited_conns, 1); 3424 } 3425 return (NULL); 3426 } 3427 return (rack_alloc(rack)); 3428 } 3429 3430 /* wrapper to allocate a sendmap entry, subject to a specific limit */ 3431 static struct rack_sendmap * 3432 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type) 3433 { 3434 struct rack_sendmap *rsm; 3435 3436 if (limit_type) { 3437 /* currently there is only one limit type */ 3438 if (rack->r_ctl.rc_split_limit > 0 && 3439 rack->r_ctl.rc_num_split_allocs >= rack->r_ctl.rc_split_limit) { 3440 counter_u64_add(rack_split_limited, 1); 3441 if (!rack->alloc_limit_reported) { 3442 rack->alloc_limit_reported = 1; 3443 counter_u64_add(rack_alloc_limited_conns, 1); 3444 } 3445 return (NULL); 3446 } 3447 } 3448 3449 /* allocate and mark in the limit type, if set */ 3450 rsm = rack_alloc(rack); 3451 if (rsm != NULL && limit_type) { 3452 rsm->r_limit_type = limit_type; 3453 rack->r_ctl.rc_num_split_allocs++; 3454 } 3455 return (rsm); 3456 } 3457 3458 static void 3459 rack_free_trim(struct tcp_rack *rack) 3460 { 3461 struct rack_sendmap *rsm; 3462 3463 /* 3464 * Free up all the tail entries until 3465 * we get our list down to the limit. 
3466 */ 3467 while (rack->rc_free_cnt > rack_free_cache) { 3468 rsm = TAILQ_LAST(&rack->r_ctl.rc_free, rack_head); 3469 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 3470 rack->rc_free_cnt--; 3471 rack->r_ctl.rc_num_maps_alloced--; 3472 uma_zfree(rack_zone, rsm); 3473 } 3474 } 3475 3476 static void 3477 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) 3478 { 3479 if (rsm->r_flags & RACK_APP_LIMITED) { 3480 KASSERT((rack->r_ctl.rc_app_limited_cnt > 0), 3481 ("app_cnt %u, rsm %p", rack->r_ctl.rc_app_limited_cnt, rsm)); 3482 rack->r_ctl.rc_app_limited_cnt--; 3483 } 3484 if (rsm->r_limit_type) { 3485 /* currently there is only one limit type */ 3486 rack->r_ctl.rc_num_split_allocs--; 3487 } 3488 if (rsm == rack->r_ctl.rc_first_appl) { 3489 rack->r_ctl.cleared_app_ack_seq = rsm->r_end; 3490 rack->r_ctl.cleared_app_ack = 1; 3491 if (rack->r_ctl.rc_app_limited_cnt == 0) 3492 rack->r_ctl.rc_first_appl = NULL; 3493 else 3494 rack->r_ctl.rc_first_appl = tqhash_find(rack->r_ctl.tqh, rsm->r_nseq_appl); 3495 } 3496 if (rsm == rack->r_ctl.rc_resend) 3497 rack->r_ctl.rc_resend = NULL; 3498 if (rsm == rack->r_ctl.rc_end_appl) 3499 rack->r_ctl.rc_end_appl = NULL; 3500 if (rack->r_ctl.rc_tlpsend == rsm) 3501 rack->r_ctl.rc_tlpsend = NULL; 3502 if (rack->r_ctl.rc_sacklast == rsm) 3503 rack->r_ctl.rc_sacklast = NULL; 3504 memset(rsm, 0, sizeof(struct rack_sendmap)); 3505 /* Make sure we are not going to overrun our count limit of 0xff */ 3506 if ((rack->rc_free_cnt + 1) > RACK_FREE_CNT_MAX) { 3507 rack_free_trim(rack); 3508 } 3509 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_free, rsm, r_tnext); 3510 rack->rc_free_cnt++; 3511 } 3512 3513 static uint32_t 3514 rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack) 3515 { 3516 uint64_t srtt, bw, len, tim; 3517 uint32_t segsiz, def_len, minl; 3518 3519 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 3520 def_len = rack_def_data_window * segsiz; 3521 if (rack->rc_gp_filled == 0) { 3522 /* 3523 * We have no measurement (IW is in flight?) so 3524 * we can only guess using our data_window sysctl 3525 * value (usually 20MSS). 3526 */ 3527 return (def_len); 3528 } 3529 /* 3530 * Now we have a number of factors to consider. 3531 * 3532 * 1) We have a desired BDP which is usually 3533 * at least 2. 3534 * 2) We have a minimum number of rtt's usually 1 SRTT 3535 * but we allow it too to be more. 3536 * 3) We want to make sure a measurement last N useconds (if 3537 * we have set rack_min_measure_usec. 3538 * 3539 * We handle the first concern here by trying to create a data 3540 * window of max(rack_def_data_window, DesiredBDP). The 3541 * second concern we handle in not letting the measurement 3542 * window end normally until at least the required SRTT's 3543 * have gone by which is done further below in 3544 * rack_enough_for_measurement(). Finally the third concern 3545 * we also handle here by calculating how long that time 3546 * would take at the current BW and then return the 3547 * max of our first calculation and that length. Note 3548 * that if rack_min_measure_usec is 0, we don't deal 3549 * with concern 3. Also for both Concern 1 and 3 an 3550 * application limited period could end the measurement 3551 * earlier. 3552 * 3553 * So lets calculate the BDP with the "known" b/w using 3554 * the SRTT as our rtt and then multiply it by the goal. 
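* For illustration: with a measured b/w of 12,500,000 bytes/sec,
* an SRTT of 40,000 usecs and rack_goal_bdp of 2 this gives
* 12500000 * 40000 / 1000000 * 2 = 1,000,000 bytes, which is then
* rounded up to a multiple of segsiz.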
3555 */ 3556 bw = rack_get_bw(rack); 3557 srtt = (uint64_t)tp->t_srtt; 3558 len = bw * srtt; 3559 len /= (uint64_t)HPTS_USEC_IN_SEC; 3560 len *= max(1, rack_goal_bdp);
3561 /* Now we need to round up to the nearest MSS */ 3562 len = roundup(len, segsiz); 3563 if (rack_min_measure_usec) { 3564 /* Now calculate our min length for this b/w */ 3565 tim = rack_min_measure_usec; 3566 minl = (tim * bw) / (uint64_t)HPTS_USEC_IN_SEC; 3567 if (minl == 0) 3568 minl = 1; 3569 minl = roundup(minl, segsiz); 3570 if (len < minl) 3571 len = minl; 3572 }
3573 /* 3574 * Now if we have a very small window we want 3575 * to attempt to get the window that is 3576 * as small as possible. This happens on 3577 * low b/w connections and we don't want to 3578 * span huge numbers of rtt's between measurements. 3579 * 3580 * We basically include 2 over our "MIN window" so 3581 * that the measurement can be shortened (possibly) by 3582 * an ack'ed packet. 3583 */
3584 if (len < def_len) 3585 return (max((uint32_t)len, ((MIN_GP_WIN+2) * segsiz))); 3586 else 3587 return (max((uint32_t)len, def_len)); 3588 3589 } 3590
3591 static int 3592 rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack, uint8_t *quality) 3593 { 3594 uint32_t tim, srtts, segsiz; 3595
3596 /* 3597 * Has enough time passed for the GP measurement to be valid? 3598 */ 3599 if (SEQ_LT(th_ack, tp->gput_seq)) { 3600 /* Not enough bytes yet */ 3601 return (0); 3602 }
3603 if ((tp->snd_max == tp->snd_una) || 3604 (th_ack == tp->snd_max)){ 3605 /* 3606 * All is acked; the quality of an all-acked measurement is 3607 * usually low or medium, but we in theory could split 3608 * all acked into two cases, where you got 3609 * a significant amount of your window and 3610 * where you did not. For now we leave it 3611 * but it is something to contemplate in the 3612 * future. The danger here is that delayed ack 3613 * is affecting the last byte (which is a 50:50 chance). 3614 */ 3615 *quality = RACK_QUALITY_ALLACKED; 3616 return (1); 3617 }
3618 if (SEQ_GEQ(th_ack, tp->gput_ack)) { 3619 /* 3620 * We obtained our entire window of data we wanted, 3621 * so no matter if we are in recovery or not 3622 * it's ok, since expanding the window does not 3623 * make things fuzzy (or at least not as much). 3624 */ 3625 *quality = RACK_QUALITY_HIGH; 3626 return (1); 3627 }
3628 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 3629 if (SEQ_LT(th_ack, tp->gput_ack) && 3630 ((th_ack - tp->gput_seq) < max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { 3631 /* Not enough bytes yet */ 3632 return (0); 3633 }
3634 if (rack->r_ctl.rc_first_appl && 3635 (SEQ_GEQ(th_ack, rack->r_ctl.rc_first_appl->r_end))) { 3636 /* 3637 * We are up to the app limited send point, 3638 * so we have to measure irrespective of the time. 3639 */ 3640 *quality = RACK_QUALITY_APPLIMITED; 3641 return (1); 3642 }
3643 /* Now what about time? */ 3644 srtts = (rack->r_ctl.rc_gp_srtt * rack_min_srtts); 3645 tim = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time) - tp->gput_ts; 3646 if ((tim >= srtts) && (IN_RECOVERY(rack->rc_tp->t_flags) == 0)) { 3647 /* 3648 * We do not allow a measurement if we are in recovery, 3649 * as that would shrink the goodput window we wanted. 3650 * This is to prevent cloudiness about when the last send 3651 * was actually made.
3652 */ 3653 *quality = RACK_QUALITY_HIGH; 3654 return (1); 3655 } 3656 /* Nope not even a full SRTT has passed */ 3657 return (0); 3658 } 3659 3660 static void 3661 rack_log_timely(struct tcp_rack *rack, 3662 uint32_t logged, uint64_t cur_bw, uint64_t low_bnd, 3663 uint64_t up_bnd, int line, uint8_t method) 3664 { 3665 if (tcp_bblogging_on(rack->rc_tp)) { 3666 union tcp_log_stackspecific log; 3667 struct timeval tv; 3668 3669 memset(&log, 0, sizeof(log)); 3670 log.u_bbr.flex1 = logged; 3671 log.u_bbr.flex2 = rack->rc_gp_timely_inc_cnt; 3672 log.u_bbr.flex2 <<= 4; 3673 log.u_bbr.flex2 |= rack->rc_gp_timely_dec_cnt; 3674 log.u_bbr.flex2 <<= 4; 3675 log.u_bbr.flex2 |= rack->rc_gp_incr; 3676 log.u_bbr.flex2 <<= 4; 3677 log.u_bbr.flex2 |= rack->rc_gp_bwred; 3678 log.u_bbr.flex3 = rack->rc_gp_incr; 3679 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; 3680 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ca; 3681 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_rec; 3682 log.u_bbr.flex7 = rack->rc_gp_bwred; 3683 log.u_bbr.flex8 = method; 3684 log.u_bbr.cur_del_rate = cur_bw; 3685 log.u_bbr.delRate = low_bnd; 3686 log.u_bbr.bw_inuse = up_bnd; 3687 log.u_bbr.rttProp = rack_get_bw(rack); 3688 log.u_bbr.pkt_epoch = line; 3689 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; 3690 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3691 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3692 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; 3693 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; 3694 log.u_bbr.cwnd_gain = rack->rc_dragged_bottom; 3695 log.u_bbr.cwnd_gain <<= 1; 3696 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_rec; 3697 log.u_bbr.cwnd_gain <<= 1; 3698 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; 3699 log.u_bbr.cwnd_gain <<= 1; 3700 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; 3701 log.u_bbr.lost = rack->r_ctl.rc_loss_count; 3702 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3703 &rack->rc_inp->inp_socket->so_rcv, 3704 &rack->rc_inp->inp_socket->so_snd, 3705 TCP_TIMELY_WORK, 0, 3706 0, &log, false, &tv); 3707 } 3708 } 3709 3710 static int 3711 rack_bw_can_be_raised(struct tcp_rack *rack, uint64_t cur_bw, uint64_t last_bw_est, uint16_t mult) 3712 { 3713 /* 3714 * Before we increase we need to know if 3715 * the estimate just made was less than 3716 * our pacing goal (i.e. (cur_bw * mult) > last_bw_est) 3717 * 3718 * If we already are pacing at a fast enough 3719 * rate to push us faster there is no sense of 3720 * increasing. 3721 * 3722 * We first caculate our actual pacing rate (ss or ca multiplier 3723 * times our cur_bw). 3724 * 3725 * Then we take the last measured rate and multipy by our 3726 * maximum pacing overage to give us a max allowable rate. 3727 * 3728 * If our act_rate is smaller than our max_allowable rate 3729 * then we should increase. Else we should hold steady. 3730 * 3731 */ 3732 uint64_t act_rate, max_allow_rate; 3733 3734 if (rack_timely_no_stopping) 3735 return (1); 3736 3737 if ((cur_bw == 0) || (last_bw_est == 0)) { 3738 /* 3739 * Initial startup case or 3740 * everything is acked case. 3741 */ 3742 rack_log_timely(rack, mult, cur_bw, 0, 0, 3743 __LINE__, 9); 3744 return (1); 3745 } 3746 if (mult <= 100) { 3747 /* 3748 * We can always pace at or slightly above our rate. 
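* (At or below 100 we are not even pacing above what we last
* measured, so an increase is always worth trying.)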
3749 */ 3750 rack_log_timely(rack, mult, cur_bw, 0, 0, 3751 __LINE__, 9); 3752 return (1); 3753 } 3754 act_rate = cur_bw * (uint64_t)mult; 3755 act_rate /= 100; 3756 max_allow_rate = last_bw_est * ((uint64_t)rack_max_per_above + (uint64_t)100); 3757 max_allow_rate /= 100; 3758 if (act_rate < max_allow_rate) { 3759 /* 3760 * Here the rate we are actually pacing at 3761 * is smaller than 10% above our last measurement. 3762 * This means we are pacing below what we would 3763 * like to try to achieve (plus some wiggle room). 3764 */ 3765 rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate, 3766 __LINE__, 9); 3767 return (1); 3768 } else { 3769 /* 3770 * Here we are already pacing at least rack_max_per_above(10%) 3771 * what we are getting back. This indicates most likely 3772 * that we are being limited (cwnd/rwnd/app) and can't 3773 * get any more b/w. There is no sense of trying to 3774 * raise up the pacing rate its not speeding us up 3775 * and we already are pacing faster than we are getting. 3776 */ 3777 rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate, 3778 __LINE__, 8); 3779 return (0); 3780 } 3781 } 3782 3783 static void 3784 rack_validate_multipliers_at_or_above100(struct tcp_rack *rack) 3785 { 3786 /* 3787 * When we drag bottom, we want to assure 3788 * that no multiplier is below 1.0, if so 3789 * we want to restore it to at least that. 3790 */ 3791 if (rack->r_ctl.rack_per_of_gp_rec < 100) { 3792 /* This is unlikely we usually do not touch recovery */ 3793 rack->r_ctl.rack_per_of_gp_rec = 100; 3794 } 3795 if (rack->r_ctl.rack_per_of_gp_ca < 100) { 3796 rack->r_ctl.rack_per_of_gp_ca = 100; 3797 } 3798 if (rack->r_ctl.rack_per_of_gp_ss < 100) { 3799 rack->r_ctl.rack_per_of_gp_ss = 100; 3800 } 3801 } 3802 3803 static void 3804 rack_validate_multipliers_at_or_below_100(struct tcp_rack *rack) 3805 { 3806 if (rack->r_ctl.rack_per_of_gp_ca > 100) { 3807 rack->r_ctl.rack_per_of_gp_ca = 100; 3808 } 3809 if (rack->r_ctl.rack_per_of_gp_ss > 100) { 3810 rack->r_ctl.rack_per_of_gp_ss = 100; 3811 } 3812 } 3813 3814 static void 3815 rack_increase_bw_mul(struct tcp_rack *rack, int timely_says, uint64_t cur_bw, uint64_t last_bw_est, int override) 3816 { 3817 int32_t calc, logged, plus; 3818 3819 logged = 0; 3820 3821 if (rack->rc_skip_timely) 3822 return; 3823 if (override) { 3824 /* 3825 * override is passed when we are 3826 * loosing b/w and making one last 3827 * gasp at trying to not loose out 3828 * to a new-reno flow. 3829 */ 3830 goto extra_boost; 3831 } 3832 /* In classic timely we boost by 5x if we have 5 increases in a row, lets not */ 3833 if (rack->rc_gp_incr && 3834 ((rack->rc_gp_timely_inc_cnt + 1) >= RACK_TIMELY_CNT_BOOST)) { 3835 /* 3836 * Reset and get 5 strokes more before the boost. Note 3837 * that the count is 0 based so we have to add one. 
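* The effect is that the boost below applies rack_gp_increase_per
* scaled by RACK_TIMELY_CNT_BOOST in a single step, rather than the
* normal one-step increase.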
3838 */ 3839 extra_boost: 3840 plus = (uint32_t)rack_gp_increase_per * RACK_TIMELY_CNT_BOOST; 3841 rack->rc_gp_timely_inc_cnt = 0; 3842 } else 3843 plus = (uint32_t)rack_gp_increase_per; 3844 /* Must be at least 1% increase for true timely increases */ 3845 if ((plus < 1) && 3846 ((rack->r_ctl.rc_rtt_diff <= 0) || (timely_says <= 0))) 3847 plus = 1; 3848 if (rack->rc_gp_saw_rec && 3849 (rack->rc_gp_no_rec_chg == 0) && 3850 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 3851 rack->r_ctl.rack_per_of_gp_rec)) { 3852 /* We have been in recovery ding it too */ 3853 calc = rack->r_ctl.rack_per_of_gp_rec + plus; 3854 if (calc > 0xffff) 3855 calc = 0xffff; 3856 logged |= 1; 3857 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)calc; 3858 if (rack->r_ctl.rack_per_upper_bound_ca && 3859 (rack->rc_dragged_bottom == 0) && 3860 (rack->r_ctl.rack_per_of_gp_rec > rack->r_ctl.rack_per_upper_bound_ca)) 3861 rack->r_ctl.rack_per_of_gp_rec = rack->r_ctl.rack_per_upper_bound_ca; 3862 } 3863 if (rack->rc_gp_saw_ca && 3864 (rack->rc_gp_saw_ss == 0) && 3865 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 3866 rack->r_ctl.rack_per_of_gp_ca)) { 3867 /* In CA */ 3868 calc = rack->r_ctl.rack_per_of_gp_ca + plus; 3869 if (calc > 0xffff) 3870 calc = 0xffff; 3871 logged |= 2; 3872 rack->r_ctl.rack_per_of_gp_ca = (uint16_t)calc; 3873 if (rack->r_ctl.rack_per_upper_bound_ca && 3874 (rack->rc_dragged_bottom == 0) && 3875 (rack->r_ctl.rack_per_of_gp_ca > rack->r_ctl.rack_per_upper_bound_ca)) 3876 rack->r_ctl.rack_per_of_gp_ca = rack->r_ctl.rack_per_upper_bound_ca; 3877 } 3878 if (rack->rc_gp_saw_ss && 3879 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 3880 rack->r_ctl.rack_per_of_gp_ss)) { 3881 /* In SS */ 3882 calc = rack->r_ctl.rack_per_of_gp_ss + plus; 3883 if (calc > 0xffff) 3884 calc = 0xffff; 3885 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)calc; 3886 if (rack->r_ctl.rack_per_upper_bound_ss && 3887 (rack->rc_dragged_bottom == 0) && 3888 (rack->r_ctl.rack_per_of_gp_ss > rack->r_ctl.rack_per_upper_bound_ss)) 3889 rack->r_ctl.rack_per_of_gp_ss = rack->r_ctl.rack_per_upper_bound_ss; 3890 logged |= 4; 3891 } 3892 if (logged && 3893 (rack->rc_gp_incr == 0)){ 3894 /* Go into increment mode */ 3895 rack->rc_gp_incr = 1; 3896 rack->rc_gp_timely_inc_cnt = 0; 3897 } 3898 if (rack->rc_gp_incr && 3899 logged && 3900 (rack->rc_gp_timely_inc_cnt < RACK_TIMELY_CNT_BOOST)) { 3901 rack->rc_gp_timely_inc_cnt++; 3902 } 3903 rack_log_timely(rack, logged, plus, 0, 0, 3904 __LINE__, 1); 3905 } 3906 3907 static uint32_t 3908 rack_get_decrease(struct tcp_rack *rack, uint32_t curper, int32_t rtt_diff) 3909 { 3910 /*- 3911 * norm_grad = rtt_diff / minrtt; 3912 * new_per = curper * (1 - B * norm_grad) 3913 * 3914 * B = rack_gp_decrease_per (default 80%) 3915 * rtt_dif = input var current rtt-diff 3916 * curper = input var current percentage 3917 * minrtt = from rack filter 3918 * 3919 * In order to do the floating point calculations above we 3920 * do an integer conversion. 
The code looks confusing so let me 3921 * translate it into something that uses more variables and 3922 * is clearer for us humans :) 3923 *
3924 * uint64_t norm_grad, inverse, reduce_by, final_result; 3925 * uint32_t perf; 3926 *
3927 * norm_grad = (((uint64_t)rtt_diff * 1000000) / 3928 * (uint64_t)get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); 3929 * inverse = ((uint64_t)rack_gp_decrease_per * (uint64_t)1000000) * norm_grad; 3930 * inverse /= 1000000; 3931 * reduce_by = (1000000 - inverse); 3932 * final_result = (curper * reduce_by) / 1000000; 3933 * perf = (uint32_t)final_result; 3934 */
3935 uint64_t perf; 3936
3937 perf = (((uint64_t)curper * ((uint64_t)1000000 - 3938 ((uint64_t)rack_gp_decrease_per * (uint64_t)10000 * 3939 (((uint64_t)rtt_diff * (uint64_t)1000000)/ 3940 (uint64_t)get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)))/ 3941 (uint64_t)1000000)) / 3942 (uint64_t)1000000);
3943 if (perf > curper) { 3944 /* TSNH */ 3945 perf = curper - 1; 3946 } 3947 return ((uint32_t)perf); 3948 } 3949
3950 static uint32_t 3951 rack_decrease_highrtt(struct tcp_rack *rack, uint32_t curper, uint32_t rtt) 3952 {
3953 /* 3954 * The reduction applied here is: 3955 * result = curper * (1 - B * (1 - highrttthresh / gp_srtt)) 3956 * where: 3957 * 3958 * B = rack_gp_decrease_per (default .8 i.e. 80) 3959 * highrttthresh = filter_min * rack_gp_rtt_maxmul 3960 */
3961 uint64_t perf; 3962 uint32_t highrttthresh; 3963
3964 highrttthresh = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; 3965
3966 perf = (((uint64_t)curper * ((uint64_t)1000000 - 3967 ((uint64_t)rack_gp_decrease_per * ((uint64_t)1000000 - 3968 ((uint64_t)highrttthresh * (uint64_t)1000000) / 3969 (uint64_t)rtt)) / 100)) /(uint64_t)1000000);
3970 if (tcp_bblogging_on(rack->rc_tp)) { 3971 uint64_t log1; 3972 3973 log1 = rtt; 3974 log1 <<= 32; 3975 log1 |= highrttthresh; 3976 rack_log_timely(rack, 3977 rack_gp_decrease_per, 3978 (uint64_t)curper, 3979 log1, 3980 perf, 3981 __LINE__, 3982 15); 3983 } 3984 return (perf); 3985 } 3986
3987 static void 3988 rack_decrease_bw_mul(struct tcp_rack *rack, int timely_says, uint32_t rtt, int32_t rtt_diff) 3989 { 3990 uint64_t logvar, logvar2, logvar3; 3991 uint32_t logged, new_per, ss_red, ca_red, rec_red, alt, val; 3992
3993 if (rack->rc_skip_timely) 3994 return; 3995 if (rack->rc_gp_incr) { 3996 /* Turn off increment counting */ 3997 rack->rc_gp_incr = 0; 3998 rack->rc_gp_timely_inc_cnt = 0; 3999 } 4000 ss_red = ca_red = rec_red = 0; 4001 logged = 0;
4002 /* Calculate the reduction value */ 4003 if (rtt_diff < 0) { 4004 rtt_diff *= -1; 4005 } 4006 /* Must be at least 1% reduction */
4007 if (rack->rc_gp_saw_rec && (rack->rc_gp_no_rec_chg == 0)) { 4008 /* We have been in recovery ding it too */ 4009 if (timely_says == 2) { 4010 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_rec, rtt); 4011 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 4012 if (alt < new_per) 4013 val = alt; 4014 else 4015 val = new_per; 4016 } else 4017 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff);
4018 if (rack->r_ctl.rack_per_of_gp_rec > val) { 4019 rec_red = (rack->r_ctl.rack_per_of_gp_rec - val); 4020 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)val; 4021 } else { 4022 rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; 4023 rec_red = 0; 4024 }
4025 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_rec) 4026 rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; 4027 logged |= 1; 4028 } 4029 if (rack->rc_gp_saw_ss) { 4030 /* Sent in SS */ 4031 if
(timely_says == 2) { 4032 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ss, rtt); 4033 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff); 4034 if (alt < new_per) 4035 val = alt; 4036 else 4037 val = new_per; 4038 } else 4039 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff); 4040 if (rack->r_ctl.rack_per_of_gp_ss > new_per) { 4041 ss_red = rack->r_ctl.rack_per_of_gp_ss - val; 4042 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)val; 4043 } else { 4044 ss_red = new_per; 4045 rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; 4046 logvar = new_per; 4047 logvar <<= 32; 4048 logvar |= alt; 4049 logvar2 = (uint32_t)rtt; 4050 logvar2 <<= 32; 4051 logvar2 |= (uint32_t)rtt_diff; 4052 logvar3 = rack_gp_rtt_maxmul; 4053 logvar3 <<= 32; 4054 logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 4055 rack_log_timely(rack, timely_says, 4056 logvar2, logvar3, 4057 logvar, __LINE__, 10); 4058 } 4059 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ss) 4060 rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; 4061 logged |= 4; 4062 } else if (rack->rc_gp_saw_ca) { 4063 /* Sent in CA */ 4064 if (timely_says == 2) { 4065 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ca, rtt); 4066 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff); 4067 if (alt < new_per) 4068 val = alt; 4069 else 4070 val = new_per; 4071 } else 4072 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff); 4073 if (rack->r_ctl.rack_per_of_gp_ca > val) { 4074 ca_red = rack->r_ctl.rack_per_of_gp_ca - val; 4075 rack->r_ctl.rack_per_of_gp_ca = (uint16_t)val; 4076 } else { 4077 rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; 4078 ca_red = 0; 4079 logvar = new_per; 4080 logvar <<= 32; 4081 logvar |= alt; 4082 logvar2 = (uint32_t)rtt; 4083 logvar2 <<= 32; 4084 logvar2 |= (uint32_t)rtt_diff; 4085 logvar3 = rack_gp_rtt_maxmul; 4086 logvar3 <<= 32; 4087 logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 4088 rack_log_timely(rack, timely_says, 4089 logvar2, logvar3, 4090 logvar, __LINE__, 10); 4091 } 4092 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ca) 4093 rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; 4094 logged |= 2; 4095 } 4096 if (rack->rc_gp_timely_dec_cnt < 0x7) { 4097 rack->rc_gp_timely_dec_cnt++; 4098 if (rack_timely_dec_clear && 4099 (rack->rc_gp_timely_dec_cnt == rack_timely_dec_clear)) 4100 rack->rc_gp_timely_dec_cnt = 0; 4101 } 4102 logvar = ss_red; 4103 logvar <<= 32; 4104 logvar |= ca_red; 4105 rack_log_timely(rack, logged, rec_red, rack_per_lower_bound, logvar, 4106 __LINE__, 2); 4107 } 4108 4109 static void 4110 rack_log_rtt_shrinks(struct tcp_rack *rack, uint32_t us_cts, 4111 uint32_t rtt, uint32_t line, uint8_t reas) 4112 { 4113 if (tcp_bblogging_on(rack->rc_tp)) { 4114 union tcp_log_stackspecific log; 4115 struct timeval tv; 4116 4117 memset(&log, 0, sizeof(log)); 4118 log.u_bbr.flex1 = line; 4119 log.u_bbr.flex2 = rack->r_ctl.rc_time_probertt_starts; 4120 log.u_bbr.flex3 = rack->r_ctl.rc_lower_rtt_us_cts; 4121 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; 4122 log.u_bbr.flex5 = rtt; 4123 log.u_bbr.flex6 = rack->rc_highly_buffered; 4124 log.u_bbr.flex6 <<= 1; 4125 log.u_bbr.flex6 |= rack->forced_ack; 4126 log.u_bbr.flex6 <<= 1; 4127 log.u_bbr.flex6 |= rack->rc_gp_dyn_mul; 4128 log.u_bbr.flex6 <<= 1; 4129 log.u_bbr.flex6 |= rack->in_probe_rtt; 4130 log.u_bbr.flex6 <<= 1; 4131 log.u_bbr.flex6 |= rack->measure_saw_probe_rtt; 4132 
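/* flex6 now carries, from its high bit down: rc_highly_buffered, forced_ack, rc_gp_dyn_mul, in_probe_rtt and measure_saw_probe_rtt. */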
log.u_bbr.flex7 = rack->r_ctl.rack_per_of_gp_probertt; 4133 log.u_bbr.pacing_gain = rack->r_ctl.rack_per_of_gp_ca; 4134 log.u_bbr.cwnd_gain = rack->r_ctl.rack_per_of_gp_rec; 4135 log.u_bbr.flex8 = reas; 4136 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 4137 log.u_bbr.delRate = rack_get_bw(rack); 4138 log.u_bbr.cur_del_rate = rack->r_ctl.rc_highest_us_rtt; 4139 log.u_bbr.cur_del_rate <<= 32; 4140 log.u_bbr.cur_del_rate |= rack->r_ctl.rc_lowest_us_rtt; 4141 log.u_bbr.applimited = rack->r_ctl.rc_time_probertt_entered; 4142 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; 4143 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 4144 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; 4145 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; 4146 log.u_bbr.pkt_epoch = rack->r_ctl.rc_lower_rtt_us_cts; 4147 log.u_bbr.delivered = rack->r_ctl.rc_target_probertt_flight; 4148 log.u_bbr.lost = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 4149 log.u_bbr.rttProp = us_cts; 4150 log.u_bbr.rttProp <<= 32; 4151 log.u_bbr.rttProp |= rack->r_ctl.rc_entry_gp_rtt; 4152 TCP_LOG_EVENTP(rack->rc_tp, NULL, 4153 &rack->rc_inp->inp_socket->so_rcv, 4154 &rack->rc_inp->inp_socket->so_snd, 4155 BBR_LOG_RTT_SHRINKS, 0, 4156 0, &log, false, &rack->r_ctl.act_rcv_time); 4157 } 4158 } 4159 4160 static void 4161 rack_set_prtt_target(struct tcp_rack *rack, uint32_t segsiz, uint32_t rtt) 4162 { 4163 uint64_t bwdp; 4164 4165 bwdp = rack_get_bw(rack); 4166 bwdp *= (uint64_t)rtt; 4167 bwdp /= (uint64_t)HPTS_USEC_IN_SEC; 4168 rack->r_ctl.rc_target_probertt_flight = roundup((uint32_t)bwdp, segsiz); 4169 if (rack->r_ctl.rc_target_probertt_flight < (segsiz * rack_timely_min_segs)) { 4170 /* 4171 * A window protocol must be able to have 4 packets 4172 * outstanding as the floor in order to function 4173 * (especially considering delayed ack :D). 4174 */ 4175 rack->r_ctl.rc_target_probertt_flight = (segsiz * rack_timely_min_segs); 4176 } 4177 } 4178 4179 static void 4180 rack_enter_probertt(struct tcp_rack *rack, uint32_t us_cts) 4181 { 4182 /** 4183 * ProbeRTT is a bit different in rack_pacing than in 4184 * BBR. It is like BBR in that it uses the lowering of 4185 * the RTT as a signal that we saw something new and 4186 * counts from there for how long between. But it is 4187 * different in that its quite simple. It does not 4188 * play with the cwnd and wait until we get down 4189 * to N segments outstanding and hold that for 4190 * 200ms. Instead it just sets the pacing reduction 4191 * rate to a set percentage (70 by default) and hold 4192 * that for a number of recent GP Srtt's. 4193 */ 4194 uint32_t segsiz; 4195 4196 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 4197 if (rack->rc_gp_dyn_mul == 0) 4198 return; 4199 4200 if (rack->rc_tp->snd_max == rack->rc_tp->snd_una) { 4201 /* We are idle */ 4202 return; 4203 } 4204 if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && 4205 SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { 4206 /* 4207 * Stop the goodput now, the idea here is 4208 * that future measurements with in_probe_rtt 4209 * won't register if they are not greater so 4210 * we want to get what info (if any) is available 4211 * now. 
4212 */ 4213 rack_do_goodput_measurement(rack->rc_tp, rack, 4214 rack->rc_tp->snd_una, __LINE__, 4215 RACK_QUALITY_PROBERTT); 4216 } 4217 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 4218 rack->r_ctl.rc_time_probertt_entered = us_cts; 4219 segsiz = min(ctf_fixed_maxseg(rack->rc_tp), 4220 rack->r_ctl.rc_pace_min_segs); 4221 rack->in_probe_rtt = 1; 4222 rack->measure_saw_probe_rtt = 1; 4223 rack->r_ctl.rc_time_probertt_starts = 0; 4224 rack->r_ctl.rc_entry_gp_rtt = rack->r_ctl.rc_gp_srtt; 4225 if (rack_probertt_use_min_rtt_entry) 4226 rack_set_prtt_target(rack, segsiz, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); 4227 else 4228 rack_set_prtt_target(rack, segsiz, rack->r_ctl.rc_gp_srtt); 4229 rack_log_rtt_shrinks(rack, us_cts, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4230 __LINE__, RACK_RTTS_ENTERPROBE); 4231 } 4232 4233 static void 4234 rack_exit_probertt(struct tcp_rack *rack, uint32_t us_cts) 4235 { 4236 struct rack_sendmap *rsm; 4237 uint32_t segsiz; 4238 4239 segsiz = min(ctf_fixed_maxseg(rack->rc_tp), 4240 rack->r_ctl.rc_pace_min_segs); 4241 rack->in_probe_rtt = 0; 4242 if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && 4243 SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { 4244 /* 4245 * Stop the goodput now, the idea here is 4246 * that future measurements with in_probe_rtt 4247 * won't register if they are not greater so 4248 * we want to get what info (if any) is available 4249 * now. 4250 */ 4251 rack_do_goodput_measurement(rack->rc_tp, rack, 4252 rack->rc_tp->snd_una, __LINE__, 4253 RACK_QUALITY_PROBERTT); 4254 } else if (rack->rc_tp->t_flags & TF_GPUTINPROG) { 4255 /* 4256 * We don't have enough data to make a measurement. 4257 * So lets just stop and start here after exiting 4258 * probe-rtt. We probably are not interested in 4259 * the results anyway. 4260 */ 4261 rack->rc_tp->t_flags &= ~TF_GPUTINPROG; 4262 } 4263 /* 4264 * Measurements through the current snd_max are going 4265 * to be limited by the slower pacing rate. 4266 * 4267 * We need to mark these as app-limited so we 4268 * don't collapse the b/w. 4269 */ 4270 rsm = tqhash_max(rack->r_ctl.tqh); 4271 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { 4272 if (rack->r_ctl.rc_app_limited_cnt == 0) 4273 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; 4274 else { 4275 /* 4276 * Go out to the end app limited and mark 4277 * this new one as next and move the end_appl up 4278 * to this guy. 4279 */ 4280 if (rack->r_ctl.rc_end_appl) 4281 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; 4282 rack->r_ctl.rc_end_appl = rsm; 4283 } 4284 rsm->r_flags |= RACK_APP_LIMITED; 4285 rack->r_ctl.rc_app_limited_cnt++; 4286 } 4287 /* 4288 * Now, we need to examine our pacing rate multipliers. 4289 * If its under 100%, we need to kick it back up to 4290 * 100%. We also don't let it be over our "max" above 4291 * the actual rate i.e. 100% + rack_clamp_atexit_prtt. 4292 * Note setting clamp_atexit_prtt to 0 has the effect 4293 * of setting CA/SS to 100% always at exit (which is 4294 * the default behavior). 4295 */ 4296 if (rack_probertt_clear_is) { 4297 rack->rc_gp_incr = 0; 4298 rack->rc_gp_bwred = 0; 4299 rack->rc_gp_timely_inc_cnt = 0; 4300 rack->rc_gp_timely_dec_cnt = 0; 4301 } 4302 /* Do we do any clamping at exit? 
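* If so, the CA/SS pacing multipliers are forced to the configured
* atexit value: rack_atexit_prtt_hbp when the path looked highly
* buffered, rack_atexit_prtt otherwise.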
*/ 4303 if (rack->rc_highly_buffered && rack_atexit_prtt_hbp) { 4304 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt_hbp; 4305 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt_hbp; 4306 } 4307 if ((rack->rc_highly_buffered == 0) && rack_atexit_prtt) { 4308 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt; 4309 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt; 4310 } 4311 /* 4312 * Lets set rtt_diff to 0, so that we will get a "boost" 4313 * after exiting. 4314 */ 4315 rack->r_ctl.rc_rtt_diff = 0; 4316 4317 /* Clear all flags so we start fresh */ 4318 rack->rc_tp->t_bytes_acked = 0; 4319 rack->rc_tp->t_ccv.flags &= ~CCF_ABC_SENTAWND; 4320 /* 4321 * If configured to, set the cwnd and ssthresh to 4322 * our targets. 4323 */ 4324 if (rack_probe_rtt_sets_cwnd) { 4325 uint64_t ebdp; 4326 uint32_t setto; 4327 4328 /* Set ssthresh so we get into CA once we hit our target */ 4329 if (rack_probertt_use_min_rtt_exit == 1) { 4330 /* Set to min rtt */ 4331 rack_set_prtt_target(rack, segsiz, 4332 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); 4333 } else if (rack_probertt_use_min_rtt_exit == 2) { 4334 /* Set to current gp rtt */ 4335 rack_set_prtt_target(rack, segsiz, 4336 rack->r_ctl.rc_gp_srtt); 4337 } else if (rack_probertt_use_min_rtt_exit == 3) { 4338 /* Set to entry gp rtt */ 4339 rack_set_prtt_target(rack, segsiz, 4340 rack->r_ctl.rc_entry_gp_rtt); 4341 } else { 4342 uint64_t sum; 4343 uint32_t setval; 4344 4345 sum = rack->r_ctl.rc_entry_gp_rtt; 4346 sum *= 10; 4347 sum /= (uint64_t)(max(1, rack->r_ctl.rc_gp_srtt)); 4348 if (sum >= 20) { 4349 /* 4350 * A highly buffered path needs 4351 * cwnd space for timely to work. 4352 * Lets set things up as if 4353 * we are heading back here again. 4354 */ 4355 setval = rack->r_ctl.rc_entry_gp_rtt; 4356 } else if (sum >= 15) { 4357 /* 4358 * Lets take the smaller of the 4359 * two since we are just somewhat 4360 * buffered. 4361 */ 4362 setval = rack->r_ctl.rc_gp_srtt; 4363 if (setval > rack->r_ctl.rc_entry_gp_rtt) 4364 setval = rack->r_ctl.rc_entry_gp_rtt; 4365 } else { 4366 /* 4367 * Here we are not highly buffered 4368 * and should pick the min we can to 4369 * keep from causing loss. 
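* (We got here because sum, the entry gp rtt scaled by 10 and
* divided by the current gp srtt, came out below 15, i.e. the RTT
* at entry was less than roughly 1.5 times the current gp srtt.)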
4370 */ 4371 setval = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 4372 } 4373 rack_set_prtt_target(rack, segsiz, 4374 setval); 4375 } 4376 if (rack_probe_rtt_sets_cwnd > 1) { 4377 /* There is a percentage here to boost */ 4378 ebdp = rack->r_ctl.rc_target_probertt_flight; 4379 ebdp *= rack_probe_rtt_sets_cwnd; 4380 ebdp /= 100; 4381 setto = rack->r_ctl.rc_target_probertt_flight + ebdp; 4382 } else 4383 setto = rack->r_ctl.rc_target_probertt_flight; 4384 rack->rc_tp->snd_cwnd = roundup(setto, segsiz); 4385 if (rack->rc_tp->snd_cwnd < (segsiz * rack_timely_min_segs)) { 4386 /* Enforce a min */ 4387 rack->rc_tp->snd_cwnd = segsiz * rack_timely_min_segs; 4388 } 4389 /* If we set in the cwnd also set the ssthresh point so we are in CA */ 4390 rack->rc_tp->snd_ssthresh = (rack->rc_tp->snd_cwnd - 1); 4391 } 4392 rack_log_rtt_shrinks(rack, us_cts, 4393 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4394 __LINE__, RACK_RTTS_EXITPROBE); 4395 /* Clear times last so log has all the info */ 4396 rack->r_ctl.rc_probertt_sndmax_atexit = rack->rc_tp->snd_max; 4397 rack->r_ctl.rc_time_probertt_entered = us_cts; 4398 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 4399 rack->r_ctl.rc_time_of_last_probertt = us_cts; 4400 } 4401 4402 static void 4403 rack_check_probe_rtt(struct tcp_rack *rack, uint32_t us_cts) 4404 { 4405 /* Check in on probe-rtt */ 4406 4407 if (rack->rc_gp_filled == 0) { 4408 /* We do not do p-rtt unless we have gp measurements */ 4409 return; 4410 } 4411 if (rack->in_probe_rtt) { 4412 uint64_t no_overflow; 4413 uint32_t endtime, must_stay; 4414 4415 if (rack->r_ctl.rc_went_idle_time && 4416 ((us_cts - rack->r_ctl.rc_went_idle_time) > rack_min_probertt_hold)) { 4417 /* 4418 * We went idle during prtt, just exit now. 4419 */ 4420 rack_exit_probertt(rack, us_cts); 4421 } else if (rack_probe_rtt_safety_val && 4422 TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered) && 4423 ((us_cts - rack->r_ctl.rc_time_probertt_entered) > rack_probe_rtt_safety_val)) { 4424 /* 4425 * Probe RTT safety value triggered! 4426 */ 4427 rack_log_rtt_shrinks(rack, us_cts, 4428 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4429 __LINE__, RACK_RTTS_SAFETY); 4430 rack_exit_probertt(rack, us_cts); 4431 } 4432 /* Calculate the max we will wait */ 4433 endtime = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_max_drain_wait); 4434 if (rack->rc_highly_buffered) 4435 endtime += (rack->r_ctl.rc_gp_srtt * rack_max_drain_hbp); 4436 /* Calculate the min we must wait */ 4437 must_stay = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_must_drain); 4438 if ((ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.rc_target_probertt_flight) && 4439 TSTMP_LT(us_cts, endtime)) { 4440 uint32_t calc; 4441 /* Do we lower more? 
*/ 4442 no_exit: 4443 if (TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered)) 4444 calc = us_cts - rack->r_ctl.rc_time_probertt_entered; 4445 else 4446 calc = 0; 4447 calc /= max(rack->r_ctl.rc_gp_srtt, 1); 4448 if (calc) { 4449 /* Maybe */ 4450 calc *= rack_per_of_gp_probertt_reduce; 4451 if (calc > rack_per_of_gp_probertt) 4452 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh; 4453 else 4454 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc; 4455 /* Limit it too */ 4456 if (rack->r_ctl.rack_per_of_gp_probertt < rack_per_of_gp_lowthresh) 4457 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh; 4458 } 4459 /* We must reach target or the time set */ 4460 return; 4461 } 4462 if (rack->r_ctl.rc_time_probertt_starts == 0) { 4463 if ((TSTMP_LT(us_cts, must_stay) && 4464 rack->rc_highly_buffered) || 4465 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > 4466 rack->r_ctl.rc_target_probertt_flight)) { 4467 /* We are not past the must_stay time */ 4468 goto no_exit; 4469 } 4470 rack_log_rtt_shrinks(rack, us_cts, 4471 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4472 __LINE__, RACK_RTTS_REACHTARGET); 4473 rack->r_ctl.rc_time_probertt_starts = us_cts; 4474 if (rack->r_ctl.rc_time_probertt_starts == 0) 4475 rack->r_ctl.rc_time_probertt_starts = 1; 4476 /* Restore back to our rate we want to pace at in prtt */ 4477 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 4478 } 4479 /* 4480 * Setup our end time, some number of gp_srtts plus 200ms. 4481 */ 4482 no_overflow = ((uint64_t)rack->r_ctl.rc_gp_srtt * 4483 (uint64_t)rack_probertt_gpsrtt_cnt_mul); 4484 if (rack_probertt_gpsrtt_cnt_div) 4485 endtime = (uint32_t)(no_overflow / (uint64_t)rack_probertt_gpsrtt_cnt_div); 4486 else 4487 endtime = 0; 4488 endtime += rack_min_probertt_hold; 4489 endtime += rack->r_ctl.rc_time_probertt_starts; 4490 if (TSTMP_GEQ(us_cts, endtime)) { 4491 /* yes, exit probertt */ 4492 rack_exit_probertt(rack, us_cts); 4493 } 4494 4495 } else if ((rack->rc_skip_timely == 0) && 4496 (TSTMP_GT(us_cts, rack->r_ctl.rc_lower_rtt_us_cts)) && 4497 ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt)) { 4498 /* Go into probertt, its been too long since we went lower */ 4499 rack_enter_probertt(rack, us_cts); 4500 } 4501 } 4502 4503 static void 4504 rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last_bw_est, 4505 uint32_t rtt, int32_t rtt_diff) 4506 { 4507 uint64_t cur_bw, up_bnd, low_bnd, subfr; 4508 uint32_t losses; 4509 4510 if ((rack->rc_gp_dyn_mul == 0) || 4511 (rack->use_fixed_rate) || 4512 (rack->in_probe_rtt) || 4513 (rack->rc_always_pace == 0)) { 4514 /* No dynamic GP multiplier in play */ 4515 return; 4516 } 4517 losses = rack->r_ctl.rc_loss_count - rack->r_ctl.rc_loss_at_start; 4518 cur_bw = rack_get_bw(rack); 4519 /* Calculate our up and down range */ 4520 up_bnd = rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_up; 4521 up_bnd /= 100; 4522 up_bnd += rack->r_ctl.last_gp_comp_bw; 4523 4524 subfr = (uint64_t)rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_down; 4525 subfr /= 100; 4526 low_bnd = rack->r_ctl.last_gp_comp_bw - subfr; 4527 if ((timely_says == 2) && (rack->r_ctl.rc_no_push_at_mrtt)) { 4528 /* 4529 * This is the case where our RTT is above 4530 * the max target and we have been configured 4531 * to just do timely no bonus up stuff in that case. 4532 * 4533 * There are two configurations, set to 1, and we 4534 * just do timely if we are over our max. 
If its 4535 * set above 1 then we slam the multipliers down 4536 * to 100 and then decrement per timely. 4537 */ 4538 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 4539 __LINE__, 3); 4540 if (rack->r_ctl.rc_no_push_at_mrtt > 1) 4541 rack_validate_multipliers_at_or_below_100(rack); 4542 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); 4543 } else if ((timely_says != 0) && (last_bw_est < low_bnd) && !losses) { 4544 /* 4545 * We are decreasing this is a bit complicated this 4546 * means we are loosing ground. This could be 4547 * because another flow entered and we are competing 4548 * for b/w with it. This will push the RTT up which 4549 * makes timely unusable unless we want to get shoved 4550 * into a corner and just be backed off (the age 4551 * old problem with delay based CC). 4552 * 4553 * On the other hand if it was a route change we 4554 * would like to stay somewhat contained and not 4555 * blow out the buffers. 4556 */ 4557 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 4558 __LINE__, 3); 4559 rack->r_ctl.last_gp_comp_bw = cur_bw; 4560 if (rack->rc_gp_bwred == 0) { 4561 /* Go into reduction counting */ 4562 rack->rc_gp_bwred = 1; 4563 rack->rc_gp_timely_dec_cnt = 0; 4564 } 4565 if (rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) { 4566 /* 4567 * Push another time with a faster pacing 4568 * to try to gain back (we include override to 4569 * get a full raise factor). 4570 */ 4571 if ((rack->rc_gp_saw_ca && rack->r_ctl.rack_per_of_gp_ca <= rack_down_raise_thresh) || 4572 (rack->rc_gp_saw_ss && rack->r_ctl.rack_per_of_gp_ss <= rack_down_raise_thresh) || 4573 (timely_says == 0) || 4574 (rack_down_raise_thresh == 0)) { 4575 /* 4576 * Do an override up in b/w if we were 4577 * below the threshold or if the threshold 4578 * is zero we always do the raise. 4579 */ 4580 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 1); 4581 } else { 4582 /* Log it stays the same */ 4583 rack_log_timely(rack, 0, last_bw_est, low_bnd, 0, 4584 __LINE__, 11); 4585 } 4586 rack->rc_gp_timely_dec_cnt++; 4587 /* We are not incrementing really no-count */ 4588 rack->rc_gp_incr = 0; 4589 rack->rc_gp_timely_inc_cnt = 0; 4590 } else { 4591 /* 4592 * Lets just use the RTT 4593 * information and give up 4594 * pushing. 4595 */ 4596 goto use_timely; 4597 } 4598 } else if ((timely_says != 2) && 4599 !losses && 4600 (last_bw_est > up_bnd)) { 4601 /* 4602 * We are increasing b/w lets keep going, updating 4603 * our b/w and ignoring any timely input, unless 4604 * of course we are at our max raise (if there is one). 4605 */ 4606 4607 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 4608 __LINE__, 3); 4609 rack->r_ctl.last_gp_comp_bw = cur_bw; 4610 if (rack->rc_gp_saw_ss && 4611 rack->r_ctl.rack_per_upper_bound_ss && 4612 (rack->r_ctl.rack_per_of_gp_ss == rack->r_ctl.rack_per_upper_bound_ss)) { 4613 /* 4614 * In cases where we can't go higher 4615 * we should just use timely. 4616 */ 4617 goto use_timely; 4618 } 4619 if (rack->rc_gp_saw_ca && 4620 rack->r_ctl.rack_per_upper_bound_ca && 4621 (rack->r_ctl.rack_per_of_gp_ca == rack->r_ctl.rack_per_upper_bound_ca)) { 4622 /* 4623 * In cases where we can't go higher 4624 * we should just use timely. 
4625 */ 4626 goto use_timely; 4627 } 4628 rack->rc_gp_bwred = 0; 4629 rack->rc_gp_timely_dec_cnt = 0; 4630 /* You get a set number of pushes if timely is trying to reduce */ 4631 if ((rack->rc_gp_incr < rack_timely_max_push_rise) || (timely_says == 0)) { 4632 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 4633 } else { 4634 /* Log it stays the same */ 4635 rack_log_timely(rack, 0, last_bw_est, up_bnd, 0, 4636 __LINE__, 12); 4637 } 4638 return; 4639 } else { 4640 /* 4641 * We are staying between the lower and upper range bounds 4642 * so use timely to decide. 4643 */ 4644 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 4645 __LINE__, 3); 4646 use_timely: 4647 if (timely_says) { 4648 rack->rc_gp_incr = 0; 4649 rack->rc_gp_timely_inc_cnt = 0; 4650 if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) && 4651 !losses && 4652 (last_bw_est < low_bnd)) { 4653 /* We are loosing ground */ 4654 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 4655 rack->rc_gp_timely_dec_cnt++; 4656 /* We are not incrementing really no-count */ 4657 rack->rc_gp_incr = 0; 4658 rack->rc_gp_timely_inc_cnt = 0; 4659 } else 4660 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); 4661 } else { 4662 rack->rc_gp_bwred = 0; 4663 rack->rc_gp_timely_dec_cnt = 0; 4664 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 4665 } 4666 } 4667 } 4668 4669 static int32_t 4670 rack_make_timely_judgement(struct tcp_rack *rack, uint32_t rtt, int32_t rtt_diff, uint32_t prev_rtt) 4671 { 4672 int32_t timely_says; 4673 uint64_t log_mult, log_rtt_a_diff; 4674 4675 log_rtt_a_diff = rtt; 4676 log_rtt_a_diff <<= 32; 4677 log_rtt_a_diff |= (uint32_t)rtt_diff; 4678 if (rtt >= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * 4679 rack_gp_rtt_maxmul)) { 4680 /* Reduce the b/w multiplier */ 4681 timely_says = 2; 4682 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; 4683 log_mult <<= 32; 4684 log_mult |= prev_rtt; 4685 rack_log_timely(rack, timely_says, log_mult, 4686 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4687 log_rtt_a_diff, __LINE__, 4); 4688 } else if (rtt <= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) + 4689 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) / 4690 max(rack_gp_rtt_mindiv , 1)))) { 4691 /* Increase the b/w multiplier */ 4692 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) + 4693 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) / 4694 max(rack_gp_rtt_mindiv , 1)); 4695 log_mult <<= 32; 4696 log_mult |= prev_rtt; 4697 timely_says = 0; 4698 rack_log_timely(rack, timely_says, log_mult , 4699 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4700 log_rtt_a_diff, __LINE__, 5); 4701 } else { 4702 /* 4703 * Use a gradient to find it the timely gradient 4704 * is: 4705 * grad = rc_rtt_diff / min_rtt; 4706 * 4707 * anything below or equal to 0 will be 4708 * a increase indication. Anything above 4709 * zero is a decrease. Note we take care 4710 * of the actual gradient calculation 4711 * in the reduction (its not needed for 4712 * increase). 
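 *
 * Illustrative numbers: with min_rtt = 20000 usec, an rc_rtt_diff
 * of -1500 gives a negative gradient so we return 0 (raise the
 * multiplier), while +1500 gives a positive gradient and we
 * return 1 (reduce it).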
4713 */ 4714 log_mult = prev_rtt; 4715 if (rtt_diff <= 0) { 4716 /* 4717 * Rttdiff is less than zero, increase the 4718 * b/w multiplier (its 0 or negative) 4719 */ 4720 timely_says = 0; 4721 rack_log_timely(rack, timely_says, log_mult, 4722 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 6); 4723 } else { 4724 /* Reduce the b/w multiplier */ 4725 timely_says = 1; 4726 rack_log_timely(rack, timely_says, log_mult, 4727 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 7); 4728 } 4729 } 4730 return (timely_says); 4731 } 4732 4733 static __inline int 4734 rack_in_gp_window(struct tcpcb *tp, struct rack_sendmap *rsm) 4735 { 4736 if (SEQ_GEQ(rsm->r_start, tp->gput_seq) && 4737 SEQ_LEQ(rsm->r_end, tp->gput_ack)) { 4738 /** 4739 * This covers the case that the 4740 * resent is completely inside 4741 * the gp range or up to it. 4742 * |----------------| 4743 * |-----| <or> 4744 * |----| 4745 * <or> |---| 4746 */ 4747 return (1); 4748 } else if (SEQ_LT(rsm->r_start, tp->gput_seq) && 4749 SEQ_GT(rsm->r_end, tp->gput_seq)){ 4750 /** 4751 * This covers the case of 4752 * |--------------| 4753 * |-------->| 4754 */ 4755 return (1); 4756 } else if (SEQ_GEQ(rsm->r_start, tp->gput_seq) && 4757 SEQ_LT(rsm->r_start, tp->gput_ack) && 4758 SEQ_GEQ(rsm->r_end, tp->gput_ack)) { 4759 4760 /** 4761 * This covers the case of 4762 * |--------------| 4763 * |-------->| 4764 */ 4765 return (1); 4766 } 4767 return (0); 4768 } 4769 4770 static __inline void 4771 rack_mark_in_gp_win(struct tcpcb *tp, struct rack_sendmap *rsm) 4772 { 4773 4774 if ((tp->t_flags & TF_GPUTINPROG) == 0) 4775 return; 4776 /* 4777 * We have a Goodput measurement in progress. Mark 4778 * the send if its within the window. If its not 4779 * in the window make sure it does not have the mark. 4780 */ 4781 if (rack_in_gp_window(tp, rsm)) 4782 rsm->r_flags |= RACK_IN_GP_WIN; 4783 else 4784 rsm->r_flags &= ~RACK_IN_GP_WIN; 4785 } 4786 4787 static __inline void 4788 rack_clear_gp_marks(struct tcpcb *tp, struct tcp_rack *rack) 4789 { 4790 /* A GP measurement is ending, clear all marks on the send map*/ 4791 struct rack_sendmap *rsm = NULL; 4792 4793 rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq); 4794 if (rsm == NULL) { 4795 rsm = tqhash_min(rack->r_ctl.tqh); 4796 } 4797 /* Nothing left? */ 4798 while ((rsm != NULL) && (SEQ_GEQ(tp->gput_ack, rsm->r_start))){ 4799 rsm->r_flags &= ~RACK_IN_GP_WIN; 4800 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 4801 } 4802 } 4803 4804 4805 static __inline void 4806 rack_tend_gp_marks(struct tcpcb *tp, struct tcp_rack *rack) 4807 { 4808 struct rack_sendmap *rsm = NULL; 4809 4810 if (tp->snd_una == tp->snd_max) { 4811 /* Nothing outstanding yet, nothing to do here */ 4812 return; 4813 } 4814 if (SEQ_GT(tp->gput_seq, tp->snd_una)) { 4815 /* 4816 * We are measuring ahead of some outstanding 4817 * data. We need to walk through up until we get 4818 * to gp_seq marking so that no rsm is set incorrectly 4819 * with RACK_IN_GP_WIN. 4820 */ 4821 rsm = tqhash_min(rack->r_ctl.tqh); 4822 while (rsm != NULL) { 4823 rack_mark_in_gp_win(tp, rsm); 4824 if (SEQ_GEQ(rsm->r_end, tp->gput_seq)) 4825 break; 4826 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 4827 } 4828 } 4829 if (rsm == NULL) { 4830 /* 4831 * Need to find the GP seq, if rsm is 4832 * set we stopped as we hit it. 
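 * (rsm is NULL here either because the walk above was skipped,
 * i.e. gput_seq is not ahead of snd_una, or because the walk ran
 * off the end of the hash, so look gput_seq up directly.)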
4833 */ 4834 rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq); 4835 if (rsm == NULL) 4836 return; 4837 rack_mark_in_gp_win(tp, rsm); 4838 } 4839 /* 4840 * Now we may need to mark already sent rsm, ahead of 4841 * gput_seq in the window since they may have been sent 4842 * *before* we started our measurment. The rsm, if non-null 4843 * has been marked (note if rsm would have been NULL we would have 4844 * returned in the previous block). So we go to the next, and continue 4845 * until we run out of entries or we exceed the gp_ack value. 4846 */ 4847 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 4848 while (rsm) { 4849 rack_mark_in_gp_win(tp, rsm); 4850 if (SEQ_GT(rsm->r_end, tp->gput_ack)) 4851 break; 4852 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 4853 } 4854 } 4855 4856 static void 4857 rack_log_gp_calc(struct tcp_rack *rack, uint32_t add_part, uint32_t sub_part, uint32_t srtt, uint64_t meas_bw, uint64_t utim, uint8_t meth, uint32_t line) 4858 { 4859 if (tcp_bblogging_on(rack->rc_tp)) { 4860 union tcp_log_stackspecific log; 4861 struct timeval tv; 4862 4863 memset(&log, 0, sizeof(log)); 4864 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 4865 log.u_bbr.flex1 = add_part; 4866 log.u_bbr.flex2 = sub_part; 4867 log.u_bbr.flex3 = rack_wma_divisor; 4868 log.u_bbr.flex4 = srtt; 4869 log.u_bbr.flex7 = (uint16_t)line; 4870 log.u_bbr.flex8 = meth; 4871 log.u_bbr.delRate = rack->r_ctl.gp_bw; 4872 log.u_bbr.cur_del_rate = meas_bw; 4873 log.u_bbr.rttProp = utim; 4874 TCP_LOG_EVENTP(rack->rc_tp, NULL, 4875 &rack->rc_inp->inp_socket->so_rcv, 4876 &rack->rc_inp->inp_socket->so_snd, 4877 BBR_LOG_THRESH_CALC, 0, 4878 0, &log, false, &rack->r_ctl.act_rcv_time); 4879 } 4880 } 4881 4882 static void 4883 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, 4884 tcp_seq th_ack, int line, uint8_t quality) 4885 { 4886 uint64_t tim, bytes_ps, stim, utim; 4887 uint32_t segsiz, bytes, reqbytes, us_cts; 4888 int32_t gput, new_rtt_diff, timely_says; 4889 uint64_t resid_bw, subpart = 0, addpart = 0, srtt; 4890 int did_add = 0; 4891 4892 us_cts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time); 4893 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 4894 if (TSTMP_GEQ(us_cts, tp->gput_ts)) 4895 tim = us_cts - tp->gput_ts; 4896 else 4897 tim = 0; 4898 if (rack->r_ctl.rc_gp_cumack_ts > rack->r_ctl.rc_gp_output_ts) 4899 stim = rack->r_ctl.rc_gp_cumack_ts - rack->r_ctl.rc_gp_output_ts; 4900 else 4901 stim = 0; 4902 /* 4903 * Use the larger of the send time or ack time. This prevents us 4904 * from being influenced by ack artifacts to come up with too 4905 * high of measurement. Note that since we are spanning over many more 4906 * bytes in most of our measurements hopefully that is less likely to 4907 * occur. 4908 */ 4909 if (tim > stim) 4910 utim = max(tim, 1); 4911 else 4912 utim = max(stim, 1); 4913 reqbytes = min(rc_init_window(rack), (MIN_GP_WIN * segsiz)); 4914 rack_log_gpset(rack, th_ack, us_cts, rack->r_ctl.rc_gp_cumack_ts, __LINE__, 3, NULL); 4915 if ((tim == 0) && (stim == 0)) { 4916 /* 4917 * Invalid measurement time, maybe 4918 * all on one ack/one send? 4919 */ 4920 bytes = 0; 4921 bytes_ps = 0; 4922 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4923 0, 0, 0, 10, __LINE__, NULL, quality); 4924 goto skip_measurement; 4925 } 4926 if (rack->r_ctl.rc_gp_lowrtt == 0xffffffff) { 4927 /* We never made a us_rtt measurement? 
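 * rc_gp_lowrtt is seeded to 0xffffffff when the measurement
 * window is opened, so finding it untouched here means no RTT
 * sample ever landed inside the window and the data is unusable.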
*/ 4928 bytes = 0; 4929 bytes_ps = 0; 4930 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4931 0, 0, 0, 10, __LINE__, NULL, quality); 4932 goto skip_measurement; 4933 } 4934 /* 4935 * Calculate the maximum possible b/w this connection 4936 * could have. We base our calculation on the lowest 4937 * rtt we have seen during the measurement and the 4938 * largest rwnd the client has given us in that time. This 4939 * forms a BDP that is the maximum that we could ever 4940 * get to the client. Anything larger is not valid. 4941 * 4942 * I originally had code here that rejected measurements 4943 * where the time was less than 1/2 the latest us_rtt. 4944 * But after thinking on that I realized its wrong since 4945 * say you had a 150Mbps or even 1Gbps link, and you 4946 * were a long way away.. example I am in Europe (100ms rtt) 4947 * talking to my 1Gbps link in S.C. Now measuring say 150,000 4948 * bytes my time would be 1.2ms, and yet my rtt would say 4949 * the measurement was invalid the time was < 50ms. The 4950 * same thing is true for 150Mb (8ms of time). 4951 * 4952 * A better way I realized is to look at what the maximum 4953 * the connection could possibly do. This is gated on 4954 * the lowest RTT we have seen and the highest rwnd. 4955 * We should in theory never exceed that, if we are 4956 * then something on the path is storing up packets 4957 * and then feeding them all at once to our endpoint 4958 * messing up our measurement. 4959 */ 4960 rack->r_ctl.last_max_bw = rack->r_ctl.rc_gp_high_rwnd; 4961 rack->r_ctl.last_max_bw *= HPTS_USEC_IN_SEC; 4962 rack->r_ctl.last_max_bw /= rack->r_ctl.rc_gp_lowrtt; 4963 if (SEQ_LT(th_ack, tp->gput_seq)) { 4964 /* No measurement can be made */ 4965 bytes = 0; 4966 bytes_ps = 0; 4967 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4968 0, 0, 0, 10, __LINE__, NULL, quality); 4969 goto skip_measurement; 4970 } else 4971 bytes = (th_ack - tp->gput_seq); 4972 bytes_ps = (uint64_t)bytes; 4973 /* 4974 * Don't measure a b/w for pacing unless we have gotten at least 4975 * an initial windows worth of data in this measurement interval. 4976 * 4977 * Small numbers of bytes get badly influenced by delayed ack and 4978 * other artifacts. Note we take the initial window or our 4979 * defined minimum GP (defaulting to 10 which hopefully is the 4980 * IW). 4981 */ 4982 if (rack->rc_gp_filled == 0) { 4983 /* 4984 * The initial estimate is special. We 4985 * have blasted out an IW worth of packets 4986 * without a real valid ack ts results. We 4987 * then setup the app_limited_needs_set flag, 4988 * this should get the first ack in (probably 2 4989 * MSS worth) to be recorded as the timestamp. 4990 * We thus allow a smaller number of bytes i.e. 4991 * IW - 2MSS. 4992 */ 4993 reqbytes -= (2 * segsiz); 4994 /* Also lets fill previous for our first measurement to be neutral */ 4995 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; 4996 } 4997 if ((bytes_ps < reqbytes) || rack->app_limited_needs_set) { 4998 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4999 rack->r_ctl.rc_app_limited_cnt, 5000 0, 0, 10, __LINE__, NULL, quality); 5001 goto skip_measurement; 5002 } 5003 /* 5004 * We now need to calculate the Timely like status so 5005 * we can update (possibly) the b/w multipliers. 
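 *
 * The running rc_rtt_diff maintained below is a simple 1/8-gain
 * EWMA, i.e.
 *
 *   rc_rtt_diff = rc_rtt_diff - (rc_rtt_diff / 8) + (new_rtt_diff / 8);
 *
 * and it is deliberately left alone when a probe-rtt overlapped
 * the window, since the artificially lowered RTT would drag it
 * negative.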
5006 */ 5007 new_rtt_diff = (int32_t)rack->r_ctl.rc_gp_srtt - (int32_t)rack->r_ctl.rc_prev_gp_srtt; 5008 if (rack->rc_gp_filled == 0) { 5009 /* No previous reading */ 5010 rack->r_ctl.rc_rtt_diff = new_rtt_diff; 5011 } else { 5012 if (rack->measure_saw_probe_rtt == 0) { 5013 /* 5014 * We don't want a probertt to be counted 5015 * since it will be negative incorrectly. We 5016 * expect to be reducing the RTT when we 5017 * pace at a slower rate. 5018 */ 5019 rack->r_ctl.rc_rtt_diff -= (rack->r_ctl.rc_rtt_diff / 8); 5020 rack->r_ctl.rc_rtt_diff += (new_rtt_diff / 8); 5021 } 5022 } 5023 timely_says = rack_make_timely_judgement(rack, 5024 rack->r_ctl.rc_gp_srtt, 5025 rack->r_ctl.rc_rtt_diff, 5026 rack->r_ctl.rc_prev_gp_srtt 5027 ); 5028 bytes_ps *= HPTS_USEC_IN_SEC; 5029 bytes_ps /= utim; 5030 if (bytes_ps > rack->r_ctl.last_max_bw) { 5031 /* 5032 * Something is on path playing 5033 * since this b/w is not possible based 5034 * on our BDP (highest rwnd and lowest rtt 5035 * we saw in the measurement window). 5036 * 5037 * Another option here would be to 5038 * instead skip the measurement. 5039 */ 5040 rack_log_pacing_delay_calc(rack, bytes, reqbytes, 5041 bytes_ps, rack->r_ctl.last_max_bw, 0, 5042 11, __LINE__, NULL, quality); 5043 bytes_ps = rack->r_ctl.last_max_bw; 5044 } 5045 /* We store gp for b/w in bytes per second */ 5046 if (rack->rc_gp_filled == 0) { 5047 /* Initial measurement */ 5048 if (bytes_ps) { 5049 rack->r_ctl.gp_bw = bytes_ps; 5050 rack->rc_gp_filled = 1; 5051 rack->r_ctl.num_measurements = 1; 5052 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 5053 } else { 5054 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 5055 rack->r_ctl.rc_app_limited_cnt, 5056 0, 0, 10, __LINE__, NULL, quality); 5057 } 5058 if (tcp_in_hpts(rack->rc_tp) && 5059 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 5060 /* 5061 * Ok we can't trust the pacer in this case 5062 * where we transition from un-paced to paced. 5063 * Or for that matter when the burst mitigation 5064 * was making a wild guess and got it wrong. 5065 * Stop the pacer and clear up all the aggregate 5066 * delays etc. 5067 */ 5068 tcp_hpts_remove(rack->rc_tp); 5069 rack->r_ctl.rc_hpts_flags = 0; 5070 rack->r_ctl.rc_last_output_to = 0; 5071 } 5072 did_add = 2; 5073 } else if (rack->r_ctl.num_measurements < RACK_REQ_AVG) { 5074 /* Still a small number run an average */ 5075 rack->r_ctl.gp_bw += bytes_ps; 5076 addpart = rack->r_ctl.num_measurements; 5077 rack->r_ctl.num_measurements++; 5078 if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) { 5079 /* We have collected enough to move forward */ 5080 rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_measurements; 5081 } 5082 rack_set_pace_segments(tp, rack, __LINE__, NULL); 5083 did_add = 3; 5084 } else { 5085 /* 5086 * We want to take 1/wma of the goodput and add in to 7/8th 5087 * of the old value weighted by the srtt. So if your measurement 5088 * period is say 2 SRTT's long you would get 1/4 as the 5089 * value, if it was like 1/2 SRTT then you would get 1/16th. 5090 * 5091 * But we must be careful not to take too much i.e. if the 5092 * srtt is say 20ms and the measurement is taken over 5093 * 400ms our weight would be 400/20 i.e. 20. On the 5094 * other hand if we get a measurement over 1ms with a 5095 * 10ms rtt we only want to take a much smaller portion. 
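 *
 * Spelled out, the weight given to the new sample is
 * utim / (srtt * 8) (srtt * rack_wma_divisor on the dynamic path),
 * so e.g. a 5 ms measurement against a 20 ms srtt swaps in only
 * 5/160th of bytes_ps, while one spanning two srtt's swaps in 1/4,
 * subject to the 1/2 cap handled in the first branch below.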
5096 */ 5097 uint8_t meth; 5098 5099 if (rack->r_ctl.num_measurements < 0xff) { 5100 rack->r_ctl.num_measurements++; 5101 } 5102 srtt = (uint64_t)tp->t_srtt; 5103 if (srtt == 0) { 5104 /* 5105 * Strange why did t_srtt go back to zero? 5106 */ 5107 if (rack->r_ctl.rc_rack_min_rtt) 5108 srtt = rack->r_ctl.rc_rack_min_rtt; 5109 else 5110 srtt = HPTS_USEC_IN_MSEC; 5111 } 5112 /* 5113 * XXXrrs: Note for reviewers, in playing with 5114 * dynamic pacing I discovered this GP calculation 5115 * as done originally leads to some undesired results. 5116 * Basically you can get longer measurements contributing 5117 * too much to the WMA. Thus I changed it if you are doing 5118 * dynamic adjustments to only do the aportioned adjustment 5119 * if we have a very small (time wise) measurement. Longer 5120 * measurements just get there weight (defaulting to 1/8) 5121 * add to the WMA. We may want to think about changing 5122 * this to always do that for both sides i.e. dynamic 5123 * and non-dynamic... but considering lots of folks 5124 * were playing with this I did not want to change the 5125 * calculation per.se. without your thoughts.. Lawerence? 5126 * Peter?? 5127 */ 5128 if (rack->rc_gp_dyn_mul == 0) { 5129 subpart = rack->r_ctl.gp_bw * utim; 5130 subpart /= (srtt * 8); 5131 if (subpart < (rack->r_ctl.gp_bw / 2)) { 5132 /* 5133 * The b/w update takes no more 5134 * away then 1/2 our running total 5135 * so factor it in. 5136 */ 5137 addpart = bytes_ps * utim; 5138 addpart /= (srtt * 8); 5139 meth = 1; 5140 } else { 5141 /* 5142 * Don't allow a single measurement 5143 * to account for more than 1/2 of the 5144 * WMA. This could happen on a retransmission 5145 * where utim becomes huge compared to 5146 * srtt (multiple retransmissions when using 5147 * the sending rate which factors in all the 5148 * transmissions from the first one). 5149 */ 5150 subpart = rack->r_ctl.gp_bw / 2; 5151 addpart = bytes_ps / 2; 5152 meth = 2; 5153 } 5154 rack_log_gp_calc(rack, addpart, subpart, srtt, bytes_ps, utim, meth, __LINE__); 5155 resid_bw = rack->r_ctl.gp_bw - subpart; 5156 rack->r_ctl.gp_bw = resid_bw + addpart; 5157 did_add = 1; 5158 } else { 5159 if ((utim / srtt) <= 1) { 5160 /* 5161 * The b/w update was over a small period 5162 * of time. The idea here is to prevent a small 5163 * measurement time period from counting 5164 * too much. So we scale it based on the 5165 * time so it attributes less than 1/rack_wma_divisor 5166 * of its measurement. 5167 */ 5168 subpart = rack->r_ctl.gp_bw * utim; 5169 subpart /= (srtt * rack_wma_divisor); 5170 addpart = bytes_ps * utim; 5171 addpart /= (srtt * rack_wma_divisor); 5172 meth = 3; 5173 } else { 5174 /* 5175 * The scaled measurement was long 5176 * enough so lets just add in the 5177 * portion of the measurement i.e. 1/rack_wma_divisor 5178 */ 5179 subpart = rack->r_ctl.gp_bw / rack_wma_divisor; 5180 addpart = bytes_ps / rack_wma_divisor; 5181 meth = 4; 5182 } 5183 if ((rack->measure_saw_probe_rtt == 0) || 5184 (bytes_ps > rack->r_ctl.gp_bw)) { 5185 /* 5186 * For probe-rtt we only add it in 5187 * if its larger, all others we just 5188 * add in. 5189 */ 5190 did_add = 1; 5191 rack_log_gp_calc(rack, addpart, subpart, srtt, bytes_ps, utim, meth, __LINE__); 5192 resid_bw = rack->r_ctl.gp_bw - subpart; 5193 rack->r_ctl.gp_bw = resid_bw + addpart; 5194 } 5195 } 5196 rack_set_pace_segments(tp, rack, __LINE__, NULL); 5197 } 5198 /* 5199 * We only watch the growth of the GP during the initial startup 5200 * or first-slowstart that ensues. 
If we ever needed to watch 5201 * growth of gp outside of that period all we need to do is 5202 * remove the first clause of this if (rc_initial_ss_comp). 5203 */ 5204 if ((rack->rc_initial_ss_comp == 0) && 5205 (rack->r_ctl.num_measurements >= RACK_REQ_AVG)) { 5206 uint64_t gp_est; 5207 5208 gp_est = bytes_ps; 5209 if (tcp_bblogging_on(rack->rc_tp)) { 5210 union tcp_log_stackspecific log; 5211 struct timeval tv; 5212 5213 memset(&log, 0, sizeof(log)); 5214 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 5215 log.u_bbr.flex1 = rack->r_ctl.current_round; 5216 log.u_bbr.flex2 = rack->r_ctl.last_rnd_of_gp_rise; 5217 log.u_bbr.delRate = gp_est; 5218 log.u_bbr.cur_del_rate = rack->r_ctl.last_gpest; 5219 log.u_bbr.flex8 = 41; 5220 (void)tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 5221 0, &log, false, NULL, __func__, __LINE__,&tv); 5222 } 5223 if ((rack->r_ctl.num_measurements == RACK_REQ_AVG) || 5224 (rack->r_ctl.last_gpest == 0)) { 5225 /* 5226 * The round we get our measurement averaging going 5227 * is the base round so it always is the source point 5228 * for when we had our first increment. From there on 5229 * we only record the round that had a rise. 5230 */ 5231 rack->r_ctl.last_rnd_of_gp_rise = rack->r_ctl.current_round; 5232 rack->r_ctl.last_gpest = rack->r_ctl.gp_bw; 5233 } else if (gp_est >= rack->r_ctl.last_gpest) { 5234 /* 5235 * Test to see if its gone up enough 5236 * to set the round count up to now. Note 5237 * that on the seeding of the 4th measurement we 5238 */ 5239 gp_est *= 1000; 5240 gp_est /= rack->r_ctl.last_gpest; 5241 if ((uint32_t)gp_est > rack->r_ctl.gp_gain_req) { 5242 /* 5243 * We went up enough to record the round. 5244 */ 5245 if (tcp_bblogging_on(rack->rc_tp)) { 5246 union tcp_log_stackspecific log; 5247 struct timeval tv; 5248 5249 memset(&log, 0, sizeof(log)); 5250 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 5251 log.u_bbr.flex1 = rack->r_ctl.current_round; 5252 log.u_bbr.flex2 = (uint32_t)gp_est; 5253 log.u_bbr.flex3 = rack->r_ctl.gp_gain_req; 5254 log.u_bbr.delRate = gp_est; 5255 log.u_bbr.cur_del_rate = rack->r_ctl.last_gpest; 5256 log.u_bbr.flex8 = 42; 5257 (void)tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 5258 0, &log, false, NULL, __func__, __LINE__,&tv); 5259 } 5260 rack->r_ctl.last_rnd_of_gp_rise = rack->r_ctl.current_round; 5261 if (rack->r_ctl.use_gp_not_last == 1) 5262 rack->r_ctl.last_gpest = rack->r_ctl.gp_bw; 5263 else 5264 rack->r_ctl.last_gpest = bytes_ps; 5265 } 5266 } 5267 } 5268 if ((rack->gp_ready == 0) && 5269 (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) { 5270 /* We have enough measurements now */ 5271 rack->gp_ready = 1; 5272 if (rack->dgp_on || 5273 rack->rack_hibeta) 5274 rack_set_cc_pacing(rack); 5275 if (rack->defer_options) 5276 rack_apply_deferred_options(rack); 5277 } 5278 rack_log_pacing_delay_calc(rack, subpart, addpart, bytes_ps, stim, 5279 rack_get_bw(rack), 22, did_add, NULL, quality); 5280 /* We do not update any multipliers if we are in or have seen a probe-rtt */ 5281 5282 if ((rack->measure_saw_probe_rtt == 0) && 5283 rack->rc_gp_rtt_set) { 5284 if (rack->rc_skip_timely == 0) { 5285 rack_update_multiplier(rack, timely_says, bytes_ps, 5286 rack->r_ctl.rc_gp_srtt, 5287 rack->r_ctl.rc_rtt_diff); 5288 } 5289 } 5290 rack_log_pacing_delay_calc(rack, bytes, tim, bytes_ps, stim, 5291 rack_get_bw(rack), 3, line, NULL, quality); 5292 rack_log_pacing_delay_calc(rack, 5293 bytes, /* flex2 */ 5294 tim, /* flex1 */ 5295 bytes_ps, /* bw_inuse */ 5296 rack->r_ctl.gp_bw, /* delRate */ 5297 
rack_get_lt_bw(rack), /* rttProp */ 5298 20, line, NULL, 0); 5299 /* reset the gp srtt and setup the new prev */ 5300 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; 5301 /* Record the lost count for the next measurement */ 5302 rack->r_ctl.rc_loss_at_start = rack->r_ctl.rc_loss_count; 5303 skip_measurement: 5304 /* 5305 * We restart our diffs based on the gpsrtt in the 5306 * measurement window. 5307 */ 5308 rack->rc_gp_rtt_set = 0; 5309 rack->rc_gp_saw_rec = 0; 5310 rack->rc_gp_saw_ca = 0; 5311 rack->rc_gp_saw_ss = 0; 5312 rack->rc_dragged_bottom = 0; 5313 if (quality == RACK_QUALITY_HIGH) { 5314 /* 5315 * Gput in the stats world is in kbps where bytes_ps is 5316 * bytes per second so we do ((x * 8)/ 1000). 5317 */ 5318 gput = (int32_t)((bytes_ps << 3) / (uint64_t)1000); 5319 #ifdef STATS 5320 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, 5321 gput); 5322 /* 5323 * XXXLAS: This is a temporary hack, and should be 5324 * chained off VOI_TCP_GPUT when stats(9) grows an 5325 * API to deal with chained VOIs. 5326 */ 5327 if (tp->t_stats_gput_prev > 0) 5328 stats_voi_update_abs_s32(tp->t_stats, 5329 VOI_TCP_GPUT_ND, 5330 ((gput - tp->t_stats_gput_prev) * 100) / 5331 tp->t_stats_gput_prev); 5332 #endif 5333 tp->t_stats_gput_prev = gput; 5334 } 5335 tp->t_flags &= ~TF_GPUTINPROG; 5336 /* 5337 * Now are we app limited now and there is space from where we 5338 * were to where we want to go? 5339 * 5340 * We don't do the other case i.e. non-applimited here since 5341 * the next send will trigger us picking up the missing data. 5342 */ 5343 if (rack->r_ctl.rc_first_appl && 5344 TCPS_HAVEESTABLISHED(tp->t_state) && 5345 rack->r_ctl.rc_app_limited_cnt && 5346 (SEQ_GT(rack->r_ctl.rc_first_appl->r_start, th_ack)) && 5347 ((rack->r_ctl.rc_first_appl->r_end - th_ack) > 5348 max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { 5349 /* 5350 * Yep there is enough outstanding to make a measurement here. 5351 */ 5352 struct rack_sendmap *rsm; 5353 5354 rack->r_ctl.rc_gp_lowrtt = 0xffffffff; 5355 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 5356 tp->gput_ts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time); 5357 rack->app_limited_needs_set = 0; 5358 tp->gput_seq = th_ack; 5359 if (rack->in_probe_rtt) 5360 rack->measure_saw_probe_rtt = 1; 5361 else if ((rack->measure_saw_probe_rtt) && 5362 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 5363 rack->measure_saw_probe_rtt = 0; 5364 if ((rack->r_ctl.rc_first_appl->r_end - th_ack) >= rack_get_measure_window(tp, rack)) { 5365 /* There is a full window to gain info from */ 5366 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 5367 } else { 5368 /* We can only measure up to the applimited point */ 5369 tp->gput_ack = tp->gput_seq + (rack->r_ctl.rc_first_appl->r_end - th_ack); 5370 if ((tp->gput_ack - tp->gput_seq) < (MIN_GP_WIN * segsiz)) { 5371 /* 5372 * We don't have enough to make a measurement. 5373 */ 5374 tp->t_flags &= ~TF_GPUTINPROG; 5375 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq, 5376 0, 0, 0, 6, __LINE__, NULL, quality); 5377 return; 5378 } 5379 } 5380 if (tp->t_state >= TCPS_FIN_WAIT_1) { 5381 /* 5382 * We will get no more data into the SB 5383 * this means we need to have the data available 5384 * before we start a measurement. 5385 */ 5386 if (sbavail(&tptosocket(tp)->so_snd) < (tp->gput_ack - tp->gput_seq)) { 5387 /* Nope not enough data. 
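 * We give up on starting a measurement here; past
 * FIN_WAIT_1 nothing new will arrive in the socket
 * buffer to cover the window we would need.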
*/ 5388 return; 5389 } 5390 } 5391 tp->t_flags |= TF_GPUTINPROG; 5392 /* 5393 * Now we need to find the timestamp of the send at tp->gput_seq 5394 * for the send based measurement. 5395 */ 5396 rack->r_ctl.rc_gp_cumack_ts = 0; 5397 rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq); 5398 if (rsm) { 5399 /* Ok send-based limit is set */ 5400 if (SEQ_LT(rsm->r_start, tp->gput_seq)) { 5401 /* 5402 * Move back to include the earlier part 5403 * so our ack time lines up right (this may 5404 * make an overlapping measurement but thats 5405 * ok). 5406 */ 5407 tp->gput_seq = rsm->r_start; 5408 } 5409 if (rsm->r_flags & RACK_ACKED) { 5410 struct rack_sendmap *nrsm; 5411 5412 tp->gput_ts = (uint32_t)rsm->r_ack_arrival; 5413 tp->gput_seq = rsm->r_end; 5414 nrsm = tqhash_next(rack->r_ctl.tqh, rsm); 5415 if (nrsm) 5416 rsm = nrsm; 5417 else { 5418 rack->app_limited_needs_set = 1; 5419 } 5420 } else 5421 rack->app_limited_needs_set = 1; 5422 /* We always go from the first send */ 5423 rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[0]; 5424 } else { 5425 /* 5426 * If we don't find the rsm due to some 5427 * send-limit set the current time, which 5428 * basically disables the send-limit. 5429 */ 5430 struct timeval tv; 5431 5432 microuptime(&tv); 5433 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); 5434 } 5435 rack_tend_gp_marks(tp, rack); 5436 rack_log_pacing_delay_calc(rack, 5437 tp->gput_seq, 5438 tp->gput_ack, 5439 (uintptr_t)rsm, 5440 tp->gput_ts, 5441 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts), 5442 9, 5443 __LINE__, rsm, quality); 5444 rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL); 5445 } else { 5446 /* 5447 * To make sure proper timestamp merging occurs, we need to clear 5448 * all GP marks if we don't start a measurement. 5449 */ 5450 rack_clear_gp_marks(tp, rack); 5451 } 5452 } 5453 5454 /* 5455 * CC wrapper hook functions 5456 */ 5457 static void 5458 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint16_t nsegs, 5459 uint16_t type, int32_t post_recovery) 5460 { 5461 uint32_t prior_cwnd, acked; 5462 struct tcp_log_buffer *lgb = NULL; 5463 uint8_t labc_to_use, quality; 5464 5465 INP_WLOCK_ASSERT(tptoinpcb(tp)); 5466 tp->t_ccv.nsegs = nsegs; 5467 acked = tp->t_ccv.bytes_this_ack = (th_ack - tp->snd_una); 5468 if ((post_recovery) && (rack->r_ctl.rc_early_recovery_segs)) { 5469 uint32_t max; 5470 5471 max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp); 5472 if (tp->t_ccv.bytes_this_ack > max) { 5473 tp->t_ccv.bytes_this_ack = max; 5474 } 5475 } 5476 #ifdef STATS 5477 stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, 5478 ((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd); 5479 #endif 5480 if ((th_ack == tp->snd_max) && rack->lt_bw_up) { 5481 /* 5482 * We will ack all the data, time to end any 5483 * lt_bw_up we have running until something 5484 * new is sent. Note we need to use the actual 5485 * ack_rcv_time which with pacing may be different. 
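 *
 * Conceptually the long-term estimate is just the ratio of the
 * two accumulators updated here, roughly
 *
 *   lt_bw = lt_bw_bytes * HPTS_USEC_IN_SEC / lt_bw_time;
 *
 * (a sketch of what rack_get_lt_bw() works out, leaving aside its
 * handling of a still-running interval).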
5486 */ 5487 uint64_t tmark; 5488 5489 rack->r_ctl.lt_bw_bytes += (tp->snd_max - rack->r_ctl.lt_seq); 5490 rack->r_ctl.lt_seq = tp->snd_max; 5491 tmark = tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time); 5492 if (tmark >= rack->r_ctl.lt_timemark) { 5493 rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark); 5494 } 5495 rack->r_ctl.lt_timemark = tmark; 5496 rack->lt_bw_up = 0; 5497 } 5498 quality = RACK_QUALITY_NONE; 5499 if ((tp->t_flags & TF_GPUTINPROG) && 5500 rack_enough_for_measurement(tp, rack, th_ack, &quality)) { 5501 /* Measure the Goodput */ 5502 rack_do_goodput_measurement(tp, rack, th_ack, __LINE__, quality); 5503 } 5504 /* Which way our we limited, if not cwnd limited no advance in CA */ 5505 if (tp->snd_cwnd <= tp->snd_wnd) 5506 tp->t_ccv.flags |= CCF_CWND_LIMITED; 5507 else 5508 tp->t_ccv.flags &= ~CCF_CWND_LIMITED; 5509 if (tp->snd_cwnd > tp->snd_ssthresh) { 5510 tp->t_bytes_acked += min(tp->t_ccv.bytes_this_ack, 5511 nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp)); 5512 /* For the setting of a window past use the actual scwnd we are using */ 5513 if (tp->t_bytes_acked >= rack->r_ctl.cwnd_to_use) { 5514 tp->t_bytes_acked -= rack->r_ctl.cwnd_to_use; 5515 tp->t_ccv.flags |= CCF_ABC_SENTAWND; 5516 } 5517 } else { 5518 tp->t_ccv.flags &= ~CCF_ABC_SENTAWND; 5519 tp->t_bytes_acked = 0; 5520 } 5521 prior_cwnd = tp->snd_cwnd; 5522 if ((post_recovery == 0) || (rack_max_abc_post_recovery == 0) || rack->r_use_labc_for_rec || 5523 (rack_client_low_buf && rack->client_bufferlvl && 5524 (rack->client_bufferlvl < rack_client_low_buf))) 5525 labc_to_use = rack->rc_labc; 5526 else 5527 labc_to_use = rack_max_abc_post_recovery; 5528 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 5529 union tcp_log_stackspecific log; 5530 struct timeval tv; 5531 5532 memset(&log, 0, sizeof(log)); 5533 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 5534 log.u_bbr.flex1 = th_ack; 5535 log.u_bbr.flex2 = tp->t_ccv.flags; 5536 log.u_bbr.flex3 = tp->t_ccv.bytes_this_ack; 5537 log.u_bbr.flex4 = tp->t_ccv.nsegs; 5538 log.u_bbr.flex5 = labc_to_use; 5539 log.u_bbr.flex6 = prior_cwnd; 5540 log.u_bbr.flex7 = V_tcp_do_newsack; 5541 log.u_bbr.flex8 = 1; 5542 lgb = tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 5543 0, &log, false, NULL, __func__, __LINE__,&tv); 5544 } 5545 if (CC_ALGO(tp)->ack_received != NULL) { 5546 /* XXXLAS: Find a way to live without this */ 5547 tp->t_ccv.curack = th_ack; 5548 tp->t_ccv.labc = labc_to_use; 5549 tp->t_ccv.flags |= CCF_USE_LOCAL_ABC; 5550 CC_ALGO(tp)->ack_received(&tp->t_ccv, type); 5551 } 5552 if (lgb) { 5553 lgb->tlb_stackinfo.u_bbr.flex6 = tp->snd_cwnd; 5554 } 5555 if (rack->r_must_retran) { 5556 if (SEQ_GEQ(th_ack, rack->r_ctl.rc_snd_max_at_rto)) { 5557 /* 5558 * We now are beyond the rxt point so lets disable 5559 * the flag. 5560 */ 5561 rack->r_ctl.rc_out_at_rto = 0; 5562 rack->r_must_retran = 0; 5563 } else if ((prior_cwnd + ctf_fixed_maxseg(tp)) <= tp->snd_cwnd) { 5564 /* 5565 * Only decrement the rc_out_at_rto if the cwnd advances 5566 * at least a whole segment. Otherwise next time the peer 5567 * acks, we won't be able to send this generaly happens 5568 * when we are in Congestion Avoidance. 
5569 */ 5570 if (acked <= rack->r_ctl.rc_out_at_rto){ 5571 rack->r_ctl.rc_out_at_rto -= acked; 5572 } else { 5573 rack->r_ctl.rc_out_at_rto = 0; 5574 } 5575 } 5576 } 5577 #ifdef STATS 5578 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, rack->r_ctl.cwnd_to_use); 5579 #endif 5580 if (rack->r_ctl.rc_rack_largest_cwnd < rack->r_ctl.cwnd_to_use) { 5581 rack->r_ctl.rc_rack_largest_cwnd = rack->r_ctl.cwnd_to_use; 5582 } 5583 if ((rack->rc_initial_ss_comp == 0) && 5584 (tp->snd_cwnd >= tp->snd_ssthresh)) { 5585 /* 5586 * The cwnd has grown beyond ssthresh we have 5587 * entered ca and completed our first Slowstart. 5588 */ 5589 rack->rc_initial_ss_comp = 1; 5590 } 5591 } 5592 5593 static void 5594 tcp_rack_partialack(struct tcpcb *tp) 5595 { 5596 struct tcp_rack *rack; 5597 5598 rack = (struct tcp_rack *)tp->t_fb_ptr; 5599 INP_WLOCK_ASSERT(tptoinpcb(tp)); 5600 /* 5601 * If we are doing PRR and have enough 5602 * room to send <or> we are pacing and prr 5603 * is disabled we will want to see if we 5604 * can send data (by setting r_wanted_output to 5605 * true). 5606 */ 5607 if ((rack->r_ctl.rc_prr_sndcnt > 0) || 5608 rack->rack_no_prr) 5609 rack->r_wanted_output = 1; 5610 } 5611 5612 static void 5613 rack_exit_recovery(struct tcpcb *tp, struct tcp_rack *rack, int how) 5614 { 5615 /* 5616 * Now exit recovery. 5617 */ 5618 EXIT_RECOVERY(tp->t_flags); 5619 } 5620 5621 static void 5622 rack_post_recovery(struct tcpcb *tp, uint32_t th_ack) 5623 { 5624 struct tcp_rack *rack; 5625 uint32_t orig_cwnd; 5626 5627 orig_cwnd = tp->snd_cwnd; 5628 INP_WLOCK_ASSERT(tptoinpcb(tp)); 5629 rack = (struct tcp_rack *)tp->t_fb_ptr; 5630 /* only alert CC if we alerted when we entered */ 5631 if (CC_ALGO(tp)->post_recovery != NULL) { 5632 tp->t_ccv.curack = th_ack; 5633 CC_ALGO(tp)->post_recovery(&tp->t_ccv); 5634 if (tp->snd_cwnd < tp->snd_ssthresh) { 5635 /* 5636 * Rack has burst control and pacing 5637 * so lets not set this any lower than 5638 * snd_ssthresh per RFC-6582 (option 2). 5639 */ 5640 tp->snd_cwnd = tp->snd_ssthresh; 5641 } 5642 } 5643 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 5644 union tcp_log_stackspecific log; 5645 struct timeval tv; 5646 5647 memset(&log, 0, sizeof(log)); 5648 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 5649 log.u_bbr.flex1 = th_ack; 5650 log.u_bbr.flex2 = tp->t_ccv.flags; 5651 log.u_bbr.flex3 = tp->t_ccv.bytes_this_ack; 5652 log.u_bbr.flex4 = tp->t_ccv.nsegs; 5653 log.u_bbr.flex5 = V_tcp_abc_l_var; 5654 log.u_bbr.flex6 = orig_cwnd; 5655 log.u_bbr.flex7 = V_tcp_do_newsack; 5656 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 5657 log.u_bbr.flex8 = 2; 5658 tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 5659 0, &log, false, NULL, __func__, __LINE__, &tv); 5660 } 5661 if ((rack->rack_no_prr == 0) && 5662 (rack->no_prr_addback == 0) && 5663 (rack->r_ctl.rc_prr_sndcnt > 0)) { 5664 /* 5665 * Suck the next prr cnt back into cwnd, but 5666 * only do that if we are not application limited. 5667 */ 5668 if (ctf_outstanding(tp) <= sbavail(&tptosocket(tp)->so_snd)) { 5669 /* 5670 * We are allowed to add back to the cwnd the amount we did 5671 * not get out if: 5672 * a) no_prr_addback is off. 5673 * b) we are not app limited 5674 * c) we are doing prr 5675 * <and> 5676 * d) it is bounded by rack_prr_addbackmax (if addback is 0, then none). 
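 *
 * For instance, were rack_prr_addbackmax set to 2 with a 1448
 * byte maxseg, at most 2896 bytes of unsent PRR credit would be
 * folded back into cwnd here, however large rc_prr_sndcnt had
 * grown.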
5677 */ 5678 tp->snd_cwnd += min((ctf_fixed_maxseg(tp) * rack_prr_addbackmax), 5679 rack->r_ctl.rc_prr_sndcnt); 5680 } 5681 rack->r_ctl.rc_prr_sndcnt = 0; 5682 rack_log_to_prr(rack, 1, 0, __LINE__); 5683 } 5684 rack_log_to_prr(rack, 14, orig_cwnd, __LINE__); 5685 tp->snd_recover = tp->snd_una; 5686 if (rack->r_ctl.dsack_persist) { 5687 rack->r_ctl.dsack_persist--; 5688 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) { 5689 rack->r_ctl.num_dsack = 0; 5690 } 5691 rack_log_dsack_event(rack, 1, __LINE__, 0, 0); 5692 } 5693 if (rack->rto_from_rec == 1) { 5694 rack->rto_from_rec = 0; 5695 if (rack->r_ctl.rto_ssthresh > tp->snd_ssthresh) 5696 tp->snd_ssthresh = rack->r_ctl.rto_ssthresh; 5697 } 5698 rack_exit_recovery(tp, rack, 1); 5699 } 5700 5701 static void 5702 rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack, int line) 5703 { 5704 struct tcp_rack *rack; 5705 uint32_t ssthresh_enter, cwnd_enter, in_rec_at_entry, orig_cwnd; 5706 5707 INP_WLOCK_ASSERT(tptoinpcb(tp)); 5708 #ifdef STATS 5709 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type); 5710 #endif 5711 if (IN_RECOVERY(tp->t_flags) == 0) { 5712 in_rec_at_entry = 0; 5713 ssthresh_enter = tp->snd_ssthresh; 5714 cwnd_enter = tp->snd_cwnd; 5715 } else 5716 in_rec_at_entry = 1; 5717 rack = (struct tcp_rack *)tp->t_fb_ptr; 5718 switch (type) { 5719 case CC_NDUPACK: 5720 tp->t_flags &= ~TF_WASFRECOVERY; 5721 tp->t_flags &= ~TF_WASCRECOVERY; 5722 if (!IN_FASTRECOVERY(tp->t_flags)) { 5723 /* Check if this is the end of the initial Start-up i.e. initial slow-start */ 5724 if (rack->rc_initial_ss_comp == 0) { 5725 /* Yep it is the end of the initial slowstart */ 5726 rack->rc_initial_ss_comp = 1; 5727 } 5728 rack->r_ctl.rc_prr_delivered = 0; 5729 rack->r_ctl.rc_prr_out = 0; 5730 rack->r_fast_output = 0; 5731 if (rack->rack_no_prr == 0) { 5732 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 5733 rack_log_to_prr(rack, 2, in_rec_at_entry, line); 5734 } 5735 rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una; 5736 tp->snd_recover = tp->snd_max; 5737 if (tp->t_flags2 & TF2_ECN_PERMIT) 5738 tp->t_flags2 |= TF2_ECN_SND_CWR; 5739 } 5740 break; 5741 case CC_ECN: 5742 if (!IN_CONGRECOVERY(tp->t_flags) || 5743 /* 5744 * Allow ECN reaction on ACK to CWR, if 5745 * that data segment was also CE marked. 5746 */ 5747 SEQ_GEQ(ack, tp->snd_recover)) { 5748 EXIT_CONGRECOVERY(tp->t_flags); 5749 KMOD_TCPSTAT_INC(tcps_ecn_rcwnd); 5750 rack->r_fast_output = 0; 5751 tp->snd_recover = tp->snd_max + 1; 5752 if (tp->t_flags2 & TF2_ECN_PERMIT) 5753 tp->t_flags2 |= TF2_ECN_SND_CWR; 5754 } 5755 break; 5756 case CC_RTO: 5757 tp->t_dupacks = 0; 5758 tp->t_bytes_acked = 0; 5759 rack->r_fast_output = 0; 5760 if (IN_RECOVERY(tp->t_flags)) 5761 rack_exit_recovery(tp, rack, 2); 5762 orig_cwnd = tp->snd_cwnd; 5763 rack_log_to_prr(rack, 16, orig_cwnd, line); 5764 if (CC_ALGO(tp)->cong_signal == NULL) { 5765 /* TSNH */ 5766 tp->snd_ssthresh = max(2, 5767 min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 / 5768 ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp); 5769 tp->snd_cwnd = ctf_fixed_maxseg(tp); 5770 } 5771 if (tp->t_flags2 & TF2_ECN_PERMIT) 5772 tp->t_flags2 |= TF2_ECN_SND_CWR; 5773 break; 5774 case CC_RTO_ERR: 5775 KMOD_TCPSTAT_INC(tcps_sndrexmitbad); 5776 /* RTO was unnecessary, so reset everything. 
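 * This is the classic bad-retransmit rollback: cwnd, ssthresh
 * and snd_recover come back from their *_prev copies and any
 * recovery state that was in force when the spurious RTO fired
 * is re-entered.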
*/ 5777 tp->snd_cwnd = tp->snd_cwnd_prev; 5778 tp->snd_ssthresh = tp->snd_ssthresh_prev; 5779 tp->snd_recover = tp->snd_recover_prev; 5780 if (tp->t_flags & TF_WASFRECOVERY) { 5781 ENTER_FASTRECOVERY(tp->t_flags); 5782 tp->t_flags &= ~TF_WASFRECOVERY; 5783 } 5784 if (tp->t_flags & TF_WASCRECOVERY) { 5785 ENTER_CONGRECOVERY(tp->t_flags); 5786 tp->t_flags &= ~TF_WASCRECOVERY; 5787 } 5788 tp->snd_nxt = tp->snd_max; 5789 tp->t_badrxtwin = 0; 5790 break; 5791 } 5792 if ((CC_ALGO(tp)->cong_signal != NULL) && 5793 (type != CC_RTO)){ 5794 tp->t_ccv.curack = ack; 5795 CC_ALGO(tp)->cong_signal(&tp->t_ccv, type); 5796 } 5797 if ((in_rec_at_entry == 0) && IN_RECOVERY(tp->t_flags)) { 5798 rack_log_to_prr(rack, 15, cwnd_enter, line); 5799 rack->r_ctl.dsack_byte_cnt = 0; 5800 rack->r_ctl.retran_during_recovery = 0; 5801 rack->r_ctl.rc_cwnd_at_erec = cwnd_enter; 5802 rack->r_ctl.rc_ssthresh_at_erec = ssthresh_enter; 5803 rack->r_ent_rec_ns = 1; 5804 } 5805 } 5806 5807 static inline void 5808 rack_cc_after_idle(struct tcp_rack *rack, struct tcpcb *tp) 5809 { 5810 uint32_t i_cwnd; 5811 5812 INP_WLOCK_ASSERT(tptoinpcb(tp)); 5813 5814 if (CC_ALGO(tp)->after_idle != NULL) 5815 CC_ALGO(tp)->after_idle(&tp->t_ccv); 5816 5817 if (tp->snd_cwnd == 1) 5818 i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ 5819 else 5820 i_cwnd = rc_init_window(rack); 5821 5822 /* 5823 * Being idle is no different than the initial window. If the cc 5824 * clamps it down below the initial window raise it to the initial 5825 * window. 5826 */ 5827 if (tp->snd_cwnd < i_cwnd) { 5828 tp->snd_cwnd = i_cwnd; 5829 } 5830 } 5831 5832 /* 5833 * Indicate whether this ack should be delayed. We can delay the ack if 5834 * following conditions are met: 5835 * - There is no delayed ack timer in progress. 5836 * - Our last ack wasn't a 0-sized window. We never want to delay 5837 * the ack that opens up a 0-sized window. 5838 * - LRO wasn't used for this segment. We make sure by checking that the 5839 * segment size is not larger than the MSS. 5840 * - Delayed acks are enabled or this is a half-synchronized T/TCP 5841 * connection. 5842 */ 5843 #define DELAY_ACK(tp, tlen) \ 5844 (((tp->t_flags & TF_RXWIN0SENT) == 0) && \ 5845 ((tp->t_flags & TF_DELACK) == 0) && \ 5846 (tlen <= tp->t_maxseg) && \ 5847 (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN))) 5848 5849 static struct rack_sendmap * 5850 rack_find_lowest_rsm(struct tcp_rack *rack) 5851 { 5852 struct rack_sendmap *rsm; 5853 5854 /* 5855 * Walk the time-order transmitted list looking for an rsm that is 5856 * not acked. This will be the one that was sent the longest time 5857 * ago that is still outstanding. 5858 */ 5859 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 5860 if (rsm->r_flags & RACK_ACKED) { 5861 continue; 5862 } 5863 goto finish; 5864 } 5865 finish: 5866 return (rsm); 5867 } 5868 5869 static struct rack_sendmap * 5870 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm) 5871 { 5872 struct rack_sendmap *prsm; 5873 5874 /* 5875 * Walk the sequence order list backward until we hit and arrive at 5876 * the highest seq not acked. In theory when this is called it 5877 * should be the last segment (which it was not). 
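 * (The walk simply skips entries flagged RACK_ACKED or
 * RACK_HAS_FIN and returns the first live one it finds going
 * backwards, or NULL if everything behind rsm has been acked.)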
5878 */ 5879 prsm = rsm; 5880 5881 TQHASH_FOREACH_REVERSE_FROM(prsm, rack->r_ctl.tqh) { 5882 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) { 5883 continue; 5884 } 5885 return (prsm); 5886 } 5887 return (NULL); 5888 } 5889 5890 static uint32_t 5891 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts, int line, int log_allowed) 5892 { 5893 int32_t lro; 5894 uint32_t thresh; 5895 5896 /* 5897 * lro is the flag we use to determine if we have seen reordering. 5898 * If it gets set we have seen reordering. The reorder logic either 5899 * works in one of two ways: 5900 * 5901 * If reorder-fade is configured, then we track the last time we saw 5902 * re-ordering occur. If we reach the point where enough time has 5903 * passed we no longer consider reordering as occurring. 5904 * 5905 * Or if reorder-fade is 0, then once we see reordering we consider 5906 * the connection to always be subject to reordering and just set lro 5907 * to 1. 5908 * 5909 * In the end if lro is non-zero we add the extra time for 5910 * reordering in. 5911 */ 5912 if (srtt == 0) 5913 srtt = 1; 5914 if (rack->r_ctl.rc_reorder_ts) { 5915 if (rack->r_ctl.rc_reorder_fade) { 5916 if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) { 5917 lro = cts - rack->r_ctl.rc_reorder_ts; 5918 if (lro == 0) { 5919 /* 5920 * No time has passed since the last 5921 * reorder, mark it as reordering. 5922 */ 5923 lro = 1; 5924 } 5925 } else { 5926 /* Negative time? */ 5927 lro = 0; 5928 } 5929 if (lro > rack->r_ctl.rc_reorder_fade) { 5930 /* Turn off reordering seen too */ 5931 rack->r_ctl.rc_reorder_ts = 0; 5932 lro = 0; 5933 } 5934 } else { 5935 /* Reordering does not fade */ 5936 lro = 1; 5937 } 5938 } else { 5939 lro = 0; 5940 } 5941 if (rack->rc_rack_tmr_std_based == 0) { 5942 thresh = srtt + rack->r_ctl.rc_pkt_delay; 5943 } else { 5944 /* Standards based pkt-delay is 1/4 srtt */ 5945 thresh = srtt + (srtt >> 2); 5946 } 5947 if (lro && (rack->rc_rack_tmr_std_based == 0)) { 5948 /* It must be set, if not you get 1/4 rtt */ 5949 if (rack->r_ctl.rc_reorder_shift) 5950 thresh += (srtt >> rack->r_ctl.rc_reorder_shift); 5951 else 5952 thresh += (srtt >> 2); 5953 } 5954 if (rack->rc_rack_use_dsack && 5955 lro && 5956 (rack->r_ctl.num_dsack > 0)) { 5957 /* 5958 * We only increase the reordering window if we 5959 * have seen reordering <and> we have a DSACK count.
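 *
 * Worked example (made-up numbers): srtt = 40000 usec and
 * num_dsack = 2 add 2 * (srtt >> 2) = 20000 usec here, before
 * the srtt * 2 ceiling and the rack_rto_max clamp below are
 * applied.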
5960 */ 5961 thresh += rack->r_ctl.num_dsack * (srtt >> 2); 5962 if (log_allowed) 5963 rack_log_dsack_event(rack, 4, line, srtt, thresh); 5964 } 5965 /* SRTT * 2 is the ceiling */ 5966 if (thresh > (srtt * 2)) { 5967 thresh = srtt * 2; 5968 } 5969 /* And we don't want it above the RTO max either */ 5970 if (thresh > rack_rto_max) { 5971 thresh = rack_rto_max; 5972 } 5973 if (log_allowed) 5974 rack_log_dsack_event(rack, 6, line, srtt, thresh); 5975 return (thresh); 5976 } 5977 5978 static uint32_t 5979 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack, 5980 struct rack_sendmap *rsm, uint32_t srtt) 5981 { 5982 struct rack_sendmap *prsm; 5983 uint32_t thresh, len; 5984 int segsiz; 5985 5986 if (srtt == 0) 5987 srtt = 1; 5988 if (rack->r_ctl.rc_tlp_threshold) 5989 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold); 5990 else 5991 thresh = (srtt * 2); 5992 5993 /* Get the previous sent packet, if any */ 5994 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 5995 len = rsm->r_end - rsm->r_start; 5996 if (rack->rack_tlp_threshold_use == TLP_USE_ID) { 5997 /* Exactly like the ID */ 5998 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= segsiz) { 5999 uint32_t alt_thresh; 6000 /* 6001 * Compensate for delayed-ack with the d-ack time. 6002 */ 6003 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 6004 if (alt_thresh > thresh) 6005 thresh = alt_thresh; 6006 } 6007 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) { 6008 /* 2.1 behavior */ 6009 prsm = TAILQ_PREV(rsm, rack_head, r_tnext); 6010 if (prsm && (len <= segsiz)) { 6011 /* 6012 * Two packets outstanding, thresh should be (2*srtt) + 6013 * possible inter-packet delay (if any). 6014 */ 6015 uint32_t inter_gap = 0; 6016 int idx, nidx; 6017 6018 idx = rsm->r_rtr_cnt - 1; 6019 nidx = prsm->r_rtr_cnt - 1; 6020 if (rsm->r_tim_lastsent[nidx] >= prsm->r_tim_lastsent[idx]) { 6021 /* Yes it was sent later (or at the same time) */ 6022 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; 6023 } 6024 thresh += inter_gap; 6025 } else if (len <= segsiz) { 6026 /* 6027 * Possibly compensate for delayed-ack. 6028 */ 6029 uint32_t alt_thresh; 6030 6031 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 6032 if (alt_thresh > thresh) 6033 thresh = alt_thresh; 6034 } 6035 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) { 6036 /* 2.2 behavior */ 6037 if (len <= segsiz) { 6038 uint32_t alt_thresh; 6039 /* 6040 * Compensate for delayed-ack with the d-ack time. 6041 */ 6042 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 6043 if (alt_thresh > thresh) 6044 thresh = alt_thresh; 6045 } 6046 } 6047 /* Not above an RTO */ 6048 if (thresh > tp->t_rxtcur) { 6049 thresh = tp->t_rxtcur; 6050 } 6051 /* Not above a RTO max */ 6052 if (thresh > rack_rto_max) { 6053 thresh = rack_rto_max; 6054 } 6055 /* Apply user supplied min TLP */ 6056 if (thresh < rack_tlp_min) { 6057 thresh = rack_tlp_min; 6058 } 6059 return (thresh); 6060 } 6061 6062 static uint32_t 6063 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack) 6064 { 6065 /* 6066 * We want the rack_rtt which is the 6067 * last rtt we measured. However if that 6068 * does not exist we fallback to the srtt (which 6069 * we probably will never do) and then as a last 6070 * resort we use RACK_INITIAL_RTO if no srtt is 6071 * yet set. 
6072 */ 6073 if (rack->rc_rack_rtt) 6074 return (rack->rc_rack_rtt); 6075 else if (tp->t_srtt == 0) 6076 return (RACK_INITIAL_RTO); 6077 return (tp->t_srtt); 6078 } 6079 6080 static struct rack_sendmap * 6081 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) 6082 { 6083 /* 6084 * Check to see that we don't need to fall into recovery. We will 6085 * need to do so if our oldest transmit is past the time we should 6086 * have had an ack. 6087 */ 6088 struct tcp_rack *rack; 6089 struct rack_sendmap *rsm; 6090 int32_t idx; 6091 uint32_t srtt, thresh; 6092 6093 rack = (struct tcp_rack *)tp->t_fb_ptr; 6094 if (tqhash_empty(rack->r_ctl.tqh)) { 6095 return (NULL); 6096 } 6097 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 6098 if (rsm == NULL) 6099 return (NULL); 6100 6101 6102 if (rsm->r_flags & RACK_ACKED) { 6103 rsm = rack_find_lowest_rsm(rack); 6104 if (rsm == NULL) 6105 return (NULL); 6106 } 6107 idx = rsm->r_rtr_cnt - 1; 6108 srtt = rack_grab_rtt(tp, rack); 6109 thresh = rack_calc_thresh_rack(rack, srtt, tsused, __LINE__, 1); 6110 if (TSTMP_LT(tsused, ((uint32_t)rsm->r_tim_lastsent[idx]))) { 6111 return (NULL); 6112 } 6113 if ((tsused - ((uint32_t)rsm->r_tim_lastsent[idx])) < thresh) { 6114 return (NULL); 6115 } 6116 /* Ok if we reach here we are over-due and this guy can be sent */ 6117 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__); 6118 return (rsm); 6119 } 6120 6121 static uint32_t 6122 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack) 6123 { 6124 int32_t t; 6125 int32_t tt; 6126 uint32_t ret_val; 6127 6128 t = (tp->t_srtt + (tp->t_rttvar << 2)); 6129 RACK_TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], 6130 rack_persist_min, rack_persist_max, rack->r_ctl.timer_slop); 6131 rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT; 6132 ret_val = (uint32_t)tt; 6133 return (ret_val); 6134 } 6135 6136 static uint32_t 6137 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack) 6138 { 6139 /* 6140 * Start the FR timer, we do this based on getting the first one in 6141 * the rc_tmap. Note that if its NULL we must stop the timer. in all 6142 * events we need to stop the running timer (if its running) before 6143 * starting the new one. 6144 */ 6145 uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse; 6146 uint32_t srtt_cur; 6147 int32_t idx; 6148 int32_t is_tlp_timer = 0; 6149 struct rack_sendmap *rsm; 6150 6151 if (rack->t_timers_stopped) { 6152 /* All timers have been stopped none are to run */ 6153 return (0); 6154 } 6155 if (rack->rc_in_persist) { 6156 /* We can't start any timer in persists */ 6157 return (rack_get_persists_timer_val(tp, rack)); 6158 } 6159 rack->rc_on_min_to = 0; 6160 if ((tp->t_state < TCPS_ESTABLISHED) || 6161 ((tp->t_flags & TF_SACK_PERMIT) == 0)) { 6162 goto activate_rxt; 6163 } 6164 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 6165 if ((rsm == NULL) || sup_rack) { 6166 /* Nothing on the send map or no rack */ 6167 activate_rxt: 6168 time_since_sent = 0; 6169 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 6170 if (rsm) { 6171 /* 6172 * Should we discount the RTX timer any? 6173 * 6174 * We want to discount it the smallest amount. 6175 * If a timer (Rack/TLP or RXT) has gone off more 6176 * recently thats the discount we want to use (now - timer time). 6177 * If the retransmit of the oldest packet was more recent then 6178 * we want to use that (now - oldest-packet-last_transmit_time). 
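 *
 * Example with invented numbers: t_rxtcur = 200 ms and the more
 * recent of those two events was 30 ms ago, so the timer armed
 * below ends up around 170 ms rather than a full fresh RTO
 * (floored at rc_min_to).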
6179 * 6180 */ 6181 idx = rsm->r_rtr_cnt - 1; 6182 if (TSTMP_GEQ(rack->r_ctl.rc_tlp_rxt_last_time, ((uint32_t)rsm->r_tim_lastsent[idx]))) 6183 tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time; 6184 else 6185 tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx]; 6186 if (TSTMP_GT(cts, tstmp_touse)) 6187 time_since_sent = cts - tstmp_touse; 6188 } 6189 if (SEQ_LT(tp->snd_una, tp->snd_max) || 6190 sbavail(&tptosocket(tp)->so_snd)) { 6191 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; 6192 to = tp->t_rxtcur; 6193 if (to > time_since_sent) 6194 to -= time_since_sent; 6195 else 6196 to = rack->r_ctl.rc_min_to; 6197 if (to == 0) 6198 to = 1; 6199 /* Special case for KEEPINIT */ 6200 if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) && 6201 (TP_KEEPINIT(tp) != 0) && 6202 rsm) { 6203 /* 6204 * We have to put a ceiling on the rxt timer 6205 * of the keep-init timeout. 6206 */ 6207 uint32_t max_time, red; 6208 6209 max_time = TICKS_2_USEC(TP_KEEPINIT(tp)); 6210 if (TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) { 6211 red = (cts - (uint32_t)rsm->r_tim_lastsent[0]); 6212 if (red < max_time) 6213 max_time -= red; 6214 else 6215 max_time = 1; 6216 } 6217 /* Reduce timeout to the keep value if needed */ 6218 if (max_time < to) 6219 to = max_time; 6220 } 6221 return (to); 6222 } 6223 return (0); 6224 } 6225 if (rsm->r_flags & RACK_ACKED) { 6226 rsm = rack_find_lowest_rsm(rack); 6227 if (rsm == NULL) { 6228 /* No lowest? */ 6229 goto activate_rxt; 6230 } 6231 } 6232 /* Convert from ms to usecs */ 6233 if ((rsm->r_flags & RACK_SACK_PASSED) || 6234 (rsm->r_flags & RACK_RWND_COLLAPSED) || 6235 (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { 6236 if ((tp->t_flags & TF_SENTFIN) && 6237 ((tp->snd_max - tp->snd_una) == 1) && 6238 (rsm->r_flags & RACK_HAS_FIN)) { 6239 /* 6240 * We don't start a rack timer if all we have is a 6241 * FIN outstanding. 6242 */ 6243 goto activate_rxt; 6244 } 6245 if ((rack->use_rack_rr == 0) && 6246 (IN_FASTRECOVERY(tp->t_flags)) && 6247 (rack->rack_no_prr == 0) && 6248 (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { 6249 /* 6250 * We are not cheating, in recovery and 6251 * not enough ack's to yet get our next 6252 * retransmission out. 6253 * 6254 * Note that classified attackers do not 6255 * get to use the rack-cheat. 6256 */ 6257 goto activate_tlp; 6258 } 6259 srtt = rack_grab_rtt(tp, rack); 6260 thresh = rack_calc_thresh_rack(rack, srtt, cts, __LINE__, 1); 6261 idx = rsm->r_rtr_cnt - 1; 6262 exp = ((uint32_t)rsm->r_tim_lastsent[idx]) + thresh; 6263 if (SEQ_GEQ(exp, cts)) { 6264 to = exp - cts; 6265 if (to < rack->r_ctl.rc_min_to) { 6266 to = rack->r_ctl.rc_min_to; 6267 if (rack->r_rr_config == 3) 6268 rack->rc_on_min_to = 1; 6269 } 6270 } else { 6271 to = rack->r_ctl.rc_min_to; 6272 if (rack->r_rr_config == 3) 6273 rack->rc_on_min_to = 1; 6274 } 6275 } else { 6276 /* Ok we need to do a TLP not RACK */ 6277 activate_tlp: 6278 if ((rack->rc_tlp_in_progress != 0) && 6279 (rack->r_ctl.rc_tlp_cnt_out >= rack_tlp_limit)) { 6280 /* 6281 * The previous send was a TLP and we have sent 6282 * N TLP's without sending new data. 6283 */ 6284 goto activate_rxt; 6285 } 6286 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 6287 if (rsm == NULL) { 6288 /* We found no rsm to TLP with. 
*/ 6289 goto activate_rxt; 6290 } 6291 if (rsm->r_flags & RACK_HAS_FIN) { 6292 /* If its a FIN we dont do TLP */ 6293 rsm = NULL; 6294 goto activate_rxt; 6295 } 6296 idx = rsm->r_rtr_cnt - 1; 6297 time_since_sent = 0; 6298 if (TSTMP_GEQ(((uint32_t)rsm->r_tim_lastsent[idx]), rack->r_ctl.rc_tlp_rxt_last_time)) 6299 tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx]; 6300 else 6301 tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time; 6302 if (TSTMP_GT(cts, tstmp_touse)) 6303 time_since_sent = cts - tstmp_touse; 6304 is_tlp_timer = 1; 6305 if (tp->t_srtt) { 6306 if ((rack->rc_srtt_measure_made == 0) && 6307 (tp->t_srtt == 1)) { 6308 /* 6309 * If another stack as run and set srtt to 1, 6310 * then the srtt was 0, so lets use the initial. 6311 */ 6312 srtt = RACK_INITIAL_RTO; 6313 } else { 6314 srtt_cur = tp->t_srtt; 6315 srtt = srtt_cur; 6316 } 6317 } else 6318 srtt = RACK_INITIAL_RTO; 6319 /* 6320 * If the SRTT is not keeping up and the 6321 * rack RTT has spiked we want to use 6322 * the last RTT not the smoothed one. 6323 */ 6324 if (rack_tlp_use_greater && 6325 tp->t_srtt && 6326 (srtt < rack_grab_rtt(tp, rack))) { 6327 srtt = rack_grab_rtt(tp, rack); 6328 } 6329 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt); 6330 if (thresh > time_since_sent) { 6331 to = thresh - time_since_sent; 6332 } else { 6333 to = rack->r_ctl.rc_min_to; 6334 rack_log_alt_to_to_cancel(rack, 6335 thresh, /* flex1 */ 6336 time_since_sent, /* flex2 */ 6337 tstmp_touse, /* flex3 */ 6338 rack->r_ctl.rc_tlp_rxt_last_time, /* flex4 */ 6339 (uint32_t)rsm->r_tim_lastsent[idx], 6340 srtt, 6341 idx, 99); 6342 } 6343 if (to < rack_tlp_min) { 6344 to = rack_tlp_min; 6345 } 6346 if (to > TICKS_2_USEC(tcp_rexmit_max)) { 6347 /* 6348 * If the TLP time works out to larger than the max 6349 * RTO lets not do TLP.. just RTO. 6350 */ 6351 goto activate_rxt; 6352 } 6353 } 6354 if (is_tlp_timer == 0) { 6355 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; 6356 } else { 6357 rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; 6358 } 6359 if (to == 0) 6360 to = 1; 6361 return (to); 6362 } 6363 6364 static void 6365 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, tcp_seq snd_una) 6366 { 6367 if (rack->rc_in_persist == 0) { 6368 if (tp->t_flags & TF_GPUTINPROG) { 6369 /* 6370 * Stop the goodput now, the calling of the 6371 * measurement function clears the flag. 
6372 */ 6373 rack_do_goodput_measurement(tp, rack, tp->snd_una, __LINE__, 6374 RACK_QUALITY_PERSIST); 6375 } 6376 #ifdef NETFLIX_SHARED_CWND 6377 if (rack->r_ctl.rc_scw) { 6378 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 6379 rack->rack_scwnd_is_idle = 1; 6380 } 6381 #endif 6382 rack->r_ctl.rc_went_idle_time = cts; 6383 if (rack->r_ctl.rc_went_idle_time == 0) 6384 rack->r_ctl.rc_went_idle_time = 1; 6385 if (rack->lt_bw_up) { 6386 /* Suspend our LT BW measurement */ 6387 uint64_t tmark; 6388 6389 rack->r_ctl.lt_bw_bytes += (snd_una - rack->r_ctl.lt_seq); 6390 rack->r_ctl.lt_seq = snd_una; 6391 tmark = tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time); 6392 if (tmark >= rack->r_ctl.lt_timemark) { 6393 rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark); 6394 } 6395 rack->r_ctl.lt_timemark = tmark; 6396 rack->lt_bw_up = 0; 6397 rack->r_persist_lt_bw_off = 1; 6398 } 6399 rack_timer_cancel(tp, rack, cts, __LINE__); 6400 rack->r_ctl.persist_lost_ends = 0; 6401 rack->probe_not_answered = 0; 6402 rack->forced_ack = 0; 6403 tp->t_rxtshift = 0; 6404 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 6405 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 6406 rack->rc_in_persist = 1; 6407 } 6408 } 6409 6410 static void 6411 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 6412 { 6413 if (tcp_in_hpts(rack->rc_tp)) { 6414 tcp_hpts_remove(rack->rc_tp); 6415 rack->r_ctl.rc_hpts_flags = 0; 6416 } 6417 #ifdef NETFLIX_SHARED_CWND 6418 if (rack->r_ctl.rc_scw) { 6419 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 6420 rack->rack_scwnd_is_idle = 0; 6421 } 6422 #endif 6423 if (rack->rc_gp_dyn_mul && 6424 (rack->use_fixed_rate == 0) && 6425 (rack->rc_always_pace)) { 6426 /* 6427 * Do we count this as if a probe-rtt just 6428 * finished? 6429 */ 6430 uint32_t time_idle, idle_min; 6431 6432 time_idle = cts - rack->r_ctl.rc_went_idle_time; 6433 idle_min = rack_min_probertt_hold; 6434 if (rack_probertt_gpsrtt_cnt_div) { 6435 uint64_t extra; 6436 extra = (uint64_t)rack->r_ctl.rc_gp_srtt * 6437 (uint64_t)rack_probertt_gpsrtt_cnt_mul; 6438 extra /= (uint64_t)rack_probertt_gpsrtt_cnt_div; 6439 idle_min += (uint32_t)extra; 6440 } 6441 if (time_idle >= idle_min) { 6442 /* Yes, we count it as a probe-rtt. 
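 * Illustrative arithmetic (hypothetical values): with
 * rack_min_probertt_hold = 200000 usec, rc_gp_srtt = 40000 usec and
 * rack_probertt_gpsrtt_cnt_mul = rack_probertt_gpsrtt_cnt_div = 4,
 * idle_min works out to 200000 + 40000 = 240000 usec, so any idle
 * spell of 240 ms or longer is credited as a completed probe-rtt.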
*/ 6443 uint32_t us_cts; 6444 6445 us_cts = tcp_get_usecs(NULL); 6446 if (rack->in_probe_rtt == 0) { 6447 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 6448 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; 6449 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; 6450 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; 6451 } else { 6452 rack_exit_probertt(rack, us_cts); 6453 } 6454 } 6455 } 6456 if (rack->r_persist_lt_bw_off) { 6457 /* Continue where we left off */ 6458 rack->r_ctl.lt_timemark = tcp_get_u64_usecs(NULL); 6459 rack->lt_bw_up = 1; 6460 rack->r_persist_lt_bw_off = 0; 6461 } 6462 rack->rc_in_persist = 0; 6463 rack->r_ctl.rc_went_idle_time = 0; 6464 tp->t_rxtshift = 0; 6465 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 6466 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 6467 rack->r_ctl.rc_agg_delayed = 0; 6468 rack->r_early = 0; 6469 rack->r_late = 0; 6470 rack->r_ctl.rc_agg_early = 0; 6471 } 6472 6473 static void 6474 rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts, 6475 struct hpts_diag *diag, struct timeval *tv) 6476 { 6477 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 6478 union tcp_log_stackspecific log; 6479 6480 memset(&log, 0, sizeof(log)); 6481 log.u_bbr.flex1 = diag->p_nxt_slot; 6482 log.u_bbr.flex2 = diag->p_cur_slot; 6483 log.u_bbr.flex3 = diag->slot_req; 6484 log.u_bbr.flex4 = diag->inp_hptsslot; 6485 log.u_bbr.flex5 = diag->slot_remaining; 6486 log.u_bbr.flex6 = diag->need_new_to; 6487 log.u_bbr.flex7 = diag->p_hpts_active; 6488 log.u_bbr.flex8 = diag->p_on_min_sleep; 6489 /* Hijack other fields as needed */ 6490 log.u_bbr.epoch = diag->have_slept; 6491 log.u_bbr.lt_epoch = diag->yet_to_sleep; 6492 log.u_bbr.pkts_out = diag->co_ret; 6493 log.u_bbr.applimited = diag->hpts_sleep_time; 6494 log.u_bbr.delivered = diag->p_prev_slot; 6495 log.u_bbr.inflight = diag->p_runningslot; 6496 log.u_bbr.bw_inuse = diag->wheel_slot; 6497 log.u_bbr.rttProp = diag->wheel_cts; 6498 log.u_bbr.timeStamp = cts; 6499 log.u_bbr.delRate = diag->maxslots; 6500 log.u_bbr.cur_del_rate = diag->p_curtick; 6501 log.u_bbr.cur_del_rate <<= 32; 6502 log.u_bbr.cur_del_rate |= diag->p_lasttick; 6503 TCP_LOG_EVENTP(rack->rc_tp, NULL, 6504 &rack->rc_inp->inp_socket->so_rcv, 6505 &rack->rc_inp->inp_socket->so_snd, 6506 BBR_LOG_HPTSDIAG, 0, 6507 0, &log, false, tv); 6508 } 6509 6510 } 6511 6512 static void 6513 rack_log_wakeup(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb, uint32_t len, int type) 6514 { 6515 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 6516 union tcp_log_stackspecific log; 6517 struct timeval tv; 6518 6519 memset(&log, 0, sizeof(log)); 6520 log.u_bbr.flex1 = sb->sb_flags; 6521 log.u_bbr.flex2 = len; 6522 log.u_bbr.flex3 = sb->sb_state; 6523 log.u_bbr.flex8 = type; 6524 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 6525 TCP_LOG_EVENTP(rack->rc_tp, NULL, 6526 &rack->rc_inp->inp_socket->so_rcv, 6527 &rack->rc_inp->inp_socket->so_snd, 6528 TCP_LOG_SB_WAKE, 0, 6529 len, &log, false, &tv); 6530 } 6531 } 6532 6533 static void 6534 rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, 6535 int32_t slot, uint32_t tot_len_this_send, int sup_rack) 6536 { 6537 struct hpts_diag diag; 6538 struct inpcb *inp = tptoinpcb(tp); 6539 struct timeval tv; 6540 uint32_t delayed_ack = 0; 6541 uint32_t hpts_timeout; 6542 uint32_t entry_slot = slot; 6543 uint8_t stopped; 6544 uint32_t left = 0; 6545 uint32_t us_cts; 6546 6547 if ((tp->t_state == TCPS_CLOSED) || 6548 
(tp->t_state == TCPS_LISTEN)) { 6549 return; 6550 } 6551 if (tcp_in_hpts(tp)) { 6552 /* Already on the pacer */ 6553 return; 6554 } 6555 stopped = rack->rc_tmr_stopped; 6556 if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { 6557 left = rack->r_ctl.rc_timer_exp - cts; 6558 } 6559 rack->r_ctl.rc_timer_exp = 0; 6560 rack->r_ctl.rc_hpts_flags = 0; 6561 us_cts = tcp_get_usecs(&tv); 6562 /* Now early/late accounting */ 6563 rack_log_pacing_delay_calc(rack, entry_slot, slot, 0, 0, 0, 26, __LINE__, NULL, 0); 6564 if (rack->r_early && (rack->rc_ack_can_sendout_data == 0)) { 6565 /* 6566 * We have a early carry over set, 6567 * we can always add more time so we 6568 * can always make this compensation. 6569 * 6570 * Note if ack's are allowed to wake us do not 6571 * penalize the next timer for being awoke 6572 * by an ack aka the rc_agg_early (non-paced mode). 6573 */ 6574 slot += rack->r_ctl.rc_agg_early; 6575 rack->r_early = 0; 6576 rack->r_ctl.rc_agg_early = 0; 6577 } 6578 if ((rack->r_late) && 6579 ((rack->r_use_hpts_min == 0) || (rack->dgp_on == 0))) { 6580 /* 6581 * This is harder, we can 6582 * compensate some but it 6583 * really depends on what 6584 * the current pacing time is. 6585 */ 6586 if (rack->r_ctl.rc_agg_delayed >= slot) { 6587 /* 6588 * We can't compensate for it all. 6589 * And we have to have some time 6590 * on the clock. We always have a min 6591 * 10 slots (10 x 10 i.e. 100 usecs). 6592 */ 6593 if (slot <= HPTS_USECS_PER_SLOT) { 6594 /* We gain delay */ 6595 rack->r_ctl.rc_agg_delayed += (HPTS_USECS_PER_SLOT - slot); 6596 slot = HPTS_USECS_PER_SLOT; 6597 } else { 6598 /* We take off some */ 6599 rack->r_ctl.rc_agg_delayed -= (slot - HPTS_USECS_PER_SLOT); 6600 slot = HPTS_USECS_PER_SLOT; 6601 } 6602 } else { 6603 slot -= rack->r_ctl.rc_agg_delayed; 6604 rack->r_ctl.rc_agg_delayed = 0; 6605 /* Make sure we have 100 useconds at minimum */ 6606 if (slot < HPTS_USECS_PER_SLOT) { 6607 rack->r_ctl.rc_agg_delayed = HPTS_USECS_PER_SLOT - slot; 6608 slot = HPTS_USECS_PER_SLOT; 6609 } 6610 if (rack->r_ctl.rc_agg_delayed == 0) 6611 rack->r_late = 0; 6612 } 6613 } else if (rack->r_late) { 6614 /* r_use_hpts_min is on and so is DGP */ 6615 uint32_t max_red; 6616 6617 max_red = (slot * rack->r_ctl.max_reduction) / 100; 6618 if (max_red >= rack->r_ctl.rc_agg_delayed) { 6619 slot -= rack->r_ctl.rc_agg_delayed; 6620 rack->r_ctl.rc_agg_delayed = 0; 6621 } else { 6622 slot -= max_red; 6623 rack->r_ctl.rc_agg_delayed -= max_red; 6624 } 6625 } 6626 if ((rack->r_use_hpts_min == 1) && 6627 (slot > 0) && 6628 (rack->dgp_on == 1)) { 6629 /* 6630 * We are enforcing a min pacing timer 6631 * based on our hpts min timeout. 6632 */ 6633 uint32_t min; 6634 6635 min = get_hpts_min_sleep_time(); 6636 if (min > slot) { 6637 slot = min; 6638 } 6639 } 6640 hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack); 6641 if (tp->t_flags & TF_DELACK) { 6642 delayed_ack = TICKS_2_USEC(tcp_delacktime); 6643 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; 6644 } 6645 if (delayed_ack && ((hpts_timeout == 0) || 6646 (delayed_ack < hpts_timeout))) 6647 hpts_timeout = delayed_ack; 6648 else 6649 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 6650 /* 6651 * If no timers are going to run and we will fall off the hptsi 6652 * wheel, we resort to a keep-alive timer if its configured. 
6653 */ 6654 if ((hpts_timeout == 0) && 6655 (slot == 0)) { 6656 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 6657 (tp->t_state <= TCPS_CLOSING)) { 6658 /* 6659 * Ok we have no timer (persists, rack, tlp, rxt or 6660 * del-ack), we don't have segments being paced. So 6661 * all that is left is the keepalive timer. 6662 */ 6663 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 6664 /* Get the established keep-alive time */ 6665 hpts_timeout = TICKS_2_USEC(TP_KEEPIDLE(tp)); 6666 } else { 6667 /* 6668 * Get the initial setup keep-alive time, 6669 * note that this is probably not going to 6670 * happen, since rack will be running a rxt timer 6671 * if a SYN of some sort is outstanding. It is 6672 * actually handled in rack_timeout_rxt(). 6673 */ 6674 hpts_timeout = TICKS_2_USEC(TP_KEEPINIT(tp)); 6675 } 6676 rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; 6677 if (rack->in_probe_rtt) { 6678 /* 6679 * We want to instead not wake up a long time from 6680 * now but to wake up about the time we would 6681 * exit probe-rtt and initiate a keep-alive ack. 6682 * This will get us out of probe-rtt and update 6683 * our min-rtt. 6684 */ 6685 hpts_timeout = rack_min_probertt_hold; 6686 } 6687 } 6688 } 6689 if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) == 6690 (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) { 6691 /* 6692 * RACK, TLP, persists and RXT timers all are restartable 6693 * based on actions input .. i.e we received a packet (ack 6694 * or sack) and that changes things (rw, or snd_una etc). 6695 * Thus we can restart them with a new value. For 6696 * keep-alive, delayed_ack we keep track of what was left 6697 * and restart the timer with a smaller value. 6698 */ 6699 if (left < hpts_timeout) 6700 hpts_timeout = left; 6701 } 6702 if (hpts_timeout) { 6703 /* 6704 * Hack alert for now we can't time-out over 2,147,483 6705 * seconds (a bit more than 596 hours), which is probably ok 6706 * :). 6707 */ 6708 if (hpts_timeout > 0x7ffffffe) 6709 hpts_timeout = 0x7ffffffe; 6710 rack->r_ctl.rc_timer_exp = cts + hpts_timeout; 6711 } 6712 rack_log_pacing_delay_calc(rack, entry_slot, slot, hpts_timeout, 0, 0, 27, __LINE__, NULL, 0); 6713 if ((rack->gp_ready == 0) && 6714 (rack->use_fixed_rate == 0) && 6715 (hpts_timeout < slot) && 6716 (rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) { 6717 /* 6718 * We have no good estimate yet for the 6719 * old clunky burst mitigation or the 6720 * real pacing. And the tlp or rxt is smaller 6721 * than the pacing calculation. Lets not 6722 * pace that long since we know the calculation 6723 * so far is not accurate. 6724 */ 6725 slot = hpts_timeout; 6726 } 6727 /** 6728 * Turn off all the flags for queuing by default. The 6729 * flags have important meanings to what happens when 6730 * LRO interacts with the transport. Most likely (by default now) 6731 * mbuf_queueing and ack compression are on. So the transport 6732 * has a couple of flags that control what happens (if those 6733 * are not on then these flags won't have any effect since it 6734 * won't go through the queuing LRO path). 6735 * 6736 * TF2_MBUF_QUEUE_READY - This flags says that I am busy 6737 * pacing output, so don't disturb. But 6738 * it also means LRO can wake me if there 6739 * is a SACK arrival. 6740 * 6741 * TF2_DONT_SACK_QUEUE - This flag is used in conjunction 6742 * with the above flag (QUEUE_READY) and 6743 * when present it says don't even wake me 6744 * if a SACK arrives. 
6745 * 6746 * The idea behind these flags is that if we are pacing we 6747 * set the MBUF_QUEUE_READY and only get woken up if 6748 * a SACK arrives (which could change things) or if 6749 * our pacing timer expires. If, however, we have a rack 6750 * timer running, then we don't even want a sack to wake 6751 * us since the rack timer has to expire before we can send. 6752 * 6753 * Other cases should usually have none of the flags set 6754 * so LRO can call into us. 6755 */ 6756 tp->t_flags2 &= ~(TF2_DONT_SACK_QUEUE|TF2_MBUF_QUEUE_READY); 6757 if (slot) { 6758 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; 6759 rack->r_ctl.rc_last_output_to = us_cts + slot; 6760 /* 6761 * A pacing timer (slot) is being set, in 6762 * such a case we cannot send (we are blocked by 6763 * the timer). So lets tell LRO that it should not 6764 * wake us unless there is a SACK. Note this only 6765 * will be effective if mbuf queueing is on or 6766 * compressed acks are being processed. 6767 */ 6768 tp->t_flags2 |= TF2_MBUF_QUEUE_READY; 6769 /* 6770 * But wait if we have a Rack timer running 6771 * even a SACK should not disturb us (with 6772 * the exception of r_rr_config 3). 6773 */ 6774 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) || 6775 (IN_RECOVERY(tp->t_flags))) { 6776 if (rack->r_rr_config != 3) 6777 tp->t_flags2 |= TF2_DONT_SACK_QUEUE; 6778 else if (rack->rc_pace_dnd) { 6779 /* 6780 * When DND is on, we only let a sack 6781 * interrupt us if we are not in recovery. 6782 * 6783 * If DND is off, then we never hit here 6784 * and let all sacks wake us up. 6785 * 6786 */ 6787 tp->t_flags2 |= TF2_DONT_SACK_QUEUE; 6788 } 6789 } 6790 if (rack->rc_ack_can_sendout_data) { 6791 /* 6792 * Ahh but wait, this is that special case 6793 * where the pacing timer can be disturbed 6794 * backout the changes (used for non-paced 6795 * burst limiting). 6796 */ 6797 tp->t_flags2 &= ~(TF2_DONT_SACK_QUEUE | 6798 TF2_MBUF_QUEUE_READY); 6799 } 6800 if ((rack->use_rack_rr) && 6801 (rack->r_rr_config < 2) && 6802 ((hpts_timeout) && (hpts_timeout < slot))) { 6803 /* 6804 * Arrange for the hpts to kick back in after the 6805 * t-o if the t-o does not cause a send. 6806 */ 6807 (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout), 6808 __LINE__, &diag); 6809 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 6810 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 6811 } else { 6812 (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(slot), 6813 __LINE__, &diag); 6814 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 6815 rack_log_to_start(rack, cts, hpts_timeout, slot, 1); 6816 } 6817 } else if (hpts_timeout) { 6818 /* 6819 * With respect to t_flags2(?) here, lets let any new acks wake 6820 * us up here. Since we are not pacing (no pacing timer), output 6821 * can happen so we should let it. If its a Rack timer, then any inbound 6822 * packet probably won't change the sending (we will be blocked) 6823 * but it may change the prr stats so letting it in (the set defaults 6824 * at the start of this block) are good enough. 
6825 */ 6826 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 6827 (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout), 6828 __LINE__, &diag); 6829 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 6830 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 6831 } else { 6832 /* No timer starting */ 6833 #ifdef INVARIANTS 6834 if (SEQ_GT(tp->snd_max, tp->snd_una)) { 6835 panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?", 6836 tp, rack, tot_len_this_send, cts, slot, hpts_timeout); 6837 } 6838 #endif 6839 } 6840 rack->rc_tmr_stopped = 0; 6841 if (slot) 6842 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv, __LINE__); 6843 } 6844 6845 static void 6846 rack_mark_lost(struct tcpcb *tp, 6847 struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t cts) 6848 { 6849 struct rack_sendmap *nrsm; 6850 uint32_t thresh, exp; 6851 6852 thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(tp, rack), cts, __LINE__, 0); 6853 nrsm = rsm; 6854 TAILQ_FOREACH_FROM(nrsm, &rack->r_ctl.rc_tmap, r_tnext) { 6855 if ((nrsm->r_flags & RACK_SACK_PASSED) == 0) { 6856 /* Got up to all that were marked sack-passed */ 6857 break; 6858 } 6859 if ((nrsm->r_flags & RACK_WAS_LOST) == 0) { 6860 exp = ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]) + thresh; 6861 if (TSTMP_LT(exp, cts) || (exp == cts)) { 6862 /* We now consider it lost */ 6863 nrsm->r_flags |= RACK_WAS_LOST; 6864 rack->r_ctl.rc_considered_lost += nrsm->r_end - nrsm->r_start; 6865 } else { 6866 /* Past here it won't be lost so stop */ 6867 break; 6868 } 6869 } 6870 } 6871 } 6872 6873 /* 6874 * RACK Timer, here we simply do logging and house keeping. 6875 * the normal rack_output() function will call the 6876 * appropriate thing to check if we need to do a RACK retransmit. 6877 * We return 1, saying don't proceed with rack_output only 6878 * when all timers have been stopped (destroyed PCB?). 6879 */ 6880 static int 6881 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 6882 { 6883 /* 6884 * This timer simply provides an internal trigger to send out data. 6885 * The check_recovery_mode call will see if there are needed 6886 * retransmissions, if so we will enter fast-recovery. The output 6887 * call may or may not do the same thing depending on sysctl 6888 * settings. 6889 */ 6890 struct rack_sendmap *rsm; 6891 6892 counter_u64_add(rack_to_tot, 1); 6893 if (rack->r_state && (rack->r_state != tp->t_state)) 6894 rack_set_state(tp, rack); 6895 rack->rc_on_min_to = 0; 6896 rsm = rack_check_recovery_mode(tp, cts); 6897 rack_log_to_event(rack, RACK_TO_FRM_RACK, rsm); 6898 if (rsm) { 6899 /* We need to stroke any lost that are now declared as lost */ 6900 rack_mark_lost(tp, rack, rsm, cts); 6901 rack->r_ctl.rc_resend = rsm; 6902 rack->r_timer_override = 1; 6903 if (rack->use_rack_rr) { 6904 /* 6905 * Don't accumulate extra pacing delay 6906 * we are allowing the rack timer to 6907 * over-ride pacing i.e. rrr takes precedence 6908 * if the pacing interval is longer than the rrr 6909 * time (in other words we get the min pacing 6910 * time versus rrr pacing time). 
6911 */ 6912 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 6913 } 6914 } 6915 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; 6916 if (rsm == NULL) { 6917 /* restart a timer and return 1 */ 6918 rack_start_hpts_timer(rack, tp, cts, 6919 0, 0, 0); 6920 return (1); 6921 } 6922 return (0); 6923 } 6924 6925 6926 6927 static void 6928 rack_adjust_orig_mlen(struct rack_sendmap *rsm) 6929 { 6930 6931 if ((M_TRAILINGROOM(rsm->m) != rsm->orig_t_space)) { 6932 /* 6933 * The trailing space changed, mbufs can grow 6934 * at the tail but they can't shrink from 6935 * it, KASSERT that. Adjust the orig_m_len to 6936 * compensate for this change. 6937 */ 6938 KASSERT((rsm->orig_t_space > M_TRAILINGROOM(rsm->m)), 6939 ("mbuf:%p rsm:%p trailing_space:%jd ots:%u oml:%u mlen:%u\n", 6940 rsm->m, 6941 rsm, 6942 (intmax_t)M_TRAILINGROOM(rsm->m), 6943 rsm->orig_t_space, 6944 rsm->orig_m_len, 6945 rsm->m->m_len)); 6946 rsm->orig_m_len += (rsm->orig_t_space - M_TRAILINGROOM(rsm->m)); 6947 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 6948 } 6949 if (rsm->m->m_len < rsm->orig_m_len) { 6950 /* 6951 * Mbuf shrank, trimmed off the top by an ack, our 6952 * offset changes. 6953 */ 6954 KASSERT((rsm->soff >= (rsm->orig_m_len - rsm->m->m_len)), 6955 ("mbuf:%p len:%u rsm:%p oml:%u soff:%u\n", 6956 rsm->m, rsm->m->m_len, 6957 rsm, rsm->orig_m_len, 6958 rsm->soff)); 6959 if (rsm->soff >= (rsm->orig_m_len - rsm->m->m_len)) 6960 rsm->soff -= (rsm->orig_m_len - rsm->m->m_len); 6961 else 6962 rsm->soff = 0; 6963 rsm->orig_m_len = rsm->m->m_len; 6964 #ifdef INVARIANTS 6965 } else if (rsm->m->m_len > rsm->orig_m_len) { 6966 panic("rsm:%p m:%p m_len grew outside of t_space compensation", 6967 rsm, rsm->m); 6968 #endif 6969 } 6970 } 6971 6972 static void 6973 rack_setup_offset_for_rsm(struct tcp_rack *rack, struct rack_sendmap *src_rsm, struct rack_sendmap *rsm) 6974 { 6975 struct mbuf *m; 6976 uint32_t soff; 6977 6978 if (src_rsm->m && 6979 ((src_rsm->orig_m_len != src_rsm->m->m_len) || 6980 (M_TRAILINGROOM(src_rsm->m) != src_rsm->orig_t_space))) { 6981 /* Fix up the orig_m_len and possibly the mbuf offset */ 6982 rack_adjust_orig_mlen(src_rsm); 6983 } 6984 m = src_rsm->m; 6985 soff = src_rsm->soff + (src_rsm->r_end - src_rsm->r_start); 6986 while (soff >= m->m_len) { 6987 /* Move out past this mbuf */ 6988 soff -= m->m_len; 6989 m = m->m_next; 6990 KASSERT((m != NULL), 6991 ("rsm:%p nrsm:%p hit at soff:%u null m", 6992 src_rsm, rsm, soff)); 6993 if (m == NULL) { 6994 /* This should *not* happen which is why there is a kassert */ 6995 src_rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 6996 (src_rsm->r_start - rack->rc_tp->snd_una), 6997 &src_rsm->soff); 6998 src_rsm->orig_m_len = src_rsm->m->m_len; 6999 src_rsm->orig_t_space = M_TRAILINGROOM(src_rsm->m); 7000 rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 7001 (rsm->r_start - rack->rc_tp->snd_una), 7002 &rsm->soff); 7003 rsm->orig_m_len = rsm->m->m_len; 7004 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 7005 return; 7006 } 7007 } 7008 rsm->m = m; 7009 rsm->soff = soff; 7010 rsm->orig_m_len = m->m_len; 7011 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 7012 } 7013 7014 static __inline void 7015 rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm, 7016 struct rack_sendmap *rsm, uint32_t start) 7017 { 7018 int idx; 7019 7020 nrsm->r_start = start; 7021 nrsm->r_end = rsm->r_end; 7022 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 7023 nrsm->r_act_rxt_cnt = rsm->r_act_rxt_cnt; 7024 nrsm->r_flags = rsm->r_flags; 7025 nrsm->r_dupack = rsm->r_dupack; 7026 nrsm->r_no_rtt_allowed = 
rsm->r_no_rtt_allowed; 7027 nrsm->r_rtr_bytes = 0; 7028 nrsm->r_fas = rsm->r_fas; 7029 nrsm->r_bas = rsm->r_bas; 7030 tqhash_update_end(rack->r_ctl.tqh, rsm, nrsm->r_start); 7031 nrsm->r_just_ret = rsm->r_just_ret; 7032 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 7033 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 7034 } 7035 /* Now if we have SYN flag we keep it on the left edge */ 7036 if (nrsm->r_flags & RACK_HAS_SYN) 7037 nrsm->r_flags &= ~RACK_HAS_SYN; 7038 /* Now if we have a FIN flag we keep it on the right edge */ 7039 if (rsm->r_flags & RACK_HAS_FIN) 7040 rsm->r_flags &= ~RACK_HAS_FIN; 7041 /* Push bit must go to the right edge as well */ 7042 if (rsm->r_flags & RACK_HAD_PUSH) 7043 rsm->r_flags &= ~RACK_HAD_PUSH; 7044 /* Update the count if app limited */ 7045 if (nrsm->r_flags & RACK_APP_LIMITED) 7046 rack->r_ctl.rc_app_limited_cnt++; 7047 /* Clone over the state of the hw_tls flag */ 7048 nrsm->r_hw_tls = rsm->r_hw_tls; 7049 /* 7050 * Now we need to find nrsm's new location in the mbuf chain 7051 * we basically calculate a new offset, which is soff + 7052 * how much is left in original rsm. Then we walk out the mbuf 7053 * chain to find the righ position, it may be the same mbuf 7054 * or maybe not. 7055 */ 7056 KASSERT(((rsm->m != NULL) || 7057 (rsm->r_flags & (RACK_HAS_SYN|RACK_HAS_FIN))), 7058 ("rsm:%p nrsm:%p rack:%p -- rsm->m is NULL?", rsm, nrsm, rack)); 7059 if (rsm->m) 7060 rack_setup_offset_for_rsm(rack, rsm, nrsm); 7061 } 7062 7063 static struct rack_sendmap * 7064 rack_merge_rsm(struct tcp_rack *rack, 7065 struct rack_sendmap *l_rsm, 7066 struct rack_sendmap *r_rsm) 7067 { 7068 /* 7069 * We are merging two ack'd RSM's, 7070 * the l_rsm is on the left (lower seq 7071 * values) and the r_rsm is on the right 7072 * (higher seq value). The simplest way 7073 * to merge these is to move the right 7074 * one into the left. I don't think there 7075 * is any reason we need to try to find 7076 * the oldest (or last oldest retransmitted). 7077 */ 7078 rack_log_map_chg(rack->rc_tp, rack, NULL, 7079 l_rsm, r_rsm, MAP_MERGE, r_rsm->r_end, __LINE__); 7080 tqhash_update_end(rack->r_ctl.tqh, l_rsm, r_rsm->r_end); 7081 if (l_rsm->r_dupack < r_rsm->r_dupack) 7082 l_rsm->r_dupack = r_rsm->r_dupack; 7083 if (r_rsm->r_rtr_bytes) 7084 l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes; 7085 if (r_rsm->r_in_tmap) { 7086 /* This really should not happen */ 7087 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext); 7088 r_rsm->r_in_tmap = 0; 7089 } 7090 7091 /* Now the flags */ 7092 if (r_rsm->r_flags & RACK_HAS_FIN) 7093 l_rsm->r_flags |= RACK_HAS_FIN; 7094 if (r_rsm->r_flags & RACK_TLP) 7095 l_rsm->r_flags |= RACK_TLP; 7096 if (r_rsm->r_flags & RACK_RWND_COLLAPSED) 7097 l_rsm->r_flags |= RACK_RWND_COLLAPSED; 7098 if ((r_rsm->r_flags & RACK_APP_LIMITED) && 7099 ((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) { 7100 /* 7101 * If both are app-limited then let the 7102 * free lower the count. If right is app 7103 * limited and left is not, transfer. 7104 */ 7105 l_rsm->r_flags |= RACK_APP_LIMITED; 7106 r_rsm->r_flags &= ~RACK_APP_LIMITED; 7107 if (r_rsm == rack->r_ctl.rc_first_appl) 7108 rack->r_ctl.rc_first_appl = l_rsm; 7109 } 7110 tqhash_remove(rack->r_ctl.tqh, r_rsm, REMOVE_TYPE_MERGE); 7111 /* 7112 * We keep the largest value, which is the newest 7113 * send. We do this in case a segment that is 7114 * joined together and not part of a GP estimate 7115 * later gets expanded into the GP estimate. 7116 * 7117 * We prohibit the merging of unlike kinds i.e. 
7118 * all pieces that are in the GP estimate can be 7119 * merged and all pieces that are not in a GP estimate 7120 * can be merged, but not disimilar pieces. Combine 7121 * this with taking the highest here and we should 7122 * be ok unless of course the client reneges. Then 7123 * all bets are off. 7124 */ 7125 if(l_rsm->r_tim_lastsent[(l_rsm->r_rtr_cnt-1)] < 7126 r_rsm->r_tim_lastsent[(r_rsm->r_rtr_cnt-1)]) { 7127 l_rsm->r_tim_lastsent[(l_rsm->r_rtr_cnt-1)] = r_rsm->r_tim_lastsent[(r_rsm->r_rtr_cnt-1)]; 7128 } 7129 /* 7130 * When merging two RSM's we also need to consider the ack time and keep 7131 * newest. If the ack gets merged into a measurement then that is the 7132 * one we will want to be using. 7133 */ 7134 if(l_rsm->r_ack_arrival < r_rsm->r_ack_arrival) 7135 l_rsm->r_ack_arrival = r_rsm->r_ack_arrival; 7136 7137 if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) { 7138 /* Transfer the split limit to the map we free */ 7139 r_rsm->r_limit_type = l_rsm->r_limit_type; 7140 l_rsm->r_limit_type = 0; 7141 } 7142 rack_free(rack, r_rsm); 7143 l_rsm->r_flags |= RACK_MERGED; 7144 return (l_rsm); 7145 } 7146 7147 /* 7148 * TLP Timer, here we simply setup what segment we want to 7149 * have the TLP expire on, the normal rack_output() will then 7150 * send it out. 7151 * 7152 * We return 1, saying don't proceed with rack_output only 7153 * when all timers have been stopped (destroyed PCB?). 7154 */ 7155 static int 7156 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t *doing_tlp) 7157 { 7158 /* 7159 * Tail Loss Probe. 7160 */ 7161 struct rack_sendmap *rsm = NULL; 7162 int insret __diagused; 7163 struct socket *so = tptosocket(tp); 7164 uint32_t amm; 7165 uint32_t out, avail; 7166 int collapsed_win = 0; 7167 7168 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 7169 /* Its not time yet */ 7170 return (0); 7171 } 7172 if (ctf_progress_timeout_check(tp, true)) { 7173 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 7174 return (-ETIMEDOUT); /* tcp_drop() */ 7175 } 7176 /* 7177 * A TLP timer has expired. We have been idle for 2 rtts. So we now 7178 * need to figure out how to force a full MSS segment out. 7179 */ 7180 rack_log_to_event(rack, RACK_TO_FRM_TLP, NULL); 7181 rack->r_ctl.retran_during_recovery = 0; 7182 rack->r_might_revert = 0; 7183 rack->r_ctl.dsack_byte_cnt = 0; 7184 counter_u64_add(rack_tlp_tot, 1); 7185 if (rack->r_state && (rack->r_state != tp->t_state)) 7186 rack_set_state(tp, rack); 7187 avail = sbavail(&so->so_snd); 7188 out = tp->snd_max - tp->snd_una; 7189 if ((out > tp->snd_wnd) || rack->rc_has_collapsed) { 7190 /* special case, we need a retransmission */ 7191 collapsed_win = 1; 7192 goto need_retran; 7193 } 7194 if (rack->r_ctl.dsack_persist && (rack->r_ctl.rc_tlp_cnt_out >= 1)) { 7195 rack->r_ctl.dsack_persist--; 7196 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) { 7197 rack->r_ctl.num_dsack = 0; 7198 } 7199 rack_log_dsack_event(rack, 1, __LINE__, 0, 0); 7200 } 7201 if ((tp->t_flags & TF_GPUTINPROG) && 7202 (rack->r_ctl.rc_tlp_cnt_out == 1)) { 7203 /* 7204 * If this is the second in a row 7205 * TLP and we are doing a measurement 7206 * its time to abandon the measurement. 7207 * Something is likely broken on 7208 * the clients network and measuring a 7209 * broken network does us no good. 
7210 */ 7211 tp->t_flags &= ~TF_GPUTINPROG; 7212 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 7213 rack->r_ctl.rc_gp_srtt /*flex1*/, 7214 tp->gput_seq, 7215 0, 0, 18, __LINE__, NULL, 0); 7216 } 7217 /* 7218 * Check our send oldest always settings, and if 7219 * there is an oldest to send jump to the need_retran. 7220 */ 7221 if (rack_always_send_oldest && (TAILQ_EMPTY(&rack->r_ctl.rc_tmap) == 0)) 7222 goto need_retran; 7223 7224 if (avail > out) { 7225 /* New data is available */ 7226 amm = avail - out; 7227 if (amm > ctf_fixed_maxseg(tp)) { 7228 amm = ctf_fixed_maxseg(tp); 7229 if ((amm + out) > tp->snd_wnd) { 7230 /* We are rwnd limited */ 7231 goto need_retran; 7232 } 7233 } else if (amm < ctf_fixed_maxseg(tp)) { 7234 /* not enough to fill a MTU */ 7235 goto need_retran; 7236 } 7237 if (IN_FASTRECOVERY(tp->t_flags)) { 7238 /* Unlikely */ 7239 if (rack->rack_no_prr == 0) { 7240 if (out + amm <= tp->snd_wnd) { 7241 rack->r_ctl.rc_prr_sndcnt = amm; 7242 rack->r_ctl.rc_tlp_new_data = amm; 7243 rack_log_to_prr(rack, 4, 0, __LINE__); 7244 } 7245 } else 7246 goto need_retran; 7247 } else { 7248 /* Set the send-new override */ 7249 if (out + amm <= tp->snd_wnd) 7250 rack->r_ctl.rc_tlp_new_data = amm; 7251 else 7252 goto need_retran; 7253 } 7254 rack->r_ctl.rc_tlpsend = NULL; 7255 counter_u64_add(rack_tlp_newdata, 1); 7256 goto send; 7257 } 7258 need_retran: 7259 /* 7260 * Ok we need to arrange the last un-acked segment to be re-sent, or 7261 * optionally the first un-acked segment. 7262 */ 7263 if (collapsed_win == 0) { 7264 if (rack_always_send_oldest) 7265 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 7266 else { 7267 rsm = tqhash_max(rack->r_ctl.tqh); 7268 if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { 7269 rsm = rack_find_high_nonack(rack, rsm); 7270 } 7271 } 7272 if (rsm == NULL) { 7273 #ifdef TCP_BLACKBOX 7274 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 7275 #endif 7276 goto out; 7277 } 7278 } else { 7279 /* 7280 * We had a collapsed window, lets find 7281 * the point before the collapse. 7282 */ 7283 if (SEQ_GT((rack->r_ctl.last_collapse_point - 1), rack->rc_tp->snd_una)) 7284 rsm = tqhash_find(rack->r_ctl.tqh, (rack->r_ctl.last_collapse_point - 1)); 7285 else { 7286 rsm = tqhash_min(rack->r_ctl.tqh); 7287 } 7288 if (rsm == NULL) { 7289 /* Huh */ 7290 goto out; 7291 } 7292 } 7293 if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) { 7294 /* 7295 * We need to split this the last segment in two. 7296 */ 7297 struct rack_sendmap *nrsm; 7298 7299 nrsm = rack_alloc_full_limit(rack); 7300 if (nrsm == NULL) { 7301 /* 7302 * No memory to split, we will just exit and punt 7303 * off to the RXT timer. 
7304 */ 7305 goto out; 7306 } 7307 rack_clone_rsm(rack, nrsm, rsm, 7308 (rsm->r_end - ctf_fixed_maxseg(tp))); 7309 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); 7310 #ifndef INVARIANTS 7311 (void)tqhash_insert(rack->r_ctl.tqh, nrsm); 7312 #else 7313 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { 7314 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p", 7315 nrsm, insret, rack, rsm); 7316 } 7317 #endif 7318 if (rsm->r_in_tmap) { 7319 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 7320 nrsm->r_in_tmap = 1; 7321 } 7322 rsm = nrsm; 7323 } 7324 rack->r_ctl.rc_tlpsend = rsm; 7325 send: 7326 /* Make sure output path knows we are doing a TLP */ 7327 *doing_tlp = 1; 7328 rack->r_timer_override = 1; 7329 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 7330 return (0); 7331 out: 7332 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 7333 return (0); 7334 } 7335 7336 /* 7337 * Delayed ack Timer, here we simply need to setup the 7338 * ACK_NOW flag and remove the DELACK flag. From there 7339 * the output routine will send the ack out. 7340 * 7341 * We only return 1, saying don't proceed, if all timers 7342 * are stopped (destroyed PCB?). 7343 */ 7344 static int 7345 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 7346 { 7347 7348 rack_log_to_event(rack, RACK_TO_FRM_DELACK, NULL); 7349 tp->t_flags &= ~TF_DELACK; 7350 tp->t_flags |= TF_ACKNOW; 7351 KMOD_TCPSTAT_INC(tcps_delack); 7352 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 7353 return (0); 7354 } 7355 7356 static inline int 7357 rack_send_ack_challange(struct tcp_rack *rack) 7358 { 7359 struct tcptemp *t_template; 7360 7361 t_template = tcpip_maketemplate(rack->rc_inp); 7362 if (t_template) { 7363 if (rack->forced_ack == 0) { 7364 rack->forced_ack = 1; 7365 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); 7366 } else { 7367 rack->probe_not_answered = 1; 7368 } 7369 tcp_respond(rack->rc_tp, t_template->tt_ipgen, 7370 &t_template->tt_t, (struct mbuf *)NULL, 7371 rack->rc_tp->rcv_nxt, rack->rc_tp->snd_una - 1, 0); 7372 free(t_template, M_TEMP); 7373 /* This does send an ack so kill any D-ack timer */ 7374 if (rack->rc_tp->t_flags & TF_DELACK) 7375 rack->rc_tp->t_flags &= ~TF_DELACK; 7376 return(1); 7377 } else 7378 return (0); 7379 7380 } 7381 7382 /* 7383 * Persists timer, here we simply send the 7384 * same thing as a keepalive will. 7385 * the one byte send. 7386 * 7387 * We only return 1, saying don't proceed, if all timers 7388 * are stopped (destroyed PCB?). 7389 */ 7390 static int 7391 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 7392 { 7393 int32_t retval = 1; 7394 7395 if (rack->rc_in_persist == 0) 7396 return (0); 7397 if (ctf_progress_timeout_check(tp, false)) { 7398 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 7399 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 7400 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends); 7401 return (-ETIMEDOUT); /* tcp_drop() */ 7402 } 7403 /* 7404 * Persistence timer into zero window. Force a byte to be output, if 7405 * possible. 7406 */ 7407 KMOD_TCPSTAT_INC(tcps_persisttimeo); 7408 /* 7409 * Hack: if the peer is dead/unreachable, we do not time out if the 7410 * window is closed. After a full backoff, drop the connection if 7411 * the idle time (no responses to probes) reaches the maximum 7412 * backoff that we would use if retransmitting. 
7413 */ 7414 if (tp->t_rxtshift >= V_tcp_retries && 7415 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 7416 TICKS_2_USEC(ticks - tp->t_rcvtime) >= RACK_REXMTVAL(tp) * tcp_totbackoff)) { 7417 KMOD_TCPSTAT_INC(tcps_persistdrop); 7418 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 7419 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends); 7420 retval = -ETIMEDOUT; /* tcp_drop() */ 7421 goto out; 7422 } 7423 if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) && 7424 tp->snd_una == tp->snd_max) 7425 rack_exit_persist(tp, rack, cts); 7426 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; 7427 /* 7428 * If the user has closed the socket then drop a persisting 7429 * connection after a much reduced timeout. 7430 */ 7431 if (tp->t_state > TCPS_CLOSE_WAIT && 7432 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 7433 KMOD_TCPSTAT_INC(tcps_persistdrop); 7434 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 7435 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends); 7436 retval = -ETIMEDOUT; /* tcp_drop() */ 7437 goto out; 7438 } 7439 if (rack_send_ack_challange(rack)) { 7440 /* only set it if we were answered */ 7441 if (rack->probe_not_answered) { 7442 counter_u64_add(rack_persists_loss, 1); 7443 rack->r_ctl.persist_lost_ends++; 7444 } 7445 counter_u64_add(rack_persists_sends, 1); 7446 counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1); 7447 } 7448 if (tp->t_rxtshift < V_tcp_retries) 7449 tp->t_rxtshift++; 7450 out: 7451 rack_log_to_event(rack, RACK_TO_FRM_PERSIST, NULL); 7452 rack_start_hpts_timer(rack, tp, cts, 7453 0, 0, 0); 7454 return (retval); 7455 } 7456 7457 /* 7458 * If a keepalive goes off, we had no other timers 7459 * happening. We always return 1 here since this 7460 * routine either drops the connection or sends 7461 * out a segment with respond. 7462 */ 7463 static int 7464 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 7465 { 7466 struct inpcb *inp = tptoinpcb(tp); 7467 7468 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; 7469 rack_log_to_event(rack, RACK_TO_FRM_KEEP, NULL); 7470 /* 7471 * Keep-alive timer went off; send something or drop connection if 7472 * idle for too long. 7473 */ 7474 KMOD_TCPSTAT_INC(tcps_keeptimeo); 7475 if (tp->t_state < TCPS_ESTABLISHED) 7476 goto dropit; 7477 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 7478 tp->t_state <= TCPS_CLOSING) { 7479 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 7480 goto dropit; 7481 /* 7482 * Send a packet designed to force a response if the peer is 7483 * up and reachable: either an ACK if the connection is 7484 * still alive, or an RST if the peer has closed the 7485 * connection due to timeout or reboot. Using sequence 7486 * number tp->snd_una-1 causes the transmitted zero-length 7487 * segment to lie outside the receive window; by the 7488 * protocol spec, this requires the correspondent TCP to 7489 * respond. 7490 */ 7491 KMOD_TCPSTAT_INC(tcps_keepprobe); 7492 rack_send_ack_challange(rack); 7493 } 7494 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 7495 return (1); 7496 dropit: 7497 KMOD_TCPSTAT_INC(tcps_keepdrops); 7498 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); 7499 return (-ETIMEDOUT); /* tcp_drop() */ 7500 } 7501 7502 /* 7503 * Retransmit helper function, clear up all the ack 7504 * flags and take care of important book keeping. 7505 */ 7506 static void 7507 rack_remxt_tmr(struct tcpcb *tp) 7508 { 7509 /* 7510 * The retransmit timer went off, all sack'd blocks must be 7511 * un-acked. 
7512 */ 7513 struct rack_sendmap *rsm, *trsm = NULL; 7514 struct tcp_rack *rack; 7515 7516 rack = (struct tcp_rack *)tp->t_fb_ptr; 7517 rack_timer_cancel(tp, rack, tcp_get_usecs(NULL), __LINE__); 7518 rack_log_to_event(rack, RACK_TO_FRM_TMR, NULL); 7519 rack->r_timer_override = 1; 7520 rack->r_ctl.rc_snd_max_at_rto = tp->snd_max; 7521 rack->r_ctl.rc_last_timeout_snduna = tp->snd_una; 7522 rack->r_late = 0; 7523 rack->r_early = 0; 7524 rack->r_ctl.rc_agg_delayed = 0; 7525 rack->r_ctl.rc_agg_early = 0; 7526 if (rack->r_state && (rack->r_state != tp->t_state)) 7527 rack_set_state(tp, rack); 7528 if (tp->t_rxtshift <= rack_rxt_scoreboard_clear_thresh) { 7529 /* 7530 * We do not clear the scoreboard until we have had 7531 * more than rack_rxt_scoreboard_clear_thresh time-outs. 7532 */ 7533 rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 7534 if (rack->r_ctl.rc_resend != NULL) 7535 rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT; 7536 7537 return; 7538 } 7539 /* 7540 * Ideally we would like to be able to 7541 * mark SACK-PASS on anything not acked here. 7542 * 7543 * However, if we do that we would burst out 7544 * all that data 1ms apart. This would be unwise, 7545 * so for now we will just let the normal rxt timer 7546 * and tlp timer take care of it. 7547 * 7548 * Also we really need to stick them back in sequence 7549 * order. This way we send in the proper order and any 7550 * sacks that come floating in will "re-ack" the data. 7551 * To do this we zap the tmap with an INIT and then 7552 * walk through and place every rsm in the tail queue 7553 * hash table back in its seq ordered place. 7554 */ 7555 TAILQ_INIT(&rack->r_ctl.rc_tmap); 7556 7557 TQHASH_FOREACH(rsm, rack->r_ctl.tqh) { 7558 rsm->r_dupack = 0; 7559 if (rack_verbose_logging) 7560 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7561 /* We must re-add it back to the tlist */ 7562 if (trsm == NULL) { 7563 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7564 } else { 7565 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); 7566 } 7567 rsm->r_in_tmap = 1; 7568 trsm = rsm; 7569 if (rsm->r_flags & RACK_ACKED) 7570 rsm->r_flags |= RACK_WAS_ACKED; 7571 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS | RACK_RWND_COLLAPSED | RACK_WAS_LOST); 7572 rsm->r_flags |= RACK_MUST_RXT; 7573 } 7574 /* zero the lost since it's all gone */ 7575 rack->r_ctl.rc_considered_lost = 0; 7576 /* Clear the count (we just un-acked them) */ 7577 rack->r_ctl.rc_sacked = 0; 7578 rack->r_ctl.rc_sacklast = NULL; 7579 /* Clear the tlp rtx mark */ 7580 rack->r_ctl.rc_resend = tqhash_min(rack->r_ctl.tqh); 7581 if (rack->r_ctl.rc_resend != NULL) 7582 rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT; 7583 rack->r_ctl.rc_prr_sndcnt = 0; 7584 rack_log_to_prr(rack, 6, 0, __LINE__); 7585 rack->r_ctl.rc_resend = tqhash_min(rack->r_ctl.tqh); 7586 if (rack->r_ctl.rc_resend != NULL) 7587 rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT; 7588 if (((tp->t_flags & TF_SACK_PERMIT) == 0) && 7589 ((tp->t_flags & TF_SENTFIN) == 0)) { 7590 /* 7591 * For non-sack customers new data 7592 * needs to go out as retransmits until 7593 * we retransmit up to snd_max. 
7594 */ 7595 rack->r_must_retran = 1; 7596 rack->r_ctl.rc_out_at_rto = ctf_flight_size(rack->rc_tp, 7597 rack->r_ctl.rc_sacked); 7598 } 7599 } 7600 7601 static void 7602 rack_convert_rtts(struct tcpcb *tp) 7603 { 7604 tcp_change_time_units(tp, TCP_TMR_GRANULARITY_USEC); 7605 tp->t_rxtcur = RACK_REXMTVAL(tp); 7606 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 7607 tp->t_rxtcur += TICKS_2_USEC(tcp_rexmit_slop); 7608 } 7609 if (tp->t_rxtcur > rack_rto_max) { 7610 tp->t_rxtcur = rack_rto_max; 7611 } 7612 } 7613 7614 static void 7615 rack_cc_conn_init(struct tcpcb *tp) 7616 { 7617 struct tcp_rack *rack; 7618 uint32_t srtt; 7619 7620 rack = (struct tcp_rack *)tp->t_fb_ptr; 7621 srtt = tp->t_srtt; 7622 cc_conn_init(tp); 7623 /* 7624 * Now convert to rack's internal format, 7625 * if required. 7626 */ 7627 if ((srtt == 0) && (tp->t_srtt != 0)) 7628 rack_convert_rtts(tp); 7629 /* 7630 * We want a chance to stay in slowstart as 7631 * we create a connection. TCP spec says that 7632 * initially ssthresh is infinite. For our 7633 * purposes that is the snd_wnd. 7634 */ 7635 if (tp->snd_ssthresh < tp->snd_wnd) { 7636 tp->snd_ssthresh = tp->snd_wnd; 7637 } 7638 /* 7639 * We also want to assure a IW worth of 7640 * data can get inflight. 7641 */ 7642 if (rc_init_window(rack) < tp->snd_cwnd) 7643 tp->snd_cwnd = rc_init_window(rack); 7644 } 7645 7646 /* 7647 * Re-transmit timeout! If we drop the PCB we will return 1, otherwise 7648 * we will setup to retransmit the lowest seq number outstanding. 7649 */ 7650 static int 7651 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 7652 { 7653 struct inpcb *inp = tptoinpcb(tp); 7654 int32_t rexmt; 7655 int32_t retval = 0; 7656 bool isipv6; 7657 7658 if ((tp->t_flags & TF_GPUTINPROG) && 7659 (tp->t_rxtshift)) { 7660 /* 7661 * We have had a second timeout 7662 * measurements on successive rxt's are not profitable. 7663 * It is unlikely to be of any use (the network is 7664 * broken or the client went away). 7665 */ 7666 tp->t_flags &= ~TF_GPUTINPROG; 7667 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 7668 rack->r_ctl.rc_gp_srtt /*flex1*/, 7669 tp->gput_seq, 7670 0, 0, 18, __LINE__, NULL, 0); 7671 } 7672 if (ctf_progress_timeout_check(tp, false)) { 7673 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 7674 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 7675 return (-ETIMEDOUT); /* tcp_drop() */ 7676 } 7677 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; 7678 rack->r_ctl.retran_during_recovery = 0; 7679 rack->rc_ack_required = 1; 7680 rack->r_ctl.dsack_byte_cnt = 0; 7681 if (IN_RECOVERY(tp->t_flags) && 7682 (rack->rto_from_rec == 0)) { 7683 /* 7684 * Mark that we had a rto while in recovery 7685 * and save the ssthresh so if we go back 7686 * into recovery we will have a chance 7687 * to slowstart back to the level. 7688 */ 7689 rack->rto_from_rec = 1; 7690 rack->r_ctl.rto_ssthresh = tp->snd_ssthresh; 7691 } 7692 if (IN_FASTRECOVERY(tp->t_flags)) 7693 tp->t_flags |= TF_WASFRECOVERY; 7694 else 7695 tp->t_flags &= ~TF_WASFRECOVERY; 7696 if (IN_CONGRECOVERY(tp->t_flags)) 7697 tp->t_flags |= TF_WASCRECOVERY; 7698 else 7699 tp->t_flags &= ~TF_WASCRECOVERY; 7700 if (TCPS_HAVEESTABLISHED(tp->t_state) && 7701 (tp->snd_una == tp->snd_max)) { 7702 /* Nothing outstanding .. 
nothing to do */ 7703 return (0); 7704 } 7705 if (rack->r_ctl.dsack_persist) { 7706 rack->r_ctl.dsack_persist--; 7707 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) { 7708 rack->r_ctl.num_dsack = 0; 7709 } 7710 rack_log_dsack_event(rack, 1, __LINE__, 0, 0); 7711 } 7712 /* 7713 * Rack can only run one timer at a time, so we cannot 7714 * run a KEEPINIT (gating SYN sending) and a retransmit 7715 * timer for the SYN. So if we are in a front state and 7716 * have a KEEPINIT timer we need to check the first transmit 7717 * against now to see if we have exceeded the KEEPINIT time 7718 * (if one is set). 7719 */ 7720 if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) && 7721 (TP_KEEPINIT(tp) != 0)) { 7722 struct rack_sendmap *rsm; 7723 7724 rsm = tqhash_min(rack->r_ctl.tqh); 7725 if (rsm) { 7726 /* Ok we have something outstanding to test keepinit with */ 7727 if ((TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) && 7728 ((cts - (uint32_t)rsm->r_tim_lastsent[0]) >= TICKS_2_USEC(TP_KEEPINIT(tp)))) { 7729 /* We have exceeded the KEEPINIT time */ 7730 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); 7731 goto drop_it; 7732 } 7733 } 7734 } 7735 /* 7736 * Retransmission timer went off. Message has not been acked within 7737 * retransmit interval. Back off to a longer retransmit interval 7738 * and retransmit one segment. 7739 */ 7740 if ((rack->r_ctl.rc_resend == NULL) || 7741 ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) { 7742 /* 7743 * If the rwnd collapsed on 7744 * the one we are retransmitting 7745 * it does not count against the 7746 * rxt count. 7747 */ 7748 tp->t_rxtshift++; 7749 } 7750 rack_remxt_tmr(tp); 7751 if (tp->t_rxtshift > V_tcp_retries) { 7752 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 7753 drop_it: 7754 tp->t_rxtshift = V_tcp_retries; 7755 KMOD_TCPSTAT_INC(tcps_timeoutdrop); 7756 /* XXXGL: previously t_softerror was casted to uint16_t */ 7757 MPASS(tp->t_softerror >= 0); 7758 retval = tp->t_softerror ? -tp->t_softerror : -ETIMEDOUT; 7759 goto out; /* tcp_drop() */ 7760 } 7761 if (tp->t_state == TCPS_SYN_SENT) { 7762 /* 7763 * If the SYN was retransmitted, indicate CWND to be limited 7764 * to 1 segment in cc_conn_init(). 7765 */ 7766 tp->snd_cwnd = 1; 7767 } else if (tp->t_rxtshift == 1) { 7768 /* 7769 * first retransmit; record ssthresh and cwnd so they can be 7770 * recovered if this turns out to be a "bad" retransmit. A 7771 * retransmit is considered "bad" if an ACK for this segment 7772 * is received within RTT/2 interval; the assumption here is 7773 * that the ACK was already in flight. See "On Estimating 7774 * End-to-End Network Path Properties" by Allman and Paxson 7775 * for more details. 
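 * Illustrative example (hypothetical srtt): with t_srtt at 50000 usec
 * the window set below is ticks plus roughly 25 ms; if an ACK for the
 * retransmitted segment arrives inside that window the RTO is treated
 * as spurious and the cwnd/ssthresh saved here can be restored.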
7776 */ 7777 tp->snd_cwnd_prev = tp->snd_cwnd; 7778 tp->snd_ssthresh_prev = tp->snd_ssthresh; 7779 tp->snd_recover_prev = tp->snd_recover; 7780 tp->t_badrxtwin = ticks + (USEC_2_TICKS(tp->t_srtt)/2); 7781 tp->t_flags |= TF_PREVVALID; 7782 } else if ((tp->t_flags & TF_RCVD_TSTMP) == 0) 7783 tp->t_flags &= ~TF_PREVVALID; 7784 KMOD_TCPSTAT_INC(tcps_rexmttimeo); 7785 if ((tp->t_state == TCPS_SYN_SENT) || 7786 (tp->t_state == TCPS_SYN_RECEIVED)) 7787 rexmt = RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]; 7788 else 7789 rexmt = max(rack_rto_min, (tp->t_srtt + (tp->t_rttvar << 2))) * tcp_backoff[tp->t_rxtshift]; 7790 7791 RACK_TCPT_RANGESET(tp->t_rxtcur, rexmt, 7792 max(rack_rto_min, rexmt), rack_rto_max, rack->r_ctl.timer_slop); 7793 /* 7794 * We enter the path for PLMTUD if connection is established or, if 7795 * connection is FIN_WAIT_1 status, reason for the last is that if 7796 * amount of data we send is very small, we could send it in couple 7797 * of packets and process straight to FIN. In that case we won't 7798 * catch ESTABLISHED state. 7799 */ 7800 #ifdef INET6 7801 isipv6 = (inp->inp_vflag & INP_IPV6) ? true : false; 7802 #else 7803 isipv6 = false; 7804 #endif 7805 if (((V_tcp_pmtud_blackhole_detect == 1) || 7806 (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) || 7807 (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) && 7808 ((tp->t_state == TCPS_ESTABLISHED) || 7809 (tp->t_state == TCPS_FIN_WAIT_1))) { 7810 /* 7811 * Idea here is that at each stage of mtu probe (usually, 7812 * 1448 -> 1188 -> 524) should be given 2 chances to recover 7813 * before further clamping down. 'tp->t_rxtshift % 2 == 0' 7814 * should take care of that. 7815 */ 7816 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == 7817 (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && 7818 (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && 7819 tp->t_rxtshift % 2 == 0)) { 7820 /* 7821 * Enter Path MTU Black-hole Detection mechanism: - 7822 * Disable Path MTU Discovery (IP "DF" bit). - 7823 * Reduce MTU to lower value than what we negotiated 7824 * with peer. 7825 */ 7826 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { 7827 /* Record that we may have found a black hole. */ 7828 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 7829 /* Keep track of previous MSS. */ 7830 tp->t_pmtud_saved_maxseg = tp->t_maxseg; 7831 } 7832 7833 /* 7834 * Reduce the MSS to blackhole value or to the 7835 * default in an attempt to retransmit. 7836 */ 7837 #ifdef INET6 7838 if (isipv6 && 7839 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { 7840 /* Use the sysctl tuneable blackhole MSS. */ 7841 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; 7842 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 7843 } else if (isipv6) { 7844 /* Use the default MSS. */ 7845 tp->t_maxseg = V_tcp_v6mssdflt; 7846 /* 7847 * Disable Path MTU Discovery when we switch 7848 * to minmss. 7849 */ 7850 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 7851 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 7852 } 7853 #endif 7854 #if defined(INET6) && defined(INET) 7855 else 7856 #endif 7857 #ifdef INET 7858 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { 7859 /* Use the sysctl tuneable blackhole MSS. */ 7860 tp->t_maxseg = V_tcp_pmtud_blackhole_mss; 7861 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 7862 } else { 7863 /* Use the default MSS. */ 7864 tp->t_maxseg = V_tcp_mssdflt; 7865 /* 7866 * Disable Path MTU Discovery when we switch 7867 * to minmss. 
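 * (Once we are down at the default/minimum MSS there is no smaller
 * value left to probe with, so PLPMTU discovery is switched off for
 * this connection rather than clamped any further.)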
7868 */ 7869 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 7870 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 7871 } 7872 #endif 7873 } else { 7874 /* 7875 * If further retransmissions are still unsuccessful 7876 * with a lowered MTU, maybe this isn't a blackhole 7877 * and we restore the previous MSS and blackhole 7878 * detection flags. The limit '6' is determined by 7879 * giving each probe stage (1448, 1188, 524) 2 7880 * chances to recover. 7881 */ 7882 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 7883 (tp->t_rxtshift >= 6)) { 7884 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 7885 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 7886 tp->t_maxseg = tp->t_pmtud_saved_maxseg; 7887 if (tp->t_maxseg < V_tcp_mssdflt) { 7888 /* 7889 * The MSS is so small we should not 7890 * process incoming SACK's since we are 7891 * subject to attack in such a case. 7892 */ 7893 tp->t_flags2 |= TF2_PROC_SACK_PROHIBIT; 7894 } else { 7895 tp->t_flags2 &= ~TF2_PROC_SACK_PROHIBIT; 7896 } 7897 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed); 7898 } 7899 } 7900 } 7901 /* 7902 * Disable RFC1323 and SACK if we haven't got any response to 7903 * our third SYN to work-around some broken terminal servers 7904 * (most of which have hopefully been retired) that have bad VJ 7905 * header compression code which trashes TCP segments containing 7906 * unknown-to-them TCP options. 7907 */ 7908 if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && 7909 (tp->t_rxtshift == 3)) 7910 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); 7911 /* 7912 * If we backed off this far, our srtt estimate is probably bogus. 7913 * Clobber it so we'll take the next rtt measurement as our srtt; 7914 * move the current srtt into rttvar to keep the current retransmit 7915 * times until then. 7916 */ 7917 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 7918 #ifdef INET6 7919 if ((inp->inp_vflag & INP_IPV6) != 0) 7920 in6_losing(inp); 7921 else 7922 #endif 7923 in_losing(inp); 7924 tp->t_rttvar += tp->t_srtt; 7925 tp->t_srtt = 0; 7926 } 7927 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 7928 tp->snd_recover = tp->snd_max; 7929 tp->t_flags |= TF_ACKNOW; 7930 tp->t_rtttime = 0; 7931 rack_cong_signal(tp, CC_RTO, tp->snd_una, __LINE__); 7932 out: 7933 return (retval); 7934 } 7935 7936 static int 7937 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling, uint8_t *doing_tlp) 7938 { 7939 int32_t ret = 0; 7940 int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK); 7941 7942 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 7943 (tp->t_flags & TF_GPUTINPROG)) { 7944 /* 7945 * We have a goodput in progress 7946 * and we have entered a late state. 7947 * Do we have enough data in the sb 7948 * to handle the GPUT request? 7949 */ 7950 uint32_t bytes; 7951 7952 bytes = tp->gput_ack - tp->gput_seq; 7953 if (SEQ_GT(tp->gput_seq, tp->snd_una)) 7954 bytes += tp->gput_seq - tp->snd_una; 7955 if (bytes > sbavail(&tptosocket(tp)->so_snd)) { 7956 /* 7957 * There are not enough bytes in the socket 7958 * buffer that have been sent to cover this 7959 * measurement. Cancel it. 
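 * Illustrative arithmetic (hypothetical sequence numbers): with
 * gput_seq = 1000, gput_ack = 51000 and snd_una = 500 the measurement
 * still needs 50000 + 500 bytes to be present in the send buffer; if
 * sbavail() reports less than that the sample can never complete, so
 * it is cancelled below.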
7960 */ 7961 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 7962 rack->r_ctl.rc_gp_srtt /*flex1*/, 7963 tp->gput_seq, 7964 0, 0, 18, __LINE__, NULL, 0); 7965 tp->t_flags &= ~TF_GPUTINPROG; 7966 } 7967 } 7968 if (timers == 0) { 7969 return (0); 7970 } 7971 if (tp->t_state == TCPS_LISTEN) { 7972 /* no timers on listen sockets */ 7973 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) 7974 return (0); 7975 return (1); 7976 } 7977 if ((timers & PACE_TMR_RACK) && 7978 rack->rc_on_min_to) { 7979 /* 7980 * For the rack timer when we 7981 * are on a min-timeout (which means rrr_conf = 3) 7982 * we don't want to check the timer. It may 7983 * be going off for a pace and thats ok we 7984 * want to send the retransmit (if its ready). 7985 * 7986 * If its on a normal rack timer (non-min) then 7987 * we will check if its expired. 7988 */ 7989 goto skip_time_check; 7990 } 7991 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 7992 uint32_t left; 7993 7994 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 7995 ret = -1; 7996 rack_log_to_processing(rack, cts, ret, 0); 7997 return (0); 7998 } 7999 if (hpts_calling == 0) { 8000 /* 8001 * A user send or queued mbuf (sack) has called us? We 8002 * return 0 and let the pacing guards 8003 * deal with it if they should or 8004 * should not cause a send. 8005 */ 8006 ret = -2; 8007 rack_log_to_processing(rack, cts, ret, 0); 8008 return (0); 8009 } 8010 /* 8011 * Ok our timer went off early and we are not paced false 8012 * alarm, go back to sleep. We make sure we don't have 8013 * no-sack wakeup on since we no longer have a PKT_OUTPUT 8014 * flag in place. 8015 */ 8016 rack->rc_tp->t_flags2 &= ~TF2_DONT_SACK_QUEUE; 8017 ret = -3; 8018 left = rack->r_ctl.rc_timer_exp - cts; 8019 tcp_hpts_insert(tp, HPTS_MS_TO_SLOTS(left)); 8020 rack_log_to_processing(rack, cts, ret, left); 8021 return (1); 8022 } 8023 skip_time_check: 8024 rack->rc_tmr_stopped = 0; 8025 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; 8026 if (timers & PACE_TMR_DELACK) { 8027 ret = rack_timeout_delack(tp, rack, cts); 8028 } else if (timers & PACE_TMR_RACK) { 8029 rack->r_ctl.rc_tlp_rxt_last_time = cts; 8030 rack->r_fast_output = 0; 8031 ret = rack_timeout_rack(tp, rack, cts); 8032 } else if (timers & PACE_TMR_TLP) { 8033 rack->r_ctl.rc_tlp_rxt_last_time = cts; 8034 rack->r_fast_output = 0; 8035 ret = rack_timeout_tlp(tp, rack, cts, doing_tlp); 8036 } else if (timers & PACE_TMR_RXT) { 8037 rack->r_ctl.rc_tlp_rxt_last_time = cts; 8038 rack->r_fast_output = 0; 8039 ret = rack_timeout_rxt(tp, rack, cts); 8040 } else if (timers & PACE_TMR_PERSIT) { 8041 ret = rack_timeout_persist(tp, rack, cts); 8042 } else if (timers & PACE_TMR_KEEP) { 8043 ret = rack_timeout_keepalive(tp, rack, cts); 8044 } 8045 rack_log_to_processing(rack, cts, ret, timers); 8046 return (ret); 8047 } 8048 8049 static void 8050 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line) 8051 { 8052 struct timeval tv; 8053 uint32_t us_cts, flags_on_entry; 8054 uint8_t hpts_removed = 0; 8055 8056 flags_on_entry = rack->r_ctl.rc_hpts_flags; 8057 us_cts = tcp_get_usecs(&tv); 8058 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 8059 ((TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) || 8060 ((tp->snd_max - tp->snd_una) == 0))) { 8061 tcp_hpts_remove(rack->rc_tp); 8062 hpts_removed = 1; 8063 /* If we were not delayed cancel out the flag. 
*/
8064 if ((tp->snd_max - tp->snd_una) == 0)
8065 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT;
8066 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry);
8067 }
8068 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
8069 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK;
8070 if (tcp_in_hpts(rack->rc_tp) &&
8071 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) {
8072 /*
8073 * Canceling timers when we have no output being
8074 * paced. We also must remove ourselves from the
8075 * hpts.
8076 */
8077 tcp_hpts_remove(rack->rc_tp);
8078 hpts_removed = 1;
8079 }
8080 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK);
8081 }
8082 if (hpts_removed == 0)
8083 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry);
8084 }
8085
8086 static int
8087 rack_stopall(struct tcpcb *tp)
8088 {
8089 struct tcp_rack *rack;
8090
8091 rack = (struct tcp_rack *)tp->t_fb_ptr;
8092 rack->t_timers_stopped = 1;
8093
8094 tcp_hpts_remove(tp);
8095
8096 return (0);
8097 }
8098
8099 static void
8100 rack_stop_all_timers(struct tcpcb *tp, struct tcp_rack *rack)
8101 {
8102 /*
8103 * Assure no timers are running.
8104 */
8105 if (tcp_timer_active(tp, TT_PERSIST)) {
8106 /* We enter in persists, set the flag appropriately */
8107 rack->rc_in_persist = 1;
8108 }
8109 if (tcp_in_hpts(rack->rc_tp)) {
8110 tcp_hpts_remove(rack->rc_tp);
8111 }
8112 }
8113
8114 static void
8115 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
8116 struct rack_sendmap *rsm, uint64_t ts, uint32_t add_flag, int segsiz)
8117 {
8118 int32_t idx;
8119
8120 rsm->r_rtr_cnt++;
8121 if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) {
8122 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS;
8123 rsm->r_flags |= RACK_OVERMAX;
8124 }
8125 rsm->r_act_rxt_cnt++;
8126 /* Peg the count/index */
8127 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2);
8128 rsm->r_dupack = 0;
8129 if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & RACK_TLP) == 0)) {
8130 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start);
8131 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start);
8132 }
8133 if (rsm->r_flags & RACK_WAS_LOST) {
8134 /*
8135 * We retransmitted it, putting it back in flight;
8136 * remove the lost designation and reduce the
8137 * bytes considered lost.
8138 */
8139 rsm->r_flags &= ~RACK_WAS_LOST;
8140 KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)),
8141 ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack));
8142 if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start))
8143 rack->r_ctl.rc_considered_lost -= rsm->r_end - rsm->r_start;
8144 else
8145 rack->r_ctl.rc_considered_lost = 0;
8146 }
8147 idx = rsm->r_rtr_cnt - 1;
8148 rsm->r_tim_lastsent[idx] = ts;
8149 /*
8150 * Here we don't add in the len of the send, since it's already
8151 * in snd_una <-> snd_max.
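 *
 * Compare the new-send path in rack_log_output(), which runs
 * before snd_max is advanced and therefore has to add the length
 * in by hand:
 *
 *    retransmit: r_fas = ctf_flight_size(tp, rc_sacked)
 *    new send:   r_fas = ctf_flight_size(tp, rc_sacked) + (r_end - r_start)
 *
 * Both are meant to record the same quantity: the flight size as
 * it stood at the moment this rsm was (re)sent.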
8152 */ 8153 rsm->r_fas = ctf_flight_size(rack->rc_tp, 8154 rack->r_ctl.rc_sacked); 8155 if (rsm->r_flags & RACK_ACKED) { 8156 /* Problably MTU discovery messing with us */ 8157 rsm->r_flags &= ~RACK_ACKED; 8158 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 8159 } 8160 if (rsm->r_in_tmap) { 8161 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 8162 rsm->r_in_tmap = 0; 8163 } 8164 /* Lets make sure it really is in or not the GP window */ 8165 rack_mark_in_gp_win(tp, rsm); 8166 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 8167 rsm->r_in_tmap = 1; 8168 rsm->r_bas = (uint8_t)(((rsm->r_end - rsm->r_start) + segsiz - 1) / segsiz); 8169 /* Take off the must retransmit flag, if its on */ 8170 if (rsm->r_flags & RACK_MUST_RXT) { 8171 if (rack->r_must_retran) 8172 rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start); 8173 if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) { 8174 /* 8175 * We have retransmitted all we need. Clear 8176 * any must retransmit flags. 8177 */ 8178 rack->r_must_retran = 0; 8179 rack->r_ctl.rc_out_at_rto = 0; 8180 } 8181 rsm->r_flags &= ~RACK_MUST_RXT; 8182 } 8183 /* Remove any collapsed flag */ 8184 rsm->r_flags &= ~RACK_RWND_COLLAPSED; 8185 if (rsm->r_flags & RACK_SACK_PASSED) { 8186 /* We have retransmitted due to the SACK pass */ 8187 rsm->r_flags &= ~RACK_SACK_PASSED; 8188 rsm->r_flags |= RACK_WAS_SACKPASS; 8189 } 8190 } 8191 8192 static uint32_t 8193 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 8194 struct rack_sendmap *rsm, uint64_t ts, int32_t *lenp, uint32_t add_flag, int segsiz) 8195 { 8196 /* 8197 * We (re-)transmitted starting at rsm->r_start for some length 8198 * (possibly less than r_end. 8199 */ 8200 struct rack_sendmap *nrsm; 8201 int insret __diagused; 8202 uint32_t c_end; 8203 int32_t len; 8204 8205 len = *lenp; 8206 c_end = rsm->r_start + len; 8207 if (SEQ_GEQ(c_end, rsm->r_end)) { 8208 /* 8209 * We retransmitted the whole piece or more than the whole 8210 * slopping into the next rsm. 8211 */ 8212 rack_update_rsm(tp, rack, rsm, ts, add_flag, segsiz); 8213 if (c_end == rsm->r_end) { 8214 *lenp = 0; 8215 return (0); 8216 } else { 8217 int32_t act_len; 8218 8219 /* Hangs over the end return whats left */ 8220 act_len = rsm->r_end - rsm->r_start; 8221 *lenp = (len - act_len); 8222 return (rsm->r_end); 8223 } 8224 /* We don't get out of this block. */ 8225 } 8226 /* 8227 * Here we retransmitted less than the whole thing which means we 8228 * have to split this into what was transmitted and what was not. 8229 */ 8230 nrsm = rack_alloc_full_limit(rack); 8231 if (nrsm == NULL) { 8232 /* 8233 * We can't get memory, so lets not proceed. 8234 */ 8235 *lenp = 0; 8236 return (0); 8237 } 8238 /* 8239 * So here we are going to take the original rsm and make it what we 8240 * retransmitted. nrsm will be the tail portion we did not 8241 * retransmit. For example say the chunk was 1, 11 (10 bytes). And 8242 * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to 8243 * 1, 6 and the new piece will be 6, 11. 
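 *
 * For completeness (hypothetical numbers again): had the
 * retransmit instead covered 15 bytes starting at 1 against this
 * same 1, 11 rsm, we would have taken the SEQ_GEQ(c_end, r_end)
 * branch above, returned r_end (11) with *lenp trimmed to 5, and
 * the caller in rack_log_output() would loop around to charge the
 * remaining 5 bytes to the next rsm. We only get here when the
 * retransmit ends strictly inside this rsm.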
8244 */ 8245 rack_clone_rsm(rack, nrsm, rsm, c_end); 8246 nrsm->r_dupack = 0; 8247 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); 8248 #ifndef INVARIANTS 8249 (void)tqhash_insert(rack->r_ctl.tqh, nrsm); 8250 #else 8251 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { 8252 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p", 8253 nrsm, insret, rack, rsm); 8254 } 8255 #endif 8256 if (rsm->r_in_tmap) { 8257 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 8258 nrsm->r_in_tmap = 1; 8259 } 8260 rsm->r_flags &= (~RACK_HAS_FIN); 8261 rack_update_rsm(tp, rack, rsm, ts, add_flag, segsiz); 8262 /* Log a split of rsm into rsm and nrsm */ 8263 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); 8264 *lenp = 0; 8265 return (0); 8266 } 8267 8268 static void 8269 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 8270 uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t cts, 8271 struct rack_sendmap *hintrsm, uint32_t add_flag, struct mbuf *s_mb, 8272 uint32_t s_moff, int hw_tls, int segsiz) 8273 { 8274 struct tcp_rack *rack; 8275 struct rack_sendmap *rsm, *nrsm; 8276 int insret __diagused; 8277 8278 register uint32_t snd_max, snd_una; 8279 8280 /* 8281 * Add to the RACK log of packets in flight or retransmitted. If 8282 * there is a TS option we will use the TS echoed, if not we will 8283 * grab a TS. 8284 * 8285 * Retransmissions will increment the count and move the ts to its 8286 * proper place. Note that if options do not include TS's then we 8287 * won't be able to effectively use the ACK for an RTT on a retran. 8288 * 8289 * Notes about r_start and r_end. Lets consider a send starting at 8290 * sequence 1 for 10 bytes. In such an example the r_start would be 8291 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. 8292 * This means that r_end is actually the first sequence for the next 8293 * slot (11). 8294 * 8295 */ 8296 /* 8297 * If err is set what do we do XXXrrs? should we not add the thing? 8298 * -- i.e. return if err != 0 or should we pretend we sent it? -- 8299 * i.e. proceed with add ** do this for now. 8300 */ 8301 INP_WLOCK_ASSERT(tptoinpcb(tp)); 8302 if (err) 8303 /* 8304 * We don't log errors -- we could but snd_max does not 8305 * advance in this case either. 8306 */ 8307 return; 8308 8309 if (th_flags & TH_RST) { 8310 /* 8311 * We don't log resets and we return immediately from 8312 * sending 8313 */ 8314 return; 8315 } 8316 rack = (struct tcp_rack *)tp->t_fb_ptr; 8317 snd_una = tp->snd_una; 8318 snd_max = tp->snd_max; 8319 if (th_flags & (TH_SYN | TH_FIN)) { 8320 /* 8321 * The call to rack_log_output is made before bumping 8322 * snd_max. This means we can record one extra byte on a SYN 8323 * or FIN if seq_out is adding more on and a FIN is present 8324 * (and we are not resending). 8325 */ 8326 if ((th_flags & TH_SYN) && (seq_out == tp->iss)) 8327 len++; 8328 if (th_flags & TH_FIN) 8329 len++; 8330 } 8331 if (SEQ_LEQ((seq_out + len), snd_una)) { 8332 /* Are sending an old segment to induce an ack (keep-alive)? */ 8333 return; 8334 } 8335 if (SEQ_LT(seq_out, snd_una)) { 8336 /* huh? should we panic? */ 8337 uint32_t end; 8338 8339 end = seq_out + len; 8340 seq_out = snd_una; 8341 if (SEQ_GEQ(end, seq_out)) 8342 len = end - seq_out; 8343 else 8344 len = 0; 8345 } 8346 if (len == 0) { 8347 /* We don't log zero window probes */ 8348 return; 8349 } 8350 if (IN_FASTRECOVERY(tp->t_flags)) { 8351 rack->r_ctl.rc_prr_out += len; 8352 } 8353 /* First question is it a retransmission or new? 
*/ 8354 if (seq_out == snd_max) { 8355 /* Its new */ 8356 rack_chk_req_and_hybrid_on_out(rack, seq_out, len, cts); 8357 again: 8358 rsm = rack_alloc(rack); 8359 if (rsm == NULL) { 8360 /* 8361 * Hmm out of memory and the tcb got destroyed while 8362 * we tried to wait. 8363 */ 8364 return; 8365 } 8366 if (th_flags & TH_FIN) { 8367 rsm->r_flags = RACK_HAS_FIN|add_flag; 8368 } else { 8369 rsm->r_flags = add_flag; 8370 } 8371 if (hw_tls) 8372 rsm->r_hw_tls = 1; 8373 rsm->r_tim_lastsent[0] = cts; 8374 rsm->r_rtr_cnt = 1; 8375 rsm->r_act_rxt_cnt = 0; 8376 rsm->r_rtr_bytes = 0; 8377 if (th_flags & TH_SYN) { 8378 /* The data space is one beyond snd_una */ 8379 rsm->r_flags |= RACK_HAS_SYN; 8380 } 8381 rsm->r_start = seq_out; 8382 rsm->r_end = rsm->r_start + len; 8383 rack_mark_in_gp_win(tp, rsm); 8384 rsm->r_dupack = 0; 8385 /* 8386 * save off the mbuf location that 8387 * sndmbuf_noadv returned (which is 8388 * where we started copying from).. 8389 */ 8390 rsm->m = s_mb; 8391 rsm->soff = s_moff; 8392 /* 8393 * Here we do add in the len of send, since its not yet 8394 * reflected in in snduna <->snd_max 8395 */ 8396 rsm->r_fas = (ctf_flight_size(rack->rc_tp, 8397 rack->r_ctl.rc_sacked) + 8398 (rsm->r_end - rsm->r_start)); 8399 if ((rack->rc_initial_ss_comp == 0) && 8400 (rack->r_ctl.ss_hi_fs < rsm->r_fas)) { 8401 rack->r_ctl.ss_hi_fs = rsm->r_fas; 8402 } 8403 /* rsm->m will be NULL if RACK_HAS_SYN or RACK_HAS_FIN is set */ 8404 if (rsm->m) { 8405 if (rsm->m->m_len <= rsm->soff) { 8406 /* 8407 * XXXrrs Question, will this happen? 8408 * 8409 * If sbsndptr is set at the correct place 8410 * then s_moff should always be somewhere 8411 * within rsm->m. But if the sbsndptr was 8412 * off then that won't be true. If it occurs 8413 * we need to walkout to the correct location. 8414 */ 8415 struct mbuf *lm; 8416 8417 lm = rsm->m; 8418 while (lm->m_len <= rsm->soff) { 8419 rsm->soff -= lm->m_len; 8420 lm = lm->m_next; 8421 KASSERT(lm != NULL, ("%s rack:%p lm goes null orig_off:%u origmb:%p rsm->soff:%u", 8422 __func__, rack, s_moff, s_mb, rsm->soff)); 8423 } 8424 rsm->m = lm; 8425 } 8426 rsm->orig_m_len = rsm->m->m_len; 8427 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 8428 } else { 8429 rsm->orig_m_len = 0; 8430 rsm->orig_t_space = 0; 8431 } 8432 rsm->r_bas = (uint8_t)((len + segsiz - 1) / segsiz); 8433 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 8434 /* Log a new rsm */ 8435 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_NEW, 0, __LINE__); 8436 #ifndef INVARIANTS 8437 (void)tqhash_insert(rack->r_ctl.tqh, rsm); 8438 #else 8439 if ((insret = tqhash_insert(rack->r_ctl.tqh, rsm)) != 0) { 8440 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p", 8441 nrsm, insret, rack, rsm); 8442 } 8443 #endif 8444 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 8445 rsm->r_in_tmap = 1; 8446 if (rsm->r_flags & RACK_IS_PCM) { 8447 rack->r_ctl.pcm_i.send_time = cts; 8448 rack->r_ctl.pcm_i.eseq = rsm->r_end; 8449 /* First time through we set the start too */ 8450 if (rack->pcm_in_progress == 0) 8451 rack->r_ctl.pcm_i.sseq = rsm->r_start; 8452 } 8453 /* 8454 * Special case detection, is there just a single 8455 * packet outstanding when we are not in recovery? 8456 * 8457 * If this is true mark it so. 
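 *
 * The r_one_out_nr mark made here (on the rsm just before the one
 * being sent) is consumed later in tcp_rack_xmit_timer(): a
 * cum-acked RTT sample from a small send that was essentially
 * alone in flight says little about buffering on the path, so its
 * confidence is dropped to 0 there unless the sample came from a
 * SACK (confidence 2).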
8458 */ 8459 if ((IN_FASTRECOVERY(tp->t_flags) == 0) && 8460 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) == ctf_fixed_maxseg(tp))) { 8461 struct rack_sendmap *prsm; 8462 8463 prsm = tqhash_prev(rack->r_ctl.tqh, rsm); 8464 if (prsm) 8465 prsm->r_one_out_nr = 1; 8466 } 8467 return; 8468 } 8469 /* 8470 * If we reach here its a retransmission and we need to find it. 8471 */ 8472 more: 8473 if (hintrsm && (hintrsm->r_start == seq_out)) { 8474 rsm = hintrsm; 8475 hintrsm = NULL; 8476 } else { 8477 /* No hints sorry */ 8478 rsm = NULL; 8479 } 8480 if ((rsm) && (rsm->r_start == seq_out)) { 8481 seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag, segsiz); 8482 if (len == 0) { 8483 return; 8484 } else { 8485 goto more; 8486 } 8487 } 8488 /* Ok it was not the last pointer go through it the hard way. */ 8489 refind: 8490 rsm = tqhash_find(rack->r_ctl.tqh, seq_out); 8491 if (rsm) { 8492 if (rsm->r_start == seq_out) { 8493 seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag, segsiz); 8494 if (len == 0) { 8495 return; 8496 } else { 8497 goto refind; 8498 } 8499 } 8500 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { 8501 /* Transmitted within this piece */ 8502 /* 8503 * Ok we must split off the front and then let the 8504 * update do the rest 8505 */ 8506 nrsm = rack_alloc_full_limit(rack); 8507 if (nrsm == NULL) { 8508 rack_update_rsm(tp, rack, rsm, cts, add_flag, segsiz); 8509 return; 8510 } 8511 /* 8512 * copy rsm to nrsm and then trim the front of rsm 8513 * to not include this part. 8514 */ 8515 rack_clone_rsm(rack, nrsm, rsm, seq_out); 8516 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); 8517 #ifndef INVARIANTS 8518 (void)tqhash_insert(rack->r_ctl.tqh, nrsm); 8519 #else 8520 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { 8521 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p", 8522 nrsm, insret, rack, rsm); 8523 } 8524 #endif 8525 if (rsm->r_in_tmap) { 8526 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 8527 nrsm->r_in_tmap = 1; 8528 } 8529 rsm->r_flags &= (~RACK_HAS_FIN); 8530 seq_out = rack_update_entry(tp, rack, nrsm, cts, &len, add_flag, segsiz); 8531 if (len == 0) { 8532 return; 8533 } else if (len > 0) 8534 goto refind; 8535 } 8536 } 8537 /* 8538 * Hmm not found in map did they retransmit both old and on into the 8539 * new? 8540 */ 8541 if (seq_out == tp->snd_max) { 8542 goto again; 8543 } else if (SEQ_LT(seq_out, tp->snd_max)) { 8544 #ifdef INVARIANTS 8545 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", 8546 seq_out, len, tp->snd_una, tp->snd_max); 8547 printf("Starting Dump of all rack entries\n"); 8548 TQHASH_FOREACH(rsm, rack->r_ctl.tqh) { 8549 printf("rsm:%p start:%u end:%u\n", 8550 rsm, rsm->r_start, rsm->r_end); 8551 } 8552 printf("Dump complete\n"); 8553 panic("seq_out not found rack:%p tp:%p", 8554 rack, tp); 8555 #endif 8556 } else { 8557 #ifdef INVARIANTS 8558 /* 8559 * Hmm beyond sndmax? (only if we are using the new rtt-pack 8560 * flag) 8561 */ 8562 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", 8563 seq_out, len, tp->snd_max, tp); 8564 #endif 8565 } 8566 } 8567 8568 /* 8569 * Record one of the RTT updates from an ack into 8570 * our sample structure. 
8571 */
8572
8573 static void
8574 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, uint32_t len, uint32_t us_rtt,
8575 int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt)
8576 {
8577 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
8578 (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) {
8579 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt;
8580 }
8581 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
8582 (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) {
8583 rack->r_ctl.rack_rs.rs_rtt_highest = rtt;
8584 }
8585 if (rack->rc_tp->t_flags & TF_GPUTINPROG) {
8586 if (us_rtt < rack->r_ctl.rc_gp_lowrtt)
8587 rack->r_ctl.rc_gp_lowrtt = us_rtt;
8588 if (rack->rc_tp->snd_wnd > rack->r_ctl.rc_gp_high_rwnd)
8589 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd;
8590 }
8591 if ((confidence == 1) &&
8592 ((rsm == NULL) ||
8593 (rsm->r_just_ret) ||
8594 (rsm->r_one_out_nr &&
8595 len < (ctf_fixed_maxseg(rack->rc_tp) * 2)))) {
8596 /*
8597 * If the rsm had a just-return hit on it then we
8598 * can't trust the rtt measurement for buffer
8599 * determination.
8600 * Note that a confidence of 2 indicates
8601 * SACK'd, which overrides the r_just_ret or
8602 * the r_one_out_nr. If it was a CUM-ACK and
8603 * we had only two outstanding but get an
8604 * ack for only one, that also lowers our
8605 * confidence.
8606 */
8607 confidence = 0;
8608 }
8609 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) ||
8610 (rack->r_ctl.rack_rs.rs_us_rtt > us_rtt)) {
8611 if (rack->r_ctl.rack_rs.confidence == 0) {
8612 /*
8613 * We take anything with no current confidence
8614 * saved.
8615 */
8616 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt;
8617 rack->r_ctl.rack_rs.confidence = confidence;
8618 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt;
8619 } else if (confidence != 0) {
8620 /*
8621 * Once we have a confident number,
8622 * we can update it with a smaller
8623 * value since this confident number
8624 * may include the DSACK time until
8625 * the next segment (the second one) arrived.
8626 */
8627 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt;
8628 rack->r_ctl.rack_rs.confidence = confidence;
8629 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt;
8630 }
8631 }
8632 rack_log_rtt_upd(rack->rc_tp, rack, us_rtt, len, rsm, confidence);
8633 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID;
8634 rack->r_ctl.rack_rs.rs_rtt_tot += rtt;
8635 rack->r_ctl.rack_rs.rs_rtt_cnt++;
8636 }
8637
8638 /*
8639 * Collect new round-trip time estimate
8640 * and update averages and current timeout.
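 *
 * Which of the collected samples is committed depends on
 * rc_rate_sample_method: USE_RTT_LOW takes rs_rtt_lowest,
 * USE_RTT_HIGH takes rs_rtt_highest and USE_RTT_AVG takes
 * rs_rtt_tot / rs_rtt_cnt. The microsecond rs_us_rtt additionally
 * feeds the goodput srtt and the lowest/highest path rtt tracking
 * below.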
8641 */ 8642 static void 8643 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) 8644 { 8645 int32_t delta; 8646 int32_t rtt; 8647 8648 if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) 8649 /* No valid sample */ 8650 return; 8651 if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) { 8652 /* We are to use the lowest RTT seen in a single ack */ 8653 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; 8654 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) { 8655 /* We are to use the highest RTT seen in a single ack */ 8656 rtt = rack->r_ctl.rack_rs.rs_rtt_highest; 8657 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) { 8658 /* We are to use the average RTT seen in a single ack */ 8659 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot / 8660 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt); 8661 } else { 8662 #ifdef INVARIANTS 8663 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method); 8664 #endif 8665 return; 8666 } 8667 if (rtt == 0) 8668 rtt = 1; 8669 if (rack->rc_gp_rtt_set == 0) { 8670 /* 8671 * With no RTT we have to accept 8672 * even one we are not confident of. 8673 */ 8674 rack->r_ctl.rc_gp_srtt = rack->r_ctl.rack_rs.rs_us_rtt; 8675 rack->rc_gp_rtt_set = 1; 8676 } else if (rack->r_ctl.rack_rs.confidence) { 8677 /* update the running gp srtt */ 8678 rack->r_ctl.rc_gp_srtt -= (rack->r_ctl.rc_gp_srtt/8); 8679 rack->r_ctl.rc_gp_srtt += rack->r_ctl.rack_rs.rs_us_rtt / 8; 8680 } 8681 if (rack->r_ctl.rack_rs.confidence) { 8682 /* 8683 * record the low and high for highly buffered path computation, 8684 * we only do this if we are confident (not a retransmission). 8685 */ 8686 if (rack->r_ctl.rc_highest_us_rtt < rack->r_ctl.rack_rs.rs_us_rtt) { 8687 rack->r_ctl.rc_highest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 8688 } 8689 if (rack->rc_highly_buffered == 0) { 8690 /* 8691 * Currently once we declare a path has 8692 * highly buffered there is no going 8693 * back, which may be a problem... 8694 */ 8695 if ((rack->r_ctl.rc_highest_us_rtt / rack->r_ctl.rc_lowest_us_rtt) > rack_hbp_thresh) { 8696 rack_log_rtt_shrinks(rack, rack->r_ctl.rack_rs.rs_us_rtt, 8697 rack->r_ctl.rc_highest_us_rtt, 8698 rack->r_ctl.rc_lowest_us_rtt, 8699 RACK_RTTS_SEEHBP); 8700 rack->rc_highly_buffered = 1; 8701 } 8702 } 8703 } 8704 if ((rack->r_ctl.rack_rs.confidence) || 8705 (rack->r_ctl.rack_rs.rs_us_rtrcnt == 1)) { 8706 /* 8707 * If we are highly confident of it <or> it was 8708 * never retransmitted we accept it as the last us_rtt. 8709 */ 8710 rack->r_ctl.rc_last_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 8711 /* The lowest rtt can be set if its was not retransmited */ 8712 if (rack->r_ctl.rc_lowest_us_rtt > rack->r_ctl.rack_rs.rs_us_rtt) { 8713 rack->r_ctl.rc_lowest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 8714 if (rack->r_ctl.rc_lowest_us_rtt == 0) 8715 rack->r_ctl.rc_lowest_us_rtt = 1; 8716 } 8717 } 8718 rack = (struct tcp_rack *)tp->t_fb_ptr; 8719 if (tp->t_srtt != 0) { 8720 /* 8721 * We keep a simple srtt in microseconds, like our rtt 8722 * measurement. We don't need to do any tricks with shifting 8723 * etc. Instead we just add in 1/8th of the new measurement 8724 * and subtract out 1/8 of the old srtt. We do the same with 8725 * the variance after finding the absolute value of the 8726 * difference between this sample and the current srtt. 
8727 */ 8728 delta = tp->t_srtt - rtt; 8729 /* Take off 1/8th of the current sRTT */ 8730 tp->t_srtt -= (tp->t_srtt >> 3); 8731 /* Add in 1/8th of the new RTT just measured */ 8732 tp->t_srtt += (rtt >> 3); 8733 if (tp->t_srtt <= 0) 8734 tp->t_srtt = 1; 8735 /* Now lets make the absolute value of the variance */ 8736 if (delta < 0) 8737 delta = -delta; 8738 /* Subtract out 1/8th */ 8739 tp->t_rttvar -= (tp->t_rttvar >> 3); 8740 /* Add in 1/8th of the new variance we just saw */ 8741 tp->t_rttvar += (delta >> 3); 8742 if (tp->t_rttvar <= 0) 8743 tp->t_rttvar = 1; 8744 } else { 8745 /* 8746 * No rtt measurement yet - use the unsmoothed rtt. Set the 8747 * variance to half the rtt (so our first retransmit happens 8748 * at 3*rtt). 8749 */ 8750 tp->t_srtt = rtt; 8751 tp->t_rttvar = rtt >> 1; 8752 } 8753 rack->rc_srtt_measure_made = 1; 8754 KMOD_TCPSTAT_INC(tcps_rttupdated); 8755 if (tp->t_rttupdated < UCHAR_MAX) 8756 tp->t_rttupdated++; 8757 #ifdef STATS 8758 if (rack_stats_gets_ms_rtt == 0) { 8759 /* Send in the microsecond rtt used for rxt timeout purposes */ 8760 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt)); 8761 } else if (rack_stats_gets_ms_rtt == 1) { 8762 /* Send in the millisecond rtt used for rxt timeout purposes */ 8763 int32_t ms_rtt; 8764 8765 /* Round up */ 8766 ms_rtt = (rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC; 8767 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt)); 8768 } else if (rack_stats_gets_ms_rtt == 2) { 8769 /* Send in the millisecond rtt has close to the path RTT as we can get */ 8770 int32_t ms_rtt; 8771 8772 /* Round up */ 8773 ms_rtt = (rack->r_ctl.rack_rs.rs_us_rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC; 8774 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt)); 8775 } else { 8776 /* Send in the microsecond rtt has close to the path RTT as we can get */ 8777 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt)); 8778 } 8779 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_PATHRTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt)); 8780 #endif 8781 rack->r_ctl.last_rcv_tstmp_for_rtt = tcp_tv_to_msec(&rack->r_ctl.act_rcv_time); 8782 /* 8783 * the retransmit should happen at rtt + 4 * rttvar. Because of the 8784 * way we do the smoothing, srtt and rttvar will each average +1/2 8785 * tick of bias. When we compute the retransmit timer, we want 1/2 8786 * tick of rounding and 1 extra tick because of +-1/2 tick 8787 * uncertainty in the firing of the timer. The bias will give us 8788 * exactly the 1.5 tick we need. But, because the bias is 8789 * statistical, we have to test that we don't drop below the minimum 8790 * feasible timer (which is 2 ticks). 8791 */ 8792 tp->t_rxtshift = 0; 8793 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 8794 max(rack_rto_min, rtt + 2), rack_rto_max, rack->r_ctl.timer_slop); 8795 rack_log_rtt_sample(rack, rtt); 8796 tp->t_softerror = 0; 8797 } 8798 8799 8800 static void 8801 rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts) 8802 { 8803 /* 8804 * Apply to filter the inbound us-rtt at us_cts. 8805 */ 8806 uint32_t old_rtt; 8807 8808 old_rtt = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 8809 apply_filter_min_small(&rack->r_ctl.rc_gp_min_rtt, 8810 us_rtt, us_cts); 8811 if (old_rtt > us_rtt) { 8812 /* We just hit a new lower rtt time */ 8813 rack_log_rtt_shrinks(rack, us_cts, old_rtt, 8814 __LINE__, RACK_RTTS_NEWRTT); 8815 /* 8816 * Only count it if its lower than what we saw within our 8817 * calculated range. 
8818 */ 8819 if ((old_rtt - us_rtt) > rack_min_rtt_movement) { 8820 if (rack_probertt_lower_within && 8821 rack->rc_gp_dyn_mul && 8822 (rack->use_fixed_rate == 0) && 8823 (rack->rc_always_pace)) { 8824 /* 8825 * We are seeing a new lower rtt very close 8826 * to the time that we would have entered probe-rtt. 8827 * This is probably due to the fact that a peer flow 8828 * has entered probe-rtt. Lets go in now too. 8829 */ 8830 uint32_t val; 8831 8832 val = rack_probertt_lower_within * rack_time_between_probertt; 8833 val /= 100; 8834 if ((rack->in_probe_rtt == 0) && 8835 (rack->rc_skip_timely == 0) && 8836 ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) { 8837 rack_enter_probertt(rack, us_cts); 8838 } 8839 } 8840 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 8841 } 8842 } 8843 } 8844 8845 static int 8846 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 8847 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack) 8848 { 8849 uint32_t us_rtt; 8850 int32_t i, all; 8851 uint32_t t, len_acked; 8852 8853 if ((rsm->r_flags & RACK_ACKED) || 8854 (rsm->r_flags & RACK_WAS_ACKED)) 8855 /* Already done */ 8856 return (0); 8857 if (rsm->r_no_rtt_allowed) { 8858 /* Not allowed */ 8859 return (0); 8860 } 8861 if (ack_type == CUM_ACKED) { 8862 if (SEQ_GT(th_ack, rsm->r_end)) { 8863 len_acked = rsm->r_end - rsm->r_start; 8864 all = 1; 8865 } else { 8866 len_acked = th_ack - rsm->r_start; 8867 all = 0; 8868 } 8869 } else { 8870 len_acked = rsm->r_end - rsm->r_start; 8871 all = 0; 8872 } 8873 if (rsm->r_rtr_cnt == 1) { 8874 8875 t = cts - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 8876 if ((int)t <= 0) 8877 t = 1; 8878 if (!tp->t_rttlow || tp->t_rttlow > t) 8879 tp->t_rttlow = t; 8880 if (!rack->r_ctl.rc_rack_min_rtt || 8881 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 8882 rack->r_ctl.rc_rack_min_rtt = t; 8883 if (rack->r_ctl.rc_rack_min_rtt == 0) { 8884 rack->r_ctl.rc_rack_min_rtt = 1; 8885 } 8886 } 8887 if (TSTMP_GT(tcp_tv_to_usec(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])) 8888 us_rtt = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 8889 else 8890 us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 8891 if (us_rtt == 0) 8892 us_rtt = 1; 8893 if (CC_ALGO(tp)->rttsample != NULL) { 8894 /* Kick the RTT to the CC */ 8895 CC_ALGO(tp)->rttsample(&tp->t_ccv, us_rtt, 1, rsm->r_fas); 8896 } 8897 rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usec(&rack->r_ctl.act_rcv_time)); 8898 if (ack_type == SACKED) { 8899 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 1); 8900 tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 2 , rsm, rsm->r_rtr_cnt); 8901 } else { 8902 /* 8903 * We need to setup what our confidence 8904 * is in this ack. 8905 * 8906 * If the rsm was app limited and it is 8907 * less than a mss in length (the end 8908 * of the send) then we have a gap. If we 8909 * were app limited but say we were sending 8910 * multiple MSS's then we are more confident 8911 * int it. 8912 * 8913 * When we are not app-limited then we see if 8914 * the rsm is being included in the current 8915 * measurement, we tell this by the app_limited_needs_set 8916 * flag. 8917 * 8918 * Note that being cwnd blocked is not applimited 8919 * as well as the pacing delay between packets which 8920 * are sending only 1 or 2 MSS's also will show up 8921 * in the RTT. 
We probably need to examine this algorithm 8922 * a bit more and enhance it to account for the delay 8923 * between rsm's. We could do that by saving off the 8924 * pacing delay of each rsm (in an rsm) and then 8925 * factoring that in somehow though for now I am 8926 * not sure how :) 8927 */ 8928 int calc_conf = 0; 8929 8930 if (rsm->r_flags & RACK_APP_LIMITED) { 8931 if (all && (len_acked <= ctf_fixed_maxseg(tp))) 8932 calc_conf = 0; 8933 else 8934 calc_conf = 1; 8935 } else if (rack->app_limited_needs_set == 0) { 8936 calc_conf = 1; 8937 } else { 8938 calc_conf = 0; 8939 } 8940 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 2); 8941 tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 8942 calc_conf, rsm, rsm->r_rtr_cnt); 8943 } 8944 if ((rsm->r_flags & RACK_TLP) && 8945 (!IN_FASTRECOVERY(tp->t_flags))) { 8946 /* Segment was a TLP and our retrans matched */ 8947 if (rack->r_ctl.rc_tlp_cwnd_reduce) { 8948 rack_cong_signal(tp, CC_NDUPACK, th_ack, __LINE__); 8949 } 8950 } 8951 if ((rack->r_ctl.rc_rack_tmit_time == 0) || 8952 (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 8953 (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]))) { 8954 /* New more recent rack_tmit_time */ 8955 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 8956 if (rack->r_ctl.rc_rack_tmit_time == 0) 8957 rack->r_ctl.rc_rack_tmit_time = 1; 8958 rack->rc_rack_rtt = t; 8959 } 8960 return (1); 8961 } 8962 /* 8963 * We clear the soft/rxtshift since we got an ack. 8964 * There is no assurance we will call the commit() function 8965 * so we need to clear these to avoid incorrect handling. 8966 */ 8967 tp->t_rxtshift = 0; 8968 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 8969 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 8970 tp->t_softerror = 0; 8971 if (to && (to->to_flags & TOF_TS) && 8972 (ack_type == CUM_ACKED) && 8973 (to->to_tsecr) && 8974 ((rsm->r_flags & RACK_OVERMAX) == 0)) { 8975 /* 8976 * Now which timestamp does it match? In this block the ACK 8977 * must be coming from a previous transmission. 8978 */ 8979 for (i = 0; i < rsm->r_rtr_cnt; i++) { 8980 if (rack_ts_to_msec(rsm->r_tim_lastsent[i]) == to->to_tsecr) { 8981 t = cts - (uint32_t)rsm->r_tim_lastsent[i]; 8982 if ((int)t <= 0) 8983 t = 1; 8984 if (CC_ALGO(tp)->rttsample != NULL) { 8985 /* 8986 * Kick the RTT to the CC, here 8987 * we lie a bit in that we know the 8988 * retransmission is correct even though 8989 * we retransmitted. This is because 8990 * we match the timestamps. 8991 */ 8992 if (TSTMP_GT(tcp_tv_to_usec(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[i])) 8993 us_rtt = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[i]; 8994 else 8995 us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[i]; 8996 CC_ALGO(tp)->rttsample(&tp->t_ccv, us_rtt, 1, rsm->r_fas); 8997 } 8998 if ((i + 1) < rsm->r_rtr_cnt) { 8999 /* 9000 * The peer ack'd from our previous 9001 * transmission. We have a spurious 9002 * retransmission and thus we dont 9003 * want to update our rack_rtt. 9004 * 9005 * Hmm should there be a CC revert here? 
9006 * 9007 */ 9008 return (0); 9009 } 9010 if (!tp->t_rttlow || tp->t_rttlow > t) 9011 tp->t_rttlow = t; 9012 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 9013 rack->r_ctl.rc_rack_min_rtt = t; 9014 if (rack->r_ctl.rc_rack_min_rtt == 0) { 9015 rack->r_ctl.rc_rack_min_rtt = 1; 9016 } 9017 } 9018 if ((rack->r_ctl.rc_rack_tmit_time == 0) || 9019 (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 9020 (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]))) { 9021 /* New more recent rack_tmit_time */ 9022 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 9023 if (rack->r_ctl.rc_rack_tmit_time == 0) 9024 rack->r_ctl.rc_rack_tmit_time = 1; 9025 rack->rc_rack_rtt = t; 9026 } 9027 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[i], cts, 3); 9028 tcp_rack_xmit_timer(rack, t + 1, len_acked, t, 0, rsm, 9029 rsm->r_rtr_cnt); 9030 return (1); 9031 } 9032 } 9033 /* If we are logging log out the sendmap */ 9034 if (tcp_bblogging_on(rack->rc_tp)) { 9035 for (i = 0; i < rsm->r_rtr_cnt; i++) { 9036 rack_log_rtt_sendmap(rack, i, rsm->r_tim_lastsent[i], to->to_tsecr); 9037 } 9038 } 9039 goto ts_not_found; 9040 } else { 9041 /* 9042 * Ok its a SACK block that we retransmitted. or a windows 9043 * machine without timestamps. We can tell nothing from the 9044 * time-stamp since its not there or the time the peer last 9045 * received a segment that moved forward its cum-ack point. 9046 */ 9047 ts_not_found: 9048 i = rsm->r_rtr_cnt - 1; 9049 t = cts - (uint32_t)rsm->r_tim_lastsent[i]; 9050 if ((int)t <= 0) 9051 t = 1; 9052 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 9053 /* 9054 * We retransmitted and the ack came back in less 9055 * than the smallest rtt we have observed. We most 9056 * likely did an improper retransmit as outlined in 9057 * 6.2 Step 2 point 2 in the rack-draft so we 9058 * don't want to update our rack_rtt. We in 9059 * theory (in future) might want to think about reverting our 9060 * cwnd state but we won't for now. 9061 */ 9062 return (0); 9063 } else if (rack->r_ctl.rc_rack_min_rtt) { 9064 /* 9065 * We retransmitted it and the retransmit did the 9066 * job. 9067 */ 9068 if (!rack->r_ctl.rc_rack_min_rtt || 9069 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 9070 rack->r_ctl.rc_rack_min_rtt = t; 9071 if (rack->r_ctl.rc_rack_min_rtt == 0) { 9072 rack->r_ctl.rc_rack_min_rtt = 1; 9073 } 9074 } 9075 if ((rack->r_ctl.rc_rack_tmit_time == 0) || 9076 (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 9077 (uint32_t)rsm->r_tim_lastsent[i]))) { 9078 /* New more recent rack_tmit_time */ 9079 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[i]; 9080 if (rack->r_ctl.rc_rack_tmit_time == 0) 9081 rack->r_ctl.rc_rack_tmit_time = 1; 9082 rack->rc_rack_rtt = t; 9083 } 9084 return (1); 9085 } 9086 } 9087 return (0); 9088 } 9089 9090 /* 9091 * Mark the SACK_PASSED flag on all entries prior to rsm send wise. 
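 *
 * While walking backwards through the time-of-send list (rc_tmap)
 * this also does loss accounting: once the sacked rsm's last send
 * time plus the rack threshold (rack_calc_thresh_rack()) has
 * passed by cts, every earlier still-unacked entry the walk covers
 * is tagged RACK_WAS_LOST and added to rc_considered_lost. The
 * walk stops at the first entry already marked SACK_PASSED, since
 * everything sent before that was covered by an earlier pass.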
9092 */ 9093 static void 9094 rack_log_sack_passed(struct tcpcb *tp, 9095 struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t cts) 9096 { 9097 struct rack_sendmap *nrsm; 9098 uint32_t thresh; 9099 9100 /* Get our rxt threshold for lost consideration */ 9101 thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(tp, rack), cts, __LINE__, 0); 9102 /* Now start looking at rsm's */ 9103 nrsm = rsm; 9104 TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap, 9105 rack_head, r_tnext) { 9106 if (nrsm == rsm) { 9107 /* Skip original segment he is acked */ 9108 continue; 9109 } 9110 if (nrsm->r_flags & RACK_ACKED) { 9111 /* 9112 * Skip ack'd segments, though we 9113 * should not see these, since tmap 9114 * should not have ack'd segments. 9115 */ 9116 continue; 9117 } 9118 if (nrsm->r_flags & RACK_RWND_COLLAPSED) { 9119 /* 9120 * If the peer dropped the rwnd on 9121 * these then we don't worry about them. 9122 */ 9123 continue; 9124 } 9125 /* Check lost state */ 9126 if ((nrsm->r_flags & RACK_WAS_LOST) == 0) { 9127 uint32_t exp; 9128 9129 exp = ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]) + thresh; 9130 if (TSTMP_LT(exp, cts) || (exp == cts)) { 9131 /* We consider it lost */ 9132 nrsm->r_flags |= RACK_WAS_LOST; 9133 rack->r_ctl.rc_considered_lost += nrsm->r_end - nrsm->r_start; 9134 } 9135 } 9136 if (nrsm->r_flags & RACK_SACK_PASSED) { 9137 /* 9138 * We found one that is already marked 9139 * passed, we have been here before and 9140 * so all others below this are marked. 9141 */ 9142 break; 9143 } 9144 nrsm->r_flags |= RACK_SACK_PASSED; 9145 nrsm->r_flags &= ~RACK_WAS_SACKPASS; 9146 } 9147 } 9148 9149 static void 9150 rack_need_set_test(struct tcpcb *tp, 9151 struct tcp_rack *rack, 9152 struct rack_sendmap *rsm, 9153 tcp_seq th_ack, 9154 int line, 9155 int use_which) 9156 { 9157 struct rack_sendmap *s_rsm; 9158 9159 if ((tp->t_flags & TF_GPUTINPROG) && 9160 SEQ_GEQ(rsm->r_end, tp->gput_seq)) { 9161 /* 9162 * We were app limited, and this ack 9163 * butts up or goes beyond the point where we want 9164 * to start our next measurement. We need 9165 * to record the new gput_ts as here and 9166 * possibly update the start sequence. 9167 */ 9168 uint32_t seq, ts; 9169 9170 if (rsm->r_rtr_cnt > 1) { 9171 /* 9172 * This is a retransmit, can we 9173 * really make any assessment at this 9174 * point? We are not really sure of 9175 * the timestamp, is it this or the 9176 * previous transmission? 9177 * 9178 * Lets wait for something better that 9179 * is not retransmitted. 9180 */ 9181 return; 9182 } 9183 seq = tp->gput_seq; 9184 ts = tp->gput_ts; 9185 rack->app_limited_needs_set = 0; 9186 tp->gput_ts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time); 9187 /* Do we start at a new end? */ 9188 if ((use_which == RACK_USE_BEG) && 9189 SEQ_GEQ(rsm->r_start, tp->gput_seq)) { 9190 /* 9191 * When we get an ACK that just eats 9192 * up some of the rsm, we set RACK_USE_BEG 9193 * since whats at r_start (i.e. th_ack) 9194 * is left unacked and thats where the 9195 * measurement now starts. 9196 */ 9197 tp->gput_seq = rsm->r_start; 9198 } 9199 if ((use_which == RACK_USE_END) && 9200 SEQ_GEQ(rsm->r_end, tp->gput_seq)) { 9201 /* 9202 * We use the end when the cumack 9203 * is moving forward and completely 9204 * deleting the rsm passed so basically 9205 * r_end holds th_ack. 9206 * 9207 * For SACK's we also want to use the end 9208 * since this piece just got sacked and 9209 * we want to target anything after that 9210 * in our measurement. 
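 *
 * Summarizing the three use_which cases handled here:
 *
 *    RACK_USE_BEG           gput_seq = rsm->r_start (a partial ack,
 *                           what is left unacked starts the measure)
 *    RACK_USE_END           gput_seq = rsm->r_end   (the whole rsm
 *                           was consumed by a cum-ack or SACK)
 *    RACK_USE_END_OR_THACK  gput_seq = whichever of th_ack and
 *                           rsm->r_end is larger in sequence space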
9211 */ 9212 tp->gput_seq = rsm->r_end; 9213 } 9214 if (use_which == RACK_USE_END_OR_THACK) { 9215 /* 9216 * special case for ack moving forward, 9217 * not a sack, we need to move all the 9218 * way up to where this ack cum-ack moves 9219 * to. 9220 */ 9221 if (SEQ_GT(th_ack, rsm->r_end)) 9222 tp->gput_seq = th_ack; 9223 else 9224 tp->gput_seq = rsm->r_end; 9225 } 9226 if (SEQ_LT(tp->gput_seq, tp->snd_max)) 9227 s_rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq); 9228 else 9229 s_rsm = NULL; 9230 /* 9231 * Pick up the correct send time if we can the rsm passed in 9232 * may be equal to s_rsm if the RACK_USE_BEG was set. For the other 9233 * two cases (RACK_USE_THACK or RACK_USE_END) most likely we will 9234 * find a different seq i.e. the next send up. 9235 * 9236 * If that has not been sent, s_rsm will be NULL and we must 9237 * arrange it so this function will get called again by setting 9238 * app_limited_needs_set. 9239 */ 9240 if (s_rsm) 9241 rack->r_ctl.rc_gp_output_ts = s_rsm->r_tim_lastsent[0]; 9242 else { 9243 /* If we hit here we have to have *not* sent tp->gput_seq */ 9244 rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[0]; 9245 /* Set it up so we will go through here again */ 9246 rack->app_limited_needs_set = 1; 9247 } 9248 if (SEQ_GT(tp->gput_seq, tp->gput_ack)) { 9249 /* 9250 * We moved beyond this guy's range, re-calculate 9251 * the new end point. 9252 */ 9253 if (rack->rc_gp_filled == 0) { 9254 tp->gput_ack = tp->gput_seq + max(rc_init_window(rack), (MIN_GP_WIN * ctf_fixed_maxseg(tp))); 9255 } else { 9256 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 9257 } 9258 } 9259 /* 9260 * We are moving the goal post, we may be able to clear the 9261 * measure_saw_probe_rtt flag. 9262 */ 9263 if ((rack->in_probe_rtt == 0) && 9264 (rack->measure_saw_probe_rtt) && 9265 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 9266 rack->measure_saw_probe_rtt = 0; 9267 rack_log_pacing_delay_calc(rack, ts, tp->gput_ts, 9268 seq, tp->gput_seq, 9269 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | 9270 (uint64_t)rack->r_ctl.rc_gp_output_ts), 9271 5, line, NULL, 0); 9272 if (rack->rc_gp_filled && 9273 ((tp->gput_ack - tp->gput_seq) < 9274 max(rc_init_window(rack), (MIN_GP_WIN * 9275 ctf_fixed_maxseg(tp))))) { 9276 uint32_t ideal_amount; 9277 9278 ideal_amount = rack_get_measure_window(tp, rack); 9279 if (ideal_amount > sbavail(&tptosocket(tp)->so_snd)) { 9280 /* 9281 * There is no sense of continuing this measurement 9282 * because its too small to gain us anything we 9283 * trust. Skip it and that way we can start a new 9284 * measurement quicker. 9285 */ 9286 tp->t_flags &= ~TF_GPUTINPROG; 9287 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq, 9288 0, 0, 9289 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | 9290 (uint64_t)rack->r_ctl.rc_gp_output_ts), 9291 6, __LINE__, NULL, 0); 9292 } else { 9293 /* 9294 * Reset the window further out. 
9295 */ 9296 tp->gput_ack = tp->gput_seq + ideal_amount; 9297 } 9298 } 9299 rack_tend_gp_marks(tp, rack); 9300 rack_log_gpset(rack, tp->gput_ack, 0, 0, line, 2, rsm); 9301 } 9302 } 9303 9304 static inline int 9305 is_rsm_inside_declared_tlp_block(struct tcp_rack *rack, struct rack_sendmap *rsm) 9306 { 9307 if (SEQ_LT(rsm->r_end, rack->r_ctl.last_tlp_acked_start)) { 9308 /* Behind our TLP definition or right at */ 9309 return (0); 9310 } 9311 if (SEQ_GT(rsm->r_start, rack->r_ctl.last_tlp_acked_end)) { 9312 /* The start is beyond or right at our end of TLP definition */ 9313 return (0); 9314 } 9315 /* It has to be a sub-part of the original TLP recorded */ 9316 return (1); 9317 } 9318 9319 static uint32_t 9320 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, 9321 struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, 9322 uint32_t segsiz) 9323 { 9324 uint32_t start, end, changed = 0; 9325 struct rack_sendmap stack_map; 9326 struct rack_sendmap *rsm, *nrsm, *prev, *next; 9327 int insret __diagused; 9328 int32_t used_ref = 1; 9329 int can_use_hookery = 0; 9330 9331 start = sack->start; 9332 end = sack->end; 9333 rsm = *prsm; 9334 9335 do_rest_ofb: 9336 if ((rsm == NULL) || 9337 (SEQ_LT(end, rsm->r_start)) || 9338 (SEQ_GEQ(start, rsm->r_end)) || 9339 (SEQ_LT(start, rsm->r_start))) { 9340 /* 9341 * We are not in the right spot, 9342 * find the correct spot in the tree. 9343 */ 9344 used_ref = 0; 9345 rsm = tqhash_find(rack->r_ctl.tqh, start); 9346 } 9347 if (rsm == NULL) { 9348 /* TSNH */ 9349 goto out; 9350 } 9351 /* Ok we have an ACK for some piece of this rsm */ 9352 if (rsm->r_start != start) { 9353 if ((rsm->r_flags & RACK_ACKED) == 0) { 9354 /* 9355 * Before any splitting or hookery is 9356 * done is it a TLP of interest i.e. rxt? 9357 */ 9358 if ((rsm->r_flags & RACK_TLP) && 9359 (rsm->r_rtr_cnt > 1)) { 9360 /* 9361 * We are splitting a rxt TLP, check 9362 * if we need to save off the start/end 9363 */ 9364 if (rack->rc_last_tlp_acked_set && 9365 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 9366 /* 9367 * We already turned this on since we are inside 9368 * the previous one was a partially sack now we 9369 * are getting another one (maybe all of it). 9370 * 9371 */ 9372 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 9373 /* 9374 * Lets make sure we have all of it though. 9375 */ 9376 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 9377 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 9378 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 9379 rack->r_ctl.last_tlp_acked_end); 9380 } 9381 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 9382 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 9383 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 9384 rack->r_ctl.last_tlp_acked_end); 9385 } 9386 } else { 9387 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 9388 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 9389 rack->rc_last_tlp_past_cumack = 0; 9390 rack->rc_last_tlp_acked_set = 1; 9391 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 9392 } 9393 } 9394 /** 9395 * Need to split this in two pieces the before and after, 9396 * the before remains in the map, the after must be 9397 * added. 
In other words we have: 9398 * rsm |--------------| 9399 * sackblk |-------> 9400 * rsm will become 9401 * rsm |---| 9402 * and nrsm will be the sacked piece 9403 * nrsm |----------| 9404 * 9405 * But before we start down that path lets 9406 * see if the sack spans over on top of 9407 * the next guy and it is already sacked. 9408 * 9409 */ 9410 /* 9411 * Hookery can only be used if the two entries 9412 * are in the same bucket and neither one of 9413 * them staddle the bucket line. 9414 */ 9415 next = tqhash_next(rack->r_ctl.tqh, rsm); 9416 if (next && 9417 (rsm->bindex == next->bindex) && 9418 ((rsm->r_flags & RACK_STRADDLE) == 0) && 9419 ((next->r_flags & RACK_STRADDLE) == 0) && 9420 ((rsm->r_flags & RACK_IS_PCM) == 0) && 9421 ((next->r_flags & RACK_IS_PCM) == 0) && 9422 (rsm->r_flags & RACK_IN_GP_WIN) && 9423 (next->r_flags & RACK_IN_GP_WIN)) 9424 can_use_hookery = 1; 9425 else 9426 can_use_hookery = 0; 9427 if (next && can_use_hookery && 9428 (next->r_flags & RACK_ACKED) && 9429 SEQ_GEQ(end, next->r_start)) { 9430 /** 9431 * So the next one is already acked, and 9432 * we can thus by hookery use our stack_map 9433 * to reflect the piece being sacked and 9434 * then adjust the two tree entries moving 9435 * the start and ends around. So we start like: 9436 * rsm |------------| (not-acked) 9437 * next |-----------| (acked) 9438 * sackblk |--------> 9439 * We want to end like so: 9440 * rsm |------| (not-acked) 9441 * next |-----------------| (acked) 9442 * nrsm |-----| 9443 * Where nrsm is a temporary stack piece we 9444 * use to update all the gizmos. 9445 */ 9446 /* Copy up our fudge block */ 9447 nrsm = &stack_map; 9448 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 9449 /* Now adjust our tree blocks */ 9450 tqhash_update_end(rack->r_ctl.tqh, rsm, start); 9451 next->r_start = start; 9452 rsm->r_flags |= RACK_SHUFFLED; 9453 next->r_flags |= RACK_SHUFFLED; 9454 /* Now we must adjust back where next->m is */ 9455 rack_setup_offset_for_rsm(rack, rsm, next); 9456 /* 9457 * Which timestamp do we keep? It is rather 9458 * important in GP measurements to have the 9459 * accurate end of the send window. 9460 * 9461 * We keep the largest value, which is the newest 9462 * send. We do this in case a segment that is 9463 * joined together and not part of a GP estimate 9464 * later gets expanded into the GP estimate. 9465 * 9466 * We prohibit the merging of unlike kinds i.e. 9467 * all pieces that are in the GP estimate can be 9468 * merged and all pieces that are not in a GP estimate 9469 * can be merged, but not disimilar pieces. Combine 9470 * this with taking the highest here and we should 9471 * be ok unless of course the client reneges. Then 9472 * all bets are off. 9473 */ 9474 if (next->r_tim_lastsent[(next->r_rtr_cnt-1)] < 9475 nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)]) 9476 next->r_tim_lastsent[(next->r_rtr_cnt-1)] = nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)]; 9477 /* 9478 * And we must keep the newest ack arrival time. 
9479 */ 9480 if (next->r_ack_arrival < 9481 rack_to_usec_ts(&rack->r_ctl.act_rcv_time)) 9482 next->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 9483 9484 9485 /* We don't need to adjust rsm, it did not change */ 9486 /* Clear out the dup ack count of the remainder */ 9487 rsm->r_dupack = 0; 9488 rsm->r_just_ret = 0; 9489 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 9490 /* Now lets make sure our fudge block is right */ 9491 nrsm->r_start = start; 9492 /* Now lets update all the stats and such */ 9493 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); 9494 if (rack->app_limited_needs_set) 9495 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); 9496 changed += (nrsm->r_end - nrsm->r_start); 9497 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 9498 if (rsm->r_flags & RACK_WAS_LOST) { 9499 int my_chg; 9500 9501 my_chg = (nrsm->r_end - nrsm->r_start); 9502 KASSERT((rack->r_ctl.rc_considered_lost >= my_chg), 9503 ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); 9504 if (my_chg <= rack->r_ctl.rc_considered_lost) 9505 rack->r_ctl.rc_considered_lost -= my_chg; 9506 else 9507 rack->r_ctl.rc_considered_lost = 0; 9508 } 9509 if (nrsm->r_flags & RACK_SACK_PASSED) { 9510 rack->r_ctl.rc_reorder_ts = cts; 9511 if (rack->r_ctl.rc_reorder_ts == 0) 9512 rack->r_ctl.rc_reorder_ts = 1; 9513 } 9514 /* 9515 * Now we want to go up from rsm (the 9516 * one left un-acked) to the next one 9517 * in the tmap. We do this so when 9518 * we walk backwards we include marking 9519 * sack-passed on rsm (The one passed in 9520 * is skipped since it is generally called 9521 * on something sacked before removing it 9522 * from the tmap). 9523 */ 9524 if (rsm->r_in_tmap) { 9525 nrsm = TAILQ_NEXT(rsm, r_tnext); 9526 /* 9527 * Now that we have the next 9528 * one walk backwards from there. 9529 */ 9530 if (nrsm && nrsm->r_in_tmap) 9531 rack_log_sack_passed(tp, rack, nrsm, cts); 9532 } 9533 /* Now are we done? */ 9534 if (SEQ_LT(end, next->r_end) || 9535 (end == next->r_end)) { 9536 /* Done with block */ 9537 goto out; 9538 } 9539 rack_log_map_chg(tp, rack, &stack_map, rsm, next, MAP_SACK_M1, end, __LINE__); 9540 counter_u64_add(rack_sack_used_next_merge, 1); 9541 /* Postion for the next block */ 9542 start = next->r_end; 9543 rsm = tqhash_next(rack->r_ctl.tqh, next); 9544 if (rsm == NULL) 9545 goto out; 9546 } else { 9547 /** 9548 * We can't use any hookery here, so we 9549 * need to split the map. We enter like 9550 * so: 9551 * rsm |--------| 9552 * sackblk |-----> 9553 * We will add the new block nrsm and 9554 * that will be the new portion, and then 9555 * fall through after reseting rsm. So we 9556 * split and look like this: 9557 * rsm |----| 9558 * sackblk |-----> 9559 * nrsm |---| 9560 * We then fall through reseting 9561 * rsm to nrsm, so the next block 9562 * picks it up. 9563 */ 9564 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 9565 if (nrsm == NULL) { 9566 /* 9567 * failed XXXrrs what can we do but loose the sack 9568 * info? 
9569 */ 9570 goto out; 9571 } 9572 counter_u64_add(rack_sack_splits, 1); 9573 rack_clone_rsm(rack, nrsm, rsm, start); 9574 rsm->r_just_ret = 0; 9575 #ifndef INVARIANTS 9576 (void)tqhash_insert(rack->r_ctl.tqh, nrsm); 9577 #else 9578 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { 9579 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p", 9580 nrsm, insret, rack, rsm); 9581 } 9582 #endif 9583 if (rsm->r_in_tmap) { 9584 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 9585 nrsm->r_in_tmap = 1; 9586 } 9587 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M2, end, __LINE__); 9588 rsm->r_flags &= (~RACK_HAS_FIN); 9589 /* Position us to point to the new nrsm that starts the sack blk */ 9590 rsm = nrsm; 9591 } 9592 } else { 9593 /* Already sacked this piece */ 9594 counter_u64_add(rack_sack_skipped_acked, 1); 9595 if (end == rsm->r_end) { 9596 /* Done with block */ 9597 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 9598 goto out; 9599 } else if (SEQ_LT(end, rsm->r_end)) { 9600 /* A partial sack to a already sacked block */ 9601 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 9602 goto out; 9603 } else { 9604 /* 9605 * The end goes beyond this guy 9606 * reposition the start to the 9607 * next block. 9608 */ 9609 start = rsm->r_end; 9610 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 9611 if (rsm == NULL) 9612 goto out; 9613 } 9614 } 9615 } 9616 if (SEQ_GEQ(end, rsm->r_end)) { 9617 /** 9618 * The end of this block is either beyond this guy or right 9619 * at this guy. I.e.: 9620 * rsm --- |-----| 9621 * end |-----| 9622 * <or> 9623 * end |---------| 9624 */ 9625 if ((rsm->r_flags & RACK_ACKED) == 0) { 9626 /* 9627 * Is it a TLP of interest? 9628 */ 9629 if ((rsm->r_flags & RACK_TLP) && 9630 (rsm->r_rtr_cnt > 1)) { 9631 /* 9632 * We are splitting a rxt TLP, check 9633 * if we need to save off the start/end 9634 */ 9635 if (rack->rc_last_tlp_acked_set && 9636 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 9637 /* 9638 * We already turned this on since we are inside 9639 * the previous one was a partially sack now we 9640 * are getting another one (maybe all of it). 9641 */ 9642 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 9643 /* 9644 * Lets make sure we have all of it though. 
9645 */ 9646 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 9647 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 9648 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 9649 rack->r_ctl.last_tlp_acked_end); 9650 } 9651 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 9652 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 9653 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 9654 rack->r_ctl.last_tlp_acked_end); 9655 } 9656 } else { 9657 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 9658 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 9659 rack->rc_last_tlp_past_cumack = 0; 9660 rack->rc_last_tlp_acked_set = 1; 9661 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 9662 } 9663 } 9664 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); 9665 changed += (rsm->r_end - rsm->r_start); 9666 /* You get a count for acking a whole segment or more */ 9667 if (rsm->r_flags & RACK_WAS_LOST) { 9668 int my_chg; 9669 9670 my_chg = (rsm->r_end - rsm->r_start); 9671 rsm->r_flags &= ~RACK_WAS_LOST; 9672 KASSERT((rack->r_ctl.rc_considered_lost >= my_chg), 9673 ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); 9674 if (my_chg <= rack->r_ctl.rc_considered_lost) 9675 rack->r_ctl.rc_considered_lost -= my_chg; 9676 else 9677 rack->r_ctl.rc_considered_lost = 0; 9678 } 9679 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 9680 if (rsm->r_in_tmap) /* should be true */ 9681 rack_log_sack_passed(tp, rack, rsm, cts); 9682 /* Is Reordering occuring? */ 9683 if (rsm->r_flags & RACK_SACK_PASSED) { 9684 rsm->r_flags &= ~RACK_SACK_PASSED; 9685 rack->r_ctl.rc_reorder_ts = cts; 9686 if (rack->r_ctl.rc_reorder_ts == 0) 9687 rack->r_ctl.rc_reorder_ts = 1; 9688 } 9689 if (rack->app_limited_needs_set) 9690 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); 9691 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 9692 rsm->r_flags |= RACK_ACKED; 9693 rack_update_pcm_ack(rack, 0, rsm->r_start, rsm->r_end); 9694 if (rsm->r_in_tmap) { 9695 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 9696 rsm->r_in_tmap = 0; 9697 } 9698 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_SACK_M3, end, __LINE__); 9699 } else { 9700 counter_u64_add(rack_sack_skipped_acked, 1); 9701 } 9702 if (end == rsm->r_end) { 9703 /* This block only - done, setup for next */ 9704 goto out; 9705 } 9706 /* 9707 * There is more not coverend by this rsm move on 9708 * to the next block in the tail queue hash table. 9709 */ 9710 nrsm = tqhash_next(rack->r_ctl.tqh, rsm); 9711 start = rsm->r_end; 9712 rsm = nrsm; 9713 if (rsm == NULL) 9714 goto out; 9715 goto do_rest_ofb; 9716 } 9717 /** 9718 * The end of this sack block is smaller than 9719 * our rsm i.e.: 9720 * rsm --- |-----| 9721 * end |--| 9722 */ 9723 if ((rsm->r_flags & RACK_ACKED) == 0) { 9724 /* 9725 * Is it a TLP of interest? 9726 */ 9727 if ((rsm->r_flags & RACK_TLP) && 9728 (rsm->r_rtr_cnt > 1)) { 9729 /* 9730 * We are splitting a rxt TLP, check 9731 * if we need to save off the start/end 9732 */ 9733 if (rack->rc_last_tlp_acked_set && 9734 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 9735 /* 9736 * We already turned this on since we are inside 9737 * the previous one was a partially sack now we 9738 * are getting another one (maybe all of it). 9739 */ 9740 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 9741 /* 9742 * Lets make sure we have all of it though. 
9743 */ 9744 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 9745 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 9746 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 9747 rack->r_ctl.last_tlp_acked_end); 9748 } 9749 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 9750 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 9751 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 9752 rack->r_ctl.last_tlp_acked_end); 9753 } 9754 } else { 9755 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 9756 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 9757 rack->rc_last_tlp_past_cumack = 0; 9758 rack->rc_last_tlp_acked_set = 1; 9759 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 9760 } 9761 } 9762 /* 9763 * Hookery can only be used if the two entries 9764 * are in the same bucket and neither one of 9765 * them staddle the bucket line. 9766 */ 9767 prev = tqhash_prev(rack->r_ctl.tqh, rsm); 9768 if (prev && 9769 (rsm->bindex == prev->bindex) && 9770 ((rsm->r_flags & RACK_STRADDLE) == 0) && 9771 ((prev->r_flags & RACK_STRADDLE) == 0) && 9772 ((rsm->r_flags & RACK_IS_PCM) == 0) && 9773 ((prev->r_flags & RACK_IS_PCM) == 0) && 9774 (rsm->r_flags & RACK_IN_GP_WIN) && 9775 (prev->r_flags & RACK_IN_GP_WIN)) 9776 can_use_hookery = 1; 9777 else 9778 can_use_hookery = 0; 9779 if (prev && can_use_hookery && 9780 (prev->r_flags & RACK_ACKED)) { 9781 /** 9782 * Goal, we want the right remainder of rsm to shrink 9783 * in place and span from (rsm->r_start = end) to rsm->r_end. 9784 * We want to expand prev to go all the way 9785 * to prev->r_end <- end. 9786 * so in the tree we have before: 9787 * prev |--------| (acked) 9788 * rsm |-------| (non-acked) 9789 * sackblk |-| 9790 * We churn it so we end up with 9791 * prev |----------| (acked) 9792 * rsm |-----| (non-acked) 9793 * nrsm |-| (temporary) 9794 * 9795 * Note if either prev/rsm is a TLP we don't 9796 * do this. 9797 */ 9798 nrsm = &stack_map; 9799 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 9800 tqhash_update_end(rack->r_ctl.tqh, prev, end); 9801 rsm->r_start = end; 9802 rsm->r_flags |= RACK_SHUFFLED; 9803 prev->r_flags |= RACK_SHUFFLED; 9804 /* Now adjust nrsm (stack copy) to be 9805 * the one that is the small 9806 * piece that was "sacked". 9807 */ 9808 nrsm->r_end = end; 9809 rsm->r_dupack = 0; 9810 /* 9811 * Which timestamp do we keep? It is rather 9812 * important in GP measurements to have the 9813 * accurate end of the send window. 9814 * 9815 * We keep the largest value, which is the newest 9816 * send. We do this in case a segment that is 9817 * joined together and not part of a GP estimate 9818 * later gets expanded into the GP estimate. 9819 * 9820 * We prohibit the merging of unlike kinds i.e. 9821 * all pieces that are in the GP estimate can be 9822 * merged and all pieces that are not in a GP estimate 9823 * can be merged, but not disimilar pieces. Combine 9824 * this with taking the highest here and we should 9825 * be ok unless of course the client reneges. Then 9826 * all bets are off. 9827 */ 9828 if(prev->r_tim_lastsent[(prev->r_rtr_cnt-1)] < 9829 nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)]) { 9830 prev->r_tim_lastsent[(prev->r_rtr_cnt-1)] = nrsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 9831 } 9832 /* 9833 * And we must keep the newest ack arrival time. 
9834 */ 9835 9836 if(prev->r_ack_arrival < 9837 rack_to_usec_ts(&rack->r_ctl.act_rcv_time)) 9838 prev->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 9839 9840 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 9841 /* 9842 * Now that the rsm has had its start moved forward 9843 * lets go ahead and get its new place in the world. 9844 */ 9845 rack_setup_offset_for_rsm(rack, prev, rsm); 9846 /* 9847 * Now nrsm is our new little piece 9848 * that is acked (which was merged 9849 * to prev). Update the rtt and changed 9850 * based on that. Also check for reordering. 9851 */ 9852 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); 9853 if (rack->app_limited_needs_set) 9854 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); 9855 changed += (nrsm->r_end - nrsm->r_start); 9856 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 9857 if (rsm->r_flags & RACK_WAS_LOST) { 9858 int my_chg; 9859 9860 my_chg = (nrsm->r_end - nrsm->r_start); 9861 KASSERT((rack->r_ctl.rc_considered_lost >= my_chg), 9862 ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); 9863 if (my_chg <= rack->r_ctl.rc_considered_lost) 9864 rack->r_ctl.rc_considered_lost -= my_chg; 9865 else 9866 rack->r_ctl.rc_considered_lost = 0; 9867 } 9868 if (nrsm->r_flags & RACK_SACK_PASSED) { 9869 rack->r_ctl.rc_reorder_ts = cts; 9870 if (rack->r_ctl.rc_reorder_ts == 0) 9871 rack->r_ctl.rc_reorder_ts = 1; 9872 } 9873 rack_log_map_chg(tp, rack, prev, &stack_map, rsm, MAP_SACK_M4, end, __LINE__); 9874 rsm = prev; 9875 counter_u64_add(rack_sack_used_prev_merge, 1); 9876 } else { 9877 /** 9878 * This is the case where our previous 9879 * block is not acked either, so we must 9880 * split the block in two. 9881 */ 9882 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 9883 if (nrsm == NULL) { 9884 /* failed rrs what can we do but loose the sack info? */ 9885 goto out; 9886 } 9887 if ((rsm->r_flags & RACK_TLP) && 9888 (rsm->r_rtr_cnt > 1)) { 9889 /* 9890 * We are splitting a rxt TLP, check 9891 * if we need to save off the start/end 9892 */ 9893 if (rack->rc_last_tlp_acked_set && 9894 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 9895 /* 9896 * We already turned this on since this block is inside 9897 * the previous one was a partially sack now we 9898 * are getting another one (maybe all of it). 9899 */ 9900 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 9901 /* 9902 * Lets make sure we have all of it though. 9903 */ 9904 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 9905 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 9906 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 9907 rack->r_ctl.last_tlp_acked_end); 9908 } 9909 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 9910 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 9911 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 9912 rack->r_ctl.last_tlp_acked_end); 9913 } 9914 } else { 9915 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 9916 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 9917 rack->rc_last_tlp_acked_set = 1; 9918 rack->rc_last_tlp_past_cumack = 0; 9919 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 9920 } 9921 } 9922 /** 9923 * In this case nrsm becomes 9924 * nrsm->r_start = end; 9925 * nrsm->r_end = rsm->r_end; 9926 * which is un-acked. 9927 * <and> 9928 * rsm->r_end = nrsm->r_start; 9929 * i.e. the remaining un-acked 9930 * piece is left on the left 9931 * hand side. 
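 * For concreteness (illustrative sequence numbers): if rsm spans
 * [1000, 3000) and the sack block stops at end = 2000, then after
 * rack_clone_rsm() below rsm covers [1000, 2000) and is the piece
 * that gets marked acked, while nrsm covers [2000, 3000) and stays
 * un-acked.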
9932 * 9933 * So we start like this 9934 * rsm |----------| (not acked) 9935 * sackblk |---| 9936 * build it so we have 9937 * rsm |---| (acked) 9938 * nrsm |------| (not acked) 9939 */ 9940 counter_u64_add(rack_sack_splits, 1); 9941 rack_clone_rsm(rack, nrsm, rsm, end); 9942 rsm->r_flags &= (~RACK_HAS_FIN); 9943 rsm->r_just_ret = 0; 9944 #ifndef INVARIANTS 9945 (void)tqhash_insert(rack->r_ctl.tqh, nrsm); 9946 #else 9947 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { 9948 panic("Insert in tailq_hash of %p fails ret:% rack:%p rsm:%p", 9949 nrsm, insret, rack, rsm); 9950 } 9951 #endif 9952 if (rsm->r_in_tmap) { 9953 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 9954 nrsm->r_in_tmap = 1; 9955 } 9956 nrsm->r_dupack = 0; 9957 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); 9958 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); 9959 changed += (rsm->r_end - rsm->r_start); 9960 if (rsm->r_flags & RACK_WAS_LOST) { 9961 int my_chg; 9962 9963 my_chg = (rsm->r_end - rsm->r_start); 9964 rsm->r_flags &= ~RACK_WAS_LOST; 9965 KASSERT((rack->r_ctl.rc_considered_lost >= my_chg), 9966 ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); 9967 if (my_chg <= rack->r_ctl.rc_considered_lost) 9968 rack->r_ctl.rc_considered_lost -= my_chg; 9969 else 9970 rack->r_ctl.rc_considered_lost = 0; 9971 } 9972 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 9973 9974 if (rsm->r_in_tmap) /* should be true */ 9975 rack_log_sack_passed(tp, rack, rsm, cts); 9976 /* Is Reordering occuring? */ 9977 if (rsm->r_flags & RACK_SACK_PASSED) { 9978 rsm->r_flags &= ~RACK_SACK_PASSED; 9979 rack->r_ctl.rc_reorder_ts = cts; 9980 if (rack->r_ctl.rc_reorder_ts == 0) 9981 rack->r_ctl.rc_reorder_ts = 1; 9982 } 9983 if (rack->app_limited_needs_set) 9984 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); 9985 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 9986 rsm->r_flags |= RACK_ACKED; 9987 rack_update_pcm_ack(rack, 0, rsm->r_start, rsm->r_end); 9988 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M5, end, __LINE__); 9989 if (rsm->r_in_tmap) { 9990 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 9991 rsm->r_in_tmap = 0; 9992 } 9993 } 9994 } else if (start != end){ 9995 /* 9996 * The block was already acked. 9997 */ 9998 counter_u64_add(rack_sack_skipped_acked, 1); 9999 } 10000 out: 10001 if (rsm && 10002 ((rsm->r_flags & RACK_TLP) == 0) && 10003 (rsm->r_flags & RACK_ACKED)) { 10004 /* 10005 * Now can we merge where we worked 10006 * with either the previous or 10007 * next block? 10008 */ 10009 next = tqhash_next(rack->r_ctl.tqh, rsm); 10010 while (next) { 10011 if (next->r_flags & RACK_TLP) 10012 break; 10013 /* Only allow merges between ones in or out of GP window */ 10014 if ((next->r_flags & RACK_IN_GP_WIN) && 10015 ((rsm->r_flags & RACK_IN_GP_WIN) == 0)) { 10016 break; 10017 } 10018 if ((rsm->r_flags & RACK_IN_GP_WIN) && 10019 ((next->r_flags & RACK_IN_GP_WIN) == 0)) { 10020 break; 10021 } 10022 if (rsm->bindex != next->bindex) 10023 break; 10024 if (rsm->r_flags & RACK_STRADDLE) 10025 break; 10026 if (rsm->r_flags & RACK_IS_PCM) 10027 break; 10028 if (next->r_flags & RACK_STRADDLE) 10029 break; 10030 if (next->r_flags & RACK_IS_PCM) 10031 break; 10032 if (next->r_flags & RACK_ACKED) { 10033 /* yep this and next can be merged */ 10034 rsm = rack_merge_rsm(rack, rsm, next); 10035 next = tqhash_next(rack->r_ctl.tqh, rsm); 10036 } else 10037 break; 10038 } 10039 /* Now what about the previous? 
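 * The same eligibility test used for the forward merge above applies
 * in this direction; as an illustrative condensation (not code in
 * the stack) the pair is mergeable only when:
 *   (a->bindex == b->bindex) &&
 *   !((a->r_flags | b->r_flags) &
 *     (RACK_TLP | RACK_STRADDLE | RACK_IS_PCM)) &&
 *   (((a->r_flags ^ b->r_flags) & RACK_IN_GP_WIN) == 0) &&
 *   both entries are RACK_ACKED.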
*/ 10040 prev = tqhash_prev(rack->r_ctl.tqh, rsm); 10041 while (prev) { 10042 if (prev->r_flags & RACK_TLP) 10043 break; 10044 /* Only allow merges between ones in or out of GP window */ 10045 if ((prev->r_flags & RACK_IN_GP_WIN) && 10046 ((rsm->r_flags & RACK_IN_GP_WIN) == 0)) { 10047 break; 10048 } 10049 if ((rsm->r_flags & RACK_IN_GP_WIN) && 10050 ((prev->r_flags & RACK_IN_GP_WIN) == 0)) { 10051 break; 10052 } 10053 if (rsm->bindex != prev->bindex) 10054 break; 10055 if (rsm->r_flags & RACK_STRADDLE) 10056 break; 10057 if (rsm->r_flags & RACK_IS_PCM) 10058 break; 10059 if (prev->r_flags & RACK_STRADDLE) 10060 break; 10061 if (prev->r_flags & RACK_IS_PCM) 10062 break; 10063 if (prev->r_flags & RACK_ACKED) { 10064 /* yep the previous and this can be merged */ 10065 rsm = rack_merge_rsm(rack, prev, rsm); 10066 prev = tqhash_prev(rack->r_ctl.tqh, rsm); 10067 } else 10068 break; 10069 } 10070 } 10071 if (used_ref == 0) { 10072 counter_u64_add(rack_sack_proc_all, 1); 10073 } else { 10074 counter_u64_add(rack_sack_proc_short, 1); 10075 } 10076 /* Save off the next one for quick reference. */ 10077 nrsm = tqhash_find(rack->r_ctl.tqh, end); 10078 *prsm = rack->r_ctl.rc_sacklast = nrsm; 10079 return (changed); 10080 } 10081 10082 static void inline 10083 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack) 10084 { 10085 struct rack_sendmap *tmap; 10086 10087 tmap = NULL; 10088 while (rsm && (rsm->r_flags & RACK_ACKED)) { 10089 /* Its no longer sacked, mark it so */ 10090 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 10091 #ifdef INVARIANTS 10092 if (rsm->r_in_tmap) { 10093 panic("rack:%p rsm:%p flags:0x%x in tmap?", 10094 rack, rsm, rsm->r_flags); 10095 } 10096 #endif 10097 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS); 10098 /* Rebuild it into our tmap */ 10099 if (tmap == NULL) { 10100 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 10101 tmap = rsm; 10102 } else { 10103 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext); 10104 tmap = rsm; 10105 } 10106 tmap->r_in_tmap = 1; 10107 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 10108 } 10109 /* 10110 * Now lets possibly clear the sack filter so we start 10111 * recognizing sacks that cover this area. 10112 */ 10113 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack); 10114 10115 } 10116 10117 10118 static void inline 10119 rack_rsm_sender_update(struct tcp_rack *rack, struct tcpcb *tp, struct rack_sendmap *rsm, uint8_t from) 10120 { 10121 /* 10122 * We look at advancing the end send time for our GP 10123 * measurement tracking only as the cumulative acknowledgment 10124 * moves forward. You might wonder about this, why not 10125 * at every transmission or retransmission within the 10126 * GP window update the rc_gp_cumack_ts? Well its rather 10127 * nuanced but basically the GP window *may* expand (as 10128 * it does below) or worse and harder to track it may shrink. 10129 * 10130 * This last makes it impossible to track at the time of 10131 * the send, since you may set forward your rc_gp_cumack_ts 10132 * when you send, because that send *is* in your currently 10133 * "guessed" window, but then it shrinks. Now which was 10134 * the send time of the last bytes in the window, by the 10135 * time you ask that question that part of the sendmap 10136 * is freed. So you don't know and you will have too 10137 * long of send window. 
Instead by updating the time 10138 * marker only when the cumack advances this assures us 10139 * that we will have only the sends in the window of our 10140 * GP measurement. 10141 * 10142 * Another complication from this is the 10143 * merging of sendmap entries. During SACK processing this 10144 * can happen to conserve the sendmap size. That breaks 10145 * everything down in tracking the send window of the GP 10146 * estimate. So to prevent that and keep it working with 10147 * a tiny bit more limited merging, we only allow like 10148 * types to be merged. I.e. if two sends are in the GP window 10149 * then its ok to merge them together. If two sends are not 10150 * in the GP window its ok to merge them together too. Though 10151 * one send in and one send out cannot be merged. We combine 10152 * this with never allowing the shrinking of the GP window when 10153 * we are in recovery so that we can properly calculate the 10154 * sending times. 10155 * 10156 * This all of course seems complicated, because it is.. :) 10157 * 10158 * The cum-ack is being advanced upon the sendmap. 10159 * If we are not doing a GP estimate don't 10160 * proceed. 10161 */ 10162 uint64_t ts; 10163 10164 if ((tp->t_flags & TF_GPUTINPROG) == 0) 10165 return; 10166 /* 10167 * If this sendmap entry is going 10168 * beyond the measurement window we had picked, 10169 * expand the measurement window by that much. 10170 */ 10171 if (SEQ_GT(rsm->r_end, tp->gput_ack)) { 10172 tp->gput_ack = rsm->r_end; 10173 } 10174 /* 10175 * If we have not setup a ack, then we 10176 * have no idea if the newly acked pieces 10177 * will be "in our seq measurement range". If 10178 * it is when we clear the app_limited_needs_set 10179 * flag the timestamp will be updated. 10180 */ 10181 if (rack->app_limited_needs_set) 10182 return; 10183 /* 10184 * Finally, we grab out the latest timestamp 10185 * that this packet was sent and then see 10186 * if: 10187 * a) The packet touches are newly defined GP range. 10188 * b) The time is greater than (newer) than the 10189 * one we currently have. If so we update 10190 * our sending end time window. 10191 * 10192 * Note we *do not* do this at send time. The reason 10193 * is that if you do you *may* pick up a newer timestamp 10194 * for a range you are not going to measure. We project 10195 * out how far and then sometimes modify that to be 10196 * smaller. If that occurs then you will have a send 10197 * that does not belong to the range included. 10198 */ 10199 if ((ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]) <= 10200 rack->r_ctl.rc_gp_cumack_ts) 10201 return; 10202 if (rack_in_gp_window(tp, rsm)) { 10203 rack->r_ctl.rc_gp_cumack_ts = ts; 10204 rack_log_gpset(rack, tp->gput_ack, (uint32_t)ts, rsm->r_end, 10205 __LINE__, from, rsm); 10206 } 10207 } 10208 10209 static void 10210 rack_process_to_cumack(struct tcpcb *tp, struct tcp_rack *rack, register uint32_t th_ack, uint32_t cts, struct tcpopt *to, uint64_t acktime) 10211 { 10212 struct rack_sendmap *rsm; 10213 /* 10214 * The ACK point is advancing to th_ack, we must drop off 10215 * the packets in the rack log and calculate any eligble 10216 * RTT's. 10217 */ 10218 10219 if (sack_filter_blks_used(&rack->r_ctl.rack_sf)) { 10220 /* 10221 * If we have some sack blocks in the filter 10222 * lets prune them out by calling sfb with no blocks. 
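 * Note the call below passes a NULL block list with a count of zero;
 * no new blocks are offered, the call exists purely so the filter
 * can drop any held blocks that th_ack has now advanced past.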
10223 */ 10224 sack_filter_blks(tp, &rack->r_ctl.rack_sf, NULL, 0, th_ack); 10225 } 10226 if (SEQ_GT(th_ack, tp->snd_una)) { 10227 /* Clear any app ack remembered settings */ 10228 rack->r_ctl.cleared_app_ack = 0; 10229 } 10230 rack->r_wanted_output = 1; 10231 if (SEQ_GT(th_ack, tp->snd_una)) 10232 rack->r_ctl.last_cumack_advance = acktime; 10233 10234 /* Tend any TLP that has been marked for 1/2 the seq space (its old) */ 10235 if ((rack->rc_last_tlp_acked_set == 1)&& 10236 (rack->rc_last_tlp_past_cumack == 1) && 10237 (SEQ_GT(rack->r_ctl.last_tlp_acked_start, th_ack))) { 10238 /* 10239 * We have reached the point where our last rack 10240 * tlp retransmit sequence is ahead of the cum-ack. 10241 * This can only happen when the cum-ack moves all 10242 * the way around (its been a full 2^^31+1 bytes 10243 * or more since we sent a retransmitted TLP). Lets 10244 * turn off the valid flag since its not really valid. 10245 * 10246 * Note since sack's also turn on this event we have 10247 * a complication, we have to wait to age it out until 10248 * the cum-ack is by the TLP before checking which is 10249 * what the next else clause does. 10250 */ 10251 rack_log_dsack_event(rack, 9, __LINE__, 10252 rack->r_ctl.last_tlp_acked_start, 10253 rack->r_ctl.last_tlp_acked_end); 10254 rack->rc_last_tlp_acked_set = 0; 10255 rack->rc_last_tlp_past_cumack = 0; 10256 } else if ((rack->rc_last_tlp_acked_set == 1) && 10257 (rack->rc_last_tlp_past_cumack == 0) && 10258 (SEQ_GEQ(th_ack, rack->r_ctl.last_tlp_acked_end))) { 10259 /* 10260 * It is safe to start aging TLP's out. 10261 */ 10262 rack->rc_last_tlp_past_cumack = 1; 10263 } 10264 /* We do the same for the tlp send seq as well */ 10265 if ((rack->rc_last_sent_tlp_seq_valid == 1) && 10266 (rack->rc_last_sent_tlp_past_cumack == 1) && 10267 (SEQ_GT(rack->r_ctl.last_sent_tlp_seq, th_ack))) { 10268 rack_log_dsack_event(rack, 9, __LINE__, 10269 rack->r_ctl.last_sent_tlp_seq, 10270 (rack->r_ctl.last_sent_tlp_seq + 10271 rack->r_ctl.last_sent_tlp_len)); 10272 rack->rc_last_sent_tlp_seq_valid = 0; 10273 rack->rc_last_sent_tlp_past_cumack = 0; 10274 } else if ((rack->rc_last_sent_tlp_seq_valid == 1) && 10275 (rack->rc_last_sent_tlp_past_cumack == 0) && 10276 (SEQ_GEQ(th_ack, rack->r_ctl.last_sent_tlp_seq))) { 10277 /* 10278 * It is safe to start aging TLP's send. 10279 */ 10280 rack->rc_last_sent_tlp_past_cumack = 1; 10281 } 10282 more: 10283 rsm = tqhash_min(rack->r_ctl.tqh); 10284 if (rsm == NULL) { 10285 if ((th_ack - 1) == tp->iss) { 10286 /* 10287 * For the SYN incoming case we will not 10288 * have called tcp_output for the sending of 10289 * the SYN, so there will be no map. All 10290 * other cases should probably be a panic. 10291 */ 10292 return; 10293 } 10294 if (tp->t_flags & TF_SENTFIN) { 10295 /* if we sent a FIN we often will not have map */ 10296 return; 10297 } 10298 #ifdef INVARIANTS 10299 panic("No rack map tp:%p for state:%d ack:%u rack:%p snd_una:%u snd_max:%u\n", 10300 tp, 10301 tp->t_state, th_ack, rack, 10302 tp->snd_una, tp->snd_max); 10303 #endif 10304 return; 10305 } 10306 if (SEQ_LT(th_ack, rsm->r_start)) { 10307 /* Huh map is missing this */ 10308 #ifdef INVARIANTS 10309 printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n", 10310 rsm->r_start, 10311 th_ack, tp->t_state, rack->r_state); 10312 #endif 10313 return; 10314 } 10315 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED, th_ack); 10316 10317 /* Now was it a retransmitted TLP? 
*/ 10318 if ((rsm->r_flags & RACK_TLP) && 10319 (rsm->r_rtr_cnt > 1)) { 10320 /* 10321 * Yes, this rsm was a TLP and retransmitted, remember that 10322 * since if a DSACK comes back on this we don't want 10323 * to think of it as a reordered segment. This may 10324 * get updated again with possibly even other TLPs 10325 * in flight, but thats ok. Only when we don't send 10326 * a retransmitted TLP for 1/2 the sequences space 10327 * will it get turned off (above). 10328 */ 10329 if (rack->rc_last_tlp_acked_set && 10330 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 10331 /* 10332 * We already turned this on since the end matches, 10333 * the previous one was a partially ack now we 10334 * are getting another one (maybe all of it). 10335 */ 10336 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 10337 /* 10338 * Lets make sure we have all of it though. 10339 */ 10340 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 10341 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 10342 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 10343 rack->r_ctl.last_tlp_acked_end); 10344 } 10345 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 10346 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 10347 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 10348 rack->r_ctl.last_tlp_acked_end); 10349 } 10350 } else { 10351 rack->rc_last_tlp_past_cumack = 1; 10352 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 10353 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 10354 rack->rc_last_tlp_acked_set = 1; 10355 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 10356 } 10357 } 10358 /* Now do we consume the whole thing? */ 10359 rack->r_ctl.last_tmit_time_acked = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 10360 if (SEQ_GEQ(th_ack, rsm->r_end)) { 10361 /* Its all consumed. */ 10362 uint32_t left; 10363 uint8_t newly_acked; 10364 10365 if (rsm->r_flags & RACK_WAS_LOST) { 10366 /* 10367 * This can happen when we marked it as lost 10368 * and yet before retransmitting we get an ack 10369 * which can happen due to reordering. 10370 */ 10371 rsm->r_flags &= ~RACK_WAS_LOST; 10372 KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)), 10373 ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); 10374 if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)) 10375 rack->r_ctl.rc_considered_lost -= rsm->r_end - rsm->r_start; 10376 else 10377 rack->r_ctl.rc_considered_lost = 0; 10378 } 10379 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_FREE, rsm->r_end, __LINE__); 10380 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 10381 rsm->r_rtr_bytes = 0; 10382 /* 10383 * Record the time of highest cumack sent if its in our measurement 10384 * window and possibly bump out the end. 10385 */ 10386 rack_rsm_sender_update(rack, tp, rsm, 4); 10387 tqhash_remove(rack->r_ctl.tqh, rsm, REMOVE_TYPE_CUMACK); 10388 if (rsm->r_in_tmap) { 10389 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 10390 rsm->r_in_tmap = 0; 10391 } 10392 newly_acked = 1; 10393 if (rsm->r_flags & RACK_ACKED) { 10394 /* 10395 * It was acked on the scoreboard -- remove 10396 * it from total 10397 */ 10398 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 10399 newly_acked = 0; 10400 } else if (rsm->r_flags & RACK_SACK_PASSED) { 10401 /* 10402 * There are segments ACKED on the 10403 * scoreboard further up. We are seeing 10404 * reordering. 
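 * In that case the code below stamps rc_reorder_ts with the current
 * time and, if we have not transmitted anything since entering
 * recovery (r_ent_rec_ns), flags r_might_revert so the congestion
 * response can later be undone by rack_handle_might_revert().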
10405 */ 10406 rsm->r_flags &= ~RACK_SACK_PASSED; 10407 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 10408 rsm->r_flags |= RACK_ACKED; 10409 rack->r_ctl.rc_reorder_ts = cts; 10410 if (rack->r_ctl.rc_reorder_ts == 0) 10411 rack->r_ctl.rc_reorder_ts = 1; 10412 if (rack->r_ent_rec_ns) { 10413 /* 10414 * We have sent no more, and we saw an sack 10415 * then ack arrive. 10416 */ 10417 rack->r_might_revert = 1; 10418 } 10419 rack_update_pcm_ack(rack, 1, rsm->r_start, rsm->r_end); 10420 } else { 10421 rack_update_pcm_ack(rack, 1, rsm->r_start, rsm->r_end); 10422 } 10423 if ((rsm->r_flags & RACK_TO_REXT) && 10424 (tp->t_flags & TF_RCVD_TSTMP) && 10425 (to->to_flags & TOF_TS) && 10426 (to->to_tsecr != 0) && 10427 (tp->t_flags & TF_PREVVALID)) { 10428 /* 10429 * We can use the timestamp to see 10430 * if this retransmission was from the 10431 * first transmit. If so we made a mistake. 10432 */ 10433 tp->t_flags &= ~TF_PREVVALID; 10434 if (to->to_tsecr == rack_ts_to_msec(rsm->r_tim_lastsent[0])) { 10435 /* The first transmit is what this ack is for */ 10436 rack_cong_signal(tp, CC_RTO_ERR, th_ack, __LINE__); 10437 } 10438 } 10439 left = th_ack - rsm->r_end; 10440 if (rack->app_limited_needs_set && newly_acked) 10441 rack_need_set_test(tp, rack, rsm, th_ack, __LINE__, RACK_USE_END_OR_THACK); 10442 /* Free back to zone */ 10443 rack_free(rack, rsm); 10444 if (left) { 10445 goto more; 10446 } 10447 /* Check for reneging */ 10448 rsm = tqhash_min(rack->r_ctl.tqh); 10449 if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) { 10450 /* 10451 * The peer has moved snd_una up to 10452 * the edge of this send, i.e. one 10453 * that it had previously acked. The only 10454 * way that can be true if the peer threw 10455 * away data (space issues) that it had 10456 * previously sacked (else it would have 10457 * given us snd_una up to (rsm->r_end). 10458 * We need to undo the acked markings here. 10459 * 10460 * Note we have to look to make sure th_ack is 10461 * our rsm->r_start in case we get an old ack 10462 * where th_ack is behind snd_una. 10463 */ 10464 rack_peer_reneges(rack, rsm, th_ack); 10465 } 10466 return; 10467 } 10468 if (rsm->r_flags & RACK_ACKED) { 10469 /* 10470 * It was acked on the scoreboard -- remove it from 10471 * total for the part being cum-acked. 10472 */ 10473 rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); 10474 } else { 10475 rack_update_pcm_ack(rack, 1, rsm->r_start, th_ack); 10476 } 10477 /* And what about the lost flag? */ 10478 if (rsm->r_flags & RACK_WAS_LOST) { 10479 /* 10480 * This can happen when we marked it as lost 10481 * and yet before retransmitting we get an ack 10482 * which can happen due to reordering. In this 10483 * case its only a partial ack of the send. 10484 */ 10485 KASSERT((rack->r_ctl.rc_considered_lost >= (th_ack - rsm->r_start)), 10486 ("rsm:%p rack:%p rc_considered_lost goes negative th_ack:%u", rsm, rack, th_ack)); 10487 if (rack->r_ctl.rc_considered_lost >= (th_ack - rsm->r_start)) 10488 rack->r_ctl.rc_considered_lost -= th_ack - rsm->r_start; 10489 else 10490 rack->r_ctl.rc_considered_lost = 0; 10491 } 10492 /* 10493 * Clear the dup ack count for 10494 * the piece that remains. 10495 */ 10496 rsm->r_dupack = 0; 10497 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 10498 if (rsm->r_rtr_bytes) { 10499 /* 10500 * It was retransmitted adjust the 10501 * sack holes for what was acked. 
10502 */ 10503 int ack_am; 10504 10505 ack_am = (th_ack - rsm->r_start); 10506 if (ack_am >= rsm->r_rtr_bytes) { 10507 rack->r_ctl.rc_holes_rxt -= ack_am; 10508 rsm->r_rtr_bytes -= ack_am; 10509 } 10510 } 10511 /* 10512 * Update where the piece starts and record 10513 * the time of send of highest cumack sent if 10514 * its in our GP range. 10515 */ 10516 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_TRIM_HEAD, th_ack, __LINE__); 10517 /* Now we need to move our offset forward too */ 10518 if (rsm->m && 10519 ((rsm->orig_m_len != rsm->m->m_len) || 10520 (M_TRAILINGROOM(rsm->m) != rsm->orig_t_space))) { 10521 /* Fix up the orig_m_len and possibly the mbuf offset */ 10522 rack_adjust_orig_mlen(rsm); 10523 } 10524 rsm->soff += (th_ack - rsm->r_start); 10525 rack_rsm_sender_update(rack, tp, rsm, 5); 10526 /* The trim will move th_ack into r_start for us */ 10527 tqhash_trim(rack->r_ctl.tqh, th_ack); 10528 /* Now do we need to move the mbuf fwd too? */ 10529 { 10530 struct mbuf *m; 10531 uint32_t soff; 10532 10533 m = rsm->m; 10534 soff = rsm->soff; 10535 if (m) { 10536 while (soff >= m->m_len) { 10537 soff -= m->m_len; 10538 KASSERT((m->m_next != NULL), 10539 (" rsm:%p off:%u soff:%u m:%p", 10540 rsm, rsm->soff, soff, m)); 10541 m = m->m_next; 10542 if (m == NULL) { 10543 /* 10544 * This is a fall-back that prevents a panic. In reality 10545 * we should be able to walk the mbuf's and find our place. 10546 * At this point snd_una has not been updated with the sbcut() yet 10547 * but tqhash_trim did update rsm->r_start so the offset calcuation 10548 * should work fine. This is undesirable since we will take cache 10549 * hits to access the socket buffer. And even more puzzling is that 10550 * it happens occasionally. It should not :( 10551 */ 10552 m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 10553 (rsm->r_start - tp->snd_una), 10554 &soff); 10555 break; 10556 } 10557 } 10558 /* 10559 * Now save in our updated values. 10560 */ 10561 rsm->m = m; 10562 rsm->soff = soff; 10563 rsm->orig_m_len = rsm->m->m_len; 10564 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 10565 } 10566 } 10567 if (rack->app_limited_needs_set && 10568 SEQ_GEQ(th_ack, tp->gput_seq)) 10569 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG); 10570 } 10571 10572 static void 10573 rack_handle_might_revert(struct tcpcb *tp, struct tcp_rack *rack) 10574 { 10575 struct rack_sendmap *rsm; 10576 int sack_pass_fnd = 0; 10577 10578 if (rack->r_might_revert) { 10579 /* 10580 * Ok we have reordering, have not sent anything, we 10581 * might want to revert the congestion state if nothing 10582 * further has SACK_PASSED on it. Lets check. 10583 * 10584 * We also get here when we have DSACKs come in for 10585 * all the data that we FR'd. Note that a rxt or tlp 10586 * timer clears this from happening. 10587 */ 10588 10589 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 10590 if (rsm->r_flags & RACK_SACK_PASSED) { 10591 sack_pass_fnd = 1; 10592 break; 10593 } 10594 } 10595 if (sack_pass_fnd == 0) { 10596 /* 10597 * We went into recovery 10598 * incorrectly due to reordering! 
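 * The revert below restores snd_ssthresh from rc_ssthresh_at_erec
 * (the value saved when recovery was entered), pulls snd_recover
 * back to snd_una and, if we are still IN_RECOVERY, exits recovery.
 * A further special case re-applies the ssthresh remembered from
 * before an RTO that fired while we were already in recovery.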
10599 */ 10600 int orig_cwnd; 10601 10602 rack->r_ent_rec_ns = 0; 10603 orig_cwnd = tp->snd_cwnd; 10604 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at_erec; 10605 tp->snd_recover = tp->snd_una; 10606 rack_log_to_prr(rack, 14, orig_cwnd, __LINE__); 10607 if (IN_RECOVERY(tp->t_flags)) { 10608 rack_exit_recovery(tp, rack, 3); 10609 if ((rack->rto_from_rec == 1) && (rack_ssthresh_rest_rto_rec != 0) ){ 10610 /* 10611 * We were in recovery, had an RTO 10612 * and then re-entered recovery (more sack's arrived) 10613 * and we have properly recorded the old ssthresh from 10614 * the first recovery. We want to be able to slow-start 10615 * back to this level. The ssthresh from the timeout 10616 * and then back into recovery will end up most likely 10617 * to be min(cwnd=1mss, 2mss). Which makes it basically 10618 * so we get no slow-start after our RTO. 10619 */ 10620 rack->rto_from_rec = 0; 10621 if (rack->r_ctl.rto_ssthresh > tp->snd_ssthresh) 10622 tp->snd_ssthresh = rack->r_ctl.rto_ssthresh; 10623 } 10624 } 10625 } 10626 rack->r_might_revert = 0; 10627 } 10628 } 10629 10630 10631 static int 10632 rack_note_dsack(struct tcp_rack *rack, tcp_seq start, tcp_seq end) 10633 { 10634 10635 uint32_t am, l_end; 10636 int was_tlp = 0; 10637 10638 if (SEQ_GT(end, start)) 10639 am = end - start; 10640 else 10641 am = 0; 10642 if ((rack->rc_last_tlp_acked_set ) && 10643 (SEQ_GEQ(start, rack->r_ctl.last_tlp_acked_start)) && 10644 (SEQ_LEQ(end, rack->r_ctl.last_tlp_acked_end))) { 10645 /* 10646 * The DSACK is because of a TLP which we don't 10647 * do anything with the reordering window over since 10648 * it was not reordering that caused the DSACK but 10649 * our previous retransmit TLP. 10650 */ 10651 rack_log_dsack_event(rack, 7, __LINE__, start, end); 10652 was_tlp = 1; 10653 goto skip_dsack_round; 10654 } 10655 if (rack->rc_last_sent_tlp_seq_valid) { 10656 l_end = rack->r_ctl.last_sent_tlp_seq + rack->r_ctl.last_sent_tlp_len; 10657 if (SEQ_GEQ(start, rack->r_ctl.last_sent_tlp_seq) && 10658 (SEQ_LEQ(end, l_end))) { 10659 /* 10660 * This dsack is from the last sent TLP, ignore it 10661 * for reordering purposes. 10662 */ 10663 rack_log_dsack_event(rack, 7, __LINE__, start, end); 10664 was_tlp = 1; 10665 goto skip_dsack_round; 10666 } 10667 } 10668 if (rack->rc_dsack_round_seen == 0) { 10669 rack->rc_dsack_round_seen = 1; 10670 rack->r_ctl.dsack_round_end = rack->rc_tp->snd_max; 10671 rack->r_ctl.num_dsack++; 10672 rack->r_ctl.dsack_persist = 16; /* 16 is from the standard */ 10673 rack_log_dsack_event(rack, 2, __LINE__, 0, 0); 10674 } 10675 skip_dsack_round: 10676 /* 10677 * We keep track of how many DSACK blocks we get 10678 * after a recovery incident. 10679 */ 10680 rack->r_ctl.dsack_byte_cnt += am; 10681 if (!IN_FASTRECOVERY(rack->rc_tp->t_flags) && 10682 rack->r_ctl.retran_during_recovery && 10683 (rack->r_ctl.dsack_byte_cnt >= rack->r_ctl.retran_during_recovery)) { 10684 /* 10685 * False recovery most likely culprit is reordering. If 10686 * nothing else is missing we need to revert. 
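 * A worked example with illustrative numbers: if recovery
 * retransmitted retran_during_recovery = 4380 bytes and arriving
 * DSACKs push dsack_byte_cnt to 4380 or beyond, every byte we
 * retransmitted was data the peer already held, so the recovery is
 * judged spurious and the revert below is attempted.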
10687 */ 10688 rack->r_might_revert = 1; 10689 rack_handle_might_revert(rack->rc_tp, rack); 10690 rack->r_might_revert = 0; 10691 rack->r_ctl.retran_during_recovery = 0; 10692 rack->r_ctl.dsack_byte_cnt = 0; 10693 } 10694 return (was_tlp); 10695 } 10696 10697 static uint32_t 10698 do_rack_compute_pipe(struct tcpcb *tp, struct tcp_rack *rack, uint32_t snd_una) 10699 { 10700 return (((tp->snd_max - snd_una) - 10701 (rack->r_ctl.rc_sacked + rack->r_ctl.rc_considered_lost)) + rack->r_ctl.rc_holes_rxt); 10702 } 10703 10704 static int32_t 10705 rack_compute_pipe(struct tcpcb *tp) 10706 { 10707 return ((int32_t)do_rack_compute_pipe(tp, 10708 (struct tcp_rack *)tp->t_fb_ptr, 10709 tp->snd_una)); 10710 } 10711 10712 static void 10713 rack_update_prr(struct tcpcb *tp, struct tcp_rack *rack, uint32_t changed, tcp_seq th_ack) 10714 { 10715 /* Deal with changed and PRR here (in recovery only) */ 10716 uint32_t pipe, snd_una; 10717 10718 rack->r_ctl.rc_prr_delivered += changed; 10719 10720 if (sbavail(&rack->rc_inp->inp_socket->so_snd) <= (tp->snd_max - tp->snd_una)) { 10721 /* 10722 * It is all outstanding, we are application limited 10723 * and thus we don't need more room to send anything. 10724 * Note we use tp->snd_una here and not th_ack because 10725 * the data as yet not been cut from the sb. 10726 */ 10727 rack->r_ctl.rc_prr_sndcnt = 0; 10728 return; 10729 } 10730 /* Compute prr_sndcnt */ 10731 if (SEQ_GT(tp->snd_una, th_ack)) { 10732 snd_una = tp->snd_una; 10733 } else { 10734 snd_una = th_ack; 10735 } 10736 pipe = do_rack_compute_pipe(tp, rack, snd_una); 10737 if (pipe > tp->snd_ssthresh) { 10738 long sndcnt; 10739 10740 sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh; 10741 if (rack->r_ctl.rc_prr_recovery_fs > 0) 10742 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; 10743 else { 10744 rack->r_ctl.rc_prr_sndcnt = 0; 10745 rack_log_to_prr(rack, 9, 0, __LINE__); 10746 sndcnt = 0; 10747 } 10748 sndcnt++; 10749 if (sndcnt > (long)rack->r_ctl.rc_prr_out) 10750 sndcnt -= rack->r_ctl.rc_prr_out; 10751 else 10752 sndcnt = 0; 10753 rack->r_ctl.rc_prr_sndcnt = sndcnt; 10754 rack_log_to_prr(rack, 10, 0, __LINE__); 10755 } else { 10756 uint32_t limit; 10757 10758 if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out) 10759 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out); 10760 else 10761 limit = 0; 10762 if (changed > limit) 10763 limit = changed; 10764 limit += ctf_fixed_maxseg(tp); 10765 if (tp->snd_ssthresh > pipe) { 10766 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); 10767 rack_log_to_prr(rack, 11, 0, __LINE__); 10768 } else { 10769 rack->r_ctl.rc_prr_sndcnt = min(0, limit); 10770 rack_log_to_prr(rack, 12, 0, __LINE__); 10771 } 10772 } 10773 } 10774 10775 static void 10776 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered_recovery, int dup_ack_struck, 10777 int *dsack_seen, int *sacks_seen) 10778 { 10779 uint32_t changed; 10780 struct tcp_rack *rack; 10781 struct rack_sendmap *rsm; 10782 struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; 10783 register uint32_t th_ack; 10784 int32_t i, j, k, num_sack_blks = 0; 10785 uint32_t cts, acked, ack_point; 10786 int loop_start = 0; 10787 uint32_t tsused; 10788 uint32_t segsiz; 10789 10790 10791 INP_WLOCK_ASSERT(tptoinpcb(tp)); 10792 if (tcp_get_flags(th) & TH_RST) { 10793 /* We don't log resets */ 10794 return; 10795 } 10796 rack = (struct tcp_rack *)tp->t_fb_ptr; 10797 cts = tcp_get_usecs(NULL); 10798 rsm = tqhash_min(rack->r_ctl.tqh); 10799 changed = 0; 10800 th_ack = 
th->th_ack; 10801 segsiz = ctf_fixed_maxseg(rack->rc_tp); 10802 if (BYTES_THIS_ACK(tp, th) >= segsiz) { 10803 /* 10804 * You only get credit for 10805 * MSS and greater (and you get extra 10806 * credit for larger cum-ack moves). 10807 */ 10808 int ac; 10809 10810 ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp); 10811 counter_u64_add(rack_ack_total, ac); 10812 } 10813 if (SEQ_GT(th_ack, tp->snd_una)) { 10814 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__); 10815 tp->t_acktime = ticks; 10816 } 10817 if (rsm && SEQ_GT(th_ack, rsm->r_start)) 10818 changed = th_ack - rsm->r_start; 10819 if (changed) { 10820 rack_process_to_cumack(tp, rack, th_ack, cts, to, 10821 tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time)); 10822 } 10823 if ((to->to_flags & TOF_SACK) == 0) { 10824 /* We are done nothing left and no sack. */ 10825 rack_handle_might_revert(tp, rack); 10826 /* 10827 * For cases where we struck a dup-ack 10828 * with no SACK, add to the changes so 10829 * PRR will work right. 10830 */ 10831 if (dup_ack_struck && (changed == 0)) { 10832 changed += ctf_fixed_maxseg(rack->rc_tp); 10833 } 10834 goto out; 10835 } 10836 /* Sack block processing */ 10837 if (SEQ_GT(th_ack, tp->snd_una)) 10838 ack_point = th_ack; 10839 else 10840 ack_point = tp->snd_una; 10841 for (i = 0; i < to->to_nsacks; i++) { 10842 bcopy((to->to_sacks + i * TCPOLEN_SACK), 10843 &sack, sizeof(sack)); 10844 sack.start = ntohl(sack.start); 10845 sack.end = ntohl(sack.end); 10846 if (SEQ_GT(sack.end, sack.start) && 10847 SEQ_GT(sack.start, ack_point) && 10848 SEQ_LT(sack.start, tp->snd_max) && 10849 SEQ_GT(sack.end, ack_point) && 10850 SEQ_LEQ(sack.end, tp->snd_max)) { 10851 sack_blocks[num_sack_blks] = sack; 10852 num_sack_blks++; 10853 } else if (SEQ_LEQ(sack.start, th_ack) && 10854 SEQ_LEQ(sack.end, th_ack)) { 10855 int was_tlp; 10856 10857 if (dsack_seen != NULL) 10858 *dsack_seen = 1; 10859 was_tlp = rack_note_dsack(rack, sack.start, sack.end); 10860 /* 10861 * Its a D-SACK block. 10862 */ 10863 tcp_record_dsack(tp, sack.start, sack.end, was_tlp); 10864 } 10865 } 10866 if (rack->rc_dsack_round_seen) { 10867 /* Is the dsack roound over? */ 10868 if (SEQ_GEQ(th_ack, rack->r_ctl.dsack_round_end)) { 10869 /* Yes it is */ 10870 rack->rc_dsack_round_seen = 0; 10871 rack_log_dsack_event(rack, 3, __LINE__, 0, 0); 10872 } 10873 } 10874 /* 10875 * Sort the SACK blocks so we can update the rack scoreboard with 10876 * just one pass. 10877 */ 10878 num_sack_blks = sack_filter_blks(tp, &rack->r_ctl.rack_sf, sack_blocks, 10879 num_sack_blks, th->th_ack); 10880 ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks); 10881 if (sacks_seen != NULL) 10882 *sacks_seen = num_sack_blks; 10883 if (num_sack_blks == 0) { 10884 /* Nothing to sack, but we need to update counts */ 10885 goto out_with_totals; 10886 } 10887 /* Its a sack of some sort */ 10888 if (num_sack_blks < 2) { 10889 /* Only one, we don't need to sort */ 10890 goto do_sack_work; 10891 } 10892 /* Sort the sacks */ 10893 for (i = 0; i < num_sack_blks; i++) { 10894 for (j = i + 1; j < num_sack_blks; j++) { 10895 if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { 10896 sack = sack_blocks[i]; 10897 sack_blocks[i] = sack_blocks[j]; 10898 sack_blocks[j] = sack; 10899 } 10900 } 10901 } 10902 /* 10903 * Now are any of the sack block ends the same (yes some 10904 * implementations send these)? 
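 * A worked example with illustrative values: after sorting, the
 * blocks [250, 300) and [100, 300) share the end 300; the loop
 * below keeps the wider start (100), collapses the duplicate out
 * of the array and drops num_sack_blks by one before restarting.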
10905 */ 10906 again: 10907 if (num_sack_blks == 0) 10908 goto out_with_totals; 10909 if (num_sack_blks > 1) { 10910 for (i = 0; i < num_sack_blks; i++) { 10911 for (j = i + 1; j < num_sack_blks; j++) { 10912 if (sack_blocks[i].end == sack_blocks[j].end) { 10913 /* 10914 * Ok these two have the same end we 10915 * want the smallest end and then 10916 * throw away the larger and start 10917 * again. 10918 */ 10919 if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) { 10920 /* 10921 * The second block covers 10922 * more area use that 10923 */ 10924 sack_blocks[i].start = sack_blocks[j].start; 10925 } 10926 /* 10927 * Now collapse out the dup-sack and 10928 * lower the count 10929 */ 10930 for (k = (j + 1); k < num_sack_blks; k++) { 10931 sack_blocks[j].start = sack_blocks[k].start; 10932 sack_blocks[j].end = sack_blocks[k].end; 10933 j++; 10934 } 10935 num_sack_blks--; 10936 goto again; 10937 } 10938 } 10939 } 10940 } 10941 do_sack_work: 10942 /* 10943 * First lets look to see if 10944 * we have retransmitted and 10945 * can use the transmit next? 10946 */ 10947 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 10948 if (rsm && 10949 SEQ_GT(sack_blocks[0].end, rsm->r_start) && 10950 SEQ_LT(sack_blocks[0].start, rsm->r_end)) { 10951 /* 10952 * We probably did the FR and the next 10953 * SACK in continues as we would expect. 10954 */ 10955 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, segsiz); 10956 if (acked) { 10957 rack->r_wanted_output = 1; 10958 changed += acked; 10959 } 10960 if (num_sack_blks == 1) { 10961 /* 10962 * This is what we would expect from 10963 * a normal implementation to happen 10964 * after we have retransmitted the FR, 10965 * i.e the sack-filter pushes down 10966 * to 1 block and the next to be retransmitted 10967 * is the sequence in the sack block (has more 10968 * are acked). Count this as ACK'd data to boost 10969 * up the chances of recovering any false positives. 10970 */ 10971 counter_u64_add(rack_ack_total, (acked / ctf_fixed_maxseg(rack->rc_tp))); 10972 counter_u64_add(rack_express_sack, 1); 10973 goto out_with_totals; 10974 } else { 10975 /* 10976 * Start the loop through the 10977 * rest of blocks, past the first block. 10978 */ 10979 loop_start = 1; 10980 } 10981 } 10982 counter_u64_add(rack_sack_total, 1); 10983 rsm = rack->r_ctl.rc_sacklast; 10984 for (i = loop_start; i < num_sack_blks; i++) { 10985 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, segsiz); 10986 if (acked) { 10987 rack->r_wanted_output = 1; 10988 changed += acked; 10989 } 10990 } 10991 out_with_totals: 10992 if (num_sack_blks > 1) { 10993 /* 10994 * You get an extra stroke if 10995 * you have more than one sack-blk, this 10996 * could be where we are skipping forward 10997 * and the sack-filter is still working, or 10998 * it could be an attacker constantly 10999 * moving us. 11000 */ 11001 counter_u64_add(rack_move_some, 1); 11002 } 11003 out: 11004 if (changed) { 11005 /* Something changed cancel the rack timer */ 11006 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 11007 } 11008 tsused = tcp_get_usecs(NULL); 11009 rsm = tcp_rack_output(tp, rack, tsused); 11010 if ((!IN_FASTRECOVERY(tp->t_flags)) && 11011 rsm && 11012 ((rsm->r_flags & RACK_MUST_RXT) == 0)) { 11013 /* Enter recovery */ 11014 entered_recovery = 1; 11015 rack_cong_signal(tp, CC_NDUPACK, th_ack, __LINE__); 11016 /* 11017 * When we enter recovery we need to assure we send 11018 * one packet. 
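 * Seeding rc_prr_sndcnt with one MSS below guarantees we can get
 * that packet out; afterwards rack_update_prr() governs the budget.
 * A worked example of that computation (illustrative numbers): with
 * ssthresh = 14600, rc_prr_recovery_fs = 29200, rc_prr_delivered =
 * 5840 and rc_prr_out = 1460, the pipe > ssthresh branch gives
 *   sndcnt = (5840 * 14600) / 29200 + 1 - 1460 = 1461 bytes,
 * i.e. roughly one more segment may be sent.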
11019 */ 11020 if (rack->rack_no_prr == 0) { 11021 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 11022 rack_log_to_prr(rack, 8, 0, __LINE__); 11023 } 11024 rack->r_timer_override = 1; 11025 rack->r_early = 0; 11026 rack->r_ctl.rc_agg_early = 0; 11027 } else if (IN_FASTRECOVERY(tp->t_flags) && 11028 rsm && 11029 (rack->r_rr_config == 3)) { 11030 /* 11031 * Assure we can output and we get no 11032 * remembered pace time except the retransmit. 11033 */ 11034 rack->r_timer_override = 1; 11035 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 11036 rack->r_ctl.rc_resend = rsm; 11037 } 11038 if (IN_FASTRECOVERY(tp->t_flags) && 11039 (rack->rack_no_prr == 0) && 11040 (entered_recovery == 0)) { 11041 rack_update_prr(tp, rack, changed, th_ack); 11042 if ((rsm && (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) && 11043 ((tcp_in_hpts(rack->rc_tp) == 0) && 11044 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)))) { 11045 /* 11046 * If you are pacing output you don't want 11047 * to override. 11048 */ 11049 rack->r_early = 0; 11050 rack->r_ctl.rc_agg_early = 0; 11051 rack->r_timer_override = 1; 11052 } 11053 } 11054 } 11055 11056 static void 11057 rack_strike_dupack(struct tcp_rack *rack, tcp_seq th_ack) 11058 { 11059 struct rack_sendmap *rsm; 11060 11061 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 11062 while (rsm) { 11063 /* 11064 * We need to skip anything already set 11065 * to be retransmitted. 11066 */ 11067 if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || 11068 (rsm->r_flags & RACK_MUST_RXT)) { 11069 rsm = TAILQ_NEXT(rsm, r_tnext); 11070 continue; 11071 } 11072 break; 11073 } 11074 if (rsm && (rsm->r_dupack < 0xff)) { 11075 rsm->r_dupack++; 11076 if (rsm->r_dupack >= DUP_ACK_THRESHOLD) { 11077 struct timeval tv; 11078 uint32_t cts; 11079 /* 11080 * Here we see if we need to retransmit. For 11081 * a SACK type connection if enough time has passed 11082 * we will get a return of the rsm. For a non-sack 11083 * connection we will get the rsm returned if the 11084 * dupack value is 3 or more. 11085 */ 11086 cts = tcp_get_usecs(&tv); 11087 rack->r_ctl.rc_resend = tcp_rack_output(rack->rc_tp, rack, cts); 11088 if (rack->r_ctl.rc_resend != NULL) { 11089 if (!IN_FASTRECOVERY(rack->rc_tp->t_flags)) { 11090 rack_cong_signal(rack->rc_tp, CC_NDUPACK, 11091 th_ack, __LINE__); 11092 } 11093 rack->r_wanted_output = 1; 11094 rack->r_timer_override = 1; 11095 rack_log_retran_reason(rack, rsm, __LINE__, 1, 3); 11096 } 11097 } else { 11098 rack_log_retran_reason(rack, rsm, __LINE__, 0, 3); 11099 } 11100 } 11101 } 11102 11103 static void 11104 rack_check_bottom_drag(struct tcpcb *tp, 11105 struct tcp_rack *rack, 11106 struct socket *so) 11107 { 11108 /* 11109 * So what is dragging bottom? 11110 * 11111 * Dragging bottom means you were under pacing and had a 11112 * delay in processing inbound acks waiting on our pacing 11113 * timer to expire. While you were waiting all of the acknowledgments 11114 * for the packets you sent have arrived. This means we are pacing 11115 * way underneath the bottleneck to the point where our Goodput 11116 * measurements stop working, since they require more than one 11117 * ack (usually at least 8 packets worth with multiple acks so we can 11118 * gauge the inter-ack times). If that occurs we have a real problem 11119 * since we are stuck in a hole that we can't get out of without 11120 * something speeding us up. 11121 * 11122 * We also check to see if we are widdling down to just one segment 11123 * outstanding. 
If this occurs and we have room to send in our cwnd/rwnd 11124 * then we are adding the delayed ack interval into our measurments and 11125 * we need to speed up slightly. 11126 */ 11127 uint32_t segsiz, minseg; 11128 11129 segsiz = ctf_fixed_maxseg(tp); 11130 minseg = segsiz; 11131 if (tp->snd_max == tp->snd_una) { 11132 /* 11133 * We are doing dynamic pacing and we are way 11134 * under. Basically everything got acked while 11135 * we were still waiting on the pacer to expire. 11136 * 11137 * This means we need to boost the b/w in 11138 * addition to any earlier boosting of 11139 * the multiplier. 11140 */ 11141 uint64_t lt_bw; 11142 11143 tcp_trace_point(rack->rc_tp, TCP_TP_PACED_BOTTOM); 11144 lt_bw = rack_get_lt_bw(rack); 11145 rack->rc_dragged_bottom = 1; 11146 rack_validate_multipliers_at_or_above100(rack); 11147 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_VALID) && 11148 (rack->dis_lt_bw == 0) && 11149 (rack->use_lesser_lt_bw == 0) && 11150 (lt_bw > 0)) { 11151 /* 11152 * Lets use the long-term b/w we have 11153 * been getting as a base. 11154 */ 11155 if (rack->rc_gp_filled == 0) { 11156 if (lt_bw > ONE_POINT_TWO_MEG) { 11157 /* 11158 * If we have no measurement 11159 * don't let us set in more than 11160 * 1.2Mbps. If we are still too 11161 * low after pacing with this we 11162 * will hopefully have a max b/w 11163 * available to sanity check things. 11164 */ 11165 lt_bw = ONE_POINT_TWO_MEG; 11166 } 11167 rack->r_ctl.rc_rtt_diff = 0; 11168 rack->r_ctl.gp_bw = lt_bw; 11169 rack->rc_gp_filled = 1; 11170 if (rack->r_ctl.num_measurements < RACK_REQ_AVG) 11171 rack->r_ctl.num_measurements = RACK_REQ_AVG; 11172 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 11173 } else if (lt_bw > rack->r_ctl.gp_bw) { 11174 rack->r_ctl.rc_rtt_diff = 0; 11175 if (rack->r_ctl.num_measurements < RACK_REQ_AVG) 11176 rack->r_ctl.num_measurements = RACK_REQ_AVG; 11177 rack->r_ctl.gp_bw = lt_bw; 11178 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 11179 } else 11180 rack_increase_bw_mul(rack, -1, 0, 0, 1); 11181 if ((rack->gp_ready == 0) && 11182 (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) { 11183 /* We have enough measurements now */ 11184 rack->gp_ready = 1; 11185 if (rack->dgp_on || 11186 rack->rack_hibeta) 11187 rack_set_cc_pacing(rack); 11188 if (rack->defer_options) 11189 rack_apply_deferred_options(rack); 11190 } 11191 } else { 11192 /* 11193 * zero rtt possibly?, settle for just an old increase. 11194 */ 11195 rack_increase_bw_mul(rack, -1, 0, 0, 1); 11196 } 11197 } else if ((IN_FASTRECOVERY(tp->t_flags) == 0) && 11198 (sbavail(&so->so_snd) > max((segsiz * (4 + rack_req_segs)), 11199 minseg)) && 11200 (rack->r_ctl.cwnd_to_use > max((segsiz * (rack_req_segs + 2)), minseg)) && 11201 (tp->snd_wnd > max((segsiz * (rack_req_segs + 2)), minseg)) && 11202 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) <= 11203 (segsiz * rack_req_segs))) { 11204 /* 11205 * We are doing dynamic GP pacing and 11206 * we have everything except 1MSS or less 11207 * bytes left out. We are still pacing away. 11208 * And there is data that could be sent, This 11209 * means we are inserting delayed ack time in 11210 * our measurements because we are pacing too slow. 
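 * In short: we are not in fast recovery, no more than rack_req_segs
 * segments remain in flight, and the socket buffer, cwnd and the
 * peer's window all still have room to send more, yet we sit idle
 * on the pacing timer. The adjustment below nudges the pacing
 * estimate upward so delayed-ack time stops being folded into the
 * goodput samples.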
11211 */ 11212 rack_validate_multipliers_at_or_above100(rack); 11213 rack->rc_dragged_bottom = 1; 11214 rack_increase_bw_mul(rack, -1, 0, 0, 1); 11215 } 11216 } 11217 11218 #ifdef TCP_REQUEST_TRK 11219 static void 11220 rack_log_hybrid(struct tcp_rack *rack, uint32_t seq, 11221 struct tcp_sendfile_track *cur, uint8_t mod, int line, int err) 11222 { 11223 int do_log; 11224 11225 do_log = tcp_bblogging_on(rack->rc_tp); 11226 if (do_log == 0) { 11227 if ((do_log = tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING) )== 0) 11228 return; 11229 /* We only allow the three below with point logging on */ 11230 if ((mod != HYBRID_LOG_RULES_APP) && 11231 (mod != HYBRID_LOG_RULES_SET) && 11232 (mod != HYBRID_LOG_REQ_COMP)) 11233 return; 11234 11235 } 11236 if (do_log) { 11237 union tcp_log_stackspecific log; 11238 struct timeval tv; 11239 11240 /* Convert our ms to a microsecond */ 11241 memset(&log, 0, sizeof(log)); 11242 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 11243 log.u_bbr.flex1 = seq; 11244 log.u_bbr.cwnd_gain = line; 11245 if (cur != NULL) { 11246 uint64_t off; 11247 11248 log.u_bbr.flex2 = cur->start_seq; 11249 log.u_bbr.flex3 = cur->end_seq; 11250 log.u_bbr.flex4 = (uint32_t)((cur->localtime >> 32) & 0x00000000ffffffff); 11251 log.u_bbr.flex5 = (uint32_t)(cur->localtime & 0x00000000ffffffff); 11252 log.u_bbr.flex6 = cur->flags; 11253 log.u_bbr.pkts_out = cur->hybrid_flags; 11254 log.u_bbr.rttProp = cur->timestamp; 11255 log.u_bbr.cur_del_rate = cur->cspr; 11256 log.u_bbr.bw_inuse = cur->start; 11257 log.u_bbr.applimited = (uint32_t)(cur->end & 0x00000000ffffffff); 11258 log.u_bbr.delivered = (uint32_t)((cur->end >> 32) & 0x00000000ffffffff) ; 11259 log.u_bbr.epoch = (uint32_t)(cur->deadline & 0x00000000ffffffff); 11260 log.u_bbr.lt_epoch = (uint32_t)((cur->deadline >> 32) & 0x00000000ffffffff) ; 11261 log.u_bbr.inhpts = 1; 11262 #ifdef TCP_REQUEST_TRK 11263 off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_tcpreq_info[0]); 11264 log.u_bbr.use_lt_bw = (uint8_t)(off / sizeof(struct tcp_sendfile_track)); 11265 #endif 11266 } else { 11267 log.u_bbr.flex2 = err; 11268 } 11269 /* 11270 * Fill in flex7 to be CHD (catchup|hybrid|DGP) 11271 */ 11272 log.u_bbr.flex7 = rack->rc_catch_up; 11273 log.u_bbr.flex7 <<= 1; 11274 log.u_bbr.flex7 |= rack->rc_hybrid_mode; 11275 log.u_bbr.flex7 <<= 1; 11276 log.u_bbr.flex7 |= rack->dgp_on; 11277 /* 11278 * Compose bbr_state to be a bit wise 0000ADHF 11279 * where A is the always_pace flag 11280 * where D is the dgp_on flag 11281 * where H is the hybrid_mode on flag 11282 * where F is the use_fixed_rate flag. 
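 * An illustrative encoding: always_pace = 1, dgp_on = 1,
 * hybrid_mode = 0 and use_fixed_rate = 0 yields
 * bbr_state = 0b1100 (0xc).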
11283 */ 11284 log.u_bbr.bbr_state = rack->rc_always_pace; 11285 log.u_bbr.bbr_state <<= 1; 11286 log.u_bbr.bbr_state |= rack->dgp_on; 11287 log.u_bbr.bbr_state <<= 1; 11288 log.u_bbr.bbr_state |= rack->rc_hybrid_mode; 11289 log.u_bbr.bbr_state <<= 1; 11290 log.u_bbr.bbr_state |= rack->use_fixed_rate; 11291 log.u_bbr.flex8 = mod; 11292 log.u_bbr.delRate = rack->r_ctl.bw_rate_cap; 11293 log.u_bbr.bbr_substate = rack->r_ctl.client_suggested_maxseg; 11294 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 11295 log.u_bbr.pkt_epoch = rack->rc_tp->tcp_hybrid_start; 11296 log.u_bbr.lost = rack->rc_tp->tcp_hybrid_error; 11297 log.u_bbr.pacing_gain = (uint16_t)rack->rc_tp->tcp_hybrid_stop; 11298 tcp_log_event(rack->rc_tp, NULL, 11299 &rack->rc_inp->inp_socket->so_rcv, 11300 &rack->rc_inp->inp_socket->so_snd, 11301 TCP_HYBRID_PACING_LOG, 0, 11302 0, &log, false, NULL, __func__, __LINE__, &tv); 11303 } 11304 } 11305 #endif 11306 11307 #ifdef TCP_REQUEST_TRK 11308 static void 11309 rack_set_dgp_hybrid_mode(struct tcp_rack *rack, tcp_seq seq, uint32_t len, uint64_t cts) 11310 { 11311 struct tcp_sendfile_track *rc_cur, *orig_ent; 11312 struct tcpcb *tp; 11313 int err = 0; 11314 11315 orig_ent = rack->r_ctl.rc_last_sft; 11316 rc_cur = tcp_req_find_req_for_seq(rack->rc_tp, seq); 11317 if (rc_cur == NULL) { 11318 /* If not in the beginning what about the end piece */ 11319 if (rack->rc_hybrid_mode) 11320 rack_log_hybrid(rack, seq, NULL, HYBRID_LOG_NO_RANGE, __LINE__, err); 11321 rc_cur = tcp_req_find_req_for_seq(rack->rc_tp, (seq + len - 1)); 11322 } else { 11323 err = 12345; 11324 } 11325 /* If we find no parameters we are in straight DGP mode */ 11326 if(rc_cur == NULL) { 11327 /* None found for this seq, just DGP for now */ 11328 if (rack->rc_hybrid_mode) { 11329 rack->r_ctl.client_suggested_maxseg = 0; 11330 rack->rc_catch_up = 0; 11331 if (rack->cspr_is_fcc == 0) 11332 rack->r_ctl.bw_rate_cap = 0; 11333 else 11334 rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap; 11335 } 11336 if (rack->rc_hybrid_mode) { 11337 rack_log_hybrid(rack, (seq + len - 1), NULL, HYBRID_LOG_NO_RANGE, __LINE__, err); 11338 } 11339 if (rack->r_ctl.rc_last_sft) { 11340 rack->r_ctl.rc_last_sft = NULL; 11341 } 11342 return; 11343 } 11344 if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_WASSET) == 0) { 11345 /* This entry was never setup for hybrid pacing on/off etc */ 11346 if (rack->rc_hybrid_mode) { 11347 rack->r_ctl.client_suggested_maxseg = 0; 11348 rack->rc_catch_up = 0; 11349 rack->r_ctl.bw_rate_cap = 0; 11350 } 11351 if (rack->r_ctl.rc_last_sft) { 11352 rack->r_ctl.rc_last_sft = NULL; 11353 } 11354 if ((rc_cur->flags & TCP_TRK_TRACK_FLG_FSND) == 0) { 11355 rc_cur->flags |= TCP_TRK_TRACK_FLG_FSND; 11356 rc_cur->first_send = cts; 11357 rc_cur->sent_at_fs = rack->rc_tp->t_sndbytes; 11358 rc_cur->rxt_at_fs = rack->rc_tp->t_snd_rxt_bytes; 11359 } 11360 return; 11361 } 11362 /* 11363 * Ok if we have a new entry *or* have never 11364 * set up an entry we need to proceed. If 11365 * we have already set it up this entry we 11366 * just continue along with what we already 11367 * setup. 
11368 */ 11369 tp = rack->rc_tp; 11370 if ((rack->r_ctl.rc_last_sft != NULL) && 11371 (rack->r_ctl.rc_last_sft == rc_cur)) { 11372 /* Its already in place */ 11373 if (rack->rc_hybrid_mode) 11374 rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_ISSAME, __LINE__, 0); 11375 return; 11376 } 11377 if (rack->rc_hybrid_mode == 0) { 11378 rack->r_ctl.rc_last_sft = rc_cur; 11379 if (orig_ent) { 11380 orig_ent->sent_at_ls = rack->rc_tp->t_sndbytes; 11381 orig_ent->rxt_at_ls = rack->rc_tp->t_snd_rxt_bytes; 11382 orig_ent->flags |= TCP_TRK_TRACK_FLG_LSND; 11383 } 11384 rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_RULES_APP, __LINE__, 0); 11385 return; 11386 } 11387 if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_CSPR) && rc_cur->cspr){ 11388 /* Compensate for all the header overhead's */ 11389 if (rack->cspr_is_fcc == 0) 11390 rack->r_ctl.bw_rate_cap = rack_compensate_for_linerate(rack, rc_cur->cspr); 11391 else 11392 rack->r_ctl.fillcw_cap = rack_compensate_for_linerate(rack, rc_cur->cspr); 11393 } else { 11394 if (rack->rc_hybrid_mode) { 11395 if (rack->cspr_is_fcc == 0) 11396 rack->r_ctl.bw_rate_cap = 0; 11397 else 11398 rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap; 11399 } 11400 } 11401 if (rc_cur->hybrid_flags & TCP_HYBRID_PACING_H_MS) 11402 rack->r_ctl.client_suggested_maxseg = rc_cur->hint_maxseg; 11403 else 11404 rack->r_ctl.client_suggested_maxseg = 0; 11405 if (rc_cur->timestamp == rack->r_ctl.last_tm_mark) { 11406 /* 11407 * It is the same timestamp as the previous one 11408 * add the hybrid flag that will indicate we use 11409 * sendtime not arrival time for catch-up mode. 11410 */ 11411 rc_cur->hybrid_flags |= TCP_HYBRID_PACING_SENDTIME; 11412 } 11413 if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_CU) && 11414 (rc_cur->cspr > 0)) { 11415 uint64_t len; 11416 11417 rack->rc_catch_up = 1; 11418 /* 11419 * Calculate the deadline time, first set the 11420 * time to when the request arrived. 11421 */ 11422 if (rc_cur->hybrid_flags & TCP_HYBRID_PACING_SENDTIME) { 11423 /* 11424 * For cases where its a duplicate tm (we received more 11425 * than one request for a tm) we want to use now, the point 11426 * where we are just sending the first bit of the request. 11427 */ 11428 rc_cur->deadline = cts; 11429 } else { 11430 /* 11431 * Here we have a different tm from the last request 11432 * so we want to use arrival time as our base. 11433 */ 11434 rc_cur->deadline = rc_cur->localtime; 11435 } 11436 /* 11437 * Next calculate the length and compensate for 11438 * TLS if need be. 11439 */ 11440 len = rc_cur->end - rc_cur->start; 11441 if (tp->t_inpcb.inp_socket->so_snd.sb_tls_info) { 11442 /* 11443 * This session is doing TLS. Take a swag guess 11444 * at the overhead. 11445 */ 11446 len += tcp_estimate_tls_overhead(tp->t_inpcb.inp_socket, len); 11447 } 11448 /* 11449 * Now considering the size, and the cspr, what is the time that 11450 * would be required at the cspr rate. Here we use the raw 11451 * cspr value since the client only looks at the raw data. We 11452 * do use len which includes TLS overhead, but not the TCP/IP etc. 11453 * That will get made up for in the CU pacing rate set. 11454 */ 11455 len *= HPTS_USEC_IN_SEC; 11456 len /= rc_cur->cspr; 11457 rc_cur->deadline += len; 11458 } else { 11459 rack->rc_catch_up = 0; 11460 rc_cur->deadline = 0; 11461 } 11462 if (rack->r_ctl.client_suggested_maxseg != 0) { 11463 /* 11464 * We need to reset the max pace segs if we have a 11465 * client_suggested_maxseg. 
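 * The call below recomputes the pacing segment sizing now that the
 * client hint is in place. (For the catch-up deadline computed
 * above, an illustrative example assuming cspr is in bytes per
 * second, as the arithmetic implies: a 1,000,000 byte range at
 * cspr = 5,000,000 adds 1,000,000 * HPTS_USEC_IN_SEC / 5,000,000 =
 * 200,000 usec, i.e. 200 ms, to the chosen base time.)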
11466 */ 11467 rack_set_pace_segments(tp, rack, __LINE__, NULL); 11468 } 11469 if (orig_ent) { 11470 orig_ent->sent_at_ls = rack->rc_tp->t_sndbytes; 11471 orig_ent->rxt_at_ls = rack->rc_tp->t_snd_rxt_bytes; 11472 orig_ent->flags |= TCP_TRK_TRACK_FLG_LSND; 11473 } 11474 rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_RULES_APP, __LINE__, 0); 11475 /* Remember it for next time and for CU mode */ 11476 rack->r_ctl.rc_last_sft = rc_cur; 11477 rack->r_ctl.last_tm_mark = rc_cur->timestamp; 11478 } 11479 #endif 11480 11481 static void 11482 rack_chk_req_and_hybrid_on_out(struct tcp_rack *rack, tcp_seq seq, uint32_t len, uint64_t cts) 11483 { 11484 #ifdef TCP_REQUEST_TRK 11485 struct tcp_sendfile_track *ent; 11486 11487 ent = rack->r_ctl.rc_last_sft; 11488 if ((ent == NULL) || 11489 (ent->flags == TCP_TRK_TRACK_FLG_EMPTY) || 11490 (SEQ_GEQ(seq, ent->end_seq))) { 11491 /* Time to update the track. */ 11492 rack_set_dgp_hybrid_mode(rack, seq, len, cts); 11493 ent = rack->r_ctl.rc_last_sft; 11494 } 11495 /* Out of all */ 11496 if (ent == NULL) { 11497 return; 11498 } 11499 if (SEQ_LT(ent->end_seq, (seq + len))) { 11500 /* 11501 * This is the case where our end_seq guess 11502 * was wrong. This is usually due to TLS having 11503 * more bytes then our guess. It could also be the 11504 * case that the client sent in two requests closely 11505 * and the SB is full of both so we are sending part 11506 * of each (end|beg). In such a case lets move this 11507 * guys end to match the end of this send. That 11508 * way it will complete when all of it is acked. 11509 */ 11510 ent->end_seq = (seq + len); 11511 if (rack->rc_hybrid_mode) 11512 rack_log_hybrid_bw(rack, seq, len, 0, 0, HYBRID_LOG_EXTEND, 0, ent, __LINE__); 11513 } 11514 /* Now validate we have set the send time of this one */ 11515 if ((ent->flags & TCP_TRK_TRACK_FLG_FSND) == 0) { 11516 ent->flags |= TCP_TRK_TRACK_FLG_FSND; 11517 ent->first_send = cts; 11518 ent->sent_at_fs = rack->rc_tp->t_sndbytes; 11519 ent->rxt_at_fs = rack->rc_tp->t_snd_rxt_bytes; 11520 } 11521 #endif 11522 } 11523 11524 static void 11525 rack_gain_for_fastoutput(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t acked_amount) 11526 { 11527 /* 11528 * The fast output path is enabled and we 11529 * have moved the cumack forward. Lets see if 11530 * we can expand forward the fast path length by 11531 * that amount. What we would ideally like to 11532 * do is increase the number of bytes in the 11533 * fast path block (left_to_send) by the 11534 * acked amount. However we have to gate that 11535 * by two factors: 11536 * 1) The amount outstanding and the rwnd of the peer 11537 * (i.e. we don't want to exceed the rwnd of the peer). 11538 * <and> 11539 * 2) The amount of data left in the socket buffer (i.e. 11540 * we can't send beyond what is in the buffer). 11541 * 11542 * Note that this does not take into account any increase 11543 * in the cwnd. We will only extend the fast path by 11544 * what was acked. 
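 * A worked example with illustrative numbers: with left_to_send =
 * 10000, acked_amount = 2920, sbavail = 50000, snd_wnd = 40000 and
 * 20000 bytes outstanding, gating_val = min(50000 - 20000,
 * 40000 - 20000) = 20000; new_total = 12920 is within that, so the
 * fast send block is extended to 12920 bytes.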
11545 */ 11546 uint32_t new_total, gating_val; 11547 11548 new_total = acked_amount + rack->r_ctl.fsb.left_to_send; 11549 gating_val = min((sbavail(&so->so_snd) - (tp->snd_max - tp->snd_una)), 11550 (tp->snd_wnd - (tp->snd_max - tp->snd_una))); 11551 if (new_total <= gating_val) { 11552 /* We can increase left_to_send by the acked amount */ 11553 counter_u64_add(rack_extended_rfo, 1); 11554 rack->r_ctl.fsb.left_to_send = new_total; 11555 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(&rack->rc_inp->inp_socket->so_snd) - (tp->snd_max - tp->snd_una))), 11556 ("rack:%p left_to_send:%u sbavail:%u out:%u", 11557 rack, rack->r_ctl.fsb.left_to_send, 11558 sbavail(&rack->rc_inp->inp_socket->so_snd), 11559 (tp->snd_max - tp->snd_una))); 11560 11561 } 11562 } 11563 11564 static void 11565 rack_adjust_sendmap_head(struct tcp_rack *rack, struct sockbuf *sb) 11566 { 11567 /* 11568 * Here any sendmap entry that points to the 11569 * beginning mbuf must be adjusted to the correct 11570 * offset. This must be called with: 11571 * 1) The socket buffer locked 11572 * 2) snd_una adjusted to its new position. 11573 * 11574 * Note that (2) implies rack_ack_received has also 11575 * been called and all the sbcut's have been done. 11576 * 11577 * We grab the first mbuf in the socket buffer and 11578 * then go through the front of the sendmap, recalculating 11579 * the stored offset for any sendmap entry that has 11580 * that mbuf. We must use the sb functions to do this 11581 * since its possible an add was done has well as 11582 * the subtraction we may have just completed. This should 11583 * not be a penalty though, since we just referenced the sb 11584 * to go in and trim off the mbufs that we freed (of course 11585 * there will be a penalty for the sendmap references though). 11586 * 11587 * Note also with INVARIANT on, we validate with a KASSERT 11588 * that the first sendmap entry has a soff of 0. 11589 * 11590 */ 11591 struct mbuf *m; 11592 struct rack_sendmap *rsm; 11593 tcp_seq snd_una; 11594 #ifdef INVARIANTS 11595 int first_processed = 0; 11596 #endif 11597 11598 snd_una = rack->rc_tp->snd_una; 11599 SOCKBUF_LOCK_ASSERT(sb); 11600 m = sb->sb_mb; 11601 rsm = tqhash_min(rack->r_ctl.tqh); 11602 if ((rsm == NULL) || (m == NULL)) { 11603 /* Nothing outstanding */ 11604 return; 11605 } 11606 /* The very first RSM's mbuf must point to the head mbuf in the sb */ 11607 KASSERT((rsm->m == m), 11608 ("Rack:%p sb:%p rsm:%p -- first rsm mbuf not aligned to sb", 11609 rack, sb, rsm)); 11610 while (rsm->m && (rsm->m == m)) { 11611 /* one to adjust */ 11612 #ifdef INVARIANTS 11613 struct mbuf *tm; 11614 uint32_t soff; 11615 11616 tm = sbsndmbuf(sb, (rsm->r_start - snd_una), &soff); 11617 if ((rsm->orig_m_len != m->m_len) || 11618 (rsm->orig_t_space != M_TRAILINGROOM(m))){ 11619 rack_adjust_orig_mlen(rsm); 11620 } 11621 if (first_processed == 0) { 11622 KASSERT((rsm->soff == 0), 11623 ("Rack:%p rsm:%p -- rsm at head but soff not zero", 11624 rack, rsm)); 11625 first_processed = 1; 11626 } 11627 if ((rsm->soff != soff) || (rsm->m != tm)) { 11628 /* 11629 * This is not a fatal error, we anticipate it 11630 * might happen (the else code), so we count it here 11631 * so that under invariant we can see that it really 11632 * does happen. 
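 * (The counter only records that the cached (m, soff) pair had
 * drifted from what sbsndmbuf() now computes; the entry is fixed
 * up either way.)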
11633 */ 11634 counter_u64_add(rack_adjust_map_bw, 1); 11635 } 11636 rsm->m = tm; 11637 rsm->soff = soff; 11638 if (tm) { 11639 rsm->orig_m_len = rsm->m->m_len; 11640 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 11641 } else { 11642 rsm->orig_m_len = 0; 11643 rsm->orig_t_space = 0; 11644 } 11645 #else 11646 rsm->m = sbsndmbuf(sb, (rsm->r_start - snd_una), &rsm->soff); 11647 if (rsm->m) { 11648 rsm->orig_m_len = rsm->m->m_len; 11649 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 11650 } else { 11651 rsm->orig_m_len = 0; 11652 rsm->orig_t_space = 0; 11653 } 11654 #endif 11655 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 11656 if (rsm == NULL) 11657 break; 11658 } 11659 } 11660 11661 #ifdef TCP_REQUEST_TRK 11662 static inline void 11663 rack_req_check_for_comp(struct tcp_rack *rack, tcp_seq th_ack) 11664 { 11665 struct tcp_sendfile_track *ent; 11666 int i; 11667 11668 if ((rack->rc_hybrid_mode == 0) && 11669 (tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING) == 0)) { 11670 /* 11671 * Just do normal completions hybrid pacing is not on 11672 * and CLDL is off as well. 11673 */ 11674 tcp_req_check_for_comp(rack->rc_tp, th_ack); 11675 return; 11676 } 11677 /* 11678 * Originally I was just going to find the th_ack associated 11679 * with an entry. But then I realized a large strech ack could 11680 * in theory ack two or more requests at once. So instead we 11681 * need to find all entries that are completed by th_ack not 11682 * just a single entry and do our logging. 11683 */ 11684 ent = tcp_req_find_a_req_that_is_completed_by(rack->rc_tp, th_ack, &i); 11685 while (ent != NULL) { 11686 /* 11687 * We may be doing hybrid pacing or CLDL and need more details possibly 11688 * so we do it manually instead of calling 11689 * tcp_req_check_for_comp() 11690 */ 11691 uint64_t laa, tim, data, cbw, ftim; 11692 11693 /* Ok this ack frees it */ 11694 rack_log_hybrid(rack, th_ack, 11695 ent, HYBRID_LOG_REQ_COMP, __LINE__, 0); 11696 rack_log_hybrid_sends(rack, ent, __LINE__); 11697 /* calculate the time based on the ack arrival */ 11698 data = ent->end - ent->start; 11699 laa = tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time); 11700 if (ent->flags & TCP_TRK_TRACK_FLG_FSND) { 11701 if (ent->first_send > ent->localtime) 11702 ftim = ent->first_send; 11703 else 11704 ftim = ent->localtime; 11705 } else { 11706 /* TSNH */ 11707 ftim = ent->localtime; 11708 } 11709 if (laa > ent->localtime) 11710 tim = laa - ftim; 11711 else 11712 tim = 0; 11713 cbw = data * HPTS_USEC_IN_SEC; 11714 if (tim > 0) 11715 cbw /= tim; 11716 else 11717 cbw = 0; 11718 rack_log_hybrid_bw(rack, th_ack, cbw, tim, data, HYBRID_LOG_BW_MEASURE, 0, ent, __LINE__); 11719 /* 11720 * Check to see if we are freeing what we are pointing to send wise 11721 * if so be sure to NULL the pointer so we know we are no longer 11722 * set to anything. 
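 * In hybrid mode that also means dropping the rate cap, catch-up
 * state and suggested maxseg tied to the completed request. For
 * reference, the cbw computed above is plain goodput: e.g. a
 * 500000 byte request whose ack lands 250000 usec after its first
 * send measures 500000 * HPTS_USEC_IN_SEC / 250000 = 2000000
 * bytes/sec, roughly 16 Mbits/sec (illustrative numbers only).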
11723 */ 11724 if (ent == rack->r_ctl.rc_last_sft) { 11725 rack->r_ctl.rc_last_sft = NULL; 11726 if (rack->rc_hybrid_mode) { 11727 rack->rc_catch_up = 0; 11728 if (rack->cspr_is_fcc == 0) 11729 rack->r_ctl.bw_rate_cap = 0; 11730 else 11731 rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap; 11732 rack->r_ctl.client_suggested_maxseg = 0; 11733 } 11734 } 11735 /* Generate the log that the tcp_netflix call would have */ 11736 tcp_req_log_req_info(rack->rc_tp, ent, 11737 i, TCP_TRK_REQ_LOG_FREED, 0, 0); 11738 /* Free it and see if there is another one */ 11739 tcp_req_free_a_slot(rack->rc_tp, ent); 11740 ent = tcp_req_find_a_req_that_is_completed_by(rack->rc_tp, th_ack, &i); 11741 } 11742 } 11743 #endif 11744 11745 11746 /* 11747 * Return value of 1, we do not need to call rack_process_data(). 11748 * return value of 0, rack_process_data can be called. 11749 * For ret_val if its 0 the TCP is locked, if its non-zero 11750 * its unlocked and probably unsafe to touch the TCB. 11751 */ 11752 static int 11753 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, 11754 struct tcpcb *tp, struct tcpopt *to, 11755 uint32_t tiwin, int32_t tlen, 11756 int32_t * ofia, int32_t thflags, int32_t *ret_val, int32_t orig_tlen) 11757 { 11758 int32_t ourfinisacked = 0; 11759 int32_t nsegs, acked_amount; 11760 int32_t acked; 11761 struct mbuf *mfree; 11762 struct tcp_rack *rack; 11763 int32_t under_pacing = 0; 11764 int32_t post_recovery = 0; 11765 uint32_t p_cwnd; 11766 11767 INP_WLOCK_ASSERT(tptoinpcb(tp)); 11768 11769 rack = (struct tcp_rack *)tp->t_fb_ptr; 11770 if (SEQ_GEQ(tp->snd_una, tp->iss + (65535 << tp->snd_scale))) { 11771 /* Checking SEG.ACK against ISS is definitely redundant. */ 11772 tp->t_flags2 |= TF2_NO_ISS_CHECK; 11773 } 11774 if (!V_tcp_insecure_ack) { 11775 tcp_seq seq_min; 11776 bool ghost_ack_check; 11777 11778 if (tp->t_flags2 & TF2_NO_ISS_CHECK) { 11779 /* Check for too old ACKs (RFC 5961, Section 5.2). */ 11780 seq_min = tp->snd_una - tp->max_sndwnd; 11781 ghost_ack_check = false; 11782 } else { 11783 if (SEQ_GT(tp->iss + 1, tp->snd_una - tp->max_sndwnd)) { 11784 /* Checking for ghost ACKs is stricter. */ 11785 seq_min = tp->iss + 1; 11786 ghost_ack_check = true; 11787 } else { 11788 /* 11789 * Checking for too old ACKs (RFC 5961, 11790 * Section 5.2) is stricter. 11791 */ 11792 seq_min = tp->snd_una - tp->max_sndwnd; 11793 ghost_ack_check = false; 11794 } 11795 } 11796 if (SEQ_LT(th->th_ack, seq_min)) { 11797 if (ghost_ack_check) 11798 TCPSTAT_INC(tcps_rcvghostack); 11799 else 11800 TCPSTAT_INC(tcps_rcvacktooold); 11801 /* Send challenge ACK. 
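 * (RFC 5961, Section 5.2: an ACK older than snd_una - max_sndwnd,
 * or, before the window has ever opened that far, older than
 * iss + 1, is answered with an ACK and otherwise ignored. For
 * example, with snd_una at 1000000 and max_sndwnd of 65535,
 * anything below 934465 is considered too old.)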
*/ 11802 ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); 11803 rack->r_wanted_output = 1; 11804 return (1); 11805 } 11806 } 11807 if (SEQ_GT(th->th_ack, tp->snd_max)) { 11808 ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); 11809 rack->r_wanted_output = 1; 11810 return (1); 11811 } 11812 if (rack->gp_ready && 11813 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 11814 under_pacing = 1; 11815 } 11816 if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { 11817 int in_rec, dup_ack_struck = 0; 11818 int dsack_seen = 0, sacks_seen = 0; 11819 11820 in_rec = IN_FASTRECOVERY(tp->t_flags); 11821 if (rack->rc_in_persist) { 11822 tp->t_rxtshift = 0; 11823 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 11824 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 11825 } 11826 11827 if ((th->th_ack == tp->snd_una) && 11828 (tiwin == tp->snd_wnd) && 11829 (orig_tlen == 0) && 11830 ((to->to_flags & TOF_SACK) == 0)) { 11831 rack_strike_dupack(rack, th->th_ack); 11832 dup_ack_struck = 1; 11833 } 11834 rack_log_ack(tp, to, th, ((in_rec == 0) && IN_FASTRECOVERY(tp->t_flags)), 11835 dup_ack_struck, &dsack_seen, &sacks_seen); 11836 11837 } 11838 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 11839 /* 11840 * Old ack, behind (or duplicate to) the last one rcv'd 11841 * Note: We mark reordering is occuring if its 11842 * less than and we have not closed our window. 11843 */ 11844 if (SEQ_LT(th->th_ack, tp->snd_una) && (sbspace(&so->so_rcv) > ctf_fixed_maxseg(tp))) { 11845 rack->r_ctl.rc_reorder_ts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time); 11846 if (rack->r_ctl.rc_reorder_ts == 0) 11847 rack->r_ctl.rc_reorder_ts = 1; 11848 } 11849 return (0); 11850 } 11851 /* 11852 * If we reach this point, ACK is not a duplicate, i.e., it ACKs 11853 * something we sent. 11854 */ 11855 if (tp->t_flags & TF_NEEDSYN) { 11856 /* 11857 * T/TCP: Connection was half-synchronized, and our SYN has 11858 * been ACK'd (so connection is now fully synchronized). Go 11859 * to non-starred state, increment snd_una for ACK of SYN, 11860 * and check if we can do window scaling. 11861 */ 11862 tp->t_flags &= ~TF_NEEDSYN; 11863 tp->snd_una++; 11864 /* Do window scaling? */ 11865 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 11866 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 11867 tp->rcv_scale = tp->request_r_scale; 11868 /* Send window already scaled. */ 11869 } 11870 } 11871 nsegs = max(1, m->m_pkthdr.lro_nsegs); 11872 11873 acked = BYTES_THIS_ACK(tp, th); 11874 if (acked) { 11875 /* 11876 * Any time we move the cum-ack forward clear 11877 * keep-alive tied probe-not-answered. The 11878 * persists clears its own on entry. 11879 */ 11880 rack->probe_not_answered = 0; 11881 } 11882 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 11883 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 11884 /* 11885 * If we just performed our first retransmit, and the ACK arrives 11886 * within our recovery window, then it was a mistake to do the 11887 * retransmit in the first place. Recover our original cwnd and 11888 * ssthresh, and proceed to transmit where we left off. 
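 * (This path is only taken when no timestamps were received; the
 * (int)(ticks - t_badrxtwin) < 0 test asks whether the ack landed
 * inside the bad-retransmit window, in which case the RTO is
 * treated as spurious and CC_RTO_ERR backs out the cwnd/ssthresh
 * reduction.)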
11889 */ 11890 if ((tp->t_flags & TF_PREVVALID) && 11891 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) { 11892 tp->t_flags &= ~TF_PREVVALID; 11893 if (tp->t_rxtshift == 1 && 11894 (int)(ticks - tp->t_badrxtwin) < 0) 11895 rack_cong_signal(tp, CC_RTO_ERR, th->th_ack, __LINE__); 11896 } 11897 if (acked) { 11898 /* assure we are not backed off */ 11899 tp->t_rxtshift = 0; 11900 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 11901 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 11902 rack->rc_tlp_in_progress = 0; 11903 rack->r_ctl.rc_tlp_cnt_out = 0; 11904 /* 11905 * If it is the RXT timer we want to 11906 * stop it, so we can restart a TLP. 11907 */ 11908 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 11909 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 11910 #ifdef TCP_REQUEST_TRK 11911 rack_req_check_for_comp(rack, th->th_ack); 11912 #endif 11913 } 11914 /* 11915 * If we have a timestamp reply, update smoothed round trip time. If 11916 * no timestamp is present but transmit timer is running and timed 11917 * sequence number was acked, update smoothed round trip time. Since 11918 * we now have an rtt measurement, cancel the timer backoff (cf., 11919 * Phil Karn's retransmit alg.). Recompute the initial retransmit 11920 * timer. 11921 * 11922 * Some boxes send broken timestamp replies during the SYN+ACK 11923 * phase, ignore timestamps of 0 or we could calculate a huge RTT 11924 * and blow up the retransmit timer. 11925 */ 11926 /* 11927 * If all outstanding data is acked, stop retransmit timer and 11928 * remember to restart (more output or persist). If there is more 11929 * data to be acked, restart retransmit timer, using current 11930 * (possibly backed-off) value. 11931 */ 11932 if (acked == 0) { 11933 if (ofia) 11934 *ofia = ourfinisacked; 11935 return (0); 11936 } 11937 if (IN_RECOVERY(tp->t_flags)) { 11938 if (SEQ_LT(th->th_ack, tp->snd_recover) && 11939 (SEQ_LT(th->th_ack, tp->snd_max))) { 11940 tcp_rack_partialack(tp); 11941 } else { 11942 rack_post_recovery(tp, th->th_ack); 11943 post_recovery = 1; 11944 /* 11945 * Grab the segsiz, multiply by 2 and add the snd_cwnd 11946 * that is the max the CC should add if we are exiting 11947 * recovery and doing a late add. 11948 */ 11949 p_cwnd = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 11950 p_cwnd <<= 1; 11951 p_cwnd += tp->snd_cwnd; 11952 } 11953 } else if ((rack->rto_from_rec == 1) && 11954 SEQ_GEQ(th->th_ack, tp->snd_recover)) { 11955 /* 11956 * We were in recovery, hit a rxt timeout 11957 * and never re-entered recovery. The timeout(s) 11958 * made up all the lost data. In such a case 11959 * we need to clear the rto_from_rec flag. 11960 */ 11961 rack->rto_from_rec = 0; 11962 } 11963 /* 11964 * Let the congestion control algorithm update congestion control 11965 * related information. This typically means increasing the 11966 * congestion window. 
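 * When we just exited recovery above, p_cwnd caps the result at
 * snd_cwnd + 2 * segsiz (e.g. + 2896 bytes for a 1448 byte
 * segment) so a non-newreno CC such as cubic cannot overshoot on
 * this late add; see the clamp right after rack_ack_received().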
11967 */ 11968 rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, post_recovery); 11969 if (post_recovery && 11970 (tp->snd_cwnd > p_cwnd)) { 11971 /* Must be non-newreno (cubic) getting too ahead of itself */ 11972 tp->snd_cwnd = p_cwnd; 11973 } 11974 SOCK_SENDBUF_LOCK(so); 11975 acked_amount = min(acked, (int)sbavail(&so->so_snd)); 11976 tp->snd_wnd -= acked_amount; 11977 mfree = sbcut_locked(&so->so_snd, acked_amount); 11978 if ((sbused(&so->so_snd) == 0) && 11979 (acked > acked_amount) && 11980 (tp->t_state >= TCPS_FIN_WAIT_1) && 11981 (tp->t_flags & TF_SENTFIN)) { 11982 /* 11983 * We must be sure our fin 11984 * was sent and acked (we can be 11985 * in FIN_WAIT_1 without having 11986 * sent the fin). 11987 */ 11988 ourfinisacked = 1; 11989 } 11990 tp->snd_una = th->th_ack; 11991 /* wakeups? */ 11992 if (acked_amount && sbavail(&so->so_snd)) 11993 rack_adjust_sendmap_head(rack, &so->so_snd); 11994 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); 11995 /* NB: sowwakeup_locked() does an implicit unlock. */ 11996 sowwakeup_locked(so); 11997 m_freem(mfree); 11998 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 11999 tp->snd_recover = tp->snd_una; 12000 12001 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { 12002 tp->snd_nxt = tp->snd_max; 12003 } 12004 if (under_pacing && 12005 (rack->use_fixed_rate == 0) && 12006 (rack->in_probe_rtt == 0) && 12007 rack->rc_gp_dyn_mul && 12008 rack->rc_always_pace) { 12009 /* Check if we are dragging bottom */ 12010 rack_check_bottom_drag(tp, rack, so); 12011 } 12012 if (tp->snd_una == tp->snd_max) { 12013 /* Nothing left outstanding */ 12014 tp->t_flags &= ~TF_PREVVALID; 12015 if (rack->r_ctl.rc_went_idle_time == 0) 12016 rack->r_ctl.rc_went_idle_time = 1; 12017 rack->r_ctl.retran_during_recovery = 0; 12018 rack->r_ctl.dsack_byte_cnt = 0; 12019 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 12020 if (sbavail(&tptosocket(tp)->so_snd) == 0) 12021 tp->t_acktime = 0; 12022 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 12023 rack->rc_suspicious = 0; 12024 /* Set need output so persist might get set */ 12025 rack->r_wanted_output = 1; 12026 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 12027 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 12028 (sbavail(&so->so_snd) == 0) && 12029 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 12030 /* 12031 * The socket was gone and the 12032 * peer sent data (now or in the past), time to 12033 * reset him. 12034 */ 12035 *ret_val = 1; 12036 /* tcp_close will kill the inp pre-log the Reset */ 12037 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 12038 tp = tcp_close(tp); 12039 ctf_do_dropwithreset(m, tp, th, tlen); 12040 return (1); 12041 } 12042 } 12043 if (ofia) 12044 *ofia = ourfinisacked; 12045 return (0); 12046 } 12047 12048 12049 static void 12050 rack_log_collapse(struct tcp_rack *rack, uint32_t cnt, uint32_t split, uint32_t out, int line, 12051 int dir, uint32_t flags, struct rack_sendmap *rsm) 12052 { 12053 if (tcp_bblogging_on(rack->rc_tp)) { 12054 union tcp_log_stackspecific log; 12055 struct timeval tv; 12056 12057 memset(&log, 0, sizeof(log)); 12058 log.u_bbr.flex1 = cnt; 12059 log.u_bbr.flex2 = split; 12060 log.u_bbr.flex3 = out; 12061 log.u_bbr.flex4 = line; 12062 log.u_bbr.flex5 = rack->r_must_retran; 12063 log.u_bbr.flex6 = flags; 12064 log.u_bbr.flex7 = rack->rc_has_collapsed; 12065 log.u_bbr.flex8 = dir; /* 12066 * 1 is collapsed, 0 is uncollapsed, 12067 * 2 is log of a rsm being marked, 3 is a split. 
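 * The marking loop in rack_un_collapse_window() below also logs
 * with dir 4 for each rsm it flags RACK_RWND_COLLAPSED.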
12068 */ 12069 if (rsm == NULL) 12070 log.u_bbr.rttProp = 0; 12071 else 12072 log.u_bbr.rttProp = (uintptr_t)rsm; 12073 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 12074 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 12075 TCP_LOG_EVENTP(rack->rc_tp, NULL, 12076 &rack->rc_inp->inp_socket->so_rcv, 12077 &rack->rc_inp->inp_socket->so_snd, 12078 TCP_RACK_LOG_COLLAPSE, 0, 12079 0, &log, false, &tv); 12080 } 12081 } 12082 12083 static void 12084 rack_collapsed_window(struct tcp_rack *rack, uint32_t out, tcp_seq th_ack, int line) 12085 { 12086 /* 12087 * Here all we do is mark the collapsed point and set the flag. 12088 * This may happen again and again, but there is no 12089 * sense splitting our map until we know where the 12090 * peer finally lands in the collapse. 12091 */ 12092 tcp_trace_point(rack->rc_tp, TCP_TP_COLLAPSED_WND); 12093 if ((rack->rc_has_collapsed == 0) || 12094 (rack->r_ctl.last_collapse_point != (th_ack + rack->rc_tp->snd_wnd))) 12095 counter_u64_add(rack_collapsed_win_seen, 1); 12096 rack->r_ctl.last_collapse_point = th_ack + rack->rc_tp->snd_wnd; 12097 rack->r_ctl.high_collapse_point = rack->rc_tp->snd_max; 12098 rack->rc_has_collapsed = 1; 12099 rack->r_collapse_point_valid = 1; 12100 rack_log_collapse(rack, 0, th_ack, rack->r_ctl.last_collapse_point, line, 1, 0, NULL); 12101 } 12102 12103 static void 12104 rack_un_collapse_window(struct tcp_rack *rack, int line) 12105 { 12106 struct rack_sendmap *nrsm, *rsm; 12107 int cnt = 0, split = 0; 12108 int insret __diagused; 12109 12110 12111 tcp_trace_point(rack->rc_tp, TCP_TP_COLLAPSED_WND); 12112 rack->rc_has_collapsed = 0; 12113 rsm = tqhash_find(rack->r_ctl.tqh, rack->r_ctl.last_collapse_point); 12114 if (rsm == NULL) { 12115 /* Nothing to do maybe the peer ack'ed it all */ 12116 rack_log_collapse(rack, 0, 0, ctf_outstanding(rack->rc_tp), line, 0, 0, NULL); 12117 return; 12118 } 12119 /* Now do we need to split this one? */ 12120 if (SEQ_GT(rack->r_ctl.last_collapse_point, rsm->r_start)) { 12121 rack_log_collapse(rack, rsm->r_start, rsm->r_end, 12122 rack->r_ctl.last_collapse_point, line, 3, rsm->r_flags, rsm); 12123 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 12124 if (nrsm == NULL) { 12125 /* We can't get a rsm, mark all? 
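 * If the allocation fails we skip the split and simply mark from
 * the existing rsm onward; at worst a few bytes in front of the
 * collapse point get flagged RACK_RWND_COLLAPSED as well.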
*/ 12126 nrsm = rsm; 12127 goto no_split; 12128 } 12129 /* Clone it */ 12130 split = 1; 12131 rack_clone_rsm(rack, nrsm, rsm, rack->r_ctl.last_collapse_point); 12132 #ifndef INVARIANTS 12133 (void)tqhash_insert(rack->r_ctl.tqh, nrsm); 12134 #else 12135 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { 12136 panic("Insert in tailq_hash of %p fails ret:%d rack:%p rsm:%p", 12137 nrsm, insret, rack, rsm); 12138 } 12139 #endif 12140 rack_log_map_chg(rack->rc_tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 12141 rack->r_ctl.last_collapse_point, __LINE__); 12142 if (rsm->r_in_tmap) { 12143 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 12144 nrsm->r_in_tmap = 1; 12145 } 12146 /* 12147 * Set in the new RSM as the 12148 * collapsed starting point 12149 */ 12150 rsm = nrsm; 12151 } 12152 12153 no_split: 12154 TQHASH_FOREACH_FROM(nrsm, rack->r_ctl.tqh, rsm) { 12155 cnt++; 12156 nrsm->r_flags |= RACK_RWND_COLLAPSED; 12157 rack_log_collapse(rack, nrsm->r_start, nrsm->r_end, 0, line, 4, nrsm->r_flags, nrsm); 12158 cnt++; 12159 } 12160 if (cnt) { 12161 counter_u64_add(rack_collapsed_win, 1); 12162 } 12163 rack_log_collapse(rack, cnt, split, ctf_outstanding(rack->rc_tp), line, 0, 0, NULL); 12164 } 12165 12166 static void 12167 rack_handle_delayed_ack(struct tcpcb *tp, struct tcp_rack *rack, 12168 int32_t tlen, int32_t tfo_syn) 12169 { 12170 if (DELAY_ACK(tp, tlen) || tfo_syn) { 12171 rack_timer_cancel(tp, rack, 12172 rack->r_ctl.rc_rcvtime, __LINE__); 12173 tp->t_flags |= TF_DELACK; 12174 } else { 12175 rack->r_wanted_output = 1; 12176 tp->t_flags |= TF_ACKNOW; 12177 } 12178 } 12179 12180 static void 12181 rack_validate_fo_sendwin_up(struct tcpcb *tp, struct tcp_rack *rack) 12182 { 12183 /* 12184 * If fast output is in progress, lets validate that 12185 * the new window did not shrink on us and make it 12186 * so fast output should end. 12187 */ 12188 if (rack->r_fast_output) { 12189 uint32_t out; 12190 12191 /* 12192 * Calculate what we will send if left as is 12193 * and compare that to our send window. 12194 */ 12195 out = ctf_outstanding(tp); 12196 if ((out + rack->r_ctl.fsb.left_to_send) > tp->snd_wnd) { 12197 /* ok we have an issue */ 12198 if (out >= tp->snd_wnd) { 12199 /* Turn off fast output the window is met or collapsed */ 12200 rack->r_fast_output = 0; 12201 } else { 12202 /* we have some room left */ 12203 rack->r_ctl.fsb.left_to_send = tp->snd_wnd - out; 12204 if (rack->r_ctl.fsb.left_to_send < ctf_fixed_maxseg(tp)) { 12205 /* If not at least 1 full segment never mind */ 12206 rack->r_fast_output = 0; 12207 } 12208 } 12209 } 12210 } 12211 } 12212 12213 /* 12214 * Return value of 1, the TCB is unlocked and most 12215 * likely gone, return value of 0, the TCP is still 12216 * locked. 12217 */ 12218 static int 12219 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, 12220 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 12221 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 12222 { 12223 /* 12224 * Update window information. Don't look at window if no ACK: TAC's 12225 * send garbage on first SYN. 
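 * (The test below takes a window update when the segment is newer,
 * snd_wl1 < seq, or the same segment carries a newer ack,
 * snd_wl2 < ack, or the same seq/ack pair advertises a larger
 * window; a shrink on the same ack is applied in the else branch.)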
12226 */ 12227 int32_t nsegs; 12228 int32_t tfo_syn; 12229 struct tcp_rack *rack; 12230 12231 INP_WLOCK_ASSERT(tptoinpcb(tp)); 12232 12233 rack = (struct tcp_rack *)tp->t_fb_ptr; 12234 nsegs = max(1, m->m_pkthdr.lro_nsegs); 12235 if ((thflags & TH_ACK) && 12236 (SEQ_LT(tp->snd_wl1, th->th_seq) || 12237 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 12238 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 12239 /* keep track of pure window updates */ 12240 if (tlen == 0 && 12241 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 12242 KMOD_TCPSTAT_INC(tcps_rcvwinupd); 12243 tp->snd_wnd = tiwin; 12244 rack_validate_fo_sendwin_up(tp, rack); 12245 tp->snd_wl1 = th->th_seq; 12246 tp->snd_wl2 = th->th_ack; 12247 if (tp->snd_wnd > tp->max_sndwnd) 12248 tp->max_sndwnd = tp->snd_wnd; 12249 rack->r_wanted_output = 1; 12250 } else if (thflags & TH_ACK) { 12251 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { 12252 tp->snd_wnd = tiwin; 12253 rack_validate_fo_sendwin_up(tp, rack); 12254 tp->snd_wl1 = th->th_seq; 12255 tp->snd_wl2 = th->th_ack; 12256 } 12257 } 12258 if (tp->snd_wnd < ctf_outstanding(tp)) 12259 /* The peer collapsed the window */ 12260 rack_collapsed_window(rack, ctf_outstanding(tp), th->th_ack, __LINE__); 12261 else if (rack->rc_has_collapsed) 12262 rack_un_collapse_window(rack, __LINE__); 12263 if ((rack->r_collapse_point_valid) && 12264 (SEQ_GT(th->th_ack, rack->r_ctl.high_collapse_point))) 12265 rack->r_collapse_point_valid = 0; 12266 /* Was persist timer active and now we have window space? */ 12267 if ((rack->rc_in_persist != 0) && 12268 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 12269 rack->r_ctl.rc_pace_min_segs))) { 12270 rack_exit_persist(tp, rack, rack->r_ctl.rc_rcvtime); 12271 tp->snd_nxt = tp->snd_max; 12272 /* Make sure we output to start the timer */ 12273 rack->r_wanted_output = 1; 12274 } 12275 /* Do we enter persists? */ 12276 if ((rack->rc_in_persist == 0) && 12277 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 12278 TCPS_HAVEESTABLISHED(tp->t_state) && 12279 ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) && 12280 sbavail(&tptosocket(tp)->so_snd) && 12281 (sbavail(&tptosocket(tp)->so_snd) > tp->snd_wnd)) { 12282 /* 12283 * Here the rwnd is less than 12284 * the pacing size, we are established, 12285 * nothing is outstanding, and there is 12286 * data to send. Enter persists. 12287 */ 12288 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, tp->snd_una); 12289 } 12290 if (tp->t_flags2 & TF2_DROP_AF_DATA) { 12291 m_freem(m); 12292 return (0); 12293 } 12294 /* 12295 * don't process the URG bit, ignore them drag 12296 * along the up. 12297 */ 12298 tp->rcv_up = tp->rcv_nxt; 12299 12300 /* 12301 * Process the segment text, merging it into the TCP sequencing 12302 * queue, and arranging for acknowledgment of receipt if necessary. 12303 * This process logically involves adjusting tp->rcv_wnd as data is 12304 * presented to the user (this happens in tcp_usrreq.c, case 12305 * PRU_RCVD). If a FIN has already been received on this connection 12306 * then we just ignore the text. 
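 * In-order data with an empty reassembly queue is appended straight
 * to the receive buffer below and may be delay-acked; anything out
 * of order goes through tcp_reass() and forces an immediate ACK so
 * fast retransmit can work.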
12307 */ 12308 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && 12309 (tp->t_flags & TF_FASTOPEN)); 12310 if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) && 12311 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 12312 tcp_seq save_start = th->th_seq; 12313 tcp_seq save_rnxt = tp->rcv_nxt; 12314 int save_tlen = tlen; 12315 12316 m_adj(m, drop_hdrlen); /* delayed header drop */ 12317 /* 12318 * Insert segment which includes th into TCP reassembly 12319 * queue with control block tp. Set thflags to whether 12320 * reassembly now includes a segment with FIN. This handles 12321 * the common case inline (segment is the next to be 12322 * received on an established connection, and the queue is 12323 * empty), avoiding linkage into and removal from the queue 12324 * and repetition of various conversions. Set DELACK for 12325 * segments received in order, but ack immediately when 12326 * segments are out of order (so fast retransmit can work). 12327 */ 12328 if (th->th_seq == tp->rcv_nxt && 12329 SEGQ_EMPTY(tp) && 12330 (TCPS_HAVEESTABLISHED(tp->t_state) || 12331 tfo_syn)) { 12332 #ifdef NETFLIX_SB_LIMITS 12333 u_int mcnt, appended; 12334 12335 if (so->so_rcv.sb_shlim) { 12336 mcnt = m_memcnt(m); 12337 appended = 0; 12338 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 12339 CFO_NOSLEEP, NULL) == false) { 12340 counter_u64_add(tcp_sb_shlim_fails, 1); 12341 m_freem(m); 12342 return (0); 12343 } 12344 } 12345 #endif 12346 rack_handle_delayed_ack(tp, rack, tlen, tfo_syn); 12347 tp->rcv_nxt += tlen; 12348 if (tlen && 12349 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 12350 (tp->t_fbyte_in == 0)) { 12351 tp->t_fbyte_in = ticks; 12352 if (tp->t_fbyte_in == 0) 12353 tp->t_fbyte_in = 1; 12354 if (tp->t_fbyte_out && tp->t_fbyte_in) 12355 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 12356 } 12357 thflags = tcp_get_flags(th) & TH_FIN; 12358 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 12359 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 12360 SOCK_RECVBUF_LOCK(so); 12361 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 12362 m_freem(m); 12363 } else { 12364 int32_t newsize; 12365 12366 if (tlen > 0) { 12367 newsize = tcp_autorcvbuf(m, th, so, tp, tlen); 12368 if (newsize) 12369 if (!sbreserve_locked(so, SO_RCV, newsize, NULL)) 12370 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 12371 } 12372 #ifdef NETFLIX_SB_LIMITS 12373 appended = 12374 #endif 12375 sbappendstream_locked(&so->so_rcv, m, 0); 12376 } 12377 rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1); 12378 /* NB: sorwakeup_locked() does an implicit unlock. */ 12379 sorwakeup_locked(so); 12380 #ifdef NETFLIX_SB_LIMITS 12381 if (so->so_rcv.sb_shlim && appended != mcnt) 12382 counter_fo_release(so->so_rcv.sb_shlim, 12383 mcnt - appended); 12384 #endif 12385 } else { 12386 /* 12387 * XXX: Due to the header drop above "th" is 12388 * theoretically invalid by now. Fortunately 12389 * m_adj() doesn't actually frees any mbufs when 12390 * trimming from the head. 12391 */ 12392 tcp_seq temp = save_start; 12393 12394 thflags = tcp_reass(tp, th, &temp, &tlen, m); 12395 tp->t_flags |= TF_ACKNOW; 12396 if (tp->t_flags & TF_WAKESOR) { 12397 tp->t_flags &= ~TF_WAKESOR; 12398 /* NB: sorwakeup_locked() does an implicit unlock. */ 12399 sorwakeup_locked(so); 12400 } 12401 } 12402 if ((tp->t_flags & TF_SACK_PERMIT) && 12403 (save_tlen > 0) && 12404 TCPS_HAVEESTABLISHED(tp->t_state)) { 12405 if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { 12406 /* 12407 * DSACK actually handled in the fastpath 12408 * above. 
12409 */ 12410 tcp_update_sack_list(tp, save_start, 12411 save_start + save_tlen); 12412 } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { 12413 if ((tp->rcv_numsacks >= 1) && 12414 (tp->sackblks[0].end == save_start)) { 12415 /* 12416 * Partial overlap, recorded at todrop 12417 * above. 12418 */ 12419 tcp_update_sack_list(tp, 12420 tp->sackblks[0].start, 12421 tp->sackblks[0].end); 12422 } else { 12423 tcp_update_dsack_list(tp, save_start, 12424 save_start + save_tlen); 12425 } 12426 } else if (tlen >= save_tlen) { 12427 /* Update of sackblks. */ 12428 tcp_update_dsack_list(tp, save_start, 12429 save_start + save_tlen); 12430 } else if (tlen > 0) { 12431 tcp_update_dsack_list(tp, save_start, 12432 save_start + tlen); 12433 } 12434 } 12435 } else { 12436 m_freem(m); 12437 thflags &= ~TH_FIN; 12438 } 12439 12440 /* 12441 * If FIN is received ACK the FIN and let the user know that the 12442 * connection is closing. 12443 */ 12444 if (thflags & TH_FIN) { 12445 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 12446 /* The socket upcall is handled by socantrcvmore. */ 12447 socantrcvmore(so); 12448 /* 12449 * If connection is half-synchronized (ie NEEDSYN 12450 * flag on) then delay ACK, so it may be piggybacked 12451 * when SYN is sent. Otherwise, since we received a 12452 * FIN then no more input can be expected, send ACK 12453 * now. 12454 */ 12455 if (tp->t_flags & TF_NEEDSYN) { 12456 rack_timer_cancel(tp, rack, 12457 rack->r_ctl.rc_rcvtime, __LINE__); 12458 tp->t_flags |= TF_DELACK; 12459 } else { 12460 tp->t_flags |= TF_ACKNOW; 12461 } 12462 tp->rcv_nxt++; 12463 } 12464 switch (tp->t_state) { 12465 /* 12466 * In SYN_RECEIVED and ESTABLISHED STATES enter the 12467 * CLOSE_WAIT state. 12468 */ 12469 case TCPS_SYN_RECEIVED: 12470 tp->t_starttime = ticks; 12471 /* FALLTHROUGH */ 12472 case TCPS_ESTABLISHED: 12473 rack_timer_cancel(tp, rack, 12474 rack->r_ctl.rc_rcvtime, __LINE__); 12475 tcp_state_change(tp, TCPS_CLOSE_WAIT); 12476 break; 12477 12478 /* 12479 * If still in FIN_WAIT_1 STATE FIN has not been 12480 * acked so enter the CLOSING state. 12481 */ 12482 case TCPS_FIN_WAIT_1: 12483 rack_timer_cancel(tp, rack, 12484 rack->r_ctl.rc_rcvtime, __LINE__); 12485 tcp_state_change(tp, TCPS_CLOSING); 12486 break; 12487 12488 /* 12489 * In FIN_WAIT_2 state enter the TIME_WAIT state, 12490 * starting the time-wait timer, turning off the 12491 * other standard timers. 12492 */ 12493 case TCPS_FIN_WAIT_2: 12494 rack_timer_cancel(tp, rack, 12495 rack->r_ctl.rc_rcvtime, __LINE__); 12496 tcp_twstart(tp); 12497 return (1); 12498 } 12499 } 12500 /* 12501 * Return any desired output. 12502 */ 12503 if ((tp->t_flags & TF_ACKNOW) || 12504 (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) { 12505 rack->r_wanted_output = 1; 12506 } 12507 return (0); 12508 } 12509 12510 /* 12511 * Here nothing is really faster, its just that we 12512 * have broken out the fast-data path also just like 12513 * the fast-ack. 12514 */ 12515 static int 12516 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, 12517 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 12518 uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos) 12519 { 12520 int32_t nsegs; 12521 int32_t newsize = 0; /* automatic sockbuf scaling */ 12522 struct tcp_rack *rack; 12523 #ifdef NETFLIX_SB_LIMITS 12524 u_int mcnt, appended; 12525 #endif 12526 12527 /* 12528 * If last ACK falls within this segment's sequence numbers, record 12529 * the timestamp. 
NOTE that the test is modified according to the 12530 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 12531 */ 12532 if (__predict_false(th->th_seq != tp->rcv_nxt)) { 12533 return (0); 12534 } 12535 if (tiwin && tiwin != tp->snd_wnd) { 12536 return (0); 12537 } 12538 if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) { 12539 return (0); 12540 } 12541 if (__predict_false((to->to_flags & TOF_TS) && 12542 (TSTMP_LT(to->to_tsval, tp->ts_recent)))) { 12543 return (0); 12544 } 12545 if (__predict_false((th->th_ack != tp->snd_una))) { 12546 return (0); 12547 } 12548 if (__predict_false(tlen > sbspace(&so->so_rcv))) { 12549 return (0); 12550 } 12551 if ((to->to_flags & TOF_TS) != 0 && 12552 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 12553 tp->ts_recent_age = tcp_ts_getticks(); 12554 tp->ts_recent = to->to_tsval; 12555 } 12556 rack = (struct tcp_rack *)tp->t_fb_ptr; 12557 /* 12558 * This is a pure, in-sequence data packet with nothing on the 12559 * reassembly queue and we have enough buffer space to take it. 12560 */ 12561 nsegs = max(1, m->m_pkthdr.lro_nsegs); 12562 12563 #ifdef NETFLIX_SB_LIMITS 12564 if (so->so_rcv.sb_shlim) { 12565 mcnt = m_memcnt(m); 12566 appended = 0; 12567 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 12568 CFO_NOSLEEP, NULL) == false) { 12569 counter_u64_add(tcp_sb_shlim_fails, 1); 12570 m_freem(m); 12571 return (1); 12572 } 12573 } 12574 #endif 12575 /* Clean receiver SACK report if present */ 12576 if (tp->rcv_numsacks) 12577 tcp_clean_sackreport(tp); 12578 KMOD_TCPSTAT_INC(tcps_preddat); 12579 tp->rcv_nxt += tlen; 12580 if (tlen && 12581 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 12582 (tp->t_fbyte_in == 0)) { 12583 tp->t_fbyte_in = ticks; 12584 if (tp->t_fbyte_in == 0) 12585 tp->t_fbyte_in = 1; 12586 if (tp->t_fbyte_out && tp->t_fbyte_in) 12587 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 12588 } 12589 /* 12590 * Pull snd_wl1 up to prevent seq wrap relative to th_seq. 12591 */ 12592 tp->snd_wl1 = th->th_seq; 12593 /* 12594 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt. 12595 */ 12596 tp->rcv_up = tp->rcv_nxt; 12597 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 12598 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 12599 newsize = tcp_autorcvbuf(m, th, so, tp, tlen); 12600 12601 /* Add data to socket buffer. */ 12602 SOCK_RECVBUF_LOCK(so); 12603 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 12604 m_freem(m); 12605 } else { 12606 /* 12607 * Set new socket buffer size. Give up when limit is 12608 * reached. 12609 */ 12610 if (newsize) 12611 if (!sbreserve_locked(so, SO_RCV, newsize, NULL)) 12612 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 12613 m_adj(m, drop_hdrlen); /* delayed header drop */ 12614 #ifdef NETFLIX_SB_LIMITS 12615 appended = 12616 #endif 12617 sbappendstream_locked(&so->so_rcv, m, 0); 12618 ctf_calc_rwin(so, tp); 12619 } 12620 rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1); 12621 /* NB: sorwakeup_locked() does an implicit unlock. */ 12622 sorwakeup_locked(so); 12623 #ifdef NETFLIX_SB_LIMITS 12624 if (so->so_rcv.sb_shlim && mcnt != appended) 12625 counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended); 12626 #endif 12627 rack_handle_delayed_ack(tp, rack, tlen, 0); 12628 if (tp->snd_una == tp->snd_max) 12629 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 12630 return (1); 12631 } 12632 12633 /* 12634 * This subfunction is used to try to highly optimize the 12635 * fast path. We again allow window updates that are 12636 * in sequence to remain in the fast-path. We also add 12637 * in the __predict's to attempt to help the compiler. 
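 * A pure ack only stays on this path when it advances snd_una
 * without passing snd_max, the advertised window is non-zero, no
 * SYN/FIN state flags are pending, the timestamp is not old, we
 * are not in recovery, and nothing is marked sacked on the
 * scoreboard.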
12638 * Note that if we return a 0, then we can *not* process 12639 * it and the caller should push the packet into the 12640 * slow-path. 12641 */ 12642 static int 12643 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 12644 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 12645 uint32_t tiwin, int32_t nxt_pkt, uint32_t cts) 12646 { 12647 int32_t acked; 12648 int32_t nsegs; 12649 int32_t under_pacing = 0; 12650 struct tcp_rack *rack; 12651 12652 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 12653 /* Old ack, behind (or duplicate to) the last one rcv'd */ 12654 return (0); 12655 } 12656 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { 12657 /* Above what we have sent? */ 12658 return (0); 12659 } 12660 if (__predict_false(tiwin == 0)) { 12661 /* zero window */ 12662 return (0); 12663 } 12664 if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) { 12665 /* We need a SYN or a FIN, unlikely.. */ 12666 return (0); 12667 } 12668 if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { 12669 /* Timestamp is behind .. old ack with seq wrap? */ 12670 return (0); 12671 } 12672 if (__predict_false(IN_RECOVERY(tp->t_flags))) { 12673 /* Still recovering */ 12674 return (0); 12675 } 12676 rack = (struct tcp_rack *)tp->t_fb_ptr; 12677 if (rack->r_ctl.rc_sacked) { 12678 /* We have sack holes on our scoreboard */ 12679 return (0); 12680 } 12681 /* Ok if we reach here, we can process a fast-ack */ 12682 if (rack->gp_ready && 12683 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 12684 under_pacing = 1; 12685 } 12686 nsegs = max(1, m->m_pkthdr.lro_nsegs); 12687 rack_log_ack(tp, to, th, 0, 0, NULL, NULL); 12688 /* Did the window get updated? */ 12689 if (tiwin != tp->snd_wnd) { 12690 tp->snd_wnd = tiwin; 12691 rack_validate_fo_sendwin_up(tp, rack); 12692 tp->snd_wl1 = th->th_seq; 12693 if (tp->snd_wnd > tp->max_sndwnd) 12694 tp->max_sndwnd = tp->snd_wnd; 12695 } 12696 /* Do we exit persists? */ 12697 if ((rack->rc_in_persist != 0) && 12698 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 12699 rack->r_ctl.rc_pace_min_segs))) { 12700 rack_exit_persist(tp, rack, cts); 12701 } 12702 /* Do we enter persists? */ 12703 if ((rack->rc_in_persist == 0) && 12704 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 12705 TCPS_HAVEESTABLISHED(tp->t_state) && 12706 ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) && 12707 sbavail(&tptosocket(tp)->so_snd) && 12708 (sbavail(&tptosocket(tp)->so_snd) > tp->snd_wnd)) { 12709 /* 12710 * Here the rwnd is less than 12711 * the pacing size, we are established, 12712 * nothing is outstanding, and there is 12713 * data to send. Enter persists. 12714 */ 12715 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, th->th_ack); 12716 } 12717 /* 12718 * If last ACK falls within this segment's sequence numbers, record 12719 * the timestamp. NOTE that the test is modified according to the 12720 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 12721 */ 12722 if ((to->to_flags & TOF_TS) != 0 && 12723 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 12724 tp->ts_recent_age = tcp_ts_getticks(); 12725 tp->ts_recent = to->to_tsval; 12726 } 12727 /* 12728 * This is a pure ack for outstanding data. 12729 */ 12730 KMOD_TCPSTAT_INC(tcps_predack); 12731 12732 /* 12733 * "bad retransmit" recovery. 
12734 */ 12735 if ((tp->t_flags & TF_PREVVALID) && 12736 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) { 12737 tp->t_flags &= ~TF_PREVVALID; 12738 if (tp->t_rxtshift == 1 && 12739 (int)(ticks - tp->t_badrxtwin) < 0) 12740 rack_cong_signal(tp, CC_RTO_ERR, th->th_ack, __LINE__); 12741 } 12742 /* 12743 * Recalculate the transmit timer / rtt. 12744 * 12745 * Some boxes send broken timestamp replies during the SYN+ACK 12746 * phase, ignore timestamps of 0 or we could calculate a huge RTT 12747 * and blow up the retransmit timer. 12748 */ 12749 acked = BYTES_THIS_ACK(tp, th); 12750 12751 #ifdef TCP_HHOOK 12752 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 12753 hhook_run_tcp_est_in(tp, th, to); 12754 #endif 12755 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 12756 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 12757 if (acked) { 12758 struct mbuf *mfree; 12759 12760 rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, 0); 12761 SOCK_SENDBUF_LOCK(so); 12762 mfree = sbcut_locked(&so->so_snd, acked); 12763 tp->snd_una = th->th_ack; 12764 /* Note we want to hold the sb lock through the sendmap adjust */ 12765 rack_adjust_sendmap_head(rack, &so->so_snd); 12766 /* Wake up the socket if we have room to write more */ 12767 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); 12768 sowwakeup_locked(so); 12769 m_freem(mfree); 12770 tp->t_rxtshift = 0; 12771 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 12772 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 12773 rack->rc_tlp_in_progress = 0; 12774 rack->r_ctl.rc_tlp_cnt_out = 0; 12775 /* 12776 * If it is the RXT timer we want to 12777 * stop it, so we can restart a TLP. 12778 */ 12779 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 12780 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 12781 12782 #ifdef TCP_REQUEST_TRK 12783 rack_req_check_for_comp(rack, th->th_ack); 12784 #endif 12785 } 12786 /* 12787 * Let the congestion control algorithm update congestion control 12788 * related information. This typically means increasing the 12789 * congestion window. 12790 */ 12791 if (tp->snd_wnd < ctf_outstanding(tp)) { 12792 /* The peer collapsed the window */ 12793 rack_collapsed_window(rack, ctf_outstanding(tp), th->th_ack, __LINE__); 12794 } else if (rack->rc_has_collapsed) 12795 rack_un_collapse_window(rack, __LINE__); 12796 if ((rack->r_collapse_point_valid) && 12797 (SEQ_GT(tp->snd_una, rack->r_ctl.high_collapse_point))) 12798 rack->r_collapse_point_valid = 0; 12799 /* 12800 * Pull snd_wl2 up to prevent seq wrap relative to th_ack. 12801 */ 12802 tp->snd_wl2 = th->th_ack; 12803 tp->t_dupacks = 0; 12804 m_freem(m); 12805 /* ND6_HINT(tp); *//* Some progress has been made. */ 12806 12807 /* 12808 * If all outstanding data are acked, stop retransmit timer, 12809 * otherwise restart timer using current (possibly backed-off) 12810 * value. If process is waiting for space, wakeup/selwakeup/signal. 12811 * If data are ready to send, let tcp_output decide between more 12812 * output or persist. 
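 * Below, while pacing, we also check whether we are dragging the
 * bottom, and once nothing is left outstanding we stamp
 * rc_went_idle_time and cancel any pending timer.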
12813 */ 12814 if (under_pacing && 12815 (rack->use_fixed_rate == 0) && 12816 (rack->in_probe_rtt == 0) && 12817 rack->rc_gp_dyn_mul && 12818 rack->rc_always_pace) { 12819 /* Check if we are dragging bottom */ 12820 rack_check_bottom_drag(tp, rack, so); 12821 } 12822 if (tp->snd_una == tp->snd_max) { 12823 tp->t_flags &= ~TF_PREVVALID; 12824 rack->r_ctl.retran_during_recovery = 0; 12825 rack->rc_suspicious = 0; 12826 rack->r_ctl.dsack_byte_cnt = 0; 12827 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 12828 if (rack->r_ctl.rc_went_idle_time == 0) 12829 rack->r_ctl.rc_went_idle_time = 1; 12830 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 12831 if (sbavail(&tptosocket(tp)->so_snd) == 0) 12832 tp->t_acktime = 0; 12833 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 12834 } 12835 if (acked && rack->r_fast_output) 12836 rack_gain_for_fastoutput(rack, tp, so, (uint32_t)acked); 12837 if (sbavail(&so->so_snd)) { 12838 rack->r_wanted_output = 1; 12839 } 12840 return (1); 12841 } 12842 12843 /* 12844 * Return value of 1, the TCB is unlocked and most 12845 * likely gone, return value of 0, the TCP is still 12846 * locked. 12847 */ 12848 static int 12849 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, 12850 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 12851 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 12852 { 12853 int32_t ret_val = 0; 12854 int32_t orig_tlen = tlen; 12855 int32_t todrop; 12856 int32_t ourfinisacked = 0; 12857 struct tcp_rack *rack; 12858 12859 INP_WLOCK_ASSERT(tptoinpcb(tp)); 12860 12861 ctf_calc_rwin(so, tp); 12862 /* 12863 * If the state is SYN_SENT: if seg contains an ACK, but not for our 12864 * SYN, drop the input. if seg contains a RST, then drop the 12865 * connection. if seg does not contain SYN, then drop it. Otherwise 12866 * this is an acceptable SYN segment initialize tp->rcv_nxt and 12867 * tp->irs if seg contains ack then advance tp->snd_una if seg 12868 * contains an ECE and ECN support is enabled, the stream is ECN 12869 * capable. if SYN has been acked change to ESTABLISHED else 12870 * SYN_RCVD state arrange for segment to be acked (eventually) 12871 * continue processing rest of data/controls. 12872 */ 12873 if ((thflags & TH_ACK) && 12874 (SEQ_LEQ(th->th_ack, tp->iss) || 12875 SEQ_GT(th->th_ack, tp->snd_max))) { 12876 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 12877 ctf_do_dropwithreset(m, tp, th, tlen); 12878 return (1); 12879 } 12880 if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { 12881 TCP_PROBE5(connect__refused, NULL, tp, 12882 mtod(m, const char *), tp, th); 12883 tp = tcp_drop(tp, ECONNREFUSED); 12884 ctf_do_drop(m, tp); 12885 return (1); 12886 } 12887 if (thflags & TH_RST) { 12888 ctf_do_drop(m, tp); 12889 return (1); 12890 } 12891 if (!(thflags & TH_SYN)) { 12892 ctf_do_drop(m, tp); 12893 return (1); 12894 } 12895 tp->irs = th->th_seq; 12896 tcp_rcvseqinit(tp); 12897 rack = (struct tcp_rack *)tp->t_fb_ptr; 12898 if (thflags & TH_ACK) { 12899 int tfo_partial = 0; 12900 12901 KMOD_TCPSTAT_INC(tcps_connects); 12902 soisconnected(so); 12903 #ifdef MAC 12904 mac_socketpeer_set_from_mbuf(m, so); 12905 #endif 12906 /* Do window scaling on this connection? 
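 * Only if both ends agreed during the handshake, i.e. TF_RCVD_SCALE
 * and TF_REQ_SCALE are both set; rcv_scale then becomes the shift
 * we requested, e.g. a shift of 7 lets us advertise up to
 * 65535 << 7, roughly 8 MB.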
*/ 12907 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 12908 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 12909 tp->rcv_scale = tp->request_r_scale; 12910 } 12911 tp->rcv_adv += min(tp->rcv_wnd, 12912 TCP_MAXWIN << tp->rcv_scale); 12913 /* 12914 * If not all the data that was sent in the TFO SYN 12915 * has been acked, resend the remainder right away. 12916 */ 12917 if ((tp->t_flags & TF_FASTOPEN) && 12918 (tp->snd_una != tp->snd_max)) { 12919 /* Was it a partial ack? */ 12920 if (SEQ_LT(th->th_ack, tp->snd_max)) 12921 tfo_partial = 1; 12922 } 12923 /* 12924 * If there's data, delay ACK; if there's also a FIN ACKNOW 12925 * will be turned on later. 12926 */ 12927 if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial) { 12928 rack_timer_cancel(tp, rack, 12929 rack->r_ctl.rc_rcvtime, __LINE__); 12930 tp->t_flags |= TF_DELACK; 12931 } else { 12932 rack->r_wanted_output = 1; 12933 tp->t_flags |= TF_ACKNOW; 12934 } 12935 12936 tcp_ecn_input_syn_sent(tp, thflags, iptos); 12937 12938 if (SEQ_GT(th->th_ack, tp->snd_una)) { 12939 /* 12940 * We advance snd_una for the 12941 * fast open case. If th_ack is 12942 * acknowledging data beyond 12943 * snd_una we can't just call 12944 * ack-processing since the 12945 * data stream in our send-map 12946 * will start at snd_una + 1 (one 12947 * beyond the SYN). If its just 12948 * equal we don't need to do that 12949 * and there is no send_map. 12950 */ 12951 tp->snd_una++; 12952 if (tfo_partial && (SEQ_GT(tp->snd_max, tp->snd_una))) { 12953 /* 12954 * We sent a SYN with data, and thus have a 12955 * sendmap entry with a SYN set. Lets find it 12956 * and take off the send bit and the byte and 12957 * set it up to be what we send (send it next). 12958 */ 12959 struct rack_sendmap *rsm; 12960 12961 rsm = tqhash_min(rack->r_ctl.tqh); 12962 if (rsm) { 12963 if (rsm->r_flags & RACK_HAS_SYN) { 12964 rsm->r_flags &= ~RACK_HAS_SYN; 12965 rsm->r_start++; 12966 } 12967 rack->r_ctl.rc_resend = rsm; 12968 } 12969 } 12970 } 12971 /* 12972 * Received <SYN,ACK> in SYN_SENT[*] state. Transitions: 12973 * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 12974 */ 12975 tp->t_starttime = ticks; 12976 if (tp->t_flags & TF_NEEDFIN) { 12977 tcp_state_change(tp, TCPS_FIN_WAIT_1); 12978 tp->t_flags &= ~TF_NEEDFIN; 12979 thflags &= ~TH_SYN; 12980 } else { 12981 tcp_state_change(tp, TCPS_ESTABLISHED); 12982 TCP_PROBE5(connect__established, NULL, tp, 12983 mtod(m, const char *), tp, th); 12984 rack_cc_conn_init(tp); 12985 } 12986 } else { 12987 /* 12988 * Received initial SYN in SYN-SENT[*] state => simultaneous 12989 * open. If segment contains CC option and there is a 12990 * cached CC, apply TAO test. If it succeeds, connection is * 12991 * half-synchronized. Otherwise, do 3-way handshake: 12992 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If 12993 * there was no CC option, clear cached CC value. 12994 */ 12995 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN | TF_SONOTCONN); 12996 tcp_state_change(tp, TCPS_SYN_RECEIVED); 12997 } 12998 /* 12999 * Advance th->th_seq to correspond to first data byte. If data, 13000 * trim to stay within window, dropping FIN if necessary. 13001 */ 13002 th->th_seq++; 13003 if (tlen > tp->rcv_wnd) { 13004 todrop = tlen - tp->rcv_wnd; 13005 m_adj(m, -todrop); 13006 tlen = tp->rcv_wnd; 13007 thflags &= ~TH_FIN; 13008 KMOD_TCPSTAT_INC(tcps_rcvpackafterwin); 13009 KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 13010 } 13011 tp->snd_wl1 = th->th_seq - 1; 13012 tp->rcv_up = th->th_seq; 13013 /* 13014 * Client side of transaction: already sent SYN and data. 
If the 13015 * remote host used T/TCP to validate the SYN, our data will be 13016 * ACK'd; if so, enter normal data segment processing in the middle 13017 * of step 5, ack processing. Otherwise, goto step 6. 13018 */ 13019 if (thflags & TH_ACK) { 13020 /* For syn-sent we need to possibly update the rtt */ 13021 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 13022 uint32_t t, mcts; 13023 13024 mcts = tcp_ts_getticks(); 13025 t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC; 13026 if (!tp->t_rttlow || tp->t_rttlow > t) 13027 tp->t_rttlow = t; 13028 rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 4); 13029 tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2); 13030 tcp_rack_xmit_timer_commit(rack, tp); 13031 } 13032 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) 13033 return (ret_val); 13034 /* We may have changed to FIN_WAIT_1 above */ 13035 if (tp->t_state == TCPS_FIN_WAIT_1) { 13036 /* 13037 * In FIN_WAIT_1 STATE in addition to the processing 13038 * for the ESTABLISHED state if our FIN is now 13039 * acknowledged then enter FIN_WAIT_2. 13040 */ 13041 if (ourfinisacked) { 13042 /* 13043 * If we can't receive any more data, then 13044 * closing user can proceed. Starting the 13045 * timer is contrary to the specification, 13046 * but if we don't get a FIN we'll hang 13047 * forever. 13048 * 13049 * XXXjl: we should release the tp also, and 13050 * use a compressed state. 13051 */ 13052 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 13053 soisdisconnected(so); 13054 tcp_timer_activate(tp, TT_2MSL, 13055 (tcp_fast_finwait2_recycle ? 13056 tcp_finwait2_timeout : 13057 TP_MAXIDLE(tp))); 13058 } 13059 tcp_state_change(tp, TCPS_FIN_WAIT_2); 13060 } 13061 } 13062 } 13063 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13064 tiwin, thflags, nxt_pkt)); 13065 } 13066 13067 /* 13068 * Return value of 1, the TCB is unlocked and most 13069 * likely gone, return value of 0, the TCP is still 13070 * locked. 13071 */ 13072 static int 13073 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, 13074 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 13075 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 13076 { 13077 struct tcp_rack *rack; 13078 int32_t orig_tlen = tlen; 13079 int32_t ret_val = 0; 13080 int32_t ourfinisacked = 0; 13081 13082 rack = (struct tcp_rack *)tp->t_fb_ptr; 13083 ctf_calc_rwin(so, tp); 13084 if ((thflags & TH_RST) || 13085 (tp->t_fin_is_rst && (thflags & TH_FIN))) 13086 return (ctf_process_rst(m, th, so, tp)); 13087 if ((thflags & TH_ACK) && 13088 (SEQ_LEQ(th->th_ack, tp->snd_una) || 13089 SEQ_GT(th->th_ack, tp->snd_max))) { 13090 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 13091 ctf_do_dropwithreset(m, tp, th, tlen); 13092 return (1); 13093 } 13094 if (tp->t_flags & TF_FASTOPEN) { 13095 /* 13096 * When a TFO connection is in SYN_RECEIVED, the 13097 * only valid packets are the initial SYN, a 13098 * retransmit/copy of the initial SYN (possibly with 13099 * a subset of the original data), a valid ACK, a 13100 * FIN, or a RST. 
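 * Concretely, a SYN|ACK is answered with a reset below, a bare SYN
 * is ignored while a retransmit, TLP or RACK timer is pending, and
 * anything that is not an ACK, FIN or RST is simply dropped.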
13101 */ 13102 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { 13103 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 13104 ctf_do_dropwithreset(m, tp, th, tlen); 13105 return (1); 13106 } else if (thflags & TH_SYN) { 13107 /* non-initial SYN is ignored */ 13108 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || 13109 (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || 13110 (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { 13111 ctf_do_drop(m, NULL); 13112 return (0); 13113 } 13114 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { 13115 ctf_do_drop(m, NULL); 13116 return (0); 13117 } 13118 } 13119 13120 /* 13121 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 13122 * it's less than ts_recent, drop it. 13123 */ 13124 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 13125 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 13126 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 13127 return (ret_val); 13128 } 13129 /* 13130 * In the SYN-RECEIVED state, validate that the packet belongs to 13131 * this connection before trimming the data to fit the receive 13132 * window. Check the sequence number versus IRS since we know the 13133 * sequence numbers haven't wrapped. This is a partial fix for the 13134 * "LAND" DoS attack. 13135 */ 13136 if (SEQ_LT(th->th_seq, tp->irs)) { 13137 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 13138 ctf_do_dropwithreset(m, tp, th, tlen); 13139 return (1); 13140 } 13141 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 13142 return (ret_val); 13143 } 13144 /* 13145 * If last ACK falls within this segment's sequence numbers, record 13146 * its timestamp. NOTE: 1) That the test incorporates suggestions 13147 * from the latest proposal of the tcplw@cray.com list (Braden 13148 * 1993/04/26). 2) That updating only on newer timestamps interferes 13149 * with our earlier PAWS tests, so this check should be solely 13150 * predicated on the sequence space of this segment. 3) That we 13151 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 13152 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 13153 * SEG.Len, This modified check allows us to overcome RFC1323's 13154 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 13155 * p.869. In such cases, we can still calculate the RTT correctly 13156 * when RCV.NXT == Last.ACK.Sent. 13157 */ 13158 if ((to->to_flags & TOF_TS) != 0 && 13159 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 13160 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 13161 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 13162 tp->ts_recent_age = tcp_ts_getticks(); 13163 tp->ts_recent = to->to_tsval; 13164 } 13165 tp->snd_wnd = tiwin; 13166 rack_validate_fo_sendwin_up(tp, rack); 13167 /* 13168 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 13169 * is on (half-synchronized state), then queue data for later 13170 * processing; else drop segment and return. 13171 */ 13172 if ((thflags & TH_ACK) == 0) { 13173 if (tp->t_flags & TF_FASTOPEN) { 13174 rack_cc_conn_init(tp); 13175 } 13176 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13177 tiwin, thflags, nxt_pkt)); 13178 } 13179 KMOD_TCPSTAT_INC(tcps_connects); 13180 if (tp->t_flags & TF_SONOTCONN) { 13181 tp->t_flags &= ~TF_SONOTCONN; 13182 soisconnected(so); 13183 } 13184 /* Do window scaling? 
*/ 13185 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 13186 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 13187 tp->rcv_scale = tp->request_r_scale; 13188 } 13189 /* 13190 * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> 13191 * FIN-WAIT-1 13192 */ 13193 tp->t_starttime = ticks; 13194 if ((tp->t_flags & TF_FASTOPEN) && tp->t_tfo_pending) { 13195 tcp_fastopen_decrement_counter(tp->t_tfo_pending); 13196 tp->t_tfo_pending = NULL; 13197 } 13198 if (tp->t_flags & TF_NEEDFIN) { 13199 tcp_state_change(tp, TCPS_FIN_WAIT_1); 13200 tp->t_flags &= ~TF_NEEDFIN; 13201 } else { 13202 tcp_state_change(tp, TCPS_ESTABLISHED); 13203 TCP_PROBE5(accept__established, NULL, tp, 13204 mtod(m, const char *), tp, th); 13205 /* 13206 * TFO connections call cc_conn_init() during SYN 13207 * processing. Calling it again here for such connections 13208 * is not harmless as it would undo the snd_cwnd reduction 13209 * that occurs when a TFO SYN|ACK is retransmitted. 13210 */ 13211 if (!(tp->t_flags & TF_FASTOPEN)) 13212 rack_cc_conn_init(tp); 13213 } 13214 /* 13215 * Account for the ACK of our SYN prior to 13216 * regular ACK processing below, except for 13217 * simultaneous SYN, which is handled later. 13218 */ 13219 if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN)) 13220 tp->snd_una++; 13221 /* 13222 * If segment contains data or ACK, will call tcp_reass() later; if 13223 * not, do so now to pass queued data to user. 13224 */ 13225 if (tlen == 0 && (thflags & TH_FIN) == 0) { 13226 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, 13227 (struct mbuf *)0); 13228 if (tp->t_flags & TF_WAKESOR) { 13229 tp->t_flags &= ~TF_WAKESOR; 13230 /* NB: sorwakeup_locked() does an implicit unlock. */ 13231 sorwakeup_locked(so); 13232 } 13233 } 13234 tp->snd_wl1 = th->th_seq - 1; 13235 /* For syn-recv we need to possibly update the rtt */ 13236 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 13237 uint32_t t, mcts; 13238 13239 mcts = tcp_ts_getticks(); 13240 t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC; 13241 if (!tp->t_rttlow || tp->t_rttlow > t) 13242 tp->t_rttlow = t; 13243 rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 5); 13244 tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2); 13245 tcp_rack_xmit_timer_commit(rack, tp); 13246 } 13247 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) { 13248 return (ret_val); 13249 } 13250 if (tp->t_state == TCPS_FIN_WAIT_1) { 13251 /* We could have went to FIN_WAIT_1 (or EST) above */ 13252 /* 13253 * In FIN_WAIT_1 STATE in addition to the processing for the 13254 * ESTABLISHED state if our FIN is now acknowledged then 13255 * enter FIN_WAIT_2. 13256 */ 13257 if (ourfinisacked) { 13258 /* 13259 * If we can't receive any more data, then closing 13260 * user can proceed. Starting the timer is contrary 13261 * to the specification, but if we don't get a FIN 13262 * we'll hang forever. 13263 * 13264 * XXXjl: we should release the tp also, and use a 13265 * compressed state. 13266 */ 13267 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 13268 soisdisconnected(so); 13269 tcp_timer_activate(tp, TT_2MSL, 13270 (tcp_fast_finwait2_recycle ? 
13271 tcp_finwait2_timeout : 13272 TP_MAXIDLE(tp))); 13273 } 13274 tcp_state_change(tp, TCPS_FIN_WAIT_2); 13275 } 13276 } 13277 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13278 tiwin, thflags, nxt_pkt)); 13279 } 13280 13281 /* 13282 * Return value of 1, the TCB is unlocked and most 13283 * likely gone, return value of 0, the TCP is still 13284 * locked. 13285 */ 13286 static int 13287 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, 13288 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 13289 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 13290 { 13291 int32_t ret_val = 0; 13292 int32_t orig_tlen = tlen; 13293 struct tcp_rack *rack; 13294 13295 /* 13296 * Header prediction: check for the two common cases of a 13297 * uni-directional data xfer. If the packet has no control flags, 13298 * is in-sequence, the window didn't change and we're not 13299 * retransmitting, it's a candidate. If the length is zero and the 13300 * ack moved forward, we're the sender side of the xfer. Just free 13301 * the data acked & wake any higher level process that was blocked 13302 * waiting for space. If the length is non-zero and the ack didn't 13303 * move, we're the receiver side. If we're getting packets in-order 13304 * (the reassembly queue is empty), add the data toc The socket 13305 * buffer and note that we need a delayed ack. Make sure that the 13306 * hidden state-flags are also off. Since we check for 13307 * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN. 13308 */ 13309 rack = (struct tcp_rack *)tp->t_fb_ptr; 13310 if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && 13311 __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_ACK)) == TH_ACK) && 13312 __predict_true(SEGQ_EMPTY(tp)) && 13313 __predict_true(th->th_seq == tp->rcv_nxt)) { 13314 if (tlen == 0) { 13315 if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen, 13316 tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) { 13317 return (0); 13318 } 13319 } else { 13320 if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, 13321 tiwin, nxt_pkt, iptos)) { 13322 return (0); 13323 } 13324 } 13325 } 13326 ctf_calc_rwin(so, tp); 13327 13328 if ((thflags & TH_RST) || 13329 (tp->t_fin_is_rst && (thflags & TH_FIN))) 13330 return (ctf_process_rst(m, th, so, tp)); 13331 13332 /* 13333 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 13334 * synchronized state. 13335 */ 13336 if (thflags & TH_SYN) { 13337 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 13338 return (ret_val); 13339 } 13340 /* 13341 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 13342 * it's less than ts_recent, drop it. 13343 */ 13344 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 13345 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 13346 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 13347 return (ret_val); 13348 } 13349 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 13350 return (ret_val); 13351 } 13352 /* 13353 * If last ACK falls within this segment's sequence numbers, record 13354 * its timestamp. NOTE: 1) That the test incorporates suggestions 13355 * from the latest proposal of the tcplw@cray.com list (Braden 13356 * 1993/04/26). 2) That updating only on newer timestamps interferes 13357 * with our earlier PAWS tests, so this check should be solely 13358 * predicated on the sequence space of this segment. 
3) That we 13359 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 13360 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 13361 * SEG.Len, This modified check allows us to overcome RFC1323's 13362 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 13363 * p.869. In such cases, we can still calculate the RTT correctly 13364 * when RCV.NXT == Last.ACK.Sent. 13365 */ 13366 if ((to->to_flags & TOF_TS) != 0 && 13367 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 13368 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 13369 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 13370 tp->ts_recent_age = tcp_ts_getticks(); 13371 tp->ts_recent = to->to_tsval; 13372 } 13373 /* 13374 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 13375 * is on (half-synchronized state), then queue data for later 13376 * processing; else drop segment and return. 13377 */ 13378 if ((thflags & TH_ACK) == 0) { 13379 if (tp->t_flags & TF_NEEDSYN) { 13380 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13381 tiwin, thflags, nxt_pkt)); 13382 13383 } else if (tp->t_flags & TF_ACKNOW) { 13384 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 13385 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 13386 return (ret_val); 13387 } else { 13388 ctf_do_drop(m, NULL); 13389 return (0); 13390 } 13391 } 13392 /* 13393 * Ack processing. 13394 */ 13395 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val, orig_tlen)) { 13396 return (ret_val); 13397 } 13398 if (sbavail(&so->so_snd)) { 13399 if (ctf_progress_timeout_check(tp, true)) { 13400 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 13401 ctf_do_dropwithreset_conn(m, tp, th, tlen); 13402 return (1); 13403 } 13404 } 13405 /* State changes only happen in rack_process_data() */ 13406 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13407 tiwin, thflags, nxt_pkt)); 13408 } 13409 13410 /* 13411 * Return value of 1, the TCB is unlocked and most 13412 * likely gone, return value of 0, the TCP is still 13413 * locked. 13414 */ 13415 static int 13416 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, 13417 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 13418 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 13419 { 13420 int32_t ret_val = 0; 13421 int32_t orig_tlen = tlen; 13422 13423 ctf_calc_rwin(so, tp); 13424 if ((thflags & TH_RST) || 13425 (tp->t_fin_is_rst && (thflags & TH_FIN))) 13426 return (ctf_process_rst(m, th, so, tp)); 13427 /* 13428 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 13429 * synchronized state. 13430 */ 13431 if (thflags & TH_SYN) { 13432 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 13433 return (ret_val); 13434 } 13435 /* 13436 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 13437 * it's less than ts_recent, drop it. 13438 */ 13439 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 13440 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 13441 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 13442 return (ret_val); 13443 } 13444 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 13445 return (ret_val); 13446 } 13447 /* 13448 * If last ACK falls within this segment's sequence numbers, record 13449 * its timestamp. NOTE: 1) That the test incorporates suggestions 13450 * from the latest proposal of the tcplw@cray.com list (Braden 13451 * 1993/04/26). 
2) That updating only on newer timestamps interferes 13452 * with our earlier PAWS tests, so this check should be solely 13453 * predicated on the sequence space of this segment. 3) That we 13454 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 13455 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 13456 * SEG.Len, This modified check allows us to overcome RFC1323's 13457 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 13458 * p.869. In such cases, we can still calculate the RTT correctly 13459 * when RCV.NXT == Last.ACK.Sent. 13460 */ 13461 if ((to->to_flags & TOF_TS) != 0 && 13462 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 13463 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 13464 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 13465 tp->ts_recent_age = tcp_ts_getticks(); 13466 tp->ts_recent = to->to_tsval; 13467 } 13468 /* 13469 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 13470 * is on (half-synchronized state), then queue data for later 13471 * processing; else drop segment and return. 13472 */ 13473 if ((thflags & TH_ACK) == 0) { 13474 if (tp->t_flags & TF_NEEDSYN) { 13475 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13476 tiwin, thflags, nxt_pkt)); 13477 13478 } else if (tp->t_flags & TF_ACKNOW) { 13479 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 13480 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 13481 return (ret_val); 13482 } else { 13483 ctf_do_drop(m, NULL); 13484 return (0); 13485 } 13486 } 13487 /* 13488 * Ack processing. 13489 */ 13490 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val, orig_tlen)) { 13491 return (ret_val); 13492 } 13493 if (sbavail(&so->so_snd)) { 13494 if (ctf_progress_timeout_check(tp, true)) { 13495 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 13496 tp, tick, PROGRESS_DROP, __LINE__); 13497 ctf_do_dropwithreset_conn(m, tp, th, tlen); 13498 return (1); 13499 } 13500 } 13501 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13502 tiwin, thflags, nxt_pkt)); 13503 } 13504 13505 static int 13506 rack_check_data_after_close(struct mbuf *m, 13507 struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so) 13508 { 13509 struct tcp_rack *rack; 13510 13511 rack = (struct tcp_rack *)tp->t_fb_ptr; 13512 if (rack->rc_allow_data_af_clo == 0) { 13513 close_now: 13514 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 13515 /* tcp_close will kill the inp pre-log the Reset */ 13516 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 13517 tp = tcp_close(tp); 13518 KMOD_TCPSTAT_INC(tcps_rcvafterclose); 13519 ctf_do_dropwithreset(m, tp, th, *tlen); 13520 return (1); 13521 } 13522 if (sbavail(&so->so_snd) == 0) 13523 goto close_now; 13524 /* Ok we allow data that is ignored and a followup reset */ 13525 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 13526 tp->rcv_nxt = th->th_seq + *tlen; 13527 tp->t_flags2 |= TF2_DROP_AF_DATA; 13528 rack->r_wanted_output = 1; 13529 *tlen = 0; 13530 return (0); 13531 } 13532 13533 /* 13534 * Return value of 1, the TCB is unlocked and most 13535 * likely gone, return value of 0, the TCP is still 13536 * locked. 
13537 */ 13538 static int 13539 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, 13540 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 13541 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 13542 { 13543 int32_t ret_val = 0; 13544 int32_t orig_tlen = tlen; 13545 int32_t ourfinisacked = 0; 13546 13547 ctf_calc_rwin(so, tp); 13548 13549 if ((thflags & TH_RST) || 13550 (tp->t_fin_is_rst && (thflags & TH_FIN))) 13551 return (ctf_process_rst(m, th, so, tp)); 13552 /* 13553 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 13554 * synchronized state. 13555 */ 13556 if (thflags & TH_SYN) { 13557 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 13558 return (ret_val); 13559 } 13560 /* 13561 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 13562 * it's less than ts_recent, drop it. 13563 */ 13564 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 13565 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 13566 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 13567 return (ret_val); 13568 } 13569 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 13570 return (ret_val); 13571 } 13572 /* 13573 * If new data are received on a connection after the user processes 13574 * are gone, then RST the other end. 13575 */ 13576 if ((tp->t_flags & TF_CLOSED) && tlen && 13577 rack_check_data_after_close(m, tp, &tlen, th, so)) 13578 return (1); 13579 /* 13580 * If last ACK falls within this segment's sequence numbers, record 13581 * its timestamp. NOTE: 1) That the test incorporates suggestions 13582 * from the latest proposal of the tcplw@cray.com list (Braden 13583 * 1993/04/26). 2) That updating only on newer timestamps interferes 13584 * with our earlier PAWS tests, so this check should be solely 13585 * predicated on the sequence space of this segment. 3) That we 13586 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 13587 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 13588 * SEG.Len, This modified check allows us to overcome RFC1323's 13589 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 13590 * p.869. In such cases, we can still calculate the RTT correctly 13591 * when RCV.NXT == Last.ACK.Sent. 13592 */ 13593 if ((to->to_flags & TOF_TS) != 0 && 13594 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 13595 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 13596 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 13597 tp->ts_recent_age = tcp_ts_getticks(); 13598 tp->ts_recent = to->to_tsval; 13599 } 13600 /* 13601 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 13602 * is on (half-synchronized state), then queue data for later 13603 * processing; else drop segment and return. 13604 */ 13605 if ((thflags & TH_ACK) == 0) { 13606 if (tp->t_flags & TF_NEEDSYN) { 13607 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13608 tiwin, thflags, nxt_pkt)); 13609 } else if (tp->t_flags & TF_ACKNOW) { 13610 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 13611 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 13612 return (ret_val); 13613 } else { 13614 ctf_do_drop(m, NULL); 13615 return (0); 13616 } 13617 } 13618 /* 13619 * Ack processing. 13620 */ 13621 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) { 13622 return (ret_val); 13623 } 13624 if (ourfinisacked) { 13625 /* 13626 * If we can't receive any more data, then closing user can 13627 * proceed. 
Starting the timer is contrary to the 13628 * specification, but if we don't get a FIN we'll hang 13629 * forever. 13630 * 13631 * XXXjl: we should release the tp also, and use a 13632 * compressed state. 13633 */ 13634 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 13635 soisdisconnected(so); 13636 tcp_timer_activate(tp, TT_2MSL, 13637 (tcp_fast_finwait2_recycle ? 13638 tcp_finwait2_timeout : 13639 TP_MAXIDLE(tp))); 13640 } 13641 tcp_state_change(tp, TCPS_FIN_WAIT_2); 13642 } 13643 if (sbavail(&so->so_snd)) { 13644 if (ctf_progress_timeout_check(tp, true)) { 13645 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 13646 tp, tick, PROGRESS_DROP, __LINE__); 13647 ctf_do_dropwithreset_conn(m, tp, th, tlen); 13648 return (1); 13649 } 13650 } 13651 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13652 tiwin, thflags, nxt_pkt)); 13653 } 13654 13655 /* 13656 * Return value of 1, the TCB is unlocked and most 13657 * likely gone, return value of 0, the TCP is still 13658 * locked. 13659 */ 13660 static int 13661 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, 13662 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 13663 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 13664 { 13665 int32_t ret_val = 0; 13666 int32_t orig_tlen = tlen; 13667 int32_t ourfinisacked = 0; 13668 13669 ctf_calc_rwin(so, tp); 13670 13671 if ((thflags & TH_RST) || 13672 (tp->t_fin_is_rst && (thflags & TH_FIN))) 13673 return (ctf_process_rst(m, th, so, tp)); 13674 /* 13675 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 13676 * synchronized state. 13677 */ 13678 if (thflags & TH_SYN) { 13679 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 13680 return (ret_val); 13681 } 13682 /* 13683 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 13684 * it's less than ts_recent, drop it. 13685 */ 13686 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 13687 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 13688 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 13689 return (ret_val); 13690 } 13691 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 13692 return (ret_val); 13693 } 13694 /* 13695 * If last ACK falls within this segment's sequence numbers, record 13696 * its timestamp. NOTE: 1) That the test incorporates suggestions 13697 * from the latest proposal of the tcplw@cray.com list (Braden 13698 * 1993/04/26). 2) That updating only on newer timestamps interferes 13699 * with our earlier PAWS tests, so this check should be solely 13700 * predicated on the sequence space of this segment. 3) That we 13701 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 13702 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 13703 * SEG.Len, This modified check allows us to overcome RFC1323's 13704 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 13705 * p.869. In such cases, we can still calculate the RTT correctly 13706 * when RCV.NXT == Last.ACK.Sent. 13707 */ 13708 if ((to->to_flags & TOF_TS) != 0 && 13709 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 13710 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 13711 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 13712 tp->ts_recent_age = tcp_ts_getticks(); 13713 tp->ts_recent = to->to_tsval; 13714 } 13715 /* 13716 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 13717 * is on (half-synchronized state), then queue data for later 13718 * processing; else drop segment and return. 
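 * (TF_NEEDSYN marks a half-synchronized connection, e.g. after a
 * simultaneous open, which is why such segments are still handed to
 * rack_process_data() instead of being dropped outright.)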
13719 */ 13720 if ((thflags & TH_ACK) == 0) { 13721 if (tp->t_flags & TF_NEEDSYN) { 13722 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13723 tiwin, thflags, nxt_pkt)); 13724 } else if (tp->t_flags & TF_ACKNOW) { 13725 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 13726 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 13727 return (ret_val); 13728 } else { 13729 ctf_do_drop(m, NULL); 13730 return (0); 13731 } 13732 } 13733 /* 13734 * Ack processing. 13735 */ 13736 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) { 13737 return (ret_val); 13738 } 13739 if (ourfinisacked) { 13740 tcp_twstart(tp); 13741 m_freem(m); 13742 return (1); 13743 } 13744 if (sbavail(&so->so_snd)) { 13745 if (ctf_progress_timeout_check(tp, true)) { 13746 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 13747 tp, tick, PROGRESS_DROP, __LINE__); 13748 ctf_do_dropwithreset_conn(m, tp, th, tlen); 13749 return (1); 13750 } 13751 } 13752 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13753 tiwin, thflags, nxt_pkt)); 13754 } 13755 13756 /* 13757 * Return value of 1, the TCB is unlocked and most 13758 * likely gone, return value of 0, the TCP is still 13759 * locked. 13760 */ 13761 static int 13762 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 13763 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 13764 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 13765 { 13766 int32_t ret_val = 0; 13767 int32_t orig_tlen; 13768 int32_t ourfinisacked = 0; 13769 13770 ctf_calc_rwin(so, tp); 13771 13772 if ((thflags & TH_RST) || 13773 (tp->t_fin_is_rst && (thflags & TH_FIN))) 13774 return (ctf_process_rst(m, th, so, tp)); 13775 /* 13776 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 13777 * synchronized state. 13778 */ 13779 if (thflags & TH_SYN) { 13780 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 13781 return (ret_val); 13782 } 13783 /* 13784 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 13785 * it's less than ts_recent, drop it. 13786 */ 13787 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 13788 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 13789 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 13790 return (ret_val); 13791 } 13792 orig_tlen = tlen; 13793 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 13794 return (ret_val); 13795 } 13796 /* 13797 * If last ACK falls within this segment's sequence numbers, record 13798 * its timestamp. NOTE: 1) That the test incorporates suggestions 13799 * from the latest proposal of the tcplw@cray.com list (Braden 13800 * 1993/04/26). 2) That updating only on newer timestamps interferes 13801 * with our earlier PAWS tests, so this check should be solely 13802 * predicated on the sequence space of this segment. 3) That we 13803 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 13804 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 13805 * SEG.Len, This modified check allows us to overcome RFC1323's 13806 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 13807 * p.869. In such cases, we can still calculate the RTT correctly 13808 * when RCV.NXT == Last.ACK.Sent. 
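 * As a concrete illustration: with Last.ACK.Sent = 1000 and an
 * in-window bare ACK carrying SEG.SEQ = 1000, SEG.LEN = 0, the
 * modified test 1000 <= 1000 <= 1000 + 0 holds and the timestamp is
 * taken, whereas the strict RFC 1323 form
 * Last.ACK.Sent < SEG.SEQ + SEG.LEN would reject it.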
13809 */ 13810 if ((to->to_flags & TOF_TS) != 0 && 13811 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 13812 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 13813 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 13814 tp->ts_recent_age = tcp_ts_getticks(); 13815 tp->ts_recent = to->to_tsval; 13816 } 13817 /* 13818 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 13819 * is on (half-synchronized state), then queue data for later 13820 * processing; else drop segment and return. 13821 */ 13822 if ((thflags & TH_ACK) == 0) { 13823 if (tp->t_flags & TF_NEEDSYN) { 13824 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13825 tiwin, thflags, nxt_pkt)); 13826 } else if (tp->t_flags & TF_ACKNOW) { 13827 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 13828 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 13829 return (ret_val); 13830 } else { 13831 ctf_do_drop(m, NULL); 13832 return (0); 13833 } 13834 } 13835 /* 13836 * case TCPS_LAST_ACK: Ack processing. 13837 */ 13838 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) { 13839 return (ret_val); 13840 } 13841 if (ourfinisacked) { 13842 tp = tcp_close(tp); 13843 ctf_do_drop(m, tp); 13844 return (1); 13845 } 13846 if (sbavail(&so->so_snd)) { 13847 if (ctf_progress_timeout_check(tp, true)) { 13848 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 13849 tp, tick, PROGRESS_DROP, __LINE__); 13850 ctf_do_dropwithreset_conn(m, tp, th, tlen); 13851 return (1); 13852 } 13853 } 13854 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13855 tiwin, thflags, nxt_pkt)); 13856 } 13857 13858 /* 13859 * Return value of 1, the TCB is unlocked and most 13860 * likely gone, return value of 0, the TCP is still 13861 * locked. 13862 */ 13863 static int 13864 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, 13865 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 13866 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 13867 { 13868 int32_t ret_val = 0; 13869 int32_t orig_tlen = tlen; 13870 int32_t ourfinisacked = 0; 13871 13872 ctf_calc_rwin(so, tp); 13873 13874 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 13875 if ((thflags & TH_RST) || 13876 (tp->t_fin_is_rst && (thflags & TH_FIN))) 13877 return (ctf_process_rst(m, th, so, tp)); 13878 /* 13879 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 13880 * synchronized state. 13881 */ 13882 if (thflags & TH_SYN) { 13883 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 13884 return (ret_val); 13885 } 13886 /* 13887 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 13888 * it's less than ts_recent, drop it. 13889 */ 13890 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 13891 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 13892 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 13893 return (ret_val); 13894 } 13895 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 13896 return (ret_val); 13897 } 13898 /* 13899 * If new data are received on a connection after the user processes 13900 * are gone, then RST the other end. 13901 */ 13902 if ((tp->t_flags & TF_CLOSED) && tlen && 13903 rack_check_data_after_close(m, tp, &tlen, th, so)) 13904 return (1); 13905 /* 13906 * If last ACK falls within this segment's sequence numbers, record 13907 * its timestamp. NOTE: 1) That the test incorporates suggestions 13908 * from the latest proposal of the tcplw@cray.com list (Braden 13909 * 1993/04/26). 
2) That updating only on newer timestamps interferes 13910 * with our earlier PAWS tests, so this check should be solely 13911 * predicated on the sequence space of this segment. 3) That we 13912 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 13913 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 13914 * SEG.Len, This modified check allows us to overcome RFC1323's 13915 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 13916 * p.869. In such cases, we can still calculate the RTT correctly 13917 * when RCV.NXT == Last.ACK.Sent. 13918 */ 13919 if ((to->to_flags & TOF_TS) != 0 && 13920 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 13921 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 13922 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 13923 tp->ts_recent_age = tcp_ts_getticks(); 13924 tp->ts_recent = to->to_tsval; 13925 } 13926 /* 13927 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 13928 * is on (half-synchronized state), then queue data for later 13929 * processing; else drop segment and return. 13930 */ 13931 if ((thflags & TH_ACK) == 0) { 13932 if (tp->t_flags & TF_NEEDSYN) { 13933 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13934 tiwin, thflags, nxt_pkt)); 13935 } else if (tp->t_flags & TF_ACKNOW) { 13936 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 13937 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 13938 return (ret_val); 13939 } else { 13940 ctf_do_drop(m, NULL); 13941 return (0); 13942 } 13943 } 13944 /* 13945 * Ack processing. 13946 */ 13947 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) { 13948 return (ret_val); 13949 } 13950 if (sbavail(&so->so_snd)) { 13951 if (ctf_progress_timeout_check(tp, true)) { 13952 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 13953 tp, tick, PROGRESS_DROP, __LINE__); 13954 ctf_do_dropwithreset_conn(m, tp, th, tlen); 13955 return (1); 13956 } 13957 } 13958 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13959 tiwin, thflags, nxt_pkt)); 13960 } 13961 13962 static void inline 13963 rack_clear_rate_sample(struct tcp_rack *rack) 13964 { 13965 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY; 13966 rack->r_ctl.rack_rs.rs_rtt_cnt = 0; 13967 rack->r_ctl.rack_rs.rs_rtt_tot = 0; 13968 } 13969 13970 static void 13971 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override) 13972 { 13973 uint64_t bw_est, rate_wanted; 13974 int chged = 0; 13975 uint32_t user_max, orig_min, orig_max; 13976 13977 #ifdef TCP_REQUEST_TRK 13978 if (rack->rc_hybrid_mode && 13979 (rack->r_ctl.rc_pace_max_segs != 0) && 13980 (rack_hybrid_allow_set_maxseg == 1) && 13981 (rack->r_ctl.rc_last_sft != NULL)) { 13982 rack->r_ctl.rc_last_sft->hybrid_flags &= ~TCP_HYBRID_PACING_SETMSS; 13983 return; 13984 } 13985 #endif 13986 orig_min = rack->r_ctl.rc_pace_min_segs; 13987 orig_max = rack->r_ctl.rc_pace_max_segs; 13988 user_max = ctf_fixed_maxseg(tp) * rack->rc_user_set_max_segs; 13989 if (ctf_fixed_maxseg(tp) != rack->r_ctl.rc_pace_min_segs) 13990 chged = 1; 13991 rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp); 13992 if (rack->use_fixed_rate || rack->rc_force_max_seg) { 13993 if (user_max != rack->r_ctl.rc_pace_max_segs) 13994 chged = 1; 13995 } 13996 if (rack->rc_force_max_seg) { 13997 rack->r_ctl.rc_pace_max_segs = user_max; 13998 } else if (rack->use_fixed_rate) { 13999 bw_est = rack_get_bw(rack); 14000 if ((rack->r_ctl.crte == NULL) || 14001 (bw_est != rack->r_ctl.crte->rate)) { 
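/*
 * Either there is no hardware rate entry, or our bandwidth
 * estimate no longer matches the rate that entry was built
 * for, so fall back to the user-configured fixed-rate
 * maximum instead of sizing bursts off the hardware rate.
 */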
14002 rack->r_ctl.rc_pace_max_segs = user_max; 14003 } else { 14004 /* We are pacing right at the hardware rate */ 14005 uint32_t segsiz, pace_one; 14006 14007 if (rack_pace_one_seg || 14008 (rack->r_ctl.rc_user_set_min_segs == 1)) 14009 pace_one = 1; 14010 else 14011 pace_one = 0; 14012 segsiz = min(ctf_fixed_maxseg(tp), 14013 rack->r_ctl.rc_pace_min_segs); 14014 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size_w_divisor( 14015 tp, bw_est, segsiz, pace_one, 14016 rack->r_ctl.crte, NULL, rack->r_ctl.pace_len_divisor); 14017 } 14018 } else if (rack->rc_always_pace) { 14019 if (rack->r_ctl.gp_bw || 14020 rack->r_ctl.init_rate) { 14021 /* We have a rate of some sort set */ 14022 uint32_t orig; 14023 14024 bw_est = rack_get_bw(rack); 14025 orig = rack->r_ctl.rc_pace_max_segs; 14026 if (fill_override) 14027 rate_wanted = *fill_override; 14028 else 14029 rate_wanted = rack_get_gp_est(rack); 14030 if (rate_wanted) { 14031 /* We have something */ 14032 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, 14033 rate_wanted, 14034 ctf_fixed_maxseg(rack->rc_tp)); 14035 } else 14036 rack->r_ctl.rc_pace_max_segs = rack->r_ctl.rc_pace_min_segs; 14037 if (orig != rack->r_ctl.rc_pace_max_segs) 14038 chged = 1; 14039 } else if ((rack->r_ctl.gp_bw == 0) && 14040 (rack->r_ctl.rc_pace_max_segs == 0)) { 14041 /* 14042 * If we have nothing limit us to bursting 14043 * out IW sized pieces. 14044 */ 14045 chged = 1; 14046 rack->r_ctl.rc_pace_max_segs = rc_init_window(rack); 14047 } 14048 } 14049 if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) { 14050 chged = 1; 14051 rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES; 14052 } 14053 if (chged) 14054 rack_log_type_pacing_sizes(tp, rack, orig_min, orig_max, line, 2); 14055 } 14056 14057 14058 static void 14059 rack_init_fsb_block(struct tcpcb *tp, struct tcp_rack *rack, int32_t flags) 14060 { 14061 #ifdef INET6 14062 struct ip6_hdr *ip6 = NULL; 14063 #endif 14064 #ifdef INET 14065 struct ip *ip = NULL; 14066 #endif 14067 struct udphdr *udp = NULL; 14068 14069 /* Ok lets fill in the fast block, it can only be used with no IP options! 
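 * The layout built below is [IPv4 or IPv6 header]
 * [optional UDP header when tp->t_port says we are tunneling]
 * [TCP header], with tcp_ip_hdr_len tracking the total so the
 * fast-send path can reuse the prebuilt headers as-is.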
*/ 14070 #ifdef INET6 14071 if (rack->r_is_v6) { 14072 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 14073 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 14074 if (tp->t_port) { 14075 rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr); 14076 udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr)); 14077 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 14078 udp->uh_dport = tp->t_port; 14079 rack->r_ctl.fsb.udp = udp; 14080 rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1); 14081 } else 14082 { 14083 rack->r_ctl.fsb.th = (struct tcphdr *)(ip6 + 1); 14084 rack->r_ctl.fsb.udp = NULL; 14085 } 14086 tcpip_fillheaders(rack->rc_inp, 14087 tp->t_port, 14088 ip6, rack->r_ctl.fsb.th); 14089 rack->r_ctl.fsb.hoplimit = in6_selecthlim(rack->rc_inp, NULL); 14090 } else 14091 #endif /* INET6 */ 14092 #ifdef INET 14093 { 14094 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr); 14095 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 14096 if (tp->t_port) { 14097 rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr); 14098 udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip)); 14099 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 14100 udp->uh_dport = tp->t_port; 14101 rack->r_ctl.fsb.udp = udp; 14102 rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1); 14103 } else 14104 { 14105 rack->r_ctl.fsb.udp = NULL; 14106 rack->r_ctl.fsb.th = (struct tcphdr *)(ip + 1); 14107 } 14108 tcpip_fillheaders(rack->rc_inp, 14109 tp->t_port, 14110 ip, rack->r_ctl.fsb.th); 14111 rack->r_ctl.fsb.hoplimit = tptoinpcb(tp)->inp_ip_ttl; 14112 } 14113 #endif 14114 rack->r_ctl.fsb.recwin = lmin(lmax(sbspace(&tptosocket(tp)->so_rcv), 0), 14115 (long)TCP_MAXWIN << tp->rcv_scale); 14116 rack->r_fsb_inited = 1; 14117 } 14118 14119 static int 14120 rack_init_fsb(struct tcpcb *tp, struct tcp_rack *rack) 14121 { 14122 /* 14123 * Allocate the larger of spaces V6 if available else just 14124 * V4 and include udphdr (overbook) 14125 */ 14126 #ifdef INET6 14127 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + sizeof(struct udphdr); 14128 #else 14129 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr) + sizeof(struct udphdr); 14130 #endif 14131 rack->r_ctl.fsb.tcp_ip_hdr = malloc(rack->r_ctl.fsb.tcp_ip_hdr_len, 14132 M_TCPFSB, M_NOWAIT|M_ZERO); 14133 if (rack->r_ctl.fsb.tcp_ip_hdr == NULL) { 14134 return (ENOMEM); 14135 } 14136 rack->r_fsb_inited = 0; 14137 return (0); 14138 } 14139 14140 static void 14141 rack_log_hystart_event(struct tcp_rack *rack, uint32_t high_seq, uint8_t mod) 14142 { 14143 /* 14144 * Types of logs (mod value) 14145 * 20 - Initial round setup 14146 * 21 - Rack declares a new round. 
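 * In either case flex1/flex2 carry the current round count and the
 * sequence that ends it, flex3 the high_seq argument and flex4
 * snd_max, so the two events are distinguished only by flex8 (mod).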
14147 */ 14148 struct tcpcb *tp; 14149 14150 tp = rack->rc_tp; 14151 if (tcp_bblogging_on(tp)) { 14152 union tcp_log_stackspecific log; 14153 struct timeval tv; 14154 14155 memset(&log, 0, sizeof(log)); 14156 log.u_bbr.flex1 = rack->r_ctl.current_round; 14157 log.u_bbr.flex2 = rack->r_ctl.roundends; 14158 log.u_bbr.flex3 = high_seq; 14159 log.u_bbr.flex4 = tp->snd_max; 14160 log.u_bbr.flex8 = mod; 14161 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 14162 log.u_bbr.cur_del_rate = rack->rc_tp->t_sndbytes; 14163 log.u_bbr.delRate = rack->rc_tp->t_snd_rxt_bytes; 14164 TCP_LOG_EVENTP(tp, NULL, 14165 &tptosocket(tp)->so_rcv, 14166 &tptosocket(tp)->so_snd, 14167 TCP_HYSTART, 0, 14168 0, &log, false, &tv); 14169 } 14170 } 14171 14172 static void 14173 rack_deferred_init(struct tcpcb *tp, struct tcp_rack *rack) 14174 { 14175 rack->rack_deferred_inited = 1; 14176 rack->r_ctl.roundends = tp->snd_max; 14177 rack->r_ctl.rc_high_rwnd = tp->snd_wnd; 14178 rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 14179 } 14180 14181 static void 14182 rack_init_retransmit_value(struct tcp_rack *rack, int ctl) 14183 { 14184 /* Retransmit bit controls. 14185 * 14186 * The setting of these values control one of 14187 * three settings you can have and dictate 14188 * how rack does retransmissions. Note this 14189 * is in *any* mode i.e. pacing on or off DGP 14190 * fixed rate pacing, or just bursting rack. 14191 * 14192 * 1 - Use full sized retransmits i.e. limit 14193 * the size to whatever the pace_max_segments 14194 * size is. 14195 * 14196 * 2 - Use pacer min granularity as a guide to 14197 * the size combined with the current calculated 14198 * goodput b/w measurement. So for example if 14199 * the goodput is measured at 20Mbps we would 14200 * calculate 8125 (pacer minimum 250usec in 14201 * that b/w) and then round it up to the next 14202 * MSS i.e. for 1448 mss 6 MSS or 8688 bytes. 14203 * 14204 * 0 - The rack default 1 MSS (anything not 0/1/2 14205 * fall here too if we are setting via rack_init()). 
14206 * 14207 */ 14208 if (ctl == 1) { 14209 rack->full_size_rxt = 1; 14210 rack->shape_rxt_to_pacing_min = 0; 14211 } else if (ctl == 2) { 14212 rack->full_size_rxt = 0; 14213 rack->shape_rxt_to_pacing_min = 1; 14214 } else { 14215 rack->full_size_rxt = 0; 14216 rack->shape_rxt_to_pacing_min = 0; 14217 } 14218 } 14219 14220 static void 14221 rack_log_chg_info(struct tcpcb *tp, struct tcp_rack *rack, uint8_t mod, 14222 uint32_t flex1, 14223 uint32_t flex2, 14224 uint32_t flex3) 14225 { 14226 if (tcp_bblogging_on(rack->rc_tp)) { 14227 union tcp_log_stackspecific log; 14228 struct timeval tv; 14229 14230 memset(&log, 0, sizeof(log)); 14231 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 14232 log.u_bbr.flex8 = mod; 14233 log.u_bbr.flex1 = flex1; 14234 log.u_bbr.flex2 = flex2; 14235 log.u_bbr.flex3 = flex3; 14236 tcp_log_event(tp, NULL, NULL, NULL, TCP_CHG_QUERY, 0, 14237 0, &log, false, NULL, __func__, __LINE__, &tv); 14238 } 14239 } 14240 14241 static int 14242 rack_chg_query(struct tcpcb *tp, struct tcp_query_resp *reqr) 14243 { 14244 struct tcp_rack *rack; 14245 struct rack_sendmap *rsm; 14246 int i; 14247 14248 14249 rack = (struct tcp_rack *)tp->t_fb_ptr; 14250 switch (reqr->req) { 14251 case TCP_QUERY_SENDMAP: 14252 if ((reqr->req_param == tp->snd_max) || 14253 (tp->snd_max == tp->snd_una)){ 14254 /* Unlikely */ 14255 return (0); 14256 } 14257 rsm = tqhash_find(rack->r_ctl.tqh, reqr->req_param); 14258 if (rsm == NULL) { 14259 /* Can't find that seq -- unlikely */ 14260 return (0); 14261 } 14262 reqr->sendmap_start = rsm->r_start; 14263 reqr->sendmap_end = rsm->r_end; 14264 reqr->sendmap_send_cnt = rsm->r_rtr_cnt; 14265 reqr->sendmap_fas = rsm->r_fas; 14266 if (reqr->sendmap_send_cnt > SNDMAP_NRTX) 14267 reqr->sendmap_send_cnt = SNDMAP_NRTX; 14268 for(i=0; i<reqr->sendmap_send_cnt; i++) 14269 reqr->sendmap_time[i] = rsm->r_tim_lastsent[i]; 14270 reqr->sendmap_ack_arrival = rsm->r_ack_arrival; 14271 reqr->sendmap_flags = rsm->r_flags & SNDMAP_MASK; 14272 reqr->sendmap_r_rtr_bytes = rsm->r_rtr_bytes; 14273 reqr->sendmap_dupacks = rsm->r_dupack; 14274 rack_log_chg_info(tp, rack, 1, 14275 rsm->r_start, 14276 rsm->r_end, 14277 rsm->r_flags); 14278 return(1); 14279 break; 14280 case TCP_QUERY_TIMERS_UP: 14281 if (rack->r_ctl.rc_hpts_flags == 0) { 14282 /* no timers up */ 14283 return (0); 14284 } 14285 reqr->timer_hpts_flags = rack->r_ctl.rc_hpts_flags; 14286 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 14287 reqr->timer_pacing_to = rack->r_ctl.rc_last_output_to; 14288 } 14289 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 14290 reqr->timer_timer_exp = rack->r_ctl.rc_timer_exp; 14291 } 14292 rack_log_chg_info(tp, rack, 2, 14293 rack->r_ctl.rc_hpts_flags, 14294 rack->r_ctl.rc_last_output_to, 14295 rack->r_ctl.rc_timer_exp); 14296 return (1); 14297 break; 14298 case TCP_QUERY_RACK_TIMES: 14299 /* Reordering items */ 14300 reqr->rack_num_dsacks = rack->r_ctl.num_dsack; 14301 reqr->rack_reorder_ts = rack->r_ctl.rc_reorder_ts; 14302 /* Timerstamps and timers */ 14303 reqr->rack_rxt_last_time = rack->r_ctl.rc_tlp_rxt_last_time; 14304 reqr->rack_min_rtt = rack->r_ctl.rc_rack_min_rtt; 14305 reqr->rack_rtt = rack->rc_rack_rtt; 14306 reqr->rack_tmit_time = rack->r_ctl.rc_rack_tmit_time; 14307 reqr->rack_srtt_measured = rack->rc_srtt_measure_made; 14308 /* PRR data */ 14309 reqr->rack_sacked = rack->r_ctl.rc_sacked; 14310 reqr->rack_holes_rxt = rack->r_ctl.rc_holes_rxt; 14311 reqr->rack_prr_delivered = rack->r_ctl.rc_prr_delivered; 14312 reqr->rack_prr_recovery_fs = rack->r_ctl.rc_prr_recovery_fs; 
14313 reqr->rack_prr_sndcnt = rack->r_ctl.rc_prr_sndcnt; 14314 reqr->rack_prr_out = rack->r_ctl.rc_prr_out; 14315 /* TLP and persists info */ 14316 reqr->rack_tlp_out = rack->rc_tlp_in_progress; 14317 reqr->rack_tlp_cnt_out = rack->r_ctl.rc_tlp_cnt_out; 14318 if (rack->rc_in_persist) { 14319 reqr->rack_time_went_idle = rack->r_ctl.rc_went_idle_time; 14320 reqr->rack_in_persist = 1; 14321 } else { 14322 reqr->rack_time_went_idle = 0; 14323 reqr->rack_in_persist = 0; 14324 } 14325 if (rack->r_wanted_output) 14326 reqr->rack_wanted_output = 1; 14327 else 14328 reqr->rack_wanted_output = 0; 14329 return (1); 14330 break; 14331 default: 14332 return (-EINVAL); 14333 } 14334 } 14335 14336 static void 14337 rack_switch_failed(struct tcpcb *tp) 14338 { 14339 /* 14340 * This method gets called if a stack switch was 14341 * attempted and it failed. We are left 14342 * but our hpts timers were stopped and we 14343 * need to validate time units and t_flags2. 14344 */ 14345 struct tcp_rack *rack; 14346 struct timeval tv; 14347 uint32_t cts; 14348 uint32_t toval; 14349 struct hpts_diag diag; 14350 14351 rack = (struct tcp_rack *)tp->t_fb_ptr; 14352 tcp_change_time_units(tp, TCP_TMR_GRANULARITY_USEC); 14353 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 14354 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 14355 else 14356 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ; 14357 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) 14358 tp->t_flags2 |= TF2_MBUF_ACKCMP; 14359 if (tp->t_in_hpts > IHPTS_NONE) { 14360 /* Strange */ 14361 return; 14362 } 14363 cts = tcp_get_usecs(&tv); 14364 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 14365 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) { 14366 toval = rack->r_ctl.rc_last_output_to - cts; 14367 } else { 14368 /* one slot please */ 14369 toval = HPTS_USECS_PER_SLOT; 14370 } 14371 } else if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 14372 if (TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { 14373 toval = rack->r_ctl.rc_timer_exp - cts; 14374 } else { 14375 /* one slot please */ 14376 toval = HPTS_USECS_PER_SLOT; 14377 } 14378 } else 14379 toval = HPTS_USECS_PER_SLOT; 14380 (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(toval), 14381 __LINE__, &diag); 14382 rack_log_hpts_diag(rack, cts, &diag, &tv); 14383 } 14384 14385 static int 14386 rack_init_outstanding(struct tcpcb *tp, struct tcp_rack *rack, uint32_t us_cts, void *ptr) 14387 { 14388 struct rack_sendmap *rsm, *ersm; 14389 int insret __diagused; 14390 /* 14391 * When initing outstanding, we must be quite careful 14392 * to not refer to tp->t_fb_ptr. This has the old rack 14393 * pointer in it, not the "new" one (when we are doing 14394 * a stack switch). 
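 * Two cases follow: if the previous stack exposes no tfb_chg_query
 * hook we synthesize a single sendmap entry covering everything
 * between snd_una and snd_max; otherwise we walk TCP_QUERY_SENDMAP
 * answers and rebuild one rack_sendmap per entry the old stack had.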
14395 */ 14396 14397 14398 if (tp->t_fb->tfb_chg_query == NULL) { 14399 /* Create a send map for the current outstanding data */ 14400 14401 rsm = rack_alloc(rack); 14402 if (rsm == NULL) { 14403 uma_zfree(rack_pcb_zone, ptr); 14404 return (ENOMEM); 14405 } 14406 rsm->r_no_rtt_allowed = 1; 14407 rsm->r_tim_lastsent[0] = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 14408 rsm->r_rtr_cnt = 1; 14409 rsm->r_rtr_bytes = 0; 14410 if (tp->t_flags & TF_SENTFIN) 14411 rsm->r_flags |= RACK_HAS_FIN; 14412 rsm->r_end = tp->snd_max; 14413 if (tp->snd_una == tp->iss) { 14414 /* The data space is one beyond snd_una */ 14415 rsm->r_flags |= RACK_HAS_SYN; 14416 rsm->r_start = tp->iss; 14417 rsm->r_end = rsm->r_start + (tp->snd_max - tp->snd_una); 14418 } else 14419 rsm->r_start = tp->snd_una; 14420 rsm->r_dupack = 0; 14421 if (rack->rc_inp->inp_socket->so_snd.sb_mb != NULL) { 14422 rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 0, &rsm->soff); 14423 if (rsm->m) { 14424 rsm->orig_m_len = rsm->m->m_len; 14425 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 14426 } else { 14427 rsm->orig_m_len = 0; 14428 rsm->orig_t_space = 0; 14429 } 14430 } else { 14431 /* 14432 * This can happen if we have a stand-alone FIN or 14433 * SYN. 14434 */ 14435 rsm->m = NULL; 14436 rsm->orig_m_len = 0; 14437 rsm->orig_t_space = 0; 14438 rsm->soff = 0; 14439 } 14440 #ifdef INVARIANTS 14441 if ((insret = tqhash_insert(rack->r_ctl.tqh, rsm)) != 0) { 14442 panic("Insert in tailq_hash fails ret:%d rack:%p rsm:%p", 14443 insret, rack, rsm); 14444 } 14445 #else 14446 (void)tqhash_insert(rack->r_ctl.tqh, rsm); 14447 #endif 14448 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 14449 rsm->r_in_tmap = 1; 14450 } else { 14451 /* We have a query mechanism, lets use it */ 14452 struct tcp_query_resp qr; 14453 int i; 14454 tcp_seq at; 14455 14456 at = tp->snd_una; 14457 while (at != tp->snd_max) { 14458 memset(&qr, 0, sizeof(qr)); 14459 qr.req = TCP_QUERY_SENDMAP; 14460 qr.req_param = at; 14461 if ((*tp->t_fb->tfb_chg_query)(tp, &qr) == 0) 14462 break; 14463 /* Move forward */ 14464 at = qr.sendmap_end; 14465 /* Now lets build the entry for this one */ 14466 rsm = rack_alloc(rack); 14467 if (rsm == NULL) { 14468 uma_zfree(rack_pcb_zone, ptr); 14469 return (ENOMEM); 14470 } 14471 memset(rsm, 0, sizeof(struct rack_sendmap)); 14472 /* Now configure the rsm and insert it */ 14473 rsm->r_dupack = qr.sendmap_dupacks; 14474 rsm->r_start = qr.sendmap_start; 14475 rsm->r_end = qr.sendmap_end; 14476 if (qr.sendmap_fas) 14477 rsm->r_fas = qr.sendmap_end; 14478 else 14479 rsm->r_fas = rsm->r_start - tp->snd_una; 14480 /* 14481 * We have carefully aligned the bits 14482 * so that all we have to do is copy over 14483 * the bits with the mask. 
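 * (Bits outside SNDMAP_MASK are stack private and are intentionally
 * not carried over from the old stack's entry.)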
14484 */ 14485 rsm->r_flags = qr.sendmap_flags & SNDMAP_MASK; 14486 rsm->r_rtr_bytes = qr.sendmap_r_rtr_bytes; 14487 rsm->r_rtr_cnt = qr.sendmap_send_cnt; 14488 rsm->r_ack_arrival = qr.sendmap_ack_arrival; 14489 for (i=0 ; i<rsm->r_rtr_cnt; i++) 14490 rsm->r_tim_lastsent[i] = qr.sendmap_time[i]; 14491 rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 14492 (rsm->r_start - tp->snd_una), &rsm->soff); 14493 if (rsm->m) { 14494 rsm->orig_m_len = rsm->m->m_len; 14495 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 14496 } else { 14497 rsm->orig_m_len = 0; 14498 rsm->orig_t_space = 0; 14499 } 14500 #ifdef INVARIANTS 14501 if ((insret = tqhash_insert(rack->r_ctl.tqh, rsm)) != 0) { 14502 panic("Insert in tailq_hash fails ret:%d rack:%p rsm:%p", 14503 insret, rack, rsm); 14504 } 14505 #else 14506 (void)tqhash_insert(rack->r_ctl.tqh, rsm); 14507 #endif 14508 if ((rsm->r_flags & RACK_ACKED) == 0) { 14509 TAILQ_FOREACH(ersm, &rack->r_ctl.rc_tmap, r_tnext) { 14510 if (ersm->r_tim_lastsent[(ersm->r_rtr_cnt-1)] > 14511 rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]) { 14512 /* 14513 * If the existing ersm was sent at 14514 * a later time than the new one, then 14515 * the new one should appear ahead of this 14516 * ersm. 14517 */ 14518 rsm->r_in_tmap = 1; 14519 TAILQ_INSERT_BEFORE(ersm, rsm, r_tnext); 14520 break; 14521 } 14522 } 14523 if (rsm->r_in_tmap == 0) { 14524 /* 14525 * Not found so shove it on the tail. 14526 */ 14527 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 14528 rsm->r_in_tmap = 1; 14529 } 14530 } else { 14531 if ((rack->r_ctl.rc_sacklast == NULL) || 14532 (SEQ_GT(rsm->r_end, rack->r_ctl.rc_sacklast->r_end))) { 14533 rack->r_ctl.rc_sacklast = rsm; 14534 } 14535 } 14536 rack_log_chg_info(tp, rack, 3, 14537 rsm->r_start, 14538 rsm->r_end, 14539 rsm->r_flags); 14540 } 14541 } 14542 return (0); 14543 } 14544 14545 14546 static int32_t 14547 rack_init(struct tcpcb *tp, void **ptr) 14548 { 14549 struct inpcb *inp = tptoinpcb(tp); 14550 struct tcp_rack *rack = NULL; 14551 uint32_t iwin, snt, us_cts; 14552 size_t sz; 14553 int err, no_query; 14554 14555 tcp_hpts_init(tp); 14556 14557 /* 14558 * First are we the initial or are we a switched stack? 14559 * If we are initing via tcp_newtcppcb the ptr passed 14560 * will be tp->t_fb_ptr. If its a stack switch that 14561 * has a previous stack we can query it will be a local 14562 * var that will in the end be set into t_fb_ptr. 14563 */ 14564 if (ptr == &tp->t_fb_ptr) 14565 no_query = 1; 14566 else 14567 no_query = 0; 14568 *ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT); 14569 if (*ptr == NULL) { 14570 /* 14571 * We need to allocate memory but cant. The INP and INP_INFO 14572 * locks and they are recursive (happens during setup. 
So a 14573 * scheme to drop the locks fails :( 14574 * 14575 */ 14576 return(ENOMEM); 14577 } 14578 memset(*ptr, 0, sizeof(struct tcp_rack)); 14579 rack = (struct tcp_rack *)*ptr; 14580 rack->r_ctl.tqh = malloc(sizeof(struct tailq_hash), M_TCPFSB, M_NOWAIT); 14581 if (rack->r_ctl.tqh == NULL) { 14582 uma_zfree(rack_pcb_zone, rack); 14583 return(ENOMEM); 14584 } 14585 tqhash_init(rack->r_ctl.tqh); 14586 TAILQ_INIT(&rack->r_ctl.rc_free); 14587 TAILQ_INIT(&rack->r_ctl.rc_tmap); 14588 rack->rc_tp = tp; 14589 rack->rc_inp = inp; 14590 /* Set the flag */ 14591 rack->r_is_v6 = (inp->inp_vflag & INP_IPV6) != 0; 14592 /* Probably not needed but lets be sure */ 14593 rack_clear_rate_sample(rack); 14594 /* 14595 * Save off the default values, socket options will poke 14596 * at these if pacing is not on or we have not yet 14597 * reached where pacing is on (gp_ready/fixed enabled). 14598 * When they get set into the CC module (when gp_ready 14599 * is enabled or we enable fixed) then we will set these 14600 * values into the CC and place in here the old values 14601 * so we have a restoral. Then we will set the flag 14602 * rc_pacing_cc_set. That way whenever we turn off pacing 14603 * or switch off this stack, we will know to go restore 14604 * the saved values. 14605 * 14606 * We specifically put into the beta the ecn value for pacing. 14607 */ 14608 rack->rc_new_rnd_needed = 1; 14609 rack->r_ctl.rc_split_limit = V_tcp_map_split_limit; 14610 /* We want abe like behavior as well */ 14611 14612 rack->r_ctl.rc_reorder_fade = rack_reorder_fade; 14613 rack->rc_allow_data_af_clo = rack_ignore_data_after_close; 14614 rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh; 14615 if (rack_fill_cw_state) 14616 rack->rc_pace_to_cwnd = 1; 14617 if (rack_pacing_min_seg) 14618 rack->r_ctl.rc_user_set_min_segs = rack_pacing_min_seg; 14619 if (use_rack_rr) 14620 rack->use_rack_rr = 1; 14621 if (rack_dnd_default) { 14622 rack->rc_pace_dnd = 1; 14623 } 14624 if (V_tcp_delack_enabled) 14625 tp->t_delayed_ack = 1; 14626 else 14627 tp->t_delayed_ack = 0; 14628 #ifdef TCP_ACCOUNTING 14629 if (rack_tcp_accounting) { 14630 tp->t_flags2 |= TF2_TCP_ACCOUNTING; 14631 } 14632 #endif 14633 rack->r_ctl.pcm_i.cnt_alloc = RACK_DEFAULT_PCM_ARRAY; 14634 sz = (sizeof(struct rack_pcm_stats) * rack->r_ctl.pcm_i.cnt_alloc); 14635 rack->r_ctl.pcm_s = malloc(sz,M_TCPPCM, M_NOWAIT); 14636 if (rack->r_ctl.pcm_s == NULL) { 14637 rack->r_ctl.pcm_i.cnt_alloc = 0; 14638 } 14639 rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_per_upper_bound_ss; 14640 rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_per_upper_bound_ca; 14641 if (rack_enable_shared_cwnd) 14642 rack->rack_enable_scwnd = 1; 14643 rack->r_ctl.pace_len_divisor = rack_default_pacing_divisor; 14644 rack->rc_user_set_max_segs = rack_hptsi_segments; 14645 rack->r_ctl.max_reduction = rack_max_reduce; 14646 rack->rc_force_max_seg = 0; 14647 TAILQ_INIT(&rack->r_ctl.opt_list); 14648 rack->r_ctl.rc_saved_beta = V_newreno_beta_ecn; 14649 rack->r_ctl.rc_saved_beta_ecn = V_newreno_beta_ecn; 14650 if (rack_hibeta_setting) { 14651 rack->rack_hibeta = 1; 14652 if ((rack_hibeta_setting >= 50) && 14653 (rack_hibeta_setting <= 100)) { 14654 rack->r_ctl.rc_saved_beta = rack_hibeta_setting; 14655 rack->r_ctl.saved_hibeta = rack_hibeta_setting; 14656 } 14657 } else { 14658 rack->r_ctl.saved_hibeta = 50; 14659 } 14660 /* 14661 * We initialize to all ones so we never match 0 14662 * just in case the client sends in 0, it hopefully 14663 * will never have all 1's in ms :-) 14664 */ 14665 
rack->r_ctl.last_tm_mark = 0xffffffffffffffff; 14666 rack->r_ctl.rc_reorder_shift = rack_reorder_thresh; 14667 rack->r_ctl.rc_pkt_delay = rack_pkt_delay; 14668 rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp; 14669 rack->r_ctl.rc_lowest_us_rtt = 0xffffffff; 14670 rack->r_ctl.rc_highest_us_rtt = 0; 14671 rack->r_ctl.bw_rate_cap = rack_bw_rate_cap; 14672 rack->pcm_enabled = rack_pcm_is_enabled; 14673 if (rack_fillcw_bw_cap) 14674 rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap; 14675 rack->r_ctl.timer_slop = TICKS_2_USEC(tcp_rexmit_slop); 14676 if (rack_use_cmp_acks) 14677 rack->r_use_cmp_ack = 1; 14678 if (rack_disable_prr) 14679 rack->rack_no_prr = 1; 14680 if (rack_gp_no_rec_chg) 14681 rack->rc_gp_no_rec_chg = 1; 14682 if (rack_pace_every_seg && tcp_can_enable_pacing()) { 14683 rack->r_ctl.pacing_method |= RACK_REG_PACING; 14684 rack->rc_always_pace = 1; 14685 if (rack->rack_hibeta) 14686 rack_set_cc_pacing(rack); 14687 } else 14688 rack->rc_always_pace = 0; 14689 if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack) 14690 rack->r_mbuf_queue = 1; 14691 else 14692 rack->r_mbuf_queue = 0; 14693 rack_set_pace_segments(tp, rack, __LINE__, NULL); 14694 if (rack_limits_scwnd) 14695 rack->r_limit_scw = 1; 14696 else 14697 rack->r_limit_scw = 0; 14698 rack_init_retransmit_value(rack, rack_rxt_controls); 14699 rack->rc_labc = V_tcp_abc_l_var; 14700 if (rack_honors_hpts_min_to) 14701 rack->r_use_hpts_min = 1; 14702 if (tp->snd_una != 0) { 14703 rack->rc_sendvars_notset = 0; 14704 /* 14705 * Make sure any TCP timers are not running. 14706 */ 14707 tcp_timer_stop(tp); 14708 } else { 14709 /* 14710 * Server side, we are called from the 14711 * syn-cache. This means none of the 14712 * snd_una/max are set yet so we have 14713 * to defer this until the first send. 
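 * (Hence rc_sendvars_notset below; the deferred path fills these in
 * once the first send gives us real sequence numbers.)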
14714 */ 14715 rack->rc_sendvars_notset = 1; 14716 } 14717 14718 rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method; 14719 rack->rack_tlp_threshold_use = rack_tlp_threshold_use; 14720 rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; 14721 rack->r_ctl.rc_min_to = rack_min_to; 14722 microuptime(&rack->r_ctl.act_rcv_time); 14723 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; 14724 rack->r_ctl.rack_per_of_gp_ss = rack_per_of_gp_ss; 14725 if (rack_hw_up_only) 14726 rack->r_up_only = 1; 14727 if (rack_do_dyn_mul) { 14728 /* When dynamic adjustment is on CA needs to start at 100% */ 14729 rack->rc_gp_dyn_mul = 1; 14730 if (rack_do_dyn_mul >= 100) 14731 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul; 14732 } else 14733 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca; 14734 rack->r_ctl.rack_per_of_gp_rec = rack_per_of_gp_rec; 14735 if (rack_timely_off) { 14736 rack->rc_skip_timely = 1; 14737 } 14738 if (rack->rc_skip_timely) { 14739 rack->r_ctl.rack_per_of_gp_rec = 90; 14740 rack->r_ctl.rack_per_of_gp_ca = 100; 14741 rack->r_ctl.rack_per_of_gp_ss = 250; 14742 } 14743 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 14744 rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_msec(&rack->r_ctl.act_rcv_time); 14745 rack->r_ctl.last_rcv_tstmp_for_rtt = tcp_tv_to_msec(&rack->r_ctl.act_rcv_time); 14746 14747 setup_time_filter_small(&rack->r_ctl.rc_gp_min_rtt, FILTER_TYPE_MIN, 14748 rack_probertt_filter_life); 14749 us_cts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time); 14750 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 14751 rack->r_ctl.rc_time_of_last_probertt = us_cts; 14752 rack->r_ctl.rc_went_idle_time = us_cts; 14753 rack->r_ctl.rc_time_probertt_starts = 0; 14754 14755 rack->r_ctl.gp_rnd_thresh = rack_rnd_cnt_req & 0xff; 14756 if (rack_rnd_cnt_req & 0x10000) 14757 rack->r_ctl.gate_to_fs = 1; 14758 rack->r_ctl.gp_gain_req = rack_gp_gain_req; 14759 if ((rack_rnd_cnt_req & 0x100) > 0) { 14760 14761 } 14762 if (rack_dsack_std_based & 0x1) { 14763 /* Basically this means all rack timers are at least (srtt + 1/4 srtt) */ 14764 rack->rc_rack_tmr_std_based = 1; 14765 } 14766 if (rack_dsack_std_based & 0x2) { 14767 /* Basically this means rack timers are extended based on dsack by up to (2 * srtt) */ 14768 rack->rc_rack_use_dsack = 1; 14769 } 14770 /* We require at least one measurement, even if the sysctl is 0 */ 14771 if (rack_req_measurements) 14772 rack->r_ctl.req_measurements = rack_req_measurements; 14773 else 14774 rack->r_ctl.req_measurements = 1; 14775 if (rack_enable_hw_pacing) 14776 rack->rack_hdw_pace_ena = 1; 14777 if (rack_hw_rate_caps) 14778 rack->r_rack_hw_rate_caps = 1; 14779 if (rack_non_rxt_use_cr) 14780 rack->rack_rec_nonrxt_use_cr = 1; 14781 /* Lets setup the fsb block */ 14782 err = rack_init_fsb(tp, rack); 14783 if (err) { 14784 uma_zfree(rack_pcb_zone, *ptr); 14785 *ptr = NULL; 14786 return (err); 14787 } 14788 if (rack_do_hystart) { 14789 tp->t_ccv.flags |= CCF_HYSTART_ALLOWED; 14790 if (rack_do_hystart > 1) 14791 tp->t_ccv.flags |= CCF_HYSTART_CAN_SH_CWND; 14792 if (rack_do_hystart > 2) 14793 tp->t_ccv.flags |= CCF_HYSTART_CONS_SSTH; 14794 } 14795 /* Log what we will do with queries */ 14796 rack_log_chg_info(tp, rack, 7, 14797 no_query, 0, 0); 14798 if (rack_def_profile) 14799 rack_set_profile(rack, rack_def_profile); 14800 /* Cancel the GP measurement in progress */ 14801 tp->t_flags &= ~TF_GPUTINPROG; 14802 if ((tp->t_state != TCPS_CLOSED) && 14803 (tp->t_state != TCPS_TIME_WAIT)) { 14804 /* 14805 * We are already open, we may 14806 * need to adjust a 
few things. 14807 */ 14808 if (SEQ_GT(tp->snd_max, tp->iss)) 14809 snt = tp->snd_max - tp->iss; 14810 else 14811 snt = 0; 14812 iwin = rc_init_window(rack); 14813 if ((snt < iwin) && 14814 (no_query == 1)) { 14815 /* We are not past the initial window 14816 * on the first init (i.e. a stack switch 14817 * has not yet occured) so we need to make 14818 * sure cwnd and ssthresh is correct. 14819 */ 14820 if (tp->snd_cwnd < iwin) 14821 tp->snd_cwnd = iwin; 14822 /* 14823 * If we are within the initial window 14824 * we want ssthresh to be unlimited. Setting 14825 * it to the rwnd (which the default stack does 14826 * and older racks) is not really a good idea 14827 * since we want to be in SS and grow both the 14828 * cwnd and the rwnd (via dynamic rwnd growth). If 14829 * we set it to the rwnd then as the peer grows its 14830 * rwnd we will be stuck in CA and never hit SS. 14831 * 14832 * Its far better to raise it up high (this takes the 14833 * risk that there as been a loss already, probably 14834 * we should have an indicator in all stacks of loss 14835 * but we don't), but considering the normal use this 14836 * is a risk worth taking. The consequences of not 14837 * hitting SS are far worse than going one more time 14838 * into it early on (before we have sent even a IW). 14839 * It is highly unlikely that we will have had a loss 14840 * before getting the IW out. 14841 */ 14842 tp->snd_ssthresh = 0xffffffff; 14843 } 14844 /* 14845 * Any init based on sequence numbers 14846 * should be done in the deferred init path 14847 * since we can be CLOSED and not have them 14848 * inited when rack_init() is called. We 14849 * are not closed so lets call it. 14850 */ 14851 rack_deferred_init(tp, rack); 14852 } 14853 if ((tp->t_state != TCPS_CLOSED) && 14854 (tp->t_state != TCPS_TIME_WAIT) && 14855 (no_query == 0) && 14856 (tp->snd_una != tp->snd_max)) { 14857 err = rack_init_outstanding(tp, rack, us_cts, *ptr); 14858 if (err) { 14859 *ptr = NULL; 14860 return(err); 14861 } 14862 } 14863 rack_stop_all_timers(tp, rack); 14864 /* Setup all the t_flags2 */ 14865 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 14866 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 14867 else 14868 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ; 14869 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) 14870 tp->t_flags2 |= TF2_MBUF_ACKCMP; 14871 /* 14872 * Timers in Rack are kept in microseconds so lets 14873 * convert any initial incoming variables 14874 * from ticks into usecs. Note that we 14875 * also change the values of t_srtt and t_rttvar, if 14876 * they are non-zero. They are kept with a 5 14877 * bit decimal so we have to carefully convert 14878 * these to get the full precision. 
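 * Rough example, assuming hz = 1000 (one tick = 1000 usec) and the
 * usual 5-bit fraction (a scale of 32): a stored t_srtt of 160
 * stands for 160 / 32 = 5 ticks, i.e. roughly 5000 usec once
 * converted below.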
14879 */ 14880 rack_convert_rtts(tp); 14881 rack_log_hystart_event(rack, rack->r_ctl.roundends, 20); 14882 if ((tptoinpcb(tp)->inp_flags & INP_DROPPED) == 0) { 14883 /* We do not start any timers on DROPPED connections */ 14884 if (tp->t_fb->tfb_chg_query == NULL) { 14885 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); 14886 } else { 14887 struct tcp_query_resp qr; 14888 int ret; 14889 14890 memset(&qr, 0, sizeof(qr)); 14891 14892 /* Get the misc time stamps and such for rack */ 14893 qr.req = TCP_QUERY_RACK_TIMES; 14894 ret = (*tp->t_fb->tfb_chg_query)(tp, &qr); 14895 if (ret == 1) { 14896 rack->r_ctl.rc_reorder_ts = qr.rack_reorder_ts; 14897 rack->r_ctl.num_dsack = qr.rack_num_dsacks; 14898 rack->r_ctl.rc_tlp_rxt_last_time = qr.rack_rxt_last_time; 14899 rack->r_ctl.rc_rack_min_rtt = qr.rack_min_rtt; 14900 rack->rc_rack_rtt = qr.rack_rtt; 14901 rack->r_ctl.rc_rack_tmit_time = qr.rack_tmit_time; 14902 rack->r_ctl.rc_sacked = qr.rack_sacked; 14903 rack->r_ctl.rc_holes_rxt = qr.rack_holes_rxt; 14904 rack->r_ctl.rc_prr_delivered = qr.rack_prr_delivered; 14905 rack->r_ctl.rc_prr_recovery_fs = qr.rack_prr_recovery_fs; 14906 rack->r_ctl.rc_prr_sndcnt = qr.rack_prr_sndcnt; 14907 rack->r_ctl.rc_prr_out = qr.rack_prr_out; 14908 if (qr.rack_tlp_out) { 14909 rack->rc_tlp_in_progress = 1; 14910 rack->r_ctl.rc_tlp_cnt_out = qr.rack_tlp_cnt_out; 14911 } else { 14912 rack->rc_tlp_in_progress = 0; 14913 rack->r_ctl.rc_tlp_cnt_out = 0; 14914 } 14915 if (qr.rack_srtt_measured) 14916 rack->rc_srtt_measure_made = 1; 14917 if (qr.rack_in_persist == 1) { 14918 rack->r_ctl.rc_went_idle_time = qr.rack_time_went_idle; 14919 #ifdef NETFLIX_SHARED_CWND 14920 if (rack->r_ctl.rc_scw) { 14921 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 14922 rack->rack_scwnd_is_idle = 1; 14923 } 14924 #endif 14925 rack->r_ctl.persist_lost_ends = 0; 14926 rack->probe_not_answered = 0; 14927 rack->forced_ack = 0; 14928 tp->t_rxtshift = 0; 14929 rack->rc_in_persist = 1; 14930 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 14931 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 14932 } 14933 if (qr.rack_wanted_output) 14934 rack->r_wanted_output = 1; 14935 rack_log_chg_info(tp, rack, 6, 14936 qr.rack_min_rtt, 14937 qr.rack_rtt, 14938 qr.rack_reorder_ts); 14939 } 14940 /* Get the old stack timers */ 14941 qr.req_param = 0; 14942 qr.req = TCP_QUERY_TIMERS_UP; 14943 ret = (*tp->t_fb->tfb_chg_query)(tp, &qr); 14944 if (ret) { 14945 /* 14946 * non-zero return means we have a timer('s) 14947 * to start. Zero means no timer (no keepalive 14948 * I suppose). 
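 * Two kinds of timers can come back from the old stack: a pacing
 * output deadline (PACE_PKT_OUTPUT) and a protocol timer from the
 * PACE_TMR_MASK set. Either one that has already expired is treated
 * below as if it were one slot away so we still get onto the hpts.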
14949 */ 14950 uint32_t tov = 0; 14951 14952 rack->r_ctl.rc_hpts_flags = qr.timer_hpts_flags; 14953 if (qr.timer_hpts_flags & PACE_PKT_OUTPUT) { 14954 rack->r_ctl.rc_last_output_to = qr.timer_pacing_to; 14955 if (TSTMP_GT(qr.timer_pacing_to, us_cts)) 14956 tov = qr.timer_pacing_to - us_cts; 14957 else 14958 tov = HPTS_USECS_PER_SLOT; 14959 } 14960 if (qr.timer_hpts_flags & PACE_TMR_MASK) { 14961 rack->r_ctl.rc_timer_exp = qr.timer_timer_exp; 14962 if (tov == 0) { 14963 if (TSTMP_GT(qr.timer_timer_exp, us_cts)) 14964 tov = qr.timer_timer_exp - us_cts; 14965 else 14966 tov = HPTS_USECS_PER_SLOT; 14967 } 14968 } 14969 rack_log_chg_info(tp, rack, 4, 14970 rack->r_ctl.rc_hpts_flags, 14971 rack->r_ctl.rc_last_output_to, 14972 rack->r_ctl.rc_timer_exp); 14973 if (tov) { 14974 struct hpts_diag diag; 14975 14976 (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(tov), 14977 __LINE__, &diag); 14978 rack_log_hpts_diag(rack, us_cts, &diag, &rack->r_ctl.act_rcv_time); 14979 } 14980 } 14981 } 14982 rack_log_rtt_shrinks(rack, us_cts, tp->t_rxtcur, 14983 __LINE__, RACK_RTTS_INIT); 14984 } 14985 return (0); 14986 } 14987 14988 static int 14989 rack_handoff_ok(struct tcpcb *tp) 14990 { 14991 if ((tp->t_state == TCPS_CLOSED) || 14992 (tp->t_state == TCPS_LISTEN)) { 14993 /* Sure no problem though it may not stick */ 14994 return (0); 14995 } 14996 if ((tp->t_state == TCPS_SYN_SENT) || 14997 (tp->t_state == TCPS_SYN_RECEIVED)) { 14998 /* 14999 * We really don't know if you support sack, 15000 * you have to get to ESTAB or beyond to tell. 15001 */ 15002 return (EAGAIN); 15003 } 15004 if ((tp->t_flags & TF_SENTFIN) && ((tp->snd_max - tp->snd_una) > 1)) { 15005 /* 15006 * Rack will only send a FIN after all data is acknowledged. 15007 * So in this case we have more data outstanding. We can't 15008 * switch stacks until either all data and only the FIN 15009 * is left (in which case rack_init() now knows how 15010 * to deal with that) <or> all is acknowledged and we 15011 * are only left with incoming data, though why you 15012 * would want to switch to rack after all data is acknowledged 15013 * I have no idea (rrs)! 15014 */ 15015 return (EAGAIN); 15016 } 15017 if ((tp->t_flags & TF_SACK_PERMIT) || rack_sack_not_required){ 15018 return (0); 15019 } 15020 /* 15021 * If we reach here we don't do SACK on this connection so we can 15022 * never do rack. 
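 * A non-zero return here just refuses the hand off to rack; the
 * connection itself carries on with whatever stack it is using now.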
15023 */ 15024 return (EINVAL); 15025 } 15026 15027 static void 15028 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) 15029 { 15030 15031 if (tp->t_fb_ptr) { 15032 uint32_t cnt_free = 0; 15033 struct tcp_rack *rack; 15034 struct rack_sendmap *rsm; 15035 15036 tcp_handle_orphaned_packets(tp); 15037 tp->t_flags &= ~TF_FORCEDATA; 15038 rack = (struct tcp_rack *)tp->t_fb_ptr; 15039 rack_log_pacing_delay_calc(rack, 15040 0, 15041 0, 15042 0, 15043 rack_get_gp_est(rack), /* delRate */ 15044 rack_get_lt_bw(rack), /* rttProp */ 15045 20, __LINE__, NULL, 0); 15046 #ifdef NETFLIX_SHARED_CWND 15047 if (rack->r_ctl.rc_scw) { 15048 uint32_t limit; 15049 15050 if (rack->r_limit_scw) 15051 limit = max(1, rack->r_ctl.rc_lowest_us_rtt); 15052 else 15053 limit = 0; 15054 tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw, 15055 rack->r_ctl.rc_scw_index, 15056 limit); 15057 rack->r_ctl.rc_scw = NULL; 15058 } 15059 #endif 15060 if (rack->r_ctl.fsb.tcp_ip_hdr) { 15061 free(rack->r_ctl.fsb.tcp_ip_hdr, M_TCPFSB); 15062 rack->r_ctl.fsb.tcp_ip_hdr = NULL; 15063 rack->r_ctl.fsb.th = NULL; 15064 } 15065 if (rack->rc_always_pace == 1) { 15066 rack_remove_pacing(rack); 15067 } 15068 /* Clean up any options if they were not applied */ 15069 while (!TAILQ_EMPTY(&rack->r_ctl.opt_list)) { 15070 struct deferred_opt_list *dol; 15071 15072 dol = TAILQ_FIRST(&rack->r_ctl.opt_list); 15073 TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next); 15074 free(dol, M_TCPDO); 15075 } 15076 /* rack does not use force data but other stacks may clear it */ 15077 if (rack->r_ctl.crte != NULL) { 15078 tcp_rel_pacing_rate(rack->r_ctl.crte, tp); 15079 rack->rack_hdrw_pacing = 0; 15080 rack->r_ctl.crte = NULL; 15081 } 15082 #ifdef TCP_BLACKBOX 15083 tcp_log_flowend(tp); 15084 #endif 15085 /* 15086 * Lets take a different approach to purging just 15087 * get each one and free it like a cum-ack would and 15088 * not use a foreach loop. 
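 * Pulling tqhash_min() each pass mirrors what cum-ack processing does
 * and means we never walk a structure we are mutating as we free.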
15089 */ 15090 rsm = tqhash_min(rack->r_ctl.tqh); 15091 while (rsm) { 15092 tqhash_remove(rack->r_ctl.tqh, rsm, REMOVE_TYPE_CUMACK); 15093 rack->r_ctl.rc_num_maps_alloced--; 15094 uma_zfree(rack_zone, rsm); 15095 rsm = tqhash_min(rack->r_ctl.tqh); 15096 } 15097 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 15098 while (rsm) { 15099 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 15100 rack->r_ctl.rc_num_maps_alloced--; 15101 rack->rc_free_cnt--; 15102 cnt_free++; 15103 uma_zfree(rack_zone, rsm); 15104 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 15105 } 15106 if (rack->r_ctl.pcm_s != NULL) { 15107 free(rack->r_ctl.pcm_s, M_TCPPCM); 15108 rack->r_ctl.pcm_s = NULL; 15109 rack->r_ctl.pcm_i.cnt_alloc = 0; 15110 rack->r_ctl.pcm_i.cnt = 0; 15111 } 15112 if ((rack->r_ctl.rc_num_maps_alloced > 0) && 15113 (tcp_bblogging_on(tp))) { 15114 union tcp_log_stackspecific log; 15115 struct timeval tv; 15116 15117 memset(&log, 0, sizeof(log)); 15118 log.u_bbr.flex8 = 10; 15119 log.u_bbr.flex1 = rack->r_ctl.rc_num_maps_alloced; 15120 log.u_bbr.flex2 = rack->rc_free_cnt; 15121 log.u_bbr.flex3 = cnt_free; 15122 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 15123 rsm = tqhash_min(rack->r_ctl.tqh); 15124 log.u_bbr.delRate = (uintptr_t)rsm; 15125 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 15126 log.u_bbr.cur_del_rate = (uintptr_t)rsm; 15127 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 15128 log.u_bbr.pkt_epoch = __LINE__; 15129 (void)tcp_log_event(tp, NULL, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, 15130 0, &log, false, NULL, NULL, 0, &tv); 15131 } 15132 KASSERT((rack->r_ctl.rc_num_maps_alloced == 0), 15133 ("rack:%p num_aloc:%u after freeing all?", 15134 rack, 15135 rack->r_ctl.rc_num_maps_alloced)); 15136 rack->rc_free_cnt = 0; 15137 free(rack->r_ctl.tqh, M_TCPFSB); 15138 rack->r_ctl.tqh = NULL; 15139 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 15140 tp->t_fb_ptr = NULL; 15141 } 15142 /* Make sure snd_nxt is correctly set */ 15143 tp->snd_nxt = tp->snd_max; 15144 } 15145 15146 static void 15147 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) 15148 { 15149 if ((rack->r_state == TCPS_CLOSED) && (tp->t_state != TCPS_CLOSED)) { 15150 rack->r_is_v6 = (tptoinpcb(tp)->inp_vflag & INP_IPV6) != 0; 15151 } 15152 switch (tp->t_state) { 15153 case TCPS_SYN_SENT: 15154 rack->r_state = TCPS_SYN_SENT; 15155 rack->r_substate = rack_do_syn_sent; 15156 break; 15157 case TCPS_SYN_RECEIVED: 15158 rack->r_state = TCPS_SYN_RECEIVED; 15159 rack->r_substate = rack_do_syn_recv; 15160 break; 15161 case TCPS_ESTABLISHED: 15162 rack_set_pace_segments(tp, rack, __LINE__, NULL); 15163 rack->r_state = TCPS_ESTABLISHED; 15164 rack->r_substate = rack_do_established; 15165 break; 15166 case TCPS_CLOSE_WAIT: 15167 rack->r_state = TCPS_CLOSE_WAIT; 15168 rack->r_substate = rack_do_close_wait; 15169 break; 15170 case TCPS_FIN_WAIT_1: 15171 rack_set_pace_segments(tp, rack, __LINE__, NULL); 15172 rack->r_state = TCPS_FIN_WAIT_1; 15173 rack->r_substate = rack_do_fin_wait_1; 15174 break; 15175 case TCPS_CLOSING: 15176 rack_set_pace_segments(tp, rack, __LINE__, NULL); 15177 rack->r_state = TCPS_CLOSING; 15178 rack->r_substate = rack_do_closing; 15179 break; 15180 case TCPS_LAST_ACK: 15181 rack_set_pace_segments(tp, rack, __LINE__, NULL); 15182 rack->r_state = TCPS_LAST_ACK; 15183 rack->r_substate = rack_do_lastack; 15184 break; 15185 case TCPS_FIN_WAIT_2: 15186 rack->r_state = TCPS_FIN_WAIT_2; 15187 rack->r_substate = rack_do_fin_wait_2; 15188 break; 15189 case TCPS_LISTEN: 15190 case TCPS_CLOSED: 15191 case TCPS_TIME_WAIT: 15192 default: 
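/* LISTEN, CLOSED and TIME_WAIT have no rack substate handler; leave r_state/r_substate as they were. */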
15193 break; 15194 }; 15195 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) 15196 rack->rc_tp->t_flags2 |= TF2_MBUF_ACKCMP; 15197 15198 } 15199 15200 static void 15201 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) 15202 { 15203 /* 15204 * We received an ack, and then did not 15205 * call send or were bounced out due to the 15206 * hpts was running. Now a timer is up as well, is 15207 * it the right timer? 15208 */ 15209 struct rack_sendmap *rsm; 15210 int tmr_up; 15211 15212 tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 15213 if (tcp_in_hpts(rack->rc_tp) == 0) { 15214 /* 15215 * Ok we probably need some timer up, but no 15216 * matter what the mask we are not in hpts. We 15217 * may have received an old ack and thus did nothing. 15218 */ 15219 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 15220 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); 15221 return; 15222 } 15223 if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT)) 15224 return; 15225 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 15226 if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) && 15227 (tmr_up == PACE_TMR_RXT)) { 15228 /* Should be an RXT */ 15229 return; 15230 } 15231 if (rsm == NULL) { 15232 /* Nothing outstanding? */ 15233 if (tp->t_flags & TF_DELACK) { 15234 if (tmr_up == PACE_TMR_DELACK) 15235 /* We are supposed to have delayed ack up and we do */ 15236 return; 15237 } else if (((V_tcp_always_keepalive || 15238 rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 15239 (tp->t_state <= TCPS_CLOSING)) && 15240 (tmr_up == PACE_TMR_KEEP) && 15241 (tp->snd_max == tp->snd_una)) { 15242 /* We should have keep alive up and we do */ 15243 return; 15244 } 15245 } 15246 if (SEQ_GT(tp->snd_max, tp->snd_una) && 15247 ((tmr_up == PACE_TMR_TLP) || 15248 (tmr_up == PACE_TMR_RACK) || 15249 (tmr_up == PACE_TMR_RXT))) { 15250 /* 15251 * Either a Rack, TLP or RXT is fine if we 15252 * have outstanding data. 15253 */ 15254 return; 15255 } else if (tmr_up == PACE_TMR_DELACK) { 15256 /* 15257 * If the delayed ack was going to go off 15258 * before the rtx/tlp/rack timer were going to 15259 * expire, then that would be the timer in control. 15260 * Note we don't check the time here trusting the 15261 * code is correct. 15262 */ 15263 return; 15264 } 15265 /* 15266 * Ok the timer originally started is not what we want now. 15267 * We will force the hpts to be stopped if any, and restart 15268 * with the slot set to what was in the saved slot. 
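 * If a pacing output deadline was pending and is still in the future,
 * the unspent time is banked as "early" credit (rc_agg_early) before
 * we pull ourselves off the hpts.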
15269 */ 15270 if (tcp_in_hpts(rack->rc_tp)) { 15271 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 15272 uint32_t us_cts; 15273 15274 us_cts = tcp_get_usecs(NULL); 15275 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 15276 rack->r_early = 1; 15277 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); 15278 } 15279 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 15280 } 15281 tcp_hpts_remove(rack->rc_tp); 15282 } 15283 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 15284 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); 15285 } 15286 15287 15288 static void 15289 rack_do_win_updates(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tiwin, uint32_t seq, uint32_t ack, uint32_t cts) 15290 { 15291 if ((SEQ_LT(tp->snd_wl1, seq) || 15292 (tp->snd_wl1 == seq && (SEQ_LT(tp->snd_wl2, ack) || 15293 (tp->snd_wl2 == ack && tiwin > tp->snd_wnd))))) { 15294 /* keep track of pure window updates */ 15295 if ((tp->snd_wl2 == ack) && (tiwin > tp->snd_wnd)) 15296 KMOD_TCPSTAT_INC(tcps_rcvwinupd); 15297 tp->snd_wnd = tiwin; 15298 rack_validate_fo_sendwin_up(tp, rack); 15299 tp->snd_wl1 = seq; 15300 tp->snd_wl2 = ack; 15301 if (tp->snd_wnd > tp->max_sndwnd) 15302 tp->max_sndwnd = tp->snd_wnd; 15303 rack->r_wanted_output = 1; 15304 } else if ((tp->snd_wl2 == ack) && (tiwin < tp->snd_wnd)) { 15305 tp->snd_wnd = tiwin; 15306 rack_validate_fo_sendwin_up(tp, rack); 15307 tp->snd_wl1 = seq; 15308 tp->snd_wl2 = ack; 15309 } else { 15310 /* Not a valid win update */ 15311 return; 15312 } 15313 if (tp->snd_wnd > tp->max_sndwnd) 15314 tp->max_sndwnd = tp->snd_wnd; 15315 /* Do we exit persists? */ 15316 if ((rack->rc_in_persist != 0) && 15317 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 15318 rack->r_ctl.rc_pace_min_segs))) { 15319 rack_exit_persist(tp, rack, cts); 15320 } 15321 /* Do we enter persists? */ 15322 if ((rack->rc_in_persist == 0) && 15323 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 15324 TCPS_HAVEESTABLISHED(tp->t_state) && 15325 ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) && 15326 sbavail(&tptosocket(tp)->so_snd) && 15327 (sbavail(&tptosocket(tp)->so_snd) > tp->snd_wnd)) { 15328 /* 15329 * Here the rwnd is less than 15330 * the pacing size, we are established, 15331 * nothing is outstanding, and there is 15332 * data to send. Enter persists. 
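 * (We also allow entry when the peer has collapsed the window on us,
 * even though data is still outstanding in that case.)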
15333 */ 15334 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, ack); 15335 } 15336 } 15337 15338 static void 15339 rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent *ae, int ackval, uint32_t high_seq) 15340 { 15341 15342 if (tcp_bblogging_on(rack->rc_tp)) { 15343 struct inpcb *inp = tptoinpcb(tp); 15344 union tcp_log_stackspecific log; 15345 struct timeval ltv; 15346 char tcp_hdr_buf[60]; 15347 struct tcphdr *th; 15348 struct timespec ts; 15349 uint32_t orig_snd_una; 15350 uint8_t xx = 0; 15351 15352 #ifdef TCP_REQUEST_TRK 15353 struct tcp_sendfile_track *tcp_req; 15354 15355 if (SEQ_GT(ae->ack, tp->snd_una)) { 15356 tcp_req = tcp_req_find_req_for_seq(tp, (ae->ack-1)); 15357 } else { 15358 tcp_req = tcp_req_find_req_for_seq(tp, ae->ack); 15359 } 15360 #endif 15361 memset(&log, 0, sizeof(log)); 15362 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 15363 if (rack->rack_no_prr == 0) 15364 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 15365 else 15366 log.u_bbr.flex1 = 0; 15367 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 15368 log.u_bbr.use_lt_bw <<= 1; 15369 log.u_bbr.use_lt_bw |= rack->r_might_revert; 15370 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; 15371 log.u_bbr.bbr_state = rack->rc_free_cnt; 15372 log.u_bbr.inflight = ctf_flight_size(tp, rack->r_ctl.rc_sacked); 15373 log.u_bbr.pkts_out = tp->t_maxseg; 15374 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 15375 log.u_bbr.flex7 = 1; 15376 log.u_bbr.lost = ae->flags; 15377 log.u_bbr.cwnd_gain = ackval; 15378 log.u_bbr.pacing_gain = 0x2; 15379 if (ae->flags & TSTMP_HDWR) { 15380 /* Record the hardware timestamp if present */ 15381 log.u_bbr.flex3 = M_TSTMP; 15382 ts.tv_sec = ae->timestamp / 1000000000; 15383 ts.tv_nsec = ae->timestamp % 1000000000; 15384 ltv.tv_sec = ts.tv_sec; 15385 ltv.tv_usec = ts.tv_nsec / 1000; 15386 log.u_bbr.lt_epoch = tcp_tv_to_usec(<v); 15387 } else if (ae->flags & TSTMP_LRO) { 15388 /* Record the LRO the arrival timestamp */ 15389 log.u_bbr.flex3 = M_TSTMP_LRO; 15390 ts.tv_sec = ae->timestamp / 1000000000; 15391 ts.tv_nsec = ae->timestamp % 1000000000; 15392 ltv.tv_sec = ts.tv_sec; 15393 ltv.tv_usec = ts.tv_nsec / 1000; 15394 log.u_bbr.flex5 = tcp_tv_to_usec(<v); 15395 } 15396 log.u_bbr.timeStamp = tcp_get_usecs(<v); 15397 /* Log the rcv time */ 15398 log.u_bbr.delRate = ae->timestamp; 15399 #ifdef TCP_REQUEST_TRK 15400 log.u_bbr.applimited = tp->t_tcpreq_closed; 15401 log.u_bbr.applimited <<= 8; 15402 log.u_bbr.applimited |= tp->t_tcpreq_open; 15403 log.u_bbr.applimited <<= 8; 15404 log.u_bbr.applimited |= tp->t_tcpreq_req; 15405 if (tcp_req) { 15406 /* Copy out any client req info */ 15407 /* seconds */ 15408 log.u_bbr.pkt_epoch = (tcp_req->localtime / HPTS_USEC_IN_SEC); 15409 /* useconds */ 15410 log.u_bbr.delivered = (tcp_req->localtime % HPTS_USEC_IN_SEC); 15411 log.u_bbr.rttProp = tcp_req->timestamp; 15412 log.u_bbr.cur_del_rate = tcp_req->start; 15413 if (tcp_req->flags & TCP_TRK_TRACK_FLG_OPEN) { 15414 log.u_bbr.flex8 |= 1; 15415 } else { 15416 log.u_bbr.flex8 |= 2; 15417 log.u_bbr.bw_inuse = tcp_req->end; 15418 } 15419 log.u_bbr.flex6 = tcp_req->start_seq; 15420 if (tcp_req->flags & TCP_TRK_TRACK_FLG_COMP) { 15421 log.u_bbr.flex8 |= 4; 15422 log.u_bbr.epoch = tcp_req->end_seq; 15423 } 15424 } 15425 #endif 15426 memset(tcp_hdr_buf, 0, sizeof(tcp_hdr_buf)); 15427 th = (struct tcphdr *)tcp_hdr_buf; 15428 th->th_seq = ae->seq; 15429 th->th_ack = ae->ack; 15430 th->th_win = ae->win; 15431 /* Now fill in the ports */ 15432 th->th_sport = inp->inp_fport; 15433 th->th_dport = 
inp->inp_lport; 15434 tcp_set_flags(th, ae->flags); 15435 /* Now do we have a timestamp option? */ 15436 if (ae->flags & HAS_TSTMP) { 15437 u_char *cp; 15438 uint32_t val; 15439 15440 th->th_off = ((sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA) >> 2); 15441 cp = (u_char *)(th + 1); 15442 *cp = TCPOPT_NOP; 15443 cp++; 15444 *cp = TCPOPT_NOP; 15445 cp++; 15446 *cp = TCPOPT_TIMESTAMP; 15447 cp++; 15448 *cp = TCPOLEN_TIMESTAMP; 15449 cp++; 15450 val = htonl(ae->ts_value); 15451 bcopy((char *)&val, 15452 (char *)cp, sizeof(uint32_t)); 15453 val = htonl(ae->ts_echo); 15454 bcopy((char *)&val, 15455 (char *)(cp + 4), sizeof(uint32_t)); 15456 } else 15457 th->th_off = (sizeof(struct tcphdr) >> 2); 15458 15459 /* 15460 * For sane logging we need to play a little trick. 15461 * If the ack were fully processed we would have moved 15462 * snd_una to high_seq, but since compressed acks are 15463 * processed in two phases, at this point (logging) snd_una 15464 * won't be advanced. So we would see multiple acks showing 15465 * the advancement. We can prevent that by "pretending" that 15466 * snd_una was advanced and then un-advancing it so that the 15467 * logging code has the right value for tlb_snd_una. 15468 */ 15469 if (tp->snd_una != high_seq) { 15470 orig_snd_una = tp->snd_una; 15471 tp->snd_una = high_seq; 15472 xx = 1; 15473 } else 15474 xx = 0; 15475 TCP_LOG_EVENTP(tp, th, 15476 &tptosocket(tp)->so_rcv, 15477 &tptosocket(tp)->so_snd, TCP_LOG_IN, 0, 15478 0, &log, true, &ltv); 15479 if (xx) { 15480 tp->snd_una = orig_snd_una; 15481 } 15482 } 15483 15484 } 15485 15486 static void 15487 rack_handle_probe_response(struct tcp_rack *rack, uint32_t tiwin, uint32_t us_cts) 15488 { 15489 uint32_t us_rtt; 15490 /* 15491 * A persist or keep-alive was forced out, update our 15492 * min rtt time. Note we now worry about lost responses. 15493 * When a subsequent keep-alive or persist times out 15494 * and forced_ack is still on, then the last probe 15495 * was not responded to. In such cases we have a 15496 * sysctl that controls the behavior. Either we apply 15497 * the rtt but with reduced confidence (0), or we just 15498 * plain don't apply the rtt estimate. Having data flow 15499 * will clear the probe_not_answered flag i.e. cum-ack 15500 * move forward <or> exiting and reentering persists. 15501 */ 15502 15503 rack->forced_ack = 0; 15504 rack->rc_tp->t_rxtshift = 0; 15505 if ((rack->rc_in_persist && 15506 (tiwin == rack->rc_tp->snd_wnd)) || 15507 (rack->rc_in_persist == 0)) { 15508 /* 15509 * In persists only apply the RTT update if this is 15510 * a response to our window probe. And that 15511 * means the rwnd sent must match the current 15512 * snd_wnd. If it does not, then we got a 15513 * window update ack instead. For keepalive 15514 * we allow the answer no matter what the window. 15515 * 15516 * Note that if the probe_not_answered is set then 15517 * the forced_ack_ts is the oldest one i.e. the first 15518 * probe sent that might have been lost. This assures 15519 * us that if we do calculate an RTT it errs on the long 15520 * side rather than being some short value.
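 * The confidence passed to tcp_rack_xmit_timer() below reflects this:
 * 3 when the probe was answered cleanly, 0 when probe_not_answered is
 * set and the sysctl lets us apply the measurement anyway.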
15521 */ 15522 if (rack->rc_in_persist) 15523 counter_u64_add(rack_persists_acks, 1); 15524 us_rtt = us_cts - rack->r_ctl.forced_ack_ts; 15525 if (us_rtt == 0) 15526 us_rtt = 1; 15527 if (rack->probe_not_answered == 0) { 15528 rack_apply_updated_usrtt(rack, us_rtt, us_cts); 15529 tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 3, NULL, 1); 15530 } else { 15531 /* We have a retransmitted probe here too */ 15532 if (rack_apply_rtt_with_reduced_conf) { 15533 rack_apply_updated_usrtt(rack, us_rtt, us_cts); 15534 tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 0, NULL, 1); 15535 } 15536 } 15537 } 15538 } 15539 15540 static void 15541 rack_new_round_starts(struct tcpcb *tp, struct tcp_rack *rack, uint32_t high_seq) 15542 { 15543 /* 15544 * The next send has occurred, mark the end of the round 15545 * as when that data gets acknowledged. We can 15546 * also do common things we might need to do when 15547 * a round begins. 15548 */ 15549 rack->r_ctl.roundends = tp->snd_max; 15550 rack->rc_new_rnd_needed = 0; 15551 rack_log_hystart_event(rack, tp->snd_max, 4); 15552 } 15553 15554 15555 static void 15556 rack_log_pcm(struct tcp_rack *rack, uint8_t mod, uint32_t flex1, uint32_t flex2, 15557 uint32_t flex3) 15558 { 15559 if (tcp_bblogging_on(rack->rc_tp)) { 15560 union tcp_log_stackspecific log; 15561 struct timeval tv; 15562 15563 (void)tcp_get_usecs(&tv); 15564 memset(&log, 0, sizeof(log)); 15565 log.u_bbr.timeStamp = tcp_tv_to_usec(&tv); 15566 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 15567 log.u_bbr.flex8 = mod; 15568 log.u_bbr.flex1 = flex1; 15569 log.u_bbr.flex2 = flex2; 15570 log.u_bbr.flex3 = flex3; 15571 log.u_bbr.flex4 = rack_pcm_every_n_rounds; 15572 log.u_bbr.flex5 = rack->r_ctl.pcm_idle_rounds; 15573 log.u_bbr.bbr_substate = rack->pcm_needed; 15574 log.u_bbr.bbr_substate <<= 1; 15575 log.u_bbr.bbr_substate |= rack->pcm_in_progress; 15576 log.u_bbr.bbr_substate <<= 1; 15577 log.u_bbr.bbr_substate |= rack->pcm_enabled; /* bits are NIE for Needed, Inprogress, Enabled */ 15578 (void)tcp_log_event(rack->rc_tp, NULL, NULL, NULL, TCP_PCM_MEASURE, ERRNO_UNK, 15579 0, &log, false, NULL, NULL, 0, &tv); 15580 } 15581 } 15582 15583 static void 15584 rack_new_round_setup(struct tcpcb *tp, struct tcp_rack *rack, uint32_t high_seq) 15585 { 15586 /* 15587 * The round (current_round) has ended. We now 15588 * setup for the next round by incrementing the 15589 * round number and doing any round specific 15590 * things. 15591 */ 15592 rack_log_hystart_event(rack, high_seq, 21); 15593 rack->r_ctl.current_round++; 15594 /* New round (current_round) begins at next send */ 15595 rack->rc_new_rnd_needed = 1; 15596 if ((rack->pcm_enabled == 1) && 15597 (rack->pcm_needed == 0) && 15598 (rack->pcm_in_progress == 0)) { 15599 /* 15600 * If we have enabled PCM, then we need to 15601 * check if the round has advanced to the state 15602 * where one is required.
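 * The cadence is simple: once (current_round - last_pcm_round) plus
 * any idle rounds reaches rack_pcm_every_n_rounds we flag that a PCM
 * measurement is needed.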
15603 */ 15604 int rnds; 15605 15606 rnds = rack->r_ctl.current_round - rack->r_ctl.last_pcm_round; 15607 if ((rnds + rack->r_ctl.pcm_idle_rounds) >= rack_pcm_every_n_rounds) { 15608 rack->pcm_needed = 1; 15609 rack_log_pcm(rack, 3, rack->r_ctl.last_pcm_round, rack_pcm_every_n_rounds, rack->r_ctl.current_round ); 15610 } else if (rack_verbose_logging) { 15611 rack_log_pcm(rack, 3, rack->r_ctl.last_pcm_round, rack_pcm_every_n_rounds, rack->r_ctl.current_round ); 15612 } 15613 } 15614 if (tp->t_ccv.flags & CCF_HYSTART_ALLOWED) { 15615 /* We have hystart enabled, send the round info in */ 15616 if (CC_ALGO(tp)->newround != NULL) { 15617 CC_ALGO(tp)->newround(&tp->t_ccv, rack->r_ctl.current_round); 15618 } 15619 } 15620 /* 15621 * For DGP an initial startup check. We want to validate 15622 * that we are not just pushing on slow-start and just 15623 * not gaining.. i.e. filling buffers without getting any 15624 * boost in b/w during the initial slow-start. 15625 */ 15626 if (rack->dgp_on && 15627 (rack->rc_initial_ss_comp == 0) && 15628 (tp->snd_cwnd < tp->snd_ssthresh) && 15629 (rack->r_ctl.num_measurements >= RACK_REQ_AVG) && 15630 (rack->r_ctl.gp_rnd_thresh > 0) && 15631 ((rack->r_ctl.current_round - rack->r_ctl.last_rnd_of_gp_rise) >= rack->r_ctl.gp_rnd_thresh)) { 15632 15633 /* 15634 * We are in the initial SS and we have had rack_rnd_cnt_req rounds (def:5) where 15635 * we have not gained the required amount in the gp_est (120.0% aka 1200). Lets 15636 * exit SS. 15637 * 15638 * Pick up the flight size now as we enter slowstart (not the 15639 * cwnd which may be inflated). 15640 */ 15641 rack->rc_initial_ss_comp = 1; 15642 15643 if (tcp_bblogging_on(rack->rc_tp)) { 15644 union tcp_log_stackspecific log; 15645 struct timeval tv; 15646 15647 memset(&log, 0, sizeof(log)); 15648 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 15649 log.u_bbr.flex1 = rack->r_ctl.current_round; 15650 log.u_bbr.flex2 = rack->r_ctl.last_rnd_of_gp_rise; 15651 log.u_bbr.flex3 = rack->r_ctl.gp_rnd_thresh; 15652 log.u_bbr.flex4 = rack->r_ctl.gate_to_fs; 15653 log.u_bbr.flex5 = rack->r_ctl.ss_hi_fs; 15654 log.u_bbr.flex8 = 40; 15655 (void)tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 15656 0, &log, false, NULL, __func__, __LINE__,&tv); 15657 } 15658 if ((rack->r_ctl.gate_to_fs == 1) && 15659 (tp->snd_cwnd > rack->r_ctl.ss_hi_fs)) { 15660 tp->snd_cwnd = rack->r_ctl.ss_hi_fs; 15661 } 15662 tp->snd_ssthresh = tp->snd_cwnd - 1; 15663 /* Turn off any fast output running */ 15664 rack->r_fast_output = 0; 15665 } 15666 } 15667 15668 static int 15669 rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mbuf *m, int nxt_pkt, struct timeval *tv) 15670 { 15671 /* 15672 * Handle a "special" compressed ack mbuf. Each incoming 15673 * ack has only four possible dispositions: 15674 * 15675 * A) It moves the cum-ack forward 15676 * B) It is behind the cum-ack. 15677 * C) It is a window-update ack. 15678 * D) It is a dup-ack. 15679 * 15680 * Note that we can have between 1 -> TCP_COMP_ACK_ENTRIES 15681 * in the incoming mbuf. We also need to still pay attention 15682 * to nxt_pkt since there may be another packet after this 15683 * one.
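 * The classification loop below records these as ACK_CUMACK (A),
 * ACK_BEHIND (B), ACK_RWND (C) and ACK_DUPACK (D) in ack_val_set so
 * the per-type handling (and TCP_ACCOUNTING) can key off it.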
15684 */ 15685 #ifdef TCP_ACCOUNTING 15686 uint64_t ts_val; 15687 uint64_t rdstc; 15688 #endif 15689 int segsiz; 15690 struct timespec ts; 15691 struct tcp_rack *rack; 15692 struct tcp_ackent *ae; 15693 uint32_t tiwin, ms_cts, cts, acked, acked_amount, high_seq, win_seq, the_win, win_upd_ack; 15694 int cnt, i, did_out, ourfinisacked = 0; 15695 struct tcpopt to_holder, *to = NULL; 15696 #ifdef TCP_ACCOUNTING 15697 int win_up_req = 0; 15698 #endif 15699 int nsegs = 0; 15700 int under_pacing = 0; 15701 int post_recovery = 0; 15702 #ifdef TCP_ACCOUNTING 15703 sched_pin(); 15704 #endif 15705 rack = (struct tcp_rack *)tp->t_fb_ptr; 15706 if (rack->gp_ready && 15707 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) 15708 under_pacing = 1; 15709 15710 if (rack->r_state != tp->t_state) 15711 rack_set_state(tp, rack); 15712 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 15713 (tp->t_flags & TF_GPUTINPROG)) { 15714 /* 15715 * We have a goodput in progress 15716 * and we have entered a late state. 15717 * Do we have enough data in the sb 15718 * to handle the GPUT request? 15719 */ 15720 uint32_t bytes; 15721 15722 bytes = tp->gput_ack - tp->gput_seq; 15723 if (SEQ_GT(tp->gput_seq, tp->snd_una)) 15724 bytes += tp->gput_seq - tp->snd_una; 15725 if (bytes > sbavail(&tptosocket(tp)->so_snd)) { 15726 /* 15727 * There are not enough bytes in the socket 15728 * buffer that have been sent to cover this 15729 * measurement. Cancel it. 15730 */ 15731 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 15732 rack->r_ctl.rc_gp_srtt /*flex1*/, 15733 tp->gput_seq, 15734 0, 0, 18, __LINE__, NULL, 0); 15735 tp->t_flags &= ~TF_GPUTINPROG; 15736 } 15737 } 15738 to = &to_holder; 15739 to->to_flags = 0; 15740 KASSERT((m->m_len >= sizeof(struct tcp_ackent)), 15741 ("tp:%p m_cmpack:%p with invalid len:%u", tp, m, m->m_len)); 15742 cnt = m->m_len / sizeof(struct tcp_ackent); 15743 counter_u64_add(rack_multi_single_eq, cnt); 15744 high_seq = tp->snd_una; 15745 the_win = tp->snd_wnd; 15746 win_seq = tp->snd_wl1; 15747 win_upd_ack = tp->snd_wl2; 15748 cts = tcp_tv_to_usec(tv); 15749 ms_cts = tcp_tv_to_msec(tv); 15750 rack->r_ctl.rc_rcvtime = cts; 15751 segsiz = ctf_fixed_maxseg(tp); 15752 if ((rack->rc_gp_dyn_mul) && 15753 (rack->use_fixed_rate == 0) && 15754 (rack->rc_always_pace)) { 15755 /* Check in on probertt */ 15756 rack_check_probe_rtt(rack, cts); 15757 } 15758 for (i = 0; i < cnt; i++) { 15759 #ifdef TCP_ACCOUNTING 15760 ts_val = get_cyclecount(); 15761 #endif 15762 rack_clear_rate_sample(rack); 15763 ae = ((mtod(m, struct tcp_ackent *)) + i); 15764 if (ae->flags & TH_FIN) 15765 rack_log_pacing_delay_calc(rack, 15766 0, 15767 0, 15768 0, 15769 rack_get_gp_est(rack), /* delRate */ 15770 rack_get_lt_bw(rack), /* rttProp */ 15771 20, __LINE__, NULL, 0); 15772 /* Setup the window */ 15773 tiwin = ae->win << tp->snd_scale; 15774 if (tiwin > rack->r_ctl.rc_high_rwnd) 15775 rack->r_ctl.rc_high_rwnd = tiwin; 15776 /* figure out the type of ack */ 15777 if (SEQ_LT(ae->ack, high_seq)) { 15778 /* Case B*/ 15779 ae->ack_val_set = ACK_BEHIND; 15780 } else if (SEQ_GT(ae->ack, high_seq)) { 15781 /* Case A */ 15782 ae->ack_val_set = ACK_CUMACK; 15783 } else if ((tiwin == the_win) && (rack->rc_in_persist == 0)){ 15784 /* Case D */ 15785 ae->ack_val_set = ACK_DUPACK; 15786 } else { 15787 /* Case C */ 15788 ae->ack_val_set = ACK_RWND; 15789 } 15790 rack_log_type_bbrsnd(rack, 0, 0, cts, tv, __LINE__); 15791 rack_log_input_packet(tp, rack, ae, ae->ack_val_set, high_seq); 15792 /* Validate timestamp */ 15793 if (ae->flags & 
HAS_TSTMP) { 15794 /* Setup for a timestamp */ 15795 to->to_flags = TOF_TS; 15796 ae->ts_echo -= tp->ts_offset; 15797 to->to_tsecr = ae->ts_echo; 15798 to->to_tsval = ae->ts_value; 15799 /* 15800 * If echoed timestamp is later than the current time, fall back to 15801 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 15802 * were used when this connection was established. 15803 */ 15804 if (TSTMP_GT(ae->ts_echo, ms_cts)) 15805 to->to_tsecr = 0; 15806 if (tp->ts_recent && 15807 TSTMP_LT(ae->ts_value, tp->ts_recent)) { 15808 if (ctf_ts_check_ac(tp, (ae->flags & 0xff))) { 15809 #ifdef TCP_ACCOUNTING 15810 rdstc = get_cyclecount(); 15811 if (rdstc > ts_val) { 15812 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 15813 tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val); 15814 } 15815 } 15816 #endif 15817 continue; 15818 } 15819 } 15820 if (SEQ_LEQ(ae->seq, tp->last_ack_sent) && 15821 SEQ_LEQ(tp->last_ack_sent, ae->seq)) { 15822 tp->ts_recent_age = tcp_ts_getticks(); 15823 tp->ts_recent = ae->ts_value; 15824 } 15825 } else { 15826 /* Setup for a no options */ 15827 to->to_flags = 0; 15828 } 15829 /* Update the rcv time and perform idle reduction possibly */ 15830 if (tp->t_idle_reduce && 15831 (tp->snd_max == tp->snd_una) && 15832 (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { 15833 counter_u64_add(rack_input_idle_reduces, 1); 15834 rack_cc_after_idle(rack, tp); 15835 } 15836 tp->t_rcvtime = ticks; 15837 /* Now what about ECN of a chain of pure ACKs? */ 15838 if (tcp_ecn_input_segment(tp, ae->flags, 0, 15839 tcp_packets_this_ack(tp, ae->ack), 15840 ae->codepoint)) 15841 rack_cong_signal(tp, CC_ECN, ae->ack, __LINE__); 15842 #ifdef TCP_ACCOUNTING 15843 /* Count for the specific type of ack in */ 15844 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 15845 tp->tcp_cnt_counters[ae->ack_val_set]++; 15846 } 15847 #endif 15848 /* 15849 * Note how we could move up these in the determination 15850 * above, but we don't so that way the timestamp checks (and ECN) 15851 * is done first before we do any processing on the ACK. 15852 * The non-compressed path through the code has this 15853 * weakness (noted by @jtl) that it actually does some 15854 * processing before verifying the timestamp information. 15855 * We don't take that path here which is why we set 15856 * the ack_val_set first, do the timestamp and ecn 15857 * processing, and then look at what we have setup. 
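 * (For example, an old segment that fails the PAWS check above is
 * dropped via the continue before we ever act on its classification.)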
15858 */ 15859 if (ae->ack_val_set == ACK_BEHIND) { 15860 /* 15861 * Case B: flag reordering if our receive window is not closed, 15862 * since otherwise it could just be a keep-alive or persist probe 15863 */ 15864 if (SEQ_LT(ae->ack, tp->snd_una) && (sbspace(&so->so_rcv) > segsiz)) { 15865 rack->r_ctl.rc_reorder_ts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time); 15866 if (rack->r_ctl.rc_reorder_ts == 0) 15867 rack->r_ctl.rc_reorder_ts = 1; 15868 } 15869 } else if (ae->ack_val_set == ACK_DUPACK) { 15870 /* Case D */ 15871 rack_strike_dupack(rack, ae->ack); 15872 } else if (ae->ack_val_set == ACK_RWND) { 15873 /* Case C */ 15874 if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) { 15875 ts.tv_sec = ae->timestamp / 1000000000; 15876 ts.tv_nsec = ae->timestamp % 1000000000; 15877 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; 15878 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; 15879 } else { 15880 rack->r_ctl.act_rcv_time = *tv; 15881 } 15882 if (rack->forced_ack) { 15883 rack_handle_probe_response(rack, tiwin, 15884 tcp_tv_to_usec(&rack->r_ctl.act_rcv_time)); 15885 } 15886 #ifdef TCP_ACCOUNTING 15887 win_up_req = 1; 15888 #endif 15889 win_upd_ack = ae->ack; 15890 win_seq = ae->seq; 15891 the_win = tiwin; 15892 rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts); 15893 } else { 15894 /* Case A */ 15895 if (SEQ_GT(ae->ack, tp->snd_max)) { 15896 /* 15897 * We just send an ack since the incoming 15898 * ack is beyond the largest seq we sent. 15899 */ 15900 if ((tp->t_flags & TF_ACKNOW) == 0) { 15901 ctf_ack_war_checks(tp); 15902 if (tp->t_flags & TF_ACKNOW) 15903 rack->r_wanted_output = 1; 15904 } 15905 } else { 15906 nsegs++; 15907 /* If the window changed setup to update */ 15908 if (tiwin != tp->snd_wnd) { 15909 win_upd_ack = ae->ack; 15910 win_seq = ae->seq; 15911 the_win = tiwin; 15912 rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts); 15913 } 15914 #ifdef TCP_ACCOUNTING 15915 /* Account for the acks */ 15916 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 15917 tp->tcp_cnt_counters[CNT_OF_ACKS_IN] += (((ae->ack - high_seq) + segsiz - 1) / segsiz); 15918 } 15919 #endif 15920 high_seq = ae->ack; 15921 /* Setup our act_rcv_time */ 15922 if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) { 15923 ts.tv_sec = ae->timestamp / 1000000000; 15924 ts.tv_nsec = ae->timestamp % 1000000000; 15925 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; 15926 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; 15927 } else { 15928 rack->r_ctl.act_rcv_time = *tv; 15929 } 15930 rack_process_to_cumack(tp, rack, ae->ack, cts, to, 15931 tcp_tv_to_lusec(&rack->r_ctl.act_rcv_time)); 15932 #ifdef TCP_REQUEST_TRK 15933 rack_req_check_for_comp(rack, high_seq); 15934 #endif 15935 if (rack->rc_dsack_round_seen) { 15936 /* Is the dsack round over?
*/ 15937 if (SEQ_GEQ(ae->ack, rack->r_ctl.dsack_round_end)) { 15938 /* Yes it is */ 15939 rack->rc_dsack_round_seen = 0; 15940 rack_log_dsack_event(rack, 3, __LINE__, 0, 0); 15941 } 15942 } 15943 } 15944 } 15945 /* And lets be sure to commit the rtt measurements for this ack */ 15946 tcp_rack_xmit_timer_commit(rack, tp); 15947 #ifdef TCP_ACCOUNTING 15948 rdstc = get_cyclecount(); 15949 if (rdstc > ts_val) { 15950 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 15951 tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val); 15952 if (ae->ack_val_set == ACK_CUMACK) 15953 tp->tcp_proc_time[CYC_HANDLE_MAP] += (rdstc - ts_val); 15954 } 15955 } 15956 #endif 15957 } 15958 #ifdef TCP_ACCOUNTING 15959 ts_val = get_cyclecount(); 15960 #endif 15961 /* Tend to any collapsed window */ 15962 if (SEQ_GT(tp->snd_max, high_seq) && (tp->snd_wnd < (tp->snd_max - high_seq))) { 15963 /* The peer collapsed the window */ 15964 rack_collapsed_window(rack, (tp->snd_max - high_seq), high_seq, __LINE__); 15965 } else if (rack->rc_has_collapsed) 15966 rack_un_collapse_window(rack, __LINE__); 15967 if ((rack->r_collapse_point_valid) && 15968 (SEQ_GT(high_seq, rack->r_ctl.high_collapse_point))) 15969 rack->r_collapse_point_valid = 0; 15970 acked_amount = acked = (high_seq - tp->snd_una); 15971 if (acked) { 15972 /* 15973 * The draft (v3) calls for us to use SEQ_GEQ, but that 15974 * causes issues when we are just going app limited. Lets 15975 * instead use SEQ_GT <or> where its equal but more data 15976 * is outstanding. 15977 * 15978 * Also make sure we are on the last ack of a series. We 15979 * have to have all the ack's processed in queue to know 15980 * if there is something left outstanding. 15981 * 15982 */ 15983 if (SEQ_GEQ(high_seq, rack->r_ctl.roundends) && 15984 (rack->rc_new_rnd_needed == 0) && 15985 (nxt_pkt == 0)) { 15986 /* 15987 * We have crossed into a new round with 15988 * this th_ack value. 15989 */ 15990 rack_new_round_setup(tp, rack, high_seq); 15991 } 15992 /* 15993 * Clear the probe not answered flag 15994 * since cum-ack moved forward. 15995 */ 15996 rack->probe_not_answered = 0; 15997 if (tp->t_flags & TF_NEEDSYN) { 15998 /* 15999 * T/TCP: Connection was half-synchronized, and our SYN has 16000 * been ACK'd (so connection is now fully synchronized). Go 16001 * to non-starred state, increment snd_una for ACK of SYN, 16002 * and check if we can do window scaling. 16003 */ 16004 tp->t_flags &= ~TF_NEEDSYN; 16005 tp->snd_una++; 16006 acked_amount = acked = (high_seq - tp->snd_una); 16007 } 16008 if (acked > sbavail(&so->so_snd)) 16009 acked_amount = sbavail(&so->so_snd); 16010 if (IN_FASTRECOVERY(tp->t_flags) && 16011 (rack->rack_no_prr == 0)) 16012 rack_update_prr(tp, rack, acked_amount, high_seq); 16013 if (IN_RECOVERY(tp->t_flags)) { 16014 if (SEQ_LT(high_seq, tp->snd_recover) && 16015 (SEQ_LT(high_seq, tp->snd_max))) { 16016 tcp_rack_partialack(tp); 16017 } else { 16018 rack_post_recovery(tp, high_seq); 16019 post_recovery = 1; 16020 } 16021 } else if ((rack->rto_from_rec == 1) && 16022 SEQ_GEQ(high_seq, tp->snd_recover)) { 16023 /* 16024 * We were in recovery, hit a rxt timeout 16025 * and never re-entered recovery. The timeout(s) 16026 * made up all the lost data. In such a case 16027 * we need to clear the rto_from_rec flag. 
16028 */ 16029 rack->rto_from_rec = 0; 16030 } 16031 /* Handle the rack-log-ack part (sendmap) */ 16032 if ((sbused(&so->so_snd) == 0) && 16033 (acked > acked_amount) && 16034 (tp->t_state >= TCPS_FIN_WAIT_1) && 16035 (tp->t_flags & TF_SENTFIN)) { 16036 /* 16037 * We must be sure our fin 16038 * was sent and acked (we can be 16039 * in FIN_WAIT_1 without having 16040 * sent the fin). 16041 */ 16042 ourfinisacked = 1; 16043 /* 16044 * Lets make sure snd_una is updated 16045 * since most likely acked_amount = 0 (it 16046 * should be). 16047 */ 16048 tp->snd_una = high_seq; 16049 } 16050 /* Did we make a RTO error? */ 16051 if ((tp->t_flags & TF_PREVVALID) && 16052 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) { 16053 tp->t_flags &= ~TF_PREVVALID; 16054 if (tp->t_rxtshift == 1 && 16055 (int)(ticks - tp->t_badrxtwin) < 0) 16056 rack_cong_signal(tp, CC_RTO_ERR, high_seq, __LINE__); 16057 } 16058 /* Handle the data in the socket buffer */ 16059 KMOD_TCPSTAT_ADD(tcps_rcvackpack, 1); 16060 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 16061 if (acked_amount > 0) { 16062 uint32_t p_cwnd; 16063 struct mbuf *mfree; 16064 16065 if (post_recovery) { 16066 /* 16067 * Grab the segsiz, multiply by 2 and add the snd_cwnd 16068 * that is the max the CC should add if we are exiting 16069 * recovery and doing a late add. 16070 */ 16071 p_cwnd = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 16072 p_cwnd <<= 1; 16073 p_cwnd += tp->snd_cwnd; 16074 } 16075 rack_ack_received(tp, rack, high_seq, nsegs, CC_ACK, post_recovery); 16076 if (post_recovery && (tp->snd_cwnd > p_cwnd)) { 16077 /* Must be non-newreno (cubic) getting too ahead of itself */ 16078 tp->snd_cwnd = p_cwnd; 16079 } 16080 SOCK_SENDBUF_LOCK(so); 16081 mfree = sbcut_locked(&so->so_snd, acked_amount); 16082 tp->snd_una = high_seq; 16083 /* Note we want to hold the sb lock through the sendmap adjust */ 16084 rack_adjust_sendmap_head(rack, &so->so_snd); 16085 /* Wake up the socket if we have room to write more */ 16086 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); 16087 sowwakeup_locked(so); 16088 m_freem(mfree); 16089 } 16090 /* update progress */ 16091 tp->t_acktime = ticks; 16092 rack_log_progress_event(rack, tp, tp->t_acktime, 16093 PROGRESS_UPDATE, __LINE__); 16094 /* Clear out shifts and such */ 16095 tp->t_rxtshift = 0; 16096 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 16097 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 16098 rack->rc_tlp_in_progress = 0; 16099 rack->r_ctl.rc_tlp_cnt_out = 0; 16100 /* Send recover and snd_nxt must be dragged along */ 16101 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 16102 tp->snd_recover = tp->snd_una; 16103 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 16104 tp->snd_nxt = tp->snd_max; 16105 /* 16106 * If the RXT timer is running we want to 16107 * stop it, so we can restart a TLP (or new RXT). 
16108 */ 16109 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 16110 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 16111 tp->snd_wl2 = high_seq; 16112 tp->t_dupacks = 0; 16113 if (under_pacing && 16114 (rack->use_fixed_rate == 0) && 16115 (rack->in_probe_rtt == 0) && 16116 rack->rc_gp_dyn_mul && 16117 rack->rc_always_pace) { 16118 /* Check if we are dragging bottom */ 16119 rack_check_bottom_drag(tp, rack, so); 16120 } 16121 if (tp->snd_una == tp->snd_max) { 16122 tp->t_flags &= ~TF_PREVVALID; 16123 rack->r_ctl.retran_during_recovery = 0; 16124 rack->rc_suspicious = 0; 16125 rack->r_ctl.dsack_byte_cnt = 0; 16126 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 16127 if (rack->r_ctl.rc_went_idle_time == 0) 16128 rack->r_ctl.rc_went_idle_time = 1; 16129 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 16130 if (sbavail(&tptosocket(tp)->so_snd) == 0) 16131 tp->t_acktime = 0; 16132 /* Set so we might enter persists... */ 16133 rack->r_wanted_output = 1; 16134 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 16135 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 16136 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 16137 (sbavail(&so->so_snd) == 0) && 16138 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 16139 /* 16140 * The socket was gone and the 16141 * peer sent data (not now in the past), time to 16142 * reset him. 16143 */ 16144 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 16145 /* tcp_close will kill the inp pre-log the Reset */ 16146 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 16147 #ifdef TCP_ACCOUNTING 16148 rdstc = get_cyclecount(); 16149 if (rdstc > ts_val) { 16150 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16151 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 16152 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 16153 } 16154 } 16155 #endif 16156 m_freem(m); 16157 tp = tcp_close(tp); 16158 if (tp == NULL) { 16159 #ifdef TCP_ACCOUNTING 16160 sched_unpin(); 16161 #endif 16162 return (1); 16163 } 16164 /* 16165 * We would normally do drop-with-reset which would 16166 * send back a reset. We can't since we don't have 16167 * all the needed bits. Instead lets arrange for 16168 * a call to tcp_output(). That way since we 16169 * are in the closed state we will generate a reset. 16170 * 16171 * Note if tcp_accounting is on we don't unpin since 16172 * we do that after the goto label. 16173 */ 16174 goto send_out_a_rst; 16175 } 16176 if ((sbused(&so->so_snd) == 0) && 16177 (tp->t_state >= TCPS_FIN_WAIT_1) && 16178 (tp->t_flags & TF_SENTFIN)) { 16179 /* 16180 * If we can't receive any more data, then closing user can 16181 * proceed. Starting the timer is contrary to the 16182 * specification, but if we don't get a FIN we'll hang 16183 * forever. 16184 * 16185 */ 16186 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 16187 soisdisconnected(so); 16188 tcp_timer_activate(tp, TT_2MSL, 16189 (tcp_fast_finwait2_recycle ? 16190 tcp_finwait2_timeout : 16191 TP_MAXIDLE(tp))); 16192 } 16193 if (ourfinisacked == 0) { 16194 /* 16195 * We don't change to fin-wait-2 if we have our fin acked 16196 * which means we are probably in TCPS_CLOSING. 
16197 */ 16198 tcp_state_change(tp, TCPS_FIN_WAIT_2); 16199 } 16200 } 16201 } 16202 /* Wake up the socket if we have room to write more */ 16203 if (sbavail(&so->so_snd)) { 16204 rack->r_wanted_output = 1; 16205 if (ctf_progress_timeout_check(tp, true)) { 16206 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 16207 tp, tick, PROGRESS_DROP, __LINE__); 16208 /* 16209 * We cheat here and don't send a RST, we should send one 16210 * when the pacer drops the connection. 16211 */ 16212 #ifdef TCP_ACCOUNTING 16213 rdstc = get_cyclecount(); 16214 if (rdstc > ts_val) { 16215 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16216 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 16217 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 16218 } 16219 } 16220 sched_unpin(); 16221 #endif 16222 (void)tcp_drop(tp, ETIMEDOUT); 16223 m_freem(m); 16224 return (1); 16225 } 16226 } 16227 if (ourfinisacked) { 16228 switch(tp->t_state) { 16229 case TCPS_CLOSING: 16230 #ifdef TCP_ACCOUNTING 16231 rdstc = get_cyclecount(); 16232 if (rdstc > ts_val) { 16233 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16234 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 16235 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 16236 } 16237 } 16238 sched_unpin(); 16239 #endif 16240 tcp_twstart(tp); 16241 m_freem(m); 16242 return (1); 16243 break; 16244 case TCPS_LAST_ACK: 16245 #ifdef TCP_ACCOUNTING 16246 rdstc = get_cyclecount(); 16247 if (rdstc > ts_val) { 16248 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16249 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 16250 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 16251 } 16252 } 16253 sched_unpin(); 16254 #endif 16255 tp = tcp_close(tp); 16256 ctf_do_drop(m, tp); 16257 return (1); 16258 break; 16259 case TCPS_FIN_WAIT_1: 16260 #ifdef TCP_ACCOUNTING 16261 rdstc = get_cyclecount(); 16262 if (rdstc > ts_val) { 16263 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16264 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 16265 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 16266 } 16267 } 16268 #endif 16269 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 16270 soisdisconnected(so); 16271 tcp_timer_activate(tp, TT_2MSL, 16272 (tcp_fast_finwait2_recycle ? 16273 tcp_finwait2_timeout : 16274 TP_MAXIDLE(tp))); 16275 } 16276 tcp_state_change(tp, TCPS_FIN_WAIT_2); 16277 break; 16278 default: 16279 break; 16280 } 16281 } 16282 if (rack->r_fast_output) { 16283 /* 16284 * We re doing fast output.. can we expand that? 
16285 */ 16286 rack_gain_for_fastoutput(rack, tp, so, acked_amount); 16287 } 16288 #ifdef TCP_ACCOUNTING 16289 rdstc = get_cyclecount(); 16290 if (rdstc > ts_val) { 16291 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16292 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 16293 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 16294 } 16295 } 16296 16297 } else if (win_up_req) { 16298 rdstc = get_cyclecount(); 16299 if (rdstc > ts_val) { 16300 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16301 tp->tcp_proc_time[ACK_RWND] += (rdstc - ts_val); 16302 } 16303 } 16304 #endif 16305 } 16306 /* Now is there a next packet, if so we are done */ 16307 m_freem(m); 16308 did_out = 0; 16309 if (nxt_pkt) { 16310 #ifdef TCP_ACCOUNTING 16311 sched_unpin(); 16312 #endif 16313 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 5, nsegs); 16314 return (0); 16315 } 16316 rack_handle_might_revert(tp, rack); 16317 ctf_calc_rwin(so, tp); 16318 if ((rack->r_wanted_output != 0) || 16319 (rack->r_fast_output != 0) || 16320 (tp->t_flags & TF_ACKNOW )) { 16321 send_out_a_rst: 16322 if (tcp_output(tp) < 0) { 16323 #ifdef TCP_ACCOUNTING 16324 sched_unpin(); 16325 #endif 16326 return (1); 16327 } 16328 did_out = 1; 16329 } 16330 if (tp->t_flags2 & TF2_HPTS_CALLS) 16331 tp->t_flags2 &= ~TF2_HPTS_CALLS; 16332 rack_free_trim(rack); 16333 #ifdef TCP_ACCOUNTING 16334 sched_unpin(); 16335 #endif 16336 rack_timer_audit(tp, rack, &so->so_snd); 16337 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 6, nsegs); 16338 return (0); 16339 } 16340 16341 #define TCP_LRO_TS_OPTION \ 16342 ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \ 16343 (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP) 16344 16345 static int 16346 rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, 16347 int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, int32_t nxt_pkt, 16348 struct timeval *tv) 16349 { 16350 struct inpcb *inp = tptoinpcb(tp); 16351 struct socket *so = tptosocket(tp); 16352 #ifdef TCP_ACCOUNTING 16353 uint64_t ts_val; 16354 #endif 16355 int32_t thflags, retval, did_out = 0; 16356 int32_t way_out = 0; 16357 /* 16358 * cts - is the current time from tv (caller gets ts) in microseconds. 16359 * ms_cts - is the current time from tv in milliseconds. 16360 * us_cts - is the time that LRO or hardware actually got the packet in microseconds. 16361 */ 16362 uint32_t cts, us_cts, ms_cts; 16363 uint32_t tiwin; 16364 struct timespec ts; 16365 struct tcpopt to; 16366 struct tcp_rack *rack; 16367 struct rack_sendmap *rsm; 16368 int32_t prev_state = 0; 16369 int no_output = 0; 16370 int slot_remaining = 0; 16371 #ifdef TCP_ACCOUNTING 16372 int ack_val_set = 0xf; 16373 #endif 16374 int nsegs; 16375 16376 NET_EPOCH_ASSERT(); 16377 INP_WLOCK_ASSERT(inp); 16378 16379 /* 16380 * tv passed from common code is from either M_TSTMP_LRO or 16381 * tcp_get_usecs() if no LRO m_pkthdr timestamp is present. 16382 */ 16383 rack = (struct tcp_rack *)tp->t_fb_ptr; 16384 if (rack->rack_deferred_inited == 0) { 16385 /* 16386 * If we are the connecting socket we will 16387 * hit rack_init() when no sequence numbers 16388 * are setup. This makes it so we must defer 16389 * some initialization. Call that now. 16390 */ 16391 rack_deferred_init(tp, rack); 16392 } 16393 /* 16394 * Check to see if we need to skip any output plans. This 16395 * can happen in the non-LRO path where we are pacing and 16396 * must process the ack coming in but need to defer sending 16397 * anything becase a pacing timer is running. 
16398 */ 16399 us_cts = tcp_tv_to_usec(tv); 16400 if (m->m_flags & M_ACKCMP) { 16401 /* 16402 * All compressed ack's are ack's by definition so 16403 * remove any ack required flag and then do the processing. 16404 */ 16405 rack->rc_ack_required = 0; 16406 return (rack_do_compressed_ack_processing(tp, so, m, nxt_pkt, tv)); 16407 } 16408 thflags = tcp_get_flags(th); 16409 if ((rack->rc_always_pace == 1) && 16410 (rack->rc_ack_can_sendout_data == 0) && 16411 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 16412 (TSTMP_LT(us_cts, rack->r_ctl.rc_last_output_to))) { 16413 /* 16414 * Ok conditions are right for queuing the packets 16415 * but we do have to check the flags in the inp, it 16416 * could be, if a sack is present, we want to be awoken and 16417 * so should process the packets. 16418 */ 16419 slot_remaining = rack->r_ctl.rc_last_output_to - us_cts; 16420 if (rack->rc_tp->t_flags2 & TF2_DONT_SACK_QUEUE) { 16421 no_output = 1; 16422 } else { 16423 /* 16424 * If there is no options, or just a 16425 * timestamp option, we will want to queue 16426 * the packets. This is the same that LRO does 16427 * and will need to change with accurate ECN. 16428 */ 16429 uint32_t *ts_ptr; 16430 int optlen; 16431 16432 optlen = (th->th_off << 2) - sizeof(struct tcphdr); 16433 ts_ptr = (uint32_t *)(th + 1); 16434 if ((optlen == 0) || 16435 ((optlen == TCPOLEN_TSTAMP_APPA) && 16436 (*ts_ptr == TCP_LRO_TS_OPTION))) 16437 no_output = 1; 16438 } 16439 if ((no_output == 1) && (slot_remaining < tcp_min_hptsi_time)) { 16440 /* 16441 * It is unrealistic to think we can pace in less than 16442 * the minimum granularity of the pacer (def:250usec). So 16443 * if we have less than that time remaining we should go 16444 * ahead and allow output to be "early". We will attempt to 16445 * make up for it in any pacing time we try to apply on 16446 * the outbound packet. 16447 */ 16448 no_output = 0; 16449 } 16450 } 16451 /* 16452 * If there is a RST or FIN lets dump out the bw 16453 * with a FIN the connection may go on but we 16454 * may not. 16455 */ 16456 if ((thflags & TH_FIN) || (thflags & TH_RST)) 16457 rack_log_pacing_delay_calc(rack, 16458 rack->r_ctl.gp_bw, 16459 0, 16460 0, 16461 rack_get_gp_est(rack), /* delRate */ 16462 rack_get_lt_bw(rack), /* rttProp */ 16463 20, __LINE__, NULL, 0); 16464 if (m->m_flags & M_ACKCMP) { 16465 panic("Impossible reach m has ackcmp? m:%p tp:%p", m, tp); 16466 } 16467 cts = tcp_tv_to_usec(tv); 16468 ms_cts = tcp_tv_to_msec(tv); 16469 nsegs = m->m_pkthdr.lro_nsegs; 16470 counter_u64_add(rack_proc_non_comp_ack, 1); 16471 #ifdef TCP_ACCOUNTING 16472 sched_pin(); 16473 if (thflags & TH_ACK) 16474 ts_val = get_cyclecount(); 16475 #endif 16476 if ((m->m_flags & M_TSTMP) || 16477 (m->m_flags & M_TSTMP_LRO)) { 16478 mbuf_tstmp2timespec(m, &ts); 16479 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; 16480 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; 16481 } else 16482 rack->r_ctl.act_rcv_time = *tv; 16483 kern_prefetch(rack, &prev_state); 16484 prev_state = 0; 16485 /* 16486 * Unscale the window into a 32-bit value. For the SYN_SENT state 16487 * the scale is zero. 16488 */ 16489 tiwin = th->th_win << tp->snd_scale; 16490 #ifdef TCP_ACCOUNTING 16491 if (thflags & TH_ACK) { 16492 /* 16493 * We have a tradeoff here. We can either do what we are 16494 * doing i.e. pinning to this CPU and then doing the accounting 16495 * <or> we could do a critical enter, setup the rdtsc and cpu 16496 * as in below, and then validate we are on the same CPU on 16497 * exit. 
I have chosen to not do the critical enter since 16498 * that often will gain you a context switch, and instead lock 16499 * us (line above this if) to the same CPU with sched_pin(). This 16500 * means we may be context switched out for a higher priority 16501 * interrupt but we won't be moved to another CPU. 16502 * 16503 * If this occurs (which it won't very often since we most likely 16504 * are running this code in interrupt context and only a higher 16505 * priority will bump us ... clock?) we will falsely add the 16506 * interrupt processing time in to the ack processing 16507 * time. This is ok since it's a rare event. 16508 */ 16509 ack_val_set = tcp_do_ack_accounting(tp, th, &to, tiwin, 16510 ctf_fixed_maxseg(tp)); 16511 } 16512 #endif 16513 /* 16514 * Parse options on any incoming segment. 16515 */ 16516 memset(&to, 0, sizeof(to)); 16517 tcp_dooptions(&to, (u_char *)(th + 1), 16518 (th->th_off << 2) - sizeof(struct tcphdr), 16519 (thflags & TH_SYN) ? TO_SYN : 0); 16520 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 16521 __func__)); 16522 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 16523 __func__)); 16524 if (tp->t_flags2 & TF2_PROC_SACK_PROHIBIT) { 16525 /* 16526 * We don't look at sacks from the 16527 * peer because the MSS is too small which 16528 * can subject us to an attack. 16529 */ 16530 to.to_flags &= ~TOF_SACK; 16531 } 16532 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 16533 (tp->t_flags & TF_GPUTINPROG)) { 16534 /* 16535 * We have a goodput in progress 16536 * and we have entered a late state. 16537 * Do we have enough data in the sb 16538 * to handle the GPUT request? 16539 */ 16540 uint32_t bytes; 16541 16542 bytes = tp->gput_ack - tp->gput_seq; 16543 if (SEQ_GT(tp->gput_seq, tp->snd_una)) 16544 bytes += tp->gput_seq - tp->snd_una; 16545 if (bytes > sbavail(&tptosocket(tp)->so_snd)) { 16546 /* 16547 * There are not enough bytes in the socket 16548 * buffer that have been sent to cover this 16549 * measurement. Cancel it.
16550 */ 16551 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 16552 rack->r_ctl.rc_gp_srtt /*flex1*/, 16553 tp->gput_seq, 16554 0, 0, 18, __LINE__, NULL, 0); 16555 tp->t_flags &= ~TF_GPUTINPROG; 16556 } 16557 } 16558 if (tcp_bblogging_on(rack->rc_tp)) { 16559 union tcp_log_stackspecific log; 16560 struct timeval ltv; 16561 #ifdef TCP_REQUEST_TRK 16562 struct tcp_sendfile_track *tcp_req; 16563 16564 if (SEQ_GT(th->th_ack, tp->snd_una)) { 16565 tcp_req = tcp_req_find_req_for_seq(tp, (th->th_ack-1)); 16566 } else { 16567 tcp_req = tcp_req_find_req_for_seq(tp, th->th_ack); 16568 } 16569 #endif 16570 memset(&log, 0, sizeof(log)); 16571 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 16572 if (rack->rack_no_prr == 0) 16573 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 16574 else 16575 log.u_bbr.flex1 = 0; 16576 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 16577 log.u_bbr.use_lt_bw <<= 1; 16578 log.u_bbr.use_lt_bw |= rack->r_might_revert; 16579 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; 16580 log.u_bbr.bbr_state = rack->rc_free_cnt; 16581 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 16582 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; 16583 log.u_bbr.flex3 = m->m_flags; 16584 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 16585 log.u_bbr.lost = thflags; 16586 log.u_bbr.pacing_gain = 0x1; 16587 #ifdef TCP_ACCOUNTING 16588 log.u_bbr.cwnd_gain = ack_val_set; 16589 #endif 16590 log.u_bbr.flex7 = 2; 16591 if (m->m_flags & M_TSTMP) { 16592 /* Record the hardware timestamp if present */ 16593 mbuf_tstmp2timespec(m, &ts); 16594 ltv.tv_sec = ts.tv_sec; 16595 ltv.tv_usec = ts.tv_nsec / 1000; 16596 log.u_bbr.lt_epoch = tcp_tv_to_usec(&ltv); 16597 } else if (m->m_flags & M_TSTMP_LRO) { 16598 /* Record the LRO arrival timestamp */ 16599 mbuf_tstmp2timespec(m, &ts); 16600 ltv.tv_sec = ts.tv_sec; 16601 ltv.tv_usec = ts.tv_nsec / 1000; 16602 log.u_bbr.flex5 = tcp_tv_to_usec(&ltv); 16603 } 16604 log.u_bbr.timeStamp = tcp_get_usecs(&ltv); 16605 /* Log the rcv time */ 16606 log.u_bbr.delRate = m->m_pkthdr.rcv_tstmp; 16607 #ifdef TCP_REQUEST_TRK 16608 log.u_bbr.applimited = tp->t_tcpreq_closed; 16609 log.u_bbr.applimited <<= 8; 16610 log.u_bbr.applimited |= tp->t_tcpreq_open; 16611 log.u_bbr.applimited <<= 8; 16612 log.u_bbr.applimited |= tp->t_tcpreq_req; 16613 if (tcp_req) { 16614 /* Copy out any client req info */ 16615 /* seconds */ 16616 log.u_bbr.pkt_epoch = (tcp_req->localtime / HPTS_USEC_IN_SEC); 16617 /* useconds */ 16618 log.u_bbr.delivered = (tcp_req->localtime % HPTS_USEC_IN_SEC); 16619 log.u_bbr.rttProp = tcp_req->timestamp; 16620 log.u_bbr.cur_del_rate = tcp_req->start; 16621 if (tcp_req->flags & TCP_TRK_TRACK_FLG_OPEN) { 16622 log.u_bbr.flex8 |= 1; 16623 } else { 16624 log.u_bbr.flex8 |= 2; 16625 log.u_bbr.bw_inuse = tcp_req->end; 16626 } 16627 log.u_bbr.flex6 = tcp_req->start_seq; 16628 if (tcp_req->flags & TCP_TRK_TRACK_FLG_COMP) { 16629 log.u_bbr.flex8 |= 4; 16630 log.u_bbr.epoch = tcp_req->end_seq; 16631 } 16632 } 16633 #endif 16634 TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, 16635 tlen, &log, true, &ltv); 16636 } 16637 /* Remove the ack-required flag if set, we have one */ 16638 if (thflags & TH_ACK) 16639 rack->rc_ack_required = 0; 16640 rack_log_type_bbrsnd(rack, 0, 0, cts, tv, __LINE__); 16641 if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { 16642 way_out = 4; 16643 retval = 0; 16644 m_freem(m); 16645 goto done_with_input; 16646 } 16647 /* 16648 * If a segment with the ACK-bit set arrives in the SYN-SENT state
16649 * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9. 16650 */ 16651 if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && 16652 (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { 16653 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 16654 ctf_do_dropwithreset(m, tp, th, tlen); 16655 #ifdef TCP_ACCOUNTING 16656 sched_unpin(); 16657 #endif 16658 return (1); 16659 } 16660 /* 16661 * If timestamps were negotiated during SYN/ACK and a 16662 * segment without a timestamp is received, silently drop 16663 * the segment, unless it is a RST segment or missing timestamps are 16664 * tolerated. 16665 * See section 3.2 of RFC 7323. 16666 */ 16667 if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS) && 16668 ((thflags & TH_RST) == 0) && (V_tcp_tolerate_missing_ts == 0)) { 16669 way_out = 5; 16670 retval = 0; 16671 m_freem(m); 16672 goto done_with_input; 16673 } 16674 /* 16675 * Segment received on connection. Reset idle time and keep-alive 16676 * timer. XXX: This should be done after segment validation to 16677 * ignore broken/spoofed segs. 16678 */ 16679 if (tp->t_idle_reduce && 16680 (tp->snd_max == tp->snd_una) && 16681 (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { 16682 counter_u64_add(rack_input_idle_reduces, 1); 16683 rack_cc_after_idle(rack, tp); 16684 } 16685 tp->t_rcvtime = ticks; 16686 #ifdef STATS 16687 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); 16688 #endif 16689 if (tiwin > rack->r_ctl.rc_high_rwnd) 16690 rack->r_ctl.rc_high_rwnd = tiwin; 16691 /* 16692 * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move 16693 * this to occur after we've validated the segment. 16694 */ 16695 if (tcp_ecn_input_segment(tp, thflags, tlen, 16696 tcp_packets_this_ack(tp, th->th_ack), 16697 iptos)) 16698 rack_cong_signal(tp, CC_ECN, th->th_ack, __LINE__); 16699 16700 /* 16701 * If echoed timestamp is later than the current time, fall back to 16702 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 16703 * were used when this connection was established. 16704 */ 16705 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 16706 to.to_tsecr -= tp->ts_offset; 16707 if (TSTMP_GT(to.to_tsecr, ms_cts)) 16708 to.to_tsecr = 0; 16709 } 16710 if ((rack->r_rcvpath_rtt_up == 1) && 16711 (to.to_flags & TOF_TS) && 16712 (TSTMP_GEQ(to.to_tsecr, rack->r_ctl.last_rcv_tstmp_for_rtt))) { 16713 uint32_t rtt = 0; 16714 16715 /* 16716 * We are receiving only and thus not sending 16717 * data to do an RTT. We set a flag when we first 16718 * sent this TS to the peer. We now have it back 16719 * and have an RTT to share. We log it as a conf 16720 * 4, we are not so sure about it.. since we 16721 * may have lost an ack. 16722 */ 16723 if (TSTMP_GT(cts, rack->r_ctl.last_time_of_arm_rcv)) 16724 rtt = (cts - rack->r_ctl.last_time_of_arm_rcv); 16725 rack->r_rcvpath_rtt_up = 0; 16726 /* Submit and commit the timer */ 16727 if (rtt > 0) { 16728 tcp_rack_xmit_timer(rack, rtt, 0, rtt, 4, NULL, 1); 16729 tcp_rack_xmit_timer_commit(rack, tp); 16730 } 16731 } 16732 /* 16733 * If its the first time in we need to take care of options and 16734 * verify we can do SACK for rack! 16735 */ 16736 if (rack->r_state == 0) { 16737 /* Should be init'd by rack_init() */ 16738 KASSERT(rack->rc_inp != NULL, 16739 ("%s: rack->rc_inp unexpectedly NULL", __func__)); 16740 if (rack->rc_inp == NULL) { 16741 rack->rc_inp = inp; 16742 } 16743 16744 /* 16745 * Process options only when we get SYN/ACK back. 
The SYN 16746 * case for incoming connections is handled in tcp_syncache. 16747 * According to RFC1323 the window field in a SYN (i.e., a 16748 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX 16749 * this is traditional behavior, may need to be cleaned up. 16750 */ 16751 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 16752 /* Handle parallel SYN for ECN */ 16753 tcp_ecn_input_parallel_syn(tp, thflags, iptos); 16754 if ((to.to_flags & TOF_SCALE) && 16755 (tp->t_flags & TF_REQ_SCALE)) { 16756 tp->t_flags |= TF_RCVD_SCALE; 16757 tp->snd_scale = to.to_wscale; 16758 } else 16759 tp->t_flags &= ~TF_REQ_SCALE; 16760 /* 16761 * Initial send window. It will be updated with the 16762 * next incoming segment to the scaled value. 16763 */ 16764 tp->snd_wnd = th->th_win; 16765 rack_validate_fo_sendwin_up(tp, rack); 16766 if ((to.to_flags & TOF_TS) && 16767 (tp->t_flags & TF_REQ_TSTMP)) { 16768 tp->t_flags |= TF_RCVD_TSTMP; 16769 tp->ts_recent = to.to_tsval; 16770 tp->ts_recent_age = cts; 16771 } else 16772 tp->t_flags &= ~TF_REQ_TSTMP; 16773 if (to.to_flags & TOF_MSS) { 16774 tcp_mss(tp, to.to_mss); 16775 } 16776 if ((tp->t_flags & TF_SACK_PERMIT) && 16777 (to.to_flags & TOF_SACKPERM) == 0) 16778 tp->t_flags &= ~TF_SACK_PERMIT; 16779 if (tp->t_flags & TF_FASTOPEN) { 16780 if (to.to_flags & TOF_FASTOPEN) { 16781 uint16_t mss; 16782 16783 if (to.to_flags & TOF_MSS) 16784 mss = to.to_mss; 16785 else 16786 if ((inp->inp_vflag & INP_IPV6) != 0) 16787 mss = TCP6_MSS; 16788 else 16789 mss = TCP_MSS; 16790 tcp_fastopen_update_cache(tp, mss, 16791 to.to_tfo_len, to.to_tfo_cookie); 16792 } else 16793 tcp_fastopen_disable_path(tp); 16794 } 16795 } 16796 /* 16797 * At this point we are at the initial call. Here we decide 16798 * if we are doing RACK or not. We do this by seeing if 16799 * TF_SACK_PERMIT is set and the sack-not-required is clear. 16800 * The code now does do dup-ack counting so if you don't 16801 * switch back you won't get rack & TLP, but you will still 16802 * get this stack. 16803 */ 16804 16805 if ((rack_sack_not_required == 0) && 16806 ((tp->t_flags & TF_SACK_PERMIT) == 0)) { 16807 tcp_switch_back_to_default(tp); 16808 (*tp->t_fb->tfb_tcp_do_segment)(tp, m, th, drop_hdrlen, 16809 tlen, iptos); 16810 #ifdef TCP_ACCOUNTING 16811 sched_unpin(); 16812 #endif 16813 return (1); 16814 } 16815 tcp_set_hpts(tp); 16816 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); 16817 } 16818 if (thflags & TH_FIN) 16819 tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN); 16820 us_cts = tcp_tv_to_usec(&rack->r_ctl.act_rcv_time); 16821 if ((rack->rc_gp_dyn_mul) && 16822 (rack->use_fixed_rate == 0) && 16823 (rack->rc_always_pace)) { 16824 /* Check in on probertt */ 16825 rack_check_probe_rtt(rack, cts); 16826 } 16827 rack_clear_rate_sample(rack); 16828 if ((rack->forced_ack) && 16829 ((tcp_get_flags(th) & TH_RST) == 0)) { 16830 rack_handle_probe_response(rack, tiwin, us_cts); 16831 } 16832 /* 16833 * This is the one exception case where we set the rack state 16834 * always. All other times (timers etc) we must have a rack-state 16835 * set (so we assure we have done the checks above for SACK). 
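 * (rack_set_state() below keeps r_state in step with tp->t_state and selects r_substate, the per-state do_segment handler that is invoked next; a non-zero return from that handler means the tcb was unlocked and may already be gone.)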
16836 */ 16837 rack->r_ctl.rc_rcvtime = cts; 16838 if (rack->r_state != tp->t_state) 16839 rack_set_state(tp, rack); 16840 if (SEQ_GT(th->th_ack, tp->snd_una) && 16841 (rsm = tqhash_min(rack->r_ctl.tqh)) != NULL) 16842 kern_prefetch(rsm, &prev_state); 16843 prev_state = rack->r_state; 16844 if ((thflags & TH_RST) && 16845 ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && 16846 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || 16847 (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq))) { 16848 /* The connection will be killed by a reset; check the tracepoint */ 16849 tcp_trace_point(rack->rc_tp, TCP_TP_RESET_RCV); 16850 } 16851 retval = (*rack->r_substate) (m, th, so, 16852 tp, &to, drop_hdrlen, 16853 tlen, tiwin, thflags, nxt_pkt, iptos); 16854 if (retval == 0) { 16855 /* 16856 * If retval is 1 the tcb is unlocked and most likely the tp 16857 * is gone. 16858 */ 16859 INP_WLOCK_ASSERT(inp); 16860 if ((rack->rc_gp_dyn_mul) && 16861 (rack->rc_always_pace) && 16862 (rack->use_fixed_rate == 0) && 16863 rack->in_probe_rtt && 16864 (rack->r_ctl.rc_time_probertt_starts == 0)) { 16865 /* 16866 * If we are going for target, lets recheck before 16867 * we output. 16868 */ 16869 rack_check_probe_rtt(rack, cts); 16870 } 16871 if (rack->set_pacing_done_a_iw == 0) { 16872 /* How much has been acked? */ 16873 if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) { 16874 /* We have enough to set in the pacing segment size */ 16875 rack->set_pacing_done_a_iw = 1; 16876 rack_set_pace_segments(tp, rack, __LINE__, NULL); 16877 } 16878 } 16879 tcp_rack_xmit_timer_commit(rack, tp); 16880 #ifdef TCP_ACCOUNTING 16881 /* 16882 * If we set ack_val_set to the type of ack processing we are doing, 16883 * we also want to track how many cycles we burned. Note that 16884 * the bits after tcp_output we let be "free". This is because 16885 * we are also tracking the tcp_output times as well. Note the 16886 * use of 0xf here since we only have 11 counters (0 - 0xa) and 16887 * 0xf cannot be returned and is what we initialize it to, to 16888 * indicate we are not doing the tabulations. 16889 */ 16890 if (ack_val_set != 0xf) { 16891 uint64_t crtsc; 16892 16893 crtsc = get_cyclecount(); 16894 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16895 tp->tcp_proc_time[ack_val_set] += (crtsc - ts_val); 16896 } 16897 } 16898 #endif 16899 if ((nxt_pkt == 0) && (no_output == 0)) { 16900 if ((rack->r_wanted_output != 0) || 16901 (tp->t_flags & TF_ACKNOW) || 16902 (rack->r_fast_output != 0)) { 16903 16904 do_output_now: 16905 if (tcp_output(tp) < 0) { 16906 #ifdef TCP_ACCOUNTING 16907 sched_unpin(); 16908 #endif 16909 return (1); 16910 } 16911 did_out = 1; 16912 } 16913 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 16914 rack_free_trim(rack); 16915 } else if ((nxt_pkt == 0) && (tp->t_flags & TF_ACKNOW)) { 16916 goto do_output_now; 16917 } else if ((no_output == 1) && 16918 (nxt_pkt == 0) && 16919 (tcp_in_hpts(rack->rc_tp) == 0)) { 16920 /* 16921 * We are not in hpts and we had a pacing timer up. Use 16922 * the remaining time (slot_remaining) to restart the timer.
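 * (slot_remaining was computed earlier as rc_last_output_to - us_cts while the pacing timer was still pending, so it should be non-zero here; the KASSERT below guards that assumption.)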
16923 */ 16924 KASSERT ((slot_remaining != 0), ("slot remaining is zero for rack:%p tp:%p", rack, tp)); 16925 rack_start_hpts_timer(rack, tp, cts, slot_remaining, 0, 0); 16926 rack_free_trim(rack); 16927 } 16928 /* Clear the flag; it may have been cleared by output, but we may not have called output */ 16929 if ((nxt_pkt == 0) && (tp->t_flags2 & TF2_HPTS_CALLS)) 16930 tp->t_flags2 &= ~TF2_HPTS_CALLS; 16931 /* 16932 * The draft (v3) calls for us to use SEQ_GEQ, but that 16933 * causes issues when we are just going app limited. Lets 16934 * instead use SEQ_GT <or> where it is equal but more data 16935 * is outstanding. 16936 * 16937 * Also make sure we are on the last ack of a series. We 16938 * have to have all the ack's processed in queue to know 16939 * if there is something left outstanding. 16940 */ 16941 if (SEQ_GEQ(tp->snd_una, rack->r_ctl.roundends) && 16942 (rack->rc_new_rnd_needed == 0) && 16943 (nxt_pkt == 0)) { 16944 /* 16945 * We have crossed into a new round with 16946 * the new snd_una. 16947 */ 16948 rack_new_round_setup(tp, rack, tp->snd_una); 16949 } 16950 if ((nxt_pkt == 0) && 16951 ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && 16952 (SEQ_GT(tp->snd_max, tp->snd_una) || 16953 (tp->t_flags & TF_DELACK) || 16954 ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 16955 (tp->t_state <= TCPS_CLOSING)))) { 16956 /* We could not send (probably in the hpts but stopped the timer earlier)? */ 16957 if ((tp->snd_max == tp->snd_una) && 16958 ((tp->t_flags & TF_DELACK) == 0) && 16959 (tcp_in_hpts(rack->rc_tp)) && 16960 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 16961 /* keep-alive not needed; a paced output is already scheduled in hpts */ 16962 ; 16963 } else { 16964 int late = 0; 16965 if (tcp_in_hpts(tp)) { 16966 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 16967 us_cts = tcp_get_usecs(NULL); 16968 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 16969 rack->r_early = 1; 16970 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); 16971 } else 16972 late = 1; 16973 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 16974 } 16975 tcp_hpts_remove(tp); 16976 } 16977 if (late && (did_out == 0)) { 16978 /* 16979 * We are late in the sending 16980 * and we did not call the output 16981 * (this probably should not happen). 16982 */ 16983 goto do_output_now; 16984 } 16985 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); 16986 } 16987 way_out = 1; 16988 } else if (nxt_pkt == 0) { 16989 /* Do we have the correct timer running? */ 16990 rack_timer_audit(tp, rack, &so->so_snd); 16991 way_out = 2; 16992 } 16993 done_with_input: 16994 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out, max(1, nsegs)); 16995 if (did_out) 16996 rack->r_wanted_output = 0; 16997 } 16998 16999 #ifdef TCP_ACCOUNTING 17000 sched_unpin(); 17001 #endif 17002 return (retval); 17003 } 17004 17005 static void 17006 rack_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, 17007 int32_t drop_hdrlen, int32_t tlen, uint8_t iptos) 17008 { 17009 struct timeval tv; 17010 17011 /* First lets see if we have old packets */ 17012 if (!STAILQ_EMPTY(&tp->t_inqueue)) { 17013 if (ctf_do_queued_segments(tp, 1)) { 17014 m_freem(m); 17015 return; 17016 } 17017 } 17018 if (m->m_flags & M_TSTMP_LRO) { 17019 mbuf_tstmp2timeval(m, &tv); 17020 } else { 17021 /* Should not happen -- should we KASSERT instead?
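 * (Packets queued to us by LRO normally carry an M_TSTMP_LRO arrival time; if one is missing we simply fall back to the current time.)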
*/ 17022 tcp_get_usecs(&tv); 17023 } 17024 if (rack_do_segment_nounlock(tp, m, th, drop_hdrlen, tlen, iptos, 0, 17025 &tv) == 0) { 17026 INP_WUNLOCK(tptoinpcb(tp)); 17027 } 17028 } 17029 17030 struct rack_sendmap * 17031 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) 17032 { 17033 struct rack_sendmap *rsm = NULL; 17034 int32_t idx; 17035 uint32_t srtt = 0, thresh = 0, ts_low = 0; 17036 17037 /* Return the next guy to be re-transmitted */ 17038 if (tqhash_empty(rack->r_ctl.tqh)) { 17039 return (NULL); 17040 } 17041 if (tp->t_flags & TF_SENTFIN) { 17042 /* retran the end FIN? */ 17043 return (NULL); 17044 } 17045 /* ok lets look at this one */ 17046 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 17047 if (rack->r_must_retran && rsm && (rsm->r_flags & RACK_MUST_RXT)) { 17048 return (rsm); 17049 } 17050 if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) { 17051 goto check_it; 17052 } 17053 rsm = rack_find_lowest_rsm(rack); 17054 if (rsm == NULL) { 17055 return (NULL); 17056 } 17057 check_it: 17058 if (((rack->rc_tp->t_flags & TF_SACK_PERMIT) == 0) && 17059 (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { 17060 /* 17061 * No sack so we automatically do the 3 strikes and 17062 * retransmit (no rack timer would be started). 17063 */ 17064 return (rsm); 17065 } 17066 if (rsm->r_flags & RACK_ACKED) { 17067 return (NULL); 17068 } 17069 if (((rsm->r_flags & RACK_SACK_PASSED) == 0) && 17070 (rsm->r_dupack < DUP_ACK_THRESHOLD)) { 17071 /* Its not yet ready */ 17072 return (NULL); 17073 } 17074 srtt = rack_grab_rtt(tp, rack); 17075 idx = rsm->r_rtr_cnt - 1; 17076 ts_low = (uint32_t)rsm->r_tim_lastsent[idx]; 17077 thresh = rack_calc_thresh_rack(rack, srtt, tsused, __LINE__, 1); 17078 if ((tsused == ts_low) || 17079 (TSTMP_LT(tsused, ts_low))) { 17080 /* No time since sending */ 17081 return (NULL); 17082 } 17083 if ((tsused - ts_low) < thresh) { 17084 /* It has not been long enough yet */ 17085 return (NULL); 17086 } 17087 if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || 17088 ((rsm->r_flags & RACK_SACK_PASSED))) { 17089 /* 17090 * We have passed the dup-ack threshold <or> 17091 * a SACK has indicated this is missing. 17092 * Note that if you are a declared attacker 17093 * it is only the dup-ack threshold that 17094 * will cause retransmits. 17095 */ 17096 /* log retransmit reason */ 17097 rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1); 17098 rack->r_fast_output = 0; 17099 return (rsm); 17100 } 17101 return (NULL); 17102 } 17103 17104 static void 17105 rack_log_pacing_delay_calc (struct tcp_rack *rack, uint32_t len, uint32_t slot, 17106 uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, 17107 int line, struct rack_sendmap *rsm, uint8_t quality) 17108 { 17109 if (tcp_bblogging_on(rack->rc_tp)) { 17110 union tcp_log_stackspecific log; 17111 struct timeval tv; 17112 17113 if (rack_verbose_logging == 0) { 17114 /* 17115 * We are not verbose screen out all but 17116 * ones we always want. 
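 * (That is, with rack_verbose_logging off only the method values listed in the check below are logged; all other callers are screened out.)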
17117 */ 17118 if ((method != 2) && 17119 (method != 3) && 17120 (method != 7) && 17121 (method != 89) && 17122 (method != 14) && 17123 (method != 20)) { 17124 return; 17125 } 17126 } 17127 memset(&log, 0, sizeof(log)); 17128 log.u_bbr.flex1 = slot; 17129 log.u_bbr.flex2 = len; 17130 log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs; 17131 log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs; 17132 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ss; 17133 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_ca; 17134 log.u_bbr.use_lt_bw = rack->rc_ack_can_sendout_data; 17135 log.u_bbr.use_lt_bw <<= 1; 17136 log.u_bbr.use_lt_bw |= rack->r_late; 17137 log.u_bbr.use_lt_bw <<= 1; 17138 log.u_bbr.use_lt_bw |= rack->r_early; 17139 log.u_bbr.use_lt_bw <<= 1; 17140 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set; 17141 log.u_bbr.use_lt_bw <<= 1; 17142 log.u_bbr.use_lt_bw |= rack->rc_gp_filled; 17143 log.u_bbr.use_lt_bw <<= 1; 17144 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; 17145 log.u_bbr.use_lt_bw <<= 1; 17146 log.u_bbr.use_lt_bw |= rack->in_probe_rtt; 17147 log.u_bbr.use_lt_bw <<= 1; 17148 log.u_bbr.use_lt_bw |= rack->gp_ready; 17149 log.u_bbr.pkt_epoch = line; 17150 log.u_bbr.epoch = rack->r_ctl.rc_agg_delayed; 17151 log.u_bbr.lt_epoch = rack->r_ctl.rc_agg_early; 17152 log.u_bbr.applimited = rack->r_ctl.rack_per_of_gp_rec; 17153 log.u_bbr.bw_inuse = bw_est; 17154 log.u_bbr.delRate = bw; 17155 if (rack->r_ctl.gp_bw == 0) 17156 log.u_bbr.cur_del_rate = 0; 17157 else 17158 log.u_bbr.cur_del_rate = rack_get_bw(rack); 17159 log.u_bbr.rttProp = len_time; 17160 log.u_bbr.pkts_out = rack->r_ctl.rc_rack_min_rtt; 17161 log.u_bbr.lost = rack->r_ctl.rc_probertt_sndmax_atexit; 17162 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); 17163 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) { 17164 /* We are in slow start */ 17165 log.u_bbr.flex7 = 1; 17166 } else { 17167 /* we are on congestion avoidance */ 17168 log.u_bbr.flex7 = 0; 17169 } 17170 log.u_bbr.flex8 = method; 17171 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 17172 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 17173 log.u_bbr.cwnd_gain = rack->rc_gp_saw_rec; 17174 log.u_bbr.cwnd_gain <<= 1; 17175 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; 17176 log.u_bbr.cwnd_gain <<= 1; 17177 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; 17178 log.u_bbr.cwnd_gain <<= 1; 17179 log.u_bbr.cwnd_gain |= rack->use_fixed_rate; 17180 log.u_bbr.cwnd_gain <<= 1; 17181 log.u_bbr.cwnd_gain |= rack->rc_always_pace; 17182 log.u_bbr.cwnd_gain <<= 1; 17183 log.u_bbr.cwnd_gain |= rack->gp_ready; 17184 log.u_bbr.bbr_substate = quality; 17185 log.u_bbr.bbr_state = rack->dgp_on; 17186 log.u_bbr.bbr_state <<= 1; 17187 log.u_bbr.bbr_state |= rack->rc_pace_to_cwnd; 17188 log.u_bbr.bbr_state <<= 2; 17189 TCP_LOG_EVENTP(rack->rc_tp, NULL, 17190 &rack->rc_inp->inp_socket->so_rcv, 17191 &rack->rc_inp->inp_socket->so_snd, 17192 BBR_LOG_HPTSI_CALC, 0, 17193 0, &log, false, &tv); 17194 } 17195 } 17196 17197 static uint32_t 17198 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss) 17199 { 17200 uint32_t new_tso, user_max, pace_one; 17201 17202 user_max = rack->rc_user_set_max_segs * mss; 17203 if (rack->rc_force_max_seg) { 17204 return (user_max); 17205 } 17206 if (rack->use_fixed_rate && 17207 ((rack->r_ctl.crte == NULL) || 17208 (bw != rack->r_ctl.crte->rate))) { 17209 /* Use the user mss since we are not exactly matched */ 17210 return (user_max); 17211 } 17212 if (rack_pace_one_seg || 17213 (rack->r_ctl.rc_user_set_min_segs == 1)) 17214 
pace_one = 1; 17215 else 17216 pace_one = 0; 17217 17218 new_tso = tcp_get_pacing_burst_size_w_divisor(rack->rc_tp, bw, mss, 17219 pace_one, rack->r_ctl.crte, NULL, rack->r_ctl.pace_len_divisor); 17220 if (new_tso > user_max) 17221 new_tso = user_max; 17222 if (rack->rc_hybrid_mode && rack->r_ctl.client_suggested_maxseg) { 17223 if (((uint32_t)rack->r_ctl.client_suggested_maxseg * mss) > new_tso) 17224 new_tso = (uint32_t)rack->r_ctl.client_suggested_maxseg * mss; 17225 } 17226 if (rack->r_ctl.rc_user_set_min_segs && 17227 ((rack->r_ctl.rc_user_set_min_segs * mss) > new_tso)) 17228 new_tso = rack->r_ctl.rc_user_set_min_segs * mss; 17229 return (new_tso); 17230 } 17231 17232 static uint64_t 17233 rack_arrive_at_discounted_rate(struct tcp_rack *rack, uint64_t window_input, uint32_t *rate_set, uint32_t *gain_b) 17234 { 17235 uint64_t reduced_win; 17236 uint32_t gain; 17237 17238 if (window_input < rc_init_window(rack)) { 17239 /* 17240 * The cwnd is collapsed to 17241 * nearly zero, maybe because of a time-out? 17242 * Lets drop back to the lt-bw. 17243 */ 17244 reduced_win = rack_get_lt_bw(rack); 17245 /* Set the flag so the caller knows it is a rate and not a reduced window */ 17246 *rate_set = 1; 17247 gain = 100; 17248 } else if (IN_RECOVERY(rack->rc_tp->t_flags)) { 17249 /* 17250 * If we are in recovery our cwnd needs to be less for 17251 * our pacing consideration. 17252 */ 17253 if (rack->rack_hibeta == 0) { 17254 reduced_win = window_input / 2; 17255 gain = 50; 17256 } else { 17257 reduced_win = window_input * rack->r_ctl.saved_hibeta; 17258 reduced_win /= 100; 17259 gain = rack->r_ctl.saved_hibeta; 17260 } 17261 } else { 17262 /* 17263 * Apply Timely factor to increase/decrease the 17264 * amount we are pacing at. 17265 */ 17266 gain = rack_get_output_gain(rack, NULL); 17267 if (gain > rack_gain_p5_ub) { 17268 gain = rack_gain_p5_ub; 17269 } 17270 reduced_win = window_input * gain; 17271 reduced_win /= 100; 17272 } 17273 if (gain_b != NULL) 17274 *gain_b = gain; 17275 /* 17276 * What is being returned here is a trimmed down 17277 * window value in all cases where rate_set is left 17278 * at 0. In one case we actually return a rate (the lt_bw). 17279 * Otherwise "reduced_win" is returned as a slimmed down cwnd 17280 * that the caller then converts into a rate when rate_set 17281 * is 0. 17282 */ 17283 return (reduced_win); 17284 } 17285 17286 static int32_t 17287 pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz, int *capped, uint64_t *rate_wanted, uint8_t non_paced) 17288 { 17289 uint64_t lentim, fill_bw; 17290 17291 rack->r_via_fill_cw = 0; 17292 if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use) 17293 return (slot); 17294 if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd) 17295 return (slot); 17296 if (rack->r_ctl.rc_last_us_rtt == 0) 17297 return (slot); 17298 if (rack->rc_pace_fill_if_rttin_range && 17299 (rack->r_ctl.rc_last_us_rtt >= 17300 (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) { 17301 /* The rtt is huge, N * smallest, lets not fill */ 17302 return (slot); 17303 } 17304 if (rack->r_ctl.fillcw_cap && *rate_wanted >= rack->r_ctl.fillcw_cap) 17305 return (slot); 17306 /* 17307 * First lets calculate the b/w based on the last us-rtt 17308 * and the smallest send window.
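 * (Illustrative numbers, not taken from the code: a 64000 byte window over a 10000 usec (10 ms) last-us-rtt converts below to 64000 * HPTS_USEC_IN_SEC / 10000 = 6,400,000 bytes/sec, roughly 51 Mbit/s, before any discount, rwnd clamp or cap is applied.)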
17309 */ 17310 fill_bw = min(rack->rc_tp->snd_cwnd, rack->r_ctl.cwnd_to_use); 17311 if (rack->rc_fillcw_apply_discount) { 17312 uint32_t rate_set = 0; 17313 17314 fill_bw = rack_arrive_at_discounted_rate(rack, fill_bw, &rate_set, NULL); 17315 if (rate_set) { 17316 goto at_lt_bw; 17317 } 17318 } 17319 /* Take the rwnd if its smaller */ 17320 if (fill_bw > rack->rc_tp->snd_wnd) 17321 fill_bw = rack->rc_tp->snd_wnd; 17322 /* Now lets make it into a b/w */ 17323 fill_bw *= (uint64_t)HPTS_USEC_IN_SEC; 17324 fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt; 17325 /* Adjust to any cap */ 17326 if (rack->r_ctl.fillcw_cap && fill_bw >= rack->r_ctl.fillcw_cap) 17327 fill_bw = rack->r_ctl.fillcw_cap; 17328 17329 at_lt_bw: 17330 if (rack_bw_multipler > 0) { 17331 /* 17332 * We want to limit fill-cw to the some multiplier 17333 * of the max(lt_bw, gp_est). The normal default 17334 * is 0 for off, so a sysctl has enabled it. 17335 */ 17336 uint64_t lt_bw, gp, rate; 17337 17338 gp = rack_get_gp_est(rack); 17339 lt_bw = rack_get_lt_bw(rack); 17340 if (lt_bw > gp) 17341 rate = lt_bw; 17342 else 17343 rate = gp; 17344 rate *= rack_bw_multipler; 17345 rate /= 100; 17346 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 17347 union tcp_log_stackspecific log; 17348 struct timeval tv; 17349 17350 memset(&log, 0, sizeof(log)); 17351 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 17352 log.u_bbr.flex1 = rack_bw_multipler; 17353 log.u_bbr.flex2 = len; 17354 log.u_bbr.cur_del_rate = gp; 17355 log.u_bbr.delRate = lt_bw; 17356 log.u_bbr.bw_inuse = rate; 17357 log.u_bbr.rttProp = fill_bw; 17358 log.u_bbr.flex8 = 44; 17359 tcp_log_event(rack->rc_tp, NULL, NULL, NULL, 17360 BBR_LOG_CWND, 0, 17361 0, &log, false, NULL, 17362 __func__, __LINE__, &tv); 17363 } 17364 if (fill_bw > rate) 17365 fill_bw = rate; 17366 } 17367 /* We are below the min b/w */ 17368 if (non_paced) 17369 *rate_wanted = fill_bw; 17370 if ((fill_bw < RACK_MIN_BW) || (fill_bw < *rate_wanted)) 17371 return (slot); 17372 rack->r_via_fill_cw = 1; 17373 if (rack->r_rack_hw_rate_caps && 17374 (rack->r_ctl.crte != NULL)) { 17375 uint64_t high_rate; 17376 17377 high_rate = tcp_hw_highest_rate(rack->r_ctl.crte); 17378 if (fill_bw > high_rate) { 17379 /* We are capping bw at the highest rate table entry */ 17380 if (*rate_wanted > high_rate) { 17381 /* The original rate was also capped */ 17382 rack->r_via_fill_cw = 0; 17383 } 17384 rack_log_hdwr_pacing(rack, 17385 fill_bw, high_rate, __LINE__, 17386 0, 3); 17387 fill_bw = high_rate; 17388 if (capped) 17389 *capped = 1; 17390 } 17391 } else if ((rack->r_ctl.crte == NULL) && 17392 (rack->rack_hdrw_pacing == 0) && 17393 (rack->rack_hdw_pace_ena) && 17394 rack->r_rack_hw_rate_caps && 17395 (rack->rack_attempt_hdwr_pace == 0) && 17396 (rack->rc_inp->inp_route.ro_nh != NULL) && 17397 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 17398 /* 17399 * Ok we may have a first attempt that is greater than our top rate 17400 * lets check. 
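 * (tcp_hw_highest_rate_ifp() reports the top entry of the interface's hardware rate table; if the fill b/w exceeds it we clamp to that top rate and mark the result as capped.)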
17401 */ 17402 uint64_t high_rate; 17403 17404 high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp); 17405 if (high_rate) { 17406 if (fill_bw > high_rate) { 17407 fill_bw = high_rate; 17408 if (capped) 17409 *capped = 1; 17410 } 17411 } 17412 } 17413 if (rack->r_ctl.bw_rate_cap && (fill_bw > rack->r_ctl.bw_rate_cap)) { 17414 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 17415 fill_bw, 0, 0, HYBRID_LOG_RATE_CAP, 2, NULL, __LINE__); 17416 fill_bw = rack->r_ctl.bw_rate_cap; 17417 } 17418 /* 17419 * Ok fill_bw holds our mythical b/w to fill the cwnd 17420 * in an rtt (unless it was capped), what does that 17421 * equate to time-wise? 17422 */ 17423 lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC; 17424 lentim /= fill_bw; 17425 *rate_wanted = fill_bw; 17426 if (non_paced || (lentim < slot)) { 17427 rack_log_pacing_delay_calc(rack, len, slot, fill_bw, 17428 0, lentim, 12, __LINE__, NULL, 0); 17429 return ((int32_t)lentim); 17430 } else 17431 return (slot); 17432 } 17433 17434 static int32_t 17435 rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz, int line) 17436 { 17437 uint64_t srtt; 17438 int32_t slot = 0; 17439 int can_start_hw_pacing = 1; 17440 int err; 17441 int pace_one; 17442 17443 if (rack_pace_one_seg || 17444 (rack->r_ctl.rc_user_set_min_segs == 1)) 17445 pace_one = 1; 17446 else 17447 pace_one = 0; 17448 if (rack->rc_always_pace == 0) { 17449 /* 17450 * We use the most optimistic possible cwnd/srtt for 17451 * sending calculations. This will make our 17452 * calculation anticipate getting more through 17453 * quicker than possible. But that's ok, we don't want 17454 * the peer to have a gap in data sending. 17455 */ 17456 uint64_t cwnd, tr_perms = 0; 17457 int32_t reduce; 17458 17459 old_method: 17460 /* 17461 * We keep no precise pacing with the old method; 17462 * instead we use the pacer to mitigate bursts. 17463 */ 17464 if (rack->r_ctl.rc_rack_min_rtt) 17465 srtt = rack->r_ctl.rc_rack_min_rtt; 17466 else 17467 srtt = max(tp->t_srtt, 1); 17468 if (rack->r_ctl.rc_rack_largest_cwnd) 17469 cwnd = rack->r_ctl.rc_rack_largest_cwnd; 17470 else 17471 cwnd = rack->r_ctl.cwnd_to_use; 17472 /* Inflate cwnd by 1000 so srtt of usecs is in ms */ 17473 tr_perms = (cwnd * 1000) / srtt; 17474 if (tr_perms == 0) { 17475 tr_perms = ctf_fixed_maxseg(tp); 17476 } 17477 /* 17478 * Calculate how long this will take to drain. If 17479 * the calculation comes out to zero, that's ok; we 17480 * will use send_a_lot to possibly spin around for 17481 * more, increasing tot_len_this_send to the point 17482 * that it is going to require a pace, or we hit the 17483 * cwnd. In that case we are just waiting for 17484 * an ACK. 17485 */ 17486 slot = len / tr_perms; 17487 /* Now do we reduce the time so we don't run dry?
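 * (Illustrative numbers, not taken from the code: cwnd = 100000 bytes and srtt = 20000 usec give tr_perms = 100000 * 1000 / 20000 = 5000 bytes per ms, so len = 15000 bytes yields slot = 3 ms; the reduction below then shaves slot/rack_slot_reduction off that before the value is scaled to usec.)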
*/ 17488 if (slot && rack_slot_reduction) { 17489 reduce = (slot / rack_slot_reduction); 17490 if (reduce < slot) { 17491 slot -= reduce; 17492 } else 17493 slot = 0; 17494 } else 17495 reduce = 0; 17496 slot *= HPTS_USEC_IN_MSEC; 17497 if (rack->rc_pace_to_cwnd) { 17498 uint64_t rate_wanted = 0; 17499 17500 slot = pace_to_fill_cwnd(rack, slot, len, segsiz, NULL, &rate_wanted, 1); 17501 rack->rc_ack_can_sendout_data = 1; 17502 rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, 0, 0, 14, __LINE__, NULL, 0); 17503 } else 17504 rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL, 0); 17505 /*******************************************************/ 17506 /* RRS: We insert non-paced call to stats here for len */ 17507 /*******************************************************/ 17508 } else { 17509 uint64_t bw_est, res, lentim, rate_wanted; 17510 uint32_t segs, oh; 17511 int capped = 0; 17512 int prev_fill; 17513 17514 if ((rack->r_rr_config == 1) && rsm) { 17515 return (rack->r_ctl.rc_min_to); 17516 } 17517 if (rack->use_fixed_rate) { 17518 rate_wanted = bw_est = rack_get_fixed_pacing_bw(rack); 17519 } else if ((rack->r_ctl.init_rate == 0) && 17520 (rack->r_ctl.gp_bw == 0)) { 17521 /* no way to yet do an estimate */ 17522 bw_est = rate_wanted = 0; 17523 } else if (rack->dgp_on) { 17524 bw_est = rack_get_bw(rack); 17525 rate_wanted = rack_get_output_bw(rack, bw_est, rsm, &capped); 17526 } else { 17527 uint32_t gain, rate_set = 0; 17528 17529 rate_wanted = min(rack->rc_tp->snd_cwnd, rack->r_ctl.cwnd_to_use); 17530 rate_wanted = rack_arrive_at_discounted_rate(rack, rate_wanted, &rate_set, &gain); 17531 if (rate_set == 0) { 17532 if (rate_wanted > rack->rc_tp->snd_wnd) 17533 rate_wanted = rack->rc_tp->snd_wnd; 17534 /* Now lets make it into a b/w */ 17535 rate_wanted *= (uint64_t)HPTS_USEC_IN_SEC; 17536 rate_wanted /= (uint64_t)rack->r_ctl.rc_last_us_rtt; 17537 } 17538 bw_est = rate_wanted; 17539 rack_log_pacing_delay_calc(rack, rack->rc_tp->snd_cwnd, 17540 rack->r_ctl.cwnd_to_use, 17541 rate_wanted, bw_est, 17542 rack->r_ctl.rc_last_us_rtt, 17543 88, __LINE__, NULL, gain); 17544 } 17545 if (((bw_est == 0) || (rate_wanted == 0) || (rack->gp_ready == 0)) && 17546 (rack->use_fixed_rate == 0)) { 17547 /* 17548 * No way yet to make a b/w estimate or 17549 * our raise is set incorrectly. 17550 */ 17551 goto old_method; 17552 } 17553 rack_rate_cap_bw(rack, &rate_wanted, &capped); 17554 /* We need to account for all the overheads */ 17555 segs = (len + segsiz - 1) / segsiz; 17556 /* 17557 * We need the diff between 1514 bytes (e-mtu with e-hdr) 17558 * and how much data we put in each packet. Yes this 17559 * means we may be off if we are larger than 1500 bytes 17560 * or smaller. But this just makes us more conservative. 17561 */ 17562 17563 oh = (tp->t_maxseg - segsiz) + sizeof(struct tcphdr); 17564 if (rack->r_is_v6) { 17565 #ifdef INET6 17566 oh += sizeof(struct ip6_hdr); 17567 #endif 17568 } else { 17569 #ifdef INET 17570 oh += sizeof(struct ip); 17571 #endif 17572 } 17573 /* We add a fixed 14 for the ethernet header */ 17574 oh += 14; 17575 segs *= oh; 17576 lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC; 17577 res = lentim / rate_wanted; 17578 slot = (uint32_t)res; 17579 if (rack_hw_rate_min && 17580 (rate_wanted < rack_hw_rate_min)) { 17581 can_start_hw_pacing = 0; 17582 if (rack->r_ctl.crte) { 17583 /* 17584 * Ok we need to release it, we 17585 * have fallen too low. 
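 * (rack_hw_rate_min is the configured floor for hardware pacing; once rate_wanted drops below it we release the crte and clear the attempt flags so hardware pacing can be tried again later at a higher rate.)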
17586 */ 17587 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 17588 rack->r_ctl.crte = NULL; 17589 rack->rack_attempt_hdwr_pace = 0; 17590 rack->rack_hdrw_pacing = 0; 17591 } 17592 } 17593 if (rack->r_ctl.crte && 17594 (tcp_hw_highest_rate(rack->r_ctl.crte) < rate_wanted)) { 17595 /* 17596 * We want more than the hardware can give us, 17597 * don't start any hw pacing. 17598 */ 17599 can_start_hw_pacing = 0; 17600 if (rack->r_rack_hw_rate_caps == 0) { 17601 /* 17602 * Ok we need to release it, we 17603 * want more than the card can give us and 17604 * no rate cap is in place. Set it up so 17605 * when we want less we can retry. 17606 */ 17607 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 17608 rack->r_ctl.crte = NULL; 17609 rack->rack_attempt_hdwr_pace = 0; 17610 rack->rack_hdrw_pacing = 0; 17611 } 17612 } 17613 if ((rack->r_ctl.crte != NULL) && (rack->rc_inp->inp_snd_tag == NULL)) { 17614 /* 17615 * We lost our rate somehow, this can happen 17616 * if the interface changed underneath us. 17617 */ 17618 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 17619 rack->r_ctl.crte = NULL; 17620 /* Lets re-allow attempting to setup pacing */ 17621 rack->rack_hdrw_pacing = 0; 17622 rack->rack_attempt_hdwr_pace = 0; 17623 rack_log_hdwr_pacing(rack, 17624 rate_wanted, bw_est, __LINE__, 17625 0, 6); 17626 } 17627 prev_fill = rack->r_via_fill_cw; 17628 if ((rack->rc_pace_to_cwnd) && 17629 (capped == 0) && 17630 (rack->dgp_on == 1) && 17631 (rack->use_fixed_rate == 0) && 17632 (rack->in_probe_rtt == 0) && 17633 (IN_FASTRECOVERY(rack->rc_tp->t_flags) == 0)) { 17634 /* 17635 * We want to pace at our rate *or* faster to 17636 * fill the cwnd to the max if its not full. 17637 */ 17638 slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz, &capped, &rate_wanted, 0); 17639 /* Re-check to make sure we are not exceeding our max b/w */ 17640 if ((rack->r_ctl.crte != NULL) && 17641 (tcp_hw_highest_rate(rack->r_ctl.crte) < rate_wanted)) { 17642 /* 17643 * We want more than the hardware can give us, 17644 * don't start any hw pacing. 17645 */ 17646 can_start_hw_pacing = 0; 17647 if (rack->r_rack_hw_rate_caps == 0) { 17648 /* 17649 * Ok we need to release it, we 17650 * want more than the card can give us and 17651 * no rate cap is in place. Set it up so 17652 * when we want less we can retry. 17653 */ 17654 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 17655 rack->r_ctl.crte = NULL; 17656 rack->rack_attempt_hdwr_pace = 0; 17657 rack->rack_hdrw_pacing = 0; 17658 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 17659 } 17660 } 17661 } 17662 if ((rack->rc_inp->inp_route.ro_nh != NULL) && 17663 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 17664 if ((rack->rack_hdw_pace_ena) && 17665 (can_start_hw_pacing > 0) && 17666 (rack->rack_hdrw_pacing == 0) && 17667 (rack->rack_attempt_hdwr_pace == 0)) { 17668 /* 17669 * Lets attempt to turn on hardware pacing 17670 * if we can. 
17671 */ 17672 rack->rack_attempt_hdwr_pace = 1; 17673 rack->r_ctl.crte = tcp_set_pacing_rate(rack->rc_tp, 17674 rack->rc_inp->inp_route.ro_nh->nh_ifp, 17675 rate_wanted, 17676 RS_PACING_GEQ, 17677 &err, &rack->r_ctl.crte_prev_rate); 17678 if (rack->r_ctl.crte) { 17679 rack->rack_hdrw_pacing = 1; 17680 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size_w_divisor(tp, rate_wanted, segsiz, 17681 pace_one, rack->r_ctl.crte, 17682 NULL, rack->r_ctl.pace_len_divisor); 17683 rack_log_hdwr_pacing(rack, 17684 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 17685 err, 0); 17686 rack->r_ctl.last_hw_bw_req = rate_wanted; 17687 } else { 17688 counter_u64_add(rack_hw_pace_init_fail, 1); 17689 } 17690 } else if (rack->rack_hdrw_pacing && 17691 (rack->r_ctl.last_hw_bw_req != rate_wanted)) { 17692 /* Do we need to adjust our rate? */ 17693 const struct tcp_hwrate_limit_table *nrte; 17694 17695 if (rack->r_up_only && 17696 (rate_wanted < rack->r_ctl.crte->rate)) { 17697 /** 17698 * We have four possible states here 17699 * having to do with the previous time 17700 * and this time. 17701 * previous | this-time 17702 * A) 0 | 0 -- fill_cw not in the picture 17703 * B) 1 | 0 -- we were doing a fill-cw but now are not 17704 * C) 1 | 1 -- all rates from fill_cw 17705 * D) 0 | 1 -- we were doing non-fill and now we are filling 17706 * 17707 * For case A, C and D we don't allow a drop. But for 17708 * case B where we now our on our steady rate we do 17709 * allow a drop. 17710 * 17711 */ 17712 if (!((prev_fill == 1) && (rack->r_via_fill_cw == 0))) 17713 goto done_w_hdwr; 17714 } 17715 if ((rate_wanted > rack->r_ctl.crte->rate) || 17716 (rate_wanted <= rack->r_ctl.crte_prev_rate)) { 17717 if (rack_hw_rate_to_low && 17718 (bw_est < rack_hw_rate_to_low)) { 17719 /* 17720 * The pacing rate is too low for hardware, but 17721 * do allow hardware pacing to be restarted. 17722 */ 17723 rack_log_hdwr_pacing(rack, 17724 bw_est, rack->r_ctl.crte->rate, __LINE__, 17725 0, 5); 17726 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 17727 rack->r_ctl.crte = NULL; 17728 rack->rack_attempt_hdwr_pace = 0; 17729 rack->rack_hdrw_pacing = 0; 17730 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted); 17731 goto done_w_hdwr; 17732 } 17733 nrte = tcp_chg_pacing_rate(rack->r_ctl.crte, 17734 rack->rc_tp, 17735 rack->rc_inp->inp_route.ro_nh->nh_ifp, 17736 rate_wanted, 17737 RS_PACING_GEQ, 17738 &err, &rack->r_ctl.crte_prev_rate); 17739 if (nrte == NULL) { 17740 /* 17741 * Lost the rate, lets drop hardware pacing 17742 * period. 
17743 */ 17744 rack->rack_hdrw_pacing = 0; 17745 rack->r_ctl.crte = NULL; 17746 rack_log_hdwr_pacing(rack, 17747 rate_wanted, 0, __LINE__, 17748 err, 1); 17749 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted); 17750 counter_u64_add(rack_hw_pace_lost, 1); 17751 } else if (nrte != rack->r_ctl.crte) { 17752 rack->r_ctl.crte = nrte; 17753 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size_w_divisor(tp, rate_wanted, 17754 segsiz, pace_one, rack->r_ctl.crte, 17755 NULL, rack->r_ctl.pace_len_divisor); 17756 rack_log_hdwr_pacing(rack, 17757 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 17758 err, 2); 17759 rack->r_ctl.last_hw_bw_req = rate_wanted; 17760 } 17761 } else { 17762 /* We just need to adjust the segment size */ 17763 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted); 17764 rack_log_hdwr_pacing(rack, 17765 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 17766 0, 4); 17767 rack->r_ctl.last_hw_bw_req = rate_wanted; 17768 } 17769 } 17770 } 17771 done_w_hdwr: 17772 if (rack_limit_time_with_srtt && 17773 (rack->use_fixed_rate == 0) && 17774 (rack->rack_hdrw_pacing == 0)) { 17775 /* 17776 * Sanity check, we do not allow the pacing delay 17777 * to be longer than the SRTT of the path. If it is 17778 * a slow path, then adding a packet should increase 17779 * the RTT and compensate for this i.e. the srtt will 17780 * be greater so the allowed pacing time will be greater. 17781 * 17782 * Note this restriction is not for where a peak rate 17783 * is set, we are doing fixed pacing or hardware pacing. 17784 */ 17785 if (rack->rc_tp->t_srtt) 17786 srtt = rack->rc_tp->t_srtt; 17787 else 17788 srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC; /* its in ms convert */ 17789 if (srtt < (uint64_t)slot) { 17790 rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL, 0); 17791 slot = srtt; 17792 } 17793 } 17794 /*******************************************************************/ 17795 /* RRS: We insert paced call to stats here for len and rate_wanted */ 17796 /*******************************************************************/ 17797 rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm, 0); 17798 } 17799 if (rack->r_ctl.crte && (rack->r_ctl.crte->rs_num_enobufs > 0)) { 17800 /* 17801 * If this rate is seeing enobufs when it 17802 * goes to send then either the nic is out 17803 * of gas or we are mis-estimating the time 17804 * somehow and not letting the queue empty 17805 * completely. Lets add to the pacing time. 17806 */ 17807 int hw_boost_delay; 17808 17809 hw_boost_delay = rack->r_ctl.crte->time_between * rack_enobuf_hw_boost_mult; 17810 if (hw_boost_delay > rack_enobuf_hw_max) 17811 hw_boost_delay = rack_enobuf_hw_max; 17812 else if (hw_boost_delay < rack_enobuf_hw_min) 17813 hw_boost_delay = rack_enobuf_hw_min; 17814 slot += hw_boost_delay; 17815 } 17816 return (slot); 17817 } 17818 17819 static void 17820 rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack, 17821 tcp_seq startseq, uint32_t sb_offset) 17822 { 17823 struct rack_sendmap *my_rsm = NULL; 17824 17825 if (tp->t_state < TCPS_ESTABLISHED) { 17826 /* 17827 * We don't start any measurements if we are 17828 * not at least established. 17829 */ 17830 return; 17831 } 17832 if (tp->t_state >= TCPS_FIN_WAIT_1) { 17833 /* 17834 * We will get no more data into the SB 17835 * this means we need to have the data available 17836 * before we start a measurement. 
17837 */ 17838 17839 if (sbavail(&tptosocket(tp)->so_snd) < 17840 max(rc_init_window(rack), 17841 (MIN_GP_WIN * ctf_fixed_maxseg(tp)))) { 17842 /* Nope, not enough data */ 17843 return; 17844 } 17845 } 17846 tp->t_flags |= TF_GPUTINPROG; 17847 rack->r_ctl.rc_gp_cumack_ts = 0; 17848 rack->r_ctl.rc_gp_lowrtt = 0xffffffff; 17849 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 17850 tp->gput_seq = startseq; 17851 rack->app_limited_needs_set = 0; 17852 if (rack->in_probe_rtt) 17853 rack->measure_saw_probe_rtt = 1; 17854 else if ((rack->measure_saw_probe_rtt) && 17855 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 17856 rack->measure_saw_probe_rtt = 0; 17857 if (rack->rc_gp_filled) 17858 tp->gput_ts = rack->r_ctl.last_cumack_advance; 17859 else { 17860 /* Special case initial measurement */ 17861 struct timeval tv; 17862 17863 tp->gput_ts = tcp_get_usecs(&tv); 17864 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); 17865 } 17866 /* 17867 * We take a guess out into the future; 17868 * if we have no measurement and no 17869 * initial rate, we measure the first 17870 * initial-window's worth of data to 17871 * speed up getting some GP measurement and 17872 * thus start pacing. 17873 */ 17874 if ((rack->rc_gp_filled == 0) && (rack->r_ctl.init_rate == 0)) { 17875 rack->app_limited_needs_set = 1; 17876 tp->gput_ack = startseq + max(rc_init_window(rack), 17877 (MIN_GP_WIN * ctf_fixed_maxseg(tp))); 17878 rack_log_pacing_delay_calc(rack, 17879 tp->gput_seq, 17880 tp->gput_ack, 17881 0, 17882 tp->gput_ts, 17883 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts), 17884 9, 17885 __LINE__, NULL, 0); 17886 rack_tend_gp_marks(tp, rack); 17887 rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL); 17888 return; 17889 } 17890 if (sb_offset) { 17891 /* 17892 * We are out somewhere in the sb; 17893 * can we use the already outstanding data? 17894 */ 17895 17896 if (rack->r_ctl.rc_app_limited_cnt == 0) { 17897 /* 17898 * Yes, the first one is good and in this case 17899 * the tp->gput_ts is correctly set based on 17900 * the last ack that arrived (no need to 17901 * set things up when an ack comes in). 17902 */ 17903 my_rsm = tqhash_min(rack->r_ctl.tqh); 17904 if ((my_rsm == NULL) || 17905 (my_rsm->r_rtr_cnt != 1)) { 17906 /* retransmission? */ 17907 goto use_latest; 17908 } 17909 } else { 17910 if (rack->r_ctl.rc_first_appl == NULL) { 17911 /* 17912 * If rc_first_appl is NULL 17913 * then the cnt should be 0. 17914 * This is probably an error, maybe 17915 * a KASSERT would be appropriate. 17916 */ 17917 goto use_latest; 17918 } 17919 /* 17920 * If we have a marker pointer to the last one that is 17921 * app limited we can use that, but we need to set 17922 * things up so that when it gets ack'ed we record 17923 * the ack time (if it is not already acked). 17924 */ 17925 rack->app_limited_needs_set = 1; 17926 /* 17927 * We want to get to the rsm that is either 17928 * next with space i.e. over 1 MSS or the one 17929 * after that (after the app-limited).
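 * (If that next rsm spans no more than one MSS we step to the one after it; otherwise gput_seq is set one MSS past its start, as the assignment below shows.)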
17930 */ 17931 my_rsm = tqhash_next(rack->r_ctl.tqh, rack->r_ctl.rc_first_appl); 17932 if (my_rsm) { 17933 if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp)) 17934 /* Have to use the next one */ 17935 my_rsm = tqhash_next(rack->r_ctl.tqh, my_rsm); 17936 else { 17937 /* Use after the first MSS of it is acked */ 17938 tp->gput_seq = my_rsm->r_start + ctf_fixed_maxseg(tp); 17939 goto start_set; 17940 } 17941 } 17942 if ((my_rsm == NULL) || 17943 (my_rsm->r_rtr_cnt != 1)) { 17944 /* 17945 * Either it is a retransmit or 17946 * the last is the app-limited one. 17947 */ 17948 goto use_latest; 17949 } 17950 } 17951 tp->gput_seq = my_rsm->r_start; 17952 start_set: 17953 if (my_rsm->r_flags & RACK_ACKED) { 17954 /* 17955 * This one has been acked; use the arrival ack time 17956 */ 17957 struct rack_sendmap *nrsm; 17958 17959 tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival; 17960 rack->app_limited_needs_set = 0; 17961 /* 17962 * Ok, in this path we need to use the r_end now 17963 * since this guy is the starting ack. 17964 */ 17965 tp->gput_seq = my_rsm->r_end; 17966 /* 17967 * We also need to adjust up the sendtime 17968 * to the send of the next data after my_rsm. 17969 */ 17970 nrsm = tqhash_next(rack->r_ctl.tqh, my_rsm); 17971 if (nrsm != NULL) 17972 my_rsm = nrsm; 17973 else { 17974 /* 17975 * The next has not been sent; that is the 17976 * case for using the latest. 17977 */ 17978 goto use_latest; 17979 } 17980 } 17981 rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[0]; 17982 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 17983 rack->r_ctl.rc_gp_cumack_ts = 0; 17984 if ((rack->r_ctl.cleared_app_ack == 1) && 17985 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.cleared_app_ack_seq))) { 17986 /* 17987 * We just cleared an application limited period 17988 * so the next seq out needs to skip the first 17989 * ack. 17990 */ 17991 rack->app_limited_needs_set = 1; 17992 rack->r_ctl.cleared_app_ack = 0; 17993 } 17994 rack_log_pacing_delay_calc(rack, 17995 tp->gput_seq, 17996 tp->gput_ack, 17997 (uintptr_t)my_rsm, 17998 tp->gput_ts, 17999 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts), 18000 9, 18001 __LINE__, my_rsm, 0); 18002 /* Now lets make sure all are marked as they should be */ 18003 rack_tend_gp_marks(tp, rack); 18004 rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL); 18005 return; 18006 } 18007 18008 use_latest: 18009 /* 18010 * We don't know how long we may have been 18011 * idle or if this is the first-send. Lets 18012 * setup the flag so we will trim off 18013 * the first ack'd data so we get a true 18014 * measurement. 18015 */ 18016 rack->app_limited_needs_set = 1; 18017 tp->gput_ack = startseq + rack_get_measure_window(tp, rack); 18018 rack->r_ctl.rc_gp_cumack_ts = 0; 18019 /* Find this guy so we can pull the send time */ 18020 my_rsm = tqhash_find(rack->r_ctl.tqh, startseq); 18021 if (my_rsm) { 18022 rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[0]; 18023 if (my_rsm->r_flags & RACK_ACKED) { 18024 /* 18025 * Unlikely, since it is probably what was 18026 * just transmitted (but I am paranoid). 18027 */ 18028 tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival; 18029 rack->app_limited_needs_set = 0; 18030 } 18031 if (SEQ_LT(my_rsm->r_start, tp->gput_seq)) { 18032 /* This also is unlikely */ 18033 tp->gput_seq = my_rsm->r_start; 18034 } 18035 } else { 18036 /* 18037 * TSNH unless we have some send-map limit, 18038 * and even at that it should not be hitting 18039 * that limit (we should have stopped sending).
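 * (TSNH is shorthand for "this should not happen". If the lookup does miss we have no send time to borrow, so microuptime() below stands in for the measurement's output timestamp.)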
18040 */ 18041 struct timeval tv; 18042 18043 microuptime(&tv); 18044 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); 18045 } 18046 rack_tend_gp_marks(tp, rack); 18047 rack_log_pacing_delay_calc(rack, 18048 tp->gput_seq, 18049 tp->gput_ack, 18050 (uintptr_t)my_rsm, 18051 tp->gput_ts, 18052 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts), 18053 9, __LINE__, NULL, 0); 18054 rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL); 18055 } 18056 18057 static inline uint32_t 18058 rack_what_can_we_send(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cwnd_to_use, 18059 uint32_t avail, int32_t sb_offset) 18060 { 18061 uint32_t len; 18062 uint32_t sendwin; 18063 18064 if (tp->snd_wnd > cwnd_to_use) 18065 sendwin = cwnd_to_use; 18066 else 18067 sendwin = tp->snd_wnd; 18068 if (ctf_outstanding(tp) >= tp->snd_wnd) { 18069 /* We never want to go over our peers rcv-window */ 18070 len = 0; 18071 } else { 18072 uint32_t flight; 18073 18074 flight = ctf_flight_size(tp, rack->r_ctl.rc_sacked); 18075 if (flight >= sendwin) { 18076 /* 18077 * We have in flight what we are allowed by cwnd (if 18078 * it was rwnd blocking it would have hit above out 18079 * >= tp->snd_wnd). 18080 */ 18081 return (0); 18082 } 18083 len = sendwin - flight; 18084 if ((len + ctf_outstanding(tp)) > tp->snd_wnd) { 18085 /* We would send too much (beyond the rwnd) */ 18086 len = tp->snd_wnd - ctf_outstanding(tp); 18087 } 18088 if ((len + sb_offset) > avail) { 18089 /* 18090 * We don't have that much in the SB, how much is 18091 * there? 18092 */ 18093 len = avail - sb_offset; 18094 } 18095 } 18096 return (len); 18097 } 18098 18099 static void 18100 rack_log_fsb(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t flags, 18101 unsigned ipoptlen, int32_t orig_len, int32_t len, int error, 18102 int rsm_is_null, int optlen, int line, uint16_t mode) 18103 { 18104 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 18105 union tcp_log_stackspecific log; 18106 struct timeval tv; 18107 18108 memset(&log, 0, sizeof(log)); 18109 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 18110 log.u_bbr.flex1 = error; 18111 log.u_bbr.flex2 = flags; 18112 log.u_bbr.flex3 = rsm_is_null; 18113 log.u_bbr.flex4 = ipoptlen; 18114 log.u_bbr.flex5 = tp->rcv_numsacks; 18115 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 18116 log.u_bbr.flex7 = optlen; 18117 log.u_bbr.flex8 = rack->r_fsb_inited; 18118 log.u_bbr.applimited = rack->r_fast_output; 18119 log.u_bbr.bw_inuse = rack_get_bw(rack); 18120 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); 18121 log.u_bbr.cwnd_gain = mode; 18122 log.u_bbr.pkts_out = orig_len; 18123 log.u_bbr.lt_epoch = len; 18124 log.u_bbr.delivered = line; 18125 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 18126 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 18127 tcp_log_event(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_FSB, 0, 18128 len, &log, false, NULL, __func__, __LINE__, &tv); 18129 } 18130 } 18131 18132 18133 static struct mbuf * 18134 rack_fo_base_copym(struct mbuf *the_m, uint32_t the_off, int32_t *plen, 18135 struct rack_fast_send_blk *fsb, 18136 int32_t seglimit, int32_t segsize, int hw_tls) 18137 { 18138 #ifdef KERN_TLS 18139 struct ktls_session *tls, *ntls; 18140 #ifdef INVARIANTS 18141 struct mbuf *start; 18142 #endif 18143 #endif 18144 struct mbuf *m, *n, **np, *smb; 18145 struct mbuf *top; 18146 int32_t off, soff; 18147 int32_t len = *plen; 18148 int32_t fragsize; 18149 int32_t len_cp = 0; 18150 uint32_t mlen, frags; 18151 
18152 soff = off = the_off; 18153 smb = m = the_m; 18154 np = &top; 18155 top = NULL; 18156 #ifdef KERN_TLS 18157 if (hw_tls && (m->m_flags & M_EXTPG)) 18158 tls = m->m_epg_tls; 18159 else 18160 tls = NULL; 18161 #ifdef INVARIANTS 18162 start = m; 18163 #endif 18164 #endif 18165 while (len > 0) { 18166 if (m == NULL) { 18167 *plen = len_cp; 18168 break; 18169 } 18170 #ifdef KERN_TLS 18171 if (hw_tls) { 18172 if (m->m_flags & M_EXTPG) 18173 ntls = m->m_epg_tls; 18174 else 18175 ntls = NULL; 18176 18177 /* 18178 * Avoid mixing TLS records with handshake 18179 * data or TLS records from different 18180 * sessions. 18181 */ 18182 if (tls != ntls) { 18183 MPASS(m != start); 18184 *plen = len_cp; 18185 break; 18186 } 18187 } 18188 #endif 18189 mlen = min(len, m->m_len - off); 18190 if (seglimit) { 18191 /* 18192 * For M_EXTPG mbufs, add 3 segments 18193 * + 1 in case we are crossing page boundaries 18194 * + 2 in case the TLS hdr/trailer are used 18195 * It is cheaper to just add the segments 18196 * than it is to take the cache miss to look 18197 * at the mbuf ext_pgs state in detail. 18198 */ 18199 if (m->m_flags & M_EXTPG) { 18200 fragsize = min(segsize, PAGE_SIZE); 18201 frags = 3; 18202 } else { 18203 fragsize = segsize; 18204 frags = 0; 18205 } 18206 18207 /* Break if we really can't fit anymore. */ 18208 if ((frags + 1) >= seglimit) { 18209 *plen = len_cp; 18210 break; 18211 } 18212 18213 /* 18214 * Reduce size if you can't copy the whole 18215 * mbuf. If we can't copy the whole mbuf, also 18216 * adjust len so the loop will end after this 18217 * mbuf. 18218 */ 18219 if ((frags + howmany(mlen, fragsize)) >= seglimit) { 18220 mlen = (seglimit - frags - 1) * fragsize; 18221 len = mlen; 18222 *plen = len_cp + len; 18223 } 18224 frags += howmany(mlen, fragsize); 18225 if (frags == 0) 18226 frags++; 18227 seglimit -= frags; 18228 KASSERT(seglimit > 0, 18229 ("%s: seglimit went too low", __func__)); 18230 } 18231 n = m_get(M_NOWAIT, m->m_type); 18232 *np = n; 18233 if (n == NULL) 18234 goto nospace; 18235 n->m_len = mlen; 18236 soff += mlen; 18237 len_cp += n->m_len; 18238 if (m->m_flags & (M_EXT | M_EXTPG)) { 18239 n->m_data = m->m_data + off; 18240 mb_dupcl(n, m); 18241 } else { 18242 bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), 18243 (u_int)n->m_len); 18244 } 18245 len -= n->m_len; 18246 off = 0; 18247 m = m->m_next; 18248 np = &n->m_next; 18249 if (len || (soff == smb->m_len)) { 18250 /* 18251 * We have more so we move forward, or 18252 * we have consumed the entire mbuf and 18253 * len has fallen to 0. 18254 */ 18255 soff = 0; 18256 smb = m; 18257 } 18258 18259 } 18260 if (fsb != NULL) { 18261 fsb->m = smb; 18262 fsb->off = soff; 18263 if (smb) { 18264 /* 18265 * Save off the size of the mbuf. We do 18266 * this so that we can recognize when it 18267 * has been trimmed by sbcut() as acks 18268 * come in. 18269 */ 18270 fsb->o_m_len = smb->m_len; 18271 fsb->o_t_len = M_TRAILINGROOM(smb); 18272 } else { 18273 /* 18274 * This is the case where the next mbuf went to NULL. This 18275 * means with this copy we have sent everything in the sb. 18276 * In theory we could clear the fast_output flag, but let's 18277 * not, since it is possible that we could get more added 18278 * and acks that call the extend function which would let 18279 * us send more.
18280 */ 18281 fsb->o_m_len = 0; 18282 fsb->o_t_len = 0; 18283 } 18284 } 18285 return (top); 18286 nospace: 18287 if (top) 18288 m_freem(top); 18289 return (NULL); 18290 18291 } 18292 18293 /* 18294 * This is a copy of m_copym(), taking the TSO segment size/limit 18295 * constraints into account, and advancing the sndptr as it goes. 18296 */ 18297 static struct mbuf * 18298 rack_fo_m_copym(struct tcp_rack *rack, int32_t *plen, 18299 int32_t seglimit, int32_t segsize, struct mbuf **s_mb, int *s_soff) 18300 { 18301 struct mbuf *m, *n; 18302 int32_t soff; 18303 18304 m = rack->r_ctl.fsb.m; 18305 if (M_TRAILINGROOM(m) != rack->r_ctl.fsb.o_t_len) { 18306 /* 18307 * The trailing space changed, mbufs can grow 18308 * at the tail but they can't shrink from 18309 * it, KASSERT that. Adjust the orig_m_len to 18310 * compensate for this change. 18311 */ 18312 KASSERT((rack->r_ctl.fsb.o_t_len > M_TRAILINGROOM(m)), 18313 ("mbuf:%p rack:%p trailing_space:%jd ots:%u oml:%u mlen:%u\n", 18314 m, 18315 rack, 18316 (intmax_t)M_TRAILINGROOM(m), 18317 rack->r_ctl.fsb.o_t_len, 18318 rack->r_ctl.fsb.o_m_len, 18319 m->m_len)); 18320 rack->r_ctl.fsb.o_m_len += (rack->r_ctl.fsb.o_t_len - M_TRAILINGROOM(m)); 18321 rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(m); 18322 } 18323 if (m->m_len < rack->r_ctl.fsb.o_m_len) { 18324 /* 18325 * Mbuf shrank, trimmed off the top by an ack, our 18326 * offset changes. 18327 */ 18328 KASSERT((rack->r_ctl.fsb.off >= (rack->r_ctl.fsb.o_m_len - m->m_len)), 18329 ("mbuf:%p len:%u rack:%p oml:%u soff:%u\n", 18330 m, m->m_len, 18331 rack, rack->r_ctl.fsb.o_m_len, 18332 rack->r_ctl.fsb.off)); 18333 18334 if (rack->r_ctl.fsb.off >= (rack->r_ctl.fsb.o_m_len- m->m_len)) 18335 rack->r_ctl.fsb.off -= (rack->r_ctl.fsb.o_m_len - m->m_len); 18336 else 18337 rack->r_ctl.fsb.off = 0; 18338 rack->r_ctl.fsb.o_m_len = m->m_len; 18339 #ifdef INVARIANTS 18340 } else if (m->m_len > rack->r_ctl.fsb.o_m_len) { 18341 panic("rack:%p m:%p m_len grew outside of t_space compensation", 18342 rack, m); 18343 #endif 18344 } 18345 soff = rack->r_ctl.fsb.off; 18346 KASSERT(soff >= 0, ("%s, negative off %d", __FUNCTION__, soff)); 18347 KASSERT(*plen >= 0, ("%s, negative len %d", __FUNCTION__, *plen)); 18348 KASSERT(soff < m->m_len, ("%s rack:%p len:%u m:%p m->m_len:%u < off?", 18349 __FUNCTION__, 18350 rack, *plen, m, m->m_len)); 18351 /* Save off the right location before we copy and advance */ 18352 *s_soff = soff; 18353 *s_mb = rack->r_ctl.fsb.m; 18354 n = rack_fo_base_copym(m, soff, plen, 18355 &rack->r_ctl.fsb, 18356 seglimit, segsize, rack->r_ctl.fsb.hw_tls); 18357 return (n); 18358 } 18359 18360 /* Log the buffer level */ 18361 static void 18362 rack_log_queue_level(struct tcpcb *tp, struct tcp_rack *rack, 18363 int len, struct timeval *tv, 18364 uint32_t cts) 18365 { 18366 uint32_t p_rate = 0, p_queue = 0, err = 0; 18367 union tcp_log_stackspecific log; 18368 18369 #ifdef RATELIMIT 18370 err = in_pcbquery_txrlevel(rack->rc_inp, &p_queue); 18371 err = in_pcbquery_txrtlmt(rack->rc_inp, &p_rate); 18372 #endif 18373 memset(&log, 0, sizeof(log)); 18374 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 18375 log.u_bbr.flex1 = p_rate; 18376 log.u_bbr.flex2 = p_queue; 18377 log.u_bbr.flex4 = (uint32_t)rack->r_ctl.crte->using; 18378 log.u_bbr.flex5 = (uint32_t)rack->r_ctl.crte->rs_num_enobufs; 18379 log.u_bbr.flex6 = rack->r_ctl.crte->time_between; 18380 log.u_bbr.flex7 = 99; 18381 log.u_bbr.flex8 = 0; 18382 log.u_bbr.pkts_out = err; 18383 log.u_bbr.delRate = rack->r_ctl.crte->rate; 18384 log.u_bbr.timeStamp = cts; 18385 
log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
18386 tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_HDWR_PACE, 0,
18387 len, &log, false, NULL, __func__, __LINE__, tv);
18388
18389 }
18390
18391 static uint32_t
18392 rack_check_queue_level(struct tcp_rack *rack, struct tcpcb *tp,
18393 struct timeval *tv, uint32_t cts, int len, uint32_t segsiz)
18394 {
18395 uint64_t lentime = 0;
18396 #ifdef RATELIMIT
18397 uint32_t p_rate = 0, p_queue = 0, err;
18398 union tcp_log_stackspecific log;
18399 uint64_t bw;
18400
18401 err = in_pcbquery_txrlevel(rack->rc_inp, &p_queue);
18402 /* Failed or queue is zero */
18403 if (err || (p_queue == 0)) {
18404 lentime = 0;
18405 goto out;
18406 }
18407 err = in_pcbquery_txrtlmt(rack->rc_inp, &p_rate);
18408 if (err) {
18409 lentime = 0;
18410 goto out;
18411 }
18412 /*
18413 * If we reach here we have some bytes in
18414 * the queue. The number returned is a value
18415 * between 0 and 0xffff where ffff is full
18416 * and 0 is empty. So how best to make this into
18417 * something usable?
18418 *
18419 * The "safer" way is to take the b/w we got
18420 * from the query (which should be our b/w rate)
18421 * and pretend that a full send (our rc_pace_max_segs)
18422 * is outstanding. We factor it so it is as if a full
18423 * number of our MSS-sized segments, in terms of full
18424 * ethernet segments, were outstanding.
18425 */
18426 bw = p_rate / 8;
18427 if (bw) {
18428 lentime = (rack->r_ctl.rc_pace_max_segs / segsiz);
18429 lentime *= ETHERNET_SEGMENT_SIZE;
18430 lentime *= (uint64_t)HPTS_USEC_IN_SEC;
18431 lentime /= bw;
18432 } else {
18433 /* TSNH -- KASSERT? */
18434 lentime = 0;
18435 }
18436 out:
18437 if (tcp_bblogging_on(tp)) {
18438 memset(&log, 0, sizeof(log));
18439 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
18440 log.u_bbr.flex1 = p_rate;
18441 log.u_bbr.flex2 = p_queue;
18442 log.u_bbr.flex4 = (uint32_t)rack->r_ctl.crte->using;
18443 log.u_bbr.flex5 = (uint32_t)rack->r_ctl.crte->rs_num_enobufs;
18444 log.u_bbr.flex6 = rack->r_ctl.crte->time_between;
18445 log.u_bbr.flex7 = 99;
18446 log.u_bbr.flex8 = 0;
18447 log.u_bbr.pkts_out = err;
18448 log.u_bbr.delRate = rack->r_ctl.crte->rate;
18449 log.u_bbr.cur_del_rate = lentime;
18450 log.u_bbr.timeStamp = cts;
18451 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
18452 tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_HDWR_PACE, 0,
18453 len, &log, false, NULL, __func__, __LINE__, tv);
18454 }
18455 #endif
18456 return ((uint32_t)lentime);
18457 }
18458
18459 static int
18460 rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm,
18461 uint64_t ts_val, uint32_t cts, uint32_t ms_cts, struct timeval *tv, int len, uint8_t doing_tlp)
18462 {
18463 /*
18464 * Enter the fast retransmit path. We are given that a sched_pin is
18465 * in place (if accounting is compiled in) and the cycle count taken
18466 * at the entry is in ts_val. The concept here is that the rsm
18467 * now holds the mbuf offsets and such, so we can directly transmit
18468 * without a lot of overhead; the len field is already set for
18469 * us to prohibit us from sending too much (usually it's 1 MSS).
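 * As a rough sketch of the idea (my reading, not a normative
 * description): rather than walking the sendmap and socket buffer
 * the way the regular output path does, we clone the data mbufs
 * straight from the offsets recorded in the rsm, prepend a copy of
 * the pre-built IP/TCP header template kept in r_ctl.fsb, and hand
 * the result to ip_output()/ip6_output() directly.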
18470 */
18471 struct ip *ip = NULL;
18472 struct udphdr *udp = NULL;
18473 struct tcphdr *th = NULL;
18474 struct mbuf *m = NULL;
18475 struct inpcb *inp;
18476 uint8_t *cpto;
18477 struct tcp_log_buffer *lgb;
18478 #ifdef TCP_ACCOUNTING
18479 uint64_t crtsc;
18480 int cnt_thru = 1;
18481 #endif
18482 struct tcpopt to;
18483 u_char opt[TCP_MAXOLEN];
18484 uint32_t hdrlen, optlen;
18485 int32_t slot, segsiz, max_val, tso = 0, error = 0, ulen = 0;
18486 uint16_t flags;
18487 uint32_t if_hw_tsomaxsegcount = 0, startseq;
18488 uint32_t if_hw_tsomaxsegsize;
18489 int32_t ip_sendflag = IP_NO_SND_TAG_RL;
18490
18491 #ifdef INET6
18492 struct ip6_hdr *ip6 = NULL;
18493
18494 if (rack->r_is_v6) {
18495 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr;
18496 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
18497 } else
18498 #endif /* INET6 */
18499 {
18500 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr;
18501 hdrlen = sizeof(struct tcpiphdr);
18502 }
18503 if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) {
18504 goto failed;
18505 }
18506 if (doing_tlp) {
18507 /* It's a TLP, add the flag; it may already be there but be sure */
18508 rsm->r_flags |= RACK_TLP;
18509 } else {
18510 /* If it was a TLP it is not one on this retransmit */
18511 rsm->r_flags &= ~RACK_TLP;
18512 }
18513 startseq = rsm->r_start;
18514 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
18515 inp = rack->rc_inp;
18516 to.to_flags = 0;
18517 flags = tcp_outflags[tp->t_state];
18518 if (flags & (TH_SYN|TH_RST)) {
18519 goto failed;
18520 }
18521 if (rsm->r_flags & RACK_HAS_FIN) {
18522 /* We can't send a FIN here */
18523 goto failed;
18524 }
18525 if (flags & TH_FIN) {
18526 /* We never send a FIN */
18527 flags &= ~TH_FIN;
18528 }
18529 if (tp->t_flags & TF_RCVD_TSTMP) {
18530 to.to_tsval = ms_cts + tp->ts_offset;
18531 to.to_tsecr = tp->ts_recent;
18532 to.to_flags = TOF_TS;
18533 }
18534 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
18535 /* TCP-MD5 (RFC2385).
*/ 18536 if (tp->t_flags & TF_SIGNATURE) 18537 to.to_flags |= TOF_SIGNATURE; 18538 #endif 18539 optlen = tcp_addoptions(&to, opt); 18540 hdrlen += optlen; 18541 udp = rack->r_ctl.fsb.udp; 18542 if (udp) 18543 hdrlen += sizeof(struct udphdr); 18544 if (rack->r_ctl.rc_pace_max_segs) 18545 max_val = rack->r_ctl.rc_pace_max_segs; 18546 else if (rack->rc_user_set_max_segs) 18547 max_val = rack->rc_user_set_max_segs * segsiz; 18548 else 18549 max_val = len; 18550 if ((tp->t_flags & TF_TSO) && 18551 V_tcp_do_tso && 18552 (len > segsiz) && 18553 (tp->t_port == 0)) 18554 tso = 1; 18555 #ifdef INET6 18556 if (MHLEN < hdrlen + max_linkhdr) 18557 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 18558 else 18559 #endif 18560 m = m_gethdr(M_NOWAIT, MT_DATA); 18561 if (m == NULL) 18562 goto failed; 18563 m->m_data += max_linkhdr; 18564 m->m_len = hdrlen; 18565 th = rack->r_ctl.fsb.th; 18566 /* Establish the len to send */ 18567 if (len > max_val) 18568 len = max_val; 18569 if ((tso) && (len + optlen > segsiz)) { 18570 uint32_t if_hw_tsomax; 18571 int32_t max_len; 18572 18573 /* extract TSO information */ 18574 if_hw_tsomax = tp->t_tsomax; 18575 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 18576 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 18577 /* 18578 * Check if we should limit by maximum payload 18579 * length: 18580 */ 18581 if (if_hw_tsomax != 0) { 18582 /* compute maximum TSO length */ 18583 max_len = (if_hw_tsomax - hdrlen - 18584 max_linkhdr); 18585 if (max_len <= 0) { 18586 goto failed; 18587 } else if (len > max_len) { 18588 len = max_len; 18589 } 18590 } 18591 if (len <= segsiz) { 18592 /* 18593 * In case there are too many small fragments don't 18594 * use TSO: 18595 */ 18596 tso = 0; 18597 } 18598 } else { 18599 tso = 0; 18600 } 18601 if ((tso == 0) && (len > segsiz)) 18602 len = segsiz; 18603 (void)tcp_get_usecs(tv); 18604 if ((len == 0) || 18605 (len <= MHLEN - hdrlen - max_linkhdr)) { 18606 goto failed; 18607 } 18608 th->th_seq = htonl(rsm->r_start); 18609 th->th_ack = htonl(tp->rcv_nxt); 18610 /* 18611 * The PUSH bit should only be applied 18612 * if the full retransmission is made. If 18613 * we are sending less than this is the 18614 * left hand edge and should not have 18615 * the PUSH bit. 
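 * For example (hypothetical numbers): if this rsm spans 4344 bytes
 * that originally went out with PUSH set, but len was clamped to a
 * single 1448 byte segment above, only the left hand edge is being
 * resent now and PUSH is withheld for this partial retransmission.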
18616 */ 18617 if ((rsm->r_flags & RACK_HAD_PUSH) && 18618 (len == (rsm->r_end - rsm->r_start))) 18619 flags |= TH_PUSH; 18620 th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale)); 18621 if (th->th_win == 0) { 18622 tp->t_sndzerowin++; 18623 tp->t_flags |= TF_RXWIN0SENT; 18624 } else 18625 tp->t_flags &= ~TF_RXWIN0SENT; 18626 if (rsm->r_flags & RACK_TLP) { 18627 /* 18628 * TLP should not count in retran count, but 18629 * in its own bin 18630 */ 18631 counter_u64_add(rack_tlp_retran, 1); 18632 counter_u64_add(rack_tlp_retran_bytes, len); 18633 } else { 18634 tp->t_sndrexmitpack++; 18635 KMOD_TCPSTAT_INC(tcps_sndrexmitpack); 18636 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len); 18637 } 18638 #ifdef STATS 18639 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 18640 len); 18641 #endif 18642 if (rsm->m == NULL) 18643 goto failed; 18644 if (rsm->m && 18645 ((rsm->orig_m_len != rsm->m->m_len) || 18646 (M_TRAILINGROOM(rsm->m) != rsm->orig_t_space))) { 18647 /* Fix up the orig_m_len and possibly the mbuf offset */ 18648 rack_adjust_orig_mlen(rsm); 18649 } 18650 m->m_next = rack_fo_base_copym(rsm->m, rsm->soff, &len, NULL, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, rsm->r_hw_tls); 18651 if (len <= segsiz) { 18652 /* 18653 * Must have ran out of mbufs for the copy 18654 * shorten it to no longer need tso. Lets 18655 * not put on sendalot since we are low on 18656 * mbufs. 18657 */ 18658 tso = 0; 18659 } 18660 if ((m->m_next == NULL) || (len <= 0)){ 18661 goto failed; 18662 } 18663 if (udp) { 18664 if (rack->r_is_v6) 18665 ulen = hdrlen + len - sizeof(struct ip6_hdr); 18666 else 18667 ulen = hdrlen + len - sizeof(struct ip); 18668 udp->uh_ulen = htons(ulen); 18669 } 18670 m->m_pkthdr.rcvif = (struct ifnet *)0; 18671 if (TCPS_HAVERCVDSYN(tp->t_state) && 18672 (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { 18673 int ect = tcp_ecn_output_established(tp, &flags, len, true); 18674 if ((tp->t_state == TCPS_SYN_RECEIVED) && 18675 (tp->t_flags2 & TF2_ECN_SND_ECE)) 18676 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 18677 #ifdef INET6 18678 if (rack->r_is_v6) { 18679 ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20); 18680 ip6->ip6_flow |= htonl(ect << 20); 18681 } 18682 else 18683 #endif 18684 { 18685 ip->ip_tos &= ~IPTOS_ECN_MASK; 18686 ip->ip_tos |= ect; 18687 } 18688 } 18689 if (rack->r_ctl.crte != NULL) { 18690 /* See if we can send via the hw queue */ 18691 slot = rack_check_queue_level(rack, tp, tv, cts, len, segsiz); 18692 /* If there is nothing in queue (no pacing time) we can send via the hw queue */ 18693 if (slot == 0) 18694 ip_sendflag = 0; 18695 } 18696 tcp_set_flags(th, flags); 18697 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 18698 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 18699 if (to.to_flags & TOF_SIGNATURE) { 18700 /* 18701 * Calculate MD5 signature and put it into the place 18702 * determined before. 18703 * NOTE: since TCP options buffer doesn't point into 18704 * mbuf's data, calculate offset and use it. 18705 */ 18706 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, 18707 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { 18708 /* 18709 * Do not send segment if the calculation of MD5 18710 * digest has failed. 
18711 */ 18712 goto failed; 18713 } 18714 } 18715 #endif 18716 #ifdef INET6 18717 if (rack->r_is_v6) { 18718 if (tp->t_port) { 18719 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 18720 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 18721 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 18722 th->th_sum = htons(0); 18723 UDPSTAT_INC(udps_opackets); 18724 } else { 18725 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 18726 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 18727 th->th_sum = in6_cksum_pseudo(ip6, 18728 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 18729 0); 18730 } 18731 } 18732 #endif 18733 #if defined(INET6) && defined(INET) 18734 else 18735 #endif 18736 #ifdef INET 18737 { 18738 if (tp->t_port) { 18739 m->m_pkthdr.csum_flags = CSUM_UDP; 18740 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 18741 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 18742 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 18743 th->th_sum = htons(0); 18744 UDPSTAT_INC(udps_opackets); 18745 } else { 18746 m->m_pkthdr.csum_flags = CSUM_TCP; 18747 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 18748 th->th_sum = in_pseudo(ip->ip_src.s_addr, 18749 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 18750 IPPROTO_TCP + len + optlen)); 18751 } 18752 /* IP version must be set here for ipv4/ipv6 checking later */ 18753 KASSERT(ip->ip_v == IPVERSION, 18754 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 18755 } 18756 #endif 18757 if (tso) { 18758 /* 18759 * Here we use segsiz since we have no added options besides 18760 * any standard timestamp options (no DSACKs or SACKS are sent 18761 * via either fast-path). 18762 */ 18763 KASSERT(len > segsiz, 18764 ("%s: len <= tso_segsz tp:%p", __func__, tp)); 18765 m->m_pkthdr.csum_flags |= CSUM_TSO; 18766 m->m_pkthdr.tso_segsz = segsiz; 18767 } 18768 #ifdef INET6 18769 if (rack->r_is_v6) { 18770 ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit; 18771 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 18772 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 18773 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 18774 else 18775 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 18776 } 18777 #endif 18778 #if defined(INET) && defined(INET6) 18779 else 18780 #endif 18781 #ifdef INET 18782 { 18783 ip->ip_len = htons(m->m_pkthdr.len); 18784 ip->ip_ttl = rack->r_ctl.fsb.hoplimit; 18785 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 18786 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 18787 if (tp->t_port == 0 || len < V_tcp_minmss) { 18788 ip->ip_off |= htons(IP_DF); 18789 } 18790 } else { 18791 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 18792 } 18793 } 18794 #endif 18795 if (doing_tlp == 0) { 18796 /* Set we retransmitted */ 18797 rack->rc_gp_saw_rec = 1; 18798 } else { 18799 /* Its a TLP set ca or ss */ 18800 if (tp->snd_cwnd > tp->snd_ssthresh) { 18801 /* Set we sent in CA */ 18802 rack->rc_gp_saw_ca = 1; 18803 } else { 18804 /* Set we sent in SS */ 18805 rack->rc_gp_saw_ss = 1; 18806 } 18807 } 18808 /* Time to copy in our header */ 18809 cpto = mtod(m, uint8_t *); 18810 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); 18811 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr)); 18812 if (optlen) { 18813 bcopy(opt, th + 1, optlen); 18814 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 18815 } else { 18816 th->th_off = sizeof(struct tcphdr) >> 2; 18817 } 18818 if (tcp_bblogging_on(rack->rc_tp)) { 18819 union tcp_log_stackspecific log; 18820 18821 if (rsm->r_flags & RACK_RWND_COLLAPSED) { 18822 
rack_log_collapse(rack, rsm->r_start, rsm->r_end, 0, __LINE__, 5, rsm->r_flags, rsm); 18823 counter_u64_add(rack_collapsed_win_rxt, 1); 18824 counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start)); 18825 } 18826 memset(&log, 0, sizeof(log)); 18827 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 18828 if (rack->rack_no_prr) 18829 log.u_bbr.flex1 = 0; 18830 else 18831 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 18832 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 18833 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 18834 log.u_bbr.flex4 = max_val; 18835 /* Save off the early/late values */ 18836 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 18837 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 18838 log.u_bbr.bw_inuse = rack_get_bw(rack); 18839 log.u_bbr.cur_del_rate = rack->r_ctl.gp_bw; 18840 if (doing_tlp == 0) 18841 log.u_bbr.flex8 = 1; 18842 else 18843 log.u_bbr.flex8 = 2; 18844 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); 18845 log.u_bbr.flex7 = 55; 18846 log.u_bbr.pkts_out = tp->t_maxseg; 18847 log.u_bbr.timeStamp = cts; 18848 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 18849 if (rsm->r_rtr_cnt > 0) { 18850 /* 18851 * When we have a retransmit we want to log the 18852 * burst at send and flight at send from before. 18853 */ 18854 log.u_bbr.flex5 = rsm->r_fas; 18855 log.u_bbr.bbr_substate = rsm->r_bas; 18856 } else { 18857 /* 18858 * This is currently unlikely until we do the 18859 * packet pair probes but I will add it for completeness. 18860 */ 18861 log.u_bbr.flex5 = log.u_bbr.inflight; 18862 log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz); 18863 } 18864 log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use; 18865 log.u_bbr.delivered = 0; 18866 log.u_bbr.rttProp = (uintptr_t)rsm; 18867 log.u_bbr.delRate = rsm->r_flags; 18868 log.u_bbr.delRate <<= 31; 18869 log.u_bbr.delRate |= rack->r_must_retran; 18870 log.u_bbr.delRate <<= 1; 18871 log.u_bbr.delRate |= 1; 18872 log.u_bbr.pkt_epoch = __LINE__; 18873 lgb = tcp_log_event(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, 18874 len, &log, false, NULL, __func__, __LINE__, tv); 18875 } else 18876 lgb = NULL; 18877 if ((rack->r_ctl.crte != NULL) && 18878 tcp_bblogging_on(tp)) { 18879 rack_log_queue_level(tp, rack, len, tv, cts); 18880 } 18881 #ifdef INET6 18882 if (rack->r_is_v6) { 18883 error = ip6_output(m, inp->in6p_outputopts, 18884 &inp->inp_route6, 18885 ip_sendflag, NULL, NULL, inp); 18886 } 18887 else 18888 #endif 18889 #ifdef INET 18890 { 18891 error = ip_output(m, NULL, 18892 &inp->inp_route, 18893 ip_sendflag, 0, inp); 18894 } 18895 #endif 18896 m = NULL; 18897 if (lgb) { 18898 lgb->tlb_errno = error; 18899 lgb = NULL; 18900 } 18901 /* Move snd_nxt to snd_max so we don't have false retransmissions */ 18902 tp->snd_nxt = tp->snd_max; 18903 if (error) { 18904 goto failed; 18905 } else if (rack->rc_hw_nobuf && (ip_sendflag != IP_NO_SND_TAG_RL)) { 18906 rack->rc_hw_nobuf = 0; 18907 rack->r_ctl.rc_agg_delayed = 0; 18908 rack->r_early = 0; 18909 rack->r_late = 0; 18910 rack->r_ctl.rc_agg_early = 0; 18911 } 18912 rack_log_output(tp, &to, len, rsm->r_start, flags, error, rack_to_usec_ts(tv), 18913 rsm, RACK_SENT_FP, rsm->m, rsm->soff, rsm->r_hw_tls, segsiz); 18914 if (doing_tlp) { 18915 rack->rc_tlp_in_progress = 1; 18916 rack->r_ctl.rc_tlp_cnt_out++; 18917 } 18918 if (error == 0) { 18919 counter_u64_add(rack_total_bytes, len); 18920 tcp_account_for_send(tp, len, 1, doing_tlp, rsm->r_hw_tls); 18921 if (doing_tlp) { 18922 rack->rc_last_sent_tlp_past_cumack = 0; 18923 
rack->rc_last_sent_tlp_seq_valid = 1; 18924 rack->r_ctl.last_sent_tlp_seq = rsm->r_start; 18925 rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start; 18926 } 18927 if (rack->r_ctl.rc_prr_sndcnt >= len) 18928 rack->r_ctl.rc_prr_sndcnt -= len; 18929 else 18930 rack->r_ctl.rc_prr_sndcnt = 0; 18931 } 18932 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 18933 rack->forced_ack = 0; /* If we send something zap the FA flag */ 18934 if (IN_FASTRECOVERY(tp->t_flags) && rsm) 18935 rack->r_ctl.retran_during_recovery += len; 18936 { 18937 int idx; 18938 18939 idx = (len / segsiz) + 3; 18940 if (idx >= TCP_MSS_ACCT_ATIMER) 18941 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 18942 else 18943 counter_u64_add(rack_out_size[idx], 1); 18944 } 18945 if (tp->t_rtttime == 0) { 18946 tp->t_rtttime = ticks; 18947 tp->t_rtseq = startseq; 18948 KMOD_TCPSTAT_INC(tcps_segstimed); 18949 } 18950 counter_u64_add(rack_fto_rsm_send, 1); 18951 if (error && (error == ENOBUFS)) { 18952 if (rack->r_ctl.crte != NULL) { 18953 tcp_trace_point(rack->rc_tp, TCP_TP_HWENOBUF); 18954 if (tcp_bblogging_on(rack->rc_tp)) 18955 rack_log_queue_level(tp, rack, len, tv, cts); 18956 } else 18957 tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF); 18958 slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); 18959 if (rack->rc_enobuf < 0x7f) 18960 rack->rc_enobuf++; 18961 if (slot < (10 * HPTS_USEC_IN_MSEC)) 18962 slot = 10 * HPTS_USEC_IN_MSEC; 18963 if (rack->r_ctl.crte != NULL) { 18964 counter_u64_add(rack_saw_enobuf_hw, 1); 18965 tcp_rl_log_enobuf(rack->r_ctl.crte); 18966 } 18967 counter_u64_add(rack_saw_enobuf, 1); 18968 } else { 18969 slot = rack_get_pacing_delay(rack, tp, len, NULL, segsiz, __LINE__); 18970 } 18971 rack_start_hpts_timer(rack, tp, cts, slot, len, 0); 18972 #ifdef TCP_ACCOUNTING 18973 crtsc = get_cyclecount(); 18974 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18975 tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru; 18976 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); 18977 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((len + segsiz - 1) / segsiz); 18978 } 18979 sched_unpin(); 18980 #endif 18981 return (0); 18982 failed: 18983 if (m) 18984 m_free(m); 18985 return (-1); 18986 } 18987 18988 static void 18989 rack_sndbuf_autoscale(struct tcp_rack *rack) 18990 { 18991 /* 18992 * Automatic sizing of send socket buffer. Often the send buffer 18993 * size is not optimally adjusted to the actual network conditions 18994 * at hand (delay bandwidth product). Setting the buffer size too 18995 * small limits throughput on links with high bandwidth and high 18996 * delay (eg. trans-continental/oceanic links). Setting the 18997 * buffer size too big consumes too much real kernel memory, 18998 * especially with many connections on busy servers. 18999 * 19000 * The criteria to step up the send buffer one notch are: 19001 * 1. receive window of remote host is larger than send buffer 19002 * (with a fudge factor of 5/4th); 19003 * 2. send buffer is filled to 7/8th with data (so we actually 19004 * have data to make use of it); 19005 * 3. send buffer fill has not hit maximal automatic size; 19006 * 4. our send window (slow start and cogestion controlled) is 19007 * larger than sent but unacknowledged data in send buffer. 19008 * 19009 * Note that the rack version moves things much faster since 19010 * we want to avoid hitting cache lines in the rack_fast_output() 19011 * path so this is called much less often and thus moves 19012 * the SB forward by a percentage. 
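 * To illustrate the step with hypothetical values: with
 * rack_autosndbuf_inc at 20 (percent) and sb_hiwat at 65536 bytes,
 * the proposal below is 65536 + (20 * 65536) / 100 = 78643 bytes
 * (assuming that increment is not below V_tcp_autosndbuf_inc), and
 * the result is still clamped to V_tcp_autosndbuf_max before
 * sbreserve_locked() is asked to grow the buffer.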
19013 */ 19014 struct socket *so; 19015 struct tcpcb *tp; 19016 uint32_t sendwin, scaleup; 19017 19018 tp = rack->rc_tp; 19019 so = rack->rc_inp->inp_socket; 19020 sendwin = min(rack->r_ctl.cwnd_to_use, tp->snd_wnd); 19021 if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { 19022 if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat && 19023 sbused(&so->so_snd) >= 19024 (so->so_snd.sb_hiwat / 8 * 7) && 19025 sbused(&so->so_snd) < V_tcp_autosndbuf_max && 19026 sendwin >= (sbused(&so->so_snd) - 19027 (tp->snd_max - tp->snd_una))) { 19028 if (rack_autosndbuf_inc) 19029 scaleup = (rack_autosndbuf_inc * so->so_snd.sb_hiwat) / 100; 19030 else 19031 scaleup = V_tcp_autosndbuf_inc; 19032 if (scaleup < V_tcp_autosndbuf_inc) 19033 scaleup = V_tcp_autosndbuf_inc; 19034 scaleup += so->so_snd.sb_hiwat; 19035 if (scaleup > V_tcp_autosndbuf_max) 19036 scaleup = V_tcp_autosndbuf_max; 19037 if (!sbreserve_locked(so, SO_SND, scaleup, curthread)) 19038 so->so_snd.sb_flags &= ~SB_AUTOSIZE; 19039 } 19040 } 19041 } 19042 19043 static int 19044 rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val, 19045 uint32_t cts, uint32_t ms_cts, struct timeval *tv, long *tot_len, int *send_err, int line) 19046 { 19047 /* 19048 * Enter to do fast output. We are given that the sched_pin is 19049 * in place (if accounting is compiled in) and the cycle count taken 19050 * at entry is in place in ts_val. The idea here is that 19051 * we know how many more bytes needs to be sent (presumably either 19052 * during pacing or to fill the cwnd and that was greater than 19053 * the max-burst). We have how much to send and all the info we 19054 * need to just send. 19055 */ 19056 #ifdef INET 19057 struct ip *ip = NULL; 19058 #endif 19059 struct udphdr *udp = NULL; 19060 struct tcphdr *th = NULL; 19061 struct mbuf *m, *s_mb; 19062 struct inpcb *inp; 19063 uint8_t *cpto; 19064 struct tcp_log_buffer *lgb; 19065 #ifdef TCP_ACCOUNTING 19066 uint64_t crtsc; 19067 #endif 19068 struct tcpopt to; 19069 u_char opt[TCP_MAXOLEN]; 19070 uint32_t hdrlen, optlen; 19071 #ifdef TCP_ACCOUNTING 19072 int cnt_thru = 1; 19073 #endif 19074 int32_t slot, segsiz, len, max_val, tso = 0, sb_offset, error, ulen = 0; 19075 uint16_t flags; 19076 uint32_t s_soff; 19077 uint32_t if_hw_tsomaxsegcount = 0, startseq; 19078 uint32_t if_hw_tsomaxsegsize; 19079 uint32_t add_flag = RACK_SENT_FP; 19080 #ifdef INET6 19081 struct ip6_hdr *ip6 = NULL; 19082 19083 if (rack->r_is_v6) { 19084 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 19085 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 19086 } else 19087 #endif /* INET6 */ 19088 { 19089 #ifdef INET 19090 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 19091 hdrlen = sizeof(struct tcpiphdr); 19092 #endif 19093 } 19094 if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) { 19095 m = NULL; 19096 goto failed; 19097 } 19098 rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 19099 startseq = tp->snd_max; 19100 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 19101 inp = rack->rc_inp; 19102 len = rack->r_ctl.fsb.left_to_send; 19103 to.to_flags = 0; 19104 flags = rack->r_ctl.fsb.tcp_flags; 19105 if (tp->t_flags & TF_RCVD_TSTMP) { 19106 to.to_tsval = ms_cts + tp->ts_offset; 19107 to.to_tsecr = tp->ts_recent; 19108 to.to_flags = TOF_TS; 19109 } 19110 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 19111 /* TCP-MD5 (RFC2385). 
*/ 19112 if (tp->t_flags & TF_SIGNATURE) 19113 to.to_flags |= TOF_SIGNATURE; 19114 #endif 19115 optlen = tcp_addoptions(&to, opt); 19116 hdrlen += optlen; 19117 udp = rack->r_ctl.fsb.udp; 19118 if (udp) 19119 hdrlen += sizeof(struct udphdr); 19120 if (rack->r_ctl.rc_pace_max_segs) 19121 max_val = rack->r_ctl.rc_pace_max_segs; 19122 else if (rack->rc_user_set_max_segs) 19123 max_val = rack->rc_user_set_max_segs * segsiz; 19124 else 19125 max_val = len; 19126 if ((tp->t_flags & TF_TSO) && 19127 V_tcp_do_tso && 19128 (len > segsiz) && 19129 (tp->t_port == 0)) 19130 tso = 1; 19131 again: 19132 #ifdef INET6 19133 if (MHLEN < hdrlen + max_linkhdr) 19134 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 19135 else 19136 #endif 19137 m = m_gethdr(M_NOWAIT, MT_DATA); 19138 if (m == NULL) 19139 goto failed; 19140 m->m_data += max_linkhdr; 19141 m->m_len = hdrlen; 19142 th = rack->r_ctl.fsb.th; 19143 /* Establish the len to send */ 19144 if (len > max_val) 19145 len = max_val; 19146 if ((tso) && (len + optlen > segsiz)) { 19147 uint32_t if_hw_tsomax; 19148 int32_t max_len; 19149 19150 /* extract TSO information */ 19151 if_hw_tsomax = tp->t_tsomax; 19152 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 19153 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 19154 /* 19155 * Check if we should limit by maximum payload 19156 * length: 19157 */ 19158 if (if_hw_tsomax != 0) { 19159 /* compute maximum TSO length */ 19160 max_len = (if_hw_tsomax - hdrlen - 19161 max_linkhdr); 19162 if (max_len <= 0) { 19163 goto failed; 19164 } else if (len > max_len) { 19165 len = max_len; 19166 } 19167 } 19168 if (len <= segsiz) { 19169 /* 19170 * In case there are too many small fragments don't 19171 * use TSO: 19172 */ 19173 tso = 0; 19174 } 19175 } else { 19176 tso = 0; 19177 } 19178 if ((tso == 0) && (len > segsiz)) 19179 len = segsiz; 19180 (void)tcp_get_usecs(tv); 19181 if ((len == 0) || 19182 (len <= MHLEN - hdrlen - max_linkhdr)) { 19183 goto failed; 19184 } 19185 sb_offset = tp->snd_max - tp->snd_una; 19186 th->th_seq = htonl(tp->snd_max); 19187 th->th_ack = htonl(tp->rcv_nxt); 19188 th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale)); 19189 if (th->th_win == 0) { 19190 tp->t_sndzerowin++; 19191 tp->t_flags |= TF_RXWIN0SENT; 19192 } else 19193 tp->t_flags &= ~TF_RXWIN0SENT; 19194 tp->snd_up = tp->snd_una; /* drag it along, its deprecated */ 19195 KMOD_TCPSTAT_INC(tcps_sndpack); 19196 KMOD_TCPSTAT_ADD(tcps_sndbyte, len); 19197 #ifdef STATS 19198 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 19199 len); 19200 #endif 19201 if (rack->r_ctl.fsb.m == NULL) 19202 goto failed; 19203 19204 /* s_mb and s_soff are saved for rack_log_output */ 19205 m->m_next = rack_fo_m_copym(rack, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, 19206 &s_mb, &s_soff); 19207 if (len <= segsiz) { 19208 /* 19209 * Must have ran out of mbufs for the copy 19210 * shorten it to no longer need tso. Lets 19211 * not put on sendalot since we are low on 19212 * mbufs. 
19213 */ 19214 tso = 0; 19215 } 19216 if (rack->r_ctl.fsb.rfo_apply_push && 19217 (len == rack->r_ctl.fsb.left_to_send)) { 19218 flags |= TH_PUSH; 19219 add_flag |= RACK_HAD_PUSH; 19220 } 19221 if ((m->m_next == NULL) || (len <= 0)){ 19222 goto failed; 19223 } 19224 if (udp) { 19225 if (rack->r_is_v6) 19226 ulen = hdrlen + len - sizeof(struct ip6_hdr); 19227 else 19228 ulen = hdrlen + len - sizeof(struct ip); 19229 udp->uh_ulen = htons(ulen); 19230 } 19231 m->m_pkthdr.rcvif = (struct ifnet *)0; 19232 if (TCPS_HAVERCVDSYN(tp->t_state) && 19233 (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { 19234 int ect = tcp_ecn_output_established(tp, &flags, len, false); 19235 if ((tp->t_state == TCPS_SYN_RECEIVED) && 19236 (tp->t_flags2 & TF2_ECN_SND_ECE)) 19237 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 19238 #ifdef INET6 19239 if (rack->r_is_v6) { 19240 ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20); 19241 ip6->ip6_flow |= htonl(ect << 20); 19242 } 19243 else 19244 #endif 19245 { 19246 #ifdef INET 19247 ip->ip_tos &= ~IPTOS_ECN_MASK; 19248 ip->ip_tos |= ect; 19249 #endif 19250 } 19251 } 19252 tcp_set_flags(th, flags); 19253 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 19254 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 19255 if (to.to_flags & TOF_SIGNATURE) { 19256 /* 19257 * Calculate MD5 signature and put it into the place 19258 * determined before. 19259 * NOTE: since TCP options buffer doesn't point into 19260 * mbuf's data, calculate offset and use it. 19261 */ 19262 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, 19263 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { 19264 /* 19265 * Do not send segment if the calculation of MD5 19266 * digest has failed. 19267 */ 19268 goto failed; 19269 } 19270 } 19271 #endif 19272 #ifdef INET6 19273 if (rack->r_is_v6) { 19274 if (tp->t_port) { 19275 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 19276 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 19277 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 19278 th->th_sum = htons(0); 19279 UDPSTAT_INC(udps_opackets); 19280 } else { 19281 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 19282 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 19283 th->th_sum = in6_cksum_pseudo(ip6, 19284 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 19285 0); 19286 } 19287 } 19288 #endif 19289 #if defined(INET6) && defined(INET) 19290 else 19291 #endif 19292 #ifdef INET 19293 { 19294 if (tp->t_port) { 19295 m->m_pkthdr.csum_flags = CSUM_UDP; 19296 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 19297 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 19298 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 19299 th->th_sum = htons(0); 19300 UDPSTAT_INC(udps_opackets); 19301 } else { 19302 m->m_pkthdr.csum_flags = CSUM_TCP; 19303 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 19304 th->th_sum = in_pseudo(ip->ip_src.s_addr, 19305 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 19306 IPPROTO_TCP + len + optlen)); 19307 } 19308 /* IP version must be set here for ipv4/ipv6 checking later */ 19309 KASSERT(ip->ip_v == IPVERSION, 19310 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 19311 } 19312 #endif 19313 if (tso) { 19314 /* 19315 * Here we use segsiz since we have no added options besides 19316 * any standard timestamp options (no DSACKs or SACKS are sent 19317 * via either fast-path). 
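 * For example (hypothetical numbers): with a 1448 byte segsiz and
 * only the fixed 12 byte timestamp option in use, every segment the
 * hardware carves out of this TSO burst carries the same option
 * space, so tso_segsz can simply be segsiz with no per-segment
 * adjustment.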
19318 */ 19319 KASSERT(len > segsiz, 19320 ("%s: len <= tso_segsz tp:%p", __func__, tp)); 19321 m->m_pkthdr.csum_flags |= CSUM_TSO; 19322 m->m_pkthdr.tso_segsz = segsiz; 19323 } 19324 #ifdef INET6 19325 if (rack->r_is_v6) { 19326 ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit; 19327 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 19328 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 19329 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 19330 else 19331 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 19332 } 19333 #endif 19334 #if defined(INET) && defined(INET6) 19335 else 19336 #endif 19337 #ifdef INET 19338 { 19339 ip->ip_len = htons(m->m_pkthdr.len); 19340 ip->ip_ttl = rack->r_ctl.fsb.hoplimit; 19341 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 19342 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 19343 if (tp->t_port == 0 || len < V_tcp_minmss) { 19344 ip->ip_off |= htons(IP_DF); 19345 } 19346 } else { 19347 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 19348 } 19349 } 19350 #endif 19351 if (tp->snd_cwnd > tp->snd_ssthresh) { 19352 /* Set we sent in CA */ 19353 rack->rc_gp_saw_ca = 1; 19354 } else { 19355 /* Set we sent in SS */ 19356 rack->rc_gp_saw_ss = 1; 19357 } 19358 /* Time to copy in our header */ 19359 cpto = mtod(m, uint8_t *); 19360 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); 19361 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr)); 19362 if (optlen) { 19363 bcopy(opt, th + 1, optlen); 19364 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 19365 } else { 19366 th->th_off = sizeof(struct tcphdr) >> 2; 19367 } 19368 if ((rack->r_ctl.crte != NULL) && 19369 tcp_bblogging_on(tp)) { 19370 rack_log_queue_level(tp, rack, len, tv, cts); 19371 } 19372 if (tcp_bblogging_on(rack->rc_tp)) { 19373 union tcp_log_stackspecific log; 19374 19375 memset(&log, 0, sizeof(log)); 19376 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 19377 if (rack->rack_no_prr) 19378 log.u_bbr.flex1 = 0; 19379 else 19380 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 19381 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 19382 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 19383 log.u_bbr.flex4 = max_val; 19384 /* Save off the early/late values */ 19385 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 19386 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 19387 log.u_bbr.bw_inuse = rack_get_bw(rack); 19388 log.u_bbr.cur_del_rate = rack->r_ctl.gp_bw; 19389 log.u_bbr.flex8 = 0; 19390 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); 19391 log.u_bbr.flex7 = 44; 19392 log.u_bbr.pkts_out = tp->t_maxseg; 19393 log.u_bbr.timeStamp = cts; 19394 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 19395 log.u_bbr.flex5 = log.u_bbr.inflight; 19396 log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use; 19397 log.u_bbr.delivered = rack->r_ctl.fsb.left_to_send; 19398 log.u_bbr.rttProp = 0; 19399 log.u_bbr.delRate = rack->r_must_retran; 19400 log.u_bbr.delRate <<= 1; 19401 log.u_bbr.pkt_epoch = line; 19402 /* For fast output no retrans so just inflight and how many mss we send */ 19403 log.u_bbr.flex5 = log.u_bbr.inflight; 19404 log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz); 19405 lgb = tcp_log_event(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, 19406 len, &log, false, NULL, __func__, __LINE__, tv); 19407 } else 19408 lgb = NULL; 19409 #ifdef INET6 19410 if (rack->r_is_v6) { 19411 error = ip6_output(m, inp->in6p_outputopts, 19412 &inp->inp_route6, 19413 0, NULL, NULL, inp); 19414 } 19415 #endif 19416 #if defined(INET) && defined(INET6) 19417 else 19418 #endif 
19419 #ifdef INET 19420 { 19421 error = ip_output(m, NULL, 19422 &inp->inp_route, 19423 0, 0, inp); 19424 } 19425 #endif 19426 if (lgb) { 19427 lgb->tlb_errno = error; 19428 lgb = NULL; 19429 } 19430 if (error) { 19431 *send_err = error; 19432 m = NULL; 19433 goto failed; 19434 } else if (rack->rc_hw_nobuf) { 19435 rack->rc_hw_nobuf = 0; 19436 rack->r_ctl.rc_agg_delayed = 0; 19437 rack->r_early = 0; 19438 rack->r_late = 0; 19439 rack->r_ctl.rc_agg_early = 0; 19440 } 19441 if ((error == 0) && (rack->lt_bw_up == 0)) { 19442 /* Unlikely */ 19443 rack->r_ctl.lt_timemark = tcp_tv_to_lusec(tv); 19444 rack->r_ctl.lt_seq = tp->snd_una; 19445 rack->lt_bw_up = 1; 19446 } else if ((error == 0) && 19447 (((tp->snd_max + len) - rack->r_ctl.lt_seq) > 0x7fffffff)) { 19448 /* 19449 * Need to record what we have since we are 19450 * approaching seq wrap. 19451 */ 19452 struct timeval tv; 19453 uint64_t tmark; 19454 19455 rack->r_ctl.lt_bw_bytes += (tp->snd_una - rack->r_ctl.lt_seq); 19456 rack->r_ctl.lt_seq = tp->snd_una; 19457 tmark = tcp_get_u64_usecs(&tv); 19458 if (tmark > rack->r_ctl.lt_timemark) { 19459 rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark); 19460 rack->r_ctl.lt_timemark = tmark; 19461 } 19462 } 19463 rack_log_output(tp, &to, len, tp->snd_max, flags, error, rack_to_usec_ts(tv), 19464 NULL, add_flag, s_mb, s_soff, rack->r_ctl.fsb.hw_tls, segsiz); 19465 if (tp->snd_una == tp->snd_max) { 19466 rack->r_ctl.rc_tlp_rxt_last_time = cts; 19467 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 19468 tp->t_acktime = ticks; 19469 } 19470 counter_u64_add(rack_total_bytes, len); 19471 tcp_account_for_send(tp, len, 0, 0, rack->r_ctl.fsb.hw_tls); 19472 19473 rack->forced_ack = 0; /* If we send something zap the FA flag */ 19474 *tot_len += len; 19475 if ((tp->t_flags & TF_GPUTINPROG) == 0) 19476 rack_start_gp_measurement(tp, rack, tp->snd_max, sb_offset); 19477 tp->snd_max += len; 19478 tp->snd_nxt = tp->snd_max; 19479 if (rack->rc_new_rnd_needed) { 19480 rack_new_round_starts(tp, rack, tp->snd_max); 19481 } 19482 { 19483 int idx; 19484 19485 idx = (len / segsiz) + 3; 19486 if (idx >= TCP_MSS_ACCT_ATIMER) 19487 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 19488 else 19489 counter_u64_add(rack_out_size[idx], 1); 19490 } 19491 if (len <= rack->r_ctl.fsb.left_to_send) 19492 rack->r_ctl.fsb.left_to_send -= len; 19493 else 19494 rack->r_ctl.fsb.left_to_send = 0; 19495 if (rack->r_ctl.fsb.left_to_send < segsiz) { 19496 rack->r_fast_output = 0; 19497 rack->r_ctl.fsb.left_to_send = 0; 19498 /* At the end of fast_output scale up the sb */ 19499 SOCK_SENDBUF_LOCK(rack->rc_inp->inp_socket); 19500 rack_sndbuf_autoscale(rack); 19501 SOCK_SENDBUF_UNLOCK(rack->rc_inp->inp_socket); 19502 } 19503 if (tp->t_rtttime == 0) { 19504 tp->t_rtttime = ticks; 19505 tp->t_rtseq = startseq; 19506 KMOD_TCPSTAT_INC(tcps_segstimed); 19507 } 19508 if ((rack->r_ctl.fsb.left_to_send >= segsiz) && 19509 (max_val > len) && 19510 (*tot_len < rack->r_ctl.rc_pace_max_segs) && 19511 (tso == 0)) { 19512 max_val -= len; 19513 len = segsiz; 19514 th = rack->r_ctl.fsb.th; 19515 #ifdef TCP_ACCOUNTING 19516 cnt_thru++; 19517 #endif 19518 goto again; 19519 } 19520 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 19521 counter_u64_add(rack_fto_send, 1); 19522 slot = rack_get_pacing_delay(rack, tp, *tot_len, NULL, segsiz, __LINE__); 19523 rack_start_hpts_timer(rack, tp, cts, slot, *tot_len, 0); 19524 #ifdef TCP_ACCOUNTING 19525 crtsc = get_cyclecount(); 19526 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19527 
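/*
 * A sketch of how I read the accounting below (not a normative
 * description): tcp_cnt_counters[SND_OUT_DATA] is bumped by cnt_thru,
 * the number of passes taken through the "again" loop above;
 * tcp_proc_time[SND_OUT_DATA] gets the cycle delta between the
 * get_cyclecount() sample taken at entry (ts_val) and the one taken
 * just above (crtsc); and CNT_OF_MSS_OUT is credited with how many
 * segsiz-sized chunks this call emitted in total.
 */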
tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru; 19528 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); 19529 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((*tot_len + segsiz - 1) / segsiz); 19530 } 19531 sched_unpin(); 19532 #endif 19533 return (0); 19534 failed: 19535 if (m) 19536 m_free(m); 19537 rack->r_fast_output = 0; 19538 return (-1); 19539 } 19540 19541 static inline void 19542 rack_setup_fast_output(struct tcpcb *tp, struct tcp_rack *rack, 19543 struct sockbuf *sb, 19544 int len, int orig_len, int segsiz, uint32_t pace_max_seg, 19545 bool hw_tls, 19546 uint16_t flags) 19547 { 19548 rack->r_fast_output = 1; 19549 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); 19550 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; 19551 rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(rack->r_ctl.fsb.m); 19552 rack->r_ctl.fsb.tcp_flags = flags; 19553 rack->r_ctl.fsb.left_to_send = orig_len - len; 19554 if (rack->r_ctl.fsb.left_to_send < pace_max_seg) { 19555 /* Less than a full sized pace, lets not */ 19556 rack->r_fast_output = 0; 19557 return; 19558 } else { 19559 /* Round down to the nearest pace_max_seg */ 19560 rack->r_ctl.fsb.left_to_send = rounddown(rack->r_ctl.fsb.left_to_send, pace_max_seg); 19561 } 19562 if (hw_tls) 19563 rack->r_ctl.fsb.hw_tls = 1; 19564 else 19565 rack->r_ctl.fsb.hw_tls = 0; 19566 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))), 19567 ("rack:%p left_to_send:%u sbavail:%u out:%u", 19568 rack, rack->r_ctl.fsb.left_to_send, sbavail(sb), 19569 (tp->snd_max - tp->snd_una))); 19570 if (rack->r_ctl.fsb.left_to_send < segsiz) 19571 rack->r_fast_output = 0; 19572 else { 19573 if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una))) 19574 rack->r_ctl.fsb.rfo_apply_push = 1; 19575 else 19576 rack->r_ctl.fsb.rfo_apply_push = 0; 19577 } 19578 } 19579 19580 static uint32_t 19581 rack_get_hpts_pacing_min_for_bw(struct tcp_rack *rack, int32_t segsiz) 19582 { 19583 uint64_t min_time; 19584 uint32_t maxlen; 19585 19586 min_time = (uint64_t)get_hpts_min_sleep_time(); 19587 maxlen = (uint32_t)((rack->r_ctl.gp_bw * min_time) / (uint64_t)HPTS_USEC_IN_SEC); 19588 maxlen = roundup(maxlen, segsiz); 19589 return (maxlen); 19590 } 19591 19592 static struct rack_sendmap * 19593 rack_check_collapsed(struct tcp_rack *rack, uint32_t cts) 19594 { 19595 struct rack_sendmap *rsm = NULL; 19596 int thresh; 19597 19598 restart: 19599 rsm = tqhash_find(rack->r_ctl.tqh, rack->r_ctl.last_collapse_point); 19600 if ((rsm == NULL) || ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0)) { 19601 /* Nothing, strange turn off validity */ 19602 rack->r_collapse_point_valid = 0; 19603 return (NULL); 19604 } 19605 /* Can we send it yet? */ 19606 if (rsm->r_end > (rack->rc_tp->snd_una + rack->rc_tp->snd_wnd)) { 19607 /* 19608 * Receiver window has not grown enough for 19609 * the segment to be put on the wire. 19610 */ 19611 return (NULL); 19612 } 19613 if (rsm->r_flags & RACK_ACKED) { 19614 /* 19615 * It has been sacked, lets move to the 19616 * next one if possible. 19617 */ 19618 rack->r_ctl.last_collapse_point = rsm->r_end; 19619 /* Are we done? */ 19620 if (SEQ_GEQ(rack->r_ctl.last_collapse_point, 19621 rack->r_ctl.high_collapse_point)) { 19622 rack->r_collapse_point_valid = 0; 19623 return (NULL); 19624 } 19625 goto restart; 19626 } 19627 /* Now has it been long enough ? 
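 * ("Long enough" meaning, roughly, that the RACK retransmit
 * threshold computed below -- on the order of the current RTT
 * estimate plus the reordering window -- has elapsed since this rsm
 * was last sent; this is my paraphrase, the exact value comes from
 * rack_calc_thresh_rack().)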
*/ 19628 thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(rack->rc_tp, rack), cts, __LINE__, 1); 19629 if ((cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])) > thresh) { 19630 rack_log_collapse(rack, rsm->r_start, 19631 (cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])), 19632 thresh, __LINE__, 6, rsm->r_flags, rsm); 19633 return (rsm); 19634 } 19635 /* Not enough time */ 19636 rack_log_collapse(rack, rsm->r_start, 19637 (cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])), 19638 thresh, __LINE__, 7, rsm->r_flags, rsm); 19639 return (NULL); 19640 } 19641 19642 static inline void 19643 rack_validate_sizes(struct tcp_rack *rack, int32_t *len, int32_t segsiz, uint32_t pace_max_seg) 19644 { 19645 if ((rack->full_size_rxt == 0) && 19646 (rack->shape_rxt_to_pacing_min == 0) && 19647 (*len >= segsiz)) { 19648 *len = segsiz; 19649 } else if (rack->shape_rxt_to_pacing_min && 19650 rack->gp_ready) { 19651 /* We use pacing min as shaping len req */ 19652 uint32_t maxlen; 19653 19654 maxlen = rack_get_hpts_pacing_min_for_bw(rack, segsiz); 19655 if (*len > maxlen) 19656 *len = maxlen; 19657 } else { 19658 /* 19659 * The else is full_size_rxt is on so send it all 19660 * note we do need to check this for exceeding 19661 * our max segment size due to the fact that 19662 * we do sometimes merge chunks together i.e. 19663 * we cannot just assume that we will never have 19664 * a chunk greater than pace_max_seg 19665 */ 19666 if (*len > pace_max_seg) 19667 *len = pace_max_seg; 19668 } 19669 } 19670 19671 static int 19672 rack_output(struct tcpcb *tp) 19673 { 19674 struct socket *so; 19675 uint32_t recwin; 19676 uint32_t sb_offset, s_moff = 0; 19677 int32_t len, error = 0; 19678 uint16_t flags; 19679 struct mbuf *m, *s_mb = NULL; 19680 struct mbuf *mb; 19681 uint32_t if_hw_tsomaxsegcount = 0; 19682 uint32_t if_hw_tsomaxsegsize; 19683 int32_t segsiz, minseg; 19684 long tot_len_this_send = 0; 19685 #ifdef INET 19686 struct ip *ip = NULL; 19687 #endif 19688 struct udphdr *udp = NULL; 19689 struct tcp_rack *rack; 19690 struct tcphdr *th; 19691 uint8_t pass = 0; 19692 uint8_t mark = 0; 19693 uint8_t check_done = 0; 19694 uint8_t wanted_cookie = 0; 19695 u_char opt[TCP_MAXOLEN]; 19696 unsigned ipoptlen, optlen, hdrlen, ulen=0; 19697 uint32_t rack_seq; 19698 19699 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 19700 unsigned ipsec_optlen = 0; 19701 19702 #endif 19703 int32_t idle, sendalot; 19704 uint32_t tot_idle; 19705 int32_t sub_from_prr = 0; 19706 volatile int32_t sack_rxmit; 19707 struct rack_sendmap *rsm = NULL; 19708 int32_t tso, mtu; 19709 struct tcpopt to; 19710 int32_t slot = 0; 19711 int32_t sup_rack = 0; 19712 uint32_t cts, ms_cts, delayed, early; 19713 uint32_t add_flag = RACK_SENT_SP; 19714 /* The doing_tlp flag will be set by the actual rack_timeout_tlp() */ 19715 uint8_t doing_tlp = 0; 19716 uint32_t cwnd_to_use, pace_max_seg; 19717 int32_t do_a_prefetch = 0; 19718 int32_t prefetch_rsm = 0; 19719 int32_t orig_len = 0; 19720 struct timeval tv; 19721 int32_t prefetch_so_done = 0; 19722 struct tcp_log_buffer *lgb; 19723 struct inpcb *inp = tptoinpcb(tp); 19724 struct sockbuf *sb; 19725 uint64_t ts_val = 0; 19726 #ifdef TCP_ACCOUNTING 19727 uint64_t crtsc; 19728 #endif 19729 #ifdef INET6 19730 struct ip6_hdr *ip6 = NULL; 19731 int32_t isipv6; 19732 #endif 19733 bool hpts_calling, hw_tls = false; 19734 19735 NET_EPOCH_ASSERT(); 19736 INP_WLOCK_ASSERT(inp); 19737 19738 /* setup and take the cache hits here */ 19739 rack = (struct tcp_rack *)tp->t_fb_ptr; 19740 #ifdef 
TCP_ACCOUNTING 19741 sched_pin(); 19742 ts_val = get_cyclecount(); 19743 #endif 19744 hpts_calling = !!(tp->t_flags2 & TF2_HPTS_CALLS); 19745 tp->t_flags2 &= ~TF2_HPTS_CALLS; 19746 #ifdef TCP_OFFLOAD 19747 if (tp->t_flags & TF_TOE) { 19748 #ifdef TCP_ACCOUNTING 19749 sched_unpin(); 19750 #endif 19751 return (tcp_offload_output(tp)); 19752 } 19753 #endif 19754 if (rack->rack_deferred_inited == 0) { 19755 /* 19756 * If we are the connecting socket we will 19757 * hit rack_init() when no sequence numbers 19758 * are setup. This makes it so we must defer 19759 * some initialization. Call that now. 19760 */ 19761 rack_deferred_init(tp, rack); 19762 } 19763 /* 19764 * For TFO connections in SYN_RECEIVED, only allow the initial 19765 * SYN|ACK and those sent by the retransmit timer. 19766 */ 19767 if ((tp->t_flags & TF_FASTOPEN) && 19768 (tp->t_state == TCPS_SYN_RECEIVED) && 19769 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */ 19770 (rack->r_ctl.rc_resend == NULL)) { /* not a retransmit */ 19771 #ifdef TCP_ACCOUNTING 19772 sched_unpin(); 19773 #endif 19774 return (0); 19775 } 19776 #ifdef INET6 19777 if (rack->r_state) { 19778 /* Use the cache line loaded if possible */ 19779 isipv6 = rack->r_is_v6; 19780 } else { 19781 isipv6 = (rack->rc_inp->inp_vflag & INP_IPV6) != 0; 19782 } 19783 #endif 19784 early = 0; 19785 cts = tcp_get_usecs(&tv); 19786 ms_cts = tcp_tv_to_msec(&tv); 19787 if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && 19788 tcp_in_hpts(rack->rc_tp)) { 19789 /* 19790 * We are on the hpts for some timer but not hptsi output. 19791 * Remove from the hpts unconditionally. 19792 */ 19793 rack_timer_cancel(tp, rack, cts, __LINE__); 19794 } 19795 /* Are we pacing and late? */ 19796 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 19797 TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) { 19798 /* We are delayed */ 19799 delayed = cts - rack->r_ctl.rc_last_output_to; 19800 } else { 19801 delayed = 0; 19802 } 19803 /* Do the timers, which may override the pacer */ 19804 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 19805 int retval; 19806 19807 retval = rack_process_timers(tp, rack, cts, hpts_calling, 19808 &doing_tlp); 19809 if (retval != 0) { 19810 counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); 19811 #ifdef TCP_ACCOUNTING 19812 sched_unpin(); 19813 #endif 19814 /* 19815 * If timers want tcp_drop(), then pass error out, 19816 * otherwise suppress it. 19817 */ 19818 return (retval < 0 ? retval : 0); 19819 } 19820 } 19821 if (rack->rc_in_persist) { 19822 if (tcp_in_hpts(rack->rc_tp) == 0) { 19823 /* Timer is not running */ 19824 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 19825 } 19826 #ifdef TCP_ACCOUNTING 19827 sched_unpin(); 19828 #endif 19829 return (0); 19830 } 19831 if ((rack->rc_ack_required == 1) && 19832 (rack->r_timer_override == 0)){ 19833 /* A timeout occurred and no ack has arrived */ 19834 if (tcp_in_hpts(rack->rc_tp) == 0) { 19835 /* Timer is not running */ 19836 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 19837 } 19838 #ifdef TCP_ACCOUNTING 19839 sched_unpin(); 19840 #endif 19841 return (0); 19842 } 19843 if ((rack->r_timer_override) || 19844 (rack->rc_ack_can_sendout_data) || 19845 (delayed) || 19846 (tp->t_state < TCPS_ESTABLISHED)) { 19847 rack->rc_ack_can_sendout_data = 0; 19848 if (tcp_in_hpts(rack->rc_tp)) 19849 tcp_hpts_remove(rack->rc_tp); 19850 } else if (tcp_in_hpts(rack->rc_tp)) { 19851 /* 19852 * On the hpts you can't pass even if ACKNOW is on, we will 19853 * when the hpts fires. 
19854 */ 19855 #ifdef TCP_ACCOUNTING 19856 crtsc = get_cyclecount(); 19857 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19858 tp->tcp_proc_time[SND_BLOCKED] += (crtsc - ts_val); 19859 tp->tcp_cnt_counters[SND_BLOCKED]++; 19860 } 19861 sched_unpin(); 19862 #endif 19863 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1); 19864 return (0); 19865 } 19866 /* Finish out both pacing early and late accounting */ 19867 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 19868 TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) { 19869 early = rack->r_ctl.rc_last_output_to - cts; 19870 } else 19871 early = 0; 19872 if (delayed && (rack->rc_always_pace == 1)) { 19873 rack->r_ctl.rc_agg_delayed += delayed; 19874 rack->r_late = 1; 19875 } else if (early && (rack->rc_always_pace == 1)) { 19876 rack->r_ctl.rc_agg_early += early; 19877 rack->r_early = 1; 19878 } else if (rack->rc_always_pace == 0) { 19879 /* Non-paced we are not late */ 19880 rack->r_ctl.rc_agg_delayed = rack->r_ctl.rc_agg_early = 0; 19881 rack->r_early = rack->r_late = 0; 19882 } 19883 /* Now that early/late accounting is done turn off the flag */ 19884 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 19885 rack->r_wanted_output = 0; 19886 rack->r_timer_override = 0; 19887 if ((tp->t_state != rack->r_state) && 19888 TCPS_HAVEESTABLISHED(tp->t_state)) { 19889 rack_set_state(tp, rack); 19890 } 19891 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 19892 minseg = segsiz; 19893 if (rack->r_ctl.rc_pace_max_segs == 0) 19894 pace_max_seg = rack->rc_user_set_max_segs * segsiz; 19895 else 19896 pace_max_seg = rack->r_ctl.rc_pace_max_segs; 19897 if ((rack->r_fast_output) && 19898 (doing_tlp == 0) && 19899 (tp->rcv_numsacks == 0)) { 19900 int ret; 19901 19902 error = 0; 19903 ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, &tot_len_this_send, &error, __LINE__); 19904 if (ret > 0) 19905 return(ret); 19906 else if (error) { 19907 inp = rack->rc_inp; 19908 so = inp->inp_socket; 19909 sb = &so->so_snd; 19910 goto nomore; 19911 } else { 19912 /* Return == 0, if there is more we can send tot_len wise fall through and send */ 19913 if (tot_len_this_send >= pace_max_seg) 19914 return (ret); 19915 #ifdef TCP_ACCOUNTING 19916 /* We need to re-pin since fast_output un-pined */ 19917 sched_pin(); 19918 ts_val = get_cyclecount(); 19919 #endif 19920 /* Fall back out so we can send any more that may bring us to pace_max_seg */ 19921 } 19922 } 19923 inp = rack->rc_inp; 19924 /* 19925 * For TFO connections in SYN_SENT or SYN_RECEIVED, 19926 * only allow the initial SYN or SYN|ACK and those sent 19927 * by the retransmit timer. 19928 */ 19929 if ((tp->t_flags & TF_FASTOPEN) && 19930 ((tp->t_state == TCPS_SYN_RECEIVED) || 19931 (tp->t_state == TCPS_SYN_SENT)) && 19932 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ 19933 (tp->t_rxtshift == 0)) { /* not a retransmit */ 19934 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 19935 #ifdef TCP_ACCOUNTING 19936 sched_unpin(); 19937 #endif 19938 return (0); 19939 } 19940 /* 19941 * Determine length of data that should be transmitted, and flags 19942 * that will be used. If there is some data or critical controls 19943 * (SYN, RST) to send, then transmit; otherwise, investigate 19944 * further. 
19945 */
19946 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
19947 if (tp->t_idle_reduce) {
19948 if (idle && (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur))
19949 rack_cc_after_idle(rack, tp);
19950 }
19951 tp->t_flags &= ~TF_LASTIDLE;
19952 if (idle) {
19953 if (tp->t_flags & TF_MORETOCOME) {
19954 tp->t_flags |= TF_LASTIDLE;
19955 idle = 0;
19956 }
19957 }
19958 if ((tp->snd_una == tp->snd_max) &&
19959 rack->r_ctl.rc_went_idle_time &&
19960 (cts > rack->r_ctl.rc_went_idle_time)) {
19961 tot_idle = (cts - rack->r_ctl.rc_went_idle_time);
19962 if (tot_idle > rack_min_probertt_hold) {
19963 /* Count as a probe rtt */
19964 if (rack->in_probe_rtt == 0) {
19965 rack->r_ctl.rc_lower_rtt_us_cts = cts;
19966 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts;
19967 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts;
19968 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts;
19969 } else {
19970 rack_exit_probertt(rack, cts);
19971 }
19972 }
19973 } else
19974 tot_idle = 0;
19975 if (rack_use_fsb &&
19976 (rack->r_ctl.fsb.tcp_ip_hdr) &&
19977 (rack->r_fsb_inited == 0) &&
19978 (rack->r_state != TCPS_CLOSED))
19979 rack_init_fsb_block(tp, rack, tcp_outflags[tp->t_state]);
19980 if (rack->rc_sendvars_notset == 1) {
19981 rack->rc_sendvars_notset = 0;
19982 /*
19983 * Make sure no TCP timers (e.g. keep-alive) are running.
19984 */
19985 tcp_timer_stop(tp);
19986 }
19987 if ((rack->rack_no_prr == 1) &&
19988 (rack->rc_always_pace == 0)) {
19989 /*
19990 * Sanity check before sending: if we have
19991 * no-pacing enabled and PRR is turned off, that
19992 * is a configuration error. Correct this by turning
19993 * PRR back on. A user *must* set some form of
19994 * pacing in order to turn PRR off. We do this
19995 * in the output path so that we can avoid socket
19996 * option ordering issues that would occur if we
19997 * tried to do it while setting rack_no_prr on.
19998 */
19999 rack->rack_no_prr = 0;
20000 }
20001 if ((rack->pcm_enabled == 1) &&
20002 (rack->pcm_needed == 0) &&
20003 (tot_idle > 0)) {
20004 /*
20005 * We have been idle for some microseconds. We need
20006 * to factor this in to see if a PCM is needed.
20007 */
20008 uint32_t rtts_idle, rnds;
20009
20010 if (tp->t_srtt)
20011 rtts_idle = tot_idle / tp->t_srtt;
20012 else
20013 rtts_idle = 0;
20014 rnds = rack->r_ctl.current_round - rack->r_ctl.last_pcm_round;
20015 rack->r_ctl.pcm_idle_rounds += rtts_idle;
20016 if ((rnds + rack->r_ctl.pcm_idle_rounds) >= rack_pcm_every_n_rounds) {
20017 rack->pcm_needed = 1;
20018 rack_log_pcm(rack, 8, rack->r_ctl.last_pcm_round, rtts_idle, rack->r_ctl.current_round);
20019 }
20020 }
20021 again:
20022 sendalot = 0;
20023 cts = tcp_get_usecs(&tv);
20024 ms_cts = tcp_tv_to_msec(&tv);
20025 tso = 0;
20026 mtu = 0;
20027 if (TCPS_HAVEESTABLISHED(tp->t_state) &&
20028 (rack->r_ctl.pcm_max_seg == 0)) {
20029 /*
20030 * We set this on our first send so we know that ctf_fixed_maxseg
20031 * has been fully set. If we do it in rack_init() we most likely
20032 * see 512 bytes so we end up at 5120, not desirable.
20033 */
20034 rack->r_ctl.pcm_max_seg = rc_init_window(rack);
20035 if (rack->r_ctl.pcm_max_seg < (ctf_fixed_maxseg(tp) * 10)) {
20036 /*
20037 * Assure our initial PCM probe is at least 10 MSS.
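 * (With a hypothetical 1448 byte MSS that floor works out to
 * 10 * 1448 = 14480 bytes, so even if rc_init_window() returned a
 * smaller value the probe is still sized to at least ten full
 * segments.)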
20038 */ 20039 rack->r_ctl.pcm_max_seg = ctf_fixed_maxseg(tp) * 10; 20040 } 20041 } 20042 if ((rack->r_ctl.pcm_max_seg != 0) && (rack->pcm_needed == 1)) { 20043 uint32_t rw_avail, cwa; 20044 20045 if (tp->snd_wnd > ctf_outstanding(tp)) 20046 rw_avail = tp->snd_wnd - ctf_outstanding(tp); 20047 else 20048 rw_avail = 0; 20049 if (tp->snd_cwnd > ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked)) 20050 cwa = tp->snd_cwnd -ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 20051 else 20052 cwa = 0; 20053 if ((cwa >= rack->r_ctl.pcm_max_seg) && 20054 (rw_avail > rack->r_ctl.pcm_max_seg)) { 20055 /* Raise up the max seg for this trip through */ 20056 pace_max_seg = rack->r_ctl.pcm_max_seg; 20057 /* Disable any fast output */ 20058 rack->r_fast_output = 0; 20059 } 20060 if (rack_verbose_logging) { 20061 rack_log_pcm(rack, 4, 20062 cwa, rack->r_ctl.pcm_max_seg, rw_avail); 20063 } 20064 } 20065 sb_offset = tp->snd_max - tp->snd_una; 20066 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 20067 flags = tcp_outflags[tp->t_state]; 20068 while (rack->rc_free_cnt < rack_free_cache) { 20069 rsm = rack_alloc(rack); 20070 if (rsm == NULL) { 20071 if (hpts_calling) 20072 /* Retry in a ms */ 20073 slot = (1 * HPTS_USEC_IN_MSEC); 20074 so = inp->inp_socket; 20075 sb = &so->so_snd; 20076 goto just_return_nolock; 20077 } 20078 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext); 20079 rack->rc_free_cnt++; 20080 rsm = NULL; 20081 } 20082 sack_rxmit = 0; 20083 len = 0; 20084 rsm = NULL; 20085 if (flags & TH_RST) { 20086 SOCK_SENDBUF_LOCK(inp->inp_socket); 20087 so = inp->inp_socket; 20088 sb = &so->so_snd; 20089 goto send; 20090 } 20091 if (rack->r_ctl.rc_resend) { 20092 /* Retransmit timer */ 20093 rsm = rack->r_ctl.rc_resend; 20094 rack->r_ctl.rc_resend = NULL; 20095 len = rsm->r_end - rsm->r_start; 20096 sack_rxmit = 1; 20097 sendalot = 0; 20098 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 20099 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 20100 __func__, __LINE__, 20101 rsm->r_start, tp->snd_una, tp, rack, rsm)); 20102 sb_offset = rsm->r_start - tp->snd_una; 20103 rack_validate_sizes(rack, &len, segsiz, pace_max_seg); 20104 } else if (rack->r_collapse_point_valid && 20105 ((rsm = rack_check_collapsed(rack, cts)) != NULL)) { 20106 /* 20107 * If an RSM is returned then enough time has passed 20108 * for us to retransmit it. Move up the collapse point, 20109 * since this rsm has its chance to retransmit now. 20110 */ 20111 tcp_trace_point(rack->rc_tp, TCP_TP_COLLAPSED_RXT); 20112 rack->r_ctl.last_collapse_point = rsm->r_end; 20113 /* Are we done? 
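 * Once the collapse point has advanced to (or past) the highest
 * collapsed sequence we clear r_collapse_point_valid below.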
*/ 20114 if (SEQ_GEQ(rack->r_ctl.last_collapse_point, 20115 rack->r_ctl.high_collapse_point)) 20116 rack->r_collapse_point_valid = 0; 20117 sack_rxmit = 1; 20118 /* We are not doing a TLP */ 20119 doing_tlp = 0; 20120 len = rsm->r_end - rsm->r_start; 20121 sb_offset = rsm->r_start - tp->snd_una; 20122 sendalot = 0; 20123 rack_validate_sizes(rack, &len, segsiz, pace_max_seg); 20124 } else if ((rsm = tcp_rack_output(tp, rack, cts)) != NULL) { 20125 /* We have a retransmit that takes precedence */ 20126 if ((!IN_FASTRECOVERY(tp->t_flags)) && 20127 ((rsm->r_flags & RACK_MUST_RXT) == 0) && 20128 ((tp->t_flags & TF_WASFRECOVERY) == 0)) { 20129 /* Enter recovery if not induced by a time-out */ 20130 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__); 20131 } 20132 #ifdef INVARIANTS 20133 if (SEQ_LT(rsm->r_start, tp->snd_una)) { 20134 panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n", 20135 tp, rack, rsm, rsm->r_start, tp->snd_una); 20136 } 20137 #endif 20138 len = rsm->r_end - rsm->r_start; 20139 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 20140 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 20141 __func__, __LINE__, 20142 rsm->r_start, tp->snd_una, tp, rack, rsm)); 20143 sb_offset = rsm->r_start - tp->snd_una; 20144 sendalot = 0; 20145 rack_validate_sizes(rack, &len, segsiz, pace_max_seg); 20146 if (len > 0) { 20147 sack_rxmit = 1; 20148 KMOD_TCPSTAT_INC(tcps_sack_rexmits); 20149 KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes, 20150 min(len, segsiz)); 20151 } 20152 } else if (rack->r_ctl.rc_tlpsend) { 20153 /* Tail loss probe */ 20154 long cwin; 20155 long tlen; 20156 20157 /* 20158 * Check if we can do a TLP with a RACK'd packet 20159 * this can happen if we are not doing the rack 20160 * cheat and we skipped to a TLP and it 20161 * went off. 20162 */ 20163 rsm = rack->r_ctl.rc_tlpsend; 20164 /* We are doing a TLP make sure the flag is preent */ 20165 rsm->r_flags |= RACK_TLP; 20166 rack->r_ctl.rc_tlpsend = NULL; 20167 sack_rxmit = 1; 20168 tlen = rsm->r_end - rsm->r_start; 20169 if (tlen > segsiz) 20170 tlen = segsiz; 20171 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 20172 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 20173 __func__, __LINE__, 20174 rsm->r_start, tp->snd_una, tp, rack, rsm)); 20175 sb_offset = rsm->r_start - tp->snd_una; 20176 cwin = min(tp->snd_wnd, tlen); 20177 len = cwin; 20178 } 20179 if (rack->r_must_retran && 20180 (doing_tlp == 0) && 20181 (SEQ_GT(tp->snd_max, tp->snd_una)) && 20182 (rsm == NULL)) { 20183 /* 20184 * There are two different ways that we 20185 * can get into this block: 20186 * a) This is a non-sack connection, we had a time-out 20187 * and thus r_must_retran was set and everything 20188 * left outstanding as been marked for retransmit. 20189 * b) The MTU of the path shrank, so that everything 20190 * was marked to be retransmitted with the smaller 20191 * mtu and r_must_retran was set. 20192 * 20193 * This means that we expect the sendmap (outstanding) 20194 * to all be marked must. We can use the tmap to 20195 * look at them. 20196 * 20197 */ 20198 int sendwin, flight; 20199 20200 sendwin = min(tp->snd_wnd, tp->snd_cwnd); 20201 flight = ctf_flight_size(tp, rack->r_ctl.rc_out_at_rto); 20202 if (flight >= sendwin) { 20203 /* 20204 * We can't send yet. 20205 */ 20206 so = inp->inp_socket; 20207 sb = &so->so_snd; 20208 goto just_return_nolock; 20209 } 20210 /* 20211 * This is the case a/b mentioned above. All 20212 * outstanding/not-acked should be marked. 20213 * We can use the tmap to find them. 
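 * If the head of the tmap no longer carries RACK_MUST_RXT we are
 * done with the forced retransmits and clear r_must_retran and
 * rc_out_at_rto below instead of forcing a retransmission.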
20214 */
20215 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
20216 if (rsm == NULL) {
20217 /* TSNH */
20218 rack->r_must_retran = 0;
20219 rack->r_ctl.rc_out_at_rto = 0;
20220 so = inp->inp_socket;
20221 sb = &so->so_snd;
20222 goto just_return_nolock;
20223 }
20224 if ((rsm->r_flags & RACK_MUST_RXT) == 0) {
20225 /*
20226 * The first one does not have the flag, did we collapse
20227 * further up in our list?
20228 */
20229 rack->r_must_retran = 0;
20230 rack->r_ctl.rc_out_at_rto = 0;
20231 rsm = NULL;
20232 sack_rxmit = 0;
20233 } else {
20234 sack_rxmit = 1;
20235 len = rsm->r_end - rsm->r_start;
20236 sb_offset = rsm->r_start - tp->snd_una;
20237 sendalot = 0;
20238 if ((rack->full_size_rxt == 0) &&
20239 (rack->shape_rxt_to_pacing_min == 0) &&
20240 (len >= segsiz))
20241 len = segsiz;
20242 else if (rack->shape_rxt_to_pacing_min &&
20243 rack->gp_ready) {
20244 /* We use pacing min as shaping len req */
20245 uint32_t maxlen;
20246
20247 maxlen = rack_get_hpts_pacing_min_for_bw(rack, segsiz);
20248 if (len > maxlen)
20249 len = maxlen;
20250 }
20251 /*
20252 * Delay removing the flag RACK_MUST_RXT so
20253 * that the fastpath for retransmit will
20254 * work with this rsm.
20255 */
20256 }
20257 }
20258 /*
20259 * Enforce a connection sendmap count limit if set
20260 * as long as we are not retransmitting.
20261 */
20262 if ((rsm == NULL) &&
20263 (V_tcp_map_entries_limit > 0) &&
20264 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) {
20265 counter_u64_add(rack_to_alloc_limited, 1);
20266 if (!rack->alloc_limit_reported) {
20267 rack->alloc_limit_reported = 1;
20268 counter_u64_add(rack_alloc_limited_conns, 1);
20269 }
20270 so = inp->inp_socket;
20271 sb = &so->so_snd;
20272 goto just_return_nolock;
20273 }
20274 if (rsm && (rsm->r_flags & RACK_HAS_FIN)) {
20275 /* We are retransmitting the FIN */
20276 len--;
20277 if (len) {
20278 /*
20279 * When retransmitting data do *not* include the
20280 * FIN. This could happen from a TLP probe.
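 * The FIN consumes the sequence number just past the data, so len
 * was trimmed by one above and TH_FIN is stripped here so that only
 * the data portion of the rsm is resent.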
20281 */ 20282 flags &= ~TH_FIN; 20283 } 20284 } 20285 if (rsm && rack->r_fsb_inited && 20286 rack_use_rsm_rfo && 20287 ((rsm->r_flags & RACK_HAS_FIN) == 0)) { 20288 int ret; 20289 20290 ret = rack_fast_rsm_output(tp, rack, rsm, ts_val, cts, ms_cts, &tv, len, doing_tlp); 20291 if (ret == 0) 20292 return (0); 20293 } 20294 so = inp->inp_socket; 20295 sb = &so->so_snd; 20296 if (do_a_prefetch == 0) { 20297 kern_prefetch(sb, &do_a_prefetch); 20298 do_a_prefetch = 1; 20299 } 20300 #ifdef NETFLIX_SHARED_CWND 20301 if ((tp->t_flags2 & TF2_TCP_SCWND_ALLOWED) && 20302 rack->rack_enable_scwnd) { 20303 /* We are doing cwnd sharing */ 20304 if (rack->gp_ready && 20305 (rack->rack_attempted_scwnd == 0) && 20306 (rack->r_ctl.rc_scw == NULL) && 20307 tp->t_lib) { 20308 /* The pcbid is in, lets make an attempt */ 20309 counter_u64_add(rack_try_scwnd, 1); 20310 rack->rack_attempted_scwnd = 1; 20311 rack->r_ctl.rc_scw = tcp_shared_cwnd_alloc(tp, 20312 &rack->r_ctl.rc_scw_index, 20313 segsiz); 20314 } 20315 if (rack->r_ctl.rc_scw && 20316 (rack->rack_scwnd_is_idle == 1) && 20317 sbavail(&so->so_snd)) { 20318 /* we are no longer out of data */ 20319 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 20320 rack->rack_scwnd_is_idle = 0; 20321 } 20322 if (rack->r_ctl.rc_scw) { 20323 /* First lets update and get the cwnd */ 20324 rack->r_ctl.cwnd_to_use = cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw, 20325 rack->r_ctl.rc_scw_index, 20326 tp->snd_cwnd, tp->snd_wnd, segsiz); 20327 } 20328 } 20329 #endif 20330 /* 20331 * Get standard flags, and add SYN or FIN if requested by 'hidden' 20332 * state flags. 20333 */ 20334 if (tp->t_flags & TF_NEEDFIN) 20335 flags |= TH_FIN; 20336 if (tp->t_flags & TF_NEEDSYN) 20337 flags |= TH_SYN; 20338 if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { 20339 void *end_rsm; 20340 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 20341 if (end_rsm) 20342 kern_prefetch(end_rsm, &prefetch_rsm); 20343 prefetch_rsm = 1; 20344 } 20345 SOCK_SENDBUF_LOCK(so); 20346 if ((sack_rxmit == 0) && 20347 (TCPS_HAVEESTABLISHED(tp->t_state) || 20348 (tp->t_flags & TF_FASTOPEN))) { 20349 /* 20350 * We are not retransmitting (sack_rxmit is 0) so we 20351 * are sending new data. This is always based on snd_max. 20352 * Now in theory snd_max may be equal to snd_una, if so 20353 * then nothing is outstanding and the offset would be 0. 20354 */ 20355 uint32_t avail; 20356 20357 avail = sbavail(sb); 20358 if (SEQ_GT(tp->snd_max, tp->snd_una) && avail) 20359 sb_offset = tp->snd_max - tp->snd_una; 20360 else 20361 sb_offset = 0; 20362 if ((IN_FASTRECOVERY(tp->t_flags) == 0) || rack->rack_no_prr) { 20363 if (rack->r_ctl.rc_tlp_new_data) { 20364 /* TLP is forcing out new data */ 20365 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) { 20366 rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset); 20367 } 20368 if ((rack->r_ctl.rc_tlp_new_data + sb_offset) > tp->snd_wnd) { 20369 if (tp->snd_wnd > sb_offset) 20370 len = tp->snd_wnd - sb_offset; 20371 else 20372 len = 0; 20373 } else { 20374 len = rack->r_ctl.rc_tlp_new_data; 20375 } 20376 rack->r_ctl.rc_tlp_new_data = 0; 20377 } else { 20378 len = rack_what_can_we_send(tp, rack, cwnd_to_use, avail, sb_offset); 20379 } 20380 if ((rack->r_ctl.crte == NULL) && 20381 IN_FASTRECOVERY(tp->t_flags) && 20382 (rack->full_size_rxt == 0) && 20383 (rack->shape_rxt_to_pacing_min == 0) && 20384 (len > segsiz)) { 20385 /* 20386 * For prr=off, we need to send only 1 MSS 20387 * at a time. 
We do this because another sack could 20388 * be arriving that causes us to send retransmits and 20389 * we don't want to be on a long pace due to a larger send 20390 * that keeps us from sending out the retransmit. 20391 */ 20392 len = segsiz; 20393 } else if (rack->shape_rxt_to_pacing_min && 20394 rack->gp_ready) { 20395 /* We use pacing min as shaping len req */ 20396 uint32_t maxlen; 20397 20398 maxlen = rack_get_hpts_pacing_min_for_bw(rack, segsiz); 20399 if (len > maxlen) 20400 len = maxlen; 20401 }/* The else is full_size_rxt is on so send it all */ 20402 } else { 20403 uint32_t outstanding; 20404 /* 20405 * We are inside of a Fast recovery episode, this 20406 * is caused by a SACK or 3 dup acks. At this point 20407 * we have sent all the retransmissions and we rely 20408 * on PRR to dictate what we will send in the form of 20409 * new data. 20410 */ 20411 20412 outstanding = tp->snd_max - tp->snd_una; 20413 if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) { 20414 if (tp->snd_wnd > outstanding) { 20415 len = tp->snd_wnd - outstanding; 20416 /* Check to see if we have the data */ 20417 if ((sb_offset + len) > avail) { 20418 /* It does not all fit */ 20419 if (avail > sb_offset) 20420 len = avail - sb_offset; 20421 else 20422 len = 0; 20423 } 20424 } else { 20425 len = 0; 20426 } 20427 } else if (avail > sb_offset) { 20428 len = avail - sb_offset; 20429 } else { 20430 len = 0; 20431 } 20432 if (len > 0) { 20433 if (len > rack->r_ctl.rc_prr_sndcnt) { 20434 len = rack->r_ctl.rc_prr_sndcnt; 20435 } 20436 if (len > 0) { 20437 sub_from_prr = 1; 20438 } 20439 } 20440 if (len > segsiz) { 20441 /* 20442 * We should never send more than a MSS when 20443 * retransmitting or sending new data in prr 20444 * mode unless the override flag is on. Most 20445 * likely the PRR algorithm is not going to 20446 * let us send a lot as well :-) 20447 */ 20448 if (rack->r_ctl.rc_prr_sendalot == 0) { 20449 len = segsiz; 20450 } 20451 } else if (len < segsiz) { 20452 /* 20453 * Do we send any? The idea here is if the 20454 * send empty's the socket buffer we want to 20455 * do it. However if not then lets just wait 20456 * for our prr_sndcnt to get bigger. 20457 */ 20458 long leftinsb; 20459 20460 leftinsb = sbavail(sb) - sb_offset; 20461 if (leftinsb > len) { 20462 /* This send does not empty the sb */ 20463 len = 0; 20464 } 20465 } 20466 } 20467 } else if (!TCPS_HAVEESTABLISHED(tp->t_state)) { 20468 /* 20469 * If you have not established 20470 * and are not doing FAST OPEN 20471 * no data please. 20472 */ 20473 if ((sack_rxmit == 0) && 20474 !(tp->t_flags & TF_FASTOPEN)) { 20475 len = 0; 20476 sb_offset = 0; 20477 } 20478 } 20479 if (prefetch_so_done == 0) { 20480 kern_prefetch(so, &prefetch_so_done); 20481 prefetch_so_done = 1; 20482 } 20483 orig_len = len; 20484 /* 20485 * Lop off SYN bit if it has already been sent. However, if this is 20486 * SYN-SENT state and if segment contains data and if we don't know 20487 * that foreign host supports TAO, suppress sending segment. 20488 */ 20489 if ((flags & TH_SYN) && 20490 SEQ_GT(tp->snd_max, tp->snd_una) && 20491 ((sack_rxmit == 0) && 20492 (tp->t_rxtshift == 0))) { 20493 /* 20494 * When sending additional segments following a TFO SYN|ACK, 20495 * do not include the SYN bit. 20496 */ 20497 if ((tp->t_flags & TF_FASTOPEN) && 20498 (tp->t_state == TCPS_SYN_RECEIVED)) 20499 flags &= ~TH_SYN; 20500 } 20501 /* 20502 * Be careful not to send data and/or FIN on SYN segments. 
This 20503 * measure is needed to prevent interoperability problems with not 20504 * fully conformant TCP implementations. 20505 */ 20506 if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { 20507 len = 0; 20508 flags &= ~TH_FIN; 20509 } 20510 /* 20511 * On TFO sockets, ensure no data is sent in the following cases: 20512 * 20513 * - When retransmitting SYN|ACK on a passively-created socket 20514 * 20515 * - When retransmitting SYN on an actively created socket 20516 * 20517 * - When sending a zero-length cookie (cookie request) on an 20518 * actively created socket 20519 * 20520 * - When the socket is in the CLOSED state (RST is being sent) 20521 */ 20522 if ((tp->t_flags & TF_FASTOPEN) && 20523 (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || 20524 ((tp->t_state == TCPS_SYN_SENT) && 20525 (tp->t_tfo_client_cookie_len == 0)) || 20526 (flags & TH_RST))) { 20527 sack_rxmit = 0; 20528 len = 0; 20529 } 20530 /* Without fast-open there should never be data sent on a SYN */ 20531 if ((flags & TH_SYN) && !(tp->t_flags & TF_FASTOPEN)) { 20532 len = 0; 20533 } 20534 if ((len > segsiz) && (tcp_dsack_block_exists(tp))) { 20535 /* We only send 1 MSS if we have a DSACK block */ 20536 add_flag |= RACK_SENT_W_DSACK; 20537 len = segsiz; 20538 } 20539 if (len <= 0) { 20540 /* 20541 * We have nothing to send, or the window shrank, or 20542 * is closed, do we need to go into persists? 20543 */ 20544 len = 0; 20545 if ((tp->snd_wnd == 0) && 20546 (TCPS_HAVEESTABLISHED(tp->t_state)) && 20547 (tp->snd_una == tp->snd_max) && 20548 (sb_offset < (int)sbavail(sb))) { 20549 rack_enter_persist(tp, rack, cts, tp->snd_una); 20550 } 20551 } else if ((rsm == NULL) && 20552 (doing_tlp == 0) && 20553 (len < pace_max_seg)) { 20554 /* 20555 * We are not sending a maximum sized segment for 20556 * some reason. Should we not send anything (think 20557 * sws or persists)? 20558 */ 20559 if ((tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg)) && 20560 (TCPS_HAVEESTABLISHED(tp->t_state)) && 20561 (len < minseg) && 20562 (len < (int)(sbavail(sb) - sb_offset))) { 20563 /* 20564 * Here the rwnd is less than 20565 * the minimum pacing size, this is not a retransmit, 20566 * we are established and 20567 * the send is not the last in the socket buffer 20568 * we send nothing, and we may enter persists 20569 * if nothing is outstanding. 20570 */ 20571 len = 0; 20572 if (tp->snd_max == tp->snd_una) { 20573 /* 20574 * Nothing out we can 20575 * go into persists. 20576 */ 20577 rack_enter_persist(tp, rack, cts, tp->snd_una); 20578 } 20579 } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) && 20580 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && 20581 (len < (int)(sbavail(sb) - sb_offset)) && 20582 (len < minseg)) { 20583 /* 20584 * Here we are not retransmitting, and 20585 * the cwnd is not so small that we could 20586 * not send at least a min size (rxt timer 20587 * not having gone off), We have 2 segments or 20588 * more already in flight, its not the tail end 20589 * of the socket buffer and the cwnd is blocking 20590 * us from sending out a minimum pacing segment size. 20591 * Lets not send anything. 20592 */ 20593 len = 0; 20594 } else if (((tp->snd_wnd - ctf_outstanding(tp)) < 20595 min((rack->r_ctl.rc_high_rwnd/2), minseg)) && 20596 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && 20597 (len < (int)(sbavail(sb) - sb_offset)) && 20598 (TCPS_HAVEESTABLISHED(tp->t_state))) { 20599 /* 20600 * Here we have a send window but we have 20601 * filled it up and we can't send another pacing segment. 
20602 * We also have in flight more than 2 segments
20603 * and we are not completing the sb i.e. we allow
20604 * the last bytes of the sb to go out even if
20605 * it's not a full pacing segment.
20606 */
20607 len = 0;
20608 } else if ((rack->r_ctl.crte != NULL) &&
20609 (tp->snd_wnd >= (pace_max_seg * max(1, rack_hw_rwnd_factor))) &&
20610 (cwnd_to_use >= (pace_max_seg + (4 * segsiz))) &&
20611 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) >= (2 * segsiz)) &&
20612 (len < (int)(sbavail(sb) - sb_offset))) {
20613 /*
20614 * Here we are doing hardware pacing, this is not a TLP,
20615 * we are not sending a pace max segment size, there is rwnd
20616 * room to send at least N pace_max_seg, the cwnd is greater
20617 * than or equal to a full pacing segment plus 4 MSS and we have 2 or
20618 * more segments in flight and it's not the tail of the socket buffer.
20619 *
20620 * We don't want to send; instead we need to get more acks in to
20621 * allow us to send a full pacing segment. Normally, if we are pacing
20622 * at about the right speed, we should have finished our pacing
20623 * send as most of the acks have come back if we are at the
20624 * right rate. This is a bit fuzzy since return path delay
20625 * can delay the acks, which is why we want to make sure we
20626 * have cwnd space to have a bit more than a max pace segment in flight.
20627 *
20628 * If we have not gotten our acks back, we are pacing at too high a
20629 * rate; delaying will not hurt and will bring our GP estimate down by
20630 * injecting the delay. If we don't do this we will send
20631 * 2 MSS out in response to the acks being clocked in, which
20632 * defeats the point of hw-pacing (i.e. to help us get
20633 * larger TSO's out).
20634 */
20635 len = 0;
20636 }
20637
20638 }
20639 /* len will be >= 0 after this point. */
20640 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__));
20641 rack_sndbuf_autoscale(rack);
20642 /*
20643 * Decide if we can use TCP Segmentation Offloading (if supported by
20644 * hardware).
20645 *
20646 * TSO may only be used if we are in a pure bulk sending state. The
20647 * presence of TCP-MD5, SACK retransmits, SACK advertisements and IP
20648 * options prevent using TSO. With TSO the TCP header is the same
20649 * (except for the sequence number) for all generated packets. This
20650 * makes it impossible to transmit any options which vary per
20651 * generated segment or packet.
20652 *
20653 * IPv4 handling has a clear separation of ip options and ip header
20654 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does
20655 * the right thing below to provide length of just ip options and thus
20656 * checking for ipoptlen is enough to decide if ip options are present.
20657 */
20658 ipoptlen = 0;
20659 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
20660 /*
20661 * Pre-calculate here as we save another lookup into the darknesses
20662 * of IPsec that way and can actually decide if TSO is ok.
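 * ipsec_optlen is folded into ipoptlen just below, and a non-zero
 * ipoptlen disqualifies TSO in the check that follows.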
20663 */ 20664 #ifdef INET6 20665 if (isipv6 && IPSEC_ENABLED(ipv6)) 20666 ipsec_optlen = IPSEC_HDRSIZE(ipv6, inp); 20667 #ifdef INET 20668 else 20669 #endif 20670 #endif /* INET6 */ 20671 #ifdef INET 20672 if (IPSEC_ENABLED(ipv4)) 20673 ipsec_optlen = IPSEC_HDRSIZE(ipv4, inp); 20674 #endif /* INET */ 20675 #endif 20676 20677 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 20678 ipoptlen += ipsec_optlen; 20679 #endif 20680 if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > segsiz && 20681 (tp->t_port == 0) && 20682 ((tp->t_flags & TF_SIGNATURE) == 0) && 20683 sack_rxmit == 0 && 20684 ipoptlen == 0) 20685 tso = 1; 20686 { 20687 uint32_t outstanding __unused; 20688 20689 outstanding = tp->snd_max - tp->snd_una; 20690 if (tp->t_flags & TF_SENTFIN) { 20691 /* 20692 * If we sent a fin, snd_max is 1 higher than 20693 * snd_una 20694 */ 20695 outstanding--; 20696 } 20697 if (sack_rxmit) { 20698 if ((rsm->r_flags & RACK_HAS_FIN) == 0) 20699 flags &= ~TH_FIN; 20700 } 20701 } 20702 recwin = lmin(lmax(sbspace(&so->so_rcv), 0), 20703 (long)TCP_MAXWIN << tp->rcv_scale); 20704 20705 /* 20706 * Sender silly window avoidance. We transmit under the following 20707 * conditions when len is non-zero: 20708 * 20709 * - We have a full segment (or more with TSO) - This is the last 20710 * buffer in a write()/send() and we are either idle or running 20711 * NODELAY - we've timed out (e.g. persist timer) - we have more 20712 * then 1/2 the maximum send window's worth of data (receiver may be 20713 * limited the window size) - we need to retransmit 20714 */ 20715 if (len) { 20716 if (len >= segsiz) { 20717 goto send; 20718 } 20719 /* 20720 * NOTE! on localhost connections an 'ack' from the remote 20721 * end may occur synchronously with the output and cause us 20722 * to flush a buffer queued with moretocome. XXX 20723 * 20724 */ 20725 if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ 20726 (idle || (tp->t_flags & TF_NODELAY)) && 20727 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && 20728 (tp->t_flags & TF_NOPUSH) == 0) { 20729 pass = 2; 20730 goto send; 20731 } 20732 if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ 20733 pass = 22; 20734 goto send; 20735 } 20736 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { 20737 pass = 4; 20738 goto send; 20739 } 20740 if (sack_rxmit) { 20741 pass = 6; 20742 goto send; 20743 } 20744 if (((tp->snd_wnd - ctf_outstanding(tp)) < segsiz) && 20745 (ctf_outstanding(tp) < (segsiz * 2))) { 20746 /* 20747 * We have less than two MSS outstanding (delayed ack) 20748 * and our rwnd will not let us send a full sized 20749 * MSS. Lets go ahead and let this small segment 20750 * out because we want to try to have at least two 20751 * packets inflight to not be caught by delayed ack. 20752 */ 20753 pass = 12; 20754 goto send; 20755 } 20756 } 20757 /* 20758 * Sending of standalone window updates. 20759 * 20760 * Window updates are important when we close our window due to a 20761 * full socket buffer and are opening it again after the application 20762 * reads data from it. Once the window has opened again and the 20763 * remote end starts to send again the ACK clock takes over and 20764 * provides the most current window information. 20765 * 20766 * We must avoid the silly window syndrome whereas every read from 20767 * the receive buffer, no matter how small, causes a window update 20768 * to be sent. 
We also should avoid sending a flurry of window 20769 * updates when the socket buffer had queued a lot of data and the 20770 * application is doing small reads. 20771 * 20772 * Prevent a flurry of pointless window updates by only sending an 20773 * update when we can increase the advertized window by more than 20774 * 1/4th of the socket buffer capacity. When the buffer is getting 20775 * full or is very small be more aggressive and send an update 20776 * whenever we can increase by two mss sized segments. In all other 20777 * situations the ACK's to new incoming data will carry further 20778 * window increases. 20779 * 20780 * Don't send an independent window update if a delayed ACK is 20781 * pending (it will get piggy-backed on it) or the remote side 20782 * already has done a half-close and won't send more data. Skip 20783 * this if the connection is in T/TCP half-open state. 20784 */ 20785 if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && 20786 !(tp->t_flags & TF_DELACK) && 20787 !TCPS_HAVERCVDFIN(tp->t_state)) { 20788 /* 20789 * "adv" is the amount we could increase the window, taking 20790 * into account that we are limited by TCP_MAXWIN << 20791 * tp->rcv_scale. 20792 */ 20793 int32_t adv; 20794 int oldwin; 20795 20796 adv = recwin; 20797 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { 20798 oldwin = (tp->rcv_adv - tp->rcv_nxt); 20799 if (adv > oldwin) 20800 adv -= oldwin; 20801 else { 20802 /* We can't increase the window */ 20803 adv = 0; 20804 } 20805 } else 20806 oldwin = 0; 20807 20808 /* 20809 * If the new window size ends up being the same as or less 20810 * than the old size when it is scaled, then don't force 20811 * a window update. 20812 */ 20813 if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale) 20814 goto dontupdate; 20815 20816 if (adv >= (int32_t)(2 * segsiz) && 20817 (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || 20818 recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) || 20819 so->so_rcv.sb_hiwat <= 8 * segsiz)) { 20820 pass = 7; 20821 goto send; 20822 } 20823 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) { 20824 pass = 23; 20825 goto send; 20826 } 20827 } 20828 dontupdate: 20829 20830 /* 20831 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW 20832 * is also a catch-all for the retransmit timer timeout case. 20833 */ 20834 if (tp->t_flags & TF_ACKNOW) { 20835 pass = 8; 20836 goto send; 20837 } 20838 if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) { 20839 pass = 9; 20840 goto send; 20841 } 20842 /* 20843 * If our state indicates that FIN should be sent and we have not 20844 * yet done so, then we need to send. 20845 */ 20846 if ((flags & TH_FIN) && 20847 (tp->snd_max == tp->snd_una)) { 20848 pass = 11; 20849 goto send; 20850 } 20851 /* 20852 * No reason to send a segment, just return. 20853 */ 20854 just_return: 20855 SOCK_SENDBUF_UNLOCK(so); 20856 just_return_nolock: 20857 { 20858 int app_limited = CTF_JR_SENT_DATA; 20859 20860 if ((tp->t_flags & TF_FASTOPEN) == 0 && 20861 (flags & TH_FIN) && 20862 (len == 0) && 20863 (sbused(sb) == (tp->snd_max - tp->snd_una)) && 20864 ((tp->snd_max - tp->snd_una) <= segsiz)) { 20865 /* 20866 * Ok less than or right at a MSS is 20867 * outstanding. The original FreeBSD stack would 20868 * have sent a FIN, which can speed things up for 20869 * a transactional application doing a MSG_WAITALL. 20870 * To speed things up since we do *not* send a FIN 20871 * if data is outstanding, we send a "challenge ack". 
20872 * The idea behind that is instead of having to have 20873 * the peer wait for the delayed-ack timer to run off 20874 * we send an ack that makes the peer send us an ack. 20875 */ 20876 rack_send_ack_challange(rack); 20877 } 20878 if (tot_len_this_send > 0) { 20879 rack->r_ctl.fsb.recwin = recwin; 20880 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz, __LINE__); 20881 if ((error == 0) && 20882 rack_use_rfo && 20883 ((flags & (TH_SYN|TH_FIN)) == 0) && 20884 (ipoptlen == 0) && 20885 rack->r_fsb_inited && 20886 TCPS_HAVEESTABLISHED(tp->t_state) && 20887 ((IN_RECOVERY(tp->t_flags)) == 0) && 20888 (doing_tlp == 0) && 20889 (rack->r_must_retran == 0) && 20890 ((tp->t_flags & TF_NEEDFIN) == 0) && 20891 (len > 0) && (orig_len > 0) && 20892 (orig_len > len) && 20893 ((orig_len - len) >= segsiz) && 20894 ((optlen == 0) || 20895 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { 20896 /* We can send at least one more MSS using our fsb */ 20897 rack_setup_fast_output(tp, rack, sb, len, orig_len, 20898 segsiz, pace_max_seg, hw_tls, flags); 20899 } else 20900 rack->r_fast_output = 0; 20901 rack_log_fsb(rack, tp, so, flags, 20902 ipoptlen, orig_len, len, 0, 20903 1, optlen, __LINE__, 1); 20904 /* Assure when we leave that snd_nxt will point to top */ 20905 if (SEQ_GT(tp->snd_max, tp->snd_nxt)) 20906 tp->snd_nxt = tp->snd_max; 20907 } else { 20908 int end_window = 0; 20909 uint32_t seq = tp->gput_ack; 20910 20911 rsm = tqhash_max(rack->r_ctl.tqh); 20912 if (rsm) { 20913 /* 20914 * Mark the last sent that we just-returned (hinting 20915 * that delayed ack may play a role in any rtt measurement). 20916 */ 20917 rsm->r_just_ret = 1; 20918 } 20919 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); 20920 rack->r_ctl.rc_agg_delayed = 0; 20921 rack->r_early = 0; 20922 rack->r_late = 0; 20923 rack->r_ctl.rc_agg_early = 0; 20924 if ((ctf_outstanding(tp) + 20925 min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)), 20926 minseg)) >= tp->snd_wnd) { 20927 /* We are limited by the rwnd */ 20928 app_limited = CTF_JR_RWND_LIMITED; 20929 if (IN_FASTRECOVERY(tp->t_flags)) 20930 rack->r_ctl.rc_prr_sndcnt = 0; 20931 } else if (ctf_outstanding(tp) >= sbavail(sb)) { 20932 /* We are limited by whats available -- app limited */ 20933 app_limited = CTF_JR_APP_LIMITED; 20934 if (IN_FASTRECOVERY(tp->t_flags)) 20935 rack->r_ctl.rc_prr_sndcnt = 0; 20936 } else if ((idle == 0) && 20937 ((tp->t_flags & TF_NODELAY) == 0) && 20938 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && 20939 (len < segsiz)) { 20940 /* 20941 * No delay is not on and the 20942 * user is sending less than 1MSS. This 20943 * brings out SWS avoidance so we 20944 * don't send. Another app-limited case. 20945 */ 20946 app_limited = CTF_JR_APP_LIMITED; 20947 } else if (tp->t_flags & TF_NOPUSH) { 20948 /* 20949 * The user has requested no push of 20950 * the last segment and we are 20951 * at the last segment. Another app 20952 * limited case. 20953 */ 20954 app_limited = CTF_JR_APP_LIMITED; 20955 } else if ((ctf_outstanding(tp) + minseg) > cwnd_to_use) { 20956 /* Its the cwnd */ 20957 app_limited = CTF_JR_CWND_LIMITED; 20958 } else if (IN_FASTRECOVERY(tp->t_flags) && 20959 (rack->rack_no_prr == 0) && 20960 (rack->r_ctl.rc_prr_sndcnt < segsiz)) { 20961 app_limited = CTF_JR_PRR; 20962 } else { 20963 /* Now why here are we not sending? 
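 * None of the recognized limited cases matched; we fall into the
 * assessing bucket (which panics under INVARIANTS when NOW is
 * defined) and it is reported through the just-return log below.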
*/
20964 #ifdef NOW
20965 #ifdef INVARIANTS
20966 panic("rack:%p hit JR_ASSESSING case cwnd_to_use:%u?", rack, cwnd_to_use);
20967 #endif
20968 #endif
20969 app_limited = CTF_JR_ASSESSING;
20970 }
20971 /*
20972 * App limited in some fashion; for our pacing GP
20973 * measurements we don't want any gap (even cwnd).
20974 * Close down the measurement window.
20975 */
20976 if (rack_cwnd_block_ends_measure &&
20977 ((app_limited == CTF_JR_CWND_LIMITED) ||
20978 (app_limited == CTF_JR_PRR))) {
20979 /*
20980 * The reason we are not sending is
20981 * the cwnd (or prr). We have been configured
20982 * to end the measurement window in
20983 * this case.
20984 */
20985 end_window = 1;
20986 } else if (rack_rwnd_block_ends_measure &&
20987 (app_limited == CTF_JR_RWND_LIMITED)) {
20988 /*
20989 * We are rwnd limited and have been
20990 * configured to end the measurement
20991 * window in this case.
20992 */
20993 end_window = 1;
20994 } else if (app_limited == CTF_JR_APP_LIMITED) {
20995 /*
20996 * A true application limited period, we have
20997 * run out of data.
20998 */
20999 end_window = 1;
21000 } else if (app_limited == CTF_JR_ASSESSING) {
21001 /*
21002 * In the assessing case we hit the end of
21003 * the if/else chain with no known reason.
21004 * This will panic us under invariants.
21005 *
21006 * If we get this out in the logs we need to
21007 * investigate which reason we missed.
21008 */
21009 end_window = 1;
21010 }
21011 if (end_window) {
21012 uint8_t log = 0;
21013
21014 /* Adjust the Gput measurement */
21015 if ((tp->t_flags & TF_GPUTINPROG) &&
21016 SEQ_GT(tp->gput_ack, tp->snd_max)) {
21017 tp->gput_ack = tp->snd_max;
21018 if ((tp->gput_ack - tp->gput_seq) < (MIN_GP_WIN * segsiz)) {
21019 /*
21020 * There is not enough to measure.
21021 */
21022 tp->t_flags &= ~TF_GPUTINPROG;
21023 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
21024 rack->r_ctl.rc_gp_srtt /*flex1*/,
21025 tp->gput_seq,
21026 0, 0, 18, __LINE__, NULL, 0);
21027 } else
21028 log = 1;
21029 }
21030 /* Mark the last packet as app limited */
21031 rsm = tqhash_max(rack->r_ctl.tqh);
21032 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) {
21033 if (rack->r_ctl.rc_app_limited_cnt == 0)
21034 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm;
21035 else {
21036 /*
21037 * Go out to the end app limited and mark
21038 * this new one as next and move the end_appl up
21039 * to this guy.
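 * That is, the previous rc_end_appl records this rsm's starting
 * sequence in r_nseq_appl and this rsm becomes the new tail of the
 * app-limited chain.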
21040 */ 21041 if (rack->r_ctl.rc_end_appl) 21042 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; 21043 rack->r_ctl.rc_end_appl = rsm; 21044 } 21045 rsm->r_flags |= RACK_APP_LIMITED; 21046 rack->r_ctl.rc_app_limited_cnt++; 21047 } 21048 if (log) 21049 rack_log_pacing_delay_calc(rack, 21050 rack->r_ctl.rc_app_limited_cnt, seq, 21051 tp->gput_ack, 0, 0, 4, __LINE__, NULL, 0); 21052 } 21053 } 21054 /* Check if we need to go into persists or not */ 21055 if ((tp->snd_max == tp->snd_una) && 21056 TCPS_HAVEESTABLISHED(tp->t_state) && 21057 sbavail(sb) && 21058 (sbavail(sb) > tp->snd_wnd) && 21059 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg))) { 21060 /* Yes lets make sure to move to persist before timer-start */ 21061 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, tp->snd_una); 21062 } 21063 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack); 21064 rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use); 21065 } 21066 #ifdef NETFLIX_SHARED_CWND 21067 if ((sbavail(sb) == 0) && 21068 rack->r_ctl.rc_scw) { 21069 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 21070 rack->rack_scwnd_is_idle = 1; 21071 } 21072 #endif 21073 #ifdef TCP_ACCOUNTING 21074 if (tot_len_this_send > 0) { 21075 crtsc = get_cyclecount(); 21076 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 21077 tp->tcp_cnt_counters[SND_OUT_DATA]++; 21078 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); 21079 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) / segsiz); 21080 } 21081 } else { 21082 crtsc = get_cyclecount(); 21083 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 21084 tp->tcp_cnt_counters[SND_LIMITED]++; 21085 tp->tcp_proc_time[SND_LIMITED] += (crtsc - ts_val); 21086 } 21087 } 21088 sched_unpin(); 21089 #endif 21090 return (0); 21091 21092 send: 21093 if ((rack->r_ctl.crte != NULL) && 21094 (rsm == NULL) && 21095 ((rack->rc_hw_nobuf == 1) || 21096 (rack_hw_check_queue && (check_done == 0)))) { 21097 /* 21098 * We only want to do this once with the hw_check_queue, 21099 * for the enobuf case we would only do it once if 21100 * we come around to again, the flag will be clear. 21101 */ 21102 check_done = 1; 21103 slot = rack_check_queue_level(rack, tp, &tv, cts, len, segsiz); 21104 if (slot) { 21105 rack->r_ctl.rc_agg_delayed = 0; 21106 rack->r_ctl.rc_agg_early = 0; 21107 rack->r_early = 0; 21108 rack->r_late = 0; 21109 SOCK_SENDBUF_UNLOCK(so); 21110 goto skip_all_send; 21111 } 21112 } 21113 if (rsm || sack_rxmit) 21114 counter_u64_add(rack_nfto_resend, 1); 21115 else 21116 counter_u64_add(rack_non_fto_send, 1); 21117 if ((flags & TH_FIN) && 21118 sbavail(sb)) { 21119 /* 21120 * We do not transmit a FIN 21121 * with data outstanding. We 21122 * need to make it so all data 21123 * is acked first. 21124 */ 21125 flags &= ~TH_FIN; 21126 if (TCPS_HAVEESTABLISHED(tp->t_state) && 21127 (sbused(sb) == (tp->snd_max - tp->snd_una)) && 21128 ((tp->snd_max - tp->snd_una) <= segsiz)) { 21129 /* 21130 * Ok less than or right at a MSS is 21131 * outstanding. The original FreeBSD stack would 21132 * have sent a FIN, which can speed things up for 21133 * a transactional application doing a MSG_WAITALL. 21134 * To speed things up since we do *not* send a FIN 21135 * if data is outstanding, we send a "challenge ack". 21136 * The idea behind that is instead of having to have 21137 * the peer wait for the delayed-ack timer to run off 21138 * we send an ack that makes the peer send us an ack. 
21139 */ 21140 rack_send_ack_challange(rack); 21141 } 21142 } 21143 /* Enforce stack imposed max seg size if we have one */ 21144 if (pace_max_seg && 21145 (len > pace_max_seg)) { 21146 mark = 1; 21147 len = pace_max_seg; 21148 } 21149 if ((rsm == NULL) && 21150 (rack->pcm_in_progress == 0) && 21151 (rack->r_ctl.pcm_max_seg > 0) && 21152 (len >= rack->r_ctl.pcm_max_seg)) { 21153 /* It is large enough for a measurement */ 21154 add_flag |= RACK_IS_PCM; 21155 rack_log_pcm(rack, 5, len, rack->r_ctl.pcm_max_seg, add_flag); 21156 } else if (rack_verbose_logging) { 21157 rack_log_pcm(rack, 6, len, rack->r_ctl.pcm_max_seg, add_flag); 21158 } 21159 21160 SOCKBUF_LOCK_ASSERT(sb); 21161 if (len > 0) { 21162 if (len >= segsiz) 21163 tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; 21164 else 21165 tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; 21166 } 21167 /* 21168 * Before ESTABLISHED, force sending of initial options unless TCP 21169 * set not to do any options. NOTE: we assume that the IP/TCP header 21170 * plus TCP options always fit in a single mbuf, leaving room for a 21171 * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr) 21172 * + optlen <= MCLBYTES 21173 */ 21174 optlen = 0; 21175 #ifdef INET6 21176 if (isipv6) 21177 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 21178 else 21179 #endif 21180 hdrlen = sizeof(struct tcpiphdr); 21181 21182 /* 21183 * Ok what seq are we sending from. If we have 21184 * no rsm to use, then we look at various bits, 21185 * if we are putting out a SYN it will be ISS. 21186 * If we are retransmitting a FIN it will 21187 * be snd_max-1 else its snd_max. 21188 */ 21189 if (rsm == NULL) { 21190 if (flags & TH_SYN) 21191 rack_seq = tp->iss; 21192 else if ((flags & TH_FIN) && 21193 (tp->t_flags & TF_SENTFIN)) 21194 rack_seq = tp->snd_max - 1; 21195 else 21196 rack_seq = tp->snd_max; 21197 } else { 21198 rack_seq = rsm->r_start; 21199 } 21200 /* 21201 * Compute options for segment. We only have to care about SYN and 21202 * established connection segments. Options for SYN-ACK segments 21203 * are handled in TCP syncache. 21204 */ 21205 to.to_flags = 0; 21206 if ((tp->t_flags & TF_NOOPT) == 0) { 21207 /* Maximum segment size. */ 21208 if (flags & TH_SYN) { 21209 to.to_mss = tcp_mssopt(&inp->inp_inc); 21210 if (tp->t_port) 21211 to.to_mss -= V_tcp_udp_tunneling_overhead; 21212 to.to_flags |= TOF_MSS; 21213 21214 /* 21215 * On SYN or SYN|ACK transmits on TFO connections, 21216 * only include the TFO option if it is not a 21217 * retransmit, as the presence of the TFO option may 21218 * have caused the original SYN or SYN|ACK to have 21219 * been dropped by a middlebox. 21220 */ 21221 if ((tp->t_flags & TF_FASTOPEN) && 21222 (tp->t_rxtshift == 0)) { 21223 if (tp->t_state == TCPS_SYN_RECEIVED) { 21224 to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; 21225 to.to_tfo_cookie = 21226 (u_int8_t *)&tp->t_tfo_cookie.server; 21227 to.to_flags |= TOF_FASTOPEN; 21228 wanted_cookie = 1; 21229 } else if (tp->t_state == TCPS_SYN_SENT) { 21230 to.to_tfo_len = 21231 tp->t_tfo_client_cookie_len; 21232 to.to_tfo_cookie = 21233 tp->t_tfo_cookie.client; 21234 to.to_flags |= TOF_FASTOPEN; 21235 wanted_cookie = 1; 21236 /* 21237 * If we wind up having more data to 21238 * send with the SYN than can fit in 21239 * one segment, don't send any more 21240 * until the SYN|ACK comes back from 21241 * the other end. 21242 */ 21243 sendalot = 0; 21244 } 21245 } 21246 } 21247 /* Window scaling. 
*/ 21248 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { 21249 to.to_wscale = tp->request_r_scale; 21250 to.to_flags |= TOF_SCALE; 21251 } 21252 /* Timestamps. */ 21253 if ((tp->t_flags & TF_RCVD_TSTMP) || 21254 ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { 21255 uint32_t ts_to_use; 21256 21257 if ((rack->r_rcvpath_rtt_up == 1) && 21258 (ms_cts == rack->r_ctl.last_rcv_tstmp_for_rtt)) { 21259 /* 21260 * When we are doing a rcv_rtt probe all 21261 * other timestamps use the next msec. This 21262 * is safe since our previous ack is in the 21263 * air and we will just have a few more 21264 * on the next ms. This assures that only 21265 * the one ack has the ms_cts that was on 21266 * our ack-probe. 21267 */ 21268 ts_to_use = ms_cts + 1; 21269 } else { 21270 ts_to_use = ms_cts; 21271 } 21272 to.to_tsval = ts_to_use + tp->ts_offset; 21273 to.to_tsecr = tp->ts_recent; 21274 to.to_flags |= TOF_TS; 21275 if ((len == 0) && 21276 (TCPS_HAVEESTABLISHED(tp->t_state)) && 21277 ((ms_cts - rack->r_ctl.last_rcv_tstmp_for_rtt) > RCV_PATH_RTT_MS) && 21278 (tp->snd_una == tp->snd_max) && 21279 (flags & TH_ACK) && 21280 (sbavail(sb) == 0) && 21281 (rack->r_ctl.current_round != 0) && 21282 ((flags & (TH_SYN|TH_FIN)) == 0) && 21283 (rack->r_rcvpath_rtt_up == 0)) { 21284 rack->r_ctl.last_rcv_tstmp_for_rtt = ms_cts; 21285 rack->r_ctl.last_time_of_arm_rcv = cts; 21286 rack->r_rcvpath_rtt_up = 1; 21287 /* Subtract 1 from seq to force a response */ 21288 rack_seq--; 21289 } 21290 } 21291 /* Set receive buffer autosizing timestamp. */ 21292 if (tp->rfbuf_ts == 0 && 21293 (so->so_rcv.sb_flags & SB_AUTOSIZE)) { 21294 tp->rfbuf_ts = ms_cts; 21295 } 21296 /* Selective ACK's. */ 21297 if (tp->t_flags & TF_SACK_PERMIT) { 21298 if (flags & TH_SYN) 21299 to.to_flags |= TOF_SACKPERM; 21300 else if (TCPS_HAVEESTABLISHED(tp->t_state) && 21301 tp->rcv_numsacks > 0) { 21302 to.to_flags |= TOF_SACK; 21303 to.to_nsacks = tp->rcv_numsacks; 21304 to.to_sacks = (u_char *)tp->sackblks; 21305 } 21306 } 21307 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 21308 /* TCP-MD5 (RFC2385). */ 21309 if (tp->t_flags & TF_SIGNATURE) 21310 to.to_flags |= TOF_SIGNATURE; 21311 #endif 21312 21313 /* Processing the options. */ 21314 hdrlen += optlen = tcp_addoptions(&to, opt); 21315 /* 21316 * If we wanted a TFO option to be added, but it was unable 21317 * to fit, ensure no data is sent. 21318 */ 21319 if ((tp->t_flags & TF_FASTOPEN) && wanted_cookie && 21320 !(to.to_flags & TOF_FASTOPEN)) 21321 len = 0; 21322 } 21323 if (tp->t_port) { 21324 if (V_tcp_udp_tunneling_port == 0) { 21325 /* The port was removed?? */ 21326 SOCK_SENDBUF_UNLOCK(so); 21327 #ifdef TCP_ACCOUNTING 21328 crtsc = get_cyclecount(); 21329 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 21330 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 21331 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 21332 } 21333 sched_unpin(); 21334 #endif 21335 return (EHOSTUNREACH); 21336 } 21337 hdrlen += sizeof(struct udphdr); 21338 } 21339 #ifdef INET6 21340 if (isipv6) 21341 ipoptlen = ip6_optlen(inp); 21342 else 21343 #endif 21344 if (inp->inp_options) 21345 ipoptlen = inp->inp_options->m_len - 21346 offsetof(struct ipoption, ipopt_list); 21347 else 21348 ipoptlen = 0; 21349 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 21350 ipoptlen += ipsec_optlen; 21351 #endif 21352 21353 /* 21354 * Adjust data length if insertion of options will bump the packet 21355 * length beyond the t_maxseg length. Clear the FIN bit because we 21356 * cut off the tail of the segment. 
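 * For TSO we instead clamp to the hardware limits (if_hw_tsomax)
 * and trim any trailing partial segment unless this send empties
 * the socket buffer; without TSO we simply shrink len to what fits.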
21357 */ 21358 if (len + optlen + ipoptlen > tp->t_maxseg) { 21359 if (tso) { 21360 uint32_t if_hw_tsomax; 21361 uint32_t moff; 21362 int32_t max_len; 21363 21364 /* extract TSO information */ 21365 if_hw_tsomax = tp->t_tsomax; 21366 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 21367 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 21368 KASSERT(ipoptlen == 0, 21369 ("%s: TSO can't do IP options", __func__)); 21370 21371 /* 21372 * Check if we should limit by maximum payload 21373 * length: 21374 */ 21375 if (if_hw_tsomax != 0) { 21376 /* compute maximum TSO length */ 21377 max_len = (if_hw_tsomax - hdrlen - 21378 max_linkhdr); 21379 if (max_len <= 0) { 21380 len = 0; 21381 } else if (len > max_len) { 21382 if (doing_tlp == 0) 21383 sendalot = 1; 21384 len = max_len; 21385 mark = 2; 21386 } 21387 } 21388 /* 21389 * Prevent the last segment from being fractional 21390 * unless the send sockbuf can be emptied: 21391 */ 21392 max_len = (tp->t_maxseg - optlen); 21393 if ((sb_offset + len) < sbavail(sb)) { 21394 moff = len % (u_int)max_len; 21395 if (moff != 0) { 21396 mark = 3; 21397 len -= moff; 21398 } 21399 } 21400 /* 21401 * In case there are too many small fragments don't 21402 * use TSO: 21403 */ 21404 if (len <= max_len) { 21405 mark = 4; 21406 tso = 0; 21407 } 21408 /* 21409 * Send the FIN in a separate segment after the bulk 21410 * sending is done. We don't trust the TSO 21411 * implementations to clear the FIN flag on all but 21412 * the last segment. 21413 */ 21414 if (tp->t_flags & TF_NEEDFIN) { 21415 sendalot = 4; 21416 } 21417 } else { 21418 mark = 5; 21419 if (optlen + ipoptlen >= tp->t_maxseg) { 21420 /* 21421 * Since we don't have enough space to put 21422 * the IP header chain and the TCP header in 21423 * one packet as required by RFC 7112, don't 21424 * send it. Also ensure that at least one 21425 * byte of the payload can be put into the 21426 * TCP segment. 21427 */ 21428 SOCK_SENDBUF_UNLOCK(so); 21429 error = EMSGSIZE; 21430 sack_rxmit = 0; 21431 goto out; 21432 } 21433 len = tp->t_maxseg - optlen - ipoptlen; 21434 sendalot = 5; 21435 } 21436 } else { 21437 tso = 0; 21438 mark = 6; 21439 } 21440 KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, 21441 ("%s: len > IP_MAXPACKET", __func__)); 21442 #ifdef DIAGNOSTIC 21443 #ifdef INET6 21444 if (max_linkhdr + hdrlen > MCLBYTES) 21445 #else 21446 if (max_linkhdr + hdrlen > MHLEN) 21447 #endif 21448 panic("tcphdr too big"); 21449 #endif 21450 21451 /* 21452 * This KASSERT is here to catch edge cases at a well defined place. 21453 * Before, those had triggered (random) panic conditions further 21454 * down. 21455 */ 21456 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 21457 if ((len == 0) && 21458 (flags & TH_FIN) && 21459 (sbused(sb))) { 21460 /* 21461 * We have outstanding data, don't send a fin by itself!. 21462 * 21463 * Check to see if we need to send a challenge ack. 21464 */ 21465 if ((sbused(sb) == (tp->snd_max - tp->snd_una)) && 21466 ((tp->snd_max - tp->snd_una) <= segsiz)) { 21467 /* 21468 * Ok less than or right at a MSS is 21469 * outstanding. The original FreeBSD stack would 21470 * have sent a FIN, which can speed things up for 21471 * a transactional application doing a MSG_WAITALL. 21472 * To speed things up since we do *not* send a FIN 21473 * if data is outstanding, we send a "challenge ack". 21474 * The idea behind that is instead of having to have 21475 * the peer wait for the delayed-ack timer to run off 21476 * we send an ack that makes the peer send us an ack. 
21477 */ 21478 rack_send_ack_challange(rack); 21479 } 21480 goto just_return; 21481 } 21482 /* 21483 * Grab a header mbuf, attaching a copy of data to be transmitted, 21484 * and initialize the header from the template for sends on this 21485 * connection. 21486 */ 21487 hw_tls = tp->t_nic_ktls_xmit != 0; 21488 if (len) { 21489 uint32_t max_val; 21490 uint32_t moff; 21491 21492 if (pace_max_seg) 21493 max_val = pace_max_seg; 21494 else 21495 max_val = len; 21496 /* 21497 * We allow a limit on sending with hptsi. 21498 */ 21499 if (len > max_val) { 21500 mark = 7; 21501 len = max_val; 21502 } 21503 #ifdef INET6 21504 if (MHLEN < hdrlen + max_linkhdr) 21505 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 21506 else 21507 #endif 21508 m = m_gethdr(M_NOWAIT, MT_DATA); 21509 21510 if (m == NULL) { 21511 SOCK_SENDBUF_UNLOCK(so); 21512 error = ENOBUFS; 21513 sack_rxmit = 0; 21514 goto out; 21515 } 21516 m->m_data += max_linkhdr; 21517 m->m_len = hdrlen; 21518 21519 /* 21520 * Start the m_copy functions from the closest mbuf to the 21521 * sb_offset in the socket buffer chain. 21522 */ 21523 mb = sbsndptr_noadv(sb, sb_offset, &moff); 21524 s_mb = mb; 21525 s_moff = moff; 21526 if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) { 21527 m_copydata(mb, moff, (int)len, 21528 mtod(m, caddr_t)+hdrlen); 21529 /* 21530 * If we are not retransmitting advance the 21531 * sndptr to help remember the next place in 21532 * the sb. 21533 */ 21534 if (rsm == NULL) 21535 sbsndptr_adv(sb, mb, len); 21536 m->m_len += len; 21537 } else { 21538 struct sockbuf *msb; 21539 21540 /* 21541 * If we are not retransmitting pass in msb so 21542 * the socket buffer can be advanced. Otherwise 21543 * set it to NULL if its a retransmission since 21544 * we don't want to change the sb remembered 21545 * location. 21546 */ 21547 if (rsm == NULL) 21548 msb = sb; 21549 else 21550 msb = NULL; 21551 m->m_next = tcp_m_copym( 21552 mb, moff, &len, 21553 if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, 21554 ((rsm == NULL) ? hw_tls : 0)); 21555 if (len <= (tp->t_maxseg - optlen)) { 21556 /* 21557 * Must have ran out of mbufs for the copy 21558 * shorten it to no longer need tso. Lets 21559 * not put on sendalot since we are low on 21560 * mbufs. 21561 */ 21562 tso = 0; 21563 } 21564 if (m->m_next == NULL) { 21565 SOCK_SENDBUF_UNLOCK(so); 21566 (void)m_free(m); 21567 error = ENOBUFS; 21568 sack_rxmit = 0; 21569 goto out; 21570 } 21571 } 21572 if (sack_rxmit) { 21573 if (rsm && (rsm->r_flags & RACK_TLP)) { 21574 /* 21575 * TLP should not count in retran count, but 21576 * in its own bin 21577 */ 21578 counter_u64_add(rack_tlp_retran, 1); 21579 counter_u64_add(rack_tlp_retran_bytes, len); 21580 } else { 21581 tp->t_sndrexmitpack++; 21582 KMOD_TCPSTAT_INC(tcps_sndrexmitpack); 21583 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len); 21584 } 21585 #ifdef STATS 21586 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 21587 len); 21588 #endif 21589 } else { 21590 KMOD_TCPSTAT_INC(tcps_sndpack); 21591 KMOD_TCPSTAT_ADD(tcps_sndbyte, len); 21592 #ifdef STATS 21593 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 21594 len); 21595 #endif 21596 } 21597 /* 21598 * If we're sending everything we've got, set PUSH. (This 21599 * will keep happy those implementations which only give 21600 * data to the user when a buffer fills or a PUSH comes in.) 
21601 */ 21602 if (sb_offset + len == sbused(sb) && 21603 sbused(sb) && 21604 !(flags & TH_SYN)) { 21605 flags |= TH_PUSH; 21606 add_flag |= RACK_HAD_PUSH; 21607 } 21608 SOCK_SENDBUF_UNLOCK(so); 21609 } else { 21610 SOCK_SENDBUF_UNLOCK(so); 21611 if (tp->t_flags & TF_ACKNOW) 21612 KMOD_TCPSTAT_INC(tcps_sndacks); 21613 else if (flags & (TH_SYN | TH_FIN | TH_RST)) 21614 KMOD_TCPSTAT_INC(tcps_sndctrl); 21615 else 21616 KMOD_TCPSTAT_INC(tcps_sndwinup); 21617 21618 m = m_gethdr(M_NOWAIT, MT_DATA); 21619 if (m == NULL) { 21620 error = ENOBUFS; 21621 sack_rxmit = 0; 21622 goto out; 21623 } 21624 #ifdef INET6 21625 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && 21626 MHLEN >= hdrlen) { 21627 M_ALIGN(m, hdrlen); 21628 } else 21629 #endif 21630 m->m_data += max_linkhdr; 21631 m->m_len = hdrlen; 21632 } 21633 SOCK_SENDBUF_UNLOCK_ASSERT(so); 21634 m->m_pkthdr.rcvif = (struct ifnet *)0; 21635 #ifdef MAC 21636 mac_inpcb_create_mbuf(inp, m); 21637 #endif 21638 if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) { 21639 #ifdef INET6 21640 if (isipv6) 21641 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 21642 else 21643 #endif /* INET6 */ 21644 #ifdef INET 21645 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 21646 #endif 21647 th = rack->r_ctl.fsb.th; 21648 udp = rack->r_ctl.fsb.udp; 21649 if (udp) { 21650 #ifdef INET6 21651 if (isipv6) 21652 ulen = hdrlen + len - sizeof(struct ip6_hdr); 21653 else 21654 #endif /* INET6 */ 21655 ulen = hdrlen + len - sizeof(struct ip); 21656 udp->uh_ulen = htons(ulen); 21657 } 21658 } else { 21659 #ifdef INET6 21660 if (isipv6) { 21661 ip6 = mtod(m, struct ip6_hdr *); 21662 if (tp->t_port) { 21663 udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr)); 21664 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 21665 udp->uh_dport = tp->t_port; 21666 ulen = hdrlen + len - sizeof(struct ip6_hdr); 21667 udp->uh_ulen = htons(ulen); 21668 th = (struct tcphdr *)(udp + 1); 21669 } else 21670 th = (struct tcphdr *)(ip6 + 1); 21671 tcpip_fillheaders(inp, tp->t_port, ip6, th); 21672 } else 21673 #endif /* INET6 */ 21674 { 21675 #ifdef INET 21676 ip = mtod(m, struct ip *); 21677 if (tp->t_port) { 21678 udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip)); 21679 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 21680 udp->uh_dport = tp->t_port; 21681 ulen = hdrlen + len - sizeof(struct ip); 21682 udp->uh_ulen = htons(ulen); 21683 th = (struct tcphdr *)(udp + 1); 21684 } else 21685 th = (struct tcphdr *)(ip + 1); 21686 tcpip_fillheaders(inp, tp->t_port, ip, th); 21687 #endif 21688 } 21689 } 21690 /* 21691 * If we are starting a connection, send ECN setup SYN packet. If we 21692 * are on a retransmit, we may resend those bits a number of times 21693 * as per RFC 3168. 
21694 */ 21695 if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) { 21696 flags |= tcp_ecn_output_syn_sent(tp); 21697 } 21698 /* Also handle parallel SYN for ECN */ 21699 if (TCPS_HAVERCVDSYN(tp->t_state) && 21700 (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { 21701 int ect = tcp_ecn_output_established(tp, &flags, len, sack_rxmit); 21702 if ((tp->t_state == TCPS_SYN_RECEIVED) && 21703 (tp->t_flags2 & TF2_ECN_SND_ECE)) 21704 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 21705 #ifdef INET6 21706 if (isipv6) { 21707 ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20); 21708 ip6->ip6_flow |= htonl(ect << 20); 21709 } 21710 else 21711 #endif 21712 { 21713 #ifdef INET 21714 ip->ip_tos &= ~IPTOS_ECN_MASK; 21715 ip->ip_tos |= ect; 21716 #endif 21717 } 21718 } 21719 th->th_seq = htonl(rack_seq); 21720 th->th_ack = htonl(tp->rcv_nxt); 21721 tcp_set_flags(th, flags); 21722 /* 21723 * Calculate receive window. Don't shrink window, but avoid silly 21724 * window syndrome. 21725 * If a RST segment is sent, advertise a window of zero. 21726 */ 21727 if (flags & TH_RST) { 21728 recwin = 0; 21729 } else { 21730 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && 21731 recwin < (long)segsiz) { 21732 recwin = 0; 21733 } 21734 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && 21735 recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) 21736 recwin = (long)(tp->rcv_adv - tp->rcv_nxt); 21737 } 21738 21739 /* 21740 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or 21741 * <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> case is 21742 * handled in syncache. 21743 */ 21744 if (flags & TH_SYN) 21745 th->th_win = htons((u_short) 21746 (min(sbspace(&so->so_rcv), TCP_MAXWIN))); 21747 else { 21748 /* Avoid shrinking window with window scaling. */ 21749 recwin = roundup2(recwin, 1 << tp->rcv_scale); 21750 th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); 21751 } 21752 /* 21753 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 21754 * window. This may cause the remote transmitter to stall. This 21755 * flag tells soreceive() to disable delayed acknowledgements when 21756 * draining the buffer. This can occur if the receiver is 21757 * attempting to read more data than can be buffered prior to 21758 * transmitting on the connection. 21759 */ 21760 if (th->th_win == 0) { 21761 tp->t_sndzerowin++; 21762 tp->t_flags |= TF_RXWIN0SENT; 21763 } else 21764 tp->t_flags &= ~TF_RXWIN0SENT; 21765 tp->snd_up = tp->snd_una; /* drag it along, its deprecated */ 21766 /* Now are we using fsb?, if so copy the template data to the mbuf */ 21767 if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) { 21768 uint8_t *cpto; 21769 21770 cpto = mtod(m, uint8_t *); 21771 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); 21772 /* 21773 * We have just copied in: 21774 * IP/IP6 21775 * <optional udphdr> 21776 * tcphdr (no options) 21777 * 21778 * We need to grab the correct pointers into the mbuf 21779 * for both the tcp header, and possibly the udp header (if tunneling). 21780 * We do this by using the offset in the copy buffer and adding it 21781 * to the mbuf base pointer (cpto). 
21782 */ 21783 #ifdef INET6 21784 if (isipv6) 21785 ip6 = mtod(m, struct ip6_hdr *); 21786 else 21787 #endif /* INET6 */ 21788 #ifdef INET 21789 ip = mtod(m, struct ip *); 21790 #endif 21791 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr)); 21792 /* If we have a udp header lets set it into the mbuf as well */ 21793 if (udp) 21794 udp = (struct udphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.udp - rack->r_ctl.fsb.tcp_ip_hdr)); 21795 } 21796 if (optlen) { 21797 bcopy(opt, th + 1, optlen); 21798 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 21799 } 21800 /* 21801 * Put TCP length in extended header, and then checksum extended 21802 * header and data. 21803 */ 21804 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 21805 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 21806 if (to.to_flags & TOF_SIGNATURE) { 21807 /* 21808 * Calculate MD5 signature and put it into the place 21809 * determined before. 21810 * NOTE: since TCP options buffer doesn't point into 21811 * mbuf's data, calculate offset and use it. 21812 */ 21813 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, 21814 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { 21815 /* 21816 * Do not send segment if the calculation of MD5 21817 * digest has failed. 21818 */ 21819 goto out; 21820 } 21821 } 21822 #endif 21823 #ifdef INET6 21824 if (isipv6) { 21825 /* 21826 * ip6_plen is not need to be filled now, and will be filled 21827 * in ip6_output. 21828 */ 21829 if (tp->t_port) { 21830 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 21831 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 21832 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 21833 th->th_sum = htons(0); 21834 UDPSTAT_INC(udps_opackets); 21835 } else { 21836 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 21837 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 21838 th->th_sum = in6_cksum_pseudo(ip6, 21839 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 21840 0); 21841 } 21842 } 21843 #endif 21844 #if defined(INET6) && defined(INET) 21845 else 21846 #endif 21847 #ifdef INET 21848 { 21849 if (tp->t_port) { 21850 m->m_pkthdr.csum_flags = CSUM_UDP; 21851 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 21852 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 21853 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 21854 th->th_sum = htons(0); 21855 UDPSTAT_INC(udps_opackets); 21856 } else { 21857 m->m_pkthdr.csum_flags = CSUM_TCP; 21858 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 21859 th->th_sum = in_pseudo(ip->ip_src.s_addr, 21860 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 21861 IPPROTO_TCP + len + optlen)); 21862 } 21863 /* IP version must be set here for ipv4/ipv6 checking later */ 21864 KASSERT(ip->ip_v == IPVERSION, 21865 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 21866 } 21867 #endif 21868 /* 21869 * Enable TSO and specify the size of the segments. The TCP pseudo 21870 * header checksum is always provided. XXX: Fixme: This is currently 21871 * not the case for IPv6. 21872 */ 21873 if (tso) { 21874 /* 21875 * Here we must use t_maxseg and the optlen since 21876 * the optlen may include SACK's (or DSACK). 
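 * Put another way, the segment size handed to TSO shrinks by however
 * many option bytes ride on this particular transmission.  A worked
 * example (numbers are only illustrative): with a t_maxseg of 1460 and
 * 12 bytes of timestamps plus a 12-byte SACK/DSACK block,
 *
 *	tso_segsz = 1460 - (12 + 12) = 1436
 *
 * so each segment the NIC produces still fits within the MSS once the
 * options are prepended.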
21877 */ 21878 KASSERT(len > tp->t_maxseg - optlen, 21879 ("%s: len <= tso_segsz", __func__)); 21880 m->m_pkthdr.csum_flags |= CSUM_TSO; 21881 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; 21882 } 21883 KASSERT(len + hdrlen == m_length(m, NULL), 21884 ("%s: mbuf chain different than expected: %d + %u != %u", 21885 __func__, len, hdrlen, m_length(m, NULL))); 21886 21887 #ifdef TCP_HHOOK 21888 /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ 21889 hhook_run_tcp_est_out(tp, th, &to, len, tso); 21890 #endif 21891 if ((rack->r_ctl.crte != NULL) && 21892 (rack->rc_hw_nobuf == 0) && 21893 tcp_bblogging_on(tp)) { 21894 rack_log_queue_level(tp, rack, len, &tv, cts); 21895 } 21896 /* We're getting ready to send; log now. */ 21897 if (tcp_bblogging_on(rack->rc_tp)) { 21898 union tcp_log_stackspecific log; 21899 21900 memset(&log, 0, sizeof(log)); 21901 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 21902 if (rack->rack_no_prr) 21903 log.u_bbr.flex1 = 0; 21904 else 21905 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 21906 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 21907 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 21908 log.u_bbr.flex4 = orig_len; 21909 /* Save off the early/late values */ 21910 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 21911 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 21912 log.u_bbr.bw_inuse = rack_get_bw(rack); 21913 log.u_bbr.cur_del_rate = rack->r_ctl.gp_bw; 21914 log.u_bbr.flex8 = 0; 21915 if (rsm) { 21916 if (rsm->r_flags & RACK_RWND_COLLAPSED) { 21917 rack_log_collapse(rack, rsm->r_start, rsm->r_end, 0, __LINE__, 5, rsm->r_flags, rsm); 21918 counter_u64_add(rack_collapsed_win_rxt, 1); 21919 counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start)); 21920 } 21921 if (doing_tlp) 21922 log.u_bbr.flex8 = 2; 21923 else 21924 log.u_bbr.flex8 = 1; 21925 } else { 21926 if (doing_tlp) 21927 log.u_bbr.flex8 = 3; 21928 } 21929 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); 21930 log.u_bbr.flex7 = mark; 21931 log.u_bbr.flex7 <<= 8; 21932 log.u_bbr.flex7 |= pass; 21933 log.u_bbr.pkts_out = tp->t_maxseg; 21934 log.u_bbr.timeStamp = cts; 21935 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 21936 if (rsm && (rsm->r_rtr_cnt > 0)) { 21937 /* 21938 * When we have a retransmit we want to log the 21939 * burst at send and flight at send from before. 21940 */ 21941 log.u_bbr.flex5 = rsm->r_fas; 21942 log.u_bbr.bbr_substate = rsm->r_bas; 21943 } else { 21944 /* 21945 * New transmits we log in flex5 the inflight again as 21946 * well as the number of segments in our send in the 21947 * substate field. 21948 */ 21949 log.u_bbr.flex5 = log.u_bbr.inflight; 21950 log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz); 21951 } 21952 log.u_bbr.lt_epoch = cwnd_to_use; 21953 log.u_bbr.delivered = sendalot; 21954 log.u_bbr.rttProp = (uintptr_t)rsm; 21955 log.u_bbr.pkt_epoch = __LINE__; 21956 if (rsm) { 21957 log.u_bbr.delRate = rsm->r_flags; 21958 log.u_bbr.delRate <<= 31; 21959 log.u_bbr.delRate |= rack->r_must_retran; 21960 log.u_bbr.delRate <<= 1; 21961 log.u_bbr.delRate |= (sack_rxmit & 0x00000001); 21962 } else { 21963 log.u_bbr.delRate = rack->r_must_retran; 21964 log.u_bbr.delRate <<= 1; 21965 log.u_bbr.delRate |= (sack_rxmit & 0x00000001); 21966 } 21967 lgb = tcp_log_event(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, 21968 len, &log, false, NULL, __func__, __LINE__, &tv); 21969 } else 21970 lgb = NULL; 21971 21972 /* 21973 * Fill in IP length and desired time to live and send to IP level. 
21974 * There should be a better way to handle ttl and tos; we could keep 21975 * them in the template, but need a way to checksum without them. 21976 */ 21977 /* 21978 * m->m_pkthdr.len should have been set before cksum calculation, 21979 * because in6_cksum() needs it. 21980 */ 21981 #ifdef INET6 21982 if (isipv6) { 21983 /* 21984 * We separately set hoplimit for every segment, since the 21985 * user might want to change the value via setsockopt. Also, 21986 * desired default hop limit might be changed via Neighbor 21987 * Discovery. 21988 */ 21989 rack->r_ctl.fsb.hoplimit = ip6->ip6_hlim = in6_selecthlim(inp, NULL); 21990 21991 /* 21992 * Set the packet size here for the benefit of DTrace 21993 * probes. ip6_output() will set it properly; it's supposed 21994 * to include the option header lengths as well. 21995 */ 21996 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 21997 21998 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 21999 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 22000 else 22001 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 22002 22003 if (tp->t_state == TCPS_SYN_SENT) 22004 TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); 22005 22006 TCP_PROBE5(send, NULL, tp, ip6, tp, th); 22007 /* TODO: IPv6 IP6TOS_ECT bit on */ 22008 error = ip6_output(m, 22009 inp->in6p_outputopts, 22010 &inp->inp_route6, 22011 ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 22012 NULL, NULL, inp); 22013 22014 if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL) 22015 mtu = inp->inp_route6.ro_nh->nh_mtu; 22016 } 22017 #endif /* INET6 */ 22018 #if defined(INET) && defined(INET6) 22019 else 22020 #endif 22021 #ifdef INET 22022 { 22023 ip->ip_len = htons(m->m_pkthdr.len); 22024 #ifdef INET6 22025 if (inp->inp_vflag & INP_IPV6PROTO) 22026 ip->ip_ttl = in6_selecthlim(inp, NULL); 22027 #endif /* INET6 */ 22028 rack->r_ctl.fsb.hoplimit = ip->ip_ttl; 22029 /* 22030 * If we do path MTU discovery, then we set DF on every 22031 * packet. This might not be the best thing to do according 22032 * to RFC3390 Section 2. However, the tcp hostcache mitigates 22033 * the problem so it affects only the first tcp connection 22034 * with a host. 22035 * 22036 * NB: Don't set DF on small MTU/MSS to have a safe 22037 * fallback. 22038 */ 22039 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 22040 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 22041 if (tp->t_port == 0 || len < V_tcp_minmss) { 22042 ip->ip_off |= htons(IP_DF); 22043 } 22044 } else { 22045 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 22046 } 22047 22048 if (tp->t_state == TCPS_SYN_SENT) 22049 TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); 22050 22051 TCP_PROBE5(send, NULL, tp, ip, tp, th); 22052 22053 error = ip_output(m, 22054 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 22055 inp->inp_options, 22056 #else 22057 NULL, 22058 #endif 22059 &inp->inp_route, 22060 ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0, 22061 inp); 22062 if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL) 22063 mtu = inp->inp_route.ro_nh->nh_mtu; 22064 } 22065 #endif /* INET */ 22066 if (lgb) { 22067 lgb->tlb_errno = error; 22068 lgb = NULL; 22069 } 22070 22071 out: 22072 /* 22073 * In transmit state, time the transmission and arrange for the 22074 * retransmit. In persist state, just set snd_max.
22075 */ 22076 if ((rsm == NULL) && doing_tlp) 22077 add_flag |= RACK_TLP; 22078 rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, 22079 rack_to_usec_ts(&tv), 22080 rsm, add_flag, s_mb, s_moff, hw_tls, segsiz); 22081 if (error == 0) { 22082 if (add_flag & RACK_IS_PCM) { 22083 /* We just launched a PCM */ 22084 /* rrs here log */ 22085 rack->pcm_in_progress = 1; 22086 rack->pcm_needed = 0; 22087 rack_log_pcm(rack, 7, len, rack->r_ctl.pcm_max_seg, add_flag); 22088 } 22089 if (rsm == NULL) { 22090 if (rack->lt_bw_up == 0) { 22091 rack->r_ctl.lt_timemark = tcp_tv_to_lusec(&tv); 22092 rack->r_ctl.lt_seq = tp->snd_una; 22093 rack->lt_bw_up = 1; 22094 } else if (((rack_seq + len) - rack->r_ctl.lt_seq) > 0x7fffffff) { 22095 /* 22096 * Need to record what we have since we are 22097 * approaching seq wrap. 22098 */ 22099 uint64_t tmark; 22100 22101 rack->r_ctl.lt_bw_bytes += (tp->snd_una - rack->r_ctl.lt_seq); 22102 rack->r_ctl.lt_seq = tp->snd_una; 22103 tmark = tcp_get_u64_usecs(&tv); 22104 if (tmark > rack->r_ctl.lt_timemark) { 22105 rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark); 22106 rack->r_ctl.lt_timemark = tmark; 22107 } 22108 } 22109 } 22110 rack->forced_ack = 0; /* If we send something zap the FA flag */ 22111 counter_u64_add(rack_total_bytes, len); 22112 tcp_account_for_send(tp, len, (rsm != NULL), doing_tlp, hw_tls); 22113 if (rsm && doing_tlp) { 22114 rack->rc_last_sent_tlp_past_cumack = 0; 22115 rack->rc_last_sent_tlp_seq_valid = 1; 22116 rack->r_ctl.last_sent_tlp_seq = rsm->r_start; 22117 rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start; 22118 } 22119 if (rack->rc_hw_nobuf) { 22120 rack->rc_hw_nobuf = 0; 22121 rack->r_ctl.rc_agg_delayed = 0; 22122 rack->r_early = 0; 22123 rack->r_late = 0; 22124 rack->r_ctl.rc_agg_early = 0; 22125 } 22126 if (rsm && (doing_tlp == 0)) { 22127 /* Set we retransmitted */ 22128 rack->rc_gp_saw_rec = 1; 22129 } else { 22130 if (cwnd_to_use > tp->snd_ssthresh) { 22131 /* Set we sent in CA */ 22132 rack->rc_gp_saw_ca = 1; 22133 } else { 22134 /* Set we sent in SS */ 22135 rack->rc_gp_saw_ss = 1; 22136 } 22137 } 22138 if (TCPS_HAVEESTABLISHED(tp->t_state) && 22139 (tp->t_flags & TF_SACK_PERMIT) && 22140 tp->rcv_numsacks > 0) 22141 tcp_clean_dsack_blocks(tp); 22142 tot_len_this_send += len; 22143 if (len == 0) { 22144 counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); 22145 } else { 22146 int idx; 22147 22148 idx = (len / segsiz) + 3; 22149 if (idx >= TCP_MSS_ACCT_ATIMER) 22150 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 22151 else 22152 counter_u64_add(rack_out_size[idx], 1); 22153 } 22154 } 22155 if ((rack->rack_no_prr == 0) && 22156 sub_from_prr && 22157 (error == 0)) { 22158 if (rack->r_ctl.rc_prr_sndcnt >= len) 22159 rack->r_ctl.rc_prr_sndcnt -= len; 22160 else 22161 rack->r_ctl.rc_prr_sndcnt = 0; 22162 } 22163 sub_from_prr = 0; 22164 if (rsm != NULL) { 22165 if (doing_tlp) 22166 /* Make sure the TLP is added */ 22167 rsm->r_flags |= RACK_TLP; 22168 else 22169 /* If its a resend without TLP then it must not have the flag */ 22170 rsm->r_flags &= ~RACK_TLP; 22171 } 22172 if ((error == 0) && 22173 (len > 0) && 22174 (tp->snd_una == tp->snd_max)) 22175 rack->r_ctl.rc_tlp_rxt_last_time = cts; 22176 22177 { 22178 /* 22179 * This block is not associated with the above error == 0 test. 22180 * It is used to advance snd_max if we have a new transmit. 
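 * The advance is by the payload length plus one extra sequence number
 * each for a SYN and a FIN, since both flags consume sequence space.
 * For example (made-up numbers), with snd_max at 1000, len of 1448 and
 * a FIN that has not been sent yet:
 *
 *	snd_max = 1000 + 1 (FIN) + 1448 = 2449
 *
 * Retransmissions (sack_rxmit) skip the block entirely, so snd_max is
 * never moved for resent data.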
22181 */ 22182 tcp_seq startseq = tp->snd_max; 22183 22184 22185 if (rsm && (doing_tlp == 0)) 22186 rack->r_ctl.rc_loss_count += rsm->r_end - rsm->r_start; 22187 if (error) 22188 /* We don't log or do anything with errors */ 22189 goto nomore; 22190 if (doing_tlp == 0) { 22191 if (rsm == NULL) { 22192 /* 22193 * Not a retransmission of some 22194 * sort, new data is going out so 22195 * clear our TLP count and flag. 22196 */ 22197 rack->rc_tlp_in_progress = 0; 22198 rack->r_ctl.rc_tlp_cnt_out = 0; 22199 } 22200 } else { 22201 /* 22202 * We have just sent a TLP, mark that it is true 22203 * and make sure our in progress is set so we 22204 * continue to check the count. 22205 */ 22206 rack->rc_tlp_in_progress = 1; 22207 rack->r_ctl.rc_tlp_cnt_out++; 22208 } 22209 /* 22210 * If we are retransmitting we are done, snd_max 22211 * does not get updated. 22212 */ 22213 if (sack_rxmit) 22214 goto nomore; 22215 if ((tp->snd_una == tp->snd_max) && (len > 0)) { 22216 /* 22217 * Update the time we just added data since 22218 * nothing was outstanding. 22219 */ 22220 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 22221 tp->t_acktime = ticks; 22222 } 22223 /* 22224 * Now for special SYN/FIN handling. 22225 */ 22226 if (flags & (TH_SYN | TH_FIN)) { 22227 if ((flags & TH_SYN) && 22228 ((tp->t_flags & TF_SENTSYN) == 0)) { 22229 tp->snd_max++; 22230 tp->t_flags |= TF_SENTSYN; 22231 } 22232 if ((flags & TH_FIN) && 22233 ((tp->t_flags & TF_SENTFIN) == 0)) { 22234 tp->snd_max++; 22235 tp->t_flags |= TF_SENTFIN; 22236 } 22237 } 22238 tp->snd_max += len; 22239 if (rack->rc_new_rnd_needed) { 22240 rack_new_round_starts(tp, rack, tp->snd_max); 22241 } 22242 /* 22243 * Time this transmission if not a retransmission and 22244 * not currently timing anything. 22245 * This is only relevant in case of switching back to 22246 * the base stack. 22247 */ 22248 if (tp->t_rtttime == 0) { 22249 tp->t_rtttime = ticks; 22250 tp->t_rtseq = startseq; 22251 KMOD_TCPSTAT_INC(tcps_segstimed); 22252 } 22253 if (len && 22254 ((tp->t_flags & TF_GPUTINPROG) == 0)) 22255 rack_start_gp_measurement(tp, rack, startseq, sb_offset); 22256 /* 22257 * If we are doing FO we need to update the mbuf position and subtract 22258 * this happens when the peer sends us duplicate information and 22259 * we thus want to send a DSACK. 22260 * 22261 * XXXRRS: This brings to mind a ?, when we send a DSACK block is TSO 22262 * turned off? If not then we are going to echo multiple DSACK blocks 22263 * out (with the TSO), which we should not be doing. 22264 */ 22265 if (rack->r_fast_output && len) { 22266 if (rack->r_ctl.fsb.left_to_send > len) 22267 rack->r_ctl.fsb.left_to_send -= len; 22268 else 22269 rack->r_ctl.fsb.left_to_send = 0; 22270 if (rack->r_ctl.fsb.left_to_send < segsiz) 22271 rack->r_fast_output = 0; 22272 if (rack->r_fast_output) { 22273 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); 22274 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; 22275 rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(rack->r_ctl.fsb.m); 22276 } 22277 } 22278 if (rack_pcm_blast == 0) { 22279 if ((orig_len > len) && 22280 (add_flag & RACK_IS_PCM) && 22281 (len < pace_max_seg) && 22282 ((pace_max_seg - len) > segsiz)) { 22283 /* 22284 * We are doing a PCM measurement and we did 22285 * not get enough data in the TSO to meet the 22286 * burst requirement. 
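 * In that case the shortfall is simply queued for another pass through
 * the send path rather than being dropped.  Roughly, with illustrative
 * numbers:
 *
 *	orig_len = 64240, len actually sent = 45000, segsiz = 1448
 *	n_len = 64240 - 45000 = 19240  -> relock and goto send
 *
 * so the PCM burst gets completed across several sends.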
22287 */ 22288 uint32_t n_len; 22289 22290 n_len = (orig_len - len); 22291 orig_len -= len; 22292 pace_max_seg -= len; 22293 len = n_len; 22294 sb_offset = tp->snd_max - tp->snd_una; 22295 /* Re-lock for the next spin */ 22296 SOCK_SENDBUF_LOCK(so); 22297 goto send; 22298 } 22299 } else { 22300 if ((orig_len > len) && 22301 (add_flag & RACK_IS_PCM) && 22302 ((orig_len - len) > segsiz)) { 22303 /* 22304 * We are doing a PCM measurement and we did 22305 * not get enough data in the TSO to meet the 22306 * burst requirement. 22307 */ 22308 uint32_t n_len; 22309 22310 n_len = (orig_len - len); 22311 orig_len -= len; 22312 len = n_len; 22313 sb_offset = tp->snd_max - tp->snd_una; 22314 /* Re-lock for the next spin */ 22315 SOCK_SENDBUF_LOCK(so); 22316 goto send; 22317 } 22318 } 22319 } 22320 nomore: 22321 if (error) { 22322 rack->r_ctl.rc_agg_delayed = 0; 22323 rack->r_early = 0; 22324 rack->r_late = 0; 22325 rack->r_ctl.rc_agg_early = 0; 22326 SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */ 22327 /* 22328 * Failures do not advance the seq counter above. For the 22329 * case of ENOBUFS we will fall out and retry in 1ms with 22330 * the hpts. Everything else will just have to retransmit 22331 * with the timer. 22332 * 22333 * In any case, we do not want to loop around for another 22334 * send without a good reason. 22335 */ 22336 sendalot = 0; 22337 switch (error) { 22338 case EPERM: 22339 case EACCES: 22340 tp->t_softerror = error; 22341 #ifdef TCP_ACCOUNTING 22342 crtsc = get_cyclecount(); 22343 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22344 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 22345 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 22346 } 22347 sched_unpin(); 22348 #endif 22349 return (error); 22350 case ENOBUFS: 22351 /* 22352 * Pace us right away to retry in some 22353 * time 22354 */ 22355 if (rack->r_ctl.crte != NULL) { 22356 tcp_trace_point(rack->rc_tp, TCP_TP_HWENOBUF); 22357 if (tcp_bblogging_on(rack->rc_tp)) 22358 rack_log_queue_level(tp, rack, len, &tv, cts); 22359 } else 22360 tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF); 22361 slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); 22362 if (rack->rc_enobuf < 0x7f) 22363 rack->rc_enobuf++; 22364 if (slot < (10 * HPTS_USEC_IN_MSEC)) 22365 slot = 10 * HPTS_USEC_IN_MSEC; 22366 if (rack->r_ctl.crte != NULL) { 22367 counter_u64_add(rack_saw_enobuf_hw, 1); 22368 tcp_rl_log_enobuf(rack->r_ctl.crte); 22369 } 22370 counter_u64_add(rack_saw_enobuf, 1); 22371 goto enobufs; 22372 case EMSGSIZE: 22373 /* 22374 * For some reason the interface we used initially 22375 * to send segments changed to another or lowered 22376 * its MTU. If TSO was active we either got an 22377 * interface without TSO capabilities or TSO was 22378 * turned off. If we obtained mtu from ip_output() 22379 * then update it and try again.
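 * The recovery below re-derives the MSS from the reported MTU and only
 * loops back if the segment size actually shrank, roughly:
 *
 *	saved = tp->t_maxseg;
 *	tcp_mss_update(tp, -1, mtu, NULL, NULL);
 *	if (saved > tp->t_maxseg)
 *		goto again;
 *
 * e.g. a path MTU drop from 1500 to 1280 brings an IPv6 t_maxseg down
 * from roughly 1440 to roughly 1220 before the segment is rebuilt.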
22380 */ 22381 if (tso) 22382 tp->t_flags &= ~TF_TSO; 22383 if (mtu != 0) { 22384 int saved_mtu; 22385 22386 saved_mtu = tp->t_maxseg; 22387 tcp_mss_update(tp, -1, mtu, NULL, NULL); 22388 if (saved_mtu > tp->t_maxseg) { 22389 goto again; 22390 } 22391 } 22392 slot = 10 * HPTS_USEC_IN_MSEC; 22393 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); 22394 #ifdef TCP_ACCOUNTING 22395 crtsc = get_cyclecount(); 22396 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22397 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 22398 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 22399 } 22400 sched_unpin(); 22401 #endif 22402 return (error); 22403 case ENETUNREACH: 22404 counter_u64_add(rack_saw_enetunreach, 1); 22405 /* FALLTHROUGH */ 22406 case EHOSTDOWN: 22407 case EHOSTUNREACH: 22408 case ENETDOWN: 22409 if (TCPS_HAVERCVDSYN(tp->t_state)) { 22410 tp->t_softerror = error; 22411 error = 0; 22412 } 22413 /* FALLTHROUGH */ 22414 default: 22415 slot = 10 * HPTS_USEC_IN_MSEC; 22416 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); 22417 #ifdef TCP_ACCOUNTING 22418 crtsc = get_cyclecount(); 22419 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22420 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 22421 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 22422 } 22423 sched_unpin(); 22424 #endif 22425 return (error); 22426 } 22427 } else { 22428 rack->rc_enobuf = 0; 22429 if (IN_FASTRECOVERY(tp->t_flags) && rsm) 22430 rack->r_ctl.retran_during_recovery += len; 22431 } 22432 KMOD_TCPSTAT_INC(tcps_sndtotal); 22433 22434 /* 22435 * Data sent (as far as we can tell). If this advertises a larger 22436 * window than any other segment, then remember the size of the 22437 * advertised window. Any pending ACK has now been sent. 22438 */ 22439 if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) 22440 tp->rcv_adv = tp->rcv_nxt + recwin; 22441 22442 tp->last_ack_sent = tp->rcv_nxt; 22443 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 22444 enobufs: 22445 if (sendalot) { 22446 /* Do we need to turn off sendalot? */ 22447 if (pace_max_seg && 22448 (tot_len_this_send >= pace_max_seg)) { 22449 /* We hit our max. */ 22450 sendalot = 0; 22451 } 22452 } 22453 if ((error == 0) && (flags & TH_FIN)) 22454 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN); 22455 if (flags & TH_RST) { 22456 /* 22457 * We don't send again after sending a RST. 22458 */ 22459 slot = 0; 22460 sendalot = 0; 22461 if (error == 0) 22462 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 22463 } else if ((slot == 0) && (sendalot == 0) && tot_len_this_send) { 22464 /* 22465 * Get our pacing rate, if an error 22466 * occurred in sending (ENOBUF) we would 22467 * hit the else if with slot preset. Other 22468 * errors return. 22469 */ 22470 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz, __LINE__); 22471 } 22472 /* We have sent clear the flag */ 22473 rack->r_ent_rec_ns = 0; 22474 if (rack->r_must_retran) { 22475 if (rsm) { 22476 rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start); 22477 if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) { 22478 /* 22479 * We have retransmitted all. 22480 */ 22481 rack->r_must_retran = 0; 22482 rack->r_ctl.rc_out_at_rto = 0; 22483 } 22484 } else if (SEQ_GEQ(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) { 22485 /* 22486 * Sending new data will also kill 22487 * the loop. 
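 * In other words the forced-retransmit state ends either when the
 * retransmissions have walked past the snd_max recorded at the RTO or
 * when new data is sent beyond that point, for instance (made-up
 * sequence numbers):
 *
 *	rc_snd_max_at_rto = 5000
 *	retransmit covering 4000..5100   -> r_must_retran cleared
 *	new data pushing snd_max to 5200 -> r_must_retran cleared
 *
 * with rc_out_at_rto decremented along the way so it tracks how much
 * of the pre-RTO data is still owed.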
22488 */ 22489 rack->r_must_retran = 0; 22490 rack->r_ctl.rc_out_at_rto = 0; 22491 } 22492 } 22493 rack->r_ctl.fsb.recwin = recwin; 22494 if ((tp->t_flags & (TF_WASCRECOVERY|TF_WASFRECOVERY)) && 22495 SEQ_GT(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) { 22496 /* 22497 * We hit an RTO and now have past snd_max at the RTO 22498 * clear all the WAS flags. 22499 */ 22500 tp->t_flags &= ~(TF_WASCRECOVERY|TF_WASFRECOVERY); 22501 } 22502 if (slot) { 22503 /* set the rack tcb into the slot N */ 22504 if ((error == 0) && 22505 rack_use_rfo && 22506 ((flags & (TH_SYN|TH_FIN)) == 0) && 22507 (rsm == NULL) && 22508 (ipoptlen == 0) && 22509 (doing_tlp == 0) && 22510 rack->r_fsb_inited && 22511 TCPS_HAVEESTABLISHED(tp->t_state) && 22512 ((IN_RECOVERY(tp->t_flags)) == 0) && 22513 (rack->r_must_retran == 0) && 22514 ((tp->t_flags & TF_NEEDFIN) == 0) && 22515 (len > 0) && (orig_len > 0) && 22516 (orig_len > len) && 22517 ((orig_len - len) >= segsiz) && 22518 ((optlen == 0) || 22519 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { 22520 /* We can send at least one more MSS using our fsb */ 22521 rack_setup_fast_output(tp, rack, sb, len, orig_len, 22522 segsiz, pace_max_seg, hw_tls, flags); 22523 } else 22524 rack->r_fast_output = 0; 22525 rack_log_fsb(rack, tp, so, flags, 22526 ipoptlen, orig_len, len, error, 22527 (rsm == NULL), optlen, __LINE__, 2); 22528 } else if (sendalot) { 22529 int ret; 22530 22531 sack_rxmit = 0; 22532 if ((error == 0) && 22533 rack_use_rfo && 22534 ((flags & (TH_SYN|TH_FIN)) == 0) && 22535 (rsm == NULL) && 22536 (doing_tlp == 0) && 22537 (ipoptlen == 0) && 22538 (rack->r_must_retran == 0) && 22539 rack->r_fsb_inited && 22540 TCPS_HAVEESTABLISHED(tp->t_state) && 22541 ((IN_RECOVERY(tp->t_flags)) == 0) && 22542 ((tp->t_flags & TF_NEEDFIN) == 0) && 22543 (len > 0) && (orig_len > 0) && 22544 (orig_len > len) && 22545 ((orig_len - len) >= segsiz) && 22546 ((optlen == 0) || 22547 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { 22548 /* we can use fast_output for more */ 22549 rack_setup_fast_output(tp, rack, sb, len, orig_len, 22550 segsiz, pace_max_seg, hw_tls, flags); 22551 if (rack->r_fast_output) { 22552 error = 0; 22553 ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, &tot_len_this_send, &error, __LINE__); 22554 if (ret >= 0) 22555 return (ret); 22556 else if (error) 22557 goto nomore; 22558 22559 } 22560 } 22561 goto again; 22562 } 22563 skip_all_send: 22564 /* Assure when we leave that snd_nxt will point to top */ 22565 if (SEQ_GT(tp->snd_max, tp->snd_nxt)) 22566 tp->snd_nxt = tp->snd_max; 22567 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0); 22568 #ifdef TCP_ACCOUNTING 22569 crtsc = get_cyclecount() - ts_val; 22570 if (tot_len_this_send) { 22571 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22572 tp->tcp_cnt_counters[SND_OUT_DATA]++; 22573 tp->tcp_proc_time[SND_OUT_DATA] += crtsc; 22574 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) /segsiz); 22575 } 22576 } else { 22577 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22578 tp->tcp_cnt_counters[SND_OUT_ACK]++; 22579 tp->tcp_proc_time[SND_OUT_ACK] += crtsc; 22580 } 22581 } 22582 sched_unpin(); 22583 #endif 22584 if (error == ENOBUFS) 22585 error = 0; 22586 return (error); 22587 } 22588 22589 static void 22590 rack_update_seg(struct tcp_rack *rack) 22591 { 22592 uint32_t orig_val; 22593 22594 orig_val = rack->r_ctl.rc_pace_max_segs; 22595 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 22596 if (orig_val != rack->r_ctl.rc_pace_max_segs) 22597 
rack_log_pacing_delay_calc(rack, 0, 0, orig_val, 0, 0, 15, __LINE__, NULL, 0); 22598 } 22599 22600 static void 22601 rack_mtu_change(struct tcpcb *tp) 22602 { 22603 /* 22604 * The MSS may have changed 22605 */ 22606 struct tcp_rack *rack; 22607 struct rack_sendmap *rsm; 22608 22609 rack = (struct tcp_rack *)tp->t_fb_ptr; 22610 if (rack->r_ctl.rc_pace_min_segs != ctf_fixed_maxseg(tp)) { 22611 /* 22612 * The MTU has changed we need to resend everything 22613 * since all we have sent is lost. We first fix 22614 * up the mtu though. 22615 */ 22616 rack_set_pace_segments(tp, rack, __LINE__, NULL); 22617 /* We treat this like a full retransmit timeout without the cwnd adjustment */ 22618 rack_remxt_tmr(tp); 22619 rack->r_fast_output = 0; 22620 rack->r_ctl.rc_out_at_rto = ctf_flight_size(tp, 22621 rack->r_ctl.rc_sacked); 22622 rack->r_ctl.rc_snd_max_at_rto = tp->snd_max; 22623 rack->r_must_retran = 1; 22624 /* Mark all inflight to needing to be rxt'd */ 22625 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 22626 rsm->r_flags |= (RACK_MUST_RXT|RACK_PMTU_CHG); 22627 } 22628 } 22629 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 22630 /* We don't use snd_nxt to retransmit */ 22631 tp->snd_nxt = tp->snd_max; 22632 } 22633 22634 static int 22635 rack_set_dgp(struct tcp_rack *rack) 22636 { 22637 if (rack->dgp_on == 1) 22638 return(0); 22639 if ((rack->use_fixed_rate == 1) && 22640 (rack->rc_always_pace == 1)) { 22641 /* 22642 * We are already pacing another 22643 * way. 22644 */ 22645 return (EBUSY); 22646 } 22647 if (rack->rc_always_pace == 1) { 22648 rack_remove_pacing(rack); 22649 } 22650 if (tcp_incr_dgp_pacing_cnt() == 0) 22651 return (ENOSPC); 22652 rack->r_ctl.pacing_method |= RACK_DGP_PACING; 22653 rack->rc_fillcw_apply_discount = 0; 22654 rack->dgp_on = 1; 22655 rack->rc_always_pace = 1; 22656 rack->rc_pace_dnd = 1; 22657 rack->use_fixed_rate = 0; 22658 if (rack->gp_ready) 22659 rack_set_cc_pacing(rack); 22660 rack->rc_tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 22661 rack->rack_attempt_hdwr_pace = 0; 22662 /* rxt settings */ 22663 rack->full_size_rxt = 1; 22664 rack->shape_rxt_to_pacing_min = 0; 22665 /* cmpack=1 */ 22666 rack->r_use_cmp_ack = 1; 22667 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) && 22668 rack->r_use_cmp_ack) 22669 rack->rc_tp->t_flags2 |= TF2_MBUF_ACKCMP; 22670 /* scwnd=1 */ 22671 rack->rack_enable_scwnd = 1; 22672 /* dynamic=100 */ 22673 rack->rc_gp_dyn_mul = 1; 22674 /* gp_inc_ca */ 22675 rack->r_ctl.rack_per_of_gp_ca = 100; 22676 /* rrr_conf=3 */ 22677 rack->r_rr_config = 3; 22678 /* npush=2 */ 22679 rack->r_ctl.rc_no_push_at_mrtt = 2; 22680 /* fillcw=1 */ 22681 rack->rc_pace_to_cwnd = 1; 22682 rack->rc_pace_fill_if_rttin_range = 0; 22683 rack->rtt_limit_mul = 0; 22684 /* noprr=1 */ 22685 rack->rack_no_prr = 1; 22686 /* lscwnd=1 */ 22687 rack->r_limit_scw = 1; 22688 /* gp_inc_rec */ 22689 rack->r_ctl.rack_per_of_gp_rec = 90; 22690 return (0); 22691 } 22692 22693 static int 22694 rack_set_profile(struct tcp_rack *rack, int prof) 22695 { 22696 int err = EINVAL; 22697 if (prof == 1) { 22698 /* 22699 * Profile 1 is "standard" DGP. It ignores 22700 * client buffer level. 22701 */ 22702 err = rack_set_dgp(rack); 22703 if (err) 22704 return (err); 22705 } else if (prof == 6) { 22706 err = rack_set_dgp(rack); 22707 if (err) 22708 return (err); 22709 /* 22710 * Profile 6 tweaks DGP so that it will apply to 22711 * fill-cw the same settings that profile5 does 22712 * to replace DGP. It gets then the max(dgp-rate, fillcw(discounted). 
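 * Put more plainly: with the discount applied, the rate actually used
 * ends up being the larger of the normal DGP rate and a reduced
 * fill-cw rate, conceptually something like (sketch only, the discount
 * shown here is hypothetical):
 *
 *	fill = fillcw_rate * (100 - discount) / 100;
 *	rate = max(dgp_rate, fill);
 *
 * which is what setting rc_fillcw_apply_discount below arranges.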
22713 */ 22714 rack->rc_fillcw_apply_discount = 1; 22715 } else if (prof == 0) { 22716 /* This changes things back to the default settings */ 22717 if (rack->rc_always_pace == 1) { 22718 rack_remove_pacing(rack); 22719 } else { 22720 /* Make sure any stray flags are off */ 22721 rack->dgp_on = 0; 22722 rack->rc_hybrid_mode = 0; 22723 rack->use_fixed_rate = 0; 22724 } 22725 err = 0; 22726 if (rack_fill_cw_state) 22727 rack->rc_pace_to_cwnd = 1; 22728 else 22729 rack->rc_pace_to_cwnd = 0; 22730 22731 if (rack_pace_every_seg && tcp_can_enable_pacing()) { 22732 rack->r_ctl.pacing_method |= RACK_REG_PACING; 22733 rack->rc_always_pace = 1; 22734 if (rack->rack_hibeta) 22735 rack_set_cc_pacing(rack); 22736 } else 22737 rack->rc_always_pace = 0; 22738 if (rack_dsack_std_based & 0x1) { 22739 /* Basically this means all rack timers are at least (srtt + 1/4 srtt) */ 22740 rack->rc_rack_tmr_std_based = 1; 22741 } 22742 if (rack_dsack_std_based & 0x2) { 22743 /* Basically this means rack timers are extended based on dsack by up to (2 * srtt) */ 22744 rack->rc_rack_use_dsack = 1; 22745 } 22746 if (rack_use_cmp_acks) 22747 rack->r_use_cmp_ack = 1; 22748 else 22749 rack->r_use_cmp_ack = 0; 22750 if (rack_disable_prr) 22751 rack->rack_no_prr = 1; 22752 else 22753 rack->rack_no_prr = 0; 22754 if (rack_gp_no_rec_chg) 22755 rack->rc_gp_no_rec_chg = 1; 22756 else 22757 rack->rc_gp_no_rec_chg = 0; 22758 if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack) { 22759 rack->r_mbuf_queue = 1; 22760 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state)) 22761 rack->rc_tp->t_flags2 |= TF2_MBUF_ACKCMP; 22762 rack->rc_tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 22763 } else { 22764 rack->r_mbuf_queue = 0; 22765 rack->rc_tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ; 22766 } 22767 if (rack_enable_shared_cwnd) 22768 rack->rack_enable_scwnd = 1; 22769 else 22770 rack->rack_enable_scwnd = 0; 22771 if (rack_do_dyn_mul) { 22772 /* When dynamic adjustment is on CA needs to start at 100% */ 22773 rack->rc_gp_dyn_mul = 1; 22774 if (rack_do_dyn_mul >= 100) 22775 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul; 22776 } else { 22777 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca; 22778 rack->rc_gp_dyn_mul = 0; 22779 } 22780 rack->r_rr_config = 0; 22781 rack->r_ctl.rc_no_push_at_mrtt = 0; 22782 rack->rc_pace_fill_if_rttin_range = 0; 22783 rack->rtt_limit_mul = 0; 22784 22785 if (rack_enable_hw_pacing) 22786 rack->rack_hdw_pace_ena = 1; 22787 else 22788 rack->rack_hdw_pace_ena = 0; 22789 if (rack_disable_prr) 22790 rack->rack_no_prr = 1; 22791 else 22792 rack->rack_no_prr = 0; 22793 if (rack_limits_scwnd) 22794 rack->r_limit_scw = 1; 22795 else 22796 rack->r_limit_scw = 0; 22797 rack_init_retransmit_value(rack, rack_rxt_controls); 22798 err = 0; 22799 } 22800 return (err); 22801 } 22802 22803 static int 22804 rack_add_deferred_option(struct tcp_rack *rack, int sopt_name, uint64_t loptval) 22805 { 22806 struct deferred_opt_list *dol; 22807 22808 dol = malloc(sizeof(struct deferred_opt_list), 22809 M_TCPDO, M_NOWAIT|M_ZERO); 22810 if (dol == NULL) { 22811 /* 22812 * No space yikes -- fail out.. 
22813 */ 22814 return (0); 22815 } 22816 dol->optname = sopt_name; 22817 dol->optval = loptval; 22818 TAILQ_INSERT_TAIL(&rack->r_ctl.opt_list, dol, next); 22819 return (1); 22820 } 22821 22822 static int 22823 process_hybrid_pacing(struct tcp_rack *rack, struct tcp_hybrid_req *hybrid) 22824 { 22825 #ifdef TCP_REQUEST_TRK 22826 struct tcp_sendfile_track *sft; 22827 struct timeval tv; 22828 tcp_seq seq; 22829 int err; 22830 22831 microuptime(&tv); 22832 22833 /* Make sure no fixed rate is on */ 22834 rack->use_fixed_rate = 0; 22835 rack->r_ctl.rc_fixed_pacing_rate_rec = 0; 22836 rack->r_ctl.rc_fixed_pacing_rate_ca = 0; 22837 rack->r_ctl.rc_fixed_pacing_rate_ss = 0; 22838 /* Now allocate or find our entry that will have these settings */ 22839 sft = tcp_req_alloc_req_full(rack->rc_tp, &hybrid->req, tcp_tv_to_lusec(&tv), 0); 22840 if (sft == NULL) { 22841 rack->rc_tp->tcp_hybrid_error++; 22842 /* no space, where would it have gone? */ 22843 seq = rack->rc_tp->snd_una + rack->rc_tp->t_inpcb.inp_socket->so_snd.sb_ccc; 22844 rack_log_hybrid(rack, seq, NULL, HYBRID_LOG_NO_ROOM, __LINE__, 0); 22845 return (ENOSPC); 22846 } 22847 /* mask our internal flags */ 22848 hybrid->hybrid_flags &= TCP_HYBRID_PACING_USER_MASK; 22849 /* The seq will be snd_una + everything in the buffer */ 22850 seq = sft->start_seq; 22851 if ((hybrid->hybrid_flags & TCP_HYBRID_PACING_ENABLE) == 0) { 22852 /* Disabling hybrid pacing */ 22853 if (rack->rc_hybrid_mode) { 22854 rack_set_profile(rack, 0); 22855 rack->rc_tp->tcp_hybrid_stop++; 22856 } 22857 rack_log_hybrid(rack, seq, sft, HYBRID_LOG_TURNED_OFF, __LINE__, 0); 22858 return (0); 22859 } 22860 if (rack->dgp_on == 0) { 22861 /* 22862 * If we have not yet turned DGP on, do so 22863 * now setting pure DGP mode, no buffer level 22864 * response. 22865 */ 22866 if ((err = rack_set_profile(rack, 1)) != 0){ 22867 /* Failed to turn pacing on */ 22868 rack->rc_tp->tcp_hybrid_error++; 22869 rack_log_hybrid(rack, seq, sft, HYBRID_LOG_NO_PACING, __LINE__, 0); 22870 return (err); 22871 } 22872 } 22873 /* 22874 * Now we must switch to hybrid mode as well which also 22875 * means moving to regular pacing. 22876 */ 22877 if (rack->rc_hybrid_mode == 0) { 22878 /* First time */ 22879 if (tcp_can_enable_pacing()) { 22880 rack->r_ctl.pacing_method |= RACK_REG_PACING; 22881 rack->rc_hybrid_mode = 1; 22882 } else { 22883 return (ENOSPC); 22884 } 22885 if (rack->r_ctl.pacing_method & RACK_DGP_PACING) { 22886 /* 22887 * This should be true. 
22888 */ 22889 tcp_dec_dgp_pacing_cnt(); 22890 rack->r_ctl.pacing_method &= ~RACK_DGP_PACING; 22891 } 22892 } 22893 /* Now set in our flags */ 22894 sft->hybrid_flags = hybrid->hybrid_flags | TCP_HYBRID_PACING_WASSET; 22895 if (hybrid->hybrid_flags & TCP_HYBRID_PACING_CSPR) 22896 sft->cspr = hybrid->cspr; 22897 else 22898 sft->cspr = 0; 22899 if (hybrid->hybrid_flags & TCP_HYBRID_PACING_H_MS) 22900 sft->hint_maxseg = hybrid->hint_maxseg; 22901 else 22902 sft->hint_maxseg = 0; 22903 rack->rc_tp->tcp_hybrid_start++; 22904 rack_log_hybrid(rack, seq, sft, HYBRID_LOG_RULES_SET, __LINE__,0); 22905 return (0); 22906 #else 22907 return (ENOTSUP); 22908 #endif 22909 } 22910 22911 static int 22912 rack_stack_information(struct tcpcb *tp, struct stack_specific_info *si) 22913 { 22914 /* We pulled a SSI info log out what was there */ 22915 si->bytes_transmitted = tp->t_sndbytes; 22916 si->bytes_retransmitted = tp->t_snd_rxt_bytes; 22917 return (0); 22918 } 22919 22920 static int 22921 rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, 22922 uint32_t optval, uint64_t loptval, struct tcp_hybrid_req *hybrid) 22923 22924 { 22925 struct epoch_tracker et; 22926 struct sockopt sopt; 22927 struct cc_newreno_opts opt; 22928 uint64_t val; 22929 int error = 0; 22930 uint16_t ca, ss; 22931 22932 switch (sopt_name) { 22933 case TCP_RACK_SET_RXT_OPTIONS: 22934 if (optval <= 2) { 22935 rack_init_retransmit_value(rack, optval); 22936 } else { 22937 /* 22938 * You must send in 0, 1 or 2 all else is 22939 * invalid. 22940 */ 22941 error = EINVAL; 22942 } 22943 break; 22944 case TCP_RACK_DSACK_OPT: 22945 RACK_OPTS_INC(tcp_rack_dsack_opt); 22946 if (optval & 0x1) { 22947 rack->rc_rack_tmr_std_based = 1; 22948 } else { 22949 rack->rc_rack_tmr_std_based = 0; 22950 } 22951 if (optval & 0x2) { 22952 rack->rc_rack_use_dsack = 1; 22953 } else { 22954 rack->rc_rack_use_dsack = 0; 22955 } 22956 rack_log_dsack_event(rack, 5, __LINE__, 0, 0); 22957 break; 22958 case TCP_RACK_PACING_DIVISOR: 22959 RACK_OPTS_INC(tcp_rack_pacing_divisor); 22960 if (optval == 0) { 22961 rack->r_ctl.pace_len_divisor = rack_default_pacing_divisor; 22962 } else { 22963 if (optval < RL_MIN_DIVISOR) 22964 rack->r_ctl.pace_len_divisor = RL_MIN_DIVISOR; 22965 else 22966 rack->r_ctl.pace_len_divisor = optval; 22967 } 22968 break; 22969 case TCP_RACK_HI_BETA: 22970 RACK_OPTS_INC(tcp_rack_hi_beta); 22971 if (optval > 0) { 22972 rack->rack_hibeta = 1; 22973 if ((optval >= 50) && 22974 (optval <= 100)) { 22975 /* 22976 * User wants to set a custom beta. 22977 */ 22978 rack->r_ctl.saved_hibeta = optval; 22979 if (rack->rc_pacing_cc_set) 22980 rack_undo_cc_pacing(rack); 22981 rack->r_ctl.rc_saved_beta = optval; 22982 } 22983 if (rack->rc_pacing_cc_set == 0) 22984 rack_set_cc_pacing(rack); 22985 } else { 22986 rack->rack_hibeta = 0; 22987 if (rack->rc_pacing_cc_set) 22988 rack_undo_cc_pacing(rack); 22989 } 22990 break; 22991 case TCP_RACK_PACING_BETA: 22992 error = EINVAL; 22993 break; 22994 case TCP_RACK_TIMER_SLOP: 22995 RACK_OPTS_INC(tcp_rack_timer_slop); 22996 rack->r_ctl.timer_slop = optval; 22997 if (rack->rc_tp->t_srtt) { 22998 /* 22999 * If we have an SRTT lets update t_rxtcur 23000 * to have the new slop. 23001 */ 23002 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 23003 rack_rto_min, rack_rto_max, 23004 rack->r_ctl.timer_slop); 23005 } 23006 break; 23007 case TCP_RACK_PACING_BETA_ECN: 23008 RACK_OPTS_INC(tcp_rack_beta_ecn); 23009 if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) { 23010 /* This only works for newreno. 
*/ 23011 error = EINVAL; 23012 break; 23013 } 23014 if (rack->rc_pacing_cc_set) { 23015 /* 23016 * Set them into the real CC module 23017 * whats in the rack pcb is the old values 23018 * to be used on restoral/ 23019 */ 23020 sopt.sopt_dir = SOPT_SET; 23021 opt.name = CC_NEWRENO_BETA_ECN; 23022 opt.val = optval; 23023 if (CC_ALGO(tp)->ctl_output != NULL) 23024 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt); 23025 else 23026 error = ENOENT; 23027 } else { 23028 /* 23029 * Not pacing yet so set it into our local 23030 * rack pcb storage. 23031 */ 23032 rack->r_ctl.rc_saved_beta_ecn = optval; 23033 } 23034 break; 23035 case TCP_DEFER_OPTIONS: 23036 RACK_OPTS_INC(tcp_defer_opt); 23037 if (optval) { 23038 if (rack->gp_ready) { 23039 /* Too late */ 23040 error = EINVAL; 23041 break; 23042 } 23043 rack->defer_options = 1; 23044 } else 23045 rack->defer_options = 0; 23046 break; 23047 case TCP_RACK_MEASURE_CNT: 23048 RACK_OPTS_INC(tcp_rack_measure_cnt); 23049 if (optval && (optval <= 0xff)) { 23050 rack->r_ctl.req_measurements = optval; 23051 } else 23052 error = EINVAL; 23053 break; 23054 case TCP_REC_ABC_VAL: 23055 RACK_OPTS_INC(tcp_rec_abc_val); 23056 if (optval > 0) 23057 rack->r_use_labc_for_rec = 1; 23058 else 23059 rack->r_use_labc_for_rec = 0; 23060 break; 23061 case TCP_RACK_ABC_VAL: 23062 RACK_OPTS_INC(tcp_rack_abc_val); 23063 if ((optval > 0) && (optval < 255)) 23064 rack->rc_labc = optval; 23065 else 23066 error = EINVAL; 23067 break; 23068 case TCP_HDWR_UP_ONLY: 23069 RACK_OPTS_INC(tcp_pacing_up_only); 23070 if (optval) 23071 rack->r_up_only = 1; 23072 else 23073 rack->r_up_only = 0; 23074 break; 23075 case TCP_FILLCW_RATE_CAP: /* URL:fillcw_cap */ 23076 RACK_OPTS_INC(tcp_fillcw_rate_cap); 23077 rack->r_ctl.fillcw_cap = loptval; 23078 break; 23079 case TCP_PACING_RATE_CAP: 23080 RACK_OPTS_INC(tcp_pacing_rate_cap); 23081 if ((rack->dgp_on == 1) && 23082 (rack->r_ctl.pacing_method & RACK_DGP_PACING)) { 23083 /* 23084 * If we are doing DGP we need to switch 23085 * to using the pacing limit. 23086 */ 23087 if (tcp_can_enable_pacing() == 0) { 23088 error = ENOSPC; 23089 break; 23090 } 23091 /* 23092 * Now change up the flags and counts to be correct. 23093 */ 23094 rack->r_ctl.pacing_method |= RACK_REG_PACING; 23095 tcp_dec_dgp_pacing_cnt(); 23096 rack->r_ctl.pacing_method &= ~RACK_DGP_PACING; 23097 } 23098 rack->r_ctl.bw_rate_cap = loptval; 23099 break; 23100 case TCP_HYBRID_PACING: 23101 if (hybrid == NULL) { 23102 error = EINVAL; 23103 break; 23104 } 23105 if (rack->r_ctl.side_chan_dis_mask & HYBRID_DIS_MASK) { 23106 error = EPERM; 23107 break; 23108 } 23109 error = process_hybrid_pacing(rack, hybrid); 23110 break; 23111 case TCP_SIDECHAN_DIS: /* URL:scodm */ 23112 if (optval) 23113 rack->r_ctl.side_chan_dis_mask = optval; 23114 else 23115 rack->r_ctl.side_chan_dis_mask = 0; 23116 break; 23117 case TCP_RACK_PROFILE: 23118 RACK_OPTS_INC(tcp_profile); 23119 error = rack_set_profile(rack, optval); 23120 break; 23121 case TCP_USE_CMP_ACKS: 23122 RACK_OPTS_INC(tcp_use_cmp_acks); 23123 if ((optval == 0) && (tp->t_flags2 & TF2_MBUF_ACKCMP)) { 23124 /* You can't turn it off once its on! 
*/ 23125 error = EINVAL; 23126 } else if ((optval == 1) && (rack->r_use_cmp_ack == 0)) { 23127 rack->r_use_cmp_ack = 1; 23128 rack->r_mbuf_queue = 1; 23129 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 23130 } 23131 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) 23132 tp->t_flags2 |= TF2_MBUF_ACKCMP; 23133 break; 23134 case TCP_SHARED_CWND_TIME_LIMIT: 23135 RACK_OPTS_INC(tcp_lscwnd); 23136 if (optval) 23137 rack->r_limit_scw = 1; 23138 else 23139 rack->r_limit_scw = 0; 23140 break; 23141 case TCP_RACK_DGP_IN_REC: 23142 error = EINVAL; 23143 break; 23144 case TCP_RACK_PACE_TO_FILL: 23145 RACK_OPTS_INC(tcp_fillcw); 23146 if (optval == 0) 23147 rack->rc_pace_to_cwnd = 0; 23148 else { 23149 rack->rc_pace_to_cwnd = 1; 23150 } 23151 if ((optval >= rack_gp_rtt_maxmul) && 23152 rack_gp_rtt_maxmul && 23153 (optval < 0xf)) { 23154 rack->rc_pace_fill_if_rttin_range = 1; 23155 rack->rtt_limit_mul = optval; 23156 } else { 23157 rack->rc_pace_fill_if_rttin_range = 0; 23158 rack->rtt_limit_mul = 0; 23159 } 23160 break; 23161 case TCP_RACK_NO_PUSH_AT_MAX: 23162 RACK_OPTS_INC(tcp_npush); 23163 if (optval == 0) 23164 rack->r_ctl.rc_no_push_at_mrtt = 0; 23165 else if (optval < 0xff) 23166 rack->r_ctl.rc_no_push_at_mrtt = optval; 23167 else 23168 error = EINVAL; 23169 break; 23170 case TCP_SHARED_CWND_ENABLE: 23171 RACK_OPTS_INC(tcp_rack_scwnd); 23172 if (optval == 0) 23173 rack->rack_enable_scwnd = 0; 23174 else 23175 rack->rack_enable_scwnd = 1; 23176 break; 23177 case TCP_RACK_MBUF_QUEUE: 23178 /* Now do we use the LRO mbuf-queue feature */ 23179 RACK_OPTS_INC(tcp_rack_mbufq); 23180 if (optval || rack->r_use_cmp_ack) 23181 rack->r_mbuf_queue = 1; 23182 else 23183 rack->r_mbuf_queue = 0; 23184 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 23185 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 23186 else 23187 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ; 23188 break; 23189 case TCP_RACK_NONRXT_CFG_RATE: 23190 RACK_OPTS_INC(tcp_rack_cfg_rate); 23191 if (optval == 0) 23192 rack->rack_rec_nonrxt_use_cr = 0; 23193 else 23194 rack->rack_rec_nonrxt_use_cr = 1; 23195 break; 23196 case TCP_NO_PRR: 23197 RACK_OPTS_INC(tcp_rack_noprr); 23198 if (optval == 0) 23199 rack->rack_no_prr = 0; 23200 else if (optval == 1) 23201 rack->rack_no_prr = 1; 23202 else if (optval == 2) 23203 rack->no_prr_addback = 1; 23204 else 23205 error = EINVAL; 23206 break; 23207 case RACK_CSPR_IS_FCC: /* URL:csprisfcc */ 23208 if (optval > 0) 23209 rack->cspr_is_fcc = 1; 23210 else 23211 rack->cspr_is_fcc = 0; 23212 break; 23213 case TCP_TIMELY_DYN_ADJ: 23214 RACK_OPTS_INC(tcp_timely_dyn); 23215 if (optval == 0) 23216 rack->rc_gp_dyn_mul = 0; 23217 else { 23218 rack->rc_gp_dyn_mul = 1; 23219 if (optval >= 100) { 23220 /* 23221 * If the user sets something 100 or more 23222 * its the gp_ca value. 23223 */ 23224 rack->r_ctl.rack_per_of_gp_ca = optval; 23225 } 23226 } 23227 break; 23228 case TCP_RACK_DO_DETECTION: 23229 error = EINVAL; 23230 break; 23231 case TCP_RACK_TLP_USE: 23232 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) { 23233 error = EINVAL; 23234 break; 23235 } 23236 RACK_OPTS_INC(tcp_tlp_use); 23237 rack->rack_tlp_threshold_use = optval; 23238 break; 23239 case TCP_RACK_TLP_REDUCE: 23240 /* RACK TLP cwnd reduction (bool) */ 23241 RACK_OPTS_INC(tcp_rack_tlp_reduce); 23242 rack->r_ctl.rc_tlp_cwnd_reduce = optval; 23243 break; 23244 /* Pacing related ones */ 23245 case TCP_RACK_PACE_ALWAYS: 23246 /* 23247 * zero is old rack method, 1 is new 23248 * method using a pacing rate. 
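 * From userland this is just a TCP-level socket option; a minimal
 * sketch of enabling it (illustrative only) would be:
 *
 *	int on = 1;
 *	if (setsockopt(fd, IPPROTO_TCP, TCP_RACK_PACE_ALWAYS,
 *	    &on, sizeof(on)) == -1)
 *		warn("TCP_RACK_PACE_ALWAYS");
 *
 * subject to the side-channel mask and pacing-count checks below
 * (which can hand back EPERM, EALREADY or ENOSPC).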
23249 */ 23250 RACK_OPTS_INC(tcp_rack_pace_always); 23251 if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) { 23252 error = EPERM; 23253 break; 23254 } 23255 if (optval > 0) { 23256 if (rack->rc_always_pace) { 23257 error = EALREADY; 23258 break; 23259 } else if (tcp_can_enable_pacing()) { 23260 rack->r_ctl.pacing_method |= RACK_REG_PACING; 23261 rack->rc_always_pace = 1; 23262 if (rack->rack_hibeta) 23263 rack_set_cc_pacing(rack); 23264 } 23265 else { 23266 error = ENOSPC; 23267 break; 23268 } 23269 } else { 23270 if (rack->rc_always_pace == 1) { 23271 rack_remove_pacing(rack); 23272 } 23273 } 23274 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 23275 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 23276 else 23277 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ; 23278 /* A rate may be set irate or other, if so set seg size */ 23279 rack_update_seg(rack); 23280 break; 23281 case TCP_BBR_RACK_INIT_RATE: 23282 RACK_OPTS_INC(tcp_initial_rate); 23283 val = optval; 23284 /* Change from kbits per second to bytes per second */ 23285 val *= 1000; 23286 val /= 8; 23287 rack->r_ctl.init_rate = val; 23288 if (rack->rc_always_pace) 23289 rack_update_seg(rack); 23290 break; 23291 case TCP_BBR_IWINTSO: 23292 error = EINVAL; 23293 break; 23294 case TCP_RACK_FORCE_MSEG: 23295 RACK_OPTS_INC(tcp_rack_force_max_seg); 23296 if (optval) 23297 rack->rc_force_max_seg = 1; 23298 else 23299 rack->rc_force_max_seg = 0; 23300 break; 23301 case TCP_RACK_PACE_MIN_SEG: 23302 RACK_OPTS_INC(tcp_rack_min_seg); 23303 rack->r_ctl.rc_user_set_min_segs = (0x0000ffff & optval); 23304 rack_set_pace_segments(tp, rack, __LINE__, NULL); 23305 break; 23306 case TCP_RACK_PACE_MAX_SEG: 23307 /* Max segments size in a pace in bytes */ 23308 RACK_OPTS_INC(tcp_rack_max_seg); 23309 if ((rack->dgp_on == 1) && 23310 (rack->r_ctl.pacing_method & RACK_DGP_PACING)) { 23311 /* 23312 * If we set a max-seg and are doing DGP then 23313 * we now fall under the pacing limits not the 23314 * DGP ones. 23315 */ 23316 if (tcp_can_enable_pacing() == 0) { 23317 error = ENOSPC; 23318 break; 23319 } 23320 /* 23321 * Now change up the flags and counts to be correct. 23322 */ 23323 rack->r_ctl.pacing_method |= RACK_REG_PACING; 23324 tcp_dec_dgp_pacing_cnt(); 23325 rack->r_ctl.pacing_method &= ~RACK_DGP_PACING; 23326 } 23327 if (optval <= MAX_USER_SET_SEG) 23328 rack->rc_user_set_max_segs = optval; 23329 else 23330 rack->rc_user_set_max_segs = MAX_USER_SET_SEG; 23331 rack_set_pace_segments(tp, rack, __LINE__, NULL); 23332 break; 23333 case TCP_RACK_PACE_RATE_REC: 23334 /* Set the fixed pacing rate in Bytes per second ca */ 23335 RACK_OPTS_INC(tcp_rack_pace_rate_rec); 23336 if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) { 23337 error = EPERM; 23338 break; 23339 } 23340 if (rack->dgp_on) { 23341 /* 23342 * We are already pacing another 23343 * way. 
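 * When DGP is not active, the code below instead installs the
 * requested rate (in bytes per second) for recovery and seeds the
 * slow-start and congestion-avoidance rates from it if they are still
 * zero.  For example, setting optval to 2500000 (about 20 Mbit/s):
 *
 *	rc_fixed_pacing_rate_rec = 2500000
 *	rc_fixed_pacing_rate_ca  = 2500000  (was 0)
 *	rc_fixed_pacing_rate_ss  = 2500000  (was 0)
 *
 * after which use_fixed_rate is set and pacing follows these values.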
23344 */ 23345 error = EBUSY; 23346 break; 23347 } 23348 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 23349 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) 23350 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 23351 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) 23352 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 23353 rack->use_fixed_rate = 1; 23354 if (rack->rack_hibeta) 23355 rack_set_cc_pacing(rack); 23356 rack_log_pacing_delay_calc(rack, 23357 rack->r_ctl.rc_fixed_pacing_rate_ss, 23358 rack->r_ctl.rc_fixed_pacing_rate_ca, 23359 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 23360 __LINE__, NULL,0); 23361 break; 23362 23363 case TCP_RACK_PACE_RATE_SS: 23364 /* Set the fixed pacing rate in Bytes per second ca */ 23365 RACK_OPTS_INC(tcp_rack_pace_rate_ss); 23366 if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) { 23367 error = EPERM; 23368 break; 23369 } 23370 if (rack->dgp_on) { 23371 /* 23372 * We are already pacing another 23373 * way. 23374 */ 23375 error = EBUSY; 23376 break; 23377 } 23378 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 23379 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) 23380 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 23381 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) 23382 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 23383 rack->use_fixed_rate = 1; 23384 if (rack->rack_hibeta) 23385 rack_set_cc_pacing(rack); 23386 rack_log_pacing_delay_calc(rack, 23387 rack->r_ctl.rc_fixed_pacing_rate_ss, 23388 rack->r_ctl.rc_fixed_pacing_rate_ca, 23389 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 23390 __LINE__, NULL, 0); 23391 break; 23392 23393 case TCP_RACK_PACE_RATE_CA: 23394 /* Set the fixed pacing rate in Bytes per second ca */ 23395 RACK_OPTS_INC(tcp_rack_pace_rate_ca); 23396 if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) { 23397 error = EPERM; 23398 break; 23399 } 23400 if (rack->dgp_on) { 23401 /* 23402 * We are already pacing another 23403 * way. 23404 */ 23405 error = EBUSY; 23406 break; 23407 } 23408 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 23409 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) 23410 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 23411 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) 23412 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 23413 rack->use_fixed_rate = 1; 23414 if (rack->rack_hibeta) 23415 rack_set_cc_pacing(rack); 23416 rack_log_pacing_delay_calc(rack, 23417 rack->r_ctl.rc_fixed_pacing_rate_ss, 23418 rack->r_ctl.rc_fixed_pacing_rate_ca, 23419 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 23420 __LINE__, NULL, 0); 23421 break; 23422 case TCP_RACK_GP_INCREASE_REC: 23423 RACK_OPTS_INC(tcp_gp_inc_rec); 23424 rack->r_ctl.rack_per_of_gp_rec = optval; 23425 rack_log_pacing_delay_calc(rack, 23426 rack->r_ctl.rack_per_of_gp_ss, 23427 rack->r_ctl.rack_per_of_gp_ca, 23428 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 23429 __LINE__, NULL, 0); 23430 break; 23431 case TCP_RACK_GP_INCREASE_CA: 23432 RACK_OPTS_INC(tcp_gp_inc_ca); 23433 ca = optval; 23434 if (ca < 100) { 23435 /* 23436 * We don't allow any reduction 23437 * over the GP b/w. 23438 */ 23439 error = EINVAL; 23440 break; 23441 } 23442 rack->r_ctl.rack_per_of_gp_ca = ca; 23443 rack_log_pacing_delay_calc(rack, 23444 rack->r_ctl.rack_per_of_gp_ss, 23445 rack->r_ctl.rack_per_of_gp_ca, 23446 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 23447 __LINE__, NULL, 0); 23448 break; 23449 case TCP_RACK_GP_INCREASE_SS: 23450 RACK_OPTS_INC(tcp_gp_inc_ss); 23451 ss = optval; 23452 if (ss < 100) { 23453 /* 23454 * We don't allow any reduction 23455 * over the GP b/w. 
23456 */ 23457 error = EINVAL; 23458 break; 23459 } 23460 rack->r_ctl.rack_per_of_gp_ss = ss; 23461 rack_log_pacing_delay_calc(rack, 23462 rack->r_ctl.rack_per_of_gp_ss, 23463 rack->r_ctl.rack_per_of_gp_ca, 23464 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 23465 __LINE__, NULL, 0); 23466 break; 23467 case TCP_RACK_RR_CONF: 23468 RACK_OPTS_INC(tcp_rack_rrr_no_conf_rate); 23469 if (optval && optval <= 3) 23470 rack->r_rr_config = optval; 23471 else 23472 rack->r_rr_config = 0; 23473 break; 23474 case TCP_PACING_DND: /* URL:dnd */ 23475 if (optval > 0) 23476 rack->rc_pace_dnd = 1; 23477 else 23478 rack->rc_pace_dnd = 0; 23479 break; 23480 case TCP_HDWR_RATE_CAP: 23481 RACK_OPTS_INC(tcp_hdwr_rate_cap); 23482 if (optval) { 23483 if (rack->r_rack_hw_rate_caps == 0) 23484 rack->r_rack_hw_rate_caps = 1; 23485 else 23486 error = EALREADY; 23487 } else { 23488 rack->r_rack_hw_rate_caps = 0; 23489 } 23490 break; 23491 case TCP_DGP_UPPER_BOUNDS: 23492 { 23493 uint8_t val; 23494 val = optval & 0x0000ff; 23495 rack->r_ctl.rack_per_upper_bound_ca = val; 23496 val = (optval >> 16) & 0x0000ff; 23497 rack->r_ctl.rack_per_upper_bound_ss = val; 23498 break; 23499 } 23500 case TCP_SS_EEXIT: /* URL:eexit */ 23501 if (optval > 0) { 23502 rack->r_ctl.gp_rnd_thresh = optval & 0x0ff; 23503 if (optval & 0x10000) { 23504 rack->r_ctl.gate_to_fs = 1; 23505 } else { 23506 rack->r_ctl.gate_to_fs = 0; 23507 } 23508 if (optval & 0x20000) { 23509 rack->r_ctl.use_gp_not_last = 1; 23510 } else { 23511 rack->r_ctl.use_gp_not_last = 0; 23512 } 23513 if (optval & 0xfffc0000) { 23514 uint32_t v; 23515 23516 v = (optval >> 18) & 0x00003fff; 23517 if (v >= 1000) 23518 rack->r_ctl.gp_gain_req = v; 23519 } 23520 } else { 23521 /* We do not do ss early exit at all */ 23522 rack->rc_initial_ss_comp = 1; 23523 rack->r_ctl.gp_rnd_thresh = 0; 23524 } 23525 break; 23526 case TCP_RACK_SPLIT_LIMIT: 23527 RACK_OPTS_INC(tcp_split_limit); 23528 rack->r_ctl.rc_split_limit = optval; 23529 break; 23530 case TCP_BBR_HDWR_PACE: 23531 RACK_OPTS_INC(tcp_hdwr_pacing); 23532 if (optval){ 23533 if (rack->rack_hdrw_pacing == 0) { 23534 rack->rack_hdw_pace_ena = 1; 23535 rack->rack_attempt_hdwr_pace = 0; 23536 } else 23537 error = EALREADY; 23538 } else { 23539 rack->rack_hdw_pace_ena = 0; 23540 #ifdef RATELIMIT 23541 if (rack->r_ctl.crte != NULL) { 23542 rack->rack_hdrw_pacing = 0; 23543 rack->rack_attempt_hdwr_pace = 0; 23544 tcp_rel_pacing_rate(rack->r_ctl.crte, tp); 23545 rack->r_ctl.crte = NULL; 23546 } 23547 #endif 23548 } 23549 break; 23550 /* End Pacing related ones */ 23551 case TCP_RACK_PRR_SENDALOT: 23552 /* Allow PRR to send more than one seg */ 23553 RACK_OPTS_INC(tcp_rack_prr_sendalot); 23554 rack->r_ctl.rc_prr_sendalot = optval; 23555 break; 23556 case TCP_RACK_MIN_TO: 23557 /* Minimum time between rack t-o's in ms */ 23558 RACK_OPTS_INC(tcp_rack_min_to); 23559 rack->r_ctl.rc_min_to = optval; 23560 break; 23561 case TCP_RACK_EARLY_SEG: 23562 /* If early recovery max segments */ 23563 RACK_OPTS_INC(tcp_rack_early_seg); 23564 rack->r_ctl.rc_early_recovery_segs = optval; 23565 break; 23566 case TCP_RACK_ENABLE_HYSTART: 23567 { 23568 if (optval) { 23569 tp->t_ccv.flags |= CCF_HYSTART_ALLOWED; 23570 if (rack_do_hystart > RACK_HYSTART_ON) 23571 tp->t_ccv.flags |= CCF_HYSTART_CAN_SH_CWND; 23572 if (rack_do_hystart > RACK_HYSTART_ON_W_SC) 23573 tp->t_ccv.flags |= CCF_HYSTART_CONS_SSTH; 23574 } else { 23575 tp->t_ccv.flags &= ~(CCF_HYSTART_ALLOWED|CCF_HYSTART_CAN_SH_CWND|CCF_HYSTART_CONS_SSTH); 23576 } 23577 } 23578 break; 23579 case 
TCP_RACK_REORD_THRESH: 23580 /* RACK reorder threshold (shift amount) */ 23581 RACK_OPTS_INC(tcp_rack_reord_thresh); 23582 if ((optval > 0) && (optval < 31)) 23583 rack->r_ctl.rc_reorder_shift = optval; 23584 else 23585 error = EINVAL; 23586 break; 23587 case TCP_RACK_REORD_FADE: 23588 /* Does reordering fade after ms time */ 23589 RACK_OPTS_INC(tcp_rack_reord_fade); 23590 rack->r_ctl.rc_reorder_fade = optval; 23591 break; 23592 case TCP_RACK_TLP_THRESH: 23593 /* RACK TLP theshold i.e. srtt+(srtt/N) */ 23594 RACK_OPTS_INC(tcp_rack_tlp_thresh); 23595 if (optval) 23596 rack->r_ctl.rc_tlp_threshold = optval; 23597 else 23598 error = EINVAL; 23599 break; 23600 case TCP_BBR_USE_RACK_RR: 23601 RACK_OPTS_INC(tcp_rack_rr); 23602 if (optval) 23603 rack->use_rack_rr = 1; 23604 else 23605 rack->use_rack_rr = 0; 23606 break; 23607 case TCP_RACK_PKT_DELAY: 23608 /* RACK added ms i.e. rack-rtt + reord + N */ 23609 RACK_OPTS_INC(tcp_rack_pkt_delay); 23610 rack->r_ctl.rc_pkt_delay = optval; 23611 break; 23612 case TCP_DELACK: 23613 RACK_OPTS_INC(tcp_rack_delayed_ack); 23614 if (optval == 0) 23615 tp->t_delayed_ack = 0; 23616 else 23617 tp->t_delayed_ack = 1; 23618 if (tp->t_flags & TF_DELACK) { 23619 tp->t_flags &= ~TF_DELACK; 23620 tp->t_flags |= TF_ACKNOW; 23621 NET_EPOCH_ENTER(et); 23622 rack_output(tp); 23623 NET_EPOCH_EXIT(et); 23624 } 23625 break; 23626 23627 case TCP_BBR_RACK_RTT_USE: 23628 RACK_OPTS_INC(tcp_rack_rtt_use); 23629 if ((optval != USE_RTT_HIGH) && 23630 (optval != USE_RTT_LOW) && 23631 (optval != USE_RTT_AVG)) 23632 error = EINVAL; 23633 else 23634 rack->r_ctl.rc_rate_sample_method = optval; 23635 break; 23636 case TCP_HONOR_HPTS_MIN: 23637 RACK_OPTS_INC(tcp_honor_hpts); 23638 if (optval) { 23639 rack->r_use_hpts_min = 1; 23640 /* 23641 * Must be between 2 - 80% to be a reduction else 23642 * we keep the default (10%). 23643 */ 23644 if ((optval > 1) && (optval <= 80)) { 23645 rack->r_ctl.max_reduction = optval; 23646 } 23647 } else 23648 rack->r_use_hpts_min = 0; 23649 break; 23650 case TCP_REC_IS_DYN: /* URL:dynrec */ 23651 RACK_OPTS_INC(tcp_dyn_rec); 23652 if (optval) 23653 rack->rc_gp_no_rec_chg = 1; 23654 else 23655 rack->rc_gp_no_rec_chg = 0; 23656 break; 23657 case TCP_NO_TIMELY: 23658 RACK_OPTS_INC(tcp_notimely); 23659 if (optval) { 23660 rack->rc_skip_timely = 1; 23661 rack->r_ctl.rack_per_of_gp_rec = 90; 23662 rack->r_ctl.rack_per_of_gp_ca = 100; 23663 rack->r_ctl.rack_per_of_gp_ss = 250; 23664 } else { 23665 rack->rc_skip_timely = 0; 23666 } 23667 break; 23668 case TCP_GP_USE_LTBW: 23669 if (optval == 0) { 23670 rack->use_lesser_lt_bw = 0; 23671 rack->dis_lt_bw = 1; 23672 } else if (optval == 1) { 23673 rack->use_lesser_lt_bw = 1; 23674 rack->dis_lt_bw = 0; 23675 } else if (optval == 2) { 23676 rack->use_lesser_lt_bw = 0; 23677 rack->dis_lt_bw = 0; 23678 } 23679 break; 23680 case TCP_DATA_AFTER_CLOSE: 23681 RACK_OPTS_INC(tcp_data_after_close); 23682 if (optval) 23683 rack->rc_allow_data_af_clo = 1; 23684 else 23685 rack->rc_allow_data_af_clo = 0; 23686 break; 23687 default: 23688 break; 23689 } 23690 tcp_log_socket_option(tp, sopt_name, optval, error); 23691 return (error); 23692 } 23693 23694 static void 23695 rack_inherit(struct tcpcb *tp, struct inpcb *parent) 23696 { 23697 /* 23698 * A new connection has been created (tp) and 23699 * the parent is the inpcb given. 
We want to 23700 * apply a read-lock to the parent (we are already 23701 * holding a write lock on the tp) and copy anything 23702 * out of the rack specific data as long as its tfb is 23703 * the same as ours i.e. we are the same stack. Otherwise 23704 * we just return. 23705 */ 23706 struct tcpcb *par; 23707 struct tcp_rack *dest, *src; 23708 int cnt = 0; 23709 23710 par = intotcpcb(parent); 23711 if (par->t_fb != tp->t_fb) { 23712 /* Not the same stack */ 23713 tcp_log_socket_option(tp, 0, 0, 1); 23714 return; 23715 } 23716 /* Ok if we reach here lets setup the two rack pointers */ 23717 dest = (struct tcp_rack *)tp->t_fb_ptr; 23718 src = (struct tcp_rack *)par->t_fb_ptr; 23719 if ((src == NULL) || (dest == NULL)) { 23720 /* Huh? */ 23721 tcp_log_socket_option(tp, 0, 0, 2); 23722 return; 23723 } 23724 /* Now copy out anything we wish to inherit i.e. things in socket-options */ 23725 /* TCP_RACK_PROFILE we can't know but we can set DGP if its on */ 23726 if ((src->dgp_on) && (dest->dgp_on == 0)) { 23727 /* Profile 1 had to be set via sock opt */ 23728 rack_set_dgp(dest); 23729 cnt++; 23730 } 23731 /* TCP_RACK_SET_RXT_OPTIONS */ 23732 if (dest->full_size_rxt != src->full_size_rxt) { 23733 dest->full_size_rxt = src->full_size_rxt; 23734 cnt++; 23735 } 23736 if (dest->shape_rxt_to_pacing_min != src->shape_rxt_to_pacing_min) { 23737 dest->shape_rxt_to_pacing_min = src->shape_rxt_to_pacing_min; 23738 cnt++; 23739 } 23740 /* TCP_RACK_DSACK_OPT */ 23741 if (dest->rc_rack_tmr_std_based != src->rc_rack_tmr_std_based) { 23742 dest->rc_rack_tmr_std_based = src->rc_rack_tmr_std_based; 23743 cnt++; 23744 } 23745 if (dest->rc_rack_use_dsack != src->rc_rack_use_dsack) { 23746 dest->rc_rack_use_dsack = src->rc_rack_use_dsack; 23747 cnt++; 23748 } 23749 /* TCP_RACK_PACING_DIVISOR */ 23750 if (dest->r_ctl.pace_len_divisor != src->r_ctl.pace_len_divisor) { 23751 dest->r_ctl.pace_len_divisor = src->r_ctl.pace_len_divisor; 23752 cnt++; 23753 } 23754 /* TCP_RACK_HI_BETA */ 23755 if (src->rack_hibeta != dest->rack_hibeta) { 23756 cnt++; 23757 if (src->rack_hibeta) { 23758 dest->r_ctl.rc_saved_beta = src->r_ctl.rc_saved_beta; 23759 dest->rack_hibeta = 1; 23760 } else { 23761 dest->rack_hibeta = 0; 23762 } 23763 } 23764 /* TCP_RACK_TIMER_SLOP */ 23765 if (dest->r_ctl.timer_slop != src->r_ctl.timer_slop) { 23766 dest->r_ctl.timer_slop = src->r_ctl.timer_slop; 23767 cnt++; 23768 } 23769 /* TCP_RACK_PACING_BETA_ECN */ 23770 if (dest->r_ctl.rc_saved_beta_ecn != src->r_ctl.rc_saved_beta_ecn) { 23771 dest->r_ctl.rc_saved_beta_ecn = src->r_ctl.rc_saved_beta_ecn; 23772 cnt++; 23773 } 23774 /* We do not do TCP_DEFER_OPTIONS */ 23775 /* TCP_RACK_MEASURE_CNT */ 23776 if (dest->r_ctl.req_measurements != src->r_ctl.req_measurements) { 23777 dest->r_ctl.req_measurements = src->r_ctl.req_measurements; 23778 cnt++; 23779 } 23780 /* TCP_HDWR_UP_ONLY */ 23781 if (dest->r_up_only != src->r_up_only) { 23782 dest->r_up_only = src->r_up_only; 23783 cnt++; 23784 } 23785 /* TCP_FILLCW_RATE_CAP */ 23786 if (dest->r_ctl.fillcw_cap != src->r_ctl.fillcw_cap) { 23787 dest->r_ctl.fillcw_cap = src->r_ctl.fillcw_cap; 23788 cnt++; 23789 } 23790 /* TCP_PACING_RATE_CAP */ 23791 if (dest->r_ctl.bw_rate_cap != src->r_ctl.bw_rate_cap) { 23792 dest->r_ctl.bw_rate_cap = src->r_ctl.bw_rate_cap; 23793 cnt++; 23794 } 23795 /* A listener can't set TCP_HYBRID_PACING */ 23796 /* TCP_SIDECHAN_DIS */ 23797 if (dest->r_ctl.side_chan_dis_mask != src->r_ctl.side_chan_dis_mask) { 23798 dest->r_ctl.side_chan_dis_mask = src->r_ctl.side_chan_dis_mask; 23799 cnt++; 
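/*
 * Editor's note (illustrative addition): rack_inherit() is reached through
 * the tfb_inherit hook when a connection is spawned from a listener whose
 * tcpcb already runs this stack, so options applied to the listening socket
 * carry over through the copy-if-different blocks in this function.  A
 * minimal userland sketch, assuming the stack is registered under the name
 * "rack" (see rack_stack_names below) and using the TCP_FUNCTION_BLK
 * interface from <netinet/tcp.h>:
 *
 *	struct tcp_function_set tfs;
 *	int one = 1;
 *
 *	memset(&tfs, 0, sizeof(tfs));
 *	strlcpy(tfs.function_set_name, "rack", sizeof(tfs.function_set_name));
 *	setsockopt(lfd, IPPROTO_TCP, TCP_FUNCTION_BLK, &tfs, sizeof(tfs));
 *	setsockopt(lfd, IPPROTO_TCP, TCP_RACK_HI_BETA, &one, sizeof(one));
 *	listen(lfd, 128);
 *	// connections returned by accept() pick up rack_hibeta (and the
 *	// saved beta) through the TCP_RACK_HI_BETA block above.
 */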
23800 } 23801 /* TCP_SHARED_CWND_TIME_LIMIT */ 23802 if (dest->r_limit_scw != src->r_limit_scw) { 23803 dest->r_limit_scw = src->r_limit_scw; 23804 cnt++; 23805 } 23806 /* TCP_RACK_PACE_TO_FILL */ 23807 if (dest->rc_pace_to_cwnd != src->rc_pace_to_cwnd) { 23808 dest->rc_pace_to_cwnd = src->rc_pace_to_cwnd; 23809 cnt++; 23810 } 23811 if (dest->rc_pace_fill_if_rttin_range != src->rc_pace_fill_if_rttin_range) { 23812 dest->rc_pace_fill_if_rttin_range = src->rc_pace_fill_if_rttin_range; 23813 cnt++; 23814 } 23815 if (dest->rtt_limit_mul != src->rtt_limit_mul) { 23816 dest->rtt_limit_mul = src->rtt_limit_mul; 23817 cnt++; 23818 } 23819 /* TCP_RACK_NO_PUSH_AT_MAX */ 23820 if (dest->r_ctl.rc_no_push_at_mrtt != src->r_ctl.rc_no_push_at_mrtt) { 23821 dest->r_ctl.rc_no_push_at_mrtt = src->r_ctl.rc_no_push_at_mrtt; 23822 cnt++; 23823 } 23824 /* TCP_SHARED_CWND_ENABLE */ 23825 if (dest->rack_enable_scwnd != src->rack_enable_scwnd) { 23826 dest->rack_enable_scwnd = src->rack_enable_scwnd; 23827 cnt++; 23828 } 23829 /* TCP_USE_CMP_ACKS */ 23830 if (dest->r_use_cmp_ack != src->r_use_cmp_ack) { 23831 dest->r_use_cmp_ack = src->r_use_cmp_ack; 23832 cnt++; 23833 } 23834 23835 if (dest->r_mbuf_queue != src->r_mbuf_queue) { 23836 dest->r_mbuf_queue = src->r_mbuf_queue; 23837 cnt++; 23838 } 23839 /* TCP_RACK_MBUF_QUEUE */ 23840 if (dest->r_mbuf_queue != src->r_mbuf_queue) { 23841 dest->r_mbuf_queue = src->r_mbuf_queue; 23842 cnt++; 23843 } 23844 if (dest->r_mbuf_queue || dest->rc_always_pace || dest->r_use_cmp_ack) { 23845 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 23846 } else { 23847 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ; 23848 } 23849 if (dest->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) { 23850 tp->t_flags2 |= TF2_MBUF_ACKCMP; 23851 } 23852 /* TCP_RACK_NONRXT_CFG_RATE */ 23853 if (dest->rack_rec_nonrxt_use_cr != src->rack_rec_nonrxt_use_cr) { 23854 dest->rack_rec_nonrxt_use_cr = src->rack_rec_nonrxt_use_cr; 23855 cnt++; 23856 } 23857 /* TCP_NO_PRR */ 23858 if (dest->rack_no_prr != src->rack_no_prr) { 23859 dest->rack_no_prr = src->rack_no_prr; 23860 cnt++; 23861 } 23862 if (dest->no_prr_addback != src->no_prr_addback) { 23863 dest->no_prr_addback = src->no_prr_addback; 23864 cnt++; 23865 } 23866 /* RACK_CSPR_IS_FCC */ 23867 if (dest->cspr_is_fcc != src->cspr_is_fcc) { 23868 dest->cspr_is_fcc = src->cspr_is_fcc; 23869 cnt++; 23870 } 23871 /* TCP_TIMELY_DYN_ADJ */ 23872 if (dest->rc_gp_dyn_mul != src->rc_gp_dyn_mul) { 23873 dest->rc_gp_dyn_mul = src->rc_gp_dyn_mul; 23874 cnt++; 23875 } 23876 if (dest->r_ctl.rack_per_of_gp_ca != src->r_ctl.rack_per_of_gp_ca) { 23877 dest->r_ctl.rack_per_of_gp_ca = src->r_ctl.rack_per_of_gp_ca; 23878 cnt++; 23879 } 23880 /* TCP_RACK_TLP_USE */ 23881 if (dest->rack_tlp_threshold_use != src->rack_tlp_threshold_use) { 23882 dest->rack_tlp_threshold_use = src->rack_tlp_threshold_use; 23883 cnt++; 23884 } 23885 /* we don't allow inheritence of TCP_RACK_PACE_ALWAYS */ 23886 /* TCP_BBR_RACK_INIT_RATE */ 23887 if (dest->r_ctl.init_rate != src->r_ctl.init_rate) { 23888 dest->r_ctl.init_rate = src->r_ctl.init_rate; 23889 cnt++; 23890 } 23891 /* TCP_RACK_FORCE_MSEG */ 23892 if (dest->rc_force_max_seg != src->rc_force_max_seg) { 23893 dest->rc_force_max_seg = src->rc_force_max_seg; 23894 cnt++; 23895 } 23896 /* TCP_RACK_PACE_MIN_SEG */ 23897 if (dest->r_ctl.rc_user_set_min_segs != src->r_ctl.rc_user_set_min_segs) { 23898 dest->r_ctl.rc_user_set_min_segs = src->r_ctl.rc_user_set_min_segs; 23899 cnt++; 23900 } 23901 /* we don't allow TCP_RACK_PACE_MAX_SEG */ 23902 /* TCP_RACK_PACE_RATE_REC, 
TCP_RACK_PACE_RATE_SS, TCP_RACK_PACE_RATE_CA */ 23903 if (dest->r_ctl.rc_fixed_pacing_rate_ca != src->r_ctl.rc_fixed_pacing_rate_ca) { 23904 dest->r_ctl.rc_fixed_pacing_rate_ca = src->r_ctl.rc_fixed_pacing_rate_ca; 23905 cnt++; 23906 } 23907 if (dest->r_ctl.rc_fixed_pacing_rate_ss != src->r_ctl.rc_fixed_pacing_rate_ss) { 23908 dest->r_ctl.rc_fixed_pacing_rate_ss = src->r_ctl.rc_fixed_pacing_rate_ss; 23909 cnt++; 23910 } 23911 if (dest->r_ctl.rc_fixed_pacing_rate_rec != src->r_ctl.rc_fixed_pacing_rate_rec) { 23912 dest->r_ctl.rc_fixed_pacing_rate_rec = src->r_ctl.rc_fixed_pacing_rate_rec; 23913 cnt++; 23914 } 23915 /* TCP_RACK_GP_INCREASE_REC, TCP_RACK_GP_INCREASE_CA, TCP_RACK_GP_INCREASE_SS */ 23916 if (dest->r_ctl.rack_per_of_gp_rec != src->r_ctl.rack_per_of_gp_rec) { 23917 dest->r_ctl.rack_per_of_gp_rec = src->r_ctl.rack_per_of_gp_rec; 23918 cnt++; 23919 } 23920 if (dest->r_ctl.rack_per_of_gp_ca != src->r_ctl.rack_per_of_gp_ca) { 23921 dest->r_ctl.rack_per_of_gp_ca = src->r_ctl.rack_per_of_gp_ca; 23922 cnt++; 23923 } 23924 23925 if (dest->r_ctl.rack_per_of_gp_ss != src->r_ctl.rack_per_of_gp_ss) { 23926 dest->r_ctl.rack_per_of_gp_ss = src->r_ctl.rack_per_of_gp_ss; 23927 cnt++; 23928 } 23929 /* TCP_RACK_RR_CONF */ 23930 if (dest->r_rr_config != src->r_rr_config) { 23931 dest->r_rr_config = src->r_rr_config; 23932 cnt++; 23933 } 23934 /* TCP_PACING_DND */ 23935 if (dest->rc_pace_dnd != src->rc_pace_dnd) { 23936 dest->rc_pace_dnd = src->rc_pace_dnd; 23937 cnt++; 23938 } 23939 /* TCP_HDWR_RATE_CAP */ 23940 if (dest->r_rack_hw_rate_caps != src->r_rack_hw_rate_caps) { 23941 dest->r_rack_hw_rate_caps = src->r_rack_hw_rate_caps; 23942 cnt++; 23943 } 23944 /* TCP_DGP_UPPER_BOUNDS */ 23945 if (dest->r_ctl.rack_per_upper_bound_ca != src->r_ctl.rack_per_upper_bound_ca) { 23946 dest->r_ctl.rack_per_upper_bound_ca = src->r_ctl.rack_per_upper_bound_ca; 23947 cnt++; 23948 } 23949 if (dest->r_ctl.rack_per_upper_bound_ss != src->r_ctl.rack_per_upper_bound_ss) { 23950 dest->r_ctl.rack_per_upper_bound_ss = src->r_ctl.rack_per_upper_bound_ss; 23951 cnt++; 23952 } 23953 /* TCP_SS_EEXIT */ 23954 if (dest->r_ctl.gp_rnd_thresh != src->r_ctl.gp_rnd_thresh) { 23955 dest->r_ctl.gp_rnd_thresh = src->r_ctl.gp_rnd_thresh; 23956 cnt++; 23957 } 23958 if (dest->r_ctl.gate_to_fs != src->r_ctl.gate_to_fs) { 23959 dest->r_ctl.gate_to_fs = src->r_ctl.gate_to_fs; 23960 cnt++; 23961 } 23962 if (dest->r_ctl.use_gp_not_last != src->r_ctl.use_gp_not_last) { 23963 dest->r_ctl.use_gp_not_last = src->r_ctl.use_gp_not_last; 23964 cnt++; 23965 } 23966 if (dest->r_ctl.gp_gain_req != src->r_ctl.gp_gain_req) { 23967 dest->r_ctl.gp_gain_req = src->r_ctl.gp_gain_req; 23968 cnt++; 23969 } 23970 /* TCP_BBR_HDWR_PACE */ 23971 if (dest->rack_hdw_pace_ena != src->rack_hdw_pace_ena) { 23972 dest->rack_hdw_pace_ena = src->rack_hdw_pace_ena; 23973 cnt++; 23974 } 23975 if (dest->rack_attempt_hdwr_pace != src->rack_attempt_hdwr_pace) { 23976 dest->rack_attempt_hdwr_pace = src->rack_attempt_hdwr_pace; 23977 cnt++; 23978 } 23979 /* TCP_RACK_PRR_SENDALOT */ 23980 if (dest->r_ctl.rc_prr_sendalot != src->r_ctl.rc_prr_sendalot) { 23981 dest->r_ctl.rc_prr_sendalot = src->r_ctl.rc_prr_sendalot; 23982 cnt++; 23983 } 23984 /* TCP_RACK_MIN_TO */ 23985 if (dest->r_ctl.rc_min_to != src->r_ctl.rc_min_to) { 23986 dest->r_ctl.rc_min_to = src->r_ctl.rc_min_to; 23987 cnt++; 23988 } 23989 /* TCP_RACK_EARLY_SEG */ 23990 if (dest->r_ctl.rc_early_recovery_segs != src->r_ctl.rc_early_recovery_segs) { 23991 dest->r_ctl.rc_early_recovery_segs = 
src->r_ctl.rc_early_recovery_segs; 23992 cnt++; 23993 } 23994 /* TCP_RACK_ENABLE_HYSTART */ 23995 if (par->t_ccv.flags != tp->t_ccv.flags) { 23996 cnt++; 23997 if (par->t_ccv.flags & CCF_HYSTART_ALLOWED) { 23998 tp->t_ccv.flags |= CCF_HYSTART_ALLOWED; 23999 if (rack_do_hystart > RACK_HYSTART_ON) 24000 tp->t_ccv.flags |= CCF_HYSTART_CAN_SH_CWND; 24001 if (rack_do_hystart > RACK_HYSTART_ON_W_SC) 24002 tp->t_ccv.flags |= CCF_HYSTART_CONS_SSTH; 24003 } else { 24004 tp->t_ccv.flags &= ~(CCF_HYSTART_ALLOWED|CCF_HYSTART_CAN_SH_CWND|CCF_HYSTART_CONS_SSTH); 24005 } 24006 } 24007 /* TCP_RACK_REORD_THRESH */ 24008 if (dest->r_ctl.rc_reorder_shift != src->r_ctl.rc_reorder_shift) { 24009 dest->r_ctl.rc_reorder_shift = src->r_ctl.rc_reorder_shift; 24010 cnt++; 24011 } 24012 /* TCP_RACK_REORD_FADE */ 24013 if (dest->r_ctl.rc_reorder_fade != src->r_ctl.rc_reorder_fade) { 24014 dest->r_ctl.rc_reorder_fade = src->r_ctl.rc_reorder_fade; 24015 cnt++; 24016 } 24017 /* TCP_RACK_TLP_THRESH */ 24018 if (dest->r_ctl.rc_tlp_threshold != src->r_ctl.rc_tlp_threshold) { 24019 dest->r_ctl.rc_tlp_threshold = src->r_ctl.rc_tlp_threshold; 24020 cnt++; 24021 } 24022 /* TCP_BBR_USE_RACK_RR */ 24023 if (dest->use_rack_rr != src->use_rack_rr) { 24024 dest->use_rack_rr = src->use_rack_rr; 24025 cnt++; 24026 } 24027 /* TCP_RACK_PKT_DELAY */ 24028 if (dest->r_ctl.rc_pkt_delay != src->r_ctl.rc_pkt_delay) { 24029 dest->r_ctl.rc_pkt_delay = src->r_ctl.rc_pkt_delay; 24030 cnt++; 24031 } 24032 /* TCP_DELACK will get copied via the main code if applicable */ 24033 /* TCP_BBR_RACK_RTT_USE */ 24034 if (dest->r_ctl.rc_rate_sample_method != src->r_ctl.rc_rate_sample_method) { 24035 dest->r_ctl.rc_rate_sample_method = src->r_ctl.rc_rate_sample_method; 24036 cnt++; 24037 } 24038 /* TCP_HONOR_HPTS_MIN */ 24039 if (dest->r_use_hpts_min != src->r_use_hpts_min) { 24040 dest->r_use_hpts_min = src->r_use_hpts_min; 24041 cnt++; 24042 } 24043 if (dest->r_ctl.max_reduction != src->r_ctl.max_reduction) { 24044 dest->r_ctl.max_reduction = src->r_ctl.max_reduction; 24045 cnt++; 24046 } 24047 /* TCP_REC_IS_DYN */ 24048 if (dest->rc_gp_no_rec_chg != src->rc_gp_no_rec_chg) { 24049 dest->rc_gp_no_rec_chg = src->rc_gp_no_rec_chg; 24050 cnt++; 24051 } 24052 if (dest->rc_skip_timely != src->rc_skip_timely) { 24053 dest->rc_skip_timely = src->rc_skip_timely; 24054 cnt++; 24055 } 24056 /* TCP_DATA_AFTER_CLOSE */ 24057 if (dest->rc_allow_data_af_clo != src->rc_allow_data_af_clo) { 24058 dest->rc_allow_data_af_clo = src->rc_allow_data_af_clo; 24059 cnt++; 24060 } 24061 /* TCP_GP_USE_LTBW */ 24062 if (src->use_lesser_lt_bw != dest->use_lesser_lt_bw) { 24063 dest->use_lesser_lt_bw = src->use_lesser_lt_bw; 24064 cnt++; 24065 } 24066 if (dest->dis_lt_bw != src->dis_lt_bw) { 24067 dest->dis_lt_bw = src->dis_lt_bw; 24068 cnt++; 24069 } 24070 tcp_log_socket_option(tp, 0, cnt, 0); 24071 } 24072 24073 24074 static void 24075 rack_apply_deferred_options(struct tcp_rack *rack) 24076 { 24077 struct deferred_opt_list *dol, *sdol; 24078 uint32_t s_optval; 24079 24080 TAILQ_FOREACH_SAFE(dol, &rack->r_ctl.opt_list, next, sdol) { 24081 TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next); 24082 /* Disadvantage of deferal is you loose the error return */ 24083 s_optval = (uint32_t)dol->optval; 24084 (void)rack_process_option(rack->rc_tp, rack, dol->optname, s_optval, dol->optval, NULL); 24085 free(dol, M_TCPDO); 24086 } 24087 } 24088 24089 static void 24090 rack_hw_tls_change(struct tcpcb *tp, int chg) 24091 { 24092 /* Update HW tls state */ 24093 struct tcp_rack *rack; 24094 24095 rack 
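/*
 * Editor's note (illustrative addition): rack_apply_deferred_options(),
 * just above, drains the list built by rack_add_deferred_option() when
 * TCP_DEFER_OPTIONS is in effect and the connection is not yet gp_ready;
 * each entry is replayed through rack_process_option() with the return
 * value dropped, so a deferred option that turns out to be invalid fails
 * silently.  The shape of the pattern, sketched with hypothetical names
 * (the real struct deferred_opt_list is declared elsewhere in this file):
 *
 *	struct def_opt {
 *		TAILQ_ENTRY(def_opt) next;
 *		int optname;
 *		uint64_t optval;
 *	};
 *	TAILQ_HEAD(, def_opt) opt_list = TAILQ_HEAD_INITIALIZER(opt_list);
 *
 *	// defer: remember the option instead of applying it now
 *	struct def_opt *d = malloc(sizeof(*d), M_TCPDO, M_NOWAIT);
 *	if (d != NULL) {
 *		d->optname = optname;
 *		d->optval = optval;
 *		TAILQ_INSERT_TAIL(&opt_list, d, next);
 *	}
 *	// later, drain exactly as the loop above does, ignoring errors.
 */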
= (struct tcp_rack *)tp->t_fb_ptr; 24096 if (chg) 24097 rack->r_ctl.fsb.hw_tls = 1; 24098 else 24099 rack->r_ctl.fsb.hw_tls = 0; 24100 } 24101 24102 static int 24103 rack_pru_options(struct tcpcb *tp, int flags) 24104 { 24105 if (flags & PRUS_OOB) 24106 return (EOPNOTSUPP); 24107 return (0); 24108 } 24109 24110 static bool 24111 rack_wake_check(struct tcpcb *tp) 24112 { 24113 struct tcp_rack *rack; 24114 struct timeval tv; 24115 uint32_t cts; 24116 24117 rack = (struct tcp_rack *)tp->t_fb_ptr; 24118 if (rack->r_ctl.rc_hpts_flags) { 24119 cts = tcp_get_usecs(&tv); 24120 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == PACE_PKT_OUTPUT){ 24121 /* 24122 * Pacing timer is up, check if we are ready. 24123 */ 24124 if (TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) 24125 return (true); 24126 } else if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) != 0) { 24127 /* 24128 * A timer is up, check if we are ready. 24129 */ 24130 if (TSTMP_GEQ(cts, rack->r_ctl.rc_timer_exp)) 24131 return (true); 24132 } 24133 } 24134 return (false); 24135 } 24136 24137 static struct tcp_function_block __tcp_rack = { 24138 .tfb_tcp_block_name = __XSTRING(STACKNAME), 24139 .tfb_tcp_output = rack_output, 24140 .tfb_do_queued_segments = ctf_do_queued_segments, 24141 .tfb_do_segment_nounlock = rack_do_segment_nounlock, 24142 .tfb_tcp_do_segment = rack_do_segment, 24143 .tfb_tcp_ctloutput = rack_ctloutput, 24144 .tfb_tcp_fb_init = rack_init, 24145 .tfb_tcp_fb_fini = rack_fini, 24146 .tfb_tcp_timer_stop_all = rack_stopall, 24147 .tfb_tcp_rexmit_tmr = rack_remxt_tmr, 24148 .tfb_tcp_handoff_ok = rack_handoff_ok, 24149 .tfb_tcp_mtu_chg = rack_mtu_change, 24150 .tfb_pru_options = rack_pru_options, 24151 .tfb_hwtls_change = rack_hw_tls_change, 24152 .tfb_chg_query = rack_chg_query, 24153 .tfb_switch_failed = rack_switch_failed, 24154 .tfb_early_wake_check = rack_wake_check, 24155 .tfb_compute_pipe = rack_compute_pipe, 24156 .tfb_stack_info = rack_stack_information, 24157 .tfb_inherit = rack_inherit, 24158 .tfb_flags = TCP_FUNC_OUTPUT_CANDROP | TCP_FUNC_DEFAULT_OK, 24159 24160 }; 24161 24162 /* 24163 * rack_ctloutput() must drop the inpcb lock before performing copyin on 24164 * socket option arguments. When it re-acquires the lock after the copy, it 24165 * has to revalidate that the connection is still valid for the socket 24166 * option. 24167 */ 24168 static int 24169 rack_set_sockopt(struct tcpcb *tp, struct sockopt *sopt) 24170 { 24171 struct inpcb *inp = tptoinpcb(tp); 24172 #ifdef INET 24173 struct ip *ip; 24174 #endif 24175 struct tcp_rack *rack; 24176 struct tcp_hybrid_req hybrid; 24177 uint64_t loptval; 24178 int32_t error = 0, optval; 24179 24180 rack = (struct tcp_rack *)tp->t_fb_ptr; 24181 if (rack == NULL) { 24182 INP_WUNLOCK(inp); 24183 return (EINVAL); 24184 } 24185 #ifdef INET 24186 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 24187 #endif 24188 24189 switch (sopt->sopt_level) { 24190 #ifdef INET6 24191 case IPPROTO_IPV6: 24192 MPASS(inp->inp_vflag & INP_IPV6PROTO); 24193 switch (sopt->sopt_name) { 24194 case IPV6_USE_MIN_MTU: 24195 tcp6_use_min_mtu(tp); 24196 break; 24197 } 24198 INP_WUNLOCK(inp); 24199 return (0); 24200 #endif 24201 #ifdef INET 24202 case IPPROTO_IP: 24203 switch (sopt->sopt_name) { 24204 case IP_TOS: 24205 /* 24206 * The DSCP codepoint has changed, update the fsb. 24207 */ 24208 ip->ip_tos = rack->rc_inp->inp_ip_tos; 24209 break; 24210 case IP_TTL: 24211 /* 24212 * The TTL has changed, update the fsb. 
24213 */ 24214 ip->ip_ttl = rack->rc_inp->inp_ip_ttl; 24215 break; 24216 } 24217 INP_WUNLOCK(inp); 24218 return (0); 24219 #endif 24220 #ifdef SO_PEERPRIO 24221 case SOL_SOCKET: 24222 switch (sopt->sopt_name) { 24223 case SO_PEERPRIO: /* SC-URL:bs */ 24224 /* Already read in and sanity checked in sosetopt(). */ 24225 if (inp->inp_socket) { 24226 rack->client_bufferlvl = inp->inp_socket->so_peerprio; 24227 } 24228 break; 24229 } 24230 INP_WUNLOCK(inp); 24231 return (0); 24232 #endif 24233 case IPPROTO_TCP: 24234 switch (sopt->sopt_name) { 24235 case TCP_RACK_TLP_REDUCE: /* URL:tlp_reduce */ 24236 /* Pacing related ones */ 24237 case TCP_RACK_PACE_ALWAYS: /* URL:pace_always */ 24238 case TCP_BBR_RACK_INIT_RATE: /* URL:irate */ 24239 case TCP_RACK_PACE_MIN_SEG: /* URL:pace_min_seg */ 24240 case TCP_RACK_PACE_MAX_SEG: /* URL:pace_max_seg */ 24241 case TCP_RACK_FORCE_MSEG: /* URL:force_max_seg */ 24242 case TCP_RACK_PACE_RATE_CA: /* URL:pr_ca */ 24243 case TCP_RACK_PACE_RATE_SS: /* URL:pr_ss*/ 24244 case TCP_RACK_PACE_RATE_REC: /* URL:pr_rec */ 24245 case TCP_RACK_GP_INCREASE_CA: /* URL:gp_inc_ca */ 24246 case TCP_RACK_GP_INCREASE_SS: /* URL:gp_inc_ss */ 24247 case TCP_RACK_GP_INCREASE_REC: /* URL:gp_inc_rec */ 24248 case TCP_RACK_RR_CONF: /* URL:rrr_conf */ 24249 case TCP_BBR_HDWR_PACE: /* URL:hdwrpace */ 24250 case TCP_HDWR_RATE_CAP: /* URL:hdwrcap boolean */ 24251 case TCP_PACING_RATE_CAP: /* URL:cap -- used by side-channel */ 24252 case TCP_HDWR_UP_ONLY: /* URL:uponly -- hardware pacing boolean */ 24253 case TCP_FILLCW_RATE_CAP: /* URL:fillcw_cap */ 24254 case TCP_RACK_PACING_BETA_ECN: /* URL:pacing_beta_ecn */ 24255 case TCP_RACK_PACE_TO_FILL: /* URL:fillcw */ 24256 /* End pacing related */ 24257 case TCP_DELACK: /* URL:delack (in base TCP i.e. tcp_hints along with cc etc ) */ 24258 case TCP_RACK_PRR_SENDALOT: /* URL:prr_sendalot */ 24259 case TCP_RACK_MIN_TO: /* URL:min_to */ 24260 case TCP_RACK_EARLY_SEG: /* URL:early_seg */ 24261 case TCP_RACK_REORD_THRESH: /* URL:reord_thresh */ 24262 case TCP_RACK_REORD_FADE: /* URL:reord_fade */ 24263 case TCP_RACK_TLP_THRESH: /* URL:tlp_thresh */ 24264 case TCP_RACK_PKT_DELAY: /* URL:pkt_delay */ 24265 case TCP_RACK_TLP_USE: /* URL:tlp_use */ 24266 case TCP_BBR_RACK_RTT_USE: /* URL:rttuse */ 24267 case TCP_BBR_USE_RACK_RR: /* URL:rackrr */ 24268 case TCP_NO_PRR: /* URL:noprr */ 24269 case TCP_TIMELY_DYN_ADJ: /* URL:dynamic */ 24270 case TCP_DATA_AFTER_CLOSE: /* no URL */ 24271 case TCP_RACK_NONRXT_CFG_RATE: /* URL:nonrxtcr */ 24272 case TCP_SHARED_CWND_ENABLE: /* URL:scwnd */ 24273 case TCP_RACK_MBUF_QUEUE: /* URL:mqueue */ 24274 case TCP_RACK_NO_PUSH_AT_MAX: /* URL:npush */ 24275 case TCP_SHARED_CWND_TIME_LIMIT: /* URL:lscwnd */ 24276 case TCP_RACK_PROFILE: /* URL:profile */ 24277 case TCP_SIDECHAN_DIS: /* URL:scodm */ 24278 case TCP_HYBRID_PACING: /* URL:pacing=hybrid */ 24279 case TCP_USE_CMP_ACKS: /* URL:cmpack */ 24280 case TCP_RACK_ABC_VAL: /* URL:labc */ 24281 case TCP_REC_ABC_VAL: /* URL:reclabc */ 24282 case TCP_RACK_MEASURE_CNT: /* URL:measurecnt */ 24283 case TCP_DEFER_OPTIONS: /* URL:defer */ 24284 case TCP_RACK_DSACK_OPT: /* URL:dsack */ 24285 case TCP_RACK_TIMER_SLOP: /* URL:timer_slop */ 24286 case TCP_RACK_ENABLE_HYSTART: /* URL:hystart */ 24287 case TCP_RACK_SET_RXT_OPTIONS: /* URL:rxtsz */ 24288 case TCP_RACK_HI_BETA: /* URL:hibeta */ 24289 case TCP_RACK_SPLIT_LIMIT: /* URL:split */ 24290 case TCP_SS_EEXIT: /* URL:eexit */ 24291 case TCP_DGP_UPPER_BOUNDS: /* URL:upper */ 24292 case TCP_RACK_PACING_DIVISOR: /* URL:divisor */ 24293 
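/*
 * Editor's note (illustrative addition): every option in this list funnels
 * to process_opt below, where the copyin size depends on the option: most
 * take a plain int, TCP_PACING_RATE_CAP and TCP_FILLCW_RATE_CAP take a
 * uint64_t, and TCP_HYBRID_PACING takes a struct tcp_hybrid_req.  A
 * hypothetical userland caller (fd must already be attached to this stack,
 * otherwise the revalidation below returns ENOPROTOOPT):
 *
 *	int on = 1;
 *	uint64_t cap = 0;	// placeholder; units follow bw_rate_cap and
 *				// are not asserted here
 *	setsockopt(fd, IPPROTO_TCP, TCP_RACK_PACE_ALWAYS, &on, sizeof(on));
 *	setsockopt(fd, IPPROTO_TCP, TCP_PACING_RATE_CAP, &cap, sizeof(cap));
 */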
case TCP_PACING_DND: /* URL:dnd */ 24294 case TCP_NO_TIMELY: /* URL:notimely */ 24295 case RACK_CSPR_IS_FCC: /* URL:csprisfcc */ 24296 case TCP_HONOR_HPTS_MIN: /* URL:hptsmin */ 24297 case TCP_REC_IS_DYN: /* URL:dynrec */ 24298 case TCP_GP_USE_LTBW: /* URL:useltbw */ 24299 goto process_opt; 24300 break; 24301 default: 24302 /* Filter off all unknown options to the base stack */ 24303 return (tcp_default_ctloutput(tp, sopt)); 24304 break; 24305 } 24306 default: 24307 INP_WUNLOCK(inp); 24308 return (0); 24309 } 24310 process_opt: 24311 INP_WUNLOCK(inp); 24312 if ((sopt->sopt_name == TCP_PACING_RATE_CAP) || 24313 (sopt->sopt_name == TCP_FILLCW_RATE_CAP)) { 24314 error = sooptcopyin(sopt, &loptval, sizeof(loptval), sizeof(loptval)); 24315 /* 24316 * We truncate it down to 32 bits for the socket-option trace this 24317 * means rates > 34Gbps won't show right, but thats probably ok. 24318 */ 24319 optval = (uint32_t)loptval; 24320 } else if (sopt->sopt_name == TCP_HYBRID_PACING) { 24321 error = sooptcopyin(sopt, &hybrid, sizeof(hybrid), sizeof(hybrid)); 24322 } else { 24323 error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); 24324 /* Save it in 64 bit form too */ 24325 loptval = optval; 24326 } 24327 if (error) 24328 return (error); 24329 INP_WLOCK(inp); 24330 if (tp->t_fb != &__tcp_rack) { 24331 INP_WUNLOCK(inp); 24332 return (ENOPROTOOPT); 24333 } 24334 if (rack->defer_options && (rack->gp_ready == 0) && 24335 (sopt->sopt_name != TCP_DEFER_OPTIONS) && 24336 (sopt->sopt_name != TCP_HYBRID_PACING) && 24337 (sopt->sopt_name != TCP_RACK_SET_RXT_OPTIONS) && 24338 (sopt->sopt_name != TCP_RACK_PACING_BETA_ECN) && 24339 (sopt->sopt_name != TCP_RACK_MEASURE_CNT)) { 24340 /* Options are being deferred */ 24341 if (rack_add_deferred_option(rack, sopt->sopt_name, loptval)) { 24342 INP_WUNLOCK(inp); 24343 return (0); 24344 } else { 24345 /* No memory to defer, fail */ 24346 INP_WUNLOCK(inp); 24347 return (ENOMEM); 24348 } 24349 } 24350 error = rack_process_option(tp, rack, sopt->sopt_name, optval, loptval, &hybrid); 24351 INP_WUNLOCK(inp); 24352 return (error); 24353 } 24354 24355 static void 24356 rack_fill_info(struct tcpcb *tp, struct tcp_info *ti) 24357 { 24358 24359 INP_WLOCK_ASSERT(tptoinpcb(tp)); 24360 bzero(ti, sizeof(*ti)); 24361 24362 ti->tcpi_state = tp->t_state; 24363 if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP)) 24364 ti->tcpi_options |= TCPI_OPT_TIMESTAMPS; 24365 if (tp->t_flags & TF_SACK_PERMIT) 24366 ti->tcpi_options |= TCPI_OPT_SACK; 24367 if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) { 24368 ti->tcpi_options |= TCPI_OPT_WSCALE; 24369 ti->tcpi_snd_wscale = tp->snd_scale; 24370 ti->tcpi_rcv_wscale = tp->rcv_scale; 24371 } 24372 if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) 24373 ti->tcpi_options |= TCPI_OPT_ECN; 24374 if (tp->t_flags & TF_FASTOPEN) 24375 ti->tcpi_options |= TCPI_OPT_TFO; 24376 /* still kept in ticks is t_rcvtime */ 24377 ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick; 24378 /* Since we hold everything in precise useconds this is easy */ 24379 ti->tcpi_rtt = tp->t_srtt; 24380 ti->tcpi_rttvar = tp->t_rttvar; 24381 ti->tcpi_rto = tp->t_rxtcur; 24382 ti->tcpi_snd_ssthresh = tp->snd_ssthresh; 24383 ti->tcpi_snd_cwnd = tp->snd_cwnd; 24384 /* 24385 * FreeBSD-specific extension fields for tcp_info. 24386 */ 24387 ti->tcpi_rcv_space = tp->rcv_wnd; 24388 ti->tcpi_rcv_nxt = tp->rcv_nxt; 24389 ti->tcpi_snd_wnd = tp->snd_wnd; 24390 ti->tcpi_snd_bwnd = 0; /* Unused, kept for compat. 
*/ 24391 ti->tcpi_snd_nxt = tp->snd_nxt; 24392 ti->tcpi_snd_mss = tp->t_maxseg; 24393 ti->tcpi_rcv_mss = tp->t_maxseg; 24394 ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack; 24395 ti->tcpi_rcv_ooopack = tp->t_rcvoopack; 24396 ti->tcpi_snd_zerowin = tp->t_sndzerowin; 24397 ti->tcpi_total_tlp = tp->t_sndtlppack; 24398 ti->tcpi_total_tlp_bytes = tp->t_sndtlpbyte; 24399 ti->tcpi_rttmin = tp->t_rttlow; 24400 #ifdef NETFLIX_STATS 24401 memcpy(&ti->tcpi_rxsyninfo, &tp->t_rxsyninfo, sizeof(struct tcpsyninfo)); 24402 #endif 24403 #ifdef TCP_OFFLOAD 24404 if (tp->t_flags & TF_TOE) { 24405 ti->tcpi_options |= TCPI_OPT_TOE; 24406 tcp_offload_tcp_info(tp, ti); 24407 } 24408 #endif 24409 } 24410 24411 static int 24412 rack_get_sockopt(struct tcpcb *tp, struct sockopt *sopt) 24413 { 24414 struct inpcb *inp = tptoinpcb(tp); 24415 struct tcp_rack *rack; 24416 int32_t error, optval; 24417 uint64_t val, loptval; 24418 struct tcp_info ti; 24419 /* 24420 * Because all our options are either boolean or an int, we can just 24421 * pull everything into optval and then unlock and copy. If we ever 24422 * add a option that is not a int, then this will have quite an 24423 * impact to this routine. 24424 */ 24425 error = 0; 24426 rack = (struct tcp_rack *)tp->t_fb_ptr; 24427 if (rack == NULL) { 24428 INP_WUNLOCK(inp); 24429 return (EINVAL); 24430 } 24431 switch (sopt->sopt_name) { 24432 case TCP_INFO: 24433 /* First get the info filled */ 24434 rack_fill_info(tp, &ti); 24435 /* Fix up the rtt related fields if needed */ 24436 INP_WUNLOCK(inp); 24437 error = sooptcopyout(sopt, &ti, sizeof ti); 24438 return (error); 24439 /* 24440 * Beta is the congestion control value for NewReno that influences how 24441 * much of a backoff happens when loss is detected. It is normally set 24442 * to 50 for 50% i.e. the cwnd is reduced to 50% of its previous value 24443 * when you exit recovery. 24444 */ 24445 case TCP_RACK_PACING_BETA: 24446 if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) 24447 error = EINVAL; 24448 else if (rack->rc_pacing_cc_set == 0) 24449 optval = rack->r_ctl.rc_saved_beta; 24450 else { 24451 /* 24452 * Reach out into the CC data and report back what 24453 * I have previously set. Yeah it looks hackish but 24454 * we don't want to report the saved values. 24455 */ 24456 if (tp->t_ccv.cc_data) 24457 optval = ((struct newreno *)tp->t_ccv.cc_data)->beta; 24458 else 24459 error = EINVAL; 24460 } 24461 break; 24462 /* 24463 * Beta_ecn is the congestion control value for NewReno that influences how 24464 * much of a backoff happens when a ECN mark is detected. It is normally set 24465 * to 80 for 80% i.e. the cwnd is reduced by 20% of its previous value when 24466 * you exit recovery. Note that classic ECN has a beta of 50, it is only 24467 * ABE Ecn that uses this "less" value, but we do too with pacing :) 24468 */ 24469 case TCP_RACK_PACING_BETA_ECN: 24470 if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) 24471 error = EINVAL; 24472 else if (rack->rc_pacing_cc_set == 0) 24473 optval = rack->r_ctl.rc_saved_beta_ecn; 24474 else { 24475 /* 24476 * Reach out into the CC data and report back what 24477 * I have previously set. Yeah it looks hackish but 24478 * we don't want to report the saved values. 
24479 */ 24480 if (tp->t_ccv.cc_data) 24481 optval = ((struct newreno *)tp->t_ccv.cc_data)->beta_ecn; 24482 else 24483 error = EINVAL; 24484 } 24485 break; 24486 case TCP_RACK_DSACK_OPT: 24487 optval = 0; 24488 if (rack->rc_rack_tmr_std_based) { 24489 optval |= 1; 24490 } 24491 if (rack->rc_rack_use_dsack) { 24492 optval |= 2; 24493 } 24494 break; 24495 case TCP_RACK_ENABLE_HYSTART: 24496 { 24497 if (tp->t_ccv.flags & CCF_HYSTART_ALLOWED) { 24498 optval = RACK_HYSTART_ON; 24499 if (tp->t_ccv.flags & CCF_HYSTART_CAN_SH_CWND) 24500 optval = RACK_HYSTART_ON_W_SC; 24501 if (tp->t_ccv.flags & CCF_HYSTART_CONS_SSTH) 24502 optval = RACK_HYSTART_ON_W_SC_C; 24503 } else { 24504 optval = RACK_HYSTART_OFF; 24505 } 24506 } 24507 break; 24508 case TCP_RACK_DGP_IN_REC: 24509 error = EINVAL; 24510 break; 24511 case TCP_RACK_HI_BETA: 24512 optval = rack->rack_hibeta; 24513 break; 24514 case TCP_DEFER_OPTIONS: 24515 optval = rack->defer_options; 24516 break; 24517 case TCP_RACK_MEASURE_CNT: 24518 optval = rack->r_ctl.req_measurements; 24519 break; 24520 case TCP_REC_ABC_VAL: 24521 optval = rack->r_use_labc_for_rec; 24522 break; 24523 case TCP_RACK_ABC_VAL: 24524 optval = rack->rc_labc; 24525 break; 24526 case TCP_HDWR_UP_ONLY: 24527 optval= rack->r_up_only; 24528 break; 24529 case TCP_FILLCW_RATE_CAP: 24530 loptval = rack->r_ctl.fillcw_cap; 24531 break; 24532 case TCP_PACING_RATE_CAP: 24533 loptval = rack->r_ctl.bw_rate_cap; 24534 break; 24535 case TCP_RACK_PROFILE: 24536 /* You cannot retrieve a profile, its write only */ 24537 error = EINVAL; 24538 break; 24539 case TCP_SIDECHAN_DIS: 24540 optval = rack->r_ctl.side_chan_dis_mask; 24541 break; 24542 case TCP_HYBRID_PACING: 24543 /* You cannot retrieve hybrid pacing information, its write only */ 24544 error = EINVAL; 24545 break; 24546 case TCP_USE_CMP_ACKS: 24547 optval = rack->r_use_cmp_ack; 24548 break; 24549 case TCP_RACK_PACE_TO_FILL: 24550 optval = rack->rc_pace_to_cwnd; 24551 break; 24552 case TCP_RACK_NO_PUSH_AT_MAX: 24553 optval = rack->r_ctl.rc_no_push_at_mrtt; 24554 break; 24555 case TCP_SHARED_CWND_ENABLE: 24556 optval = rack->rack_enable_scwnd; 24557 break; 24558 case TCP_RACK_NONRXT_CFG_RATE: 24559 optval = rack->rack_rec_nonrxt_use_cr; 24560 break; 24561 case TCP_NO_PRR: 24562 if (rack->rack_no_prr == 1) 24563 optval = 1; 24564 else if (rack->no_prr_addback == 1) 24565 optval = 2; 24566 else 24567 optval = 0; 24568 break; 24569 case TCP_GP_USE_LTBW: 24570 if (rack->dis_lt_bw) { 24571 /* It is not used */ 24572 optval = 0; 24573 } else if (rack->use_lesser_lt_bw) { 24574 /* we use min() */ 24575 optval = 1; 24576 } else { 24577 /* we use max() */ 24578 optval = 2; 24579 } 24580 break; 24581 case TCP_RACK_DO_DETECTION: 24582 error = EINVAL; 24583 break; 24584 case TCP_RACK_MBUF_QUEUE: 24585 /* Now do we use the LRO mbuf-queue feature */ 24586 optval = rack->r_mbuf_queue; 24587 break; 24588 case RACK_CSPR_IS_FCC: 24589 optval = rack->cspr_is_fcc; 24590 break; 24591 case TCP_TIMELY_DYN_ADJ: 24592 optval = rack->rc_gp_dyn_mul; 24593 break; 24594 case TCP_BBR_IWINTSO: 24595 error = EINVAL; 24596 break; 24597 case TCP_RACK_TLP_REDUCE: 24598 /* RACK TLP cwnd reduction (bool) */ 24599 optval = rack->r_ctl.rc_tlp_cwnd_reduce; 24600 break; 24601 case TCP_BBR_RACK_INIT_RATE: 24602 val = rack->r_ctl.init_rate; 24603 /* convert to kbits per sec */ 24604 val *= 8; 24605 val /= 1000; 24606 optval = (uint32_t)val; 24607 break; 24608 case TCP_RACK_FORCE_MSEG: 24609 optval = rack->rc_force_max_seg; 24610 break; 24611 case TCP_RACK_PACE_MIN_SEG: 24612 optval = 
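/*
 * Editor's note (illustrative addition): several of the cases above return
 * packed or enumerated values rather than simple booleans: TCP_RACK_DSACK_OPT
 * sets bit 0 when rc_rack_tmr_std_based is on and bit 1 when
 * rc_rack_use_dsack is on, and TCP_NO_PRR reports 1 when rack_no_prr is set,
 * otherwise 2 when no_prr_addback is set, else 0.  A hypothetical userland
 * read of the DSACK setting:
 *
 *	int v;
 *	socklen_t len = sizeof(v);
 *
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_RACK_DSACK_OPT, &v, &len) == 0) {
 *		int std_timer = (v & 1) != 0;
 *		int use_dsack = (v & 2) != 0;
 *		// act on the two flags
 *	}
 */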
rack->r_ctl.rc_user_set_min_segs; 24613 break; 24614 case TCP_RACK_PACE_MAX_SEG: 24615 /* Max segments in a pace */ 24616 optval = rack->rc_user_set_max_segs; 24617 break; 24618 case TCP_RACK_PACE_ALWAYS: 24619 /* Use the always pace method */ 24620 optval = rack->rc_always_pace; 24621 break; 24622 case TCP_RACK_PRR_SENDALOT: 24623 /* Allow PRR to send more than one seg */ 24624 optval = rack->r_ctl.rc_prr_sendalot; 24625 break; 24626 case TCP_RACK_MIN_TO: 24627 /* Minimum time between rack t-o's in ms */ 24628 optval = rack->r_ctl.rc_min_to; 24629 break; 24630 case TCP_RACK_SPLIT_LIMIT: 24631 optval = rack->r_ctl.rc_split_limit; 24632 break; 24633 case TCP_RACK_EARLY_SEG: 24634 /* If early recovery max segments */ 24635 optval = rack->r_ctl.rc_early_recovery_segs; 24636 break; 24637 case TCP_RACK_REORD_THRESH: 24638 /* RACK reorder threshold (shift amount) */ 24639 optval = rack->r_ctl.rc_reorder_shift; 24640 break; 24641 case TCP_SS_EEXIT: 24642 if (rack->r_ctl.gp_rnd_thresh) { 24643 uint32_t v; 24644 24645 v = rack->r_ctl.gp_gain_req; 24646 v <<= 18; 24647 optval = v | (rack->r_ctl.gp_rnd_thresh & 0xff); 24648 if (rack->r_ctl.gate_to_fs == 1) 24649 optval |= 0x10000; if (rack->r_ctl.use_gp_not_last == 1) optval |= 0x20000; 24650 } else 24651 optval = 0; 24652 break; 24653 case TCP_RACK_REORD_FADE: 24654 /* Does reordering fade after ms time */ 24655 optval = rack->r_ctl.rc_reorder_fade; 24656 break; 24657 case TCP_BBR_USE_RACK_RR: 24658 /* Do we use the rack cheat for rxt */ 24659 optval = rack->use_rack_rr; 24660 break; 24661 case TCP_RACK_RR_CONF: 24662 optval = rack->r_rr_config; 24663 break; 24664 case TCP_HDWR_RATE_CAP: 24665 optval = rack->r_rack_hw_rate_caps; 24666 break; 24667 case TCP_BBR_HDWR_PACE: 24668 optval = rack->rack_hdw_pace_ena; 24669 break; 24670 case TCP_RACK_TLP_THRESH: 24671 /* RACK TLP threshold i.e. srtt+(srtt/N) */ 24672 optval = rack->r_ctl.rc_tlp_threshold; 24673 break; 24674 case TCP_RACK_PKT_DELAY: 24675 /* RACK added ms i.e.
rack-rtt + reord + N */ 24676 optval = rack->r_ctl.rc_pkt_delay; 24677 break; 24678 case TCP_RACK_TLP_USE: 24679 optval = rack->rack_tlp_threshold_use; 24680 break; 24681 case TCP_PACING_DND: 24682 optval = rack->rc_pace_dnd; 24683 break; 24684 case TCP_RACK_PACE_RATE_CA: 24685 optval = rack->r_ctl.rc_fixed_pacing_rate_ca; 24686 break; 24687 case TCP_RACK_PACE_RATE_SS: 24688 optval = rack->r_ctl.rc_fixed_pacing_rate_ss; 24689 break; 24690 case TCP_RACK_PACE_RATE_REC: 24691 optval = rack->r_ctl.rc_fixed_pacing_rate_rec; 24692 break; 24693 case TCP_DGP_UPPER_BOUNDS: 24694 optval = rack->r_ctl.rack_per_upper_bound_ss; 24695 optval <<= 16; 24696 optval |= rack->r_ctl.rack_per_upper_bound_ca; 24697 break; 24698 case TCP_RACK_GP_INCREASE_SS: 24699 optval = rack->r_ctl.rack_per_of_gp_ss; 24700 break; 24701 case TCP_RACK_GP_INCREASE_CA: 24702 optval = rack->r_ctl.rack_per_of_gp_ca; 24703 break; 24704 case TCP_RACK_PACING_DIVISOR: 24705 optval = rack->r_ctl.pace_len_divisor; 24706 break; 24707 case TCP_BBR_RACK_RTT_USE: 24708 optval = rack->r_ctl.rc_rate_sample_method; 24709 break; 24710 case TCP_DELACK: 24711 optval = tp->t_delayed_ack; 24712 break; 24713 case TCP_DATA_AFTER_CLOSE: 24714 optval = rack->rc_allow_data_af_clo; 24715 break; 24716 case TCP_SHARED_CWND_TIME_LIMIT: 24717 optval = rack->r_limit_scw; 24718 break; 24719 case TCP_HONOR_HPTS_MIN: 24720 if (rack->r_use_hpts_min) 24721 optval = rack->r_ctl.max_reduction; 24722 else 24723 optval = 0; 24724 break; 24725 case TCP_REC_IS_DYN: 24726 optval = rack->rc_gp_no_rec_chg; 24727 break; 24728 case TCP_NO_TIMELY: 24729 optval = rack->rc_skip_timely; 24730 break; 24731 case TCP_RACK_TIMER_SLOP: 24732 optval = rack->r_ctl.timer_slop; 24733 break; 24734 default: 24735 return (tcp_default_ctloutput(tp, sopt)); 24736 break; 24737 } 24738 INP_WUNLOCK(inp); 24739 if (error == 0) { 24740 if ((sopt->sopt_name == TCP_PACING_RATE_CAP) || 24741 (sopt->sopt_name == TCP_FILLCW_RATE_CAP)) 24742 error = sooptcopyout(sopt, &loptval, sizeof loptval); 24743 else 24744 error = sooptcopyout(sopt, &optval, sizeof optval); 24745 } 24746 return (error); 24747 } 24748 24749 static int 24750 rack_ctloutput(struct tcpcb *tp, struct sockopt *sopt) 24751 { 24752 if (sopt->sopt_dir == SOPT_SET) { 24753 return (rack_set_sockopt(tp, sopt)); 24754 } else if (sopt->sopt_dir == SOPT_GET) { 24755 return (rack_get_sockopt(tp, sopt)); 24756 } else { 24757 panic("%s: sopt_dir %d", __func__, sopt->sopt_dir); 24758 } 24759 } 24760 24761 static const char *rack_stack_names[] = { 24762 __XSTRING(STACKNAME), 24763 #ifdef STACKALIAS 24764 __XSTRING(STACKALIAS), 24765 #endif 24766 }; 24767 24768 static int 24769 rack_ctor(void *mem, int32_t size, void *arg, int32_t how) 24770 { 24771 memset(mem, 0, size); 24772 return (0); 24773 } 24774 24775 static void 24776 rack_dtor(void *mem, int32_t size, void *arg) 24777 { 24778 24779 } 24780 24781 static bool rack_mod_inited = false; 24782 24783 static int 24784 tcp_addrack(module_t mod, int32_t type, void *data) 24785 { 24786 int32_t err = 0; 24787 int num_stacks; 24788 24789 switch (type) { 24790 case MOD_LOAD: 24791 rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map", 24792 sizeof(struct rack_sendmap), 24793 rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); 24794 24795 rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb", 24796 sizeof(struct tcp_rack), 24797 rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); 24798 24799 sysctl_ctx_init(&rack_sysctl_ctx); 24800 rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 24801
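/*
 * Editor's note (illustrative addition): rack_get_sockopt() above copies a
 * uint64_t back to the caller for TCP_PACING_RATE_CAP and
 * TCP_FILLCW_RATE_CAP and an int for everything else, so the user buffer
 * has to be sized per option, e.g. (hypothetical userland read):
 *
 *	uint64_t cap;
 *	socklen_t len = sizeof(cap);
 *
 *	getsockopt(fd, IPPROTO_TCP, TCP_PACING_RATE_CAP, &cap, &len);
 */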
SYSCTL_STATIC_CHILDREN(_net_inet_tcp), 24802 OID_AUTO, 24803 #ifdef STACKALIAS 24804 __XSTRING(STACKALIAS), 24805 #else 24806 __XSTRING(STACKNAME), 24807 #endif 24808 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 24809 ""); 24810 if (rack_sysctl_root == NULL) { 24811 printf("Failed to add sysctl node\n"); 24812 err = EFAULT; 24813 goto free_uma; 24814 } 24815 rack_init_sysctls(); 24816 num_stacks = nitems(rack_stack_names); 24817 err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK, 24818 rack_stack_names, &num_stacks); 24819 if (err) { 24820 printf("Failed to register %s stack name for " 24821 "%s module\n", rack_stack_names[num_stacks], 24822 __XSTRING(MODNAME)); 24823 sysctl_ctx_free(&rack_sysctl_ctx); 24824 free_uma: 24825 uma_zdestroy(rack_zone); 24826 uma_zdestroy(rack_pcb_zone); 24827 rack_counter_destroy(); 24828 printf("Failed to register rack module -- err:%d\n", err); 24829 return (err); 24830 } 24831 tcp_lro_reg_mbufq(); 24832 rack_mod_inited = true; 24833 break; 24834 case MOD_QUIESCE: 24835 err = deregister_tcp_functions(&__tcp_rack, true, false); 24836 break; 24837 case MOD_UNLOAD: 24838 err = deregister_tcp_functions(&__tcp_rack, false, true); 24839 if (err == EBUSY) 24840 break; 24841 if (rack_mod_inited) { 24842 uma_zdestroy(rack_zone); 24843 uma_zdestroy(rack_pcb_zone); 24844 sysctl_ctx_free(&rack_sysctl_ctx); 24845 rack_counter_destroy(); 24846 rack_mod_inited = false; 24847 } 24848 tcp_lro_dereg_mbufq(); 24849 err = 0; 24850 break; 24851 default: 24852 return (EOPNOTSUPP); 24853 } 24854 return (err); 24855 } 24856 24857 static moduledata_t tcp_rack = { 24858 .name = __XSTRING(MODNAME), 24859 .evhand = tcp_addrack, 24860 .priv = 0 24861 }; 24862 24863 MODULE_VERSION(MODNAME, 1); 24864 DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); 24865 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1); 24866 24867 #endif /* #if !defined(INET) && !defined(INET6) */ 24868
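/*
 * Editor's note (illustrative addition): once MOD_LOAD above has run, the
 * stack is selectable under the names in rack_stack_names[] and its sysctl
 * tree sits under net.inet.tcp.<STACKNAME>.  A process can query which
 * function block a socket is attached to, or switch it to this stack,
 * through the TCP_FUNCTION_BLK socket option; the probe below is a
 * hypothetical userland sketch of that interface.
 *
 *	struct tcp_function_set tfs;
 *	socklen_t len = sizeof(tfs);
 *
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_FUNCTION_BLK, &tfs, &len) == 0)
 *		printf("stack %s (%u pcbs)\n", tfs.function_set_name,
 *		    tfs.pcbcnt);
 */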