1 /*- 2 * Copyright (c) 2016-2020 Netflix, Inc. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 * SUCH DAMAGE. 24 * 25 */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 #include "opt_inet.h" 31 #include "opt_inet6.h" 32 #include "opt_ipsec.h" 33 #include "opt_ratelimit.h" 34 #include "opt_kern_tls.h" 35 #if defined(INET) || defined(INET6) 36 #include <sys/param.h> 37 #include <sys/arb.h> 38 #include <sys/module.h> 39 #include <sys/kernel.h> 40 #ifdef TCP_HHOOK 41 #include <sys/hhook.h> 42 #endif 43 #include <sys/lock.h> 44 #include <sys/malloc.h> 45 #include <sys/lock.h> 46 #include <sys/mutex.h> 47 #include <sys/mbuf.h> 48 #include <sys/proc.h> /* for proc0 declaration */ 49 #include <sys/socket.h> 50 #include <sys/socketvar.h> 51 #include <sys/sysctl.h> 52 #include <sys/systm.h> 53 #ifdef STATS 54 #include <sys/qmath.h> 55 #include <sys/tree.h> 56 #include <sys/stats.h> /* Must come after qmath.h and tree.h */ 57 #else 58 #include <sys/tree.h> 59 #endif 60 #include <sys/refcount.h> 61 #include <sys/queue.h> 62 #include <sys/tim_filter.h> 63 #include <sys/smp.h> 64 #include <sys/kthread.h> 65 #include <sys/kern_prefetch.h> 66 #include <sys/protosw.h> 67 #ifdef TCP_ACCOUNTING 68 #include <sys/sched.h> 69 #include <machine/cpu.h> 70 #endif 71 #include <vm/uma.h> 72 73 #include <net/route.h> 74 #include <net/route/nhop.h> 75 #include <net/vnet.h> 76 77 #define TCPSTATES /* for logging */ 78 79 #include <netinet/in.h> 80 #include <netinet/in_kdtrace.h> 81 #include <netinet/in_pcb.h> 82 #include <netinet/ip.h> 83 #include <netinet/ip_icmp.h> /* required for icmp_var.h */ 84 #include <netinet/icmp_var.h> /* for ICMP_BANDLIM */ 85 #include <netinet/ip_var.h> 86 #include <netinet/ip6.h> 87 #include <netinet6/in6_pcb.h> 88 #include <netinet6/ip6_var.h> 89 #include <netinet/tcp.h> 90 #define TCPOUTFLAGS 91 #include <netinet/tcp_fsm.h> 92 #include <netinet/tcp_seq.h> 93 #include <netinet/tcp_timer.h> 94 #include <netinet/tcp_var.h> 95 #include <netinet/tcp_log_buf.h> 96 #include <netinet/tcp_syncache.h> 97 #include <netinet/tcp_hpts.h> 98 #include <netinet/tcp_ratelimit.h> 99 #include <netinet/tcp_accounting.h> 100 #include <netinet/tcpip.h> 101 #include <netinet/cc/cc.h> 102 #include <netinet/cc/cc_newreno.h> 103 #include <netinet/tcp_fastopen.h> 104 
#include <netinet/tcp_lro.h>
#ifdef NETFLIX_SHARED_CWND
#include <netinet/tcp_shared_cwnd.h>
#endif
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcp_ecn.h>

#include <netipsec/ipsec_support.h>

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif /* IPSEC */

#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif
#include "sack_filter.h"
#include "tcp_rack.h"
#include "tailq_hash.h"
#include "rack_bbr_common.h"

uma_zone_t rack_zone;
uma_zone_t rack_pcb_zone;

#ifndef TICKS2SBT
#define TICKS2SBT(__t) (tick_sbt * ((sbintime_t)(__t)))
#endif

VNET_DECLARE(uint32_t, newreno_beta);
VNET_DECLARE(uint32_t, newreno_beta_ecn);
#define V_newreno_beta VNET(newreno_beta)
#define V_newreno_beta_ecn VNET(newreno_beta_ecn)

MALLOC_DEFINE(M_TCPFSB, "tcp_fsb", "TCP fast send block");
MALLOC_DEFINE(M_TCPDO, "tcp_do", "TCP deferred options");

struct sysctl_ctx_list rack_sysctl_ctx;
struct sysctl_oid *rack_sysctl_root;

#define CUM_ACKED 1
#define SACKED 2

/*
 * The RACK module incorporates a number of
 * TCP ideas that have been put out into the IETF
 * over the last few years:
 * - Matt Mathis's Rate Halving which slowly drops
 *    the congestion window so that the ack clock can
 *    be maintained during a recovery.
 * - Yuchung Cheng's RACK TCP (for which it is named) that
 *    will stop us using the number of dup acks and instead
 *    use time as the gauge of when we retransmit.
 * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
 *    of Dukkipati et al.
 * RACK depends on SACK, so if an endpoint arrives that
 * cannot do SACK the state machine below will shuttle the
 * connection back to using the "default" TCP stack that is
 * in FreeBSD.
 *
 * To implement RACK the original TCP stack was first decomposed
 * into a functional state machine with individual states
 * for each of the possible TCP connection states. The do_segment
 * function's role in life is to mandate that the connection supports SACK
 * initially and then assure that the RACK state matches the connection
 * state before calling the state's do_segment function. Each
 * state is simplified due to the fact that the original do_segment
 * has been decomposed and we *know* what state we are in (no
 * switches on the state) and all tests for SACK are gone. This
 * greatly simplifies what each state does.
 *
 * TCP output is also overwritten with a new version since it
 * must maintain the new rack scoreboard.
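 *
 * As an illustrative sketch of that decomposition (the typedef and the
 * local variable below are hypothetical and not part of this file; the
 * stack's real per-state bookkeeping is handled via rack_set_state(),
 * declared further down), the idea boils down to:
 *
 *      typedef int (*rack_substate_fn)(struct mbuf *, struct tcphdr *,
 *          struct socket *, struct tcpcb *, struct tcpopt *, int32_t,
 *          int32_t, uint32_t, int32_t, int32_t, uint8_t);
 *
 *      // Picked once per connection-state change, e.g. ESTABLISHED:
 *      rack_substate_fn do_seg = rack_do_established;
 *      // The input path then just calls the current handler:
 *      int retval = do_seg(m, th, so, tp, &to, drop_hdrlen, tlen,
 *          tiwin, thflags, nxt_pkt, iptos);
 *
 * so each rack_do_<state>() routine can assume SACK is in use and that
 * it is only ever invoked in its own TCP state.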
187 * 188 */ 189 static int32_t rack_tlp_thresh = 1; 190 static int32_t rack_tlp_limit = 2; /* No more than 2 TLPs w-out new data */ 191 static int32_t rack_tlp_use_greater = 1; 192 static int32_t rack_reorder_thresh = 2; 193 static int32_t rack_reorder_fade = 60000000; /* 0 - never fade, def 60,000,000 194 * - 60 seconds */ 195 static uint32_t rack_clamp_ss_upper = 110; 196 static uint32_t rack_clamp_ca_upper = 105; 197 static uint32_t rack_rxt_min_rnds = 10; /* Min rounds if drastic rxt clamp is in place */ 198 static uint32_t rack_unclamp_round_thresh = 100; /* number of perfect rounds before we unclamp */ 199 static uint32_t rack_unclamp_rxt_thresh = 5; /* .5% and under */ 200 static uint64_t rack_rxt_clamp_thresh = 0; /* Do we do the rxt clamp thing */ 201 static int32_t rack_dnd_default = 0; /* For rr_conf = 3, what is the default for dnd */ 202 static int32_t rack_rxt_controls = 0; 203 static int32_t rack_fill_cw_state = 0; 204 static uint8_t rack_req_measurements = 1; 205 /* Attack threshold detections */ 206 static uint32_t rack_highest_sack_thresh_seen = 0; 207 static uint32_t rack_highest_move_thresh_seen = 0; 208 static uint32_t rack_merge_out_sacks_on_attack = 0; 209 static int32_t rack_enable_hw_pacing = 0; /* Due to CCSP keep it off by default */ 210 static int32_t rack_hw_pace_extra_slots = 0; /* 2 extra MSS time betweens */ 211 static int32_t rack_hw_rate_caps = 0; /* 1; */ 212 static int32_t rack_hw_rate_cap_per = 0; /* 0 -- off */ 213 static int32_t rack_hw_rate_min = 0; /* 1500000;*/ 214 static int32_t rack_hw_rate_to_low = 0; /* 1200000; */ 215 static int32_t rack_hw_up_only = 0; 216 static int32_t rack_stats_gets_ms_rtt = 1; 217 static int32_t rack_prr_addbackmax = 2; 218 static int32_t rack_do_hystart = 0; 219 static int32_t rack_apply_rtt_with_reduced_conf = 0; 220 static int32_t rack_hibeta_setting = 0; 221 static int32_t rack_default_pacing_divisor = 250; 222 static int32_t rack_uses_full_dgp_in_rec = 1; 223 static uint16_t rack_pacing_min_seg = 0; 224 225 226 static uint32_t sad_seg_size_per = 800; /* 80.0 % */ 227 static int32_t rack_pkt_delay = 1000; 228 static int32_t rack_send_a_lot_in_prr = 1; 229 static int32_t rack_min_to = 1000; /* Number of microsecond min timeout */ 230 static int32_t rack_verbose_logging = 0; 231 static int32_t rack_ignore_data_after_close = 1; 232 static int32_t rack_enable_shared_cwnd = 1; 233 static int32_t rack_use_cmp_acks = 1; 234 static int32_t rack_use_fsb = 1; 235 static int32_t rack_use_rfo = 1; 236 static int32_t rack_use_rsm_rfo = 1; 237 static int32_t rack_max_abc_post_recovery = 2; 238 static int32_t rack_client_low_buf = 0; 239 static int32_t rack_dsack_std_based = 0x3; /* bit field bit 1 sets rc_rack_tmr_std_based and bit 2 sets rc_rack_use_dsack */ 240 static int32_t rack_bw_multipler = 2; /* Limit on fill cw's jump up to be this x gp_est */ 241 #ifdef TCP_ACCOUNTING 242 static int32_t rack_tcp_accounting = 0; 243 #endif 244 static int32_t rack_limits_scwnd = 1; 245 static int32_t rack_enable_mqueue_for_nonpaced = 0; 246 static int32_t rack_hybrid_allow_set_maxseg = 0; 247 static int32_t rack_disable_prr = 0; 248 static int32_t use_rack_rr = 1; 249 static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the configured rate (ss/ca)? 
*/ 250 static int32_t rack_persist_min = 250000; /* 250usec */ 251 static int32_t rack_persist_max = 2000000; /* 2 Second in usec's */ 252 static int32_t rack_sack_not_required = 1; /* set to one to allow non-sack to use rack */ 253 static int32_t rack_default_init_window = 0; /* Use system default */ 254 static int32_t rack_limit_time_with_srtt = 0; 255 static int32_t rack_autosndbuf_inc = 20; /* In percentage form */ 256 static int32_t rack_enobuf_hw_boost_mult = 0; /* How many times the hw rate we boost slot using time_between */ 257 static int32_t rack_enobuf_hw_max = 12000; /* 12 ms in usecs */ 258 static int32_t rack_enobuf_hw_min = 10000; /* 10 ms in usecs */ 259 static int32_t rack_hw_rwnd_factor = 2; /* How many max_segs the rwnd must be before we hold off sending */ 260 static int32_t rack_hw_check_queue = 0; /* Do we always pre-check queue depth of a hw queue */ 261 static int32_t rack_full_buffer_discount = 10; 262 /* 263 * Currently regular tcp has a rto_min of 30ms 264 * the backoff goes 12 times so that ends up 265 * being a total of 122.850 seconds before a 266 * connection is killed. 267 */ 268 static uint32_t rack_def_data_window = 20; 269 static uint32_t rack_goal_bdp = 2; 270 static uint32_t rack_min_srtts = 1; 271 static uint32_t rack_min_measure_usec = 0; 272 static int32_t rack_tlp_min = 10000; /* 10ms */ 273 static int32_t rack_rto_min = 30000; /* 30,000 usec same as main freebsd */ 274 static int32_t rack_rto_max = 4000000; /* 4 seconds in usec's */ 275 static const int32_t rack_free_cache = 2; 276 static int32_t rack_hptsi_segments = 40; 277 static int32_t rack_rate_sample_method = USE_RTT_LOW; 278 static int32_t rack_pace_every_seg = 0; 279 static int32_t rack_delayed_ack_time = 40000; /* 40ms in usecs */ 280 static int32_t rack_slot_reduction = 4; 281 static int32_t rack_wma_divisor = 8; /* For WMA calculation */ 282 static int32_t rack_cwnd_block_ends_measure = 0; 283 static int32_t rack_rwnd_block_ends_measure = 0; 284 static int32_t rack_def_profile = 0; 285 286 static int32_t rack_lower_cwnd_at_tlp = 0; 287 static int32_t rack_limited_retran = 0; 288 static int32_t rack_always_send_oldest = 0; 289 static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE; 290 291 static uint16_t rack_per_of_gp_ss = 250; /* 250 % slow-start */ 292 static uint16_t rack_per_of_gp_ca = 200; /* 200 % congestion-avoidance */ 293 static uint16_t rack_per_of_gp_rec = 200; /* 200 % of bw */ 294 295 /* Probertt */ 296 static uint16_t rack_per_of_gp_probertt = 60; /* 60% of bw */ 297 static uint16_t rack_per_of_gp_lowthresh = 40; /* 40% is bottom */ 298 static uint16_t rack_per_of_gp_probertt_reduce = 10; /* 10% reduction */ 299 static uint16_t rack_atexit_prtt_hbp = 130; /* Clamp to 130% on exit prtt if highly buffered path */ 300 static uint16_t rack_atexit_prtt = 130; /* Clamp to 100% on exit prtt if non highly buffered path */ 301 302 static uint32_t rack_max_drain_wait = 2; /* How man gp srtt's before we give up draining */ 303 static uint32_t rack_must_drain = 1; /* How many GP srtt's we *must* wait */ 304 static uint32_t rack_probertt_use_min_rtt_entry = 1; /* Use the min to calculate the goal else gp_srtt */ 305 static uint32_t rack_probertt_use_min_rtt_exit = 0; 306 static uint32_t rack_probe_rtt_sets_cwnd = 0; 307 static uint32_t rack_probe_rtt_safety_val = 2000000; /* No more than 2 sec in probe-rtt */ 308 static uint32_t rack_time_between_probertt = 9600000; /* 9.6 sec in usecs */ 309 static uint32_t rack_probertt_gpsrtt_cnt_mul = 0; /* How many srtt periods does probe-rtt 
last top fraction */ 310 static uint32_t rack_probertt_gpsrtt_cnt_div = 0; /* How many srtt periods does probe-rtt last bottom fraction */ 311 static uint32_t rack_min_probertt_hold = 40000; /* Equal to delayed ack time */ 312 static uint32_t rack_probertt_filter_life = 10000000; 313 static uint32_t rack_probertt_lower_within = 10; 314 static uint32_t rack_min_rtt_movement = 250000; /* Must move at least 250ms (in microseconds) to count as a lowering */ 315 static int32_t rack_pace_one_seg = 0; /* Shall we pace for less than 1.4Meg 1MSS at a time */ 316 static int32_t rack_probertt_clear_is = 1; 317 static int32_t rack_max_drain_hbp = 1; /* Extra drain times gpsrtt for highly buffered paths */ 318 static int32_t rack_hbp_thresh = 3; /* what is the divisor max_rtt/min_rtt to decided a hbp */ 319 320 /* Part of pacing */ 321 static int32_t rack_max_per_above = 30; /* When we go to increment stop if above 100+this% */ 322 323 /* Timely information */ 324 /* Combine these two gives the range of 'no change' to bw */ 325 /* ie the up/down provide the upper and lower bound */ 326 static int32_t rack_gp_per_bw_mul_up = 2; /* 2% */ 327 static int32_t rack_gp_per_bw_mul_down = 4; /* 4% */ 328 static int32_t rack_gp_rtt_maxmul = 3; /* 3 x maxmin */ 329 static int32_t rack_gp_rtt_minmul = 1; /* minrtt + (minrtt/mindiv) is lower rtt */ 330 static int32_t rack_gp_rtt_mindiv = 4; /* minrtt + (minrtt * minmul/mindiv) is lower rtt */ 331 static int32_t rack_gp_decrease_per = 20; /* 20% decrease in multiplier */ 332 static int32_t rack_gp_increase_per = 2; /* 2% increase in multiplier */ 333 static int32_t rack_per_lower_bound = 50; /* Don't allow to drop below this multiplier */ 334 static int32_t rack_per_upper_bound_ss = 0; /* Don't allow SS to grow above this */ 335 static int32_t rack_per_upper_bound_ca = 0; /* Don't allow CA to grow above this */ 336 static int32_t rack_do_dyn_mul = 0; /* Are the rack gp multipliers dynamic */ 337 static int32_t rack_gp_no_rec_chg = 1; /* Prohibit recovery from reducing it's multiplier */ 338 static int32_t rack_timely_dec_clear = 6; /* Do we clear decrement count at a value (6)? */ 339 static int32_t rack_timely_max_push_rise = 3; /* One round of pushing */ 340 static int32_t rack_timely_max_push_drop = 3; /* Three round of pushing */ 341 static int32_t rack_timely_min_segs = 4; /* 4 segment minimum */ 342 static int32_t rack_use_max_for_nobackoff = 0; 343 static int32_t rack_timely_int_timely_only = 0; /* do interim timely's only use the timely algo (no b/w changes)? 
*/ 344 static int32_t rack_timely_no_stopping = 0; 345 static int32_t rack_down_raise_thresh = 100; 346 static int32_t rack_req_segs = 1; 347 static uint64_t rack_bw_rate_cap = 0; 348 349 350 /* Rack specific counters */ 351 counter_u64_t rack_saw_enobuf; 352 counter_u64_t rack_saw_enobuf_hw; 353 counter_u64_t rack_saw_enetunreach; 354 counter_u64_t rack_persists_sends; 355 counter_u64_t rack_persists_acks; 356 counter_u64_t rack_persists_loss; 357 counter_u64_t rack_persists_lost_ends; 358 counter_u64_t rack_total_bytes; 359 #ifdef INVARIANTS 360 counter_u64_t rack_adjust_map_bw; 361 #endif 362 /* Tail loss probe counters */ 363 counter_u64_t rack_tlp_tot; 364 counter_u64_t rack_tlp_newdata; 365 counter_u64_t rack_tlp_retran; 366 counter_u64_t rack_tlp_retran_bytes; 367 counter_u64_t rack_to_tot; 368 counter_u64_t rack_hot_alloc; 369 counter_u64_t rack_to_alloc; 370 counter_u64_t rack_to_alloc_hard; 371 counter_u64_t rack_to_alloc_emerg; 372 counter_u64_t rack_to_alloc_limited; 373 counter_u64_t rack_alloc_limited_conns; 374 counter_u64_t rack_split_limited; 375 counter_u64_t rack_rxt_clamps_cwnd; 376 counter_u64_t rack_rxt_clamps_cwnd_uniq; 377 378 counter_u64_t rack_multi_single_eq; 379 counter_u64_t rack_proc_non_comp_ack; 380 381 counter_u64_t rack_fto_send; 382 counter_u64_t rack_fto_rsm_send; 383 counter_u64_t rack_nfto_resend; 384 counter_u64_t rack_non_fto_send; 385 counter_u64_t rack_extended_rfo; 386 387 counter_u64_t rack_sack_proc_all; 388 counter_u64_t rack_sack_proc_short; 389 counter_u64_t rack_sack_proc_restart; 390 counter_u64_t rack_sack_attacks_detected; 391 counter_u64_t rack_sack_attacks_reversed; 392 counter_u64_t rack_sack_attacks_suspect; 393 counter_u64_t rack_sack_used_next_merge; 394 counter_u64_t rack_sack_splits; 395 counter_u64_t rack_sack_used_prev_merge; 396 counter_u64_t rack_sack_skipped_acked; 397 counter_u64_t rack_ack_total; 398 counter_u64_t rack_express_sack; 399 counter_u64_t rack_sack_total; 400 counter_u64_t rack_move_none; 401 counter_u64_t rack_move_some; 402 403 counter_u64_t rack_input_idle_reduces; 404 counter_u64_t rack_collapsed_win; 405 counter_u64_t rack_collapsed_win_seen; 406 counter_u64_t rack_collapsed_win_rxt; 407 counter_u64_t rack_collapsed_win_rxt_bytes; 408 counter_u64_t rack_try_scwnd; 409 counter_u64_t rack_hw_pace_init_fail; 410 counter_u64_t rack_hw_pace_lost; 411 412 counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE]; 413 counter_u64_t rack_opts_arry[RACK_OPTS_SIZE]; 414 415 416 #define RACK_REXMTVAL(tp) max(rack_rto_min, ((tp)->t_srtt + ((tp)->t_rttvar << 2))) 417 418 #define RACK_TCPT_RANGESET(tv, value, tvmin, tvmax, slop) do { \ 419 (tv) = (value) + slop; \ 420 if ((u_long)(tv) < (u_long)(tvmin)) \ 421 (tv) = (tvmin); \ 422 if ((u_long)(tv) > (u_long)(tvmax)) \ 423 (tv) = (tvmax); \ 424 } while (0) 425 426 static void 427 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line); 428 429 static int 430 rack_process_ack(struct mbuf *m, struct tcphdr *th, 431 struct socket *so, struct tcpcb *tp, struct tcpopt *to, 432 uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val); 433 static int 434 rack_process_data(struct mbuf *m, struct tcphdr *th, 435 struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 436 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 437 static void 438 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, 439 uint32_t th_ack, uint16_t nsegs, uint16_t type, int32_t recovery); 440 static struct rack_sendmap 
*rack_alloc(struct tcp_rack *rack); 441 static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack, 442 uint8_t limit_type); 443 static struct rack_sendmap * 444 rack_check_recovery_mode(struct tcpcb *tp, 445 uint32_t tsused); 446 static void 447 rack_cong_signal(struct tcpcb *tp, 448 uint32_t type, uint32_t ack, int ); 449 static void rack_counter_destroy(void); 450 static int 451 rack_ctloutput(struct tcpcb *tp, struct sockopt *sopt); 452 static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how); 453 static void 454 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override); 455 static void 456 rack_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, 457 int32_t drop_hdrlen, int32_t tlen, uint8_t iptos); 458 static void rack_dtor(void *mem, int32_t size, void *arg); 459 static void 460 rack_log_alt_to_to_cancel(struct tcp_rack *rack, 461 uint32_t flex1, uint32_t flex2, 462 uint32_t flex3, uint32_t flex4, 463 uint32_t flex5, uint32_t flex6, 464 uint16_t flex7, uint8_t mod); 465 466 static void 467 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, 468 uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line, 469 struct rack_sendmap *rsm, uint8_t quality); 470 static struct rack_sendmap * 471 rack_find_high_nonack(struct tcp_rack *rack, 472 struct rack_sendmap *rsm); 473 static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack); 474 static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm); 475 static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged); 476 static int rack_get_sockopt(struct tcpcb *tp, struct sockopt *sopt); 477 static void 478 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, 479 tcp_seq th_ack, int line, uint8_t quality); 480 static void 481 rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint8_t frm); 482 483 static uint32_t 484 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss); 485 static int32_t rack_handoff_ok(struct tcpcb *tp); 486 static int32_t rack_init(struct tcpcb *tp, void **ptr); 487 static void rack_init_sysctls(void); 488 489 static void 490 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, 491 struct tcphdr *th, int entered_rec, int dup_ack_struck, 492 int *dsack_seen, int *sacks_seen); 493 static void 494 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 495 uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t ts, 496 struct rack_sendmap *hintrsm, uint16_t add_flags, struct mbuf *s_mb, uint32_t s_moff, int hw_tls, int segsiz); 497 498 static uint64_t rack_get_gp_est(struct tcp_rack *rack); 499 500 static void 501 rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack, 502 struct rack_sendmap *rsm); 503 static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm); 504 static int32_t rack_output(struct tcpcb *tp); 505 506 static uint32_t 507 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, 508 struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm, 509 uint32_t cts, int *no_extra, int *moved_two, uint32_t segsiz); 510 static void rack_post_recovery(struct tcpcb *tp, uint32_t th_seq); 511 static void rack_remxt_tmr(struct tcpcb *tp); 512 static int rack_set_sockopt(struct tcpcb *tp, struct sockopt *sopt); 513 static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack); 514 static int32_t rack_stopall(struct tcpcb 
*tp); 515 static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line); 516 static uint32_t 517 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 518 struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint16_t add_flag, int segsiz); 519 static void 520 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, 521 struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag, int segsiz); 522 static int 523 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 524 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack); 525 static int32_t tcp_addrack(module_t mod, int32_t type, void *data); 526 static int 527 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, 528 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 529 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 530 static int 531 rack_do_closing(struct mbuf *m, struct tcphdr *th, 532 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 533 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 534 static int 535 rack_do_established(struct mbuf *m, struct tcphdr *th, 536 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 537 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 538 static int 539 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, 540 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 541 int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos); 542 static int 543 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, 544 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 545 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 546 static int 547 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, 548 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 549 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 550 static int 551 rack_do_lastack(struct mbuf *m, struct tcphdr *th, 552 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 553 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 554 static int 555 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, 556 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 557 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 558 static int 559 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, 560 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 561 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 562 static void rack_chk_req_and_hybrid_on_out(struct tcp_rack *rack, tcp_seq seq, uint32_t len, uint64_t cts); 563 struct rack_sendmap * 564 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, 565 uint32_t tsused); 566 static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, 567 uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt); 568 static void 569 tcp_rack_partialack(struct tcpcb *tp); 570 static int 571 rack_set_profile(struct tcp_rack *rack, int prof); 572 static void 573 rack_apply_deferred_options(struct tcp_rack *rack); 574 575 int32_t rack_clear_counter=0; 576 577 static uint64_t 578 rack_get_lt_bw(struct tcp_rack *rack) 579 { 580 struct timeval tv; 581 uint64_t tim, bytes; 582 583 tim 
= rack->r_ctl.lt_bw_time; 584 bytes = rack->r_ctl.lt_bw_bytes; 585 if (rack->lt_bw_up) { 586 /* Include all the current bytes too */ 587 microuptime(&tv); 588 bytes += (rack->rc_tp->snd_una - rack->r_ctl.lt_seq); 589 tim += (tcp_tv_to_lusectick(&tv) - rack->r_ctl.lt_timemark); 590 } 591 if ((bytes != 0) && (tim != 0)) 592 return ((bytes * (uint64_t)1000000) / tim); 593 else 594 return (0); 595 } 596 597 static void 598 rack_swap_beta_values(struct tcp_rack *rack, uint8_t flex8) 599 { 600 struct sockopt sopt; 601 struct cc_newreno_opts opt; 602 struct newreno old; 603 struct tcpcb *tp; 604 int error, failed = 0; 605 606 tp = rack->rc_tp; 607 if (tp->t_cc == NULL) { 608 /* Tcb is leaving */ 609 return; 610 } 611 rack->rc_pacing_cc_set = 1; 612 if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) { 613 /* Not new-reno we can't play games with beta! */ 614 failed = 1; 615 goto out; 616 617 } 618 if (CC_ALGO(tp)->ctl_output == NULL) { 619 /* Huh, not using new-reno so no swaps.? */ 620 failed = 2; 621 goto out; 622 } 623 /* Get the current values out */ 624 sopt.sopt_valsize = sizeof(struct cc_newreno_opts); 625 sopt.sopt_dir = SOPT_GET; 626 opt.name = CC_NEWRENO_BETA; 627 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt); 628 if (error) { 629 failed = 3; 630 goto out; 631 } 632 old.beta = opt.val; 633 opt.name = CC_NEWRENO_BETA_ECN; 634 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt); 635 if (error) { 636 failed = 4; 637 goto out; 638 } 639 old.beta_ecn = opt.val; 640 641 /* Now lets set in the values we have stored */ 642 sopt.sopt_dir = SOPT_SET; 643 opt.name = CC_NEWRENO_BETA; 644 opt.val = rack->r_ctl.rc_saved_beta.beta; 645 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt); 646 if (error) { 647 failed = 5; 648 goto out; 649 } 650 opt.name = CC_NEWRENO_BETA_ECN; 651 opt.val = rack->r_ctl.rc_saved_beta.beta_ecn; 652 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt); 653 if (error) { 654 failed = 6; 655 goto out; 656 } 657 /* Save off the values for restoral */ 658 memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno)); 659 out: 660 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 661 union tcp_log_stackspecific log; 662 struct timeval tv; 663 struct newreno *ptr; 664 665 ptr = ((struct newreno *)tp->t_ccv.cc_data); 666 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 667 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 668 log.u_bbr.flex1 = ptr->beta; 669 log.u_bbr.flex2 = ptr->beta_ecn; 670 log.u_bbr.flex3 = ptr->newreno_flags; 671 log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta; 672 log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn; 673 log.u_bbr.flex6 = failed; 674 log.u_bbr.flex7 = rack->gp_ready; 675 log.u_bbr.flex7 <<= 1; 676 log.u_bbr.flex7 |= rack->use_fixed_rate; 677 log.u_bbr.flex7 <<= 1; 678 log.u_bbr.flex7 |= rack->rc_pacing_cc_set; 679 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 680 log.u_bbr.flex8 = flex8; 681 tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, error, 682 0, &log, false, NULL, NULL, 0, &tv); 683 } 684 } 685 686 static void 687 rack_set_cc_pacing(struct tcp_rack *rack) 688 { 689 if (rack->rc_pacing_cc_set) 690 return; 691 /* 692 * Use the swap utility placing in 3 for flex8 to id a 693 * set of a new set of values. 694 */ 695 rack->rc_pacing_cc_set = 1; 696 rack_swap_beta_values(rack, 3); 697 } 698 699 static void 700 rack_undo_cc_pacing(struct tcp_rack *rack) 701 { 702 if (rack->rc_pacing_cc_set == 0) 703 return; 704 /* 705 * Use the swap utility placing in 4 for flex8 to id a 706 * restoral of the old values. 
707 */ 708 rack->rc_pacing_cc_set = 0; 709 rack_swap_beta_values(rack, 4); 710 } 711 712 static void 713 rack_log_gpset(struct tcp_rack *rack, uint32_t seq_end, uint32_t ack_end_t, 714 uint32_t send_end_t, int line, uint8_t mode, struct rack_sendmap *rsm) 715 { 716 if (tcp_bblogging_on(rack->rc_tp)) { 717 union tcp_log_stackspecific log; 718 struct timeval tv; 719 720 memset(&log, 0, sizeof(log)); 721 log.u_bbr.flex1 = seq_end; 722 log.u_bbr.flex2 = rack->rc_tp->gput_seq; 723 log.u_bbr.flex3 = ack_end_t; 724 log.u_bbr.flex4 = rack->rc_tp->gput_ts; 725 log.u_bbr.flex5 = send_end_t; 726 log.u_bbr.flex6 = rack->rc_tp->gput_ack; 727 log.u_bbr.flex7 = mode; 728 log.u_bbr.flex8 = 69; 729 log.u_bbr.rttProp = rack->r_ctl.rc_gp_cumack_ts; 730 log.u_bbr.delRate = rack->r_ctl.rc_gp_output_ts; 731 log.u_bbr.pkts_out = line; 732 log.u_bbr.cwnd_gain = rack->app_limited_needs_set; 733 log.u_bbr.pkt_epoch = rack->r_ctl.rc_app_limited_cnt; 734 if (rsm != NULL) { 735 log.u_bbr.applimited = rsm->r_start; 736 log.u_bbr.delivered = rsm->r_end; 737 log.u_bbr.epoch = rsm->r_flags; 738 } 739 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 740 TCP_LOG_EVENTP(rack->rc_tp, NULL, 741 &rack->rc_inp->inp_socket->so_rcv, 742 &rack->rc_inp->inp_socket->so_snd, 743 BBR_LOG_HPTSI_CALC, 0, 744 0, &log, false, &tv); 745 } 746 } 747 748 static int 749 sysctl_rack_clear(SYSCTL_HANDLER_ARGS) 750 { 751 uint32_t stat; 752 int32_t error; 753 754 error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t)); 755 if (error || req->newptr == NULL) 756 return error; 757 758 error = SYSCTL_IN(req, &stat, sizeof(uint32_t)); 759 if (error) 760 return (error); 761 if (stat == 1) { 762 #ifdef INVARIANTS 763 printf("Clearing RACK counters\n"); 764 #endif 765 counter_u64_zero(rack_tlp_tot); 766 counter_u64_zero(rack_tlp_newdata); 767 counter_u64_zero(rack_tlp_retran); 768 counter_u64_zero(rack_tlp_retran_bytes); 769 counter_u64_zero(rack_to_tot); 770 counter_u64_zero(rack_saw_enobuf); 771 counter_u64_zero(rack_saw_enobuf_hw); 772 counter_u64_zero(rack_saw_enetunreach); 773 counter_u64_zero(rack_persists_sends); 774 counter_u64_zero(rack_total_bytes); 775 counter_u64_zero(rack_persists_acks); 776 counter_u64_zero(rack_persists_loss); 777 counter_u64_zero(rack_persists_lost_ends); 778 #ifdef INVARIANTS 779 counter_u64_zero(rack_adjust_map_bw); 780 #endif 781 counter_u64_zero(rack_to_alloc_hard); 782 counter_u64_zero(rack_to_alloc_emerg); 783 counter_u64_zero(rack_sack_proc_all); 784 counter_u64_zero(rack_fto_send); 785 counter_u64_zero(rack_fto_rsm_send); 786 counter_u64_zero(rack_extended_rfo); 787 counter_u64_zero(rack_hw_pace_init_fail); 788 counter_u64_zero(rack_hw_pace_lost); 789 counter_u64_zero(rack_non_fto_send); 790 counter_u64_zero(rack_nfto_resend); 791 counter_u64_zero(rack_sack_proc_short); 792 counter_u64_zero(rack_sack_proc_restart); 793 counter_u64_zero(rack_to_alloc); 794 counter_u64_zero(rack_to_alloc_limited); 795 counter_u64_zero(rack_alloc_limited_conns); 796 counter_u64_zero(rack_split_limited); 797 counter_u64_zero(rack_rxt_clamps_cwnd); 798 counter_u64_zero(rack_rxt_clamps_cwnd_uniq); 799 counter_u64_zero(rack_multi_single_eq); 800 counter_u64_zero(rack_proc_non_comp_ack); 801 counter_u64_zero(rack_sack_attacks_detected); 802 counter_u64_zero(rack_sack_attacks_reversed); 803 counter_u64_zero(rack_sack_attacks_suspect); 804 counter_u64_zero(rack_sack_used_next_merge); 805 counter_u64_zero(rack_sack_used_prev_merge); 806 counter_u64_zero(rack_sack_splits); 807 counter_u64_zero(rack_sack_skipped_acked); 808 
counter_u64_zero(rack_ack_total); 809 counter_u64_zero(rack_express_sack); 810 counter_u64_zero(rack_sack_total); 811 counter_u64_zero(rack_move_none); 812 counter_u64_zero(rack_move_some); 813 counter_u64_zero(rack_try_scwnd); 814 counter_u64_zero(rack_collapsed_win); 815 counter_u64_zero(rack_collapsed_win_rxt); 816 counter_u64_zero(rack_collapsed_win_seen); 817 counter_u64_zero(rack_collapsed_win_rxt_bytes); 818 } else if (stat == 2) { 819 #ifdef INVARIANTS 820 printf("Clearing RACK option array\n"); 821 #endif 822 COUNTER_ARRAY_ZERO(rack_opts_arry, RACK_OPTS_SIZE); 823 } else if (stat == 3) { 824 printf("Rack has no stats counters to clear (use 1 to clear all stats in sysctl node)\n"); 825 } else if (stat == 4) { 826 #ifdef INVARIANTS 827 printf("Clearing RACK out size array\n"); 828 #endif 829 COUNTER_ARRAY_ZERO(rack_out_size, TCP_MSS_ACCT_SIZE); 830 } 831 rack_clear_counter = 0; 832 return (0); 833 } 834 835 static void 836 rack_init_sysctls(void) 837 { 838 struct sysctl_oid *rack_counters; 839 struct sysctl_oid *rack_attack; 840 struct sysctl_oid *rack_pacing; 841 struct sysctl_oid *rack_timely; 842 struct sysctl_oid *rack_timers; 843 struct sysctl_oid *rack_tlp; 844 struct sysctl_oid *rack_misc; 845 struct sysctl_oid *rack_features; 846 struct sysctl_oid *rack_measure; 847 struct sysctl_oid *rack_probertt; 848 struct sysctl_oid *rack_hw_pacing; 849 850 rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 851 SYSCTL_CHILDREN(rack_sysctl_root), 852 OID_AUTO, 853 "sack_attack", 854 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 855 "Rack Sack Attack Counters and Controls"); 856 rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 857 SYSCTL_CHILDREN(rack_sysctl_root), 858 OID_AUTO, 859 "stats", 860 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 861 "Rack Counters"); 862 SYSCTL_ADD_S32(&rack_sysctl_ctx, 863 SYSCTL_CHILDREN(rack_sysctl_root), 864 OID_AUTO, "rate_sample_method", CTLFLAG_RW, 865 &rack_rate_sample_method , USE_RTT_LOW, 866 "What method should we use for rate sampling 0=high, 1=low "); 867 /* Probe rtt related controls */ 868 rack_probertt = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 869 SYSCTL_CHILDREN(rack_sysctl_root), 870 OID_AUTO, 871 "probertt", 872 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 873 "ProbeRTT related Controls"); 874 SYSCTL_ADD_U16(&rack_sysctl_ctx, 875 SYSCTL_CHILDREN(rack_probertt), 876 OID_AUTO, "exit_per_hpb", CTLFLAG_RW, 877 &rack_atexit_prtt_hbp, 130, 878 "What percentage above goodput do we clamp CA/SS to at exit on high-BDP path 110%"); 879 SYSCTL_ADD_U16(&rack_sysctl_ctx, 880 SYSCTL_CHILDREN(rack_probertt), 881 OID_AUTO, "exit_per_nonhpb", CTLFLAG_RW, 882 &rack_atexit_prtt, 130, 883 "What percentage above goodput do we clamp CA/SS to at exit on a non high-BDP path 100%"); 884 SYSCTL_ADD_U16(&rack_sysctl_ctx, 885 SYSCTL_CHILDREN(rack_probertt), 886 OID_AUTO, "gp_per_mul", CTLFLAG_RW, 887 &rack_per_of_gp_probertt, 60, 888 "What percentage of goodput do we pace at in probertt"); 889 SYSCTL_ADD_U16(&rack_sysctl_ctx, 890 SYSCTL_CHILDREN(rack_probertt), 891 OID_AUTO, "gp_per_reduce", CTLFLAG_RW, 892 &rack_per_of_gp_probertt_reduce, 10, 893 "What percentage of goodput do we reduce every gp_srtt"); 894 SYSCTL_ADD_U16(&rack_sysctl_ctx, 895 SYSCTL_CHILDREN(rack_probertt), 896 OID_AUTO, "gp_per_low", CTLFLAG_RW, 897 &rack_per_of_gp_lowthresh, 40, 898 "What percentage of goodput do we allow the multiplier to fall to"); 899 SYSCTL_ADD_U32(&rack_sysctl_ctx, 900 SYSCTL_CHILDREN(rack_probertt), 901 OID_AUTO, "time_between", CTLFLAG_RW, 902 & rack_time_between_probertt, 96000000, 903 "How many useconds between the lowest 
rtt falling must pass before we enter probertt");
        SYSCTL_ADD_U32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "safety", CTLFLAG_RW,
            &rack_probe_rtt_safety_val, 2000000,
            "If not zero, provides a maximum usecond that you can stay in probertt (2sec = 2000000)");
        SYSCTL_ADD_U32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "sets_cwnd", CTLFLAG_RW,
            &rack_probe_rtt_sets_cwnd, 0,
            "Do we set the cwnd too (if always_lower is on)");
        SYSCTL_ADD_U32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "maxdrainsrtts", CTLFLAG_RW,
            &rack_max_drain_wait, 2,
            "Maximum number of gp_srtt's to hold in drain waiting for flight to reach goal");
        SYSCTL_ADD_U32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "mustdrainsrtts", CTLFLAG_RW,
            &rack_must_drain, 1,
            "We must drain this many gp_srtt's waiting for flight to reach goal");
        SYSCTL_ADD_U32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "goal_use_min_entry", CTLFLAG_RW,
            &rack_probertt_use_min_rtt_entry, 1,
            "Should we use the min-rtt to calculate the goal rtt (else gp_srtt) at entry");
        SYSCTL_ADD_U32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "goal_use_min_exit", CTLFLAG_RW,
            &rack_probertt_use_min_rtt_exit, 0,
            "How to set cwnd at exit, 0 - dynamic, 1 - use min-rtt, 2 - use curgprtt, 3 - entry gp-rtt");
        SYSCTL_ADD_U32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "length_div", CTLFLAG_RW,
            &rack_probertt_gpsrtt_cnt_div, 0,
            "How many recent goodput srtt periods plus hold time does probertt last (bottom of fraction)");
        SYSCTL_ADD_U32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "length_mul", CTLFLAG_RW,
            &rack_probertt_gpsrtt_cnt_mul, 0,
            "How many recent goodput srtt periods plus hold time does probertt last (top of fraction)");
        SYSCTL_ADD_U32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "holdtim_at_target", CTLFLAG_RW,
            &rack_min_probertt_hold, 200000,
            "What is the minimum time we hold probertt at target");
        SYSCTL_ADD_U32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "filter_life", CTLFLAG_RW,
            &rack_probertt_filter_life, 10000000,
            "What is the time for the filter's life in useconds");
        SYSCTL_ADD_U32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "lower_within", CTLFLAG_RW,
            &rack_probertt_lower_within, 10,
            "If the rtt goes lower within this percentage of the time, go into probe-rtt");
        SYSCTL_ADD_U32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "must_move", CTLFLAG_RW,
            &rack_min_rtt_movement, 250,
            "How much is the minimum movement in rtt to count as a drop for probertt purposes");
        SYSCTL_ADD_U32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "clear_is_cnts", CTLFLAG_RW,
            &rack_probertt_clear_is, 1,
            "Do we clear I/S counts on exiting probe-rtt");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "hbp_extra_drain", CTLFLAG_RW,
            &rack_max_drain_hbp, 1,
            "How many extra drain gpsrtt's do we get in highly buffered paths");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_probertt),
            OID_AUTO, "hbp_threshold", CTLFLAG_RW,
            &rack_hbp_thresh, 3,
            "We are highly buffered if max_rtt_seen / min_rtt_seen > this-threshold");
        /* Pacing related sysctls */
        rack_pacing =
SYSCTL_ADD_NODE(&rack_sysctl_ctx, 981 SYSCTL_CHILDREN(rack_sysctl_root), 982 OID_AUTO, 983 "pacing", 984 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 985 "Pacing related Controls"); 986 SYSCTL_ADD_S32(&rack_sysctl_ctx, 987 SYSCTL_CHILDREN(rack_pacing), 988 OID_AUTO, "fulldgpinrec", CTLFLAG_RW, 989 &rack_uses_full_dgp_in_rec, 1, 990 "Do we use all DGP features in recovery (fillcw, timely et.al.)?"); 991 SYSCTL_ADD_S32(&rack_sysctl_ctx, 992 SYSCTL_CHILDREN(rack_pacing), 993 OID_AUTO, "fullbufdisc", CTLFLAG_RW, 994 &rack_full_buffer_discount, 10, 995 "What percentage b/w reduction over the GP estimate for a full buffer (default=0 off)?"); 996 SYSCTL_ADD_S32(&rack_sysctl_ctx, 997 SYSCTL_CHILDREN(rack_pacing), 998 OID_AUTO, "fillcw", CTLFLAG_RW, 999 &rack_fill_cw_state, 0, 1000 "Enable fillcw on new connections (default=0 off)?"); 1001 SYSCTL_ADD_U16(&rack_sysctl_ctx, 1002 SYSCTL_CHILDREN(rack_pacing), 1003 OID_AUTO, "min_burst", CTLFLAG_RW, 1004 &rack_pacing_min_seg, 0, 1005 "What is the min burst size for pacing (0 disables)?"); 1006 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1007 SYSCTL_CHILDREN(rack_pacing), 1008 OID_AUTO, "divisor", CTLFLAG_RW, 1009 &rack_default_pacing_divisor, 4, 1010 "What is the default divisor given to the rl code?"); 1011 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1012 SYSCTL_CHILDREN(rack_pacing), 1013 OID_AUTO, "fillcw_max_mult", CTLFLAG_RW, 1014 &rack_bw_multipler, 2, 1015 "What is the multiplier of the current gp_est that fillcw can increase the b/w too?"); 1016 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1017 SYSCTL_CHILDREN(rack_pacing), 1018 OID_AUTO, "max_pace_over", CTLFLAG_RW, 1019 &rack_max_per_above, 30, 1020 "What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)"); 1021 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1022 SYSCTL_CHILDREN(rack_pacing), 1023 OID_AUTO, "allow1mss", CTLFLAG_RW, 1024 &rack_pace_one_seg, 0, 1025 "Do we allow low b/w pacing of 1MSS instead of two (1.2Meg and less)?"); 1026 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1027 SYSCTL_CHILDREN(rack_pacing), 1028 OID_AUTO, "limit_wsrtt", CTLFLAG_RW, 1029 &rack_limit_time_with_srtt, 0, 1030 "Do we limit pacing time based on srtt"); 1031 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1032 SYSCTL_CHILDREN(rack_pacing), 1033 OID_AUTO, "init_win", CTLFLAG_RW, 1034 &rack_default_init_window, 0, 1035 "Do we have a rack initial window 0 = system default"); 1036 SYSCTL_ADD_U16(&rack_sysctl_ctx, 1037 SYSCTL_CHILDREN(rack_pacing), 1038 OID_AUTO, "gp_per_ss", CTLFLAG_RW, 1039 &rack_per_of_gp_ss, 250, 1040 "If non zero, what percentage of goodput to pace at in slow start"); 1041 SYSCTL_ADD_U16(&rack_sysctl_ctx, 1042 SYSCTL_CHILDREN(rack_pacing), 1043 OID_AUTO, "gp_per_ca", CTLFLAG_RW, 1044 &rack_per_of_gp_ca, 150, 1045 "If non zero, what percentage of goodput to pace at in congestion avoidance"); 1046 SYSCTL_ADD_U16(&rack_sysctl_ctx, 1047 SYSCTL_CHILDREN(rack_pacing), 1048 OID_AUTO, "gp_per_rec", CTLFLAG_RW, 1049 &rack_per_of_gp_rec, 200, 1050 "If non zero, what percentage of goodput to pace at in recovery"); 1051 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1052 SYSCTL_CHILDREN(rack_pacing), 1053 OID_AUTO, "pace_max_seg", CTLFLAG_RW, 1054 &rack_hptsi_segments, 40, 1055 "What size is the max for TSO segments in pacing and burst mitigation"); 1056 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1057 SYSCTL_CHILDREN(rack_pacing), 1058 OID_AUTO, "burst_reduces", CTLFLAG_RW, 1059 &rack_slot_reduction, 4, 1060 "When doing only burst mitigation what is the reduce divisor"); 1061 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1062 SYSCTL_CHILDREN(rack_sysctl_root), 1063 OID_AUTO, 
"use_pacing", CTLFLAG_RW, 1064 &rack_pace_every_seg, 0, 1065 "If set we use pacing, if clear we use only the original burst mitigation"); 1066 SYSCTL_ADD_U64(&rack_sysctl_ctx, 1067 SYSCTL_CHILDREN(rack_pacing), 1068 OID_AUTO, "rate_cap", CTLFLAG_RW, 1069 &rack_bw_rate_cap, 0, 1070 "If set we apply this value to the absolute rate cap used by pacing"); 1071 SYSCTL_ADD_U8(&rack_sysctl_ctx, 1072 SYSCTL_CHILDREN(rack_sysctl_root), 1073 OID_AUTO, "req_measure_cnt", CTLFLAG_RW, 1074 &rack_req_measurements, 1, 1075 "If doing dynamic pacing, how many measurements must be in before we start pacing?"); 1076 /* Hardware pacing */ 1077 rack_hw_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1078 SYSCTL_CHILDREN(rack_sysctl_root), 1079 OID_AUTO, 1080 "hdwr_pacing", 1081 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1082 "Pacing related Controls"); 1083 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1084 SYSCTL_CHILDREN(rack_hw_pacing), 1085 OID_AUTO, "rwnd_factor", CTLFLAG_RW, 1086 &rack_hw_rwnd_factor, 2, 1087 "How many times does snd_wnd need to be bigger than pace_max_seg so we will hold off and get more acks?"); 1088 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1089 SYSCTL_CHILDREN(rack_hw_pacing), 1090 OID_AUTO, "precheck", CTLFLAG_RW, 1091 &rack_hw_check_queue, 0, 1092 "Do we always precheck the hdwr pacing queue to avoid ENOBUF's?"); 1093 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1094 SYSCTL_CHILDREN(rack_hw_pacing), 1095 OID_AUTO, "pace_enobuf_mult", CTLFLAG_RW, 1096 &rack_enobuf_hw_boost_mult, 0, 1097 "By how many time_betweens should we boost the pacing time if we see a ENOBUFS?"); 1098 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1099 SYSCTL_CHILDREN(rack_hw_pacing), 1100 OID_AUTO, "pace_enobuf_max", CTLFLAG_RW, 1101 &rack_enobuf_hw_max, 2, 1102 "What is the max boost the pacing time if we see a ENOBUFS?"); 1103 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1104 SYSCTL_CHILDREN(rack_hw_pacing), 1105 OID_AUTO, "pace_enobuf_min", CTLFLAG_RW, 1106 &rack_enobuf_hw_min, 2, 1107 "What is the min boost the pacing time if we see a ENOBUFS?"); 1108 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1109 SYSCTL_CHILDREN(rack_hw_pacing), 1110 OID_AUTO, "enable", CTLFLAG_RW, 1111 &rack_enable_hw_pacing, 0, 1112 "Should RACK attempt to use hw pacing?"); 1113 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1114 SYSCTL_CHILDREN(rack_hw_pacing), 1115 OID_AUTO, "rate_cap", CTLFLAG_RW, 1116 &rack_hw_rate_caps, 0, 1117 "Does the highest hardware pacing rate cap the rate we will send at??"); 1118 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1119 SYSCTL_CHILDREN(rack_hw_pacing), 1120 OID_AUTO, "uncap_per", CTLFLAG_RW, 1121 &rack_hw_rate_cap_per, 0, 1122 "If you go over b/w by this amount you will be uncapped (0 = never)"); 1123 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1124 SYSCTL_CHILDREN(rack_hw_pacing), 1125 OID_AUTO, "rate_min", CTLFLAG_RW, 1126 &rack_hw_rate_min, 0, 1127 "Do we need a minimum estimate of this many bytes per second in order to engage hw pacing?"); 1128 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1129 SYSCTL_CHILDREN(rack_hw_pacing), 1130 OID_AUTO, "rate_to_low", CTLFLAG_RW, 1131 &rack_hw_rate_to_low, 0, 1132 "If we fall below this rate, dis-engage hw pacing?"); 1133 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1134 SYSCTL_CHILDREN(rack_hw_pacing), 1135 OID_AUTO, "up_only", CTLFLAG_RW, 1136 &rack_hw_up_only, 0, 1137 "Do we allow hw pacing to lower the rate selected?"); 1138 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1139 SYSCTL_CHILDREN(rack_hw_pacing), 1140 OID_AUTO, "extra_mss_precise", CTLFLAG_RW, 1141 &rack_hw_pace_extra_slots, 0, 1142 "If the rates between software and hardware match precisely how many extra time_betweens do we get?"); 1143 
        rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_sysctl_root),
            OID_AUTO,
            "timely",
            CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
            "Rack Timely RTT Controls");
        /* Timely based GP dynamics */
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_timely),
            OID_AUTO, "upper", CTLFLAG_RW,
            &rack_gp_per_bw_mul_up, 2,
            "Rack timely upper range for equal b/w (in percentage)");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_timely),
            OID_AUTO, "lower", CTLFLAG_RW,
            &rack_gp_per_bw_mul_down, 4,
            "Rack timely lower range for equal b/w (in percentage)");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_timely),
            OID_AUTO, "rtt_max_mul", CTLFLAG_RW,
            &rack_gp_rtt_maxmul, 3,
            "Rack timely multiplier of lowest rtt for rtt_max");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_timely),
            OID_AUTO, "rtt_min_div", CTLFLAG_RW,
            &rack_gp_rtt_mindiv, 4,
            "Rack timely divisor used for rtt + (rtt * mul/divisor) for check for lower rtt");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_timely),
            OID_AUTO, "rtt_min_mul", CTLFLAG_RW,
            &rack_gp_rtt_minmul, 1,
            "Rack timely multiplier used for rtt + (rtt * mul/divisor) for check for lower rtt");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_timely),
            OID_AUTO, "decrease", CTLFLAG_RW,
            &rack_gp_decrease_per, 20,
            "Rack timely decrease percentage of our GP multiplication factor");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_timely),
            OID_AUTO, "increase", CTLFLAG_RW,
            &rack_gp_increase_per, 2,
            "Rack timely increase percentage of our GP multiplication factor");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_timely),
            OID_AUTO, "lowerbound", CTLFLAG_RW,
            &rack_per_lower_bound, 50,
            "Rack timely lowest percentage we allow GP multiplier to fall to");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_timely),
            OID_AUTO, "upperboundss", CTLFLAG_RW,
            &rack_per_upper_bound_ss, 0,
            "Rack timely highest percentage we allow GP multiplier in SS to raise to (0 is no upperbound)");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_timely),
            OID_AUTO, "upperboundca", CTLFLAG_RW,
            &rack_per_upper_bound_ca, 0,
            "Rack timely highest percentage we allow GP multiplier in CA to raise to (0 is no upperbound)");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_timely),
            OID_AUTO, "dynamicgp", CTLFLAG_RW,
            &rack_do_dyn_mul, 0,
            "Rack timely do we enable dynamic timely goodput by default");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_timely),
            OID_AUTO, "no_rec_red", CTLFLAG_RW,
            &rack_gp_no_rec_chg, 1,
            "Rack timely do we prohibit the recovery multiplier from being lowered");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_timely),
            OID_AUTO, "red_clear_cnt", CTLFLAG_RW,
            &rack_timely_dec_clear, 6,
            "Rack timely what threshold do we count to before another boost during b/w descent");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_timely),
            OID_AUTO, "max_push_rise", CTLFLAG_RW,
            &rack_timely_max_push_rise, 3,
            "Rack timely how many times do we push up with b/w increase");
        SYSCTL_ADD_S32(&rack_sysctl_ctx,
            SYSCTL_CHILDREN(rack_timely),
            OID_AUTO, "max_push_drop", CTLFLAG_RW,
            &rack_timely_max_push_drop, 3,
            "Rack timely how many times do we push back on b/w
decent"); 1225 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1226 SYSCTL_CHILDREN(rack_timely), 1227 OID_AUTO, "min_segs", CTLFLAG_RW, 1228 &rack_timely_min_segs, 4, 1229 "Rack timely when setting the cwnd what is the min num segments"); 1230 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1231 SYSCTL_CHILDREN(rack_timely), 1232 OID_AUTO, "noback_max", CTLFLAG_RW, 1233 &rack_use_max_for_nobackoff, 0, 1234 "Rack timely when deciding if to backoff on a loss, do we use under max rtt else min"); 1235 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1236 SYSCTL_CHILDREN(rack_timely), 1237 OID_AUTO, "interim_timely_only", CTLFLAG_RW, 1238 &rack_timely_int_timely_only, 0, 1239 "Rack timely when doing interim timely's do we only do timely (no b/w consideration)"); 1240 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1241 SYSCTL_CHILDREN(rack_timely), 1242 OID_AUTO, "nonstop", CTLFLAG_RW, 1243 &rack_timely_no_stopping, 0, 1244 "Rack timely don't stop increase"); 1245 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1246 SYSCTL_CHILDREN(rack_timely), 1247 OID_AUTO, "dec_raise_thresh", CTLFLAG_RW, 1248 &rack_down_raise_thresh, 100, 1249 "If the CA or SS is below this threshold raise on the first 3 b/w lowers (0=always)"); 1250 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1251 SYSCTL_CHILDREN(rack_timely), 1252 OID_AUTO, "bottom_drag_segs", CTLFLAG_RW, 1253 &rack_req_segs, 1, 1254 "Bottom dragging if not these many segments outstanding and room"); 1255 1256 /* TLP and Rack related parameters */ 1257 rack_tlp = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1258 SYSCTL_CHILDREN(rack_sysctl_root), 1259 OID_AUTO, 1260 "tlp", 1261 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1262 "TLP and Rack related Controls"); 1263 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1264 SYSCTL_CHILDREN(rack_tlp), 1265 OID_AUTO, "use_rrr", CTLFLAG_RW, 1266 &use_rack_rr, 1, 1267 "Do we use Rack Rapid Recovery"); 1268 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1269 SYSCTL_CHILDREN(rack_tlp), 1270 OID_AUTO, "post_rec_labc", CTLFLAG_RW, 1271 &rack_max_abc_post_recovery, 2, 1272 "Since we do early recovery, do we override the l_abc to a value, if so what?"); 1273 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1274 SYSCTL_CHILDREN(rack_tlp), 1275 OID_AUTO, "nonrxt_use_cr", CTLFLAG_RW, 1276 &rack_non_rxt_use_cr, 0, 1277 "Do we use ss/ca rate if in recovery we are transmitting a new data chunk"); 1278 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1279 SYSCTL_CHILDREN(rack_tlp), 1280 OID_AUTO, "tlpmethod", CTLFLAG_RW, 1281 &rack_tlp_threshold_use, TLP_USE_TWO_ONE, 1282 "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2"); 1283 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1284 SYSCTL_CHILDREN(rack_tlp), 1285 OID_AUTO, "limit", CTLFLAG_RW, 1286 &rack_tlp_limit, 2, 1287 "How many TLP's can be sent without sending new data"); 1288 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1289 SYSCTL_CHILDREN(rack_tlp), 1290 OID_AUTO, "use_greater", CTLFLAG_RW, 1291 &rack_tlp_use_greater, 1, 1292 "Should we use the rack_rtt time if its greater than srtt"); 1293 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1294 SYSCTL_CHILDREN(rack_tlp), 1295 OID_AUTO, "tlpminto", CTLFLAG_RW, 1296 &rack_tlp_min, 10000, 1297 "TLP minimum timeout per the specification (in microseconds)"); 1298 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1299 SYSCTL_CHILDREN(rack_tlp), 1300 OID_AUTO, "send_oldest", CTLFLAG_RW, 1301 &rack_always_send_oldest, 0, 1302 "Should we always send the oldest TLP and RACK-TLP"); 1303 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1304 SYSCTL_CHILDREN(rack_tlp), 1305 OID_AUTO, "rack_tlimit", CTLFLAG_RW, 1306 &rack_limited_retran, 0, 1307 "How many times can a rack timeout drive out sends"); 1308 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1309 
SYSCTL_CHILDREN(rack_tlp), 1310 OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW, 1311 &rack_lower_cwnd_at_tlp, 0, 1312 "When a TLP completes a retran should we enter recovery"); 1313 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1314 SYSCTL_CHILDREN(rack_tlp), 1315 OID_AUTO, "reorder_thresh", CTLFLAG_RW, 1316 &rack_reorder_thresh, 2, 1317 "What factor for rack will be added when seeing reordering (shift right)"); 1318 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1319 SYSCTL_CHILDREN(rack_tlp), 1320 OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW, 1321 &rack_tlp_thresh, 1, 1322 "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)"); 1323 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1324 SYSCTL_CHILDREN(rack_tlp), 1325 OID_AUTO, "reorder_fade", CTLFLAG_RW, 1326 &rack_reorder_fade, 60000000, 1327 "Does reorder detection fade, if so how many microseconds (0 means never)"); 1328 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1329 SYSCTL_CHILDREN(rack_tlp), 1330 OID_AUTO, "pktdelay", CTLFLAG_RW, 1331 &rack_pkt_delay, 1000, 1332 "Extra RACK time (in microseconds) besides reordering thresh"); 1333 1334 /* Timer related controls */ 1335 rack_timers = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1336 SYSCTL_CHILDREN(rack_sysctl_root), 1337 OID_AUTO, 1338 "timers", 1339 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1340 "Timer related controls"); 1341 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1342 SYSCTL_CHILDREN(rack_timers), 1343 OID_AUTO, "persmin", CTLFLAG_RW, 1344 &rack_persist_min, 250000, 1345 "What is the minimum time in microseconds between persists"); 1346 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1347 SYSCTL_CHILDREN(rack_timers), 1348 OID_AUTO, "persmax", CTLFLAG_RW, 1349 &rack_persist_max, 2000000, 1350 "What is the largest delay in microseconds between persists"); 1351 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1352 SYSCTL_CHILDREN(rack_timers), 1353 OID_AUTO, "delayed_ack", CTLFLAG_RW, 1354 &rack_delayed_ack_time, 40000, 1355 "Delayed ack time (40ms in microseconds)"); 1356 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1357 SYSCTL_CHILDREN(rack_timers), 1358 OID_AUTO, "minrto", CTLFLAG_RW, 1359 &rack_rto_min, 30000, 1360 "Minimum RTO in microseconds -- set with caution below 1000 due to TLP"); 1361 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1362 SYSCTL_CHILDREN(rack_timers), 1363 OID_AUTO, "maxrto", CTLFLAG_RW, 1364 &rack_rto_max, 4000000, 1365 "Maximum RTO in microseconds -- should be at least as large as min_rto"); 1366 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1367 SYSCTL_CHILDREN(rack_timers), 1368 OID_AUTO, "minto", CTLFLAG_RW, 1369 &rack_min_to, 1000, 1370 "Minimum rack timeout in microseconds"); 1371 /* Measure controls */ 1372 rack_measure = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1373 SYSCTL_CHILDREN(rack_sysctl_root), 1374 OID_AUTO, 1375 "measure", 1376 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1377 "Measure related controls"); 1378 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1379 SYSCTL_CHILDREN(rack_measure), 1380 OID_AUTO, "wma_divisor", CTLFLAG_RW, 1381 &rack_wma_divisor, 8, 1382 "When doing b/w calculation what is the divisor for the WMA"); 1383 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1384 SYSCTL_CHILDREN(rack_measure), 1385 OID_AUTO, "end_cwnd", CTLFLAG_RW, 1386 &rack_cwnd_block_ends_measure, 0, 1387 "Does a cwnd just-return end the measurement window (app limited)"); 1388 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1389 SYSCTL_CHILDREN(rack_measure), 1390 OID_AUTO, "end_rwnd", CTLFLAG_RW, 1391 &rack_rwnd_block_ends_measure, 0, 1392 "Does an rwnd just-return end the measurement window (app limited -- not persists)"); 1393 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1394 SYSCTL_CHILDREN(rack_measure), 1395 OID_AUTO, "min_target", CTLFLAG_RW, 1396 
&rack_def_data_window, 20, 1397 "What is the minimum target window (in mss) for a GP measurement"); 1398 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1399 SYSCTL_CHILDREN(rack_measure), 1400 OID_AUTO, "goal_bdp", CTLFLAG_RW, 1401 &rack_goal_bdp, 2, 1402 "What is the goal BDP to measure"); 1403 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1404 SYSCTL_CHILDREN(rack_measure), 1405 OID_AUTO, "min_srtts", CTLFLAG_RW, 1406 &rack_min_srtts, 1, 1407 "What is the minimum number of SRTTs needed for a GP measurement"); 1408 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1409 SYSCTL_CHILDREN(rack_measure), 1410 OID_AUTO, "min_measure_tim", CTLFLAG_RW, 1411 &rack_min_measure_usec, 0, 1412 "What is the minimum time for a measurement (0 = off)"); 1413 /* Features */ 1414 rack_features = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1415 SYSCTL_CHILDREN(rack_sysctl_root), 1416 OID_AUTO, 1417 "features", 1418 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1419 "Feature controls"); 1420 SYSCTL_ADD_U64(&rack_sysctl_ctx, 1421 SYSCTL_CHILDREN(rack_features), 1422 OID_AUTO, "rxt_clamp_thresh", CTLFLAG_RW, 1423 &rack_rxt_clamp_thresh, 0, 1424 "Bit encoded clamping setup bits CCCC CCCCC UUUU UULF PPPP PPPP PPPP PPPP"); 1425 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1426 SYSCTL_CHILDREN(rack_features), 1427 OID_AUTO, "hybrid_set_maxseg", CTLFLAG_RW, 1428 &rack_hybrid_allow_set_maxseg, 0, 1429 "Should hybrid pacing allow the setmss command"); 1430 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1431 SYSCTL_CHILDREN(rack_features), 1432 OID_AUTO, "cmpack", CTLFLAG_RW, 1433 &rack_use_cmp_acks, 1, 1434 "Should RACK have LRO send compressed acks"); 1435 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1436 SYSCTL_CHILDREN(rack_features), 1437 OID_AUTO, "fsb", CTLFLAG_RW, 1438 &rack_use_fsb, 1, 1439 "Should RACK use the fast send block?"); 1440 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1441 SYSCTL_CHILDREN(rack_features), 1442 OID_AUTO, "rfo", CTLFLAG_RW, 1443 &rack_use_rfo, 1, 1444 "Should RACK use rack_fast_output()?"); 1445 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1446 SYSCTL_CHILDREN(rack_features), 1447 OID_AUTO, "rsmrfo", CTLFLAG_RW, 1448 &rack_use_rsm_rfo, 1, 1449 "Should RACK use rack_fast_rsm_output()?"); 1450 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1451 SYSCTL_CHILDREN(rack_features), 1452 OID_AUTO, "non_paced_lro_queue", CTLFLAG_RW, 1453 &rack_enable_mqueue_for_nonpaced, 0, 1454 "Should RACK use mbuf queuing for non-paced connections"); 1455 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1456 SYSCTL_CHILDREN(rack_features), 1457 OID_AUTO, "hystartplusplus", CTLFLAG_RW, 1458 &rack_do_hystart, 0, 1459 "Should RACK enable HyStart++ on connections?"); 1460 /* Misc rack controls */ 1461 rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1462 SYSCTL_CHILDREN(rack_sysctl_root), 1463 OID_AUTO, 1464 "misc", 1465 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1466 "Misc related controls"); 1467 #ifdef TCP_ACCOUNTING 1468 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1469 SYSCTL_CHILDREN(rack_misc), 1470 OID_AUTO, "tcp_acct", CTLFLAG_RW, 1471 &rack_tcp_accounting, 0, 1472 "Should we turn on TCP accounting for all rack sessions?"); 1473 #endif 1474 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1475 SYSCTL_CHILDREN(rack_misc), 1476 OID_AUTO, "dnd", CTLFLAG_RW, 1477 &rack_dnd_default, 0, 1478 "Do not disturb default for rack_rrr = 3"); 1479 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1480 SYSCTL_CHILDREN(rack_misc), 1481 OID_AUTO, "sad_seg_per", CTLFLAG_RW, 1482 &sad_seg_size_per, 800, 1483 "Percentage of segment size needed in a sack 800 = 80.0?"); 1484 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1485 SYSCTL_CHILDREN(rack_misc), 1486 OID_AUTO, "rxt_controls", CTLFLAG_RW, 1487 &rack_rxt_controls, 0, 1488 "Retransmit sending size controls (valid
values 0, 1, 2 default=1)?"); 1489 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1490 SYSCTL_CHILDREN(rack_misc), 1491 OID_AUTO, "rack_hibeta", CTLFLAG_RW, 1492 &rack_hibeta_setting, 0, 1493 "Do we use a high beta (80 instead of 50)?"); 1494 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1495 SYSCTL_CHILDREN(rack_misc), 1496 OID_AUTO, "apply_rtt_with_low_conf", CTLFLAG_RW, 1497 &rack_apply_rtt_with_reduced_conf, 0, 1498 "When a persist or keep-alive probe is not answered do we calculate rtt on subsequent answers?"); 1499 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1500 SYSCTL_CHILDREN(rack_misc), 1501 OID_AUTO, "rack_dsack_ctl", CTLFLAG_RW, 1502 &rack_dsack_std_based, 3, 1503 "How do we process dsack with respect to rack timers, bit field, 3 is standards based?"); 1504 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1505 SYSCTL_CHILDREN(rack_misc), 1506 OID_AUTO, "prr_addback_max", CTLFLAG_RW, 1507 &rack_prr_addbackmax, 2, 1508 "What is the maximum number of MSS we allow to be added back if prr can't send all its data?"); 1509 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1510 SYSCTL_CHILDREN(rack_misc), 1511 OID_AUTO, "stats_gets_ms", CTLFLAG_RW, 1512 &rack_stats_gets_ms_rtt, 1, 1513 "What do we feed the stats framework (1 = ms_rtt, 0 = us_rtt, 2 = ms_rtt from hdwr, > 2 usec rtt from hdwr)?"); 1514 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1515 SYSCTL_CHILDREN(rack_misc), 1516 OID_AUTO, "clientlowbuf", CTLFLAG_RW, 1517 &rack_client_low_buf, 0, 1518 "Client low buffer level (below this we are more aggressive in DGP exiting recovery; 0 = off)?"); 1519 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1520 SYSCTL_CHILDREN(rack_misc), 1521 OID_AUTO, "defprofile", CTLFLAG_RW, 1522 &rack_def_profile, 0, 1523 "Should RACK use a default profile (0=no, num == profile num)?"); 1524 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1525 SYSCTL_CHILDREN(rack_misc), 1526 OID_AUTO, "shared_cwnd", CTLFLAG_RW, 1527 &rack_enable_shared_cwnd, 1, 1528 "Should RACK try to use the shared cwnd on connections where allowed"); 1529 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1530 SYSCTL_CHILDREN(rack_misc), 1531 OID_AUTO, "limits_on_scwnd", CTLFLAG_RW, 1532 &rack_limits_scwnd, 1, 1533 "Should RACK place low end time limits on the shared cwnd feature"); 1534 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1535 SYSCTL_CHILDREN(rack_misc), 1536 OID_AUTO, "no_prr", CTLFLAG_RW, 1537 &rack_disable_prr, 0, 1538 "Should RACK not use prr and only pace (must have pacing on)"); 1539 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1540 SYSCTL_CHILDREN(rack_misc), 1541 OID_AUTO, "bb_verbose", CTLFLAG_RW, 1542 &rack_verbose_logging, 0, 1543 "Should RACK black box logging be verbose"); 1544 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1545 SYSCTL_CHILDREN(rack_misc), 1546 OID_AUTO, "data_after_close", CTLFLAG_RW, 1547 &rack_ignore_data_after_close, 1, 1548 "Do we hold off sending a RST until all pending data is ack'd"); 1549 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1550 SYSCTL_CHILDREN(rack_misc), 1551 OID_AUTO, "no_sack_needed", CTLFLAG_RW, 1552 &rack_sack_not_required, 1, 1553 "Do we allow rack to run on connections not supporting SACK"); 1554 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1555 SYSCTL_CHILDREN(rack_misc), 1556 OID_AUTO, "prr_sendalot", CTLFLAG_RW, 1557 &rack_send_a_lot_in_prr, 1, 1558 "Send a lot in prr"); 1559 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1560 SYSCTL_CHILDREN(rack_misc), 1561 OID_AUTO, "autoscale", CTLFLAG_RW, 1562 &rack_autosndbuf_inc, 20, 1563 "What percentage should rack scale up its snd buffer by?"); 1564 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1565 SYSCTL_CHILDREN(rack_misc), 1566 OID_AUTO, "rnds_for_rxt_clamp", CTLFLAG_RW, 1567 &rack_rxt_min_rnds, 10, 1568 "Number of rounds needed
between RTT clamps due to high loss rates"); 1569 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1570 SYSCTL_CHILDREN(rack_misc), 1571 OID_AUTO, "rnds_for_unclamp", CTLFLAG_RW, 1572 &rack_unclamp_round_thresh, 100, 1573 "Number of rounds needed with no loss to unclamp"); 1574 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1575 SYSCTL_CHILDREN(rack_misc), 1576 OID_AUTO, "rxt_threshs_for_unclamp", CTLFLAG_RW, 1577 &rack_unclamp_rxt_thresh, 5, 1578 "Percentage of retransmits we need to be under to unclamp (5 = .5 percent)"); 1579 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1580 SYSCTL_CHILDREN(rack_misc), 1581 OID_AUTO, "clamp_ss_upper", CTLFLAG_RW, 1582 &rack_clamp_ss_upper, 110, 1583 "Clamp percentage ceiling in SS?"); 1584 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1585 SYSCTL_CHILDREN(rack_misc), 1586 OID_AUTO, "clamp_ca_upper", CTLFLAG_RW, 1587 &rack_clamp_ca_upper, 110, 1588 "Clamp percentage ceiling in CA?"); 1589 /* Sack Attacker detection stuff */ 1590 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1591 SYSCTL_CHILDREN(rack_attack), 1592 OID_AUTO, "merge_out", CTLFLAG_RW, 1593 &rack_merge_out_sacks_on_attack, 0, 1594 "Do we merge the sendmap when we decide we are being attacked?"); 1595 1596 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1597 SYSCTL_CHILDREN(rack_attack), 1598 OID_AUTO, "detect_highsackratio", CTLFLAG_RW, 1599 &rack_highest_sack_thresh_seen, 0, 1600 "Highest sack to ack ratio seen"); 1601 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1602 SYSCTL_CHILDREN(rack_attack), 1603 OID_AUTO, "detect_highmoveratio", CTLFLAG_RW, 1604 &rack_highest_move_thresh_seen, 0, 1605 "Highest move to non-move ratio seen"); 1606 rack_ack_total = counter_u64_alloc(M_WAITOK); 1607 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1608 SYSCTL_CHILDREN(rack_attack), 1609 OID_AUTO, "acktotal", CTLFLAG_RD, 1610 &rack_ack_total, 1611 "Total number of ACKs"); 1612 rack_express_sack = counter_u64_alloc(M_WAITOK); 1613 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1614 SYSCTL_CHILDREN(rack_attack), 1615 OID_AUTO, "exp_sacktotal", CTLFLAG_RD, 1616 &rack_express_sack, 1617 "Total number of express SACKs"); 1618 rack_sack_total = counter_u64_alloc(M_WAITOK); 1619 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1620 SYSCTL_CHILDREN(rack_attack), 1621 OID_AUTO, "sacktotal", CTLFLAG_RD, 1622 &rack_sack_total, 1623 "Total number of SACKs"); 1624 rack_move_none = counter_u64_alloc(M_WAITOK); 1625 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1626 SYSCTL_CHILDREN(rack_attack), 1627 OID_AUTO, "move_none", CTLFLAG_RD, 1628 &rack_move_none, 1629 "Total number of SACK index reuse of positions under threshold"); 1630 rack_move_some = counter_u64_alloc(M_WAITOK); 1631 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1632 SYSCTL_CHILDREN(rack_attack), 1633 OID_AUTO, "move_some", CTLFLAG_RD, 1634 &rack_move_some, 1635 "Total number of SACK index reuse of positions over threshold"); 1636 rack_sack_attacks_detected = counter_u64_alloc(M_WAITOK); 1637 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1638 SYSCTL_CHILDREN(rack_attack), 1639 OID_AUTO, "attacks", CTLFLAG_RD, 1640 &rack_sack_attacks_detected, 1641 "Total number of SACK attackers that had sack disabled"); 1642 rack_sack_attacks_reversed = counter_u64_alloc(M_WAITOK); 1643 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1644 SYSCTL_CHILDREN(rack_attack), 1645 OID_AUTO, "reversed", CTLFLAG_RD, 1646 &rack_sack_attacks_reversed, 1647 "Total number of SACK attackers that were later determined false positive"); 1648 rack_sack_attacks_suspect = counter_u64_alloc(M_WAITOK); 1649 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1650 SYSCTL_CHILDREN(rack_attack), 1651 OID_AUTO, "suspect", CTLFLAG_RD, 1652
&rack_sack_attacks_suspect, 1653 "Total number of SACKs that triggered early detection"); 1654 1655 rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK); 1656 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1657 SYSCTL_CHILDREN(rack_attack), 1658 OID_AUTO, "nextmerge", CTLFLAG_RD, 1659 &rack_sack_used_next_merge, 1660 "Total number of times we used the next merge"); 1661 rack_sack_used_prev_merge = counter_u64_alloc(M_WAITOK); 1662 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1663 SYSCTL_CHILDREN(rack_attack), 1664 OID_AUTO, "prevmerge", CTLFLAG_RD, 1665 &rack_sack_used_prev_merge, 1666 "Total number of times we used the prev merge"); 1667 /* Counters */ 1668 rack_total_bytes = counter_u64_alloc(M_WAITOK); 1669 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1670 SYSCTL_CHILDREN(rack_counters), 1671 OID_AUTO, "totalbytes", CTLFLAG_RD, 1672 &rack_total_bytes, 1673 "Total number of bytes sent"); 1674 rack_fto_send = counter_u64_alloc(M_WAITOK); 1675 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1676 SYSCTL_CHILDREN(rack_counters), 1677 OID_AUTO, "fto_send", CTLFLAG_RD, 1678 &rack_fto_send, "Total number of rack_fast_output sends"); 1679 rack_fto_rsm_send = counter_u64_alloc(M_WAITOK); 1680 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1681 SYSCTL_CHILDREN(rack_counters), 1682 OID_AUTO, "fto_rsm_send", CTLFLAG_RD, 1683 &rack_fto_rsm_send, "Total number of rack_fast_rsm_output sends"); 1684 rack_nfto_resend = counter_u64_alloc(M_WAITOK); 1685 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1686 SYSCTL_CHILDREN(rack_counters), 1687 OID_AUTO, "nfto_resend", CTLFLAG_RD, 1688 &rack_nfto_resend, "Total number of rack_output retransmissions"); 1689 rack_non_fto_send = counter_u64_alloc(M_WAITOK); 1690 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1691 SYSCTL_CHILDREN(rack_counters), 1692 OID_AUTO, "nfto_send", CTLFLAG_RD, 1693 &rack_non_fto_send, "Total number of rack_output first sends"); 1694 rack_extended_rfo = counter_u64_alloc(M_WAITOK); 1695 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1696 SYSCTL_CHILDREN(rack_counters), 1697 OID_AUTO, "rfo_extended", CTLFLAG_RD, 1698 &rack_extended_rfo, "Total number of times we extended rfo"); 1699 1700 rack_hw_pace_init_fail = counter_u64_alloc(M_WAITOK); 1701 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1702 SYSCTL_CHILDREN(rack_counters), 1703 OID_AUTO, "hwpace_init_fail", CTLFLAG_RD, 1704 &rack_hw_pace_init_fail, "Total number of times we failed to initialize hw pacing"); 1705 rack_hw_pace_lost = counter_u64_alloc(M_WAITOK); 1706 1707 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1708 SYSCTL_CHILDREN(rack_counters), 1709 OID_AUTO, "hwpace_lost", CTLFLAG_RD, 1710 &rack_hw_pace_lost, "Total number of times we failed to initialize hw pacing"); 1711 rack_tlp_tot = counter_u64_alloc(M_WAITOK); 1712 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1713 SYSCTL_CHILDREN(rack_counters), 1714 OID_AUTO, "tlp_to_total", CTLFLAG_RD, 1715 &rack_tlp_tot, 1716 "Total number of tail loss probe expirations"); 1717 rack_tlp_newdata = counter_u64_alloc(M_WAITOK); 1718 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1719 SYSCTL_CHILDREN(rack_counters), 1720 OID_AUTO, "tlp_new", CTLFLAG_RD, 1721 &rack_tlp_newdata, 1722 "Total number of tail loss probe sending new data"); 1723 rack_tlp_retran = counter_u64_alloc(M_WAITOK); 1724 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1725 SYSCTL_CHILDREN(rack_counters), 1726 OID_AUTO, "tlp_retran", CTLFLAG_RD, 1727 &rack_tlp_retran, 1728 "Total number of tail loss probe sending retransmitted data"); 1729 rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK); 1730 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 
1731 SYSCTL_CHILDREN(rack_counters), 1732 OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD, 1733 &rack_tlp_retran_bytes, 1734 "Total bytes of tail loss probe sending retransmitted data"); 1735 rack_to_tot = counter_u64_alloc(M_WAITOK); 1736 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1737 SYSCTL_CHILDREN(rack_counters), 1738 OID_AUTO, "rack_to_tot", CTLFLAG_RD, 1739 &rack_to_tot, 1740 "Total number of times the rack timeout expired"); 1741 rack_saw_enobuf = counter_u64_alloc(M_WAITOK); 1742 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1743 SYSCTL_CHILDREN(rack_counters), 1744 OID_AUTO, "saw_enobufs", CTLFLAG_RD, 1745 &rack_saw_enobuf, 1746 "Total number of times a send returned enobuf for non-hdwr paced connections"); 1747 rack_saw_enobuf_hw = counter_u64_alloc(M_WAITOK); 1748 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1749 SYSCTL_CHILDREN(rack_counters), 1750 OID_AUTO, "saw_enobufs_hw", CTLFLAG_RD, 1751 &rack_saw_enobuf_hw, 1752 "Total number of times a send returned enobuf for hdwr paced connections"); 1753 rack_saw_enetunreach = counter_u64_alloc(M_WAITOK); 1754 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1755 SYSCTL_CHILDREN(rack_counters), 1756 OID_AUTO, "saw_enetunreach", CTLFLAG_RD, 1757 &rack_saw_enetunreach, 1758 "Total number of times a send received an enetunreachable"); 1759 rack_hot_alloc = counter_u64_alloc(M_WAITOK); 1760 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1761 SYSCTL_CHILDREN(rack_counters), 1762 OID_AUTO, "alloc_hot", CTLFLAG_RD, 1763 &rack_hot_alloc, 1764 "Total allocations from the top of our list"); 1765 rack_to_alloc = counter_u64_alloc(M_WAITOK); 1766 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1767 SYSCTL_CHILDREN(rack_counters), 1768 OID_AUTO, "allocs", CTLFLAG_RD, 1769 &rack_to_alloc, 1770 "Total allocations of tracking structures"); 1771 rack_to_alloc_hard = counter_u64_alloc(M_WAITOK); 1772 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1773 SYSCTL_CHILDREN(rack_counters), 1774 OID_AUTO, "allochard", CTLFLAG_RD, 1775 &rack_to_alloc_hard, 1776 "Total allocations done with sleeping the hard way"); 1777 rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK); 1778 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1779 SYSCTL_CHILDREN(rack_counters), 1780 OID_AUTO, "allocemerg", CTLFLAG_RD, 1781 &rack_to_alloc_emerg, 1782 "Total allocations done from emergency cache"); 1783 rack_to_alloc_limited = counter_u64_alloc(M_WAITOK); 1784 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1785 SYSCTL_CHILDREN(rack_counters), 1786 OID_AUTO, "alloc_limited", CTLFLAG_RD, 1787 &rack_to_alloc_limited, 1788 "Total allocations dropped due to limit"); 1789 rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK); 1790 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1791 SYSCTL_CHILDREN(rack_counters), 1792 OID_AUTO, "alloc_limited_conns", CTLFLAG_RD, 1793 &rack_alloc_limited_conns, 1794 "Connections with allocations dropped due to limit"); 1795 rack_split_limited = counter_u64_alloc(M_WAITOK); 1796 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1797 SYSCTL_CHILDREN(rack_counters), 1798 OID_AUTO, "split_limited", CTLFLAG_RD, 1799 &rack_split_limited, 1800 "Split allocations dropped due to limit"); 1801 rack_rxt_clamps_cwnd = counter_u64_alloc(M_WAITOK); 1802 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1803 SYSCTL_CHILDREN(rack_counters), 1804 OID_AUTO, "rxt_clamps_cwnd", CTLFLAG_RD, 1805 &rack_rxt_clamps_cwnd, 1806 "Number of times that excessive rxt clamped the cwnd down"); 1807 rack_rxt_clamps_cwnd_uniq = counter_u64_alloc(M_WAITOK); 1808 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1809 SYSCTL_CHILDREN(rack_counters), 1810 OID_AUTO,
"rxt_clamps_cwnd_uniq", CTLFLAG_RD, 1811 &rack_rxt_clamps_cwnd_uniq, 1812 "Number of connections that have had the cwnd clamped down due to excessive rxt"); 1813 rack_persists_sends = counter_u64_alloc(M_WAITOK); 1814 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1815 SYSCTL_CHILDREN(rack_counters), 1816 OID_AUTO, "persist_sends", CTLFLAG_RD, 1817 &rack_persists_sends, 1818 "Number of times we sent a persist probe"); 1819 rack_persists_acks = counter_u64_alloc(M_WAITOK); 1820 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1821 SYSCTL_CHILDREN(rack_counters), 1822 OID_AUTO, "persist_acks", CTLFLAG_RD, 1823 &rack_persists_acks, 1824 "Number of times a persist probe was acked"); 1825 rack_persists_loss = counter_u64_alloc(M_WAITOK); 1826 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1827 SYSCTL_CHILDREN(rack_counters), 1828 OID_AUTO, "persist_loss", CTLFLAG_RD, 1829 &rack_persists_loss, 1830 "Number of times we detected a lost persist probe (no ack)"); 1831 rack_persists_lost_ends = counter_u64_alloc(M_WAITOK); 1832 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1833 SYSCTL_CHILDREN(rack_counters), 1834 OID_AUTO, "persist_loss_ends", CTLFLAG_RD, 1835 &rack_persists_lost_ends, 1836 "Number of lost persist probes (no ack) where the connection ended with a PERSIST abort"); 1837 #ifdef INVARIANTS 1838 rack_adjust_map_bw = counter_u64_alloc(M_WAITOK); 1839 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1840 SYSCTL_CHILDREN(rack_counters), 1841 OID_AUTO, "map_adjust_req", CTLFLAG_RD, 1842 &rack_adjust_map_bw, 1843 "Number of times we hit the case where the sb went up and down on a sendmap entry"); 1844 #endif 1845 rack_multi_single_eq = counter_u64_alloc(M_WAITOK); 1846 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1847 SYSCTL_CHILDREN(rack_counters), 1848 OID_AUTO, "cmp_ack_equiv", CTLFLAG_RD, 1849 &rack_multi_single_eq, 1850 "Total number of acks represented by compressed acks"); 1851 rack_proc_non_comp_ack = counter_u64_alloc(M_WAITOK); 1852 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1853 SYSCTL_CHILDREN(rack_counters), 1854 OID_AUTO, "cmp_ack_not", CTLFLAG_RD, 1855 &rack_proc_non_comp_ack, 1856 "Number of non-compressed acks that we processed"); 1857 1858 1859 rack_sack_proc_all = counter_u64_alloc(M_WAITOK); 1860 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1861 SYSCTL_CHILDREN(rack_counters), 1862 OID_AUTO, "sack_long", CTLFLAG_RD, 1863 &rack_sack_proc_all, 1864 "Total times we had to walk whole list for sack processing"); 1865 rack_sack_proc_restart = counter_u64_alloc(M_WAITOK); 1866 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1867 SYSCTL_CHILDREN(rack_counters), 1868 OID_AUTO, "sack_restart", CTLFLAG_RD, 1869 &rack_sack_proc_restart, 1870 "Total times we had to walk whole list due to a restart"); 1871 rack_sack_proc_short = counter_u64_alloc(M_WAITOK); 1872 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1873 SYSCTL_CHILDREN(rack_counters), 1874 OID_AUTO, "sack_short", CTLFLAG_RD, 1875 &rack_sack_proc_short, 1876 "Total times we took shortcut for sack processing"); 1877 rack_sack_skipped_acked = counter_u64_alloc(M_WAITOK); 1878 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1879 SYSCTL_CHILDREN(rack_attack), 1880 OID_AUTO, "skipacked", CTLFLAG_RD, 1881 &rack_sack_skipped_acked, 1882 "Total number of times we skipped previously sacked"); 1883 rack_sack_splits = counter_u64_alloc(M_WAITOK); 1884 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1885 SYSCTL_CHILDREN(rack_attack), 1886 OID_AUTO, "ofsplit", CTLFLAG_RD, 1887 &rack_sack_splits, 1888 "Total number of times we did the old-fashioned tree split"); 1889 rack_input_idle_reduces = counter_u64_alloc(M_WAITOK); 1890
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1891 SYSCTL_CHILDREN(rack_counters), 1892 OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD, 1893 &rack_input_idle_reduces, 1894 "Total number of idle reductions on input"); 1895 rack_collapsed_win_seen = counter_u64_alloc(M_WAITOK); 1896 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1897 SYSCTL_CHILDREN(rack_counters), 1898 OID_AUTO, "collapsed_win_seen", CTLFLAG_RD, 1899 &rack_collapsed_win_seen, 1900 "Total number of collapsed window events seen (where our window shrinks)"); 1901 1902 rack_collapsed_win = counter_u64_alloc(M_WAITOK); 1903 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1904 SYSCTL_CHILDREN(rack_counters), 1905 OID_AUTO, "collapsed_win", CTLFLAG_RD, 1906 &rack_collapsed_win, 1907 "Total number of collapsed window events where we mark packets"); 1908 rack_collapsed_win_rxt = counter_u64_alloc(M_WAITOK); 1909 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1910 SYSCTL_CHILDREN(rack_counters), 1911 OID_AUTO, "collapsed_win_rxt", CTLFLAG_RD, 1912 &rack_collapsed_win_rxt, 1913 "Total number of packets that were retransmitted"); 1914 rack_collapsed_win_rxt_bytes = counter_u64_alloc(M_WAITOK); 1915 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1916 SYSCTL_CHILDREN(rack_counters), 1917 OID_AUTO, "collapsed_win_bytes", CTLFLAG_RD, 1918 &rack_collapsed_win_rxt_bytes, 1919 "Total number of bytes that were retransmitted"); 1920 rack_try_scwnd = counter_u64_alloc(M_WAITOK); 1921 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1922 SYSCTL_CHILDREN(rack_counters), 1923 OID_AUTO, "tried_scwnd", CTLFLAG_RD, 1924 &rack_try_scwnd, 1925 "Total number of scwnd attempts"); 1926 COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); 1927 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 1928 OID_AUTO, "outsize", CTLFLAG_RD, 1929 rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes"); 1930 COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK); 1931 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 1932 OID_AUTO, "opts", CTLFLAG_RD, 1933 rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats"); 1934 SYSCTL_ADD_PROC(&rack_sysctl_ctx, 1935 SYSCTL_CHILDREN(rack_sysctl_root), 1936 OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 1937 &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters"); 1938 } 1939 1940 static uint32_t 1941 rc_init_window(struct tcp_rack *rack) 1942 { 1943 uint32_t win; 1944 1945 if (rack->rc_init_win == 0) { 1946 /* 1947 * Nothing set by the user, use the system stack 1948 * default. 
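* As a rough illustration of the non-default path below (hypothetical
* numbers, not taken from this code): a user-set rc_init_win of 20 with
* a fixed maxseg of 1448 bytes yields 20 * 1448 = 28960 bytes.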
1949 */ 1950 return (tcp_compute_initwnd(tcp_maxseg(rack->rc_tp))); 1951 } 1952 win = ctf_fixed_maxseg(rack->rc_tp) * rack->rc_init_win; 1953 return (win); 1954 } 1955 1956 static uint64_t 1957 rack_get_fixed_pacing_bw(struct tcp_rack *rack) 1958 { 1959 if (IN_FASTRECOVERY(rack->rc_tp->t_flags)) 1960 return (rack->r_ctl.rc_fixed_pacing_rate_rec); 1961 else if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) 1962 return (rack->r_ctl.rc_fixed_pacing_rate_ss); 1963 else 1964 return (rack->r_ctl.rc_fixed_pacing_rate_ca); 1965 } 1966 1967 static void 1968 rack_log_hybrid_bw(struct tcp_rack *rack, uint32_t seq, uint64_t cbw, uint64_t tim, 1969 uint64_t data, uint8_t mod, uint16_t aux, 1970 struct tcp_sendfile_track *cur) 1971 { 1972 #ifdef TCP_REQUEST_TRK 1973 int do_log = 0; 1974 1975 /* 1976 * The rate cap one is noisy and only should come out when normal BB logging 1977 * is enabled, the other logs (not RATE_CAP and NOT CAP_CALC) only come out 1978 * once per chunk and make up the BBpoint that can be turned on by the client. 1979 */ 1980 if ((mod == HYBRID_LOG_RATE_CAP) || (mod == HYBRID_LOG_CAP_CALC)) { 1981 /* 1982 * The very noisy two need to only come out when 1983 * we have verbose logging on. 1984 */ 1985 if (rack_verbose_logging != 0) 1986 do_log = tcp_bblogging_on(rack->rc_tp); 1987 else 1988 do_log = 0; 1989 } else if (mod != HYBRID_LOG_BW_MEASURE) { 1990 /* 1991 * All other less noisy logs here except the measure which 1992 * also needs to come out on the point and the log. 1993 */ 1994 do_log = tcp_bblogging_on(rack->rc_tp); 1995 } else { 1996 do_log = tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING); 1997 } 1998 1999 if (do_log) { 2000 union tcp_log_stackspecific log; 2001 struct timeval tv; 2002 uint64_t lt_bw; 2003 2004 /* Convert our ms to a microsecond */ 2005 memset(&log, 0, sizeof(log)); 2006 2007 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2008 log.u_bbr.rttProp = tim; 2009 log.u_bbr.bw_inuse = cbw; 2010 log.u_bbr.delRate = rack_get_gp_est(rack); 2011 lt_bw = rack_get_lt_bw(rack); 2012 log.u_bbr.flex1 = seq; 2013 log.u_bbr.pacing_gain = aux; 2014 /* lt_bw = < flex3 | flex2 > */ 2015 log.u_bbr.flex2 = (uint32_t)(lt_bw & 0x00000000ffffffff); 2016 log.u_bbr.flex3 = (uint32_t)((lt_bw >> 32) & 0x00000000ffffffff); 2017 /* Record the last obtained us rtt in inflight */ 2018 if (cur == NULL) { 2019 /* Make sure we are looking at the right log if an overide comes in */ 2020 cur = rack->r_ctl.rc_last_sft; 2021 } 2022 if (rack->r_ctl.rack_rs.rs_flags != RACK_RTT_EMPTY) 2023 log.u_bbr.inflight = rack->r_ctl.rack_rs.rs_us_rtt; 2024 else { 2025 /* Use the last known rtt i.e. 
the rack-rtt */ 2026 log.u_bbr.inflight = rack->rc_rack_rtt; 2027 } 2028 if (cur != NULL) { 2029 uint64_t off; 2030 2031 log.u_bbr.cur_del_rate = cur->deadline; 2032 if ((mod == HYBRID_LOG_RATE_CAP) || (mod == HYBRID_LOG_CAP_CALC)) { 2033 /* start = < lost | pkt_epoch > */ 2034 log.u_bbr.pkt_epoch = (uint32_t)(cur->start & 0x00000000ffffffff); 2035 log.u_bbr.lost = (uint32_t)((cur->start >> 32) & 0x00000000ffffffff); 2036 log.u_bbr.flex6 = cur->start_seq; 2037 log.u_bbr.pkts_out = cur->end_seq; 2038 } else { 2039 /* start = < lost | pkt_epoch > */ 2040 log.u_bbr.pkt_epoch = (uint32_t)(cur->start & 0x00000000ffffffff); 2041 log.u_bbr.lost = (uint32_t)((cur->start >> 32) & 0x00000000ffffffff); 2042 /* end = < pkts_out | flex6 > */ 2043 log.u_bbr.flex6 = (uint32_t)(cur->end & 0x00000000ffffffff); 2044 log.u_bbr.pkts_out = (uint32_t)((cur->end >> 32) & 0x00000000ffffffff); 2045 } 2046 /* first_send = <lt_epoch | epoch> */ 2047 log.u_bbr.epoch = (uint32_t)(cur->first_send & 0x00000000ffffffff); 2048 log.u_bbr.lt_epoch = (uint32_t)((cur->first_send >> 32) & 0x00000000ffffffff); 2049 /* localtime = <delivered | applimited>*/ 2050 log.u_bbr.applimited = (uint32_t)(cur->localtime & 0x00000000ffffffff); 2051 log.u_bbr.delivered = (uint32_t)((cur->localtime >> 32) & 0x00000000ffffffff); 2052 off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_tcpreq_info[0]); 2053 log.u_bbr.bbr_substate = (uint8_t)(off / sizeof(struct tcp_sendfile_track)); 2054 log.u_bbr.flex4 = (uint32_t)(rack->rc_tp->t_sndbytes - cur->sent_at_fs); 2055 log.u_bbr.flex5 = (uint32_t)(rack->rc_tp->t_snd_rxt_bytes - cur->rxt_at_fs); 2056 log.u_bbr.flex7 = (uint16_t)cur->hybrid_flags; 2057 } else { 2058 log.u_bbr.flex7 = 0xffff; 2059 log.u_bbr.cur_del_rate = 0xffffffffffffffff; 2060 } 2061 /* 2062 * Compose bbr_state to be a bit wise 0000ADHF 2063 * where A is the always_pace flag 2064 * where D is the dgp_on flag 2065 * where H is the hybrid_mode on flag 2066 * where F is the use_fixed_rate flag. 
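* For illustration (hypothetical values): always_pace=1, dgp_on=0,
* rc_hybrid_mode=1 and use_fixed_rate=0 would encode as 0b1010 (0xa)
* after the shifts below.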
2067 */ 2068 log.u_bbr.bbr_state = rack->rc_always_pace; 2069 log.u_bbr.bbr_state <<= 1; 2070 log.u_bbr.bbr_state |= rack->dgp_on; 2071 log.u_bbr.bbr_state <<= 1; 2072 log.u_bbr.bbr_state |= rack->rc_hybrid_mode; 2073 log.u_bbr.bbr_state <<= 1; 2074 log.u_bbr.bbr_state |= rack->use_fixed_rate; 2075 log.u_bbr.flex8 = mod; 2076 tcp_log_event(rack->rc_tp, NULL, 2077 &rack->rc_inp->inp_socket->so_rcv, 2078 &rack->rc_inp->inp_socket->so_snd, 2079 TCP_HYBRID_PACING_LOG, 0, 2080 0, &log, false, NULL, __func__, __LINE__, &tv); 2081 2082 } 2083 #endif 2084 } 2085 2086 static inline uint64_t 2087 rack_compensate_for_linerate(struct tcp_rack *rack, uint64_t bw) 2088 { 2089 uint64_t ret_bw, ether; 2090 uint64_t u_segsiz; 2091 2092 ether = rack->rc_tp->t_maxseg + sizeof(struct tcphdr); 2093 if (rack->r_is_v6){ 2094 #ifdef INET6 2095 ether += sizeof(struct ip6_hdr); 2096 #endif 2097 ether += 14; /* eheader size 6+6+2 */ 2098 } else { 2099 #ifdef INET 2100 ether += sizeof(struct ip); 2101 #endif 2102 ether += 14; /* eheader size 6+6+2 */ 2103 } 2104 u_segsiz = (uint64_t)min(ctf_fixed_maxseg(rack->rc_tp), rack->r_ctl.rc_pace_min_segs); 2105 ret_bw = bw; 2106 ret_bw *= ether; 2107 ret_bw /= u_segsiz; 2108 return (ret_bw); 2109 } 2110 2111 static void 2112 rack_rate_cap_bw(struct tcp_rack *rack, uint64_t *bw, int *capped) 2113 { 2114 #ifdef TCP_REQUEST_TRK 2115 struct timeval tv; 2116 uint64_t timenow, timeleft, lenleft, lengone, calcbw; 2117 #endif 2118 2119 if (rack->r_ctl.bw_rate_cap == 0) 2120 return; 2121 #ifdef TCP_REQUEST_TRK 2122 if (rack->rc_catch_up && rack->rc_hybrid_mode && 2123 (rack->r_ctl.rc_last_sft != NULL)) { 2124 /* 2125 * We have a dynamic cap. The original target 2126 * is in bw_rate_cap, but we need to look at 2127 * how long it is until we hit the deadline. 2128 */ 2129 struct tcp_sendfile_track *ent; 2130 2131 ent = rack->r_ctl.rc_last_sft; 2132 microuptime(&tv); 2133 timenow = tcp_tv_to_lusectick(&tv); 2134 if (timenow >= ent->deadline) { 2135 /* No time left we do DGP only */ 2136 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2137 0, 0, 0, HYBRID_LOG_OUTOFTIME, 0, ent); 2138 rack->r_ctl.bw_rate_cap = 0; 2139 return; 2140 } 2141 /* We have the time */ 2142 timeleft = rack->r_ctl.rc_last_sft->deadline - timenow; 2143 if (timeleft < HPTS_MSEC_IN_SEC) { 2144 /* If there is less than a ms left just use DGPs rate */ 2145 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2146 0, timeleft, 0, HYBRID_LOG_OUTOFTIME, 0, ent); 2147 rack->r_ctl.bw_rate_cap = 0; 2148 return; 2149 } 2150 /* 2151 * Now lets find the amount of data left to send. 2152 * 2153 * Now ideally we want to use the end_seq to figure out how much more 2154 * but it might not be possible (only if we have the TRACK_FG_COMP on the entry.. 2155 */ 2156 if (ent->flags & TCP_TRK_TRACK_FLG_COMP) { 2157 if (SEQ_GT(ent->end_seq, rack->rc_tp->snd_una)) 2158 lenleft = ent->end_seq - rack->rc_tp->snd_una; 2159 else { 2160 /* TSNH, we should catch it at the send */ 2161 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2162 0, timeleft, 0, HYBRID_LOG_CAPERROR, 0, ent); 2163 rack->r_ctl.bw_rate_cap = 0; 2164 return; 2165 } 2166 } else { 2167 /* 2168 * The hard way, figure out how much is gone and then 2169 * take that away from the total the client asked for 2170 * (thats off by tls overhead if this is tls). 
2171 */ 2172 if (SEQ_GT(rack->rc_tp->snd_una, ent->start_seq)) 2173 lengone = rack->rc_tp->snd_una - ent->start_seq; 2174 else 2175 lengone = 0; 2176 if (lengone < (ent->end - ent->start)) 2177 lenleft = (ent->end - ent->start) - lengone; 2178 else { 2179 /* TSNH, we should catch it at the send */ 2180 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2181 0, timeleft, lengone, HYBRID_LOG_CAPERROR, 0, ent); 2182 rack->r_ctl.bw_rate_cap = 0; 2183 return; 2184 } 2185 } 2186 if (lenleft == 0) { 2187 /* We have it all sent */ 2188 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2189 0, timeleft, lenleft, HYBRID_LOG_ALLSENT, 0, ent); 2190 if (rack->r_ctl.bw_rate_cap) 2191 goto normal_ratecap; 2192 else 2193 return; 2194 } 2195 calcbw = lenleft * HPTS_USEC_IN_SEC; 2196 calcbw /= timeleft; 2197 /* Now we must compensate for IP/TCP overhead */ 2198 calcbw = rack_compensate_for_linerate(rack, calcbw); 2199 /* Update the bit rate cap */ 2200 rack->r_ctl.bw_rate_cap = calcbw; 2201 if ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_S_MSS) && 2202 (rack_hybrid_allow_set_maxseg == 1) && 2203 ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_SETMSS) == 0)) { 2204 /* Lets set in a smaller mss possibly here to match our rate-cap */ 2205 uint32_t orig_max; 2206 2207 orig_max = rack->r_ctl.rc_pace_max_segs; 2208 rack->r_ctl.rc_last_sft->hybrid_flags |= TCP_HYBRID_PACING_SETMSS; 2209 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, calcbw, ctf_fixed_maxseg(rack->rc_tp)); 2210 rack_log_type_pacing_sizes(rack->rc_tp, rack, rack->r_ctl.client_suggested_maxseg, orig_max, __LINE__, 5); 2211 } 2212 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2213 calcbw, timeleft, lenleft, HYBRID_LOG_CAP_CALC, 0, ent); 2214 if ((calcbw > 0) && (*bw > calcbw)) { 2215 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2216 *bw, ent->deadline, lenleft, HYBRID_LOG_RATE_CAP, 0, ent); 2217 *capped = 1; 2218 *bw = calcbw; 2219 } 2220 return; 2221 } 2222 normal_ratecap: 2223 #endif 2224 if ((rack->r_ctl.bw_rate_cap > 0) && (*bw > rack->r_ctl.bw_rate_cap)) { 2225 #ifdef TCP_REQUEST_TRK 2226 if (rack->rc_hybrid_mode && 2227 rack->rc_catch_up && 2228 (rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_S_MSS) && 2229 (rack_hybrid_allow_set_maxseg == 1) && 2230 ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_SETMSS) == 0)) { 2231 /* Lets set in a smaller mss possibly here to match our rate-cap */ 2232 uint32_t orig_max; 2233 2234 orig_max = rack->r_ctl.rc_pace_max_segs; 2235 rack->r_ctl.rc_last_sft->hybrid_flags |= TCP_HYBRID_PACING_SETMSS; 2236 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, rack->r_ctl.bw_rate_cap, ctf_fixed_maxseg(rack->rc_tp)); 2237 rack_log_type_pacing_sizes(rack->rc_tp, rack, rack->r_ctl.client_suggested_maxseg, orig_max, __LINE__, 5); 2238 } 2239 #endif 2240 *capped = 1; 2241 *bw = rack->r_ctl.bw_rate_cap; 2242 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 2243 *bw, 0, 0, 2244 HYBRID_LOG_RATE_CAP, 1, NULL); 2245 } 2246 } 2247 2248 static uint64_t 2249 rack_get_gp_est(struct tcp_rack *rack) 2250 { 2251 uint64_t bw, lt_bw, ret_bw; 2252 2253 if (rack->rc_gp_filled == 0) { 2254 /* 2255 * We have yet no b/w measurement, 2256 * if we have a user set initial bw 2257 * return it. If we don't have that and 2258 * we have an srtt, use the tcp IW (10) to 2259 * calculate a fictional b/w over the SRTT 2260 * which is more or less a guess. Note 2261 * we don't use our IW from rack on purpose 2262 * so if we have like IW=30, we are not 2263 * calculating a "huge" b/w. 
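* For illustration (hypothetical numbers): an initial window of 14600
* bytes over a 100,000 usec SRTT gives 14600 * 1,000,000 / 100,000 =
* 146,000 bytes/sec, before the line-rate compensation applied below.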
2264 */ 2265 uint64_t srtt; 2266 2267 lt_bw = rack_get_lt_bw(rack); 2268 if (lt_bw) { 2269 /* 2270 * No goodput bw but a long-term b/w does exist 2271 * lets use that. 2272 */ 2273 ret_bw = lt_bw; 2274 goto compensate; 2275 } 2276 if (rack->r_ctl.init_rate) 2277 return (rack->r_ctl.init_rate); 2278 2279 /* Ok lets come up with the IW guess, if we have a srtt */ 2280 if (rack->rc_tp->t_srtt == 0) { 2281 /* 2282 * Go with old pacing method 2283 * i.e. burst mitigation only. 2284 */ 2285 return (0); 2286 } 2287 /* Ok lets get the initial TCP win (not racks) */ 2288 bw = tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)); 2289 srtt = (uint64_t)rack->rc_tp->t_srtt; 2290 bw *= (uint64_t)USECS_IN_SECOND; 2291 bw /= srtt; 2292 ret_bw = bw; 2293 goto compensate; 2294 2295 } 2296 if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) { 2297 /* Averaging is done, we can return the value */ 2298 bw = rack->r_ctl.gp_bw; 2299 } else { 2300 /* Still doing initial average must calculate */ 2301 bw = rack->r_ctl.gp_bw / max(rack->r_ctl.num_measurements, 1); 2302 } 2303 lt_bw = rack_get_lt_bw(rack); 2304 if (lt_bw == 0) { 2305 /* If we don't have one then equate it to the gp_bw */ 2306 lt_bw = rack->r_ctl.gp_bw; 2307 } 2308 if ((rack->r_cwnd_was_clamped == 1) && (rack->r_clamped_gets_lower > 0)){ 2309 /* if clamped take the lowest */ 2310 if (lt_bw < bw) 2311 ret_bw = lt_bw; 2312 else 2313 ret_bw = bw; 2314 } else { 2315 /* If not set for clamped to get lowest, take the highest */ 2316 if (lt_bw > bw) 2317 ret_bw = lt_bw; 2318 else 2319 ret_bw = bw; 2320 } 2321 /* 2322 * Now lets compensate based on the TCP/IP overhead. Our 2323 * Goodput estimate does not include this so we must pace out 2324 * a bit faster since our pacing calculations do. The pacing 2325 * calculations use the base ETHERNET_SEGMENT_SIZE and the segsiz 2326 * we are using to do this, so we do that here in the opposite 2327 * direction as well. This means that if we are tunneled and the 2328 * segsiz is say 1200 bytes we will get quite a boost, but its 2329 * compensated for in the pacing time the opposite way. 2330 */ 2331 compensate: 2332 ret_bw = rack_compensate_for_linerate(rack, ret_bw); 2333 return(ret_bw); 2334 } 2335 2336 2337 static uint64_t 2338 rack_get_bw(struct tcp_rack *rack) 2339 { 2340 uint64_t bw; 2341 2342 if (rack->use_fixed_rate) { 2343 /* Return the fixed pacing rate */ 2344 return (rack_get_fixed_pacing_bw(rack)); 2345 } 2346 bw = rack_get_gp_est(rack); 2347 return (bw); 2348 } 2349 2350 static uint16_t 2351 rack_get_output_gain(struct tcp_rack *rack, struct rack_sendmap *rsm) 2352 { 2353 if (rack->use_fixed_rate) { 2354 return (100); 2355 } else if (rack->in_probe_rtt && (rsm == NULL)) 2356 return (rack->r_ctl.rack_per_of_gp_probertt); 2357 else if ((IN_FASTRECOVERY(rack->rc_tp->t_flags) && 2358 rack->r_ctl.rack_per_of_gp_rec)) { 2359 if (rsm) { 2360 /* a retransmission always use the recovery rate */ 2361 return (rack->r_ctl.rack_per_of_gp_rec); 2362 } else if (rack->rack_rec_nonrxt_use_cr) { 2363 /* Directed to use the configured rate */ 2364 goto configured_rate; 2365 } else if (rack->rack_no_prr && 2366 (rack->r_ctl.rack_per_of_gp_rec > 100)) { 2367 /* No PRR, lets just use the b/w estimate only */ 2368 return (100); 2369 } else { 2370 /* 2371 * Here we may have a non-retransmit but we 2372 * have no overrides, so just use the recovery 2373 * rate (prr is in effect). 
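* Note the value returned here is a percentage that the caller applies
* to the b/w estimate (bw * gain / 100); e.g. a hypothetical
* rack_per_of_gp_rec of 200 would pace these sends at twice the estimate.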
2374 */ 2375 return (rack->r_ctl.rack_per_of_gp_rec); 2376 } 2377 } 2378 configured_rate: 2379 /* For the configured rate we look at our cwnd vs the ssthresh */ 2380 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) 2381 return (rack->r_ctl.rack_per_of_gp_ss); 2382 else 2383 return (rack->r_ctl.rack_per_of_gp_ca); 2384 } 2385 2386 static void 2387 rack_log_dsack_event(struct tcp_rack *rack, uint8_t mod, uint32_t flex4, uint32_t flex5, uint32_t flex6) 2388 { 2389 /* 2390 * Types of logs (mod value) 2391 * 1 = dsack_persists reduced by 1 via T-O or fast recovery exit. 2392 * 2 = a dsack round begins, persist is reset to 16. 2393 * 3 = a dsack round ends 2394 * 4 = Dsack option increases rack rtt flex5 is the srtt input, flex6 is thresh 2395 * 5 = Socket option set changing the control flags rc_rack_tmr_std_based, rc_rack_use_dsack 2396 * 6 = Final rack rtt, flex4 is srtt and flex6 is final limited thresh. 2397 */ 2398 if (tcp_bblogging_on(rack->rc_tp)) { 2399 union tcp_log_stackspecific log; 2400 struct timeval tv; 2401 2402 memset(&log, 0, sizeof(log)); 2403 log.u_bbr.flex1 = rack->rc_rack_tmr_std_based; 2404 log.u_bbr.flex1 <<= 1; 2405 log.u_bbr.flex1 |= rack->rc_rack_use_dsack; 2406 log.u_bbr.flex1 <<= 1; 2407 log.u_bbr.flex1 |= rack->rc_dsack_round_seen; 2408 log.u_bbr.flex2 = rack->r_ctl.dsack_round_end; 2409 log.u_bbr.flex3 = rack->r_ctl.num_dsack; 2410 log.u_bbr.flex4 = flex4; 2411 log.u_bbr.flex5 = flex5; 2412 log.u_bbr.flex6 = flex6; 2413 log.u_bbr.flex7 = rack->r_ctl.dsack_persist; 2414 log.u_bbr.flex8 = mod; 2415 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2416 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2417 &rack->rc_inp->inp_socket->so_rcv, 2418 &rack->rc_inp->inp_socket->so_snd, 2419 RACK_DSACK_HANDLING, 0, 2420 0, &log, false, &tv); 2421 } 2422 } 2423 2424 static void 2425 rack_log_hdwr_pacing(struct tcp_rack *rack, 2426 uint64_t rate, uint64_t hw_rate, int line, 2427 int error, uint16_t mod) 2428 { 2429 if (tcp_bblogging_on(rack->rc_tp)) { 2430 union tcp_log_stackspecific log; 2431 struct timeval tv; 2432 const struct ifnet *ifp; 2433 2434 memset(&log, 0, sizeof(log)); 2435 log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff); 2436 log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff); 2437 if (rack->r_ctl.crte) { 2438 ifp = rack->r_ctl.crte->ptbl->rs_ifp; 2439 } else if (rack->rc_inp->inp_route.ro_nh && 2440 rack->rc_inp->inp_route.ro_nh->nh_ifp) { 2441 ifp = rack->rc_inp->inp_route.ro_nh->nh_ifp; 2442 } else 2443 ifp = NULL; 2444 if (ifp) { 2445 log.u_bbr.flex3 = (((uint64_t)ifp >> 32) & 0x00000000ffffffff); 2446 log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff); 2447 } 2448 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2449 log.u_bbr.bw_inuse = rate; 2450 log.u_bbr.flex5 = line; 2451 log.u_bbr.flex6 = error; 2452 log.u_bbr.flex7 = mod; 2453 log.u_bbr.applimited = rack->r_ctl.rc_pace_max_segs; 2454 log.u_bbr.flex8 = rack->use_fixed_rate; 2455 log.u_bbr.flex8 <<= 1; 2456 log.u_bbr.flex8 |= rack->rack_hdrw_pacing; 2457 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; 2458 log.u_bbr.delRate = rack->r_ctl.crte_prev_rate; 2459 if (rack->r_ctl.crte) 2460 log.u_bbr.cur_del_rate = rack->r_ctl.crte->rate; 2461 else 2462 log.u_bbr.cur_del_rate = 0; 2463 log.u_bbr.rttProp = rack->r_ctl.last_hw_bw_req; 2464 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2465 &rack->rc_inp->inp_socket->so_rcv, 2466 &rack->rc_inp->inp_socket->so_snd, 2467 BBR_LOG_HDWR_PACE, 0, 2468 0, &log, false, &tv); 2469 } 2470 } 2471 2472 static uint64_t 2473 rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap 
*rsm, int *capped) 2474 { 2475 /* 2476 * We allow rack_per_of_gp_xx to dictate our bw rate we want. 2477 */ 2478 uint64_t bw_est, high_rate; 2479 uint64_t gain; 2480 2481 if ((rack->r_pacing_discount == 0) || 2482 (rack_full_buffer_discount == 0)) { 2483 /* 2484 * No buffer level based discount from client buffer 2485 * level is enabled or the feature is disabled. 2486 */ 2487 gain = (uint64_t)rack_get_output_gain(rack, rsm); 2488 bw_est = bw * gain; 2489 bw_est /= (uint64_t)100; 2490 } else { 2491 /* 2492 * We have a discount in place apply it with 2493 * just a 100% gain (we get no boost if the buffer 2494 * is full). 2495 */ 2496 uint64_t discount; 2497 2498 discount = bw * (uint64_t)(rack_full_buffer_discount * rack->r_ctl.pacing_discount_amm); 2499 discount /= 100; 2500 /* What %% of the b/w do we discount */ 2501 bw_est = bw - discount; 2502 } 2503 /* Never fall below the minimum (def 64kbps) */ 2504 if (bw_est < RACK_MIN_BW) 2505 bw_est = RACK_MIN_BW; 2506 if (rack->r_rack_hw_rate_caps) { 2507 /* Rate caps are in place */ 2508 if (rack->r_ctl.crte != NULL) { 2509 /* We have a hdwr rate already */ 2510 high_rate = tcp_hw_highest_rate(rack->r_ctl.crte); 2511 if (bw_est >= high_rate) { 2512 /* We are capping bw at the highest rate table entry */ 2513 if (rack_hw_rate_cap_per && 2514 (((high_rate * (100 + rack_hw_rate_cap_per)) / 100) < bw_est)) { 2515 rack->r_rack_hw_rate_caps = 0; 2516 goto done; 2517 } 2518 rack_log_hdwr_pacing(rack, 2519 bw_est, high_rate, __LINE__, 2520 0, 3); 2521 bw_est = high_rate; 2522 if (capped) 2523 *capped = 1; 2524 } 2525 } else if ((rack->rack_hdrw_pacing == 0) && 2526 (rack->rack_hdw_pace_ena) && 2527 (rack->rack_attempt_hdwr_pace == 0) && 2528 (rack->rc_inp->inp_route.ro_nh != NULL) && 2529 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 2530 /* 2531 * Special case, we have not yet attempted hardware 2532 * pacing, and yet we may, when we do, find out if we are 2533 * above the highest rate. We need to know the maxbw for the interface 2534 * in question (if it supports ratelimiting). We get back 2535 * a 0, if the interface is not found in the RL lists. 2536 */ 2537 high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp); 2538 if (high_rate) { 2539 /* Yep, we have a rate is it above this rate? */ 2540 if (bw_est > high_rate) { 2541 bw_est = high_rate; 2542 if (capped) 2543 *capped = 1; 2544 } 2545 } 2546 } 2547 } 2548 done: 2549 return (bw_est); 2550 } 2551 2552 static void 2553 rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod) 2554 { 2555 if (tcp_bblogging_on(rack->rc_tp)) { 2556 union tcp_log_stackspecific log; 2557 struct timeval tv; 2558 2559 if (rack->sack_attack_disable > 0) 2560 goto log_anyway; 2561 if ((mod != 1) && (rack_verbose_logging == 0)) { 2562 /* 2563 * We get 3 values currently for mod 2564 * 1 - We are retransmitting and this tells the reason. 2565 * 2 - We are clearing a dup-ack count. 2566 * 3 - We are incrementing a dup-ack count. 2567 * 2568 * The clear/increment are only logged 2569 * if you have BBverbose on. 
2570 */ 2571 return; 2572 } 2573 log_anyway: 2574 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2575 log.u_bbr.flex1 = tsused; 2576 log.u_bbr.flex2 = thresh; 2577 log.u_bbr.flex3 = rsm->r_flags; 2578 log.u_bbr.flex4 = rsm->r_dupack; 2579 log.u_bbr.flex5 = rsm->r_start; 2580 log.u_bbr.flex6 = rsm->r_end; 2581 log.u_bbr.flex8 = mod; 2582 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2583 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2584 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2585 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2586 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2587 log.u_bbr.pacing_gain = rack->r_must_retran; 2588 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2589 &rack->rc_inp->inp_socket->so_rcv, 2590 &rack->rc_inp->inp_socket->so_snd, 2591 BBR_LOG_SETTINGS_CHG, 0, 2592 0, &log, false, &tv); 2593 } 2594 } 2595 2596 static void 2597 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) 2598 { 2599 if (tcp_bblogging_on(rack->rc_tp)) { 2600 union tcp_log_stackspecific log; 2601 struct timeval tv; 2602 2603 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2604 log.u_bbr.flex1 = rack->rc_tp->t_srtt; 2605 log.u_bbr.flex2 = to; 2606 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; 2607 log.u_bbr.flex4 = slot; 2608 log.u_bbr.flex5 = rack->rc_tp->t_hpts_slot; 2609 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 2610 log.u_bbr.flex7 = rack->rc_in_persist; 2611 log.u_bbr.flex8 = which; 2612 if (rack->rack_no_prr) 2613 log.u_bbr.pkts_out = 0; 2614 else 2615 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 2616 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2617 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2618 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2619 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2620 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2621 log.u_bbr.pacing_gain = rack->r_must_retran; 2622 log.u_bbr.cwnd_gain = rack->rack_deferred_inited; 2623 log.u_bbr.pkt_epoch = rack->rc_has_collapsed; 2624 log.u_bbr.lt_epoch = rack->rc_tp->t_rxtshift; 2625 log.u_bbr.lost = rack_rto_min; 2626 log.u_bbr.epoch = rack->r_ctl.roundends; 2627 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2628 &rack->rc_inp->inp_socket->so_rcv, 2629 &rack->rc_inp->inp_socket->so_snd, 2630 BBR_LOG_TIMERSTAR, 0, 2631 0, &log, false, &tv); 2632 } 2633 } 2634 2635 static void 2636 rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm) 2637 { 2638 if (tcp_bblogging_on(rack->rc_tp)) { 2639 union tcp_log_stackspecific log; 2640 struct timeval tv; 2641 2642 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2643 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2644 log.u_bbr.flex8 = to_num; 2645 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt; 2646 log.u_bbr.flex2 = rack->rc_rack_rtt; 2647 if (rsm == NULL) 2648 log.u_bbr.flex3 = 0; 2649 else 2650 log.u_bbr.flex3 = rsm->r_end - rsm->r_start; 2651 if (rack->rack_no_prr) 2652 log.u_bbr.flex5 = 0; 2653 else 2654 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 2655 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2656 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2657 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2658 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2659 log.u_bbr.pacing_gain = rack->r_must_retran; 2660 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2661 &rack->rc_inp->inp_socket->so_rcv, 2662 &rack->rc_inp->inp_socket->so_snd, 2663 BBR_LOG_RTO, 0, 2664 0, &log, false, &tv); 2665 } 2666 } 2667 2668 static void 2669 rack_log_map_chg(struct tcpcb *tp, struct 
tcp_rack *rack, 2670 struct rack_sendmap *prev, 2671 struct rack_sendmap *rsm, 2672 struct rack_sendmap *next, 2673 int flag, uint32_t th_ack, int line) 2674 { 2675 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 2676 union tcp_log_stackspecific log; 2677 struct timeval tv; 2678 2679 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2680 log.u_bbr.flex8 = flag; 2681 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2682 log.u_bbr.cur_del_rate = (uint64_t)prev; 2683 log.u_bbr.delRate = (uint64_t)rsm; 2684 log.u_bbr.rttProp = (uint64_t)next; 2685 log.u_bbr.flex7 = 0; 2686 if (prev) { 2687 log.u_bbr.flex1 = prev->r_start; 2688 log.u_bbr.flex2 = prev->r_end; 2689 log.u_bbr.flex7 |= 0x4; 2690 } 2691 if (rsm) { 2692 log.u_bbr.flex3 = rsm->r_start; 2693 log.u_bbr.flex4 = rsm->r_end; 2694 log.u_bbr.flex7 |= 0x2; 2695 } 2696 if (next) { 2697 log.u_bbr.flex5 = next->r_start; 2698 log.u_bbr.flex6 = next->r_end; 2699 log.u_bbr.flex7 |= 0x1; 2700 } 2701 log.u_bbr.applimited = line; 2702 log.u_bbr.pkts_out = th_ack; 2703 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2704 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2705 if (rack->rack_no_prr) 2706 log.u_bbr.lost = 0; 2707 else 2708 log.u_bbr.lost = rack->r_ctl.rc_prr_sndcnt; 2709 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2710 &rack->rc_inp->inp_socket->so_rcv, 2711 &rack->rc_inp->inp_socket->so_snd, 2712 TCP_LOG_MAPCHG, 0, 2713 0, &log, false, &tv); 2714 } 2715 } 2716 2717 static void 2718 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t len, 2719 struct rack_sendmap *rsm, int conf) 2720 { 2721 if (tcp_bblogging_on(tp)) { 2722 union tcp_log_stackspecific log; 2723 struct timeval tv; 2724 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2725 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2726 log.u_bbr.flex1 = t; 2727 log.u_bbr.flex2 = len; 2728 log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt; 2729 log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest; 2730 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest; 2731 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_us_rtrcnt; 2732 log.u_bbr.flex7 = conf; 2733 log.u_bbr.rttProp = (uint64_t)rack->r_ctl.rack_rs.rs_rtt_tot; 2734 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method; 2735 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2736 log.u_bbr.delivered = rack->r_ctl.rack_rs.rs_us_rtrcnt; 2737 log.u_bbr.pkts_out = rack->r_ctl.rack_rs.rs_flags; 2738 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2739 if (rsm) { 2740 log.u_bbr.pkt_epoch = rsm->r_start; 2741 log.u_bbr.lost = rsm->r_end; 2742 log.u_bbr.cwnd_gain = rsm->r_rtr_cnt; 2743 /* We loose any upper of the 24 bits */ 2744 log.u_bbr.pacing_gain = (uint16_t)rsm->r_flags; 2745 } else { 2746 /* Its a SYN */ 2747 log.u_bbr.pkt_epoch = rack->rc_tp->iss; 2748 log.u_bbr.lost = 0; 2749 log.u_bbr.cwnd_gain = 0; 2750 log.u_bbr.pacing_gain = 0; 2751 } 2752 /* Write out general bits of interest rrs here */ 2753 log.u_bbr.use_lt_bw = rack->rc_highly_buffered; 2754 log.u_bbr.use_lt_bw <<= 1; 2755 log.u_bbr.use_lt_bw |= rack->forced_ack; 2756 log.u_bbr.use_lt_bw <<= 1; 2757 log.u_bbr.use_lt_bw |= rack->rc_gp_dyn_mul; 2758 log.u_bbr.use_lt_bw <<= 1; 2759 log.u_bbr.use_lt_bw |= rack->in_probe_rtt; 2760 log.u_bbr.use_lt_bw <<= 1; 2761 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; 2762 log.u_bbr.use_lt_bw <<= 1; 2763 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set; 2764 log.u_bbr.use_lt_bw <<= 1; 2765 log.u_bbr.use_lt_bw |= rack->rc_gp_filled; 2766 log.u_bbr.use_lt_bw <<= 1; 2767 log.u_bbr.use_lt_bw |= 
rack->rc_dragged_bottom; 2768 log.u_bbr.applimited = rack->r_ctl.rc_target_probertt_flight; 2769 log.u_bbr.epoch = rack->r_ctl.rc_time_probertt_starts; 2770 log.u_bbr.lt_epoch = rack->r_ctl.rc_time_probertt_entered; 2771 log.u_bbr.cur_del_rate = rack->r_ctl.rc_lower_rtt_us_cts; 2772 log.u_bbr.delRate = rack->r_ctl.rc_gp_srtt; 2773 log.u_bbr.bw_inuse = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 2774 log.u_bbr.bw_inuse <<= 32; 2775 if (rsm) 2776 log.u_bbr.bw_inuse |= ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]); 2777 TCP_LOG_EVENTP(tp, NULL, 2778 &rack->rc_inp->inp_socket->so_rcv, 2779 &rack->rc_inp->inp_socket->so_snd, 2780 BBR_LOG_BBRRTT, 0, 2781 0, &log, false, &tv); 2782 2783 2784 } 2785 } 2786 2787 static void 2788 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt) 2789 { 2790 /* 2791 * Log the rtt sample we are 2792 * applying to the srtt algorithm in 2793 * useconds. 2794 */ 2795 if (tcp_bblogging_on(rack->rc_tp)) { 2796 union tcp_log_stackspecific log; 2797 struct timeval tv; 2798 2799 /* Convert our ms to a microsecond */ 2800 memset(&log, 0, sizeof(log)); 2801 log.u_bbr.flex1 = rtt; 2802 log.u_bbr.flex2 = rack->r_ctl.ack_count; 2803 log.u_bbr.flex3 = rack->r_ctl.sack_count; 2804 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; 2805 log.u_bbr.flex5 = rack->r_ctl.sack_moved_extra; 2806 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 2807 log.u_bbr.flex7 = 1; 2808 log.u_bbr.flex8 = rack->sack_attack_disable; 2809 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2810 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2811 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2812 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2813 log.u_bbr.pacing_gain = rack->r_must_retran; 2814 /* 2815 * We capture in delRate the upper 32 bits as 2816 * the confidence level we had declared, and the 2817 * lower 32 bits as the actual RTT using the arrival 2818 * timestamp. 
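* For example (hypothetical values): a confidence of 3 with an
* rs_us_rtt of 25000 usec packs as (3ULL << 32) | 25000 below.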
2819 */ 2820 log.u_bbr.delRate = rack->r_ctl.rack_rs.confidence; 2821 log.u_bbr.delRate <<= 32; 2822 log.u_bbr.delRate |= rack->r_ctl.rack_rs.rs_us_rtt; 2823 /* Lets capture all the things that make up t_rtxcur */ 2824 log.u_bbr.applimited = rack_rto_min; 2825 log.u_bbr.epoch = rack_rto_max; 2826 log.u_bbr.lt_epoch = rack->r_ctl.timer_slop; 2827 log.u_bbr.lost = rack_rto_min; 2828 log.u_bbr.pkt_epoch = TICKS_2_USEC(tcp_rexmit_slop); 2829 log.u_bbr.rttProp = RACK_REXMTVAL(rack->rc_tp); 2830 log.u_bbr.bw_inuse = rack->r_ctl.act_rcv_time.tv_sec; 2831 log.u_bbr.bw_inuse *= HPTS_USEC_IN_SEC; 2832 log.u_bbr.bw_inuse += rack->r_ctl.act_rcv_time.tv_usec; 2833 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2834 &rack->rc_inp->inp_socket->so_rcv, 2835 &rack->rc_inp->inp_socket->so_snd, 2836 TCP_LOG_RTT, 0, 2837 0, &log, false, &tv); 2838 } 2839 } 2840 2841 static void 2842 rack_log_rtt_sample_calc(struct tcp_rack *rack, uint32_t rtt, uint32_t send_time, uint32_t ack_time, int where) 2843 { 2844 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 2845 union tcp_log_stackspecific log; 2846 struct timeval tv; 2847 2848 /* Convert our ms to a microsecond */ 2849 memset(&log, 0, sizeof(log)); 2850 log.u_bbr.flex1 = rtt; 2851 log.u_bbr.flex2 = send_time; 2852 log.u_bbr.flex3 = ack_time; 2853 log.u_bbr.flex4 = where; 2854 log.u_bbr.flex7 = 2; 2855 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2856 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2857 &rack->rc_inp->inp_socket->so_rcv, 2858 &rack->rc_inp->inp_socket->so_snd, 2859 TCP_LOG_RTT, 0, 2860 0, &log, false, &tv); 2861 } 2862 } 2863 2864 2865 static void 2866 rack_log_rtt_sendmap(struct tcp_rack *rack, uint32_t idx, uint64_t tsv, uint32_t tsecho) 2867 { 2868 if (tcp_bblogging_on(rack->rc_tp)) { 2869 union tcp_log_stackspecific log; 2870 struct timeval tv; 2871 2872 /* Convert our ms to a microsecond */ 2873 memset(&log, 0, sizeof(log)); 2874 log.u_bbr.flex1 = idx; 2875 log.u_bbr.flex2 = rack_ts_to_msec(tsv); 2876 log.u_bbr.flex3 = tsecho; 2877 log.u_bbr.flex7 = 3; 2878 log.u_bbr.rttProp = tsv; 2879 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2880 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2881 &rack->rc_inp->inp_socket->so_rcv, 2882 &rack->rc_inp->inp_socket->so_snd, 2883 TCP_LOG_RTT, 0, 2884 0, &log, false, &tv); 2885 } 2886 } 2887 2888 2889 static inline void 2890 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line) 2891 { 2892 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 2893 union tcp_log_stackspecific log; 2894 struct timeval tv; 2895 2896 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2897 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2898 log.u_bbr.flex1 = line; 2899 log.u_bbr.flex2 = tick; 2900 log.u_bbr.flex3 = tp->t_maxunacktime; 2901 log.u_bbr.flex4 = tp->t_acktime; 2902 log.u_bbr.flex8 = event; 2903 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2904 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2905 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2906 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2907 log.u_bbr.pacing_gain = rack->r_must_retran; 2908 TCP_LOG_EVENTP(tp, NULL, 2909 &rack->rc_inp->inp_socket->so_rcv, 2910 &rack->rc_inp->inp_socket->so_snd, 2911 BBR_LOG_PROGRESS, 0, 2912 0, &log, false, &tv); 2913 } 2914 } 2915 2916 static void 2917 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv, int line) 2918 { 2919 if (tcp_bblogging_on(rack->rc_tp)) { 2920 union tcp_log_stackspecific log; 2921 2922 memset(&log.u_bbr, 0, 
sizeof(log.u_bbr)); 2923 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2924 log.u_bbr.flex1 = slot; 2925 if (rack->rack_no_prr) 2926 log.u_bbr.flex2 = 0; 2927 else 2928 log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt; 2929 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 2930 log.u_bbr.flex5 = rack->r_ctl.ack_during_sd; 2931 log.u_bbr.flex6 = line; 2932 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags); 2933 log.u_bbr.flex8 = rack->rc_in_persist; 2934 log.u_bbr.timeStamp = cts; 2935 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2936 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2937 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2938 log.u_bbr.pacing_gain = rack->r_must_retran; 2939 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2940 &rack->rc_inp->inp_socket->so_rcv, 2941 &rack->rc_inp->inp_socket->so_snd, 2942 BBR_LOG_BBRSND, 0, 2943 0, &log, false, tv); 2944 } 2945 } 2946 2947 static void 2948 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out, int nsegs) 2949 { 2950 if (tcp_bblogging_on(rack->rc_tp)) { 2951 union tcp_log_stackspecific log; 2952 struct timeval tv; 2953 2954 memset(&log, 0, sizeof(log)); 2955 log.u_bbr.flex1 = did_out; 2956 log.u_bbr.flex2 = nxt_pkt; 2957 log.u_bbr.flex3 = way_out; 2958 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 2959 if (rack->rack_no_prr) 2960 log.u_bbr.flex5 = 0; 2961 else 2962 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 2963 log.u_bbr.flex6 = nsegs; 2964 log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs; 2965 log.u_bbr.flex7 = rack->rc_ack_can_sendout_data; /* Do we have ack-can-send set */ 2966 log.u_bbr.flex7 <<= 1; 2967 log.u_bbr.flex7 |= rack->r_fast_output; /* is fast output primed */ 2968 log.u_bbr.flex7 <<= 1; 2969 log.u_bbr.flex7 |= rack->r_wanted_output; /* Do we want output */ 2970 log.u_bbr.flex8 = rack->rc_in_persist; 2971 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 2972 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2973 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2974 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 2975 log.u_bbr.use_lt_bw <<= 1; 2976 log.u_bbr.use_lt_bw |= rack->r_might_revert; 2977 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2978 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2979 log.u_bbr.pacing_gain = rack->r_must_retran; 2980 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2981 &rack->rc_inp->inp_socket->so_rcv, 2982 &rack->rc_inp->inp_socket->so_snd, 2983 BBR_LOG_DOSEG_DONE, 0, 2984 0, &log, false, &tv); 2985 } 2986 } 2987 2988 static void 2989 rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint8_t frm) 2990 { 2991 if (tcp_bblogging_on(rack->rc_tp)) { 2992 union tcp_log_stackspecific log; 2993 struct timeval tv; 2994 2995 memset(&log, 0, sizeof(log)); 2996 log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs; 2997 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 2998 log.u_bbr.flex4 = arg1; 2999 log.u_bbr.flex5 = arg2; 3000 log.u_bbr.flex7 = rack->r_ctl.rc_user_set_min_segs; 3001 log.u_bbr.flex6 = arg3; 3002 log.u_bbr.flex8 = frm; 3003 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3004 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3005 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 3006 log.u_bbr.applimited = rack->r_ctl.rc_sacked; 3007 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 3008 log.u_bbr.pacing_gain = rack->r_must_retran; 3009 TCP_LOG_EVENTP(tp, NULL, &tptosocket(tp)->so_rcv, 3010 &tptosocket(tp)->so_snd, 3011 
TCP_HDWR_PACE_SIZE, 0, 0, &log, false, &tv); 3012 } 3013 } 3014 3015 static void 3016 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, 3017 uint8_t hpts_calling, int reason, uint32_t cwnd_to_use) 3018 { 3019 if (tcp_bblogging_on(rack->rc_tp)) { 3020 union tcp_log_stackspecific log; 3021 struct timeval tv; 3022 3023 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 3024 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 3025 log.u_bbr.flex1 = slot; 3026 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; 3027 log.u_bbr.flex4 = reason; 3028 if (rack->rack_no_prr) 3029 log.u_bbr.flex5 = 0; 3030 else 3031 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 3032 log.u_bbr.flex7 = hpts_calling; 3033 log.u_bbr.flex8 = rack->rc_in_persist; 3034 log.u_bbr.lt_epoch = cwnd_to_use; 3035 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3036 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3037 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 3038 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 3039 log.u_bbr.pacing_gain = rack->r_must_retran; 3040 log.u_bbr.cwnd_gain = rack->rc_has_collapsed; 3041 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3042 &rack->rc_inp->inp_socket->so_rcv, 3043 &rack->rc_inp->inp_socket->so_snd, 3044 BBR_LOG_JUSTRET, 0, 3045 tlen, &log, false, &tv); 3046 } 3047 } 3048 3049 static void 3050 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32_t us_cts, 3051 struct timeval *tv, uint32_t flags_on_entry) 3052 { 3053 if (tcp_bblogging_on(rack->rc_tp)) { 3054 union tcp_log_stackspecific log; 3055 3056 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 3057 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 3058 log.u_bbr.flex1 = line; 3059 log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to; 3060 log.u_bbr.flex3 = flags_on_entry; 3061 log.u_bbr.flex4 = us_cts; 3062 if (rack->rack_no_prr) 3063 log.u_bbr.flex5 = 0; 3064 else 3065 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 3066 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 3067 log.u_bbr.flex7 = hpts_removed; 3068 log.u_bbr.flex8 = 1; 3069 log.u_bbr.applimited = rack->r_ctl.rc_hpts_flags; 3070 log.u_bbr.timeStamp = us_cts; 3071 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3072 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 3073 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 3074 log.u_bbr.pacing_gain = rack->r_must_retran; 3075 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3076 &rack->rc_inp->inp_socket->so_rcv, 3077 &rack->rc_inp->inp_socket->so_snd, 3078 BBR_LOG_TIMERCANC, 0, 3079 0, &log, false, tv); 3080 } 3081 } 3082 3083 static void 3084 rack_log_alt_to_to_cancel(struct tcp_rack *rack, 3085 uint32_t flex1, uint32_t flex2, 3086 uint32_t flex3, uint32_t flex4, 3087 uint32_t flex5, uint32_t flex6, 3088 uint16_t flex7, uint8_t mod) 3089 { 3090 if (tcp_bblogging_on(rack->rc_tp)) { 3091 union tcp_log_stackspecific log; 3092 struct timeval tv; 3093 3094 if (mod == 1) { 3095 /* No you can't use 1, its for the real to cancel */ 3096 return; 3097 } 3098 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 3099 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3100 log.u_bbr.flex1 = flex1; 3101 log.u_bbr.flex2 = flex2; 3102 log.u_bbr.flex3 = flex3; 3103 log.u_bbr.flex4 = flex4; 3104 log.u_bbr.flex5 = flex5; 3105 log.u_bbr.flex6 = flex6; 3106 log.u_bbr.flex7 = flex7; 3107 log.u_bbr.flex8 = mod; 3108 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3109 &rack->rc_inp->inp_socket->so_rcv, 3110 &rack->rc_inp->inp_socket->so_snd, 3111 BBR_LOG_TIMERCANC, 0, 3112 0, &log, false, &tv); 3113 } 3114 } 3115 3116 static void 3117 
rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers) 3118 { 3119 if (tcp_bblogging_on(rack->rc_tp)) { 3120 union tcp_log_stackspecific log; 3121 struct timeval tv; 3122 3123 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 3124 log.u_bbr.flex1 = timers; 3125 log.u_bbr.flex2 = ret; 3126 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp; 3127 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 3128 log.u_bbr.flex5 = cts; 3129 if (rack->rack_no_prr) 3130 log.u_bbr.flex6 = 0; 3131 else 3132 log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt; 3133 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 3134 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 3135 log.u_bbr.pacing_gain = rack->r_must_retran; 3136 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3137 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3138 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3139 &rack->rc_inp->inp_socket->so_rcv, 3140 &rack->rc_inp->inp_socket->so_snd, 3141 BBR_LOG_TO_PROCESS, 0, 3142 0, &log, false, &tv); 3143 } 3144 } 3145 3146 static void 3147 rack_log_to_prr(struct tcp_rack *rack, int frm, int orig_cwnd, int line) 3148 { 3149 if (tcp_bblogging_on(rack->rc_tp)) { 3150 union tcp_log_stackspecific log; 3151 struct timeval tv; 3152 3153 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 3154 log.u_bbr.flex1 = rack->r_ctl.rc_prr_out; 3155 log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs; 3156 if (rack->rack_no_prr) 3157 log.u_bbr.flex3 = 0; 3158 else 3159 log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt; 3160 log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered; 3161 log.u_bbr.flex5 = rack->r_ctl.rc_sacked; 3162 log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt; 3163 log.u_bbr.flex7 = line; 3164 log.u_bbr.flex8 = frm; 3165 log.u_bbr.pkts_out = orig_cwnd; 3166 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3167 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3168 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 3169 log.u_bbr.use_lt_bw <<= 1; 3170 log.u_bbr.use_lt_bw |= rack->r_might_revert; 3171 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3172 &rack->rc_inp->inp_socket->so_rcv, 3173 &rack->rc_inp->inp_socket->so_snd, 3174 BBR_LOG_BBRUPD, 0, 3175 0, &log, false, &tv); 3176 } 3177 } 3178 3179 #ifdef TCP_SAD_DETECTION 3180 static void 3181 rack_log_sad(struct tcp_rack *rack, int event) 3182 { 3183 if (tcp_bblogging_on(rack->rc_tp)) { 3184 union tcp_log_stackspecific log; 3185 struct timeval tv; 3186 3187 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 3188 log.u_bbr.flex1 = rack->r_ctl.sack_count; 3189 log.u_bbr.flex2 = rack->r_ctl.ack_count; 3190 log.u_bbr.flex3 = rack->r_ctl.sack_moved_extra; 3191 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; 3192 log.u_bbr.flex5 = rack->r_ctl.rc_num_maps_alloced; 3193 log.u_bbr.flex6 = tcp_sack_to_ack_thresh; 3194 log.u_bbr.pkts_out = tcp_sack_to_move_thresh; 3195 log.u_bbr.lt_epoch = (tcp_force_detection << 8); 3196 log.u_bbr.lt_epoch |= rack->do_detection; 3197 log.u_bbr.applimited = tcp_map_minimum; 3198 log.u_bbr.flex7 = rack->sack_attack_disable; 3199 log.u_bbr.flex8 = event; 3200 log.u_bbr.bbr_state = rack->rc_suspicious; 3201 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3202 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3203 log.u_bbr.delivered = tcp_sad_decay_val; 3204 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3205 &rack->rc_inp->inp_socket->so_rcv, 3206 &rack->rc_inp->inp_socket->so_snd, 3207 TCP_SAD_DETECT, 0, 3208 0, &log, false, &tv); 3209 } 3210 } 3211 #endif 3212 3213 static void 3214 rack_counter_destroy(void) 3215 { 3216 counter_u64_free(rack_total_bytes); 
3217 counter_u64_free(rack_fto_send); 3218 counter_u64_free(rack_fto_rsm_send); 3219 counter_u64_free(rack_nfto_resend); 3220 counter_u64_free(rack_hw_pace_init_fail); 3221 counter_u64_free(rack_hw_pace_lost); 3222 counter_u64_free(rack_non_fto_send); 3223 counter_u64_free(rack_extended_rfo); 3224 counter_u64_free(rack_ack_total); 3225 counter_u64_free(rack_express_sack); 3226 counter_u64_free(rack_sack_total); 3227 counter_u64_free(rack_move_none); 3228 counter_u64_free(rack_move_some); 3229 counter_u64_free(rack_sack_attacks_detected); 3230 counter_u64_free(rack_sack_attacks_reversed); 3231 counter_u64_free(rack_sack_attacks_suspect); 3232 counter_u64_free(rack_sack_used_next_merge); 3233 counter_u64_free(rack_sack_used_prev_merge); 3234 counter_u64_free(rack_tlp_tot); 3235 counter_u64_free(rack_tlp_newdata); 3236 counter_u64_free(rack_tlp_retran); 3237 counter_u64_free(rack_tlp_retran_bytes); 3238 counter_u64_free(rack_to_tot); 3239 counter_u64_free(rack_saw_enobuf); 3240 counter_u64_free(rack_saw_enobuf_hw); 3241 counter_u64_free(rack_saw_enetunreach); 3242 counter_u64_free(rack_hot_alloc); 3243 counter_u64_free(rack_to_alloc); 3244 counter_u64_free(rack_to_alloc_hard); 3245 counter_u64_free(rack_to_alloc_emerg); 3246 counter_u64_free(rack_to_alloc_limited); 3247 counter_u64_free(rack_alloc_limited_conns); 3248 counter_u64_free(rack_split_limited); 3249 counter_u64_free(rack_multi_single_eq); 3250 counter_u64_free(rack_rxt_clamps_cwnd); 3251 counter_u64_free(rack_rxt_clamps_cwnd_uniq); 3252 counter_u64_free(rack_proc_non_comp_ack); 3253 counter_u64_free(rack_sack_proc_all); 3254 counter_u64_free(rack_sack_proc_restart); 3255 counter_u64_free(rack_sack_proc_short); 3256 counter_u64_free(rack_sack_skipped_acked); 3257 counter_u64_free(rack_sack_splits); 3258 counter_u64_free(rack_input_idle_reduces); 3259 counter_u64_free(rack_collapsed_win); 3260 counter_u64_free(rack_collapsed_win_rxt); 3261 counter_u64_free(rack_collapsed_win_rxt_bytes); 3262 counter_u64_free(rack_collapsed_win_seen); 3263 counter_u64_free(rack_try_scwnd); 3264 counter_u64_free(rack_persists_sends); 3265 counter_u64_free(rack_persists_acks); 3266 counter_u64_free(rack_persists_loss); 3267 counter_u64_free(rack_persists_lost_ends); 3268 #ifdef INVARIANTS 3269 counter_u64_free(rack_adjust_map_bw); 3270 #endif 3271 COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE); 3272 COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE); 3273 } 3274 3275 static struct rack_sendmap * 3276 rack_alloc(struct tcp_rack *rack) 3277 { 3278 struct rack_sendmap *rsm; 3279 3280 /* 3281 * First get the top of the list it in 3282 * theory is the "hottest" rsm we have, 3283 * possibly just freed by ack processing. 3284 */ 3285 if (rack->rc_free_cnt > rack_free_cache) { 3286 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 3287 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 3288 counter_u64_add(rack_hot_alloc, 1); 3289 rack->rc_free_cnt--; 3290 return (rsm); 3291 } 3292 /* 3293 * Once we get under our free cache we probably 3294 * no longer have a "hot" one available. Lets 3295 * get one from UMA. 3296 */ 3297 rsm = uma_zalloc(rack_zone, M_NOWAIT); 3298 if (rsm) { 3299 rack->r_ctl.rc_num_maps_alloced++; 3300 counter_u64_add(rack_to_alloc, 1); 3301 return (rsm); 3302 } 3303 /* 3304 * Dig in to our aux rsm's (the last two) since 3305 * UMA failed to get us one. 
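 *
 * The allocation order is therefore: (1) a "hot" entry off the
 * head of the free list while we hold more than rack_free_cache
 * of them, (2) a fresh entry from UMA, and (3) as a last resort
 * one of the few entries deliberately kept back on the free
 * list, counted as an emergency allocation. Only if all three
 * fail does the caller see NULL.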
3306 */ 3307 if (rack->rc_free_cnt) { 3308 counter_u64_add(rack_to_alloc_emerg, 1); 3309 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 3310 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 3311 rack->rc_free_cnt--; 3312 return (rsm); 3313 } 3314 return (NULL); 3315 } 3316 3317 static struct rack_sendmap * 3318 rack_alloc_full_limit(struct tcp_rack *rack) 3319 { 3320 if ((V_tcp_map_entries_limit > 0) && 3321 (rack->do_detection == 0) && 3322 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 3323 counter_u64_add(rack_to_alloc_limited, 1); 3324 if (!rack->alloc_limit_reported) { 3325 rack->alloc_limit_reported = 1; 3326 counter_u64_add(rack_alloc_limited_conns, 1); 3327 } 3328 return (NULL); 3329 } 3330 return (rack_alloc(rack)); 3331 } 3332 3333 /* wrapper to allocate a sendmap entry, subject to a specific limit */ 3334 static struct rack_sendmap * 3335 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type) 3336 { 3337 struct rack_sendmap *rsm; 3338 3339 if (limit_type) { 3340 /* currently there is only one limit type */ 3341 if (rack->r_ctl.rc_split_limit > 0 && 3342 (rack->do_detection == 0) && 3343 rack->r_ctl.rc_num_split_allocs >= rack->r_ctl.rc_split_limit) { 3344 counter_u64_add(rack_split_limited, 1); 3345 if (!rack->alloc_limit_reported) { 3346 rack->alloc_limit_reported = 1; 3347 counter_u64_add(rack_alloc_limited_conns, 1); 3348 } 3349 return (NULL); 3350 #ifdef TCP_SAD_DETECTION 3351 } else if ((tcp_sad_limit != 0) && 3352 (rack->do_detection == 1) && 3353 (rack->r_ctl.rc_num_split_allocs >= tcp_sad_limit)) { 3354 counter_u64_add(rack_split_limited, 1); 3355 if (!rack->alloc_limit_reported) { 3356 rack->alloc_limit_reported = 1; 3357 counter_u64_add(rack_alloc_limited_conns, 1); 3358 } 3359 return (NULL); 3360 #endif 3361 } 3362 } 3363 3364 /* allocate and mark in the limit type, if set */ 3365 rsm = rack_alloc(rack); 3366 if (rsm != NULL && limit_type) { 3367 rsm->r_limit_type = limit_type; 3368 rack->r_ctl.rc_num_split_allocs++; 3369 } 3370 return (rsm); 3371 } 3372 3373 static void 3374 rack_free_trim(struct tcp_rack *rack) 3375 { 3376 struct rack_sendmap *rsm; 3377 3378 /* 3379 * Free up all the tail entries until 3380 * we get our list down to the limit. 
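 *
 * We trim from the tail on purpose: rack_free() pushes freshly
 * freed entries onto the head and rack_alloc() reuses them from
 * there, so the tail holds the coldest entries and is the
 * natural place to hand memory back to UMA.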
3381 */ 3382 while (rack->rc_free_cnt > rack_free_cache) { 3383 rsm = TAILQ_LAST(&rack->r_ctl.rc_free, rack_head); 3384 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 3385 rack->rc_free_cnt--; 3386 rack->r_ctl.rc_num_maps_alloced--; 3387 uma_zfree(rack_zone, rsm); 3388 } 3389 } 3390 3391 static void 3392 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) 3393 { 3394 if (rsm->r_flags & RACK_APP_LIMITED) { 3395 if (rack->r_ctl.rc_app_limited_cnt > 0) { 3396 rack->r_ctl.rc_app_limited_cnt--; 3397 } 3398 } 3399 if (rsm->r_limit_type) { 3400 /* currently there is only one limit type */ 3401 rack->r_ctl.rc_num_split_allocs--; 3402 } 3403 if (rsm == rack->r_ctl.rc_first_appl) { 3404 if (rack->r_ctl.rc_app_limited_cnt == 0) 3405 rack->r_ctl.rc_first_appl = NULL; 3406 else 3407 rack->r_ctl.rc_first_appl = tqhash_find(rack->r_ctl.tqh, rsm->r_nseq_appl); 3408 } 3409 if (rsm == rack->r_ctl.rc_resend) 3410 rack->r_ctl.rc_resend = NULL; 3411 if (rsm == rack->r_ctl.rc_end_appl) 3412 rack->r_ctl.rc_end_appl = NULL; 3413 if (rack->r_ctl.rc_tlpsend == rsm) 3414 rack->r_ctl.rc_tlpsend = NULL; 3415 if (rack->r_ctl.rc_sacklast == rsm) 3416 rack->r_ctl.rc_sacklast = NULL; 3417 memset(rsm, 0, sizeof(struct rack_sendmap)); 3418 /* Make sure we are not going to overrun our count limit of 0xff */ 3419 if ((rack->rc_free_cnt + 1) > 0xff) { 3420 rack_free_trim(rack); 3421 } 3422 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_free, rsm, r_tnext); 3423 rack->rc_free_cnt++; 3424 } 3425 3426 static uint32_t 3427 rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack) 3428 { 3429 uint64_t srtt, bw, len, tim; 3430 uint32_t segsiz, def_len, minl; 3431 3432 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 3433 def_len = rack_def_data_window * segsiz; 3434 if (rack->rc_gp_filled == 0) { 3435 /* 3436 * We have no measurement (IW is in flight?) so 3437 * we can only guess using our data_window sysctl 3438 * value (usually 20MSS). 3439 */ 3440 return (def_len); 3441 } 3442 /* 3443 * Now we have a number of factors to consider. 3444 * 3445 * 1) We have a desired BDP which is usually 3446 * at least 2. 3447 * 2) We have a minimum number of rtt's usually 1 SRTT 3448 * but we allow it too to be more. 3449 * 3) We want to make sure a measurement last N useconds (if 3450 * we have set rack_min_measure_usec. 3451 * 3452 * We handle the first concern here by trying to create a data 3453 * window of max(rack_def_data_window, DesiredBDP). The 3454 * second concern we handle in not letting the measurement 3455 * window end normally until at least the required SRTT's 3456 * have gone by which is done further below in 3457 * rack_enough_for_measurement(). Finally the third concern 3458 * we also handle here by calculating how long that time 3459 * would take at the current BW and then return the 3460 * max of our first calculation and that length. Note 3461 * that if rack_min_measure_usec is 0, we don't deal 3462 * with concern 3. Also for both Concern 1 and 3 an 3463 * application limited period could end the measurement 3464 * earlier. 3465 * 3466 * So lets calculate the BDP with the "known" b/w using 3467 * the SRTT has our rtt and then multiply it by the 3468 * goal. 
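 *
 * A purely illustrative worked example (made-up numbers, not
 * necessarily the defaults): with bw = 12,500,000 bytes/sec
 * (100Mbps), srtt = 40,000 usec, segsiz = 1448 and a goal BDP
 * of 2, the code below yields:
 *
 *	len = (12,500,000 * 40,000) / 1,000,000 = 500,000 bytes
 *	len *= 2                                = 1,000,000 bytes
 *	roundup(len, 1448)                      = 1,000,568 bytes
 *
 * i.e. a measurement window of 691 segments.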
3469 */ 3470 bw = rack_get_bw(rack); 3471 srtt = (uint64_t)tp->t_srtt; 3472 len = bw * srtt; 3473 len /= (uint64_t)HPTS_USEC_IN_SEC; 3474 len *= max(1, rack_goal_bdp); 3475 /* Now we need to round up to the nearest MSS */ 3476 len = roundup(len, segsiz); 3477 if (rack_min_measure_usec) { 3478 /* Now calculate our min length for this b/w */ 3479 tim = rack_min_measure_usec; 3480 minl = (tim * bw) / (uint64_t)HPTS_USEC_IN_SEC; 3481 if (minl == 0) 3482 minl = 1; 3483 minl = roundup(minl, segsiz); 3484 if (len < minl) 3485 len = minl; 3486 } 3487 /* 3488 * Now if we have a very small window we want 3489 * to attempt to get the window that is 3490 * as small as possible. This happens on 3491 * low b/w connections and we don't want to 3492 * span huge numbers of rtt's between measurements. 3493 * 3494 * We basically include 2 over our "MIN window" so 3495 * that the measurement can be shortened (possibly) by 3496 * an ack'ed packet. 3497 */ 3498 if (len < def_len) 3499 return (max((uint32_t)len, ((MIN_GP_WIN+2) * segsiz))); 3500 else 3501 return (max((uint32_t)len, def_len)); 3502 3503 } 3504 3505 static int 3506 rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack, uint8_t *quality) 3507 { 3508 uint32_t tim, srtts, segsiz; 3509 3510 /* 3511 * Has enough time passed for the GP measurement to be valid? 3512 */ 3513 if (SEQ_LT(th_ack, tp->gput_seq)) { 3514 /* Not enough bytes yet */ 3515 return (0); 3516 } 3517 if ((tp->snd_max == tp->snd_una) || 3518 (th_ack == tp->snd_max)){ 3519 /* 3520 * All is acked quality of all acked is 3521 * usually low or medium, but we in theory could split 3522 * all acked into two cases, where you got 3523 * a signifigant amount of your window and 3524 * where you did not. For now we leave it 3525 * but it is something to contemplate in the 3526 * future. The danger here is that delayed ack 3527 * is effecting the last byte (which is a 50:50 chance). 3528 */ 3529 *quality = RACK_QUALITY_ALLACKED; 3530 return (1); 3531 } 3532 if (SEQ_GEQ(th_ack, tp->gput_ack)) { 3533 /* 3534 * We obtained our entire window of data we wanted 3535 * no matter if we are in recovery or not then 3536 * its ok since expanding the window does not 3537 * make things fuzzy (or at least not as much). 3538 */ 3539 *quality = RACK_QUALITY_HIGH; 3540 return (1); 3541 } 3542 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 3543 if (SEQ_LT(th_ack, tp->gput_ack) && 3544 ((th_ack - tp->gput_seq) < max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { 3545 /* Not enough bytes yet */ 3546 return (0); 3547 } 3548 if (rack->r_ctl.rc_first_appl && 3549 (SEQ_GEQ(th_ack, rack->r_ctl.rc_first_appl->r_end))) { 3550 /* 3551 * We are up to the app limited send point 3552 * we have to measure irrespective of the time.. 3553 */ 3554 *quality = RACK_QUALITY_APPLIMITED; 3555 return (1); 3556 } 3557 /* Now what about time? */ 3558 srtts = (rack->r_ctl.rc_gp_srtt * rack_min_srtts); 3559 tim = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - tp->gput_ts; 3560 if ((tim >= srtts) && (IN_RECOVERY(rack->rc_tp->t_flags) == 0)) { 3561 /* 3562 * We do not allow a measurement if we are in recovery 3563 * that would shrink the goodput window we wanted. 3564 * This is to prevent cloudyness of when the last send 3565 * was actually made. 
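 *
 * Put another way, to get here on time alone at least
 * rack_min_srtts smoothed GP SRTTs must have elapsed since the
 * measurement began (tim >= srtts) and we must not currently be
 * in recovery; otherwise we fall through and keep waiting.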
3566 */ 3567 *quality = RACK_QUALITY_HIGH; 3568 return (1); 3569 } 3570 /* Nope not even a full SRTT has passed */ 3571 return (0); 3572 } 3573 3574 static void 3575 rack_log_timely(struct tcp_rack *rack, 3576 uint32_t logged, uint64_t cur_bw, uint64_t low_bnd, 3577 uint64_t up_bnd, int line, uint8_t method) 3578 { 3579 if (tcp_bblogging_on(rack->rc_tp)) { 3580 union tcp_log_stackspecific log; 3581 struct timeval tv; 3582 3583 memset(&log, 0, sizeof(log)); 3584 log.u_bbr.flex1 = logged; 3585 log.u_bbr.flex2 = rack->rc_gp_timely_inc_cnt; 3586 log.u_bbr.flex2 <<= 4; 3587 log.u_bbr.flex2 |= rack->rc_gp_timely_dec_cnt; 3588 log.u_bbr.flex2 <<= 4; 3589 log.u_bbr.flex2 |= rack->rc_gp_incr; 3590 log.u_bbr.flex2 <<= 4; 3591 log.u_bbr.flex2 |= rack->rc_gp_bwred; 3592 log.u_bbr.flex3 = rack->rc_gp_incr; 3593 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; 3594 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ca; 3595 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_rec; 3596 log.u_bbr.flex7 = rack->rc_gp_bwred; 3597 log.u_bbr.flex8 = method; 3598 log.u_bbr.cur_del_rate = cur_bw; 3599 log.u_bbr.delRate = low_bnd; 3600 log.u_bbr.bw_inuse = up_bnd; 3601 log.u_bbr.rttProp = rack_get_bw(rack); 3602 log.u_bbr.pkt_epoch = line; 3603 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; 3604 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3605 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3606 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; 3607 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; 3608 log.u_bbr.cwnd_gain = rack->rc_dragged_bottom; 3609 log.u_bbr.cwnd_gain <<= 1; 3610 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_rec; 3611 log.u_bbr.cwnd_gain <<= 1; 3612 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; 3613 log.u_bbr.cwnd_gain <<= 1; 3614 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; 3615 log.u_bbr.lost = rack->r_ctl.rc_loss_count; 3616 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3617 &rack->rc_inp->inp_socket->so_rcv, 3618 &rack->rc_inp->inp_socket->so_snd, 3619 TCP_TIMELY_WORK, 0, 3620 0, &log, false, &tv); 3621 } 3622 } 3623 3624 static int 3625 rack_bw_can_be_raised(struct tcp_rack *rack, uint64_t cur_bw, uint64_t last_bw_est, uint16_t mult) 3626 { 3627 /* 3628 * Before we increase we need to know if 3629 * the estimate just made was less than 3630 * our pacing goal (i.e. (cur_bw * mult) > last_bw_est) 3631 * 3632 * If we already are pacing at a fast enough 3633 * rate to push us faster there is no sense of 3634 * increasing. 3635 * 3636 * We first caculate our actual pacing rate (ss or ca multiplier 3637 * times our cur_bw). 3638 * 3639 * Then we take the last measured rate and multipy by our 3640 * maximum pacing overage to give us a max allowable rate. 3641 * 3642 * If our act_rate is smaller than our max_allowable rate 3643 * then we should increase. Else we should hold steady. 3644 * 3645 */ 3646 uint64_t act_rate, max_allow_rate; 3647 3648 if (rack_timely_no_stopping) 3649 return (1); 3650 3651 if ((cur_bw == 0) || (last_bw_est == 0)) { 3652 /* 3653 * Initial startup case or 3654 * everything is acked case. 3655 */ 3656 rack_log_timely(rack, mult, cur_bw, 0, 0, 3657 __LINE__, 9); 3658 return (1); 3659 } 3660 if (mult <= 100) { 3661 /* 3662 * We can always pace at or slightly above our rate. 
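 *
 * A purely illustrative example of the comparison done below
 * (made-up numbers): with cur_bw = 1,000,000 bytes/sec and
 * mult = 150 the actual pacing rate is 1,500,000 bytes/sec.
 * If last_bw_est = 1,400,000 bytes/sec and rack_max_per_above
 * is 10, the maximum allowable rate is 1,540,000 bytes/sec;
 * act_rate < max_allow_rate, so we would still raise.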
3663 */ 3664 rack_log_timely(rack, mult, cur_bw, 0, 0, 3665 __LINE__, 9); 3666 return (1); 3667 } 3668 act_rate = cur_bw * (uint64_t)mult; 3669 act_rate /= 100; 3670 max_allow_rate = last_bw_est * ((uint64_t)rack_max_per_above + (uint64_t)100); 3671 max_allow_rate /= 100; 3672 if (act_rate < max_allow_rate) { 3673 /* 3674 * Here the rate we are actually pacing at 3675 * is smaller than 10% above our last measurement. 3676 * This means we are pacing below what we would 3677 * like to try to achieve (plus some wiggle room). 3678 */ 3679 rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate, 3680 __LINE__, 9); 3681 return (1); 3682 } else { 3683 /* 3684 * Here we are already pacing at least rack_max_per_above(10%) 3685 * what we are getting back. This indicates most likely 3686 * that we are being limited (cwnd/rwnd/app) and can't 3687 * get any more b/w. There is no sense of trying to 3688 * raise up the pacing rate its not speeding us up 3689 * and we already are pacing faster than we are getting. 3690 */ 3691 rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate, 3692 __LINE__, 8); 3693 return (0); 3694 } 3695 } 3696 3697 static void 3698 rack_validate_multipliers_at_or_above100(struct tcp_rack *rack) 3699 { 3700 /* 3701 * When we drag bottom, we want to assure 3702 * that no multiplier is below 1.0, if so 3703 * we want to restore it to at least that. 3704 */ 3705 if (rack->r_ctl.rack_per_of_gp_rec < 100) { 3706 /* This is unlikely we usually do not touch recovery */ 3707 rack->r_ctl.rack_per_of_gp_rec = 100; 3708 } 3709 if (rack->r_ctl.rack_per_of_gp_ca < 100) { 3710 rack->r_ctl.rack_per_of_gp_ca = 100; 3711 } 3712 if (rack->r_ctl.rack_per_of_gp_ss < 100) { 3713 rack->r_ctl.rack_per_of_gp_ss = 100; 3714 } 3715 } 3716 3717 static void 3718 rack_validate_multipliers_at_or_below_100(struct tcp_rack *rack) 3719 { 3720 if (rack->r_ctl.rack_per_of_gp_ca > 100) { 3721 rack->r_ctl.rack_per_of_gp_ca = 100; 3722 } 3723 if (rack->r_ctl.rack_per_of_gp_ss > 100) { 3724 rack->r_ctl.rack_per_of_gp_ss = 100; 3725 } 3726 } 3727 3728 static void 3729 rack_increase_bw_mul(struct tcp_rack *rack, int timely_says, uint64_t cur_bw, uint64_t last_bw_est, int override) 3730 { 3731 int32_t calc, logged, plus; 3732 3733 logged = 0; 3734 3735 if (override) { 3736 /* 3737 * override is passed when we are 3738 * loosing b/w and making one last 3739 * gasp at trying to not loose out 3740 * to a new-reno flow. 3741 */ 3742 goto extra_boost; 3743 } 3744 /* In classic timely we boost by 5x if we have 5 increases in a row, lets not */ 3745 if (rack->rc_gp_incr && 3746 ((rack->rc_gp_timely_inc_cnt + 1) >= RACK_TIMELY_CNT_BOOST)) { 3747 /* 3748 * Reset and get 5 strokes more before the boost. Note 3749 * that the count is 0 based so we have to add one. 
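 *
 * For illustration only (assuming, hypothetically, a 1%
 * rack_gp_increase_per and a boost count of 5): an ordinary
 * pass below adds 1 percentage point to the chosen multiplier,
 * while a pass that has just completed a full run of increases
 * adds 5 and restarts the run counter.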
3750 */ 3751 extra_boost: 3752 plus = (uint32_t)rack_gp_increase_per * RACK_TIMELY_CNT_BOOST; 3753 rack->rc_gp_timely_inc_cnt = 0; 3754 } else 3755 plus = (uint32_t)rack_gp_increase_per; 3756 /* Must be at least 1% increase for true timely increases */ 3757 if ((plus < 1) && 3758 ((rack->r_ctl.rc_rtt_diff <= 0) || (timely_says <= 0))) 3759 plus = 1; 3760 if (rack->rc_gp_saw_rec && 3761 (rack->rc_gp_no_rec_chg == 0) && 3762 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 3763 rack->r_ctl.rack_per_of_gp_rec)) { 3764 /* We have been in recovery ding it too */ 3765 calc = rack->r_ctl.rack_per_of_gp_rec + plus; 3766 if (calc > 0xffff) 3767 calc = 0xffff; 3768 logged |= 1; 3769 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)calc; 3770 if (rack->r_ctl.rack_per_upper_bound_ca && 3771 (rack->rc_dragged_bottom == 0) && 3772 (rack->r_ctl.rack_per_of_gp_rec > rack->r_ctl.rack_per_upper_bound_ca)) 3773 rack->r_ctl.rack_per_of_gp_rec = rack->r_ctl.rack_per_upper_bound_ca; 3774 } 3775 if (rack->rc_gp_saw_ca && 3776 (rack->rc_gp_saw_ss == 0) && 3777 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 3778 rack->r_ctl.rack_per_of_gp_ca)) { 3779 /* In CA */ 3780 calc = rack->r_ctl.rack_per_of_gp_ca + plus; 3781 if (calc > 0xffff) 3782 calc = 0xffff; 3783 logged |= 2; 3784 rack->r_ctl.rack_per_of_gp_ca = (uint16_t)calc; 3785 if (rack->r_ctl.rack_per_upper_bound_ca && 3786 (rack->rc_dragged_bottom == 0) && 3787 (rack->r_ctl.rack_per_of_gp_ca > rack->r_ctl.rack_per_upper_bound_ca)) 3788 rack->r_ctl.rack_per_of_gp_ca = rack->r_ctl.rack_per_upper_bound_ca; 3789 } 3790 if (rack->rc_gp_saw_ss && 3791 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 3792 rack->r_ctl.rack_per_of_gp_ss)) { 3793 /* In SS */ 3794 calc = rack->r_ctl.rack_per_of_gp_ss + plus; 3795 if (calc > 0xffff) 3796 calc = 0xffff; 3797 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)calc; 3798 if (rack->r_ctl.rack_per_upper_bound_ss && 3799 (rack->rc_dragged_bottom == 0) && 3800 (rack->r_ctl.rack_per_of_gp_ss > rack->r_ctl.rack_per_upper_bound_ss)) 3801 rack->r_ctl.rack_per_of_gp_ss = rack->r_ctl.rack_per_upper_bound_ss; 3802 logged |= 4; 3803 } 3804 if (logged && 3805 (rack->rc_gp_incr == 0)){ 3806 /* Go into increment mode */ 3807 rack->rc_gp_incr = 1; 3808 rack->rc_gp_timely_inc_cnt = 0; 3809 } 3810 if (rack->rc_gp_incr && 3811 logged && 3812 (rack->rc_gp_timely_inc_cnt < RACK_TIMELY_CNT_BOOST)) { 3813 rack->rc_gp_timely_inc_cnt++; 3814 } 3815 rack_log_timely(rack, logged, plus, 0, 0, 3816 __LINE__, 1); 3817 } 3818 3819 static uint32_t 3820 rack_get_decrease(struct tcp_rack *rack, uint32_t curper, int32_t rtt_diff) 3821 { 3822 /* 3823 * norm_grad = rtt_diff / minrtt; 3824 * new_per = curper * (1 - B * norm_grad) 3825 * 3826 * B = rack_gp_decrease_per (default 10%) 3827 * rtt_dif = input var current rtt-diff 3828 * curper = input var current percentage 3829 * minrtt = from rack filter 3830 * 3831 */ 3832 uint64_t perf; 3833 3834 perf = (((uint64_t)curper * ((uint64_t)1000000 - 3835 ((uint64_t)rack_gp_decrease_per * (uint64_t)10000 * 3836 (((uint64_t)rtt_diff * (uint64_t)1000000)/ 3837 (uint64_t)get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)))/ 3838 (uint64_t)1000000)) / 3839 (uint64_t)1000000); 3840 if (perf > curper) { 3841 /* TSNH */ 3842 perf = curper - 1; 3843 } 3844 return ((uint32_t)perf); 3845 } 3846 3847 static uint32_t 3848 rack_decrease_highrtt(struct tcp_rack *rack, uint32_t curper, uint32_t rtt) 3849 { 3850 /* 3851 * highrttthresh 3852 * result = curper * (1 - (B * ( 1 - ------ )) 3853 * gp_srtt 3854 * 3855 * B = rack_gp_decrease_per (default 
10%) 3856 * highrttthresh = filter_min * rack_gp_rtt_maxmul 3857 */ 3858 uint64_t perf; 3859 uint32_t highrttthresh; 3860 3861 highrttthresh = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; 3862 3863 perf = (((uint64_t)curper * ((uint64_t)1000000 - 3864 ((uint64_t)rack_gp_decrease_per * ((uint64_t)1000000 - 3865 ((uint64_t)highrttthresh * (uint64_t)1000000) / 3866 (uint64_t)rtt)) / 100)) /(uint64_t)1000000); 3867 return (perf); 3868 } 3869 3870 static void 3871 rack_decrease_bw_mul(struct tcp_rack *rack, int timely_says, uint32_t rtt, int32_t rtt_diff) 3872 { 3873 uint64_t logvar, logvar2, logvar3; 3874 uint32_t logged, new_per, ss_red, ca_red, rec_red, alt, val; 3875 3876 if (rack->rc_gp_incr) { 3877 /* Turn off increment counting */ 3878 rack->rc_gp_incr = 0; 3879 rack->rc_gp_timely_inc_cnt = 0; 3880 } 3881 ss_red = ca_red = rec_red = 0; 3882 logged = 0; 3883 /* Calculate the reduction value */ 3884 if (rtt_diff < 0) { 3885 rtt_diff *= -1; 3886 } 3887 /* Must be at least 1% reduction */ 3888 if (rack->rc_gp_saw_rec && (rack->rc_gp_no_rec_chg == 0)) { 3889 /* We have been in recovery ding it too */ 3890 if (timely_says == 2) { 3891 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_rec, rtt); 3892 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 3893 if (alt < new_per) 3894 val = alt; 3895 else 3896 val = new_per; 3897 } else 3898 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 3899 if (rack->r_ctl.rack_per_of_gp_rec > val) { 3900 rec_red = (rack->r_ctl.rack_per_of_gp_rec - val); 3901 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)val; 3902 } else { 3903 rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; 3904 rec_red = 0; 3905 } 3906 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_rec) 3907 rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; 3908 logged |= 1; 3909 } 3910 if (rack->rc_gp_saw_ss) { 3911 /* Sent in SS */ 3912 if (timely_says == 2) { 3913 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ss, rtt); 3914 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 3915 if (alt < new_per) 3916 val = alt; 3917 else 3918 val = new_per; 3919 } else 3920 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff); 3921 if (rack->r_ctl.rack_per_of_gp_ss > new_per) { 3922 ss_red = rack->r_ctl.rack_per_of_gp_ss - val; 3923 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)val; 3924 } else { 3925 ss_red = new_per; 3926 rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; 3927 logvar = new_per; 3928 logvar <<= 32; 3929 logvar |= alt; 3930 logvar2 = (uint32_t)rtt; 3931 logvar2 <<= 32; 3932 logvar2 |= (uint32_t)rtt_diff; 3933 logvar3 = rack_gp_rtt_maxmul; 3934 logvar3 <<= 32; 3935 logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 3936 rack_log_timely(rack, timely_says, 3937 logvar2, logvar3, 3938 logvar, __LINE__, 10); 3939 } 3940 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ss) 3941 rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; 3942 logged |= 4; 3943 } else if (rack->rc_gp_saw_ca) { 3944 /* Sent in CA */ 3945 if (timely_says == 2) { 3946 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ca, rtt); 3947 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 3948 if (alt < new_per) 3949 val = alt; 3950 else 3951 val = new_per; 3952 } else 3953 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff); 3954 if (rack->r_ctl.rack_per_of_gp_ca > 
val) { 3955 ca_red = rack->r_ctl.rack_per_of_gp_ca - val; 3956 rack->r_ctl.rack_per_of_gp_ca = (uint16_t)val; 3957 } else { 3958 rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; 3959 ca_red = 0; 3960 logvar = new_per; 3961 logvar <<= 32; 3962 logvar |= alt; 3963 logvar2 = (uint32_t)rtt; 3964 logvar2 <<= 32; 3965 logvar2 |= (uint32_t)rtt_diff; 3966 logvar3 = rack_gp_rtt_maxmul; 3967 logvar3 <<= 32; 3968 logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 3969 rack_log_timely(rack, timely_says, 3970 logvar2, logvar3, 3971 logvar, __LINE__, 10); 3972 } 3973 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ca) 3974 rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; 3975 logged |= 2; 3976 } 3977 if (rack->rc_gp_timely_dec_cnt < 0x7) { 3978 rack->rc_gp_timely_dec_cnt++; 3979 if (rack_timely_dec_clear && 3980 (rack->rc_gp_timely_dec_cnt == rack_timely_dec_clear)) 3981 rack->rc_gp_timely_dec_cnt = 0; 3982 } 3983 logvar = ss_red; 3984 logvar <<= 32; 3985 logvar |= ca_red; 3986 rack_log_timely(rack, logged, rec_red, rack_per_lower_bound, logvar, 3987 __LINE__, 2); 3988 } 3989 3990 static void 3991 rack_log_rtt_shrinks(struct tcp_rack *rack, uint32_t us_cts, 3992 uint32_t rtt, uint32_t line, uint8_t reas) 3993 { 3994 if (tcp_bblogging_on(rack->rc_tp)) { 3995 union tcp_log_stackspecific log; 3996 struct timeval tv; 3997 3998 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 3999 log.u_bbr.flex1 = line; 4000 log.u_bbr.flex2 = rack->r_ctl.rc_time_probertt_starts; 4001 log.u_bbr.flex3 = rack->r_ctl.rc_lower_rtt_us_cts; 4002 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; 4003 log.u_bbr.flex5 = rtt; 4004 log.u_bbr.flex6 = rack->rc_highly_buffered; 4005 log.u_bbr.flex6 <<= 1; 4006 log.u_bbr.flex6 |= rack->forced_ack; 4007 log.u_bbr.flex6 <<= 1; 4008 log.u_bbr.flex6 |= rack->rc_gp_dyn_mul; 4009 log.u_bbr.flex6 <<= 1; 4010 log.u_bbr.flex6 |= rack->in_probe_rtt; 4011 log.u_bbr.flex6 <<= 1; 4012 log.u_bbr.flex6 |= rack->measure_saw_probe_rtt; 4013 log.u_bbr.flex7 = rack->r_ctl.rack_per_of_gp_probertt; 4014 log.u_bbr.pacing_gain = rack->r_ctl.rack_per_of_gp_ca; 4015 log.u_bbr.cwnd_gain = rack->r_ctl.rack_per_of_gp_rec; 4016 log.u_bbr.flex8 = reas; 4017 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 4018 log.u_bbr.delRate = rack_get_bw(rack); 4019 log.u_bbr.cur_del_rate = rack->r_ctl.rc_highest_us_rtt; 4020 log.u_bbr.cur_del_rate <<= 32; 4021 log.u_bbr.cur_del_rate |= rack->r_ctl.rc_lowest_us_rtt; 4022 log.u_bbr.applimited = rack->r_ctl.rc_time_probertt_entered; 4023 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; 4024 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 4025 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; 4026 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; 4027 log.u_bbr.pkt_epoch = rack->r_ctl.rc_lower_rtt_us_cts; 4028 log.u_bbr.delivered = rack->r_ctl.rc_target_probertt_flight; 4029 log.u_bbr.lost = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 4030 log.u_bbr.rttProp = us_cts; 4031 log.u_bbr.rttProp <<= 32; 4032 log.u_bbr.rttProp |= rack->r_ctl.rc_entry_gp_rtt; 4033 TCP_LOG_EVENTP(rack->rc_tp, NULL, 4034 &rack->rc_inp->inp_socket->so_rcv, 4035 &rack->rc_inp->inp_socket->so_snd, 4036 BBR_LOG_RTT_SHRINKS, 0, 4037 0, &log, false, &rack->r_ctl.act_rcv_time); 4038 } 4039 } 4040 4041 static void 4042 rack_set_prtt_target(struct tcp_rack *rack, uint32_t segsiz, uint32_t rtt) 4043 { 4044 uint64_t bwdp; 4045 4046 bwdp = rack_get_bw(rack); 4047 bwdp *= (uint64_t)rtt; 4048 bwdp /= (uint64_t)HPTS_USEC_IN_SEC; 4049 rack->r_ctl.rc_target_probertt_flight = 
roundup((uint32_t)bwdp, segsiz); 4050 if (rack->r_ctl.rc_target_probertt_flight < (segsiz * rack_timely_min_segs)) { 4051 /* 4052 * A window protocol must be able to have 4 packets 4053 * outstanding as the floor in order to function 4054 * (especially considering delayed ack :D). 4055 */ 4056 rack->r_ctl.rc_target_probertt_flight = (segsiz * rack_timely_min_segs); 4057 } 4058 } 4059 4060 static void 4061 rack_enter_probertt(struct tcp_rack *rack, uint32_t us_cts) 4062 { 4063 /** 4064 * ProbeRTT is a bit different in rack_pacing than in 4065 * BBR. It is like BBR in that it uses the lowering of 4066 * the RTT as a signal that we saw something new and 4067 * counts from there for how long between. But it is 4068 * different in that its quite simple. It does not 4069 * play with the cwnd and wait until we get down 4070 * to N segments outstanding and hold that for 4071 * 200ms. Instead it just sets the pacing reduction 4072 * rate to a set percentage (70 by default) and hold 4073 * that for a number of recent GP Srtt's. 4074 */ 4075 uint32_t segsiz; 4076 4077 if (rack->rc_gp_dyn_mul == 0) 4078 return; 4079 4080 if (rack->rc_tp->snd_max == rack->rc_tp->snd_una) { 4081 /* We are idle */ 4082 return; 4083 } 4084 if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && 4085 SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { 4086 /* 4087 * Stop the goodput now, the idea here is 4088 * that future measurements with in_probe_rtt 4089 * won't register if they are not greater so 4090 * we want to get what info (if any) is available 4091 * now. 4092 */ 4093 rack_do_goodput_measurement(rack->rc_tp, rack, 4094 rack->rc_tp->snd_una, __LINE__, 4095 RACK_QUALITY_PROBERTT); 4096 } 4097 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 4098 rack->r_ctl.rc_time_probertt_entered = us_cts; 4099 segsiz = min(ctf_fixed_maxseg(rack->rc_tp), 4100 rack->r_ctl.rc_pace_min_segs); 4101 rack->in_probe_rtt = 1; 4102 rack->measure_saw_probe_rtt = 1; 4103 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 4104 rack->r_ctl.rc_time_probertt_starts = 0; 4105 rack->r_ctl.rc_entry_gp_rtt = rack->r_ctl.rc_gp_srtt; 4106 if (rack_probertt_use_min_rtt_entry) 4107 rack_set_prtt_target(rack, segsiz, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); 4108 else 4109 rack_set_prtt_target(rack, segsiz, rack->r_ctl.rc_gp_srtt); 4110 rack_log_rtt_shrinks(rack, us_cts, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4111 __LINE__, RACK_RTTS_ENTERPROBE); 4112 } 4113 4114 static void 4115 rack_exit_probertt(struct tcp_rack *rack, uint32_t us_cts) 4116 { 4117 struct rack_sendmap *rsm; 4118 uint32_t segsiz; 4119 4120 segsiz = min(ctf_fixed_maxseg(rack->rc_tp), 4121 rack->r_ctl.rc_pace_min_segs); 4122 rack->in_probe_rtt = 0; 4123 if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && 4124 SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { 4125 /* 4126 * Stop the goodput now, the idea here is 4127 * that future measurements with in_probe_rtt 4128 * won't register if they are not greater so 4129 * we want to get what info (if any) is available 4130 * now. 4131 */ 4132 rack_do_goodput_measurement(rack->rc_tp, rack, 4133 rack->rc_tp->snd_una, __LINE__, 4134 RACK_QUALITY_PROBERTT); 4135 } else if (rack->rc_tp->t_flags & TF_GPUTINPROG) { 4136 /* 4137 * We don't have enough data to make a measurement. 4138 * So lets just stop and start here after exiting 4139 * probe-rtt. We probably are not interested in 4140 * the results anyway. 
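 * The in-progress measurement is simply abandoned here
 * (TF_GPUTINPROG is cleared below without calling
 * rack_do_goodput_measurement()), since a window cut short by
 * probe-rtt pacing would not be representative anyway.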
4141 */ 4142 rack->rc_tp->t_flags &= ~TF_GPUTINPROG; 4143 } 4144 /* 4145 * Measurements through the current snd_max are going 4146 * to be limited by the slower pacing rate. 4147 * 4148 * We need to mark these as app-limited so we 4149 * don't collapse the b/w. 4150 */ 4151 rsm = tqhash_max(rack->r_ctl.tqh); 4152 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { 4153 if (rack->r_ctl.rc_app_limited_cnt == 0) 4154 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; 4155 else { 4156 /* 4157 * Go out to the end app limited and mark 4158 * this new one as next and move the end_appl up 4159 * to this guy. 4160 */ 4161 if (rack->r_ctl.rc_end_appl) 4162 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; 4163 rack->r_ctl.rc_end_appl = rsm; 4164 } 4165 rsm->r_flags |= RACK_APP_LIMITED; 4166 rack->r_ctl.rc_app_limited_cnt++; 4167 } 4168 /* 4169 * Now, we need to examine our pacing rate multipliers. 4170 * If its under 100%, we need to kick it back up to 4171 * 100%. We also don't let it be over our "max" above 4172 * the actual rate i.e. 100% + rack_clamp_atexit_prtt. 4173 * Note setting clamp_atexit_prtt to 0 has the effect 4174 * of setting CA/SS to 100% always at exit (which is 4175 * the default behavior). 4176 */ 4177 if (rack_probertt_clear_is) { 4178 rack->rc_gp_incr = 0; 4179 rack->rc_gp_bwred = 0; 4180 rack->rc_gp_timely_inc_cnt = 0; 4181 rack->rc_gp_timely_dec_cnt = 0; 4182 } 4183 /* Do we do any clamping at exit? */ 4184 if (rack->rc_highly_buffered && rack_atexit_prtt_hbp) { 4185 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt_hbp; 4186 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt_hbp; 4187 } 4188 if ((rack->rc_highly_buffered == 0) && rack_atexit_prtt) { 4189 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt; 4190 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt; 4191 } 4192 /* 4193 * Lets set rtt_diff to 0, so that we will get a "boost" 4194 * after exiting. 4195 */ 4196 rack->r_ctl.rc_rtt_diff = 0; 4197 4198 /* Clear all flags so we start fresh */ 4199 rack->rc_tp->t_bytes_acked = 0; 4200 rack->rc_tp->t_ccv.flags &= ~CCF_ABC_SENTAWND; 4201 /* 4202 * If configured to, set the cwnd and ssthresh to 4203 * our targets. 4204 */ 4205 if (rack_probe_rtt_sets_cwnd) { 4206 uint64_t ebdp; 4207 uint32_t setto; 4208 4209 /* Set ssthresh so we get into CA once we hit our target */ 4210 if (rack_probertt_use_min_rtt_exit == 1) { 4211 /* Set to min rtt */ 4212 rack_set_prtt_target(rack, segsiz, 4213 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); 4214 } else if (rack_probertt_use_min_rtt_exit == 2) { 4215 /* Set to current gp rtt */ 4216 rack_set_prtt_target(rack, segsiz, 4217 rack->r_ctl.rc_gp_srtt); 4218 } else if (rack_probertt_use_min_rtt_exit == 3) { 4219 /* Set to entry gp rtt */ 4220 rack_set_prtt_target(rack, segsiz, 4221 rack->r_ctl.rc_entry_gp_rtt); 4222 } else { 4223 uint64_t sum; 4224 uint32_t setval; 4225 4226 sum = rack->r_ctl.rc_entry_gp_rtt; 4227 sum *= 10; 4228 sum /= (uint64_t)(max(1, rack->r_ctl.rc_gp_srtt)); 4229 if (sum >= 20) { 4230 /* 4231 * A highly buffered path needs 4232 * cwnd space for timely to work. 4233 * Lets set things up as if 4234 * we are heading back here again. 4235 */ 4236 setval = rack->r_ctl.rc_entry_gp_rtt; 4237 } else if (sum >= 15) { 4238 /* 4239 * Lets take the smaller of the 4240 * two since we are just somewhat 4241 * buffered. 
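 *
 * A made-up example of how this ratio plays out: with an entry
 * GP rtt of 30,000 usec and a current GP srtt of 20,000 usec,
 * sum = (30,000 * 10) / 20,000 = 15, so we land here and take
 * min(gp_srtt, entry_gp_rtt) = 20,000 usec as the rtt fed to
 * the probe-rtt flight target below.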
4242 */ 4243 setval = rack->r_ctl.rc_gp_srtt; 4244 if (setval > rack->r_ctl.rc_entry_gp_rtt) 4245 setval = rack->r_ctl.rc_entry_gp_rtt; 4246 } else { 4247 /* 4248 * Here we are not highly buffered 4249 * and should pick the min we can to 4250 * keep from causing loss. 4251 */ 4252 setval = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 4253 } 4254 rack_set_prtt_target(rack, segsiz, 4255 setval); 4256 } 4257 if (rack_probe_rtt_sets_cwnd > 1) { 4258 /* There is a percentage here to boost */ 4259 ebdp = rack->r_ctl.rc_target_probertt_flight; 4260 ebdp *= rack_probe_rtt_sets_cwnd; 4261 ebdp /= 100; 4262 setto = rack->r_ctl.rc_target_probertt_flight + ebdp; 4263 } else 4264 setto = rack->r_ctl.rc_target_probertt_flight; 4265 rack->rc_tp->snd_cwnd = roundup(setto, segsiz); 4266 if (rack->rc_tp->snd_cwnd < (segsiz * rack_timely_min_segs)) { 4267 /* Enforce a min */ 4268 rack->rc_tp->snd_cwnd = segsiz * rack_timely_min_segs; 4269 } 4270 /* If we set in the cwnd also set the ssthresh point so we are in CA */ 4271 rack->rc_tp->snd_ssthresh = (rack->rc_tp->snd_cwnd - 1); 4272 } 4273 rack_log_rtt_shrinks(rack, us_cts, 4274 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4275 __LINE__, RACK_RTTS_EXITPROBE); 4276 /* Clear times last so log has all the info */ 4277 rack->r_ctl.rc_probertt_sndmax_atexit = rack->rc_tp->snd_max; 4278 rack->r_ctl.rc_time_probertt_entered = us_cts; 4279 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 4280 rack->r_ctl.rc_time_of_last_probertt = us_cts; 4281 } 4282 4283 static void 4284 rack_check_probe_rtt(struct tcp_rack *rack, uint32_t us_cts) 4285 { 4286 /* Check in on probe-rtt */ 4287 if (rack->rc_gp_filled == 0) { 4288 /* We do not do p-rtt unless we have gp measurements */ 4289 return; 4290 } 4291 if (rack->in_probe_rtt) { 4292 uint64_t no_overflow; 4293 uint32_t endtime, must_stay; 4294 4295 if (rack->r_ctl.rc_went_idle_time && 4296 ((us_cts - rack->r_ctl.rc_went_idle_time) > rack_min_probertt_hold)) { 4297 /* 4298 * We went idle during prtt, just exit now. 4299 */ 4300 rack_exit_probertt(rack, us_cts); 4301 } else if (rack_probe_rtt_safety_val && 4302 TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered) && 4303 ((us_cts - rack->r_ctl.rc_time_probertt_entered) > rack_probe_rtt_safety_val)) { 4304 /* 4305 * Probe RTT safety value triggered! 4306 */ 4307 rack_log_rtt_shrinks(rack, us_cts, 4308 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4309 __LINE__, RACK_RTTS_SAFETY); 4310 rack_exit_probertt(rack, us_cts); 4311 } 4312 /* Calculate the max we will wait */ 4313 endtime = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_max_drain_wait); 4314 if (rack->rc_highly_buffered) 4315 endtime += (rack->r_ctl.rc_gp_srtt * rack_max_drain_hbp); 4316 /* Calculate the min we must wait */ 4317 must_stay = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_must_drain); 4318 if ((ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.rc_target_probertt_flight) && 4319 TSTMP_LT(us_cts, endtime)) { 4320 uint32_t calc; 4321 /* Do we lower more? 
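 * While inflight is still above the probe-rtt target and the
 * drain deadline has not passed, we recompute the pacing
 * percentage as the base rack_per_of_gp_probertt minus
 * rack_per_of_gp_probertt_reduce for every whole gp_srtt we
 * have spent in probe-rtt so far, floored at
 * rack_per_of_gp_lowthresh; the longer the pipe refuses to
 * drain, the harder we slow down.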
*/ 4322 no_exit: 4323 if (TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered)) 4324 calc = us_cts - rack->r_ctl.rc_time_probertt_entered; 4325 else 4326 calc = 0; 4327 calc /= max(rack->r_ctl.rc_gp_srtt, 1); 4328 if (calc) { 4329 /* Maybe */ 4330 calc *= rack_per_of_gp_probertt_reduce; 4331 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc; 4332 /* Limit it too */ 4333 if (rack->r_ctl.rack_per_of_gp_probertt < rack_per_of_gp_lowthresh) 4334 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh; 4335 } 4336 /* We must reach target or the time set */ 4337 return; 4338 } 4339 if (rack->r_ctl.rc_time_probertt_starts == 0) { 4340 if ((TSTMP_LT(us_cts, must_stay) && 4341 rack->rc_highly_buffered) || 4342 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > 4343 rack->r_ctl.rc_target_probertt_flight)) { 4344 /* We are not past the must_stay time */ 4345 goto no_exit; 4346 } 4347 rack_log_rtt_shrinks(rack, us_cts, 4348 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4349 __LINE__, RACK_RTTS_REACHTARGET); 4350 rack->r_ctl.rc_time_probertt_starts = us_cts; 4351 if (rack->r_ctl.rc_time_probertt_starts == 0) 4352 rack->r_ctl.rc_time_probertt_starts = 1; 4353 /* Restore back to our rate we want to pace at in prtt */ 4354 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 4355 } 4356 /* 4357 * Setup our end time, some number of gp_srtts plus 200ms. 4358 */ 4359 no_overflow = ((uint64_t)rack->r_ctl.rc_gp_srtt * 4360 (uint64_t)rack_probertt_gpsrtt_cnt_mul); 4361 if (rack_probertt_gpsrtt_cnt_div) 4362 endtime = (uint32_t)(no_overflow / (uint64_t)rack_probertt_gpsrtt_cnt_div); 4363 else 4364 endtime = 0; 4365 endtime += rack_min_probertt_hold; 4366 endtime += rack->r_ctl.rc_time_probertt_starts; 4367 if (TSTMP_GEQ(us_cts, endtime)) { 4368 /* yes, exit probertt */ 4369 rack_exit_probertt(rack, us_cts); 4370 } 4371 4372 } else if ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt) { 4373 /* Go into probertt, its been too long since we went lower */ 4374 rack_enter_probertt(rack, us_cts); 4375 } 4376 } 4377 4378 static void 4379 rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last_bw_est, 4380 uint32_t rtt, int32_t rtt_diff) 4381 { 4382 uint64_t cur_bw, up_bnd, low_bnd, subfr; 4383 uint32_t losses; 4384 4385 if ((rack->rc_gp_dyn_mul == 0) || 4386 (rack->use_fixed_rate) || 4387 (rack->in_probe_rtt) || 4388 (rack->rc_always_pace == 0)) { 4389 /* No dynamic GP multiplier in play */ 4390 return; 4391 } 4392 losses = rack->r_ctl.rc_loss_count - rack->r_ctl.rc_loss_at_start; 4393 cur_bw = rack_get_bw(rack); 4394 /* Calculate our up and down range */ 4395 up_bnd = rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_up; 4396 up_bnd /= 100; 4397 up_bnd += rack->r_ctl.last_gp_comp_bw; 4398 4399 subfr = (uint64_t)rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_down; 4400 subfr /= 100; 4401 low_bnd = rack->r_ctl.last_gp_comp_bw - subfr; 4402 if ((timely_says == 2) && (rack->r_ctl.rc_no_push_at_mrtt)) { 4403 /* 4404 * This is the case where our RTT is above 4405 * the max target and we have been configured 4406 * to just do timely no bonus up stuff in that case. 4407 * 4408 * There are two configurations, set to 1, and we 4409 * just do timely if we are over our max. If its 4410 * set above 1 then we slam the multipliers down 4411 * to 100 and then decrement per timely. 
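 *
 * For reference, the bounds computed above work out like this
 * (illustrative numbers, not defaults): with last_gp_comp_bw =
 * 1,000,000 bytes/sec, rack_gp_per_bw_mul_up = 2 and
 * rack_gp_per_bw_mul_down = 4, up_bnd = 1,020,000 and low_bnd =
 * 960,000 bytes/sec. Broadly, a new estimate below low_bnd or
 * above up_bnd drives the decrease/increase branches below,
 * while one inside the band defers to the timely (rtt based)
 * decision.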
4412 */ 4413 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 4414 __LINE__, 3); 4415 if (rack->r_ctl.rc_no_push_at_mrtt > 1) 4416 rack_validate_multipliers_at_or_below_100(rack); 4417 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); 4418 } else if ((timely_says != 0) && (last_bw_est < low_bnd) && !losses) { 4419 /* 4420 * We are decreasing this is a bit complicated this 4421 * means we are loosing ground. This could be 4422 * because another flow entered and we are competing 4423 * for b/w with it. This will push the RTT up which 4424 * makes timely unusable unless we want to get shoved 4425 * into a corner and just be backed off (the age 4426 * old problem with delay based CC). 4427 * 4428 * On the other hand if it was a route change we 4429 * would like to stay somewhat contained and not 4430 * blow out the buffers. 4431 */ 4432 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 4433 __LINE__, 3); 4434 rack->r_ctl.last_gp_comp_bw = cur_bw; 4435 if (rack->rc_gp_bwred == 0) { 4436 /* Go into reduction counting */ 4437 rack->rc_gp_bwred = 1; 4438 rack->rc_gp_timely_dec_cnt = 0; 4439 } 4440 if (rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) { 4441 /* 4442 * Push another time with a faster pacing 4443 * to try to gain back (we include override to 4444 * get a full raise factor). 4445 */ 4446 if ((rack->rc_gp_saw_ca && rack->r_ctl.rack_per_of_gp_ca <= rack_down_raise_thresh) || 4447 (rack->rc_gp_saw_ss && rack->r_ctl.rack_per_of_gp_ss <= rack_down_raise_thresh) || 4448 (timely_says == 0) || 4449 (rack_down_raise_thresh == 0)) { 4450 /* 4451 * Do an override up in b/w if we were 4452 * below the threshold or if the threshold 4453 * is zero we always do the raise. 4454 */ 4455 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 1); 4456 } else { 4457 /* Log it stays the same */ 4458 rack_log_timely(rack, 0, last_bw_est, low_bnd, 0, 4459 __LINE__, 11); 4460 } 4461 rack->rc_gp_timely_dec_cnt++; 4462 /* We are not incrementing really no-count */ 4463 rack->rc_gp_incr = 0; 4464 rack->rc_gp_timely_inc_cnt = 0; 4465 } else { 4466 /* 4467 * Lets just use the RTT 4468 * information and give up 4469 * pushing. 4470 */ 4471 goto use_timely; 4472 } 4473 } else if ((timely_says != 2) && 4474 !losses && 4475 (last_bw_est > up_bnd)) { 4476 /* 4477 * We are increasing b/w lets keep going, updating 4478 * our b/w and ignoring any timely input, unless 4479 * of course we are at our max raise (if there is one). 4480 */ 4481 4482 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 4483 __LINE__, 3); 4484 rack->r_ctl.last_gp_comp_bw = cur_bw; 4485 if (rack->rc_gp_saw_ss && 4486 rack->r_ctl.rack_per_upper_bound_ss && 4487 (rack->r_ctl.rack_per_of_gp_ss == rack->r_ctl.rack_per_upper_bound_ss)) { 4488 /* 4489 * In cases where we can't go higher 4490 * we should just use timely. 4491 */ 4492 goto use_timely; 4493 } 4494 if (rack->rc_gp_saw_ca && 4495 rack->r_ctl.rack_per_upper_bound_ca && 4496 (rack->r_ctl.rack_per_of_gp_ca == rack->r_ctl.rack_per_upper_bound_ca)) { 4497 /* 4498 * In cases where we can't go higher 4499 * we should just use timely. 
4500 */ 4501 goto use_timely; 4502 } 4503 rack->rc_gp_bwred = 0; 4504 rack->rc_gp_timely_dec_cnt = 0; 4505 /* You get a set number of pushes if timely is trying to reduce */ 4506 if ((rack->rc_gp_incr < rack_timely_max_push_rise) || (timely_says == 0)) { 4507 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 4508 } else { 4509 /* Log it stays the same */ 4510 rack_log_timely(rack, 0, last_bw_est, up_bnd, 0, 4511 __LINE__, 12); 4512 } 4513 return; 4514 } else { 4515 /* 4516 * We are staying between the lower and upper range bounds 4517 * so use timely to decide. 4518 */ 4519 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 4520 __LINE__, 3); 4521 use_timely: 4522 if (timely_says) { 4523 rack->rc_gp_incr = 0; 4524 rack->rc_gp_timely_inc_cnt = 0; 4525 if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) && 4526 !losses && 4527 (last_bw_est < low_bnd)) { 4528 /* We are loosing ground */ 4529 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 4530 rack->rc_gp_timely_dec_cnt++; 4531 /* We are not incrementing really no-count */ 4532 rack->rc_gp_incr = 0; 4533 rack->rc_gp_timely_inc_cnt = 0; 4534 } else 4535 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); 4536 } else { 4537 rack->rc_gp_bwred = 0; 4538 rack->rc_gp_timely_dec_cnt = 0; 4539 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 4540 } 4541 } 4542 } 4543 4544 static int32_t 4545 rack_make_timely_judgement(struct tcp_rack *rack, uint32_t rtt, int32_t rtt_diff, uint32_t prev_rtt) 4546 { 4547 int32_t timely_says; 4548 uint64_t log_mult, log_rtt_a_diff; 4549 4550 log_rtt_a_diff = rtt; 4551 log_rtt_a_diff <<= 32; 4552 log_rtt_a_diff |= (uint32_t)rtt_diff; 4553 if (rtt >= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * 4554 rack_gp_rtt_maxmul)) { 4555 /* Reduce the b/w multiplier */ 4556 timely_says = 2; 4557 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; 4558 log_mult <<= 32; 4559 log_mult |= prev_rtt; 4560 rack_log_timely(rack, timely_says, log_mult, 4561 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4562 log_rtt_a_diff, __LINE__, 4); 4563 } else if (rtt <= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) + 4564 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) / 4565 max(rack_gp_rtt_mindiv , 1)))) { 4566 /* Increase the b/w multiplier */ 4567 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) + 4568 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) / 4569 max(rack_gp_rtt_mindiv , 1)); 4570 log_mult <<= 32; 4571 log_mult |= prev_rtt; 4572 timely_says = 0; 4573 rack_log_timely(rack, timely_says, log_mult , 4574 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4575 log_rtt_a_diff, __LINE__, 5); 4576 } else { 4577 /* 4578 * Use a gradient to find it the timely gradient 4579 * is: 4580 * grad = rc_rtt_diff / min_rtt; 4581 * 4582 * anything below or equal to 0 will be 4583 * a increase indication. Anything above 4584 * zero is a decrease. Note we take care 4585 * of the actual gradient calculation 4586 * in the reduction (its not needed for 4587 * increase). 
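 *
 * A made-up example: with rc_rtt_diff = +2,000 usec and a
 * filtered min_rtt of 20,000 usec the gradient is 0.1, so we
 * say "reduce" here and the decrease path (rack_get_decrease()
 * above) then trims the multiplier by roughly B * 0.1, about
 * 1% of its current value with a 10% B.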
4588 */ 4589 log_mult = prev_rtt; 4590 if (rtt_diff <= 0) { 4591 /* 4592 * Rttdiff is less than zero, increase the 4593 * b/w multiplier (its 0 or negative) 4594 */ 4595 timely_says = 0; 4596 rack_log_timely(rack, timely_says, log_mult, 4597 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 6); 4598 } else { 4599 /* Reduce the b/w multiplier */ 4600 timely_says = 1; 4601 rack_log_timely(rack, timely_says, log_mult, 4602 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 7); 4603 } 4604 } 4605 return (timely_says); 4606 } 4607 4608 static __inline int 4609 rack_in_gp_window(struct tcpcb *tp, struct rack_sendmap *rsm) 4610 { 4611 if (SEQ_GEQ(rsm->r_start, tp->gput_seq) && 4612 SEQ_LEQ(rsm->r_end, tp->gput_ack)) { 4613 /** 4614 * This covers the case that the 4615 * resent is completely inside 4616 * the gp range or up to it. 4617 * |----------------| 4618 * |-----| <or> 4619 * |----| 4620 * <or> |---| 4621 */ 4622 return (1); 4623 } else if (SEQ_LT(rsm->r_start, tp->gput_seq) && 4624 SEQ_GT(rsm->r_end, tp->gput_seq)){ 4625 /** 4626 * This covers the case of 4627 * |--------------| 4628 * |-------->| 4629 */ 4630 return (1); 4631 } else if (SEQ_GEQ(rsm->r_start, tp->gput_seq) && 4632 SEQ_LT(rsm->r_start, tp->gput_ack) && 4633 SEQ_GEQ(rsm->r_end, tp->gput_ack)) { 4634 4635 /** 4636 * This covers the case of 4637 * |--------------| 4638 * |-------->| 4639 */ 4640 return (1); 4641 } 4642 return (0); 4643 } 4644 4645 static __inline void 4646 rack_mark_in_gp_win(struct tcpcb *tp, struct rack_sendmap *rsm) 4647 { 4648 4649 if ((tp->t_flags & TF_GPUTINPROG) == 0) 4650 return; 4651 /* 4652 * We have a Goodput measurement in progress. Mark 4653 * the send if its within the window. If its not 4654 * in the window make sure it does not have the mark. 4655 */ 4656 if (rack_in_gp_window(tp, rsm)) 4657 rsm->r_flags |= RACK_IN_GP_WIN; 4658 else 4659 rsm->r_flags &= ~RACK_IN_GP_WIN; 4660 } 4661 4662 static __inline void 4663 rack_clear_gp_marks(struct tcpcb *tp, struct tcp_rack *rack) 4664 { 4665 /* A GP measurement is ending, clear all marks on the send map*/ 4666 struct rack_sendmap *rsm = NULL; 4667 4668 rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq); 4669 if (rsm == NULL) { 4670 rsm = tqhash_min(rack->r_ctl.tqh); 4671 } 4672 /* Nothing left? */ 4673 while ((rsm != NULL) && (SEQ_GEQ(tp->gput_ack, rsm->r_start))){ 4674 rsm->r_flags &= ~RACK_IN_GP_WIN; 4675 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 4676 } 4677 } 4678 4679 4680 static __inline void 4681 rack_tend_gp_marks(struct tcpcb *tp, struct tcp_rack *rack) 4682 { 4683 struct rack_sendmap *rsm = NULL; 4684 4685 if (tp->snd_una == tp->snd_max) { 4686 /* Nothing outstanding yet, nothing to do here */ 4687 return; 4688 } 4689 if (SEQ_GT(tp->gput_seq, tp->snd_una)) { 4690 /* 4691 * We are measuring ahead of some outstanding 4692 * data. We need to walk through up until we get 4693 * to gp_seq marking so that no rsm is set incorrectly 4694 * with RACK_IN_GP_WIN. 4695 */ 4696 rsm = tqhash_min(rack->r_ctl.tqh); 4697 while (rsm != NULL) { 4698 rack_mark_in_gp_win(tp, rsm); 4699 if (SEQ_GEQ(rsm->r_end, tp->gput_seq)) 4700 break; 4701 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 4702 } 4703 } 4704 if (rsm == NULL) { 4705 /* 4706 * Need to find the GP seq, if rsm is 4707 * set we stopped as we hit it. 
4708 */ 4709 rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq); 4710 if (rsm == NULL) 4711 return; 4712 rack_mark_in_gp_win(tp, rsm); 4713 } 4714 /* 4715 * Now we may need to mark already sent rsm, ahead of 4716 * gput_seq in the window since they may have been sent 4717 * *before* we started our measurment. The rsm, if non-null 4718 * has been marked (note if rsm would have been NULL we would have 4719 * returned in the previous block). So we go to the next, and continue 4720 * until we run out of entries or we exceed the gp_ack value. 4721 */ 4722 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 4723 while (rsm) { 4724 rack_mark_in_gp_win(tp, rsm); 4725 if (SEQ_GT(rsm->r_end, tp->gput_ack)) 4726 break; 4727 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 4728 } 4729 } 4730 4731 static void 4732 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, 4733 tcp_seq th_ack, int line, uint8_t quality) 4734 { 4735 uint64_t tim, bytes_ps, stim, utim; 4736 uint32_t segsiz, bytes, reqbytes, us_cts; 4737 int32_t gput, new_rtt_diff, timely_says; 4738 uint64_t resid_bw, subpart = 0, addpart = 0, srtt; 4739 int did_add = 0; 4740 4741 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 4742 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 4743 if (TSTMP_GEQ(us_cts, tp->gput_ts)) 4744 tim = us_cts - tp->gput_ts; 4745 else 4746 tim = 0; 4747 if (rack->r_ctl.rc_gp_cumack_ts > rack->r_ctl.rc_gp_output_ts) 4748 stim = rack->r_ctl.rc_gp_cumack_ts - rack->r_ctl.rc_gp_output_ts; 4749 else 4750 stim = 0; 4751 /* 4752 * Use the larger of the send time or ack time. This prevents us 4753 * from being influenced by ack artifacts to come up with too 4754 * high of measurement. Note that since we are spanning over many more 4755 * bytes in most of our measurements hopefully that is less likely to 4756 * occur. 4757 */ 4758 if (tim > stim) 4759 utim = max(tim, 1); 4760 else 4761 utim = max(stim, 1); 4762 reqbytes = min(rc_init_window(rack), (MIN_GP_WIN * segsiz)); 4763 rack_log_gpset(rack, th_ack, us_cts, rack->r_ctl.rc_gp_cumack_ts, __LINE__, 3, NULL); 4764 if ((tim == 0) && (stim == 0)) { 4765 /* 4766 * Invalid measurement time, maybe 4767 * all on one ack/one send? 4768 */ 4769 bytes = 0; 4770 bytes_ps = 0; 4771 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4772 0, 0, 0, 10, __LINE__, NULL, quality); 4773 goto skip_measurement; 4774 } 4775 if (rack->r_ctl.rc_gp_lowrtt == 0xffffffff) { 4776 /* We never made a us_rtt measurement? */ 4777 bytes = 0; 4778 bytes_ps = 0; 4779 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4780 0, 0, 0, 10, __LINE__, NULL, quality); 4781 goto skip_measurement; 4782 } 4783 /* 4784 * Calculate the maximum possible b/w this connection 4785 * could have. We base our calculation on the lowest 4786 * rtt we have seen during the measurement and the 4787 * largest rwnd the client has given us in that time. This 4788 * forms a BDP that is the maximum that we could ever 4789 * get to the client. Anything larger is not valid. 4790 * 4791 * I originally had code here that rejected measurements 4792 * where the time was less than 1/2 the latest us_rtt. 4793 * But after thinking on that I realized its wrong since 4794 * say you had a 150Mbps or even 1Gbps link, and you 4795 * were a long way away.. example I am in Europe (100ms rtt) 4796 * talking to my 1Gbps link in S.C. Now measuring say 150,000 4797 * bytes my time would be 1.2ms, and yet my rtt would say 4798 * the measurement was invalid the time was < 50ms. The 4799 * same thing is true for 150Mb (8ms of time). 
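 *
 * (Worked numbers for the example above, added for clarity: 150,000 bytes
 * at 1Gbps is 150000 * 8 / 1e9, roughly 1.2ms of wire time, and at 150Mbps
 * about 8ms; both are far below half of a 100ms RTT, so a "time < rtt/2"
 * test would wrongly reject perfectly good measurements.)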
4800 * 4801 * A better way I realized is to look at what the maximum 4802 * the connection could possibly do. This is gated on 4803 * the lowest RTT we have seen and the highest rwnd. 4804 * We should in theory never exceed that, if we are 4805 * then something on the path is storing up packets 4806 * and then feeding them all at once to our endpoint 4807 * messing up our measurement. 4808 */ 4809 rack->r_ctl.last_max_bw = rack->r_ctl.rc_gp_high_rwnd; 4810 rack->r_ctl.last_max_bw *= HPTS_USEC_IN_SEC; 4811 rack->r_ctl.last_max_bw /= rack->r_ctl.rc_gp_lowrtt; 4812 if (SEQ_LT(th_ack, tp->gput_seq)) { 4813 /* No measurement can be made */ 4814 bytes = 0; 4815 bytes_ps = 0; 4816 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4817 0, 0, 0, 10, __LINE__, NULL, quality); 4818 goto skip_measurement; 4819 } else 4820 bytes = (th_ack - tp->gput_seq); 4821 bytes_ps = (uint64_t)bytes; 4822 /* 4823 * Don't measure a b/w for pacing unless we have gotten at least 4824 * an initial windows worth of data in this measurement interval. 4825 * 4826 * Small numbers of bytes get badly influenced by delayed ack and 4827 * other artifacts. Note we take the initial window or our 4828 * defined minimum GP (defaulting to 10 which hopefully is the 4829 * IW). 4830 */ 4831 if (rack->rc_gp_filled == 0) { 4832 /* 4833 * The initial estimate is special. We 4834 * have blasted out an IW worth of packets 4835 * without a real valid ack ts results. We 4836 * then setup the app_limited_needs_set flag, 4837 * this should get the first ack in (probably 2 4838 * MSS worth) to be recorded as the timestamp. 4839 * We thus allow a smaller number of bytes i.e. 4840 * IW - 2MSS. 4841 */ 4842 reqbytes -= (2 * segsiz); 4843 /* Also lets fill previous for our first measurement to be neutral */ 4844 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; 4845 } 4846 if ((bytes_ps < reqbytes) || rack->app_limited_needs_set) { 4847 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4848 rack->r_ctl.rc_app_limited_cnt, 4849 0, 0, 10, __LINE__, NULL, quality); 4850 goto skip_measurement; 4851 } 4852 /* 4853 * We now need to calculate the Timely like status so 4854 * we can update (possibly) the b/w multipliers. 4855 */ 4856 new_rtt_diff = (int32_t)rack->r_ctl.rc_gp_srtt - (int32_t)rack->r_ctl.rc_prev_gp_srtt; 4857 if (rack->rc_gp_filled == 0) { 4858 /* No previous reading */ 4859 rack->r_ctl.rc_rtt_diff = new_rtt_diff; 4860 } else { 4861 if (rack->measure_saw_probe_rtt == 0) { 4862 /* 4863 * We don't want a probertt to be counted 4864 * since it will be negative incorrectly. We 4865 * expect to be reducing the RTT when we 4866 * pace at a slower rate. 4867 */ 4868 rack->r_ctl.rc_rtt_diff -= (rack->r_ctl.rc_rtt_diff / 8); 4869 rack->r_ctl.rc_rtt_diff += (new_rtt_diff / 8); 4870 } 4871 } 4872 timely_says = rack_make_timely_judgement(rack, 4873 rack->r_ctl.rc_gp_srtt, 4874 rack->r_ctl.rc_rtt_diff, 4875 rack->r_ctl.rc_prev_gp_srtt 4876 ); 4877 bytes_ps *= HPTS_USEC_IN_SEC; 4878 bytes_ps /= utim; 4879 if (bytes_ps > rack->r_ctl.last_max_bw) { 4880 /* 4881 * Something is on path playing 4882 * since this b/w is not possible based 4883 * on our BDP (highest rwnd and lowest rtt 4884 * we saw in the measurement window). 4885 * 4886 * Another option here would be to 4887 * instead skip the measurement. 
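 *
 * (Illustrative cap, added for clarity: with rc_gp_high_rwnd = 1,000,000
 * bytes and rc_gp_lowrtt = 10,000 us, last_max_bw computed above is
 * 1000000 * 1000000 / 10000 = 100,000,000 bytes/sec, and any bytes_ps
 * larger than that is clamped to it just below.)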
4888 */ 4889 rack_log_pacing_delay_calc(rack, bytes, reqbytes, 4890 bytes_ps, rack->r_ctl.last_max_bw, 0, 4891 11, __LINE__, NULL, quality); 4892 bytes_ps = rack->r_ctl.last_max_bw; 4893 } 4894 /* We store gp for b/w in bytes per second */ 4895 if (rack->rc_gp_filled == 0) { 4896 /* Initial measurement */ 4897 if (bytes_ps) { 4898 rack->r_ctl.gp_bw = bytes_ps; 4899 rack->rc_gp_filled = 1; 4900 rack->r_ctl.num_measurements = 1; 4901 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 4902 } else { 4903 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4904 rack->r_ctl.rc_app_limited_cnt, 4905 0, 0, 10, __LINE__, NULL, quality); 4906 } 4907 if (tcp_in_hpts(rack->rc_tp) && 4908 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 4909 /* 4910 * Ok we can't trust the pacer in this case 4911 * where we transition from un-paced to paced. 4912 * Or for that matter when the burst mitigation 4913 * was making a wild guess and got it wrong. 4914 * Stop the pacer and clear up all the aggregate 4915 * delays etc. 4916 */ 4917 tcp_hpts_remove(rack->rc_tp); 4918 rack->r_ctl.rc_hpts_flags = 0; 4919 rack->r_ctl.rc_last_output_to = 0; 4920 } 4921 did_add = 2; 4922 } else if (rack->r_ctl.num_measurements < RACK_REQ_AVG) { 4923 /* Still a small number, run an average */ 4924 rack->r_ctl.gp_bw += bytes_ps; 4925 addpart = rack->r_ctl.num_measurements; 4926 rack->r_ctl.num_measurements++; 4927 if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) { 4928 /* We have collected enough to move forward */ 4929 rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_measurements; 4930 } 4931 rack_set_pace_segments(tp, rack, __LINE__, NULL); 4932 did_add = 3; 4933 } else { 4934 /* 4935 * We want to take 1/wma of the goodput and add it to 7/8th 4936 * of the old value, weighted by the srtt. So if your measurement 4937 * period is say 2 SRTT's long you would get 1/4 as the 4938 * value, if it was like 1/2 SRTT then you would get 1/16th. 4939 * 4940 * But we must be careful not to take too much i.e. if the 4941 * srtt is say 20ms and the measurement is taken over 4942 * 400ms our weight would be 400/20 i.e. 20. On the 4943 * other hand if we get a measurement over 1ms with a 4944 * 10ms rtt we only want to take a much smaller portion. 4945 */ 4946 if (rack->r_ctl.num_measurements < 0xff) { 4947 rack->r_ctl.num_measurements++; 4948 } 4949 srtt = (uint64_t)tp->t_srtt; 4950 if (srtt == 0) { 4951 /* 4952 * Strange, why did t_srtt go back to zero? 4953 */ 4954 if (rack->r_ctl.rc_rack_min_rtt) 4955 srtt = rack->r_ctl.rc_rack_min_rtt; 4956 else 4957 srtt = HPTS_USEC_IN_MSEC; 4958 } 4959 /* 4960 * XXXrrs: Note for reviewers, in playing with 4961 * dynamic pacing I discovered this GP calculation 4962 * as done originally leads to some undesired results. 4963 * Basically you can get longer measurements contributing 4964 * too much to the WMA. Thus I changed it so that, if you are doing 4965 * dynamic adjustments, we only do the apportioned adjustment 4966 * if we have a very small (time wise) measurement. Longer 4967 * measurements just get their weight (defaulting to 1/8) 4968 * added to the WMA. We may want to think about changing 4969 * this to always do that for both sides i.e. dynamic 4970 * and non-dynamic... but considering lots of folks 4971 * were playing with this I did not want to change the 4972 * calculation per se without your thoughts.. Lawrence? 4973 * Peter??
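 *
 * (Illustrative arithmetic, added for clarity and not part of the original
 * note: with srtt = 20ms a 40ms sample gets weight utim/(srtt*8) =
 * 40/160 = 1/4 on the non-dynamic path, while a 400ms sample would compute
 * 2.5 and is capped at 1/2; the dynamic path below instead scales short
 * samples by utim/(srtt * rack_wma_divisor) and gives longer ones a fixed
 * 1/rack_wma_divisor share.)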
4974 */ 4975 if (rack->rc_gp_dyn_mul == 0) { 4976 subpart = rack->r_ctl.gp_bw * utim; 4977 subpart /= (srtt * 8); 4978 if (subpart < (rack->r_ctl.gp_bw / 2)) { 4979 /* 4980 * The b/w update takes no more 4981 * away then 1/2 our running total 4982 * so factor it in. 4983 */ 4984 addpart = bytes_ps * utim; 4985 addpart /= (srtt * 8); 4986 } else { 4987 /* 4988 * Don't allow a single measurement 4989 * to account for more than 1/2 of the 4990 * WMA. This could happen on a retransmission 4991 * where utim becomes huge compared to 4992 * srtt (multiple retransmissions when using 4993 * the sending rate which factors in all the 4994 * transmissions from the first one). 4995 */ 4996 subpart = rack->r_ctl.gp_bw / 2; 4997 addpart = bytes_ps / 2; 4998 } 4999 resid_bw = rack->r_ctl.gp_bw - subpart; 5000 rack->r_ctl.gp_bw = resid_bw + addpart; 5001 did_add = 1; 5002 } else { 5003 if ((utim / srtt) <= 1) { 5004 /* 5005 * The b/w update was over a small period 5006 * of time. The idea here is to prevent a small 5007 * measurement time period from counting 5008 * too much. So we scale it based on the 5009 * time so it attributes less than 1/rack_wma_divisor 5010 * of its measurement. 5011 */ 5012 subpart = rack->r_ctl.gp_bw * utim; 5013 subpart /= (srtt * rack_wma_divisor); 5014 addpart = bytes_ps * utim; 5015 addpart /= (srtt * rack_wma_divisor); 5016 } else { 5017 /* 5018 * The scaled measurement was long 5019 * enough so lets just add in the 5020 * portion of the measurement i.e. 1/rack_wma_divisor 5021 */ 5022 subpart = rack->r_ctl.gp_bw / rack_wma_divisor; 5023 addpart = bytes_ps / rack_wma_divisor; 5024 } 5025 if ((rack->measure_saw_probe_rtt == 0) || 5026 (bytes_ps > rack->r_ctl.gp_bw)) { 5027 /* 5028 * For probe-rtt we only add it in 5029 * if its larger, all others we just 5030 * add in. 5031 */ 5032 did_add = 1; 5033 resid_bw = rack->r_ctl.gp_bw - subpart; 5034 rack->r_ctl.gp_bw = resid_bw + addpart; 5035 } 5036 } 5037 rack_set_pace_segments(tp, rack, __LINE__, NULL); 5038 } 5039 if ((rack->gp_ready == 0) && 5040 (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) { 5041 /* We have enough measurements now */ 5042 rack->gp_ready = 1; 5043 if ((rack->rc_always_pace && (rack->use_fixed_rate == 0)) || 5044 rack->rack_hibeta) 5045 rack_set_cc_pacing(rack); 5046 if (rack->defer_options) 5047 rack_apply_deferred_options(rack); 5048 } 5049 rack_log_pacing_delay_calc(rack, subpart, addpart, bytes_ps, stim, 5050 rack_get_bw(rack), 22, did_add, NULL, quality); 5051 /* We do not update any multipliers if we are in or have seen a probe-rtt */ 5052 if ((rack->measure_saw_probe_rtt == 0) && rack->rc_gp_rtt_set) 5053 rack_update_multiplier(rack, timely_says, bytes_ps, 5054 rack->r_ctl.rc_gp_srtt, 5055 rack->r_ctl.rc_rtt_diff); 5056 rack_log_pacing_delay_calc(rack, bytes, tim, bytes_ps, stim, 5057 rack_get_bw(rack), 3, line, NULL, quality); 5058 rack_log_pacing_delay_calc(rack, 5059 bytes, /* flex2 */ 5060 tim, /* flex1 */ 5061 bytes_ps, /* bw_inuse */ 5062 rack->r_ctl.gp_bw, /* delRate */ 5063 rack_get_lt_bw(rack), /* rttProp */ 5064 20, line, NULL, 0); 5065 /* reset the gp srtt and setup the new prev */ 5066 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; 5067 /* Record the lost count for the next measurement */ 5068 rack->r_ctl.rc_loss_at_start = rack->r_ctl.rc_loss_count; 5069 skip_measurement: 5070 /* 5071 * We restart our diffs based on the gpsrtt in the 5072 * measurement window. 
5073 */ 5074 rack->rc_gp_rtt_set = 0; 5075 rack->rc_gp_saw_rec = 0; 5076 rack->rc_gp_saw_ca = 0; 5077 rack->rc_gp_saw_ss = 0; 5078 rack->rc_dragged_bottom = 0; 5079 5080 if (quality == RACK_QUALITY_HIGH) { 5081 /* 5082 * Gput in the stats world is in kbps where bytes_ps is 5083 * bytes per second so we do ((x * 8)/ 1000). 5084 */ 5085 gput = (int32_t)((bytes_ps << 3) / (uint64_t)1000); 5086 #ifdef STATS 5087 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, 5088 gput); 5089 /* 5090 * XXXLAS: This is a temporary hack, and should be 5091 * chained off VOI_TCP_GPUT when stats(9) grows an 5092 * API to deal with chained VOIs. 5093 */ 5094 if (tp->t_stats_gput_prev > 0) 5095 stats_voi_update_abs_s32(tp->t_stats, 5096 VOI_TCP_GPUT_ND, 5097 ((gput - tp->t_stats_gput_prev) * 100) / 5098 tp->t_stats_gput_prev); 5099 #endif 5100 tp->t_stats_gput_prev = gput; 5101 } 5102 tp->t_flags &= ~TF_GPUTINPROG; 5103 /* 5104 * Now are we app limited now and there is space from where we 5105 * were to where we want to go? 5106 * 5107 * We don't do the other case i.e. non-applimited here since 5108 * the next send will trigger us picking up the missing data. 5109 */ 5110 if (rack->r_ctl.rc_first_appl && 5111 TCPS_HAVEESTABLISHED(tp->t_state) && 5112 rack->r_ctl.rc_app_limited_cnt && 5113 (SEQ_GT(rack->r_ctl.rc_first_appl->r_start, th_ack)) && 5114 ((rack->r_ctl.rc_first_appl->r_end - th_ack) > 5115 max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { 5116 /* 5117 * Yep there is enough outstanding to make a measurement here. 5118 */ 5119 struct rack_sendmap *rsm; 5120 5121 rack->r_ctl.rc_gp_lowrtt = 0xffffffff; 5122 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 5123 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 5124 rack->app_limited_needs_set = 0; 5125 tp->gput_seq = th_ack; 5126 if (rack->in_probe_rtt) 5127 rack->measure_saw_probe_rtt = 1; 5128 else if ((rack->measure_saw_probe_rtt) && 5129 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 5130 rack->measure_saw_probe_rtt = 0; 5131 if ((rack->r_ctl.rc_first_appl->r_end - th_ack) >= rack_get_measure_window(tp, rack)) { 5132 /* There is a full window to gain info from */ 5133 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 5134 } else { 5135 /* We can only measure up to the applimited point */ 5136 tp->gput_ack = tp->gput_seq + (rack->r_ctl.rc_first_appl->r_end - th_ack); 5137 if ((tp->gput_ack - tp->gput_seq) < (MIN_GP_WIN * segsiz)) { 5138 /* 5139 * We don't have enough to make a measurement. 5140 */ 5141 tp->t_flags &= ~TF_GPUTINPROG; 5142 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq, 5143 0, 0, 0, 6, __LINE__, NULL, quality); 5144 return; 5145 } 5146 } 5147 if (tp->t_state >= TCPS_FIN_WAIT_1) { 5148 /* 5149 * We will get no more data into the SB 5150 * this means we need to have the data available 5151 * before we start a measurement. 5152 */ 5153 if (sbavail(&tptosocket(tp)->so_snd) < (tp->gput_ack - tp->gput_seq)) { 5154 /* Nope not enough data. */ 5155 return; 5156 } 5157 } 5158 tp->t_flags |= TF_GPUTINPROG; 5159 /* 5160 * Now we need to find the timestamp of the send at tp->gput_seq 5161 * for the send based measurement. 5162 */ 5163 rack->r_ctl.rc_gp_cumack_ts = 0; 5164 rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq); 5165 if (rsm) { 5166 /* Ok send-based limit is set */ 5167 if (SEQ_LT(rsm->r_start, tp->gput_seq)) { 5168 /* 5169 * Move back to include the earlier part 5170 * so our ack time lines up right (this may 5171 * make an overlapping measurement but thats 5172 * ok). 
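 *
 * (For example, purely illustrative: if gput_seq were 1000 but the
 * covering rsm spans 800-1400, gput_seq is pulled back to 800 so the
 * measurement boundary lines up with an actual send boundary.)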
5173 */ 5174 tp->gput_seq = rsm->r_start; 5175 } 5176 if (rsm->r_flags & RACK_ACKED) { 5177 struct rack_sendmap *nrsm; 5178 5179 tp->gput_ts = (uint32_t)rsm->r_ack_arrival; 5180 tp->gput_seq = rsm->r_end; 5181 nrsm = tqhash_next(rack->r_ctl.tqh, rsm); 5182 if (nrsm) 5183 rsm = nrsm; 5184 else { 5185 rack->app_limited_needs_set = 1; 5186 } 5187 } else 5188 rack->app_limited_needs_set = 1; 5189 /* We always go from the first send */ 5190 rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[0]; 5191 } else { 5192 /* 5193 * If we don't find the rsm due to some 5194 * send-limit set the current time, which 5195 * basically disables the send-limit. 5196 */ 5197 struct timeval tv; 5198 5199 microuptime(&tv); 5200 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); 5201 } 5202 rack_tend_gp_marks(tp, rack); 5203 rack_log_pacing_delay_calc(rack, 5204 tp->gput_seq, 5205 tp->gput_ack, 5206 (uint64_t)rsm, 5207 tp->gput_ts, 5208 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts), 5209 9, 5210 __LINE__, rsm, quality); 5211 rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL); 5212 } else { 5213 /* 5214 * To make sure proper timestamp merging occurs, we need to clear 5215 * all GP marks if we don't start a measurement. 5216 */ 5217 rack_clear_gp_marks(tp, rack); 5218 } 5219 } 5220 5221 /* 5222 * CC wrapper hook functions 5223 */ 5224 static void 5225 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint16_t nsegs, 5226 uint16_t type, int32_t recovery) 5227 { 5228 uint32_t prior_cwnd, acked; 5229 struct tcp_log_buffer *lgb = NULL; 5230 uint8_t labc_to_use, quality; 5231 5232 INP_WLOCK_ASSERT(tptoinpcb(tp)); 5233 tp->t_ccv.nsegs = nsegs; 5234 acked = tp->t_ccv.bytes_this_ack = (th_ack - tp->snd_una); 5235 if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) { 5236 uint32_t max; 5237 5238 max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp); 5239 if (tp->t_ccv.bytes_this_ack > max) { 5240 tp->t_ccv.bytes_this_ack = max; 5241 } 5242 } 5243 #ifdef STATS 5244 stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, 5245 ((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd); 5246 #endif 5247 if ((th_ack == tp->snd_max) && rack->lt_bw_up) { 5248 /* We will ack all, time 5249 * to end any lt_bw_up we 5250 * have running until something 5251 * new is sent. 
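 *
 * (Accounting note, added for clarity: bytes are folded into lt_bw_bytes
 * and elapsed microseconds into lt_bw_time here; the long-term rate is
 * assumed to be formed later from those two totals, e.g. by
 * rack_get_lt_bw().)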
5252 */ 5253 struct timeval tv; 5254 5255 rack->r_ctl.lt_bw_bytes += (tp->snd_max - rack->r_ctl.lt_seq); 5256 rack->r_ctl.lt_seq = tp->snd_max; 5257 (void)tcp_get_usecs(&tv); 5258 rack->r_ctl.lt_bw_time += (tcp_tv_to_lusectick(&tv) - rack->r_ctl.lt_timemark); 5259 rack->lt_bw_up = 0; 5260 } 5261 quality = RACK_QUALITY_NONE; 5262 if ((tp->t_flags & TF_GPUTINPROG) && 5263 rack_enough_for_measurement(tp, rack, th_ack, &quality)) { 5264 /* Measure the Goodput */ 5265 rack_do_goodput_measurement(tp, rack, th_ack, __LINE__, quality); 5266 } 5267 /* Which way are we limited? If not cwnd limited, no advance in CA. */ 5268 if (tp->snd_cwnd <= tp->snd_wnd) 5269 tp->t_ccv.flags |= CCF_CWND_LIMITED; 5270 else 5271 tp->t_ccv.flags &= ~CCF_CWND_LIMITED; 5272 if (tp->snd_cwnd > tp->snd_ssthresh) { 5273 tp->t_bytes_acked += min(tp->t_ccv.bytes_this_ack, 5274 nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp)); 5275 /* For the setting of a window past, use the actual scwnd we are using */ 5276 if (tp->t_bytes_acked >= rack->r_ctl.cwnd_to_use) { 5277 tp->t_bytes_acked -= rack->r_ctl.cwnd_to_use; 5278 tp->t_ccv.flags |= CCF_ABC_SENTAWND; 5279 } 5280 } else { 5281 tp->t_ccv.flags &= ~CCF_ABC_SENTAWND; 5282 tp->t_bytes_acked = 0; 5283 } 5284 prior_cwnd = tp->snd_cwnd; 5285 if ((recovery == 0) || (rack_max_abc_post_recovery == 0) || rack->r_use_labc_for_rec || 5286 (rack_client_low_buf && rack->client_bufferlvl && 5287 (rack->client_bufferlvl < rack_client_low_buf))) 5288 labc_to_use = rack->rc_labc; 5289 else 5290 labc_to_use = rack_max_abc_post_recovery; 5291 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 5292 union tcp_log_stackspecific log; 5293 struct timeval tv; 5294 5295 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 5296 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 5297 log.u_bbr.flex1 = th_ack; 5298 log.u_bbr.flex2 = tp->t_ccv.flags; 5299 log.u_bbr.flex3 = tp->t_ccv.bytes_this_ack; 5300 log.u_bbr.flex4 = tp->t_ccv.nsegs; 5301 log.u_bbr.flex5 = labc_to_use; 5302 log.u_bbr.flex6 = prior_cwnd; 5303 log.u_bbr.flex7 = V_tcp_do_newsack; 5304 log.u_bbr.flex8 = 1; 5305 lgb = tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 5306 0, &log, false, NULL, __func__, __LINE__,&tv); 5307 } 5308 if (CC_ALGO(tp)->ack_received != NULL) { 5309 /* XXXLAS: Find a way to live without this */ 5310 tp->t_ccv.curack = th_ack; 5311 tp->t_ccv.labc = labc_to_use; 5312 tp->t_ccv.flags |= CCF_USE_LOCAL_ABC; 5313 CC_ALGO(tp)->ack_received(&tp->t_ccv, type); 5314 } 5315 if (lgb) { 5316 lgb->tlb_stackinfo.u_bbr.flex6 = tp->snd_cwnd; 5317 } 5318 if (rack->r_must_retran) { 5319 if (SEQ_GEQ(th_ack, rack->r_ctl.rc_snd_max_at_rto)) { 5320 /* 5321 * We now are beyond the rxt point so let's disable 5322 * the flag. 5323 */ 5324 rack->r_ctl.rc_out_at_rto = 0; 5325 rack->r_must_retran = 0; 5326 } else if ((prior_cwnd + ctf_fixed_maxseg(tp)) <= tp->snd_cwnd) { 5327 /* 5328 * Only decrement rc_out_at_rto if the cwnd advances 5329 * by at least a whole segment. Otherwise, next time the peer 5330 * acks, we won't be able to send. This generally happens 5331 * when we are in Congestion Avoidance.
5332 */ 5333 if (acked <= rack->r_ctl.rc_out_at_rto){ 5334 rack->r_ctl.rc_out_at_rto -= acked; 5335 } else { 5336 rack->r_ctl.rc_out_at_rto = 0; 5337 } 5338 } 5339 } 5340 #ifdef STATS 5341 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, rack->r_ctl.cwnd_to_use); 5342 #endif 5343 if (rack->r_ctl.rc_rack_largest_cwnd < rack->r_ctl.cwnd_to_use) { 5344 rack->r_ctl.rc_rack_largest_cwnd = rack->r_ctl.cwnd_to_use; 5345 } 5346 } 5347 5348 static void 5349 tcp_rack_partialack(struct tcpcb *tp) 5350 { 5351 struct tcp_rack *rack; 5352 5353 rack = (struct tcp_rack *)tp->t_fb_ptr; 5354 INP_WLOCK_ASSERT(tptoinpcb(tp)); 5355 /* 5356 * If we are doing PRR and have enough 5357 * room to send <or> we are pacing and prr 5358 * is disabled we will want to see if we 5359 * can send data (by setting r_wanted_output to 5360 * true). 5361 */ 5362 if ((rack->r_ctl.rc_prr_sndcnt > 0) || 5363 rack->rack_no_prr) 5364 rack->r_wanted_output = 1; 5365 } 5366 5367 static inline void 5368 rack_set_most_aggr(struct tcp_rack *rack) 5369 { 5370 rack->r_fill_less_agg = 0; 5371 /* Once the cwnd as been clamped we don't do fill_cw */ 5372 if (rack->r_cwnd_was_clamped == 0) 5373 rack->rc_pace_to_cwnd = 1; 5374 rack->r_pacing_discount = 0; 5375 } 5376 5377 static inline void 5378 rack_limit_fillcw(struct tcp_rack *rack) 5379 { 5380 rack->r_fill_less_agg = 1; 5381 /* Once the cwnd as been clamped we don't do fill_cw */ 5382 if (rack->r_cwnd_was_clamped == 0) 5383 rack->rc_pace_to_cwnd = 1; 5384 rack->r_pacing_discount = 0; 5385 } 5386 5387 static inline void 5388 rack_disable_fillcw(struct tcp_rack *rack) 5389 { 5390 rack->r_fill_less_agg = 1; 5391 rack->rc_pace_to_cwnd = 0; 5392 rack->r_pacing_discount = 0; 5393 } 5394 5395 static void 5396 rack_client_buffer_level_set(struct tcp_rack *rack) 5397 { 5398 /* 5399 * Only if DGP is on do we do anything that 5400 * changes stack behavior. If DGP is off all 5401 * we will do is issue a BB log (if BB logging is 5402 * on) and return. 5403 */ 5404 if (rack->dgp_on == 0) { 5405 rack_log_pacing_delay_calc(rack, 0, rack->client_bufferlvl, 5406 0, 0, 0, 30, __LINE__, NULL, 0); 5407 return; 5408 } 5409 if (IN_RECOVERY(rack->rc_tp->t_flags) && rack->r_ctl.full_dgp_in_rec) { 5410 goto set_most_agg; 5411 } 5412 /* 5413 * We are in DGP so what setting should we 5414 * apply based on where the client is? 
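 *
 * (Reference summary of the switch below: DGP_LEVEL0 always picks the
 * most aggressive setting; DGP_LEVEL1 limits fill-cw at client buffer
 * level 4 and disables it at 5; DGP_LEVEL2 shifts that down one level
 * and adds a pacing discount of 1 at level 5; DGP_LEVEL3 shifts down
 * again, discounting 1 at level 4 and 2 at level 5.)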
5415 */ 5416 switch(rack->r_ctl.rc_dgp_bl_agg) { 5417 default: 5418 case DGP_LEVEL0: 5419 set_most_agg: 5420 rack_set_most_aggr(rack); 5421 break; 5422 case DGP_LEVEL1: 5423 if (rack->client_bufferlvl == 4) 5424 rack_limit_fillcw(rack); 5425 else if (rack->client_bufferlvl == 5) 5426 rack_disable_fillcw(rack); 5427 else 5428 rack_set_most_aggr(rack); 5429 break; 5430 case DGP_LEVEL2: 5431 if (rack->client_bufferlvl == 3) 5432 rack_limit_fillcw(rack); 5433 else if (rack->client_bufferlvl == 4) 5434 rack_disable_fillcw(rack); 5435 else if (rack->client_bufferlvl == 5) { 5436 rack_disable_fillcw(rack); 5437 rack->r_pacing_discount = 1; 5438 rack->r_ctl.pacing_discount_amm = 1; 5439 } else 5440 rack_set_most_aggr(rack); 5441 break; 5442 case DGP_LEVEL3: 5443 if (rack->client_bufferlvl == 2) 5444 rack_limit_fillcw(rack); 5445 else if (rack->client_bufferlvl == 3) 5446 rack_disable_fillcw(rack); 5447 else if (rack->client_bufferlvl == 4) { 5448 rack_disable_fillcw(rack); 5449 rack->r_pacing_discount = 1; 5450 rack->r_ctl.pacing_discount_amm = 1; 5451 } else if (rack->client_bufferlvl == 5) { 5452 rack_disable_fillcw(rack); 5453 rack->r_pacing_discount = 1; 5454 rack->r_ctl.pacing_discount_amm = 2; 5455 } else 5456 rack_set_most_aggr(rack); 5457 break; 5458 } 5459 rack_log_pacing_delay_calc(rack, rack->r_ctl.rc_dgp_bl_agg, rack->client_bufferlvl, 0, 5460 0, 0, 30, __LINE__, NULL, 0); 5461 } 5462 5463 static void 5464 do_rack_check_for_unclamp(struct tcpcb *tp, struct tcp_rack *rack) 5465 { 5466 /* 5467 * Can we unclamp. We unclamp if more than 5468 * N rounds have transpired with no loss. 5469 */ 5470 uint64_t snds, rxts, rxt_per; 5471 uint32_t rnds; 5472 5473 rnds = rack->r_ctl.current_round - rack->r_ctl.last_rnd_rxt_clamped; 5474 if ((rack_unclamp_round_thresh > 0) && 5475 (rnds >= rack_unclamp_round_thresh)) { 5476 snds = tp->t_sndbytes - rack->r_ctl.last_sndbytes; 5477 KASSERT ((snds > 0), ("rack:%p tp:%p snds:%ju is 0", rack, tp, 5478 (uintmax_t)snds)); 5479 rxts = tp->t_snd_rxt_bytes - rack->r_ctl.last_snd_rxt_bytes; 5480 rxt_per = rxts * 1000; 5481 rxt_per /= snds; 5482 if ((uint32_t)rxt_per <= rack_unclamp_rxt_thresh) { 5483 /* Unclamp */ 5484 if (tcp_bblogging_on(rack->rc_tp)) { 5485 union tcp_log_stackspecific log; 5486 struct timeval tv; 5487 5488 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 5489 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 5490 log.u_bbr.flex3 = rnds; 5491 log.u_bbr.flex4 = rack_unclamp_round_thresh; 5492 log.u_bbr.flex5 = (uint32_t)rxt_per; 5493 log.u_bbr.flex8 = 6; 5494 log.u_bbr.pkt_epoch = rack->r_ctl.rc_pace_max_segs; 5495 log.u_bbr.bbr_state = rack->rc_pace_to_cwnd; 5496 log.u_bbr.delivered = rack->r_ctl.num_of_clamps_applied; 5497 log.u_bbr.applimited = rack->r_ctl.max_clamps; 5498 log.u_bbr.epoch = rack->r_ctl.clamp_options; 5499 log.u_bbr.cur_del_rate = rxts; 5500 log.u_bbr.bw_inuse = rack_get_lt_bw(rack); 5501 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 5502 log.u_bbr.lt_epoch = (uint32_t)((rack->r_ctl.gp_bw >> 32) & 0x00000000ffffffff); 5503 log.u_bbr.pkts_out = (uint32_t)(rack->r_ctl.gp_bw & 0x00000000ffffffff); 5504 tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 5505 0, &log, false, NULL, NULL, 0, &tv); 5506 } 5507 rack->r_ctl.num_of_clamps_applied = 0; 5508 rack->r_cwnd_was_clamped = 0; 5509 rack->excess_rxt_on = 1; 5510 if (rack->r_ctl.clamp_options) { 5511 /* 5512 * We only allow fillcw to be toggled 5513 * if you are setting a max seg too. 
5514 */ 5515 if (rack->r_ctl.clamp_options & 0x1) { 5516 if ((rack->rc_pace_to_cwnd == 0) && (rack->dgp_on == 0)) { 5517 /* turn on fill cw for non-dgp*/ 5518 rack->rc_pace_to_cwnd = 0; 5519 } else if ((rack->dgp_on == 1) && (rack->rc_pace_to_cwnd == 1)) { 5520 /* For DGP we want it off */ 5521 rack->rc_pace_to_cwnd = 1; 5522 } 5523 } 5524 } 5525 if (rack->dgp_on) { 5526 /* Reset all multipliers to 100.0 so just the measured bw */ 5527 /* Crash any per boosts down to 100% */ 5528 rack->r_ctl.rack_per_of_gp_rec = 100; 5529 rack->r_ctl.rack_per_of_gp_ss = 100; 5530 rack->r_ctl.rack_per_of_gp_ca = 100; 5531 /* Set in an upper bound for ss/ca % increase */ 5532 rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_per_upper_bound_ss; 5533 rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_per_upper_bound_ca; 5534 } 5535 } 5536 } 5537 } 5538 5539 static void 5540 do_rack_excess_rxt(struct tcpcb *tp, struct tcp_rack *rack) 5541 { 5542 /* 5543 * Rack excess rxt accounting is turned on. If we 5544 * are above a threshold of rxt's in at least N 5545 * rounds, then back off the cwnd and ssthresh 5546 * to fit into the long-term b/w. 5547 */ 5548 uint64_t snds, rxts, rxt_per, lt_bw, bdp; 5549 uint32_t rnds, new_cwnd, new_ssthresh, rtt, shared_cwnd_was_enabled = 0; 5550 5551 /* Is it shut off by 0 rounds? */ 5552 if (rack_rxt_min_rnds == 0) 5553 return; 5554 if ((rack->r_ctl.max_clamps > 0) && 5555 (rack->r_ctl.num_of_clamps_applied >= rack->r_ctl.max_clamps)) { 5556 /* 5557 * The idea, if max_clamps is set, is that if clamping it 5558 * N times did not work again, then there is no sense 5559 * clamping it again. The link is just a lossy link and 5560 * our clamps are doing no good. Turn it off so we don't come 5561 * back here again. 5562 */ 5563 rack->excess_rxt_on = 0; 5564 rack->r_cwnd_was_clamped = 0; 5565 rack->r_ctl.num_of_clamps_applied = 0; 5566 return; 5567 } 5568 snds = tp->t_sndbytes - rack->r_ctl.last_sndbytes; 5569 rxts = tp->t_snd_rxt_bytes - rack->r_ctl.last_snd_rxt_bytes; 5570 rnds = rack->r_ctl.current_round - rack->r_ctl.last_rnd_rxt_clamped; 5571 /* Has enough rounds progressed for us to re-measure? */ 5572 if ((rnds >= rack_rxt_min_rnds) && 5573 (rack->r_ctl.rxt_threshold > 0)){ 5574 rxt_per = rxts * 1000; 5575 rxt_per /= snds; 5576 if (rxt_per >= rack->r_ctl.rxt_threshold) { 5577 /* 5578 * Action required: 5579 * We are above our excess retransmit level, lets 5580 * cut down the cwnd and ssthresh to match the long-term 5581 * b/w we are getting. 5582 */ 5583 /* First disable scwnd if enabled */ 5584 #ifdef NETFLIX_SHARED_CWND 5585 rack->rack_enable_scwnd = 0; 5586 if (rack->r_ctl.rc_scw) { 5587 uint32_t limit; 5588 5589 shared_cwnd_was_enabled = 1; 5590 if (rack->r_limit_scw) 5591 limit = max(1, rack->r_ctl.rc_lowest_us_rtt); 5592 else 5593 limit = 0; 5594 tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw, 5595 rack->r_ctl.rc_scw_index, 5596 limit); 5597 rack->r_ctl.rc_scw = NULL; 5598 } 5599 5600 #endif 5601 /* Calculate what the cwnd and ssthresh should be */ 5602 tcp_trace_point(rack->rc_tp, TCP_TP_EXCESS_RXT); 5603 lt_bw = rack_get_lt_bw(rack); 5604 if (lt_bw == 0) { 5605 /* 5606 * No lt_bw, lets chop things to one MSS 5607 * and the ssthresh to the iwnd. 
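 *
 * (The else branch below instead sizes the clamp to the long-term BDP;
 * illustrative numbers: lt_bw = 1,250,000 bytes/sec and rack_rtt =
 * 40,000 us give bdp = 1250000 * 40000 / 1000000 = 50,000 bytes, which
 * becomes the new cwnd with ssthresh one byte less.)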
5608 */ 5609 reset_to_iw: 5610 new_cwnd = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 5611 new_ssthresh = tcp_compute_initwnd(tcp_maxseg(tp)); 5612 } else { 5613 rtt = rack->rc_rack_rtt; 5614 if (rtt == 0) { 5615 /* If we have no rack_rtt drop to the IW situation */ 5616 goto reset_to_iw; 5617 } 5618 bdp = lt_bw * (uint64_t)rtt; 5619 bdp /= HPTS_USEC_IN_SEC; 5620 new_cwnd = (uint32_t)bdp; 5621 new_ssthresh = new_cwnd - 1; 5622 if (new_cwnd < ctf_fixed_maxseg(tp)) { 5623 /* Rock bottom, goto IW settings */ 5624 goto reset_to_iw; 5625 } 5626 } 5627 rack->r_cwnd_was_clamped = 1; 5628 rack->r_ctl.num_of_clamps_applied++; 5629 /* Reset the counter fromn now */ 5630 tp->t_bytes_acked = 0; 5631 /* 5632 * Now what about options? 5633 * We look at the bottom 8 bits: 5634 * F = fill cw bit (toggle it if set) 5635 * S = Segment bits 5636 * M = set max segment bit 5637 * 5638 * SSSS SSMF 5639 */ 5640 if (rack->r_ctl.clamp_options) { 5641 if (rack->r_ctl.clamp_options & 0x1) { 5642 if ((rack->rc_pace_to_cwnd == 0) && (rack->dgp_on == 0)) { 5643 /* turn on fill cw for non-dgp*/ 5644 rack->rc_pace_to_cwnd = 1; 5645 } else if ((rack->dgp_on == 1) && (rack->rc_pace_to_cwnd == 1)) { 5646 /* For DGP we want it off */ 5647 rack->rc_pace_to_cwnd = 0; 5648 } 5649 } 5650 } 5651 if (rack->dgp_on) { 5652 /* Reset all multipliers to 100.0 so just the measured bw */ 5653 /* Crash any per boosts down to 100% */ 5654 rack->r_ctl.rack_per_of_gp_rec = 100; 5655 rack->r_ctl.rack_per_of_gp_ss = 100; 5656 rack->r_ctl.rack_per_of_gp_ca = 100; 5657 /* Set in an upper bound for ss/ca % increase */ 5658 rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_clamp_ss_upper; 5659 rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_clamp_ca_upper; 5660 /* Now move to the lt_bw */ 5661 rack->r_ctl.gp_bw = lt_bw; 5662 rack->rc_gp_filled = 1; 5663 rack->r_ctl.num_measurements = RACK_REQ_AVG; 5664 } 5665 if (tcp_bblogging_on(rack->rc_tp)) { 5666 union tcp_log_stackspecific log; 5667 struct timeval tv; 5668 5669 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 5670 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 5671 log.u_bbr.flex1 = new_cwnd; 5672 log.u_bbr.flex2 = new_ssthresh; 5673 log.u_bbr.flex3 = rnds; 5674 log.u_bbr.flex4 = rack_rxt_min_rnds; 5675 log.u_bbr.flex5 = rtt; 5676 log.u_bbr.flex6 = shared_cwnd_was_enabled; 5677 log.u_bbr.flex8 = 5; 5678 log.u_bbr.pkt_epoch = rack->r_ctl.rc_pace_max_segs; 5679 log.u_bbr.bbr_state = rack->rc_pace_to_cwnd; 5680 log.u_bbr.delivered = rack->r_ctl.num_of_clamps_applied; 5681 log.u_bbr.applimited = rack->r_ctl.max_clamps; 5682 log.u_bbr.epoch = rack->r_ctl.clamp_options; 5683 log.u_bbr.cur_del_rate = rxts; 5684 log.u_bbr.delRate = snds; 5685 log.u_bbr.rttProp = rack->r_ctl.rxt_threshold; 5686 log.u_bbr.bw_inuse = lt_bw; 5687 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 5688 log.u_bbr.lt_epoch = (uint32_t)((rack->r_ctl.gp_bw >> 32) & 0x00000000ffffffff); 5689 log.u_bbr.pkts_out = (uint32_t)(rack->r_ctl.gp_bw & 0x00000000ffffffff); 5690 tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 5691 0, &log, false, NULL, NULL, 0, &tv); 5692 } 5693 /* Update our point where we did it */ 5694 if (rack->r_ctl.already_had_a_excess == 0) { 5695 rack->r_ctl.already_had_a_excess = 1; 5696 counter_u64_add(rack_rxt_clamps_cwnd_uniq, 1); 5697 } 5698 counter_u64_add(rack_rxt_clamps_cwnd, 1); 5699 rack->r_ctl.last_sndbytes = tp->t_sndbytes; 5700 rack->r_ctl.last_snd_rxt_bytes = tp->t_snd_rxt_bytes; 5701 rack->r_ctl.last_rnd_rxt_clamped = rack->r_ctl.current_round; 5702 if (new_cwnd < 
tp->snd_cwnd) 5703 tp->snd_cwnd = new_cwnd; 5704 if (new_ssthresh < tp->snd_ssthresh) 5705 tp->snd_ssthresh = new_ssthresh; 5706 } 5707 } 5708 } 5709 5710 static void 5711 rack_post_recovery(struct tcpcb *tp, uint32_t th_ack) 5712 { 5713 struct tcp_rack *rack; 5714 uint32_t orig_cwnd; 5715 5716 orig_cwnd = tp->snd_cwnd; 5717 INP_WLOCK_ASSERT(tptoinpcb(tp)); 5718 rack = (struct tcp_rack *)tp->t_fb_ptr; 5719 /* only alert CC if we alerted when we entered */ 5720 if (CC_ALGO(tp)->post_recovery != NULL) { 5721 tp->t_ccv.curack = th_ack; 5722 CC_ALGO(tp)->post_recovery(&tp->t_ccv); 5723 if (tp->snd_cwnd < tp->snd_ssthresh) { 5724 /* 5725 * Rack has burst control and pacing 5726 * so lets not set this any lower than 5727 * snd_ssthresh per RFC-6582 (option 2). 5728 */ 5729 tp->snd_cwnd = tp->snd_ssthresh; 5730 } 5731 } 5732 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 5733 union tcp_log_stackspecific log; 5734 struct timeval tv; 5735 5736 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 5737 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 5738 log.u_bbr.flex1 = th_ack; 5739 log.u_bbr.flex2 = tp->t_ccv.flags; 5740 log.u_bbr.flex3 = tp->t_ccv.bytes_this_ack; 5741 log.u_bbr.flex4 = tp->t_ccv.nsegs; 5742 log.u_bbr.flex5 = V_tcp_abc_l_var; 5743 log.u_bbr.flex6 = orig_cwnd; 5744 log.u_bbr.flex7 = V_tcp_do_newsack; 5745 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 5746 log.u_bbr.flex8 = 2; 5747 tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 5748 0, &log, false, NULL, __func__, __LINE__, &tv); 5749 } 5750 if ((rack->rack_no_prr == 0) && 5751 (rack->no_prr_addback == 0) && 5752 (rack->r_ctl.rc_prr_sndcnt > 0)) { 5753 /* 5754 * Suck the next prr cnt back into cwnd, but 5755 * only do that if we are not application limited. 5756 */ 5757 if (ctf_outstanding(tp) <= sbavail(&tptosocket(tp)->so_snd)) { 5758 /* 5759 * We are allowed to add back to the cwnd the amount we did 5760 * not get out if: 5761 * a) no_prr_addback is off. 5762 * b) we are not app limited 5763 * c) we are doing prr 5764 * <and> 5765 * d) it is bounded by rack_prr_addbackmax (if addback is 0, then none). 
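 *
 * (Illustrative: with a 1448-byte maxseg, rack_prr_addbackmax = 2 and
 * rc_prr_sndcnt = 5000, the addback below is min(2 * 1448, 5000) =
 * 2896 bytes.)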
5766 */ 5767 tp->snd_cwnd += min((ctf_fixed_maxseg(tp) * rack_prr_addbackmax), 5768 rack->r_ctl.rc_prr_sndcnt); 5769 } 5770 rack->r_ctl.rc_prr_sndcnt = 0; 5771 rack_log_to_prr(rack, 1, 0, __LINE__); 5772 } 5773 rack_log_to_prr(rack, 14, orig_cwnd, __LINE__); 5774 tp->snd_recover = tp->snd_una; 5775 if (rack->r_ctl.dsack_persist) { 5776 rack->r_ctl.dsack_persist--; 5777 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) { 5778 rack->r_ctl.num_dsack = 0; 5779 } 5780 rack_log_dsack_event(rack, 1, __LINE__, 0, 0); 5781 } 5782 EXIT_RECOVERY(tp->t_flags); 5783 if (rack->r_ctl.full_dgp_in_rec) 5784 rack_client_buffer_level_set(rack); 5785 } 5786 5787 static void 5788 rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack, int line) 5789 { 5790 struct tcp_rack *rack; 5791 uint32_t ssthresh_enter, cwnd_enter, in_rec_at_entry, orig_cwnd; 5792 5793 INP_WLOCK_ASSERT(tptoinpcb(tp)); 5794 #ifdef STATS 5795 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type); 5796 #endif 5797 if (IN_RECOVERY(tp->t_flags) == 0) { 5798 in_rec_at_entry = 0; 5799 ssthresh_enter = tp->snd_ssthresh; 5800 cwnd_enter = tp->snd_cwnd; 5801 } else 5802 in_rec_at_entry = 1; 5803 rack = (struct tcp_rack *)tp->t_fb_ptr; 5804 switch (type) { 5805 case CC_NDUPACK: 5806 tp->t_flags &= ~TF_WASFRECOVERY; 5807 tp->t_flags &= ~TF_WASCRECOVERY; 5808 if (!IN_FASTRECOVERY(tp->t_flags)) { 5809 if (rack->dgp_on && rack->r_cwnd_was_clamped) { 5810 /* Reset the gains so that on exit we will be softer longer */ 5811 rack->r_ctl.rack_per_of_gp_rec = 100; 5812 rack->r_ctl.rack_per_of_gp_ss = 98; 5813 rack->r_ctl.rack_per_of_gp_ca = 98; 5814 } 5815 rack->r_ctl.rc_prr_delivered = 0; 5816 rack->r_ctl.rc_prr_out = 0; 5817 rack->r_fast_output = 0; 5818 if (rack->rack_no_prr == 0) { 5819 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 5820 rack_log_to_prr(rack, 2, in_rec_at_entry, line); 5821 } 5822 rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una; 5823 tp->snd_recover = tp->snd_max; 5824 if (tp->t_flags2 & TF2_ECN_PERMIT) 5825 tp->t_flags2 |= TF2_ECN_SND_CWR; 5826 } 5827 break; 5828 case CC_ECN: 5829 if (!IN_CONGRECOVERY(tp->t_flags) || 5830 /* 5831 * Allow ECN reaction on ACK to CWR, if 5832 * that data segment was also CE marked. 5833 */ 5834 SEQ_GEQ(ack, tp->snd_recover)) { 5835 EXIT_CONGRECOVERY(tp->t_flags); 5836 KMOD_TCPSTAT_INC(tcps_ecn_rcwnd); 5837 rack->r_fast_output = 0; 5838 tp->snd_recover = tp->snd_max + 1; 5839 if (tp->t_flags2 & TF2_ECN_PERMIT) 5840 tp->t_flags2 |= TF2_ECN_SND_CWR; 5841 } 5842 break; 5843 case CC_RTO: 5844 tp->t_dupacks = 0; 5845 tp->t_bytes_acked = 0; 5846 rack->r_fast_output = 0; 5847 EXIT_RECOVERY(tp->t_flags); 5848 tp->snd_ssthresh = max(2, min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 / 5849 ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp); 5850 orig_cwnd = tp->snd_cwnd; 5851 tp->snd_cwnd = ctf_fixed_maxseg(tp); 5852 rack_log_to_prr(rack, 16, orig_cwnd, line); 5853 if (tp->t_flags2 & TF2_ECN_PERMIT) 5854 tp->t_flags2 |= TF2_ECN_SND_CWR; 5855 break; 5856 case CC_RTO_ERR: 5857 KMOD_TCPSTAT_INC(tcps_sndrexmitbad); 5858 /* RTO was unnecessary, so reset everything. 
*/ 5859 tp->snd_cwnd = tp->snd_cwnd_prev; 5860 tp->snd_ssthresh = tp->snd_ssthresh_prev; 5861 tp->snd_recover = tp->snd_recover_prev; 5862 if (tp->t_flags & TF_WASFRECOVERY) { 5863 ENTER_FASTRECOVERY(tp->t_flags); 5864 tp->t_flags &= ~TF_WASFRECOVERY; 5865 } 5866 if (tp->t_flags & TF_WASCRECOVERY) { 5867 ENTER_CONGRECOVERY(tp->t_flags); 5868 tp->t_flags &= ~TF_WASCRECOVERY; 5869 } 5870 tp->snd_nxt = tp->snd_max; 5871 tp->t_badrxtwin = 0; 5872 break; 5873 } 5874 if ((CC_ALGO(tp)->cong_signal != NULL) && 5875 (type != CC_RTO)){ 5876 tp->t_ccv.curack = ack; 5877 CC_ALGO(tp)->cong_signal(&tp->t_ccv, type); 5878 } 5879 if ((in_rec_at_entry == 0) && IN_RECOVERY(tp->t_flags)) { 5880 rack_log_to_prr(rack, 15, cwnd_enter, line); 5881 if (rack->r_ctl.full_dgp_in_rec) 5882 rack_client_buffer_level_set(rack); 5883 rack->r_ctl.dsack_byte_cnt = 0; 5884 rack->r_ctl.retran_during_recovery = 0; 5885 rack->r_ctl.rc_cwnd_at_erec = cwnd_enter; 5886 rack->r_ctl.rc_ssthresh_at_erec = ssthresh_enter; 5887 rack->r_ent_rec_ns = 1; 5888 } 5889 } 5890 5891 static inline void 5892 rack_cc_after_idle(struct tcp_rack *rack, struct tcpcb *tp) 5893 { 5894 uint32_t i_cwnd; 5895 5896 INP_WLOCK_ASSERT(tptoinpcb(tp)); 5897 5898 if (CC_ALGO(tp)->after_idle != NULL) 5899 CC_ALGO(tp)->after_idle(&tp->t_ccv); 5900 5901 if (tp->snd_cwnd == 1) 5902 i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ 5903 else 5904 i_cwnd = rc_init_window(rack); 5905 5906 /* 5907 * Being idle is no different than the initial window. If the cc 5908 * clamps it down below the initial window raise it to the initial 5909 * window. 5910 */ 5911 if (tp->snd_cwnd < i_cwnd) { 5912 tp->snd_cwnd = i_cwnd; 5913 } 5914 } 5915 5916 /* 5917 * Indicate whether this ack should be delayed. We can delay the ack if 5918 * following conditions are met: 5919 * - There is no delayed ack timer in progress. 5920 * - Our last ack wasn't a 0-sized window. We never want to delay 5921 * the ack that opens up a 0-sized window. 5922 * - LRO wasn't used for this segment. We make sure by checking that the 5923 * segment size is not larger than the MSS. 5924 * - Delayed acks are enabled or this is a half-synchronized T/TCP 5925 * connection. 5926 */ 5927 #define DELAY_ACK(tp, tlen) \ 5928 (((tp->t_flags & TF_RXWIN0SENT) == 0) && \ 5929 ((tp->t_flags & TF_DELACK) == 0) && \ 5930 (tlen <= tp->t_maxseg) && \ 5931 (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN))) 5932 5933 static struct rack_sendmap * 5934 rack_find_lowest_rsm(struct tcp_rack *rack) 5935 { 5936 struct rack_sendmap *rsm; 5937 5938 /* 5939 * Walk the time-order transmitted list looking for an rsm that is 5940 * not acked. This will be the one that was sent the longest time 5941 * ago that is still outstanding. 5942 */ 5943 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 5944 if (rsm->r_flags & RACK_ACKED) { 5945 continue; 5946 } 5947 goto finish; 5948 } 5949 finish: 5950 return (rsm); 5951 } 5952 5953 static struct rack_sendmap * 5954 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm) 5955 { 5956 struct rack_sendmap *prsm; 5957 5958 /* 5959 * Walk the sequence order list backward until we hit and arrive at 5960 * the highest seq not acked. In theory when this is called it 5961 * should be the last segment (which it was not). 
5962 */ 5963 prsm = rsm; 5964 5965 TQHASH_FOREACH_REVERSE_FROM(prsm, rack->r_ctl.tqh) { 5966 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) { 5967 continue; 5968 } 5969 return (prsm); 5970 } 5971 return (NULL); 5972 } 5973 5974 static uint32_t 5975 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts) 5976 { 5977 int32_t lro; 5978 uint32_t thresh; 5979 5980 /* 5981 * lro is the flag we use to determine if we have seen reordering. 5982 * If it gets set we have seen reordering. The reorder logic either 5983 * works in one of two ways: 5984 * 5985 * If reorder-fade is configured, then we track the last time we saw 5986 * re-ordering occur. If we reach the point where enough time has 5987 * passed we no longer consider reordering to be occurring. 5988 * 5989 * Or if reorder-fade is 0, then once we see reordering we consider 5990 * the connection to always be subject to reordering and just set lro 5991 * to 1. 5992 * 5993 * In the end if lro is non-zero we add the extra time for 5994 * reordering in. 5995 */ 5996 if (srtt == 0) 5997 srtt = 1; 5998 if (rack->r_ctl.rc_reorder_ts) { 5999 if (rack->r_ctl.rc_reorder_fade) { 6000 if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) { 6001 lro = cts - rack->r_ctl.rc_reorder_ts; 6002 if (lro == 0) { 6003 /* 6004 * No time has passed since the last 6005 * reorder, mark it as reordering. 6006 */ 6007 lro = 1; 6008 } 6009 } else { 6010 /* Negative time? */ 6011 lro = 0; 6012 } 6013 if (lro > rack->r_ctl.rc_reorder_fade) { 6014 /* Turn off reordering seen too */ 6015 rack->r_ctl.rc_reorder_ts = 0; 6016 lro = 0; 6017 } 6018 } else { 6019 /* Reordering does not fade */ 6020 lro = 1; 6021 } 6022 } else { 6023 lro = 0; 6024 } 6025 if (rack->rc_rack_tmr_std_based == 0) { 6026 thresh = srtt + rack->r_ctl.rc_pkt_delay; 6027 } else { 6028 /* Standards based pkt-delay is 1/4 srtt */ 6029 thresh = srtt + (srtt >> 2); 6030 } 6031 if (lro && (rack->rc_rack_tmr_std_based == 0)) { 6032 /* The reorder shift must be set; if not you get 1/4 rtt */ 6033 if (rack->r_ctl.rc_reorder_shift) 6034 thresh += (srtt >> rack->r_ctl.rc_reorder_shift); 6035 else 6036 thresh += (srtt >> 2); 6037 } 6038 if (rack->rc_rack_use_dsack && 6039 lro && 6040 (rack->r_ctl.num_dsack > 0)) { 6041 /* 6042 * We only increase the reordering window if we 6043 * have seen reordering <and> we have a DSACK count.
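 *
 * (Illustrative: with srtt = 40ms and num_dsack = 2 the threshold grows
 * by 2 * (40/4) = 20ms here, still subject to the 2*srtt and rack_rto_max
 * ceilings applied just below.)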
6044 */ 6045 thresh += rack->r_ctl.num_dsack * (srtt >> 2); 6046 rack_log_dsack_event(rack, 4, __LINE__, srtt, thresh); 6047 } 6048 /* SRTT * 2 is the ceiling */ 6049 if (thresh > (srtt * 2)) { 6050 thresh = srtt * 2; 6051 } 6052 /* And we don't want it above the RTO max either */ 6053 if (thresh > rack_rto_max) { 6054 thresh = rack_rto_max; 6055 } 6056 rack_log_dsack_event(rack, 6, __LINE__, srtt, thresh); 6057 return (thresh); 6058 } 6059 6060 static uint32_t 6061 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack, 6062 struct rack_sendmap *rsm, uint32_t srtt) 6063 { 6064 struct rack_sendmap *prsm; 6065 uint32_t thresh, len; 6066 int segsiz; 6067 6068 if (srtt == 0) 6069 srtt = 1; 6070 if (rack->r_ctl.rc_tlp_threshold) 6071 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold); 6072 else 6073 thresh = (srtt * 2); 6074 6075 /* Get the previous sent packet, if any */ 6076 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 6077 len = rsm->r_end - rsm->r_start; 6078 if (rack->rack_tlp_threshold_use == TLP_USE_ID) { 6079 /* Exactly like the ID */ 6080 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= segsiz) { 6081 uint32_t alt_thresh; 6082 /* 6083 * Compensate for delayed-ack with the d-ack time. 6084 */ 6085 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 6086 if (alt_thresh > thresh) 6087 thresh = alt_thresh; 6088 } 6089 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) { 6090 /* 2.1 behavior */ 6091 prsm = TAILQ_PREV(rsm, rack_head, r_tnext); 6092 if (prsm && (len <= segsiz)) { 6093 /* 6094 * Two packets outstanding, thresh should be (2*srtt) + 6095 * possible inter-packet delay (if any). 6096 */ 6097 uint32_t inter_gap = 0; 6098 int idx, nidx; 6099 6100 idx = rsm->r_rtr_cnt - 1; 6101 nidx = prsm->r_rtr_cnt - 1; 6102 if (rsm->r_tim_lastsent[nidx] >= prsm->r_tim_lastsent[idx]) { 6103 /* Yes it was sent later (or at the same time) */ 6104 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; 6105 } 6106 thresh += inter_gap; 6107 } else if (len <= segsiz) { 6108 /* 6109 * Possibly compensate for delayed-ack. 6110 */ 6111 uint32_t alt_thresh; 6112 6113 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 6114 if (alt_thresh > thresh) 6115 thresh = alt_thresh; 6116 } 6117 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) { 6118 /* 2.2 behavior */ 6119 if (len <= segsiz) { 6120 uint32_t alt_thresh; 6121 /* 6122 * Compensate for delayed-ack with the d-ack time. 6123 */ 6124 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 6125 if (alt_thresh > thresh) 6126 thresh = alt_thresh; 6127 } 6128 } 6129 /* Not above an RTO */ 6130 if (thresh > tp->t_rxtcur) { 6131 thresh = tp->t_rxtcur; 6132 } 6133 /* Not above a RTO max */ 6134 if (thresh > rack_rto_max) { 6135 thresh = rack_rto_max; 6136 } 6137 /* Apply user supplied min TLP */ 6138 if (thresh < rack_tlp_min) { 6139 thresh = rack_tlp_min; 6140 } 6141 return (thresh); 6142 } 6143 6144 static uint32_t 6145 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack) 6146 { 6147 /* 6148 * We want the rack_rtt which is the 6149 * last rtt we measured. However if that 6150 * does not exist we fallback to the srtt (which 6151 * we probably will never do) and then as a last 6152 * resort we use RACK_INITIAL_RTO if no srtt is 6153 * yet set. 
6154 */ 6155 if (rack->rc_rack_rtt) 6156 return (rack->rc_rack_rtt); 6157 else if (tp->t_srtt == 0) 6158 return (RACK_INITIAL_RTO); 6159 return (tp->t_srtt); 6160 } 6161 6162 static struct rack_sendmap * 6163 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) 6164 { 6165 /* 6166 * Check to see that we don't need to fall into recovery. We will 6167 * need to do so if our oldest transmit is past the time we should 6168 * have had an ack. 6169 */ 6170 struct tcp_rack *rack; 6171 struct rack_sendmap *rsm; 6172 int32_t idx; 6173 uint32_t srtt, thresh; 6174 6175 rack = (struct tcp_rack *)tp->t_fb_ptr; 6176 if (tqhash_empty(rack->r_ctl.tqh)) { 6177 return (NULL); 6178 } 6179 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 6180 if (rsm == NULL) 6181 return (NULL); 6182 6183 6184 if (rsm->r_flags & RACK_ACKED) { 6185 rsm = rack_find_lowest_rsm(rack); 6186 if (rsm == NULL) 6187 return (NULL); 6188 } 6189 idx = rsm->r_rtr_cnt - 1; 6190 srtt = rack_grab_rtt(tp, rack); 6191 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 6192 if (TSTMP_LT(tsused, ((uint32_t)rsm->r_tim_lastsent[idx]))) { 6193 return (NULL); 6194 } 6195 if ((tsused - ((uint32_t)rsm->r_tim_lastsent[idx])) < thresh) { 6196 return (NULL); 6197 } 6198 /* Ok if we reach here we are over-due and this guy can be sent */ 6199 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__); 6200 return (rsm); 6201 } 6202 6203 static uint32_t 6204 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack) 6205 { 6206 int32_t t; 6207 int32_t tt; 6208 uint32_t ret_val; 6209 6210 t = (tp->t_srtt + (tp->t_rttvar << 2)); 6211 RACK_TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], 6212 rack_persist_min, rack_persist_max, rack->r_ctl.timer_slop); 6213 rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT; 6214 ret_val = (uint32_t)tt; 6215 return (ret_val); 6216 } 6217 6218 static uint32_t 6219 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack) 6220 { 6221 /* 6222 * Start the FR timer, we do this based on getting the first one in 6223 * the rc_tmap. Note that if its NULL we must stop the timer. in all 6224 * events we need to stop the running timer (if its running) before 6225 * starting the new one. 6226 */ 6227 uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse; 6228 uint32_t srtt_cur; 6229 int32_t idx; 6230 int32_t is_tlp_timer = 0; 6231 struct rack_sendmap *rsm; 6232 6233 if (rack->t_timers_stopped) { 6234 /* All timers have been stopped none are to run */ 6235 return (0); 6236 } 6237 if (rack->rc_in_persist) { 6238 /* We can't start any timer in persists */ 6239 return (rack_get_persists_timer_val(tp, rack)); 6240 } 6241 rack->rc_on_min_to = 0; 6242 if ((tp->t_state < TCPS_ESTABLISHED) || 6243 (rack->sack_attack_disable > 0) || 6244 ((tp->t_flags & TF_SACK_PERMIT) == 0)) { 6245 goto activate_rxt; 6246 } 6247 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 6248 if ((rsm == NULL) || sup_rack) { 6249 /* Nothing on the send map or no rack */ 6250 activate_rxt: 6251 time_since_sent = 0; 6252 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 6253 if (rsm) { 6254 /* 6255 * Should we discount the RTX timer any? 6256 * 6257 * We want to discount it the smallest amount. 6258 * If a timer (Rack/TLP or RXT) has gone off more 6259 * recently thats the discount we want to use (now - timer time). 6260 * If the retransmit of the oldest packet was more recent then 6261 * we want to use that (now - oldest-packet-last_transmit_time). 
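 *
 * (Illustrative: if t_rxtcur is 400ms and the more recent of those two
 * events was 150ms ago, the retransmit timer below is armed for
 * 400 - 150 = 250ms rather than a full fresh RTO.)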
6262 * 6263 */ 6264 idx = rsm->r_rtr_cnt - 1; 6265 if (TSTMP_GEQ(rack->r_ctl.rc_tlp_rxt_last_time, ((uint32_t)rsm->r_tim_lastsent[idx]))) 6266 tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time; 6267 else 6268 tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx]; 6269 if (TSTMP_GT(cts, tstmp_touse)) 6270 time_since_sent = cts - tstmp_touse; 6271 } 6272 if (SEQ_LT(tp->snd_una, tp->snd_max) || 6273 sbavail(&tptosocket(tp)->so_snd)) { 6274 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; 6275 to = tp->t_rxtcur; 6276 if (to > time_since_sent) 6277 to -= time_since_sent; 6278 else 6279 to = rack->r_ctl.rc_min_to; 6280 if (to == 0) 6281 to = 1; 6282 /* Special case for KEEPINIT */ 6283 if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) && 6284 (TP_KEEPINIT(tp) != 0) && 6285 rsm) { 6286 /* 6287 * We have to put a ceiling on the rxt timer 6288 * of the keep-init timeout. 6289 */ 6290 uint32_t max_time, red; 6291 6292 max_time = TICKS_2_USEC(TP_KEEPINIT(tp)); 6293 if (TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) { 6294 red = (cts - (uint32_t)rsm->r_tim_lastsent[0]); 6295 if (red < max_time) 6296 max_time -= red; 6297 else 6298 max_time = 1; 6299 } 6300 /* Reduce timeout to the keep value if needed */ 6301 if (max_time < to) 6302 to = max_time; 6303 } 6304 return (to); 6305 } 6306 return (0); 6307 } 6308 if (rsm->r_flags & RACK_ACKED) { 6309 rsm = rack_find_lowest_rsm(rack); 6310 if (rsm == NULL) { 6311 /* No lowest? */ 6312 goto activate_rxt; 6313 } 6314 } 6315 if (rack->sack_attack_disable) { 6316 /* 6317 * We don't want to do 6318 * any TLP's if you are an attacker. 6319 * Though if you are doing what 6320 * is expected you may still have 6321 * SACK-PASSED marks. 6322 */ 6323 goto activate_rxt; 6324 } 6325 /* Convert from ms to usecs */ 6326 if ((rsm->r_flags & RACK_SACK_PASSED) || 6327 (rsm->r_flags & RACK_RWND_COLLAPSED) || 6328 (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { 6329 if ((tp->t_flags & TF_SENTFIN) && 6330 ((tp->snd_max - tp->snd_una) == 1) && 6331 (rsm->r_flags & RACK_HAS_FIN)) { 6332 /* 6333 * We don't start a rack timer if all we have is a 6334 * FIN outstanding. 6335 */ 6336 goto activate_rxt; 6337 } 6338 if ((rack->use_rack_rr == 0) && 6339 (IN_FASTRECOVERY(tp->t_flags)) && 6340 (rack->rack_no_prr == 0) && 6341 (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { 6342 /* 6343 * We are not cheating, in recovery and 6344 * not enough ack's to yet get our next 6345 * retransmission out. 6346 * 6347 * Note that classified attackers do not 6348 * get to use the rack-cheat. 6349 */ 6350 goto activate_tlp; 6351 } 6352 srtt = rack_grab_rtt(tp, rack); 6353 thresh = rack_calc_thresh_rack(rack, srtt, cts); 6354 idx = rsm->r_rtr_cnt - 1; 6355 exp = ((uint32_t)rsm->r_tim_lastsent[idx]) + thresh; 6356 if (SEQ_GEQ(exp, cts)) { 6357 to = exp - cts; 6358 if (to < rack->r_ctl.rc_min_to) { 6359 to = rack->r_ctl.rc_min_to; 6360 if (rack->r_rr_config == 3) 6361 rack->rc_on_min_to = 1; 6362 } 6363 } else { 6364 to = rack->r_ctl.rc_min_to; 6365 if (rack->r_rr_config == 3) 6366 rack->rc_on_min_to = 1; 6367 } 6368 } else { 6369 /* Ok we need to do a TLP not RACK */ 6370 activate_tlp: 6371 if ((rack->rc_tlp_in_progress != 0) && 6372 (rack->r_ctl.rc_tlp_cnt_out >= rack_tlp_limit)) { 6373 /* 6374 * The previous send was a TLP and we have sent 6375 * N TLP's without sending new data. 6376 */ 6377 goto activate_rxt; 6378 } 6379 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 6380 if (rsm == NULL) { 6381 /* We found no rsm to TLP with. 
*/ 6382 goto activate_rxt; 6383 } 6384 if (rsm->r_flags & RACK_HAS_FIN) { 6385 /* If its a FIN we dont do TLP */ 6386 rsm = NULL; 6387 goto activate_rxt; 6388 } 6389 idx = rsm->r_rtr_cnt - 1; 6390 time_since_sent = 0; 6391 if (TSTMP_GEQ(((uint32_t)rsm->r_tim_lastsent[idx]), rack->r_ctl.rc_tlp_rxt_last_time)) 6392 tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx]; 6393 else 6394 tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time; 6395 if (TSTMP_GT(cts, tstmp_touse)) 6396 time_since_sent = cts - tstmp_touse; 6397 is_tlp_timer = 1; 6398 if (tp->t_srtt) { 6399 if ((rack->rc_srtt_measure_made == 0) && 6400 (tp->t_srtt == 1)) { 6401 /* 6402 * If another stack as run and set srtt to 1, 6403 * then the srtt was 0, so lets use the initial. 6404 */ 6405 srtt = RACK_INITIAL_RTO; 6406 } else { 6407 srtt_cur = tp->t_srtt; 6408 srtt = srtt_cur; 6409 } 6410 } else 6411 srtt = RACK_INITIAL_RTO; 6412 /* 6413 * If the SRTT is not keeping up and the 6414 * rack RTT has spiked we want to use 6415 * the last RTT not the smoothed one. 6416 */ 6417 if (rack_tlp_use_greater && 6418 tp->t_srtt && 6419 (srtt < rack_grab_rtt(tp, rack))) { 6420 srtt = rack_grab_rtt(tp, rack); 6421 } 6422 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt); 6423 if (thresh > time_since_sent) { 6424 to = thresh - time_since_sent; 6425 } else { 6426 to = rack->r_ctl.rc_min_to; 6427 rack_log_alt_to_to_cancel(rack, 6428 thresh, /* flex1 */ 6429 time_since_sent, /* flex2 */ 6430 tstmp_touse, /* flex3 */ 6431 rack->r_ctl.rc_tlp_rxt_last_time, /* flex4 */ 6432 (uint32_t)rsm->r_tim_lastsent[idx], 6433 srtt, 6434 idx, 99); 6435 } 6436 if (to < rack_tlp_min) { 6437 to = rack_tlp_min; 6438 } 6439 if (to > TICKS_2_USEC(TCPTV_REXMTMAX)) { 6440 /* 6441 * If the TLP time works out to larger than the max 6442 * RTO lets not do TLP.. just RTO. 6443 */ 6444 goto activate_rxt; 6445 } 6446 } 6447 if (is_tlp_timer == 0) { 6448 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; 6449 } else { 6450 rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; 6451 } 6452 if (to == 0) 6453 to = 1; 6454 return (to); 6455 } 6456 6457 static void 6458 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, tcp_seq snd_una) 6459 { 6460 struct timeval tv; 6461 6462 if (rack->rc_in_persist == 0) { 6463 if (tp->t_flags & TF_GPUTINPROG) { 6464 /* 6465 * Stop the goodput now, the calling of the 6466 * measurement function clears the flag. 
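 * (rack_do_goodput_measurement() below, called with
 * RACK_QUALITY_PERSIST, is what clears TF_GPUTINPROG.)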
6467 */ 6468 rack_do_goodput_measurement(tp, rack, tp->snd_una, __LINE__, 6469 RACK_QUALITY_PERSIST); 6470 } 6471 #ifdef NETFLIX_SHARED_CWND 6472 if (rack->r_ctl.rc_scw) { 6473 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 6474 rack->rack_scwnd_is_idle = 1; 6475 } 6476 #endif 6477 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(&tv); 6478 if (rack->lt_bw_up) { 6479 /* Suspend our LT BW measurement */ 6480 uint64_t tmark; 6481 6482 rack->r_ctl.lt_bw_bytes += (snd_una - rack->r_ctl.lt_seq); 6483 rack->r_ctl.lt_seq = snd_una; 6484 tmark = tcp_tv_to_lusectick(&tv); 6485 rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark); 6486 rack->r_ctl.lt_timemark = tmark; 6487 rack->lt_bw_up = 0; 6488 rack->r_persist_lt_bw_off = 1; 6489 } 6490 if (rack->r_ctl.rc_went_idle_time == 0) 6491 rack->r_ctl.rc_went_idle_time = 1; 6492 rack_timer_cancel(tp, rack, cts, __LINE__); 6493 rack->r_ctl.persist_lost_ends = 0; 6494 rack->probe_not_answered = 0; 6495 rack->forced_ack = 0; 6496 tp->t_rxtshift = 0; 6497 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 6498 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 6499 rack->rc_in_persist = 1; 6500 } 6501 } 6502 6503 static void 6504 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 6505 { 6506 struct timeval tv; 6507 uint32_t t_time; 6508 6509 if (tcp_in_hpts(rack->rc_tp)) { 6510 tcp_hpts_remove(rack->rc_tp); 6511 rack->r_ctl.rc_hpts_flags = 0; 6512 } 6513 #ifdef NETFLIX_SHARED_CWND 6514 if (rack->r_ctl.rc_scw) { 6515 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 6516 rack->rack_scwnd_is_idle = 0; 6517 } 6518 #endif 6519 t_time = tcp_get_usecs(&tv); 6520 if (rack->rc_gp_dyn_mul && 6521 (rack->use_fixed_rate == 0) && 6522 (rack->rc_always_pace)) { 6523 /* 6524 * Do we count this as if a probe-rtt just 6525 * finished? 6526 */ 6527 uint32_t time_idle, idle_min; 6528 6529 time_idle = t_time - rack->r_ctl.rc_went_idle_time; 6530 idle_min = rack_min_probertt_hold; 6531 if (rack_probertt_gpsrtt_cnt_div) { 6532 uint64_t extra; 6533 extra = (uint64_t)rack->r_ctl.rc_gp_srtt * 6534 (uint64_t)rack_probertt_gpsrtt_cnt_mul; 6535 extra /= (uint64_t)rack_probertt_gpsrtt_cnt_div; 6536 idle_min += (uint32_t)extra; 6537 } 6538 if (time_idle >= idle_min) { 6539 /* Yes, we count it as a probe-rtt. 
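 * (Per the computation above, idle_min is rack_min_probertt_hold plus
 * (rc_gp_srtt * rack_probertt_gpsrtt_cnt_mul) / rack_probertt_gpsrtt_cnt_div
 * when that divisor is non-zero; being idle at least that long is
 * treated like having just completed a probe-rtt.)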
*/ 6540 uint32_t us_cts; 6541 6542 us_cts = tcp_get_usecs(NULL); 6543 if (rack->in_probe_rtt == 0) { 6544 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 6545 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; 6546 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; 6547 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; 6548 } else { 6549 rack_exit_probertt(rack, us_cts); 6550 } 6551 } 6552 } 6553 if (rack->r_persist_lt_bw_off) { 6554 /* Continue where we left off */ 6555 rack->r_ctl.lt_timemark = tcp_tv_to_lusectick(&tv); 6556 rack->lt_bw_up = 1; 6557 rack->r_persist_lt_bw_off = 0; 6558 } 6559 rack->rc_in_persist = 0; 6560 rack->r_ctl.rc_went_idle_time = 0; 6561 tp->t_rxtshift = 0; 6562 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 6563 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 6564 rack->r_ctl.rc_agg_delayed = 0; 6565 rack->r_early = 0; 6566 rack->r_late = 0; 6567 rack->r_ctl.rc_agg_early = 0; 6568 } 6569 6570 static void 6571 rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts, 6572 struct hpts_diag *diag, struct timeval *tv) 6573 { 6574 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 6575 union tcp_log_stackspecific log; 6576 6577 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 6578 log.u_bbr.flex1 = diag->p_nxt_slot; 6579 log.u_bbr.flex2 = diag->p_cur_slot; 6580 log.u_bbr.flex3 = diag->slot_req; 6581 log.u_bbr.flex4 = diag->inp_hptsslot; 6582 log.u_bbr.flex5 = diag->slot_remaining; 6583 log.u_bbr.flex6 = diag->need_new_to; 6584 log.u_bbr.flex7 = diag->p_hpts_active; 6585 log.u_bbr.flex8 = diag->p_on_min_sleep; 6586 /* Hijack other fields as needed */ 6587 log.u_bbr.epoch = diag->have_slept; 6588 log.u_bbr.lt_epoch = diag->yet_to_sleep; 6589 log.u_bbr.pkts_out = diag->co_ret; 6590 log.u_bbr.applimited = diag->hpts_sleep_time; 6591 log.u_bbr.delivered = diag->p_prev_slot; 6592 log.u_bbr.inflight = diag->p_runningslot; 6593 log.u_bbr.bw_inuse = diag->wheel_slot; 6594 log.u_bbr.rttProp = diag->wheel_cts; 6595 log.u_bbr.timeStamp = cts; 6596 log.u_bbr.delRate = diag->maxslots; 6597 log.u_bbr.cur_del_rate = diag->p_curtick; 6598 log.u_bbr.cur_del_rate <<= 32; 6599 log.u_bbr.cur_del_rate |= diag->p_lasttick; 6600 TCP_LOG_EVENTP(rack->rc_tp, NULL, 6601 &rack->rc_inp->inp_socket->so_rcv, 6602 &rack->rc_inp->inp_socket->so_snd, 6603 BBR_LOG_HPTSDIAG, 0, 6604 0, &log, false, tv); 6605 } 6606 6607 } 6608 6609 static void 6610 rack_log_wakeup(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb, uint32_t len, int type) 6611 { 6612 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { 6613 union tcp_log_stackspecific log; 6614 struct timeval tv; 6615 6616 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 6617 log.u_bbr.flex1 = sb->sb_flags; 6618 log.u_bbr.flex2 = len; 6619 log.u_bbr.flex3 = sb->sb_state; 6620 log.u_bbr.flex8 = type; 6621 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 6622 TCP_LOG_EVENTP(rack->rc_tp, NULL, 6623 &rack->rc_inp->inp_socket->so_rcv, 6624 &rack->rc_inp->inp_socket->so_snd, 6625 TCP_LOG_SB_WAKE, 0, 6626 len, &log, false, &tv); 6627 } 6628 } 6629 6630 static void 6631 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, 6632 int32_t slot, uint32_t tot_len_this_send, int sup_rack) 6633 { 6634 struct hpts_diag diag; 6635 struct inpcb *inp = tptoinpcb(tp); 6636 struct timeval tv; 6637 uint32_t delayed_ack = 0; 6638 uint32_t hpts_timeout; 6639 uint32_t entry_slot = slot; 6640 uint8_t stopped; 6641 uint32_t left = 0; 6642 uint32_t us_cts; 6643 6644 if ((tp->t_state == 
TCPS_CLOSED) || 6645 (tp->t_state == TCPS_LISTEN)) { 6646 return; 6647 } 6648 if (tcp_in_hpts(tp)) { 6649 /* Already on the pacer */ 6650 return; 6651 } 6652 stopped = rack->rc_tmr_stopped; 6653 if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { 6654 left = rack->r_ctl.rc_timer_exp - cts; 6655 } 6656 rack->r_ctl.rc_timer_exp = 0; 6657 rack->r_ctl.rc_hpts_flags = 0; 6658 us_cts = tcp_get_usecs(&tv); 6659 /* Now early/late accounting */ 6660 rack_log_pacing_delay_calc(rack, entry_slot, slot, 0, 0, 0, 26, __LINE__, NULL, 0); 6661 if (rack->r_early && (rack->rc_ack_can_sendout_data == 0)) { 6662 /* 6663 * We have a early carry over set, 6664 * we can always add more time so we 6665 * can always make this compensation. 6666 * 6667 * Note if ack's are allowed to wake us do not 6668 * penalize the next timer for being awoke 6669 * by an ack aka the rc_agg_early (non-paced mode). 6670 */ 6671 slot += rack->r_ctl.rc_agg_early; 6672 rack->r_early = 0; 6673 rack->r_ctl.rc_agg_early = 0; 6674 } 6675 if (rack->r_late) { 6676 /* 6677 * This is harder, we can 6678 * compensate some but it 6679 * really depends on what 6680 * the current pacing time is. 6681 */ 6682 if (rack->r_ctl.rc_agg_delayed >= slot) { 6683 /* 6684 * We can't compensate for it all. 6685 * And we have to have some time 6686 * on the clock. We always have a min 6687 * 10 slots (10 x 10 i.e. 100 usecs). 6688 */ 6689 if (slot <= HPTS_TICKS_PER_SLOT) { 6690 /* We gain delay */ 6691 rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_SLOT - slot); 6692 slot = HPTS_TICKS_PER_SLOT; 6693 } else { 6694 /* We take off some */ 6695 rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_SLOT); 6696 slot = HPTS_TICKS_PER_SLOT; 6697 } 6698 } else { 6699 slot -= rack->r_ctl.rc_agg_delayed; 6700 rack->r_ctl.rc_agg_delayed = 0; 6701 /* Make sure we have 100 useconds at minimum */ 6702 if (slot < HPTS_TICKS_PER_SLOT) { 6703 rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_SLOT - slot; 6704 slot = HPTS_TICKS_PER_SLOT; 6705 } 6706 if (rack->r_ctl.rc_agg_delayed == 0) 6707 rack->r_late = 0; 6708 } 6709 } 6710 hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack); 6711 #ifdef TCP_SAD_DETECTION 6712 if (rack->sack_attack_disable && 6713 (rack->r_ctl.ack_during_sd > 0) && 6714 (slot < tcp_sad_pacing_interval)) { 6715 /* 6716 * We have a potential attacker on 6717 * the line. We have possibly some 6718 * (or now) pacing time set. We want to 6719 * slow down the processing of sacks by some 6720 * amount (if it is an attacker). Set the default 6721 * slot for attackers in place (unless the original 6722 * interval is longer). Its stored in 6723 * micro-seconds, so lets convert to msecs. 6724 */ 6725 slot = tcp_sad_pacing_interval; 6726 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv, __LINE__); 6727 rack->r_ctl.ack_during_sd = 0; 6728 } 6729 #endif 6730 if (tp->t_flags & TF_DELACK) { 6731 delayed_ack = TICKS_2_USEC(tcp_delacktime); 6732 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; 6733 } 6734 if (delayed_ack && ((hpts_timeout == 0) || 6735 (delayed_ack < hpts_timeout))) 6736 hpts_timeout = delayed_ack; 6737 else 6738 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 6739 /* 6740 * If no timers are going to run and we will fall off the hptsi 6741 * wheel, we resort to a keep-alive timer if its configured. 
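 * (That is the case below where both hpts_timeout and slot are zero:
 * rack_timer_start() and delayed-ack gave us nothing to wait for and
 * no pacing slot was requested, so keep-alive is all that is left to
 * arm.)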
6742 */ 6743 if ((hpts_timeout == 0) && 6744 (slot == 0)) { 6745 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 6746 (tp->t_state <= TCPS_CLOSING)) { 6747 /* 6748 * Ok we have no timer (persists, rack, tlp, rxt or 6749 * del-ack), we don't have segments being paced. So 6750 * all that is left is the keepalive timer. 6751 */ 6752 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 6753 /* Get the established keep-alive time */ 6754 hpts_timeout = TICKS_2_USEC(TP_KEEPIDLE(tp)); 6755 } else { 6756 /* 6757 * Get the initial setup keep-alive time, 6758 * note that this is probably not going to 6759 * happen, since rack will be running a rxt timer 6760 * if a SYN of some sort is outstanding. It is 6761 * actually handled in rack_timeout_rxt(). 6762 */ 6763 hpts_timeout = TICKS_2_USEC(TP_KEEPINIT(tp)); 6764 } 6765 rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; 6766 if (rack->in_probe_rtt) { 6767 /* 6768 * We want to instead not wake up a long time from 6769 * now but to wake up about the time we would 6770 * exit probe-rtt and initiate a keep-alive ack. 6771 * This will get us out of probe-rtt and update 6772 * our min-rtt. 6773 */ 6774 hpts_timeout = rack_min_probertt_hold; 6775 } 6776 } 6777 } 6778 if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) == 6779 (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) { 6780 /* 6781 * RACK, TLP, persists and RXT timers all are restartable 6782 * based on actions input .. i.e we received a packet (ack 6783 * or sack) and that changes things (rw, or snd_una etc). 6784 * Thus we can restart them with a new value. For 6785 * keep-alive, delayed_ack we keep track of what was left 6786 * and restart the timer with a smaller value. 6787 */ 6788 if (left < hpts_timeout) 6789 hpts_timeout = left; 6790 } 6791 if (hpts_timeout) { 6792 /* 6793 * Hack alert for now we can't time-out over 2,147,483 6794 * seconds (a bit more than 596 hours), which is probably ok 6795 * :). 6796 */ 6797 if (hpts_timeout > 0x7ffffffe) 6798 hpts_timeout = 0x7ffffffe; 6799 rack->r_ctl.rc_timer_exp = cts + hpts_timeout; 6800 } 6801 rack_log_pacing_delay_calc(rack, entry_slot, slot, hpts_timeout, 0, 0, 27, __LINE__, NULL, 0); 6802 if ((rack->gp_ready == 0) && 6803 (rack->use_fixed_rate == 0) && 6804 (hpts_timeout < slot) && 6805 (rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) { 6806 /* 6807 * We have no good estimate yet for the 6808 * old clunky burst mitigation or the 6809 * real pacing. And the tlp or rxt is smaller 6810 * than the pacing calculation. Lets not 6811 * pace that long since we know the calculation 6812 * so far is not accurate. 6813 */ 6814 slot = hpts_timeout; 6815 } 6816 /** 6817 * Turn off all the flags for queuing by default. The 6818 * flags have important meanings to what happens when 6819 * LRO interacts with the transport. Most likely (by default now) 6820 * mbuf_queueing and ack compression are on. So the transport 6821 * has a couple of flags that control what happens (if those 6822 * are not on then these flags won't have any effect since it 6823 * won't go through the queuing LRO path). 6824 * 6825 * TF2_MBUF_QUEUE_READY - This flags says that I am busy 6826 * pacing output, so don't disturb. But 6827 * it also means LRO can wake me if there 6828 * is a SACK arrival. 6829 * 6830 * TF2_DONT_SACK_QUEUE - This flag is used in conjunction 6831 * with the above flag (QUEUE_READY) and 6832 * when present it says don't even wake me 6833 * if a SACK arrives. 
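 * In short (summarizing the code below): with no pacing slot both
 * flags stay clear and LRO/compressed acks may call into us freely;
 * with a pacing slot TF2_MBUF_QUEUE_READY is set so only a SACK can
 * wake us early; and when a Rack timer is also pending (or we have
 * classified a sack attacker) TF2_DONT_SACK_QUEUE is set as well.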
6834 * 6835 * The idea behind these flags is that if we are pacing we 6836 * set the MBUF_QUEUE_READY and only get woken up if 6837 * a SACK arrives (which could change things) or if 6838 * our pacing timer expires. If, however, we have a rack 6839 * timer running, then we don't even want a sack to wake 6840 * us since the rack timer has to expire before we can send. 6841 * 6842 * Other cases should usually have none of the flags set 6843 * so LRO can call into us. 6844 */ 6845 tp->t_flags2 &= ~(TF2_DONT_SACK_QUEUE|TF2_MBUF_QUEUE_READY); 6846 if (slot) { 6847 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; 6848 rack->r_ctl.rc_last_output_to = us_cts + slot; 6849 /* 6850 * A pacing timer (slot) is being set, in 6851 * such a case we cannot send (we are blocked by 6852 * the timer). So lets tell LRO that it should not 6853 * wake us unless there is a SACK. Note this only 6854 * will be effective if mbuf queueing is on or 6855 * compressed acks are being processed. 6856 */ 6857 tp->t_flags2 |= TF2_MBUF_QUEUE_READY; 6858 /* 6859 * But wait if we have a Rack timer running 6860 * even a SACK should not disturb us (with 6861 * the exception of r_rr_config 3). 6862 */ 6863 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) { 6864 if (rack->r_rr_config != 3) 6865 tp->t_flags2 |= TF2_DONT_SACK_QUEUE; 6866 else if (rack->rc_pace_dnd) { 6867 if (IN_RECOVERY(tp->t_flags)) { 6868 /* 6869 * When DND is on, we only let a sack 6870 * interrupt us if we are not in recovery. 6871 * 6872 * If DND is off, then we never hit here 6873 * and let all sacks wake us up. 6874 * 6875 */ 6876 tp->t_flags2 |= TF2_DONT_SACK_QUEUE; 6877 } 6878 } 6879 } 6880 /* For sack attackers we want to ignore sack */ 6881 if (rack->sack_attack_disable == 1) { 6882 tp->t_flags2 |= (TF2_DONT_SACK_QUEUE | 6883 TF2_MBUF_QUEUE_READY); 6884 } else if (rack->rc_ack_can_sendout_data) { 6885 /* 6886 * Ahh but wait, this is that special case 6887 * where the pacing timer can be disturbed 6888 * backout the changes (used for non-paced 6889 * burst limiting). 6890 */ 6891 tp->t_flags2 &= ~(TF2_DONT_SACK_QUEUE | 6892 TF2_MBUF_QUEUE_READY); 6893 } 6894 if ((rack->use_rack_rr) && 6895 (rack->r_rr_config < 2) && 6896 ((hpts_timeout) && (hpts_timeout < slot))) { 6897 /* 6898 * Arrange for the hpts to kick back in after the 6899 * t-o if the t-o does not cause a send. 6900 */ 6901 (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout), 6902 __LINE__, &diag); 6903 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 6904 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 6905 } else { 6906 (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(slot), 6907 __LINE__, &diag); 6908 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 6909 rack_log_to_start(rack, cts, hpts_timeout, slot, 1); 6910 } 6911 } else if (hpts_timeout) { 6912 /* 6913 * With respect to t_flags2(?) here, lets let any new acks wake 6914 * us up here. Since we are not pacing (no pacing timer), output 6915 * can happen so we should let it. If its a Rack timer, then any inbound 6916 * packet probably won't change the sending (we will be blocked) 6917 * but it may change the prr stats so letting it in (the set defaults 6918 * at the start of this block) are good enough. 
6919 */ 6920 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 6921 (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout), 6922 __LINE__, &diag); 6923 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 6924 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 6925 } else { 6926 /* No timer starting */ 6927 #ifdef INVARIANTS 6928 if (SEQ_GT(tp->snd_max, tp->snd_una)) { 6929 panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?", 6930 tp, rack, tot_len_this_send, cts, slot, hpts_timeout); 6931 } 6932 #endif 6933 } 6934 rack->rc_tmr_stopped = 0; 6935 if (slot) 6936 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv, __LINE__); 6937 } 6938 6939 /* 6940 * RACK Timer, here we simply do logging and house keeping. 6941 * the normal rack_output() function will call the 6942 * appropriate thing to check if we need to do a RACK retransmit. 6943 * We return 1, saying don't proceed with rack_output only 6944 * when all timers have been stopped (destroyed PCB?). 6945 */ 6946 static int 6947 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 6948 { 6949 /* 6950 * This timer simply provides an internal trigger to send out data. 6951 * The check_recovery_mode call will see if there are needed 6952 * retransmissions, if so we will enter fast-recovery. The output 6953 * call may or may not do the same thing depending on sysctl 6954 * settings. 6955 */ 6956 struct rack_sendmap *rsm; 6957 6958 counter_u64_add(rack_to_tot, 1); 6959 if (rack->r_state && (rack->r_state != tp->t_state)) 6960 rack_set_state(tp, rack); 6961 rack->rc_on_min_to = 0; 6962 rsm = rack_check_recovery_mode(tp, cts); 6963 rack_log_to_event(rack, RACK_TO_FRM_RACK, rsm); 6964 if (rsm) { 6965 rack->r_ctl.rc_resend = rsm; 6966 rack->r_timer_override = 1; 6967 if (rack->use_rack_rr) { 6968 /* 6969 * Don't accumulate extra pacing delay 6970 * we are allowing the rack timer to 6971 * over-ride pacing i.e. rrr takes precedence 6972 * if the pacing interval is longer than the rrr 6973 * time (in other words we get the min pacing 6974 * time versus rrr pacing time). 6975 */ 6976 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 6977 } 6978 } 6979 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; 6980 if (rsm == NULL) { 6981 /* restart a timer and return 1 */ 6982 rack_start_hpts_timer(rack, tp, cts, 6983 0, 0, 0); 6984 return (1); 6985 } 6986 return (0); 6987 } 6988 6989 6990 6991 static void 6992 rack_adjust_orig_mlen(struct rack_sendmap *rsm) 6993 { 6994 6995 if ((M_TRAILINGROOM(rsm->m) != rsm->orig_t_space)) { 6996 /* 6997 * The trailing space changed, mbufs can grow 6998 * at the tail but they can't shrink from 6999 * it, KASSERT that. Adjust the orig_m_len to 7000 * compensate for this change. 7001 */ 7002 KASSERT((rsm->orig_t_space > M_TRAILINGROOM(rsm->m)), 7003 ("mbuf:%p rsm:%p trailing_space:%jd ots:%u oml:%u mlen:%u\n", 7004 rsm->m, 7005 rsm, 7006 (intmax_t)M_TRAILINGROOM(rsm->m), 7007 rsm->orig_t_space, 7008 rsm->orig_m_len, 7009 rsm->m->m_len)); 7010 rsm->orig_m_len += (rsm->orig_t_space - M_TRAILINGROOM(rsm->m)); 7011 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 7012 } 7013 if (rsm->m->m_len < rsm->orig_m_len) { 7014 /* 7015 * Mbuf shrank, trimmed off the top by an ack, our 7016 * offset changes. 
7017 */ 7018 KASSERT((rsm->soff >= (rsm->orig_m_len - rsm->m->m_len)), 7019 ("mbuf:%p len:%u rsm:%p oml:%u soff:%u\n", 7020 rsm->m, rsm->m->m_len, 7021 rsm, rsm->orig_m_len, 7022 rsm->soff)); 7023 if (rsm->soff >= (rsm->orig_m_len - rsm->m->m_len)) 7024 rsm->soff -= (rsm->orig_m_len - rsm->m->m_len); 7025 else 7026 rsm->soff = 0; 7027 rsm->orig_m_len = rsm->m->m_len; 7028 #ifdef INVARIANTS 7029 } else if (rsm->m->m_len > rsm->orig_m_len) { 7030 panic("rsm:%p m:%p m_len grew outside of t_space compensation", 7031 rsm, rsm->m); 7032 #endif 7033 } 7034 } 7035 7036 static void 7037 rack_setup_offset_for_rsm(struct tcp_rack *rack, struct rack_sendmap *src_rsm, struct rack_sendmap *rsm) 7038 { 7039 struct mbuf *m; 7040 uint32_t soff; 7041 7042 if (src_rsm->m && 7043 ((src_rsm->orig_m_len != src_rsm->m->m_len) || 7044 (M_TRAILINGROOM(src_rsm->m) != src_rsm->orig_t_space))) { 7045 /* Fix up the orig_m_len and possibly the mbuf offset */ 7046 rack_adjust_orig_mlen(src_rsm); 7047 } 7048 m = src_rsm->m; 7049 soff = src_rsm->soff + (src_rsm->r_end - src_rsm->r_start); 7050 while (soff >= m->m_len) { 7051 /* Move out past this mbuf */ 7052 soff -= m->m_len; 7053 m = m->m_next; 7054 KASSERT((m != NULL), 7055 ("rsm:%p nrsm:%p hit at soff:%u null m", 7056 src_rsm, rsm, soff)); 7057 if (m == NULL) { 7058 /* This should *not* happen which is why there is a kassert */ 7059 src_rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 7060 (src_rsm->r_start - rack->rc_tp->snd_una), 7061 &src_rsm->soff); 7062 src_rsm->orig_m_len = src_rsm->m->m_len; 7063 src_rsm->orig_t_space = M_TRAILINGROOM(src_rsm->m); 7064 rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 7065 (rsm->r_start - rack->rc_tp->snd_una), 7066 &rsm->soff); 7067 rsm->orig_m_len = rsm->m->m_len; 7068 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 7069 return; 7070 } 7071 } 7072 rsm->m = m; 7073 rsm->soff = soff; 7074 rsm->orig_m_len = m->m_len; 7075 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 7076 } 7077 7078 static __inline void 7079 rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm, 7080 struct rack_sendmap *rsm, uint32_t start) 7081 { 7082 int idx; 7083 7084 nrsm->r_start = start; 7085 nrsm->r_end = rsm->r_end; 7086 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 7087 nrsm->r_flags = rsm->r_flags; 7088 nrsm->r_dupack = rsm->r_dupack; 7089 nrsm->r_no_rtt_allowed = rsm->r_no_rtt_allowed; 7090 nrsm->r_rtr_bytes = 0; 7091 nrsm->r_fas = rsm->r_fas; 7092 nrsm->r_bas = rsm->r_bas; 7093 rsm->r_end = nrsm->r_start; 7094 nrsm->r_just_ret = rsm->r_just_ret; 7095 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 7096 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 7097 } 7098 /* Now if we have SYN flag we keep it on the left edge */ 7099 if (nrsm->r_flags & RACK_HAS_SYN) 7100 nrsm->r_flags &= ~RACK_HAS_SYN; 7101 /* Now if we have a FIN flag we keep it on the right edge */ 7102 if (rsm->r_flags & RACK_HAS_FIN) 7103 rsm->r_flags &= ~RACK_HAS_FIN; 7104 /* Push bit must go to the right edge as well */ 7105 if (rsm->r_flags & RACK_HAD_PUSH) 7106 rsm->r_flags &= ~RACK_HAD_PUSH; 7107 /* Clone over the state of the hw_tls flag */ 7108 nrsm->r_hw_tls = rsm->r_hw_tls; 7109 /* 7110 * Now we need to find nrsm's new location in the mbuf chain 7111 * we basically calculate a new offset, which is soff + 7112 * how much is left in original rsm. Then we walk out the mbuf 7113 * chain to find the righ position, it may be the same mbuf 7114 * or maybe not. 
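 * For example (numbers purely illustrative): if rsm covered sequences
 * [1000, 2448) at soff 100 and we split at start 2000, rsm shrinks to
 * [1000, 2000) and rack_setup_offset_for_rsm() begins nrsm's walk at
 * offset 100 + 1000 = 1100 into rsm's mbuf chain.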
7115 */ 7116 KASSERT(((rsm->m != NULL) || 7117 (rsm->r_flags & (RACK_HAS_SYN|RACK_HAS_FIN))), 7118 ("rsm:%p nrsm:%p rack:%p -- rsm->m is NULL?", rsm, nrsm, rack)); 7119 if (rsm->m) 7120 rack_setup_offset_for_rsm(rack, rsm, nrsm); 7121 } 7122 7123 static struct rack_sendmap * 7124 rack_merge_rsm(struct tcp_rack *rack, 7125 struct rack_sendmap *l_rsm, 7126 struct rack_sendmap *r_rsm) 7127 { 7128 /* 7129 * We are merging two ack'd RSM's, 7130 * the l_rsm is on the left (lower seq 7131 * values) and the r_rsm is on the right 7132 * (higher seq value). The simplest way 7133 * to merge these is to move the right 7134 * one into the left. I don't think there 7135 * is any reason we need to try to find 7136 * the oldest (or last oldest retransmitted). 7137 */ 7138 rack_log_map_chg(rack->rc_tp, rack, NULL, 7139 l_rsm, r_rsm, MAP_MERGE, r_rsm->r_end, __LINE__); 7140 l_rsm->r_end = r_rsm->r_end; 7141 if (l_rsm->r_dupack < r_rsm->r_dupack) 7142 l_rsm->r_dupack = r_rsm->r_dupack; 7143 if (r_rsm->r_rtr_bytes) 7144 l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes; 7145 if (r_rsm->r_in_tmap) { 7146 /* This really should not happen */ 7147 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext); 7148 r_rsm->r_in_tmap = 0; 7149 } 7150 7151 /* Now the flags */ 7152 if (r_rsm->r_flags & RACK_HAS_FIN) 7153 l_rsm->r_flags |= RACK_HAS_FIN; 7154 if (r_rsm->r_flags & RACK_TLP) 7155 l_rsm->r_flags |= RACK_TLP; 7156 if (r_rsm->r_flags & RACK_RWND_COLLAPSED) 7157 l_rsm->r_flags |= RACK_RWND_COLLAPSED; 7158 if ((r_rsm->r_flags & RACK_APP_LIMITED) && 7159 ((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) { 7160 /* 7161 * If both are app-limited then let the 7162 * free lower the count. If right is app 7163 * limited and left is not, transfer. 7164 */ 7165 l_rsm->r_flags |= RACK_APP_LIMITED; 7166 r_rsm->r_flags &= ~RACK_APP_LIMITED; 7167 if (r_rsm == rack->r_ctl.rc_first_appl) 7168 rack->r_ctl.rc_first_appl = l_rsm; 7169 } 7170 tqhash_remove(rack->r_ctl.tqh, r_rsm, REMOVE_TYPE_MERGE); 7171 /* 7172 * We keep the largest value, which is the newest 7173 * send. We do this in case a segment that is 7174 * joined together and not part of a GP estimate 7175 * later gets expanded into the GP estimate. 7176 * 7177 * We prohibit the merging of unlike kinds i.e. 7178 * all pieces that are in the GP estimate can be 7179 * merged and all pieces that are not in a GP estimate 7180 * can be merged, but not disimilar pieces. Combine 7181 * this with taking the highest here and we should 7182 * be ok unless of course the client reneges. Then 7183 * all bets are off. 7184 */ 7185 if(l_rsm->r_tim_lastsent[(l_rsm->r_rtr_cnt-1)] < 7186 r_rsm->r_tim_lastsent[(r_rsm->r_rtr_cnt-1)]) { 7187 l_rsm->r_tim_lastsent[(l_rsm->r_rtr_cnt-1)] = r_rsm->r_tim_lastsent[(r_rsm->r_rtr_cnt-1)]; 7188 } 7189 /* 7190 * When merging two RSM's we also need to consider the ack time and keep 7191 * newest. If the ack gets merged into a measurement then that is the 7192 * one we will want to be using. 7193 */ 7194 if(l_rsm->r_ack_arrival < r_rsm->r_ack_arrival) 7195 l_rsm->r_ack_arrival = r_rsm->r_ack_arrival; 7196 7197 if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) { 7198 /* Transfer the split limit to the map we free */ 7199 r_rsm->r_limit_type = l_rsm->r_limit_type; 7200 l_rsm->r_limit_type = 0; 7201 } 7202 rack_free(rack, r_rsm); 7203 l_rsm->r_flags |= RACK_MERGED; 7204 return (l_rsm); 7205 } 7206 7207 /* 7208 * TLP Timer, here we simply setup what segment we want to 7209 * have the TLP expire on, the normal rack_output() will then 7210 * send it out. 
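 * (Depending on what is outstanding and the peer's window, the probe
 * is either one new segment of fresh data or a re-send of the
 * highest un-acked segment, in the hope of eliciting an ACK/SACK
 * that reveals any tail loss.)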
7211 * 7212 * We return 1, saying don't proceed with rack_output only 7213 * when all timers have been stopped (destroyed PCB?). 7214 */ 7215 static int 7216 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t *doing_tlp) 7217 { 7218 /* 7219 * Tail Loss Probe. 7220 */ 7221 struct rack_sendmap *rsm = NULL; 7222 int insret __diagused; 7223 struct socket *so = tptosocket(tp); 7224 uint32_t amm; 7225 uint32_t out, avail; 7226 int collapsed_win = 0; 7227 7228 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 7229 /* Its not time yet */ 7230 return (0); 7231 } 7232 if (ctf_progress_timeout_check(tp, true)) { 7233 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 7234 return (-ETIMEDOUT); /* tcp_drop() */ 7235 } 7236 /* 7237 * A TLP timer has expired. We have been idle for 2 rtts. So we now 7238 * need to figure out how to force a full MSS segment out. 7239 */ 7240 rack_log_to_event(rack, RACK_TO_FRM_TLP, NULL); 7241 rack->r_ctl.retran_during_recovery = 0; 7242 rack->r_ctl.dsack_byte_cnt = 0; 7243 counter_u64_add(rack_tlp_tot, 1); 7244 if (rack->r_state && (rack->r_state != tp->t_state)) 7245 rack_set_state(tp, rack); 7246 avail = sbavail(&so->so_snd); 7247 out = tp->snd_max - tp->snd_una; 7248 if ((out > tp->snd_wnd) || rack->rc_has_collapsed) { 7249 /* special case, we need a retransmission */ 7250 collapsed_win = 1; 7251 goto need_retran; 7252 } 7253 if (rack->r_ctl.dsack_persist && (rack->r_ctl.rc_tlp_cnt_out >= 1)) { 7254 rack->r_ctl.dsack_persist--; 7255 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) { 7256 rack->r_ctl.num_dsack = 0; 7257 } 7258 rack_log_dsack_event(rack, 1, __LINE__, 0, 0); 7259 } 7260 if ((tp->t_flags & TF_GPUTINPROG) && 7261 (rack->r_ctl.rc_tlp_cnt_out == 1)) { 7262 /* 7263 * If this is the second in a row 7264 * TLP and we are doing a measurement 7265 * its time to abandon the measurement. 7266 * Something is likely broken on 7267 * the clients network and measuring a 7268 * broken network does us no good. 7269 */ 7270 tp->t_flags &= ~TF_GPUTINPROG; 7271 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 7272 rack->r_ctl.rc_gp_srtt /*flex1*/, 7273 tp->gput_seq, 7274 0, 0, 18, __LINE__, NULL, 0); 7275 } 7276 /* 7277 * Check our send oldest always settings, and if 7278 * there is an oldest to send jump to the need_retran. 7279 */ 7280 if (rack_always_send_oldest && (TAILQ_EMPTY(&rack->r_ctl.rc_tmap) == 0)) 7281 goto need_retran; 7282 7283 if (avail > out) { 7284 /* New data is available */ 7285 amm = avail - out; 7286 if (amm > ctf_fixed_maxseg(tp)) { 7287 amm = ctf_fixed_maxseg(tp); 7288 if ((amm + out) > tp->snd_wnd) { 7289 /* We are rwnd limited */ 7290 goto need_retran; 7291 } 7292 } else if (amm < ctf_fixed_maxseg(tp)) { 7293 /* not enough to fill a MTU */ 7294 goto need_retran; 7295 } 7296 if (IN_FASTRECOVERY(tp->t_flags)) { 7297 /* Unlikely */ 7298 if (rack->rack_no_prr == 0) { 7299 if (out + amm <= tp->snd_wnd) { 7300 rack->r_ctl.rc_prr_sndcnt = amm; 7301 rack->r_ctl.rc_tlp_new_data = amm; 7302 rack_log_to_prr(rack, 4, 0, __LINE__); 7303 } 7304 } else 7305 goto need_retran; 7306 } else { 7307 /* Set the send-new override */ 7308 if (out + amm <= tp->snd_wnd) 7309 rack->r_ctl.rc_tlp_new_data = amm; 7310 else 7311 goto need_retran; 7312 } 7313 rack->r_ctl.rc_tlpsend = NULL; 7314 counter_u64_add(rack_tlp_newdata, 1); 7315 goto send; 7316 } 7317 need_retran: 7318 /* 7319 * Ok we need to arrange the last un-acked segment to be re-sent, or 7320 * optionally the first un-acked segment. 
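 * (Below, rack_always_send_oldest selects the head of the transmit
 * map; otherwise we take the highest-sequence un-acked rsm, and a
 * collapsed window instead targets the data just before the
 * last_collapse_point.)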
7321 */ 7322 if (collapsed_win == 0) { 7323 if (rack_always_send_oldest) 7324 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 7325 else { 7326 rsm = tqhash_max(rack->r_ctl.tqh); 7327 if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { 7328 rsm = rack_find_high_nonack(rack, rsm); 7329 } 7330 } 7331 if (rsm == NULL) { 7332 #ifdef TCP_BLACKBOX 7333 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 7334 #endif 7335 goto out; 7336 } 7337 } else { 7338 /* 7339 * We had a collapsed window, lets find 7340 * the point before the collapse. 7341 */ 7342 if (SEQ_GT((rack->r_ctl.last_collapse_point - 1), rack->rc_tp->snd_una)) 7343 rsm = tqhash_find(rack->r_ctl.tqh, (rack->r_ctl.last_collapse_point - 1)); 7344 else { 7345 rsm = tqhash_min(rack->r_ctl.tqh); 7346 } 7347 if (rsm == NULL) { 7348 /* Huh */ 7349 goto out; 7350 } 7351 } 7352 if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) { 7353 /* 7354 * We need to split this the last segment in two. 7355 */ 7356 struct rack_sendmap *nrsm; 7357 7358 nrsm = rack_alloc_full_limit(rack); 7359 if (nrsm == NULL) { 7360 /* 7361 * No memory to split, we will just exit and punt 7362 * off to the RXT timer. 7363 */ 7364 goto out; 7365 } 7366 rack_clone_rsm(rack, nrsm, rsm, 7367 (rsm->r_end - ctf_fixed_maxseg(tp))); 7368 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); 7369 #ifndef INVARIANTS 7370 (void)tqhash_insert(rack->r_ctl.tqh, nrsm); 7371 #else 7372 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { 7373 panic("Insert in rb tree of %p fails ret:%d rack:%p rsm:%p", 7374 nrsm, insret, rack, rsm); 7375 } 7376 #endif 7377 if (rsm->r_in_tmap) { 7378 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 7379 nrsm->r_in_tmap = 1; 7380 } 7381 rsm = nrsm; 7382 } 7383 rack->r_ctl.rc_tlpsend = rsm; 7384 send: 7385 /* Make sure output path knows we are doing a TLP */ 7386 *doing_tlp = 1; 7387 rack->r_timer_override = 1; 7388 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 7389 return (0); 7390 out: 7391 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 7392 return (0); 7393 } 7394 7395 /* 7396 * Delayed ack Timer, here we simply need to setup the 7397 * ACK_NOW flag and remove the DELACK flag. From there 7398 * the output routine will send the ack out. 7399 * 7400 * We only return 1, saying don't proceed, if all timers 7401 * are stopped (destroyed PCB?). 7402 */ 7403 static int 7404 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 7405 { 7406 7407 rack_log_to_event(rack, RACK_TO_FRM_DELACK, NULL); 7408 tp->t_flags &= ~TF_DELACK; 7409 tp->t_flags |= TF_ACKNOW; 7410 KMOD_TCPSTAT_INC(tcps_delack); 7411 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 7412 return (0); 7413 } 7414 7415 /* 7416 * Persists timer, here we simply send the 7417 * same thing as a keepalive will. 7418 * the one byte send. 7419 * 7420 * We only return 1, saying don't proceed, if all timers 7421 * are stopped (destroyed PCB?). 7422 */ 7423 static int 7424 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 7425 { 7426 struct tcptemp *t_template; 7427 int32_t retval = 1; 7428 7429 if (rack->rc_in_persist == 0) 7430 return (0); 7431 if (ctf_progress_timeout_check(tp, false)) { 7432 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 7433 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 7434 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends); 7435 return (-ETIMEDOUT); /* tcp_drop() */ 7436 } 7437 /* 7438 * Persistence timer into zero window. 
Force a byte to be output, if 7439 * possible. 7440 */ 7441 KMOD_TCPSTAT_INC(tcps_persisttimeo); 7442 /* 7443 * Hack: if the peer is dead/unreachable, we do not time out if the 7444 * window is closed. After a full backoff, drop the connection if 7445 * the idle time (no responses to probes) reaches the maximum 7446 * backoff that we would use if retransmitting. 7447 */ 7448 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 7449 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 7450 TICKS_2_USEC(ticks - tp->t_rcvtime) >= RACK_REXMTVAL(tp) * tcp_totbackoff)) { 7451 KMOD_TCPSTAT_INC(tcps_persistdrop); 7452 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 7453 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends); 7454 retval = -ETIMEDOUT; /* tcp_drop() */ 7455 goto out; 7456 } 7457 if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) && 7458 tp->snd_una == tp->snd_max) 7459 rack_exit_persist(tp, rack, cts); 7460 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; 7461 /* 7462 * If the user has closed the socket then drop a persisting 7463 * connection after a much reduced timeout. 7464 */ 7465 if (tp->t_state > TCPS_CLOSE_WAIT && 7466 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 7467 KMOD_TCPSTAT_INC(tcps_persistdrop); 7468 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 7469 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends); 7470 retval = -ETIMEDOUT; /* tcp_drop() */ 7471 goto out; 7472 } 7473 t_template = tcpip_maketemplate(rack->rc_inp); 7474 if (t_template) { 7475 /* only set it if we were answered */ 7476 if (rack->forced_ack == 0) { 7477 rack->forced_ack = 1; 7478 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); 7479 } else { 7480 rack->probe_not_answered = 1; 7481 counter_u64_add(rack_persists_loss, 1); 7482 rack->r_ctl.persist_lost_ends++; 7483 } 7484 counter_u64_add(rack_persists_sends, 1); 7485 counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1); 7486 tcp_respond(tp, t_template->tt_ipgen, 7487 &t_template->tt_t, (struct mbuf *)NULL, 7488 tp->rcv_nxt, tp->snd_una - 1, 0); 7489 /* This sends an ack */ 7490 if (tp->t_flags & TF_DELACK) 7491 tp->t_flags &= ~TF_DELACK; 7492 free(t_template, M_TEMP); 7493 } 7494 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 7495 tp->t_rxtshift++; 7496 out: 7497 rack_log_to_event(rack, RACK_TO_FRM_PERSIST, NULL); 7498 rack_start_hpts_timer(rack, tp, cts, 7499 0, 0, 0); 7500 return (retval); 7501 } 7502 7503 /* 7504 * If a keepalive goes off, we had no other timers 7505 * happening. We always return 1 here since this 7506 * routine either drops the connection or sends 7507 * out a segment with respond. 7508 */ 7509 static int 7510 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 7511 { 7512 struct tcptemp *t_template; 7513 struct inpcb *inp = tptoinpcb(tp); 7514 7515 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; 7516 rack_log_to_event(rack, RACK_TO_FRM_KEEP, NULL); 7517 /* 7518 * Keep-alive timer went off; send something or drop connection if 7519 * idle for too long. 7520 */ 7521 KMOD_TCPSTAT_INC(tcps_keeptimeo); 7522 if (tp->t_state < TCPS_ESTABLISHED) 7523 goto dropit; 7524 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 7525 tp->t_state <= TCPS_CLOSING) { 7526 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 7527 goto dropit; 7528 /* 7529 * Send a packet designed to force a response if the peer is 7530 * up and reachable: either an ACK if the connection is 7531 * still alive, or an RST if the peer has closed the 7532 * connection due to timeout or reboot. 
Using sequence
         * number tp->snd_una-1 causes the transmitted zero-length
         * segment to lie outside the receive window; by the
         * protocol spec, this requires the correspondent TCP to
         * respond.
         */
        KMOD_TCPSTAT_INC(tcps_keepprobe);
        t_template = tcpip_maketemplate(inp);
        if (t_template) {
            if (rack->forced_ack == 0) {
                rack->forced_ack = 1;
                rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL);
            } else {
                rack->probe_not_answered = 1;
            }
            tcp_respond(tp, t_template->tt_ipgen,
                &t_template->tt_t, (struct mbuf *)NULL,
                tp->rcv_nxt, tp->snd_una - 1, 0);
            free(t_template, M_TEMP);
        }
    }
    rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
    return (1);
dropit:
    KMOD_TCPSTAT_INC(tcps_keepdrops);
    tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
    return (-ETIMEDOUT);	/* tcp_drop() */
}

/*
 * Retransmit helper function, clear up all the ack
 * flags and take care of important bookkeeping.
 */
static void
rack_remxt_tmr(struct tcpcb *tp)
{
    /*
     * The retransmit timer went off, all sack'd blocks must be
     * un-acked.
     */
    struct rack_sendmap *rsm, *trsm = NULL;
    struct tcp_rack *rack;

    rack = (struct tcp_rack *)tp->t_fb_ptr;
    rack_timer_cancel(tp, rack, tcp_get_usecs(NULL), __LINE__);
    rack_log_to_event(rack, RACK_TO_FRM_TMR, NULL);
    if (rack->r_state && (rack->r_state != tp->t_state))
        rack_set_state(tp, rack);
    /*
     * Ideally we would like to be able to
     * mark SACK-PASS on anything not acked here.
     *
     * However, if we do that we would burst out
     * all that data 1ms apart. This would be unwise,
     * so for now we will just let the normal rxt timer
     * and tlp timer take care of it.
     *
     * Also we really need to stick them back in sequence
     * order. This way we send in the proper order and any
     * sacks that come floating in will "re-ack" the data.
     * To do this we zap the tmap with an INIT and then
     * walk through and place every rsm in the RB tree
     * back in its seq ordered place.
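     *
     * While re-threading the tmap below we also clear the RACK_ACKED
     * and SACK-related flags (remembering any prior ack via
     * RACK_WAS_ACKED) and mark every entry RACK_MUST_RXT, so the data
     * is again treated as un-acked and eligible for retransmission.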
7595 */ 7596 TAILQ_INIT(&rack->r_ctl.rc_tmap); 7597 7598 TQHASH_FOREACH(rsm, rack->r_ctl.tqh) { 7599 rsm->r_dupack = 0; 7600 if (rack_verbose_logging) 7601 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7602 /* We must re-add it back to the tlist */ 7603 if (trsm == NULL) { 7604 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7605 } else { 7606 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); 7607 } 7608 rsm->r_in_tmap = 1; 7609 trsm = rsm; 7610 if (rsm->r_flags & RACK_ACKED) 7611 rsm->r_flags |= RACK_WAS_ACKED; 7612 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS | RACK_RWND_COLLAPSED); 7613 rsm->r_flags |= RACK_MUST_RXT; 7614 } 7615 /* Clear the count (we just un-acked them) */ 7616 rack->r_ctl.rc_last_timeout_snduna = tp->snd_una; 7617 rack->r_ctl.rc_sacked = 0; 7618 rack->r_ctl.rc_sacklast = NULL; 7619 rack->r_ctl.rc_agg_delayed = 0; 7620 rack->r_early = 0; 7621 rack->r_ctl.rc_agg_early = 0; 7622 rack->r_late = 0; 7623 /* Clear the tlp rtx mark */ 7624 rack->r_ctl.rc_resend = tqhash_min(rack->r_ctl.tqh); 7625 if (rack->r_ctl.rc_resend != NULL) 7626 rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT; 7627 rack->r_ctl.rc_prr_sndcnt = 0; 7628 rack_log_to_prr(rack, 6, 0, __LINE__); 7629 rack->r_timer_override = 1; 7630 if ((((tp->t_flags & TF_SACK_PERMIT) == 0) 7631 #ifdef TCP_SAD_DETECTION 7632 || (rack->sack_attack_disable != 0) 7633 #endif 7634 ) && ((tp->t_flags & TF_SENTFIN) == 0)) { 7635 /* 7636 * For non-sack customers new data 7637 * needs to go out as retransmits until 7638 * we retransmit up to snd_max. 7639 */ 7640 rack->r_must_retran = 1; 7641 rack->r_ctl.rc_out_at_rto = ctf_flight_size(rack->rc_tp, 7642 rack->r_ctl.rc_sacked); 7643 } 7644 rack->r_ctl.rc_snd_max_at_rto = tp->snd_max; 7645 } 7646 7647 static void 7648 rack_convert_rtts(struct tcpcb *tp) 7649 { 7650 tcp_change_time_units(tp, TCP_TMR_GRANULARITY_USEC); 7651 tp->t_rxtcur = RACK_REXMTVAL(tp); 7652 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 7653 tp->t_rxtcur += TICKS_2_USEC(tcp_rexmit_slop); 7654 } 7655 if (tp->t_rxtcur > rack_rto_max) { 7656 tp->t_rxtcur = rack_rto_max; 7657 } 7658 } 7659 7660 static void 7661 rack_cc_conn_init(struct tcpcb *tp) 7662 { 7663 struct tcp_rack *rack; 7664 uint32_t srtt; 7665 7666 rack = (struct tcp_rack *)tp->t_fb_ptr; 7667 srtt = tp->t_srtt; 7668 cc_conn_init(tp); 7669 /* 7670 * Now convert to rack's internal format, 7671 * if required. 7672 */ 7673 if ((srtt == 0) && (tp->t_srtt != 0)) 7674 rack_convert_rtts(tp); 7675 /* 7676 * We want a chance to stay in slowstart as 7677 * we create a connection. TCP spec says that 7678 * initially ssthresh is infinite. For our 7679 * purposes that is the snd_wnd. 7680 */ 7681 if (tp->snd_ssthresh < tp->snd_wnd) { 7682 tp->snd_ssthresh = tp->snd_wnd; 7683 } 7684 /* 7685 * We also want to assure a IW worth of 7686 * data can get inflight. 7687 */ 7688 if (rc_init_window(rack) < tp->snd_cwnd) 7689 tp->snd_cwnd = rc_init_window(rack); 7690 } 7691 7692 /* 7693 * Re-transmit timeout! If we drop the PCB we will return 1, otherwise 7694 * we will setup to retransmit the lowest seq number outstanding. 7695 */ 7696 static int 7697 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 7698 { 7699 struct inpcb *inp = tptoinpcb(tp); 7700 int32_t rexmt; 7701 int32_t retval = 0; 7702 bool isipv6; 7703 7704 if ((tp->t_flags & TF_GPUTINPROG) && 7705 (tp->t_rxtshift)) { 7706 /* 7707 * We have had a second timeout 7708 * measurements on successive rxt's are not profitable. 
7709 * It is unlikely to be of any use (the network is 7710 * broken or the client went away). 7711 */ 7712 tp->t_flags &= ~TF_GPUTINPROG; 7713 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 7714 rack->r_ctl.rc_gp_srtt /*flex1*/, 7715 tp->gput_seq, 7716 0, 0, 18, __LINE__, NULL, 0); 7717 } 7718 if (ctf_progress_timeout_check(tp, false)) { 7719 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 7720 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 7721 return (-ETIMEDOUT); /* tcp_drop() */ 7722 } 7723 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; 7724 rack->r_ctl.retran_during_recovery = 0; 7725 rack->rc_ack_required = 1; 7726 rack->r_ctl.dsack_byte_cnt = 0; 7727 if (IN_FASTRECOVERY(tp->t_flags)) 7728 tp->t_flags |= TF_WASFRECOVERY; 7729 else 7730 tp->t_flags &= ~TF_WASFRECOVERY; 7731 if (IN_CONGRECOVERY(tp->t_flags)) 7732 tp->t_flags |= TF_WASCRECOVERY; 7733 else 7734 tp->t_flags &= ~TF_WASCRECOVERY; 7735 if (TCPS_HAVEESTABLISHED(tp->t_state) && 7736 (tp->snd_una == tp->snd_max)) { 7737 /* Nothing outstanding .. nothing to do */ 7738 return (0); 7739 } 7740 if (rack->r_ctl.dsack_persist) { 7741 rack->r_ctl.dsack_persist--; 7742 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) { 7743 rack->r_ctl.num_dsack = 0; 7744 } 7745 rack_log_dsack_event(rack, 1, __LINE__, 0, 0); 7746 } 7747 /* 7748 * Rack can only run one timer at a time, so we cannot 7749 * run a KEEPINIT (gating SYN sending) and a retransmit 7750 * timer for the SYN. So if we are in a front state and 7751 * have a KEEPINIT timer we need to check the first transmit 7752 * against now to see if we have exceeded the KEEPINIT time 7753 * (if one is set). 7754 */ 7755 if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) && 7756 (TP_KEEPINIT(tp) != 0)) { 7757 struct rack_sendmap *rsm; 7758 7759 rsm = tqhash_min(rack->r_ctl.tqh); 7760 if (rsm) { 7761 /* Ok we have something outstanding to test keepinit with */ 7762 if ((TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) && 7763 ((cts - (uint32_t)rsm->r_tim_lastsent[0]) >= TICKS_2_USEC(TP_KEEPINIT(tp)))) { 7764 /* We have exceeded the KEEPINIT time */ 7765 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); 7766 goto drop_it; 7767 } 7768 } 7769 } 7770 /* 7771 * Retransmission timer went off. Message has not been acked within 7772 * retransmit interval. Back off to a longer retransmit interval 7773 * and retransmit one segment. 7774 */ 7775 rack_remxt_tmr(tp); 7776 if ((rack->r_ctl.rc_resend == NULL) || 7777 ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) { 7778 /* 7779 * If the rwnd collapsed on 7780 * the one we are retransmitting 7781 * it does not count against the 7782 * rxt count. 7783 */ 7784 tp->t_rxtshift++; 7785 } 7786 if (tp->t_rxtshift > TCP_MAXRXTSHIFT) { 7787 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 7788 drop_it: 7789 tp->t_rxtshift = TCP_MAXRXTSHIFT; 7790 KMOD_TCPSTAT_INC(tcps_timeoutdrop); 7791 /* XXXGL: previously t_softerror was casted to uint16_t */ 7792 MPASS(tp->t_softerror >= 0); 7793 retval = tp->t_softerror ? -tp->t_softerror : -ETIMEDOUT; 7794 goto out; /* tcp_drop() */ 7795 } 7796 if (tp->t_state == TCPS_SYN_SENT) { 7797 /* 7798 * If the SYN was retransmitted, indicate CWND to be limited 7799 * to 1 segment in cc_conn_init(). 7800 */ 7801 tp->snd_cwnd = 1; 7802 } else if (tp->t_rxtshift == 1) { 7803 /* 7804 * first retransmit; record ssthresh and cwnd so they can be 7805 * recovered if this turns out to be a "bad" retransmit. 
A 7806 * retransmit is considered "bad" if an ACK for this segment 7807 * is received within RTT/2 interval; the assumption here is 7808 * that the ACK was already in flight. See "On Estimating 7809 * End-to-End Network Path Properties" by Allman and Paxson 7810 * for more details. 7811 */ 7812 tp->snd_cwnd_prev = tp->snd_cwnd; 7813 tp->snd_ssthresh_prev = tp->snd_ssthresh; 7814 tp->snd_recover_prev = tp->snd_recover; 7815 tp->t_badrxtwin = ticks + (USEC_2_TICKS(tp->t_srtt)/2); 7816 tp->t_flags |= TF_PREVVALID; 7817 } else if ((tp->t_flags & TF_RCVD_TSTMP) == 0) 7818 tp->t_flags &= ~TF_PREVVALID; 7819 KMOD_TCPSTAT_INC(tcps_rexmttimeo); 7820 if ((tp->t_state == TCPS_SYN_SENT) || 7821 (tp->t_state == TCPS_SYN_RECEIVED)) 7822 rexmt = RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]; 7823 else 7824 rexmt = max(rack_rto_min, (tp->t_srtt + (tp->t_rttvar << 2))) * tcp_backoff[tp->t_rxtshift]; 7825 7826 RACK_TCPT_RANGESET(tp->t_rxtcur, rexmt, 7827 max(rack_rto_min, rexmt), rack_rto_max, rack->r_ctl.timer_slop); 7828 /* 7829 * We enter the path for PLMTUD if connection is established or, if 7830 * connection is FIN_WAIT_1 status, reason for the last is that if 7831 * amount of data we send is very small, we could send it in couple 7832 * of packets and process straight to FIN. In that case we won't 7833 * catch ESTABLISHED state. 7834 */ 7835 #ifdef INET6 7836 isipv6 = (inp->inp_vflag & INP_IPV6) ? true : false; 7837 #else 7838 isipv6 = false; 7839 #endif 7840 if (((V_tcp_pmtud_blackhole_detect == 1) || 7841 (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) || 7842 (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) && 7843 ((tp->t_state == TCPS_ESTABLISHED) || 7844 (tp->t_state == TCPS_FIN_WAIT_1))) { 7845 /* 7846 * Idea here is that at each stage of mtu probe (usually, 7847 * 1448 -> 1188 -> 524) should be given 2 chances to recover 7848 * before further clamping down. 'tp->t_rxtshift % 2 == 0' 7849 * should take care of that. 7850 */ 7851 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == 7852 (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && 7853 (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && 7854 tp->t_rxtshift % 2 == 0)) { 7855 /* 7856 * Enter Path MTU Black-hole Detection mechanism: - 7857 * Disable Path MTU Discovery (IP "DF" bit). - 7858 * Reduce MTU to lower value than what we negotiated 7859 * with peer. 7860 */ 7861 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { 7862 /* Record that we may have found a black hole. */ 7863 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 7864 /* Keep track of previous MSS. */ 7865 tp->t_pmtud_saved_maxseg = tp->t_maxseg; 7866 } 7867 7868 /* 7869 * Reduce the MSS to blackhole value or to the 7870 * default in an attempt to retransmit. 7871 */ 7872 #ifdef INET6 7873 if (isipv6 && 7874 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { 7875 /* Use the sysctl tuneable blackhole MSS. */ 7876 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; 7877 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 7878 } else if (isipv6) { 7879 /* Use the default MSS. */ 7880 tp->t_maxseg = V_tcp_v6mssdflt; 7881 /* 7882 * Disable Path MTU Discovery when we switch 7883 * to minmss. 7884 */ 7885 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 7886 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 7887 } 7888 #endif 7889 #if defined(INET6) && defined(INET) 7890 else 7891 #endif 7892 #ifdef INET 7893 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { 7894 /* Use the sysctl tuneable blackhole MSS. 
*/ 7895 tp->t_maxseg = V_tcp_pmtud_blackhole_mss; 7896 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 7897 } else { 7898 /* Use the default MSS. */ 7899 tp->t_maxseg = V_tcp_mssdflt; 7900 /* 7901 * Disable Path MTU Discovery when we switch 7902 * to minmss. 7903 */ 7904 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 7905 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 7906 } 7907 #endif 7908 } else { 7909 /* 7910 * If further retransmissions are still unsuccessful 7911 * with a lowered MTU, maybe this isn't a blackhole 7912 * and we restore the previous MSS and blackhole 7913 * detection flags. The limit '6' is determined by 7914 * giving each probe stage (1448, 1188, 524) 2 7915 * chances to recover. 7916 */ 7917 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 7918 (tp->t_rxtshift >= 6)) { 7919 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 7920 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 7921 tp->t_maxseg = tp->t_pmtud_saved_maxseg; 7922 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed); 7923 } 7924 } 7925 } 7926 /* 7927 * Disable RFC1323 and SACK if we haven't got any response to 7928 * our third SYN to work-around some broken terminal servers 7929 * (most of which have hopefully been retired) that have bad VJ 7930 * header compression code which trashes TCP segments containing 7931 * unknown-to-them TCP options. 7932 */ 7933 if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && 7934 (tp->t_rxtshift == 3)) 7935 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); 7936 /* 7937 * If we backed off this far, our srtt estimate is probably bogus. 7938 * Clobber it so we'll take the next rtt measurement as our srtt; 7939 * move the current srtt into rttvar to keep the current retransmit 7940 * times until then. 7941 */ 7942 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 7943 #ifdef INET6 7944 if ((inp->inp_vflag & INP_IPV6) != 0) 7945 in6_losing(inp); 7946 else 7947 #endif 7948 in_losing(inp); 7949 tp->t_rttvar += tp->t_srtt; 7950 tp->t_srtt = 0; 7951 } 7952 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 7953 tp->snd_recover = tp->snd_max; 7954 tp->t_flags |= TF_ACKNOW; 7955 tp->t_rtttime = 0; 7956 rack_cong_signal(tp, CC_RTO, tp->snd_una, __LINE__); 7957 out: 7958 return (retval); 7959 } 7960 7961 static int 7962 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling, uint8_t *doing_tlp) 7963 { 7964 int32_t ret = 0; 7965 int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK); 7966 7967 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 7968 (tp->t_flags & TF_GPUTINPROG)) { 7969 /* 7970 * We have a goodput in progress 7971 * and we have entered a late state. 7972 * Do we have enough data in the sb 7973 * to handle the GPUT request? 7974 */ 7975 uint32_t bytes; 7976 7977 bytes = tp->gput_ack - tp->gput_seq; 7978 if (SEQ_GT(tp->gput_seq, tp->snd_una)) 7979 bytes += tp->gput_seq - tp->snd_una; 7980 if (bytes > sbavail(&tptosocket(tp)->so_snd)) { 7981 /* 7982 * There are not enough bytes in the socket 7983 * buffer that have been sent to cover this 7984 * measurement. Cancel it. 
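 * ("bytes" above is gput_ack - gput_seq, plus gput_seq - snd_una when
 * the start of the measurement window is still ahead of snd_una; if
 * the socket buffer does not hold enough data to ever cover that
 * window, the measurement can never complete, so it is cancelled.)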
7985 */ 7986 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 7987 rack->r_ctl.rc_gp_srtt /*flex1*/, 7988 tp->gput_seq, 7989 0, 0, 18, __LINE__, NULL, 0); 7990 tp->t_flags &= ~TF_GPUTINPROG; 7991 } 7992 } 7993 if (timers == 0) { 7994 return (0); 7995 } 7996 if (tp->t_state == TCPS_LISTEN) { 7997 /* no timers on listen sockets */ 7998 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) 7999 return (0); 8000 return (1); 8001 } 8002 if ((timers & PACE_TMR_RACK) && 8003 rack->rc_on_min_to) { 8004 /* 8005 * For the rack timer when we 8006 * are on a min-timeout (which means rrr_conf = 3) 8007 * we don't want to check the timer. It may 8008 * be going off for a pace and thats ok we 8009 * want to send the retransmit (if its ready). 8010 * 8011 * If its on a normal rack timer (non-min) then 8012 * we will check if its expired. 8013 */ 8014 goto skip_time_check; 8015 } 8016 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 8017 uint32_t left; 8018 8019 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 8020 ret = -1; 8021 rack_log_to_processing(rack, cts, ret, 0); 8022 return (0); 8023 } 8024 if (hpts_calling == 0) { 8025 /* 8026 * A user send or queued mbuf (sack) has called us? We 8027 * return 0 and let the pacing guards 8028 * deal with it if they should or 8029 * should not cause a send. 8030 */ 8031 ret = -2; 8032 rack_log_to_processing(rack, cts, ret, 0); 8033 return (0); 8034 } 8035 /* 8036 * Ok our timer went off early and we are not paced false 8037 * alarm, go back to sleep. We make sure we don't have 8038 * no-sack wakeup on since we no longer have a PKT_OUTPUT 8039 * flag in place. 8040 */ 8041 rack->rc_tp->t_flags2 &= ~TF2_DONT_SACK_QUEUE; 8042 ret = -3; 8043 left = rack->r_ctl.rc_timer_exp - cts; 8044 tcp_hpts_insert(tp, HPTS_MS_TO_SLOTS(left)); 8045 rack_log_to_processing(rack, cts, ret, left); 8046 return (1); 8047 } 8048 skip_time_check: 8049 rack->rc_tmr_stopped = 0; 8050 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; 8051 if (timers & PACE_TMR_DELACK) { 8052 ret = rack_timeout_delack(tp, rack, cts); 8053 } else if (timers & PACE_TMR_RACK) { 8054 rack->r_ctl.rc_tlp_rxt_last_time = cts; 8055 rack->r_fast_output = 0; 8056 ret = rack_timeout_rack(tp, rack, cts); 8057 } else if (timers & PACE_TMR_TLP) { 8058 rack->r_ctl.rc_tlp_rxt_last_time = cts; 8059 ret = rack_timeout_tlp(tp, rack, cts, doing_tlp); 8060 } else if (timers & PACE_TMR_RXT) { 8061 rack->r_ctl.rc_tlp_rxt_last_time = cts; 8062 rack->r_fast_output = 0; 8063 ret = rack_timeout_rxt(tp, rack, cts); 8064 } else if (timers & PACE_TMR_PERSIT) { 8065 ret = rack_timeout_persist(tp, rack, cts); 8066 } else if (timers & PACE_TMR_KEEP) { 8067 ret = rack_timeout_keepalive(tp, rack, cts); 8068 } 8069 rack_log_to_processing(rack, cts, ret, timers); 8070 return (ret); 8071 } 8072 8073 static void 8074 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line) 8075 { 8076 struct timeval tv; 8077 uint32_t us_cts, flags_on_entry; 8078 uint8_t hpts_removed = 0; 8079 8080 flags_on_entry = rack->r_ctl.rc_hpts_flags; 8081 us_cts = tcp_get_usecs(&tv); 8082 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 8083 ((TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) || 8084 ((tp->snd_max - tp->snd_una) == 0))) { 8085 tcp_hpts_remove(rack->rc_tp); 8086 hpts_removed = 1; 8087 /* If we were not delayed cancel out the flag. 
*/ 8088 if ((tp->snd_max - tp->snd_una) == 0) 8089 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 8090 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry); 8091 } 8092 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 8093 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 8094 if (tcp_in_hpts(rack->rc_tp) && 8095 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) { 8096 /* 8097 * Canceling timer's when we have no output being 8098 * paced. We also must remove ourselves from the 8099 * hpts. 8100 */ 8101 tcp_hpts_remove(rack->rc_tp); 8102 hpts_removed = 1; 8103 } 8104 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK); 8105 } 8106 if (hpts_removed == 0) 8107 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry); 8108 } 8109 8110 static int 8111 rack_stopall(struct tcpcb *tp) 8112 { 8113 struct tcp_rack *rack; 8114 rack = (struct tcp_rack *)tp->t_fb_ptr; 8115 rack->t_timers_stopped = 1; 8116 return (0); 8117 } 8118 8119 static void 8120 rack_stop_all_timers(struct tcpcb *tp, struct tcp_rack *rack) 8121 { 8122 /* 8123 * Assure no timers are running. 8124 */ 8125 if (tcp_timer_active(tp, TT_PERSIST)) { 8126 /* We enter in persists, set the flag appropriately */ 8127 rack->rc_in_persist = 1; 8128 } 8129 if (tcp_in_hpts(rack->rc_tp)) { 8130 tcp_hpts_remove(rack->rc_tp); 8131 } 8132 } 8133 8134 static void 8135 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, 8136 struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag, int segsiz) 8137 { 8138 int32_t idx; 8139 8140 rsm->r_rtr_cnt++; 8141 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 8142 rsm->r_dupack = 0; 8143 if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) { 8144 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS; 8145 rsm->r_flags |= RACK_OVERMAX; 8146 } 8147 if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & RACK_TLP) == 0)) { 8148 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start); 8149 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start); 8150 } 8151 idx = rsm->r_rtr_cnt - 1; 8152 rsm->r_tim_lastsent[idx] = ts; 8153 /* 8154 * Here we don't add in the len of send, since its already 8155 * in snduna <->snd_max. 8156 */ 8157 rsm->r_fas = ctf_flight_size(rack->rc_tp, 8158 rack->r_ctl.rc_sacked); 8159 if (rsm->r_flags & RACK_ACKED) { 8160 /* Problably MTU discovery messing with us */ 8161 rsm->r_flags &= ~RACK_ACKED; 8162 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 8163 } 8164 if (rsm->r_in_tmap) { 8165 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 8166 rsm->r_in_tmap = 0; 8167 } 8168 /* Lets make sure it really is in or not the GP window */ 8169 rack_mark_in_gp_win(tp, rsm); 8170 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 8171 rsm->r_in_tmap = 1; 8172 rsm->r_bas = (uint8_t)(((rsm->r_end - rsm->r_start) + segsiz - 1) / segsiz); 8173 /* Take off the must retransmit flag, if its on */ 8174 if (rsm->r_flags & RACK_MUST_RXT) { 8175 if (rack->r_must_retran) 8176 rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start); 8177 if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) { 8178 /* 8179 * We have retransmitted all we need. Clear 8180 * any must retransmit flags. 
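 *
 * Two small pieces of bookkeeping in rack_update_rsm() above are worth
 * restating: r_bas is a ceiling division of the block length by the
 * segment size (how many segments the burst represented when it was
 * sent), and retransmitted bytes are accumulated per block.  The sketch
 * below shows only the arithmetic; segs_in_block() is an illustrative
 * name and not a helper that exists in this file.
 */
#if 0	/* illustrative sketch only; not compiled with this file */
#include <stdint.h>

static uint8_t
segs_in_block(uint32_t r_start, uint32_t r_end, uint32_t segsiz)
{
	uint32_t len = r_end - r_start;

	/*
	 * Round up: e.g. len = 2900, segsiz = 1448 gives
	 * (2900 + 1447) / 1448 = 3 segments.
	 */
	return ((uint8_t)((len + segsiz - 1) / segsiz));
}
#endif
/*
 * Back to clearing the must-retransmit state: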
8181 */ 8182 rack->r_must_retran = 0; 8183 rack->r_ctl.rc_out_at_rto = 0; 8184 } 8185 rsm->r_flags &= ~RACK_MUST_RXT; 8186 } 8187 /* Remove any collapsed flag */ 8188 rsm->r_flags &= ~RACK_RWND_COLLAPSED; 8189 if (rsm->r_flags & RACK_SACK_PASSED) { 8190 /* We have retransmitted due to the SACK pass */ 8191 rsm->r_flags &= ~RACK_SACK_PASSED; 8192 rsm->r_flags |= RACK_WAS_SACKPASS; 8193 } 8194 } 8195 8196 static uint32_t 8197 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 8198 struct rack_sendmap *rsm, uint64_t ts, int32_t *lenp, uint16_t add_flag, int segsiz) 8199 { 8200 /* 8201 * We (re-)transmitted starting at rsm->r_start for some length 8202 * (possibly less than r_end. 8203 */ 8204 struct rack_sendmap *nrsm; 8205 int insret __diagused; 8206 uint32_t c_end; 8207 int32_t len; 8208 8209 len = *lenp; 8210 c_end = rsm->r_start + len; 8211 if (SEQ_GEQ(c_end, rsm->r_end)) { 8212 /* 8213 * We retransmitted the whole piece or more than the whole 8214 * slopping into the next rsm. 8215 */ 8216 rack_update_rsm(tp, rack, rsm, ts, add_flag, segsiz); 8217 if (c_end == rsm->r_end) { 8218 *lenp = 0; 8219 return (0); 8220 } else { 8221 int32_t act_len; 8222 8223 /* Hangs over the end return whats left */ 8224 act_len = rsm->r_end - rsm->r_start; 8225 *lenp = (len - act_len); 8226 return (rsm->r_end); 8227 } 8228 /* We don't get out of this block. */ 8229 } 8230 /* 8231 * Here we retransmitted less than the whole thing which means we 8232 * have to split this into what was transmitted and what was not. 8233 */ 8234 nrsm = rack_alloc_full_limit(rack); 8235 if (nrsm == NULL) { 8236 /* 8237 * We can't get memory, so lets not proceed. 8238 */ 8239 *lenp = 0; 8240 return (0); 8241 } 8242 /* 8243 * So here we are going to take the original rsm and make it what we 8244 * retransmitted. nrsm will be the tail portion we did not 8245 * retransmit. For example say the chunk was 1, 11 (10 bytes). And 8246 * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to 8247 * 1, 6 and the new piece will be 6, 11. 8248 */ 8249 rack_clone_rsm(rack, nrsm, rsm, c_end); 8250 nrsm->r_dupack = 0; 8251 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); 8252 #ifndef INVARIANTS 8253 (void)tqhash_insert(rack->r_ctl.tqh, nrsm); 8254 #else 8255 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { 8256 panic("Insert in rb tree of %p fails ret:%d rack:%p rsm:%p", 8257 nrsm, insret, rack, rsm); 8258 } 8259 #endif 8260 if (rsm->r_in_tmap) { 8261 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 8262 nrsm->r_in_tmap = 1; 8263 } 8264 rsm->r_flags &= (~RACK_HAS_FIN); 8265 rack_update_rsm(tp, rack, rsm, ts, add_flag, segsiz); 8266 /* Log a split of rsm into rsm and nrsm */ 8267 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); 8268 *lenp = 0; 8269 return (0); 8270 } 8271 8272 static void 8273 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 8274 uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t cts, 8275 struct rack_sendmap *hintrsm, uint16_t add_flag, struct mbuf *s_mb, 8276 uint32_t s_moff, int hw_tls, int segsiz) 8277 { 8278 struct tcp_rack *rack; 8279 struct rack_sendmap *rsm, *nrsm; 8280 int insret __diagused; 8281 8282 register uint32_t snd_max, snd_una; 8283 8284 /* 8285 * Add to the RACK log of packets in flight or retransmitted. If 8286 * there is a TS option we will use the TS echoed, if not we will 8287 * grab a TS. 8288 * 8289 * Retransmissions will increment the count and move the ts to its 8290 * proper place. 
Note that if options do not include TS's then we 8291 * won't be able to effectively use the ACK for an RTT on a retran. 8292 * 8293 * Notes about r_start and r_end. Lets consider a send starting at 8294 * sequence 1 for 10 bytes. In such an example the r_start would be 8295 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. 8296 * This means that r_end is actually the first sequence for the next 8297 * slot (11). 8298 * 8299 */ 8300 /* 8301 * If err is set what do we do XXXrrs? should we not add the thing? 8302 * -- i.e. return if err != 0 or should we pretend we sent it? -- 8303 * i.e. proceed with add ** do this for now. 8304 */ 8305 INP_WLOCK_ASSERT(tptoinpcb(tp)); 8306 if (err) 8307 /* 8308 * We don't log errors -- we could but snd_max does not 8309 * advance in this case either. 8310 */ 8311 return; 8312 8313 if (th_flags & TH_RST) { 8314 /* 8315 * We don't log resets and we return immediately from 8316 * sending 8317 */ 8318 return; 8319 } 8320 rack = (struct tcp_rack *)tp->t_fb_ptr; 8321 snd_una = tp->snd_una; 8322 snd_max = tp->snd_max; 8323 if (th_flags & (TH_SYN | TH_FIN)) { 8324 /* 8325 * The call to rack_log_output is made before bumping 8326 * snd_max. This means we can record one extra byte on a SYN 8327 * or FIN if seq_out is adding more on and a FIN is present 8328 * (and we are not resending). 8329 */ 8330 if ((th_flags & TH_SYN) && (seq_out == tp->iss)) 8331 len++; 8332 if (th_flags & TH_FIN) 8333 len++; 8334 if (SEQ_LT(snd_max, tp->snd_nxt)) { 8335 /* 8336 * The add/update as not been done for the FIN/SYN 8337 * yet. 8338 */ 8339 snd_max = tp->snd_nxt; 8340 } 8341 } 8342 if (SEQ_LEQ((seq_out + len), snd_una)) { 8343 /* Are sending an old segment to induce an ack (keep-alive)? */ 8344 return; 8345 } 8346 if (SEQ_LT(seq_out, snd_una)) { 8347 /* huh? should we panic? */ 8348 uint32_t end; 8349 8350 end = seq_out + len; 8351 seq_out = snd_una; 8352 if (SEQ_GEQ(end, seq_out)) 8353 len = end - seq_out; 8354 else 8355 len = 0; 8356 } 8357 if (len == 0) { 8358 /* We don't log zero window probes */ 8359 return; 8360 } 8361 if (IN_FASTRECOVERY(tp->t_flags)) { 8362 rack->r_ctl.rc_prr_out += len; 8363 } 8364 /* First question is it a retransmission or new? */ 8365 if (seq_out == snd_max) { 8366 /* Its new */ 8367 rack_chk_req_and_hybrid_on_out(rack, seq_out, len, cts); 8368 again: 8369 rsm = rack_alloc(rack); 8370 if (rsm == NULL) { 8371 /* 8372 * Hmm out of memory and the tcb got destroyed while 8373 * we tried to wait. 8374 */ 8375 return; 8376 } 8377 if (th_flags & TH_FIN) { 8378 rsm->r_flags = RACK_HAS_FIN|add_flag; 8379 } else { 8380 rsm->r_flags = add_flag; 8381 } 8382 if (hw_tls) 8383 rsm->r_hw_tls = 1; 8384 rsm->r_tim_lastsent[0] = cts; 8385 rsm->r_rtr_cnt = 1; 8386 rsm->r_rtr_bytes = 0; 8387 if (th_flags & TH_SYN) { 8388 /* The data space is one beyond snd_una */ 8389 rsm->r_flags |= RACK_HAS_SYN; 8390 } 8391 rsm->r_start = seq_out; 8392 rsm->r_end = rsm->r_start + len; 8393 rack_mark_in_gp_win(tp, rsm); 8394 rsm->r_dupack = 0; 8395 /* 8396 * save off the mbuf location that 8397 * sndmbuf_noadv returned (which is 8398 * where we started copying from).. 
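 *
 * The (s_mb, s_moff) pair recorded here can name a byte that actually
 * lives further down the mbuf chain if the socket buffer's send pointer
 * was stale, which is why the code below walks forward until the offset
 * falls inside the current mbuf.  A stand-alone sketch of that
 * normalization follows; struct mbuf_ex and normalize_off() are
 * illustrative stand-ins, not kernel structures.
 */
#if 0	/* illustrative sketch only; not compiled with this file */
#include <stdint.h>
#include <stddef.h>
#include <assert.h>

struct mbuf_ex {
	struct mbuf_ex	*m_next;	/* next buffer in the chain */
	uint32_t	 m_len;		/* bytes held by this buffer */
};

/* Advance (m, off) so that off indexes into *m, not past it. */
static struct mbuf_ex *
normalize_off(struct mbuf_ex *m, uint32_t *off)
{
	while (m != NULL && *off >= m->m_len) {
		*off -= m->m_len;
		m = m->m_next;
	}
	assert(m != NULL);	/* the offset must lie within the chain */
	return (m);
}
#endif
/*
 * Back to recording the send in the map: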
8399 */ 8400 rsm->m = s_mb; 8401 rsm->soff = s_moff; 8402 /* 8403 * Here we do add in the len of send, since its not yet 8404 * reflected in in snduna <->snd_max 8405 */ 8406 rsm->r_fas = (ctf_flight_size(rack->rc_tp, 8407 rack->r_ctl.rc_sacked) + 8408 (rsm->r_end - rsm->r_start)); 8409 /* rsm->m will be NULL if RACK_HAS_SYN or RACK_HAS_FIN is set */ 8410 if (rsm->m) { 8411 if (rsm->m->m_len <= rsm->soff) { 8412 /* 8413 * XXXrrs Question, will this happen? 8414 * 8415 * If sbsndptr is set at the correct place 8416 * then s_moff should always be somewhere 8417 * within rsm->m. But if the sbsndptr was 8418 * off then that won't be true. If it occurs 8419 * we need to walkout to the correct location. 8420 */ 8421 struct mbuf *lm; 8422 8423 lm = rsm->m; 8424 while (lm->m_len <= rsm->soff) { 8425 rsm->soff -= lm->m_len; 8426 lm = lm->m_next; 8427 KASSERT(lm != NULL, ("%s rack:%p lm goes null orig_off:%u origmb:%p rsm->soff:%u", 8428 __func__, rack, s_moff, s_mb, rsm->soff)); 8429 } 8430 rsm->m = lm; 8431 } 8432 rsm->orig_m_len = rsm->m->m_len; 8433 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 8434 } else { 8435 rsm->orig_m_len = 0; 8436 rsm->orig_t_space = 0; 8437 } 8438 rsm->r_bas = (uint8_t)((len + segsiz - 1) / segsiz); 8439 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 8440 /* Log a new rsm */ 8441 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_NEW, 0, __LINE__); 8442 #ifndef INVARIANTS 8443 (void)tqhash_insert(rack->r_ctl.tqh, rsm); 8444 #else 8445 if ((insret = tqhash_insert(rack->r_ctl.tqh, rsm)) != 0) { 8446 panic("Insert in rb tree of %p fails ret:%d rack:%p rsm:%p", 8447 nrsm, insret, rack, rsm); 8448 } 8449 #endif 8450 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 8451 rsm->r_in_tmap = 1; 8452 /* 8453 * Special case detection, is there just a single 8454 * packet outstanding when we are not in recovery? 8455 * 8456 * If this is true mark it so. 8457 */ 8458 if ((IN_FASTRECOVERY(tp->t_flags) == 0) && 8459 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) == ctf_fixed_maxseg(tp))) { 8460 struct rack_sendmap *prsm; 8461 8462 prsm = tqhash_prev(rack->r_ctl.tqh, rsm); 8463 if (prsm) 8464 prsm->r_one_out_nr = 1; 8465 } 8466 return; 8467 } 8468 /* 8469 * If we reach here its a retransmission and we need to find it. 8470 */ 8471 more: 8472 if (hintrsm && (hintrsm->r_start == seq_out)) { 8473 rsm = hintrsm; 8474 hintrsm = NULL; 8475 } else { 8476 /* No hints sorry */ 8477 rsm = NULL; 8478 } 8479 if ((rsm) && (rsm->r_start == seq_out)) { 8480 seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag, segsiz); 8481 if (len == 0) { 8482 return; 8483 } else { 8484 goto more; 8485 } 8486 } 8487 /* Ok it was not the last pointer go through it the hard way. */ 8488 refind: 8489 rsm = tqhash_find(rack->r_ctl.tqh, seq_out); 8490 if (rsm) { 8491 if (rsm->r_start == seq_out) { 8492 seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag, segsiz); 8493 if (len == 0) { 8494 return; 8495 } else { 8496 goto refind; 8497 } 8498 } 8499 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { 8500 /* Transmitted within this piece */ 8501 /* 8502 * Ok we must split off the front and then let the 8503 * update do the rest 8504 */ 8505 nrsm = rack_alloc_full_limit(rack); 8506 if (nrsm == NULL) { 8507 rack_update_rsm(tp, rack, rsm, cts, add_flag, segsiz); 8508 return; 8509 } 8510 /* 8511 * copy rsm to nrsm and then trim the front of rsm 8512 * to not include this part. 
8513 */ 8514 rack_clone_rsm(rack, nrsm, rsm, seq_out); 8515 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); 8516 #ifndef INVARIANTS 8517 (void)tqhash_insert(rack->r_ctl.tqh, nrsm); 8518 #else 8519 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { 8520 panic("Insert in rb tree of %p fails ret:%d rack:%p rsm:%p", 8521 nrsm, insret, rack, rsm); 8522 } 8523 #endif 8524 if (rsm->r_in_tmap) { 8525 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 8526 nrsm->r_in_tmap = 1; 8527 } 8528 rsm->r_flags &= (~RACK_HAS_FIN); 8529 seq_out = rack_update_entry(tp, rack, nrsm, cts, &len, add_flag, segsiz); 8530 if (len == 0) { 8531 return; 8532 } else if (len > 0) 8533 goto refind; 8534 } 8535 } 8536 /* 8537 * Hmm not found in map did they retransmit both old and on into the 8538 * new? 8539 */ 8540 if (seq_out == tp->snd_max) { 8541 goto again; 8542 } else if (SEQ_LT(seq_out, tp->snd_max)) { 8543 #ifdef INVARIANTS 8544 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", 8545 seq_out, len, tp->snd_una, tp->snd_max); 8546 printf("Starting Dump of all rack entries\n"); 8547 TQHASH_FOREACH(rsm, rack->r_ctl.tqh) { 8548 printf("rsm:%p start:%u end:%u\n", 8549 rsm, rsm->r_start, rsm->r_end); 8550 } 8551 printf("Dump complete\n"); 8552 panic("seq_out not found rack:%p tp:%p", 8553 rack, tp); 8554 #endif 8555 } else { 8556 #ifdef INVARIANTS 8557 /* 8558 * Hmm beyond sndmax? (only if we are using the new rtt-pack 8559 * flag) 8560 */ 8561 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", 8562 seq_out, len, tp->snd_max, tp); 8563 #endif 8564 } 8565 } 8566 8567 /* 8568 * Record one of the RTT updates from an ack into 8569 * our sample structure. 8570 */ 8571 8572 static void 8573 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, uint32_t len, uint32_t us_rtt, 8574 int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt) 8575 { 8576 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 8577 (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) { 8578 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt; 8579 } 8580 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 8581 (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) { 8582 rack->r_ctl.rack_rs.rs_rtt_highest = rtt; 8583 } 8584 if (rack->rc_tp->t_flags & TF_GPUTINPROG) { 8585 if (us_rtt < rack->r_ctl.rc_gp_lowrtt) 8586 rack->r_ctl.rc_gp_lowrtt = us_rtt; 8587 if (rack->rc_tp->snd_wnd > rack->r_ctl.rc_gp_high_rwnd) 8588 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 8589 } 8590 if ((confidence == 1) && 8591 ((rsm == NULL) || 8592 (rsm->r_just_ret) || 8593 (rsm->r_one_out_nr && 8594 len < (ctf_fixed_maxseg(rack->rc_tp) * 2)))) { 8595 /* 8596 * If the rsm had a just return 8597 * hit it then we can't trust the 8598 * rtt measurement for buffer deterimination 8599 * Note that a confidence of 2, indicates 8600 * SACK'd which overrides the r_just_ret or 8601 * the r_one_out_nr. If it was a CUM-ACK and 8602 * we had only two outstanding, but get an 8603 * ack for only 1. Then that also lowers our 8604 * confidence. 8605 */ 8606 confidence = 0; 8607 } 8608 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 8609 (rack->r_ctl.rack_rs.rs_us_rtt > us_rtt)) { 8610 if (rack->r_ctl.rack_rs.confidence == 0) { 8611 /* 8612 * We take anything with no current confidence 8613 * saved. 
8614 */ 8615 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; 8616 rack->r_ctl.rack_rs.confidence = confidence; 8617 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; 8618 } else if (confidence != 0) { 8619 /* 8620 * Once we have a confident number, 8621 * we can update it with a smaller 8622 * value since this confident number 8623 * may include the DSACK time until 8624 * the next segment (the second one) arrived. 8625 */ 8626 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; 8627 rack->r_ctl.rack_rs.confidence = confidence; 8628 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; 8629 } 8630 } 8631 rack_log_rtt_upd(rack->rc_tp, rack, us_rtt, len, rsm, confidence); 8632 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID; 8633 rack->r_ctl.rack_rs.rs_rtt_tot += rtt; 8634 rack->r_ctl.rack_rs.rs_rtt_cnt++; 8635 } 8636 8637 /* 8638 * Collect new round-trip time estimate 8639 * and update averages and current timeout. 8640 */ 8641 static void 8642 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) 8643 { 8644 int32_t delta; 8645 int32_t rtt; 8646 8647 if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) 8648 /* No valid sample */ 8649 return; 8650 if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) { 8651 /* We are to use the lowest RTT seen in a single ack */ 8652 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; 8653 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) { 8654 /* We are to use the highest RTT seen in a single ack */ 8655 rtt = rack->r_ctl.rack_rs.rs_rtt_highest; 8656 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) { 8657 /* We are to use the average RTT seen in a single ack */ 8658 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot / 8659 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt); 8660 } else { 8661 #ifdef INVARIANTS 8662 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method); 8663 #endif 8664 return; 8665 } 8666 if (rtt == 0) 8667 rtt = 1; 8668 if (rack->rc_gp_rtt_set == 0) { 8669 /* 8670 * With no RTT we have to accept 8671 * even one we are not confident of. 8672 */ 8673 rack->r_ctl.rc_gp_srtt = rack->r_ctl.rack_rs.rs_us_rtt; 8674 rack->rc_gp_rtt_set = 1; 8675 } else if (rack->r_ctl.rack_rs.confidence) { 8676 /* update the running gp srtt */ 8677 rack->r_ctl.rc_gp_srtt -= (rack->r_ctl.rc_gp_srtt/8); 8678 rack->r_ctl.rc_gp_srtt += rack->r_ctl.rack_rs.rs_us_rtt / 8; 8679 } 8680 if (rack->r_ctl.rack_rs.confidence) { 8681 /* 8682 * record the low and high for highly buffered path computation, 8683 * we only do this if we are confident (not a retransmission). 8684 */ 8685 if (rack->r_ctl.rc_highest_us_rtt < rack->r_ctl.rack_rs.rs_us_rtt) { 8686 rack->r_ctl.rc_highest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 8687 } 8688 if (rack->rc_highly_buffered == 0) { 8689 /* 8690 * Currently once we declare a path has 8691 * highly buffered there is no going 8692 * back, which may be a problem... 8693 */ 8694 if ((rack->r_ctl.rc_highest_us_rtt / rack->r_ctl.rc_lowest_us_rtt) > rack_hbp_thresh) { 8695 rack_log_rtt_shrinks(rack, rack->r_ctl.rack_rs.rs_us_rtt, 8696 rack->r_ctl.rc_highest_us_rtt, 8697 rack->r_ctl.rc_lowest_us_rtt, 8698 RACK_RTTS_SEEHBP); 8699 rack->rc_highly_buffered = 1; 8700 } 8701 } 8702 } 8703 if ((rack->r_ctl.rack_rs.confidence) || 8704 (rack->r_ctl.rack_rs.rs_us_rtrcnt == 1)) { 8705 /* 8706 * If we are highly confident of it <or> it was 8707 * never retransmitted we accept it as the last us_rtt. 
8708 */ 8709 rack->r_ctl.rc_last_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 8710 /* The lowest rtt can be set if its was not retransmited */ 8711 if (rack->r_ctl.rc_lowest_us_rtt > rack->r_ctl.rack_rs.rs_us_rtt) { 8712 rack->r_ctl.rc_lowest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 8713 if (rack->r_ctl.rc_lowest_us_rtt == 0) 8714 rack->r_ctl.rc_lowest_us_rtt = 1; 8715 } 8716 } 8717 rack = (struct tcp_rack *)tp->t_fb_ptr; 8718 if (tp->t_srtt != 0) { 8719 /* 8720 * We keep a simple srtt in microseconds, like our rtt 8721 * measurement. We don't need to do any tricks with shifting 8722 * etc. Instead we just add in 1/8th of the new measurement 8723 * and subtract out 1/8 of the old srtt. We do the same with 8724 * the variance after finding the absolute value of the 8725 * difference between this sample and the current srtt. 8726 */ 8727 delta = tp->t_srtt - rtt; 8728 /* Take off 1/8th of the current sRTT */ 8729 tp->t_srtt -= (tp->t_srtt >> 3); 8730 /* Add in 1/8th of the new RTT just measured */ 8731 tp->t_srtt += (rtt >> 3); 8732 if (tp->t_srtt <= 0) 8733 tp->t_srtt = 1; 8734 /* Now lets make the absolute value of the variance */ 8735 if (delta < 0) 8736 delta = -delta; 8737 /* Subtract out 1/8th */ 8738 tp->t_rttvar -= (tp->t_rttvar >> 3); 8739 /* Add in 1/8th of the new variance we just saw */ 8740 tp->t_rttvar += (delta >> 3); 8741 if (tp->t_rttvar <= 0) 8742 tp->t_rttvar = 1; 8743 } else { 8744 /* 8745 * No rtt measurement yet - use the unsmoothed rtt. Set the 8746 * variance to half the rtt (so our first retransmit happens 8747 * at 3*rtt). 8748 */ 8749 tp->t_srtt = rtt; 8750 tp->t_rttvar = rtt >> 1; 8751 } 8752 rack->rc_srtt_measure_made = 1; 8753 KMOD_TCPSTAT_INC(tcps_rttupdated); 8754 if (tp->t_rttupdated < UCHAR_MAX) 8755 tp->t_rttupdated++; 8756 #ifdef STATS 8757 if (rack_stats_gets_ms_rtt == 0) { 8758 /* Send in the microsecond rtt used for rxt timeout purposes */ 8759 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt)); 8760 } else if (rack_stats_gets_ms_rtt == 1) { 8761 /* Send in the millisecond rtt used for rxt timeout purposes */ 8762 int32_t ms_rtt; 8763 8764 /* Round up */ 8765 ms_rtt = (rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC; 8766 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt)); 8767 } else if (rack_stats_gets_ms_rtt == 2) { 8768 /* Send in the millisecond rtt has close to the path RTT as we can get */ 8769 int32_t ms_rtt; 8770 8771 /* Round up */ 8772 ms_rtt = (rack->r_ctl.rack_rs.rs_us_rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC; 8773 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt)); 8774 } else { 8775 /* Send in the microsecond rtt has close to the path RTT as we can get */ 8776 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt)); 8777 } 8778 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_PATHRTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt)); 8779 #endif 8780 /* 8781 * the retransmit should happen at rtt + 4 * rttvar. Because of the 8782 * way we do the smoothing, srtt and rttvar will each average +1/2 8783 * tick of bias. When we compute the retransmit timer, we want 1/2 8784 * tick of rounding and 1 extra tick because of +-1/2 tick 8785 * uncertainty in the firing of the timer. The bias will give us 8786 * exactly the 1.5 tick we need. But, because the bias is 8787 * statistical, we have to test that we don't drop below the minimum 8788 * feasible timer (which is 2 ticks). 
8789 */ 8790 tp->t_rxtshift = 0; 8791 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 8792 max(rack_rto_min, rtt + 2), rack_rto_max, rack->r_ctl.timer_slop); 8793 rack_log_rtt_sample(rack, rtt); 8794 tp->t_softerror = 0; 8795 } 8796 8797 8798 static void 8799 rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts) 8800 { 8801 /* 8802 * Apply to filter the inbound us-rtt at us_cts. 8803 */ 8804 uint32_t old_rtt; 8805 8806 old_rtt = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 8807 apply_filter_min_small(&rack->r_ctl.rc_gp_min_rtt, 8808 us_rtt, us_cts); 8809 if (old_rtt > us_rtt) { 8810 /* We just hit a new lower rtt time */ 8811 rack_log_rtt_shrinks(rack, us_cts, old_rtt, 8812 __LINE__, RACK_RTTS_NEWRTT); 8813 /* 8814 * Only count it if its lower than what we saw within our 8815 * calculated range. 8816 */ 8817 if ((old_rtt - us_rtt) > rack_min_rtt_movement) { 8818 if (rack_probertt_lower_within && 8819 rack->rc_gp_dyn_mul && 8820 (rack->use_fixed_rate == 0) && 8821 (rack->rc_always_pace)) { 8822 /* 8823 * We are seeing a new lower rtt very close 8824 * to the time that we would have entered probe-rtt. 8825 * This is probably due to the fact that a peer flow 8826 * has entered probe-rtt. Lets go in now too. 8827 */ 8828 uint32_t val; 8829 8830 val = rack_probertt_lower_within * rack_time_between_probertt; 8831 val /= 100; 8832 if ((rack->in_probe_rtt == 0) && 8833 ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) { 8834 rack_enter_probertt(rack, us_cts); 8835 } 8836 } 8837 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 8838 } 8839 } 8840 } 8841 8842 static int 8843 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 8844 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack) 8845 { 8846 uint32_t us_rtt; 8847 int32_t i, all; 8848 uint32_t t, len_acked; 8849 8850 if ((rsm->r_flags & RACK_ACKED) || 8851 (rsm->r_flags & RACK_WAS_ACKED)) 8852 /* Already done */ 8853 return (0); 8854 if (rsm->r_no_rtt_allowed) { 8855 /* Not allowed */ 8856 return (0); 8857 } 8858 if (ack_type == CUM_ACKED) { 8859 if (SEQ_GT(th_ack, rsm->r_end)) { 8860 len_acked = rsm->r_end - rsm->r_start; 8861 all = 1; 8862 } else { 8863 len_acked = th_ack - rsm->r_start; 8864 all = 0; 8865 } 8866 } else { 8867 len_acked = rsm->r_end - rsm->r_start; 8868 all = 0; 8869 } 8870 if (rsm->r_rtr_cnt == 1) { 8871 8872 t = cts - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 8873 if ((int)t <= 0) 8874 t = 1; 8875 if (!tp->t_rttlow || tp->t_rttlow > t) 8876 tp->t_rttlow = t; 8877 if (!rack->r_ctl.rc_rack_min_rtt || 8878 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 8879 rack->r_ctl.rc_rack_min_rtt = t; 8880 if (rack->r_ctl.rc_rack_min_rtt == 0) { 8881 rack->r_ctl.rc_rack_min_rtt = 1; 8882 } 8883 } 8884 if (TSTMP_GT(tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])) 8885 us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 8886 else 8887 us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 8888 if (us_rtt == 0) 8889 us_rtt = 1; 8890 if (CC_ALGO(tp)->rttsample != NULL) { 8891 /* Kick the RTT to the CC */ 8892 CC_ALGO(tp)->rttsample(&tp->t_ccv, us_rtt, 1, rsm->r_fas); 8893 } 8894 rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time)); 8895 if (ack_type == SACKED) { 8896 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 1); 8897 
tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 2 , rsm, rsm->r_rtr_cnt); 8898 } else { 8899 /* 8900 * We need to setup what our confidence 8901 * is in this ack. 8902 * 8903 * If the rsm was app limited and it is 8904 * less than a mss in length (the end 8905 * of the send) then we have a gap. If we 8906 * were app limited but say we were sending 8907 * multiple MSS's then we are more confident 8908 * int it. 8909 * 8910 * When we are not app-limited then we see if 8911 * the rsm is being included in the current 8912 * measurement, we tell this by the app_limited_needs_set 8913 * flag. 8914 * 8915 * Note that being cwnd blocked is not applimited 8916 * as well as the pacing delay between packets which 8917 * are sending only 1 or 2 MSS's also will show up 8918 * in the RTT. We probably need to examine this algorithm 8919 * a bit more and enhance it to account for the delay 8920 * between rsm's. We could do that by saving off the 8921 * pacing delay of each rsm (in an rsm) and then 8922 * factoring that in somehow though for now I am 8923 * not sure how :) 8924 */ 8925 int calc_conf = 0; 8926 8927 if (rsm->r_flags & RACK_APP_LIMITED) { 8928 if (all && (len_acked <= ctf_fixed_maxseg(tp))) 8929 calc_conf = 0; 8930 else 8931 calc_conf = 1; 8932 } else if (rack->app_limited_needs_set == 0) { 8933 calc_conf = 1; 8934 } else { 8935 calc_conf = 0; 8936 } 8937 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 2); 8938 tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 8939 calc_conf, rsm, rsm->r_rtr_cnt); 8940 } 8941 if ((rsm->r_flags & RACK_TLP) && 8942 (!IN_FASTRECOVERY(tp->t_flags))) { 8943 /* Segment was a TLP and our retrans matched */ 8944 if (rack->r_ctl.rc_tlp_cwnd_reduce) { 8945 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__); 8946 } 8947 } 8948 if ((rack->r_ctl.rc_rack_tmit_time == 0) || 8949 (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 8950 (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]))) { 8951 /* New more recent rack_tmit_time */ 8952 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 8953 if (rack->r_ctl.rc_rack_tmit_time == 0) 8954 rack->r_ctl.rc_rack_tmit_time = 1; 8955 rack->rc_rack_rtt = t; 8956 } 8957 return (1); 8958 } 8959 /* 8960 * We clear the soft/rxtshift since we got an ack. 8961 * There is no assurance we will call the commit() function 8962 * so we need to clear these to avoid incorrect handling. 8963 */ 8964 tp->t_rxtshift = 0; 8965 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 8966 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 8967 tp->t_softerror = 0; 8968 if (to && (to->to_flags & TOF_TS) && 8969 (ack_type == CUM_ACKED) && 8970 (to->to_tsecr) && 8971 ((rsm->r_flags & RACK_OVERMAX) == 0)) { 8972 /* 8973 * Now which timestamp does it match? In this block the ACK 8974 * must be coming from a previous transmission. 8975 */ 8976 for (i = 0; i < rsm->r_rtr_cnt; i++) { 8977 if (rack_ts_to_msec(rsm->r_tim_lastsent[i]) == to->to_tsecr) { 8978 t = cts - (uint32_t)rsm->r_tim_lastsent[i]; 8979 if ((int)t <= 0) 8980 t = 1; 8981 if (CC_ALGO(tp)->rttsample != NULL) { 8982 /* 8983 * Kick the RTT to the CC, here 8984 * we lie a bit in that we know the 8985 * retransmission is correct even though 8986 * we retransmitted. This is because 8987 * we match the timestamps. 
8988 */ 8989 if (TSTMP_GT(tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[i])) 8990 us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[i]; 8991 else 8992 us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[i]; 8993 CC_ALGO(tp)->rttsample(&tp->t_ccv, us_rtt, 1, rsm->r_fas); 8994 } 8995 if ((i + 1) < rsm->r_rtr_cnt) { 8996 /* 8997 * The peer ack'd from our previous 8998 * transmission. We have a spurious 8999 * retransmission and thus we dont 9000 * want to update our rack_rtt. 9001 * 9002 * Hmm should there be a CC revert here? 9003 * 9004 */ 9005 return (0); 9006 } 9007 if (!tp->t_rttlow || tp->t_rttlow > t) 9008 tp->t_rttlow = t; 9009 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 9010 rack->r_ctl.rc_rack_min_rtt = t; 9011 if (rack->r_ctl.rc_rack_min_rtt == 0) { 9012 rack->r_ctl.rc_rack_min_rtt = 1; 9013 } 9014 } 9015 if ((rack->r_ctl.rc_rack_tmit_time == 0) || 9016 (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 9017 (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]))) { 9018 /* New more recent rack_tmit_time */ 9019 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 9020 if (rack->r_ctl.rc_rack_tmit_time == 0) 9021 rack->r_ctl.rc_rack_tmit_time = 1; 9022 rack->rc_rack_rtt = t; 9023 } 9024 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[i], cts, 3); 9025 tcp_rack_xmit_timer(rack, t + 1, len_acked, t, 0, rsm, 9026 rsm->r_rtr_cnt); 9027 return (1); 9028 } 9029 } 9030 /* If we are logging log out the sendmap */ 9031 if (tcp_bblogging_on(rack->rc_tp)) { 9032 for (i = 0; i < rsm->r_rtr_cnt; i++) { 9033 rack_log_rtt_sendmap(rack, i, rsm->r_tim_lastsent[i], to->to_tsecr); 9034 } 9035 } 9036 goto ts_not_found; 9037 } else { 9038 /* 9039 * Ok its a SACK block that we retransmitted. or a windows 9040 * machine without timestamps. We can tell nothing from the 9041 * time-stamp since its not there or the time the peer last 9042 * recieved a segment that moved forward its cum-ack point. 9043 */ 9044 ts_not_found: 9045 i = rsm->r_rtr_cnt - 1; 9046 t = cts - (uint32_t)rsm->r_tim_lastsent[i]; 9047 if ((int)t <= 0) 9048 t = 1; 9049 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 9050 /* 9051 * We retransmitted and the ack came back in less 9052 * than the smallest rtt we have observed. We most 9053 * likely did an improper retransmit as outlined in 9054 * 6.2 Step 2 point 2 in the rack-draft so we 9055 * don't want to update our rack_rtt. We in 9056 * theory (in future) might want to think about reverting our 9057 * cwnd state but we won't for now. 9058 */ 9059 return (0); 9060 } else if (rack->r_ctl.rc_rack_min_rtt) { 9061 /* 9062 * We retransmitted it and the retransmit did the 9063 * job. 9064 */ 9065 if (!rack->r_ctl.rc_rack_min_rtt || 9066 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 9067 rack->r_ctl.rc_rack_min_rtt = t; 9068 if (rack->r_ctl.rc_rack_min_rtt == 0) { 9069 rack->r_ctl.rc_rack_min_rtt = 1; 9070 } 9071 } 9072 if ((rack->r_ctl.rc_rack_tmit_time == 0) || 9073 (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 9074 (uint32_t)rsm->r_tim_lastsent[i]))) { 9075 /* New more recent rack_tmit_time */ 9076 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[i]; 9077 if (rack->r_ctl.rc_rack_tmit_time == 0) 9078 rack->r_ctl.rc_rack_tmit_time = 1; 9079 rack->rc_rack_rtt = t; 9080 } 9081 return (1); 9082 } 9083 } 9084 return (0); 9085 } 9086 9087 /* 9088 * Mark the SACK_PASSED flag on all entries prior to rsm send wise. 
9089 */ 9090 static void 9091 rack_log_sack_passed(struct tcpcb *tp, 9092 struct tcp_rack *rack, struct rack_sendmap *rsm) 9093 { 9094 struct rack_sendmap *nrsm; 9095 9096 nrsm = rsm; 9097 TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap, 9098 rack_head, r_tnext) { 9099 if (nrsm == rsm) { 9100 /* Skip original segment he is acked */ 9101 continue; 9102 } 9103 if (nrsm->r_flags & RACK_ACKED) { 9104 /* 9105 * Skip ack'd segments, though we 9106 * should not see these, since tmap 9107 * should not have ack'd segments. 9108 */ 9109 continue; 9110 } 9111 if (nrsm->r_flags & RACK_RWND_COLLAPSED) { 9112 /* 9113 * If the peer dropped the rwnd on 9114 * these then we don't worry about them. 9115 */ 9116 continue; 9117 } 9118 if (nrsm->r_flags & RACK_SACK_PASSED) { 9119 /* 9120 * We found one that is already marked 9121 * passed, we have been here before and 9122 * so all others below this are marked. 9123 */ 9124 break; 9125 } 9126 nrsm->r_flags |= RACK_SACK_PASSED; 9127 nrsm->r_flags &= ~RACK_WAS_SACKPASS; 9128 } 9129 } 9130 9131 static void 9132 rack_need_set_test(struct tcpcb *tp, 9133 struct tcp_rack *rack, 9134 struct rack_sendmap *rsm, 9135 tcp_seq th_ack, 9136 int line, 9137 int use_which) 9138 { 9139 struct rack_sendmap *s_rsm; 9140 9141 if ((tp->t_flags & TF_GPUTINPROG) && 9142 SEQ_GEQ(rsm->r_end, tp->gput_seq)) { 9143 /* 9144 * We were app limited, and this ack 9145 * butts up or goes beyond the point where we want 9146 * to start our next measurement. We need 9147 * to record the new gput_ts as here and 9148 * possibly update the start sequence. 9149 */ 9150 uint32_t seq, ts; 9151 9152 if (rsm->r_rtr_cnt > 1) { 9153 /* 9154 * This is a retransmit, can we 9155 * really make any assessment at this 9156 * point? We are not really sure of 9157 * the timestamp, is it this or the 9158 * previous transmission? 9159 * 9160 * Lets wait for something better that 9161 * is not retransmitted. 9162 */ 9163 return; 9164 } 9165 seq = tp->gput_seq; 9166 ts = tp->gput_ts; 9167 rack->app_limited_needs_set = 0; 9168 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 9169 /* Do we start at a new end? */ 9170 if ((use_which == RACK_USE_BEG) && 9171 SEQ_GEQ(rsm->r_start, tp->gput_seq)) { 9172 /* 9173 * When we get an ACK that just eats 9174 * up some of the rsm, we set RACK_USE_BEG 9175 * since whats at r_start (i.e. th_ack) 9176 * is left unacked and thats where the 9177 * measurement now starts. 9178 */ 9179 tp->gput_seq = rsm->r_start; 9180 } 9181 if ((use_which == RACK_USE_END) && 9182 SEQ_GEQ(rsm->r_end, tp->gput_seq)) { 9183 /* 9184 * We use the end when the cumack 9185 * is moving forward and completely 9186 * deleting the rsm passed so basically 9187 * r_end holds th_ack. 9188 * 9189 * For SACK's we also want to use the end 9190 * since this piece just got sacked and 9191 * we want to target anything after that 9192 * in our measurement. 9193 */ 9194 tp->gput_seq = rsm->r_end; 9195 } 9196 if (use_which == RACK_USE_END_OR_THACK) { 9197 /* 9198 * special case for ack moving forward, 9199 * not a sack, we need to move all the 9200 * way up to where this ack cum-ack moves 9201 * to. 9202 */ 9203 if (SEQ_GT(th_ack, rsm->r_end)) 9204 tp->gput_seq = th_ack; 9205 else 9206 tp->gput_seq = rsm->r_end; 9207 } 9208 if (SEQ_LT(tp->gput_seq, tp->snd_max)) 9209 s_rsm = tqhash_find(rack->r_ctl.tqh, tp->gput_seq); 9210 else 9211 s_rsm = NULL; 9212 /* 9213 * Pick up the correct send time if we can the rsm passed in 9214 * may be equal to s_rsm if the RACK_USE_BEG was set. 
For the other 9215 * two cases (RACK_USE_THACK or RACK_USE_END) most likely we will 9216 * find a different seq i.e. the next send up. 9217 * 9218 * If that has not been sent, s_rsm will be NULL and we must 9219 * arrange it so this function will get called again by setting 9220 * app_limited_needs_set. 9221 */ 9222 if (s_rsm) 9223 rack->r_ctl.rc_gp_output_ts = s_rsm->r_tim_lastsent[0]; 9224 else { 9225 /* If we hit here we have to have *not* sent tp->gput_seq */ 9226 rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[0]; 9227 /* Set it up so we will go through here again */ 9228 rack->app_limited_needs_set = 1; 9229 } 9230 if (SEQ_GT(tp->gput_seq, tp->gput_ack)) { 9231 /* 9232 * We moved beyond this guy's range, re-calculate 9233 * the new end point. 9234 */ 9235 if (rack->rc_gp_filled == 0) { 9236 tp->gput_ack = tp->gput_seq + max(rc_init_window(rack), (MIN_GP_WIN * ctf_fixed_maxseg(tp))); 9237 } else { 9238 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 9239 } 9240 } 9241 /* 9242 * We are moving the goal post, we may be able to clear the 9243 * measure_saw_probe_rtt flag. 9244 */ 9245 if ((rack->in_probe_rtt == 0) && 9246 (rack->measure_saw_probe_rtt) && 9247 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 9248 rack->measure_saw_probe_rtt = 0; 9249 rack_log_pacing_delay_calc(rack, ts, tp->gput_ts, 9250 seq, tp->gput_seq, 9251 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | 9252 (uint64_t)rack->r_ctl.rc_gp_output_ts), 9253 5, line, NULL, 0); 9254 if (rack->rc_gp_filled && 9255 ((tp->gput_ack - tp->gput_seq) < 9256 max(rc_init_window(rack), (MIN_GP_WIN * 9257 ctf_fixed_maxseg(tp))))) { 9258 uint32_t ideal_amount; 9259 9260 ideal_amount = rack_get_measure_window(tp, rack); 9261 if (ideal_amount > sbavail(&tptosocket(tp)->so_snd)) { 9262 /* 9263 * There is no sense of continuing this measurement 9264 * because its too small to gain us anything we 9265 * trust. Skip it and that way we can start a new 9266 * measurement quicker. 9267 */ 9268 tp->t_flags &= ~TF_GPUTINPROG; 9269 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq, 9270 0, 0, 9271 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | 9272 (uint64_t)rack->r_ctl.rc_gp_output_ts), 9273 6, __LINE__, NULL, 0); 9274 } else { 9275 /* 9276 * Reset the window further out. 
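 *
 * The sizing rule in this block: a goodput measurement window is never
 * allowed to be smaller than max(initial window, MIN_GP_WIN segments),
 * and if growing it back to the ideal size would require more data
 * than the send buffer holds, the measurement is abandoned rather than
 * trusted.  Sketch of the decision (measure_window_ok() and the
 * parameter names are illustrative; the real floor comes from
 * rc_init_window() and rack_get_measure_window()):
 */
#if 0	/* illustrative sketch only; not compiled with this file */
#include <stdint.h>
#include <stdbool.h>

static bool
measure_window_ok(uint32_t win_bytes, uint32_t init_win_bytes,
    uint32_t min_gp_segs, uint32_t mss, uint32_t ideal_bytes,
    uint32_t sb_avail)
{
	uint32_t floor_bytes;

	floor_bytes = min_gp_segs * mss;
	if (init_win_bytes > floor_bytes)
		floor_bytes = init_win_bytes;
	if (win_bytes >= floor_bytes)
		return (true);		/* current window is big enough */
	/* Too small: only worth extending if the data exists to fill it. */
	return (ideal_bytes <= sb_avail);
}
#endif
/*
 * Back to extending the window: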
9277 */ 9278 tp->gput_ack = tp->gput_seq + ideal_amount; 9279 } 9280 } 9281 rack_tend_gp_marks(tp, rack); 9282 rack_log_gpset(rack, tp->gput_ack, 0, 0, line, 2, rsm); 9283 } 9284 } 9285 9286 static inline int 9287 is_rsm_inside_declared_tlp_block(struct tcp_rack *rack, struct rack_sendmap *rsm) 9288 { 9289 if (SEQ_LT(rsm->r_end, rack->r_ctl.last_tlp_acked_start)) { 9290 /* Behind our TLP definition or right at */ 9291 return (0); 9292 } 9293 if (SEQ_GT(rsm->r_start, rack->r_ctl.last_tlp_acked_end)) { 9294 /* The start is beyond or right at our end of TLP definition */ 9295 return (0); 9296 } 9297 /* It has to be a sub-part of the original TLP recorded */ 9298 return (1); 9299 } 9300 9301 9302 9303 static uint32_t 9304 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, 9305 struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, 9306 int *no_extra, 9307 int *moved_two, uint32_t segsiz) 9308 { 9309 uint32_t start, end, changed = 0; 9310 struct rack_sendmap stack_map; 9311 struct rack_sendmap *rsm, *nrsm, *prev, *next; 9312 int insret __diagused; 9313 int32_t used_ref = 1; 9314 int moved = 0; 9315 #ifdef TCP_SAD_DETECTION 9316 int allow_segsiz; 9317 int first_time_through = 1; 9318 #endif 9319 int noextra = 0; 9320 int can_use_hookery = 0; 9321 9322 start = sack->start; 9323 end = sack->end; 9324 rsm = *prsm; 9325 9326 #ifdef TCP_SAD_DETECTION 9327 /* 9328 * There are a strange number of proxys and meddle boxes in the world 9329 * that seem to cut up segments on different boundaries. This gets us 9330 * smaller sacks that are still ok in terms of it being an attacker. 9331 * We use the base segsiz to calculate an allowable smallness but 9332 * also enforce a min on the segsiz in case it is an attacker playing 9333 * games with MSS. So basically if the sack arrives and it is 9334 * larger than a worse case 960 bytes, we don't classify the guy 9335 * as supicious. 9336 */ 9337 allow_segsiz = max(segsiz, 1200) * sad_seg_size_per; 9338 allow_segsiz /= 1000; 9339 #endif 9340 do_rest_ofb: 9341 if ((rsm == NULL) || 9342 (SEQ_LT(end, rsm->r_start)) || 9343 (SEQ_GEQ(start, rsm->r_end)) || 9344 (SEQ_LT(start, rsm->r_start))) { 9345 /* 9346 * We are not in the right spot, 9347 * find the correct spot in the tree. 9348 */ 9349 used_ref = 0; 9350 rsm = tqhash_find(rack->r_ctl.tqh, start); 9351 moved++; 9352 } 9353 if (rsm == NULL) { 9354 /* TSNH */ 9355 goto out; 9356 } 9357 #ifdef TCP_SAD_DETECTION 9358 /* Now we must check for suspicous activity */ 9359 if ((first_time_through == 1) && 9360 ((end - start) < min((rsm->r_end - rsm->r_start), allow_segsiz)) && 9361 ((rsm->r_flags & RACK_PMTU_CHG) == 0) && 9362 ((rsm->r_flags & RACK_TLP) == 0)) { 9363 /* 9364 * Its less than a full MSS or the segment being acked 9365 * this should only happen if the rsm in question had the 9366 * r_just_ret flag set <and> the end matches the end of 9367 * the rsm block. 9368 * 9369 * Note we do not look at segments that have had TLP's on 9370 * them since we can get un-reported rwnd collapses that 9371 * basically we TLP on and then we get back a sack block 9372 * that goes from the start to only a small way. 
9373 * 9374 */ 9375 int loss, ok; 9376 9377 ok = 0; 9378 if (SEQ_GEQ(end, rsm->r_end)) { 9379 if (rsm->r_just_ret == 1) { 9380 /* This was at the end of a send which is ok */ 9381 ok = 1; 9382 } else { 9383 /* A bit harder was it the end of our segment */ 9384 int segs, len; 9385 9386 len = (rsm->r_end - rsm->r_start); 9387 segs = len / segsiz; 9388 segs *= segsiz; 9389 if ((segs + (rsm->r_end - start)) == len) { 9390 /* 9391 * So this last bit was the 9392 * end of our send if we cut it 9393 * up into segsiz pieces so its ok. 9394 */ 9395 ok = 1; 9396 } 9397 } 9398 } 9399 if (ok == 0) { 9400 /* 9401 * This guy is doing something suspicious 9402 * lets start detection. 9403 */ 9404 if (rack->rc_suspicious == 0) { 9405 tcp_trace_point(rack->rc_tp, TCP_TP_SAD_SUSPECT); 9406 counter_u64_add(rack_sack_attacks_suspect, 1); 9407 rack->rc_suspicious = 1; 9408 rack_log_sad(rack, 4); 9409 if (tcp_bblogging_on(rack->rc_tp)) { 9410 union tcp_log_stackspecific log; 9411 struct timeval tv; 9412 9413 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 9414 log.u_bbr.flex1 = end; 9415 log.u_bbr.flex2 = start; 9416 log.u_bbr.flex3 = rsm->r_end; 9417 log.u_bbr.flex4 = rsm->r_start; 9418 log.u_bbr.flex5 = segsiz; 9419 log.u_bbr.flex6 = rsm->r_fas; 9420 log.u_bbr.flex7 = rsm->r_bas; 9421 log.u_bbr.flex8 = 5; 9422 log.u_bbr.pkts_out = rsm->r_flags; 9423 log.u_bbr.bbr_state = rack->rc_suspicious; 9424 log.u_bbr.bbr_substate = rsm->r_just_ret; 9425 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 9426 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 9427 TCP_LOG_EVENTP(rack->rc_tp, NULL, 9428 &rack->rc_inp->inp_socket->so_rcv, 9429 &rack->rc_inp->inp_socket->so_snd, 9430 TCP_SAD_DETECTION, 0, 9431 0, &log, false, &tv); 9432 } 9433 } 9434 /* You loose some ack count every time you sack 9435 * a small bit that is not butting to the end of 9436 * what we have sent. This is because we never 9437 * send small bits unless its the end of the sb. 9438 * Anyone sending a sack that is not at the end 9439 * is thus very very suspicious. 9440 */ 9441 loss = (segsiz/2) / (end - start); 9442 if (loss < rack->r_ctl.ack_count) 9443 rack->r_ctl.ack_count -= loss; 9444 else 9445 rack->r_ctl.ack_count = 0; 9446 } 9447 } 9448 first_time_through = 0; 9449 #endif 9450 /* Ok we have an ACK for some piece of this rsm */ 9451 if (rsm->r_start != start) { 9452 if ((rsm->r_flags & RACK_ACKED) == 0) { 9453 /* 9454 * Before any splitting or hookery is 9455 * done is it a TLP of interest i.e. rxt? 9456 */ 9457 if ((rsm->r_flags & RACK_TLP) && 9458 (rsm->r_rtr_cnt > 1)) { 9459 /* 9460 * We are splitting a rxt TLP, check 9461 * if we need to save off the start/end 9462 */ 9463 if (rack->rc_last_tlp_acked_set && 9464 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 9465 /* 9466 * We already turned this on since we are inside 9467 * the previous one was a partially sack now we 9468 * are getting another one (maybe all of it). 9469 * 9470 */ 9471 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 9472 /* 9473 * Lets make sure we have all of it though. 
9474 */ 9475 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 9476 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 9477 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 9478 rack->r_ctl.last_tlp_acked_end); 9479 } 9480 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 9481 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 9482 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 9483 rack->r_ctl.last_tlp_acked_end); 9484 } 9485 } else { 9486 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 9487 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 9488 rack->rc_last_tlp_past_cumack = 0; 9489 rack->rc_last_tlp_acked_set = 1; 9490 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 9491 } 9492 } 9493 /** 9494 * Need to split this in two pieces the before and after, 9495 * the before remains in the map, the after must be 9496 * added. In other words we have: 9497 * rsm |--------------| 9498 * sackblk |-------> 9499 * rsm will become 9500 * rsm |---| 9501 * and nrsm will be the sacked piece 9502 * nrsm |----------| 9503 * 9504 * But before we start down that path lets 9505 * see if the sack spans over on top of 9506 * the next guy and it is already sacked. 9507 * 9508 */ 9509 /* 9510 * Hookery can only be used if the two entries 9511 * are in the same bucket and neither one of 9512 * them staddle the bucket line. 9513 */ 9514 next = tqhash_next(rack->r_ctl.tqh, rsm); 9515 if (next && 9516 (rsm->bindex == next->bindex) && 9517 ((rsm->r_flags & RACK_STRADDLE) == 0) && 9518 ((next->r_flags & RACK_STRADDLE) == 0) && 9519 (rsm->r_flags & RACK_IN_GP_WIN) && 9520 (next->r_flags & RACK_IN_GP_WIN)) 9521 can_use_hookery = 1; 9522 else if (next && 9523 (rsm->bindex == next->bindex) && 9524 ((rsm->r_flags & RACK_STRADDLE) == 0) && 9525 ((next->r_flags & RACK_STRADDLE) == 0) && 9526 ((rsm->r_flags & RACK_IN_GP_WIN) == 0) && 9527 ((next->r_flags & RACK_IN_GP_WIN) == 0)) 9528 can_use_hookery = 1; 9529 else 9530 can_use_hookery = 0; 9531 if (next && can_use_hookery && 9532 (next->r_flags & RACK_ACKED) && 9533 SEQ_GEQ(end, next->r_start)) { 9534 /** 9535 * So the next one is already acked, and 9536 * we can thus by hookery use our stack_map 9537 * to reflect the piece being sacked and 9538 * then adjust the two tree entries moving 9539 * the start and ends around. So we start like: 9540 * rsm |------------| (not-acked) 9541 * next |-----------| (acked) 9542 * sackblk |--------> 9543 * We want to end like so: 9544 * rsm |------| (not-acked) 9545 * next |-----------------| (acked) 9546 * nrsm |-----| 9547 * Where nrsm is a temporary stack piece we 9548 * use to update all the gizmos. 9549 */ 9550 /* Copy up our fudge block */ 9551 noextra++; 9552 nrsm = &stack_map; 9553 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 9554 /* Now adjust our tree blocks */ 9555 rsm->r_end = start; 9556 next->r_start = start; 9557 rsm->r_flags |= RACK_SHUFFLED; 9558 next->r_flags |= RACK_SHUFFLED; 9559 /* Now we must adjust back where next->m is */ 9560 rack_setup_offset_for_rsm(rack, rsm, next); 9561 /* 9562 * Which timestamp do we keep? It is rather 9563 * important in GP measurements to have the 9564 * accurate end of the send window. 9565 * 9566 * We keep the largest value, which is the newest 9567 * send. We do this in case a segment that is 9568 * joined together and not part of a GP estimate 9569 * later gets expanded into the GP estimate. 9570 * 9571 * We prohibit the merging of unlike kinds i.e. 
9572 * all pieces that are in the GP estimate can be 9573 * merged and all pieces that are not in a GP estimate 9574 * can be merged, but not disimilar pieces. Combine 9575 * this with taking the highest here and we should 9576 * be ok unless of course the client reneges. Then 9577 * all bets are off. 9578 */ 9579 if (next->r_tim_lastsent[(next->r_rtr_cnt-1)] < 9580 nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)]) 9581 next->r_tim_lastsent[(next->r_rtr_cnt-1)] = nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)]; 9582 /* 9583 * And we must keep the newest ack arrival time. 9584 */ 9585 if (next->r_ack_arrival < 9586 rack_to_usec_ts(&rack->r_ctl.act_rcv_time)) 9587 next->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 9588 9589 9590 /* We don't need to adjust rsm, it did not change */ 9591 /* Clear out the dup ack count of the remainder */ 9592 rsm->r_dupack = 0; 9593 rsm->r_just_ret = 0; 9594 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 9595 /* Now lets make sure our fudge block is right */ 9596 nrsm->r_start = start; 9597 /* Now lets update all the stats and such */ 9598 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); 9599 if (rack->app_limited_needs_set) 9600 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); 9601 changed += (nrsm->r_end - nrsm->r_start); 9602 /* You get a count for acking a whole segment or more */ 9603 if ((nrsm->r_end - nrsm->r_start) >= segsiz) 9604 rack->r_ctl.ack_count += ((nrsm->r_end - nrsm->r_start) / segsiz); 9605 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 9606 if (nrsm->r_flags & RACK_SACK_PASSED) { 9607 rack->r_ctl.rc_reorder_ts = cts; 9608 if (rack->r_ctl.rc_reorder_ts == 0) 9609 rack->r_ctl.rc_reorder_ts = 1; 9610 } 9611 /* 9612 * Now we want to go up from rsm (the 9613 * one left un-acked) to the next one 9614 * in the tmap. We do this so when 9615 * we walk backwards we include marking 9616 * sack-passed on rsm (The one passed in 9617 * is skipped since it is generally called 9618 * on something sacked before removing it 9619 * from the tmap). 9620 */ 9621 if (rsm->r_in_tmap) { 9622 nrsm = TAILQ_NEXT(rsm, r_tnext); 9623 /* 9624 * Now that we have the next 9625 * one walk backwards from there. 9626 */ 9627 if (nrsm && nrsm->r_in_tmap) 9628 rack_log_sack_passed(tp, rack, nrsm); 9629 } 9630 /* Now are we done? */ 9631 if (SEQ_LT(end, next->r_end) || 9632 (end == next->r_end)) { 9633 /* Done with block */ 9634 goto out; 9635 } 9636 rack_log_map_chg(tp, rack, &stack_map, rsm, next, MAP_SACK_M1, end, __LINE__); 9637 counter_u64_add(rack_sack_used_next_merge, 1); 9638 /* Postion for the next block */ 9639 start = next->r_end; 9640 rsm = tqhash_next(rack->r_ctl.tqh, next); 9641 if (rsm == NULL) 9642 goto out; 9643 } else { 9644 /** 9645 * We can't use any hookery here, so we 9646 * need to split the map. We enter like 9647 * so: 9648 * rsm |--------| 9649 * sackblk |-----> 9650 * We will add the new block nrsm and 9651 * that will be the new portion, and then 9652 * fall through after reseting rsm. So we 9653 * split and look like this: 9654 * rsm |----| 9655 * sackblk |-----> 9656 * nrsm |---| 9657 * We then fall through reseting 9658 * rsm to nrsm, so the next block 9659 * picks it up. 9660 */ 9661 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 9662 if (nrsm == NULL) { 9663 /* 9664 * failed XXXrrs what can we do but loose the sack 9665 * info? 
9666 */ 9667 goto out; 9668 } 9669 counter_u64_add(rack_sack_splits, 1); 9670 rack_clone_rsm(rack, nrsm, rsm, start); 9671 moved++; 9672 rsm->r_just_ret = 0; 9673 #ifndef INVARIANTS 9674 (void)tqhash_insert(rack->r_ctl.tqh, nrsm); 9675 #else 9676 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { 9677 panic("Insert in rb tree of %p fails ret:%d rack:%p rsm:%p", 9678 nrsm, insret, rack, rsm); 9679 } 9680 #endif 9681 if (rsm->r_in_tmap) { 9682 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 9683 nrsm->r_in_tmap = 1; 9684 } 9685 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M2, end, __LINE__); 9686 rsm->r_flags &= (~RACK_HAS_FIN); 9687 /* Position us to point to the new nrsm that starts the sack blk */ 9688 rsm = nrsm; 9689 } 9690 } else { 9691 /* Already sacked this piece */ 9692 counter_u64_add(rack_sack_skipped_acked, 1); 9693 moved++; 9694 if (end == rsm->r_end) { 9695 /* Done with block */ 9696 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 9697 goto out; 9698 } else if (SEQ_LT(end, rsm->r_end)) { 9699 /* A partial sack to a already sacked block */ 9700 moved++; 9701 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 9702 goto out; 9703 } else { 9704 /* 9705 * The end goes beyond this guy 9706 * reposition the start to the 9707 * next block. 9708 */ 9709 start = rsm->r_end; 9710 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 9711 if (rsm == NULL) 9712 goto out; 9713 } 9714 } 9715 } 9716 if (SEQ_GEQ(end, rsm->r_end)) { 9717 /** 9718 * The end of this block is either beyond this guy or right 9719 * at this guy. I.e.: 9720 * rsm --- |-----| 9721 * end |-----| 9722 * <or> 9723 * end |---------| 9724 */ 9725 if ((rsm->r_flags & RACK_ACKED) == 0) { 9726 /* 9727 * Is it a TLP of interest? 9728 */ 9729 if ((rsm->r_flags & RACK_TLP) && 9730 (rsm->r_rtr_cnt > 1)) { 9731 /* 9732 * We are splitting a rxt TLP, check 9733 * if we need to save off the start/end 9734 */ 9735 if (rack->rc_last_tlp_acked_set && 9736 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 9737 /* 9738 * We already turned this on since we are inside 9739 * the previous one was a partially sack now we 9740 * are getting another one (maybe all of it). 9741 */ 9742 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 9743 /* 9744 * Lets make sure we have all of it though. 9745 */ 9746 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 9747 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 9748 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 9749 rack->r_ctl.last_tlp_acked_end); 9750 } 9751 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 9752 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 9753 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 9754 rack->r_ctl.last_tlp_acked_end); 9755 } 9756 } else { 9757 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 9758 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 9759 rack->rc_last_tlp_past_cumack = 0; 9760 rack->rc_last_tlp_acked_set = 1; 9761 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 9762 } 9763 } 9764 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); 9765 changed += (rsm->r_end - rsm->r_start); 9766 /* You get a count for acking a whole segment or more */ 9767 if ((rsm->r_end - rsm->r_start) >= segsiz) 9768 rack->r_ctl.ack_count += ((rsm->r_end - rsm->r_start) / segsiz); 9769 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 9770 if (rsm->r_in_tmap) /* should be true */ 9771 rack_log_sack_passed(tp, rack, rsm); 9772 /* Is Reordering occuring? 
*/ 9773 if (rsm->r_flags & RACK_SACK_PASSED) { 9774 rsm->r_flags &= ~RACK_SACK_PASSED; 9775 rack->r_ctl.rc_reorder_ts = cts; 9776 if (rack->r_ctl.rc_reorder_ts == 0) 9777 rack->r_ctl.rc_reorder_ts = 1; 9778 } 9779 if (rack->app_limited_needs_set) 9780 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); 9781 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 9782 rsm->r_flags |= RACK_ACKED; 9783 if (rsm->r_in_tmap) { 9784 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 9785 rsm->r_in_tmap = 0; 9786 } 9787 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_SACK_M3, end, __LINE__); 9788 } else { 9789 counter_u64_add(rack_sack_skipped_acked, 1); 9790 moved++; 9791 } 9792 if (end == rsm->r_end) { 9793 /* This block only - done, setup for next */ 9794 goto out; 9795 } 9796 /* 9797 * There is more not coverend by this rsm move on 9798 * to the next block in the RB tree. 9799 */ 9800 nrsm = tqhash_next(rack->r_ctl.tqh, rsm); 9801 start = rsm->r_end; 9802 rsm = nrsm; 9803 if (rsm == NULL) 9804 goto out; 9805 goto do_rest_ofb; 9806 } 9807 /** 9808 * The end of this sack block is smaller than 9809 * our rsm i.e.: 9810 * rsm --- |-----| 9811 * end |--| 9812 */ 9813 if ((rsm->r_flags & RACK_ACKED) == 0) { 9814 /* 9815 * Is it a TLP of interest? 9816 */ 9817 if ((rsm->r_flags & RACK_TLP) && 9818 (rsm->r_rtr_cnt > 1)) { 9819 /* 9820 * We are splitting a rxt TLP, check 9821 * if we need to save off the start/end 9822 */ 9823 if (rack->rc_last_tlp_acked_set && 9824 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 9825 /* 9826 * We already turned this on since we are inside 9827 * the previous one was a partially sack now we 9828 * are getting another one (maybe all of it). 9829 */ 9830 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 9831 /* 9832 * Lets make sure we have all of it though. 9833 */ 9834 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 9835 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 9836 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 9837 rack->r_ctl.last_tlp_acked_end); 9838 } 9839 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 9840 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 9841 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 9842 rack->r_ctl.last_tlp_acked_end); 9843 } 9844 } else { 9845 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 9846 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 9847 rack->rc_last_tlp_past_cumack = 0; 9848 rack->rc_last_tlp_acked_set = 1; 9849 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 9850 } 9851 } 9852 /* 9853 * Hookery can only be used if the two entries 9854 * are in the same bucket and neither one of 9855 * them staddle the bucket line. 
9856 */ 9857 prev = tqhash_prev(rack->r_ctl.tqh, rsm); 9858 if (prev && 9859 (rsm->bindex == prev->bindex) && 9860 ((rsm->r_flags & RACK_STRADDLE) == 0) && 9861 ((prev->r_flags & RACK_STRADDLE) == 0) && 9862 (rsm->r_flags & RACK_IN_GP_WIN) && 9863 (prev->r_flags & RACK_IN_GP_WIN)) 9864 can_use_hookery = 1; 9865 else if (prev && 9866 (rsm->bindex == prev->bindex) && 9867 ((rsm->r_flags & RACK_STRADDLE) == 0) && 9868 ((prev->r_flags & RACK_STRADDLE) == 0) && 9869 ((rsm->r_flags & RACK_IN_GP_WIN) == 0) && 9870 ((prev->r_flags & RACK_IN_GP_WIN) == 0)) 9871 can_use_hookery = 1; 9872 else 9873 can_use_hookery = 0; 9874 9875 if (prev && can_use_hookery && 9876 (prev->r_flags & RACK_ACKED)) { 9877 /** 9878 * Goal, we want the right remainder of rsm to shrink 9879 * in place and span from (rsm->r_start = end) to rsm->r_end. 9880 * We want to expand prev to go all the way 9881 * to prev->r_end <- end. 9882 * so in the tree we have before: 9883 * prev |--------| (acked) 9884 * rsm |-------| (non-acked) 9885 * sackblk |-| 9886 * We churn it so we end up with 9887 * prev |----------| (acked) 9888 * rsm |-----| (non-acked) 9889 * nrsm |-| (temporary) 9890 * 9891 * Note if either prev/rsm is a TLP we don't 9892 * do this. 9893 */ 9894 noextra++; 9895 nrsm = &stack_map; 9896 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 9897 prev->r_end = end; 9898 rsm->r_start = end; 9899 rsm->r_flags |= RACK_SHUFFLED; 9900 prev->r_flags |= RACK_SHUFFLED; 9901 /* Now adjust nrsm (stack copy) to be 9902 * the one that is the small 9903 * piece that was "sacked". 9904 */ 9905 nrsm->r_end = end; 9906 rsm->r_dupack = 0; 9907 /* 9908 * Which timestamp do we keep? It is rather 9909 * important in GP measurements to have the 9910 * accurate end of the send window. 9911 * 9912 * We keep the largest value, which is the newest 9913 * send. We do this in case a segment that is 9914 * joined together and not part of a GP estimate 9915 * later gets expanded into the GP estimate. 9916 * 9917 * We prohibit the merging of unlike kinds i.e. 9918 * all pieces that are in the GP estimate can be 9919 * merged and all pieces that are not in a GP estimate 9920 * can be merged, but not disimilar pieces. Combine 9921 * this with taking the highest here and we should 9922 * be ok unless of course the client reneges. Then 9923 * all bets are off. 9924 */ 9925 if(prev->r_tim_lastsent[(prev->r_rtr_cnt-1)] < 9926 nrsm->r_tim_lastsent[(nrsm->r_rtr_cnt-1)]) { 9927 prev->r_tim_lastsent[(prev->r_rtr_cnt-1)] = nrsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 9928 } 9929 /* 9930 * And we must keep the newest ack arrival time. 9931 */ 9932 9933 if(prev->r_ack_arrival < 9934 rack_to_usec_ts(&rack->r_ctl.act_rcv_time)) 9935 prev->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 9936 9937 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 9938 /* 9939 * Now that the rsm has had its start moved forward 9940 * lets go ahead and get its new place in the world. 9941 */ 9942 rack_setup_offset_for_rsm(rack, prev, rsm); 9943 /* 9944 * Now nrsm is our new little piece 9945 * that is acked (which was merged 9946 * to prev). Update the rtt and changed 9947 * based on that. Also check for reordering. 
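 * (Note that nrsm here is only the on-stack copy in stack_map; it is never
 * inserted into the tree and is used solely for the RTT/SACK accounting and
 * logging of the newly acked slice.)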
9948 */
9949 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0);
9950 if (rack->app_limited_needs_set)
9951 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END);
9952 changed += (nrsm->r_end - nrsm->r_start);
9953 /* You get a count for acking a whole segment or more */
9954 if ((nrsm->r_end - nrsm->r_start) >= segsiz)
9955 rack->r_ctl.ack_count += ((nrsm->r_end - nrsm->r_start) / segsiz);
9956
9957 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start);
9958 if (nrsm->r_flags & RACK_SACK_PASSED) {
9959 rack->r_ctl.rc_reorder_ts = cts;
9960 if (rack->r_ctl.rc_reorder_ts == 0)
9961 rack->r_ctl.rc_reorder_ts = 1;
9962 }
9963 rack_log_map_chg(tp, rack, prev, &stack_map, rsm, MAP_SACK_M4, end, __LINE__);
9964 rsm = prev;
9965 counter_u64_add(rack_sack_used_prev_merge, 1);
9966 } else {
9967 /**
9968 * This is the case where our previous
9969 * block is not acked either, so we must
9970 * split the block in two.
9971 */
9972 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
9973 if (nrsm == NULL) {
9974 /* failed rrs what can we do but lose the sack info? */
9975 goto out;
9976 }
9977 if ((rsm->r_flags & RACK_TLP) &&
9978 (rsm->r_rtr_cnt > 1)) {
9979 /*
9980 * We are splitting a rxt TLP, check
9981 * if we need to save off the start/end
9982 */
9983 if (rack->rc_last_tlp_acked_set &&
9984 (is_rsm_inside_declared_tlp_block(rack, rsm))) {
9985 /*
9986 * We already turned this on: this block is inside
9987 * the declared TLP block. The previous SACK covered it
9988 * partially and now we are getting another piece (maybe all of it).
9989 */
9990 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end);
9991 /*
9992 * Lets make sure we have all of it though.
9993 */
9994 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) {
9995 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
9996 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
9997 rack->r_ctl.last_tlp_acked_end);
9998 }
9999 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) {
10000 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
10001 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start,
10002 rack->r_ctl.last_tlp_acked_end);
10003 }
10004 } else {
10005 rack->r_ctl.last_tlp_acked_start = rsm->r_start;
10006 rack->r_ctl.last_tlp_acked_end = rsm->r_end;
10007 rack->rc_last_tlp_acked_set = 1;
10008 rack->rc_last_tlp_past_cumack = 0;
10009 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end);
10010 }
10011 }
10012 /**
10013 * In this case nrsm becomes
10014 * nrsm->r_start = end;
10015 * nrsm->r_end = rsm->r_end;
10016 * which is un-acked.
10017 * <and>
10018 * rsm->r_end = nrsm->r_start;
10019 * i.e. the acked piece stays on the
10020 * left and the remaining un-acked piece
10021 * (nrsm) ends up on the right hand side.
10022 *
10023 * So we start like this
10024 * rsm |----------| (not acked)
10025 * sackblk |---|
10026 * build it so we have
10027 * rsm |---| (acked)
10028 * nrsm |------| (not acked)
10029 */
10030 counter_u64_add(rack_sack_splits, 1);
10031 rack_clone_rsm(rack, nrsm, rsm, end);
10032 moved++;
10033 rsm->r_flags &= (~RACK_HAS_FIN);
10034 rsm->r_just_ret = 0;
10035 #ifndef INVARIANTS
10036 (void)tqhash_insert(rack->r_ctl.tqh, nrsm);
10037 #else
10038 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) {
10039 panic("Insert in rb tree of %p fails ret:%d rack:%p rsm:%p",
10040 nrsm, insret, rack, rsm);
10041 }
10042 #endif
10043 if (rsm->r_in_tmap) {
10044 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
10045 nrsm->r_in_tmap = 1;
10046 }
10047 nrsm->r_dupack = 0;
10048 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2);
10049 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0);
10050 changed += (rsm->r_end - rsm->r_start);
10051 /* You get a count for acking a whole segment or more */
10052 if ((rsm->r_end - rsm->r_start) >= segsiz)
10053 rack->r_ctl.ack_count += ((rsm->r_end - rsm->r_start) / segsiz);
10054
10055 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start);
10056 if (rsm->r_in_tmap) /* should be true */
10057 rack_log_sack_passed(tp, rack, rsm);
10058 /* Is Reordering occurring? */
10059 if (rsm->r_flags & RACK_SACK_PASSED) {
10060 rsm->r_flags &= ~RACK_SACK_PASSED;
10061 rack->r_ctl.rc_reorder_ts = cts;
10062 if (rack->r_ctl.rc_reorder_ts == 0)
10063 rack->r_ctl.rc_reorder_ts = 1;
10064 }
10065 if (rack->app_limited_needs_set)
10066 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END);
10067 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time);
10068 rsm->r_flags |= RACK_ACKED;
10069 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M5, end, __LINE__);
10070 if (rsm->r_in_tmap) {
10071 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext);
10072 rsm->r_in_tmap = 0;
10073 }
10074 }
10075 } else if (start != end) {
10076 /*
10077 * The block was already acked.
10078 */
10079 counter_u64_add(rack_sack_skipped_acked, 1);
10080 moved++;
10081 }
10082 out:
10083 if (rsm &&
10084 ((rsm->r_flags & RACK_TLP) == 0) &&
10085 (rsm->r_flags & RACK_ACKED)) {
10086 /*
10087 * Now can we merge where we worked
10088 * with either the previous or
10089 * next block?
10090 */
10091 next = tqhash_next(rack->r_ctl.tqh, rsm);
10092 while (next) {
10093 if (next->r_flags & RACK_TLP)
10094 break;
10095 /* Only allow merges between ones in or out of GP window */
10096 if ((next->r_flags & RACK_IN_GP_WIN) &&
10097 ((rsm->r_flags & RACK_IN_GP_WIN) == 0)) {
10098 break;
10099 }
10100 if ((rsm->r_flags & RACK_IN_GP_WIN) &&
10101 ((next->r_flags & RACK_IN_GP_WIN) == 0)) {
10102 break;
10103 }
10104 if (rsm->bindex != next->bindex)
10105 break;
10106 if (rsm->r_flags & RACK_STRADDLE)
10107 break;
10108 if (next->r_flags & RACK_STRADDLE)
10109 break;
10110 if (next->r_flags & RACK_ACKED) {
10111 /* yep this and next can be merged */
10112 rsm = rack_merge_rsm(rack, rsm, next);
10113 noextra++;
10114 next = tqhash_next(rack->r_ctl.tqh, rsm);
10115 } else
10116 break;
10117 }
10118 /* Now what about the previous?
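 * (Walk backwards as well, merging any adjacent already-ACKed entries under
 * the same bucket, straddle and GP-window constraints used for the forward
 * merge above.)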
*/ 10119 prev = tqhash_prev(rack->r_ctl.tqh, rsm); 10120 while (prev) { 10121 if (prev->r_flags & RACK_TLP) 10122 break; 10123 /* Only allow merges between ones in or out of GP window */ 10124 if ((prev->r_flags & RACK_IN_GP_WIN) && 10125 ((rsm->r_flags & RACK_IN_GP_WIN) == 0)) { 10126 break; 10127 } 10128 if ((rsm->r_flags & RACK_IN_GP_WIN) && 10129 ((prev->r_flags & RACK_IN_GP_WIN) == 0)) { 10130 break; 10131 } 10132 if (rsm->bindex != prev->bindex) 10133 break; 10134 if (rsm->r_flags & RACK_STRADDLE) 10135 break; 10136 if (prev->r_flags & RACK_STRADDLE) 10137 break; 10138 if (prev->r_flags & RACK_ACKED) { 10139 /* yep the previous and this can be merged */ 10140 rsm = rack_merge_rsm(rack, prev, rsm); 10141 noextra++; 10142 prev = tqhash_prev(rack->r_ctl.tqh, rsm); 10143 } else 10144 break; 10145 } 10146 } 10147 if (used_ref == 0) { 10148 counter_u64_add(rack_sack_proc_all, 1); 10149 } else { 10150 counter_u64_add(rack_sack_proc_short, 1); 10151 } 10152 /* Save off the next one for quick reference. */ 10153 nrsm = tqhash_find(rack->r_ctl.tqh, end); 10154 *prsm = rack->r_ctl.rc_sacklast = nrsm; 10155 /* Pass back the moved. */ 10156 *moved_two = moved; 10157 *no_extra = noextra; 10158 return (changed); 10159 } 10160 10161 static void inline 10162 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack) 10163 { 10164 struct rack_sendmap *tmap; 10165 10166 tmap = NULL; 10167 while (rsm && (rsm->r_flags & RACK_ACKED)) { 10168 /* Its no longer sacked, mark it so */ 10169 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 10170 #ifdef INVARIANTS 10171 if (rsm->r_in_tmap) { 10172 panic("rack:%p rsm:%p flags:0x%x in tmap?", 10173 rack, rsm, rsm->r_flags); 10174 } 10175 #endif 10176 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS); 10177 /* Rebuild it into our tmap */ 10178 if (tmap == NULL) { 10179 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 10180 tmap = rsm; 10181 } else { 10182 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext); 10183 tmap = rsm; 10184 } 10185 tmap->r_in_tmap = 1; 10186 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 10187 } 10188 /* 10189 * Now lets possibly clear the sack filter so we start 10190 * recognizing sacks that cover this area. 10191 */ 10192 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack); 10193 10194 } 10195 10196 static void 10197 rack_do_decay(struct tcp_rack *rack) 10198 { 10199 struct timeval res; 10200 10201 #define timersub(tvp, uvp, vvp) \ 10202 do { \ 10203 (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ 10204 (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ 10205 if ((vvp)->tv_usec < 0) { \ 10206 (vvp)->tv_sec--; \ 10207 (vvp)->tv_usec += 1000000; \ 10208 } \ 10209 } while (0) 10210 10211 timersub(&rack->r_ctl.act_rcv_time, &rack->r_ctl.rc_last_time_decay, &res); 10212 #undef timersub 10213 10214 rack->r_ctl.input_pkt++; 10215 if ((rack->rc_in_persist) || 10216 (res.tv_sec >= 1) || 10217 (rack->rc_tp->snd_max == rack->rc_tp->snd_una)) { 10218 /* 10219 * Check for decay of non-SAD, 10220 * we want all SAD detection metrics to 10221 * decay 1/4 per second (or more) passed. 10222 * Current default is 800 so it decays 10223 * 80% every second. 
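 * (That is, assuming ctf_decay_count() scales a counter to
 * value * tcp_sad_decay_val / 1000, the default of 800 keeps 80% of each
 * counter, i.e. sheds 20%, for every second that passes.)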
10224 */ 10225 #ifdef TCP_SAD_DETECTION 10226 uint32_t pkt_delta; 10227 10228 pkt_delta = rack->r_ctl.input_pkt - rack->r_ctl.saved_input_pkt; 10229 #endif 10230 /* Update our saved tracking values */ 10231 rack->r_ctl.saved_input_pkt = rack->r_ctl.input_pkt; 10232 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; 10233 /* Now do we escape without decay? */ 10234 #ifdef TCP_SAD_DETECTION 10235 if (rack->rc_in_persist || 10236 (rack->rc_tp->snd_max == rack->rc_tp->snd_una) || 10237 (pkt_delta < tcp_sad_low_pps)){ 10238 /* 10239 * We don't decay idle connections 10240 * or ones that have a low input pps. 10241 */ 10242 return; 10243 } 10244 /* Decay the counters */ 10245 rack->r_ctl.ack_count = ctf_decay_count(rack->r_ctl.ack_count, 10246 tcp_sad_decay_val); 10247 rack->r_ctl.sack_count = ctf_decay_count(rack->r_ctl.sack_count, 10248 tcp_sad_decay_val); 10249 rack->r_ctl.sack_moved_extra = ctf_decay_count(rack->r_ctl.sack_moved_extra, 10250 tcp_sad_decay_val); 10251 rack->r_ctl.sack_noextra_move = ctf_decay_count(rack->r_ctl.sack_noextra_move, 10252 tcp_sad_decay_val); 10253 #endif 10254 } 10255 } 10256 10257 static void inline 10258 rack_rsm_sender_update(struct tcp_rack *rack, struct tcpcb *tp, struct rack_sendmap *rsm, uint8_t from) 10259 { 10260 /* 10261 * We look at advancing the end send time for our GP 10262 * measurement tracking only as the cumulative acknowledgment 10263 * moves forward. You might wonder about this, why not 10264 * at every transmission or retransmission within the 10265 * GP window update the rc_gp_cumack_ts? Well its rather 10266 * nuanced but basically the GP window *may* expand (as 10267 * it does below) or worse and harder to track it may shrink. 10268 * 10269 * This last makes it impossible to track at the time of 10270 * the send, since you may set forward your rc_gp_cumack_ts 10271 * when you send, because that send *is* in your currently 10272 * "guessed" window, but then it shrinks. Now which was 10273 * the send time of the last bytes in the window, by the 10274 * time you ask that question that part of the sendmap 10275 * is freed. So you don't know and you will have too 10276 * long of send window. Instead by updating the time 10277 * marker only when the cumack advances this assures us 10278 * that we will have only the sends in the window of our 10279 * GP measurement. 10280 * 10281 * Another complication from this is the 10282 * merging of sendmap entries. During SACK processing this 10283 * can happen to conserve the sendmap size. That breaks 10284 * everything down in tracking the send window of the GP 10285 * estimate. So to prevent that and keep it working with 10286 * a tiny bit more limited merging, we only allow like 10287 * types to be merged. I.e. if two sends are in the GP window 10288 * then its ok to merge them together. If two sends are not 10289 * in the GP window its ok to merge them together too. Though 10290 * one send in and one send out cannot be merged. We combine 10291 * this with never allowing the shrinking of the GP window when 10292 * we are in recovery so that we can properly calculate the 10293 * sending times. 10294 * 10295 * This all of course seems complicated, because it is.. :) 10296 * 10297 * The cum-ack is being advanced upon the sendmap. 10298 * If we are not doing a GP estimate don't 10299 * proceed. 
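 * (Illustrative example: if the window was guessed to end at gput_ack = 9000
 * and the cum-ack now covers an rsm ending at 9500, gput_ack is stretched to
 * 9500; then, if that rsm lies in the GP window and carries a newer send
 * timestamp, rc_gp_cumack_ts is advanced to it.)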
10300 */ 10301 uint64_t ts; 10302 10303 if ((tp->t_flags & TF_GPUTINPROG) == 0) 10304 return; 10305 /* 10306 * If this sendmap entry is going 10307 * beyond the measurement window we had picked, 10308 * expand the measurement window by that much. 10309 */ 10310 if (SEQ_GT(rsm->r_end, tp->gput_ack)) { 10311 tp->gput_ack = rsm->r_end; 10312 } 10313 /* 10314 * If we have not setup a ack, then we 10315 * have no idea if the newly acked pieces 10316 * will be "in our seq measurement range". If 10317 * it is when we clear the app_limited_needs_set 10318 * flag the timestamp will be updated. 10319 */ 10320 if (rack->app_limited_needs_set) 10321 return; 10322 /* 10323 * Finally, we grab out the latest timestamp 10324 * that this packet was sent and then see 10325 * if: 10326 * a) The packet touches are newly defined GP range. 10327 * b) The time is greater than (newer) than the 10328 * one we currently have. If so we update 10329 * our sending end time window. 10330 * 10331 * Note we *do not* do this at send time. The reason 10332 * is that if you do you *may* pick up a newer timestamp 10333 * for a range you are not going to measure. We project 10334 * out how far and then sometimes modify that to be 10335 * smaller. If that occurs then you will have a send 10336 * that does not belong to the range included. 10337 */ 10338 if ((ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]) <= 10339 rack->r_ctl.rc_gp_cumack_ts) 10340 return; 10341 if (rack_in_gp_window(tp, rsm)) { 10342 rack->r_ctl.rc_gp_cumack_ts = ts; 10343 rack_log_gpset(rack, tp->gput_ack, (uint32_t)ts, rsm->r_end, 10344 __LINE__, from, rsm); 10345 } 10346 } 10347 10348 static void 10349 rack_process_to_cumack(struct tcpcb *tp, struct tcp_rack *rack, register uint32_t th_ack, uint32_t cts, struct tcpopt *to, uint64_t acktime) 10350 { 10351 struct rack_sendmap *rsm; 10352 /* 10353 * The ACK point is advancing to th_ack, we must drop off 10354 * the packets in the rack log and calculate any eligble 10355 * RTT's. 10356 */ 10357 10358 rack->r_wanted_output = 1; 10359 if (SEQ_GT(th_ack, tp->snd_una)) 10360 rack->r_ctl.last_cumack_advance = acktime; 10361 10362 /* Tend any TLP that has been marked for 1/2 the seq space (its old) */ 10363 if ((rack->rc_last_tlp_acked_set == 1)&& 10364 (rack->rc_last_tlp_past_cumack == 1) && 10365 (SEQ_GT(rack->r_ctl.last_tlp_acked_start, th_ack))) { 10366 /* 10367 * We have reached the point where our last rack 10368 * tlp retransmit sequence is ahead of the cum-ack. 10369 * This can only happen when the cum-ack moves all 10370 * the way around (its been a full 2^^31+1 bytes 10371 * or more since we sent a retransmitted TLP). Lets 10372 * turn off the valid flag since its not really valid. 10373 * 10374 * Note since sack's also turn on this event we have 10375 * a complication, we have to wait to age it out until 10376 * the cum-ack is by the TLP before checking which is 10377 * what the next else clause does. 10378 */ 10379 rack_log_dsack_event(rack, 9, __LINE__, 10380 rack->r_ctl.last_tlp_acked_start, 10381 rack->r_ctl.last_tlp_acked_end); 10382 rack->rc_last_tlp_acked_set = 0; 10383 rack->rc_last_tlp_past_cumack = 0; 10384 } else if ((rack->rc_last_tlp_acked_set == 1) && 10385 (rack->rc_last_tlp_past_cumack == 0) && 10386 (SEQ_GEQ(th_ack, rack->r_ctl.last_tlp_acked_end))) { 10387 /* 10388 * It is safe to start aging TLP's out. 
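 * (Safe because the cum-ack has now reached or passed last_tlp_acked_end;
 * once it later wraps more than half the sequence space beyond the block,
 * the branch above will clear the valid flag.)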
10389 */ 10390 rack->rc_last_tlp_past_cumack = 1; 10391 } 10392 /* We do the same for the tlp send seq as well */ 10393 if ((rack->rc_last_sent_tlp_seq_valid == 1) && 10394 (rack->rc_last_sent_tlp_past_cumack == 1) && 10395 (SEQ_GT(rack->r_ctl.last_sent_tlp_seq, th_ack))) { 10396 rack_log_dsack_event(rack, 9, __LINE__, 10397 rack->r_ctl.last_sent_tlp_seq, 10398 (rack->r_ctl.last_sent_tlp_seq + 10399 rack->r_ctl.last_sent_tlp_len)); 10400 rack->rc_last_sent_tlp_seq_valid = 0; 10401 rack->rc_last_sent_tlp_past_cumack = 0; 10402 } else if ((rack->rc_last_sent_tlp_seq_valid == 1) && 10403 (rack->rc_last_sent_tlp_past_cumack == 0) && 10404 (SEQ_GEQ(th_ack, rack->r_ctl.last_sent_tlp_seq))) { 10405 /* 10406 * It is safe to start aging TLP's send. 10407 */ 10408 rack->rc_last_sent_tlp_past_cumack = 1; 10409 } 10410 more: 10411 rsm = tqhash_min(rack->r_ctl.tqh); 10412 if (rsm == NULL) { 10413 if ((th_ack - 1) == tp->iss) { 10414 /* 10415 * For the SYN incoming case we will not 10416 * have called tcp_output for the sending of 10417 * the SYN, so there will be no map. All 10418 * other cases should probably be a panic. 10419 */ 10420 return; 10421 } 10422 if (tp->t_flags & TF_SENTFIN) { 10423 /* if we sent a FIN we often will not have map */ 10424 return; 10425 } 10426 #ifdef INVARIANTS 10427 panic("No rack map tp:%p for state:%d ack:%u rack:%p snd_una:%u snd_max:%u snd_nxt:%u\n", 10428 tp, 10429 tp->t_state, th_ack, rack, 10430 tp->snd_una, tp->snd_max, tp->snd_nxt); 10431 #endif 10432 return; 10433 } 10434 if (SEQ_LT(th_ack, rsm->r_start)) { 10435 /* Huh map is missing this */ 10436 #ifdef INVARIANTS 10437 printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n", 10438 rsm->r_start, 10439 th_ack, tp->t_state, rack->r_state); 10440 #endif 10441 return; 10442 } 10443 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED, th_ack); 10444 10445 /* Now was it a retransmitted TLP? */ 10446 if ((rsm->r_flags & RACK_TLP) && 10447 (rsm->r_rtr_cnt > 1)) { 10448 /* 10449 * Yes, this rsm was a TLP and retransmitted, remember that 10450 * since if a DSACK comes back on this we don't want 10451 * to think of it as a reordered segment. This may 10452 * get updated again with possibly even other TLPs 10453 * in flight, but thats ok. Only when we don't send 10454 * a retransmitted TLP for 1/2 the sequences space 10455 * will it get turned off (above). 10456 */ 10457 if (rack->rc_last_tlp_acked_set && 10458 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 10459 /* 10460 * We already turned this on since the end matches, 10461 * the previous one was a partially ack now we 10462 * are getting another one (maybe all of it). 10463 */ 10464 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 10465 /* 10466 * Lets make sure we have all of it though. 
10467 */ 10468 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 10469 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 10470 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 10471 rack->r_ctl.last_tlp_acked_end); 10472 } 10473 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 10474 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 10475 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 10476 rack->r_ctl.last_tlp_acked_end); 10477 } 10478 } else { 10479 rack->rc_last_tlp_past_cumack = 1; 10480 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 10481 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 10482 rack->rc_last_tlp_acked_set = 1; 10483 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 10484 } 10485 } 10486 /* Now do we consume the whole thing? */ 10487 if (SEQ_GEQ(th_ack, rsm->r_end)) { 10488 /* Its all consumed. */ 10489 uint32_t left; 10490 uint8_t newly_acked; 10491 10492 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_FREE, rsm->r_end, __LINE__); 10493 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 10494 rsm->r_rtr_bytes = 0; 10495 /* 10496 * Record the time of highest cumack sent if its in our measurement 10497 * window and possibly bump out the end. 10498 */ 10499 rack_rsm_sender_update(rack, tp, rsm, 4); 10500 tqhash_remove(rack->r_ctl.tqh, rsm, REMOVE_TYPE_CUMACK); 10501 if (rsm->r_in_tmap) { 10502 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 10503 rsm->r_in_tmap = 0; 10504 } 10505 newly_acked = 1; 10506 if (rsm->r_flags & RACK_ACKED) { 10507 /* 10508 * It was acked on the scoreboard -- remove 10509 * it from total 10510 */ 10511 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 10512 newly_acked = 0; 10513 } else if (rsm->r_flags & RACK_SACK_PASSED) { 10514 /* 10515 * There are segments ACKED on the 10516 * scoreboard further up. We are seeing 10517 * reordering. 10518 */ 10519 rsm->r_flags &= ~RACK_SACK_PASSED; 10520 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 10521 rsm->r_flags |= RACK_ACKED; 10522 rack->r_ctl.rc_reorder_ts = cts; 10523 if (rack->r_ctl.rc_reorder_ts == 0) 10524 rack->r_ctl.rc_reorder_ts = 1; 10525 if (rack->r_ent_rec_ns) { 10526 /* 10527 * We have sent no more, and we saw an sack 10528 * then ack arrive. 10529 */ 10530 rack->r_might_revert = 1; 10531 } 10532 } 10533 if ((rsm->r_flags & RACK_TO_REXT) && 10534 (tp->t_flags & TF_RCVD_TSTMP) && 10535 (to->to_flags & TOF_TS) && 10536 (to->to_tsecr != 0) && 10537 (tp->t_flags & TF_PREVVALID)) { 10538 /* 10539 * We can use the timestamp to see 10540 * if this retransmission was from the 10541 * first transmit. If so we made a mistake. 10542 */ 10543 tp->t_flags &= ~TF_PREVVALID; 10544 if (to->to_tsecr == rack_ts_to_msec(rsm->r_tim_lastsent[0])) { 10545 /* The first transmit is what this ack is for */ 10546 rack_cong_signal(tp, CC_RTO_ERR, th_ack, __LINE__); 10547 } 10548 } 10549 left = th_ack - rsm->r_end; 10550 if (rack->app_limited_needs_set && newly_acked) 10551 rack_need_set_test(tp, rack, rsm, th_ack, __LINE__, RACK_USE_END_OR_THACK); 10552 /* Free back to zone */ 10553 rack_free(rack, rsm); 10554 if (left) { 10555 goto more; 10556 } 10557 /* Check for reneging */ 10558 rsm = tqhash_min(rack->r_ctl.tqh); 10559 if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) { 10560 /* 10561 * The peer has moved snd_una up to 10562 * the edge of this send, i.e. one 10563 * that it had previously acked. 
The only 10564 * way that can be true if the peer threw 10565 * away data (space issues) that it had 10566 * previously sacked (else it would have 10567 * given us snd_una up to (rsm->r_end). 10568 * We need to undo the acked markings here. 10569 * 10570 * Note we have to look to make sure th_ack is 10571 * our rsm->r_start in case we get an old ack 10572 * where th_ack is behind snd_una. 10573 */ 10574 rack_peer_reneges(rack, rsm, th_ack); 10575 } 10576 return; 10577 } 10578 if (rsm->r_flags & RACK_ACKED) { 10579 /* 10580 * It was acked on the scoreboard -- remove it from 10581 * total for the part being cum-acked. 10582 */ 10583 rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); 10584 } 10585 /* 10586 * Clear the dup ack count for 10587 * the piece that remains. 10588 */ 10589 rsm->r_dupack = 0; 10590 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 10591 if (rsm->r_rtr_bytes) { 10592 /* 10593 * It was retransmitted adjust the 10594 * sack holes for what was acked. 10595 */ 10596 int ack_am; 10597 10598 ack_am = (th_ack - rsm->r_start); 10599 if (ack_am >= rsm->r_rtr_bytes) { 10600 rack->r_ctl.rc_holes_rxt -= ack_am; 10601 rsm->r_rtr_bytes -= ack_am; 10602 } 10603 } 10604 /* 10605 * Update where the piece starts and record 10606 * the time of send of highest cumack sent if 10607 * its in our GP range. 10608 */ 10609 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_TRIM_HEAD, th_ack, __LINE__); 10610 /* Now we need to move our offset forward too */ 10611 if (rsm->m && 10612 ((rsm->orig_m_len != rsm->m->m_len) || 10613 (M_TRAILINGROOM(rsm->m) != rsm->orig_t_space))) { 10614 /* Fix up the orig_m_len and possibly the mbuf offset */ 10615 rack_adjust_orig_mlen(rsm); 10616 } 10617 rsm->soff += (th_ack - rsm->r_start); 10618 rack_rsm_sender_update(rack, tp, rsm, 5); 10619 /* The trim will move th_ack into r_start for us */ 10620 tqhash_trim(rack->r_ctl.tqh, th_ack); 10621 /* Now do we need to move the mbuf fwd too? */ 10622 if (rsm->m) { 10623 while (rsm->soff >= rsm->m->m_len) { 10624 rsm->soff -= rsm->m->m_len; 10625 rsm->m = rsm->m->m_next; 10626 KASSERT((rsm->m != NULL), 10627 (" nrsm:%p hit at soff:%u null m", 10628 rsm, rsm->soff)); 10629 } 10630 rsm->orig_m_len = rsm->m->m_len; 10631 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 10632 } 10633 if (rack->app_limited_needs_set && 10634 SEQ_GEQ(th_ack, tp->gput_seq)) 10635 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG); 10636 } 10637 10638 static void 10639 rack_handle_might_revert(struct tcpcb *tp, struct tcp_rack *rack) 10640 { 10641 struct rack_sendmap *rsm; 10642 int sack_pass_fnd = 0; 10643 10644 if (rack->r_might_revert) { 10645 /* 10646 * Ok we have reordering, have not sent anything, we 10647 * might want to revert the congestion state if nothing 10648 * further has SACK_PASSED on it. Lets check. 10649 * 10650 * We also get here when we have DSACKs come in for 10651 * all the data that we FR'd. Note that a rxt or tlp 10652 * timer clears this from happening. 10653 */ 10654 10655 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 10656 if (rsm->r_flags & RACK_SACK_PASSED) { 10657 sack_pass_fnd = 1; 10658 break; 10659 } 10660 } 10661 if (sack_pass_fnd == 0) { 10662 /* 10663 * We went into recovery 10664 * incorrectly due to reordering! 
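 * Undo it: restore the ssthresh saved when recovery was entered, pull
 * snd_recover back to snd_una and exit recovery, which is what the code
 * below does.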
10665 */ 10666 int orig_cwnd; 10667 10668 rack->r_ent_rec_ns = 0; 10669 orig_cwnd = tp->snd_cwnd; 10670 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at_erec; 10671 tp->snd_recover = tp->snd_una; 10672 rack_log_to_prr(rack, 14, orig_cwnd, __LINE__); 10673 EXIT_RECOVERY(tp->t_flags); 10674 } 10675 rack->r_might_revert = 0; 10676 } 10677 } 10678 10679 #ifdef TCP_SAD_DETECTION 10680 10681 static void 10682 rack_merge_out_sacks(struct tcp_rack *rack) 10683 { 10684 struct rack_sendmap *cur, *next, *rsm, *trsm = NULL; 10685 10686 cur = tqhash_min(rack->r_ctl.tqh); 10687 while(cur) { 10688 next = tqhash_next(rack->r_ctl.tqh, cur); 10689 /* 10690 * The idea is to go through all and merge back 10691 * together the pieces sent together, 10692 */ 10693 if ((next != NULL) && 10694 (cur->r_tim_lastsent[0] == next->r_tim_lastsent[0])) { 10695 rack_merge_rsm(rack, cur, next); 10696 } else { 10697 cur = next; 10698 } 10699 } 10700 /* 10701 * now treat it like a rxt event, everything is outstanding 10702 * and sent nothing acvked and dupacks are all zero. If this 10703 * is not an attacker it will have to dupack its way through 10704 * it all. 10705 */ 10706 TAILQ_INIT(&rack->r_ctl.rc_tmap); 10707 TQHASH_FOREACH(rsm, rack->r_ctl.tqh) { 10708 rsm->r_dupack = 0; 10709 /* We must re-add it back to the tlist */ 10710 if (trsm == NULL) { 10711 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 10712 } else { 10713 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); 10714 } 10715 rsm->r_in_tmap = 1; 10716 trsm = rsm; 10717 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS | RACK_RWND_COLLAPSED); 10718 } 10719 sack_filter_clear(&rack->r_ctl.rack_sf, rack->rc_tp->snd_una); 10720 } 10721 10722 static void 10723 rack_do_detection(struct tcpcb *tp, struct tcp_rack *rack, uint32_t bytes_this_ack, uint32_t segsiz) 10724 { 10725 int do_detection = 0; 10726 10727 if (rack->sack_attack_disable || rack->rc_suspicious) { 10728 /* 10729 * If we have been disabled we must detect 10730 * to possibly reverse it. Or if the guy has 10731 * sent in suspicious sacks we want to do detection too. 10732 */ 10733 do_detection = 1; 10734 10735 } else if ((rack->do_detection || tcp_force_detection) && 10736 (tcp_sack_to_ack_thresh > 0) && 10737 (tcp_sack_to_move_thresh > 0) && 10738 (rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum)) { 10739 /* 10740 * We only detect here if: 10741 * 1) System wide forcing is on <or> do_detection is on 10742 * <and> 10743 * 2) We have thresholds for move and ack (set one to 0 and we are off) 10744 * <and> 10745 * 3) We have maps allocated larger than our min (500). 10746 */ 10747 do_detection = 1; 10748 } 10749 if (do_detection > 0) { 10750 /* 10751 * We have thresholds set to find 10752 * possible attackers and disable sack. 10753 * Check them. 10754 */ 10755 uint64_t ackratio, moveratio, movetotal; 10756 10757 /* Log detecting */ 10758 rack_log_sad(rack, 1); 10759 /* Do we establish a ack ratio */ 10760 if ((rack->r_ctl.sack_count > tcp_map_minimum) || 10761 (rack->rc_suspicious == 1) || 10762 (rack->sack_attack_disable > 0)) { 10763 ackratio = (uint64_t)(rack->r_ctl.sack_count); 10764 ackratio *= (uint64_t)(1000); 10765 if (rack->r_ctl.ack_count) 10766 ackratio /= (uint64_t)(rack->r_ctl.ack_count); 10767 else { 10768 /* We can hit this due to ack totals degregation (via small sacks) */ 10769 ackratio = 1000; 10770 } 10771 } else { 10772 /* 10773 * No ack ratio needed if we have not 10774 * seen more sacks then the number of map entries. 
10775 * The exception to that is if we have disabled sack then 10776 * we need to find a ratio. 10777 */ 10778 ackratio = 0; 10779 } 10780 10781 if ((rack->sack_attack_disable == 0) && 10782 (ackratio > rack_highest_sack_thresh_seen)) 10783 rack_highest_sack_thresh_seen = (uint32_t)ackratio; 10784 /* Do we establish a move ratio? */ 10785 if ((rack->r_ctl.sack_moved_extra > tcp_map_minimum) || 10786 (rack->rc_suspicious == 1) || 10787 (rack->sack_attack_disable > 0)) { 10788 /* 10789 * We need to have more sack moves than maps 10790 * allocated to have a move ratio considered. 10791 */ 10792 movetotal = rack->r_ctl.sack_moved_extra; 10793 movetotal += rack->r_ctl.sack_noextra_move; 10794 moveratio = rack->r_ctl.sack_moved_extra; 10795 moveratio *= (uint64_t)1000; 10796 if (movetotal) 10797 moveratio /= movetotal; 10798 else { 10799 /* No moves, thats pretty good */ 10800 moveratio = 0; 10801 } 10802 } else { 10803 /* 10804 * Not enough moves have occured to consider 10805 * if we are out of whack in that ratio. 10806 * The exception to that is if we have disabled sack then 10807 * we need to find a ratio. 10808 */ 10809 moveratio = 0; 10810 } 10811 if ((rack->sack_attack_disable == 0) && 10812 (moveratio > rack_highest_move_thresh_seen)) 10813 rack_highest_move_thresh_seen = (uint32_t)moveratio; 10814 /* Now the tests */ 10815 if (rack->sack_attack_disable == 0) { 10816 /* Not disabled, do we need to disable? */ 10817 if ((ackratio > tcp_sack_to_ack_thresh) && 10818 (moveratio > tcp_sack_to_move_thresh)) { 10819 /* Disable sack processing */ 10820 tcp_trace_point(rack->rc_tp, TCP_TP_SAD_TRIGGERED); 10821 rack->sack_attack_disable = 1; 10822 /* set it so we have the built in delay */ 10823 rack->r_ctl.ack_during_sd = 1; 10824 if (rack_merge_out_sacks_on_attack) 10825 rack_merge_out_sacks(rack); 10826 counter_u64_add(rack_sack_attacks_detected, 1); 10827 tcp_trace_point(rack->rc_tp, TCP_TP_SAD_TRIGGERED); 10828 /* Clamp the cwnd at flight size */ 10829 rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd; 10830 rack->rc_tp->snd_cwnd = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 10831 rack_log_sad(rack, 2); 10832 } 10833 } else { 10834 /* We are sack-disabled check for false positives */ 10835 if ((ackratio <= tcp_restoral_thresh) || 10836 ((rack_merge_out_sacks_on_attack == 0) && 10837 (rack->rc_suspicious == 0) && 10838 (rack->r_ctl.rc_num_maps_alloced <= (tcp_map_minimum/2)))) { 10839 rack->sack_attack_disable = 0; 10840 rack_log_sad(rack, 3); 10841 /* Restart counting */ 10842 rack->r_ctl.sack_count = 0; 10843 rack->r_ctl.sack_moved_extra = 0; 10844 rack->r_ctl.sack_noextra_move = 1; 10845 rack->rc_suspicious = 0; 10846 rack->r_ctl.ack_count = max(1, 10847 (bytes_this_ack / segsiz)); 10848 10849 counter_u64_add(rack_sack_attacks_reversed, 1); 10850 /* Restore the cwnd */ 10851 if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd) 10852 rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd; 10853 } 10854 } 10855 } 10856 } 10857 #endif 10858 10859 static int 10860 rack_note_dsack(struct tcp_rack *rack, tcp_seq start, tcp_seq end) 10861 { 10862 10863 uint32_t am, l_end; 10864 int was_tlp = 0; 10865 10866 if (SEQ_GT(end, start)) 10867 am = end - start; 10868 else 10869 am = 0; 10870 if ((rack->rc_last_tlp_acked_set ) && 10871 (SEQ_GEQ(start, rack->r_ctl.last_tlp_acked_start)) && 10872 (SEQ_LEQ(end, rack->r_ctl.last_tlp_acked_end))) { 10873 /* 10874 * The DSACK is because of a TLP which we don't 10875 * do anything with the reordering window over since 10876 * it was not reordering that 
caused the DSACK but 10877 * our previous retransmit TLP. 10878 */ 10879 rack_log_dsack_event(rack, 7, __LINE__, start, end); 10880 was_tlp = 1; 10881 goto skip_dsack_round; 10882 } 10883 if (rack->rc_last_sent_tlp_seq_valid) { 10884 l_end = rack->r_ctl.last_sent_tlp_seq + rack->r_ctl.last_sent_tlp_len; 10885 if (SEQ_GEQ(start, rack->r_ctl.last_sent_tlp_seq) && 10886 (SEQ_LEQ(end, l_end))) { 10887 /* 10888 * This dsack is from the last sent TLP, ignore it 10889 * for reordering purposes. 10890 */ 10891 rack_log_dsack_event(rack, 7, __LINE__, start, end); 10892 was_tlp = 1; 10893 goto skip_dsack_round; 10894 } 10895 } 10896 if (rack->rc_dsack_round_seen == 0) { 10897 rack->rc_dsack_round_seen = 1; 10898 rack->r_ctl.dsack_round_end = rack->rc_tp->snd_max; 10899 rack->r_ctl.num_dsack++; 10900 rack->r_ctl.dsack_persist = 16; /* 16 is from the standard */ 10901 rack_log_dsack_event(rack, 2, __LINE__, 0, 0); 10902 } 10903 skip_dsack_round: 10904 /* 10905 * We keep track of how many DSACK blocks we get 10906 * after a recovery incident. 10907 */ 10908 rack->r_ctl.dsack_byte_cnt += am; 10909 if (!IN_FASTRECOVERY(rack->rc_tp->t_flags) && 10910 rack->r_ctl.retran_during_recovery && 10911 (rack->r_ctl.dsack_byte_cnt >= rack->r_ctl.retran_during_recovery)) { 10912 /* 10913 * False recovery most likely culprit is reordering. If 10914 * nothing else is missing we need to revert. 10915 */ 10916 rack->r_might_revert = 1; 10917 rack_handle_might_revert(rack->rc_tp, rack); 10918 rack->r_might_revert = 0; 10919 rack->r_ctl.retran_during_recovery = 0; 10920 rack->r_ctl.dsack_byte_cnt = 0; 10921 } 10922 return (was_tlp); 10923 } 10924 10925 static uint32_t 10926 do_rack_compute_pipe(struct tcpcb *tp, struct tcp_rack *rack, uint32_t snd_una) 10927 { 10928 return (((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt); 10929 } 10930 10931 static int32_t 10932 rack_compute_pipe(struct tcpcb *tp) 10933 { 10934 return ((int32_t)do_rack_compute_pipe(tp, 10935 (struct tcp_rack *)tp->t_fb_ptr, 10936 tp->snd_una)); 10937 } 10938 10939 static void 10940 rack_update_prr(struct tcpcb *tp, struct tcp_rack *rack, uint32_t changed, tcp_seq th_ack) 10941 { 10942 /* Deal with changed and PRR here (in recovery only) */ 10943 uint32_t pipe, snd_una; 10944 10945 rack->r_ctl.rc_prr_delivered += changed; 10946 10947 if (sbavail(&rack->rc_inp->inp_socket->so_snd) <= (tp->snd_max - tp->snd_una)) { 10948 /* 10949 * It is all outstanding, we are application limited 10950 * and thus we don't need more room to send anything. 10951 * Note we use tp->snd_una here and not th_ack because 10952 * the data as yet not been cut from the sb. 
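 * (When we do not take this early return, the code below computes
 * rc_prr_sndcnt along the lines of RFC 6937 Proportional Rate Reduction:
 * while pipe > ssthresh we may send roughly
 * prr_delivered * ssthresh / RecoverFS minus what was already sent in
 * recovery; once pipe <= ssthresh a limited amount bounded by
 * ssthresh - pipe is allowed.)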
10953 */ 10954 rack->r_ctl.rc_prr_sndcnt = 0; 10955 return; 10956 } 10957 /* Compute prr_sndcnt */ 10958 if (SEQ_GT(tp->snd_una, th_ack)) { 10959 snd_una = tp->snd_una; 10960 } else { 10961 snd_una = th_ack; 10962 } 10963 pipe = do_rack_compute_pipe(tp, rack, snd_una); 10964 if (pipe > tp->snd_ssthresh) { 10965 long sndcnt; 10966 10967 sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh; 10968 if (rack->r_ctl.rc_prr_recovery_fs > 0) 10969 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; 10970 else { 10971 rack->r_ctl.rc_prr_sndcnt = 0; 10972 rack_log_to_prr(rack, 9, 0, __LINE__); 10973 sndcnt = 0; 10974 } 10975 sndcnt++; 10976 if (sndcnt > (long)rack->r_ctl.rc_prr_out) 10977 sndcnt -= rack->r_ctl.rc_prr_out; 10978 else 10979 sndcnt = 0; 10980 rack->r_ctl.rc_prr_sndcnt = sndcnt; 10981 rack_log_to_prr(rack, 10, 0, __LINE__); 10982 } else { 10983 uint32_t limit; 10984 10985 if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out) 10986 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out); 10987 else 10988 limit = 0; 10989 if (changed > limit) 10990 limit = changed; 10991 limit += ctf_fixed_maxseg(tp); 10992 if (tp->snd_ssthresh > pipe) { 10993 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); 10994 rack_log_to_prr(rack, 11, 0, __LINE__); 10995 } else { 10996 rack->r_ctl.rc_prr_sndcnt = min(0, limit); 10997 rack_log_to_prr(rack, 12, 0, __LINE__); 10998 } 10999 } 11000 } 11001 11002 static void 11003 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered_recovery, int dup_ack_struck, 11004 int *dsack_seen, int *sacks_seen) 11005 { 11006 uint32_t changed; 11007 struct tcp_rack *rack; 11008 struct rack_sendmap *rsm; 11009 struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; 11010 register uint32_t th_ack; 11011 int32_t i, j, k, num_sack_blks = 0; 11012 uint32_t cts, acked, ack_point; 11013 int loop_start = 0, moved_two = 0, no_extra = 0; 11014 uint32_t tsused; 11015 uint32_t segsiz, o_cnt; 11016 11017 11018 INP_WLOCK_ASSERT(tptoinpcb(tp)); 11019 if (tcp_get_flags(th) & TH_RST) { 11020 /* We don't log resets */ 11021 return; 11022 } 11023 rack = (struct tcp_rack *)tp->t_fb_ptr; 11024 cts = tcp_get_usecs(NULL); 11025 rsm = tqhash_min(rack->r_ctl.tqh); 11026 changed = 0; 11027 th_ack = th->th_ack; 11028 if (rack->sack_attack_disable == 0) 11029 rack_do_decay(rack); 11030 segsiz = ctf_fixed_maxseg(rack->rc_tp); 11031 if (BYTES_THIS_ACK(tp, th) >= segsiz) { 11032 /* 11033 * You only get credit for 11034 * MSS and greater (and you get extra 11035 * credit for larger cum-ack moves). 11036 */ 11037 int ac; 11038 11039 ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp); 11040 rack->r_ctl.ack_count += ac; 11041 counter_u64_add(rack_ack_total, ac); 11042 } 11043 if (rack->r_ctl.ack_count > 0xfff00000) { 11044 /* 11045 * reduce the number to keep us under 11046 * a uint32_t. 11047 */ 11048 rack->r_ctl.ack_count /= 2; 11049 rack->r_ctl.sack_count /= 2; 11050 } 11051 if (SEQ_GT(th_ack, tp->snd_una)) { 11052 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__); 11053 tp->t_acktime = ticks; 11054 } 11055 if (rsm && SEQ_GT(th_ack, rsm->r_start)) 11056 changed = th_ack - rsm->r_start; 11057 if (changed) { 11058 rack_process_to_cumack(tp, rack, th_ack, cts, to, 11059 tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time)); 11060 } 11061 if ((to->to_flags & TOF_SACK) == 0) { 11062 /* We are done nothing left and no sack. 
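 * Just handle a possible revert of a wrongly entered recovery and, if this
 * was a dup-ack strike, credit one MSS to 'changed' so PRR still advances.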
*/ 11063 rack_handle_might_revert(tp, rack); 11064 /* 11065 * For cases where we struck a dup-ack 11066 * with no SACK, add to the changes so 11067 * PRR will work right. 11068 */ 11069 if (dup_ack_struck && (changed == 0)) { 11070 changed += ctf_fixed_maxseg(rack->rc_tp); 11071 } 11072 goto out; 11073 } 11074 /* Sack block processing */ 11075 if (SEQ_GT(th_ack, tp->snd_una)) 11076 ack_point = th_ack; 11077 else 11078 ack_point = tp->snd_una; 11079 for (i = 0; i < to->to_nsacks; i++) { 11080 bcopy((to->to_sacks + i * TCPOLEN_SACK), 11081 &sack, sizeof(sack)); 11082 sack.start = ntohl(sack.start); 11083 sack.end = ntohl(sack.end); 11084 if (SEQ_GT(sack.end, sack.start) && 11085 SEQ_GT(sack.start, ack_point) && 11086 SEQ_LT(sack.start, tp->snd_max) && 11087 SEQ_GT(sack.end, ack_point) && 11088 SEQ_LEQ(sack.end, tp->snd_max)) { 11089 sack_blocks[num_sack_blks] = sack; 11090 num_sack_blks++; 11091 } else if (SEQ_LEQ(sack.start, th_ack) && 11092 SEQ_LEQ(sack.end, th_ack)) { 11093 int was_tlp; 11094 11095 if (dsack_seen != NULL) 11096 *dsack_seen = 1; 11097 was_tlp = rack_note_dsack(rack, sack.start, sack.end); 11098 /* 11099 * Its a D-SACK block. 11100 */ 11101 tcp_record_dsack(tp, sack.start, sack.end, was_tlp); 11102 } 11103 } 11104 if (rack->rc_dsack_round_seen) { 11105 /* Is the dsack roound over? */ 11106 if (SEQ_GEQ(th_ack, rack->r_ctl.dsack_round_end)) { 11107 /* Yes it is */ 11108 rack->rc_dsack_round_seen = 0; 11109 rack_log_dsack_event(rack, 3, __LINE__, 0, 0); 11110 } 11111 } 11112 /* 11113 * Sort the SACK blocks so we can update the rack scoreboard with 11114 * just one pass. 11115 */ 11116 o_cnt = num_sack_blks; 11117 num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, 11118 num_sack_blks, th->th_ack); 11119 ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks); 11120 if (sacks_seen != NULL) 11121 *sacks_seen = num_sack_blks; 11122 if (num_sack_blks == 0) { 11123 /* Nothing to sack, but we need to update counts */ 11124 if ((o_cnt == 1) && 11125 (*dsack_seen != 1)) 11126 rack->r_ctl.sack_count++; 11127 else if (o_cnt > 1) 11128 rack->r_ctl.sack_count++; 11129 goto out_with_totals; 11130 } 11131 if (rack->sack_attack_disable) { 11132 /* 11133 * An attacker disablement is in place, for 11134 * every sack block that is not at least a full MSS 11135 * count up sack_count. 11136 */ 11137 for (i = 0; i < num_sack_blks; i++) { 11138 if ((sack_blocks[i].end - sack_blocks[i].start) < segsiz) { 11139 rack->r_ctl.sack_count++; 11140 } 11141 if (rack->r_ctl.sack_count > 0xfff00000) { 11142 /* 11143 * reduce the number to keep us under 11144 * a uint32_t. 11145 */ 11146 rack->r_ctl.ack_count /= 2; 11147 rack->r_ctl.sack_count /= 2; 11148 } 11149 } 11150 goto out; 11151 } 11152 /* Its a sack of some sort */ 11153 rack->r_ctl.sack_count += num_sack_blks; 11154 if (rack->r_ctl.sack_count > 0xfff00000) { 11155 /* 11156 * reduce the number to keep us under 11157 * a uint32_t. 11158 */ 11159 rack->r_ctl.ack_count /= 2; 11160 rack->r_ctl.sack_count /= 2; 11161 } 11162 if (num_sack_blks < 2) { 11163 /* Only one, we don't need to sort */ 11164 goto do_sack_work; 11165 } 11166 /* Sort the sacks */ 11167 for (i = 0; i < num_sack_blks; i++) { 11168 for (j = i + 1; j < num_sack_blks; j++) { 11169 if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { 11170 sack = sack_blocks[i]; 11171 sack_blocks[i] = sack_blocks[j]; 11172 sack_blocks[j] = sack; 11173 } 11174 } 11175 } 11176 /* 11177 * Now are any of the sack block ends the same (yes some 11178 * implementations send these)? 
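 * (When two blocks share an end we keep the one with the smaller start,
 * i.e. the larger block, drop the other and compact the array; e.g.
 * [5,10) and [3,10) collapse to [3,10).)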
11179 */ 11180 again: 11181 if (num_sack_blks == 0) 11182 goto out_with_totals; 11183 if (num_sack_blks > 1) { 11184 for (i = 0; i < num_sack_blks; i++) { 11185 for (j = i + 1; j < num_sack_blks; j++) { 11186 if (sack_blocks[i].end == sack_blocks[j].end) { 11187 /* 11188 * Ok these two have the same end we 11189 * want the smallest end and then 11190 * throw away the larger and start 11191 * again. 11192 */ 11193 if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) { 11194 /* 11195 * The second block covers 11196 * more area use that 11197 */ 11198 sack_blocks[i].start = sack_blocks[j].start; 11199 } 11200 /* 11201 * Now collapse out the dup-sack and 11202 * lower the count 11203 */ 11204 for (k = (j + 1); k < num_sack_blks; k++) { 11205 sack_blocks[j].start = sack_blocks[k].start; 11206 sack_blocks[j].end = sack_blocks[k].end; 11207 j++; 11208 } 11209 num_sack_blks--; 11210 goto again; 11211 } 11212 } 11213 } 11214 } 11215 do_sack_work: 11216 /* 11217 * First lets look to see if 11218 * we have retransmitted and 11219 * can use the transmit next? 11220 */ 11221 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 11222 if (rsm && 11223 SEQ_GT(sack_blocks[0].end, rsm->r_start) && 11224 SEQ_LT(sack_blocks[0].start, rsm->r_end)) { 11225 /* 11226 * We probably did the FR and the next 11227 * SACK in continues as we would expect. 11228 */ 11229 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &no_extra, &moved_two, segsiz); 11230 if (acked) { 11231 rack->r_wanted_output = 1; 11232 changed += acked; 11233 } 11234 if (num_sack_blks == 1) { 11235 /* 11236 * This is what we would expect from 11237 * a normal implementation to happen 11238 * after we have retransmitted the FR, 11239 * i.e the sack-filter pushes down 11240 * to 1 block and the next to be retransmitted 11241 * is the sequence in the sack block (has more 11242 * are acked). Count this as ACK'd data to boost 11243 * up the chances of recovering any false positives. 11244 */ 11245 rack->r_ctl.ack_count += (acked / ctf_fixed_maxseg(rack->rc_tp)); 11246 counter_u64_add(rack_ack_total, (acked / ctf_fixed_maxseg(rack->rc_tp))); 11247 counter_u64_add(rack_express_sack, 1); 11248 if (rack->r_ctl.ack_count > 0xfff00000) { 11249 /* 11250 * reduce the number to keep us under 11251 * a uint32_t. 11252 */ 11253 rack->r_ctl.ack_count /= 2; 11254 rack->r_ctl.sack_count /= 2; 11255 } 11256 if (moved_two) { 11257 /* 11258 * If we did not get a SACK for at least a MSS and 11259 * had to move at all, or if we moved more than our 11260 * threshold, it counts against the "extra" move. 11261 */ 11262 rack->r_ctl.sack_moved_extra += moved_two; 11263 rack->r_ctl.sack_noextra_move += no_extra; 11264 counter_u64_add(rack_move_some, 1); 11265 } else { 11266 /* 11267 * else we did not have to move 11268 * any more than we would expect. 11269 */ 11270 rack->r_ctl.sack_noextra_move += no_extra; 11271 rack->r_ctl.sack_noextra_move++; 11272 counter_u64_add(rack_move_none, 1); 11273 } 11274 if ((rack->r_ctl.sack_moved_extra > 0xfff00000) || 11275 (rack->r_ctl.sack_noextra_move > 0xfff00000)) { 11276 rack->r_ctl.sack_moved_extra /= 2; 11277 rack->r_ctl.sack_noextra_move /= 2; 11278 } 11279 goto out_with_totals; 11280 } else { 11281 /* 11282 * Start the loop through the 11283 * rest of blocks, past the first block. 
11284 */ 11285 loop_start = 1; 11286 } 11287 } 11288 counter_u64_add(rack_sack_total, 1); 11289 rsm = rack->r_ctl.rc_sacklast; 11290 for (i = loop_start; i < num_sack_blks; i++) { 11291 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &no_extra, &moved_two, segsiz); 11292 if (acked) { 11293 rack->r_wanted_output = 1; 11294 changed += acked; 11295 } 11296 if (moved_two) { 11297 /* 11298 * If we did not get a SACK for at least a MSS and 11299 * had to move at all, or if we moved more than our 11300 * threshold, it counts against the "extra" move. 11301 */ 11302 rack->r_ctl.sack_moved_extra += moved_two; 11303 rack->r_ctl.sack_noextra_move += no_extra; 11304 counter_u64_add(rack_move_some, 1); 11305 } else { 11306 /* 11307 * else we did not have to move 11308 * any more than we would expect. 11309 */ 11310 rack->r_ctl.sack_noextra_move += no_extra; 11311 rack->r_ctl.sack_noextra_move++; 11312 counter_u64_add(rack_move_none, 1); 11313 } 11314 if ((rack->r_ctl.sack_moved_extra > 0xfff00000) || 11315 (rack->r_ctl.sack_noextra_move > 0xfff00000)) { 11316 rack->r_ctl.sack_moved_extra /= 2; 11317 rack->r_ctl.sack_noextra_move /= 2; 11318 } 11319 if (moved_two && (acked < ctf_fixed_maxseg(rack->rc_tp))) { 11320 /* 11321 * If the SACK was not a full MSS then 11322 * we add to sack_count the number of 11323 * MSS's (or possibly more than 11324 * a MSS if its a TSO send) we had to skip by. 11325 */ 11326 rack->r_ctl.sack_count += moved_two; 11327 if (rack->r_ctl.sack_count > 0xfff00000) { 11328 rack->r_ctl.ack_count /= 2; 11329 rack->r_ctl.sack_count /= 2; 11330 } 11331 counter_u64_add(rack_sack_total, moved_two); 11332 } 11333 /* 11334 * Now we need to setup for the next 11335 * round. First we make sure we won't 11336 * exceed the size of our uint32_t on 11337 * the various counts, and then clear out 11338 * moved_two. 11339 */ 11340 moved_two = 0; 11341 no_extra = 0; 11342 } 11343 out_with_totals: 11344 if (num_sack_blks > 1) { 11345 /* 11346 * You get an extra stroke if 11347 * you have more than one sack-blk, this 11348 * could be where we are skipping forward 11349 * and the sack-filter is still working, or 11350 * it could be an attacker constantly 11351 * moving us. 11352 */ 11353 rack->r_ctl.sack_moved_extra++; 11354 counter_u64_add(rack_move_some, 1); 11355 } 11356 out: 11357 #ifdef TCP_SAD_DETECTION 11358 rack_do_detection(tp, rack, BYTES_THIS_ACK(tp, th), ctf_fixed_maxseg(rack->rc_tp)); 11359 #endif 11360 if (changed) { 11361 /* Something changed cancel the rack timer */ 11362 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 11363 } 11364 tsused = tcp_get_usecs(NULL); 11365 rsm = tcp_rack_output(tp, rack, tsused); 11366 if ((!IN_FASTRECOVERY(tp->t_flags)) && 11367 rsm && 11368 ((rsm->r_flags & RACK_MUST_RXT) == 0)) { 11369 /* Enter recovery */ 11370 entered_recovery = 1; 11371 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__); 11372 /* 11373 * When we enter recovery we need to assure we send 11374 * one packet. 11375 */ 11376 if (rack->rack_no_prr == 0) { 11377 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 11378 rack_log_to_prr(rack, 8, 0, __LINE__); 11379 } 11380 rack->r_timer_override = 1; 11381 rack->r_early = 0; 11382 rack->r_ctl.rc_agg_early = 0; 11383 } else if (IN_FASTRECOVERY(tp->t_flags) && 11384 rsm && 11385 (rack->r_rr_config == 3)) { 11386 /* 11387 * Assure we can output and we get no 11388 * remembered pace time except the retransmit. 
11389 */ 11390 rack->r_timer_override = 1; 11391 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 11392 rack->r_ctl.rc_resend = rsm; 11393 } 11394 if (IN_FASTRECOVERY(tp->t_flags) && 11395 (rack->rack_no_prr == 0) && 11396 (entered_recovery == 0)) { 11397 rack_update_prr(tp, rack, changed, th_ack); 11398 if ((rsm && (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) && 11399 ((tcp_in_hpts(rack->rc_tp) == 0) && 11400 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)))) { 11401 /* 11402 * If you are pacing output you don't want 11403 * to override. 11404 */ 11405 rack->r_early = 0; 11406 rack->r_ctl.rc_agg_early = 0; 11407 rack->r_timer_override = 1; 11408 } 11409 } 11410 } 11411 11412 static void 11413 rack_strike_dupack(struct tcp_rack *rack) 11414 { 11415 struct rack_sendmap *rsm; 11416 11417 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 11418 while (rsm) { 11419 /* 11420 * We need to skip anything already set 11421 * to be retransmitted. 11422 */ 11423 if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || 11424 (rsm->r_flags & RACK_MUST_RXT)) { 11425 rsm = TAILQ_NEXT(rsm, r_tnext); 11426 continue; 11427 } 11428 break; 11429 } 11430 if (rsm && (rsm->r_dupack < 0xff)) { 11431 rsm->r_dupack++; 11432 if (rsm->r_dupack >= DUP_ACK_THRESHOLD) { 11433 struct timeval tv; 11434 uint32_t cts; 11435 /* 11436 * Here we see if we need to retransmit. For 11437 * a SACK type connection if enough time has passed 11438 * we will get a return of the rsm. For a non-sack 11439 * connection we will get the rsm returned if the 11440 * dupack value is 3 or more. 11441 */ 11442 cts = tcp_get_usecs(&tv); 11443 rack->r_ctl.rc_resend = tcp_rack_output(rack->rc_tp, rack, cts); 11444 if (rack->r_ctl.rc_resend != NULL) { 11445 if (!IN_FASTRECOVERY(rack->rc_tp->t_flags)) { 11446 rack_cong_signal(rack->rc_tp, CC_NDUPACK, 11447 rack->rc_tp->snd_una, __LINE__); 11448 } 11449 rack->r_wanted_output = 1; 11450 rack->r_timer_override = 1; 11451 rack_log_retran_reason(rack, rsm, __LINE__, 1, 3); 11452 } 11453 } else { 11454 rack_log_retran_reason(rack, rsm, __LINE__, 0, 3); 11455 } 11456 } 11457 } 11458 11459 static void 11460 rack_check_bottom_drag(struct tcpcb *tp, 11461 struct tcp_rack *rack, 11462 struct socket *so) 11463 { 11464 uint32_t segsiz, minseg; 11465 11466 segsiz = ctf_fixed_maxseg(tp); 11467 minseg = segsiz; 11468 if (tp->snd_max == tp->snd_una) { 11469 /* 11470 * We are doing dynamic pacing and we are way 11471 * under. Basically everything got acked while 11472 * we were still waiting on the pacer to expire. 11473 * 11474 * This means we need to boost the b/w in 11475 * addition to any earlier boosting of 11476 * the multiplier. 11477 */ 11478 uint64_t lt_bw; 11479 11480 lt_bw = rack_get_lt_bw(rack); 11481 rack->rc_dragged_bottom = 1; 11482 rack_validate_multipliers_at_or_above100(rack); 11483 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_VALID) && 11484 (lt_bw > 0)) { 11485 /* 11486 * Lets use the long-term b/w we have 11487 * been getting as a base. 11488 */ 11489 if (rack->rc_gp_filled == 0) { 11490 if (lt_bw > ONE_POINT_TWO_MEG) { 11491 /* 11492 * If we have no measurement 11493 * don't let us set in more than 11494 * 1.2Mbps. If we are still too 11495 * low after pacing with this we 11496 * will hopefully have a max b/w 11497 * available to sanity check things. 
11498 */
11499 lt_bw = ONE_POINT_TWO_MEG;
11500 }
11501 rack->r_ctl.rc_rtt_diff = 0;
11502 rack->r_ctl.gp_bw = lt_bw;
11503 rack->rc_gp_filled = 1;
11504 if (rack->r_ctl.num_measurements < RACK_REQ_AVG)
11505 rack->r_ctl.num_measurements = RACK_REQ_AVG;
11506 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
11507 } else if (lt_bw > rack->r_ctl.gp_bw) {
11508 rack->r_ctl.rc_rtt_diff = 0;
11509 if (rack->r_ctl.num_measurements < RACK_REQ_AVG)
11510 rack->r_ctl.num_measurements = RACK_REQ_AVG;
11511 rack->r_ctl.gp_bw = lt_bw;
11512 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
11513 } else
11514 rack_increase_bw_mul(rack, -1, 0, 0, 1);
11515 if ((rack->gp_ready == 0) &&
11516 (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) {
11517 /* We have enough measurements now */
11518 rack->gp_ready = 1;
11519 if ((rack->rc_always_pace && (rack->use_fixed_rate == 0)) ||
11520 rack->rack_hibeta)
11521 rack_set_cc_pacing(rack);
11522 if (rack->defer_options)
11523 rack_apply_deferred_options(rack);
11524 }
11525 } else {
11526 /*
11527 * Zero RTT possibly? Settle for just an old-style increase.
11528 */
11529 rack_increase_bw_mul(rack, -1, 0, 0, 1);
11530 }
11531 } else if ((IN_FASTRECOVERY(tp->t_flags) == 0) &&
11532 (sbavail(&so->so_snd) > max((segsiz * (4 + rack_req_segs)),
11533 minseg)) &&
11534 (rack->r_ctl.cwnd_to_use > max((segsiz * (rack_req_segs + 2)), minseg)) &&
11535 (tp->snd_wnd > max((segsiz * (rack_req_segs + 2)), minseg)) &&
11536 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) <=
11537 (segsiz * rack_req_segs))) {
11538 /*
11539 * We are doing dynamic GP pacing and only a small
11540 * amount (about 1 MSS) is still left outstanding,
11541 * yet there is more data that could be sent.
11542 * We are still pacing away. This means we are
11543 * inserting delayed-ack time into our measurements
11544 * because we are pacing too slowly.
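 * (Below we mark rc_dragged_bottom and nudge the b/w multipliers back up
 * so the pacer speeds up.)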
11545 */ 11546 rack_validate_multipliers_at_or_above100(rack); 11547 rack->rc_dragged_bottom = 1; 11548 rack_increase_bw_mul(rack, -1, 0, 0, 1); 11549 } 11550 } 11551 11552 #ifdef TCP_REQUEST_TRK 11553 static void 11554 rack_log_hybrid(struct tcp_rack *rack, uint32_t seq, 11555 struct tcp_sendfile_track *cur, uint8_t mod, int line, int err) 11556 { 11557 int do_log; 11558 11559 do_log = tcp_bblogging_on(rack->rc_tp); 11560 if (do_log == 0) { 11561 if ((do_log = tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING) )== 0) 11562 return; 11563 /* We only allow the three below with point logging on */ 11564 if ((mod != HYBRID_LOG_RULES_APP) && 11565 (mod != HYBRID_LOG_RULES_SET) && 11566 (mod != HYBRID_LOG_REQ_COMP)) 11567 return; 11568 11569 } 11570 if (do_log) { 11571 union tcp_log_stackspecific log; 11572 struct timeval tv; 11573 11574 /* Convert our ms to a microsecond */ 11575 memset(&log, 0, sizeof(log)); 11576 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 11577 log.u_bbr.flex1 = seq; 11578 log.u_bbr.cwnd_gain = line; 11579 if (cur != NULL) { 11580 uint64_t off; 11581 11582 log.u_bbr.flex2 = cur->start_seq; 11583 log.u_bbr.flex3 = cur->end_seq; 11584 log.u_bbr.flex4 = (uint32_t)((cur->localtime >> 32) & 0x00000000ffffffff); 11585 log.u_bbr.flex5 = (uint32_t)(cur->localtime & 0x00000000ffffffff); 11586 log.u_bbr.flex6 = cur->flags; 11587 log.u_bbr.pkts_out = cur->hybrid_flags; 11588 log.u_bbr.rttProp = cur->timestamp; 11589 log.u_bbr.cur_del_rate = cur->cspr; 11590 log.u_bbr.bw_inuse = cur->start; 11591 log.u_bbr.applimited = (uint32_t)(cur->end & 0x00000000ffffffff); 11592 log.u_bbr.delivered = (uint32_t)((cur->end >> 32) & 0x00000000ffffffff) ; 11593 log.u_bbr.epoch = (uint32_t)(cur->deadline & 0x00000000ffffffff); 11594 log.u_bbr.lt_epoch = (uint32_t)((cur->deadline >> 32) & 0x00000000ffffffff) ; 11595 log.u_bbr.bbr_state = 1; 11596 off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_tcpreq_info[0]); 11597 log.u_bbr.use_lt_bw = (uint8_t)(off / sizeof(struct tcp_sendfile_track)); 11598 } else { 11599 log.u_bbr.flex2 = err; 11600 } 11601 /* 11602 * Fill in flex7 to be CHD (catchup|hybrid|DGP) 11603 */ 11604 log.u_bbr.flex7 = rack->rc_catch_up; 11605 log.u_bbr.flex7 <<= 1; 11606 log.u_bbr.flex7 |= rack->rc_hybrid_mode; 11607 log.u_bbr.flex7 <<= 1; 11608 log.u_bbr.flex7 |= rack->dgp_on; 11609 log.u_bbr.flex8 = mod; 11610 log.u_bbr.delRate = rack->r_ctl.bw_rate_cap; 11611 log.u_bbr.bbr_substate = rack->r_ctl.client_suggested_maxseg; 11612 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 11613 log.u_bbr.pkt_epoch = rack->rc_tp->tcp_hybrid_start; 11614 log.u_bbr.lost = rack->rc_tp->tcp_hybrid_error; 11615 log.u_bbr.pacing_gain = (uint16_t)rack->rc_tp->tcp_hybrid_stop; 11616 tcp_log_event(rack->rc_tp, NULL, 11617 &rack->rc_inp->inp_socket->so_rcv, 11618 &rack->rc_inp->inp_socket->so_snd, 11619 TCP_HYBRID_PACING_LOG, 0, 11620 0, &log, false, NULL, __func__, __LINE__, &tv); 11621 } 11622 } 11623 #endif 11624 11625 #ifdef TCP_REQUEST_TRK 11626 static void 11627 rack_set_dgp_hybrid_mode(struct tcp_rack *rack, tcp_seq seq, uint32_t len) 11628 { 11629 struct tcp_sendfile_track *rc_cur; 11630 struct tcpcb *tp; 11631 int err = 0; 11632 11633 rc_cur = tcp_req_find_req_for_seq(rack->rc_tp, seq); 11634 if (rc_cur == NULL) { 11635 /* If not in the beginning what about the end piece */ 11636 rack_log_hybrid(rack, seq, NULL, HYBRID_LOG_NO_RANGE, __LINE__, err); 11637 rc_cur = tcp_req_find_req_for_seq(rack->rc_tp, (seq + len - 1)); 11638 } else { 11639 err = 12345; 11640 } 
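/*
 * A rough map of what follows: no entry found means plain DGP (clear
 * the rate cap, the suggested maxseg and the old entry); the same
 * entry as last time means nothing to do; hybrid mode off means we
 * only remember the entry; otherwise we apply the client's pacing cap
 * (compensated for line rate), its suggested maxseg and, when asked
 * for, a catch-up deadline of request-arrival time plus
 * bytes * HPTS_USEC_IN_SEC / cspr. For example, a hypothetical
 * 1,000,000 byte range at a cspr of 1,250,000 bytes/sec gets an
 * 800,000 usec budget.
 */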
11641 /* If we find no parameters we are in straight DGP mode */ 11642 if(rc_cur == NULL) { 11643 /* None found for this seq, just DGP for now */ 11644 rack->r_ctl.client_suggested_maxseg = 0; 11645 rack->rc_catch_up = 0; 11646 rack->r_ctl.bw_rate_cap = 0; 11647 rack_log_hybrid(rack, (seq + len - 1), NULL, HYBRID_LOG_NO_RANGE, __LINE__, err); 11648 if (rack->r_ctl.rc_last_sft) { 11649 rack->r_ctl.rc_last_sft = NULL; 11650 } 11651 return; 11652 } 11653 /* 11654 * Ok if we have a new entry *or* have never 11655 * set up an entry we need to proceed. If 11656 * we have already set it up this entry we 11657 * just continue along with what we already 11658 * setup. 11659 */ 11660 tp = rack->rc_tp; 11661 if ((rack->r_ctl.rc_last_sft != NULL) && 11662 (rack->r_ctl.rc_last_sft == rc_cur)) { 11663 /* Its already in place */ 11664 rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_ISSAME, __LINE__, 0); 11665 return; 11666 } 11667 if (rack->rc_hybrid_mode == 0) { 11668 rack->r_ctl.rc_last_sft = rc_cur; 11669 rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_RULES_APP, __LINE__, 0); 11670 return; 11671 } 11672 if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_CSPR) && rc_cur->cspr){ 11673 /* Compensate for all the header overhead's */ 11674 rack->r_ctl.bw_rate_cap = rack_compensate_for_linerate(rack, rc_cur->cspr); 11675 } else 11676 rack->r_ctl.bw_rate_cap = 0; 11677 if (rc_cur->hybrid_flags & TCP_HYBRID_PACING_H_MS) 11678 rack->r_ctl.client_suggested_maxseg = rc_cur->hint_maxseg; 11679 else 11680 rack->r_ctl.client_suggested_maxseg = 0; 11681 if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_CU) && 11682 (rc_cur->cspr > 0)) { 11683 uint64_t len; 11684 11685 rack->rc_catch_up = 1; 11686 /* 11687 * Calculate the deadline time, first set the 11688 * time to when the request arrived. 11689 */ 11690 rc_cur->deadline = rc_cur->localtime; 11691 /* 11692 * Next calculate the length and compensate for 11693 * TLS if need be. 11694 */ 11695 len = rc_cur->end - rc_cur->start; 11696 if (tp->t_inpcb.inp_socket->so_snd.sb_tls_info) { 11697 /* 11698 * This session is doing TLS. Take a swag guess 11699 * at the overhead. 11700 */ 11701 len += tcp_estimate_tls_overhead(tp->t_inpcb.inp_socket, len); 11702 } 11703 /* 11704 * Now considering the size, and the cspr, what is the time that 11705 * would be required at the cspr rate. Here we use the raw 11706 * cspr value since the client only looks at the raw data. We 11707 * do use len which includes TLS overhead, but not the TCP/IP etc. 11708 * That will get made up for in the CU pacing rate set. 11709 */ 11710 len *= HPTS_USEC_IN_SEC; 11711 len /= rc_cur->cspr; 11712 rc_cur->deadline += len; 11713 } else { 11714 rack->rc_catch_up = 0; 11715 rc_cur->deadline = 0; 11716 } 11717 if (rack->r_ctl.client_suggested_maxseg != 0) { 11718 /* 11719 * We need to reset the max pace segs if we have a 11720 * client_suggested_maxseg. 11721 */ 11722 rack_set_pace_segments(tp, rack, __LINE__, NULL); 11723 } 11724 rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_RULES_APP, __LINE__, 0); 11725 /* Remember it for next time and for CU mode */ 11726 rack->r_ctl.rc_last_sft = rc_cur; 11727 } 11728 #endif 11729 11730 static void 11731 rack_chk_req_and_hybrid_on_out(struct tcp_rack *rack, tcp_seq seq, uint32_t len, uint64_t cts) 11732 { 11733 #ifdef TCP_REQUEST_TRK 11734 struct tcp_sendfile_track *ent; 11735 11736 ent = rack->r_ctl.rc_last_sft; 11737 if ((ent == NULL) || 11738 (ent->flags == TCP_TRK_TRACK_FLG_EMPTY) || 11739 (SEQ_GEQ(seq, ent->end_seq))) { 11740 /* Time to update the track. 
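 * (either we have never looked an entry up, the entry is empty, or
 * this send has reached the end_seq of the entry we were tracking, so
 * refresh the hybrid pacing state for this sequence).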
*/ 11741 rack_set_dgp_hybrid_mode(rack, seq, len); 11742 ent = rack->r_ctl.rc_last_sft; 11743 } 11744 /* Out of all */ 11745 if (ent == NULL) { 11746 return; 11747 } 11748 if (SEQ_LT(ent->end_seq, (seq + len))) { 11749 /* 11750 * This is the case where our end_seq guess 11751 * was wrong. This is usually due to TLS having 11752 * more bytes then our guess. It could also be the 11753 * case that the client sent in two requests closely 11754 * and the SB is full of both so we are sending part 11755 * of each (end|beg). In such a case lets move this 11756 * guys end to match the end of this send. That 11757 * way it will complete when all of it is acked. 11758 */ 11759 ent->end_seq = (seq + len); 11760 rack_log_hybrid_bw(rack, seq, len, 0, 0, HYBRID_LOG_EXTEND, 0, ent); 11761 } 11762 /* Now validate we have set the send time of this one */ 11763 if ((ent->flags & TCP_TRK_TRACK_FLG_FSND) == 0) { 11764 ent->flags |= TCP_TRK_TRACK_FLG_FSND; 11765 ent->first_send = cts; 11766 ent->sent_at_fs = rack->rc_tp->t_sndbytes; 11767 ent->rxt_at_fs = rack->rc_tp->t_snd_rxt_bytes; 11768 } 11769 #endif 11770 } 11771 11772 static void 11773 rack_gain_for_fastoutput(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t acked_amount) 11774 { 11775 /* 11776 * The fast output path is enabled and we 11777 * have moved the cumack forward. Lets see if 11778 * we can expand forward the fast path length by 11779 * that amount. What we would ideally like to 11780 * do is increase the number of bytes in the 11781 * fast path block (left_to_send) by the 11782 * acked amount. However we have to gate that 11783 * by two factors: 11784 * 1) The amount outstanding and the rwnd of the peer 11785 * (i.e. we don't want to exceed the rwnd of the peer). 11786 * <and> 11787 * 2) The amount of data left in the socket buffer (i.e. 11788 * we can't send beyond what is in the buffer). 11789 * 11790 * Note that this does not take into account any increase 11791 * in the cwnd. We will only extend the fast path by 11792 * what was acked. 11793 */ 11794 uint32_t new_total, gating_val; 11795 11796 new_total = acked_amount + rack->r_ctl.fsb.left_to_send; 11797 gating_val = min((sbavail(&so->so_snd) - (tp->snd_max - tp->snd_una)), 11798 (tp->snd_wnd - (tp->snd_max - tp->snd_una))); 11799 if (new_total <= gating_val) { 11800 /* We can increase left_to_send by the acked amount */ 11801 counter_u64_add(rack_extended_rfo, 1); 11802 rack->r_ctl.fsb.left_to_send = new_total; 11803 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(&rack->rc_inp->inp_socket->so_snd) - (tp->snd_max - tp->snd_una))), 11804 ("rack:%p left_to_send:%u sbavail:%u out:%u", 11805 rack, rack->r_ctl.fsb.left_to_send, 11806 sbavail(&rack->rc_inp->inp_socket->so_snd), 11807 (tp->snd_max - tp->snd_una))); 11808 11809 } 11810 } 11811 11812 static void 11813 rack_adjust_sendmap_head(struct tcp_rack *rack, struct sockbuf *sb) 11814 { 11815 /* 11816 * Here any sendmap entry that points to the 11817 * beginning mbuf must be adjusted to the correct 11818 * offset. This must be called with: 11819 * 1) The socket buffer locked 11820 * 2) snd_una adjusted to its new position. 11821 * 11822 * Note that (2) implies rack_ack_received has also 11823 * been called and all the sbcut's have been done. 11824 * 11825 * We grab the first mbuf in the socket buffer and 11826 * then go through the front of the sendmap, recalculating 11827 * the stored offset for any sendmap entry that has 11828 * that mbuf. 
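 * For example, if the cum-ack consumed part of the head mbuf, every
 * rsm whose data begins in that mbuf now starts at a smaller offset
 * within it, so soff and the cached orig_m_len/trailing space must be
 * recomputed.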
We must use the sb functions to do this 11829 * since its possible an add was done has well as 11830 * the subtraction we may have just completed. This should 11831 * not be a penalty though, since we just referenced the sb 11832 * to go in and trim off the mbufs that we freed (of course 11833 * there will be a penalty for the sendmap references though). 11834 * 11835 * Note also with INVARIANT on, we validate with a KASSERT 11836 * that the first sendmap entry has a soff of 0. 11837 * 11838 */ 11839 struct mbuf *m; 11840 struct rack_sendmap *rsm; 11841 tcp_seq snd_una; 11842 #ifdef INVARIANTS 11843 int first_processed = 0; 11844 #endif 11845 11846 snd_una = rack->rc_tp->snd_una; 11847 SOCKBUF_LOCK_ASSERT(sb); 11848 m = sb->sb_mb; 11849 rsm = tqhash_min(rack->r_ctl.tqh); 11850 if ((rsm == NULL) || (m == NULL)) { 11851 /* Nothing outstanding */ 11852 return; 11853 } 11854 /* The very first RSM's mbuf must point to the head mbuf in the sb */ 11855 KASSERT((rsm->m == m), 11856 ("Rack:%p sb:%p rsm:%p -- first rsm mbuf not aligned to sb", 11857 rack, sb, rsm)); 11858 while (rsm->m && (rsm->m == m)) { 11859 /* one to adjust */ 11860 #ifdef INVARIANTS 11861 struct mbuf *tm; 11862 uint32_t soff; 11863 11864 tm = sbsndmbuf(sb, (rsm->r_start - snd_una), &soff); 11865 if ((rsm->orig_m_len != m->m_len) || 11866 (rsm->orig_t_space != M_TRAILINGROOM(m))){ 11867 rack_adjust_orig_mlen(rsm); 11868 } 11869 if (first_processed == 0) { 11870 KASSERT((rsm->soff == 0), 11871 ("Rack:%p rsm:%p -- rsm at head but soff not zero", 11872 rack, rsm)); 11873 first_processed = 1; 11874 } 11875 if ((rsm->soff != soff) || (rsm->m != tm)) { 11876 /* 11877 * This is not a fatal error, we anticipate it 11878 * might happen (the else code), so we count it here 11879 * so that under invariant we can see that it really 11880 * does happen. 11881 */ 11882 counter_u64_add(rack_adjust_map_bw, 1); 11883 } 11884 rsm->m = tm; 11885 rsm->soff = soff; 11886 if (tm) { 11887 rsm->orig_m_len = rsm->m->m_len; 11888 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 11889 } else { 11890 rsm->orig_m_len = 0; 11891 rsm->orig_t_space = 0; 11892 } 11893 #else 11894 rsm->m = sbsndmbuf(sb, (rsm->r_start - snd_una), &rsm->soff); 11895 if (rsm->m) { 11896 rsm->orig_m_len = rsm->m->m_len; 11897 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 11898 } else { 11899 rsm->orig_m_len = 0; 11900 rsm->orig_t_space = 0; 11901 } 11902 #endif 11903 rsm = tqhash_next(rack->r_ctl.tqh, rsm); 11904 if (rsm == NULL) 11905 break; 11906 } 11907 } 11908 11909 #ifdef TCP_REQUEST_TRK 11910 static inline void 11911 rack_req_check_for_comp(struct tcp_rack *rack, tcp_seq th_ack) 11912 { 11913 struct tcp_sendfile_track *ent; 11914 int i; 11915 11916 if ((rack->rc_hybrid_mode == 0) && 11917 (tcp_bblogging_point_on(rack->rc_tp, TCP_BBPOINT_REQ_LEVEL_LOGGING) == 0)) { 11918 /* 11919 * Just do normal completions hybrid pacing is not on 11920 * and CLDL is off as well. 11921 */ 11922 tcp_req_check_for_comp(rack->rc_tp, th_ack); 11923 return; 11924 } 11925 /* 11926 * Originally I was just going to find the th_ack associated 11927 * with an entry. But then I realized a large strech ack could 11928 * in theory ack two or more requests at once. So instead we 11929 * need to find all entries that are completed by th_ack not 11930 * just a single entry and do our logging. 
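 * The loop below logs and frees every such entry; the per-request
 * goodput it reports is data * HPTS_USEC_IN_SEC divided by the time
 * from first send to ack arrival, e.g. 2,000,000 bytes completed
 * 250,000 usec after the first send logs a cbw of 8,000,000 bytes/sec.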
11931 */ 11932 ent = tcp_req_find_a_req_that_is_completed_by(rack->rc_tp, th_ack, &i); 11933 while (ent != NULL) { 11934 /* 11935 * We may be doing hybrid pacing or CLDL and need more details possibly 11936 * so we do it manually instead of calling 11937 * tcp_req_check_for_comp() 11938 */ 11939 uint64_t laa, tim, data, cbw, ftim; 11940 11941 /* Ok this ack frees it */ 11942 rack_log_hybrid(rack, th_ack, 11943 ent, HYBRID_LOG_REQ_COMP, __LINE__, 0); 11944 /* calculate the time based on the ack arrival */ 11945 data = ent->end - ent->start; 11946 laa = tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time); 11947 if (ent->flags & TCP_TRK_TRACK_FLG_FSND) { 11948 if (ent->first_send > ent->localtime) 11949 ftim = ent->first_send; 11950 else 11951 ftim = ent->localtime; 11952 } else { 11953 /* TSNH */ 11954 ftim = ent->localtime; 11955 } 11956 if (laa > ent->localtime) 11957 tim = laa - ftim; 11958 else 11959 tim = 0; 11960 cbw = data * HPTS_USEC_IN_SEC; 11961 if (tim > 0) 11962 cbw /= tim; 11963 else 11964 cbw = 0; 11965 rack_log_hybrid_bw(rack, th_ack, cbw, tim, data, HYBRID_LOG_BW_MEASURE, 0, ent); 11966 /* 11967 * Check to see if we are freeing what we are pointing to send wise 11968 * if so be sure to NULL the pointer so we know we are no longer 11969 * set to anything. 11970 */ 11971 if (ent == rack->r_ctl.rc_last_sft) 11972 rack->r_ctl.rc_last_sft = NULL; 11973 /* Generate the log that the tcp_netflix call would have */ 11974 tcp_req_log_req_info(rack->rc_tp, ent, 11975 i, TCP_TRK_REQ_LOG_FREED, 0, 0); 11976 /* Free it and see if there is another one */ 11977 tcp_req_free_a_slot(rack->rc_tp, ent); 11978 ent = tcp_req_find_a_req_that_is_completed_by(rack->rc_tp, th_ack, &i); 11979 } 11980 } 11981 #endif 11982 11983 11984 /* 11985 * Return value of 1, we do not need to call rack_process_data(). 11986 * return value of 0, rack_process_data can be called. 11987 * For ret_val if its 0 the TCP is locked, if its non-zero 11988 * its unlocked and probably unsafe to touch the TCB. 
11989 */ 11990 static int 11991 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, 11992 struct tcpcb *tp, struct tcpopt *to, 11993 uint32_t tiwin, int32_t tlen, 11994 int32_t * ofia, int32_t thflags, int32_t *ret_val) 11995 { 11996 int32_t ourfinisacked = 0; 11997 int32_t nsegs, acked_amount; 11998 int32_t acked; 11999 struct mbuf *mfree; 12000 struct tcp_rack *rack; 12001 int32_t under_pacing = 0; 12002 int32_t recovery = 0; 12003 12004 INP_WLOCK_ASSERT(tptoinpcb(tp)); 12005 12006 rack = (struct tcp_rack *)tp->t_fb_ptr; 12007 if (SEQ_GT(th->th_ack, tp->snd_max)) { 12008 __ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val, 12009 &rack->r_ctl.challenge_ack_ts, 12010 &rack->r_ctl.challenge_ack_cnt); 12011 rack->r_wanted_output = 1; 12012 return (1); 12013 } 12014 if (rack->gp_ready && 12015 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 12016 under_pacing = 1; 12017 } 12018 if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { 12019 int in_rec, dup_ack_struck = 0; 12020 int dsack_seen = 0, sacks_seen = 0; 12021 12022 in_rec = IN_FASTRECOVERY(tp->t_flags); 12023 if (rack->rc_in_persist) { 12024 tp->t_rxtshift = 0; 12025 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 12026 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 12027 } 12028 12029 if ((th->th_ack == tp->snd_una) && 12030 (tiwin == tp->snd_wnd) && 12031 ((to->to_flags & TOF_SACK) == 0)) { 12032 rack_strike_dupack(rack); 12033 dup_ack_struck = 1; 12034 } 12035 rack_log_ack(tp, to, th, ((in_rec == 0) && IN_FASTRECOVERY(tp->t_flags)), 12036 dup_ack_struck, &dsack_seen, &sacks_seen); 12037 if ((rack->sack_attack_disable > 0) && 12038 (th->th_ack == tp->snd_una) && 12039 (tiwin == tp->snd_wnd) && 12040 (dsack_seen == 0) && 12041 (sacks_seen > 0)) { 12042 /* 12043 * If sacks have been disabled we may 12044 * want to strike a dup-ack "ignoring" the 12045 * sack as long as the sack was not a "dsack". Note 12046 * that if no sack is sent (TOF_SACK is off) then the 12047 * normal dsack code above rack_log_ack() would have 12048 * already struck. So this is just to catch the case 12049 * were we are ignoring sacks from this guy due to 12050 * it being a suspected attacker. 12051 */ 12052 rack_strike_dupack(rack); 12053 } 12054 12055 } 12056 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 12057 /* 12058 * Old ack, behind (or duplicate to) the last one rcv'd 12059 * Note: We mark reordering is occuring if its 12060 * less than and we have not closed our window. 12061 */ 12062 if (SEQ_LT(th->th_ack, tp->snd_una) && (sbspace(&so->so_rcv) > ctf_fixed_maxseg(tp))) { 12063 rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 12064 if (rack->r_ctl.rc_reorder_ts == 0) 12065 rack->r_ctl.rc_reorder_ts = 1; 12066 } 12067 return (0); 12068 } 12069 /* 12070 * If we reach this point, ACK is not a duplicate, i.e., it ACKs 12071 * something we sent. 12072 */ 12073 if (tp->t_flags & TF_NEEDSYN) { 12074 /* 12075 * T/TCP: Connection was half-synchronized, and our SYN has 12076 * been ACK'd (so connection is now fully synchronized). Go 12077 * to non-starred state, increment snd_una for ACK of SYN, 12078 * and check if we can do window scaling. 12079 */ 12080 tp->t_flags &= ~TF_NEEDSYN; 12081 tp->snd_una++; 12082 /* Do window scaling? */ 12083 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 12084 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 12085 tp->rcv_scale = tp->request_r_scale; 12086 /* Send window already scaled. 
*/ 12087 } 12088 } 12089 nsegs = max(1, m->m_pkthdr.lro_nsegs); 12090 12091 acked = BYTES_THIS_ACK(tp, th); 12092 if (acked) { 12093 /* 12094 * Any time we move the cum-ack forward clear 12095 * keep-alive tied probe-not-answered. The 12096 * persists clears its own on entry. 12097 */ 12098 rack->probe_not_answered = 0; 12099 } 12100 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 12101 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 12102 /* 12103 * If we just performed our first retransmit, and the ACK arrives 12104 * within our recovery window, then it was a mistake to do the 12105 * retransmit in the first place. Recover our original cwnd and 12106 * ssthresh, and proceed to transmit where we left off. 12107 */ 12108 if ((tp->t_flags & TF_PREVVALID) && 12109 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) { 12110 tp->t_flags &= ~TF_PREVVALID; 12111 if (tp->t_rxtshift == 1 && 12112 (int)(ticks - tp->t_badrxtwin) < 0) 12113 rack_cong_signal(tp, CC_RTO_ERR, th->th_ack, __LINE__); 12114 } 12115 if (acked) { 12116 /* assure we are not backed off */ 12117 tp->t_rxtshift = 0; 12118 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 12119 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 12120 rack->rc_tlp_in_progress = 0; 12121 rack->r_ctl.rc_tlp_cnt_out = 0; 12122 /* 12123 * If it is the RXT timer we want to 12124 * stop it, so we can restart a TLP. 12125 */ 12126 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 12127 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 12128 #ifdef TCP_REQUEST_TRK 12129 rack_req_check_for_comp(rack, th->th_ack); 12130 #endif 12131 } 12132 /* 12133 * If we have a timestamp reply, update smoothed round trip time. If 12134 * no timestamp is present but transmit timer is running and timed 12135 * sequence number was acked, update smoothed round trip time. Since 12136 * we now have an rtt measurement, cancel the timer backoff (cf., 12137 * Phil Karn's retransmit alg.). Recompute the initial retransmit 12138 * timer. 12139 * 12140 * Some boxes send broken timestamp replies during the SYN+ACK 12141 * phase, ignore timestamps of 0 or we could calculate a huge RTT 12142 * and blow up the retransmit timer. 12143 */ 12144 /* 12145 * If all outstanding data is acked, stop retransmit timer and 12146 * remember to restart (more output or persist). If there is more 12147 * data to be acked, restart retransmit timer, using current 12148 * (possibly backed-off) value. 12149 */ 12150 if (acked == 0) { 12151 if (ofia) 12152 *ofia = ourfinisacked; 12153 return (0); 12154 } 12155 if (IN_RECOVERY(tp->t_flags)) { 12156 if (SEQ_LT(th->th_ack, tp->snd_recover) && 12157 (SEQ_LT(th->th_ack, tp->snd_max))) { 12158 tcp_rack_partialack(tp); 12159 } else { 12160 rack_post_recovery(tp, th->th_ack); 12161 recovery = 1; 12162 } 12163 } 12164 /* 12165 * Let the congestion control algorithm update congestion control 12166 * related information. This typically means increasing the 12167 * congestion window. 12168 */ 12169 rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, recovery); 12170 SOCKBUF_LOCK(&so->so_snd); 12171 acked_amount = min(acked, (int)sbavail(&so->so_snd)); 12172 tp->snd_wnd -= acked_amount; 12173 mfree = sbcut_locked(&so->so_snd, acked_amount); 12174 if ((sbused(&so->so_snd) == 0) && 12175 (acked > acked_amount) && 12176 (tp->t_state >= TCPS_FIN_WAIT_1) && 12177 (tp->t_flags & TF_SENTFIN)) { 12178 /* 12179 * We must be sure our fin 12180 * was sent and acked (we can be 12181 * in FIN_WAIT_1 without having 12182 * sent the fin). 
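 * That happens, for example, when close() is called with data still
 * queued: the state moves to FIN_WAIT_1 right away but TF_SENTFIN is
 * only set once the FIN itself has actually been transmitted.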
12183 */ 12184 ourfinisacked = 1; 12185 } 12186 tp->snd_una = th->th_ack; 12187 /* wakeups? */ 12188 if (acked_amount && sbavail(&so->so_snd)) 12189 rack_adjust_sendmap_head(rack, &so->so_snd); 12190 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); 12191 /* NB: sowwakeup_locked() does an implicit unlock. */ 12192 sowwakeup_locked(so); 12193 /* now check the rxt clamps */ 12194 if ((recovery == 1) && 12195 (rack->excess_rxt_on) && 12196 (rack->r_cwnd_was_clamped == 0)) { 12197 do_rack_excess_rxt(tp, rack); 12198 } else if (rack->r_cwnd_was_clamped) 12199 do_rack_check_for_unclamp(tp, rack); 12200 m_freem(mfree); 12201 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 12202 tp->snd_recover = tp->snd_una; 12203 12204 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) { 12205 tp->snd_nxt = tp->snd_una; 12206 } 12207 if (under_pacing && 12208 (rack->use_fixed_rate == 0) && 12209 (rack->in_probe_rtt == 0) && 12210 rack->rc_gp_dyn_mul && 12211 rack->rc_always_pace) { 12212 /* Check if we are dragging bottom */ 12213 rack_check_bottom_drag(tp, rack, so); 12214 } 12215 if (tp->snd_una == tp->snd_max) { 12216 /* Nothing left outstanding */ 12217 tp->t_flags &= ~TF_PREVVALID; 12218 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 12219 rack->r_ctl.retran_during_recovery = 0; 12220 rack->r_ctl.dsack_byte_cnt = 0; 12221 if (rack->r_ctl.rc_went_idle_time == 0) 12222 rack->r_ctl.rc_went_idle_time = 1; 12223 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 12224 if (sbavail(&tptosocket(tp)->so_snd) == 0) 12225 tp->t_acktime = 0; 12226 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 12227 rack->rc_suspicious = 0; 12228 /* Set need output so persist might get set */ 12229 rack->r_wanted_output = 1; 12230 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 12231 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 12232 (sbavail(&so->so_snd) == 0) && 12233 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 12234 /* 12235 * The socket was gone and the 12236 * peer sent data (now or in the past), time to 12237 * reset him. 12238 */ 12239 *ret_val = 1; 12240 /* tcp_close will kill the inp pre-log the Reset */ 12241 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 12242 tp = tcp_close(tp); 12243 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); 12244 return (1); 12245 } 12246 } 12247 if (ofia) 12248 *ofia = ourfinisacked; 12249 return (0); 12250 } 12251 12252 12253 static void 12254 rack_log_collapse(struct tcp_rack *rack, uint32_t cnt, uint32_t split, uint32_t out, int line, 12255 int dir, uint32_t flags, struct rack_sendmap *rsm) 12256 { 12257 if (tcp_bblogging_on(rack->rc_tp)) { 12258 union tcp_log_stackspecific log; 12259 struct timeval tv; 12260 12261 memset(&log, 0, sizeof(log)); 12262 log.u_bbr.flex1 = cnt; 12263 log.u_bbr.flex2 = split; 12264 log.u_bbr.flex3 = out; 12265 log.u_bbr.flex4 = line; 12266 log.u_bbr.flex5 = rack->r_must_retran; 12267 log.u_bbr.flex6 = flags; 12268 log.u_bbr.flex7 = rack->rc_has_collapsed; 12269 log.u_bbr.flex8 = dir; /* 12270 * 1 is collapsed, 0 is uncollapsed, 12271 * 2 is log of a rsm being marked, 3 is a split. 
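 * (the un-collapse path below also logs with a dir of 4 as it flags
 * each rsm RACK_RWND_COLLAPSED).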
12272 */ 12273 if (rsm == NULL) 12274 log.u_bbr.rttProp = 0; 12275 else 12276 log.u_bbr.rttProp = (uint64_t)rsm; 12277 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 12278 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 12279 TCP_LOG_EVENTP(rack->rc_tp, NULL, 12280 &rack->rc_inp->inp_socket->so_rcv, 12281 &rack->rc_inp->inp_socket->so_snd, 12282 TCP_RACK_LOG_COLLAPSE, 0, 12283 0, &log, false, &tv); 12284 } 12285 } 12286 12287 static void 12288 rack_collapsed_window(struct tcp_rack *rack, uint32_t out, tcp_seq th_ack, int line) 12289 { 12290 /* 12291 * Here all we do is mark the collapsed point and set the flag. 12292 * This may happen again and again, but there is no 12293 * sense splitting our map until we know where the 12294 * peer finally lands in the collapse. 12295 */ 12296 tcp_trace_point(rack->rc_tp, TCP_TP_COLLAPSED_WND); 12297 if ((rack->rc_has_collapsed == 0) || 12298 (rack->r_ctl.last_collapse_point != (th_ack + rack->rc_tp->snd_wnd))) 12299 counter_u64_add(rack_collapsed_win_seen, 1); 12300 rack->r_ctl.last_collapse_point = th_ack + rack->rc_tp->snd_wnd; 12301 rack->r_ctl.high_collapse_point = rack->rc_tp->snd_max; 12302 rack->rc_has_collapsed = 1; 12303 rack->r_collapse_point_valid = 1; 12304 rack_log_collapse(rack, 0, th_ack, rack->r_ctl.last_collapse_point, line, 1, 0, NULL); 12305 } 12306 12307 static void 12308 rack_un_collapse_window(struct tcp_rack *rack, int line) 12309 { 12310 struct rack_sendmap *nrsm, *rsm; 12311 int cnt = 0, split = 0; 12312 int insret __diagused; 12313 12314 12315 tcp_trace_point(rack->rc_tp, TCP_TP_COLLAPSED_WND); 12316 rack->rc_has_collapsed = 0; 12317 rsm = tqhash_find(rack->r_ctl.tqh, rack->r_ctl.last_collapse_point); 12318 if (rsm == NULL) { 12319 /* Nothing to do maybe the peer ack'ed it all */ 12320 rack_log_collapse(rack, 0, 0, ctf_outstanding(rack->rc_tp), line, 0, 0, NULL); 12321 return; 12322 } 12323 /* Now do we need to split this one? */ 12324 if (SEQ_GT(rack->r_ctl.last_collapse_point, rsm->r_start)) { 12325 rack_log_collapse(rack, rsm->r_start, rsm->r_end, 12326 rack->r_ctl.last_collapse_point, line, 3, rsm->r_flags, rsm); 12327 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 12328 if (nrsm == NULL) { 12329 /* We can't get a rsm, mark all? 
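 * If the split allocation fails we fall through and mark from this
 * rsm onward without splitting; at worst a few bytes in front of the
 * collapse point get flagged as collapsed as well.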
*/ 12330 nrsm = rsm; 12331 goto no_split; 12332 } 12333 /* Clone it */ 12334 split = 1; 12335 rack_clone_rsm(rack, nrsm, rsm, rack->r_ctl.last_collapse_point); 12336 #ifndef INVARIANTS 12337 (void)tqhash_insert(rack->r_ctl.tqh, nrsm); 12338 #else 12339 if ((insret = tqhash_insert(rack->r_ctl.tqh, nrsm)) != 0) { 12340 panic("Insert in rb tree of %p fails ret:%d rack:%p rsm:%p", 12341 nrsm, insret, rack, rsm); 12342 } 12343 #endif 12344 rack_log_map_chg(rack->rc_tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 12345 rack->r_ctl.last_collapse_point, __LINE__); 12346 if (rsm->r_in_tmap) { 12347 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 12348 nrsm->r_in_tmap = 1; 12349 } 12350 /* 12351 * Set in the new RSM as the 12352 * collapsed starting point 12353 */ 12354 rsm = nrsm; 12355 } 12356 12357 no_split: 12358 TQHASH_FOREACH_FROM(nrsm, rack->r_ctl.tqh, rsm) { 12359 cnt++; 12360 nrsm->r_flags |= RACK_RWND_COLLAPSED; 12361 rack_log_collapse(rack, nrsm->r_start, nrsm->r_end, 0, line, 4, nrsm->r_flags, nrsm); 12362 cnt++; 12363 } 12364 if (cnt) { 12365 counter_u64_add(rack_collapsed_win, 1); 12366 } 12367 rack_log_collapse(rack, cnt, split, ctf_outstanding(rack->rc_tp), line, 0, 0, NULL); 12368 } 12369 12370 static void 12371 rack_handle_delayed_ack(struct tcpcb *tp, struct tcp_rack *rack, 12372 int32_t tlen, int32_t tfo_syn) 12373 { 12374 if (DELAY_ACK(tp, tlen) || tfo_syn) { 12375 rack_timer_cancel(tp, rack, 12376 rack->r_ctl.rc_rcvtime, __LINE__); 12377 tp->t_flags |= TF_DELACK; 12378 } else { 12379 rack->r_wanted_output = 1; 12380 tp->t_flags |= TF_ACKNOW; 12381 } 12382 } 12383 12384 static void 12385 rack_validate_fo_sendwin_up(struct tcpcb *tp, struct tcp_rack *rack) 12386 { 12387 /* 12388 * If fast output is in progress, lets validate that 12389 * the new window did not shrink on us and make it 12390 * so fast output should end. 12391 */ 12392 if (rack->r_fast_output) { 12393 uint32_t out; 12394 12395 /* 12396 * Calculate what we will send if left as is 12397 * and compare that to our send window. 12398 */ 12399 out = ctf_outstanding(tp); 12400 if ((out + rack->r_ctl.fsb.left_to_send) > tp->snd_wnd) { 12401 /* ok we have an issue */ 12402 if (out >= tp->snd_wnd) { 12403 /* Turn off fast output the window is met or collapsed */ 12404 rack->r_fast_output = 0; 12405 } else { 12406 /* we have some room left */ 12407 rack->r_ctl.fsb.left_to_send = tp->snd_wnd - out; 12408 if (rack->r_ctl.fsb.left_to_send < ctf_fixed_maxseg(tp)) { 12409 /* If not at least 1 full segment never mind */ 12410 rack->r_fast_output = 0; 12411 } 12412 } 12413 } 12414 } 12415 } 12416 12417 12418 /* 12419 * Return value of 1, the TCB is unlocked and most 12420 * likely gone, return value of 0, the TCP is still 12421 * locked. 12422 */ 12423 static int 12424 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, 12425 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 12426 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 12427 { 12428 /* 12429 * Update window information. Don't look at window if no ACK: TAC's 12430 * send garbage on first SYN. 
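 * The test below is the classic window-update rule: take the
 * advertised window only from a newer segment (snd_wl1 < seq), from
 * the same segment with a newer ack (snd_wl2 < ack), or from the same
 * seq/ack pair that advertises a larger window.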
12431 */ 12432 int32_t nsegs; 12433 int32_t tfo_syn; 12434 struct tcp_rack *rack; 12435 12436 INP_WLOCK_ASSERT(tptoinpcb(tp)); 12437 12438 rack = (struct tcp_rack *)tp->t_fb_ptr; 12439 nsegs = max(1, m->m_pkthdr.lro_nsegs); 12440 if ((thflags & TH_ACK) && 12441 (SEQ_LT(tp->snd_wl1, th->th_seq) || 12442 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 12443 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 12444 /* keep track of pure window updates */ 12445 if (tlen == 0 && 12446 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 12447 KMOD_TCPSTAT_INC(tcps_rcvwinupd); 12448 tp->snd_wnd = tiwin; 12449 rack_validate_fo_sendwin_up(tp, rack); 12450 tp->snd_wl1 = th->th_seq; 12451 tp->snd_wl2 = th->th_ack; 12452 if (tp->snd_wnd > tp->max_sndwnd) 12453 tp->max_sndwnd = tp->snd_wnd; 12454 rack->r_wanted_output = 1; 12455 } else if (thflags & TH_ACK) { 12456 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { 12457 tp->snd_wnd = tiwin; 12458 rack_validate_fo_sendwin_up(tp, rack); 12459 tp->snd_wl1 = th->th_seq; 12460 tp->snd_wl2 = th->th_ack; 12461 } 12462 } 12463 if (tp->snd_wnd < ctf_outstanding(tp)) 12464 /* The peer collapsed the window */ 12465 rack_collapsed_window(rack, ctf_outstanding(tp), th->th_ack, __LINE__); 12466 else if (rack->rc_has_collapsed) 12467 rack_un_collapse_window(rack, __LINE__); 12468 if ((rack->r_collapse_point_valid) && 12469 (SEQ_GT(th->th_ack, rack->r_ctl.high_collapse_point))) 12470 rack->r_collapse_point_valid = 0; 12471 /* Was persist timer active and now we have window space? */ 12472 if ((rack->rc_in_persist != 0) && 12473 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 12474 rack->r_ctl.rc_pace_min_segs))) { 12475 rack_exit_persist(tp, rack, rack->r_ctl.rc_rcvtime); 12476 tp->snd_nxt = tp->snd_max; 12477 /* Make sure we output to start the timer */ 12478 rack->r_wanted_output = 1; 12479 } 12480 /* Do we enter persists? */ 12481 if ((rack->rc_in_persist == 0) && 12482 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 12483 TCPS_HAVEESTABLISHED(tp->t_state) && 12484 ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) && 12485 sbavail(&tptosocket(tp)->so_snd) && 12486 (sbavail(&tptosocket(tp)->so_snd) > tp->snd_wnd)) { 12487 /* 12488 * Here the rwnd is less than 12489 * the pacing size, we are established, 12490 * nothing is outstanding, and there is 12491 * data to send. Enter persists. 12492 */ 12493 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, tp->snd_una); 12494 } 12495 if (tp->t_flags2 & TF2_DROP_AF_DATA) { 12496 m_freem(m); 12497 return (0); 12498 } 12499 /* 12500 * don't process the URG bit, ignore them drag 12501 * along the up. 12502 */ 12503 tp->rcv_up = tp->rcv_nxt; 12504 12505 /* 12506 * Process the segment text, merging it into the TCP sequencing 12507 * queue, and arranging for acknowledgment of receipt if necessary. 12508 * This process logically involves adjusting tp->rcv_wnd as data is 12509 * presented to the user (this happens in tcp_usrreq.c, case 12510 * PRU_RCVD). If a FIN has already been received on this connection 12511 * then we just ignore the text. 
12512 */ 12513 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && 12514 IS_FASTOPEN(tp->t_flags)); 12515 if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) && 12516 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 12517 tcp_seq save_start = th->th_seq; 12518 tcp_seq save_rnxt = tp->rcv_nxt; 12519 int save_tlen = tlen; 12520 12521 m_adj(m, drop_hdrlen); /* delayed header drop */ 12522 /* 12523 * Insert segment which includes th into TCP reassembly 12524 * queue with control block tp. Set thflags to whether 12525 * reassembly now includes a segment with FIN. This handles 12526 * the common case inline (segment is the next to be 12527 * received on an established connection, and the queue is 12528 * empty), avoiding linkage into and removal from the queue 12529 * and repetition of various conversions. Set DELACK for 12530 * segments received in order, but ack immediately when 12531 * segments are out of order (so fast retransmit can work). 12532 */ 12533 if (th->th_seq == tp->rcv_nxt && 12534 SEGQ_EMPTY(tp) && 12535 (TCPS_HAVEESTABLISHED(tp->t_state) || 12536 tfo_syn)) { 12537 #ifdef NETFLIX_SB_LIMITS 12538 u_int mcnt, appended; 12539 12540 if (so->so_rcv.sb_shlim) { 12541 mcnt = m_memcnt(m); 12542 appended = 0; 12543 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 12544 CFO_NOSLEEP, NULL) == false) { 12545 counter_u64_add(tcp_sb_shlim_fails, 1); 12546 m_freem(m); 12547 return (0); 12548 } 12549 } 12550 #endif 12551 rack_handle_delayed_ack(tp, rack, tlen, tfo_syn); 12552 tp->rcv_nxt += tlen; 12553 if (tlen && 12554 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 12555 (tp->t_fbyte_in == 0)) { 12556 tp->t_fbyte_in = ticks; 12557 if (tp->t_fbyte_in == 0) 12558 tp->t_fbyte_in = 1; 12559 if (tp->t_fbyte_out && tp->t_fbyte_in) 12560 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 12561 } 12562 thflags = tcp_get_flags(th) & TH_FIN; 12563 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 12564 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 12565 SOCKBUF_LOCK(&so->so_rcv); 12566 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 12567 m_freem(m); 12568 } else 12569 #ifdef NETFLIX_SB_LIMITS 12570 appended = 12571 #endif 12572 sbappendstream_locked(&so->so_rcv, m, 0); 12573 12574 rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1); 12575 /* NB: sorwakeup_locked() does an implicit unlock. */ 12576 sorwakeup_locked(so); 12577 #ifdef NETFLIX_SB_LIMITS 12578 if (so->so_rcv.sb_shlim && appended != mcnt) 12579 counter_fo_release(so->so_rcv.sb_shlim, 12580 mcnt - appended); 12581 #endif 12582 } else { 12583 /* 12584 * XXX: Due to the header drop above "th" is 12585 * theoretically invalid by now. Fortunately 12586 * m_adj() doesn't actually frees any mbufs when 12587 * trimming from the head. 12588 */ 12589 tcp_seq temp = save_start; 12590 12591 thflags = tcp_reass(tp, th, &temp, &tlen, m); 12592 tp->t_flags |= TF_ACKNOW; 12593 if (tp->t_flags & TF_WAKESOR) { 12594 tp->t_flags &= ~TF_WAKESOR; 12595 /* NB: sorwakeup_locked() does an implicit unlock. */ 12596 sorwakeup_locked(so); 12597 } 12598 } 12599 if ((tp->t_flags & TF_SACK_PERMIT) && 12600 (save_tlen > 0) && 12601 TCPS_HAVEESTABLISHED(tp->t_state)) { 12602 if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { 12603 /* 12604 * DSACK actually handled in the fastpath 12605 * above. 12606 */ 12607 tcp_update_sack_list(tp, save_start, 12608 save_start + save_tlen); 12609 } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { 12610 if ((tp->rcv_numsacks >= 1) && 12611 (tp->sackblks[0].end == save_start)) { 12612 /* 12613 * Partial overlap, recorded at todrop 12614 * above. 
12615 */ 12616 tcp_update_sack_list(tp, 12617 tp->sackblks[0].start, 12618 tp->sackblks[0].end); 12619 } else { 12620 tcp_update_dsack_list(tp, save_start, 12621 save_start + save_tlen); 12622 } 12623 } else if (tlen >= save_tlen) { 12624 /* Update of sackblks. */ 12625 tcp_update_dsack_list(tp, save_start, 12626 save_start + save_tlen); 12627 } else if (tlen > 0) { 12628 tcp_update_dsack_list(tp, save_start, 12629 save_start + tlen); 12630 } 12631 } 12632 } else { 12633 m_freem(m); 12634 thflags &= ~TH_FIN; 12635 } 12636 12637 /* 12638 * If FIN is received ACK the FIN and let the user know that the 12639 * connection is closing. 12640 */ 12641 if (thflags & TH_FIN) { 12642 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 12643 /* The socket upcall is handled by socantrcvmore. */ 12644 socantrcvmore(so); 12645 /* 12646 * If connection is half-synchronized (ie NEEDSYN 12647 * flag on) then delay ACK, so it may be piggybacked 12648 * when SYN is sent. Otherwise, since we received a 12649 * FIN then no more input can be expected, send ACK 12650 * now. 12651 */ 12652 if (tp->t_flags & TF_NEEDSYN) { 12653 rack_timer_cancel(tp, rack, 12654 rack->r_ctl.rc_rcvtime, __LINE__); 12655 tp->t_flags |= TF_DELACK; 12656 } else { 12657 tp->t_flags |= TF_ACKNOW; 12658 } 12659 tp->rcv_nxt++; 12660 } 12661 switch (tp->t_state) { 12662 /* 12663 * In SYN_RECEIVED and ESTABLISHED STATES enter the 12664 * CLOSE_WAIT state. 12665 */ 12666 case TCPS_SYN_RECEIVED: 12667 tp->t_starttime = ticks; 12668 /* FALLTHROUGH */ 12669 case TCPS_ESTABLISHED: 12670 rack_timer_cancel(tp, rack, 12671 rack->r_ctl.rc_rcvtime, __LINE__); 12672 tcp_state_change(tp, TCPS_CLOSE_WAIT); 12673 break; 12674 12675 /* 12676 * If still in FIN_WAIT_1 STATE FIN has not been 12677 * acked so enter the CLOSING state. 12678 */ 12679 case TCPS_FIN_WAIT_1: 12680 rack_timer_cancel(tp, rack, 12681 rack->r_ctl.rc_rcvtime, __LINE__); 12682 tcp_state_change(tp, TCPS_CLOSING); 12683 break; 12684 12685 /* 12686 * In FIN_WAIT_2 state enter the TIME_WAIT state, 12687 * starting the time-wait timer, turning off the 12688 * other standard timers. 12689 */ 12690 case TCPS_FIN_WAIT_2: 12691 rack_timer_cancel(tp, rack, 12692 rack->r_ctl.rc_rcvtime, __LINE__); 12693 tcp_twstart(tp); 12694 return (1); 12695 } 12696 } 12697 /* 12698 * Return any desired output. 12699 */ 12700 if ((tp->t_flags & TF_ACKNOW) || 12701 (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) { 12702 rack->r_wanted_output = 1; 12703 } 12704 return (0); 12705 } 12706 12707 /* 12708 * Here nothing is really faster, its just that we 12709 * have broken out the fast-data path also just like 12710 * the fast-ack. 12711 */ 12712 static int 12713 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, 12714 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 12715 uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos) 12716 { 12717 int32_t nsegs; 12718 int32_t newsize = 0; /* automatic sockbuf scaling */ 12719 struct tcp_rack *rack; 12720 #ifdef NETFLIX_SB_LIMITS 12721 u_int mcnt, appended; 12722 #endif 12723 12724 /* 12725 * If last ACK falls within this segment's sequence numbers, record 12726 * the timestamp. NOTE that the test is modified according to the 12727 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 
12728 */ 12729 if (__predict_false(th->th_seq != tp->rcv_nxt)) { 12730 return (0); 12731 } 12732 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 12733 return (0); 12734 } 12735 if (tiwin && tiwin != tp->snd_wnd) { 12736 return (0); 12737 } 12738 if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) { 12739 return (0); 12740 } 12741 if (__predict_false((to->to_flags & TOF_TS) && 12742 (TSTMP_LT(to->to_tsval, tp->ts_recent)))) { 12743 return (0); 12744 } 12745 if (__predict_false((th->th_ack != tp->snd_una))) { 12746 return (0); 12747 } 12748 if (__predict_false(tlen > sbspace(&so->so_rcv))) { 12749 return (0); 12750 } 12751 if ((to->to_flags & TOF_TS) != 0 && 12752 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 12753 tp->ts_recent_age = tcp_ts_getticks(); 12754 tp->ts_recent = to->to_tsval; 12755 } 12756 rack = (struct tcp_rack *)tp->t_fb_ptr; 12757 /* 12758 * This is a pure, in-sequence data packet with nothing on the 12759 * reassembly queue and we have enough buffer space to take it. 12760 */ 12761 nsegs = max(1, m->m_pkthdr.lro_nsegs); 12762 12763 #ifdef NETFLIX_SB_LIMITS 12764 if (so->so_rcv.sb_shlim) { 12765 mcnt = m_memcnt(m); 12766 appended = 0; 12767 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 12768 CFO_NOSLEEP, NULL) == false) { 12769 counter_u64_add(tcp_sb_shlim_fails, 1); 12770 m_freem(m); 12771 return (1); 12772 } 12773 } 12774 #endif 12775 /* Clean receiver SACK report if present */ 12776 if (tp->rcv_numsacks) 12777 tcp_clean_sackreport(tp); 12778 KMOD_TCPSTAT_INC(tcps_preddat); 12779 tp->rcv_nxt += tlen; 12780 if (tlen && 12781 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 12782 (tp->t_fbyte_in == 0)) { 12783 tp->t_fbyte_in = ticks; 12784 if (tp->t_fbyte_in == 0) 12785 tp->t_fbyte_in = 1; 12786 if (tp->t_fbyte_out && tp->t_fbyte_in) 12787 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 12788 } 12789 /* 12790 * Pull snd_wl1 up to prevent seq wrap relative to th_seq. 12791 */ 12792 tp->snd_wl1 = th->th_seq; 12793 /* 12794 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt. 12795 */ 12796 tp->rcv_up = tp->rcv_nxt; 12797 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 12798 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 12799 newsize = tcp_autorcvbuf(m, th, so, tp, tlen); 12800 12801 /* Add data to socket buffer. */ 12802 SOCKBUF_LOCK(&so->so_rcv); 12803 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 12804 m_freem(m); 12805 } else { 12806 /* 12807 * Set new socket buffer size. Give up when limit is 12808 * reached. 12809 */ 12810 if (newsize) 12811 if (!sbreserve_locked(so, SO_RCV, newsize, NULL)) 12812 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 12813 m_adj(m, drop_hdrlen); /* delayed header drop */ 12814 #ifdef NETFLIX_SB_LIMITS 12815 appended = 12816 #endif 12817 sbappendstream_locked(&so->so_rcv, m, 0); 12818 ctf_calc_rwin(so, tp); 12819 } 12820 rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1); 12821 /* NB: sorwakeup_locked() does an implicit unlock. */ 12822 sorwakeup_locked(so); 12823 #ifdef NETFLIX_SB_LIMITS 12824 if (so->so_rcv.sb_shlim && mcnt != appended) 12825 counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended); 12826 #endif 12827 rack_handle_delayed_ack(tp, rack, tlen, 0); 12828 if (tp->snd_una == tp->snd_max) 12829 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 12830 return (1); 12831 } 12832 12833 /* 12834 * This subfunction is used to try to highly optimize the 12835 * fast path. We again allow window updates that are 12836 * in sequence to remain in the fast-path. We also add 12837 * in the __predict's to attempt to help the compiler. 
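 * The checks below reject, in turn: an old ack, an ack above
 * snd_max, an ongoing retransmit, a zero window, a pending SYN or
 * FIN, a timestamp that fails PAWS, being in recovery, and a
 * scoreboard that still holds sacked data.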
12838 * Note that if we return a 0, then we can *not* process 12839 * it and the caller should push the packet into the 12840 * slow-path. 12841 */ 12842 static int 12843 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 12844 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 12845 uint32_t tiwin, int32_t nxt_pkt, uint32_t cts) 12846 { 12847 int32_t acked; 12848 int32_t nsegs; 12849 int32_t under_pacing = 0; 12850 struct tcp_rack *rack; 12851 12852 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 12853 /* Old ack, behind (or duplicate to) the last one rcv'd */ 12854 return (0); 12855 } 12856 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { 12857 /* Above what we have sent? */ 12858 return (0); 12859 } 12860 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 12861 /* We are retransmitting */ 12862 return (0); 12863 } 12864 if (__predict_false(tiwin == 0)) { 12865 /* zero window */ 12866 return (0); 12867 } 12868 if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) { 12869 /* We need a SYN or a FIN, unlikely.. */ 12870 return (0); 12871 } 12872 if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { 12873 /* Timestamp is behind .. old ack with seq wrap? */ 12874 return (0); 12875 } 12876 if (__predict_false(IN_RECOVERY(tp->t_flags))) { 12877 /* Still recovering */ 12878 return (0); 12879 } 12880 rack = (struct tcp_rack *)tp->t_fb_ptr; 12881 if (rack->r_ctl.rc_sacked) { 12882 /* We have sack holes on our scoreboard */ 12883 return (0); 12884 } 12885 /* Ok if we reach here, we can process a fast-ack */ 12886 if (rack->gp_ready && 12887 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 12888 under_pacing = 1; 12889 } 12890 nsegs = max(1, m->m_pkthdr.lro_nsegs); 12891 rack_log_ack(tp, to, th, 0, 0, NULL, NULL); 12892 /* Did the window get updated? */ 12893 if (tiwin != tp->snd_wnd) { 12894 tp->snd_wnd = tiwin; 12895 rack_validate_fo_sendwin_up(tp, rack); 12896 tp->snd_wl1 = th->th_seq; 12897 if (tp->snd_wnd > tp->max_sndwnd) 12898 tp->max_sndwnd = tp->snd_wnd; 12899 } 12900 /* Do we exit persists? */ 12901 if ((rack->rc_in_persist != 0) && 12902 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 12903 rack->r_ctl.rc_pace_min_segs))) { 12904 rack_exit_persist(tp, rack, cts); 12905 } 12906 /* Do we enter persists? */ 12907 if ((rack->rc_in_persist == 0) && 12908 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 12909 TCPS_HAVEESTABLISHED(tp->t_state) && 12910 ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) && 12911 sbavail(&tptosocket(tp)->so_snd) && 12912 (sbavail(&tptosocket(tp)->so_snd) > tp->snd_wnd)) { 12913 /* 12914 * Here the rwnd is less than 12915 * the pacing size, we are established, 12916 * nothing is outstanding, and there is 12917 * data to send. Enter persists. 12918 */ 12919 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, th->th_ack); 12920 } 12921 /* 12922 * If last ACK falls within this segment's sequence numbers, record 12923 * the timestamp. NOTE that the test is modified according to the 12924 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 12925 */ 12926 if ((to->to_flags & TOF_TS) != 0 && 12927 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 12928 tp->ts_recent_age = tcp_ts_getticks(); 12929 tp->ts_recent = to->to_tsval; 12930 } 12931 /* 12932 * This is a pure ack for outstanding data. 12933 */ 12934 KMOD_TCPSTAT_INC(tcps_predack); 12935 12936 /* 12937 * "bad retransmit" recovery. 
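 * When timestamps are not in use, a connection still marked
 * TF_PREVVALID whose first retransmit is acked inside t_badrxtwin
 * treats that retransmit as spurious; CC_RTO_ERR below restores the
 * prior cwnd/ssthresh.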
12938 */ 12939 if ((tp->t_flags & TF_PREVVALID) && 12940 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) { 12941 tp->t_flags &= ~TF_PREVVALID; 12942 if (tp->t_rxtshift == 1 && 12943 (int)(ticks - tp->t_badrxtwin) < 0) 12944 rack_cong_signal(tp, CC_RTO_ERR, th->th_ack, __LINE__); 12945 } 12946 /* 12947 * Recalculate the transmit timer / rtt. 12948 * 12949 * Some boxes send broken timestamp replies during the SYN+ACK 12950 * phase, ignore timestamps of 0 or we could calculate a huge RTT 12951 * and blow up the retransmit timer. 12952 */ 12953 acked = BYTES_THIS_ACK(tp, th); 12954 12955 #ifdef TCP_HHOOK 12956 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 12957 hhook_run_tcp_est_in(tp, th, to); 12958 #endif 12959 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 12960 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 12961 if (acked) { 12962 struct mbuf *mfree; 12963 12964 rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, 0); 12965 SOCKBUF_LOCK(&so->so_snd); 12966 mfree = sbcut_locked(&so->so_snd, acked); 12967 tp->snd_una = th->th_ack; 12968 /* Note we want to hold the sb lock through the sendmap adjust */ 12969 rack_adjust_sendmap_head(rack, &so->so_snd); 12970 /* Wake up the socket if we have room to write more */ 12971 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); 12972 sowwakeup_locked(so); 12973 m_freem(mfree); 12974 tp->t_rxtshift = 0; 12975 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 12976 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 12977 rack->rc_tlp_in_progress = 0; 12978 rack->r_ctl.rc_tlp_cnt_out = 0; 12979 /* 12980 * If it is the RXT timer we want to 12981 * stop it, so we can restart a TLP. 12982 */ 12983 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 12984 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 12985 12986 #ifdef TCP_REQUEST_TRK 12987 rack_req_check_for_comp(rack, th->th_ack); 12988 #endif 12989 } 12990 /* 12991 * Let the congestion control algorithm update congestion control 12992 * related information. This typically means increasing the 12993 * congestion window. 12994 */ 12995 if (tp->snd_wnd < ctf_outstanding(tp)) { 12996 /* The peer collapsed the window */ 12997 rack_collapsed_window(rack, ctf_outstanding(tp), th->th_ack, __LINE__); 12998 } else if (rack->rc_has_collapsed) 12999 rack_un_collapse_window(rack, __LINE__); 13000 if ((rack->r_collapse_point_valid) && 13001 (SEQ_GT(tp->snd_una, rack->r_ctl.high_collapse_point))) 13002 rack->r_collapse_point_valid = 0; 13003 /* 13004 * Pull snd_wl2 up to prevent seq wrap relative to th_ack. 13005 */ 13006 tp->snd_wl2 = th->th_ack; 13007 tp->t_dupacks = 0; 13008 m_freem(m); 13009 /* ND6_HINT(tp); *//* Some progress has been made. */ 13010 13011 /* 13012 * If all outstanding data are acked, stop retransmit timer, 13013 * otherwise restart timer using current (possibly backed-off) 13014 * value. If process is waiting for space, wakeup/selwakeup/signal. 13015 * If data are ready to send, let tcp_output decide between more 13016 * output or persist. 
13017 */ 13018 if (under_pacing && 13019 (rack->use_fixed_rate == 0) && 13020 (rack->in_probe_rtt == 0) && 13021 rack->rc_gp_dyn_mul && 13022 rack->rc_always_pace) { 13023 /* Check if we are dragging bottom */ 13024 rack_check_bottom_drag(tp, rack, so); 13025 } 13026 if (tp->snd_una == tp->snd_max) { 13027 tp->t_flags &= ~TF_PREVVALID; 13028 rack->r_ctl.retran_during_recovery = 0; 13029 rack->rc_suspicious = 0; 13030 rack->r_ctl.dsack_byte_cnt = 0; 13031 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 13032 if (rack->r_ctl.rc_went_idle_time == 0) 13033 rack->r_ctl.rc_went_idle_time = 1; 13034 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 13035 if (sbavail(&tptosocket(tp)->so_snd) == 0) 13036 tp->t_acktime = 0; 13037 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 13038 } 13039 if (acked && rack->r_fast_output) 13040 rack_gain_for_fastoutput(rack, tp, so, (uint32_t)acked); 13041 if (sbavail(&so->so_snd)) { 13042 rack->r_wanted_output = 1; 13043 } 13044 return (1); 13045 } 13046 13047 /* 13048 * Return value of 1, the TCB is unlocked and most 13049 * likely gone, return value of 0, the TCP is still 13050 * locked. 13051 */ 13052 static int 13053 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, 13054 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 13055 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 13056 { 13057 int32_t ret_val = 0; 13058 int32_t todrop; 13059 int32_t ourfinisacked = 0; 13060 struct tcp_rack *rack; 13061 13062 INP_WLOCK_ASSERT(tptoinpcb(tp)); 13063 13064 ctf_calc_rwin(so, tp); 13065 /* 13066 * If the state is SYN_SENT: if seg contains an ACK, but not for our 13067 * SYN, drop the input. if seg contains a RST, then drop the 13068 * connection. if seg does not contain SYN, then drop it. Otherwise 13069 * this is an acceptable SYN segment initialize tp->rcv_nxt and 13070 * tp->irs if seg contains ack then advance tp->snd_una if seg 13071 * contains an ECE and ECN support is enabled, the stream is ECN 13072 * capable. if SYN has been acked change to ESTABLISHED else 13073 * SYN_RCVD state arrange for segment to be acked (eventually) 13074 * continue processing rest of data/controls. 13075 */ 13076 if ((thflags & TH_ACK) && 13077 (SEQ_LEQ(th->th_ack, tp->iss) || 13078 SEQ_GT(th->th_ack, tp->snd_max))) { 13079 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 13080 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 13081 return (1); 13082 } 13083 if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { 13084 TCP_PROBE5(connect__refused, NULL, tp, 13085 mtod(m, const char *), tp, th); 13086 tp = tcp_drop(tp, ECONNREFUSED); 13087 ctf_do_drop(m, tp); 13088 return (1); 13089 } 13090 if (thflags & TH_RST) { 13091 ctf_do_drop(m, tp); 13092 return (1); 13093 } 13094 if (!(thflags & TH_SYN)) { 13095 ctf_do_drop(m, tp); 13096 return (1); 13097 } 13098 tp->irs = th->th_seq; 13099 tcp_rcvseqinit(tp); 13100 rack = (struct tcp_rack *)tp->t_fb_ptr; 13101 if (thflags & TH_ACK) { 13102 int tfo_partial = 0; 13103 13104 KMOD_TCPSTAT_INC(tcps_connects); 13105 soisconnected(so); 13106 #ifdef MAC 13107 mac_socketpeer_set_from_mbuf(m, so); 13108 #endif 13109 /* Do window scaling on this connection? 
*/ 13110 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 13111 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 13112 tp->rcv_scale = tp->request_r_scale; 13113 } 13114 tp->rcv_adv += min(tp->rcv_wnd, 13115 TCP_MAXWIN << tp->rcv_scale); 13116 /* 13117 * If not all the data that was sent in the TFO SYN 13118 * has been acked, resend the remainder right away. 13119 */ 13120 if (IS_FASTOPEN(tp->t_flags) && 13121 (tp->snd_una != tp->snd_max)) { 13122 tp->snd_nxt = th->th_ack; 13123 tfo_partial = 1; 13124 } 13125 /* 13126 * If there's data, delay ACK; if there's also a FIN ACKNOW 13127 * will be turned on later. 13128 */ 13129 if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial) { 13130 rack_timer_cancel(tp, rack, 13131 rack->r_ctl.rc_rcvtime, __LINE__); 13132 tp->t_flags |= TF_DELACK; 13133 } else { 13134 rack->r_wanted_output = 1; 13135 tp->t_flags |= TF_ACKNOW; 13136 } 13137 13138 tcp_ecn_input_syn_sent(tp, thflags, iptos); 13139 13140 if (SEQ_GT(th->th_ack, tp->snd_una)) { 13141 /* 13142 * We advance snd_una for the 13143 * fast open case. If th_ack is 13144 * acknowledging data beyond 13145 * snd_una we can't just call 13146 * ack-processing since the 13147 * data stream in our send-map 13148 * will start at snd_una + 1 (one 13149 * beyond the SYN). If its just 13150 * equal we don't need to do that 13151 * and there is no send_map. 13152 */ 13153 tp->snd_una++; 13154 } 13155 /* 13156 * Received <SYN,ACK> in SYN_SENT[*] state. Transitions: 13157 * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 13158 */ 13159 tp->t_starttime = ticks; 13160 if (tp->t_flags & TF_NEEDFIN) { 13161 tcp_state_change(tp, TCPS_FIN_WAIT_1); 13162 tp->t_flags &= ~TF_NEEDFIN; 13163 thflags &= ~TH_SYN; 13164 } else { 13165 tcp_state_change(tp, TCPS_ESTABLISHED); 13166 TCP_PROBE5(connect__established, NULL, tp, 13167 mtod(m, const char *), tp, th); 13168 rack_cc_conn_init(tp); 13169 } 13170 } else { 13171 /* 13172 * Received initial SYN in SYN-SENT[*] state => simultaneous 13173 * open. If segment contains CC option and there is a 13174 * cached CC, apply TAO test. If it succeeds, connection is * 13175 * half-synchronized. Otherwise, do 3-way handshake: 13176 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If 13177 * there was no CC option, clear cached CC value. 13178 */ 13179 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN | TF_SONOTCONN); 13180 tcp_state_change(tp, TCPS_SYN_RECEIVED); 13181 } 13182 /* 13183 * Advance th->th_seq to correspond to first data byte. If data, 13184 * trim to stay within window, dropping FIN if necessary. 13185 */ 13186 th->th_seq++; 13187 if (tlen > tp->rcv_wnd) { 13188 todrop = tlen - tp->rcv_wnd; 13189 m_adj(m, -todrop); 13190 tlen = tp->rcv_wnd; 13191 thflags &= ~TH_FIN; 13192 KMOD_TCPSTAT_INC(tcps_rcvpackafterwin); 13193 KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 13194 } 13195 tp->snd_wl1 = th->th_seq - 1; 13196 tp->rcv_up = th->th_seq; 13197 /* 13198 * Client side of transaction: already sent SYN and data. If the 13199 * remote host used T/TCP to validate the SYN, our data will be 13200 * ACK'd; if so, enter normal data segment processing in the middle 13201 * of step 5, ack processing. Otherwise, goto step 6. 
13202 */ 13203 if (thflags & TH_ACK) { 13204 /* For syn-sent we need to possibly update the rtt */ 13205 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 13206 uint32_t t, mcts; 13207 13208 mcts = tcp_ts_getticks(); 13209 t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC; 13210 if (!tp->t_rttlow || tp->t_rttlow > t) 13211 tp->t_rttlow = t; 13212 rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 4); 13213 tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2); 13214 tcp_rack_xmit_timer_commit(rack, tp); 13215 } 13216 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) 13217 return (ret_val); 13218 /* We may have changed to FIN_WAIT_1 above */ 13219 if (tp->t_state == TCPS_FIN_WAIT_1) { 13220 /* 13221 * In FIN_WAIT_1 STATE in addition to the processing 13222 * for the ESTABLISHED state if our FIN is now 13223 * acknowledged then enter FIN_WAIT_2. 13224 */ 13225 if (ourfinisacked) { 13226 /* 13227 * If we can't receive any more data, then 13228 * closing user can proceed. Starting the 13229 * timer is contrary to the specification, 13230 * but if we don't get a FIN we'll hang 13231 * forever. 13232 * 13233 * XXXjl: we should release the tp also, and 13234 * use a compressed state. 13235 */ 13236 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 13237 soisdisconnected(so); 13238 tcp_timer_activate(tp, TT_2MSL, 13239 (tcp_fast_finwait2_recycle ? 13240 tcp_finwait2_timeout : 13241 TP_MAXIDLE(tp))); 13242 } 13243 tcp_state_change(tp, TCPS_FIN_WAIT_2); 13244 } 13245 } 13246 } 13247 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13248 tiwin, thflags, nxt_pkt)); 13249 } 13250 13251 /* 13252 * Return value of 1, the TCB is unlocked and most 13253 * likely gone, return value of 0, the TCP is still 13254 * locked. 13255 */ 13256 static int 13257 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, 13258 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 13259 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 13260 { 13261 struct tcp_rack *rack; 13262 int32_t ret_val = 0; 13263 int32_t ourfinisacked = 0; 13264 13265 ctf_calc_rwin(so, tp); 13266 if ((thflags & TH_ACK) && 13267 (SEQ_LEQ(th->th_ack, tp->snd_una) || 13268 SEQ_GT(th->th_ack, tp->snd_max))) { 13269 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 13270 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 13271 return (1); 13272 } 13273 rack = (struct tcp_rack *)tp->t_fb_ptr; 13274 if (IS_FASTOPEN(tp->t_flags)) { 13275 /* 13276 * When a TFO connection is in SYN_RECEIVED, the 13277 * only valid packets are the initial SYN, a 13278 * retransmit/copy of the initial SYN (possibly with 13279 * a subset of the original data), a valid ACK, a 13280 * FIN, or a RST. 
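 * The checks below implement that: a segment carrying both SYN and ACK
 * is answered with a reset, a bare (retransmitted) SYN is dropped only
 * while a retransmit/TLP/RACK timer is still pending, and anything
 * without ACK, FIN or RST is dropped outright.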
13281 */ 13282 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { 13283 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 13284 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 13285 return (1); 13286 } else if (thflags & TH_SYN) { 13287 /* non-initial SYN is ignored */ 13288 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || 13289 (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || 13290 (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { 13291 ctf_do_drop(m, NULL); 13292 return (0); 13293 } 13294 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { 13295 ctf_do_drop(m, NULL); 13296 return (0); 13297 } 13298 } 13299 13300 if ((thflags & TH_RST) || 13301 (tp->t_fin_is_rst && (thflags & TH_FIN))) 13302 return (__ctf_process_rst(m, th, so, tp, 13303 &rack->r_ctl.challenge_ack_ts, 13304 &rack->r_ctl.challenge_ack_cnt)); 13305 /* 13306 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 13307 * it's less than ts_recent, drop it. 13308 */ 13309 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 13310 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 13311 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 13312 return (ret_val); 13313 } 13314 /* 13315 * In the SYN-RECEIVED state, validate that the packet belongs to 13316 * this connection before trimming the data to fit the receive 13317 * window. Check the sequence number versus IRS since we know the 13318 * sequence numbers haven't wrapped. This is a partial fix for the 13319 * "LAND" DoS attack. 13320 */ 13321 if (SEQ_LT(th->th_seq, tp->irs)) { 13322 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 13323 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 13324 return (1); 13325 } 13326 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 13327 &rack->r_ctl.challenge_ack_ts, 13328 &rack->r_ctl.challenge_ack_cnt)) { 13329 return (ret_val); 13330 } 13331 /* 13332 * If last ACK falls within this segment's sequence numbers, record 13333 * its timestamp. NOTE: 1) That the test incorporates suggestions 13334 * from the latest proposal of the tcplw@cray.com list (Braden 13335 * 1993/04/26). 2) That updating only on newer timestamps interferes 13336 * with our earlier PAWS tests, so this check should be solely 13337 * predicated on the sequence space of this segment. 3) That we 13338 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 13339 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 13340 * SEG.Len, This modified check allows us to overcome RFC1323's 13341 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 13342 * p.869. In such cases, we can still calculate the RTT correctly 13343 * when RCV.NXT == Last.ACK.Sent. 13344 */ 13345 if ((to->to_flags & TOF_TS) != 0 && 13346 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 13347 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 13348 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 13349 tp->ts_recent_age = tcp_ts_getticks(); 13350 tp->ts_recent = to->to_tsval; 13351 } 13352 tp->snd_wnd = tiwin; 13353 rack_validate_fo_sendwin_up(tp, rack); 13354 /* 13355 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 13356 * is on (half-synchronized state), then queue data for later 13357 * processing; else drop segment and return. 
13358 */ 13359 if ((thflags & TH_ACK) == 0) { 13360 if (IS_FASTOPEN(tp->t_flags)) { 13361 rack_cc_conn_init(tp); 13362 } 13363 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13364 tiwin, thflags, nxt_pkt)); 13365 } 13366 KMOD_TCPSTAT_INC(tcps_connects); 13367 if (tp->t_flags & TF_SONOTCONN) { 13368 tp->t_flags &= ~TF_SONOTCONN; 13369 soisconnected(so); 13370 } 13371 /* Do window scaling? */ 13372 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 13373 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 13374 tp->rcv_scale = tp->request_r_scale; 13375 } 13376 /* 13377 * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> 13378 * FIN-WAIT-1 13379 */ 13380 tp->t_starttime = ticks; 13381 if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { 13382 tcp_fastopen_decrement_counter(tp->t_tfo_pending); 13383 tp->t_tfo_pending = NULL; 13384 } 13385 if (tp->t_flags & TF_NEEDFIN) { 13386 tcp_state_change(tp, TCPS_FIN_WAIT_1); 13387 tp->t_flags &= ~TF_NEEDFIN; 13388 } else { 13389 tcp_state_change(tp, TCPS_ESTABLISHED); 13390 TCP_PROBE5(accept__established, NULL, tp, 13391 mtod(m, const char *), tp, th); 13392 /* 13393 * TFO connections call cc_conn_init() during SYN 13394 * processing. Calling it again here for such connections 13395 * is not harmless as it would undo the snd_cwnd reduction 13396 * that occurs when a TFO SYN|ACK is retransmitted. 13397 */ 13398 if (!IS_FASTOPEN(tp->t_flags)) 13399 rack_cc_conn_init(tp); 13400 } 13401 /* 13402 * Account for the ACK of our SYN prior to 13403 * regular ACK processing below, except for 13404 * simultaneous SYN, which is handled later. 13405 */ 13406 if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN)) 13407 tp->snd_una++; 13408 /* 13409 * If segment contains data or ACK, will call tcp_reass() later; if 13410 * not, do so now to pass queued data to user. 13411 */ 13412 if (tlen == 0 && (thflags & TH_FIN) == 0) { 13413 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, 13414 (struct mbuf *)0); 13415 if (tp->t_flags & TF_WAKESOR) { 13416 tp->t_flags &= ~TF_WAKESOR; 13417 /* NB: sorwakeup_locked() does an implicit unlock. */ 13418 sorwakeup_locked(so); 13419 } 13420 } 13421 tp->snd_wl1 = th->th_seq - 1; 13422 /* For syn-recv we need to possibly update the rtt */ 13423 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 13424 uint32_t t, mcts; 13425 13426 mcts = tcp_ts_getticks(); 13427 t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC; 13428 if (!tp->t_rttlow || tp->t_rttlow > t) 13429 tp->t_rttlow = t; 13430 rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 5); 13431 tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2); 13432 tcp_rack_xmit_timer_commit(rack, tp); 13433 } 13434 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 13435 return (ret_val); 13436 } 13437 if (tp->t_state == TCPS_FIN_WAIT_1) { 13438 /* We could have went to FIN_WAIT_1 (or EST) above */ 13439 /* 13440 * In FIN_WAIT_1 STATE in addition to the processing for the 13441 * ESTABLISHED state if our FIN is now acknowledged then 13442 * enter FIN_WAIT_2. 13443 */ 13444 if (ourfinisacked) { 13445 /* 13446 * If we can't receive any more data, then closing 13447 * user can proceed. Starting the timer is contrary 13448 * to the specification, but if we don't get a FIN 13449 * we'll hang forever. 13450 * 13451 * XXXjl: we should release the tp also, and use a 13452 * compressed state. 
13453 */ 13454 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 13455 soisdisconnected(so); 13456 tcp_timer_activate(tp, TT_2MSL, 13457 (tcp_fast_finwait2_recycle ? 13458 tcp_finwait2_timeout : 13459 TP_MAXIDLE(tp))); 13460 } 13461 tcp_state_change(tp, TCPS_FIN_WAIT_2); 13462 } 13463 } 13464 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13465 tiwin, thflags, nxt_pkt)); 13466 } 13467 13468 /* 13469 * Return value of 1, the TCB is unlocked and most 13470 * likely gone, return value of 0, the TCP is still 13471 * locked. 13472 */ 13473 static int 13474 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, 13475 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 13476 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 13477 { 13478 int32_t ret_val = 0; 13479 struct tcp_rack *rack; 13480 13481 /* 13482 * Header prediction: check for the two common cases of a 13483 * uni-directional data xfer. If the packet has no control flags, 13484 * is in-sequence, the window didn't change and we're not 13485 * retransmitting, it's a candidate. If the length is zero and the 13486 * ack moved forward, we're the sender side of the xfer. Just free 13487 * the data acked & wake any higher level process that was blocked 13488 * waiting for space. If the length is non-zero and the ack didn't 13489 * move, we're the receiver side. If we're getting packets in-order 13490 * (the reassembly queue is empty), add the data toc The socket 13491 * buffer and note that we need a delayed ack. Make sure that the 13492 * hidden state-flags are also off. Since we check for 13493 * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN. 13494 */ 13495 rack = (struct tcp_rack *)tp->t_fb_ptr; 13496 if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && 13497 __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_ACK)) == TH_ACK) && 13498 __predict_true(SEGQ_EMPTY(tp)) && 13499 __predict_true(th->th_seq == tp->rcv_nxt)) { 13500 if (tlen == 0) { 13501 if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen, 13502 tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) { 13503 return (0); 13504 } 13505 } else { 13506 if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, 13507 tiwin, nxt_pkt, iptos)) { 13508 return (0); 13509 } 13510 } 13511 } 13512 ctf_calc_rwin(so, tp); 13513 13514 if ((thflags & TH_RST) || 13515 (tp->t_fin_is_rst && (thflags & TH_FIN))) 13516 return (__ctf_process_rst(m, th, so, tp, 13517 &rack->r_ctl.challenge_ack_ts, 13518 &rack->r_ctl.challenge_ack_cnt)); 13519 13520 /* 13521 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 13522 * synchronized state. 13523 */ 13524 if (thflags & TH_SYN) { 13525 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 13526 return (ret_val); 13527 } 13528 /* 13529 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 13530 * it's less than ts_recent, drop it. 13531 */ 13532 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 13533 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 13534 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 13535 return (ret_val); 13536 } 13537 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 13538 &rack->r_ctl.challenge_ack_ts, 13539 &rack->r_ctl.challenge_ack_cnt)) { 13540 return (ret_val); 13541 } 13542 /* 13543 * If last ACK falls within this segment's sequence numbers, record 13544 * its timestamp. NOTE: 1) That the test incorporates suggestions 13545 * from the latest proposal of the tcplw@cray.com list (Braden 13546 * 1993/04/26). 
2) That updating only on newer timestamps interferes 13547 * with our earlier PAWS tests, so this check should be solely 13548 * predicated on the sequence space of this segment. 3) That we 13549 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 13550 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 13551 * SEG.Len, This modified check allows us to overcome RFC1323's 13552 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 13553 * p.869. In such cases, we can still calculate the RTT correctly 13554 * when RCV.NXT == Last.ACK.Sent. 13555 */ 13556 if ((to->to_flags & TOF_TS) != 0 && 13557 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 13558 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 13559 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 13560 tp->ts_recent_age = tcp_ts_getticks(); 13561 tp->ts_recent = to->to_tsval; 13562 } 13563 /* 13564 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 13565 * is on (half-synchronized state), then queue data for later 13566 * processing; else drop segment and return. 13567 */ 13568 if ((thflags & TH_ACK) == 0) { 13569 if (tp->t_flags & TF_NEEDSYN) { 13570 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13571 tiwin, thflags, nxt_pkt)); 13572 13573 } else if (tp->t_flags & TF_ACKNOW) { 13574 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 13575 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 13576 return (ret_val); 13577 } else { 13578 ctf_do_drop(m, NULL); 13579 return (0); 13580 } 13581 } 13582 /* 13583 * Ack processing. 13584 */ 13585 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 13586 return (ret_val); 13587 } 13588 if (sbavail(&so->so_snd)) { 13589 if (ctf_progress_timeout_check(tp, true)) { 13590 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 13591 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 13592 return (1); 13593 } 13594 } 13595 /* State changes only happen in rack_process_data() */ 13596 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13597 tiwin, thflags, nxt_pkt)); 13598 } 13599 13600 /* 13601 * Return value of 1, the TCB is unlocked and most 13602 * likely gone, return value of 0, the TCP is still 13603 * locked. 13604 */ 13605 static int 13606 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, 13607 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 13608 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 13609 { 13610 int32_t ret_val = 0; 13611 struct tcp_rack *rack; 13612 13613 rack = (struct tcp_rack *)tp->t_fb_ptr; 13614 ctf_calc_rwin(so, tp); 13615 if ((thflags & TH_RST) || 13616 (tp->t_fin_is_rst && (thflags & TH_FIN))) 13617 return (__ctf_process_rst(m, th, so, tp, 13618 &rack->r_ctl.challenge_ack_ts, 13619 &rack->r_ctl.challenge_ack_cnt)); 13620 /* 13621 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 13622 * synchronized state. 13623 */ 13624 if (thflags & TH_SYN) { 13625 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 13626 return (ret_val); 13627 } 13628 /* 13629 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 13630 * it's less than ts_recent, drop it. 
13631 */ 13632 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 13633 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 13634 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 13635 return (ret_val); 13636 } 13637 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 13638 &rack->r_ctl.challenge_ack_ts, 13639 &rack->r_ctl.challenge_ack_cnt)) { 13640 return (ret_val); 13641 } 13642 /* 13643 * If last ACK falls within this segment's sequence numbers, record 13644 * its timestamp. NOTE: 1) That the test incorporates suggestions 13645 * from the latest proposal of the tcplw@cray.com list (Braden 13646 * 1993/04/26). 2) That updating only on newer timestamps interferes 13647 * with our earlier PAWS tests, so this check should be solely 13648 * predicated on the sequence space of this segment. 3) That we 13649 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 13650 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 13651 * SEG.Len, This modified check allows us to overcome RFC1323's 13652 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 13653 * p.869. In such cases, we can still calculate the RTT correctly 13654 * when RCV.NXT == Last.ACK.Sent. 13655 */ 13656 if ((to->to_flags & TOF_TS) != 0 && 13657 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 13658 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 13659 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 13660 tp->ts_recent_age = tcp_ts_getticks(); 13661 tp->ts_recent = to->to_tsval; 13662 } 13663 /* 13664 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 13665 * is on (half-synchronized state), then queue data for later 13666 * processing; else drop segment and return. 13667 */ 13668 if ((thflags & TH_ACK) == 0) { 13669 if (tp->t_flags & TF_NEEDSYN) { 13670 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13671 tiwin, thflags, nxt_pkt)); 13672 13673 } else if (tp->t_flags & TF_ACKNOW) { 13674 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 13675 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 13676 return (ret_val); 13677 } else { 13678 ctf_do_drop(m, NULL); 13679 return (0); 13680 } 13681 } 13682 /* 13683 * Ack processing. 
13684 */ 13685 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 13686 return (ret_val); 13687 } 13688 if (sbavail(&so->so_snd)) { 13689 if (ctf_progress_timeout_check(tp, true)) { 13690 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 13691 tp, tick, PROGRESS_DROP, __LINE__); 13692 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 13693 return (1); 13694 } 13695 } 13696 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13697 tiwin, thflags, nxt_pkt)); 13698 } 13699 13700 static int 13701 rack_check_data_after_close(struct mbuf *m, 13702 struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so) 13703 { 13704 struct tcp_rack *rack; 13705 13706 rack = (struct tcp_rack *)tp->t_fb_ptr; 13707 if (rack->rc_allow_data_af_clo == 0) { 13708 close_now: 13709 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 13710 /* tcp_close will kill the inp pre-log the Reset */ 13711 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 13712 tp = tcp_close(tp); 13713 KMOD_TCPSTAT_INC(tcps_rcvafterclose); 13714 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); 13715 return (1); 13716 } 13717 if (sbavail(&so->so_snd) == 0) 13718 goto close_now; 13719 /* Ok we allow data that is ignored and a followup reset */ 13720 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 13721 tp->rcv_nxt = th->th_seq + *tlen; 13722 tp->t_flags2 |= TF2_DROP_AF_DATA; 13723 rack->r_wanted_output = 1; 13724 *tlen = 0; 13725 return (0); 13726 } 13727 13728 /* 13729 * Return value of 1, the TCB is unlocked and most 13730 * likely gone, return value of 0, the TCP is still 13731 * locked. 13732 */ 13733 static int 13734 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, 13735 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 13736 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 13737 { 13738 int32_t ret_val = 0; 13739 int32_t ourfinisacked = 0; 13740 struct tcp_rack *rack; 13741 13742 rack = (struct tcp_rack *)tp->t_fb_ptr; 13743 ctf_calc_rwin(so, tp); 13744 13745 if ((thflags & TH_RST) || 13746 (tp->t_fin_is_rst && (thflags & TH_FIN))) 13747 return (__ctf_process_rst(m, th, so, tp, 13748 &rack->r_ctl.challenge_ack_ts, 13749 &rack->r_ctl.challenge_ack_cnt)); 13750 /* 13751 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 13752 * synchronized state. 13753 */ 13754 if (thflags & TH_SYN) { 13755 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 13756 return (ret_val); 13757 } 13758 /* 13759 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 13760 * it's less than ts_recent, drop it. 13761 */ 13762 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 13763 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 13764 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 13765 return (ret_val); 13766 } 13767 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 13768 &rack->r_ctl.challenge_ack_ts, 13769 &rack->r_ctl.challenge_ack_cnt)) { 13770 return (ret_val); 13771 } 13772 /* 13773 * If new data are received on a connection after the user processes 13774 * are gone, then RST the other end. 13775 */ 13776 if ((tp->t_flags & TF_CLOSED) && tlen && 13777 rack_check_data_after_close(m, tp, &tlen, th, so)) 13778 return (1); 13779 /* 13780 * If last ACK falls within this segment's sequence numbers, record 13781 * its timestamp. NOTE: 1) That the test incorporates suggestions 13782 * from the latest proposal of the tcplw@cray.com list (Braden 13783 * 1993/04/26). 
2) That updating only on newer timestamps interferes 13784 * with our earlier PAWS tests, so this check should be solely 13785 * predicated on the sequence space of this segment. 3) That we 13786 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 13787 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 13788 * SEG.Len, This modified check allows us to overcome RFC1323's 13789 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 13790 * p.869. In such cases, we can still calculate the RTT correctly 13791 * when RCV.NXT == Last.ACK.Sent. 13792 */ 13793 if ((to->to_flags & TOF_TS) != 0 && 13794 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 13795 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 13796 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 13797 tp->ts_recent_age = tcp_ts_getticks(); 13798 tp->ts_recent = to->to_tsval; 13799 } 13800 /* 13801 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 13802 * is on (half-synchronized state), then queue data for later 13803 * processing; else drop segment and return. 13804 */ 13805 if ((thflags & TH_ACK) == 0) { 13806 if (tp->t_flags & TF_NEEDSYN) { 13807 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13808 tiwin, thflags, nxt_pkt)); 13809 } else if (tp->t_flags & TF_ACKNOW) { 13810 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 13811 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 13812 return (ret_val); 13813 } else { 13814 ctf_do_drop(m, NULL); 13815 return (0); 13816 } 13817 } 13818 /* 13819 * Ack processing. 13820 */ 13821 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 13822 return (ret_val); 13823 } 13824 if (ourfinisacked) { 13825 /* 13826 * If we can't receive any more data, then closing user can 13827 * proceed. Starting the timer is contrary to the 13828 * specification, but if we don't get a FIN we'll hang 13829 * forever. 13830 * 13831 * XXXjl: we should release the tp also, and use a 13832 * compressed state. 13833 */ 13834 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 13835 soisdisconnected(so); 13836 tcp_timer_activate(tp, TT_2MSL, 13837 (tcp_fast_finwait2_recycle ? 13838 tcp_finwait2_timeout : 13839 TP_MAXIDLE(tp))); 13840 } 13841 tcp_state_change(tp, TCPS_FIN_WAIT_2); 13842 } 13843 if (sbavail(&so->so_snd)) { 13844 if (ctf_progress_timeout_check(tp, true)) { 13845 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 13846 tp, tick, PROGRESS_DROP, __LINE__); 13847 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 13848 return (1); 13849 } 13850 } 13851 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13852 tiwin, thflags, nxt_pkt)); 13853 } 13854 13855 /* 13856 * Return value of 1, the TCB is unlocked and most 13857 * likely gone, return value of 0, the TCP is still 13858 * locked. 
13859 */ 13860 static int 13861 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, 13862 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 13863 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 13864 { 13865 int32_t ret_val = 0; 13866 int32_t ourfinisacked = 0; 13867 struct tcp_rack *rack; 13868 13869 rack = (struct tcp_rack *)tp->t_fb_ptr; 13870 ctf_calc_rwin(so, tp); 13871 13872 if ((thflags & TH_RST) || 13873 (tp->t_fin_is_rst && (thflags & TH_FIN))) 13874 return (__ctf_process_rst(m, th, so, tp, 13875 &rack->r_ctl.challenge_ack_ts, 13876 &rack->r_ctl.challenge_ack_cnt)); 13877 /* 13878 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 13879 * synchronized state. 13880 */ 13881 if (thflags & TH_SYN) { 13882 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 13883 return (ret_val); 13884 } 13885 /* 13886 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 13887 * it's less than ts_recent, drop it. 13888 */ 13889 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 13890 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 13891 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 13892 return (ret_val); 13893 } 13894 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 13895 &rack->r_ctl.challenge_ack_ts, 13896 &rack->r_ctl.challenge_ack_cnt)) { 13897 return (ret_val); 13898 } 13899 /* 13900 * If new data are received on a connection after the user processes 13901 * are gone, then RST the other end. 13902 */ 13903 if ((tp->t_flags & TF_CLOSED) && tlen && 13904 rack_check_data_after_close(m, tp, &tlen, th, so)) 13905 return (1); 13906 /* 13907 * If last ACK falls within this segment's sequence numbers, record 13908 * its timestamp. NOTE: 1) That the test incorporates suggestions 13909 * from the latest proposal of the tcplw@cray.com list (Braden 13910 * 1993/04/26). 2) That updating only on newer timestamps interferes 13911 * with our earlier PAWS tests, so this check should be solely 13912 * predicated on the sequence space of this segment. 3) That we 13913 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 13914 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 13915 * SEG.Len, This modified check allows us to overcome RFC1323's 13916 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 13917 * p.869. In such cases, we can still calculate the RTT correctly 13918 * when RCV.NXT == Last.ACK.Sent. 13919 */ 13920 if ((to->to_flags & TOF_TS) != 0 && 13921 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 13922 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 13923 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 13924 tp->ts_recent_age = tcp_ts_getticks(); 13925 tp->ts_recent = to->to_tsval; 13926 } 13927 /* 13928 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 13929 * is on (half-synchronized state), then queue data for later 13930 * processing; else drop segment and return. 13931 */ 13932 if ((thflags & TH_ACK) == 0) { 13933 if (tp->t_flags & TF_NEEDSYN) { 13934 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13935 tiwin, thflags, nxt_pkt)); 13936 } else if (tp->t_flags & TF_ACKNOW) { 13937 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 13938 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 13939 return (ret_val); 13940 } else { 13941 ctf_do_drop(m, NULL); 13942 return (0); 13943 } 13944 } 13945 /* 13946 * Ack processing. 
13947 */ 13948 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 13949 return (ret_val); 13950 } 13951 if (ourfinisacked) { 13952 tcp_twstart(tp); 13953 m_freem(m); 13954 return (1); 13955 } 13956 if (sbavail(&so->so_snd)) { 13957 if (ctf_progress_timeout_check(tp, true)) { 13958 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 13959 tp, tick, PROGRESS_DROP, __LINE__); 13960 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 13961 return (1); 13962 } 13963 } 13964 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 13965 tiwin, thflags, nxt_pkt)); 13966 } 13967 13968 /* 13969 * Return value of 1, the TCB is unlocked and most 13970 * likely gone, return value of 0, the TCP is still 13971 * locked. 13972 */ 13973 static int 13974 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 13975 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 13976 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 13977 { 13978 int32_t ret_val = 0; 13979 int32_t ourfinisacked = 0; 13980 struct tcp_rack *rack; 13981 13982 rack = (struct tcp_rack *)tp->t_fb_ptr; 13983 ctf_calc_rwin(so, tp); 13984 13985 if ((thflags & TH_RST) || 13986 (tp->t_fin_is_rst && (thflags & TH_FIN))) 13987 return (__ctf_process_rst(m, th, so, tp, 13988 &rack->r_ctl.challenge_ack_ts, 13989 &rack->r_ctl.challenge_ack_cnt)); 13990 /* 13991 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 13992 * synchronized state. 13993 */ 13994 if (thflags & TH_SYN) { 13995 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 13996 return (ret_val); 13997 } 13998 /* 13999 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 14000 * it's less than ts_recent, drop it. 14001 */ 14002 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 14003 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 14004 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 14005 return (ret_val); 14006 } 14007 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 14008 &rack->r_ctl.challenge_ack_ts, 14009 &rack->r_ctl.challenge_ack_cnt)) { 14010 return (ret_val); 14011 } 14012 /* 14013 * If new data are received on a connection after the user processes 14014 * are gone, then RST the other end. 14015 */ 14016 if ((tp->t_flags & TF_CLOSED) && tlen && 14017 rack_check_data_after_close(m, tp, &tlen, th, so)) 14018 return (1); 14019 /* 14020 * If last ACK falls within this segment's sequence numbers, record 14021 * its timestamp. NOTE: 1) That the test incorporates suggestions 14022 * from the latest proposal of the tcplw@cray.com list (Braden 14023 * 1993/04/26). 2) That updating only on newer timestamps interferes 14024 * with our earlier PAWS tests, so this check should be solely 14025 * predicated on the sequence space of this segment. 3) That we 14026 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 14027 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 14028 * SEG.Len, This modified check allows us to overcome RFC1323's 14029 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 14030 * p.869. In such cases, we can still calculate the RTT correctly 14031 * when RCV.NXT == Last.ACK.Sent. 
14032 */ 14033 if ((to->to_flags & TOF_TS) != 0 && 14034 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 14035 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 14036 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 14037 tp->ts_recent_age = tcp_ts_getticks(); 14038 tp->ts_recent = to->to_tsval; 14039 } 14040 /* 14041 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 14042 * is on (half-synchronized state), then queue data for later 14043 * processing; else drop segment and return. 14044 */ 14045 if ((thflags & TH_ACK) == 0) { 14046 if (tp->t_flags & TF_NEEDSYN) { 14047 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 14048 tiwin, thflags, nxt_pkt)); 14049 } else if (tp->t_flags & TF_ACKNOW) { 14050 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 14051 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 14052 return (ret_val); 14053 } else { 14054 ctf_do_drop(m, NULL); 14055 return (0); 14056 } 14057 } 14058 /* 14059 * case TCPS_LAST_ACK: Ack processing. 14060 */ 14061 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 14062 return (ret_val); 14063 } 14064 if (ourfinisacked) { 14065 tp = tcp_close(tp); 14066 ctf_do_drop(m, tp); 14067 return (1); 14068 } 14069 if (sbavail(&so->so_snd)) { 14070 if (ctf_progress_timeout_check(tp, true)) { 14071 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 14072 tp, tick, PROGRESS_DROP, __LINE__); 14073 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 14074 return (1); 14075 } 14076 } 14077 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 14078 tiwin, thflags, nxt_pkt)); 14079 } 14080 14081 /* 14082 * Return value of 1, the TCB is unlocked and most 14083 * likely gone, return value of 0, the TCP is still 14084 * locked. 14085 */ 14086 static int 14087 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, 14088 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 14089 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 14090 { 14091 int32_t ret_val = 0; 14092 int32_t ourfinisacked = 0; 14093 struct tcp_rack *rack; 14094 14095 rack = (struct tcp_rack *)tp->t_fb_ptr; 14096 ctf_calc_rwin(so, tp); 14097 14098 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 14099 if ((thflags & TH_RST) || 14100 (tp->t_fin_is_rst && (thflags & TH_FIN))) 14101 return (__ctf_process_rst(m, th, so, tp, 14102 &rack->r_ctl.challenge_ack_ts, 14103 &rack->r_ctl.challenge_ack_cnt)); 14104 /* 14105 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 14106 * synchronized state. 14107 */ 14108 if (thflags & TH_SYN) { 14109 ctf_challenge_ack(m, th, tp, iptos, &ret_val); 14110 return (ret_val); 14111 } 14112 /* 14113 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 14114 * it's less than ts_recent, drop it. 14115 */ 14116 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 14117 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 14118 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 14119 return (ret_val); 14120 } 14121 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 14122 &rack->r_ctl.challenge_ack_ts, 14123 &rack->r_ctl.challenge_ack_cnt)) { 14124 return (ret_val); 14125 } 14126 /* 14127 * If new data are received on a connection after the user processes 14128 * are gone, then RST the other end. 
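 * (rack_check_data_after_close() may instead accept and silently
 * discard the data when rc_allow_data_af_clo is set, arranging for a
 * follow-up reset rather than resetting immediately.)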
14129 */ 14130 if ((tp->t_flags & TF_CLOSED) && tlen && 14131 rack_check_data_after_close(m, tp, &tlen, th, so)) 14132 return (1); 14133 /* 14134 * If last ACK falls within this segment's sequence numbers, record 14135 * its timestamp. NOTE: 1) That the test incorporates suggestions 14136 * from the latest proposal of the tcplw@cray.com list (Braden 14137 * 1993/04/26). 2) That updating only on newer timestamps interferes 14138 * with our earlier PAWS tests, so this check should be solely 14139 * predicated on the sequence space of this segment. 3) That we 14140 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 14141 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 14142 * SEG.Len, This modified check allows us to overcome RFC1323's 14143 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 14144 * p.869. In such cases, we can still calculate the RTT correctly 14145 * when RCV.NXT == Last.ACK.Sent. 14146 */ 14147 if ((to->to_flags & TOF_TS) != 0 && 14148 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 14149 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 14150 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 14151 tp->ts_recent_age = tcp_ts_getticks(); 14152 tp->ts_recent = to->to_tsval; 14153 } 14154 /* 14155 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 14156 * is on (half-synchronized state), then queue data for later 14157 * processing; else drop segment and return. 14158 */ 14159 if ((thflags & TH_ACK) == 0) { 14160 if (tp->t_flags & TF_NEEDSYN) { 14161 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 14162 tiwin, thflags, nxt_pkt)); 14163 } else if (tp->t_flags & TF_ACKNOW) { 14164 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 14165 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 14166 return (ret_val); 14167 } else { 14168 ctf_do_drop(m, NULL); 14169 return (0); 14170 } 14171 } 14172 /* 14173 * Ack processing. 
14174 */ 14175 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 14176 return (ret_val); 14177 } 14178 if (sbavail(&so->so_snd)) { 14179 if (ctf_progress_timeout_check(tp, true)) { 14180 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 14181 tp, tick, PROGRESS_DROP, __LINE__); 14182 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 14183 return (1); 14184 } 14185 } 14186 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 14187 tiwin, thflags, nxt_pkt)); 14188 } 14189 14190 static void inline 14191 rack_clear_rate_sample(struct tcp_rack *rack) 14192 { 14193 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY; 14194 rack->r_ctl.rack_rs.rs_rtt_cnt = 0; 14195 rack->r_ctl.rack_rs.rs_rtt_tot = 0; 14196 } 14197 14198 static void 14199 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override) 14200 { 14201 uint64_t bw_est, rate_wanted; 14202 int chged = 0; 14203 uint32_t user_max, orig_min, orig_max; 14204 14205 #ifdef TCP_REQUEST_TRK 14206 if (rack->rc_hybrid_mode && 14207 (rack->r_ctl.rc_pace_max_segs != 0) && 14208 (rack_hybrid_allow_set_maxseg == 1) && 14209 (rack->r_ctl.rc_last_sft != NULL)) { 14210 rack->r_ctl.rc_last_sft->hybrid_flags &= ~TCP_HYBRID_PACING_SETMSS; 14211 return; 14212 } 14213 #endif 14214 orig_min = rack->r_ctl.rc_pace_min_segs; 14215 orig_max = rack->r_ctl.rc_pace_max_segs; 14216 user_max = ctf_fixed_maxseg(tp) * rack->rc_user_set_max_segs; 14217 if (ctf_fixed_maxseg(tp) != rack->r_ctl.rc_pace_min_segs) 14218 chged = 1; 14219 rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp); 14220 if (rack->use_fixed_rate || rack->rc_force_max_seg) { 14221 if (user_max != rack->r_ctl.rc_pace_max_segs) 14222 chged = 1; 14223 } 14224 if (rack->rc_force_max_seg) { 14225 rack->r_ctl.rc_pace_max_segs = user_max; 14226 } else if (rack->use_fixed_rate) { 14227 bw_est = rack_get_bw(rack); 14228 if ((rack->r_ctl.crte == NULL) || 14229 (bw_est != rack->r_ctl.crte->rate)) { 14230 rack->r_ctl.rc_pace_max_segs = user_max; 14231 } else { 14232 /* We are pacing right at the hardware rate */ 14233 uint32_t segsiz, pace_one; 14234 14235 if (rack_pace_one_seg || 14236 (rack->r_ctl.rc_user_set_min_segs == 1)) 14237 pace_one = 1; 14238 else 14239 pace_one = 0; 14240 segsiz = min(ctf_fixed_maxseg(tp), 14241 rack->r_ctl.rc_pace_min_segs); 14242 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size_w_divisor( 14243 tp, bw_est, segsiz, pace_one, 14244 rack->r_ctl.crte, NULL, rack->r_ctl.pace_len_divisor); 14245 } 14246 } else if (rack->rc_always_pace) { 14247 if (rack->r_ctl.gp_bw || 14248 rack->r_ctl.init_rate) { 14249 /* We have a rate of some sort set */ 14250 uint32_t orig; 14251 14252 bw_est = rack_get_bw(rack); 14253 orig = rack->r_ctl.rc_pace_max_segs; 14254 if (fill_override) 14255 rate_wanted = *fill_override; 14256 else 14257 rate_wanted = rack_get_output_bw(rack, bw_est, NULL, NULL); 14258 if (rate_wanted) { 14259 /* We have something */ 14260 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, 14261 rate_wanted, 14262 ctf_fixed_maxseg(rack->rc_tp)); 14263 } else 14264 rack->r_ctl.rc_pace_max_segs = rack->r_ctl.rc_pace_min_segs; 14265 if (orig != rack->r_ctl.rc_pace_max_segs) 14266 chged = 1; 14267 } else if ((rack->r_ctl.gp_bw == 0) && 14268 (rack->r_ctl.rc_pace_max_segs == 0)) { 14269 /* 14270 * If we have nothing limit us to bursting 14271 * out IW sized pieces. 
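 * (rc_init_window() yields the initial window in bytes, so until a
 * bandwidth estimate exists each pacing burst is capped at roughly one
 * IW worth of data.)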
14272 */ 14273 chged = 1; 14274 rack->r_ctl.rc_pace_max_segs = rc_init_window(rack); 14275 } 14276 } 14277 if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) { 14278 chged = 1; 14279 rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES; 14280 } 14281 if (chged) 14282 rack_log_type_pacing_sizes(tp, rack, orig_min, orig_max, line, 2); 14283 } 14284 14285 14286 static void 14287 rack_init_fsb_block(struct tcpcb *tp, struct tcp_rack *rack, int32_t flags) 14288 { 14289 #ifdef INET6 14290 struct ip6_hdr *ip6 = NULL; 14291 #endif 14292 #ifdef INET 14293 struct ip *ip = NULL; 14294 #endif 14295 struct udphdr *udp = NULL; 14296 14297 /* Ok lets fill in the fast block, it can only be used with no IP options! */ 14298 #ifdef INET6 14299 if (rack->r_is_v6) { 14300 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 14301 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 14302 if (tp->t_port) { 14303 rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr); 14304 udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr)); 14305 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 14306 udp->uh_dport = tp->t_port; 14307 rack->r_ctl.fsb.udp = udp; 14308 rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1); 14309 } else 14310 { 14311 rack->r_ctl.fsb.th = (struct tcphdr *)(ip6 + 1); 14312 rack->r_ctl.fsb.udp = NULL; 14313 } 14314 tcpip_fillheaders(rack->rc_inp, 14315 tp->t_port, 14316 ip6, rack->r_ctl.fsb.th); 14317 rack->r_ctl.fsb.hoplimit = in6_selecthlim(rack->rc_inp, NULL); 14318 } else 14319 #endif /* INET6 */ 14320 #ifdef INET 14321 { 14322 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr); 14323 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 14324 if (tp->t_port) { 14325 rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr); 14326 udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip)); 14327 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 14328 udp->uh_dport = tp->t_port; 14329 rack->r_ctl.fsb.udp = udp; 14330 rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1); 14331 } else 14332 { 14333 rack->r_ctl.fsb.udp = NULL; 14334 rack->r_ctl.fsb.th = (struct tcphdr *)(ip + 1); 14335 } 14336 tcpip_fillheaders(rack->rc_inp, 14337 tp->t_port, 14338 ip, rack->r_ctl.fsb.th); 14339 rack->r_ctl.fsb.hoplimit = tptoinpcb(tp)->inp_ip_ttl; 14340 } 14341 #endif 14342 rack->r_ctl.fsb.recwin = lmin(lmax(sbspace(&tptosocket(tp)->so_rcv), 0), 14343 (long)TCP_MAXWIN << tp->rcv_scale); 14344 rack->r_fsb_inited = 1; 14345 } 14346 14347 static int 14348 rack_init_fsb(struct tcpcb *tp, struct tcp_rack *rack) 14349 { 14350 /* 14351 * Allocate the larger of spaces V6 if available else just 14352 * V4 and include udphdr (overbook) 14353 */ 14354 #ifdef INET6 14355 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + sizeof(struct udphdr); 14356 #else 14357 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr) + sizeof(struct udphdr); 14358 #endif 14359 rack->r_ctl.fsb.tcp_ip_hdr = malloc(rack->r_ctl.fsb.tcp_ip_hdr_len, 14360 M_TCPFSB, M_NOWAIT|M_ZERO); 14361 if (rack->r_ctl.fsb.tcp_ip_hdr == NULL) { 14362 return (ENOMEM); 14363 } 14364 rack->r_fsb_inited = 0; 14365 return (0); 14366 } 14367 14368 static void 14369 rack_log_hystart_event(struct tcp_rack *rack, uint32_t high_seq, uint8_t mod) 14370 { 14371 /* 14372 * Types of logs (mod value) 14373 * 20 - Initial round setup 14374 * 21 - Rack declares a new round. 
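 * In the log record below flex1 = r_ctl.current_round, flex2 =
 * r_ctl.roundends, flex3 = the high_seq argument and flex4 = snd_max
 * at the time of logging.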
14375 */ 14376 struct tcpcb *tp; 14377 14378 tp = rack->rc_tp; 14379 if (tcp_bblogging_on(tp)) { 14380 union tcp_log_stackspecific log; 14381 struct timeval tv; 14382 14383 memset(&log, 0, sizeof(log)); 14384 log.u_bbr.flex1 = rack->r_ctl.current_round; 14385 log.u_bbr.flex2 = rack->r_ctl.roundends; 14386 log.u_bbr.flex3 = high_seq; 14387 log.u_bbr.flex4 = tp->snd_max; 14388 log.u_bbr.flex8 = mod; 14389 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 14390 log.u_bbr.cur_del_rate = rack->rc_tp->t_sndbytes; 14391 log.u_bbr.delRate = rack->rc_tp->t_snd_rxt_bytes; 14392 TCP_LOG_EVENTP(tp, NULL, 14393 &tptosocket(tp)->so_rcv, 14394 &tptosocket(tp)->so_snd, 14395 TCP_HYSTART, 0, 14396 0, &log, false, &tv); 14397 } 14398 } 14399 14400 static void 14401 rack_deferred_init(struct tcpcb *tp, struct tcp_rack *rack) 14402 { 14403 rack->rack_deferred_inited = 1; 14404 rack->r_ctl.roundends = tp->snd_max; 14405 rack->r_ctl.rc_high_rwnd = tp->snd_wnd; 14406 rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 14407 } 14408 14409 static void 14410 rack_init_retransmit_value(struct tcp_rack *rack, int ctl) 14411 { 14412 /* Retransmit bit controls. 14413 * 14414 * The setting of these values controls one of 14415 * three modes you can have and dictates 14416 * how rack does retransmissions. Note this 14417 * applies in *any* mode, i.e. pacing on or off, DGP, 14418 * fixed rate pacing, or just bursting rack. 14419 * 14420 * 1 - Use full sized retransmits, i.e. limit 14421 * the size to whatever the pace_max_segments 14422 * size is. 14423 * 14424 * 2 - Use the pacer min granularity as a guide to 14425 * the size, combined with the current calculated 14426 * goodput b/w measurement. So for example if 14427 * the goodput is measured at 20Mbps we would 14428 * calculate 8125 (pacer minimum 250usec in 14429 * that b/w) and then round it up to the next 14430 * MSS, i.e. for a 1448 mss 6 MSS or 8688 bytes. 14431 * 14432 * 0 - The rack default of 1 MSS (anything not 0/1/2 14433 * falls here too if we are setting via rack_init()).
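 * Put another way, setting 2 aims for roughly
 *   roundup(goodput_in_bytes_per_sec * pacer_min_granularity, MSS)
 * per retransmission; the numbers above are only an example and the
 * precise granularity constant lives in the pacing code.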
14434 * 14435 */ 14436 if (ctl == 1) { 14437 rack->full_size_rxt = 1; 14438 rack->shape_rxt_to_pacing_min = 0; 14439 } else if (ctl == 2) { 14440 rack->full_size_rxt = 0; 14441 rack->shape_rxt_to_pacing_min = 1; 14442 } else { 14443 rack->full_size_rxt = 0; 14444 rack->shape_rxt_to_pacing_min = 0; 14445 } 14446 } 14447 14448 static void 14449 rack_log_chg_info(struct tcpcb *tp, struct tcp_rack *rack, uint8_t mod, 14450 uint32_t flex1, 14451 uint32_t flex2, 14452 uint32_t flex3) 14453 { 14454 if (tcp_bblogging_on(rack->rc_tp)) { 14455 union tcp_log_stackspecific log; 14456 struct timeval tv; 14457 14458 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 14459 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 14460 log.u_bbr.flex8 = mod; 14461 log.u_bbr.flex1 = flex1; 14462 log.u_bbr.flex2 = flex2; 14463 log.u_bbr.flex3 = flex3; 14464 tcp_log_event(tp, NULL, NULL, NULL, TCP_CHG_QUERY, 0, 14465 0, &log, false, NULL, __func__, __LINE__, &tv); 14466 } 14467 } 14468 14469 static int 14470 rack_chg_query(struct tcpcb *tp, struct tcp_query_resp *reqr) 14471 { 14472 struct tcp_rack *rack; 14473 struct rack_sendmap *rsm; 14474 int i; 14475 14476 14477 rack = (struct tcp_rack *)tp->t_fb_ptr; 14478 switch (reqr->req) { 14479 case TCP_QUERY_SENDMAP: 14480 if ((reqr->req_param == tp->snd_max) || 14481 (tp->snd_max == tp->snd_una)){ 14482 /* Unlikely */ 14483 return (0); 14484 } 14485 rsm = tqhash_find(rack->r_ctl.tqh, reqr->req_param); 14486 if (rsm == NULL) { 14487 /* Can't find that seq -- unlikely */ 14488 return (0); 14489 } 14490 reqr->sendmap_start = rsm->r_start; 14491 reqr->sendmap_end = rsm->r_end; 14492 reqr->sendmap_send_cnt = rsm->r_rtr_cnt; 14493 reqr->sendmap_fas = rsm->r_fas; 14494 if (reqr->sendmap_send_cnt > SNDMAP_NRTX) 14495 reqr->sendmap_send_cnt = SNDMAP_NRTX; 14496 for(i=0; i<reqr->sendmap_send_cnt; i++) 14497 reqr->sendmap_time[i] = rsm->r_tim_lastsent[i]; 14498 reqr->sendmap_ack_arrival = rsm->r_ack_arrival; 14499 reqr->sendmap_flags = rsm->r_flags & SNDMAP_MASK; 14500 reqr->sendmap_r_rtr_bytes = rsm->r_rtr_bytes; 14501 reqr->sendmap_dupacks = rsm->r_dupack; 14502 rack_log_chg_info(tp, rack, 1, 14503 rsm->r_start, 14504 rsm->r_end, 14505 rsm->r_flags); 14506 return(1); 14507 break; 14508 case TCP_QUERY_TIMERS_UP: 14509 if (rack->r_ctl.rc_hpts_flags == 0) { 14510 /* no timers up */ 14511 return (0); 14512 } 14513 reqr->timer_hpts_flags = rack->r_ctl.rc_hpts_flags; 14514 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 14515 reqr->timer_pacing_to = rack->r_ctl.rc_last_output_to; 14516 } 14517 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 14518 reqr->timer_timer_exp = rack->r_ctl.rc_timer_exp; 14519 } 14520 rack_log_chg_info(tp, rack, 2, 14521 rack->r_ctl.rc_hpts_flags, 14522 rack->r_ctl.rc_last_output_to, 14523 rack->r_ctl.rc_timer_exp); 14524 return (1); 14525 break; 14526 case TCP_QUERY_RACK_TIMES: 14527 /* Reordering items */ 14528 reqr->rack_num_dsacks = rack->r_ctl.num_dsack; 14529 reqr->rack_reorder_ts = rack->r_ctl.rc_reorder_ts; 14530 /* Timerstamps and timers */ 14531 reqr->rack_rxt_last_time = rack->r_ctl.rc_tlp_rxt_last_time; 14532 reqr->rack_min_rtt = rack->r_ctl.rc_rack_min_rtt; 14533 reqr->rack_rtt = rack->rc_rack_rtt; 14534 reqr->rack_tmit_time = rack->r_ctl.rc_rack_tmit_time; 14535 reqr->rack_srtt_measured = rack->rc_srtt_measure_made; 14536 /* PRR data */ 14537 reqr->rack_sacked = rack->r_ctl.rc_sacked; 14538 reqr->rack_holes_rxt = rack->r_ctl.rc_holes_rxt; 14539 reqr->rack_prr_delivered = rack->r_ctl.rc_prr_delivered; 14540 reqr->rack_prr_recovery_fs = 
rack->r_ctl.rc_prr_recovery_fs; 14541 reqr->rack_prr_sndcnt = rack->r_ctl.rc_prr_sndcnt; 14542 reqr->rack_prr_out = rack->r_ctl.rc_prr_out; 14543 /* TLP and persists info */ 14544 reqr->rack_tlp_out = rack->rc_tlp_in_progress; 14545 reqr->rack_tlp_cnt_out = rack->r_ctl.rc_tlp_cnt_out; 14546 if (rack->rc_in_persist) { 14547 reqr->rack_time_went_idle = rack->r_ctl.rc_went_idle_time; 14548 reqr->rack_in_persist = 1; 14549 } else { 14550 reqr->rack_time_went_idle = 0; 14551 reqr->rack_in_persist = 0; 14552 } 14553 if (rack->r_wanted_output) 14554 reqr->rack_wanted_output = 1; 14555 else 14556 reqr->rack_wanted_output = 0; 14557 return (1); 14558 break; 14559 default: 14560 return (-EINVAL); 14561 } 14562 } 14563 14564 static void 14565 rack_switch_failed(struct tcpcb *tp) 14566 { 14567 /* 14568 * This method gets called if a stack switch was 14569 * attempted and it failed. We are left 14570 * but our hpts timers were stopped and we 14571 * need to validate time units and t_flags2. 14572 */ 14573 struct tcp_rack *rack; 14574 struct timeval tv; 14575 uint32_t cts; 14576 uint32_t toval; 14577 struct hpts_diag diag; 14578 14579 rack = (struct tcp_rack *)tp->t_fb_ptr; 14580 tcp_change_time_units(tp, TCP_TMR_GRANULARITY_USEC); 14581 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 14582 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 14583 else 14584 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ; 14585 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) 14586 tp->t_flags2 |= TF2_MBUF_ACKCMP; 14587 if (tp->t_in_hpts > IHPTS_NONE) { 14588 /* Strange */ 14589 return; 14590 } 14591 cts = tcp_get_usecs(&tv); 14592 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 14593 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) { 14594 toval = rack->r_ctl.rc_last_output_to - cts; 14595 } else { 14596 /* one slot please */ 14597 toval = HPTS_TICKS_PER_SLOT; 14598 } 14599 } else if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 14600 if (TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { 14601 toval = rack->r_ctl.rc_timer_exp - cts; 14602 } else { 14603 /* one slot please */ 14604 toval = HPTS_TICKS_PER_SLOT; 14605 } 14606 } else 14607 toval = HPTS_TICKS_PER_SLOT; 14608 (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(toval), 14609 __LINE__, &diag); 14610 rack_log_hpts_diag(rack, cts, &diag, &tv); 14611 } 14612 14613 static int 14614 rack_init_outstanding(struct tcpcb *tp, struct tcp_rack *rack, uint32_t us_cts, void *ptr) 14615 { 14616 struct rack_sendmap *rsm, *ersm; 14617 int insret __diagused; 14618 /* 14619 * When initing outstanding, we must be quite careful 14620 * to not refer to tp->t_fb_ptr. This has the old rack 14621 * pointer in it, not the "new" one (when we are doing 14622 * a stack switch). 
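 * (For the same reason the allocation to release on failure is passed
 * in via the ptr argument; that is what uma_zfree() is handed below if
 * we run out of memory part way through.)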
14623 */ 14624 14625 14626 if (tp->t_fb->tfb_chg_query == NULL) { 14627 /* Create a send map for the current outstanding data */ 14628 14629 rsm = rack_alloc(rack); 14630 if (rsm == NULL) { 14631 uma_zfree(rack_pcb_zone, ptr); 14632 return (ENOMEM); 14633 } 14634 rsm->r_no_rtt_allowed = 1; 14635 rsm->r_tim_lastsent[0] = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 14636 rsm->r_rtr_cnt = 1; 14637 rsm->r_rtr_bytes = 0; 14638 if (tp->t_flags & TF_SENTFIN) 14639 rsm->r_flags |= RACK_HAS_FIN; 14640 rsm->r_end = tp->snd_max; 14641 if (tp->snd_una == tp->iss) { 14642 /* The data space is one beyond snd_una */ 14643 rsm->r_flags |= RACK_HAS_SYN; 14644 rsm->r_start = tp->iss; 14645 rsm->r_end = rsm->r_start + (tp->snd_max - tp->snd_una); 14646 } else 14647 rsm->r_start = tp->snd_una; 14648 rsm->r_dupack = 0; 14649 if (rack->rc_inp->inp_socket->so_snd.sb_mb != NULL) { 14650 rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 0, &rsm->soff); 14651 if (rsm->m) { 14652 rsm->orig_m_len = rsm->m->m_len; 14653 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 14654 } else { 14655 rsm->orig_m_len = 0; 14656 rsm->orig_t_space = 0; 14657 } 14658 } else { 14659 /* 14660 * This can happen if we have a stand-alone FIN or 14661 * SYN. 14662 */ 14663 rsm->m = NULL; 14664 rsm->orig_m_len = 0; 14665 rsm->orig_t_space = 0; 14666 rsm->soff = 0; 14667 } 14668 #ifdef INVARIANTS 14669 if ((insret = tqhash_insert(rack->r_ctl.tqh, rsm)) != 0) { 14670 panic("Insert in rb tree fails ret:%d rack:%p rsm:%p", 14671 insret, rack, rsm); 14672 } 14673 #else 14674 (void)tqhash_insert(rack->r_ctl.tqh, rsm); 14675 #endif 14676 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 14677 rsm->r_in_tmap = 1; 14678 } else { 14679 /* We have a query mechanism, lets use it */ 14680 struct tcp_query_resp qr; 14681 int i; 14682 tcp_seq at; 14683 14684 at = tp->snd_una; 14685 while (at != tp->snd_max) { 14686 memset(&qr, 0, sizeof(qr)); 14687 qr.req = TCP_QUERY_SENDMAP; 14688 qr.req_param = at; 14689 if ((*tp->t_fb->tfb_chg_query)(tp, &qr) == 0) 14690 break; 14691 /* Move forward */ 14692 at = qr.sendmap_end; 14693 /* Now lets build the entry for this one */ 14694 rsm = rack_alloc(rack); 14695 if (rsm == NULL) { 14696 uma_zfree(rack_pcb_zone, ptr); 14697 return (ENOMEM); 14698 } 14699 memset(rsm, 0, sizeof(struct rack_sendmap)); 14700 /* Now configure the rsm and insert it */ 14701 rsm->r_dupack = qr.sendmap_dupacks; 14702 rsm->r_start = qr.sendmap_start; 14703 rsm->r_end = qr.sendmap_end; 14704 if (qr.sendmap_fas) 14705 rsm->r_fas = qr.sendmap_end; 14706 else 14707 rsm->r_fas = rsm->r_start - tp->snd_una; 14708 /* 14709 * We have carefully aligned the bits 14710 * so that all we have to do is copy over 14711 * the bits with the mask. 
14712 */ 14713 rsm->r_flags = qr.sendmap_flags & SNDMAP_MASK; 14714 rsm->r_rtr_bytes = qr.sendmap_r_rtr_bytes; 14715 rsm->r_rtr_cnt = qr.sendmap_send_cnt; 14716 rsm->r_ack_arrival = qr.sendmap_ack_arrival; 14717 for (i=0 ; i<rsm->r_rtr_cnt; i++) 14718 rsm->r_tim_lastsent[i] = qr.sendmap_time[i]; 14719 rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 14720 (rsm->r_start - tp->snd_una), &rsm->soff); 14721 if (rsm->m) { 14722 rsm->orig_m_len = rsm->m->m_len; 14723 rsm->orig_t_space = M_TRAILINGROOM(rsm->m); 14724 } else { 14725 rsm->orig_m_len = 0; 14726 rsm->orig_t_space = 0; 14727 } 14728 #ifdef INVARIANTS 14729 if ((insret = tqhash_insert(rack->r_ctl.tqh, rsm)) != 0) { 14730 panic("Insert in rb tree fails ret:%d rack:%p rsm:%p", 14731 insret, rack, rsm); 14732 } 14733 #else 14734 (void)tqhash_insert(rack->r_ctl.tqh, rsm); 14735 #endif 14736 if ((rsm->r_flags & RACK_ACKED) == 0) { 14737 TAILQ_FOREACH(ersm, &rack->r_ctl.rc_tmap, r_tnext) { 14738 if (ersm->r_tim_lastsent[(ersm->r_rtr_cnt-1)] > 14739 rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]) { 14740 /* 14741 * If the existing ersm was sent at 14742 * a later time than the new one, then 14743 * the new one should appear ahead of this 14744 * ersm. 14745 */ 14746 rsm->r_in_tmap = 1; 14747 TAILQ_INSERT_BEFORE(ersm, rsm, r_tnext); 14748 break; 14749 } 14750 } 14751 if (rsm->r_in_tmap == 0) { 14752 /* 14753 * Not found so shove it on the tail. 14754 */ 14755 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 14756 rsm->r_in_tmap = 1; 14757 } 14758 } else { 14759 if ((rack->r_ctl.rc_sacklast == NULL) || 14760 (SEQ_GT(rsm->r_end, rack->r_ctl.rc_sacklast->r_end))) { 14761 rack->r_ctl.rc_sacklast = rsm; 14762 } 14763 } 14764 rack_log_chg_info(tp, rack, 3, 14765 rsm->r_start, 14766 rsm->r_end, 14767 rsm->r_flags); 14768 } 14769 } 14770 return (0); 14771 } 14772 14773 static void 14774 rack_translate_clamp_value(struct tcp_rack *rack, uint32_t optval) 14775 { 14776 /* 14777 * P = percent bits 14778 * F = fill cw bit -- Toggle fillcw if this bit is set. 14779 * S = Segment bits 14780 * M = set max segment bit 14781 * U = Unclaimed (unused) bits 14782 * C = If set to non-zero override the max number of clamps. 14783 * L = Bit to indicate if clamped gets lower. 14784 * 14785 * CCCC CCCC UUUU UULF PPPP PPPP PPPP PPPP 14786 * 14787 * The lowest 16 bits hold the percentage, .1 - 6553.5%, 14788 * where 10.1 = 101, max 6553.5. 14789 * The upper 16 bits hold the options. 14790 * The F bit will turn fill-cw on if you are 14791 * not pacing; it will turn it off if dgp is on. 14792 * The L bit will change it so when clamped we get 14793 * the min(gp, lt-bw) for dgp.
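 * Worked example (an illustrative value, not a recommended setting):
 *   optval 0x05020065 -> P = 0x0065 = 101, i.e. a 10.1% threshold,
 *   F = 0 (fillcw not toggled), L = 1 (clamped gets lower) and
 *   C = 0x05, so at most 5 clamps are applied.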
14794 */ 14795 uint16_t per; 14796 14797 rack->r_ctl.saved_rxt_clamp_val = optval; 14798 per = optval & 0x0000ffff; 14799 rack->r_ctl.rxt_threshold = (uint64_t)(per & 0xffff); 14800 if (optval > 0) { 14801 uint16_t clamp_opt; 14802 14803 rack->excess_rxt_on = 1; 14804 clamp_opt = ((optval & 0xffff0000) >> 16); 14805 rack->r_ctl.clamp_options = clamp_opt & 0x00ff; 14806 if (clamp_opt & 0xff00) { 14807 /* A max clamps is also present */ 14808 rack->r_ctl.max_clamps = (clamp_opt >> 8); 14809 } else { 14810 /* No specified clamps means no limit */ 14811 rack->r_ctl.max_clamps = 0; 14812 } 14813 if (rack->r_ctl.clamp_options & 0x0002) { 14814 rack->r_clamped_gets_lower = 1; 14815 } else { 14816 rack->r_clamped_gets_lower = 0; 14817 } 14818 } else { 14819 /* Turn it off back to default */ 14820 rack->excess_rxt_on = 0; 14821 rack->r_clamped_gets_lower = 0; 14822 } 14823 14824 } 14825 14826 14827 static int32_t 14828 rack_init(struct tcpcb *tp, void **ptr) 14829 { 14830 struct inpcb *inp = tptoinpcb(tp); 14831 struct tcp_rack *rack = NULL; 14832 uint32_t iwin, snt, us_cts; 14833 int err, no_query; 14834 14835 /* 14836 * First are we the initial or are we a switched stack? 14837 * If we are initing via tcp_newtcppcb the ptr passed 14838 * will be tp->t_fb_ptr. If its a stack switch that 14839 * has a previous stack we can query it will be a local 14840 * var that will in the end be set into t_fb_ptr. 14841 */ 14842 if (ptr == &tp->t_fb_ptr) 14843 no_query = 1; 14844 else 14845 no_query = 0; 14846 *ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT); 14847 if (*ptr == NULL) { 14848 /* 14849 * We need to allocate memory but cant. The INP and INP_INFO 14850 * locks and they are recursive (happens during setup. So a 14851 * scheme to drop the locks fails :( 14852 * 14853 */ 14854 return(ENOMEM); 14855 } 14856 memset(*ptr, 0, sizeof(struct tcp_rack)); 14857 rack = (struct tcp_rack *)*ptr; 14858 rack->r_ctl.tqh = malloc(sizeof(struct tailq_hash), M_TCPFSB, M_NOWAIT); 14859 if (rack->r_ctl.tqh == NULL) { 14860 uma_zfree(rack_pcb_zone, rack); 14861 return(ENOMEM); 14862 } 14863 tqhash_init(rack->r_ctl.tqh); 14864 TAILQ_INIT(&rack->r_ctl.rc_free); 14865 TAILQ_INIT(&rack->r_ctl.rc_tmap); 14866 rack->rc_tp = tp; 14867 rack->rc_inp = inp; 14868 /* Set the flag */ 14869 rack->r_is_v6 = (inp->inp_vflag & INP_IPV6) != 0; 14870 /* Probably not needed but lets be sure */ 14871 rack_clear_rate_sample(rack); 14872 /* 14873 * Save off the default values, socket options will poke 14874 * at these if pacing is not on or we have not yet 14875 * reached where pacing is on (gp_ready/fixed enabled). 14876 * When they get set into the CC module (when gp_ready 14877 * is enabled or we enable fixed) then we will set these 14878 * values into the CC and place in here the old values 14879 * so we have a restoral. Then we will set the flag 14880 * rc_pacing_cc_set. That way whenever we turn off pacing 14881 * or switch off this stack, we will know to go restore 14882 * the saved values. 14883 * 14884 * We specifically put into the beta the ecn value for pacing. 
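 *
 * A rough sketch with hypothetical defaults: if V_newreno_beta were
 * 50 and V_newreno_beta_ecn were 80, rc_saved_beta would start out
 * below as { beta = 80, beta_ecn = 80 } (both get the ecn value, as
 * noted above). When pacing is enabled these are pushed into the CC
 * module and the CC module's previous values are parked here, so
 * turning pacing off (or leaving this stack) can restore the CC
 * module to exactly what it had before.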
14885 */ 14886 rack->rc_new_rnd_needed = 1; 14887 rack->r_ctl.rc_split_limit = V_tcp_map_split_limit; 14888 rack->r_ctl.rc_saved_beta.beta = V_newreno_beta_ecn; 14889 rack->r_ctl.rc_saved_beta.beta_ecn = V_newreno_beta_ecn; 14890 /* We want abe like behavior as well */ 14891 rack->r_ctl.rc_saved_beta.newreno_flags |= CC_NEWRENO_BETA_ECN_ENABLED; 14892 rack->r_ctl.rc_reorder_fade = rack_reorder_fade; 14893 rack->rc_allow_data_af_clo = rack_ignore_data_after_close; 14894 rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh; 14895 if (rack_rxt_clamp_thresh) { 14896 rack_translate_clamp_value(rack, rack_rxt_clamp_thresh); 14897 rack->excess_rxt_on = 1; 14898 } 14899 if (rack_uses_full_dgp_in_rec) 14900 rack->r_ctl.full_dgp_in_rec = 1; 14901 if (rack_fill_cw_state) 14902 rack->rc_pace_to_cwnd = 1; 14903 if (rack_pacing_min_seg) 14904 rack->r_ctl.rc_user_set_min_segs = rack_pacing_min_seg; 14905 if (use_rack_rr) 14906 rack->use_rack_rr = 1; 14907 if (rack_dnd_default) { 14908 rack->rc_pace_dnd = 1; 14909 } 14910 if (V_tcp_delack_enabled) 14911 tp->t_delayed_ack = 1; 14912 else 14913 tp->t_delayed_ack = 0; 14914 #ifdef TCP_ACCOUNTING 14915 if (rack_tcp_accounting) { 14916 tp->t_flags2 |= TF2_TCP_ACCOUNTING; 14917 } 14918 #endif 14919 rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_per_upper_bound_ss; 14920 rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_per_upper_bound_ca; 14921 if (rack_enable_shared_cwnd) 14922 rack->rack_enable_scwnd = 1; 14923 rack->r_ctl.pace_len_divisor = rack_default_pacing_divisor; 14924 rack->rc_user_set_max_segs = rack_hptsi_segments; 14925 rack->rc_force_max_seg = 0; 14926 TAILQ_INIT(&rack->r_ctl.opt_list); 14927 if (rack_hibeta_setting) 14928 rack->rack_hibeta = 1; 14929 rack->r_ctl.rc_reorder_shift = rack_reorder_thresh; 14930 rack->r_ctl.rc_pkt_delay = rack_pkt_delay; 14931 rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp; 14932 rack->r_ctl.rc_lowest_us_rtt = 0xffffffff; 14933 rack->r_ctl.rc_highest_us_rtt = 0; 14934 rack->r_ctl.bw_rate_cap = rack_bw_rate_cap; 14935 rack->r_ctl.timer_slop = TICKS_2_USEC(tcp_rexmit_slop); 14936 if (rack_use_cmp_acks) 14937 rack->r_use_cmp_ack = 1; 14938 if (rack_disable_prr) 14939 rack->rack_no_prr = 1; 14940 if (rack_gp_no_rec_chg) 14941 rack->rc_gp_no_rec_chg = 1; 14942 if (rack_pace_every_seg && tcp_can_enable_pacing()) { 14943 rack->rc_always_pace = 1; 14944 if ((rack->gp_ready) && (rack->rc_always_pace && (rack->use_fixed_rate == 0))) 14945 rack_set_cc_pacing(rack); 14946 } else 14947 rack->rc_always_pace = 0; 14948 if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack) 14949 rack->r_mbuf_queue = 1; 14950 else 14951 rack->r_mbuf_queue = 0; 14952 rack_set_pace_segments(tp, rack, __LINE__, NULL); 14953 if (rack_limits_scwnd) 14954 rack->r_limit_scw = 1; 14955 else 14956 rack->r_limit_scw = 0; 14957 rack_init_retransmit_value(rack, rack_rxt_controls); 14958 rack->rc_labc = V_tcp_abc_l_var; 14959 rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method; 14960 rack->rack_tlp_threshold_use = rack_tlp_threshold_use; 14961 rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; 14962 rack->r_ctl.rc_min_to = rack_min_to; 14963 microuptime(&rack->r_ctl.act_rcv_time); 14964 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; 14965 rack->rc_init_win = rack_default_init_window; 14966 rack->r_ctl.rack_per_of_gp_ss = rack_per_of_gp_ss; 14967 if (rack_hw_up_only) 14968 rack->r_up_only = 1; 14969 if (rack_do_dyn_mul) { 14970 /* When dynamic adjustment is on CA needs to start at 100% */ 14971 rack->rc_gp_dyn_mul = 1; 14972 
if (rack_do_dyn_mul >= 100) 14973 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul; 14974 } else 14975 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca; 14976 rack->r_ctl.rack_per_of_gp_rec = rack_per_of_gp_rec; 14977 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 14978 rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time); 14979 setup_time_filter_small(&rack->r_ctl.rc_gp_min_rtt, FILTER_TYPE_MIN, 14980 rack_probertt_filter_life); 14981 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 14982 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 14983 rack->r_ctl.rc_time_of_last_probertt = us_cts; 14984 rack->r_ctl.challenge_ack_ts = tcp_ts_getticks(); 14985 rack->r_ctl.rc_time_probertt_starts = 0; 14986 if (rack_dsack_std_based & 0x1) { 14987 /* Basically this means all rack timers are at least (srtt + 1/4 srtt) */ 14988 rack->rc_rack_tmr_std_based = 1; 14989 } 14990 if (rack_dsack_std_based & 0x2) { 14991 /* Basically this means rack timers are extended based on dsack by up to (2 * srtt) */ 14992 rack->rc_rack_use_dsack = 1; 14993 } 14994 /* We require at least one measurement, even if the sysctl is 0 */ 14995 if (rack_req_measurements) 14996 rack->r_ctl.req_measurements = rack_req_measurements; 14997 else 14998 rack->r_ctl.req_measurements = 1; 14999 if (rack_enable_hw_pacing) 15000 rack->rack_hdw_pace_ena = 1; 15001 if (rack_hw_rate_caps) 15002 rack->r_rack_hw_rate_caps = 1; 15003 #ifdef TCP_SAD_DETECTION 15004 rack->do_detection = 1; 15005 #else 15006 rack->do_detection = 0; 15007 #endif 15008 if (rack_non_rxt_use_cr) 15009 rack->rack_rec_nonrxt_use_cr = 1; 15010 /* Lets setup the fsb block */ 15011 err = rack_init_fsb(tp, rack); 15012 if (err) { 15013 uma_zfree(rack_pcb_zone, *ptr); 15014 *ptr = NULL; 15015 return (err); 15016 } 15017 if (rack_do_hystart) { 15018 tp->t_ccv.flags |= CCF_HYSTART_ALLOWED; 15019 if (rack_do_hystart > 1) 15020 tp->t_ccv.flags |= CCF_HYSTART_CAN_SH_CWND; 15021 if (rack_do_hystart > 2) 15022 tp->t_ccv.flags |= CCF_HYSTART_CONS_SSTH; 15023 } 15024 /* Log what we will do with queries */ 15025 rack_log_chg_info(tp, rack, 7, 15026 no_query, 0, 0); 15027 if (rack_def_profile) 15028 rack_set_profile(rack, rack_def_profile); 15029 /* Cancel the GP measurement in progress */ 15030 tp->t_flags &= ~TF_GPUTINPROG; 15031 if ((tp->t_state != TCPS_CLOSED) && 15032 (tp->t_state != TCPS_TIME_WAIT)) { 15033 /* 15034 * We are already open, we may 15035 * need to adjust a few things. 15036 */ 15037 if (SEQ_GT(tp->snd_max, tp->iss)) 15038 snt = tp->snd_max - tp->iss; 15039 else 15040 snt = 0; 15041 iwin = rc_init_window(rack); 15042 if ((snt < iwin) && 15043 (no_query == 1)) { 15044 /* We are not past the initial window 15045 * on the first init (i.e. a stack switch 15046 * has not yet occured) so we need to make 15047 * sure cwnd and ssthresh is correct. 15048 */ 15049 if (tp->snd_cwnd < iwin) 15050 tp->snd_cwnd = iwin; 15051 /* 15052 * If we are within the initial window 15053 * we want ssthresh to be unlimited. Setting 15054 * it to the rwnd (which the default stack does 15055 * and older racks) is not really a good idea 15056 * since we want to be in SS and grow both the 15057 * cwnd and the rwnd (via dynamic rwnd growth). If 15058 * we set it to the rwnd then as the peer grows its 15059 * rwnd we will be stuck in CA and never hit SS. 
15060 * 15061 * Its far better to raise it up high (this takes the 15062 * risk that there as been a loss already, probably 15063 * we should have an indicator in all stacks of loss 15064 * but we don't), but considering the normal use this 15065 * is a risk worth taking. The consequences of not 15066 * hitting SS are far worse than going one more time 15067 * into it early on (before we have sent even a IW). 15068 * It is highly unlikely that we will have had a loss 15069 * before getting the IW out. 15070 */ 15071 tp->snd_ssthresh = 0xffffffff; 15072 } 15073 /* 15074 * Any init based on sequence numbers 15075 * should be done in the deferred init path 15076 * since we can be CLOSED and not have them 15077 * inited when rack_init() is called. We 15078 * are not closed so lets call it. 15079 */ 15080 rack_deferred_init(tp, rack); 15081 } 15082 if ((tp->t_state != TCPS_CLOSED) && 15083 (tp->t_state != TCPS_TIME_WAIT) && 15084 (no_query == 0) && 15085 (tp->snd_una != tp->snd_max)) { 15086 err = rack_init_outstanding(tp, rack, us_cts, *ptr); 15087 if (err) { 15088 *ptr = NULL; 15089 return(err); 15090 } 15091 } 15092 rack_stop_all_timers(tp, rack); 15093 /* Setup all the t_flags2 */ 15094 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 15095 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 15096 else 15097 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ; 15098 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) 15099 tp->t_flags2 |= TF2_MBUF_ACKCMP; 15100 /* 15101 * Timers in Rack are kept in microseconds so lets 15102 * convert any initial incoming variables 15103 * from ticks into usecs. Note that we 15104 * also change the values of t_srtt and t_rttvar, if 15105 * they are non-zero. They are kept with a 5 15106 * bit decimal so we have to carefully convert 15107 * these to get the full precision. 
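 *
 * A small worked example of why the conversion must be careful
 * (illustrative, assuming hz = 1000 so one tick is 1000 usec): the
 * default stack keeps t_srtt as ticks scaled up by 2^5, so a stored
 * value of 3216 represents 100.5 ticks. Converting as
 *     usecs = (t_srtt * 1000) >> 5 = 100500
 * keeps the fractional tick, whereas shifting first,
 *     (t_srtt >> 5) * 1000 = 100000,
 * silently throws it away. The helper below is meant to preserve
 * that fractional precision.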
15108 */ 15109 rack_convert_rtts(tp); 15110 rack_log_hystart_event(rack, rack->r_ctl.roundends, 20); 15111 if ((tptoinpcb(tp)->inp_flags & INP_DROPPED) == 0) { 15112 /* We do not start any timers on DROPPED connections */ 15113 if (tp->t_fb->tfb_chg_query == NULL) { 15114 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); 15115 } else { 15116 struct tcp_query_resp qr; 15117 int ret; 15118 15119 memset(&qr, 0, sizeof(qr)); 15120 15121 /* Get the misc time stamps and such for rack */ 15122 qr.req = TCP_QUERY_RACK_TIMES; 15123 ret = (*tp->t_fb->tfb_chg_query)(tp, &qr); 15124 if (ret == 1) { 15125 rack->r_ctl.rc_reorder_ts = qr.rack_reorder_ts; 15126 rack->r_ctl.num_dsack = qr.rack_num_dsacks; 15127 rack->r_ctl.rc_tlp_rxt_last_time = qr.rack_rxt_last_time; 15128 rack->r_ctl.rc_rack_min_rtt = qr.rack_min_rtt; 15129 rack->rc_rack_rtt = qr.rack_rtt; 15130 rack->r_ctl.rc_rack_tmit_time = qr.rack_tmit_time; 15131 rack->r_ctl.rc_sacked = qr.rack_sacked; 15132 rack->r_ctl.rc_holes_rxt = qr.rack_holes_rxt; 15133 rack->r_ctl.rc_prr_delivered = qr.rack_prr_delivered; 15134 rack->r_ctl.rc_prr_recovery_fs = qr.rack_prr_recovery_fs; 15135 rack->r_ctl.rc_prr_sndcnt = qr.rack_prr_sndcnt; 15136 rack->r_ctl.rc_prr_out = qr.rack_prr_out; 15137 if (qr.rack_tlp_out) { 15138 rack->rc_tlp_in_progress = 1; 15139 rack->r_ctl.rc_tlp_cnt_out = qr.rack_tlp_cnt_out; 15140 } else { 15141 rack->rc_tlp_in_progress = 0; 15142 rack->r_ctl.rc_tlp_cnt_out = 0; 15143 } 15144 if (qr.rack_srtt_measured) 15145 rack->rc_srtt_measure_made = 1; 15146 if (qr.rack_in_persist == 1) { 15147 rack->r_ctl.rc_went_idle_time = qr.rack_time_went_idle; 15148 #ifdef NETFLIX_SHARED_CWND 15149 if (rack->r_ctl.rc_scw) { 15150 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 15151 rack->rack_scwnd_is_idle = 1; 15152 } 15153 #endif 15154 rack->r_ctl.persist_lost_ends = 0; 15155 rack->probe_not_answered = 0; 15156 rack->forced_ack = 0; 15157 tp->t_rxtshift = 0; 15158 rack->rc_in_persist = 1; 15159 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 15160 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 15161 } 15162 if (qr.rack_wanted_output) 15163 rack->r_wanted_output = 1; 15164 rack_log_chg_info(tp, rack, 6, 15165 qr.rack_min_rtt, 15166 qr.rack_rtt, 15167 qr.rack_reorder_ts); 15168 } 15169 /* Get the old stack timers */ 15170 qr.req_param = 0; 15171 qr.req = TCP_QUERY_TIMERS_UP; 15172 ret = (*tp->t_fb->tfb_chg_query)(tp, &qr); 15173 if (ret) { 15174 /* 15175 * non-zero return means we have a timer('s) 15176 * to start. Zero means no timer (no keepalive 15177 * I suppose). 
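 *
 * For example (hypothetical timing): if the old stack's pacing
 * deadline (qr.timer_pacing_to) is still 250 usec ahead of us_cts,
 * tov below becomes 250 and we insert ourselves into the hpts that
 * far out; if the deadline has already passed we fall back to a
 * single HPTS slot so the timer fires almost immediately. A pending
 * rack/TLP/RXT style timer gets the same treatment via
 * timer_timer_exp.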
15178 */ 15179 uint32_t tov = 0; 15180 15181 rack->r_ctl.rc_hpts_flags = qr.timer_hpts_flags; 15182 if (qr.timer_hpts_flags & PACE_PKT_OUTPUT) { 15183 rack->r_ctl.rc_last_output_to = qr.timer_pacing_to; 15184 if (TSTMP_GT(qr.timer_pacing_to, us_cts)) 15185 tov = qr.timer_pacing_to - us_cts; 15186 else 15187 tov = HPTS_TICKS_PER_SLOT; 15188 } 15189 if (qr.timer_hpts_flags & PACE_TMR_MASK) { 15190 rack->r_ctl.rc_timer_exp = qr.timer_timer_exp; 15191 if (tov == 0) { 15192 if (TSTMP_GT(qr.timer_timer_exp, us_cts)) 15193 tov = qr.timer_timer_exp - us_cts; 15194 else 15195 tov = HPTS_TICKS_PER_SLOT; 15196 } 15197 } 15198 rack_log_chg_info(tp, rack, 4, 15199 rack->r_ctl.rc_hpts_flags, 15200 rack->r_ctl.rc_last_output_to, 15201 rack->r_ctl.rc_timer_exp); 15202 if (tov) { 15203 struct hpts_diag diag; 15204 15205 (void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(tov), 15206 __LINE__, &diag); 15207 rack_log_hpts_diag(rack, us_cts, &diag, &rack->r_ctl.act_rcv_time); 15208 } 15209 } 15210 } 15211 rack_log_rtt_shrinks(rack, us_cts, tp->t_rxtcur, 15212 __LINE__, RACK_RTTS_INIT); 15213 } 15214 return (0); 15215 } 15216 15217 static int 15218 rack_handoff_ok(struct tcpcb *tp) 15219 { 15220 if ((tp->t_state == TCPS_CLOSED) || 15221 (tp->t_state == TCPS_LISTEN)) { 15222 /* Sure no problem though it may not stick */ 15223 return (0); 15224 } 15225 if ((tp->t_state == TCPS_SYN_SENT) || 15226 (tp->t_state == TCPS_SYN_RECEIVED)) { 15227 /* 15228 * We really don't know if you support sack, 15229 * you have to get to ESTAB or beyond to tell. 15230 */ 15231 return (EAGAIN); 15232 } 15233 if ((tp->t_flags & TF_SENTFIN) && ((tp->snd_max - tp->snd_una) > 1)) { 15234 /* 15235 * Rack will only send a FIN after all data is acknowledged. 15236 * So in this case we have more data outstanding. We can't 15237 * switch stacks until either all data and only the FIN 15238 * is left (in which case rack_init() now knows how 15239 * to deal with that) <or> all is acknowledged and we 15240 * are only left with incoming data, though why you 15241 * would want to switch to rack after all data is acknowledged 15242 * I have no idea (rrs)! 15243 */ 15244 return (EAGAIN); 15245 } 15246 if ((tp->t_flags & TF_SACK_PERMIT) || rack_sack_not_required){ 15247 return (0); 15248 } 15249 /* 15250 * If we reach here we don't do SACK on this connection so we can 15251 * never do rack. 
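 *
 * To recap the checks above (a summary, not additional policy):
 * CLOSED/LISTEN always hand off (0), SYN_SENT/SYN_RECEIVED return
 * EAGAIN because SACK support is not yet known, a sent FIN with more
 * than the FIN itself still outstanding returns EAGAIN, a connection
 * with SACK permitted (or rack_sack_not_required set) hands off, and
 * everything else lands here.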
15252 */ 15253 return (EINVAL); 15254 } 15255 15256 static void 15257 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) 15258 { 15259 15260 if (tp->t_fb_ptr) { 15261 uint32_t cnt_free = 0; 15262 struct tcp_rack *rack; 15263 struct rack_sendmap *rsm; 15264 15265 tcp_handle_orphaned_packets(tp); 15266 tp->t_flags &= ~TF_FORCEDATA; 15267 rack = (struct tcp_rack *)tp->t_fb_ptr; 15268 rack_log_pacing_delay_calc(rack, 15269 0, 15270 0, 15271 0, 15272 rack_get_gp_est(rack), /* delRate */ 15273 rack_get_lt_bw(rack), /* rttProp */ 15274 20, __LINE__, NULL, 0); 15275 #ifdef NETFLIX_SHARED_CWND 15276 if (rack->r_ctl.rc_scw) { 15277 uint32_t limit; 15278 15279 if (rack->r_limit_scw) 15280 limit = max(1, rack->r_ctl.rc_lowest_us_rtt); 15281 else 15282 limit = 0; 15283 tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw, 15284 rack->r_ctl.rc_scw_index, 15285 limit); 15286 rack->r_ctl.rc_scw = NULL; 15287 } 15288 #endif 15289 if (rack->r_ctl.fsb.tcp_ip_hdr) { 15290 free(rack->r_ctl.fsb.tcp_ip_hdr, M_TCPFSB); 15291 rack->r_ctl.fsb.tcp_ip_hdr = NULL; 15292 rack->r_ctl.fsb.th = NULL; 15293 } 15294 if (rack->rc_always_pace) { 15295 tcp_decrement_paced_conn(); 15296 rack_undo_cc_pacing(rack); 15297 rack->rc_always_pace = 0; 15298 } 15299 /* Clean up any options if they were not applied */ 15300 while (!TAILQ_EMPTY(&rack->r_ctl.opt_list)) { 15301 struct deferred_opt_list *dol; 15302 15303 dol = TAILQ_FIRST(&rack->r_ctl.opt_list); 15304 TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next); 15305 free(dol, M_TCPDO); 15306 } 15307 /* rack does not use force data but other stacks may clear it */ 15308 if (rack->r_ctl.crte != NULL) { 15309 tcp_rel_pacing_rate(rack->r_ctl.crte, tp); 15310 rack->rack_hdrw_pacing = 0; 15311 rack->r_ctl.crte = NULL; 15312 } 15313 #ifdef TCP_BLACKBOX 15314 tcp_log_flowend(tp); 15315 #endif 15316 /* 15317 * Lets take a different approach to purging just 15318 * get each one and free it like a cum-ack would and 15319 * not use a foreach loop. 
15320 */ 15321 rsm = tqhash_min(rack->r_ctl.tqh); 15322 while (rsm) { 15323 tqhash_remove(rack->r_ctl.tqh, rsm, REMOVE_TYPE_CUMACK); 15324 rack->r_ctl.rc_num_maps_alloced--; 15325 uma_zfree(rack_zone, rsm); 15326 rsm = tqhash_min(rack->r_ctl.tqh); 15327 } 15328 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 15329 while (rsm) { 15330 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 15331 rack->r_ctl.rc_num_maps_alloced--; 15332 rack->rc_free_cnt--; 15333 cnt_free++; 15334 uma_zfree(rack_zone, rsm); 15335 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 15336 } 15337 if ((rack->r_ctl.rc_num_maps_alloced > 0) && 15338 (tcp_bblogging_on(tp))) { 15339 union tcp_log_stackspecific log; 15340 struct timeval tv; 15341 15342 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 15343 log.u_bbr.flex8 = 10; 15344 log.u_bbr.flex1 = rack->r_ctl.rc_num_maps_alloced; 15345 log.u_bbr.flex2 = rack->rc_free_cnt; 15346 log.u_bbr.flex3 = cnt_free; 15347 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 15348 rsm = tqhash_min(rack->r_ctl.tqh); 15349 log.u_bbr.delRate = (uint64_t)rsm; 15350 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 15351 log.u_bbr.cur_del_rate = (uint64_t)rsm; 15352 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 15353 log.u_bbr.pkt_epoch = __LINE__; 15354 (void)tcp_log_event(tp, NULL, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, 15355 0, &log, false, NULL, NULL, 0, &tv); 15356 } 15357 KASSERT((rack->r_ctl.rc_num_maps_alloced == 0), 15358 ("rack:%p num_aloc:%u after freeing all?", 15359 rack, 15360 rack->r_ctl.rc_num_maps_alloced)); 15361 rack->rc_free_cnt = 0; 15362 free(rack->r_ctl.tqh, M_TCPFSB); 15363 rack->r_ctl.tqh = NULL; 15364 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 15365 tp->t_fb_ptr = NULL; 15366 } 15367 /* Make sure snd_nxt is correctly set */ 15368 tp->snd_nxt = tp->snd_max; 15369 } 15370 15371 static void 15372 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) 15373 { 15374 if ((rack->r_state == TCPS_CLOSED) && (tp->t_state != TCPS_CLOSED)) { 15375 rack->r_is_v6 = (tptoinpcb(tp)->inp_vflag & INP_IPV6) != 0; 15376 } 15377 switch (tp->t_state) { 15378 case TCPS_SYN_SENT: 15379 rack->r_state = TCPS_SYN_SENT; 15380 rack->r_substate = rack_do_syn_sent; 15381 break; 15382 case TCPS_SYN_RECEIVED: 15383 rack->r_state = TCPS_SYN_RECEIVED; 15384 rack->r_substate = rack_do_syn_recv; 15385 break; 15386 case TCPS_ESTABLISHED: 15387 rack_set_pace_segments(tp, rack, __LINE__, NULL); 15388 rack->r_state = TCPS_ESTABLISHED; 15389 rack->r_substate = rack_do_established; 15390 break; 15391 case TCPS_CLOSE_WAIT: 15392 rack->r_state = TCPS_CLOSE_WAIT; 15393 rack->r_substate = rack_do_close_wait; 15394 break; 15395 case TCPS_FIN_WAIT_1: 15396 rack_set_pace_segments(tp, rack, __LINE__, NULL); 15397 rack->r_state = TCPS_FIN_WAIT_1; 15398 rack->r_substate = rack_do_fin_wait_1; 15399 break; 15400 case TCPS_CLOSING: 15401 rack_set_pace_segments(tp, rack, __LINE__, NULL); 15402 rack->r_state = TCPS_CLOSING; 15403 rack->r_substate = rack_do_closing; 15404 break; 15405 case TCPS_LAST_ACK: 15406 rack_set_pace_segments(tp, rack, __LINE__, NULL); 15407 rack->r_state = TCPS_LAST_ACK; 15408 rack->r_substate = rack_do_lastack; 15409 break; 15410 case TCPS_FIN_WAIT_2: 15411 rack->r_state = TCPS_FIN_WAIT_2; 15412 rack->r_substate = rack_do_fin_wait_2; 15413 break; 15414 case TCPS_LISTEN: 15415 case TCPS_CLOSED: 15416 case TCPS_TIME_WAIT: 15417 default: 15418 break; 15419 }; 15420 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) 15421 rack->rc_tp->t_flags2 |= TF2_MBUF_ACKCMP; 15422 15423 } 15424 15425 static void 15426 
rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) 15427 { 15428 /* 15429 * We received an ack, and then did not 15430 * call send or were bounced out due to the 15431 * hpts was running. Now a timer is up as well, is 15432 * it the right timer? 15433 */ 15434 struct rack_sendmap *rsm; 15435 int tmr_up; 15436 15437 tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 15438 if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT)) 15439 return; 15440 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 15441 if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) && 15442 (tmr_up == PACE_TMR_RXT)) { 15443 /* Should be an RXT */ 15444 return; 15445 } 15446 if (rsm == NULL) { 15447 /* Nothing outstanding? */ 15448 if (tp->t_flags & TF_DELACK) { 15449 if (tmr_up == PACE_TMR_DELACK) 15450 /* We are supposed to have delayed ack up and we do */ 15451 return; 15452 } else if (sbavail(&tptosocket(tp)->so_snd) && (tmr_up == PACE_TMR_RXT)) { 15453 /* 15454 * if we hit enobufs then we would expect the possibility 15455 * of nothing outstanding and the RXT up (and the hptsi timer). 15456 */ 15457 return; 15458 } else if (((V_tcp_always_keepalive || 15459 rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 15460 (tp->t_state <= TCPS_CLOSING)) && 15461 (tmr_up == PACE_TMR_KEEP) && 15462 (tp->snd_max == tp->snd_una)) { 15463 /* We should have keep alive up and we do */ 15464 return; 15465 } 15466 } 15467 if (SEQ_GT(tp->snd_max, tp->snd_una) && 15468 ((tmr_up == PACE_TMR_TLP) || 15469 (tmr_up == PACE_TMR_RACK) || 15470 (tmr_up == PACE_TMR_RXT))) { 15471 /* 15472 * Either a Rack, TLP or RXT is fine if we 15473 * have outstanding data. 15474 */ 15475 return; 15476 } else if (tmr_up == PACE_TMR_DELACK) { 15477 /* 15478 * If the delayed ack was going to go off 15479 * before the rtx/tlp/rack timer were going to 15480 * expire, then that would be the timer in control. 15481 * Note we don't check the time here trusting the 15482 * code is correct. 15483 */ 15484 return; 15485 } 15486 /* 15487 * Ok the timer originally started is not what we want now. 15488 * We will force the hpts to be stopped if any, and restart 15489 * with the slot set to what was in the saved slot. 
15490 */ 15491 if (tcp_in_hpts(rack->rc_tp)) { 15492 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 15493 uint32_t us_cts; 15494 15495 us_cts = tcp_get_usecs(NULL); 15496 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 15497 rack->r_early = 1; 15498 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); 15499 } 15500 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 15501 } 15502 tcp_hpts_remove(rack->rc_tp); 15503 } 15504 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 15505 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); 15506 } 15507 15508 15509 static void 15510 rack_do_win_updates(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tiwin, uint32_t seq, uint32_t ack, uint32_t cts) 15511 { 15512 if ((SEQ_LT(tp->snd_wl1, seq) || 15513 (tp->snd_wl1 == seq && (SEQ_LT(tp->snd_wl2, ack) || 15514 (tp->snd_wl2 == ack && tiwin > tp->snd_wnd))))) { 15515 /* keep track of pure window updates */ 15516 if ((tp->snd_wl2 == ack) && (tiwin > tp->snd_wnd)) 15517 KMOD_TCPSTAT_INC(tcps_rcvwinupd); 15518 tp->snd_wnd = tiwin; 15519 rack_validate_fo_sendwin_up(tp, rack); 15520 tp->snd_wl1 = seq; 15521 tp->snd_wl2 = ack; 15522 if (tp->snd_wnd > tp->max_sndwnd) 15523 tp->max_sndwnd = tp->snd_wnd; 15524 rack->r_wanted_output = 1; 15525 } else if ((tp->snd_wl2 == ack) && (tiwin < tp->snd_wnd)) { 15526 tp->snd_wnd = tiwin; 15527 rack_validate_fo_sendwin_up(tp, rack); 15528 tp->snd_wl1 = seq; 15529 tp->snd_wl2 = ack; 15530 } else { 15531 /* Not a valid win update */ 15532 return; 15533 } 15534 if (tp->snd_wnd > tp->max_sndwnd) 15535 tp->max_sndwnd = tp->snd_wnd; 15536 /* Do we exit persists? */ 15537 if ((rack->rc_in_persist != 0) && 15538 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 15539 rack->r_ctl.rc_pace_min_segs))) { 15540 rack_exit_persist(tp, rack, cts); 15541 } 15542 /* Do we enter persists? */ 15543 if ((rack->rc_in_persist == 0) && 15544 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 15545 TCPS_HAVEESTABLISHED(tp->t_state) && 15546 ((tp->snd_max == tp->snd_una) || rack->rc_has_collapsed) && 15547 sbavail(&tptosocket(tp)->so_snd) && 15548 (sbavail(&tptosocket(tp)->so_snd) > tp->snd_wnd)) { 15549 /* 15550 * Here the rwnd is less than 15551 * the pacing size, we are established, 15552 * nothing is outstanding, and there is 15553 * data to send. Enter persists. 
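 *
 * A concrete (hypothetical) example: with rc_high_rwnd at 64k and
 * rc_pace_min_segs at 4380, the persist threshold is
 * min(32k, 4380) = 4380 bytes. A window update that shrinks the
 * peer's window to 2k while 8k of unsent data sits in the socket
 * buffer and nothing is outstanding enters persists here; a later
 * update at or above 4380 bytes takes us back out via the exit
 * check above.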
15554 */ 15555 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, ack); 15556 } 15557 } 15558 15559 static void 15560 rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent *ae, int ackval, uint32_t high_seq) 15561 { 15562 15563 if (tcp_bblogging_on(rack->rc_tp)) { 15564 struct inpcb *inp = tptoinpcb(tp); 15565 union tcp_log_stackspecific log; 15566 struct timeval ltv; 15567 char tcp_hdr_buf[60]; 15568 struct tcphdr *th; 15569 struct timespec ts; 15570 uint32_t orig_snd_una; 15571 uint8_t xx = 0; 15572 15573 #ifdef TCP_REQUEST_TRK 15574 struct tcp_sendfile_track *tcp_req; 15575 15576 if (SEQ_GT(ae->ack, tp->snd_una)) { 15577 tcp_req = tcp_req_find_req_for_seq(tp, (ae->ack-1)); 15578 } else { 15579 tcp_req = tcp_req_find_req_for_seq(tp, ae->ack); 15580 } 15581 #endif 15582 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 15583 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 15584 if (rack->rack_no_prr == 0) 15585 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 15586 else 15587 log.u_bbr.flex1 = 0; 15588 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 15589 log.u_bbr.use_lt_bw <<= 1; 15590 log.u_bbr.use_lt_bw |= rack->r_might_revert; 15591 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; 15592 log.u_bbr.bbr_state = rack->rc_free_cnt; 15593 log.u_bbr.inflight = ctf_flight_size(tp, rack->r_ctl.rc_sacked); 15594 log.u_bbr.pkts_out = tp->t_maxseg; 15595 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 15596 log.u_bbr.flex7 = 1; 15597 log.u_bbr.lost = ae->flags; 15598 log.u_bbr.cwnd_gain = ackval; 15599 log.u_bbr.pacing_gain = 0x2; 15600 if (ae->flags & TSTMP_HDWR) { 15601 /* Record the hardware timestamp if present */ 15602 log.u_bbr.flex3 = M_TSTMP; 15603 ts.tv_sec = ae->timestamp / 1000000000; 15604 ts.tv_nsec = ae->timestamp % 1000000000; 15605 ltv.tv_sec = ts.tv_sec; 15606 ltv.tv_usec = ts.tv_nsec / 1000; 15607 log.u_bbr.lt_epoch = tcp_tv_to_usectick(<v); 15608 } else if (ae->flags & TSTMP_LRO) { 15609 /* Record the LRO the arrival timestamp */ 15610 log.u_bbr.flex3 = M_TSTMP_LRO; 15611 ts.tv_sec = ae->timestamp / 1000000000; 15612 ts.tv_nsec = ae->timestamp % 1000000000; 15613 ltv.tv_sec = ts.tv_sec; 15614 ltv.tv_usec = ts.tv_nsec / 1000; 15615 log.u_bbr.flex5 = tcp_tv_to_usectick(<v); 15616 } 15617 log.u_bbr.timeStamp = tcp_get_usecs(<v); 15618 /* Log the rcv time */ 15619 log.u_bbr.delRate = ae->timestamp; 15620 #ifdef TCP_REQUEST_TRK 15621 log.u_bbr.applimited = tp->t_tcpreq_closed; 15622 log.u_bbr.applimited <<= 8; 15623 log.u_bbr.applimited |= tp->t_tcpreq_open; 15624 log.u_bbr.applimited <<= 8; 15625 log.u_bbr.applimited |= tp->t_tcpreq_req; 15626 if (tcp_req) { 15627 /* Copy out any client req info */ 15628 /* seconds */ 15629 log.u_bbr.pkt_epoch = (tcp_req->localtime / HPTS_USEC_IN_SEC); 15630 /* useconds */ 15631 log.u_bbr.delivered = (tcp_req->localtime % HPTS_USEC_IN_SEC); 15632 log.u_bbr.rttProp = tcp_req->timestamp; 15633 log.u_bbr.cur_del_rate = tcp_req->start; 15634 if (tcp_req->flags & TCP_TRK_TRACK_FLG_OPEN) { 15635 log.u_bbr.flex8 |= 1; 15636 } else { 15637 log.u_bbr.flex8 |= 2; 15638 log.u_bbr.bw_inuse = tcp_req->end; 15639 } 15640 log.u_bbr.flex6 = tcp_req->start_seq; 15641 if (tcp_req->flags & TCP_TRK_TRACK_FLG_COMP) { 15642 log.u_bbr.flex8 |= 4; 15643 log.u_bbr.epoch = tcp_req->end_seq; 15644 } 15645 } 15646 #endif 15647 memset(tcp_hdr_buf, 0, sizeof(tcp_hdr_buf)); 15648 th = (struct tcphdr *)tcp_hdr_buf; 15649 th->th_seq = ae->seq; 15650 th->th_ack = ae->ack; 15651 th->th_win = ae->win; 15652 /* Now fill in the ports */ 15653 th->th_sport = inp->inp_fport; 
15654 th->th_dport = inp->inp_lport; 15655 tcp_set_flags(th, ae->flags); 15656 /* Now do we have a timestamp option? */ 15657 if (ae->flags & HAS_TSTMP) { 15658 u_char *cp; 15659 uint32_t val; 15660 15661 th->th_off = ((sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA) >> 2); 15662 cp = (u_char *)(th + 1); 15663 *cp = TCPOPT_NOP; 15664 cp++; 15665 *cp = TCPOPT_NOP; 15666 cp++; 15667 *cp = TCPOPT_TIMESTAMP; 15668 cp++; 15669 *cp = TCPOLEN_TIMESTAMP; 15670 cp++; 15671 val = htonl(ae->ts_value); 15672 bcopy((char *)&val, 15673 (char *)cp, sizeof(uint32_t)); 15674 val = htonl(ae->ts_echo); 15675 bcopy((char *)&val, 15676 (char *)(cp + 4), sizeof(uint32_t)); 15677 } else 15678 th->th_off = (sizeof(struct tcphdr) >> 2); 15679 15680 /* 15681 * For sane logging we need to play a little trick. 15682 * If the ack were fully processed we would have moved 15683 * snd_una to high_seq, but since compressed acks are 15684 * processed in two phases, at this point (logging) snd_una 15685 * won't be advanced. So we would see multiple acks showing 15686 * the advancement. We can prevent that by "pretending" that 15687 * snd_una was advanced and then un-advancing it so that the 15688 * logging code has the right value for tlb_snd_una. 15689 */ 15690 if (tp->snd_una != high_seq) { 15691 orig_snd_una = tp->snd_una; 15692 tp->snd_una = high_seq; 15693 xx = 1; 15694 } else 15695 xx = 0; 15696 TCP_LOG_EVENTP(tp, th, 15697 &tptosocket(tp)->so_rcv, 15698 &tptosocket(tp)->so_snd, TCP_LOG_IN, 0, 15699 0, &log, true, <v); 15700 if (xx) { 15701 tp->snd_una = orig_snd_una; 15702 } 15703 } 15704 15705 } 15706 15707 static void 15708 rack_handle_probe_response(struct tcp_rack *rack, uint32_t tiwin, uint32_t us_cts) 15709 { 15710 uint32_t us_rtt; 15711 /* 15712 * A persist or keep-alive was forced out, update our 15713 * min rtt time. Note now worry about lost responses. 15714 * When a subsequent keep-alive or persist times out 15715 * and forced_ack is still on, then the last probe 15716 * was not responded to. In such cases we have a 15717 * sysctl that controls the behavior. Either we apply 15718 * the rtt but with reduced confidence (0). Or we just 15719 * plain don't apply the rtt estimate. Having data flow 15720 * will clear the probe_not_answered flag i.e. cum-ack 15721 * move forward <or> exiting and reentering persists. 15722 */ 15723 15724 rack->forced_ack = 0; 15725 rack->rc_tp->t_rxtshift = 0; 15726 if ((rack->rc_in_persist && 15727 (tiwin == rack->rc_tp->snd_wnd)) || 15728 (rack->rc_in_persist == 0)) { 15729 /* 15730 * In persists only apply the RTT update if this is 15731 * a response to our window probe. And that 15732 * means the rwnd sent must match the current 15733 * snd_wnd. If it does not, then we got a 15734 * window update ack instead. For keepalive 15735 * we allow the answer no matter what the window. 15736 * 15737 * Note that if the probe_not_answered is set then 15738 * the forced_ack_ts is the oldest one i.e. the first 15739 * probe sent that might have been lost. This assures 15740 * us that if we do calculate an RTT it is longer not 15741 * some short thing. 
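 *
 * For example (hypothetical numbers): a window probe recorded at
 * forced_ack_ts = T that is answered at us_cts = T + 40000 yields a
 * 40 ms us_rtt below. With probe_not_answered clear the sample is
 * applied at full confidence (3); if an earlier probe went
 * unanswered, the same sample is either applied with confidence 0
 * or skipped entirely, depending on
 * rack_apply_rtt_with_reduced_conf.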
15742 */ 15743 if (rack->rc_in_persist) 15744 counter_u64_add(rack_persists_acks, 1); 15745 us_rtt = us_cts - rack->r_ctl.forced_ack_ts; 15746 if (us_rtt == 0) 15747 us_rtt = 1; 15748 if (rack->probe_not_answered == 0) { 15749 rack_apply_updated_usrtt(rack, us_rtt, us_cts); 15750 tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 3, NULL, 1); 15751 } else { 15752 /* We have a retransmitted probe here too */ 15753 if (rack_apply_rtt_with_reduced_conf) { 15754 rack_apply_updated_usrtt(rack, us_rtt, us_cts); 15755 tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 0, NULL, 1); 15756 } 15757 } 15758 } 15759 } 15760 15761 static int 15762 rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mbuf *m, int nxt_pkt, struct timeval *tv) 15763 { 15764 /* 15765 * Handle a "special" compressed ack mbuf. Each incoming 15766 * ack has only four possible dispositions: 15767 * 15768 * A) It moves the cum-ack forward 15769 * B) It is behind the cum-ack. 15770 * C) It is a window-update ack. 15771 * D) It is a dup-ack. 15772 * 15773 * Note that we can have between 1 -> TCP_COMP_ACK_ENTRIES 15774 * in the incoming mbuf. We also need to still pay attention 15775 * to nxt_pkt since there may be another packet after this 15776 * one. 15777 */ 15778 #ifdef TCP_ACCOUNTING 15779 uint64_t ts_val; 15780 uint64_t rdstc; 15781 #endif 15782 int segsiz; 15783 struct timespec ts; 15784 struct tcp_rack *rack; 15785 struct tcp_ackent *ae; 15786 uint32_t tiwin, ms_cts, cts, acked, acked_amount, high_seq, win_seq, the_win, win_upd_ack; 15787 int cnt, i, did_out, ourfinisacked = 0; 15788 struct tcpopt to_holder, *to = NULL; 15789 #ifdef TCP_ACCOUNTING 15790 int win_up_req = 0; 15791 #endif 15792 int nsegs = 0; 15793 int under_pacing = 0; 15794 int recovery = 0; 15795 #ifdef TCP_ACCOUNTING 15796 sched_pin(); 15797 #endif 15798 rack = (struct tcp_rack *)tp->t_fb_ptr; 15799 if (rack->gp_ready && 15800 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) 15801 under_pacing = 1; 15802 15803 if (rack->r_state != tp->t_state) 15804 rack_set_state(tp, rack); 15805 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 15806 (tp->t_flags & TF_GPUTINPROG)) { 15807 /* 15808 * We have a goodput in progress 15809 * and we have entered a late state. 15810 * Do we have enough data in the sb 15811 * to handle the GPUT request? 15812 */ 15813 uint32_t bytes; 15814 15815 bytes = tp->gput_ack - tp->gput_seq; 15816 if (SEQ_GT(tp->gput_seq, tp->snd_una)) 15817 bytes += tp->gput_seq - tp->snd_una; 15818 if (bytes > sbavail(&tptosocket(tp)->so_snd)) { 15819 /* 15820 * There are not enough bytes in the socket 15821 * buffer that have been sent to cover this 15822 * measurement. Cancel it. 
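 *
 * For example (hypothetical): a measurement armed from
 * gput_seq = 1000 to gput_ack = 51000 still needs 50000 bytes (plus
 * any gap back to snd_una). If only 20000 bytes remain in the send
 * buffer once we are in FIN_WAIT_1 or later, the measurement can
 * never complete, so it is cancelled here.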
15823 */ 15824 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 15825 rack->r_ctl.rc_gp_srtt /*flex1*/, 15826 tp->gput_seq, 15827 0, 0, 18, __LINE__, NULL, 0); 15828 tp->t_flags &= ~TF_GPUTINPROG; 15829 } 15830 } 15831 to = &to_holder; 15832 to->to_flags = 0; 15833 KASSERT((m->m_len >= sizeof(struct tcp_ackent)), 15834 ("tp:%p m_cmpack:%p with invalid len:%u", tp, m, m->m_len)); 15835 cnt = m->m_len / sizeof(struct tcp_ackent); 15836 counter_u64_add(rack_multi_single_eq, cnt); 15837 high_seq = tp->snd_una; 15838 the_win = tp->snd_wnd; 15839 win_seq = tp->snd_wl1; 15840 win_upd_ack = tp->snd_wl2; 15841 cts = tcp_tv_to_usectick(tv); 15842 ms_cts = tcp_tv_to_mssectick(tv); 15843 rack->r_ctl.rc_rcvtime = cts; 15844 segsiz = ctf_fixed_maxseg(tp); 15845 if ((rack->rc_gp_dyn_mul) && 15846 (rack->use_fixed_rate == 0) && 15847 (rack->rc_always_pace)) { 15848 /* Check in on probertt */ 15849 rack_check_probe_rtt(rack, cts); 15850 } 15851 for (i = 0; i < cnt; i++) { 15852 #ifdef TCP_ACCOUNTING 15853 ts_val = get_cyclecount(); 15854 #endif 15855 rack_clear_rate_sample(rack); 15856 ae = ((mtod(m, struct tcp_ackent *)) + i); 15857 if (ae->flags & TH_FIN) 15858 rack_log_pacing_delay_calc(rack, 15859 0, 15860 0, 15861 0, 15862 rack_get_gp_est(rack), /* delRate */ 15863 rack_get_lt_bw(rack), /* rttProp */ 15864 20, __LINE__, NULL, 0); 15865 /* Setup the window */ 15866 tiwin = ae->win << tp->snd_scale; 15867 if (tiwin > rack->r_ctl.rc_high_rwnd) 15868 rack->r_ctl.rc_high_rwnd = tiwin; 15869 /* figure out the type of ack */ 15870 if (SEQ_LT(ae->ack, high_seq)) { 15871 /* Case B*/ 15872 ae->ack_val_set = ACK_BEHIND; 15873 } else if (SEQ_GT(ae->ack, high_seq)) { 15874 /* Case A */ 15875 ae->ack_val_set = ACK_CUMACK; 15876 } else if ((tiwin == the_win) && (rack->rc_in_persist == 0)){ 15877 /* Case D */ 15878 ae->ack_val_set = ACK_DUPACK; 15879 } else { 15880 /* Case C */ 15881 ae->ack_val_set = ACK_RWND; 15882 } 15883 if (rack->sack_attack_disable > 0) { 15884 rack_log_type_bbrsnd(rack, 0, 0, cts, tv, __LINE__); 15885 rack->r_ctl.ack_during_sd++; 15886 } 15887 rack_log_input_packet(tp, rack, ae, ae->ack_val_set, high_seq); 15888 /* Validate timestamp */ 15889 if (ae->flags & HAS_TSTMP) { 15890 /* Setup for a timestamp */ 15891 to->to_flags = TOF_TS; 15892 ae->ts_echo -= tp->ts_offset; 15893 to->to_tsecr = ae->ts_echo; 15894 to->to_tsval = ae->ts_value; 15895 /* 15896 * If echoed timestamp is later than the current time, fall back to 15897 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 15898 * were used when this connection was established. 
15899 */ 15900 if (TSTMP_GT(ae->ts_echo, ms_cts)) 15901 to->to_tsecr = 0; 15902 if (tp->ts_recent && 15903 TSTMP_LT(ae->ts_value, tp->ts_recent)) { 15904 if (ctf_ts_check_ac(tp, (ae->flags & 0xff))) { 15905 #ifdef TCP_ACCOUNTING 15906 rdstc = get_cyclecount(); 15907 if (rdstc > ts_val) { 15908 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 15909 tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val); 15910 } 15911 } 15912 #endif 15913 continue; 15914 } 15915 } 15916 if (SEQ_LEQ(ae->seq, tp->last_ack_sent) && 15917 SEQ_LEQ(tp->last_ack_sent, ae->seq)) { 15918 tp->ts_recent_age = tcp_ts_getticks(); 15919 tp->ts_recent = ae->ts_value; 15920 } 15921 } else { 15922 /* Setup for a no options */ 15923 to->to_flags = 0; 15924 } 15925 /* Update the rcv time and perform idle reduction possibly */ 15926 if (tp->t_idle_reduce && 15927 (tp->snd_max == tp->snd_una) && 15928 (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { 15929 counter_u64_add(rack_input_idle_reduces, 1); 15930 rack_cc_after_idle(rack, tp); 15931 } 15932 tp->t_rcvtime = ticks; 15933 /* Now what about ECN of a chain of pure ACKs? */ 15934 if (tcp_ecn_input_segment(tp, ae->flags, 0, 15935 tcp_packets_this_ack(tp, ae->ack), 15936 ae->codepoint)) 15937 rack_cong_signal(tp, CC_ECN, ae->ack, __LINE__); 15938 #ifdef TCP_ACCOUNTING 15939 /* Count for the specific type of ack in */ 15940 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 15941 tp->tcp_cnt_counters[ae->ack_val_set]++; 15942 } 15943 #endif 15944 /* 15945 * Note how we could move up these in the determination 15946 * above, but we don't so that way the timestamp checks (and ECN) 15947 * is done first before we do any processing on the ACK. 15948 * The non-compressed path through the code has this 15949 * weakness (noted by @jtl) that it actually does some 15950 * processing before verifying the timestamp information. 15951 * We don't take that path here which is why we set 15952 * the ack_val_set first, do the timestamp and ecn 15953 * processing, and then look at what we have setup. 15954 */ 15955 if (ae->ack_val_set == ACK_BEHIND) { 15956 /* 15957 * Case B flag reordering, if window is not closed 15958 * or it could be a keep-alive or persists 15959 */ 15960 if (SEQ_LT(ae->ack, tp->snd_una) && (sbspace(&so->so_rcv) > segsiz)) { 15961 rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 15962 if (rack->r_ctl.rc_reorder_ts == 0) 15963 rack->r_ctl.rc_reorder_ts = 1; 15964 } 15965 } else if (ae->ack_val_set == ACK_DUPACK) { 15966 /* Case D */ 15967 rack_strike_dupack(rack); 15968 } else if (ae->ack_val_set == ACK_RWND) { 15969 /* Case C */ 15970 if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) { 15971 ts.tv_sec = ae->timestamp / 1000000000; 15972 ts.tv_nsec = ae->timestamp % 1000000000; 15973 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; 15974 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; 15975 } else { 15976 rack->r_ctl.act_rcv_time = *tv; 15977 } 15978 if (rack->forced_ack) { 15979 rack_handle_probe_response(rack, tiwin, 15980 tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time)); 15981 } 15982 #ifdef TCP_ACCOUNTING 15983 win_up_req = 1; 15984 #endif 15985 win_upd_ack = ae->ack; 15986 win_seq = ae->seq; 15987 the_win = tiwin; 15988 rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts); 15989 } else { 15990 /* Case A */ 15991 if (SEQ_GT(ae->ack, tp->snd_max)) { 15992 /* 15993 * We just send an ack since the incoming 15994 * ack is beyond the largest seq we sent. 
15995 */ 15996 if ((tp->t_flags & TF_ACKNOW) == 0) { 15997 ctf_ack_war_checks(tp, &rack->r_ctl.challenge_ack_ts, &rack->r_ctl.challenge_ack_cnt); 15998 if (tp->t_flags & TF_ACKNOW) 15999 rack->r_wanted_output = 1; 16000 } 16001 } else { 16002 nsegs++; 16003 /* If the window changed setup to update */ 16004 if (tiwin != tp->snd_wnd) { 16005 win_upd_ack = ae->ack; 16006 win_seq = ae->seq; 16007 the_win = tiwin; 16008 rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts); 16009 } 16010 #ifdef TCP_ACCOUNTING 16011 /* Account for the acks */ 16012 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16013 tp->tcp_cnt_counters[CNT_OF_ACKS_IN] += (((ae->ack - high_seq) + segsiz - 1) / segsiz); 16014 } 16015 #endif 16016 high_seq = ae->ack; 16017 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) 16018 rack_log_hystart_event(rack, high_seq, 8); 16019 /* Setup our act_rcv_time */ 16020 if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) { 16021 ts.tv_sec = ae->timestamp / 1000000000; 16022 ts.tv_nsec = ae->timestamp % 1000000000; 16023 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; 16024 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; 16025 } else { 16026 rack->r_ctl.act_rcv_time = *tv; 16027 } 16028 rack_process_to_cumack(tp, rack, ae->ack, cts, to, 16029 tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time)); 16030 #ifdef TCP_REQUEST_TRK 16031 rack_req_check_for_comp(rack, high_seq); 16032 #endif 16033 if (rack->rc_dsack_round_seen) { 16034 /* Is the dsack round over? */ 16035 if (SEQ_GEQ(ae->ack, rack->r_ctl.dsack_round_end)) { 16036 /* Yes it is */ 16037 rack->rc_dsack_round_seen = 0; 16038 rack_log_dsack_event(rack, 3, __LINE__, 0, 0); 16039 } 16040 } 16041 } 16042 } 16043 /* And let's be sure to commit the rtt measurements for this ack */ 16044 tcp_rack_xmit_timer_commit(rack, tp); 16045 #ifdef TCP_ACCOUNTING 16046 rdstc = get_cyclecount(); 16047 if (rdstc > ts_val) { 16048 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16049 tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val); 16050 if (ae->ack_val_set == ACK_CUMACK) 16051 tp->tcp_proc_time[CYC_HANDLE_MAP] += (rdstc - ts_val); 16052 } 16053 } 16054 #endif 16055 } 16056 #ifdef TCP_ACCOUNTING 16057 ts_val = get_cyclecount(); 16058 #endif 16059 /* Tend to any collapsed window */ 16060 if (SEQ_GT(tp->snd_max, high_seq) && (tp->snd_wnd < (tp->snd_max - high_seq))) { 16061 /* The peer collapsed the window */ 16062 rack_collapsed_window(rack, (tp->snd_max - high_seq), high_seq, __LINE__); 16063 } else if (rack->rc_has_collapsed) 16064 rack_un_collapse_window(rack, __LINE__); 16065 if ((rack->r_collapse_point_valid) && 16066 (SEQ_GT(high_seq, rack->r_ctl.high_collapse_point))) 16067 rack->r_collapse_point_valid = 0; 16068 acked_amount = acked = (high_seq - tp->snd_una); 16069 if (acked) { 16070 /* 16071 * The draft (v3) calls for us to use SEQ_GEQ, but that 16072 * causes issues when we are just going app limited. Let's 16073 * instead use SEQ_GT <or> where it's equal but more data 16074 * is outstanding. 16075 * 16076 * Also make sure we are on the last ack of a series. We 16077 * have to have all the acks processed in queue to know 16078 * if there is something left outstanding.
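 *
 * For example (hypothetical): with roundends = 5000, an ack chain
 * whose last entry carries high_seq = 5000 only advances
 * current_round if this mbuf is the final one of the chain
 * (nxt_pkt == 0) and a new round has not already been requested;
 * the next send then re-arms roundends for the following round via
 * rc_new_rnd_needed.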
16079 * 16080 */ 16081 if (SEQ_GEQ(high_seq, rack->r_ctl.roundends) && 16082 (rack->rc_new_rnd_needed == 0) && 16083 (nxt_pkt == 0)) { 16084 rack_log_hystart_event(rack, high_seq, 21); 16085 rack->r_ctl.current_round++; 16086 /* Force the next send to setup the next round */ 16087 rack->rc_new_rnd_needed = 1; 16088 if (CC_ALGO(tp)->newround != NULL) { 16089 CC_ALGO(tp)->newround(&tp->t_ccv, rack->r_ctl.current_round); 16090 } 16091 } 16092 /* 16093 * Clear the probe not answered flag 16094 * since cum-ack moved forward. 16095 */ 16096 rack->probe_not_answered = 0; 16097 if (rack->sack_attack_disable == 0) 16098 rack_do_decay(rack); 16099 if (acked >= segsiz) { 16100 /* 16101 * You only get credit for 16102 * MSS and greater (and you get extra 16103 * credit for larger cum-ack moves). 16104 */ 16105 int ac; 16106 16107 ac = acked / segsiz; 16108 rack->r_ctl.ack_count += ac; 16109 counter_u64_add(rack_ack_total, ac); 16110 } 16111 if (rack->r_ctl.ack_count > 0xfff00000) { 16112 /* 16113 * reduce the number to keep us under 16114 * a uint32_t. 16115 */ 16116 rack->r_ctl.ack_count /= 2; 16117 rack->r_ctl.sack_count /= 2; 16118 } 16119 if (tp->t_flags & TF_NEEDSYN) { 16120 /* 16121 * T/TCP: Connection was half-synchronized, and our SYN has 16122 * been ACK'd (so connection is now fully synchronized). Go 16123 * to non-starred state, increment snd_una for ACK of SYN, 16124 * and check if we can do window scaling. 16125 */ 16126 tp->t_flags &= ~TF_NEEDSYN; 16127 tp->snd_una++; 16128 acked_amount = acked = (high_seq - tp->snd_una); 16129 } 16130 if (acked > sbavail(&so->so_snd)) 16131 acked_amount = sbavail(&so->so_snd); 16132 #ifdef TCP_SAD_DETECTION 16133 /* 16134 * We only care on a cum-ack move if we are in a sack-disabled 16135 * state. We have already added in to the ack_count, and we never 16136 * would disable on a cum-ack move, so we only care to do the 16137 * detection if it may "undo" it, i.e. we were in disabled already. 16138 */ 16139 if (rack->sack_attack_disable) 16140 rack_do_detection(tp, rack, acked_amount, segsiz); 16141 #endif 16142 if (IN_FASTRECOVERY(tp->t_flags) && 16143 (rack->rack_no_prr == 0)) 16144 rack_update_prr(tp, rack, acked_amount, high_seq); 16145 if (IN_RECOVERY(tp->t_flags)) { 16146 if (SEQ_LT(high_seq, tp->snd_recover) && 16147 (SEQ_LT(high_seq, tp->snd_max))) { 16148 tcp_rack_partialack(tp); 16149 } else { 16150 rack_post_recovery(tp, high_seq); 16151 recovery = 1; 16152 } 16153 } 16154 /* Handle the rack-log-ack part (sendmap) */ 16155 if ((sbused(&so->so_snd) == 0) && 16156 (acked > acked_amount) && 16157 (tp->t_state >= TCPS_FIN_WAIT_1) && 16158 (tp->t_flags & TF_SENTFIN)) { 16159 /* 16160 * We must be sure our fin 16161 * was sent and acked (we can be 16162 * in FIN_WAIT_1 without having 16163 * sent the fin). 16164 */ 16165 ourfinisacked = 1; 16166 /* 16167 * Lets make sure snd_una is updated 16168 * since most likely acked_amount = 0 (it 16169 * should be). 16170 */ 16171 tp->snd_una = high_seq; 16172 } 16173 /* Did we make a RTO error? 
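 * In other words, was the last RTO-driven retransmission spurious?
 * If the ack for the first retransmission comes back while we are
 * still inside t_badrxtwin (and timestamps are not available to
 * judge it more directly), the congestion response is flagged as an
 * error (CC_RTO_ERR) so the pre-RTO cwnd state can be restored.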
*/ 16174 if ((tp->t_flags & TF_PREVVALID) && 16175 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) { 16176 tp->t_flags &= ~TF_PREVVALID; 16177 if (tp->t_rxtshift == 1 && 16178 (int)(ticks - tp->t_badrxtwin) < 0) 16179 rack_cong_signal(tp, CC_RTO_ERR, high_seq, __LINE__); 16180 } 16181 /* Handle the data in the socket buffer */ 16182 KMOD_TCPSTAT_ADD(tcps_rcvackpack, 1); 16183 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 16184 if (acked_amount > 0) { 16185 struct mbuf *mfree; 16186 16187 rack_ack_received(tp, rack, high_seq, nsegs, CC_ACK, recovery); 16188 SOCKBUF_LOCK(&so->so_snd); 16189 mfree = sbcut_locked(&so->so_snd, acked_amount); 16190 tp->snd_una = high_seq; 16191 /* Note we want to hold the sb lock through the sendmap adjust */ 16192 rack_adjust_sendmap_head(rack, &so->so_snd); 16193 /* Wake up the socket if we have room to write more */ 16194 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); 16195 sowwakeup_locked(so); 16196 if ((recovery == 1) && 16197 (rack->excess_rxt_on) && 16198 (rack->r_cwnd_was_clamped == 0)) { 16199 do_rack_excess_rxt(tp, rack); 16200 } else if (rack->r_cwnd_was_clamped) 16201 do_rack_check_for_unclamp(tp, rack); 16202 m_freem(mfree); 16203 } 16204 /* update progress */ 16205 tp->t_acktime = ticks; 16206 rack_log_progress_event(rack, tp, tp->t_acktime, 16207 PROGRESS_UPDATE, __LINE__); 16208 /* Clear out shifts and such */ 16209 tp->t_rxtshift = 0; 16210 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 16211 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 16212 rack->rc_tlp_in_progress = 0; 16213 rack->r_ctl.rc_tlp_cnt_out = 0; 16214 /* Send recover and snd_nxt must be dragged along */ 16215 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 16216 tp->snd_recover = tp->snd_una; 16217 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 16218 tp->snd_nxt = tp->snd_una; 16219 /* 16220 * If the RXT timer is running we want to 16221 * stop it, so we can restart a TLP (or new RXT). 16222 */ 16223 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 16224 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 16225 tp->snd_wl2 = high_seq; 16226 tp->t_dupacks = 0; 16227 if (under_pacing && 16228 (rack->use_fixed_rate == 0) && 16229 (rack->in_probe_rtt == 0) && 16230 rack->rc_gp_dyn_mul && 16231 rack->rc_always_pace) { 16232 /* Check if we are dragging bottom */ 16233 rack_check_bottom_drag(tp, rack, so); 16234 } 16235 if (tp->snd_una == tp->snd_max) { 16236 tp->t_flags &= ~TF_PREVVALID; 16237 rack->r_ctl.retran_during_recovery = 0; 16238 rack->rc_suspicious = 0; 16239 rack->r_ctl.dsack_byte_cnt = 0; 16240 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 16241 if (rack->r_ctl.rc_went_idle_time == 0) 16242 rack->r_ctl.rc_went_idle_time = 1; 16243 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 16244 if (sbavail(&tptosocket(tp)->so_snd) == 0) 16245 tp->t_acktime = 0; 16246 /* Set so we might enter persists... */ 16247 rack->r_wanted_output = 1; 16248 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 16249 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 16250 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 16251 (sbavail(&so->so_snd) == 0) && 16252 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 16253 /* 16254 * The socket was gone and the 16255 * peer sent data (not now in the past), time to 16256 * reset him. 
16257 */ 16258 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 16259 /* tcp_close will kill the inp pre-log the Reset */ 16260 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 16261 #ifdef TCP_ACCOUNTING 16262 rdstc = get_cyclecount(); 16263 if (rdstc > ts_val) { 16264 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16265 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 16266 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 16267 } 16268 } 16269 #endif 16270 m_freem(m); 16271 tp = tcp_close(tp); 16272 if (tp == NULL) { 16273 #ifdef TCP_ACCOUNTING 16274 sched_unpin(); 16275 #endif 16276 return (1); 16277 } 16278 /* 16279 * We would normally do drop-with-reset which would 16280 * send back a reset. We can't since we don't have 16281 * all the needed bits. Instead lets arrange for 16282 * a call to tcp_output(). That way since we 16283 * are in the closed state we will generate a reset. 16284 * 16285 * Note if tcp_accounting is on we don't unpin since 16286 * we do that after the goto label. 16287 */ 16288 goto send_out_a_rst; 16289 } 16290 if ((sbused(&so->so_snd) == 0) && 16291 (tp->t_state >= TCPS_FIN_WAIT_1) && 16292 (tp->t_flags & TF_SENTFIN)) { 16293 /* 16294 * If we can't receive any more data, then closing user can 16295 * proceed. Starting the timer is contrary to the 16296 * specification, but if we don't get a FIN we'll hang 16297 * forever. 16298 * 16299 */ 16300 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 16301 soisdisconnected(so); 16302 tcp_timer_activate(tp, TT_2MSL, 16303 (tcp_fast_finwait2_recycle ? 16304 tcp_finwait2_timeout : 16305 TP_MAXIDLE(tp))); 16306 } 16307 if (ourfinisacked == 0) { 16308 /* 16309 * We don't change to fin-wait-2 if we have our fin acked 16310 * which means we are probably in TCPS_CLOSING. 16311 */ 16312 tcp_state_change(tp, TCPS_FIN_WAIT_2); 16313 } 16314 } 16315 } 16316 /* Wake up the socket if we have room to write more */ 16317 if (sbavail(&so->so_snd)) { 16318 rack->r_wanted_output = 1; 16319 if (ctf_progress_timeout_check(tp, true)) { 16320 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 16321 tp, tick, PROGRESS_DROP, __LINE__); 16322 /* 16323 * We cheat here and don't send a RST, we should send one 16324 * when the pacer drops the connection. 
16325 */ 16326 #ifdef TCP_ACCOUNTING 16327 rdstc = get_cyclecount(); 16328 if (rdstc > ts_val) { 16329 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16330 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 16331 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 16332 } 16333 } 16334 sched_unpin(); 16335 #endif 16336 (void)tcp_drop(tp, ETIMEDOUT); 16337 m_freem(m); 16338 return (1); 16339 } 16340 } 16341 if (ourfinisacked) { 16342 switch(tp->t_state) { 16343 case TCPS_CLOSING: 16344 #ifdef TCP_ACCOUNTING 16345 rdstc = get_cyclecount(); 16346 if (rdstc > ts_val) { 16347 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16348 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 16349 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 16350 } 16351 } 16352 sched_unpin(); 16353 #endif 16354 tcp_twstart(tp); 16355 m_freem(m); 16356 return (1); 16357 break; 16358 case TCPS_LAST_ACK: 16359 #ifdef TCP_ACCOUNTING 16360 rdstc = get_cyclecount(); 16361 if (rdstc > ts_val) { 16362 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16363 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 16364 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 16365 } 16366 } 16367 sched_unpin(); 16368 #endif 16369 tp = tcp_close(tp); 16370 ctf_do_drop(m, tp); 16371 return (1); 16372 break; 16373 case TCPS_FIN_WAIT_1: 16374 #ifdef TCP_ACCOUNTING 16375 rdstc = get_cyclecount(); 16376 if (rdstc > ts_val) { 16377 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16378 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 16379 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 16380 } 16381 } 16382 #endif 16383 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 16384 soisdisconnected(so); 16385 tcp_timer_activate(tp, TT_2MSL, 16386 (tcp_fast_finwait2_recycle ? 16387 tcp_finwait2_timeout : 16388 TP_MAXIDLE(tp))); 16389 } 16390 tcp_state_change(tp, TCPS_FIN_WAIT_2); 16391 break; 16392 default: 16393 break; 16394 } 16395 } 16396 if (rack->r_fast_output) { 16397 /* 16398 * We re doing fast output.. can we expand that? 
16399 */ 16400 rack_gain_for_fastoutput(rack, tp, so, acked_amount); 16401 } 16402 #ifdef TCP_ACCOUNTING 16403 rdstc = get_cyclecount(); 16404 if (rdstc > ts_val) { 16405 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16406 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 16407 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 16408 } 16409 } 16410 16411 } else if (win_up_req) { 16412 rdstc = get_cyclecount(); 16413 if (rdstc > ts_val) { 16414 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16415 tp->tcp_proc_time[ACK_RWND] += (rdstc - ts_val); 16416 } 16417 } 16418 #endif 16419 } 16420 /* Now is there a next packet, if so we are done */ 16421 m_freem(m); 16422 did_out = 0; 16423 if (nxt_pkt) { 16424 #ifdef TCP_ACCOUNTING 16425 sched_unpin(); 16426 #endif 16427 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 5, nsegs); 16428 return (0); 16429 } 16430 rack_handle_might_revert(tp, rack); 16431 ctf_calc_rwin(so, tp); 16432 if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) { 16433 send_out_a_rst: 16434 if (tcp_output(tp) < 0) { 16435 #ifdef TCP_ACCOUNTING 16436 sched_unpin(); 16437 #endif 16438 return (1); 16439 } 16440 did_out = 1; 16441 } 16442 if (tp->t_flags2 & TF2_HPTS_CALLS) 16443 tp->t_flags2 &= ~TF2_HPTS_CALLS; 16444 rack_free_trim(rack); 16445 #ifdef TCP_ACCOUNTING 16446 sched_unpin(); 16447 #endif 16448 rack_timer_audit(tp, rack, &so->so_snd); 16449 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 6, nsegs); 16450 return (0); 16451 } 16452 16453 #define TCP_LRO_TS_OPTION \ 16454 ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \ 16455 (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP) 16456 16457 static int 16458 rack_do_segment_nounlock(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, 16459 int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, int32_t nxt_pkt, 16460 struct timeval *tv) 16461 { 16462 struct inpcb *inp = tptoinpcb(tp); 16463 struct socket *so = tptosocket(tp); 16464 #ifdef TCP_ACCOUNTING 16465 uint64_t ts_val; 16466 #endif 16467 int32_t thflags, retval, did_out = 0; 16468 int32_t way_out = 0; 16469 /* 16470 * cts - is the current time from tv (caller gets ts) in microseconds. 16471 * ms_cts - is the current time from tv in milliseconds. 16472 * us_cts - is the time that LRO or hardware actually got the packet in microseconds. 16473 */ 16474 uint32_t cts, us_cts, ms_cts; 16475 uint32_t tiwin, high_seq; 16476 struct timespec ts; 16477 struct tcpopt to; 16478 struct tcp_rack *rack; 16479 struct rack_sendmap *rsm; 16480 int32_t prev_state = 0; 16481 int no_output = 0; 16482 int slot_remaining = 0; 16483 #ifdef TCP_ACCOUNTING 16484 int ack_val_set = 0xf; 16485 #endif 16486 int nsegs; 16487 16488 NET_EPOCH_ASSERT(); 16489 INP_WLOCK_ASSERT(inp); 16490 16491 /* 16492 * tv passed from common code is from either M_TSTMP_LRO or 16493 * tcp_get_usecs() if no LRO m_pkthdr timestamp is present. 16494 */ 16495 rack = (struct tcp_rack *)tp->t_fb_ptr; 16496 if (rack->rack_deferred_inited == 0) { 16497 /* 16498 * If we are the connecting socket we will 16499 * hit rack_init() when no sequence numbers 16500 * are setup. This makes it so we must defer 16501 * some initialization. Call that now. 16502 */ 16503 rack_deferred_init(tp, rack); 16504 } 16505 /* 16506 * Check to see if we need to skip any output plans. This 16507 * can happen in the non-LRO path where we are pacing and 16508 * must process the ack coming in but need to defer sending 16509 * anything becase a pacing timer is running. 
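 *
 * For example (hypothetical timing): if the pacer's deadline
 * (rc_last_output_to) is still 3000 usec away, a bare ack carrying
 * no options, or only the NOP/NOP/TIMESTAMP block that
 * TCP_LRO_TS_OPTION matches, is processed now but output stays
 * deferred until the pacing timer fires. If instead only 150 usec
 * remained -- less than the pacer's minimum granularity -- output is
 * allowed early and the shortfall is made up in the next pacing
 * slot.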
16510 */ 16511 us_cts = tcp_tv_to_usectick(tv); 16512 if (m->m_flags & M_ACKCMP) { 16513 /* 16514 * All compressed ack's are ack's by definition so 16515 * remove any ack required flag and then do the processing. 16516 */ 16517 rack->rc_ack_required = 0; 16518 return (rack_do_compressed_ack_processing(tp, so, m, nxt_pkt, tv)); 16519 } 16520 thflags = tcp_get_flags(th); 16521 if ((rack->rc_always_pace == 1) && 16522 (rack->rc_ack_can_sendout_data == 0) && 16523 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 16524 (TSTMP_LT(us_cts, rack->r_ctl.rc_last_output_to))) { 16525 /* 16526 * Ok conditions are right for queuing the packets 16527 * but we do have to check the flags in the inp, it 16528 * could be, if a sack is present, we want to be awoken and 16529 * so should process the packets. 16530 */ 16531 slot_remaining = rack->r_ctl.rc_last_output_to - us_cts; 16532 if (rack->rc_tp->t_flags2 & TF2_DONT_SACK_QUEUE) { 16533 no_output = 1; 16534 } else { 16535 /* 16536 * If there is no options, or just a 16537 * timestamp option, we will want to queue 16538 * the packets. This is the same that LRO does 16539 * and will need to change with accurate ECN. 16540 */ 16541 uint32_t *ts_ptr; 16542 int optlen; 16543 16544 optlen = (th->th_off << 2) - sizeof(struct tcphdr); 16545 ts_ptr = (uint32_t *)(th + 1); 16546 if ((optlen == 0) || 16547 ((optlen == TCPOLEN_TSTAMP_APPA) && 16548 (*ts_ptr == TCP_LRO_TS_OPTION))) 16549 no_output = 1; 16550 } 16551 if ((no_output == 1) && (slot_remaining < tcp_min_hptsi_time)) { 16552 /* 16553 * It is unrealistic to think we can pace in less than 16554 * the minimum granularity of the pacer (def:250usec). So 16555 * if we have less than that time remaining we should go 16556 * ahead and allow output to be "early". We will attempt to 16557 * make up for it in any pacing time we try to apply on 16558 * the outbound packet. 16559 */ 16560 no_output = 0; 16561 } 16562 } 16563 /* 16564 * If there is a RST or FIN lets dump out the bw 16565 * with a FIN the connection may go on but we 16566 * may not. 16567 */ 16568 if ((thflags & TH_FIN) || (thflags & TH_RST)) 16569 rack_log_pacing_delay_calc(rack, 16570 rack->r_ctl.gp_bw, 16571 0, 16572 0, 16573 rack_get_gp_est(rack), /* delRate */ 16574 rack_get_lt_bw(rack), /* rttProp */ 16575 20, __LINE__, NULL, 0); 16576 if (m->m_flags & M_ACKCMP) { 16577 panic("Impossible reach m has ackcmp? m:%p tp:%p", m, tp); 16578 } 16579 cts = tcp_tv_to_usectick(tv); 16580 ms_cts = tcp_tv_to_mssectick(tv); 16581 nsegs = m->m_pkthdr.lro_nsegs; 16582 counter_u64_add(rack_proc_non_comp_ack, 1); 16583 #ifdef TCP_ACCOUNTING 16584 sched_pin(); 16585 if (thflags & TH_ACK) 16586 ts_val = get_cyclecount(); 16587 #endif 16588 if ((m->m_flags & M_TSTMP) || 16589 (m->m_flags & M_TSTMP_LRO)) { 16590 mbuf_tstmp2timespec(m, &ts); 16591 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; 16592 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; 16593 } else 16594 rack->r_ctl.act_rcv_time = *tv; 16595 kern_prefetch(rack, &prev_state); 16596 prev_state = 0; 16597 /* 16598 * Unscale the window into a 32-bit value. For the SYN_SENT state 16599 * the scale is zero. 16600 */ 16601 tiwin = th->th_win << tp->snd_scale; 16602 #ifdef TCP_ACCOUNTING 16603 if (thflags & TH_ACK) { 16604 /* 16605 * We have a tradeoff here. We can either do what we are 16606 * doing i.e. pinning to this CPU and then doing the accounting 16607 * <or> we could do a critical enter, setup the rdtsc and cpu 16608 * as in below, and then validate we are on the same CPU on 16609 * exit. 
I have chosen to not do the critical enter since 16610 * that often will gain you a context switch, and instead lock 16611 * us (line above this if) to the same CPU with sched_pin(). This 16612 * means we may be context switched out for a higher priority 16613 * interrupt but we won't be moved to another CPU. 16614 * 16615 * If this occurs (which it won't very often since we most likely 16616 * are running this code in interrupt context and only a higher 16617 * priority will bump us ... clock?) we will falsely add in 16618 * the interrupt processing time on top of the ack processing 16619 * time. This is ok since it's a rare event. 16620 */ 16621 ack_val_set = tcp_do_ack_accounting(tp, th, &to, tiwin, 16622 ctf_fixed_maxseg(tp)); 16623 } 16624 #endif 16625 /* 16626 * Parse options on any incoming segment. 16627 */ 16628 memset(&to, 0, sizeof(to)); 16629 tcp_dooptions(&to, (u_char *)(th + 1), 16630 (th->th_off << 2) - sizeof(struct tcphdr), 16631 (thflags & TH_SYN) ? TO_SYN : 0); 16632 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 16633 __func__)); 16634 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 16635 __func__)); 16636 16637 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 16638 (tp->t_flags & TF_GPUTINPROG)) { 16639 /* 16640 * We have a goodput in progress 16641 * and we have entered a late state. 16642 * Do we have enough data in the sb 16643 * to handle the GPUT request? 16644 */ 16645 uint32_t bytes; 16646 16647 bytes = tp->gput_ack - tp->gput_seq; 16648 if (SEQ_GT(tp->gput_seq, tp->snd_una)) 16649 bytes += tp->gput_seq - tp->snd_una; 16650 if (bytes > sbavail(&tptosocket(tp)->so_snd)) { 16651 /* 16652 * There are not enough bytes in the socket 16653 * buffer that have been sent to cover this 16654 * measurement. Cancel it.
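 * As a hypothetical example of the check above: with gput_seq = 10000,
 * gput_ack = 40000 and snd_una = 8000, bytes = 30000 + 2000 = 32000; if
 * sbavail() on the send socket buffer reports only 24000 bytes the goodput
 * measurement cannot complete and TF_GPUTINPROG is cleared below.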
16655 */ 16656 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 16657 rack->r_ctl.rc_gp_srtt /*flex1*/, 16658 tp->gput_seq, 16659 0, 0, 18, __LINE__, NULL, 0); 16660 tp->t_flags &= ~TF_GPUTINPROG; 16661 } 16662 } 16663 high_seq = th->th_ack; 16664 if (tcp_bblogging_on(rack->rc_tp)) { 16665 union tcp_log_stackspecific log; 16666 struct timeval ltv; 16667 #ifdef TCP_REQUEST_TRK 16668 struct tcp_sendfile_track *tcp_req; 16669 16670 if (SEQ_GT(th->th_ack, tp->snd_una)) { 16671 tcp_req = tcp_req_find_req_for_seq(tp, (th->th_ack-1)); 16672 } else { 16673 tcp_req = tcp_req_find_req_for_seq(tp, th->th_ack); 16674 } 16675 #endif 16676 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 16677 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 16678 if (rack->rack_no_prr == 0) 16679 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 16680 else 16681 log.u_bbr.flex1 = 0; 16682 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 16683 log.u_bbr.use_lt_bw <<= 1; 16684 log.u_bbr.use_lt_bw |= rack->r_might_revert; 16685 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; 16686 log.u_bbr.bbr_state = rack->rc_free_cnt; 16687 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 16688 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; 16689 log.u_bbr.flex3 = m->m_flags; 16690 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 16691 log.u_bbr.lost = thflags; 16692 log.u_bbr.pacing_gain = 0x1; 16693 #ifdef TCP_ACCOUNTING 16694 log.u_bbr.cwnd_gain = ack_val_set; 16695 #endif 16696 log.u_bbr.flex7 = 2; 16697 if (m->m_flags & M_TSTMP) { 16698 /* Record the hardware timestamp if present */ 16699 mbuf_tstmp2timespec(m, &ts); 16700 ltv.tv_sec = ts.tv_sec; 16701 ltv.tv_usec = ts.tv_nsec / 1000; 16702 log.u_bbr.lt_epoch = tcp_tv_to_usectick(&ltv); 16703 } else if (m->m_flags & M_TSTMP_LRO) { 16704 /* Record the LRO arrival timestamp */ 16705 mbuf_tstmp2timespec(m, &ts); 16706 ltv.tv_sec = ts.tv_sec; 16707 ltv.tv_usec = ts.tv_nsec / 1000; 16708 log.u_bbr.flex5 = tcp_tv_to_usectick(&ltv); 16709 } 16710 log.u_bbr.timeStamp = tcp_get_usecs(&ltv); 16711 /* Log the rcv time */ 16712 log.u_bbr.delRate = m->m_pkthdr.rcv_tstmp; 16713 #ifdef TCP_REQUEST_TRK 16714 log.u_bbr.applimited = tp->t_tcpreq_closed; 16715 log.u_bbr.applimited <<= 8; 16716 log.u_bbr.applimited |= tp->t_tcpreq_open; 16717 log.u_bbr.applimited <<= 8; 16718 log.u_bbr.applimited |= tp->t_tcpreq_req; 16719 if (tcp_req) { 16720 /* Copy out any client req info */ 16721 /* seconds */ 16722 log.u_bbr.pkt_epoch = (tcp_req->localtime / HPTS_USEC_IN_SEC); 16723 /* useconds */ 16724 log.u_bbr.delivered = (tcp_req->localtime % HPTS_USEC_IN_SEC); 16725 log.u_bbr.rttProp = tcp_req->timestamp; 16726 log.u_bbr.cur_del_rate = tcp_req->start; 16727 if (tcp_req->flags & TCP_TRK_TRACK_FLG_OPEN) { 16728 log.u_bbr.flex8 |= 1; 16729 } else { 16730 log.u_bbr.flex8 |= 2; 16731 log.u_bbr.bw_inuse = tcp_req->end; 16732 } 16733 log.u_bbr.flex6 = tcp_req->start_seq; 16734 if (tcp_req->flags & TCP_TRK_TRACK_FLG_COMP) { 16735 log.u_bbr.flex8 |= 4; 16736 log.u_bbr.epoch = tcp_req->end_seq; 16737 } 16738 } 16739 #endif 16740 TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, 16741 tlen, &log, true, &ltv); 16742 } 16743 /* Remove ack required flag if set, we have one */ 16744 if (thflags & TH_ACK) 16745 rack->rc_ack_required = 0; 16746 if (rack->sack_attack_disable > 0) { 16747 rack->r_ctl.ack_during_sd++; 16748 rack_log_type_bbrsnd(rack, 0, 0, cts, tv, __LINE__); 16749 } 16750 if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { 16751 way_out = 4; 16752 retval = 0;
16753 m_freem(m); 16754 goto done_with_input; 16755 } 16756 /* 16757 * If a segment with the ACK-bit set arrives in the SYN-SENT state 16758 * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9. 16759 */ 16760 if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && 16761 (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { 16762 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 16763 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 16764 #ifdef TCP_ACCOUNTING 16765 sched_unpin(); 16766 #endif 16767 return (1); 16768 } 16769 /* 16770 * If timestamps were negotiated during SYN/ACK and a 16771 * segment without a timestamp is received, silently drop 16772 * the segment, unless it is a RST segment or missing timestamps are 16773 * tolerated. 16774 * See section 3.2 of RFC 7323. 16775 */ 16776 if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS) && 16777 ((thflags & TH_RST) == 0) && (V_tcp_tolerate_missing_ts == 0)) { 16778 way_out = 5; 16779 retval = 0; 16780 m_freem(m); 16781 goto done_with_input; 16782 } 16783 16784 /* 16785 * Segment received on connection. Reset idle time and keep-alive 16786 * timer. XXX: This should be done after segment validation to 16787 * ignore broken/spoofed segs. 16788 */ 16789 if (tp->t_idle_reduce && 16790 (tp->snd_max == tp->snd_una) && 16791 (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { 16792 counter_u64_add(rack_input_idle_reduces, 1); 16793 rack_cc_after_idle(rack, tp); 16794 } 16795 tp->t_rcvtime = ticks; 16796 #ifdef STATS 16797 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); 16798 #endif 16799 if (tiwin > rack->r_ctl.rc_high_rwnd) 16800 rack->r_ctl.rc_high_rwnd = tiwin; 16801 /* 16802 * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move 16803 * this to occur after we've validated the segment. 16804 */ 16805 if (tcp_ecn_input_segment(tp, thflags, tlen, 16806 tcp_packets_this_ack(tp, th->th_ack), 16807 iptos)) 16808 rack_cong_signal(tp, CC_ECN, th->th_ack, __LINE__); 16809 16810 /* 16811 * If echoed timestamp is later than the current time, fall back to 16812 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 16813 * were used when this connection was established. 16814 */ 16815 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 16816 to.to_tsecr -= tp->ts_offset; 16817 if (TSTMP_GT(to.to_tsecr, ms_cts)) 16818 to.to_tsecr = 0; 16819 } 16820 16821 /* 16822 * If its the first time in we need to take care of options and 16823 * verify we can do SACK for rack! 16824 */ 16825 if (rack->r_state == 0) { 16826 /* Should be init'd by rack_init() */ 16827 KASSERT(rack->rc_inp != NULL, 16828 ("%s: rack->rc_inp unexpectedly NULL", __func__)); 16829 if (rack->rc_inp == NULL) { 16830 rack->rc_inp = inp; 16831 } 16832 16833 /* 16834 * Process options only when we get SYN/ACK back. The SYN 16835 * case for incoming connections is handled in tcp_syncache. 16836 * According to RFC1323 the window field in a SYN (i.e., a 16837 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX 16838 * this is traditional behavior, may need to be cleaned up. 16839 */ 16840 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 16841 /* Handle parallel SYN for ECN */ 16842 tcp_ecn_input_parallel_syn(tp, thflags, iptos); 16843 if ((to.to_flags & TOF_SCALE) && 16844 (tp->t_flags & TF_REQ_SCALE)) { 16845 tp->t_flags |= TF_RCVD_SCALE; 16846 tp->snd_scale = to.to_wscale; 16847 } else 16848 tp->t_flags &= ~TF_REQ_SCALE; 16849 /* 16850 * Initial send window. 
It will be updated with the 16851 * next incoming segment to the scaled value. 16852 */ 16853 tp->snd_wnd = th->th_win; 16854 rack_validate_fo_sendwin_up(tp, rack); 16855 if ((to.to_flags & TOF_TS) && 16856 (tp->t_flags & TF_REQ_TSTMP)) { 16857 tp->t_flags |= TF_RCVD_TSTMP; 16858 tp->ts_recent = to.to_tsval; 16859 tp->ts_recent_age = cts; 16860 } else 16861 tp->t_flags &= ~TF_REQ_TSTMP; 16862 if (to.to_flags & TOF_MSS) { 16863 tcp_mss(tp, to.to_mss); 16864 } 16865 if ((tp->t_flags & TF_SACK_PERMIT) && 16866 (to.to_flags & TOF_SACKPERM) == 0) 16867 tp->t_flags &= ~TF_SACK_PERMIT; 16868 if (IS_FASTOPEN(tp->t_flags)) { 16869 if (to.to_flags & TOF_FASTOPEN) { 16870 uint16_t mss; 16871 16872 if (to.to_flags & TOF_MSS) 16873 mss = to.to_mss; 16874 else 16875 if ((inp->inp_vflag & INP_IPV6) != 0) 16876 mss = TCP6_MSS; 16877 else 16878 mss = TCP_MSS; 16879 tcp_fastopen_update_cache(tp, mss, 16880 to.to_tfo_len, to.to_tfo_cookie); 16881 } else 16882 tcp_fastopen_disable_path(tp); 16883 } 16884 } 16885 /* 16886 * At this point we are at the initial call. Here we decide 16887 * if we are doing RACK or not. We do this by seeing if 16888 * TF_SACK_PERMIT is set and the sack-not-required is clear. 16889 * The code now does do dup-ack counting so if you don't 16890 * switch back you won't get rack & TLP, but you will still 16891 * get this stack. 16892 */ 16893 16894 if ((rack_sack_not_required == 0) && 16895 ((tp->t_flags & TF_SACK_PERMIT) == 0)) { 16896 tcp_switch_back_to_default(tp); 16897 (*tp->t_fb->tfb_tcp_do_segment)(tp, m, th, drop_hdrlen, 16898 tlen, iptos); 16899 #ifdef TCP_ACCOUNTING 16900 sched_unpin(); 16901 #endif 16902 return (1); 16903 } 16904 tcp_set_hpts(tp); 16905 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); 16906 } 16907 if (thflags & TH_FIN) 16908 tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN); 16909 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 16910 if ((rack->rc_gp_dyn_mul) && 16911 (rack->use_fixed_rate == 0) && 16912 (rack->rc_always_pace)) { 16913 /* Check in on probertt */ 16914 rack_check_probe_rtt(rack, us_cts); 16915 } 16916 rack_clear_rate_sample(rack); 16917 if ((rack->forced_ack) && 16918 ((tcp_get_flags(th) & TH_RST) == 0)) { 16919 rack_handle_probe_response(rack, tiwin, us_cts); 16920 } 16921 /* 16922 * This is the one exception case where we set the rack state 16923 * always. All other times (timers etc) we must have a rack-state 16924 * set (so we assure we have done the checks above for SACK). 16925 */ 16926 rack->r_ctl.rc_rcvtime = cts; 16927 if (rack->r_state != tp->t_state) 16928 rack_set_state(tp, rack); 16929 if (SEQ_GT(th->th_ack, tp->snd_una) && 16930 (rsm = tqhash_min(rack->r_ctl.tqh)) != NULL) 16931 kern_prefetch(rsm, &prev_state); 16932 prev_state = rack->r_state; 16933 if ((thflags & TH_RST) && 16934 ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && 16935 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || 16936 (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq))) { 16937 /* The connection will be killed by a reset check the tracepoint */ 16938 tcp_trace_point(rack->rc_tp, TCP_TP_RESET_RCV); 16939 } 16940 retval = (*rack->r_substate) (m, th, so, 16941 tp, &to, drop_hdrlen, 16942 tlen, tiwin, thflags, nxt_pkt, iptos); 16943 if (retval == 0) { 16944 /* 16945 * If retval is 1 the tcb is unlocked and most likely the tp 16946 * is gone. 
16947 */ 16948 INP_WLOCK_ASSERT(inp); 16949 if ((rack->rc_gp_dyn_mul) && 16950 (rack->rc_always_pace) && 16951 (rack->use_fixed_rate == 0) && 16952 rack->in_probe_rtt && 16953 (rack->r_ctl.rc_time_probertt_starts == 0)) { 16954 /* 16955 * If we are going for target, lets recheck before 16956 * we output. 16957 */ 16958 rack_check_probe_rtt(rack, us_cts); 16959 } 16960 if (rack->set_pacing_done_a_iw == 0) { 16961 /* How much has been acked? */ 16962 if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) { 16963 /* We have enough to set in the pacing segment size */ 16964 rack->set_pacing_done_a_iw = 1; 16965 rack_set_pace_segments(tp, rack, __LINE__, NULL); 16966 } 16967 } 16968 tcp_rack_xmit_timer_commit(rack, tp); 16969 #ifdef TCP_ACCOUNTING 16970 /* 16971 * If we set the ack_val_set to what ack processing we are doing 16972 * we also want to track how many cycles we burned. Note 16973 * the bits after tcp_output we let be "free". This is because 16974 * we are also tracking the tcp_output times as well. Note the 16975 * use of 0xf here since we only have 11 counters (0 - 0xa) and 16976 * 0xf cannot be returned and is what we initialize it to, to 16977 * indicate we are not doing the tabulations. 16978 */ 16979 if (ack_val_set != 0xf) { 16980 uint64_t crtsc; 16981 16982 crtsc = get_cyclecount(); 16983 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16984 tp->tcp_proc_time[ack_val_set] += (crtsc - ts_val); 16985 } 16986 } 16987 #endif 16988 if ((nxt_pkt == 0) && (no_output == 0)) { 16989 if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) { 16990 do_output_now: 16991 if (tcp_output(tp) < 0) { 16992 #ifdef TCP_ACCOUNTING 16993 sched_unpin(); 16994 #endif 16995 return (1); 16996 } 16997 did_out = 1; 16998 } 16999 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 17000 rack_free_trim(rack); 17001 } else if ((no_output == 1) && 17002 (nxt_pkt == 0) && 17003 (tcp_in_hpts(rack->rc_tp) == 0)) { 17004 /* 17005 * We are not in hpts and we had a pacing timer up. Use 17006 * the remaining time (slot_remaining) to restart the timer. 17007 */ 17008 KASSERT ((slot_remaining != 0), ("slot remaining is zero for rack:%p tp:%p", rack, tp)); 17009 rack_start_hpts_timer(rack, tp, cts, slot_remaining, 0, 0); 17010 rack_free_trim(rack); 17011 } 17012 /* Clear the flag, it may have been cleared by output but we may not have */ 17013 if ((nxt_pkt == 0) && (tp->t_flags2 & TF2_HPTS_CALLS)) 17014 tp->t_flags2 &= ~TF2_HPTS_CALLS; 17015 /* Update any rounds needed */ 17016 if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) 17017 rack_log_hystart_event(rack, high_seq, 8); 17018 /* 17019 * The draft (v3) calls for us to use SEQ_GEQ, but that 17020 * causes issues when we are just going app limited. Lets 17021 * instead use SEQ_GT <or> where it's equal but more data 17022 * is outstanding. 17023 * 17024 * Also make sure we are on the last ack of a series. We 17025 * have to have all the ack's processed in queue to know 17026 * if there is something left outstanding.
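 * Hypothetical illustration of the round bump below: with roundends = 150000,
 * a cumulative ack that advances snd_una to 150000 (or beyond) on the final
 * segment of a burst (nxt_pkt == 0) while rc_new_rnd_needed is clear increments
 * current_round and, if the CC module provides one, invokes its newround() hook.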
17027 */ 17028 if (SEQ_GEQ(tp->snd_una, rack->r_ctl.roundends) && 17029 (rack->rc_new_rnd_needed == 0) && 17030 (nxt_pkt == 0)) { 17031 rack_log_hystart_event(rack, tp->snd_una, 21); 17032 rack->r_ctl.current_round++; 17033 /* Force the next send to setup the next round */ 17034 rack->rc_new_rnd_needed = 1; 17035 if (CC_ALGO(tp)->newround != NULL) { 17036 CC_ALGO(tp)->newround(&tp->t_ccv, rack->r_ctl.current_round); 17037 } 17038 } 17039 if ((nxt_pkt == 0) && 17040 ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && 17041 (SEQ_GT(tp->snd_max, tp->snd_una) || 17042 (tp->t_flags & TF_DELACK) || 17043 ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 17044 (tp->t_state <= TCPS_CLOSING)))) { 17045 /* We could not send (probably in the hpts but stopped the timer earlier)? */ 17046 if ((tp->snd_max == tp->snd_una) && 17047 ((tp->t_flags & TF_DELACK) == 0) && 17048 (tcp_in_hpts(rack->rc_tp)) && 17049 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 17050 /* keep alive not needed if we are hptsi output yet */ 17051 ; 17052 } else { 17053 int late = 0; 17054 if (tcp_in_hpts(tp)) { 17055 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 17056 us_cts = tcp_get_usecs(NULL); 17057 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 17058 rack->r_early = 1; 17059 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); 17060 } else 17061 late = 1; 17062 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 17063 } 17064 tcp_hpts_remove(tp); 17065 } 17066 if (late && (did_out == 0)) { 17067 /* 17068 * We are late in the sending 17069 * and we did not call the output 17070 * (this probably should not happen). 17071 */ 17072 goto do_output_now; 17073 } 17074 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); 17075 } 17076 way_out = 1; 17077 } else if (nxt_pkt == 0) { 17078 /* Do we have the correct timer running? */ 17079 rack_timer_audit(tp, rack, &so->so_snd); 17080 way_out = 2; 17081 } 17082 done_with_input: 17083 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out, max(1, nsegs)); 17084 if (did_out) 17085 rack->r_wanted_output = 0; 17086 } 17087 #ifdef TCP_ACCOUNTING 17088 sched_unpin(); 17089 #endif 17090 return (retval); 17091 } 17092 17093 static void 17094 rack_do_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, 17095 int32_t drop_hdrlen, int32_t tlen, uint8_t iptos) 17096 { 17097 struct timeval tv; 17098 17099 /* First lets see if we have old packets */ 17100 if (!STAILQ_EMPTY(&tp->t_inqueue)) { 17101 if (ctf_do_queued_segments(tp, 1)) { 17102 m_freem(m); 17103 return; 17104 } 17105 } 17106 if (m->m_flags & M_TSTMP_LRO) { 17107 mbuf_tstmp2timeval(m, &tv); 17108 } else { 17109 /* Should not be should we kassert instead? */ 17110 tcp_get_usecs(&tv); 17111 } 17112 if (rack_do_segment_nounlock(tp, m, th, drop_hdrlen, tlen, iptos, 0, 17113 &tv) == 0) { 17114 INP_WUNLOCK(tptoinpcb(tp)); 17115 } 17116 } 17117 17118 struct rack_sendmap * 17119 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) 17120 { 17121 struct rack_sendmap *rsm = NULL; 17122 int32_t idx; 17123 uint32_t srtt = 0, thresh = 0, ts_low = 0; 17124 int no_sack = 0; 17125 17126 /* Return the next guy to be re-transmitted */ 17127 if (tqhash_empty(rack->r_ctl.tqh)) { 17128 return (NULL); 17129 } 17130 if (tp->t_flags & TF_SENTFIN) { 17131 /* retran the end FIN? 
*/ 17132 return (NULL); 17133 } 17134 /* ok lets look at this one */ 17135 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 17136 if (rack->r_must_retran && rsm && (rsm->r_flags & RACK_MUST_RXT)) { 17137 return (rsm); 17138 } 17139 if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) { 17140 goto check_it; 17141 } 17142 rsm = rack_find_lowest_rsm(rack); 17143 if (rsm == NULL) { 17144 return (NULL); 17145 } 17146 check_it: 17147 if (((rack->rc_tp->t_flags & TF_SACK_PERMIT) == 0) || 17148 (rack->sack_attack_disable > 0)) { 17149 no_sack = 1; 17150 } 17151 if ((no_sack > 0) && 17152 (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { 17153 /* 17154 * No sack so we automatically do the 3 strikes and 17155 * retransmit (no rack timer would be started). 17156 */ 17157 return (rsm); 17158 } 17159 if (rsm->r_flags & RACK_ACKED) { 17160 return (NULL); 17161 } 17162 if (((rsm->r_flags & RACK_SACK_PASSED) == 0) && 17163 (rsm->r_dupack < DUP_ACK_THRESHOLD)) { 17164 /* Its not yet ready */ 17165 return (NULL); 17166 } 17167 srtt = rack_grab_rtt(tp, rack); 17168 idx = rsm->r_rtr_cnt - 1; 17169 ts_low = (uint32_t)rsm->r_tim_lastsent[idx]; 17170 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 17171 if ((tsused == ts_low) || 17172 (TSTMP_LT(tsused, ts_low))) { 17173 /* No time since sending */ 17174 return (NULL); 17175 } 17176 if ((tsused - ts_low) < thresh) { 17177 /* It has not been long enough yet */ 17178 return (NULL); 17179 } 17180 if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || 17181 ((rsm->r_flags & RACK_SACK_PASSED) && 17182 (rack->sack_attack_disable == 0))) { 17183 /* 17184 * We have passed the dup-ack threshold <or> 17185 * a SACK has indicated this is missing. 17186 * Note that if you are a declared attacker 17187 * it is only the dup-ack threshold that 17188 * will cause retransmits. 
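 * A sketch of the timing that gets us here (the thresh value is hypothetical):
 * with srtt = 40000 usec and rack_calc_thresh_rack() returning, say, 50000 usec,
 * a segment last sent at ts_low = 1,000,000 is still not eligible at
 * tsused = 1,040,000 but is returned for retransmit at 1,050,000, provided it
 * has been SACK-passed or has reached DUP_ACK_THRESHOLD.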
17189 */ 17190 /* log retransmit reason */ 17191 rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1); 17192 rack->r_fast_output = 0; 17193 return (rsm); 17194 } 17195 return (NULL); 17196 } 17197 17198 static void 17199 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, 17200 uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, 17201 int line, struct rack_sendmap *rsm, uint8_t quality) 17202 { 17203 if (tcp_bblogging_on(rack->rc_tp)) { 17204 union tcp_log_stackspecific log; 17205 struct timeval tv; 17206 17207 memset(&log, 0, sizeof(log)); 17208 log.u_bbr.flex1 = slot; 17209 log.u_bbr.flex2 = len; 17210 log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs; 17211 log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs; 17212 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ss; 17213 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_ca; 17214 log.u_bbr.use_lt_bw = rack->rc_ack_can_sendout_data; 17215 log.u_bbr.use_lt_bw <<= 1; 17216 log.u_bbr.use_lt_bw |= rack->r_late; 17217 log.u_bbr.use_lt_bw <<= 1; 17218 log.u_bbr.use_lt_bw |= rack->r_early; 17219 log.u_bbr.use_lt_bw <<= 1; 17220 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set; 17221 log.u_bbr.use_lt_bw <<= 1; 17222 log.u_bbr.use_lt_bw |= rack->rc_gp_filled; 17223 log.u_bbr.use_lt_bw <<= 1; 17224 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; 17225 log.u_bbr.use_lt_bw <<= 1; 17226 log.u_bbr.use_lt_bw |= rack->in_probe_rtt; 17227 log.u_bbr.use_lt_bw <<= 1; 17228 log.u_bbr.use_lt_bw |= rack->gp_ready; 17229 log.u_bbr.pkt_epoch = line; 17230 log.u_bbr.epoch = rack->r_ctl.rc_agg_delayed; 17231 log.u_bbr.lt_epoch = rack->r_ctl.rc_agg_early; 17232 log.u_bbr.applimited = rack->r_ctl.rack_per_of_gp_rec; 17233 log.u_bbr.bw_inuse = bw_est; 17234 log.u_bbr.delRate = bw; 17235 if (rack->r_ctl.gp_bw == 0) 17236 log.u_bbr.cur_del_rate = 0; 17237 else 17238 log.u_bbr.cur_del_rate = rack_get_bw(rack); 17239 log.u_bbr.rttProp = len_time; 17240 log.u_bbr.pkts_out = rack->r_ctl.rc_rack_min_rtt; 17241 log.u_bbr.lost = rack->r_ctl.rc_probertt_sndmax_atexit; 17242 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); 17243 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) { 17244 /* We are in slow start */ 17245 log.u_bbr.flex7 = 1; 17246 } else { 17247 /* we are on congestion avoidance */ 17248 log.u_bbr.flex7 = 0; 17249 } 17250 log.u_bbr.flex8 = method; 17251 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 17252 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 17253 log.u_bbr.cwnd_gain = rack->rc_gp_saw_rec; 17254 log.u_bbr.cwnd_gain <<= 1; 17255 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; 17256 log.u_bbr.cwnd_gain <<= 1; 17257 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; 17258 log.u_bbr.bbr_substate = quality; 17259 log.u_bbr.bbr_state = rack->dgp_on; 17260 log.u_bbr.bbr_state <<= 1; 17261 log.u_bbr.bbr_state |= rack->r_fill_less_agg; 17262 log.u_bbr.bbr_state <<= 1; 17263 log.u_bbr.bbr_state |= rack->rc_pace_to_cwnd; 17264 log.u_bbr.bbr_state <<= 2; 17265 log.u_bbr.bbr_state |= rack->r_pacing_discount; 17266 log.u_bbr.flex7 = ((rack->r_ctl.pacing_discount_amm << 1) | log.u_bbr.flex7); 17267 TCP_LOG_EVENTP(rack->rc_tp, NULL, 17268 &rack->rc_inp->inp_socket->so_rcv, 17269 &rack->rc_inp->inp_socket->so_snd, 17270 BBR_LOG_HPTSI_CALC, 0, 17271 0, &log, false, &tv); 17272 } 17273 } 17274 17275 static uint32_t 17276 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss) 17277 { 17278 uint32_t new_tso, user_max, pace_one; 17279 17280 user_max = rack->rc_user_set_max_segs * mss; 17281 if 
(rack->rc_force_max_seg) { 17282 return (user_max); 17283 } 17284 if (rack->use_fixed_rate && 17285 ((rack->r_ctl.crte == NULL) || 17286 (bw != rack->r_ctl.crte->rate))) { 17287 /* Use the user mss since we are not exactly matched */ 17288 return (user_max); 17289 } 17290 if (rack_pace_one_seg || 17291 (rack->r_ctl.rc_user_set_min_segs == 1)) 17292 pace_one = 1; 17293 else 17294 pace_one = 0; 17295 17296 new_tso = tcp_get_pacing_burst_size_w_divisor(rack->rc_tp, bw, mss, 17297 pace_one, rack->r_ctl.crte, NULL, rack->r_ctl.pace_len_divisor); 17298 if (new_tso > user_max) 17299 new_tso = user_max; 17300 if (rack->rc_hybrid_mode && rack->r_ctl.client_suggested_maxseg) { 17301 if (((uint32_t)rack->r_ctl.client_suggested_maxseg * mss) > new_tso) 17302 new_tso = (uint32_t)rack->r_ctl.client_suggested_maxseg * mss; 17303 } 17304 if (rack->r_ctl.rc_user_set_min_segs && 17305 ((rack->r_ctl.rc_user_set_min_segs * mss) > new_tso)) 17306 new_tso = rack->r_ctl.rc_user_set_min_segs * mss; 17307 return (new_tso); 17308 } 17309 17310 static int32_t 17311 pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz, int *capped, uint64_t *rate_wanted, uint8_t non_paced) 17312 { 17313 uint64_t lentim, fill_bw; 17314 17315 /* Lets first see if we are full, if so continue with normal rate */ 17316 rack->r_via_fill_cw = 0; 17317 if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use) 17318 return (slot); 17319 if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd) 17320 return (slot); 17321 if (rack->r_ctl.rc_last_us_rtt == 0) 17322 return (slot); 17323 if (rack->rc_pace_fill_if_rttin_range && 17324 (rack->r_ctl.rc_last_us_rtt >= 17325 (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) { 17326 /* The rtt is huge, N * smallest, lets not fill */ 17327 return (slot); 17328 } 17329 /* 17330 * first lets calculate the b/w based on the last us-rtt 17331 * and the the smallest send window. 17332 */ 17333 fill_bw = min(rack->rc_tp->snd_cwnd, rack->r_ctl.cwnd_to_use); 17334 /* Take the rwnd if its smaller */ 17335 if (fill_bw > rack->rc_tp->snd_wnd) 17336 fill_bw = rack->rc_tp->snd_wnd; 17337 /* Now lets make it into a b/w */ 17338 fill_bw *= (uint64_t)HPTS_USEC_IN_SEC; 17339 fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt; 17340 if (rack->r_fill_less_agg) { 17341 /* 17342 * We want the average of the rate_wanted 17343 * and our fill-cw calculated bw. We also want 17344 * to cap any increase to be no more than 17345 * X times the lt_bw (where X is the rack_bw_multipler). 
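 * Hypothetical numbers for the averaging below: if the fill-cw calculation
 * yields 2,000,000 bytes/sec while rate_wanted is 1,000,000 and lt_bw is
 * 1,200,000, then rate = 1,200,000 and fill_bw becomes (2,000,000 + 1,200,000)
 * / 2 = 1,600,000; with rack_bw_multipler set to 1 that is further clamped to
 * 1,200,000 bytes/sec.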
17346 */ 17347 uint64_t lt_bw, rate; 17348 17349 lt_bw = rack_get_lt_bw(rack); 17350 if (lt_bw > *rate_wanted) 17351 rate = lt_bw; 17352 else 17353 rate = *rate_wanted; 17354 fill_bw += rate; 17355 fill_bw /= 2; 17356 if (rack_bw_multipler && (fill_bw > (rate * rack_bw_multipler))) { 17357 fill_bw = rate * rack_bw_multipler; 17358 } 17359 } 17360 /* We are below the min b/w */ 17361 if (non_paced) 17362 *rate_wanted = fill_bw; 17363 if ((fill_bw < RACK_MIN_BW) || (fill_bw < *rate_wanted)) 17364 return (slot); 17365 rack->r_via_fill_cw = 1; 17366 if (rack->r_rack_hw_rate_caps && 17367 (rack->r_ctl.crte != NULL)) { 17368 uint64_t high_rate; 17369 17370 high_rate = tcp_hw_highest_rate(rack->r_ctl.crte); 17371 if (fill_bw > high_rate) { 17372 /* We are capping bw at the highest rate table entry */ 17373 if (*rate_wanted > high_rate) { 17374 /* The original rate was also capped */ 17375 rack->r_via_fill_cw = 0; 17376 } 17377 rack_log_hdwr_pacing(rack, 17378 fill_bw, high_rate, __LINE__, 17379 0, 3); 17380 fill_bw = high_rate; 17381 if (capped) 17382 *capped = 1; 17383 } 17384 } else if ((rack->r_ctl.crte == NULL) && 17385 (rack->rack_hdrw_pacing == 0) && 17386 (rack->rack_hdw_pace_ena) && 17387 rack->r_rack_hw_rate_caps && 17388 (rack->rack_attempt_hdwr_pace == 0) && 17389 (rack->rc_inp->inp_route.ro_nh != NULL) && 17390 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 17391 /* 17392 * Ok we may have a first attempt that is greater than our top rate 17393 * lets check. 17394 */ 17395 uint64_t high_rate; 17396 17397 high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp); 17398 if (high_rate) { 17399 if (fill_bw > high_rate) { 17400 fill_bw = high_rate; 17401 if (capped) 17402 *capped = 1; 17403 } 17404 } 17405 } 17406 if (rack->r_ctl.bw_rate_cap && (fill_bw > rack->r_ctl.bw_rate_cap)) { 17407 rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, 17408 fill_bw, 0, 0, HYBRID_LOG_RATE_CAP, 2, NULL); 17409 fill_bw = rack->r_ctl.bw_rate_cap; 17410 } 17411 /* 17412 * Ok fill_bw holds our mythical b/w to fill the cwnd 17413 * in an rtt (unless it was capped), what does that 17414 * time wise equate too? 17415 */ 17416 lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC; 17417 lentim /= fill_bw; 17418 *rate_wanted = fill_bw; 17419 if (non_paced || (lentim < slot)) { 17420 rack_log_pacing_delay_calc(rack, len, slot, fill_bw, 17421 0, lentim, 12, __LINE__, NULL, 0); 17422 return ((int32_t)lentim); 17423 } else 17424 return (slot); 17425 } 17426 17427 static int32_t 17428 rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz) 17429 { 17430 uint64_t srtt; 17431 int32_t slot = 0; 17432 int32_t minslot = 0; 17433 int can_start_hw_pacing = 1; 17434 int err; 17435 int pace_one; 17436 17437 if (rack_pace_one_seg || 17438 (rack->r_ctl.rc_user_set_min_segs == 1)) 17439 pace_one = 1; 17440 else 17441 pace_one = 0; 17442 if (rack->rc_always_pace == 0) { 17443 /* 17444 * We use the most optimistic possible cwnd/srtt for 17445 * sending calculations. This will make our 17446 * calculation anticipate getting more through 17447 * quicker then possible. But thats ok we don't want 17448 * the peer to have a gap in data sending. 17449 */ 17450 uint64_t cwnd, tr_perms = 0; 17451 int32_t reduce = 0; 17452 17453 old_method: 17454 /* 17455 * We keep no precise pacing with the old method 17456 * instead we use the pacer to mitigate bursts. 
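 * A worked example of the old method, with made-up values: cwnd = 100000
 * bytes and srtt = 20000 usec give tr_perms = (100000 * 1000) / 20000 = 5000
 * bytes per msec; a 40000 byte send then yields slot = 8 msec, a
 * rack_slot_reduction of, say, 4 trims that to 6 msec, and the final multiply
 * by HPTS_USEC_IN_MSEC turns it into 6000 usec of pacing delay.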
17457 */ 17458 if (rack->r_ctl.rc_rack_min_rtt) 17459 srtt = rack->r_ctl.rc_rack_min_rtt; 17460 else 17461 srtt = max(tp->t_srtt, 1); 17462 if (rack->r_ctl.rc_rack_largest_cwnd) 17463 cwnd = rack->r_ctl.rc_rack_largest_cwnd; 17464 else 17465 cwnd = rack->r_ctl.cwnd_to_use; 17466 /* Inflate cwnd by 1000 so srtt of usecs is in ms */ 17467 tr_perms = (cwnd * 1000) / srtt; 17468 if (tr_perms == 0) { 17469 tr_perms = ctf_fixed_maxseg(tp); 17470 } 17471 /* 17472 * Calculate how long this will take to drain, if 17473 * the calculation comes out to zero, thats ok we 17474 * will use send_a_lot to possibly spin around for 17475 * more increasing tot_len_this_send to the point 17476 * that its going to require a pace, or we hit the 17477 * cwnd. Which in that case we are just waiting for 17478 * a ACK. 17479 */ 17480 slot = len / tr_perms; 17481 /* Now do we reduce the time so we don't run dry? */ 17482 if (slot && rack_slot_reduction) { 17483 reduce = (slot / rack_slot_reduction); 17484 if (reduce < slot) { 17485 slot -= reduce; 17486 } else 17487 slot = 0; 17488 } 17489 slot *= HPTS_USEC_IN_MSEC; 17490 if (rack->rc_pace_to_cwnd) { 17491 uint64_t rate_wanted = 0; 17492 17493 slot = pace_to_fill_cwnd(rack, slot, len, segsiz, NULL, &rate_wanted, 1); 17494 rack->rc_ack_can_sendout_data = 1; 17495 rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, 0, 0, 14, __LINE__, NULL, 0); 17496 } else 17497 rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL, 0); 17498 /*******************************************************/ 17499 /* RRS: We insert non-paced call to stats here for len */ 17500 /*******************************************************/ 17501 } else { 17502 uint64_t bw_est, res, lentim, rate_wanted; 17503 uint32_t segs, oh; 17504 int capped = 0; 17505 int prev_fill; 17506 17507 if ((rack->r_rr_config == 1) && rsm) { 17508 return (rack->r_ctl.rc_min_to); 17509 } 17510 if (rack->use_fixed_rate) { 17511 rate_wanted = bw_est = rack_get_fixed_pacing_bw(rack); 17512 } else if ((rack->r_ctl.init_rate == 0) && 17513 (rack->r_ctl.gp_bw == 0)) { 17514 /* no way to yet do an estimate */ 17515 bw_est = rate_wanted = 0; 17516 } else { 17517 bw_est = rack_get_bw(rack); 17518 rate_wanted = rack_get_output_bw(rack, bw_est, rsm, &capped); 17519 } 17520 if ((bw_est == 0) || (rate_wanted == 0) || 17521 ((rack->gp_ready == 0) && (rack->use_fixed_rate == 0))) { 17522 /* 17523 * No way yet to make a b/w estimate or 17524 * our raise is set incorrectly. 17525 */ 17526 goto old_method; 17527 } 17528 rack_rate_cap_bw(rack, &rate_wanted, &capped); 17529 /* We need to account for all the overheads */ 17530 segs = (len + segsiz - 1) / segsiz; 17531 /* 17532 * We need the diff between 1514 bytes (e-mtu with e-hdr) 17533 * and how much data we put in each packet. Yes this 17534 * means we may be off if we are larger than 1500 bytes 17535 * or smaller. But this just makes us more conservative. 
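 * Worked example of the overhead math below (IPv4, hypothetical numbers):
 * with t_maxseg = segsiz = 1448 the per-packet overhead is 0 + 20 (tcphdr)
 * + 20 (ip) + 14 (ethernet) = 54 bytes; a 14480 byte send is 10 segments, so
 * 540 overhead bytes are added and lentim = (14480 + 540) * HPTS_USEC_IN_SEC;
 * at a rate_wanted of 1,500,000 bytes/sec that works out to roughly 10013 usec.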
17536 */ 17537 17538 oh = (tp->t_maxseg - segsiz) + sizeof(struct tcphdr); 17539 if (rack->r_is_v6) { 17540 #ifdef INET6 17541 oh += sizeof(struct ip6_hdr); 17542 #endif 17543 } else { 17544 #ifdef INET 17545 oh += sizeof(struct ip); 17546 #endif 17547 } 17548 /* We add a fixed 14 for the ethernet header */ 17549 oh += 14; 17550 segs *= oh; 17551 lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC; 17552 res = lentim / rate_wanted; 17553 slot = (uint32_t)res; 17554 if (rack_hw_rate_min && 17555 (rate_wanted < rack_hw_rate_min)) { 17556 can_start_hw_pacing = 0; 17557 if (rack->r_ctl.crte) { 17558 /* 17559 * Ok we need to release it, we 17560 * have fallen too low. 17561 */ 17562 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 17563 rack->r_ctl.crte = NULL; 17564 rack->rack_attempt_hdwr_pace = 0; 17565 rack->rack_hdrw_pacing = 0; 17566 } 17567 } 17568 if (rack->r_ctl.crte && 17569 (tcp_hw_highest_rate(rack->r_ctl.crte) < rate_wanted)) { 17570 /* 17571 * We want more than the hardware can give us, 17572 * don't start any hw pacing. 17573 */ 17574 can_start_hw_pacing = 0; 17575 if (rack->r_rack_hw_rate_caps == 0) { 17576 /* 17577 * Ok we need to release it, we 17578 * want more than the card can give us and 17579 * no rate cap is in place. Set it up so 17580 * when we want less we can retry. 17581 */ 17582 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 17583 rack->r_ctl.crte = NULL; 17584 rack->rack_attempt_hdwr_pace = 0; 17585 rack->rack_hdrw_pacing = 0; 17586 } 17587 } 17588 if ((rack->r_ctl.crte != NULL) && (rack->rc_inp->inp_snd_tag == NULL)) { 17589 /* 17590 * We lost our rate somehow, this can happen 17591 * if the interface changed underneath us. 17592 */ 17593 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 17594 rack->r_ctl.crte = NULL; 17595 /* Lets re-allow attempting to setup pacing */ 17596 rack->rack_hdrw_pacing = 0; 17597 rack->rack_attempt_hdwr_pace = 0; 17598 rack_log_hdwr_pacing(rack, 17599 rate_wanted, bw_est, __LINE__, 17600 0, 6); 17601 } 17602 prev_fill = rack->r_via_fill_cw; 17603 if ((rack->rc_pace_to_cwnd) && 17604 (capped == 0) && 17605 (rack->use_fixed_rate == 0) && 17606 (rack->in_probe_rtt == 0) && 17607 (IN_FASTRECOVERY(rack->rc_tp->t_flags) == 0)) { 17608 /* 17609 * We want to pace at our rate *or* faster to 17610 * fill the cwnd to the max if its not full. 17611 */ 17612 slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz, &capped, &rate_wanted, 0); 17613 /* Re-check to make sure we are not exceeding our max b/w */ 17614 if ((rack->r_ctl.crte != NULL) && 17615 (tcp_hw_highest_rate(rack->r_ctl.crte) < rate_wanted)) { 17616 /* 17617 * We want more than the hardware can give us, 17618 * don't start any hw pacing. 17619 */ 17620 can_start_hw_pacing = 0; 17621 if (rack->r_rack_hw_rate_caps == 0) { 17622 /* 17623 * Ok we need to release it, we 17624 * want more than the card can give us and 17625 * no rate cap is in place. Set it up so 17626 * when we want less we can retry. 
17627 */ 17628 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 17629 rack->r_ctl.crte = NULL; 17630 rack->rack_attempt_hdwr_pace = 0; 17631 rack->rack_hdrw_pacing = 0; 17632 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 17633 } 17634 } 17635 } 17636 if ((rack->rc_inp->inp_route.ro_nh != NULL) && 17637 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 17638 if ((rack->rack_hdw_pace_ena) && 17639 (can_start_hw_pacing > 0) && 17640 (rack->rack_hdrw_pacing == 0) && 17641 (rack->rack_attempt_hdwr_pace == 0)) { 17642 /* 17643 * Lets attempt to turn on hardware pacing 17644 * if we can. 17645 */ 17646 rack->rack_attempt_hdwr_pace = 1; 17647 rack->r_ctl.crte = tcp_set_pacing_rate(rack->rc_tp, 17648 rack->rc_inp->inp_route.ro_nh->nh_ifp, 17649 rate_wanted, 17650 RS_PACING_GEQ, 17651 &err, &rack->r_ctl.crte_prev_rate); 17652 if (rack->r_ctl.crte) { 17653 rack->rack_hdrw_pacing = 1; 17654 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size_w_divisor(tp, rate_wanted, segsiz, 17655 pace_one, rack->r_ctl.crte, 17656 NULL, rack->r_ctl.pace_len_divisor); 17657 rack_log_hdwr_pacing(rack, 17658 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 17659 err, 0); 17660 rack->r_ctl.last_hw_bw_req = rate_wanted; 17661 } else { 17662 counter_u64_add(rack_hw_pace_init_fail, 1); 17663 } 17664 } else if (rack->rack_hdrw_pacing && 17665 (rack->r_ctl.last_hw_bw_req != rate_wanted)) { 17666 /* Do we need to adjust our rate? */ 17667 const struct tcp_hwrate_limit_table *nrte; 17668 17669 if (rack->r_up_only && 17670 (rate_wanted < rack->r_ctl.crte->rate)) { 17671 /** 17672 * We have four possible states here 17673 * having to do with the previous time 17674 * and this time. 17675 * previous | this-time 17676 * A) 0 | 0 -- fill_cw not in the picture 17677 * B) 1 | 0 -- we were doing a fill-cw but now are not 17678 * C) 1 | 1 -- all rates from fill_cw 17679 * D) 0 | 1 -- we were doing non-fill and now we are filling 17680 * 17681 * For case A, C and D we don't allow a drop. But for 17682 * case B where we now our on our steady rate we do 17683 * allow a drop. 17684 * 17685 */ 17686 if (!((prev_fill == 1) && (rack->r_via_fill_cw == 0))) 17687 goto done_w_hdwr; 17688 } 17689 if ((rate_wanted > rack->r_ctl.crte->rate) || 17690 (rate_wanted <= rack->r_ctl.crte_prev_rate)) { 17691 if (rack_hw_rate_to_low && 17692 (bw_est < rack_hw_rate_to_low)) { 17693 /* 17694 * The pacing rate is too low for hardware, but 17695 * do allow hardware pacing to be restarted. 17696 */ 17697 rack_log_hdwr_pacing(rack, 17698 bw_est, rack->r_ctl.crte->rate, __LINE__, 17699 0, 5); 17700 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 17701 rack->r_ctl.crte = NULL; 17702 rack->rack_attempt_hdwr_pace = 0; 17703 rack->rack_hdrw_pacing = 0; 17704 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted); 17705 goto done_w_hdwr; 17706 } 17707 nrte = tcp_chg_pacing_rate(rack->r_ctl.crte, 17708 rack->rc_tp, 17709 rack->rc_inp->inp_route.ro_nh->nh_ifp, 17710 rate_wanted, 17711 RS_PACING_GEQ, 17712 &err, &rack->r_ctl.crte_prev_rate); 17713 if (nrte == NULL) { 17714 /* 17715 * Lost the rate, lets drop hardware pacing 17716 * period. 
17717 */ 17718 rack->rack_hdrw_pacing = 0; 17719 rack->r_ctl.crte = NULL; 17720 rack_log_hdwr_pacing(rack, 17721 rate_wanted, 0, __LINE__, 17722 err, 1); 17723 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted); 17724 counter_u64_add(rack_hw_pace_lost, 1); 17725 } else if (nrte != rack->r_ctl.crte) { 17726 rack->r_ctl.crte = nrte; 17727 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size_w_divisor(tp, rate_wanted, 17728 segsiz, pace_one, rack->r_ctl.crte, 17729 NULL, rack->r_ctl.pace_len_divisor); 17730 rack_log_hdwr_pacing(rack, 17731 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 17732 err, 2); 17733 rack->r_ctl.last_hw_bw_req = rate_wanted; 17734 } 17735 } else { 17736 /* We just need to adjust the segment size */ 17737 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted); 17738 rack_log_hdwr_pacing(rack, 17739 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 17740 0, 4); 17741 rack->r_ctl.last_hw_bw_req = rate_wanted; 17742 } 17743 } 17744 } 17745 if (minslot && (minslot > slot)) { 17746 rack_log_pacing_delay_calc(rack, minslot, slot, rack->r_ctl.crte->rate, bw_est, lentim, 17747 98, __LINE__, NULL, 0); 17748 slot = minslot; 17749 } 17750 done_w_hdwr: 17751 if (rack_limit_time_with_srtt && 17752 (rack->use_fixed_rate == 0) && 17753 (rack->rack_hdrw_pacing == 0)) { 17754 /* 17755 * Sanity check, we do not allow the pacing delay 17756 * to be longer than the SRTT of the path. If it is 17757 * a slow path, then adding a packet should increase 17758 * the RTT and compensate for this i.e. the srtt will 17759 * be greater so the allowed pacing time will be greater. 17760 * 17761 * Note this restriction is not for where a peak rate 17762 * is set, we are doing fixed pacing or hardware pacing. 17763 */ 17764 if (rack->rc_tp->t_srtt) 17765 srtt = rack->rc_tp->t_srtt; 17766 else 17767 srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC; /* its in ms convert */ 17768 if (srtt < (uint64_t)slot) { 17769 rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL, 0); 17770 slot = srtt; 17771 } 17772 } 17773 /*******************************************************************/ 17774 /* RRS: We insert paced call to stats here for len and rate_wanted */ 17775 /*******************************************************************/ 17776 rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm, 0); 17777 } 17778 if (rack->r_ctl.crte && (rack->r_ctl.crte->rs_num_enobufs > 0)) { 17779 /* 17780 * If this rate is seeing enobufs when it 17781 * goes to send then either the nic is out 17782 * of gas or we are mis-estimating the time 17783 * somehow and not letting the queue empty 17784 * completely. Lets add to the pacing time. 17785 */ 17786 int hw_boost_delay; 17787 17788 hw_boost_delay = rack->r_ctl.crte->time_between * rack_enobuf_hw_boost_mult; 17789 if (hw_boost_delay > rack_enobuf_hw_max) 17790 hw_boost_delay = rack_enobuf_hw_max; 17791 else if (hw_boost_delay < rack_enobuf_hw_min) 17792 hw_boost_delay = rack_enobuf_hw_min; 17793 slot += hw_boost_delay; 17794 } 17795 return (slot); 17796 } 17797 17798 static void 17799 rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack, 17800 tcp_seq startseq, uint32_t sb_offset) 17801 { 17802 struct rack_sendmap *my_rsm = NULL; 17803 17804 if (tp->t_state < TCPS_ESTABLISHED) { 17805 /* 17806 * We don't start any measurements if we are 17807 * not at least established. 
17808 */ 17809 return; 17810 } 17811 if (tp->t_state >= TCPS_FIN_WAIT_1) { 17812 /* 17813 * We will get no more data into the SB 17814 * this means we need to have the data available 17815 * before we start a measurement. 17816 */ 17817 17818 if (sbavail(&tptosocket(tp)->so_snd) < 17819 max(rc_init_window(rack), 17820 (MIN_GP_WIN * ctf_fixed_maxseg(tp)))) { 17821 /* Nope not enough data */ 17822 return; 17823 } 17824 } 17825 tp->t_flags |= TF_GPUTINPROG; 17826 rack->r_ctl.rc_gp_cumack_ts = 0; 17827 rack->r_ctl.rc_gp_lowrtt = 0xffffffff; 17828 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 17829 tp->gput_seq = startseq; 17830 rack->app_limited_needs_set = 0; 17831 if (rack->in_probe_rtt) 17832 rack->measure_saw_probe_rtt = 1; 17833 else if ((rack->measure_saw_probe_rtt) && 17834 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 17835 rack->measure_saw_probe_rtt = 0; 17836 if (rack->rc_gp_filled) 17837 tp->gput_ts = rack->r_ctl.last_cumack_advance; 17838 else { 17839 /* Special case initial measurement */ 17840 struct timeval tv; 17841 17842 tp->gput_ts = tcp_get_usecs(&tv); 17843 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); 17844 } 17845 /* 17846 * We take a guess out into the future, 17847 * if we have no measurement and no 17848 * initial rate, we measure the first 17849 * initial-windows worth of data to 17850 * speed up getting some GP measurement and 17851 * thus start pacing. 17852 */ 17853 if ((rack->rc_gp_filled == 0) && (rack->r_ctl.init_rate == 0)) { 17854 rack->app_limited_needs_set = 1; 17855 tp->gput_ack = startseq + max(rc_init_window(rack), 17856 (MIN_GP_WIN * ctf_fixed_maxseg(tp))); 17857 rack_log_pacing_delay_calc(rack, 17858 tp->gput_seq, 17859 tp->gput_ack, 17860 0, 17861 tp->gput_ts, 17862 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts), 17863 9, 17864 __LINE__, NULL, 0); 17865 rack_tend_gp_marks(tp, rack); 17866 rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL); 17867 return; 17868 } 17869 if (sb_offset) { 17870 /* 17871 * We are out somewhere in the sb 17872 * can we use the already outstanding data? 17873 */ 17874 17875 if (rack->r_ctl.rc_app_limited_cnt == 0) { 17876 /* 17877 * Yes first one is good and in this case 17878 * the tp->gput_ts is correctly set based on 17879 * the last ack that arrived (no need to 17880 * set things up when an ack comes in). 17881 */ 17882 my_rsm = tqhash_min(rack->r_ctl.tqh); 17883 if ((my_rsm == NULL) || 17884 (my_rsm->r_rtr_cnt != 1)) { 17885 /* retransmission? */ 17886 goto use_latest; 17887 } 17888 } else { 17889 if (rack->r_ctl.rc_first_appl == NULL) { 17890 /* 17891 * If rc_first_appl is NULL 17892 * then the cnt should be 0. 17893 * This is probably an error, maybe 17894 * a KASSERT would be approprate. 17895 */ 17896 goto use_latest; 17897 } 17898 /* 17899 * If we have a marker pointer to the last one that is 17900 * app limited we can use that, but we need to set 17901 * things up so that when it gets ack'ed we record 17902 * the ack time (if its not already acked). 17903 */ 17904 rack->app_limited_needs_set = 1; 17905 /* 17906 * We want to get to the rsm that is either 17907 * next with space i.e. over 1 MSS or the one 17908 * after that (after the app-limited). 
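 * Hypothetical example of the walk below: if rc_first_appl ends at sequence
 * 20000 and the following rsm covers [20000, 35000) with a 1448 byte MSS,
 * gput_seq is set to 21448 so the measurement starts only after a full MSS of
 * post-app-limited data; if that following rsm were itself one MSS or smaller
 * we would instead step to the rsm after it.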
17909 */ 17910 my_rsm = tqhash_next(rack->r_ctl.tqh, rack->r_ctl.rc_first_appl); 17911 if (my_rsm) { 17912 if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp)) 17913 /* Have to use the next one */ 17914 my_rsm = tqhash_next(rack->r_ctl.tqh, my_rsm); 17915 else { 17916 /* Use after the first MSS of it is acked */ 17917 tp->gput_seq = my_rsm->r_start + ctf_fixed_maxseg(tp); 17918 goto start_set; 17919 } 17920 } 17921 if ((my_rsm == NULL) || 17922 (my_rsm->r_rtr_cnt != 1)) { 17923 /* 17924 * Either its a retransmit or 17925 * the last is the app-limited one. 17926 */ 17927 goto use_latest; 17928 } 17929 } 17930 tp->gput_seq = my_rsm->r_start; 17931 start_set: 17932 if (my_rsm->r_flags & RACK_ACKED) { 17933 /* 17934 * This one has been acked use the arrival ack time 17935 */ 17936 struct rack_sendmap *nrsm; 17937 17938 tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival; 17939 rack->app_limited_needs_set = 0; 17940 /* 17941 * Ok in this path we need to use the r_end now 17942 * since this guy is the starting ack. 17943 */ 17944 tp->gput_seq = my_rsm->r_end; 17945 /* 17946 * We also need to adjust up the sendtime 17947 * to the send of the next data after my_rsm. 17948 */ 17949 nrsm = tqhash_next(rack->r_ctl.tqh, my_rsm); 17950 if (nrsm != NULL) 17951 my_rsm = nrsm; 17952 else { 17953 /* 17954 * The next as not been sent, thats the 17955 * case for using the latest. 17956 */ 17957 goto use_latest; 17958 } 17959 } 17960 rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[0]; 17961 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 17962 rack->r_ctl.rc_gp_cumack_ts = 0; 17963 rack_log_pacing_delay_calc(rack, 17964 tp->gput_seq, 17965 tp->gput_ack, 17966 (uint64_t)my_rsm, 17967 tp->gput_ts, 17968 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts), 17969 9, 17970 __LINE__, my_rsm, 0); 17971 /* Now lets make sure all are marked as they should be */ 17972 rack_tend_gp_marks(tp, rack); 17973 rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL); 17974 return; 17975 } 17976 17977 use_latest: 17978 /* 17979 * We don't know how long we may have been 17980 * idle or if this is the first-send. Lets 17981 * setup the flag so we will trim off 17982 * the first ack'd data so we get a true 17983 * measurement. 17984 */ 17985 rack->app_limited_needs_set = 1; 17986 tp->gput_ack = startseq + rack_get_measure_window(tp, rack); 17987 rack->r_ctl.rc_gp_cumack_ts = 0; 17988 /* Find this guy so we can pull the send time */ 17989 my_rsm = tqhash_find(rack->r_ctl.tqh, startseq); 17990 if (my_rsm) { 17991 rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[0]; 17992 if (my_rsm->r_flags & RACK_ACKED) { 17993 /* 17994 * Unlikely since its probably what was 17995 * just transmitted (but I am paranoid). 17996 */ 17997 tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival; 17998 rack->app_limited_needs_set = 0; 17999 } 18000 if (SEQ_LT(my_rsm->r_start, tp->gput_seq)) { 18001 /* This also is unlikely */ 18002 tp->gput_seq = my_rsm->r_start; 18003 } 18004 } else { 18005 /* 18006 * TSNH unless we have some send-map limit, 18007 * and even at that it should not be hitting 18008 * that limit (we should have stopped sending). 
18009 */ 18010 struct timeval tv; 18011 18012 microuptime(&tv); 18013 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); 18014 } 18015 rack_tend_gp_marks(tp, rack); 18016 rack_log_pacing_delay_calc(rack, 18017 tp->gput_seq, 18018 tp->gput_ack, 18019 (uint64_t)my_rsm, 18020 tp->gput_ts, 18021 (((uint64_t)rack->r_ctl.rc_app_limited_cnt << 32) | (uint64_t)rack->r_ctl.rc_gp_output_ts), 18022 9, __LINE__, NULL, 0); 18023 rack_log_gpset(rack, tp->gput_ack, 0, 0, __LINE__, 1, NULL); 18024 } 18025 18026 static inline uint32_t 18027 rack_what_can_we_send(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cwnd_to_use, 18028 uint32_t avail, int32_t sb_offset) 18029 { 18030 uint32_t len; 18031 uint32_t sendwin; 18032 18033 if (tp->snd_wnd > cwnd_to_use) 18034 sendwin = cwnd_to_use; 18035 else 18036 sendwin = tp->snd_wnd; 18037 if (ctf_outstanding(tp) >= tp->snd_wnd) { 18038 /* We never want to go over our peers rcv-window */ 18039 len = 0; 18040 } else { 18041 uint32_t flight; 18042 18043 flight = ctf_flight_size(tp, rack->r_ctl.rc_sacked); 18044 if (flight >= sendwin) { 18045 /* 18046 * We have in flight what we are allowed by cwnd (if 18047 * it was rwnd blocking it would have hit above out 18048 * >= tp->snd_wnd). 18049 */ 18050 return (0); 18051 } 18052 len = sendwin - flight; 18053 if ((len + ctf_outstanding(tp)) > tp->snd_wnd) { 18054 /* We would send too much (beyond the rwnd) */ 18055 len = tp->snd_wnd - ctf_outstanding(tp); 18056 } 18057 if ((len + sb_offset) > avail) { 18058 /* 18059 * We don't have that much in the SB, how much is 18060 * there? 18061 */ 18062 len = avail - sb_offset; 18063 } 18064 } 18065 return (len); 18066 } 18067 18068 static void 18069 rack_log_fsb(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t flags, 18070 unsigned ipoptlen, int32_t orig_len, int32_t len, int error, 18071 int rsm_is_null, int optlen, int line, uint16_t mode) 18072 { 18073 if (tcp_bblogging_on(rack->rc_tp)) { 18074 union tcp_log_stackspecific log; 18075 struct timeval tv; 18076 18077 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 18078 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 18079 log.u_bbr.flex1 = error; 18080 log.u_bbr.flex2 = flags; 18081 log.u_bbr.flex3 = rsm_is_null; 18082 log.u_bbr.flex4 = ipoptlen; 18083 log.u_bbr.flex5 = tp->rcv_numsacks; 18084 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 18085 log.u_bbr.flex7 = optlen; 18086 log.u_bbr.flex8 = rack->r_fsb_inited; 18087 log.u_bbr.applimited = rack->r_fast_output; 18088 log.u_bbr.bw_inuse = rack_get_bw(rack); 18089 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); 18090 log.u_bbr.cwnd_gain = mode; 18091 log.u_bbr.pkts_out = orig_len; 18092 log.u_bbr.lt_epoch = len; 18093 log.u_bbr.delivered = line; 18094 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 18095 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 18096 tcp_log_event(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_FSB, 0, 18097 len, &log, false, NULL, __func__, __LINE__, &tv); 18098 } 18099 } 18100 18101 18102 static struct mbuf * 18103 rack_fo_base_copym(struct mbuf *the_m, uint32_t the_off, int32_t *plen, 18104 struct rack_fast_send_blk *fsb, 18105 int32_t seglimit, int32_t segsize, int hw_tls) 18106 { 18107 #ifdef KERN_TLS 18108 struct ktls_session *tls, *ntls; 18109 #ifdef INVARIANTS 18110 struct mbuf *start; 18111 #endif 18112 #endif 18113 struct mbuf *m, *n, **np, *smb; 18114 struct mbuf *top; 18115 int32_t off, soff; 18116 int32_t len = *plen; 18117 int32_t fragsize; 18118 int32_t len_cp = 0; 18119 uint32_t mlen, frags; 18120 18121 soff = 
off = the_off; 18122 smb = m = the_m; 18123 np = &top; 18124 top = NULL; 18125 #ifdef KERN_TLS 18126 if (hw_tls && (m->m_flags & M_EXTPG)) 18127 tls = m->m_epg_tls; 18128 else 18129 tls = NULL; 18130 #ifdef INVARIANTS 18131 start = m; 18132 #endif 18133 #endif 18134 while (len > 0) { 18135 if (m == NULL) { 18136 *plen = len_cp; 18137 break; 18138 } 18139 #ifdef KERN_TLS 18140 if (hw_tls) { 18141 if (m->m_flags & M_EXTPG) 18142 ntls = m->m_epg_tls; 18143 else 18144 ntls = NULL; 18145 18146 /* 18147 * Avoid mixing TLS records with handshake 18148 * data or TLS records from different 18149 * sessions. 18150 */ 18151 if (tls != ntls) { 18152 MPASS(m != start); 18153 *plen = len_cp; 18154 break; 18155 } 18156 } 18157 #endif 18158 mlen = min(len, m->m_len - off); 18159 if (seglimit) { 18160 /* 18161 * For M_EXTPG mbufs, add 3 segments 18162 * + 1 in case we are crossing page boundaries 18163 * + 2 in case the TLS hdr/trailer are used 18164 * It is cheaper to just add the segments 18165 * than it is to take the cache miss to look 18166 * at the mbuf ext_pgs state in detail. 18167 */ 18168 if (m->m_flags & M_EXTPG) { 18169 fragsize = min(segsize, PAGE_SIZE); 18170 frags = 3; 18171 } else { 18172 fragsize = segsize; 18173 frags = 0; 18174 } 18175 18176 /* Break if we really can't fit anymore. */ 18177 if ((frags + 1) >= seglimit) { 18178 *plen = len_cp; 18179 break; 18180 } 18181 18182 /* 18183 * Reduce size if you can't copy the whole 18184 * mbuf. If we can't copy the whole mbuf, also 18185 * adjust len so the loop will end after this 18186 * mbuf. 18187 */ 18188 if ((frags + howmany(mlen, fragsize)) >= seglimit) { 18189 mlen = (seglimit - frags - 1) * fragsize; 18190 len = mlen; 18191 *plen = len_cp + len; 18192 } 18193 frags += howmany(mlen, fragsize); 18194 if (frags == 0) 18195 frags++; 18196 seglimit -= frags; 18197 KASSERT(seglimit > 0, 18198 ("%s: seglimit went too low", __func__)); 18199 } 18200 n = m_get(M_NOWAIT, m->m_type); 18201 *np = n; 18202 if (n == NULL) 18203 goto nospace; 18204 n->m_len = mlen; 18205 soff += mlen; 18206 len_cp += n->m_len; 18207 if (m->m_flags & (M_EXT|M_EXTPG)) { 18208 n->m_data = m->m_data + off; 18209 mb_dupcl(n, m); 18210 } else { 18211 bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), 18212 (u_int)n->m_len); 18213 } 18214 len -= n->m_len; 18215 off = 0; 18216 m = m->m_next; 18217 np = &n->m_next; 18218 if (len || (soff == smb->m_len)) { 18219 /* 18220 * We have more so we move forward or 18221 * we have consumed the entire mbuf and 18222 * len has fallen to 0. 18223 */ 18224 soff = 0; 18225 smb = m; 18226 } 18227 18228 } 18229 if (fsb != NULL) { 18230 fsb->m = smb; 18231 fsb->off = soff; 18232 if (smb) { 18233 /* 18234 * Save off the size of the mbuf. We do 18235 * this so that we can recognize when it 18236 * has been trimmed by sbcut() as acks 18237 * come in. 18238 */ 18239 fsb->o_m_len = smb->m_len; 18240 fsb->o_t_len = M_TRAILINGROOM(smb); 18241 } else { 18242 /* 18243 * This is the case where the next mbuf went to NULL. This 18244 * means with this copy we have sent everything in the sb. 18245 * In theory we could clear the fast_output flag, but lets 18246 * not since it's possible that we could get more added 18247 * and acks that call the extend function which would let 18248 * us send more.
18249 */ 18250 fsb->o_m_len = 0; 18251 fsb->o_t_len = 0; 18252 } 18253 } 18254 return (top); 18255 nospace: 18256 if (top) 18257 m_freem(top); 18258 return (NULL); 18259 18260 } 18261 18262 /* 18263 * This is a copy of m_copym(), taking the TSO segment size/limit 18264 * constraints into account, and advancing the sndptr as it goes. 18265 */ 18266 static struct mbuf * 18267 rack_fo_m_copym(struct tcp_rack *rack, int32_t *plen, 18268 int32_t seglimit, int32_t segsize, struct mbuf **s_mb, int *s_soff) 18269 { 18270 struct mbuf *m, *n; 18271 int32_t soff; 18272 18273 m = rack->r_ctl.fsb.m; 18274 if (M_TRAILINGROOM(m) != rack->r_ctl.fsb.o_t_len) { 18275 /* 18276 * The trailing space changed, mbufs can grow 18277 * at the tail but they can't shrink from 18278 * it, KASSERT that. Adjust the orig_m_len to 18279 * compensate for this change. 18280 */ 18281 KASSERT((rack->r_ctl.fsb.o_t_len > M_TRAILINGROOM(m)), 18282 ("mbuf:%p rack:%p trailing_space:%jd ots:%u oml:%u mlen:%u\n", 18283 m, 18284 rack, 18285 (intmax_t)M_TRAILINGROOM(m), 18286 rack->r_ctl.fsb.o_t_len, 18287 rack->r_ctl.fsb.o_m_len, 18288 m->m_len)); 18289 rack->r_ctl.fsb.o_m_len += (rack->r_ctl.fsb.o_t_len - M_TRAILINGROOM(m)); 18290 rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(m); 18291 } 18292 if (m->m_len < rack->r_ctl.fsb.o_m_len) { 18293 /* 18294 * Mbuf shrank, trimmed off the top by an ack, our 18295 * offset changes. 18296 */ 18297 KASSERT((rack->r_ctl.fsb.off >= (rack->r_ctl.fsb.o_m_len - m->m_len)), 18298 ("mbuf:%p len:%u rack:%p oml:%u soff:%u\n", 18299 m, m->m_len, 18300 rack, rack->r_ctl.fsb.o_m_len, 18301 rack->r_ctl.fsb.off)); 18302 18303 if (rack->r_ctl.fsb.off >= (rack->r_ctl.fsb.o_m_len- m->m_len)) 18304 rack->r_ctl.fsb.off -= (rack->r_ctl.fsb.o_m_len - m->m_len); 18305 else 18306 rack->r_ctl.fsb.off = 0; 18307 rack->r_ctl.fsb.o_m_len = m->m_len; 18308 #ifdef INVARIANTS 18309 } else if (m->m_len > rack->r_ctl.fsb.o_m_len) { 18310 panic("rack:%p m:%p m_len grew outside of t_space compensation", 18311 rack, m); 18312 #endif 18313 } 18314 soff = rack->r_ctl.fsb.off; 18315 KASSERT(soff >= 0, ("%s, negative off %d", __FUNCTION__, soff)); 18316 KASSERT(*plen >= 0, ("%s, negative len %d", __FUNCTION__, *plen)); 18317 KASSERT(soff < m->m_len, ("%s rack:%p len:%u m:%p m->m_len:%u < off?", 18318 __FUNCTION__, 18319 rack, *plen, m, m->m_len)); 18320 /* Save off the right location before we copy and advance */ 18321 *s_soff = soff; 18322 *s_mb = rack->r_ctl.fsb.m; 18323 n = rack_fo_base_copym(m, soff, plen, 18324 &rack->r_ctl.fsb, 18325 seglimit, segsize, rack->r_ctl.fsb.hw_tls); 18326 return (n); 18327 } 18328 18329 /* Log the buffer level */ 18330 static void 18331 rack_log_queue_level(struct tcpcb *tp, struct tcp_rack *rack, 18332 int len, struct timeval *tv, 18333 uint32_t cts) 18334 { 18335 uint32_t p_rate = 0, p_queue = 0, err = 0; 18336 union tcp_log_stackspecific log; 18337 18338 #ifdef RATELIMIT 18339 err = in_pcbquery_txrlevel(rack->rc_inp, &p_queue); 18340 err = in_pcbquery_txrtlmt(rack->rc_inp, &p_rate); 18341 #endif 18342 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 18343 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 18344 log.u_bbr.flex1 = p_rate; 18345 log.u_bbr.flex2 = p_queue; 18346 log.u_bbr.flex4 = (uint32_t)rack->r_ctl.crte->using; 18347 log.u_bbr.flex5 = (uint32_t)rack->r_ctl.crte->rs_num_enobufs; 18348 log.u_bbr.flex6 = rack->r_ctl.crte->time_between; 18349 log.u_bbr.flex7 = 99; 18350 log.u_bbr.flex8 = 0; 18351 log.u_bbr.pkts_out = err; 18352 log.u_bbr.delRate = rack->r_ctl.crte->rate; 18353 log.u_bbr.timeStamp = 
cts;
    log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
    tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_HDWR_PACE, 0,
        len, &log, false, NULL, __func__, __LINE__, tv);

}

static uint32_t
rack_check_queue_level(struct tcp_rack *rack, struct tcpcb *tp,
    struct timeval *tv, uint32_t cts, int len, uint32_t segsiz)
{
    uint64_t lentime = 0;
#ifdef RATELIMIT
    uint32_t p_rate = 0, p_queue = 0, err;
    union tcp_log_stackspecific log;
    uint64_t bw;

    err = in_pcbquery_txrlevel(rack->rc_inp, &p_queue);
    /* Failed or queue is zero */
    if (err || (p_queue == 0)) {
        lentime = 0;
        goto out;
    }
    err = in_pcbquery_txrtlmt(rack->rc_inp, &p_rate);
    if (err) {
        lentime = 0;
        goto out;
    }
    /*
     * If we reach here we have some bytes in
     * the queue. The number returned is a value
     * between 0 and 0xffff where 0xffff is full
     * and 0 is empty. So how best to make this into
     * something usable?
     *
     * The "safer" way is to take the b/w gotten
     * from the query (which should be our b/w rate)
     * and pretend that a full send (our rc_pace_max_segs)
     * is outstanding. We factor it so it is as if a full
     * count of our MSS-sized segments, in terms of full
     * ethernet segments, were outstanding.
     */
    bw = p_rate / 8;
    if (bw) {
        lentime = (rack->r_ctl.rc_pace_max_segs / segsiz);
        lentime *= ETHERNET_SEGMENT_SIZE;
        lentime *= (uint64_t)HPTS_USEC_IN_SEC;
        lentime /= bw;
    } else {
        /* TSNH -- KASSERT? */
        lentime = 0;
    }
out:
    if (tcp_bblogging_on(tp)) {
        memset(&log, 0, sizeof(log));
        log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp);
        log.u_bbr.flex1 = p_rate;
        log.u_bbr.flex2 = p_queue;
        log.u_bbr.flex4 = (uint32_t)rack->r_ctl.crte->using;
        log.u_bbr.flex5 = (uint32_t)rack->r_ctl.crte->rs_num_enobufs;
        log.u_bbr.flex6 = rack->r_ctl.crte->time_between;
        log.u_bbr.flex7 = 99;
        log.u_bbr.flex8 = 0;
        log.u_bbr.pkts_out = err;
        log.u_bbr.delRate = rack->r_ctl.crte->rate;
        log.u_bbr.cur_del_rate = lentime;
        log.u_bbr.timeStamp = cts;
        log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
        tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_HDWR_PACE, 0,
            len, &log, false, NULL, __func__, __LINE__, tv);
    }
#endif
    return ((uint32_t)lentime);
}

static int
rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm,
    uint64_t ts_val, uint32_t cts, uint32_t ms_cts, struct timeval *tv, int len, uint8_t doing_tlp)
{
    /*
     * Enter the fast retransmit path. We are given that a sched_pin is
     * in place (if accounting is compiled in) and the cycle count taken
     * at entry is in ts_val. The concept here is that the rsm
     * now holds the mbuf offsets and such, so we can directly transmit
     * without a lot of overhead; the len field is already set for
     * us to prohibit us from sending too much (usually it's 1 MSS).
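     * On any failure we simply return -1 so the caller can fall back
     * to the regular rack_output() path for this transmission.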
18439 */ 18440 struct ip *ip = NULL; 18441 struct udphdr *udp = NULL; 18442 struct tcphdr *th = NULL; 18443 struct mbuf *m = NULL; 18444 struct inpcb *inp; 18445 uint8_t *cpto; 18446 struct tcp_log_buffer *lgb; 18447 #ifdef TCP_ACCOUNTING 18448 uint64_t crtsc; 18449 int cnt_thru = 1; 18450 #endif 18451 struct tcpopt to; 18452 u_char opt[TCP_MAXOLEN]; 18453 uint32_t hdrlen, optlen; 18454 int32_t slot, segsiz, max_val, tso = 0, error = 0, ulen = 0; 18455 uint16_t flags; 18456 uint32_t if_hw_tsomaxsegcount = 0, startseq; 18457 uint32_t if_hw_tsomaxsegsize; 18458 int32_t ip_sendflag = IP_NO_SND_TAG_RL; 18459 18460 #ifdef INET6 18461 struct ip6_hdr *ip6 = NULL; 18462 18463 if (rack->r_is_v6) { 18464 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 18465 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 18466 } else 18467 #endif /* INET6 */ 18468 { 18469 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 18470 hdrlen = sizeof(struct tcpiphdr); 18471 } 18472 if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) { 18473 goto failed; 18474 } 18475 if (doing_tlp) { 18476 /* Its a TLP add the flag, it may already be there but be sure */ 18477 rsm->r_flags |= RACK_TLP; 18478 } else { 18479 /* If it was a TLP it is not not on this retransmit */ 18480 rsm->r_flags &= ~RACK_TLP; 18481 } 18482 startseq = rsm->r_start; 18483 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 18484 inp = rack->rc_inp; 18485 to.to_flags = 0; 18486 flags = tcp_outflags[tp->t_state]; 18487 if (flags & (TH_SYN|TH_RST)) { 18488 goto failed; 18489 } 18490 if (rsm->r_flags & RACK_HAS_FIN) { 18491 /* We can't send a FIN here */ 18492 goto failed; 18493 } 18494 if (flags & TH_FIN) { 18495 /* We never send a FIN */ 18496 flags &= ~TH_FIN; 18497 } 18498 if (tp->t_flags & TF_RCVD_TSTMP) { 18499 to.to_tsval = ms_cts + tp->ts_offset; 18500 to.to_tsecr = tp->ts_recent; 18501 to.to_flags = TOF_TS; 18502 } 18503 optlen = tcp_addoptions(&to, opt); 18504 hdrlen += optlen; 18505 udp = rack->r_ctl.fsb.udp; 18506 if (udp) 18507 hdrlen += sizeof(struct udphdr); 18508 if (rack->r_ctl.rc_pace_max_segs) 18509 max_val = rack->r_ctl.rc_pace_max_segs; 18510 else if (rack->rc_user_set_max_segs) 18511 max_val = rack->rc_user_set_max_segs * segsiz; 18512 else 18513 max_val = len; 18514 if ((tp->t_flags & TF_TSO) && 18515 V_tcp_do_tso && 18516 (len > segsiz) && 18517 (tp->t_port == 0)) 18518 tso = 1; 18519 #ifdef INET6 18520 if (MHLEN < hdrlen + max_linkhdr) 18521 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 18522 else 18523 #endif 18524 m = m_gethdr(M_NOWAIT, MT_DATA); 18525 if (m == NULL) 18526 goto failed; 18527 m->m_data += max_linkhdr; 18528 m->m_len = hdrlen; 18529 th = rack->r_ctl.fsb.th; 18530 /* Establish the len to send */ 18531 if (len > max_val) 18532 len = max_val; 18533 if ((tso) && (len + optlen > segsiz)) { 18534 uint32_t if_hw_tsomax; 18535 int32_t max_len; 18536 18537 /* extract TSO information */ 18538 if_hw_tsomax = tp->t_tsomax; 18539 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 18540 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 18541 /* 18542 * Check if we should limit by maximum payload 18543 * length: 18544 */ 18545 if (if_hw_tsomax != 0) { 18546 /* compute maximum TSO length */ 18547 max_len = (if_hw_tsomax - hdrlen - 18548 max_linkhdr); 18549 if (max_len <= 0) { 18550 goto failed; 18551 } else if (len > max_len) { 18552 len = max_len; 18553 } 18554 } 18555 if (len <= segsiz) { 18556 /* 18557 * In case there are too many small fragments don't 18558 * use TSO: 18559 */ 18560 tso = 0; 18561 } 18562 } else { 18563 tso = 0; 
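        /* Either TSO is disabled or len + optlen fits in a single segment. */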
    }
    if ((tso == 0) && (len > segsiz))
        len = segsiz;
    (void)tcp_get_usecs(tv);
    if ((len == 0) ||
        (len <= MHLEN - hdrlen - max_linkhdr)) {
        goto failed;
    }
    th->th_seq = htonl(rsm->r_start);
    th->th_ack = htonl(tp->rcv_nxt);
    /*
     * The PUSH bit should only be applied
     * if the full retransmission is made. If
     * we are sending less than that, this is the
     * left hand edge and it should not have
     * the PUSH bit.
     */
    if ((rsm->r_flags & RACK_HAD_PUSH) &&
        (len == (rsm->r_end - rsm->r_start)))
        flags |= TH_PUSH;
    th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale));
    if (th->th_win == 0) {
        tp->t_sndzerowin++;
        tp->t_flags |= TF_RXWIN0SENT;
    } else
        tp->t_flags &= ~TF_RXWIN0SENT;
    if (rsm->r_flags & RACK_TLP) {
        /*
         * TLP should not count in the retran count, but
         * in its own bin.
         */
        counter_u64_add(rack_tlp_retran, 1);
        counter_u64_add(rack_tlp_retran_bytes, len);
    } else {
        tp->t_sndrexmitpack++;
        KMOD_TCPSTAT_INC(tcps_sndrexmitpack);
        KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len);
    }
#ifdef STATS
    stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
        len);
#endif
    if (rsm->m == NULL)
        goto failed;
    if (rsm->m &&
        ((rsm->orig_m_len != rsm->m->m_len) ||
         (M_TRAILINGROOM(rsm->m) != rsm->orig_t_space))) {
        /* Fix up the orig_m_len and possibly the mbuf offset */
        rack_adjust_orig_mlen(rsm);
    }
    m->m_next = rack_fo_base_copym(rsm->m, rsm->soff, &len, NULL, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, rsm->r_hw_tls);
    if (len <= segsiz) {
        /*
         * We must have run out of mbufs for the copy;
         * shorten it so we no longer need TSO. Let's
         * not turn on sendalot since we are low on
         * mbufs.
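         * (rack_fo_base_copym may also have trimmed len to honor the
         * TSO segment limit, which has the same effect here.)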
18621 */ 18622 tso = 0; 18623 } 18624 if ((m->m_next == NULL) || (len <= 0)){ 18625 goto failed; 18626 } 18627 if (udp) { 18628 if (rack->r_is_v6) 18629 ulen = hdrlen + len - sizeof(struct ip6_hdr); 18630 else 18631 ulen = hdrlen + len - sizeof(struct ip); 18632 udp->uh_ulen = htons(ulen); 18633 } 18634 m->m_pkthdr.rcvif = (struct ifnet *)0; 18635 if (TCPS_HAVERCVDSYN(tp->t_state) && 18636 (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { 18637 int ect = tcp_ecn_output_established(tp, &flags, len, true); 18638 if ((tp->t_state == TCPS_SYN_RECEIVED) && 18639 (tp->t_flags2 & TF2_ECN_SND_ECE)) 18640 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 18641 #ifdef INET6 18642 if (rack->r_is_v6) { 18643 ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20); 18644 ip6->ip6_flow |= htonl(ect << 20); 18645 } 18646 else 18647 #endif 18648 { 18649 ip->ip_tos &= ~IPTOS_ECN_MASK; 18650 ip->ip_tos |= ect; 18651 } 18652 } 18653 if (rack->r_ctl.crte != NULL) { 18654 /* See if we can send via the hw queue */ 18655 slot = rack_check_queue_level(rack, tp, tv, cts, len, segsiz); 18656 /* If there is nothing in queue (no pacing time) we can send via the hw queue */ 18657 if (slot == 0) 18658 ip_sendflag = 0; 18659 } 18660 tcp_set_flags(th, flags); 18661 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 18662 #ifdef INET6 18663 if (rack->r_is_v6) { 18664 if (tp->t_port) { 18665 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 18666 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 18667 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 18668 th->th_sum = htons(0); 18669 UDPSTAT_INC(udps_opackets); 18670 } else { 18671 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 18672 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 18673 th->th_sum = in6_cksum_pseudo(ip6, 18674 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 18675 0); 18676 } 18677 } 18678 #endif 18679 #if defined(INET6) && defined(INET) 18680 else 18681 #endif 18682 #ifdef INET 18683 { 18684 if (tp->t_port) { 18685 m->m_pkthdr.csum_flags = CSUM_UDP; 18686 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 18687 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 18688 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 18689 th->th_sum = htons(0); 18690 UDPSTAT_INC(udps_opackets); 18691 } else { 18692 m->m_pkthdr.csum_flags = CSUM_TCP; 18693 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 18694 th->th_sum = in_pseudo(ip->ip_src.s_addr, 18695 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 18696 IPPROTO_TCP + len + optlen)); 18697 } 18698 /* IP version must be set here for ipv4/ipv6 checking later */ 18699 KASSERT(ip->ip_v == IPVERSION, 18700 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 18701 } 18702 #endif 18703 if (tso) { 18704 /* 18705 * Here we use segsiz since we have no added options besides 18706 * any standard timestamp options (no DSACKs or SACKS are sent 18707 * via either fast-path). 
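         * The KASSERT below enforces that TSO is only left on when more
         * than one segment's worth of payload is being sent.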
18708 */ 18709 KASSERT(len > segsiz, 18710 ("%s: len <= tso_segsz tp:%p", __func__, tp)); 18711 m->m_pkthdr.csum_flags |= CSUM_TSO; 18712 m->m_pkthdr.tso_segsz = segsiz; 18713 } 18714 #ifdef INET6 18715 if (rack->r_is_v6) { 18716 ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit; 18717 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 18718 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 18719 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 18720 else 18721 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 18722 } 18723 #endif 18724 #if defined(INET) && defined(INET6) 18725 else 18726 #endif 18727 #ifdef INET 18728 { 18729 ip->ip_len = htons(m->m_pkthdr.len); 18730 ip->ip_ttl = rack->r_ctl.fsb.hoplimit; 18731 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 18732 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 18733 if (tp->t_port == 0 || len < V_tcp_minmss) { 18734 ip->ip_off |= htons(IP_DF); 18735 } 18736 } else { 18737 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 18738 } 18739 } 18740 #endif 18741 if (doing_tlp == 0) { 18742 /* Set we retransmitted */ 18743 rack->rc_gp_saw_rec = 1; 18744 } else { 18745 /* Its a TLP set ca or ss */ 18746 if (tp->snd_cwnd > tp->snd_ssthresh) { 18747 /* Set we sent in CA */ 18748 rack->rc_gp_saw_ca = 1; 18749 } else { 18750 /* Set we sent in SS */ 18751 rack->rc_gp_saw_ss = 1; 18752 } 18753 } 18754 /* Time to copy in our header */ 18755 cpto = mtod(m, uint8_t *); 18756 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); 18757 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr)); 18758 if (optlen) { 18759 bcopy(opt, th + 1, optlen); 18760 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 18761 } else { 18762 th->th_off = sizeof(struct tcphdr) >> 2; 18763 } 18764 if (tcp_bblogging_on(rack->rc_tp)) { 18765 union tcp_log_stackspecific log; 18766 18767 if (rsm->r_flags & RACK_RWND_COLLAPSED) { 18768 rack_log_collapse(rack, rsm->r_start, rsm->r_end, 0, __LINE__, 5, rsm->r_flags, rsm); 18769 counter_u64_add(rack_collapsed_win_rxt, 1); 18770 counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start)); 18771 } 18772 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 18773 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 18774 if (rack->rack_no_prr) 18775 log.u_bbr.flex1 = 0; 18776 else 18777 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 18778 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 18779 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 18780 log.u_bbr.flex4 = max_val; 18781 /* Save off the early/late values */ 18782 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 18783 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 18784 log.u_bbr.bw_inuse = rack_get_bw(rack); 18785 log.u_bbr.cur_del_rate = rack->r_ctl.gp_bw; 18786 if (doing_tlp == 0) 18787 log.u_bbr.flex8 = 1; 18788 else 18789 log.u_bbr.flex8 = 2; 18790 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); 18791 log.u_bbr.flex7 = 55; 18792 log.u_bbr.pkts_out = tp->t_maxseg; 18793 log.u_bbr.timeStamp = cts; 18794 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 18795 if (rsm && (rsm->r_rtr_cnt > 0)) { 18796 /* 18797 * When we have a retransmit we want to log the 18798 * burst at send and flight at send from before. 18799 */ 18800 log.u_bbr.flex5 = rsm->r_fas; 18801 log.u_bbr.bbr_substate = rsm->r_bas; 18802 } else { 18803 /* 18804 * This is currently unlikely until we do the 18805 * packet pair probes but I will add it for completeness. 
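         * In that case flex5 falls back to the current inflight and
         * bbr_substate to the number of MSS-sized segments in this send.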
18806 */ 18807 log.u_bbr.flex5 = log.u_bbr.inflight; 18808 log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz); 18809 } 18810 log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use; 18811 log.u_bbr.delivered = 0; 18812 log.u_bbr.rttProp = (uint64_t)rsm; 18813 log.u_bbr.delRate = rsm->r_flags; 18814 log.u_bbr.delRate <<= 31; 18815 log.u_bbr.delRate |= rack->r_must_retran; 18816 log.u_bbr.delRate <<= 1; 18817 log.u_bbr.delRate |= 1; 18818 log.u_bbr.pkt_epoch = __LINE__; 18819 lgb = tcp_log_event(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, 18820 len, &log, false, NULL, __func__, __LINE__, tv); 18821 } else 18822 lgb = NULL; 18823 if ((rack->r_ctl.crte != NULL) && 18824 tcp_bblogging_on(tp)) { 18825 rack_log_queue_level(tp, rack, len, tv, cts); 18826 } 18827 #ifdef INET6 18828 if (rack->r_is_v6) { 18829 error = ip6_output(m, NULL, 18830 &inp->inp_route6, 18831 ip_sendflag, NULL, NULL, inp); 18832 } 18833 else 18834 #endif 18835 #ifdef INET 18836 { 18837 error = ip_output(m, NULL, 18838 &inp->inp_route, 18839 ip_sendflag, 0, inp); 18840 } 18841 #endif 18842 m = NULL; 18843 if (lgb) { 18844 lgb->tlb_errno = error; 18845 lgb = NULL; 18846 } 18847 if (error) { 18848 goto failed; 18849 } else if (rack->rc_hw_nobuf && (ip_sendflag != IP_NO_SND_TAG_RL)) { 18850 rack->rc_hw_nobuf = 0; 18851 rack->r_ctl.rc_agg_delayed = 0; 18852 rack->r_early = 0; 18853 rack->r_late = 0; 18854 rack->r_ctl.rc_agg_early = 0; 18855 } 18856 18857 rack_log_output(tp, &to, len, rsm->r_start, flags, error, rack_to_usec_ts(tv), 18858 rsm, RACK_SENT_FP, rsm->m, rsm->soff, rsm->r_hw_tls, segsiz); 18859 if (doing_tlp) { 18860 rack->rc_tlp_in_progress = 1; 18861 rack->r_ctl.rc_tlp_cnt_out++; 18862 } 18863 if (error == 0) { 18864 counter_u64_add(rack_total_bytes, len); 18865 tcp_account_for_send(tp, len, 1, doing_tlp, rsm->r_hw_tls); 18866 if (doing_tlp) { 18867 rack->rc_last_sent_tlp_past_cumack = 0; 18868 rack->rc_last_sent_tlp_seq_valid = 1; 18869 rack->r_ctl.last_sent_tlp_seq = rsm->r_start; 18870 rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start; 18871 } 18872 } 18873 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 18874 rack->forced_ack = 0; /* If we send something zap the FA flag */ 18875 if (IN_FASTRECOVERY(tp->t_flags) && rsm) 18876 rack->r_ctl.retran_during_recovery += len; 18877 { 18878 int idx; 18879 18880 idx = (len / segsiz) + 3; 18881 if (idx >= TCP_MSS_ACCT_ATIMER) 18882 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 18883 else 18884 counter_u64_add(rack_out_size[idx], 1); 18885 } 18886 if (tp->t_rtttime == 0) { 18887 tp->t_rtttime = ticks; 18888 tp->t_rtseq = startseq; 18889 KMOD_TCPSTAT_INC(tcps_segstimed); 18890 } 18891 counter_u64_add(rack_fto_rsm_send, 1); 18892 if (error && (error == ENOBUFS)) { 18893 if (rack->r_ctl.crte != NULL) { 18894 tcp_trace_point(rack->rc_tp, TCP_TP_HWENOBUF); 18895 if (tcp_bblogging_on(rack->rc_tp)) 18896 rack_log_queue_level(tp, rack, len, tv, cts); 18897 } else 18898 tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF); 18899 slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); 18900 if (rack->rc_enobuf < 0x7f) 18901 rack->rc_enobuf++; 18902 if (slot < (10 * HPTS_USEC_IN_MSEC)) 18903 slot = 10 * HPTS_USEC_IN_MSEC; 18904 if (rack->r_ctl.crte != NULL) { 18905 counter_u64_add(rack_saw_enobuf_hw, 1); 18906 tcp_rl_log_enobuf(rack->r_ctl.crte); 18907 } 18908 counter_u64_add(rack_saw_enobuf, 1); 18909 } else 18910 slot = rack_get_pacing_delay(rack, tp, len, NULL, segsiz); 18911 if ((slot == 0) || 18912 (rack->rc_always_pace == 0) || 18913 (rack->r_rr_config == 1)) { 18914 /* 18915 * We 
have no pacing set or we 18916 * are using old-style rack or 18917 * we are overridden to use the old 1ms pacing. 18918 */ 18919 slot = rack->r_ctl.rc_min_to; 18920 } 18921 rack_start_hpts_timer(rack, tp, cts, slot, len, 0); 18922 #ifdef TCP_ACCOUNTING 18923 crtsc = get_cyclecount(); 18924 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18925 tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru; 18926 } 18927 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18928 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); 18929 } 18930 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18931 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((len + segsiz - 1) / segsiz); 18932 } 18933 sched_unpin(); 18934 #endif 18935 return (0); 18936 failed: 18937 if (m) 18938 m_free(m); 18939 return (-1); 18940 } 18941 18942 static void 18943 rack_sndbuf_autoscale(struct tcp_rack *rack) 18944 { 18945 /* 18946 * Automatic sizing of send socket buffer. Often the send buffer 18947 * size is not optimally adjusted to the actual network conditions 18948 * at hand (delay bandwidth product). Setting the buffer size too 18949 * small limits throughput on links with high bandwidth and high 18950 * delay (eg. trans-continental/oceanic links). Setting the 18951 * buffer size too big consumes too much real kernel memory, 18952 * especially with many connections on busy servers. 18953 * 18954 * The criteria to step up the send buffer one notch are: 18955 * 1. receive window of remote host is larger than send buffer 18956 * (with a fudge factor of 5/4th); 18957 * 2. send buffer is filled to 7/8th with data (so we actually 18958 * have data to make use of it); 18959 * 3. send buffer fill has not hit maximal automatic size; 18960 * 4. our send window (slow start and cogestion controlled) is 18961 * larger than sent but unacknowledged data in send buffer. 18962 * 18963 * Note that the rack version moves things much faster since 18964 * we want to avoid hitting cache lines in the rack_fast_output() 18965 * path so this is called much less often and thus moves 18966 * the SB forward by a percentage. 18967 */ 18968 struct socket *so; 18969 struct tcpcb *tp; 18970 uint32_t sendwin, scaleup; 18971 18972 tp = rack->rc_tp; 18973 so = rack->rc_inp->inp_socket; 18974 sendwin = min(rack->r_ctl.cwnd_to_use, tp->snd_wnd); 18975 if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { 18976 if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat && 18977 sbused(&so->so_snd) >= 18978 (so->so_snd.sb_hiwat / 8 * 7) && 18979 sbused(&so->so_snd) < V_tcp_autosndbuf_max && 18980 sendwin >= (sbused(&so->so_snd) - 18981 (tp->snd_nxt - tp->snd_una))) { 18982 if (rack_autosndbuf_inc) 18983 scaleup = (rack_autosndbuf_inc * so->so_snd.sb_hiwat) / 100; 18984 else 18985 scaleup = V_tcp_autosndbuf_inc; 18986 if (scaleup < V_tcp_autosndbuf_inc) 18987 scaleup = V_tcp_autosndbuf_inc; 18988 scaleup += so->so_snd.sb_hiwat; 18989 if (scaleup > V_tcp_autosndbuf_max) 18990 scaleup = V_tcp_autosndbuf_max; 18991 if (!sbreserve_locked(so, SO_SND, scaleup, curthread)) 18992 so->so_snd.sb_flags &= ~SB_AUTOSIZE; 18993 } 18994 } 18995 } 18996 18997 static int 18998 rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val, 18999 uint32_t cts, uint32_t ms_cts, struct timeval *tv, long tot_len, int *send_err) 19000 { 19001 /* 19002 * Enter to do fast output. We are given that the sched_pin is 19003 * in place (if accounting is compiled in) and the cycle count taken 19004 * at entry is in place in ts_val. 
The idea here is that 19005 * we know how many more bytes needs to be sent (presumably either 19006 * during pacing or to fill the cwnd and that was greater than 19007 * the max-burst). We have how much to send and all the info we 19008 * need to just send. 19009 */ 19010 #ifdef INET 19011 struct ip *ip = NULL; 19012 #endif 19013 struct udphdr *udp = NULL; 19014 struct tcphdr *th = NULL; 19015 struct mbuf *m, *s_mb; 19016 struct inpcb *inp; 19017 uint8_t *cpto; 19018 struct tcp_log_buffer *lgb; 19019 #ifdef TCP_ACCOUNTING 19020 uint64_t crtsc; 19021 #endif 19022 struct tcpopt to; 19023 u_char opt[TCP_MAXOLEN]; 19024 uint32_t hdrlen, optlen; 19025 #ifdef TCP_ACCOUNTING 19026 int cnt_thru = 1; 19027 #endif 19028 int32_t slot, segsiz, len, max_val, tso = 0, sb_offset, error, ulen = 0; 19029 uint16_t flags; 19030 uint32_t s_soff; 19031 uint32_t if_hw_tsomaxsegcount = 0, startseq; 19032 uint32_t if_hw_tsomaxsegsize; 19033 uint16_t add_flag = RACK_SENT_FP; 19034 #ifdef INET6 19035 struct ip6_hdr *ip6 = NULL; 19036 19037 if (rack->r_is_v6) { 19038 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 19039 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 19040 } else 19041 #endif /* INET6 */ 19042 { 19043 #ifdef INET 19044 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 19045 hdrlen = sizeof(struct tcpiphdr); 19046 #endif 19047 } 19048 if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) { 19049 m = NULL; 19050 goto failed; 19051 } 19052 startseq = tp->snd_max; 19053 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 19054 inp = rack->rc_inp; 19055 len = rack->r_ctl.fsb.left_to_send; 19056 to.to_flags = 0; 19057 flags = rack->r_ctl.fsb.tcp_flags; 19058 if (tp->t_flags & TF_RCVD_TSTMP) { 19059 to.to_tsval = ms_cts + tp->ts_offset; 19060 to.to_tsecr = tp->ts_recent; 19061 to.to_flags = TOF_TS; 19062 } 19063 optlen = tcp_addoptions(&to, opt); 19064 hdrlen += optlen; 19065 udp = rack->r_ctl.fsb.udp; 19066 if (udp) 19067 hdrlen += sizeof(struct udphdr); 19068 if (rack->r_ctl.rc_pace_max_segs) 19069 max_val = rack->r_ctl.rc_pace_max_segs; 19070 else if (rack->rc_user_set_max_segs) 19071 max_val = rack->rc_user_set_max_segs * segsiz; 19072 else 19073 max_val = len; 19074 if ((tp->t_flags & TF_TSO) && 19075 V_tcp_do_tso && 19076 (len > segsiz) && 19077 (tp->t_port == 0)) 19078 tso = 1; 19079 again: 19080 #ifdef INET6 19081 if (MHLEN < hdrlen + max_linkhdr) 19082 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 19083 else 19084 #endif 19085 m = m_gethdr(M_NOWAIT, MT_DATA); 19086 if (m == NULL) 19087 goto failed; 19088 m->m_data += max_linkhdr; 19089 m->m_len = hdrlen; 19090 th = rack->r_ctl.fsb.th; 19091 /* Establish the len to send */ 19092 if (len > max_val) 19093 len = max_val; 19094 if ((tso) && (len + optlen > segsiz)) { 19095 uint32_t if_hw_tsomax; 19096 int32_t max_len; 19097 19098 /* extract TSO information */ 19099 if_hw_tsomax = tp->t_tsomax; 19100 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 19101 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 19102 /* 19103 * Check if we should limit by maximum payload 19104 * length: 19105 */ 19106 if (if_hw_tsomax != 0) { 19107 /* compute maximum TSO length */ 19108 max_len = (if_hw_tsomax - hdrlen - 19109 max_linkhdr); 19110 if (max_len <= 0) { 19111 goto failed; 19112 } else if (len > max_len) { 19113 len = max_len; 19114 } 19115 } 19116 if (len <= segsiz) { 19117 /* 19118 * In case there are too many small fragments don't 19119 * use TSO: 19120 */ 19121 tso = 0; 19122 } 19123 } else { 19124 tso = 0; 19125 } 19126 if ((tso == 0) && (len > segsiz)) 
19127 len = segsiz; 19128 (void)tcp_get_usecs(tv); 19129 if ((len == 0) || 19130 (len <= MHLEN - hdrlen - max_linkhdr)) { 19131 goto failed; 19132 } 19133 sb_offset = tp->snd_max - tp->snd_una; 19134 th->th_seq = htonl(tp->snd_max); 19135 th->th_ack = htonl(tp->rcv_nxt); 19136 th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale)); 19137 if (th->th_win == 0) { 19138 tp->t_sndzerowin++; 19139 tp->t_flags |= TF_RXWIN0SENT; 19140 } else 19141 tp->t_flags &= ~TF_RXWIN0SENT; 19142 tp->snd_up = tp->snd_una; /* drag it along, its deprecated */ 19143 KMOD_TCPSTAT_INC(tcps_sndpack); 19144 KMOD_TCPSTAT_ADD(tcps_sndbyte, len); 19145 #ifdef STATS 19146 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 19147 len); 19148 #endif 19149 if (rack->r_ctl.fsb.m == NULL) 19150 goto failed; 19151 19152 /* s_mb and s_soff are saved for rack_log_output */ 19153 m->m_next = rack_fo_m_copym(rack, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, 19154 &s_mb, &s_soff); 19155 if (len <= segsiz) { 19156 /* 19157 * Must have ran out of mbufs for the copy 19158 * shorten it to no longer need tso. Lets 19159 * not put on sendalot since we are low on 19160 * mbufs. 19161 */ 19162 tso = 0; 19163 } 19164 if (rack->r_ctl.fsb.rfo_apply_push && 19165 (len == rack->r_ctl.fsb.left_to_send)) { 19166 tcp_set_flags(th, flags | TH_PUSH); 19167 add_flag |= RACK_HAD_PUSH; 19168 } 19169 if ((m->m_next == NULL) || (len <= 0)){ 19170 goto failed; 19171 } 19172 if (udp) { 19173 if (rack->r_is_v6) 19174 ulen = hdrlen + len - sizeof(struct ip6_hdr); 19175 else 19176 ulen = hdrlen + len - sizeof(struct ip); 19177 udp->uh_ulen = htons(ulen); 19178 } 19179 m->m_pkthdr.rcvif = (struct ifnet *)0; 19180 if (TCPS_HAVERCVDSYN(tp->t_state) && 19181 (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { 19182 int ect = tcp_ecn_output_established(tp, &flags, len, false); 19183 if ((tp->t_state == TCPS_SYN_RECEIVED) && 19184 (tp->t_flags2 & TF2_ECN_SND_ECE)) 19185 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 19186 #ifdef INET6 19187 if (rack->r_is_v6) { 19188 ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20); 19189 ip6->ip6_flow |= htonl(ect << 20); 19190 } 19191 else 19192 #endif 19193 { 19194 #ifdef INET 19195 ip->ip_tos &= ~IPTOS_ECN_MASK; 19196 ip->ip_tos |= ect; 19197 #endif 19198 } 19199 } 19200 tcp_set_flags(th, flags); 19201 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 19202 #ifdef INET6 19203 if (rack->r_is_v6) { 19204 if (tp->t_port) { 19205 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 19206 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 19207 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 19208 th->th_sum = htons(0); 19209 UDPSTAT_INC(udps_opackets); 19210 } else { 19211 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 19212 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 19213 th->th_sum = in6_cksum_pseudo(ip6, 19214 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 19215 0); 19216 } 19217 } 19218 #endif 19219 #if defined(INET6) && defined(INET) 19220 else 19221 #endif 19222 #ifdef INET 19223 { 19224 if (tp->t_port) { 19225 m->m_pkthdr.csum_flags = CSUM_UDP; 19226 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 19227 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 19228 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 19229 th->th_sum = htons(0); 19230 UDPSTAT_INC(udps_opackets); 19231 } else { 19232 m->m_pkthdr.csum_flags = CSUM_TCP; 19233 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 19234 th->th_sum = in_pseudo(ip->ip_src.s_addr, 19235 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 19236 
IPPROTO_TCP + len + optlen)); 19237 } 19238 /* IP version must be set here for ipv4/ipv6 checking later */ 19239 KASSERT(ip->ip_v == IPVERSION, 19240 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 19241 } 19242 #endif 19243 if (tso) { 19244 /* 19245 * Here we use segsiz since we have no added options besides 19246 * any standard timestamp options (no DSACKs or SACKS are sent 19247 * via either fast-path). 19248 */ 19249 KASSERT(len > segsiz, 19250 ("%s: len <= tso_segsz tp:%p", __func__, tp)); 19251 m->m_pkthdr.csum_flags |= CSUM_TSO; 19252 m->m_pkthdr.tso_segsz = segsiz; 19253 } 19254 #ifdef INET6 19255 if (rack->r_is_v6) { 19256 ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit; 19257 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 19258 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 19259 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 19260 else 19261 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 19262 } 19263 #endif 19264 #if defined(INET) && defined(INET6) 19265 else 19266 #endif 19267 #ifdef INET 19268 { 19269 ip->ip_len = htons(m->m_pkthdr.len); 19270 ip->ip_ttl = rack->r_ctl.fsb.hoplimit; 19271 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 19272 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 19273 if (tp->t_port == 0 || len < V_tcp_minmss) { 19274 ip->ip_off |= htons(IP_DF); 19275 } 19276 } else { 19277 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 19278 } 19279 } 19280 #endif 19281 if (tp->snd_cwnd > tp->snd_ssthresh) { 19282 /* Set we sent in CA */ 19283 rack->rc_gp_saw_ca = 1; 19284 } else { 19285 /* Set we sent in SS */ 19286 rack->rc_gp_saw_ss = 1; 19287 } 19288 /* Time to copy in our header */ 19289 cpto = mtod(m, uint8_t *); 19290 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); 19291 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr)); 19292 if (optlen) { 19293 bcopy(opt, th + 1, optlen); 19294 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 19295 } else { 19296 th->th_off = sizeof(struct tcphdr) >> 2; 19297 } 19298 if ((rack->r_ctl.crte != NULL) && 19299 tcp_bblogging_on(tp)) { 19300 rack_log_queue_level(tp, rack, len, tv, cts); 19301 } 19302 if (tcp_bblogging_on(rack->rc_tp)) { 19303 union tcp_log_stackspecific log; 19304 19305 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 19306 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 19307 if (rack->rack_no_prr) 19308 log.u_bbr.flex1 = 0; 19309 else 19310 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 19311 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 19312 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 19313 log.u_bbr.flex4 = max_val; 19314 /* Save off the early/late values */ 19315 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 19316 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 19317 log.u_bbr.bw_inuse = rack_get_bw(rack); 19318 log.u_bbr.cur_del_rate = rack->r_ctl.gp_bw; 19319 log.u_bbr.flex8 = 0; 19320 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); 19321 log.u_bbr.flex7 = 44; 19322 log.u_bbr.pkts_out = tp->t_maxseg; 19323 log.u_bbr.timeStamp = cts; 19324 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 19325 log.u_bbr.flex5 = log.u_bbr.inflight; 19326 log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use; 19327 log.u_bbr.delivered = 0; 19328 log.u_bbr.rttProp = 0; 19329 log.u_bbr.delRate = rack->r_must_retran; 19330 log.u_bbr.delRate <<= 1; 19331 log.u_bbr.pkt_epoch = __LINE__; 19332 /* For fast output no retrans so just inflight and how many mss we send */ 19333 log.u_bbr.flex5 = log.u_bbr.inflight; 19334 log.u_bbr.bbr_substate = (uint8_t)((len + 
segsiz - 1)/segsiz); 19335 lgb = tcp_log_event(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, 19336 len, &log, false, NULL, __func__, __LINE__, tv); 19337 } else 19338 lgb = NULL; 19339 #ifdef INET6 19340 if (rack->r_is_v6) { 19341 error = ip6_output(m, NULL, 19342 &inp->inp_route6, 19343 0, NULL, NULL, inp); 19344 } 19345 #endif 19346 #if defined(INET) && defined(INET6) 19347 else 19348 #endif 19349 #ifdef INET 19350 { 19351 error = ip_output(m, NULL, 19352 &inp->inp_route, 19353 0, 0, inp); 19354 } 19355 #endif 19356 if (lgb) { 19357 lgb->tlb_errno = error; 19358 lgb = NULL; 19359 } 19360 if (error) { 19361 *send_err = error; 19362 m = NULL; 19363 goto failed; 19364 } else if (rack->rc_hw_nobuf) { 19365 rack->rc_hw_nobuf = 0; 19366 rack->r_ctl.rc_agg_delayed = 0; 19367 rack->r_early = 0; 19368 rack->r_late = 0; 19369 rack->r_ctl.rc_agg_early = 0; 19370 } 19371 if ((error == 0) && (rack->lt_bw_up == 0)) { 19372 /* Unlikely */ 19373 rack->r_ctl.lt_timemark = tcp_tv_to_lusectick(tv); 19374 rack->r_ctl.lt_seq = tp->snd_una; 19375 rack->lt_bw_up = 1; 19376 } 19377 rack_log_output(tp, &to, len, tp->snd_max, flags, error, rack_to_usec_ts(tv), 19378 NULL, add_flag, s_mb, s_soff, rack->r_ctl.fsb.hw_tls, segsiz); 19379 m = NULL; 19380 if (tp->snd_una == tp->snd_max) { 19381 rack->r_ctl.rc_tlp_rxt_last_time = cts; 19382 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 19383 tp->t_acktime = ticks; 19384 } 19385 counter_u64_add(rack_total_bytes, len); 19386 tcp_account_for_send(tp, len, 0, 0, rack->r_ctl.fsb.hw_tls); 19387 19388 rack->forced_ack = 0; /* If we send something zap the FA flag */ 19389 tot_len += len; 19390 if ((tp->t_flags & TF_GPUTINPROG) == 0) 19391 rack_start_gp_measurement(tp, rack, tp->snd_max, sb_offset); 19392 tp->snd_max += len; 19393 tp->snd_nxt = tp->snd_max; 19394 if (rack->rc_new_rnd_needed) { 19395 /* 19396 * Update the rnd to start ticking not 19397 * that from a time perspective all of 19398 * the preceding idle time is "in the round" 19399 */ 19400 rack->rc_new_rnd_needed = 0; 19401 rack->r_ctl.roundends = tp->snd_max; 19402 } 19403 { 19404 int idx; 19405 19406 idx = (len / segsiz) + 3; 19407 if (idx >= TCP_MSS_ACCT_ATIMER) 19408 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 19409 else 19410 counter_u64_add(rack_out_size[idx], 1); 19411 } 19412 if (len <= rack->r_ctl.fsb.left_to_send) 19413 rack->r_ctl.fsb.left_to_send -= len; 19414 else 19415 rack->r_ctl.fsb.left_to_send = 0; 19416 if (rack->r_ctl.fsb.left_to_send < segsiz) { 19417 rack->r_fast_output = 0; 19418 rack->r_ctl.fsb.left_to_send = 0; 19419 /* At the end of fast_output scale up the sb */ 19420 SOCKBUF_LOCK(&rack->rc_inp->inp_socket->so_snd); 19421 rack_sndbuf_autoscale(rack); 19422 SOCKBUF_UNLOCK(&rack->rc_inp->inp_socket->so_snd); 19423 } 19424 if (tp->t_rtttime == 0) { 19425 tp->t_rtttime = ticks; 19426 tp->t_rtseq = startseq; 19427 KMOD_TCPSTAT_INC(tcps_segstimed); 19428 } 19429 if ((rack->r_ctl.fsb.left_to_send >= segsiz) && 19430 (max_val > len) && 19431 (tso == 0)) { 19432 max_val -= len; 19433 len = segsiz; 19434 th = rack->r_ctl.fsb.th; 19435 #ifdef TCP_ACCOUNTING 19436 cnt_thru++; 19437 #endif 19438 goto again; 19439 } 19440 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 19441 counter_u64_add(rack_fto_send, 1); 19442 slot = rack_get_pacing_delay(rack, tp, tot_len, NULL, segsiz); 19443 rack_start_hpts_timer(rack, tp, cts, slot, tot_len, 0); 19444 #ifdef TCP_ACCOUNTING 19445 crtsc = get_cyclecount(); 19446 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19447 
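        /* Charge one count per pass through the fast-output send loop. */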
tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru; 19448 } 19449 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19450 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); 19451 } 19452 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19453 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len + segsiz - 1) / segsiz); 19454 } 19455 sched_unpin(); 19456 #endif 19457 return (0); 19458 failed: 19459 if (m) 19460 m_free(m); 19461 rack->r_fast_output = 0; 19462 return (-1); 19463 } 19464 19465 static inline void 19466 rack_setup_fast_output(struct tcpcb *tp, struct tcp_rack *rack, 19467 struct sockbuf *sb, 19468 int len, int orig_len, int segsiz, uint32_t pace_max_seg, 19469 bool hw_tls, 19470 uint16_t flags) 19471 { 19472 rack->r_fast_output = 1; 19473 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); 19474 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; 19475 rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(rack->r_ctl.fsb.m); 19476 rack->r_ctl.fsb.tcp_flags = flags; 19477 rack->r_ctl.fsb.left_to_send = orig_len - len; 19478 if (rack->r_ctl.fsb.left_to_send < pace_max_seg) { 19479 /* Less than a full sized pace, lets not */ 19480 rack->r_fast_output = 0; 19481 return; 19482 } else { 19483 /* Round down to the nearest pace_max_seg */ 19484 rack->r_ctl.fsb.left_to_send = rounddown(rack->r_ctl.fsb.left_to_send, pace_max_seg); 19485 } 19486 if (hw_tls) 19487 rack->r_ctl.fsb.hw_tls = 1; 19488 else 19489 rack->r_ctl.fsb.hw_tls = 0; 19490 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))), 19491 ("rack:%p left_to_send:%u sbavail:%u out:%u", 19492 rack, rack->r_ctl.fsb.left_to_send, sbavail(sb), 19493 (tp->snd_max - tp->snd_una))); 19494 if (rack->r_ctl.fsb.left_to_send < segsiz) 19495 rack->r_fast_output = 0; 19496 else { 19497 if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una))) 19498 rack->r_ctl.fsb.rfo_apply_push = 1; 19499 else 19500 rack->r_ctl.fsb.rfo_apply_push = 0; 19501 } 19502 } 19503 19504 static uint32_t 19505 rack_get_hpts_pacing_min_for_bw(struct tcp_rack *rack, int32_t segsiz) 19506 { 19507 uint64_t min_time; 19508 uint32_t maxlen; 19509 19510 min_time = (uint64_t)get_hpts_min_sleep_time(); 19511 maxlen = (uint32_t)((rack->r_ctl.gp_bw * min_time) / (uint64_t)HPTS_USEC_IN_SEC); 19512 maxlen = roundup(maxlen, segsiz); 19513 return (maxlen); 19514 } 19515 19516 static struct rack_sendmap * 19517 rack_check_collapsed(struct tcp_rack *rack, uint32_t cts) 19518 { 19519 struct rack_sendmap *rsm = NULL; 19520 int thresh; 19521 19522 restart: 19523 rsm = tqhash_find(rack->r_ctl.tqh, rack->r_ctl.last_collapse_point); 19524 if ((rsm == NULL) || ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0)) { 19525 /* Nothing, strange turn off validity */ 19526 rack->r_collapse_point_valid = 0; 19527 return (NULL); 19528 } 19529 /* Can we send it yet? */ 19530 if (rsm->r_end > (rack->rc_tp->snd_una + rack->rc_tp->snd_wnd)) { 19531 /* 19532 * Receiver window has not grown enough for 19533 * the segment to be put on the wire. 19534 */ 19535 return (NULL); 19536 } 19537 if (rsm->r_flags & RACK_ACKED) { 19538 /* 19539 * It has been sacked, lets move to the 19540 * next one if possible. 19541 */ 19542 rack->r_ctl.last_collapse_point = rsm->r_end; 19543 /* Are we done? */ 19544 if (SEQ_GEQ(rack->r_ctl.last_collapse_point, 19545 rack->r_ctl.high_collapse_point)) { 19546 rack->r_collapse_point_valid = 0; 19547 return (NULL); 19548 } 19549 goto restart; 19550 } 19551 /* Now has it been long enough ? 
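     * That is, has at least the RACK reorder/loss threshold elapsed since
     * this segment was last (re)transmitted?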
*/ 19552 thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(rack->rc_tp, rack), cts); 19553 if ((cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])) > thresh) { 19554 rack_log_collapse(rack, rsm->r_start, 19555 (cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])), 19556 thresh, __LINE__, 6, rsm->r_flags, rsm); 19557 return (rsm); 19558 } 19559 /* Not enough time */ 19560 rack_log_collapse(rack, rsm->r_start, 19561 (cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])), 19562 thresh, __LINE__, 7, rsm->r_flags, rsm); 19563 return (NULL); 19564 } 19565 19566 static inline void 19567 rack_validate_sizes(struct tcp_rack *rack, int32_t *len, int32_t segsiz, uint32_t pace_max_seg) 19568 { 19569 if ((rack->full_size_rxt == 0) && 19570 (rack->shape_rxt_to_pacing_min == 0) && 19571 (*len >= segsiz)) { 19572 *len = segsiz; 19573 } else if (rack->shape_rxt_to_pacing_min && 19574 rack->gp_ready) { 19575 /* We use pacing min as shaping len req */ 19576 uint32_t maxlen; 19577 19578 maxlen = rack_get_hpts_pacing_min_for_bw(rack, segsiz); 19579 if (*len > maxlen) 19580 *len = maxlen; 19581 } else { 19582 /* 19583 * The else is full_size_rxt is on so send it all 19584 * note we do need to check this for exceeding 19585 * our max segment size due to the fact that 19586 * we do sometimes merge chunks together i.e. 19587 * we cannot just assume that we will never have 19588 * a chunk greater than pace_max_seg 19589 */ 19590 if (*len > pace_max_seg) 19591 *len = pace_max_seg; 19592 } 19593 } 19594 19595 static int 19596 rack_output(struct tcpcb *tp) 19597 { 19598 struct socket *so; 19599 uint32_t recwin; 19600 uint32_t sb_offset, s_moff = 0; 19601 int32_t len, error = 0; 19602 uint16_t flags; 19603 struct mbuf *m, *s_mb = NULL; 19604 struct mbuf *mb; 19605 uint32_t if_hw_tsomaxsegcount = 0; 19606 uint32_t if_hw_tsomaxsegsize; 19607 int32_t segsiz, minseg; 19608 long tot_len_this_send = 0; 19609 #ifdef INET 19610 struct ip *ip = NULL; 19611 #endif 19612 struct udphdr *udp = NULL; 19613 struct tcp_rack *rack; 19614 struct tcphdr *th; 19615 uint8_t pass = 0; 19616 uint8_t mark = 0; 19617 uint8_t check_done = 0; 19618 uint8_t wanted_cookie = 0; 19619 u_char opt[TCP_MAXOLEN]; 19620 unsigned ipoptlen, optlen, hdrlen, ulen=0; 19621 uint32_t rack_seq; 19622 19623 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 19624 unsigned ipsec_optlen = 0; 19625 19626 #endif 19627 int32_t idle, sendalot; 19628 int32_t sub_from_prr = 0; 19629 volatile int32_t sack_rxmit; 19630 struct rack_sendmap *rsm = NULL; 19631 int32_t tso, mtu; 19632 struct tcpopt to; 19633 int32_t slot = 0; 19634 int32_t sup_rack = 0; 19635 uint32_t cts, ms_cts, delayed, early; 19636 uint16_t add_flag = RACK_SENT_SP; 19637 /* The doing_tlp flag will be set by the actual rack_timeout_tlp() */ 19638 uint8_t doing_tlp = 0; 19639 uint32_t cwnd_to_use, pace_max_seg; 19640 int32_t do_a_prefetch = 0; 19641 int32_t prefetch_rsm = 0; 19642 int32_t orig_len = 0; 19643 struct timeval tv; 19644 int32_t prefetch_so_done = 0; 19645 struct tcp_log_buffer *lgb; 19646 struct inpcb *inp = tptoinpcb(tp); 19647 struct sockbuf *sb; 19648 uint64_t ts_val = 0; 19649 #ifdef TCP_ACCOUNTING 19650 uint64_t crtsc; 19651 #endif 19652 #ifdef INET6 19653 struct ip6_hdr *ip6 = NULL; 19654 int32_t isipv6; 19655 #endif 19656 bool hpts_calling, hw_tls = false; 19657 19658 NET_EPOCH_ASSERT(); 19659 INP_WLOCK_ASSERT(inp); 19660 19661 /* setup and take the cache hits here */ 19662 rack = (struct tcp_rack *)tp->t_fb_ptr; 19663 #ifdef TCP_ACCOUNTING 19664 sched_pin(); 19665 ts_val = 
get_cyclecount(); 19666 #endif 19667 hpts_calling = !!(tp->t_flags2 & TF2_HPTS_CALLS); 19668 tp->t_flags2 &= ~TF2_HPTS_CALLS; 19669 #ifdef TCP_OFFLOAD 19670 if (tp->t_flags & TF_TOE) { 19671 #ifdef TCP_ACCOUNTING 19672 sched_unpin(); 19673 #endif 19674 return (tcp_offload_output(tp)); 19675 } 19676 #endif 19677 if (rack->rack_deferred_inited == 0) { 19678 /* 19679 * If we are the connecting socket we will 19680 * hit rack_init() when no sequence numbers 19681 * are setup. This makes it so we must defer 19682 * some initialization. Call that now. 19683 */ 19684 rack_deferred_init(tp, rack); 19685 } 19686 /* 19687 * For TFO connections in SYN_RECEIVED, only allow the initial 19688 * SYN|ACK and those sent by the retransmit timer. 19689 */ 19690 if (IS_FASTOPEN(tp->t_flags) && 19691 (tp->t_state == TCPS_SYN_RECEIVED) && 19692 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */ 19693 (rack->r_ctl.rc_resend == NULL)) { /* not a retransmit */ 19694 #ifdef TCP_ACCOUNTING 19695 sched_unpin(); 19696 #endif 19697 return (0); 19698 } 19699 #ifdef INET6 19700 if (rack->r_state) { 19701 /* Use the cache line loaded if possible */ 19702 isipv6 = rack->r_is_v6; 19703 } else { 19704 isipv6 = (rack->rc_inp->inp_vflag & INP_IPV6) != 0; 19705 } 19706 #endif 19707 early = 0; 19708 cts = tcp_get_usecs(&tv); 19709 ms_cts = tcp_tv_to_mssectick(&tv); 19710 if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && 19711 tcp_in_hpts(rack->rc_tp)) { 19712 /* 19713 * We are on the hpts for some timer but not hptsi output. 19714 * Remove from the hpts unconditionally. 19715 */ 19716 rack_timer_cancel(tp, rack, cts, __LINE__); 19717 } 19718 /* Are we pacing and late? */ 19719 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 19720 TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) { 19721 /* We are delayed */ 19722 delayed = cts - rack->r_ctl.rc_last_output_to; 19723 } else { 19724 delayed = 0; 19725 } 19726 /* Do the timers, which may override the pacer */ 19727 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 19728 int retval; 19729 19730 retval = rack_process_timers(tp, rack, cts, hpts_calling, 19731 &doing_tlp); 19732 if (retval != 0) { 19733 counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); 19734 #ifdef TCP_ACCOUNTING 19735 sched_unpin(); 19736 #endif 19737 /* 19738 * If timers want tcp_drop(), then pass error out, 19739 * otherwise suppress it. 19740 */ 19741 return (retval < 0 ? retval : 0); 19742 } 19743 } 19744 if (rack->rc_in_persist) { 19745 if (tcp_in_hpts(rack->rc_tp) == 0) { 19746 /* Timer is not running */ 19747 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 19748 } 19749 #ifdef TCP_ACCOUNTING 19750 sched_unpin(); 19751 #endif 19752 return (0); 19753 } 19754 if ((rack->rc_ack_required == 1) && 19755 (rack->r_timer_override == 0)){ 19756 /* A timeout occurred and no ack has arrived */ 19757 if (tcp_in_hpts(rack->rc_tp) == 0) { 19758 /* Timer is not running */ 19759 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 19760 } 19761 #ifdef TCP_ACCOUNTING 19762 sched_unpin(); 19763 #endif 19764 return (0); 19765 } 19766 if ((rack->r_timer_override) || 19767 (rack->rc_ack_can_sendout_data) || 19768 (delayed) || 19769 (tp->t_state < TCPS_ESTABLISHED)) { 19770 rack->rc_ack_can_sendout_data = 0; 19771 if (tcp_in_hpts(rack->rc_tp)) 19772 tcp_hpts_remove(rack->rc_tp); 19773 } else if (tcp_in_hpts(rack->rc_tp)) { 19774 /* 19775 * On the hpts you can't pass even if ACKNOW is on, we will 19776 * when the hpts fires. 
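     * The cycles spent here are charged to SND_BLOCKED below (when
     * TCP_ACCOUNTING is compiled in) before we return without sending.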
19777 */ 19778 #ifdef TCP_ACCOUNTING 19779 crtsc = get_cyclecount(); 19780 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19781 tp->tcp_proc_time[SND_BLOCKED] += (crtsc - ts_val); 19782 } 19783 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19784 tp->tcp_cnt_counters[SND_BLOCKED]++; 19785 } 19786 sched_unpin(); 19787 #endif 19788 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1); 19789 return (0); 19790 } 19791 /* Finish out both pacing early and late accounting */ 19792 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 19793 TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) { 19794 early = rack->r_ctl.rc_last_output_to - cts; 19795 } else 19796 early = 0; 19797 if (delayed) { 19798 rack->r_ctl.rc_agg_delayed += delayed; 19799 rack->r_late = 1; 19800 } else if (early) { 19801 rack->r_ctl.rc_agg_early += early; 19802 rack->r_early = 1; 19803 } 19804 /* Now that early/late accounting is done turn off the flag */ 19805 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 19806 rack->r_wanted_output = 0; 19807 rack->r_timer_override = 0; 19808 if ((tp->t_state != rack->r_state) && 19809 TCPS_HAVEESTABLISHED(tp->t_state)) { 19810 rack_set_state(tp, rack); 19811 } 19812 if ((rack->r_fast_output) && 19813 (doing_tlp == 0) && 19814 (tp->rcv_numsacks == 0)) { 19815 int ret; 19816 19817 error = 0; 19818 ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error); 19819 if (ret >= 0) 19820 return(ret); 19821 else if (error) { 19822 inp = rack->rc_inp; 19823 so = inp->inp_socket; 19824 sb = &so->so_snd; 19825 goto nomore; 19826 } 19827 } 19828 inp = rack->rc_inp; 19829 /* 19830 * For TFO connections in SYN_SENT or SYN_RECEIVED, 19831 * only allow the initial SYN or SYN|ACK and those sent 19832 * by the retransmit timer. 19833 */ 19834 if (IS_FASTOPEN(tp->t_flags) && 19835 ((tp->t_state == TCPS_SYN_RECEIVED) || 19836 (tp->t_state == TCPS_SYN_SENT)) && 19837 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ 19838 (tp->t_rxtshift == 0)) { /* not a retransmit */ 19839 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 19840 so = inp->inp_socket; 19841 sb = &so->so_snd; 19842 goto just_return_nolock; 19843 } 19844 /* 19845 * Determine length of data that should be transmitted, and flags 19846 * that will be used. If there is some data or critical controls 19847 * (SYN, RST) to send, then transmit; otherwise, investigate 19848 * further. 
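     * The idle checks below may reset the congestion state after a long
     * idle period and can count a sufficiently long idle gap as a
     * probe-RTT round.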
19849 */ 19850 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); 19851 if (tp->t_idle_reduce) { 19852 if (idle && (TICKS_2_USEC(ticks - tp->t_rcvtime) >= tp->t_rxtcur)) 19853 rack_cc_after_idle(rack, tp); 19854 } 19855 tp->t_flags &= ~TF_LASTIDLE; 19856 if (idle) { 19857 if (tp->t_flags & TF_MORETOCOME) { 19858 tp->t_flags |= TF_LASTIDLE; 19859 idle = 0; 19860 } 19861 } 19862 if ((tp->snd_una == tp->snd_max) && 19863 rack->r_ctl.rc_went_idle_time && 19864 TSTMP_GT(cts, rack->r_ctl.rc_went_idle_time)) { 19865 idle = cts - rack->r_ctl.rc_went_idle_time; 19866 if (idle > rack_min_probertt_hold) { 19867 /* Count as a probe rtt */ 19868 if (rack->in_probe_rtt == 0) { 19869 rack->r_ctl.rc_lower_rtt_us_cts = cts; 19870 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; 19871 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; 19872 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; 19873 } else { 19874 rack_exit_probertt(rack, cts); 19875 } 19876 } 19877 idle = 0; 19878 } 19879 if (rack_use_fsb && 19880 (rack->r_ctl.fsb.tcp_ip_hdr) && 19881 (rack->r_fsb_inited == 0) && 19882 (rack->r_state != TCPS_CLOSED)) 19883 rack_init_fsb_block(tp, rack, tcp_outflags[tp->t_state]); 19884 again: 19885 /* 19886 * If we've recently taken a timeout, snd_max will be greater than 19887 * snd_nxt. There may be SACK information that allows us to avoid 19888 * resending already delivered data. Adjust snd_nxt accordingly. 19889 */ 19890 sendalot = 0; 19891 cts = tcp_get_usecs(&tv); 19892 ms_cts = tcp_tv_to_mssectick(&tv); 19893 tso = 0; 19894 mtu = 0; 19895 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 19896 minseg = segsiz; 19897 if (rack->r_ctl.rc_pace_max_segs == 0) 19898 pace_max_seg = rack->rc_user_set_max_segs * segsiz; 19899 else 19900 pace_max_seg = rack->r_ctl.rc_pace_max_segs; 19901 sb_offset = tp->snd_max - tp->snd_una; 19902 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 19903 flags = tcp_outflags[tp->t_state]; 19904 while (rack->rc_free_cnt < rack_free_cache) { 19905 rsm = rack_alloc(rack); 19906 if (rsm == NULL) { 19907 if (hpts_calling) 19908 /* Retry in a ms */ 19909 slot = (1 * HPTS_USEC_IN_MSEC); 19910 so = inp->inp_socket; 19911 sb = &so->so_snd; 19912 goto just_return_nolock; 19913 } 19914 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext); 19915 rack->rc_free_cnt++; 19916 rsm = NULL; 19917 } 19918 sack_rxmit = 0; 19919 len = 0; 19920 rsm = NULL; 19921 if (flags & TH_RST) { 19922 SOCKBUF_LOCK(&inp->inp_socket->so_snd); 19923 so = inp->inp_socket; 19924 sb = &so->so_snd; 19925 goto send; 19926 } 19927 if (rack->r_ctl.rc_resend) { 19928 /* Retransmit timer */ 19929 rsm = rack->r_ctl.rc_resend; 19930 rack->r_ctl.rc_resend = NULL; 19931 len = rsm->r_end - rsm->r_start; 19932 sack_rxmit = 1; 19933 sendalot = 0; 19934 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 19935 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 19936 __func__, __LINE__, 19937 rsm->r_start, tp->snd_una, tp, rack, rsm)); 19938 sb_offset = rsm->r_start - tp->snd_una; 19939 rack_validate_sizes(rack, &len, segsiz, pace_max_seg); 19940 } else if (rack->r_collapse_point_valid && 19941 ((rsm = rack_check_collapsed(rack, cts)) != NULL)) { 19942 /* 19943 * If an RSM is returned then enough time has passed 19944 * for us to retransmit it. Move up the collapse point, 19945 * since this rsm has its chance to retransmit now. 
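         * Once last_collapse_point reaches high_collapse_point the
         * collapse state is marked invalid again.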
19946 */ 19947 tcp_trace_point(rack->rc_tp, TCP_TP_COLLAPSED_RXT); 19948 rack->r_ctl.last_collapse_point = rsm->r_end; 19949 /* Are we done? */ 19950 if (SEQ_GEQ(rack->r_ctl.last_collapse_point, 19951 rack->r_ctl.high_collapse_point)) 19952 rack->r_collapse_point_valid = 0; 19953 sack_rxmit = 1; 19954 /* We are not doing a TLP */ 19955 doing_tlp = 0; 19956 len = rsm->r_end - rsm->r_start; 19957 sb_offset = rsm->r_start - tp->snd_una; 19958 sendalot = 0; 19959 rack_validate_sizes(rack, &len, segsiz, pace_max_seg); 19960 } else if ((rsm = tcp_rack_output(tp, rack, cts)) != NULL) { 19961 /* We have a retransmit that takes precedence */ 19962 if ((!IN_FASTRECOVERY(tp->t_flags)) && 19963 ((rsm->r_flags & RACK_MUST_RXT) == 0) && 19964 ((tp->t_flags & TF_WASFRECOVERY) == 0)) { 19965 /* Enter recovery if not induced by a time-out */ 19966 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__); 19967 } 19968 #ifdef INVARIANTS 19969 if (SEQ_LT(rsm->r_start, tp->snd_una)) { 19970 panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n", 19971 tp, rack, rsm, rsm->r_start, tp->snd_una); 19972 } 19973 #endif 19974 len = rsm->r_end - rsm->r_start; 19975 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 19976 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 19977 __func__, __LINE__, 19978 rsm->r_start, tp->snd_una, tp, rack, rsm)); 19979 sb_offset = rsm->r_start - tp->snd_una; 19980 sendalot = 0; 19981 rack_validate_sizes(rack, &len, segsiz, pace_max_seg); 19982 if (len > 0) { 19983 sack_rxmit = 1; 19984 KMOD_TCPSTAT_INC(tcps_sack_rexmits); 19985 KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes, 19986 min(len, segsiz)); 19987 } 19988 } else if (rack->r_ctl.rc_tlpsend) { 19989 /* Tail loss probe */ 19990 long cwin; 19991 long tlen; 19992 19993 /* 19994 * Check if we can do a TLP with a RACK'd packet 19995 * this can happen if we are not doing the rack 19996 * cheat and we skipped to a TLP and it 19997 * went off. 19998 */ 19999 rsm = rack->r_ctl.rc_tlpsend; 20000 /* We are doing a TLP make sure the flag is preent */ 20001 rsm->r_flags |= RACK_TLP; 20002 rack->r_ctl.rc_tlpsend = NULL; 20003 sack_rxmit = 1; 20004 tlen = rsm->r_end - rsm->r_start; 20005 if (tlen > segsiz) 20006 tlen = segsiz; 20007 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 20008 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 20009 __func__, __LINE__, 20010 rsm->r_start, tp->snd_una, tp, rack, rsm)); 20011 sb_offset = rsm->r_start - tp->snd_una; 20012 cwin = min(tp->snd_wnd, tlen); 20013 len = cwin; 20014 } 20015 if (rack->r_must_retran && 20016 (doing_tlp == 0) && 20017 (SEQ_GT(tp->snd_max, tp->snd_una)) && 20018 (rsm == NULL)) { 20019 /* 20020 * There are two different ways that we 20021 * can get into this block: 20022 * a) This is a non-sack connection, we had a time-out 20023 * and thus r_must_retran was set and everything 20024 * left outstanding as been marked for retransmit. 20025 * b) The MTU of the path shrank, so that everything 20026 * was marked to be retransmitted with the smaller 20027 * mtu and r_must_retran was set. 20028 * 20029 * This means that we expect the sendmap (outstanding) 20030 * to all be marked must. We can use the tmap to 20031 * look at them. 20032 * 20033 */ 20034 int sendwin, flight; 20035 20036 sendwin = min(tp->snd_wnd, tp->snd_cwnd); 20037 flight = ctf_flight_size(tp, rack->r_ctl.rc_out_at_rto); 20038 if (flight >= sendwin) { 20039 /* 20040 * We can't send yet. 
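             * Everything allowed by min(snd_wnd, snd_cwnd) is already in
             * flight, so wait for acks to open up space.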
20041 */ 20042 so = inp->inp_socket; 20043 sb = &so->so_snd; 20044 goto just_return_nolock; 20045 } 20046 /* 20047 * This is the case a/b mentioned above. All 20048 * outstanding/not-acked should be marked. 20049 * We can use the tmap to find them. 20050 */ 20051 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 20052 if (rsm == NULL) { 20053 /* TSNH */ 20054 rack->r_must_retran = 0; 20055 rack->r_ctl.rc_out_at_rto = 0; 20056 so = inp->inp_socket; 20057 sb = &so->so_snd; 20058 goto just_return_nolock; 20059 } 20060 if ((rsm->r_flags & RACK_MUST_RXT) == 0) { 20061 /* 20062 * The first one does not have the flag, did we collapse 20063 * further up in our list? 20064 */ 20065 rack->r_must_retran = 0; 20066 rack->r_ctl.rc_out_at_rto = 0; 20067 rsm = NULL; 20068 sack_rxmit = 0; 20069 } else { 20070 sack_rxmit = 1; 20071 len = rsm->r_end - rsm->r_start; 20072 sb_offset = rsm->r_start - tp->snd_una; 20073 sendalot = 0; 20074 if ((rack->full_size_rxt == 0) && 20075 (rack->shape_rxt_to_pacing_min == 0) && 20076 (len >= segsiz)) 20077 len = segsiz; 20078 else if (rack->shape_rxt_to_pacing_min && 20079 rack->gp_ready) { 20080 /* We use pacing min as shaping len req */ 20081 uint32_t maxlen; 20082 20083 maxlen = rack_get_hpts_pacing_min_for_bw(rack, segsiz); 20084 if (len > maxlen) 20085 len = maxlen; 20086 } 20087 /* 20088 * Delay removing the flag RACK_MUST_RXT so 20089 * that the fastpath for retransmit will 20090 * work with this rsm. 20091 */ 20092 } 20093 } 20094 /* 20095 * Enforce a connection sendmap count limit if set 20096 * as long as we are not retransmiting. 20097 */ 20098 if ((rsm == NULL) && 20099 (rack->do_detection == 0) && 20100 (V_tcp_map_entries_limit > 0) && 20101 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 20102 counter_u64_add(rack_to_alloc_limited, 1); 20103 if (!rack->alloc_limit_reported) { 20104 rack->alloc_limit_reported = 1; 20105 counter_u64_add(rack_alloc_limited_conns, 1); 20106 } 20107 so = inp->inp_socket; 20108 sb = &so->so_snd; 20109 goto just_return_nolock; 20110 } 20111 if (rsm && (rsm->r_flags & RACK_HAS_FIN)) { 20112 /* we are retransmitting the fin */ 20113 len--; 20114 if (len) { 20115 /* 20116 * When retransmitting data do *not* include the 20117 * FIN. This could happen from a TLP probe. 
20118 */ 20119 flags &= ~TH_FIN; 20120 } 20121 } 20122 if (rsm && rack->r_fsb_inited && 20123 rack_use_rsm_rfo && 20124 ((rsm->r_flags & RACK_HAS_FIN) == 0)) { 20125 int ret; 20126 20127 ret = rack_fast_rsm_output(tp, rack, rsm, ts_val, cts, ms_cts, &tv, len, doing_tlp); 20128 if (ret == 0) 20129 return (0); 20130 } 20131 so = inp->inp_socket; 20132 sb = &so->so_snd; 20133 if (do_a_prefetch == 0) { 20134 kern_prefetch(sb, &do_a_prefetch); 20135 do_a_prefetch = 1; 20136 } 20137 #ifdef NETFLIX_SHARED_CWND 20138 if ((tp->t_flags2 & TF2_TCP_SCWND_ALLOWED) && 20139 rack->rack_enable_scwnd) { 20140 /* We are doing cwnd sharing */ 20141 if (rack->gp_ready && 20142 (rack->rack_attempted_scwnd == 0) && 20143 (rack->r_ctl.rc_scw == NULL) && 20144 tp->t_lib) { 20145 /* The pcbid is in, lets make an attempt */ 20146 counter_u64_add(rack_try_scwnd, 1); 20147 rack->rack_attempted_scwnd = 1; 20148 rack->r_ctl.rc_scw = tcp_shared_cwnd_alloc(tp, 20149 &rack->r_ctl.rc_scw_index, 20150 segsiz); 20151 } 20152 if (rack->r_ctl.rc_scw && 20153 (rack->rack_scwnd_is_idle == 1) && 20154 sbavail(&so->so_snd)) { 20155 /* we are no longer out of data */ 20156 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 20157 rack->rack_scwnd_is_idle = 0; 20158 } 20159 if (rack->r_ctl.rc_scw) { 20160 /* First lets update and get the cwnd */ 20161 rack->r_ctl.cwnd_to_use = cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw, 20162 rack->r_ctl.rc_scw_index, 20163 tp->snd_cwnd, tp->snd_wnd, segsiz); 20164 } 20165 } 20166 #endif 20167 /* 20168 * Get standard flags, and add SYN or FIN if requested by 'hidden' 20169 * state flags. 20170 */ 20171 if (tp->t_flags & TF_NEEDFIN) 20172 flags |= TH_FIN; 20173 if (tp->t_flags & TF_NEEDSYN) 20174 flags |= TH_SYN; 20175 if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { 20176 void *end_rsm; 20177 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 20178 if (end_rsm) 20179 kern_prefetch(end_rsm, &prefetch_rsm); 20180 prefetch_rsm = 1; 20181 } 20182 SOCKBUF_LOCK(sb); 20183 /* 20184 * If snd_nxt == snd_max and we have transmitted a FIN, the 20185 * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a 20186 * negative length. This can also occur when TCP opens up its 20187 * congestion window while receiving additional duplicate acks after 20188 * fast-retransmit because TCP will reset snd_nxt to snd_max after 20189 * the fast-retransmit. 20190 * 20191 * In the normal retransmit-FIN-only case, however, snd_nxt will be 20192 * set to snd_una, the sb_offset will be 0, and the length may wind 20193 * up 0. 20194 * 20195 * If sack_rxmit is true we are retransmitting from the scoreboard 20196 * in which case len is already set. 
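 *
 * Hedged illustration of the non-scoreboard case handled below:
 * with snd_una = 1000, snd_nxt = 3000 and 5000 bytes in the socket
 * buffer,
 *
 *	sb_offset = snd_nxt - snd_una = 2000
 *	len       = whatever the cwnd/rwnd/PRR logic below allows
 *	            out of the remaining 3000 bytes
 *
 * whereas a scoreboard retransmit reaches this point with len and
 * sb_offset already derived from the rsm's r_start/r_end.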
20197 */ 20198 if ((sack_rxmit == 0) && 20199 (TCPS_HAVEESTABLISHED(tp->t_state) || IS_FASTOPEN(tp->t_flags))) { 20200 uint32_t avail; 20201 20202 avail = sbavail(sb); 20203 if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail) 20204 sb_offset = tp->snd_nxt - tp->snd_una; 20205 else 20206 sb_offset = 0; 20207 if ((IN_FASTRECOVERY(tp->t_flags) == 0) || rack->rack_no_prr) { 20208 if (rack->r_ctl.rc_tlp_new_data) { 20209 /* TLP is forcing out new data */ 20210 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) { 20211 rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset); 20212 } 20213 if ((rack->r_ctl.rc_tlp_new_data + sb_offset) > tp->snd_wnd) { 20214 if (tp->snd_wnd > sb_offset) 20215 len = tp->snd_wnd - sb_offset; 20216 else 20217 len = 0; 20218 } else { 20219 len = rack->r_ctl.rc_tlp_new_data; 20220 } 20221 rack->r_ctl.rc_tlp_new_data = 0; 20222 } else { 20223 len = rack_what_can_we_send(tp, rack, cwnd_to_use, avail, sb_offset); 20224 } 20225 if ((rack->r_ctl.crte == NULL) && 20226 IN_FASTRECOVERY(tp->t_flags) && 20227 (rack->full_size_rxt == 0) && 20228 (rack->shape_rxt_to_pacing_min == 0) && 20229 (len > segsiz)) { 20230 /* 20231 * For prr=off, we need to send only 1 MSS 20232 * at a time. We do this because another sack could 20233 * be arriving that causes us to send retransmits and 20234 * we don't want to be on a long pace due to a larger send 20235 * that keeps us from sending out the retransmit. 20236 */ 20237 len = segsiz; 20238 } else if (rack->shape_rxt_to_pacing_min && 20239 rack->gp_ready) { 20240 /* We use pacing min as shaping len req */ 20241 uint32_t maxlen; 20242 20243 maxlen = rack_get_hpts_pacing_min_for_bw(rack, segsiz); 20244 if (len > maxlen) 20245 len = maxlen; 20246 }/* The else is full_size_rxt is on so send it all */ 20247 } else { 20248 uint32_t outstanding; 20249 /* 20250 * We are inside of a Fast recovery episode, this 20251 * is caused by a SACK or 3 dup acks. At this point 20252 * we have sent all the retransmissions and we rely 20253 * on PRR to dictate what we will send in the form of 20254 * new data. 20255 */ 20256 20257 outstanding = tp->snd_max - tp->snd_una; 20258 if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) { 20259 if (tp->snd_wnd > outstanding) { 20260 len = tp->snd_wnd - outstanding; 20261 /* Check to see if we have the data */ 20262 if ((sb_offset + len) > avail) { 20263 /* It does not all fit */ 20264 if (avail > sb_offset) 20265 len = avail - sb_offset; 20266 else 20267 len = 0; 20268 } 20269 } else { 20270 len = 0; 20271 } 20272 } else if (avail > sb_offset) { 20273 len = avail - sb_offset; 20274 } else { 20275 len = 0; 20276 } 20277 if (len > 0) { 20278 if (len > rack->r_ctl.rc_prr_sndcnt) { 20279 len = rack->r_ctl.rc_prr_sndcnt; 20280 } 20281 if (len > 0) { 20282 sub_from_prr = 1; 20283 } 20284 } 20285 if (len > segsiz) { 20286 /* 20287 * We should never send more than a MSS when 20288 * retransmitting or sending new data in prr 20289 * mode unless the override flag is on. Most 20290 * likely the PRR algorithm is not going to 20291 * let us send a lot as well :-) 20292 */ 20293 if (rack->r_ctl.rc_prr_sendalot == 0) { 20294 len = segsiz; 20295 } 20296 } else if (len < segsiz) { 20297 /* 20298 * Do we send any? The idea here is if the 20299 * send empty's the socket buffer we want to 20300 * do it. However if not then lets just wait 20301 * for our prr_sndcnt to get bigger. 
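 *
 * Worked example (illustrative numbers): segsiz = 1448,
 * rc_prr_sndcnt = 600, sbavail(sb) = 10000, sb_offset = 9400:
 *
 *	len      = 600 (clamped to prr_sndcnt)
 *	leftinsb = 10000 - 9400 = 600, which is not > len
 *
 * so the runt goes out because it drains the socket buffer; had
 * more data been left behind we would zero len and wait for PRR
 * to grant a fuller segment.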
20302 */ 20303 long leftinsb; 20304 20305 leftinsb = sbavail(sb) - sb_offset; 20306 if (leftinsb > len) { 20307 /* This send does not empty the sb */ 20308 len = 0; 20309 } 20310 } 20311 } 20312 } else if (!TCPS_HAVEESTABLISHED(tp->t_state)) { 20313 /* 20314 * If you have not established 20315 * and are not doing FAST OPEN 20316 * no data please. 20317 */ 20318 if ((sack_rxmit == 0) && 20319 (!IS_FASTOPEN(tp->t_flags))){ 20320 len = 0; 20321 sb_offset = 0; 20322 } 20323 } 20324 if (prefetch_so_done == 0) { 20325 kern_prefetch(so, &prefetch_so_done); 20326 prefetch_so_done = 1; 20327 } 20328 /* 20329 * Lop off SYN bit if it has already been sent. However, if this is 20330 * SYN-SENT state and if segment contains data and if we don't know 20331 * that foreign host supports TAO, suppress sending segment. 20332 */ 20333 if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) && 20334 ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) { 20335 /* 20336 * When sending additional segments following a TFO SYN|ACK, 20337 * do not include the SYN bit. 20338 */ 20339 if (IS_FASTOPEN(tp->t_flags) && 20340 (tp->t_state == TCPS_SYN_RECEIVED)) 20341 flags &= ~TH_SYN; 20342 } 20343 /* 20344 * Be careful not to send data and/or FIN on SYN segments. This 20345 * measure is needed to prevent interoperability problems with not 20346 * fully conformant TCP implementations. 20347 */ 20348 if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { 20349 len = 0; 20350 flags &= ~TH_FIN; 20351 } 20352 /* 20353 * On TFO sockets, ensure no data is sent in the following cases: 20354 * 20355 * - When retransmitting SYN|ACK on a passively-created socket 20356 * 20357 * - When retransmitting SYN on an actively created socket 20358 * 20359 * - When sending a zero-length cookie (cookie request) on an 20360 * actively created socket 20361 * 20362 * - When the socket is in the CLOSED state (RST is being sent) 20363 */ 20364 if (IS_FASTOPEN(tp->t_flags) && 20365 (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || 20366 ((tp->t_state == TCPS_SYN_SENT) && 20367 (tp->t_tfo_client_cookie_len == 0)) || 20368 (flags & TH_RST))) { 20369 sack_rxmit = 0; 20370 len = 0; 20371 } 20372 /* Without fast-open there should never be data sent on a SYN */ 20373 if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) { 20374 tp->snd_nxt = tp->iss; 20375 len = 0; 20376 } 20377 if ((len > segsiz) && (tcp_dsack_block_exists(tp))) { 20378 /* We only send 1 MSS if we have a DSACK block */ 20379 add_flag |= RACK_SENT_W_DSACK; 20380 len = segsiz; 20381 } 20382 orig_len = len; 20383 if (len <= 0) { 20384 /* 20385 * If FIN has been sent but not acked, but we haven't been 20386 * called to retransmit, len will be < 0. Otherwise, window 20387 * shrank after we sent into it. If window shrank to 0, 20388 * cancel pending retransmit, pull snd_nxt back to (closed) 20389 * window, and set the persist timer if it isn't already 20390 * going. If the window didn't close completely, just wait 20391 * for an ACK. 20392 * 20393 * We also do a general check here to ensure that we will 20394 * set the persist timer when we have data to send, but a 20395 * 0-byte window. This makes sure the persist timer is set 20396 * even if the packet hits one of the "goto send" lines 20397 * below. 
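 *
 * Condensed, the persist entry just below fires only when all of
 * the following hold (a restatement of the test, not new logic):
 *
 *	tp->snd_wnd == 0                  peer advertised a closed window
 *	TCPS_HAVEESTABLISHED(t_state)     connection is established
 *	tp->snd_una == tp->snd_max        nothing is outstanding
 *	sb_offset < sbavail(sb)           data is still queued to send
 *
 * which is exactly the "data waiting on a zero window" case the
 * persist timer exists for.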
20398 */ 20399 len = 0; 20400 if ((tp->snd_wnd == 0) && 20401 (TCPS_HAVEESTABLISHED(tp->t_state)) && 20402 (tp->snd_una == tp->snd_max) && 20403 (sb_offset < (int)sbavail(sb))) { 20404 rack_enter_persist(tp, rack, cts, tp->snd_una); 20405 } 20406 } else if ((rsm == NULL) && 20407 (doing_tlp == 0) && 20408 (len < pace_max_seg)) { 20409 /* 20410 * We are not sending a maximum sized segment for 20411 * some reason. Should we not send anything (think 20412 * sws or persists)? 20413 */ 20414 if ((tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg)) && 20415 (TCPS_HAVEESTABLISHED(tp->t_state)) && 20416 (len < minseg) && 20417 (len < (int)(sbavail(sb) - sb_offset))) { 20418 /* 20419 * Here the rwnd is less than 20420 * the minimum pacing size, this is not a retransmit, 20421 * we are established and 20422 * the send is not the last in the socket buffer 20423 * we send nothing, and we may enter persists 20424 * if nothing is outstanding. 20425 */ 20426 len = 0; 20427 if (tp->snd_max == tp->snd_una) { 20428 /* 20429 * Nothing out we can 20430 * go into persists. 20431 */ 20432 rack_enter_persist(tp, rack, cts, tp->snd_una); 20433 } 20434 } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) && 20435 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && 20436 (len < (int)(sbavail(sb) - sb_offset)) && 20437 (len < minseg)) { 20438 /* 20439 * Here we are not retransmitting, and 20440 * the cwnd is not so small that we could 20441 * not send at least a min size (rxt timer 20442 * not having gone off), We have 2 segments or 20443 * more already in flight, its not the tail end 20444 * of the socket buffer and the cwnd is blocking 20445 * us from sending out a minimum pacing segment size. 20446 * Lets not send anything. 20447 */ 20448 len = 0; 20449 } else if (((tp->snd_wnd - ctf_outstanding(tp)) < 20450 min((rack->r_ctl.rc_high_rwnd/2), minseg)) && 20451 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && 20452 (len < (int)(sbavail(sb) - sb_offset)) && 20453 (TCPS_HAVEESTABLISHED(tp->t_state))) { 20454 /* 20455 * Here we have a send window but we have 20456 * filled it up and we can't send another pacing segment. 20457 * We also have in flight more than 2 segments 20458 * and we are not completing the sb i.e. we allow 20459 * the last bytes of the sb to go out even if 20460 * its not a full pacing segment. 20461 */ 20462 len = 0; 20463 } else if ((rack->r_ctl.crte != NULL) && 20464 (tp->snd_wnd >= (pace_max_seg * max(1, rack_hw_rwnd_factor))) && 20465 (cwnd_to_use >= (pace_max_seg + (4 * segsiz))) && 20466 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) >= (2 * segsiz)) && 20467 (len < (int)(sbavail(sb) - sb_offset))) { 20468 /* 20469 * Here we are doing hardware pacing, this is not a TLP, 20470 * we are not sending a pace max segment size, there is rwnd 20471 * room to send at least N pace_max_seg, the cwnd is greater 20472 * than or equal to a full pacing segments plus 4 mss and we have 2 or 20473 * more segments in flight and its not the tail of the socket buffer. 20474 * 20475 * We don't want to send instead we need to get more ack's in to 20476 * allow us to send a full pacing segment. Normally, if we are pacing 20477 * about the right speed, we should have finished our pacing 20478 * send as most of the acks have come back if we are at the 20479 * right rate. This is a bit fuzzy since return path delay 20480 * can delay the acks, which is why we want to make sure we 20481 * have cwnd space to have a bit more than a max pace segments in flight. 
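 *
 * To make the gate concrete (illustrative numbers only): with
 * pace_max_seg = 64k, segsiz = 1448, rack_hw_rwnd_factor = 2,
 * snd_wnd = 256k, cwnd_to_use = 80k and roughly 70k in flight,
 * each clause of the condition above holds, so a send smaller than
 * pace_max_seg that is not the tail of the socket buffer is held
 * back until more acks arrive instead of dribbling out a couple of
 * MSS and spoiling the next large TSO burst.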
20482 * 20483 * If we have not gotten our acks back we are pacing at too high a 20484 * rate delaying will not hurt and will bring our GP estimate down by 20485 * injecting the delay. If we don't do this we will send 20486 * 2 MSS out in response to the acks being clocked in which 20487 * defeats the point of hw-pacing (i.e. to help us get 20488 * larger TSO's out). 20489 */ 20490 len = 0; 20491 } 20492 20493 } 20494 /* len will be >= 0 after this point. */ 20495 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 20496 rack_sndbuf_autoscale(rack); 20497 /* 20498 * Decide if we can use TCP Segmentation Offloading (if supported by 20499 * hardware). 20500 * 20501 * TSO may only be used if we are in a pure bulk sending state. The 20502 * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP 20503 * options prevent using TSO. With TSO the TCP header is the same 20504 * (except for the sequence number) for all generated packets. This 20505 * makes it impossible to transmit any options which vary per 20506 * generated segment or packet. 20507 * 20508 * IPv4 handling has a clear separation of ip options and ip header 20509 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does 20510 * the right thing below to provide length of just ip options and thus 20511 * checking for ipoptlen is enough to decide if ip options are present. 20512 */ 20513 ipoptlen = 0; 20514 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 20515 /* 20516 * Pre-calculate here as we save another lookup into the darknesses 20517 * of IPsec that way and can actually decide if TSO is ok. 20518 */ 20519 #ifdef INET6 20520 if (isipv6 && IPSEC_ENABLED(ipv6)) 20521 ipsec_optlen = IPSEC_HDRSIZE(ipv6, inp); 20522 #ifdef INET 20523 else 20524 #endif 20525 #endif /* INET6 */ 20526 #ifdef INET 20527 if (IPSEC_ENABLED(ipv4)) 20528 ipsec_optlen = IPSEC_HDRSIZE(ipv4, inp); 20529 #endif /* INET */ 20530 #endif 20531 20532 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 20533 ipoptlen += ipsec_optlen; 20534 #endif 20535 if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > segsiz && 20536 (tp->t_port == 0) && 20537 ((tp->t_flags & TF_SIGNATURE) == 0) && 20538 tp->rcv_numsacks == 0 && sack_rxmit == 0 && 20539 ipoptlen == 0) 20540 tso = 1; 20541 { 20542 uint32_t outstanding __unused; 20543 20544 outstanding = tp->snd_max - tp->snd_una; 20545 if (tp->t_flags & TF_SENTFIN) { 20546 /* 20547 * If we sent a fin, snd_max is 1 higher than 20548 * snd_una 20549 */ 20550 outstanding--; 20551 } 20552 if (sack_rxmit) { 20553 if ((rsm->r_flags & RACK_HAS_FIN) == 0) 20554 flags &= ~TH_FIN; 20555 } else { 20556 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + 20557 sbused(sb))) 20558 flags &= ~TH_FIN; 20559 } 20560 } 20561 recwin = lmin(lmax(sbspace(&so->so_rcv), 0), 20562 (long)TCP_MAXWIN << tp->rcv_scale); 20563 20564 /* 20565 * Sender silly window avoidance. We transmit under the following 20566 * conditions when len is non-zero: 20567 * 20568 * - We have a full segment (or more with TSO) - This is the last 20569 * buffer in a write()/send() and we are either idle or running 20570 * NODELAY - we've timed out (e.g. persist timer) - we have more 20571 * then 1/2 the maximum send window's worth of data (receiver may be 20572 * limited the window size) - we need to retransmit 20573 */ 20574 if (len) { 20575 if (len >= segsiz) { 20576 goto send; 20577 } 20578 /* 20579 * NOTE! on localhost connections an 'ack' from the remote 20580 * end may occur synchronously with the output and cause us 20581 * to flush a buffer queued with moretocome. 
XXX 20582 * 20583 */ 20584 if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ 20585 (idle || (tp->t_flags & TF_NODELAY)) && 20586 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && 20587 (tp->t_flags & TF_NOPUSH) == 0) { 20588 pass = 2; 20589 goto send; 20590 } 20591 if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ 20592 pass = 22; 20593 goto send; 20594 } 20595 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { 20596 pass = 4; 20597 goto send; 20598 } 20599 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */ 20600 pass = 5; 20601 goto send; 20602 } 20603 if (sack_rxmit) { 20604 pass = 6; 20605 goto send; 20606 } 20607 if (((tp->snd_wnd - ctf_outstanding(tp)) < segsiz) && 20608 (ctf_outstanding(tp) < (segsiz * 2))) { 20609 /* 20610 * We have less than two MSS outstanding (delayed ack) 20611 * and our rwnd will not let us send a full sized 20612 * MSS. Lets go ahead and let this small segment 20613 * out because we want to try to have at least two 20614 * packets inflight to not be caught by delayed ack. 20615 */ 20616 pass = 12; 20617 goto send; 20618 } 20619 } 20620 /* 20621 * Sending of standalone window updates. 20622 * 20623 * Window updates are important when we close our window due to a 20624 * full socket buffer and are opening it again after the application 20625 * reads data from it. Once the window has opened again and the 20626 * remote end starts to send again the ACK clock takes over and 20627 * provides the most current window information. 20628 * 20629 * We must avoid the silly window syndrome whereas every read from 20630 * the receive buffer, no matter how small, causes a window update 20631 * to be sent. We also should avoid sending a flurry of window 20632 * updates when the socket buffer had queued a lot of data and the 20633 * application is doing small reads. 20634 * 20635 * Prevent a flurry of pointless window updates by only sending an 20636 * update when we can increase the advertized window by more than 20637 * 1/4th of the socket buffer capacity. When the buffer is getting 20638 * full or is very small be more aggressive and send an update 20639 * whenever we can increase by two mss sized segments. In all other 20640 * situations the ACK's to new incoming data will carry further 20641 * window increases. 20642 * 20643 * Don't send an independent window update if a delayed ACK is 20644 * pending (it will get piggy-backed on it) or the remote side 20645 * already has done a half-close and won't send more data. Skip 20646 * this if the connection is in T/TCP half-open state. 20647 */ 20648 if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && 20649 !(tp->t_flags & TF_DELACK) && 20650 !TCPS_HAVERCVDFIN(tp->t_state)) { 20651 /* 20652 * "adv" is the amount we could increase the window, taking 20653 * into account that we are limited by TCP_MAXWIN << 20654 * tp->rcv_scale. 20655 */ 20656 int32_t adv; 20657 int oldwin; 20658 20659 adv = recwin; 20660 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { 20661 oldwin = (tp->rcv_adv - tp->rcv_nxt); 20662 if (adv > oldwin) 20663 adv -= oldwin; 20664 else { 20665 /* We can't increase the window */ 20666 adv = 0; 20667 } 20668 } else 20669 oldwin = 0; 20670 20671 /* 20672 * If the new window size ends up being the same as or less 20673 * than the old size when it is scaled, then don't force 20674 * a window update. 
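 *
 * Worked example (numbers are illustrative): sb_hiwat = 64k,
 * rcv_scale = 7, rcv_adv - rcv_nxt = 20k, recwin = 40k:
 *
 *	oldwin = 20k, adv = 40k - 20k = 20k
 *	oldwin >> 7 = 160 < (adv + oldwin) >> 7 = 320  -> not suppressed
 *	adv >= 2 * segsiz and adv >= sb_hiwat / 4 (16k) -> send update
 *
 * while an application that only freed a few hundred bytes would
 * fail the 2 MSS (and 1/4-buffer) tests and leave the update to
 * ride along on the next data segment or delayed ACK.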
20675 */ 20676 if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale) 20677 goto dontupdate; 20678 20679 if (adv >= (int32_t)(2 * segsiz) && 20680 (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || 20681 recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) || 20682 so->so_rcv.sb_hiwat <= 8 * segsiz)) { 20683 pass = 7; 20684 goto send; 20685 } 20686 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) { 20687 pass = 23; 20688 goto send; 20689 } 20690 } 20691 dontupdate: 20692 20693 /* 20694 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW 20695 * is also a catch-all for the retransmit timer timeout case. 20696 */ 20697 if (tp->t_flags & TF_ACKNOW) { 20698 pass = 8; 20699 goto send; 20700 } 20701 if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) { 20702 pass = 9; 20703 goto send; 20704 } 20705 /* 20706 * If our state indicates that FIN should be sent and we have not 20707 * yet done so, then we need to send. 20708 */ 20709 if ((flags & TH_FIN) && 20710 (tp->snd_nxt == tp->snd_una)) { 20711 pass = 11; 20712 goto send; 20713 } 20714 /* 20715 * No reason to send a segment, just return. 20716 */ 20717 just_return: 20718 SOCKBUF_UNLOCK(sb); 20719 just_return_nolock: 20720 { 20721 int app_limited = CTF_JR_SENT_DATA; 20722 20723 if (tot_len_this_send > 0) { 20724 /* Make sure snd_nxt is up to max */ 20725 rack->r_ctl.fsb.recwin = recwin; 20726 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz); 20727 if ((error == 0) && 20728 rack_use_rfo && 20729 ((flags & (TH_SYN|TH_FIN)) == 0) && 20730 (ipoptlen == 0) && 20731 (tp->snd_nxt == tp->snd_max) && 20732 (tp->rcv_numsacks == 0) && 20733 rack->r_fsb_inited && 20734 TCPS_HAVEESTABLISHED(tp->t_state) && 20735 ((IN_RECOVERY(tp->t_flags)) == 0) && 20736 (rack->r_must_retran == 0) && 20737 ((tp->t_flags & TF_NEEDFIN) == 0) && 20738 (len > 0) && (orig_len > 0) && 20739 (orig_len > len) && 20740 ((orig_len - len) >= segsiz) && 20741 ((optlen == 0) || 20742 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { 20743 /* We can send at least one more MSS using our fsb */ 20744 rack_setup_fast_output(tp, rack, sb, len, orig_len, 20745 segsiz, pace_max_seg, hw_tls, flags); 20746 } else 20747 rack->r_fast_output = 0; 20748 20749 20750 rack_log_fsb(rack, tp, so, flags, 20751 ipoptlen, orig_len, len, 0, 20752 1, optlen, __LINE__, 1); 20753 if (SEQ_GT(tp->snd_max, tp->snd_nxt)) 20754 tp->snd_nxt = tp->snd_max; 20755 } else { 20756 int end_window = 0; 20757 uint32_t seq = tp->gput_ack; 20758 20759 rsm = tqhash_max(rack->r_ctl.tqh); 20760 if (rsm) { 20761 /* 20762 * Mark the last sent that we just-returned (hinting 20763 * that delayed ack may play a role in any rtt measurement). 
20764 */ 20765 rsm->r_just_ret = 1; 20766 } 20767 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); 20768 rack->r_ctl.rc_agg_delayed = 0; 20769 rack->r_early = 0; 20770 rack->r_late = 0; 20771 rack->r_ctl.rc_agg_early = 0; 20772 if ((ctf_outstanding(tp) + 20773 min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)), 20774 minseg)) >= tp->snd_wnd) { 20775 /* We are limited by the rwnd */ 20776 app_limited = CTF_JR_RWND_LIMITED; 20777 if (IN_FASTRECOVERY(tp->t_flags)) 20778 rack->r_ctl.rc_prr_sndcnt = 0; 20779 } else if (ctf_outstanding(tp) >= sbavail(sb)) { 20780 /* We are limited by what's available -- app limited */ 20781 app_limited = CTF_JR_APP_LIMITED; 20782 if (IN_FASTRECOVERY(tp->t_flags)) 20783 rack->r_ctl.rc_prr_sndcnt = 0; 20784 } else if ((idle == 0) && 20785 ((tp->t_flags & TF_NODELAY) == 0) && 20786 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && 20787 (len < segsiz)) { 20788 /* 20789 * TF_NODELAY is not on and the 20790 * user is sending less than 1MSS. This 20791 * brings out SWS avoidance so we 20792 * don't send. Another app-limited case. 20793 */ 20794 app_limited = CTF_JR_APP_LIMITED; 20795 } else if (tp->t_flags & TF_NOPUSH) { 20796 /* 20797 * The user has requested no push of 20798 * the last segment and we are 20799 * at the last segment. Another app 20800 * limited case. 20801 */ 20802 app_limited = CTF_JR_APP_LIMITED; 20803 } else if ((ctf_outstanding(tp) + minseg) > cwnd_to_use) { 20804 /* It's the cwnd */ 20805 app_limited = CTF_JR_CWND_LIMITED; 20806 } else if (IN_FASTRECOVERY(tp->t_flags) && 20807 (rack->rack_no_prr == 0) && 20808 (rack->r_ctl.rc_prr_sndcnt < segsiz)) { 20809 app_limited = CTF_JR_PRR; 20810 } else { 20811 /* Now why are we not sending here? */ 20812 #ifdef NOW 20813 #ifdef INVARIANTS 20814 panic("rack:%p hit JR_ASSESSING case cwnd_to_use:%u?", rack, cwnd_to_use); 20815 #endif 20816 #endif 20817 app_limited = CTF_JR_ASSESSING; 20818 } 20819 /* 20820 * App limited in some fashion, for our pacing GP 20821 * measurements we don't want any gap (even cwnd). 20822 * Close down the measurement window. 20823 */ 20824 if (rack_cwnd_block_ends_measure && 20825 ((app_limited == CTF_JR_CWND_LIMITED) || 20826 (app_limited == CTF_JR_PRR))) { 20827 /* 20828 * The reason we are not sending is 20829 * the cwnd (or prr). We have been configured 20830 * to end the measurement window in 20831 * this case. 20832 */ 20833 end_window = 1; 20834 } else if (rack_rwnd_block_ends_measure && 20835 (app_limited == CTF_JR_RWND_LIMITED)) { 20836 /* 20837 * We are rwnd limited and have been 20838 * configured to end the measurement 20839 * window in this case. 20840 */ 20841 end_window = 1; 20842 } else if (app_limited == CTF_JR_APP_LIMITED) { 20843 /* 20844 * A true application limited period, we have 20845 * run out of data. 20846 */ 20847 end_window = 1; 20848 } else if (app_limited == CTF_JR_ASSESSING) { 20849 /* 20850 * In the assessing case we hit the end of 20851 * the if/else and had no known reason. 20852 * This will panic us under invariants. 20853 * 20854 * If we get this out in logs we need to 20855 * investigate which reason we missed. 20856 */ 20857 end_window = 1; 20858 } 20859 if (end_window) { 20860 uint8_t log = 0; 20861 20862 /* Adjust the Gput measurement */ 20863 if ((tp->t_flags & TF_GPUTINPROG) && 20864 SEQ_GT(tp->gput_ack, tp->snd_max)) { 20865 tp->gput_ack = tp->snd_max; 20866 if ((tp->gput_ack - tp->gput_seq) < (MIN_GP_WIN * segsiz)) { 20867 /* 20868 * There is not enough to measure.
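 *
 * For example (assuming MIN_GP_WIN is 6; the definition is not
 * shown in this excerpt): with segsiz = 1448 a goodput sample
 * needs at least 6 * 1448 = 8688 bytes between gput_seq and the
 * clipped gput_ack; anything shorter is dropped here rather than
 * polluting the goodput estimate with a tiny sample.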
20869 */ 20870 tp->t_flags &= ~TF_GPUTINPROG; 20871 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 20872 rack->r_ctl.rc_gp_srtt /*flex1*/, 20873 tp->gput_seq, 20874 0, 0, 18, __LINE__, NULL, 0); 20875 } else 20876 log = 1; 20877 } 20878 /* Mark the last packet has app limited */ 20879 rsm = tqhash_max(rack->r_ctl.tqh); 20880 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { 20881 if (rack->r_ctl.rc_app_limited_cnt == 0) 20882 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; 20883 else { 20884 /* 20885 * Go out to the end app limited and mark 20886 * this new one as next and move the end_appl up 20887 * to this guy. 20888 */ 20889 if (rack->r_ctl.rc_end_appl) 20890 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; 20891 rack->r_ctl.rc_end_appl = rsm; 20892 } 20893 rsm->r_flags |= RACK_APP_LIMITED; 20894 rack->r_ctl.rc_app_limited_cnt++; 20895 } 20896 if (log) 20897 rack_log_pacing_delay_calc(rack, 20898 rack->r_ctl.rc_app_limited_cnt, seq, 20899 tp->gput_ack, 0, 0, 4, __LINE__, NULL, 0); 20900 } 20901 } 20902 /* Check if we need to go into persists or not */ 20903 if ((tp->snd_max == tp->snd_una) && 20904 TCPS_HAVEESTABLISHED(tp->t_state) && 20905 sbavail(sb) && 20906 (sbavail(sb) > tp->snd_wnd) && 20907 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg))) { 20908 /* Yes lets make sure to move to persist before timer-start */ 20909 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime, tp->snd_una); 20910 } 20911 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack); 20912 rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use); 20913 } 20914 #ifdef NETFLIX_SHARED_CWND 20915 if ((sbavail(sb) == 0) && 20916 rack->r_ctl.rc_scw) { 20917 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 20918 rack->rack_scwnd_is_idle = 1; 20919 } 20920 #endif 20921 #ifdef TCP_ACCOUNTING 20922 if (tot_len_this_send > 0) { 20923 crtsc = get_cyclecount(); 20924 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 20925 tp->tcp_cnt_counters[SND_OUT_DATA]++; 20926 } 20927 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 20928 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); 20929 } 20930 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 20931 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) / segsiz); 20932 } 20933 } else { 20934 crtsc = get_cyclecount(); 20935 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 20936 tp->tcp_cnt_counters[SND_LIMITED]++; 20937 } 20938 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 20939 tp->tcp_proc_time[SND_LIMITED] += (crtsc - ts_val); 20940 } 20941 } 20942 sched_unpin(); 20943 #endif 20944 return (0); 20945 20946 send: 20947 if ((rack->r_ctl.crte != NULL) && 20948 (rsm == NULL) && 20949 ((rack->rc_hw_nobuf == 1) || 20950 (rack_hw_check_queue && (check_done == 0)))) { 20951 /* 20952 * We only want to do this once with the hw_check_queue, 20953 * for the enobuf case we would only do it once if 20954 * we come around to again, the flag will be clear. 
20955 */ 20956 check_done = 1; 20957 slot = rack_check_queue_level(rack, tp, &tv, cts, len, segsiz); 20958 if (slot) { 20959 rack->r_ctl.rc_agg_delayed = 0; 20960 rack->r_ctl.rc_agg_early = 0; 20961 rack->r_early = 0; 20962 rack->r_late = 0; 20963 SOCKBUF_UNLOCK(&so->so_snd); 20964 goto skip_all_send; 20965 } 20966 } 20967 if (rsm || sack_rxmit) 20968 counter_u64_add(rack_nfto_resend, 1); 20969 else 20970 counter_u64_add(rack_non_fto_send, 1); 20971 if ((flags & TH_FIN) && 20972 sbavail(sb)) { 20973 /* 20974 * We do not transmit a FIN 20975 * with data outstanding. We 20976 * need to make it so all data 20977 * is acked first. 20978 */ 20979 flags &= ~TH_FIN; 20980 } 20981 /* Enforce stack imposed max seg size if we have one */ 20982 if (rack->r_ctl.rc_pace_max_segs && 20983 (len > rack->r_ctl.rc_pace_max_segs)) { 20984 mark = 1; 20985 len = rack->r_ctl.rc_pace_max_segs; 20986 } 20987 SOCKBUF_LOCK_ASSERT(sb); 20988 if (len > 0) { 20989 if (len >= segsiz) 20990 tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; 20991 else 20992 tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; 20993 } 20994 /* 20995 * Before ESTABLISHED, force sending of initial options unless TCP 20996 * set not to do any options. NOTE: we assume that the IP/TCP header 20997 * plus TCP options always fit in a single mbuf, leaving room for a 20998 * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr) 20999 * + optlen <= MCLBYTES 21000 */ 21001 optlen = 0; 21002 #ifdef INET6 21003 if (isipv6) 21004 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 21005 else 21006 #endif 21007 hdrlen = sizeof(struct tcpiphdr); 21008 21009 /* 21010 * Compute options for segment. We only have to care about SYN and 21011 * established connection segments. Options for SYN-ACK segments 21012 * are handled in TCP syncache. 21013 */ 21014 to.to_flags = 0; 21015 if ((tp->t_flags & TF_NOOPT) == 0) { 21016 /* Maximum segment size. */ 21017 if (flags & TH_SYN) { 21018 tp->snd_nxt = tp->iss; 21019 to.to_mss = tcp_mssopt(&inp->inp_inc); 21020 if (tp->t_port) 21021 to.to_mss -= V_tcp_udp_tunneling_overhead; 21022 to.to_flags |= TOF_MSS; 21023 21024 /* 21025 * On SYN or SYN|ACK transmits on TFO connections, 21026 * only include the TFO option if it is not a 21027 * retransmit, as the presence of the TFO option may 21028 * have caused the original SYN or SYN|ACK to have 21029 * been dropped by a middlebox. 21030 */ 21031 if (IS_FASTOPEN(tp->t_flags) && 21032 (tp->t_rxtshift == 0)) { 21033 if (tp->t_state == TCPS_SYN_RECEIVED) { 21034 to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; 21035 to.to_tfo_cookie = 21036 (u_int8_t *)&tp->t_tfo_cookie.server; 21037 to.to_flags |= TOF_FASTOPEN; 21038 wanted_cookie = 1; 21039 } else if (tp->t_state == TCPS_SYN_SENT) { 21040 to.to_tfo_len = 21041 tp->t_tfo_client_cookie_len; 21042 to.to_tfo_cookie = 21043 tp->t_tfo_cookie.client; 21044 to.to_flags |= TOF_FASTOPEN; 21045 wanted_cookie = 1; 21046 /* 21047 * If we wind up having more data to 21048 * send with the SYN than can fit in 21049 * one segment, don't send any more 21050 * until the SYN|ACK comes back from 21051 * the other end. 21052 */ 21053 sendalot = 0; 21054 } 21055 } 21056 } 21057 /* Window scaling. */ 21058 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { 21059 to.to_wscale = tp->request_r_scale; 21060 to.to_flags |= TOF_SCALE; 21061 } 21062 /* Timestamps. 
*/ 21063 if ((tp->t_flags & TF_RCVD_TSTMP) || 21064 ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { 21065 to.to_tsval = ms_cts + tp->ts_offset; 21066 to.to_tsecr = tp->ts_recent; 21067 to.to_flags |= TOF_TS; 21068 } 21069 /* Set receive buffer autosizing timestamp. */ 21070 if (tp->rfbuf_ts == 0 && 21071 (so->so_rcv.sb_flags & SB_AUTOSIZE)) 21072 tp->rfbuf_ts = tcp_ts_getticks(); 21073 /* Selective ACK's. */ 21074 if (tp->t_flags & TF_SACK_PERMIT) { 21075 if (flags & TH_SYN) 21076 to.to_flags |= TOF_SACKPERM; 21077 else if (TCPS_HAVEESTABLISHED(tp->t_state) && 21078 tp->rcv_numsacks > 0) { 21079 to.to_flags |= TOF_SACK; 21080 to.to_nsacks = tp->rcv_numsacks; 21081 to.to_sacks = (u_char *)tp->sackblks; 21082 } 21083 } 21084 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 21085 /* TCP-MD5 (RFC2385). */ 21086 if (tp->t_flags & TF_SIGNATURE) 21087 to.to_flags |= TOF_SIGNATURE; 21088 #endif /* TCP_SIGNATURE */ 21089 21090 /* Processing the options. */ 21091 hdrlen += optlen = tcp_addoptions(&to, opt); 21092 /* 21093 * If we wanted a TFO option to be added, but it was unable 21094 * to fit, ensure no data is sent. 21095 */ 21096 if (IS_FASTOPEN(tp->t_flags) && wanted_cookie && 21097 !(to.to_flags & TOF_FASTOPEN)) 21098 len = 0; 21099 } 21100 if (tp->t_port) { 21101 if (V_tcp_udp_tunneling_port == 0) { 21102 /* The port was removed?? */ 21103 SOCKBUF_UNLOCK(&so->so_snd); 21104 #ifdef TCP_ACCOUNTING 21105 crtsc = get_cyclecount(); 21106 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 21107 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 21108 } 21109 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 21110 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 21111 } 21112 sched_unpin(); 21113 #endif 21114 return (EHOSTUNREACH); 21115 } 21116 hdrlen += sizeof(struct udphdr); 21117 } 21118 #ifdef INET6 21119 if (isipv6) 21120 ipoptlen = ip6_optlen(inp); 21121 else 21122 #endif 21123 if (inp->inp_options) 21124 ipoptlen = inp->inp_options->m_len - 21125 offsetof(struct ipoption, ipopt_list); 21126 else 21127 ipoptlen = 0; 21128 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 21129 ipoptlen += ipsec_optlen; 21130 #endif 21131 21132 /* 21133 * Adjust data length if insertion of options will bump the packet 21134 * length beyond the t_maxseg length. Clear the FIN bit because we 21135 * cut off the tail of the segment. 
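 *
 * Rough illustration of the trimming below (invented numbers):
 * t_maxseg = 1460, optlen = 12 (timestamps), hdrlen = 52,
 * max_linkhdr = 16, if_hw_tsomax = 65535 and len = 100000 queued:
 *
 *	max_len = 65535 - 52 - 16 = 65467  -> len clipped to 65467
 *	per-segment payload = 1460 - 12 = 1448
 *	65467 % 1448 = 307                 -> trimmed so the last
 *	                                      segment is full sized
 *
 * the fractional tail is only allowed through when this send
 * empties the socket buffer.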
21136 */ 21137 if (len + optlen + ipoptlen > tp->t_maxseg) { 21138 if (tso) { 21139 uint32_t if_hw_tsomax; 21140 uint32_t moff; 21141 int32_t max_len; 21142 21143 /* extract TSO information */ 21144 if_hw_tsomax = tp->t_tsomax; 21145 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 21146 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 21147 KASSERT(ipoptlen == 0, 21148 ("%s: TSO can't do IP options", __func__)); 21149 21150 /* 21151 * Check if we should limit by maximum payload 21152 * length: 21153 */ 21154 if (if_hw_tsomax != 0) { 21155 /* compute maximum TSO length */ 21156 max_len = (if_hw_tsomax - hdrlen - 21157 max_linkhdr); 21158 if (max_len <= 0) { 21159 len = 0; 21160 } else if (len > max_len) { 21161 sendalot = 1; 21162 len = max_len; 21163 mark = 2; 21164 } 21165 } 21166 /* 21167 * Prevent the last segment from being fractional 21168 * unless the send sockbuf can be emptied: 21169 */ 21170 max_len = (tp->t_maxseg - optlen); 21171 if ((sb_offset + len) < sbavail(sb)) { 21172 moff = len % (u_int)max_len; 21173 if (moff != 0) { 21174 mark = 3; 21175 len -= moff; 21176 } 21177 } 21178 /* 21179 * In case there are too many small fragments don't 21180 * use TSO: 21181 */ 21182 if (len <= max_len) { 21183 mark = 4; 21184 tso = 0; 21185 } 21186 /* 21187 * Send the FIN in a separate segment after the bulk 21188 * sending is done. We don't trust the TSO 21189 * implementations to clear the FIN flag on all but 21190 * the last segment. 21191 */ 21192 if (tp->t_flags & TF_NEEDFIN) { 21193 sendalot = 4; 21194 } 21195 } else { 21196 mark = 5; 21197 if (optlen + ipoptlen >= tp->t_maxseg) { 21198 /* 21199 * Since we don't have enough space to put 21200 * the IP header chain and the TCP header in 21201 * one packet as required by RFC 7112, don't 21202 * send it. Also ensure that at least one 21203 * byte of the payload can be put into the 21204 * TCP segment. 21205 */ 21206 SOCKBUF_UNLOCK(&so->so_snd); 21207 error = EMSGSIZE; 21208 sack_rxmit = 0; 21209 goto out; 21210 } 21211 len = tp->t_maxseg - optlen - ipoptlen; 21212 sendalot = 5; 21213 } 21214 } else { 21215 tso = 0; 21216 mark = 6; 21217 } 21218 KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, 21219 ("%s: len > IP_MAXPACKET", __func__)); 21220 #ifdef DIAGNOSTIC 21221 #ifdef INET6 21222 if (max_linkhdr + hdrlen > MCLBYTES) 21223 #else 21224 if (max_linkhdr + hdrlen > MHLEN) 21225 #endif 21226 panic("tcphdr too big"); 21227 #endif 21228 21229 /* 21230 * This KASSERT is here to catch edge cases at a well defined place. 21231 * Before, those had triggered (random) panic conditions further 21232 * down. 21233 */ 21234 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 21235 if ((len == 0) && 21236 (flags & TH_FIN) && 21237 (sbused(sb))) { 21238 /* 21239 * We have outstanding data, don't send a fin by itself!. 21240 */ 21241 goto just_return; 21242 } 21243 /* 21244 * Grab a header mbuf, attaching a copy of data to be transmitted, 21245 * and initialize the header from the template for sends on this 21246 * connection. 21247 */ 21248 hw_tls = tp->t_nic_ktls_xmit != 0; 21249 if (len) { 21250 uint32_t max_val; 21251 uint32_t moff; 21252 21253 if (rack->r_ctl.rc_pace_max_segs) 21254 max_val = rack->r_ctl.rc_pace_max_segs; 21255 else if (rack->rc_user_set_max_segs) 21256 max_val = rack->rc_user_set_max_segs * segsiz; 21257 else 21258 max_val = len; 21259 /* 21260 * We allow a limit on sending with hptsi. 
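 *
 * Sketch of the cap chosen just below (same logic, condensed):
 *
 *	if (rc_pace_max_segs)          max_val = rc_pace_max_segs;
 *	else if (rc_user_set_max_segs) max_val = rc_user_set_max_segs * segsiz;
 *	else                           max_val = len;    no cap
 *
 * so one call to this function never hands more than a single
 * pacing burst to the lower layers; anything past max_val waits
 * for the next hpts wakeup.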
21261 */ 21262 if (len > max_val) { 21263 mark = 7; 21264 len = max_val; 21265 } 21266 #ifdef INET6 21267 if (MHLEN < hdrlen + max_linkhdr) 21268 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 21269 else 21270 #endif 21271 m = m_gethdr(M_NOWAIT, MT_DATA); 21272 21273 if (m == NULL) { 21274 SOCKBUF_UNLOCK(sb); 21275 error = ENOBUFS; 21276 sack_rxmit = 0; 21277 goto out; 21278 } 21279 m->m_data += max_linkhdr; 21280 m->m_len = hdrlen; 21281 21282 /* 21283 * Start the m_copy functions from the closest mbuf to the 21284 * sb_offset in the socket buffer chain. 21285 */ 21286 mb = sbsndptr_noadv(sb, sb_offset, &moff); 21287 s_mb = mb; 21288 s_moff = moff; 21289 if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) { 21290 m_copydata(mb, moff, (int)len, 21291 mtod(m, caddr_t)+hdrlen); 21292 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 21293 sbsndptr_adv(sb, mb, len); 21294 m->m_len += len; 21295 } else { 21296 struct sockbuf *msb; 21297 21298 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 21299 msb = NULL; 21300 else 21301 msb = sb; 21302 m->m_next = tcp_m_copym( 21303 mb, moff, &len, 21304 if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, 21305 ((rsm == NULL) ? hw_tls : 0) 21306 #ifdef NETFLIX_COPY_ARGS 21307 , &s_mb, &s_moff 21308 #endif 21309 ); 21310 if (len <= (tp->t_maxseg - optlen)) { 21311 /* 21312 * Must have ran out of mbufs for the copy 21313 * shorten it to no longer need tso. Lets 21314 * not put on sendalot since we are low on 21315 * mbufs. 21316 */ 21317 tso = 0; 21318 } 21319 if (m->m_next == NULL) { 21320 SOCKBUF_UNLOCK(sb); 21321 (void)m_free(m); 21322 error = ENOBUFS; 21323 sack_rxmit = 0; 21324 goto out; 21325 } 21326 } 21327 if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { 21328 if (rsm && (rsm->r_flags & RACK_TLP)) { 21329 /* 21330 * TLP should not count in retran count, but 21331 * in its own bin 21332 */ 21333 counter_u64_add(rack_tlp_retran, 1); 21334 counter_u64_add(rack_tlp_retran_bytes, len); 21335 } else { 21336 tp->t_sndrexmitpack++; 21337 KMOD_TCPSTAT_INC(tcps_sndrexmitpack); 21338 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len); 21339 } 21340 #ifdef STATS 21341 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 21342 len); 21343 #endif 21344 } else { 21345 KMOD_TCPSTAT_INC(tcps_sndpack); 21346 KMOD_TCPSTAT_ADD(tcps_sndbyte, len); 21347 #ifdef STATS 21348 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 21349 len); 21350 #endif 21351 } 21352 /* 21353 * If we're sending everything we've got, set PUSH. (This 21354 * will keep happy those implementations which only give 21355 * data to the user when a buffer fills or a PUSH comes in.) 
21356 */ 21357 if (sb_offset + len == sbused(sb) && 21358 sbused(sb) && 21359 !(flags & TH_SYN)) { 21360 flags |= TH_PUSH; 21361 add_flag |= RACK_HAD_PUSH; 21362 } 21363 21364 SOCKBUF_UNLOCK(sb); 21365 } else { 21366 SOCKBUF_UNLOCK(sb); 21367 if (tp->t_flags & TF_ACKNOW) 21368 KMOD_TCPSTAT_INC(tcps_sndacks); 21369 else if (flags & (TH_SYN | TH_FIN | TH_RST)) 21370 KMOD_TCPSTAT_INC(tcps_sndctrl); 21371 else 21372 KMOD_TCPSTAT_INC(tcps_sndwinup); 21373 21374 m = m_gethdr(M_NOWAIT, MT_DATA); 21375 if (m == NULL) { 21376 error = ENOBUFS; 21377 sack_rxmit = 0; 21378 goto out; 21379 } 21380 #ifdef INET6 21381 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && 21382 MHLEN >= hdrlen) { 21383 M_ALIGN(m, hdrlen); 21384 } else 21385 #endif 21386 m->m_data += max_linkhdr; 21387 m->m_len = hdrlen; 21388 } 21389 SOCKBUF_UNLOCK_ASSERT(sb); 21390 m->m_pkthdr.rcvif = (struct ifnet *)0; 21391 #ifdef MAC 21392 mac_inpcb_create_mbuf(inp, m); 21393 #endif 21394 if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) { 21395 #ifdef INET6 21396 if (isipv6) 21397 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 21398 else 21399 #endif /* INET6 */ 21400 #ifdef INET 21401 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 21402 #endif 21403 th = rack->r_ctl.fsb.th; 21404 udp = rack->r_ctl.fsb.udp; 21405 if (udp) { 21406 #ifdef INET6 21407 if (isipv6) 21408 ulen = hdrlen + len - sizeof(struct ip6_hdr); 21409 else 21410 #endif /* INET6 */ 21411 ulen = hdrlen + len - sizeof(struct ip); 21412 udp->uh_ulen = htons(ulen); 21413 } 21414 } else { 21415 #ifdef INET6 21416 if (isipv6) { 21417 ip6 = mtod(m, struct ip6_hdr *); 21418 if (tp->t_port) { 21419 udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr)); 21420 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 21421 udp->uh_dport = tp->t_port; 21422 ulen = hdrlen + len - sizeof(struct ip6_hdr); 21423 udp->uh_ulen = htons(ulen); 21424 th = (struct tcphdr *)(udp + 1); 21425 } else 21426 th = (struct tcphdr *)(ip6 + 1); 21427 tcpip_fillheaders(inp, tp->t_port, ip6, th); 21428 } else 21429 #endif /* INET6 */ 21430 { 21431 #ifdef INET 21432 ip = mtod(m, struct ip *); 21433 if (tp->t_port) { 21434 udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip)); 21435 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 21436 udp->uh_dport = tp->t_port; 21437 ulen = hdrlen + len - sizeof(struct ip); 21438 udp->uh_ulen = htons(ulen); 21439 th = (struct tcphdr *)(udp + 1); 21440 } else 21441 th = (struct tcphdr *)(ip + 1); 21442 tcpip_fillheaders(inp, tp->t_port, ip, th); 21443 #endif 21444 } 21445 } 21446 /* 21447 * Fill in fields, remembering maximum advertised window for use in 21448 * delaying messages about window sizes. If resending a FIN, be sure 21449 * not to use a new sequence number. 21450 */ 21451 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 21452 tp->snd_nxt == tp->snd_max) 21453 tp->snd_nxt--; 21454 /* 21455 * If we are starting a connection, send ECN setup SYN packet. If we 21456 * are on a retransmit, we may resend those bits a number of times 21457 * as per RFC 3168. 
21458 */ 21459 if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) { 21460 flags |= tcp_ecn_output_syn_sent(tp); 21461 } 21462 /* Also handle parallel SYN for ECN */ 21463 if (TCPS_HAVERCVDSYN(tp->t_state) && 21464 (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { 21465 int ect = tcp_ecn_output_established(tp, &flags, len, sack_rxmit); 21466 if ((tp->t_state == TCPS_SYN_RECEIVED) && 21467 (tp->t_flags2 & TF2_ECN_SND_ECE)) 21468 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 21469 #ifdef INET6 21470 if (isipv6) { 21471 ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20); 21472 ip6->ip6_flow |= htonl(ect << 20); 21473 } 21474 else 21475 #endif 21476 { 21477 #ifdef INET 21478 ip->ip_tos &= ~IPTOS_ECN_MASK; 21479 ip->ip_tos |= ect; 21480 #endif 21481 } 21482 } 21483 /* 21484 * If we are doing retransmissions, then snd_nxt will not reflect 21485 * the first unsent octet. For ACK only packets, we do not want the 21486 * sequence number of the retransmitted packet, we want the sequence 21487 * number of the next unsent octet. So, if there is no data (and no 21488 * SYN or FIN), use snd_max instead of snd_nxt when filling in 21489 * ti_seq. But if we are in persist state, snd_max might reflect 21490 * one byte beyond the right edge of the window, so use snd_nxt in 21491 * that case, since we know we aren't doing a retransmission. 21492 * (retransmit and persist are mutually exclusive...) 21493 */ 21494 if (sack_rxmit == 0) { 21495 if (len || (flags & (TH_SYN | TH_FIN))) { 21496 th->th_seq = htonl(tp->snd_nxt); 21497 rack_seq = tp->snd_nxt; 21498 } else { 21499 th->th_seq = htonl(tp->snd_max); 21500 rack_seq = tp->snd_max; 21501 } 21502 } else { 21503 th->th_seq = htonl(rsm->r_start); 21504 rack_seq = rsm->r_start; 21505 } 21506 th->th_ack = htonl(tp->rcv_nxt); 21507 tcp_set_flags(th, flags); 21508 /* 21509 * Calculate receive window. Don't shrink window, but avoid silly 21510 * window syndrome. 21511 * If a RST segment is sent, advertise a window of zero. 21512 */ 21513 if (flags & TH_RST) { 21514 recwin = 0; 21515 } else { 21516 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && 21517 recwin < (long)segsiz) { 21518 recwin = 0; 21519 } 21520 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && 21521 recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) 21522 recwin = (long)(tp->rcv_adv - tp->rcv_nxt); 21523 } 21524 21525 /* 21526 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or 21527 * <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> case is 21528 * handled in syncache. 21529 */ 21530 if (flags & TH_SYN) 21531 th->th_win = htons((u_short) 21532 (min(sbspace(&so->so_rcv), TCP_MAXWIN))); 21533 else { 21534 /* Avoid shrinking window with window scaling. */ 21535 recwin = roundup2(recwin, 1 << tp->rcv_scale); 21536 th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); 21537 } 21538 /* 21539 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 21540 * window. This may cause the remote transmitter to stall. This 21541 * flag tells soreceive() to disable delayed acknowledgements when 21542 * draining the buffer. This can occur if the receiver is 21543 * attempting to read more data than can be buffered prior to 21544 * transmitting on the connection. 
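 *
 * Illustrative case (assuming nothing beyond rcv_nxt is still
 * advertised): with sb_hiwat = 64k and segsiz = 1448, a computed
 * recwin of 1000 bytes is below both sb_hiwat / 4 and one segment,
 * so it was zeroed above; the peer then sees a zero window,
 * t_sndzerowin is bumped and TF_RXWIN0SENT is set so delayed ACKs
 * are disabled while the application drains the buffer.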
21545 */ 21546 if (th->th_win == 0) { 21547 tp->t_sndzerowin++; 21548 tp->t_flags |= TF_RXWIN0SENT; 21549 } else 21550 tp->t_flags &= ~TF_RXWIN0SENT; 21551 tp->snd_up = tp->snd_una; /* drag it along, its deprecated */ 21552 /* Now are we using fsb?, if so copy the template data to the mbuf */ 21553 if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) { 21554 uint8_t *cpto; 21555 21556 cpto = mtod(m, uint8_t *); 21557 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); 21558 /* 21559 * We have just copied in: 21560 * IP/IP6 21561 * <optional udphdr> 21562 * tcphdr (no options) 21563 * 21564 * We need to grab the correct pointers into the mbuf 21565 * for both the tcp header, and possibly the udp header (if tunneling). 21566 * We do this by using the offset in the copy buffer and adding it 21567 * to the mbuf base pointer (cpto). 21568 */ 21569 #ifdef INET6 21570 if (isipv6) 21571 ip6 = mtod(m, struct ip6_hdr *); 21572 else 21573 #endif /* INET6 */ 21574 #ifdef INET 21575 ip = mtod(m, struct ip *); 21576 #endif 21577 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr)); 21578 /* If we have a udp header lets set it into the mbuf as well */ 21579 if (udp) 21580 udp = (struct udphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.udp - rack->r_ctl.fsb.tcp_ip_hdr)); 21581 } 21582 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 21583 if (to.to_flags & TOF_SIGNATURE) { 21584 /* 21585 * Calculate MD5 signature and put it into the place 21586 * determined before. 21587 * NOTE: since TCP options buffer doesn't point into 21588 * mbuf's data, calculate offset and use it. 21589 */ 21590 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, 21591 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { 21592 /* 21593 * Do not send segment if the calculation of MD5 21594 * digest has failed. 21595 */ 21596 goto out; 21597 } 21598 } 21599 #endif 21600 if (optlen) { 21601 bcopy(opt, th + 1, optlen); 21602 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 21603 } 21604 /* 21605 * Put TCP length in extended header, and then checksum extended 21606 * header and data. 21607 */ 21608 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 21609 #ifdef INET6 21610 if (isipv6) { 21611 /* 21612 * ip6_plen is not need to be filled now, and will be filled 21613 * in ip6_output. 
21614 */ 21615 if (tp->t_port) { 21616 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 21617 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 21618 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 21619 th->th_sum = htons(0); 21620 UDPSTAT_INC(udps_opackets); 21621 } else { 21622 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 21623 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 21624 th->th_sum = in6_cksum_pseudo(ip6, 21625 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 21626 0); 21627 } 21628 } 21629 #endif 21630 #if defined(INET6) && defined(INET) 21631 else 21632 #endif 21633 #ifdef INET 21634 { 21635 if (tp->t_port) { 21636 m->m_pkthdr.csum_flags = CSUM_UDP; 21637 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 21638 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 21639 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 21640 th->th_sum = htons(0); 21641 UDPSTAT_INC(udps_opackets); 21642 } else { 21643 m->m_pkthdr.csum_flags = CSUM_TCP; 21644 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 21645 th->th_sum = in_pseudo(ip->ip_src.s_addr, 21646 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 21647 IPPROTO_TCP + len + optlen)); 21648 } 21649 /* IP version must be set here for ipv4/ipv6 checking later */ 21650 KASSERT(ip->ip_v == IPVERSION, 21651 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 21652 } 21653 #endif 21654 /* 21655 * Enable TSO and specify the size of the segments. The TCP pseudo 21656 * header checksum is always provided. XXX: Fixme: This is currently 21657 * not the case for IPv6. 21658 */ 21659 if (tso) { 21660 /* 21661 * Here we must use t_maxseg and the optlen since 21662 * the optlen may include SACK's (or DSACK). 21663 */ 21664 KASSERT(len > tp->t_maxseg - optlen, 21665 ("%s: len <= tso_segsz", __func__)); 21666 m->m_pkthdr.csum_flags |= CSUM_TSO; 21667 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; 21668 } 21669 KASSERT(len + hdrlen == m_length(m, NULL), 21670 ("%s: mbuf chain different than expected: %d + %u != %u", 21671 __func__, len, hdrlen, m_length(m, NULL))); 21672 21673 #ifdef TCP_HHOOK 21674 /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ 21675 hhook_run_tcp_est_out(tp, th, &to, len, tso); 21676 #endif 21677 if ((rack->r_ctl.crte != NULL) && 21678 (rack->rc_hw_nobuf == 0) && 21679 tcp_bblogging_on(tp)) { 21680 rack_log_queue_level(tp, rack, len, &tv, cts); 21681 } 21682 /* We're getting ready to send; log now. 
*/ 21683 if (tcp_bblogging_on(rack->rc_tp)) { 21684 union tcp_log_stackspecific log; 21685 21686 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 21687 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_tp); 21688 if (rack->rack_no_prr) 21689 log.u_bbr.flex1 = 0; 21690 else 21691 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 21692 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 21693 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 21694 log.u_bbr.flex4 = orig_len; 21695 /* Save off the early/late values */ 21696 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 21697 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 21698 log.u_bbr.bw_inuse = rack_get_bw(rack); 21699 log.u_bbr.cur_del_rate = rack->r_ctl.gp_bw; 21700 log.u_bbr.flex8 = 0; 21701 if (rsm) { 21702 if (rsm->r_flags & RACK_RWND_COLLAPSED) { 21703 rack_log_collapse(rack, rsm->r_start, rsm->r_end, 0, __LINE__, 5, rsm->r_flags, rsm); 21704 counter_u64_add(rack_collapsed_win_rxt, 1); 21705 counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start)); 21706 } 21707 if (doing_tlp) 21708 log.u_bbr.flex8 = 2; 21709 else 21710 log.u_bbr.flex8 = 1; 21711 } else { 21712 if (doing_tlp) 21713 log.u_bbr.flex8 = 3; 21714 } 21715 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); 21716 log.u_bbr.flex7 = mark; 21717 log.u_bbr.flex7 <<= 8; 21718 log.u_bbr.flex7 |= pass; 21719 log.u_bbr.pkts_out = tp->t_maxseg; 21720 log.u_bbr.timeStamp = cts; 21721 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 21722 if (rsm && (rsm->r_rtr_cnt > 0)) { 21723 /* 21724 * When we have a retransmit we want to log the 21725 * burst at send and flight at send from before. 21726 */ 21727 log.u_bbr.flex5 = rsm->r_fas; 21728 log.u_bbr.bbr_substate = rsm->r_bas; 21729 } else { 21730 /* 21731 * New transmits we log in flex5 the inflight again as 21732 * well as the number of segments in our send in the 21733 * substate field. 21734 */ 21735 log.u_bbr.flex5 = log.u_bbr.inflight; 21736 log.u_bbr.bbr_substate = (uint8_t)((len + segsiz - 1)/segsiz); 21737 } 21738 log.u_bbr.lt_epoch = cwnd_to_use; 21739 log.u_bbr.delivered = sendalot; 21740 log.u_bbr.rttProp = (uint64_t)rsm; 21741 log.u_bbr.pkt_epoch = __LINE__; 21742 if (rsm) { 21743 log.u_bbr.delRate = rsm->r_flags; 21744 log.u_bbr.delRate <<= 31; 21745 log.u_bbr.delRate |= rack->r_must_retran; 21746 log.u_bbr.delRate <<= 1; 21747 log.u_bbr.delRate |= (sack_rxmit & 0x00000001); 21748 } else { 21749 log.u_bbr.delRate = rack->r_must_retran; 21750 log.u_bbr.delRate <<= 1; 21751 log.u_bbr.delRate |= (sack_rxmit & 0x00000001); 21752 } 21753 lgb = tcp_log_event(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, 21754 len, &log, false, NULL, __func__, __LINE__, &tv); 21755 } else 21756 lgb = NULL; 21757 21758 /* 21759 * Fill in IP length and desired time to live and send to IP level. 21760 * There should be a better way to handle ttl and tos; we could keep 21761 * them in the template, but need a way to checksum without them. 21762 */ 21763 /* 21764 * m->m_pkthdr.len should have been set before cksum calcuration, 21765 * because in6_cksum() need it. 21766 */ 21767 #ifdef INET6 21768 if (isipv6) { 21769 /* 21770 * we separately set hoplimit for every segment, since the 21771 * user might want to change the value via setsockopt. Also, 21772 * desired default hop limit might be changed via Neighbor 21773 * Discovery. 21774 */ 21775 rack->r_ctl.fsb.hoplimit = ip6->ip6_hlim = in6_selecthlim(inp, NULL); 21776 21777 /* 21778 * Set the packet size here for the benefit of DTrace 21779 * probes. 
ip6_output() will set it properly; it's supposed 21780 * to include the option header lengths as well. 21781 */ 21782 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 21783 21784 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 21785 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 21786 else 21787 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 21788 21789 if (tp->t_state == TCPS_SYN_SENT) 21790 TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); 21791 21792 TCP_PROBE5(send, NULL, tp, ip6, tp, th); 21793 /* TODO: IPv6 IP6TOS_ECT bit on */ 21794 error = ip6_output(m, 21795 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 21796 inp->in6p_outputopts, 21797 #else 21798 NULL, 21799 #endif 21800 &inp->inp_route6, 21801 ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 21802 NULL, NULL, inp); 21803 21804 if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL) 21805 mtu = inp->inp_route6.ro_nh->nh_mtu; 21806 } 21807 #endif /* INET6 */ 21808 #if defined(INET) && defined(INET6) 21809 else 21810 #endif 21811 #ifdef INET 21812 { 21813 ip->ip_len = htons(m->m_pkthdr.len); 21814 #ifdef INET6 21815 if (inp->inp_vflag & INP_IPV6PROTO) 21816 ip->ip_ttl = in6_selecthlim(inp, NULL); 21817 #endif /* INET6 */ 21818 rack->r_ctl.fsb.hoplimit = ip->ip_ttl; 21819 /* 21820 * If we do path MTU discovery, then we set DF on every 21821 * packet. This might not be the best thing to do according 21822 * to RFC3390 Section 2. However the tcp hostcache migitates 21823 * the problem so it affects only the first tcp connection 21824 * with a host. 21825 * 21826 * NB: Don't set DF on small MTU/MSS to have a safe 21827 * fallback. 21828 */ 21829 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 21830 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 21831 if (tp->t_port == 0 || len < V_tcp_minmss) { 21832 ip->ip_off |= htons(IP_DF); 21833 } 21834 } else { 21835 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 21836 } 21837 21838 if (tp->t_state == TCPS_SYN_SENT) 21839 TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); 21840 21841 TCP_PROBE5(send, NULL, tp, ip, tp, th); 21842 21843 error = ip_output(m, 21844 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 21845 inp->inp_options, 21846 #else 21847 NULL, 21848 #endif 21849 &inp->inp_route, 21850 ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0, 21851 inp); 21852 if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL) 21853 mtu = inp->inp_route.ro_nh->nh_mtu; 21854 } 21855 #endif /* INET */ 21856 21857 out: 21858 if (lgb) { 21859 lgb->tlb_errno = error; 21860 lgb = NULL; 21861 } 21862 /* 21863 * In transmit state, time the transmission and arrange for the 21864 * retransmit. In persist state, just set snd_max. 21865 */ 21866 rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, 21867 rack_to_usec_ts(&tv), 21868 rsm, add_flag, s_mb, s_moff, hw_tls, segsiz); 21869 if (error == 0) { 21870 if (rsm == NULL) { 21871 if (rack->lt_bw_up == 0) { 21872 rack->r_ctl.lt_timemark = tcp_tv_to_lusectick(&tv); 21873 rack->r_ctl.lt_seq = tp->snd_una; 21874 rack->lt_bw_up = 1; 21875 } else if (((rack_seq + len) - rack->r_ctl.lt_seq) > 0x7fffffff) { 21876 /* 21877 * Need to record what we have since we are 21878 * approaching seq wrap. 
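				 * The long-term b/w estimate is derived from
				 * lt_bw_bytes over lt_bw_time, so folding in
				 * the bytes and microseconds accumulated so
				 * far keeps that ratio intact across the wrap.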
21879 */ 21880 uint64_t tmark; 21881 21882 rack->r_ctl.lt_bw_bytes += (tp->snd_una - rack->r_ctl.lt_seq); 21883 rack->r_ctl.lt_seq = tp->snd_una; 21884 tmark = tcp_tv_to_lusectick(&tv); 21885 rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark); 21886 rack->r_ctl.lt_timemark = tmark; 21887 } 21888 } 21889 rack->forced_ack = 0; /* If we send something zap the FA flag */ 21890 counter_u64_add(rack_total_bytes, len); 21891 tcp_account_for_send(tp, len, (rsm != NULL), doing_tlp, hw_tls); 21892 if (rsm && doing_tlp) { 21893 rack->rc_last_sent_tlp_past_cumack = 0; 21894 rack->rc_last_sent_tlp_seq_valid = 1; 21895 rack->r_ctl.last_sent_tlp_seq = rsm->r_start; 21896 rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start; 21897 } 21898 if (rack->rc_hw_nobuf) { 21899 rack->rc_hw_nobuf = 0; 21900 rack->r_ctl.rc_agg_delayed = 0; 21901 rack->r_early = 0; 21902 rack->r_late = 0; 21903 rack->r_ctl.rc_agg_early = 0; 21904 } 21905 if (rsm && (doing_tlp == 0)) { 21906 /* Set we retransmitted */ 21907 rack->rc_gp_saw_rec = 1; 21908 } else { 21909 if (cwnd_to_use > tp->snd_ssthresh) { 21910 /* Set we sent in CA */ 21911 rack->rc_gp_saw_ca = 1; 21912 } else { 21913 /* Set we sent in SS */ 21914 rack->rc_gp_saw_ss = 1; 21915 } 21916 } 21917 if (TCPS_HAVEESTABLISHED(tp->t_state) && 21918 (tp->t_flags & TF_SACK_PERMIT) && 21919 tp->rcv_numsacks > 0) 21920 tcp_clean_dsack_blocks(tp); 21921 tot_len_this_send += len; 21922 if (len == 0) { 21923 counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); 21924 } else { 21925 int idx; 21926 21927 idx = (len / segsiz) + 3; 21928 if (idx >= TCP_MSS_ACCT_ATIMER) 21929 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 21930 else 21931 counter_u64_add(rack_out_size[idx], 1); 21932 } 21933 } 21934 if ((rack->rack_no_prr == 0) && 21935 sub_from_prr && 21936 (error == 0)) { 21937 if (rack->r_ctl.rc_prr_sndcnt >= len) 21938 rack->r_ctl.rc_prr_sndcnt -= len; 21939 else 21940 rack->r_ctl.rc_prr_sndcnt = 0; 21941 } 21942 sub_from_prr = 0; 21943 if (doing_tlp) { 21944 /* Make sure the TLP is added */ 21945 add_flag |= RACK_TLP; 21946 } else if (rsm) { 21947 /* If its a resend without TLP then it must not have the flag */ 21948 rsm->r_flags &= ~RACK_TLP; 21949 } 21950 21951 21952 if ((error == 0) && 21953 (len > 0) && 21954 (tp->snd_una == tp->snd_max)) 21955 rack->r_ctl.rc_tlp_rxt_last_time = cts; 21956 { 21957 tcp_seq startseq = tp->snd_nxt; 21958 21959 /* Track our lost count */ 21960 if (rsm && (doing_tlp == 0)) 21961 rack->r_ctl.rc_loss_count += rsm->r_end - rsm->r_start; 21962 /* 21963 * Advance snd_nxt over sequence space of this segment. 21964 */ 21965 if (error) 21966 /* We don't log or do anything with errors */ 21967 goto nomore; 21968 if (doing_tlp == 0) { 21969 if (rsm == NULL) { 21970 /* 21971 * Not a retransmission of some 21972 * sort, new data is going out so 21973 * clear our TLP count and flag. 21974 */ 21975 rack->rc_tlp_in_progress = 0; 21976 rack->r_ctl.rc_tlp_cnt_out = 0; 21977 } 21978 } else { 21979 /* 21980 * We have just sent a TLP, mark that it is true 21981 * and make sure our in progress is set so we 21982 * continue to check the count. 
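			 * (rc_tlp_cnt_out counts TLPs sent without an
			 * intervening send of new data; both it and the
			 * in-progress flag are cleared in the branch above
			 * once ordinary new data goes out.)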
21983 */ 21984 rack->rc_tlp_in_progress = 1; 21985 rack->r_ctl.rc_tlp_cnt_out++; 21986 } 21987 if (flags & (TH_SYN | TH_FIN)) { 21988 if (flags & TH_SYN) 21989 tp->snd_nxt++; 21990 if (flags & TH_FIN) { 21991 tp->snd_nxt++; 21992 tp->t_flags |= TF_SENTFIN; 21993 } 21994 } 21995 /* In the ENOBUFS case we do *not* update snd_max */ 21996 if (sack_rxmit) 21997 goto nomore; 21998 21999 tp->snd_nxt += len; 22000 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 22001 if (tp->snd_una == tp->snd_max) { 22002 /* 22003 * Update the time we just added data since 22004 * none was outstanding. 22005 */ 22006 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 22007 tp->t_acktime = ticks; 22008 } 22009 tp->snd_max = tp->snd_nxt; 22010 if (rack->rc_new_rnd_needed) { 22011 /* 22012 * Update the rnd to start ticking not 22013 * that from a time perspective all of 22014 * the preceding idle time is "in the round" 22015 */ 22016 rack->rc_new_rnd_needed = 0; 22017 rack->r_ctl.roundends = tp->snd_max; 22018 } 22019 /* 22020 * Time this transmission if not a retransmission and 22021 * not currently timing anything. 22022 * This is only relevant in case of switching back to 22023 * the base stack. 22024 */ 22025 if (tp->t_rtttime == 0) { 22026 tp->t_rtttime = ticks; 22027 tp->t_rtseq = startseq; 22028 KMOD_TCPSTAT_INC(tcps_segstimed); 22029 } 22030 if (len && 22031 ((tp->t_flags & TF_GPUTINPROG) == 0)) 22032 rack_start_gp_measurement(tp, rack, startseq, sb_offset); 22033 } 22034 /* 22035 * If we are doing FO we need to update the mbuf position and subtract 22036 * this happens when the peer sends us duplicate information and 22037 * we thus want to send a DSACK. 22038 * 22039 * XXXRRS: This brings to mind a ?, when we send a DSACK block is TSO 22040 * turned off? If not then we are going to echo multiple DSACK blocks 22041 * out (with the TSO), which we should not be doing. 22042 */ 22043 if (rack->r_fast_output && len) { 22044 if (rack->r_ctl.fsb.left_to_send > len) 22045 rack->r_ctl.fsb.left_to_send -= len; 22046 else 22047 rack->r_ctl.fsb.left_to_send = 0; 22048 if (rack->r_ctl.fsb.left_to_send < segsiz) 22049 rack->r_fast_output = 0; 22050 if (rack->r_fast_output) { 22051 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); 22052 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; 22053 rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(rack->r_ctl.fsb.m); 22054 } 22055 } 22056 } 22057 nomore: 22058 if (error) { 22059 rack->r_ctl.rc_agg_delayed = 0; 22060 rack->r_early = 0; 22061 rack->r_late = 0; 22062 rack->r_ctl.rc_agg_early = 0; 22063 SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */ 22064 /* 22065 * Failures do not advance the seq counter above. For the 22066 * case of ENOBUFS we will fall out and retry in 1ms with 22067 * the hpts. Everything else will just have to retransmit 22068 * with the timer. 22069 * 22070 * In any case, we do not want to loop around for another 22071 * send without a good reason. 
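		 * For ENOBUFS the retry slot below works out to
		 * max((1 + rc_enobuf) msec, 10 msec): 10ms for the first
		 * several hits, then one extra millisecond per consecutive
		 * ENOBUFS up to roughly 128ms (rc_enobuf saturates at 0x7f).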
22072 */ 22073 sendalot = 0; 22074 switch (error) { 22075 case EPERM: 22076 tp->t_softerror = error; 22077 #ifdef TCP_ACCOUNTING 22078 crtsc = get_cyclecount(); 22079 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22080 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 22081 } 22082 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22083 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 22084 } 22085 sched_unpin(); 22086 #endif 22087 return (error); 22088 case ENOBUFS: 22089 /* 22090 * Pace us right away to retry in a some 22091 * time 22092 */ 22093 if (rack->r_ctl.crte != NULL) { 22094 tcp_trace_point(rack->rc_tp, TCP_TP_HWENOBUF); 22095 if (tcp_bblogging_on(rack->rc_tp)) 22096 rack_log_queue_level(tp, rack, len, &tv, cts); 22097 } else 22098 tcp_trace_point(rack->rc_tp, TCP_TP_ENOBUF); 22099 slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); 22100 if (rack->rc_enobuf < 0x7f) 22101 rack->rc_enobuf++; 22102 if (slot < (10 * HPTS_USEC_IN_MSEC)) 22103 slot = 10 * HPTS_USEC_IN_MSEC; 22104 if (rack->r_ctl.crte != NULL) { 22105 counter_u64_add(rack_saw_enobuf_hw, 1); 22106 tcp_rl_log_enobuf(rack->r_ctl.crte); 22107 } 22108 counter_u64_add(rack_saw_enobuf, 1); 22109 goto enobufs; 22110 case EMSGSIZE: 22111 /* 22112 * For some reason the interface we used initially 22113 * to send segments changed to another or lowered 22114 * its MTU. If TSO was active we either got an 22115 * interface without TSO capabilits or TSO was 22116 * turned off. If we obtained mtu from ip_output() 22117 * then update it and try again. 22118 */ 22119 if (tso) 22120 tp->t_flags &= ~TF_TSO; 22121 if (mtu != 0) { 22122 int saved_mtu; 22123 22124 saved_mtu = tp->t_maxseg; 22125 tcp_mss_update(tp, -1, mtu, NULL, NULL); 22126 if (saved_mtu > tp->t_maxseg) { 22127 goto again; 22128 } 22129 } 22130 slot = 10 * HPTS_USEC_IN_MSEC; 22131 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); 22132 #ifdef TCP_ACCOUNTING 22133 crtsc = get_cyclecount(); 22134 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22135 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 22136 } 22137 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22138 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 22139 } 22140 sched_unpin(); 22141 #endif 22142 return (error); 22143 case ENETUNREACH: 22144 counter_u64_add(rack_saw_enetunreach, 1); 22145 case EHOSTDOWN: 22146 case EHOSTUNREACH: 22147 case ENETDOWN: 22148 if (TCPS_HAVERCVDSYN(tp->t_state)) { 22149 tp->t_softerror = error; 22150 } 22151 /* FALLTHROUGH */ 22152 default: 22153 slot = 10 * HPTS_USEC_IN_MSEC; 22154 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); 22155 #ifdef TCP_ACCOUNTING 22156 crtsc = get_cyclecount(); 22157 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22158 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 22159 } 22160 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22161 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 22162 } 22163 sched_unpin(); 22164 #endif 22165 return (error); 22166 } 22167 } else { 22168 rack->rc_enobuf = 0; 22169 if (IN_FASTRECOVERY(tp->t_flags) && rsm) 22170 rack->r_ctl.retran_during_recovery += len; 22171 } 22172 KMOD_TCPSTAT_INC(tcps_sndtotal); 22173 22174 /* 22175 * Data sent (as far as we can tell). If this advertises a larger 22176 * window than any other segment, then remember the size of the 22177 * advertised window. Any pending ACK has now been sent. 
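	 * (rcv_adv tracks the highest window edge we have advertised; it
	 * only moves forward, and only when rcv_nxt + recwin exceeds it.)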
	 */
	if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv))
		tp->rcv_adv = tp->rcv_nxt + recwin;

	tp->last_ack_sent = tp->rcv_nxt;
	tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
enobufs:
	if (sendalot) {
		/* Do we need to turn off sendalot? */
		if (rack->r_ctl.rc_pace_max_segs &&
		    (tot_len_this_send >= rack->r_ctl.rc_pace_max_segs)) {
			/* We hit our max. */
			sendalot = 0;
		} else if ((rack->rc_user_set_max_segs) &&
			   (tot_len_this_send >= (rack->rc_user_set_max_segs * segsiz))) {
			/* We hit the user-defined max */
			sendalot = 0;
		}
	}
	if ((error == 0) && (flags & TH_FIN))
		tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN);
	if (flags & TH_RST) {
		/*
		 * We don't send again after sending a RST.
		 */
		slot = 0;
		sendalot = 0;
		if (error == 0)
			tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
	} else if ((slot == 0) && (sendalot == 0) && tot_len_this_send) {
		/*
		 * Get our pacing rate; if an error occurred in
		 * sending (ENOBUFS) we would hit this else-if with
		 * slot already set. Other errors return.
		 */
		slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz);
	}
	if (rsm &&
	    (rsm->r_flags & RACK_HAS_SYN) == 0 &&
	    rack->use_rack_rr) {
		/* It's a retransmit and we are using the rack cheat? */
		if ((slot == 0) ||
		    (rack->rc_always_pace == 0) ||
		    (rack->r_rr_config == 1)) {
			/*
			 * We have no pacing set, or we are using
			 * old-style rack, or we are overridden to use
			 * the old 1ms pacing.
			 */
			slot = rack->r_ctl.rc_min_to;
		}
	}
	/* We have sent; clear the flag */
	rack->r_ent_rec_ns = 0;
	if (rack->r_must_retran) {
		if (rsm) {
			rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start);
			if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) {
				/*
				 * We have retransmitted all.
				 */
				rack->r_must_retran = 0;
				rack->r_ctl.rc_out_at_rto = 0;
			}
		} else if (SEQ_GEQ(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) {
			/*
			 * Sending new data will also kill
			 * the loop.
			 */
			rack->r_must_retran = 0;
			rack->r_ctl.rc_out_at_rto = 0;
		}
	}
	rack->r_ctl.fsb.recwin = recwin;
	if ((tp->t_flags & (TF_WASCRECOVERY|TF_WASFRECOVERY)) &&
	    SEQ_GT(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) {
		/*
		 * We hit an RTO and have now passed snd_max at the RTO;
		 * clear all the WAS flags.
22258 */ 22259 tp->t_flags &= ~(TF_WASCRECOVERY|TF_WASFRECOVERY); 22260 } 22261 if (slot) { 22262 /* set the rack tcb into the slot N */ 22263 if ((error == 0) && 22264 rack_use_rfo && 22265 ((flags & (TH_SYN|TH_FIN)) == 0) && 22266 (rsm == NULL) && 22267 (tp->snd_nxt == tp->snd_max) && 22268 (ipoptlen == 0) && 22269 (tp->rcv_numsacks == 0) && 22270 rack->r_fsb_inited && 22271 TCPS_HAVEESTABLISHED(tp->t_state) && 22272 ((IN_RECOVERY(tp->t_flags)) == 0) && 22273 (rack->r_must_retran == 0) && 22274 ((tp->t_flags & TF_NEEDFIN) == 0) && 22275 (len > 0) && (orig_len > 0) && 22276 (orig_len > len) && 22277 ((orig_len - len) >= segsiz) && 22278 ((optlen == 0) || 22279 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { 22280 /* We can send at least one more MSS using our fsb */ 22281 rack_setup_fast_output(tp, rack, sb, len, orig_len, 22282 segsiz, pace_max_seg, hw_tls, flags); 22283 } else 22284 rack->r_fast_output = 0; 22285 rack_log_fsb(rack, tp, so, flags, 22286 ipoptlen, orig_len, len, error, 22287 (rsm == NULL), optlen, __LINE__, 2); 22288 } else if (sendalot) { 22289 int ret; 22290 22291 sack_rxmit = 0; 22292 if ((error == 0) && 22293 rack_use_rfo && 22294 ((flags & (TH_SYN|TH_FIN)) == 0) && 22295 (rsm == NULL) && 22296 (ipoptlen == 0) && 22297 (tp->rcv_numsacks == 0) && 22298 (tp->snd_nxt == tp->snd_max) && 22299 (rack->r_must_retran == 0) && 22300 rack->r_fsb_inited && 22301 TCPS_HAVEESTABLISHED(tp->t_state) && 22302 ((IN_RECOVERY(tp->t_flags)) == 0) && 22303 ((tp->t_flags & TF_NEEDFIN) == 0) && 22304 (len > 0) && (orig_len > 0) && 22305 (orig_len > len) && 22306 ((orig_len - len) >= segsiz) && 22307 ((optlen == 0) || 22308 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { 22309 /* we can use fast_output for more */ 22310 rack_setup_fast_output(tp, rack, sb, len, orig_len, 22311 segsiz, pace_max_seg, hw_tls, flags); 22312 if (rack->r_fast_output) { 22313 error = 0; 22314 ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error); 22315 if (ret >= 0) 22316 return (ret); 22317 else if (error) 22318 goto nomore; 22319 22320 } 22321 } 22322 goto again; 22323 } 22324 /* Assure when we leave that snd_nxt will point to top */ 22325 skip_all_send: 22326 if (SEQ_GT(tp->snd_max, tp->snd_nxt)) 22327 tp->snd_nxt = tp->snd_max; 22328 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0); 22329 #ifdef TCP_ACCOUNTING 22330 crtsc = get_cyclecount() - ts_val; 22331 if (tot_len_this_send) { 22332 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22333 tp->tcp_cnt_counters[SND_OUT_DATA]++; 22334 } 22335 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22336 tp->tcp_proc_time[SND_OUT_DATA] += crtsc; 22337 } 22338 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22339 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) /segsiz); 22340 } 22341 } else { 22342 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22343 tp->tcp_cnt_counters[SND_OUT_ACK]++; 22344 } 22345 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 22346 tp->tcp_proc_time[SND_OUT_ACK] += crtsc; 22347 } 22348 } 22349 sched_unpin(); 22350 #endif 22351 if (error == ENOBUFS) 22352 error = 0; 22353 return (error); 22354 } 22355 22356 static void 22357 rack_update_seg(struct tcp_rack *rack) 22358 { 22359 uint32_t orig_val; 22360 22361 orig_val = rack->r_ctl.rc_pace_max_segs; 22362 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 22363 if (orig_val != rack->r_ctl.rc_pace_max_segs) 22364 rack_log_pacing_delay_calc(rack, 0, 0, orig_val, 0, 0, 15, __LINE__, NULL, 0); 22365 } 22366 22367 static void 
22368 rack_mtu_change(struct tcpcb *tp) 22369 { 22370 /* 22371 * The MSS may have changed 22372 */ 22373 struct tcp_rack *rack; 22374 struct rack_sendmap *rsm; 22375 22376 rack = (struct tcp_rack *)tp->t_fb_ptr; 22377 if (rack->r_ctl.rc_pace_min_segs != ctf_fixed_maxseg(tp)) { 22378 /* 22379 * The MTU has changed we need to resend everything 22380 * since all we have sent is lost. We first fix 22381 * up the mtu though. 22382 */ 22383 rack_set_pace_segments(tp, rack, __LINE__, NULL); 22384 /* We treat this like a full retransmit timeout without the cwnd adjustment */ 22385 rack_remxt_tmr(tp); 22386 rack->r_fast_output = 0; 22387 rack->r_ctl.rc_out_at_rto = ctf_flight_size(tp, 22388 rack->r_ctl.rc_sacked); 22389 rack->r_ctl.rc_snd_max_at_rto = tp->snd_max; 22390 rack->r_must_retran = 1; 22391 /* Mark all inflight to needing to be rxt'd */ 22392 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 22393 rsm->r_flags |= (RACK_MUST_RXT|RACK_PMTU_CHG); 22394 } 22395 } 22396 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 22397 /* We don't use snd_nxt to retransmit */ 22398 tp->snd_nxt = tp->snd_max; 22399 } 22400 22401 static int 22402 rack_set_dgp(struct tcp_rack *rack) 22403 { 22404 /* pace_always=1 */ 22405 if (rack->rc_always_pace == 0) { 22406 if (tcp_can_enable_pacing() == 0) 22407 return (EBUSY); 22408 } 22409 rack->dgp_on = 1; 22410 rack->rc_always_pace = 1; 22411 rack->use_fixed_rate = 0; 22412 if (rack->gp_ready) 22413 rack_set_cc_pacing(rack); 22414 rack->rc_tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 22415 rack->rack_attempt_hdwr_pace = 0; 22416 /* rxt settings */ 22417 rack->full_size_rxt = 1; 22418 rack->shape_rxt_to_pacing_min = 0; 22419 /* cmpack=1 */ 22420 rack->r_use_cmp_ack = 1; 22421 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) && 22422 rack->r_use_cmp_ack) 22423 rack->rc_tp->t_flags2 |= TF2_MBUF_ACKCMP; 22424 /* scwnd=1 */ 22425 rack->rack_enable_scwnd = 1; 22426 /* dynamic=100 */ 22427 rack->rc_gp_dyn_mul = 1; 22428 /* gp_inc_ca */ 22429 rack->r_ctl.rack_per_of_gp_ca = 100; 22430 /* rrr_conf=3 */ 22431 rack->r_rr_config = 3; 22432 /* npush=2 */ 22433 rack->r_ctl.rc_no_push_at_mrtt = 2; 22434 /* fillcw=1 */ 22435 if (rack->r_cwnd_was_clamped == 0) { 22436 rack->rc_pace_to_cwnd = 1; 22437 } else { 22438 rack->rc_pace_to_cwnd = 0; 22439 /* Reset all multipliers to 100.0 so just the measured bw */ 22440 rack->r_ctl.rack_per_of_gp_ss = 100; 22441 rack->r_ctl.rack_per_of_gp_ca = 100; 22442 } 22443 rack->rc_pace_fill_if_rttin_range = 0; 22444 rack->rtt_limit_mul = 0; 22445 /* noprr=1 */ 22446 rack->rack_no_prr = 1; 22447 /* lscwnd=1 */ 22448 rack->r_limit_scw = 1; 22449 /* gp_inc_rec */ 22450 rack->r_ctl.rack_per_of_gp_rec = 90; 22451 rack_client_buffer_level_set(rack); 22452 return (0); 22453 } 22454 22455 22456 22457 static int 22458 rack_set_profile(struct tcp_rack *rack, int prof) 22459 { 22460 int err = EINVAL; 22461 if (prof == 1) { 22462 /* 22463 * Profile 1 is "standard" DGP. It ignores 22464 * client buffer level. 22465 */ 22466 rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL0; 22467 err = rack_set_dgp(rack); 22468 if (err) 22469 return (err); 22470 } else if (prof == 2) { 22471 /* 22472 * Profile 2 is DGP. Less aggressive with 22473 * respect to client buffer level. 22474 */ 22475 rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL1; 22476 err = rack_set_dgp(rack); 22477 if (err) 22478 return (err); 22479 } else if (prof == 3) { 22480 /* 22481 * Profile 3 is DGP. Even Less aggressive with 22482 * respect to client buffer level. 
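		 * For reference, the profile numbers map onto the
		 * buffer-level aggressiveness levels as: 1 -> DGP_LEVEL0,
		 * 2 -> DGP_LEVEL1, 3 -> DGP_LEVEL2, 4 -> DGP_LEVEL3, while
		 * 0 backs all of the DGP settings out again below.  An
		 * application picks one via the TCP_RACK_PROFILE socket
		 * option (URL:profile), e.g. (illustrative only):
		 *
		 *	int prof = 3;
		 *	setsockopt(fd, IPPROTO_TCP, TCP_RACK_PROFILE,
		 *	    &prof, sizeof(prof));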
22483 */ 22484 rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL2; 22485 err = rack_set_dgp(rack); 22486 if (err) 22487 return (err); 22488 } else if (prof == 4) { 22489 /* 22490 * Profile 4 is DGP with the most responsiveness 22491 * to client buffer level. 22492 */ 22493 rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL3; 22494 err = rack_set_dgp(rack); 22495 if (err) 22496 return (err); 22497 } else if (prof == 0) { 22498 /* This changes things back to the default settings */ 22499 rack->dgp_on = 0; 22500 rack->rc_hybrid_mode = 0; 22501 err = 0; 22502 if (rack_fill_cw_state) 22503 rack->rc_pace_to_cwnd = 1; 22504 else 22505 rack->rc_pace_to_cwnd = 0; 22506 if (rack->rc_always_pace) { 22507 tcp_decrement_paced_conn(); 22508 rack_undo_cc_pacing(rack); 22509 rack->rc_always_pace = 0; 22510 } 22511 if (rack_pace_every_seg && tcp_can_enable_pacing()) { 22512 rack->rc_always_pace = 1; 22513 if ((rack->gp_ready) && (rack->use_fixed_rate == 0)) 22514 rack_set_cc_pacing(rack); 22515 } else 22516 rack->rc_always_pace = 0; 22517 if (rack_dsack_std_based & 0x1) { 22518 /* Basically this means all rack timers are at least (srtt + 1/4 srtt) */ 22519 rack->rc_rack_tmr_std_based = 1; 22520 } 22521 if (rack_dsack_std_based & 0x2) { 22522 /* Basically this means rack timers are extended based on dsack by up to (2 * srtt) */ 22523 rack->rc_rack_use_dsack = 1; 22524 } 22525 if (rack_use_cmp_acks) 22526 rack->r_use_cmp_ack = 1; 22527 else 22528 rack->r_use_cmp_ack = 0; 22529 if (rack_disable_prr) 22530 rack->rack_no_prr = 1; 22531 else 22532 rack->rack_no_prr = 0; 22533 if (rack_gp_no_rec_chg) 22534 rack->rc_gp_no_rec_chg = 1; 22535 else 22536 rack->rc_gp_no_rec_chg = 0; 22537 if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack) { 22538 rack->r_mbuf_queue = 1; 22539 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state)) 22540 rack->rc_tp->t_flags2 |= TF2_MBUF_ACKCMP; 22541 rack->rc_tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 22542 } else { 22543 rack->r_mbuf_queue = 0; 22544 rack->rc_tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ; 22545 } 22546 if (rack_enable_shared_cwnd) 22547 rack->rack_enable_scwnd = 1; 22548 else 22549 rack->rack_enable_scwnd = 0; 22550 if (rack_do_dyn_mul) { 22551 /* When dynamic adjustment is on CA needs to start at 100% */ 22552 rack->rc_gp_dyn_mul = 1; 22553 if (rack_do_dyn_mul >= 100) 22554 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul; 22555 } else { 22556 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca; 22557 rack->rc_gp_dyn_mul = 0; 22558 } 22559 rack->r_rr_config = 0; 22560 rack->r_ctl.rc_no_push_at_mrtt = 0; 22561 rack->rc_pace_to_cwnd = 0; 22562 rack->rc_pace_fill_if_rttin_range = 0; 22563 rack->rtt_limit_mul = 0; 22564 22565 if (rack_enable_hw_pacing) 22566 rack->rack_hdw_pace_ena = 1; 22567 else 22568 rack->rack_hdw_pace_ena = 0; 22569 if (rack_disable_prr) 22570 rack->rack_no_prr = 1; 22571 else 22572 rack->rack_no_prr = 0; 22573 if (rack_limits_scwnd) 22574 rack->r_limit_scw = 1; 22575 else 22576 rack->r_limit_scw = 0; 22577 rack_init_retransmit_value(rack, rack_rxt_controls); 22578 err = 0; 22579 } 22580 return (err); 22581 } 22582 22583 static int 22584 rack_add_deferred_option(struct tcp_rack *rack, int sopt_name, uint64_t loptval) 22585 { 22586 struct deferred_opt_list *dol; 22587 22588 dol = malloc(sizeof(struct deferred_opt_list), 22589 M_TCPFSB, M_NOWAIT|M_ZERO); 22590 if (dol == NULL) { 22591 /* 22592 * No space yikes -- fail out.. 
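		 * (Returning 0 tells rack_set_sockopt() the option could
		 * not be queued, and it reports ENOMEM to the caller.)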
		 */
		return (0);
	}
	dol->optname = sopt_name;
	dol->optval = loptval;
	TAILQ_INSERT_TAIL(&rack->r_ctl.opt_list, dol, next);
	return (1);
}

static int
process_hybrid_pacing(struct tcp_rack *rack, struct tcp_hybrid_req *hybrid)
{
#ifdef TCP_REQUEST_TRK
	struct tcp_sendfile_track *sft;
	struct timeval tv;
	tcp_seq seq;
	int err;

	microuptime(&tv);

	/*
	 * If BB logging is not on we need to look at the DTL flag.
	 * If it is already on, that overrides the DTL input. We do this
	 * for any request: DTL can be turned on here, but a hybrid
	 * pacing request will not turn it off.
	 */
	if (tcp_bblogging_on(rack->rc_tp) == 0) {
		if (hybrid->hybrid_flags & TCP_HYBRID_PACING_DTL) {
			/* Turn on BB point logging */
			tcp_set_bblog_state(rack->rc_tp, TCP_LOG_VIA_BBPOINTS,
			    TCP_BBPOINT_REQ_LEVEL_LOGGING);
		}
	}
	/* Make sure no fixed rate is on */
	rack->use_fixed_rate = 0;
	rack->r_ctl.rc_fixed_pacing_rate_rec = 0;
	rack->r_ctl.rc_fixed_pacing_rate_ca = 0;
	rack->r_ctl.rc_fixed_pacing_rate_ss = 0;
	/* Now allocate or find our entry that will have these settings */
	sft = tcp_req_alloc_req_full(rack->rc_tp, &hybrid->req, tcp_tv_to_lusectick(&tv), 0);
	if (sft == NULL) {
		rack->rc_tp->tcp_hybrid_error++;
		/* no space, where would it have gone? */
		seq = rack->rc_tp->snd_una + rack->rc_tp->t_inpcb.inp_socket->so_snd.sb_ccc;
		rack_log_hybrid(rack, seq, NULL, HYBRID_LOG_NO_ROOM, __LINE__, 0);
		return (ENOSPC);
	}
	/* The seq will be snd_una + everything in the buffer */
	seq = sft->start_seq;
	if ((hybrid->hybrid_flags & TCP_HYBRID_PACING_ENABLE) == 0) {
		/* Disabling hybrid pacing */
		if (rack->rc_hybrid_mode) {
			rack_set_profile(rack, 0);
			rack->rc_tp->tcp_hybrid_stop++;
		}
		rack_log_hybrid(rack, seq, sft, HYBRID_LOG_TURNED_OFF, __LINE__, 0);
		return (0);
	}
	if (rack->dgp_on == 0) {
		/*
		 * If we have not yet turned DGP on, do so now,
		 * selecting pure DGP mode with no buffer-level
		 * response.
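		 * Profile 1 maps to DGP_LEVEL0, i.e. the client buffer
		 * level is ignored; if pacing cannot be enabled
		 * rack_set_dgp() hands back EBUSY and we log
		 * HYBRID_LOG_NO_PACING below.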
22656 */ 22657 if ((err = rack_set_profile(rack, 1)) != 0){ 22658 /* Failed to turn pacing on */ 22659 rack->rc_tp->tcp_hybrid_error++; 22660 rack_log_hybrid(rack, seq, sft, HYBRID_LOG_NO_PACING, __LINE__, 0); 22661 return (err); 22662 } 22663 } 22664 /* Now set in our flags */ 22665 sft->hybrid_flags = hybrid->hybrid_flags; 22666 if (hybrid->hybrid_flags & TCP_HYBRID_PACING_CSPR) 22667 sft->cspr = hybrid->cspr; 22668 else 22669 sft->cspr = 0; 22670 if (hybrid->hybrid_flags & TCP_HYBRID_PACING_H_MS) 22671 sft->hint_maxseg = hybrid->hint_maxseg; 22672 else 22673 sft->hint_maxseg = 0; 22674 rack->rc_hybrid_mode = 1; 22675 rack->rc_tp->tcp_hybrid_start++; 22676 rack_log_hybrid(rack, seq, sft, HYBRID_LOG_RULES_SET, __LINE__,0); 22677 return (0); 22678 #else 22679 return (ENOTSUP); 22680 #endif 22681 } 22682 22683 static int 22684 rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, 22685 uint32_t optval, uint64_t loptval, struct tcp_hybrid_req *hybrid) 22686 22687 { 22688 struct epoch_tracker et; 22689 struct sockopt sopt; 22690 struct cc_newreno_opts opt; 22691 uint64_t val; 22692 int error = 0; 22693 uint16_t ca, ss; 22694 22695 switch (sopt_name) { 22696 case TCP_RACK_SET_RXT_OPTIONS: 22697 if ((optval >= 0) && (optval <= 2)) { 22698 rack_init_retransmit_value(rack, optval); 22699 } else { 22700 /* 22701 * You must send in 0, 1 or 2 all else is 22702 * invalid. 22703 */ 22704 error = EINVAL; 22705 } 22706 break; 22707 case TCP_RACK_DSACK_OPT: 22708 RACK_OPTS_INC(tcp_rack_dsack_opt); 22709 if (optval & 0x1) { 22710 rack->rc_rack_tmr_std_based = 1; 22711 } else { 22712 rack->rc_rack_tmr_std_based = 0; 22713 } 22714 if (optval & 0x2) { 22715 rack->rc_rack_use_dsack = 1; 22716 } else { 22717 rack->rc_rack_use_dsack = 0; 22718 } 22719 rack_log_dsack_event(rack, 5, __LINE__, 0, 0); 22720 break; 22721 case TCP_RACK_PACING_DIVISOR: 22722 RACK_OPTS_INC(tcp_rack_pacing_divisor); 22723 if (optval == 0) { 22724 rack->r_ctl.pace_len_divisor = rack_default_pacing_divisor; 22725 } else { 22726 if (optval < RL_MIN_DIVISOR) 22727 rack->r_ctl.pace_len_divisor = RL_MIN_DIVISOR; 22728 else 22729 rack->r_ctl.pace_len_divisor = optval; 22730 } 22731 break; 22732 case TCP_RACK_HI_BETA: 22733 RACK_OPTS_INC(tcp_rack_hi_beta); 22734 if (optval) 22735 rack->rack_hibeta = 1; 22736 else 22737 rack->rack_hibeta = 0; 22738 break; 22739 case TCP_RACK_PACING_BETA: 22740 RACK_OPTS_INC(tcp_rack_beta); 22741 if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) { 22742 /* This only works for newreno. */ 22743 error = EINVAL; 22744 break; 22745 } 22746 if (rack->rc_pacing_cc_set) { 22747 /* 22748 * Set them into the real CC module 22749 * whats in the rack pcb is the old values 22750 * to be used on restoral/ 22751 */ 22752 sopt.sopt_dir = SOPT_SET; 22753 opt.name = CC_NEWRENO_BETA; 22754 opt.val = optval; 22755 if (CC_ALGO(tp)->ctl_output != NULL) 22756 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt); 22757 else { 22758 error = ENOENT; 22759 break; 22760 } 22761 } else { 22762 /* 22763 * Not pacing yet so set it into our local 22764 * rack pcb storage. 22765 */ 22766 rack->r_ctl.rc_saved_beta.beta = optval; 22767 } 22768 break; 22769 case TCP_RACK_TIMER_SLOP: 22770 RACK_OPTS_INC(tcp_rack_timer_slop); 22771 rack->r_ctl.timer_slop = optval; 22772 if (rack->rc_tp->t_srtt) { 22773 /* 22774 * If we have an SRTT lets update t_rxtcur 22775 * to have the new slop. 
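			 * (RACK_TCPT_RANGESET() recomputes t_rxtcur from
			 * RACK_REXMTVAL() plus the new slop, clamped
			 * between rack_rto_min and rack_rto_max.)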
22776 */ 22777 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 22778 rack_rto_min, rack_rto_max, 22779 rack->r_ctl.timer_slop); 22780 } 22781 break; 22782 case TCP_RACK_PACING_BETA_ECN: 22783 RACK_OPTS_INC(tcp_rack_beta_ecn); 22784 if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) { 22785 /* This only works for newreno. */ 22786 error = EINVAL; 22787 break; 22788 } 22789 if (rack->rc_pacing_cc_set) { 22790 /* 22791 * Set them into the real CC module 22792 * whats in the rack pcb is the old values 22793 * to be used on restoral/ 22794 */ 22795 sopt.sopt_dir = SOPT_SET; 22796 opt.name = CC_NEWRENO_BETA_ECN; 22797 opt.val = optval; 22798 if (CC_ALGO(tp)->ctl_output != NULL) 22799 error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt); 22800 else 22801 error = ENOENT; 22802 } else { 22803 /* 22804 * Not pacing yet so set it into our local 22805 * rack pcb storage. 22806 */ 22807 rack->r_ctl.rc_saved_beta.beta_ecn = optval; 22808 rack->r_ctl.rc_saved_beta.newreno_flags = CC_NEWRENO_BETA_ECN_ENABLED; 22809 } 22810 break; 22811 case TCP_DEFER_OPTIONS: 22812 RACK_OPTS_INC(tcp_defer_opt); 22813 if (optval) { 22814 if (rack->gp_ready) { 22815 /* Too late */ 22816 error = EINVAL; 22817 break; 22818 } 22819 rack->defer_options = 1; 22820 } else 22821 rack->defer_options = 0; 22822 break; 22823 case TCP_RACK_MEASURE_CNT: 22824 RACK_OPTS_INC(tcp_rack_measure_cnt); 22825 if (optval && (optval <= 0xff)) { 22826 rack->r_ctl.req_measurements = optval; 22827 } else 22828 error = EINVAL; 22829 break; 22830 case TCP_REC_ABC_VAL: 22831 RACK_OPTS_INC(tcp_rec_abc_val); 22832 if (optval > 0) 22833 rack->r_use_labc_for_rec = 1; 22834 else 22835 rack->r_use_labc_for_rec = 0; 22836 break; 22837 case TCP_RACK_ABC_VAL: 22838 RACK_OPTS_INC(tcp_rack_abc_val); 22839 if ((optval > 0) && (optval < 255)) 22840 rack->rc_labc = optval; 22841 else 22842 error = EINVAL; 22843 break; 22844 case TCP_HDWR_UP_ONLY: 22845 RACK_OPTS_INC(tcp_pacing_up_only); 22846 if (optval) 22847 rack->r_up_only = 1; 22848 else 22849 rack->r_up_only = 0; 22850 break; 22851 case TCP_PACING_RATE_CAP: 22852 RACK_OPTS_INC(tcp_pacing_rate_cap); 22853 rack->r_ctl.bw_rate_cap = loptval; 22854 break; 22855 case TCP_HYBRID_PACING: 22856 if (hybrid == NULL) { 22857 error = EINVAL; 22858 break; 22859 } 22860 error = process_hybrid_pacing(rack, hybrid); 22861 break; 22862 case TCP_RACK_PROFILE: 22863 RACK_OPTS_INC(tcp_profile); 22864 error = rack_set_profile(rack, optval); 22865 break; 22866 case TCP_USE_CMP_ACKS: 22867 RACK_OPTS_INC(tcp_use_cmp_acks); 22868 if ((optval == 0) && (tp->t_flags2 & TF2_MBUF_ACKCMP)) { 22869 /* You can't turn it off once its on! 
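			 * (Likely because once TF2_MBUF_ACKCMP is set the
			 * connection may already have compressed-ack mbufs
			 * queued against it, so support cannot safely be
			 * withdrawn mid-stream.)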
*/ 22870 error = EINVAL; 22871 } else if ((optval == 1) && (rack->r_use_cmp_ack == 0)) { 22872 rack->r_use_cmp_ack = 1; 22873 rack->r_mbuf_queue = 1; 22874 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 22875 } 22876 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) 22877 tp->t_flags2 |= TF2_MBUF_ACKCMP; 22878 break; 22879 case TCP_SHARED_CWND_TIME_LIMIT: 22880 RACK_OPTS_INC(tcp_lscwnd); 22881 if (optval) 22882 rack->r_limit_scw = 1; 22883 else 22884 rack->r_limit_scw = 0; 22885 break; 22886 case TCP_RACK_DGP_IN_REC: 22887 RACK_OPTS_INC(tcp_dgp_in_rec); 22888 if (optval) 22889 rack->r_ctl.full_dgp_in_rec = 1; 22890 else 22891 rack->r_ctl.full_dgp_in_rec = 0; 22892 break; 22893 case TCP_RXT_CLAMP: 22894 RACK_OPTS_INC(tcp_rxt_clamp); 22895 rack_translate_clamp_value(rack, optval); 22896 break; 22897 case TCP_RACK_PACE_TO_FILL: 22898 RACK_OPTS_INC(tcp_fillcw); 22899 if (optval == 0) 22900 rack->rc_pace_to_cwnd = 0; 22901 else { 22902 rack->rc_pace_to_cwnd = 1; 22903 if (optval > 1) 22904 rack->r_fill_less_agg = 1; 22905 } 22906 if ((optval >= rack_gp_rtt_maxmul) && 22907 rack_gp_rtt_maxmul && 22908 (optval < 0xf)) { 22909 rack->rc_pace_fill_if_rttin_range = 1; 22910 rack->rtt_limit_mul = optval; 22911 } else { 22912 rack->rc_pace_fill_if_rttin_range = 0; 22913 rack->rtt_limit_mul = 0; 22914 } 22915 break; 22916 case TCP_RACK_NO_PUSH_AT_MAX: 22917 RACK_OPTS_INC(tcp_npush); 22918 if (optval == 0) 22919 rack->r_ctl.rc_no_push_at_mrtt = 0; 22920 else if (optval < 0xff) 22921 rack->r_ctl.rc_no_push_at_mrtt = optval; 22922 else 22923 error = EINVAL; 22924 break; 22925 case TCP_SHARED_CWND_ENABLE: 22926 RACK_OPTS_INC(tcp_rack_scwnd); 22927 if (optval == 0) 22928 rack->rack_enable_scwnd = 0; 22929 else 22930 rack->rack_enable_scwnd = 1; 22931 break; 22932 case TCP_RACK_MBUF_QUEUE: 22933 /* Now do we use the LRO mbuf-queue feature */ 22934 RACK_OPTS_INC(tcp_rack_mbufq); 22935 if (optval || rack->r_use_cmp_ack) 22936 rack->r_mbuf_queue = 1; 22937 else 22938 rack->r_mbuf_queue = 0; 22939 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 22940 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 22941 else 22942 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ; 22943 break; 22944 case TCP_RACK_NONRXT_CFG_RATE: 22945 RACK_OPTS_INC(tcp_rack_cfg_rate); 22946 if (optval == 0) 22947 rack->rack_rec_nonrxt_use_cr = 0; 22948 else 22949 rack->rack_rec_nonrxt_use_cr = 1; 22950 break; 22951 case TCP_NO_PRR: 22952 RACK_OPTS_INC(tcp_rack_noprr); 22953 if (optval == 0) 22954 rack->rack_no_prr = 0; 22955 else if (optval == 1) 22956 rack->rack_no_prr = 1; 22957 else if (optval == 2) 22958 rack->no_prr_addback = 1; 22959 else 22960 error = EINVAL; 22961 break; 22962 case TCP_TIMELY_DYN_ADJ: 22963 RACK_OPTS_INC(tcp_timely_dyn); 22964 if (optval == 0) 22965 rack->rc_gp_dyn_mul = 0; 22966 else { 22967 rack->rc_gp_dyn_mul = 1; 22968 if (optval >= 100) { 22969 /* 22970 * If the user sets something 100 or more 22971 * its the gp_ca value. 
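				 * The value is a percent-of-goodput
				 * multiplier, so e.g. 150 paces congestion
				 * avoidance at 150% of the measured b/w.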
22972 */ 22973 rack->r_ctl.rack_per_of_gp_ca = optval; 22974 } 22975 } 22976 break; 22977 case TCP_RACK_DO_DETECTION: 22978 RACK_OPTS_INC(tcp_rack_do_detection); 22979 if (optval == 0) 22980 rack->do_detection = 0; 22981 else 22982 rack->do_detection = 1; 22983 break; 22984 case TCP_RACK_TLP_USE: 22985 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) { 22986 error = EINVAL; 22987 break; 22988 } 22989 RACK_OPTS_INC(tcp_tlp_use); 22990 rack->rack_tlp_threshold_use = optval; 22991 break; 22992 case TCP_RACK_TLP_REDUCE: 22993 /* RACK TLP cwnd reduction (bool) */ 22994 RACK_OPTS_INC(tcp_rack_tlp_reduce); 22995 rack->r_ctl.rc_tlp_cwnd_reduce = optval; 22996 break; 22997 /* Pacing related ones */ 22998 case TCP_RACK_PACE_ALWAYS: 22999 /* 23000 * zero is old rack method, 1 is new 23001 * method using a pacing rate. 23002 */ 23003 RACK_OPTS_INC(tcp_rack_pace_always); 23004 if (optval > 0) { 23005 if (rack->rc_always_pace) { 23006 error = EALREADY; 23007 break; 23008 } else if (tcp_can_enable_pacing()) { 23009 rack->rc_always_pace = 1; 23010 if ((rack->gp_ready) && (rack->use_fixed_rate == 0)) 23011 rack_set_cc_pacing(rack); 23012 } 23013 else { 23014 error = ENOSPC; 23015 break; 23016 } 23017 } else { 23018 if (rack->rc_always_pace) { 23019 tcp_decrement_paced_conn(); 23020 rack->rc_always_pace = 0; 23021 rack_undo_cc_pacing(rack); 23022 } 23023 } 23024 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 23025 tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; 23026 else 23027 tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ; 23028 /* A rate may be set irate or other, if so set seg size */ 23029 rack_update_seg(rack); 23030 break; 23031 case TCP_BBR_RACK_INIT_RATE: 23032 RACK_OPTS_INC(tcp_initial_rate); 23033 val = optval; 23034 /* Change from kbits per second to bytes per second */ 23035 val *= 1000; 23036 val /= 8; 23037 rack->r_ctl.init_rate = val; 23038 if (rack->rc_init_win != rack_default_init_window) { 23039 uint32_t win, snt; 23040 23041 /* 23042 * Options don't always get applied 23043 * in the order you think. So in order 23044 * to assure we update a cwnd we need 23045 * to check and see if we are still 23046 * where we should raise the cwnd. 23047 */ 23048 win = rc_init_window(rack); 23049 if (SEQ_GT(tp->snd_max, tp->iss)) 23050 snt = tp->snd_max - tp->iss; 23051 else 23052 snt = 0; 23053 if ((snt < win) && 23054 (tp->snd_cwnd < win)) 23055 tp->snd_cwnd = win; 23056 } 23057 if (rack->rc_always_pace) 23058 rack_update_seg(rack); 23059 break; 23060 case TCP_BBR_IWINTSO: 23061 RACK_OPTS_INC(tcp_initial_win); 23062 if (optval && (optval <= 0xff)) { 23063 uint32_t win, snt; 23064 23065 rack->rc_init_win = optval; 23066 win = rc_init_window(rack); 23067 if (SEQ_GT(tp->snd_max, tp->iss)) 23068 snt = tp->snd_max - tp->iss; 23069 else 23070 snt = 0; 23071 if ((snt < win) && 23072 (tp->t_srtt | 23073 rack->r_ctl.init_rate)) { 23074 /* 23075 * We are not past the initial window 23076 * and we have some bases for pacing, 23077 * so we need to possibly adjust up 23078 * the cwnd. Note even if we don't set 23079 * the cwnd, its still ok to raise the rc_init_win 23080 * which can be used coming out of idle when we 23081 * would have a rate. 
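				 * (rc_init_window() presumably scales the
				 * configured segment count up into bytes;
				 * the cwnd is only bumped while we are still
				 * inside that window and have an SRTT or an
				 * init_rate to pace from.)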
23082 */ 23083 if (tp->snd_cwnd < win) 23084 tp->snd_cwnd = win; 23085 } 23086 if (rack->rc_always_pace) 23087 rack_update_seg(rack); 23088 } else 23089 error = EINVAL; 23090 break; 23091 case TCP_RACK_FORCE_MSEG: 23092 RACK_OPTS_INC(tcp_rack_force_max_seg); 23093 if (optval) 23094 rack->rc_force_max_seg = 1; 23095 else 23096 rack->rc_force_max_seg = 0; 23097 break; 23098 case TCP_RACK_PACE_MIN_SEG: 23099 RACK_OPTS_INC(tcp_rack_min_seg); 23100 rack->r_ctl.rc_user_set_min_segs = (0x0000ffff & optval); 23101 rack_set_pace_segments(tp, rack, __LINE__, NULL); 23102 break; 23103 case TCP_RACK_PACE_MAX_SEG: 23104 /* Max segments size in a pace in bytes */ 23105 RACK_OPTS_INC(tcp_rack_max_seg); 23106 rack->rc_user_set_max_segs = optval; 23107 rack_set_pace_segments(tp, rack, __LINE__, NULL); 23108 break; 23109 case TCP_RACK_PACE_RATE_REC: 23110 /* Set the fixed pacing rate in Bytes per second ca */ 23111 RACK_OPTS_INC(tcp_rack_pace_rate_rec); 23112 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 23113 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) 23114 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 23115 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) 23116 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 23117 rack->use_fixed_rate = 1; 23118 if (rack->rc_always_pace && rack->gp_ready && rack->rack_hibeta) 23119 rack_set_cc_pacing(rack); 23120 rack_log_pacing_delay_calc(rack, 23121 rack->r_ctl.rc_fixed_pacing_rate_ss, 23122 rack->r_ctl.rc_fixed_pacing_rate_ca, 23123 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 23124 __LINE__, NULL,0); 23125 break; 23126 23127 case TCP_RACK_PACE_RATE_SS: 23128 /* Set the fixed pacing rate in Bytes per second ca */ 23129 RACK_OPTS_INC(tcp_rack_pace_rate_ss); 23130 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 23131 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) 23132 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 23133 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) 23134 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 23135 rack->use_fixed_rate = 1; 23136 if (rack->rc_always_pace && rack->gp_ready && rack->rack_hibeta) 23137 rack_set_cc_pacing(rack); 23138 rack_log_pacing_delay_calc(rack, 23139 rack->r_ctl.rc_fixed_pacing_rate_ss, 23140 rack->r_ctl.rc_fixed_pacing_rate_ca, 23141 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 23142 __LINE__, NULL, 0); 23143 break; 23144 23145 case TCP_RACK_PACE_RATE_CA: 23146 /* Set the fixed pacing rate in Bytes per second ca */ 23147 RACK_OPTS_INC(tcp_rack_pace_rate_ca); 23148 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 23149 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) 23150 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 23151 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) 23152 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 23153 rack->use_fixed_rate = 1; 23154 if (rack->rc_always_pace && rack->gp_ready && rack->rack_hibeta) 23155 rack_set_cc_pacing(rack); 23156 rack_log_pacing_delay_calc(rack, 23157 rack->r_ctl.rc_fixed_pacing_rate_ss, 23158 rack->r_ctl.rc_fixed_pacing_rate_ca, 23159 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 23160 __LINE__, NULL, 0); 23161 break; 23162 case TCP_RACK_GP_INCREASE_REC: 23163 RACK_OPTS_INC(tcp_gp_inc_rec); 23164 rack->r_ctl.rack_per_of_gp_rec = optval; 23165 rack_log_pacing_delay_calc(rack, 23166 rack->r_ctl.rack_per_of_gp_ss, 23167 rack->r_ctl.rack_per_of_gp_ca, 23168 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 23169 __LINE__, NULL, 0); 23170 break; 23171 case TCP_RACK_GP_INCREASE_CA: 23172 RACK_OPTS_INC(tcp_gp_inc_ca); 23173 ca = optval; 23174 if (ca < 100) { 23175 /* 23176 * We don't allow any 
reduction 23177 * over the GP b/w. 23178 */ 23179 error = EINVAL; 23180 break; 23181 } 23182 rack->r_ctl.rack_per_of_gp_ca = ca; 23183 rack_log_pacing_delay_calc(rack, 23184 rack->r_ctl.rack_per_of_gp_ss, 23185 rack->r_ctl.rack_per_of_gp_ca, 23186 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 23187 __LINE__, NULL, 0); 23188 break; 23189 case TCP_RACK_GP_INCREASE_SS: 23190 RACK_OPTS_INC(tcp_gp_inc_ss); 23191 ss = optval; 23192 if (ss < 100) { 23193 /* 23194 * We don't allow any reduction 23195 * over the GP b/w. 23196 */ 23197 error = EINVAL; 23198 break; 23199 } 23200 rack->r_ctl.rack_per_of_gp_ss = ss; 23201 rack_log_pacing_delay_calc(rack, 23202 rack->r_ctl.rack_per_of_gp_ss, 23203 rack->r_ctl.rack_per_of_gp_ca, 23204 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 23205 __LINE__, NULL, 0); 23206 break; 23207 case TCP_RACK_RR_CONF: 23208 RACK_OPTS_INC(tcp_rack_rrr_no_conf_rate); 23209 if (optval && optval <= 3) 23210 rack->r_rr_config = optval; 23211 else 23212 rack->r_rr_config = 0; 23213 break; 23214 case TCP_PACING_DND: /* URL:dnd */ 23215 if (optval > 0) 23216 rack->rc_pace_dnd = 1; 23217 else 23218 rack->rc_pace_dnd = 0; 23219 break; 23220 case TCP_HDWR_RATE_CAP: 23221 RACK_OPTS_INC(tcp_hdwr_rate_cap); 23222 if (optval) { 23223 if (rack->r_rack_hw_rate_caps == 0) 23224 rack->r_rack_hw_rate_caps = 1; 23225 else 23226 error = EALREADY; 23227 } else { 23228 rack->r_rack_hw_rate_caps = 0; 23229 } 23230 break; 23231 case TCP_RACK_SPLIT_LIMIT: 23232 RACK_OPTS_INC(tcp_split_limit); 23233 rack->r_ctl.rc_split_limit = optval; 23234 break; 23235 case TCP_BBR_HDWR_PACE: 23236 RACK_OPTS_INC(tcp_hdwr_pacing); 23237 if (optval){ 23238 if (rack->rack_hdrw_pacing == 0) { 23239 rack->rack_hdw_pace_ena = 1; 23240 rack->rack_attempt_hdwr_pace = 0; 23241 } else 23242 error = EALREADY; 23243 } else { 23244 rack->rack_hdw_pace_ena = 0; 23245 #ifdef RATELIMIT 23246 if (rack->r_ctl.crte != NULL) { 23247 rack->rack_hdrw_pacing = 0; 23248 rack->rack_attempt_hdwr_pace = 0; 23249 tcp_rel_pacing_rate(rack->r_ctl.crte, tp); 23250 rack->r_ctl.crte = NULL; 23251 } 23252 #endif 23253 } 23254 break; 23255 /* End Pacing related ones */ 23256 case TCP_RACK_PRR_SENDALOT: 23257 /* Allow PRR to send more than one seg */ 23258 RACK_OPTS_INC(tcp_rack_prr_sendalot); 23259 rack->r_ctl.rc_prr_sendalot = optval; 23260 break; 23261 case TCP_RACK_MIN_TO: 23262 /* Minimum time between rack t-o's in ms */ 23263 RACK_OPTS_INC(tcp_rack_min_to); 23264 rack->r_ctl.rc_min_to = optval; 23265 break; 23266 case TCP_RACK_EARLY_SEG: 23267 /* If early recovery max segments */ 23268 RACK_OPTS_INC(tcp_rack_early_seg); 23269 rack->r_ctl.rc_early_recovery_segs = optval; 23270 break; 23271 case TCP_RACK_ENABLE_HYSTART: 23272 { 23273 if (optval) { 23274 tp->t_ccv.flags |= CCF_HYSTART_ALLOWED; 23275 if (rack_do_hystart > RACK_HYSTART_ON) 23276 tp->t_ccv.flags |= CCF_HYSTART_CAN_SH_CWND; 23277 if (rack_do_hystart > RACK_HYSTART_ON_W_SC) 23278 tp->t_ccv.flags |= CCF_HYSTART_CONS_SSTH; 23279 } else { 23280 tp->t_ccv.flags &= ~(CCF_HYSTART_ALLOWED|CCF_HYSTART_CAN_SH_CWND|CCF_HYSTART_CONS_SSTH); 23281 } 23282 } 23283 break; 23284 case TCP_RACK_REORD_THRESH: 23285 /* RACK reorder threshold (shift amount) */ 23286 RACK_OPTS_INC(tcp_rack_reord_thresh); 23287 if ((optval > 0) && (optval < 31)) 23288 rack->r_ctl.rc_reorder_shift = optval; 23289 else 23290 error = EINVAL; 23291 break; 23292 case TCP_RACK_REORD_FADE: 23293 /* Does reordering fade after ms time */ 23294 RACK_OPTS_INC(tcp_rack_reord_fade); 23295 rack->r_ctl.rc_reorder_fade = optval; 23296 break; 23297 case 
TCP_RACK_TLP_THRESH: 23298 /* RACK TLP theshold i.e. srtt+(srtt/N) */ 23299 RACK_OPTS_INC(tcp_rack_tlp_thresh); 23300 if (optval) 23301 rack->r_ctl.rc_tlp_threshold = optval; 23302 else 23303 error = EINVAL; 23304 break; 23305 case TCP_BBR_USE_RACK_RR: 23306 RACK_OPTS_INC(tcp_rack_rr); 23307 if (optval) 23308 rack->use_rack_rr = 1; 23309 else 23310 rack->use_rack_rr = 0; 23311 break; 23312 case TCP_RACK_PKT_DELAY: 23313 /* RACK added ms i.e. rack-rtt + reord + N */ 23314 RACK_OPTS_INC(tcp_rack_pkt_delay); 23315 rack->r_ctl.rc_pkt_delay = optval; 23316 break; 23317 case TCP_DELACK: 23318 RACK_OPTS_INC(tcp_rack_delayed_ack); 23319 if (optval == 0) 23320 tp->t_delayed_ack = 0; 23321 else 23322 tp->t_delayed_ack = 1; 23323 if (tp->t_flags & TF_DELACK) { 23324 tp->t_flags &= ~TF_DELACK; 23325 tp->t_flags |= TF_ACKNOW; 23326 NET_EPOCH_ENTER(et); 23327 rack_output(tp); 23328 NET_EPOCH_EXIT(et); 23329 } 23330 break; 23331 23332 case TCP_BBR_RACK_RTT_USE: 23333 RACK_OPTS_INC(tcp_rack_rtt_use); 23334 if ((optval != USE_RTT_HIGH) && 23335 (optval != USE_RTT_LOW) && 23336 (optval != USE_RTT_AVG)) 23337 error = EINVAL; 23338 else 23339 rack->r_ctl.rc_rate_sample_method = optval; 23340 break; 23341 case TCP_DATA_AFTER_CLOSE: 23342 RACK_OPTS_INC(tcp_data_after_close); 23343 if (optval) 23344 rack->rc_allow_data_af_clo = 1; 23345 else 23346 rack->rc_allow_data_af_clo = 0; 23347 break; 23348 default: 23349 break; 23350 } 23351 tcp_log_socket_option(tp, sopt_name, optval, error); 23352 return (error); 23353 } 23354 23355 23356 static void 23357 rack_apply_deferred_options(struct tcp_rack *rack) 23358 { 23359 struct deferred_opt_list *dol, *sdol; 23360 uint32_t s_optval; 23361 23362 TAILQ_FOREACH_SAFE(dol, &rack->r_ctl.opt_list, next, sdol) { 23363 TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next); 23364 /* Disadvantage of deferal is you loose the error return */ 23365 s_optval = (uint32_t)dol->optval; 23366 (void)rack_process_option(rack->rc_tp, rack, dol->optname, s_optval, dol->optval, NULL); 23367 free(dol, M_TCPDO); 23368 } 23369 } 23370 23371 static void 23372 rack_hw_tls_change(struct tcpcb *tp, int chg) 23373 { 23374 /* Update HW tls state */ 23375 struct tcp_rack *rack; 23376 23377 rack = (struct tcp_rack *)tp->t_fb_ptr; 23378 if (chg) 23379 rack->r_ctl.fsb.hw_tls = 1; 23380 else 23381 rack->r_ctl.fsb.hw_tls = 0; 23382 } 23383 23384 static int 23385 rack_pru_options(struct tcpcb *tp, int flags) 23386 { 23387 if (flags & PRUS_OOB) 23388 return (EOPNOTSUPP); 23389 return (0); 23390 } 23391 23392 static bool 23393 rack_wake_check(struct tcpcb *tp) 23394 { 23395 struct tcp_rack *rack; 23396 struct timeval tv; 23397 uint32_t cts; 23398 23399 rack = (struct tcp_rack *)tp->t_fb_ptr; 23400 if (rack->r_ctl.rc_hpts_flags) { 23401 cts = tcp_get_usecs(&tv); 23402 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == PACE_PKT_OUTPUT){ 23403 /* 23404 * Pacing timer is up, check if we are ready. 23405 */ 23406 if (TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) 23407 return (true); 23408 } else if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) != 0) { 23409 /* 23410 * A timer is up, check if we are ready. 
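			 * (rack_wake_check() is wired up as
			 * tfb_early_wake_check below; it reports whether the
			 * pacing slot (rc_last_output_to) or a pending RACK
			 * timer (rc_timer_exp) has already expired at this
			 * timestamp.)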
23411 */ 23412 if (TSTMP_GEQ(cts, rack->r_ctl.rc_timer_exp)) 23413 return (true); 23414 } 23415 } 23416 return (false); 23417 } 23418 23419 static struct tcp_function_block __tcp_rack = { 23420 .tfb_tcp_block_name = __XSTRING(STACKNAME), 23421 .tfb_tcp_output = rack_output, 23422 .tfb_do_queued_segments = ctf_do_queued_segments, 23423 .tfb_do_segment_nounlock = rack_do_segment_nounlock, 23424 .tfb_tcp_do_segment = rack_do_segment, 23425 .tfb_tcp_ctloutput = rack_ctloutput, 23426 .tfb_tcp_fb_init = rack_init, 23427 .tfb_tcp_fb_fini = rack_fini, 23428 .tfb_tcp_timer_stop_all = rack_stopall, 23429 .tfb_tcp_rexmit_tmr = rack_remxt_tmr, 23430 .tfb_tcp_handoff_ok = rack_handoff_ok, 23431 .tfb_tcp_mtu_chg = rack_mtu_change, 23432 .tfb_pru_options = rack_pru_options, 23433 .tfb_hwtls_change = rack_hw_tls_change, 23434 .tfb_chg_query = rack_chg_query, 23435 .tfb_switch_failed = rack_switch_failed, 23436 .tfb_early_wake_check = rack_wake_check, 23437 .tfb_compute_pipe = rack_compute_pipe, 23438 .tfb_flags = TCP_FUNC_OUTPUT_CANDROP, 23439 }; 23440 23441 /* 23442 * rack_ctloutput() must drop the inpcb lock before performing copyin on 23443 * socket option arguments. When it re-acquires the lock after the copy, it 23444 * has to revalidate that the connection is still valid for the socket 23445 * option. 23446 */ 23447 static int 23448 rack_set_sockopt(struct tcpcb *tp, struct sockopt *sopt) 23449 { 23450 struct inpcb *inp = tptoinpcb(tp); 23451 #ifdef INET6 23452 struct ip6_hdr *ip6; 23453 int32_t mask, tclass; 23454 #endif 23455 #ifdef INET 23456 struct ip *ip; 23457 #endif 23458 struct tcp_rack *rack; 23459 struct tcp_hybrid_req hybrid; 23460 uint64_t loptval; 23461 int32_t error = 0, optval; 23462 23463 rack = (struct tcp_rack *)tp->t_fb_ptr; 23464 if (rack == NULL) { 23465 INP_WUNLOCK(inp); 23466 return (EINVAL); 23467 } 23468 #ifdef INET6 23469 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 23470 #endif 23471 #ifdef INET 23472 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 23473 #endif 23474 23475 switch (sopt->sopt_level) { 23476 #ifdef INET6 23477 case IPPROTO_IPV6: 23478 MPASS(inp->inp_vflag & INP_IPV6PROTO); 23479 switch (sopt->sopt_name) { 23480 case IPV6_USE_MIN_MTU: 23481 tcp6_use_min_mtu(tp); 23482 break; 23483 case IPV6_TCLASS: 23484 /* 23485 * The DSCP codepoint has changed, update the fsb 23486 * by overwriting any previous traffic class. 23487 */ 23488 if (inp->in6p_outputopts) { 23489 mask = 0xfc; 23490 tclass = inp->in6p_outputopts->ip6po_tclass; 23491 ip6->ip6_flow &= htonl((~mask) << 20); 23492 ip6->ip6_flow |= htonl((tclass & mask) << 20); 23493 } 23494 break; 23495 } 23496 INP_WUNLOCK(inp); 23497 return (0); 23498 #endif 23499 #ifdef INET 23500 case IPPROTO_IP: 23501 switch (sopt->sopt_name) { 23502 case IP_TOS: 23503 /* 23504 * The DSCP codepoint has changed, update the fsb. 23505 */ 23506 ip->ip_tos = rack->rc_inp->inp_ip_tos; 23507 break; 23508 case IP_TTL: 23509 /* 23510 * The TTL has changed, update the fsb. 23511 */ 23512 ip->ip_ttl = rack->rc_inp->inp_ip_ttl; 23513 break; 23514 } 23515 INP_WUNLOCK(inp); 23516 return (0); 23517 #endif 23518 #ifdef SO_PEERPRIO 23519 case SOL_SOCKET: 23520 switch (sopt->sopt_name) { 23521 case SO_PEERPRIO: /* SC-URL:bs */ 23522 /* Already read in and sanity checked in sosetopt(). 
*/ 23523 if (inp->inp_socket) { 23524 rack->client_bufferlvl = inp->inp_socket->so_peerprio; 23525 rack_client_buffer_level_set(rack); 23526 } 23527 break; 23528 } 23529 INP_WUNLOCK(inp); 23530 return (0); 23531 #endif 23532 case IPPROTO_TCP: 23533 switch (sopt->sopt_name) { 23534 case TCP_RACK_TLP_REDUCE: /* URL:tlp_reduce */ 23535 /* Pacing related ones */ 23536 case TCP_RACK_PACE_ALWAYS: /* URL:pace_always */ 23537 case TCP_BBR_RACK_INIT_RATE: /* URL:irate */ 23538 case TCP_BBR_IWINTSO: /* URL:tso_iwin */ 23539 case TCP_RACK_PACE_MIN_SEG: /* URL:pace_min_seg */ 23540 case TCP_RACK_PACE_MAX_SEG: /* URL:pace_max_seg */ 23541 case TCP_RACK_FORCE_MSEG: /* URL:force_max_seg */ 23542 case TCP_RACK_PACE_RATE_CA: /* URL:pr_ca */ 23543 case TCP_RACK_PACE_RATE_SS: /* URL:pr_ss*/ 23544 case TCP_RACK_PACE_RATE_REC: /* URL:pr_rec */ 23545 case TCP_RACK_GP_INCREASE_CA: /* URL:gp_inc_ca */ 23546 case TCP_RACK_GP_INCREASE_SS: /* URL:gp_inc_ss */ 23547 case TCP_RACK_GP_INCREASE_REC: /* URL:gp_inc_rec */ 23548 case TCP_RACK_RR_CONF: /* URL:rrr_conf */ 23549 case TCP_BBR_HDWR_PACE: /* URL:hdwrpace */ 23550 case TCP_HDWR_RATE_CAP: /* URL:hdwrcap boolean */ 23551 case TCP_PACING_RATE_CAP: /* URL:cap -- used by side-channel */ 23552 case TCP_HDWR_UP_ONLY: /* URL:uponly -- hardware pacing boolean */ 23553 case TCP_RACK_PACING_BETA: /* URL:pacing_beta */ 23554 case TCP_RACK_PACING_BETA_ECN: /* URL:pacing_beta_ecn */ 23555 case TCP_RACK_PACE_TO_FILL: /* URL:fillcw */ 23556 case TCP_RACK_DGP_IN_REC: /* URL:dgpinrec */ 23557 /* End pacing related */ 23558 case TCP_RXT_CLAMP: /* URL:rxtclamp */ 23559 case TCP_DELACK: /* URL:delack (in base TCP i.e. tcp_hints along with cc etc ) */ 23560 case TCP_RACK_PRR_SENDALOT: /* URL:prr_sendalot */ 23561 case TCP_RACK_MIN_TO: /* URL:min_to */ 23562 case TCP_RACK_EARLY_SEG: /* URL:early_seg */ 23563 case TCP_RACK_REORD_THRESH: /* URL:reord_thresh */ 23564 case TCP_RACK_REORD_FADE: /* URL:reord_fade */ 23565 case TCP_RACK_TLP_THRESH: /* URL:tlp_thresh */ 23566 case TCP_RACK_PKT_DELAY: /* URL:pkt_delay */ 23567 case TCP_RACK_TLP_USE: /* URL:tlp_use */ 23568 case TCP_BBR_RACK_RTT_USE: /* URL:rttuse */ 23569 case TCP_BBR_USE_RACK_RR: /* URL:rackrr */ 23570 case TCP_RACK_DO_DETECTION: /* URL:detect */ 23571 case TCP_NO_PRR: /* URL:noprr */ 23572 case TCP_TIMELY_DYN_ADJ: /* URL:dynamic */ 23573 case TCP_DATA_AFTER_CLOSE: /* no URL */ 23574 case TCP_RACK_NONRXT_CFG_RATE: /* URL:nonrxtcr */ 23575 case TCP_SHARED_CWND_ENABLE: /* URL:scwnd */ 23576 case TCP_RACK_MBUF_QUEUE: /* URL:mqueue */ 23577 case TCP_RACK_NO_PUSH_AT_MAX: /* URL:npush */ 23578 case TCP_SHARED_CWND_TIME_LIMIT: /* URL:lscwnd */ 23579 case TCP_RACK_PROFILE: /* URL:profile */ 23580 case TCP_HYBRID_PACING: /* URL:hybrid */ 23581 case TCP_USE_CMP_ACKS: /* URL:cmpack */ 23582 case TCP_RACK_ABC_VAL: /* URL:labc */ 23583 case TCP_REC_ABC_VAL: /* URL:reclabc */ 23584 case TCP_RACK_MEASURE_CNT: /* URL:measurecnt */ 23585 case TCP_DEFER_OPTIONS: /* URL:defer */ 23586 case TCP_RACK_DSACK_OPT: /* URL:dsack */ 23587 case TCP_RACK_TIMER_SLOP: /* URL:timer_slop */ 23588 case TCP_RACK_ENABLE_HYSTART: /* URL:hystart */ 23589 case TCP_RACK_SET_RXT_OPTIONS: /* URL:rxtsz */ 23590 case TCP_RACK_HI_BETA: /* URL:hibeta */ 23591 case TCP_RACK_SPLIT_LIMIT: /* URL:split */ 23592 case TCP_RACK_PACING_DIVISOR: /* URL:divisor */ 23593 case TCP_PACING_DND: /* URL:dnd */ 23594 goto process_opt; 23595 break; 23596 default: 23597 /* Filter off all unknown options to the base stack */ 23598 return (tcp_default_ctloutput(tp, sopt)); 23599 break; 23600 
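		/*
		 * Purely illustrative sketch of how an application hands one
		 * of the options handled in this switch to the stack;
		 * TCP_PACING_RATE_CAP is the one option taken as a 64-bit
		 * value (bytes per second, hence the 34Gbps truncation note
		 * in the copyin code below), the rest are plain ints.  For
		 * roughly 100 Mbit/s:
		 *
		 *	uint64_t cap = 12500000;
		 *
		 *	setsockopt(fd, IPPROTO_TCP, TCP_PACING_RATE_CAP,
		 *	    &cap, sizeof(cap));
		 */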
		}

	default:
		INP_WUNLOCK(inp);
		return (0);
	}
process_opt:
	INP_WUNLOCK(inp);
	if (sopt->sopt_name == TCP_PACING_RATE_CAP) {
		error = sooptcopyin(sopt, &loptval, sizeof(loptval), sizeof(loptval));
		/*
		 * We truncate it down to 32 bits for the socket-option trace;
		 * this means rates > 34Gbps won't show right, but that's
		 * probably OK.
		 */
		optval = (uint32_t)loptval;
	} else if (sopt->sopt_name == TCP_HYBRID_PACING) {
		error = sooptcopyin(sopt, &hybrid, sizeof(hybrid), sizeof(hybrid));
	} else {
		error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
		/* Save it in 64 bit form too */
		loptval = optval;
	}
	if (error)
		return (error);
	INP_WLOCK(inp);
	if (tp->t_fb != &__tcp_rack) {
		INP_WUNLOCK(inp);
		return (ENOPROTOOPT);
	}
	if (rack->defer_options && (rack->gp_ready == 0) &&
	    (sopt->sopt_name != TCP_DEFER_OPTIONS) &&
	    (sopt->sopt_name != TCP_HYBRID_PACING) &&
	    (sopt->sopt_name != TCP_RACK_PACING_BETA) &&
	    (sopt->sopt_name != TCP_RACK_SET_RXT_OPTIONS) &&
	    (sopt->sopt_name != TCP_RACK_PACING_BETA_ECN) &&
	    (sopt->sopt_name != TCP_RACK_MEASURE_CNT)) {
		/* Options are being deferred */
		if (rack_add_deferred_option(rack, sopt->sopt_name, loptval)) {
			INP_WUNLOCK(inp);
			return (0);
		} else {
			/* No memory to defer, fail */
			INP_WUNLOCK(inp);
			return (ENOMEM);
		}
	}
	error = rack_process_option(tp, rack, sopt->sopt_name, optval, loptval, &hybrid);
	INP_WUNLOCK(inp);
	return (error);
}

static void
rack_fill_info(struct tcpcb *tp, struct tcp_info *ti)
{

	INP_WLOCK_ASSERT(tptoinpcb(tp));
	bzero(ti, sizeof(*ti));

	ti->tcpi_state = tp->t_state;
	if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
		ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
	if (tp->t_flags & TF_SACK_PERMIT)
		ti->tcpi_options |= TCPI_OPT_SACK;
	if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
		ti->tcpi_options |= TCPI_OPT_WSCALE;
		ti->tcpi_snd_wscale = tp->snd_scale;
		ti->tcpi_rcv_wscale = tp->rcv_scale;
	}
	if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))
		ti->tcpi_options |= TCPI_OPT_ECN;
	if (tp->t_flags & TF_FASTOPEN)
		ti->tcpi_options |= TCPI_OPT_TFO;
	/* t_rcvtime is still kept in ticks */
	ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick;
	/* Since we hold everything in precise useconds this is easy */
	ti->tcpi_rtt = tp->t_srtt;
	ti->tcpi_rttvar = tp->t_rttvar;
	ti->tcpi_rto = tp->t_rxtcur;
	ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
	ti->tcpi_snd_cwnd = tp->snd_cwnd;
	/*
	 * FreeBSD-specific extension fields for tcp_info.
	 */
	ti->tcpi_rcv_space = tp->rcv_wnd;
	ti->tcpi_rcv_nxt = tp->rcv_nxt;
	ti->tcpi_snd_wnd = tp->snd_wnd;
	ti->tcpi_snd_bwnd = 0;		/* Unused, kept for compat. */
	ti->tcpi_snd_nxt = tp->snd_nxt;
	ti->tcpi_snd_mss = tp->t_maxseg;
	ti->tcpi_rcv_mss = tp->t_maxseg;
	ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
	ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
	ti->tcpi_snd_zerowin = tp->t_sndzerowin;
	ti->tcpi_total_tlp = tp->t_sndtlppack;
	ti->tcpi_total_tlp_bytes = tp->t_sndtlpbyte;
#ifdef NETFLIX_STATS
	memcpy(&ti->tcpi_rxsyninfo, &tp->t_rxsyninfo, sizeof(struct tcpsyninfo));
#endif
#ifdef TCP_OFFLOAD
	if (tp->t_flags & TF_TOE) {
		ti->tcpi_options |= TCPI_OPT_TOE;
		tcp_offload_tcp_info(tp, ti);
	}
#endif
}

static int
rack_get_sockopt(struct tcpcb *tp, struct sockopt *sopt)
{
	struct inpcb *inp = tptoinpcb(tp);
	struct tcp_rack *rack;
	int32_t error, optval;
	uint64_t val, loptval;
	struct tcp_info ti;
	/*
	 * Because all our options are either boolean or an int, we can just
	 * pull everything into optval and then unlock and copy. If we ever
	 * add an option that is not an int, then this will have quite an
	 * impact on this routine.
	 */
	error = 0;
	rack = (struct tcp_rack *)tp->t_fb_ptr;
	if (rack == NULL) {
		INP_WUNLOCK(inp);
		return (EINVAL);
	}
	switch (sopt->sopt_name) {
	case TCP_INFO:
		/* First get the info filled */
		rack_fill_info(tp, &ti);
		/* Fix up the rtt related fields if needed */
		INP_WUNLOCK(inp);
		error = sooptcopyout(sopt, &ti, sizeof ti);
		return (error);
	/*
	 * Beta is the congestion control value for NewReno that influences how
	 * much of a backoff happens when loss is detected. It is normally set
	 * to 50 for 50%, i.e. the cwnd is reduced to 50% of its previous value
	 * when you exit recovery.
	 */
	case TCP_RACK_PACING_BETA:
		if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0)
			error = EINVAL;
		else if (rack->rc_pacing_cc_set == 0)
			optval = rack->r_ctl.rc_saved_beta.beta;
		else {
			/*
			 * Reach out into the CC data and report back what
			 * I have previously set. Yeah it looks hackish but
			 * we don't want to report the saved values.
			 */
			if (tp->t_ccv.cc_data)
				optval = ((struct newreno *)tp->t_ccv.cc_data)->beta;
			else
				error = EINVAL;
		}
		break;
	/*
	 * Beta_ecn is the congestion control value for NewReno that influences
	 * how much of a backoff happens when an ECN mark is detected. It is
	 * normally set to 80 for 80%, i.e. the cwnd is reduced by 20% of its
	 * previous value when you exit recovery. Note that classic ECN has a
	 * beta of 50; it is only ABE ECN that uses this "less" value, but we
	 * do too with pacing :)
	 */

	case TCP_RACK_PACING_BETA_ECN:
		if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0)
			error = EINVAL;
		else if (rack->rc_pacing_cc_set == 0)
			optval = rack->r_ctl.rc_saved_beta.beta_ecn;
		else {
			/*
			 * Reach out into the CC data and report back what
			 * I have previously set. Yeah it looks hackish but
			 * we don't want to report the saved values.
			 */
			if (tp->t_ccv.cc_data)
				optval = ((struct newreno *)tp->t_ccv.cc_data)->beta_ecn;
			else
				error = EINVAL;
		}
		break;
	case TCP_RACK_DSACK_OPT:
		optval = 0;
		if (rack->rc_rack_tmr_std_based) {
			optval |= 1;
		}
		if (rack->rc_rack_use_dsack) {
			optval |= 2;
		}
		break;
	case TCP_RACK_ENABLE_HYSTART:
	{
		if (tp->t_ccv.flags & CCF_HYSTART_ALLOWED) {
			optval = RACK_HYSTART_ON;
			if (tp->t_ccv.flags & CCF_HYSTART_CAN_SH_CWND)
				optval = RACK_HYSTART_ON_W_SC;
			if (tp->t_ccv.flags & CCF_HYSTART_CONS_SSTH)
				optval = RACK_HYSTART_ON_W_SC_C;
		} else {
			optval = RACK_HYSTART_OFF;
		}
	}
	break;
	case TCP_RACK_DGP_IN_REC:
		optval = rack->r_ctl.full_dgp_in_rec;
		break;
	case TCP_RACK_HI_BETA:
		optval = rack->rack_hibeta;
		break;
	case TCP_RXT_CLAMP:
		optval = rack->r_ctl.saved_rxt_clamp_val;
		break;
	case TCP_DEFER_OPTIONS:
		optval = rack->defer_options;
		break;
	case TCP_RACK_MEASURE_CNT:
		optval = rack->r_ctl.req_measurements;
		break;
	case TCP_REC_ABC_VAL:
		optval = rack->r_use_labc_for_rec;
		break;
	case TCP_RACK_ABC_VAL:
		optval = rack->rc_labc;
		break;
	case TCP_HDWR_UP_ONLY:
		optval = rack->r_up_only;
		break;
	case TCP_PACING_RATE_CAP:
		loptval = rack->r_ctl.bw_rate_cap;
		break;
	case TCP_RACK_PROFILE:
		/* You cannot retrieve a profile, it is write-only */
		error = EINVAL;
		break;
	case TCP_HYBRID_PACING:
		/* You cannot retrieve hybrid pacing information, it is write-only */
		error = EINVAL;
		break;
	case TCP_USE_CMP_ACKS:
		optval = rack->r_use_cmp_ack;
		break;
	case TCP_RACK_PACE_TO_FILL:
		optval = rack->rc_pace_to_cwnd;
		if (optval && rack->r_fill_less_agg)
			optval++;
		break;
	case TCP_RACK_NO_PUSH_AT_MAX:
		optval = rack->r_ctl.rc_no_push_at_mrtt;
		break;
	case TCP_SHARED_CWND_ENABLE:
		optval = rack->rack_enable_scwnd;
		break;
	case TCP_RACK_NONRXT_CFG_RATE:
		optval = rack->rack_rec_nonrxt_use_cr;
		break;
	case TCP_NO_PRR:
		if (rack->rack_no_prr == 1)
			optval = 1;
		else if (rack->no_prr_addback == 1)
			optval = 2;
		else
			optval = 0;
		break;
	case TCP_RACK_DO_DETECTION:
		optval = rack->do_detection;
		break;
	case TCP_RACK_MBUF_QUEUE:
		/* Now do we use the LRO mbuf-queue feature */
		optval = rack->r_mbuf_queue;
		break;
	case TCP_TIMELY_DYN_ADJ:
		optval = rack->rc_gp_dyn_mul;
		break;
	case TCP_BBR_IWINTSO:
		optval = rack->rc_init_win;
		break;
	case TCP_RACK_TLP_REDUCE:
		/* RACK TLP cwnd reduction (bool) */
		optval = rack->r_ctl.rc_tlp_cwnd_reduce;
		break;
	case TCP_BBR_RACK_INIT_RATE:
		val = rack->r_ctl.init_rate;
		/* convert to kbits per sec */
		val *= 8;
		val /= 1000;
		optval = (uint32_t)val;
		break;
	case TCP_RACK_FORCE_MSEG:
		optval = rack->rc_force_max_seg;
		break;
	case TCP_RACK_PACE_MIN_SEG:
		optval = rack->r_ctl.rc_user_set_min_segs;
		break;
	case TCP_RACK_PACE_MAX_SEG:
		/* Max segments in a pace */
		optval = rack->rc_user_set_max_segs;
		break;
	case TCP_RACK_PACE_ALWAYS:
		/* Use the always pace method */
		optval = rack->rc_always_pace;
		break;
	case TCP_RACK_PRR_SENDALOT:
		/* Allow PRR to send more than one seg */
		optval = rack->r_ctl.rc_prr_sendalot;
		break;
	case TCP_RACK_MIN_TO:
		/* Minimum time between rack t-o's in ms */
		optval = rack->r_ctl.rc_min_to;
		break;
	case TCP_RACK_SPLIT_LIMIT:
		optval = rack->r_ctl.rc_split_limit;
		break;
	case TCP_RACK_EARLY_SEG:
		/* If early recovery max segments */
		optval = rack->r_ctl.rc_early_recovery_segs;
		break;
	case TCP_RACK_REORD_THRESH:
		/* RACK reorder threshold (shift amount) */
		optval = rack->r_ctl.rc_reorder_shift;
		break;
	case TCP_RACK_REORD_FADE:
		/* Does reordering fade after ms time */
		optval = rack->r_ctl.rc_reorder_fade;
		break;
	case TCP_BBR_USE_RACK_RR:
		/* Do we use the rack cheat for rxt */
		optval = rack->use_rack_rr;
		break;
	case TCP_RACK_RR_CONF:
		optval = rack->r_rr_config;
		break;
	case TCP_HDWR_RATE_CAP:
		optval = rack->r_rack_hw_rate_caps;
		break;
	case TCP_BBR_HDWR_PACE:
		optval = rack->rack_hdw_pace_ena;
		break;
	case TCP_RACK_TLP_THRESH:
		/* RACK TLP threshold i.e. srtt+(srtt/N) */
		optval = rack->r_ctl.rc_tlp_threshold;
		break;
	case TCP_RACK_PKT_DELAY:
		/* RACK added ms i.e. rack-rtt + reord + N */
		optval = rack->r_ctl.rc_pkt_delay;
		break;
	case TCP_RACK_TLP_USE:
		optval = rack->rack_tlp_threshold_use;
		break;
	case TCP_PACING_DND:
		optval = rack->rc_pace_dnd;
		break;
	case TCP_RACK_PACE_RATE_CA:
		optval = rack->r_ctl.rc_fixed_pacing_rate_ca;
		break;
	case TCP_RACK_PACE_RATE_SS:
		optval = rack->r_ctl.rc_fixed_pacing_rate_ss;
		break;
	case TCP_RACK_PACE_RATE_REC:
		optval = rack->r_ctl.rc_fixed_pacing_rate_rec;
		break;
	case TCP_RACK_GP_INCREASE_SS:
		optval = rack->r_ctl.rack_per_of_gp_ss;
		break;
	case TCP_RACK_GP_INCREASE_CA:
		optval = rack->r_ctl.rack_per_of_gp_ca;
		break;
	case TCP_RACK_PACING_DIVISOR:
		optval = rack->r_ctl.pace_len_divisor;
		break;
	case TCP_BBR_RACK_RTT_USE:
		optval = rack->r_ctl.rc_rate_sample_method;
		break;
	case TCP_DELACK:
		optval = tp->t_delayed_ack;
		break;
	case TCP_DATA_AFTER_CLOSE:
		optval = rack->rc_allow_data_af_clo;
		break;
	case TCP_SHARED_CWND_TIME_LIMIT:
		optval = rack->r_limit_scw;
		break;
	case TCP_RACK_TIMER_SLOP:
		optval = rack->r_ctl.timer_slop;
		break;
	default:
		return (tcp_default_ctloutput(tp, sopt));
		break;
	}
	INP_WUNLOCK(inp);
	if (error == 0) {
		/* Only TCP_PACING_RATE_CAP is a 64-bit value, copy it out as such */
		if (sopt->sopt_name == TCP_PACING_RATE_CAP)
			error = sooptcopyout(sopt, &loptval, sizeof loptval);
		else
			error = sooptcopyout(sopt, &optval, sizeof optval);
	}
	return (error);
}

static int
rack_ctloutput(struct tcpcb *tp, struct sockopt *sopt)
{
	if (sopt->sopt_dir == SOPT_SET) {
		return (rack_set_sockopt(tp, sopt));
	} else if (sopt->sopt_dir == SOPT_GET) {
		return (rack_get_sockopt(tp, sopt));
	} else {
		panic("%s: sopt_dir %d", __func__, sopt->sopt_dir);
	}
}

static const char *rack_stack_names[] = {
	__XSTRING(STACKNAME),
#ifdef STACKALIAS
	__XSTRING(STACKALIAS),
#endif
};

static int
rack_ctor(void *mem, int32_t size, void *arg, int32_t how)
{
	memset(mem, 0, size);
	return (0);
}

static void
rack_dtor(void *mem, int32_t size, void *arg)
{

}

static bool rack_mod_inited = false;

static int
tcp_addrack(module_t mod, int32_t type, void *data)
{
	int32_t err = 0;
	int num_stacks;

	switch (type) {
	case MOD_LOAD:
		rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map",
		    sizeof(struct rack_sendmap),
		    rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);

		rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb",
		    sizeof(struct tcp_rack),
		    rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);

		sysctl_ctx_init(&rack_sysctl_ctx);
		rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp),
		    OID_AUTO,
#ifdef STACKALIAS
		    __XSTRING(STACKALIAS),
#else
		    __XSTRING(STACKNAME),
#endif
		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
		    "");
		if (rack_sysctl_root == NULL) {
			printf("Failed to add sysctl node\n");
			err = EFAULT;
			goto free_uma;
		}
		rack_init_sysctls();
		num_stacks = nitems(rack_stack_names);
		err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK,
		    rack_stack_names, &num_stacks);
		if (err) {
			printf("Failed to register %s stack name for "
			    "%s module\n", rack_stack_names[num_stacks],
			    __XSTRING(MODNAME));
			sysctl_ctx_free(&rack_sysctl_ctx);
free_uma:
			uma_zdestroy(rack_zone);
			uma_zdestroy(rack_pcb_zone);
			rack_counter_destroy();
			printf("Failed to register rack module -- err:%d\n", err);
			return (err);
		}
		tcp_lro_reg_mbufq();
		rack_mod_inited = true;
		break;
	case MOD_QUIESCE:
		err = deregister_tcp_functions(&__tcp_rack, true, false);
		break;
	case MOD_UNLOAD:
		err = deregister_tcp_functions(&__tcp_rack, false, true);
		if (err == EBUSY)
			break;
		if (rack_mod_inited) {
			uma_zdestroy(rack_zone);
			uma_zdestroy(rack_pcb_zone);
			sysctl_ctx_free(&rack_sysctl_ctx);
			rack_counter_destroy();
			rack_mod_inited = false;
		}
		tcp_lro_dereg_mbufq();
		err = 0;
		break;
	default:
		return (EOPNOTSUPP);
	}
	return (err);
}

static moduledata_t tcp_rack = {
	.name = __XSTRING(MODNAME),
	.evhand = tcp_addrack,
	.priv = 0
};

MODULE_VERSION(MODNAME, 1);
DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1);

#endif /* #if defined(INET) || defined(INET6) */
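
/*
 * Illustrative userland sketch (not part of the module): how an application
 * might select this stack on a socket and then exercise one of the
 * RACK-specific options dispatched through rack_ctloutput() above. It assumes
 * the stack is already available (e.g. loaded with "kldload tcp_rack" or made
 * the default via the functions_default sysctl) and that <netinet/tcp.h>
 * provides TCP_FUNCTION_BLK, struct tcp_function_set and the TCP_RACK_*
 * option constants. The helper name use_rack_with_pacing() is made up for
 * the example.
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <string.h>
 *
 *	static int
 *	use_rack_with_pacing(int s)
 *	{
 *		struct tcp_function_set tfs;
 *		int on = 1;
 *
 *		memset(&tfs, 0, sizeof(tfs));
 *		strlcpy(tfs.function_set_name, "rack",
 *		    sizeof(tfs.function_set_name));
 *		// Switch this connection to the RACK stack.
 *		if (setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK,
 *		    &tfs, sizeof(tfs)) == -1)
 *			return (-1);
 *		// URL:pace_always -- ask the stack to pace every send.
 *		return (setsockopt(s, IPPROTO_TCP, TCP_RACK_PACE_ALWAYS,
 *		    &on, sizeof(on)));
 *	}
 */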