/*-
 * Copyright (c) 2016-2020 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"
#include "opt_ratelimit.h"
#include "opt_kern_tls.h"
#include <sys/param.h>
#include <sys/arb.h>
#include <sys/module.h>
#include <sys/kernel.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#ifdef STATS
#include <sys/qmath.h>
#include <sys/tree.h>
#include <sys/stats.h>		/* Must come after qmath.h and tree.h */
#else
#include <sys/tree.h>
#endif
#include <sys/refcount.h>
#include <sys/queue.h>
#include <sys/tim_filter.h>
#include <sys/smp.h>
#include <sys/kthread.h>
#include <sys/kern_prefetch.h>
#include <sys/protosw.h>
#ifdef TCP_ACCOUNTING
#include <sys/sched.h>
#include <machine/cpu.h>
#endif
#include <vm/uma.h>

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/vnet.h>

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#define	TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_ratelimit.h>
#include <netinet/tcp_accounting.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/cc/cc_newreno.h>
#include <netinet/tcp_fastopen.h>
#include <netinet/tcp_lro.h>
#ifdef NETFLIX_SHARED_CWND
#include <netinet/tcp_shared_cwnd.h>
#endif
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif				/* TCPDEBUG */
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif

#include <netipsec/ipsec_support.h>

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif				/* IPSEC */

#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif
#include "sack_filter.h"
#include "tcp_rack.h"
#include "rack_bbr_common.h"

uma_zone_t rack_zone;
uma_zone_t rack_pcb_zone;

#ifndef TICKS2SBT
#define	TICKS2SBT(__t)	(tick_sbt * ((sbintime_t)(__t)))
#endif

VNET_DECLARE(uint32_t, newreno_beta);
VNET_DECLARE(uint32_t, newreno_beta_ecn);
#define V_newreno_beta VNET(newreno_beta)
#define V_newreno_beta_ecn VNET(newreno_beta_ecn)


MALLOC_DEFINE(M_TCPFSB, "tcp_fsb", "TCP fast send block");
MALLOC_DEFINE(M_TCPDO, "tcp_do", "TCP deferred options");

struct sysctl_ctx_list rack_sysctl_ctx;
struct sysctl_oid *rack_sysctl_root;

#define CUM_ACKED 1
#define SACKED 2

/*
 * The RACK module incorporates a number of
 * TCP ideas that have been put out into the IETF
 * over the last few years:
 * - Matt Mathis's Rate Halving which slowly drops
 *   the congestion window so that the ack clock can
 *   be maintained during a recovery.
 * - Yuchung Cheng's RACK TCP (for which it is named) that
 *   will stop us using the number of dup acks and instead
 *   use time as the gauge of when we retransmit.
 * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
 *   of Dukkipati et.al.
 * RACK depends on SACK, so if an endpoint arrives that
 * cannot do SACK the state machine below will shuttle the
 * connection back to using the "default" TCP stack that is
 * in FreeBSD.
 *
 * To implement RACK the original TCP stack was first decomposed
 * into a functional state machine with individual states
 * for each of the possible TCP connection states. The do_segment
 * function's role in life is to mandate that the connection supports SACK
 * initially and then assure that the RACK state matches the connection
 * state before calling the state's do_segment function. Each
 * state is simplified due to the fact that the original do_segment
 * has been decomposed and we *know* what state we are in (no
 * switches on the state) and all tests for SACK are gone. This
 * greatly simplifies what each state does.
 *
 * TCP output is also over-written with a new version since it
 * must maintain the new rack scoreboard.
 *
 */
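/*
 * Tunable defaults. Most of the knobs below are exported read/write
 * as sysctls by rack_init_sysctls() later in this file, hung off of
 * rack_sysctl_root.
 */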
static int32_t rack_tlp_thresh = 1;
static int32_t rack_tlp_limit = 2;	/* No more than 2 TLPs w-out new data */
static int32_t rack_tlp_use_greater = 1;
static int32_t rack_reorder_thresh = 2;
static int32_t rack_reorder_fade = 60000000;	/* 0 - never fade, def 60,000,000
						 * - 60 seconds */
static uint8_t rack_req_measurements = 1;
/* Attack threshold detections */
static uint32_t rack_highest_sack_thresh_seen = 0;
static uint32_t rack_highest_move_thresh_seen = 0;
static int32_t rack_enable_hw_pacing = 0;	/* Due to CCSP keep it off by default */
static int32_t rack_hw_pace_extra_slots = 2;	/* 2 extra MSS time betweens */
static int32_t rack_hw_rate_caps = 1;		/* 1; */
static int32_t rack_hw_rate_min = 0;		/* 1500000;*/
static int32_t rack_hw_rate_to_low = 0;		/* 1200000; */
static int32_t rack_hw_up_only = 1;
static int32_t rack_stats_gets_ms_rtt = 1;
static int32_t rack_prr_addbackmax = 2;
static int32_t rack_do_hystart = 0;
static int32_t rack_apply_rtt_with_reduced_conf = 0;

static int32_t rack_pkt_delay = 1000;
static int32_t rack_send_a_lot_in_prr = 1;
static int32_t rack_min_to = 1000;	/* Number of microseconds min timeout */
static int32_t rack_verbose_logging = 0;
static int32_t rack_ignore_data_after_close = 1;
static int32_t rack_enable_shared_cwnd = 1;
static int32_t rack_use_cmp_acks = 1;
static int32_t rack_use_fsb = 1;
static int32_t rack_use_rfo = 1;
static int32_t rack_use_rsm_rfo = 1;
static int32_t rack_max_abc_post_recovery = 2;
static int32_t rack_client_low_buf = 0;
static int32_t rack_dsack_std_based = 0x3;	/* bit field; bit 1 sets rc_rack_tmr_std_based and bit 2 sets rc_rack_use_dsack */
#ifdef TCP_ACCOUNTING
static int32_t rack_tcp_accounting = 0;
#endif
static int32_t rack_limits_scwnd = 1;
static int32_t rack_enable_mqueue_for_nonpaced = 0;
static int32_t rack_disable_prr = 0;
static int32_t use_rack_rr = 1;
static int32_t rack_non_rxt_use_cr = 0;		/* does a non-rxt in recovery use the configured rate (ss/ca)? */
static int32_t rack_persist_min = 250000;	/* 250ms in usecs */
static int32_t rack_persist_max = 2000000;	/* 2 seconds in usecs */
static int32_t rack_sack_not_required = 1;	/* set to one to allow non-sack to use rack */
static int32_t rack_default_init_window = 0;	/* Use system default */
static int32_t rack_limit_time_with_srtt = 0;
static int32_t rack_autosndbuf_inc = 20;	/* In percentage form */
static int32_t rack_enobuf_hw_boost_mult = 2;	/* How many times the hw rate we boost slot using time_between */
static int32_t rack_enobuf_hw_max = 12000;	/* 12 ms in usecs */
static int32_t rack_enobuf_hw_min = 10000;	/* 10 ms in usecs */
static int32_t rack_hw_rwnd_factor = 2;		/* How many max_segs the rwnd must be before we hold off sending */
/*
 * Currently regular tcp has a rto_min of 30ms
 * the backoff goes 12 times so that ends up
 * being a total of 122.850 seconds before a
 * connection is killed.
 */
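/*
 * That is, assuming a doubling backoff across the 12 retransmits:
 * 30 ms * (2^12 - 1) = 30 ms * 4095 = 122.85 seconds.
 */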
static uint32_t rack_def_data_window = 20;
static uint32_t rack_goal_bdp = 2;
static uint32_t rack_min_srtts = 1;
static uint32_t rack_min_measure_usec = 0;
static int32_t rack_tlp_min = 10000;	/* 10ms */
static int32_t rack_rto_min = 30000;	/* 30,000 usec same as main freebsd */
static int32_t rack_rto_max = 4000000;	/* 4 seconds in usec's */
static const int32_t rack_free_cache = 2;
static int32_t rack_hptsi_segments = 40;
static int32_t rack_rate_sample_method = USE_RTT_LOW;
static int32_t rack_pace_every_seg = 0;
static int32_t rack_delayed_ack_time = 40000;	/* 40ms in usecs */
static int32_t rack_slot_reduction = 4;
static int32_t rack_wma_divisor = 8;		/* For WMA calculation */
static int32_t rack_cwnd_block_ends_measure = 0;
static int32_t rack_rwnd_block_ends_measure = 0;
static int32_t rack_def_profile = 0;

static int32_t rack_lower_cwnd_at_tlp = 0;
static int32_t rack_limited_retran = 0;
static int32_t rack_always_send_oldest = 0;
static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;

static uint16_t rack_per_of_gp_ss = 250;	/* 250 % slow-start */
static uint16_t rack_per_of_gp_ca = 200;	/* 200 % congestion-avoidance */
static uint16_t rack_per_of_gp_rec = 200;	/* 200 % of bw */

/* Probertt */
static uint16_t rack_per_of_gp_probertt = 60;	/* 60% of bw */
static uint16_t rack_per_of_gp_lowthresh = 40;	/* 40% is bottom */
static uint16_t rack_per_of_gp_probertt_reduce = 10;	/* 10% reduction */
static uint16_t rack_atexit_prtt_hbp = 130;	/* Clamp to 130% on exit prtt if highly buffered path */
static uint16_t rack_atexit_prtt = 130;		/* Clamp to 100% on exit prtt if non highly buffered path */

static uint32_t rack_max_drain_wait = 2;	/* How many gp srtt's before we give up draining */
static uint32_t rack_must_drain = 1;		/* How many GP srtt's we *must* wait */
static uint32_t rack_probertt_use_min_rtt_entry = 1;	/* Use the min to calculate the goal else gp_srtt */
static uint32_t rack_probertt_use_min_rtt_exit = 0;
static uint32_t rack_probe_rtt_sets_cwnd = 0;
static uint32_t rack_probe_rtt_safety_val = 2000000;	/* No more than 2 sec in probe-rtt */
static uint32_t rack_time_between_probertt = 9600000;	/* 9.6 sec in usecs */
static uint32_t rack_probertt_gpsrtt_cnt_mul = 0;	/* How many srtt periods does probe-rtt last, top fraction */
static uint32_t rack_probertt_gpsrtt_cnt_div = 0;	/* How many srtt periods does probe-rtt last, bottom fraction */
static uint32_t rack_min_probertt_hold = 40000;	/* Equal to delayed ack time */
static uint32_t rack_probertt_filter_life = 10000000;
static uint32_t rack_probertt_lower_within = 10;
static uint32_t rack_min_rtt_movement = 250000;	/* Must move at least 250ms (in microseconds) to count as a lowering */
static int32_t rack_pace_one_seg = 0;		/* Shall we pace for less than 1.4Meg 1MSS at a time */
static int32_t rack_probertt_clear_is = 1;
static int32_t rack_max_drain_hbp = 1;		/* Extra drain times gpsrtt for highly buffered paths */
static int32_t rack_hbp_thresh = 3;		/* what is the divisor max_rtt/min_rtt to decide a hbp */

/* Part of pacing */
static int32_t rack_max_per_above = 30;		/* When we go to increment stop if above 100+this% */

/* Timely information */
/* Combining these two gives the range of 'no change' to bw */
/* ie the up/down provide the upper and lower bound */
static int32_t rack_gp_per_bw_mul_up = 2;	/* 2% */
static int32_t rack_gp_per_bw_mul_down = 4;	/* 4% */
static int32_t rack_gp_rtt_maxmul = 3;		/* 3 x maxmin */
static int32_t rack_gp_rtt_minmul = 1;		/* minrtt + (minrtt/mindiv) is lower rtt */
static int32_t rack_gp_rtt_mindiv = 4;		/* minrtt + (minrtt * minmul/mindiv) is lower rtt */
static int32_t rack_gp_decrease_per = 20;	/* 20% decrease in multiplier */
static int32_t rack_gp_increase_per = 2;	/* 2% increase in multiplier */
static int32_t rack_per_lower_bound = 50;	/* Don't allow to drop below this multiplier */
static int32_t rack_per_upper_bound_ss = 0;	/* Don't allow SS to grow above this */
static int32_t rack_per_upper_bound_ca = 0;	/* Don't allow CA to grow above this */
static int32_t rack_do_dyn_mul = 0;		/* Are the rack gp multipliers dynamic */
static int32_t rack_gp_no_rec_chg = 1;		/* Prohibit recovery from reducing its multiplier */
static int32_t rack_timely_dec_clear = 6;	/* Do we clear decrement count at a value (6)? */
static int32_t rack_timely_max_push_rise = 3;	/* One round of pushing */
static int32_t rack_timely_max_push_drop = 3;	/* Three rounds of pushing */
static int32_t rack_timely_min_segs = 4;	/* 4 segment minimum */
static int32_t rack_use_max_for_nobackoff = 0;
static int32_t rack_timely_int_timely_only = 0;	/* do interim timely's only use the timely algo (no b/w changes)? */
static int32_t rack_timely_no_stopping = 0;
static int32_t rack_down_raise_thresh = 100;
static int32_t rack_req_segs = 1;
static uint64_t rack_bw_rate_cap = 0;

/* Weird delayed ack mode */
static int32_t rack_use_imac_dack = 0;
/* Rack specific counters */
counter_u64_t rack_badfr;
counter_u64_t rack_badfr_bytes;
counter_u64_t rack_rtm_prr_retran;
counter_u64_t rack_rtm_prr_newdata;
counter_u64_t rack_timestamp_mismatch;
counter_u64_t rack_reorder_seen;
counter_u64_t rack_paced_segments;
counter_u64_t rack_unpaced_segments;
counter_u64_t rack_calc_zero;
counter_u64_t rack_calc_nonzero;
counter_u64_t rack_saw_enobuf;
counter_u64_t rack_saw_enobuf_hw;
counter_u64_t rack_saw_enetunreach;
counter_u64_t rack_per_timer_hole;
counter_u64_t rack_large_ackcmp;
counter_u64_t rack_small_ackcmp;
counter_u64_t rack_persists_sends;
counter_u64_t rack_persists_acks;
counter_u64_t rack_persists_loss;
counter_u64_t rack_persists_lost_ends;
#ifdef INVARIANTS
counter_u64_t rack_adjust_map_bw;
#endif
/* Tail loss probe counters */
counter_u64_t rack_tlp_tot;
counter_u64_t rack_tlp_newdata;
counter_u64_t rack_tlp_retran;
counter_u64_t rack_tlp_retran_bytes;
counter_u64_t rack_tlp_retran_fail;
counter_u64_t rack_to_tot;
counter_u64_t rack_to_arm_rack;
counter_u64_t rack_to_arm_tlp;
counter_u64_t rack_hot_alloc;
counter_u64_t rack_to_alloc;
counter_u64_t rack_to_alloc_hard;
counter_u64_t rack_to_alloc_emerg;
counter_u64_t rack_to_alloc_limited;
counter_u64_t rack_alloc_limited_conns;
counter_u64_t rack_split_limited;

#define MAX_NUM_OF_CNTS 13
counter_u64_t rack_proc_comp_ack[MAX_NUM_OF_CNTS];
counter_u64_t rack_multi_single_eq;
counter_u64_t rack_proc_non_comp_ack;

counter_u64_t rack_fto_send;
counter_u64_t rack_fto_rsm_send;
counter_u64_t rack_nfto_resend;
counter_u64_t rack_non_fto_send;
counter_u64_t rack_extended_rfo;

counter_u64_t rack_sack_proc_all;
counter_u64_t rack_sack_proc_short;
counter_u64_t rack_sack_proc_restart;
counter_u64_t rack_sack_attacks_detected;
counter_u64_t rack_sack_attacks_reversed;
counter_u64_t rack_sack_used_next_merge;
counter_u64_t rack_sack_splits;
counter_u64_t rack_sack_used_prev_merge;
counter_u64_t rack_sack_skipped_acked;
counter_u64_t rack_ack_total;
counter_u64_t rack_express_sack;
counter_u64_t rack_sack_total;
counter_u64_t rack_move_none;
counter_u64_t rack_move_some;

counter_u64_t rack_used_tlpmethod;
counter_u64_t rack_used_tlpmethod2;
counter_u64_t rack_enter_tlp_calc;
counter_u64_t rack_input_idle_reduces;
counter_u64_t rack_collapsed_win;
counter_u64_t rack_tlp_does_nada;
counter_u64_t rack_try_scwnd;
counter_u64_t rack_hw_pace_init_fail;
counter_u64_t rack_hw_pace_lost;
counter_u64_t rack_sbsndptr_right;
counter_u64_t rack_sbsndptr_wrong;

/* Temp CPU counters */
counter_u64_t rack_find_high;

counter_u64_t rack_progress_drops;
counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];


#define	RACK_REXMTVAL(tp) max(rack_rto_min, ((tp)->t_srtt + ((tp)->t_rttvar << 2)))

#define	RACK_TCPT_RANGESET(tv, value, tvmin, tvmax, slop) do {	\
	(tv) = (value) + slop;	 \
	if ((u_long)(tv) < (u_long)(tvmin)) \
		(tv) = (tvmin); \
	if ((u_long)(tv) > (u_long)(tvmax)) \
		(tv) = (tvmax); \
} while (0)

static void
rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line);

static int
rack_process_ack(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to,
    uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
static int
rack_process_data(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static void
rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
    uint32_t th_ack, uint16_t nsegs, uint16_t type, int32_t recovery);
static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack,
    uint8_t limit_type);
static struct rack_sendmap *
rack_check_recovery_mode(struct tcpcb *tp,
    uint32_t tsused);
static void
rack_cong_signal(struct tcpcb *tp,
    uint32_t type, uint32_t ack);
static void rack_counter_destroy(void);
static int
rack_ctloutput(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp);
static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
static void
rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override);
static void
rack_do_segment(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
    uint8_t iptos);
static void rack_dtor(void *mem, int32_t size, void *arg);
static void
rack_log_alt_to_to_cancel(struct tcp_rack *rack,
    uint32_t flex1, uint32_t flex2,
    uint32_t flex3, uint32_t flex4,
    uint32_t flex5, uint32_t flex6,
    uint16_t flex7, uint8_t mod);

static void
rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
    uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line,
    struct rack_sendmap *rsm, uint8_t quality);
static struct rack_sendmap *
rack_find_high_nonack(struct tcp_rack *rack,
    struct rack_sendmap *rsm);
static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
static int
rack_get_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
static void
rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
    tcp_seq th_ack, int line, uint8_t quality);
static uint32_t
rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss);
static int32_t rack_handoff_ok(struct tcpcb *tp);
static int32_t rack_init(struct tcpcb *tp);
static void rack_init_sysctls(void);
static void
rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
    struct tcphdr *th, int entered_rec, int dup_ack_struck);
static void
rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
    uint32_t seq_out, uint8_t th_flags, int32_t err, uint64_t ts,
    struct rack_sendmap *hintrsm, uint16_t add_flags, struct mbuf *s_mb, uint32_t s_moff, int hw_tls);

static void
rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm);
static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm);
static int32_t rack_output(struct tcpcb *tp);

static uint32_t
rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
    struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
    uint32_t cts, int *moved_two);
static void rack_post_recovery(struct tcpcb *tp, uint32_t th_seq);
static void rack_remxt_tmr(struct tcpcb *tp);
static int
rack_set_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
static int32_t rack_stopall(struct tcpcb *tp);
static void
rack_timer_activate(struct tcpcb *tp, uint32_t timer_type,
    uint32_t delta);
static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
static uint32_t
rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint16_t add_flag);
static void
rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag);
static int
rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack);
static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
static int
rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_closing(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_established(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_lastack(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
struct rack_sendmap *
tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
    uint32_t tsused);
static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt,
    uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt);
static void
tcp_rack_partialack(struct tcpcb *tp);
static int
rack_set_profile(struct tcp_rack *rack, int prof);
static void
rack_apply_deferred_options(struct tcp_rack *rack);

int32_t rack_clear_counter = 0;

static void
rack_set_cc_pacing(struct tcp_rack *rack)
{
	struct sockopt sopt;
	struct cc_newreno_opts opt;
	struct newreno old, *ptr;
	struct tcpcb *tp;
	int error;

	if (rack->rc_pacing_cc_set)
		return;

	tp = rack->rc_tp;
	if (tp->cc_algo == NULL) {
		/* Tcb is leaving */
		printf("No cc algorithm?\n");
		return;
	}
	rack->rc_pacing_cc_set = 1;
	if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
		/* Not new-reno we can't play games with beta! */
		goto out;
	}
	ptr = ((struct newreno *)tp->ccv->cc_data);
	if (CC_ALGO(tp)->ctl_output == NULL) {
		/* Huh, why does new_reno no longer have a set function? */
		goto out;
	}
	if (ptr == NULL) {
		/* Just the default values */
		old.beta = V_newreno_beta;
		old.beta_ecn = V_newreno_beta_ecn;
		old.newreno_flags = 0;
	} else {
		old.beta = ptr->beta;
		old.beta_ecn = ptr->beta_ecn;
		old.newreno_flags = ptr->newreno_flags;
	}
	sopt.sopt_valsize = sizeof(struct cc_newreno_opts);
	sopt.sopt_dir = SOPT_SET;
	opt.name = CC_NEWRENO_BETA;
	opt.val = rack->r_ctl.rc_saved_beta.beta;
	error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
	if (error) {
		goto out;
	}
	/*
	 * Hack alert, we need to set in our newreno_flags
	 * so that ABE behavior is also applied.
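	 * (ABE, RFC 8511: with CC_NEWRENO_BETA_ECN_ENABLED set, NewReno
	 * backs off using beta_ecn rather than beta when the congestion
	 * signal comes from ECN.)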
	 */
	((struct newreno *)tp->ccv->cc_data)->newreno_flags |= CC_NEWRENO_BETA_ECN_ENABLED;
	opt.name = CC_NEWRENO_BETA_ECN;
	opt.val = rack->r_ctl.rc_saved_beta.beta_ecn;
	error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
	if (error) {
		goto out;
	}
	/* Save off the original values for restoral */
	memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno));
out:
	if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		ptr = ((struct newreno *)tp->ccv->cc_data);
		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		if (ptr) {
			log.u_bbr.flex1 = ptr->beta;
			log.u_bbr.flex2 = ptr->beta_ecn;
			log.u_bbr.flex3 = ptr->newreno_flags;
		}
		log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta;
		log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn;
		log.u_bbr.flex6 = rack->r_ctl.rc_saved_beta.newreno_flags;
		log.u_bbr.flex7 = rack->gp_ready;
		log.u_bbr.flex7 <<= 1;
		log.u_bbr.flex7 |= rack->use_fixed_rate;
		log.u_bbr.flex7 <<= 1;
		log.u_bbr.flex7 |= rack->rc_pacing_cc_set;
		log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
		log.u_bbr.flex8 = 3;
		tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, error,
		    0, &log, false, NULL, NULL, 0, &tv);
	}
}

static void
rack_undo_cc_pacing(struct tcp_rack *rack)
{
	struct newreno old, *ptr;
	struct tcpcb *tp;

	if (rack->rc_pacing_cc_set == 0)
		return;
	tp = rack->rc_tp;
	rack->rc_pacing_cc_set = 0;
	if (tp->cc_algo == NULL)
		/* Tcb is leaving */
		return;
	if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
		/* Not new-reno nothing to do! */
		return;
	}
	ptr = ((struct newreno *)tp->ccv->cc_data);
	if (ptr == NULL) {
		/*
		 * This happens at rack_fini() if the
		 * cc module gets freed on us. In that
		 * case we lose our "new" settings but
		 * that's ok, since the tcb is going away anyway.
		 */
		return;
	}
	/* Grab out our set values */
	memcpy(&old, ptr, sizeof(struct newreno));
	/* Copy back in the original values */
	memcpy(ptr, &rack->r_ctl.rc_saved_beta, sizeof(struct newreno));
	/* Now save back the values we had set in (for when pacing is restored) */
	memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno));
	if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		ptr = ((struct newreno *)tp->ccv->cc_data);
		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		log.u_bbr.flex1 = ptr->beta;
		log.u_bbr.flex2 = ptr->beta_ecn;
		log.u_bbr.flex3 = ptr->newreno_flags;
		log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta;
		log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn;
		log.u_bbr.flex6 = rack->r_ctl.rc_saved_beta.newreno_flags;
		log.u_bbr.flex7 = rack->gp_ready;
		log.u_bbr.flex7 <<= 1;
		log.u_bbr.flex7 |= rack->use_fixed_rate;
		log.u_bbr.flex7 <<= 1;
		log.u_bbr.flex7 |= rack->rc_pacing_cc_set;
		log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
		log.u_bbr.flex8 = 4;
		tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
		    0, &log, false, NULL, NULL, 0, &tv);
	}
}

#ifdef NETFLIX_PEAKRATE
static inline void
rack_update_peakrate_thr(struct tcpcb *tp)
{
	/* Keep in mind that t_maxpeakrate is in B/s. */
	uint64_t peak;
	peak = uqmax((tp->t_maxseg * 2),
		     (((uint64_t)tp->t_maxpeakrate * (uint64_t)(tp->t_srtt)) / (uint64_t)HPTS_USEC_IN_SEC));
	tp->t_peakrate_thr = (uint32_t)uqmin(peak, UINT32_MAX);
}
#endif

static int
sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
{
	uint32_t stat;
	int32_t error;
	int i;

	error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
	if (error || req->newptr == NULL)
		return error;

	error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
	if (error)
		return (error);
	if (stat == 1) {
#ifdef INVARIANTS
		printf("Clearing RACK counters\n");
#endif
		counter_u64_zero(rack_badfr);
		counter_u64_zero(rack_badfr_bytes);
		counter_u64_zero(rack_rtm_prr_retran);
		counter_u64_zero(rack_rtm_prr_newdata);
		counter_u64_zero(rack_timestamp_mismatch);
		counter_u64_zero(rack_reorder_seen);
		counter_u64_zero(rack_tlp_tot);
		counter_u64_zero(rack_tlp_newdata);
		counter_u64_zero(rack_tlp_retran);
		counter_u64_zero(rack_tlp_retran_bytes);
		counter_u64_zero(rack_tlp_retran_fail);
		counter_u64_zero(rack_to_tot);
		counter_u64_zero(rack_to_arm_rack);
		counter_u64_zero(rack_to_arm_tlp);
		counter_u64_zero(rack_paced_segments);
		counter_u64_zero(rack_calc_zero);
		counter_u64_zero(rack_calc_nonzero);
		counter_u64_zero(rack_unpaced_segments);
		counter_u64_zero(rack_saw_enobuf);
		counter_u64_zero(rack_saw_enobuf_hw);
		counter_u64_zero(rack_saw_enetunreach);
		counter_u64_zero(rack_per_timer_hole);
		counter_u64_zero(rack_large_ackcmp);
		counter_u64_zero(rack_small_ackcmp);
		counter_u64_zero(rack_persists_sends);
		counter_u64_zero(rack_persists_acks);
		counter_u64_zero(rack_persists_loss);
		counter_u64_zero(rack_persists_lost_ends);
#ifdef INVARIANTS
		counter_u64_zero(rack_adjust_map_bw);
#endif
		counter_u64_zero(rack_to_alloc_hard);
		counter_u64_zero(rack_to_alloc_emerg);
		counter_u64_zero(rack_sack_proc_all);
		counter_u64_zero(rack_fto_send);
		counter_u64_zero(rack_fto_rsm_send);
		counter_u64_zero(rack_extended_rfo);
		counter_u64_zero(rack_hw_pace_init_fail);
		counter_u64_zero(rack_hw_pace_lost);
		counter_u64_zero(rack_sbsndptr_wrong);
		counter_u64_zero(rack_sbsndptr_right);
		counter_u64_zero(rack_non_fto_send);
		counter_u64_zero(rack_nfto_resend);
		counter_u64_zero(rack_sack_proc_short);
		counter_u64_zero(rack_sack_proc_restart);
		counter_u64_zero(rack_to_alloc);
		counter_u64_zero(rack_to_alloc_limited);
		counter_u64_zero(rack_alloc_limited_conns);
		counter_u64_zero(rack_split_limited);
		for (i = 0; i < MAX_NUM_OF_CNTS; i++) {
			counter_u64_zero(rack_proc_comp_ack[i]);
		}
		counter_u64_zero(rack_multi_single_eq);
		counter_u64_zero(rack_proc_non_comp_ack);
		counter_u64_zero(rack_find_high);
		counter_u64_zero(rack_sack_attacks_detected);
		counter_u64_zero(rack_sack_attacks_reversed);
		counter_u64_zero(rack_sack_used_next_merge);
		counter_u64_zero(rack_sack_used_prev_merge);
		counter_u64_zero(rack_sack_splits);
		counter_u64_zero(rack_sack_skipped_acked);
		counter_u64_zero(rack_ack_total);
		counter_u64_zero(rack_express_sack);
		counter_u64_zero(rack_sack_total);
		counter_u64_zero(rack_move_none);
		counter_u64_zero(rack_move_some);
		counter_u64_zero(rack_used_tlpmethod);
		counter_u64_zero(rack_used_tlpmethod2);
		counter_u64_zero(rack_enter_tlp_calc);
		counter_u64_zero(rack_progress_drops);
		counter_u64_zero(rack_tlp_does_nada);
		counter_u64_zero(rack_try_scwnd);
		counter_u64_zero(rack_collapsed_win);
	}
	rack_clear_counter = 0;
	return (0);
}

static void
rack_init_sysctls(void)
{
	int i;
	struct sysctl_oid *rack_counters;
	struct sysctl_oid *rack_attack;
	struct sysctl_oid *rack_pacing;
	struct sysctl_oid *rack_timely;
	struct sysctl_oid *rack_timers;
	struct sysctl_oid *rack_tlp;
	struct sysctl_oid *rack_misc;
	struct sysctl_oid *rack_features;
	struct sysctl_oid *rack_measure;
	struct sysctl_oid *rack_probertt;
	struct sysctl_oid *rack_hw_pacing;

	rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "sack_attack",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Rack Sack Attack Counters and Controls");
	rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "stats",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Rack Counters");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "rate_sample_method", CTLFLAG_RW,
	    &rack_rate_sample_method, USE_RTT_LOW,
	    "What method should we use for rate sampling 0=high, 1=low");
	/* Probe rtt related controls */
	rack_probertt = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "probertt",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "ProbeRTT related Controls");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "exit_per_hpb", CTLFLAG_RW,
	    &rack_atexit_prtt_hbp, 130,
	    "What percentage above goodput do we clamp CA/SS to at exit on high-BDP path 110%");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "exit_per_nonhpb", CTLFLAG_RW,
	    &rack_atexit_prtt, 130,
	    "What percentage above goodput do we clamp CA/SS to at exit on a non high-BDP path 100%");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "gp_per_mul", CTLFLAG_RW,
	    &rack_per_of_gp_probertt, 60,
	    "What percentage of goodput do we pace at in probertt");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "gp_per_reduce", CTLFLAG_RW,
	    &rack_per_of_gp_probertt_reduce, 10,
	    "What percentage of goodput do we reduce every gp_srtt");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "gp_per_low", CTLFLAG_RW,
	    &rack_per_of_gp_lowthresh, 40,
	    "What percentage of goodput do we allow the multiplier to fall to");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "time_between", CTLFLAG_RW,
	    &rack_time_between_probertt, 96000000,
	    "How many useconds must pass since the lowest rtt fell before we enter probertt");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "safety", CTLFLAG_RW,
	    &rack_probe_rtt_safety_val, 2000000,
	    "If not zero, provides a maximum usecond that you can stay in probertt (2sec = 2000000)");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "sets_cwnd", CTLFLAG_RW,
	    &rack_probe_rtt_sets_cwnd, 0,
	    "Do we set the cwnd too (if always_lower is on)");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "maxdrainsrtts", CTLFLAG_RW,
	    &rack_max_drain_wait, 2,
	    "Maximum number of gp_srtt's to hold in drain waiting for flight to reach goal");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "mustdrainsrtts", CTLFLAG_RW,
	    &rack_must_drain, 1,
	    "We must drain this many gp_srtt's waiting for flight to reach goal");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "goal_use_min_entry", CTLFLAG_RW,
	    &rack_probertt_use_min_rtt_entry, 1,
	    "Should we use the min-rtt to calculate the goal rtt (else gp_srtt) at entry");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "goal_use_min_exit", CTLFLAG_RW,
	    &rack_probertt_use_min_rtt_exit, 0,
	    "How to set cwnd at exit, 0 - dynamic, 1 - use min-rtt, 2 - use curgprtt, 3 - entry gp-rtt");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "length_div", CTLFLAG_RW,
	    &rack_probertt_gpsrtt_cnt_div, 0,
	    "How many recent goodput srtt periods plus hold time does probertt last (bottom of fraction)");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "length_mul", CTLFLAG_RW,
	    &rack_probertt_gpsrtt_cnt_mul, 0,
	    "How many recent goodput srtt periods plus hold time does probertt last (top of fraction)");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "holdtim_at_target", CTLFLAG_RW,
	    &rack_min_probertt_hold, 200000,
	    "What is the minimum time we hold probertt at target");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "filter_life", CTLFLAG_RW,
	    &rack_probertt_filter_life, 10000000,
	    "What is the time for the filter's life in useconds");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "lower_within", CTLFLAG_RW,
	    &rack_probertt_lower_within, 10,
	    "If the rtt goes lower within this percentage of the time, go into probe-rtt");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "must_move", CTLFLAG_RW,
	    &rack_min_rtt_movement, 250,
	    "How much is the minimum movement in rtt to count as a drop for probertt purposes");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "clear_is_cnts", CTLFLAG_RW,
	    &rack_probertt_clear_is, 1,
	    "Do we clear I/S counts on exiting probe-rtt");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "hbp_extra_drain", CTLFLAG_RW,
	    &rack_max_drain_hbp, 1,
	    "How many extra drain gpsrtt's do we get in highly buffered paths");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "hbp_threshold", CTLFLAG_RW,
	    &rack_hbp_thresh, 3,
	    "We are highly buffered if max_rtt_seen / min_rtt_seen > this-threshold");
	/* Pacing related sysctls */
	rack_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "pacing",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Pacing related Controls");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "max_pace_over", CTLFLAG_RW,
	    &rack_max_per_above, 30,
	    "What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "pace_to_one", CTLFLAG_RW,
	    &rack_pace_one_seg, 0,
	    "Do we allow low b/w pacing of 1MSS instead of two");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "limit_wsrtt", CTLFLAG_RW,
	    &rack_limit_time_with_srtt, 0,
	    "Do we limit pacing time based on srtt");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "init_win", CTLFLAG_RW,
	    &rack_default_init_window, 0,
	    "Do we have a rack initial window 0 = system default");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "gp_per_ss", CTLFLAG_RW,
	    &rack_per_of_gp_ss, 250,
	    "If non zero, what percentage of goodput to pace at in slow start");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "gp_per_ca", CTLFLAG_RW,
	    &rack_per_of_gp_ca, 150,
	    "If non zero, what percentage of goodput to pace at in congestion avoidance");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "gp_per_rec", CTLFLAG_RW,
	    &rack_per_of_gp_rec, 200,
	    "If non zero, what percentage of goodput to pace at in recovery");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "pace_max_seg", CTLFLAG_RW,
	    &rack_hptsi_segments, 40,
	    "What size is the max for TSO segments in pacing and burst mitigation");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "burst_reduces", CTLFLAG_RW,
	    &rack_slot_reduction, 4,
	    "When doing only burst mitigation what is the reduce divisor");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "use_pacing", CTLFLAG_RW,
	    &rack_pace_every_seg, 0,
	    "If set we use pacing, if clear we use only the original burst mitigation");
	SYSCTL_ADD_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "rate_cap", CTLFLAG_RW,
	    &rack_bw_rate_cap, 0,
	    "If set we apply this value to the absolute rate cap used by pacing");
	SYSCTL_ADD_U8(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "req_measure_cnt", CTLFLAG_RW,
	    &rack_req_measurements, 1,
	    "If doing dynamic pacing, how many measurements must be in before we start pacing?");
	/* Hardware pacing */
	rack_hw_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "hdwr_pacing",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Hardware Pacing related Controls");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "rwnd_factor", CTLFLAG_RW,
	    &rack_hw_rwnd_factor, 2,
	    "How many times does snd_wnd need to be bigger than pace_max_seg so we will hold off and get more acks?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "pace_enobuf_mult", CTLFLAG_RW,
	    &rack_enobuf_hw_boost_mult, 2,
	    "By how many time_betweens should we boost the pacing time if we see an ENOBUFS?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "pace_enobuf_max", CTLFLAG_RW,
	    &rack_enobuf_hw_max, 2,
	    "What is the max boost to the pacing time if we see an ENOBUFS?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "pace_enobuf_min", CTLFLAG_RW,
	    &rack_enobuf_hw_min, 2,
	    "What is the min boost to the pacing time if we see an ENOBUFS?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "enable", CTLFLAG_RW,
	    &rack_enable_hw_pacing, 0,
	    "Should RACK attempt to use hw pacing?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "rate_cap", CTLFLAG_RW,
	    &rack_hw_rate_caps, 1,
	    "Does the highest hardware pacing rate cap the rate we will send at?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "rate_min", CTLFLAG_RW,
	    &rack_hw_rate_min, 0,
	    "Do we need a minimum estimate of this many bytes per second in order to engage hw pacing?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "rate_to_low", CTLFLAG_RW,
	    &rack_hw_rate_to_low, 0,
	    "If we fall below this rate, dis-engage hw pacing?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "up_only", CTLFLAG_RW,
	    &rack_hw_up_only, 1,
	    "Do we allow hw pacing to lower the rate selected?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "extra_mss_precise", CTLFLAG_RW,
	    &rack_hw_pace_extra_slots, 2,
	    "If the rates between software and hardware match precisely how many extra time_betweens do we get?");
	rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "timely",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Rack Timely RTT Controls");
	/* Timely based GP dynamics */
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "upper", CTLFLAG_RW,
	    &rack_gp_per_bw_mul_up, 2,
	    "Rack timely upper range for equal b/w (in percentage)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "lower", CTLFLAG_RW,
	    &rack_gp_per_bw_mul_down, 4,
	    "Rack timely lower range for equal b/w (in percentage)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "rtt_max_mul", CTLFLAG_RW,
	    &rack_gp_rtt_maxmul, 3,
	    "Rack timely multiplier of lowest rtt for rtt_max");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "rtt_min_div", CTLFLAG_RW,
	    &rack_gp_rtt_mindiv, 4,
	    "Rack timely divisor used for rtt + (rtt * mul/divisor) for check for lower rtt");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "rtt_min_mul", CTLFLAG_RW,
	    &rack_gp_rtt_minmul, 1,
	    "Rack timely multiplier used for rtt + (rtt * mul/divisor) for check for lower rtt");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "decrease", CTLFLAG_RW,
	    &rack_gp_decrease_per, 20,
	    "Rack timely decrease percentage of our GP multiplication factor");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "increase", CTLFLAG_RW,
	    &rack_gp_increase_per, 2,
	    "Rack timely increase percentage of our GP multiplication factor");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "lowerbound", CTLFLAG_RW,
	    &rack_per_lower_bound, 50,
	    "Rack timely lowest percentage we allow GP multiplier to fall to");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "upperboundss", CTLFLAG_RW,
	    &rack_per_upper_bound_ss, 0,
	    "Rack timely highest percentage we allow GP multiplier in SS to raise to (0 is no upperbound)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "upperboundca", CTLFLAG_RW,
	    &rack_per_upper_bound_ca, 0,
	    "Rack timely highest percentage we allow GP multiplier in CA to raise to (0 is no upperbound)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "dynamicgp", CTLFLAG_RW,
	    &rack_do_dyn_mul, 0,
	    "Rack timely do we enable dynamic timely goodput by default");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "no_rec_red", CTLFLAG_RW,
	    &rack_gp_no_rec_chg, 1,
	    "Rack timely do we prohibit the recovery multiplier from being lowered");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "red_clear_cnt", CTLFLAG_RW,
	    &rack_timely_dec_clear, 6,
	    "Rack timely what threshold do we count to before another boost during b/w descent");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "max_push_rise", CTLFLAG_RW,
	    &rack_timely_max_push_rise, 3,
	    "Rack timely how many times do we push up with b/w increase");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "max_push_drop", CTLFLAG_RW,
	    &rack_timely_max_push_drop, 3,
	    "Rack timely how many times do we push back on b/w descent");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "min_segs", CTLFLAG_RW,
	    &rack_timely_min_segs, 4,
	    "Rack timely when setting the cwnd what is the min num segments");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "noback_max", CTLFLAG_RW,
	    &rack_use_max_for_nobackoff, 0,
	    "Rack timely when deciding if to backoff on a loss, do we use under max rtt else min");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "interim_timely_only", CTLFLAG_RW,
	    &rack_timely_int_timely_only, 0,
	    "Rack timely when doing interim timely's do we only do timely (no b/w consideration)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "nonstop", CTLFLAG_RW,
	    &rack_timely_no_stopping, 0,
	    "Rack timely don't stop increase");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "dec_raise_thresh", CTLFLAG_RW,
	    &rack_down_raise_thresh, 100,
	    "If the CA or SS is below this threshold raise on the first 3 b/w lowers (0=always)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "bottom_drag_segs", CTLFLAG_RW,
	    &rack_req_segs, 1,
	    "Bottom dragging if not these many segments outstanding and room");

	/* TLP and Rack related parameters */
	rack_tlp = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "tlp",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "TLP and Rack related Controls");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "use_rrr", CTLFLAG_RW,
	    &use_rack_rr, 1,
	    "Do we use Rack Rapid Recovery");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "post_rec_labc", CTLFLAG_RW,
	    &rack_max_abc_post_recovery, 2,
	    "Since we do early recovery, do we override the l_abc to a value, if so what?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "nonrxt_use_cr", CTLFLAG_RW,
	    &rack_non_rxt_use_cr, 0,
	    "Do we use ss/ca rate if in recovery we are transmitting a new data chunk");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "tlpmethod", CTLFLAG_RW,
	    &rack_tlp_threshold_use, TLP_USE_TWO_ONE,
	    "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "limit", CTLFLAG_RW,
	    &rack_tlp_limit, 2,
	    "How many TLP's can be sent without sending new data");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "use_greater", CTLFLAG_RW,
	    &rack_tlp_use_greater, 1,
	    "Should we use the rack_rtt time if it's greater than srtt");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "tlpminto", CTLFLAG_RW,
	    &rack_tlp_min, 10000,
	    "TLP minimum timeout per the specification (in microseconds)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "send_oldest", CTLFLAG_RW,
	    &rack_always_send_oldest, 0,
	    "Should we always send the oldest TLP and RACK-TLP");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "rack_tlimit", CTLFLAG_RW,
	    &rack_limited_retran, 0,
	    "How many times can a rack timeout drive out sends");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
	    &rack_lower_cwnd_at_tlp, 0,
	    "When a TLP completes a retran should we enter recovery");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "reorder_thresh", CTLFLAG_RW,
	    &rack_reorder_thresh, 2,
	    "What factor for rack will be added when seeing reordering (shift right)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
	    &rack_tlp_thresh, 1,
	    "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "reorder_fade", CTLFLAG_RW,
	    &rack_reorder_fade, 60000000,
	    "Does reorder detection fade, if so how many microseconds (0 means never)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "pktdelay", CTLFLAG_RW,
	    &rack_pkt_delay, 1000,
	    "Extra RACK time (in microseconds) besides reordering thresh");

	/* Timer related controls */
	rack_timers = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "timers",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Timer related controls");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timers),
	    OID_AUTO, "persmin", CTLFLAG_RW,
	    &rack_persist_min, 250000,
	    "What is the minimum time in microseconds between persists");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timers),
	    OID_AUTO, "persmax", CTLFLAG_RW,
	    &rack_persist_max, 2000000,
	    "What is the largest delay in microseconds between persists");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timers),
	    OID_AUTO, "delayed_ack", CTLFLAG_RW,
	    &rack_delayed_ack_time, 40000,
	    "Delayed ack time (40ms in microseconds)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timers),
	    OID_AUTO, "minrto", CTLFLAG_RW,
	    &rack_rto_min, 30000,
	    "Minimum RTO in microseconds -- set with caution below 1000 due to TLP");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timers),
	    OID_AUTO, "maxrto", CTLFLAG_RW,
	    &rack_rto_max, 4000000,
	    "Maximum RTO in microseconds -- should be at least as large as min_rto");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timers),
	    OID_AUTO, "minto", CTLFLAG_RW,
	    &rack_min_to, 1000,
	    "Minimum rack timeout in microseconds");
microseconds"); 1331 /* Measure controls */ 1332 rack_measure = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1333 SYSCTL_CHILDREN(rack_sysctl_root), 1334 OID_AUTO, 1335 "measure", 1336 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1337 "Measure related controls"); 1338 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1339 SYSCTL_CHILDREN(rack_measure), 1340 OID_AUTO, "wma_divisor", CTLFLAG_RW, 1341 &rack_wma_divisor, 8, 1342 "When doing b/w calculation what is the divisor for the WMA"); 1343 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1344 SYSCTL_CHILDREN(rack_measure), 1345 OID_AUTO, "end_cwnd", CTLFLAG_RW, 1346 &rack_cwnd_block_ends_measure, 0, 1347 "Does a cwnd just-return end the measurement window (app limited)"); 1348 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1349 SYSCTL_CHILDREN(rack_measure), 1350 OID_AUTO, "end_rwnd", CTLFLAG_RW, 1351 &rack_rwnd_block_ends_measure, 0, 1352 "Does an rwnd just-return end the measurement window (app limited -- not persists)"); 1353 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1354 SYSCTL_CHILDREN(rack_measure), 1355 OID_AUTO, "min_target", CTLFLAG_RW, 1356 &rack_def_data_window, 20, 1357 "What is the minimum target window (in mss) for a GP measurements"); 1358 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1359 SYSCTL_CHILDREN(rack_measure), 1360 OID_AUTO, "goal_bdp", CTLFLAG_RW, 1361 &rack_goal_bdp, 2, 1362 "What is the goal BDP to measure"); 1363 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1364 SYSCTL_CHILDREN(rack_measure), 1365 OID_AUTO, "min_srtts", CTLFLAG_RW, 1366 &rack_min_srtts, 1, 1367 "What is the goal BDP to measure"); 1368 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1369 SYSCTL_CHILDREN(rack_measure), 1370 OID_AUTO, "min_measure_tim", CTLFLAG_RW, 1371 &rack_min_measure_usec, 0, 1372 "What is the Minimum time time for a measurement if 0, this is off"); 1373 /* Features */ 1374 rack_features = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1375 SYSCTL_CHILDREN(rack_sysctl_root), 1376 OID_AUTO, 1377 "features", 1378 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1379 "Feature controls"); 1380 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1381 SYSCTL_CHILDREN(rack_features), 1382 OID_AUTO, "cmpack", CTLFLAG_RW, 1383 &rack_use_cmp_acks, 1, 1384 "Should RACK have LRO send compressed acks"); 1385 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1386 SYSCTL_CHILDREN(rack_features), 1387 OID_AUTO, "fsb", CTLFLAG_RW, 1388 &rack_use_fsb, 1, 1389 "Should RACK use the fast send block?"); 1390 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1391 SYSCTL_CHILDREN(rack_features), 1392 OID_AUTO, "rfo", CTLFLAG_RW, 1393 &rack_use_rfo, 1, 1394 "Should RACK use rack_fast_output()?"); 1395 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1396 SYSCTL_CHILDREN(rack_features), 1397 OID_AUTO, "rsmrfo", CTLFLAG_RW, 1398 &rack_use_rsm_rfo, 1, 1399 "Should RACK use rack_fast_rsm_output()?"); 1400 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1401 SYSCTL_CHILDREN(rack_features), 1402 OID_AUTO, "non_paced_lro_queue", CTLFLAG_RW, 1403 &rack_enable_mqueue_for_nonpaced, 0, 1404 "Should RACK use mbuf queuing for non-paced connections"); 1405 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1406 SYSCTL_CHILDREN(rack_features), 1407 OID_AUTO, "hystartplusplus", CTLFLAG_RW, 1408 &rack_do_hystart, 0, 1409 "Should RACK enable HyStart++ on connections?"); 1410 /* Misc rack controls */ 1411 rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1412 SYSCTL_CHILDREN(rack_sysctl_root), 1413 OID_AUTO, 1414 "misc", 1415 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1416 "Misc related controls"); 1417 #ifdef TCP_ACCOUNTING 1418 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1419 SYSCTL_CHILDREN(rack_misc), 1420 OID_AUTO, "tcp_acct", CTLFLAG_RW, 1421 &rack_tcp_accounting, 0, 1422 "Should we turn on TCP accounting for all rack sessions?"); 
1423 #endif 1424 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1425 SYSCTL_CHILDREN(rack_misc), 1426 OID_AUTO, "apply_rtt_with_low_conf", CTLFLAG_RW, 1427 &rack_apply_rtt_with_reduced_conf, 0, 1428 "When a persist or keep-alive probe is not answered do we calculate rtt on subsequent answers?"); 1429 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1430 SYSCTL_CHILDREN(rack_misc), 1431 OID_AUTO, "rack_dsack_ctl", CTLFLAG_RW, 1432 &rack_dsack_std_based, 3, 1433 "How do we process dsack with respect to rack timers, bit field, 3 is standards based?"); 1434 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1435 SYSCTL_CHILDREN(rack_misc), 1436 OID_AUTO, "prr_addback_max", CTLFLAG_RW, 1437 &rack_prr_addbackmax, 2, 1438 "What is the maximum number of MSS we allow to be added back if prr can't send all its data?"); 1439 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1440 SYSCTL_CHILDREN(rack_misc), 1441 OID_AUTO, "stats_gets_ms", CTLFLAG_RW, 1442 &rack_stats_gets_ms_rtt, 1, 1443 "What do we feed the stats framework (1 = ms_rtt, 0 = us_rtt, 2 = ms_rtt from hdwr, > 2 usec rtt from hdwr)?"); 1444 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1445 SYSCTL_CHILDREN(rack_misc), 1446 OID_AUTO, "clientlowbuf", CTLFLAG_RW, 1447 &rack_client_low_buf, 0, 1448 "Client low buffer level (below this we are more aggressive in DGP exiting recovery (0 = off)?"); 1449 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1450 SYSCTL_CHILDREN(rack_misc), 1451 OID_AUTO, "defprofile", CTLFLAG_RW, 1452 &rack_def_profile, 0, 1453 "Should RACK use a default profile (0=no, num == profile num)?"); 1454 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1455 SYSCTL_CHILDREN(rack_misc), 1456 OID_AUTO, "shared_cwnd", CTLFLAG_RW, 1457 &rack_enable_shared_cwnd, 1, 1458 "Should RACK try to use the shared cwnd on connections where allowed"); 1459 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1460 SYSCTL_CHILDREN(rack_misc), 1461 OID_AUTO, "limits_on_scwnd", CTLFLAG_RW, 1462 &rack_limits_scwnd, 1, 1463 "Should RACK place low end time limits on the shared cwnd feature"); 1464 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1465 SYSCTL_CHILDREN(rack_misc), 1466 OID_AUTO, "iMac_dack", CTLFLAG_RW, 1467 &rack_use_imac_dack, 0, 1468 "Should RACK try to emulate iMac delayed ack"); 1469 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1470 SYSCTL_CHILDREN(rack_misc), 1471 OID_AUTO, "no_prr", CTLFLAG_RW, 1472 &rack_disable_prr, 0, 1473 "Should RACK not use prr and only pace (must have pacing on)"); 1474 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1475 SYSCTL_CHILDREN(rack_misc), 1476 OID_AUTO, "bb_verbose", CTLFLAG_RW, 1477 &rack_verbose_logging, 0, 1478 "Should RACK black box logging be verbose"); 1479 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1480 SYSCTL_CHILDREN(rack_misc), 1481 OID_AUTO, "data_after_close", CTLFLAG_RW, 1482 &rack_ignore_data_after_close, 1, 1483 "Do we hold off sending a RST until all pending data is ack'd"); 1484 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1485 SYSCTL_CHILDREN(rack_misc), 1486 OID_AUTO, "no_sack_needed", CTLFLAG_RW, 1487 &rack_sack_not_required, 1, 1488 "Do we allow rack to run on connections not supporting SACK"); 1489 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1490 SYSCTL_CHILDREN(rack_misc), 1491 OID_AUTO, "prr_sendalot", CTLFLAG_RW, 1492 &rack_send_a_lot_in_prr, 1, 1493 "Send a lot in prr"); 1494 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1495 SYSCTL_CHILDREN(rack_misc), 1496 OID_AUTO, "autoscale", CTLFLAG_RW, 1497 &rack_autosndbuf_inc, 20, 1498 "What percentage should rack scale up its snd buffer by?"); 1499 /* Sack Attacker detection stuff */ 1500 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1501 SYSCTL_CHILDREN(rack_attack), 1502 OID_AUTO, "detect_highsackratio", CTLFLAG_RW, 1503 &rack_highest_sack_thresh_seen, 
0,
	    "Highest sack to ack ratio seen");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_attack),
	    OID_AUTO, "detect_highmoveratio", CTLFLAG_RW,
	    &rack_highest_move_thresh_seen, 0,
	    "Highest move to non-move ratio seen");
	rack_ack_total = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_attack),
	    OID_AUTO, "acktotal", CTLFLAG_RD,
	    &rack_ack_total,
	    "Total number of ACKs");
	rack_express_sack = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_attack),
	    OID_AUTO, "exp_sacktotal", CTLFLAG_RD,
	    &rack_express_sack,
	    "Total number of express SACKs");
	rack_sack_total = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_attack),
	    OID_AUTO, "sacktotal", CTLFLAG_RD,
	    &rack_sack_total,
	    "Total number of SACKs");
	rack_move_none = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_attack),
	    OID_AUTO, "move_none", CTLFLAG_RD,
	    &rack_move_none,
	    "Total number of SACK index reuses of positions under the threshold");
	rack_move_some = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_attack),
	    OID_AUTO, "move_some", CTLFLAG_RD,
	    &rack_move_some,
	    "Total number of SACK index reuses of positions over the threshold");
	rack_sack_attacks_detected = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_attack),
	    OID_AUTO, "attacks", CTLFLAG_RD,
	    &rack_sack_attacks_detected,
	    "Total number of SACK attackers that had SACK disabled");
	rack_sack_attacks_reversed = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_attack),
	    OID_AUTO, "reversed", CTLFLAG_RD,
	    &rack_sack_attacks_reversed,
	    "Total number of SACK attackers that were later determined to be false positives");
	rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_attack),
	    OID_AUTO, "nextmerge", CTLFLAG_RD,
	    &rack_sack_used_next_merge,
	    "Total number of times we used the next merge");
	rack_sack_used_prev_merge = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_attack),
	    OID_AUTO, "prevmerge", CTLFLAG_RD,
	    &rack_sack_used_prev_merge,
	    "Total number of times we used the prev merge");
	/* Counters */
	rack_fto_send = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "fto_send", CTLFLAG_RD,
	    &rack_fto_send, "Total number of rack_fast_output sends");
	rack_fto_rsm_send = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "fto_rsm_send", CTLFLAG_RD,
	    &rack_fto_rsm_send, "Total number of rack_fast_rsm_output sends");
	rack_nfto_resend = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "nfto_resend", CTLFLAG_RD,
	    &rack_nfto_resend, "Total number of rack_output retransmissions");
	rack_non_fto_send = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "nfto_send", CTLFLAG_RD,
	    &rack_non_fto_send,
"Total number of rack_output first sends"); 1585 rack_extended_rfo = counter_u64_alloc(M_WAITOK); 1586 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1587 SYSCTL_CHILDREN(rack_counters), 1588 OID_AUTO, "rfo_extended", CTLFLAG_RD, 1589 &rack_extended_rfo, "Total number of times we extended rfo"); 1590 1591 rack_hw_pace_init_fail = counter_u64_alloc(M_WAITOK); 1592 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1593 SYSCTL_CHILDREN(rack_counters), 1594 OID_AUTO, "hwpace_init_fail", CTLFLAG_RD, 1595 &rack_hw_pace_init_fail, "Total number of times we failed to initialize hw pacing"); 1596 rack_hw_pace_lost = counter_u64_alloc(M_WAITOK); 1597 1598 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1599 SYSCTL_CHILDREN(rack_counters), 1600 OID_AUTO, "hwpace_lost", CTLFLAG_RD, 1601 &rack_hw_pace_lost, "Total number of times we failed to initialize hw pacing"); 1602 rack_badfr = counter_u64_alloc(M_WAITOK); 1603 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1604 SYSCTL_CHILDREN(rack_counters), 1605 OID_AUTO, "badfr", CTLFLAG_RD, 1606 &rack_badfr, "Total number of bad FRs"); 1607 rack_badfr_bytes = counter_u64_alloc(M_WAITOK); 1608 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1609 SYSCTL_CHILDREN(rack_counters), 1610 OID_AUTO, "badfr_bytes", CTLFLAG_RD, 1611 &rack_badfr_bytes, "Total number of bad FRs"); 1612 rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK); 1613 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1614 SYSCTL_CHILDREN(rack_counters), 1615 OID_AUTO, "prrsndret", CTLFLAG_RD, 1616 &rack_rtm_prr_retran, 1617 "Total number of prr based retransmits"); 1618 rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK); 1619 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1620 SYSCTL_CHILDREN(rack_counters), 1621 OID_AUTO, "prrsndnew", CTLFLAG_RD, 1622 &rack_rtm_prr_newdata, 1623 "Total number of prr based new transmits"); 1624 rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK); 1625 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1626 SYSCTL_CHILDREN(rack_counters), 1627 OID_AUTO, "tsnf", CTLFLAG_RD, 1628 &rack_timestamp_mismatch, 1629 "Total number of timestamps that we could not find the reported ts"); 1630 rack_find_high = counter_u64_alloc(M_WAITOK); 1631 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1632 SYSCTL_CHILDREN(rack_counters), 1633 OID_AUTO, "findhigh", CTLFLAG_RD, 1634 &rack_find_high, 1635 "Total number of FIN causing find-high"); 1636 rack_reorder_seen = counter_u64_alloc(M_WAITOK); 1637 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1638 SYSCTL_CHILDREN(rack_counters), 1639 OID_AUTO, "reordering", CTLFLAG_RD, 1640 &rack_reorder_seen, 1641 "Total number of times we added delay due to reordering"); 1642 rack_tlp_tot = counter_u64_alloc(M_WAITOK); 1643 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1644 SYSCTL_CHILDREN(rack_counters), 1645 OID_AUTO, "tlp_to_total", CTLFLAG_RD, 1646 &rack_tlp_tot, 1647 "Total number of tail loss probe expirations"); 1648 rack_tlp_newdata = counter_u64_alloc(M_WAITOK); 1649 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1650 SYSCTL_CHILDREN(rack_counters), 1651 OID_AUTO, "tlp_new", CTLFLAG_RD, 1652 &rack_tlp_newdata, 1653 "Total number of tail loss probe sending new data"); 1654 rack_tlp_retran = counter_u64_alloc(M_WAITOK); 1655 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1656 SYSCTL_CHILDREN(rack_counters), 1657 OID_AUTO, "tlp_retran", CTLFLAG_RD, 1658 &rack_tlp_retran, 1659 "Total number of tail loss probe sending retransmitted data"); 1660 rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK); 1661 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1662 SYSCTL_CHILDREN(rack_counters), 1663 OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD, 1664 
&rack_tlp_retran_bytes, 1665 "Total bytes of tail loss probe sending retransmitted data"); 1666 rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK); 1667 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1668 SYSCTL_CHILDREN(rack_counters), 1669 OID_AUTO, "tlp_retran_fail", CTLFLAG_RD, 1670 &rack_tlp_retran_fail, 1671 "Total number of tail loss probe sending retransmitted data that failed (wait for t3)"); 1672 rack_to_tot = counter_u64_alloc(M_WAITOK); 1673 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1674 SYSCTL_CHILDREN(rack_counters), 1675 OID_AUTO, "rack_to_tot", CTLFLAG_RD, 1676 &rack_to_tot, 1677 "Total number of times the rack to expired"); 1678 rack_to_arm_rack = counter_u64_alloc(M_WAITOK); 1679 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1680 SYSCTL_CHILDREN(rack_counters), 1681 OID_AUTO, "arm_rack", CTLFLAG_RD, 1682 &rack_to_arm_rack, 1683 "Total number of times the rack timer armed"); 1684 rack_to_arm_tlp = counter_u64_alloc(M_WAITOK); 1685 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1686 SYSCTL_CHILDREN(rack_counters), 1687 OID_AUTO, "arm_tlp", CTLFLAG_RD, 1688 &rack_to_arm_tlp, 1689 "Total number of times the tlp timer armed"); 1690 rack_calc_zero = counter_u64_alloc(M_WAITOK); 1691 rack_calc_nonzero = counter_u64_alloc(M_WAITOK); 1692 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1693 SYSCTL_CHILDREN(rack_counters), 1694 OID_AUTO, "calc_zero", CTLFLAG_RD, 1695 &rack_calc_zero, 1696 "Total number of times pacing time worked out to zero"); 1697 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1698 SYSCTL_CHILDREN(rack_counters), 1699 OID_AUTO, "calc_nonzero", CTLFLAG_RD, 1700 &rack_calc_nonzero, 1701 "Total number of times pacing time worked out to non-zero"); 1702 rack_paced_segments = counter_u64_alloc(M_WAITOK); 1703 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1704 SYSCTL_CHILDREN(rack_counters), 1705 OID_AUTO, "paced", CTLFLAG_RD, 1706 &rack_paced_segments, 1707 "Total number of times a segment send caused hptsi"); 1708 rack_unpaced_segments = counter_u64_alloc(M_WAITOK); 1709 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1710 SYSCTL_CHILDREN(rack_counters), 1711 OID_AUTO, "unpaced", CTLFLAG_RD, 1712 &rack_unpaced_segments, 1713 "Total number of times a segment did not cause hptsi"); 1714 rack_saw_enobuf = counter_u64_alloc(M_WAITOK); 1715 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1716 SYSCTL_CHILDREN(rack_counters), 1717 OID_AUTO, "saw_enobufs", CTLFLAG_RD, 1718 &rack_saw_enobuf, 1719 "Total number of times a sends returned enobuf for non-hdwr paced connections"); 1720 rack_saw_enobuf_hw = counter_u64_alloc(M_WAITOK); 1721 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1722 SYSCTL_CHILDREN(rack_counters), 1723 OID_AUTO, "saw_enobufs_hw", CTLFLAG_RD, 1724 &rack_saw_enobuf_hw, 1725 "Total number of times a send returned enobuf for hdwr paced connections"); 1726 rack_saw_enetunreach = counter_u64_alloc(M_WAITOK); 1727 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1728 SYSCTL_CHILDREN(rack_counters), 1729 OID_AUTO, "saw_enetunreach", CTLFLAG_RD, 1730 &rack_saw_enetunreach, 1731 "Total number of times a send received a enetunreachable"); 1732 rack_hot_alloc = counter_u64_alloc(M_WAITOK); 1733 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1734 SYSCTL_CHILDREN(rack_counters), 1735 OID_AUTO, "alloc_hot", CTLFLAG_RD, 1736 &rack_hot_alloc, 1737 "Total allocations from the top of our list"); 1738 rack_to_alloc = counter_u64_alloc(M_WAITOK); 1739 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1740 SYSCTL_CHILDREN(rack_counters), 1741 OID_AUTO, "allocs", CTLFLAG_RD, 1742 &rack_to_alloc, 1743 "Total allocations of tracking structures"); 1744 
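	/*
	 * Editorial note (not from the original author): every statistic in
	 * this function follows the same two-step pattern, a per-CPU
	 * counter(9) allocated with counter_u64_alloc(M_WAITOK) and then
	 * exported read-only under the rack_counters node, e.g. (with a
	 * hypothetical counter name used purely for illustration):
	 *
	 *	example_cnt = counter_u64_alloc(M_WAITOK);
	 *	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	 *	    SYSCTL_CHILDREN(rack_counters),
	 *	    OID_AUTO, "example", CTLFLAG_RD,
	 *	    &example_cnt, "An example counter");
	 *
	 * The matching counter_u64_free() calls live in
	 * rack_counter_destroy() below.
	 */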
rack_to_alloc_hard = counter_u64_alloc(M_WAITOK); 1745 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1746 SYSCTL_CHILDREN(rack_counters), 1747 OID_AUTO, "allochard", CTLFLAG_RD, 1748 &rack_to_alloc_hard, 1749 "Total allocations done with sleeping the hard way"); 1750 rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK); 1751 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1752 SYSCTL_CHILDREN(rack_counters), 1753 OID_AUTO, "allocemerg", CTLFLAG_RD, 1754 &rack_to_alloc_emerg, 1755 "Total allocations done from emergency cache"); 1756 rack_to_alloc_limited = counter_u64_alloc(M_WAITOK); 1757 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1758 SYSCTL_CHILDREN(rack_counters), 1759 OID_AUTO, "alloc_limited", CTLFLAG_RD, 1760 &rack_to_alloc_limited, 1761 "Total allocations dropped due to limit"); 1762 rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK); 1763 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1764 SYSCTL_CHILDREN(rack_counters), 1765 OID_AUTO, "alloc_limited_conns", CTLFLAG_RD, 1766 &rack_alloc_limited_conns, 1767 "Connections with allocations dropped due to limit"); 1768 rack_split_limited = counter_u64_alloc(M_WAITOK); 1769 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1770 SYSCTL_CHILDREN(rack_counters), 1771 OID_AUTO, "split_limited", CTLFLAG_RD, 1772 &rack_split_limited, 1773 "Split allocations dropped due to limit"); 1774 1775 for (i = 0; i < MAX_NUM_OF_CNTS; i++) { 1776 char name[32]; 1777 sprintf(name, "cmp_ack_cnt_%d", i); 1778 rack_proc_comp_ack[i] = counter_u64_alloc(M_WAITOK); 1779 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1780 SYSCTL_CHILDREN(rack_counters), 1781 OID_AUTO, name, CTLFLAG_RD, 1782 &rack_proc_comp_ack[i], 1783 "Number of compressed acks we processed"); 1784 } 1785 rack_large_ackcmp = counter_u64_alloc(M_WAITOK); 1786 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1787 SYSCTL_CHILDREN(rack_counters), 1788 OID_AUTO, "cmp_large_mbufs", CTLFLAG_RD, 1789 &rack_large_ackcmp, 1790 "Number of TCP connections with large mbuf's for compressed acks"); 1791 rack_persists_sends = counter_u64_alloc(M_WAITOK); 1792 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1793 SYSCTL_CHILDREN(rack_counters), 1794 OID_AUTO, "persist_sends", CTLFLAG_RD, 1795 &rack_persists_sends, 1796 "Number of times we sent a persist probe"); 1797 rack_persists_acks = counter_u64_alloc(M_WAITOK); 1798 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1799 SYSCTL_CHILDREN(rack_counters), 1800 OID_AUTO, "persist_acks", CTLFLAG_RD, 1801 &rack_persists_acks, 1802 "Number of times a persist probe was acked"); 1803 rack_persists_loss = counter_u64_alloc(M_WAITOK); 1804 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1805 SYSCTL_CHILDREN(rack_counters), 1806 OID_AUTO, "persist_loss", CTLFLAG_RD, 1807 &rack_persists_loss, 1808 "Number of times we detected a lost persist probe (no ack)"); 1809 rack_persists_lost_ends = counter_u64_alloc(M_WAITOK); 1810 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1811 SYSCTL_CHILDREN(rack_counters), 1812 OID_AUTO, "persist_loss_ends", CTLFLAG_RD, 1813 &rack_persists_lost_ends, 1814 "Number of lost persist probe (no ack) that the run ended with a PERSIST abort"); 1815 rack_small_ackcmp = counter_u64_alloc(M_WAITOK); 1816 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1817 SYSCTL_CHILDREN(rack_counters), 1818 OID_AUTO, "cmp_small_mbufs", CTLFLAG_RD, 1819 &rack_small_ackcmp, 1820 "Number of TCP connections with small mbuf's for compressed acks"); 1821 #ifdef INVARIANTS 1822 rack_adjust_map_bw = counter_u64_alloc(M_WAITOK); 1823 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1824 SYSCTL_CHILDREN(rack_counters), 1825 OID_AUTO, "map_adjust_req", 
CTLFLAG_RD,
	    &rack_adjust_map_bw,
	    "Number of times we hit the case where the sb went up and down on a sendmap entry");
#endif
	rack_multi_single_eq = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "cmp_ack_equiv", CTLFLAG_RD,
	    &rack_multi_single_eq,
	    "Total number of ACKs that compressed ack entries represented");
	rack_proc_non_comp_ack = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "cmp_ack_not", CTLFLAG_RD,
	    &rack_proc_non_comp_ack,
	    "Number of non-compressed acks that we processed");

	rack_sack_proc_all = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "sack_long", CTLFLAG_RD,
	    &rack_sack_proc_all,
	    "Total times we had to walk the whole list for sack processing");
	rack_sack_proc_restart = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "sack_restart", CTLFLAG_RD,
	    &rack_sack_proc_restart,
	    "Total times we had to walk the whole list due to a restart");
	rack_sack_proc_short = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "sack_short", CTLFLAG_RD,
	    &rack_sack_proc_short,
	    "Total times we took a shortcut for sack processing");
	rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "tlp_calc_entered", CTLFLAG_RD,
	    &rack_enter_tlp_calc,
	    "Total times we called calc-tlp");
	rack_used_tlpmethod = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "hit_tlp_method", CTLFLAG_RD,
	    &rack_used_tlpmethod,
	    "Total number of times we hit TLP method 1");
	rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "hit_tlp_method2", CTLFLAG_RD,
	    &rack_used_tlpmethod2,
	    "Total number of times we hit TLP method 2");
	rack_sack_skipped_acked = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_attack),
	    OID_AUTO, "skipacked", CTLFLAG_RD,
	    &rack_sack_skipped_acked,
	    "Total number of times we skipped previously sacked data");
	rack_sack_splits = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_attack),
	    OID_AUTO, "ofsplit", CTLFLAG_RD,
	    &rack_sack_splits,
	    "Total number of times we did the old-fashioned tree split");
	rack_progress_drops = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "prog_drops", CTLFLAG_RD,
	    &rack_progress_drops,
	    "Total number of progress drops");
	rack_input_idle_reduces = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD,
	    &rack_input_idle_reduces,
	    "Total number of idle reductions on input");
	rack_collapsed_win = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "collapsed_win", CTLFLAG_RD,
	    &rack_collapsed_win,
	    "Total number of
collapsed windows"); 1909 rack_tlp_does_nada = counter_u64_alloc(M_WAITOK); 1910 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1911 SYSCTL_CHILDREN(rack_counters), 1912 OID_AUTO, "tlp_nada", CTLFLAG_RD, 1913 &rack_tlp_does_nada, 1914 "Total number of nada tlp calls"); 1915 rack_try_scwnd = counter_u64_alloc(M_WAITOK); 1916 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1917 SYSCTL_CHILDREN(rack_counters), 1918 OID_AUTO, "tried_scwnd", CTLFLAG_RD, 1919 &rack_try_scwnd, 1920 "Total number of scwnd attempts"); 1921 1922 rack_per_timer_hole = counter_u64_alloc(M_WAITOK); 1923 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1924 SYSCTL_CHILDREN(rack_counters), 1925 OID_AUTO, "timer_hole", CTLFLAG_RD, 1926 &rack_per_timer_hole, 1927 "Total persists start in timer hole"); 1928 1929 rack_sbsndptr_wrong = counter_u64_alloc(M_WAITOK); 1930 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1931 SYSCTL_CHILDREN(rack_counters), 1932 OID_AUTO, "sndptr_wrong", CTLFLAG_RD, 1933 &rack_sbsndptr_wrong, "Total number of times the saved sbsndptr was incorret"); 1934 rack_sbsndptr_right = counter_u64_alloc(M_WAITOK); 1935 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1936 SYSCTL_CHILDREN(rack_counters), 1937 OID_AUTO, "sndptr_right", CTLFLAG_RD, 1938 &rack_sbsndptr_right, "Total number of times the saved sbsndptr was corret"); 1939 1940 COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); 1941 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 1942 OID_AUTO, "outsize", CTLFLAG_RD, 1943 rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes"); 1944 COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK); 1945 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 1946 OID_AUTO, "opts", CTLFLAG_RD, 1947 rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats"); 1948 SYSCTL_ADD_PROC(&rack_sysctl_ctx, 1949 SYSCTL_CHILDREN(rack_sysctl_root), 1950 OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 1951 &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters"); 1952 } 1953 1954 static __inline int 1955 rb_map_cmp(struct rack_sendmap *b, struct rack_sendmap *a) 1956 { 1957 if (SEQ_GEQ(b->r_start, a->r_start) && 1958 SEQ_LT(b->r_start, a->r_end)) { 1959 /* 1960 * The entry b is within the 1961 * block a. i.e.: 1962 * a -- |-------------| 1963 * b -- |----| 1964 * <or> 1965 * b -- |------| 1966 * <or> 1967 * b -- |-----------| 1968 */ 1969 return (0); 1970 } else if (SEQ_GEQ(b->r_start, a->r_end)) { 1971 /* 1972 * b falls as either the next 1973 * sequence block after a so a 1974 * is said to be smaller than b. 1975 * i.e: 1976 * a -- |------| 1977 * b -- |--------| 1978 * or 1979 * b -- |-----| 1980 */ 1981 return (1); 1982 } 1983 /* 1984 * Whats left is where a is 1985 * larger than b. i.e: 1986 * a -- |-------| 1987 * b -- |---| 1988 * or even possibly 1989 * b -- |--------------| 1990 */ 1991 return (-1); 1992 } 1993 1994 RB_PROTOTYPE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp); 1995 RB_GENERATE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp); 1996 1997 static uint32_t 1998 rc_init_window(struct tcp_rack *rack) 1999 { 2000 uint32_t win; 2001 2002 if (rack->rc_init_win == 0) { 2003 /* 2004 * Nothing set by the user, use the system stack 2005 * default. 
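	 *
	 * (Added illustration, not original commentary:) when the user has
	 * set rc_init_win the window computed below is simply a multiple of
	 * the fixed maximum segment size, e.g. rc_init_win = 20 with a
	 * 1448 byte MSS gives 20 * 1448 = 28960 bytes, while this zero case
	 * defers to the stack-wide initial window computation.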
2006 */ 2007 return (tcp_compute_initwnd(tcp_maxseg(rack->rc_tp))); 2008 } 2009 win = ctf_fixed_maxseg(rack->rc_tp) * rack->rc_init_win; 2010 return (win); 2011 } 2012 2013 static uint64_t 2014 rack_get_fixed_pacing_bw(struct tcp_rack *rack) 2015 { 2016 if (IN_FASTRECOVERY(rack->rc_tp->t_flags)) 2017 return (rack->r_ctl.rc_fixed_pacing_rate_rec); 2018 else if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) 2019 return (rack->r_ctl.rc_fixed_pacing_rate_ss); 2020 else 2021 return (rack->r_ctl.rc_fixed_pacing_rate_ca); 2022 } 2023 2024 static uint64_t 2025 rack_get_bw(struct tcp_rack *rack) 2026 { 2027 if (rack->use_fixed_rate) { 2028 /* Return the fixed pacing rate */ 2029 return (rack_get_fixed_pacing_bw(rack)); 2030 } 2031 if (rack->r_ctl.gp_bw == 0) { 2032 /* 2033 * We have yet no b/w measurement, 2034 * if we have a user set initial bw 2035 * return it. If we don't have that and 2036 * we have an srtt, use the tcp IW (10) to 2037 * calculate a fictional b/w over the SRTT 2038 * which is more or less a guess. Note 2039 * we don't use our IW from rack on purpose 2040 * so if we have like IW=30, we are not 2041 * calculating a "huge" b/w. 2042 */ 2043 uint64_t bw, srtt; 2044 if (rack->r_ctl.init_rate) 2045 return (rack->r_ctl.init_rate); 2046 2047 /* Has the user set a max peak rate? */ 2048 #ifdef NETFLIX_PEAKRATE 2049 if (rack->rc_tp->t_maxpeakrate) 2050 return (rack->rc_tp->t_maxpeakrate); 2051 #endif 2052 /* Ok lets come up with the IW guess, if we have a srtt */ 2053 if (rack->rc_tp->t_srtt == 0) { 2054 /* 2055 * Go with old pacing method 2056 * i.e. burst mitigation only. 2057 */ 2058 return (0); 2059 } 2060 /* Ok lets get the initial TCP win (not racks) */ 2061 bw = tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)); 2062 srtt = (uint64_t)rack->rc_tp->t_srtt; 2063 bw *= (uint64_t)USECS_IN_SECOND; 2064 bw /= srtt; 2065 if (rack->r_ctl.bw_rate_cap && (bw > rack->r_ctl.bw_rate_cap)) 2066 bw = rack->r_ctl.bw_rate_cap; 2067 return (bw); 2068 } else { 2069 uint64_t bw; 2070 2071 if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) { 2072 /* Averaging is done, we can return the value */ 2073 bw = rack->r_ctl.gp_bw; 2074 } else { 2075 /* Still doing initial average must calculate */ 2076 bw = rack->r_ctl.gp_bw / rack->r_ctl.num_measurements; 2077 } 2078 #ifdef NETFLIX_PEAKRATE 2079 if ((rack->rc_tp->t_maxpeakrate) && 2080 (bw > rack->rc_tp->t_maxpeakrate)) { 2081 /* The user has set a peak rate to pace at 2082 * don't allow us to pace faster than that. 
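	 *
	 * (Added clarification, not original commentary:) when this early
	 * return does not fire, the averaged gp_bw is still clamped by
	 * r_ctl.bw_rate_cap just below, so the value handed to the pacer is
	 * the estimate limited first by t_maxpeakrate (when that is set)
	 * and otherwise by bw_rate_cap (when that is non-zero).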
2083 */ 2084 return (rack->rc_tp->t_maxpeakrate); 2085 } 2086 #endif 2087 if (rack->r_ctl.bw_rate_cap && (bw > rack->r_ctl.bw_rate_cap)) 2088 bw = rack->r_ctl.bw_rate_cap; 2089 return (bw); 2090 } 2091 } 2092 2093 static uint16_t 2094 rack_get_output_gain(struct tcp_rack *rack, struct rack_sendmap *rsm) 2095 { 2096 if (rack->use_fixed_rate) { 2097 return (100); 2098 } else if (rack->in_probe_rtt && (rsm == NULL)) 2099 return (rack->r_ctl.rack_per_of_gp_probertt); 2100 else if ((IN_FASTRECOVERY(rack->rc_tp->t_flags) && 2101 rack->r_ctl.rack_per_of_gp_rec)) { 2102 if (rsm) { 2103 /* a retransmission always use the recovery rate */ 2104 return (rack->r_ctl.rack_per_of_gp_rec); 2105 } else if (rack->rack_rec_nonrxt_use_cr) { 2106 /* Directed to use the configured rate */ 2107 goto configured_rate; 2108 } else if (rack->rack_no_prr && 2109 (rack->r_ctl.rack_per_of_gp_rec > 100)) { 2110 /* No PRR, lets just use the b/w estimate only */ 2111 return (100); 2112 } else { 2113 /* 2114 * Here we may have a non-retransmit but we 2115 * have no overrides, so just use the recovery 2116 * rate (prr is in effect). 2117 */ 2118 return (rack->r_ctl.rack_per_of_gp_rec); 2119 } 2120 } 2121 configured_rate: 2122 /* For the configured rate we look at our cwnd vs the ssthresh */ 2123 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) 2124 return (rack->r_ctl.rack_per_of_gp_ss); 2125 else 2126 return (rack->r_ctl.rack_per_of_gp_ca); 2127 } 2128 2129 static void 2130 rack_log_dsack_event(struct tcp_rack *rack, uint8_t mod, uint32_t flex4, uint32_t flex5, uint32_t flex6) 2131 { 2132 /* 2133 * Types of logs (mod value) 2134 * 1 = dsack_persists reduced by 1 via T-O or fast recovery exit. 2135 * 2 = a dsack round begins, persist is reset to 16. 2136 * 3 = a dsack round ends 2137 * 4 = Dsack option increases rack rtt flex5 is the srtt input, flex6 is thresh 2138 * 5 = Socket option set changing the control flags rc_rack_tmr_std_based, rc_rack_use_dsack 2139 * 6 = Final rack rtt, flex4 is srtt and flex6 is final limited thresh. 
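	 *
	 * (Added note, not original commentary:) the mod value itself is
	 * stored in flex8 of the record built below, with flex4/flex5/flex6
	 * carrying the caller-supplied arguments, so a type 4 record reads
	 * back as flex8 == 4, flex5 == srtt input, flex6 == threshold.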
2140 */ 2141 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2142 union tcp_log_stackspecific log; 2143 struct timeval tv; 2144 2145 memset(&log, 0, sizeof(log)); 2146 log.u_bbr.flex1 = rack->rc_rack_tmr_std_based; 2147 log.u_bbr.flex1 <<= 1; 2148 log.u_bbr.flex1 |= rack->rc_rack_use_dsack; 2149 log.u_bbr.flex1 <<= 1; 2150 log.u_bbr.flex1 |= rack->rc_dsack_round_seen; 2151 log.u_bbr.flex2 = rack->r_ctl.dsack_round_end; 2152 log.u_bbr.flex3 = rack->r_ctl.num_dsack; 2153 log.u_bbr.flex4 = flex4; 2154 log.u_bbr.flex5 = flex5; 2155 log.u_bbr.flex6 = flex6; 2156 log.u_bbr.flex7 = rack->r_ctl.dsack_persist; 2157 log.u_bbr.flex8 = mod; 2158 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2159 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2160 &rack->rc_inp->inp_socket->so_rcv, 2161 &rack->rc_inp->inp_socket->so_snd, 2162 RACK_DSACK_HANDLING, 0, 2163 0, &log, false, &tv); 2164 } 2165 } 2166 2167 static void 2168 rack_log_hdwr_pacing(struct tcp_rack *rack, 2169 uint64_t rate, uint64_t hw_rate, int line, 2170 int error, uint16_t mod) 2171 { 2172 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2173 union tcp_log_stackspecific log; 2174 struct timeval tv; 2175 const struct ifnet *ifp; 2176 2177 memset(&log, 0, sizeof(log)); 2178 log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff); 2179 log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff); 2180 if (rack->r_ctl.crte) { 2181 ifp = rack->r_ctl.crte->ptbl->rs_ifp; 2182 } else if (rack->rc_inp->inp_route.ro_nh && 2183 rack->rc_inp->inp_route.ro_nh->nh_ifp) { 2184 ifp = rack->rc_inp->inp_route.ro_nh->nh_ifp; 2185 } else 2186 ifp = NULL; 2187 if (ifp) { 2188 log.u_bbr.flex3 = (((uint64_t)ifp >> 32) & 0x00000000ffffffff); 2189 log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff); 2190 } 2191 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2192 log.u_bbr.bw_inuse = rate; 2193 log.u_bbr.flex5 = line; 2194 log.u_bbr.flex6 = error; 2195 log.u_bbr.flex7 = mod; 2196 log.u_bbr.applimited = rack->r_ctl.rc_pace_max_segs; 2197 log.u_bbr.flex8 = rack->use_fixed_rate; 2198 log.u_bbr.flex8 <<= 1; 2199 log.u_bbr.flex8 |= rack->rack_hdrw_pacing; 2200 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; 2201 log.u_bbr.delRate = rack->r_ctl.crte_prev_rate; 2202 if (rack->r_ctl.crte) 2203 log.u_bbr.cur_del_rate = rack->r_ctl.crte->rate; 2204 else 2205 log.u_bbr.cur_del_rate = 0; 2206 log.u_bbr.rttProp = rack->r_ctl.last_hw_bw_req; 2207 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2208 &rack->rc_inp->inp_socket->so_rcv, 2209 &rack->rc_inp->inp_socket->so_snd, 2210 BBR_LOG_HDWR_PACE, 0, 2211 0, &log, false, &tv); 2212 } 2213 } 2214 2215 static uint64_t 2216 rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm, int *capped) 2217 { 2218 /* 2219 * We allow rack_per_of_gp_xx to dictate our bw rate we want. 
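	 *
	 * (Added example, not original commentary:) the gain obtained from
	 * rack_get_output_gain() is a percentage, so an estimate of
	 * 250000 bytes/sec with a 150 percent gain would pace at
	 * 250000 * 150 / 100 = 375000 bytes/sec, subject to the RACK_MIN_BW
	 * floor and the hardware rate caps handled further down.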
2220 */ 2221 uint64_t bw_est, high_rate; 2222 uint64_t gain; 2223 2224 gain = (uint64_t)rack_get_output_gain(rack, rsm); 2225 bw_est = bw * gain; 2226 bw_est /= (uint64_t)100; 2227 /* Never fall below the minimum (def 64kbps) */ 2228 if (bw_est < RACK_MIN_BW) 2229 bw_est = RACK_MIN_BW; 2230 if (rack->r_rack_hw_rate_caps) { 2231 /* Rate caps are in place */ 2232 if (rack->r_ctl.crte != NULL) { 2233 /* We have a hdwr rate already */ 2234 high_rate = tcp_hw_highest_rate(rack->r_ctl.crte); 2235 if (bw_est >= high_rate) { 2236 /* We are capping bw at the highest rate table entry */ 2237 rack_log_hdwr_pacing(rack, 2238 bw_est, high_rate, __LINE__, 2239 0, 3); 2240 bw_est = high_rate; 2241 if (capped) 2242 *capped = 1; 2243 } 2244 } else if ((rack->rack_hdrw_pacing == 0) && 2245 (rack->rack_hdw_pace_ena) && 2246 (rack->rack_attempt_hdwr_pace == 0) && 2247 (rack->rc_inp->inp_route.ro_nh != NULL) && 2248 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 2249 /* 2250 * Special case, we have not yet attempted hardware 2251 * pacing, and yet we may, when we do, find out if we are 2252 * above the highest rate. We need to know the maxbw for the interface 2253 * in question (if it supports ratelimiting). We get back 2254 * a 0, if the interface is not found in the RL lists. 2255 */ 2256 high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp); 2257 if (high_rate) { 2258 /* Yep, we have a rate is it above this rate? */ 2259 if (bw_est > high_rate) { 2260 bw_est = high_rate; 2261 if (capped) 2262 *capped = 1; 2263 } 2264 } 2265 } 2266 } 2267 return (bw_est); 2268 } 2269 2270 static void 2271 rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod) 2272 { 2273 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2274 union tcp_log_stackspecific log; 2275 struct timeval tv; 2276 2277 if ((mod != 1) && (rack_verbose_logging == 0)) { 2278 /* 2279 * We get 3 values currently for mod 2280 * 1 - We are retransmitting and this tells the reason. 2281 * 2 - We are clearing a dup-ack count. 2282 * 3 - We are incrementing a dup-ack count. 2283 * 2284 * The clear/increment are only logged 2285 * if you have BBverbose on. 
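	 *
	 * (Added note, not original commentary:) "BBverbose" refers to the
	 * rack_verbose_logging knob exported above as the misc.bb_verbose
	 * sysctl; with it left at zero only the mod == 1 (retransmit
	 * reason) records make it into the black box log.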
2286 */ 2287 return; 2288 } 2289 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2290 log.u_bbr.flex1 = tsused; 2291 log.u_bbr.flex2 = thresh; 2292 log.u_bbr.flex3 = rsm->r_flags; 2293 log.u_bbr.flex4 = rsm->r_dupack; 2294 log.u_bbr.flex5 = rsm->r_start; 2295 log.u_bbr.flex6 = rsm->r_end; 2296 log.u_bbr.flex8 = mod; 2297 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); 2298 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2299 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2300 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2301 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2302 log.u_bbr.pacing_gain = rack->r_must_retran; 2303 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2304 &rack->rc_inp->inp_socket->so_rcv, 2305 &rack->rc_inp->inp_socket->so_snd, 2306 BBR_LOG_SETTINGS_CHG, 0, 2307 0, &log, false, &tv); 2308 } 2309 } 2310 2311 static void 2312 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) 2313 { 2314 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2315 union tcp_log_stackspecific log; 2316 struct timeval tv; 2317 2318 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2319 log.u_bbr.flex1 = rack->rc_tp->t_srtt; 2320 log.u_bbr.flex2 = to; 2321 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; 2322 log.u_bbr.flex4 = slot; 2323 log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot; 2324 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 2325 log.u_bbr.flex7 = rack->rc_in_persist; 2326 log.u_bbr.flex8 = which; 2327 if (rack->rack_no_prr) 2328 log.u_bbr.pkts_out = 0; 2329 else 2330 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 2331 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); 2332 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2333 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2334 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2335 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2336 log.u_bbr.pacing_gain = rack->r_must_retran; 2337 log.u_bbr.lt_epoch = rack->rc_tp->t_rxtshift; 2338 log.u_bbr.lost = rack_rto_min; 2339 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2340 &rack->rc_inp->inp_socket->so_rcv, 2341 &rack->rc_inp->inp_socket->so_snd, 2342 BBR_LOG_TIMERSTAR, 0, 2343 0, &log, false, &tv); 2344 } 2345 } 2346 2347 static void 2348 rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm) 2349 { 2350 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2351 union tcp_log_stackspecific log; 2352 struct timeval tv; 2353 2354 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2355 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); 2356 log.u_bbr.flex8 = to_num; 2357 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt; 2358 log.u_bbr.flex2 = rack->rc_rack_rtt; 2359 if (rsm == NULL) 2360 log.u_bbr.flex3 = 0; 2361 else 2362 log.u_bbr.flex3 = rsm->r_end - rsm->r_start; 2363 if (rack->rack_no_prr) 2364 log.u_bbr.flex5 = 0; 2365 else 2366 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 2367 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2368 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2369 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2370 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2371 log.u_bbr.pacing_gain = rack->r_must_retran; 2372 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2373 &rack->rc_inp->inp_socket->so_rcv, 2374 &rack->rc_inp->inp_socket->so_snd, 2375 BBR_LOG_RTO, 0, 2376 0, &log, false, &tv); 2377 } 2378 } 2379 2380 static void 2381 rack_log_map_chg(struct tcpcb *tp, struct tcp_rack *rack, 2382 struct rack_sendmap *prev, 2383 struct rack_sendmap *rsm, 2384 struct rack_sendmap *next, 2385 int flag, uint32_t 
th_ack, int line) 2386 { 2387 if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { 2388 union tcp_log_stackspecific log; 2389 struct timeval tv; 2390 2391 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2392 log.u_bbr.flex8 = flag; 2393 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); 2394 log.u_bbr.cur_del_rate = (uint64_t)prev; 2395 log.u_bbr.delRate = (uint64_t)rsm; 2396 log.u_bbr.rttProp = (uint64_t)next; 2397 log.u_bbr.flex7 = 0; 2398 if (prev) { 2399 log.u_bbr.flex1 = prev->r_start; 2400 log.u_bbr.flex2 = prev->r_end; 2401 log.u_bbr.flex7 |= 0x4; 2402 } 2403 if (rsm) { 2404 log.u_bbr.flex3 = rsm->r_start; 2405 log.u_bbr.flex4 = rsm->r_end; 2406 log.u_bbr.flex7 |= 0x2; 2407 } 2408 if (next) { 2409 log.u_bbr.flex5 = next->r_start; 2410 log.u_bbr.flex6 = next->r_end; 2411 log.u_bbr.flex7 |= 0x1; 2412 } 2413 log.u_bbr.applimited = line; 2414 log.u_bbr.pkts_out = th_ack; 2415 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2416 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2417 if (rack->rack_no_prr) 2418 log.u_bbr.lost = 0; 2419 else 2420 log.u_bbr.lost = rack->r_ctl.rc_prr_sndcnt; 2421 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2422 &rack->rc_inp->inp_socket->so_rcv, 2423 &rack->rc_inp->inp_socket->so_snd, 2424 TCP_LOG_MAPCHG, 0, 2425 0, &log, false, &tv); 2426 } 2427 } 2428 2429 static void 2430 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t len, 2431 struct rack_sendmap *rsm, int conf) 2432 { 2433 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 2434 union tcp_log_stackspecific log; 2435 struct timeval tv; 2436 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2437 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); 2438 log.u_bbr.flex1 = t; 2439 log.u_bbr.flex2 = len; 2440 log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt; 2441 log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest; 2442 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest; 2443 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_us_rtrcnt; 2444 log.u_bbr.flex7 = conf; 2445 log.u_bbr.rttProp = (uint64_t)rack->r_ctl.rack_rs.rs_rtt_tot; 2446 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method; 2447 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2448 log.u_bbr.delivered = rack->r_ctl.rack_rs.rs_us_rtrcnt; 2449 log.u_bbr.pkts_out = rack->r_ctl.rack_rs.rs_flags; 2450 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2451 if (rsm) { 2452 log.u_bbr.pkt_epoch = rsm->r_start; 2453 log.u_bbr.lost = rsm->r_end; 2454 log.u_bbr.cwnd_gain = rsm->r_rtr_cnt; 2455 /* We loose any upper of the 24 bits */ 2456 log.u_bbr.pacing_gain = (uint16_t)rsm->r_flags; 2457 } else { 2458 /* Its a SYN */ 2459 log.u_bbr.pkt_epoch = rack->rc_tp->iss; 2460 log.u_bbr.lost = 0; 2461 log.u_bbr.cwnd_gain = 0; 2462 log.u_bbr.pacing_gain = 0; 2463 } 2464 /* Write out general bits of interest rrs here */ 2465 log.u_bbr.use_lt_bw = rack->rc_highly_buffered; 2466 log.u_bbr.use_lt_bw <<= 1; 2467 log.u_bbr.use_lt_bw |= rack->forced_ack; 2468 log.u_bbr.use_lt_bw <<= 1; 2469 log.u_bbr.use_lt_bw |= rack->rc_gp_dyn_mul; 2470 log.u_bbr.use_lt_bw <<= 1; 2471 log.u_bbr.use_lt_bw |= rack->in_probe_rtt; 2472 log.u_bbr.use_lt_bw <<= 1; 2473 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; 2474 log.u_bbr.use_lt_bw <<= 1; 2475 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set; 2476 log.u_bbr.use_lt_bw <<= 1; 2477 log.u_bbr.use_lt_bw |= rack->rc_gp_filled; 2478 log.u_bbr.use_lt_bw <<= 1; 2479 log.u_bbr.use_lt_bw |= rack->rc_dragged_bottom; 2480 log.u_bbr.applimited = rack->r_ctl.rc_target_probertt_flight; 2481 log.u_bbr.epoch = 
rack->r_ctl.rc_time_probertt_starts; 2482 log.u_bbr.lt_epoch = rack->r_ctl.rc_time_probertt_entered; 2483 log.u_bbr.cur_del_rate = rack->r_ctl.rc_lower_rtt_us_cts; 2484 log.u_bbr.delRate = rack->r_ctl.rc_gp_srtt; 2485 log.u_bbr.bw_inuse = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 2486 log.u_bbr.bw_inuse <<= 32; 2487 if (rsm) 2488 log.u_bbr.bw_inuse |= ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]); 2489 TCP_LOG_EVENTP(tp, NULL, 2490 &rack->rc_inp->inp_socket->so_rcv, 2491 &rack->rc_inp->inp_socket->so_snd, 2492 BBR_LOG_BBRRTT, 0, 2493 0, &log, false, &tv); 2494 2495 2496 } 2497 } 2498 2499 static void 2500 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt) 2501 { 2502 /* 2503 * Log the rtt sample we are 2504 * applying to the srtt algorithm in 2505 * useconds. 2506 */ 2507 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2508 union tcp_log_stackspecific log; 2509 struct timeval tv; 2510 2511 /* Convert our ms to a microsecond */ 2512 memset(&log, 0, sizeof(log)); 2513 log.u_bbr.flex1 = rtt; 2514 log.u_bbr.flex2 = rack->r_ctl.ack_count; 2515 log.u_bbr.flex3 = rack->r_ctl.sack_count; 2516 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; 2517 log.u_bbr.flex5 = rack->r_ctl.sack_moved_extra; 2518 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 2519 log.u_bbr.flex7 = 1; 2520 log.u_bbr.flex8 = rack->sack_attack_disable; 2521 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2522 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2523 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2524 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2525 log.u_bbr.pacing_gain = rack->r_must_retran; 2526 /* 2527 * We capture in delRate the upper 32 bits as 2528 * the confidence level we had declared, and the 2529 * lower 32 bits as the actual RTT using the arrival 2530 * timestamp. 
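	 *
	 * (Added illustration, not original commentary:) a consumer of the
	 * record can therefore unpack the field as
	 *
	 *	confidence = delRate >> 32;
	 *	us_rtt     = delRate & 0xffffffff;
	 *
	 * mirroring the shift-and-or packing performed just below.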
2531 */ 2532 log.u_bbr.delRate = rack->r_ctl.rack_rs.confidence; 2533 log.u_bbr.delRate <<= 32; 2534 log.u_bbr.delRate |= rack->r_ctl.rack_rs.rs_us_rtt; 2535 /* Lets capture all the things that make up t_rtxcur */ 2536 log.u_bbr.applimited = rack_rto_min; 2537 log.u_bbr.epoch = rack_rto_max; 2538 log.u_bbr.lt_epoch = rack->r_ctl.timer_slop; 2539 log.u_bbr.lost = rack_rto_min; 2540 log.u_bbr.pkt_epoch = TICKS_2_USEC(tcp_rexmit_slop); 2541 log.u_bbr.rttProp = RACK_REXMTVAL(rack->rc_tp); 2542 log.u_bbr.bw_inuse = rack->r_ctl.act_rcv_time.tv_sec; 2543 log.u_bbr.bw_inuse *= HPTS_USEC_IN_SEC; 2544 log.u_bbr.bw_inuse += rack->r_ctl.act_rcv_time.tv_usec; 2545 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2546 &rack->rc_inp->inp_socket->so_rcv, 2547 &rack->rc_inp->inp_socket->so_snd, 2548 TCP_LOG_RTT, 0, 2549 0, &log, false, &tv); 2550 } 2551 } 2552 2553 static void 2554 rack_log_rtt_sample_calc(struct tcp_rack *rack, uint32_t rtt, uint32_t send_time, uint32_t ack_time, int where) 2555 { 2556 if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { 2557 union tcp_log_stackspecific log; 2558 struct timeval tv; 2559 2560 /* Convert our ms to a microsecond */ 2561 memset(&log, 0, sizeof(log)); 2562 log.u_bbr.flex1 = rtt; 2563 log.u_bbr.flex2 = send_time; 2564 log.u_bbr.flex3 = ack_time; 2565 log.u_bbr.flex4 = where; 2566 log.u_bbr.flex7 = 2; 2567 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2568 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2569 &rack->rc_inp->inp_socket->so_rcv, 2570 &rack->rc_inp->inp_socket->so_snd, 2571 TCP_LOG_RTT, 0, 2572 0, &log, false, &tv); 2573 } 2574 } 2575 2576 2577 2578 static inline void 2579 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line) 2580 { 2581 if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { 2582 union tcp_log_stackspecific log; 2583 struct timeval tv; 2584 2585 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2586 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); 2587 log.u_bbr.flex1 = line; 2588 log.u_bbr.flex2 = tick; 2589 log.u_bbr.flex3 = tp->t_maxunacktime; 2590 log.u_bbr.flex4 = tp->t_acktime; 2591 log.u_bbr.flex8 = event; 2592 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2593 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2594 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2595 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2596 log.u_bbr.pacing_gain = rack->r_must_retran; 2597 TCP_LOG_EVENTP(tp, NULL, 2598 &rack->rc_inp->inp_socket->so_rcv, 2599 &rack->rc_inp->inp_socket->so_snd, 2600 BBR_LOG_PROGRESS, 0, 2601 0, &log, false, &tv); 2602 } 2603 } 2604 2605 static void 2606 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv) 2607 { 2608 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2609 union tcp_log_stackspecific log; 2610 2611 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2612 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); 2613 log.u_bbr.flex1 = slot; 2614 if (rack->rack_no_prr) 2615 log.u_bbr.flex2 = 0; 2616 else 2617 log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt; 2618 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags); 2619 log.u_bbr.flex8 = rack->rc_in_persist; 2620 log.u_bbr.timeStamp = cts; 2621 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2622 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2623 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2624 log.u_bbr.pacing_gain = rack->r_must_retran; 2625 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2626 &rack->rc_inp->inp_socket->so_rcv, 
2627 &rack->rc_inp->inp_socket->so_snd, 2628 BBR_LOG_BBRSND, 0, 2629 0, &log, false, tv); 2630 } 2631 } 2632 2633 static void 2634 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out, int nsegs) 2635 { 2636 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2637 union tcp_log_stackspecific log; 2638 struct timeval tv; 2639 2640 memset(&log, 0, sizeof(log)); 2641 log.u_bbr.flex1 = did_out; 2642 log.u_bbr.flex2 = nxt_pkt; 2643 log.u_bbr.flex3 = way_out; 2644 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 2645 if (rack->rack_no_prr) 2646 log.u_bbr.flex5 = 0; 2647 else 2648 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 2649 log.u_bbr.flex6 = nsegs; 2650 log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs; 2651 log.u_bbr.flex7 = rack->rc_ack_can_sendout_data; /* Do we have ack-can-send set */ 2652 log.u_bbr.flex7 <<= 1; 2653 log.u_bbr.flex7 |= rack->r_fast_output; /* is fast output primed */ 2654 log.u_bbr.flex7 <<= 1; 2655 log.u_bbr.flex7 |= rack->r_wanted_output; /* Do we want output */ 2656 log.u_bbr.flex8 = rack->rc_in_persist; 2657 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); 2658 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2659 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2660 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 2661 log.u_bbr.use_lt_bw <<= 1; 2662 log.u_bbr.use_lt_bw |= rack->r_might_revert; 2663 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2664 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2665 log.u_bbr.pacing_gain = rack->r_must_retran; 2666 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2667 &rack->rc_inp->inp_socket->so_rcv, 2668 &rack->rc_inp->inp_socket->so_snd, 2669 BBR_LOG_DOSEG_DONE, 0, 2670 0, &log, false, &tv); 2671 } 2672 } 2673 2674 static void 2675 rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint8_t frm) 2676 { 2677 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 2678 union tcp_log_stackspecific log; 2679 struct timeval tv; 2680 uint32_t cts; 2681 2682 memset(&log, 0, sizeof(log)); 2683 cts = tcp_get_usecs(&tv); 2684 log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs; 2685 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 2686 log.u_bbr.flex4 = arg1; 2687 log.u_bbr.flex5 = arg2; 2688 log.u_bbr.flex6 = arg3; 2689 log.u_bbr.flex8 = frm; 2690 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2691 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2692 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2693 log.u_bbr.applimited = rack->r_ctl.rc_sacked; 2694 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2695 log.u_bbr.pacing_gain = rack->r_must_retran; 2696 TCP_LOG_EVENTP(tp, NULL, 2697 &tp->t_inpcb->inp_socket->so_rcv, 2698 &tp->t_inpcb->inp_socket->so_snd, 2699 TCP_HDWR_PACE_SIZE, 0, 2700 0, &log, false, &tv); 2701 } 2702 } 2703 2704 static void 2705 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, 2706 uint8_t hpts_calling, int reason, uint32_t cwnd_to_use) 2707 { 2708 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2709 union tcp_log_stackspecific log; 2710 struct timeval tv; 2711 2712 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2713 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); 2714 log.u_bbr.flex1 = slot; 2715 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; 2716 log.u_bbr.flex4 = reason; 2717 if (rack->rack_no_prr) 2718 log.u_bbr.flex5 = 0; 2719 else 2720 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 2721 log.u_bbr.flex7 = hpts_calling; 2722 log.u_bbr.flex8 = 
rack->rc_in_persist; 2723 log.u_bbr.lt_epoch = cwnd_to_use; 2724 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2725 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2726 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2727 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2728 log.u_bbr.pacing_gain = rack->r_must_retran; 2729 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2730 &rack->rc_inp->inp_socket->so_rcv, 2731 &rack->rc_inp->inp_socket->so_snd, 2732 BBR_LOG_JUSTRET, 0, 2733 tlen, &log, false, &tv); 2734 } 2735 } 2736 2737 static void 2738 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32_t us_cts, 2739 struct timeval *tv, uint32_t flags_on_entry) 2740 { 2741 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2742 union tcp_log_stackspecific log; 2743 2744 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2745 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); 2746 log.u_bbr.flex1 = line; 2747 log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to; 2748 log.u_bbr.flex3 = flags_on_entry; 2749 log.u_bbr.flex4 = us_cts; 2750 if (rack->rack_no_prr) 2751 log.u_bbr.flex5 = 0; 2752 else 2753 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 2754 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 2755 log.u_bbr.flex7 = hpts_removed; 2756 log.u_bbr.flex8 = 1; 2757 log.u_bbr.applimited = rack->r_ctl.rc_hpts_flags; 2758 log.u_bbr.timeStamp = us_cts; 2759 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2760 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2761 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2762 log.u_bbr.pacing_gain = rack->r_must_retran; 2763 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2764 &rack->rc_inp->inp_socket->so_rcv, 2765 &rack->rc_inp->inp_socket->so_snd, 2766 BBR_LOG_TIMERCANC, 0, 2767 0, &log, false, tv); 2768 } 2769 } 2770 2771 static void 2772 rack_log_alt_to_to_cancel(struct tcp_rack *rack, 2773 uint32_t flex1, uint32_t flex2, 2774 uint32_t flex3, uint32_t flex4, 2775 uint32_t flex5, uint32_t flex6, 2776 uint16_t flex7, uint8_t mod) 2777 { 2778 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2779 union tcp_log_stackspecific log; 2780 struct timeval tv; 2781 2782 if (mod == 1) { 2783 /* No you can't use 1, its for the real to cancel */ 2784 return; 2785 } 2786 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2787 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2788 log.u_bbr.flex1 = flex1; 2789 log.u_bbr.flex2 = flex2; 2790 log.u_bbr.flex3 = flex3; 2791 log.u_bbr.flex4 = flex4; 2792 log.u_bbr.flex5 = flex5; 2793 log.u_bbr.flex6 = flex6; 2794 log.u_bbr.flex7 = flex7; 2795 log.u_bbr.flex8 = mod; 2796 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2797 &rack->rc_inp->inp_socket->so_rcv, 2798 &rack->rc_inp->inp_socket->so_snd, 2799 BBR_LOG_TIMERCANC, 0, 2800 0, &log, false, &tv); 2801 } 2802 } 2803 2804 static void 2805 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers) 2806 { 2807 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2808 union tcp_log_stackspecific log; 2809 struct timeval tv; 2810 2811 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2812 log.u_bbr.flex1 = timers; 2813 log.u_bbr.flex2 = ret; 2814 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp; 2815 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 2816 log.u_bbr.flex5 = cts; 2817 if (rack->rack_no_prr) 2818 log.u_bbr.flex6 = 0; 2819 else 2820 log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt; 2821 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2822 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2823 log.u_bbr.pacing_gain = rack->r_must_retran; 2824 log.u_bbr.timeStamp = 
tcp_get_usecs(&tv); 2825 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2826 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2827 &rack->rc_inp->inp_socket->so_rcv, 2828 &rack->rc_inp->inp_socket->so_snd, 2829 BBR_LOG_TO_PROCESS, 0, 2830 0, &log, false, &tv); 2831 } 2832 } 2833 2834 static void 2835 rack_log_to_prr(struct tcp_rack *rack, int frm, int orig_cwnd) 2836 { 2837 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2838 union tcp_log_stackspecific log; 2839 struct timeval tv; 2840 2841 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2842 log.u_bbr.flex1 = rack->r_ctl.rc_prr_out; 2843 log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs; 2844 if (rack->rack_no_prr) 2845 log.u_bbr.flex3 = 0; 2846 else 2847 log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt; 2848 log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered; 2849 log.u_bbr.flex5 = rack->r_ctl.rc_sacked; 2850 log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt; 2851 log.u_bbr.flex8 = frm; 2852 log.u_bbr.pkts_out = orig_cwnd; 2853 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2854 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2855 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 2856 log.u_bbr.use_lt_bw <<= 1; 2857 log.u_bbr.use_lt_bw |= rack->r_might_revert; 2858 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2859 &rack->rc_inp->inp_socket->so_rcv, 2860 &rack->rc_inp->inp_socket->so_snd, 2861 BBR_LOG_BBRUPD, 0, 2862 0, &log, false, &tv); 2863 } 2864 } 2865 2866 #ifdef NETFLIX_EXP_DETECTION 2867 static void 2868 rack_log_sad(struct tcp_rack *rack, int event) 2869 { 2870 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2871 union tcp_log_stackspecific log; 2872 struct timeval tv; 2873 2874 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2875 log.u_bbr.flex1 = rack->r_ctl.sack_count; 2876 log.u_bbr.flex2 = rack->r_ctl.ack_count; 2877 log.u_bbr.flex3 = rack->r_ctl.sack_moved_extra; 2878 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; 2879 log.u_bbr.flex5 = rack->r_ctl.rc_num_maps_alloced; 2880 log.u_bbr.flex6 = tcp_sack_to_ack_thresh; 2881 log.u_bbr.pkts_out = tcp_sack_to_move_thresh; 2882 log.u_bbr.lt_epoch = (tcp_force_detection << 8); 2883 log.u_bbr.lt_epoch |= rack->do_detection; 2884 log.u_bbr.applimited = tcp_map_minimum; 2885 log.u_bbr.flex7 = rack->sack_attack_disable; 2886 log.u_bbr.flex8 = event; 2887 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2888 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2889 log.u_bbr.delivered = tcp_sad_decay_val; 2890 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2891 &rack->rc_inp->inp_socket->so_rcv, 2892 &rack->rc_inp->inp_socket->so_snd, 2893 TCP_SAD_DETECTION, 0, 2894 0, &log, false, &tv); 2895 } 2896 } 2897 #endif 2898 2899 static void 2900 rack_counter_destroy(void) 2901 { 2902 int i; 2903 2904 counter_u64_free(rack_fto_send); 2905 counter_u64_free(rack_fto_rsm_send); 2906 counter_u64_free(rack_nfto_resend); 2907 counter_u64_free(rack_hw_pace_init_fail); 2908 counter_u64_free(rack_hw_pace_lost); 2909 counter_u64_free(rack_non_fto_send); 2910 counter_u64_free(rack_extended_rfo); 2911 counter_u64_free(rack_ack_total); 2912 counter_u64_free(rack_express_sack); 2913 counter_u64_free(rack_sack_total); 2914 counter_u64_free(rack_move_none); 2915 counter_u64_free(rack_move_some); 2916 counter_u64_free(rack_sack_attacks_detected); 2917 counter_u64_free(rack_sack_attacks_reversed); 2918 counter_u64_free(rack_sack_used_next_merge); 2919 counter_u64_free(rack_sack_used_prev_merge); 2920 counter_u64_free(rack_badfr); 2921 counter_u64_free(rack_badfr_bytes); 2922 counter_u64_free(rack_rtm_prr_retran); 
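	/*
	 * Editorial note (not from the original author): each free below
	 * matches a counter_u64_alloc(M_WAITOK) performed in the sysctl
	 * setup code above; the sysctl OIDs themselves are presumably torn
	 * down separately (e.g. by releasing rack_sysctl_ctx on module
	 * unload), so only the counter storage is returned here.
	 */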
2923 counter_u64_free(rack_rtm_prr_newdata); 2924 counter_u64_free(rack_timestamp_mismatch); 2925 counter_u64_free(rack_find_high); 2926 counter_u64_free(rack_reorder_seen); 2927 counter_u64_free(rack_tlp_tot); 2928 counter_u64_free(rack_tlp_newdata); 2929 counter_u64_free(rack_tlp_retran); 2930 counter_u64_free(rack_tlp_retran_bytes); 2931 counter_u64_free(rack_tlp_retran_fail); 2932 counter_u64_free(rack_to_tot); 2933 counter_u64_free(rack_to_arm_rack); 2934 counter_u64_free(rack_to_arm_tlp); 2935 counter_u64_free(rack_calc_zero); 2936 counter_u64_free(rack_calc_nonzero); 2937 counter_u64_free(rack_paced_segments); 2938 counter_u64_free(rack_unpaced_segments); 2939 counter_u64_free(rack_saw_enobuf); 2940 counter_u64_free(rack_saw_enobuf_hw); 2941 counter_u64_free(rack_saw_enetunreach); 2942 counter_u64_free(rack_hot_alloc); 2943 counter_u64_free(rack_to_alloc); 2944 counter_u64_free(rack_to_alloc_hard); 2945 counter_u64_free(rack_to_alloc_emerg); 2946 counter_u64_free(rack_to_alloc_limited); 2947 counter_u64_free(rack_alloc_limited_conns); 2948 counter_u64_free(rack_split_limited); 2949 for (i = 0; i < MAX_NUM_OF_CNTS; i++) { 2950 counter_u64_free(rack_proc_comp_ack[i]); 2951 } 2952 counter_u64_free(rack_multi_single_eq); 2953 counter_u64_free(rack_proc_non_comp_ack); 2954 counter_u64_free(rack_sack_proc_all); 2955 counter_u64_free(rack_sack_proc_restart); 2956 counter_u64_free(rack_sack_proc_short); 2957 counter_u64_free(rack_enter_tlp_calc); 2958 counter_u64_free(rack_used_tlpmethod); 2959 counter_u64_free(rack_used_tlpmethod2); 2960 counter_u64_free(rack_sack_skipped_acked); 2961 counter_u64_free(rack_sack_splits); 2962 counter_u64_free(rack_progress_drops); 2963 counter_u64_free(rack_input_idle_reduces); 2964 counter_u64_free(rack_collapsed_win); 2965 counter_u64_free(rack_tlp_does_nada); 2966 counter_u64_free(rack_try_scwnd); 2967 counter_u64_free(rack_per_timer_hole); 2968 counter_u64_free(rack_large_ackcmp); 2969 counter_u64_free(rack_small_ackcmp); 2970 counter_u64_free(rack_persists_sends); 2971 counter_u64_free(rack_persists_acks); 2972 counter_u64_free(rack_persists_loss); 2973 counter_u64_free(rack_persists_lost_ends); 2974 #ifdef INVARIANTS 2975 counter_u64_free(rack_adjust_map_bw); 2976 #endif 2977 COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE); 2978 COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE); 2979 } 2980 2981 static struct rack_sendmap * 2982 rack_alloc(struct tcp_rack *rack) 2983 { 2984 struct rack_sendmap *rsm; 2985 2986 /* 2987 * First get the top of the list it in 2988 * theory is the "hottest" rsm we have, 2989 * possibly just freed by ack processing. 2990 */ 2991 if (rack->rc_free_cnt > rack_free_cache) { 2992 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 2993 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 2994 counter_u64_add(rack_hot_alloc, 1); 2995 rack->rc_free_cnt--; 2996 return (rsm); 2997 } 2998 /* 2999 * Once we get under our free cache we probably 3000 * no longer have a "hot" one available. Lets 3001 * get one from UMA. 3002 */ 3003 rsm = uma_zalloc(rack_zone, M_NOWAIT); 3004 if (rsm) { 3005 rack->r_ctl.rc_num_maps_alloced++; 3006 counter_u64_add(rack_to_alloc, 1); 3007 return (rsm); 3008 } 3009 /* 3010 * Dig in to our aux rsm's (the last two) since 3011 * UMA failed to get us one. 
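	 * (Illustrative note, with a hypothetical rack_free_cache of 2:
	 * rack_free()/rack_free_trim() keep at least that many entries
	 * parked on rc_free, and the hot-allocation path above only takes
	 * from the list while rc_free_cnt is above that floor, so a couple
	 * of reserve entries are normally still here when UMA fails.)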
3012 */ 3013 if (rack->rc_free_cnt) { 3014 counter_u64_add(rack_to_alloc_emerg, 1); 3015 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 3016 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 3017 rack->rc_free_cnt--; 3018 return (rsm); 3019 } 3020 return (NULL); 3021 } 3022 3023 static struct rack_sendmap * 3024 rack_alloc_full_limit(struct tcp_rack *rack) 3025 { 3026 if ((V_tcp_map_entries_limit > 0) && 3027 (rack->do_detection == 0) && 3028 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 3029 counter_u64_add(rack_to_alloc_limited, 1); 3030 if (!rack->alloc_limit_reported) { 3031 rack->alloc_limit_reported = 1; 3032 counter_u64_add(rack_alloc_limited_conns, 1); 3033 } 3034 return (NULL); 3035 } 3036 return (rack_alloc(rack)); 3037 } 3038 3039 /* wrapper to allocate a sendmap entry, subject to a specific limit */ 3040 static struct rack_sendmap * 3041 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type) 3042 { 3043 struct rack_sendmap *rsm; 3044 3045 if (limit_type) { 3046 /* currently there is only one limit type */ 3047 if (V_tcp_map_split_limit > 0 && 3048 (rack->do_detection == 0) && 3049 rack->r_ctl.rc_num_split_allocs >= V_tcp_map_split_limit) { 3050 counter_u64_add(rack_split_limited, 1); 3051 if (!rack->alloc_limit_reported) { 3052 rack->alloc_limit_reported = 1; 3053 counter_u64_add(rack_alloc_limited_conns, 1); 3054 } 3055 return (NULL); 3056 } 3057 } 3058 3059 /* allocate and mark in the limit type, if set */ 3060 rsm = rack_alloc(rack); 3061 if (rsm != NULL && limit_type) { 3062 rsm->r_limit_type = limit_type; 3063 rack->r_ctl.rc_num_split_allocs++; 3064 } 3065 return (rsm); 3066 } 3067 3068 static void 3069 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) 3070 { 3071 if (rsm->r_flags & RACK_APP_LIMITED) { 3072 if (rack->r_ctl.rc_app_limited_cnt > 0) { 3073 rack->r_ctl.rc_app_limited_cnt--; 3074 } 3075 } 3076 if (rsm->r_limit_type) { 3077 /* currently there is only one limit type */ 3078 rack->r_ctl.rc_num_split_allocs--; 3079 } 3080 if (rsm == rack->r_ctl.rc_first_appl) { 3081 if (rack->r_ctl.rc_app_limited_cnt == 0) 3082 rack->r_ctl.rc_first_appl = NULL; 3083 else { 3084 /* Follow the next one out */ 3085 struct rack_sendmap fe; 3086 3087 fe.r_start = rsm->r_nseq_appl; 3088 rack->r_ctl.rc_first_appl = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 3089 } 3090 } 3091 if (rsm == rack->r_ctl.rc_resend) 3092 rack->r_ctl.rc_resend = NULL; 3093 if (rsm == rack->r_ctl.rc_rsm_at_retran) 3094 rack->r_ctl.rc_rsm_at_retran = NULL; 3095 if (rsm == rack->r_ctl.rc_end_appl) 3096 rack->r_ctl.rc_end_appl = NULL; 3097 if (rack->r_ctl.rc_tlpsend == rsm) 3098 rack->r_ctl.rc_tlpsend = NULL; 3099 if (rack->r_ctl.rc_sacklast == rsm) 3100 rack->r_ctl.rc_sacklast = NULL; 3101 memset(rsm, 0, sizeof(struct rack_sendmap)); 3102 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_free, rsm, r_tnext); 3103 rack->rc_free_cnt++; 3104 } 3105 3106 static void 3107 rack_free_trim(struct tcp_rack *rack) 3108 { 3109 struct rack_sendmap *rsm; 3110 3111 /* 3112 * Free up all the tail entries until 3113 * we get our list down to the limit. 
	 */
	while (rack->rc_free_cnt > rack_free_cache) {
		rsm = TAILQ_LAST(&rack->r_ctl.rc_free, rack_head);
		TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext);
		rack->rc_free_cnt--;
		uma_zfree(rack_zone, rsm);
	}
}


static uint32_t
rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack)
{
	uint64_t srtt, bw, len, tim;
	uint32_t segsiz, def_len, minl;

	segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
	def_len = rack_def_data_window * segsiz;
	if (rack->rc_gp_filled == 0) {
		/*
		 * We have no measurement (IW is in flight?) so
		 * we can only guess using our data_window sysctl
		 * value (usually 20MSS).
		 */
		return (def_len);
	}
	/*
	 * Now we have a number of factors to consider.
	 *
	 * 1) We have a desired BDP which is usually
	 *    at least 2.
	 * 2) We have a minimum number of rtt's, usually 1 SRTT,
	 *    but we allow it to be more.
	 * 3) We want to make sure a measurement lasts N useconds (if
	 *    we have set rack_min_measure_usec).
	 *
	 * We handle the first concern here by trying to create a data
	 * window of max(rack_def_data_window, DesiredBDP). The
	 * second concern we handle by not letting the measurement
	 * window end normally until at least the required SRTT's
	 * have gone by, which is done further below in
	 * rack_enough_for_measurement(). Finally the third concern
	 * we also handle here by calculating how long that time
	 * would take at the current BW and then returning the
	 * max of our first calculation and that length. Note
	 * that if rack_min_measure_usec is 0, we don't deal
	 * with concern 3. Also for both concern 1 and 3 an
	 * application limited period could end the measurement
	 * earlier.
	 *
	 * So lets calculate the BDP with the "known" b/w using
	 * the SRTT as our rtt and then multiply it by the
	 * goal.
	 */
	bw = rack_get_bw(rack);
	srtt = (uint64_t)tp->t_srtt;
	len = bw * srtt;
	len /= (uint64_t)HPTS_USEC_IN_SEC;
	len *= max(1, rack_goal_bdp);
	/* Now we need to round up to the nearest MSS */
	len = roundup(len, segsiz);
	if (rack_min_measure_usec) {
		/* Now calculate our min length for this b/w */
		tim = rack_min_measure_usec;
		minl = (tim * bw) / (uint64_t)HPTS_USEC_IN_SEC;
		if (minl == 0)
			minl = 1;
		minl = roundup(minl, segsiz);
		if (len < minl)
			len = minl;
	}
	/*
	 * Now if we have a very small window we want
	 * to keep the measurement window as small as
	 * possible. This happens on low b/w connections
	 * and we don't want to span huge numbers of
	 * rtt's between measurements.
	 *
	 * We basically include 2 over our "MIN window" so
	 * that the measurement can be shortened (possibly) by
	 * an ack'ed packet.
	 */
	if (len < def_len)
		return (max((uint32_t)len, ((MIN_GP_WIN+2) * segsiz)));
	else
		return (max((uint32_t)len, def_len));

}

static int
rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack, uint8_t *quality)
{
	uint32_t tim, srtts, segsiz;

	/*
	 * Has enough time passed for the GP measurement to be valid?
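	 * An illustrative walk-through (hypothetical numbers, not defaults):
	 * with rc_gp_srtt = 40000 usec and rack_min_srtts = 1, at least
	 * 40000 usec must separate tp->gput_ts from the ack arrival time,
	 * and before the time test is even consulted the cumulative ack
	 * must cover at least max(rc_init_window(), MIN_GP_WIN * segsiz)
	 * bytes past tp->gput_seq (unless gput_ack itself has been reached).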
3210 */ 3211 if ((tp->snd_max == tp->snd_una) || 3212 (th_ack == tp->snd_max)){ 3213 /* All is acked */ 3214 *quality = RACK_QUALITY_ALLACKED; 3215 return (1); 3216 } 3217 if (SEQ_LT(th_ack, tp->gput_seq)) { 3218 /* Not enough bytes yet */ 3219 return (0); 3220 } 3221 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 3222 if (SEQ_LT(th_ack, tp->gput_ack) && 3223 ((th_ack - tp->gput_seq) < max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { 3224 /* Not enough bytes yet */ 3225 return (0); 3226 } 3227 if (rack->r_ctl.rc_first_appl && 3228 (SEQ_GEQ(th_ack, rack->r_ctl.rc_first_appl->r_end))) { 3229 /* 3230 * We are up to the app limited send point 3231 * we have to measure irrespective of the time.. 3232 */ 3233 *quality = RACK_QUALITY_APPLIMITED; 3234 return (1); 3235 } 3236 /* Now what about time? */ 3237 srtts = (rack->r_ctl.rc_gp_srtt * rack_min_srtts); 3238 tim = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - tp->gput_ts; 3239 if (tim >= srtts) { 3240 *quality = RACK_QUALITY_HIGH; 3241 return (1); 3242 } 3243 /* Nope not even a full SRTT has passed */ 3244 return (0); 3245 } 3246 3247 static void 3248 rack_log_timely(struct tcp_rack *rack, 3249 uint32_t logged, uint64_t cur_bw, uint64_t low_bnd, 3250 uint64_t up_bnd, int line, uint8_t method) 3251 { 3252 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 3253 union tcp_log_stackspecific log; 3254 struct timeval tv; 3255 3256 memset(&log, 0, sizeof(log)); 3257 log.u_bbr.flex1 = logged; 3258 log.u_bbr.flex2 = rack->rc_gp_timely_inc_cnt; 3259 log.u_bbr.flex2 <<= 4; 3260 log.u_bbr.flex2 |= rack->rc_gp_timely_dec_cnt; 3261 log.u_bbr.flex2 <<= 4; 3262 log.u_bbr.flex2 |= rack->rc_gp_incr; 3263 log.u_bbr.flex2 <<= 4; 3264 log.u_bbr.flex2 |= rack->rc_gp_bwred; 3265 log.u_bbr.flex3 = rack->rc_gp_incr; 3266 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; 3267 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ca; 3268 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_rec; 3269 log.u_bbr.flex7 = rack->rc_gp_bwred; 3270 log.u_bbr.flex8 = method; 3271 log.u_bbr.cur_del_rate = cur_bw; 3272 log.u_bbr.delRate = low_bnd; 3273 log.u_bbr.bw_inuse = up_bnd; 3274 log.u_bbr.rttProp = rack_get_bw(rack); 3275 log.u_bbr.pkt_epoch = line; 3276 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; 3277 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3278 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3279 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; 3280 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; 3281 log.u_bbr.cwnd_gain = rack->rc_dragged_bottom; 3282 log.u_bbr.cwnd_gain <<= 1; 3283 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_rec; 3284 log.u_bbr.cwnd_gain <<= 1; 3285 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; 3286 log.u_bbr.cwnd_gain <<= 1; 3287 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; 3288 log.u_bbr.lost = rack->r_ctl.rc_loss_count; 3289 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3290 &rack->rc_inp->inp_socket->so_rcv, 3291 &rack->rc_inp->inp_socket->so_snd, 3292 TCP_TIMELY_WORK, 0, 3293 0, &log, false, &tv); 3294 } 3295 } 3296 3297 static int 3298 rack_bw_can_be_raised(struct tcp_rack *rack, uint64_t cur_bw, uint64_t last_bw_est, uint16_t mult) 3299 { 3300 /* 3301 * Before we increase we need to know if 3302 * the estimate just made was less than 3303 * our pacing goal (i.e. (cur_bw * mult) > last_bw_est) 3304 * 3305 * If we already are pacing at a fast enough 3306 * rate to push us faster there is no sense of 3307 * increasing. 3308 * 3309 * We first caculate our actual pacing rate (ss or ca multipler 3310 * times our cur_bw). 
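	 * (Purely illustrative numbers: cur_bw = 1,000,000 bytes/sec with a
	 * CA multiplier of 125 gives an actual pacing rate of
	 * 1,000,000 * 125 / 100 = 1,250,000 bytes/sec.)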
3311 * 3312 * Then we take the last measured rate and multipy by our 3313 * maximum pacing overage to give us a max allowable rate. 3314 * 3315 * If our act_rate is smaller than our max_allowable rate 3316 * then we should increase. Else we should hold steady. 3317 * 3318 */ 3319 uint64_t act_rate, max_allow_rate; 3320 3321 if (rack_timely_no_stopping) 3322 return (1); 3323 3324 if ((cur_bw == 0) || (last_bw_est == 0)) { 3325 /* 3326 * Initial startup case or 3327 * everything is acked case. 3328 */ 3329 rack_log_timely(rack, mult, cur_bw, 0, 0, 3330 __LINE__, 9); 3331 return (1); 3332 } 3333 if (mult <= 100) { 3334 /* 3335 * We can always pace at or slightly above our rate. 3336 */ 3337 rack_log_timely(rack, mult, cur_bw, 0, 0, 3338 __LINE__, 9); 3339 return (1); 3340 } 3341 act_rate = cur_bw * (uint64_t)mult; 3342 act_rate /= 100; 3343 max_allow_rate = last_bw_est * ((uint64_t)rack_max_per_above + (uint64_t)100); 3344 max_allow_rate /= 100; 3345 if (act_rate < max_allow_rate) { 3346 /* 3347 * Here the rate we are actually pacing at 3348 * is smaller than 10% above our last measurement. 3349 * This means we are pacing below what we would 3350 * like to try to achieve (plus some wiggle room). 3351 */ 3352 rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate, 3353 __LINE__, 9); 3354 return (1); 3355 } else { 3356 /* 3357 * Here we are already pacing at least rack_max_per_above(10%) 3358 * what we are getting back. This indicates most likely 3359 * that we are being limited (cwnd/rwnd/app) and can't 3360 * get any more b/w. There is no sense of trying to 3361 * raise up the pacing rate its not speeding us up 3362 * and we already are pacing faster than we are getting. 3363 */ 3364 rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate, 3365 __LINE__, 8); 3366 return (0); 3367 } 3368 } 3369 3370 static void 3371 rack_validate_multipliers_at_or_above100(struct tcp_rack *rack) 3372 { 3373 /* 3374 * When we drag bottom, we want to assure 3375 * that no multiplier is below 1.0, if so 3376 * we want to restore it to at least that. 3377 */ 3378 if (rack->r_ctl.rack_per_of_gp_rec < 100) { 3379 /* This is unlikely we usually do not touch recovery */ 3380 rack->r_ctl.rack_per_of_gp_rec = 100; 3381 } 3382 if (rack->r_ctl.rack_per_of_gp_ca < 100) { 3383 rack->r_ctl.rack_per_of_gp_ca = 100; 3384 } 3385 if (rack->r_ctl.rack_per_of_gp_ss < 100) { 3386 rack->r_ctl.rack_per_of_gp_ss = 100; 3387 } 3388 } 3389 3390 static void 3391 rack_validate_multipliers_at_or_below_100(struct tcp_rack *rack) 3392 { 3393 if (rack->r_ctl.rack_per_of_gp_ca > 100) { 3394 rack->r_ctl.rack_per_of_gp_ca = 100; 3395 } 3396 if (rack->r_ctl.rack_per_of_gp_ss > 100) { 3397 rack->r_ctl.rack_per_of_gp_ss = 100; 3398 } 3399 } 3400 3401 static void 3402 rack_increase_bw_mul(struct tcp_rack *rack, int timely_says, uint64_t cur_bw, uint64_t last_bw_est, int override) 3403 { 3404 int32_t calc, logged, plus; 3405 3406 logged = 0; 3407 3408 if (override) { 3409 /* 3410 * override is passed when we are 3411 * loosing b/w and making one last 3412 * gasp at trying to not loose out 3413 * to a new-reno flow. 3414 */ 3415 goto extra_boost; 3416 } 3417 /* In classic timely we boost by 5x if we have 5 increases in a row, lets not */ 3418 if (rack->rc_gp_incr && 3419 ((rack->rc_gp_timely_inc_cnt + 1) >= RACK_TIMELY_CNT_BOOST)) { 3420 /* 3421 * Reset and get 5 strokes more before the boost. Note 3422 * that the count is 0 based so we have to add one. 
3423 */ 3424 extra_boost: 3425 plus = (uint32_t)rack_gp_increase_per * RACK_TIMELY_CNT_BOOST; 3426 rack->rc_gp_timely_inc_cnt = 0; 3427 } else 3428 plus = (uint32_t)rack_gp_increase_per; 3429 /* Must be at least 1% increase for true timely increases */ 3430 if ((plus < 1) && 3431 ((rack->r_ctl.rc_rtt_diff <= 0) || (timely_says <= 0))) 3432 plus = 1; 3433 if (rack->rc_gp_saw_rec && 3434 (rack->rc_gp_no_rec_chg == 0) && 3435 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 3436 rack->r_ctl.rack_per_of_gp_rec)) { 3437 /* We have been in recovery ding it too */ 3438 calc = rack->r_ctl.rack_per_of_gp_rec + plus; 3439 if (calc > 0xffff) 3440 calc = 0xffff; 3441 logged |= 1; 3442 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)calc; 3443 if (rack_per_upper_bound_ss && 3444 (rack->rc_dragged_bottom == 0) && 3445 (rack->r_ctl.rack_per_of_gp_rec > rack_per_upper_bound_ss)) 3446 rack->r_ctl.rack_per_of_gp_rec = rack_per_upper_bound_ss; 3447 } 3448 if (rack->rc_gp_saw_ca && 3449 (rack->rc_gp_saw_ss == 0) && 3450 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 3451 rack->r_ctl.rack_per_of_gp_ca)) { 3452 /* In CA */ 3453 calc = rack->r_ctl.rack_per_of_gp_ca + plus; 3454 if (calc > 0xffff) 3455 calc = 0xffff; 3456 logged |= 2; 3457 rack->r_ctl.rack_per_of_gp_ca = (uint16_t)calc; 3458 if (rack_per_upper_bound_ca && 3459 (rack->rc_dragged_bottom == 0) && 3460 (rack->r_ctl.rack_per_of_gp_ca > rack_per_upper_bound_ca)) 3461 rack->r_ctl.rack_per_of_gp_ca = rack_per_upper_bound_ca; 3462 } 3463 if (rack->rc_gp_saw_ss && 3464 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 3465 rack->r_ctl.rack_per_of_gp_ss)) { 3466 /* In SS */ 3467 calc = rack->r_ctl.rack_per_of_gp_ss + plus; 3468 if (calc > 0xffff) 3469 calc = 0xffff; 3470 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)calc; 3471 if (rack_per_upper_bound_ss && 3472 (rack->rc_dragged_bottom == 0) && 3473 (rack->r_ctl.rack_per_of_gp_ss > rack_per_upper_bound_ss)) 3474 rack->r_ctl.rack_per_of_gp_ss = rack_per_upper_bound_ss; 3475 logged |= 4; 3476 } 3477 if (logged && 3478 (rack->rc_gp_incr == 0)){ 3479 /* Go into increment mode */ 3480 rack->rc_gp_incr = 1; 3481 rack->rc_gp_timely_inc_cnt = 0; 3482 } 3483 if (rack->rc_gp_incr && 3484 logged && 3485 (rack->rc_gp_timely_inc_cnt < RACK_TIMELY_CNT_BOOST)) { 3486 rack->rc_gp_timely_inc_cnt++; 3487 } 3488 rack_log_timely(rack, logged, plus, 0, 0, 3489 __LINE__, 1); 3490 } 3491 3492 static uint32_t 3493 rack_get_decrease(struct tcp_rack *rack, uint32_t curper, int32_t rtt_diff) 3494 { 3495 /* 3496 * norm_grad = rtt_diff / minrtt; 3497 * new_per = curper * (1 - B * norm_grad) 3498 * 3499 * B = rack_gp_decrease_per (default 10%) 3500 * rtt_dif = input var current rtt-diff 3501 * curper = input var current percentage 3502 * minrtt = from rack filter 3503 * 3504 */ 3505 uint64_t perf; 3506 3507 perf = (((uint64_t)curper * ((uint64_t)1000000 - 3508 ((uint64_t)rack_gp_decrease_per * (uint64_t)10000 * 3509 (((uint64_t)rtt_diff * (uint64_t)1000000)/ 3510 (uint64_t)get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)))/ 3511 (uint64_t)1000000)) / 3512 (uint64_t)1000000); 3513 if (perf > curper) { 3514 /* TSNH */ 3515 perf = curper - 1; 3516 } 3517 return ((uint32_t)perf); 3518 } 3519 3520 static uint32_t 3521 rack_decrease_highrtt(struct tcp_rack *rack, uint32_t curper, uint32_t rtt) 3522 { 3523 /* 3524 * highrttthresh 3525 * result = curper * (1 - (B * ( 1 - ------ )) 3526 * gp_srtt 3527 * 3528 * B = rack_gp_decrease_per (default 10%) 3529 * highrttthresh = filter_min * rack_gp_rtt_maxmul 3530 */ 3531 uint64_t perf; 3532 uint32_t 
highrttthresh; 3533 3534 highrttthresh = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; 3535 3536 perf = (((uint64_t)curper * ((uint64_t)1000000 - 3537 ((uint64_t)rack_gp_decrease_per * ((uint64_t)1000000 - 3538 ((uint64_t)highrttthresh * (uint64_t)1000000) / 3539 (uint64_t)rtt)) / 100)) /(uint64_t)1000000); 3540 return (perf); 3541 } 3542 3543 static void 3544 rack_decrease_bw_mul(struct tcp_rack *rack, int timely_says, uint32_t rtt, int32_t rtt_diff) 3545 { 3546 uint64_t logvar, logvar2, logvar3; 3547 uint32_t logged, new_per, ss_red, ca_red, rec_red, alt, val; 3548 3549 if (rack->rc_gp_incr) { 3550 /* Turn off increment counting */ 3551 rack->rc_gp_incr = 0; 3552 rack->rc_gp_timely_inc_cnt = 0; 3553 } 3554 ss_red = ca_red = rec_red = 0; 3555 logged = 0; 3556 /* Calculate the reduction value */ 3557 if (rtt_diff < 0) { 3558 rtt_diff *= -1; 3559 } 3560 /* Must be at least 1% reduction */ 3561 if (rack->rc_gp_saw_rec && (rack->rc_gp_no_rec_chg == 0)) { 3562 /* We have been in recovery ding it too */ 3563 if (timely_says == 2) { 3564 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_rec, rtt); 3565 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 3566 if (alt < new_per) 3567 val = alt; 3568 else 3569 val = new_per; 3570 } else 3571 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 3572 if (rack->r_ctl.rack_per_of_gp_rec > val) { 3573 rec_red = (rack->r_ctl.rack_per_of_gp_rec - val); 3574 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)val; 3575 } else { 3576 rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; 3577 rec_red = 0; 3578 } 3579 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_rec) 3580 rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; 3581 logged |= 1; 3582 } 3583 if (rack->rc_gp_saw_ss) { 3584 /* Sent in SS */ 3585 if (timely_says == 2) { 3586 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ss, rtt); 3587 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 3588 if (alt < new_per) 3589 val = alt; 3590 else 3591 val = new_per; 3592 } else 3593 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff); 3594 if (rack->r_ctl.rack_per_of_gp_ss > new_per) { 3595 ss_red = rack->r_ctl.rack_per_of_gp_ss - val; 3596 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)val; 3597 } else { 3598 ss_red = new_per; 3599 rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; 3600 logvar = new_per; 3601 logvar <<= 32; 3602 logvar |= alt; 3603 logvar2 = (uint32_t)rtt; 3604 logvar2 <<= 32; 3605 logvar2 |= (uint32_t)rtt_diff; 3606 logvar3 = rack_gp_rtt_maxmul; 3607 logvar3 <<= 32; 3608 logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 3609 rack_log_timely(rack, timely_says, 3610 logvar2, logvar3, 3611 logvar, __LINE__, 10); 3612 } 3613 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ss) 3614 rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; 3615 logged |= 4; 3616 } else if (rack->rc_gp_saw_ca) { 3617 /* Sent in CA */ 3618 if (timely_says == 2) { 3619 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ca, rtt); 3620 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 3621 if (alt < new_per) 3622 val = alt; 3623 else 3624 val = new_per; 3625 } else 3626 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff); 3627 if (rack->r_ctl.rack_per_of_gp_ca > val) { 3628 ca_red = rack->r_ctl.rack_per_of_gp_ca - val; 3629 rack->r_ctl.rack_per_of_gp_ca = 
(uint16_t)val; 3630 } else { 3631 rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; 3632 ca_red = 0; 3633 logvar = new_per; 3634 logvar <<= 32; 3635 logvar |= alt; 3636 logvar2 = (uint32_t)rtt; 3637 logvar2 <<= 32; 3638 logvar2 |= (uint32_t)rtt_diff; 3639 logvar3 = rack_gp_rtt_maxmul; 3640 logvar3 <<= 32; 3641 logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 3642 rack_log_timely(rack, timely_says, 3643 logvar2, logvar3, 3644 logvar, __LINE__, 10); 3645 } 3646 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ca) 3647 rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; 3648 logged |= 2; 3649 } 3650 if (rack->rc_gp_timely_dec_cnt < 0x7) { 3651 rack->rc_gp_timely_dec_cnt++; 3652 if (rack_timely_dec_clear && 3653 (rack->rc_gp_timely_dec_cnt == rack_timely_dec_clear)) 3654 rack->rc_gp_timely_dec_cnt = 0; 3655 } 3656 logvar = ss_red; 3657 logvar <<= 32; 3658 logvar |= ca_red; 3659 rack_log_timely(rack, logged, rec_red, rack_per_lower_bound, logvar, 3660 __LINE__, 2); 3661 } 3662 3663 static void 3664 rack_log_rtt_shrinks(struct tcp_rack *rack, uint32_t us_cts, 3665 uint32_t rtt, uint32_t line, uint8_t reas) 3666 { 3667 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 3668 union tcp_log_stackspecific log; 3669 struct timeval tv; 3670 3671 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 3672 log.u_bbr.flex1 = line; 3673 log.u_bbr.flex2 = rack->r_ctl.rc_time_probertt_starts; 3674 log.u_bbr.flex3 = rack->r_ctl.rc_lower_rtt_us_cts; 3675 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; 3676 log.u_bbr.flex5 = rtt; 3677 log.u_bbr.flex6 = rack->rc_highly_buffered; 3678 log.u_bbr.flex6 <<= 1; 3679 log.u_bbr.flex6 |= rack->forced_ack; 3680 log.u_bbr.flex6 <<= 1; 3681 log.u_bbr.flex6 |= rack->rc_gp_dyn_mul; 3682 log.u_bbr.flex6 <<= 1; 3683 log.u_bbr.flex6 |= rack->in_probe_rtt; 3684 log.u_bbr.flex6 <<= 1; 3685 log.u_bbr.flex6 |= rack->measure_saw_probe_rtt; 3686 log.u_bbr.flex7 = rack->r_ctl.rack_per_of_gp_probertt; 3687 log.u_bbr.pacing_gain = rack->r_ctl.rack_per_of_gp_ca; 3688 log.u_bbr.cwnd_gain = rack->r_ctl.rack_per_of_gp_rec; 3689 log.u_bbr.flex8 = reas; 3690 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3691 log.u_bbr.delRate = rack_get_bw(rack); 3692 log.u_bbr.cur_del_rate = rack->r_ctl.rc_highest_us_rtt; 3693 log.u_bbr.cur_del_rate <<= 32; 3694 log.u_bbr.cur_del_rate |= rack->r_ctl.rc_lowest_us_rtt; 3695 log.u_bbr.applimited = rack->r_ctl.rc_time_probertt_entered; 3696 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; 3697 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3698 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; 3699 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; 3700 log.u_bbr.pkt_epoch = rack->r_ctl.rc_lower_rtt_us_cts; 3701 log.u_bbr.delivered = rack->r_ctl.rc_target_probertt_flight; 3702 log.u_bbr.lost = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 3703 log.u_bbr.rttProp = us_cts; 3704 log.u_bbr.rttProp <<= 32; 3705 log.u_bbr.rttProp |= rack->r_ctl.rc_entry_gp_rtt; 3706 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3707 &rack->rc_inp->inp_socket->so_rcv, 3708 &rack->rc_inp->inp_socket->so_snd, 3709 BBR_LOG_RTT_SHRINKS, 0, 3710 0, &log, false, &rack->r_ctl.act_rcv_time); 3711 } 3712 } 3713 3714 static void 3715 rack_set_prtt_target(struct tcp_rack *rack, uint32_t segsiz, uint32_t rtt) 3716 { 3717 uint64_t bwdp; 3718 3719 bwdp = rack_get_bw(rack); 3720 bwdp *= (uint64_t)rtt; 3721 bwdp /= (uint64_t)HPTS_USEC_IN_SEC; 3722 rack->r_ctl.rc_target_probertt_flight = roundup((uint32_t)bwdp, segsiz); 3723 if (rack->r_ctl.rc_target_probertt_flight < (segsiz * 
rack_timely_min_segs)) { 3724 /* 3725 * A window protocol must be able to have 4 packets 3726 * outstanding as the floor in order to function 3727 * (especially considering delayed ack :D). 3728 */ 3729 rack->r_ctl.rc_target_probertt_flight = (segsiz * rack_timely_min_segs); 3730 } 3731 } 3732 3733 static void 3734 rack_enter_probertt(struct tcp_rack *rack, uint32_t us_cts) 3735 { 3736 /** 3737 * ProbeRTT is a bit different in rack_pacing than in 3738 * BBR. It is like BBR in that it uses the lowering of 3739 * the RTT as a signal that we saw something new and 3740 * counts from there for how long between. But it is 3741 * different in that its quite simple. It does not 3742 * play with the cwnd and wait until we get down 3743 * to N segments outstanding and hold that for 3744 * 200ms. Instead it just sets the pacing reduction 3745 * rate to a set percentage (70 by default) and hold 3746 * that for a number of recent GP Srtt's. 3747 */ 3748 uint32_t segsiz; 3749 3750 if (rack->rc_gp_dyn_mul == 0) 3751 return; 3752 3753 if (rack->rc_tp->snd_max == rack->rc_tp->snd_una) { 3754 /* We are idle */ 3755 return; 3756 } 3757 if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && 3758 SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { 3759 /* 3760 * Stop the goodput now, the idea here is 3761 * that future measurements with in_probe_rtt 3762 * won't register if they are not greater so 3763 * we want to get what info (if any) is available 3764 * now. 3765 */ 3766 rack_do_goodput_measurement(rack->rc_tp, rack, 3767 rack->rc_tp->snd_una, __LINE__, 3768 RACK_QUALITY_PROBERTT); 3769 } 3770 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 3771 rack->r_ctl.rc_time_probertt_entered = us_cts; 3772 segsiz = min(ctf_fixed_maxseg(rack->rc_tp), 3773 rack->r_ctl.rc_pace_min_segs); 3774 rack->in_probe_rtt = 1; 3775 rack->measure_saw_probe_rtt = 1; 3776 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 3777 rack->r_ctl.rc_time_probertt_starts = 0; 3778 rack->r_ctl.rc_entry_gp_rtt = rack->r_ctl.rc_gp_srtt; 3779 if (rack_probertt_use_min_rtt_entry) 3780 rack_set_prtt_target(rack, segsiz, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); 3781 else 3782 rack_set_prtt_target(rack, segsiz, rack->r_ctl.rc_gp_srtt); 3783 rack_log_rtt_shrinks(rack, us_cts, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3784 __LINE__, RACK_RTTS_ENTERPROBE); 3785 } 3786 3787 static void 3788 rack_exit_probertt(struct tcp_rack *rack, uint32_t us_cts) 3789 { 3790 struct rack_sendmap *rsm; 3791 uint32_t segsiz; 3792 3793 segsiz = min(ctf_fixed_maxseg(rack->rc_tp), 3794 rack->r_ctl.rc_pace_min_segs); 3795 rack->in_probe_rtt = 0; 3796 if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && 3797 SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { 3798 /* 3799 * Stop the goodput now, the idea here is 3800 * that future measurements with in_probe_rtt 3801 * won't register if they are not greater so 3802 * we want to get what info (if any) is available 3803 * now. 3804 */ 3805 rack_do_goodput_measurement(rack->rc_tp, rack, 3806 rack->rc_tp->snd_una, __LINE__, 3807 RACK_QUALITY_PROBERTT); 3808 } else if (rack->rc_tp->t_flags & TF_GPUTINPROG) { 3809 /* 3810 * We don't have enough data to make a measurement. 3811 * So lets just stop and start here after exiting 3812 * probe-rtt. We probably are not interested in 3813 * the results anyway. 3814 */ 3815 rack->rc_tp->t_flags &= ~TF_GPUTINPROG; 3816 } 3817 /* 3818 * Measurements through the current snd_max are going 3819 * to be limited by the slower pacing rate. 
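	 * (For example, data that was paced at the ~70% probe-rtt pacing
	 * percentage described in rack_enter_probertt() would otherwise
	 * score as roughly a 30% drop in goodput purely from the pacing
	 * change; the numbers are only illustrative.)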
3820 * 3821 * We need to mark these as app-limited so we 3822 * don't collapse the b/w. 3823 */ 3824 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 3825 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { 3826 if (rack->r_ctl.rc_app_limited_cnt == 0) 3827 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; 3828 else { 3829 /* 3830 * Go out to the end app limited and mark 3831 * this new one as next and move the end_appl up 3832 * to this guy. 3833 */ 3834 if (rack->r_ctl.rc_end_appl) 3835 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; 3836 rack->r_ctl.rc_end_appl = rsm; 3837 } 3838 rsm->r_flags |= RACK_APP_LIMITED; 3839 rack->r_ctl.rc_app_limited_cnt++; 3840 } 3841 /* 3842 * Now, we need to examine our pacing rate multipliers. 3843 * If its under 100%, we need to kick it back up to 3844 * 100%. We also don't let it be over our "max" above 3845 * the actual rate i.e. 100% + rack_clamp_atexit_prtt. 3846 * Note setting clamp_atexit_prtt to 0 has the effect 3847 * of setting CA/SS to 100% always at exit (which is 3848 * the default behavior). 3849 */ 3850 if (rack_probertt_clear_is) { 3851 rack->rc_gp_incr = 0; 3852 rack->rc_gp_bwred = 0; 3853 rack->rc_gp_timely_inc_cnt = 0; 3854 rack->rc_gp_timely_dec_cnt = 0; 3855 } 3856 /* Do we do any clamping at exit? */ 3857 if (rack->rc_highly_buffered && rack_atexit_prtt_hbp) { 3858 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt_hbp; 3859 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt_hbp; 3860 } 3861 if ((rack->rc_highly_buffered == 0) && rack_atexit_prtt) { 3862 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt; 3863 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt; 3864 } 3865 /* 3866 * Lets set rtt_diff to 0, so that we will get a "boost" 3867 * after exiting. 3868 */ 3869 rack->r_ctl.rc_rtt_diff = 0; 3870 3871 /* Clear all flags so we start fresh */ 3872 rack->rc_tp->t_bytes_acked = 0; 3873 rack->rc_tp->ccv->flags &= ~CCF_ABC_SENTAWND; 3874 /* 3875 * If configured to, set the cwnd and ssthresh to 3876 * our targets. 3877 */ 3878 if (rack_probe_rtt_sets_cwnd) { 3879 uint64_t ebdp; 3880 uint32_t setto; 3881 3882 /* Set ssthresh so we get into CA once we hit our target */ 3883 if (rack_probertt_use_min_rtt_exit == 1) { 3884 /* Set to min rtt */ 3885 rack_set_prtt_target(rack, segsiz, 3886 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); 3887 } else if (rack_probertt_use_min_rtt_exit == 2) { 3888 /* Set to current gp rtt */ 3889 rack_set_prtt_target(rack, segsiz, 3890 rack->r_ctl.rc_gp_srtt); 3891 } else if (rack_probertt_use_min_rtt_exit == 3) { 3892 /* Set to entry gp rtt */ 3893 rack_set_prtt_target(rack, segsiz, 3894 rack->r_ctl.rc_entry_gp_rtt); 3895 } else { 3896 uint64_t sum; 3897 uint32_t setval; 3898 3899 sum = rack->r_ctl.rc_entry_gp_rtt; 3900 sum *= 10; 3901 sum /= (uint64_t)(max(1, rack->r_ctl.rc_gp_srtt)); 3902 if (sum >= 20) { 3903 /* 3904 * A highly buffered path needs 3905 * cwnd space for timely to work. 3906 * Lets set things up as if 3907 * we are heading back here again. 3908 */ 3909 setval = rack->r_ctl.rc_entry_gp_rtt; 3910 } else if (sum >= 15) { 3911 /* 3912 * Lets take the smaller of the 3913 * two since we are just somewhat 3914 * buffered. 3915 */ 3916 setval = rack->r_ctl.rc_gp_srtt; 3917 if (setval > rack->r_ctl.rc_entry_gp_rtt) 3918 setval = rack->r_ctl.rc_entry_gp_rtt; 3919 } else { 3920 /* 3921 * Here we are not highly buffered 3922 * and should pick the min we can to 3923 * keep from causing loss. 
3924 */ 3925 setval = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 3926 } 3927 rack_set_prtt_target(rack, segsiz, 3928 setval); 3929 } 3930 if (rack_probe_rtt_sets_cwnd > 1) { 3931 /* There is a percentage here to boost */ 3932 ebdp = rack->r_ctl.rc_target_probertt_flight; 3933 ebdp *= rack_probe_rtt_sets_cwnd; 3934 ebdp /= 100; 3935 setto = rack->r_ctl.rc_target_probertt_flight + ebdp; 3936 } else 3937 setto = rack->r_ctl.rc_target_probertt_flight; 3938 rack->rc_tp->snd_cwnd = roundup(setto, segsiz); 3939 if (rack->rc_tp->snd_cwnd < (segsiz * rack_timely_min_segs)) { 3940 /* Enforce a min */ 3941 rack->rc_tp->snd_cwnd = segsiz * rack_timely_min_segs; 3942 } 3943 /* If we set in the cwnd also set the ssthresh point so we are in CA */ 3944 rack->rc_tp->snd_ssthresh = (rack->rc_tp->snd_cwnd - 1); 3945 } 3946 rack_log_rtt_shrinks(rack, us_cts, 3947 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3948 __LINE__, RACK_RTTS_EXITPROBE); 3949 /* Clear times last so log has all the info */ 3950 rack->r_ctl.rc_probertt_sndmax_atexit = rack->rc_tp->snd_max; 3951 rack->r_ctl.rc_time_probertt_entered = us_cts; 3952 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 3953 rack->r_ctl.rc_time_of_last_probertt = us_cts; 3954 } 3955 3956 static void 3957 rack_check_probe_rtt(struct tcp_rack *rack, uint32_t us_cts) 3958 { 3959 /* Check in on probe-rtt */ 3960 if (rack->rc_gp_filled == 0) { 3961 /* We do not do p-rtt unless we have gp measurements */ 3962 return; 3963 } 3964 if (rack->in_probe_rtt) { 3965 uint64_t no_overflow; 3966 uint32_t endtime, must_stay; 3967 3968 if (rack->r_ctl.rc_went_idle_time && 3969 ((us_cts - rack->r_ctl.rc_went_idle_time) > rack_min_probertt_hold)) { 3970 /* 3971 * We went idle during prtt, just exit now. 3972 */ 3973 rack_exit_probertt(rack, us_cts); 3974 } else if (rack_probe_rtt_safety_val && 3975 TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered) && 3976 ((us_cts - rack->r_ctl.rc_time_probertt_entered) > rack_probe_rtt_safety_val)) { 3977 /* 3978 * Probe RTT safety value triggered! 3979 */ 3980 rack_log_rtt_shrinks(rack, us_cts, 3981 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3982 __LINE__, RACK_RTTS_SAFETY); 3983 rack_exit_probertt(rack, us_cts); 3984 } 3985 /* Calculate the max we will wait */ 3986 endtime = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_max_drain_wait); 3987 if (rack->rc_highly_buffered) 3988 endtime += (rack->r_ctl.rc_gp_srtt * rack_max_drain_hbp); 3989 /* Calculate the min we must wait */ 3990 must_stay = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_must_drain); 3991 if ((ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.rc_target_probertt_flight) && 3992 TSTMP_LT(us_cts, endtime)) { 3993 uint32_t calc; 3994 /* Do we lower more? 
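			 * An illustrative example (hypothetical values, not
			 * verified defaults): three full gp_srtt's after
			 * rc_time_probertt_entered with a per-srtt reduce step
			 * of 10 drops the probe-rtt pacing percentage by
			 * 3 * 10 = 30 points below rack_per_of_gp_probertt,
			 * floored at rack_per_of_gp_lowthresh below.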
*/ 3995 no_exit: 3996 if (TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered)) 3997 calc = us_cts - rack->r_ctl.rc_time_probertt_entered; 3998 else 3999 calc = 0; 4000 calc /= max(rack->r_ctl.rc_gp_srtt, 1); 4001 if (calc) { 4002 /* Maybe */ 4003 calc *= rack_per_of_gp_probertt_reduce; 4004 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc; 4005 /* Limit it too */ 4006 if (rack->r_ctl.rack_per_of_gp_probertt < rack_per_of_gp_lowthresh) 4007 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh; 4008 } 4009 /* We must reach target or the time set */ 4010 return; 4011 } 4012 if (rack->r_ctl.rc_time_probertt_starts == 0) { 4013 if ((TSTMP_LT(us_cts, must_stay) && 4014 rack->rc_highly_buffered) || 4015 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > 4016 rack->r_ctl.rc_target_probertt_flight)) { 4017 /* We are not past the must_stay time */ 4018 goto no_exit; 4019 } 4020 rack_log_rtt_shrinks(rack, us_cts, 4021 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4022 __LINE__, RACK_RTTS_REACHTARGET); 4023 rack->r_ctl.rc_time_probertt_starts = us_cts; 4024 if (rack->r_ctl.rc_time_probertt_starts == 0) 4025 rack->r_ctl.rc_time_probertt_starts = 1; 4026 /* Restore back to our rate we want to pace at in prtt */ 4027 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 4028 } 4029 /* 4030 * Setup our end time, some number of gp_srtts plus 200ms. 4031 */ 4032 no_overflow = ((uint64_t)rack->r_ctl.rc_gp_srtt * 4033 (uint64_t)rack_probertt_gpsrtt_cnt_mul); 4034 if (rack_probertt_gpsrtt_cnt_div) 4035 endtime = (uint32_t)(no_overflow / (uint64_t)rack_probertt_gpsrtt_cnt_div); 4036 else 4037 endtime = 0; 4038 endtime += rack_min_probertt_hold; 4039 endtime += rack->r_ctl.rc_time_probertt_starts; 4040 if (TSTMP_GEQ(us_cts, endtime)) { 4041 /* yes, exit probertt */ 4042 rack_exit_probertt(rack, us_cts); 4043 } 4044 4045 } else if ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt) { 4046 /* Go into probertt, its been too long since we went lower */ 4047 rack_enter_probertt(rack, us_cts); 4048 } 4049 } 4050 4051 static void 4052 rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last_bw_est, 4053 uint32_t rtt, int32_t rtt_diff) 4054 { 4055 uint64_t cur_bw, up_bnd, low_bnd, subfr; 4056 uint32_t losses; 4057 4058 if ((rack->rc_gp_dyn_mul == 0) || 4059 (rack->use_fixed_rate) || 4060 (rack->in_probe_rtt) || 4061 (rack->rc_always_pace == 0)) { 4062 /* No dynamic GP multipler in play */ 4063 return; 4064 } 4065 losses = rack->r_ctl.rc_loss_count - rack->r_ctl.rc_loss_at_start; 4066 cur_bw = rack_get_bw(rack); 4067 /* Calculate our up and down range */ 4068 up_bnd = rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_up; 4069 up_bnd /= 100; 4070 up_bnd += rack->r_ctl.last_gp_comp_bw; 4071 4072 subfr = (uint64_t)rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_down; 4073 subfr /= 100; 4074 low_bnd = rack->r_ctl.last_gp_comp_bw - subfr; 4075 if ((timely_says == 2) && (rack->r_ctl.rc_no_push_at_mrtt)) { 4076 /* 4077 * This is the case where our RTT is above 4078 * the max target and we have been configured 4079 * to just do timely no bonus up stuff in that case. 4080 * 4081 * There are two configurations, set to 1, and we 4082 * just do timely if we are over our max. If its 4083 * set above 1 then we slam the multipliers down 4084 * to 100 and then decrement per timely. 
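		 * (Being "over our max" is the timely_says == 2 case from
		 * rack_make_timely_judgement(): the gp srtt has reached
		 * rack_gp_rtt_maxmul times the filtered min-rtt, e.g. an
		 * illustrative min-rtt of 10000 usec and a multiplier of 3
		 * put the threshold at 30000 usec.)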
		 */
		rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd,
				__LINE__, 3);
		if (rack->r_ctl.rc_no_push_at_mrtt > 1)
			rack_validate_multipliers_at_or_below_100(rack);
		rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff);
	} else if ((last_bw_est < low_bnd) && !losses) {
		/*
		 * We are decreasing; this is a bit complicated. It
		 * means we are losing ground. This could be
		 * because another flow entered and we are competing
		 * for b/w with it. This will push the RTT up which
		 * makes timely unusable unless we want to get shoved
		 * into a corner and just be backed off (the age
		 * old problem with delay based CC).
		 *
		 * On the other hand if it was a route change we
		 * would like to stay somewhat contained and not
		 * blow out the buffers.
		 */
		rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd,
				__LINE__, 3);
		rack->r_ctl.last_gp_comp_bw = cur_bw;
		if (rack->rc_gp_bwred == 0) {
			/* Go into reduction counting */
			rack->rc_gp_bwred = 1;
			rack->rc_gp_timely_dec_cnt = 0;
		}
		if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) ||
		    (timely_says == 0)) {
			/*
			 * Push another time with a faster pacing
			 * to try to gain back (we include override to
			 * get a full raise factor).
			 */
			if ((rack->rc_gp_saw_ca && rack->r_ctl.rack_per_of_gp_ca <= rack_down_raise_thresh) ||
			    (rack->rc_gp_saw_ss && rack->r_ctl.rack_per_of_gp_ss <= rack_down_raise_thresh) ||
			    (timely_says == 0) ||
			    (rack_down_raise_thresh == 0)) {
				/*
				 * Do an override up in b/w if we were
				 * below the threshold, or if the threshold
				 * is zero we always do the raise.
				 */
				rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 1);
			} else {
				/* Log it stays the same */
				rack_log_timely(rack, 0, last_bw_est, low_bnd, 0,
						__LINE__, 11);
			}
			rack->rc_gp_timely_dec_cnt++;
			/* We are not really incrementing, so no-count */
			rack->rc_gp_incr = 0;
			rack->rc_gp_timely_inc_cnt = 0;
		} else {
			/*
			 * Lets just use the RTT
			 * information and give up
			 * pushing.
			 */
			goto use_timely;
		}
	} else if ((timely_says != 2) &&
		   !losses &&
		   (last_bw_est > up_bnd)) {
		/*
		 * We are increasing b/w, lets keep going, updating
		 * our b/w and ignoring any timely input, unless
		 * of course we are at our max raise (if there is one).
		 */

		rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd,
				__LINE__, 3);
		rack->r_ctl.last_gp_comp_bw = cur_bw;
		if (rack->rc_gp_saw_ss &&
		    rack_per_upper_bound_ss &&
		    (rack->r_ctl.rack_per_of_gp_ss == rack_per_upper_bound_ss)) {
			/*
			 * In cases where we can't go higher
			 * we should just use timely.
			 */
			goto use_timely;
		}
		if (rack->rc_gp_saw_ca &&
		    rack_per_upper_bound_ca &&
		    (rack->r_ctl.rack_per_of_gp_ca == rack_per_upper_bound_ca)) {
			/*
			 * In cases where we can't go higher
			 * we should just use timely.
4174 */ 4175 goto use_timely; 4176 } 4177 rack->rc_gp_bwred = 0; 4178 rack->rc_gp_timely_dec_cnt = 0; 4179 /* You get a set number of pushes if timely is trying to reduce */ 4180 if ((rack->rc_gp_incr < rack_timely_max_push_rise) || (timely_says == 0)) { 4181 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 4182 } else { 4183 /* Log it stays the same */ 4184 rack_log_timely(rack, 0, last_bw_est, up_bnd, 0, 4185 __LINE__, 12); 4186 } 4187 return; 4188 } else { 4189 /* 4190 * We are staying between the lower and upper range bounds 4191 * so use timely to decide. 4192 */ 4193 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 4194 __LINE__, 3); 4195 use_timely: 4196 if (timely_says) { 4197 rack->rc_gp_incr = 0; 4198 rack->rc_gp_timely_inc_cnt = 0; 4199 if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) && 4200 !losses && 4201 (last_bw_est < low_bnd)) { 4202 /* We are loosing ground */ 4203 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 4204 rack->rc_gp_timely_dec_cnt++; 4205 /* We are not incrementing really no-count */ 4206 rack->rc_gp_incr = 0; 4207 rack->rc_gp_timely_inc_cnt = 0; 4208 } else 4209 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); 4210 } else { 4211 rack->rc_gp_bwred = 0; 4212 rack->rc_gp_timely_dec_cnt = 0; 4213 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 4214 } 4215 } 4216 } 4217 4218 static int32_t 4219 rack_make_timely_judgement(struct tcp_rack *rack, uint32_t rtt, int32_t rtt_diff, uint32_t prev_rtt) 4220 { 4221 int32_t timely_says; 4222 uint64_t log_mult, log_rtt_a_diff; 4223 4224 log_rtt_a_diff = rtt; 4225 log_rtt_a_diff <<= 32; 4226 log_rtt_a_diff |= (uint32_t)rtt_diff; 4227 if (rtt >= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * 4228 rack_gp_rtt_maxmul)) { 4229 /* Reduce the b/w multipler */ 4230 timely_says = 2; 4231 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; 4232 log_mult <<= 32; 4233 log_mult |= prev_rtt; 4234 rack_log_timely(rack, timely_says, log_mult, 4235 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4236 log_rtt_a_diff, __LINE__, 4); 4237 } else if (rtt <= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) + 4238 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) / 4239 max(rack_gp_rtt_mindiv , 1)))) { 4240 /* Increase the b/w multipler */ 4241 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) + 4242 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) / 4243 max(rack_gp_rtt_mindiv , 1)); 4244 log_mult <<= 32; 4245 log_mult |= prev_rtt; 4246 timely_says = 0; 4247 rack_log_timely(rack, timely_says, log_mult , 4248 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4249 log_rtt_a_diff, __LINE__, 5); 4250 } else { 4251 /* 4252 * Use a gradient to find it the timely gradient 4253 * is: 4254 * grad = rc_rtt_diff / min_rtt; 4255 * 4256 * anything below or equal to 0 will be 4257 * a increase indication. Anything above 4258 * zero is a decrease. Note we take care 4259 * of the actual gradient calculation 4260 * in the reduction (its not needed for 4261 * increase). 
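		 * An illustrative example (hypothetical values): rc_rtt_diff
		 * of +2000 usec against a filtered min-rtt of 20000 usec
		 * gives grad = 0.1, which signals a decrease; a diff of
		 * -500 usec (or zero) signals an increase.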
4262 */ 4263 log_mult = prev_rtt; 4264 if (rtt_diff <= 0) { 4265 /* 4266 * Rttdiff is less than zero, increase the 4267 * b/w multipler (its 0 or negative) 4268 */ 4269 timely_says = 0; 4270 rack_log_timely(rack, timely_says, log_mult, 4271 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 6); 4272 } else { 4273 /* Reduce the b/w multipler */ 4274 timely_says = 1; 4275 rack_log_timely(rack, timely_says, log_mult, 4276 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 7); 4277 } 4278 } 4279 return (timely_says); 4280 } 4281 4282 static void 4283 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, 4284 tcp_seq th_ack, int line, uint8_t quality) 4285 { 4286 uint64_t tim, bytes_ps, ltim, stim, utim; 4287 uint32_t segsiz, bytes, reqbytes, us_cts; 4288 int32_t gput, new_rtt_diff, timely_says; 4289 uint64_t resid_bw, subpart = 0, addpart = 0, srtt; 4290 int did_add = 0; 4291 4292 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 4293 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 4294 if (TSTMP_GEQ(us_cts, tp->gput_ts)) 4295 tim = us_cts - tp->gput_ts; 4296 else 4297 tim = 0; 4298 if (rack->r_ctl.rc_gp_cumack_ts > rack->r_ctl.rc_gp_output_ts) 4299 stim = rack->r_ctl.rc_gp_cumack_ts - rack->r_ctl.rc_gp_output_ts; 4300 else 4301 stim = 0; 4302 /* 4303 * Use the larger of the send time or ack time. This prevents us 4304 * from being influenced by ack artifacts to come up with too 4305 * high of measurement. Note that since we are spanning over many more 4306 * bytes in most of our measurements hopefully that is less likely to 4307 * occur. 4308 */ 4309 if (tim > stim) 4310 utim = max(tim, 1); 4311 else 4312 utim = max(stim, 1); 4313 /* Lets get a msec time ltim too for the old stuff */ 4314 ltim = max(1, (utim / HPTS_USEC_IN_MSEC)); 4315 gput = (((uint64_t) (th_ack - tp->gput_seq)) << 3) / ltim; 4316 reqbytes = min(rc_init_window(rack), (MIN_GP_WIN * segsiz)); 4317 if ((tim == 0) && (stim == 0)) { 4318 /* 4319 * Invalid measurement time, maybe 4320 * all on one ack/one send? 4321 */ 4322 bytes = 0; 4323 bytes_ps = 0; 4324 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4325 0, 0, 0, 10, __LINE__, NULL, quality); 4326 goto skip_measurement; 4327 } 4328 if (rack->r_ctl.rc_gp_lowrtt == 0xffffffff) { 4329 /* We never made a us_rtt measurement? */ 4330 bytes = 0; 4331 bytes_ps = 0; 4332 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4333 0, 0, 0, 10, __LINE__, NULL, quality); 4334 goto skip_measurement; 4335 } 4336 /* 4337 * Calculate the maximum possible b/w this connection 4338 * could have. We base our calculation on the lowest 4339 * rtt we have seen during the measurement and the 4340 * largest rwnd the client has given us in that time. This 4341 * forms a BDP that is the maximum that we could ever 4342 * get to the client. Anything larger is not valid. 4343 * 4344 * I originally had code here that rejected measurements 4345 * where the time was less than 1/2 the latest us_rtt. 4346 * But after thinking on that I realized its wrong since 4347 * say you had a 150Mbps or even 1Gbps link, and you 4348 * were a long way away.. example I am in Europe (100ms rtt) 4349 * talking to my 1Gbps link in S.C. Now measuring say 150,000 4350 * bytes my time would be 1.2ms, and yet my rtt would say 4351 * the measurement was invalid the time was < 50ms. The 4352 * same thing is true for 150Mb (8ms of time). 
4353 * 4354 * A better way I realized is to look at what the maximum 4355 * the connection could possibly do. This is gated on 4356 * the lowest RTT we have seen and the highest rwnd. 4357 * We should in theory never exceed that, if we are 4358 * then something on the path is storing up packets 4359 * and then feeding them all at once to our endpoint 4360 * messing up our measurement. 4361 */ 4362 rack->r_ctl.last_max_bw = rack->r_ctl.rc_gp_high_rwnd; 4363 rack->r_ctl.last_max_bw *= HPTS_USEC_IN_SEC; 4364 rack->r_ctl.last_max_bw /= rack->r_ctl.rc_gp_lowrtt; 4365 if (SEQ_LT(th_ack, tp->gput_seq)) { 4366 /* No measurement can be made */ 4367 bytes = 0; 4368 bytes_ps = 0; 4369 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4370 0, 0, 0, 10, __LINE__, NULL, quality); 4371 goto skip_measurement; 4372 } else 4373 bytes = (th_ack - tp->gput_seq); 4374 bytes_ps = (uint64_t)bytes; 4375 /* 4376 * Don't measure a b/w for pacing unless we have gotten at least 4377 * an initial windows worth of data in this measurement interval. 4378 * 4379 * Small numbers of bytes get badly influenced by delayed ack and 4380 * other artifacts. Note we take the initial window or our 4381 * defined minimum GP (defaulting to 10 which hopefully is the 4382 * IW). 4383 */ 4384 if (rack->rc_gp_filled == 0) { 4385 /* 4386 * The initial estimate is special. We 4387 * have blasted out an IW worth of packets 4388 * without a real valid ack ts results. We 4389 * then setup the app_limited_needs_set flag, 4390 * this should get the first ack in (probably 2 4391 * MSS worth) to be recorded as the timestamp. 4392 * We thus allow a smaller number of bytes i.e. 4393 * IW - 2MSS. 4394 */ 4395 reqbytes -= (2 * segsiz); 4396 /* Also lets fill previous for our first measurement to be neutral */ 4397 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; 4398 } 4399 if ((bytes_ps < reqbytes) || rack->app_limited_needs_set) { 4400 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4401 rack->r_ctl.rc_app_limited_cnt, 4402 0, 0, 10, __LINE__, NULL, quality); 4403 goto skip_measurement; 4404 } 4405 /* 4406 * We now need to calculate the Timely like status so 4407 * we can update (possibly) the b/w multipliers. 4408 */ 4409 new_rtt_diff = (int32_t)rack->r_ctl.rc_gp_srtt - (int32_t)rack->r_ctl.rc_prev_gp_srtt; 4410 if (rack->rc_gp_filled == 0) { 4411 /* No previous reading */ 4412 rack->r_ctl.rc_rtt_diff = new_rtt_diff; 4413 } else { 4414 if (rack->measure_saw_probe_rtt == 0) { 4415 /* 4416 * We don't want a probertt to be counted 4417 * since it will be negative incorrectly. We 4418 * expect to be reducing the RTT when we 4419 * pace at a slower rate. 4420 */ 4421 rack->r_ctl.rc_rtt_diff -= (rack->r_ctl.rc_rtt_diff / 8); 4422 rack->r_ctl.rc_rtt_diff += (new_rtt_diff / 8); 4423 } 4424 } 4425 timely_says = rack_make_timely_judgement(rack, 4426 rack->r_ctl.rc_gp_srtt, 4427 rack->r_ctl.rc_rtt_diff, 4428 rack->r_ctl.rc_prev_gp_srtt 4429 ); 4430 bytes_ps *= HPTS_USEC_IN_SEC; 4431 bytes_ps /= utim; 4432 if (bytes_ps > rack->r_ctl.last_max_bw) { 4433 /* 4434 * Something is on path playing 4435 * since this b/w is not possible based 4436 * on our BDP (highest rwnd and lowest rtt 4437 * we saw in the measurement window). 4438 * 4439 * Another option here would be to 4440 * instead skip the measurement. 
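		 * An illustrative example (hypothetical values):
		 * rc_gp_high_rwnd = 1,000,000 bytes with rc_gp_lowrtt =
		 * 10,000 usec caps last_max_bw at
		 * 1,000,000 * 1,000,000 / 10,000 = 100,000,000 bytes/sec,
		 * and bytes_ps is clamped to that here.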
		 */
		rack_log_pacing_delay_calc(rack, bytes, reqbytes,
					   bytes_ps, rack->r_ctl.last_max_bw, 0,
					   11, __LINE__, NULL, quality);
		bytes_ps = rack->r_ctl.last_max_bw;
	}
	/* We store gp for b/w in bytes per second */
	if (rack->rc_gp_filled == 0) {
		/* Initial measurement */
		if (bytes_ps) {
			rack->r_ctl.gp_bw = bytes_ps;
			rack->rc_gp_filled = 1;
			rack->r_ctl.num_measurements = 1;
			rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL);
		} else {
			rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
						   rack->r_ctl.rc_app_limited_cnt,
						   0, 0, 10, __LINE__, NULL, quality);
		}
		if (tcp_in_hpts(rack->rc_inp) &&
		    (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
			/*
			 * Ok we can't trust the pacer in this case
			 * where we transition from un-paced to paced.
			 * Or for that matter when the burst mitigation
			 * was making a wild guess and got it wrong.
			 * Stop the pacer and clear up all the aggregate
			 * delays etc.
			 */
			tcp_hpts_remove(rack->rc_inp);
			rack->r_ctl.rc_hpts_flags = 0;
			rack->r_ctl.rc_last_output_to = 0;
		}
		did_add = 2;
	} else if (rack->r_ctl.num_measurements < RACK_REQ_AVG) {
		/* Still a small number, run an average */
		rack->r_ctl.gp_bw += bytes_ps;
		addpart = rack->r_ctl.num_measurements;
		rack->r_ctl.num_measurements++;
		if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) {
			/* We have collected enough to move forward */
			rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_measurements;
		}
		did_add = 3;
	} else {
		/*
		 * We want to take 1/wma of the goodput and add it to 7/8th
		 * of the old value weighted by the srtt. So if your measurement
		 * period is say 2 SRTT's long you would get 1/4 as the
		 * value, if it was like 1/2 SRTT then you would get 1/16th.
		 *
		 * But we must be careful not to take too much i.e. if the
		 * srtt is say 20ms and the measurement is taken over
		 * 400ms our weight would be 400/20 i.e. 20. On the
		 * other hand if we get a measurement over 1ms with a
		 * 10ms rtt we only want to take a much smaller portion.
		 */
		if (rack->r_ctl.num_measurements < 0xff) {
			rack->r_ctl.num_measurements++;
		}
		srtt = (uint64_t)tp->t_srtt;
		if (srtt == 0) {
			/*
			 * Strange, why did t_srtt go back to zero?
			 */
			if (rack->r_ctl.rc_rack_min_rtt)
				srtt = rack->r_ctl.rc_rack_min_rtt;
			else
				srtt = HPTS_USEC_IN_MSEC;
		}
		/*
		 * XXXrrs: Note for reviewers, in playing with
		 * dynamic pacing I discovered this GP calculation
		 * as done originally leads to some undesired results.
		 * Basically you can get longer measurements contributing
		 * too much to the WMA. Thus I changed it so that if you are
		 * doing dynamic adjustments we only do the apportioned
		 * adjustment if we have a very small (time wise) measurement.
		 * Longer measurements just get their weight (defaulting to
		 * 1/8) added to the WMA. We may want to think about changing
		 * this to always do that for both sides i.e. dynamic
		 * and non-dynamic... but considering lots of folks
		 * were playing with this I did not want to change the
		 * calculation per se without your thoughts.. Lawerence?
		 * Peter??
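		 * An illustrative example of the weighting (hypothetical
		 * values): with srtt = 20000 usec and a measurement spanning
		 * utim = 40000 usec, the static path below swaps out
		 * utim / (srtt * 8) = 1/4 of gp_bw for the new sample, while
		 * the dynamic path treats any measurement with utim/srtt > 1
		 * as "long" and folds it in at the flat 1/rack_wma_divisor
		 * share (1/8 by the default mentioned above).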
4526 */ 4527 if (rack->rc_gp_dyn_mul == 0) { 4528 subpart = rack->r_ctl.gp_bw * utim; 4529 subpart /= (srtt * 8); 4530 if (subpart < (rack->r_ctl.gp_bw / 2)) { 4531 /* 4532 * The b/w update takes no more 4533 * away then 1/2 our running total 4534 * so factor it in. 4535 */ 4536 addpart = bytes_ps * utim; 4537 addpart /= (srtt * 8); 4538 } else { 4539 /* 4540 * Don't allow a single measurement 4541 * to account for more than 1/2 of the 4542 * WMA. This could happen on a retransmission 4543 * where utim becomes huge compared to 4544 * srtt (multiple retransmissions when using 4545 * the sending rate which factors in all the 4546 * transmissions from the first one). 4547 */ 4548 subpart = rack->r_ctl.gp_bw / 2; 4549 addpart = bytes_ps / 2; 4550 } 4551 resid_bw = rack->r_ctl.gp_bw - subpart; 4552 rack->r_ctl.gp_bw = resid_bw + addpart; 4553 did_add = 1; 4554 } else { 4555 if ((utim / srtt) <= 1) { 4556 /* 4557 * The b/w update was over a small period 4558 * of time. The idea here is to prevent a small 4559 * measurement time period from counting 4560 * too much. So we scale it based on the 4561 * time so it attributes less than 1/rack_wma_divisor 4562 * of its measurement. 4563 */ 4564 subpart = rack->r_ctl.gp_bw * utim; 4565 subpart /= (srtt * rack_wma_divisor); 4566 addpart = bytes_ps * utim; 4567 addpart /= (srtt * rack_wma_divisor); 4568 } else { 4569 /* 4570 * The scaled measurement was long 4571 * enough so lets just add in the 4572 * portion of the measurment i.e. 1/rack_wma_divisor 4573 */ 4574 subpart = rack->r_ctl.gp_bw / rack_wma_divisor; 4575 addpart = bytes_ps / rack_wma_divisor; 4576 } 4577 if ((rack->measure_saw_probe_rtt == 0) || 4578 (bytes_ps > rack->r_ctl.gp_bw)) { 4579 /* 4580 * For probe-rtt we only add it in 4581 * if its larger, all others we just 4582 * add in. 4583 */ 4584 did_add = 1; 4585 resid_bw = rack->r_ctl.gp_bw - subpart; 4586 rack->r_ctl.gp_bw = resid_bw + addpart; 4587 } 4588 } 4589 } 4590 if ((rack->gp_ready == 0) && 4591 (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) { 4592 /* We have enough measurements now */ 4593 rack->gp_ready = 1; 4594 rack_set_cc_pacing(rack); 4595 if (rack->defer_options) 4596 rack_apply_deferred_options(rack); 4597 } 4598 rack_log_pacing_delay_calc(rack, subpart, addpart, bytes_ps, stim, 4599 rack_get_bw(rack), 22, did_add, NULL, quality); 4600 /* We do not update any multipliers if we are in or have seen a probe-rtt */ 4601 if ((rack->measure_saw_probe_rtt == 0) && rack->rc_gp_rtt_set) 4602 rack_update_multiplier(rack, timely_says, bytes_ps, 4603 rack->r_ctl.rc_gp_srtt, 4604 rack->r_ctl.rc_rtt_diff); 4605 rack_log_pacing_delay_calc(rack, bytes, tim, bytes_ps, stim, 4606 rack_get_bw(rack), 3, line, NULL, quality); 4607 /* reset the gp srtt and setup the new prev */ 4608 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; 4609 /* Record the lost count for the next measurement */ 4610 rack->r_ctl.rc_loss_at_start = rack->r_ctl.rc_loss_count; 4611 /* 4612 * We restart our diffs based on the gpsrtt in the 4613 * measurement window. 4614 */ 4615 rack->rc_gp_rtt_set = 0; 4616 rack->rc_gp_saw_rec = 0; 4617 rack->rc_gp_saw_ca = 0; 4618 rack->rc_gp_saw_ss = 0; 4619 rack->rc_dragged_bottom = 0; 4620 skip_measurement: 4621 4622 #ifdef STATS 4623 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, 4624 gput); 4625 /* 4626 * XXXLAS: This is a temporary hack, and should be 4627 * chained off VOI_TCP_GPUT when stats(9) grows an 4628 * API to deal with chained VOIs. 
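	 * (Illustrative: a previous gput of 8000 and a new gput of 10000
	 * makes the VOI_TCP_GPUT_ND sample below
	 * (10000 - 8000) * 100 / 8000 = 25, i.e. a 25% rise.)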
4629 */ 4630 if (tp->t_stats_gput_prev > 0) 4631 stats_voi_update_abs_s32(tp->t_stats, 4632 VOI_TCP_GPUT_ND, 4633 ((gput - tp->t_stats_gput_prev) * 100) / 4634 tp->t_stats_gput_prev); 4635 #endif 4636 tp->t_flags &= ~TF_GPUTINPROG; 4637 tp->t_stats_gput_prev = gput; 4638 /* 4639 * Now are we app limited now and there is space from where we 4640 * were to where we want to go? 4641 * 4642 * We don't do the other case i.e. non-applimited here since 4643 * the next send will trigger us picking up the missing data. 4644 */ 4645 if (rack->r_ctl.rc_first_appl && 4646 TCPS_HAVEESTABLISHED(tp->t_state) && 4647 rack->r_ctl.rc_app_limited_cnt && 4648 (SEQ_GT(rack->r_ctl.rc_first_appl->r_start, th_ack)) && 4649 ((rack->r_ctl.rc_first_appl->r_end - th_ack) > 4650 max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { 4651 /* 4652 * Yep there is enough outstanding to make a measurement here. 4653 */ 4654 struct rack_sendmap *rsm, fe; 4655 4656 rack->r_ctl.rc_gp_lowrtt = 0xffffffff; 4657 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 4658 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 4659 rack->app_limited_needs_set = 0; 4660 tp->gput_seq = th_ack; 4661 if (rack->in_probe_rtt) 4662 rack->measure_saw_probe_rtt = 1; 4663 else if ((rack->measure_saw_probe_rtt) && 4664 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 4665 rack->measure_saw_probe_rtt = 0; 4666 if ((rack->r_ctl.rc_first_appl->r_end - th_ack) >= rack_get_measure_window(tp, rack)) { 4667 /* There is a full window to gain info from */ 4668 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 4669 } else { 4670 /* We can only measure up to the applimited point */ 4671 tp->gput_ack = tp->gput_seq + (rack->r_ctl.rc_first_appl->r_end - th_ack); 4672 if ((tp->gput_ack - tp->gput_seq) < (MIN_GP_WIN * segsiz)) { 4673 /* 4674 * We don't have enough to make a measurement. 4675 */ 4676 tp->t_flags &= ~TF_GPUTINPROG; 4677 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq, 4678 0, 0, 0, 6, __LINE__, NULL, quality); 4679 return; 4680 } 4681 } 4682 if (tp->t_state >= TCPS_FIN_WAIT_1) { 4683 /* 4684 * We will get no more data into the SB 4685 * this means we need to have the data available 4686 * before we start a measurement. 4687 */ 4688 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) < (tp->gput_ack - tp->gput_seq)) { 4689 /* Nope not enough data. */ 4690 return; 4691 } 4692 } 4693 tp->t_flags |= TF_GPUTINPROG; 4694 /* 4695 * Now we need to find the timestamp of the send at tp->gput_seq 4696 * for the send based measurement. 4697 */ 4698 fe.r_start = tp->gput_seq; 4699 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 4700 if (rsm) { 4701 /* Ok send-based limit is set */ 4702 if (SEQ_LT(rsm->r_start, tp->gput_seq)) { 4703 /* 4704 * Move back to include the earlier part 4705 * so our ack time lines up right (this may 4706 * make an overlapping measurement but thats 4707 * ok). 4708 */ 4709 tp->gput_seq = rsm->r_start; 4710 } 4711 if (rsm->r_flags & RACK_ACKED) 4712 tp->gput_ts = (uint32_t)rsm->r_ack_arrival; 4713 else 4714 rack->app_limited_needs_set = 1; 4715 rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 4716 } else { 4717 /* 4718 * If we don't find the rsm due to some 4719 * send-limit set the current time, which 4720 * basically disables the send-limit. 
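 *
 * For reference, the measurement window chosen above follows roughly
 * this rule: if the app-limited point leaves at least a full measure
 * window we take the full window, if it leaves less than
 * MIN_GP_WIN * segsiz we skip measuring, otherwise we clamp to the
 * app-limited point. An illustrative sketch with a hypothetical
 * helper name (not the stack's code):
 *
 *     static uint32_t
 *     gput_target(uint32_t gput_seq, uint32_t appl_end,
 *         uint32_t full_win, uint32_t min_win)
 *     {
 *             uint32_t avail = appl_end - gput_seq;
 *
 *             if (avail >= full_win)
 *                     return (gput_seq + full_win);
 *             if (avail < min_win)
 *                     return (gput_seq);
 *             return (gput_seq + avail);
 *     }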
4721 */ 4722 struct timeval tv; 4723 4724 microuptime(&tv); 4725 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); 4726 } 4727 rack_log_pacing_delay_calc(rack, 4728 tp->gput_seq, 4729 tp->gput_ack, 4730 (uint64_t)rsm, 4731 tp->gput_ts, 4732 rack->r_ctl.rc_app_limited_cnt, 4733 9, 4734 __LINE__, NULL, quality); 4735 } 4736 } 4737 4738 /* 4739 * CC wrapper hook functions 4740 */ 4741 static void 4742 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint16_t nsegs, 4743 uint16_t type, int32_t recovery) 4744 { 4745 uint32_t prior_cwnd, acked; 4746 struct tcp_log_buffer *lgb = NULL; 4747 uint8_t labc_to_use, quality; 4748 4749 INP_WLOCK_ASSERT(tp->t_inpcb); 4750 tp->ccv->nsegs = nsegs; 4751 acked = tp->ccv->bytes_this_ack = (th_ack - tp->snd_una); 4752 if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) { 4753 uint32_t max; 4754 4755 max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp); 4756 if (tp->ccv->bytes_this_ack > max) { 4757 tp->ccv->bytes_this_ack = max; 4758 } 4759 } 4760 #ifdef STATS 4761 stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, 4762 ((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd); 4763 #endif 4764 quality = RACK_QUALITY_NONE; 4765 if ((tp->t_flags & TF_GPUTINPROG) && 4766 rack_enough_for_measurement(tp, rack, th_ack, &quality)) { 4767 /* Measure the Goodput */ 4768 rack_do_goodput_measurement(tp, rack, th_ack, __LINE__, quality); 4769 #ifdef NETFLIX_PEAKRATE 4770 if ((type == CC_ACK) && 4771 (tp->t_maxpeakrate)) { 4772 /* 4773 * We update t_peakrate_thr. This gives us roughly 4774 * one update per round trip time. Note 4775 * it will only be used if pace_always is off i.e 4776 * we don't do this for paced flows. 4777 */ 4778 rack_update_peakrate_thr(tp); 4779 } 4780 #endif 4781 } 4782 /* Which way our we limited, if not cwnd limited no advance in CA */ 4783 if (tp->snd_cwnd <= tp->snd_wnd) 4784 tp->ccv->flags |= CCF_CWND_LIMITED; 4785 else 4786 tp->ccv->flags &= ~CCF_CWND_LIMITED; 4787 if (tp->snd_cwnd > tp->snd_ssthresh) { 4788 tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, 4789 nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp)); 4790 /* For the setting of a window past use the actual scwnd we are using */ 4791 if (tp->t_bytes_acked >= rack->r_ctl.cwnd_to_use) { 4792 tp->t_bytes_acked -= rack->r_ctl.cwnd_to_use; 4793 tp->ccv->flags |= CCF_ABC_SENTAWND; 4794 } 4795 } else { 4796 tp->ccv->flags &= ~CCF_ABC_SENTAWND; 4797 tp->t_bytes_acked = 0; 4798 } 4799 prior_cwnd = tp->snd_cwnd; 4800 if ((recovery == 0) || (rack_max_abc_post_recovery == 0) || rack->r_use_labc_for_rec || 4801 (rack_client_low_buf && (rack->client_bufferlvl < rack_client_low_buf))) 4802 labc_to_use = rack->rc_labc; 4803 else 4804 labc_to_use = rack_max_abc_post_recovery; 4805 if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { 4806 union tcp_log_stackspecific log; 4807 struct timeval tv; 4808 4809 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 4810 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 4811 log.u_bbr.flex1 = th_ack; 4812 log.u_bbr.flex2 = tp->ccv->flags; 4813 log.u_bbr.flex3 = tp->ccv->bytes_this_ack; 4814 log.u_bbr.flex4 = tp->ccv->nsegs; 4815 log.u_bbr.flex5 = labc_to_use; 4816 log.u_bbr.flex6 = prior_cwnd; 4817 log.u_bbr.flex7 = V_tcp_do_newsack; 4818 log.u_bbr.flex8 = 1; 4819 lgb = tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 4820 0, &log, false, NULL, NULL, 0, &tv); 4821 } 4822 if (CC_ALGO(tp)->ack_received != NULL) { 4823 /* XXXLAS: Find a way to live without this */ 4824 tp->ccv->curack = th_ack; 4825 
tp->ccv->labc = labc_to_use; 4826 tp->ccv->flags |= CCF_USE_LOCAL_ABC; 4827 CC_ALGO(tp)->ack_received(tp->ccv, type); 4828 } 4829 if (lgb) { 4830 lgb->tlb_stackinfo.u_bbr.flex6 = tp->snd_cwnd; 4831 } 4832 if (rack->r_must_retran) { 4833 if (SEQ_GEQ(th_ack, rack->r_ctl.rc_snd_max_at_rto)) { 4834 /* 4835 * We are now beyond the rxt point so let's disable 4836 * the flag. 4837 */ 4838 rack->r_ctl.rc_out_at_rto = 0; 4839 rack->r_must_retran = 0; 4840 } else if ((prior_cwnd + ctf_fixed_maxseg(tp)) <= tp->snd_cwnd) { 4841 /* 4842 * Only decrement rc_out_at_rto if the cwnd advances 4843 * by at least a whole segment; otherwise the next time 4844 * the peer acks we won't be able to send. This generally 4845 * happens when we are in Congestion Avoidance. 4846 */ 4847 if (acked <= rack->r_ctl.rc_out_at_rto){ 4848 rack->r_ctl.rc_out_at_rto -= acked; 4849 } else { 4850 rack->r_ctl.rc_out_at_rto = 0; 4851 } 4852 } 4853 } 4854 #ifdef STATS 4855 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, rack->r_ctl.cwnd_to_use); 4856 #endif 4857 if (rack->r_ctl.rc_rack_largest_cwnd < rack->r_ctl.cwnd_to_use) { 4858 rack->r_ctl.rc_rack_largest_cwnd = rack->r_ctl.cwnd_to_use; 4859 } 4860 #ifdef NETFLIX_PEAKRATE 4861 /* we enforce max peak rate if it is set and we are not pacing */ 4862 if ((rack->rc_always_pace == 0) && 4863 tp->t_peakrate_thr && 4864 (tp->snd_cwnd > tp->t_peakrate_thr)) { 4865 tp->snd_cwnd = tp->t_peakrate_thr; 4866 } 4867 #endif 4868 } 4869 4870 static void 4871 tcp_rack_partialack(struct tcpcb *tp) 4872 { 4873 struct tcp_rack *rack; 4874 4875 rack = (struct tcp_rack *)tp->t_fb_ptr; 4876 INP_WLOCK_ASSERT(tp->t_inpcb); 4877 /* 4878 * If we are doing PRR and have enough 4879 * room to send, <or> we are pacing and prr 4880 * is disabled, we will want to see if we 4881 * can send data (by setting r_wanted_output to 4882 * true). 4883 */ 4884 if ((rack->r_ctl.rc_prr_sndcnt > 0) || 4885 rack->rack_no_prr) 4886 rack->r_wanted_output = 1; 4887 } 4888 4889 static void 4890 rack_post_recovery(struct tcpcb *tp, uint32_t th_ack) 4891 { 4892 struct tcp_rack *rack; 4893 uint32_t orig_cwnd; 4894 4895 orig_cwnd = tp->snd_cwnd; 4896 INP_WLOCK_ASSERT(tp->t_inpcb); 4897 rack = (struct tcp_rack *)tp->t_fb_ptr; 4898 /* only alert CC if we alerted when we entered */ 4899 if (CC_ALGO(tp)->post_recovery != NULL) { 4900 tp->ccv->curack = th_ack; 4901 CC_ALGO(tp)->post_recovery(tp->ccv); 4902 if (tp->snd_cwnd < tp->snd_ssthresh) { 4903 /* 4904 * Rack has burst control and pacing 4905 * so let's not set this any lower than 4906 * snd_ssthresh per RFC-6582 (option 2).
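 *
 * Worked example (illustrative numbers only): if snd_ssthresh is
 * 100 segments and the CC module's post_recovery() would leave
 * snd_cwnd at 80 segments, snd_cwnd is raised back to 100 here.
 * Further below, if PRR still held un-sent credit and we are not
 * app-limited, up to min(rack_prr_addbackmax * maxseg,
 * rc_prr_sndcnt) of that credit is also added back onto snd_cwnd.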
4907 */ 4908 tp->snd_cwnd = tp->snd_ssthresh; 4909 } 4910 } 4911 if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { 4912 union tcp_log_stackspecific log; 4913 struct timeval tv; 4914 4915 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 4916 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 4917 log.u_bbr.flex1 = th_ack; 4918 log.u_bbr.flex2 = tp->ccv->flags; 4919 log.u_bbr.flex3 = tp->ccv->bytes_this_ack; 4920 log.u_bbr.flex4 = tp->ccv->nsegs; 4921 log.u_bbr.flex5 = V_tcp_abc_l_var; 4922 log.u_bbr.flex6 = orig_cwnd; 4923 log.u_bbr.flex7 = V_tcp_do_newsack; 4924 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 4925 log.u_bbr.flex8 = 2; 4926 tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 4927 0, &log, false, NULL, NULL, 0, &tv); 4928 } 4929 if ((rack->rack_no_prr == 0) && 4930 (rack->no_prr_addback == 0) && 4931 (rack->r_ctl.rc_prr_sndcnt > 0)) { 4932 /* 4933 * Suck the next prr cnt back into cwnd, but 4934 * only do that if we are not application limited. 4935 */ 4936 if (ctf_outstanding(tp) <= sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { 4937 /* 4938 * We are allowed to add back to the cwnd the amount we did 4939 * not get out if: 4940 * a) no_prr_addback is off. 4941 * b) we are not app limited 4942 * c) we are doing prr 4943 * <and> 4944 * d) it is bounded by rack_prr_addbackmax (if addback is 0, then none). 4945 */ 4946 tp->snd_cwnd += min((ctf_fixed_maxseg(tp) * rack_prr_addbackmax), 4947 rack->r_ctl.rc_prr_sndcnt); 4948 } 4949 rack->r_ctl.rc_prr_sndcnt = 0; 4950 rack_log_to_prr(rack, 1, 0); 4951 } 4952 rack_log_to_prr(rack, 14, orig_cwnd); 4953 tp->snd_recover = tp->snd_una; 4954 if (rack->r_ctl.dsack_persist) { 4955 rack->r_ctl.dsack_persist--; 4956 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) { 4957 rack->r_ctl.num_dsack = 0; 4958 } 4959 rack_log_dsack_event(rack, 1, __LINE__, 0, 0); 4960 } 4961 EXIT_RECOVERY(tp->t_flags); 4962 } 4963 4964 static void 4965 rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack) 4966 { 4967 struct tcp_rack *rack; 4968 uint32_t ssthresh_enter, cwnd_enter, in_rec_at_entry, orig_cwnd; 4969 4970 INP_WLOCK_ASSERT(tp->t_inpcb); 4971 #ifdef STATS 4972 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type); 4973 #endif 4974 if (IN_RECOVERY(tp->t_flags) == 0) { 4975 in_rec_at_entry = 0; 4976 ssthresh_enter = tp->snd_ssthresh; 4977 cwnd_enter = tp->snd_cwnd; 4978 } else 4979 in_rec_at_entry = 1; 4980 rack = (struct tcp_rack *)tp->t_fb_ptr; 4981 switch (type) { 4982 case CC_NDUPACK: 4983 tp->t_flags &= ~TF_WASFRECOVERY; 4984 tp->t_flags &= ~TF_WASCRECOVERY; 4985 if (!IN_FASTRECOVERY(tp->t_flags)) { 4986 rack->r_ctl.rc_prr_delivered = 0; 4987 rack->r_ctl.rc_prr_out = 0; 4988 if (rack->rack_no_prr == 0) { 4989 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 4990 rack_log_to_prr(rack, 2, in_rec_at_entry); 4991 } 4992 rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una; 4993 tp->snd_recover = tp->snd_max; 4994 if (tp->t_flags2 & TF2_ECN_PERMIT) 4995 tp->t_flags2 |= TF2_ECN_SND_CWR; 4996 } 4997 break; 4998 case CC_ECN: 4999 if (!IN_CONGRECOVERY(tp->t_flags) || 5000 /* 5001 * Allow ECN reaction on ACK to CWR, if 5002 * that data segment was also CE marked. 
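 *
 * Note that SEQ_GEQ() compares in modulo-2^32 sequence space,
 * essentially ((int)((a) - (b)) >= 0), so the test below stays
 * correct across sequence wrap. Illustrative example: with
 * ack = 0x00000010 and snd_recover = 0xfffffff0 the signed
 * difference is 0x20, so the ack is treated as being at or
 * beyond snd_recover even though it is numerically smaller.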
5003 */ 5004 SEQ_GEQ(ack, tp->snd_recover)) { 5005 EXIT_CONGRECOVERY(tp->t_flags); 5006 KMOD_TCPSTAT_INC(tcps_ecn_rcwnd); 5007 tp->snd_recover = tp->snd_max + 1; 5008 if (tp->t_flags2 & TF2_ECN_PERMIT) 5009 tp->t_flags2 |= TF2_ECN_SND_CWR; 5010 } 5011 break; 5012 case CC_RTO: 5013 tp->t_dupacks = 0; 5014 tp->t_bytes_acked = 0; 5015 EXIT_RECOVERY(tp->t_flags); 5016 tp->snd_ssthresh = max(2, min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 / 5017 ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp); 5018 orig_cwnd = tp->snd_cwnd; 5019 tp->snd_cwnd = ctf_fixed_maxseg(tp); 5020 rack_log_to_prr(rack, 16, orig_cwnd); 5021 if (tp->t_flags2 & TF2_ECN_PERMIT) 5022 tp->t_flags2 |= TF2_ECN_SND_CWR; 5023 break; 5024 case CC_RTO_ERR: 5025 KMOD_TCPSTAT_INC(tcps_sndrexmitbad); 5026 /* RTO was unnecessary, so reset everything. */ 5027 tp->snd_cwnd = tp->snd_cwnd_prev; 5028 tp->snd_ssthresh = tp->snd_ssthresh_prev; 5029 tp->snd_recover = tp->snd_recover_prev; 5030 if (tp->t_flags & TF_WASFRECOVERY) { 5031 ENTER_FASTRECOVERY(tp->t_flags); 5032 tp->t_flags &= ~TF_WASFRECOVERY; 5033 } 5034 if (tp->t_flags & TF_WASCRECOVERY) { 5035 ENTER_CONGRECOVERY(tp->t_flags); 5036 tp->t_flags &= ~TF_WASCRECOVERY; 5037 } 5038 tp->snd_nxt = tp->snd_max; 5039 tp->t_badrxtwin = 0; 5040 break; 5041 } 5042 if ((CC_ALGO(tp)->cong_signal != NULL) && 5043 (type != CC_RTO)){ 5044 tp->ccv->curack = ack; 5045 CC_ALGO(tp)->cong_signal(tp->ccv, type); 5046 } 5047 if ((in_rec_at_entry == 0) && IN_RECOVERY(tp->t_flags)) { 5048 rack_log_to_prr(rack, 15, cwnd_enter); 5049 rack->r_ctl.dsack_byte_cnt = 0; 5050 rack->r_ctl.retran_during_recovery = 0; 5051 rack->r_ctl.rc_cwnd_at_erec = cwnd_enter; 5052 rack->r_ctl.rc_ssthresh_at_erec = ssthresh_enter; 5053 rack->r_ent_rec_ns = 1; 5054 } 5055 } 5056 5057 static inline void 5058 rack_cc_after_idle(struct tcp_rack *rack, struct tcpcb *tp) 5059 { 5060 uint32_t i_cwnd; 5061 5062 INP_WLOCK_ASSERT(tp->t_inpcb); 5063 5064 #ifdef NETFLIX_STATS 5065 KMOD_TCPSTAT_INC(tcps_idle_restarts); 5066 if (tp->t_state == TCPS_ESTABLISHED) 5067 KMOD_TCPSTAT_INC(tcps_idle_estrestarts); 5068 #endif 5069 if (CC_ALGO(tp)->after_idle != NULL) 5070 CC_ALGO(tp)->after_idle(tp->ccv); 5071 5072 if (tp->snd_cwnd == 1) 5073 i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ 5074 else 5075 i_cwnd = rc_init_window(rack); 5076 5077 /* 5078 * Being idle is no differnt than the initial window. If the cc 5079 * clamps it down below the initial window raise it to the initial 5080 * window. 5081 */ 5082 if (tp->snd_cwnd < i_cwnd) { 5083 tp->snd_cwnd = i_cwnd; 5084 } 5085 } 5086 5087 /* 5088 * Indicate whether this ack should be delayed. We can delay the ack if 5089 * following conditions are met: 5090 * - There is no delayed ack timer in progress. 5091 * - Our last ack wasn't a 0-sized window. We never want to delay 5092 * the ack that opens up a 0-sized window. 5093 * - LRO wasn't used for this segment. We make sure by checking that the 5094 * segment size is not larger than the MSS. 5095 * - Delayed acks are enabled or this is a half-synchronized T/TCP 5096 * connection. 5097 */ 5098 #define DELAY_ACK(tp, tlen) \ 5099 (((tp->t_flags & TF_RXWIN0SENT) == 0) && \ 5100 ((tp->t_flags & TF_DELACK) == 0) && \ 5101 (tlen <= tp->t_maxseg) && \ 5102 (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN))) 5103 5104 static struct rack_sendmap * 5105 rack_find_lowest_rsm(struct tcp_rack *rack) 5106 { 5107 struct rack_sendmap *rsm; 5108 5109 /* 5110 * Walk the time-order transmitted list looking for an rsm that is 5111 * not acked. 
This will be the one that was sent the longest time 5112 * ago that is still outstanding. 5113 */ 5114 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 5115 if (rsm->r_flags & RACK_ACKED) { 5116 continue; 5117 } 5118 goto finish; 5119 } 5120 finish: 5121 return (rsm); 5122 } 5123 5124 static struct rack_sendmap * 5125 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm) 5126 { 5127 struct rack_sendmap *prsm; 5128 5129 /* 5130 * Walk the sequence-ordered list backward until we arrive at 5131 * the highest seq not acked. In theory when this is called it 5132 * should be the last segment (which it was not). 5133 */ 5134 counter_u64_add(rack_find_high, 1); 5135 prsm = rsm; 5136 RB_FOREACH_REVERSE_FROM(prsm, rack_rb_tree_head, rsm) { 5137 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) { 5138 continue; 5139 } 5140 return (prsm); 5141 } 5142 return (NULL); 5143 } 5144 5145 static uint32_t 5146 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts) 5147 { 5148 int32_t lro; 5149 uint32_t thresh; 5150 5151 /* 5152 * lro is the flag we use to determine if we have seen reordering. 5153 * If it gets set we have seen reordering. The reorder logic either 5154 * works in one of two ways: 5155 * 5156 * If reorder-fade is configured, then we track the last time we saw 5157 * re-ordering occur. If we reach the point where enough time has 5158 * passed we no longer consider reordering to be occurring. 5159 * 5160 * Or if reorder-fade is 0, then once we see reordering we consider 5161 * the connection to always be subject to reordering and just set lro 5162 * to 1. 5163 * 5164 * In the end if lro is non-zero we add the extra time for 5165 * reordering in. 5166 */ 5167 if (srtt == 0) 5168 srtt = 1; 5169 if (rack->r_ctl.rc_reorder_ts) { 5170 if (rack->r_ctl.rc_reorder_fade) { 5171 if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) { 5172 lro = cts - rack->r_ctl.rc_reorder_ts; 5173 if (lro == 0) { 5174 /* 5175 * No time has passed since the last 5176 * reorder, mark it as reordering. 5177 */ 5178 lro = 1; 5179 } 5180 } else { 5181 /* Negative time? */ 5182 lro = 0; 5183 } 5184 if (lro > rack->r_ctl.rc_reorder_fade) { 5185 /* Turn off reordering seen too */ 5186 rack->r_ctl.rc_reorder_ts = 0; 5187 lro = 0; 5188 } 5189 } else { 5190 /* Reordering does not fade */ 5191 lro = 1; 5192 } 5193 } else { 5194 lro = 0; 5195 } 5196 if (rack->rc_rack_tmr_std_based == 0) { 5197 thresh = srtt + rack->r_ctl.rc_pkt_delay; 5198 } else { 5199 /* Standards based pkt-delay is 1/4 srtt */ 5200 thresh = srtt + (srtt >> 2); 5201 } 5202 if (lro && (rack->rc_rack_tmr_std_based == 0)) { 5203 /* It must be set, if not you get 1/4 rtt */ 5204 if (rack->r_ctl.rc_reorder_shift) 5205 thresh += (srtt >> rack->r_ctl.rc_reorder_shift); 5206 else 5207 thresh += (srtt >> 2); 5208 } 5209 if (rack->rc_rack_use_dsack && 5210 lro && 5211 (rack->r_ctl.num_dsack > 0)) { 5212 /* 5213 * We only increase the reordering window if we 5214 * have seen reordering <and> we have a DSACK count.
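 *
 * Worked example (illustrative numbers only): with srtt = 40000us,
 * rc_pkt_delay = 1000us and rc_reorder_shift = 2, the base threshold
 * is 41000us and the reordering bump adds srtt >> 2 = 10000us. Two
 * recorded DSACKs would add another 2 x 10000us for 71000us, which
 * is still under the 2 x srtt = 80000us ceiling enforced below.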
5215 */ 5216 thresh += rack->r_ctl.num_dsack * (srtt >> 2); 5217 rack_log_dsack_event(rack, 4, __LINE__, srtt, thresh); 5218 } 5219 /* SRTT * 2 is the ceiling */ 5220 if (thresh > (srtt * 2)) { 5221 thresh = srtt * 2; 5222 } 5223 /* And we don't want it above the RTO max either */ 5224 if (thresh > rack_rto_max) { 5225 thresh = rack_rto_max; 5226 } 5227 rack_log_dsack_event(rack, 6, __LINE__, srtt, thresh); 5228 return (thresh); 5229 } 5230 5231 static uint32_t 5232 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack, 5233 struct rack_sendmap *rsm, uint32_t srtt) 5234 { 5235 struct rack_sendmap *prsm; 5236 uint32_t thresh, len; 5237 int segsiz; 5238 5239 if (srtt == 0) 5240 srtt = 1; 5241 if (rack->r_ctl.rc_tlp_threshold) 5242 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold); 5243 else 5244 thresh = (srtt * 2); 5245 5246 /* Get the previous sent packet, if any */ 5247 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 5248 counter_u64_add(rack_enter_tlp_calc, 1); 5249 len = rsm->r_end - rsm->r_start; 5250 if (rack->rack_tlp_threshold_use == TLP_USE_ID) { 5251 /* Exactly like the ID */ 5252 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= segsiz) { 5253 uint32_t alt_thresh; 5254 /* 5255 * Compensate for delayed-ack with the d-ack time. 5256 */ 5257 counter_u64_add(rack_used_tlpmethod, 1); 5258 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 5259 if (alt_thresh > thresh) 5260 thresh = alt_thresh; 5261 } 5262 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) { 5263 /* 2.1 behavior */ 5264 prsm = TAILQ_PREV(rsm, rack_head, r_tnext); 5265 if (prsm && (len <= segsiz)) { 5266 /* 5267 * Two packets outstanding, thresh should be (2*srtt) + 5268 * possible inter-packet delay (if any). 5269 */ 5270 uint32_t inter_gap = 0; 5271 int idx, nidx; 5272 5273 counter_u64_add(rack_used_tlpmethod, 1); 5274 idx = rsm->r_rtr_cnt - 1; 5275 nidx = prsm->r_rtr_cnt - 1; 5276 if (rsm->r_tim_lastsent[nidx] >= prsm->r_tim_lastsent[idx]) { 5277 /* Yes it was sent later (or at the same time) */ 5278 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; 5279 } 5280 thresh += inter_gap; 5281 } else if (len <= segsiz) { 5282 /* 5283 * Possibly compensate for delayed-ack. 5284 */ 5285 uint32_t alt_thresh; 5286 5287 counter_u64_add(rack_used_tlpmethod2, 1); 5288 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 5289 if (alt_thresh > thresh) 5290 thresh = alt_thresh; 5291 } 5292 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) { 5293 /* 2.2 behavior */ 5294 if (len <= segsiz) { 5295 uint32_t alt_thresh; 5296 /* 5297 * Compensate for delayed-ack with the d-ack time. 5298 */ 5299 counter_u64_add(rack_used_tlpmethod, 1); 5300 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 5301 if (alt_thresh > thresh) 5302 thresh = alt_thresh; 5303 } 5304 } 5305 /* Not above an RTO */ 5306 if (thresh > tp->t_rxtcur) { 5307 thresh = tp->t_rxtcur; 5308 } 5309 /* Not above a RTO max */ 5310 if (thresh > rack_rto_max) { 5311 thresh = rack_rto_max; 5312 } 5313 /* Apply user supplied min TLP */ 5314 if (thresh < rack_tlp_min) { 5315 thresh = rack_tlp_min; 5316 } 5317 return (thresh); 5318 } 5319 5320 static uint32_t 5321 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack) 5322 { 5323 /* 5324 * We want the rack_rtt which is the 5325 * last rtt we measured. 
However if that 5326 * does not exist we fallback to the srtt (which 5327 * we probably will never do) and then as a last 5328 * resort we use RACK_INITIAL_RTO if no srtt is 5329 * yet set. 5330 */ 5331 if (rack->rc_rack_rtt) 5332 return (rack->rc_rack_rtt); 5333 else if (tp->t_srtt == 0) 5334 return (RACK_INITIAL_RTO); 5335 return (tp->t_srtt); 5336 } 5337 5338 static struct rack_sendmap * 5339 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) 5340 { 5341 /* 5342 * Check to see that we don't need to fall into recovery. We will 5343 * need to do so if our oldest transmit is past the time we should 5344 * have had an ack. 5345 */ 5346 struct tcp_rack *rack; 5347 struct rack_sendmap *rsm; 5348 int32_t idx; 5349 uint32_t srtt, thresh; 5350 5351 rack = (struct tcp_rack *)tp->t_fb_ptr; 5352 if (RB_EMPTY(&rack->r_ctl.rc_mtree)) { 5353 return (NULL); 5354 } 5355 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 5356 if (rsm == NULL) 5357 return (NULL); 5358 5359 if (rsm->r_flags & RACK_ACKED) { 5360 rsm = rack_find_lowest_rsm(rack); 5361 if (rsm == NULL) 5362 return (NULL); 5363 } 5364 idx = rsm->r_rtr_cnt - 1; 5365 srtt = rack_grab_rtt(tp, rack); 5366 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 5367 if (TSTMP_LT(tsused, ((uint32_t)rsm->r_tim_lastsent[idx]))) { 5368 return (NULL); 5369 } 5370 if ((tsused - ((uint32_t)rsm->r_tim_lastsent[idx])) < thresh) { 5371 return (NULL); 5372 } 5373 /* Ok if we reach here we are over-due and this guy can be sent */ 5374 if (IN_RECOVERY(tp->t_flags) == 0) { 5375 /* 5376 * For the one that enters us into recovery record undo 5377 * info. 5378 */ 5379 rack->r_ctl.rc_rsm_start = rsm->r_start; 5380 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 5381 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 5382 } 5383 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una); 5384 return (rsm); 5385 } 5386 5387 static uint32_t 5388 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack) 5389 { 5390 int32_t t; 5391 int32_t tt; 5392 uint32_t ret_val; 5393 5394 t = (tp->t_srtt + (tp->t_rttvar << 2)); 5395 RACK_TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], 5396 rack_persist_min, rack_persist_max, rack->r_ctl.timer_slop); 5397 rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT; 5398 ret_val = (uint32_t)tt; 5399 return (ret_val); 5400 } 5401 5402 static uint32_t 5403 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack) 5404 { 5405 /* 5406 * Start the FR timer, we do this based on getting the first one in 5407 * the rc_tmap. Note that if its NULL we must stop the timer. in all 5408 * events we need to stop the running timer (if its running) before 5409 * starting the new one. 5410 */ 5411 uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse; 5412 uint32_t srtt_cur; 5413 int32_t idx; 5414 int32_t is_tlp_timer = 0; 5415 struct rack_sendmap *rsm; 5416 5417 if (rack->t_timers_stopped) { 5418 /* All timers have been stopped none are to run */ 5419 return (0); 5420 } 5421 if (rack->rc_in_persist) { 5422 /* We can't start any timer in persists */ 5423 return (rack_get_persists_timer_val(tp, rack)); 5424 } 5425 rack->rc_on_min_to = 0; 5426 if ((tp->t_state < TCPS_ESTABLISHED) || 5427 ((tp->t_flags & TF_SACK_PERMIT) == 0)) { 5428 goto activate_rxt; 5429 } 5430 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 5431 if ((rsm == NULL) || sup_rack) { 5432 /* Nothing on the send map or no rack */ 5433 activate_rxt: 5434 time_since_sent = 0; 5435 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 5436 if (rsm) { 5437 /* 5438 * Should we discount the RTX timer any? 
5439 * 5440 * We want to discount it the smallest amount. 5441 * If a timer (Rack/TLP or RXT) has gone off more 5442 * recently thats the discount we want to use (now - timer time). 5443 * If the retransmit of the oldest packet was more recent then 5444 * we want to use that (now - oldest-packet-last_transmit_time). 5445 * 5446 */ 5447 idx = rsm->r_rtr_cnt - 1; 5448 if (TSTMP_GEQ(rack->r_ctl.rc_tlp_rxt_last_time, ((uint32_t)rsm->r_tim_lastsent[idx]))) 5449 tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time; 5450 else 5451 tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx]; 5452 if (TSTMP_GT(cts, tstmp_touse)) 5453 time_since_sent = cts - tstmp_touse; 5454 } 5455 if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { 5456 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; 5457 to = tp->t_rxtcur; 5458 if (to > time_since_sent) 5459 to -= time_since_sent; 5460 else 5461 to = rack->r_ctl.rc_min_to; 5462 if (to == 0) 5463 to = 1; 5464 /* Special case for KEEPINIT */ 5465 if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) && 5466 (TP_KEEPINIT(tp) != 0) && 5467 rsm) { 5468 /* 5469 * We have to put a ceiling on the rxt timer 5470 * of the keep-init timeout. 5471 */ 5472 uint32_t max_time, red; 5473 5474 max_time = TICKS_2_USEC(TP_KEEPINIT(tp)); 5475 if (TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) { 5476 red = (cts - (uint32_t)rsm->r_tim_lastsent[0]); 5477 if (red < max_time) 5478 max_time -= red; 5479 else 5480 max_time = 1; 5481 } 5482 /* Reduce timeout to the keep value if needed */ 5483 if (max_time < to) 5484 to = max_time; 5485 } 5486 return (to); 5487 } 5488 return (0); 5489 } 5490 if (rsm->r_flags & RACK_ACKED) { 5491 rsm = rack_find_lowest_rsm(rack); 5492 if (rsm == NULL) { 5493 /* No lowest? */ 5494 goto activate_rxt; 5495 } 5496 } 5497 if (rack->sack_attack_disable) { 5498 /* 5499 * We don't want to do 5500 * any TLP's if you are an attacker. 5501 * Though if you are doing what 5502 * is expected you may still have 5503 * SACK-PASSED marks. 5504 */ 5505 goto activate_rxt; 5506 } 5507 /* Convert from ms to usecs */ 5508 if ((rsm->r_flags & RACK_SACK_PASSED) || (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { 5509 if ((tp->t_flags & TF_SENTFIN) && 5510 ((tp->snd_max - tp->snd_una) == 1) && 5511 (rsm->r_flags & RACK_HAS_FIN)) { 5512 /* 5513 * We don't start a rack timer if all we have is a 5514 * FIN outstanding. 5515 */ 5516 goto activate_rxt; 5517 } 5518 if ((rack->use_rack_rr == 0) && 5519 (IN_FASTRECOVERY(tp->t_flags)) && 5520 (rack->rack_no_prr == 0) && 5521 (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { 5522 /* 5523 * We are not cheating, in recovery and 5524 * not enough ack's to yet get our next 5525 * retransmission out. 5526 * 5527 * Note that classified attackers do not 5528 * get to use the rack-cheat. 
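 *
 * Illustrative example: with maxseg = 1448 and only 700 bytes of
 * PRR send credit (rc_prr_sndcnt) we cannot yet get a full
 * retransmission out, so instead of arming a RACK timer we fall
 * through and arm a TLP timer. When the RACK timer is armed below,
 * it is set to fire thresh minus the time since the last send,
 * with rc_min_to as a floor if that point has already passed.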
5529 */ 5530 goto activate_tlp; 5531 } 5532 srtt = rack_grab_rtt(tp, rack); 5533 thresh = rack_calc_thresh_rack(rack, srtt, cts); 5534 idx = rsm->r_rtr_cnt - 1; 5535 exp = ((uint32_t)rsm->r_tim_lastsent[idx]) + thresh; 5536 if (SEQ_GEQ(exp, cts)) { 5537 to = exp - cts; 5538 if (to < rack->r_ctl.rc_min_to) { 5539 to = rack->r_ctl.rc_min_to; 5540 if (rack->r_rr_config == 3) 5541 rack->rc_on_min_to = 1; 5542 } 5543 } else { 5544 to = rack->r_ctl.rc_min_to; 5545 if (rack->r_rr_config == 3) 5546 rack->rc_on_min_to = 1; 5547 } 5548 } else { 5549 /* Ok we need to do a TLP not RACK */ 5550 activate_tlp: 5551 if ((rack->rc_tlp_in_progress != 0) && 5552 (rack->r_ctl.rc_tlp_cnt_out >= rack_tlp_limit)) { 5553 /* 5554 * The previous send was a TLP and we have sent 5555 * N TLP's without sending new data. 5556 */ 5557 goto activate_rxt; 5558 } 5559 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 5560 if (rsm == NULL) { 5561 /* We found no rsm to TLP with. */ 5562 goto activate_rxt; 5563 } 5564 if (rsm->r_flags & RACK_HAS_FIN) { 5565 /* If its a FIN we dont do TLP */ 5566 rsm = NULL; 5567 goto activate_rxt; 5568 } 5569 idx = rsm->r_rtr_cnt - 1; 5570 time_since_sent = 0; 5571 if (TSTMP_GEQ(((uint32_t)rsm->r_tim_lastsent[idx]), rack->r_ctl.rc_tlp_rxt_last_time)) 5572 tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx]; 5573 else 5574 tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time; 5575 if (TSTMP_GT(cts, tstmp_touse)) 5576 time_since_sent = cts - tstmp_touse; 5577 is_tlp_timer = 1; 5578 if (tp->t_srtt) { 5579 if ((rack->rc_srtt_measure_made == 0) && 5580 (tp->t_srtt == 1)) { 5581 /* 5582 * If another stack as run and set srtt to 1, 5583 * then the srtt was 0, so lets use the initial. 5584 */ 5585 srtt = RACK_INITIAL_RTO; 5586 } else { 5587 srtt_cur = tp->t_srtt; 5588 srtt = srtt_cur; 5589 } 5590 } else 5591 srtt = RACK_INITIAL_RTO; 5592 /* 5593 * If the SRTT is not keeping up and the 5594 * rack RTT has spiked we want to use 5595 * the last RTT not the smoothed one. 5596 */ 5597 if (rack_tlp_use_greater && 5598 tp->t_srtt && 5599 (srtt < rack_grab_rtt(tp, rack))) { 5600 srtt = rack_grab_rtt(tp, rack); 5601 } 5602 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt); 5603 if (thresh > time_since_sent) { 5604 to = thresh - time_since_sent; 5605 } else { 5606 to = rack->r_ctl.rc_min_to; 5607 rack_log_alt_to_to_cancel(rack, 5608 thresh, /* flex1 */ 5609 time_since_sent, /* flex2 */ 5610 tstmp_touse, /* flex3 */ 5611 rack->r_ctl.rc_tlp_rxt_last_time, /* flex4 */ 5612 (uint32_t)rsm->r_tim_lastsent[idx], 5613 srtt, 5614 idx, 99); 5615 } 5616 if (to < rack_tlp_min) { 5617 to = rack_tlp_min; 5618 } 5619 if (to > TICKS_2_USEC(TCPTV_REXMTMAX)) { 5620 /* 5621 * If the TLP time works out to larger than the max 5622 * RTO lets not do TLP.. just RTO. 5623 */ 5624 goto activate_rxt; 5625 } 5626 } 5627 if (is_tlp_timer == 0) { 5628 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; 5629 } else { 5630 rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; 5631 } 5632 if (to == 0) 5633 to = 1; 5634 return (to); 5635 } 5636 5637 static void 5638 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5639 { 5640 if (rack->rc_in_persist == 0) { 5641 if (tp->t_flags & TF_GPUTINPROG) { 5642 /* 5643 * Stop the goodput now, the calling of the 5644 * measurement function clears the flag. 
5645 */ 5646 rack_do_goodput_measurement(tp, rack, tp->snd_una, __LINE__, 5647 RACK_QUALITY_PERSIST); 5648 } 5649 #ifdef NETFLIX_SHARED_CWND 5650 if (rack->r_ctl.rc_scw) { 5651 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 5652 rack->rack_scwnd_is_idle = 1; 5653 } 5654 #endif 5655 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 5656 if (rack->r_ctl.rc_went_idle_time == 0) 5657 rack->r_ctl.rc_went_idle_time = 1; 5658 rack_timer_cancel(tp, rack, cts, __LINE__); 5659 rack->r_ctl.persist_lost_ends = 0; 5660 rack->probe_not_answered = 0; 5661 rack->forced_ack = 0; 5662 tp->t_rxtshift = 0; 5663 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 5664 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 5665 rack->rc_in_persist = 1; 5666 } 5667 } 5668 5669 static void 5670 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5671 { 5672 if (tcp_in_hpts(rack->rc_inp)) { 5673 tcp_hpts_remove(rack->rc_inp); 5674 rack->r_ctl.rc_hpts_flags = 0; 5675 } 5676 #ifdef NETFLIX_SHARED_CWND 5677 if (rack->r_ctl.rc_scw) { 5678 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 5679 rack->rack_scwnd_is_idle = 0; 5680 } 5681 #endif 5682 if (rack->rc_gp_dyn_mul && 5683 (rack->use_fixed_rate == 0) && 5684 (rack->rc_always_pace)) { 5685 /* 5686 * Do we count this as if a probe-rtt just 5687 * finished? 5688 */ 5689 uint32_t time_idle, idle_min; 5690 5691 time_idle = tcp_get_usecs(NULL) - rack->r_ctl.rc_went_idle_time; 5692 idle_min = rack_min_probertt_hold; 5693 if (rack_probertt_gpsrtt_cnt_div) { 5694 uint64_t extra; 5695 extra = (uint64_t)rack->r_ctl.rc_gp_srtt * 5696 (uint64_t)rack_probertt_gpsrtt_cnt_mul; 5697 extra /= (uint64_t)rack_probertt_gpsrtt_cnt_div; 5698 idle_min += (uint32_t)extra; 5699 } 5700 if (time_idle >= idle_min) { 5701 /* Yes, we count it as a probe-rtt. 
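 *
 * Illustrative example (made-up values): with rack_min_probertt_hold
 * at 40000us, rc_gp_srtt = 20000us and a gpsrtt cnt mul/div of 3/4,
 * the connection must have sat idle at least 40000 + 15000 = 55000us
 * before the idle period is credited as a completed probe-rtt here.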
*/ 5702 uint32_t us_cts; 5703 5704 us_cts = tcp_get_usecs(NULL); 5705 if (rack->in_probe_rtt == 0) { 5706 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 5707 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; 5708 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; 5709 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; 5710 } else { 5711 rack_exit_probertt(rack, us_cts); 5712 } 5713 } 5714 } 5715 rack->rc_in_persist = 0; 5716 rack->r_ctl.rc_went_idle_time = 0; 5717 tp->t_rxtshift = 0; 5718 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 5719 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 5720 rack->r_ctl.rc_agg_delayed = 0; 5721 rack->r_early = 0; 5722 rack->r_late = 0; 5723 rack->r_ctl.rc_agg_early = 0; 5724 } 5725 5726 static void 5727 rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts, 5728 struct hpts_diag *diag, struct timeval *tv) 5729 { 5730 if (rack_verbose_logging && rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 5731 union tcp_log_stackspecific log; 5732 5733 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 5734 log.u_bbr.flex1 = diag->p_nxt_slot; 5735 log.u_bbr.flex2 = diag->p_cur_slot; 5736 log.u_bbr.flex3 = diag->slot_req; 5737 log.u_bbr.flex4 = diag->inp_hptsslot; 5738 log.u_bbr.flex5 = diag->slot_remaining; 5739 log.u_bbr.flex6 = diag->need_new_to; 5740 log.u_bbr.flex7 = diag->p_hpts_active; 5741 log.u_bbr.flex8 = diag->p_on_min_sleep; 5742 /* Hijack other fields as needed */ 5743 log.u_bbr.epoch = diag->have_slept; 5744 log.u_bbr.lt_epoch = diag->yet_to_sleep; 5745 log.u_bbr.pkts_out = diag->co_ret; 5746 log.u_bbr.applimited = diag->hpts_sleep_time; 5747 log.u_bbr.delivered = diag->p_prev_slot; 5748 log.u_bbr.inflight = diag->p_runningslot; 5749 log.u_bbr.bw_inuse = diag->wheel_slot; 5750 log.u_bbr.rttProp = diag->wheel_cts; 5751 log.u_bbr.timeStamp = cts; 5752 log.u_bbr.delRate = diag->maxslots; 5753 log.u_bbr.cur_del_rate = diag->p_curtick; 5754 log.u_bbr.cur_del_rate <<= 32; 5755 log.u_bbr.cur_del_rate |= diag->p_lasttick; 5756 TCP_LOG_EVENTP(rack->rc_tp, NULL, 5757 &rack->rc_inp->inp_socket->so_rcv, 5758 &rack->rc_inp->inp_socket->so_snd, 5759 BBR_LOG_HPTSDIAG, 0, 5760 0, &log, false, tv); 5761 } 5762 5763 } 5764 5765 static void 5766 rack_log_wakeup(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb, uint32_t len, int type) 5767 { 5768 if (rack_verbose_logging && rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 5769 union tcp_log_stackspecific log; 5770 struct timeval tv; 5771 5772 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 5773 log.u_bbr.flex1 = sb->sb_flags; 5774 log.u_bbr.flex2 = len; 5775 log.u_bbr.flex3 = sb->sb_state; 5776 log.u_bbr.flex8 = type; 5777 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 5778 TCP_LOG_EVENTP(rack->rc_tp, NULL, 5779 &rack->rc_inp->inp_socket->so_rcv, 5780 &rack->rc_inp->inp_socket->so_snd, 5781 TCP_LOG_SB_WAKE, 0, 5782 len, &log, false, &tv); 5783 } 5784 } 5785 5786 static void 5787 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, 5788 int32_t slot, uint32_t tot_len_this_send, int sup_rack) 5789 { 5790 struct hpts_diag diag; 5791 struct inpcb *inp; 5792 struct timeval tv; 5793 uint32_t delayed_ack = 0; 5794 uint32_t hpts_timeout; 5795 uint32_t entry_slot = slot; 5796 uint8_t stopped; 5797 uint32_t left = 0; 5798 uint32_t us_cts; 5799 5800 inp = tp->t_inpcb; 5801 if ((tp->t_state == TCPS_CLOSED) || 5802 (tp->t_state == TCPS_LISTEN)) { 5803 return; 5804 } 5805 if (tcp_in_hpts(inp)) { 5806 /* Already on the pacer */ 5807 return; 5808 } 5809 stopped 
= rack->rc_tmr_stopped; 5810 if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { 5811 left = rack->r_ctl.rc_timer_exp - cts; 5812 } 5813 rack->r_ctl.rc_timer_exp = 0; 5814 rack->r_ctl.rc_hpts_flags = 0; 5815 us_cts = tcp_get_usecs(&tv); 5816 /* Now early/late accounting */ 5817 rack_log_pacing_delay_calc(rack, entry_slot, slot, 0, 0, 0, 26, __LINE__, NULL, 0); 5818 if (rack->r_early && (rack->rc_ack_can_sendout_data == 0)) { 5819 /* 5820 * We have a early carry over set, 5821 * we can always add more time so we 5822 * can always make this compensation. 5823 * 5824 * Note if ack's are allowed to wake us do not 5825 * penalize the next timer for being awoke 5826 * by an ack aka the rc_agg_early (non-paced mode). 5827 */ 5828 slot += rack->r_ctl.rc_agg_early; 5829 rack->r_early = 0; 5830 rack->r_ctl.rc_agg_early = 0; 5831 } 5832 if (rack->r_late) { 5833 /* 5834 * This is harder, we can 5835 * compensate some but it 5836 * really depends on what 5837 * the current pacing time is. 5838 */ 5839 if (rack->r_ctl.rc_agg_delayed >= slot) { 5840 /* 5841 * We can't compensate for it all. 5842 * And we have to have some time 5843 * on the clock. We always have a min 5844 * 10 slots (10 x 10 i.e. 100 usecs). 5845 */ 5846 if (slot <= HPTS_TICKS_PER_SLOT) { 5847 /* We gain delay */ 5848 rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_SLOT - slot); 5849 slot = HPTS_TICKS_PER_SLOT; 5850 } else { 5851 /* We take off some */ 5852 rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_SLOT); 5853 slot = HPTS_TICKS_PER_SLOT; 5854 } 5855 } else { 5856 slot -= rack->r_ctl.rc_agg_delayed; 5857 rack->r_ctl.rc_agg_delayed = 0; 5858 /* Make sure we have 100 useconds at minimum */ 5859 if (slot < HPTS_TICKS_PER_SLOT) { 5860 rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_SLOT - slot; 5861 slot = HPTS_TICKS_PER_SLOT; 5862 } 5863 if (rack->r_ctl.rc_agg_delayed == 0) 5864 rack->r_late = 0; 5865 } 5866 } 5867 if (slot) { 5868 /* We are pacing too */ 5869 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; 5870 } 5871 hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack); 5872 #ifdef NETFLIX_EXP_DETECTION 5873 if (rack->sack_attack_disable && 5874 (slot < tcp_sad_pacing_interval)) { 5875 /* 5876 * We have a potential attacker on 5877 * the line. We have possibly some 5878 * (or now) pacing time set. We want to 5879 * slow down the processing of sacks by some 5880 * amount (if it is an attacker). Set the default 5881 * slot for attackers in place (unless the orginal 5882 * interval is longer). Its stored in 5883 * micro-seconds, so lets convert to msecs. 5884 */ 5885 slot = tcp_sad_pacing_interval; 5886 } 5887 #endif 5888 if (tp->t_flags & TF_DELACK) { 5889 delayed_ack = TICKS_2_USEC(tcp_delacktime); 5890 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; 5891 } 5892 if (delayed_ack && ((hpts_timeout == 0) || 5893 (delayed_ack < hpts_timeout))) 5894 hpts_timeout = delayed_ack; 5895 else 5896 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 5897 /* 5898 * If no timers are going to run and we will fall off the hptsi 5899 * wheel, we resort to a keep-alive timer if its configured. 5900 */ 5901 if ((hpts_timeout == 0) && 5902 (slot == 0)) { 5903 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 5904 (tp->t_state <= TCPS_CLOSING)) { 5905 /* 5906 * Ok we have no timer (persists, rack, tlp, rxt or 5907 * del-ack), we don't have segments being paced. So 5908 * all that is left is the keepalive timer. 
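 *
 * Illustrative example: if TF_DELACK had been set with, say, a 40ms
 * tcp_delacktime, hpts_timeout would already be 40000us above and
 * keep-alive would never be consulted; only when both the pacing
 * slot and every other timer work out to zero do we fall back to
 * TP_KEEPIDLE/TP_KEEPINIT below.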
5909 */ 5910 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 5911 /* Get the established keep-alive time */ 5912 hpts_timeout = TICKS_2_USEC(TP_KEEPIDLE(tp)); 5913 } else { 5914 /* 5915 * Get the initial setup keep-alive time, 5916 * note that this is probably not going to 5917 * happen, since rack will be running a rxt timer 5918 * if a SYN of some sort is outstanding. It is 5919 * actually handled in rack_timeout_rxt(). 5920 */ 5921 hpts_timeout = TICKS_2_USEC(TP_KEEPINIT(tp)); 5922 } 5923 rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; 5924 if (rack->in_probe_rtt) { 5925 /* 5926 * We want to instead not wake up a long time from 5927 * now but to wake up about the time we would 5928 * exit probe-rtt and initiate a keep-alive ack. 5929 * This will get us out of probe-rtt and update 5930 * our min-rtt. 5931 */ 5932 hpts_timeout = rack_min_probertt_hold; 5933 } 5934 } 5935 } 5936 if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) == 5937 (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) { 5938 /* 5939 * RACK, TLP, persists and RXT timers all are restartable 5940 * based on actions input .. i.e we received a packet (ack 5941 * or sack) and that changes things (rw, or snd_una etc). 5942 * Thus we can restart them with a new value. For 5943 * keep-alive, delayed_ack we keep track of what was left 5944 * and restart the timer with a smaller value. 5945 */ 5946 if (left < hpts_timeout) 5947 hpts_timeout = left; 5948 } 5949 if (hpts_timeout) { 5950 /* 5951 * Hack alert for now we can't time-out over 2,147,483 5952 * seconds (a bit more than 596 hours), which is probably ok 5953 * :). 5954 */ 5955 if (hpts_timeout > 0x7ffffffe) 5956 hpts_timeout = 0x7ffffffe; 5957 rack->r_ctl.rc_timer_exp = cts + hpts_timeout; 5958 } 5959 rack_log_pacing_delay_calc(rack, entry_slot, slot, hpts_timeout, 0, 0, 27, __LINE__, NULL, 0); 5960 if ((rack->gp_ready == 0) && 5961 (rack->use_fixed_rate == 0) && 5962 (hpts_timeout < slot) && 5963 (rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) { 5964 /* 5965 * We have no good estimate yet for the 5966 * old clunky burst mitigation or the 5967 * real pacing. And the tlp or rxt is smaller 5968 * than the pacing calculation. Lets not 5969 * pace that long since we know the calculation 5970 * so far is not accurate. 5971 */ 5972 slot = hpts_timeout; 5973 } 5974 rack->r_ctl.last_pacing_time = slot; 5975 /** 5976 * Turn off all the flags for queuing by default. The 5977 * flags have important meanings to what happens when 5978 * LRO interacts with the transport. Most likely (by default now) 5979 * mbuf_queueing and ack compression are on. So the transport 5980 * has a couple of flags that control what happens (if those 5981 * are not on then these flags won't have any effect since it 5982 * won't go through the queuing LRO path). 5983 * 5984 * INP_MBUF_QUEUE_READY - This flags says that I am busy 5985 * pacing output, so don't disturb. But 5986 * it also means LRO can wake me if there 5987 * is a SACK arrival. 5988 * 5989 * INP_DONT_SACK_QUEUE - This flag is used in conjunction 5990 * with the above flag (QUEUE_READY) and 5991 * when present it says don't even wake me 5992 * if a SACK arrives. 5993 * 5994 * The idea behind these flags is that if we are pacing we 5995 * set the MBUF_QUEUE_READY and only get woken up if 5996 * a SACK arrives (which could change things) or if 5997 * our pacing timer expires. If, however, we have a rack 5998 * timer running, then we don't even want a sack to wake 5999 * us since the rack timer has to expire before we can send. 
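 *
 * In short, the cases handled below work out to (illustrative
 * summary):
 *
 *     pacing slot, no rack timer   -> INP_MBUF_QUEUE_READY
 *                                     (a SACK may still wake us)
 *     pacing slot + rack timer,
 *     r_rr_config != 3             -> INP_MBUF_QUEUE_READY and
 *                                     INP_DONT_SACK_QUEUE
 *     rc_ack_can_sendout_data set  -> neither flag, any ack may
 *                                     wake us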
6000 * 6001 * Other cases should usually have none of the flags set 6002 * so LRO can call into us. 6003 */ 6004 inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY); 6005 if (slot) { 6006 rack->r_ctl.rc_last_output_to = us_cts + slot; 6007 /* 6008 * A pacing timer (slot) is being set, in 6009 * such a case we cannot send (we are blocked by 6010 * the timer). So lets tell LRO that it should not 6011 * wake us unless there is a SACK. Note this only 6012 * will be effective if mbuf queueing is on or 6013 * compressed acks are being processed. 6014 */ 6015 inp->inp_flags2 |= INP_MBUF_QUEUE_READY; 6016 /* 6017 * But wait if we have a Rack timer running 6018 * even a SACK should not disturb us (with 6019 * the exception of r_rr_config 3). 6020 */ 6021 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) && 6022 (rack->r_rr_config != 3)) 6023 inp->inp_flags2 |= INP_DONT_SACK_QUEUE; 6024 if (rack->rc_ack_can_sendout_data) { 6025 /* 6026 * Ahh but wait, this is that special case 6027 * where the pacing timer can be disturbed 6028 * backout the changes (used for non-paced 6029 * burst limiting). 6030 */ 6031 inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY); 6032 } 6033 if ((rack->use_rack_rr) && 6034 (rack->r_rr_config < 2) && 6035 ((hpts_timeout) && (hpts_timeout < slot))) { 6036 /* 6037 * Arrange for the hpts to kick back in after the 6038 * t-o if the t-o does not cause a send. 6039 */ 6040 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(hpts_timeout), 6041 __LINE__, &diag); 6042 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 6043 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 6044 } else { 6045 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(slot), 6046 __LINE__, &diag); 6047 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 6048 rack_log_to_start(rack, cts, hpts_timeout, slot, 1); 6049 } 6050 } else if (hpts_timeout) { 6051 /* 6052 * With respect to inp_flags2 here, lets let any new acks wake 6053 * us up here. Since we are not pacing (no pacing timer), output 6054 * can happen so we should let it. If its a Rack timer, then any inbound 6055 * packet probably won't change the sending (we will be blocked) 6056 * but it may change the prr stats so letting it in (the set defaults 6057 * at the start of this block) are good enough. 6058 */ 6059 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(hpts_timeout), 6060 __LINE__, &diag); 6061 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 6062 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 6063 } else { 6064 /* No timer starting */ 6065 #ifdef INVARIANTS 6066 if (SEQ_GT(tp->snd_max, tp->snd_una)) { 6067 panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?", 6068 tp, rack, tot_len_this_send, cts, slot, hpts_timeout); 6069 } 6070 #endif 6071 } 6072 rack->rc_tmr_stopped = 0; 6073 if (slot) 6074 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv); 6075 } 6076 6077 /* 6078 * RACK Timer, here we simply do logging and house keeping. 6079 * the normal rack_output() function will call the 6080 * appropriate thing to check if we need to do a RACK retransmit. 6081 * We return 1, saying don't proceed with rack_output only 6082 * when all timers have been stopped (destroyed PCB?). 6083 */ 6084 static int 6085 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 6086 { 6087 /* 6088 * This timer simply provides an internal trigger to send out data. 
6089 * The check_recovery_mode call will see if there are needed 6090 * retransmissions, if so we will enter fast-recovery. The output 6091 * call may or may not do the same thing depending on sysctl 6092 * settings. 6093 */ 6094 struct rack_sendmap *rsm; 6095 6096 if (tp->t_timers->tt_flags & TT_STOPPED) { 6097 return (1); 6098 } 6099 counter_u64_add(rack_to_tot, 1); 6100 if (rack->r_state && (rack->r_state != tp->t_state)) 6101 rack_set_state(tp, rack); 6102 rack->rc_on_min_to = 0; 6103 rsm = rack_check_recovery_mode(tp, cts); 6104 rack_log_to_event(rack, RACK_TO_FRM_RACK, rsm); 6105 if (rsm) { 6106 rack->r_ctl.rc_resend = rsm; 6107 rack->r_timer_override = 1; 6108 if (rack->use_rack_rr) { 6109 /* 6110 * Don't accumulate extra pacing delay 6111 * we are allowing the rack timer to 6112 * over-ride pacing i.e. rrr takes precedence 6113 * if the pacing interval is longer than the rrr 6114 * time (in other words we get the min pacing 6115 * time versus rrr pacing time). 6116 */ 6117 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 6118 } 6119 } 6120 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; 6121 if (rsm == NULL) { 6122 /* restart a timer and return 1 */ 6123 rack_start_hpts_timer(rack, tp, cts, 6124 0, 0, 0); 6125 return (1); 6126 } 6127 return (0); 6128 } 6129 6130 static void 6131 rack_adjust_orig_mlen(struct rack_sendmap *rsm) 6132 { 6133 if (rsm->m->m_len > rsm->orig_m_len) { 6134 /* 6135 * Mbuf grew, caused by sbcompress, our offset does 6136 * not change. 6137 */ 6138 rsm->orig_m_len = rsm->m->m_len; 6139 } else if (rsm->m->m_len < rsm->orig_m_len) { 6140 /* 6141 * Mbuf shrank, trimmed off the top by an ack, our 6142 * offset changes. 6143 */ 6144 rsm->soff -= (rsm->orig_m_len - rsm->m->m_len); 6145 rsm->orig_m_len = rsm->m->m_len; 6146 } 6147 } 6148 6149 static void 6150 rack_setup_offset_for_rsm(struct rack_sendmap *src_rsm, struct rack_sendmap *rsm) 6151 { 6152 struct mbuf *m; 6153 uint32_t soff; 6154 6155 if (src_rsm->m && (src_rsm->orig_m_len != src_rsm->m->m_len)) { 6156 /* Fix up the orig_m_len and possibly the mbuf offset */ 6157 rack_adjust_orig_mlen(src_rsm); 6158 } 6159 m = src_rsm->m; 6160 soff = src_rsm->soff + (src_rsm->r_end - src_rsm->r_start); 6161 while (soff >= m->m_len) { 6162 /* Move out past this mbuf */ 6163 soff -= m->m_len; 6164 m = m->m_next; 6165 KASSERT((m != NULL), 6166 ("rsm:%p nrsm:%p hit at soff:%u null m", 6167 src_rsm, rsm, soff)); 6168 } 6169 rsm->m = m; 6170 rsm->soff = soff; 6171 rsm->orig_m_len = m->m_len; 6172 } 6173 6174 static __inline void 6175 rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm, 6176 struct rack_sendmap *rsm, uint32_t start) 6177 { 6178 int idx; 6179 6180 nrsm->r_start = start; 6181 nrsm->r_end = rsm->r_end; 6182 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 6183 nrsm->r_flags = rsm->r_flags; 6184 nrsm->r_dupack = rsm->r_dupack; 6185 nrsm->r_no_rtt_allowed = rsm->r_no_rtt_allowed; 6186 nrsm->r_rtr_bytes = 0; 6187 nrsm->r_fas = rsm->r_fas; 6188 rsm->r_end = nrsm->r_start; 6189 nrsm->r_just_ret = rsm->r_just_ret; 6190 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 6191 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 6192 } 6193 /* Now if we have SYN flag we keep it on the left edge */ 6194 if (nrsm->r_flags & RACK_HAS_SYN) 6195 nrsm->r_flags &= ~RACK_HAS_SYN; 6196 /* Now if we have a FIN flag we keep it on the right edge */ 6197 if (rsm->r_flags & RACK_HAS_FIN) 6198 rsm->r_flags &= ~RACK_HAS_FIN; 6199 /* Push bit must go to the right edge as well */ 6200 if (rsm->r_flags & RACK_HAD_PUSH) 6201 rsm->r_flags &= 
~RACK_HAD_PUSH; 6202 /* Clone over the state of the hw_tls flag */ 6203 nrsm->r_hw_tls = rsm->r_hw_tls; 6204 /* 6205 * Now we need to find nrsm's new location in the mbuf chain 6206 * we basically calculate a new offset, which is soff + 6207 * how much is left in original rsm. Then we walk out the mbuf 6208 * chain to find the righ postion, it may be the same mbuf 6209 * or maybe not. 6210 */ 6211 KASSERT(((rsm->m != NULL) || 6212 (rsm->r_flags & (RACK_HAS_SYN|RACK_HAS_FIN))), 6213 ("rsm:%p nrsm:%p rack:%p -- rsm->m is NULL?", rsm, nrsm, rack)); 6214 if (rsm->m) 6215 rack_setup_offset_for_rsm(rsm, nrsm); 6216 } 6217 6218 static struct rack_sendmap * 6219 rack_merge_rsm(struct tcp_rack *rack, 6220 struct rack_sendmap *l_rsm, 6221 struct rack_sendmap *r_rsm) 6222 { 6223 /* 6224 * We are merging two ack'd RSM's, 6225 * the l_rsm is on the left (lower seq 6226 * values) and the r_rsm is on the right 6227 * (higher seq value). The simplest way 6228 * to merge these is to move the right 6229 * one into the left. I don't think there 6230 * is any reason we need to try to find 6231 * the oldest (or last oldest retransmitted). 6232 */ 6233 struct rack_sendmap *rm; 6234 6235 rack_log_map_chg(rack->rc_tp, rack, NULL, 6236 l_rsm, r_rsm, MAP_MERGE, r_rsm->r_end, __LINE__); 6237 l_rsm->r_end = r_rsm->r_end; 6238 if (l_rsm->r_dupack < r_rsm->r_dupack) 6239 l_rsm->r_dupack = r_rsm->r_dupack; 6240 if (r_rsm->r_rtr_bytes) 6241 l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes; 6242 if (r_rsm->r_in_tmap) { 6243 /* This really should not happen */ 6244 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext); 6245 r_rsm->r_in_tmap = 0; 6246 } 6247 6248 /* Now the flags */ 6249 if (r_rsm->r_flags & RACK_HAS_FIN) 6250 l_rsm->r_flags |= RACK_HAS_FIN; 6251 if (r_rsm->r_flags & RACK_TLP) 6252 l_rsm->r_flags |= RACK_TLP; 6253 if (r_rsm->r_flags & RACK_RWND_COLLAPSED) 6254 l_rsm->r_flags |= RACK_RWND_COLLAPSED; 6255 if ((r_rsm->r_flags & RACK_APP_LIMITED) && 6256 ((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) { 6257 /* 6258 * If both are app-limited then let the 6259 * free lower the count. If right is app 6260 * limited and left is not, transfer. 6261 */ 6262 l_rsm->r_flags |= RACK_APP_LIMITED; 6263 r_rsm->r_flags &= ~RACK_APP_LIMITED; 6264 if (r_rsm == rack->r_ctl.rc_first_appl) 6265 rack->r_ctl.rc_first_appl = l_rsm; 6266 } 6267 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm); 6268 #ifdef INVARIANTS 6269 if (rm != r_rsm) { 6270 panic("removing head in rack:%p rsm:%p rm:%p", 6271 rack, r_rsm, rm); 6272 } 6273 #endif 6274 if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) { 6275 /* Transfer the split limit to the map we free */ 6276 r_rsm->r_limit_type = l_rsm->r_limit_type; 6277 l_rsm->r_limit_type = 0; 6278 } 6279 rack_free(rack, r_rsm); 6280 return (l_rsm); 6281 } 6282 6283 /* 6284 * TLP Timer, here we simply setup what segment we want to 6285 * have the TLP expire on, the normal rack_output() will then 6286 * send it out. 6287 * 6288 * We return 1, saying don't proceed with rack_output only 6289 * when all timers have been stopped (destroyed PCB?). 6290 */ 6291 static int 6292 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t *doing_tlp) 6293 { 6294 /* 6295 * Tail Loss Probe. 
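 *
 * Roughly, the decision made below is (an illustrative sketch, not
 * the exact code, which also accounts for PRR and rwnd collapse):
 *
 *     avail = sbavail(&so->so_snd);
 *     out = tp->snd_max - tp->snd_una;
 *     if (avail > out and a full maxseg of new data fits in snd_wnd)
 *             send one new segment as the probe;
 *     else
 *             re-send the tail of the last un-acked rsm;
 *
 * Either way the resulting ACK or SACK gives RACK the evidence it
 * needs to repair a tail loss without waiting for a full RTO.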
6296 */ 6297 struct rack_sendmap *rsm = NULL; 6298 struct rack_sendmap *insret; 6299 struct socket *so; 6300 uint32_t amm; 6301 uint32_t out, avail; 6302 int collapsed_win = 0; 6303 6304 if (tp->t_timers->tt_flags & TT_STOPPED) { 6305 return (1); 6306 } 6307 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 6308 /* Its not time yet */ 6309 return (0); 6310 } 6311 if (ctf_progress_timeout_check(tp, true)) { 6312 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 6313 return (-ETIMEDOUT); /* tcp_drop() */ 6314 } 6315 /* 6316 * A TLP timer has expired. We have been idle for 2 rtts. So we now 6317 * need to figure out how to force a full MSS segment out. 6318 */ 6319 rack_log_to_event(rack, RACK_TO_FRM_TLP, NULL); 6320 rack->r_ctl.retran_during_recovery = 0; 6321 rack->r_ctl.dsack_byte_cnt = 0; 6322 counter_u64_add(rack_tlp_tot, 1); 6323 if (rack->r_state && (rack->r_state != tp->t_state)) 6324 rack_set_state(tp, rack); 6325 so = tp->t_inpcb->inp_socket; 6326 avail = sbavail(&so->so_snd); 6327 out = tp->snd_max - tp->snd_una; 6328 if (out > tp->snd_wnd) { 6329 /* special case, we need a retransmission */ 6330 collapsed_win = 1; 6331 goto need_retran; 6332 } 6333 if (rack->r_ctl.dsack_persist && (rack->r_ctl.rc_tlp_cnt_out >= 1)) { 6334 rack->r_ctl.dsack_persist--; 6335 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) { 6336 rack->r_ctl.num_dsack = 0; 6337 } 6338 rack_log_dsack_event(rack, 1, __LINE__, 0, 0); 6339 } 6340 if ((tp->t_flags & TF_GPUTINPROG) && 6341 (rack->r_ctl.rc_tlp_cnt_out == 1)) { 6342 /* 6343 * If this is the second in a row 6344 * TLP and we are doing a measurement 6345 * its time to abandon the measurement. 6346 * Something is likely broken on 6347 * the clients network and measuring a 6348 * broken network does us no good. 6349 */ 6350 tp->t_flags &= ~TF_GPUTINPROG; 6351 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 6352 rack->r_ctl.rc_gp_srtt /*flex1*/, 6353 tp->gput_seq, 6354 0, 0, 18, __LINE__, NULL, 0); 6355 } 6356 /* 6357 * Check our send oldest always settings, and if 6358 * there is an oldest to send jump to the need_retran. 6359 */ 6360 if (rack_always_send_oldest && (TAILQ_EMPTY(&rack->r_ctl.rc_tmap) == 0)) 6361 goto need_retran; 6362 6363 if (avail > out) { 6364 /* New data is available */ 6365 amm = avail - out; 6366 if (amm > ctf_fixed_maxseg(tp)) { 6367 amm = ctf_fixed_maxseg(tp); 6368 if ((amm + out) > tp->snd_wnd) { 6369 /* We are rwnd limited */ 6370 goto need_retran; 6371 } 6372 } else if (amm < ctf_fixed_maxseg(tp)) { 6373 /* not enough to fill a MTU */ 6374 goto need_retran; 6375 } 6376 if (IN_FASTRECOVERY(tp->t_flags)) { 6377 /* Unlikely */ 6378 if (rack->rack_no_prr == 0) { 6379 if (out + amm <= tp->snd_wnd) { 6380 rack->r_ctl.rc_prr_sndcnt = amm; 6381 rack->r_ctl.rc_tlp_new_data = amm; 6382 rack_log_to_prr(rack, 4, 0); 6383 } 6384 } else 6385 goto need_retran; 6386 } else { 6387 /* Set the send-new override */ 6388 if (out + amm <= tp->snd_wnd) 6389 rack->r_ctl.rc_tlp_new_data = amm; 6390 else 6391 goto need_retran; 6392 } 6393 rack->r_ctl.rc_tlpsend = NULL; 6394 counter_u64_add(rack_tlp_newdata, 1); 6395 goto send; 6396 } 6397 need_retran: 6398 /* 6399 * Ok we need to arrange the last un-acked segment to be re-sent, or 6400 * optionally the first un-acked segment. 
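 *
 * Illustrative example: if the chosen rsm covers [1000, 4000) and
 * maxseg = 1448, it is cloned below at 4000 - 1448 = 2552 so that
 * only the trailing [2552, 4000) piece becomes rc_tlpsend; the
 * original entry shrinks to [1000, 2552).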
6401 */ 6402 if (collapsed_win == 0) { 6403 if (rack_always_send_oldest) 6404 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 6405 else { 6406 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 6407 if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { 6408 rsm = rack_find_high_nonack(rack, rsm); 6409 } 6410 } 6411 if (rsm == NULL) { 6412 counter_u64_add(rack_tlp_does_nada, 1); 6413 #ifdef TCP_BLACKBOX 6414 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 6415 #endif 6416 goto out; 6417 } 6418 } else { 6419 /* 6420 * We must find the last segment 6421 * that was acceptable by the client. 6422 */ 6423 RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 6424 if ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0) { 6425 /* Found one */ 6426 break; 6427 } 6428 } 6429 if (rsm == NULL) { 6430 /* None? if so send the first */ 6431 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 6432 if (rsm == NULL) { 6433 counter_u64_add(rack_tlp_does_nada, 1); 6434 #ifdef TCP_BLACKBOX 6435 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 6436 #endif 6437 goto out; 6438 } 6439 } 6440 } 6441 if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) { 6442 /* 6443 * We need to split this the last segment in two. 6444 */ 6445 struct rack_sendmap *nrsm; 6446 6447 nrsm = rack_alloc_full_limit(rack); 6448 if (nrsm == NULL) { 6449 /* 6450 * No memory to split, we will just exit and punt 6451 * off to the RXT timer. 6452 */ 6453 counter_u64_add(rack_tlp_does_nada, 1); 6454 goto out; 6455 } 6456 rack_clone_rsm(rack, nrsm, rsm, 6457 (rsm->r_end - ctf_fixed_maxseg(tp))); 6458 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); 6459 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 6460 #ifdef INVARIANTS 6461 if (insret != NULL) { 6462 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 6463 nrsm, insret, rack, rsm); 6464 } 6465 #endif 6466 if (rsm->r_in_tmap) { 6467 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 6468 nrsm->r_in_tmap = 1; 6469 } 6470 rsm = nrsm; 6471 } 6472 rack->r_ctl.rc_tlpsend = rsm; 6473 send: 6474 /* Make sure output path knows we are doing a TLP */ 6475 *doing_tlp = 1; 6476 rack->r_timer_override = 1; 6477 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 6478 return (0); 6479 out: 6480 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 6481 return (0); 6482 } 6483 6484 /* 6485 * Delayed ack Timer, here we simply need to setup the 6486 * ACK_NOW flag and remove the DELACK flag. From there 6487 * the output routine will send the ack out. 6488 * 6489 * We only return 1, saying don't proceed, if all timers 6490 * are stopped (destroyed PCB?). 6491 */ 6492 static int 6493 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 6494 { 6495 if (tp->t_timers->tt_flags & TT_STOPPED) { 6496 return (1); 6497 } 6498 rack_log_to_event(rack, RACK_TO_FRM_DELACK, NULL); 6499 tp->t_flags &= ~TF_DELACK; 6500 tp->t_flags |= TF_ACKNOW; 6501 KMOD_TCPSTAT_INC(tcps_delack); 6502 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 6503 return (0); 6504 } 6505 6506 /* 6507 * Persists timer, here we simply send the 6508 * same thing as a keepalive will. 6509 * the one byte send. 6510 * 6511 * We only return 1, saying don't proceed, if all timers 6512 * are stopped (destroyed PCB?). 
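 * The probe is built with tcpip_maketemplate() and sent with sequence
 * snd_una - 1 so the peer has to respond; un-answered probes bump
 * probe_not_answered/persist_lost_ends, and the connection is dropped once
 * the normal backoff or TCPTV_PERSMAX limits are exceeded.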
6513 */ 6514 static int 6515 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 6516 { 6517 struct tcptemp *t_template; 6518 struct inpcb *inp; 6519 int32_t retval = 1; 6520 6521 inp = tp->t_inpcb; 6522 6523 if (tp->t_timers->tt_flags & TT_STOPPED) { 6524 return (1); 6525 } 6526 if (rack->rc_in_persist == 0) 6527 return (0); 6528 if (ctf_progress_timeout_check(tp, false)) { 6529 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 6530 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 6531 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends); 6532 return (-ETIMEDOUT); /* tcp_drop() */ 6533 } 6534 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 6535 /* 6536 * Persistence timer into zero window. Force a byte to be output, if 6537 * possible. 6538 */ 6539 KMOD_TCPSTAT_INC(tcps_persisttimeo); 6540 /* 6541 * Hack: if the peer is dead/unreachable, we do not time out if the 6542 * window is closed. After a full backoff, drop the connection if 6543 * the idle time (no responses to probes) reaches the maximum 6544 * backoff that we would use if retransmitting. 6545 */ 6546 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 6547 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 6548 TICKS_2_USEC(ticks - tp->t_rcvtime) >= RACK_REXMTVAL(tp) * tcp_totbackoff)) { 6549 KMOD_TCPSTAT_INC(tcps_persistdrop); 6550 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 6551 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends); 6552 retval = -ETIMEDOUT; /* tcp_drop() */ 6553 goto out; 6554 } 6555 if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) && 6556 tp->snd_una == tp->snd_max) 6557 rack_exit_persist(tp, rack, cts); 6558 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; 6559 /* 6560 * If the user has closed the socket then drop a persisting 6561 * connection after a much reduced timeout. 6562 */ 6563 if (tp->t_state > TCPS_CLOSE_WAIT && 6564 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 6565 KMOD_TCPSTAT_INC(tcps_persistdrop); 6566 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 6567 counter_u64_add(rack_persists_lost_ends, rack->r_ctl.persist_lost_ends); 6568 retval = -ETIMEDOUT; /* tcp_drop() */ 6569 goto out; 6570 } 6571 t_template = tcpip_maketemplate(rack->rc_inp); 6572 if (t_template) { 6573 /* only set it if we were answered */ 6574 if (rack->forced_ack == 0) { 6575 rack->forced_ack = 1; 6576 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); 6577 } else { 6578 rack->probe_not_answered = 1; 6579 counter_u64_add(rack_persists_loss, 1); 6580 rack->r_ctl.persist_lost_ends++; 6581 } 6582 counter_u64_add(rack_persists_sends, 1); 6583 tcp_respond(tp, t_template->tt_ipgen, 6584 &t_template->tt_t, (struct mbuf *)NULL, 6585 tp->rcv_nxt, tp->snd_una - 1, 0); 6586 /* This sends an ack */ 6587 if (tp->t_flags & TF_DELACK) 6588 tp->t_flags &= ~TF_DELACK; 6589 free(t_template, M_TEMP); 6590 } 6591 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 6592 tp->t_rxtshift++; 6593 out: 6594 rack_log_to_event(rack, RACK_TO_FRM_PERSIST, NULL); 6595 rack_start_hpts_timer(rack, tp, cts, 6596 0, 0, 0); 6597 return (retval); 6598 } 6599 6600 /* 6601 * If a keepalive goes off, we had no other timers 6602 * happening. We always return 1 here since this 6603 * routine either drops the connection or sends 6604 * out a segment with respond. 
6605 */ 6606 static int 6607 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 6608 { 6609 struct tcptemp *t_template; 6610 struct inpcb *inp; 6611 6612 if (tp->t_timers->tt_flags & TT_STOPPED) { 6613 return (1); 6614 } 6615 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; 6616 inp = tp->t_inpcb; 6617 rack_log_to_event(rack, RACK_TO_FRM_KEEP, NULL); 6618 /* 6619 * Keep-alive timer went off; send something or drop connection if 6620 * idle for too long. 6621 */ 6622 KMOD_TCPSTAT_INC(tcps_keeptimeo); 6623 if (tp->t_state < TCPS_ESTABLISHED) 6624 goto dropit; 6625 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 6626 tp->t_state <= TCPS_CLOSING) { 6627 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 6628 goto dropit; 6629 /* 6630 * Send a packet designed to force a response if the peer is 6631 * up and reachable: either an ACK if the connection is 6632 * still alive, or an RST if the peer has closed the 6633 * connection due to timeout or reboot. Using sequence 6634 * number tp->snd_una-1 causes the transmitted zero-length 6635 * segment to lie outside the receive window; by the 6636 * protocol spec, this requires the correspondent TCP to 6637 * respond. 6638 */ 6639 KMOD_TCPSTAT_INC(tcps_keepprobe); 6640 t_template = tcpip_maketemplate(inp); 6641 if (t_template) { 6642 if (rack->forced_ack == 0) { 6643 rack->forced_ack = 1; 6644 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); 6645 } else { 6646 rack->probe_not_answered = 1; 6647 } 6648 tcp_respond(tp, t_template->tt_ipgen, 6649 &t_template->tt_t, (struct mbuf *)NULL, 6650 tp->rcv_nxt, tp->snd_una - 1, 0); 6651 free(t_template, M_TEMP); 6652 } 6653 } 6654 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 6655 return (1); 6656 dropit: 6657 KMOD_TCPSTAT_INC(tcps_keepdrops); 6658 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); 6659 return (-ETIMEDOUT); /* tcp_drop() */ 6660 } 6661 6662 /* 6663 * Retransmit helper function, clear up all the ack 6664 * flags and take care of important book keeping. 6665 */ 6666 static void 6667 rack_remxt_tmr(struct tcpcb *tp) 6668 { 6669 /* 6670 * The retransmit timer went off, all sack'd blocks must be 6671 * un-acked. 6672 */ 6673 struct rack_sendmap *rsm, *trsm = NULL; 6674 struct tcp_rack *rack; 6675 6676 rack = (struct tcp_rack *)tp->t_fb_ptr; 6677 rack_timer_cancel(tp, rack, tcp_get_usecs(NULL), __LINE__); 6678 rack_log_to_event(rack, RACK_TO_FRM_TMR, NULL); 6679 if (rack->r_state && (rack->r_state != tp->t_state)) 6680 rack_set_state(tp, rack); 6681 /* 6682 * Ideally we would like to be able to 6683 * mark SACK-PASS on anything not acked here. 6684 * 6685 * However, if we do that we would burst out 6686 * all that data 1ms apart. This would be unwise, 6687 * so for now we will just let the normal rxt timer 6688 * and tlp timer take care of it. 6689 * 6690 * Also we really need to stick them back in sequence 6691 * order. This way we send in the proper order and any 6692 * sacks that come floating in will "re-ack" the data. 6693 * To do this we zap the tmap with an INIT and then 6694 * walk through and place every rsm in the RB tree 6695 * back in its seq ordered place. 
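 * (In other words the tmap is rebuilt in sequence order from the RB tree
 * and every rsm gets RACK_MUST_RXT set so it will be sent again.)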
6696 */ 6697 TAILQ_INIT(&rack->r_ctl.rc_tmap); 6698 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 6699 rsm->r_dupack = 0; 6700 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 6701 /* We must re-add it back to the tlist */ 6702 if (trsm == NULL) { 6703 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 6704 } else { 6705 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); 6706 } 6707 rsm->r_in_tmap = 1; 6708 trsm = rsm; 6709 if (rsm->r_flags & RACK_ACKED) 6710 rsm->r_flags |= RACK_WAS_ACKED; 6711 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS); 6712 rsm->r_flags |= RACK_MUST_RXT; 6713 } 6714 /* Clear the count (we just un-acked them) */ 6715 rack->r_ctl.rc_last_timeout_snduna = tp->snd_una; 6716 rack->r_ctl.rc_sacked = 0; 6717 rack->r_ctl.rc_sacklast = NULL; 6718 rack->r_ctl.rc_agg_delayed = 0; 6719 rack->r_early = 0; 6720 rack->r_ctl.rc_agg_early = 0; 6721 rack->r_late = 0; 6722 /* Clear the tlp rtx mark */ 6723 rack->r_ctl.rc_resend = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 6724 if (rack->r_ctl.rc_resend != NULL) 6725 rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT; 6726 rack->r_ctl.rc_prr_sndcnt = 0; 6727 rack_log_to_prr(rack, 6, 0); 6728 rack->r_timer_override = 1; 6729 if ((((tp->t_flags & TF_SACK_PERMIT) == 0) 6730 #ifdef NETFLIX_EXP_DETECTION 6731 || (rack->sack_attack_disable != 0) 6732 #endif 6733 ) && ((tp->t_flags & TF_SENTFIN) == 0)) { 6734 /* 6735 * For non-sack customers new data 6736 * needs to go out as retransmits until 6737 * we retransmit up to snd_max. 6738 */ 6739 rack->r_must_retran = 1; 6740 rack->r_ctl.rc_out_at_rto = ctf_flight_size(rack->rc_tp, 6741 rack->r_ctl.rc_sacked); 6742 } 6743 rack->r_ctl.rc_snd_max_at_rto = tp->snd_max; 6744 } 6745 6746 static void 6747 rack_convert_rtts(struct tcpcb *tp) 6748 { 6749 if (tp->t_srtt > 1) { 6750 uint32_t val, frac; 6751 6752 val = tp->t_srtt >> TCP_RTT_SHIFT; 6753 frac = tp->t_srtt & 0x1f; 6754 tp->t_srtt = TICKS_2_USEC(val); 6755 /* 6756 * frac is the fractional part of the srtt (if any) 6757 * but it's in ticks and every bit represents 6758 * 1/32nd of a hz. 6759 */ 6760 if (frac) { 6761 if (hz == 1000) { 6762 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE); 6763 } else { 6764 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE)); 6765 } 6766 tp->t_srtt += frac; 6767 } 6768 } 6769 if (tp->t_rttvar) { 6770 uint32_t val, frac; 6771 6772 val = tp->t_rttvar >> TCP_RTTVAR_SHIFT; 6773 frac = tp->t_rttvar & 0x1f; 6774 tp->t_rttvar = TICKS_2_USEC(val); 6775 /* 6776 * frac is the fractional part of the rttvar (if any) 6777 * but it's in ticks and every bit represents 6778 * 1/32nd of a hz.
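 * For example, with hz == 1000 (1 ms ticks) a frac of 16 is 16/32 of a
 * tick, i.e. 500 usec that gets added back in below.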
6779 */ 6780 if (frac) { 6781 if (hz == 1000) { 6782 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE); 6783 } else { 6784 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE)); 6785 } 6786 tp->t_rttvar += frac; 6787 } 6788 } 6789 tp->t_rxtcur = RACK_REXMTVAL(tp); 6790 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 6791 tp->t_rxtcur += TICKS_2_USEC(tcp_rexmit_slop); 6792 } 6793 if (tp->t_rxtcur > rack_rto_max) { 6794 tp->t_rxtcur = rack_rto_max; 6795 } 6796 } 6797 6798 static void 6799 rack_cc_conn_init(struct tcpcb *tp) 6800 { 6801 struct tcp_rack *rack; 6802 uint32_t srtt; 6803 6804 rack = (struct tcp_rack *)tp->t_fb_ptr; 6805 srtt = tp->t_srtt; 6806 cc_conn_init(tp); 6807 /* 6808 * Now convert to rack's internal format, 6809 * if required. 6810 */ 6811 if ((srtt == 0) && (tp->t_srtt != 0)) 6812 rack_convert_rtts(tp); 6813 /* 6814 * We want a chance to stay in slowstart as 6815 * we create a connection. TCP spec says that 6816 * initially ssthresh is infinite. For our 6817 * purposes that is the snd_wnd. 6818 */ 6819 if (tp->snd_ssthresh < tp->snd_wnd) { 6820 tp->snd_ssthresh = tp->snd_wnd; 6821 } 6822 /* 6823 * We also want to assure a IW worth of 6824 * data can get inflight. 6825 */ 6826 if (rc_init_window(rack) < tp->snd_cwnd) 6827 tp->snd_cwnd = rc_init_window(rack); 6828 } 6829 6830 /* 6831 * Re-transmit timeout! If we drop the PCB we will return 1, otherwise 6832 * we will setup to retransmit the lowest seq number outstanding. 6833 */ 6834 static int 6835 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 6836 { 6837 int32_t rexmt; 6838 struct inpcb *inp; 6839 int32_t retval = 0; 6840 bool isipv6; 6841 6842 inp = tp->t_inpcb; 6843 if (tp->t_timers->tt_flags & TT_STOPPED) { 6844 return (1); 6845 } 6846 if ((tp->t_flags & TF_GPUTINPROG) && 6847 (tp->t_rxtshift)) { 6848 /* 6849 * We have had a second timeout 6850 * measurements on successive rxt's are not profitable. 6851 * It is unlikely to be of any use (the network is 6852 * broken or the client went away). 6853 */ 6854 tp->t_flags &= ~TF_GPUTINPROG; 6855 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 6856 rack->r_ctl.rc_gp_srtt /*flex1*/, 6857 tp->gput_seq, 6858 0, 0, 18, __LINE__, NULL, 0); 6859 } 6860 if (ctf_progress_timeout_check(tp, false)) { 6861 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 6862 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 6863 return (-ETIMEDOUT); /* tcp_drop() */ 6864 } 6865 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; 6866 rack->r_ctl.retran_during_recovery = 0; 6867 rack->r_ctl.dsack_byte_cnt = 0; 6868 if (IN_FASTRECOVERY(tp->t_flags)) 6869 tp->t_flags |= TF_WASFRECOVERY; 6870 else 6871 tp->t_flags &= ~TF_WASFRECOVERY; 6872 if (IN_CONGRECOVERY(tp->t_flags)) 6873 tp->t_flags |= TF_WASCRECOVERY; 6874 else 6875 tp->t_flags &= ~TF_WASCRECOVERY; 6876 if (TCPS_HAVEESTABLISHED(tp->t_state) && 6877 (tp->snd_una == tp->snd_max)) { 6878 /* Nothing outstanding .. nothing to do */ 6879 return (0); 6880 } 6881 if (rack->r_ctl.dsack_persist) { 6882 rack->r_ctl.dsack_persist--; 6883 if (rack->r_ctl.num_dsack && (rack->r_ctl.dsack_persist == 0)) { 6884 rack->r_ctl.num_dsack = 0; 6885 } 6886 rack_log_dsack_event(rack, 1, __LINE__, 0, 0); 6887 } 6888 /* 6889 * Rack can only run one timer at a time, so we cannot 6890 * run a KEEPINIT (gating SYN sending) and a retransmit 6891 * timer for the SYN. 
So if we are in a front state and 6892 * have a KEEPINIT timer we need to check the first transmit 6893 * against now to see if we have exceeded the KEEPINIT time 6894 * (if one is set). 6895 */ 6896 if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) && 6897 (TP_KEEPINIT(tp) != 0)) { 6898 struct rack_sendmap *rsm; 6899 6900 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 6901 if (rsm) { 6902 /* Ok we have something outstanding to test keepinit with */ 6903 if ((TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) && 6904 ((cts - (uint32_t)rsm->r_tim_lastsent[0]) >= TICKS_2_USEC(TP_KEEPINIT(tp)))) { 6905 /* We have exceeded the KEEPINIT time */ 6906 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); 6907 goto drop_it; 6908 } 6909 } 6910 } 6911 /* 6912 * Retransmission timer went off. Message has not been acked within 6913 * retransmit interval. Back off to a longer retransmit interval 6914 * and retransmit one segment. 6915 */ 6916 rack_remxt_tmr(tp); 6917 if ((rack->r_ctl.rc_resend == NULL) || 6918 ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) { 6919 /* 6920 * If the rwnd collapsed on 6921 * the one we are retransmitting 6922 * it does not count against the 6923 * rxt count. 6924 */ 6925 tp->t_rxtshift++; 6926 } 6927 if (tp->t_rxtshift > TCP_MAXRXTSHIFT) { 6928 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 6929 drop_it: 6930 tp->t_rxtshift = TCP_MAXRXTSHIFT; 6931 KMOD_TCPSTAT_INC(tcps_timeoutdrop); 6932 /* XXXGL: previously t_softerror was casted to uint16_t */ 6933 MPASS(tp->t_softerror >= 0); 6934 retval = tp->t_softerror ? -tp->t_softerror : -ETIMEDOUT; 6935 goto out; /* tcp_drop() */ 6936 } 6937 if (tp->t_state == TCPS_SYN_SENT) { 6938 /* 6939 * If the SYN was retransmitted, indicate CWND to be limited 6940 * to 1 segment in cc_conn_init(). 6941 */ 6942 tp->snd_cwnd = 1; 6943 } else if (tp->t_rxtshift == 1) { 6944 /* 6945 * first retransmit; record ssthresh and cwnd so they can be 6946 * recovered if this turns out to be a "bad" retransmit. A 6947 * retransmit is considered "bad" if an ACK for this segment 6948 * is received within RTT/2 interval; the assumption here is 6949 * that the ACK was already in flight. See "On Estimating 6950 * End-to-End Network Path Properties" by Allman and Paxson 6951 * for more details. 6952 */ 6953 tp->snd_cwnd_prev = tp->snd_cwnd; 6954 tp->snd_ssthresh_prev = tp->snd_ssthresh; 6955 tp->snd_recover_prev = tp->snd_recover; 6956 tp->t_badrxtwin = ticks + (USEC_2_TICKS(tp->t_srtt)/2); 6957 tp->t_flags |= TF_PREVVALID; 6958 } else if ((tp->t_flags & TF_RCVD_TSTMP) == 0) 6959 tp->t_flags &= ~TF_PREVVALID; 6960 KMOD_TCPSTAT_INC(tcps_rexmttimeo); 6961 if ((tp->t_state == TCPS_SYN_SENT) || 6962 (tp->t_state == TCPS_SYN_RECEIVED)) 6963 rexmt = RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]; 6964 else 6965 rexmt = max(rack_rto_min, (tp->t_srtt + (tp->t_rttvar << 2))) * tcp_backoff[tp->t_rxtshift]; 6966 6967 RACK_TCPT_RANGESET(tp->t_rxtcur, rexmt, 6968 max(rack_rto_min, rexmt), rack_rto_max, rack->r_ctl.timer_slop); 6969 /* 6970 * We enter the path for PLMTUD if connection is established or, if 6971 * connection is FIN_WAIT_1 status, reason for the last is that if 6972 * amount of data we send is very small, we could send it in couple 6973 * of packets and process straight to FIN. In that case we won't 6974 * catch ESTABLISHED state. 6975 */ 6976 #ifdef INET6 6977 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 
true : false; 6978 #else 6979 isipv6 = false; 6980 #endif 6981 if (((V_tcp_pmtud_blackhole_detect == 1) || 6982 (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) || 6983 (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) && 6984 ((tp->t_state == TCPS_ESTABLISHED) || 6985 (tp->t_state == TCPS_FIN_WAIT_1))) { 6986 /* 6987 * Idea here is that at each stage of mtu probe (usually, 6988 * 1448 -> 1188 -> 524) should be given 2 chances to recover 6989 * before further clamping down. 'tp->t_rxtshift % 2 == 0' 6990 * should take care of that. 6991 */ 6992 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == 6993 (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && 6994 (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && 6995 tp->t_rxtshift % 2 == 0)) { 6996 /* 6997 * Enter Path MTU Black-hole Detection mechanism: - 6998 * Disable Path MTU Discovery (IP "DF" bit). - 6999 * Reduce MTU to lower value than what we negotiated 7000 * with peer. 7001 */ 7002 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { 7003 /* Record that we may have found a black hole. */ 7004 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 7005 /* Keep track of previous MSS. */ 7006 tp->t_pmtud_saved_maxseg = tp->t_maxseg; 7007 } 7008 7009 /* 7010 * Reduce the MSS to blackhole value or to the 7011 * default in an attempt to retransmit. 7012 */ 7013 #ifdef INET6 7014 if (isipv6 && 7015 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { 7016 /* Use the sysctl tuneable blackhole MSS. */ 7017 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; 7018 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 7019 } else if (isipv6) { 7020 /* Use the default MSS. */ 7021 tp->t_maxseg = V_tcp_v6mssdflt; 7022 /* 7023 * Disable Path MTU Discovery when we switch 7024 * to minmss. 7025 */ 7026 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 7027 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 7028 } 7029 #endif 7030 #if defined(INET6) && defined(INET) 7031 else 7032 #endif 7033 #ifdef INET 7034 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { 7035 /* Use the sysctl tuneable blackhole MSS. */ 7036 tp->t_maxseg = V_tcp_pmtud_blackhole_mss; 7037 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 7038 } else { 7039 /* Use the default MSS. */ 7040 tp->t_maxseg = V_tcp_mssdflt; 7041 /* 7042 * Disable Path MTU Discovery when we switch 7043 * to minmss. 7044 */ 7045 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 7046 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 7047 } 7048 #endif 7049 } else { 7050 /* 7051 * If further retransmissions are still unsuccessful 7052 * with a lowered MTU, maybe this isn't a blackhole 7053 * and we restore the previous MSS and blackhole 7054 * detection flags. The limit '6' is determined by 7055 * giving each probe stage (1448, 1188, 524) 2 7056 * chances to recover. 7057 */ 7058 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 7059 (tp->t_rxtshift >= 6)) { 7060 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 7061 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 7062 tp->t_maxseg = tp->t_pmtud_saved_maxseg; 7063 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed); 7064 } 7065 } 7066 } 7067 /* 7068 * Disable RFC1323 and SACK if we haven't got any response to 7069 * our third SYN to work-around some broken terminal servers 7070 * (most of which have hopefully been retired) that have bad VJ 7071 * header compression code which trashes TCP segments containing 7072 * unknown-to-them TCP options. 
7073 */ 7074 if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && 7075 (tp->t_rxtshift == 3)) 7076 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); 7077 /* 7078 * If we backed off this far, our srtt estimate is probably bogus. 7079 * Clobber it so we'll take the next rtt measurement as our srtt; 7080 * move the current srtt into rttvar to keep the current retransmit 7081 * times until then. 7082 */ 7083 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 7084 #ifdef INET6 7085 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 7086 in6_losing(tp->t_inpcb); 7087 else 7088 #endif 7089 in_losing(tp->t_inpcb); 7090 tp->t_rttvar += tp->t_srtt; 7091 tp->t_srtt = 0; 7092 } 7093 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 7094 tp->snd_recover = tp->snd_max; 7095 tp->t_flags |= TF_ACKNOW; 7096 tp->t_rtttime = 0; 7097 rack_cong_signal(tp, CC_RTO, tp->snd_una); 7098 out: 7099 return (retval); 7100 } 7101 7102 static int 7103 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling, uint8_t *doing_tlp) 7104 { 7105 int32_t ret = 0; 7106 int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK); 7107 7108 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 7109 (tp->t_flags & TF_GPUTINPROG)) { 7110 /* 7111 * We have a goodput in progress 7112 * and we have entered a late state. 7113 * Do we have enough data in the sb 7114 * to handle the GPUT request? 7115 */ 7116 uint32_t bytes; 7117 7118 bytes = tp->gput_ack - tp->gput_seq; 7119 if (SEQ_GT(tp->gput_seq, tp->snd_una)) 7120 bytes += tp->gput_seq - tp->snd_una; 7121 if (bytes > sbavail(&tp->t_inpcb->inp_socket->so_snd)) { 7122 /* 7123 * There are not enough bytes in the socket 7124 * buffer that have been sent to cover this 7125 * measurement. Cancel it. 7126 */ 7127 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 7128 rack->r_ctl.rc_gp_srtt /*flex1*/, 7129 tp->gput_seq, 7130 0, 0, 18, __LINE__, NULL, 0); 7131 tp->t_flags &= ~TF_GPUTINPROG; 7132 } 7133 } 7134 if (timers == 0) { 7135 return (0); 7136 } 7137 if (tp->t_state == TCPS_LISTEN) { 7138 /* no timers on listen sockets */ 7139 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) 7140 return (0); 7141 return (1); 7142 } 7143 if ((timers & PACE_TMR_RACK) && 7144 rack->rc_on_min_to) { 7145 /* 7146 * For the rack timer when we 7147 * are on a min-timeout (which means rrr_conf = 3) 7148 * we don't want to check the timer. It may 7149 * be going off for a pace and thats ok we 7150 * want to send the retransmit (if its ready). 7151 * 7152 * If its on a normal rack timer (non-min) then 7153 * we will check if its expired. 7154 */ 7155 goto skip_time_check; 7156 } 7157 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 7158 uint32_t left; 7159 7160 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 7161 ret = -1; 7162 rack_log_to_processing(rack, cts, ret, 0); 7163 return (0); 7164 } 7165 if (hpts_calling == 0) { 7166 /* 7167 * A user send or queued mbuf (sack) has called us? We 7168 * return 0 and let the pacing guards 7169 * deal with it if they should or 7170 * should not cause a send. 7171 */ 7172 ret = -2; 7173 rack_log_to_processing(rack, cts, ret, 0); 7174 return (0); 7175 } 7176 /* 7177 * Ok our timer went off early and we are not paced false 7178 * alarm, go back to sleep. 
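 * We simply re-insert ourselves into the hpts wheel for the time that
 * remains (rc_timer_exp - cts) and log that we woke up early.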
7179 */ 7180 ret = -3; 7181 left = rack->r_ctl.rc_timer_exp - cts; 7182 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left)); 7183 rack_log_to_processing(rack, cts, ret, left); 7184 return (1); 7185 } 7186 skip_time_check: 7187 rack->rc_tmr_stopped = 0; 7188 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; 7189 if (timers & PACE_TMR_DELACK) { 7190 ret = rack_timeout_delack(tp, rack, cts); 7191 } else if (timers & PACE_TMR_RACK) { 7192 rack->r_ctl.rc_tlp_rxt_last_time = cts; 7193 rack->r_fast_output = 0; 7194 ret = rack_timeout_rack(tp, rack, cts); 7195 } else if (timers & PACE_TMR_TLP) { 7196 rack->r_ctl.rc_tlp_rxt_last_time = cts; 7197 ret = rack_timeout_tlp(tp, rack, cts, doing_tlp); 7198 } else if (timers & PACE_TMR_RXT) { 7199 rack->r_ctl.rc_tlp_rxt_last_time = cts; 7200 rack->r_fast_output = 0; 7201 ret = rack_timeout_rxt(tp, rack, cts); 7202 } else if (timers & PACE_TMR_PERSIT) { 7203 ret = rack_timeout_persist(tp, rack, cts); 7204 } else if (timers & PACE_TMR_KEEP) { 7205 ret = rack_timeout_keepalive(tp, rack, cts); 7206 } 7207 rack_log_to_processing(rack, cts, ret, timers); 7208 return (ret); 7209 } 7210 7211 static void 7212 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line) 7213 { 7214 struct timeval tv; 7215 uint32_t us_cts, flags_on_entry; 7216 uint8_t hpts_removed = 0; 7217 7218 flags_on_entry = rack->r_ctl.rc_hpts_flags; 7219 us_cts = tcp_get_usecs(&tv); 7220 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 7221 ((TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) || 7222 ((tp->snd_max - tp->snd_una) == 0))) { 7223 tcp_hpts_remove(rack->rc_inp); 7224 hpts_removed = 1; 7225 /* If we were not delayed cancel out the flag. */ 7226 if ((tp->snd_max - tp->snd_una) == 0) 7227 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 7228 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry); 7229 } 7230 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 7231 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 7232 if (tcp_in_hpts(rack->rc_inp) && 7233 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) { 7234 /* 7235 * Canceling timer's when we have no output being 7236 * paced. We also must remove ourselves from the 7237 * hpts. 7238 */ 7239 tcp_hpts_remove(rack->rc_inp); 7240 hpts_removed = 1; 7241 } 7242 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK); 7243 } 7244 if (hpts_removed == 0) 7245 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry); 7246 } 7247 7248 static void 7249 rack_timer_stop(struct tcpcb *tp, uint32_t timer_type) 7250 { 7251 return; 7252 } 7253 7254 static int 7255 rack_stopall(struct tcpcb *tp) 7256 { 7257 struct tcp_rack *rack; 7258 rack = (struct tcp_rack *)tp->t_fb_ptr; 7259 rack->t_timers_stopped = 1; 7260 return (0); 7261 } 7262 7263 static void 7264 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta) 7265 { 7266 return; 7267 } 7268 7269 static int 7270 rack_timer_active(struct tcpcb *tp, uint32_t timer_type) 7271 { 7272 return (0); 7273 } 7274 7275 static void 7276 rack_stop_all_timers(struct tcpcb *tp) 7277 { 7278 struct tcp_rack *rack; 7279 7280 /* 7281 * Assure no timers are running. 
7282 */ 7283 if (tcp_timer_active(tp, TT_PERSIST)) { 7284 /* We enter in persists, set the flag appropriately */ 7285 rack = (struct tcp_rack *)tp->t_fb_ptr; 7286 rack->rc_in_persist = 1; 7287 } 7288 tcp_timer_suspend(tp, TT_PERSIST); 7289 tcp_timer_suspend(tp, TT_REXMT); 7290 tcp_timer_suspend(tp, TT_KEEP); 7291 tcp_timer_suspend(tp, TT_DELACK); 7292 } 7293 7294 static void 7295 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, 7296 struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag) 7297 { 7298 int32_t idx; 7299 7300 rsm->r_rtr_cnt++; 7301 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7302 rsm->r_dupack = 0; 7303 if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) { 7304 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS; 7305 rsm->r_flags |= RACK_OVERMAX; 7306 } 7307 if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & RACK_TLP) == 0)) { 7308 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start); 7309 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start); 7310 } 7311 idx = rsm->r_rtr_cnt - 1; 7312 rsm->r_tim_lastsent[idx] = ts; 7313 /* 7314 * Here we don't add in the len of send, since it's already 7315 * in snd_una <-> snd_max. 7316 */ 7317 rsm->r_fas = ctf_flight_size(rack->rc_tp, 7318 rack->r_ctl.rc_sacked); 7319 if (rsm->r_flags & RACK_ACKED) { 7320 /* Probably MTU discovery messing with us */ 7321 rsm->r_flags &= ~RACK_ACKED; 7322 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 7323 } 7324 if (rsm->r_in_tmap) { 7325 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7326 rsm->r_in_tmap = 0; 7327 } 7328 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7329 rsm->r_in_tmap = 1; 7330 if (rsm->r_flags & RACK_SACK_PASSED) { 7331 /* We have retransmitted due to the SACK pass */ 7332 rsm->r_flags &= ~RACK_SACK_PASSED; 7333 rsm->r_flags |= RACK_WAS_SACKPASS; 7334 } 7335 } 7336 7337 static uint32_t 7338 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 7339 struct rack_sendmap *rsm, uint64_t ts, int32_t *lenp, uint16_t add_flag) 7340 { 7341 /* 7342 * We (re-)transmitted starting at rsm->r_start for some length 7343 * (possibly less than r_end). 7344 */ 7345 struct rack_sendmap *nrsm, *insret; 7346 uint32_t c_end; 7347 int32_t len; 7348 7349 len = *lenp; 7350 c_end = rsm->r_start + len; 7351 if (SEQ_GEQ(c_end, rsm->r_end)) { 7352 /* 7353 * We retransmitted the whole piece, or more than the whole, 7354 * slopping into the next rsm. 7355 */ 7356 rack_update_rsm(tp, rack, rsm, ts, add_flag); 7357 if (c_end == rsm->r_end) { 7358 *lenp = 0; 7359 return (0); 7360 } else { 7361 int32_t act_len; 7362 7363 /* Hangs over the end; return what's left */ 7364 act_len = rsm->r_end - rsm->r_start; 7365 *lenp = (len - act_len); 7366 return (rsm->r_end); 7367 } 7368 /* We don't get out of this block. */ 7369 } 7370 /* 7371 * Here we retransmitted less than the whole thing which means we 7372 * have to split this into what was transmitted and what was not. 7373 */ 7374 nrsm = rack_alloc_full_limit(rack); 7375 if (nrsm == NULL) { 7376 /* 7377 * We can't get memory, so let's not proceed. 7378 */ 7379 *lenp = 0; 7380 return (0); 7381 } 7382 /* 7383 * So here we are going to take the original rsm and make it what we 7384 * retransmitted. nrsm will be the tail portion we did not 7385 * retransmit. For example, say the chunk was 1, 11 (10 bytes). And 7386 * we retransmitted 5 bytes, i.e. 1, 5. The original piece shrinks to 7387 * 1, 6 and the new piece will be 6, 11.
7388 */ 7389 rack_clone_rsm(rack, nrsm, rsm, c_end); 7390 nrsm->r_dupack = 0; 7391 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); 7392 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 7393 #ifdef INVARIANTS 7394 if (insret != NULL) { 7395 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 7396 nrsm, insret, rack, rsm); 7397 } 7398 #endif 7399 if (rsm->r_in_tmap) { 7400 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 7401 nrsm->r_in_tmap = 1; 7402 } 7403 rsm->r_flags &= (~RACK_HAS_FIN); 7404 rack_update_rsm(tp, rack, rsm, ts, add_flag); 7405 /* Log a split of rsm into rsm and nrsm */ 7406 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); 7407 *lenp = 0; 7408 return (0); 7409 } 7410 7411 static void 7412 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 7413 uint32_t seq_out, uint8_t th_flags, int32_t err, uint64_t cts, 7414 struct rack_sendmap *hintrsm, uint16_t add_flag, struct mbuf *s_mb, uint32_t s_moff, int hw_tls) 7415 { 7416 struct tcp_rack *rack; 7417 struct rack_sendmap *rsm, *nrsm, *insret, fe; 7418 register uint32_t snd_max, snd_una; 7419 7420 /* 7421 * Add to the RACK log of packets in flight or retransmitted. If 7422 * there is a TS option we will use the TS echoed, if not we will 7423 * grab a TS. 7424 * 7425 * Retransmissions will increment the count and move the ts to its 7426 * proper place. Note that if options do not include TS's then we 7427 * won't be able to effectively use the ACK for an RTT on a retran. 7428 * 7429 * Notes about r_start and r_end. Lets consider a send starting at 7430 * sequence 1 for 10 bytes. In such an example the r_start would be 7431 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. 7432 * This means that r_end is actually the first sequence for the next 7433 * slot (11). 7434 * 7435 */ 7436 /* 7437 * If err is set what do we do XXXrrs? should we not add the thing? 7438 * -- i.e. return if err != 0 or should we pretend we sent it? -- 7439 * i.e. proceed with add ** do this for now. 7440 */ 7441 INP_WLOCK_ASSERT(tp->t_inpcb); 7442 if (err) 7443 /* 7444 * We don't log errors -- we could but snd_max does not 7445 * advance in this case either. 7446 */ 7447 return; 7448 7449 if (th_flags & TH_RST) { 7450 /* 7451 * We don't log resets and we return immediately from 7452 * sending 7453 */ 7454 return; 7455 } 7456 rack = (struct tcp_rack *)tp->t_fb_ptr; 7457 snd_una = tp->snd_una; 7458 snd_max = tp->snd_max; 7459 if (th_flags & (TH_SYN | TH_FIN)) { 7460 /* 7461 * The call to rack_log_output is made before bumping 7462 * snd_max. This means we can record one extra byte on a SYN 7463 * or FIN if seq_out is adding more on and a FIN is present 7464 * (and we are not resending). 7465 */ 7466 if ((th_flags & TH_SYN) && (seq_out == tp->iss)) 7467 len++; 7468 if (th_flags & TH_FIN) 7469 len++; 7470 if (SEQ_LT(snd_max, tp->snd_nxt)) { 7471 /* 7472 * The add/update as not been done for the FIN/SYN 7473 * yet. 7474 */ 7475 snd_max = tp->snd_nxt; 7476 } 7477 } 7478 if (SEQ_LEQ((seq_out + len), snd_una)) { 7479 /* Are sending an old segment to induce an ack (keep-alive)? */ 7480 return; 7481 } 7482 if (SEQ_LT(seq_out, snd_una)) { 7483 /* huh? should we panic? 
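 * For now we just clamp: advance seq_out to snd_una and trim len so only
 * the un-acked portion (if any) is accounted for.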
*/ 7484 uint32_t end; 7485 7486 end = seq_out + len; 7487 seq_out = snd_una; 7488 if (SEQ_GEQ(end, seq_out)) 7489 len = end - seq_out; 7490 else 7491 len = 0; 7492 } 7493 if (len == 0) { 7494 /* We don't log zero window probes */ 7495 return; 7496 } 7497 rack->r_ctl.rc_time_last_sent = cts; 7498 if (IN_FASTRECOVERY(tp->t_flags)) { 7499 rack->r_ctl.rc_prr_out += len; 7500 } 7501 /* First question is it a retransmission or new? */ 7502 if (seq_out == snd_max) { 7503 /* Its new */ 7504 again: 7505 rsm = rack_alloc(rack); 7506 if (rsm == NULL) { 7507 /* 7508 * Hmm out of memory and the tcb got destroyed while 7509 * we tried to wait. 7510 */ 7511 return; 7512 } 7513 if (th_flags & TH_FIN) { 7514 rsm->r_flags = RACK_HAS_FIN|add_flag; 7515 } else { 7516 rsm->r_flags = add_flag; 7517 } 7518 if (hw_tls) 7519 rsm->r_hw_tls = 1; 7520 rsm->r_tim_lastsent[0] = cts; 7521 rsm->r_rtr_cnt = 1; 7522 rsm->r_rtr_bytes = 0; 7523 if (th_flags & TH_SYN) { 7524 /* The data space is one beyond snd_una */ 7525 rsm->r_flags |= RACK_HAS_SYN; 7526 } 7527 rsm->r_start = seq_out; 7528 rsm->r_end = rsm->r_start + len; 7529 rsm->r_dupack = 0; 7530 /* 7531 * save off the mbuf location that 7532 * sndmbuf_noadv returned (which is 7533 * where we started copying from).. 7534 */ 7535 rsm->m = s_mb; 7536 rsm->soff = s_moff; 7537 /* 7538 * Here we do add in the len of send, since its not yet 7539 * reflected in in snduna <->snd_max 7540 */ 7541 rsm->r_fas = (ctf_flight_size(rack->rc_tp, 7542 rack->r_ctl.rc_sacked) + 7543 (rsm->r_end - rsm->r_start)); 7544 /* rsm->m will be NULL if RACK_HAS_SYN or RACK_HAS_FIN is set */ 7545 if (rsm->m) { 7546 if (rsm->m->m_len <= rsm->soff) { 7547 /* 7548 * XXXrrs Question, will this happen? 7549 * 7550 * If sbsndptr is set at the correct place 7551 * then s_moff should always be somewhere 7552 * within rsm->m. But if the sbsndptr was 7553 * off then that won't be true. If it occurs 7554 * we need to walkout to the correct location. 7555 */ 7556 struct mbuf *lm; 7557 7558 lm = rsm->m; 7559 while (lm->m_len <= rsm->soff) { 7560 rsm->soff -= lm->m_len; 7561 lm = lm->m_next; 7562 KASSERT(lm != NULL, ("%s rack:%p lm goes null orig_off:%u origmb:%p rsm->soff:%u", 7563 __func__, rack, s_moff, s_mb, rsm->soff)); 7564 } 7565 rsm->m = lm; 7566 counter_u64_add(rack_sbsndptr_wrong, 1); 7567 } else 7568 counter_u64_add(rack_sbsndptr_right, 1); 7569 rsm->orig_m_len = rsm->m->m_len; 7570 } else 7571 rsm->orig_m_len = 0; 7572 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7573 /* Log a new rsm */ 7574 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_NEW, 0, __LINE__); 7575 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7576 #ifdef INVARIANTS 7577 if (insret != NULL) { 7578 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 7579 nrsm, insret, rack, rsm); 7580 } 7581 #endif 7582 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7583 rsm->r_in_tmap = 1; 7584 /* 7585 * Special case detection, is there just a single 7586 * packet outstanding when we are not in recovery? 7587 * 7588 * If this is true mark it so. 7589 */ 7590 if ((IN_FASTRECOVERY(tp->t_flags) == 0) && 7591 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) == ctf_fixed_maxseg(tp))) { 7592 struct rack_sendmap *prsm; 7593 7594 prsm = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7595 if (prsm) 7596 prsm->r_one_out_nr = 1; 7597 } 7598 return; 7599 } 7600 /* 7601 * If we reach here its a retransmission and we need to find it. 
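 * We first try the caller's hint (hintrsm), then fall back to an RB_FIND
 * keyed on seq_out, splitting the front off an rsm whenever the
 * retransmission began in the middle of it.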
7602 */ 7603 memset(&fe, 0, sizeof(fe)); 7604 more: 7605 if (hintrsm && (hintrsm->r_start == seq_out)) { 7606 rsm = hintrsm; 7607 hintrsm = NULL; 7608 } else { 7609 /* No hints sorry */ 7610 rsm = NULL; 7611 } 7612 if ((rsm) && (rsm->r_start == seq_out)) { 7613 seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag); 7614 if (len == 0) { 7615 return; 7616 } else { 7617 goto more; 7618 } 7619 } 7620 /* Ok it was not the last pointer go through it the hard way. */ 7621 refind: 7622 fe.r_start = seq_out; 7623 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 7624 if (rsm) { 7625 if (rsm->r_start == seq_out) { 7626 seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag); 7627 if (len == 0) { 7628 return; 7629 } else { 7630 goto refind; 7631 } 7632 } 7633 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { 7634 /* Transmitted within this piece */ 7635 /* 7636 * Ok we must split off the front and then let the 7637 * update do the rest 7638 */ 7639 nrsm = rack_alloc_full_limit(rack); 7640 if (nrsm == NULL) { 7641 rack_update_rsm(tp, rack, rsm, cts, add_flag); 7642 return; 7643 } 7644 /* 7645 * copy rsm to nrsm and then trim the front of rsm 7646 * to not include this part. 7647 */ 7648 rack_clone_rsm(rack, nrsm, rsm, seq_out); 7649 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 7650 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); 7651 #ifdef INVARIANTS 7652 if (insret != NULL) { 7653 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 7654 nrsm, insret, rack, rsm); 7655 } 7656 #endif 7657 if (rsm->r_in_tmap) { 7658 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 7659 nrsm->r_in_tmap = 1; 7660 } 7661 rsm->r_flags &= (~RACK_HAS_FIN); 7662 seq_out = rack_update_entry(tp, rack, nrsm, cts, &len, add_flag); 7663 if (len == 0) { 7664 return; 7665 } else if (len > 0) 7666 goto refind; 7667 } 7668 } 7669 /* 7670 * Hmm not found in map did they retransmit both old and on into the 7671 * new? 7672 */ 7673 if (seq_out == tp->snd_max) { 7674 goto again; 7675 } else if (SEQ_LT(seq_out, tp->snd_max)) { 7676 #ifdef INVARIANTS 7677 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", 7678 seq_out, len, tp->snd_una, tp->snd_max); 7679 printf("Starting Dump of all rack entries\n"); 7680 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 7681 printf("rsm:%p start:%u end:%u\n", 7682 rsm, rsm->r_start, rsm->r_end); 7683 } 7684 printf("Dump complete\n"); 7685 panic("seq_out not found rack:%p tp:%p", 7686 rack, tp); 7687 #endif 7688 } else { 7689 #ifdef INVARIANTS 7690 /* 7691 * Hmm beyond sndmax? (only if we are using the new rtt-pack 7692 * flag) 7693 */ 7694 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", 7695 seq_out, len, tp->snd_max, tp); 7696 #endif 7697 } 7698 } 7699 7700 /* 7701 * Record one of the RTT updates from an ack into 7702 * our sample structure. 
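 * We keep the lowest, highest and running total of the RTTs seen in this
 * one ack, plus the best us_rtt and a confidence level, so the commit
 * routine can apply whichever variant the configured rate-sample method
 * asks for.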
7703 */ 7704 7705 static void 7706 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, uint32_t len, uint32_t us_rtt, 7707 int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt) 7708 { 7709 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 7710 (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) { 7711 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt; 7712 } 7713 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 7714 (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) { 7715 rack->r_ctl.rack_rs.rs_rtt_highest = rtt; 7716 } 7717 if (rack->rc_tp->t_flags & TF_GPUTINPROG) { 7718 if (us_rtt < rack->r_ctl.rc_gp_lowrtt) 7719 rack->r_ctl.rc_gp_lowrtt = us_rtt; 7720 if (rack->rc_tp->snd_wnd > rack->r_ctl.rc_gp_high_rwnd) 7721 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 7722 } 7723 if ((confidence == 1) && 7724 ((rsm == NULL) || 7725 (rsm->r_just_ret) || 7726 (rsm->r_one_out_nr && 7727 len < (ctf_fixed_maxseg(rack->rc_tp) * 2)))) { 7728 /* 7729 * If the rsm had a just-return 7730 * hit then we can't trust the 7731 * rtt measurement for buffer determination. 7732 * Note that a confidence of 2 indicates 7733 * SACK'd, which overrides the r_just_ret or 7734 * the r_one_out_nr. If it was a CUM-ACK and 7735 * we had only two outstanding but got an 7736 * ack for only 1, then that also lowers our 7737 * confidence. 7738 */ 7739 confidence = 0; 7740 } 7741 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 7742 (rack->r_ctl.rack_rs.rs_us_rtt > us_rtt)) { 7743 if (rack->r_ctl.rack_rs.confidence == 0) { 7744 /* 7745 * We take anything with no current confidence 7746 * saved. 7747 */ 7748 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; 7749 rack->r_ctl.rack_rs.confidence = confidence; 7750 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; 7751 } else if (confidence || rack->r_ctl.rack_rs.confidence) { 7752 /* 7753 * Once we have a confident number, 7754 * we can update it with a smaller 7755 * value since this confident number 7756 * may include the DSACK time until 7757 * the next segment (the second one) arrived. 7758 */ 7759 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; 7760 rack->r_ctl.rack_rs.confidence = confidence; 7761 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; 7762 } 7763 } 7764 rack_log_rtt_upd(rack->rc_tp, rack, us_rtt, len, rsm, confidence); 7765 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID; 7766 rack->r_ctl.rack_rs.rs_rtt_tot += rtt; 7767 rack->r_ctl.rack_rs.rs_rtt_cnt++; 7768 } 7769 7770 /* 7771 * Collect new round-trip time estimate 7772 * and update averages and current timeout.
7773 */ 7774 static void 7775 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) 7776 { 7777 int32_t delta; 7778 uint32_t o_srtt, o_var; 7779 int32_t hrtt_up = 0; 7780 int32_t rtt; 7781 7782 if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) 7783 /* No valid sample */ 7784 return; 7785 if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) { 7786 /* We are to use the lowest RTT seen in a single ack */ 7787 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; 7788 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) { 7789 /* We are to use the highest RTT seen in a single ack */ 7790 rtt = rack->r_ctl.rack_rs.rs_rtt_highest; 7791 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) { 7792 /* We are to use the average RTT seen in a single ack */ 7793 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot / 7794 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt); 7795 } else { 7796 #ifdef INVARIANTS 7797 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method); 7798 #endif 7799 return; 7800 } 7801 if (rtt == 0) 7802 rtt = 1; 7803 if (rack->rc_gp_rtt_set == 0) { 7804 /* 7805 * With no RTT we have to accept 7806 * even one we are not confident of. 7807 */ 7808 rack->r_ctl.rc_gp_srtt = rack->r_ctl.rack_rs.rs_us_rtt; 7809 rack->rc_gp_rtt_set = 1; 7810 } else if (rack->r_ctl.rack_rs.confidence) { 7811 /* update the running gp srtt */ 7812 rack->r_ctl.rc_gp_srtt -= (rack->r_ctl.rc_gp_srtt/8); 7813 rack->r_ctl.rc_gp_srtt += rack->r_ctl.rack_rs.rs_us_rtt / 8; 7814 } 7815 if (rack->r_ctl.rack_rs.confidence) { 7816 /* 7817 * record the low and high for highly buffered path computation, 7818 * we only do this if we are confident (not a retransmission). 7819 */ 7820 if (rack->r_ctl.rc_highest_us_rtt < rack->r_ctl.rack_rs.rs_us_rtt) { 7821 rack->r_ctl.rc_highest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 7822 hrtt_up = 1; 7823 } 7824 if (rack->rc_highly_buffered == 0) { 7825 /* 7826 * Currently once we declare a path has 7827 * highly buffered there is no going 7828 * back, which may be a problem... 7829 */ 7830 if ((rack->r_ctl.rc_highest_us_rtt / rack->r_ctl.rc_lowest_us_rtt) > rack_hbp_thresh) { 7831 rack_log_rtt_shrinks(rack, rack->r_ctl.rack_rs.rs_us_rtt, 7832 rack->r_ctl.rc_highest_us_rtt, 7833 rack->r_ctl.rc_lowest_us_rtt, 7834 RACK_RTTS_SEEHBP); 7835 rack->rc_highly_buffered = 1; 7836 } 7837 } 7838 } 7839 if ((rack->r_ctl.rack_rs.confidence) || 7840 (rack->r_ctl.rack_rs.rs_us_rtrcnt == 1)) { 7841 /* 7842 * If we are highly confident of it <or> it was 7843 * never retransmitted we accept it as the last us_rtt. 7844 */ 7845 rack->r_ctl.rc_last_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 7846 /* The lowest rtt can be set if its was not retransmited */ 7847 if (rack->r_ctl.rc_lowest_us_rtt > rack->r_ctl.rack_rs.rs_us_rtt) { 7848 rack->r_ctl.rc_lowest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 7849 if (rack->r_ctl.rc_lowest_us_rtt == 0) 7850 rack->r_ctl.rc_lowest_us_rtt = 1; 7851 } 7852 } 7853 o_srtt = tp->t_srtt; 7854 o_var = tp->t_rttvar; 7855 rack = (struct tcp_rack *)tp->t_fb_ptr; 7856 if (tp->t_srtt != 0) { 7857 /* 7858 * We keep a simple srtt in microseconds, like our rtt 7859 * measurement. We don't need to do any tricks with shifting 7860 * etc. Instead we just add in 1/8th of the new measurement 7861 * and subtract out 1/8 of the old srtt. We do the same with 7862 * the variance after finding the absolute value of the 7863 * difference between this sample and the current srtt. 
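 * For example, with a current srtt of 40000 usec and a new 48000 usec
 * sample, srtt becomes 40000 - (40000 >> 3) + (48000 >> 3) = 41000 usec,
 * while |delta| = 8000 usec adds (8000 >> 3) = 1000 usec to rttvar after
 * rttvar's own 1/8th decay.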
7864 */ 7865 delta = tp->t_srtt - rtt; 7866 /* Take off 1/8th of the current sRTT */ 7867 tp->t_srtt -= (tp->t_srtt >> 3); 7868 /* Add in 1/8th of the new RTT just measured */ 7869 tp->t_srtt += (rtt >> 3); 7870 if (tp->t_srtt <= 0) 7871 tp->t_srtt = 1; 7872 /* Now let's take the absolute value for the variance */ 7873 if (delta < 0) 7874 delta = -delta; 7875 /* Subtract out 1/8th */ 7876 tp->t_rttvar -= (tp->t_rttvar >> 3); 7877 /* Add in 1/8th of the new variance we just saw */ 7878 tp->t_rttvar += (delta >> 3); 7879 if (tp->t_rttvar <= 0) 7880 tp->t_rttvar = 1; 7881 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) 7882 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 7883 } else { 7884 /* 7885 * No rtt measurement yet - use the unsmoothed rtt. Set the 7886 * variance to half the rtt (so our first retransmit happens 7887 * at 3*rtt). 7888 */ 7889 tp->t_srtt = rtt; 7890 tp->t_rttvar = rtt >> 1; 7891 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 7892 } 7893 rack->rc_srtt_measure_made = 1; 7894 KMOD_TCPSTAT_INC(tcps_rttupdated); 7895 tp->t_rttupdated++; 7896 #ifdef STATS 7897 if (rack_stats_gets_ms_rtt == 0) { 7898 /* Send in the microsecond rtt used for rxt timeout purposes */ 7899 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt)); 7900 } else if (rack_stats_gets_ms_rtt == 1) { 7901 /* Send in the millisecond rtt used for rxt timeout purposes */ 7902 int32_t ms_rtt; 7903 7904 /* Round up */ 7905 ms_rtt = (rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC; 7906 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt)); 7907 } else if (rack_stats_gets_ms_rtt == 2) { 7908 /* Send in the millisecond rtt as close to the path RTT as we can get */ 7909 int32_t ms_rtt; 7910 7911 /* Round up */ 7912 ms_rtt = (rack->r_ctl.rack_rs.rs_us_rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC; 7913 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt)); 7914 } else { 7915 /* Send in the microsecond rtt as close to the path RTT as we can get */ 7916 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt)); 7917 } 7918 7919 #endif 7920 /* 7921 * The retransmit should happen at rtt + 4 * rttvar. Because of the 7922 * way we do the smoothing, srtt and rttvar will each average +1/2 7923 * tick of bias. When we compute the retransmit timer, we want 1/2 7924 * tick of rounding and 1 extra tick because of +-1/2 tick 7925 * uncertainty in the firing of the timer. The bias will give us 7926 * exactly the 1.5 tick we need. But, because the bias is 7927 * statistical, we have to test that we don't drop below the minimum 7928 * feasible timer (which is 2 ticks). 7929 */ 7930 tp->t_rxtshift = 0; 7931 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 7932 max(rack_rto_min, rtt + 2), rack_rto_max, rack->r_ctl.timer_slop); 7933 rack_log_rtt_sample(rack, rtt); 7934 tp->t_softerror = 0; 7935 } 7936 7937 7938 static void 7939 rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts) 7940 { 7941 /* 7942 * Apply the inbound us-rtt at us_cts to the filter.
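 * A new low that beats the old filtered minimum by more than
 * rack_min_rtt_movement may also pull us into probe-rtt early, on the
 * theory that a peer flow has just entered probe-rtt itself.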
7943 */ 7944 uint32_t old_rtt; 7945 7946 old_rtt = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 7947 apply_filter_min_small(&rack->r_ctl.rc_gp_min_rtt, 7948 us_rtt, us_cts); 7949 if (rack->r_ctl.last_pacing_time && 7950 rack->rc_gp_dyn_mul && 7951 (rack->r_ctl.last_pacing_time > us_rtt)) 7952 rack->pacing_longer_than_rtt = 1; 7953 else 7954 rack->pacing_longer_than_rtt = 0; 7955 if (old_rtt > us_rtt) { 7956 /* We just hit a new lower rtt time */ 7957 rack_log_rtt_shrinks(rack, us_cts, old_rtt, 7958 __LINE__, RACK_RTTS_NEWRTT); 7959 /* 7960 * Only count it if its lower than what we saw within our 7961 * calculated range. 7962 */ 7963 if ((old_rtt - us_rtt) > rack_min_rtt_movement) { 7964 if (rack_probertt_lower_within && 7965 rack->rc_gp_dyn_mul && 7966 (rack->use_fixed_rate == 0) && 7967 (rack->rc_always_pace)) { 7968 /* 7969 * We are seeing a new lower rtt very close 7970 * to the time that we would have entered probe-rtt. 7971 * This is probably due to the fact that a peer flow 7972 * has entered probe-rtt. Lets go in now too. 7973 */ 7974 uint32_t val; 7975 7976 val = rack_probertt_lower_within * rack_time_between_probertt; 7977 val /= 100; 7978 if ((rack->in_probe_rtt == 0) && 7979 ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) { 7980 rack_enter_probertt(rack, us_cts); 7981 } 7982 } 7983 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 7984 } 7985 } 7986 } 7987 7988 static int 7989 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 7990 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack) 7991 { 7992 uint32_t us_rtt; 7993 int32_t i, all; 7994 uint32_t t, len_acked; 7995 7996 if ((rsm->r_flags & RACK_ACKED) || 7997 (rsm->r_flags & RACK_WAS_ACKED)) 7998 /* Already done */ 7999 return (0); 8000 if (rsm->r_no_rtt_allowed) { 8001 /* Not allowed */ 8002 return (0); 8003 } 8004 if (ack_type == CUM_ACKED) { 8005 if (SEQ_GT(th_ack, rsm->r_end)) { 8006 len_acked = rsm->r_end - rsm->r_start; 8007 all = 1; 8008 } else { 8009 len_acked = th_ack - rsm->r_start; 8010 all = 0; 8011 } 8012 } else { 8013 len_acked = rsm->r_end - rsm->r_start; 8014 all = 0; 8015 } 8016 if (rsm->r_rtr_cnt == 1) { 8017 8018 t = cts - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 8019 if ((int)t <= 0) 8020 t = 1; 8021 if (!tp->t_rttlow || tp->t_rttlow > t) 8022 tp->t_rttlow = t; 8023 if (!rack->r_ctl.rc_rack_min_rtt || 8024 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 8025 rack->r_ctl.rc_rack_min_rtt = t; 8026 if (rack->r_ctl.rc_rack_min_rtt == 0) { 8027 rack->r_ctl.rc_rack_min_rtt = 1; 8028 } 8029 } 8030 if (TSTMP_GT(tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])) 8031 us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 8032 else 8033 us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 8034 if (us_rtt == 0) 8035 us_rtt = 1; 8036 if (CC_ALGO(tp)->rttsample != NULL) { 8037 /* Kick the RTT to the CC */ 8038 CC_ALGO(tp)->rttsample(tp->ccv, us_rtt, 1, rsm->r_fas); 8039 } 8040 rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time)); 8041 if (ack_type == SACKED) { 8042 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 1); 8043 tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 2 , rsm, rsm->r_rtr_cnt); 8044 } else { 8045 /* 8046 * We need to setup what our confidence 8047 * is in this ack. 
8048 * 8049 * If the rsm was app limited and it is 8050 * less than a mss in length (the end 8051 * of the send) then we have a gap. If we 8052 * were app limited but say we were sending 8053 * multiple MSS's then we are more confident 8054 * int it. 8055 * 8056 * When we are not app-limited then we see if 8057 * the rsm is being included in the current 8058 * measurement, we tell this by the app_limited_needs_set 8059 * flag. 8060 * 8061 * Note that being cwnd blocked is not applimited 8062 * as well as the pacing delay between packets which 8063 * are sending only 1 or 2 MSS's also will show up 8064 * in the RTT. We probably need to examine this algorithm 8065 * a bit more and enhance it to account for the delay 8066 * between rsm's. We could do that by saving off the 8067 * pacing delay of each rsm (in an rsm) and then 8068 * factoring that in somehow though for now I am 8069 * not sure how :) 8070 */ 8071 int calc_conf = 0; 8072 8073 if (rsm->r_flags & RACK_APP_LIMITED) { 8074 if (all && (len_acked <= ctf_fixed_maxseg(tp))) 8075 calc_conf = 0; 8076 else 8077 calc_conf = 1; 8078 } else if (rack->app_limited_needs_set == 0) { 8079 calc_conf = 1; 8080 } else { 8081 calc_conf = 0; 8082 } 8083 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 2); 8084 tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 8085 calc_conf, rsm, rsm->r_rtr_cnt); 8086 } 8087 if ((rsm->r_flags & RACK_TLP) && 8088 (!IN_FASTRECOVERY(tp->t_flags))) { 8089 /* Segment was a TLP and our retrans matched */ 8090 if (rack->r_ctl.rc_tlp_cwnd_reduce) { 8091 rack->r_ctl.rc_rsm_start = tp->snd_max; 8092 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 8093 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 8094 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una); 8095 } 8096 } 8097 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 8098 /* New more recent rack_tmit_time */ 8099 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 8100 rack->rc_rack_rtt = t; 8101 } 8102 return (1); 8103 } 8104 /* 8105 * We clear the soft/rxtshift since we got an ack. 8106 * There is no assurance we will call the commit() function 8107 * so we need to clear these to avoid incorrect handling. 8108 */ 8109 tp->t_rxtshift = 0; 8110 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 8111 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 8112 tp->t_softerror = 0; 8113 if (to && (to->to_flags & TOF_TS) && 8114 (ack_type == CUM_ACKED) && 8115 (to->to_tsecr) && 8116 ((rsm->r_flags & RACK_OVERMAX) == 0)) { 8117 /* 8118 * Now which timestamp does it match? In this block the ACK 8119 * must be coming from a previous transmission. 8120 */ 8121 for (i = 0; i < rsm->r_rtr_cnt; i++) { 8122 if (rack_ts_to_msec(rsm->r_tim_lastsent[i]) == to->to_tsecr) { 8123 t = cts - (uint32_t)rsm->r_tim_lastsent[i]; 8124 if ((int)t <= 0) 8125 t = 1; 8126 if (CC_ALGO(tp)->rttsample != NULL) { 8127 /* 8128 * Kick the RTT to the CC, here 8129 * we lie a bit in that we know the 8130 * retransmission is correct even though 8131 * we retransmitted. This is because 8132 * we match the timestamps. 
8133 */ 8134 if (TSTMP_GT(tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[i])) 8135 us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[i]; 8136 else 8137 us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[i]; 8138 CC_ALGO(tp)->rttsample(tp->ccv, us_rtt, 1, rsm->r_fas); 8139 } 8140 if ((i + 1) < rsm->r_rtr_cnt) { 8141 /* 8142 * The peer ack'd from our previous 8143 * transmission. We have a spurious 8144 * retransmission and thus we dont 8145 * want to update our rack_rtt. 8146 * 8147 * Hmm should there be a CC revert here? 8148 * 8149 */ 8150 return (0); 8151 } 8152 if (!tp->t_rttlow || tp->t_rttlow > t) 8153 tp->t_rttlow = t; 8154 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 8155 rack->r_ctl.rc_rack_min_rtt = t; 8156 if (rack->r_ctl.rc_rack_min_rtt == 0) { 8157 rack->r_ctl.rc_rack_min_rtt = 1; 8158 } 8159 } 8160 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 8161 (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 8162 /* New more recent rack_tmit_time */ 8163 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 8164 rack->rc_rack_rtt = t; 8165 } 8166 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[i], cts, 3); 8167 tcp_rack_xmit_timer(rack, t + 1, len_acked, t, 0, rsm, 8168 rsm->r_rtr_cnt); 8169 return (1); 8170 } 8171 } 8172 goto ts_not_found; 8173 } else { 8174 /* 8175 * Ok its a SACK block that we retransmitted. or a windows 8176 * machine without timestamps. We can tell nothing from the 8177 * time-stamp since its not there or the time the peer last 8178 * recieved a segment that moved forward its cum-ack point. 8179 */ 8180 ts_not_found: 8181 i = rsm->r_rtr_cnt - 1; 8182 t = cts - (uint32_t)rsm->r_tim_lastsent[i]; 8183 if ((int)t <= 0) 8184 t = 1; 8185 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 8186 /* 8187 * We retransmitted and the ack came back in less 8188 * than the smallest rtt we have observed. We most 8189 * likely did an improper retransmit as outlined in 8190 * 6.2 Step 2 point 2 in the rack-draft so we 8191 * don't want to update our rack_rtt. We in 8192 * theory (in future) might want to think about reverting our 8193 * cwnd state but we won't for now. 8194 */ 8195 return (0); 8196 } else if (rack->r_ctl.rc_rack_min_rtt) { 8197 /* 8198 * We retransmitted it and the retransmit did the 8199 * job. 8200 */ 8201 if (!rack->r_ctl.rc_rack_min_rtt || 8202 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 8203 rack->r_ctl.rc_rack_min_rtt = t; 8204 if (rack->r_ctl.rc_rack_min_rtt == 0) { 8205 rack->r_ctl.rc_rack_min_rtt = 1; 8206 } 8207 } 8208 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, (uint32_t)rsm->r_tim_lastsent[i])) { 8209 /* New more recent rack_tmit_time */ 8210 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[i]; 8211 rack->rc_rack_rtt = t; 8212 } 8213 return (1); 8214 } 8215 } 8216 return (0); 8217 } 8218 8219 /* 8220 * Mark the SACK_PASSED flag on all entries prior to rsm send wise. 
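 * Sketch of the walk below (illustrative): iterate the tmap in reverse
 * send order starting at rsm, skip rsm itself and anything already
 * RACK_ACKED, and set RACK_SACK_PASSED on each earlier entry; stop at
 * the first entry that already carries the flag, since everything
 * sent before it was marked on a previous pass.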
8221 */ 8222 static void 8223 rack_log_sack_passed(struct tcpcb *tp, 8224 struct tcp_rack *rack, struct rack_sendmap *rsm) 8225 { 8226 struct rack_sendmap *nrsm; 8227 8228 nrsm = rsm; 8229 TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap, 8230 rack_head, r_tnext) { 8231 if (nrsm == rsm) { 8232 /* Skip orginal segment he is acked */ 8233 continue; 8234 } 8235 if (nrsm->r_flags & RACK_ACKED) { 8236 /* 8237 * Skip ack'd segments, though we 8238 * should not see these, since tmap 8239 * should not have ack'd segments. 8240 */ 8241 continue; 8242 } 8243 if (nrsm->r_flags & RACK_SACK_PASSED) { 8244 /* 8245 * We found one that is already marked 8246 * passed, we have been here before and 8247 * so all others below this are marked. 8248 */ 8249 break; 8250 } 8251 nrsm->r_flags |= RACK_SACK_PASSED; 8252 nrsm->r_flags &= ~RACK_WAS_SACKPASS; 8253 } 8254 } 8255 8256 static void 8257 rack_need_set_test(struct tcpcb *tp, 8258 struct tcp_rack *rack, 8259 struct rack_sendmap *rsm, 8260 tcp_seq th_ack, 8261 int line, 8262 int use_which) 8263 { 8264 8265 if ((tp->t_flags & TF_GPUTINPROG) && 8266 SEQ_GEQ(rsm->r_end, tp->gput_seq)) { 8267 /* 8268 * We were app limited, and this ack 8269 * butts up or goes beyond the point where we want 8270 * to start our next measurement. We need 8271 * to record the new gput_ts as here and 8272 * possibly update the start sequence. 8273 */ 8274 uint32_t seq, ts; 8275 8276 if (rsm->r_rtr_cnt > 1) { 8277 /* 8278 * This is a retransmit, can we 8279 * really make any assessment at this 8280 * point? We are not really sure of 8281 * the timestamp, is it this or the 8282 * previous transmission? 8283 * 8284 * Lets wait for something better that 8285 * is not retransmitted. 8286 */ 8287 return; 8288 } 8289 seq = tp->gput_seq; 8290 ts = tp->gput_ts; 8291 rack->app_limited_needs_set = 0; 8292 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 8293 /* Do we start at a new end? */ 8294 if ((use_which == RACK_USE_BEG) && 8295 SEQ_GEQ(rsm->r_start, tp->gput_seq)) { 8296 /* 8297 * When we get an ACK that just eats 8298 * up some of the rsm, we set RACK_USE_BEG 8299 * since whats at r_start (i.e. th_ack) 8300 * is left unacked and thats where the 8301 * measurement not starts. 8302 */ 8303 tp->gput_seq = rsm->r_start; 8304 rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 8305 } 8306 if ((use_which == RACK_USE_END) && 8307 SEQ_GEQ(rsm->r_end, tp->gput_seq)) { 8308 /* 8309 * We use the end when the cumack 8310 * is moving forward and completely 8311 * deleting the rsm passed so basically 8312 * r_end holds th_ack. 8313 * 8314 * For SACK's we also want to use the end 8315 * since this piece just got sacked and 8316 * we want to target anything after that 8317 * in our measurement. 8318 */ 8319 tp->gput_seq = rsm->r_end; 8320 rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 8321 } 8322 if (use_which == RACK_USE_END_OR_THACK) { 8323 /* 8324 * special case for ack moving forward, 8325 * not a sack, we need to move all the 8326 * way up to where this ack cum-ack moves 8327 * to. 8328 */ 8329 if (SEQ_GT(th_ack, rsm->r_end)) 8330 tp->gput_seq = th_ack; 8331 else 8332 tp->gput_seq = rsm->r_end; 8333 rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 8334 } 8335 if (SEQ_GT(tp->gput_seq, tp->gput_ack)) { 8336 /* 8337 * We moved beyond this guy's range, re-calculate 8338 * the new end point. 
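 * Illustrative note: the recalculated end becomes gput_seq plus either
 * the initial-window floor, max(rc_init_window(rack), MIN_GP_WIN *
 * maxseg), while no goodput estimate has been filled yet, or the
 * normal rack_get_measure_window() target once one has.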
8339 */ 8340 if (rack->rc_gp_filled == 0) { 8341 tp->gput_ack = tp->gput_seq + max(rc_init_window(rack), (MIN_GP_WIN * ctf_fixed_maxseg(tp))); 8342 } else { 8343 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 8344 } 8345 } 8346 /* 8347 * We are moving the goal post, we may be able to clear the 8348 * measure_saw_probe_rtt flag. 8349 */ 8350 if ((rack->in_probe_rtt == 0) && 8351 (rack->measure_saw_probe_rtt) && 8352 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 8353 rack->measure_saw_probe_rtt = 0; 8354 rack_log_pacing_delay_calc(rack, ts, tp->gput_ts, 8355 seq, tp->gput_seq, 0, 5, line, NULL, 0); 8356 if (rack->rc_gp_filled && 8357 ((tp->gput_ack - tp->gput_seq) < 8358 max(rc_init_window(rack), (MIN_GP_WIN * 8359 ctf_fixed_maxseg(tp))))) { 8360 uint32_t ideal_amount; 8361 8362 ideal_amount = rack_get_measure_window(tp, rack); 8363 if (ideal_amount > sbavail(&tp->t_inpcb->inp_socket->so_snd)) { 8364 /* 8365 * There is no sense of continuing this measurement 8366 * because its too small to gain us anything we 8367 * trust. Skip it and that way we can start a new 8368 * measurement quicker. 8369 */ 8370 tp->t_flags &= ~TF_GPUTINPROG; 8371 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq, 8372 0, 0, 0, 6, __LINE__, NULL, 0); 8373 } else { 8374 /* 8375 * Reset the window further out. 8376 */ 8377 tp->gput_ack = tp->gput_seq + ideal_amount; 8378 } 8379 } 8380 } 8381 } 8382 8383 static inline int 8384 is_rsm_inside_declared_tlp_block(struct tcp_rack *rack, struct rack_sendmap *rsm) 8385 { 8386 if (SEQ_LT(rsm->r_end, rack->r_ctl.last_tlp_acked_start)) { 8387 /* Behind our TLP definition or right at */ 8388 return (0); 8389 } 8390 if (SEQ_GT(rsm->r_start, rack->r_ctl.last_tlp_acked_end)) { 8391 /* The start is beyond or right at our end of TLP definition */ 8392 return (0); 8393 } 8394 /* It has to be a sub-part of the original TLP recorded */ 8395 return (1); 8396 } 8397 8398 8399 static uint32_t 8400 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, 8401 struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, int *moved_two) 8402 { 8403 uint32_t start, end, changed = 0; 8404 struct rack_sendmap stack_map; 8405 struct rack_sendmap *rsm, *nrsm, fe, *insret, *prev, *next; 8406 int32_t used_ref = 1; 8407 int moved = 0; 8408 8409 start = sack->start; 8410 end = sack->end; 8411 rsm = *prsm; 8412 memset(&fe, 0, sizeof(fe)); 8413 do_rest_ofb: 8414 if ((rsm == NULL) || 8415 (SEQ_LT(end, rsm->r_start)) || 8416 (SEQ_GEQ(start, rsm->r_end)) || 8417 (SEQ_LT(start, rsm->r_start))) { 8418 /* 8419 * We are not in the right spot, 8420 * find the correct spot in the tree. 8421 */ 8422 used_ref = 0; 8423 fe.r_start = start; 8424 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 8425 moved++; 8426 } 8427 if (rsm == NULL) { 8428 /* TSNH */ 8429 goto out; 8430 } 8431 /* Ok we have an ACK for some piece of this rsm */ 8432 if (rsm->r_start != start) { 8433 if ((rsm->r_flags & RACK_ACKED) == 0) { 8434 /* 8435 * Before any splitting or hookery is 8436 * done is it a TLP of interest i.e. rxt? 8437 */ 8438 if ((rsm->r_flags & RACK_TLP) && 8439 (rsm->r_rtr_cnt > 1)) { 8440 /* 8441 * We are splitting a rxt TLP, check 8442 * if we need to save off the start/end 8443 */ 8444 if (rack->rc_last_tlp_acked_set && 8445 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 8446 /* 8447 * We already turned this on since we are inside 8448 * the previous one was a partially sack now we 8449 * are getting another one (maybe all of it). 
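 * In other words (illustrative): the previously recorded TLP block is
 * being SACKed again, so all we do below is widen the saved
 * last_tlp_acked_start/last_tlp_acked_end range if this rsm extends
 * past either edge of it.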
8450 * 8451 */ 8452 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 8453 /* 8454 * Lets make sure we have all of it though. 8455 */ 8456 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 8457 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 8458 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 8459 rack->r_ctl.last_tlp_acked_end); 8460 } 8461 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 8462 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 8463 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 8464 rack->r_ctl.last_tlp_acked_end); 8465 } 8466 } else { 8467 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 8468 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 8469 rack->rc_last_tlp_past_cumack = 0; 8470 rack->rc_last_tlp_acked_set = 1; 8471 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 8472 } 8473 } 8474 /** 8475 * Need to split this in two pieces the before and after, 8476 * the before remains in the map, the after must be 8477 * added. In other words we have: 8478 * rsm |--------------| 8479 * sackblk |-------> 8480 * rsm will become 8481 * rsm |---| 8482 * and nrsm will be the sacked piece 8483 * nrsm |----------| 8484 * 8485 * But before we start down that path lets 8486 * see if the sack spans over on top of 8487 * the next guy and it is already sacked. 8488 * 8489 */ 8490 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8491 if (next && (next->r_flags & RACK_ACKED) && 8492 SEQ_GEQ(end, next->r_start)) { 8493 /** 8494 * So the next one is already acked, and 8495 * we can thus by hookery use our stack_map 8496 * to reflect the piece being sacked and 8497 * then adjust the two tree entries moving 8498 * the start and ends around. So we start like: 8499 * rsm |------------| (not-acked) 8500 * next |-----------| (acked) 8501 * sackblk |--------> 8502 * We want to end like so: 8503 * rsm |------| (not-acked) 8504 * next |-----------------| (acked) 8505 * nrsm |-----| 8506 * Where nrsm is a temporary stack piece we 8507 * use to update all the gizmos. 8508 */ 8509 /* Copy up our fudge block */ 8510 nrsm = &stack_map; 8511 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 8512 /* Now adjust our tree blocks */ 8513 rsm->r_end = start; 8514 next->r_start = start; 8515 /* Now we must adjust back where next->m is */ 8516 rack_setup_offset_for_rsm(rsm, next); 8517 8518 /* We don't need to adjust rsm, it did not change */ 8519 /* Clear out the dup ack count of the remainder */ 8520 rsm->r_dupack = 0; 8521 rsm->r_just_ret = 0; 8522 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 8523 /* Now lets make sure our fudge block is right */ 8524 nrsm->r_start = start; 8525 /* Now lets update all the stats and such */ 8526 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); 8527 if (rack->app_limited_needs_set) 8528 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); 8529 changed += (nrsm->r_end - nrsm->r_start); 8530 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 8531 if (nrsm->r_flags & RACK_SACK_PASSED) { 8532 counter_u64_add(rack_reorder_seen, 1); 8533 rack->r_ctl.rc_reorder_ts = cts; 8534 } 8535 /* 8536 * Now we want to go up from rsm (the 8537 * one left un-acked) to the next one 8538 * in the tmap. We do this so when 8539 * we walk backwards we include marking 8540 * sack-passed on rsm (The one passed in 8541 * is skipped since it is generally called 8542 * on something sacked before removing it 8543 * from the tmap). 
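 * Put differently (illustrative): rack_log_sack_passed() always skips
 * the rsm it is handed, so by handing it the next tmap entry the
 * backwards walk naturally covers the un-acked rsm we just shrank as
 * well as everything sent before it.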
8544 */ 8545 if (rsm->r_in_tmap) { 8546 nrsm = TAILQ_NEXT(rsm, r_tnext); 8547 /* 8548 * Now that we have the next 8549 * one walk backwards from there. 8550 */ 8551 if (nrsm && nrsm->r_in_tmap) 8552 rack_log_sack_passed(tp, rack, nrsm); 8553 } 8554 /* Now are we done? */ 8555 if (SEQ_LT(end, next->r_end) || 8556 (end == next->r_end)) { 8557 /* Done with block */ 8558 goto out; 8559 } 8560 rack_log_map_chg(tp, rack, &stack_map, rsm, next, MAP_SACK_M1, end, __LINE__); 8561 counter_u64_add(rack_sack_used_next_merge, 1); 8562 /* Postion for the next block */ 8563 start = next->r_end; 8564 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, next); 8565 if (rsm == NULL) 8566 goto out; 8567 } else { 8568 /** 8569 * We can't use any hookery here, so we 8570 * need to split the map. We enter like 8571 * so: 8572 * rsm |--------| 8573 * sackblk |-----> 8574 * We will add the new block nrsm and 8575 * that will be the new portion, and then 8576 * fall through after reseting rsm. So we 8577 * split and look like this: 8578 * rsm |----| 8579 * sackblk |-----> 8580 * nrsm |---| 8581 * We then fall through reseting 8582 * rsm to nrsm, so the next block 8583 * picks it up. 8584 */ 8585 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 8586 if (nrsm == NULL) { 8587 /* 8588 * failed XXXrrs what can we do but loose the sack 8589 * info? 8590 */ 8591 goto out; 8592 } 8593 counter_u64_add(rack_sack_splits, 1); 8594 rack_clone_rsm(rack, nrsm, rsm, start); 8595 rsm->r_just_ret = 0; 8596 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 8597 #ifdef INVARIANTS 8598 if (insret != NULL) { 8599 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 8600 nrsm, insret, rack, rsm); 8601 } 8602 #endif 8603 if (rsm->r_in_tmap) { 8604 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 8605 nrsm->r_in_tmap = 1; 8606 } 8607 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M2, end, __LINE__); 8608 rsm->r_flags &= (~RACK_HAS_FIN); 8609 /* Position us to point to the new nrsm that starts the sack blk */ 8610 rsm = nrsm; 8611 } 8612 } else { 8613 /* Already sacked this piece */ 8614 counter_u64_add(rack_sack_skipped_acked, 1); 8615 moved++; 8616 if (end == rsm->r_end) { 8617 /* Done with block */ 8618 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8619 goto out; 8620 } else if (SEQ_LT(end, rsm->r_end)) { 8621 /* A partial sack to a already sacked block */ 8622 moved++; 8623 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8624 goto out; 8625 } else { 8626 /* 8627 * The end goes beyond this guy 8628 * repostion the start to the 8629 * next block. 8630 */ 8631 start = rsm->r_end; 8632 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8633 if (rsm == NULL) 8634 goto out; 8635 } 8636 } 8637 } 8638 if (SEQ_GEQ(end, rsm->r_end)) { 8639 /** 8640 * The end of this block is either beyond this guy or right 8641 * at this guy. I.e.: 8642 * rsm --- |-----| 8643 * end |-----| 8644 * <or> 8645 * end |---------| 8646 */ 8647 if ((rsm->r_flags & RACK_ACKED) == 0) { 8648 /* 8649 * Is it a TLP of interest? 8650 */ 8651 if ((rsm->r_flags & RACK_TLP) && 8652 (rsm->r_rtr_cnt > 1)) { 8653 /* 8654 * We are splitting a rxt TLP, check 8655 * if we need to save off the start/end 8656 */ 8657 if (rack->rc_last_tlp_acked_set && 8658 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 8659 /* 8660 * We already turned this on since we are inside 8661 * the previous one was a partially sack now we 8662 * are getting another one (maybe all of it). 
8663 */ 8664 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 8665 /* 8666 * Lets make sure we have all of it though. 8667 */ 8668 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 8669 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 8670 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 8671 rack->r_ctl.last_tlp_acked_end); 8672 } 8673 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 8674 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 8675 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 8676 rack->r_ctl.last_tlp_acked_end); 8677 } 8678 } else { 8679 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 8680 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 8681 rack->rc_last_tlp_past_cumack = 0; 8682 rack->rc_last_tlp_acked_set = 1; 8683 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 8684 } 8685 } 8686 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); 8687 changed += (rsm->r_end - rsm->r_start); 8688 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 8689 if (rsm->r_in_tmap) /* should be true */ 8690 rack_log_sack_passed(tp, rack, rsm); 8691 /* Is Reordering occuring? */ 8692 if (rsm->r_flags & RACK_SACK_PASSED) { 8693 rsm->r_flags &= ~RACK_SACK_PASSED; 8694 counter_u64_add(rack_reorder_seen, 1); 8695 rack->r_ctl.rc_reorder_ts = cts; 8696 } 8697 if (rack->app_limited_needs_set) 8698 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); 8699 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 8700 rsm->r_flags |= RACK_ACKED; 8701 if (rsm->r_in_tmap) { 8702 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 8703 rsm->r_in_tmap = 0; 8704 } 8705 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_SACK_M3, end, __LINE__); 8706 } else { 8707 counter_u64_add(rack_sack_skipped_acked, 1); 8708 moved++; 8709 } 8710 if (end == rsm->r_end) { 8711 /* This block only - done, setup for next */ 8712 goto out; 8713 } 8714 /* 8715 * There is more not coverend by this rsm move on 8716 * to the next block in the RB tree. 8717 */ 8718 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8719 start = rsm->r_end; 8720 rsm = nrsm; 8721 if (rsm == NULL) 8722 goto out; 8723 goto do_rest_ofb; 8724 } 8725 /** 8726 * The end of this sack block is smaller than 8727 * our rsm i.e.: 8728 * rsm --- |-----| 8729 * end |--| 8730 */ 8731 if ((rsm->r_flags & RACK_ACKED) == 0) { 8732 /* 8733 * Is it a TLP of interest? 8734 */ 8735 if ((rsm->r_flags & RACK_TLP) && 8736 (rsm->r_rtr_cnt > 1)) { 8737 /* 8738 * We are splitting a rxt TLP, check 8739 * if we need to save off the start/end 8740 */ 8741 if (rack->rc_last_tlp_acked_set && 8742 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 8743 /* 8744 * We already turned this on since we are inside 8745 * the previous one was a partially sack now we 8746 * are getting another one (maybe all of it). 8747 */ 8748 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 8749 /* 8750 * Lets make sure we have all of it though. 
8751 */ 8752 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 8753 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 8754 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 8755 rack->r_ctl.last_tlp_acked_end); 8756 } 8757 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 8758 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 8759 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 8760 rack->r_ctl.last_tlp_acked_end); 8761 } 8762 } else { 8763 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 8764 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 8765 rack->rc_last_tlp_past_cumack = 0; 8766 rack->rc_last_tlp_acked_set = 1; 8767 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 8768 } 8769 } 8770 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8771 if (prev && 8772 (prev->r_flags & RACK_ACKED)) { 8773 /** 8774 * Goal, we want the right remainder of rsm to shrink 8775 * in place and span from (rsm->r_start = end) to rsm->r_end. 8776 * We want to expand prev to go all the way 8777 * to prev->r_end <- end. 8778 * so in the tree we have before: 8779 * prev |--------| (acked) 8780 * rsm |-------| (non-acked) 8781 * sackblk |-| 8782 * We churn it so we end up with 8783 * prev |----------| (acked) 8784 * rsm |-----| (non-acked) 8785 * nrsm |-| (temporary) 8786 * 8787 * Note if either prev/rsm is a TLP we don't 8788 * do this. 8789 */ 8790 nrsm = &stack_map; 8791 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 8792 prev->r_end = end; 8793 rsm->r_start = end; 8794 /* Now adjust nrsm (stack copy) to be 8795 * the one that is the small 8796 * piece that was "sacked". 8797 */ 8798 nrsm->r_end = end; 8799 rsm->r_dupack = 0; 8800 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 8801 /* 8802 * Now that the rsm has had its start moved forward 8803 * lets go ahead and get its new place in the world. 8804 */ 8805 rack_setup_offset_for_rsm(prev, rsm); 8806 /* 8807 * Now nrsm is our new little piece 8808 * that is acked (which was merged 8809 * to prev). Update the rtt and changed 8810 * based on that. Also check for reordering. 8811 */ 8812 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); 8813 if (rack->app_limited_needs_set) 8814 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); 8815 changed += (nrsm->r_end - nrsm->r_start); 8816 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 8817 if (nrsm->r_flags & RACK_SACK_PASSED) { 8818 counter_u64_add(rack_reorder_seen, 1); 8819 rack->r_ctl.rc_reorder_ts = cts; 8820 } 8821 rack_log_map_chg(tp, rack, prev, &stack_map, rsm, MAP_SACK_M4, end, __LINE__); 8822 rsm = prev; 8823 counter_u64_add(rack_sack_used_prev_merge, 1); 8824 } else { 8825 /** 8826 * This is the case where our previous 8827 * block is not acked either, so we must 8828 * split the block in two. 8829 */ 8830 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 8831 if (nrsm == NULL) { 8832 /* failed rrs what can we do but loose the sack info? */ 8833 goto out; 8834 } 8835 if ((rsm->r_flags & RACK_TLP) && 8836 (rsm->r_rtr_cnt > 1)) { 8837 /* 8838 * We are splitting a rxt TLP, check 8839 * if we need to save off the start/end 8840 */ 8841 if (rack->rc_last_tlp_acked_set && 8842 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 8843 /* 8844 * We already turned this on since this block is inside 8845 * the previous one was a partially sack now we 8846 * are getting another one (maybe all of it). 
8847 */ 8848 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 8849 /* 8850 * Lets make sure we have all of it though. 8851 */ 8852 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 8853 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 8854 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 8855 rack->r_ctl.last_tlp_acked_end); 8856 } 8857 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 8858 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 8859 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 8860 rack->r_ctl.last_tlp_acked_end); 8861 } 8862 } else { 8863 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 8864 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 8865 rack->rc_last_tlp_acked_set = 1; 8866 rack->rc_last_tlp_past_cumack = 0; 8867 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 8868 } 8869 } 8870 /** 8871 * In this case nrsm becomes 8872 * nrsm->r_start = end; 8873 * nrsm->r_end = rsm->r_end; 8874 * which is un-acked. 8875 * <and> 8876 * rsm->r_end = nrsm->r_start; 8877 * i.e. the remaining un-acked 8878 * piece is left on the left 8879 * hand side. 8880 * 8881 * So we start like this 8882 * rsm |----------| (not acked) 8883 * sackblk |---| 8884 * build it so we have 8885 * rsm |---| (acked) 8886 * nrsm |------| (not acked) 8887 */ 8888 counter_u64_add(rack_sack_splits, 1); 8889 rack_clone_rsm(rack, nrsm, rsm, end); 8890 rsm->r_flags &= (~RACK_HAS_FIN); 8891 rsm->r_just_ret = 0; 8892 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 8893 #ifdef INVARIANTS 8894 if (insret != NULL) { 8895 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 8896 nrsm, insret, rack, rsm); 8897 } 8898 #endif 8899 if (rsm->r_in_tmap) { 8900 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 8901 nrsm->r_in_tmap = 1; 8902 } 8903 nrsm->r_dupack = 0; 8904 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); 8905 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); 8906 changed += (rsm->r_end - rsm->r_start); 8907 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 8908 if (rsm->r_in_tmap) /* should be true */ 8909 rack_log_sack_passed(tp, rack, rsm); 8910 /* Is Reordering occuring? */ 8911 if (rsm->r_flags & RACK_SACK_PASSED) { 8912 rsm->r_flags &= ~RACK_SACK_PASSED; 8913 counter_u64_add(rack_reorder_seen, 1); 8914 rack->r_ctl.rc_reorder_ts = cts; 8915 } 8916 if (rack->app_limited_needs_set) 8917 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); 8918 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 8919 rsm->r_flags |= RACK_ACKED; 8920 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M5, end, __LINE__); 8921 if (rsm->r_in_tmap) { 8922 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 8923 rsm->r_in_tmap = 0; 8924 } 8925 } 8926 } else if (start != end){ 8927 /* 8928 * The block was already acked. 8929 */ 8930 counter_u64_add(rack_sack_skipped_acked, 1); 8931 moved++; 8932 } 8933 out: 8934 if (rsm && 8935 ((rsm->r_flags & RACK_TLP) == 0) && 8936 (rsm->r_flags & RACK_ACKED)) { 8937 /* 8938 * Now can we merge where we worked 8939 * with either the previous or 8940 * next block? 
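 * Illustrative picture of the merge below: any run of adjacent
 * RACK_ACKED entries, none of which is a TLP, collapses into one rsm:
 *    ...|-acked-|-acked-|-acked-|...  becomes  ...|----acked----|...
 * rack_merge_rsm() is applied walking forward until a non-acked or
 * TLP neighbour is hit, and then backwards in the same way.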
8941 */ 8942 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8943 while (next) { 8944 if (next->r_flags & RACK_TLP) 8945 break; 8946 if (next->r_flags & RACK_ACKED) { 8947 /* yep this and next can be merged */ 8948 rsm = rack_merge_rsm(rack, rsm, next); 8949 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8950 } else 8951 break; 8952 } 8953 /* Now what about the previous? */ 8954 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8955 while (prev) { 8956 if (prev->r_flags & RACK_TLP) 8957 break; 8958 if (prev->r_flags & RACK_ACKED) { 8959 /* yep the previous and this can be merged */ 8960 rsm = rack_merge_rsm(rack, prev, rsm); 8961 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8962 } else 8963 break; 8964 } 8965 } 8966 if (used_ref == 0) { 8967 counter_u64_add(rack_sack_proc_all, 1); 8968 } else { 8969 counter_u64_add(rack_sack_proc_short, 1); 8970 } 8971 /* Save off the next one for quick reference. */ 8972 if (rsm) 8973 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8974 else 8975 nrsm = NULL; 8976 *prsm = rack->r_ctl.rc_sacklast = nrsm; 8977 /* Pass back the moved. */ 8978 *moved_two = moved; 8979 return (changed); 8980 } 8981 8982 static void inline 8983 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack) 8984 { 8985 struct rack_sendmap *tmap; 8986 8987 tmap = NULL; 8988 while (rsm && (rsm->r_flags & RACK_ACKED)) { 8989 /* Its no longer sacked, mark it so */ 8990 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 8991 #ifdef INVARIANTS 8992 if (rsm->r_in_tmap) { 8993 panic("rack:%p rsm:%p flags:0x%x in tmap?", 8994 rack, rsm, rsm->r_flags); 8995 } 8996 #endif 8997 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS); 8998 /* Rebuild it into our tmap */ 8999 if (tmap == NULL) { 9000 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 9001 tmap = rsm; 9002 } else { 9003 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext); 9004 tmap = rsm; 9005 } 9006 tmap->r_in_tmap = 1; 9007 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 9008 } 9009 /* 9010 * Now lets possibly clear the sack filter so we start 9011 * recognizing sacks that cover this area. 9012 */ 9013 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack); 9014 9015 } 9016 9017 static void 9018 rack_do_decay(struct tcp_rack *rack) 9019 { 9020 struct timeval res; 9021 9022 #define timersub(tvp, uvp, vvp) \ 9023 do { \ 9024 (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ 9025 (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ 9026 if ((vvp)->tv_usec < 0) { \ 9027 (vvp)->tv_sec--; \ 9028 (vvp)->tv_usec += 1000000; \ 9029 } \ 9030 } while (0) 9031 9032 timersub(&rack->r_ctl.act_rcv_time, &rack->r_ctl.rc_last_time_decay, &res); 9033 #undef timersub 9034 9035 rack->r_ctl.input_pkt++; 9036 if ((rack->rc_in_persist) || 9037 (res.tv_sec >= 1) || 9038 (rack->rc_tp->snd_max == rack->rc_tp->snd_una)) { 9039 /* 9040 * Check for decay of non-SAD, 9041 * we want all SAD detection metrics to 9042 * decay 1/4 per second (or more) passed. 9043 */ 9044 uint32_t pkt_delta; 9045 9046 pkt_delta = rack->r_ctl.input_pkt - rack->r_ctl.saved_input_pkt; 9047 /* Update our saved tracking values */ 9048 rack->r_ctl.saved_input_pkt = rack->r_ctl.input_pkt; 9049 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; 9050 /* Now do we escape without decay? 
*/ 9051 #ifdef NETFLIX_EXP_DETECTION 9052 if (rack->rc_in_persist || 9053 (rack->rc_tp->snd_max == rack->rc_tp->snd_una) || 9054 (pkt_delta < tcp_sad_low_pps)){ 9055 /* 9056 * We don't decay idle connections 9057 * or ones that have a low input pps. 9058 */ 9059 return; 9060 } 9061 /* Decay the counters */ 9062 rack->r_ctl.ack_count = ctf_decay_count(rack->r_ctl.ack_count, 9063 tcp_sad_decay_val); 9064 rack->r_ctl.sack_count = ctf_decay_count(rack->r_ctl.sack_count, 9065 tcp_sad_decay_val); 9066 rack->r_ctl.sack_moved_extra = ctf_decay_count(rack->r_ctl.sack_moved_extra, 9067 tcp_sad_decay_val); 9068 rack->r_ctl.sack_noextra_move = ctf_decay_count(rack->r_ctl.sack_noextra_move, 9069 tcp_sad_decay_val); 9070 #endif 9071 } 9072 } 9073 9074 static void 9075 rack_process_to_cumack(struct tcpcb *tp, struct tcp_rack *rack, register uint32_t th_ack, uint32_t cts, struct tcpopt *to) 9076 { 9077 struct rack_sendmap *rsm, *rm; 9078 9079 /* 9080 * The ACK point is advancing to th_ack, we must drop off 9081 * the packets in the rack log and calculate any eligble 9082 * RTT's. 9083 */ 9084 rack->r_wanted_output = 1; 9085 9086 /* Tend any TLP that has been marked for 1/2 the seq space (its old) */ 9087 if ((rack->rc_last_tlp_acked_set == 1)&& 9088 (rack->rc_last_tlp_past_cumack == 1) && 9089 (SEQ_GT(rack->r_ctl.last_tlp_acked_start, th_ack))) { 9090 /* 9091 * We have reached the point where our last rack 9092 * tlp retransmit sequence is ahead of the cum-ack. 9093 * This can only happen when the cum-ack moves all 9094 * the way around (its been a full 2^^31+1 bytes 9095 * or more since we sent a retransmitted TLP). Lets 9096 * turn off the valid flag since its not really valid. 9097 * 9098 * Note since sack's also turn on this event we have 9099 * a complication, we have to wait to age it out until 9100 * the cum-ack is by the TLP before checking which is 9101 * what the next else clause does. 9102 */ 9103 rack_log_dsack_event(rack, 9, __LINE__, 9104 rack->r_ctl.last_tlp_acked_start, 9105 rack->r_ctl.last_tlp_acked_end); 9106 rack->rc_last_tlp_acked_set = 0; 9107 rack->rc_last_tlp_past_cumack = 0; 9108 } else if ((rack->rc_last_tlp_acked_set == 1) && 9109 (rack->rc_last_tlp_past_cumack == 0) && 9110 (SEQ_GEQ(th_ack, rack->r_ctl.last_tlp_acked_end))) { 9111 /* 9112 * It is safe to start aging TLP's out. 9113 */ 9114 rack->rc_last_tlp_past_cumack = 1; 9115 } 9116 /* We do the same for the tlp send seq as well */ 9117 if ((rack->rc_last_sent_tlp_seq_valid == 1) && 9118 (rack->rc_last_sent_tlp_past_cumack == 1) && 9119 (SEQ_GT(rack->r_ctl.last_sent_tlp_seq, th_ack))) { 9120 rack_log_dsack_event(rack, 9, __LINE__, 9121 rack->r_ctl.last_sent_tlp_seq, 9122 (rack->r_ctl.last_sent_tlp_seq + 9123 rack->r_ctl.last_sent_tlp_len)); 9124 rack->rc_last_sent_tlp_seq_valid = 0; 9125 rack->rc_last_sent_tlp_past_cumack = 0; 9126 } else if ((rack->rc_last_sent_tlp_seq_valid == 1) && 9127 (rack->rc_last_sent_tlp_past_cumack == 0) && 9128 (SEQ_GEQ(th_ack, rack->r_ctl.last_sent_tlp_seq))) { 9129 /* 9130 * It is safe to start aging TLP's send. 9131 */ 9132 rack->rc_last_sent_tlp_past_cumack = 1; 9133 } 9134 more: 9135 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 9136 if (rsm == NULL) { 9137 if ((th_ack - 1) == tp->iss) { 9138 /* 9139 * For the SYN incoming case we will not 9140 * have called tcp_output for the sending of 9141 * the SYN, so there will be no map. All 9142 * other cases should probably be a panic. 
9143 */ 9144 return; 9145 } 9146 if (tp->t_flags & TF_SENTFIN) { 9147 /* if we sent a FIN we often will not have map */ 9148 return; 9149 } 9150 #ifdef INVARIANTS 9151 panic("No rack map tp:%p for state:%d ack:%u rack:%p snd_una:%u snd_max:%u snd_nxt:%u\n", 9152 tp, 9153 tp->t_state, th_ack, rack, 9154 tp->snd_una, tp->snd_max, tp->snd_nxt); 9155 #endif 9156 return; 9157 } 9158 if (SEQ_LT(th_ack, rsm->r_start)) { 9159 /* Huh map is missing this */ 9160 #ifdef INVARIANTS 9161 printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n", 9162 rsm->r_start, 9163 th_ack, tp->t_state, rack->r_state); 9164 #endif 9165 return; 9166 } 9167 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED, th_ack); 9168 9169 /* Now was it a retransmitted TLP? */ 9170 if ((rsm->r_flags & RACK_TLP) && 9171 (rsm->r_rtr_cnt > 1)) { 9172 /* 9173 * Yes, this rsm was a TLP and retransmitted, remember that 9174 * since if a DSACK comes back on this we don't want 9175 * to think of it as a reordered segment. This may 9176 * get updated again with possibly even other TLPs 9177 * in flight, but thats ok. Only when we don't send 9178 * a retransmitted TLP for 1/2 the sequences space 9179 * will it get turned off (above). 9180 */ 9181 if (rack->rc_last_tlp_acked_set && 9182 (is_rsm_inside_declared_tlp_block(rack, rsm))) { 9183 /* 9184 * We already turned this on since the end matches, 9185 * the previous one was a partially ack now we 9186 * are getting another one (maybe all of it). 9187 */ 9188 rack_log_dsack_event(rack, 10, __LINE__, rsm->r_start, rsm->r_end); 9189 /* 9190 * Lets make sure we have all of it though. 9191 */ 9192 if (SEQ_LT(rsm->r_start, rack->r_ctl.last_tlp_acked_start)) { 9193 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 9194 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 9195 rack->r_ctl.last_tlp_acked_end); 9196 } 9197 if (SEQ_GT(rsm->r_end, rack->r_ctl.last_tlp_acked_end)) { 9198 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 9199 rack_log_dsack_event(rack, 11, __LINE__, rack->r_ctl.last_tlp_acked_start, 9200 rack->r_ctl.last_tlp_acked_end); 9201 } 9202 } else { 9203 rack->rc_last_tlp_past_cumack = 1; 9204 rack->r_ctl.last_tlp_acked_start = rsm->r_start; 9205 rack->r_ctl.last_tlp_acked_end = rsm->r_end; 9206 rack->rc_last_tlp_acked_set = 1; 9207 rack_log_dsack_event(rack, 8, __LINE__, rsm->r_start, rsm->r_end); 9208 } 9209 } 9210 /* Now do we consume the whole thing? */ 9211 if (SEQ_GEQ(th_ack, rsm->r_end)) { 9212 /* Its all consumed. */ 9213 uint32_t left; 9214 uint8_t newly_acked; 9215 9216 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_FREE, rsm->r_end, __LINE__); 9217 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 9218 rsm->r_rtr_bytes = 0; 9219 /* Record the time of highest cumack sent */ 9220 rack->r_ctl.rc_gp_cumack_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 9221 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 9222 #ifdef INVARIANTS 9223 if (rm != rsm) { 9224 panic("removing head in rack:%p rsm:%p rm:%p", 9225 rack, rsm, rm); 9226 } 9227 #endif 9228 if (rsm->r_in_tmap) { 9229 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 9230 rsm->r_in_tmap = 0; 9231 } 9232 newly_acked = 1; 9233 if (rsm->r_flags & RACK_ACKED) { 9234 /* 9235 * It was acked on the scoreboard -- remove 9236 * it from total 9237 */ 9238 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 9239 newly_acked = 0; 9240 } else if (rsm->r_flags & RACK_SACK_PASSED) { 9241 /* 9242 * There are segments ACKED on the 9243 * scoreboard further up. We are seeing 9244 * reordering. 
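 * Illustrative note on what follows: the segment is promoted to
 * RACK_ACKED, the reorder timestamp rc_reorder_ts is refreshed, and
 * if r_ent_rec_ns indicates nothing further was sent since recovery
 * was entered we flag r_might_revert so a false recovery can later be
 * undone by rack_handle_might_revert().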
9245 */ 9246 rsm->r_flags &= ~RACK_SACK_PASSED; 9247 counter_u64_add(rack_reorder_seen, 1); 9248 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 9249 rsm->r_flags |= RACK_ACKED; 9250 rack->r_ctl.rc_reorder_ts = cts; 9251 if (rack->r_ent_rec_ns) { 9252 /* 9253 * We have sent no more, and we saw an sack 9254 * then ack arrive. 9255 */ 9256 rack->r_might_revert = 1; 9257 } 9258 } 9259 if ((rsm->r_flags & RACK_TO_REXT) && 9260 (tp->t_flags & TF_RCVD_TSTMP) && 9261 (to->to_flags & TOF_TS) && 9262 (to->to_tsecr != 0) && 9263 (tp->t_flags & TF_PREVVALID)) { 9264 /* 9265 * We can use the timestamp to see 9266 * if this retransmission was from the 9267 * first transmit. If so we made a mistake. 9268 */ 9269 tp->t_flags &= ~TF_PREVVALID; 9270 if (to->to_tsecr == rack_ts_to_msec(rsm->r_tim_lastsent[0])) { 9271 /* The first transmit is what this ack is for */ 9272 rack_cong_signal(tp, CC_RTO_ERR, th_ack); 9273 } 9274 } 9275 left = th_ack - rsm->r_end; 9276 if (rack->app_limited_needs_set && newly_acked) 9277 rack_need_set_test(tp, rack, rsm, th_ack, __LINE__, RACK_USE_END_OR_THACK); 9278 /* Free back to zone */ 9279 rack_free(rack, rsm); 9280 if (left) { 9281 goto more; 9282 } 9283 /* Check for reneging */ 9284 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 9285 if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) { 9286 /* 9287 * The peer has moved snd_una up to 9288 * the edge of this send, i.e. one 9289 * that it had previously acked. The only 9290 * way that can be true if the peer threw 9291 * away data (space issues) that it had 9292 * previously sacked (else it would have 9293 * given us snd_una up to (rsm->r_end). 9294 * We need to undo the acked markings here. 9295 * 9296 * Note we have to look to make sure th_ack is 9297 * our rsm->r_start in case we get an old ack 9298 * where th_ack is behind snd_una. 9299 */ 9300 rack_peer_reneges(rack, rsm, th_ack); 9301 } 9302 return; 9303 } 9304 if (rsm->r_flags & RACK_ACKED) { 9305 /* 9306 * It was acked on the scoreboard -- remove it from 9307 * total for the part being cum-acked. 9308 */ 9309 rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); 9310 } 9311 /* 9312 * Clear the dup ack count for 9313 * the piece that remains. 9314 */ 9315 rsm->r_dupack = 0; 9316 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 9317 if (rsm->r_rtr_bytes) { 9318 /* 9319 * It was retransmitted adjust the 9320 * sack holes for what was acked. 9321 */ 9322 int ack_am; 9323 9324 ack_am = (th_ack - rsm->r_start); 9325 if (ack_am >= rsm->r_rtr_bytes) { 9326 rack->r_ctl.rc_holes_rxt -= ack_am; 9327 rsm->r_rtr_bytes -= ack_am; 9328 } 9329 } 9330 /* 9331 * Update where the piece starts and record 9332 * the time of send of highest cumack sent. 9333 */ 9334 rack->r_ctl.rc_gp_cumack_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 9335 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_TRIM_HEAD, th_ack, __LINE__); 9336 /* Now we need to move our offset forward too */ 9337 if (rsm->m && (rsm->orig_m_len != rsm->m->m_len)) { 9338 /* Fix up the orig_m_len and possibly the mbuf offset */ 9339 rack_adjust_orig_mlen(rsm); 9340 } 9341 rsm->soff += (th_ack - rsm->r_start); 9342 rsm->r_start = th_ack; 9343 /* Now do we need to move the mbuf fwd too? 
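 * Worked example (illustrative only): if the trimmed rsm now has
 * soff = 3000 but its current mbuf holds only 2048 bytes, the loop
 * below advances to the next mbuf in the chain leaving soff = 952,
 * then records that mbuf's length in orig_m_len.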
*/ 9344 if (rsm->m) { 9345 while (rsm->soff >= rsm->m->m_len) { 9346 rsm->soff -= rsm->m->m_len; 9347 rsm->m = rsm->m->m_next; 9348 KASSERT((rsm->m != NULL), 9349 (" nrsm:%p hit at soff:%u null m", 9350 rsm, rsm->soff)); 9351 } 9352 rsm->orig_m_len = rsm->m->m_len; 9353 } 9354 if (rack->app_limited_needs_set) 9355 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG); 9356 } 9357 9358 static void 9359 rack_handle_might_revert(struct tcpcb *tp, struct tcp_rack *rack) 9360 { 9361 struct rack_sendmap *rsm; 9362 int sack_pass_fnd = 0; 9363 9364 if (rack->r_might_revert) { 9365 /* 9366 * Ok we have reordering, have not sent anything, we 9367 * might want to revert the congestion state if nothing 9368 * further has SACK_PASSED on it. Lets check. 9369 * 9370 * We also get here when we have DSACKs come in for 9371 * all the data that we FR'd. Note that a rxt or tlp 9372 * timer clears this from happening. 9373 */ 9374 9375 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 9376 if (rsm->r_flags & RACK_SACK_PASSED) { 9377 sack_pass_fnd = 1; 9378 break; 9379 } 9380 } 9381 if (sack_pass_fnd == 0) { 9382 /* 9383 * We went into recovery 9384 * incorrectly due to reordering! 9385 */ 9386 int orig_cwnd; 9387 9388 rack->r_ent_rec_ns = 0; 9389 orig_cwnd = tp->snd_cwnd; 9390 tp->snd_cwnd = rack->r_ctl.rc_cwnd_at_erec; 9391 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at_erec; 9392 tp->snd_recover = tp->snd_una; 9393 rack_log_to_prr(rack, 14, orig_cwnd); 9394 EXIT_RECOVERY(tp->t_flags); 9395 } 9396 rack->r_might_revert = 0; 9397 } 9398 } 9399 9400 #ifdef NETFLIX_EXP_DETECTION 9401 static void 9402 rack_do_detection(struct tcpcb *tp, struct tcp_rack *rack, uint32_t bytes_this_ack, uint32_t segsiz) 9403 { 9404 if ((rack->do_detection || tcp_force_detection) && 9405 tcp_sack_to_ack_thresh && 9406 tcp_sack_to_move_thresh && 9407 ((rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum) || rack->sack_attack_disable)) { 9408 /* 9409 * We have thresholds set to find 9410 * possible attackers and disable sack. 9411 * Check them. 9412 */ 9413 uint64_t ackratio, moveratio, movetotal; 9414 9415 /* Log detecting */ 9416 rack_log_sad(rack, 1); 9417 ackratio = (uint64_t)(rack->r_ctl.sack_count); 9418 ackratio *= (uint64_t)(1000); 9419 if (rack->r_ctl.ack_count) 9420 ackratio /= (uint64_t)(rack->r_ctl.ack_count); 9421 else { 9422 /* We really should not hit here */ 9423 ackratio = 1000; 9424 } 9425 if ((rack->sack_attack_disable == 0) && 9426 (ackratio > rack_highest_sack_thresh_seen)) 9427 rack_highest_sack_thresh_seen = (uint32_t)ackratio; 9428 movetotal = rack->r_ctl.sack_moved_extra; 9429 movetotal += rack->r_ctl.sack_noextra_move; 9430 moveratio = rack->r_ctl.sack_moved_extra; 9431 moveratio *= (uint64_t)1000; 9432 if (movetotal) 9433 moveratio /= movetotal; 9434 else { 9435 /* No moves, thats pretty good */ 9436 moveratio = 0; 9437 } 9438 if ((rack->sack_attack_disable == 0) && 9439 (moveratio > rack_highest_move_thresh_seen)) 9440 rack_highest_move_thresh_seen = (uint32_t)moveratio; 9441 if (rack->sack_attack_disable == 0) { 9442 if ((ackratio > tcp_sack_to_ack_thresh) && 9443 (moveratio > tcp_sack_to_move_thresh)) { 9444 /* Disable sack processing */ 9445 rack->sack_attack_disable = 1; 9446 if (rack->r_rep_attack == 0) { 9447 rack->r_rep_attack = 1; 9448 counter_u64_add(rack_sack_attacks_detected, 1); 9449 } 9450 if (tcp_attack_on_turns_on_logging) { 9451 /* 9452 * Turn on logging, used for debugging 9453 * false positives. 
9454 */ 9455 rack->rc_tp->t_logstate = tcp_attack_on_turns_on_logging; 9456 } 9457 /* Clamp the cwnd at flight size */ 9458 rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd; 9459 rack->rc_tp->snd_cwnd = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 9460 rack_log_sad(rack, 2); 9461 } 9462 } else { 9463 /* We are sack-disabled check for false positives */ 9464 if ((ackratio <= tcp_restoral_thresh) || 9465 (rack->r_ctl.rc_num_maps_alloced < tcp_map_minimum)) { 9466 rack->sack_attack_disable = 0; 9467 rack_log_sad(rack, 3); 9468 /* Restart counting */ 9469 rack->r_ctl.sack_count = 0; 9470 rack->r_ctl.sack_moved_extra = 0; 9471 rack->r_ctl.sack_noextra_move = 1; 9472 rack->r_ctl.ack_count = max(1, 9473 (bytes_this_ack / segsiz)); 9474 9475 if (rack->r_rep_reverse == 0) { 9476 rack->r_rep_reverse = 1; 9477 counter_u64_add(rack_sack_attacks_reversed, 1); 9478 } 9479 /* Restore the cwnd */ 9480 if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd) 9481 rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd; 9482 } 9483 } 9484 } 9485 } 9486 #endif 9487 9488 static int 9489 rack_note_dsack(struct tcp_rack *rack, tcp_seq start, tcp_seq end) 9490 { 9491 9492 uint32_t am, l_end; 9493 int was_tlp = 0; 9494 9495 if (SEQ_GT(end, start)) 9496 am = end - start; 9497 else 9498 am = 0; 9499 if ((rack->rc_last_tlp_acked_set ) && 9500 (SEQ_GEQ(start, rack->r_ctl.last_tlp_acked_start)) && 9501 (SEQ_LEQ(end, rack->r_ctl.last_tlp_acked_end))) { 9502 /* 9503 * The DSACK is because of a TLP which we don't 9504 * do anything with the reordering window over since 9505 * it was not reordering that caused the DSACK but 9506 * our previous retransmit TLP. 9507 */ 9508 rack_log_dsack_event(rack, 7, __LINE__, start, end); 9509 was_tlp = 1; 9510 goto skip_dsack_round; 9511 } 9512 if (rack->rc_last_sent_tlp_seq_valid) { 9513 l_end = rack->r_ctl.last_sent_tlp_seq + rack->r_ctl.last_sent_tlp_len; 9514 if (SEQ_GEQ(start, rack->r_ctl.last_sent_tlp_seq) && 9515 (SEQ_LEQ(end, l_end))) { 9516 /* 9517 * This dsack is from the last sent TLP, ignore it 9518 * for reordering purposes. 9519 */ 9520 rack_log_dsack_event(rack, 7, __LINE__, start, end); 9521 was_tlp = 1; 9522 goto skip_dsack_round; 9523 } 9524 } 9525 if (rack->rc_dsack_round_seen == 0) { 9526 rack->rc_dsack_round_seen = 1; 9527 rack->r_ctl.dsack_round_end = rack->rc_tp->snd_max; 9528 rack->r_ctl.num_dsack++; 9529 rack->r_ctl.dsack_persist = 16; /* 16 is from the standard */ 9530 rack_log_dsack_event(rack, 2, __LINE__, 0, 0); 9531 } 9532 skip_dsack_round: 9533 /* 9534 * We keep track of how many DSACK blocks we get 9535 * after a recovery incident. 9536 */ 9537 rack->r_ctl.dsack_byte_cnt += am; 9538 if (!IN_FASTRECOVERY(rack->rc_tp->t_flags) && 9539 rack->r_ctl.retran_during_recovery && 9540 (rack->r_ctl.dsack_byte_cnt >= rack->r_ctl.retran_during_recovery)) { 9541 /* 9542 * False recovery most likely culprit is reordering. If 9543 * nothing else is missing we need to revert. 
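 * Illustrative summary of the revert below: r_might_revert is set and
 * rack_handle_might_revert() is called, which, if no tmap entry still
 * carries RACK_SACK_PASSED, restores the cwnd/ssthresh saved when
 * recovery was entered and exits fast recovery; the
 * retran_during_recovery and dsack_byte_cnt trackers are then reset.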
9544 */ 9545 rack->r_might_revert = 1; 9546 rack_handle_might_revert(rack->rc_tp, rack); 9547 rack->r_might_revert = 0; 9548 rack->r_ctl.retran_during_recovery = 0; 9549 rack->r_ctl.dsack_byte_cnt = 0; 9550 } 9551 return (was_tlp); 9552 } 9553 9554 static void 9555 rack_update_prr(struct tcpcb *tp, struct tcp_rack *rack, uint32_t changed, tcp_seq th_ack) 9556 { 9557 /* Deal with changed and PRR here (in recovery only) */ 9558 uint32_t pipe, snd_una; 9559 9560 rack->r_ctl.rc_prr_delivered += changed; 9561 9562 if (sbavail(&rack->rc_inp->inp_socket->so_snd) <= (tp->snd_max - tp->snd_una)) { 9563 /* 9564 * It is all outstanding, we are application limited 9565 * and thus we don't need more room to send anything. 9566 * Note we use tp->snd_una here and not th_ack because 9567 * the data as yet not been cut from the sb. 9568 */ 9569 rack->r_ctl.rc_prr_sndcnt = 0; 9570 return; 9571 } 9572 /* Compute prr_sndcnt */ 9573 if (SEQ_GT(tp->snd_una, th_ack)) { 9574 snd_una = tp->snd_una; 9575 } else { 9576 snd_una = th_ack; 9577 } 9578 pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt; 9579 if (pipe > tp->snd_ssthresh) { 9580 long sndcnt; 9581 9582 sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh; 9583 if (rack->r_ctl.rc_prr_recovery_fs > 0) 9584 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; 9585 else { 9586 rack->r_ctl.rc_prr_sndcnt = 0; 9587 rack_log_to_prr(rack, 9, 0); 9588 sndcnt = 0; 9589 } 9590 sndcnt++; 9591 if (sndcnt > (long)rack->r_ctl.rc_prr_out) 9592 sndcnt -= rack->r_ctl.rc_prr_out; 9593 else 9594 sndcnt = 0; 9595 rack->r_ctl.rc_prr_sndcnt = sndcnt; 9596 rack_log_to_prr(rack, 10, 0); 9597 } else { 9598 uint32_t limit; 9599 9600 if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out) 9601 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out); 9602 else 9603 limit = 0; 9604 if (changed > limit) 9605 limit = changed; 9606 limit += ctf_fixed_maxseg(tp); 9607 if (tp->snd_ssthresh > pipe) { 9608 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); 9609 rack_log_to_prr(rack, 11, 0); 9610 } else { 9611 rack->r_ctl.rc_prr_sndcnt = min(0, limit); 9612 rack_log_to_prr(rack, 12, 0); 9613 } 9614 } 9615 } 9616 9617 static void 9618 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered_recovery, int dup_ack_struck) 9619 { 9620 uint32_t changed; 9621 struct tcp_rack *rack; 9622 struct rack_sendmap *rsm; 9623 struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; 9624 register uint32_t th_ack; 9625 int32_t i, j, k, num_sack_blks = 0; 9626 uint32_t cts, acked, ack_point, sack_changed = 0; 9627 int loop_start = 0, moved_two = 0; 9628 uint32_t tsused; 9629 9630 9631 INP_WLOCK_ASSERT(tp->t_inpcb); 9632 if (th->th_flags & TH_RST) { 9633 /* We don't log resets */ 9634 return; 9635 } 9636 rack = (struct tcp_rack *)tp->t_fb_ptr; 9637 cts = tcp_get_usecs(NULL); 9638 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 9639 changed = 0; 9640 th_ack = th->th_ack; 9641 if (rack->sack_attack_disable == 0) 9642 rack_do_decay(rack); 9643 if (BYTES_THIS_ACK(tp, th) >= ctf_fixed_maxseg(rack->rc_tp)) { 9644 /* 9645 * You only get credit for 9646 * MSS and greater (and you get extra 9647 * credit for larger cum-ack moves). 9648 */ 9649 int ac; 9650 9651 ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp); 9652 rack->r_ctl.ack_count += ac; 9653 counter_u64_add(rack_ack_total, ac); 9654 } 9655 if (rack->r_ctl.ack_count > 0xfff00000) { 9656 /* 9657 * reduce the number to keep us under 9658 * a uint32_t. 
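 * Side note (illustrative): ack_count and sack_count are halved
 * together, which keeps the sack-to-ack ratio used by the
 * NETFLIX_EXP_DETECTION heuristics roughly unchanged while bounding
 * the raw counters.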
9659 */ 9660 rack->r_ctl.ack_count /= 2; 9661 rack->r_ctl.sack_count /= 2; 9662 } 9663 if (SEQ_GT(th_ack, tp->snd_una)) { 9664 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__); 9665 tp->t_acktime = ticks; 9666 } 9667 if (rsm && SEQ_GT(th_ack, rsm->r_start)) 9668 changed = th_ack - rsm->r_start; 9669 if (changed) { 9670 rack_process_to_cumack(tp, rack, th_ack, cts, to); 9671 } 9672 if ((to->to_flags & TOF_SACK) == 0) { 9673 /* We are done nothing left and no sack. */ 9674 rack_handle_might_revert(tp, rack); 9675 /* 9676 * For cases where we struck a dup-ack 9677 * with no SACK, add to the changes so 9678 * PRR will work right. 9679 */ 9680 if (dup_ack_struck && (changed == 0)) { 9681 changed += ctf_fixed_maxseg(rack->rc_tp); 9682 } 9683 goto out; 9684 } 9685 /* Sack block processing */ 9686 if (SEQ_GT(th_ack, tp->snd_una)) 9687 ack_point = th_ack; 9688 else 9689 ack_point = tp->snd_una; 9690 for (i = 0; i < to->to_nsacks; i++) { 9691 bcopy((to->to_sacks + i * TCPOLEN_SACK), 9692 &sack, sizeof(sack)); 9693 sack.start = ntohl(sack.start); 9694 sack.end = ntohl(sack.end); 9695 if (SEQ_GT(sack.end, sack.start) && 9696 SEQ_GT(sack.start, ack_point) && 9697 SEQ_LT(sack.start, tp->snd_max) && 9698 SEQ_GT(sack.end, ack_point) && 9699 SEQ_LEQ(sack.end, tp->snd_max)) { 9700 sack_blocks[num_sack_blks] = sack; 9701 num_sack_blks++; 9702 } else if (SEQ_LEQ(sack.start, th_ack) && 9703 SEQ_LEQ(sack.end, th_ack)) { 9704 int was_tlp; 9705 9706 was_tlp = rack_note_dsack(rack, sack.start, sack.end); 9707 /* 9708 * Its a D-SACK block. 9709 */ 9710 tcp_record_dsack(tp, sack.start, sack.end, was_tlp); 9711 } 9712 } 9713 if (rack->rc_dsack_round_seen) { 9714 /* Is the dsack roound over? */ 9715 if (SEQ_GEQ(th_ack, rack->r_ctl.dsack_round_end)) { 9716 /* Yes it is */ 9717 rack->rc_dsack_round_seen = 0; 9718 rack_log_dsack_event(rack, 3, __LINE__, 0, 0); 9719 } 9720 } 9721 /* 9722 * Sort the SACK blocks so we can update the rack scoreboard with 9723 * just one pass. 9724 */ 9725 num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, 9726 num_sack_blks, th->th_ack); 9727 ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks); 9728 if (num_sack_blks == 0) { 9729 /* Nothing to sack (DSACKs?) */ 9730 goto out_with_totals; 9731 } 9732 if (num_sack_blks < 2) { 9733 /* Only one, we don't need to sort */ 9734 goto do_sack_work; 9735 } 9736 /* Sort the sacks */ 9737 for (i = 0; i < num_sack_blks; i++) { 9738 for (j = i + 1; j < num_sack_blks; j++) { 9739 if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { 9740 sack = sack_blocks[i]; 9741 sack_blocks[i] = sack_blocks[j]; 9742 sack_blocks[j] = sack; 9743 } 9744 } 9745 } 9746 /* 9747 * Now are any of the sack block ends the same (yes some 9748 * implementations send these)? 9749 */ 9750 again: 9751 if (num_sack_blks == 0) 9752 goto out_with_totals; 9753 if (num_sack_blks > 1) { 9754 for (i = 0; i < num_sack_blks; i++) { 9755 for (j = i + 1; j < num_sack_blks; j++) { 9756 if (sack_blocks[i].end == sack_blocks[j].end) { 9757 /* 9758 * Ok these two have the same end we 9759 * want the smallest end and then 9760 * throw away the larger and start 9761 * again. 
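 * Worked example (illustrative only): sack blocks 100-200 and 50-200
 * share the end 200; the code below keeps the wider start, 50, in
 * block i, then shifts the remaining blocks down over block j and
 * decrements num_sack_blks before restarting the scan.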
9762 */ 9763 if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) { 9764 /* 9765 * The second block covers 9766 * more area use that 9767 */ 9768 sack_blocks[i].start = sack_blocks[j].start; 9769 } 9770 /* 9771 * Now collapse out the dup-sack and 9772 * lower the count 9773 */ 9774 for (k = (j + 1); k < num_sack_blks; k++) { 9775 sack_blocks[j].start = sack_blocks[k].start; 9776 sack_blocks[j].end = sack_blocks[k].end; 9777 j++; 9778 } 9779 num_sack_blks--; 9780 goto again; 9781 } 9782 } 9783 } 9784 } 9785 do_sack_work: 9786 /* 9787 * First lets look to see if 9788 * we have retransmitted and 9789 * can use the transmit next? 9790 */ 9791 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 9792 if (rsm && 9793 SEQ_GT(sack_blocks[0].end, rsm->r_start) && 9794 SEQ_LT(sack_blocks[0].start, rsm->r_end)) { 9795 /* 9796 * We probably did the FR and the next 9797 * SACK in continues as we would expect. 9798 */ 9799 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &moved_two); 9800 if (acked) { 9801 rack->r_wanted_output = 1; 9802 changed += acked; 9803 sack_changed += acked; 9804 } 9805 if (num_sack_blks == 1) { 9806 /* 9807 * This is what we would expect from 9808 * a normal implementation to happen 9809 * after we have retransmitted the FR, 9810 * i.e the sack-filter pushes down 9811 * to 1 block and the next to be retransmitted 9812 * is the sequence in the sack block (has more 9813 * are acked). Count this as ACK'd data to boost 9814 * up the chances of recovering any false positives. 9815 */ 9816 rack->r_ctl.ack_count += (acked / ctf_fixed_maxseg(rack->rc_tp)); 9817 counter_u64_add(rack_ack_total, (acked / ctf_fixed_maxseg(rack->rc_tp))); 9818 counter_u64_add(rack_express_sack, 1); 9819 if (rack->r_ctl.ack_count > 0xfff00000) { 9820 /* 9821 * reduce the number to keep us under 9822 * a uint32_t. 9823 */ 9824 rack->r_ctl.ack_count /= 2; 9825 rack->r_ctl.sack_count /= 2; 9826 } 9827 goto out_with_totals; 9828 } else { 9829 /* 9830 * Start the loop through the 9831 * rest of blocks, past the first block. 9832 */ 9833 moved_two = 0; 9834 loop_start = 1; 9835 } 9836 } 9837 /* Its a sack of some sort */ 9838 rack->r_ctl.sack_count++; 9839 if (rack->r_ctl.sack_count > 0xfff00000) { 9840 /* 9841 * reduce the number to keep us under 9842 * a uint32_t. 9843 */ 9844 rack->r_ctl.ack_count /= 2; 9845 rack->r_ctl.sack_count /= 2; 9846 } 9847 counter_u64_add(rack_sack_total, 1); 9848 if (rack->sack_attack_disable) { 9849 /* An attacker disablement is in place */ 9850 if (num_sack_blks > 1) { 9851 rack->r_ctl.sack_count += (num_sack_blks - 1); 9852 rack->r_ctl.sack_moved_extra++; 9853 counter_u64_add(rack_move_some, 1); 9854 if (rack->r_ctl.sack_moved_extra > 0xfff00000) { 9855 rack->r_ctl.sack_moved_extra /= 2; 9856 rack->r_ctl.sack_noextra_move /= 2; 9857 } 9858 } 9859 goto out; 9860 } 9861 rsm = rack->r_ctl.rc_sacklast; 9862 for (i = loop_start; i < num_sack_blks; i++) { 9863 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &moved_two); 9864 if (acked) { 9865 rack->r_wanted_output = 1; 9866 changed += acked; 9867 sack_changed += acked; 9868 } 9869 if (moved_two) { 9870 /* 9871 * If we did not get a SACK for at least a MSS and 9872 * had to move at all, or if we moved more than our 9873 * threshold, it counts against the "extra" move. 9874 */ 9875 rack->r_ctl.sack_moved_extra += moved_two; 9876 counter_u64_add(rack_move_some, 1); 9877 } else { 9878 /* 9879 * else we did not have to move 9880 * any more than we would expect. 
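 * Illustrative note: sack_noextra_move and sack_moved_extra are the
 * two halves of the "move ratio" that rack_do_detection() compares
 * against tcp_sack_to_move_thresh, i.e. roughly
 * moved_extra * 1000 / (moved_extra + noextra_move).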
9881 */ 9882 rack->r_ctl.sack_noextra_move++; 9883 counter_u64_add(rack_move_none, 1); 9884 } 9885 if (moved_two && (acked < ctf_fixed_maxseg(rack->rc_tp))) { 9886 /* 9887 * If the SACK was not a full MSS then 9888 * we add to sack_count the number of 9889 * MSS's (or possibly more than 9890 * a MSS if its a TSO send) we had to skip by. 9891 */ 9892 rack->r_ctl.sack_count += moved_two; 9893 counter_u64_add(rack_sack_total, moved_two); 9894 } 9895 /* 9896 * Now we need to setup for the next 9897 * round. First we make sure we won't 9898 * exceed the size of our uint32_t on 9899 * the various counts, and then clear out 9900 * moved_two. 9901 */ 9902 if ((rack->r_ctl.sack_moved_extra > 0xfff00000) || 9903 (rack->r_ctl.sack_noextra_move > 0xfff00000)) { 9904 rack->r_ctl.sack_moved_extra /= 2; 9905 rack->r_ctl.sack_noextra_move /= 2; 9906 } 9907 if (rack->r_ctl.sack_count > 0xfff00000) { 9908 rack->r_ctl.ack_count /= 2; 9909 rack->r_ctl.sack_count /= 2; 9910 } 9911 moved_two = 0; 9912 } 9913 out_with_totals: 9914 if (num_sack_blks > 1) { 9915 /* 9916 * You get an extra stroke if 9917 * you have more than one sack-blk, this 9918 * could be where we are skipping forward 9919 * and the sack-filter is still working, or 9920 * it could be an attacker constantly 9921 * moving us. 9922 */ 9923 rack->r_ctl.sack_moved_extra++; 9924 counter_u64_add(rack_move_some, 1); 9925 } 9926 out: 9927 #ifdef NETFLIX_EXP_DETECTION 9928 rack_do_detection(tp, rack, BYTES_THIS_ACK(tp, th), ctf_fixed_maxseg(rack->rc_tp)); 9929 #endif 9930 if (changed) { 9931 /* Something changed cancel the rack timer */ 9932 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 9933 } 9934 tsused = tcp_get_usecs(NULL); 9935 rsm = tcp_rack_output(tp, rack, tsused); 9936 if ((!IN_FASTRECOVERY(tp->t_flags)) && 9937 rsm) { 9938 /* Enter recovery */ 9939 rack->r_ctl.rc_rsm_start = rsm->r_start; 9940 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 9941 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 9942 entered_recovery = 1; 9943 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una); 9944 /* 9945 * When we enter recovery we need to assure we send 9946 * one packet. 9947 */ 9948 if (rack->rack_no_prr == 0) { 9949 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 9950 rack_log_to_prr(rack, 8, 0); 9951 } 9952 rack->r_timer_override = 1; 9953 rack->r_early = 0; 9954 rack->r_ctl.rc_agg_early = 0; 9955 } else if (IN_FASTRECOVERY(tp->t_flags) && 9956 rsm && 9957 (rack->r_rr_config == 3)) { 9958 /* 9959 * Assure we can output and we get no 9960 * remembered pace time except the retransmit. 9961 */ 9962 rack->r_timer_override = 1; 9963 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 9964 rack->r_ctl.rc_resend = rsm; 9965 } 9966 if (IN_FASTRECOVERY(tp->t_flags) && 9967 (rack->rack_no_prr == 0) && 9968 (entered_recovery == 0)) { 9969 rack_update_prr(tp, rack, changed, th_ack); 9970 if ((rsm && (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) && 9971 ((tcp_in_hpts(rack->rc_inp) == 0) && 9972 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)))) { 9973 /* 9974 * If you are pacing output you don't want 9975 * to override. 
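 * Put another way, we only force an immediate transmit here when there
 * is something to retransmit, PRR has granted at least one full
 * segment, and nothing is already scheduled on the pacer (we are not
 * queued in hpts and no PACE_PKT_OUTPUT is pending); an armed pacing
 * timer is left undisturbed.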
9976 */ 9977 rack->r_early = 0; 9978 rack->r_ctl.rc_agg_early = 0; 9979 rack->r_timer_override = 1; 9980 } 9981 } 9982 } 9983 9984 static void 9985 rack_strike_dupack(struct tcp_rack *rack) 9986 { 9987 struct rack_sendmap *rsm; 9988 9989 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 9990 while (rsm && (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { 9991 rsm = TAILQ_NEXT(rsm, r_tnext); 9992 } 9993 if (rsm && (rsm->r_dupack < 0xff)) { 9994 rsm->r_dupack++; 9995 if (rsm->r_dupack >= DUP_ACK_THRESHOLD) { 9996 struct timeval tv; 9997 uint32_t cts; 9998 /* 9999 * Here we see if we need to retransmit. For 10000 * a SACK type connection if enough time has passed 10001 * we will get a return of the rsm. For a non-sack 10002 * connection we will get the rsm returned if the 10003 * dupack value is 3 or more. 10004 */ 10005 cts = tcp_get_usecs(&tv); 10006 rack->r_ctl.rc_resend = tcp_rack_output(rack->rc_tp, rack, cts); 10007 if (rack->r_ctl.rc_resend != NULL) { 10008 if (!IN_FASTRECOVERY(rack->rc_tp->t_flags)) { 10009 rack_cong_signal(rack->rc_tp, CC_NDUPACK, 10010 rack->rc_tp->snd_una); 10011 } 10012 rack->r_wanted_output = 1; 10013 rack->r_timer_override = 1; 10014 rack_log_retran_reason(rack, rsm, __LINE__, 1, 3); 10015 } 10016 } else { 10017 rack_log_retran_reason(rack, rsm, __LINE__, 0, 3); 10018 } 10019 } 10020 } 10021 10022 static void 10023 rack_check_bottom_drag(struct tcpcb *tp, 10024 struct tcp_rack *rack, 10025 struct socket *so, int32_t acked) 10026 { 10027 uint32_t segsiz, minseg; 10028 10029 segsiz = ctf_fixed_maxseg(tp); 10030 minseg = segsiz; 10031 10032 if (tp->snd_max == tp->snd_una) { 10033 /* 10034 * We are doing dynamic pacing and we are way 10035 * under. Basically everything got acked while 10036 * we were still waiting on the pacer to expire. 10037 * 10038 * This means we need to boost the b/w in 10039 * addition to any earlier boosting of 10040 * the multipler. 10041 */ 10042 rack->rc_dragged_bottom = 1; 10043 rack_validate_multipliers_at_or_above100(rack); 10044 /* 10045 * Lets use the segment bytes acked plus 10046 * the lowest RTT seen as the basis to 10047 * form a b/w estimate. This will be off 10048 * due to the fact that the true estimate 10049 * should be around 1/2 the time of the RTT 10050 * but we can settle for that. 10051 */ 10052 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_VALID) && 10053 acked) { 10054 uint64_t bw, calc_bw, rtt; 10055 10056 rtt = rack->r_ctl.rack_rs.rs_us_rtt; 10057 if (rtt == 0) { 10058 /* no us sample is there a ms one? */ 10059 if (rack->r_ctl.rack_rs.rs_rtt_lowest) { 10060 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; 10061 } else { 10062 goto no_measurement; 10063 } 10064 } 10065 bw = acked; 10066 calc_bw = bw * 1000000; 10067 calc_bw /= rtt; 10068 if (rack->r_ctl.last_max_bw && 10069 (rack->r_ctl.last_max_bw < calc_bw)) { 10070 /* 10071 * If we have a last calculated max bw 10072 * enforce it. 10073 */ 10074 calc_bw = rack->r_ctl.last_max_bw; 10075 } 10076 /* now plop it in */ 10077 if (rack->rc_gp_filled == 0) { 10078 if (calc_bw > ONE_POINT_TWO_MEG) { 10079 /* 10080 * If we have no measurement 10081 * don't let us set in more than 10082 * 1.2Mbps. If we are still too 10083 * low after pacing with this we 10084 * will hopefully have a max b/w 10085 * available to sanity check things. 
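 * As an illustration (hypothetical numbers): 15000 bytes acked against
 * a lowest observed rtt of 100000 usec gives
 * calc_bw = 15000 * 1000000 / 100000 = 150000 bytes/sec, roughly the
 * 1.2 Mbit/s cap described above.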
10086 */ 10087 calc_bw = ONE_POINT_TWO_MEG; 10088 } 10089 rack->r_ctl.rc_rtt_diff = 0; 10090 rack->r_ctl.gp_bw = calc_bw; 10091 rack->rc_gp_filled = 1; 10092 if (rack->r_ctl.num_measurements < RACK_REQ_AVG) 10093 rack->r_ctl.num_measurements = RACK_REQ_AVG; 10094 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 10095 } else if (calc_bw > rack->r_ctl.gp_bw) { 10096 rack->r_ctl.rc_rtt_diff = 0; 10097 if (rack->r_ctl.num_measurements < RACK_REQ_AVG) 10098 rack->r_ctl.num_measurements = RACK_REQ_AVG; 10099 rack->r_ctl.gp_bw = calc_bw; 10100 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 10101 } else 10102 rack_increase_bw_mul(rack, -1, 0, 0, 1); 10103 if ((rack->gp_ready == 0) && 10104 (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) { 10105 /* We have enough measurements now */ 10106 rack->gp_ready = 1; 10107 rack_set_cc_pacing(rack); 10108 if (rack->defer_options) 10109 rack_apply_deferred_options(rack); 10110 } 10111 /* 10112 * For acks over 1mss we do a extra boost to simulate 10113 * where we would get 2 acks (we want 110 for the mul). 10114 */ 10115 if (acked > segsiz) 10116 rack_increase_bw_mul(rack, -1, 0, 0, 1); 10117 } else { 10118 /* 10119 * zero rtt possibly?, settle for just an old increase. 10120 */ 10121 no_measurement: 10122 rack_increase_bw_mul(rack, -1, 0, 0, 1); 10123 } 10124 } else if ((IN_FASTRECOVERY(tp->t_flags) == 0) && 10125 (sbavail(&so->so_snd) > max((segsiz * (4 + rack_req_segs)), 10126 minseg)) && 10127 (rack->r_ctl.cwnd_to_use > max((segsiz * (rack_req_segs + 2)), minseg)) && 10128 (tp->snd_wnd > max((segsiz * (rack_req_segs + 2)), minseg)) && 10129 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) <= 10130 (segsiz * rack_req_segs))) { 10131 /* 10132 * We are doing dynamic GP pacing and 10133 * we have everything except 1MSS or less 10134 * bytes left out. We are still pacing away. 10135 * And there is data that could be sent, This 10136 * means we are inserting delayed ack time in 10137 * our measurements because we are pacing too slow. 10138 */ 10139 rack_validate_multipliers_at_or_above100(rack); 10140 rack->rc_dragged_bottom = 1; 10141 rack_increase_bw_mul(rack, -1, 0, 0, 1); 10142 } 10143 } 10144 10145 10146 10147 static void 10148 rack_gain_for_fastoutput(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t acked_amount) 10149 { 10150 /* 10151 * The fast output path is enabled and we 10152 * have moved the cumack forward. Lets see if 10153 * we can expand forward the fast path length by 10154 * that amount. What we would ideally like to 10155 * do is increase the number of bytes in the 10156 * fast path block (left_to_send) by the 10157 * acked amount. However we have to gate that 10158 * by two factors: 10159 * 1) The amount outstanding and the rwnd of the peer 10160 * (i.e. we don't want to exceed the rwnd of the peer). 10161 * <and> 10162 * 2) The amount of data left in the socket buffer (i.e. 10163 * we can't send beyond what is in the buffer). 10164 * 10165 * Note that this does not take into account any increase 10166 * in the cwnd. We will only extend the fast path by 10167 * what was acked. 
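 * A small worked example (hypothetical numbers): with 30000 bytes in
 * the send buffer, 10000 bytes outstanding and a peer rwnd of 20000,
 * the gate below is min(30000 - 10000, 20000 - 10000) = 10000; an ack
 * of 4000 bytes on top of a left_to_send of 5000 gives 9000 <= 10000,
 * so left_to_send may grow to 9000.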
10168 */ 10169 uint32_t new_total, gating_val; 10170 10171 new_total = acked_amount + rack->r_ctl.fsb.left_to_send; 10172 gating_val = min((sbavail(&so->so_snd) - (tp->snd_max - tp->snd_una)), 10173 (tp->snd_wnd - (tp->snd_max - tp->snd_una))); 10174 if (new_total <= gating_val) { 10175 /* We can increase left_to_send by the acked amount */ 10176 counter_u64_add(rack_extended_rfo, 1); 10177 rack->r_ctl.fsb.left_to_send = new_total; 10178 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(&rack->rc_inp->inp_socket->so_snd) - (tp->snd_max - tp->snd_una))), 10179 ("rack:%p left_to_send:%u sbavail:%u out:%u", 10180 rack, rack->r_ctl.fsb.left_to_send, 10181 sbavail(&rack->rc_inp->inp_socket->so_snd), 10182 (tp->snd_max - tp->snd_una))); 10183 10184 } 10185 } 10186 10187 static void 10188 rack_adjust_sendmap(struct tcp_rack *rack, struct sockbuf *sb, tcp_seq snd_una) 10189 { 10190 /* 10191 * Here any sendmap entry that points to the 10192 * beginning mbuf must be adjusted to the correct 10193 * offset. This must be called with: 10194 * 1) The socket buffer locked 10195 * 2) snd_una adjusted to its new position. 10196 * 10197 * Note that (2) implies rack_ack_received has also 10198 * been called. 10199 * 10200 * We grab the first mbuf in the socket buffer and 10201 * then go through the front of the sendmap, recalculating 10202 * the stored offset for any sendmap entry that has 10203 * that mbuf. We must use the sb functions to do this 10204 * since it is possible an add was done as well as 10205 * the subtraction we may have just completed. This should 10206 * not be a penalty though, since we just referenced the sb 10207 * to go in and trim off the mbufs that we freed (of course 10208 * there will be a penalty for the sendmap references though). 10209 */ 10210 struct mbuf *m; 10211 struct rack_sendmap *rsm; 10212 10213 SOCKBUF_LOCK_ASSERT(sb); 10214 m = sb->sb_mb; 10215 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 10216 if ((rsm == NULL) || (m == NULL)) { 10217 /* Nothing outstanding */ 10218 return; 10219 } 10220 while (rsm->m && (rsm->m == m)) { 10221 /* one to adjust */ 10222 #ifdef INVARIANTS 10223 struct mbuf *tm; 10224 uint32_t soff; 10225 10226 tm = sbsndmbuf(sb, (rsm->r_start - snd_una), &soff); 10227 if (rsm->orig_m_len != m->m_len) { 10228 rack_adjust_orig_mlen(rsm); 10229 } 10230 if (rsm->soff != soff) { 10231 /* 10232 * This is not a fatal error; we anticipate it 10233 * might happen (see the else code), so we count it here 10234 * so that under INVARIANTS we can see that it really 10235 * does happen. 10236 */ 10237 counter_u64_add(rack_adjust_map_bw, 1); 10238 } 10239 rsm->m = tm; 10240 rsm->soff = soff; 10241 if (tm) 10242 rsm->orig_m_len = rsm->m->m_len; 10243 else 10244 rsm->orig_m_len = 0; 10245 #else 10246 rsm->m = sbsndmbuf(sb, (rsm->r_start - snd_una), &rsm->soff); 10247 if (rsm->m) 10248 rsm->orig_m_len = rsm->m->m_len; 10249 else 10250 rsm->orig_m_len = 0; 10251 #endif 10252 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, 10253 rsm); 10254 if (rsm == NULL) 10255 break; 10256 } 10257 } 10258 10259 /* 10260 * A return value of 1 means we do not need to call rack_process_data(); 10261 * a return value of 0 means rack_process_data() can be called. 10262 * As for ret_val, if it is 0 the TCP is locked; if it is non-zero 10263 * it is unlocked and probably unsafe to touch the TCB.
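 * The expected calling pattern is therefore (sketch):
 *
 *	if (rack_process_ack(m, th, so, tp, to, tiwin, tlen,
 *	    &ourfinisacked, thflags, &ret_val))
 *		return (ret_val);
 *	(only now is it safe to go on and call rack_process_data())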
10264 */ 10265 static int 10266 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, 10267 struct tcpcb *tp, struct tcpopt *to, 10268 uint32_t tiwin, int32_t tlen, 10269 int32_t * ofia, int32_t thflags, int32_t *ret_val) 10270 { 10271 int32_t ourfinisacked = 0; 10272 int32_t nsegs, acked_amount; 10273 int32_t acked; 10274 struct mbuf *mfree; 10275 struct tcp_rack *rack; 10276 int32_t under_pacing = 0; 10277 int32_t recovery = 0; 10278 10279 rack = (struct tcp_rack *)tp->t_fb_ptr; 10280 if (SEQ_GT(th->th_ack, tp->snd_max)) { 10281 __ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val, 10282 &rack->r_ctl.challenge_ack_ts, 10283 &rack->r_ctl.challenge_ack_cnt); 10284 rack->r_wanted_output = 1; 10285 return (1); 10286 } 10287 if (rack->gp_ready && 10288 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 10289 under_pacing = 1; 10290 } 10291 if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { 10292 int in_rec, dup_ack_struck = 0; 10293 10294 in_rec = IN_FASTRECOVERY(tp->t_flags); 10295 if (rack->rc_in_persist) { 10296 tp->t_rxtshift = 0; 10297 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 10298 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 10299 } 10300 if ((th->th_ack == tp->snd_una) && 10301 (tiwin == tp->snd_wnd) && 10302 ((to->to_flags & TOF_SACK) == 0)) { 10303 rack_strike_dupack(rack); 10304 dup_ack_struck = 1; 10305 } 10306 rack_log_ack(tp, to, th, ((in_rec == 0) && IN_FASTRECOVERY(tp->t_flags)), dup_ack_struck); 10307 } 10308 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 10309 /* 10310 * Old ack, behind (or duplicate to) the last one rcv'd 10311 * Note: We mark reordering is occuring if its 10312 * less than and we have not closed our window. 10313 */ 10314 if (SEQ_LT(th->th_ack, tp->snd_una) && (sbspace(&so->so_rcv) > ctf_fixed_maxseg(tp))) { 10315 counter_u64_add(rack_reorder_seen, 1); 10316 rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 10317 } 10318 return (0); 10319 } 10320 /* 10321 * If we reach this point, ACK is not a duplicate, i.e., it ACKs 10322 * something we sent. 10323 */ 10324 if (tp->t_flags & TF_NEEDSYN) { 10325 /* 10326 * T/TCP: Connection was half-synchronized, and our SYN has 10327 * been ACK'd (so connection is now fully synchronized). Go 10328 * to non-starred state, increment snd_una for ACK of SYN, 10329 * and check if we can do window scaling. 10330 */ 10331 tp->t_flags &= ~TF_NEEDSYN; 10332 tp->snd_una++; 10333 /* Do window scaling? */ 10334 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 10335 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 10336 tp->rcv_scale = tp->request_r_scale; 10337 /* Send window already scaled. */ 10338 } 10339 } 10340 nsegs = max(1, m->m_pkthdr.lro_nsegs); 10341 INP_WLOCK_ASSERT(tp->t_inpcb); 10342 10343 acked = BYTES_THIS_ACK(tp, th); 10344 if (acked) { 10345 /* 10346 * Any time we move the cum-ack forward clear 10347 * keep-alive tied probe-not-answered. The 10348 * persists clears its own on entry. 10349 */ 10350 rack->probe_not_answered = 0; 10351 } 10352 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 10353 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 10354 /* 10355 * If we just performed our first retransmit, and the ACK arrives 10356 * within our recovery window, then it was a mistake to do the 10357 * retransmit in the first place. Recover our original cwnd and 10358 * ssthresh, and proceed to transmit where we left off. 
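 * The recovery window here is t_badrxtwin: the check below only fires
 * while TF_PREVVALID is still set from the retransmit, this was the
 * first retransmit (t_rxtshift == 1), the ack arrived while ticks was
 * still short of t_badrxtwin, and no timestamps were received on the
 * connection.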
10359 */ 10360 if ((tp->t_flags & TF_PREVVALID) && 10361 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) { 10362 tp->t_flags &= ~TF_PREVVALID; 10363 if (tp->t_rxtshift == 1 && 10364 (int)(ticks - tp->t_badrxtwin) < 0) 10365 rack_cong_signal(tp, CC_RTO_ERR, th->th_ack); 10366 } 10367 if (acked) { 10368 /* assure we are not backed off */ 10369 tp->t_rxtshift = 0; 10370 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 10371 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 10372 rack->rc_tlp_in_progress = 0; 10373 rack->r_ctl.rc_tlp_cnt_out = 0; 10374 /* 10375 * If it is the RXT timer we want to 10376 * stop it, so we can restart a TLP. 10377 */ 10378 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 10379 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 10380 #ifdef NETFLIX_HTTP_LOGGING 10381 tcp_http_check_for_comp(rack->rc_tp, th->th_ack); 10382 #endif 10383 } 10384 /* 10385 * If we have a timestamp reply, update smoothed round trip time. If 10386 * no timestamp is present but transmit timer is running and timed 10387 * sequence number was acked, update smoothed round trip time. Since 10388 * we now have an rtt measurement, cancel the timer backoff (cf., 10389 * Phil Karn's retransmit alg.). Recompute the initial retransmit 10390 * timer. 10391 * 10392 * Some boxes send broken timestamp replies during the SYN+ACK 10393 * phase, ignore timestamps of 0 or we could calculate a huge RTT 10394 * and blow up the retransmit timer. 10395 */ 10396 /* 10397 * If all outstanding data is acked, stop retransmit timer and 10398 * remember to restart (more output or persist). If there is more 10399 * data to be acked, restart retransmit timer, using current 10400 * (possibly backed-off) value. 10401 */ 10402 if (acked == 0) { 10403 if (ofia) 10404 *ofia = ourfinisacked; 10405 return (0); 10406 } 10407 if (IN_RECOVERY(tp->t_flags)) { 10408 if (SEQ_LT(th->th_ack, tp->snd_recover) && 10409 (SEQ_LT(th->th_ack, tp->snd_max))) { 10410 tcp_rack_partialack(tp); 10411 } else { 10412 rack_post_recovery(tp, th->th_ack); 10413 recovery = 1; 10414 } 10415 } 10416 /* 10417 * Let the congestion control algorithm update congestion control 10418 * related information. This typically means increasing the 10419 * congestion window. 10420 */ 10421 rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, recovery); 10422 SOCKBUF_LOCK(&so->so_snd); 10423 acked_amount = min(acked, (int)sbavail(&so->so_snd)); 10424 tp->snd_wnd -= acked_amount; 10425 mfree = sbcut_locked(&so->so_snd, acked_amount); 10426 if ((sbused(&so->so_snd) == 0) && 10427 (acked > acked_amount) && 10428 (tp->t_state >= TCPS_FIN_WAIT_1) && 10429 (tp->t_flags & TF_SENTFIN)) { 10430 /* 10431 * We must be sure our fin 10432 * was sent and acked (we can be 10433 * in FIN_WAIT_1 without having 10434 * sent the fin). 10435 */ 10436 ourfinisacked = 1; 10437 } 10438 tp->snd_una = th->th_ack; 10439 if (acked_amount && sbavail(&so->so_snd)) 10440 rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una); 10441 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); 10442 /* NB: sowwakeup_locked() does an implicit unlock. 
*/ 10443 sowwakeup_locked(so); 10444 m_freem(mfree); 10445 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 10446 tp->snd_recover = tp->snd_una; 10447 10448 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) { 10449 tp->snd_nxt = tp->snd_una; 10450 } 10451 if (under_pacing && 10452 (rack->use_fixed_rate == 0) && 10453 (rack->in_probe_rtt == 0) && 10454 rack->rc_gp_dyn_mul && 10455 rack->rc_always_pace) { 10456 /* Check if we are dragging bottom */ 10457 rack_check_bottom_drag(tp, rack, so, acked); 10458 } 10459 if (tp->snd_una == tp->snd_max) { 10460 /* Nothing left outstanding */ 10461 tp->t_flags &= ~TF_PREVVALID; 10462 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 10463 rack->r_ctl.retran_during_recovery = 0; 10464 rack->r_ctl.dsack_byte_cnt = 0; 10465 if (rack->r_ctl.rc_went_idle_time == 0) 10466 rack->r_ctl.rc_went_idle_time = 1; 10467 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 10468 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) 10469 tp->t_acktime = 0; 10470 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 10471 /* Set need output so persist might get set */ 10472 rack->r_wanted_output = 1; 10473 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 10474 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 10475 (sbavail(&so->so_snd) == 0) && 10476 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 10477 /* 10478 * The socket was gone and the 10479 * peer sent data (now or in the past), time to 10480 * reset him. 10481 */ 10482 *ret_val = 1; 10483 /* tcp_close will kill the inp; pre-log the Reset */ 10484 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 10485 tp = tcp_close(tp); 10486 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); 10487 return (1); 10488 } 10489 } 10490 if (ofia) 10491 *ofia = ourfinisacked; 10492 return (0); 10493 } 10494 10495 static void 10496 rack_collapsed_window(struct tcp_rack *rack) 10497 { 10498 /* 10499 * Now we must walk the 10500 * send map and divide the 10501 * ones left stranded. These 10502 * guys can't cause us to abort 10503 * the connection and are really 10504 * "unsent". However, if a buggy 10505 * client actually did keep some 10506 * of the data i.e. collapsed the win 10507 * and refused to ack and then opened 10508 * the win and acked that data, we would 10509 * get into an ack war, so the simpler 10510 * method of just pretending we 10511 * did not send those segments 10512 * won't work. 10513 */ 10514 struct rack_sendmap *rsm, *nrsm, fe, *insret; 10515 tcp_seq max_seq; 10516 10517 max_seq = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd; 10518 memset(&fe, 0, sizeof(fe)); 10519 fe.r_start = max_seq; 10520 /* Find the first seq past or at maxseq */ 10521 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 10522 if (rsm == NULL) { 10523 /* Nothing to do, strange */ 10524 rack->rc_has_collapsed = 0; 10525 return; 10526 } 10527 /* 10528 * Now do we need to split at 10529 * the collapse point? 10530 */ 10531 if (SEQ_GT(max_seq, rsm->r_start)) { 10532 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 10533 if (nrsm == NULL) { 10534 /* We can't get an rsm, mark them all?
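 * (If the split allocation fails we simply start flagging from the
 * entry that straddles the collapse point, which at worst over-marks
 * the portion of that entry still inside the peer's window.)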
*/ 10535 nrsm = rsm; 10536 goto no_split; 10537 } 10538 /* Clone it */ 10539 rack_clone_rsm(rack, nrsm, rsm, max_seq); 10540 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 10541 #ifdef INVARIANTS 10542 if (insret != NULL) { 10543 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 10544 nrsm, insret, rack, rsm); 10545 } 10546 #endif 10547 rack_log_map_chg(rack->rc_tp, rack, NULL, rsm, nrsm, MAP_SPLIT, max_seq, __LINE__); 10548 if (rsm->r_in_tmap) { 10549 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 10550 nrsm->r_in_tmap = 1; 10551 } 10552 /* 10553 * Set in the new RSM as the 10554 * collapsed starting point 10555 */ 10556 rsm = nrsm; 10557 } 10558 no_split: 10559 counter_u64_add(rack_collapsed_win, 1); 10560 RB_FOREACH_FROM(nrsm, rack_rb_tree_head, rsm) { 10561 nrsm->r_flags |= RACK_RWND_COLLAPSED; 10562 } 10563 rack->rc_has_collapsed = 1; 10564 } 10565 10566 static void 10567 rack_un_collapse_window(struct tcp_rack *rack) 10568 { 10569 struct rack_sendmap *rsm; 10570 10571 RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 10572 if (rsm->r_flags & RACK_RWND_COLLAPSED) 10573 rsm->r_flags &= ~RACK_RWND_COLLAPSED; 10574 else 10575 break; 10576 } 10577 rack->rc_has_collapsed = 0; 10578 } 10579 10580 static void 10581 rack_handle_delayed_ack(struct tcpcb *tp, struct tcp_rack *rack, 10582 int32_t tlen, int32_t tfo_syn) 10583 { 10584 if (DELAY_ACK(tp, tlen) || tfo_syn) { 10585 if (rack->rc_dack_mode && 10586 (tlen > 500) && 10587 (rack->rc_dack_toggle == 1)) { 10588 goto no_delayed_ack; 10589 } 10590 rack_timer_cancel(tp, rack, 10591 rack->r_ctl.rc_rcvtime, __LINE__); 10592 tp->t_flags |= TF_DELACK; 10593 } else { 10594 no_delayed_ack: 10595 rack->r_wanted_output = 1; 10596 tp->t_flags |= TF_ACKNOW; 10597 if (rack->rc_dack_mode) { 10598 if (tp->t_flags & TF_DELACK) 10599 rack->rc_dack_toggle = 1; 10600 else 10601 rack->rc_dack_toggle = 0; 10602 } 10603 } 10604 } 10605 10606 static void 10607 rack_validate_fo_sendwin_up(struct tcpcb *tp, struct tcp_rack *rack) 10608 { 10609 /* 10610 * If fast output is in progress, lets validate that 10611 * the new window did not shrink on us and make it 10612 * so fast output should end. 10613 */ 10614 if (rack->r_fast_output) { 10615 uint32_t out; 10616 10617 /* 10618 * Calculate what we will send if left as is 10619 * and compare that to our send window. 10620 */ 10621 out = ctf_outstanding(tp); 10622 if ((out + rack->r_ctl.fsb.left_to_send) > tp->snd_wnd) { 10623 /* ok we have an issue */ 10624 if (out >= tp->snd_wnd) { 10625 /* Turn off fast output the window is met or collapsed */ 10626 rack->r_fast_output = 0; 10627 } else { 10628 /* we have some room left */ 10629 rack->r_ctl.fsb.left_to_send = tp->snd_wnd - out; 10630 if (rack->r_ctl.fsb.left_to_send < ctf_fixed_maxseg(tp)) { 10631 /* If not at least 1 full segment never mind */ 10632 rack->r_fast_output = 0; 10633 } 10634 } 10635 } 10636 } 10637 } 10638 10639 10640 /* 10641 * Return value of 1, the TCB is unlocked and most 10642 * likely gone, return value of 0, the TCP is still 10643 * locked. 10644 */ 10645 static int 10646 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, 10647 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 10648 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 10649 { 10650 /* 10651 * Update window information. Don't look at window if no ACK: TAC's 10652 * send garbage on first SYN. 
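 * The update below is taken when the segment is newer than the last
 * one we took a window from (snd_wl1 < th_seq), or it is the same
 * segment but acks new data (snd_wl2 < th_ack), or it has the same
 * segment and ack but advertises a larger window; pure window updates
 * are counted separately.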
10653 */ 10654 int32_t nsegs; 10655 int32_t tfo_syn; 10656 struct tcp_rack *rack; 10657 10658 rack = (struct tcp_rack *)tp->t_fb_ptr; 10659 INP_WLOCK_ASSERT(tp->t_inpcb); 10660 nsegs = max(1, m->m_pkthdr.lro_nsegs); 10661 if ((thflags & TH_ACK) && 10662 (SEQ_LT(tp->snd_wl1, th->th_seq) || 10663 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 10664 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 10665 /* keep track of pure window updates */ 10666 if (tlen == 0 && 10667 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 10668 KMOD_TCPSTAT_INC(tcps_rcvwinupd); 10669 tp->snd_wnd = tiwin; 10670 rack_validate_fo_sendwin_up(tp, rack); 10671 tp->snd_wl1 = th->th_seq; 10672 tp->snd_wl2 = th->th_ack; 10673 if (tp->snd_wnd > tp->max_sndwnd) 10674 tp->max_sndwnd = tp->snd_wnd; 10675 rack->r_wanted_output = 1; 10676 } else if (thflags & TH_ACK) { 10677 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { 10678 tp->snd_wnd = tiwin; 10679 rack_validate_fo_sendwin_up(tp, rack); 10680 tp->snd_wl1 = th->th_seq; 10681 tp->snd_wl2 = th->th_ack; 10682 } 10683 } 10684 if (tp->snd_wnd < ctf_outstanding(tp)) 10685 /* The peer collapsed the window */ 10686 rack_collapsed_window(rack); 10687 else if (rack->rc_has_collapsed) 10688 rack_un_collapse_window(rack); 10689 /* Was persist timer active and now we have window space? */ 10690 if ((rack->rc_in_persist != 0) && 10691 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 10692 rack->r_ctl.rc_pace_min_segs))) { 10693 rack_exit_persist(tp, rack, rack->r_ctl.rc_rcvtime); 10694 tp->snd_nxt = tp->snd_max; 10695 /* Make sure we output to start the timer */ 10696 rack->r_wanted_output = 1; 10697 } 10698 /* Do we enter persists? */ 10699 if ((rack->rc_in_persist == 0) && 10700 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 10701 TCPS_HAVEESTABLISHED(tp->t_state) && 10702 (tp->snd_max == tp->snd_una) && 10703 sbavail(&tp->t_inpcb->inp_socket->so_snd) && 10704 (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { 10705 /* 10706 * Here the rwnd is less than 10707 * the pacing size, we are established, 10708 * nothing is outstanding, and there is 10709 * data to send. Enter persists. 10710 */ 10711 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 10712 } 10713 if (tp->t_flags2 & TF2_DROP_AF_DATA) { 10714 m_freem(m); 10715 return (0); 10716 } 10717 /* 10718 * don't process the URG bit, ignore them drag 10719 * along the up. 10720 */ 10721 tp->rcv_up = tp->rcv_nxt; 10722 INP_WLOCK_ASSERT(tp->t_inpcb); 10723 10724 /* 10725 * Process the segment text, merging it into the TCP sequencing 10726 * queue, and arranging for acknowledgment of receipt if necessary. 10727 * This process logically involves adjusting tp->rcv_wnd as data is 10728 * presented to the user (this happens in tcp_usrreq.c, case 10729 * PRU_RCVD). If a FIN has already been received on this connection 10730 * then we just ignore the text. 10731 */ 10732 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && 10733 IS_FASTOPEN(tp->t_flags)); 10734 if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) && 10735 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 10736 tcp_seq save_start = th->th_seq; 10737 tcp_seq save_rnxt = tp->rcv_nxt; 10738 int save_tlen = tlen; 10739 10740 m_adj(m, drop_hdrlen); /* delayed header drop */ 10741 /* 10742 * Insert segment which includes th into TCP reassembly 10743 * queue with control block tp. Set thflags to whether 10744 * reassembly now includes a segment with FIN. 
This handles 10745 * the common case inline (segment is the next to be 10746 * received on an established connection, and the queue is 10747 * empty), avoiding linkage into and removal from the queue 10748 * and repetition of various conversions. Set DELACK for 10749 * segments received in order, but ack immediately when 10750 * segments are out of order (so fast retransmit can work). 10751 */ 10752 if (th->th_seq == tp->rcv_nxt && 10753 SEGQ_EMPTY(tp) && 10754 (TCPS_HAVEESTABLISHED(tp->t_state) || 10755 tfo_syn)) { 10756 #ifdef NETFLIX_SB_LIMITS 10757 u_int mcnt, appended; 10758 10759 if (so->so_rcv.sb_shlim) { 10760 mcnt = m_memcnt(m); 10761 appended = 0; 10762 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 10763 CFO_NOSLEEP, NULL) == false) { 10764 counter_u64_add(tcp_sb_shlim_fails, 1); 10765 m_freem(m); 10766 return (0); 10767 } 10768 } 10769 #endif 10770 rack_handle_delayed_ack(tp, rack, tlen, tfo_syn); 10771 tp->rcv_nxt += tlen; 10772 if (tlen && 10773 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 10774 (tp->t_fbyte_in == 0)) { 10775 tp->t_fbyte_in = ticks; 10776 if (tp->t_fbyte_in == 0) 10777 tp->t_fbyte_in = 1; 10778 if (tp->t_fbyte_out && tp->t_fbyte_in) 10779 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 10780 } 10781 thflags = th->th_flags & TH_FIN; 10782 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 10783 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 10784 SOCKBUF_LOCK(&so->so_rcv); 10785 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 10786 m_freem(m); 10787 } else 10788 #ifdef NETFLIX_SB_LIMITS 10789 appended = 10790 #endif 10791 sbappendstream_locked(&so->so_rcv, m, 0); 10792 10793 rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1); 10794 /* NB: sorwakeup_locked() does an implicit unlock. */ 10795 sorwakeup_locked(so); 10796 #ifdef NETFLIX_SB_LIMITS 10797 if (so->so_rcv.sb_shlim && appended != mcnt) 10798 counter_fo_release(so->so_rcv.sb_shlim, 10799 mcnt - appended); 10800 #endif 10801 } else { 10802 /* 10803 * XXX: Due to the header drop above "th" is 10804 * theoretically invalid by now. Fortunately 10805 * m_adj() doesn't actually frees any mbufs when 10806 * trimming from the head. 10807 */ 10808 tcp_seq temp = save_start; 10809 10810 thflags = tcp_reass(tp, th, &temp, &tlen, m); 10811 tp->t_flags |= TF_ACKNOW; 10812 if (tp->t_flags & TF_WAKESOR) { 10813 tp->t_flags &= ~TF_WAKESOR; 10814 /* NB: sorwakeup_locked() does an implicit unlock. */ 10815 sorwakeup_locked(so); 10816 } 10817 } 10818 if ((tp->t_flags & TF_SACK_PERMIT) && 10819 (save_tlen > 0) && 10820 TCPS_HAVEESTABLISHED(tp->t_state)) { 10821 if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { 10822 /* 10823 * DSACK actually handled in the fastpath 10824 * above. 10825 */ 10826 RACK_OPTS_INC(tcp_sack_path_1); 10827 tcp_update_sack_list(tp, save_start, 10828 save_start + save_tlen); 10829 } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { 10830 if ((tp->rcv_numsacks >= 1) && 10831 (tp->sackblks[0].end == save_start)) { 10832 /* 10833 * Partial overlap, recorded at todrop 10834 * above. 10835 */ 10836 RACK_OPTS_INC(tcp_sack_path_2a); 10837 tcp_update_sack_list(tp, 10838 tp->sackblks[0].start, 10839 tp->sackblks[0].end); 10840 } else { 10841 RACK_OPTS_INC(tcp_sack_path_2b); 10842 tcp_update_dsack_list(tp, save_start, 10843 save_start + save_tlen); 10844 } 10845 } else if (tlen >= save_tlen) { 10846 /* Update of sackblks. 
*/ 10847 RACK_OPTS_INC(tcp_sack_path_3); 10848 tcp_update_dsack_list(tp, save_start, 10849 save_start + save_tlen); 10850 } else if (tlen > 0) { 10851 RACK_OPTS_INC(tcp_sack_path_4); 10852 tcp_update_dsack_list(tp, save_start, 10853 save_start + tlen); 10854 } 10855 } 10856 } else { 10857 m_freem(m); 10858 thflags &= ~TH_FIN; 10859 } 10860 10861 /* 10862 * If FIN is received ACK the FIN and let the user know that the 10863 * connection is closing. 10864 */ 10865 if (thflags & TH_FIN) { 10866 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 10867 /* The socket upcall is handled by socantrcvmore. */ 10868 socantrcvmore(so); 10869 /* 10870 * If connection is half-synchronized (ie NEEDSYN 10871 * flag on) then delay ACK, so it may be piggybacked 10872 * when SYN is sent. Otherwise, since we received a 10873 * FIN then no more input can be expected, send ACK 10874 * now. 10875 */ 10876 if (tp->t_flags & TF_NEEDSYN) { 10877 rack_timer_cancel(tp, rack, 10878 rack->r_ctl.rc_rcvtime, __LINE__); 10879 tp->t_flags |= TF_DELACK; 10880 } else { 10881 tp->t_flags |= TF_ACKNOW; 10882 } 10883 tp->rcv_nxt++; 10884 } 10885 switch (tp->t_state) { 10886 /* 10887 * In SYN_RECEIVED and ESTABLISHED STATES enter the 10888 * CLOSE_WAIT state. 10889 */ 10890 case TCPS_SYN_RECEIVED: 10891 tp->t_starttime = ticks; 10892 /* FALLTHROUGH */ 10893 case TCPS_ESTABLISHED: 10894 rack_timer_cancel(tp, rack, 10895 rack->r_ctl.rc_rcvtime, __LINE__); 10896 tcp_state_change(tp, TCPS_CLOSE_WAIT); 10897 break; 10898 10899 /* 10900 * If still in FIN_WAIT_1 STATE FIN has not been 10901 * acked so enter the CLOSING state. 10902 */ 10903 case TCPS_FIN_WAIT_1: 10904 rack_timer_cancel(tp, rack, 10905 rack->r_ctl.rc_rcvtime, __LINE__); 10906 tcp_state_change(tp, TCPS_CLOSING); 10907 break; 10908 10909 /* 10910 * In FIN_WAIT_2 state enter the TIME_WAIT state, 10911 * starting the time-wait timer, turning off the 10912 * other standard timers. 10913 */ 10914 case TCPS_FIN_WAIT_2: 10915 rack_timer_cancel(tp, rack, 10916 rack->r_ctl.rc_rcvtime, __LINE__); 10917 tcp_twstart(tp); 10918 return (1); 10919 } 10920 } 10921 /* 10922 * Return any desired output. 10923 */ 10924 if ((tp->t_flags & TF_ACKNOW) || 10925 (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) { 10926 rack->r_wanted_output = 1; 10927 } 10928 INP_WLOCK_ASSERT(tp->t_inpcb); 10929 return (0); 10930 } 10931 10932 /* 10933 * Here nothing is really faster, its just that we 10934 * have broken out the fast-data path also just like 10935 * the fast-ack. 10936 */ 10937 static int 10938 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, 10939 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 10940 uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos) 10941 { 10942 int32_t nsegs; 10943 int32_t newsize = 0; /* automatic sockbuf scaling */ 10944 struct tcp_rack *rack; 10945 #ifdef NETFLIX_SB_LIMITS 10946 u_int mcnt, appended; 10947 #endif 10948 #ifdef TCPDEBUG 10949 /* 10950 * The size of tcp_saveipgen must be the size of the max ip header, 10951 * now IPv6. 10952 */ 10953 u_char tcp_saveipgen[IP6_HDR_LEN]; 10954 struct tcphdr tcp_savetcp; 10955 short ostate = 0; 10956 10957 #endif 10958 /* 10959 * If last ACK falls within this segment's sequence numbers, record 10960 * the timestamp. NOTE that the test is modified according to the 10961 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 
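 * Before any of that, every check below must pass or we return 0 and
 * punt to the slow path: the segment must be the next expected one
 * (th_seq == rcv_nxt), we must not be retransmitting, the advertised
 * window must not have changed, no SYN/FIN may be pending, PAWS must
 * pass, the ack must be exactly snd_una (no new data acked), and the
 * data must fit in the receive buffer.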
10962 */ 10963 if (__predict_false(th->th_seq != tp->rcv_nxt)) { 10964 return (0); 10965 } 10966 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 10967 return (0); 10968 } 10969 if (tiwin && tiwin != tp->snd_wnd) { 10970 return (0); 10971 } 10972 if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) { 10973 return (0); 10974 } 10975 if (__predict_false((to->to_flags & TOF_TS) && 10976 (TSTMP_LT(to->to_tsval, tp->ts_recent)))) { 10977 return (0); 10978 } 10979 if (__predict_false((th->th_ack != tp->snd_una))) { 10980 return (0); 10981 } 10982 if (__predict_false(tlen > sbspace(&so->so_rcv))) { 10983 return (0); 10984 } 10985 if ((to->to_flags & TOF_TS) != 0 && 10986 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 10987 tp->ts_recent_age = tcp_ts_getticks(); 10988 tp->ts_recent = to->to_tsval; 10989 } 10990 rack = (struct tcp_rack *)tp->t_fb_ptr; 10991 /* 10992 * This is a pure, in-sequence data packet with nothing on the 10993 * reassembly queue and we have enough buffer space to take it. 10994 */ 10995 nsegs = max(1, m->m_pkthdr.lro_nsegs); 10996 10997 #ifdef NETFLIX_SB_LIMITS 10998 if (so->so_rcv.sb_shlim) { 10999 mcnt = m_memcnt(m); 11000 appended = 0; 11001 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 11002 CFO_NOSLEEP, NULL) == false) { 11003 counter_u64_add(tcp_sb_shlim_fails, 1); 11004 m_freem(m); 11005 return (1); 11006 } 11007 } 11008 #endif 11009 /* Clean receiver SACK report if present */ 11010 if (tp->rcv_numsacks) 11011 tcp_clean_sackreport(tp); 11012 KMOD_TCPSTAT_INC(tcps_preddat); 11013 tp->rcv_nxt += tlen; 11014 if (tlen && 11015 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 11016 (tp->t_fbyte_in == 0)) { 11017 tp->t_fbyte_in = ticks; 11018 if (tp->t_fbyte_in == 0) 11019 tp->t_fbyte_in = 1; 11020 if (tp->t_fbyte_out && tp->t_fbyte_in) 11021 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 11022 } 11023 /* 11024 * Pull snd_wl1 up to prevent seq wrap relative to th_seq. 11025 */ 11026 tp->snd_wl1 = th->th_seq; 11027 /* 11028 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt. 11029 */ 11030 tp->rcv_up = tp->rcv_nxt; 11031 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 11032 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 11033 #ifdef TCPDEBUG 11034 if (so->so_options & SO_DEBUG) 11035 tcp_trace(TA_INPUT, ostate, tp, 11036 (void *)tcp_saveipgen, &tcp_savetcp, 0); 11037 #endif 11038 newsize = tcp_autorcvbuf(m, th, so, tp, tlen); 11039 11040 /* Add data to socket buffer. */ 11041 SOCKBUF_LOCK(&so->so_rcv); 11042 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 11043 m_freem(m); 11044 } else { 11045 /* 11046 * Set new socket buffer size. Give up when limit is 11047 * reached. 11048 */ 11049 if (newsize) 11050 if (!sbreserve_locked(&so->so_rcv, 11051 newsize, so, NULL)) 11052 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 11053 m_adj(m, drop_hdrlen); /* delayed header drop */ 11054 #ifdef NETFLIX_SB_LIMITS 11055 appended = 11056 #endif 11057 sbappendstream_locked(&so->so_rcv, m, 0); 11058 ctf_calc_rwin(so, tp); 11059 } 11060 rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1); 11061 /* NB: sorwakeup_locked() does an implicit unlock. */ 11062 sorwakeup_locked(so); 11063 #ifdef NETFLIX_SB_LIMITS 11064 if (so->so_rcv.sb_shlim && mcnt != appended) 11065 counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended); 11066 #endif 11067 rack_handle_delayed_ack(tp, rack, tlen, 0); 11068 if (tp->snd_una == tp->snd_max) 11069 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 11070 return (1); 11071 } 11072 11073 /* 11074 * This subfunction is used to try to highly optimize the 11075 * fast path. 
We again allow window updates that are 11076 * in sequence to remain in the fast-path. We also add 11077 * in the __predict's to attempt to help the compiler. 11078 * Note that if we return a 0, then we can *not* process 11079 * it and the caller should push the packet into the 11080 * slow-path. 11081 */ 11082 static int 11083 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 11084 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 11085 uint32_t tiwin, int32_t nxt_pkt, uint32_t cts) 11086 { 11087 int32_t acked; 11088 int32_t nsegs; 11089 #ifdef TCPDEBUG 11090 /* 11091 * The size of tcp_saveipgen must be the size of the max ip header, 11092 * now IPv6. 11093 */ 11094 u_char tcp_saveipgen[IP6_HDR_LEN]; 11095 struct tcphdr tcp_savetcp; 11096 short ostate = 0; 11097 #endif 11098 int32_t under_pacing = 0; 11099 struct tcp_rack *rack; 11100 11101 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 11102 /* Old ack, behind (or duplicate to) the last one rcv'd */ 11103 return (0); 11104 } 11105 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { 11106 /* Above what we have sent? */ 11107 return (0); 11108 } 11109 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 11110 /* We are retransmitting */ 11111 return (0); 11112 } 11113 if (__predict_false(tiwin == 0)) { 11114 /* zero window */ 11115 return (0); 11116 } 11117 if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) { 11118 /* We need a SYN or a FIN, unlikely.. */ 11119 return (0); 11120 } 11121 if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { 11122 /* Timestamp is behind .. old ack with seq wrap? */ 11123 return (0); 11124 } 11125 if (__predict_false(IN_RECOVERY(tp->t_flags))) { 11126 /* Still recovering */ 11127 return (0); 11128 } 11129 rack = (struct tcp_rack *)tp->t_fb_ptr; 11130 if (rack->r_ctl.rc_sacked) { 11131 /* We have sack holes on our scoreboard */ 11132 return (0); 11133 } 11134 /* Ok if we reach here, we can process a fast-ack */ 11135 if (rack->gp_ready && 11136 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 11137 under_pacing = 1; 11138 } 11139 nsegs = max(1, m->m_pkthdr.lro_nsegs); 11140 rack_log_ack(tp, to, th, 0, 0); 11141 /* Did the window get updated? */ 11142 if (tiwin != tp->snd_wnd) { 11143 tp->snd_wnd = tiwin; 11144 rack_validate_fo_sendwin_up(tp, rack); 11145 tp->snd_wl1 = th->th_seq; 11146 if (tp->snd_wnd > tp->max_sndwnd) 11147 tp->max_sndwnd = tp->snd_wnd; 11148 } 11149 /* Do we exit persists? */ 11150 if ((rack->rc_in_persist != 0) && 11151 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 11152 rack->r_ctl.rc_pace_min_segs))) { 11153 rack_exit_persist(tp, rack, cts); 11154 } 11155 /* Do we enter persists? */ 11156 if ((rack->rc_in_persist == 0) && 11157 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 11158 TCPS_HAVEESTABLISHED(tp->t_state) && 11159 (tp->snd_max == tp->snd_una) && 11160 sbavail(&tp->t_inpcb->inp_socket->so_snd) && 11161 (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { 11162 /* 11163 * Here the rwnd is less than 11164 * the pacing size, we are established, 11165 * nothing is outstanding, and there is 11166 * data to send. Enter persists. 11167 */ 11168 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 11169 } 11170 /* 11171 * If last ACK falls within this segment's sequence numbers, record 11172 * the timestamp. NOTE that the test is modified according to the 11173 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 
11174 */ 11175 if ((to->to_flags & TOF_TS) != 0 && 11176 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 11177 tp->ts_recent_age = tcp_ts_getticks(); 11178 tp->ts_recent = to->to_tsval; 11179 } 11180 /* 11181 * This is a pure ack for outstanding data. 11182 */ 11183 KMOD_TCPSTAT_INC(tcps_predack); 11184 11185 /* 11186 * "bad retransmit" recovery. 11187 */ 11188 if ((tp->t_flags & TF_PREVVALID) && 11189 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) { 11190 tp->t_flags &= ~TF_PREVVALID; 11191 if (tp->t_rxtshift == 1 && 11192 (int)(ticks - tp->t_badrxtwin) < 0) 11193 rack_cong_signal(tp, CC_RTO_ERR, th->th_ack); 11194 } 11195 /* 11196 * Recalculate the transmit timer / rtt. 11197 * 11198 * Some boxes send broken timestamp replies during the SYN+ACK 11199 * phase, ignore timestamps of 0 or we could calculate a huge RTT 11200 * and blow up the retransmit timer. 11201 */ 11202 acked = BYTES_THIS_ACK(tp, th); 11203 11204 #ifdef TCP_HHOOK 11205 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 11206 hhook_run_tcp_est_in(tp, th, to); 11207 #endif 11208 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 11209 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 11210 if (acked) { 11211 struct mbuf *mfree; 11212 11213 rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, 0); 11214 SOCKBUF_LOCK(&so->so_snd); 11215 mfree = sbcut_locked(&so->so_snd, acked); 11216 tp->snd_una = th->th_ack; 11217 /* Note we want to hold the sb lock through the sendmap adjust */ 11218 rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una); 11219 /* Wake up the socket if we have room to write more */ 11220 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); 11221 sowwakeup_locked(so); 11222 m_freem(mfree); 11223 tp->t_rxtshift = 0; 11224 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 11225 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 11226 rack->rc_tlp_in_progress = 0; 11227 rack->r_ctl.rc_tlp_cnt_out = 0; 11228 /* 11229 * If it is the RXT timer we want to 11230 * stop it, so we can restart a TLP. 11231 */ 11232 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 11233 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 11234 #ifdef NETFLIX_HTTP_LOGGING 11235 tcp_http_check_for_comp(rack->rc_tp, th->th_ack); 11236 #endif 11237 } 11238 /* 11239 * Let the congestion control algorithm update congestion control 11240 * related information. This typically means increasing the 11241 * congestion window. 11242 */ 11243 if (tp->snd_wnd < ctf_outstanding(tp)) { 11244 /* The peer collapsed the window */ 11245 rack_collapsed_window(rack); 11246 } else if (rack->rc_has_collapsed) 11247 rack_un_collapse_window(rack); 11248 11249 /* 11250 * Pull snd_wl2 up to prevent seq wrap relative to th_ack. 11251 */ 11252 tp->snd_wl2 = th->th_ack; 11253 tp->t_dupacks = 0; 11254 m_freem(m); 11255 /* ND6_HINT(tp); *//* Some progress has been made. */ 11256 11257 /* 11258 * If all outstanding data are acked, stop retransmit timer, 11259 * otherwise restart timer using current (possibly backed-off) 11260 * value. If process is waiting for space, wakeup/selwakeup/signal. 11261 * If data are ready to send, let tcp_output decide between more 11262 * output or persist. 
11263 */ 11264 #ifdef TCPDEBUG 11265 if (so->so_options & SO_DEBUG) 11266 tcp_trace(TA_INPUT, ostate, tp, 11267 (void *)tcp_saveipgen, 11268 &tcp_savetcp, 0); 11269 #endif 11270 if (under_pacing && 11271 (rack->use_fixed_rate == 0) && 11272 (rack->in_probe_rtt == 0) && 11273 rack->rc_gp_dyn_mul && 11274 rack->rc_always_pace) { 11275 /* Check if we are dragging bottom */ 11276 rack_check_bottom_drag(tp, rack, so, acked); 11277 } 11278 if (tp->snd_una == tp->snd_max) { 11279 tp->t_flags &= ~TF_PREVVALID; 11280 rack->r_ctl.retran_during_recovery = 0; 11281 rack->r_ctl.dsack_byte_cnt = 0; 11282 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 11283 if (rack->r_ctl.rc_went_idle_time == 0) 11284 rack->r_ctl.rc_went_idle_time = 1; 11285 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 11286 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) 11287 tp->t_acktime = 0; 11288 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 11289 } 11290 if (acked && rack->r_fast_output) 11291 rack_gain_for_fastoutput(rack, tp, so, (uint32_t)acked); 11292 if (sbavail(&so->so_snd)) { 11293 rack->r_wanted_output = 1; 11294 } 11295 return (1); 11296 } 11297 11298 /* 11299 * Return value of 1, the TCB is unlocked and most 11300 * likely gone, return value of 0, the TCP is still 11301 * locked. 11302 */ 11303 static int 11304 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, 11305 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 11306 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 11307 { 11308 int32_t ret_val = 0; 11309 int32_t todrop; 11310 int32_t ourfinisacked = 0; 11311 struct tcp_rack *rack; 11312 11313 ctf_calc_rwin(so, tp); 11314 /* 11315 * If the state is SYN_SENT: if seg contains an ACK, but not for our 11316 * SYN, drop the input. if seg contains a RST, then drop the 11317 * connection. if seg does not contain SYN, then drop it. Otherwise 11318 * this is an acceptable SYN segment initialize tp->rcv_nxt and 11319 * tp->irs if seg contains ack then advance tp->snd_una if seg 11320 * contains an ECE and ECN support is enabled, the stream is ECN 11321 * capable. if SYN has been acked change to ESTABLISHED else 11322 * SYN_RCVD state arrange for segment to be acked (eventually) 11323 * continue processing rest of data/controls. 11324 */ 11325 if ((thflags & TH_ACK) && 11326 (SEQ_LEQ(th->th_ack, tp->iss) || 11327 SEQ_GT(th->th_ack, tp->snd_max))) { 11328 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 11329 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11330 return (1); 11331 } 11332 if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { 11333 TCP_PROBE5(connect__refused, NULL, tp, 11334 mtod(m, const char *), tp, th); 11335 tp = tcp_drop(tp, ECONNREFUSED); 11336 ctf_do_drop(m, tp); 11337 return (1); 11338 } 11339 if (thflags & TH_RST) { 11340 ctf_do_drop(m, tp); 11341 return (1); 11342 } 11343 if (!(thflags & TH_SYN)) { 11344 ctf_do_drop(m, tp); 11345 return (1); 11346 } 11347 tp->irs = th->th_seq; 11348 tcp_rcvseqinit(tp); 11349 rack = (struct tcp_rack *)tp->t_fb_ptr; 11350 if (thflags & TH_ACK) { 11351 int tfo_partial = 0; 11352 11353 KMOD_TCPSTAT_INC(tcps_connects); 11354 soisconnected(so); 11355 #ifdef MAC 11356 mac_socketpeer_set_from_mbuf(m, so); 11357 #endif 11358 /* Do window scaling on this connection? 
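 * (Only if both ends negotiated it, i.e. we requested window scaling
 * and also received the option from the peer.)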
*/ 11359 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 11360 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 11361 tp->rcv_scale = tp->request_r_scale; 11362 } 11363 tp->rcv_adv += min(tp->rcv_wnd, 11364 TCP_MAXWIN << tp->rcv_scale); 11365 /* 11366 * If not all the data that was sent in the TFO SYN 11367 * has been acked, resend the remainder right away. 11368 */ 11369 if (IS_FASTOPEN(tp->t_flags) && 11370 (tp->snd_una != tp->snd_max)) { 11371 tp->snd_nxt = th->th_ack; 11372 tfo_partial = 1; 11373 } 11374 /* 11375 * If there's data, delay ACK; if there's also a FIN ACKNOW 11376 * will be turned on later. 11377 */ 11378 if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial) { 11379 rack_timer_cancel(tp, rack, 11380 rack->r_ctl.rc_rcvtime, __LINE__); 11381 tp->t_flags |= TF_DELACK; 11382 } else { 11383 rack->r_wanted_output = 1; 11384 tp->t_flags |= TF_ACKNOW; 11385 rack->rc_dack_toggle = 0; 11386 } 11387 if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) && 11388 (V_tcp_do_ecn == 1)) { 11389 tp->t_flags2 |= TF2_ECN_PERMIT; 11390 KMOD_TCPSTAT_INC(tcps_ecn_shs); 11391 } 11392 if (SEQ_GT(th->th_ack, tp->snd_una)) { 11393 /* 11394 * We advance snd_una for the 11395 * fast open case. If th_ack is 11396 * acknowledging data beyond 11397 * snd_una we can't just call 11398 * ack-processing since the 11399 * data stream in our send-map 11400 * will start at snd_una + 1 (one 11401 * beyond the SYN). If its just 11402 * equal we don't need to do that 11403 * and there is no send_map. 11404 */ 11405 tp->snd_una++; 11406 } 11407 /* 11408 * Received <SYN,ACK> in SYN_SENT[*] state. Transitions: 11409 * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 11410 */ 11411 tp->t_starttime = ticks; 11412 if (tp->t_flags & TF_NEEDFIN) { 11413 tcp_state_change(tp, TCPS_FIN_WAIT_1); 11414 tp->t_flags &= ~TF_NEEDFIN; 11415 thflags &= ~TH_SYN; 11416 } else { 11417 tcp_state_change(tp, TCPS_ESTABLISHED); 11418 TCP_PROBE5(connect__established, NULL, tp, 11419 mtod(m, const char *), tp, th); 11420 rack_cc_conn_init(tp); 11421 } 11422 } else { 11423 /* 11424 * Received initial SYN in SYN-SENT[*] state => simultaneous 11425 * open. If segment contains CC option and there is a 11426 * cached CC, apply TAO test. If it succeeds, connection is * 11427 * half-synchronized. Otherwise, do 3-way handshake: 11428 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If 11429 * there was no CC option, clear cached CC value. 11430 */ 11431 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); 11432 tcp_state_change(tp, TCPS_SYN_RECEIVED); 11433 } 11434 INP_WLOCK_ASSERT(tp->t_inpcb); 11435 /* 11436 * Advance th->th_seq to correspond to first data byte. If data, 11437 * trim to stay within window, dropping FIN if necessary. 11438 */ 11439 th->th_seq++; 11440 if (tlen > tp->rcv_wnd) { 11441 todrop = tlen - tp->rcv_wnd; 11442 m_adj(m, -todrop); 11443 tlen = tp->rcv_wnd; 11444 thflags &= ~TH_FIN; 11445 KMOD_TCPSTAT_INC(tcps_rcvpackafterwin); 11446 KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 11447 } 11448 tp->snd_wl1 = th->th_seq - 1; 11449 tp->rcv_up = th->th_seq; 11450 /* 11451 * Client side of transaction: already sent SYN and data. If the 11452 * remote host used T/TCP to validate the SYN, our data will be 11453 * ACK'd; if so, enter normal data segment processing in the middle 11454 * of step 5, ack processing. Otherwise, goto step 6. 
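 * When the ACK carries a timestamp echo we can also seed the RTT
 * estimate here; for example (hypothetical values), a tcp_ts_getticks()
 * of 500200 ms against a to_tsecr of 500150 ms yields
 * t = (500200 - 500150) * HPTS_USEC_IN_MSEC, i.e. 50000 usec with the
 * usual 1000 usec-per-msec conversion.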
11455 */ 11456 if (thflags & TH_ACK) { 11457 /* For syn-sent we need to possibly update the rtt */ 11458 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 11459 uint32_t t, mcts; 11460 11461 mcts = tcp_ts_getticks(); 11462 t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC; 11463 if (!tp->t_rttlow || tp->t_rttlow > t) 11464 tp->t_rttlow = t; 11465 rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 4); 11466 tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2); 11467 tcp_rack_xmit_timer_commit(rack, tp); 11468 } 11469 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) 11470 return (ret_val); 11471 /* We may have changed to FIN_WAIT_1 above */ 11472 if (tp->t_state == TCPS_FIN_WAIT_1) { 11473 /* 11474 * In FIN_WAIT_1 STATE in addition to the processing 11475 * for the ESTABLISHED state if our FIN is now 11476 * acknowledged then enter FIN_WAIT_2. 11477 */ 11478 if (ourfinisacked) { 11479 /* 11480 * If we can't receive any more data, then 11481 * closing user can proceed. Starting the 11482 * timer is contrary to the specification, 11483 * but if we don't get a FIN we'll hang 11484 * forever. 11485 * 11486 * XXXjl: we should release the tp also, and 11487 * use a compressed state. 11488 */ 11489 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 11490 soisdisconnected(so); 11491 tcp_timer_activate(tp, TT_2MSL, 11492 (tcp_fast_finwait2_recycle ? 11493 tcp_finwait2_timeout : 11494 TP_MAXIDLE(tp))); 11495 } 11496 tcp_state_change(tp, TCPS_FIN_WAIT_2); 11497 } 11498 } 11499 } 11500 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11501 tiwin, thflags, nxt_pkt)); 11502 } 11503 11504 /* 11505 * Return value of 1, the TCB is unlocked and most 11506 * likely gone, return value of 0, the TCP is still 11507 * locked. 11508 */ 11509 static int 11510 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, 11511 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 11512 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 11513 { 11514 struct tcp_rack *rack; 11515 int32_t ret_val = 0; 11516 int32_t ourfinisacked = 0; 11517 11518 ctf_calc_rwin(so, tp); 11519 if ((thflags & TH_ACK) && 11520 (SEQ_LEQ(th->th_ack, tp->snd_una) || 11521 SEQ_GT(th->th_ack, tp->snd_max))) { 11522 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 11523 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11524 return (1); 11525 } 11526 rack = (struct tcp_rack *)tp->t_fb_ptr; 11527 if (IS_FASTOPEN(tp->t_flags)) { 11528 /* 11529 * When a TFO connection is in SYN_RECEIVED, the 11530 * only valid packets are the initial SYN, a 11531 * retransmit/copy of the initial SYN (possibly with 11532 * a subset of the original data), a valid ACK, a 11533 * FIN, or a RST. 
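 * The checks that follow map onto that list: a segment carrying both
 * SYN and ACK is answered with a reset, a bare (retransmitted) SYN is
 * silently dropped while one of our RXT/TLP/RACK timers is pending,
 * and anything with none of ACK, FIN or RST set is dropped as well.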
11534 */ 11535 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { 11536 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 11537 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11538 return (1); 11539 } else if (thflags & TH_SYN) { 11540 /* non-initial SYN is ignored */ 11541 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || 11542 (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || 11543 (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { 11544 ctf_do_drop(m, NULL); 11545 return (0); 11546 } 11547 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { 11548 ctf_do_drop(m, NULL); 11549 return (0); 11550 } 11551 } 11552 11553 if ((thflags & TH_RST) || 11554 (tp->t_fin_is_rst && (thflags & TH_FIN))) 11555 return (__ctf_process_rst(m, th, so, tp, 11556 &rack->r_ctl.challenge_ack_ts, 11557 &rack->r_ctl.challenge_ack_cnt)); 11558 /* 11559 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 11560 * it's less than ts_recent, drop it. 11561 */ 11562 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 11563 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 11564 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 11565 return (ret_val); 11566 } 11567 /* 11568 * In the SYN-RECEIVED state, validate that the packet belongs to 11569 * this connection before trimming the data to fit the receive 11570 * window. Check the sequence number versus IRS since we know the 11571 * sequence numbers haven't wrapped. This is a partial fix for the 11572 * "LAND" DoS attack. 11573 */ 11574 if (SEQ_LT(th->th_seq, tp->irs)) { 11575 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 11576 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11577 return (1); 11578 } 11579 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 11580 &rack->r_ctl.challenge_ack_ts, 11581 &rack->r_ctl.challenge_ack_cnt)) { 11582 return (ret_val); 11583 } 11584 /* 11585 * If last ACK falls within this segment's sequence numbers, record 11586 * its timestamp. NOTE: 1) That the test incorporates suggestions 11587 * from the latest proposal of the tcplw@cray.com list (Braden 11588 * 1993/04/26). 2) That updating only on newer timestamps interferes 11589 * with our earlier PAWS tests, so this check should be solely 11590 * predicated on the sequence space of this segment. 3) That we 11591 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 11592 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 11593 * SEG.Len, This modified check allows us to overcome RFC1323's 11594 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 11595 * p.869. In such cases, we can still calculate the RTT correctly 11596 * when RCV.NXT == Last.ACK.Sent. 11597 */ 11598 if ((to->to_flags & TOF_TS) != 0 && 11599 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 11600 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 11601 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 11602 tp->ts_recent_age = tcp_ts_getticks(); 11603 tp->ts_recent = to->to_tsval; 11604 } 11605 tp->snd_wnd = tiwin; 11606 rack_validate_fo_sendwin_up(tp, rack); 11607 /* 11608 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 11609 * is on (half-synchronized state), then queue data for later 11610 * processing; else drop segment and return. 
11611 */ 11612 if ((thflags & TH_ACK) == 0) { 11613 if (IS_FASTOPEN(tp->t_flags)) { 11614 rack_cc_conn_init(tp); 11615 } 11616 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11617 tiwin, thflags, nxt_pkt)); 11618 } 11619 KMOD_TCPSTAT_INC(tcps_connects); 11620 soisconnected(so); 11621 /* Do window scaling? */ 11622 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 11623 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 11624 tp->rcv_scale = tp->request_r_scale; 11625 } 11626 /* 11627 * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> 11628 * FIN-WAIT-1 11629 */ 11630 tp->t_starttime = ticks; 11631 if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { 11632 tcp_fastopen_decrement_counter(tp->t_tfo_pending); 11633 tp->t_tfo_pending = NULL; 11634 } 11635 if (tp->t_flags & TF_NEEDFIN) { 11636 tcp_state_change(tp, TCPS_FIN_WAIT_1); 11637 tp->t_flags &= ~TF_NEEDFIN; 11638 } else { 11639 tcp_state_change(tp, TCPS_ESTABLISHED); 11640 TCP_PROBE5(accept__established, NULL, tp, 11641 mtod(m, const char *), tp, th); 11642 /* 11643 * TFO connections call cc_conn_init() during SYN 11644 * processing. Calling it again here for such connections 11645 * is not harmless as it would undo the snd_cwnd reduction 11646 * that occurs when a TFO SYN|ACK is retransmitted. 11647 */ 11648 if (!IS_FASTOPEN(tp->t_flags)) 11649 rack_cc_conn_init(tp); 11650 } 11651 /* 11652 * Account for the ACK of our SYN prior to 11653 * regular ACK processing below, except for 11654 * simultaneous SYN, which is handled later. 11655 */ 11656 if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN)) 11657 tp->snd_una++; 11658 /* 11659 * If segment contains data or ACK, will call tcp_reass() later; if 11660 * not, do so now to pass queued data to user. 11661 */ 11662 if (tlen == 0 && (thflags & TH_FIN) == 0) { 11663 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, 11664 (struct mbuf *)0); 11665 if (tp->t_flags & TF_WAKESOR) { 11666 tp->t_flags &= ~TF_WAKESOR; 11667 /* NB: sorwakeup_locked() does an implicit unlock. */ 11668 sorwakeup_locked(so); 11669 } 11670 } 11671 tp->snd_wl1 = th->th_seq - 1; 11672 /* For syn-recv we need to possibly update the rtt */ 11673 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 11674 uint32_t t, mcts; 11675 11676 mcts = tcp_ts_getticks(); 11677 t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC; 11678 if (!tp->t_rttlow || tp->t_rttlow > t) 11679 tp->t_rttlow = t; 11680 rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 5); 11681 tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2); 11682 tcp_rack_xmit_timer_commit(rack, tp); 11683 } 11684 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 11685 return (ret_val); 11686 } 11687 if (tp->t_state == TCPS_FIN_WAIT_1) { 11688 /* We could have went to FIN_WAIT_1 (or EST) above */ 11689 /* 11690 * In FIN_WAIT_1 STATE in addition to the processing for the 11691 * ESTABLISHED state if our FIN is now acknowledged then 11692 * enter FIN_WAIT_2. 11693 */ 11694 if (ourfinisacked) { 11695 /* 11696 * If we can't receive any more data, then closing 11697 * user can proceed. Starting the timer is contrary 11698 * to the specification, but if we don't get a FIN 11699 * we'll hang forever. 11700 * 11701 * XXXjl: we should release the tp also, and use a 11702 * compressed state. 11703 */ 11704 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 11705 soisdisconnected(so); 11706 tcp_timer_activate(tp, TT_2MSL, 11707 (tcp_fast_finwait2_recycle ? 
11708 tcp_finwait2_timeout : 11709 TP_MAXIDLE(tp))); 11710 } 11711 tcp_state_change(tp, TCPS_FIN_WAIT_2); 11712 } 11713 } 11714 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11715 tiwin, thflags, nxt_pkt)); 11716 } 11717 11718 /* 11719 * Return value of 1, the TCB is unlocked and most 11720 * likely gone, return value of 0, the TCP is still 11721 * locked. 11722 */ 11723 static int 11724 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, 11725 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 11726 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 11727 { 11728 int32_t ret_val = 0; 11729 struct tcp_rack *rack; 11730 11731 /* 11732 * Header prediction: check for the two common cases of a 11733 * uni-directional data xfer. If the packet has no control flags, 11734 * is in-sequence, the window didn't change and we're not 11735 * retransmitting, it's a candidate. If the length is zero and the 11736 * ack moved forward, we're the sender side of the xfer. Just free 11737 * the data acked & wake any higher level process that was blocked 11738 * waiting for space. If the length is non-zero and the ack didn't 11739 * move, we're the receiver side. If we're getting packets in-order 11740 * (the reassembly queue is empty), add the data toc The socket 11741 * buffer and note that we need a delayed ack. Make sure that the 11742 * hidden state-flags are also off. Since we check for 11743 * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN. 11744 */ 11745 rack = (struct tcp_rack *)tp->t_fb_ptr; 11746 if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && 11747 __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_ACK)) == TH_ACK) && 11748 __predict_true(SEGQ_EMPTY(tp)) && 11749 __predict_true(th->th_seq == tp->rcv_nxt)) { 11750 if (tlen == 0) { 11751 if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen, 11752 tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) { 11753 return (0); 11754 } 11755 } else { 11756 if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, 11757 tiwin, nxt_pkt, iptos)) { 11758 return (0); 11759 } 11760 } 11761 } 11762 ctf_calc_rwin(so, tp); 11763 11764 if ((thflags & TH_RST) || 11765 (tp->t_fin_is_rst && (thflags & TH_FIN))) 11766 return (__ctf_process_rst(m, th, so, tp, 11767 &rack->r_ctl.challenge_ack_ts, 11768 &rack->r_ctl.challenge_ack_cnt)); 11769 11770 /* 11771 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 11772 * synchronized state. 11773 */ 11774 if (thflags & TH_SYN) { 11775 ctf_challenge_ack(m, th, tp, &ret_val); 11776 return (ret_val); 11777 } 11778 /* 11779 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 11780 * it's less than ts_recent, drop it. 11781 */ 11782 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 11783 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 11784 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 11785 return (ret_val); 11786 } 11787 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 11788 &rack->r_ctl.challenge_ack_ts, 11789 &rack->r_ctl.challenge_ack_cnt)) { 11790 return (ret_val); 11791 } 11792 /* 11793 * If last ACK falls within this segment's sequence numbers, record 11794 * its timestamp. NOTE: 1) That the test incorporates suggestions 11795 * from the latest proposal of the tcplw@cray.com list (Braden 11796 * 1993/04/26). 2) That updating only on newer timestamps interferes 11797 * with our earlier PAWS tests, so this check should be solely 11798 * predicated on the sequence space of this segment. 
3) That we 11799 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 11800 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 11801 * SEG.Len, This modified check allows us to overcome RFC1323's 11802 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 11803 * p.869. In such cases, we can still calculate the RTT correctly 11804 * when RCV.NXT == Last.ACK.Sent. 11805 */ 11806 if ((to->to_flags & TOF_TS) != 0 && 11807 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 11808 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 11809 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 11810 tp->ts_recent_age = tcp_ts_getticks(); 11811 tp->ts_recent = to->to_tsval; 11812 } 11813 /* 11814 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 11815 * is on (half-synchronized state), then queue data for later 11816 * processing; else drop segment and return. 11817 */ 11818 if ((thflags & TH_ACK) == 0) { 11819 if (tp->t_flags & TF_NEEDSYN) { 11820 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11821 tiwin, thflags, nxt_pkt)); 11822 11823 } else if (tp->t_flags & TF_ACKNOW) { 11824 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 11825 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 11826 return (ret_val); 11827 } else { 11828 ctf_do_drop(m, NULL); 11829 return (0); 11830 } 11831 } 11832 /* 11833 * Ack processing. 11834 */ 11835 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 11836 return (ret_val); 11837 } 11838 if (sbavail(&so->so_snd)) { 11839 if (ctf_progress_timeout_check(tp, true)) { 11840 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 11841 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11842 return (1); 11843 } 11844 } 11845 /* State changes only happen in rack_process_data() */ 11846 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11847 tiwin, thflags, nxt_pkt)); 11848 } 11849 11850 /* 11851 * Return value of 1, the TCB is unlocked and most 11852 * likely gone, return value of 0, the TCP is still 11853 * locked. 11854 */ 11855 static int 11856 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, 11857 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 11858 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 11859 { 11860 int32_t ret_val = 0; 11861 struct tcp_rack *rack; 11862 11863 rack = (struct tcp_rack *)tp->t_fb_ptr; 11864 ctf_calc_rwin(so, tp); 11865 if ((thflags & TH_RST) || 11866 (tp->t_fin_is_rst && (thflags & TH_FIN))) 11867 return (__ctf_process_rst(m, th, so, tp, 11868 &rack->r_ctl.challenge_ack_ts, 11869 &rack->r_ctl.challenge_ack_cnt)); 11870 /* 11871 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 11872 * synchronized state. 11873 */ 11874 if (thflags & TH_SYN) { 11875 ctf_challenge_ack(m, th, tp, &ret_val); 11876 return (ret_val); 11877 } 11878 /* 11879 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 11880 * it's less than ts_recent, drop it. 11881 */ 11882 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 11883 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 11884 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 11885 return (ret_val); 11886 } 11887 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 11888 &rack->r_ctl.challenge_ack_ts, 11889 &rack->r_ctl.challenge_ack_cnt)) { 11890 return (ret_val); 11891 } 11892 /* 11893 * If last ACK falls within this segment's sequence numbers, record 11894 * its timestamp. 
NOTE: 1) That the test incorporates suggestions 11895 * from the latest proposal of the tcplw@cray.com list (Braden 11896 * 1993/04/26). 2) That updating only on newer timestamps interferes 11897 * with our earlier PAWS tests, so this check should be solely 11898 * predicated on the sequence space of this segment. 3) That we 11899 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 11900 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 11901 * SEG.Len, This modified check allows us to overcome RFC1323's 11902 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 11903 * p.869. In such cases, we can still calculate the RTT correctly 11904 * when RCV.NXT == Last.ACK.Sent. 11905 */ 11906 if ((to->to_flags & TOF_TS) != 0 && 11907 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 11908 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 11909 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 11910 tp->ts_recent_age = tcp_ts_getticks(); 11911 tp->ts_recent = to->to_tsval; 11912 } 11913 /* 11914 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 11915 * is on (half-synchronized state), then queue data for later 11916 * processing; else drop segment and return. 11917 */ 11918 if ((thflags & TH_ACK) == 0) { 11919 if (tp->t_flags & TF_NEEDSYN) { 11920 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11921 tiwin, thflags, nxt_pkt)); 11922 11923 } else if (tp->t_flags & TF_ACKNOW) { 11924 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 11925 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 11926 return (ret_val); 11927 } else { 11928 ctf_do_drop(m, NULL); 11929 return (0); 11930 } 11931 } 11932 /* 11933 * Ack processing. 11934 */ 11935 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 11936 return (ret_val); 11937 } 11938 if (sbavail(&so->so_snd)) { 11939 if (ctf_progress_timeout_check(tp, true)) { 11940 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 11941 tp, tick, PROGRESS_DROP, __LINE__); 11942 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11943 return (1); 11944 } 11945 } 11946 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11947 tiwin, thflags, nxt_pkt)); 11948 } 11949 11950 static int 11951 rack_check_data_after_close(struct mbuf *m, 11952 struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so) 11953 { 11954 struct tcp_rack *rack; 11955 11956 rack = (struct tcp_rack *)tp->t_fb_ptr; 11957 if (rack->rc_allow_data_af_clo == 0) { 11958 close_now: 11959 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 11960 /* tcp_close will kill the inp pre-log the Reset */ 11961 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 11962 tp = tcp_close(tp); 11963 KMOD_TCPSTAT_INC(tcps_rcvafterclose); 11964 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); 11965 return (1); 11966 } 11967 if (sbavail(&so->so_snd) == 0) 11968 goto close_now; 11969 /* Ok we allow data that is ignored and a followup reset */ 11970 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 11971 tp->rcv_nxt = th->th_seq + *tlen; 11972 tp->t_flags2 |= TF2_DROP_AF_DATA; 11973 rack->r_wanted_output = 1; 11974 *tlen = 0; 11975 return (0); 11976 } 11977 11978 /* 11979 * Return value of 1, the TCB is unlocked and most 11980 * likely gone, return value of 0, the TCP is still 11981 * locked. 
11982 */ 11983 static int 11984 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, 11985 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 11986 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 11987 { 11988 int32_t ret_val = 0; 11989 int32_t ourfinisacked = 0; 11990 struct tcp_rack *rack; 11991 11992 rack = (struct tcp_rack *)tp->t_fb_ptr; 11993 ctf_calc_rwin(so, tp); 11994 11995 if ((thflags & TH_RST) || 11996 (tp->t_fin_is_rst && (thflags & TH_FIN))) 11997 return (__ctf_process_rst(m, th, so, tp, 11998 &rack->r_ctl.challenge_ack_ts, 11999 &rack->r_ctl.challenge_ack_cnt)); 12000 /* 12001 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 12002 * synchronized state. 12003 */ 12004 if (thflags & TH_SYN) { 12005 ctf_challenge_ack(m, th, tp, &ret_val); 12006 return (ret_val); 12007 } 12008 /* 12009 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 12010 * it's less than ts_recent, drop it. 12011 */ 12012 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 12013 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 12014 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 12015 return (ret_val); 12016 } 12017 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 12018 &rack->r_ctl.challenge_ack_ts, 12019 &rack->r_ctl.challenge_ack_cnt)) { 12020 return (ret_val); 12021 } 12022 /* 12023 * If new data are received on a connection after the user processes 12024 * are gone, then RST the other end. 12025 */ 12026 if ((so->so_state & SS_NOFDREF) && tlen) { 12027 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 12028 return (1); 12029 } 12030 /* 12031 * If last ACK falls within this segment's sequence numbers, record 12032 * its timestamp. NOTE: 1) That the test incorporates suggestions 12033 * from the latest proposal of the tcplw@cray.com list (Braden 12034 * 1993/04/26). 2) That updating only on newer timestamps interferes 12035 * with our earlier PAWS tests, so this check should be solely 12036 * predicated on the sequence space of this segment. 3) That we 12037 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 12038 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 12039 * SEG.Len, This modified check allows us to overcome RFC1323's 12040 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 12041 * p.869. In such cases, we can still calculate the RTT correctly 12042 * when RCV.NXT == Last.ACK.Sent. 12043 */ 12044 if ((to->to_flags & TOF_TS) != 0 && 12045 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 12046 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 12047 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 12048 tp->ts_recent_age = tcp_ts_getticks(); 12049 tp->ts_recent = to->to_tsval; 12050 } 12051 /* 12052 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 12053 * is on (half-synchronized state), then queue data for later 12054 * processing; else drop segment and return. 12055 */ 12056 if ((thflags & TH_ACK) == 0) { 12057 if (tp->t_flags & TF_NEEDSYN) { 12058 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 12059 tiwin, thflags, nxt_pkt)); 12060 } else if (tp->t_flags & TF_ACKNOW) { 12061 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 12062 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 12063 return (ret_val); 12064 } else { 12065 ctf_do_drop(m, NULL); 12066 return (0); 12067 } 12068 } 12069 /* 12070 * Ack processing. 
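 *
 * Once the ACK has been processed two things remain for this state:
 * if the peer has now acknowledged our FIN we move to FIN_WAIT_2
 * (arming the finwait2/2MSL timer when the receive side is already
 * shut down), and if data is still sitting in the send buffer we run
 * the progress-timeout check so a peer that never makes progress
 * eventually gets a reset.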
12071 */ 12072 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 12073 return (ret_val); 12074 } 12075 if (ourfinisacked) { 12076 /* 12077 * If we can't receive any more data, then closing user can 12078 * proceed. Starting the timer is contrary to the 12079 * specification, but if we don't get a FIN we'll hang 12080 * forever. 12081 * 12082 * XXXjl: we should release the tp also, and use a 12083 * compressed state. 12084 */ 12085 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 12086 soisdisconnected(so); 12087 tcp_timer_activate(tp, TT_2MSL, 12088 (tcp_fast_finwait2_recycle ? 12089 tcp_finwait2_timeout : 12090 TP_MAXIDLE(tp))); 12091 } 12092 tcp_state_change(tp, TCPS_FIN_WAIT_2); 12093 } 12094 if (sbavail(&so->so_snd)) { 12095 if (ctf_progress_timeout_check(tp, true)) { 12096 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 12097 tp, tick, PROGRESS_DROP, __LINE__); 12098 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 12099 return (1); 12100 } 12101 } 12102 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 12103 tiwin, thflags, nxt_pkt)); 12104 } 12105 12106 /* 12107 * Return value of 1, the TCB is unlocked and most 12108 * likely gone, return value of 0, the TCP is still 12109 * locked. 12110 */ 12111 static int 12112 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, 12113 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 12114 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 12115 { 12116 int32_t ret_val = 0; 12117 int32_t ourfinisacked = 0; 12118 struct tcp_rack *rack; 12119 12120 rack = (struct tcp_rack *)tp->t_fb_ptr; 12121 ctf_calc_rwin(so, tp); 12122 12123 if ((thflags & TH_RST) || 12124 (tp->t_fin_is_rst && (thflags & TH_FIN))) 12125 return (__ctf_process_rst(m, th, so, tp, 12126 &rack->r_ctl.challenge_ack_ts, 12127 &rack->r_ctl.challenge_ack_cnt)); 12128 /* 12129 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 12130 * synchronized state. 12131 */ 12132 if (thflags & TH_SYN) { 12133 ctf_challenge_ack(m, th, tp, &ret_val); 12134 return (ret_val); 12135 } 12136 /* 12137 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 12138 * it's less than ts_recent, drop it. 12139 */ 12140 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 12141 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 12142 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 12143 return (ret_val); 12144 } 12145 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 12146 &rack->r_ctl.challenge_ack_ts, 12147 &rack->r_ctl.challenge_ack_cnt)) { 12148 return (ret_val); 12149 } 12150 /* 12151 * If new data are received on a connection after the user processes 12152 * are gone, then RST the other end. 12153 */ 12154 if ((so->so_state & SS_NOFDREF) && tlen) { 12155 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 12156 return (1); 12157 } 12158 /* 12159 * If last ACK falls within this segment's sequence numbers, record 12160 * its timestamp. NOTE: 1) That the test incorporates suggestions 12161 * from the latest proposal of the tcplw@cray.com list (Braden 12162 * 1993/04/26). 2) That updating only on newer timestamps interferes 12163 * with our earlier PAWS tests, so this check should be solely 12164 * predicated on the sequence space of this segment. 
3) That we 12165 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 12166 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 12167 * SEG.Len, This modified check allows us to overcome RFC1323's 12168 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 12169 * p.869. In such cases, we can still calculate the RTT correctly 12170 * when RCV.NXT == Last.ACK.Sent. 12171 */ 12172 if ((to->to_flags & TOF_TS) != 0 && 12173 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 12174 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 12175 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 12176 tp->ts_recent_age = tcp_ts_getticks(); 12177 tp->ts_recent = to->to_tsval; 12178 } 12179 /* 12180 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 12181 * is on (half-synchronized state), then queue data for later 12182 * processing; else drop segment and return. 12183 */ 12184 if ((thflags & TH_ACK) == 0) { 12185 if (tp->t_flags & TF_NEEDSYN) { 12186 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 12187 tiwin, thflags, nxt_pkt)); 12188 } else if (tp->t_flags & TF_ACKNOW) { 12189 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 12190 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 12191 return (ret_val); 12192 } else { 12193 ctf_do_drop(m, NULL); 12194 return (0); 12195 } 12196 } 12197 /* 12198 * Ack processing. 12199 */ 12200 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 12201 return (ret_val); 12202 } 12203 if (ourfinisacked) { 12204 tcp_twstart(tp); 12205 m_freem(m); 12206 return (1); 12207 } 12208 if (sbavail(&so->so_snd)) { 12209 if (ctf_progress_timeout_check(tp, true)) { 12210 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 12211 tp, tick, PROGRESS_DROP, __LINE__); 12212 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 12213 return (1); 12214 } 12215 } 12216 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 12217 tiwin, thflags, nxt_pkt)); 12218 } 12219 12220 /* 12221 * Return value of 1, the TCB is unlocked and most 12222 * likely gone, return value of 0, the TCP is still 12223 * locked. 12224 */ 12225 static int 12226 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 12227 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 12228 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 12229 { 12230 int32_t ret_val = 0; 12231 int32_t ourfinisacked = 0; 12232 struct tcp_rack *rack; 12233 12234 rack = (struct tcp_rack *)tp->t_fb_ptr; 12235 ctf_calc_rwin(so, tp); 12236 12237 if ((thflags & TH_RST) || 12238 (tp->t_fin_is_rst && (thflags & TH_FIN))) 12239 return (__ctf_process_rst(m, th, so, tp, 12240 &rack->r_ctl.challenge_ack_ts, 12241 &rack->r_ctl.challenge_ack_cnt)); 12242 /* 12243 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 12244 * synchronized state. 12245 */ 12246 if (thflags & TH_SYN) { 12247 ctf_challenge_ack(m, th, tp, &ret_val); 12248 return (ret_val); 12249 } 12250 /* 12251 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 12252 * it's less than ts_recent, drop it. 
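 *
 * Rough illustration: with ts_recent == 4000, an arriving TSval of
 * 3990 makes the TSTMP_LT() test below true and the segment is
 * treated as a stale duplicate; ctf_ts_check() then either drops it
 * (normally generating an ACK) or, when ts_recent itself is too old
 * to be trusted, invalidates ts_recent and lets the segment through.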
12253 */ 12254 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 12255 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 12256 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 12257 return (ret_val); 12258 } 12259 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 12260 &rack->r_ctl.challenge_ack_ts, 12261 &rack->r_ctl.challenge_ack_cnt)) { 12262 return (ret_val); 12263 } 12264 /* 12265 * If new data are received on a connection after the user processes 12266 * are gone, then RST the other end. 12267 */ 12268 if ((so->so_state & SS_NOFDREF) && tlen) { 12269 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 12270 return (1); 12271 } 12272 /* 12273 * If last ACK falls within this segment's sequence numbers, record 12274 * its timestamp. NOTE: 1) That the test incorporates suggestions 12275 * from the latest proposal of the tcplw@cray.com list (Braden 12276 * 1993/04/26). 2) That updating only on newer timestamps interferes 12277 * with our earlier PAWS tests, so this check should be solely 12278 * predicated on the sequence space of this segment. 3) That we 12279 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 12280 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 12281 * SEG.Len, This modified check allows us to overcome RFC1323's 12282 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 12283 * p.869. In such cases, we can still calculate the RTT correctly 12284 * when RCV.NXT == Last.ACK.Sent. 12285 */ 12286 if ((to->to_flags & TOF_TS) != 0 && 12287 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 12288 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 12289 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 12290 tp->ts_recent_age = tcp_ts_getticks(); 12291 tp->ts_recent = to->to_tsval; 12292 } 12293 /* 12294 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 12295 * is on (half-synchronized state), then queue data for later 12296 * processing; else drop segment and return. 12297 */ 12298 if ((thflags & TH_ACK) == 0) { 12299 if (tp->t_flags & TF_NEEDSYN) { 12300 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 12301 tiwin, thflags, nxt_pkt)); 12302 } else if (tp->t_flags & TF_ACKNOW) { 12303 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 12304 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 12305 return (ret_val); 12306 } else { 12307 ctf_do_drop(m, NULL); 12308 return (0); 12309 } 12310 } 12311 /* 12312 * case TCPS_LAST_ACK: Ack processing. 12313 */ 12314 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 12315 return (ret_val); 12316 } 12317 if (ourfinisacked) { 12318 tp = tcp_close(tp); 12319 ctf_do_drop(m, tp); 12320 return (1); 12321 } 12322 if (sbavail(&so->so_snd)) { 12323 if (ctf_progress_timeout_check(tp, true)) { 12324 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 12325 tp, tick, PROGRESS_DROP, __LINE__); 12326 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 12327 return (1); 12328 } 12329 } 12330 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 12331 tiwin, thflags, nxt_pkt)); 12332 } 12333 12334 /* 12335 * Return value of 1, the TCB is unlocked and most 12336 * likely gone, return value of 0, the TCP is still 12337 * locked. 
12338 */ 12339 static int 12340 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, 12341 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 12342 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 12343 { 12344 int32_t ret_val = 0; 12345 int32_t ourfinisacked = 0; 12346 struct tcp_rack *rack; 12347 12348 rack = (struct tcp_rack *)tp->t_fb_ptr; 12349 ctf_calc_rwin(so, tp); 12350 12351 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 12352 if ((thflags & TH_RST) || 12353 (tp->t_fin_is_rst && (thflags & TH_FIN))) 12354 return (__ctf_process_rst(m, th, so, tp, 12355 &rack->r_ctl.challenge_ack_ts, 12356 &rack->r_ctl.challenge_ack_cnt)); 12357 /* 12358 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 12359 * synchronized state. 12360 */ 12361 if (thflags & TH_SYN) { 12362 ctf_challenge_ack(m, th, tp, &ret_val); 12363 return (ret_val); 12364 } 12365 /* 12366 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 12367 * it's less than ts_recent, drop it. 12368 */ 12369 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 12370 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 12371 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 12372 return (ret_val); 12373 } 12374 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 12375 &rack->r_ctl.challenge_ack_ts, 12376 &rack->r_ctl.challenge_ack_cnt)) { 12377 return (ret_val); 12378 } 12379 /* 12380 * If new data are received on a connection after the user processes 12381 * are gone, then RST the other end. 12382 */ 12383 if ((so->so_state & SS_NOFDREF) && 12384 tlen) { 12385 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 12386 return (1); 12387 } 12388 /* 12389 * If last ACK falls within this segment's sequence numbers, record 12390 * its timestamp. NOTE: 1) That the test incorporates suggestions 12391 * from the latest proposal of the tcplw@cray.com list (Braden 12392 * 1993/04/26). 2) That updating only on newer timestamps interferes 12393 * with our earlier PAWS tests, so this check should be solely 12394 * predicated on the sequence space of this segment. 3) That we 12395 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 12396 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 12397 * SEG.Len, This modified check allows us to overcome RFC1323's 12398 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 12399 * p.869. In such cases, we can still calculate the RTT correctly 12400 * when RCV.NXT == Last.ACK.Sent. 12401 */ 12402 if ((to->to_flags & TOF_TS) != 0 && 12403 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 12404 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 12405 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 12406 tp->ts_recent_age = tcp_ts_getticks(); 12407 tp->ts_recent = to->to_tsval; 12408 } 12409 /* 12410 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 12411 * is on (half-synchronized state), then queue data for later 12412 * processing; else drop segment and return. 12413 */ 12414 if ((thflags & TH_ACK) == 0) { 12415 if (tp->t_flags & TF_NEEDSYN) { 12416 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 12417 tiwin, thflags, nxt_pkt)); 12418 } else if (tp->t_flags & TF_ACKNOW) { 12419 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 12420 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 12421 return (ret_val); 12422 } else { 12423 ctf_do_drop(m, NULL); 12424 return (0); 12425 } 12426 } 12427 /* 12428 * Ack processing. 
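 *
 * Unlike FIN_WAIT_1 there is no state transition to make here; our
 * FIN was already acknowledged when we entered FIN_WAIT_2, so once
 * the ACK is processed the only state specific work left is the
 * send-buffer progress check below.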
12429 */ 12430 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 12431 return (ret_val); 12432 } 12433 if (sbavail(&so->so_snd)) { 12434 if (ctf_progress_timeout_check(tp, true)) { 12435 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 12436 tp, tick, PROGRESS_DROP, __LINE__); 12437 ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 12438 return (1); 12439 } 12440 } 12441 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 12442 tiwin, thflags, nxt_pkt)); 12443 } 12444 12445 static void inline 12446 rack_clear_rate_sample(struct tcp_rack *rack) 12447 { 12448 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY; 12449 rack->r_ctl.rack_rs.rs_rtt_cnt = 0; 12450 rack->r_ctl.rack_rs.rs_rtt_tot = 0; 12451 } 12452 12453 static void 12454 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override) 12455 { 12456 uint64_t bw_est, rate_wanted; 12457 int chged = 0; 12458 uint32_t user_max, orig_min, orig_max; 12459 12460 orig_min = rack->r_ctl.rc_pace_min_segs; 12461 orig_max = rack->r_ctl.rc_pace_max_segs; 12462 user_max = ctf_fixed_maxseg(tp) * rack->rc_user_set_max_segs; 12463 if (ctf_fixed_maxseg(tp) != rack->r_ctl.rc_pace_min_segs) 12464 chged = 1; 12465 rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp); 12466 if (rack->use_fixed_rate || rack->rc_force_max_seg) { 12467 if (user_max != rack->r_ctl.rc_pace_max_segs) 12468 chged = 1; 12469 } 12470 if (rack->rc_force_max_seg) { 12471 rack->r_ctl.rc_pace_max_segs = user_max; 12472 } else if (rack->use_fixed_rate) { 12473 bw_est = rack_get_bw(rack); 12474 if ((rack->r_ctl.crte == NULL) || 12475 (bw_est != rack->r_ctl.crte->rate)) { 12476 rack->r_ctl.rc_pace_max_segs = user_max; 12477 } else { 12478 /* We are pacing right at the hardware rate */ 12479 uint32_t segsiz; 12480 12481 segsiz = min(ctf_fixed_maxseg(tp), 12482 rack->r_ctl.rc_pace_min_segs); 12483 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size( 12484 tp, bw_est, segsiz, 0, 12485 rack->r_ctl.crte, NULL); 12486 } 12487 } else if (rack->rc_always_pace) { 12488 if (rack->r_ctl.gp_bw || 12489 #ifdef NETFLIX_PEAKRATE 12490 rack->rc_tp->t_maxpeakrate || 12491 #endif 12492 rack->r_ctl.init_rate) { 12493 /* We have a rate of some sort set */ 12494 uint32_t orig; 12495 12496 bw_est = rack_get_bw(rack); 12497 orig = rack->r_ctl.rc_pace_max_segs; 12498 if (fill_override) 12499 rate_wanted = *fill_override; 12500 else 12501 rate_wanted = rack_get_output_bw(rack, bw_est, NULL, NULL); 12502 if (rate_wanted) { 12503 /* We have something */ 12504 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, 12505 rate_wanted, 12506 ctf_fixed_maxseg(rack->rc_tp)); 12507 } else 12508 rack->r_ctl.rc_pace_max_segs = rack->r_ctl.rc_pace_min_segs; 12509 if (orig != rack->r_ctl.rc_pace_max_segs) 12510 chged = 1; 12511 } else if ((rack->r_ctl.gp_bw == 0) && 12512 (rack->r_ctl.rc_pace_max_segs == 0)) { 12513 /* 12514 * If we have nothing limit us to bursting 12515 * out IW sized pieces. 
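 *
 * ("IW" here is the initial window as computed by rc_init_window().)
 * Whichever branch above was taken, the chosen size is clamped to at
 * most PACE_MAX_IP_BYTES just below, and any change from the previous
 * min/max is logged via rack_log_type_pacing_sizes().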
12516 */ 12517 chged = 1; 12518 rack->r_ctl.rc_pace_max_segs = rc_init_window(rack); 12519 } 12520 } 12521 if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) { 12522 chged = 1; 12523 rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES; 12524 } 12525 if (chged) 12526 rack_log_type_pacing_sizes(tp, rack, orig_min, orig_max, line, 2); 12527 } 12528 12529 12530 static void 12531 rack_init_fsb_block(struct tcpcb *tp, struct tcp_rack *rack) 12532 { 12533 #ifdef INET6 12534 struct ip6_hdr *ip6 = NULL; 12535 #endif 12536 #ifdef INET 12537 struct ip *ip = NULL; 12538 #endif 12539 struct udphdr *udp = NULL; 12540 12541 /* Ok lets fill in the fast block, it can only be used with no IP options! */ 12542 #ifdef INET6 12543 if (rack->r_is_v6) { 12544 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 12545 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 12546 if (tp->t_port) { 12547 rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr); 12548 udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr)); 12549 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 12550 udp->uh_dport = tp->t_port; 12551 rack->r_ctl.fsb.udp = udp; 12552 rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1); 12553 } else 12554 { 12555 rack->r_ctl.fsb.th = (struct tcphdr *)(ip6 + 1); 12556 rack->r_ctl.fsb.udp = NULL; 12557 } 12558 tcpip_fillheaders(rack->rc_inp, 12559 tp->t_port, 12560 ip6, rack->r_ctl.fsb.th); 12561 } else 12562 #endif /* INET6 */ 12563 { 12564 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr); 12565 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 12566 if (tp->t_port) { 12567 rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr); 12568 udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip)); 12569 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 12570 udp->uh_dport = tp->t_port; 12571 rack->r_ctl.fsb.udp = udp; 12572 rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1); 12573 } else 12574 { 12575 rack->r_ctl.fsb.udp = NULL; 12576 rack->r_ctl.fsb.th = (struct tcphdr *)(ip + 1); 12577 } 12578 tcpip_fillheaders(rack->rc_inp, 12579 tp->t_port, 12580 ip, rack->r_ctl.fsb.th); 12581 } 12582 rack->r_fsb_inited = 1; 12583 } 12584 12585 static int 12586 rack_init_fsb(struct tcpcb *tp, struct tcp_rack *rack) 12587 { 12588 /* 12589 * Allocate the larger of spaces V6 if available else just 12590 * V4 and include udphdr (overbook) 12591 */ 12592 #ifdef INET6 12593 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + sizeof(struct udphdr); 12594 #else 12595 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr) + sizeof(struct udphdr); 12596 #endif 12597 rack->r_ctl.fsb.tcp_ip_hdr = malloc(rack->r_ctl.fsb.tcp_ip_hdr_len, 12598 M_TCPFSB, M_NOWAIT|M_ZERO); 12599 if (rack->r_ctl.fsb.tcp_ip_hdr == NULL) { 12600 return (ENOMEM); 12601 } 12602 rack->r_fsb_inited = 0; 12603 return (0); 12604 } 12605 12606 static int 12607 rack_init(struct tcpcb *tp) 12608 { 12609 struct tcp_rack *rack = NULL; 12610 struct rack_sendmap *insret; 12611 uint32_t iwin, snt, us_cts; 12612 int err; 12613 12614 tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT); 12615 if (tp->t_fb_ptr == NULL) { 12616 /* 12617 * We need to allocate memory but cant. The INP and INP_INFO 12618 * locks and they are recusive (happens during setup. 
So a 12619 * scheme to drop the locks fails :( 12620 * 12621 */ 12622 return (ENOMEM); 12623 } 12624 memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack)); 12625 12626 rack = (struct tcp_rack *)tp->t_fb_ptr; 12627 RB_INIT(&rack->r_ctl.rc_mtree); 12628 TAILQ_INIT(&rack->r_ctl.rc_free); 12629 TAILQ_INIT(&rack->r_ctl.rc_tmap); 12630 rack->rc_tp = tp; 12631 rack->rc_inp = tp->t_inpcb; 12632 /* Set the flag */ 12633 rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; 12634 /* Probably not needed but lets be sure */ 12635 rack_clear_rate_sample(rack); 12636 /* 12637 * Save off the default values, socket options will poke 12638 * at these if pacing is not on or we have not yet 12639 * reached where pacing is on (gp_ready/fixed enabled). 12640 * When they get set into the CC module (when gp_ready 12641 * is enabled or we enable fixed) then we will set these 12642 * values into the CC and place in here the old values 12643 * so we have a restoral. Then we will set the flag 12644 * rc_pacing_cc_set. That way whenever we turn off pacing 12645 * or switch off this stack, we will know to go restore 12646 * the saved values. 12647 */ 12648 rack->r_ctl.rc_saved_beta.beta = V_newreno_beta_ecn; 12649 rack->r_ctl.rc_saved_beta.beta_ecn = V_newreno_beta_ecn; 12650 /* We want abe like behavior as well */ 12651 rack->r_ctl.rc_saved_beta.newreno_flags |= CC_NEWRENO_BETA_ECN_ENABLED; 12652 rack->r_ctl.rc_reorder_fade = rack_reorder_fade; 12653 rack->rc_allow_data_af_clo = rack_ignore_data_after_close; 12654 rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh; 12655 rack->r_ctl.roundends = tp->snd_max; 12656 if (use_rack_rr) 12657 rack->use_rack_rr = 1; 12658 if (V_tcp_delack_enabled) 12659 tp->t_delayed_ack = 1; 12660 else 12661 tp->t_delayed_ack = 0; 12662 #ifdef TCP_ACCOUNTING 12663 if (rack_tcp_accounting) { 12664 tp->t_flags2 |= TF2_TCP_ACCOUNTING; 12665 } 12666 #endif 12667 if (rack_enable_shared_cwnd) 12668 rack->rack_enable_scwnd = 1; 12669 rack->rc_user_set_max_segs = rack_hptsi_segments; 12670 rack->rc_force_max_seg = 0; 12671 if (rack_use_imac_dack) 12672 rack->rc_dack_mode = 1; 12673 TAILQ_INIT(&rack->r_ctl.opt_list); 12674 rack->r_ctl.rc_reorder_shift = rack_reorder_thresh; 12675 rack->r_ctl.rc_pkt_delay = rack_pkt_delay; 12676 rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp; 12677 rack->r_ctl.rc_lowest_us_rtt = 0xffffffff; 12678 rack->r_ctl.rc_highest_us_rtt = 0; 12679 rack->r_ctl.bw_rate_cap = rack_bw_rate_cap; 12680 rack->r_ctl.timer_slop = TICKS_2_USEC(tcp_rexmit_slop); 12681 if (rack_use_cmp_acks) 12682 rack->r_use_cmp_ack = 1; 12683 if (rack_disable_prr) 12684 rack->rack_no_prr = 1; 12685 if (rack_gp_no_rec_chg) 12686 rack->rc_gp_no_rec_chg = 1; 12687 if (rack_pace_every_seg && tcp_can_enable_pacing()) { 12688 rack->rc_always_pace = 1; 12689 if (rack->use_fixed_rate || rack->gp_ready) 12690 rack_set_cc_pacing(rack); 12691 } else 12692 rack->rc_always_pace = 0; 12693 if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack) 12694 rack->r_mbuf_queue = 1; 12695 else 12696 rack->r_mbuf_queue = 0; 12697 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 12698 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 12699 else 12700 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 12701 rack_set_pace_segments(tp, rack, __LINE__, NULL); 12702 if (rack_limits_scwnd) 12703 rack->r_limit_scw = 1; 12704 else 12705 rack->r_limit_scw = 0; 12706 rack->rc_labc = V_tcp_abc_l_var; 12707 rack->r_ctl.rc_high_rwnd = tp->snd_wnd; 12708 rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 12709 
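/*
 * The remaining defaults below are taken from the module's rack_*
 * sysctl tunables; most of them can still be overridden later on a
 * per-connection basis via socket options.
 */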
rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method; 12710 rack->rack_tlp_threshold_use = rack_tlp_threshold_use; 12711 rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; 12712 rack->r_ctl.rc_min_to = rack_min_to; 12713 microuptime(&rack->r_ctl.act_rcv_time); 12714 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; 12715 rack->r_running_late = 0; 12716 rack->r_running_early = 0; 12717 rack->rc_init_win = rack_default_init_window; 12718 rack->r_ctl.rack_per_of_gp_ss = rack_per_of_gp_ss; 12719 if (rack_hw_up_only) 12720 rack->r_up_only = 1; 12721 if (rack_do_dyn_mul) { 12722 /* When dynamic adjustment is on CA needs to start at 100% */ 12723 rack->rc_gp_dyn_mul = 1; 12724 if (rack_do_dyn_mul >= 100) 12725 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul; 12726 } else 12727 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca; 12728 rack->r_ctl.rack_per_of_gp_rec = rack_per_of_gp_rec; 12729 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 12730 rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time); 12731 setup_time_filter_small(&rack->r_ctl.rc_gp_min_rtt, FILTER_TYPE_MIN, 12732 rack_probertt_filter_life); 12733 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 12734 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 12735 rack->r_ctl.rc_time_of_last_probertt = us_cts; 12736 rack->r_ctl.challenge_ack_ts = tcp_ts_getticks(); 12737 rack->r_ctl.rc_time_probertt_starts = 0; 12738 if (rack_dsack_std_based & 0x1) { 12739 /* Basically this means all rack timers are at least (srtt + 1/4 srtt) */ 12740 rack->rc_rack_tmr_std_based = 1; 12741 } 12742 if (rack_dsack_std_based & 0x2) { 12743 /* Basically this means rack timers are extended based on dsack by up to (2 * srtt) */ 12744 rack->rc_rack_use_dsack = 1; 12745 } 12746 /* We require at least one measurement, even if the sysctl is 0 */ 12747 if (rack_req_measurements) 12748 rack->r_ctl.req_measurements = rack_req_measurements; 12749 else 12750 rack->r_ctl.req_measurements = 1; 12751 if (rack_enable_hw_pacing) 12752 rack->rack_hdw_pace_ena = 1; 12753 if (rack_hw_rate_caps) 12754 rack->r_rack_hw_rate_caps = 1; 12755 /* Do we force on detection? 
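 *
 * (With NETFLIX_EXP_DETECTION compiled in, the tcp_force_detection
 * tunable can force the experimental SACK-attack detection logic on
 * for every connection; otherwise do_detection simply stays off.)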
*/ 12756 #ifdef NETFLIX_EXP_DETECTION 12757 if (tcp_force_detection) 12758 rack->do_detection = 1; 12759 else 12760 #endif 12761 rack->do_detection = 0; 12762 if (rack_non_rxt_use_cr) 12763 rack->rack_rec_nonrxt_use_cr = 1; 12764 err = rack_init_fsb(tp, rack); 12765 if (err) { 12766 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 12767 tp->t_fb_ptr = NULL; 12768 return (err); 12769 } 12770 if (tp->snd_una != tp->snd_max) { 12771 /* Create a send map for the current outstanding data */ 12772 struct rack_sendmap *rsm; 12773 12774 rsm = rack_alloc(rack); 12775 if (rsm == NULL) { 12776 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 12777 tp->t_fb_ptr = NULL; 12778 return (ENOMEM); 12779 } 12780 rsm->r_no_rtt_allowed = 1; 12781 rsm->r_tim_lastsent[0] = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 12782 rsm->r_rtr_cnt = 1; 12783 rsm->r_rtr_bytes = 0; 12784 if (tp->t_flags & TF_SENTFIN) { 12785 rsm->r_end = tp->snd_max - 1; 12786 rsm->r_flags |= RACK_HAS_FIN; 12787 } else { 12788 rsm->r_end = tp->snd_max; 12789 } 12790 if (tp->snd_una == tp->iss) { 12791 /* The data space is one beyond snd_una */ 12792 rsm->r_flags |= RACK_HAS_SYN; 12793 rsm->r_start = tp->iss; 12794 rsm->r_end = rsm->r_start + (tp->snd_max - tp->snd_una); 12795 } else 12796 rsm->r_start = tp->snd_una; 12797 rsm->r_dupack = 0; 12798 if (rack->rc_inp->inp_socket->so_snd.sb_mb != NULL) { 12799 rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 0, &rsm->soff); 12800 if (rsm->m) 12801 rsm->orig_m_len = rsm->m->m_len; 12802 else 12803 rsm->orig_m_len = 0; 12804 } else { 12805 /* 12806 * This can happen if we have a stand-alone FIN or 12807 * SYN. 12808 */ 12809 rsm->m = NULL; 12810 rsm->orig_m_len = 0; 12811 rsm->soff = 0; 12812 } 12813 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 12814 #ifdef INVARIANTS 12815 if (insret != NULL) { 12816 panic("Insert in rb tree fails ret:%p rack:%p rsm:%p", 12817 insret, rack, rsm); 12818 } 12819 #endif 12820 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 12821 rsm->r_in_tmap = 1; 12822 } 12823 /* 12824 * Timers in Rack are kept in microseconds so lets 12825 * convert any initial incoming variables 12826 * from ticks into usecs. Note that we 12827 * also change the values of t_srtt and t_rttvar, if 12828 * they are non-zero. They are kept with a 5 12829 * bit decimal so we have to carefully convert 12830 * these to get the full precision. 12831 */ 12832 rack_convert_rtts(tp); 12833 tp->t_rttlow = TICKS_2_USEC(tp->t_rttlow); 12834 if (rack_do_hystart) { 12835 struct sockopt sopt; 12836 struct cc_newreno_opts opt; 12837 12838 sopt.sopt_valsize = sizeof(struct cc_newreno_opts); 12839 sopt.sopt_dir = SOPT_SET; 12840 opt.name = CC_NEWRENO_ENABLE_HYSTART; 12841 opt.val = rack_do_hystart; 12842 if (CC_ALGO(tp)->ctl_output != NULL) 12843 (void)CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt); 12844 } 12845 if (rack_def_profile) 12846 rack_set_profile(rack, rack_def_profile); 12847 /* Cancel the GP measurement in progress */ 12848 tp->t_flags &= ~TF_GPUTINPROG; 12849 if (SEQ_GT(tp->snd_max, tp->iss)) 12850 snt = tp->snd_max - tp->iss; 12851 else 12852 snt = 0; 12853 iwin = rc_init_window(rack); 12854 if (snt < iwin) { 12855 /* We are not past the initial window 12856 * so we need to make sure cwnd is 12857 * correct. 12858 */ 12859 if (tp->snd_cwnd < iwin) 12860 tp->snd_cwnd = iwin; 12861 /* 12862 * If we are within the initial window 12863 * we want ssthresh to be unlimited. 
Setting 12864 * it to the rwnd (which the default stack does 12865 * and older racks) is not really a good idea 12866 * since we want to be in SS and grow both the 12867 * cwnd and the rwnd (via dynamic rwnd growth). If 12868 * we set it to the rwnd then as the peer grows its 12869 * rwnd we will be stuck in CA and never hit SS. 12870 * 12871 * Its far better to raise it up high (this takes the 12872 * risk that there as been a loss already, probably 12873 * we should have an indicator in all stacks of loss 12874 * but we don't), but considering the normal use this 12875 * is a risk worth taking. The consequences of not 12876 * hitting SS are far worse than going one more time 12877 * into it early on (before we have sent even a IW). 12878 * It is highly unlikely that we will have had a loss 12879 * before getting the IW out. 12880 */ 12881 tp->snd_ssthresh = 0xffffffff; 12882 } 12883 rack_stop_all_timers(tp); 12884 /* Lets setup the fsb block */ 12885 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); 12886 rack_log_rtt_shrinks(rack, us_cts, tp->t_rxtcur, 12887 __LINE__, RACK_RTTS_INIT); 12888 return (0); 12889 } 12890 12891 static int 12892 rack_handoff_ok(struct tcpcb *tp) 12893 { 12894 if ((tp->t_state == TCPS_CLOSED) || 12895 (tp->t_state == TCPS_LISTEN)) { 12896 /* Sure no problem though it may not stick */ 12897 return (0); 12898 } 12899 if ((tp->t_state == TCPS_SYN_SENT) || 12900 (tp->t_state == TCPS_SYN_RECEIVED)) { 12901 /* 12902 * We really don't know if you support sack, 12903 * you have to get to ESTAB or beyond to tell. 12904 */ 12905 return (EAGAIN); 12906 } 12907 if ((tp->t_flags & TF_SENTFIN) && ((tp->snd_max - tp->snd_una) > 1)) { 12908 /* 12909 * Rack will only send a FIN after all data is acknowledged. 12910 * So in this case we have more data outstanding. We can't 12911 * switch stacks until either all data and only the FIN 12912 * is left (in which case rack_init() now knows how 12913 * to deal with that) <or> all is acknowledged and we 12914 * are only left with incoming data, though why you 12915 * would want to switch to rack after all data is acknowledged 12916 * I have no idea (rrs)! 12917 */ 12918 return (EAGAIN); 12919 } 12920 if ((tp->t_flags & TF_SACK_PERMIT) || rack_sack_not_required){ 12921 return (0); 12922 } 12923 /* 12924 * If we reach here we don't do SACK on this connection so we can 12925 * never do rack. 12926 */ 12927 return (EINVAL); 12928 } 12929 12930 12931 static void 12932 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) 12933 { 12934 int ack_cmp = 0; 12935 12936 if (tp->t_fb_ptr) { 12937 struct tcp_rack *rack; 12938 struct rack_sendmap *rsm, *nrsm, *rm; 12939 12940 rack = (struct tcp_rack *)tp->t_fb_ptr; 12941 if (tp->t_in_pkt) { 12942 /* 12943 * It is unsafe to process the packets since a 12944 * reset may be lurking in them (its rare but it 12945 * can occur). If we were to find a RST, then we 12946 * would end up dropping the connection and the 12947 * INP lock, so when we return the caller (tcp_usrreq) 12948 * will blow up when it trys to unlock the inp. 12949 */ 12950 struct mbuf *save, *m; 12951 12952 m = tp->t_in_pkt; 12953 tp->t_in_pkt = NULL; 12954 tp->t_tail_pkt = NULL; 12955 while (m) { 12956 save = m->m_nextpkt; 12957 m->m_nextpkt = NULL; 12958 m_freem(m); 12959 m = save; 12960 } 12961 if ((tp->t_inpcb) && 12962 (tp->t_inpcb->inp_flags2 & INP_MBUF_ACKCMP)) 12963 ack_cmp = 1; 12964 if (ack_cmp) { 12965 /* Total if we used large or small (if ack-cmp was used). 
*/ 12966 if (rack->rc_inp->inp_flags2 & INP_MBUF_L_ACKS) 12967 counter_u64_add(rack_large_ackcmp, 1); 12968 else 12969 counter_u64_add(rack_small_ackcmp, 1); 12970 } 12971 } 12972 tp->t_flags &= ~TF_FORCEDATA; 12973 #ifdef NETFLIX_SHARED_CWND 12974 if (rack->r_ctl.rc_scw) { 12975 uint32_t limit; 12976 12977 if (rack->r_limit_scw) 12978 limit = max(1, rack->r_ctl.rc_lowest_us_rtt); 12979 else 12980 limit = 0; 12981 tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw, 12982 rack->r_ctl.rc_scw_index, 12983 limit); 12984 rack->r_ctl.rc_scw = NULL; 12985 } 12986 #endif 12987 if (rack->r_ctl.fsb.tcp_ip_hdr) { 12988 free(rack->r_ctl.fsb.tcp_ip_hdr, M_TCPFSB); 12989 rack->r_ctl.fsb.tcp_ip_hdr = NULL; 12990 rack->r_ctl.fsb.th = NULL; 12991 } 12992 /* Convert back to ticks, with */ 12993 if (tp->t_srtt > 1) { 12994 uint32_t val, frac; 12995 12996 val = USEC_2_TICKS(tp->t_srtt); 12997 frac = tp->t_srtt % (HPTS_USEC_IN_SEC / hz); 12998 tp->t_srtt = val << TCP_RTT_SHIFT; 12999 /* 13000 * frac is the fractional part here is left 13001 * over from converting to hz and shifting. 13002 * We need to convert this to the 5 bit 13003 * remainder. 13004 */ 13005 if (frac) { 13006 if (hz == 1000) { 13007 frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC); 13008 } else { 13009 frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC); 13010 } 13011 tp->t_srtt += frac; 13012 } 13013 } 13014 if (tp->t_rttvar) { 13015 uint32_t val, frac; 13016 13017 val = USEC_2_TICKS(tp->t_rttvar); 13018 frac = tp->t_srtt % (HPTS_USEC_IN_SEC / hz); 13019 tp->t_rttvar = val << TCP_RTTVAR_SHIFT; 13020 /* 13021 * frac is the fractional part here is left 13022 * over from converting to hz and shifting. 13023 * We need to convert this to the 5 bit 13024 * remainder. 
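 *
 * For illustration, using the srtt conversion just above and
 * assuming hz == 1000 with the stock TCP_RTT_SHIFT of 5 and
 * TCP_RTT_SCALE of 32: an srtt of 40500 usec gives val = 40 ticks
 * and frac = 500 usec; 500 * 32 / 1000 = 16 is then added to
 * 40 << 5 = 1280, yielding 1296, i.e. 40.5 ticks in the fixed-point
 * form the default stack keeps.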
13025 */ 13026 if (frac) { 13027 if (hz == 1000) { 13028 frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC); 13029 } else { 13030 frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC); 13031 } 13032 tp->t_rttvar += frac; 13033 } 13034 } 13035 tp->t_rxtcur = USEC_2_TICKS(tp->t_rxtcur); 13036 tp->t_rttlow = USEC_2_TICKS(tp->t_rttlow); 13037 if (rack->rc_always_pace) { 13038 tcp_decrement_paced_conn(); 13039 rack_undo_cc_pacing(rack); 13040 rack->rc_always_pace = 0; 13041 } 13042 /* Clean up any options if they were not applied */ 13043 while (!TAILQ_EMPTY(&rack->r_ctl.opt_list)) { 13044 struct deferred_opt_list *dol; 13045 13046 dol = TAILQ_FIRST(&rack->r_ctl.opt_list); 13047 TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next); 13048 free(dol, M_TCPDO); 13049 } 13050 /* rack does not use force data but other stacks may clear it */ 13051 if (rack->r_ctl.crte != NULL) { 13052 tcp_rel_pacing_rate(rack->r_ctl.crte, tp); 13053 rack->rack_hdrw_pacing = 0; 13054 rack->r_ctl.crte = NULL; 13055 } 13056 #ifdef TCP_BLACKBOX 13057 tcp_log_flowend(tp); 13058 #endif 13059 RB_FOREACH_SAFE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm) { 13060 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 13061 #ifdef INVARIANTS 13062 if (rm != rsm) { 13063 panic("At fini, rack:%p rsm:%p rm:%p", 13064 rack, rsm, rm); 13065 } 13066 #endif 13067 uma_zfree(rack_zone, rsm); 13068 } 13069 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 13070 while (rsm) { 13071 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 13072 uma_zfree(rack_zone, rsm); 13073 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 13074 } 13075 rack->rc_free_cnt = 0; 13076 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 13077 tp->t_fb_ptr = NULL; 13078 } 13079 if (tp->t_inpcb) { 13080 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 13081 tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY; 13082 tp->t_inpcb->inp_flags2 &= ~INP_DONT_SACK_QUEUE; 13083 tp->t_inpcb->inp_flags2 &= ~INP_MBUF_ACKCMP; 13084 /* Cancel the GP measurement in progress */ 13085 tp->t_flags &= ~TF_GPUTINPROG; 13086 tp->t_inpcb->inp_flags2 &= ~INP_MBUF_L_ACKS; 13087 } 13088 /* Make sure snd_nxt is correctly set */ 13089 tp->snd_nxt = tp->snd_max; 13090 } 13091 13092 static void 13093 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) 13094 { 13095 if ((rack->r_state == TCPS_CLOSED) && (tp->t_state != TCPS_CLOSED)) { 13096 rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; 13097 } 13098 switch (tp->t_state) { 13099 case TCPS_SYN_SENT: 13100 rack->r_state = TCPS_SYN_SENT; 13101 rack->r_substate = rack_do_syn_sent; 13102 break; 13103 case TCPS_SYN_RECEIVED: 13104 rack->r_state = TCPS_SYN_RECEIVED; 13105 rack->r_substate = rack_do_syn_recv; 13106 break; 13107 case TCPS_ESTABLISHED: 13108 rack_set_pace_segments(tp, rack, __LINE__, NULL); 13109 rack->r_state = TCPS_ESTABLISHED; 13110 rack->r_substate = rack_do_established; 13111 break; 13112 case TCPS_CLOSE_WAIT: 13113 rack_set_pace_segments(tp, rack, __LINE__, NULL); 13114 rack->r_state = TCPS_CLOSE_WAIT; 13115 rack->r_substate = rack_do_close_wait; 13116 break; 13117 case TCPS_FIN_WAIT_1: 13118 rack_set_pace_segments(tp, rack, __LINE__, NULL); 13119 rack->r_state = TCPS_FIN_WAIT_1; 13120 rack->r_substate = rack_do_fin_wait_1; 13121 break; 13122 case TCPS_CLOSING: 13123 rack_set_pace_segments(tp, rack, __LINE__, NULL); 13124 rack->r_state = TCPS_CLOSING; 13125 rack->r_substate = rack_do_closing; 13126 break; 13127 case TCPS_LAST_ACK: 13128 rack_set_pace_segments(tp, rack, 
__LINE__, NULL); 13129 rack->r_state = TCPS_LAST_ACK; 13130 rack->r_substate = rack_do_lastack; 13131 break; 13132 case TCPS_FIN_WAIT_2: 13133 rack_set_pace_segments(tp, rack, __LINE__, NULL); 13134 rack->r_state = TCPS_FIN_WAIT_2; 13135 rack->r_substate = rack_do_fin_wait_2; 13136 break; 13137 case TCPS_LISTEN: 13138 case TCPS_CLOSED: 13139 case TCPS_TIME_WAIT: 13140 default: 13141 break; 13142 }; 13143 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) 13144 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; 13145 13146 } 13147 13148 static void 13149 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) 13150 { 13151 /* 13152 * We received an ack, and then did not 13153 * call send or were bounced out due to the 13154 * hpts was running. Now a timer is up as well, is 13155 * it the right timer? 13156 */ 13157 struct rack_sendmap *rsm; 13158 int tmr_up; 13159 13160 tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 13161 if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT)) 13162 return; 13163 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 13164 if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) && 13165 (tmr_up == PACE_TMR_RXT)) { 13166 /* Should be an RXT */ 13167 return; 13168 } 13169 if (rsm == NULL) { 13170 /* Nothing outstanding? */ 13171 if (tp->t_flags & TF_DELACK) { 13172 if (tmr_up == PACE_TMR_DELACK) 13173 /* We are supposed to have delayed ack up and we do */ 13174 return; 13175 } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) { 13176 /* 13177 * if we hit enobufs then we would expect the possiblity 13178 * of nothing outstanding and the RXT up (and the hptsi timer). 13179 */ 13180 return; 13181 } else if (((V_tcp_always_keepalive || 13182 rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 13183 (tp->t_state <= TCPS_CLOSING)) && 13184 (tmr_up == PACE_TMR_KEEP) && 13185 (tp->snd_max == tp->snd_una)) { 13186 /* We should have keep alive up and we do */ 13187 return; 13188 } 13189 } 13190 if (SEQ_GT(tp->snd_max, tp->snd_una) && 13191 ((tmr_up == PACE_TMR_TLP) || 13192 (tmr_up == PACE_TMR_RACK) || 13193 (tmr_up == PACE_TMR_RXT))) { 13194 /* 13195 * Either a Rack, TLP or RXT is fine if we 13196 * have outstanding data. 13197 */ 13198 return; 13199 } else if (tmr_up == PACE_TMR_DELACK) { 13200 /* 13201 * If the delayed ack was going to go off 13202 * before the rtx/tlp/rack timer were going to 13203 * expire, then that would be the timer in control. 13204 * Note we don't check the time here trusting the 13205 * code is correct. 13206 */ 13207 return; 13208 } 13209 /* 13210 * Ok the timer originally started is not what we want now. 13211 * We will force the hpts to be stopped if any, and restart 13212 * with the slot set to what was in the saved slot. 
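 *
 * If a pacing (PACE_PKT_OUTPUT) deadline is still in the future we
 * first record how early we are tearing it down in rc_agg_early,
 * then drop out of the hpts, cancel the old timer and start a fresh
 * one.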
13213 */ 13214 if (tcp_in_hpts(rack->rc_inp)) { 13215 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 13216 uint32_t us_cts; 13217 13218 us_cts = tcp_get_usecs(NULL); 13219 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 13220 rack->r_early = 1; 13221 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); 13222 } 13223 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 13224 } 13225 tcp_hpts_remove(tp->t_inpcb); 13226 } 13227 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 13228 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); 13229 } 13230 13231 13232 static void 13233 rack_do_win_updates(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tiwin, uint32_t seq, uint32_t ack, uint32_t cts, uint32_t high_seq) 13234 { 13235 if ((SEQ_LT(tp->snd_wl1, seq) || 13236 (tp->snd_wl1 == seq && (SEQ_LT(tp->snd_wl2, ack) || 13237 (tp->snd_wl2 == ack && tiwin > tp->snd_wnd))))) { 13238 /* keep track of pure window updates */ 13239 if ((tp->snd_wl2 == ack) && (tiwin > tp->snd_wnd)) 13240 KMOD_TCPSTAT_INC(tcps_rcvwinupd); 13241 tp->snd_wnd = tiwin; 13242 rack_validate_fo_sendwin_up(tp, rack); 13243 tp->snd_wl1 = seq; 13244 tp->snd_wl2 = ack; 13245 if (tp->snd_wnd > tp->max_sndwnd) 13246 tp->max_sndwnd = tp->snd_wnd; 13247 rack->r_wanted_output = 1; 13248 } else if ((tp->snd_wl2 == ack) && (tiwin < tp->snd_wnd)) { 13249 tp->snd_wnd = tiwin; 13250 rack_validate_fo_sendwin_up(tp, rack); 13251 tp->snd_wl1 = seq; 13252 tp->snd_wl2 = ack; 13253 } else { 13254 /* Not a valid win update */ 13255 return; 13256 } 13257 if (tp->snd_wnd > tp->max_sndwnd) 13258 tp->max_sndwnd = tp->snd_wnd; 13259 if (tp->snd_wnd < (tp->snd_max - high_seq)) { 13260 /* The peer collapsed the window */ 13261 rack_collapsed_window(rack); 13262 } else if (rack->rc_has_collapsed) 13263 rack_un_collapse_window(rack); 13264 /* Do we exit persists? */ 13265 if ((rack->rc_in_persist != 0) && 13266 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 13267 rack->r_ctl.rc_pace_min_segs))) { 13268 rack_exit_persist(tp, rack, cts); 13269 } 13270 /* Do we enter persists? */ 13271 if ((rack->rc_in_persist == 0) && 13272 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 13273 TCPS_HAVEESTABLISHED(tp->t_state) && 13274 (tp->snd_max == tp->snd_una) && 13275 sbavail(&tp->t_inpcb->inp_socket->so_snd) && 13276 (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { 13277 /* 13278 * Here the rwnd is less than 13279 * the pacing size, we are established, 13280 * nothing is outstanding, and there is 13281 * data to send. Enter persists. 
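* As a rough example of that threshold (assuming rc_pace_min_segs
* is the minimum pacing size in bytes, on the order of one MSS):
* with rc_high_rwnd of 64000 bytes and rc_pace_min_segs of 1448 we
* enter persists once the offered window falls below
* min(32000, 1448) = 1448 bytes while more data than that window
* remains queued in the send buffer.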
13282 */
13283 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime);
13284 }
13285 }
13286
13287 static void
13288 rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent *ae, int ackval, uint32_t high_seq)
13289 {
13290
13291 if (tp->t_logstate != TCP_LOG_STATE_OFF) {
13292 union tcp_log_stackspecific log;
13293 struct timeval ltv;
13294 char tcp_hdr_buf[60];
13295 struct tcphdr *th;
13296 struct timespec ts;
13297 uint32_t orig_snd_una;
13298 uint8_t xx = 0;
13299
13300 #ifdef NETFLIX_HTTP_LOGGING
13301 struct http_sendfile_track *http_req;
13302
13303 if (SEQ_GT(ae->ack, tp->snd_una)) {
13304 http_req = tcp_http_find_req_for_seq(tp, (ae->ack-1));
13305 } else {
13306 http_req = tcp_http_find_req_for_seq(tp, ae->ack);
13307 }
13308 #endif
13309 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
13310 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
13311 if (rack->rack_no_prr == 0)
13312 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
13313 else
13314 log.u_bbr.flex1 = 0;
13315 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
13316 log.u_bbr.use_lt_bw <<= 1;
13317 log.u_bbr.use_lt_bw |= rack->r_might_revert;
13318 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced;
13319 log.u_bbr.inflight = ctf_flight_size(tp, rack->r_ctl.rc_sacked);
13320 log.u_bbr.pkts_out = tp->t_maxseg;
13321 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
13322 log.u_bbr.flex7 = 1;
13323 log.u_bbr.lost = ae->flags;
13324 log.u_bbr.cwnd_gain = ackval;
13325 log.u_bbr.pacing_gain = 0x2;
13326 if (ae->flags & TSTMP_HDWR) {
13327 /* Record the hardware timestamp if present */
13328 log.u_bbr.flex3 = M_TSTMP;
13329 ts.tv_sec = ae->timestamp / 1000000000;
13330 ts.tv_nsec = ae->timestamp % 1000000000;
13331 ltv.tv_sec = ts.tv_sec;
13332 ltv.tv_usec = ts.tv_nsec / 1000;
13333 log.u_bbr.lt_epoch = tcp_tv_to_usectick(&ltv);
13334 } else if (ae->flags & TSTMP_LRO) {
13335 /* Record the LRO arrival timestamp */
13336 log.u_bbr.flex3 = M_TSTMP_LRO;
13337 ts.tv_sec = ae->timestamp / 1000000000;
13338 ts.tv_nsec = ae->timestamp % 1000000000;
13339 ltv.tv_sec = ts.tv_sec;
13340 ltv.tv_usec = ts.tv_nsec / 1000;
13341 log.u_bbr.flex5 = tcp_tv_to_usectick(&ltv);
13342 }
13343 log.u_bbr.timeStamp = tcp_get_usecs(&ltv);
13344 /* Log the rcv time */
13345 log.u_bbr.delRate = ae->timestamp;
13346 #ifdef NETFLIX_HTTP_LOGGING
13347 log.u_bbr.applimited = tp->t_http_closed;
13348 log.u_bbr.applimited <<= 8;
13349 log.u_bbr.applimited |= tp->t_http_open;
13350 log.u_bbr.applimited <<= 8;
13351 log.u_bbr.applimited |= tp->t_http_req;
13352 if (http_req) {
13353 /* Copy out any client req info */
13354 /* seconds */
13355 log.u_bbr.pkt_epoch = (http_req->localtime / HPTS_USEC_IN_SEC);
13356 /* useconds */
13357 log.u_bbr.delivered = (http_req->localtime % HPTS_USEC_IN_SEC);
13358 log.u_bbr.rttProp = http_req->timestamp;
13359 log.u_bbr.cur_del_rate = http_req->start;
13360 if (http_req->flags & TCP_HTTP_TRACK_FLG_OPEN) {
13361 log.u_bbr.flex8 |= 1;
13362 } else {
13363 log.u_bbr.flex8 |= 2;
13364 log.u_bbr.bw_inuse = http_req->end;
13365 }
13366 log.u_bbr.flex6 = http_req->start_seq;
13367 if (http_req->flags & TCP_HTTP_TRACK_FLG_COMP) {
13368 log.u_bbr.flex8 |= 4;
13369 log.u_bbr.epoch = http_req->end_seq;
13370 }
13371 }
13372 #endif
13373 memset(tcp_hdr_buf, 0, sizeof(tcp_hdr_buf));
13374 th = (struct tcphdr *)tcp_hdr_buf;
13375 th->th_seq = ae->seq;
13376 th->th_ack = ae->ack;
13377 th->th_win = ae->win;
13378 /* Now fill in the ports */
13379 th->th_sport = tp->t_inpcb->inp_fport;
13380 th->th_dport = tp->t_inpcb->inp_lport;
13381 th->th_flags = ae->flags & 0xff;
13382 /* Now do we have a timestamp option? */
13383 if (ae->flags & HAS_TSTMP) {
13384 u_char *cp;
13385 uint32_t val;
13386
13387 th->th_off = ((sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA) >> 2);
13388 cp = (u_char *)(th + 1);
13389 *cp = TCPOPT_NOP;
13390 cp++;
13391 *cp = TCPOPT_NOP;
13392 cp++;
13393 *cp = TCPOPT_TIMESTAMP;
13394 cp++;
13395 *cp = TCPOLEN_TIMESTAMP;
13396 cp++;
13397 val = htonl(ae->ts_value);
13398 bcopy((char *)&val,
13399 (char *)cp, sizeof(uint32_t));
13400 val = htonl(ae->ts_echo);
13401 bcopy((char *)&val,
13402 (char *)(cp + 4), sizeof(uint32_t));
13403 } else
13404 th->th_off = (sizeof(struct tcphdr) >> 2);
13405
13406 /*
13407 * For sane logging we need to play a little trick.
13408 * If the ack were fully processed we would have moved
13409 * snd_una to high_seq, but since compressed acks are
13410 * processed in two phases, at this point (logging) snd_una
13411 * won't be advanced. So we would see multiple acks showing
13412 * the advancement. We can prevent that by "pretending" that
13413 * snd_una was advanced and then un-advancing it so that the
13414 * logging code has the right value for tlb_snd_una.
13415 */
13416 if (tp->snd_una != high_seq) {
13417 orig_snd_una = tp->snd_una;
13418 tp->snd_una = high_seq;
13419 xx = 1;
13420 } else
13421 xx = 0;
13422 TCP_LOG_EVENTP(tp, th,
13423 &tp->t_inpcb->inp_socket->so_rcv,
13424 &tp->t_inpcb->inp_socket->so_snd, TCP_LOG_IN, 0,
13425 0, &log, true, &ltv);
13426 if (xx) {
13427 tp->snd_una = orig_snd_una;
13428 }
13429 }
13430
13431 }
13432
13433 static void
13434 rack_handle_probe_response(struct tcp_rack *rack, uint32_t tiwin, uint32_t us_cts)
13435 {
13436 uint32_t us_rtt;
13437 /*
13438 * A persist or keep-alive was forced out, update our
13439 * min rtt time. Note we now worry about lost responses.
13440 * When a subsequent keep-alive or persist times out
13441 * and forced_ack is still on, then the last probe
13442 * was not responded to. In such cases we have a
13443 * sysctl that controls the behavior. Either we apply
13444 * the rtt but with reduced confidence (0), or we just
13445 * plain don't apply the rtt estimate. Having data flow
13446 * will clear the probe_not_answered flag i.e. the cum-ack
13447 * moving forward <or> exiting and reentering persists.
13448 */
13449
13450 rack->forced_ack = 0;
13451 rack->rc_tp->t_rxtshift = 0;
13452 if ((rack->rc_in_persist &&
13453 (tiwin == rack->rc_tp->snd_wnd)) ||
13454 (rack->rc_in_persist == 0)) {
13455 /*
13456 * In persists only apply the RTT update if this is
13457 * a response to our window probe. And that
13458 * means the rwnd sent must match the current
13459 * snd_wnd. If it does not, then we got a
13460 * window update ack instead. For keepalive
13461 * we allow the answer no matter what the window.
13462 *
13463 * Note that if the probe_not_answered is set then
13464 * the forced_ack_ts is the oldest one i.e. the first
13465 * probe sent that might have been lost. This assures
13466 * us that if we do calculate an RTT it is longer, not
13467 * some short thing.
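* In short, a probe response with no possibly-lost predecessor is
* fed to the RTT machinery below with full confidence (3), while one
* that follows an unanswered probe is only applied when
* rack_apply_rtt_with_reduced_conf is enabled, and then with
* confidence 0.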
13468 */ 13469 if (rack->rc_in_persist) 13470 counter_u64_add(rack_persists_acks, 1); 13471 us_rtt = us_cts - rack->r_ctl.forced_ack_ts; 13472 if (us_rtt == 0) 13473 us_rtt = 1; 13474 if (rack->probe_not_answered == 0) { 13475 rack_apply_updated_usrtt(rack, us_rtt, us_cts); 13476 tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 3, NULL, 1); 13477 } else { 13478 /* We have a retransmitted probe here too */ 13479 if (rack_apply_rtt_with_reduced_conf) { 13480 rack_apply_updated_usrtt(rack, us_rtt, us_cts); 13481 tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 0, NULL, 1); 13482 } 13483 } 13484 } 13485 } 13486 13487 13488 static int 13489 rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mbuf *m, int nxt_pkt, struct timeval *tv) 13490 { 13491 /* 13492 * Handle a "special" compressed ack mbuf. Each incoming 13493 * ack has only four possible dispositions: 13494 * 13495 * A) It moves the cum-ack forward 13496 * B) It is behind the cum-ack. 13497 * C) It is a window-update ack. 13498 * D) It is a dup-ack. 13499 * 13500 * Note that we can have between 1 -> TCP_COMP_ACK_ENTRIES 13501 * in the incoming mbuf. We also need to still pay attention 13502 * to nxt_pkt since there may be another packet after this 13503 * one. 13504 */ 13505 #ifdef TCP_ACCOUNTING 13506 uint64_t ts_val; 13507 uint64_t rdstc; 13508 #endif 13509 int segsiz; 13510 struct timespec ts; 13511 struct tcp_rack *rack; 13512 struct tcp_ackent *ae; 13513 uint32_t tiwin, ms_cts, cts, acked, acked_amount, high_seq, win_seq, the_win, win_upd_ack; 13514 int cnt, i, did_out, ourfinisacked = 0; 13515 struct tcpopt to_holder, *to = NULL; 13516 int win_up_req = 0; 13517 int nsegs = 0; 13518 int under_pacing = 1; 13519 int recovery = 0; 13520 int idx; 13521 #ifdef TCP_ACCOUNTING 13522 sched_pin(); 13523 #endif 13524 rack = (struct tcp_rack *)tp->t_fb_ptr; 13525 if (rack->gp_ready && 13526 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) 13527 under_pacing = 0; 13528 else 13529 under_pacing = 1; 13530 13531 if (rack->r_state != tp->t_state) 13532 rack_set_state(tp, rack); 13533 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 13534 (tp->t_flags & TF_GPUTINPROG)) { 13535 /* 13536 * We have a goodput in progress 13537 * and we have entered a late state. 13538 * Do we have enough data in the sb 13539 * to handle the GPUT request? 13540 */ 13541 uint32_t bytes; 13542 13543 bytes = tp->gput_ack - tp->gput_seq; 13544 if (SEQ_GT(tp->gput_seq, tp->snd_una)) 13545 bytes += tp->gput_seq - tp->snd_una; 13546 if (bytes > sbavail(&tp->t_inpcb->inp_socket->so_snd)) { 13547 /* 13548 * There are not enough bytes in the socket 13549 * buffer that have been sent to cover this 13550 * measurement. Cancel it. 
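* (What is being checked: bytes = gput_ack - gput_seq, plus
* gput_seq - snd_una when the measurement window starts beyond the
* current cum-ack; if the send buffer holds less than that, the
* measurement could never complete, so it is dropped.)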
13551 */ 13552 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 13553 rack->r_ctl.rc_gp_srtt /*flex1*/, 13554 tp->gput_seq, 13555 0, 0, 18, __LINE__, NULL, 0); 13556 tp->t_flags &= ~TF_GPUTINPROG; 13557 } 13558 } 13559 to = &to_holder; 13560 to->to_flags = 0; 13561 KASSERT((m->m_len >= sizeof(struct tcp_ackent)), 13562 ("tp:%p m_cmpack:%p with invalid len:%u", tp, m, m->m_len)); 13563 cnt = m->m_len / sizeof(struct tcp_ackent); 13564 idx = cnt / 5; 13565 if (idx >= MAX_NUM_OF_CNTS) 13566 idx = MAX_NUM_OF_CNTS - 1; 13567 counter_u64_add(rack_proc_comp_ack[idx], 1); 13568 counter_u64_add(rack_multi_single_eq, cnt); 13569 high_seq = tp->snd_una; 13570 the_win = tp->snd_wnd; 13571 win_seq = tp->snd_wl1; 13572 win_upd_ack = tp->snd_wl2; 13573 cts = tcp_tv_to_usectick(tv); 13574 ms_cts = tcp_tv_to_mssectick(tv); 13575 segsiz = ctf_fixed_maxseg(tp); 13576 if ((rack->rc_gp_dyn_mul) && 13577 (rack->use_fixed_rate == 0) && 13578 (rack->rc_always_pace)) { 13579 /* Check in on probertt */ 13580 rack_check_probe_rtt(rack, cts); 13581 } 13582 for (i = 0; i < cnt; i++) { 13583 #ifdef TCP_ACCOUNTING 13584 ts_val = get_cyclecount(); 13585 #endif 13586 rack_clear_rate_sample(rack); 13587 ae = ((mtod(m, struct tcp_ackent *)) + i); 13588 /* Setup the window */ 13589 tiwin = ae->win << tp->snd_scale; 13590 /* figure out the type of ack */ 13591 if (SEQ_LT(ae->ack, high_seq)) { 13592 /* Case B*/ 13593 ae->ack_val_set = ACK_BEHIND; 13594 } else if (SEQ_GT(ae->ack, high_seq)) { 13595 /* Case A */ 13596 ae->ack_val_set = ACK_CUMACK; 13597 } else if ((tiwin == the_win) && (rack->rc_in_persist == 0)){ 13598 /* Case D */ 13599 ae->ack_val_set = ACK_DUPACK; 13600 } else { 13601 /* Case C */ 13602 ae->ack_val_set = ACK_RWND; 13603 } 13604 rack_log_input_packet(tp, rack, ae, ae->ack_val_set, high_seq); 13605 /* Validate timestamp */ 13606 if (ae->flags & HAS_TSTMP) { 13607 /* Setup for a timestamp */ 13608 to->to_flags = TOF_TS; 13609 ae->ts_echo -= tp->ts_offset; 13610 to->to_tsecr = ae->ts_echo; 13611 to->to_tsval = ae->ts_value; 13612 /* 13613 * If echoed timestamp is later than the current time, fall back to 13614 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 13615 * were used when this connection was established. 13616 */ 13617 if (TSTMP_GT(ae->ts_echo, ms_cts)) 13618 to->to_tsecr = 0; 13619 if (tp->ts_recent && 13620 TSTMP_LT(ae->ts_value, tp->ts_recent)) { 13621 if (ctf_ts_check_ac(tp, (ae->flags & 0xff))) { 13622 #ifdef TCP_ACCOUNTING 13623 rdstc = get_cyclecount(); 13624 if (rdstc > ts_val) { 13625 counter_u64_add(tcp_proc_time[ae->ack_val_set] , 13626 (rdstc - ts_val)); 13627 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13628 tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val); 13629 } 13630 } 13631 #endif 13632 continue; 13633 } 13634 } 13635 if (SEQ_LEQ(ae->seq, tp->last_ack_sent) && 13636 SEQ_LEQ(tp->last_ack_sent, ae->seq)) { 13637 tp->ts_recent_age = tcp_ts_getticks(); 13638 tp->ts_recent = ae->ts_value; 13639 } 13640 } else { 13641 /* Setup for a no options */ 13642 to->to_flags = 0; 13643 } 13644 /* Update the rcv time and perform idle reduction possibly */ 13645 if (tp->t_idle_reduce && 13646 (tp->snd_max == tp->snd_una) && 13647 ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { 13648 counter_u64_add(rack_input_idle_reduces, 1); 13649 rack_cc_after_idle(rack, tp); 13650 } 13651 tp->t_rcvtime = ticks; 13652 /* Now what about ECN? 
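* This mirrors the ECN handling in the non-compressed input path
* later in this file: a CWR from the peer clears our pending ECE, a
* CE codepoint makes us set ECE, and an ECE from the peer is treated
* as a congestion signal.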
*/ 13653 if (tp->t_flags2 & TF2_ECN_PERMIT) { 13654 if (ae->flags & TH_CWR) { 13655 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 13656 tp->t_flags |= TF_ACKNOW; 13657 } 13658 switch (ae->codepoint & IPTOS_ECN_MASK) { 13659 case IPTOS_ECN_CE: 13660 tp->t_flags2 |= TF2_ECN_SND_ECE; 13661 KMOD_TCPSTAT_INC(tcps_ecn_ce); 13662 break; 13663 case IPTOS_ECN_ECT0: 13664 KMOD_TCPSTAT_INC(tcps_ecn_ect0); 13665 break; 13666 case IPTOS_ECN_ECT1: 13667 KMOD_TCPSTAT_INC(tcps_ecn_ect1); 13668 break; 13669 } 13670 13671 /* Process a packet differently from RFC3168. */ 13672 cc_ecnpkt_handler_flags(tp, ae->flags, ae->codepoint); 13673 /* Congestion experienced. */ 13674 if (ae->flags & TH_ECE) { 13675 rack_cong_signal(tp, CC_ECN, ae->ack); 13676 } 13677 } 13678 #ifdef TCP_ACCOUNTING 13679 /* Count for the specific type of ack in */ 13680 counter_u64_add(tcp_cnt_counters[ae->ack_val_set], 1); 13681 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13682 tp->tcp_cnt_counters[ae->ack_val_set]++; 13683 } 13684 #endif 13685 /* 13686 * Note how we could move up these in the determination 13687 * above, but we don't so that way the timestamp checks (and ECN) 13688 * is done first before we do any processing on the ACK. 13689 * The non-compressed path through the code has this 13690 * weakness (noted by @jtl) that it actually does some 13691 * processing before verifying the timestamp information. 13692 * We don't take that path here which is why we set 13693 * the ack_val_set first, do the timestamp and ecn 13694 * processing, and then look at what we have setup. 13695 */ 13696 if (ae->ack_val_set == ACK_BEHIND) { 13697 /* 13698 * Case B flag reordering, if window is not closed 13699 * or it could be a keep-alive or persists 13700 */ 13701 if (SEQ_LT(ae->ack, tp->snd_una) && (sbspace(&so->so_rcv) > segsiz)) { 13702 counter_u64_add(rack_reorder_seen, 1); 13703 rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 13704 } 13705 } else if (ae->ack_val_set == ACK_DUPACK) { 13706 /* Case D */ 13707 rack_strike_dupack(rack); 13708 } else if (ae->ack_val_set == ACK_RWND) { 13709 /* Case C */ 13710 if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) { 13711 ts.tv_sec = ae->timestamp / 1000000000; 13712 ts.tv_nsec = ae->timestamp % 1000000000; 13713 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; 13714 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; 13715 } else { 13716 rack->r_ctl.act_rcv_time = *tv; 13717 } 13718 if (rack->forced_ack) { 13719 rack_handle_probe_response(rack, tiwin, 13720 tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time)); 13721 } 13722 win_up_req = 1; 13723 win_upd_ack = ae->ack; 13724 win_seq = ae->seq; 13725 the_win = tiwin; 13726 rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts, high_seq); 13727 } else { 13728 /* Case A */ 13729 if (SEQ_GT(ae->ack, tp->snd_max)) { 13730 /* 13731 * We just send an ack since the incoming 13732 * ack is beyond the largest seq we sent. 
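* (ctf_ack_war_checks() presumably enforces the challenge-ACK rate
* limit here, only setting TF_ACKNOW while we are under the limit,
* which is why TF_ACKNOW is re-checked before asking for output.)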
13733 */
13734 if ((tp->t_flags & TF_ACKNOW) == 0) {
13735 ctf_ack_war_checks(tp, &rack->r_ctl.challenge_ack_ts, &rack->r_ctl.challenge_ack_cnt);
13736 if (tp->t_flags & TF_ACKNOW)
13737 rack->r_wanted_output = 1;
13738 }
13739 } else {
13740 nsegs++;
13741 /* If the window changed setup to update */
13742 if (tiwin != tp->snd_wnd) {
13743 win_upd_ack = ae->ack;
13744 win_seq = ae->seq;
13745 the_win = tiwin;
13746 rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts, high_seq);
13747 }
13748 #ifdef TCP_ACCOUNTING
13749 /* Account for the acks */
13750 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
13751 tp->tcp_cnt_counters[CNT_OF_ACKS_IN] += (((ae->ack - high_seq) + segsiz - 1) / segsiz);
13752 }
13753 counter_u64_add(tcp_cnt_counters[CNT_OF_ACKS_IN],
13754 (((ae->ack - high_seq) + segsiz - 1) / segsiz));
13755 #endif
13756 high_seq = ae->ack;
13757 if (SEQ_GEQ(high_seq, rack->r_ctl.roundends)) {
13758 rack->r_ctl.current_round++;
13759 rack->r_ctl.roundends = tp->snd_max;
13760 if (CC_ALGO(tp)->newround != NULL) {
13761 CC_ALGO(tp)->newround(tp->ccv, rack->r_ctl.current_round);
13762 }
13763 }
13764 /* Setup our act_rcv_time */
13765 if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) {
13766 ts.tv_sec = ae->timestamp / 1000000000;
13767 ts.tv_nsec = ae->timestamp % 1000000000;
13768 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec;
13769 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000;
13770 } else {
13771 rack->r_ctl.act_rcv_time = *tv;
13772 }
13773 rack_process_to_cumack(tp, rack, ae->ack, cts, to);
13774 if (rack->rc_dsack_round_seen) {
13775 /* Is the dsack round over? */
13776 if (SEQ_GEQ(ae->ack, rack->r_ctl.dsack_round_end)) {
13777 /* Yes it is */
13778 rack->rc_dsack_round_seen = 0;
13779 rack_log_dsack_event(rack, 3, __LINE__, 0, 0);
13780 }
13781 }
13782 }
13783 }
13784 /* And let's be sure to commit the rtt measurements for this ack */
13785 tcp_rack_xmit_timer_commit(rack, tp);
13786 #ifdef TCP_ACCOUNTING
13787 rdstc = get_cyclecount();
13788 if (rdstc > ts_val) {
13789 counter_u64_add(tcp_proc_time[ae->ack_val_set], (rdstc - ts_val));
13790 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
13791 tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val);
13792 if (ae->ack_val_set == ACK_CUMACK)
13793 tp->tcp_proc_time[CYC_HANDLE_MAP] += (rdstc - ts_val);
13794 }
13795 }
13796 #endif
13797 }
13798 #ifdef TCP_ACCOUNTING
13799 ts_val = get_cyclecount();
13800 #endif
13801 acked_amount = acked = (high_seq - tp->snd_una);
13802 if (acked) {
13803 /*
13804 * Clear the probe not answered flag
13805 * since cum-ack moved forward.
13806 */
13807 rack->probe_not_answered = 0;
13808 if (rack->sack_attack_disable == 0)
13809 rack_do_decay(rack);
13810 if (acked >= segsiz) {
13811 /*
13812 * You only get credit for
13813 * MSS and greater (and you get extra
13814 * credit for larger cum-ack moves).
13815 */
13816 int ac;
13817
13818 ac = acked / segsiz;
13819 rack->r_ctl.ack_count += ac;
13820 counter_u64_add(rack_ack_total, ac);
13821 }
13822 if (rack->r_ctl.ack_count > 0xfff00000) {
13823 /*
13824 * reduce the number to keep us under
13825 * a uint32_t.
13826 */
13827 rack->r_ctl.ack_count /= 2;
13828 rack->r_ctl.sack_count /= 2;
13829 }
13830 if (tp->t_flags & TF_NEEDSYN) {
13831 /*
13832 * T/TCP: Connection was half-synchronized, and our SYN has
13833 * been ACK'd (so connection is now fully synchronized). Go
13834 * to non-starred state, increment snd_una for ACK of SYN,
13835 * and check if we can do window scaling.
13836 */ 13837 tp->t_flags &= ~TF_NEEDSYN; 13838 tp->snd_una++; 13839 acked_amount = acked = (high_seq - tp->snd_una); 13840 } 13841 if (acked > sbavail(&so->so_snd)) 13842 acked_amount = sbavail(&so->so_snd); 13843 #ifdef NETFLIX_EXP_DETECTION 13844 /* 13845 * We only care on a cum-ack move if we are in a sack-disabled 13846 * state. We have already added in to the ack_count, and we never 13847 * would disable on a cum-ack move, so we only care to do the 13848 * detection if it may "undo" it, i.e. we were in disabled already. 13849 */ 13850 if (rack->sack_attack_disable) 13851 rack_do_detection(tp, rack, acked_amount, segsiz); 13852 #endif 13853 if (IN_FASTRECOVERY(tp->t_flags) && 13854 (rack->rack_no_prr == 0)) 13855 rack_update_prr(tp, rack, acked_amount, high_seq); 13856 if (IN_RECOVERY(tp->t_flags)) { 13857 if (SEQ_LT(high_seq, tp->snd_recover) && 13858 (SEQ_LT(high_seq, tp->snd_max))) { 13859 tcp_rack_partialack(tp); 13860 } else { 13861 rack_post_recovery(tp, high_seq); 13862 recovery = 1; 13863 } 13864 } 13865 /* Handle the rack-log-ack part (sendmap) */ 13866 if ((sbused(&so->so_snd) == 0) && 13867 (acked > acked_amount) && 13868 (tp->t_state >= TCPS_FIN_WAIT_1) && 13869 (tp->t_flags & TF_SENTFIN)) { 13870 /* 13871 * We must be sure our fin 13872 * was sent and acked (we can be 13873 * in FIN_WAIT_1 without having 13874 * sent the fin). 13875 */ 13876 ourfinisacked = 1; 13877 /* 13878 * Lets make sure snd_una is updated 13879 * since most likely acked_amount = 0 (it 13880 * should be). 13881 */ 13882 tp->snd_una = high_seq; 13883 } 13884 /* Did we make a RTO error? */ 13885 if ((tp->t_flags & TF_PREVVALID) && 13886 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) { 13887 tp->t_flags &= ~TF_PREVVALID; 13888 if (tp->t_rxtshift == 1 && 13889 (int)(ticks - tp->t_badrxtwin) < 0) 13890 rack_cong_signal(tp, CC_RTO_ERR, high_seq); 13891 } 13892 /* Handle the data in the socket buffer */ 13893 KMOD_TCPSTAT_ADD(tcps_rcvackpack, 1); 13894 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 13895 if (acked_amount > 0) { 13896 struct mbuf *mfree; 13897 13898 rack_ack_received(tp, rack, high_seq, nsegs, CC_ACK, recovery); 13899 SOCKBUF_LOCK(&so->so_snd); 13900 mfree = sbcut_locked(&so->so_snd, acked_amount); 13901 tp->snd_una = high_seq; 13902 /* Note we want to hold the sb lock through the sendmap adjust */ 13903 rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una); 13904 /* Wake up the socket if we have room to write more */ 13905 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); 13906 sowwakeup_locked(so); 13907 m_freem(mfree); 13908 } 13909 /* update progress */ 13910 tp->t_acktime = ticks; 13911 rack_log_progress_event(rack, tp, tp->t_acktime, 13912 PROGRESS_UPDATE, __LINE__); 13913 /* Clear out shifts and such */ 13914 tp->t_rxtshift = 0; 13915 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 13916 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 13917 rack->rc_tlp_in_progress = 0; 13918 rack->r_ctl.rc_tlp_cnt_out = 0; 13919 /* Send recover and snd_nxt must be dragged along */ 13920 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 13921 tp->snd_recover = tp->snd_una; 13922 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 13923 tp->snd_nxt = tp->snd_una; 13924 /* 13925 * If the RXT timer is running we want to 13926 * stop it, so we can restart a TLP (or new RXT). 
13927 */ 13928 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 13929 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 13930 #ifdef NETFLIX_HTTP_LOGGING 13931 tcp_http_check_for_comp(rack->rc_tp, high_seq); 13932 #endif 13933 tp->snd_wl2 = high_seq; 13934 tp->t_dupacks = 0; 13935 if (under_pacing && 13936 (rack->use_fixed_rate == 0) && 13937 (rack->in_probe_rtt == 0) && 13938 rack->rc_gp_dyn_mul && 13939 rack->rc_always_pace) { 13940 /* Check if we are dragging bottom */ 13941 rack_check_bottom_drag(tp, rack, so, acked); 13942 } 13943 if (tp->snd_una == tp->snd_max) { 13944 tp->t_flags &= ~TF_PREVVALID; 13945 rack->r_ctl.retran_during_recovery = 0; 13946 rack->r_ctl.dsack_byte_cnt = 0; 13947 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 13948 if (rack->r_ctl.rc_went_idle_time == 0) 13949 rack->r_ctl.rc_went_idle_time = 1; 13950 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 13951 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) 13952 tp->t_acktime = 0; 13953 /* Set so we might enter persists... */ 13954 rack->r_wanted_output = 1; 13955 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 13956 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 13957 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 13958 (sbavail(&so->so_snd) == 0) && 13959 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 13960 /* 13961 * The socket was gone and the 13962 * peer sent data (not now in the past), time to 13963 * reset him. 13964 */ 13965 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 13966 /* tcp_close will kill the inp pre-log the Reset */ 13967 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 13968 #ifdef TCP_ACCOUNTING 13969 rdstc = get_cyclecount(); 13970 if (rdstc > ts_val) { 13971 counter_u64_add(tcp_proc_time[ACK_CUMACK] , (rdstc - ts_val)); 13972 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13973 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 13974 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 13975 } 13976 } 13977 #endif 13978 m_freem(m); 13979 tp = tcp_close(tp); 13980 if (tp == NULL) { 13981 #ifdef TCP_ACCOUNTING 13982 sched_unpin(); 13983 #endif 13984 return (1); 13985 } 13986 /* 13987 * We would normally do drop-with-reset which would 13988 * send back a reset. We can't since we don't have 13989 * all the needed bits. Instead lets arrange for 13990 * a call to tcp_output(). That way since we 13991 * are in the closed state we will generate a reset. 13992 * 13993 * Note if tcp_accounting is on we don't unpin since 13994 * we do that after the goto label. 13995 */ 13996 goto send_out_a_rst; 13997 } 13998 if ((sbused(&so->so_snd) == 0) && 13999 (tp->t_state >= TCPS_FIN_WAIT_1) && 14000 (tp->t_flags & TF_SENTFIN)) { 14001 /* 14002 * If we can't receive any more data, then closing user can 14003 * proceed. Starting the timer is contrary to the 14004 * specification, but if we don't get a FIN we'll hang 14005 * forever. 14006 * 14007 */ 14008 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 14009 soisdisconnected(so); 14010 tcp_timer_activate(tp, TT_2MSL, 14011 (tcp_fast_finwait2_recycle ? 14012 tcp_finwait2_timeout : 14013 TP_MAXIDLE(tp))); 14014 } 14015 if (ourfinisacked == 0) { 14016 /* 14017 * We don't change to fin-wait-2 if we have our fin acked 14018 * which means we are probably in TCPS_CLOSING. 
14019 */ 14020 tcp_state_change(tp, TCPS_FIN_WAIT_2); 14021 } 14022 } 14023 } 14024 /* Wake up the socket if we have room to write more */ 14025 if (sbavail(&so->so_snd)) { 14026 rack->r_wanted_output = 1; 14027 if (ctf_progress_timeout_check(tp, true)) { 14028 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 14029 tp, tick, PROGRESS_DROP, __LINE__); 14030 /* 14031 * We cheat here and don't send a RST, we should send one 14032 * when the pacer drops the connection. 14033 */ 14034 #ifdef TCP_ACCOUNTING 14035 rdstc = get_cyclecount(); 14036 if (rdstc > ts_val) { 14037 counter_u64_add(tcp_proc_time[ACK_CUMACK] , (rdstc - ts_val)); 14038 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 14039 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 14040 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 14041 } 14042 } 14043 sched_unpin(); 14044 #endif 14045 (void)tcp_drop(tp, ETIMEDOUT); 14046 m_freem(m); 14047 return (1); 14048 } 14049 } 14050 if (ourfinisacked) { 14051 switch(tp->t_state) { 14052 case TCPS_CLOSING: 14053 #ifdef TCP_ACCOUNTING 14054 rdstc = get_cyclecount(); 14055 if (rdstc > ts_val) { 14056 counter_u64_add(tcp_proc_time[ACK_CUMACK] , 14057 (rdstc - ts_val)); 14058 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 14059 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 14060 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 14061 } 14062 } 14063 sched_unpin(); 14064 #endif 14065 tcp_twstart(tp); 14066 m_freem(m); 14067 return (1); 14068 break; 14069 case TCPS_LAST_ACK: 14070 #ifdef TCP_ACCOUNTING 14071 rdstc = get_cyclecount(); 14072 if (rdstc > ts_val) { 14073 counter_u64_add(tcp_proc_time[ACK_CUMACK] , 14074 (rdstc - ts_val)); 14075 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 14076 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 14077 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 14078 } 14079 } 14080 sched_unpin(); 14081 #endif 14082 tp = tcp_close(tp); 14083 ctf_do_drop(m, tp); 14084 return (1); 14085 break; 14086 case TCPS_FIN_WAIT_1: 14087 #ifdef TCP_ACCOUNTING 14088 rdstc = get_cyclecount(); 14089 if (rdstc > ts_val) { 14090 counter_u64_add(tcp_proc_time[ACK_CUMACK] , 14091 (rdstc - ts_val)); 14092 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 14093 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 14094 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 14095 } 14096 } 14097 #endif 14098 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 14099 soisdisconnected(so); 14100 tcp_timer_activate(tp, TT_2MSL, 14101 (tcp_fast_finwait2_recycle ? 14102 tcp_finwait2_timeout : 14103 TP_MAXIDLE(tp))); 14104 } 14105 tcp_state_change(tp, TCPS_FIN_WAIT_2); 14106 break; 14107 default: 14108 break; 14109 } 14110 } 14111 if (rack->r_fast_output) { 14112 /* 14113 * We re doing fast output.. can we expand that? 
14114 */ 14115 rack_gain_for_fastoutput(rack, tp, so, acked_amount); 14116 } 14117 #ifdef TCP_ACCOUNTING 14118 rdstc = get_cyclecount(); 14119 if (rdstc > ts_val) { 14120 counter_u64_add(tcp_proc_time[ACK_CUMACK] , (rdstc - ts_val)); 14121 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 14122 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 14123 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 14124 } 14125 } 14126 14127 } else if (win_up_req) { 14128 rdstc = get_cyclecount(); 14129 if (rdstc > ts_val) { 14130 counter_u64_add(tcp_proc_time[ACK_RWND] , (rdstc - ts_val)); 14131 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 14132 tp->tcp_proc_time[ACK_RWND] += (rdstc - ts_val); 14133 } 14134 } 14135 #endif 14136 } 14137 /* Now is there a next packet, if so we are done */ 14138 m_freem(m); 14139 did_out = 0; 14140 if (nxt_pkt) { 14141 #ifdef TCP_ACCOUNTING 14142 sched_unpin(); 14143 #endif 14144 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 5, nsegs); 14145 return (0); 14146 } 14147 rack_handle_might_revert(tp, rack); 14148 ctf_calc_rwin(so, tp); 14149 if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) { 14150 send_out_a_rst: 14151 if (tcp_output(tp) < 0) { 14152 #ifdef TCP_ACCOUNTING 14153 sched_unpin(); 14154 #endif 14155 return (1); 14156 } 14157 did_out = 1; 14158 } 14159 rack_free_trim(rack); 14160 #ifdef TCP_ACCOUNTING 14161 sched_unpin(); 14162 #endif 14163 rack_timer_audit(tp, rack, &so->so_snd); 14164 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 6, nsegs); 14165 return (0); 14166 } 14167 14168 14169 static int 14170 rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, 14171 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, 14172 int32_t nxt_pkt, struct timeval *tv) 14173 { 14174 #ifdef TCP_ACCOUNTING 14175 uint64_t ts_val; 14176 #endif 14177 int32_t thflags, retval, did_out = 0; 14178 int32_t way_out = 0; 14179 /* 14180 * cts - is the current time from tv (caller gets ts) in microseconds. 14181 * ms_cts - is the current time from tv in milliseconds. 14182 * us_cts - is the time that LRO or hardware actually got the packet in microseconds. 14183 */ 14184 uint32_t cts, us_cts, ms_cts; 14185 uint32_t tiwin; 14186 struct timespec ts; 14187 struct tcpopt to; 14188 struct tcp_rack *rack; 14189 struct rack_sendmap *rsm; 14190 int32_t prev_state = 0; 14191 #ifdef TCP_ACCOUNTING 14192 int ack_val_set = 0xf; 14193 #endif 14194 int nsegs; 14195 /* 14196 * tv passed from common code is from either M_TSTMP_LRO or 14197 * tcp_get_usecs() if no LRO m_pkthdr timestamp is present. 14198 */ 14199 rack = (struct tcp_rack *)tp->t_fb_ptr; 14200 if (m->m_flags & M_ACKCMP) { 14201 return (rack_do_compressed_ack_processing(tp, so, m, nxt_pkt, tv)); 14202 } 14203 if (m->m_flags & M_ACKCMP) { 14204 panic("Impossible reach m has ackcmp? m:%p tp:%p", m, tp); 14205 } 14206 cts = tcp_tv_to_usectick(tv); 14207 ms_cts = tcp_tv_to_mssectick(tv); 14208 nsegs = m->m_pkthdr.lro_nsegs; 14209 counter_u64_add(rack_proc_non_comp_ack, 1); 14210 thflags = th->th_flags; 14211 #ifdef TCP_ACCOUNTING 14212 sched_pin(); 14213 if (thflags & TH_ACK) 14214 ts_val = get_cyclecount(); 14215 #endif 14216 if ((m->m_flags & M_TSTMP) || 14217 (m->m_flags & M_TSTMP_LRO)) { 14218 mbuf_tstmp2timespec(m, &ts); 14219 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; 14220 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; 14221 } else 14222 rack->r_ctl.act_rcv_time = *tv; 14223 kern_prefetch(rack, &prev_state); 14224 prev_state = 0; 14225 /* 14226 * Unscale the window into a 32-bit value. 
For the SYN_SENT state
14227 * the scale is zero.
14228 */
14229 tiwin = th->th_win << tp->snd_scale;
14230 #ifdef TCP_ACCOUNTING
14231 if (thflags & TH_ACK) {
14232 /*
14233 * We have a tradeoff here. We can either do what we are
14234 * doing i.e. pinning to this CPU and then doing the accounting
14235 * <or> we could do a critical enter, setup the rdtsc and cpu
14236 * as in below, and then validate we are on the same CPU on
14237 * exit. I have chosen to not do the critical enter since
14238 * that often will gain you a context switch, and instead lock
14239 * us (line above this if) to the same CPU with sched_pin(). This
14240 * means we may be context switched out for a higher priority
14241 * interrupt but we won't be moved to another CPU.
14242 *
14243 * If this occurs (which it won't very often since we most likely
14244 * are running this code in interrupt context and only a higher
14245 * priority will bump us ... clock?) we will falsely add in
14246 * the interrupt processing time plus the ack processing
14247 * time. This is ok since it's a rare event.
14248 */
14249 ack_val_set = tcp_do_ack_accounting(tp, th, &to, tiwin,
14250 ctf_fixed_maxseg(tp));
14251 }
14252 #endif
14253 /*
14254 * Parse options on any incoming segment.
14255 */
14256 memset(&to, 0, sizeof(to));
14257 tcp_dooptions(&to, (u_char *)(th + 1),
14258 (th->th_off << 2) - sizeof(struct tcphdr),
14259 (thflags & TH_SYN) ? TO_SYN : 0);
14260 NET_EPOCH_ASSERT();
14261 INP_WLOCK_ASSERT(tp->t_inpcb);
14262 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
14263 __func__));
14264 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
14265 __func__));
14266 if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
14267 (tp->t_flags & TF_GPUTINPROG)) {
14268 /*
14269 * We have a goodput in progress
14270 * and we have entered a late state.
14271 * Do we have enough data in the sb
14272 * to handle the GPUT request?
14273 */
14274 uint32_t bytes;
14275
14276 bytes = tp->gput_ack - tp->gput_seq;
14277 if (SEQ_GT(tp->gput_seq, tp->snd_una))
14278 bytes += tp->gput_seq - tp->snd_una;
14279 if (bytes > sbavail(&tp->t_inpcb->inp_socket->so_snd)) {
14280 /*
14281 * There are not enough bytes in the socket
14282 * buffer that have been sent to cover this
14283 * measurement. Cancel it.
14284 */
14285 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/,
14286 rack->r_ctl.rc_gp_srtt /*flex1*/,
14287 tp->gput_seq,
14288 0, 0, 18, __LINE__, NULL, 0);
14289 tp->t_flags &= ~TF_GPUTINPROG;
14290 }
14291 }
14292 if (tp->t_logstate != TCP_LOG_STATE_OFF) {
14293 union tcp_log_stackspecific log;
14294 struct timeval ltv;
14295 #ifdef NETFLIX_HTTP_LOGGING
14296 struct http_sendfile_track *http_req;
14297
14298 if (SEQ_GT(th->th_ack, tp->snd_una)) {
14299 http_req = tcp_http_find_req_for_seq(tp, (th->th_ack-1));
14300 } else {
14301 http_req = tcp_http_find_req_for_seq(tp, th->th_ack);
14302 }
14303 #endif
14304 memset(&log.u_bbr, 0, sizeof(log.u_bbr));
14305 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
14306 if (rack->rack_no_prr == 0)
14307 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
14308 else
14309 log.u_bbr.flex1 = 0;
14310 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns;
14311 log.u_bbr.use_lt_bw <<= 1;
14312 log.u_bbr.use_lt_bw |= rack->r_might_revert;
14313 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced;
14314 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
14315 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg;
14316 log.u_bbr.flex3 = m->m_flags;
14317 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags;
14318 log.u_bbr.lost = thflags;
14319 log.u_bbr.pacing_gain = 0x1;
14320 #ifdef TCP_ACCOUNTING
14321 log.u_bbr.cwnd_gain = ack_val_set;
14322 #endif
14323 log.u_bbr.flex7 = 2;
14324 if (m->m_flags & M_TSTMP) {
14325 /* Record the hardware timestamp if present */
14326 mbuf_tstmp2timespec(m, &ts);
14327 ltv.tv_sec = ts.tv_sec;
14328 ltv.tv_usec = ts.tv_nsec / 1000;
14329 log.u_bbr.lt_epoch = tcp_tv_to_usectick(&ltv);
14330 } else if (m->m_flags & M_TSTMP_LRO) {
14331 /* Record the LRO arrival timestamp */
14332 mbuf_tstmp2timespec(m, &ts);
14333 ltv.tv_sec = ts.tv_sec;
14334 ltv.tv_usec = ts.tv_nsec / 1000;
14335 log.u_bbr.flex5 = tcp_tv_to_usectick(&ltv);
14336 }
14337 log.u_bbr.timeStamp = tcp_get_usecs(&ltv);
14338 /* Log the rcv time */
14339 log.u_bbr.delRate = m->m_pkthdr.rcv_tstmp;
14340 #ifdef NETFLIX_HTTP_LOGGING
14341 log.u_bbr.applimited = tp->t_http_closed;
14342 log.u_bbr.applimited <<= 8;
14343 log.u_bbr.applimited |= tp->t_http_open;
14344 log.u_bbr.applimited <<= 8;
14345 log.u_bbr.applimited |= tp->t_http_req;
14346 if (http_req) {
14347 /* Copy out any client req info */
14348 /* seconds */
14349 log.u_bbr.pkt_epoch = (http_req->localtime / HPTS_USEC_IN_SEC);
14350 /* useconds */
14351 log.u_bbr.delivered = (http_req->localtime % HPTS_USEC_IN_SEC);
14352 log.u_bbr.rttProp = http_req->timestamp;
14353 log.u_bbr.cur_del_rate = http_req->start;
14354 if (http_req->flags & TCP_HTTP_TRACK_FLG_OPEN) {
14355 log.u_bbr.flex8 |= 1;
14356 } else {
14357 log.u_bbr.flex8 |= 2;
14358 log.u_bbr.bw_inuse = http_req->end;
14359 }
14360 log.u_bbr.flex6 = http_req->start_seq;
14361 if (http_req->flags & TCP_HTTP_TRACK_FLG_COMP) {
14362 log.u_bbr.flex8 |= 4;
14363 log.u_bbr.epoch = http_req->end_seq;
14364 }
14365 }
14366 #endif
14367 TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
14368 tlen, &log, true, &ltv);
14369 }
14370 if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
14371 way_out = 4;
14372 retval = 0;
14373 m_freem(m);
14374 goto done_with_input;
14375 }
14376 /*
14377 * If a segment with the ACK-bit set arrives in the SYN-SENT state
14378 * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9.
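* In other words the ACK is only acceptable here when
* ISS < SEG.ACK <= SND.MAX; anything outside that range is answered
* with a reset below.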
14379 */ 14380 if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && 14381 (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { 14382 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 14383 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 14384 #ifdef TCP_ACCOUNTING 14385 sched_unpin(); 14386 #endif 14387 return (1); 14388 } 14389 /* 14390 * If timestamps were negotiated during SYN/ACK and a 14391 * segment without a timestamp is received, silently drop 14392 * the segment, unless it is a RST segment or missing timestamps are 14393 * tolerated. 14394 * See section 3.2 of RFC 7323. 14395 */ 14396 if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS) && 14397 ((thflags & TH_RST) == 0) && (V_tcp_tolerate_missing_ts == 0)) { 14398 way_out = 5; 14399 retval = 0; 14400 m_freem(m); 14401 goto done_with_input; 14402 } 14403 14404 /* 14405 * Segment received on connection. Reset idle time and keep-alive 14406 * timer. XXX: This should be done after segment validation to 14407 * ignore broken/spoofed segs. 14408 */ 14409 if (tp->t_idle_reduce && 14410 (tp->snd_max == tp->snd_una) && 14411 ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { 14412 counter_u64_add(rack_input_idle_reduces, 1); 14413 rack_cc_after_idle(rack, tp); 14414 } 14415 tp->t_rcvtime = ticks; 14416 #ifdef STATS 14417 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); 14418 #endif 14419 if (tiwin > rack->r_ctl.rc_high_rwnd) 14420 rack->r_ctl.rc_high_rwnd = tiwin; 14421 /* 14422 * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move 14423 * this to occur after we've validated the segment. 14424 */ 14425 if (tp->t_flags2 & TF2_ECN_PERMIT) { 14426 if (thflags & TH_CWR) { 14427 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 14428 tp->t_flags |= TF_ACKNOW; 14429 } 14430 switch (iptos & IPTOS_ECN_MASK) { 14431 case IPTOS_ECN_CE: 14432 tp->t_flags2 |= TF2_ECN_SND_ECE; 14433 KMOD_TCPSTAT_INC(tcps_ecn_ce); 14434 break; 14435 case IPTOS_ECN_ECT0: 14436 KMOD_TCPSTAT_INC(tcps_ecn_ect0); 14437 break; 14438 case IPTOS_ECN_ECT1: 14439 KMOD_TCPSTAT_INC(tcps_ecn_ect1); 14440 break; 14441 } 14442 14443 /* Process a packet differently from RFC3168. */ 14444 cc_ecnpkt_handler(tp, th, iptos); 14445 14446 /* Congestion experienced. */ 14447 if (thflags & TH_ECE) { 14448 rack_cong_signal(tp, CC_ECN, th->th_ack); 14449 } 14450 } 14451 14452 /* 14453 * If echoed timestamp is later than the current time, fall back to 14454 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 14455 * were used when this connection was established. 14456 */ 14457 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 14458 to.to_tsecr -= tp->ts_offset; 14459 if (TSTMP_GT(to.to_tsecr, ms_cts)) 14460 to.to_tsecr = 0; 14461 } 14462 14463 /* 14464 * If its the first time in we need to take care of options and 14465 * verify we can do SACK for rack! 14466 */ 14467 if (rack->r_state == 0) { 14468 /* Should be init'd by rack_init() */ 14469 KASSERT(rack->rc_inp != NULL, 14470 ("%s: rack->rc_inp unexpectedly NULL", __func__)); 14471 if (rack->rc_inp == NULL) { 14472 rack->rc_inp = tp->t_inpcb; 14473 } 14474 14475 /* 14476 * Process options only when we get SYN/ACK back. The SYN 14477 * case for incoming connections is handled in tcp_syncache. 14478 * According to RFC1323 the window field in a SYN (i.e., a 14479 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX 14480 * this is traditional behavior, may need to be cleaned up. 
14481 */ 14482 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 14483 /* Handle parallel SYN for ECN */ 14484 if (!(thflags & TH_ACK) && 14485 ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) && 14486 ((V_tcp_do_ecn == 1) || (V_tcp_do_ecn == 2))) { 14487 tp->t_flags2 |= TF2_ECN_PERMIT; 14488 tp->t_flags2 |= TF2_ECN_SND_ECE; 14489 TCPSTAT_INC(tcps_ecn_shs); 14490 } 14491 if ((to.to_flags & TOF_SCALE) && 14492 (tp->t_flags & TF_REQ_SCALE)) { 14493 tp->t_flags |= TF_RCVD_SCALE; 14494 tp->snd_scale = to.to_wscale; 14495 } else 14496 tp->t_flags &= ~TF_REQ_SCALE; 14497 /* 14498 * Initial send window. It will be updated with the 14499 * next incoming segment to the scaled value. 14500 */ 14501 tp->snd_wnd = th->th_win; 14502 rack_validate_fo_sendwin_up(tp, rack); 14503 if ((to.to_flags & TOF_TS) && 14504 (tp->t_flags & TF_REQ_TSTMP)) { 14505 tp->t_flags |= TF_RCVD_TSTMP; 14506 tp->ts_recent = to.to_tsval; 14507 tp->ts_recent_age = cts; 14508 } else 14509 tp->t_flags &= ~TF_REQ_TSTMP; 14510 if (to.to_flags & TOF_MSS) { 14511 tcp_mss(tp, to.to_mss); 14512 } 14513 if ((tp->t_flags & TF_SACK_PERMIT) && 14514 (to.to_flags & TOF_SACKPERM) == 0) 14515 tp->t_flags &= ~TF_SACK_PERMIT; 14516 if (IS_FASTOPEN(tp->t_flags)) { 14517 if (to.to_flags & TOF_FASTOPEN) { 14518 uint16_t mss; 14519 14520 if (to.to_flags & TOF_MSS) 14521 mss = to.to_mss; 14522 else 14523 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 14524 mss = TCP6_MSS; 14525 else 14526 mss = TCP_MSS; 14527 tcp_fastopen_update_cache(tp, mss, 14528 to.to_tfo_len, to.to_tfo_cookie); 14529 } else 14530 tcp_fastopen_disable_path(tp); 14531 } 14532 } 14533 /* 14534 * At this point we are at the initial call. Here we decide 14535 * if we are doing RACK or not. We do this by seeing if 14536 * TF_SACK_PERMIT is set and the sack-not-required is clear. 14537 * The code now does do dup-ack counting so if you don't 14538 * switch back you won't get rack & TLP, but you will still 14539 * get this stack. 14540 */ 14541 14542 if ((rack_sack_not_required == 0) && 14543 ((tp->t_flags & TF_SACK_PERMIT) == 0)) { 14544 tcp_switch_back_to_default(tp); 14545 (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, 14546 tlen, iptos); 14547 #ifdef TCP_ACCOUNTING 14548 sched_unpin(); 14549 #endif 14550 return (1); 14551 } 14552 tcp_set_hpts(tp->t_inpcb); 14553 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); 14554 } 14555 if (thflags & TH_FIN) 14556 tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN); 14557 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 14558 if ((rack->rc_gp_dyn_mul) && 14559 (rack->use_fixed_rate == 0) && 14560 (rack->rc_always_pace)) { 14561 /* Check in on probertt */ 14562 rack_check_probe_rtt(rack, us_cts); 14563 } 14564 rack_clear_rate_sample(rack); 14565 if ((rack->forced_ack) && 14566 ((th->th_flags & TH_RST) == 0)) { 14567 rack_handle_probe_response(rack, tiwin, us_cts); 14568 } 14569 /* 14570 * This is the one exception case where we set the rack state 14571 * always. All other times (timers etc) we must have a rack-state 14572 * set (so we assure we have done the checks above for SACK). 
14573 */
14574 rack->r_ctl.rc_rcvtime = cts;
14575 if (rack->r_state != tp->t_state)
14576 rack_set_state(tp, rack);
14577 if (SEQ_GT(th->th_ack, tp->snd_una) &&
14578 (rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree)) != NULL)
14579 kern_prefetch(rsm, &prev_state);
14580 prev_state = rack->r_state;
14581 retval = (*rack->r_substate) (m, th, so,
14582 tp, &to, drop_hdrlen,
14583 tlen, tiwin, thflags, nxt_pkt, iptos);
14584 #ifdef INVARIANTS
14585 if ((retval == 0) &&
14586 (tp->t_inpcb == NULL)) {
14587 panic("retval:%d tp:%p t_inpcb:NULL state:%d",
14588 retval, tp, prev_state);
14589 }
14590 #endif
14591 if (retval == 0) {
14592 /*
14593 * If retval is 1 the tcb is unlocked and most likely the tp
14594 * is gone.
14595 */
14596 INP_WLOCK_ASSERT(tp->t_inpcb);
14597 if ((rack->rc_gp_dyn_mul) &&
14598 (rack->rc_always_pace) &&
14599 (rack->use_fixed_rate == 0) &&
14600 rack->in_probe_rtt &&
14601 (rack->r_ctl.rc_time_probertt_starts == 0)) {
14602 /*
14603 * If we are going for target, let's recheck before
14604 * we output.
14605 */
14606 rack_check_probe_rtt(rack, us_cts);
14607 }
14608 if (rack->set_pacing_done_a_iw == 0) {
14609 /* How much has been acked? */
14610 if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) {
14611 /* We have enough to set in the pacing segment size */
14612 rack->set_pacing_done_a_iw = 1;
14613 rack_set_pace_segments(tp, rack, __LINE__, NULL);
14614 }
14615 }
14616 tcp_rack_xmit_timer_commit(rack, tp);
14617 #ifdef TCP_ACCOUNTING
14618 /*
14619 * If we set the ack_val_set to what ack processing we are doing
14620 * we also want to track how many cycles we burned. Note
14621 * the bits after tcp_output we let be "free". This is because
14622 * we are also tracking the tcp_output times as well. Note the
14623 * use of 0xf here since we only have 11 counters (0 - 0xa) and
14624 * 0xf cannot be returned and is what we initialize it to, to
14625 * indicate we are not doing the tabulations.
14626 */
14627 if (ack_val_set != 0xf) {
14628 uint64_t crtsc;
14629
14630 crtsc = get_cyclecount();
14631 counter_u64_add(tcp_proc_time[ack_val_set], (crtsc - ts_val));
14632 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
14633 tp->tcp_proc_time[ack_val_set] += (crtsc - ts_val);
14634 }
14635 }
14636 #endif
14637 if (nxt_pkt == 0) {
14638 if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) {
14639 do_output_now:
14640 if (tcp_output(tp) < 0)
14641 return (1);
14642 did_out = 1;
14643 }
14644 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0);
14645 rack_free_trim(rack);
14646 }
14647 /* Update any rounds needed */
14648 if (SEQ_GEQ(tp->snd_una, rack->r_ctl.roundends)) {
14649 rack->r_ctl.current_round++;
14650 rack->r_ctl.roundends = tp->snd_max;
14651 if (CC_ALGO(tp)->newround != NULL) {
14652 CC_ALGO(tp)->newround(tp->ccv, rack->r_ctl.current_round);
14653 }
14654 }
14655 if ((nxt_pkt == 0) &&
14656 ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) &&
14657 (SEQ_GT(tp->snd_max, tp->snd_una) ||
14658 (tp->t_flags & TF_DELACK) ||
14659 ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) &&
14660 (tp->t_state <= TCPS_CLOSING)))) {
14661 /* We could not send (probably in the hpts but stopped the timer earlier)?
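* Either nothing new needs to go out because we are already queued
* in the hpts with an output pending, or we clean up any stale hpts
* entry (accounting for how early or late we are) and reschedule a
* timer via rack_start_hpts_timer(), going to do_output_now first if
* we turned out to be running late without having sent.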
*/ 14662 if ((tp->snd_max == tp->snd_una) && 14663 ((tp->t_flags & TF_DELACK) == 0) && 14664 (tcp_in_hpts(rack->rc_inp)) && 14665 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 14666 /* keep alive not needed if we are hptsi output yet */ 14667 ; 14668 } else { 14669 int late = 0; 14670 if (tcp_in_hpts(rack->rc_inp)) { 14671 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 14672 us_cts = tcp_get_usecs(NULL); 14673 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 14674 rack->r_early = 1; 14675 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); 14676 } else 14677 late = 1; 14678 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 14679 } 14680 tcp_hpts_remove(tp->t_inpcb); 14681 } 14682 if (late && (did_out == 0)) { 14683 /* 14684 * We are late in the sending 14685 * and we did not call the output 14686 * (this probably should not happen). 14687 */ 14688 goto do_output_now; 14689 } 14690 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); 14691 } 14692 way_out = 1; 14693 } else if (nxt_pkt == 0) { 14694 /* Do we have the correct timer running? */ 14695 rack_timer_audit(tp, rack, &so->so_snd); 14696 way_out = 2; 14697 } 14698 done_with_input: 14699 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out, max(1, nsegs)); 14700 if (did_out) 14701 rack->r_wanted_output = 0; 14702 #ifdef INVARIANTS 14703 if (tp->t_inpcb == NULL) { 14704 panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d", 14705 did_out, 14706 retval, tp, prev_state); 14707 } 14708 #endif 14709 #ifdef TCP_ACCOUNTING 14710 } else { 14711 /* 14712 * Track the time (see above). 14713 */ 14714 if (ack_val_set != 0xf) { 14715 uint64_t crtsc; 14716 14717 crtsc = get_cyclecount(); 14718 counter_u64_add(tcp_proc_time[ack_val_set] , (crtsc - ts_val)); 14719 /* 14720 * Note we *DO NOT* increment the per-tcb counters since 14721 * in the else the TP may be gone!! 14722 */ 14723 } 14724 #endif 14725 } 14726 #ifdef TCP_ACCOUNTING 14727 sched_unpin(); 14728 #endif 14729 return (retval); 14730 } 14731 14732 void 14733 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, 14734 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos) 14735 { 14736 struct timeval tv; 14737 14738 /* First lets see if we have old packets */ 14739 if (tp->t_in_pkt) { 14740 if (ctf_do_queued_segments(so, tp, 1)) { 14741 m_freem(m); 14742 return; 14743 } 14744 } 14745 if (m->m_flags & M_TSTMP_LRO) { 14746 tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000; 14747 tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000; 14748 } else { 14749 /* Should not be should we kassert instead? */ 14750 tcp_get_usecs(&tv); 14751 } 14752 if (rack_do_segment_nounlock(m, th, so, tp, 14753 drop_hdrlen, tlen, iptos, 0, &tv) == 0) { 14754 INP_WUNLOCK(tp->t_inpcb); 14755 } 14756 } 14757 14758 struct rack_sendmap * 14759 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) 14760 { 14761 struct rack_sendmap *rsm = NULL; 14762 int32_t idx; 14763 uint32_t srtt = 0, thresh = 0, ts_low = 0; 14764 14765 /* Return the next guy to be re-transmitted */ 14766 if (RB_EMPTY(&rack->r_ctl.rc_mtree)) { 14767 return (NULL); 14768 } 14769 if (tp->t_flags & TF_SENTFIN) { 14770 /* retran the end FIN? 
*/ 14771 return (NULL); 14772 } 14773 /* ok lets look at this one */ 14774 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 14775 if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) { 14776 goto check_it; 14777 } 14778 rsm = rack_find_lowest_rsm(rack); 14779 if (rsm == NULL) { 14780 return (NULL); 14781 } 14782 check_it: 14783 if (((rack->rc_tp->t_flags & TF_SACK_PERMIT) == 0) && 14784 (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { 14785 /* 14786 * No sack so we automatically do the 3 strikes and 14787 * retransmit (no rack timer would be started). 14788 */ 14789 14790 return (rsm); 14791 } 14792 if (rsm->r_flags & RACK_ACKED) { 14793 return (NULL); 14794 } 14795 if (((rsm->r_flags & RACK_SACK_PASSED) == 0) && 14796 (rsm->r_dupack < DUP_ACK_THRESHOLD)) { 14797 /* Its not yet ready */ 14798 return (NULL); 14799 } 14800 srtt = rack_grab_rtt(tp, rack); 14801 idx = rsm->r_rtr_cnt - 1; 14802 ts_low = (uint32_t)rsm->r_tim_lastsent[idx]; 14803 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 14804 if ((tsused == ts_low) || 14805 (TSTMP_LT(tsused, ts_low))) { 14806 /* No time since sending */ 14807 return (NULL); 14808 } 14809 if ((tsused - ts_low) < thresh) { 14810 /* It has not been long enough yet */ 14811 return (NULL); 14812 } 14813 if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || 14814 ((rsm->r_flags & RACK_SACK_PASSED) && 14815 (rack->sack_attack_disable == 0))) { 14816 /* 14817 * We have passed the dup-ack threshold <or> 14818 * a SACK has indicated this is missing. 14819 * Note that if you are a declared attacker 14820 * it is only the dup-ack threshold that 14821 * will cause retransmits. 14822 */ 14823 /* log retransmit reason */ 14824 rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1); 14825 rack->r_fast_output = 0; 14826 return (rsm); 14827 } 14828 return (NULL); 14829 } 14830 14831 static void 14832 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, 14833 uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, 14834 int line, struct rack_sendmap *rsm, uint8_t quality) 14835 { 14836 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 14837 union tcp_log_stackspecific log; 14838 struct timeval tv; 14839 14840 memset(&log, 0, sizeof(log)); 14841 log.u_bbr.flex1 = slot; 14842 log.u_bbr.flex2 = len; 14843 log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs; 14844 log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs; 14845 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ss; 14846 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_ca; 14847 log.u_bbr.use_lt_bw = rack->rc_ack_can_sendout_data; 14848 log.u_bbr.use_lt_bw <<= 1; 14849 log.u_bbr.use_lt_bw |= rack->r_late; 14850 log.u_bbr.use_lt_bw <<= 1; 14851 log.u_bbr.use_lt_bw |= rack->r_early; 14852 log.u_bbr.use_lt_bw <<= 1; 14853 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set; 14854 log.u_bbr.use_lt_bw <<= 1; 14855 log.u_bbr.use_lt_bw |= rack->rc_gp_filled; 14856 log.u_bbr.use_lt_bw <<= 1; 14857 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; 14858 log.u_bbr.use_lt_bw <<= 1; 14859 log.u_bbr.use_lt_bw |= rack->in_probe_rtt; 14860 log.u_bbr.use_lt_bw <<= 1; 14861 log.u_bbr.use_lt_bw |= rack->gp_ready; 14862 log.u_bbr.pkt_epoch = line; 14863 log.u_bbr.epoch = rack->r_ctl.rc_agg_delayed; 14864 log.u_bbr.lt_epoch = rack->r_ctl.rc_agg_early; 14865 log.u_bbr.applimited = rack->r_ctl.rack_per_of_gp_rec; 14866 log.u_bbr.bw_inuse = bw_est; 14867 log.u_bbr.delRate = bw; 14868 if (rack->r_ctl.gp_bw == 0) 14869 log.u_bbr.cur_del_rate = 0; 14870 else 14871 log.u_bbr.cur_del_rate = rack_get_bw(rack); 14872 log.u_bbr.rttProp = len_time; 14873 
log.u_bbr.pkts_out = rack->r_ctl.rc_rack_min_rtt; 14874 log.u_bbr.lost = rack->r_ctl.rc_probertt_sndmax_atexit; 14875 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); 14876 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) { 14877 /* We are in slow start */ 14878 log.u_bbr.flex7 = 1; 14879 } else { 14880 /* we are on congestion avoidance */ 14881 log.u_bbr.flex7 = 0; 14882 } 14883 log.u_bbr.flex8 = method; 14884 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 14885 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 14886 log.u_bbr.cwnd_gain = rack->rc_gp_saw_rec; 14887 log.u_bbr.cwnd_gain <<= 1; 14888 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; 14889 log.u_bbr.cwnd_gain <<= 1; 14890 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; 14891 log.u_bbr.bbr_substate = quality; 14892 TCP_LOG_EVENTP(rack->rc_tp, NULL, 14893 &rack->rc_inp->inp_socket->so_rcv, 14894 &rack->rc_inp->inp_socket->so_snd, 14895 BBR_LOG_HPTSI_CALC, 0, 14896 0, &log, false, &tv); 14897 } 14898 } 14899 14900 static uint32_t 14901 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss) 14902 { 14903 uint32_t new_tso, user_max; 14904 14905 user_max = rack->rc_user_set_max_segs * mss; 14906 if (rack->rc_force_max_seg) { 14907 return (user_max); 14908 } 14909 if (rack->use_fixed_rate && 14910 ((rack->r_ctl.crte == NULL) || 14911 (bw != rack->r_ctl.crte->rate))) { 14912 /* Use the user mss since we are not exactly matched */ 14913 return (user_max); 14914 } 14915 new_tso = tcp_get_pacing_burst_size(rack->rc_tp, bw, mss, rack_pace_one_seg, rack->r_ctl.crte, NULL); 14916 if (new_tso > user_max) 14917 new_tso = user_max; 14918 return (new_tso); 14919 } 14920 14921 static int32_t 14922 pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz, int *capped, uint64_t *rate_wanted, uint8_t non_paced) 14923 { 14924 uint64_t lentim, fill_bw; 14925 14926 /* Lets first see if we are full, if so continue with normal rate */ 14927 rack->r_via_fill_cw = 0; 14928 if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use) 14929 return (slot); 14930 if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd) 14931 return (slot); 14932 if (rack->r_ctl.rc_last_us_rtt == 0) 14933 return (slot); 14934 if (rack->rc_pace_fill_if_rttin_range && 14935 (rack->r_ctl.rc_last_us_rtt >= 14936 (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) { 14937 /* The rtt is huge, N * smallest, lets not fill */ 14938 return (slot); 14939 } 14940 /* 14941 * first lets calculate the b/w based on the last us-rtt 14942 * and the sndwnd. 14943 */ 14944 fill_bw = rack->r_ctl.cwnd_to_use; 14945 /* Take the rwnd if its smaller */ 14946 if (fill_bw > rack->rc_tp->snd_wnd) 14947 fill_bw = rack->rc_tp->snd_wnd; 14948 if (rack->r_fill_less_agg) { 14949 /* 14950 * Now take away the inflight (this will reduce our 14951 * aggressiveness and yeah, if we get that much out in 1RTT 14952 * we will have had acks come back and still be behind). 
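 * For illustration only: with cwnd_to_use = 64000 bytes, snd_wnd =
 * 48000 bytes and rc_last_us_rtt = 10000 usecs, the conversion just
 * below gives fill_bw = 48000 * HPTS_USEC_IN_SEC / 10000, i.e. about
 * 4.8 megabytes/sec, the rate needed to push one full window per RTT
 * (less the inflight subtracted here when r_fill_less_agg is set).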
14953 */ 14954 fill_bw -= ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 14955 } 14956 /* Now lets make it into a b/w */ 14957 fill_bw *= (uint64_t)HPTS_USEC_IN_SEC; 14958 fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt; 14959 /* We are below the min b/w */ 14960 if (non_paced) 14961 *rate_wanted = fill_bw; 14962 if ((fill_bw < RACK_MIN_BW) || (fill_bw < *rate_wanted)) 14963 return (slot); 14964 if (rack->r_ctl.bw_rate_cap && (fill_bw > rack->r_ctl.bw_rate_cap)) 14965 fill_bw = rack->r_ctl.bw_rate_cap; 14966 rack->r_via_fill_cw = 1; 14967 if (rack->r_rack_hw_rate_caps && 14968 (rack->r_ctl.crte != NULL)) { 14969 uint64_t high_rate; 14970 14971 high_rate = tcp_hw_highest_rate(rack->r_ctl.crte); 14972 if (fill_bw > high_rate) { 14973 /* We are capping bw at the highest rate table entry */ 14974 if (*rate_wanted > high_rate) { 14975 /* The original rate was also capped */ 14976 rack->r_via_fill_cw = 0; 14977 } 14978 rack_log_hdwr_pacing(rack, 14979 fill_bw, high_rate, __LINE__, 14980 0, 3); 14981 fill_bw = high_rate; 14982 if (capped) 14983 *capped = 1; 14984 } 14985 } else if ((rack->r_ctl.crte == NULL) && 14986 (rack->rack_hdrw_pacing == 0) && 14987 (rack->rack_hdw_pace_ena) && 14988 rack->r_rack_hw_rate_caps && 14989 (rack->rack_attempt_hdwr_pace == 0) && 14990 (rack->rc_inp->inp_route.ro_nh != NULL) && 14991 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 14992 /* 14993 * Ok we may have a first attempt that is greater than our top rate 14994 * lets check. 14995 */ 14996 uint64_t high_rate; 14997 14998 high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp); 14999 if (high_rate) { 15000 if (fill_bw > high_rate) { 15001 fill_bw = high_rate; 15002 if (capped) 15003 *capped = 1; 15004 } 15005 } 15006 } 15007 /* 15008 * Ok fill_bw holds our mythical b/w to fill the cwnd 15009 * in a rtt, what does that time wise equate too? 15010 */ 15011 lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC; 15012 lentim /= fill_bw; 15013 *rate_wanted = fill_bw; 15014 if (non_paced || (lentim < slot)) { 15015 rack_log_pacing_delay_calc(rack, len, slot, fill_bw, 15016 0, lentim, 12, __LINE__, NULL, 0); 15017 return ((int32_t)lentim); 15018 } else 15019 return (slot); 15020 } 15021 15022 static int32_t 15023 rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz) 15024 { 15025 uint64_t srtt; 15026 int32_t slot = 0; 15027 int can_start_hw_pacing = 1; 15028 int err; 15029 15030 if (rack->rc_always_pace == 0) { 15031 /* 15032 * We use the most optimistic possible cwnd/srtt for 15033 * sending calculations. This will make our 15034 * calculation anticipate getting more through 15035 * quicker then possible. But thats ok we don't want 15036 * the peer to have a gap in data sending. 15037 */ 15038 uint64_t cwnd, tr_perms = 0; 15039 int32_t reduce = 0; 15040 15041 old_method: 15042 /* 15043 * We keep no precise pacing with the old method 15044 * instead we use the pacer to mitigate bursts. 
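 * The arithmetic below works out a rough bytes-per-millisecond
 * budget, tr_perms = cwnd * 1000 / srtt(usecs). Purely as an example,
 * cwnd = 100000 bytes with srtt = 50000 usecs gives 2000 bytes/ms, so
 * a 30000 byte send maps to a 15 ms slot, which is then optionally
 * trimmed by rack_slot_reduction and converted to usecs via
 * HPTS_USEC_IN_MSEC.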
15045 */ 15046 if (rack->r_ctl.rc_rack_min_rtt) 15047 srtt = rack->r_ctl.rc_rack_min_rtt; 15048 else 15049 srtt = max(tp->t_srtt, 1); 15050 if (rack->r_ctl.rc_rack_largest_cwnd) 15051 cwnd = rack->r_ctl.rc_rack_largest_cwnd; 15052 else 15053 cwnd = rack->r_ctl.cwnd_to_use; 15054 /* Inflate cwnd by 1000 so srtt of usecs is in ms */ 15055 tr_perms = (cwnd * 1000) / srtt; 15056 if (tr_perms == 0) { 15057 tr_perms = ctf_fixed_maxseg(tp); 15058 } 15059 /* 15060 * Calculate how long this will take to drain, if 15061 * the calculation comes out to zero, thats ok we 15062 * will use send_a_lot to possibly spin around for 15063 * more increasing tot_len_this_send to the point 15064 * that its going to require a pace, or we hit the 15065 * cwnd. Which in that case we are just waiting for 15066 * a ACK. 15067 */ 15068 slot = len / tr_perms; 15069 /* Now do we reduce the time so we don't run dry? */ 15070 if (slot && rack_slot_reduction) { 15071 reduce = (slot / rack_slot_reduction); 15072 if (reduce < slot) { 15073 slot -= reduce; 15074 } else 15075 slot = 0; 15076 } 15077 slot *= HPTS_USEC_IN_MSEC; 15078 if (rack->rc_pace_to_cwnd) { 15079 uint64_t rate_wanted = 0; 15080 15081 slot = pace_to_fill_cwnd(rack, slot, len, segsiz, NULL, &rate_wanted, 1); 15082 rack->rc_ack_can_sendout_data = 1; 15083 rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, 0, 0, 14, __LINE__, NULL, 0); 15084 } else 15085 rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL, 0); 15086 } else { 15087 uint64_t bw_est, res, lentim, rate_wanted; 15088 uint32_t orig_val, segs, oh; 15089 int capped = 0; 15090 int prev_fill; 15091 15092 if ((rack->r_rr_config == 1) && rsm) { 15093 return (rack->r_ctl.rc_min_to); 15094 } 15095 if (rack->use_fixed_rate) { 15096 rate_wanted = bw_est = rack_get_fixed_pacing_bw(rack); 15097 } else if ((rack->r_ctl.init_rate == 0) && 15098 #ifdef NETFLIX_PEAKRATE 15099 (rack->rc_tp->t_maxpeakrate == 0) && 15100 #endif 15101 (rack->r_ctl.gp_bw == 0)) { 15102 /* no way to yet do an estimate */ 15103 bw_est = rate_wanted = 0; 15104 } else { 15105 bw_est = rack_get_bw(rack); 15106 rate_wanted = rack_get_output_bw(rack, bw_est, rsm, &capped); 15107 } 15108 if ((bw_est == 0) || (rate_wanted == 0) || 15109 ((rack->gp_ready == 0) && (rack->use_fixed_rate == 0))) { 15110 /* 15111 * No way yet to make a b/w estimate or 15112 * our raise is set incorrectly. 15113 */ 15114 goto old_method; 15115 } 15116 /* We need to account for all the overheads */ 15117 segs = (len + segsiz - 1) / segsiz; 15118 /* 15119 * We need the diff between 1514 bytes (e-mtu with e-hdr) 15120 * and how much data we put in each packet. Yes this 15121 * means we may be off if we are larger than 1500 bytes 15122 * or smaller. But this just makes us more conservative. 15123 */ 15124 if (rack_hw_rate_min && 15125 (bw_est < rack_hw_rate_min)) 15126 can_start_hw_pacing = 0; 15127 if (ETHERNET_SEGMENT_SIZE > segsiz) 15128 oh = ETHERNET_SEGMENT_SIZE - segsiz; 15129 else 15130 oh = 0; 15131 segs *= oh; 15132 lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC; 15133 res = lentim / rate_wanted; 15134 slot = (uint32_t)res; 15135 orig_val = rack->r_ctl.rc_pace_max_segs; 15136 if (rack->r_ctl.crte == NULL) { 15137 /* 15138 * Only do this if we are not hardware pacing 15139 * since if we are doing hw-pacing below we will 15140 * set make a call after setting up or changing 15141 * the rate. 
15142 */ 15143 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 15144 } else if (rack->rc_inp->inp_snd_tag == NULL) { 15145 /* 15146 * We lost our rate somehow, this can happen 15147 * if the interface changed underneath us. 15148 */ 15149 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 15150 rack->r_ctl.crte = NULL; 15151 /* Lets re-allow attempting to setup pacing */ 15152 rack->rack_hdrw_pacing = 0; 15153 rack->rack_attempt_hdwr_pace = 0; 15154 rack_log_hdwr_pacing(rack, 15155 rate_wanted, bw_est, __LINE__, 15156 0, 6); 15157 } 15158 /* Did we change the TSO size, if so log it */ 15159 if (rack->r_ctl.rc_pace_max_segs != orig_val) 15160 rack_log_pacing_delay_calc(rack, len, slot, orig_val, 0, 0, 15, __LINE__, NULL, 0); 15161 prev_fill = rack->r_via_fill_cw; 15162 if ((rack->rc_pace_to_cwnd) && 15163 (capped == 0) && 15164 (rack->use_fixed_rate == 0) && 15165 (rack->in_probe_rtt == 0) && 15166 (IN_FASTRECOVERY(rack->rc_tp->t_flags) == 0)) { 15167 /* 15168 * We want to pace at our rate *or* faster to 15169 * fill the cwnd to the max if its not full. 15170 */ 15171 slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz, &capped, &rate_wanted, 0); 15172 } 15173 if ((rack->rc_inp->inp_route.ro_nh != NULL) && 15174 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 15175 if ((rack->rack_hdw_pace_ena) && 15176 (can_start_hw_pacing > 0) && 15177 (rack->rack_hdrw_pacing == 0) && 15178 (rack->rack_attempt_hdwr_pace == 0)) { 15179 /* 15180 * Lets attempt to turn on hardware pacing 15181 * if we can. 15182 */ 15183 rack->rack_attempt_hdwr_pace = 1; 15184 rack->r_ctl.crte = tcp_set_pacing_rate(rack->rc_tp, 15185 rack->rc_inp->inp_route.ro_nh->nh_ifp, 15186 rate_wanted, 15187 RS_PACING_GEQ, 15188 &err, &rack->r_ctl.crte_prev_rate); 15189 if (rack->r_ctl.crte) { 15190 rack->rack_hdrw_pacing = 1; 15191 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(tp, rate_wanted, segsiz, 15192 0, rack->r_ctl.crte, 15193 NULL); 15194 rack_log_hdwr_pacing(rack, 15195 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 15196 err, 0); 15197 rack->r_ctl.last_hw_bw_req = rate_wanted; 15198 } else { 15199 counter_u64_add(rack_hw_pace_init_fail, 1); 15200 } 15201 } else if (rack->rack_hdrw_pacing && 15202 (rack->r_ctl.last_hw_bw_req != rate_wanted)) { 15203 /* Do we need to adjust our rate? */ 15204 const struct tcp_hwrate_limit_table *nrte; 15205 15206 if (rack->r_up_only && 15207 (rate_wanted < rack->r_ctl.crte->rate)) { 15208 /** 15209 * We have four possible states here 15210 * having to do with the previous time 15211 * and this time. 15212 * previous | this-time 15213 * A) 0 | 0 -- fill_cw not in the picture 15214 * B) 1 | 0 -- we were doing a fill-cw but now are not 15215 * C) 1 | 1 -- all rates from fill_cw 15216 * D) 0 | 1 -- we were doing non-fill and now we are filling 15217 * 15218 * For case A, C and D we don't allow a drop. But for 15219 * case B where we now our on our steady rate we do 15220 * allow a drop. 15221 * 15222 */ 15223 if (!((prev_fill == 1) && (rack->r_via_fill_cw == 0))) 15224 goto done_w_hdwr; 15225 } 15226 if ((rate_wanted > rack->r_ctl.crte->rate) || 15227 (rate_wanted <= rack->r_ctl.crte_prev_rate)) { 15228 if (rack_hw_rate_to_low && 15229 (bw_est < rack_hw_rate_to_low)) { 15230 /* 15231 * The pacing rate is too low for hardware, but 15232 * do allow hardware pacing to be restarted. 
15233 */ 15234 rack_log_hdwr_pacing(rack, 15235 bw_est, rack->r_ctl.crte->rate, __LINE__, 15236 0, 5); 15237 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 15238 rack->r_ctl.crte = NULL; 15239 rack->rack_attempt_hdwr_pace = 0; 15240 rack->rack_hdrw_pacing = 0; 15241 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted); 15242 goto done_w_hdwr; 15243 } 15244 nrte = tcp_chg_pacing_rate(rack->r_ctl.crte, 15245 rack->rc_tp, 15246 rack->rc_inp->inp_route.ro_nh->nh_ifp, 15247 rate_wanted, 15248 RS_PACING_GEQ, 15249 &err, &rack->r_ctl.crte_prev_rate); 15250 if (nrte == NULL) { 15251 /* Lost the rate */ 15252 rack->rack_hdrw_pacing = 0; 15253 rack->r_ctl.crte = NULL; 15254 rack_log_hdwr_pacing(rack, 15255 rate_wanted, 0, __LINE__, 15256 err, 1); 15257 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted); 15258 counter_u64_add(rack_hw_pace_lost, 1); 15259 } else if (nrte != rack->r_ctl.crte) { 15260 rack->r_ctl.crte = nrte; 15261 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(tp, rate_wanted, 15262 segsiz, 0, 15263 rack->r_ctl.crte, 15264 NULL); 15265 rack_log_hdwr_pacing(rack, 15266 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 15267 err, 2); 15268 rack->r_ctl.last_hw_bw_req = rate_wanted; 15269 } 15270 } else { 15271 /* We just need to adjust the segment size */ 15272 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted); 15273 rack_log_hdwr_pacing(rack, 15274 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 15275 0, 4); 15276 rack->r_ctl.last_hw_bw_req = rate_wanted; 15277 } 15278 } 15279 } 15280 if ((rack->r_ctl.crte != NULL) && 15281 (rack->r_ctl.crte->rate == rate_wanted)) { 15282 /* 15283 * We need to add a extra if the rates 15284 * are exactly matched. The idea is 15285 * we want the software to make sure the 15286 * queue is empty before adding more, this 15287 * gives us N MSS extra pace times where 15288 * N is our sysctl 15289 */ 15290 slot += (rack->r_ctl.crte->time_between * rack_hw_pace_extra_slots); 15291 } 15292 done_w_hdwr: 15293 if (rack_limit_time_with_srtt && 15294 (rack->use_fixed_rate == 0) && 15295 #ifdef NETFLIX_PEAKRATE 15296 (rack->rc_tp->t_maxpeakrate == 0) && 15297 #endif 15298 (rack->rack_hdrw_pacing == 0)) { 15299 /* 15300 * Sanity check, we do not allow the pacing delay 15301 * to be longer than the SRTT of the path. If it is 15302 * a slow path, then adding a packet should increase 15303 * the RTT and compensate for this i.e. the srtt will 15304 * be greater so the allowed pacing time will be greater. 15305 * 15306 * Note this restriction is not for where a peak rate 15307 * is set, we are doing fixed pacing or hardware pacing. 15308 */ 15309 if (rack->rc_tp->t_srtt) 15310 srtt = rack->rc_tp->t_srtt; 15311 else 15312 srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC; /* its in ms convert */ 15313 if (srtt < (uint64_t)slot) { 15314 rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL, 0); 15315 slot = srtt; 15316 } 15317 } 15318 rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm, 0); 15319 } 15320 if (rack->r_ctl.crte && (rack->r_ctl.crte->rs_num_enobufs > 0)) { 15321 /* 15322 * If this rate is seeing enobufs when it 15323 * goes to send then either the nic is out 15324 * of gas or we are mis-estimating the time 15325 * somehow and not letting the queue empty 15326 * completely. Lets add to the pacing time. 
15327 */ 15328 int hw_boost_delay; 15329 15330 hw_boost_delay = rack->r_ctl.crte->time_between * rack_enobuf_hw_boost_mult; 15331 if (hw_boost_delay > rack_enobuf_hw_max) 15332 hw_boost_delay = rack_enobuf_hw_max; 15333 else if (hw_boost_delay < rack_enobuf_hw_min) 15334 hw_boost_delay = rack_enobuf_hw_min; 15335 slot += hw_boost_delay; 15336 } 15337 if (slot) 15338 counter_u64_add(rack_calc_nonzero, 1); 15339 else 15340 counter_u64_add(rack_calc_zero, 1); 15341 return (slot); 15342 } 15343 15344 static void 15345 rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack, 15346 tcp_seq startseq, uint32_t sb_offset) 15347 { 15348 struct rack_sendmap *my_rsm = NULL; 15349 struct rack_sendmap fe; 15350 15351 if (tp->t_state < TCPS_ESTABLISHED) { 15352 /* 15353 * We don't start any measurements if we are 15354 * not at least established. 15355 */ 15356 return; 15357 } 15358 if (tp->t_state >= TCPS_FIN_WAIT_1) { 15359 /* 15360 * We will get no more data into the SB 15361 * this means we need to have the data available 15362 * before we start a measurement. 15363 */ 15364 15365 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) < 15366 max(rc_init_window(rack), 15367 (MIN_GP_WIN * ctf_fixed_maxseg(tp)))) { 15368 /* Nope not enough data */ 15369 return; 15370 } 15371 } 15372 tp->t_flags |= TF_GPUTINPROG; 15373 rack->r_ctl.rc_gp_lowrtt = 0xffffffff; 15374 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 15375 tp->gput_seq = startseq; 15376 rack->app_limited_needs_set = 0; 15377 if (rack->in_probe_rtt) 15378 rack->measure_saw_probe_rtt = 1; 15379 else if ((rack->measure_saw_probe_rtt) && 15380 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 15381 rack->measure_saw_probe_rtt = 0; 15382 if (rack->rc_gp_filled) 15383 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 15384 else { 15385 /* Special case initial measurement */ 15386 struct timeval tv; 15387 15388 tp->gput_ts = tcp_get_usecs(&tv); 15389 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); 15390 } 15391 /* 15392 * We take a guess out into the future, 15393 * if we have no measurement and no 15394 * initial rate, we measure the first 15395 * initial-windows worth of data to 15396 * speed up getting some GP measurement and 15397 * thus start pacing. 15398 */ 15399 if ((rack->rc_gp_filled == 0) && (rack->r_ctl.init_rate == 0)) { 15400 rack->app_limited_needs_set = 1; 15401 tp->gput_ack = startseq + max(rc_init_window(rack), 15402 (MIN_GP_WIN * ctf_fixed_maxseg(tp))); 15403 rack_log_pacing_delay_calc(rack, 15404 tp->gput_seq, 15405 tp->gput_ack, 15406 0, 15407 tp->gput_ts, 15408 rack->r_ctl.rc_app_limited_cnt, 15409 9, 15410 __LINE__, NULL, 0); 15411 return; 15412 } 15413 if (sb_offset) { 15414 /* 15415 * We are out somewhere in the sb 15416 * can we use the already outstanding data? 15417 */ 15418 if (rack->r_ctl.rc_app_limited_cnt == 0) { 15419 /* 15420 * Yes first one is good and in this case 15421 * the tp->gput_ts is correctly set based on 15422 * the last ack that arrived (no need to 15423 * set things up when an ack comes in). 15424 */ 15425 my_rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 15426 if ((my_rsm == NULL) || 15427 (my_rsm->r_rtr_cnt != 1)) { 15428 /* retransmission? */ 15429 goto use_latest; 15430 } 15431 } else { 15432 if (rack->r_ctl.rc_first_appl == NULL) { 15433 /* 15434 * If rc_first_appl is NULL 15435 * then the cnt should be 0. 15436 * This is probably an error, maybe 15437 * a KASSERT would be approprate. 
15438 */ 15439 goto use_latest; 15440 } 15441 /* 15442 * If we have a marker pointer to the last one that is 15443 * app limited we can use that, but we need to set 15444 * things up so that when it gets ack'ed we record 15445 * the ack time (if its not already acked). 15446 */ 15447 rack->app_limited_needs_set = 1; 15448 /* 15449 * We want to get to the rsm that is either 15450 * next with space i.e. over 1 MSS or the one 15451 * after that (after the app-limited). 15452 */ 15453 my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, 15454 rack->r_ctl.rc_first_appl); 15455 if (my_rsm) { 15456 if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp)) 15457 /* Have to use the next one */ 15458 my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, 15459 my_rsm); 15460 else { 15461 /* Use after the first MSS of it is acked */ 15462 tp->gput_seq = my_rsm->r_start + ctf_fixed_maxseg(tp); 15463 goto start_set; 15464 } 15465 } 15466 if ((my_rsm == NULL) || 15467 (my_rsm->r_rtr_cnt != 1)) { 15468 /* 15469 * Either its a retransmit or 15470 * the last is the app-limited one. 15471 */ 15472 goto use_latest; 15473 } 15474 } 15475 tp->gput_seq = my_rsm->r_start; 15476 start_set: 15477 if (my_rsm->r_flags & RACK_ACKED) { 15478 /* 15479 * This one has been acked use the arrival ack time 15480 */ 15481 tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival; 15482 rack->app_limited_needs_set = 0; 15483 } 15484 rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[(my_rsm->r_rtr_cnt-1)]; 15485 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 15486 rack_log_pacing_delay_calc(rack, 15487 tp->gput_seq, 15488 tp->gput_ack, 15489 (uint64_t)my_rsm, 15490 tp->gput_ts, 15491 rack->r_ctl.rc_app_limited_cnt, 15492 9, 15493 __LINE__, NULL, 0); 15494 return; 15495 } 15496 15497 use_latest: 15498 /* 15499 * We don't know how long we may have been 15500 * idle or if this is the first-send. Lets 15501 * setup the flag so we will trim off 15502 * the first ack'd data so we get a true 15503 * measurement. 15504 */ 15505 rack->app_limited_needs_set = 1; 15506 tp->gput_ack = startseq + rack_get_measure_window(tp, rack); 15507 /* Find this guy so we can pull the send time */ 15508 fe.r_start = startseq; 15509 my_rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 15510 if (my_rsm) { 15511 rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[(my_rsm->r_rtr_cnt-1)]; 15512 if (my_rsm->r_flags & RACK_ACKED) { 15513 /* 15514 * Unlikely since its probably what was 15515 * just transmitted (but I am paranoid). 15516 */ 15517 tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival; 15518 rack->app_limited_needs_set = 0; 15519 } 15520 if (SEQ_LT(my_rsm->r_start, tp->gput_seq)) { 15521 /* This also is unlikely */ 15522 tp->gput_seq = my_rsm->r_start; 15523 } 15524 } else { 15525 /* 15526 * TSNH unless we have some send-map limit, 15527 * and even at that it should not be hitting 15528 * that limit (we should have stopped sending). 
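 * (TSNH is the usual shorthand for "this should not happen"; if it
 * does we just stamp the output time from the clock below instead of
 * from a send map entry.)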
15529 */ 15530 struct timeval tv; 15531 15532 microuptime(&tv); 15533 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); 15534 } 15535 rack_log_pacing_delay_calc(rack, 15536 tp->gput_seq, 15537 tp->gput_ack, 15538 (uint64_t)my_rsm, 15539 tp->gput_ts, 15540 rack->r_ctl.rc_app_limited_cnt, 15541 9, __LINE__, NULL, 0); 15542 } 15543 15544 static inline uint32_t 15545 rack_what_can_we_send(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cwnd_to_use, 15546 uint32_t avail, int32_t sb_offset) 15547 { 15548 uint32_t len; 15549 uint32_t sendwin; 15550 15551 if (tp->snd_wnd > cwnd_to_use) 15552 sendwin = cwnd_to_use; 15553 else 15554 sendwin = tp->snd_wnd; 15555 if (ctf_outstanding(tp) >= tp->snd_wnd) { 15556 /* We never want to go over our peers rcv-window */ 15557 len = 0; 15558 } else { 15559 uint32_t flight; 15560 15561 flight = ctf_flight_size(tp, rack->r_ctl.rc_sacked); 15562 if (flight >= sendwin) { 15563 /* 15564 * We have in flight what we are allowed by cwnd (if 15565 * it was rwnd blocking it would have hit above out 15566 * >= tp->snd_wnd). 15567 */ 15568 return (0); 15569 } 15570 len = sendwin - flight; 15571 if ((len + ctf_outstanding(tp)) > tp->snd_wnd) { 15572 /* We would send too much (beyond the rwnd) */ 15573 len = tp->snd_wnd - ctf_outstanding(tp); 15574 } 15575 if ((len + sb_offset) > avail) { 15576 /* 15577 * We don't have that much in the SB, how much is 15578 * there? 15579 */ 15580 len = avail - sb_offset; 15581 } 15582 } 15583 return (len); 15584 } 15585 15586 static void 15587 rack_log_fsb(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t flags, 15588 unsigned ipoptlen, int32_t orig_len, int32_t len, int error, 15589 int rsm_is_null, int optlen, int line, uint16_t mode) 15590 { 15591 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 15592 union tcp_log_stackspecific log; 15593 struct timeval tv; 15594 15595 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 15596 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); 15597 log.u_bbr.flex1 = error; 15598 log.u_bbr.flex2 = flags; 15599 log.u_bbr.flex3 = rsm_is_null; 15600 log.u_bbr.flex4 = ipoptlen; 15601 log.u_bbr.flex5 = tp->rcv_numsacks; 15602 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 15603 log.u_bbr.flex7 = optlen; 15604 log.u_bbr.flex8 = rack->r_fsb_inited; 15605 log.u_bbr.applimited = rack->r_fast_output; 15606 log.u_bbr.bw_inuse = rack_get_bw(rack); 15607 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); 15608 log.u_bbr.cwnd_gain = mode; 15609 log.u_bbr.pkts_out = orig_len; 15610 log.u_bbr.lt_epoch = len; 15611 log.u_bbr.delivered = line; 15612 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 15613 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 15614 tcp_log_event_(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_FSB, 0, 15615 len, &log, false, NULL, NULL, 0, &tv); 15616 } 15617 } 15618 15619 15620 static struct mbuf * 15621 rack_fo_base_copym(struct mbuf *the_m, uint32_t the_off, int32_t *plen, 15622 struct rack_fast_send_blk *fsb, 15623 int32_t seglimit, int32_t segsize, int hw_tls) 15624 { 15625 #ifdef KERN_TLS 15626 struct ktls_session *tls, *ntls; 15627 struct mbuf *start; 15628 #endif 15629 struct mbuf *m, *n, **np, *smb; 15630 struct mbuf *top; 15631 int32_t off, soff; 15632 int32_t len = *plen; 15633 int32_t fragsize; 15634 int32_t len_cp = 0; 15635 uint32_t mlen, frags; 15636 15637 soff = off = the_off; 15638 smb = m = the_m; 15639 np = ⊤ 15640 top = NULL; 15641 #ifdef KERN_TLS 15642 if (hw_tls && (m->m_flags & M_EXTPG)) 15643 tls = m->m_epg_tls; 15644 else 15645 tls = NULL; 15646 start 
= m; 15647 #endif 15648 while (len > 0) { 15649 if (m == NULL) { 15650 *plen = len_cp; 15651 break; 15652 } 15653 #ifdef KERN_TLS 15654 if (hw_tls) { 15655 if (m->m_flags & M_EXTPG) 15656 ntls = m->m_epg_tls; 15657 else 15658 ntls = NULL; 15659 15660 /* 15661 * Avoid mixing TLS records with handshake 15662 * data or TLS records from different 15663 * sessions. 15664 */ 15665 if (tls != ntls) { 15666 MPASS(m != start); 15667 *plen = len_cp; 15668 break; 15669 } 15670 } 15671 #endif 15672 mlen = min(len, m->m_len - off); 15673 if (seglimit) { 15674 /* 15675 * For M_EXTPG mbufs, add 3 segments 15676 * + 1 in case we are crossing page boundaries 15677 * + 2 in case the TLS hdr/trailer are used 15678 * It is cheaper to just add the segments 15679 * than it is to take the cache miss to look 15680 * at the mbuf ext_pgs state in detail. 15681 */ 15682 if (m->m_flags & M_EXTPG) { 15683 fragsize = min(segsize, PAGE_SIZE); 15684 frags = 3; 15685 } else { 15686 fragsize = segsize; 15687 frags = 0; 15688 } 15689 15690 /* Break if we really can't fit anymore. */ 15691 if ((frags + 1) >= seglimit) { 15692 *plen = len_cp; 15693 break; 15694 } 15695 15696 /* 15697 * Reduce size if you can't copy the whole 15698 * mbuf. If we can't copy the whole mbuf, also 15699 * adjust len so the loop will end after this 15700 * mbuf. 15701 */ 15702 if ((frags + howmany(mlen, fragsize)) >= seglimit) { 15703 mlen = (seglimit - frags - 1) * fragsize; 15704 len = mlen; 15705 *plen = len_cp + len; 15706 } 15707 frags += howmany(mlen, fragsize); 15708 if (frags == 0) 15709 frags++; 15710 seglimit -= frags; 15711 KASSERT(seglimit > 0, 15712 ("%s: seglimit went too low", __func__)); 15713 } 15714 n = m_get(M_NOWAIT, m->m_type); 15715 *np = n; 15716 if (n == NULL) 15717 goto nospace; 15718 n->m_len = mlen; 15719 soff += mlen; 15720 len_cp += n->m_len; 15721 if (m->m_flags & (M_EXT|M_EXTPG)) { 15722 n->m_data = m->m_data + off; 15723 mb_dupcl(n, m); 15724 } else { 15725 bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), 15726 (u_int)n->m_len); 15727 } 15728 len -= n->m_len; 15729 off = 0; 15730 m = m->m_next; 15731 np = &n->m_next; 15732 if (len || (soff == smb->m_len)) { 15733 /* 15734 * We have more so we move forward or 15735 * we have consumed the entire mbuf and 15736 * len has fell to 0. 15737 */ 15738 soff = 0; 15739 smb = m; 15740 } 15741 15742 } 15743 if (fsb != NULL) { 15744 fsb->m = smb; 15745 fsb->off = soff; 15746 if (smb) { 15747 /* 15748 * Save off the size of the mbuf. We do 15749 * this so that we can recognize when it 15750 * has been trimmed by sbcut() as acks 15751 * come in. 15752 */ 15753 fsb->o_m_len = smb->m_len; 15754 } else { 15755 /* 15756 * This is the case where the next mbuf went to NULL. This 15757 * means with this copy we have sent everything in the sb. 15758 * In theory we could clear the fast_output flag, but lets 15759 * not since its possible that we could get more added 15760 * and acks that call the extend function which would let 15761 * us send more. 15762 */ 15763 fsb->o_m_len = 0; 15764 } 15765 } 15766 return (top); 15767 nospace: 15768 if (top) 15769 m_freem(top); 15770 return (NULL); 15771 15772 } 15773 15774 /* 15775 * This is a copy of m_copym(), taking the TSO segment size/limit 15776 * constraints into account, and advancing the sndptr as it goes. 
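 * The fsb.off/fsb.o_m_len bookkeeping lets the fast path notice when
 * the head mbuf has been trimmed by arriving acks (sbcut) or grown by
 * m_compress() between sends and re-derive the correct offset.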
15777 */ 15778 static struct mbuf * 15779 rack_fo_m_copym(struct tcp_rack *rack, int32_t *plen, 15780 int32_t seglimit, int32_t segsize, struct mbuf **s_mb, int *s_soff) 15781 { 15782 struct mbuf *m, *n; 15783 int32_t soff; 15784 15785 soff = rack->r_ctl.fsb.off; 15786 m = rack->r_ctl.fsb.m; 15787 if (rack->r_ctl.fsb.o_m_len > m->m_len) { 15788 /* 15789 * The mbuf had the front of it chopped off by an ack 15790 * we need to adjust the soff/off by that difference. 15791 */ 15792 uint32_t delta; 15793 15794 delta = rack->r_ctl.fsb.o_m_len - m->m_len; 15795 soff -= delta; 15796 } else if (rack->r_ctl.fsb.o_m_len < m->m_len) { 15797 /* 15798 * The mbuf was expanded probably by 15799 * a m_compress. Just update o_m_len. 15800 */ 15801 rack->r_ctl.fsb.o_m_len = m->m_len; 15802 } 15803 KASSERT(soff >= 0, ("%s, negative off %d", __FUNCTION__, soff)); 15804 KASSERT(*plen >= 0, ("%s, negative len %d", __FUNCTION__, *plen)); 15805 KASSERT(soff < m->m_len, ("%s rack:%p len:%u m:%p m->m_len:%u < off?", 15806 __FUNCTION__, 15807 rack, *plen, m, m->m_len)); 15808 /* Save off the right location before we copy and advance */ 15809 *s_soff = soff; 15810 *s_mb = rack->r_ctl.fsb.m; 15811 n = rack_fo_base_copym(m, soff, plen, 15812 &rack->r_ctl.fsb, 15813 seglimit, segsize, rack->r_ctl.fsb.hw_tls); 15814 return (n); 15815 } 15816 15817 static int 15818 rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, 15819 uint64_t ts_val, uint32_t cts, uint32_t ms_cts, struct timeval *tv, int len, uint8_t doing_tlp) 15820 { 15821 /* 15822 * Enter the fast retransmit path. We are given that a sched_pin is 15823 * in place (if accounting is compliled in) and the cycle count taken 15824 * at the entry is in the ts_val. The concept her is that the rsm 15825 * now holds the mbuf offsets and such so we can directly transmit 15826 * without a lot of overhead, the len field is already set for 15827 * us to prohibit us from sending too much (usually its 1MSS). 
15828 */ 15829 struct ip *ip = NULL; 15830 struct udphdr *udp = NULL; 15831 struct tcphdr *th = NULL; 15832 struct mbuf *m = NULL; 15833 struct inpcb *inp; 15834 uint8_t *cpto; 15835 struct tcp_log_buffer *lgb; 15836 #ifdef TCP_ACCOUNTING 15837 uint64_t crtsc; 15838 int cnt_thru = 1; 15839 #endif 15840 struct tcpopt to; 15841 u_char opt[TCP_MAXOLEN]; 15842 uint32_t hdrlen, optlen; 15843 int32_t slot, segsiz, max_val, tso = 0, error, flags, ulen = 0; 15844 uint32_t us_cts; 15845 uint32_t if_hw_tsomaxsegcount = 0, startseq; 15846 uint32_t if_hw_tsomaxsegsize; 15847 15848 #ifdef INET6 15849 struct ip6_hdr *ip6 = NULL; 15850 15851 if (rack->r_is_v6) { 15852 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 15853 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 15854 } else 15855 #endif /* INET6 */ 15856 { 15857 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 15858 hdrlen = sizeof(struct tcpiphdr); 15859 } 15860 if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) { 15861 goto failed; 15862 } 15863 if (doing_tlp) { 15864 /* Its a TLP add the flag, it may already be there but be sure */ 15865 rsm->r_flags |= RACK_TLP; 15866 } else { 15867 /* If it was a TLP it is not not on this retransmit */ 15868 rsm->r_flags &= ~RACK_TLP; 15869 } 15870 startseq = rsm->r_start; 15871 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 15872 inp = rack->rc_inp; 15873 to.to_flags = 0; 15874 flags = tcp_outflags[tp->t_state]; 15875 if (flags & (TH_SYN|TH_RST)) { 15876 goto failed; 15877 } 15878 if (rsm->r_flags & RACK_HAS_FIN) { 15879 /* We can't send a FIN here */ 15880 goto failed; 15881 } 15882 if (flags & TH_FIN) { 15883 /* We never send a FIN */ 15884 flags &= ~TH_FIN; 15885 } 15886 if (tp->t_flags & TF_RCVD_TSTMP) { 15887 to.to_tsval = ms_cts + tp->ts_offset; 15888 to.to_tsecr = tp->ts_recent; 15889 to.to_flags = TOF_TS; 15890 } 15891 optlen = tcp_addoptions(&to, opt); 15892 hdrlen += optlen; 15893 udp = rack->r_ctl.fsb.udp; 15894 if (udp) 15895 hdrlen += sizeof(struct udphdr); 15896 if (rack->r_ctl.rc_pace_max_segs) 15897 max_val = rack->r_ctl.rc_pace_max_segs; 15898 else if (rack->rc_user_set_max_segs) 15899 max_val = rack->rc_user_set_max_segs * segsiz; 15900 else 15901 max_val = len; 15902 if ((tp->t_flags & TF_TSO) && 15903 V_tcp_do_tso && 15904 (len > segsiz) && 15905 (tp->t_port == 0)) 15906 tso = 1; 15907 #ifdef INET6 15908 if (MHLEN < hdrlen + max_linkhdr) 15909 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 15910 else 15911 #endif 15912 m = m_gethdr(M_NOWAIT, MT_DATA); 15913 if (m == NULL) 15914 goto failed; 15915 m->m_data += max_linkhdr; 15916 m->m_len = hdrlen; 15917 th = rack->r_ctl.fsb.th; 15918 /* Establish the len to send */ 15919 if (len > max_val) 15920 len = max_val; 15921 if ((tso) && (len + optlen > tp->t_maxseg)) { 15922 uint32_t if_hw_tsomax; 15923 int32_t max_len; 15924 15925 /* extract TSO information */ 15926 if_hw_tsomax = tp->t_tsomax; 15927 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 15928 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 15929 /* 15930 * Check if we should limit by maximum payload 15931 * length: 15932 */ 15933 if (if_hw_tsomax != 0) { 15934 /* compute maximum TSO length */ 15935 max_len = (if_hw_tsomax - hdrlen - 15936 max_linkhdr); 15937 if (max_len <= 0) { 15938 goto failed; 15939 } else if (len > max_len) { 15940 len = max_len; 15941 } 15942 } 15943 if (len <= segsiz) { 15944 /* 15945 * In case there are too many small fragments don't 15946 * use TSO: 15947 */ 15948 tso = 0; 15949 } 15950 } else { 15951 tso = 0; 15952 } 15953 if ((tso == 0) && 
(len > segsiz)) 15954 len = segsiz; 15955 us_cts = tcp_get_usecs(tv); 15956 if ((len == 0) || 15957 (len <= MHLEN - hdrlen - max_linkhdr)) { 15958 goto failed; 15959 } 15960 th->th_seq = htonl(rsm->r_start); 15961 th->th_ack = htonl(tp->rcv_nxt); 15962 /* 15963 * The PUSH bit should only be applied 15964 * if the full retransmission is made. If 15965 * we are sending less than this is the 15966 * left hand edge and should not have 15967 * the PUSH bit. 15968 */ 15969 if ((rsm->r_flags & RACK_HAD_PUSH) && 15970 (len == (rsm->r_end - rsm->r_start))) 15971 flags |= TH_PUSH; 15972 th->th_flags = flags; 15973 th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale)); 15974 if (th->th_win == 0) { 15975 tp->t_sndzerowin++; 15976 tp->t_flags |= TF_RXWIN0SENT; 15977 } else 15978 tp->t_flags &= ~TF_RXWIN0SENT; 15979 if (rsm->r_flags & RACK_TLP) { 15980 /* 15981 * TLP should not count in retran count, but 15982 * in its own bin 15983 */ 15984 counter_u64_add(rack_tlp_retran, 1); 15985 counter_u64_add(rack_tlp_retran_bytes, len); 15986 } else { 15987 tp->t_sndrexmitpack++; 15988 KMOD_TCPSTAT_INC(tcps_sndrexmitpack); 15989 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len); 15990 } 15991 #ifdef STATS 15992 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 15993 len); 15994 #endif 15995 if (rsm->m == NULL) 15996 goto failed; 15997 if (rsm->orig_m_len != rsm->m->m_len) { 15998 /* Fix up the orig_m_len and possibly the mbuf offset */ 15999 rack_adjust_orig_mlen(rsm); 16000 } 16001 m->m_next = rack_fo_base_copym(rsm->m, rsm->soff, &len, NULL, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, rsm->r_hw_tls); 16002 if (len <= segsiz) { 16003 /* 16004 * Must have ran out of mbufs for the copy 16005 * shorten it to no longer need tso. Lets 16006 * not put on sendalot since we are low on 16007 * mbufs. 
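 * (A len at or below one segment does not need TSO anyway, so
 * clearing tso here is harmless even when the copy was not short.)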
16008 */ 16009 tso = 0; 16010 } 16011 if ((m->m_next == NULL) || (len <= 0)){ 16012 goto failed; 16013 } 16014 if (udp) { 16015 if (rack->r_is_v6) 16016 ulen = hdrlen + len - sizeof(struct ip6_hdr); 16017 else 16018 ulen = hdrlen + len - sizeof(struct ip); 16019 udp->uh_ulen = htons(ulen); 16020 } 16021 m->m_pkthdr.rcvif = (struct ifnet *)0; 16022 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 16023 #ifdef INET6 16024 if (rack->r_is_v6) { 16025 if (tp->t_port) { 16026 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 16027 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 16028 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 16029 th->th_sum = htons(0); 16030 UDPSTAT_INC(udps_opackets); 16031 } else { 16032 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 16033 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 16034 th->th_sum = in6_cksum_pseudo(ip6, 16035 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 16036 0); 16037 } 16038 } 16039 #endif 16040 #if defined(INET6) && defined(INET) 16041 else 16042 #endif 16043 #ifdef INET 16044 { 16045 if (tp->t_port) { 16046 m->m_pkthdr.csum_flags = CSUM_UDP; 16047 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 16048 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 16049 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 16050 th->th_sum = htons(0); 16051 UDPSTAT_INC(udps_opackets); 16052 } else { 16053 m->m_pkthdr.csum_flags = CSUM_TCP; 16054 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 16055 th->th_sum = in_pseudo(ip->ip_src.s_addr, 16056 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 16057 IPPROTO_TCP + len + optlen)); 16058 } 16059 /* IP version must be set here for ipv4/ipv6 checking later */ 16060 KASSERT(ip->ip_v == IPVERSION, 16061 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 16062 } 16063 #endif 16064 if (tso) { 16065 KASSERT(len > tp->t_maxseg - optlen, 16066 ("%s: len <= tso_segsz tp:%p", __func__, tp)); 16067 m->m_pkthdr.csum_flags |= CSUM_TSO; 16068 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; 16069 } 16070 #ifdef INET6 16071 if (rack->r_is_v6) { 16072 ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit; 16073 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 16074 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 16075 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 16076 else 16077 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 16078 } 16079 #endif 16080 #if defined(INET) && defined(INET6) 16081 else 16082 #endif 16083 #ifdef INET 16084 { 16085 ip->ip_len = htons(m->m_pkthdr.len); 16086 ip->ip_ttl = rack->r_ctl.fsb.hoplimit; 16087 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 16088 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 16089 if (tp->t_port == 0 || len < V_tcp_minmss) { 16090 ip->ip_off |= htons(IP_DF); 16091 } 16092 } else { 16093 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 16094 } 16095 } 16096 #endif 16097 /* Time to copy in our header */ 16098 cpto = mtod(m, uint8_t *); 16099 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); 16100 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr)); 16101 if (optlen) { 16102 bcopy(opt, th + 1, optlen); 16103 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 16104 } else { 16105 th->th_off = sizeof(struct tcphdr) >> 2; 16106 } 16107 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 16108 union tcp_log_stackspecific log; 16109 16110 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 16111 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); 16112 if (rack->rack_no_prr) 16113 log.u_bbr.flex1 = 0; 16114 else 16115 log.u_bbr.flex1 = 
rack->r_ctl.rc_prr_sndcnt; 16116 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 16117 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 16118 log.u_bbr.flex4 = max_val; 16119 log.u_bbr.flex5 = 0; 16120 /* Save off the early/late values */ 16121 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 16122 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 16123 log.u_bbr.bw_inuse = rack_get_bw(rack); 16124 if (doing_tlp == 0) 16125 log.u_bbr.flex8 = 1; 16126 else 16127 log.u_bbr.flex8 = 2; 16128 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); 16129 log.u_bbr.flex7 = 55; 16130 log.u_bbr.pkts_out = tp->t_maxseg; 16131 log.u_bbr.timeStamp = cts; 16132 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 16133 log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use; 16134 log.u_bbr.delivered = 0; 16135 lgb = tcp_log_event_(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, 16136 len, &log, false, NULL, NULL, 0, tv); 16137 } else 16138 lgb = NULL; 16139 #ifdef INET6 16140 if (rack->r_is_v6) { 16141 error = ip6_output(m, NULL, 16142 &inp->inp_route6, 16143 0, NULL, NULL, inp); 16144 } 16145 #endif 16146 #if defined(INET) && defined(INET6) 16147 else 16148 #endif 16149 #ifdef INET 16150 { 16151 error = ip_output(m, NULL, 16152 &inp->inp_route, 16153 0, 0, inp); 16154 } 16155 #endif 16156 m = NULL; 16157 if (lgb) { 16158 lgb->tlb_errno = error; 16159 lgb = NULL; 16160 } 16161 if (error) { 16162 goto failed; 16163 } 16164 rack_log_output(tp, &to, len, rsm->r_start, flags, error, rack_to_usec_ts(tv), 16165 rsm, RACK_SENT_FP, rsm->m, rsm->soff, rsm->r_hw_tls); 16166 if (doing_tlp && (rack->fast_rsm_hack == 0)) { 16167 rack->rc_tlp_in_progress = 1; 16168 rack->r_ctl.rc_tlp_cnt_out++; 16169 } 16170 if (error == 0) { 16171 tcp_account_for_send(tp, len, 1, doing_tlp, rsm->r_hw_tls); 16172 if (doing_tlp) { 16173 rack->rc_last_sent_tlp_past_cumack = 0; 16174 rack->rc_last_sent_tlp_seq_valid = 1; 16175 rack->r_ctl.last_sent_tlp_seq = rsm->r_start; 16176 rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start; 16177 } 16178 } 16179 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 16180 rack->forced_ack = 0; /* If we send something zap the FA flag */ 16181 if (IN_FASTRECOVERY(tp->t_flags) && rsm) 16182 rack->r_ctl.retran_during_recovery += len; 16183 { 16184 int idx; 16185 16186 idx = (len / segsiz) + 3; 16187 if (idx >= TCP_MSS_ACCT_ATIMER) 16188 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 16189 else 16190 counter_u64_add(rack_out_size[idx], 1); 16191 } 16192 if (tp->t_rtttime == 0) { 16193 tp->t_rtttime = ticks; 16194 tp->t_rtseq = startseq; 16195 KMOD_TCPSTAT_INC(tcps_segstimed); 16196 } 16197 counter_u64_add(rack_fto_rsm_send, 1); 16198 if (error && (error == ENOBUFS)) { 16199 slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); 16200 if (rack->rc_enobuf < 0x7f) 16201 rack->rc_enobuf++; 16202 if (slot < (10 * HPTS_USEC_IN_MSEC)) 16203 slot = 10 * HPTS_USEC_IN_MSEC; 16204 } else 16205 slot = rack_get_pacing_delay(rack, tp, len, NULL, segsiz); 16206 if ((slot == 0) || 16207 (rack->rc_always_pace == 0) || 16208 (rack->r_rr_config == 1)) { 16209 /* 16210 * We have no pacing set or we 16211 * are using old-style rack or 16212 * we are overriden to use the old 1ms pacing. 
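 * In all three cases the timer below falls back to the stack's
 * minimum timeout (rc_min_to) rather than a computed pacing delay.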
16213 */ 16214 slot = rack->r_ctl.rc_min_to; 16215 } 16216 rack_start_hpts_timer(rack, tp, cts, slot, len, 0); 16217 if (rack->r_must_retran) { 16218 rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start); 16219 if ((SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) || 16220 ((rsm->r_flags & RACK_MUST_RXT) == 0)) { 16221 /* 16222 * We have retransmitted all we need. If 16223 * RACK_MUST_RXT is not set then we need to 16224 * not retransmit this guy. 16225 */ 16226 rack->r_must_retran = 0; 16227 rack->r_ctl.rc_out_at_rto = 0; 16228 if ((rsm->r_flags & RACK_MUST_RXT) == 0) { 16229 /* Not one we should rxt */ 16230 goto failed; 16231 } else { 16232 /* Clear the flag */ 16233 rsm->r_flags &= ~RACK_MUST_RXT; 16234 } 16235 } else { 16236 /* Remove the flag */ 16237 rsm->r_flags &= ~RACK_MUST_RXT; 16238 } 16239 } 16240 #ifdef TCP_ACCOUNTING 16241 crtsc = get_cyclecount(); 16242 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16243 tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru; 16244 } 16245 counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], cnt_thru); 16246 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16247 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); 16248 } 16249 counter_u64_add(tcp_proc_time[SND_OUT_DATA], (crtsc - ts_val)); 16250 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16251 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((len + segsiz - 1) / segsiz); 16252 } 16253 counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((len + segsiz - 1) / segsiz)); 16254 sched_unpin(); 16255 #endif 16256 return (0); 16257 failed: 16258 if (m) 16259 m_free(m); 16260 return (-1); 16261 } 16262 16263 static void 16264 rack_sndbuf_autoscale(struct tcp_rack *rack) 16265 { 16266 /* 16267 * Automatic sizing of send socket buffer. Often the send buffer 16268 * size is not optimally adjusted to the actual network conditions 16269 * at hand (delay bandwidth product). Setting the buffer size too 16270 * small limits throughput on links with high bandwidth and high 16271 * delay (eg. trans-continental/oceanic links). Setting the 16272 * buffer size too big consumes too much real kernel memory, 16273 * especially with many connections on busy servers. 16274 * 16275 * The criteria to step up the send buffer one notch are: 16276 * 1. receive window of remote host is larger than send buffer 16277 * (with a fudge factor of 5/4th); 16278 * 2. send buffer is filled to 7/8th with data (so we actually 16279 * have data to make use of it); 16280 * 3. send buffer fill has not hit maximal automatic size; 16281 * 4. our send window (slow start and cogestion controlled) is 16282 * larger than sent but unacknowledged data in send buffer. 16283 * 16284 * Note that the rack version moves things much faster since 16285 * we want to avoid hitting cache lines in the rack_fast_output() 16286 * path so this is called much less often and thus moves 16287 * the SB forward by a percentage. 
16288 */ 16289 struct socket *so; 16290 struct tcpcb *tp; 16291 uint32_t sendwin, scaleup; 16292 16293 tp = rack->rc_tp; 16294 so = rack->rc_inp->inp_socket; 16295 sendwin = min(rack->r_ctl.cwnd_to_use, tp->snd_wnd); 16296 if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { 16297 if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat && 16298 sbused(&so->so_snd) >= 16299 (so->so_snd.sb_hiwat / 8 * 7) && 16300 sbused(&so->so_snd) < V_tcp_autosndbuf_max && 16301 sendwin >= (sbused(&so->so_snd) - 16302 (tp->snd_nxt - tp->snd_una))) { 16303 if (rack_autosndbuf_inc) 16304 scaleup = (rack_autosndbuf_inc * so->so_snd.sb_hiwat) / 100; 16305 else 16306 scaleup = V_tcp_autosndbuf_inc; 16307 if (scaleup < V_tcp_autosndbuf_inc) 16308 scaleup = V_tcp_autosndbuf_inc; 16309 scaleup += so->so_snd.sb_hiwat; 16310 if (scaleup > V_tcp_autosndbuf_max) 16311 scaleup = V_tcp_autosndbuf_max; 16312 if (!sbreserve_locked(&so->so_snd, scaleup, so, curthread)) 16313 so->so_snd.sb_flags &= ~SB_AUTOSIZE; 16314 } 16315 } 16316 } 16317 16318 static int 16319 rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val, 16320 uint32_t cts, uint32_t ms_cts, struct timeval *tv, long tot_len, int *send_err) 16321 { 16322 /* 16323 * Enter to do fast output. We are given that the sched_pin is 16324 * in place (if accounting is compiled in) and the cycle count taken 16325 * at entry is in place in ts_val. The idea here is that 16326 * we know how many more bytes needs to be sent (presumably either 16327 * during pacing or to fill the cwnd and that was greater than 16328 * the max-burst). We have how much to send and all the info we 16329 * need to just send. 16330 */ 16331 struct ip *ip = NULL; 16332 struct udphdr *udp = NULL; 16333 struct tcphdr *th = NULL; 16334 struct mbuf *m, *s_mb; 16335 struct inpcb *inp; 16336 uint8_t *cpto; 16337 struct tcp_log_buffer *lgb; 16338 #ifdef TCP_ACCOUNTING 16339 uint64_t crtsc; 16340 #endif 16341 struct tcpopt to; 16342 u_char opt[TCP_MAXOLEN]; 16343 uint32_t hdrlen, optlen; 16344 int cnt_thru = 1; 16345 int32_t slot, segsiz, len, max_val, tso = 0, sb_offset, error, flags, ulen = 0; 16346 uint32_t us_cts, s_soff; 16347 uint32_t if_hw_tsomaxsegcount = 0, startseq; 16348 uint32_t if_hw_tsomaxsegsize; 16349 uint16_t add_flag = RACK_SENT_FP; 16350 #ifdef INET6 16351 struct ip6_hdr *ip6 = NULL; 16352 16353 if (rack->r_is_v6) { 16354 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 16355 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 16356 } else 16357 #endif /* INET6 */ 16358 { 16359 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 16360 hdrlen = sizeof(struct tcpiphdr); 16361 } 16362 if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) { 16363 m = NULL; 16364 goto failed; 16365 } 16366 startseq = tp->snd_max; 16367 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 16368 inp = rack->rc_inp; 16369 len = rack->r_ctl.fsb.left_to_send; 16370 to.to_flags = 0; 16371 flags = rack->r_ctl.fsb.tcp_flags; 16372 if (tp->t_flags & TF_RCVD_TSTMP) { 16373 to.to_tsval = ms_cts + tp->ts_offset; 16374 to.to_tsecr = tp->ts_recent; 16375 to.to_flags = TOF_TS; 16376 } 16377 optlen = tcp_addoptions(&to, opt); 16378 hdrlen += optlen; 16379 udp = rack->r_ctl.fsb.udp; 16380 if (udp) 16381 hdrlen += sizeof(struct udphdr); 16382 if (rack->r_ctl.rc_pace_max_segs) 16383 max_val = rack->r_ctl.rc_pace_max_segs; 16384 else if (rack->rc_user_set_max_segs) 16385 max_val = rack->rc_user_set_max_segs * segsiz; 16386 else 16387 max_val = len; 16388 if ((tp->t_flags & TF_TSO) && 16389 
V_tcp_do_tso && 16390 (len > segsiz) && 16391 (tp->t_port == 0)) 16392 tso = 1; 16393 again: 16394 #ifdef INET6 16395 if (MHLEN < hdrlen + max_linkhdr) 16396 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 16397 else 16398 #endif 16399 m = m_gethdr(M_NOWAIT, MT_DATA); 16400 if (m == NULL) 16401 goto failed; 16402 m->m_data += max_linkhdr; 16403 m->m_len = hdrlen; 16404 th = rack->r_ctl.fsb.th; 16405 /* Establish the len to send */ 16406 if (len > max_val) 16407 len = max_val; 16408 if ((tso) && (len + optlen > tp->t_maxseg)) { 16409 uint32_t if_hw_tsomax; 16410 int32_t max_len; 16411 16412 /* extract TSO information */ 16413 if_hw_tsomax = tp->t_tsomax; 16414 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 16415 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 16416 /* 16417 * Check if we should limit by maximum payload 16418 * length: 16419 */ 16420 if (if_hw_tsomax != 0) { 16421 /* compute maximum TSO length */ 16422 max_len = (if_hw_tsomax - hdrlen - 16423 max_linkhdr); 16424 if (max_len <= 0) { 16425 goto failed; 16426 } else if (len > max_len) { 16427 len = max_len; 16428 } 16429 } 16430 if (len <= segsiz) { 16431 /* 16432 * In case there are too many small fragments don't 16433 * use TSO: 16434 */ 16435 tso = 0; 16436 } 16437 } else { 16438 tso = 0; 16439 } 16440 if ((tso == 0) && (len > segsiz)) 16441 len = segsiz; 16442 us_cts = tcp_get_usecs(tv); 16443 if ((len == 0) || 16444 (len <= MHLEN - hdrlen - max_linkhdr)) { 16445 goto failed; 16446 } 16447 sb_offset = tp->snd_max - tp->snd_una; 16448 th->th_seq = htonl(tp->snd_max); 16449 th->th_ack = htonl(tp->rcv_nxt); 16450 th->th_flags = flags; 16451 th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale)); 16452 if (th->th_win == 0) { 16453 tp->t_sndzerowin++; 16454 tp->t_flags |= TF_RXWIN0SENT; 16455 } else 16456 tp->t_flags &= ~TF_RXWIN0SENT; 16457 tp->snd_up = tp->snd_una; /* drag it along, its deprecated */ 16458 KMOD_TCPSTAT_INC(tcps_sndpack); 16459 KMOD_TCPSTAT_ADD(tcps_sndbyte, len); 16460 #ifdef STATS 16461 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 16462 len); 16463 #endif 16464 if (rack->r_ctl.fsb.m == NULL) 16465 goto failed; 16466 16467 /* s_mb and s_soff are saved for rack_log_output */ 16468 m->m_next = rack_fo_m_copym(rack, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, 16469 &s_mb, &s_soff); 16470 if (len <= segsiz) { 16471 /* 16472 * Must have ran out of mbufs for the copy 16473 * shorten it to no longer need tso. Lets 16474 * not put on sendalot since we are low on 16475 * mbufs. 16476 */ 16477 tso = 0; 16478 } 16479 if (rack->r_ctl.fsb.rfo_apply_push && 16480 (len == rack->r_ctl.fsb.left_to_send)) { 16481 th->th_flags |= TH_PUSH; 16482 add_flag |= RACK_HAD_PUSH; 16483 } 16484 if ((m->m_next == NULL) || (len <= 0)){ 16485 goto failed; 16486 } 16487 if (udp) { 16488 if (rack->r_is_v6) 16489 ulen = hdrlen + len - sizeof(struct ip6_hdr); 16490 else 16491 ulen = hdrlen + len - sizeof(struct ip); 16492 udp->uh_ulen = htons(ulen); 16493 } 16494 m->m_pkthdr.rcvif = (struct ifnet *)0; 16495 if (tp->t_state == TCPS_ESTABLISHED && 16496 (tp->t_flags2 & TF2_ECN_PERMIT)) { 16497 /* 16498 * If the peer has ECN, mark data packets with ECN capable 16499 * transmission (ECT). Ignore pure ack packets, 16500 * retransmissions. 
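 * (This fast path only emits new data at snd_max, and the
 * SEQ_GEQ(snd_nxt, snd_max) test below mirrors the slow path's way of
 * keeping ECT off retransmissions, per RFC 3168.)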
16501 */ 16502 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max)) { 16503 #ifdef INET6 16504 if (rack->r_is_v6) 16505 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); 16506 else 16507 #endif 16508 ip->ip_tos |= IPTOS_ECN_ECT0; 16509 KMOD_TCPSTAT_INC(tcps_ecn_ect0); 16510 /* 16511 * Reply with proper ECN notifications. 16512 * Only set CWR on new data segments. 16513 */ 16514 if (tp->t_flags2 & TF2_ECN_SND_CWR) { 16515 flags |= TH_CWR; 16516 tp->t_flags2 &= ~TF2_ECN_SND_CWR; 16517 } 16518 } 16519 if (tp->t_flags2 & TF2_ECN_SND_ECE) 16520 flags |= TH_ECE; 16521 } 16522 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 16523 #ifdef INET6 16524 if (rack->r_is_v6) { 16525 if (tp->t_port) { 16526 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 16527 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 16528 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 16529 th->th_sum = htons(0); 16530 UDPSTAT_INC(udps_opackets); 16531 } else { 16532 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 16533 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 16534 th->th_sum = in6_cksum_pseudo(ip6, 16535 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 16536 0); 16537 } 16538 } 16539 #endif 16540 #if defined(INET6) && defined(INET) 16541 else 16542 #endif 16543 #ifdef INET 16544 { 16545 if (tp->t_port) { 16546 m->m_pkthdr.csum_flags = CSUM_UDP; 16547 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 16548 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 16549 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 16550 th->th_sum = htons(0); 16551 UDPSTAT_INC(udps_opackets); 16552 } else { 16553 m->m_pkthdr.csum_flags = CSUM_TCP; 16554 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 16555 th->th_sum = in_pseudo(ip->ip_src.s_addr, 16556 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 16557 IPPROTO_TCP + len + optlen)); 16558 } 16559 /* IP version must be set here for ipv4/ipv6 checking later */ 16560 KASSERT(ip->ip_v == IPVERSION, 16561 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 16562 } 16563 #endif 16564 if (tso) { 16565 KASSERT(len > tp->t_maxseg - optlen, 16566 ("%s: len <= tso_segsz tp:%p", __func__, tp)); 16567 m->m_pkthdr.csum_flags |= CSUM_TSO; 16568 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; 16569 } 16570 #ifdef INET6 16571 if (rack->r_is_v6) { 16572 ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit; 16573 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 16574 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 16575 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 16576 else 16577 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 16578 } 16579 #endif 16580 #if defined(INET) && defined(INET6) 16581 else 16582 #endif 16583 #ifdef INET 16584 { 16585 ip->ip_len = htons(m->m_pkthdr.len); 16586 ip->ip_ttl = rack->r_ctl.fsb.hoplimit; 16587 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 16588 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 16589 if (tp->t_port == 0 || len < V_tcp_minmss) { 16590 ip->ip_off |= htons(IP_DF); 16591 } 16592 } else { 16593 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 16594 } 16595 } 16596 #endif 16597 /* Time to copy in our header */ 16598 cpto = mtod(m, uint8_t *); 16599 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); 16600 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr)); 16601 if (optlen) { 16602 bcopy(opt, th + 1, optlen); 16603 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 16604 } else { 16605 th->th_off = sizeof(struct tcphdr) >> 2; 16606 } 16607 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 16608 union 
tcp_log_stackspecific log; 16609 16610 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 16611 log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp); 16612 if (rack->rack_no_prr) 16613 log.u_bbr.flex1 = 0; 16614 else 16615 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 16616 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 16617 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 16618 log.u_bbr.flex4 = max_val; 16619 log.u_bbr.flex5 = 0; 16620 /* Save off the early/late values */ 16621 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 16622 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 16623 log.u_bbr.bw_inuse = rack_get_bw(rack); 16624 log.u_bbr.flex8 = 0; 16625 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); 16626 log.u_bbr.flex7 = 44; 16627 log.u_bbr.pkts_out = tp->t_maxseg; 16628 log.u_bbr.timeStamp = cts; 16629 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 16630 log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use; 16631 log.u_bbr.delivered = 0; 16632 lgb = tcp_log_event_(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, 16633 len, &log, false, NULL, NULL, 0, tv); 16634 } else 16635 lgb = NULL; 16636 #ifdef INET6 16637 if (rack->r_is_v6) { 16638 error = ip6_output(m, NULL, 16639 &inp->inp_route6, 16640 0, NULL, NULL, inp); 16641 } 16642 #endif 16643 #if defined(INET) && defined(INET6) 16644 else 16645 #endif 16646 #ifdef INET 16647 { 16648 error = ip_output(m, NULL, 16649 &inp->inp_route, 16650 0, 0, inp); 16651 } 16652 #endif 16653 if (lgb) { 16654 lgb->tlb_errno = error; 16655 lgb = NULL; 16656 } 16657 if (error) { 16658 *send_err = error; 16659 m = NULL; 16660 goto failed; 16661 } 16662 rack_log_output(tp, &to, len, tp->snd_max, flags, error, rack_to_usec_ts(tv), 16663 NULL, add_flag, s_mb, s_soff, rack->r_ctl.fsb.hw_tls); 16664 m = NULL; 16665 if (tp->snd_una == tp->snd_max) { 16666 rack->r_ctl.rc_tlp_rxt_last_time = cts; 16667 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 16668 tp->t_acktime = ticks; 16669 } 16670 if (error == 0) 16671 tcp_account_for_send(tp, len, 0, 0, rack->r_ctl.fsb.hw_tls); 16672 16673 rack->forced_ack = 0; /* If we send something zap the FA flag */ 16674 tot_len += len; 16675 if ((tp->t_flags & TF_GPUTINPROG) == 0) 16676 rack_start_gp_measurement(tp, rack, tp->snd_max, sb_offset); 16677 tp->snd_max += len; 16678 tp->snd_nxt = tp->snd_max; 16679 { 16680 int idx; 16681 16682 idx = (len / segsiz) + 3; 16683 if (idx >= TCP_MSS_ACCT_ATIMER) 16684 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 16685 else 16686 counter_u64_add(rack_out_size[idx], 1); 16687 } 16688 if (len <= rack->r_ctl.fsb.left_to_send) 16689 rack->r_ctl.fsb.left_to_send -= len; 16690 else 16691 rack->r_ctl.fsb.left_to_send = 0; 16692 if (rack->r_ctl.fsb.left_to_send < segsiz) { 16693 rack->r_fast_output = 0; 16694 rack->r_ctl.fsb.left_to_send = 0; 16695 /* At the end of fast_output scale up the sb */ 16696 SOCKBUF_LOCK(&rack->rc_inp->inp_socket->so_snd); 16697 rack_sndbuf_autoscale(rack); 16698 SOCKBUF_UNLOCK(&rack->rc_inp->inp_socket->so_snd); 16699 } 16700 if (tp->t_rtttime == 0) { 16701 tp->t_rtttime = ticks; 16702 tp->t_rtseq = startseq; 16703 KMOD_TCPSTAT_INC(tcps_segstimed); 16704 } 16705 if ((rack->r_ctl.fsb.left_to_send >= segsiz) && 16706 (max_val > len) && 16707 (tso == 0)) { 16708 max_val -= len; 16709 len = segsiz; 16710 th = rack->r_ctl.fsb.th; 16711 cnt_thru++; 16712 goto again; 16713 } 16714 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 16715 counter_u64_add(rack_fto_send, 1); 16716 slot = rack_get_pacing_delay(rack, tp, tot_len, NULL, segsiz); 16717 
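	/*
	 * Arm the hpts with the pacing slot computed above so the next
	 * fast-output burst is clocked out after the proper delay.
	 */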
rack_start_hpts_timer(rack, tp, cts, slot, tot_len, 0); 16718 #ifdef TCP_ACCOUNTING 16719 crtsc = get_cyclecount(); 16720 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16721 tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru; 16722 } 16723 counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], cnt_thru); 16724 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16725 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); 16726 } 16727 counter_u64_add(tcp_proc_time[SND_OUT_DATA], (crtsc - ts_val)); 16728 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16729 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len + segsiz - 1) / segsiz); 16730 } 16731 counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((tot_len + segsiz - 1) / segsiz)); 16732 sched_unpin(); 16733 #endif 16734 return (0); 16735 failed: 16736 if (m) 16737 m_free(m); 16738 rack->r_fast_output = 0; 16739 return (-1); 16740 } 16741 16742 static int 16743 rack_output(struct tcpcb *tp) 16744 { 16745 struct socket *so; 16746 uint32_t recwin; 16747 uint32_t sb_offset, s_moff = 0; 16748 int32_t len, flags, error = 0; 16749 struct mbuf *m, *s_mb = NULL; 16750 struct mbuf *mb; 16751 uint32_t if_hw_tsomaxsegcount = 0; 16752 uint32_t if_hw_tsomaxsegsize; 16753 int32_t segsiz, minseg; 16754 long tot_len_this_send = 0; 16755 #ifdef INET 16756 struct ip *ip = NULL; 16757 #endif 16758 #ifdef TCPDEBUG 16759 struct ipovly *ipov = NULL; 16760 #endif 16761 struct udphdr *udp = NULL; 16762 struct tcp_rack *rack; 16763 struct tcphdr *th; 16764 uint8_t pass = 0; 16765 uint8_t mark = 0; 16766 uint8_t wanted_cookie = 0; 16767 u_char opt[TCP_MAXOLEN]; 16768 unsigned ipoptlen, optlen, hdrlen, ulen=0; 16769 uint32_t rack_seq; 16770 16771 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 16772 unsigned ipsec_optlen = 0; 16773 16774 #endif 16775 int32_t idle, sendalot; 16776 int32_t sub_from_prr = 0; 16777 volatile int32_t sack_rxmit; 16778 struct rack_sendmap *rsm = NULL; 16779 int32_t tso, mtu; 16780 struct tcpopt to; 16781 int32_t slot = 0; 16782 int32_t sup_rack = 0; 16783 uint32_t cts, ms_cts, delayed, early; 16784 uint16_t add_flag = RACK_SENT_SP; 16785 /* The doing_tlp flag will be set by the actual rack_timeout_tlp() */ 16786 uint8_t hpts_calling, doing_tlp = 0; 16787 uint32_t cwnd_to_use, pace_max_seg; 16788 int32_t do_a_prefetch = 0; 16789 int32_t prefetch_rsm = 0; 16790 int32_t orig_len = 0; 16791 struct timeval tv; 16792 int32_t prefetch_so_done = 0; 16793 struct tcp_log_buffer *lgb; 16794 struct inpcb *inp; 16795 struct sockbuf *sb; 16796 uint64_t ts_val = 0; 16797 #ifdef TCP_ACCOUNTING 16798 uint64_t crtsc; 16799 #endif 16800 #ifdef INET6 16801 struct ip6_hdr *ip6 = NULL; 16802 int32_t isipv6; 16803 #endif 16804 uint8_t filled_all = 0; 16805 bool hw_tls = false; 16806 16807 /* setup and take the cache hits here */ 16808 rack = (struct tcp_rack *)tp->t_fb_ptr; 16809 #ifdef TCP_ACCOUNTING 16810 sched_pin(); 16811 ts_val = get_cyclecount(); 16812 #endif 16813 hpts_calling = rack->rc_inp->inp_hpts_calls; 16814 NET_EPOCH_ASSERT(); 16815 INP_WLOCK_ASSERT(rack->rc_inp); 16816 #ifdef TCP_OFFLOAD 16817 if (tp->t_flags & TF_TOE) { 16818 #ifdef TCP_ACCOUNTING 16819 sched_unpin(); 16820 #endif 16821 return (tcp_offload_output(tp)); 16822 } 16823 #endif 16824 /* 16825 * For TFO connections in SYN_RECEIVED, only allow the initial 16826 * SYN|ACK and those sent by the retransmit timer. 
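	 * That is, once the SYN|ACK is out (snd_max > snd_una) and no
	 * retransmit is queued via rc_resend, return without sending and
	 * wait for the handshake to complete.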
16827 */ 16828 if (IS_FASTOPEN(tp->t_flags) && 16829 (tp->t_state == TCPS_SYN_RECEIVED) && 16830 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */ 16831 (rack->r_ctl.rc_resend == NULL)) { /* not a retransmit */ 16832 #ifdef TCP_ACCOUNTING 16833 sched_unpin(); 16834 #endif 16835 return (0); 16836 } 16837 #ifdef INET6 16838 if (rack->r_state) { 16839 /* Use the cache line loaded if possible */ 16840 isipv6 = rack->r_is_v6; 16841 } else { 16842 isipv6 = (rack->rc_inp->inp_vflag & INP_IPV6) != 0; 16843 } 16844 #endif 16845 early = 0; 16846 cts = tcp_get_usecs(&tv); 16847 ms_cts = tcp_tv_to_mssectick(&tv); 16848 if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && 16849 tcp_in_hpts(rack->rc_inp)) { 16850 /* 16851 * We are on the hpts for some timer but not hptsi output. 16852 * Remove from the hpts unconditionally. 16853 */ 16854 rack_timer_cancel(tp, rack, cts, __LINE__); 16855 } 16856 /* Are we pacing and late? */ 16857 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 16858 TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) { 16859 /* We are delayed */ 16860 delayed = cts - rack->r_ctl.rc_last_output_to; 16861 } else { 16862 delayed = 0; 16863 } 16864 /* Do the timers, which may override the pacer */ 16865 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 16866 int retval; 16867 16868 retval = rack_process_timers(tp, rack, cts, hpts_calling, 16869 &doing_tlp); 16870 if (retval != 0) { 16871 counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); 16872 #ifdef TCP_ACCOUNTING 16873 sched_unpin(); 16874 #endif 16875 /* 16876 * If timers want tcp_drop(), then pass error out, 16877 * otherwise suppress it. 16878 */ 16879 return (retval < 0 ? retval : 0); 16880 } 16881 } 16882 if (rack->rc_in_persist) { 16883 if (tcp_in_hpts(rack->rc_inp) == 0) { 16884 /* Timer is not running */ 16885 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 16886 } 16887 #ifdef TCP_ACCOUNTING 16888 sched_unpin(); 16889 #endif 16890 return (0); 16891 } 16892 if ((rack->r_timer_override) || 16893 (rack->rc_ack_can_sendout_data) || 16894 (delayed) || 16895 (tp->t_state < TCPS_ESTABLISHED)) { 16896 rack->rc_ack_can_sendout_data = 0; 16897 if (tcp_in_hpts(rack->rc_inp)) 16898 tcp_hpts_remove(rack->rc_inp); 16899 } else if (tcp_in_hpts(rack->rc_inp)) { 16900 /* 16901 * On the hpts you can't pass even if ACKNOW is on, we will 16902 * when the hpts fires. 
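	 * The send is simply deferred; the hpts callback will re-enter the
	 * output path, and (with TCP_ACCOUNTING) the blocked time is charged
	 * to SND_BLOCKED below.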
16903 */ 16904 #ifdef TCP_ACCOUNTING 16905 crtsc = get_cyclecount(); 16906 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16907 tp->tcp_proc_time[SND_BLOCKED] += (crtsc - ts_val); 16908 } 16909 counter_u64_add(tcp_proc_time[SND_BLOCKED], (crtsc - ts_val)); 16910 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16911 tp->tcp_cnt_counters[SND_BLOCKED]++; 16912 } 16913 counter_u64_add(tcp_cnt_counters[SND_BLOCKED], 1); 16914 sched_unpin(); 16915 #endif 16916 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1); 16917 return (0); 16918 } 16919 rack->rc_inp->inp_hpts_calls = 0; 16920 /* Finish out both pacing early and late accounting */ 16921 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 16922 TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) { 16923 early = rack->r_ctl.rc_last_output_to - cts; 16924 } else 16925 early = 0; 16926 if (delayed) { 16927 rack->r_ctl.rc_agg_delayed += delayed; 16928 rack->r_late = 1; 16929 } else if (early) { 16930 rack->r_ctl.rc_agg_early += early; 16931 rack->r_early = 1; 16932 } 16933 /* Now that early/late accounting is done turn off the flag */ 16934 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 16935 rack->r_wanted_output = 0; 16936 rack->r_timer_override = 0; 16937 if ((tp->t_state != rack->r_state) && 16938 TCPS_HAVEESTABLISHED(tp->t_state)) { 16939 rack_set_state(tp, rack); 16940 } 16941 if ((rack->r_fast_output) && 16942 (doing_tlp == 0) && 16943 (tp->rcv_numsacks == 0)) { 16944 int ret; 16945 16946 error = 0; 16947 ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error); 16948 if (ret >= 0) 16949 return(ret); 16950 else if (error) { 16951 inp = rack->rc_inp; 16952 so = inp->inp_socket; 16953 sb = &so->so_snd; 16954 goto nomore; 16955 } 16956 } 16957 inp = rack->rc_inp; 16958 /* 16959 * For TFO connections in SYN_SENT or SYN_RECEIVED, 16960 * only allow the initial SYN or SYN|ACK and those sent 16961 * by the retransmit timer. 16962 */ 16963 if (IS_FASTOPEN(tp->t_flags) && 16964 ((tp->t_state == TCPS_SYN_RECEIVED) || 16965 (tp->t_state == TCPS_SYN_SENT)) && 16966 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ 16967 (tp->t_rxtshift == 0)) { /* not a retransmit */ 16968 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 16969 so = inp->inp_socket; 16970 sb = &so->so_snd; 16971 goto just_return_nolock; 16972 } 16973 /* 16974 * Determine length of data that should be transmitted, and flags 16975 * that will be used. If there is some data or critical controls 16976 * (SYN, RST) to send, then transmit; otherwise, investigate 16977 * further. 
16978 */ 16979 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); 16980 if (tp->t_idle_reduce) { 16981 if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) 16982 rack_cc_after_idle(rack, tp); 16983 } 16984 tp->t_flags &= ~TF_LASTIDLE; 16985 if (idle) { 16986 if (tp->t_flags & TF_MORETOCOME) { 16987 tp->t_flags |= TF_LASTIDLE; 16988 idle = 0; 16989 } 16990 } 16991 if ((tp->snd_una == tp->snd_max) && 16992 rack->r_ctl.rc_went_idle_time && 16993 TSTMP_GT(cts, rack->r_ctl.rc_went_idle_time)) { 16994 idle = cts - rack->r_ctl.rc_went_idle_time; 16995 if (idle > rack_min_probertt_hold) { 16996 /* Count as a probe rtt */ 16997 if (rack->in_probe_rtt == 0) { 16998 rack->r_ctl.rc_lower_rtt_us_cts = cts; 16999 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; 17000 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; 17001 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; 17002 } else { 17003 rack_exit_probertt(rack, cts); 17004 } 17005 } 17006 idle = 0; 17007 } 17008 if (rack_use_fsb && (rack->r_fsb_inited == 0) && (rack->r_state != TCPS_CLOSED)) 17009 rack_init_fsb_block(tp, rack); 17010 again: 17011 /* 17012 * If we've recently taken a timeout, snd_max will be greater than 17013 * snd_nxt. There may be SACK information that allows us to avoid 17014 * resending already delivered data. Adjust snd_nxt accordingly. 17015 */ 17016 sendalot = 0; 17017 cts = tcp_get_usecs(&tv); 17018 ms_cts = tcp_tv_to_mssectick(&tv); 17019 tso = 0; 17020 mtu = 0; 17021 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 17022 minseg = segsiz; 17023 if (rack->r_ctl.rc_pace_max_segs == 0) 17024 pace_max_seg = rack->rc_user_set_max_segs * segsiz; 17025 else 17026 pace_max_seg = rack->r_ctl.rc_pace_max_segs; 17027 sb_offset = tp->snd_max - tp->snd_una; 17028 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 17029 flags = tcp_outflags[tp->t_state]; 17030 while (rack->rc_free_cnt < rack_free_cache) { 17031 rsm = rack_alloc(rack); 17032 if (rsm == NULL) { 17033 if (inp->inp_hpts_calls) 17034 /* Retry in a ms */ 17035 slot = (1 * HPTS_USEC_IN_MSEC); 17036 so = inp->inp_socket; 17037 sb = &so->so_snd; 17038 goto just_return_nolock; 17039 } 17040 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext); 17041 rack->rc_free_cnt++; 17042 rsm = NULL; 17043 } 17044 if (inp->inp_hpts_calls) 17045 inp->inp_hpts_calls = 0; 17046 sack_rxmit = 0; 17047 len = 0; 17048 rsm = NULL; 17049 if (flags & TH_RST) { 17050 SOCKBUF_LOCK(&inp->inp_socket->so_snd); 17051 so = inp->inp_socket; 17052 sb = &so->so_snd; 17053 goto send; 17054 } 17055 if (rack->r_ctl.rc_resend) { 17056 /* Retransmit timer */ 17057 rsm = rack->r_ctl.rc_resend; 17058 rack->r_ctl.rc_resend = NULL; 17059 len = rsm->r_end - rsm->r_start; 17060 sack_rxmit = 1; 17061 sendalot = 0; 17062 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 17063 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 17064 __func__, __LINE__, 17065 rsm->r_start, tp->snd_una, tp, rack, rsm)); 17066 sb_offset = rsm->r_start - tp->snd_una; 17067 if (len >= segsiz) 17068 len = segsiz; 17069 } else if ((rsm = tcp_rack_output(tp, rack, cts)) != NULL) { 17070 /* We have a retransmit that takes precedence */ 17071 if ((!IN_FASTRECOVERY(tp->t_flags)) && 17072 ((tp->t_flags & TF_WASFRECOVERY) == 0)) { 17073 /* Enter recovery if not induced by a time-out */ 17074 rack->r_ctl.rc_rsm_start = rsm->r_start; 17075 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 17076 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 17077 
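	/*
	 * The cwnd/ssthresh values at recovery entry were captured just
	 * above; now signal a dupack-style congestion event so the CC
	 * module transitions into fast recovery.
	 */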
rack_cong_signal(tp, CC_NDUPACK, tp->snd_una); 17078 } 17079 #ifdef INVARIANTS 17080 if (SEQ_LT(rsm->r_start, tp->snd_una)) { 17081 panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n", 17082 tp, rack, rsm, rsm->r_start, tp->snd_una); 17083 } 17084 #endif 17085 len = rsm->r_end - rsm->r_start; 17086 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 17087 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 17088 __func__, __LINE__, 17089 rsm->r_start, tp->snd_una, tp, rack, rsm)); 17090 sb_offset = rsm->r_start - tp->snd_una; 17091 sendalot = 0; 17092 if (len >= segsiz) 17093 len = segsiz; 17094 if (len > 0) { 17095 sack_rxmit = 1; 17096 KMOD_TCPSTAT_INC(tcps_sack_rexmits); 17097 KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes, 17098 min(len, segsiz)); 17099 counter_u64_add(rack_rtm_prr_retran, 1); 17100 } 17101 } else if (rack->r_ctl.rc_tlpsend) { 17102 /* Tail loss probe */ 17103 long cwin; 17104 long tlen; 17105 17106 /* 17107 * Check if we can do a TLP with a RACK'd packet 17108 * this can happen if we are not doing the rack 17109 * cheat and we skipped to a TLP and it 17110 * went off. 17111 */ 17112 rsm = rack->r_ctl.rc_tlpsend; 17113 /* We are doing a TLP make sure the flag is preent */ 17114 rsm->r_flags |= RACK_TLP; 17115 rack->r_ctl.rc_tlpsend = NULL; 17116 sack_rxmit = 1; 17117 tlen = rsm->r_end - rsm->r_start; 17118 if (tlen > segsiz) 17119 tlen = segsiz; 17120 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 17121 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 17122 __func__, __LINE__, 17123 rsm->r_start, tp->snd_una, tp, rack, rsm)); 17124 sb_offset = rsm->r_start - tp->snd_una; 17125 cwin = min(tp->snd_wnd, tlen); 17126 len = cwin; 17127 } 17128 if (rack->r_must_retran && 17129 (doing_tlp == 0) && 17130 (rsm == NULL)) { 17131 /* 17132 * Non-Sack and we had a RTO or Sack/non-Sack and a 17133 * MTU change, we need to retransmit until we reach 17134 * the former snd_max (rack->r_ctl.rc_snd_max_at_rto). 17135 */ 17136 if (SEQ_GT(tp->snd_max, tp->snd_una)) { 17137 int sendwin, flight; 17138 17139 sendwin = min(tp->snd_wnd, tp->snd_cwnd); 17140 flight = ctf_flight_size(tp, rack->r_ctl.rc_out_at_rto); 17141 if (flight >= sendwin) { 17142 so = inp->inp_socket; 17143 sb = &so->so_snd; 17144 goto just_return_nolock; 17145 } 17146 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 17147 if (rsm == NULL) { 17148 /* TSNH */ 17149 rack->r_must_retran = 0; 17150 rack->r_ctl.rc_out_at_rto = 0; 17151 rack->r_must_retran = 0; 17152 so = inp->inp_socket; 17153 sb = &so->so_snd; 17154 goto just_return_nolock; 17155 } 17156 if ((rsm->r_flags & RACK_MUST_RXT) == 0) { 17157 /* It does not have the flag, we are done */ 17158 rack->r_must_retran = 0; 17159 rack->r_ctl.rc_out_at_rto = 0; 17160 } else { 17161 sack_rxmit = 1; 17162 len = rsm->r_end - rsm->r_start; 17163 sendalot = 0; 17164 sb_offset = rsm->r_start - tp->snd_una; 17165 if (len >= segsiz) 17166 len = segsiz; 17167 /* 17168 * Delay removing the flag RACK_MUST_RXT so 17169 * that the fastpath for retransmit will 17170 * work with this rsm. 17171 */ 17172 17173 } 17174 } else { 17175 /* We must be done if there is nothing outstanding */ 17176 rack->r_must_retran = 0; 17177 rack->r_ctl.rc_out_at_rto = 0; 17178 } 17179 } 17180 /* 17181 * Enforce a connection sendmap count limit if set 17182 * as long as we are not retransmiting. 
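	 * (e.g. once rc_num_maps_alloced reaches V_tcp_map_entries_limit we
	 * just return without sending new data and bump the
	 * rack_to_alloc_limited counter; retransmits are exempt since they
	 * do not need a new sendmap entry.)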
17183 */ 17184 if ((rsm == NULL) && 17185 (rack->do_detection == 0) && 17186 (V_tcp_map_entries_limit > 0) && 17187 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 17188 counter_u64_add(rack_to_alloc_limited, 1); 17189 if (!rack->alloc_limit_reported) { 17190 rack->alloc_limit_reported = 1; 17191 counter_u64_add(rack_alloc_limited_conns, 1); 17192 } 17193 so = inp->inp_socket; 17194 sb = &so->so_snd; 17195 goto just_return_nolock; 17196 } 17197 if (rsm && (rsm->r_flags & RACK_HAS_FIN)) { 17198 /* we are retransmitting the fin */ 17199 len--; 17200 if (len) { 17201 /* 17202 * When retransmitting data do *not* include the 17203 * FIN. This could happen from a TLP probe. 17204 */ 17205 flags &= ~TH_FIN; 17206 } 17207 } 17208 #ifdef INVARIANTS 17209 /* For debugging */ 17210 rack->r_ctl.rc_rsm_at_retran = rsm; 17211 #endif 17212 if (rsm && rack->r_fsb_inited && rack_use_rsm_rfo && 17213 ((rsm->r_flags & RACK_HAS_FIN) == 0)) { 17214 int ret; 17215 17216 ret = rack_fast_rsm_output(tp, rack, rsm, ts_val, cts, ms_cts, &tv, len, doing_tlp); 17217 if (ret == 0) 17218 return (0); 17219 } 17220 if (rsm && (rsm->r_flags & RACK_MUST_RXT)) { 17221 /* 17222 * Clear the flag in prep for the send 17223 * note that if we can't get an mbuf 17224 * and fail, we won't retransmit this 17225 * rsm but that should be ok (its rare). 17226 */ 17227 rsm->r_flags &= ~RACK_MUST_RXT; 17228 } 17229 so = inp->inp_socket; 17230 sb = &so->so_snd; 17231 if (do_a_prefetch == 0) { 17232 kern_prefetch(sb, &do_a_prefetch); 17233 do_a_prefetch = 1; 17234 } 17235 #ifdef NETFLIX_SHARED_CWND 17236 if ((tp->t_flags2 & TF2_TCP_SCWND_ALLOWED) && 17237 rack->rack_enable_scwnd) { 17238 /* We are doing cwnd sharing */ 17239 if (rack->gp_ready && 17240 (rack->rack_attempted_scwnd == 0) && 17241 (rack->r_ctl.rc_scw == NULL) && 17242 tp->t_lib) { 17243 /* The pcbid is in, lets make an attempt */ 17244 counter_u64_add(rack_try_scwnd, 1); 17245 rack->rack_attempted_scwnd = 1; 17246 rack->r_ctl.rc_scw = tcp_shared_cwnd_alloc(tp, 17247 &rack->r_ctl.rc_scw_index, 17248 segsiz); 17249 } 17250 if (rack->r_ctl.rc_scw && 17251 (rack->rack_scwnd_is_idle == 1) && 17252 sbavail(&so->so_snd)) { 17253 /* we are no longer out of data */ 17254 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 17255 rack->rack_scwnd_is_idle = 0; 17256 } 17257 if (rack->r_ctl.rc_scw) { 17258 /* First lets update and get the cwnd */ 17259 rack->r_ctl.cwnd_to_use = cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw, 17260 rack->r_ctl.rc_scw_index, 17261 tp->snd_cwnd, tp->snd_wnd, segsiz); 17262 } 17263 } 17264 #endif 17265 /* 17266 * Get standard flags, and add SYN or FIN if requested by 'hidden' 17267 * state flags. 17268 */ 17269 if (tp->t_flags & TF_NEEDFIN) 17270 flags |= TH_FIN; 17271 if (tp->t_flags & TF_NEEDSYN) 17272 flags |= TH_SYN; 17273 if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { 17274 void *end_rsm; 17275 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 17276 if (end_rsm) 17277 kern_prefetch(end_rsm, &prefetch_rsm); 17278 prefetch_rsm = 1; 17279 } 17280 SOCKBUF_LOCK(sb); 17281 /* 17282 * If snd_nxt == snd_max and we have transmitted a FIN, the 17283 * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a 17284 * negative length. This can also occur when TCP opens up its 17285 * congestion window while receiving additional duplicate acks after 17286 * fast-retransmit because TCP will reset snd_nxt to snd_max after 17287 * the fast-retransmit. 
17288 * 17289 * In the normal retransmit-FIN-only case, however, snd_nxt will be 17290 * set to snd_una, the sb_offset will be 0, and the length may wind 17291 * up 0. 17292 * 17293 * If sack_rxmit is true we are retransmitting from the scoreboard 17294 * in which case len is already set. 17295 */ 17296 if ((sack_rxmit == 0) && 17297 (TCPS_HAVEESTABLISHED(tp->t_state) || IS_FASTOPEN(tp->t_flags))) { 17298 uint32_t avail; 17299 17300 avail = sbavail(sb); 17301 if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail) 17302 sb_offset = tp->snd_nxt - tp->snd_una; 17303 else 17304 sb_offset = 0; 17305 if ((IN_FASTRECOVERY(tp->t_flags) == 0) || rack->rack_no_prr) { 17306 if (rack->r_ctl.rc_tlp_new_data) { 17307 /* TLP is forcing out new data */ 17308 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) { 17309 rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset); 17310 } 17311 if ((rack->r_ctl.rc_tlp_new_data + sb_offset) > tp->snd_wnd) { 17312 if (tp->snd_wnd > sb_offset) 17313 len = tp->snd_wnd - sb_offset; 17314 else 17315 len = 0; 17316 } else { 17317 len = rack->r_ctl.rc_tlp_new_data; 17318 } 17319 rack->r_ctl.rc_tlp_new_data = 0; 17320 } else { 17321 len = rack_what_can_we_send(tp, rack, cwnd_to_use, avail, sb_offset); 17322 } 17323 if ((rack->r_ctl.crte == NULL) && IN_FASTRECOVERY(tp->t_flags) && (len > segsiz)) { 17324 /* 17325 * For prr=off, we need to send only 1 MSS 17326 * at a time. We do this because another sack could 17327 * be arriving that causes us to send retransmits and 17328 * we don't want to be on a long pace due to a larger send 17329 * that keeps us from sending out the retransmit. 17330 */ 17331 len = segsiz; 17332 } 17333 } else { 17334 uint32_t outstanding; 17335 /* 17336 * We are inside of a Fast recovery episode, this 17337 * is caused by a SACK or 3 dup acks. At this point 17338 * we have sent all the retransmissions and we rely 17339 * on PRR to dictate what we will send in the form of 17340 * new data. 17341 */ 17342 17343 outstanding = tp->snd_max - tp->snd_una; 17344 if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) { 17345 if (tp->snd_wnd > outstanding) { 17346 len = tp->snd_wnd - outstanding; 17347 /* Check to see if we have the data */ 17348 if ((sb_offset + len) > avail) { 17349 /* It does not all fit */ 17350 if (avail > sb_offset) 17351 len = avail - sb_offset; 17352 else 17353 len = 0; 17354 } 17355 } else { 17356 len = 0; 17357 } 17358 } else if (avail > sb_offset) { 17359 len = avail - sb_offset; 17360 } else { 17361 len = 0; 17362 } 17363 if (len > 0) { 17364 if (len > rack->r_ctl.rc_prr_sndcnt) { 17365 len = rack->r_ctl.rc_prr_sndcnt; 17366 } 17367 if (len > 0) { 17368 sub_from_prr = 1; 17369 counter_u64_add(rack_rtm_prr_newdata, 1); 17370 } 17371 } 17372 if (len > segsiz) { 17373 /* 17374 * We should never send more than a MSS when 17375 * retransmitting or sending new data in prr 17376 * mode unless the override flag is on. Most 17377 * likely the PRR algorithm is not going to 17378 * let us send a lot as well :-) 17379 */ 17380 if (rack->r_ctl.rc_prr_sendalot == 0) { 17381 len = segsiz; 17382 } 17383 } else if (len < segsiz) { 17384 /* 17385 * Do we send any? The idea here is if the 17386 * send empty's the socket buffer we want to 17387 * do it. However if not then lets just wait 17388 * for our prr_sndcnt to get bigger. 
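	 * (Concretely: if the bytes left in the socket buffer exceed what
	 * PRR currently allows, len is zeroed below and we wait for more
	 * prr_sndcnt credit from incoming ACKs.)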
17389 */ 17390 long leftinsb; 17391 17392 leftinsb = sbavail(sb) - sb_offset; 17393 if (leftinsb > len) { 17394 /* This send does not empty the sb */ 17395 len = 0; 17396 } 17397 } 17398 } 17399 } else if (!TCPS_HAVEESTABLISHED(tp->t_state)) { 17400 /* 17401 * If you have not established 17402 * and are not doing FAST OPEN 17403 * no data please. 17404 */ 17405 if ((sack_rxmit == 0) && 17406 (!IS_FASTOPEN(tp->t_flags))){ 17407 len = 0; 17408 sb_offset = 0; 17409 } 17410 } 17411 if (prefetch_so_done == 0) { 17412 kern_prefetch(so, &prefetch_so_done); 17413 prefetch_so_done = 1; 17414 } 17415 /* 17416 * Lop off SYN bit if it has already been sent. However, if this is 17417 * SYN-SENT state and if segment contains data and if we don't know 17418 * that foreign host supports TAO, suppress sending segment. 17419 */ 17420 if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) && 17421 ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) { 17422 /* 17423 * When sending additional segments following a TFO SYN|ACK, 17424 * do not include the SYN bit. 17425 */ 17426 if (IS_FASTOPEN(tp->t_flags) && 17427 (tp->t_state == TCPS_SYN_RECEIVED)) 17428 flags &= ~TH_SYN; 17429 } 17430 /* 17431 * Be careful not to send data and/or FIN on SYN segments. This 17432 * measure is needed to prevent interoperability problems with not 17433 * fully conformant TCP implementations. 17434 */ 17435 if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { 17436 len = 0; 17437 flags &= ~TH_FIN; 17438 } 17439 /* 17440 * On TFO sockets, ensure no data is sent in the following cases: 17441 * 17442 * - When retransmitting SYN|ACK on a passively-created socket 17443 * 17444 * - When retransmitting SYN on an actively created socket 17445 * 17446 * - When sending a zero-length cookie (cookie request) on an 17447 * actively created socket 17448 * 17449 * - When the socket is in the CLOSED state (RST is being sent) 17450 */ 17451 if (IS_FASTOPEN(tp->t_flags) && 17452 (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || 17453 ((tp->t_state == TCPS_SYN_SENT) && 17454 (tp->t_tfo_client_cookie_len == 0)) || 17455 (flags & TH_RST))) { 17456 sack_rxmit = 0; 17457 len = 0; 17458 } 17459 /* Without fast-open there should never be data sent on a SYN */ 17460 if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) { 17461 tp->snd_nxt = tp->iss; 17462 len = 0; 17463 } 17464 if ((len > segsiz) && (tcp_dsack_block_exists(tp))) { 17465 /* We only send 1 MSS if we have a DSACK block */ 17466 add_flag |= RACK_SENT_W_DSACK; 17467 len = segsiz; 17468 } 17469 orig_len = len; 17470 if (len <= 0) { 17471 /* 17472 * If FIN has been sent but not acked, but we haven't been 17473 * called to retransmit, len will be < 0. Otherwise, window 17474 * shrank after we sent into it. If window shrank to 0, 17475 * cancel pending retransmit, pull snd_nxt back to (closed) 17476 * window, and set the persist timer if it isn't already 17477 * going. If the window didn't close completely, just wait 17478 * for an ACK. 17479 * 17480 * We also do a general check here to ensure that we will 17481 * set the persist timer when we have data to send, but a 17482 * 0-byte window. This makes sure the persist timer is set 17483 * even if the packet hits one of the "goto send" lines 17484 * below. 
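	 * Example: snd_wnd == 0, everything is acked (snd_una == snd_max)
	 * and data is still queued beyond sb_offset -- rack_enter_persist()
	 * below arms the persist timer instead of transmitting.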
17485 */ 17486 len = 0; 17487 if ((tp->snd_wnd == 0) && 17488 (TCPS_HAVEESTABLISHED(tp->t_state)) && 17489 (tp->snd_una == tp->snd_max) && 17490 (sb_offset < (int)sbavail(sb))) { 17491 rack_enter_persist(tp, rack, cts); 17492 } 17493 } else if ((rsm == NULL) && 17494 (doing_tlp == 0) && 17495 (len < pace_max_seg)) { 17496 /* 17497 * We are not sending a maximum sized segment for 17498 * some reason. Should we not send anything (think 17499 * sws or persists)? 17500 */ 17501 if ((tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg)) && 17502 (TCPS_HAVEESTABLISHED(tp->t_state)) && 17503 (len < minseg) && 17504 (len < (int)(sbavail(sb) - sb_offset))) { 17505 /* 17506 * Here the rwnd is less than 17507 * the minimum pacing size, this is not a retransmit, 17508 * we are established and 17509 * the send is not the last in the socket buffer 17510 * we send nothing, and we may enter persists 17511 * if nothing is outstanding. 17512 */ 17513 len = 0; 17514 if (tp->snd_max == tp->snd_una) { 17515 /* 17516 * Nothing out we can 17517 * go into persists. 17518 */ 17519 rack_enter_persist(tp, rack, cts); 17520 } 17521 } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) && 17522 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && 17523 (len < (int)(sbavail(sb) - sb_offset)) && 17524 (len < minseg)) { 17525 /* 17526 * Here we are not retransmitting, and 17527 * the cwnd is not so small that we could 17528 * not send at least a min size (rxt timer 17529 * not having gone off), We have 2 segments or 17530 * more already in flight, its not the tail end 17531 * of the socket buffer and the cwnd is blocking 17532 * us from sending out a minimum pacing segment size. 17533 * Lets not send anything. 17534 */ 17535 len = 0; 17536 } else if (((tp->snd_wnd - ctf_outstanding(tp)) < 17537 min((rack->r_ctl.rc_high_rwnd/2), minseg)) && 17538 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && 17539 (len < (int)(sbavail(sb) - sb_offset)) && 17540 (TCPS_HAVEESTABLISHED(tp->t_state))) { 17541 /* 17542 * Here we have a send window but we have 17543 * filled it up and we can't send another pacing segment. 17544 * We also have in flight more than 2 segments 17545 * and we are not completing the sb i.e. we allow 17546 * the last bytes of the sb to go out even if 17547 * its not a full pacing segment. 17548 */ 17549 len = 0; 17550 } else if ((rack->r_ctl.crte != NULL) && 17551 (tp->snd_wnd >= (pace_max_seg * max(1, rack_hw_rwnd_factor))) && 17552 (cwnd_to_use >= (pace_max_seg + (4 * segsiz))) && 17553 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) >= (2 * segsiz)) && 17554 (len < (int)(sbavail(sb) - sb_offset))) { 17555 /* 17556 * Here we are doing hardware pacing, this is not a TLP, 17557 * we are not sending a pace max segment size, there is rwnd 17558 * room to send at least N pace_max_seg, the cwnd is greater 17559 * than or equal to a full pacing segments plus 4 mss and we have 2 or 17560 * more segments in flight and its not the tail of the socket buffer. 17561 * 17562 * We don't want to send instead we need to get more ack's in to 17563 * allow us to send a full pacing segment. Normally, if we are pacing 17564 * about the right speed, we should have finished our pacing 17565 * send as most of the acks have come back if we are at the 17566 * right rate. This is a bit fuzzy since return path delay 17567 * can delay the acks, which is why we want to make sure we 17568 * have cwnd space to have a bit more than a max pace segments in flight. 
17569 * 17570 * If we have not gotten our acks back we are pacing at too high a 17571 * rate delaying will not hurt and will bring our GP estimate down by 17572 * injecting the delay. If we don't do this we will send 17573 * 2 MSS out in response to the acks being clocked in which 17574 * defeats the point of hw-pacing (i.e. to help us get 17575 * larger TSO's out). 17576 */ 17577 len = 0; 17578 17579 } 17580 17581 } 17582 /* len will be >= 0 after this point. */ 17583 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 17584 rack_sndbuf_autoscale(rack); 17585 /* 17586 * Decide if we can use TCP Segmentation Offloading (if supported by 17587 * hardware). 17588 * 17589 * TSO may only be used if we are in a pure bulk sending state. The 17590 * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP 17591 * options prevent using TSO. With TSO the TCP header is the same 17592 * (except for the sequence number) for all generated packets. This 17593 * makes it impossible to transmit any options which vary per 17594 * generated segment or packet. 17595 * 17596 * IPv4 handling has a clear separation of ip options and ip header 17597 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does 17598 * the right thing below to provide length of just ip options and thus 17599 * checking for ipoptlen is enough to decide if ip options are present. 17600 */ 17601 ipoptlen = 0; 17602 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 17603 /* 17604 * Pre-calculate here as we save another lookup into the darknesses 17605 * of IPsec that way and can actually decide if TSO is ok. 17606 */ 17607 #ifdef INET6 17608 if (isipv6 && IPSEC_ENABLED(ipv6)) 17609 ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb); 17610 #ifdef INET 17611 else 17612 #endif 17613 #endif /* INET6 */ 17614 #ifdef INET 17615 if (IPSEC_ENABLED(ipv4)) 17616 ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb); 17617 #endif /* INET */ 17618 #endif 17619 17620 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 17621 ipoptlen += ipsec_optlen; 17622 #endif 17623 if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > segsiz && 17624 (tp->t_port == 0) && 17625 ((tp->t_flags & TF_SIGNATURE) == 0) && 17626 tp->rcv_numsacks == 0 && sack_rxmit == 0 && 17627 ipoptlen == 0) 17628 tso = 1; 17629 { 17630 uint32_t outstanding; 17631 17632 outstanding = tp->snd_max - tp->snd_una; 17633 if (tp->t_flags & TF_SENTFIN) { 17634 /* 17635 * If we sent a fin, snd_max is 1 higher than 17636 * snd_una 17637 */ 17638 outstanding--; 17639 } 17640 if (sack_rxmit) { 17641 if ((rsm->r_flags & RACK_HAS_FIN) == 0) 17642 flags &= ~TH_FIN; 17643 } else { 17644 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + 17645 sbused(sb))) 17646 flags &= ~TH_FIN; 17647 } 17648 } 17649 recwin = lmin(lmax(sbspace(&so->so_rcv), 0), 17650 (long)TCP_MAXWIN << tp->rcv_scale); 17651 17652 /* 17653 * Sender silly window avoidance. We transmit under the following 17654 * conditions when len is non-zero: 17655 * 17656 * - We have a full segment (or more with TSO) - This is the last 17657 * buffer in a write()/send() and we are either idle or running 17658 * NODELAY - we've timed out (e.g. persist timer) - we have more 17659 * then 1/2 the maximum send window's worth of data (receiver may be 17660 * limited the window size) - we need to retransmit 17661 */ 17662 if (len) { 17663 if (len >= segsiz) { 17664 goto send; 17665 } 17666 /* 17667 * NOTE! 
on localhost connections an 'ack' from the remote 17668 * end may occur synchronously with the output and cause us 17669 * to flush a buffer queued with moretocome. XXX 17670 * 17671 */ 17672 if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ 17673 (idle || (tp->t_flags & TF_NODELAY)) && 17674 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && 17675 (tp->t_flags & TF_NOPUSH) == 0) { 17676 pass = 2; 17677 goto send; 17678 } 17679 if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ 17680 pass = 22; 17681 goto send; 17682 } 17683 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { 17684 pass = 4; 17685 goto send; 17686 } 17687 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */ 17688 pass = 5; 17689 goto send; 17690 } 17691 if (sack_rxmit) { 17692 pass = 6; 17693 goto send; 17694 } 17695 if (((tp->snd_wnd - ctf_outstanding(tp)) < segsiz) && 17696 (ctf_outstanding(tp) < (segsiz * 2))) { 17697 /* 17698 * We have less than two MSS outstanding (delayed ack) 17699 * and our rwnd will not let us send a full sized 17700 * MSS. Lets go ahead and let this small segment 17701 * out because we want to try to have at least two 17702 * packets inflight to not be caught by delayed ack. 17703 */ 17704 pass = 12; 17705 goto send; 17706 } 17707 } 17708 /* 17709 * Sending of standalone window updates. 17710 * 17711 * Window updates are important when we close our window due to a 17712 * full socket buffer and are opening it again after the application 17713 * reads data from it. Once the window has opened again and the 17714 * remote end starts to send again the ACK clock takes over and 17715 * provides the most current window information. 17716 * 17717 * We must avoid the silly window syndrome whereas every read from 17718 * the receive buffer, no matter how small, causes a window update 17719 * to be sent. We also should avoid sending a flurry of window 17720 * updates when the socket buffer had queued a lot of data and the 17721 * application is doing small reads. 17722 * 17723 * Prevent a flurry of pointless window updates by only sending an 17724 * update when we can increase the advertized window by more than 17725 * 1/4th of the socket buffer capacity. When the buffer is getting 17726 * full or is very small be more aggressive and send an update 17727 * whenever we can increase by two mss sized segments. In all other 17728 * situations the ACK's to new incoming data will carry further 17729 * window increases. 17730 * 17731 * Don't send an independent window update if a delayed ACK is 17732 * pending (it will get piggy-backed on it) or the remote side 17733 * already has done a half-close and won't send more data. Skip 17734 * this if the connection is in T/TCP half-open state. 17735 */ 17736 if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && 17737 !(tp->t_flags & TF_DELACK) && 17738 !TCPS_HAVERCVDFIN(tp->t_state)) { 17739 /* 17740 * "adv" is the amount we could increase the window, taking 17741 * into account that we are limited by TCP_MAXWIN << 17742 * tp->rcv_scale. 17743 */ 17744 int32_t adv; 17745 int oldwin; 17746 17747 adv = recwin; 17748 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { 17749 oldwin = (tp->rcv_adv - tp->rcv_nxt); 17750 if (adv > oldwin) 17751 adv -= oldwin; 17752 else { 17753 /* We can't increase the window */ 17754 adv = 0; 17755 } 17756 } else 17757 oldwin = 0; 17758 17759 /* 17760 * If the new window size ends up being the same as or less 17761 * than the old size when it is scaled, then don't force 17762 * a window update. 
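	 * Illustrative: with rcv_scale == 7 the advertised window has a
	 * granularity of 128 bytes, i.e. (adv + oldwin) >> 7 equals
	 * oldwin >> 7 for any increase under 128 bytes, so no update is sent.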
17763 */ 17764 if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale) 17765 goto dontupdate; 17766 17767 if (adv >= (int32_t)(2 * segsiz) && 17768 (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || 17769 recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) || 17770 so->so_rcv.sb_hiwat <= 8 * segsiz)) { 17771 pass = 7; 17772 goto send; 17773 } 17774 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) { 17775 pass = 23; 17776 goto send; 17777 } 17778 } 17779 dontupdate: 17780 17781 /* 17782 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW 17783 * is also a catch-all for the retransmit timer timeout case. 17784 */ 17785 if (tp->t_flags & TF_ACKNOW) { 17786 pass = 8; 17787 goto send; 17788 } 17789 if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) { 17790 pass = 9; 17791 goto send; 17792 } 17793 /* 17794 * If our state indicates that FIN should be sent and we have not 17795 * yet done so, then we need to send. 17796 */ 17797 if ((flags & TH_FIN) && 17798 (tp->snd_nxt == tp->snd_una)) { 17799 pass = 11; 17800 goto send; 17801 } 17802 /* 17803 * No reason to send a segment, just return. 17804 */ 17805 just_return: 17806 SOCKBUF_UNLOCK(sb); 17807 just_return_nolock: 17808 { 17809 int app_limited = CTF_JR_SENT_DATA; 17810 17811 if (tot_len_this_send > 0) { 17812 /* Make sure snd_nxt is up to max */ 17813 rack->r_ctl.fsb.recwin = recwin; 17814 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz); 17815 if ((error == 0) && 17816 rack_use_rfo && 17817 ((flags & (TH_SYN|TH_FIN)) == 0) && 17818 (ipoptlen == 0) && 17819 (tp->snd_nxt == tp->snd_max) && 17820 (tp->rcv_numsacks == 0) && 17821 rack->r_fsb_inited && 17822 TCPS_HAVEESTABLISHED(tp->t_state) && 17823 (rack->r_must_retran == 0) && 17824 ((tp->t_flags & TF_NEEDFIN) == 0) && 17825 (len > 0) && (orig_len > 0) && 17826 (orig_len > len) && 17827 ((orig_len - len) >= segsiz) && 17828 ((optlen == 0) || 17829 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { 17830 /* We can send at least one more MSS using our fsb */ 17831 17832 rack->r_fast_output = 1; 17833 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); 17834 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; 17835 rack->r_ctl.fsb.tcp_flags = flags; 17836 rack->r_ctl.fsb.left_to_send = orig_len - len; 17837 if (hw_tls) 17838 rack->r_ctl.fsb.hw_tls = 1; 17839 else 17840 rack->r_ctl.fsb.hw_tls = 0; 17841 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))), 17842 ("rack:%p left_to_send:%u sbavail:%u out:%u", 17843 rack, rack->r_ctl.fsb.left_to_send, sbavail(sb), 17844 (tp->snd_max - tp->snd_una))); 17845 if (rack->r_ctl.fsb.left_to_send < segsiz) 17846 rack->r_fast_output = 0; 17847 else { 17848 if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una))) 17849 rack->r_ctl.fsb.rfo_apply_push = 1; 17850 else 17851 rack->r_ctl.fsb.rfo_apply_push = 0; 17852 } 17853 } else 17854 rack->r_fast_output = 0; 17855 17856 17857 rack_log_fsb(rack, tp, so, flags, 17858 ipoptlen, orig_len, len, 0, 17859 1, optlen, __LINE__, 1); 17860 if (SEQ_GT(tp->snd_max, tp->snd_nxt)) 17861 tp->snd_nxt = tp->snd_max; 17862 } else { 17863 int end_window = 0; 17864 uint32_t seq = tp->gput_ack; 17865 17866 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 17867 if (rsm) { 17868 /* 17869 * Mark the last sent that we just-returned (hinting 17870 * that delayed ack may play a role in any rtt measurement). 
17871 */ 17872 rsm->r_just_ret = 1; 17873 } 17874 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); 17875 rack->r_ctl.rc_agg_delayed = 0; 17876 rack->r_early = 0; 17877 rack->r_late = 0; 17878 rack->r_ctl.rc_agg_early = 0; 17879 if ((ctf_outstanding(tp) + 17880 min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)), 17881 minseg)) >= tp->snd_wnd) { 17882 /* We are limited by the rwnd */ 17883 app_limited = CTF_JR_RWND_LIMITED; 17884 if (IN_FASTRECOVERY(tp->t_flags)) 17885 rack->r_ctl.rc_prr_sndcnt = 0; 17886 } else if (ctf_outstanding(tp) >= sbavail(sb)) { 17887 /* We are limited by whats available -- app limited */ 17888 app_limited = CTF_JR_APP_LIMITED; 17889 if (IN_FASTRECOVERY(tp->t_flags)) 17890 rack->r_ctl.rc_prr_sndcnt = 0; 17891 } else if ((idle == 0) && 17892 ((tp->t_flags & TF_NODELAY) == 0) && 17893 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && 17894 (len < segsiz)) { 17895 /* 17896 * No delay is not on and the 17897 * user is sending less than 1MSS. This 17898 * brings out SWS avoidance so we 17899 * don't send. Another app-limited case. 17900 */ 17901 app_limited = CTF_JR_APP_LIMITED; 17902 } else if (tp->t_flags & TF_NOPUSH) { 17903 /* 17904 * The user has requested no push of 17905 * the last segment and we are 17906 * at the last segment. Another app 17907 * limited case. 17908 */ 17909 app_limited = CTF_JR_APP_LIMITED; 17910 } else if ((ctf_outstanding(tp) + minseg) > cwnd_to_use) { 17911 /* Its the cwnd */ 17912 app_limited = CTF_JR_CWND_LIMITED; 17913 } else if (IN_FASTRECOVERY(tp->t_flags) && 17914 (rack->rack_no_prr == 0) && 17915 (rack->r_ctl.rc_prr_sndcnt < segsiz)) { 17916 app_limited = CTF_JR_PRR; 17917 } else { 17918 /* Now why here are we not sending? */ 17919 #ifdef NOW 17920 #ifdef INVARIANTS 17921 panic("rack:%p hit JR_ASSESSING case cwnd_to_use:%u?", rack, cwnd_to_use); 17922 #endif 17923 #endif 17924 app_limited = CTF_JR_ASSESSING; 17925 } 17926 /* 17927 * App limited in some fashion, for our pacing GP 17928 * measurements we don't want any gap (even cwnd). 17929 * Close down the measurement window. 17930 */ 17931 if (rack_cwnd_block_ends_measure && 17932 ((app_limited == CTF_JR_CWND_LIMITED) || 17933 (app_limited == CTF_JR_PRR))) { 17934 /* 17935 * The reason we are not sending is 17936 * the cwnd (or prr). We have been configured 17937 * to end the measurement window in 17938 * this case. 17939 */ 17940 end_window = 1; 17941 } else if (rack_rwnd_block_ends_measure && 17942 (app_limited == CTF_JR_RWND_LIMITED)) { 17943 /* 17944 * We are rwnd limited and have been 17945 * configured to end the measurement 17946 * window in this case. 17947 */ 17948 end_window = 1; 17949 } else if (app_limited == CTF_JR_APP_LIMITED) { 17950 /* 17951 * A true application limited period, we have 17952 * ran out of data. 17953 */ 17954 end_window = 1; 17955 } else if (app_limited == CTF_JR_ASSESSING) { 17956 /* 17957 * In the assessing case we hit the end of 17958 * the if/else and had no known reason 17959 * This will panic us under invariants.. 17960 * 17961 * If we get this out in logs we need to 17962 * investagate which reason we missed. 17963 */ 17964 end_window = 1; 17965 } 17966 if (end_window) { 17967 uint8_t log = 0; 17968 17969 /* Adjust the Gput measurement */ 17970 if ((tp->t_flags & TF_GPUTINPROG) && 17971 SEQ_GT(tp->gput_ack, tp->snd_max)) { 17972 tp->gput_ack = tp->snd_max; 17973 if ((tp->gput_ack - tp->gput_seq) < (MIN_GP_WIN * segsiz)) { 17974 /* 17975 * There is not enough to measure. 
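	 * (The goodput sample must span at least MIN_GP_WIN * segsiz bytes;
	 * anything shorter is abandoned and only logged.)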
17976 */ 17977 tp->t_flags &= ~TF_GPUTINPROG; 17978 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 17979 rack->r_ctl.rc_gp_srtt /*flex1*/, 17980 tp->gput_seq, 17981 0, 0, 18, __LINE__, NULL, 0); 17982 } else 17983 log = 1; 17984 } 17985 /* Mark the last packet has app limited */ 17986 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 17987 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { 17988 if (rack->r_ctl.rc_app_limited_cnt == 0) 17989 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; 17990 else { 17991 /* 17992 * Go out to the end app limited and mark 17993 * this new one as next and move the end_appl up 17994 * to this guy. 17995 */ 17996 if (rack->r_ctl.rc_end_appl) 17997 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; 17998 rack->r_ctl.rc_end_appl = rsm; 17999 } 18000 rsm->r_flags |= RACK_APP_LIMITED; 18001 rack->r_ctl.rc_app_limited_cnt++; 18002 } 18003 if (log) 18004 rack_log_pacing_delay_calc(rack, 18005 rack->r_ctl.rc_app_limited_cnt, seq, 18006 tp->gput_ack, 0, 0, 4, __LINE__, NULL, 0); 18007 } 18008 } 18009 if (slot) { 18010 /* set the rack tcb into the slot N */ 18011 counter_u64_add(rack_paced_segments, 1); 18012 } else if (tot_len_this_send) { 18013 counter_u64_add(rack_unpaced_segments, 1); 18014 } 18015 /* Check if we need to go into persists or not */ 18016 if ((tp->snd_max == tp->snd_una) && 18017 TCPS_HAVEESTABLISHED(tp->t_state) && 18018 sbavail(sb) && 18019 (sbavail(sb) > tp->snd_wnd) && 18020 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg))) { 18021 /* Yes lets make sure to move to persist before timer-start */ 18022 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 18023 } 18024 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack); 18025 rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use); 18026 } 18027 #ifdef NETFLIX_SHARED_CWND 18028 if ((sbavail(sb) == 0) && 18029 rack->r_ctl.rc_scw) { 18030 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 18031 rack->rack_scwnd_is_idle = 1; 18032 } 18033 #endif 18034 #ifdef TCP_ACCOUNTING 18035 if (tot_len_this_send > 0) { 18036 crtsc = get_cyclecount(); 18037 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18038 tp->tcp_cnt_counters[SND_OUT_DATA]++; 18039 } 18040 counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], 1); 18041 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18042 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); 18043 } 18044 counter_u64_add(tcp_proc_time[SND_OUT_DATA], (crtsc - ts_val)); 18045 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18046 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) / segsiz); 18047 } 18048 counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((tot_len_this_send + segsiz - 1) / segsiz)); 18049 } else { 18050 crtsc = get_cyclecount(); 18051 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18052 tp->tcp_cnt_counters[SND_LIMITED]++; 18053 } 18054 counter_u64_add(tcp_cnt_counters[SND_LIMITED], 1); 18055 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18056 tp->tcp_proc_time[SND_LIMITED] += (crtsc - ts_val); 18057 } 18058 counter_u64_add(tcp_proc_time[SND_LIMITED], (crtsc - ts_val)); 18059 } 18060 sched_unpin(); 18061 #endif 18062 return (0); 18063 18064 send: 18065 if (rsm || sack_rxmit) 18066 counter_u64_add(rack_nfto_resend, 1); 18067 else 18068 counter_u64_add(rack_non_fto_send, 1); 18069 if ((flags & TH_FIN) && 18070 sbavail(sb)) { 18071 /* 18072 * We do not transmit a FIN 18073 * with data outstanding. 
We 18074 * need to make it so all data 18075 * is acked first. 18076 */ 18077 flags &= ~TH_FIN; 18078 } 18079 /* Enforce stack imposed max seg size if we have one */ 18080 if (rack->r_ctl.rc_pace_max_segs && 18081 (len > rack->r_ctl.rc_pace_max_segs)) { 18082 mark = 1; 18083 len = rack->r_ctl.rc_pace_max_segs; 18084 } 18085 SOCKBUF_LOCK_ASSERT(sb); 18086 if (len > 0) { 18087 if (len >= segsiz) 18088 tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; 18089 else 18090 tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; 18091 } 18092 /* 18093 * Before ESTABLISHED, force sending of initial options unless TCP 18094 * set not to do any options. NOTE: we assume that the IP/TCP header 18095 * plus TCP options always fit in a single mbuf, leaving room for a 18096 * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr) 18097 * + optlen <= MCLBYTES 18098 */ 18099 optlen = 0; 18100 #ifdef INET6 18101 if (isipv6) 18102 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 18103 else 18104 #endif 18105 hdrlen = sizeof(struct tcpiphdr); 18106 18107 /* 18108 * Compute options for segment. We only have to care about SYN and 18109 * established connection segments. Options for SYN-ACK segments 18110 * are handled in TCP syncache. 18111 */ 18112 to.to_flags = 0; 18113 if ((tp->t_flags & TF_NOOPT) == 0) { 18114 /* Maximum segment size. */ 18115 if (flags & TH_SYN) { 18116 tp->snd_nxt = tp->iss; 18117 to.to_mss = tcp_mssopt(&inp->inp_inc); 18118 if (tp->t_port) 18119 to.to_mss -= V_tcp_udp_tunneling_overhead; 18120 to.to_flags |= TOF_MSS; 18121 18122 /* 18123 * On SYN or SYN|ACK transmits on TFO connections, 18124 * only include the TFO option if it is not a 18125 * retransmit, as the presence of the TFO option may 18126 * have caused the original SYN or SYN|ACK to have 18127 * been dropped by a middlebox. 18128 */ 18129 if (IS_FASTOPEN(tp->t_flags) && 18130 (tp->t_rxtshift == 0)) { 18131 if (tp->t_state == TCPS_SYN_RECEIVED) { 18132 to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; 18133 to.to_tfo_cookie = 18134 (u_int8_t *)&tp->t_tfo_cookie.server; 18135 to.to_flags |= TOF_FASTOPEN; 18136 wanted_cookie = 1; 18137 } else if (tp->t_state == TCPS_SYN_SENT) { 18138 to.to_tfo_len = 18139 tp->t_tfo_client_cookie_len; 18140 to.to_tfo_cookie = 18141 tp->t_tfo_cookie.client; 18142 to.to_flags |= TOF_FASTOPEN; 18143 wanted_cookie = 1; 18144 /* 18145 * If we wind up having more data to 18146 * send with the SYN than can fit in 18147 * one segment, don't send any more 18148 * until the SYN|ACK comes back from 18149 * the other end. 18150 */ 18151 sendalot = 0; 18152 } 18153 } 18154 } 18155 /* Window scaling. */ 18156 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { 18157 to.to_wscale = tp->request_r_scale; 18158 to.to_flags |= TOF_SCALE; 18159 } 18160 /* Timestamps. */ 18161 if ((tp->t_flags & TF_RCVD_TSTMP) || 18162 ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { 18163 to.to_tsval = ms_cts + tp->ts_offset; 18164 to.to_tsecr = tp->ts_recent; 18165 to.to_flags |= TOF_TS; 18166 } 18167 /* Set receive buffer autosizing timestamp. */ 18168 if (tp->rfbuf_ts == 0 && 18169 (so->so_rcv.sb_flags & SB_AUTOSIZE)) 18170 tp->rfbuf_ts = tcp_ts_getticks(); 18171 /* Selective ACK's. 
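	 * Offer SACK-permitted on SYN segments; once established, include
	 * any pending SACK blocks describing data received out of order.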
*/ 18172 if (tp->t_flags & TF_SACK_PERMIT) { 18173 if (flags & TH_SYN) 18174 to.to_flags |= TOF_SACKPERM; 18175 else if (TCPS_HAVEESTABLISHED(tp->t_state) && 18176 tp->rcv_numsacks > 0) { 18177 to.to_flags |= TOF_SACK; 18178 to.to_nsacks = tp->rcv_numsacks; 18179 to.to_sacks = (u_char *)tp->sackblks; 18180 } 18181 } 18182 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 18183 /* TCP-MD5 (RFC2385). */ 18184 if (tp->t_flags & TF_SIGNATURE) 18185 to.to_flags |= TOF_SIGNATURE; 18186 #endif /* TCP_SIGNATURE */ 18187 18188 /* Processing the options. */ 18189 hdrlen += optlen = tcp_addoptions(&to, opt); 18190 /* 18191 * If we wanted a TFO option to be added, but it was unable 18192 * to fit, ensure no data is sent. 18193 */ 18194 if (IS_FASTOPEN(tp->t_flags) && wanted_cookie && 18195 !(to.to_flags & TOF_FASTOPEN)) 18196 len = 0; 18197 } 18198 if (tp->t_port) { 18199 if (V_tcp_udp_tunneling_port == 0) { 18200 /* The port was removed?? */ 18201 SOCKBUF_UNLOCK(&so->so_snd); 18202 #ifdef TCP_ACCOUNTING 18203 crtsc = get_cyclecount(); 18204 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18205 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 18206 } 18207 counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1); 18208 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18209 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 18210 } 18211 counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val)); 18212 sched_unpin(); 18213 #endif 18214 return (EHOSTUNREACH); 18215 } 18216 hdrlen += sizeof(struct udphdr); 18217 } 18218 #ifdef INET6 18219 if (isipv6) 18220 ipoptlen = ip6_optlen(tp->t_inpcb); 18221 else 18222 #endif 18223 if (tp->t_inpcb->inp_options) 18224 ipoptlen = tp->t_inpcb->inp_options->m_len - 18225 offsetof(struct ipoption, ipopt_list); 18226 else 18227 ipoptlen = 0; 18228 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 18229 ipoptlen += ipsec_optlen; 18230 #endif 18231 18232 /* 18233 * Adjust data length if insertion of options will bump the packet 18234 * length beyond the t_maxseg length. Clear the FIN bit because we 18235 * cut off the tail of the segment. 18236 */ 18237 if (len + optlen + ipoptlen > tp->t_maxseg) { 18238 if (tso) { 18239 uint32_t if_hw_tsomax; 18240 uint32_t moff; 18241 int32_t max_len; 18242 18243 /* extract TSO information */ 18244 if_hw_tsomax = tp->t_tsomax; 18245 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 18246 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 18247 KASSERT(ipoptlen == 0, 18248 ("%s: TSO can't do IP options", __func__)); 18249 18250 /* 18251 * Check if we should limit by maximum payload 18252 * length: 18253 */ 18254 if (if_hw_tsomax != 0) { 18255 /* compute maximum TSO length */ 18256 max_len = (if_hw_tsomax - hdrlen - 18257 max_linkhdr); 18258 if (max_len <= 0) { 18259 len = 0; 18260 } else if (len > max_len) { 18261 sendalot = 1; 18262 len = max_len; 18263 mark = 2; 18264 } 18265 } 18266 /* 18267 * Prevent the last segment from being fractional 18268 * unless the send sockbuf can be emptied: 18269 */ 18270 max_len = (tp->t_maxseg - optlen); 18271 if ((sb_offset + len) < sbavail(sb)) { 18272 moff = len % (u_int)max_len; 18273 if (moff != 0) { 18274 mark = 3; 18275 len -= moff; 18276 } 18277 } 18278 /* 18279 * In case there are too many small fragments don't 18280 * use TSO: 18281 */ 18282 if (len <= segsiz) { 18283 mark = 4; 18284 tso = 0; 18285 } 18286 /* 18287 * Send the FIN in a separate segment after the bulk 18288 * sending is done. We don't trust the TSO 18289 * implementations to clear the FIN flag on all but 18290 * the last segment. 
18291 */ 18292 if (tp->t_flags & TF_NEEDFIN) { 18293 sendalot = 4; 18294 } 18295 } else { 18296 mark = 5; 18297 if (optlen + ipoptlen >= tp->t_maxseg) { 18298 /* 18299 * Since we don't have enough space to put 18300 * the IP header chain and the TCP header in 18301 * one packet as required by RFC 7112, don't 18302 * send it. Also ensure that at least one 18303 * byte of the payload can be put into the 18304 * TCP segment. 18305 */ 18306 SOCKBUF_UNLOCK(&so->so_snd); 18307 error = EMSGSIZE; 18308 sack_rxmit = 0; 18309 goto out; 18310 } 18311 len = tp->t_maxseg - optlen - ipoptlen; 18312 sendalot = 5; 18313 } 18314 } else { 18315 tso = 0; 18316 mark = 6; 18317 } 18318 KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, 18319 ("%s: len > IP_MAXPACKET", __func__)); 18320 #ifdef DIAGNOSTIC 18321 #ifdef INET6 18322 if (max_linkhdr + hdrlen > MCLBYTES) 18323 #else 18324 if (max_linkhdr + hdrlen > MHLEN) 18325 #endif 18326 panic("tcphdr too big"); 18327 #endif 18328 18329 /* 18330 * This KASSERT is here to catch edge cases at a well defined place. 18331 * Before, those had triggered (random) panic conditions further 18332 * down. 18333 */ 18334 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 18335 if ((len == 0) && 18336 (flags & TH_FIN) && 18337 (sbused(sb))) { 18338 /* 18339 * We have outstanding data, don't send a fin by itself!. 18340 */ 18341 goto just_return; 18342 } 18343 /* 18344 * Grab a header mbuf, attaching a copy of data to be transmitted, 18345 * and initialize the header from the template for sends on this 18346 * connection. 18347 */ 18348 hw_tls = (sb->sb_flags & SB_TLS_IFNET) != 0; 18349 if (len) { 18350 uint32_t max_val; 18351 uint32_t moff; 18352 18353 if (rack->r_ctl.rc_pace_max_segs) 18354 max_val = rack->r_ctl.rc_pace_max_segs; 18355 else if (rack->rc_user_set_max_segs) 18356 max_val = rack->rc_user_set_max_segs * segsiz; 18357 else 18358 max_val = len; 18359 /* 18360 * We allow a limit on sending with hptsi. 18361 */ 18362 if (len > max_val) { 18363 mark = 7; 18364 len = max_val; 18365 } 18366 #ifdef INET6 18367 if (MHLEN < hdrlen + max_linkhdr) 18368 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 18369 else 18370 #endif 18371 m = m_gethdr(M_NOWAIT, MT_DATA); 18372 18373 if (m == NULL) { 18374 SOCKBUF_UNLOCK(sb); 18375 error = ENOBUFS; 18376 sack_rxmit = 0; 18377 goto out; 18378 } 18379 m->m_data += max_linkhdr; 18380 m->m_len = hdrlen; 18381 18382 /* 18383 * Start the m_copy functions from the closest mbuf to the 18384 * sb_offset in the socket buffer chain. 18385 */ 18386 mb = sbsndptr_noadv(sb, sb_offset, &moff); 18387 s_mb = mb; 18388 s_moff = moff; 18389 if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) { 18390 m_copydata(mb, moff, (int)len, 18391 mtod(m, caddr_t)+hdrlen); 18392 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 18393 sbsndptr_adv(sb, mb, len); 18394 m->m_len += len; 18395 } else { 18396 struct sockbuf *msb; 18397 18398 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 18399 msb = NULL; 18400 else 18401 msb = sb; 18402 m->m_next = tcp_m_copym( 18403 mb, moff, &len, 18404 if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, 18405 ((rsm == NULL) ? hw_tls : 0) 18406 #ifdef NETFLIX_COPY_ARGS 18407 , &filled_all 18408 #endif 18409 ); 18410 if (len <= (tp->t_maxseg - optlen)) { 18411 /* 18412 * Must have ran out of mbufs for the copy 18413 * shorten it to no longer need tso. Lets 18414 * not put on sendalot since we are low on 18415 * mbufs. 
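	 * (tcp_m_copym() trimmed len to at most one segment's worth, so we
	 * fall back to a plain non-TSO send instead of scheduling another
	 * pass.)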
18416 */ 18417 tso = 0; 18418 } 18419 if (m->m_next == NULL) { 18420 SOCKBUF_UNLOCK(sb); 18421 (void)m_free(m); 18422 error = ENOBUFS; 18423 sack_rxmit = 0; 18424 goto out; 18425 } 18426 } 18427 if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { 18428 if (rsm && (rsm->r_flags & RACK_TLP)) { 18429 /* 18430 * TLP should not count in retran count, but 18431 * in its own bin 18432 */ 18433 counter_u64_add(rack_tlp_retran, 1); 18434 counter_u64_add(rack_tlp_retran_bytes, len); 18435 } else { 18436 tp->t_sndrexmitpack++; 18437 KMOD_TCPSTAT_INC(tcps_sndrexmitpack); 18438 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len); 18439 } 18440 #ifdef STATS 18441 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 18442 len); 18443 #endif 18444 } else { 18445 KMOD_TCPSTAT_INC(tcps_sndpack); 18446 KMOD_TCPSTAT_ADD(tcps_sndbyte, len); 18447 #ifdef STATS 18448 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 18449 len); 18450 #endif 18451 } 18452 /* 18453 * If we're sending everything we've got, set PUSH. (This 18454 * will keep happy those implementations which only give 18455 * data to the user when a buffer fills or a PUSH comes in.) 18456 */ 18457 if (sb_offset + len == sbused(sb) && 18458 sbused(sb) && 18459 !(flags & TH_SYN)) { 18460 flags |= TH_PUSH; 18461 add_flag |= RACK_HAD_PUSH; 18462 } 18463 18464 SOCKBUF_UNLOCK(sb); 18465 } else { 18466 SOCKBUF_UNLOCK(sb); 18467 if (tp->t_flags & TF_ACKNOW) 18468 KMOD_TCPSTAT_INC(tcps_sndacks); 18469 else if (flags & (TH_SYN | TH_FIN | TH_RST)) 18470 KMOD_TCPSTAT_INC(tcps_sndctrl); 18471 else 18472 KMOD_TCPSTAT_INC(tcps_sndwinup); 18473 18474 m = m_gethdr(M_NOWAIT, MT_DATA); 18475 if (m == NULL) { 18476 error = ENOBUFS; 18477 sack_rxmit = 0; 18478 goto out; 18479 } 18480 #ifdef INET6 18481 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && 18482 MHLEN >= hdrlen) { 18483 M_ALIGN(m, hdrlen); 18484 } else 18485 #endif 18486 m->m_data += max_linkhdr; 18487 m->m_len = hdrlen; 18488 } 18489 SOCKBUF_UNLOCK_ASSERT(sb); 18490 m->m_pkthdr.rcvif = (struct ifnet *)0; 18491 #ifdef MAC 18492 mac_inpcb_create_mbuf(inp, m); 18493 #endif 18494 if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) { 18495 #ifdef INET6 18496 if (isipv6) 18497 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 18498 else 18499 #endif /* INET6 */ 18500 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 18501 th = rack->r_ctl.fsb.th; 18502 udp = rack->r_ctl.fsb.udp; 18503 if (udp) { 18504 #ifdef INET6 18505 if (isipv6) 18506 ulen = hdrlen + len - sizeof(struct ip6_hdr); 18507 else 18508 #endif /* INET6 */ 18509 ulen = hdrlen + len - sizeof(struct ip); 18510 udp->uh_ulen = htons(ulen); 18511 } 18512 } else { 18513 #ifdef INET6 18514 if (isipv6) { 18515 ip6 = mtod(m, struct ip6_hdr *); 18516 if (tp->t_port) { 18517 udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr)); 18518 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 18519 udp->uh_dport = tp->t_port; 18520 ulen = hdrlen + len - sizeof(struct ip6_hdr); 18521 udp->uh_ulen = htons(ulen); 18522 th = (struct tcphdr *)(udp + 1); 18523 } else 18524 th = (struct tcphdr *)(ip6 + 1); 18525 tcpip_fillheaders(inp, tp->t_port, ip6, th); 18526 } else 18527 #endif /* INET6 */ 18528 { 18529 ip = mtod(m, struct ip *); 18530 #ifdef TCPDEBUG 18531 ipov = (struct ipovly *)ip; 18532 #endif 18533 if (tp->t_port) { 18534 udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip)); 18535 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 18536 udp->uh_dport = tp->t_port; 18537 ulen = hdrlen + len - sizeof(struct ip); 18538 udp->uh_ulen = htons(ulen); 
18539 th = (struct tcphdr *)(udp + 1); 18540 } else 18541 th = (struct tcphdr *)(ip + 1); 18542 tcpip_fillheaders(inp, tp->t_port, ip, th); 18543 } 18544 } 18545 /* 18546 * Fill in fields, remembering maximum advertised window for use in 18547 * delaying messages about window sizes. If resending a FIN, be sure 18548 * not to use a new sequence number. 18549 */ 18550 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 18551 tp->snd_nxt == tp->snd_max) 18552 tp->snd_nxt--; 18553 /* 18554 * If we are starting a connection, send ECN setup SYN packet. If we 18555 * are on a retransmit, we may resend those bits a number of times 18556 * as per RFC 3168. 18557 */ 18558 if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) { 18559 if (tp->t_rxtshift >= 1) { 18560 if (tp->t_rxtshift <= V_tcp_ecn_maxretries) 18561 flags |= TH_ECE | TH_CWR; 18562 } else 18563 flags |= TH_ECE | TH_CWR; 18564 } 18565 /* Handle parallel SYN for ECN */ 18566 if ((tp->t_state == TCPS_SYN_RECEIVED) && 18567 (tp->t_flags2 & TF2_ECN_SND_ECE)) { 18568 flags |= TH_ECE; 18569 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 18570 } 18571 if (TCPS_HAVEESTABLISHED(tp->t_state) && 18572 (tp->t_flags2 & TF2_ECN_PERMIT)) { 18573 /* 18574 * If the peer has ECN, mark data packets with ECN capable 18575 * transmission (ECT). Ignore pure ack packets, 18576 * retransmissions. 18577 */ 18578 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && 18579 (sack_rxmit == 0)) { 18580 #ifdef INET6 18581 if (isipv6) 18582 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); 18583 else 18584 #endif 18585 ip->ip_tos |= IPTOS_ECN_ECT0; 18586 KMOD_TCPSTAT_INC(tcps_ecn_ect0); 18587 /* 18588 * Reply with proper ECN notifications. 18589 * Only set CWR on new data segments. 18590 */ 18591 if (tp->t_flags2 & TF2_ECN_SND_CWR) { 18592 flags |= TH_CWR; 18593 tp->t_flags2 &= ~TF2_ECN_SND_CWR; 18594 } 18595 } 18596 if (tp->t_flags2 & TF2_ECN_SND_ECE) 18597 flags |= TH_ECE; 18598 } 18599 /* 18600 * If we are doing retransmissions, then snd_nxt will not reflect 18601 * the first unsent octet. For ACK only packets, we do not want the 18602 * sequence number of the retransmitted packet, we want the sequence 18603 * number of the next unsent octet. So, if there is no data (and no 18604 * SYN or FIN), use snd_max instead of snd_nxt when filling in 18605 * ti_seq. But if we are in persist state, snd_max might reflect 18606 * one byte beyond the right edge of the window, so use snd_nxt in 18607 * that case, since we know we aren't doing a retransmission. 18608 * (retransmit and persist are mutually exclusive...) 18609 */ 18610 if (sack_rxmit == 0) { 18611 if (len || (flags & (TH_SYN | TH_FIN))) { 18612 th->th_seq = htonl(tp->snd_nxt); 18613 rack_seq = tp->snd_nxt; 18614 } else { 18615 th->th_seq = htonl(tp->snd_max); 18616 rack_seq = tp->snd_max; 18617 } 18618 } else { 18619 th->th_seq = htonl(rsm->r_start); 18620 rack_seq = rsm->r_start; 18621 } 18622 th->th_ack = htonl(tp->rcv_nxt); 18623 th->th_flags = flags; 18624 /* 18625 * Calculate receive window. Don't shrink window, but avoid silly 18626 * window syndrome. 18627 * If a RST segment is sent, advertise a window of zero. 
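	 * The receiver-side SWS test below zeroes recwin when it is
	 * smaller than both a quarter of the receive buffer and one
	 * segment, and recwin is never allowed to shrink below what
	 * has already been advertised (rcv_adv - rcv_nxt).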
18628 */ 18629 if (flags & TH_RST) { 18630 recwin = 0; 18631 } else { 18632 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && 18633 recwin < (long)segsiz) { 18634 recwin = 0; 18635 } 18636 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && 18637 recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) 18638 recwin = (long)(tp->rcv_adv - tp->rcv_nxt); 18639 } 18640 18641 /* 18642 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or 18643 * <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> case is 18644 * handled in syncache. 18645 */ 18646 if (flags & TH_SYN) 18647 th->th_win = htons((u_short) 18648 (min(sbspace(&so->so_rcv), TCP_MAXWIN))); 18649 else { 18650 /* Avoid shrinking window with window scaling. */ 18651 recwin = roundup2(recwin, 1 << tp->rcv_scale); 18652 th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); 18653 } 18654 /* 18655 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 18656 * window. This may cause the remote transmitter to stall. This 18657 * flag tells soreceive() to disable delayed acknowledgements when 18658 * draining the buffer. This can occur if the receiver is 18659 * attempting to read more data than can be buffered prior to 18660 * transmitting on the connection. 18661 */ 18662 if (th->th_win == 0) { 18663 tp->t_sndzerowin++; 18664 tp->t_flags |= TF_RXWIN0SENT; 18665 } else 18666 tp->t_flags &= ~TF_RXWIN0SENT; 18667 tp->snd_up = tp->snd_una; /* drag it along, its deprecated */ 18668 /* Now are we using fsb?, if so copy the template data to the mbuf */ 18669 if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) { 18670 uint8_t *cpto; 18671 18672 cpto = mtod(m, uint8_t *); 18673 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); 18674 /* 18675 * We have just copied in: 18676 * IP/IP6 18677 * <optional udphdr> 18678 * tcphdr (no options) 18679 * 18680 * We need to grab the correct pointers into the mbuf 18681 * for both the tcp header, and possibly the udp header (if tunneling). 18682 * We do this by using the offset in the copy buffer and adding it 18683 * to the mbuf base pointer (cpto). 18684 */ 18685 #ifdef INET6 18686 if (isipv6) 18687 ip6 = mtod(m, struct ip6_hdr *); 18688 else 18689 #endif /* INET6 */ 18690 ip = mtod(m, struct ip *); 18691 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr)); 18692 /* If we have a udp header lets set it into the mbuf as well */ 18693 if (udp) 18694 udp = (struct udphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.udp - rack->r_ctl.fsb.tcp_ip_hdr)); 18695 } 18696 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 18697 if (to.to_flags & TOF_SIGNATURE) { 18698 /* 18699 * Calculate MD5 signature and put it into the place 18700 * determined before. 18701 * NOTE: since TCP options buffer doesn't point into 18702 * mbuf's data, calculate offset and use it. 18703 */ 18704 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, 18705 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { 18706 /* 18707 * Do not send segment if the calculation of MD5 18708 * digest has failed. 18709 */ 18710 goto out; 18711 } 18712 } 18713 #endif 18714 if (optlen) { 18715 bcopy(opt, th + 1, optlen); 18716 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 18717 } 18718 /* 18719 * Put TCP length in extended header, and then checksum extended 18720 * header and data. 
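	 * Only the pseudo-header sum (addresses, protocol and length) is
	 * stored in th_sum/uh_sum here; the rest of the checksum is
	 * finished later in hardware or by the checksum fallback in the
	 * IP output path, as directed by the csum_flags set below.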
 */
	m->m_pkthdr.len = hdrlen + len;	/* in6_cksum() needs this */
#ifdef INET6
	if (isipv6) {
		/*
		 * ip6_plen need not be filled in now; it will be filled in
		 * by ip6_output.
		 */
		if (tp->t_port) {
			m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
			udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
			th->th_sum = htons(0);
			UDPSTAT_INC(udps_opackets);
		} else {
			m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
			th->th_sum = in6_cksum_pseudo(ip6,
			    sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
			    0);
		}
	}
#endif
#if defined(INET6) && defined(INET)
	else
#endif
#ifdef INET
	{
		if (tp->t_port) {
			m->m_pkthdr.csum_flags = CSUM_UDP;
			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
			udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
			    ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
			th->th_sum = htons(0);
			UDPSTAT_INC(udps_opackets);
		} else {
			m->m_pkthdr.csum_flags = CSUM_TCP;
			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
			th->th_sum = in_pseudo(ip->ip_src.s_addr,
			    ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
			    IPPROTO_TCP + len + optlen));
		}
		/* IP version must be set here for ipv4/ipv6 checking later */
		KASSERT(ip->ip_v == IPVERSION,
		    ("%s: IP version incorrect: %d", __func__, ip->ip_v));
	}
#endif
	/*
	 * Enable TSO and specify the size of the segments. The TCP pseudo
	 * header checksum is always provided. XXX: Fixme: This is currently
	 * not the case for IPv6.
	 */
	if (tso) {
		KASSERT(len > tp->t_maxseg - optlen,
		    ("%s: len <= tso_segsz", __func__));
		m->m_pkthdr.csum_flags |= CSUM_TSO;
		m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
	}
	KASSERT(len + hdrlen == m_length(m, NULL),
	    ("%s: mbuf chain different than expected: %d + %u != %u",
	    __func__, len, hdrlen, m_length(m, NULL)));

#ifdef TCP_HHOOK
	/* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
	hhook_run_tcp_est_out(tp, th, &to, len, tso);
#endif
	/*
	 * We're getting ready to send; log now.
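	 * In the stack-specific record below, flex8 encodes the kind of
	 * send (0 = new data, 1 = retransmission, 2 = TLP retransmission,
	 * 3 = TLP of new data) and flex7 carries the 'mark' branch tag
	 * in its upper bits with the pass counter in the low byte.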
	 */
	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;

		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
		if (rack->rack_no_prr)
			log.u_bbr.flex1 = 0;
		else
			log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
		log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
		log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
		log.u_bbr.flex4 = orig_len;
		if (filled_all)
			log.u_bbr.flex5 = 0x80000000;
		else
			log.u_bbr.flex5 = 0;
		/* Save off the early/late values */
		log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
		log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
		log.u_bbr.bw_inuse = rack_get_bw(rack);
		if (rsm || sack_rxmit) {
			if (doing_tlp)
				log.u_bbr.flex8 = 2;
			else
				log.u_bbr.flex8 = 1;
		} else {
			if (doing_tlp)
				log.u_bbr.flex8 = 3;
			else
				log.u_bbr.flex8 = 0;
		}
		log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm);
		log.u_bbr.flex7 = mark;
		log.u_bbr.flex7 <<= 8;
		log.u_bbr.flex7 |= pass;
		log.u_bbr.pkts_out = tp->t_maxseg;
		log.u_bbr.timeStamp = cts;
		log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
		log.u_bbr.lt_epoch = cwnd_to_use;
		log.u_bbr.delivered = sendalot;
		lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
		    len, &log, false, NULL, NULL, 0, &tv);
	} else
		lgb = NULL;

	/*
	 * Fill in IP length and desired time to live and send to IP level.
	 * There should be a better way to handle ttl and tos; we could keep
	 * them in the template, but need a way to checksum without them.
	 */
	/*
	 * m->m_pkthdr.len should have been set before the checksum
	 * calculation, because in6_cksum() needs it.
	 */
#ifdef INET6
	if (isipv6) {
		/*
		 * We set the hop limit separately for every segment, since
		 * the user might want to change the value via setsockopt.
		 * Also, the desired default hop limit might be changed via
		 * Neighbor Discovery.
		 */
		rack->r_ctl.fsb.hoplimit = ip6->ip6_hlim = in6_selecthlim(inp, NULL);

		/*
		 * Set the packet size here for the benefit of DTrace
		 * probes. ip6_output() will set it properly; it's supposed
		 * to include the option header lengths as well.
		 */
		ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));

		if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
			tp->t_flags2 |= TF2_PLPMTU_PMTUD;
		else
			tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;

		if (tp->t_state == TCPS_SYN_SENT)
			TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);

		TCP_PROBE5(send, NULL, tp, ip6, tp, th);
		/* TODO: IPv6 IP6TOS_ECT bit on */
		error = ip6_output(m,
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
		    inp->in6p_outputopts,
#else
		    NULL,
#endif
		    &inp->inp_route6,
		    ((rsm || sack_rxmit) ?
		    IP_NO_SND_TAG_RL : 0),
		    NULL, NULL, inp);

		if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL)
			mtu = inp->inp_route6.ro_nh->nh_mtu;
	}
#endif /* INET6 */
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
	{
		ip->ip_len = htons(m->m_pkthdr.len);
#ifdef INET6
		if (inp->inp_vflag & INP_IPV6PROTO)
			ip->ip_ttl = in6_selecthlim(inp, NULL);
#endif /* INET6 */
		rack->r_ctl.fsb.hoplimit = ip->ip_ttl;
		/*
		 * If we do path MTU discovery, then we set DF on every
		 * packet. This might not be the best thing to do according
		 * to RFC3390 Section 2. However the tcp hostcache mitigates
		 * the problem so it affects only the first tcp connection
		 * with a host.
		 *
		 * NB: Don't set DF on small MTU/MSS to have a safe
		 * fallback.
		 */
		if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
			tp->t_flags2 |= TF2_PLPMTU_PMTUD;
			if (tp->t_port == 0 || len < V_tcp_minmss) {
				ip->ip_off |= htons(IP_DF);
			}
		} else {
			tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
		}

		if (tp->t_state == TCPS_SYN_SENT)
			TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);

		TCP_PROBE5(send, NULL, tp, ip, tp, th);

		error = ip_output(m,
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
		    inp->inp_options,
#else
		    NULL,
#endif
		    &inp->inp_route,
		    ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0,
		    inp);
		if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL)
			mtu = inp->inp_route.ro_nh->nh_mtu;
	}
#endif /* INET */

out:
	if (lgb) {
		lgb->tlb_errno = error;
		lgb = NULL;
	}
	/*
	 * In transmit state, time the transmission and arrange for the
	 * retransmit. In persist state, just set snd_max.
18940 */ 18941 if (error == 0) { 18942 tcp_account_for_send(tp, len, (rsm != NULL), doing_tlp, hw_tls); 18943 if (rsm && doing_tlp) { 18944 rack->rc_last_sent_tlp_past_cumack = 0; 18945 rack->rc_last_sent_tlp_seq_valid = 1; 18946 rack->r_ctl.last_sent_tlp_seq = rsm->r_start; 18947 rack->r_ctl.last_sent_tlp_len = rsm->r_end - rsm->r_start; 18948 } 18949 rack->forced_ack = 0; /* If we send something zap the FA flag */ 18950 if (rsm && (doing_tlp == 0)) { 18951 /* Set we retransmitted */ 18952 rack->rc_gp_saw_rec = 1; 18953 } else { 18954 if (cwnd_to_use > tp->snd_ssthresh) { 18955 /* Set we sent in CA */ 18956 rack->rc_gp_saw_ca = 1; 18957 } else { 18958 /* Set we sent in SS */ 18959 rack->rc_gp_saw_ss = 1; 18960 } 18961 } 18962 if (TCPS_HAVEESTABLISHED(tp->t_state) && 18963 (tp->t_flags & TF_SACK_PERMIT) && 18964 tp->rcv_numsacks > 0) 18965 tcp_clean_dsack_blocks(tp); 18966 tot_len_this_send += len; 18967 if (len == 0) 18968 counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); 18969 else if (len == 1) { 18970 counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1); 18971 } else if (len > 1) { 18972 int idx; 18973 18974 idx = (len / segsiz) + 3; 18975 if (idx >= TCP_MSS_ACCT_ATIMER) 18976 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 18977 else 18978 counter_u64_add(rack_out_size[idx], 1); 18979 } 18980 } 18981 if ((rack->rack_no_prr == 0) && 18982 sub_from_prr && 18983 (error == 0)) { 18984 if (rack->r_ctl.rc_prr_sndcnt >= len) 18985 rack->r_ctl.rc_prr_sndcnt -= len; 18986 else 18987 rack->r_ctl.rc_prr_sndcnt = 0; 18988 } 18989 sub_from_prr = 0; 18990 if (doing_tlp) { 18991 /* Make sure the TLP is added */ 18992 add_flag |= RACK_TLP; 18993 } else if (rsm) { 18994 /* If its a resend without TLP then it must not have the flag */ 18995 rsm->r_flags &= ~RACK_TLP; 18996 } 18997 rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, 18998 rack_to_usec_ts(&tv), 18999 rsm, add_flag, s_mb, s_moff, hw_tls); 19000 19001 19002 if ((error == 0) && 19003 (len > 0) && 19004 (tp->snd_una == tp->snd_max)) 19005 rack->r_ctl.rc_tlp_rxt_last_time = cts; 19006 { 19007 tcp_seq startseq = tp->snd_nxt; 19008 19009 /* Track our lost count */ 19010 if (rsm && (doing_tlp == 0)) 19011 rack->r_ctl.rc_loss_count += rsm->r_end - rsm->r_start; 19012 /* 19013 * Advance snd_nxt over sequence space of this segment. 19014 */ 19015 if (error) 19016 /* We don't log or do anything with errors */ 19017 goto nomore; 19018 if (doing_tlp == 0) { 19019 if (rsm == NULL) { 19020 /* 19021 * Not a retransmission of some 19022 * sort, new data is going out so 19023 * clear our TLP count and flag. 19024 */ 19025 rack->rc_tlp_in_progress = 0; 19026 rack->r_ctl.rc_tlp_cnt_out = 0; 19027 } 19028 } else { 19029 /* 19030 * We have just sent a TLP, mark that it is true 19031 * and make sure our in progress is set so we 19032 * continue to check the count. 19033 */ 19034 rack->rc_tlp_in_progress = 1; 19035 rack->r_ctl.rc_tlp_cnt_out++; 19036 } 19037 if (flags & (TH_SYN | TH_FIN)) { 19038 if (flags & TH_SYN) 19039 tp->snd_nxt++; 19040 if (flags & TH_FIN) { 19041 tp->snd_nxt++; 19042 tp->t_flags |= TF_SENTFIN; 19043 } 19044 } 19045 /* In the ENOBUFS case we do *not* update snd_max */ 19046 if (sack_rxmit) 19047 goto nomore; 19048 19049 tp->snd_nxt += len; 19050 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 19051 if (tp->snd_una == tp->snd_max) { 19052 /* 19053 * Update the time we just added data since 19054 * none was outstanding. 
19055 */ 19056 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 19057 tp->t_acktime = ticks; 19058 } 19059 tp->snd_max = tp->snd_nxt; 19060 /* 19061 * Time this transmission if not a retransmission and 19062 * not currently timing anything. 19063 * This is only relevant in case of switching back to 19064 * the base stack. 19065 */ 19066 if (tp->t_rtttime == 0) { 19067 tp->t_rtttime = ticks; 19068 tp->t_rtseq = startseq; 19069 KMOD_TCPSTAT_INC(tcps_segstimed); 19070 } 19071 if (len && 19072 ((tp->t_flags & TF_GPUTINPROG) == 0)) 19073 rack_start_gp_measurement(tp, rack, startseq, sb_offset); 19074 } 19075 /* 19076 * If we are doing FO we need to update the mbuf position and subtract 19077 * this happens when the peer sends us duplicate information and 19078 * we thus want to send a DSACK. 19079 * 19080 * XXXRRS: This brings to mind a ?, when we send a DSACK block is TSO 19081 * turned off? If not then we are going to echo multiple DSACK blocks 19082 * out (with the TSO), which we should not be doing. 19083 */ 19084 if (rack->r_fast_output && len) { 19085 if (rack->r_ctl.fsb.left_to_send > len) 19086 rack->r_ctl.fsb.left_to_send -= len; 19087 else 19088 rack->r_ctl.fsb.left_to_send = 0; 19089 if (rack->r_ctl.fsb.left_to_send < segsiz) 19090 rack->r_fast_output = 0; 19091 if (rack->r_fast_output) { 19092 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); 19093 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; 19094 } 19095 } 19096 } 19097 nomore: 19098 if (error) { 19099 rack->r_ctl.rc_agg_delayed = 0; 19100 rack->r_early = 0; 19101 rack->r_late = 0; 19102 rack->r_ctl.rc_agg_early = 0; 19103 SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */ 19104 /* 19105 * Failures do not advance the seq counter above. For the 19106 * case of ENOBUFS we will fall out and retry in 1ms with 19107 * the hpts. Everything else will just have to retransmit 19108 * with the timer. 19109 * 19110 * In any case, we do not want to loop around for another 19111 * send without a good reason. 19112 */ 19113 sendalot = 0; 19114 switch (error) { 19115 case EPERM: 19116 tp->t_softerror = error; 19117 #ifdef TCP_ACCOUNTING 19118 crtsc = get_cyclecount(); 19119 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19120 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 19121 } 19122 counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1); 19123 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19124 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 19125 } 19126 counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val)); 19127 sched_unpin(); 19128 #endif 19129 return (error); 19130 case ENOBUFS: 19131 /* 19132 * Pace us right away to retry in a some 19133 * time 19134 */ 19135 slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); 19136 if (rack->rc_enobuf < 0x7f) 19137 rack->rc_enobuf++; 19138 if (slot < (10 * HPTS_USEC_IN_MSEC)) 19139 slot = 10 * HPTS_USEC_IN_MSEC; 19140 if (rack->r_ctl.crte != NULL) { 19141 counter_u64_add(rack_saw_enobuf_hw, 1); 19142 tcp_rl_log_enobuf(rack->r_ctl.crte); 19143 } 19144 counter_u64_add(rack_saw_enobuf, 1); 19145 goto enobufs; 19146 case EMSGSIZE: 19147 /* 19148 * For some reason the interface we used initially 19149 * to send segments changed to another or lowered 19150 * its MTU. If TSO was active we either got an 19151 * interface without TSO capabilits or TSO was 19152 * turned off. If we obtained mtu from ip_output() 19153 * then update it and try again. 
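	 * Otherwise fall back to a short (10ms) hpts delay and let the
	 * normal retransmit machinery recover.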
19154 */ 19155 if (tso) 19156 tp->t_flags &= ~TF_TSO; 19157 if (mtu != 0) { 19158 tcp_mss_update(tp, -1, mtu, NULL, NULL); 19159 goto again; 19160 } 19161 slot = 10 * HPTS_USEC_IN_MSEC; 19162 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); 19163 #ifdef TCP_ACCOUNTING 19164 crtsc = get_cyclecount(); 19165 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19166 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 19167 } 19168 counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1); 19169 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19170 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 19171 } 19172 counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val)); 19173 sched_unpin(); 19174 #endif 19175 return (error); 19176 case ENETUNREACH: 19177 counter_u64_add(rack_saw_enetunreach, 1); 19178 case EHOSTDOWN: 19179 case EHOSTUNREACH: 19180 case ENETDOWN: 19181 if (TCPS_HAVERCVDSYN(tp->t_state)) { 19182 tp->t_softerror = error; 19183 } 19184 /* FALLTHROUGH */ 19185 default: 19186 slot = 10 * HPTS_USEC_IN_MSEC; 19187 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); 19188 #ifdef TCP_ACCOUNTING 19189 crtsc = get_cyclecount(); 19190 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19191 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 19192 } 19193 counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1); 19194 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19195 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 19196 } 19197 counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val)); 19198 sched_unpin(); 19199 #endif 19200 return (error); 19201 } 19202 } else { 19203 rack->rc_enobuf = 0; 19204 if (IN_FASTRECOVERY(tp->t_flags) && rsm) 19205 rack->r_ctl.retran_during_recovery += len; 19206 } 19207 KMOD_TCPSTAT_INC(tcps_sndtotal); 19208 19209 /* 19210 * Data sent (as far as we can tell). If this advertises a larger 19211 * window than any other segment, then remember the size of the 19212 * advertised window. Any pending ACK has now been sent. 19213 */ 19214 if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) 19215 tp->rcv_adv = tp->rcv_nxt + recwin; 19216 19217 tp->last_ack_sent = tp->rcv_nxt; 19218 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 19219 enobufs: 19220 if (sendalot) { 19221 /* Do we need to turn off sendalot? */ 19222 if (rack->r_ctl.rc_pace_max_segs && 19223 (tot_len_this_send >= rack->r_ctl.rc_pace_max_segs)) { 19224 /* We hit our max. */ 19225 sendalot = 0; 19226 } else if ((rack->rc_user_set_max_segs) && 19227 (tot_len_this_send >= (rack->rc_user_set_max_segs * segsiz))) { 19228 /* We hit the user defined max */ 19229 sendalot = 0; 19230 } 19231 } 19232 if ((error == 0) && (flags & TH_FIN)) 19233 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN); 19234 if (flags & TH_RST) { 19235 /* 19236 * We don't send again after sending a RST. 19237 */ 19238 slot = 0; 19239 sendalot = 0; 19240 if (error == 0) 19241 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 19242 } else if ((slot == 0) && (sendalot == 0) && tot_len_this_send) { 19243 /* 19244 * Get our pacing rate, if an error 19245 * occurred in sending (ENOBUF) we would 19246 * hit the else if with slot preset. Other 19247 * errors return. 19248 */ 19249 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz); 19250 } 19251 if (rsm && 19252 (rsm->r_flags & RACK_HAS_SYN) == 0 && 19253 rack->use_rack_rr) { 19254 /* Its a retransmit and we use the rack cheat? 
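	 * (With rack-rr a retransmission is sent on the minimum rack
	 * timeout, rc_min_to, instead of the computed pacing delay.)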
*/ 19255 if ((slot == 0) || 19256 (rack->rc_always_pace == 0) || 19257 (rack->r_rr_config == 1)) { 19258 /* 19259 * We have no pacing set or we 19260 * are using old-style rack or 19261 * we are overriden to use the old 1ms pacing. 19262 */ 19263 slot = rack->r_ctl.rc_min_to; 19264 } 19265 } 19266 /* We have sent clear the flag */ 19267 rack->r_ent_rec_ns = 0; 19268 if (rack->r_must_retran) { 19269 if (rsm) { 19270 rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start); 19271 if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) { 19272 /* 19273 * We have retransmitted all. 19274 */ 19275 rack->r_must_retran = 0; 19276 rack->r_ctl.rc_out_at_rto = 0; 19277 } 19278 } else if (SEQ_GEQ(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) { 19279 /* 19280 * Sending new data will also kill 19281 * the loop. 19282 */ 19283 rack->r_must_retran = 0; 19284 rack->r_ctl.rc_out_at_rto = 0; 19285 } 19286 } 19287 rack->r_ctl.fsb.recwin = recwin; 19288 if ((tp->t_flags & (TF_WASCRECOVERY|TF_WASFRECOVERY)) && 19289 SEQ_GT(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) { 19290 /* 19291 * We hit an RTO and now have past snd_max at the RTO 19292 * clear all the WAS flags. 19293 */ 19294 tp->t_flags &= ~(TF_WASCRECOVERY|TF_WASFRECOVERY); 19295 } 19296 if (slot) { 19297 /* set the rack tcb into the slot N */ 19298 counter_u64_add(rack_paced_segments, 1); 19299 if ((error == 0) && 19300 rack_use_rfo && 19301 ((flags & (TH_SYN|TH_FIN)) == 0) && 19302 (rsm == NULL) && 19303 (tp->snd_nxt == tp->snd_max) && 19304 (ipoptlen == 0) && 19305 (tp->rcv_numsacks == 0) && 19306 rack->r_fsb_inited && 19307 TCPS_HAVEESTABLISHED(tp->t_state) && 19308 (rack->r_must_retran == 0) && 19309 ((tp->t_flags & TF_NEEDFIN) == 0) && 19310 (len > 0) && (orig_len > 0) && 19311 (orig_len > len) && 19312 ((orig_len - len) >= segsiz) && 19313 ((optlen == 0) || 19314 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { 19315 /* We can send at least one more MSS using our fsb */ 19316 19317 rack->r_fast_output = 1; 19318 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); 19319 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; 19320 rack->r_ctl.fsb.tcp_flags = flags; 19321 rack->r_ctl.fsb.left_to_send = orig_len - len; 19322 if (hw_tls) 19323 rack->r_ctl.fsb.hw_tls = 1; 19324 else 19325 rack->r_ctl.fsb.hw_tls = 0; 19326 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))), 19327 ("rack:%p left_to_send:%u sbavail:%u out:%u", 19328 rack, rack->r_ctl.fsb.left_to_send, sbavail(sb), 19329 (tp->snd_max - tp->snd_una))); 19330 if (rack->r_ctl.fsb.left_to_send < segsiz) 19331 rack->r_fast_output = 0; 19332 else { 19333 if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una))) 19334 rack->r_ctl.fsb.rfo_apply_push = 1; 19335 else 19336 rack->r_ctl.fsb.rfo_apply_push = 0; 19337 } 19338 } else 19339 rack->r_fast_output = 0; 19340 rack_log_fsb(rack, tp, so, flags, 19341 ipoptlen, orig_len, len, error, 19342 (rsm == NULL), optlen, __LINE__, 2); 19343 } else if (sendalot) { 19344 int ret; 19345 19346 if (len) 19347 counter_u64_add(rack_unpaced_segments, 1); 19348 sack_rxmit = 0; 19349 if ((error == 0) && 19350 rack_use_rfo && 19351 ((flags & (TH_SYN|TH_FIN)) == 0) && 19352 (rsm == NULL) && 19353 (ipoptlen == 0) && 19354 (tp->rcv_numsacks == 0) && 19355 (tp->snd_nxt == tp->snd_max) && 19356 (rack->r_must_retran == 0) && 19357 rack->r_fsb_inited && 19358 TCPS_HAVEESTABLISHED(tp->t_state) && 19359 ((tp->t_flags & TF_NEEDFIN) == 0) && 19360 (len > 0) && (orig_len > 0) 
&& 19361 (orig_len > len) && 19362 ((orig_len - len) >= segsiz) && 19363 ((optlen == 0) || 19364 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { 19365 /* we can use fast_output for more */ 19366 19367 rack->r_fast_output = 1; 19368 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); 19369 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; 19370 rack->r_ctl.fsb.tcp_flags = flags; 19371 rack->r_ctl.fsb.left_to_send = orig_len - len; 19372 if (hw_tls) 19373 rack->r_ctl.fsb.hw_tls = 1; 19374 else 19375 rack->r_ctl.fsb.hw_tls = 0; 19376 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))), 19377 ("rack:%p left_to_send:%u sbavail:%u out:%u", 19378 rack, rack->r_ctl.fsb.left_to_send, sbavail(sb), 19379 (tp->snd_max - tp->snd_una))); 19380 if (rack->r_ctl.fsb.left_to_send < segsiz) { 19381 rack->r_fast_output = 0; 19382 } 19383 if (rack->r_fast_output) { 19384 if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una))) 19385 rack->r_ctl.fsb.rfo_apply_push = 1; 19386 else 19387 rack->r_ctl.fsb.rfo_apply_push = 0; 19388 rack_log_fsb(rack, tp, so, flags, 19389 ipoptlen, orig_len, len, error, 19390 (rsm == NULL), optlen, __LINE__, 3); 19391 error = 0; 19392 ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error); 19393 if (ret >= 0) 19394 return (ret); 19395 else if (error) 19396 goto nomore; 19397 19398 } 19399 } 19400 goto again; 19401 } else if (len) { 19402 counter_u64_add(rack_unpaced_segments, 1); 19403 } 19404 /* Assure when we leave that snd_nxt will point to top */ 19405 if (SEQ_GT(tp->snd_max, tp->snd_nxt)) 19406 tp->snd_nxt = tp->snd_max; 19407 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0); 19408 #ifdef TCP_ACCOUNTING 19409 crtsc = get_cyclecount() - ts_val; 19410 if (tot_len_this_send) { 19411 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19412 tp->tcp_cnt_counters[SND_OUT_DATA]++; 19413 } 19414 counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], 1); 19415 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19416 tp->tcp_proc_time[SND_OUT_DATA] += crtsc; 19417 } 19418 counter_u64_add(tcp_proc_time[SND_OUT_DATA], crtsc); 19419 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19420 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) /segsiz); 19421 } 19422 counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((tot_len_this_send + segsiz - 1) /segsiz)); 19423 } else { 19424 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19425 tp->tcp_cnt_counters[SND_OUT_ACK]++; 19426 } 19427 counter_u64_add(tcp_cnt_counters[SND_OUT_ACK], 1); 19428 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 19429 tp->tcp_proc_time[SND_OUT_ACK] += crtsc; 19430 } 19431 counter_u64_add(tcp_proc_time[SND_OUT_ACK], crtsc); 19432 } 19433 sched_unpin(); 19434 #endif 19435 if (error == ENOBUFS) 19436 error = 0; 19437 return (error); 19438 } 19439 19440 static void 19441 rack_update_seg(struct tcp_rack *rack) 19442 { 19443 uint32_t orig_val; 19444 19445 orig_val = rack->r_ctl.rc_pace_max_segs; 19446 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 19447 if (orig_val != rack->r_ctl.rc_pace_max_segs) 19448 rack_log_pacing_delay_calc(rack, 0, 0, orig_val, 0, 0, 15, __LINE__, NULL, 0); 19449 } 19450 19451 static void 19452 rack_mtu_change(struct tcpcb *tp) 19453 { 19454 /* 19455 * The MSS may have changed 19456 */ 19457 struct tcp_rack *rack; 19458 struct rack_sendmap *rsm; 19459 19460 rack = (struct tcp_rack *)tp->t_fb_ptr; 19461 if (rack->r_ctl.rc_pace_min_segs != ctf_fixed_maxseg(tp)) { 
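		/*
		 * rc_pace_min_segs was derived from the previous MSS, so a
		 * mismatch with ctf_fixed_maxseg() indicates the MSS has
		 * really changed.
		 */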
19462 /* 19463 * The MTU has changed we need to resend everything 19464 * since all we have sent is lost. We first fix 19465 * up the mtu though. 19466 */ 19467 rack_set_pace_segments(tp, rack, __LINE__, NULL); 19468 /* We treat this like a full retransmit timeout without the cwnd adjustment */ 19469 rack_remxt_tmr(tp); 19470 rack->r_fast_output = 0; 19471 rack->r_ctl.rc_out_at_rto = ctf_flight_size(tp, 19472 rack->r_ctl.rc_sacked); 19473 rack->r_ctl.rc_snd_max_at_rto = tp->snd_max; 19474 rack->r_must_retran = 1; 19475 /* Mark all inflight to needing to be rxt'd */ 19476 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 19477 rsm->r_flags |= RACK_MUST_RXT; 19478 } 19479 } 19480 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 19481 /* We don't use snd_nxt to retransmit */ 19482 tp->snd_nxt = tp->snd_max; 19483 } 19484 19485 static int 19486 rack_set_profile(struct tcp_rack *rack, int prof) 19487 { 19488 int err = EINVAL; 19489 if (prof == 1) { 19490 /* pace_always=1 */ 19491 if (rack->rc_always_pace == 0) { 19492 if (tcp_can_enable_pacing() == 0) 19493 return (EBUSY); 19494 } 19495 rack->rc_always_pace = 1; 19496 if (rack->use_fixed_rate || rack->gp_ready) 19497 rack_set_cc_pacing(rack); 19498 rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; 19499 rack->rack_attempt_hdwr_pace = 0; 19500 /* cmpack=1 */ 19501 if (rack_use_cmp_acks) 19502 rack->r_use_cmp_ack = 1; 19503 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) && 19504 rack->r_use_cmp_ack) 19505 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; 19506 /* scwnd=1 */ 19507 rack->rack_enable_scwnd = 1; 19508 /* dynamic=100 */ 19509 rack->rc_gp_dyn_mul = 1; 19510 /* gp_inc_ca */ 19511 rack->r_ctl.rack_per_of_gp_ca = 100; 19512 /* rrr_conf=3 */ 19513 rack->r_rr_config = 3; 19514 /* npush=2 */ 19515 rack->r_ctl.rc_no_push_at_mrtt = 2; 19516 /* fillcw=1 */ 19517 rack->rc_pace_to_cwnd = 1; 19518 rack->rc_pace_fill_if_rttin_range = 0; 19519 rack->rtt_limit_mul = 0; 19520 /* noprr=1 */ 19521 rack->rack_no_prr = 1; 19522 /* lscwnd=1 */ 19523 rack->r_limit_scw = 1; 19524 /* gp_inc_rec */ 19525 rack->r_ctl.rack_per_of_gp_rec = 90; 19526 err = 0; 19527 19528 } else if (prof == 3) { 19529 /* Same as profile one execept fill_cw becomes 2 (less aggressive set) */ 19530 /* pace_always=1 */ 19531 if (rack->rc_always_pace == 0) { 19532 if (tcp_can_enable_pacing() == 0) 19533 return (EBUSY); 19534 } 19535 rack->rc_always_pace = 1; 19536 if (rack->use_fixed_rate || rack->gp_ready) 19537 rack_set_cc_pacing(rack); 19538 rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; 19539 rack->rack_attempt_hdwr_pace = 0; 19540 /* cmpack=1 */ 19541 if (rack_use_cmp_acks) 19542 rack->r_use_cmp_ack = 1; 19543 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) && 19544 rack->r_use_cmp_ack) 19545 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; 19546 /* scwnd=1 */ 19547 rack->rack_enable_scwnd = 1; 19548 /* dynamic=100 */ 19549 rack->rc_gp_dyn_mul = 1; 19550 /* gp_inc_ca */ 19551 rack->r_ctl.rack_per_of_gp_ca = 100; 19552 /* rrr_conf=3 */ 19553 rack->r_rr_config = 3; 19554 /* npush=2 */ 19555 rack->r_ctl.rc_no_push_at_mrtt = 2; 19556 /* fillcw=2 */ 19557 rack->rc_pace_to_cwnd = 1; 19558 rack->r_fill_less_agg = 1; 19559 rack->rc_pace_fill_if_rttin_range = 0; 19560 rack->rtt_limit_mul = 0; 19561 /* noprr=1 */ 19562 rack->rack_no_prr = 1; 19563 /* lscwnd=1 */ 19564 rack->r_limit_scw = 1; 19565 /* gp_inc_rec */ 19566 rack->r_ctl.rack_per_of_gp_rec = 90; 19567 err = 0; 19568 19569 19570 } else if (prof == 2) { 19571 /* cmpack=1 */ 19572 if (rack->rc_always_pace == 0) { 19573 if 
(tcp_can_enable_pacing() == 0) 19574 return (EBUSY); 19575 } 19576 rack->rc_always_pace = 1; 19577 if (rack->use_fixed_rate || rack->gp_ready) 19578 rack_set_cc_pacing(rack); 19579 rack->r_use_cmp_ack = 1; 19580 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state)) 19581 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; 19582 /* pace_always=1 */ 19583 rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; 19584 /* scwnd=1 */ 19585 rack->rack_enable_scwnd = 1; 19586 /* dynamic=100 */ 19587 rack->rc_gp_dyn_mul = 1; 19588 rack->r_ctl.rack_per_of_gp_ca = 100; 19589 /* rrr_conf=3 */ 19590 rack->r_rr_config = 3; 19591 /* npush=2 */ 19592 rack->r_ctl.rc_no_push_at_mrtt = 2; 19593 /* fillcw=1 */ 19594 rack->rc_pace_to_cwnd = 1; 19595 rack->rc_pace_fill_if_rttin_range = 0; 19596 rack->rtt_limit_mul = 0; 19597 /* noprr=1 */ 19598 rack->rack_no_prr = 1; 19599 /* lscwnd=0 */ 19600 rack->r_limit_scw = 0; 19601 err = 0; 19602 } else if (prof == 0) { 19603 /* This changes things back to the default settings */ 19604 err = 0; 19605 if (rack->rc_always_pace) { 19606 tcp_decrement_paced_conn(); 19607 rack_undo_cc_pacing(rack); 19608 rack->rc_always_pace = 0; 19609 } 19610 if (rack_pace_every_seg && tcp_can_enable_pacing()) { 19611 rack->rc_always_pace = 1; 19612 if (rack->use_fixed_rate || rack->gp_ready) 19613 rack_set_cc_pacing(rack); 19614 } else 19615 rack->rc_always_pace = 0; 19616 if (rack_dsack_std_based & 0x1) { 19617 /* Basically this means all rack timers are at least (srtt + 1/4 srtt) */ 19618 rack->rc_rack_tmr_std_based = 1; 19619 } 19620 if (rack_dsack_std_based & 0x2) { 19621 /* Basically this means rack timers are extended based on dsack by up to (2 * srtt) */ 19622 rack->rc_rack_use_dsack = 1; 19623 } 19624 if (rack_use_cmp_acks) 19625 rack->r_use_cmp_ack = 1; 19626 else 19627 rack->r_use_cmp_ack = 0; 19628 if (rack_disable_prr) 19629 rack->rack_no_prr = 1; 19630 else 19631 rack->rack_no_prr = 0; 19632 if (rack_gp_no_rec_chg) 19633 rack->rc_gp_no_rec_chg = 1; 19634 else 19635 rack->rc_gp_no_rec_chg = 0; 19636 if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack) { 19637 rack->r_mbuf_queue = 1; 19638 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state)) 19639 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; 19640 rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; 19641 } else { 19642 rack->r_mbuf_queue = 0; 19643 rack->rc_inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 19644 } 19645 if (rack_enable_shared_cwnd) 19646 rack->rack_enable_scwnd = 1; 19647 else 19648 rack->rack_enable_scwnd = 0; 19649 if (rack_do_dyn_mul) { 19650 /* When dynamic adjustment is on CA needs to start at 100% */ 19651 rack->rc_gp_dyn_mul = 1; 19652 if (rack_do_dyn_mul >= 100) 19653 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul; 19654 } else { 19655 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca; 19656 rack->rc_gp_dyn_mul = 0; 19657 } 19658 rack->r_rr_config = 0; 19659 rack->r_ctl.rc_no_push_at_mrtt = 0; 19660 rack->rc_pace_to_cwnd = 0; 19661 rack->rc_pace_fill_if_rttin_range = 0; 19662 rack->rtt_limit_mul = 0; 19663 19664 if (rack_enable_hw_pacing) 19665 rack->rack_hdw_pace_ena = 1; 19666 else 19667 rack->rack_hdw_pace_ena = 0; 19668 if (rack_disable_prr) 19669 rack->rack_no_prr = 1; 19670 else 19671 rack->rack_no_prr = 0; 19672 if (rack_limits_scwnd) 19673 rack->r_limit_scw = 1; 19674 else 19675 rack->r_limit_scw = 0; 19676 err = 0; 19677 } 19678 return (err); 19679 } 19680 19681 static int 19682 rack_add_deferred_option(struct tcp_rack *rack, int sopt_name, uint64_t loptval) 19683 { 19684 struct deferred_opt_list *dol; 19685 19686 dol = 
malloc(sizeof(struct deferred_opt_list), 19687 M_TCPFSB, M_NOWAIT|M_ZERO); 19688 if (dol == NULL) { 19689 /* 19690 * No space yikes -- fail out.. 19691 */ 19692 return (0); 19693 } 19694 dol->optname = sopt_name; 19695 dol->optval = loptval; 19696 TAILQ_INSERT_TAIL(&rack->r_ctl.opt_list, dol, next); 19697 return (1); 19698 } 19699 19700 static int 19701 rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, 19702 uint32_t optval, uint64_t loptval) 19703 { 19704 struct epoch_tracker et; 19705 struct sockopt sopt; 19706 struct cc_newreno_opts opt; 19707 uint64_t val; 19708 int error = 0; 19709 uint16_t ca, ss; 19710 19711 switch (sopt_name) { 19712 19713 case TCP_RACK_DSACK_OPT: 19714 RACK_OPTS_INC(tcp_rack_dsack_opt); 19715 if (optval & 0x1) { 19716 rack->rc_rack_tmr_std_based = 1; 19717 } else { 19718 rack->rc_rack_tmr_std_based = 0; 19719 } 19720 if (optval & 0x2) { 19721 rack->rc_rack_use_dsack = 1; 19722 } else { 19723 rack->rc_rack_use_dsack = 0; 19724 } 19725 rack_log_dsack_event(rack, 5, __LINE__, 0, 0); 19726 break; 19727 case TCP_RACK_PACING_BETA: 19728 RACK_OPTS_INC(tcp_rack_beta); 19729 if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) { 19730 /* This only works for newreno. */ 19731 error = EINVAL; 19732 break; 19733 } 19734 if (rack->rc_pacing_cc_set) { 19735 /* 19736 * Set them into the real CC module 19737 * whats in the rack pcb is the old values 19738 * to be used on restoral/ 19739 */ 19740 sopt.sopt_dir = SOPT_SET; 19741 opt.name = CC_NEWRENO_BETA; 19742 opt.val = optval; 19743 if (CC_ALGO(tp)->ctl_output != NULL) 19744 error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt); 19745 else { 19746 error = ENOENT; 19747 break; 19748 } 19749 } else { 19750 /* 19751 * Not pacing yet so set it into our local 19752 * rack pcb storage. 19753 */ 19754 rack->r_ctl.rc_saved_beta.beta = optval; 19755 } 19756 break; 19757 case TCP_RACK_TIMER_SLOP: 19758 RACK_OPTS_INC(tcp_rack_timer_slop); 19759 rack->r_ctl.timer_slop = optval; 19760 if (rack->rc_tp->t_srtt) { 19761 /* 19762 * If we have an SRTT lets update t_rxtcur 19763 * to have the new slop. 19764 */ 19765 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 19766 rack_rto_min, rack_rto_max, 19767 rack->r_ctl.timer_slop); 19768 } 19769 break; 19770 case TCP_RACK_PACING_BETA_ECN: 19771 RACK_OPTS_INC(tcp_rack_beta_ecn); 19772 if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) { 19773 /* This only works for newreno. */ 19774 error = EINVAL; 19775 break; 19776 } 19777 if (rack->rc_pacing_cc_set) { 19778 /* 19779 * Set them into the real CC module 19780 * whats in the rack pcb is the old values 19781 * to be used on restoral/ 19782 */ 19783 sopt.sopt_dir = SOPT_SET; 19784 opt.name = CC_NEWRENO_BETA_ECN; 19785 opt.val = optval; 19786 if (CC_ALGO(tp)->ctl_output != NULL) 19787 error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt); 19788 else 19789 error = ENOENT; 19790 } else { 19791 /* 19792 * Not pacing yet so set it into our local 19793 * rack pcb storage. 
19794 */ 19795 rack->r_ctl.rc_saved_beta.beta_ecn = optval; 19796 rack->r_ctl.rc_saved_beta.newreno_flags = CC_NEWRENO_BETA_ECN_ENABLED; 19797 } 19798 break; 19799 case TCP_DEFER_OPTIONS: 19800 RACK_OPTS_INC(tcp_defer_opt); 19801 if (optval) { 19802 if (rack->gp_ready) { 19803 /* Too late */ 19804 error = EINVAL; 19805 break; 19806 } 19807 rack->defer_options = 1; 19808 } else 19809 rack->defer_options = 0; 19810 break; 19811 case TCP_RACK_MEASURE_CNT: 19812 RACK_OPTS_INC(tcp_rack_measure_cnt); 19813 if (optval && (optval <= 0xff)) { 19814 rack->r_ctl.req_measurements = optval; 19815 } else 19816 error = EINVAL; 19817 break; 19818 case TCP_REC_ABC_VAL: 19819 RACK_OPTS_INC(tcp_rec_abc_val); 19820 if (optval > 0) 19821 rack->r_use_labc_for_rec = 1; 19822 else 19823 rack->r_use_labc_for_rec = 0; 19824 break; 19825 case TCP_RACK_ABC_VAL: 19826 RACK_OPTS_INC(tcp_rack_abc_val); 19827 if ((optval > 0) && (optval < 255)) 19828 rack->rc_labc = optval; 19829 else 19830 error = EINVAL; 19831 break; 19832 case TCP_HDWR_UP_ONLY: 19833 RACK_OPTS_INC(tcp_pacing_up_only); 19834 if (optval) 19835 rack->r_up_only = 1; 19836 else 19837 rack->r_up_only = 0; 19838 break; 19839 case TCP_PACING_RATE_CAP: 19840 RACK_OPTS_INC(tcp_pacing_rate_cap); 19841 rack->r_ctl.bw_rate_cap = loptval; 19842 break; 19843 case TCP_RACK_PROFILE: 19844 RACK_OPTS_INC(tcp_profile); 19845 error = rack_set_profile(rack, optval); 19846 break; 19847 case TCP_USE_CMP_ACKS: 19848 RACK_OPTS_INC(tcp_use_cmp_acks); 19849 if ((optval == 0) && (rack->rc_inp->inp_flags2 & INP_MBUF_ACKCMP)) { 19850 /* You can't turn it off once its on! */ 19851 error = EINVAL; 19852 } else if ((optval == 1) && (rack->r_use_cmp_ack == 0)) { 19853 rack->r_use_cmp_ack = 1; 19854 rack->r_mbuf_queue = 1; 19855 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 19856 } 19857 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) 19858 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; 19859 break; 19860 case TCP_SHARED_CWND_TIME_LIMIT: 19861 RACK_OPTS_INC(tcp_lscwnd); 19862 if (optval) 19863 rack->r_limit_scw = 1; 19864 else 19865 rack->r_limit_scw = 0; 19866 break; 19867 case TCP_RACK_PACE_TO_FILL: 19868 RACK_OPTS_INC(tcp_fillcw); 19869 if (optval == 0) 19870 rack->rc_pace_to_cwnd = 0; 19871 else { 19872 rack->rc_pace_to_cwnd = 1; 19873 if (optval > 1) 19874 rack->r_fill_less_agg = 1; 19875 } 19876 if ((optval >= rack_gp_rtt_maxmul) && 19877 rack_gp_rtt_maxmul && 19878 (optval < 0xf)) { 19879 rack->rc_pace_fill_if_rttin_range = 1; 19880 rack->rtt_limit_mul = optval; 19881 } else { 19882 rack->rc_pace_fill_if_rttin_range = 0; 19883 rack->rtt_limit_mul = 0; 19884 } 19885 break; 19886 case TCP_RACK_NO_PUSH_AT_MAX: 19887 RACK_OPTS_INC(tcp_npush); 19888 if (optval == 0) 19889 rack->r_ctl.rc_no_push_at_mrtt = 0; 19890 else if (optval < 0xff) 19891 rack->r_ctl.rc_no_push_at_mrtt = optval; 19892 else 19893 error = EINVAL; 19894 break; 19895 case TCP_SHARED_CWND_ENABLE: 19896 RACK_OPTS_INC(tcp_rack_scwnd); 19897 if (optval == 0) 19898 rack->rack_enable_scwnd = 0; 19899 else 19900 rack->rack_enable_scwnd = 1; 19901 break; 19902 case TCP_RACK_MBUF_QUEUE: 19903 /* Now do we use the LRO mbuf-queue feature */ 19904 RACK_OPTS_INC(tcp_rack_mbufq); 19905 if (optval || rack->r_use_cmp_ack) 19906 rack->r_mbuf_queue = 1; 19907 else 19908 rack->r_mbuf_queue = 0; 19909 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 19910 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 19911 else 19912 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 19913 break; 19914 case 
TCP_RACK_NONRXT_CFG_RATE: 19915 RACK_OPTS_INC(tcp_rack_cfg_rate); 19916 if (optval == 0) 19917 rack->rack_rec_nonrxt_use_cr = 0; 19918 else 19919 rack->rack_rec_nonrxt_use_cr = 1; 19920 break; 19921 case TCP_NO_PRR: 19922 RACK_OPTS_INC(tcp_rack_noprr); 19923 if (optval == 0) 19924 rack->rack_no_prr = 0; 19925 else if (optval == 1) 19926 rack->rack_no_prr = 1; 19927 else if (optval == 2) 19928 rack->no_prr_addback = 1; 19929 else 19930 error = EINVAL; 19931 break; 19932 case TCP_TIMELY_DYN_ADJ: 19933 RACK_OPTS_INC(tcp_timely_dyn); 19934 if (optval == 0) 19935 rack->rc_gp_dyn_mul = 0; 19936 else { 19937 rack->rc_gp_dyn_mul = 1; 19938 if (optval >= 100) { 19939 /* 19940 * If the user sets something 100 or more 19941 * its the gp_ca value. 19942 */ 19943 rack->r_ctl.rack_per_of_gp_ca = optval; 19944 } 19945 } 19946 break; 19947 case TCP_RACK_DO_DETECTION: 19948 RACK_OPTS_INC(tcp_rack_do_detection); 19949 if (optval == 0) 19950 rack->do_detection = 0; 19951 else 19952 rack->do_detection = 1; 19953 break; 19954 case TCP_RACK_TLP_USE: 19955 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) { 19956 error = EINVAL; 19957 break; 19958 } 19959 RACK_OPTS_INC(tcp_tlp_use); 19960 rack->rack_tlp_threshold_use = optval; 19961 break; 19962 case TCP_RACK_TLP_REDUCE: 19963 /* RACK TLP cwnd reduction (bool) */ 19964 RACK_OPTS_INC(tcp_rack_tlp_reduce); 19965 rack->r_ctl.rc_tlp_cwnd_reduce = optval; 19966 break; 19967 /* Pacing related ones */ 19968 case TCP_RACK_PACE_ALWAYS: 19969 /* 19970 * zero is old rack method, 1 is new 19971 * method using a pacing rate. 19972 */ 19973 RACK_OPTS_INC(tcp_rack_pace_always); 19974 if (optval > 0) { 19975 if (rack->rc_always_pace) { 19976 error = EALREADY; 19977 break; 19978 } else if (tcp_can_enable_pacing()) { 19979 rack->rc_always_pace = 1; 19980 if (rack->use_fixed_rate || rack->gp_ready) 19981 rack_set_cc_pacing(rack); 19982 } 19983 else { 19984 error = ENOSPC; 19985 break; 19986 } 19987 } else { 19988 if (rack->rc_always_pace) { 19989 tcp_decrement_paced_conn(); 19990 rack->rc_always_pace = 0; 19991 rack_undo_cc_pacing(rack); 19992 } 19993 } 19994 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 19995 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 19996 else 19997 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 19998 /* A rate may be set irate or other, if so set seg size */ 19999 rack_update_seg(rack); 20000 break; 20001 case TCP_BBR_RACK_INIT_RATE: 20002 RACK_OPTS_INC(tcp_initial_rate); 20003 val = optval; 20004 /* Change from kbits per second to bytes per second */ 20005 val *= 1000; 20006 val /= 8; 20007 rack->r_ctl.init_rate = val; 20008 if (rack->rc_init_win != rack_default_init_window) { 20009 uint32_t win, snt; 20010 20011 /* 20012 * Options don't always get applied 20013 * in the order you think. So in order 20014 * to assure we update a cwnd we need 20015 * to check and see if we are still 20016 * where we should raise the cwnd. 
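			 * Concretely: only raise snd_cwnd to the configured
			 * initial window if less than one initial window of
			 * data has been sent and the current cwnd is still
			 * below it.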
20017 */ 20018 win = rc_init_window(rack); 20019 if (SEQ_GT(tp->snd_max, tp->iss)) 20020 snt = tp->snd_max - tp->iss; 20021 else 20022 snt = 0; 20023 if ((snt < win) && 20024 (tp->snd_cwnd < win)) 20025 tp->snd_cwnd = win; 20026 } 20027 if (rack->rc_always_pace) 20028 rack_update_seg(rack); 20029 break; 20030 case TCP_BBR_IWINTSO: 20031 RACK_OPTS_INC(tcp_initial_win); 20032 if (optval && (optval <= 0xff)) { 20033 uint32_t win, snt; 20034 20035 rack->rc_init_win = optval; 20036 win = rc_init_window(rack); 20037 if (SEQ_GT(tp->snd_max, tp->iss)) 20038 snt = tp->snd_max - tp->iss; 20039 else 20040 snt = 0; 20041 if ((snt < win) && 20042 (tp->t_srtt | 20043 #ifdef NETFLIX_PEAKRATE 20044 tp->t_maxpeakrate | 20045 #endif 20046 rack->r_ctl.init_rate)) { 20047 /* 20048 * We are not past the initial window 20049 * and we have some bases for pacing, 20050 * so we need to possibly adjust up 20051 * the cwnd. Note even if we don't set 20052 * the cwnd, its still ok to raise the rc_init_win 20053 * which can be used coming out of idle when we 20054 * would have a rate. 20055 */ 20056 if (tp->snd_cwnd < win) 20057 tp->snd_cwnd = win; 20058 } 20059 if (rack->rc_always_pace) 20060 rack_update_seg(rack); 20061 } else 20062 error = EINVAL; 20063 break; 20064 case TCP_RACK_FORCE_MSEG: 20065 RACK_OPTS_INC(tcp_rack_force_max_seg); 20066 if (optval) 20067 rack->rc_force_max_seg = 1; 20068 else 20069 rack->rc_force_max_seg = 0; 20070 break; 20071 case TCP_RACK_PACE_MAX_SEG: 20072 /* Max segments size in a pace in bytes */ 20073 RACK_OPTS_INC(tcp_rack_max_seg); 20074 rack->rc_user_set_max_segs = optval; 20075 rack_set_pace_segments(tp, rack, __LINE__, NULL); 20076 break; 20077 case TCP_RACK_PACE_RATE_REC: 20078 /* Set the fixed pacing rate in Bytes per second ca */ 20079 RACK_OPTS_INC(tcp_rack_pace_rate_rec); 20080 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 20081 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) 20082 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 20083 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) 20084 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 20085 rack->use_fixed_rate = 1; 20086 if (rack->rc_always_pace) 20087 rack_set_cc_pacing(rack); 20088 rack_log_pacing_delay_calc(rack, 20089 rack->r_ctl.rc_fixed_pacing_rate_ss, 20090 rack->r_ctl.rc_fixed_pacing_rate_ca, 20091 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 20092 __LINE__, NULL,0); 20093 break; 20094 20095 case TCP_RACK_PACE_RATE_SS: 20096 /* Set the fixed pacing rate in Bytes per second ca */ 20097 RACK_OPTS_INC(tcp_rack_pace_rate_ss); 20098 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 20099 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) 20100 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 20101 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) 20102 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 20103 rack->use_fixed_rate = 1; 20104 if (rack->rc_always_pace) 20105 rack_set_cc_pacing(rack); 20106 rack_log_pacing_delay_calc(rack, 20107 rack->r_ctl.rc_fixed_pacing_rate_ss, 20108 rack->r_ctl.rc_fixed_pacing_rate_ca, 20109 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 20110 __LINE__, NULL, 0); 20111 break; 20112 20113 case TCP_RACK_PACE_RATE_CA: 20114 /* Set the fixed pacing rate in Bytes per second ca */ 20115 RACK_OPTS_INC(tcp_rack_pace_rate_ca); 20116 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 20117 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) 20118 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 20119 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) 20120 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 20121 rack->use_fixed_rate = 1; 20122 
if (rack->rc_always_pace) 20123 rack_set_cc_pacing(rack); 20124 rack_log_pacing_delay_calc(rack, 20125 rack->r_ctl.rc_fixed_pacing_rate_ss, 20126 rack->r_ctl.rc_fixed_pacing_rate_ca, 20127 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 20128 __LINE__, NULL, 0); 20129 break; 20130 case TCP_RACK_GP_INCREASE_REC: 20131 RACK_OPTS_INC(tcp_gp_inc_rec); 20132 rack->r_ctl.rack_per_of_gp_rec = optval; 20133 rack_log_pacing_delay_calc(rack, 20134 rack->r_ctl.rack_per_of_gp_ss, 20135 rack->r_ctl.rack_per_of_gp_ca, 20136 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 20137 __LINE__, NULL, 0); 20138 break; 20139 case TCP_RACK_GP_INCREASE_CA: 20140 RACK_OPTS_INC(tcp_gp_inc_ca); 20141 ca = optval; 20142 if (ca < 100) { 20143 /* 20144 * We don't allow any reduction 20145 * over the GP b/w. 20146 */ 20147 error = EINVAL; 20148 break; 20149 } 20150 rack->r_ctl.rack_per_of_gp_ca = ca; 20151 rack_log_pacing_delay_calc(rack, 20152 rack->r_ctl.rack_per_of_gp_ss, 20153 rack->r_ctl.rack_per_of_gp_ca, 20154 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 20155 __LINE__, NULL, 0); 20156 break; 20157 case TCP_RACK_GP_INCREASE_SS: 20158 RACK_OPTS_INC(tcp_gp_inc_ss); 20159 ss = optval; 20160 if (ss < 100) { 20161 /* 20162 * We don't allow any reduction 20163 * over the GP b/w. 20164 */ 20165 error = EINVAL; 20166 break; 20167 } 20168 rack->r_ctl.rack_per_of_gp_ss = ss; 20169 rack_log_pacing_delay_calc(rack, 20170 rack->r_ctl.rack_per_of_gp_ss, 20171 rack->r_ctl.rack_per_of_gp_ca, 20172 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 20173 __LINE__, NULL, 0); 20174 break; 20175 case TCP_RACK_RR_CONF: 20176 RACK_OPTS_INC(tcp_rack_rrr_no_conf_rate); 20177 if (optval && optval <= 3) 20178 rack->r_rr_config = optval; 20179 else 20180 rack->r_rr_config = 0; 20181 break; 20182 case TCP_HDWR_RATE_CAP: 20183 RACK_OPTS_INC(tcp_hdwr_rate_cap); 20184 if (optval) { 20185 if (rack->r_rack_hw_rate_caps == 0) 20186 rack->r_rack_hw_rate_caps = 1; 20187 else 20188 error = EALREADY; 20189 } else { 20190 rack->r_rack_hw_rate_caps = 0; 20191 } 20192 break; 20193 case TCP_BBR_HDWR_PACE: 20194 RACK_OPTS_INC(tcp_hdwr_pacing); 20195 if (optval){ 20196 if (rack->rack_hdrw_pacing == 0) { 20197 rack->rack_hdw_pace_ena = 1; 20198 rack->rack_attempt_hdwr_pace = 0; 20199 } else 20200 error = EALREADY; 20201 } else { 20202 rack->rack_hdw_pace_ena = 0; 20203 #ifdef RATELIMIT 20204 if (rack->r_ctl.crte != NULL) { 20205 rack->rack_hdrw_pacing = 0; 20206 rack->rack_attempt_hdwr_pace = 0; 20207 tcp_rel_pacing_rate(rack->r_ctl.crte, tp); 20208 rack->r_ctl.crte = NULL; 20209 } 20210 #endif 20211 } 20212 break; 20213 /* End Pacing related ones */ 20214 case TCP_RACK_PRR_SENDALOT: 20215 /* Allow PRR to send more than one seg */ 20216 RACK_OPTS_INC(tcp_rack_prr_sendalot); 20217 rack->r_ctl.rc_prr_sendalot = optval; 20218 break; 20219 case TCP_RACK_MIN_TO: 20220 /* Minimum time between rack t-o's in ms */ 20221 RACK_OPTS_INC(tcp_rack_min_to); 20222 rack->r_ctl.rc_min_to = optval; 20223 break; 20224 case TCP_RACK_EARLY_SEG: 20225 /* If early recovery max segments */ 20226 RACK_OPTS_INC(tcp_rack_early_seg); 20227 rack->r_ctl.rc_early_recovery_segs = optval; 20228 break; 20229 case TCP_RACK_ENABLE_HYSTART: 20230 { 20231 struct sockopt sopt; 20232 struct cc_newreno_opts opt; 20233 20234 sopt.sopt_valsize = sizeof(struct cc_newreno_opts); 20235 sopt.sopt_dir = SOPT_SET; 20236 opt.name = CC_NEWRENO_ENABLE_HYSTART; 20237 opt.val = optval; 20238 if (CC_ALGO(tp)->ctl_output != NULL) 20239 error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt); 20240 else 20241 error = EINVAL; 20242 } 20243 
		break;
	case TCP_RACK_REORD_THRESH:
		/* RACK reorder threshold (shift amount) */
		RACK_OPTS_INC(tcp_rack_reord_thresh);
		if ((optval > 0) && (optval < 31))
			rack->r_ctl.rc_reorder_shift = optval;
		else
			error = EINVAL;
		break;
	case TCP_RACK_REORD_FADE:
		/* Does reordering fade after ms time */
		RACK_OPTS_INC(tcp_rack_reord_fade);
		rack->r_ctl.rc_reorder_fade = optval;
		break;
	case TCP_RACK_TLP_THRESH:
		/* RACK TLP threshold i.e. srtt+(srtt/N) */
		RACK_OPTS_INC(tcp_rack_tlp_thresh);
		if (optval)
			rack->r_ctl.rc_tlp_threshold = optval;
		else
			error = EINVAL;
		break;
	case TCP_BBR_USE_RACK_RR:
		RACK_OPTS_INC(tcp_rack_rr);
		if (optval)
			rack->use_rack_rr = 1;
		else
			rack->use_rack_rr = 0;
		break;
	case TCP_FAST_RSM_HACK:
		RACK_OPTS_INC(tcp_rack_fastrsm_hack);
		if (optval)
			rack->fast_rsm_hack = 1;
		else
			rack->fast_rsm_hack = 0;
		break;
	case TCP_RACK_PKT_DELAY:
		/* RACK added ms i.e. rack-rtt + reord + N */
		RACK_OPTS_INC(tcp_rack_pkt_delay);
		rack->r_ctl.rc_pkt_delay = optval;
		break;
	case TCP_DELACK:
		RACK_OPTS_INC(tcp_rack_delayed_ack);
		if (optval == 0)
			tp->t_delayed_ack = 0;
		else
			tp->t_delayed_ack = 1;
		if (tp->t_flags & TF_DELACK) {
			tp->t_flags &= ~TF_DELACK;
			tp->t_flags |= TF_ACKNOW;
			NET_EPOCH_ENTER(et);
			rack_output(tp);
			NET_EPOCH_EXIT(et);
		}
		break;

	case TCP_BBR_RACK_RTT_USE:
		RACK_OPTS_INC(tcp_rack_rtt_use);
		if ((optval != USE_RTT_HIGH) &&
		    (optval != USE_RTT_LOW) &&
		    (optval != USE_RTT_AVG))
			error = EINVAL;
		else
			rack->r_ctl.rc_rate_sample_method = optval;
		break;
	case TCP_DATA_AFTER_CLOSE:
		RACK_OPTS_INC(tcp_data_after_close);
		if (optval)
			rack->rc_allow_data_af_clo = 1;
		else
			rack->rc_allow_data_af_clo = 0;
		break;
	default:
		break;
	}
#ifdef NETFLIX_STATS
	tcp_log_socket_option(tp, sopt_name, optval, error);
#endif
	return (error);
}


static void
rack_apply_deferred_options(struct tcp_rack *rack)
{
	struct deferred_opt_list *dol, *sdol;
	uint32_t s_optval;

	TAILQ_FOREACH_SAFE(dol, &rack->r_ctl.opt_list, next, sdol) {
		TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next);
		/* Disadvantage of deferral is you lose the error return */
		s_optval = (uint32_t)dol->optval;
		(void)rack_process_option(rack->rc_tp, rack, dol->optname, s_optval, dol->optval);
		free(dol, M_TCPDO);
	}
}

static void
rack_hw_tls_change(struct tcpcb *tp, int chg)
{
	/*
	 * HW TLS state has changed; fix all
	 * rsm's in flight.
20346 */ 20347 struct tcp_rack *rack; 20348 struct rack_sendmap *rsm; 20349 20350 rack = (struct tcp_rack *)tp->t_fb_ptr; 20351 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 20352 if (chg) 20353 rsm->r_hw_tls = 1; 20354 else 20355 rsm->r_hw_tls = 0; 20356 } 20357 if (chg) 20358 rack->r_ctl.fsb.hw_tls = 1; 20359 else 20360 rack->r_ctl.fsb.hw_tls = 0; 20361 } 20362 20363 static int 20364 rack_pru_options(struct tcpcb *tp, int flags) 20365 { 20366 if (flags & PRUS_OOB) 20367 return (EOPNOTSUPP); 20368 return (0); 20369 } 20370 20371 static struct tcp_function_block __tcp_rack = { 20372 .tfb_tcp_block_name = __XSTRING(STACKNAME), 20373 .tfb_tcp_output = rack_output, 20374 .tfb_do_queued_segments = ctf_do_queued_segments, 20375 .tfb_do_segment_nounlock = rack_do_segment_nounlock, 20376 .tfb_tcp_do_segment = rack_do_segment, 20377 .tfb_tcp_ctloutput = rack_ctloutput, 20378 .tfb_tcp_fb_init = rack_init, 20379 .tfb_tcp_fb_fini = rack_fini, 20380 .tfb_tcp_timer_stop_all = rack_stopall, 20381 .tfb_tcp_timer_activate = rack_timer_activate, 20382 .tfb_tcp_timer_active = rack_timer_active, 20383 .tfb_tcp_timer_stop = rack_timer_stop, 20384 .tfb_tcp_rexmit_tmr = rack_remxt_tmr, 20385 .tfb_tcp_handoff_ok = rack_handoff_ok, 20386 .tfb_tcp_mtu_chg = rack_mtu_change, 20387 .tfb_pru_options = rack_pru_options, 20388 .tfb_hwtls_change = rack_hw_tls_change, 20389 .tfb_flags = TCP_FUNC_OUTPUT_CANDROP, 20390 }; 20391 20392 /* 20393 * rack_ctloutput() must drop the inpcb lock before performing copyin on 20394 * socket option arguments. When it re-acquires the lock after the copy, it 20395 * has to revalidate that the connection is still valid for the socket 20396 * option. 20397 */ 20398 static int 20399 rack_set_sockopt(struct socket *so, struct sockopt *sopt, 20400 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) 20401 { 20402 #ifdef INET6 20403 struct ip6_hdr *ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 20404 #endif 20405 #ifdef INET 20406 struct ip *ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 20407 #endif 20408 uint64_t loptval; 20409 int32_t error = 0, optval; 20410 20411 switch (sopt->sopt_level) { 20412 #ifdef INET6 20413 case IPPROTO_IPV6: 20414 MPASS(inp->inp_vflag & INP_IPV6PROTO); 20415 switch (sopt->sopt_name) { 20416 case IPV6_USE_MIN_MTU: 20417 tcp6_use_min_mtu(tp); 20418 break; 20419 case IPV6_TCLASS: 20420 /* 20421 * The DSCP codepoint has changed, update the fsb. 20422 */ 20423 ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | 20424 (rack->rc_inp->inp_flow & IPV6_FLOWINFO_MASK); 20425 break; 20426 } 20427 INP_WUNLOCK(inp); 20428 return (0); 20429 #endif 20430 #ifdef INET 20431 case IPPROTO_IP: 20432 switch (sopt->sopt_name) { 20433 case IP_TOS: 20434 /* 20435 * The DSCP codepoint has changed, update the fsb. 20436 */ 20437 ip->ip_tos = rack->rc_inp->inp_ip_tos; 20438 break; 20439 case IP_TTL: 20440 /* 20441 * The TTL has changed, update the fsb. 
20442 */ 20443 ip->ip_ttl = rack->rc_inp->inp_ip_ttl; 20444 break; 20445 } 20446 INP_WUNLOCK(inp); 20447 return (0); 20448 #endif 20449 } 20450 20451 switch (sopt->sopt_name) { 20452 case TCP_RACK_TLP_REDUCE: /* URL:tlp_reduce */ 20453 /* Pacing related ones */ 20454 case TCP_RACK_PACE_ALWAYS: /* URL:pace_always */ 20455 case TCP_BBR_RACK_INIT_RATE: /* URL:irate */ 20456 case TCP_BBR_IWINTSO: /* URL:tso_iwin */ 20457 case TCP_RACK_PACE_MAX_SEG: /* URL:pace_max_seg */ 20458 case TCP_RACK_FORCE_MSEG: /* URL:force_max_seg */ 20459 case TCP_RACK_PACE_RATE_CA: /* URL:pr_ca */ 20460 case TCP_RACK_PACE_RATE_SS: /* URL:pr_ss*/ 20461 case TCP_RACK_PACE_RATE_REC: /* URL:pr_rec */ 20462 case TCP_RACK_GP_INCREASE_CA: /* URL:gp_inc_ca */ 20463 case TCP_RACK_GP_INCREASE_SS: /* URL:gp_inc_ss */ 20464 case TCP_RACK_GP_INCREASE_REC: /* URL:gp_inc_rec */ 20465 case TCP_RACK_RR_CONF: /* URL:rrr_conf */ 20466 case TCP_BBR_HDWR_PACE: /* URL:hdwrpace */ 20467 case TCP_HDWR_RATE_CAP: /* URL:hdwrcap boolean */ 20468 case TCP_PACING_RATE_CAP: /* URL:cap -- used by side-channel */ 20469 case TCP_HDWR_UP_ONLY: /* URL:uponly -- hardware pacing boolean */ 20470 /* End pacing related */ 20471 case TCP_FAST_RSM_HACK: /* URL:frsm_hack */ 20472 case TCP_DELACK: /* URL:delack (in base TCP i.e. tcp_hints along with cc etc ) */ 20473 case TCP_RACK_PRR_SENDALOT: /* URL:prr_sendalot */ 20474 case TCP_RACK_MIN_TO: /* URL:min_to */ 20475 case TCP_RACK_EARLY_SEG: /* URL:early_seg */ 20476 case TCP_RACK_REORD_THRESH: /* URL:reord_thresh */ 20477 case TCP_RACK_REORD_FADE: /* URL:reord_fade */ 20478 case TCP_RACK_TLP_THRESH: /* URL:tlp_thresh */ 20479 case TCP_RACK_PKT_DELAY: /* URL:pkt_delay */ 20480 case TCP_RACK_TLP_USE: /* URL:tlp_use */ 20481 case TCP_BBR_RACK_RTT_USE: /* URL:rttuse */ 20482 case TCP_BBR_USE_RACK_RR: /* URL:rackrr */ 20483 case TCP_RACK_DO_DETECTION: /* URL:detect */ 20484 case TCP_NO_PRR: /* URL:noprr */ 20485 case TCP_TIMELY_DYN_ADJ: /* URL:dynamic */ 20486 case TCP_DATA_AFTER_CLOSE: /* no URL */ 20487 case TCP_RACK_NONRXT_CFG_RATE: /* URL:nonrxtcr */ 20488 case TCP_SHARED_CWND_ENABLE: /* URL:scwnd */ 20489 case TCP_RACK_MBUF_QUEUE: /* URL:mqueue */ 20490 case TCP_RACK_NO_PUSH_AT_MAX: /* URL:npush */ 20491 case TCP_RACK_PACE_TO_FILL: /* URL:fillcw */ 20492 case TCP_SHARED_CWND_TIME_LIMIT: /* URL:lscwnd */ 20493 case TCP_RACK_PROFILE: /* URL:profile */ 20494 case TCP_USE_CMP_ACKS: /* URL:cmpack */ 20495 case TCP_RACK_ABC_VAL: /* URL:labc */ 20496 case TCP_REC_ABC_VAL: /* URL:reclabc */ 20497 case TCP_RACK_MEASURE_CNT: /* URL:measurecnt */ 20498 case TCP_DEFER_OPTIONS: /* URL:defer */ 20499 case TCP_RACK_DSACK_OPT: /* URL:dsack */ 20500 case TCP_RACK_PACING_BETA: /* URL:pacing_beta */ 20501 case TCP_RACK_PACING_BETA_ECN: /* URL:pacing_beta_ecn */ 20502 case TCP_RACK_TIMER_SLOP: /* URL:timer_slop */ 20503 case TCP_RACK_ENABLE_HYSTART: /* URL:hystart */ 20504 break; 20505 default: 20506 /* Filter off all unknown options to the base stack */ 20507 return (tcp_default_ctloutput(so, sopt, inp, tp)); 20508 break; 20509 } 20510 INP_WUNLOCK(inp); 20511 if (sopt->sopt_name == TCP_PACING_RATE_CAP) { 20512 error = sooptcopyin(sopt, &loptval, sizeof(loptval), sizeof(loptval)); 20513 /* 20514 * We truncate it down to 32 bits for the socket-option trace this 20515 * means rates > 34Gbps won't show right, but thats probably ok. 
20516 */ 20517 optval = (uint32_t)loptval; 20518 } else { 20519 error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); 20520 /* Save it in 64 bit form too */ 20521 loptval = optval; 20522 } 20523 if (error) 20524 return (error); 20525 INP_WLOCK(inp); 20526 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 20527 INP_WUNLOCK(inp); 20528 return (ECONNRESET); 20529 } 20530 if (tp->t_fb != &__tcp_rack) { 20531 INP_WUNLOCK(inp); 20532 return (ENOPROTOOPT); 20533 } 20534 if (rack->defer_options && (rack->gp_ready == 0) && 20535 (sopt->sopt_name != TCP_DEFER_OPTIONS) && 20536 (sopt->sopt_name != TCP_RACK_PACING_BETA) && 20537 (sopt->sopt_name != TCP_RACK_PACING_BETA_ECN) && 20538 (sopt->sopt_name != TCP_RACK_MEASURE_CNT)) { 20539 /* Options are beind deferred */ 20540 if (rack_add_deferred_option(rack, sopt->sopt_name, loptval)) { 20541 INP_WUNLOCK(inp); 20542 return (0); 20543 } else { 20544 /* No memory to defer, fail */ 20545 INP_WUNLOCK(inp); 20546 return (ENOMEM); 20547 } 20548 } 20549 error = rack_process_option(tp, rack, sopt->sopt_name, optval, loptval); 20550 INP_WUNLOCK(inp); 20551 return (error); 20552 } 20553 20554 static void 20555 rack_fill_info(struct tcpcb *tp, struct tcp_info *ti) 20556 { 20557 20558 INP_WLOCK_ASSERT(tp->t_inpcb); 20559 bzero(ti, sizeof(*ti)); 20560 20561 ti->tcpi_state = tp->t_state; 20562 if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP)) 20563 ti->tcpi_options |= TCPI_OPT_TIMESTAMPS; 20564 if (tp->t_flags & TF_SACK_PERMIT) 20565 ti->tcpi_options |= TCPI_OPT_SACK; 20566 if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) { 20567 ti->tcpi_options |= TCPI_OPT_WSCALE; 20568 ti->tcpi_snd_wscale = tp->snd_scale; 20569 ti->tcpi_rcv_wscale = tp->rcv_scale; 20570 } 20571 if (tp->t_flags2 & TF2_ECN_PERMIT) 20572 ti->tcpi_options |= TCPI_OPT_ECN; 20573 if (tp->t_flags & TF_FASTOPEN) 20574 ti->tcpi_options |= TCPI_OPT_TFO; 20575 /* still kept in ticks is t_rcvtime */ 20576 ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick; 20577 /* Since we hold everything in precise useconds this is easy */ 20578 ti->tcpi_rtt = tp->t_srtt; 20579 ti->tcpi_rttvar = tp->t_rttvar; 20580 ti->tcpi_rto = tp->t_rxtcur; 20581 ti->tcpi_snd_ssthresh = tp->snd_ssthresh; 20582 ti->tcpi_snd_cwnd = tp->snd_cwnd; 20583 /* 20584 * FreeBSD-specific extension fields for tcp_info. 20585 */ 20586 ti->tcpi_rcv_space = tp->rcv_wnd; 20587 ti->tcpi_rcv_nxt = tp->rcv_nxt; 20588 ti->tcpi_snd_wnd = tp->snd_wnd; 20589 ti->tcpi_snd_bwnd = 0; /* Unused, kept for compat. 
 */
	ti->tcpi_snd_nxt = tp->snd_nxt;
	ti->tcpi_snd_mss = tp->t_maxseg;
	ti->tcpi_rcv_mss = tp->t_maxseg;
	ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
	ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
	ti->tcpi_snd_zerowin = tp->t_sndzerowin;
#ifdef NETFLIX_STATS
	ti->tcpi_total_tlp = tp->t_sndtlppack;
	ti->tcpi_total_tlp_bytes = tp->t_sndtlpbyte;
	memcpy(&ti->tcpi_rxsyninfo, &tp->t_rxsyninfo, sizeof(struct tcpsyninfo));
#endif
#ifdef TCP_OFFLOAD
	if (tp->t_flags & TF_TOE) {
		ti->tcpi_options |= TCPI_OPT_TOE;
		tcp_offload_tcp_info(tp, ti);
	}
#endif
}

static int
rack_get_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack)
{
	int32_t error, optval;
	uint64_t val, loptval;
	struct tcp_info ti;
	/*
	 * Because all our options are either boolean or an int, we can just
	 * pull everything into optval and then unlock and copy. If we ever
	 * add an option that is not an int, then this will have quite an
	 * impact on this routine.
	 */
	error = 0;
	switch (sopt->sopt_name) {
	case TCP_INFO:
		/* First get the info filled */
		rack_fill_info(tp, &ti);
		/* Fix up the rtt related fields if needed */
		INP_WUNLOCK(inp);
		error = sooptcopyout(sopt, &ti, sizeof ti);
		return (error);
	/*
	 * Beta is the congestion control value for NewReno that influences how
	 * much of a backoff happens when loss is detected. It is normally set
	 * to 50 for 50% i.e. the cwnd is reduced to 50% of its previous value
	 * when you exit recovery.
	 */
	case TCP_RACK_PACING_BETA:
		if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0)
			error = EINVAL;
		else if (rack->rc_pacing_cc_set == 0)
			optval = rack->r_ctl.rc_saved_beta.beta;
		else {
			/*
			 * Reach out into the CC data and report back what
			 * I have previously set. Yeah it looks hackish but
			 * we don't want to report the saved values.
			 */
			if (tp->ccv->cc_data)
				optval = ((struct newreno *)tp->ccv->cc_data)->beta;
			else
				error = EINVAL;
		}
		break;
	/*
	 * Beta_ecn is the congestion control value for NewReno that influences how
	 * much of a backoff happens when an ECN mark is detected. It is normally set
	 * to 80 for 80% i.e. the cwnd is reduced by 20% of its previous value when
	 * you exit recovery. Note that classic ECN has a beta of 50, it is only
	 * ABE ECN that uses this "less" value, but we do too with pacing :)
	 */

	case TCP_RACK_PACING_BETA_ECN:
		if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0)
			error = EINVAL;
		else if (rack->rc_pacing_cc_set == 0)
			optval = rack->r_ctl.rc_saved_beta.beta_ecn;
		else {
			/*
			 * Reach out into the CC data and report back what
			 * I have previously set. Yeah it looks hackish but
			 * we don't want to report the saved values.
			 */
			if (tp->ccv->cc_data)
				optval = ((struct newreno *)tp->ccv->cc_data)->beta_ecn;
			else
				error = EINVAL;
		}
		break;
	case TCP_RACK_DSACK_OPT:
		optval = 0;
		if (rack->rc_rack_tmr_std_based) {
			optval |= 1;
		}
		if (rack->rc_rack_use_dsack) {
			optval |= 2;
		}
		break;
	case TCP_RACK_ENABLE_HYSTART:
	{
		struct sockopt sopt;
		struct cc_newreno_opts opt;

		sopt.sopt_valsize = sizeof(struct cc_newreno_opts);
		sopt.sopt_dir = SOPT_GET;
		opt.name = CC_NEWRENO_ENABLE_HYSTART;
		if (CC_ALGO(tp)->ctl_output != NULL)
			error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
		else
			error = EINVAL;
		optval = opt.val;
	}
		break;
	case TCP_FAST_RSM_HACK:
		optval = rack->fast_rsm_hack;
		break;
	case TCP_DEFER_OPTIONS:
		optval = rack->defer_options;
		break;
	case TCP_RACK_MEASURE_CNT:
		optval = rack->r_ctl.req_measurements;
		break;
	case TCP_REC_ABC_VAL:
		optval = rack->r_use_labc_for_rec;
		break;
	case TCP_RACK_ABC_VAL:
		optval = rack->rc_labc;
		break;
	case TCP_HDWR_UP_ONLY:
		optval = rack->r_up_only;
		break;
	case TCP_PACING_RATE_CAP:
		loptval = rack->r_ctl.bw_rate_cap;
		break;
	case TCP_RACK_PROFILE:
		/* You cannot retrieve a profile, it is write only */
		error = EINVAL;
		break;
	case TCP_USE_CMP_ACKS:
		optval = rack->r_use_cmp_ack;
		break;
	case TCP_RACK_PACE_TO_FILL:
		optval = rack->rc_pace_to_cwnd;
		if (optval && rack->r_fill_less_agg)
			optval++;
		break;
	case TCP_RACK_NO_PUSH_AT_MAX:
		optval = rack->r_ctl.rc_no_push_at_mrtt;
		break;
	case TCP_SHARED_CWND_ENABLE:
		optval = rack->rack_enable_scwnd;
		break;
	case TCP_RACK_NONRXT_CFG_RATE:
		optval = rack->rack_rec_nonrxt_use_cr;
		break;
	case TCP_NO_PRR:
		if (rack->rack_no_prr == 1)
			optval = 1;
		else if (rack->no_prr_addback == 1)
			optval = 2;
		else
			optval = 0;
		break;
	case TCP_RACK_DO_DETECTION:
		optval = rack->do_detection;
		break;
	case TCP_RACK_MBUF_QUEUE:
		/* Now do we use the LRO mbuf-queue feature */
		optval = rack->r_mbuf_queue;
		break;
	case TCP_TIMELY_DYN_ADJ:
		optval = rack->rc_gp_dyn_mul;
		break;
	case TCP_BBR_IWINTSO:
		optval = rack->rc_init_win;
		break;
	case TCP_RACK_TLP_REDUCE:
		/* RACK TLP cwnd reduction (bool) */
		optval = rack->r_ctl.rc_tlp_cwnd_reduce;
		break;
	case TCP_BBR_RACK_INIT_RATE:
		val = rack->r_ctl.init_rate;
		/* convert to kbits per sec */
		val *= 8;
		val /= 1000;
		optval = (uint32_t)val;
		break;
	case TCP_RACK_FORCE_MSEG:
		optval = rack->rc_force_max_seg;
		break;
	case TCP_RACK_PACE_MAX_SEG:
		/* Max segments in a pace */
		optval = rack->rc_user_set_max_segs;
		break;
	case TCP_RACK_PACE_ALWAYS:
		/* Use the always pace method */
		optval = rack->rc_always_pace;
		break;
	case TCP_RACK_PRR_SENDALOT:
		/* Allow PRR to send more than one seg */
		optval = rack->r_ctl.rc_prr_sendalot;
		break;
	case TCP_RACK_MIN_TO:
		/* Minimum time between rack t-o's in ms */
		optval = rack->r_ctl.rc_min_to;
		break;
	case TCP_RACK_EARLY_SEG:
		/* If early recovery max segments */
		optval = rack->r_ctl.rc_early_recovery_segs;
		break;
	case TCP_RACK_REORD_THRESH:
		/* RACK reorder threshold (shift amount) */
		optval = rack->r_ctl.rc_reorder_shift;
		break;
	case TCP_RACK_REORD_FADE:
		/* Does reordering fade after ms time */
		optval = rack->r_ctl.rc_reorder_fade;
		break;
	case TCP_BBR_USE_RACK_RR:
		/* Do we use the rack cheat for rxt */
		optval = rack->use_rack_rr;
		break;
	case TCP_RACK_RR_CONF:
		optval = rack->r_rr_config;
		break;
	case TCP_HDWR_RATE_CAP:
		optval = rack->r_rack_hw_rate_caps;
		break;
	case TCP_BBR_HDWR_PACE:
		optval = rack->rack_hdw_pace_ena;
		break;
	case TCP_RACK_TLP_THRESH:
		/* RACK TLP threshold i.e. srtt+(srtt/N) */
		optval = rack->r_ctl.rc_tlp_threshold;
		break;
	case TCP_RACK_PKT_DELAY:
		/* RACK added ms i.e. rack-rtt + reord + N */
		optval = rack->r_ctl.rc_pkt_delay;
		break;
	case TCP_RACK_TLP_USE:
		optval = rack->rack_tlp_threshold_use;
		break;
	case TCP_RACK_PACE_RATE_CA:
		optval = rack->r_ctl.rc_fixed_pacing_rate_ca;
		break;
	case TCP_RACK_PACE_RATE_SS:
		optval = rack->r_ctl.rc_fixed_pacing_rate_ss;
		break;
	case TCP_RACK_PACE_RATE_REC:
		optval = rack->r_ctl.rc_fixed_pacing_rate_rec;
		break;
	case TCP_RACK_GP_INCREASE_SS:
		optval = rack->r_ctl.rack_per_of_gp_ss;
		break;
	case TCP_RACK_GP_INCREASE_CA:
		optval = rack->r_ctl.rack_per_of_gp_ca;
		break;
	case TCP_BBR_RACK_RTT_USE:
		optval = rack->r_ctl.rc_rate_sample_method;
		break;
	case TCP_DELACK:
		optval = tp->t_delayed_ack;
		break;
	case TCP_DATA_AFTER_CLOSE:
		optval = rack->rc_allow_data_af_clo;
		break;
	case TCP_SHARED_CWND_TIME_LIMIT:
		optval = rack->r_limit_scw;
		break;
	case TCP_RACK_TIMER_SLOP:
		optval = rack->r_ctl.timer_slop;
		break;
	default:
		return (tcp_default_ctloutput(so, sopt, inp, tp));
		break;
	}
	INP_WUNLOCK(inp);
	if (error == 0) {
		if (sopt->sopt_name == TCP_PACING_RATE_CAP)
			error = sooptcopyout(sopt, &loptval, sizeof loptval);
		else
			error = sooptcopyout(sopt, &optval, sizeof optval);
	}
	return (error);
}

static int
rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
{
	int32_t error = EINVAL;
	struct tcp_rack *rack;

	rack = (struct tcp_rack *)tp->t_fb_ptr;
	if (rack == NULL) {
		/* Huh?
*/ 20885 goto out; 20886 } 20887 if (sopt->sopt_dir == SOPT_SET) { 20888 return (rack_set_sockopt(so, sopt, inp, tp, rack)); 20889 } else if (sopt->sopt_dir == SOPT_GET) { 20890 return (rack_get_sockopt(so, sopt, inp, tp, rack)); 20891 } 20892 out: 20893 INP_WUNLOCK(inp); 20894 return (error); 20895 } 20896 20897 static const char *rack_stack_names[] = { 20898 __XSTRING(STACKNAME), 20899 #ifdef STACKALIAS 20900 __XSTRING(STACKALIAS), 20901 #endif 20902 }; 20903 20904 static int 20905 rack_ctor(void *mem, int32_t size, void *arg, int32_t how) 20906 { 20907 memset(mem, 0, size); 20908 return (0); 20909 } 20910 20911 static void 20912 rack_dtor(void *mem, int32_t size, void *arg) 20913 { 20914 20915 } 20916 20917 static bool rack_mod_inited = false; 20918 20919 static int 20920 tcp_addrack(module_t mod, int32_t type, void *data) 20921 { 20922 int32_t err = 0; 20923 int num_stacks; 20924 20925 switch (type) { 20926 case MOD_LOAD: 20927 rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map", 20928 sizeof(struct rack_sendmap), 20929 rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); 20930 20931 rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb", 20932 sizeof(struct tcp_rack), 20933 rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); 20934 20935 sysctl_ctx_init(&rack_sysctl_ctx); 20936 rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 20937 SYSCTL_STATIC_CHILDREN(_net_inet_tcp), 20938 OID_AUTO, 20939 #ifdef STACKALIAS 20940 __XSTRING(STACKALIAS), 20941 #else 20942 __XSTRING(STACKNAME), 20943 #endif 20944 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 20945 ""); 20946 if (rack_sysctl_root == NULL) { 20947 printf("Failed to add sysctl node\n"); 20948 err = EFAULT; 20949 goto free_uma; 20950 } 20951 rack_init_sysctls(); 20952 num_stacks = nitems(rack_stack_names); 20953 err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK, 20954 rack_stack_names, &num_stacks); 20955 if (err) { 20956 printf("Failed to register %s stack name for " 20957 "%s module\n", rack_stack_names[num_stacks], 20958 __XSTRING(MODNAME)); 20959 sysctl_ctx_free(&rack_sysctl_ctx); 20960 free_uma: 20961 uma_zdestroy(rack_zone); 20962 uma_zdestroy(rack_pcb_zone); 20963 rack_counter_destroy(); 20964 printf("Failed to register rack module -- err:%d\n", err); 20965 return (err); 20966 } 20967 tcp_lro_reg_mbufq(); 20968 rack_mod_inited = true; 20969 break; 20970 case MOD_QUIESCE: 20971 err = deregister_tcp_functions(&__tcp_rack, true, false); 20972 break; 20973 case MOD_UNLOAD: 20974 err = deregister_tcp_functions(&__tcp_rack, false, true); 20975 if (err == EBUSY) 20976 break; 20977 if (rack_mod_inited) { 20978 uma_zdestroy(rack_zone); 20979 uma_zdestroy(rack_pcb_zone); 20980 sysctl_ctx_free(&rack_sysctl_ctx); 20981 rack_counter_destroy(); 20982 rack_mod_inited = false; 20983 } 20984 tcp_lro_dereg_mbufq(); 20985 err = 0; 20986 break; 20987 default: 20988 return (EOPNOTSUPP); 20989 } 20990 return (err); 20991 } 20992 20993 static moduledata_t tcp_rack = { 20994 .name = __XSTRING(MODNAME), 20995 .evhand = tcp_addrack, 20996 .priv = 0 20997 }; 20998 20999 MODULE_VERSION(MODNAME, 1); 21000 DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); 21001 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1); 21002
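
/*
 * Usage sketch (illustrative only, not compiled here): once the module above
 * is loaded -- typically "kldload tcp_rack", on a kernel that provides
 * tcphpts as required by the MODULE_DEPEND above -- a connection can be
 * moved onto this stack with the generic TCP_FUNCTION_BLK socket option and
 * then configured through the TCP_RACK_* options handled by
 * rack_set_sockopt()/rack_get_sockopt() above. The stack name assumes the
 * default STACKNAME of "rack"; the option constants are assumed to be
 * visible to userland via <netinet/tcp.h>.
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <string.h>
 *
 *	static int
 *	use_rack_stack(int fd)
 *	{
 *		struct tcp_function_set tfs;
 *		int on = 1;
 *
 *		memset(&tfs, 0, sizeof(tfs));
 *		strlcpy(tfs.function_set_name, "rack",
 *		    sizeof(tfs.function_set_name));
 *		// Hand this connection to the RACK stack ...
 *		if (setsockopt(fd, IPPROTO_TCP, TCP_FUNCTION_BLK,
 *		    &tfs, sizeof(tfs)) == -1)
 *			return (-1);
 *		// ... and, for example, turn on pacing (URL:pace_always).
 *		return (setsockopt(fd, IPPROTO_TCP, TCP_RACK_PACE_ALWAYS,
 *		    &on, sizeof(on)));
 *	}
 *
 * The same selection can be made system wide by pointing the
 * net.inet.tcp.functions_default sysctl at the stack name.
 */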
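
/*
 * A second sketch for the one option above that is not carried as an int:
 * TCP_PACING_RATE_CAP is read and written as a uint64_t (bytes per second,
 * which is why the 32-bit socket-option trace noted above tops out around
 * 34Gbps), so a userland caller must pass an 8-byte value where every other
 * RACK option uses an int. Illustrative only; assumes the constant is
 * exported via <netinet/tcp.h>.
 *
 *	#include <stdint.h>
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *
 *	static int
 *	cap_pacing_rate(int fd, uint64_t bytes_per_sec)
 *	{
 *		uint64_t cap = bytes_per_sec;
 *		socklen_t len = sizeof(cap);
 *
 *		if (setsockopt(fd, IPPROTO_TCP, TCP_PACING_RATE_CAP,
 *		    &cap, sizeof(cap)) == -1)
 *			return (-1);
 *		// Read it back; the get path also copies out the full 64 bits.
 *		return (getsockopt(fd, IPPROTO_TCP, TCP_PACING_RATE_CAP,
 *		    &cap, &len));
 *	}
 */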