1 /*- 2 * Copyright (c) 2016-2020 Netflix, Inc. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 * SUCH DAMAGE. 24 * 25 */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 #include "opt_inet.h" 31 #include "opt_inet6.h" 32 #include "opt_ipsec.h" 33 #include "opt_tcpdebug.h" 34 #include "opt_ratelimit.h" 35 #include "opt_kern_tls.h" 36 #include <sys/param.h> 37 #include <sys/arb.h> 38 #include <sys/module.h> 39 #include <sys/kernel.h> 40 #ifdef TCP_HHOOK 41 #include <sys/hhook.h> 42 #endif 43 #include <sys/lock.h> 44 #include <sys/malloc.h> 45 #include <sys/lock.h> 46 #include <sys/mutex.h> 47 #include <sys/mbuf.h> 48 #include <sys/proc.h> /* for proc0 declaration */ 49 #include <sys/socket.h> 50 #include <sys/socketvar.h> 51 #include <sys/sysctl.h> 52 #include <sys/systm.h> 53 #ifdef STATS 54 #include <sys/qmath.h> 55 #include <sys/tree.h> 56 #include <sys/stats.h> /* Must come after qmath.h and tree.h */ 57 #else 58 #include <sys/tree.h> 59 #endif 60 #include <sys/refcount.h> 61 #include <sys/queue.h> 62 #include <sys/tim_filter.h> 63 #include <sys/smp.h> 64 #include <sys/kthread.h> 65 #include <sys/kern_prefetch.h> 66 #include <sys/protosw.h> 67 #ifdef TCP_ACCOUNTING 68 #include <sys/sched.h> 69 #include <machine/cpu.h> 70 #endif 71 #include <vm/uma.h> 72 73 #include <net/route.h> 74 #include <net/route/nhop.h> 75 #include <net/vnet.h> 76 77 #define TCPSTATES /* for logging */ 78 79 #include <netinet/in.h> 80 #include <netinet/in_kdtrace.h> 81 #include <netinet/in_pcb.h> 82 #include <netinet/ip.h> 83 #include <netinet/ip_icmp.h> /* required for icmp_var.h */ 84 #include <netinet/icmp_var.h> /* for ICMP_BANDLIM */ 85 #include <netinet/ip_var.h> 86 #include <netinet/ip6.h> 87 #include <netinet6/in6_pcb.h> 88 #include <netinet6/ip6_var.h> 89 #include <netinet/tcp.h> 90 #define TCPOUTFLAGS 91 #include <netinet/tcp_fsm.h> 92 #include <netinet/tcp_log_buf.h> 93 #include <netinet/tcp_seq.h> 94 #include <netinet/tcp_timer.h> 95 #include <netinet/tcp_var.h> 96 #include <netinet/tcp_hpts.h> 97 #include <netinet/tcp_ratelimit.h> 98 #include <netinet/tcp_accounting.h> 99 #include <netinet/tcpip.h> 100 #include <netinet/cc/cc.h> 101 #include <netinet/cc/cc_newreno.h> 102 #include <netinet/tcp_fastopen.h> 103 #include <netinet/tcp_lro.h> 104 #ifdef 
NETFLIX_SHARED_CWND
#include <netinet/tcp_shared_cwnd.h>
#endif
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif /* TCPDEBUG */
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif

#include <netipsec/ipsec_support.h>

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif /* IPSEC */

#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif
#include "sack_filter.h"
#include "tcp_rack.h"
#include "rack_bbr_common.h"

uma_zone_t rack_zone;
uma_zone_t rack_pcb_zone;

#ifndef TICKS2SBT
#define TICKS2SBT(__t) (tick_sbt * ((sbintime_t)(__t)))
#endif

VNET_DECLARE(uint32_t, newreno_beta);
VNET_DECLARE(uint32_t, newreno_beta_ecn);
#define V_newreno_beta VNET(newreno_beta)
#define V_newreno_beta_ecn VNET(newreno_beta_ecn)

MALLOC_DEFINE(M_TCPFSB, "tcp_fsb", "TCP fast send block");
MALLOC_DEFINE(M_TCPDO, "tcp_do", "TCP deferred options");

struct sysctl_ctx_list rack_sysctl_ctx;
struct sysctl_oid *rack_sysctl_root;

#define CUM_ACKED 1
#define SACKED 2

/*
 * The RACK module incorporates a number of
 * TCP ideas that have been put out into the IETF
 * over the last few years:
 * - Matt Mathis's Rate Halving which slowly drops
 *   the congestion window so that the ack clock can
 *   be maintained during a recovery.
 * - Yuchung Cheng's RACK TCP (for which it is named) that
 *   will stop us from using the number of dup acks and instead
 *   use time as the gauge of when we retransmit.
 * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
 *   of Dukkipati et al.
 * RACK depends on SACK, so if an endpoint arrives that
 * cannot do SACK the state machine below will shuttle the
 * connection back to using the "default" TCP stack that is
 * in FreeBSD.
 *
 * To implement RACK the original TCP stack was first decomposed
 * into a functional state machine with individual states
 * for each of the possible TCP connection states. The do_segment
 * function's role in life is to mandate that the connection supports
 * SACK initially and then assure that the RACK state matches the
 * connection state before calling the state's do_segment function.
 * Each state is simplified due to the fact that the original do_segment
 * has been decomposed and we *know* what state we are in (no
 * switches on the state) and all tests for SACK are gone. This
 * greatly simplifies what each state does.
 *
 * TCP output is also overwritten with a new version since it
 * must maintain the new rack scoreboard.
 *
 */
static int32_t rack_tlp_thresh = 1;
static int32_t rack_tlp_limit = 2;	/* No more than 2 TLPs without new data */
static int32_t rack_tlp_use_greater = 1;
static int32_t rack_reorder_thresh = 2;
static int32_t rack_reorder_fade = 60000000;	/* 0 - never fade, def 60,000,000
						 * - 60 seconds */
static uint8_t rack_req_measurements = 1;
/* Attack threshold detections */
static uint32_t rack_highest_sack_thresh_seen = 0;
static uint32_t rack_highest_move_thresh_seen = 0;
static int32_t rack_enable_hw_pacing = 0;	/* Due to CCSP keep it off by default */
static int32_t rack_hw_pace_extra_slots = 2;	/* 2 extra MSS time_betweens */
static int32_t rack_hw_rate_caps = 1;	/* 1; */
static int32_t rack_hw_rate_min = 0;	/* 1500000;*/
static int32_t rack_hw_rate_to_low = 0;	/* 1200000; */
static int32_t rack_hw_up_only = 1;
static int32_t rack_stats_gets_ms_rtt = 1;
static int32_t rack_prr_addbackmax = 2;

static int32_t rack_pkt_delay = 1000;
static int32_t rack_send_a_lot_in_prr = 1;
static int32_t rack_min_to = 1000;	/* Minimum rack timeout in microseconds */
static int32_t rack_verbose_logging = 0;
static int32_t rack_ignore_data_after_close = 1;
static int32_t rack_enable_shared_cwnd = 1;
static int32_t rack_use_cmp_acks = 1;
static int32_t rack_use_fsb = 1;
static int32_t rack_use_rfo = 1;
static int32_t rack_use_rsm_rfo = 1;
static int32_t rack_max_abc_post_recovery = 2;
static int32_t rack_client_low_buf = 0;
#ifdef TCP_ACCOUNTING
static int32_t rack_tcp_accounting = 0;
#endif
static int32_t rack_limits_scwnd = 1;
static int32_t rack_enable_mqueue_for_nonpaced = 0;
static int32_t rack_disable_prr = 0;
static int32_t use_rack_rr = 1;
static int32_t rack_non_rxt_use_cr = 0;	/* does a non-rxt in recovery use the configured rate (ss/ca)? */
static int32_t rack_persist_min = 250000;	/* 250 ms in usecs */
static int32_t rack_persist_max = 2000000;	/* 2 seconds in usecs */
static int32_t rack_sack_not_required = 1;	/* set to one to allow non-sack to use rack */
static int32_t rack_default_init_window = 0;	/* Use system default */
static int32_t rack_limit_time_with_srtt = 0;
static int32_t rack_autosndbuf_inc = 20;	/* In percentage form */
static int32_t rack_enobuf_hw_boost_mult = 2;	/* How many times the hw rate we boost slot using time_between */
static int32_t rack_enobuf_hw_max = 12000;	/* 12 ms in usecs */
static int32_t rack_enobuf_hw_min = 10000;	/* 10 ms in usecs */
static int32_t rack_hw_rwnd_factor = 2;	/* How many max_segs the rwnd must be before we hold off sending */
/*
 * Currently regular TCP has an rto_min of 30ms; the backoff goes 12
 * times, so that ends up being a total of 122.850 seconds before a
 * connection is killed.
 */
static uint32_t rack_def_data_window = 20;
static uint32_t rack_goal_bdp = 2;
static uint32_t rack_min_srtts = 1;
static uint32_t rack_min_measure_usec = 0;
static int32_t rack_tlp_min = 10000;	/* 10ms */
static int32_t rack_rto_min = 30000;	/* 30,000 usec same as main freebsd */
static int32_t rack_rto_max = 4000000;	/* 4 seconds in usecs */
static const int32_t rack_free_cache = 2;
static int32_t rack_hptsi_segments = 40;
static int32_t rack_rate_sample_method = USE_RTT_LOW;
static int32_t rack_pace_every_seg = 0;
static int32_t rack_delayed_ack_time = 40000;	/* 40ms in usecs */
static int32_t rack_slot_reduction = 4;
static int32_t rack_wma_divisor = 8;	/* For WMA calculation */
static int32_t rack_cwnd_block_ends_measure = 0;
static int32_t rack_rwnd_block_ends_measure = 0;
static int32_t rack_def_profile = 0;

static int32_t rack_lower_cwnd_at_tlp = 0;
static int32_t rack_limited_retran = 0;
static int32_t rack_always_send_oldest = 0;
static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;

static uint16_t rack_per_of_gp_ss = 250;	/* 250% slow-start */
static uint16_t rack_per_of_gp_ca = 200;	/* 200% congestion-avoidance */
static uint16_t rack_per_of_gp_rec = 200;	/* 200% of b/w */

/* Probertt */
static uint16_t rack_per_of_gp_probertt = 60;	/* 60% of b/w */
static uint16_t rack_per_of_gp_lowthresh = 40;	/* 40% is bottom */
static uint16_t rack_per_of_gp_probertt_reduce = 10;	/* 10% reduction */
static uint16_t rack_atexit_prtt_hbp = 130;	/* Clamp to 130% on exit prtt if highly buffered path */
static uint16_t rack_atexit_prtt = 130;	/* Clamp to 130% on exit prtt if non highly buffered path */

static uint32_t rack_max_drain_wait = 2;	/* How many gp srtt's before we give up draining */
static uint32_t rack_must_drain = 1;	/* How many GP srtt's we *must* wait */
static uint32_t rack_probertt_use_min_rtt_entry = 1;	/* Use the min to calculate the goal else gp_srtt */
static uint32_t rack_probertt_use_min_rtt_exit = 0;
static uint32_t rack_probe_rtt_sets_cwnd = 0;
static uint32_t rack_probe_rtt_safety_val = 2000000;	/* No more than 2 sec in probe-rtt */
static uint32_t rack_time_between_probertt = 9600000;	/* 9.6 sec in usecs */
static uint32_t rack_probertt_gpsrtt_cnt_mul = 0;	/* How many srtt periods does probe-rtt last (top of fraction) */
static uint32_t rack_probertt_gpsrtt_cnt_div = 0;	/* How many srtt periods does probe-rtt last (bottom of fraction) */
static uint32_t rack_min_probertt_hold = 40000;	/* Equal to delayed ack time */
static uint32_t rack_probertt_filter_life = 10000000;
static uint32_t rack_probertt_lower_within = 10;
static uint32_t rack_min_rtt_movement = 250000;	/* Must move at least 250ms (in microseconds) to count as a lowering */
static int32_t rack_pace_one_seg = 0;	/* Shall we pace for less than 1.4Meg 1MSS at a time */
static int32_t rack_probertt_clear_is = 1;
static int32_t rack_max_drain_hbp = 1;	/* Extra drain times gpsrtt for highly buffered paths */
static int32_t rack_hbp_thresh = 3;	/* What is the divisor max_rtt/min_rtt to decide a hbp */

/* Part of pacing */
static int32_t rack_max_per_above = 30;	/* When we go to increment stop if above 100+this% */

/*
 * Timely information: combining the next two values gives the range of
 * "no change" to b/w, i.e. the up/down values provide the upper and
 * lower bound (a worked note on how these bound the GP multiplier
 * follows the counter declarations below).
 */
static int32_t rack_gp_per_bw_mul_up = 2;	/* 2% */
static int32_t rack_gp_per_bw_mul_down = 4;	/* 4% */
static int32_t rack_gp_rtt_maxmul = 3;	/* 3 x maxmin */
static int32_t rack_gp_rtt_minmul = 1;	/* minrtt + (minrtt/mindiv) is lower rtt */
static int32_t rack_gp_rtt_mindiv = 4;	/* minrtt + (minrtt * minmul/mindiv) is lower rtt */
static int32_t rack_gp_decrease_per = 20;	/* 20% decrease in multiplier */
static int32_t rack_gp_increase_per = 2;	/* 2% increase in multiplier */
static int32_t rack_per_lower_bound = 50;	/* Don't allow to drop below this multiplier */
static int32_t rack_per_upper_bound_ss = 0;	/* Don't allow SS to grow above this */
static int32_t rack_per_upper_bound_ca = 0;	/* Don't allow CA to grow above this */
static int32_t rack_do_dyn_mul = 0;	/* Are the rack gp multipliers dynamic */
static int32_t rack_gp_no_rec_chg = 1;	/* Prohibit recovery from reducing its multiplier */
static int32_t rack_timely_dec_clear = 6;	/* Do we clear decrement count at a value (6)? */
static int32_t rack_timely_max_push_rise = 3;	/* One round of pushing */
static int32_t rack_timely_max_push_drop = 3;	/* Three rounds of pushing */
static int32_t rack_timely_min_segs = 4;	/* 4 segment minimum */
static int32_t rack_use_max_for_nobackoff = 0;
static int32_t rack_timely_int_timely_only = 0;	/* do interim timely's only use the timely algo (no b/w changes)? */
static int32_t rack_timely_no_stopping = 0;
static int32_t rack_down_raise_thresh = 100;
static int32_t rack_req_segs = 1;
static uint64_t rack_bw_rate_cap = 0;

/* Weird delayed ack mode */
static int32_t rack_use_imac_dack = 0;
/* Rack specific counters */
counter_u64_t rack_badfr;
counter_u64_t rack_badfr_bytes;
counter_u64_t rack_rtm_prr_retran;
counter_u64_t rack_rtm_prr_newdata;
counter_u64_t rack_timestamp_mismatch;
counter_u64_t rack_reorder_seen;
counter_u64_t rack_paced_segments;
counter_u64_t rack_unpaced_segments;
counter_u64_t rack_calc_zero;
counter_u64_t rack_calc_nonzero;
counter_u64_t rack_saw_enobuf;
counter_u64_t rack_saw_enobuf_hw;
counter_u64_t rack_saw_enetunreach;
counter_u64_t rack_per_timer_hole;
counter_u64_t rack_large_ackcmp;
counter_u64_t rack_small_ackcmp;
#ifdef INVARIANTS
counter_u64_t rack_adjust_map_bw;
#endif
/* Tail loss probe counters */
counter_u64_t rack_tlp_tot;
counter_u64_t rack_tlp_newdata;
counter_u64_t rack_tlp_retran;
counter_u64_t rack_tlp_retran_bytes;
counter_u64_t rack_tlp_retran_fail;
counter_u64_t rack_to_tot;
counter_u64_t rack_to_arm_rack;
counter_u64_t rack_to_arm_tlp;
counter_u64_t rack_hot_alloc;
counter_u64_t rack_to_alloc;
counter_u64_t rack_to_alloc_hard;
counter_u64_t rack_to_alloc_emerg;
counter_u64_t rack_to_alloc_limited;
counter_u64_t rack_alloc_limited_conns;
counter_u64_t rack_split_limited;

#define MAX_NUM_OF_CNTS 13
counter_u64_t rack_proc_comp_ack[MAX_NUM_OF_CNTS];
counter_u64_t rack_multi_single_eq;
counter_u64_t rack_proc_non_comp_ack;

counter_u64_t rack_fto_send;
counter_u64_t rack_fto_rsm_send;
counter_u64_t rack_nfto_resend;
counter_u64_t rack_non_fto_send;
counter_u64_t rack_extended_rfo;

counter_u64_t rack_sack_proc_all;
counter_u64_t rack_sack_proc_short;
counter_u64_t rack_sack_proc_restart;
counter_u64_t rack_sack_attacks_detected;
counter_u64_t rack_sack_attacks_reversed;
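/*
 * Illustrative sketch (an addition, not taken from the original file) of
 * how the timely knobs declared above are meant to combine, going by their
 * comments; the authoritative logic is in the timely/goodput routines
 * later in this file:
 *
 *	if (the new b/w sample is within +rack_gp_per_bw_mul_up% and
 *	    -rack_gp_per_bw_mul_down% of the prior measurement)
 *		leave the GP pacing multiplier alone;
 *	else if (b/w rose)
 *		raise the multiplier by roughly rack_gp_increase_per (2%);
 *	else
 *		lower the multiplier by roughly rack_gp_decrease_per (20%);
 *	never let it fall below rack_per_lower_bound (50), and when the
 *	upper bounds are non-zero never let SS/CA rise above
 *	rack_per_upper_bound_ss / rack_per_upper_bound_ca.
 */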
380 counter_u64_t rack_sack_used_next_merge; 381 counter_u64_t rack_sack_splits; 382 counter_u64_t rack_sack_used_prev_merge; 383 counter_u64_t rack_sack_skipped_acked; 384 counter_u64_t rack_ack_total; 385 counter_u64_t rack_express_sack; 386 counter_u64_t rack_sack_total; 387 counter_u64_t rack_move_none; 388 counter_u64_t rack_move_some; 389 390 counter_u64_t rack_used_tlpmethod; 391 counter_u64_t rack_used_tlpmethod2; 392 counter_u64_t rack_enter_tlp_calc; 393 counter_u64_t rack_input_idle_reduces; 394 counter_u64_t rack_collapsed_win; 395 counter_u64_t rack_tlp_does_nada; 396 counter_u64_t rack_try_scwnd; 397 counter_u64_t rack_hw_pace_init_fail; 398 counter_u64_t rack_hw_pace_lost; 399 counter_u64_t rack_sbsndptr_right; 400 counter_u64_t rack_sbsndptr_wrong; 401 402 /* Temp CPU counters */ 403 counter_u64_t rack_find_high; 404 405 counter_u64_t rack_progress_drops; 406 counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE]; 407 counter_u64_t rack_opts_arry[RACK_OPTS_SIZE]; 408 409 410 #define RACK_REXMTVAL(tp) max(rack_rto_min, ((tp)->t_srtt + ((tp)->t_rttvar << 2))) 411 412 #define RACK_TCPT_RANGESET(tv, value, tvmin, tvmax, slop) do { \ 413 (tv) = (value) + slop; \ 414 if ((u_long)(tv) < (u_long)(tvmin)) \ 415 (tv) = (tvmin); \ 416 if ((u_long)(tv) > (u_long)(tvmax)) \ 417 (tv) = (tvmax); \ 418 } while (0) 419 420 static void 421 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line); 422 423 static int 424 rack_process_ack(struct mbuf *m, struct tcphdr *th, 425 struct socket *so, struct tcpcb *tp, struct tcpopt *to, 426 uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val); 427 static int 428 rack_process_data(struct mbuf *m, struct tcphdr *th, 429 struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 430 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); 431 static void 432 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, 433 uint32_t th_ack, uint16_t nsegs, uint16_t type, int32_t recovery); 434 static struct rack_sendmap *rack_alloc(struct tcp_rack *rack); 435 static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack, 436 uint8_t limit_type); 437 static struct rack_sendmap * 438 rack_check_recovery_mode(struct tcpcb *tp, 439 uint32_t tsused); 440 static void 441 rack_cong_signal(struct tcpcb *tp, 442 uint32_t type, uint32_t ack); 443 static void rack_counter_destroy(void); 444 static int 445 rack_ctloutput(struct socket *so, struct sockopt *sopt, 446 struct inpcb *inp, struct tcpcb *tp); 447 static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how); 448 static void 449 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override); 450 static void 451 rack_do_segment(struct mbuf *m, struct tcphdr *th, 452 struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 453 uint8_t iptos); 454 static void rack_dtor(void *mem, int32_t size, void *arg); 455 static void 456 rack_log_alt_to_to_cancel(struct tcp_rack *rack, 457 uint32_t flex1, uint32_t flex2, 458 uint32_t flex3, uint32_t flex4, 459 uint32_t flex5, uint32_t flex6, 460 uint16_t flex7, uint8_t mod); 461 462 static void 463 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, 464 uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line, 465 struct rack_sendmap *rsm, uint8_t quality); 466 static struct rack_sendmap * 467 rack_find_high_nonack(struct tcp_rack *rack, 468 struct rack_sendmap *rsm); 469 static struct 
rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack); 470 static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm); 471 static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged); 472 static int 473 rack_get_sockopt(struct socket *so, struct sockopt *sopt, 474 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack); 475 static void 476 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, 477 tcp_seq th_ack, int line, uint8_t quality); 478 static uint32_t 479 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss); 480 static int32_t rack_handoff_ok(struct tcpcb *tp); 481 static int32_t rack_init(struct tcpcb *tp); 482 static void rack_init_sysctls(void); 483 static void 484 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, 485 struct tcphdr *th, int entered_rec, int dup_ack_struck); 486 static void 487 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 488 uint32_t seq_out, uint8_t th_flags, int32_t err, uint64_t ts, 489 struct rack_sendmap *hintrsm, uint16_t add_flags, struct mbuf *s_mb, uint32_t s_moff, int hw_tls); 490 491 static void 492 rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack, 493 struct rack_sendmap *rsm); 494 static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm); 495 static int32_t rack_output(struct tcpcb *tp); 496 497 static uint32_t 498 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, 499 struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm, 500 uint32_t cts, int *moved_two); 501 static void rack_post_recovery(struct tcpcb *tp, uint32_t th_seq); 502 static void rack_remxt_tmr(struct tcpcb *tp); 503 static int 504 rack_set_sockopt(struct socket *so, struct sockopt *sopt, 505 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack); 506 static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack); 507 static int32_t rack_stopall(struct tcpcb *tp); 508 static void 509 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, 510 uint32_t delta); 511 static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type); 512 static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line); 513 static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type); 514 static uint32_t 515 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 516 struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint16_t add_flag); 517 static void 518 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, 519 struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag); 520 static int 521 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 522 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack); 523 static int32_t tcp_addrack(module_t mod, int32_t type, void *data); 524 static int 525 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, 526 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 527 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 528 static int 529 rack_do_closing(struct mbuf *m, struct tcphdr *th, 530 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 531 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 532 static int 533 rack_do_established(struct mbuf *m, struct tcphdr *th, 534 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 535 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, 
uint8_t iptos); 536 static int 537 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, 538 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 539 int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos); 540 static int 541 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, 542 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 543 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 544 static int 545 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, 546 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 547 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 548 static int 549 rack_do_lastack(struct mbuf *m, struct tcphdr *th, 550 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 551 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 552 static int 553 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, 554 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 555 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 556 static int 557 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, 558 struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, 559 int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); 560 struct rack_sendmap * 561 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, 562 uint32_t tsused); 563 static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, 564 uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt); 565 static void 566 tcp_rack_partialack(struct tcpcb *tp); 567 static int 568 rack_set_profile(struct tcp_rack *rack, int prof); 569 static void 570 rack_apply_deferred_options(struct tcp_rack *rack); 571 572 int32_t rack_clear_counter=0; 573 574 static void 575 rack_set_cc_pacing(struct tcp_rack *rack) 576 { 577 struct sockopt sopt; 578 struct cc_newreno_opts opt; 579 struct newreno old, *ptr; 580 struct tcpcb *tp; 581 int error; 582 583 if (rack->rc_pacing_cc_set) 584 return; 585 586 tp = rack->rc_tp; 587 if (tp->cc_algo == NULL) { 588 /* Tcb is leaving */ 589 printf("No cc algorithm?\n"); 590 return; 591 } 592 rack->rc_pacing_cc_set = 1; 593 if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) { 594 /* Not new-reno we can't play games with beta! */ 595 goto out; 596 } 597 ptr = ((struct newreno *)tp->ccv->cc_data); 598 if (CC_ALGO(tp)->ctl_output == NULL) { 599 /* Huh, why does new_reno no longer have a set function? */ 600 printf("no ctl_output for algo:%s\n", tp->cc_algo->name); 601 goto out; 602 } 603 if (ptr == NULL) { 604 /* Just the default values */ 605 old.beta = V_newreno_beta_ecn; 606 old.beta_ecn = V_newreno_beta_ecn; 607 old.newreno_flags = 0; 608 } else { 609 old.beta = ptr->beta; 610 old.beta_ecn = ptr->beta_ecn; 611 old.newreno_flags = ptr->newreno_flags; 612 } 613 sopt.sopt_valsize = sizeof(struct cc_newreno_opts); 614 sopt.sopt_dir = SOPT_SET; 615 opt.name = CC_NEWRENO_BETA; 616 opt.val = rack->r_ctl.rc_saved_beta.beta; 617 error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt); 618 if (error) { 619 printf("Error returned by ctl_output %d\n", error); 620 goto out; 621 } 622 /* 623 * Hack alert we need to set in our newreno_flags 624 * so that Abe behavior is also applied. 
625 */ 626 ((struct newreno *)tp->ccv->cc_data)->newreno_flags = CC_NEWRENO_BETA_ECN; 627 opt.name = CC_NEWRENO_BETA_ECN; 628 opt.val = rack->r_ctl.rc_saved_beta.beta_ecn; 629 error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt); 630 if (error) { 631 printf("Error returned by ctl_output %d\n", error); 632 goto out; 633 } 634 /* Save off the original values for restoral */ 635 memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno)); 636 out: 637 if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { 638 union tcp_log_stackspecific log; 639 struct timeval tv; 640 641 ptr = ((struct newreno *)tp->ccv->cc_data); 642 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 643 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 644 if (ptr) { 645 log.u_bbr.flex1 = ptr->beta; 646 log.u_bbr.flex2 = ptr->beta_ecn; 647 log.u_bbr.flex3 = ptr->newreno_flags; 648 } 649 log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta; 650 log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn; 651 log.u_bbr.flex6 = rack->r_ctl.rc_saved_beta.newreno_flags; 652 log.u_bbr.flex7 = rack->gp_ready; 653 log.u_bbr.flex7 <<= 1; 654 log.u_bbr.flex7 |= rack->use_fixed_rate; 655 log.u_bbr.flex7 <<= 1; 656 log.u_bbr.flex7 |= rack->rc_pacing_cc_set; 657 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 658 log.u_bbr.flex8 = 3; 659 tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, error, 660 0, &log, false, NULL, NULL, 0, &tv); 661 } 662 } 663 664 static void 665 rack_undo_cc_pacing(struct tcp_rack *rack) 666 { 667 struct newreno old, *ptr; 668 struct tcpcb *tp; 669 670 if (rack->rc_pacing_cc_set == 0) 671 return; 672 tp = rack->rc_tp; 673 rack->rc_pacing_cc_set = 0; 674 if (tp->cc_algo == NULL) 675 /* Tcb is leaving */ 676 return; 677 if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) { 678 /* Not new-reno nothing to do! */ 679 return; 680 } 681 ptr = ((struct newreno *)tp->ccv->cc_data); 682 if (ptr == NULL) { 683 /* 684 * This happens at rack_fini() if the 685 * cc module gets freed on us. In that 686 * case we loose our "new" settings but 687 * thats ok, since the tcb is going away anyway. 
688 */ 689 return; 690 } 691 /* Grab out our set values */ 692 memcpy(&old, ptr, sizeof(struct newreno)); 693 /* Copy back in the original values */ 694 memcpy(ptr, &rack->r_ctl.rc_saved_beta, sizeof(struct newreno)); 695 /* Now save back the values we had set in (for when pacing is restored) */ 696 memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno)); 697 if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { 698 union tcp_log_stackspecific log; 699 struct timeval tv; 700 701 ptr = ((struct newreno *)tp->ccv->cc_data); 702 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 703 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 704 log.u_bbr.flex1 = ptr->beta; 705 log.u_bbr.flex2 = ptr->beta_ecn; 706 log.u_bbr.flex3 = ptr->newreno_flags; 707 log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta; 708 log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn; 709 log.u_bbr.flex6 = rack->r_ctl.rc_saved_beta.newreno_flags; 710 log.u_bbr.flex7 = rack->gp_ready; 711 log.u_bbr.flex7 <<= 1; 712 log.u_bbr.flex7 |= rack->use_fixed_rate; 713 log.u_bbr.flex7 <<= 1; 714 log.u_bbr.flex7 |= rack->rc_pacing_cc_set; 715 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 716 log.u_bbr.flex8 = 4; 717 tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 718 0, &log, false, NULL, NULL, 0, &tv); 719 } 720 } 721 722 #ifdef NETFLIX_PEAKRATE 723 static inline void 724 rack_update_peakrate_thr(struct tcpcb *tp) 725 { 726 /* Keep in mind that t_maxpeakrate is in B/s. */ 727 uint64_t peak; 728 peak = uqmax((tp->t_maxseg * 2), 729 (((uint64_t)tp->t_maxpeakrate * (uint64_t)(tp->t_srtt)) / (uint64_t)HPTS_USEC_IN_SEC)); 730 tp->t_peakrate_thr = (uint32_t)uqmin(peak, UINT32_MAX); 731 } 732 #endif 733 734 static int 735 sysctl_rack_clear(SYSCTL_HANDLER_ARGS) 736 { 737 uint32_t stat; 738 int32_t error; 739 int i; 740 741 error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t)); 742 if (error || req->newptr == NULL) 743 return error; 744 745 error = SYSCTL_IN(req, &stat, sizeof(uint32_t)); 746 if (error) 747 return (error); 748 if (stat == 1) { 749 #ifdef INVARIANTS 750 printf("Clearing RACK counters\n"); 751 #endif 752 counter_u64_zero(rack_badfr); 753 counter_u64_zero(rack_badfr_bytes); 754 counter_u64_zero(rack_rtm_prr_retran); 755 counter_u64_zero(rack_rtm_prr_newdata); 756 counter_u64_zero(rack_timestamp_mismatch); 757 counter_u64_zero(rack_reorder_seen); 758 counter_u64_zero(rack_tlp_tot); 759 counter_u64_zero(rack_tlp_newdata); 760 counter_u64_zero(rack_tlp_retran); 761 counter_u64_zero(rack_tlp_retran_bytes); 762 counter_u64_zero(rack_tlp_retran_fail); 763 counter_u64_zero(rack_to_tot); 764 counter_u64_zero(rack_to_arm_rack); 765 counter_u64_zero(rack_to_arm_tlp); 766 counter_u64_zero(rack_paced_segments); 767 counter_u64_zero(rack_calc_zero); 768 counter_u64_zero(rack_calc_nonzero); 769 counter_u64_zero(rack_unpaced_segments); 770 counter_u64_zero(rack_saw_enobuf); 771 counter_u64_zero(rack_saw_enobuf_hw); 772 counter_u64_zero(rack_saw_enetunreach); 773 counter_u64_zero(rack_per_timer_hole); 774 counter_u64_zero(rack_large_ackcmp); 775 counter_u64_zero(rack_small_ackcmp); 776 #ifdef INVARIANTS 777 counter_u64_zero(rack_adjust_map_bw); 778 #endif 779 counter_u64_zero(rack_to_alloc_hard); 780 counter_u64_zero(rack_to_alloc_emerg); 781 counter_u64_zero(rack_sack_proc_all); 782 counter_u64_zero(rack_fto_send); 783 counter_u64_zero(rack_fto_rsm_send); 784 counter_u64_zero(rack_extended_rfo); 785 counter_u64_zero(rack_hw_pace_init_fail); 786 counter_u64_zero(rack_hw_pace_lost); 787 
counter_u64_zero(rack_sbsndptr_wrong); 788 counter_u64_zero(rack_sbsndptr_right); 789 counter_u64_zero(rack_non_fto_send); 790 counter_u64_zero(rack_nfto_resend); 791 counter_u64_zero(rack_sack_proc_short); 792 counter_u64_zero(rack_sack_proc_restart); 793 counter_u64_zero(rack_to_alloc); 794 counter_u64_zero(rack_to_alloc_limited); 795 counter_u64_zero(rack_alloc_limited_conns); 796 counter_u64_zero(rack_split_limited); 797 for (i = 0; i < MAX_NUM_OF_CNTS; i++) { 798 counter_u64_zero(rack_proc_comp_ack[i]); 799 } 800 counter_u64_zero(rack_multi_single_eq); 801 counter_u64_zero(rack_proc_non_comp_ack); 802 counter_u64_zero(rack_find_high); 803 counter_u64_zero(rack_sack_attacks_detected); 804 counter_u64_zero(rack_sack_attacks_reversed); 805 counter_u64_zero(rack_sack_used_next_merge); 806 counter_u64_zero(rack_sack_used_prev_merge); 807 counter_u64_zero(rack_sack_splits); 808 counter_u64_zero(rack_sack_skipped_acked); 809 counter_u64_zero(rack_ack_total); 810 counter_u64_zero(rack_express_sack); 811 counter_u64_zero(rack_sack_total); 812 counter_u64_zero(rack_move_none); 813 counter_u64_zero(rack_move_some); 814 counter_u64_zero(rack_used_tlpmethod); 815 counter_u64_zero(rack_used_tlpmethod2); 816 counter_u64_zero(rack_enter_tlp_calc); 817 counter_u64_zero(rack_progress_drops); 818 counter_u64_zero(rack_tlp_does_nada); 819 counter_u64_zero(rack_try_scwnd); 820 counter_u64_zero(rack_collapsed_win); 821 } 822 rack_clear_counter = 0; 823 return (0); 824 } 825 826 static void 827 rack_init_sysctls(void) 828 { 829 int i; 830 struct sysctl_oid *rack_counters; 831 struct sysctl_oid *rack_attack; 832 struct sysctl_oid *rack_pacing; 833 struct sysctl_oid *rack_timely; 834 struct sysctl_oid *rack_timers; 835 struct sysctl_oid *rack_tlp; 836 struct sysctl_oid *rack_misc; 837 struct sysctl_oid *rack_measure; 838 struct sysctl_oid *rack_probertt; 839 struct sysctl_oid *rack_hw_pacing; 840 841 rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 842 SYSCTL_CHILDREN(rack_sysctl_root), 843 OID_AUTO, 844 "sack_attack", 845 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 846 "Rack Sack Attack Counters and Controls"); 847 rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 848 SYSCTL_CHILDREN(rack_sysctl_root), 849 OID_AUTO, 850 "stats", 851 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 852 "Rack Counters"); 853 SYSCTL_ADD_S32(&rack_sysctl_ctx, 854 SYSCTL_CHILDREN(rack_sysctl_root), 855 OID_AUTO, "rate_sample_method", CTLFLAG_RW, 856 &rack_rate_sample_method , USE_RTT_LOW, 857 "What method should we use for rate sampling 0=high, 1=low "); 858 /* Probe rtt related controls */ 859 rack_probertt = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 860 SYSCTL_CHILDREN(rack_sysctl_root), 861 OID_AUTO, 862 "probertt", 863 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 864 "ProbeRTT related Controls"); 865 SYSCTL_ADD_U16(&rack_sysctl_ctx, 866 SYSCTL_CHILDREN(rack_probertt), 867 OID_AUTO, "exit_per_hpb", CTLFLAG_RW, 868 &rack_atexit_prtt_hbp, 130, 869 "What percentage above goodput do we clamp CA/SS to at exit on high-BDP path 110%"); 870 SYSCTL_ADD_U16(&rack_sysctl_ctx, 871 SYSCTL_CHILDREN(rack_probertt), 872 OID_AUTO, "exit_per_nonhpb", CTLFLAG_RW, 873 &rack_atexit_prtt, 130, 874 "What percentage above goodput do we clamp CA/SS to at exit on a non high-BDP path 100%"); 875 SYSCTL_ADD_U16(&rack_sysctl_ctx, 876 SYSCTL_CHILDREN(rack_probertt), 877 OID_AUTO, "gp_per_mul", CTLFLAG_RW, 878 &rack_per_of_gp_probertt, 60, 879 "What percentage of goodput do we pace at in probertt"); 880 SYSCTL_ADD_U16(&rack_sysctl_ctx, 881 SYSCTL_CHILDREN(rack_probertt), 882 OID_AUTO, "gp_per_reduce", 
CTLFLAG_RW, 883 &rack_per_of_gp_probertt_reduce, 10, 884 "What percentage of goodput do we reduce every gp_srtt"); 885 SYSCTL_ADD_U16(&rack_sysctl_ctx, 886 SYSCTL_CHILDREN(rack_probertt), 887 OID_AUTO, "gp_per_low", CTLFLAG_RW, 888 &rack_per_of_gp_lowthresh, 40, 889 "What percentage of goodput do we allow the multiplier to fall to"); 890 SYSCTL_ADD_U32(&rack_sysctl_ctx, 891 SYSCTL_CHILDREN(rack_probertt), 892 OID_AUTO, "time_between", CTLFLAG_RW, 893 & rack_time_between_probertt, 96000000, 894 "How many useconds between the lowest rtt falling must past before we enter probertt"); 895 SYSCTL_ADD_U32(&rack_sysctl_ctx, 896 SYSCTL_CHILDREN(rack_probertt), 897 OID_AUTO, "safety", CTLFLAG_RW, 898 &rack_probe_rtt_safety_val, 2000000, 899 "If not zero, provides a maximum usecond that you can stay in probertt (2sec = 2000000)"); 900 SYSCTL_ADD_U32(&rack_sysctl_ctx, 901 SYSCTL_CHILDREN(rack_probertt), 902 OID_AUTO, "sets_cwnd", CTLFLAG_RW, 903 &rack_probe_rtt_sets_cwnd, 0, 904 "Do we set the cwnd too (if always_lower is on)"); 905 SYSCTL_ADD_U32(&rack_sysctl_ctx, 906 SYSCTL_CHILDREN(rack_probertt), 907 OID_AUTO, "maxdrainsrtts", CTLFLAG_RW, 908 &rack_max_drain_wait, 2, 909 "Maximum number of gp_srtt's to hold in drain waiting for flight to reach goal"); 910 SYSCTL_ADD_U32(&rack_sysctl_ctx, 911 SYSCTL_CHILDREN(rack_probertt), 912 OID_AUTO, "mustdrainsrtts", CTLFLAG_RW, 913 &rack_must_drain, 1, 914 "We must drain this many gp_srtt's waiting for flight to reach goal"); 915 SYSCTL_ADD_U32(&rack_sysctl_ctx, 916 SYSCTL_CHILDREN(rack_probertt), 917 OID_AUTO, "goal_use_min_entry", CTLFLAG_RW, 918 &rack_probertt_use_min_rtt_entry, 1, 919 "Should we use the min-rtt to calculate the goal rtt (else gp_srtt) at entry"); 920 SYSCTL_ADD_U32(&rack_sysctl_ctx, 921 SYSCTL_CHILDREN(rack_probertt), 922 OID_AUTO, "goal_use_min_exit", CTLFLAG_RW, 923 &rack_probertt_use_min_rtt_exit, 0, 924 "How to set cwnd at exit, 0 - dynamic, 1 - use min-rtt, 2 - use curgprtt, 3 - entry gp-rtt"); 925 SYSCTL_ADD_U32(&rack_sysctl_ctx, 926 SYSCTL_CHILDREN(rack_probertt), 927 OID_AUTO, "length_div", CTLFLAG_RW, 928 &rack_probertt_gpsrtt_cnt_div, 0, 929 "How many recent goodput srtt periods plus hold tim does probertt last (bottom of fraction)"); 930 SYSCTL_ADD_U32(&rack_sysctl_ctx, 931 SYSCTL_CHILDREN(rack_probertt), 932 OID_AUTO, "length_mul", CTLFLAG_RW, 933 &rack_probertt_gpsrtt_cnt_mul, 0, 934 "How many recent goodput srtt periods plus hold tim does probertt last (top of fraction)"); 935 SYSCTL_ADD_U32(&rack_sysctl_ctx, 936 SYSCTL_CHILDREN(rack_probertt), 937 OID_AUTO, "holdtim_at_target", CTLFLAG_RW, 938 &rack_min_probertt_hold, 200000, 939 "What is the minimum time we hold probertt at target"); 940 SYSCTL_ADD_U32(&rack_sysctl_ctx, 941 SYSCTL_CHILDREN(rack_probertt), 942 OID_AUTO, "filter_life", CTLFLAG_RW, 943 &rack_probertt_filter_life, 10000000, 944 "What is the time for the filters life in useconds"); 945 SYSCTL_ADD_U32(&rack_sysctl_ctx, 946 SYSCTL_CHILDREN(rack_probertt), 947 OID_AUTO, "lower_within", CTLFLAG_RW, 948 &rack_probertt_lower_within, 10, 949 "If the rtt goes lower within this percentage of the time, go into probe-rtt"); 950 SYSCTL_ADD_U32(&rack_sysctl_ctx, 951 SYSCTL_CHILDREN(rack_probertt), 952 OID_AUTO, "must_move", CTLFLAG_RW, 953 &rack_min_rtt_movement, 250, 954 "How much is the minimum movement in rtt to count as a drop for probertt purposes"); 955 SYSCTL_ADD_U32(&rack_sysctl_ctx, 956 SYSCTL_CHILDREN(rack_probertt), 957 OID_AUTO, "clear_is_cnts", CTLFLAG_RW, 958 &rack_probertt_clear_is, 1, 959 "Do we clear I/S 
counts on exiting probe-rtt"); 960 SYSCTL_ADD_S32(&rack_sysctl_ctx, 961 SYSCTL_CHILDREN(rack_probertt), 962 OID_AUTO, "hbp_extra_drain", CTLFLAG_RW, 963 &rack_max_drain_hbp, 1, 964 "How many extra drain gpsrtt's do we get in highly buffered paths"); 965 SYSCTL_ADD_S32(&rack_sysctl_ctx, 966 SYSCTL_CHILDREN(rack_probertt), 967 OID_AUTO, "hbp_threshold", CTLFLAG_RW, 968 &rack_hbp_thresh, 3, 969 "We are highly buffered if min_rtt_seen / max_rtt_seen > this-threshold"); 970 /* Pacing related sysctls */ 971 rack_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 972 SYSCTL_CHILDREN(rack_sysctl_root), 973 OID_AUTO, 974 "pacing", 975 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 976 "Pacing related Controls"); 977 SYSCTL_ADD_S32(&rack_sysctl_ctx, 978 SYSCTL_CHILDREN(rack_pacing), 979 OID_AUTO, "max_pace_over", CTLFLAG_RW, 980 &rack_max_per_above, 30, 981 "What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)"); 982 SYSCTL_ADD_S32(&rack_sysctl_ctx, 983 SYSCTL_CHILDREN(rack_pacing), 984 OID_AUTO, "pace_to_one", CTLFLAG_RW, 985 &rack_pace_one_seg, 0, 986 "Do we allow low b/w pacing of 1MSS instead of two"); 987 SYSCTL_ADD_S32(&rack_sysctl_ctx, 988 SYSCTL_CHILDREN(rack_pacing), 989 OID_AUTO, "limit_wsrtt", CTLFLAG_RW, 990 &rack_limit_time_with_srtt, 0, 991 "Do we limit pacing time based on srtt"); 992 SYSCTL_ADD_S32(&rack_sysctl_ctx, 993 SYSCTL_CHILDREN(rack_pacing), 994 OID_AUTO, "init_win", CTLFLAG_RW, 995 &rack_default_init_window, 0, 996 "Do we have a rack initial window 0 = system default"); 997 SYSCTL_ADD_U16(&rack_sysctl_ctx, 998 SYSCTL_CHILDREN(rack_pacing), 999 OID_AUTO, "gp_per_ss", CTLFLAG_RW, 1000 &rack_per_of_gp_ss, 250, 1001 "If non zero, what percentage of goodput to pace at in slow start"); 1002 SYSCTL_ADD_U16(&rack_sysctl_ctx, 1003 SYSCTL_CHILDREN(rack_pacing), 1004 OID_AUTO, "gp_per_ca", CTLFLAG_RW, 1005 &rack_per_of_gp_ca, 150, 1006 "If non zero, what percentage of goodput to pace at in congestion avoidance"); 1007 SYSCTL_ADD_U16(&rack_sysctl_ctx, 1008 SYSCTL_CHILDREN(rack_pacing), 1009 OID_AUTO, "gp_per_rec", CTLFLAG_RW, 1010 &rack_per_of_gp_rec, 200, 1011 "If non zero, what percentage of goodput to pace at in recovery"); 1012 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1013 SYSCTL_CHILDREN(rack_pacing), 1014 OID_AUTO, "pace_max_seg", CTLFLAG_RW, 1015 &rack_hptsi_segments, 40, 1016 "What size is the max for TSO segments in pacing and burst mitigation"); 1017 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1018 SYSCTL_CHILDREN(rack_pacing), 1019 OID_AUTO, "burst_reduces", CTLFLAG_RW, 1020 &rack_slot_reduction, 4, 1021 "When doing only burst mitigation what is the reduce divisor"); 1022 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1023 SYSCTL_CHILDREN(rack_sysctl_root), 1024 OID_AUTO, "use_pacing", CTLFLAG_RW, 1025 &rack_pace_every_seg, 0, 1026 "If set we use pacing, if clear we use only the original burst mitigation"); 1027 SYSCTL_ADD_U64(&rack_sysctl_ctx, 1028 SYSCTL_CHILDREN(rack_pacing), 1029 OID_AUTO, "rate_cap", CTLFLAG_RW, 1030 &rack_bw_rate_cap, 0, 1031 "If set we apply this value to the absolute rate cap used by pacing"); 1032 SYSCTL_ADD_U8(&rack_sysctl_ctx, 1033 SYSCTL_CHILDREN(rack_sysctl_root), 1034 OID_AUTO, "req_measure_cnt", CTLFLAG_RW, 1035 &rack_req_measurements, 1, 1036 "If doing dynamic pacing, how many measurements must be in before we start pacing?"); 1037 /* Hardware pacing */ 1038 rack_hw_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1039 SYSCTL_CHILDREN(rack_sysctl_root), 1040 OID_AUTO, 1041 "hdwr_pacing", 1042 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1043 "Pacing related Controls"); 1044 
SYSCTL_ADD_S32(&rack_sysctl_ctx, 1045 SYSCTL_CHILDREN(rack_hw_pacing), 1046 OID_AUTO, "rwnd_factor", CTLFLAG_RW, 1047 &rack_hw_rwnd_factor, 2, 1048 "How many times does snd_wnd need to be bigger than pace_max_seg so we will hold off and get more acks?"); 1049 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1050 SYSCTL_CHILDREN(rack_hw_pacing), 1051 OID_AUTO, "pace_enobuf_mult", CTLFLAG_RW, 1052 &rack_enobuf_hw_boost_mult, 2, 1053 "By how many time_betweens should we boost the pacing time if we see a ENOBUFS?"); 1054 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1055 SYSCTL_CHILDREN(rack_hw_pacing), 1056 OID_AUTO, "pace_enobuf_max", CTLFLAG_RW, 1057 &rack_enobuf_hw_max, 2, 1058 "What is the max boost the pacing time if we see a ENOBUFS?"); 1059 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1060 SYSCTL_CHILDREN(rack_hw_pacing), 1061 OID_AUTO, "pace_enobuf_min", CTLFLAG_RW, 1062 &rack_enobuf_hw_min, 2, 1063 "What is the min boost the pacing time if we see a ENOBUFS?"); 1064 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1065 SYSCTL_CHILDREN(rack_hw_pacing), 1066 OID_AUTO, "enable", CTLFLAG_RW, 1067 &rack_enable_hw_pacing, 0, 1068 "Should RACK attempt to use hw pacing?"); 1069 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1070 SYSCTL_CHILDREN(rack_hw_pacing), 1071 OID_AUTO, "rate_cap", CTLFLAG_RW, 1072 &rack_hw_rate_caps, 1, 1073 "Does the highest hardware pacing rate cap the rate we will send at??"); 1074 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1075 SYSCTL_CHILDREN(rack_hw_pacing), 1076 OID_AUTO, "rate_min", CTLFLAG_RW, 1077 &rack_hw_rate_min, 0, 1078 "Do we need a minimum estimate of this many bytes per second in order to engage hw pacing?"); 1079 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1080 SYSCTL_CHILDREN(rack_hw_pacing), 1081 OID_AUTO, "rate_to_low", CTLFLAG_RW, 1082 &rack_hw_rate_to_low, 0, 1083 "If we fall below this rate, dis-engage hw pacing?"); 1084 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1085 SYSCTL_CHILDREN(rack_hw_pacing), 1086 OID_AUTO, "up_only", CTLFLAG_RW, 1087 &rack_hw_up_only, 1, 1088 "Do we allow hw pacing to lower the rate selected?"); 1089 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1090 SYSCTL_CHILDREN(rack_hw_pacing), 1091 OID_AUTO, "extra_mss_precise", CTLFLAG_RW, 1092 &rack_hw_pace_extra_slots, 2, 1093 "If the rates between software and hardware match precisely how many extra time_betweens do we get?"); 1094 rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1095 SYSCTL_CHILDREN(rack_sysctl_root), 1096 OID_AUTO, 1097 "timely", 1098 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1099 "Rack Timely RTT Controls"); 1100 /* Timely based GP dynmics */ 1101 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1102 SYSCTL_CHILDREN(rack_timely), 1103 OID_AUTO, "upper", CTLFLAG_RW, 1104 &rack_gp_per_bw_mul_up, 2, 1105 "Rack timely upper range for equal b/w (in percentage)"); 1106 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1107 SYSCTL_CHILDREN(rack_timely), 1108 OID_AUTO, "lower", CTLFLAG_RW, 1109 &rack_gp_per_bw_mul_down, 4, 1110 "Rack timely lower range for equal b/w (in percentage)"); 1111 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1112 SYSCTL_CHILDREN(rack_timely), 1113 OID_AUTO, "rtt_max_mul", CTLFLAG_RW, 1114 &rack_gp_rtt_maxmul, 3, 1115 "Rack timely multipler of lowest rtt for rtt_max"); 1116 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1117 SYSCTL_CHILDREN(rack_timely), 1118 OID_AUTO, "rtt_min_div", CTLFLAG_RW, 1119 &rack_gp_rtt_mindiv, 4, 1120 "Rack timely divisor used for rtt + (rtt * mul/divisor) for check for lower rtt"); 1121 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1122 SYSCTL_CHILDREN(rack_timely), 1123 OID_AUTO, "rtt_min_mul", CTLFLAG_RW, 1124 &rack_gp_rtt_minmul, 1, 1125 "Rack timely multiplier used for rtt + (rtt * mul/divisor) for 
check for lower rtt"); 1126 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1127 SYSCTL_CHILDREN(rack_timely), 1128 OID_AUTO, "decrease", CTLFLAG_RW, 1129 &rack_gp_decrease_per, 20, 1130 "Rack timely decrease percentage of our GP multiplication factor"); 1131 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1132 SYSCTL_CHILDREN(rack_timely), 1133 OID_AUTO, "increase", CTLFLAG_RW, 1134 &rack_gp_increase_per, 2, 1135 "Rack timely increase perentage of our GP multiplication factor"); 1136 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1137 SYSCTL_CHILDREN(rack_timely), 1138 OID_AUTO, "lowerbound", CTLFLAG_RW, 1139 &rack_per_lower_bound, 50, 1140 "Rack timely lowest percentage we allow GP multiplier to fall to"); 1141 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1142 SYSCTL_CHILDREN(rack_timely), 1143 OID_AUTO, "upperboundss", CTLFLAG_RW, 1144 &rack_per_upper_bound_ss, 0, 1145 "Rack timely higest percentage we allow GP multiplier in SS to raise to (0 is no upperbound)"); 1146 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1147 SYSCTL_CHILDREN(rack_timely), 1148 OID_AUTO, "upperboundca", CTLFLAG_RW, 1149 &rack_per_upper_bound_ca, 0, 1150 "Rack timely higest percentage we allow GP multiplier to CA raise to (0 is no upperbound)"); 1151 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1152 SYSCTL_CHILDREN(rack_timely), 1153 OID_AUTO, "dynamicgp", CTLFLAG_RW, 1154 &rack_do_dyn_mul, 0, 1155 "Rack timely do we enable dynmaic timely goodput by default"); 1156 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1157 SYSCTL_CHILDREN(rack_timely), 1158 OID_AUTO, "no_rec_red", CTLFLAG_RW, 1159 &rack_gp_no_rec_chg, 1, 1160 "Rack timely do we prohibit the recovery multiplier from being lowered"); 1161 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1162 SYSCTL_CHILDREN(rack_timely), 1163 OID_AUTO, "red_clear_cnt", CTLFLAG_RW, 1164 &rack_timely_dec_clear, 6, 1165 "Rack timely what threshold do we count to before another boost during b/w decent"); 1166 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1167 SYSCTL_CHILDREN(rack_timely), 1168 OID_AUTO, "max_push_rise", CTLFLAG_RW, 1169 &rack_timely_max_push_rise, 3, 1170 "Rack timely how many times do we push up with b/w increase"); 1171 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1172 SYSCTL_CHILDREN(rack_timely), 1173 OID_AUTO, "max_push_drop", CTLFLAG_RW, 1174 &rack_timely_max_push_drop, 3, 1175 "Rack timely how many times do we push back on b/w decent"); 1176 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1177 SYSCTL_CHILDREN(rack_timely), 1178 OID_AUTO, "min_segs", CTLFLAG_RW, 1179 &rack_timely_min_segs, 4, 1180 "Rack timely when setting the cwnd what is the min num segments"); 1181 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1182 SYSCTL_CHILDREN(rack_timely), 1183 OID_AUTO, "noback_max", CTLFLAG_RW, 1184 &rack_use_max_for_nobackoff, 0, 1185 "Rack timely when deciding if to backoff on a loss, do we use under max rtt else min"); 1186 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1187 SYSCTL_CHILDREN(rack_timely), 1188 OID_AUTO, "interim_timely_only", CTLFLAG_RW, 1189 &rack_timely_int_timely_only, 0, 1190 "Rack timely when doing interim timely's do we only do timely (no b/w consideration)"); 1191 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1192 SYSCTL_CHILDREN(rack_timely), 1193 OID_AUTO, "nonstop", CTLFLAG_RW, 1194 &rack_timely_no_stopping, 0, 1195 "Rack timely don't stop increase"); 1196 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1197 SYSCTL_CHILDREN(rack_timely), 1198 OID_AUTO, "dec_raise_thresh", CTLFLAG_RW, 1199 &rack_down_raise_thresh, 100, 1200 "If the CA or SS is below this threshold raise on the first 3 b/w lowers (0=always)"); 1201 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1202 SYSCTL_CHILDREN(rack_timely), 1203 OID_AUTO, "bottom_drag_segs", CTLFLAG_RW, 1204 
&rack_req_segs, 1, 1205 "Bottom dragging if not these many segments outstanding and room"); 1206 1207 /* TLP and Rack related parameters */ 1208 rack_tlp = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1209 SYSCTL_CHILDREN(rack_sysctl_root), 1210 OID_AUTO, 1211 "tlp", 1212 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1213 "TLP and Rack related Controls"); 1214 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1215 SYSCTL_CHILDREN(rack_tlp), 1216 OID_AUTO, "use_rrr", CTLFLAG_RW, 1217 &use_rack_rr, 1, 1218 "Do we use Rack Rapid Recovery"); 1219 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1220 SYSCTL_CHILDREN(rack_tlp), 1221 OID_AUTO, "post_rec_labc", CTLFLAG_RW, 1222 &rack_max_abc_post_recovery, 2, 1223 "Since we do early recovery, do we override the l_abc to a value, if so what?"); 1224 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1225 SYSCTL_CHILDREN(rack_tlp), 1226 OID_AUTO, "nonrxt_use_cr", CTLFLAG_RW, 1227 &rack_non_rxt_use_cr, 0, 1228 "Do we use ss/ca rate if in recovery we are transmitting a new data chunk"); 1229 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1230 SYSCTL_CHILDREN(rack_tlp), 1231 OID_AUTO, "tlpmethod", CTLFLAG_RW, 1232 &rack_tlp_threshold_use, TLP_USE_TWO_ONE, 1233 "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2"); 1234 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1235 SYSCTL_CHILDREN(rack_tlp), 1236 OID_AUTO, "limit", CTLFLAG_RW, 1237 &rack_tlp_limit, 2, 1238 "How many TLP's can be sent without sending new data"); 1239 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1240 SYSCTL_CHILDREN(rack_tlp), 1241 OID_AUTO, "use_greater", CTLFLAG_RW, 1242 &rack_tlp_use_greater, 1, 1243 "Should we use the rack_rtt time if its greater than srtt"); 1244 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1245 SYSCTL_CHILDREN(rack_tlp), 1246 OID_AUTO, "tlpminto", CTLFLAG_RW, 1247 &rack_tlp_min, 10000, 1248 "TLP minimum timeout per the specification (in microseconds)"); 1249 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1250 SYSCTL_CHILDREN(rack_tlp), 1251 OID_AUTO, "send_oldest", CTLFLAG_RW, 1252 &rack_always_send_oldest, 0, 1253 "Should we always send the oldest TLP and RACK-TLP"); 1254 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1255 SYSCTL_CHILDREN(rack_tlp), 1256 OID_AUTO, "rack_tlimit", CTLFLAG_RW, 1257 &rack_limited_retran, 0, 1258 "How many times can a rack timeout drive out sends"); 1259 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1260 SYSCTL_CHILDREN(rack_tlp), 1261 OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW, 1262 &rack_lower_cwnd_at_tlp, 0, 1263 "When a TLP completes a retran should we enter recovery"); 1264 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1265 SYSCTL_CHILDREN(rack_tlp), 1266 OID_AUTO, "reorder_thresh", CTLFLAG_RW, 1267 &rack_reorder_thresh, 2, 1268 "What factor for rack will be added when seeing reordering (shift right)"); 1269 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1270 SYSCTL_CHILDREN(rack_tlp), 1271 OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW, 1272 &rack_tlp_thresh, 1, 1273 "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)"); 1274 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1275 SYSCTL_CHILDREN(rack_tlp), 1276 OID_AUTO, "reorder_fade", CTLFLAG_RW, 1277 &rack_reorder_fade, 60000000, 1278 "Does reorder detection fade, if so how many microseconds (0 means never)"); 1279 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1280 SYSCTL_CHILDREN(rack_tlp), 1281 OID_AUTO, "pktdelay", CTLFLAG_RW, 1282 &rack_pkt_delay, 1000, 1283 "Extra RACK time (in microseconds) besides reordering thresh"); 1284 1285 /* Timer related controls */ 1286 rack_timers = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1287 SYSCTL_CHILDREN(rack_sysctl_root), 1288 OID_AUTO, 1289 "timers", 1290 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1291 "Timer related controls"); 1292 
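/*
 * Worked example (illustrative only) of how the timer bounds registered
 * below interact with RACK_REXMTVAL() defined earlier in this file: the
 * retransmit timeout candidate is
 *
 *	rto = max(rack_rto_min, t_srtt + 4 * t_rttvar)
 *
 * and RACK_TCPT_RANGESET() then clamps the result into
 * [rack_rto_min, rack_rto_max].  Assuming both smoothed values are kept in
 * microseconds here, t_srtt = 20000 and t_rttvar = 5000 give
 * 20000 + 4 * 5000 = 40000 usec, above the 30000 usec floor and well under
 * the 4000000 usec ceiling, so the RTO is 40 ms.
 */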
SYSCTL_ADD_U32(&rack_sysctl_ctx, 1293 SYSCTL_CHILDREN(rack_timers), 1294 OID_AUTO, "persmin", CTLFLAG_RW, 1295 &rack_persist_min, 250000, 1296 "What is the minimum time in microseconds between persists"); 1297 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1298 SYSCTL_CHILDREN(rack_timers), 1299 OID_AUTO, "persmax", CTLFLAG_RW, 1300 &rack_persist_max, 2000000, 1301 "What is the largest delay in microseconds between persists"); 1302 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1303 SYSCTL_CHILDREN(rack_timers), 1304 OID_AUTO, "delayed_ack", CTLFLAG_RW, 1305 &rack_delayed_ack_time, 40000, 1306 "Delayed ack time (40ms in microseconds)"); 1307 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1308 SYSCTL_CHILDREN(rack_timers), 1309 OID_AUTO, "minrto", CTLFLAG_RW, 1310 &rack_rto_min, 30000, 1311 "Minimum RTO in microseconds -- set with caution below 1000 due to TLP"); 1312 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1313 SYSCTL_CHILDREN(rack_timers), 1314 OID_AUTO, "maxrto", CTLFLAG_RW, 1315 &rack_rto_max, 4000000, 1316 "Maxiumum RTO in microseconds -- should be at least as large as min_rto"); 1317 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1318 SYSCTL_CHILDREN(rack_timers), 1319 OID_AUTO, "minto", CTLFLAG_RW, 1320 &rack_min_to, 1000, 1321 "Minimum rack timeout in microseconds"); 1322 /* Measure controls */ 1323 rack_measure = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1324 SYSCTL_CHILDREN(rack_sysctl_root), 1325 OID_AUTO, 1326 "measure", 1327 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1328 "Measure related controls"); 1329 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1330 SYSCTL_CHILDREN(rack_measure), 1331 OID_AUTO, "wma_divisor", CTLFLAG_RW, 1332 &rack_wma_divisor, 8, 1333 "When doing b/w calculation what is the divisor for the WMA"); 1334 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1335 SYSCTL_CHILDREN(rack_measure), 1336 OID_AUTO, "end_cwnd", CTLFLAG_RW, 1337 &rack_cwnd_block_ends_measure, 0, 1338 "Does a cwnd just-return end the measurement window (app limited)"); 1339 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1340 SYSCTL_CHILDREN(rack_measure), 1341 OID_AUTO, "end_rwnd", CTLFLAG_RW, 1342 &rack_rwnd_block_ends_measure, 0, 1343 "Does an rwnd just-return end the measurement window (app limited -- not persists)"); 1344 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1345 SYSCTL_CHILDREN(rack_measure), 1346 OID_AUTO, "min_target", CTLFLAG_RW, 1347 &rack_def_data_window, 20, 1348 "What is the minimum target window (in mss) for a GP measurements"); 1349 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1350 SYSCTL_CHILDREN(rack_measure), 1351 OID_AUTO, "goal_bdp", CTLFLAG_RW, 1352 &rack_goal_bdp, 2, 1353 "What is the goal BDP to measure"); 1354 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1355 SYSCTL_CHILDREN(rack_measure), 1356 OID_AUTO, "min_srtts", CTLFLAG_RW, 1357 &rack_min_srtts, 1, 1358 "What is the goal BDP to measure"); 1359 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1360 SYSCTL_CHILDREN(rack_measure), 1361 OID_AUTO, "min_measure_tim", CTLFLAG_RW, 1362 &rack_min_measure_usec, 0, 1363 "What is the Minimum time time for a measurement if 0, this is off"); 1364 /* Misc rack controls */ 1365 rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1366 SYSCTL_CHILDREN(rack_sysctl_root), 1367 OID_AUTO, 1368 "misc", 1369 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1370 "Misc related controls"); 1371 #ifdef TCP_ACCOUNTING 1372 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1373 SYSCTL_CHILDREN(rack_misc), 1374 OID_AUTO, "tcp_acct", CTLFLAG_RW, 1375 &rack_tcp_accounting, 0, 1376 "Should we turn on TCP accounting for all rack sessions?"); 1377 #endif 1378 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1379 SYSCTL_CHILDREN(rack_misc), 1380 OID_AUTO, "prr_addback_max", CTLFLAG_RW, 1381 &rack_prr_addbackmax, 2, 1382 
"What is the maximum number of MSS we allow to be added back if prr can't send all its data?"); 1383 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1384 SYSCTL_CHILDREN(rack_misc), 1385 OID_AUTO, "stats_gets_ms", CTLFLAG_RW, 1386 &rack_stats_gets_ms_rtt, 1, 1387 "What do we feed the stats framework (1 = ms_rtt, 0 = us_rtt, 2 = ms_rtt from hdwr, > 2 usec rtt from hdwr)?"); 1388 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1389 SYSCTL_CHILDREN(rack_misc), 1390 OID_AUTO, "clientlowbuf", CTLFLAG_RW, 1391 &rack_client_low_buf, 0, 1392 "Client low buffer level (below this we are more aggressive in DGP exiting recovery (0 = off)?"); 1393 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1394 SYSCTL_CHILDREN(rack_misc), 1395 OID_AUTO, "defprofile", CTLFLAG_RW, 1396 &rack_def_profile, 0, 1397 "Should RACK use a default profile (0=no, num == profile num)?"); 1398 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1399 SYSCTL_CHILDREN(rack_misc), 1400 OID_AUTO, "cmpack", CTLFLAG_RW, 1401 &rack_use_cmp_acks, 1, 1402 "Should RACK have LRO send compressed acks"); 1403 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1404 SYSCTL_CHILDREN(rack_misc), 1405 OID_AUTO, "fsb", CTLFLAG_RW, 1406 &rack_use_fsb, 1, 1407 "Should RACK use the fast send block?"); 1408 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1409 SYSCTL_CHILDREN(rack_misc), 1410 OID_AUTO, "rfo", CTLFLAG_RW, 1411 &rack_use_rfo, 1, 1412 "Should RACK use rack_fast_output()?"); 1413 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1414 SYSCTL_CHILDREN(rack_misc), 1415 OID_AUTO, "rsmrfo", CTLFLAG_RW, 1416 &rack_use_rsm_rfo, 1, 1417 "Should RACK use rack_fast_rsm_output()?"); 1418 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1419 SYSCTL_CHILDREN(rack_misc), 1420 OID_AUTO, "shared_cwnd", CTLFLAG_RW, 1421 &rack_enable_shared_cwnd, 1, 1422 "Should RACK try to use the shared cwnd on connections where allowed"); 1423 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1424 SYSCTL_CHILDREN(rack_misc), 1425 OID_AUTO, "limits_on_scwnd", CTLFLAG_RW, 1426 &rack_limits_scwnd, 1, 1427 "Should RACK place low end time limits on the shared cwnd feature"); 1428 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1429 SYSCTL_CHILDREN(rack_misc), 1430 OID_AUTO, "non_paced_lro_queue", CTLFLAG_RW, 1431 &rack_enable_mqueue_for_nonpaced, 0, 1432 "Should RACK use mbuf queuing for non-paced connections"); 1433 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1434 SYSCTL_CHILDREN(rack_misc), 1435 OID_AUTO, "iMac_dack", CTLFLAG_RW, 1436 &rack_use_imac_dack, 0, 1437 "Should RACK try to emulate iMac delayed ack"); 1438 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1439 SYSCTL_CHILDREN(rack_misc), 1440 OID_AUTO, "no_prr", CTLFLAG_RW, 1441 &rack_disable_prr, 0, 1442 "Should RACK not use prr and only pace (must have pacing on)"); 1443 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1444 SYSCTL_CHILDREN(rack_misc), 1445 OID_AUTO, "bb_verbose", CTLFLAG_RW, 1446 &rack_verbose_logging, 0, 1447 "Should RACK black box logging be verbose"); 1448 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1449 SYSCTL_CHILDREN(rack_misc), 1450 OID_AUTO, "data_after_close", CTLFLAG_RW, 1451 &rack_ignore_data_after_close, 1, 1452 "Do we hold off sending a RST until all pending data is ack'd"); 1453 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1454 SYSCTL_CHILDREN(rack_misc), 1455 OID_AUTO, "no_sack_needed", CTLFLAG_RW, 1456 &rack_sack_not_required, 1, 1457 "Do we allow rack to run on connections not supporting SACK"); 1458 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1459 SYSCTL_CHILDREN(rack_misc), 1460 OID_AUTO, "prr_sendalot", CTLFLAG_RW, 1461 &rack_send_a_lot_in_prr, 1, 1462 "Send a lot in prr"); 1463 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1464 SYSCTL_CHILDREN(rack_misc), 1465 OID_AUTO, "autoscale", CTLFLAG_RW, 1466 &rack_autosndbuf_inc, 
20, 1467 "What percentage should rack scale up its snd buffer by?"); 1468 /* Sack Attacker detection stuff */ 1469 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1470 SYSCTL_CHILDREN(rack_attack), 1471 OID_AUTO, "detect_highsackratio", CTLFLAG_RW, 1472 &rack_highest_sack_thresh_seen, 0, 1473 "Highest sack to ack ratio seen"); 1474 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1475 SYSCTL_CHILDREN(rack_attack), 1476 OID_AUTO, "detect_highmoveratio", CTLFLAG_RW, 1477 &rack_highest_move_thresh_seen, 0, 1478 "Highest move to non-move ratio seen"); 1479 rack_ack_total = counter_u64_alloc(M_WAITOK); 1480 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1481 SYSCTL_CHILDREN(rack_attack), 1482 OID_AUTO, "acktotal", CTLFLAG_RD, 1483 &rack_ack_total, 1484 "Total number of ACKs"); 1485 rack_express_sack = counter_u64_alloc(M_WAITOK); 1486 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1487 SYSCTL_CHILDREN(rack_attack), 1488 OID_AUTO, "exp_sacktotal", CTLFLAG_RD, 1489 &rack_express_sack, 1490 "Total number of express SACKs"); 1491 rack_sack_total = counter_u64_alloc(M_WAITOK); 1492 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1493 SYSCTL_CHILDREN(rack_attack), 1494 OID_AUTO, "sacktotal", CTLFLAG_RD, 1495 &rack_sack_total, 1496 "Total number of SACKs"); 1497 rack_move_none = counter_u64_alloc(M_WAITOK); 1498 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1499 SYSCTL_CHILDREN(rack_attack), 1500 OID_AUTO, "move_none", CTLFLAG_RD, 1501 &rack_move_none, 1502 "Total number of SACK index reuse of positions under threshold"); 1503 rack_move_some = counter_u64_alloc(M_WAITOK); 1504 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1505 SYSCTL_CHILDREN(rack_attack), 1506 OID_AUTO, "move_some", CTLFLAG_RD, 1507 &rack_move_some, 1508 "Total number of SACK index reuse of positions over threshold"); 1509 rack_sack_attacks_detected = counter_u64_alloc(M_WAITOK); 1510 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1511 SYSCTL_CHILDREN(rack_attack), 1512 OID_AUTO, "attacks", CTLFLAG_RD, 1513 &rack_sack_attacks_detected, 1514 "Total number of SACK attackers that had sack disabled"); 1515 rack_sack_attacks_reversed = counter_u64_alloc(M_WAITOK); 1516 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1517 SYSCTL_CHILDREN(rack_attack), 1518 OID_AUTO, "reversed", CTLFLAG_RD, 1519 &rack_sack_attacks_reversed, 1520 "Total number of SACK attackers that were later determined false positive"); 1521 rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK); 1522 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1523 SYSCTL_CHILDREN(rack_attack), 1524 OID_AUTO, "nextmerge", CTLFLAG_RD, 1525 &rack_sack_used_next_merge, 1526 "Total number of times we used the next merge"); 1527 rack_sack_used_prev_merge = counter_u64_alloc(M_WAITOK); 1528 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1529 SYSCTL_CHILDREN(rack_attack), 1530 OID_AUTO, "prevmerge", CTLFLAG_RD, 1531 &rack_sack_used_prev_merge, 1532 "Total number of times we used the prev merge"); 1533 /* Counters */ 1534 rack_fto_send = counter_u64_alloc(M_WAITOK); 1535 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1536 SYSCTL_CHILDREN(rack_counters), 1537 OID_AUTO, "fto_send", CTLFLAG_RD, 1538 &rack_fto_send, "Total number of rack_fast_output sends"); 1539 rack_fto_rsm_send = counter_u64_alloc(M_WAITOK); 1540 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1541 SYSCTL_CHILDREN(rack_counters), 1542 OID_AUTO, "fto_rsm_send", CTLFLAG_RD, 1543 &rack_fto_rsm_send, "Total number of rack_fast_rsm_output sends"); 1544 rack_nfto_resend = counter_u64_alloc(M_WAITOK); 1545 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1546 SYSCTL_CHILDREN(rack_counters), 1547 OID_AUTO, "nfto_resend", CTLFLAG_RD, 1548
&rack_nfto_resend, "Total number of rack_output retransmissions"); 1549 rack_non_fto_send = counter_u64_alloc(M_WAITOK); 1550 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1551 SYSCTL_CHILDREN(rack_counters), 1552 OID_AUTO, "nfto_send", CTLFLAG_RD, 1553 &rack_non_fto_send, "Total number of rack_output first sends"); 1554 rack_extended_rfo = counter_u64_alloc(M_WAITOK); 1555 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1556 SYSCTL_CHILDREN(rack_counters), 1557 OID_AUTO, "rfo_extended", CTLFLAG_RD, 1558 &rack_extended_rfo, "Total number of times we extended rfo"); 1559 1560 rack_hw_pace_init_fail = counter_u64_alloc(M_WAITOK); 1561 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1562 SYSCTL_CHILDREN(rack_counters), 1563 OID_AUTO, "hwpace_init_fail", CTLFLAG_RD, 1564 &rack_hw_pace_init_fail, "Total number of times we failed to initialize hw pacing"); 1565 rack_hw_pace_lost = counter_u64_alloc(M_WAITOK); 1566 1567 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1568 SYSCTL_CHILDREN(rack_counters), 1569 OID_AUTO, "hwpace_lost", CTLFLAG_RD, 1570 &rack_hw_pace_lost, "Total number of times we failed to initialize hw pacing"); 1571 rack_badfr = counter_u64_alloc(M_WAITOK); 1572 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1573 SYSCTL_CHILDREN(rack_counters), 1574 OID_AUTO, "badfr", CTLFLAG_RD, 1575 &rack_badfr, "Total number of bad FRs"); 1576 rack_badfr_bytes = counter_u64_alloc(M_WAITOK); 1577 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1578 SYSCTL_CHILDREN(rack_counters), 1579 OID_AUTO, "badfr_bytes", CTLFLAG_RD, 1580 &rack_badfr_bytes, "Total number of bad FRs"); 1581 rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK); 1582 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1583 SYSCTL_CHILDREN(rack_counters), 1584 OID_AUTO, "prrsndret", CTLFLAG_RD, 1585 &rack_rtm_prr_retran, 1586 "Total number of prr based retransmits"); 1587 rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK); 1588 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1589 SYSCTL_CHILDREN(rack_counters), 1590 OID_AUTO, "prrsndnew", CTLFLAG_RD, 1591 &rack_rtm_prr_newdata, 1592 "Total number of prr based new transmits"); 1593 rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK); 1594 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1595 SYSCTL_CHILDREN(rack_counters), 1596 OID_AUTO, "tsnf", CTLFLAG_RD, 1597 &rack_timestamp_mismatch, 1598 "Total number of timestamps that we could not find the reported ts"); 1599 rack_find_high = counter_u64_alloc(M_WAITOK); 1600 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1601 SYSCTL_CHILDREN(rack_counters), 1602 OID_AUTO, "findhigh", CTLFLAG_RD, 1603 &rack_find_high, 1604 "Total number of FIN causing find-high"); 1605 rack_reorder_seen = counter_u64_alloc(M_WAITOK); 1606 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1607 SYSCTL_CHILDREN(rack_counters), 1608 OID_AUTO, "reordering", CTLFLAG_RD, 1609 &rack_reorder_seen, 1610 "Total number of times we added delay due to reordering"); 1611 rack_tlp_tot = counter_u64_alloc(M_WAITOK); 1612 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1613 SYSCTL_CHILDREN(rack_counters), 1614 OID_AUTO, "tlp_to_total", CTLFLAG_RD, 1615 &rack_tlp_tot, 1616 "Total number of tail loss probe expirations"); 1617 rack_tlp_newdata = counter_u64_alloc(M_WAITOK); 1618 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1619 SYSCTL_CHILDREN(rack_counters), 1620 OID_AUTO, "tlp_new", CTLFLAG_RD, 1621 &rack_tlp_newdata, 1622 "Total number of tail loss probe sending new data"); 1623 rack_tlp_retran = counter_u64_alloc(M_WAITOK); 1624 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1625 SYSCTL_CHILDREN(rack_counters), 1626 OID_AUTO, "tlp_retran", CTLFLAG_RD, 1627 
&rack_tlp_retran, 1628 "Total number of tail loss probe sending retransmitted data"); 1629 rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK); 1630 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1631 SYSCTL_CHILDREN(rack_counters), 1632 OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD, 1633 &rack_tlp_retran_bytes, 1634 "Total bytes of tail loss probe sending retransmitted data"); 1635 rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK); 1636 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1637 SYSCTL_CHILDREN(rack_counters), 1638 OID_AUTO, "tlp_retran_fail", CTLFLAG_RD, 1639 &rack_tlp_retran_fail, 1640 "Total number of tail loss probe sending retransmitted data that failed (wait for t3)"); 1641 rack_to_tot = counter_u64_alloc(M_WAITOK); 1642 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1643 SYSCTL_CHILDREN(rack_counters), 1644 OID_AUTO, "rack_to_tot", CTLFLAG_RD, 1645 &rack_to_tot, 1646 "Total number of times the rack to expired"); 1647 rack_to_arm_rack = counter_u64_alloc(M_WAITOK); 1648 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1649 SYSCTL_CHILDREN(rack_counters), 1650 OID_AUTO, "arm_rack", CTLFLAG_RD, 1651 &rack_to_arm_rack, 1652 "Total number of times the rack timer armed"); 1653 rack_to_arm_tlp = counter_u64_alloc(M_WAITOK); 1654 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1655 SYSCTL_CHILDREN(rack_counters), 1656 OID_AUTO, "arm_tlp", CTLFLAG_RD, 1657 &rack_to_arm_tlp, 1658 "Total number of times the tlp timer armed"); 1659 rack_calc_zero = counter_u64_alloc(M_WAITOK); 1660 rack_calc_nonzero = counter_u64_alloc(M_WAITOK); 1661 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1662 SYSCTL_CHILDREN(rack_counters), 1663 OID_AUTO, "calc_zero", CTLFLAG_RD, 1664 &rack_calc_zero, 1665 "Total number of times pacing time worked out to zero"); 1666 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1667 SYSCTL_CHILDREN(rack_counters), 1668 OID_AUTO, "calc_nonzero", CTLFLAG_RD, 1669 &rack_calc_nonzero, 1670 "Total number of times pacing time worked out to non-zero"); 1671 rack_paced_segments = counter_u64_alloc(M_WAITOK); 1672 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1673 SYSCTL_CHILDREN(rack_counters), 1674 OID_AUTO, "paced", CTLFLAG_RD, 1675 &rack_paced_segments, 1676 "Total number of times a segment send caused hptsi"); 1677 rack_unpaced_segments = counter_u64_alloc(M_WAITOK); 1678 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1679 SYSCTL_CHILDREN(rack_counters), 1680 OID_AUTO, "unpaced", CTLFLAG_RD, 1681 &rack_unpaced_segments, 1682 "Total number of times a segment did not cause hptsi"); 1683 rack_saw_enobuf = counter_u64_alloc(M_WAITOK); 1684 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1685 SYSCTL_CHILDREN(rack_counters), 1686 OID_AUTO, "saw_enobufs", CTLFLAG_RD, 1687 &rack_saw_enobuf, 1688 "Total number of times a sends returned enobuf for non-hdwr paced connections"); 1689 rack_saw_enobuf_hw = counter_u64_alloc(M_WAITOK); 1690 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1691 SYSCTL_CHILDREN(rack_counters), 1692 OID_AUTO, "saw_enobufs_hw", CTLFLAG_RD, 1693 &rack_saw_enobuf_hw, 1694 "Total number of times a send returned enobuf for hdwr paced connections"); 1695 rack_saw_enetunreach = counter_u64_alloc(M_WAITOK); 1696 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1697 SYSCTL_CHILDREN(rack_counters), 1698 OID_AUTO, "saw_enetunreach", CTLFLAG_RD, 1699 &rack_saw_enetunreach, 1700 "Total number of times a send received a enetunreachable"); 1701 rack_hot_alloc = counter_u64_alloc(M_WAITOK); 1702 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1703 SYSCTL_CHILDREN(rack_counters), 1704 OID_AUTO, "alloc_hot", CTLFLAG_RD, 1705 &rack_hot_alloc, 1706 "Total allocations 
from the top of our list"); 1707 rack_to_alloc = counter_u64_alloc(M_WAITOK); 1708 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1709 SYSCTL_CHILDREN(rack_counters), 1710 OID_AUTO, "allocs", CTLFLAG_RD, 1711 &rack_to_alloc, 1712 "Total allocations of tracking structures"); 1713 rack_to_alloc_hard = counter_u64_alloc(M_WAITOK); 1714 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1715 SYSCTL_CHILDREN(rack_counters), 1716 OID_AUTO, "allochard", CTLFLAG_RD, 1717 &rack_to_alloc_hard, 1718 "Total allocations done with sleeping the hard way"); 1719 rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK); 1720 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1721 SYSCTL_CHILDREN(rack_counters), 1722 OID_AUTO, "allocemerg", CTLFLAG_RD, 1723 &rack_to_alloc_emerg, 1724 "Total allocations done from emergency cache"); 1725 rack_to_alloc_limited = counter_u64_alloc(M_WAITOK); 1726 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1727 SYSCTL_CHILDREN(rack_counters), 1728 OID_AUTO, "alloc_limited", CTLFLAG_RD, 1729 &rack_to_alloc_limited, 1730 "Total allocations dropped due to limit"); 1731 rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK); 1732 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1733 SYSCTL_CHILDREN(rack_counters), 1734 OID_AUTO, "alloc_limited_conns", CTLFLAG_RD, 1735 &rack_alloc_limited_conns, 1736 "Connections with allocations dropped due to limit"); 1737 rack_split_limited = counter_u64_alloc(M_WAITOK); 1738 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1739 SYSCTL_CHILDREN(rack_counters), 1740 OID_AUTO, "split_limited", CTLFLAG_RD, 1741 &rack_split_limited, 1742 "Split allocations dropped due to limit"); 1743 1744 for (i = 0; i < MAX_NUM_OF_CNTS; i++) { 1745 char name[32]; 1746 sprintf(name, "cmp_ack_cnt_%d", i); 1747 rack_proc_comp_ack[i] = counter_u64_alloc(M_WAITOK); 1748 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1749 SYSCTL_CHILDREN(rack_counters), 1750 OID_AUTO, name, CTLFLAG_RD, 1751 &rack_proc_comp_ack[i], 1752 "Number of compressed acks we processed"); 1753 } 1754 rack_large_ackcmp = counter_u64_alloc(M_WAITOK); 1755 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1756 SYSCTL_CHILDREN(rack_counters), 1757 OID_AUTO, "cmp_large_mbufs", CTLFLAG_RD, 1758 &rack_large_ackcmp, 1759 "Number of TCP connections with large mbuf's for compressed acks"); 1760 rack_small_ackcmp = counter_u64_alloc(M_WAITOK); 1761 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1762 SYSCTL_CHILDREN(rack_counters), 1763 OID_AUTO, "cmp_small_mbufs", CTLFLAG_RD, 1764 &rack_small_ackcmp, 1765 "Number of TCP connections with small mbuf's for compressed acks"); 1766 #ifdef INVARIANTS 1767 rack_adjust_map_bw = counter_u64_alloc(M_WAITOK); 1768 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1769 SYSCTL_CHILDREN(rack_counters), 1770 OID_AUTO, "map_adjust_req", CTLFLAG_RD, 1771 &rack_adjust_map_bw, 1772 "Number of times we hit the case where the sb went up and down on a sendmap entry"); 1773 #endif 1774 rack_multi_single_eq = counter_u64_alloc(M_WAITOK); 1775 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1776 SYSCTL_CHILDREN(rack_counters), 1777 OID_AUTO, "cmp_ack_equiv", CTLFLAG_RD, 1778 &rack_multi_single_eq, 1779 "Number of compressed acks total represented"); 1780 rack_proc_non_comp_ack = counter_u64_alloc(M_WAITOK); 1781 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1782 SYSCTL_CHILDREN(rack_counters), 1783 OID_AUTO, "cmp_ack_not", CTLFLAG_RD, 1784 &rack_proc_non_comp_ack, 1785 "Number of non compresseds acks that we processed"); 1786 1787 1788 rack_sack_proc_all = counter_u64_alloc(M_WAITOK); 1789 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1790 SYSCTL_CHILDREN(rack_counters), 1791 
OID_AUTO, "sack_long", CTLFLAG_RD, 1792 &rack_sack_proc_all, 1793 "Total times we had to walk whole list for sack processing"); 1794 rack_sack_proc_restart = counter_u64_alloc(M_WAITOK); 1795 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1796 SYSCTL_CHILDREN(rack_counters), 1797 OID_AUTO, "sack_restart", CTLFLAG_RD, 1798 &rack_sack_proc_restart, 1799 "Total times we had to walk whole list due to a restart"); 1800 rack_sack_proc_short = counter_u64_alloc(M_WAITOK); 1801 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1802 SYSCTL_CHILDREN(rack_counters), 1803 OID_AUTO, "sack_short", CTLFLAG_RD, 1804 &rack_sack_proc_short, 1805 "Total times we took shortcut for sack processing"); 1806 rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK); 1807 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1808 SYSCTL_CHILDREN(rack_counters), 1809 OID_AUTO, "tlp_calc_entered", CTLFLAG_RD, 1810 &rack_enter_tlp_calc, 1811 "Total times we called calc-tlp"); 1812 rack_used_tlpmethod = counter_u64_alloc(M_WAITOK); 1813 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1814 SYSCTL_CHILDREN(rack_counters), 1815 OID_AUTO, "hit_tlp_method", CTLFLAG_RD, 1816 &rack_used_tlpmethod, 1817 "Total number of runt sacks"); 1818 rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK); 1819 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1820 SYSCTL_CHILDREN(rack_counters), 1821 OID_AUTO, "hit_tlp_method2", CTLFLAG_RD, 1822 &rack_used_tlpmethod2, 1823 "Total number of times we hit TLP method 2"); 1824 rack_sack_skipped_acked = counter_u64_alloc(M_WAITOK); 1825 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1826 SYSCTL_CHILDREN(rack_attack), 1827 OID_AUTO, "skipacked", CTLFLAG_RD, 1828 &rack_sack_skipped_acked, 1829 "Total number of times we skipped previously sacked"); 1830 rack_sack_splits = counter_u64_alloc(M_WAITOK); 1831 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1832 SYSCTL_CHILDREN(rack_attack), 1833 OID_AUTO, "ofsplit", CTLFLAG_RD, 1834 &rack_sack_splits, 1835 "Total number of times we did the old fashion tree split"); 1836 rack_progress_drops = counter_u64_alloc(M_WAITOK); 1837 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1838 SYSCTL_CHILDREN(rack_counters), 1839 OID_AUTO, "prog_drops", CTLFLAG_RD, 1840 &rack_progress_drops, 1841 "Total number of progress drops"); 1842 rack_input_idle_reduces = counter_u64_alloc(M_WAITOK); 1843 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1844 SYSCTL_CHILDREN(rack_counters), 1845 OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD, 1846 &rack_input_idle_reduces, 1847 "Total number of idle reductions on input"); 1848 rack_collapsed_win = counter_u64_alloc(M_WAITOK); 1849 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1850 SYSCTL_CHILDREN(rack_counters), 1851 OID_AUTO, "collapsed_win", CTLFLAG_RD, 1852 &rack_collapsed_win, 1853 "Total number of collapsed windows"); 1854 rack_tlp_does_nada = counter_u64_alloc(M_WAITOK); 1855 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1856 SYSCTL_CHILDREN(rack_counters), 1857 OID_AUTO, "tlp_nada", CTLFLAG_RD, 1858 &rack_tlp_does_nada, 1859 "Total number of nada tlp calls"); 1860 rack_try_scwnd = counter_u64_alloc(M_WAITOK); 1861 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1862 SYSCTL_CHILDREN(rack_counters), 1863 OID_AUTO, "tried_scwnd", CTLFLAG_RD, 1864 &rack_try_scwnd, 1865 "Total number of scwnd attempts"); 1866 1867 rack_per_timer_hole = counter_u64_alloc(M_WAITOK); 1868 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1869 SYSCTL_CHILDREN(rack_counters), 1870 OID_AUTO, "timer_hole", CTLFLAG_RD, 1871 &rack_per_timer_hole, 1872 "Total persists start in timer hole"); 1873 1874 rack_sbsndptr_wrong = counter_u64_alloc(M_WAITOK); 1875 
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1876 SYSCTL_CHILDREN(rack_counters), 1877 OID_AUTO, "sndptr_wrong", CTLFLAG_RD, 1878 &rack_sbsndptr_wrong, "Total number of times the saved sbsndptr was incorrect"); 1879 rack_sbsndptr_right = counter_u64_alloc(M_WAITOK); 1880 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1881 SYSCTL_CHILDREN(rack_counters), 1882 OID_AUTO, "sndptr_right", CTLFLAG_RD, 1883 &rack_sbsndptr_right, "Total number of times the saved sbsndptr was correct"); 1884 1885 COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); 1886 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 1887 OID_AUTO, "outsize", CTLFLAG_RD, 1888 rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes"); 1889 COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK); 1890 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 1891 OID_AUTO, "opts", CTLFLAG_RD, 1892 rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats"); 1893 SYSCTL_ADD_PROC(&rack_sysctl_ctx, 1894 SYSCTL_CHILDREN(rack_sysctl_root), 1895 OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 1896 &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters"); 1897 } 1898 1899 static __inline int 1900 rb_map_cmp(struct rack_sendmap *b, struct rack_sendmap *a) 1901 { 1902 if (SEQ_GEQ(b->r_start, a->r_start) && 1903 SEQ_LT(b->r_start, a->r_end)) { 1904 /* 1905 * The entry b is within the 1906 * block a. i.e.: 1907 * a -- |-------------| 1908 * b -- |----| 1909 * <or> 1910 * b -- |------| 1911 * <or> 1912 * b -- |-----------| 1913 */ 1914 return (0); 1915 } else if (SEQ_GEQ(b->r_start, a->r_end)) { 1916 /* 1917 * b falls at or after the 1918 * end of a, so a 1919 * is said to be smaller than b. 1920 * i.e: 1921 * a -- |------| 1922 * b -- |--------| 1923 * or 1924 * b -- |-----| 1925 */ 1926 return (1); 1927 } 1928 /* 1929 * What's left is where a is 1930 * larger than b. i.e: 1931 * a -- |-------| 1932 * b -- |---| 1933 * or even possibly 1934 * b -- |--------------| 1935 */ 1936 return (-1); 1937 } 1938 1939 RB_PROTOTYPE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp); 1940 RB_GENERATE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp); 1941 1942 static uint32_t 1943 rc_init_window(struct tcp_rack *rack) 1944 { 1945 uint32_t win; 1946 1947 if (rack->rc_init_win == 0) { 1948 /* 1949 * Nothing set by the user, use the system stack 1950 * default. 1951 */ 1952 return (tcp_compute_initwnd(tcp_maxseg(rack->rc_tp))); 1953 } 1954 win = ctf_fixed_maxseg(rack->rc_tp) * rack->rc_init_win; 1955 return (win); 1956 } 1957 1958 static uint64_t 1959 rack_get_fixed_pacing_bw(struct tcp_rack *rack) 1960 { 1961 if (IN_FASTRECOVERY(rack->rc_tp->t_flags)) 1962 return (rack->r_ctl.rc_fixed_pacing_rate_rec); 1963 else if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) 1964 return (rack->r_ctl.rc_fixed_pacing_rate_ss); 1965 else 1966 return (rack->r_ctl.rc_fixed_pacing_rate_ca); 1967 } 1968 1969 static uint64_t 1970 rack_get_bw(struct tcp_rack *rack) 1971 { 1972 if (rack->use_fixed_rate) { 1973 /* Return the fixed pacing rate */ 1974 return (rack_get_fixed_pacing_bw(rack)); 1975 } 1976 if (rack->r_ctl.gp_bw == 0) { 1977 /* 1978 * We have no b/w measurement yet, 1979 * if we have a user set initial bw 1980 * return it. If we don't have that and 1981 * we have an srtt, use the tcp IW (10) to 1982 * calculate a fictional b/w over the SRTT 1983 * which is more or less a guess.
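* (For example, assuming a 1448 byte MSS the IW guess is about 10 * 1448 = 14480 bytes, so with a 50000 usec SRTT this works out to roughly 14480 * 1000000 / 50000 = 289600 bytes/sec.)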
Note 1984 * we don't use our IW from rack on purpose 1985 * so if we have like IW=30, we are not 1986 * calculating a "huge" b/w. 1987 */ 1988 uint64_t bw, srtt; 1989 if (rack->r_ctl.init_rate) 1990 return (rack->r_ctl.init_rate); 1991 1992 /* Has the user set a max peak rate? */ 1993 #ifdef NETFLIX_PEAKRATE 1994 if (rack->rc_tp->t_maxpeakrate) 1995 return (rack->rc_tp->t_maxpeakrate); 1996 #endif 1997 /* Ok lets come up with the IW guess, if we have a srtt */ 1998 if (rack->rc_tp->t_srtt == 0) { 1999 /* 2000 * Go with old pacing method 2001 * i.e. burst mitigation only. 2002 */ 2003 return (0); 2004 } 2005 /* Ok lets get the initial TCP win (not racks) */ 2006 bw = tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)); 2007 srtt = (uint64_t)rack->rc_tp->t_srtt; 2008 bw *= (uint64_t)USECS_IN_SECOND; 2009 bw /= srtt; 2010 if (rack->r_ctl.bw_rate_cap && (bw > rack->r_ctl.bw_rate_cap)) 2011 bw = rack->r_ctl.bw_rate_cap; 2012 return (bw); 2013 } else { 2014 uint64_t bw; 2015 2016 if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) { 2017 /* Averaging is done, we can return the value */ 2018 bw = rack->r_ctl.gp_bw; 2019 } else { 2020 /* Still doing initial average must calculate */ 2021 bw = rack->r_ctl.gp_bw / rack->r_ctl.num_measurements; 2022 } 2023 #ifdef NETFLIX_PEAKRATE 2024 if ((rack->rc_tp->t_maxpeakrate) && 2025 (bw > rack->rc_tp->t_maxpeakrate)) { 2026 /* The user has set a peak rate to pace at 2027 * don't allow us to pace faster than that. 2028 */ 2029 return (rack->rc_tp->t_maxpeakrate); 2030 } 2031 #endif 2032 if (rack->r_ctl.bw_rate_cap && (bw > rack->r_ctl.bw_rate_cap)) 2033 bw = rack->r_ctl.bw_rate_cap; 2034 return (bw); 2035 } 2036 } 2037 2038 static uint16_t 2039 rack_get_output_gain(struct tcp_rack *rack, struct rack_sendmap *rsm) 2040 { 2041 if (rack->use_fixed_rate) { 2042 return (100); 2043 } else if (rack->in_probe_rtt && (rsm == NULL)) 2044 return (rack->r_ctl.rack_per_of_gp_probertt); 2045 else if ((IN_FASTRECOVERY(rack->rc_tp->t_flags) && 2046 rack->r_ctl.rack_per_of_gp_rec)) { 2047 if (rsm) { 2048 /* a retransmission always use the recovery rate */ 2049 return (rack->r_ctl.rack_per_of_gp_rec); 2050 } else if (rack->rack_rec_nonrxt_use_cr) { 2051 /* Directed to use the configured rate */ 2052 goto configured_rate; 2053 } else if (rack->rack_no_prr && 2054 (rack->r_ctl.rack_per_of_gp_rec > 100)) { 2055 /* No PRR, lets just use the b/w estimate only */ 2056 return (100); 2057 } else { 2058 /* 2059 * Here we may have a non-retransmit but we 2060 * have no overrides, so just use the recovery 2061 * rate (prr is in effect). 
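* (The non-recovery, non-probe-rtt cases end up at the configured_rate label below, where the ss vs ca gain is chosen by comparing cwnd_to_use with snd_ssthresh.)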
2062 */ 2063 return (rack->r_ctl.rack_per_of_gp_rec); 2064 } 2065 } 2066 configured_rate: 2067 /* For the configured rate we look at our cwnd vs the ssthresh */ 2068 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) 2069 return (rack->r_ctl.rack_per_of_gp_ss); 2070 else 2071 return (rack->r_ctl.rack_per_of_gp_ca); 2072 } 2073 2074 static void 2075 rack_log_hdwr_pacing(struct tcp_rack *rack, 2076 uint64_t rate, uint64_t hw_rate, int line, 2077 int error, uint16_t mod) 2078 { 2079 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2080 union tcp_log_stackspecific log; 2081 struct timeval tv; 2082 const struct ifnet *ifp; 2083 2084 memset(&log, 0, sizeof(log)); 2085 log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff); 2086 log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff); 2087 if (rack->r_ctl.crte) { 2088 ifp = rack->r_ctl.crte->ptbl->rs_ifp; 2089 } else if (rack->rc_inp->inp_route.ro_nh && 2090 rack->rc_inp->inp_route.ro_nh->nh_ifp) { 2091 ifp = rack->rc_inp->inp_route.ro_nh->nh_ifp; 2092 } else 2093 ifp = NULL; 2094 if (ifp) { 2095 log.u_bbr.flex3 = (((uint64_t)ifp >> 32) & 0x00000000ffffffff); 2096 log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff); 2097 } 2098 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2099 log.u_bbr.bw_inuse = rate; 2100 log.u_bbr.flex5 = line; 2101 log.u_bbr.flex6 = error; 2102 log.u_bbr.flex7 = mod; 2103 log.u_bbr.applimited = rack->r_ctl.rc_pace_max_segs; 2104 log.u_bbr.flex8 = rack->use_fixed_rate; 2105 log.u_bbr.flex8 <<= 1; 2106 log.u_bbr.flex8 |= rack->rack_hdrw_pacing; 2107 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; 2108 log.u_bbr.delRate = rack->r_ctl.crte_prev_rate; 2109 if (rack->r_ctl.crte) 2110 log.u_bbr.cur_del_rate = rack->r_ctl.crte->rate; 2111 else 2112 log.u_bbr.cur_del_rate = 0; 2113 log.u_bbr.rttProp = rack->r_ctl.last_hw_bw_req; 2114 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2115 &rack->rc_inp->inp_socket->so_rcv, 2116 &rack->rc_inp->inp_socket->so_snd, 2117 BBR_LOG_HDWR_PACE, 0, 2118 0, &log, false, &tv); 2119 } 2120 } 2121 2122 static uint64_t 2123 rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm, int *capped) 2124 { 2125 /* 2126 * We allow rack_per_of_gp_xx to dictate our bw rate we want. 2127 */ 2128 uint64_t bw_est, high_rate; 2129 uint64_t gain; 2130 2131 gain = (uint64_t)rack_get_output_gain(rack, rsm); 2132 bw_est = bw * gain; 2133 bw_est /= (uint64_t)100; 2134 /* Never fall below the minimum (def 64kbps) */ 2135 if (bw_est < RACK_MIN_BW) 2136 bw_est = RACK_MIN_BW; 2137 if (rack->r_rack_hw_rate_caps) { 2138 /* Rate caps are in place */ 2139 if (rack->r_ctl.crte != NULL) { 2140 /* We have a hdwr rate already */ 2141 high_rate = tcp_hw_highest_rate(rack->r_ctl.crte); 2142 if (bw_est >= high_rate) { 2143 /* We are capping bw at the highest rate table entry */ 2144 rack_log_hdwr_pacing(rack, 2145 bw_est, high_rate, __LINE__, 2146 0, 3); 2147 bw_est = high_rate; 2148 if (capped) 2149 *capped = 1; 2150 } 2151 } else if ((rack->rack_hdrw_pacing == 0) && 2152 (rack->rack_hdw_pace_ena) && 2153 (rack->rack_attempt_hdwr_pace == 0) && 2154 (rack->rc_inp->inp_route.ro_nh != NULL) && 2155 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 2156 /* 2157 * Special case, we have not yet attempted hardware 2158 * pacing, and yet we may, when we do, find out if we are 2159 * above the highest rate. We need to know the maxbw for the interface 2160 * in question (if it supports ratelimiting). We get back 2161 * a 0, if the interface is not found in the RL lists. 
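* A return of 0 therefore means no cap is known and bw_est is used as-is; otherwise we pre-clamp bw_est to the interface's highest rate.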
2162 */ 2163 high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp); 2164 if (high_rate) { 2165 /* Yep, we have a rate is it above this rate? */ 2166 if (bw_est > high_rate) { 2167 bw_est = high_rate; 2168 if (capped) 2169 *capped = 1; 2170 } 2171 } 2172 } 2173 } 2174 return (bw_est); 2175 } 2176 2177 static void 2178 rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod) 2179 { 2180 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2181 union tcp_log_stackspecific log; 2182 struct timeval tv; 2183 2184 if ((mod != 1) && (rack_verbose_logging == 0)) { 2185 /* 2186 * We get 3 values currently for mod 2187 * 1 - We are retransmitting and this tells the reason. 2188 * 2 - We are clearing a dup-ack count. 2189 * 3 - We are incrementing a dup-ack count. 2190 * 2191 * The clear/increment are only logged 2192 * if you have BBverbose on. 2193 */ 2194 return; 2195 } 2196 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2197 log.u_bbr.flex1 = tsused; 2198 log.u_bbr.flex2 = thresh; 2199 log.u_bbr.flex3 = rsm->r_flags; 2200 log.u_bbr.flex4 = rsm->r_dupack; 2201 log.u_bbr.flex5 = rsm->r_start; 2202 log.u_bbr.flex6 = rsm->r_end; 2203 log.u_bbr.flex8 = mod; 2204 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 2205 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 2206 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2207 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2208 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2209 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2210 log.u_bbr.pacing_gain = rack->r_must_retran; 2211 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2212 &rack->rc_inp->inp_socket->so_rcv, 2213 &rack->rc_inp->inp_socket->so_snd, 2214 BBR_LOG_SETTINGS_CHG, 0, 2215 0, &log, false, &tv); 2216 } 2217 } 2218 2219 static void 2220 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) 2221 { 2222 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2223 union tcp_log_stackspecific log; 2224 struct timeval tv; 2225 2226 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2227 log.u_bbr.flex1 = rack->rc_tp->t_srtt; 2228 log.u_bbr.flex2 = to; 2229 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; 2230 log.u_bbr.flex4 = slot; 2231 log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot; 2232 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 2233 log.u_bbr.flex7 = rack->rc_in_persist; 2234 log.u_bbr.flex8 = which; 2235 if (rack->rack_no_prr) 2236 log.u_bbr.pkts_out = 0; 2237 else 2238 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 2239 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 2240 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 2241 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2242 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2243 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2244 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2245 log.u_bbr.pacing_gain = rack->r_must_retran; 2246 log.u_bbr.lt_epoch = rack->rc_tp->t_rxtshift; 2247 log.u_bbr.lost = rack_rto_min; 2248 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2249 &rack->rc_inp->inp_socket->so_rcv, 2250 &rack->rc_inp->inp_socket->so_snd, 2251 BBR_LOG_TIMERSTAR, 0, 2252 0, &log, false, &tv); 2253 } 2254 } 2255 2256 static void 2257 rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm) 2258 { 2259 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2260 union tcp_log_stackspecific log; 2261 struct timeval tv; 2262 2263 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2264 log.u_bbr.inhpts = 
rack->rc_inp->inp_in_hpts; 2265 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 2266 log.u_bbr.flex8 = to_num; 2267 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt; 2268 log.u_bbr.flex2 = rack->rc_rack_rtt; 2269 if (rsm == NULL) 2270 log.u_bbr.flex3 = 0; 2271 else 2272 log.u_bbr.flex3 = rsm->r_end - rsm->r_start; 2273 if (rack->rack_no_prr) 2274 log.u_bbr.flex5 = 0; 2275 else 2276 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 2277 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2278 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2279 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2280 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2281 log.u_bbr.pacing_gain = rack->r_must_retran; 2282 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2283 &rack->rc_inp->inp_socket->so_rcv, 2284 &rack->rc_inp->inp_socket->so_snd, 2285 BBR_LOG_RTO, 0, 2286 0, &log, false, &tv); 2287 } 2288 } 2289 2290 static void 2291 rack_log_map_chg(struct tcpcb *tp, struct tcp_rack *rack, 2292 struct rack_sendmap *prev, 2293 struct rack_sendmap *rsm, 2294 struct rack_sendmap *next, 2295 int flag, uint32_t th_ack, int line) 2296 { 2297 if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { 2298 union tcp_log_stackspecific log; 2299 struct timeval tv; 2300 2301 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2302 log.u_bbr.flex8 = flag; 2303 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 2304 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 2305 log.u_bbr.cur_del_rate = (uint64_t)prev; 2306 log.u_bbr.delRate = (uint64_t)rsm; 2307 log.u_bbr.rttProp = (uint64_t)next; 2308 log.u_bbr.flex7 = 0; 2309 if (prev) { 2310 log.u_bbr.flex1 = prev->r_start; 2311 log.u_bbr.flex2 = prev->r_end; 2312 log.u_bbr.flex7 |= 0x4; 2313 } 2314 if (rsm) { 2315 log.u_bbr.flex3 = rsm->r_start; 2316 log.u_bbr.flex4 = rsm->r_end; 2317 log.u_bbr.flex7 |= 0x2; 2318 } 2319 if (next) { 2320 log.u_bbr.flex5 = next->r_start; 2321 log.u_bbr.flex6 = next->r_end; 2322 log.u_bbr.flex7 |= 0x1; 2323 } 2324 log.u_bbr.applimited = line; 2325 log.u_bbr.pkts_out = th_ack; 2326 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2327 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2328 if (rack->rack_no_prr) 2329 log.u_bbr.lost = 0; 2330 else 2331 log.u_bbr.lost = rack->r_ctl.rc_prr_sndcnt; 2332 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2333 &rack->rc_inp->inp_socket->so_rcv, 2334 &rack->rc_inp->inp_socket->so_snd, 2335 TCP_LOG_MAPCHG, 0, 2336 0, &log, false, &tv); 2337 } 2338 } 2339 2340 static void 2341 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t len, 2342 struct rack_sendmap *rsm, int conf) 2343 { 2344 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 2345 union tcp_log_stackspecific log; 2346 struct timeval tv; 2347 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2348 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 2349 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 2350 log.u_bbr.flex1 = t; 2351 log.u_bbr.flex2 = len; 2352 log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt; 2353 log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest; 2354 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest; 2355 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_us_rtrcnt; 2356 log.u_bbr.flex7 = conf; 2357 log.u_bbr.rttProp = (uint64_t)rack->r_ctl.rack_rs.rs_rtt_tot; 2358 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method; 2359 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2360 log.u_bbr.delivered = rack->r_ctl.rack_rs.rs_us_rtrcnt; 2361 log.u_bbr.pkts_out = rack->r_ctl.rack_rs.rs_flags; 2362 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, 
rack->r_ctl.rc_sacked); 2363 if (rsm) { 2364 log.u_bbr.pkt_epoch = rsm->r_start; 2365 log.u_bbr.lost = rsm->r_end; 2366 log.u_bbr.cwnd_gain = rsm->r_rtr_cnt; 2367 log.u_bbr.pacing_gain = rsm->r_flags; 2368 } else { 2369 /* Its a SYN */ 2370 log.u_bbr.pkt_epoch = rack->rc_tp->iss; 2371 log.u_bbr.lost = 0; 2372 log.u_bbr.cwnd_gain = 0; 2373 log.u_bbr.pacing_gain = 0; 2374 } 2375 /* Write out general bits of interest rrs here */ 2376 log.u_bbr.use_lt_bw = rack->rc_highly_buffered; 2377 log.u_bbr.use_lt_bw <<= 1; 2378 log.u_bbr.use_lt_bw |= rack->forced_ack; 2379 log.u_bbr.use_lt_bw <<= 1; 2380 log.u_bbr.use_lt_bw |= rack->rc_gp_dyn_mul; 2381 log.u_bbr.use_lt_bw <<= 1; 2382 log.u_bbr.use_lt_bw |= rack->in_probe_rtt; 2383 log.u_bbr.use_lt_bw <<= 1; 2384 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; 2385 log.u_bbr.use_lt_bw <<= 1; 2386 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set; 2387 log.u_bbr.use_lt_bw <<= 1; 2388 log.u_bbr.use_lt_bw |= rack->rc_gp_filled; 2389 log.u_bbr.use_lt_bw <<= 1; 2390 log.u_bbr.use_lt_bw |= rack->rc_dragged_bottom; 2391 log.u_bbr.applimited = rack->r_ctl.rc_target_probertt_flight; 2392 log.u_bbr.epoch = rack->r_ctl.rc_time_probertt_starts; 2393 log.u_bbr.lt_epoch = rack->r_ctl.rc_time_probertt_entered; 2394 log.u_bbr.cur_del_rate = rack->r_ctl.rc_lower_rtt_us_cts; 2395 log.u_bbr.delRate = rack->r_ctl.rc_gp_srtt; 2396 log.u_bbr.bw_inuse = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 2397 log.u_bbr.bw_inuse <<= 32; 2398 if (rsm) 2399 log.u_bbr.bw_inuse |= ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]); 2400 TCP_LOG_EVENTP(tp, NULL, 2401 &rack->rc_inp->inp_socket->so_rcv, 2402 &rack->rc_inp->inp_socket->so_snd, 2403 BBR_LOG_BBRRTT, 0, 2404 0, &log, false, &tv); 2405 2406 2407 } 2408 } 2409 2410 static void 2411 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt) 2412 { 2413 /* 2414 * Log the rtt sample we are 2415 * applying to the srtt algorithm in 2416 * useconds. 2417 */ 2418 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2419 union tcp_log_stackspecific log; 2420 struct timeval tv; 2421 2422 /* Convert our ms to a microsecond */ 2423 memset(&log, 0, sizeof(log)); 2424 log.u_bbr.flex1 = rtt; 2425 log.u_bbr.flex2 = rack->r_ctl.ack_count; 2426 log.u_bbr.flex3 = rack->r_ctl.sack_count; 2427 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; 2428 log.u_bbr.flex5 = rack->r_ctl.sack_moved_extra; 2429 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 2430 log.u_bbr.flex7 = 1; 2431 log.u_bbr.flex8 = rack->sack_attack_disable; 2432 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2433 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2434 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2435 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2436 log.u_bbr.pacing_gain = rack->r_must_retran; 2437 /* 2438 * We capture in delRate the upper 32 bits as 2439 * the confidence level we had declared, and the 2440 * lower 32 bits as the actual RTT using the arrival 2441 * timestamp. 
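* i.e. delRate = ((uint64_t)confidence << 32) | rs_us_rtt, assembled just below.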
2442 */ 2443 log.u_bbr.delRate = rack->r_ctl.rack_rs.confidence; 2444 log.u_bbr.delRate <<= 32; 2445 log.u_bbr.delRate |= rack->r_ctl.rack_rs.rs_us_rtt; 2446 /* Lets capture all the things that make up t_rtxcur */ 2447 log.u_bbr.applimited = rack_rto_min; 2448 log.u_bbr.epoch = rack_rto_max; 2449 log.u_bbr.lt_epoch = rack->r_ctl.timer_slop; 2450 log.u_bbr.lost = rack_rto_min; 2451 log.u_bbr.pkt_epoch = TICKS_2_USEC(tcp_rexmit_slop); 2452 log.u_bbr.rttProp = RACK_REXMTVAL(rack->rc_tp); 2453 log.u_bbr.bw_inuse = rack->r_ctl.act_rcv_time.tv_sec; 2454 log.u_bbr.bw_inuse *= HPTS_USEC_IN_SEC; 2455 log.u_bbr.bw_inuse += rack->r_ctl.act_rcv_time.tv_usec; 2456 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2457 &rack->rc_inp->inp_socket->so_rcv, 2458 &rack->rc_inp->inp_socket->so_snd, 2459 TCP_LOG_RTT, 0, 2460 0, &log, false, &tv); 2461 } 2462 } 2463 2464 static void 2465 rack_log_rtt_sample_calc(struct tcp_rack *rack, uint32_t rtt, uint32_t send_time, uint32_t ack_time, int where) 2466 { 2467 if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { 2468 union tcp_log_stackspecific log; 2469 struct timeval tv; 2470 2471 /* Convert our ms to a microsecond */ 2472 memset(&log, 0, sizeof(log)); 2473 log.u_bbr.flex1 = rtt; 2474 log.u_bbr.flex2 = send_time; 2475 log.u_bbr.flex3 = ack_time; 2476 log.u_bbr.flex4 = where; 2477 log.u_bbr.flex7 = 2; 2478 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2479 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2480 &rack->rc_inp->inp_socket->so_rcv, 2481 &rack->rc_inp->inp_socket->so_snd, 2482 TCP_LOG_RTT, 0, 2483 0, &log, false, &tv); 2484 } 2485 } 2486 2487 2488 2489 static inline void 2490 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line) 2491 { 2492 if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { 2493 union tcp_log_stackspecific log; 2494 struct timeval tv; 2495 2496 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2497 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 2498 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 2499 log.u_bbr.flex1 = line; 2500 log.u_bbr.flex2 = tick; 2501 log.u_bbr.flex3 = tp->t_maxunacktime; 2502 log.u_bbr.flex4 = tp->t_acktime; 2503 log.u_bbr.flex8 = event; 2504 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2505 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2506 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2507 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2508 log.u_bbr.pacing_gain = rack->r_must_retran; 2509 TCP_LOG_EVENTP(tp, NULL, 2510 &rack->rc_inp->inp_socket->so_rcv, 2511 &rack->rc_inp->inp_socket->so_snd, 2512 BBR_LOG_PROGRESS, 0, 2513 0, &log, false, &tv); 2514 } 2515 } 2516 2517 static void 2518 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv) 2519 { 2520 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2521 union tcp_log_stackspecific log; 2522 2523 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2524 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 2525 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 2526 log.u_bbr.flex1 = slot; 2527 if (rack->rack_no_prr) 2528 log.u_bbr.flex2 = 0; 2529 else 2530 log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt; 2531 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags); 2532 log.u_bbr.flex8 = rack->rc_in_persist; 2533 log.u_bbr.timeStamp = cts; 2534 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2535 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2536 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2537 
log.u_bbr.pacing_gain = rack->r_must_retran; 2538 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2539 &rack->rc_inp->inp_socket->so_rcv, 2540 &rack->rc_inp->inp_socket->so_snd, 2541 BBR_LOG_BBRSND, 0, 2542 0, &log, false, tv); 2543 } 2544 } 2545 2546 static void 2547 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out, int nsegs) 2548 { 2549 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2550 union tcp_log_stackspecific log; 2551 struct timeval tv; 2552 2553 memset(&log, 0, sizeof(log)); 2554 log.u_bbr.flex1 = did_out; 2555 log.u_bbr.flex2 = nxt_pkt; 2556 log.u_bbr.flex3 = way_out; 2557 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 2558 if (rack->rack_no_prr) 2559 log.u_bbr.flex5 = 0; 2560 else 2561 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 2562 log.u_bbr.flex6 = nsegs; 2563 log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs; 2564 log.u_bbr.flex7 = rack->rc_ack_can_sendout_data; /* Do we have ack-can-send set */ 2565 log.u_bbr.flex7 <<= 1; 2566 log.u_bbr.flex7 |= rack->r_fast_output; /* is fast output primed */ 2567 log.u_bbr.flex7 <<= 1; 2568 log.u_bbr.flex7 |= rack->r_wanted_output; /* Do we want output */ 2569 log.u_bbr.flex8 = rack->rc_in_persist; 2570 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 2571 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2572 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2573 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 2574 log.u_bbr.use_lt_bw <<= 1; 2575 log.u_bbr.use_lt_bw |= rack->r_might_revert; 2576 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2577 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2578 log.u_bbr.pacing_gain = rack->r_must_retran; 2579 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2580 &rack->rc_inp->inp_socket->so_rcv, 2581 &rack->rc_inp->inp_socket->so_snd, 2582 BBR_LOG_DOSEG_DONE, 0, 2583 0, &log, false, &tv); 2584 } 2585 } 2586 2587 static void 2588 rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint8_t frm) 2589 { 2590 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 2591 union tcp_log_stackspecific log; 2592 struct timeval tv; 2593 uint32_t cts; 2594 2595 memset(&log, 0, sizeof(log)); 2596 cts = tcp_get_usecs(&tv); 2597 log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs; 2598 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 2599 log.u_bbr.flex4 = arg1; 2600 log.u_bbr.flex5 = arg2; 2601 log.u_bbr.flex6 = arg3; 2602 log.u_bbr.flex8 = frm; 2603 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2604 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2605 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2606 log.u_bbr.applimited = rack->r_ctl.rc_sacked; 2607 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2608 log.u_bbr.pacing_gain = rack->r_must_retran; 2609 TCP_LOG_EVENTP(tp, NULL, 2610 &tp->t_inpcb->inp_socket->so_rcv, 2611 &tp->t_inpcb->inp_socket->so_snd, 2612 TCP_HDWR_PACE_SIZE, 0, 2613 0, &log, false, &tv); 2614 } 2615 } 2616 2617 static void 2618 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, 2619 uint8_t hpts_calling, int reason, uint32_t cwnd_to_use) 2620 { 2621 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2622 union tcp_log_stackspecific log; 2623 struct timeval tv; 2624 2625 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2626 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 2627 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 2628 log.u_bbr.flex1 = slot; 2629 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; 2630 log.u_bbr.flex4 = reason; 2631 if 
(rack->rack_no_prr) 2632 log.u_bbr.flex5 = 0; 2633 else 2634 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 2635 log.u_bbr.flex7 = hpts_calling; 2636 log.u_bbr.flex8 = rack->rc_in_persist; 2637 log.u_bbr.lt_epoch = cwnd_to_use; 2638 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2639 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2640 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2641 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2642 log.u_bbr.pacing_gain = rack->r_must_retran; 2643 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2644 &rack->rc_inp->inp_socket->so_rcv, 2645 &rack->rc_inp->inp_socket->so_snd, 2646 BBR_LOG_JUSTRET, 0, 2647 tlen, &log, false, &tv); 2648 } 2649 } 2650 2651 static void 2652 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32_t us_cts, 2653 struct timeval *tv, uint32_t flags_on_entry) 2654 { 2655 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2656 union tcp_log_stackspecific log; 2657 2658 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2659 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 2660 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 2661 log.u_bbr.flex1 = line; 2662 log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to; 2663 log.u_bbr.flex3 = flags_on_entry; 2664 log.u_bbr.flex4 = us_cts; 2665 if (rack->rack_no_prr) 2666 log.u_bbr.flex5 = 0; 2667 else 2668 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 2669 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 2670 log.u_bbr.flex7 = hpts_removed; 2671 log.u_bbr.flex8 = 1; 2672 log.u_bbr.applimited = rack->r_ctl.rc_hpts_flags; 2673 log.u_bbr.timeStamp = us_cts; 2674 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2675 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2676 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2677 log.u_bbr.pacing_gain = rack->r_must_retran; 2678 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2679 &rack->rc_inp->inp_socket->so_rcv, 2680 &rack->rc_inp->inp_socket->so_snd, 2681 BBR_LOG_TIMERCANC, 0, 2682 0, &log, false, tv); 2683 } 2684 } 2685 2686 static void 2687 rack_log_alt_to_to_cancel(struct tcp_rack *rack, 2688 uint32_t flex1, uint32_t flex2, 2689 uint32_t flex3, uint32_t flex4, 2690 uint32_t flex5, uint32_t flex6, 2691 uint16_t flex7, uint8_t mod) 2692 { 2693 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2694 union tcp_log_stackspecific log; 2695 struct timeval tv; 2696 2697 if (mod == 1) { 2698 /* No you can't use 1, its for the real to cancel */ 2699 return; 2700 } 2701 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2702 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2703 log.u_bbr.flex1 = flex1; 2704 log.u_bbr.flex2 = flex2; 2705 log.u_bbr.flex3 = flex3; 2706 log.u_bbr.flex4 = flex4; 2707 log.u_bbr.flex5 = flex5; 2708 log.u_bbr.flex6 = flex6; 2709 log.u_bbr.flex7 = flex7; 2710 log.u_bbr.flex8 = mod; 2711 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2712 &rack->rc_inp->inp_socket->so_rcv, 2713 &rack->rc_inp->inp_socket->so_snd, 2714 BBR_LOG_TIMERCANC, 0, 2715 0, &log, false, &tv); 2716 } 2717 } 2718 2719 static void 2720 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers) 2721 { 2722 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2723 union tcp_log_stackspecific log; 2724 struct timeval tv; 2725 2726 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2727 log.u_bbr.flex1 = timers; 2728 log.u_bbr.flex2 = ret; 2729 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp; 2730 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 2731 log.u_bbr.flex5 = cts; 2732 if (rack->rack_no_prr) 2733 log.u_bbr.flex6 = 0; 2734 else 2735 log.u_bbr.flex6 
= rack->r_ctl.rc_prr_sndcnt; 2736 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2737 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2738 log.u_bbr.pacing_gain = rack->r_must_retran; 2739 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2740 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2741 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2742 &rack->rc_inp->inp_socket->so_rcv, 2743 &rack->rc_inp->inp_socket->so_snd, 2744 BBR_LOG_TO_PROCESS, 0, 2745 0, &log, false, &tv); 2746 } 2747 } 2748 2749 static void 2750 rack_log_to_prr(struct tcp_rack *rack, int frm, int orig_cwnd) 2751 { 2752 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2753 union tcp_log_stackspecific log; 2754 struct timeval tv; 2755 2756 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2757 log.u_bbr.flex1 = rack->r_ctl.rc_prr_out; 2758 log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs; 2759 if (rack->rack_no_prr) 2760 log.u_bbr.flex3 = 0; 2761 else 2762 log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt; 2763 log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered; 2764 log.u_bbr.flex5 = rack->r_ctl.rc_sacked; 2765 log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt; 2766 log.u_bbr.flex8 = frm; 2767 log.u_bbr.pkts_out = orig_cwnd; 2768 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2769 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2770 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 2771 log.u_bbr.use_lt_bw <<= 1; 2772 log.u_bbr.use_lt_bw |= rack->r_might_revert; 2773 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2774 &rack->rc_inp->inp_socket->so_rcv, 2775 &rack->rc_inp->inp_socket->so_snd, 2776 BBR_LOG_BBRUPD, 0, 2777 0, &log, false, &tv); 2778 } 2779 } 2780 2781 #ifdef NETFLIX_EXP_DETECTION 2782 static void 2783 rack_log_sad(struct tcp_rack *rack, int event) 2784 { 2785 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2786 union tcp_log_stackspecific log; 2787 struct timeval tv; 2788 2789 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2790 log.u_bbr.flex1 = rack->r_ctl.sack_count; 2791 log.u_bbr.flex2 = rack->r_ctl.ack_count; 2792 log.u_bbr.flex3 = rack->r_ctl.sack_moved_extra; 2793 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; 2794 log.u_bbr.flex5 = rack->r_ctl.rc_num_maps_alloced; 2795 log.u_bbr.flex6 = tcp_sack_to_ack_thresh; 2796 log.u_bbr.pkts_out = tcp_sack_to_move_thresh; 2797 log.u_bbr.lt_epoch = (tcp_force_detection << 8); 2798 log.u_bbr.lt_epoch |= rack->do_detection; 2799 log.u_bbr.applimited = tcp_map_minimum; 2800 log.u_bbr.flex7 = rack->sack_attack_disable; 2801 log.u_bbr.flex8 = event; 2802 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2803 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2804 log.u_bbr.delivered = tcp_sad_decay_val; 2805 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2806 &rack->rc_inp->inp_socket->so_rcv, 2807 &rack->rc_inp->inp_socket->so_snd, 2808 TCP_SAD_DETECTION, 0, 2809 0, &log, false, &tv); 2810 } 2811 } 2812 #endif 2813 2814 static void 2815 rack_counter_destroy(void) 2816 { 2817 int i; 2818 2819 counter_u64_free(rack_fto_send); 2820 counter_u64_free(rack_fto_rsm_send); 2821 counter_u64_free(rack_nfto_resend); 2822 counter_u64_free(rack_hw_pace_init_fail); 2823 counter_u64_free(rack_hw_pace_lost); 2824 counter_u64_free(rack_non_fto_send); 2825 counter_u64_free(rack_extended_rfo); 2826 counter_u64_free(rack_ack_total); 2827 counter_u64_free(rack_express_sack); 2828 counter_u64_free(rack_sack_total); 2829 counter_u64_free(rack_move_none); 2830 counter_u64_free(rack_move_some); 2831 counter_u64_free(rack_sack_attacks_detected); 2832 counter_u64_free(rack_sack_attacks_reversed); 2833 
counter_u64_free(rack_sack_used_next_merge); 2834 counter_u64_free(rack_sack_used_prev_merge); 2835 counter_u64_free(rack_badfr); 2836 counter_u64_free(rack_badfr_bytes); 2837 counter_u64_free(rack_rtm_prr_retran); 2838 counter_u64_free(rack_rtm_prr_newdata); 2839 counter_u64_free(rack_timestamp_mismatch); 2840 counter_u64_free(rack_find_high); 2841 counter_u64_free(rack_reorder_seen); 2842 counter_u64_free(rack_tlp_tot); 2843 counter_u64_free(rack_tlp_newdata); 2844 counter_u64_free(rack_tlp_retran); 2845 counter_u64_free(rack_tlp_retran_bytes); 2846 counter_u64_free(rack_tlp_retran_fail); 2847 counter_u64_free(rack_to_tot); 2848 counter_u64_free(rack_to_arm_rack); 2849 counter_u64_free(rack_to_arm_tlp); 2850 counter_u64_free(rack_calc_zero); 2851 counter_u64_free(rack_calc_nonzero); 2852 counter_u64_free(rack_paced_segments); 2853 counter_u64_free(rack_unpaced_segments); 2854 counter_u64_free(rack_saw_enobuf); 2855 counter_u64_free(rack_saw_enobuf_hw); 2856 counter_u64_free(rack_saw_enetunreach); 2857 counter_u64_free(rack_hot_alloc); 2858 counter_u64_free(rack_to_alloc); 2859 counter_u64_free(rack_to_alloc_hard); 2860 counter_u64_free(rack_to_alloc_emerg); 2861 counter_u64_free(rack_to_alloc_limited); 2862 counter_u64_free(rack_alloc_limited_conns); 2863 counter_u64_free(rack_split_limited); 2864 for (i = 0; i < MAX_NUM_OF_CNTS; i++) { 2865 counter_u64_free(rack_proc_comp_ack[i]); 2866 } 2867 counter_u64_free(rack_multi_single_eq); 2868 counter_u64_free(rack_proc_non_comp_ack); 2869 counter_u64_free(rack_sack_proc_all); 2870 counter_u64_free(rack_sack_proc_restart); 2871 counter_u64_free(rack_sack_proc_short); 2872 counter_u64_free(rack_enter_tlp_calc); 2873 counter_u64_free(rack_used_tlpmethod); 2874 counter_u64_free(rack_used_tlpmethod2); 2875 counter_u64_free(rack_sack_skipped_acked); 2876 counter_u64_free(rack_sack_splits); 2877 counter_u64_free(rack_progress_drops); 2878 counter_u64_free(rack_input_idle_reduces); 2879 counter_u64_free(rack_collapsed_win); 2880 counter_u64_free(rack_tlp_does_nada); 2881 counter_u64_free(rack_try_scwnd); 2882 counter_u64_free(rack_per_timer_hole); 2883 counter_u64_free(rack_large_ackcmp); 2884 counter_u64_free(rack_small_ackcmp); 2885 #ifdef INVARIANTS 2886 counter_u64_free(rack_adjust_map_bw); 2887 #endif 2888 COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE); 2889 COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE); 2890 } 2891 2892 static struct rack_sendmap * 2893 rack_alloc(struct tcp_rack *rack) 2894 { 2895 struct rack_sendmap *rsm; 2896 2897 /* 2898 * First get the top of the list it in 2899 * theory is the "hottest" rsm we have, 2900 * possibly just freed by ack processing. 2901 */ 2902 if (rack->rc_free_cnt > rack_free_cache) { 2903 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 2904 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 2905 counter_u64_add(rack_hot_alloc, 1); 2906 rack->rc_free_cnt--; 2907 return (rsm); 2908 } 2909 /* 2910 * Once we get under our free cache we probably 2911 * no longer have a "hot" one available. Lets 2912 * get one from UMA. 2913 */ 2914 rsm = uma_zalloc(rack_zone, M_NOWAIT); 2915 if (rsm) { 2916 rack->r_ctl.rc_num_maps_alloced++; 2917 counter_u64_add(rack_to_alloc, 1); 2918 return (rsm); 2919 } 2920 /* 2921 * Dig in to our aux rsm's (the last two) since 2922 * UMA failed to get us one. 
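* These are the free-list entries we normally hold back (rc_free_cnt at or below rack_free_cache) as an emergency reserve.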
2923 */ 2924 if (rack->rc_free_cnt) { 2925 counter_u64_add(rack_to_alloc_emerg, 1); 2926 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 2927 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 2928 rack->rc_free_cnt--; 2929 return (rsm); 2930 } 2931 return (NULL); 2932 } 2933 2934 static struct rack_sendmap * 2935 rack_alloc_full_limit(struct tcp_rack *rack) 2936 { 2937 if ((V_tcp_map_entries_limit > 0) && 2938 (rack->do_detection == 0) && 2939 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 2940 counter_u64_add(rack_to_alloc_limited, 1); 2941 if (!rack->alloc_limit_reported) { 2942 rack->alloc_limit_reported = 1; 2943 counter_u64_add(rack_alloc_limited_conns, 1); 2944 } 2945 return (NULL); 2946 } 2947 return (rack_alloc(rack)); 2948 } 2949 2950 /* wrapper to allocate a sendmap entry, subject to a specific limit */ 2951 static struct rack_sendmap * 2952 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type) 2953 { 2954 struct rack_sendmap *rsm; 2955 2956 if (limit_type) { 2957 /* currently there is only one limit type */ 2958 if (V_tcp_map_split_limit > 0 && 2959 (rack->do_detection == 0) && 2960 rack->r_ctl.rc_num_split_allocs >= V_tcp_map_split_limit) { 2961 counter_u64_add(rack_split_limited, 1); 2962 if (!rack->alloc_limit_reported) { 2963 rack->alloc_limit_reported = 1; 2964 counter_u64_add(rack_alloc_limited_conns, 1); 2965 } 2966 return (NULL); 2967 } 2968 } 2969 2970 /* allocate and mark in the limit type, if set */ 2971 rsm = rack_alloc(rack); 2972 if (rsm != NULL && limit_type) { 2973 rsm->r_limit_type = limit_type; 2974 rack->r_ctl.rc_num_split_allocs++; 2975 } 2976 return (rsm); 2977 } 2978 2979 static void 2980 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) 2981 { 2982 if (rsm->r_flags & RACK_APP_LIMITED) { 2983 if (rack->r_ctl.rc_app_limited_cnt > 0) { 2984 rack->r_ctl.rc_app_limited_cnt--; 2985 } 2986 } 2987 if (rsm->r_limit_type) { 2988 /* currently there is only one limit type */ 2989 rack->r_ctl.rc_num_split_allocs--; 2990 } 2991 if (rsm == rack->r_ctl.rc_first_appl) { 2992 if (rack->r_ctl.rc_app_limited_cnt == 0) 2993 rack->r_ctl.rc_first_appl = NULL; 2994 else { 2995 /* Follow the next one out */ 2996 struct rack_sendmap fe; 2997 2998 fe.r_start = rsm->r_nseq_appl; 2999 rack->r_ctl.rc_first_appl = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 3000 } 3001 } 3002 if (rsm == rack->r_ctl.rc_resend) 3003 rack->r_ctl.rc_resend = NULL; 3004 if (rsm == rack->r_ctl.rc_rsm_at_retran) 3005 rack->r_ctl.rc_rsm_at_retran = NULL; 3006 if (rsm == rack->r_ctl.rc_end_appl) 3007 rack->r_ctl.rc_end_appl = NULL; 3008 if (rack->r_ctl.rc_tlpsend == rsm) 3009 rack->r_ctl.rc_tlpsend = NULL; 3010 if (rack->r_ctl.rc_sacklast == rsm) 3011 rack->r_ctl.rc_sacklast = NULL; 3012 memset(rsm, 0, sizeof(struct rack_sendmap)); 3013 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_free, rsm, r_tnext); 3014 rack->rc_free_cnt++; 3015 } 3016 3017 static void 3018 rack_free_trim(struct tcp_rack *rack) 3019 { 3020 struct rack_sendmap *rsm; 3021 3022 /* 3023 * Free up all the tail entries until 3024 * we get our list down to the limit. 
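 * We trim from the tail because rack_free() inserts at the head,
 * so the most recently freed (and most likely still cache warm)
 * entries stay up front for rack_alloc() to hand out first.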
3025 */ 3026 while (rack->rc_free_cnt > rack_free_cache) { 3027 rsm = TAILQ_LAST(&rack->r_ctl.rc_free, rack_head); 3028 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 3029 rack->rc_free_cnt--; 3030 uma_zfree(rack_zone, rsm); 3031 } 3032 } 3033 3034 3035 static uint32_t 3036 rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack) 3037 { 3038 uint64_t srtt, bw, len, tim; 3039 uint32_t segsiz, def_len, minl; 3040 3041 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 3042 def_len = rack_def_data_window * segsiz; 3043 if (rack->rc_gp_filled == 0) { 3044 /* 3045 * We have no measurement (IW is in flight?) so 3046 * we can only guess using our data_window sysctl 3047 * value (usually 20MSS). 3048 */ 3049 return (def_len); 3050 } 3051 /* 3052 * Now we have a number of factors to consider. 3053 * 3054 * 1) We have a desired BDP which is usually 3055 * at least 2. 3056 * 2) We have a minimum number of rtt's usually 1 SRTT 3057 * but we allow it too to be more. 3058 * 3) We want to make sure a measurement last N useconds (if 3059 * we have set rack_min_measure_usec. 3060 * 3061 * We handle the first concern here by trying to create a data 3062 * window of max(rack_def_data_window, DesiredBDP). The 3063 * second concern we handle in not letting the measurement 3064 * window end normally until at least the required SRTT's 3065 * have gone by which is done further below in 3066 * rack_enough_for_measurement(). Finally the third concern 3067 * we also handle here by calculating how long that time 3068 * would take at the current BW and then return the 3069 * max of our first calculation and that length. Note 3070 * that if rack_min_measure_usec is 0, we don't deal 3071 * with concern 3. Also for both Concern 1 and 3 an 3072 * application limited period could end the measurement 3073 * earlier. 3074 * 3075 * So lets calculate the BDP with the "known" b/w using 3076 * the SRTT has our rtt and then multiply it by the 3077 * goal. 3078 */ 3079 bw = rack_get_bw(rack); 3080 srtt = (uint64_t)tp->t_srtt; 3081 len = bw * srtt; 3082 len /= (uint64_t)HPTS_USEC_IN_SEC; 3083 len *= max(1, rack_goal_bdp); 3084 /* Now we need to round up to the nearest MSS */ 3085 len = roundup(len, segsiz); 3086 if (rack_min_measure_usec) { 3087 /* Now calculate our min length for this b/w */ 3088 tim = rack_min_measure_usec; 3089 minl = (tim * bw) / (uint64_t)HPTS_USEC_IN_SEC; 3090 if (minl == 0) 3091 minl = 1; 3092 minl = roundup(minl, segsiz); 3093 if (len < minl) 3094 len = minl; 3095 } 3096 /* 3097 * Now if we have a very small window we want 3098 * to attempt to get the window that is 3099 * as small as possible. This happens on 3100 * low b/w connections and we don't want to 3101 * span huge numbers of rtt's between measurements. 3102 * 3103 * We basically include 2 over our "MIN window" so 3104 * that the measurement can be shortened (possibly) by 3105 * an ack'ed packet. 3106 */ 3107 if (len < def_len) 3108 return (max((uint32_t)len, ((MIN_GP_WIN+2) * segsiz))); 3109 else 3110 return (max((uint32_t)len, def_len)); 3111 3112 } 3113 3114 static int 3115 rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack, uint8_t *quality) 3116 { 3117 uint32_t tim, srtts, segsiz; 3118 3119 /* 3120 * Has enough time passed for the GP measurement to be valid? 
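 * The measurement is considered usable when everything sent has
 * been cumulatively acked, when the ack reaches the app limited
 * point, or when at least rack_min_srtts worth of GP srtt has
 * elapsed; we also require that roughly an init window (or
 * MIN_GP_WIN segments) worth of data has been covered.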
3121 */ 3122 if ((tp->snd_max == tp->snd_una) || 3123 (th_ack == tp->snd_max)){ 3124 /* All is acked */ 3125 *quality = RACK_QUALITY_ALLACKED; 3126 return (1); 3127 } 3128 if (SEQ_LT(th_ack, tp->gput_seq)) { 3129 /* Not enough bytes yet */ 3130 return (0); 3131 } 3132 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 3133 if (SEQ_LT(th_ack, tp->gput_ack) && 3134 ((th_ack - tp->gput_seq) < max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { 3135 /* Not enough bytes yet */ 3136 return (0); 3137 } 3138 if (rack->r_ctl.rc_first_appl && 3139 (SEQ_GEQ(th_ack, rack->r_ctl.rc_first_appl->r_end))) { 3140 /* 3141 * We are up to the app limited send point 3142 * we have to measure irrespective of the time.. 3143 */ 3144 *quality = RACK_QUALITY_APPLIMITED; 3145 return (1); 3146 } 3147 /* Now what about time? */ 3148 srtts = (rack->r_ctl.rc_gp_srtt * rack_min_srtts); 3149 tim = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - tp->gput_ts; 3150 if (tim >= srtts) { 3151 *quality = RACK_QUALITY_HIGH; 3152 return (1); 3153 } 3154 /* Nope not even a full SRTT has passed */ 3155 return (0); 3156 } 3157 3158 static void 3159 rack_log_timely(struct tcp_rack *rack, 3160 uint32_t logged, uint64_t cur_bw, uint64_t low_bnd, 3161 uint64_t up_bnd, int line, uint8_t method) 3162 { 3163 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 3164 union tcp_log_stackspecific log; 3165 struct timeval tv; 3166 3167 memset(&log, 0, sizeof(log)); 3168 log.u_bbr.flex1 = logged; 3169 log.u_bbr.flex2 = rack->rc_gp_timely_inc_cnt; 3170 log.u_bbr.flex2 <<= 4; 3171 log.u_bbr.flex2 |= rack->rc_gp_timely_dec_cnt; 3172 log.u_bbr.flex2 <<= 4; 3173 log.u_bbr.flex2 |= rack->rc_gp_incr; 3174 log.u_bbr.flex2 <<= 4; 3175 log.u_bbr.flex2 |= rack->rc_gp_bwred; 3176 log.u_bbr.flex3 = rack->rc_gp_incr; 3177 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; 3178 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ca; 3179 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_rec; 3180 log.u_bbr.flex7 = rack->rc_gp_bwred; 3181 log.u_bbr.flex8 = method; 3182 log.u_bbr.cur_del_rate = cur_bw; 3183 log.u_bbr.delRate = low_bnd; 3184 log.u_bbr.bw_inuse = up_bnd; 3185 log.u_bbr.rttProp = rack_get_bw(rack); 3186 log.u_bbr.pkt_epoch = line; 3187 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; 3188 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3189 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3190 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; 3191 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; 3192 log.u_bbr.cwnd_gain = rack->rc_dragged_bottom; 3193 log.u_bbr.cwnd_gain <<= 1; 3194 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_rec; 3195 log.u_bbr.cwnd_gain <<= 1; 3196 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; 3197 log.u_bbr.cwnd_gain <<= 1; 3198 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; 3199 log.u_bbr.lost = rack->r_ctl.rc_loss_count; 3200 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3201 &rack->rc_inp->inp_socket->so_rcv, 3202 &rack->rc_inp->inp_socket->so_snd, 3203 TCP_TIMELY_WORK, 0, 3204 0, &log, false, &tv); 3205 } 3206 } 3207 3208 static int 3209 rack_bw_can_be_raised(struct tcp_rack *rack, uint64_t cur_bw, uint64_t last_bw_est, uint16_t mult) 3210 { 3211 /* 3212 * Before we increase we need to know if 3213 * the estimate just made was less than 3214 * our pacing goal (i.e. (cur_bw * mult) > last_bw_est) 3215 * 3216 * If we already are pacing at a fast enough 3217 * rate to push us faster there is no sense of 3218 * increasing. 3219 * 3220 * We first caculate our actual pacing rate (ss or ca multipler 3221 * times our cur_bw). 
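 * (For illustration, a cur_bw of 10 Mbps with a multiplier of 120
 * gives an actual pacing rate of 12 Mbps.)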
3222 * 3223 * Then we take the last measured rate and multipy by our 3224 * maximum pacing overage to give us a max allowable rate. 3225 * 3226 * If our act_rate is smaller than our max_allowable rate 3227 * then we should increase. Else we should hold steady. 3228 * 3229 */ 3230 uint64_t act_rate, max_allow_rate; 3231 3232 if (rack_timely_no_stopping) 3233 return (1); 3234 3235 if ((cur_bw == 0) || (last_bw_est == 0)) { 3236 /* 3237 * Initial startup case or 3238 * everything is acked case. 3239 */ 3240 rack_log_timely(rack, mult, cur_bw, 0, 0, 3241 __LINE__, 9); 3242 return (1); 3243 } 3244 if (mult <= 100) { 3245 /* 3246 * We can always pace at or slightly above our rate. 3247 */ 3248 rack_log_timely(rack, mult, cur_bw, 0, 0, 3249 __LINE__, 9); 3250 return (1); 3251 } 3252 act_rate = cur_bw * (uint64_t)mult; 3253 act_rate /= 100; 3254 max_allow_rate = last_bw_est * ((uint64_t)rack_max_per_above + (uint64_t)100); 3255 max_allow_rate /= 100; 3256 if (act_rate < max_allow_rate) { 3257 /* 3258 * Here the rate we are actually pacing at 3259 * is smaller than 10% above our last measurement. 3260 * This means we are pacing below what we would 3261 * like to try to achieve (plus some wiggle room). 3262 */ 3263 rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate, 3264 __LINE__, 9); 3265 return (1); 3266 } else { 3267 /* 3268 * Here we are already pacing at least rack_max_per_above(10%) 3269 * what we are getting back. This indicates most likely 3270 * that we are being limited (cwnd/rwnd/app) and can't 3271 * get any more b/w. There is no sense of trying to 3272 * raise up the pacing rate its not speeding us up 3273 * and we already are pacing faster than we are getting. 3274 */ 3275 rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate, 3276 __LINE__, 8); 3277 return (0); 3278 } 3279 } 3280 3281 static void 3282 rack_validate_multipliers_at_or_above100(struct tcp_rack *rack) 3283 { 3284 /* 3285 * When we drag bottom, we want to assure 3286 * that no multiplier is below 1.0, if so 3287 * we want to restore it to at least that. 3288 */ 3289 if (rack->r_ctl.rack_per_of_gp_rec < 100) { 3290 /* This is unlikely we usually do not touch recovery */ 3291 rack->r_ctl.rack_per_of_gp_rec = 100; 3292 } 3293 if (rack->r_ctl.rack_per_of_gp_ca < 100) { 3294 rack->r_ctl.rack_per_of_gp_ca = 100; 3295 } 3296 if (rack->r_ctl.rack_per_of_gp_ss < 100) { 3297 rack->r_ctl.rack_per_of_gp_ss = 100; 3298 } 3299 } 3300 3301 static void 3302 rack_validate_multipliers_at_or_below_100(struct tcp_rack *rack) 3303 { 3304 if (rack->r_ctl.rack_per_of_gp_ca > 100) { 3305 rack->r_ctl.rack_per_of_gp_ca = 100; 3306 } 3307 if (rack->r_ctl.rack_per_of_gp_ss > 100) { 3308 rack->r_ctl.rack_per_of_gp_ss = 100; 3309 } 3310 } 3311 3312 static void 3313 rack_increase_bw_mul(struct tcp_rack *rack, int timely_says, uint64_t cur_bw, uint64_t last_bw_est, int override) 3314 { 3315 int32_t calc, logged, plus; 3316 3317 logged = 0; 3318 3319 if (override) { 3320 /* 3321 * override is passed when we are 3322 * loosing b/w and making one last 3323 * gasp at trying to not loose out 3324 * to a new-reno flow. 3325 */ 3326 goto extra_boost; 3327 } 3328 /* In classic timely we boost by 5x if we have 5 increases in a row, lets not */ 3329 if (rack->rc_gp_incr && 3330 ((rack->rc_gp_timely_inc_cnt + 1) >= RACK_TIMELY_CNT_BOOST)) { 3331 /* 3332 * Reset and get 5 strokes more before the boost. Note 3333 * that the count is 0 based so we have to add one. 
3334 */ 3335 extra_boost: 3336 plus = (uint32_t)rack_gp_increase_per * RACK_TIMELY_CNT_BOOST; 3337 rack->rc_gp_timely_inc_cnt = 0; 3338 } else 3339 plus = (uint32_t)rack_gp_increase_per; 3340 /* Must be at least 1% increase for true timely increases */ 3341 if ((plus < 1) && 3342 ((rack->r_ctl.rc_rtt_diff <= 0) || (timely_says <= 0))) 3343 plus = 1; 3344 if (rack->rc_gp_saw_rec && 3345 (rack->rc_gp_no_rec_chg == 0) && 3346 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 3347 rack->r_ctl.rack_per_of_gp_rec)) { 3348 /* We have been in recovery ding it too */ 3349 calc = rack->r_ctl.rack_per_of_gp_rec + plus; 3350 if (calc > 0xffff) 3351 calc = 0xffff; 3352 logged |= 1; 3353 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)calc; 3354 if (rack_per_upper_bound_ss && 3355 (rack->rc_dragged_bottom == 0) && 3356 (rack->r_ctl.rack_per_of_gp_rec > rack_per_upper_bound_ss)) 3357 rack->r_ctl.rack_per_of_gp_rec = rack_per_upper_bound_ss; 3358 } 3359 if (rack->rc_gp_saw_ca && 3360 (rack->rc_gp_saw_ss == 0) && 3361 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 3362 rack->r_ctl.rack_per_of_gp_ca)) { 3363 /* In CA */ 3364 calc = rack->r_ctl.rack_per_of_gp_ca + plus; 3365 if (calc > 0xffff) 3366 calc = 0xffff; 3367 logged |= 2; 3368 rack->r_ctl.rack_per_of_gp_ca = (uint16_t)calc; 3369 if (rack_per_upper_bound_ca && 3370 (rack->rc_dragged_bottom == 0) && 3371 (rack->r_ctl.rack_per_of_gp_ca > rack_per_upper_bound_ca)) 3372 rack->r_ctl.rack_per_of_gp_ca = rack_per_upper_bound_ca; 3373 } 3374 if (rack->rc_gp_saw_ss && 3375 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 3376 rack->r_ctl.rack_per_of_gp_ss)) { 3377 /* In SS */ 3378 calc = rack->r_ctl.rack_per_of_gp_ss + plus; 3379 if (calc > 0xffff) 3380 calc = 0xffff; 3381 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)calc; 3382 if (rack_per_upper_bound_ss && 3383 (rack->rc_dragged_bottom == 0) && 3384 (rack->r_ctl.rack_per_of_gp_ss > rack_per_upper_bound_ss)) 3385 rack->r_ctl.rack_per_of_gp_ss = rack_per_upper_bound_ss; 3386 logged |= 4; 3387 } 3388 if (logged && 3389 (rack->rc_gp_incr == 0)){ 3390 /* Go into increment mode */ 3391 rack->rc_gp_incr = 1; 3392 rack->rc_gp_timely_inc_cnt = 0; 3393 } 3394 if (rack->rc_gp_incr && 3395 logged && 3396 (rack->rc_gp_timely_inc_cnt < RACK_TIMELY_CNT_BOOST)) { 3397 rack->rc_gp_timely_inc_cnt++; 3398 } 3399 rack_log_timely(rack, logged, plus, 0, 0, 3400 __LINE__, 1); 3401 } 3402 3403 static uint32_t 3404 rack_get_decrease(struct tcp_rack *rack, uint32_t curper, int32_t rtt_diff) 3405 { 3406 /* 3407 * norm_grad = rtt_diff / minrtt; 3408 * new_per = curper * (1 - B * norm_grad) 3409 * 3410 * B = rack_gp_decrease_per (default 10%) 3411 * rtt_dif = input var current rtt-diff 3412 * curper = input var current percentage 3413 * minrtt = from rack filter 3414 * 3415 */ 3416 uint64_t perf; 3417 3418 perf = (((uint64_t)curper * ((uint64_t)1000000 - 3419 ((uint64_t)rack_gp_decrease_per * (uint64_t)10000 * 3420 (((uint64_t)rtt_diff * (uint64_t)1000000)/ 3421 (uint64_t)get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)))/ 3422 (uint64_t)1000000)) / 3423 (uint64_t)1000000); 3424 if (perf > curper) { 3425 /* TSNH */ 3426 perf = curper - 1; 3427 } 3428 return ((uint32_t)perf); 3429 } 3430 3431 static uint32_t 3432 rack_decrease_highrtt(struct tcp_rack *rack, uint32_t curper, uint32_t rtt) 3433 { 3434 /* 3435 * highrttthresh 3436 * result = curper * (1 - (B * ( 1 - ------ )) 3437 * gp_srtt 3438 * 3439 * B = rack_gp_decrease_per (default 10%) 3440 * highrttthresh = filter_min * rack_gp_rtt_maxmul 3441 */ 3442 uint64_t perf; 3443 uint32_t 
highrttthresh; 3444 3445 highrttthresh = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; 3446 3447 perf = (((uint64_t)curper * ((uint64_t)1000000 - 3448 ((uint64_t)rack_gp_decrease_per * ((uint64_t)1000000 - 3449 ((uint64_t)highrttthresh * (uint64_t)1000000) / 3450 (uint64_t)rtt)) / 100)) /(uint64_t)1000000); 3451 return (perf); 3452 } 3453 3454 static void 3455 rack_decrease_bw_mul(struct tcp_rack *rack, int timely_says, uint32_t rtt, int32_t rtt_diff) 3456 { 3457 uint64_t logvar, logvar2, logvar3; 3458 uint32_t logged, new_per, ss_red, ca_red, rec_red, alt, val; 3459 3460 if (rack->rc_gp_incr) { 3461 /* Turn off increment counting */ 3462 rack->rc_gp_incr = 0; 3463 rack->rc_gp_timely_inc_cnt = 0; 3464 } 3465 ss_red = ca_red = rec_red = 0; 3466 logged = 0; 3467 /* Calculate the reduction value */ 3468 if (rtt_diff < 0) { 3469 rtt_diff *= -1; 3470 } 3471 /* Must be at least 1% reduction */ 3472 if (rack->rc_gp_saw_rec && (rack->rc_gp_no_rec_chg == 0)) { 3473 /* We have been in recovery ding it too */ 3474 if (timely_says == 2) { 3475 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_rec, rtt); 3476 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 3477 if (alt < new_per) 3478 val = alt; 3479 else 3480 val = new_per; 3481 } else 3482 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 3483 if (rack->r_ctl.rack_per_of_gp_rec > val) { 3484 rec_red = (rack->r_ctl.rack_per_of_gp_rec - val); 3485 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)val; 3486 } else { 3487 rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; 3488 rec_red = 0; 3489 } 3490 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_rec) 3491 rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; 3492 logged |= 1; 3493 } 3494 if (rack->rc_gp_saw_ss) { 3495 /* Sent in SS */ 3496 if (timely_says == 2) { 3497 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ss, rtt); 3498 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 3499 if (alt < new_per) 3500 val = alt; 3501 else 3502 val = new_per; 3503 } else 3504 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff); 3505 if (rack->r_ctl.rack_per_of_gp_ss > new_per) { 3506 ss_red = rack->r_ctl.rack_per_of_gp_ss - val; 3507 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)val; 3508 } else { 3509 ss_red = new_per; 3510 rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; 3511 logvar = new_per; 3512 logvar <<= 32; 3513 logvar |= alt; 3514 logvar2 = (uint32_t)rtt; 3515 logvar2 <<= 32; 3516 logvar2 |= (uint32_t)rtt_diff; 3517 logvar3 = rack_gp_rtt_maxmul; 3518 logvar3 <<= 32; 3519 logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 3520 rack_log_timely(rack, timely_says, 3521 logvar2, logvar3, 3522 logvar, __LINE__, 10); 3523 } 3524 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ss) 3525 rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; 3526 logged |= 4; 3527 } else if (rack->rc_gp_saw_ca) { 3528 /* Sent in CA */ 3529 if (timely_says == 2) { 3530 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ca, rtt); 3531 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 3532 if (alt < new_per) 3533 val = alt; 3534 else 3535 val = new_per; 3536 } else 3537 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff); 3538 if (rack->r_ctl.rack_per_of_gp_ca > val) { 3539 ca_red = rack->r_ctl.rack_per_of_gp_ca - val; 3540 rack->r_ctl.rack_per_of_gp_ca = 
(uint16_t)val; 3541 } else { 3542 rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; 3543 ca_red = 0; 3544 logvar = new_per; 3545 logvar <<= 32; 3546 logvar |= alt; 3547 logvar2 = (uint32_t)rtt; 3548 logvar2 <<= 32; 3549 logvar2 |= (uint32_t)rtt_diff; 3550 logvar3 = rack_gp_rtt_maxmul; 3551 logvar3 <<= 32; 3552 logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 3553 rack_log_timely(rack, timely_says, 3554 logvar2, logvar3, 3555 logvar, __LINE__, 10); 3556 } 3557 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ca) 3558 rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; 3559 logged |= 2; 3560 } 3561 if (rack->rc_gp_timely_dec_cnt < 0x7) { 3562 rack->rc_gp_timely_dec_cnt++; 3563 if (rack_timely_dec_clear && 3564 (rack->rc_gp_timely_dec_cnt == rack_timely_dec_clear)) 3565 rack->rc_gp_timely_dec_cnt = 0; 3566 } 3567 logvar = ss_red; 3568 logvar <<= 32; 3569 logvar |= ca_red; 3570 rack_log_timely(rack, logged, rec_red, rack_per_lower_bound, logvar, 3571 __LINE__, 2); 3572 } 3573 3574 static void 3575 rack_log_rtt_shrinks(struct tcp_rack *rack, uint32_t us_cts, 3576 uint32_t rtt, uint32_t line, uint8_t reas) 3577 { 3578 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 3579 union tcp_log_stackspecific log; 3580 struct timeval tv; 3581 3582 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 3583 log.u_bbr.flex1 = line; 3584 log.u_bbr.flex2 = rack->r_ctl.rc_time_probertt_starts; 3585 log.u_bbr.flex3 = rack->r_ctl.rc_lower_rtt_us_cts; 3586 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; 3587 log.u_bbr.flex5 = rtt; 3588 log.u_bbr.flex6 = rack->rc_highly_buffered; 3589 log.u_bbr.flex6 <<= 1; 3590 log.u_bbr.flex6 |= rack->forced_ack; 3591 log.u_bbr.flex6 <<= 1; 3592 log.u_bbr.flex6 |= rack->rc_gp_dyn_mul; 3593 log.u_bbr.flex6 <<= 1; 3594 log.u_bbr.flex6 |= rack->in_probe_rtt; 3595 log.u_bbr.flex6 <<= 1; 3596 log.u_bbr.flex6 |= rack->measure_saw_probe_rtt; 3597 log.u_bbr.flex7 = rack->r_ctl.rack_per_of_gp_probertt; 3598 log.u_bbr.pacing_gain = rack->r_ctl.rack_per_of_gp_ca; 3599 log.u_bbr.cwnd_gain = rack->r_ctl.rack_per_of_gp_rec; 3600 log.u_bbr.flex8 = reas; 3601 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3602 log.u_bbr.delRate = rack_get_bw(rack); 3603 log.u_bbr.cur_del_rate = rack->r_ctl.rc_highest_us_rtt; 3604 log.u_bbr.cur_del_rate <<= 32; 3605 log.u_bbr.cur_del_rate |= rack->r_ctl.rc_lowest_us_rtt; 3606 log.u_bbr.applimited = rack->r_ctl.rc_time_probertt_entered; 3607 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; 3608 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3609 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; 3610 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; 3611 log.u_bbr.pkt_epoch = rack->r_ctl.rc_lower_rtt_us_cts; 3612 log.u_bbr.delivered = rack->r_ctl.rc_target_probertt_flight; 3613 log.u_bbr.lost = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 3614 log.u_bbr.rttProp = us_cts; 3615 log.u_bbr.rttProp <<= 32; 3616 log.u_bbr.rttProp |= rack->r_ctl.rc_entry_gp_rtt; 3617 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3618 &rack->rc_inp->inp_socket->so_rcv, 3619 &rack->rc_inp->inp_socket->so_snd, 3620 BBR_LOG_RTT_SHRINKS, 0, 3621 0, &log, false, &rack->r_ctl.act_rcv_time); 3622 } 3623 } 3624 3625 static void 3626 rack_set_prtt_target(struct tcp_rack *rack, uint32_t segsiz, uint32_t rtt) 3627 { 3628 uint64_t bwdp; 3629 3630 bwdp = rack_get_bw(rack); 3631 bwdp *= (uint64_t)rtt; 3632 bwdp /= (uint64_t)HPTS_USEC_IN_SEC; 3633 rack->r_ctl.rc_target_probertt_flight = roundup((uint32_t)bwdp, segsiz); 3634 if (rack->r_ctl.rc_target_probertt_flight < (segsiz * 
rack_timely_min_segs)) { 3635 /* 3636 * A window protocol must be able to have 4 packets 3637 * outstanding as the floor in order to function 3638 * (especially considering delayed ack :D). 3639 */ 3640 rack->r_ctl.rc_target_probertt_flight = (segsiz * rack_timely_min_segs); 3641 } 3642 } 3643 3644 static void 3645 rack_enter_probertt(struct tcp_rack *rack, uint32_t us_cts) 3646 { 3647 /** 3648 * ProbeRTT is a bit different in rack_pacing than in 3649 * BBR. It is like BBR in that it uses the lowering of 3650 * the RTT as a signal that we saw something new and 3651 * counts from there for how long between. But it is 3652 * different in that its quite simple. It does not 3653 * play with the cwnd and wait until we get down 3654 * to N segments outstanding and hold that for 3655 * 200ms. Instead it just sets the pacing reduction 3656 * rate to a set percentage (70 by default) and hold 3657 * that for a number of recent GP Srtt's. 3658 */ 3659 uint32_t segsiz; 3660 3661 if (rack->rc_gp_dyn_mul == 0) 3662 return; 3663 3664 if (rack->rc_tp->snd_max == rack->rc_tp->snd_una) { 3665 /* We are idle */ 3666 return; 3667 } 3668 if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && 3669 SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { 3670 /* 3671 * Stop the goodput now, the idea here is 3672 * that future measurements with in_probe_rtt 3673 * won't register if they are not greater so 3674 * we want to get what info (if any) is available 3675 * now. 3676 */ 3677 rack_do_goodput_measurement(rack->rc_tp, rack, 3678 rack->rc_tp->snd_una, __LINE__, 3679 RACK_QUALITY_PROBERTT); 3680 } 3681 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 3682 rack->r_ctl.rc_time_probertt_entered = us_cts; 3683 segsiz = min(ctf_fixed_maxseg(rack->rc_tp), 3684 rack->r_ctl.rc_pace_min_segs); 3685 rack->in_probe_rtt = 1; 3686 rack->measure_saw_probe_rtt = 1; 3687 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 3688 rack->r_ctl.rc_time_probertt_starts = 0; 3689 rack->r_ctl.rc_entry_gp_rtt = rack->r_ctl.rc_gp_srtt; 3690 if (rack_probertt_use_min_rtt_entry) 3691 rack_set_prtt_target(rack, segsiz, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); 3692 else 3693 rack_set_prtt_target(rack, segsiz, rack->r_ctl.rc_gp_srtt); 3694 rack_log_rtt_shrinks(rack, us_cts, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3695 __LINE__, RACK_RTTS_ENTERPROBE); 3696 } 3697 3698 static void 3699 rack_exit_probertt(struct tcp_rack *rack, uint32_t us_cts) 3700 { 3701 struct rack_sendmap *rsm; 3702 uint32_t segsiz; 3703 3704 segsiz = min(ctf_fixed_maxseg(rack->rc_tp), 3705 rack->r_ctl.rc_pace_min_segs); 3706 rack->in_probe_rtt = 0; 3707 if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && 3708 SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { 3709 /* 3710 * Stop the goodput now, the idea here is 3711 * that future measurements with in_probe_rtt 3712 * won't register if they are not greater so 3713 * we want to get what info (if any) is available 3714 * now. 3715 */ 3716 rack_do_goodput_measurement(rack->rc_tp, rack, 3717 rack->rc_tp->snd_una, __LINE__, 3718 RACK_QUALITY_PROBERTT); 3719 } else if (rack->rc_tp->t_flags & TF_GPUTINPROG) { 3720 /* 3721 * We don't have enough data to make a measurement. 3722 * So lets just stop and start here after exiting 3723 * probe-rtt. We probably are not interested in 3724 * the results anyway. 3725 */ 3726 rack->rc_tp->t_flags &= ~TF_GPUTINPROG; 3727 } 3728 /* 3729 * Measurements through the current snd_max are going 3730 * to be limited by the slower pacing rate. 
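 * (They were sent while we were pacing at the reduced probe-rtt
 * percentage.)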
3731 * 3732 * We need to mark these as app-limited so we 3733 * don't collapse the b/w. 3734 */ 3735 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 3736 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { 3737 if (rack->r_ctl.rc_app_limited_cnt == 0) 3738 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; 3739 else { 3740 /* 3741 * Go out to the end app limited and mark 3742 * this new one as next and move the end_appl up 3743 * to this guy. 3744 */ 3745 if (rack->r_ctl.rc_end_appl) 3746 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; 3747 rack->r_ctl.rc_end_appl = rsm; 3748 } 3749 rsm->r_flags |= RACK_APP_LIMITED; 3750 rack->r_ctl.rc_app_limited_cnt++; 3751 } 3752 /* 3753 * Now, we need to examine our pacing rate multipliers. 3754 * If its under 100%, we need to kick it back up to 3755 * 100%. We also don't let it be over our "max" above 3756 * the actual rate i.e. 100% + rack_clamp_atexit_prtt. 3757 * Note setting clamp_atexit_prtt to 0 has the effect 3758 * of setting CA/SS to 100% always at exit (which is 3759 * the default behavior). 3760 */ 3761 if (rack_probertt_clear_is) { 3762 rack->rc_gp_incr = 0; 3763 rack->rc_gp_bwred = 0; 3764 rack->rc_gp_timely_inc_cnt = 0; 3765 rack->rc_gp_timely_dec_cnt = 0; 3766 } 3767 /* Do we do any clamping at exit? */ 3768 if (rack->rc_highly_buffered && rack_atexit_prtt_hbp) { 3769 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt_hbp; 3770 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt_hbp; 3771 } 3772 if ((rack->rc_highly_buffered == 0) && rack_atexit_prtt) { 3773 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt; 3774 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt; 3775 } 3776 /* 3777 * Lets set rtt_diff to 0, so that we will get a "boost" 3778 * after exiting. 3779 */ 3780 rack->r_ctl.rc_rtt_diff = 0; 3781 3782 /* Clear all flags so we start fresh */ 3783 rack->rc_tp->t_bytes_acked = 0; 3784 rack->rc_tp->ccv->flags &= ~CCF_ABC_SENTAWND; 3785 /* 3786 * If configured to, set the cwnd and ssthresh to 3787 * our targets. 3788 */ 3789 if (rack_probe_rtt_sets_cwnd) { 3790 uint64_t ebdp; 3791 uint32_t setto; 3792 3793 /* Set ssthresh so we get into CA once we hit our target */ 3794 if (rack_probertt_use_min_rtt_exit == 1) { 3795 /* Set to min rtt */ 3796 rack_set_prtt_target(rack, segsiz, 3797 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); 3798 } else if (rack_probertt_use_min_rtt_exit == 2) { 3799 /* Set to current gp rtt */ 3800 rack_set_prtt_target(rack, segsiz, 3801 rack->r_ctl.rc_gp_srtt); 3802 } else if (rack_probertt_use_min_rtt_exit == 3) { 3803 /* Set to entry gp rtt */ 3804 rack_set_prtt_target(rack, segsiz, 3805 rack->r_ctl.rc_entry_gp_rtt); 3806 } else { 3807 uint64_t sum; 3808 uint32_t setval; 3809 3810 sum = rack->r_ctl.rc_entry_gp_rtt; 3811 sum *= 10; 3812 sum /= (uint64_t)(max(1, rack->r_ctl.rc_gp_srtt)); 3813 if (sum >= 20) { 3814 /* 3815 * A highly buffered path needs 3816 * cwnd space for timely to work. 3817 * Lets set things up as if 3818 * we are heading back here again. 3819 */ 3820 setval = rack->r_ctl.rc_entry_gp_rtt; 3821 } else if (sum >= 15) { 3822 /* 3823 * Lets take the smaller of the 3824 * two since we are just somewhat 3825 * buffered. 3826 */ 3827 setval = rack->r_ctl.rc_gp_srtt; 3828 if (setval > rack->r_ctl.rc_entry_gp_rtt) 3829 setval = rack->r_ctl.rc_entry_gp_rtt; 3830 } else { 3831 /* 3832 * Here we are not highly buffered 3833 * and should pick the min we can to 3834 * keep from causing loss. 
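 * (A sum below 15 means the RTT at probe-rtt entry was roughly
 * less than 1.5 times the current GP srtt, so little extra data
 * is queued and the filtered minimum RTT is a safe target.)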
3835 */ 3836 setval = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 3837 } 3838 rack_set_prtt_target(rack, segsiz, 3839 setval); 3840 } 3841 if (rack_probe_rtt_sets_cwnd > 1) { 3842 /* There is a percentage here to boost */ 3843 ebdp = rack->r_ctl.rc_target_probertt_flight; 3844 ebdp *= rack_probe_rtt_sets_cwnd; 3845 ebdp /= 100; 3846 setto = rack->r_ctl.rc_target_probertt_flight + ebdp; 3847 } else 3848 setto = rack->r_ctl.rc_target_probertt_flight; 3849 rack->rc_tp->snd_cwnd = roundup(setto, segsiz); 3850 if (rack->rc_tp->snd_cwnd < (segsiz * rack_timely_min_segs)) { 3851 /* Enforce a min */ 3852 rack->rc_tp->snd_cwnd = segsiz * rack_timely_min_segs; 3853 } 3854 /* If we set in the cwnd also set the ssthresh point so we are in CA */ 3855 rack->rc_tp->snd_ssthresh = (rack->rc_tp->snd_cwnd - 1); 3856 } 3857 rack_log_rtt_shrinks(rack, us_cts, 3858 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3859 __LINE__, RACK_RTTS_EXITPROBE); 3860 /* Clear times last so log has all the info */ 3861 rack->r_ctl.rc_probertt_sndmax_atexit = rack->rc_tp->snd_max; 3862 rack->r_ctl.rc_time_probertt_entered = us_cts; 3863 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 3864 rack->r_ctl.rc_time_of_last_probertt = us_cts; 3865 } 3866 3867 static void 3868 rack_check_probe_rtt(struct tcp_rack *rack, uint32_t us_cts) 3869 { 3870 /* Check in on probe-rtt */ 3871 if (rack->rc_gp_filled == 0) { 3872 /* We do not do p-rtt unless we have gp measurements */ 3873 return; 3874 } 3875 if (rack->in_probe_rtt) { 3876 uint64_t no_overflow; 3877 uint32_t endtime, must_stay; 3878 3879 if (rack->r_ctl.rc_went_idle_time && 3880 ((us_cts - rack->r_ctl.rc_went_idle_time) > rack_min_probertt_hold)) { 3881 /* 3882 * We went idle during prtt, just exit now. 3883 */ 3884 rack_exit_probertt(rack, us_cts); 3885 } else if (rack_probe_rtt_safety_val && 3886 TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered) && 3887 ((us_cts - rack->r_ctl.rc_time_probertt_entered) > rack_probe_rtt_safety_val)) { 3888 /* 3889 * Probe RTT safety value triggered! 3890 */ 3891 rack_log_rtt_shrinks(rack, us_cts, 3892 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3893 __LINE__, RACK_RTTS_SAFETY); 3894 rack_exit_probertt(rack, us_cts); 3895 } 3896 /* Calculate the max we will wait */ 3897 endtime = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_max_drain_wait); 3898 if (rack->rc_highly_buffered) 3899 endtime += (rack->r_ctl.rc_gp_srtt * rack_max_drain_hbp); 3900 /* Calculate the min we must wait */ 3901 must_stay = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_must_drain); 3902 if ((ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.rc_target_probertt_flight) && 3903 TSTMP_LT(us_cts, endtime)) { 3904 uint32_t calc; 3905 /* Do we lower more? 
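 * Each full gp_srtt spent in probe-rtt shaves another
 * rack_per_of_gp_probertt_reduce percentage points off the
 * probe-rtt pacing percentage, floored at rack_per_of_gp_lowthresh.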
*/ 3906 no_exit: 3907 if (TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered)) 3908 calc = us_cts - rack->r_ctl.rc_time_probertt_entered; 3909 else 3910 calc = 0; 3911 calc /= max(rack->r_ctl.rc_gp_srtt, 1); 3912 if (calc) { 3913 /* Maybe */ 3914 calc *= rack_per_of_gp_probertt_reduce; 3915 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc; 3916 /* Limit it too */ 3917 if (rack->r_ctl.rack_per_of_gp_probertt < rack_per_of_gp_lowthresh) 3918 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh; 3919 } 3920 /* We must reach target or the time set */ 3921 return; 3922 } 3923 if (rack->r_ctl.rc_time_probertt_starts == 0) { 3924 if ((TSTMP_LT(us_cts, must_stay) && 3925 rack->rc_highly_buffered) || 3926 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > 3927 rack->r_ctl.rc_target_probertt_flight)) { 3928 /* We are not past the must_stay time */ 3929 goto no_exit; 3930 } 3931 rack_log_rtt_shrinks(rack, us_cts, 3932 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3933 __LINE__, RACK_RTTS_REACHTARGET); 3934 rack->r_ctl.rc_time_probertt_starts = us_cts; 3935 if (rack->r_ctl.rc_time_probertt_starts == 0) 3936 rack->r_ctl.rc_time_probertt_starts = 1; 3937 /* Restore back to our rate we want to pace at in prtt */ 3938 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 3939 } 3940 /* 3941 * Setup our end time, some number of gp_srtts plus 200ms. 3942 */ 3943 no_overflow = ((uint64_t)rack->r_ctl.rc_gp_srtt * 3944 (uint64_t)rack_probertt_gpsrtt_cnt_mul); 3945 if (rack_probertt_gpsrtt_cnt_div) 3946 endtime = (uint32_t)(no_overflow / (uint64_t)rack_probertt_gpsrtt_cnt_div); 3947 else 3948 endtime = 0; 3949 endtime += rack_min_probertt_hold; 3950 endtime += rack->r_ctl.rc_time_probertt_starts; 3951 if (TSTMP_GEQ(us_cts, endtime)) { 3952 /* yes, exit probertt */ 3953 rack_exit_probertt(rack, us_cts); 3954 } 3955 3956 } else if ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt) { 3957 /* Go into probertt, its been too long since we went lower */ 3958 rack_enter_probertt(rack, us_cts); 3959 } 3960 } 3961 3962 static void 3963 rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last_bw_est, 3964 uint32_t rtt, int32_t rtt_diff) 3965 { 3966 uint64_t cur_bw, up_bnd, low_bnd, subfr; 3967 uint32_t losses; 3968 3969 if ((rack->rc_gp_dyn_mul == 0) || 3970 (rack->use_fixed_rate) || 3971 (rack->in_probe_rtt) || 3972 (rack->rc_always_pace == 0)) { 3973 /* No dynamic GP multipler in play */ 3974 return; 3975 } 3976 losses = rack->r_ctl.rc_loss_count - rack->r_ctl.rc_loss_at_start; 3977 cur_bw = rack_get_bw(rack); 3978 /* Calculate our up and down range */ 3979 up_bnd = rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_up; 3980 up_bnd /= 100; 3981 up_bnd += rack->r_ctl.last_gp_comp_bw; 3982 3983 subfr = (uint64_t)rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_down; 3984 subfr /= 100; 3985 low_bnd = rack->r_ctl.last_gp_comp_bw - subfr; 3986 if ((timely_says == 2) && (rack->r_ctl.rc_no_push_at_mrtt)) { 3987 /* 3988 * This is the case where our RTT is above 3989 * the max target and we have been configured 3990 * to just do timely no bonus up stuff in that case. 3991 * 3992 * There are two configurations, set to 1, and we 3993 * just do timely if we are over our max. If its 3994 * set above 1 then we slam the multipliers down 3995 * to 100 and then decrement per timely. 
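 * (A timely_says of 2 here means the measured RTT has reached
 * rack_gp_rtt_maxmul times the filtered minimum RTT.)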
3996 */ 3997 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 3998 __LINE__, 3); 3999 if (rack->r_ctl.rc_no_push_at_mrtt > 1) 4000 rack_validate_multipliers_at_or_below_100(rack); 4001 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); 4002 } else if ((last_bw_est < low_bnd) && !losses) { 4003 /* 4004 * We are decreasing this is a bit complicated this 4005 * means we are loosing ground. This could be 4006 * because another flow entered and we are competing 4007 * for b/w with it. This will push the RTT up which 4008 * makes timely unusable unless we want to get shoved 4009 * into a corner and just be backed off (the age 4010 * old problem with delay based CC). 4011 * 4012 * On the other hand if it was a route change we 4013 * would like to stay somewhat contained and not 4014 * blow out the buffers. 4015 */ 4016 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 4017 __LINE__, 3); 4018 rack->r_ctl.last_gp_comp_bw = cur_bw; 4019 if (rack->rc_gp_bwred == 0) { 4020 /* Go into reduction counting */ 4021 rack->rc_gp_bwred = 1; 4022 rack->rc_gp_timely_dec_cnt = 0; 4023 } 4024 if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) || 4025 (timely_says == 0)) { 4026 /* 4027 * Push another time with a faster pacing 4028 * to try to gain back (we include override to 4029 * get a full raise factor). 4030 */ 4031 if ((rack->rc_gp_saw_ca && rack->r_ctl.rack_per_of_gp_ca <= rack_down_raise_thresh) || 4032 (rack->rc_gp_saw_ss && rack->r_ctl.rack_per_of_gp_ss <= rack_down_raise_thresh) || 4033 (timely_says == 0) || 4034 (rack_down_raise_thresh == 0)) { 4035 /* 4036 * Do an override up in b/w if we were 4037 * below the threshold or if the threshold 4038 * is zero we always do the raise. 4039 */ 4040 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 1); 4041 } else { 4042 /* Log it stays the same */ 4043 rack_log_timely(rack, 0, last_bw_est, low_bnd, 0, 4044 __LINE__, 11); 4045 } 4046 rack->rc_gp_timely_dec_cnt++; 4047 /* We are not incrementing really no-count */ 4048 rack->rc_gp_incr = 0; 4049 rack->rc_gp_timely_inc_cnt = 0; 4050 } else { 4051 /* 4052 * Lets just use the RTT 4053 * information and give up 4054 * pushing. 4055 */ 4056 goto use_timely; 4057 } 4058 } else if ((timely_says != 2) && 4059 !losses && 4060 (last_bw_est > up_bnd)) { 4061 /* 4062 * We are increasing b/w lets keep going, updating 4063 * our b/w and ignoring any timely input, unless 4064 * of course we are at our max raise (if there is one). 4065 */ 4066 4067 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 4068 __LINE__, 3); 4069 rack->r_ctl.last_gp_comp_bw = cur_bw; 4070 if (rack->rc_gp_saw_ss && 4071 rack_per_upper_bound_ss && 4072 (rack->r_ctl.rack_per_of_gp_ss == rack_per_upper_bound_ss)) { 4073 /* 4074 * In cases where we can't go higher 4075 * we should just use timely. 4076 */ 4077 goto use_timely; 4078 } 4079 if (rack->rc_gp_saw_ca && 4080 rack_per_upper_bound_ca && 4081 (rack->r_ctl.rack_per_of_gp_ca == rack_per_upper_bound_ca)) { 4082 /* 4083 * In cases where we can't go higher 4084 * we should just use timely. 
4085 */ 4086 goto use_timely; 4087 } 4088 rack->rc_gp_bwred = 0; 4089 rack->rc_gp_timely_dec_cnt = 0; 4090 /* You get a set number of pushes if timely is trying to reduce */ 4091 if ((rack->rc_gp_incr < rack_timely_max_push_rise) || (timely_says == 0)) { 4092 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 4093 } else { 4094 /* Log it stays the same */ 4095 rack_log_timely(rack, 0, last_bw_est, up_bnd, 0, 4096 __LINE__, 12); 4097 } 4098 return; 4099 } else { 4100 /* 4101 * We are staying between the lower and upper range bounds 4102 * so use timely to decide. 4103 */ 4104 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 4105 __LINE__, 3); 4106 use_timely: 4107 if (timely_says) { 4108 rack->rc_gp_incr = 0; 4109 rack->rc_gp_timely_inc_cnt = 0; 4110 if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) && 4111 !losses && 4112 (last_bw_est < low_bnd)) { 4113 /* We are loosing ground */ 4114 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 4115 rack->rc_gp_timely_dec_cnt++; 4116 /* We are not incrementing really no-count */ 4117 rack->rc_gp_incr = 0; 4118 rack->rc_gp_timely_inc_cnt = 0; 4119 } else 4120 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); 4121 } else { 4122 rack->rc_gp_bwred = 0; 4123 rack->rc_gp_timely_dec_cnt = 0; 4124 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 4125 } 4126 } 4127 } 4128 4129 static int32_t 4130 rack_make_timely_judgement(struct tcp_rack *rack, uint32_t rtt, int32_t rtt_diff, uint32_t prev_rtt) 4131 { 4132 int32_t timely_says; 4133 uint64_t log_mult, log_rtt_a_diff; 4134 4135 log_rtt_a_diff = rtt; 4136 log_rtt_a_diff <<= 32; 4137 log_rtt_a_diff |= (uint32_t)rtt_diff; 4138 if (rtt >= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * 4139 rack_gp_rtt_maxmul)) { 4140 /* Reduce the b/w multipler */ 4141 timely_says = 2; 4142 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; 4143 log_mult <<= 32; 4144 log_mult |= prev_rtt; 4145 rack_log_timely(rack, timely_says, log_mult, 4146 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4147 log_rtt_a_diff, __LINE__, 4); 4148 } else if (rtt <= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) + 4149 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) / 4150 max(rack_gp_rtt_mindiv , 1)))) { 4151 /* Increase the b/w multipler */ 4152 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) + 4153 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) / 4154 max(rack_gp_rtt_mindiv , 1)); 4155 log_mult <<= 32; 4156 log_mult |= prev_rtt; 4157 timely_says = 0; 4158 rack_log_timely(rack, timely_says, log_mult , 4159 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4160 log_rtt_a_diff, __LINE__, 5); 4161 } else { 4162 /* 4163 * Use a gradient to find it the timely gradient 4164 * is: 4165 * grad = rc_rtt_diff / min_rtt; 4166 * 4167 * anything below or equal to 0 will be 4168 * a increase indication. Anything above 4169 * zero is a decrease. Note we take care 4170 * of the actual gradient calculation 4171 * in the reduction (its not needed for 4172 * increase). 
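 * For illustration: with a filtered min_rtt of 10000 usec, an
 * rtt_diff of +2000 usec and the default 10% decrease factor,
 * rack_get_decrease() trims the multiplier by about 2%.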
4173 */ 4174 log_mult = prev_rtt; 4175 if (rtt_diff <= 0) { 4176 /* 4177 * Rttdiff is less than zero, increase the 4178 * b/w multipler (its 0 or negative) 4179 */ 4180 timely_says = 0; 4181 rack_log_timely(rack, timely_says, log_mult, 4182 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 6); 4183 } else { 4184 /* Reduce the b/w multipler */ 4185 timely_says = 1; 4186 rack_log_timely(rack, timely_says, log_mult, 4187 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 7); 4188 } 4189 } 4190 return (timely_says); 4191 } 4192 4193 static void 4194 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, 4195 tcp_seq th_ack, int line, uint8_t quality) 4196 { 4197 uint64_t tim, bytes_ps, ltim, stim, utim; 4198 uint32_t segsiz, bytes, reqbytes, us_cts; 4199 int32_t gput, new_rtt_diff, timely_says; 4200 uint64_t resid_bw, subpart = 0, addpart = 0, srtt; 4201 int did_add = 0; 4202 4203 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 4204 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 4205 if (TSTMP_GEQ(us_cts, tp->gput_ts)) 4206 tim = us_cts - tp->gput_ts; 4207 else 4208 tim = 0; 4209 if (rack->r_ctl.rc_gp_cumack_ts > rack->r_ctl.rc_gp_output_ts) 4210 stim = rack->r_ctl.rc_gp_cumack_ts - rack->r_ctl.rc_gp_output_ts; 4211 else 4212 stim = 0; 4213 /* 4214 * Use the larger of the send time or ack time. This prevents us 4215 * from being influenced by ack artifacts to come up with too 4216 * high of measurement. Note that since we are spanning over many more 4217 * bytes in most of our measurements hopefully that is less likely to 4218 * occur. 4219 */ 4220 if (tim > stim) 4221 utim = max(tim, 1); 4222 else 4223 utim = max(stim, 1); 4224 /* Lets get a msec time ltim too for the old stuff */ 4225 ltim = max(1, (utim / HPTS_USEC_IN_MSEC)); 4226 gput = (((uint64_t) (th_ack - tp->gput_seq)) << 3) / ltim; 4227 reqbytes = min(rc_init_window(rack), (MIN_GP_WIN * segsiz)); 4228 if ((tim == 0) && (stim == 0)) { 4229 /* 4230 * Invalid measurement time, maybe 4231 * all on one ack/one send? 4232 */ 4233 bytes = 0; 4234 bytes_ps = 0; 4235 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4236 0, 0, 0, 10, __LINE__, NULL, quality); 4237 goto skip_measurement; 4238 } 4239 if (rack->r_ctl.rc_gp_lowrtt == 0xffffffff) { 4240 /* We never made a us_rtt measurement? */ 4241 bytes = 0; 4242 bytes_ps = 0; 4243 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4244 0, 0, 0, 10, __LINE__, NULL, quality); 4245 goto skip_measurement; 4246 } 4247 /* 4248 * Calculate the maximum possible b/w this connection 4249 * could have. We base our calculation on the lowest 4250 * rtt we have seen during the measurement and the 4251 * largest rwnd the client has given us in that time. This 4252 * forms a BDP that is the maximum that we could ever 4253 * get to the client. Anything larger is not valid. 4254 * 4255 * I originally had code here that rejected measurements 4256 * where the time was less than 1/2 the latest us_rtt. 4257 * But after thinking on that I realized its wrong since 4258 * say you had a 150Mbps or even 1Gbps link, and you 4259 * were a long way away.. example I am in Europe (100ms rtt) 4260 * talking to my 1Gbps link in S.C. Now measuring say 150,000 4261 * bytes my time would be 1.2ms, and yet my rtt would say 4262 * the measurement was invalid the time was < 50ms. The 4263 * same thing is true for 150Mb (8ms of time). 
4264 * 4265 * A better way I realized is to look at what the maximum 4266 * the connection could possibly do. This is gated on 4267 * the lowest RTT we have seen and the highest rwnd. 4268 * We should in theory never exceed that, if we are 4269 * then something on the path is storing up packets 4270 * and then feeding them all at once to our endpoint 4271 * messing up our measurement. 4272 */ 4273 rack->r_ctl.last_max_bw = rack->r_ctl.rc_gp_high_rwnd; 4274 rack->r_ctl.last_max_bw *= HPTS_USEC_IN_SEC; 4275 rack->r_ctl.last_max_bw /= rack->r_ctl.rc_gp_lowrtt; 4276 if (SEQ_LT(th_ack, tp->gput_seq)) { 4277 /* No measurement can be made */ 4278 bytes = 0; 4279 bytes_ps = 0; 4280 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4281 0, 0, 0, 10, __LINE__, NULL, quality); 4282 goto skip_measurement; 4283 } else 4284 bytes = (th_ack - tp->gput_seq); 4285 bytes_ps = (uint64_t)bytes; 4286 /* 4287 * Don't measure a b/w for pacing unless we have gotten at least 4288 * an initial windows worth of data in this measurement interval. 4289 * 4290 * Small numbers of bytes get badly influenced by delayed ack and 4291 * other artifacts. Note we take the initial window or our 4292 * defined minimum GP (defaulting to 10 which hopefully is the 4293 * IW). 4294 */ 4295 if (rack->rc_gp_filled == 0) { 4296 /* 4297 * The initial estimate is special. We 4298 * have blasted out an IW worth of packets 4299 * without a real valid ack ts results. We 4300 * then setup the app_limited_needs_set flag, 4301 * this should get the first ack in (probably 2 4302 * MSS worth) to be recorded as the timestamp. 4303 * We thus allow a smaller number of bytes i.e. 4304 * IW - 2MSS. 4305 */ 4306 reqbytes -= (2 * segsiz); 4307 /* Also lets fill previous for our first measurement to be neutral */ 4308 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; 4309 } 4310 if ((bytes_ps < reqbytes) || rack->app_limited_needs_set) { 4311 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4312 rack->r_ctl.rc_app_limited_cnt, 4313 0, 0, 10, __LINE__, NULL, quality); 4314 goto skip_measurement; 4315 } 4316 /* 4317 * We now need to calculate the Timely like status so 4318 * we can update (possibly) the b/w multipliers. 4319 */ 4320 new_rtt_diff = (int32_t)rack->r_ctl.rc_gp_srtt - (int32_t)rack->r_ctl.rc_prev_gp_srtt; 4321 if (rack->rc_gp_filled == 0) { 4322 /* No previous reading */ 4323 rack->r_ctl.rc_rtt_diff = new_rtt_diff; 4324 } else { 4325 if (rack->measure_saw_probe_rtt == 0) { 4326 /* 4327 * We don't want a probertt to be counted 4328 * since it will be negative incorrectly. We 4329 * expect to be reducing the RTT when we 4330 * pace at a slower rate. 4331 */ 4332 rack->r_ctl.rc_rtt_diff -= (rack->r_ctl.rc_rtt_diff / 8); 4333 rack->r_ctl.rc_rtt_diff += (new_rtt_diff / 8); 4334 } 4335 } 4336 timely_says = rack_make_timely_judgement(rack, 4337 rack->r_ctl.rc_gp_srtt, 4338 rack->r_ctl.rc_rtt_diff, 4339 rack->r_ctl.rc_prev_gp_srtt 4340 ); 4341 bytes_ps *= HPTS_USEC_IN_SEC; 4342 bytes_ps /= utim; 4343 if (bytes_ps > rack->r_ctl.last_max_bw) { 4344 /* 4345 * Something is on path playing 4346 * since this b/w is not possible based 4347 * on our BDP (highest rwnd and lowest rtt 4348 * we saw in the measurement window). 4349 * 4350 * Another option here would be to 4351 * instead skip the measurement. 
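 * (For example, a peak rwnd of 1 MB seen together with a lowest
 * RTT of 10 ms caps any accepted measurement at roughly 100 MB/s.)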
4352 */ 4353 rack_log_pacing_delay_calc(rack, bytes, reqbytes, 4354 bytes_ps, rack->r_ctl.last_max_bw, 0, 4355 11, __LINE__, NULL, quality); 4356 bytes_ps = rack->r_ctl.last_max_bw; 4357 } 4358 /* We store gp for b/w in bytes per second */ 4359 if (rack->rc_gp_filled == 0) { 4360 /* Initial measurment */ 4361 if (bytes_ps) { 4362 rack->r_ctl.gp_bw = bytes_ps; 4363 rack->rc_gp_filled = 1; 4364 rack->r_ctl.num_measurements = 1; 4365 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 4366 } else { 4367 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4368 rack->r_ctl.rc_app_limited_cnt, 4369 0, 0, 10, __LINE__, NULL, quality); 4370 } 4371 if (rack->rc_inp->inp_in_hpts && 4372 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 4373 /* 4374 * Ok we can't trust the pacer in this case 4375 * where we transition from un-paced to paced. 4376 * Or for that matter when the burst mitigation 4377 * was making a wild guess and got it wrong. 4378 * Stop the pacer and clear up all the aggregate 4379 * delays etc. 4380 */ 4381 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 4382 rack->r_ctl.rc_hpts_flags = 0; 4383 rack->r_ctl.rc_last_output_to = 0; 4384 } 4385 did_add = 2; 4386 } else if (rack->r_ctl.num_measurements < RACK_REQ_AVG) { 4387 /* Still a small number run an average */ 4388 rack->r_ctl.gp_bw += bytes_ps; 4389 addpart = rack->r_ctl.num_measurements; 4390 rack->r_ctl.num_measurements++; 4391 if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) { 4392 /* We have collected enought to move forward */ 4393 rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_measurements; 4394 } 4395 did_add = 3; 4396 } else { 4397 /* 4398 * We want to take 1/wma of the goodput and add in to 7/8th 4399 * of the old value weighted by the srtt. So if your measurement 4400 * period is say 2 SRTT's long you would get 1/4 as the 4401 * value, if it was like 1/2 SRTT then you would get 1/16th. 4402 * 4403 * But we must be careful not to take too much i.e. if the 4404 * srtt is say 20ms and the measurement is taken over 4405 * 400ms our weight would be 400/20 i.e. 20. On the 4406 * other hand if we get a measurement over 1ms with a 4407 * 10ms rtt we only want to take a much smaller portion. 4408 */ 4409 if (rack->r_ctl.num_measurements < 0xff) { 4410 rack->r_ctl.num_measurements++; 4411 } 4412 srtt = (uint64_t)tp->t_srtt; 4413 if (srtt == 0) { 4414 /* 4415 * Strange why did t_srtt go back to zero? 4416 */ 4417 if (rack->r_ctl.rc_rack_min_rtt) 4418 srtt = rack->r_ctl.rc_rack_min_rtt; 4419 else 4420 srtt = HPTS_USEC_IN_MSEC; 4421 } 4422 /* 4423 * XXXrrs: Note for reviewers, in playing with 4424 * dynamic pacing I discovered this GP calculation 4425 * as done originally leads to some undesired results. 4426 * Basically you can get longer measurements contributing 4427 * too much to the WMA. Thus I changed it if you are doing 4428 * dynamic adjustments to only do the aportioned adjustment 4429 * if we have a very small (time wise) measurement. Longer 4430 * measurements just get there weight (defaulting to 1/8) 4431 * add to the WMA. We may want to think about changing 4432 * this to always do that for both sides i.e. dynamic 4433 * and non-dynamic... but considering lots of folks 4434 * were playing with this I did not want to change the 4435 * calculation per.se. without your thoughts.. Lawerence? 4436 * Peter?? 
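 * For illustration, in the dynamic case a measurement spanning
 * half an srtt with the default 1/8 weighting replaces only about
 * 1/16 of the running gp_bw, while one spanning two srtts or more
 * folds in the full 1/8.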
4437 */ 4438 if (rack->rc_gp_dyn_mul == 0) { 4439 subpart = rack->r_ctl.gp_bw * utim; 4440 subpart /= (srtt * 8); 4441 if (subpart < (rack->r_ctl.gp_bw / 2)) { 4442 /* 4443 * The b/w update takes no more 4444 * away then 1/2 our running total 4445 * so factor it in. 4446 */ 4447 addpart = bytes_ps * utim; 4448 addpart /= (srtt * 8); 4449 } else { 4450 /* 4451 * Don't allow a single measurement 4452 * to account for more than 1/2 of the 4453 * WMA. This could happen on a retransmission 4454 * where utim becomes huge compared to 4455 * srtt (multiple retransmissions when using 4456 * the sending rate which factors in all the 4457 * transmissions from the first one). 4458 */ 4459 subpart = rack->r_ctl.gp_bw / 2; 4460 addpart = bytes_ps / 2; 4461 } 4462 resid_bw = rack->r_ctl.gp_bw - subpart; 4463 rack->r_ctl.gp_bw = resid_bw + addpart; 4464 did_add = 1; 4465 } else { 4466 if ((utim / srtt) <= 1) { 4467 /* 4468 * The b/w update was over a small period 4469 * of time. The idea here is to prevent a small 4470 * measurement time period from counting 4471 * too much. So we scale it based on the 4472 * time so it attributes less than 1/rack_wma_divisor 4473 * of its measurement. 4474 */ 4475 subpart = rack->r_ctl.gp_bw * utim; 4476 subpart /= (srtt * rack_wma_divisor); 4477 addpart = bytes_ps * utim; 4478 addpart /= (srtt * rack_wma_divisor); 4479 } else { 4480 /* 4481 * The scaled measurement was long 4482 * enough so lets just add in the 4483 * portion of the measurment i.e. 1/rack_wma_divisor 4484 */ 4485 subpart = rack->r_ctl.gp_bw / rack_wma_divisor; 4486 addpart = bytes_ps / rack_wma_divisor; 4487 } 4488 if ((rack->measure_saw_probe_rtt == 0) || 4489 (bytes_ps > rack->r_ctl.gp_bw)) { 4490 /* 4491 * For probe-rtt we only add it in 4492 * if its larger, all others we just 4493 * add in. 4494 */ 4495 did_add = 1; 4496 resid_bw = rack->r_ctl.gp_bw - subpart; 4497 rack->r_ctl.gp_bw = resid_bw + addpart; 4498 } 4499 } 4500 } 4501 if ((rack->gp_ready == 0) && 4502 (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) { 4503 /* We have enough measurements now */ 4504 rack->gp_ready = 1; 4505 rack_set_cc_pacing(rack); 4506 if (rack->defer_options) 4507 rack_apply_deferred_options(rack); 4508 } 4509 rack_log_pacing_delay_calc(rack, subpart, addpart, bytes_ps, stim, 4510 rack_get_bw(rack), 22, did_add, NULL, quality); 4511 /* We do not update any multipliers if we are in or have seen a probe-rtt */ 4512 if ((rack->measure_saw_probe_rtt == 0) && rack->rc_gp_rtt_set) 4513 rack_update_multiplier(rack, timely_says, bytes_ps, 4514 rack->r_ctl.rc_gp_srtt, 4515 rack->r_ctl.rc_rtt_diff); 4516 rack_log_pacing_delay_calc(rack, bytes, tim, bytes_ps, stim, 4517 rack_get_bw(rack), 3, line, NULL, quality); 4518 /* reset the gp srtt and setup the new prev */ 4519 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; 4520 /* Record the lost count for the next measurement */ 4521 rack->r_ctl.rc_loss_at_start = rack->r_ctl.rc_loss_count; 4522 /* 4523 * We restart our diffs based on the gpsrtt in the 4524 * measurement window. 4525 */ 4526 rack->rc_gp_rtt_set = 0; 4527 rack->rc_gp_saw_rec = 0; 4528 rack->rc_gp_saw_ca = 0; 4529 rack->rc_gp_saw_ss = 0; 4530 rack->rc_dragged_bottom = 0; 4531 skip_measurement: 4532 4533 #ifdef STATS 4534 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, 4535 gput); 4536 /* 4537 * XXXLAS: This is a temporary hack, and should be 4538 * chained off VOI_TCP_GPUT when stats(9) grows an 4539 * API to deal with chained VOIs. 
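 * The VOI_TCP_GPUT_ND update below records the percent change of
 * this goodput sample relative to the previous one.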
4540 */ 4541 if (tp->t_stats_gput_prev > 0) 4542 stats_voi_update_abs_s32(tp->t_stats, 4543 VOI_TCP_GPUT_ND, 4544 ((gput - tp->t_stats_gput_prev) * 100) / 4545 tp->t_stats_gput_prev); 4546 #endif 4547 tp->t_flags &= ~TF_GPUTINPROG; 4548 tp->t_stats_gput_prev = gput; 4549 /* 4550 * Now are we app limited now and there is space from where we 4551 * were to where we want to go? 4552 * 4553 * We don't do the other case i.e. non-applimited here since 4554 * the next send will trigger us picking up the missing data. 4555 */ 4556 if (rack->r_ctl.rc_first_appl && 4557 TCPS_HAVEESTABLISHED(tp->t_state) && 4558 rack->r_ctl.rc_app_limited_cnt && 4559 (SEQ_GT(rack->r_ctl.rc_first_appl->r_start, th_ack)) && 4560 ((rack->r_ctl.rc_first_appl->r_end - th_ack) > 4561 max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { 4562 /* 4563 * Yep there is enough outstanding to make a measurement here. 4564 */ 4565 struct rack_sendmap *rsm, fe; 4566 4567 rack->r_ctl.rc_gp_lowrtt = 0xffffffff; 4568 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 4569 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 4570 rack->app_limited_needs_set = 0; 4571 tp->gput_seq = th_ack; 4572 if (rack->in_probe_rtt) 4573 rack->measure_saw_probe_rtt = 1; 4574 else if ((rack->measure_saw_probe_rtt) && 4575 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 4576 rack->measure_saw_probe_rtt = 0; 4577 if ((rack->r_ctl.rc_first_appl->r_end - th_ack) >= rack_get_measure_window(tp, rack)) { 4578 /* There is a full window to gain info from */ 4579 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 4580 } else { 4581 /* We can only measure up to the applimited point */ 4582 tp->gput_ack = tp->gput_seq + (rack->r_ctl.rc_first_appl->r_end - th_ack); 4583 if ((tp->gput_ack - tp->gput_seq) < (MIN_GP_WIN * segsiz)) { 4584 /* 4585 * We don't have enough to make a measurement. 4586 */ 4587 tp->t_flags &= ~TF_GPUTINPROG; 4588 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq, 4589 0, 0, 0, 6, __LINE__, NULL, quality); 4590 return; 4591 } 4592 } 4593 if (tp->t_state >= TCPS_FIN_WAIT_1) { 4594 /* 4595 * We will get no more data into the SB 4596 * this means we need to have the data available 4597 * before we start a measurement. 4598 */ 4599 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) < (tp->gput_ack - tp->gput_seq)) { 4600 /* Nope not enough data. */ 4601 return; 4602 } 4603 } 4604 tp->t_flags |= TF_GPUTINPROG; 4605 /* 4606 * Now we need to find the timestamp of the send at tp->gput_seq 4607 * for the send based measurement. 4608 */ 4609 fe.r_start = tp->gput_seq; 4610 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 4611 if (rsm) { 4612 /* Ok send-based limit is set */ 4613 if (SEQ_LT(rsm->r_start, tp->gput_seq)) { 4614 /* 4615 * Move back to include the earlier part 4616 * so our ack time lines up right (this may 4617 * make an overlapping measurement but thats 4618 * ok). 4619 */ 4620 tp->gput_seq = rsm->r_start; 4621 } 4622 if (rsm->r_flags & RACK_ACKED) 4623 tp->gput_ts = (uint32_t)rsm->r_ack_arrival; 4624 else 4625 rack->app_limited_needs_set = 1; 4626 rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 4627 } else { 4628 /* 4629 * If we don't find the rsm due to some 4630 * send-limit set the current time, which 4631 * basically disables the send-limit. 
4632 */ 4633 struct timeval tv; 4634 4635 microuptime(&tv); 4636 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); 4637 } 4638 rack_log_pacing_delay_calc(rack, 4639 tp->gput_seq, 4640 tp->gput_ack, 4641 (uint64_t)rsm, 4642 tp->gput_ts, 4643 rack->r_ctl.rc_app_limited_cnt, 4644 9, 4645 __LINE__, NULL, quality); 4646 } 4647 } 4648 4649 /* 4650 * CC wrapper hook functions 4651 */ 4652 static void 4653 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint16_t nsegs, 4654 uint16_t type, int32_t recovery) 4655 { 4656 uint32_t prior_cwnd, acked; 4657 struct tcp_log_buffer *lgb = NULL; 4658 uint8_t labc_to_use, quality; 4659 4660 INP_WLOCK_ASSERT(tp->t_inpcb); 4661 tp->ccv->nsegs = nsegs; 4662 acked = tp->ccv->bytes_this_ack = (th_ack - tp->snd_una); 4663 if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) { 4664 uint32_t max; 4665 4666 max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp); 4667 if (tp->ccv->bytes_this_ack > max) { 4668 tp->ccv->bytes_this_ack = max; 4669 } 4670 } 4671 #ifdef STATS 4672 stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, 4673 ((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd); 4674 #endif 4675 quality = RACK_QUALITY_NONE; 4676 if ((tp->t_flags & TF_GPUTINPROG) && 4677 rack_enough_for_measurement(tp, rack, th_ack, &quality)) { 4678 /* Measure the Goodput */ 4679 rack_do_goodput_measurement(tp, rack, th_ack, __LINE__, quality); 4680 #ifdef NETFLIX_PEAKRATE 4681 if ((type == CC_ACK) && 4682 (tp->t_maxpeakrate)) { 4683 /* 4684 * We update t_peakrate_thr. This gives us roughly 4685 * one update per round trip time. Note 4686 * it will only be used if pace_always is off i.e 4687 * we don't do this for paced flows. 4688 */ 4689 rack_update_peakrate_thr(tp); 4690 } 4691 #endif 4692 } 4693 /* Which way our we limited, if not cwnd limited no advance in CA */ 4694 if (tp->snd_cwnd <= tp->snd_wnd) 4695 tp->ccv->flags |= CCF_CWND_LIMITED; 4696 else 4697 tp->ccv->flags &= ~CCF_CWND_LIMITED; 4698 if (tp->snd_cwnd > tp->snd_ssthresh) { 4699 tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, 4700 nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp)); 4701 /* For the setting of a window past use the actual scwnd we are using */ 4702 if (tp->t_bytes_acked >= rack->r_ctl.cwnd_to_use) { 4703 tp->t_bytes_acked -= rack->r_ctl.cwnd_to_use; 4704 tp->ccv->flags |= CCF_ABC_SENTAWND; 4705 } 4706 } else { 4707 tp->ccv->flags &= ~CCF_ABC_SENTAWND; 4708 tp->t_bytes_acked = 0; 4709 } 4710 prior_cwnd = tp->snd_cwnd; 4711 if ((recovery == 0) || (rack_max_abc_post_recovery == 0) || rack->r_use_labc_for_rec || 4712 (rack_client_low_buf && (rack->client_bufferlvl < rack_client_low_buf))) 4713 labc_to_use = rack->rc_labc; 4714 else 4715 labc_to_use = rack_max_abc_post_recovery; 4716 if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { 4717 union tcp_log_stackspecific log; 4718 struct timeval tv; 4719 4720 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 4721 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 4722 log.u_bbr.flex1 = th_ack; 4723 log.u_bbr.flex2 = tp->ccv->flags; 4724 log.u_bbr.flex3 = tp->ccv->bytes_this_ack; 4725 log.u_bbr.flex4 = tp->ccv->nsegs; 4726 log.u_bbr.flex5 = labc_to_use; 4727 log.u_bbr.flex6 = prior_cwnd; 4728 log.u_bbr.flex7 = V_tcp_do_newsack; 4729 log.u_bbr.flex8 = 1; 4730 lgb = tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 4731 0, &log, false, NULL, NULL, 0, &tv); 4732 } 4733 if (CC_ALGO(tp)->ack_received != NULL) { 4734 /* XXXLAS: Find a way to live without this */ 4735 tp->ccv->curack = th_ack; 4736 
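/*
 * Hand the CC module the local ABC factor chosen above; with
 * CCF_USE_LOCAL_ABC set it is used in place of the global
 * V_tcp_abc_l_var.
 */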
tp->ccv->labc = labc_to_use; 4737 tp->ccv->flags |= CCF_USE_LOCAL_ABC; 4738 CC_ALGO(tp)->ack_received(tp->ccv, type); 4739 } 4740 if (lgb) { 4741 lgb->tlb_stackinfo.u_bbr.flex6 = tp->snd_cwnd; 4742 } 4743 if (rack->r_must_retran) { 4744 if (SEQ_GEQ(th_ack, rack->r_ctl.rc_snd_max_at_rto)) { 4745 /* 4746 * We now are beyond the rxt point so lets disable 4747 * the flag. 4748 */ 4749 rack->r_ctl.rc_out_at_rto = 0; 4750 rack->r_must_retran = 0; 4751 } else if ((prior_cwnd + ctf_fixed_maxseg(tp)) <= tp->snd_cwnd) { 4752 /* 4753 * Only decrement the rc_out_at_rto if the cwnd advances 4754 * at least a whole segment. Otherwise next time the peer 4755 * acks, we won't be able to send this generaly happens 4756 * when we are in Congestion Avoidance. 4757 */ 4758 if (acked <= rack->r_ctl.rc_out_at_rto){ 4759 rack->r_ctl.rc_out_at_rto -= acked; 4760 } else { 4761 rack->r_ctl.rc_out_at_rto = 0; 4762 } 4763 } 4764 } 4765 #ifdef STATS 4766 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, rack->r_ctl.cwnd_to_use); 4767 #endif 4768 if (rack->r_ctl.rc_rack_largest_cwnd < rack->r_ctl.cwnd_to_use) { 4769 rack->r_ctl.rc_rack_largest_cwnd = rack->r_ctl.cwnd_to_use; 4770 } 4771 #ifdef NETFLIX_PEAKRATE 4772 /* we enforce max peak rate if it is set and we are not pacing */ 4773 if ((rack->rc_always_pace == 0) && 4774 tp->t_peakrate_thr && 4775 (tp->snd_cwnd > tp->t_peakrate_thr)) { 4776 tp->snd_cwnd = tp->t_peakrate_thr; 4777 } 4778 #endif 4779 } 4780 4781 static void 4782 tcp_rack_partialack(struct tcpcb *tp) 4783 { 4784 struct tcp_rack *rack; 4785 4786 rack = (struct tcp_rack *)tp->t_fb_ptr; 4787 INP_WLOCK_ASSERT(tp->t_inpcb); 4788 /* 4789 * If we are doing PRR and have enough 4790 * room to send <or> we are pacing and prr 4791 * is disabled we will want to see if we 4792 * can send data (by setting r_wanted_output to 4793 * true). 4794 */ 4795 if ((rack->r_ctl.rc_prr_sndcnt > 0) || 4796 rack->rack_no_prr) 4797 rack->r_wanted_output = 1; 4798 } 4799 4800 static void 4801 rack_post_recovery(struct tcpcb *tp, uint32_t th_ack) 4802 { 4803 struct tcp_rack *rack; 4804 uint32_t orig_cwnd; 4805 4806 orig_cwnd = tp->snd_cwnd; 4807 INP_WLOCK_ASSERT(tp->t_inpcb); 4808 rack = (struct tcp_rack *)tp->t_fb_ptr; 4809 /* only alert CC if we alerted when we entered */ 4810 if (CC_ALGO(tp)->post_recovery != NULL) { 4811 tp->ccv->curack = th_ack; 4812 CC_ALGO(tp)->post_recovery(tp->ccv); 4813 if (tp->snd_cwnd < tp->snd_ssthresh) { 4814 /* 4815 * Rack has burst control and pacing 4816 * so lets not set this any lower than 4817 * snd_ssthresh per RFC-6582 (option 2). 
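 * In other words, if the CC module's post_recovery hook pulled
 * cwnd below ssthresh, floor it back at ssthresh.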
4818 */ 4819 tp->snd_cwnd = tp->snd_ssthresh; 4820 } 4821 } 4822 if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { 4823 union tcp_log_stackspecific log; 4824 struct timeval tv; 4825 4826 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 4827 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 4828 log.u_bbr.flex1 = th_ack; 4829 log.u_bbr.flex2 = tp->ccv->flags; 4830 log.u_bbr.flex3 = tp->ccv->bytes_this_ack; 4831 log.u_bbr.flex4 = tp->ccv->nsegs; 4832 log.u_bbr.flex5 = V_tcp_abc_l_var; 4833 log.u_bbr.flex6 = orig_cwnd; 4834 log.u_bbr.flex7 = V_tcp_do_newsack; 4835 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 4836 log.u_bbr.flex8 = 2; 4837 tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 4838 0, &log, false, NULL, NULL, 0, &tv); 4839 } 4840 if ((rack->rack_no_prr == 0) && 4841 (rack->no_prr_addback == 0) && 4842 (rack->r_ctl.rc_prr_sndcnt > 0)) { 4843 /* 4844 * Suck the next prr cnt back into cwnd, but 4845 * only do that if we are not application limited. 4846 */ 4847 if (ctf_outstanding(tp) <= sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { 4848 /* 4849 * We are allowed to add back to the cwnd the amount we did 4850 * not get out if: 4851 * a) no_prr_addback is off. 4852 * b) we are not app limited 4853 * c) we are doing prr 4854 * <and> 4855 * d) it is bounded by rack_prr_addbackmax (if addback is 0, then none). 4856 */ 4857 tp->snd_cwnd += min((ctf_fixed_maxseg(tp) * rack_prr_addbackmax), 4858 rack->r_ctl.rc_prr_sndcnt); 4859 } 4860 rack->r_ctl.rc_prr_sndcnt = 0; 4861 rack_log_to_prr(rack, 1, 0); 4862 } 4863 rack_log_to_prr(rack, 14, orig_cwnd); 4864 tp->snd_recover = tp->snd_una; 4865 EXIT_RECOVERY(tp->t_flags); 4866 } 4867 4868 static void 4869 rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack) 4870 { 4871 struct tcp_rack *rack; 4872 uint32_t ssthresh_enter, cwnd_enter, in_rec_at_entry, orig_cwnd; 4873 4874 INP_WLOCK_ASSERT(tp->t_inpcb); 4875 #ifdef STATS 4876 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type); 4877 #endif 4878 if (IN_RECOVERY(tp->t_flags) == 0) { 4879 in_rec_at_entry = 0; 4880 ssthresh_enter = tp->snd_ssthresh; 4881 cwnd_enter = tp->snd_cwnd; 4882 } else 4883 in_rec_at_entry = 1; 4884 rack = (struct tcp_rack *)tp->t_fb_ptr; 4885 switch (type) { 4886 case CC_NDUPACK: 4887 tp->t_flags &= ~TF_WASFRECOVERY; 4888 tp->t_flags &= ~TF_WASCRECOVERY; 4889 if (!IN_FASTRECOVERY(tp->t_flags)) { 4890 rack->r_ctl.rc_prr_delivered = 0; 4891 rack->r_ctl.rc_prr_out = 0; 4892 if (rack->rack_no_prr == 0) { 4893 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 4894 rack_log_to_prr(rack, 2, in_rec_at_entry); 4895 } 4896 rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una; 4897 tp->snd_recover = tp->snd_max; 4898 if (tp->t_flags2 & TF2_ECN_PERMIT) 4899 tp->t_flags2 |= TF2_ECN_SND_CWR; 4900 } 4901 break; 4902 case CC_ECN: 4903 if (!IN_CONGRECOVERY(tp->t_flags) || 4904 /* 4905 * Allow ECN reaction on ACK to CWR, if 4906 * that data segment was also CE marked. 
4907 */ 4908 SEQ_GEQ(ack, tp->snd_recover)) { 4909 EXIT_CONGRECOVERY(tp->t_flags); 4910 KMOD_TCPSTAT_INC(tcps_ecn_rcwnd); 4911 tp->snd_recover = tp->snd_max + 1; 4912 if (tp->t_flags2 & TF2_ECN_PERMIT) 4913 tp->t_flags2 |= TF2_ECN_SND_CWR; 4914 } 4915 break; 4916 case CC_RTO: 4917 tp->t_dupacks = 0; 4918 tp->t_bytes_acked = 0; 4919 EXIT_RECOVERY(tp->t_flags); 4920 tp->snd_ssthresh = max(2, min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 / 4921 ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp); 4922 orig_cwnd = tp->snd_cwnd; 4923 tp->snd_cwnd = ctf_fixed_maxseg(tp); 4924 rack_log_to_prr(rack, 16, orig_cwnd); 4925 if (tp->t_flags2 & TF2_ECN_PERMIT) 4926 tp->t_flags2 |= TF2_ECN_SND_CWR; 4927 break; 4928 case CC_RTO_ERR: 4929 KMOD_TCPSTAT_INC(tcps_sndrexmitbad); 4930 /* RTO was unnecessary, so reset everything. */ 4931 tp->snd_cwnd = tp->snd_cwnd_prev; 4932 tp->snd_ssthresh = tp->snd_ssthresh_prev; 4933 tp->snd_recover = tp->snd_recover_prev; 4934 if (tp->t_flags & TF_WASFRECOVERY) { 4935 ENTER_FASTRECOVERY(tp->t_flags); 4936 tp->t_flags &= ~TF_WASFRECOVERY; 4937 } 4938 if (tp->t_flags & TF_WASCRECOVERY) { 4939 ENTER_CONGRECOVERY(tp->t_flags); 4940 tp->t_flags &= ~TF_WASCRECOVERY; 4941 } 4942 tp->snd_nxt = tp->snd_max; 4943 tp->t_badrxtwin = 0; 4944 break; 4945 } 4946 if ((CC_ALGO(tp)->cong_signal != NULL) && 4947 (type != CC_RTO)){ 4948 tp->ccv->curack = ack; 4949 CC_ALGO(tp)->cong_signal(tp->ccv, type); 4950 } 4951 if ((in_rec_at_entry == 0) && IN_RECOVERY(tp->t_flags)) { 4952 rack_log_to_prr(rack, 15, cwnd_enter); 4953 rack->r_ctl.dsack_byte_cnt = 0; 4954 rack->r_ctl.retran_during_recovery = 0; 4955 rack->r_ctl.rc_cwnd_at_erec = cwnd_enter; 4956 rack->r_ctl.rc_ssthresh_at_erec = ssthresh_enter; 4957 rack->r_ent_rec_ns = 1; 4958 } 4959 } 4960 4961 static inline void 4962 rack_cc_after_idle(struct tcp_rack *rack, struct tcpcb *tp) 4963 { 4964 uint32_t i_cwnd; 4965 4966 INP_WLOCK_ASSERT(tp->t_inpcb); 4967 4968 #ifdef NETFLIX_STATS 4969 KMOD_TCPSTAT_INC(tcps_idle_restarts); 4970 if (tp->t_state == TCPS_ESTABLISHED) 4971 KMOD_TCPSTAT_INC(tcps_idle_estrestarts); 4972 #endif 4973 if (CC_ALGO(tp)->after_idle != NULL) 4974 CC_ALGO(tp)->after_idle(tp->ccv); 4975 4976 if (tp->snd_cwnd == 1) 4977 i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ 4978 else 4979 i_cwnd = rc_init_window(rack); 4980 4981 /* 4982 * Being idle is no differnt than the initial window. If the cc 4983 * clamps it down below the initial window raise it to the initial 4984 * window. 4985 */ 4986 if (tp->snd_cwnd < i_cwnd) { 4987 tp->snd_cwnd = i_cwnd; 4988 } 4989 } 4990 4991 /* 4992 * Indicate whether this ack should be delayed. We can delay the ack if 4993 * following conditions are met: 4994 * - There is no delayed ack timer in progress. 4995 * - Our last ack wasn't a 0-sized window. We never want to delay 4996 * the ack that opens up a 0-sized window. 4997 * - LRO wasn't used for this segment. We make sure by checking that the 4998 * segment size is not larger than the MSS. 4999 * - Delayed acks are enabled or this is a half-synchronized T/TCP 5000 * connection. 5001 */ 5002 #define DELAY_ACK(tp, tlen) \ 5003 (((tp->t_flags & TF_RXWIN0SENT) == 0) && \ 5004 ((tp->t_flags & TF_DELACK) == 0) && \ 5005 (tlen <= tp->t_maxseg) && \ 5006 (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN))) 5007 5008 static struct rack_sendmap * 5009 rack_find_lowest_rsm(struct tcp_rack *rack) 5010 { 5011 struct rack_sendmap *rsm; 5012 5013 /* 5014 * Walk the time-order transmitted list looking for an rsm that is 5015 * not acked. 
This will be the one that was sent the longest time 5016 * ago that is still outstanding. 5017 */ 5018 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 5019 if (rsm->r_flags & RACK_ACKED) { 5020 continue; 5021 } 5022 goto finish; 5023 } 5024 finish: 5025 return (rsm); 5026 } 5027 5028 static struct rack_sendmap * 5029 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm) 5030 { 5031 struct rack_sendmap *prsm; 5032 5033 /* 5034 * Walk the sequence order list backward until we hit and arrive at 5035 * the highest seq not acked. In theory when this is called it 5036 * should be the last segment (which it was not). 5037 */ 5038 counter_u64_add(rack_find_high, 1); 5039 prsm = rsm; 5040 RB_FOREACH_REVERSE_FROM(prsm, rack_rb_tree_head, rsm) { 5041 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) { 5042 continue; 5043 } 5044 return (prsm); 5045 } 5046 return (NULL); 5047 } 5048 5049 static uint32_t 5050 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts) 5051 { 5052 int32_t lro; 5053 uint32_t thresh; 5054 5055 /* 5056 * lro is the flag we use to determine if we have seen reordering. 5057 * If it gets set we have seen reordering. The reorder logic either 5058 * works in one of two ways: 5059 * 5060 * If reorder-fade is configured, then we track the last time we saw 5061 * re-ordering occur. If we reach the point where enough time as 5062 * passed we no longer consider reordering has occuring. 5063 * 5064 * Or if reorder-face is 0, then once we see reordering we consider 5065 * the connection to alway be subject to reordering and just set lro 5066 * to 1. 5067 * 5068 * In the end if lro is non-zero we add the extra time for 5069 * reordering in. 5070 */ 5071 if (srtt == 0) 5072 srtt = 1; 5073 if (rack->r_ctl.rc_reorder_ts) { 5074 if (rack->r_ctl.rc_reorder_fade) { 5075 if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) { 5076 lro = cts - rack->r_ctl.rc_reorder_ts; 5077 if (lro == 0) { 5078 /* 5079 * No time as passed since the last 5080 * reorder, mark it as reordering. 5081 */ 5082 lro = 1; 5083 } 5084 } else { 5085 /* Negative time? 
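 * The timestamp raced behind the recorded reorder point,
 * so treat it as if no reordering allowance is needed.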
*/ 5086 lro = 0; 5087 } 5088 if (lro > rack->r_ctl.rc_reorder_fade) { 5089 /* Turn off reordering seen too */ 5090 rack->r_ctl.rc_reorder_ts = 0; 5091 lro = 0; 5092 } 5093 } else { 5094 /* Reodering does not fade */ 5095 lro = 1; 5096 } 5097 } else { 5098 lro = 0; 5099 } 5100 thresh = srtt + rack->r_ctl.rc_pkt_delay; 5101 if (lro) { 5102 /* It must be set, if not you get 1/4 rtt */ 5103 if (rack->r_ctl.rc_reorder_shift) 5104 thresh += (srtt >> rack->r_ctl.rc_reorder_shift); 5105 else 5106 thresh += (srtt >> 2); 5107 } else { 5108 thresh += 1; 5109 } 5110 /* We don't let the rack timeout be above a RTO */ 5111 if (thresh > rack->rc_tp->t_rxtcur) { 5112 thresh = rack->rc_tp->t_rxtcur; 5113 } 5114 /* And we don't want it above the RTO max either */ 5115 if (thresh > rack_rto_max) { 5116 thresh = rack_rto_max; 5117 } 5118 return (thresh); 5119 } 5120 5121 static uint32_t 5122 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack, 5123 struct rack_sendmap *rsm, uint32_t srtt) 5124 { 5125 struct rack_sendmap *prsm; 5126 uint32_t thresh, len; 5127 int segsiz; 5128 5129 if (srtt == 0) 5130 srtt = 1; 5131 if (rack->r_ctl.rc_tlp_threshold) 5132 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold); 5133 else 5134 thresh = (srtt * 2); 5135 5136 /* Get the previous sent packet, if any */ 5137 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 5138 counter_u64_add(rack_enter_tlp_calc, 1); 5139 len = rsm->r_end - rsm->r_start; 5140 if (rack->rack_tlp_threshold_use == TLP_USE_ID) { 5141 /* Exactly like the ID */ 5142 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= segsiz) { 5143 uint32_t alt_thresh; 5144 /* 5145 * Compensate for delayed-ack with the d-ack time. 5146 */ 5147 counter_u64_add(rack_used_tlpmethod, 1); 5148 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 5149 if (alt_thresh > thresh) 5150 thresh = alt_thresh; 5151 } 5152 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) { 5153 /* 2.1 behavior */ 5154 prsm = TAILQ_PREV(rsm, rack_head, r_tnext); 5155 if (prsm && (len <= segsiz)) { 5156 /* 5157 * Two packets outstanding, thresh should be (2*srtt) + 5158 * possible inter-packet delay (if any). 5159 */ 5160 uint32_t inter_gap = 0; 5161 int idx, nidx; 5162 5163 counter_u64_add(rack_used_tlpmethod, 1); 5164 idx = rsm->r_rtr_cnt - 1; 5165 nidx = prsm->r_rtr_cnt - 1; 5166 if (rsm->r_tim_lastsent[nidx] >= prsm->r_tim_lastsent[idx]) { 5167 /* Yes it was sent later (or at the same time) */ 5168 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; 5169 } 5170 thresh += inter_gap; 5171 } else if (len <= segsiz) { 5172 /* 5173 * Possibly compensate for delayed-ack. 5174 */ 5175 uint32_t alt_thresh; 5176 5177 counter_u64_add(rack_used_tlpmethod2, 1); 5178 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 5179 if (alt_thresh > thresh) 5180 thresh = alt_thresh; 5181 } 5182 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) { 5183 /* 2.2 behavior */ 5184 if (len <= segsiz) { 5185 uint32_t alt_thresh; 5186 /* 5187 * Compensate for delayed-ack with the d-ack time. 
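 * (i.e. alt_thresh = srtt + srtt / 2 + rack_delayed_ack_time,
 * and the larger of the two thresholds wins.)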
5188 */ 5189 counter_u64_add(rack_used_tlpmethod, 1); 5190 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 5191 if (alt_thresh > thresh) 5192 thresh = alt_thresh; 5193 } 5194 } 5195 /* Not above an RTO */ 5196 if (thresh > tp->t_rxtcur) { 5197 thresh = tp->t_rxtcur; 5198 } 5199 /* Not above a RTO max */ 5200 if (thresh > rack_rto_max) { 5201 thresh = rack_rto_max; 5202 } 5203 /* Apply user supplied min TLP */ 5204 if (thresh < rack_tlp_min) { 5205 thresh = rack_tlp_min; 5206 } 5207 return (thresh); 5208 } 5209 5210 static uint32_t 5211 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack) 5212 { 5213 /* 5214 * We want the rack_rtt which is the 5215 * last rtt we measured. However if that 5216 * does not exist we fallback to the srtt (which 5217 * we probably will never do) and then as a last 5218 * resort we use RACK_INITIAL_RTO if no srtt is 5219 * yet set. 5220 */ 5221 if (rack->rc_rack_rtt) 5222 return (rack->rc_rack_rtt); 5223 else if (tp->t_srtt == 0) 5224 return (RACK_INITIAL_RTO); 5225 return (tp->t_srtt); 5226 } 5227 5228 static struct rack_sendmap * 5229 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) 5230 { 5231 /* 5232 * Check to see that we don't need to fall into recovery. We will 5233 * need to do so if our oldest transmit is past the time we should 5234 * have had an ack. 5235 */ 5236 struct tcp_rack *rack; 5237 struct rack_sendmap *rsm; 5238 int32_t idx; 5239 uint32_t srtt, thresh; 5240 5241 rack = (struct tcp_rack *)tp->t_fb_ptr; 5242 if (RB_EMPTY(&rack->r_ctl.rc_mtree)) { 5243 return (NULL); 5244 } 5245 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 5246 if (rsm == NULL) 5247 return (NULL); 5248 5249 if (rsm->r_flags & RACK_ACKED) { 5250 rsm = rack_find_lowest_rsm(rack); 5251 if (rsm == NULL) 5252 return (NULL); 5253 } 5254 idx = rsm->r_rtr_cnt - 1; 5255 srtt = rack_grab_rtt(tp, rack); 5256 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 5257 if (TSTMP_LT(tsused, ((uint32_t)rsm->r_tim_lastsent[idx]))) { 5258 return (NULL); 5259 } 5260 if ((tsused - ((uint32_t)rsm->r_tim_lastsent[idx])) < thresh) { 5261 return (NULL); 5262 } 5263 /* Ok if we reach here we are over-due and this guy can be sent */ 5264 if (IN_RECOVERY(tp->t_flags) == 0) { 5265 /* 5266 * For the one that enters us into recovery record undo 5267 * info. 5268 */ 5269 rack->r_ctl.rc_rsm_start = rsm->r_start; 5270 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 5271 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 5272 } 5273 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una); 5274 return (rsm); 5275 } 5276 5277 static uint32_t 5278 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack) 5279 { 5280 int32_t t; 5281 int32_t tt; 5282 uint32_t ret_val; 5283 5284 t = (tp->t_srtt + (tp->t_rttvar << 2)); 5285 RACK_TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], 5286 rack_persist_min, rack_persist_max, rack->r_ctl.timer_slop); 5287 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 5288 tp->t_rxtshift++; 5289 rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT; 5290 ret_val = (uint32_t)tt; 5291 return (ret_val); 5292 } 5293 5294 static uint32_t 5295 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack) 5296 { 5297 /* 5298 * Start the FR timer, we do this based on getting the first one in 5299 * the rc_tmap. Note that if its NULL we must stop the timer. in all 5300 * events we need to stop the running timer (if its running) before 5301 * starting the new one. 
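 *
 * Roughly, the selection below goes: the persist timer if we are
 * in persist mode, RXT if there is nothing SACK-eligible on the
 * send map (or the caller asked to suppress rack), a RACK
 * retransmit timer if the oldest outstanding segment has been
 * SACK-passed or dup-acked enough, and otherwise a TLP.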
5302 */ 5303 uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse; 5304 uint32_t srtt_cur; 5305 int32_t idx; 5306 int32_t is_tlp_timer = 0; 5307 struct rack_sendmap *rsm; 5308 5309 if (rack->t_timers_stopped) { 5310 /* All timers have been stopped none are to run */ 5311 return (0); 5312 } 5313 if (rack->rc_in_persist) { 5314 /* We can't start any timer in persists */ 5315 return (rack_get_persists_timer_val(tp, rack)); 5316 } 5317 rack->rc_on_min_to = 0; 5318 if ((tp->t_state < TCPS_ESTABLISHED) || 5319 ((tp->t_flags & TF_SACK_PERMIT) == 0)) { 5320 goto activate_rxt; 5321 } 5322 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 5323 if ((rsm == NULL) || sup_rack) { 5324 /* Nothing on the send map or no rack */ 5325 activate_rxt: 5326 time_since_sent = 0; 5327 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 5328 if (rsm) { 5329 /* 5330 * Should we discount the RTX timer any? 5331 * 5332 * We want to discount it the smallest amount. 5333 * If a timer (Rack/TLP or RXT) has gone off more 5334 * recently thats the discount we want to use (now - timer time). 5335 * If the retransmit of the oldest packet was more recent then 5336 * we want to use that (now - oldest-packet-last_transmit_time). 5337 * 5338 */ 5339 idx = rsm->r_rtr_cnt - 1; 5340 if (TSTMP_GEQ(rack->r_ctl.rc_tlp_rxt_last_time, ((uint32_t)rsm->r_tim_lastsent[idx]))) 5341 tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time; 5342 else 5343 tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx]; 5344 if (TSTMP_GT(cts, tstmp_touse)) 5345 time_since_sent = cts - tstmp_touse; 5346 } 5347 if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { 5348 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; 5349 to = tp->t_rxtcur; 5350 if (to > time_since_sent) 5351 to -= time_since_sent; 5352 else 5353 to = rack->r_ctl.rc_min_to; 5354 if (to == 0) 5355 to = 1; 5356 /* Special case for KEEPINIT */ 5357 if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) && 5358 (TP_KEEPINIT(tp) != 0) && 5359 rsm) { 5360 /* 5361 * We have to put a ceiling on the rxt timer 5362 * of the keep-init timeout. 5363 */ 5364 uint32_t max_time, red; 5365 5366 max_time = TICKS_2_USEC(TP_KEEPINIT(tp)); 5367 if (TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) { 5368 red = (cts - (uint32_t)rsm->r_tim_lastsent[0]); 5369 if (red < max_time) 5370 max_time -= red; 5371 else 5372 max_time = 1; 5373 } 5374 /* Reduce timeout to the keep value if needed */ 5375 if (max_time < to) 5376 to = max_time; 5377 } 5378 return (to); 5379 } 5380 return (0); 5381 } 5382 if (rsm->r_flags & RACK_ACKED) { 5383 rsm = rack_find_lowest_rsm(rack); 5384 if (rsm == NULL) { 5385 /* No lowest? */ 5386 goto activate_rxt; 5387 } 5388 } 5389 if (rack->sack_attack_disable) { 5390 /* 5391 * We don't want to do 5392 * any TLP's if you are an attacker. 5393 * Though if you are doing what 5394 * is expected you may still have 5395 * SACK-PASSED marks. 5396 */ 5397 goto activate_rxt; 5398 } 5399 /* Convert from ms to usecs */ 5400 if ((rsm->r_flags & RACK_SACK_PASSED) || (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { 5401 if ((tp->t_flags & TF_SENTFIN) && 5402 ((tp->snd_max - tp->snd_una) == 1) && 5403 (rsm->r_flags & RACK_HAS_FIN)) { 5404 /* 5405 * We don't start a rack timer if all we have is a 5406 * FIN outstanding. 
5407 */ 5408 goto activate_rxt; 5409 } 5410 if ((rack->use_rack_rr == 0) && 5411 (IN_FASTRECOVERY(tp->t_flags)) && 5412 (rack->rack_no_prr == 0) && 5413 (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { 5414 /* 5415 * We are not cheating, in recovery and 5416 * not enough ack's to yet get our next 5417 * retransmission out. 5418 * 5419 * Note that classified attackers do not 5420 * get to use the rack-cheat. 5421 */ 5422 goto activate_tlp; 5423 } 5424 srtt = rack_grab_rtt(tp, rack); 5425 thresh = rack_calc_thresh_rack(rack, srtt, cts); 5426 idx = rsm->r_rtr_cnt - 1; 5427 exp = ((uint32_t)rsm->r_tim_lastsent[idx]) + thresh; 5428 if (SEQ_GEQ(exp, cts)) { 5429 to = exp - cts; 5430 if (to < rack->r_ctl.rc_min_to) { 5431 to = rack->r_ctl.rc_min_to; 5432 if (rack->r_rr_config == 3) 5433 rack->rc_on_min_to = 1; 5434 } 5435 } else { 5436 to = rack->r_ctl.rc_min_to; 5437 if (rack->r_rr_config == 3) 5438 rack->rc_on_min_to = 1; 5439 } 5440 } else { 5441 /* Ok we need to do a TLP not RACK */ 5442 activate_tlp: 5443 if ((rack->rc_tlp_in_progress != 0) && 5444 (rack->r_ctl.rc_tlp_cnt_out >= rack_tlp_limit)) { 5445 /* 5446 * The previous send was a TLP and we have sent 5447 * N TLP's without sending new data. 5448 */ 5449 goto activate_rxt; 5450 } 5451 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 5452 if (rsm == NULL) { 5453 /* We found no rsm to TLP with. */ 5454 goto activate_rxt; 5455 } 5456 if (rsm->r_flags & RACK_HAS_FIN) { 5457 /* If its a FIN we dont do TLP */ 5458 rsm = NULL; 5459 goto activate_rxt; 5460 } 5461 idx = rsm->r_rtr_cnt - 1; 5462 time_since_sent = 0; 5463 if (TSTMP_GEQ(((uint32_t)rsm->r_tim_lastsent[idx]), rack->r_ctl.rc_tlp_rxt_last_time)) 5464 tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx]; 5465 else 5466 tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time; 5467 if (TSTMP_GT(cts, tstmp_touse)) 5468 time_since_sent = cts - tstmp_touse; 5469 is_tlp_timer = 1; 5470 if (tp->t_srtt) { 5471 if ((rack->rc_srtt_measure_made == 0) && 5472 (tp->t_srtt == 1)) { 5473 /* 5474 * If another stack as run and set srtt to 1, 5475 * then the srtt was 0, so lets use the initial. 5476 */ 5477 srtt = RACK_INITIAL_RTO; 5478 } else { 5479 srtt_cur = tp->t_srtt; 5480 srtt = srtt_cur; 5481 } 5482 } else 5483 srtt = RACK_INITIAL_RTO; 5484 /* 5485 * If the SRTT is not keeping up and the 5486 * rack RTT has spiked we want to use 5487 * the last RTT not the smoothed one. 5488 */ 5489 if (rack_tlp_use_greater && 5490 tp->t_srtt && 5491 (srtt < rack_grab_rtt(tp, rack))) { 5492 srtt = rack_grab_rtt(tp, rack); 5493 } 5494 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt); 5495 if (thresh > time_since_sent) { 5496 to = thresh - time_since_sent; 5497 } else { 5498 to = rack->r_ctl.rc_min_to; 5499 rack_log_alt_to_to_cancel(rack, 5500 thresh, /* flex1 */ 5501 time_since_sent, /* flex2 */ 5502 tstmp_touse, /* flex3 */ 5503 rack->r_ctl.rc_tlp_rxt_last_time, /* flex4 */ 5504 (uint32_t)rsm->r_tim_lastsent[idx], 5505 srtt, 5506 idx, 99); 5507 } 5508 if (to < rack_tlp_min) { 5509 to = rack_tlp_min; 5510 } 5511 if (to > TICKS_2_USEC(TCPTV_REXMTMAX)) { 5512 /* 5513 * If the TLP time works out to larger than the max 5514 * RTO lets not do TLP.. just RTO. 
5515 */ 5516 goto activate_rxt; 5517 } 5518 } 5519 if (is_tlp_timer == 0) { 5520 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; 5521 } else { 5522 rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; 5523 } 5524 if (to == 0) 5525 to = 1; 5526 return (to); 5527 } 5528 5529 static void 5530 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5531 { 5532 if (rack->rc_in_persist == 0) { 5533 if (tp->t_flags & TF_GPUTINPROG) { 5534 /* 5535 * Stop the goodput now, the calling of the 5536 * measurement function clears the flag. 5537 */ 5538 rack_do_goodput_measurement(tp, rack, tp->snd_una, __LINE__, 5539 RACK_QUALITY_PERSIST); 5540 } 5541 #ifdef NETFLIX_SHARED_CWND 5542 if (rack->r_ctl.rc_scw) { 5543 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 5544 rack->rack_scwnd_is_idle = 1; 5545 } 5546 #endif 5547 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 5548 if (rack->r_ctl.rc_went_idle_time == 0) 5549 rack->r_ctl.rc_went_idle_time = 1; 5550 rack_timer_cancel(tp, rack, cts, __LINE__); 5551 tp->t_rxtshift = 0; 5552 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 5553 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 5554 rack->rc_in_persist = 1; 5555 } 5556 } 5557 5558 static void 5559 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5560 { 5561 if (rack->rc_inp->inp_in_hpts) { 5562 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 5563 rack->r_ctl.rc_hpts_flags = 0; 5564 } 5565 #ifdef NETFLIX_SHARED_CWND 5566 if (rack->r_ctl.rc_scw) { 5567 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 5568 rack->rack_scwnd_is_idle = 0; 5569 } 5570 #endif 5571 if (rack->rc_gp_dyn_mul && 5572 (rack->use_fixed_rate == 0) && 5573 (rack->rc_always_pace)) { 5574 /* 5575 * Do we count this as if a probe-rtt just 5576 * finished? 5577 */ 5578 uint32_t time_idle, idle_min; 5579 5580 time_idle = tcp_get_usecs(NULL) - rack->r_ctl.rc_went_idle_time; 5581 idle_min = rack_min_probertt_hold; 5582 if (rack_probertt_gpsrtt_cnt_div) { 5583 uint64_t extra; 5584 extra = (uint64_t)rack->r_ctl.rc_gp_srtt * 5585 (uint64_t)rack_probertt_gpsrtt_cnt_mul; 5586 extra /= (uint64_t)rack_probertt_gpsrtt_cnt_div; 5587 idle_min += (uint32_t)extra; 5588 } 5589 if (time_idle >= idle_min) { 5590 /* Yes, we count it as a probe-rtt. 
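 * If we were already in probe-rtt, exit it now; otherwise just
 * stamp the probe-rtt bookkeeping as though one had completed.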
*/ 5591 uint32_t us_cts; 5592 5593 us_cts = tcp_get_usecs(NULL); 5594 if (rack->in_probe_rtt == 0) { 5595 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 5596 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; 5597 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; 5598 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; 5599 } else { 5600 rack_exit_probertt(rack, us_cts); 5601 } 5602 } 5603 } 5604 rack->rc_in_persist = 0; 5605 rack->r_ctl.rc_went_idle_time = 0; 5606 tp->t_rxtshift = 0; 5607 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 5608 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 5609 rack->r_ctl.rc_agg_delayed = 0; 5610 rack->r_early = 0; 5611 rack->r_late = 0; 5612 rack->r_ctl.rc_agg_early = 0; 5613 } 5614 5615 static void 5616 rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts, 5617 struct hpts_diag *diag, struct timeval *tv) 5618 { 5619 if (rack_verbose_logging && rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 5620 union tcp_log_stackspecific log; 5621 5622 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 5623 log.u_bbr.flex1 = diag->p_nxt_slot; 5624 log.u_bbr.flex2 = diag->p_cur_slot; 5625 log.u_bbr.flex3 = diag->slot_req; 5626 log.u_bbr.flex4 = diag->inp_hptsslot; 5627 log.u_bbr.flex5 = diag->slot_remaining; 5628 log.u_bbr.flex6 = diag->need_new_to; 5629 log.u_bbr.flex7 = diag->p_hpts_active; 5630 log.u_bbr.flex8 = diag->p_on_min_sleep; 5631 /* Hijack other fields as needed */ 5632 log.u_bbr.epoch = diag->have_slept; 5633 log.u_bbr.lt_epoch = diag->yet_to_sleep; 5634 log.u_bbr.pkts_out = diag->co_ret; 5635 log.u_bbr.applimited = diag->hpts_sleep_time; 5636 log.u_bbr.delivered = diag->p_prev_slot; 5637 log.u_bbr.inflight = diag->p_runningslot; 5638 log.u_bbr.bw_inuse = diag->wheel_slot; 5639 log.u_bbr.rttProp = diag->wheel_cts; 5640 log.u_bbr.timeStamp = cts; 5641 log.u_bbr.delRate = diag->maxslots; 5642 log.u_bbr.cur_del_rate = diag->p_curtick; 5643 log.u_bbr.cur_del_rate <<= 32; 5644 log.u_bbr.cur_del_rate |= diag->p_lasttick; 5645 TCP_LOG_EVENTP(rack->rc_tp, NULL, 5646 &rack->rc_inp->inp_socket->so_rcv, 5647 &rack->rc_inp->inp_socket->so_snd, 5648 BBR_LOG_HPTSDIAG, 0, 5649 0, &log, false, tv); 5650 } 5651 5652 } 5653 5654 static void 5655 rack_log_wakeup(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb, uint32_t len, int type) 5656 { 5657 if (rack_verbose_logging && rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 5658 union tcp_log_stackspecific log; 5659 struct timeval tv; 5660 5661 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 5662 log.u_bbr.flex1 = sb->sb_flags; 5663 log.u_bbr.flex2 = len; 5664 log.u_bbr.flex3 = sb->sb_state; 5665 log.u_bbr.flex8 = type; 5666 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 5667 TCP_LOG_EVENTP(rack->rc_tp, NULL, 5668 &rack->rc_inp->inp_socket->so_rcv, 5669 &rack->rc_inp->inp_socket->so_snd, 5670 TCP_LOG_SB_WAKE, 0, 5671 len, &log, false, &tv); 5672 } 5673 } 5674 5675 static void 5676 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, 5677 int32_t slot, uint32_t tot_len_this_send, int sup_rack) 5678 { 5679 struct hpts_diag diag; 5680 struct inpcb *inp; 5681 struct timeval tv; 5682 uint32_t delayed_ack = 0; 5683 uint32_t hpts_timeout; 5684 uint32_t entry_slot = slot; 5685 uint8_t stopped; 5686 uint32_t left = 0; 5687 uint32_t us_cts; 5688 5689 inp = tp->t_inpcb; 5690 if ((tp->t_state == TCPS_CLOSED) || 5691 (tp->t_state == TCPS_LISTEN)) { 5692 return; 5693 } 5694 if (inp->inp_in_hpts) { 5695 /* Already on the pacer */ 5696 return; 5697 } 5698 stopped 
= rack->rc_tmr_stopped; 5699 if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { 5700 left = rack->r_ctl.rc_timer_exp - cts; 5701 } 5702 rack->r_ctl.rc_timer_exp = 0; 5703 rack->r_ctl.rc_hpts_flags = 0; 5704 us_cts = tcp_get_usecs(&tv); 5705 /* Now early/late accounting */ 5706 rack_log_pacing_delay_calc(rack, entry_slot, slot, 0, 0, 0, 26, __LINE__, NULL, 0); 5707 if (rack->r_early && (rack->rc_ack_can_sendout_data == 0)) { 5708 /* 5709 * We have a early carry over set, 5710 * we can always add more time so we 5711 * can always make this compensation. 5712 * 5713 * Note if ack's are allowed to wake us do not 5714 * penalize the next timer for being awoke 5715 * by an ack aka the rc_agg_early (non-paced mode). 5716 */ 5717 slot += rack->r_ctl.rc_agg_early; 5718 rack->r_early = 0; 5719 rack->r_ctl.rc_agg_early = 0; 5720 } 5721 if (rack->r_late) { 5722 /* 5723 * This is harder, we can 5724 * compensate some but it 5725 * really depends on what 5726 * the current pacing time is. 5727 */ 5728 if (rack->r_ctl.rc_agg_delayed >= slot) { 5729 /* 5730 * We can't compensate for it all. 5731 * And we have to have some time 5732 * on the clock. We always have a min 5733 * 10 slots (10 x 10 i.e. 100 usecs). 5734 */ 5735 if (slot <= HPTS_TICKS_PER_SLOT) { 5736 /* We gain delay */ 5737 rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_SLOT - slot); 5738 slot = HPTS_TICKS_PER_SLOT; 5739 } else { 5740 /* We take off some */ 5741 rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_SLOT); 5742 slot = HPTS_TICKS_PER_SLOT; 5743 } 5744 } else { 5745 slot -= rack->r_ctl.rc_agg_delayed; 5746 rack->r_ctl.rc_agg_delayed = 0; 5747 /* Make sure we have 100 useconds at minimum */ 5748 if (slot < HPTS_TICKS_PER_SLOT) { 5749 rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_SLOT - slot; 5750 slot = HPTS_TICKS_PER_SLOT; 5751 } 5752 if (rack->r_ctl.rc_agg_delayed == 0) 5753 rack->r_late = 0; 5754 } 5755 } 5756 if (slot) { 5757 /* We are pacing too */ 5758 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; 5759 } 5760 hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack); 5761 #ifdef NETFLIX_EXP_DETECTION 5762 if (rack->sack_attack_disable && 5763 (slot < tcp_sad_pacing_interval)) { 5764 /* 5765 * We have a potential attacker on 5766 * the line. We have possibly some 5767 * (or now) pacing time set. We want to 5768 * slow down the processing of sacks by some 5769 * amount (if it is an attacker). Set the default 5770 * slot for attackers in place (unless the orginal 5771 * interval is longer). Its stored in 5772 * micro-seconds, so lets convert to msecs. 5773 */ 5774 slot = tcp_sad_pacing_interval; 5775 } 5776 #endif 5777 if (tp->t_flags & TF_DELACK) { 5778 delayed_ack = TICKS_2_USEC(tcp_delacktime); 5779 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; 5780 } 5781 if (delayed_ack && ((hpts_timeout == 0) || 5782 (delayed_ack < hpts_timeout))) 5783 hpts_timeout = delayed_ack; 5784 else 5785 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 5786 /* 5787 * If no timers are going to run and we will fall off the hptsi 5788 * wheel, we resort to a keep-alive timer if its configured. 5789 */ 5790 if ((hpts_timeout == 0) && 5791 (slot == 0)) { 5792 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 5793 (tp->t_state <= TCPS_CLOSING)) { 5794 /* 5795 * Ok we have no timer (persists, rack, tlp, rxt or 5796 * del-ack), we don't have segments being paced. So 5797 * all that is left is the keepalive timer. 
5798 */ 5799 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 5800 /* Get the established keep-alive time */ 5801 hpts_timeout = TICKS_2_USEC(TP_KEEPIDLE(tp)); 5802 } else { 5803 /* 5804 * Get the initial setup keep-alive time, 5805 * note that this is probably not going to 5806 * happen, since rack will be running a rxt timer 5807 * if a SYN of some sort is outstanding. It is 5808 * actually handled in rack_timeout_rxt(). 5809 */ 5810 hpts_timeout = TICKS_2_USEC(TP_KEEPINIT(tp)); 5811 } 5812 rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; 5813 if (rack->in_probe_rtt) { 5814 /* 5815 * We want to instead not wake up a long time from 5816 * now but to wake up about the time we would 5817 * exit probe-rtt and initiate a keep-alive ack. 5818 * This will get us out of probe-rtt and update 5819 * our min-rtt. 5820 */ 5821 hpts_timeout = rack_min_probertt_hold; 5822 } 5823 } 5824 } 5825 if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) == 5826 (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) { 5827 /* 5828 * RACK, TLP, persists and RXT timers all are restartable 5829 * based on actions input .. i.e we received a packet (ack 5830 * or sack) and that changes things (rw, or snd_una etc). 5831 * Thus we can restart them with a new value. For 5832 * keep-alive, delayed_ack we keep track of what was left 5833 * and restart the timer with a smaller value. 5834 */ 5835 if (left < hpts_timeout) 5836 hpts_timeout = left; 5837 } 5838 if (hpts_timeout) { 5839 /* 5840 * Hack alert for now we can't time-out over 2,147,483 5841 * seconds (a bit more than 596 hours), which is probably ok 5842 * :). 5843 */ 5844 if (hpts_timeout > 0x7ffffffe) 5845 hpts_timeout = 0x7ffffffe; 5846 rack->r_ctl.rc_timer_exp = cts + hpts_timeout; 5847 } 5848 rack_log_pacing_delay_calc(rack, entry_slot, slot, hpts_timeout, 0, 0, 27, __LINE__, NULL, 0); 5849 if ((rack->gp_ready == 0) && 5850 (rack->use_fixed_rate == 0) && 5851 (hpts_timeout < slot) && 5852 (rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) { 5853 /* 5854 * We have no good estimate yet for the 5855 * old clunky burst mitigation or the 5856 * real pacing. And the tlp or rxt is smaller 5857 * than the pacing calculation. Lets not 5858 * pace that long since we know the calculation 5859 * so far is not accurate. 5860 */ 5861 slot = hpts_timeout; 5862 } 5863 rack->r_ctl.last_pacing_time = slot; 5864 /** 5865 * Turn off all the flags for queuing by default. The 5866 * flags have important meanings to what happens when 5867 * LRO interacts with the transport. Most likely (by default now) 5868 * mbuf_queueing and ack compression are on. So the transport 5869 * has a couple of flags that control what happens (if those 5870 * are not on then these flags won't have any effect since it 5871 * won't go through the queuing LRO path). 5872 * 5873 * INP_MBUF_QUEUE_READY - This flags says that I am busy 5874 * pacing output, so don't disturb. But 5875 * it also means LRO can wake me if there 5876 * is a SACK arrival. 5877 * 5878 * INP_DONT_SACK_QUEUE - This flag is used in conjunction 5879 * with the above flag (QUEUE_READY) and 5880 * when present it says don't even wake me 5881 * if a SACK arrives. 5882 * 5883 * The idea behind these flags is that if we are pacing we 5884 * set the MBUF_QUEUE_READY and only get woken up if 5885 * a SACK arrives (which could change things) or if 5886 * our pacing timer expires. If, however, we have a rack 5887 * timer running, then we don't even want a sack to wake 5888 * us since the rack timer has to expire before we can send. 
5889 * 5890 * Other cases should usually have none of the flags set 5891 * so LRO can call into us. 5892 */ 5893 inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY); 5894 if (slot) { 5895 rack->r_ctl.rc_last_output_to = us_cts + slot; 5896 /* 5897 * A pacing timer (slot) is being set, in 5898 * such a case we cannot send (we are blocked by 5899 * the timer). So lets tell LRO that it should not 5900 * wake us unless there is a SACK. Note this only 5901 * will be effective if mbuf queueing is on or 5902 * compressed acks are being processed. 5903 */ 5904 inp->inp_flags2 |= INP_MBUF_QUEUE_READY; 5905 /* 5906 * But wait if we have a Rack timer running 5907 * even a SACK should not disturb us (with 5908 * the exception of r_rr_config 3). 5909 */ 5910 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) && 5911 (rack->r_rr_config != 3)) 5912 inp->inp_flags2 |= INP_DONT_SACK_QUEUE; 5913 if (rack->rc_ack_can_sendout_data) { 5914 /* 5915 * Ahh but wait, this is that special case 5916 * where the pacing timer can be disturbed 5917 * backout the changes (used for non-paced 5918 * burst limiting). 5919 */ 5920 inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY); 5921 } 5922 if ((rack->use_rack_rr) && 5923 (rack->r_rr_config < 2) && 5924 ((hpts_timeout) && (hpts_timeout < slot))) { 5925 /* 5926 * Arrange for the hpts to kick back in after the 5927 * t-o if the t-o does not cause a send. 5928 */ 5929 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(hpts_timeout), 5930 __LINE__, &diag); 5931 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 5932 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 5933 } else { 5934 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(slot), 5935 __LINE__, &diag); 5936 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 5937 rack_log_to_start(rack, cts, hpts_timeout, slot, 1); 5938 } 5939 } else if (hpts_timeout) { 5940 /* 5941 * With respect to inp_flags2 here, lets let any new acks wake 5942 * us up here. Since we are not pacing (no pacing timer), output 5943 * can happen so we should let it. If its a Rack timer, then any inbound 5944 * packet probably won't change the sending (we will be blocked) 5945 * but it may change the prr stats so letting it in (the set defaults 5946 * at the start of this block) are good enough. 5947 */ 5948 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(hpts_timeout), 5949 __LINE__, &diag); 5950 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 5951 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 5952 } else { 5953 /* No timer starting */ 5954 #ifdef INVARIANTS 5955 if (SEQ_GT(tp->snd_max, tp->snd_una)) { 5956 panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?", 5957 tp, rack, tot_len_this_send, cts, slot, hpts_timeout); 5958 } 5959 #endif 5960 } 5961 rack->rc_tmr_stopped = 0; 5962 if (slot) 5963 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv); 5964 } 5965 5966 /* 5967 * RACK Timer, here we simply do logging and house keeping. 5968 * the normal rack_output() function will call the 5969 * appropriate thing to check if we need to do a RACK retransmit. 5970 * We return 1, saying don't proceed with rack_output only 5971 * when all timers have been stopped (destroyed PCB?). 5972 */ 5973 static int 5974 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5975 { 5976 /* 5977 * This timer simply provides an internal trigger to send out data. 
5978 * The check_recovery_mode call will see if there are needed 5979 * retransmissions, if so we will enter fast-recovery. The output 5980 * call may or may not do the same thing depending on sysctl 5981 * settings. 5982 */ 5983 struct rack_sendmap *rsm; 5984 5985 if (tp->t_timers->tt_flags & TT_STOPPED) { 5986 return (1); 5987 } 5988 counter_u64_add(rack_to_tot, 1); 5989 if (rack->r_state && (rack->r_state != tp->t_state)) 5990 rack_set_state(tp, rack); 5991 rack->rc_on_min_to = 0; 5992 rsm = rack_check_recovery_mode(tp, cts); 5993 rack_log_to_event(rack, RACK_TO_FRM_RACK, rsm); 5994 if (rsm) { 5995 rack->r_ctl.rc_resend = rsm; 5996 rack->r_timer_override = 1; 5997 if (rack->use_rack_rr) { 5998 /* 5999 * Don't accumulate extra pacing delay 6000 * we are allowing the rack timer to 6001 * over-ride pacing i.e. rrr takes precedence 6002 * if the pacing interval is longer than the rrr 6003 * time (in other words we get the min pacing 6004 * time versus rrr pacing time). 6005 */ 6006 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 6007 } 6008 } 6009 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; 6010 if (rsm == NULL) { 6011 /* restart a timer and return 1 */ 6012 rack_start_hpts_timer(rack, tp, cts, 6013 0, 0, 0); 6014 return (1); 6015 } 6016 return (0); 6017 } 6018 6019 static void 6020 rack_adjust_orig_mlen(struct rack_sendmap *rsm) 6021 { 6022 if (rsm->m->m_len > rsm->orig_m_len) { 6023 /* 6024 * Mbuf grew, caused by sbcompress, our offset does 6025 * not change. 6026 */ 6027 rsm->orig_m_len = rsm->m->m_len; 6028 } else if (rsm->m->m_len < rsm->orig_m_len) { 6029 /* 6030 * Mbuf shrank, trimmed off the top by an ack, our 6031 * offset changes. 6032 */ 6033 rsm->soff -= (rsm->orig_m_len - rsm->m->m_len); 6034 rsm->orig_m_len = rsm->m->m_len; 6035 } 6036 } 6037 6038 static void 6039 rack_setup_offset_for_rsm(struct rack_sendmap *src_rsm, struct rack_sendmap *rsm) 6040 { 6041 struct mbuf *m; 6042 uint32_t soff; 6043 6044 if (src_rsm->m && (src_rsm->orig_m_len != src_rsm->m->m_len)) { 6045 /* Fix up the orig_m_len and possibly the mbuf offset */ 6046 rack_adjust_orig_mlen(src_rsm); 6047 } 6048 m = src_rsm->m; 6049 soff = src_rsm->soff + (src_rsm->r_end - src_rsm->r_start); 6050 while (soff >= m->m_len) { 6051 /* Move out past this mbuf */ 6052 soff -= m->m_len; 6053 m = m->m_next; 6054 KASSERT((m != NULL), 6055 ("rsm:%p nrsm:%p hit at soff:%u null m", 6056 src_rsm, rsm, soff)); 6057 } 6058 rsm->m = m; 6059 rsm->soff = soff; 6060 rsm->orig_m_len = m->m_len; 6061 } 6062 6063 static __inline void 6064 rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm, 6065 struct rack_sendmap *rsm, uint32_t start) 6066 { 6067 int idx; 6068 6069 nrsm->r_start = start; 6070 nrsm->r_end = rsm->r_end; 6071 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 6072 nrsm->r_flags = rsm->r_flags; 6073 nrsm->r_dupack = rsm->r_dupack; 6074 nrsm->r_no_rtt_allowed = rsm->r_no_rtt_allowed; 6075 nrsm->r_rtr_bytes = 0; 6076 rsm->r_end = nrsm->r_start; 6077 nrsm->r_just_ret = rsm->r_just_ret; 6078 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 6079 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 6080 } 6081 /* Now if we have SYN flag we keep it on the left edge */ 6082 if (nrsm->r_flags & RACK_HAS_SYN) 6083 nrsm->r_flags &= ~RACK_HAS_SYN; 6084 /* Now if we have a FIN flag we keep it on the right edge */ 6085 if (rsm->r_flags & RACK_HAS_FIN) 6086 rsm->r_flags &= ~RACK_HAS_FIN; 6087 /* Push bit must go to the right edge as well */ 6088 if (rsm->r_flags & RACK_HAD_PUSH) 6089 rsm->r_flags &= ~RACK_HAD_PUSH; 6090 /* Clone over the state 
of the hw_tls flag */ 6091 nrsm->r_hw_tls = rsm->r_hw_tls; 6092 /* 6093 * Now we need to find nrsm's new location in the mbuf chain 6094 * we basically calculate a new offset, which is soff + 6095 * how much is left in original rsm. Then we walk out the mbuf 6096 * chain to find the righ postion, it may be the same mbuf 6097 * or maybe not. 6098 */ 6099 KASSERT(((rsm->m != NULL) || 6100 (rsm->r_flags & (RACK_HAS_SYN|RACK_HAS_FIN))), 6101 ("rsm:%p nrsm:%p rack:%p -- rsm->m is NULL?", rsm, nrsm, rack)); 6102 if (rsm->m) 6103 rack_setup_offset_for_rsm(rsm, nrsm); 6104 } 6105 6106 static struct rack_sendmap * 6107 rack_merge_rsm(struct tcp_rack *rack, 6108 struct rack_sendmap *l_rsm, 6109 struct rack_sendmap *r_rsm) 6110 { 6111 /* 6112 * We are merging two ack'd RSM's, 6113 * the l_rsm is on the left (lower seq 6114 * values) and the r_rsm is on the right 6115 * (higher seq value). The simplest way 6116 * to merge these is to move the right 6117 * one into the left. I don't think there 6118 * is any reason we need to try to find 6119 * the oldest (or last oldest retransmitted). 6120 */ 6121 struct rack_sendmap *rm; 6122 6123 rack_log_map_chg(rack->rc_tp, rack, NULL, 6124 l_rsm, r_rsm, MAP_MERGE, r_rsm->r_end, __LINE__); 6125 l_rsm->r_end = r_rsm->r_end; 6126 if (l_rsm->r_dupack < r_rsm->r_dupack) 6127 l_rsm->r_dupack = r_rsm->r_dupack; 6128 if (r_rsm->r_rtr_bytes) 6129 l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes; 6130 if (r_rsm->r_in_tmap) { 6131 /* This really should not happen */ 6132 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext); 6133 r_rsm->r_in_tmap = 0; 6134 } 6135 6136 /* Now the flags */ 6137 if (r_rsm->r_flags & RACK_HAS_FIN) 6138 l_rsm->r_flags |= RACK_HAS_FIN; 6139 if (r_rsm->r_flags & RACK_TLP) 6140 l_rsm->r_flags |= RACK_TLP; 6141 if (r_rsm->r_flags & RACK_RWND_COLLAPSED) 6142 l_rsm->r_flags |= RACK_RWND_COLLAPSED; 6143 if ((r_rsm->r_flags & RACK_APP_LIMITED) && 6144 ((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) { 6145 /* 6146 * If both are app-limited then let the 6147 * free lower the count. If right is app 6148 * limited and left is not, transfer. 6149 */ 6150 l_rsm->r_flags |= RACK_APP_LIMITED; 6151 r_rsm->r_flags &= ~RACK_APP_LIMITED; 6152 if (r_rsm == rack->r_ctl.rc_first_appl) 6153 rack->r_ctl.rc_first_appl = l_rsm; 6154 } 6155 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm); 6156 #ifdef INVARIANTS 6157 if (rm != r_rsm) { 6158 panic("removing head in rack:%p rsm:%p rm:%p", 6159 rack, r_rsm, rm); 6160 } 6161 #endif 6162 if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) { 6163 /* Transfer the split limit to the map we free */ 6164 r_rsm->r_limit_type = l_rsm->r_limit_type; 6165 l_rsm->r_limit_type = 0; 6166 } 6167 rack_free(rack, r_rsm); 6168 return (l_rsm); 6169 } 6170 6171 /* 6172 * TLP Timer, here we simply setup what segment we want to 6173 * have the TLP expire on, the normal rack_output() will then 6174 * send it out. 6175 * 6176 * We return 1, saying don't proceed with rack_output only 6177 * when all timers have been stopped (destroyed PCB?). 6178 */ 6179 static int 6180 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t *doing_tlp) 6181 { 6182 /* 6183 * Tail Loss Probe. 
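 *
 * In rough terms: if at least a full segment of new data is
 * available and it fits in the receive window, the TLP sends new
 * data; otherwise we pick the highest un-acked segment (or the
 * oldest, if rack_always_send_oldest is set), split off a trailing
 * MSS if it is larger than that, and hand it to rack_output().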
6184 */ 6185 struct rack_sendmap *rsm = NULL; 6186 struct rack_sendmap *insret; 6187 struct socket *so; 6188 uint32_t amm; 6189 uint32_t out, avail; 6190 int collapsed_win = 0; 6191 6192 if (tp->t_timers->tt_flags & TT_STOPPED) { 6193 return (1); 6194 } 6195 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 6196 /* Its not time yet */ 6197 return (0); 6198 } 6199 if (ctf_progress_timeout_check(tp, true)) { 6200 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 6201 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 6202 return (1); 6203 } 6204 /* 6205 * A TLP timer has expired. We have been idle for 2 rtts. So we now 6206 * need to figure out how to force a full MSS segment out. 6207 */ 6208 rack_log_to_event(rack, RACK_TO_FRM_TLP, NULL); 6209 rack->r_ctl.retran_during_recovery = 0; 6210 rack->r_ctl.dsack_byte_cnt = 0; 6211 counter_u64_add(rack_tlp_tot, 1); 6212 if (rack->r_state && (rack->r_state != tp->t_state)) 6213 rack_set_state(tp, rack); 6214 so = tp->t_inpcb->inp_socket; 6215 avail = sbavail(&so->so_snd); 6216 out = tp->snd_max - tp->snd_una; 6217 if (out > tp->snd_wnd) { 6218 /* special case, we need a retransmission */ 6219 collapsed_win = 1; 6220 goto need_retran; 6221 } 6222 if ((tp->t_flags & TF_GPUTINPROG) && 6223 (rack->r_ctl.rc_tlp_cnt_out == 1)) { 6224 /* 6225 * If this is the second in a row 6226 * TLP and we are doing a measurement 6227 * its time to abandon the measurement. 6228 * Something is likely broken on 6229 * the clients network and measuring a 6230 * broken network does us no good. 6231 */ 6232 tp->t_flags &= ~TF_GPUTINPROG; 6233 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 6234 rack->r_ctl.rc_gp_srtt /*flex1*/, 6235 tp->gput_seq, 6236 0, 0, 18, __LINE__, NULL, 0); 6237 } 6238 /* 6239 * Check our send oldest always settings, and if 6240 * there is an oldest to send jump to the need_retran. 6241 */ 6242 if (rack_always_send_oldest && (TAILQ_EMPTY(&rack->r_ctl.rc_tmap) == 0)) 6243 goto need_retran; 6244 6245 if (avail > out) { 6246 /* New data is available */ 6247 amm = avail - out; 6248 if (amm > ctf_fixed_maxseg(tp)) { 6249 amm = ctf_fixed_maxseg(tp); 6250 if ((amm + out) > tp->snd_wnd) { 6251 /* We are rwnd limited */ 6252 goto need_retran; 6253 } 6254 } else if (amm < ctf_fixed_maxseg(tp)) { 6255 /* not enough to fill a MTU */ 6256 goto need_retran; 6257 } 6258 if (IN_FASTRECOVERY(tp->t_flags)) { 6259 /* Unlikely */ 6260 if (rack->rack_no_prr == 0) { 6261 if (out + amm <= tp->snd_wnd) { 6262 rack->r_ctl.rc_prr_sndcnt = amm; 6263 rack->r_ctl.rc_tlp_new_data = amm; 6264 rack_log_to_prr(rack, 4, 0); 6265 } 6266 } else 6267 goto need_retran; 6268 } else { 6269 /* Set the send-new override */ 6270 if (out + amm <= tp->snd_wnd) 6271 rack->r_ctl.rc_tlp_new_data = amm; 6272 else 6273 goto need_retran; 6274 } 6275 rack->r_ctl.rc_tlpsend = NULL; 6276 counter_u64_add(rack_tlp_newdata, 1); 6277 goto send; 6278 } 6279 need_retran: 6280 /* 6281 * Ok we need to arrange the last un-acked segment to be re-sent, or 6282 * optionally the first un-acked segment. 
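 * (The collapsed-window case instead hunts backward for the last
 * segment the peer's shrunken window still covered.)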
6283 */ 6284 if (collapsed_win == 0) { 6285 if (rack_always_send_oldest) 6286 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 6287 else { 6288 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 6289 if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { 6290 rsm = rack_find_high_nonack(rack, rsm); 6291 } 6292 } 6293 if (rsm == NULL) { 6294 counter_u64_add(rack_tlp_does_nada, 1); 6295 #ifdef TCP_BLACKBOX 6296 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 6297 #endif 6298 goto out; 6299 } 6300 } else { 6301 /* 6302 * We must find the last segment 6303 * that was acceptable by the client. 6304 */ 6305 RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 6306 if ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0) { 6307 /* Found one */ 6308 break; 6309 } 6310 } 6311 if (rsm == NULL) { 6312 /* None? if so send the first */ 6313 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 6314 if (rsm == NULL) { 6315 counter_u64_add(rack_tlp_does_nada, 1); 6316 #ifdef TCP_BLACKBOX 6317 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 6318 #endif 6319 goto out; 6320 } 6321 } 6322 } 6323 if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) { 6324 /* 6325 * We need to split this the last segment in two. 6326 */ 6327 struct rack_sendmap *nrsm; 6328 6329 nrsm = rack_alloc_full_limit(rack); 6330 if (nrsm == NULL) { 6331 /* 6332 * No memory to split, we will just exit and punt 6333 * off to the RXT timer. 6334 */ 6335 counter_u64_add(rack_tlp_does_nada, 1); 6336 goto out; 6337 } 6338 rack_clone_rsm(rack, nrsm, rsm, 6339 (rsm->r_end - ctf_fixed_maxseg(tp))); 6340 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); 6341 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 6342 #ifdef INVARIANTS 6343 if (insret != NULL) { 6344 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 6345 nrsm, insret, rack, rsm); 6346 } 6347 #endif 6348 if (rsm->r_in_tmap) { 6349 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 6350 nrsm->r_in_tmap = 1; 6351 } 6352 rsm->r_flags &= (~RACK_HAS_FIN); 6353 rsm = nrsm; 6354 } 6355 rack->r_ctl.rc_tlpsend = rsm; 6356 send: 6357 /* Make sure output path knows we are doing a TLP */ 6358 *doing_tlp = 1; 6359 rack->r_timer_override = 1; 6360 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 6361 return (0); 6362 out: 6363 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 6364 return (0); 6365 } 6366 6367 /* 6368 * Delayed ack Timer, here we simply need to setup the 6369 * ACK_NOW flag and remove the DELACK flag. From there 6370 * the output routine will send the ack out. 6371 * 6372 * We only return 1, saying don't proceed, if all timers 6373 * are stopped (destroyed PCB?). 6374 */ 6375 static int 6376 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 6377 { 6378 if (tp->t_timers->tt_flags & TT_STOPPED) { 6379 return (1); 6380 } 6381 rack_log_to_event(rack, RACK_TO_FRM_DELACK, NULL); 6382 tp->t_flags &= ~TF_DELACK; 6383 tp->t_flags |= TF_ACKNOW; 6384 KMOD_TCPSTAT_INC(tcps_delack); 6385 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 6386 return (0); 6387 } 6388 6389 /* 6390 * Persists timer, here we simply send the 6391 * same thing as a keepalive will. 6392 * the one byte send. 6393 * 6394 * We only return 1, saying don't proceed, if all timers 6395 * are stopped (destroyed PCB?). 
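* Below, the connection is dropped either when the retransmit shift
* has hit TCP_MAXRXTSHIFT and we have been idle past the maximum
* persist backoff, or when the user has already closed the socket
* and we have persisted past TCPTV_PERSMAX. Otherwise a
* keepalive-style probe (sequence snd_una - 1) is generated via
* tcp_respond() to coax the peer into answering.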
6396 */ 6397 static int 6398 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 6399 { 6400 struct tcptemp *t_template; 6401 struct inpcb *inp; 6402 int32_t retval = 1; 6403 6404 inp = tp->t_inpcb; 6405 6406 if (tp->t_timers->tt_flags & TT_STOPPED) { 6407 return (1); 6408 } 6409 if (rack->rc_in_persist == 0) 6410 return (0); 6411 if (ctf_progress_timeout_check(tp, false)) { 6412 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 6413 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 6414 tcp_set_inp_to_drop(inp, ETIMEDOUT); 6415 return (1); 6416 } 6417 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 6418 /* 6419 * Persistence timer into zero window. Force a byte to be output, if 6420 * possible. 6421 */ 6422 KMOD_TCPSTAT_INC(tcps_persisttimeo); 6423 /* 6424 * Hack: if the peer is dead/unreachable, we do not time out if the 6425 * window is closed. After a full backoff, drop the connection if 6426 * the idle time (no responses to probes) reaches the maximum 6427 * backoff that we would use if retransmitting. 6428 */ 6429 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 6430 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 6431 TICKS_2_USEC(ticks - tp->t_rcvtime) >= RACK_REXMTVAL(tp) * tcp_totbackoff)) { 6432 KMOD_TCPSTAT_INC(tcps_persistdrop); 6433 retval = 1; 6434 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 6435 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 6436 goto out; 6437 } 6438 if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) && 6439 tp->snd_una == tp->snd_max) 6440 rack_exit_persist(tp, rack, cts); 6441 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; 6442 /* 6443 * If the user has closed the socket then drop a persisting 6444 * connection after a much reduced timeout. 6445 */ 6446 if (tp->t_state > TCPS_CLOSE_WAIT && 6447 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 6448 retval = 1; 6449 KMOD_TCPSTAT_INC(tcps_persistdrop); 6450 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 6451 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 6452 goto out; 6453 } 6454 t_template = tcpip_maketemplate(rack->rc_inp); 6455 if (t_template) { 6456 /* only set it if we were answered */ 6457 if (rack->forced_ack == 0) { 6458 rack->forced_ack = 1; 6459 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); 6460 } 6461 tcp_respond(tp, t_template->tt_ipgen, 6462 &t_template->tt_t, (struct mbuf *)NULL, 6463 tp->rcv_nxt, tp->snd_una - 1, 0); 6464 /* This sends an ack */ 6465 if (tp->t_flags & TF_DELACK) 6466 tp->t_flags &= ~TF_DELACK; 6467 free(t_template, M_TEMP); 6468 } 6469 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 6470 tp->t_rxtshift++; 6471 out: 6472 rack_log_to_event(rack, RACK_TO_FRM_PERSIST, NULL); 6473 rack_start_hpts_timer(rack, tp, cts, 6474 0, 0, 0); 6475 return (retval); 6476 } 6477 6478 /* 6479 * If a keepalive goes off, we had no other timers 6480 * happening. We always return 1 here since this 6481 * routine either drops the connection or sends 6482 * out a segment with respond. 6483 */ 6484 static int 6485 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 6486 { 6487 struct tcptemp *t_template; 6488 struct inpcb *inp; 6489 6490 if (tp->t_timers->tt_flags & TT_STOPPED) { 6491 return (1); 6492 } 6493 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; 6494 inp = tp->t_inpcb; 6495 rack_log_to_event(rack, RACK_TO_FRM_KEEP, NULL); 6496 /* 6497 * Keep-alive timer went off; send something or drop connection if 6498 * idle for too long. 
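* With the stock sysctl defaults that works out to roughly two hours
* of idle time plus another ten minutes of unanswered probes (8
* probes spaced 75 seconds apart) before the connection is dropped.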
6499 */ 6500 KMOD_TCPSTAT_INC(tcps_keeptimeo); 6501 if (tp->t_state < TCPS_ESTABLISHED) 6502 goto dropit; 6503 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 6504 tp->t_state <= TCPS_CLOSING) { 6505 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 6506 goto dropit; 6507 /* 6508 * Send a packet designed to force a response if the peer is 6509 * up and reachable: either an ACK if the connection is 6510 * still alive, or an RST if the peer has closed the 6511 * connection due to timeout or reboot. Using sequence 6512 * number tp->snd_una-1 causes the transmitted zero-length 6513 * segment to lie outside the receive window; by the 6514 * protocol spec, this requires the correspondent TCP to 6515 * respond. 6516 */ 6517 KMOD_TCPSTAT_INC(tcps_keepprobe); 6518 t_template = tcpip_maketemplate(inp); 6519 if (t_template) { 6520 if (rack->forced_ack == 0) { 6521 rack->forced_ack = 1; 6522 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); 6523 } 6524 tcp_respond(tp, t_template->tt_ipgen, 6525 &t_template->tt_t, (struct mbuf *)NULL, 6526 tp->rcv_nxt, tp->snd_una - 1, 0); 6527 free(t_template, M_TEMP); 6528 } 6529 } 6530 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 6531 return (1); 6532 dropit: 6533 KMOD_TCPSTAT_INC(tcps_keepdrops); 6534 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); 6535 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 6536 return (1); 6537 } 6538 6539 /* 6540 * Retransmit helper function, clear up all the ack 6541 * flags and take care of important book keeping. 6542 */ 6543 static void 6544 rack_remxt_tmr(struct tcpcb *tp) 6545 { 6546 /* 6547 * The retransmit timer went off, all sack'd blocks must be 6548 * un-acked. 6549 */ 6550 struct rack_sendmap *rsm, *trsm = NULL; 6551 struct tcp_rack *rack; 6552 6553 rack = (struct tcp_rack *)tp->t_fb_ptr; 6554 rack_timer_cancel(tp, rack, tcp_get_usecs(NULL), __LINE__); 6555 rack_log_to_event(rack, RACK_TO_FRM_TMR, NULL); 6556 if (rack->r_state && (rack->r_state != tp->t_state)) 6557 rack_set_state(tp, rack); 6558 /* 6559 * Ideally we would like to be able to 6560 * mark SACK-PASS on anything not acked here. 6561 * 6562 * However, if we do that we would burst out 6563 * all that data 1ms apart. This would be unwise, 6564 * so for now we will just let the normal rxt timer 6565 * and tlp timer take care of it. 6566 * 6567 * Also we really need to stick them back in sequence 6568 * order. This way we send in the proper order and any 6569 * sacks that come floating in will "re-ack" the data. 6570 * To do this we zap the tmap with an INIT and then 6571 * walk through and place every rsm in the RB tree 6572 * back in its seq ordered place. 
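* For example, with three rsm's [1,1001) [1001,2001) [2001,3001)
* where the middle one had been SACK'd: after the walk below all
* three sit in the tmap in sequence order, the middle one loses
* RACK_ACKED (gaining RACK_WAS_ACKED) and rc_sacked is reset, so
* everything is again treated as un-acked data.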
6573 */ 6574 TAILQ_INIT(&rack->r_ctl.rc_tmap); 6575 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 6576 rsm->r_dupack = 0; 6577 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 6578 /* We must re-add it back to the tlist */ 6579 if (trsm == NULL) { 6580 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 6581 } else { 6582 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); 6583 } 6584 rsm->r_in_tmap = 1; 6585 trsm = rsm; 6586 if (rsm->r_flags & RACK_ACKED) 6587 rsm->r_flags |= RACK_WAS_ACKED; 6588 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS); 6589 } 6590 /* Clear the count (we just un-acked them) */ 6591 rack->r_ctl.rc_last_timeout_snduna = tp->snd_una; 6592 rack->r_ctl.rc_sacked = 0; 6593 rack->r_ctl.rc_sacklast = NULL; 6594 rack->r_ctl.rc_agg_delayed = 0; 6595 rack->r_early = 0; 6596 rack->r_ctl.rc_agg_early = 0; 6597 rack->r_late = 0; 6598 /* Clear the tlp rtx mark */ 6599 rack->r_ctl.rc_resend = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 6600 if (rack->r_ctl.rc_resend != NULL) 6601 rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT; 6602 rack->r_ctl.rc_prr_sndcnt = 0; 6603 rack_log_to_prr(rack, 6, 0); 6604 rack->r_timer_override = 1; 6605 if ((((tp->t_flags & TF_SACK_PERMIT) == 0) 6606 #ifdef NETFLIX_EXP_DETECTION 6607 || (rack->sack_attack_disable != 0) 6608 #endif 6609 ) && ((tp->t_flags & TF_SENTFIN) == 0)) { 6610 /* 6611 * For non-sack customers new data 6612 * needs to go out as retransmits until 6613 * we retransmit up to snd_max. 6614 */ 6615 rack->r_must_retran = 1; 6616 rack->r_ctl.rc_out_at_rto = ctf_flight_size(rack->rc_tp, 6617 rack->r_ctl.rc_sacked); 6618 } 6619 rack->r_ctl.rc_snd_max_at_rto = tp->snd_max; 6620 } 6621 6622 static void 6623 rack_convert_rtts(struct tcpcb *tp) 6624 { 6625 if (tp->t_srtt > 1) { 6626 uint32_t val, frac; 6627 6628 val = tp->t_srtt >> TCP_RTT_SHIFT; 6629 frac = tp->t_srtt & 0x1f; 6630 tp->t_srtt = TICKS_2_USEC(val); 6631 /* 6632 * frac is the fractional part of the srtt (if any) 6633 * but its in ticks and every bit represents 6634 * 1/32nd of a hz. 6635 */ 6636 if (frac) { 6637 if (hz == 1000) { 6638 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE); 6639 } else { 6640 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE)); 6641 } 6642 tp->t_srtt += frac; 6643 } 6644 } 6645 if (tp->t_rttvar) { 6646 uint32_t val, frac; 6647 6648 val = tp->t_rttvar >> TCP_RTTVAR_SHIFT; 6649 frac = tp->t_rttvar & 0x1f; 6650 tp->t_rttvar = TICKS_2_USEC(val); 6651 /* 6652 * frac is the fractional part of the srtt (if any) 6653 * but its in ticks and every bit represents 6654 * 1/32nd of a hz. 6655 */ 6656 if (frac) { 6657 if (hz == 1000) { 6658 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE); 6659 } else { 6660 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE)); 6661 } 6662 tp->t_rttvar += frac; 6663 } 6664 } 6665 tp->t_rxtcur = RACK_REXMTVAL(tp); 6666 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 6667 tp->t_rxtcur += TICKS_2_USEC(tcp_rexmit_slop); 6668 } 6669 if (tp->t_rxtcur > rack_rto_max) { 6670 tp->t_rxtcur = rack_rto_max; 6671 } 6672 } 6673 6674 static void 6675 rack_cc_conn_init(struct tcpcb *tp) 6676 { 6677 struct tcp_rack *rack; 6678 uint32_t srtt; 6679 6680 rack = (struct tcp_rack *)tp->t_fb_ptr; 6681 srtt = tp->t_srtt; 6682 cc_conn_init(tp); 6683 /* 6684 * Now convert to rack's internal format, 6685 * if required. 
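* For illustration, assuming the stock hz of 1000, TCP_RTT_SHIFT of
* 5 and TCP_RTT_SCALE of 32: a ticks-format t_srtt of 330 converts
* to (330 >> 5) = 10 ticks = 10000 usecs, plus the fractional part
* (330 & 0x1f) = 10, i.e. 10 * 1000 / 32 ~= 312 usecs, giving a
* rack-format srtt of roughly 10312 usecs.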
6686 */ 6687 if ((srtt == 0) && (tp->t_srtt != 0)) 6688 rack_convert_rtts(tp); 6689 /* 6690 * We want a chance to stay in slowstart as 6691 * we create a connection. TCP spec says that 6692 * initially ssthresh is infinite. For our 6693 * purposes that is the snd_wnd. 6694 */ 6695 if (tp->snd_ssthresh < tp->snd_wnd) { 6696 tp->snd_ssthresh = tp->snd_wnd; 6697 } 6698 /* 6699 * We also want to assure a IW worth of 6700 * data can get inflight. 6701 */ 6702 if (rc_init_window(rack) < tp->snd_cwnd) 6703 tp->snd_cwnd = rc_init_window(rack); 6704 } 6705 6706 /* 6707 * Re-transmit timeout! If we drop the PCB we will return 1, otherwise 6708 * we will setup to retransmit the lowest seq number outstanding. 6709 */ 6710 static int 6711 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 6712 { 6713 int32_t rexmt; 6714 struct inpcb *inp; 6715 int32_t retval = 0; 6716 bool isipv6; 6717 6718 inp = tp->t_inpcb; 6719 if (tp->t_timers->tt_flags & TT_STOPPED) { 6720 return (1); 6721 } 6722 if ((tp->t_flags & TF_GPUTINPROG) && 6723 (tp->t_rxtshift)) { 6724 /* 6725 * We have had a second timeout 6726 * measurements on successive rxt's are not profitable. 6727 * It is unlikely to be of any use (the network is 6728 * broken or the client went away). 6729 */ 6730 tp->t_flags &= ~TF_GPUTINPROG; 6731 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 6732 rack->r_ctl.rc_gp_srtt /*flex1*/, 6733 tp->gput_seq, 6734 0, 0, 18, __LINE__, NULL, 0); 6735 } 6736 if (ctf_progress_timeout_check(tp, false)) { 6737 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 6738 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 6739 tcp_set_inp_to_drop(inp, ETIMEDOUT); 6740 return (1); 6741 } 6742 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; 6743 rack->r_ctl.retran_during_recovery = 0; 6744 rack->r_ctl.dsack_byte_cnt = 0; 6745 if (IN_FASTRECOVERY(tp->t_flags)) 6746 tp->t_flags |= TF_WASFRECOVERY; 6747 else 6748 tp->t_flags &= ~TF_WASFRECOVERY; 6749 if (IN_CONGRECOVERY(tp->t_flags)) 6750 tp->t_flags |= TF_WASCRECOVERY; 6751 else 6752 tp->t_flags &= ~TF_WASCRECOVERY; 6753 if (TCPS_HAVEESTABLISHED(tp->t_state) && 6754 (tp->snd_una == tp->snd_max)) { 6755 /* Nothing outstanding .. nothing to do */ 6756 return (0); 6757 } 6758 /* 6759 * Rack can only run one timer at a time, so we cannot 6760 * run a KEEPINIT (gating SYN sending) and a retransmit 6761 * timer for the SYN. So if we are in a front state and 6762 * have a KEEPINIT timer we need to check the first transmit 6763 * against now to see if we have exceeded the KEEPINIT time 6764 * (if one is set). 6765 */ 6766 if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) && 6767 (TP_KEEPINIT(tp) != 0)) { 6768 struct rack_sendmap *rsm; 6769 6770 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 6771 if (rsm) { 6772 /* Ok we have something outstanding to test keepinit with */ 6773 if ((TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) && 6774 ((cts - (uint32_t)rsm->r_tim_lastsent[0]) >= TICKS_2_USEC(TP_KEEPINIT(tp)))) { 6775 /* We have exceeded the KEEPINIT time */ 6776 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); 6777 goto drop_it; 6778 } 6779 } 6780 } 6781 /* 6782 * Retransmission timer went off. Message has not been acked within 6783 * retransmit interval. Back off to a longer retransmit interval 6784 * and retransmit one segment. 
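* The backoff below is roughly
*   rexmt = max(rack_rto_min, srtt + 4 * rttvar) * tcp_backoff[t_rxtshift]
* so, for illustration, a connection with srtt 40 ms and rttvar
* 10 ms has an 80 ms base; its third consecutive timeout
* (t_rxtshift == 3) waits about 8 * 80 ms = 640 ms, clamped by
* RACK_TCPT_RANGESET() between rack_rto_min and rack_rto_max.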
6785 */ 6786 rack_remxt_tmr(tp); 6787 if ((rack->r_ctl.rc_resend == NULL) || 6788 ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) { 6789 /* 6790 * If the rwnd collapsed on 6791 * the one we are retransmitting 6792 * it does not count against the 6793 * rxt count. 6794 */ 6795 tp->t_rxtshift++; 6796 } 6797 if (tp->t_rxtshift > TCP_MAXRXTSHIFT) { 6798 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 6799 drop_it: 6800 tp->t_rxtshift = TCP_MAXRXTSHIFT; 6801 KMOD_TCPSTAT_INC(tcps_timeoutdrop); 6802 retval = 1; 6803 tcp_set_inp_to_drop(rack->rc_inp, 6804 (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT)); 6805 goto out; 6806 } 6807 if (tp->t_state == TCPS_SYN_SENT) { 6808 /* 6809 * If the SYN was retransmitted, indicate CWND to be limited 6810 * to 1 segment in cc_conn_init(). 6811 */ 6812 tp->snd_cwnd = 1; 6813 } else if (tp->t_rxtshift == 1) { 6814 /* 6815 * first retransmit; record ssthresh and cwnd so they can be 6816 * recovered if this turns out to be a "bad" retransmit. A 6817 * retransmit is considered "bad" if an ACK for this segment 6818 * is received within RTT/2 interval; the assumption here is 6819 * that the ACK was already in flight. See "On Estimating 6820 * End-to-End Network Path Properties" by Allman and Paxson 6821 * for more details. 6822 */ 6823 tp->snd_cwnd_prev = tp->snd_cwnd; 6824 tp->snd_ssthresh_prev = tp->snd_ssthresh; 6825 tp->snd_recover_prev = tp->snd_recover; 6826 tp->t_badrxtwin = ticks + (USEC_2_TICKS(tp->t_srtt)/2); 6827 tp->t_flags |= TF_PREVVALID; 6828 } else if ((tp->t_flags & TF_RCVD_TSTMP) == 0) 6829 tp->t_flags &= ~TF_PREVVALID; 6830 KMOD_TCPSTAT_INC(tcps_rexmttimeo); 6831 if ((tp->t_state == TCPS_SYN_SENT) || 6832 (tp->t_state == TCPS_SYN_RECEIVED)) 6833 rexmt = RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]; 6834 else 6835 rexmt = max(rack_rto_min, (tp->t_srtt + (tp->t_rttvar << 2))) * tcp_backoff[tp->t_rxtshift]; 6836 6837 RACK_TCPT_RANGESET(tp->t_rxtcur, rexmt, 6838 max(rack_rto_min, rexmt), rack_rto_max, rack->r_ctl.timer_slop); 6839 /* 6840 * We enter the path for PLMTUD if connection is established or, if 6841 * connection is FIN_WAIT_1 status, reason for the last is that if 6842 * amount of data we send is very small, we could send it in couple 6843 * of packets and process straight to FIN. In that case we won't 6844 * catch ESTABLISHED state. 6845 */ 6846 #ifdef INET6 6847 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? true : false; 6848 #else 6849 isipv6 = false; 6850 #endif 6851 if (((V_tcp_pmtud_blackhole_detect == 1) || 6852 (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) || 6853 (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) && 6854 ((tp->t_state == TCPS_ESTABLISHED) || 6855 (tp->t_state == TCPS_FIN_WAIT_1))) { 6856 /* 6857 * Idea here is that at each stage of mtu probe (usually, 6858 * 1448 -> 1188 -> 524) should be given 2 chances to recover 6859 * before further clamping down. 'tp->t_rxtshift % 2 == 0' 6860 * should take care of that. 6861 */ 6862 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == 6863 (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && 6864 (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && 6865 tp->t_rxtshift % 2 == 0)) { 6866 /* 6867 * Enter Path MTU Black-hole Detection mechanism: - 6868 * Disable Path MTU Discovery (IP "DF" bit). - 6869 * Reduce MTU to lower value than what we negotiated 6870 * with peer. 6871 */ 6872 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { 6873 /* Record that we may have found a black hole. 
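* (The 1448 -> 1188 -> 524 figures quoted above are the usual
* payload sizes once the 12 byte timestamp option is taken off a
* 1460 byte MSS, the default V_tcp_pmtud_blackhole_mss of 1200 and
* the default V_tcp_mssdflt of 536, respectively.)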
*/ 6874 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 6875 /* Keep track of previous MSS. */ 6876 tp->t_pmtud_saved_maxseg = tp->t_maxseg; 6877 } 6878 6879 /* 6880 * Reduce the MSS to blackhole value or to the 6881 * default in an attempt to retransmit. 6882 */ 6883 #ifdef INET6 6884 if (isipv6 && 6885 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { 6886 /* Use the sysctl tuneable blackhole MSS. */ 6887 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; 6888 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 6889 } else if (isipv6) { 6890 /* Use the default MSS. */ 6891 tp->t_maxseg = V_tcp_v6mssdflt; 6892 /* 6893 * Disable Path MTU Discovery when we switch 6894 * to minmss. 6895 */ 6896 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 6897 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 6898 } 6899 #endif 6900 #if defined(INET6) && defined(INET) 6901 else 6902 #endif 6903 #ifdef INET 6904 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { 6905 /* Use the sysctl tuneable blackhole MSS. */ 6906 tp->t_maxseg = V_tcp_pmtud_blackhole_mss; 6907 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 6908 } else { 6909 /* Use the default MSS. */ 6910 tp->t_maxseg = V_tcp_mssdflt; 6911 /* 6912 * Disable Path MTU Discovery when we switch 6913 * to minmss. 6914 */ 6915 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 6916 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 6917 } 6918 #endif 6919 } else { 6920 /* 6921 * If further retransmissions are still unsuccessful 6922 * with a lowered MTU, maybe this isn't a blackhole 6923 * and we restore the previous MSS and blackhole 6924 * detection flags. The limit '6' is determined by 6925 * giving each probe stage (1448, 1188, 524) 2 6926 * chances to recover. 6927 */ 6928 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 6929 (tp->t_rxtshift >= 6)) { 6930 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 6931 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 6932 tp->t_maxseg = tp->t_pmtud_saved_maxseg; 6933 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed); 6934 } 6935 } 6936 } 6937 /* 6938 * Disable RFC1323 and SACK if we haven't got any response to 6939 * our third SYN to work-around some broken terminal servers 6940 * (most of which have hopefully been retired) that have bad VJ 6941 * header compression code which trashes TCP segments containing 6942 * unknown-to-them TCP options. 6943 */ 6944 if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && 6945 (tp->t_rxtshift == 3)) 6946 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); 6947 /* 6948 * If we backed off this far, our srtt estimate is probably bogus. 6949 * Clobber it so we'll take the next rtt measurement as our srtt; 6950 * move the current srtt into rttvar to keep the current retransmit 6951 * times until then. 
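* (TCP_MAXRXTSHIFT is 12, so this kicks in once we have backed off
* more than three times in a row.)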
6952 */ 6953 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 6954 #ifdef INET6 6955 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 6956 in6_losing(tp->t_inpcb); 6957 else 6958 #endif 6959 in_losing(tp->t_inpcb); 6960 tp->t_rttvar += tp->t_srtt; 6961 tp->t_srtt = 0; 6962 } 6963 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 6964 tp->snd_recover = tp->snd_max; 6965 tp->t_flags |= TF_ACKNOW; 6966 tp->t_rtttime = 0; 6967 rack_cong_signal(tp, CC_RTO, tp->snd_una); 6968 out: 6969 return (retval); 6970 } 6971 6972 static int 6973 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling, uint8_t *doing_tlp) 6974 { 6975 int32_t ret = 0; 6976 int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK); 6977 6978 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 6979 (tp->t_flags & TF_GPUTINPROG)) { 6980 /* 6981 * We have a goodput in progress 6982 * and we have entered a late state. 6983 * Do we have enough data in the sb 6984 * to handle the GPUT request? 6985 */ 6986 uint32_t bytes; 6987 6988 bytes = tp->gput_ack - tp->gput_seq; 6989 if (SEQ_GT(tp->gput_seq, tp->snd_una)) 6990 bytes += tp->gput_seq - tp->snd_una; 6991 if (bytes > sbavail(&tp->t_inpcb->inp_socket->so_snd)) { 6992 /* 6993 * There are not enough bytes in the socket 6994 * buffer that have been sent to cover this 6995 * measurement. Cancel it. 6996 */ 6997 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 6998 rack->r_ctl.rc_gp_srtt /*flex1*/, 6999 tp->gput_seq, 7000 0, 0, 18, __LINE__, NULL, 0); 7001 tp->t_flags &= ~TF_GPUTINPROG; 7002 } 7003 } 7004 if (timers == 0) { 7005 return (0); 7006 } 7007 if (tp->t_state == TCPS_LISTEN) { 7008 /* no timers on listen sockets */ 7009 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) 7010 return (0); 7011 return (1); 7012 } 7013 if ((timers & PACE_TMR_RACK) && 7014 rack->rc_on_min_to) { 7015 /* 7016 * For the rack timer when we 7017 * are on a min-timeout (which means rrr_conf = 3) 7018 * we don't want to check the timer. It may 7019 * be going off for a pace and thats ok we 7020 * want to send the retransmit (if its ready). 7021 * 7022 * If its on a normal rack timer (non-min) then 7023 * we will check if its expired. 7024 */ 7025 goto skip_time_check; 7026 } 7027 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 7028 uint32_t left; 7029 7030 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 7031 ret = -1; 7032 rack_log_to_processing(rack, cts, ret, 0); 7033 return (0); 7034 } 7035 if (hpts_calling == 0) { 7036 /* 7037 * A user send or queued mbuf (sack) has called us? We 7038 * return 0 and let the pacing guards 7039 * deal with it if they should or 7040 * should not cause a send. 7041 */ 7042 ret = -2; 7043 rack_log_to_processing(rack, cts, ret, 0); 7044 return (0); 7045 } 7046 /* 7047 * Ok our timer went off early and we are not paced false 7048 * alarm, go back to sleep. 
7049 */ 7050 ret = -3; 7051 left = rack->r_ctl.rc_timer_exp - cts; 7052 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left)); 7053 rack_log_to_processing(rack, cts, ret, left); 7054 return (1); 7055 } 7056 skip_time_check: 7057 rack->rc_tmr_stopped = 0; 7058 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; 7059 if (timers & PACE_TMR_DELACK) { 7060 ret = rack_timeout_delack(tp, rack, cts); 7061 } else if (timers & PACE_TMR_RACK) { 7062 rack->r_ctl.rc_tlp_rxt_last_time = cts; 7063 rack->r_fast_output = 0; 7064 ret = rack_timeout_rack(tp, rack, cts); 7065 } else if (timers & PACE_TMR_TLP) { 7066 rack->r_ctl.rc_tlp_rxt_last_time = cts; 7067 ret = rack_timeout_tlp(tp, rack, cts, doing_tlp); 7068 } else if (timers & PACE_TMR_RXT) { 7069 rack->r_ctl.rc_tlp_rxt_last_time = cts; 7070 rack->r_fast_output = 0; 7071 ret = rack_timeout_rxt(tp, rack, cts); 7072 } else if (timers & PACE_TMR_PERSIT) { 7073 ret = rack_timeout_persist(tp, rack, cts); 7074 } else if (timers & PACE_TMR_KEEP) { 7075 ret = rack_timeout_keepalive(tp, rack, cts); 7076 } 7077 rack_log_to_processing(rack, cts, ret, timers); 7078 return (ret); 7079 } 7080 7081 static void 7082 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line) 7083 { 7084 struct timeval tv; 7085 uint32_t us_cts, flags_on_entry; 7086 uint8_t hpts_removed = 0; 7087 7088 flags_on_entry = rack->r_ctl.rc_hpts_flags; 7089 us_cts = tcp_get_usecs(&tv); 7090 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 7091 ((TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) || 7092 ((tp->snd_max - tp->snd_una) == 0))) { 7093 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 7094 hpts_removed = 1; 7095 /* If we were not delayed cancel out the flag. */ 7096 if ((tp->snd_max - tp->snd_una) == 0) 7097 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 7098 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry); 7099 } 7100 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 7101 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 7102 if (rack->rc_inp->inp_in_hpts && 7103 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) { 7104 /* 7105 * Canceling timer's when we have no output being 7106 * paced. We also must remove ourselves from the 7107 * hpts. 7108 */ 7109 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 7110 hpts_removed = 1; 7111 } 7112 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK); 7113 } 7114 if (hpts_removed == 0) 7115 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry); 7116 } 7117 7118 static void 7119 rack_timer_stop(struct tcpcb *tp, uint32_t timer_type) 7120 { 7121 return; 7122 } 7123 7124 static int 7125 rack_stopall(struct tcpcb *tp) 7126 { 7127 struct tcp_rack *rack; 7128 rack = (struct tcp_rack *)tp->t_fb_ptr; 7129 rack->t_timers_stopped = 1; 7130 return (0); 7131 } 7132 7133 static void 7134 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta) 7135 { 7136 return; 7137 } 7138 7139 static int 7140 rack_timer_active(struct tcpcb *tp, uint32_t timer_type) 7141 { 7142 return (0); 7143 } 7144 7145 static void 7146 rack_stop_all_timers(struct tcpcb *tp) 7147 { 7148 struct tcp_rack *rack; 7149 7150 /* 7151 * Assure no timers are running. 
7152 */ 7153 if (tcp_timer_active(tp, TT_PERSIST)) { 7154 /* We enter in persists, set the flag appropriately */ 7155 rack = (struct tcp_rack *)tp->t_fb_ptr; 7156 rack->rc_in_persist = 1; 7157 } 7158 tcp_timer_suspend(tp, TT_PERSIST); 7159 tcp_timer_suspend(tp, TT_REXMT); 7160 tcp_timer_suspend(tp, TT_KEEP); 7161 tcp_timer_suspend(tp, TT_DELACK); 7162 } 7163 7164 static void 7165 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, 7166 struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag) 7167 { 7168 int32_t idx; 7169 uint16_t stripped_flags; 7170 7171 rsm->r_rtr_cnt++; 7172 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7173 rsm->r_dupack = 0; 7174 if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) { 7175 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS; 7176 rsm->r_flags |= RACK_OVERMAX; 7177 } 7178 if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & RACK_TLP) == 0)) { 7179 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start); 7180 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start); 7181 } 7182 idx = rsm->r_rtr_cnt - 1; 7183 rsm->r_tim_lastsent[idx] = ts; 7184 stripped_flags = rsm->r_flags & ~(RACK_SENT_SP|RACK_SENT_FP); 7185 if (rsm->r_flags & RACK_ACKED) { 7186 /* Problably MTU discovery messing with us */ 7187 rsm->r_flags &= ~RACK_ACKED; 7188 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 7189 } 7190 if (rsm->r_in_tmap) { 7191 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7192 rsm->r_in_tmap = 0; 7193 } 7194 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7195 rsm->r_in_tmap = 1; 7196 if (rsm->r_flags & RACK_SACK_PASSED) { 7197 /* We have retransmitted due to the SACK pass */ 7198 rsm->r_flags &= ~RACK_SACK_PASSED; 7199 rsm->r_flags |= RACK_WAS_SACKPASS; 7200 } 7201 } 7202 7203 static uint32_t 7204 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 7205 struct rack_sendmap *rsm, uint64_t ts, int32_t *lenp, uint16_t add_flag) 7206 { 7207 /* 7208 * We (re-)transmitted starting at rsm->r_start for some length 7209 * (possibly less than r_end. 7210 */ 7211 struct rack_sendmap *nrsm, *insret; 7212 uint32_t c_end; 7213 int32_t len; 7214 7215 len = *lenp; 7216 c_end = rsm->r_start + len; 7217 if (SEQ_GEQ(c_end, rsm->r_end)) { 7218 /* 7219 * We retransmitted the whole piece or more than the whole 7220 * slopping into the next rsm. 7221 */ 7222 rack_update_rsm(tp, rack, rsm, ts, add_flag); 7223 if (c_end == rsm->r_end) { 7224 *lenp = 0; 7225 return (0); 7226 } else { 7227 int32_t act_len; 7228 7229 /* Hangs over the end return whats left */ 7230 act_len = rsm->r_end - rsm->r_start; 7231 *lenp = (len - act_len); 7232 return (rsm->r_end); 7233 } 7234 /* We don't get out of this block. */ 7235 } 7236 /* 7237 * Here we retransmitted less than the whole thing which means we 7238 * have to split this into what was transmitted and what was not. 7239 */ 7240 nrsm = rack_alloc_full_limit(rack); 7241 if (nrsm == NULL) { 7242 /* 7243 * We can't get memory, so lets not proceed. 7244 */ 7245 *lenp = 0; 7246 return (0); 7247 } 7248 /* 7249 * So here we are going to take the original rsm and make it what we 7250 * retransmitted. nrsm will be the tail portion we did not 7251 * retransmit. For example say the chunk was 1, 11 (10 bytes). And 7252 * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to 7253 * 1, 6 and the new piece will be 6, 11. 
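* (In that example c_end is 6, so the rack_clone_rsm() call below
* hands nrsm the un-retransmitted tail [6, 11) and trims the
* original rsm back to [1, 6); remember r_end is exclusive, it is
* the first sequence of the next block.)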
7254 */ 7255 rack_clone_rsm(rack, nrsm, rsm, c_end); 7256 nrsm->r_dupack = 0; 7257 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); 7258 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 7259 #ifdef INVARIANTS 7260 if (insret != NULL) { 7261 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 7262 nrsm, insret, rack, rsm); 7263 } 7264 #endif 7265 if (rsm->r_in_tmap) { 7266 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 7267 nrsm->r_in_tmap = 1; 7268 } 7269 rsm->r_flags &= (~RACK_HAS_FIN); 7270 rack_update_rsm(tp, rack, rsm, ts, add_flag); 7271 /* Log a split of rsm into rsm and nrsm */ 7272 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); 7273 *lenp = 0; 7274 return (0); 7275 } 7276 7277 static void 7278 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 7279 uint32_t seq_out, uint8_t th_flags, int32_t err, uint64_t cts, 7280 struct rack_sendmap *hintrsm, uint16_t add_flag, struct mbuf *s_mb, uint32_t s_moff, int hw_tls) 7281 { 7282 struct tcp_rack *rack; 7283 struct rack_sendmap *rsm, *nrsm, *insret, fe; 7284 register uint32_t snd_max, snd_una; 7285 7286 /* 7287 * Add to the RACK log of packets in flight or retransmitted. If 7288 * there is a TS option we will use the TS echoed, if not we will 7289 * grab a TS. 7290 * 7291 * Retransmissions will increment the count and move the ts to its 7292 * proper place. Note that if options do not include TS's then we 7293 * won't be able to effectively use the ACK for an RTT on a retran. 7294 * 7295 * Notes about r_start and r_end. Lets consider a send starting at 7296 * sequence 1 for 10 bytes. In such an example the r_start would be 7297 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. 7298 * This means that r_end is actually the first sequence for the next 7299 * slot (11). 7300 * 7301 */ 7302 /* 7303 * If err is set what do we do XXXrrs? should we not add the thing? 7304 * -- i.e. return if err != 0 or should we pretend we sent it? -- 7305 * i.e. proceed with add ** do this for now. 7306 */ 7307 INP_WLOCK_ASSERT(tp->t_inpcb); 7308 if (err) 7309 /* 7310 * We don't log errors -- we could but snd_max does not 7311 * advance in this case either. 7312 */ 7313 return; 7314 7315 if (th_flags & TH_RST) { 7316 /* 7317 * We don't log resets and we return immediately from 7318 * sending 7319 */ 7320 return; 7321 } 7322 rack = (struct tcp_rack *)tp->t_fb_ptr; 7323 snd_una = tp->snd_una; 7324 snd_max = tp->snd_max; 7325 if (th_flags & (TH_SYN | TH_FIN)) { 7326 /* 7327 * The call to rack_log_output is made before bumping 7328 * snd_max. This means we can record one extra byte on a SYN 7329 * or FIN if seq_out is adding more on and a FIN is present 7330 * (and we are not resending). 7331 */ 7332 if ((th_flags & TH_SYN) && (seq_out == tp->iss)) 7333 len++; 7334 if (th_flags & TH_FIN) 7335 len++; 7336 if (SEQ_LT(snd_max, tp->snd_nxt)) { 7337 /* 7338 * The add/update as not been done for the FIN/SYN 7339 * yet. 7340 */ 7341 snd_max = tp->snd_nxt; 7342 } 7343 } 7344 if (SEQ_LEQ((seq_out + len), snd_una)) { 7345 /* Are sending an old segment to induce an ack (keep-alive)? */ 7346 return; 7347 } 7348 if (SEQ_LT(seq_out, snd_una)) { 7349 /* huh? should we panic? 
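* For now we just clip the send to the un-acked portion. For
* example seq_out 100 with len 50 while snd_una is 120 is logged as
* covering [120, 150) only; anything at or below snd_una is already
* accounted for.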
*/ 7350 uint32_t end; 7351 7352 end = seq_out + len; 7353 seq_out = snd_una; 7354 if (SEQ_GEQ(end, seq_out)) 7355 len = end - seq_out; 7356 else 7357 len = 0; 7358 } 7359 if (len == 0) { 7360 /* We don't log zero window probes */ 7361 return; 7362 } 7363 rack->r_ctl.rc_time_last_sent = cts; 7364 if (IN_FASTRECOVERY(tp->t_flags)) { 7365 rack->r_ctl.rc_prr_out += len; 7366 } 7367 /* First question is it a retransmission or new? */ 7368 if (seq_out == snd_max) { 7369 /* Its new */ 7370 again: 7371 rsm = rack_alloc(rack); 7372 if (rsm == NULL) { 7373 /* 7374 * Hmm out of memory and the tcb got destroyed while 7375 * we tried to wait. 7376 */ 7377 return; 7378 } 7379 if (th_flags & TH_FIN) { 7380 rsm->r_flags = RACK_HAS_FIN|add_flag; 7381 } else { 7382 rsm->r_flags = add_flag; 7383 } 7384 if (hw_tls) 7385 rsm->r_hw_tls = 1; 7386 rsm->r_tim_lastsent[0] = cts; 7387 rsm->r_rtr_cnt = 1; 7388 rsm->r_rtr_bytes = 0; 7389 if (th_flags & TH_SYN) { 7390 /* The data space is one beyond snd_una */ 7391 rsm->r_flags |= RACK_HAS_SYN; 7392 } 7393 rsm->r_start = seq_out; 7394 rsm->r_end = rsm->r_start + len; 7395 rsm->r_dupack = 0; 7396 /* 7397 * save off the mbuf location that 7398 * sndmbuf_noadv returned (which is 7399 * where we started copying from).. 7400 */ 7401 rsm->m = s_mb; 7402 rsm->soff = s_moff; 7403 /* rsm->m will be NULL if RACK_HAS_SYN or RACK_HAS_FIN is set */ 7404 if (rsm->m) { 7405 if (rsm->m->m_len <= rsm->soff) { 7406 /* 7407 * XXXrrs Question, will this happen? 7408 * 7409 * If sbsndptr is set at the correct place 7410 * then s_moff should always be somewhere 7411 * within rsm->m. But if the sbsndptr was 7412 * off then that won't be true. If it occurs 7413 * we need to walkout to the correct location. 7414 */ 7415 struct mbuf *lm; 7416 7417 lm = rsm->m; 7418 while (lm->m_len <= rsm->soff) { 7419 rsm->soff -= lm->m_len; 7420 lm = lm->m_next; 7421 KASSERT(lm != NULL, ("%s rack:%p lm goes null orig_off:%u origmb:%p rsm->soff:%u", 7422 __func__, rack, s_moff, s_mb, rsm->soff)); 7423 } 7424 rsm->m = lm; 7425 counter_u64_add(rack_sbsndptr_wrong, 1); 7426 } else 7427 counter_u64_add(rack_sbsndptr_right, 1); 7428 rsm->orig_m_len = rsm->m->m_len; 7429 } else 7430 rsm->orig_m_len = 0; 7431 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7432 /* Log a new rsm */ 7433 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_NEW, 0, __LINE__); 7434 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7435 #ifdef INVARIANTS 7436 if (insret != NULL) { 7437 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 7438 nrsm, insret, rack, rsm); 7439 } 7440 #endif 7441 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7442 rsm->r_in_tmap = 1; 7443 /* 7444 * Special case detection, is there just a single 7445 * packet outstanding when we are not in recovery? 7446 * 7447 * If this is true mark it so. 7448 */ 7449 if ((IN_FASTRECOVERY(tp->t_flags) == 0) && 7450 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) == ctf_fixed_maxseg(tp))) { 7451 struct rack_sendmap *prsm; 7452 7453 prsm = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7454 if (prsm) 7455 prsm->r_one_out_nr = 1; 7456 } 7457 return; 7458 } 7459 /* 7460 * If we reach here its a retransmission and we need to find it. 
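* The search below tries the caller's hint first, then an exact
* r_start lookup in the RB tree, and finally, if seq_out lands in
* the middle of an existing rsm, splits that rsm at seq_out so the
* retransmitted portion can be updated on its own.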
7461 */ 7462 memset(&fe, 0, sizeof(fe)); 7463 more: 7464 if (hintrsm && (hintrsm->r_start == seq_out)) { 7465 rsm = hintrsm; 7466 hintrsm = NULL; 7467 } else { 7468 /* No hints sorry */ 7469 rsm = NULL; 7470 } 7471 if ((rsm) && (rsm->r_start == seq_out)) { 7472 seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag); 7473 if (len == 0) { 7474 return; 7475 } else { 7476 goto more; 7477 } 7478 } 7479 /* Ok it was not the last pointer go through it the hard way. */ 7480 refind: 7481 fe.r_start = seq_out; 7482 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 7483 if (rsm) { 7484 if (rsm->r_start == seq_out) { 7485 seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag); 7486 if (len == 0) { 7487 return; 7488 } else { 7489 goto refind; 7490 } 7491 } 7492 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { 7493 /* Transmitted within this piece */ 7494 /* 7495 * Ok we must split off the front and then let the 7496 * update do the rest 7497 */ 7498 nrsm = rack_alloc_full_limit(rack); 7499 if (nrsm == NULL) { 7500 rack_update_rsm(tp, rack, rsm, cts, add_flag); 7501 return; 7502 } 7503 /* 7504 * copy rsm to nrsm and then trim the front of rsm 7505 * to not include this part. 7506 */ 7507 rack_clone_rsm(rack, nrsm, rsm, seq_out); 7508 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 7509 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); 7510 #ifdef INVARIANTS 7511 if (insret != NULL) { 7512 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 7513 nrsm, insret, rack, rsm); 7514 } 7515 #endif 7516 if (rsm->r_in_tmap) { 7517 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 7518 nrsm->r_in_tmap = 1; 7519 } 7520 rsm->r_flags &= (~RACK_HAS_FIN); 7521 seq_out = rack_update_entry(tp, rack, nrsm, cts, &len, add_flag); 7522 if (len == 0) { 7523 return; 7524 } else if (len > 0) 7525 goto refind; 7526 } 7527 } 7528 /* 7529 * Hmm not found in map did they retransmit both old and on into the 7530 * new? 7531 */ 7532 if (seq_out == tp->snd_max) { 7533 goto again; 7534 } else if (SEQ_LT(seq_out, tp->snd_max)) { 7535 #ifdef INVARIANTS 7536 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", 7537 seq_out, len, tp->snd_una, tp->snd_max); 7538 printf("Starting Dump of all rack entries\n"); 7539 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 7540 printf("rsm:%p start:%u end:%u\n", 7541 rsm, rsm->r_start, rsm->r_end); 7542 } 7543 printf("Dump complete\n"); 7544 panic("seq_out not found rack:%p tp:%p", 7545 rack, tp); 7546 #endif 7547 } else { 7548 #ifdef INVARIANTS 7549 /* 7550 * Hmm beyond sndmax? (only if we are using the new rtt-pack 7551 * flag) 7552 */ 7553 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", 7554 seq_out, len, tp->snd_max, tp); 7555 #endif 7556 } 7557 } 7558 7559 /* 7560 * Record one of the RTT updates from an ack into 7561 * our sample structure. 
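* The confidence argument is (roughly) 2 for SACK based samples, 1
* for cum-acked data that does not look application limited and 0
* otherwise; the block below may further downgrade a 1 to 0 when
* the rsm was a just-return or single-outstanding case.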
7562 */ 7563 7564 static void 7565 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, uint32_t len, uint32_t us_rtt, 7566 int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt) 7567 { 7568 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 7569 (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) { 7570 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt; 7571 } 7572 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 7573 (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) { 7574 rack->r_ctl.rack_rs.rs_rtt_highest = rtt; 7575 } 7576 if (rack->rc_tp->t_flags & TF_GPUTINPROG) { 7577 if (us_rtt < rack->r_ctl.rc_gp_lowrtt) 7578 rack->r_ctl.rc_gp_lowrtt = us_rtt; 7579 if (rack->rc_tp->snd_wnd > rack->r_ctl.rc_gp_high_rwnd) 7580 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 7581 } 7582 if ((confidence == 1) && 7583 ((rsm == NULL) || 7584 (rsm->r_just_ret) || 7585 (rsm->r_one_out_nr && 7586 len < (ctf_fixed_maxseg(rack->rc_tp) * 2)))) { 7587 /* 7588 * If the rsm had a just return 7589 * hit it then we can't trust the 7590 * rtt measurement for buffer deterimination 7591 * Note that a confidence of 2, indicates 7592 * SACK'd which overrides the r_just_ret or 7593 * the r_one_out_nr. If it was a CUM-ACK and 7594 * we had only two outstanding, but get an 7595 * ack for only 1. Then that also lowers our 7596 * confidence. 7597 */ 7598 confidence = 0; 7599 } 7600 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 7601 (rack->r_ctl.rack_rs.rs_us_rtt > us_rtt)) { 7602 if (rack->r_ctl.rack_rs.confidence == 0) { 7603 /* 7604 * We take anything with no current confidence 7605 * saved. 7606 */ 7607 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; 7608 rack->r_ctl.rack_rs.confidence = confidence; 7609 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; 7610 } else if (confidence || rack->r_ctl.rack_rs.confidence) { 7611 /* 7612 * Once we have a confident number, 7613 * we can update it with a smaller 7614 * value since this confident number 7615 * may include the DSACK time until 7616 * the next segment (the second one) arrived. 7617 */ 7618 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; 7619 rack->r_ctl.rack_rs.confidence = confidence; 7620 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; 7621 } 7622 } 7623 rack_log_rtt_upd(rack->rc_tp, rack, us_rtt, len, rsm, confidence); 7624 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID; 7625 rack->r_ctl.rack_rs.rs_rtt_tot += rtt; 7626 rack->r_ctl.rack_rs.rs_rtt_cnt++; 7627 } 7628 7629 /* 7630 * Collect new round-trip time estimate 7631 * and update averages and current timeout. 
7632 */ 7633 static void 7634 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) 7635 { 7636 int32_t delta; 7637 uint32_t o_srtt, o_var; 7638 int32_t hrtt_up = 0; 7639 int32_t rtt; 7640 7641 if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) 7642 /* No valid sample */ 7643 return; 7644 if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) { 7645 /* We are to use the lowest RTT seen in a single ack */ 7646 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; 7647 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) { 7648 /* We are to use the highest RTT seen in a single ack */ 7649 rtt = rack->r_ctl.rack_rs.rs_rtt_highest; 7650 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) { 7651 /* We are to use the average RTT seen in a single ack */ 7652 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot / 7653 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt); 7654 } else { 7655 #ifdef INVARIANTS 7656 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method); 7657 #endif 7658 return; 7659 } 7660 if (rtt == 0) 7661 rtt = 1; 7662 if (rack->rc_gp_rtt_set == 0) { 7663 /* 7664 * With no RTT we have to accept 7665 * even one we are not confident of. 7666 */ 7667 rack->r_ctl.rc_gp_srtt = rack->r_ctl.rack_rs.rs_us_rtt; 7668 rack->rc_gp_rtt_set = 1; 7669 } else if (rack->r_ctl.rack_rs.confidence) { 7670 /* update the running gp srtt */ 7671 rack->r_ctl.rc_gp_srtt -= (rack->r_ctl.rc_gp_srtt/8); 7672 rack->r_ctl.rc_gp_srtt += rack->r_ctl.rack_rs.rs_us_rtt / 8; 7673 } 7674 if (rack->r_ctl.rack_rs.confidence) { 7675 /* 7676 * record the low and high for highly buffered path computation, 7677 * we only do this if we are confident (not a retransmission). 7678 */ 7679 if (rack->r_ctl.rc_highest_us_rtt < rack->r_ctl.rack_rs.rs_us_rtt) { 7680 rack->r_ctl.rc_highest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 7681 hrtt_up = 1; 7682 } 7683 if (rack->rc_highly_buffered == 0) { 7684 /* 7685 * Currently once we declare a path has 7686 * highly buffered there is no going 7687 * back, which may be a problem... 7688 */ 7689 if ((rack->r_ctl.rc_highest_us_rtt / rack->r_ctl.rc_lowest_us_rtt) > rack_hbp_thresh) { 7690 rack_log_rtt_shrinks(rack, rack->r_ctl.rack_rs.rs_us_rtt, 7691 rack->r_ctl.rc_highest_us_rtt, 7692 rack->r_ctl.rc_lowest_us_rtt, 7693 RACK_RTTS_SEEHBP); 7694 rack->rc_highly_buffered = 1; 7695 } 7696 } 7697 } 7698 if ((rack->r_ctl.rack_rs.confidence) || 7699 (rack->r_ctl.rack_rs.rs_us_rtrcnt == 1)) { 7700 /* 7701 * If we are highly confident of it <or> it was 7702 * never retransmitted we accept it as the last us_rtt. 7703 */ 7704 rack->r_ctl.rc_last_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 7705 /* The lowest rtt can be set if its was not retransmited */ 7706 if (rack->r_ctl.rc_lowest_us_rtt > rack->r_ctl.rack_rs.rs_us_rtt) { 7707 rack->r_ctl.rc_lowest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 7708 if (rack->r_ctl.rc_lowest_us_rtt == 0) 7709 rack->r_ctl.rc_lowest_us_rtt = 1; 7710 } 7711 } 7712 o_srtt = tp->t_srtt; 7713 o_var = tp->t_rttvar; 7714 rack = (struct tcp_rack *)tp->t_fb_ptr; 7715 if (tp->t_srtt != 0) { 7716 /* 7717 * We keep a simple srtt in microseconds, like our rtt 7718 * measurement. We don't need to do any tricks with shifting 7719 * etc. Instead we just add in 1/8th of the new measurement 7720 * and subtract out 1/8 of the old srtt. We do the same with 7721 * the variance after finding the absolute value of the 7722 * difference between this sample and the current srtt. 
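* A quick worked example of the arithmetic below: with a current
* srtt of 100000 usecs, rttvar of 20000 usecs and a new measurement
* of 60000 usecs, delta is 40000; srtt becomes
* 100000 - 12500 + 7500 = 95000 and rttvar becomes
* 20000 - 2500 + 5000 = 22500.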
7723 */ 7724 delta = tp->t_srtt - rtt; 7725 /* Take off 1/8th of the current sRTT */ 7726 tp->t_srtt -= (tp->t_srtt >> 3); 7727 /* Add in 1/8th of the new RTT just measured */ 7728 tp->t_srtt += (rtt >> 3); 7729 if (tp->t_srtt <= 0) 7730 tp->t_srtt = 1; 7731 /* Now lets make the absolute value of the variance */ 7732 if (delta < 0) 7733 delta = -delta; 7734 /* Subtract out 1/8th */ 7735 tp->t_rttvar -= (tp->t_rttvar >> 3); 7736 /* Add in 1/8th of the new variance we just saw */ 7737 tp->t_rttvar += (delta >> 3); 7738 if (tp->t_rttvar <= 0) 7739 tp->t_rttvar = 1; 7740 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) 7741 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 7742 } else { 7743 /* 7744 * No rtt measurement yet - use the unsmoothed rtt. Set the 7745 * variance to half the rtt (so our first retransmit happens 7746 * at 3*rtt). 7747 */ 7748 tp->t_srtt = rtt; 7749 tp->t_rttvar = rtt >> 1; 7750 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 7751 } 7752 rack->rc_srtt_measure_made = 1; 7753 KMOD_TCPSTAT_INC(tcps_rttupdated); 7754 tp->t_rttupdated++; 7755 #ifdef STATS 7756 if (rack_stats_gets_ms_rtt == 0) { 7757 /* Send in the microsecond rtt used for rxt timeout purposes */ 7758 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt)); 7759 } else if (rack_stats_gets_ms_rtt == 1) { 7760 /* Send in the millisecond rtt used for rxt timeout purposes */ 7761 int32_t ms_rtt; 7762 7763 /* Round up */ 7764 ms_rtt = (rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC; 7765 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt)); 7766 } else if (rack_stats_gets_ms_rtt == 2) { 7767 /* Send in the millisecond rtt has close to the path RTT as we can get */ 7768 int32_t ms_rtt; 7769 7770 /* Round up */ 7771 ms_rtt = (rack->r_ctl.rack_rs.rs_us_rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC; 7772 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt)); 7773 } else { 7774 /* Send in the microsecond rtt has close to the path RTT as we can get */ 7775 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt)); 7776 } 7777 7778 #endif 7779 /* 7780 * the retransmit should happen at rtt + 4 * rttvar. Because of the 7781 * way we do the smoothing, srtt and rttvar will each average +1/2 7782 * tick of bias. When we compute the retransmit timer, we want 1/2 7783 * tick of rounding and 1 extra tick because of +-1/2 tick 7784 * uncertainty in the firing of the timer. The bias will give us 7785 * exactly the 1.5 tick we need. But, because the bias is 7786 * statistical, we have to test that we don't drop below the minimum 7787 * feasible timer (which is 2 ticks). 7788 */ 7789 tp->t_rxtshift = 0; 7790 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 7791 max(rack_rto_min, rtt + 2), rack_rto_max, rack->r_ctl.timer_slop); 7792 rack_log_rtt_sample(rack, rtt); 7793 tp->t_softerror = 0; 7794 } 7795 7796 7797 static void 7798 rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts) 7799 { 7800 /* 7801 * Apply to filter the inbound us-rtt at us_cts. 
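* Below we also check whether a new low justifies jumping into
* probe-rtt early: rack_probertt_lower_within is treated as a
* percentage, so e.g. a value of 10 means we enter probe-rtt now if
* we are already inside the final 10% of the configured
* time-between-probertt interval.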
7802 */ 7803 uint32_t old_rtt; 7804 7805 old_rtt = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 7806 apply_filter_min_small(&rack->r_ctl.rc_gp_min_rtt, 7807 us_rtt, us_cts); 7808 if (rack->r_ctl.last_pacing_time && 7809 rack->rc_gp_dyn_mul && 7810 (rack->r_ctl.last_pacing_time > us_rtt)) 7811 rack->pacing_longer_than_rtt = 1; 7812 else 7813 rack->pacing_longer_than_rtt = 0; 7814 if (old_rtt > us_rtt) { 7815 /* We just hit a new lower rtt time */ 7816 rack_log_rtt_shrinks(rack, us_cts, old_rtt, 7817 __LINE__, RACK_RTTS_NEWRTT); 7818 /* 7819 * Only count it if its lower than what we saw within our 7820 * calculated range. 7821 */ 7822 if ((old_rtt - us_rtt) > rack_min_rtt_movement) { 7823 if (rack_probertt_lower_within && 7824 rack->rc_gp_dyn_mul && 7825 (rack->use_fixed_rate == 0) && 7826 (rack->rc_always_pace)) { 7827 /* 7828 * We are seeing a new lower rtt very close 7829 * to the time that we would have entered probe-rtt. 7830 * This is probably due to the fact that a peer flow 7831 * has entered probe-rtt. Lets go in now too. 7832 */ 7833 uint32_t val; 7834 7835 val = rack_probertt_lower_within * rack_time_between_probertt; 7836 val /= 100; 7837 if ((rack->in_probe_rtt == 0) && 7838 ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) { 7839 rack_enter_probertt(rack, us_cts); 7840 } 7841 } 7842 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 7843 } 7844 } 7845 } 7846 7847 static int 7848 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 7849 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack) 7850 { 7851 int32_t i, all; 7852 uint32_t t, len_acked; 7853 7854 if ((rsm->r_flags & RACK_ACKED) || 7855 (rsm->r_flags & RACK_WAS_ACKED)) 7856 /* Already done */ 7857 return (0); 7858 if (rsm->r_no_rtt_allowed) { 7859 /* Not allowed */ 7860 return (0); 7861 } 7862 if (ack_type == CUM_ACKED) { 7863 if (SEQ_GT(th_ack, rsm->r_end)) { 7864 len_acked = rsm->r_end - rsm->r_start; 7865 all = 1; 7866 } else { 7867 len_acked = th_ack - rsm->r_start; 7868 all = 0; 7869 } 7870 } else { 7871 len_acked = rsm->r_end - rsm->r_start; 7872 all = 0; 7873 } 7874 if (rsm->r_rtr_cnt == 1) { 7875 uint32_t us_rtt; 7876 7877 t = cts - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 7878 if ((int)t <= 0) 7879 t = 1; 7880 if (!tp->t_rttlow || tp->t_rttlow > t) 7881 tp->t_rttlow = t; 7882 if (!rack->r_ctl.rc_rack_min_rtt || 7883 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 7884 rack->r_ctl.rc_rack_min_rtt = t; 7885 if (rack->r_ctl.rc_rack_min_rtt == 0) { 7886 rack->r_ctl.rc_rack_min_rtt = 1; 7887 } 7888 } 7889 if (TSTMP_GT(tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])) 7890 us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 7891 else 7892 us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 7893 if (us_rtt == 0) 7894 us_rtt = 1; 7895 rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time)); 7896 if (ack_type == SACKED) { 7897 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 1); 7898 tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 2 , rsm, rsm->r_rtr_cnt); 7899 } else { 7900 /* 7901 * We need to setup what our confidence 7902 * is in this ack. 7903 * 7904 * If the rsm was app limited and it is 7905 * less than a mss in length (the end 7906 * of the send) then we have a gap. 
If we 7907 * were app limited but say we were sending 7908 * multiple MSS's then we are more confident 7909 * int it. 7910 * 7911 * When we are not app-limited then we see if 7912 * the rsm is being included in the current 7913 * measurement, we tell this by the app_limited_needs_set 7914 * flag. 7915 * 7916 * Note that being cwnd blocked is not applimited 7917 * as well as the pacing delay between packets which 7918 * are sending only 1 or 2 MSS's also will show up 7919 * in the RTT. We probably need to examine this algorithm 7920 * a bit more and enhance it to account for the delay 7921 * between rsm's. We could do that by saving off the 7922 * pacing delay of each rsm (in an rsm) and then 7923 * factoring that in somehow though for now I am 7924 * not sure how :) 7925 */ 7926 int calc_conf = 0; 7927 7928 if (rsm->r_flags & RACK_APP_LIMITED) { 7929 if (all && (len_acked <= ctf_fixed_maxseg(tp))) 7930 calc_conf = 0; 7931 else 7932 calc_conf = 1; 7933 } else if (rack->app_limited_needs_set == 0) { 7934 calc_conf = 1; 7935 } else { 7936 calc_conf = 0; 7937 } 7938 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 2); 7939 tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 7940 calc_conf, rsm, rsm->r_rtr_cnt); 7941 } 7942 if ((rsm->r_flags & RACK_TLP) && 7943 (!IN_FASTRECOVERY(tp->t_flags))) { 7944 /* Segment was a TLP and our retrans matched */ 7945 if (rack->r_ctl.rc_tlp_cwnd_reduce) { 7946 rack->r_ctl.rc_rsm_start = tp->snd_max; 7947 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 7948 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 7949 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una); 7950 } 7951 } 7952 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 7953 /* New more recent rack_tmit_time */ 7954 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 7955 rack->rc_rack_rtt = t; 7956 } 7957 return (1); 7958 } 7959 /* 7960 * We clear the soft/rxtshift since we got an ack. 7961 * There is no assurance we will call the commit() function 7962 * so we need to clear these to avoid incorrect handling. 7963 */ 7964 tp->t_rxtshift = 0; 7965 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 7966 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 7967 tp->t_softerror = 0; 7968 if (to && (to->to_flags & TOF_TS) && 7969 (ack_type == CUM_ACKED) && 7970 (to->to_tsecr) && 7971 ((rsm->r_flags & RACK_OVERMAX) == 0)) { 7972 /* 7973 * Now which timestamp does it match? In this block the ACK 7974 * must be coming from a previous transmission. 7975 */ 7976 for (i = 0; i < rsm->r_rtr_cnt; i++) { 7977 if (rack_ts_to_msec(rsm->r_tim_lastsent[i]) == to->to_tsecr) { 7978 t = cts - (uint32_t)rsm->r_tim_lastsent[i]; 7979 if ((int)t <= 0) 7980 t = 1; 7981 if ((i + 1) < rsm->r_rtr_cnt) { 7982 /* 7983 * The peer ack'd from our previous 7984 * transmission. We have a spurious 7985 * retransmission and thus we dont 7986 * want to update our rack_rtt. 
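* For example, if r_rtr_cnt is 2 and the echoed timestamp matches
* r_tim_lastsent[0], the ACK was generated by the original
* transmission, the retransmit was spurious, and no RTT or rack_rtt
* update is taken from it.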
7987 */ 7988 return (0); 7989 } 7990 if (!tp->t_rttlow || tp->t_rttlow > t) 7991 tp->t_rttlow = t; 7992 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 7993 rack->r_ctl.rc_rack_min_rtt = t; 7994 if (rack->r_ctl.rc_rack_min_rtt == 0) { 7995 rack->r_ctl.rc_rack_min_rtt = 1; 7996 } 7997 } 7998 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 7999 (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 8000 /* New more recent rack_tmit_time */ 8001 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 8002 rack->rc_rack_rtt = t; 8003 } 8004 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[i], cts, 3); 8005 tcp_rack_xmit_timer(rack, t + 1, len_acked, t, 0, rsm, 8006 rsm->r_rtr_cnt); 8007 return (1); 8008 } 8009 } 8010 goto ts_not_found; 8011 } else { 8012 /* 8013 * Ok its a SACK block that we retransmitted. or a windows 8014 * machine without timestamps. We can tell nothing from the 8015 * time-stamp since its not there or the time the peer last 8016 * recieved a segment that moved forward its cum-ack point. 8017 */ 8018 ts_not_found: 8019 i = rsm->r_rtr_cnt - 1; 8020 t = cts - (uint32_t)rsm->r_tim_lastsent[i]; 8021 if ((int)t <= 0) 8022 t = 1; 8023 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 8024 /* 8025 * We retransmitted and the ack came back in less 8026 * than the smallest rtt we have observed. We most 8027 * likely did an improper retransmit as outlined in 8028 * 6.2 Step 2 point 2 in the rack-draft so we 8029 * don't want to update our rack_rtt. We in 8030 * theory (in future) might want to think about reverting our 8031 * cwnd state but we won't for now. 8032 */ 8033 return (0); 8034 } else if (rack->r_ctl.rc_rack_min_rtt) { 8035 /* 8036 * We retransmitted it and the retransmit did the 8037 * job. 8038 */ 8039 if (!rack->r_ctl.rc_rack_min_rtt || 8040 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 8041 rack->r_ctl.rc_rack_min_rtt = t; 8042 if (rack->r_ctl.rc_rack_min_rtt == 0) { 8043 rack->r_ctl.rc_rack_min_rtt = 1; 8044 } 8045 } 8046 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, (uint32_t)rsm->r_tim_lastsent[i])) { 8047 /* New more recent rack_tmit_time */ 8048 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[i]; 8049 rack->rc_rack_rtt = t; 8050 } 8051 return (1); 8052 } 8053 } 8054 return (0); 8055 } 8056 8057 /* 8058 * Mark the SACK_PASSED flag on all entries prior to rsm send wise. 8059 */ 8060 static void 8061 rack_log_sack_passed(struct tcpcb *tp, 8062 struct tcp_rack *rack, struct rack_sendmap *rsm) 8063 { 8064 struct rack_sendmap *nrsm; 8065 8066 nrsm = rsm; 8067 TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap, 8068 rack_head, r_tnext) { 8069 if (nrsm == rsm) { 8070 /* Skip orginal segment he is acked */ 8071 continue; 8072 } 8073 if (nrsm->r_flags & RACK_ACKED) { 8074 /* 8075 * Skip ack'd segments, though we 8076 * should not see these, since tmap 8077 * should not have ack'd segments. 8078 */ 8079 continue; 8080 } 8081 if (nrsm->r_flags & RACK_SACK_PASSED) { 8082 /* 8083 * We found one that is already marked 8084 * passed, we have been here before and 8085 * so all others below this are marked. 
8086 */ 8087 break; 8088 } 8089 nrsm->r_flags |= RACK_SACK_PASSED; 8090 nrsm->r_flags &= ~RACK_WAS_SACKPASS; 8091 } 8092 } 8093 8094 static void 8095 rack_need_set_test(struct tcpcb *tp, 8096 struct tcp_rack *rack, 8097 struct rack_sendmap *rsm, 8098 tcp_seq th_ack, 8099 int line, 8100 int use_which) 8101 { 8102 8103 if ((tp->t_flags & TF_GPUTINPROG) && 8104 SEQ_GEQ(rsm->r_end, tp->gput_seq)) { 8105 /* 8106 * We were app limited, and this ack 8107 * butts up or goes beyond the point where we want 8108 * to start our next measurement. We need 8109 * to record the new gput_ts as here and 8110 * possibly update the start sequence. 8111 */ 8112 uint32_t seq, ts; 8113 8114 if (rsm->r_rtr_cnt > 1) { 8115 /* 8116 * This is a retransmit, can we 8117 * really make any assessment at this 8118 * point? We are not really sure of 8119 * the timestamp, is it this or the 8120 * previous transmission? 8121 * 8122 * Lets wait for something better that 8123 * is not retransmitted. 8124 */ 8125 return; 8126 } 8127 seq = tp->gput_seq; 8128 ts = tp->gput_ts; 8129 rack->app_limited_needs_set = 0; 8130 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 8131 /* Do we start at a new end? */ 8132 if ((use_which == RACK_USE_BEG) && 8133 SEQ_GEQ(rsm->r_start, tp->gput_seq)) { 8134 /* 8135 * When we get an ACK that just eats 8136 * up some of the rsm, we set RACK_USE_BEG 8137 * since whats at r_start (i.e. th_ack) 8138 * is left unacked and thats where the 8139 * measurement not starts. 8140 */ 8141 tp->gput_seq = rsm->r_start; 8142 rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 8143 } 8144 if ((use_which == RACK_USE_END) && 8145 SEQ_GEQ(rsm->r_end, tp->gput_seq)) { 8146 /* 8147 * We use the end when the cumack 8148 * is moving forward and completely 8149 * deleting the rsm passed so basically 8150 * r_end holds th_ack. 8151 * 8152 * For SACK's we also want to use the end 8153 * since this piece just got sacked and 8154 * we want to target anything after that 8155 * in our measurement. 8156 */ 8157 tp->gput_seq = rsm->r_end; 8158 rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 8159 } 8160 if (use_which == RACK_USE_END_OR_THACK) { 8161 /* 8162 * special case for ack moving forward, 8163 * not a sack, we need to move all the 8164 * way up to where this ack cum-ack moves 8165 * to. 8166 */ 8167 if (SEQ_GT(th_ack, rsm->r_end)) 8168 tp->gput_seq = th_ack; 8169 else 8170 tp->gput_seq = rsm->r_end; 8171 rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 8172 } 8173 if (SEQ_GT(tp->gput_seq, tp->gput_ack)) { 8174 /* 8175 * We moved beyond this guy's range, re-calculate 8176 * the new end point. 8177 */ 8178 if (rack->rc_gp_filled == 0) { 8179 tp->gput_ack = tp->gput_seq + max(rc_init_window(rack), (MIN_GP_WIN * ctf_fixed_maxseg(tp))); 8180 } else { 8181 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 8182 } 8183 } 8184 /* 8185 * We are moving the goal post, we may be able to clear the 8186 * measure_saw_probe_rtt flag. 
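 * (The flag can be dropped once the measurement's starting sequence is at
 * or beyond the snd_max we recorded when probe-RTT exited, since the new
 * measurement window then no longer overlaps the probe-RTT episode.)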
8187 */ 8188 if ((rack->in_probe_rtt == 0) && 8189 (rack->measure_saw_probe_rtt) && 8190 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 8191 rack->measure_saw_probe_rtt = 0; 8192 rack_log_pacing_delay_calc(rack, ts, tp->gput_ts, 8193 seq, tp->gput_seq, 0, 5, line, NULL, 0); 8194 if (rack->rc_gp_filled && 8195 ((tp->gput_ack - tp->gput_seq) < 8196 max(rc_init_window(rack), (MIN_GP_WIN * 8197 ctf_fixed_maxseg(tp))))) { 8198 uint32_t ideal_amount; 8199 8200 ideal_amount = rack_get_measure_window(tp, rack); 8201 if (ideal_amount > sbavail(&tp->t_inpcb->inp_socket->so_snd)) { 8202 /* 8203 * There is no sense of continuing this measurement 8204 * because its too small to gain us anything we 8205 * trust. Skip it and that way we can start a new 8206 * measurement quicker. 8207 */ 8208 tp->t_flags &= ~TF_GPUTINPROG; 8209 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq, 8210 0, 0, 0, 6, __LINE__, NULL, 0); 8211 } else { 8212 /* 8213 * Reset the window further out. 8214 */ 8215 tp->gput_ack = tp->gput_seq + ideal_amount; 8216 } 8217 } 8218 } 8219 } 8220 8221 static uint32_t 8222 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, 8223 struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, int *moved_two) 8224 { 8225 uint32_t start, end, changed = 0; 8226 struct rack_sendmap stack_map; 8227 struct rack_sendmap *rsm, *nrsm, fe, *insret, *prev, *next; 8228 int32_t used_ref = 1; 8229 int moved = 0; 8230 8231 start = sack->start; 8232 end = sack->end; 8233 rsm = *prsm; 8234 memset(&fe, 0, sizeof(fe)); 8235 do_rest_ofb: 8236 if ((rsm == NULL) || 8237 (SEQ_LT(end, rsm->r_start)) || 8238 (SEQ_GEQ(start, rsm->r_end)) || 8239 (SEQ_LT(start, rsm->r_start))) { 8240 /* 8241 * We are not in the right spot, 8242 * find the correct spot in the tree. 8243 */ 8244 used_ref = 0; 8245 fe.r_start = start; 8246 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 8247 moved++; 8248 } 8249 if (rsm == NULL) { 8250 /* TSNH */ 8251 goto out; 8252 } 8253 /* Ok we have an ACK for some piece of this rsm */ 8254 if (rsm->r_start != start) { 8255 if ((rsm->r_flags & RACK_ACKED) == 0) { 8256 /** 8257 * Need to split this in two pieces the before and after, 8258 * the before remains in the map, the after must be 8259 * added. In other words we have: 8260 * rsm |--------------| 8261 * sackblk |-------> 8262 * rsm will become 8263 * rsm |---| 8264 * and nrsm will be the sacked piece 8265 * nrsm |----------| 8266 * 8267 * But before we start down that path lets 8268 * see if the sack spans over on top of 8269 * the next guy and it is already sacked. 8270 */ 8271 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8272 if (next && (next->r_flags & RACK_ACKED) && 8273 SEQ_GEQ(end, next->r_start)) { 8274 /** 8275 * So the next one is already acked, and 8276 * we can thus by hookery use our stack_map 8277 * to reflect the piece being sacked and 8278 * then adjust the two tree entries moving 8279 * the start and ends around. So we start like: 8280 * rsm |------------| (not-acked) 8281 * next |-----------| (acked) 8282 * sackblk |--------> 8283 * We want to end like so: 8284 * rsm |------| (not-acked) 8285 * next |-----------------| (acked) 8286 * nrsm |-----| 8287 * Where nrsm is a temporary stack piece we 8288 * use to update all the gizmos. 
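 * (Note that nrsm in this path is only a stack-local scratch copy; it is
 * used to drive the RTT and accounting updates below and is never inserted
 * into the RB tree or the tmap, which is what lets us handle this case
 * without allocating a new sendmap entry.)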
8289 */ 8290 /* Copy up our fudge block */ 8291 nrsm = &stack_map; 8292 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 8293 /* Now adjust our tree blocks */ 8294 rsm->r_end = start; 8295 next->r_start = start; 8296 /* Now we must adjust back where next->m is */ 8297 rack_setup_offset_for_rsm(rsm, next); 8298 8299 /* We don't need to adjust rsm, it did not change */ 8300 /* Clear out the dup ack count of the remainder */ 8301 rsm->r_dupack = 0; 8302 rsm->r_just_ret = 0; 8303 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 8304 /* Now lets make sure our fudge block is right */ 8305 nrsm->r_start = start; 8306 /* Now lets update all the stats and such */ 8307 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); 8308 if (rack->app_limited_needs_set) 8309 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); 8310 changed += (nrsm->r_end - nrsm->r_start); 8311 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 8312 if (nrsm->r_flags & RACK_SACK_PASSED) { 8313 counter_u64_add(rack_reorder_seen, 1); 8314 rack->r_ctl.rc_reorder_ts = cts; 8315 } 8316 /* 8317 * Now we want to go up from rsm (the 8318 * one left un-acked) to the next one 8319 * in the tmap. We do this so when 8320 * we walk backwards we include marking 8321 * sack-passed on rsm (The one passed in 8322 * is skipped since it is generally called 8323 * on something sacked before removing it 8324 * from the tmap). 8325 */ 8326 if (rsm->r_in_tmap) { 8327 nrsm = TAILQ_NEXT(rsm, r_tnext); 8328 /* 8329 * Now that we have the next 8330 * one walk backwards from there. 8331 */ 8332 if (nrsm && nrsm->r_in_tmap) 8333 rack_log_sack_passed(tp, rack, nrsm); 8334 } 8335 /* Now are we done? */ 8336 if (SEQ_LT(end, next->r_end) || 8337 (end == next->r_end)) { 8338 /* Done with block */ 8339 goto out; 8340 } 8341 rack_log_map_chg(tp, rack, &stack_map, rsm, next, MAP_SACK_M1, end, __LINE__); 8342 counter_u64_add(rack_sack_used_next_merge, 1); 8343 /* Postion for the next block */ 8344 start = next->r_end; 8345 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, next); 8346 if (rsm == NULL) 8347 goto out; 8348 } else { 8349 /** 8350 * We can't use any hookery here, so we 8351 * need to split the map. We enter like 8352 * so: 8353 * rsm |--------| 8354 * sackblk |-----> 8355 * We will add the new block nrsm and 8356 * that will be the new portion, and then 8357 * fall through after reseting rsm. So we 8358 * split and look like this: 8359 * rsm |----| 8360 * sackblk |-----> 8361 * nrsm |---| 8362 * We then fall through reseting 8363 * rsm to nrsm, so the next block 8364 * picks it up. 8365 */ 8366 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 8367 if (nrsm == NULL) { 8368 /* 8369 * failed XXXrrs what can we do but loose the sack 8370 * info? 
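 * (Losing the SACK information here is safe, just conservative: the
 * un-split portion simply stays marked un-acked and may end up being
 * retransmitted even though the peer already holds it.)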
8371 */ 8372 goto out; 8373 } 8374 counter_u64_add(rack_sack_splits, 1); 8375 rack_clone_rsm(rack, nrsm, rsm, start); 8376 rsm->r_just_ret = 0; 8377 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 8378 #ifdef INVARIANTS 8379 if (insret != NULL) { 8380 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 8381 nrsm, insret, rack, rsm); 8382 } 8383 #endif 8384 if (rsm->r_in_tmap) { 8385 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 8386 nrsm->r_in_tmap = 1; 8387 } 8388 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M2, end, __LINE__); 8389 rsm->r_flags &= (~RACK_HAS_FIN); 8390 /* Position us to point to the new nrsm that starts the sack blk */ 8391 rsm = nrsm; 8392 } 8393 } else { 8394 /* Already sacked this piece */ 8395 counter_u64_add(rack_sack_skipped_acked, 1); 8396 moved++; 8397 if (end == rsm->r_end) { 8398 /* Done with block */ 8399 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8400 goto out; 8401 } else if (SEQ_LT(end, rsm->r_end)) { 8402 /* A partial sack to a already sacked block */ 8403 moved++; 8404 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8405 goto out; 8406 } else { 8407 /* 8408 * The end goes beyond this guy 8409 * repostion the start to the 8410 * next block. 8411 */ 8412 start = rsm->r_end; 8413 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8414 if (rsm == NULL) 8415 goto out; 8416 } 8417 } 8418 } 8419 if (SEQ_GEQ(end, rsm->r_end)) { 8420 /** 8421 * The end of this block is either beyond this guy or right 8422 * at this guy. I.e.: 8423 * rsm --- |-----| 8424 * end |-----| 8425 * <or> 8426 * end |---------| 8427 */ 8428 if ((rsm->r_flags & RACK_ACKED) == 0) { 8429 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); 8430 changed += (rsm->r_end - rsm->r_start); 8431 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 8432 if (rsm->r_in_tmap) /* should be true */ 8433 rack_log_sack_passed(tp, rack, rsm); 8434 /* Is Reordering occuring? */ 8435 if (rsm->r_flags & RACK_SACK_PASSED) { 8436 rsm->r_flags &= ~RACK_SACK_PASSED; 8437 counter_u64_add(rack_reorder_seen, 1); 8438 rack->r_ctl.rc_reorder_ts = cts; 8439 } 8440 if (rack->app_limited_needs_set) 8441 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); 8442 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 8443 rsm->r_flags |= RACK_ACKED; 8444 rsm->r_flags &= ~RACK_TLP; 8445 if (rsm->r_in_tmap) { 8446 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 8447 rsm->r_in_tmap = 0; 8448 } 8449 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_SACK_M3, end, __LINE__); 8450 } else { 8451 counter_u64_add(rack_sack_skipped_acked, 1); 8452 moved++; 8453 } 8454 if (end == rsm->r_end) { 8455 /* This block only - done, setup for next */ 8456 goto out; 8457 } 8458 /* 8459 * There is more not coverend by this rsm move on 8460 * to the next block in the RB tree. 8461 */ 8462 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8463 start = rsm->r_end; 8464 rsm = nrsm; 8465 if (rsm == NULL) 8466 goto out; 8467 goto do_rest_ofb; 8468 } 8469 /** 8470 * The end of this sack block is smaller than 8471 * our rsm i.e.: 8472 * rsm --- |-----| 8473 * end |--| 8474 */ 8475 if ((rsm->r_flags & RACK_ACKED) == 0) { 8476 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8477 if (prev && (prev->r_flags & RACK_ACKED)) { 8478 /** 8479 * Goal, we want the right remainder of rsm to shrink 8480 * in place and span from (rsm->r_start = end) to rsm->r_end. 
8481 * We want to expand prev to go all the way 8482 * to prev->r_end <- end. 8483 * so in the tree we have before: 8484 * prev |--------| (acked) 8485 * rsm |-------| (non-acked) 8486 * sackblk |-| 8487 * We churn it so we end up with 8488 * prev |----------| (acked) 8489 * rsm |-----| (non-acked) 8490 * nrsm |-| (temporary) 8491 */ 8492 nrsm = &stack_map; 8493 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 8494 prev->r_end = end; 8495 rsm->r_start = end; 8496 /* Now adjust nrsm (stack copy) to be 8497 * the one that is the small 8498 * piece that was "sacked". 8499 */ 8500 nrsm->r_end = end; 8501 rsm->r_dupack = 0; 8502 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 8503 /* 8504 * Now that the rsm has had its start moved forward 8505 * lets go ahead and get its new place in the world. 8506 */ 8507 rack_setup_offset_for_rsm(prev, rsm); 8508 /* 8509 * Now nrsm is our new little piece 8510 * that is acked (which was merged 8511 * to prev). Update the rtt and changed 8512 * based on that. Also check for reordering. 8513 */ 8514 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); 8515 if (rack->app_limited_needs_set) 8516 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); 8517 changed += (nrsm->r_end - nrsm->r_start); 8518 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 8519 if (nrsm->r_flags & RACK_SACK_PASSED) { 8520 counter_u64_add(rack_reorder_seen, 1); 8521 rack->r_ctl.rc_reorder_ts = cts; 8522 } 8523 rack_log_map_chg(tp, rack, prev, &stack_map, rsm, MAP_SACK_M4, end, __LINE__); 8524 rsm = prev; 8525 counter_u64_add(rack_sack_used_prev_merge, 1); 8526 } else { 8527 /** 8528 * This is the case where our previous 8529 * block is not acked either, so we must 8530 * split the block in two. 8531 */ 8532 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 8533 if (nrsm == NULL) { 8534 /* failed rrs what can we do but loose the sack info? */ 8535 goto out; 8536 } 8537 /** 8538 * In this case nrsm becomes 8539 * nrsm->r_start = end; 8540 * nrsm->r_end = rsm->r_end; 8541 * which is un-acked. 8542 * <and> 8543 * rsm->r_end = nrsm->r_start; 8544 * i.e. the remaining un-acked 8545 * piece is left on the left 8546 * hand side. 8547 * 8548 * So we start like this 8549 * rsm |----------| (not acked) 8550 * sackblk |---| 8551 * build it so we have 8552 * rsm |---| (acked) 8553 * nrsm |------| (not acked) 8554 */ 8555 counter_u64_add(rack_sack_splits, 1); 8556 rack_clone_rsm(rack, nrsm, rsm, end); 8557 rsm->r_flags &= (~RACK_HAS_FIN); 8558 rsm->r_just_ret = 0; 8559 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 8560 #ifdef INVARIANTS 8561 if (insret != NULL) { 8562 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 8563 nrsm, insret, rack, rsm); 8564 } 8565 #endif 8566 if (rsm->r_in_tmap) { 8567 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 8568 nrsm->r_in_tmap = 1; 8569 } 8570 nrsm->r_dupack = 0; 8571 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); 8572 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); 8573 changed += (rsm->r_end - rsm->r_start); 8574 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 8575 if (rsm->r_in_tmap) /* should be true */ 8576 rack_log_sack_passed(tp, rack, rsm); 8577 /* Is Reordering occuring? 
*/ 8578 if (rsm->r_flags & RACK_SACK_PASSED) { 8579 rsm->r_flags &= ~RACK_SACK_PASSED; 8580 counter_u64_add(rack_reorder_seen, 1); 8581 rack->r_ctl.rc_reorder_ts = cts; 8582 } 8583 if (rack->app_limited_needs_set) 8584 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); 8585 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 8586 rsm->r_flags |= RACK_ACKED; 8587 rsm->r_flags &= ~RACK_TLP; 8588 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M5, end, __LINE__); 8589 if (rsm->r_in_tmap) { 8590 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 8591 rsm->r_in_tmap = 0; 8592 } 8593 } 8594 } else if (start != end){ 8595 /* 8596 * The block was already acked. 8597 */ 8598 counter_u64_add(rack_sack_skipped_acked, 1); 8599 moved++; 8600 } 8601 out: 8602 if (rsm && (rsm->r_flags & RACK_ACKED)) { 8603 /* 8604 * Now can we merge where we worked 8605 * with either the previous or 8606 * next block? 8607 */ 8608 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8609 while (next) { 8610 if (next->r_flags & RACK_ACKED) { 8611 /* yep this and next can be merged */ 8612 rsm = rack_merge_rsm(rack, rsm, next); 8613 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8614 } else 8615 break; 8616 } 8617 /* Now what about the previous? */ 8618 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8619 while (prev) { 8620 if (prev->r_flags & RACK_ACKED) { 8621 /* yep the previous and this can be merged */ 8622 rsm = rack_merge_rsm(rack, prev, rsm); 8623 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8624 } else 8625 break; 8626 } 8627 } 8628 if (used_ref == 0) { 8629 counter_u64_add(rack_sack_proc_all, 1); 8630 } else { 8631 counter_u64_add(rack_sack_proc_short, 1); 8632 } 8633 /* Save off the next one for quick reference. */ 8634 if (rsm) 8635 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8636 else 8637 nrsm = NULL; 8638 *prsm = rack->r_ctl.rc_sacklast = nrsm; 8639 /* Pass back the moved. */ 8640 *moved_two = moved; 8641 return (changed); 8642 } 8643 8644 static void inline 8645 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack) 8646 { 8647 struct rack_sendmap *tmap; 8648 8649 tmap = NULL; 8650 while (rsm && (rsm->r_flags & RACK_ACKED)) { 8651 /* Its no longer sacked, mark it so */ 8652 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 8653 #ifdef INVARIANTS 8654 if (rsm->r_in_tmap) { 8655 panic("rack:%p rsm:%p flags:0x%x in tmap?", 8656 rack, rsm, rsm->r_flags); 8657 } 8658 #endif 8659 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS); 8660 /* Rebuild it into our tmap */ 8661 if (tmap == NULL) { 8662 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 8663 tmap = rsm; 8664 } else { 8665 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext); 8666 tmap = rsm; 8667 } 8668 tmap->r_in_tmap = 1; 8669 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8670 } 8671 /* 8672 * Now lets possibly clear the sack filter so we start 8673 * recognizing sacks that cover this area. 
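 * (The filter has, in effect, been treating blocks in this region as
 * already seen; resetting it at th_ack lets SACKs that re-cover the
 * reneged data be processed again instead of being filtered out.)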
8674 */ 8675 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack); 8676 8677 } 8678 8679 static void 8680 rack_do_decay(struct tcp_rack *rack) 8681 { 8682 struct timeval res; 8683 8684 #define timersub(tvp, uvp, vvp) \ 8685 do { \ 8686 (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ 8687 (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ 8688 if ((vvp)->tv_usec < 0) { \ 8689 (vvp)->tv_sec--; \ 8690 (vvp)->tv_usec += 1000000; \ 8691 } \ 8692 } while (0) 8693 8694 timersub(&rack->r_ctl.act_rcv_time, &rack->r_ctl.rc_last_time_decay, &res); 8695 #undef timersub 8696 8697 rack->r_ctl.input_pkt++; 8698 if ((rack->rc_in_persist) || 8699 (res.tv_sec >= 1) || 8700 (rack->rc_tp->snd_max == rack->rc_tp->snd_una)) { 8701 /* 8702 * Check for decay of non-SAD, 8703 * we want all SAD detection metrics to 8704 * decay 1/4 per second (or more) passed. 8705 */ 8706 uint32_t pkt_delta; 8707 8708 pkt_delta = rack->r_ctl.input_pkt - rack->r_ctl.saved_input_pkt; 8709 /* Update our saved tracking values */ 8710 rack->r_ctl.saved_input_pkt = rack->r_ctl.input_pkt; 8711 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; 8712 /* Now do we escape without decay? */ 8713 #ifdef NETFLIX_EXP_DETECTION 8714 if (rack->rc_in_persist || 8715 (rack->rc_tp->snd_max == rack->rc_tp->snd_una) || 8716 (pkt_delta < tcp_sad_low_pps)){ 8717 /* 8718 * We don't decay idle connections 8719 * or ones that have a low input pps. 8720 */ 8721 return; 8722 } 8723 /* Decay the counters */ 8724 rack->r_ctl.ack_count = ctf_decay_count(rack->r_ctl.ack_count, 8725 tcp_sad_decay_val); 8726 rack->r_ctl.sack_count = ctf_decay_count(rack->r_ctl.sack_count, 8727 tcp_sad_decay_val); 8728 rack->r_ctl.sack_moved_extra = ctf_decay_count(rack->r_ctl.sack_moved_extra, 8729 tcp_sad_decay_val); 8730 rack->r_ctl.sack_noextra_move = ctf_decay_count(rack->r_ctl.sack_noextra_move, 8731 tcp_sad_decay_val); 8732 #endif 8733 } 8734 } 8735 8736 static void 8737 rack_process_to_cumack(struct tcpcb *tp, struct tcp_rack *rack, register uint32_t th_ack, uint32_t cts, struct tcpopt *to) 8738 { 8739 struct rack_sendmap *rsm, *rm; 8740 8741 /* 8742 * The ACK point is advancing to th_ack, we must drop off 8743 * the packets in the rack log and calculate any eligble 8744 * RTT's. 8745 */ 8746 rack->r_wanted_output = 1; 8747 more: 8748 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 8749 if (rsm == NULL) { 8750 if ((th_ack - 1) == tp->iss) { 8751 /* 8752 * For the SYN incoming case we will not 8753 * have called tcp_output for the sending of 8754 * the SYN, so there will be no map. All 8755 * other cases should probably be a panic. 8756 */ 8757 return; 8758 } 8759 if (tp->t_flags & TF_SENTFIN) { 8760 /* if we sent a FIN we often will not have map */ 8761 return; 8762 } 8763 #ifdef INVARIANTS 8764 panic("No rack map tp:%p for state:%d ack:%u rack:%p snd_una:%u snd_max:%u snd_nxt:%u\n", 8765 tp, 8766 tp->t_state, th_ack, rack, 8767 tp->snd_una, tp->snd_max, tp->snd_nxt); 8768 #endif 8769 return; 8770 } 8771 if (SEQ_LT(th_ack, rsm->r_start)) { 8772 /* Huh map is missing this */ 8773 #ifdef INVARIANTS 8774 printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n", 8775 rsm->r_start, 8776 th_ack, tp->t_state, rack->r_state); 8777 #endif 8778 return; 8779 } 8780 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED, th_ack); 8781 /* Now do we consume the whole thing? */ 8782 if (SEQ_GEQ(th_ack, rsm->r_end)) { 8783 /* Its all consumed. 
*/ 8784 uint32_t left; 8785 uint8_t newly_acked; 8786 8787 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_FREE, rsm->r_end, __LINE__); 8788 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 8789 rsm->r_rtr_bytes = 0; 8790 /* Record the time of highest cumack sent */ 8791 rack->r_ctl.rc_gp_cumack_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 8792 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8793 #ifdef INVARIANTS 8794 if (rm != rsm) { 8795 panic("removing head in rack:%p rsm:%p rm:%p", 8796 rack, rsm, rm); 8797 } 8798 #endif 8799 if (rsm->r_in_tmap) { 8800 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 8801 rsm->r_in_tmap = 0; 8802 } 8803 newly_acked = 1; 8804 if (rsm->r_flags & RACK_ACKED) { 8805 /* 8806 * It was acked on the scoreboard -- remove 8807 * it from total 8808 */ 8809 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 8810 newly_acked = 0; 8811 } else if (rsm->r_flags & RACK_SACK_PASSED) { 8812 /* 8813 * There are segments ACKED on the 8814 * scoreboard further up. We are seeing 8815 * reordering. 8816 */ 8817 rsm->r_flags &= ~RACK_SACK_PASSED; 8818 counter_u64_add(rack_reorder_seen, 1); 8819 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 8820 rsm->r_flags |= RACK_ACKED; 8821 rack->r_ctl.rc_reorder_ts = cts; 8822 if (rack->r_ent_rec_ns) { 8823 /* 8824 * We have sent no more, and we saw an sack 8825 * then ack arrive. 8826 */ 8827 rack->r_might_revert = 1; 8828 } 8829 } 8830 if ((rsm->r_flags & RACK_TO_REXT) && 8831 (tp->t_flags & TF_RCVD_TSTMP) && 8832 (to->to_flags & TOF_TS) && 8833 (tp->t_flags & TF_PREVVALID)) { 8834 /* 8835 * We can use the timestamp to see 8836 * if this retransmission was from the 8837 * first transmit. If so we made a mistake. 8838 */ 8839 tp->t_flags &= ~TF_PREVVALID; 8840 if (to->to_tsecr == rack_ts_to_msec(rsm->r_tim_lastsent[0])) { 8841 /* The first transmit is what this ack is for */ 8842 rack_cong_signal(tp, CC_RTO_ERR, th_ack); 8843 } 8844 } 8845 left = th_ack - rsm->r_end; 8846 if (rack->app_limited_needs_set && newly_acked) 8847 rack_need_set_test(tp, rack, rsm, th_ack, __LINE__, RACK_USE_END_OR_THACK); 8848 /* Free back to zone */ 8849 rack_free(rack, rsm); 8850 if (left) { 8851 goto more; 8852 } 8853 /* Check for reneging */ 8854 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 8855 if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) { 8856 /* 8857 * The peer has moved snd_una up to 8858 * the edge of this send, i.e. one 8859 * that it had previously acked. The only 8860 * way that can be true if the peer threw 8861 * away data (space issues) that it had 8862 * previously sacked (else it would have 8863 * given us snd_una up to (rsm->r_end). 8864 * We need to undo the acked markings here. 8865 * 8866 * Note we have to look to make sure th_ack is 8867 * our rsm->r_start in case we get an old ack 8868 * where th_ack is behind snd_una. 8869 */ 8870 rack_peer_reneges(rack, rsm, th_ack); 8871 } 8872 return; 8873 } 8874 if (rsm->r_flags & RACK_ACKED) { 8875 /* 8876 * It was acked on the scoreboard -- remove it from 8877 * total for the part being cum-acked. 8878 */ 8879 rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); 8880 } 8881 /* 8882 * Clear the dup ack count for 8883 * the piece that remains. 8884 */ 8885 rsm->r_dupack = 0; 8886 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 8887 if (rsm->r_rtr_bytes) { 8888 /* 8889 * It was retransmitted adjust the 8890 * sack holes for what was acked. 
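 * (rc_holes_rxt feeds the PRR pipe estimate later on, roughly
 * pipe = (snd_max - snd_una - rc_sacked) + rc_holes_rxt, so the
 * retransmitted bytes that are now cum-acked must be backed out or the
 * pipe would be overstated.)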
8891 */ 8892 int ack_am; 8893 8894 ack_am = (th_ack - rsm->r_start); 8895 if (ack_am >= rsm->r_rtr_bytes) { 8896 rack->r_ctl.rc_holes_rxt -= ack_am; 8897 rsm->r_rtr_bytes -= ack_am; 8898 } 8899 } 8900 /* 8901 * Update where the piece starts and record 8902 * the time of send of highest cumack sent. 8903 */ 8904 rack->r_ctl.rc_gp_cumack_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 8905 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_TRIM_HEAD, th_ack, __LINE__); 8906 /* Now we need to move our offset forward too */ 8907 if (rsm->m && (rsm->orig_m_len != rsm->m->m_len)) { 8908 /* Fix up the orig_m_len and possibly the mbuf offset */ 8909 rack_adjust_orig_mlen(rsm); 8910 } 8911 rsm->soff += (th_ack - rsm->r_start); 8912 rsm->r_start = th_ack; 8913 /* Now do we need to move the mbuf fwd too? */ 8914 if (rsm->m) { 8915 while (rsm->soff >= rsm->m->m_len) { 8916 rsm->soff -= rsm->m->m_len; 8917 rsm->m = rsm->m->m_next; 8918 KASSERT((rsm->m != NULL), 8919 (" nrsm:%p hit at soff:%u null m", 8920 rsm, rsm->soff)); 8921 } 8922 rsm->orig_m_len = rsm->m->m_len; 8923 } 8924 if (rack->app_limited_needs_set) 8925 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG); 8926 } 8927 8928 static void 8929 rack_handle_might_revert(struct tcpcb *tp, struct tcp_rack *rack) 8930 { 8931 struct rack_sendmap *rsm; 8932 int sack_pass_fnd = 0; 8933 8934 if (rack->r_might_revert) { 8935 /* 8936 * Ok we have reordering, have not sent anything, we 8937 * might want to revert the congestion state if nothing 8938 * further has SACK_PASSED on it. Lets check. 8939 * 8940 * We also get here when we have DSACKs come in for 8941 * all the data that we FR'd. Note that a rxt or tlp 8942 * timer clears this from happening. 8943 */ 8944 8945 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 8946 if (rsm->r_flags & RACK_SACK_PASSED) { 8947 sack_pass_fnd = 1; 8948 break; 8949 } 8950 } 8951 if (sack_pass_fnd == 0) { 8952 /* 8953 * We went into recovery 8954 * incorrectly due to reordering! 8955 */ 8956 int orig_cwnd; 8957 8958 rack->r_ent_rec_ns = 0; 8959 orig_cwnd = tp->snd_cwnd; 8960 tp->snd_cwnd = rack->r_ctl.rc_cwnd_at_erec; 8961 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at_erec; 8962 tp->snd_recover = tp->snd_una; 8963 rack_log_to_prr(rack, 14, orig_cwnd); 8964 EXIT_RECOVERY(tp->t_flags); 8965 } 8966 rack->r_might_revert = 0; 8967 } 8968 } 8969 8970 #ifdef NETFLIX_EXP_DETECTION 8971 static void 8972 rack_do_detection(struct tcpcb *tp, struct tcp_rack *rack, uint32_t bytes_this_ack, uint32_t segsiz) 8973 { 8974 if ((rack->do_detection || tcp_force_detection) && 8975 tcp_sack_to_ack_thresh && 8976 tcp_sack_to_move_thresh && 8977 ((rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum) || rack->sack_attack_disable)) { 8978 /* 8979 * We have thresholds set to find 8980 * possible attackers and disable sack. 8981 * Check them. 
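 * (Both ratios below are scaled by 1000. Illustrative example: 600 SACK
 * blocks counted against 200 qualifying ACKs yields an ackratio of 3000.
 * SACK processing is only disabled when both the ack ratio and the move
 * ratio exceed their respective thresholds.)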
8982 */ 8983 uint64_t ackratio, moveratio, movetotal; 8984 8985 /* Log detecting */ 8986 rack_log_sad(rack, 1); 8987 ackratio = (uint64_t)(rack->r_ctl.sack_count); 8988 ackratio *= (uint64_t)(1000); 8989 if (rack->r_ctl.ack_count) 8990 ackratio /= (uint64_t)(rack->r_ctl.ack_count); 8991 else { 8992 /* We really should not hit here */ 8993 ackratio = 1000; 8994 } 8995 if ((rack->sack_attack_disable == 0) && 8996 (ackratio > rack_highest_sack_thresh_seen)) 8997 rack_highest_sack_thresh_seen = (uint32_t)ackratio; 8998 movetotal = rack->r_ctl.sack_moved_extra; 8999 movetotal += rack->r_ctl.sack_noextra_move; 9000 moveratio = rack->r_ctl.sack_moved_extra; 9001 moveratio *= (uint64_t)1000; 9002 if (movetotal) 9003 moveratio /= movetotal; 9004 else { 9005 /* No moves, thats pretty good */ 9006 moveratio = 0; 9007 } 9008 if ((rack->sack_attack_disable == 0) && 9009 (moveratio > rack_highest_move_thresh_seen)) 9010 rack_highest_move_thresh_seen = (uint32_t)moveratio; 9011 if (rack->sack_attack_disable == 0) { 9012 if ((ackratio > tcp_sack_to_ack_thresh) && 9013 (moveratio > tcp_sack_to_move_thresh)) { 9014 /* Disable sack processing */ 9015 rack->sack_attack_disable = 1; 9016 if (rack->r_rep_attack == 0) { 9017 rack->r_rep_attack = 1; 9018 counter_u64_add(rack_sack_attacks_detected, 1); 9019 } 9020 if (tcp_attack_on_turns_on_logging) { 9021 /* 9022 * Turn on logging, used for debugging 9023 * false positives. 9024 */ 9025 rack->rc_tp->t_logstate = tcp_attack_on_turns_on_logging; 9026 } 9027 /* Clamp the cwnd at flight size */ 9028 rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd; 9029 rack->rc_tp->snd_cwnd = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 9030 rack_log_sad(rack, 2); 9031 } 9032 } else { 9033 /* We are sack-disabled check for false positives */ 9034 if ((ackratio <= tcp_restoral_thresh) || 9035 (rack->r_ctl.rc_num_maps_alloced < tcp_map_minimum)) { 9036 rack->sack_attack_disable = 0; 9037 rack_log_sad(rack, 3); 9038 /* Restart counting */ 9039 rack->r_ctl.sack_count = 0; 9040 rack->r_ctl.sack_moved_extra = 0; 9041 rack->r_ctl.sack_noextra_move = 1; 9042 rack->r_ctl.ack_count = max(1, 9043 (bytes_this_ack / segsiz)); 9044 9045 if (rack->r_rep_reverse == 0) { 9046 rack->r_rep_reverse = 1; 9047 counter_u64_add(rack_sack_attacks_reversed, 1); 9048 } 9049 /* Restore the cwnd */ 9050 if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd) 9051 rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd; 9052 } 9053 } 9054 } 9055 } 9056 #endif 9057 9058 static void 9059 rack_note_dsack(struct tcp_rack *rack, tcp_seq start, tcp_seq end) 9060 { 9061 9062 uint32_t am; 9063 9064 if (SEQ_GT(end, start)) 9065 am = end - start; 9066 else 9067 am = 0; 9068 /* 9069 * We keep track of how many DSACK blocks we get 9070 * after a recovery incident. 9071 */ 9072 rack->r_ctl.dsack_byte_cnt += am; 9073 if (!IN_FASTRECOVERY(rack->rc_tp->t_flags) && 9074 rack->r_ctl.retran_during_recovery && 9075 (rack->r_ctl.dsack_byte_cnt >= rack->r_ctl.retran_during_recovery)) { 9076 /* 9077 * False recovery most likely culprit is reordering. If 9078 * nothing else is missing we need to revert. 
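 * (Once the DSACKed byte count covers everything we retransmitted during
 * this recovery episode, each of those retransmissions was for data the
 * peer already had, so the congestion response can be reverted via
 * rack_handle_might_revert().)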
9079 */ 9080 rack->r_might_revert = 1; 9081 rack_handle_might_revert(rack->rc_tp, rack); 9082 rack->r_might_revert = 0; 9083 rack->r_ctl.retran_during_recovery = 0; 9084 rack->r_ctl.dsack_byte_cnt = 0; 9085 } 9086 } 9087 9088 static void 9089 rack_update_prr(struct tcpcb *tp, struct tcp_rack *rack, uint32_t changed, tcp_seq th_ack) 9090 { 9091 /* Deal with changed and PRR here (in recovery only) */ 9092 uint32_t pipe, snd_una; 9093 9094 rack->r_ctl.rc_prr_delivered += changed; 9095 9096 if (sbavail(&rack->rc_inp->inp_socket->so_snd) <= (tp->snd_max - tp->snd_una)) { 9097 /* 9098 * It is all outstanding, we are application limited 9099 * and thus we don't need more room to send anything. 9100 * Note we use tp->snd_una here and not th_ack because 9101 * the data as yet not been cut from the sb. 9102 */ 9103 rack->r_ctl.rc_prr_sndcnt = 0; 9104 return; 9105 } 9106 /* Compute prr_sndcnt */ 9107 if (SEQ_GT(tp->snd_una, th_ack)) { 9108 snd_una = tp->snd_una; 9109 } else { 9110 snd_una = th_ack; 9111 } 9112 pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt; 9113 if (pipe > tp->snd_ssthresh) { 9114 long sndcnt; 9115 9116 sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh; 9117 if (rack->r_ctl.rc_prr_recovery_fs > 0) 9118 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; 9119 else { 9120 rack->r_ctl.rc_prr_sndcnt = 0; 9121 rack_log_to_prr(rack, 9, 0); 9122 sndcnt = 0; 9123 } 9124 sndcnt++; 9125 if (sndcnt > (long)rack->r_ctl.rc_prr_out) 9126 sndcnt -= rack->r_ctl.rc_prr_out; 9127 else 9128 sndcnt = 0; 9129 rack->r_ctl.rc_prr_sndcnt = sndcnt; 9130 rack_log_to_prr(rack, 10, 0); 9131 } else { 9132 uint32_t limit; 9133 9134 if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out) 9135 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out); 9136 else 9137 limit = 0; 9138 if (changed > limit) 9139 limit = changed; 9140 limit += ctf_fixed_maxseg(tp); 9141 if (tp->snd_ssthresh > pipe) { 9142 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); 9143 rack_log_to_prr(rack, 11, 0); 9144 } else { 9145 rack->r_ctl.rc_prr_sndcnt = min(0, limit); 9146 rack_log_to_prr(rack, 12, 0); 9147 } 9148 } 9149 } 9150 9151 static void 9152 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered_recovery, int dup_ack_struck) 9153 { 9154 uint32_t changed; 9155 struct tcp_rack *rack; 9156 struct rack_sendmap *rsm; 9157 struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; 9158 register uint32_t th_ack; 9159 int32_t i, j, k, num_sack_blks = 0; 9160 uint32_t cts, acked, ack_point, sack_changed = 0; 9161 int loop_start = 0, moved_two = 0; 9162 uint32_t tsused; 9163 9164 9165 INP_WLOCK_ASSERT(tp->t_inpcb); 9166 if (th->th_flags & TH_RST) { 9167 /* We don't log resets */ 9168 return; 9169 } 9170 rack = (struct tcp_rack *)tp->t_fb_ptr; 9171 cts = tcp_get_usecs(NULL); 9172 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 9173 changed = 0; 9174 th_ack = th->th_ack; 9175 if (rack->sack_attack_disable == 0) 9176 rack_do_decay(rack); 9177 if (BYTES_THIS_ACK(tp, th) >= ctf_fixed_maxseg(rack->rc_tp)) { 9178 /* 9179 * You only get credit for 9180 * MSS and greater (and you get extra 9181 * credit for larger cum-ack moves). 9182 */ 9183 int ac; 9184 9185 ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp); 9186 rack->r_ctl.ack_count += ac; 9187 counter_u64_add(rack_ack_total, ac); 9188 } 9189 if (rack->r_ctl.ack_count > 0xfff00000) { 9190 /* 9191 * reduce the number to keep us under 9192 * a uint32_t. 
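 * (Halving both counters preserves their ratio, which is all the
 * SACK-attack detection logic compares, while keeping the raw values well
 * away from uint32_t overflow.)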
9193 */ 9194 rack->r_ctl.ack_count /= 2; 9195 rack->r_ctl.sack_count /= 2; 9196 } 9197 if (SEQ_GT(th_ack, tp->snd_una)) { 9198 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__); 9199 tp->t_acktime = ticks; 9200 } 9201 if (rsm && SEQ_GT(th_ack, rsm->r_start)) 9202 changed = th_ack - rsm->r_start; 9203 if (changed) { 9204 rack_process_to_cumack(tp, rack, th_ack, cts, to); 9205 } 9206 if ((to->to_flags & TOF_SACK) == 0) { 9207 /* We are done nothing left and no sack. */ 9208 rack_handle_might_revert(tp, rack); 9209 /* 9210 * For cases where we struck a dup-ack 9211 * with no SACK, add to the changes so 9212 * PRR will work right. 9213 */ 9214 if (dup_ack_struck && (changed == 0)) { 9215 changed += ctf_fixed_maxseg(rack->rc_tp); 9216 } 9217 goto out; 9218 } 9219 /* Sack block processing */ 9220 if (SEQ_GT(th_ack, tp->snd_una)) 9221 ack_point = th_ack; 9222 else 9223 ack_point = tp->snd_una; 9224 for (i = 0; i < to->to_nsacks; i++) { 9225 bcopy((to->to_sacks + i * TCPOLEN_SACK), 9226 &sack, sizeof(sack)); 9227 sack.start = ntohl(sack.start); 9228 sack.end = ntohl(sack.end); 9229 if (SEQ_GT(sack.end, sack.start) && 9230 SEQ_GT(sack.start, ack_point) && 9231 SEQ_LT(sack.start, tp->snd_max) && 9232 SEQ_GT(sack.end, ack_point) && 9233 SEQ_LEQ(sack.end, tp->snd_max)) { 9234 sack_blocks[num_sack_blks] = sack; 9235 num_sack_blks++; 9236 #ifdef NETFLIX_STATS 9237 } else if (SEQ_LEQ(sack.start, th_ack) && 9238 SEQ_LEQ(sack.end, th_ack)) { 9239 /* 9240 * Its a D-SACK block. 9241 */ 9242 tcp_record_dsack(sack.start, sack.end); 9243 #endif 9244 rack_note_dsack(rack, sack.start, sack.end); 9245 } 9246 } 9247 /* 9248 * Sort the SACK blocks so we can update the rack scoreboard with 9249 * just one pass. 9250 */ 9251 num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, 9252 num_sack_blks, th->th_ack); 9253 ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks); 9254 if (num_sack_blks == 0) { 9255 /* Nothing to sack (DSACKs?) */ 9256 goto out_with_totals; 9257 } 9258 if (num_sack_blks < 2) { 9259 /* Only one, we don't need to sort */ 9260 goto do_sack_work; 9261 } 9262 /* Sort the sacks */ 9263 for (i = 0; i < num_sack_blks; i++) { 9264 for (j = i + 1; j < num_sack_blks; j++) { 9265 if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { 9266 sack = sack_blocks[i]; 9267 sack_blocks[i] = sack_blocks[j]; 9268 sack_blocks[j] = sack; 9269 } 9270 } 9271 } 9272 /* 9273 * Now are any of the sack block ends the same (yes some 9274 * implementations send these)? 9275 */ 9276 again: 9277 if (num_sack_blks == 0) 9278 goto out_with_totals; 9279 if (num_sack_blks > 1) { 9280 for (i = 0; i < num_sack_blks; i++) { 9281 for (j = i + 1; j < num_sack_blks; j++) { 9282 if (sack_blocks[i].end == sack_blocks[j].end) { 9283 /* 9284 * Ok these two have the same end we 9285 * want the smallest end and then 9286 * throw away the larger and start 9287 * again. 9288 */ 9289 if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) { 9290 /* 9291 * The second block covers 9292 * more area use that 9293 */ 9294 sack_blocks[i].start = sack_blocks[j].start; 9295 } 9296 /* 9297 * Now collapse out the dup-sack and 9298 * lower the count 9299 */ 9300 for (k = (j + 1); k < num_sack_blks; k++) { 9301 sack_blocks[j].start = sack_blocks[k].start; 9302 sack_blocks[j].end = sack_blocks[k].end; 9303 j++; 9304 } 9305 num_sack_blks--; 9306 goto again; 9307 } 9308 } 9309 } 9310 } 9311 do_sack_work: 9312 /* 9313 * First lets look to see if 9314 * we have retransmitted and 9315 * can use the transmit next? 
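 * (The head of rc_tmap is the oldest transmission still awaiting an ack;
 * if the first, lowest SACK block overlaps it we are most likely seeing
 * the ack for our fast-retransmit and can process that block directly
 * rather than searching the RB tree from scratch.)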
9316 */ 9317 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 9318 if (rsm && 9319 SEQ_GT(sack_blocks[0].end, rsm->r_start) && 9320 SEQ_LT(sack_blocks[0].start, rsm->r_end)) { 9321 /* 9322 * We probably did the FR and the next 9323 * SACK in continues as we would expect. 9324 */ 9325 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &moved_two); 9326 if (acked) { 9327 rack->r_wanted_output = 1; 9328 changed += acked; 9329 sack_changed += acked; 9330 } 9331 if (num_sack_blks == 1) { 9332 /* 9333 * This is what we would expect from 9334 * a normal implementation to happen 9335 * after we have retransmitted the FR, 9336 * i.e the sack-filter pushes down 9337 * to 1 block and the next to be retransmitted 9338 * is the sequence in the sack block (has more 9339 * are acked). Count this as ACK'd data to boost 9340 * up the chances of recovering any false positives. 9341 */ 9342 rack->r_ctl.ack_count += (acked / ctf_fixed_maxseg(rack->rc_tp)); 9343 counter_u64_add(rack_ack_total, (acked / ctf_fixed_maxseg(rack->rc_tp))); 9344 counter_u64_add(rack_express_sack, 1); 9345 if (rack->r_ctl.ack_count > 0xfff00000) { 9346 /* 9347 * reduce the number to keep us under 9348 * a uint32_t. 9349 */ 9350 rack->r_ctl.ack_count /= 2; 9351 rack->r_ctl.sack_count /= 2; 9352 } 9353 goto out_with_totals; 9354 } else { 9355 /* 9356 * Start the loop through the 9357 * rest of blocks, past the first block. 9358 */ 9359 moved_two = 0; 9360 loop_start = 1; 9361 } 9362 } 9363 /* Its a sack of some sort */ 9364 rack->r_ctl.sack_count++; 9365 if (rack->r_ctl.sack_count > 0xfff00000) { 9366 /* 9367 * reduce the number to keep us under 9368 * a uint32_t. 9369 */ 9370 rack->r_ctl.ack_count /= 2; 9371 rack->r_ctl.sack_count /= 2; 9372 } 9373 counter_u64_add(rack_sack_total, 1); 9374 if (rack->sack_attack_disable) { 9375 /* An attacker disablement is in place */ 9376 if (num_sack_blks > 1) { 9377 rack->r_ctl.sack_count += (num_sack_blks - 1); 9378 rack->r_ctl.sack_moved_extra++; 9379 counter_u64_add(rack_move_some, 1); 9380 if (rack->r_ctl.sack_moved_extra > 0xfff00000) { 9381 rack->r_ctl.sack_moved_extra /= 2; 9382 rack->r_ctl.sack_noextra_move /= 2; 9383 } 9384 } 9385 goto out; 9386 } 9387 rsm = rack->r_ctl.rc_sacklast; 9388 for (i = loop_start; i < num_sack_blks; i++) { 9389 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &moved_two); 9390 if (acked) { 9391 rack->r_wanted_output = 1; 9392 changed += acked; 9393 sack_changed += acked; 9394 } 9395 if (moved_two) { 9396 /* 9397 * If we did not get a SACK for at least a MSS and 9398 * had to move at all, or if we moved more than our 9399 * threshold, it counts against the "extra" move. 9400 */ 9401 rack->r_ctl.sack_moved_extra += moved_two; 9402 counter_u64_add(rack_move_some, 1); 9403 } else { 9404 /* 9405 * else we did not have to move 9406 * any more than we would expect. 9407 */ 9408 rack->r_ctl.sack_noextra_move++; 9409 counter_u64_add(rack_move_none, 1); 9410 } 9411 if (moved_two && (acked < ctf_fixed_maxseg(rack->rc_tp))) { 9412 /* 9413 * If the SACK was not a full MSS then 9414 * we add to sack_count the number of 9415 * MSS's (or possibly more than 9416 * a MSS if its a TSO send) we had to skip by. 9417 */ 9418 rack->r_ctl.sack_count += moved_two; 9419 counter_u64_add(rack_sack_total, moved_two); 9420 } 9421 /* 9422 * Now we need to setup for the next 9423 * round. First we make sure we won't 9424 * exceed the size of our uint32_t on 9425 * the various counts, and then clear out 9426 * moved_two. 
9427 */ 9428 if ((rack->r_ctl.sack_moved_extra > 0xfff00000) || 9429 (rack->r_ctl.sack_noextra_move > 0xfff00000)) { 9430 rack->r_ctl.sack_moved_extra /= 2; 9431 rack->r_ctl.sack_noextra_move /= 2; 9432 } 9433 if (rack->r_ctl.sack_count > 0xfff00000) { 9434 rack->r_ctl.ack_count /= 2; 9435 rack->r_ctl.sack_count /= 2; 9436 } 9437 moved_two = 0; 9438 } 9439 out_with_totals: 9440 if (num_sack_blks > 1) { 9441 /* 9442 * You get an extra stroke if 9443 * you have more than one sack-blk, this 9444 * could be where we are skipping forward 9445 * and the sack-filter is still working, or 9446 * it could be an attacker constantly 9447 * moving us. 9448 */ 9449 rack->r_ctl.sack_moved_extra++; 9450 counter_u64_add(rack_move_some, 1); 9451 } 9452 out: 9453 #ifdef NETFLIX_EXP_DETECTION 9454 rack_do_detection(tp, rack, BYTES_THIS_ACK(tp, th), ctf_fixed_maxseg(rack->rc_tp)); 9455 #endif 9456 if (changed) { 9457 /* Something changed cancel the rack timer */ 9458 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 9459 } 9460 tsused = tcp_get_usecs(NULL); 9461 rsm = tcp_rack_output(tp, rack, tsused); 9462 if ((!IN_FASTRECOVERY(tp->t_flags)) && 9463 rsm) { 9464 /* Enter recovery */ 9465 rack->r_ctl.rc_rsm_start = rsm->r_start; 9466 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 9467 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 9468 entered_recovery = 1; 9469 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una); 9470 /* 9471 * When we enter recovery we need to assure we send 9472 * one packet. 9473 */ 9474 if (rack->rack_no_prr == 0) { 9475 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 9476 rack_log_to_prr(rack, 8, 0); 9477 } 9478 rack->r_timer_override = 1; 9479 rack->r_early = 0; 9480 rack->r_ctl.rc_agg_early = 0; 9481 } else if (IN_FASTRECOVERY(tp->t_flags) && 9482 rsm && 9483 (rack->r_rr_config == 3)) { 9484 /* 9485 * Assure we can output and we get no 9486 * remembered pace time except the retransmit. 9487 */ 9488 rack->r_timer_override = 1; 9489 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 9490 rack->r_ctl.rc_resend = rsm; 9491 } 9492 if (IN_FASTRECOVERY(tp->t_flags) && 9493 (rack->rack_no_prr == 0) && 9494 (entered_recovery == 0)) { 9495 rack_update_prr(tp, rack, changed, th_ack); 9496 if ((rsm && (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) && 9497 ((rack->rc_inp->inp_in_hpts == 0) && 9498 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)))) { 9499 /* 9500 * If you are pacing output you don't want 9501 * to override. 9502 */ 9503 rack->r_early = 0; 9504 rack->r_ctl.rc_agg_early = 0; 9505 rack->r_timer_override = 1; 9506 } 9507 } 9508 } 9509 9510 static void 9511 rack_strike_dupack(struct tcp_rack *rack) 9512 { 9513 struct rack_sendmap *rsm; 9514 9515 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 9516 while (rsm && (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { 9517 rsm = TAILQ_NEXT(rsm, r_tnext); 9518 } 9519 if (rsm && (rsm->r_dupack < 0xff)) { 9520 rsm->r_dupack++; 9521 if (rsm->r_dupack >= DUP_ACK_THRESHOLD) { 9522 struct timeval tv; 9523 uint32_t cts; 9524 /* 9525 * Here we see if we need to retransmit. For 9526 * a SACK type connection if enough time has passed 9527 * we will get a return of the rsm. For a non-sack 9528 * connection we will get the rsm returned if the 9529 * dupack value is 3 or more. 
9530 */ 9531 cts = tcp_get_usecs(&tv); 9532 rack->r_ctl.rc_resend = tcp_rack_output(rack->rc_tp, rack, cts); 9533 if (rack->r_ctl.rc_resend != NULL) { 9534 if (!IN_FASTRECOVERY(rack->rc_tp->t_flags)) { 9535 rack_cong_signal(rack->rc_tp, CC_NDUPACK, 9536 rack->rc_tp->snd_una); 9537 } 9538 rack->r_wanted_output = 1; 9539 rack->r_timer_override = 1; 9540 rack_log_retran_reason(rack, rsm, __LINE__, 1, 3); 9541 } 9542 } else { 9543 rack_log_retran_reason(rack, rsm, __LINE__, 0, 3); 9544 } 9545 } 9546 } 9547 9548 static void 9549 rack_check_bottom_drag(struct tcpcb *tp, 9550 struct tcp_rack *rack, 9551 struct socket *so, int32_t acked) 9552 { 9553 uint32_t segsiz, minseg; 9554 9555 segsiz = ctf_fixed_maxseg(tp); 9556 minseg = segsiz; 9557 9558 if (tp->snd_max == tp->snd_una) { 9559 /* 9560 * We are doing dynamic pacing and we are way 9561 * under. Basically everything got acked while 9562 * we were still waiting on the pacer to expire. 9563 * 9564 * This means we need to boost the b/w in 9565 * addition to any earlier boosting of 9566 * the multipler. 9567 */ 9568 rack->rc_dragged_bottom = 1; 9569 rack_validate_multipliers_at_or_above100(rack); 9570 /* 9571 * Lets use the segment bytes acked plus 9572 * the lowest RTT seen as the basis to 9573 * form a b/w estimate. This will be off 9574 * due to the fact that the true estimate 9575 * should be around 1/2 the time of the RTT 9576 * but we can settle for that. 9577 */ 9578 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_VALID) && 9579 acked) { 9580 uint64_t bw, calc_bw, rtt; 9581 9582 rtt = rack->r_ctl.rack_rs.rs_us_rtt; 9583 if (rtt == 0) { 9584 /* no us sample is there a ms one? */ 9585 if (rack->r_ctl.rack_rs.rs_rtt_lowest) { 9586 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; 9587 } else { 9588 goto no_measurement; 9589 } 9590 } 9591 bw = acked; 9592 calc_bw = bw * 1000000; 9593 calc_bw /= rtt; 9594 if (rack->r_ctl.last_max_bw && 9595 (rack->r_ctl.last_max_bw < calc_bw)) { 9596 /* 9597 * If we have a last calculated max bw 9598 * enforce it. 9599 */ 9600 calc_bw = rack->r_ctl.last_max_bw; 9601 } 9602 /* now plop it in */ 9603 if (rack->rc_gp_filled == 0) { 9604 if (calc_bw > ONE_POINT_TWO_MEG) { 9605 /* 9606 * If we have no measurement 9607 * don't let us set in more than 9608 * 1.2Mbps. If we are still too 9609 * low after pacing with this we 9610 * will hopefully have a max b/w 9611 * available to sanity check things. 9612 */ 9613 calc_bw = ONE_POINT_TWO_MEG; 9614 } 9615 rack->r_ctl.rc_rtt_diff = 0; 9616 rack->r_ctl.gp_bw = calc_bw; 9617 rack->rc_gp_filled = 1; 9618 if (rack->r_ctl.num_measurements < RACK_REQ_AVG) 9619 rack->r_ctl.num_measurements = RACK_REQ_AVG; 9620 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 9621 } else if (calc_bw > rack->r_ctl.gp_bw) { 9622 rack->r_ctl.rc_rtt_diff = 0; 9623 if (rack->r_ctl.num_measurements < RACK_REQ_AVG) 9624 rack->r_ctl.num_measurements = RACK_REQ_AVG; 9625 rack->r_ctl.gp_bw = calc_bw; 9626 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 9627 } else 9628 rack_increase_bw_mul(rack, -1, 0, 0, 1); 9629 if ((rack->gp_ready == 0) && 9630 (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) { 9631 /* We have enough measurements now */ 9632 rack->gp_ready = 1; 9633 rack_set_cc_pacing(rack); 9634 if (rack->defer_options) 9635 rack_apply_deferred_options(rack); 9636 } 9637 /* 9638 * For acks over 1mss we do a extra boost to simulate 9639 * where we would get 2 acks (we want 110 for the mul). 
9640 */ 9641 if (acked > segsiz) 9642 rack_increase_bw_mul(rack, -1, 0, 0, 1); 9643 } else { 9644 /* 9645 * zero rtt possibly?, settle for just an old increase. 9646 */ 9647 no_measurement: 9648 rack_increase_bw_mul(rack, -1, 0, 0, 1); 9649 } 9650 } else if ((IN_FASTRECOVERY(tp->t_flags) == 0) && 9651 (sbavail(&so->so_snd) > max((segsiz * (4 + rack_req_segs)), 9652 minseg)) && 9653 (rack->r_ctl.cwnd_to_use > max((segsiz * (rack_req_segs + 2)), minseg)) && 9654 (tp->snd_wnd > max((segsiz * (rack_req_segs + 2)), minseg)) && 9655 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) <= 9656 (segsiz * rack_req_segs))) { 9657 /* 9658 * We are doing dynamic GP pacing and 9659 * we have everything except 1MSS or less 9660 * bytes left out. We are still pacing away. 9661 * And there is data that could be sent, This 9662 * means we are inserting delayed ack time in 9663 * our measurements because we are pacing too slow. 9664 */ 9665 rack_validate_multipliers_at_or_above100(rack); 9666 rack->rc_dragged_bottom = 1; 9667 rack_increase_bw_mul(rack, -1, 0, 0, 1); 9668 } 9669 } 9670 9671 9672 9673 static void 9674 rack_gain_for_fastoutput(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t acked_amount) 9675 { 9676 /* 9677 * The fast output path is enabled and we 9678 * have moved the cumack forward. Lets see if 9679 * we can expand forward the fast path length by 9680 * that amount. What we would ideally like to 9681 * do is increase the number of bytes in the 9682 * fast path block (left_to_send) by the 9683 * acked amount. However we have to gate that 9684 * by two factors: 9685 * 1) The amount outstanding and the rwnd of the peer 9686 * (i.e. we don't want to exceed the rwnd of the peer). 9687 * <and> 9688 * 2) The amount of data left in the socket buffer (i.e. 9689 * we can't send beyond what is in the buffer). 9690 * 9691 * Note that this does not take into account any increase 9692 * in the cwnd. We will only extend the fast path by 9693 * what was acked. 9694 */ 9695 uint32_t new_total, gating_val; 9696 9697 new_total = acked_amount + rack->r_ctl.fsb.left_to_send; 9698 gating_val = min((sbavail(&so->so_snd) - (tp->snd_max - tp->snd_una)), 9699 (tp->snd_wnd - (tp->snd_max - tp->snd_una))); 9700 if (new_total <= gating_val) { 9701 /* We can increase left_to_send by the acked amount */ 9702 counter_u64_add(rack_extended_rfo, 1); 9703 rack->r_ctl.fsb.left_to_send = new_total; 9704 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(&rack->rc_inp->inp_socket->so_snd) - (tp->snd_max - tp->snd_una))), 9705 ("rack:%p left_to_send:%u sbavail:%u out:%u", 9706 rack, rack->r_ctl.fsb.left_to_send, 9707 sbavail(&rack->rc_inp->inp_socket->so_snd), 9708 (tp->snd_max - tp->snd_una))); 9709 9710 } 9711 } 9712 9713 static void 9714 rack_adjust_sendmap(struct tcp_rack *rack, struct sockbuf *sb, tcp_seq snd_una) 9715 { 9716 /* 9717 * Here any sendmap entry that points to the 9718 * beginning mbuf must be adjusted to the correct 9719 * offset. This must be called with: 9720 * 1) The socket buffer locked 9721 * 2) snd_una adjusted to its new postion. 9722 * 9723 * Note that (2) implies rack_ack_received has also 9724 * been called. 9725 * 9726 * We grab the first mbuf in the socket buffer and 9727 * then go through the front of the sendmap, recalculating 9728 * the stored offset for any sendmap entry that has 9729 * that mbuf. We must use the sb functions to do this 9730 * since its possible an add was done has well as 9731 * the subtraction we may have just completed. 
This should 9732 * not be a penalty though, since we just referenced the sb 9733 * to go in and trim off the mbufs that we freed (of course 9734 * there will be a penalty for the sendmap references though). 9735 */ 9736 struct mbuf *m; 9737 struct rack_sendmap *rsm; 9738 9739 SOCKBUF_LOCK_ASSERT(sb); 9740 m = sb->sb_mb; 9741 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 9742 if ((rsm == NULL) || (m == NULL)) { 9743 /* Nothing outstanding */ 9744 return; 9745 } 9746 while (rsm->m && (rsm->m == m)) { 9747 /* one to adjust */ 9748 #ifdef INVARIANTS 9749 struct mbuf *tm; 9750 uint32_t soff; 9751 9752 tm = sbsndmbuf(sb, (rsm->r_start - snd_una), &soff); 9753 if (rsm->orig_m_len != m->m_len) { 9754 rack_adjust_orig_mlen(rsm); 9755 } 9756 if (rsm->soff != soff) { 9757 /* 9758 * This is not a fatal error, we anticipate it 9759 * might happen (the else code), so we count it here 9760 * so that under invariant we can see that it really 9761 * does happen. 9762 */ 9763 counter_u64_add(rack_adjust_map_bw, 1); 9764 } 9765 rsm->m = tm; 9766 rsm->soff = soff; 9767 if (tm) 9768 rsm->orig_m_len = rsm->m->m_len; 9769 else 9770 rsm->orig_m_len = 0; 9771 #else 9772 rsm->m = sbsndmbuf(sb, (rsm->r_start - snd_una), &rsm->soff); 9773 if (rsm->m) 9774 rsm->orig_m_len = rsm->m->m_len; 9775 else 9776 rsm->orig_m_len = 0; 9777 #endif 9778 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, 9779 rsm); 9780 if (rsm == NULL) 9781 break; 9782 } 9783 } 9784 9785 /* 9786 * Return value of 1, we do not need to call rack_process_data(). 9787 * return value of 0, rack_process_data can be called. 9788 * For ret_val if its 0 the TCP is locked, if its non-zero 9789 * its unlocked and probably unsafe to touch the TCB. 9790 */ 9791 static int 9792 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, 9793 struct tcpcb *tp, struct tcpopt *to, 9794 uint32_t tiwin, int32_t tlen, 9795 int32_t * ofia, int32_t thflags, int32_t *ret_val) 9796 { 9797 int32_t ourfinisacked = 0; 9798 int32_t nsegs, acked_amount; 9799 int32_t acked; 9800 struct mbuf *mfree; 9801 struct tcp_rack *rack; 9802 int32_t under_pacing = 0; 9803 int32_t recovery = 0; 9804 9805 rack = (struct tcp_rack *)tp->t_fb_ptr; 9806 if (SEQ_GT(th->th_ack, tp->snd_max)) { 9807 __ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val, 9808 &rack->r_ctl.challenge_ack_ts, 9809 &rack->r_ctl.challenge_ack_cnt); 9810 rack->r_wanted_output = 1; 9811 return (1); 9812 } 9813 if (rack->gp_ready && 9814 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 9815 under_pacing = 1; 9816 } 9817 if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { 9818 int in_rec, dup_ack_struck = 0; 9819 9820 in_rec = IN_FASTRECOVERY(tp->t_flags); 9821 if (rack->rc_in_persist) { 9822 tp->t_rxtshift = 0; 9823 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 9824 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 9825 } 9826 if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd)) { 9827 rack_strike_dupack(rack); 9828 dup_ack_struck = 1; 9829 } 9830 rack_log_ack(tp, to, th, ((in_rec == 0) && IN_FASTRECOVERY(tp->t_flags)), dup_ack_struck); 9831 } 9832 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 9833 /* 9834 * Old ack, behind (or duplicate to) the last one rcv'd 9835 * Note: We mark reordering is occuring if its 9836 * less than and we have not closed our window. 
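 * (An ack strictly below snd_una, while our advertised window still has
 * room, most likely arrived out of order in the network, so we record the
 * time in rc_reorder_ts rather than treating it as new information.)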
9837 */ 9838 if (SEQ_LT(th->th_ack, tp->snd_una) && (sbspace(&so->so_rcv) > ctf_fixed_maxseg(tp))) { 9839 counter_u64_add(rack_reorder_seen, 1); 9840 rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 9841 } 9842 return (0); 9843 } 9844 /* 9845 * If we reach this point, ACK is not a duplicate, i.e., it ACKs 9846 * something we sent. 9847 */ 9848 if (tp->t_flags & TF_NEEDSYN) { 9849 /* 9850 * T/TCP: Connection was half-synchronized, and our SYN has 9851 * been ACK'd (so connection is now fully synchronized). Go 9852 * to non-starred state, increment snd_una for ACK of SYN, 9853 * and check if we can do window scaling. 9854 */ 9855 tp->t_flags &= ~TF_NEEDSYN; 9856 tp->snd_una++; 9857 /* Do window scaling? */ 9858 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 9859 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 9860 tp->rcv_scale = tp->request_r_scale; 9861 /* Send window already scaled. */ 9862 } 9863 } 9864 nsegs = max(1, m->m_pkthdr.lro_nsegs); 9865 INP_WLOCK_ASSERT(tp->t_inpcb); 9866 9867 acked = BYTES_THIS_ACK(tp, th); 9868 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 9869 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 9870 /* 9871 * If we just performed our first retransmit, and the ACK arrives 9872 * within our recovery window, then it was a mistake to do the 9873 * retransmit in the first place. Recover our original cwnd and 9874 * ssthresh, and proceed to transmit where we left off. 9875 */ 9876 if ((tp->t_flags & TF_PREVVALID) && 9877 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) { 9878 tp->t_flags &= ~TF_PREVVALID; 9879 if (tp->t_rxtshift == 1 && 9880 (int)(ticks - tp->t_badrxtwin) < 0) 9881 rack_cong_signal(tp, CC_RTO_ERR, th->th_ack); 9882 } 9883 if (acked) { 9884 /* assure we are not backed off */ 9885 tp->t_rxtshift = 0; 9886 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 9887 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 9888 rack->rc_tlp_in_progress = 0; 9889 rack->r_ctl.rc_tlp_cnt_out = 0; 9890 /* 9891 * If it is the RXT timer we want to 9892 * stop it, so we can restart a TLP. 9893 */ 9894 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 9895 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 9896 #ifdef NETFLIX_HTTP_LOGGING 9897 tcp_http_check_for_comp(rack->rc_tp, th->th_ack); 9898 #endif 9899 } 9900 /* 9901 * If we have a timestamp reply, update smoothed round trip time. If 9902 * no timestamp is present but transmit timer is running and timed 9903 * sequence number was acked, update smoothed round trip time. Since 9904 * we now have an rtt measurement, cancel the timer backoff (cf., 9905 * Phil Karn's retransmit alg.). Recompute the initial retransmit 9906 * timer. 9907 * 9908 * Some boxes send broken timestamp replies during the SYN+ACK 9909 * phase, ignore timestamps of 0 or we could calculate a huge RTT 9910 * and blow up the retransmit timer. 9911 */ 9912 /* 9913 * If all outstanding data is acked, stop retransmit timer and 9914 * remember to restart (more output or persist). If there is more 9915 * data to be acked, restart retransmit timer, using current 9916 * (possibly backed-off) value. 9917 */ 9918 if (acked == 0) { 9919 if (ofia) 9920 *ofia = ourfinisacked; 9921 return (0); 9922 } 9923 if (IN_RECOVERY(tp->t_flags)) { 9924 if (SEQ_LT(th->th_ack, tp->snd_recover) && 9925 (SEQ_LT(th->th_ack, tp->snd_max))) { 9926 tcp_rack_partialack(tp); 9927 } else { 9928 rack_post_recovery(tp, th->th_ack); 9929 recovery = 1; 9930 } 9931 } 9932 /* 9933 * Let the congestion control algorithm update congestion control 9934 * related information. 
This typically means increasing the 9935 * congestion window. 9936 */ 9937 rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, recovery); 9938 SOCKBUF_LOCK(&so->so_snd); 9939 acked_amount = min(acked, (int)sbavail(&so->so_snd)); 9940 tp->snd_wnd -= acked_amount; 9941 mfree = sbcut_locked(&so->so_snd, acked_amount); 9942 if ((sbused(&so->so_snd) == 0) && 9943 (acked > acked_amount) && 9944 (tp->t_state >= TCPS_FIN_WAIT_1) && 9945 (tp->t_flags & TF_SENTFIN)) { 9946 /* 9947 * We must be sure our fin 9948 * was sent and acked (we can be 9949 * in FIN_WAIT_1 without having 9950 * sent the fin). 9951 */ 9952 ourfinisacked = 1; 9953 } 9954 tp->snd_una = th->th_ack; 9955 if (acked_amount && sbavail(&so->so_snd)) 9956 rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una); 9957 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); 9958 /* NB: sowwakeup_locked() does an implicit unlock. */ 9959 sowwakeup_locked(so); 9960 m_freem(mfree); 9961 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 9962 tp->snd_recover = tp->snd_una; 9963 9964 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) { 9965 tp->snd_nxt = tp->snd_una; 9966 } 9967 if (under_pacing && 9968 (rack->use_fixed_rate == 0) && 9969 (rack->in_probe_rtt == 0) && 9970 rack->rc_gp_dyn_mul && 9971 rack->rc_always_pace) { 9972 /* Check if we are dragging bottom */ 9973 rack_check_bottom_drag(tp, rack, so, acked); 9974 } 9975 if (tp->snd_una == tp->snd_max) { 9976 /* Nothing left outstanding */ 9977 tp->t_flags &= ~TF_PREVVALID; 9978 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 9979 rack->r_ctl.retran_during_recovery = 0; 9980 rack->r_ctl.dsack_byte_cnt = 0; 9981 if (rack->r_ctl.rc_went_idle_time == 0) 9982 rack->r_ctl.rc_went_idle_time = 1; 9983 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 9984 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) 9985 tp->t_acktime = 0; 9986 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 9987 /* Set need output so persist might get set */ 9988 rack->r_wanted_output = 1; 9989 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 9990 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 9991 (sbavail(&so->so_snd) == 0) && 9992 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 9993 /* 9994 * The socket was gone and the 9995 * peer sent data (now or in the past), time to 9996 * reset him. 9997 */ 9998 *ret_val = 1; 9999 /* tcp_close will kill the inp pre-log the Reset */ 10000 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 10001 tp = tcp_close(tp); 10002 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); 10003 return (1); 10004 } 10005 } 10006 if (ofia) 10007 *ofia = ourfinisacked; 10008 return (0); 10009 } 10010 10011 static void 10012 rack_collapsed_window(struct tcp_rack *rack) 10013 { 10014 /* 10015 * Now we must walk the 10016 * send map and divide the 10017 * ones left stranded. These 10018 * guys can't cause us to abort 10019 * the connection and are really 10020 * "unsent". However if a buggy 10021 * client actually did keep some 10022 * of the data i.e. collapsed the win 10023 * and refused to ack and then opened 10024 * the win and acked that data. We would 10025 * get into an ack war, the simplier 10026 * method then of just pretending we 10027 * did not send those segments something 10028 * won't work. 
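 * Mechanically: find the sendmap entry covering the new right
 * edge (snd_una + snd_wnd), split it at that edge if the edge
 * lands inside it, and flag that entry and every later one with
 * RACK_RWND_COLLAPSED so they are treated as "unsent" until the
 * peer re-opens the window (rack_un_collapse_window then clears
 * the flags).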
10029 */ 10030 struct rack_sendmap *rsm, *nrsm, fe, *insret; 10031 tcp_seq max_seq; 10032 10033 max_seq = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd; 10034 memset(&fe, 0, sizeof(fe)); 10035 fe.r_start = max_seq; 10036 /* Find the first seq past or at maxseq */ 10037 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 10038 if (rsm == NULL) { 10039 /* Nothing to do strange */ 10040 rack->rc_has_collapsed = 0; 10041 return; 10042 } 10043 /* 10044 * Now do we need to split at 10045 * the collapse point? 10046 */ 10047 if (SEQ_GT(max_seq, rsm->r_start)) { 10048 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 10049 if (nrsm == NULL) { 10050 /* We can't get a rsm, mark all? */ 10051 nrsm = rsm; 10052 goto no_split; 10053 } 10054 /* Clone it */ 10055 rack_clone_rsm(rack, nrsm, rsm, max_seq); 10056 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 10057 #ifdef INVARIANTS 10058 if (insret != NULL) { 10059 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 10060 nrsm, insret, rack, rsm); 10061 } 10062 #endif 10063 rack_log_map_chg(rack->rc_tp, rack, NULL, rsm, nrsm, MAP_SPLIT, max_seq, __LINE__); 10064 if (rsm->r_in_tmap) { 10065 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 10066 nrsm->r_in_tmap = 1; 10067 } 10068 /* 10069 * Set in the new RSM as the 10070 * collapsed starting point 10071 */ 10072 rsm = nrsm; 10073 } 10074 no_split: 10075 counter_u64_add(rack_collapsed_win, 1); 10076 RB_FOREACH_FROM(nrsm, rack_rb_tree_head, rsm) { 10077 nrsm->r_flags |= RACK_RWND_COLLAPSED; 10078 } 10079 rack->rc_has_collapsed = 1; 10080 } 10081 10082 static void 10083 rack_un_collapse_window(struct tcp_rack *rack) 10084 { 10085 struct rack_sendmap *rsm; 10086 10087 RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 10088 if (rsm->r_flags & RACK_RWND_COLLAPSED) 10089 rsm->r_flags &= ~RACK_RWND_COLLAPSED; 10090 else 10091 break; 10092 } 10093 rack->rc_has_collapsed = 0; 10094 } 10095 10096 static void 10097 rack_handle_delayed_ack(struct tcpcb *tp, struct tcp_rack *rack, 10098 int32_t tlen, int32_t tfo_syn) 10099 { 10100 if (DELAY_ACK(tp, tlen) || tfo_syn) { 10101 if (rack->rc_dack_mode && 10102 (tlen > 500) && 10103 (rack->rc_dack_toggle == 1)) { 10104 goto no_delayed_ack; 10105 } 10106 rack_timer_cancel(tp, rack, 10107 rack->r_ctl.rc_rcvtime, __LINE__); 10108 tp->t_flags |= TF_DELACK; 10109 } else { 10110 no_delayed_ack: 10111 rack->r_wanted_output = 1; 10112 tp->t_flags |= TF_ACKNOW; 10113 if (rack->rc_dack_mode) { 10114 if (tp->t_flags & TF_DELACK) 10115 rack->rc_dack_toggle = 1; 10116 else 10117 rack->rc_dack_toggle = 0; 10118 } 10119 } 10120 } 10121 10122 static void 10123 rack_validate_fo_sendwin_up(struct tcpcb *tp, struct tcp_rack *rack) 10124 { 10125 /* 10126 * If fast output is in progress, lets validate that 10127 * the new window did not shrink on us and make it 10128 * so fast output should end. 10129 */ 10130 if (rack->r_fast_output) { 10131 uint32_t out; 10132 10133 /* 10134 * Calculate what we will send if left as is 10135 * and compare that to our send window. 
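 * For example (numbers purely illustrative): with 20000 bytes
 * outstanding, left_to_send of 10000 and snd_wnd now 25000, the
 * sum exceeds the window, so left_to_send is clamped to
 * 25000 - 20000 = 5000 and fast output continues. Had snd_wnd
 * shrunk to 20000 or less, or had the clamped remainder been
 * smaller than one segment, fast output would be switched off.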
10136 */ 10137 out = ctf_outstanding(tp); 10138 if ((out + rack->r_ctl.fsb.left_to_send) > tp->snd_wnd) { 10139 /* ok we have an issue */ 10140 if (out >= tp->snd_wnd) { 10141 /* Turn off fast output the window is met or collapsed */ 10142 rack->r_fast_output = 0; 10143 } else { 10144 /* we have some room left */ 10145 rack->r_ctl.fsb.left_to_send = tp->snd_wnd - out; 10146 if (rack->r_ctl.fsb.left_to_send < ctf_fixed_maxseg(tp)) { 10147 /* If not at least 1 full segment never mind */ 10148 rack->r_fast_output = 0; 10149 } 10150 } 10151 } 10152 } 10153 } 10154 10155 10156 /* 10157 * Return value of 1, the TCB is unlocked and most 10158 * likely gone, return value of 0, the TCP is still 10159 * locked. 10160 */ 10161 static int 10162 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, 10163 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 10164 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 10165 { 10166 /* 10167 * Update window information. Don't look at window if no ACK: TAC's 10168 * send garbage on first SYN. 10169 */ 10170 int32_t nsegs; 10171 int32_t tfo_syn; 10172 struct tcp_rack *rack; 10173 10174 rack = (struct tcp_rack *)tp->t_fb_ptr; 10175 INP_WLOCK_ASSERT(tp->t_inpcb); 10176 nsegs = max(1, m->m_pkthdr.lro_nsegs); 10177 if ((thflags & TH_ACK) && 10178 (SEQ_LT(tp->snd_wl1, th->th_seq) || 10179 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 10180 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 10181 /* keep track of pure window updates */ 10182 if (tlen == 0 && 10183 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 10184 KMOD_TCPSTAT_INC(tcps_rcvwinupd); 10185 tp->snd_wnd = tiwin; 10186 rack_validate_fo_sendwin_up(tp, rack); 10187 tp->snd_wl1 = th->th_seq; 10188 tp->snd_wl2 = th->th_ack; 10189 if (tp->snd_wnd > tp->max_sndwnd) 10190 tp->max_sndwnd = tp->snd_wnd; 10191 rack->r_wanted_output = 1; 10192 } else if (thflags & TH_ACK) { 10193 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { 10194 tp->snd_wnd = tiwin; 10195 rack_validate_fo_sendwin_up(tp, rack); 10196 tp->snd_wl1 = th->th_seq; 10197 tp->snd_wl2 = th->th_ack; 10198 } 10199 } 10200 if (tp->snd_wnd < ctf_outstanding(tp)) 10201 /* The peer collapsed the window */ 10202 rack_collapsed_window(rack); 10203 else if (rack->rc_has_collapsed) 10204 rack_un_collapse_window(rack); 10205 /* Was persist timer active and now we have window space? */ 10206 if ((rack->rc_in_persist != 0) && 10207 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 10208 rack->r_ctl.rc_pace_min_segs))) { 10209 rack_exit_persist(tp, rack, rack->r_ctl.rc_rcvtime); 10210 tp->snd_nxt = tp->snd_max; 10211 /* Make sure we output to start the timer */ 10212 rack->r_wanted_output = 1; 10213 } 10214 /* Do we enter persists? */ 10215 if ((rack->rc_in_persist == 0) && 10216 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 10217 TCPS_HAVEESTABLISHED(tp->t_state) && 10218 (tp->snd_max == tp->snd_una) && 10219 sbavail(&tp->t_inpcb->inp_socket->so_snd) && 10220 (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { 10221 /* 10222 * Here the rwnd is less than 10223 * the pacing size, we are established, 10224 * nothing is outstanding, and there is 10225 * data to send. Enter persists. 10226 */ 10227 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 10228 } 10229 if (tp->t_flags2 & TF2_DROP_AF_DATA) { 10230 m_freem(m); 10231 return (0); 10232 } 10233 /* 10234 * don't process the URG bit, ignore them drag 10235 * along the up. 
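 * (Urgent data is never delivered out of band here; rcv_up is
 * simply dragged along to match rcv_nxt.)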
10236 */ 10237 tp->rcv_up = tp->rcv_nxt; 10238 INP_WLOCK_ASSERT(tp->t_inpcb); 10239 10240 /* 10241 * Process the segment text, merging it into the TCP sequencing 10242 * queue, and arranging for acknowledgment of receipt if necessary. 10243 * This process logically involves adjusting tp->rcv_wnd as data is 10244 * presented to the user (this happens in tcp_usrreq.c, case 10245 * PRU_RCVD). If a FIN has already been received on this connection 10246 * then we just ignore the text. 10247 */ 10248 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && 10249 IS_FASTOPEN(tp->t_flags)); 10250 if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) && 10251 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 10252 tcp_seq save_start = th->th_seq; 10253 tcp_seq save_rnxt = tp->rcv_nxt; 10254 int save_tlen = tlen; 10255 10256 m_adj(m, drop_hdrlen); /* delayed header drop */ 10257 /* 10258 * Insert segment which includes th into TCP reassembly 10259 * queue with control block tp. Set thflags to whether 10260 * reassembly now includes a segment with FIN. This handles 10261 * the common case inline (segment is the next to be 10262 * received on an established connection, and the queue is 10263 * empty), avoiding linkage into and removal from the queue 10264 * and repetition of various conversions. Set DELACK for 10265 * segments received in order, but ack immediately when 10266 * segments are out of order (so fast retransmit can work). 10267 */ 10268 if (th->th_seq == tp->rcv_nxt && 10269 SEGQ_EMPTY(tp) && 10270 (TCPS_HAVEESTABLISHED(tp->t_state) || 10271 tfo_syn)) { 10272 #ifdef NETFLIX_SB_LIMITS 10273 u_int mcnt, appended; 10274 10275 if (so->so_rcv.sb_shlim) { 10276 mcnt = m_memcnt(m); 10277 appended = 0; 10278 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 10279 CFO_NOSLEEP, NULL) == false) { 10280 counter_u64_add(tcp_sb_shlim_fails, 1); 10281 m_freem(m); 10282 return (0); 10283 } 10284 } 10285 #endif 10286 rack_handle_delayed_ack(tp, rack, tlen, tfo_syn); 10287 tp->rcv_nxt += tlen; 10288 if (tlen && 10289 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 10290 (tp->t_fbyte_in == 0)) { 10291 tp->t_fbyte_in = ticks; 10292 if (tp->t_fbyte_in == 0) 10293 tp->t_fbyte_in = 1; 10294 if (tp->t_fbyte_out && tp->t_fbyte_in) 10295 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 10296 } 10297 thflags = th->th_flags & TH_FIN; 10298 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 10299 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 10300 SOCKBUF_LOCK(&so->so_rcv); 10301 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 10302 m_freem(m); 10303 } else 10304 #ifdef NETFLIX_SB_LIMITS 10305 appended = 10306 #endif 10307 sbappendstream_locked(&so->so_rcv, m, 0); 10308 10309 rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1); 10310 /* NB: sorwakeup_locked() does an implicit unlock. */ 10311 sorwakeup_locked(so); 10312 #ifdef NETFLIX_SB_LIMITS 10313 if (so->so_rcv.sb_shlim && appended != mcnt) 10314 counter_fo_release(so->so_rcv.sb_shlim, 10315 mcnt - appended); 10316 #endif 10317 } else { 10318 /* 10319 * XXX: Due to the header drop above "th" is 10320 * theoretically invalid by now. Fortunately 10321 * m_adj() doesn't actually frees any mbufs when 10322 * trimming from the head. 10323 */ 10324 tcp_seq temp = save_start; 10325 10326 thflags = tcp_reass(tp, th, &temp, &tlen, m); 10327 tp->t_flags |= TF_ACKNOW; 10328 if (tp->t_flags & TF_WAKESOR) { 10329 tp->t_flags &= ~TF_WAKESOR; 10330 /* NB: sorwakeup_locked() does an implicit unlock. 
*/ 10331 sorwakeup_locked(so); 10332 } 10333 } 10334 if ((tp->t_flags & TF_SACK_PERMIT) && 10335 (save_tlen > 0) && 10336 TCPS_HAVEESTABLISHED(tp->t_state)) { 10337 if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { 10338 /* 10339 * DSACK actually handled in the fastpath 10340 * above. 10341 */ 10342 RACK_OPTS_INC(tcp_sack_path_1); 10343 tcp_update_sack_list(tp, save_start, 10344 save_start + save_tlen); 10345 } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { 10346 if ((tp->rcv_numsacks >= 1) && 10347 (tp->sackblks[0].end == save_start)) { 10348 /* 10349 * Partial overlap, recorded at todrop 10350 * above. 10351 */ 10352 RACK_OPTS_INC(tcp_sack_path_2a); 10353 tcp_update_sack_list(tp, 10354 tp->sackblks[0].start, 10355 tp->sackblks[0].end); 10356 } else { 10357 RACK_OPTS_INC(tcp_sack_path_2b); 10358 tcp_update_dsack_list(tp, save_start, 10359 save_start + save_tlen); 10360 } 10361 } else if (tlen >= save_tlen) { 10362 /* Update of sackblks. */ 10363 RACK_OPTS_INC(tcp_sack_path_3); 10364 tcp_update_dsack_list(tp, save_start, 10365 save_start + save_tlen); 10366 } else if (tlen > 0) { 10367 RACK_OPTS_INC(tcp_sack_path_4); 10368 tcp_update_dsack_list(tp, save_start, 10369 save_start + tlen); 10370 } 10371 } 10372 } else { 10373 m_freem(m); 10374 thflags &= ~TH_FIN; 10375 } 10376 10377 /* 10378 * If FIN is received ACK the FIN and let the user know that the 10379 * connection is closing. 10380 */ 10381 if (thflags & TH_FIN) { 10382 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 10383 /* The socket upcall is handled by socantrcvmore. */ 10384 socantrcvmore(so); 10385 /* 10386 * If connection is half-synchronized (ie NEEDSYN 10387 * flag on) then delay ACK, so it may be piggybacked 10388 * when SYN is sent. Otherwise, since we received a 10389 * FIN then no more input can be expected, send ACK 10390 * now. 10391 */ 10392 if (tp->t_flags & TF_NEEDSYN) { 10393 rack_timer_cancel(tp, rack, 10394 rack->r_ctl.rc_rcvtime, __LINE__); 10395 tp->t_flags |= TF_DELACK; 10396 } else { 10397 tp->t_flags |= TF_ACKNOW; 10398 } 10399 tp->rcv_nxt++; 10400 } 10401 switch (tp->t_state) { 10402 /* 10403 * In SYN_RECEIVED and ESTABLISHED STATES enter the 10404 * CLOSE_WAIT state. 10405 */ 10406 case TCPS_SYN_RECEIVED: 10407 tp->t_starttime = ticks; 10408 /* FALLTHROUGH */ 10409 case TCPS_ESTABLISHED: 10410 rack_timer_cancel(tp, rack, 10411 rack->r_ctl.rc_rcvtime, __LINE__); 10412 tcp_state_change(tp, TCPS_CLOSE_WAIT); 10413 break; 10414 10415 /* 10416 * If still in FIN_WAIT_1 STATE FIN has not been 10417 * acked so enter the CLOSING state. 10418 */ 10419 case TCPS_FIN_WAIT_1: 10420 rack_timer_cancel(tp, rack, 10421 rack->r_ctl.rc_rcvtime, __LINE__); 10422 tcp_state_change(tp, TCPS_CLOSING); 10423 break; 10424 10425 /* 10426 * In FIN_WAIT_2 state enter the TIME_WAIT state, 10427 * starting the time-wait timer, turning off the 10428 * other standard timers. 10429 */ 10430 case TCPS_FIN_WAIT_2: 10431 rack_timer_cancel(tp, rack, 10432 rack->r_ctl.rc_rcvtime, __LINE__); 10433 tcp_twstart(tp); 10434 return (1); 10435 } 10436 } 10437 /* 10438 * Return any desired output. 10439 */ 10440 if ((tp->t_flags & TF_ACKNOW) || 10441 (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) { 10442 rack->r_wanted_output = 1; 10443 } 10444 INP_WLOCK_ASSERT(tp->t_inpcb); 10445 return (0); 10446 } 10447 10448 /* 10449 * Here nothing is really faster, its just that we 10450 * have broken out the fast-data path also just like 10451 * the fast-ack. 
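 * The guards below keep only the truly common case on this
 * path: the segment is in sequence (th_seq == rcv_nxt), we are
 * not retransmitting (snd_nxt == snd_max), the advertised
 * window is unchanged, no SYN/FIN is pending, the timestamp is
 * not old, the ACK acknowledges nothing new (th_ack == snd_una)
 * and the data fits in the receive buffer. Anything else
 * returns 0 and falls back to the slow path.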
10452 */ 10453 static int 10454 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, 10455 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 10456 uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos) 10457 { 10458 int32_t nsegs; 10459 int32_t newsize = 0; /* automatic sockbuf scaling */ 10460 struct tcp_rack *rack; 10461 #ifdef NETFLIX_SB_LIMITS 10462 u_int mcnt, appended; 10463 #endif 10464 #ifdef TCPDEBUG 10465 /* 10466 * The size of tcp_saveipgen must be the size of the max ip header, 10467 * now IPv6. 10468 */ 10469 u_char tcp_saveipgen[IP6_HDR_LEN]; 10470 struct tcphdr tcp_savetcp; 10471 short ostate = 0; 10472 10473 #endif 10474 /* 10475 * If last ACK falls within this segment's sequence numbers, record 10476 * the timestamp. NOTE that the test is modified according to the 10477 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 10478 */ 10479 if (__predict_false(th->th_seq != tp->rcv_nxt)) { 10480 return (0); 10481 } 10482 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 10483 return (0); 10484 } 10485 if (tiwin && tiwin != tp->snd_wnd) { 10486 return (0); 10487 } 10488 if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) { 10489 return (0); 10490 } 10491 if (__predict_false((to->to_flags & TOF_TS) && 10492 (TSTMP_LT(to->to_tsval, tp->ts_recent)))) { 10493 return (0); 10494 } 10495 if (__predict_false((th->th_ack != tp->snd_una))) { 10496 return (0); 10497 } 10498 if (__predict_false(tlen > sbspace(&so->so_rcv))) { 10499 return (0); 10500 } 10501 if ((to->to_flags & TOF_TS) != 0 && 10502 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 10503 tp->ts_recent_age = tcp_ts_getticks(); 10504 tp->ts_recent = to->to_tsval; 10505 } 10506 rack = (struct tcp_rack *)tp->t_fb_ptr; 10507 /* 10508 * This is a pure, in-sequence data packet with nothing on the 10509 * reassembly queue and we have enough buffer space to take it. 10510 */ 10511 nsegs = max(1, m->m_pkthdr.lro_nsegs); 10512 10513 #ifdef NETFLIX_SB_LIMITS 10514 if (so->so_rcv.sb_shlim) { 10515 mcnt = m_memcnt(m); 10516 appended = 0; 10517 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 10518 CFO_NOSLEEP, NULL) == false) { 10519 counter_u64_add(tcp_sb_shlim_fails, 1); 10520 m_freem(m); 10521 return (1); 10522 } 10523 } 10524 #endif 10525 /* Clean receiver SACK report if present */ 10526 if (tp->rcv_numsacks) 10527 tcp_clean_sackreport(tp); 10528 KMOD_TCPSTAT_INC(tcps_preddat); 10529 tp->rcv_nxt += tlen; 10530 if (tlen && 10531 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 10532 (tp->t_fbyte_in == 0)) { 10533 tp->t_fbyte_in = ticks; 10534 if (tp->t_fbyte_in == 0) 10535 tp->t_fbyte_in = 1; 10536 if (tp->t_fbyte_out && tp->t_fbyte_in) 10537 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 10538 } 10539 /* 10540 * Pull snd_wl1 up to prevent seq wrap relative to th_seq. 10541 */ 10542 tp->snd_wl1 = th->th_seq; 10543 /* 10544 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt. 10545 */ 10546 tp->rcv_up = tp->rcv_nxt; 10547 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 10548 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 10549 #ifdef TCPDEBUG 10550 if (so->so_options & SO_DEBUG) 10551 tcp_trace(TA_INPUT, ostate, tp, 10552 (void *)tcp_saveipgen, &tcp_savetcp, 0); 10553 #endif 10554 newsize = tcp_autorcvbuf(m, th, so, tp, tlen); 10555 10556 /* Add data to socket buffer. */ 10557 SOCKBUF_LOCK(&so->so_rcv); 10558 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 10559 m_freem(m); 10560 } else { 10561 /* 10562 * Set new socket buffer size. Give up when limit is 10563 * reached. 
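 * (If sbreserve_locked() refuses the new size we clear
 * SB_AUTOSIZE so we stop trying to grow this buffer.)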
10564 */ 10565 if (newsize) 10566 if (!sbreserve_locked(&so->so_rcv, 10567 newsize, so, NULL)) 10568 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 10569 m_adj(m, drop_hdrlen); /* delayed header drop */ 10570 #ifdef NETFLIX_SB_LIMITS 10571 appended = 10572 #endif 10573 sbappendstream_locked(&so->so_rcv, m, 0); 10574 ctf_calc_rwin(so, tp); 10575 } 10576 rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1); 10577 /* NB: sorwakeup_locked() does an implicit unlock. */ 10578 sorwakeup_locked(so); 10579 #ifdef NETFLIX_SB_LIMITS 10580 if (so->so_rcv.sb_shlim && mcnt != appended) 10581 counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended); 10582 #endif 10583 rack_handle_delayed_ack(tp, rack, tlen, 0); 10584 if (tp->snd_una == tp->snd_max) 10585 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 10586 return (1); 10587 } 10588 10589 /* 10590 * This subfunction is used to try to highly optimize the 10591 * fast path. We again allow window updates that are 10592 * in sequence to remain in the fast-path. We also add 10593 * in the __predict's to attempt to help the compiler. 10594 * Note that if we return a 0, then we can *not* process 10595 * it and the caller should push the packet into the 10596 * slow-path. 10597 */ 10598 static int 10599 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 10600 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 10601 uint32_t tiwin, int32_t nxt_pkt, uint32_t cts) 10602 { 10603 int32_t acked; 10604 int32_t nsegs; 10605 #ifdef TCPDEBUG 10606 /* 10607 * The size of tcp_saveipgen must be the size of the max ip header, 10608 * now IPv6. 10609 */ 10610 u_char tcp_saveipgen[IP6_HDR_LEN]; 10611 struct tcphdr tcp_savetcp; 10612 short ostate = 0; 10613 #endif 10614 int32_t under_pacing = 0; 10615 struct tcp_rack *rack; 10616 10617 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 10618 /* Old ack, behind (or duplicate to) the last one rcv'd */ 10619 return (0); 10620 } 10621 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { 10622 /* Above what we have sent? */ 10623 return (0); 10624 } 10625 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 10626 /* We are retransmitting */ 10627 return (0); 10628 } 10629 if (__predict_false(tiwin == 0)) { 10630 /* zero window */ 10631 return (0); 10632 } 10633 if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) { 10634 /* We need a SYN or a FIN, unlikely.. */ 10635 return (0); 10636 } 10637 if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { 10638 /* Timestamp is behind .. old ack with seq wrap? */ 10639 return (0); 10640 } 10641 if (__predict_false(IN_RECOVERY(tp->t_flags))) { 10642 /* Still recovering */ 10643 return (0); 10644 } 10645 rack = (struct tcp_rack *)tp->t_fb_ptr; 10646 if (rack->r_ctl.rc_sacked) { 10647 /* We have sack holes on our scoreboard */ 10648 return (0); 10649 } 10650 /* Ok if we reach here, we can process a fast-ack */ 10651 if (rack->gp_ready && 10652 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 10653 under_pacing = 1; 10654 } 10655 nsegs = max(1, m->m_pkthdr.lro_nsegs); 10656 rack_log_ack(tp, to, th, 0, 0); 10657 /* Did the window get updated? */ 10658 if (tiwin != tp->snd_wnd) { 10659 tp->snd_wnd = tiwin; 10660 rack_validate_fo_sendwin_up(tp, rack); 10661 tp->snd_wl1 = th->th_seq; 10662 if (tp->snd_wnd > tp->max_sndwnd) 10663 tp->max_sndwnd = tp->snd_wnd; 10664 } 10665 /* Do we exit persists? 
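 * We do once snd_wnd has grown back to at least
 * min(rc_high_rwnd/2, rc_pace_min_segs), mirroring the
 * enter-persists threshold used just below.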
*/ 10666 if ((rack->rc_in_persist != 0) && 10667 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 10668 rack->r_ctl.rc_pace_min_segs))) { 10669 rack_exit_persist(tp, rack, cts); 10670 } 10671 /* Do we enter persists? */ 10672 if ((rack->rc_in_persist == 0) && 10673 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 10674 TCPS_HAVEESTABLISHED(tp->t_state) && 10675 (tp->snd_max == tp->snd_una) && 10676 sbavail(&tp->t_inpcb->inp_socket->so_snd) && 10677 (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { 10678 /* 10679 * Here the rwnd is less than 10680 * the pacing size, we are established, 10681 * nothing is outstanding, and there is 10682 * data to send. Enter persists. 10683 */ 10684 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 10685 } 10686 /* 10687 * If last ACK falls within this segment's sequence numbers, record 10688 * the timestamp. NOTE that the test is modified according to the 10689 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 10690 */ 10691 if ((to->to_flags & TOF_TS) != 0 && 10692 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 10693 tp->ts_recent_age = tcp_ts_getticks(); 10694 tp->ts_recent = to->to_tsval; 10695 } 10696 /* 10697 * This is a pure ack for outstanding data. 10698 */ 10699 KMOD_TCPSTAT_INC(tcps_predack); 10700 10701 /* 10702 * "bad retransmit" recovery. 10703 */ 10704 if ((tp->t_flags & TF_PREVVALID) && 10705 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) { 10706 tp->t_flags &= ~TF_PREVVALID; 10707 if (tp->t_rxtshift == 1 && 10708 (int)(ticks - tp->t_badrxtwin) < 0) 10709 rack_cong_signal(tp, CC_RTO_ERR, th->th_ack); 10710 } 10711 /* 10712 * Recalculate the transmit timer / rtt. 10713 * 10714 * Some boxes send broken timestamp replies during the SYN+ACK 10715 * phase, ignore timestamps of 0 or we could calculate a huge RTT 10716 * and blow up the retransmit timer. 10717 */ 10718 acked = BYTES_THIS_ACK(tp, th); 10719 10720 #ifdef TCP_HHOOK 10721 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 10722 hhook_run_tcp_est_in(tp, th, to); 10723 #endif 10724 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 10725 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 10726 if (acked) { 10727 struct mbuf *mfree; 10728 10729 rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, 0); 10730 SOCKBUF_LOCK(&so->so_snd); 10731 mfree = sbcut_locked(&so->so_snd, acked); 10732 tp->snd_una = th->th_ack; 10733 /* Note we want to hold the sb lock through the sendmap adjust */ 10734 rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una); 10735 /* Wake up the socket if we have room to write more */ 10736 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); 10737 sowwakeup_locked(so); 10738 m_freem(mfree); 10739 tp->t_rxtshift = 0; 10740 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 10741 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 10742 rack->rc_tlp_in_progress = 0; 10743 rack->r_ctl.rc_tlp_cnt_out = 0; 10744 /* 10745 * If it is the RXT timer we want to 10746 * stop it, so we can restart a TLP. 10747 */ 10748 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 10749 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 10750 #ifdef NETFLIX_HTTP_LOGGING 10751 tcp_http_check_for_comp(rack->rc_tp, th->th_ack); 10752 #endif 10753 } 10754 /* 10755 * Let the congestion control algorithm update congestion control 10756 * related information. This typically means increasing the 10757 * congestion window. 
10758 */ 10759 if (tp->snd_wnd < ctf_outstanding(tp)) { 10760 /* The peer collapsed the window */ 10761 rack_collapsed_window(rack); 10762 } else if (rack->rc_has_collapsed) 10763 rack_un_collapse_window(rack); 10764 10765 /* 10766 * Pull snd_wl2 up to prevent seq wrap relative to th_ack. 10767 */ 10768 tp->snd_wl2 = th->th_ack; 10769 tp->t_dupacks = 0; 10770 m_freem(m); 10771 /* ND6_HINT(tp); *//* Some progress has been made. */ 10772 10773 /* 10774 * If all outstanding data are acked, stop retransmit timer, 10775 * otherwise restart timer using current (possibly backed-off) 10776 * value. If process is waiting for space, wakeup/selwakeup/signal. 10777 * If data are ready to send, let tcp_output decide between more 10778 * output or persist. 10779 */ 10780 #ifdef TCPDEBUG 10781 if (so->so_options & SO_DEBUG) 10782 tcp_trace(TA_INPUT, ostate, tp, 10783 (void *)tcp_saveipgen, 10784 &tcp_savetcp, 0); 10785 #endif 10786 if (under_pacing && 10787 (rack->use_fixed_rate == 0) && 10788 (rack->in_probe_rtt == 0) && 10789 rack->rc_gp_dyn_mul && 10790 rack->rc_always_pace) { 10791 /* Check if we are dragging bottom */ 10792 rack_check_bottom_drag(tp, rack, so, acked); 10793 } 10794 if (tp->snd_una == tp->snd_max) { 10795 tp->t_flags &= ~TF_PREVVALID; 10796 rack->r_ctl.retran_during_recovery = 0; 10797 rack->r_ctl.dsack_byte_cnt = 0; 10798 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 10799 if (rack->r_ctl.rc_went_idle_time == 0) 10800 rack->r_ctl.rc_went_idle_time = 1; 10801 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 10802 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) 10803 tp->t_acktime = 0; 10804 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 10805 } 10806 if (acked && rack->r_fast_output) 10807 rack_gain_for_fastoutput(rack, tp, so, (uint32_t)acked); 10808 if (sbavail(&so->so_snd)) { 10809 rack->r_wanted_output = 1; 10810 } 10811 return (1); 10812 } 10813 10814 /* 10815 * Return value of 1, the TCB is unlocked and most 10816 * likely gone, return value of 0, the TCP is still 10817 * locked. 10818 */ 10819 static int 10820 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, 10821 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 10822 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 10823 { 10824 int32_t ret_val = 0; 10825 int32_t todrop; 10826 int32_t ourfinisacked = 0; 10827 struct tcp_rack *rack; 10828 10829 ctf_calc_rwin(so, tp); 10830 /* 10831 * If the state is SYN_SENT: if seg contains an ACK, but not for our 10832 * SYN, drop the input. if seg contains a RST, then drop the 10833 * connection. if seg does not contain SYN, then drop it. Otherwise 10834 * this is an acceptable SYN segment initialize tp->rcv_nxt and 10835 * tp->irs if seg contains ack then advance tp->snd_una if seg 10836 * contains an ECE and ECN support is enabled, the stream is ECN 10837 * capable. if SYN has been acked change to ESTABLISHED else 10838 * SYN_RCVD state arrange for segment to be acked (eventually) 10839 * continue processing rest of data/controls. 
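 * Note for TFO: if our SYN carried data and not all of it is
 * acked here, snd_nxt is pulled back to th_ack so the remainder
 * is resent right away, and the delayed-ACK shortcut below is
 * skipped (tfo_partial).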
10840 */ 10841 if ((thflags & TH_ACK) && 10842 (SEQ_LEQ(th->th_ack, tp->iss) || 10843 SEQ_GT(th->th_ack, tp->snd_max))) { 10844 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 10845 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 10846 return (1); 10847 } 10848 if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { 10849 TCP_PROBE5(connect__refused, NULL, tp, 10850 mtod(m, const char *), tp, th); 10851 tp = tcp_drop(tp, ECONNREFUSED); 10852 ctf_do_drop(m, tp); 10853 return (1); 10854 } 10855 if (thflags & TH_RST) { 10856 ctf_do_drop(m, tp); 10857 return (1); 10858 } 10859 if (!(thflags & TH_SYN)) { 10860 ctf_do_drop(m, tp); 10861 return (1); 10862 } 10863 tp->irs = th->th_seq; 10864 tcp_rcvseqinit(tp); 10865 rack = (struct tcp_rack *)tp->t_fb_ptr; 10866 if (thflags & TH_ACK) { 10867 int tfo_partial = 0; 10868 10869 KMOD_TCPSTAT_INC(tcps_connects); 10870 soisconnected(so); 10871 #ifdef MAC 10872 mac_socketpeer_set_from_mbuf(m, so); 10873 #endif 10874 /* Do window scaling on this connection? */ 10875 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 10876 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 10877 tp->rcv_scale = tp->request_r_scale; 10878 } 10879 tp->rcv_adv += min(tp->rcv_wnd, 10880 TCP_MAXWIN << tp->rcv_scale); 10881 /* 10882 * If not all the data that was sent in the TFO SYN 10883 * has been acked, resend the remainder right away. 10884 */ 10885 if (IS_FASTOPEN(tp->t_flags) && 10886 (tp->snd_una != tp->snd_max)) { 10887 tp->snd_nxt = th->th_ack; 10888 tfo_partial = 1; 10889 } 10890 /* 10891 * If there's data, delay ACK; if there's also a FIN ACKNOW 10892 * will be turned on later. 10893 */ 10894 if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial) { 10895 rack_timer_cancel(tp, rack, 10896 rack->r_ctl.rc_rcvtime, __LINE__); 10897 tp->t_flags |= TF_DELACK; 10898 } else { 10899 rack->r_wanted_output = 1; 10900 tp->t_flags |= TF_ACKNOW; 10901 rack->rc_dack_toggle = 0; 10902 } 10903 if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) && 10904 (V_tcp_do_ecn == 1)) { 10905 tp->t_flags2 |= TF2_ECN_PERMIT; 10906 KMOD_TCPSTAT_INC(tcps_ecn_shs); 10907 } 10908 if (SEQ_GT(th->th_ack, tp->snd_una)) { 10909 /* 10910 * We advance snd_una for the 10911 * fast open case. If th_ack is 10912 * acknowledging data beyond 10913 * snd_una we can't just call 10914 * ack-processing since the 10915 * data stream in our send-map 10916 * will start at snd_una + 1 (one 10917 * beyond the SYN). If its just 10918 * equal we don't need to do that 10919 * and there is no send_map. 10920 */ 10921 tp->snd_una++; 10922 } 10923 /* 10924 * Received <SYN,ACK> in SYN_SENT[*] state. Transitions: 10925 * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 10926 */ 10927 tp->t_starttime = ticks; 10928 if (tp->t_flags & TF_NEEDFIN) { 10929 tcp_state_change(tp, TCPS_FIN_WAIT_1); 10930 tp->t_flags &= ~TF_NEEDFIN; 10931 thflags &= ~TH_SYN; 10932 } else { 10933 tcp_state_change(tp, TCPS_ESTABLISHED); 10934 TCP_PROBE5(connect__established, NULL, tp, 10935 mtod(m, const char *), tp, th); 10936 rack_cc_conn_init(tp); 10937 } 10938 } else { 10939 /* 10940 * Received initial SYN in SYN-SENT[*] state => simultaneous 10941 * open. If segment contains CC option and there is a 10942 * cached CC, apply TAO test. If it succeeds, connection is * 10943 * half-synchronized. Otherwise, do 3-way handshake: 10944 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If 10945 * there was no CC option, clear cached CC value. 
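 * (The CC/TAO machinery is historical T/TCP; in practice we
 * simply force an ACK, set TF_NEEDSYN and move to SYN_RECEIVED
 * below.)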
10946 */ 10947 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); 10948 tcp_state_change(tp, TCPS_SYN_RECEIVED); 10949 } 10950 INP_WLOCK_ASSERT(tp->t_inpcb); 10951 /* 10952 * Advance th->th_seq to correspond to first data byte. If data, 10953 * trim to stay within window, dropping FIN if necessary. 10954 */ 10955 th->th_seq++; 10956 if (tlen > tp->rcv_wnd) { 10957 todrop = tlen - tp->rcv_wnd; 10958 m_adj(m, -todrop); 10959 tlen = tp->rcv_wnd; 10960 thflags &= ~TH_FIN; 10961 KMOD_TCPSTAT_INC(tcps_rcvpackafterwin); 10962 KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 10963 } 10964 tp->snd_wl1 = th->th_seq - 1; 10965 tp->rcv_up = th->th_seq; 10966 /* 10967 * Client side of transaction: already sent SYN and data. If the 10968 * remote host used T/TCP to validate the SYN, our data will be 10969 * ACK'd; if so, enter normal data segment processing in the middle 10970 * of step 5, ack processing. Otherwise, goto step 6. 10971 */ 10972 if (thflags & TH_ACK) { 10973 /* For syn-sent we need to possibly update the rtt */ 10974 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 10975 uint32_t t, mcts; 10976 10977 mcts = tcp_ts_getticks(); 10978 t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC; 10979 if (!tp->t_rttlow || tp->t_rttlow > t) 10980 tp->t_rttlow = t; 10981 rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 4); 10982 tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2); 10983 tcp_rack_xmit_timer_commit(rack, tp); 10984 } 10985 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) 10986 return (ret_val); 10987 /* We may have changed to FIN_WAIT_1 above */ 10988 if (tp->t_state == TCPS_FIN_WAIT_1) { 10989 /* 10990 * In FIN_WAIT_1 STATE in addition to the processing 10991 * for the ESTABLISHED state if our FIN is now 10992 * acknowledged then enter FIN_WAIT_2. 10993 */ 10994 if (ourfinisacked) { 10995 /* 10996 * If we can't receive any more data, then 10997 * closing user can proceed. Starting the 10998 * timer is contrary to the specification, 10999 * but if we don't get a FIN we'll hang 11000 * forever. 11001 * 11002 * XXXjl: we should release the tp also, and 11003 * use a compressed state. 11004 */ 11005 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 11006 soisdisconnected(so); 11007 tcp_timer_activate(tp, TT_2MSL, 11008 (tcp_fast_finwait2_recycle ? 11009 tcp_finwait2_timeout : 11010 TP_MAXIDLE(tp))); 11011 } 11012 tcp_state_change(tp, TCPS_FIN_WAIT_2); 11013 } 11014 } 11015 } 11016 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11017 tiwin, thflags, nxt_pkt)); 11018 } 11019 11020 /* 11021 * Return value of 1, the TCB is unlocked and most 11022 * likely gone, return value of 0, the TCP is still 11023 * locked. 
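 * This is the SYN_RECEIVED handler. For TFO sockets only the
 * original SYN (or a copy of it), a valid ACK, a FIN or a RST
 * are acceptable here; anything else is dropped below.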
11024 */ 11025 static int 11026 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, 11027 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 11028 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 11029 { 11030 struct tcp_rack *rack; 11031 int32_t ret_val = 0; 11032 int32_t ourfinisacked = 0; 11033 11034 ctf_calc_rwin(so, tp); 11035 if ((thflags & TH_ACK) && 11036 (SEQ_LEQ(th->th_ack, tp->snd_una) || 11037 SEQ_GT(th->th_ack, tp->snd_max))) { 11038 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 11039 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11040 return (1); 11041 } 11042 rack = (struct tcp_rack *)tp->t_fb_ptr; 11043 if (IS_FASTOPEN(tp->t_flags)) { 11044 /* 11045 * When a TFO connection is in SYN_RECEIVED, the 11046 * only valid packets are the initial SYN, a 11047 * retransmit/copy of the initial SYN (possibly with 11048 * a subset of the original data), a valid ACK, a 11049 * FIN, or a RST. 11050 */ 11051 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { 11052 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 11053 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11054 return (1); 11055 } else if (thflags & TH_SYN) { 11056 /* non-initial SYN is ignored */ 11057 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || 11058 (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || 11059 (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { 11060 ctf_do_drop(m, NULL); 11061 return (0); 11062 } 11063 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { 11064 ctf_do_drop(m, NULL); 11065 return (0); 11066 } 11067 } 11068 if ((thflags & TH_RST) || 11069 (tp->t_fin_is_rst && (thflags & TH_FIN))) 11070 return (ctf_process_rst(m, th, so, tp)); 11071 /* 11072 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 11073 * it's less than ts_recent, drop it. 11074 */ 11075 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 11076 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 11077 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 11078 return (ret_val); 11079 } 11080 /* 11081 * In the SYN-RECEIVED state, validate that the packet belongs to 11082 * this connection before trimming the data to fit the receive 11083 * window. Check the sequence number versus IRS since we know the 11084 * sequence numbers haven't wrapped. This is a partial fix for the 11085 * "LAND" DoS attack. 11086 */ 11087 if (SEQ_LT(th->th_seq, tp->irs)) { 11088 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 11089 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11090 return (1); 11091 } 11092 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 11093 &rack->r_ctl.challenge_ack_ts, 11094 &rack->r_ctl.challenge_ack_cnt)) { 11095 return (ret_val); 11096 } 11097 /* 11098 * If last ACK falls within this segment's sequence numbers, record 11099 * its timestamp. NOTE: 1) That the test incorporates suggestions 11100 * from the latest proposal of the tcplw@cray.com list (Braden 11101 * 1993/04/26). 2) That updating only on newer timestamps interferes 11102 * with our earlier PAWS tests, so this check should be solely 11103 * predicated on the sequence space of this segment. 3) That we 11104 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 11105 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 11106 * SEG.Len, This modified check allows us to overcome RFC1323's 11107 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 11108 * p.869. 
In such cases, we can still calculate the RTT correctly 11109 * when RCV.NXT == Last.ACK.Sent. 11110 */ 11111 if ((to->to_flags & TOF_TS) != 0 && 11112 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 11113 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 11114 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 11115 tp->ts_recent_age = tcp_ts_getticks(); 11116 tp->ts_recent = to->to_tsval; 11117 } 11118 tp->snd_wnd = tiwin; 11119 rack_validate_fo_sendwin_up(tp, rack); 11120 /* 11121 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 11122 * is on (half-synchronized state), then queue data for later 11123 * processing; else drop segment and return. 11124 */ 11125 if ((thflags & TH_ACK) == 0) { 11126 if (IS_FASTOPEN(tp->t_flags)) { 11127 rack_cc_conn_init(tp); 11128 } 11129 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11130 tiwin, thflags, nxt_pkt)); 11131 } 11132 KMOD_TCPSTAT_INC(tcps_connects); 11133 soisconnected(so); 11134 /* Do window scaling? */ 11135 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 11136 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 11137 tp->rcv_scale = tp->request_r_scale; 11138 } 11139 /* 11140 * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> 11141 * FIN-WAIT-1 11142 */ 11143 tp->t_starttime = ticks; 11144 if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { 11145 tcp_fastopen_decrement_counter(tp->t_tfo_pending); 11146 tp->t_tfo_pending = NULL; 11147 } 11148 if (tp->t_flags & TF_NEEDFIN) { 11149 tcp_state_change(tp, TCPS_FIN_WAIT_1); 11150 tp->t_flags &= ~TF_NEEDFIN; 11151 } else { 11152 tcp_state_change(tp, TCPS_ESTABLISHED); 11153 TCP_PROBE5(accept__established, NULL, tp, 11154 mtod(m, const char *), tp, th); 11155 /* 11156 * TFO connections call cc_conn_init() during SYN 11157 * processing. Calling it again here for such connections 11158 * is not harmless as it would undo the snd_cwnd reduction 11159 * that occurs when a TFO SYN|ACK is retransmitted. 11160 */ 11161 if (!IS_FASTOPEN(tp->t_flags)) 11162 rack_cc_conn_init(tp); 11163 } 11164 /* 11165 * Account for the ACK of our SYN prior to 11166 * regular ACK processing below, except for 11167 * simultaneous SYN, which is handled later. 11168 */ 11169 if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN)) 11170 tp->snd_una++; 11171 /* 11172 * If segment contains data or ACK, will call tcp_reass() later; if 11173 * not, do so now to pass queued data to user. 11174 */ 11175 if (tlen == 0 && (thflags & TH_FIN) == 0) { 11176 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, 11177 (struct mbuf *)0); 11178 if (tp->t_flags & TF_WAKESOR) { 11179 tp->t_flags &= ~TF_WAKESOR; 11180 /* NB: sorwakeup_locked() does an implicit unlock. 
*/ 11181 sorwakeup_locked(so); 11182 } 11183 } 11184 tp->snd_wl1 = th->th_seq - 1; 11185 /* For syn-recv we need to possibly update the rtt */ 11186 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 11187 uint32_t t, mcts; 11188 11189 mcts = tcp_ts_getticks(); 11190 t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC; 11191 if (!tp->t_rttlow || tp->t_rttlow > t) 11192 tp->t_rttlow = t; 11193 rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 5); 11194 tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2); 11195 tcp_rack_xmit_timer_commit(rack, tp); 11196 } 11197 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 11198 return (ret_val); 11199 } 11200 if (tp->t_state == TCPS_FIN_WAIT_1) { 11201 /* We could have went to FIN_WAIT_1 (or EST) above */ 11202 /* 11203 * In FIN_WAIT_1 STATE in addition to the processing for the 11204 * ESTABLISHED state if our FIN is now acknowledged then 11205 * enter FIN_WAIT_2. 11206 */ 11207 if (ourfinisacked) { 11208 /* 11209 * If we can't receive any more data, then closing 11210 * user can proceed. Starting the timer is contrary 11211 * to the specification, but if we don't get a FIN 11212 * we'll hang forever. 11213 * 11214 * XXXjl: we should release the tp also, and use a 11215 * compressed state. 11216 */ 11217 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 11218 soisdisconnected(so); 11219 tcp_timer_activate(tp, TT_2MSL, 11220 (tcp_fast_finwait2_recycle ? 11221 tcp_finwait2_timeout : 11222 TP_MAXIDLE(tp))); 11223 } 11224 tcp_state_change(tp, TCPS_FIN_WAIT_2); 11225 } 11226 } 11227 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11228 tiwin, thflags, nxt_pkt)); 11229 } 11230 11231 /* 11232 * Return value of 1, the TCB is unlocked and most 11233 * likely gone, return value of 0, the TCP is still 11234 * locked. 11235 */ 11236 static int 11237 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, 11238 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 11239 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 11240 { 11241 int32_t ret_val = 0; 11242 struct tcp_rack *rack; 11243 11244 /* 11245 * Header prediction: check for the two common cases of a 11246 * uni-directional data xfer. If the packet has no control flags, 11247 * is in-sequence, the window didn't change and we're not 11248 * retransmitting, it's a candidate. If the length is zero and the 11249 * ack moved forward, we're the sender side of the xfer. Just free 11250 * the data acked & wake any higher level process that was blocked 11251 * waiting for space. If the length is non-zero and the ack didn't 11252 * move, we're the receiver side. If we're getting packets in-order 11253 * (the reassembly queue is empty), add the data toc The socket 11254 * buffer and note that we need a delayed ack. Make sure that the 11255 * hidden state-flags are also off. Since we check for 11256 * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN. 
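 * In this stack the two cases are dispatched below: a pure ACK
 * that passes the guards goes to rack_fastack(), in-order new
 * data with an empty reassembly queue goes to
 * rack_do_fastnewdata(); if either declines (returns 0) we fall
 * through to the full slow path.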
11257 */ 11258 rack = (struct tcp_rack *)tp->t_fb_ptr; 11259 if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && 11260 __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_ACK)) == TH_ACK) && 11261 __predict_true(SEGQ_EMPTY(tp)) && 11262 __predict_true(th->th_seq == tp->rcv_nxt)) { 11263 if (tlen == 0) { 11264 if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen, 11265 tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) { 11266 return (0); 11267 } 11268 } else { 11269 if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, 11270 tiwin, nxt_pkt, iptos)) { 11271 return (0); 11272 } 11273 } 11274 } 11275 ctf_calc_rwin(so, tp); 11276 11277 if ((thflags & TH_RST) || 11278 (tp->t_fin_is_rst && (thflags & TH_FIN))) 11279 return (ctf_process_rst(m, th, so, tp)); 11280 11281 /* 11282 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 11283 * synchronized state. 11284 */ 11285 if (thflags & TH_SYN) { 11286 ctf_challenge_ack(m, th, tp, &ret_val); 11287 return (ret_val); 11288 } 11289 /* 11290 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 11291 * it's less than ts_recent, drop it. 11292 */ 11293 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 11294 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 11295 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 11296 return (ret_val); 11297 } 11298 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 11299 &rack->r_ctl.challenge_ack_ts, 11300 &rack->r_ctl.challenge_ack_cnt)) { 11301 return (ret_val); 11302 } 11303 /* 11304 * If last ACK falls within this segment's sequence numbers, record 11305 * its timestamp. NOTE: 1) That the test incorporates suggestions 11306 * from the latest proposal of the tcplw@cray.com list (Braden 11307 * 1993/04/26). 2) That updating only on newer timestamps interferes 11308 * with our earlier PAWS tests, so this check should be solely 11309 * predicated on the sequence space of this segment. 3) That we 11310 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 11311 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 11312 * SEG.Len, This modified check allows us to overcome RFC1323's 11313 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 11314 * p.869. In such cases, we can still calculate the RTT correctly 11315 * when RCV.NXT == Last.ACK.Sent. 11316 */ 11317 if ((to->to_flags & TOF_TS) != 0 && 11318 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 11319 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 11320 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 11321 tp->ts_recent_age = tcp_ts_getticks(); 11322 tp->ts_recent = to->to_tsval; 11323 } 11324 /* 11325 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 11326 * is on (half-synchronized state), then queue data for later 11327 * processing; else drop segment and return. 11328 */ 11329 if ((thflags & TH_ACK) == 0) { 11330 if (tp->t_flags & TF_NEEDSYN) { 11331 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11332 tiwin, thflags, nxt_pkt)); 11333 11334 } else if (tp->t_flags & TF_ACKNOW) { 11335 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 11336 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 11337 return (ret_val); 11338 } else { 11339 ctf_do_drop(m, NULL); 11340 return (0); 11341 } 11342 } 11343 /* 11344 * Ack processing. 
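 * A non-zero return from rack_process_ack() means we must not
 * call rack_process_data(); ret_val then says whether the TCB
 * is still locked (0) or has been unlocked and is unsafe to
 * touch (non-zero).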
11345 */ 11346 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 11347 return (ret_val); 11348 } 11349 if (sbavail(&so->so_snd)) { 11350 if (ctf_progress_timeout_check(tp, true)) { 11351 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 11352 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 11353 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11354 return (1); 11355 } 11356 } 11357 /* State changes only happen in rack_process_data() */ 11358 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11359 tiwin, thflags, nxt_pkt)); 11360 } 11361 11362 /* 11363 * Return value of 1, the TCB is unlocked and most 11364 * likely gone, return value of 0, the TCP is still 11365 * locked. 11366 */ 11367 static int 11368 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, 11369 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 11370 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 11371 { 11372 int32_t ret_val = 0; 11373 struct tcp_rack *rack; 11374 11375 rack = (struct tcp_rack *)tp->t_fb_ptr; 11376 ctf_calc_rwin(so, tp); 11377 if ((thflags & TH_RST) || 11378 (tp->t_fin_is_rst && (thflags & TH_FIN))) 11379 return (ctf_process_rst(m, th, so, tp)); 11380 /* 11381 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 11382 * synchronized state. 11383 */ 11384 if (thflags & TH_SYN) { 11385 ctf_challenge_ack(m, th, tp, &ret_val); 11386 return (ret_val); 11387 } 11388 /* 11389 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 11390 * it's less than ts_recent, drop it. 11391 */ 11392 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 11393 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 11394 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 11395 return (ret_val); 11396 } 11397 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 11398 &rack->r_ctl.challenge_ack_ts, 11399 &rack->r_ctl.challenge_ack_cnt)) { 11400 return (ret_val); 11401 } 11402 /* 11403 * If last ACK falls within this segment's sequence numbers, record 11404 * its timestamp. NOTE: 1) That the test incorporates suggestions 11405 * from the latest proposal of the tcplw@cray.com list (Braden 11406 * 1993/04/26). 2) That updating only on newer timestamps interferes 11407 * with our earlier PAWS tests, so this check should be solely 11408 * predicated on the sequence space of this segment. 3) That we 11409 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 11410 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 11411 * SEG.Len, This modified check allows us to overcome RFC1323's 11412 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 11413 * p.869. In such cases, we can still calculate the RTT correctly 11414 * when RCV.NXT == Last.ACK.Sent. 11415 */ 11416 if ((to->to_flags & TOF_TS) != 0 && 11417 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 11418 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 11419 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 11420 tp->ts_recent_age = tcp_ts_getticks(); 11421 tp->ts_recent = to->to_tsval; 11422 } 11423 /* 11424 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 11425 * is on (half-synchronized state), then queue data for later 11426 * processing; else drop segment and return. 
11427 */ 11428 if ((thflags & TH_ACK) == 0) { 11429 if (tp->t_flags & TF_NEEDSYN) { 11430 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11431 tiwin, thflags, nxt_pkt)); 11432 11433 } else if (tp->t_flags & TF_ACKNOW) { 11434 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 11435 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 11436 return (ret_val); 11437 } else { 11438 ctf_do_drop(m, NULL); 11439 return (0); 11440 } 11441 } 11442 /* 11443 * Ack processing. 11444 */ 11445 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 11446 return (ret_val); 11447 } 11448 if (sbavail(&so->so_snd)) { 11449 if (ctf_progress_timeout_check(tp, true)) { 11450 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 11451 tp, tick, PROGRESS_DROP, __LINE__); 11452 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 11453 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11454 return (1); 11455 } 11456 } 11457 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11458 tiwin, thflags, nxt_pkt)); 11459 } 11460 11461 static int 11462 rack_check_data_after_close(struct mbuf *m, 11463 struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so) 11464 { 11465 struct tcp_rack *rack; 11466 11467 rack = (struct tcp_rack *)tp->t_fb_ptr; 11468 if (rack->rc_allow_data_af_clo == 0) { 11469 close_now: 11470 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 11471 /* tcp_close will kill the inp pre-log the Reset */ 11472 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 11473 tp = tcp_close(tp); 11474 KMOD_TCPSTAT_INC(tcps_rcvafterclose); 11475 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); 11476 return (1); 11477 } 11478 if (sbavail(&so->so_snd) == 0) 11479 goto close_now; 11480 /* Ok we allow data that is ignored and a followup reset */ 11481 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 11482 tp->rcv_nxt = th->th_seq + *tlen; 11483 tp->t_flags2 |= TF2_DROP_AF_DATA; 11484 rack->r_wanted_output = 1; 11485 *tlen = 0; 11486 return (0); 11487 } 11488 11489 /* 11490 * Return value of 1, the TCB is unlocked and most 11491 * likely gone, return value of 0, the TCP is still 11492 * locked. 11493 */ 11494 static int 11495 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, 11496 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 11497 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 11498 { 11499 int32_t ret_val = 0; 11500 int32_t ourfinisacked = 0; 11501 struct tcp_rack *rack; 11502 11503 rack = (struct tcp_rack *)tp->t_fb_ptr; 11504 ctf_calc_rwin(so, tp); 11505 11506 if ((thflags & TH_RST) || 11507 (tp->t_fin_is_rst && (thflags & TH_FIN))) 11508 return (ctf_process_rst(m, th, so, tp)); 11509 /* 11510 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 11511 * synchronized state. 11512 */ 11513 if (thflags & TH_SYN) { 11514 ctf_challenge_ack(m, th, tp, &ret_val); 11515 return (ret_val); 11516 } 11517 /* 11518 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 11519 * it's less than ts_recent, drop it. 
11520 */ 11521 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 11522 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 11523 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 11524 return (ret_val); 11525 } 11526 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 11527 &rack->r_ctl.challenge_ack_ts, 11528 &rack->r_ctl.challenge_ack_cnt)) { 11529 return (ret_val); 11530 } 11531 /* 11532 * If new data are received on a connection after the user processes 11533 * are gone, then RST the other end. 11534 */ 11535 if ((so->so_state & SS_NOFDREF) && tlen) { 11536 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 11537 return (1); 11538 } 11539 /* 11540 * If last ACK falls within this segment's sequence numbers, record 11541 * its timestamp. NOTE: 1) That the test incorporates suggestions 11542 * from the latest proposal of the tcplw@cray.com list (Braden 11543 * 1993/04/26). 2) That updating only on newer timestamps interferes 11544 * with our earlier PAWS tests, so this check should be solely 11545 * predicated on the sequence space of this segment. 3) That we 11546 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 11547 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 11548 * SEG.Len, This modified check allows us to overcome RFC1323's 11549 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 11550 * p.869. In such cases, we can still calculate the RTT correctly 11551 * when RCV.NXT == Last.ACK.Sent. 11552 */ 11553 if ((to->to_flags & TOF_TS) != 0 && 11554 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 11555 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 11556 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 11557 tp->ts_recent_age = tcp_ts_getticks(); 11558 tp->ts_recent = to->to_tsval; 11559 } 11560 /* 11561 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 11562 * is on (half-synchronized state), then queue data for later 11563 * processing; else drop segment and return. 11564 */ 11565 if ((thflags & TH_ACK) == 0) { 11566 if (tp->t_flags & TF_NEEDSYN) { 11567 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11568 tiwin, thflags, nxt_pkt)); 11569 } else if (tp->t_flags & TF_ACKNOW) { 11570 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 11571 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 11572 return (ret_val); 11573 } else { 11574 ctf_do_drop(m, NULL); 11575 return (0); 11576 } 11577 } 11578 /* 11579 * Ack processing. 11580 */ 11581 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 11582 return (ret_val); 11583 } 11584 if (ourfinisacked) { 11585 /* 11586 * If we can't receive any more data, then closing user can 11587 * proceed. Starting the timer is contrary to the 11588 * specification, but if we don't get a FIN we'll hang 11589 * forever. 11590 * 11591 * XXXjl: we should release the tp also, and use a 11592 * compressed state. 11593 */ 11594 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 11595 soisdisconnected(so); 11596 tcp_timer_activate(tp, TT_2MSL, 11597 (tcp_fast_finwait2_recycle ? 
11598 tcp_finwait2_timeout : 11599 TP_MAXIDLE(tp))); 11600 } 11601 tcp_state_change(tp, TCPS_FIN_WAIT_2); 11602 } 11603 if (sbavail(&so->so_snd)) { 11604 if (ctf_progress_timeout_check(tp, true)) { 11605 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 11606 tp, tick, PROGRESS_DROP, __LINE__); 11607 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 11608 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11609 return (1); 11610 } 11611 } 11612 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11613 tiwin, thflags, nxt_pkt)); 11614 } 11615 11616 /* 11617 * Return value of 1, the TCB is unlocked and most 11618 * likely gone, return value of 0, the TCP is still 11619 * locked. 11620 */ 11621 static int 11622 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, 11623 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 11624 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 11625 { 11626 int32_t ret_val = 0; 11627 int32_t ourfinisacked = 0; 11628 struct tcp_rack *rack; 11629 11630 rack = (struct tcp_rack *)tp->t_fb_ptr; 11631 ctf_calc_rwin(so, tp); 11632 11633 if ((thflags & TH_RST) || 11634 (tp->t_fin_is_rst && (thflags & TH_FIN))) 11635 return (ctf_process_rst(m, th, so, tp)); 11636 /* 11637 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 11638 * synchronized state. 11639 */ 11640 if (thflags & TH_SYN) { 11641 ctf_challenge_ack(m, th, tp, &ret_val); 11642 return (ret_val); 11643 } 11644 /* 11645 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 11646 * it's less than ts_recent, drop it. 11647 */ 11648 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 11649 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 11650 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 11651 return (ret_val); 11652 } 11653 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 11654 &rack->r_ctl.challenge_ack_ts, 11655 &rack->r_ctl.challenge_ack_cnt)) { 11656 return (ret_val); 11657 } 11658 /* 11659 * If new data are received on a connection after the user processes 11660 * are gone, then RST the other end. 11661 */ 11662 if ((so->so_state & SS_NOFDREF) && tlen) { 11663 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 11664 return (1); 11665 } 11666 /* 11667 * If last ACK falls within this segment's sequence numbers, record 11668 * its timestamp. NOTE: 1) That the test incorporates suggestions 11669 * from the latest proposal of the tcplw@cray.com list (Braden 11670 * 1993/04/26). 2) That updating only on newer timestamps interferes 11671 * with our earlier PAWS tests, so this check should be solely 11672 * predicated on the sequence space of this segment. 3) That we 11673 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 11674 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 11675 * SEG.Len, This modified check allows us to overcome RFC1323's 11676 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 11677 * p.869. In such cases, we can still calculate the RTT correctly 11678 * when RCV.NXT == Last.ACK.Sent. 
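 * A small worked example (illustrative numbers): with
 * last_ack_sent at 100, a segment carrying th_seq 90 and tlen
 * 20 covers [90, 110); SEQ_LEQ(90, 100) and SEQ_LEQ(100, 110)
 * both hold, so ts_recent is taken from it.  The boundary case
 * Last.ACK.Sent == SEG.SEQ + SEG.LEN, which RFC1323's strict
 * "<" would reject, is accepted by the "<=" form used here.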
11679 */ 11680 if ((to->to_flags & TOF_TS) != 0 && 11681 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 11682 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 11683 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 11684 tp->ts_recent_age = tcp_ts_getticks(); 11685 tp->ts_recent = to->to_tsval; 11686 } 11687 /* 11688 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 11689 * is on (half-synchronized state), then queue data for later 11690 * processing; else drop segment and return. 11691 */ 11692 if ((thflags & TH_ACK) == 0) { 11693 if (tp->t_flags & TF_NEEDSYN) { 11694 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11695 tiwin, thflags, nxt_pkt)); 11696 } else if (tp->t_flags & TF_ACKNOW) { 11697 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 11698 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 11699 return (ret_val); 11700 } else { 11701 ctf_do_drop(m, NULL); 11702 return (0); 11703 } 11704 } 11705 /* 11706 * Ack processing. 11707 */ 11708 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 11709 return (ret_val); 11710 } 11711 if (ourfinisacked) { 11712 tcp_twstart(tp); 11713 m_freem(m); 11714 return (1); 11715 } 11716 if (sbavail(&so->so_snd)) { 11717 if (ctf_progress_timeout_check(tp, true)) { 11718 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 11719 tp, tick, PROGRESS_DROP, __LINE__); 11720 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 11721 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11722 return (1); 11723 } 11724 } 11725 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11726 tiwin, thflags, nxt_pkt)); 11727 } 11728 11729 /* 11730 * Return value of 1, the TCB is unlocked and most 11731 * likely gone, return value of 0, the TCP is still 11732 * locked. 11733 */ 11734 static int 11735 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 11736 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 11737 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 11738 { 11739 int32_t ret_val = 0; 11740 int32_t ourfinisacked = 0; 11741 struct tcp_rack *rack; 11742 11743 rack = (struct tcp_rack *)tp->t_fb_ptr; 11744 ctf_calc_rwin(so, tp); 11745 11746 if ((thflags & TH_RST) || 11747 (tp->t_fin_is_rst && (thflags & TH_FIN))) 11748 return (ctf_process_rst(m, th, so, tp)); 11749 /* 11750 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 11751 * synchronized state. 11752 */ 11753 if (thflags & TH_SYN) { 11754 ctf_challenge_ack(m, th, tp, &ret_val); 11755 return (ret_val); 11756 } 11757 /* 11758 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 11759 * it's less than ts_recent, drop it. 11760 */ 11761 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 11762 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 11763 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 11764 return (ret_val); 11765 } 11766 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 11767 &rack->r_ctl.challenge_ack_ts, 11768 &rack->r_ctl.challenge_ack_cnt)) { 11769 return (ret_val); 11770 } 11771 /* 11772 * If new data are received on a connection after the user processes 11773 * are gone, then RST the other end. 11774 */ 11775 if ((so->so_state & SS_NOFDREF) && tlen) { 11776 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 11777 return (1); 11778 } 11779 /* 11780 * If last ACK falls within this segment's sequence numbers, record 11781 * its timestamp. 
NOTE: 1) That the test incorporates suggestions 11782 * from the latest proposal of the tcplw@cray.com list (Braden 11783 * 1993/04/26). 2) That updating only on newer timestamps interferes 11784 * with our earlier PAWS tests, so this check should be solely 11785 * predicated on the sequence space of this segment. 3) That we 11786 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 11787 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 11788 * SEG.Len, This modified check allows us to overcome RFC1323's 11789 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 11790 * p.869. In such cases, we can still calculate the RTT correctly 11791 * when RCV.NXT == Last.ACK.Sent. 11792 */ 11793 if ((to->to_flags & TOF_TS) != 0 && 11794 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 11795 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 11796 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 11797 tp->ts_recent_age = tcp_ts_getticks(); 11798 tp->ts_recent = to->to_tsval; 11799 } 11800 /* 11801 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 11802 * is on (half-synchronized state), then queue data for later 11803 * processing; else drop segment and return. 11804 */ 11805 if ((thflags & TH_ACK) == 0) { 11806 if (tp->t_flags & TF_NEEDSYN) { 11807 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11808 tiwin, thflags, nxt_pkt)); 11809 } else if (tp->t_flags & TF_ACKNOW) { 11810 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 11811 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 11812 return (ret_val); 11813 } else { 11814 ctf_do_drop(m, NULL); 11815 return (0); 11816 } 11817 } 11818 /* 11819 * case TCPS_LAST_ACK: Ack processing. 11820 */ 11821 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 11822 return (ret_val); 11823 } 11824 if (ourfinisacked) { 11825 tp = tcp_close(tp); 11826 ctf_do_drop(m, tp); 11827 return (1); 11828 } 11829 if (sbavail(&so->so_snd)) { 11830 if (ctf_progress_timeout_check(tp, true)) { 11831 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 11832 tp, tick, PROGRESS_DROP, __LINE__); 11833 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 11834 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11835 return (1); 11836 } 11837 } 11838 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11839 tiwin, thflags, nxt_pkt)); 11840 } 11841 11842 /* 11843 * Return value of 1, the TCB is unlocked and most 11844 * likely gone, return value of 0, the TCP is still 11845 * locked. 11846 */ 11847 static int 11848 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, 11849 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 11850 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 11851 { 11852 int32_t ret_val = 0; 11853 int32_t ourfinisacked = 0; 11854 struct tcp_rack *rack; 11855 11856 rack = (struct tcp_rack *)tp->t_fb_ptr; 11857 ctf_calc_rwin(so, tp); 11858 11859 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 11860 if ((thflags & TH_RST) || 11861 (tp->t_fin_is_rst && (thflags & TH_FIN))) 11862 return (ctf_process_rst(m, th, so, tp)); 11863 /* 11864 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 11865 * synchronized state. 11866 */ 11867 if (thflags & TH_SYN) { 11868 ctf_challenge_ack(m, th, tp, &ret_val); 11869 return (ret_val); 11870 } 11871 /* 11872 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 11873 * it's less than ts_recent, drop it. 
11874 */ 11875 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 11876 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 11877 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 11878 return (ret_val); 11879 } 11880 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 11881 &rack->r_ctl.challenge_ack_ts, 11882 &rack->r_ctl.challenge_ack_cnt)) { 11883 return (ret_val); 11884 } 11885 /* 11886 * If new data are received on a connection after the user processes 11887 * are gone, then RST the other end. 11888 */ 11889 if ((so->so_state & SS_NOFDREF) && 11890 tlen) { 11891 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 11892 return (1); 11893 } 11894 /* 11895 * If last ACK falls within this segment's sequence numbers, record 11896 * its timestamp. NOTE: 1) That the test incorporates suggestions 11897 * from the latest proposal of the tcplw@cray.com list (Braden 11898 * 1993/04/26). 2) That updating only on newer timestamps interferes 11899 * with our earlier PAWS tests, so this check should be solely 11900 * predicated on the sequence space of this segment. 3) That we 11901 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 11902 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 11903 * SEG.Len, This modified check allows us to overcome RFC1323's 11904 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 11905 * p.869. In such cases, we can still calculate the RTT correctly 11906 * when RCV.NXT == Last.ACK.Sent. 11907 */ 11908 if ((to->to_flags & TOF_TS) != 0 && 11909 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 11910 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 11911 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 11912 tp->ts_recent_age = tcp_ts_getticks(); 11913 tp->ts_recent = to->to_tsval; 11914 } 11915 /* 11916 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 11917 * is on (half-synchronized state), then queue data for later 11918 * processing; else drop segment and return. 11919 */ 11920 if ((thflags & TH_ACK) == 0) { 11921 if (tp->t_flags & TF_NEEDSYN) { 11922 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11923 tiwin, thflags, nxt_pkt)); 11924 } else if (tp->t_flags & TF_ACKNOW) { 11925 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 11926 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 11927 return (ret_val); 11928 } else { 11929 ctf_do_drop(m, NULL); 11930 return (0); 11931 } 11932 } 11933 /* 11934 * Ack processing. 
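 * In FIN_WAIT_2 our FIN has already been acknowledged (that is
 * how we arrived here), so while rack_process_ack() still fills
 * in ourfinisacked nothing below acts on it; a non-zero return
 * simply means the TCB was unlocked (and possibly destroyed)
 * and ret_val must be handed straight back to the caller.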
11935 */ 11936 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 11937 return (ret_val); 11938 } 11939 if (sbavail(&so->so_snd)) { 11940 if (ctf_progress_timeout_check(tp, true)) { 11941 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 11942 tp, tick, PROGRESS_DROP, __LINE__); 11943 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 11944 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11945 return (1); 11946 } 11947 } 11948 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11949 tiwin, thflags, nxt_pkt)); 11950 } 11951 11952 static void inline 11953 rack_clear_rate_sample(struct tcp_rack *rack) 11954 { 11955 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY; 11956 rack->r_ctl.rack_rs.rs_rtt_cnt = 0; 11957 rack->r_ctl.rack_rs.rs_rtt_tot = 0; 11958 } 11959 11960 static void 11961 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override) 11962 { 11963 uint64_t bw_est, rate_wanted; 11964 int chged = 0; 11965 uint32_t user_max, orig_min, orig_max; 11966 11967 orig_min = rack->r_ctl.rc_pace_min_segs; 11968 orig_max = rack->r_ctl.rc_pace_max_segs; 11969 user_max = ctf_fixed_maxseg(tp) * rack->rc_user_set_max_segs; 11970 if (ctf_fixed_maxseg(tp) != rack->r_ctl.rc_pace_min_segs) 11971 chged = 1; 11972 rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp); 11973 if (rack->use_fixed_rate || rack->rc_force_max_seg) { 11974 if (user_max != rack->r_ctl.rc_pace_max_segs) 11975 chged = 1; 11976 } 11977 if (rack->rc_force_max_seg) { 11978 rack->r_ctl.rc_pace_max_segs = user_max; 11979 } else if (rack->use_fixed_rate) { 11980 bw_est = rack_get_bw(rack); 11981 if ((rack->r_ctl.crte == NULL) || 11982 (bw_est != rack->r_ctl.crte->rate)) { 11983 rack->r_ctl.rc_pace_max_segs = user_max; 11984 } else { 11985 /* We are pacing right at the hardware rate */ 11986 uint32_t segsiz; 11987 11988 segsiz = min(ctf_fixed_maxseg(tp), 11989 rack->r_ctl.rc_pace_min_segs); 11990 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size( 11991 tp, bw_est, segsiz, 0, 11992 rack->r_ctl.crte, NULL); 11993 } 11994 } else if (rack->rc_always_pace) { 11995 if (rack->r_ctl.gp_bw || 11996 #ifdef NETFLIX_PEAKRATE 11997 rack->rc_tp->t_maxpeakrate || 11998 #endif 11999 rack->r_ctl.init_rate) { 12000 /* We have a rate of some sort set */ 12001 uint32_t orig; 12002 12003 bw_est = rack_get_bw(rack); 12004 orig = rack->r_ctl.rc_pace_max_segs; 12005 if (fill_override) 12006 rate_wanted = *fill_override; 12007 else 12008 rate_wanted = rack_get_output_bw(rack, bw_est, NULL, NULL); 12009 if (rate_wanted) { 12010 /* We have something */ 12011 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, 12012 rate_wanted, 12013 ctf_fixed_maxseg(rack->rc_tp)); 12014 } else 12015 rack->r_ctl.rc_pace_max_segs = rack->r_ctl.rc_pace_min_segs; 12016 if (orig != rack->r_ctl.rc_pace_max_segs) 12017 chged = 1; 12018 } else if ((rack->r_ctl.gp_bw == 0) && 12019 (rack->r_ctl.rc_pace_max_segs == 0)) { 12020 /* 12021 * If we have nothing limit us to bursting 12022 * out IW sized pieces. 
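 * Rough sizing sketch (illustrative values): a fixed maxseg of
 * 1448 bytes with rc_user_set_max_segs of 40 makes user_max
 * 57920 bytes, while rc_init_window() below falls back to an
 * IW-sized burst when no bandwidth estimate exists yet; either
 * way the result is clamped to PACE_MAX_IP_BYTES further down.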
12023 */ 12024 chged = 1; 12025 rack->r_ctl.rc_pace_max_segs = rc_init_window(rack); 12026 } 12027 } 12028 if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) { 12029 chged = 1; 12030 rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES; 12031 } 12032 if (chged) 12033 rack_log_type_pacing_sizes(tp, rack, orig_min, orig_max, line, 2); 12034 } 12035 12036 12037 static void 12038 rack_init_fsb_block(struct tcpcb *tp, struct tcp_rack *rack) 12039 { 12040 #ifdef INET6 12041 struct ip6_hdr *ip6 = NULL; 12042 #endif 12043 #ifdef INET 12044 struct ip *ip = NULL; 12045 #endif 12046 #if defined(INET) || defined(INET6) 12047 struct udphdr *udp = NULL; 12048 #endif 12049 12050 /* Ok lets fill in the fast block, it can only be used with no IP options! */ 12051 #ifdef INET6 12052 if (rack->r_is_v6) { 12053 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 12054 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 12055 if (tp->t_port) { 12056 rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr); 12057 udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr)); 12058 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 12059 udp->uh_dport = tp->t_port; 12060 rack->r_ctl.fsb.udp = udp; 12061 rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1); 12062 } else 12063 { 12064 rack->r_ctl.fsb.th = (struct tcphdr *)(ip6 + 1); 12065 rack->r_ctl.fsb.udp = NULL; 12066 } 12067 tcpip_fillheaders(rack->rc_inp, 12068 tp->t_port, 12069 ip6, rack->r_ctl.fsb.th); 12070 } else 12071 #endif /* INET6 */ 12072 #ifdef INET 12073 { 12074 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr); 12075 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 12076 if (tp->t_port) { 12077 rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr); 12078 udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip)); 12079 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 12080 udp->uh_dport = tp->t_port; 12081 rack->r_ctl.fsb.udp = udp; 12082 rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1); 12083 } else 12084 { 12085 rack->r_ctl.fsb.udp = NULL; 12086 rack->r_ctl.fsb.th = (struct tcphdr *)(ip + 1); 12087 } 12088 tcpip_fillheaders(rack->rc_inp, 12089 tp->t_port, 12090 ip, rack->r_ctl.fsb.th); 12091 } 12092 #endif 12093 rack->r_fsb_inited = 1; 12094 } 12095 12096 static int 12097 rack_init_fsb(struct tcpcb *tp, struct tcp_rack *rack) 12098 { 12099 /* 12100 * Allocate the larger of spaces V6 if available else just 12101 * V4 and include udphdr (overbook) 12102 */ 12103 #ifdef INET6 12104 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + sizeof(struct udphdr); 12105 #else 12106 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr) + sizeof(struct udphdr); 12107 #endif 12108 rack->r_ctl.fsb.tcp_ip_hdr = malloc(rack->r_ctl.fsb.tcp_ip_hdr_len, 12109 M_TCPFSB, M_NOWAIT|M_ZERO); 12110 if (rack->r_ctl.fsb.tcp_ip_hdr == NULL) { 12111 return (ENOMEM); 12112 } 12113 rack->r_fsb_inited = 0; 12114 return (0); 12115 } 12116 12117 static int 12118 rack_init(struct tcpcb *tp) 12119 { 12120 struct tcp_rack *rack = NULL; 12121 struct rack_sendmap *insret; 12122 uint32_t iwin, snt, us_cts; 12123 int err; 12124 12125 tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT); 12126 if (tp->t_fb_ptr == NULL) { 12127 /* 12128 * We need to allocate memory but cant. The INP and INP_INFO 12129 * locks and they are recusive (happens during setup. 
So a 12130 * scheme to drop the locks fails :( 12131 * 12132 */ 12133 return (ENOMEM); 12134 } 12135 memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack)); 12136 12137 rack = (struct tcp_rack *)tp->t_fb_ptr; 12138 RB_INIT(&rack->r_ctl.rc_mtree); 12139 TAILQ_INIT(&rack->r_ctl.rc_free); 12140 TAILQ_INIT(&rack->r_ctl.rc_tmap); 12141 rack->rc_tp = tp; 12142 rack->rc_inp = tp->t_inpcb; 12143 /* Set the flag */ 12144 rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; 12145 /* Probably not needed but lets be sure */ 12146 rack_clear_rate_sample(rack); 12147 /* 12148 * Save off the default values, socket options will poke 12149 * at these if pacing is not on or we have not yet 12150 * reached where pacing is on (gp_ready/fixed enabled). 12151 * When they get set into the CC module (when gp_ready 12152 * is enabled or we enable fixed) then we will set these 12153 * values into the CC and place in here the old values 12154 * so we have a restoral. Then we will set the flag 12155 * rc_pacing_cc_set. That way whenever we turn off pacing 12156 * or switch off this stack, we will know to go restore 12157 * the saved values. 12158 */ 12159 rack->r_ctl.rc_saved_beta.beta = V_newreno_beta_ecn; 12160 rack->r_ctl.rc_saved_beta.beta_ecn = V_newreno_beta_ecn; 12161 /* We want abe like behavior as well */ 12162 rack->r_ctl.rc_saved_beta.newreno_flags = CC_NEWRENO_BETA_ECN; 12163 rack->r_ctl.rc_reorder_fade = rack_reorder_fade; 12164 rack->rc_allow_data_af_clo = rack_ignore_data_after_close; 12165 rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh; 12166 if (use_rack_rr) 12167 rack->use_rack_rr = 1; 12168 if (V_tcp_delack_enabled) 12169 tp->t_delayed_ack = 1; 12170 else 12171 tp->t_delayed_ack = 0; 12172 #ifdef TCP_ACCOUNTING 12173 if (rack_tcp_accounting) { 12174 tp->t_flags2 |= TF2_TCP_ACCOUNTING; 12175 } 12176 #endif 12177 if (rack_enable_shared_cwnd) 12178 rack->rack_enable_scwnd = 1; 12179 rack->rc_user_set_max_segs = rack_hptsi_segments; 12180 rack->rc_force_max_seg = 0; 12181 if (rack_use_imac_dack) 12182 rack->rc_dack_mode = 1; 12183 TAILQ_INIT(&rack->r_ctl.opt_list); 12184 rack->r_ctl.rc_reorder_shift = rack_reorder_thresh; 12185 rack->r_ctl.rc_pkt_delay = rack_pkt_delay; 12186 rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp; 12187 rack->r_ctl.rc_lowest_us_rtt = 0xffffffff; 12188 rack->r_ctl.rc_highest_us_rtt = 0; 12189 rack->r_ctl.bw_rate_cap = rack_bw_rate_cap; 12190 rack->r_ctl.timer_slop = TICKS_2_USEC(tcp_rexmit_slop); 12191 if (rack_use_cmp_acks) 12192 rack->r_use_cmp_ack = 1; 12193 if (rack_disable_prr) 12194 rack->rack_no_prr = 1; 12195 if (rack_gp_no_rec_chg) 12196 rack->rc_gp_no_rec_chg = 1; 12197 if (rack_pace_every_seg && tcp_can_enable_pacing()) { 12198 rack->rc_always_pace = 1; 12199 if (rack->use_fixed_rate || rack->gp_ready) 12200 rack_set_cc_pacing(rack); 12201 } else 12202 rack->rc_always_pace = 0; 12203 if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack) 12204 rack->r_mbuf_queue = 1; 12205 else 12206 rack->r_mbuf_queue = 0; 12207 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 12208 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 12209 else 12210 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 12211 rack_set_pace_segments(tp, rack, __LINE__, NULL); 12212 if (rack_limits_scwnd) 12213 rack->r_limit_scw = 1; 12214 else 12215 rack->r_limit_scw = 0; 12216 rack->rc_labc = V_tcp_abc_l_var; 12217 rack->r_ctl.rc_high_rwnd = tp->snd_wnd; 12218 rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 12219 rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method; 12220 
rack->rack_tlp_threshold_use = rack_tlp_threshold_use; 12221 rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; 12222 rack->r_ctl.rc_min_to = rack_min_to; 12223 microuptime(&rack->r_ctl.act_rcv_time); 12224 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; 12225 rack->r_running_late = 0; 12226 rack->r_running_early = 0; 12227 rack->rc_init_win = rack_default_init_window; 12228 rack->r_ctl.rack_per_of_gp_ss = rack_per_of_gp_ss; 12229 if (rack_hw_up_only) 12230 rack->r_up_only = 1; 12231 if (rack_do_dyn_mul) { 12232 /* When dynamic adjustment is on CA needs to start at 100% */ 12233 rack->rc_gp_dyn_mul = 1; 12234 if (rack_do_dyn_mul >= 100) 12235 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul; 12236 } else 12237 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca; 12238 rack->r_ctl.rack_per_of_gp_rec = rack_per_of_gp_rec; 12239 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 12240 rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time); 12241 setup_time_filter_small(&rack->r_ctl.rc_gp_min_rtt, FILTER_TYPE_MIN, 12242 rack_probertt_filter_life); 12243 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 12244 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 12245 rack->r_ctl.rc_time_of_last_probertt = us_cts; 12246 rack->r_ctl.challenge_ack_ts = tcp_ts_getticks(); 12247 rack->r_ctl.rc_time_probertt_starts = 0; 12248 /* We require at least one measurement, even if the sysctl is 0 */ 12249 if (rack_req_measurements) 12250 rack->r_ctl.req_measurements = rack_req_measurements; 12251 else 12252 rack->r_ctl.req_measurements = 1; 12253 if (rack_enable_hw_pacing) 12254 rack->rack_hdw_pace_ena = 1; 12255 if (rack_hw_rate_caps) 12256 rack->r_rack_hw_rate_caps = 1; 12257 /* Do we force on detection? */ 12258 #ifdef NETFLIX_EXP_DETECTION 12259 if (tcp_force_detection) 12260 rack->do_detection = 1; 12261 else 12262 #endif 12263 rack->do_detection = 0; 12264 if (rack_non_rxt_use_cr) 12265 rack->rack_rec_nonrxt_use_cr = 1; 12266 err = rack_init_fsb(tp, rack); 12267 if (err) { 12268 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 12269 tp->t_fb_ptr = NULL; 12270 return (err); 12271 } 12272 if (tp->snd_una != tp->snd_max) { 12273 /* Create a send map for the current outstanding data */ 12274 struct rack_sendmap *rsm; 12275 12276 rsm = rack_alloc(rack); 12277 if (rsm == NULL) { 12278 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 12279 tp->t_fb_ptr = NULL; 12280 return (ENOMEM); 12281 } 12282 rsm->r_no_rtt_allowed = 1; 12283 rsm->r_tim_lastsent[0] = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 12284 rsm->r_rtr_cnt = 1; 12285 rsm->r_rtr_bytes = 0; 12286 if (tp->t_flags & TF_SENTFIN) { 12287 rsm->r_end = tp->snd_max - 1; 12288 rsm->r_flags |= RACK_HAS_FIN; 12289 } else { 12290 rsm->r_end = tp->snd_max; 12291 } 12292 if (tp->snd_una == tp->iss) { 12293 /* The data space is one beyond snd_una */ 12294 rsm->r_flags |= RACK_HAS_SYN; 12295 rsm->r_start = tp->iss; 12296 rsm->r_end = rsm->r_start + (tp->snd_max - tp->snd_una); 12297 } else 12298 rsm->r_start = tp->snd_una; 12299 rsm->r_dupack = 0; 12300 if (rack->rc_inp->inp_socket->so_snd.sb_mb != NULL) { 12301 rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 0, &rsm->soff); 12302 if (rsm->m) 12303 rsm->orig_m_len = rsm->m->m_len; 12304 else 12305 rsm->orig_m_len = 0; 12306 } else { 12307 /* 12308 * This can happen if we have a stand-alone FIN or 12309 * SYN. 
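 * For example (sequence numbers are made up): taking over a
 * connection with snd_una 1000 and snd_max 5000 yields one
 * rack_sendmap covering [1000, 5000) with r_rtr_cnt of 1 and
 * its mbuf located via sbsndmbuf() at offset 0 of the send
 * buffer; a stand-alone SYN or FIN owns sequence space but no
 * data, which is why a NULL mbuf is tolerated here.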
12310 */ 12311 rsm->m = NULL; 12312 rsm->orig_m_len = 0; 12313 rsm->soff = 0; 12314 } 12315 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 12316 #ifdef INVARIANTS 12317 if (insret != NULL) { 12318 panic("Insert in rb tree fails ret:%p rack:%p rsm:%p", 12319 insret, rack, rsm); 12320 } 12321 #endif 12322 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 12323 rsm->r_in_tmap = 1; 12324 } 12325 /* 12326 * Timers in Rack are kept in microseconds so lets 12327 * convert any initial incoming variables 12328 * from ticks into usecs. Note that we 12329 * also change the values of t_srtt and t_rttvar, if 12330 * they are non-zero. They are kept with a 5 12331 * bit decimal so we have to carefully convert 12332 * these to get the full precision. 12333 */ 12334 rack_convert_rtts(tp); 12335 tp->t_rttlow = TICKS_2_USEC(tp->t_rttlow); 12336 if (rack_def_profile) 12337 rack_set_profile(rack, rack_def_profile); 12338 /* Cancel the GP measurement in progress */ 12339 tp->t_flags &= ~TF_GPUTINPROG; 12340 if (SEQ_GT(tp->snd_max, tp->iss)) 12341 snt = tp->snd_max - tp->iss; 12342 else 12343 snt = 0; 12344 iwin = rc_init_window(rack); 12345 if (snt < iwin) { 12346 /* We are not past the initial window 12347 * so we need to make sure cwnd is 12348 * correct. 12349 */ 12350 if (tp->snd_cwnd < iwin) 12351 tp->snd_cwnd = iwin; 12352 /* 12353 * If we are within the initial window 12354 * we want ssthresh to be unlimited. Setting 12355 * it to the rwnd (which the default stack does 12356 * and older racks) is not really a good idea 12357 * since we want to be in SS and grow both the 12358 * cwnd and the rwnd (via dynamic rwnd growth). If 12359 * we set it to the rwnd then as the peer grows its 12360 * rwnd we will be stuck in CA and never hit SS. 12361 * 12362 * Its far better to raise it up high (this takes the 12363 * risk that there as been a loss already, probably 12364 * we should have an indicator in all stacks of loss 12365 * but we don't), but considering the normal use this 12366 * is a risk worth taking. The consequences of not 12367 * hitting SS are far worse than going one more time 12368 * into it early on (before we have sent even a IW). 12369 * It is highly unlikely that we will have had a loss 12370 * before getting the IW out. 12371 */ 12372 tp->snd_ssthresh = 0xffffffff; 12373 } 12374 rack_stop_all_timers(tp); 12375 /* Lets setup the fsb block */ 12376 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); 12377 rack_log_rtt_shrinks(rack, us_cts, tp->t_rxtcur, 12378 __LINE__, RACK_RTTS_INIT); 12379 return (0); 12380 } 12381 12382 static int 12383 rack_handoff_ok(struct tcpcb *tp) 12384 { 12385 if ((tp->t_state == TCPS_CLOSED) || 12386 (tp->t_state == TCPS_LISTEN)) { 12387 /* Sure no problem though it may not stick */ 12388 return (0); 12389 } 12390 if ((tp->t_state == TCPS_SYN_SENT) || 12391 (tp->t_state == TCPS_SYN_RECEIVED)) { 12392 /* 12393 * We really don't know if you support sack, 12394 * you have to get to ESTAB or beyond to tell. 12395 */ 12396 return (EAGAIN); 12397 } 12398 if ((tp->t_flags & TF_SENTFIN) && ((tp->snd_max - tp->snd_una) > 1)) { 12399 /* 12400 * Rack will only send a FIN after all data is acknowledged. 12401 * So in this case we have more data outstanding. 
We can't 12402 * switch stacks until either all data and only the FIN 12403 * is left (in which case rack_init() now knows how 12404 * to deal with that) <or> all is acknowledged and we 12405 * are only left with incoming data, though why you 12406 * would want to switch to rack after all data is acknowledged 12407 * I have no idea (rrs)! 12408 */ 12409 return (EAGAIN); 12410 } 12411 if ((tp->t_flags & TF_SACK_PERMIT) || rack_sack_not_required){ 12412 return (0); 12413 } 12414 /* 12415 * If we reach here we don't do SACK on this connection so we can 12416 * never do rack. 12417 */ 12418 return (EINVAL); 12419 } 12420 12421 12422 static void 12423 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) 12424 { 12425 int ack_cmp = 0; 12426 12427 if (tp->t_fb_ptr) { 12428 struct tcp_rack *rack; 12429 struct rack_sendmap *rsm, *nrsm, *rm; 12430 12431 rack = (struct tcp_rack *)tp->t_fb_ptr; 12432 if (tp->t_in_pkt) { 12433 /* 12434 * It is unsafe to process the packets since a 12435 * reset may be lurking in them (its rare but it 12436 * can occur). If we were to find a RST, then we 12437 * would end up dropping the connection and the 12438 * INP lock, so when we return the caller (tcp_usrreq) 12439 * will blow up when it trys to unlock the inp. 12440 */ 12441 struct mbuf *save, *m; 12442 12443 m = tp->t_in_pkt; 12444 tp->t_in_pkt = NULL; 12445 tp->t_tail_pkt = NULL; 12446 while (m) { 12447 save = m->m_nextpkt; 12448 m->m_nextpkt = NULL; 12449 m_freem(m); 12450 m = save; 12451 } 12452 if ((tp->t_inpcb) && 12453 (tp->t_inpcb->inp_flags2 & INP_MBUF_ACKCMP)) 12454 ack_cmp = 1; 12455 if (ack_cmp) { 12456 /* Total if we used large or small (if ack-cmp was used). */ 12457 if (rack->rc_inp->inp_flags2 & INP_MBUF_L_ACKS) 12458 counter_u64_add(rack_large_ackcmp, 1); 12459 else 12460 counter_u64_add(rack_small_ackcmp, 1); 12461 } 12462 } 12463 tp->t_flags &= ~TF_FORCEDATA; 12464 #ifdef NETFLIX_SHARED_CWND 12465 if (rack->r_ctl.rc_scw) { 12466 uint32_t limit; 12467 12468 if (rack->r_limit_scw) 12469 limit = max(1, rack->r_ctl.rc_lowest_us_rtt); 12470 else 12471 limit = 0; 12472 tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw, 12473 rack->r_ctl.rc_scw_index, 12474 limit); 12475 rack->r_ctl.rc_scw = NULL; 12476 } 12477 #endif 12478 if (rack->r_ctl.fsb.tcp_ip_hdr) { 12479 free(rack->r_ctl.fsb.tcp_ip_hdr, M_TCPFSB); 12480 rack->r_ctl.fsb.tcp_ip_hdr = NULL; 12481 rack->r_ctl.fsb.th = NULL; 12482 } 12483 /* Convert back to ticks, with */ 12484 if (tp->t_srtt > 1) { 12485 uint32_t val, frac; 12486 12487 val = USEC_2_TICKS(tp->t_srtt); 12488 frac = tp->t_srtt % (HPTS_USEC_IN_SEC / hz); 12489 tp->t_srtt = val << TCP_RTT_SHIFT; 12490 /* 12491 * frac is the fractional part here is left 12492 * over from converting to hz and shifting. 12493 * We need to convert this to the 5 bit 12494 * remainder. 12495 */ 12496 if (frac) { 12497 if (hz == 1000) { 12498 frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC); 12499 } else { 12500 frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC); 12501 } 12502 tp->t_srtt += frac; 12503 } 12504 } 12505 if (tp->t_rttvar) { 12506 uint32_t val, frac; 12507 12508 val = USEC_2_TICKS(tp->t_rttvar); 12509 frac = tp->t_srtt % (HPTS_USEC_IN_SEC / hz); 12510 tp->t_rttvar = val << TCP_RTTVAR_SHIFT; 12511 /* 12512 * frac is the fractional part here is left 12513 * over from converting to hz and shifting. 12514 * We need to convert this to the 5 bit 12515 * remainder. 
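 * A sketch of the conversion with purely illustrative numbers,
 * assuming hz == 1000 (one tick == 1000 usec): a value of 30500
 * usec gives val == 30 ticks and frac == 500 usec left over;
 * the fraction is rescaled as 500 * TCP_RTT_SCALE / 1000 (16
 * with the stock scale of 32) and folded into the low-order
 * fixed-point bits the default stack expects.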
12516 */ 12517 if (frac) { 12518 if (hz == 1000) { 12519 frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC); 12520 } else { 12521 frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC); 12522 } 12523 tp->t_rttvar += frac; 12524 } 12525 } 12526 tp->t_rxtcur = USEC_2_TICKS(tp->t_rxtcur); 12527 tp->t_rttlow = USEC_2_TICKS(tp->t_rttlow); 12528 if (rack->rc_always_pace) { 12529 tcp_decrement_paced_conn(); 12530 rack_undo_cc_pacing(rack); 12531 rack->rc_always_pace = 0; 12532 } 12533 /* Clean up any options if they were not applied */ 12534 while (!TAILQ_EMPTY(&rack->r_ctl.opt_list)) { 12535 struct deferred_opt_list *dol; 12536 12537 dol = TAILQ_FIRST(&rack->r_ctl.opt_list); 12538 TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next); 12539 free(dol, M_TCPDO); 12540 } 12541 /* rack does not use force data but other stacks may clear it */ 12542 if (rack->r_ctl.crte != NULL) { 12543 tcp_rel_pacing_rate(rack->r_ctl.crte, tp); 12544 rack->rack_hdrw_pacing = 0; 12545 rack->r_ctl.crte = NULL; 12546 } 12547 #ifdef TCP_BLACKBOX 12548 tcp_log_flowend(tp); 12549 #endif 12550 RB_FOREACH_SAFE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm) { 12551 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 12552 #ifdef INVARIANTS 12553 if (rm != rsm) { 12554 panic("At fini, rack:%p rsm:%p rm:%p", 12555 rack, rsm, rm); 12556 } 12557 #endif 12558 uma_zfree(rack_zone, rsm); 12559 } 12560 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 12561 while (rsm) { 12562 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 12563 uma_zfree(rack_zone, rsm); 12564 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 12565 } 12566 rack->rc_free_cnt = 0; 12567 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 12568 tp->t_fb_ptr = NULL; 12569 } 12570 if (tp->t_inpcb) { 12571 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 12572 tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY; 12573 tp->t_inpcb->inp_flags2 &= ~INP_DONT_SACK_QUEUE; 12574 tp->t_inpcb->inp_flags2 &= ~INP_MBUF_ACKCMP; 12575 /* Cancel the GP measurement in progress */ 12576 tp->t_flags &= ~TF_GPUTINPROG; 12577 tp->t_inpcb->inp_flags2 &= ~INP_MBUF_L_ACKS; 12578 } 12579 /* Make sure snd_nxt is correctly set */ 12580 tp->snd_nxt = tp->snd_max; 12581 } 12582 12583 static void 12584 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) 12585 { 12586 if ((rack->r_state == TCPS_CLOSED) && (tp->t_state != TCPS_CLOSED)) { 12587 rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; 12588 } 12589 switch (tp->t_state) { 12590 case TCPS_SYN_SENT: 12591 rack->r_state = TCPS_SYN_SENT; 12592 rack->r_substate = rack_do_syn_sent; 12593 break; 12594 case TCPS_SYN_RECEIVED: 12595 rack->r_state = TCPS_SYN_RECEIVED; 12596 rack->r_substate = rack_do_syn_recv; 12597 break; 12598 case TCPS_ESTABLISHED: 12599 rack_set_pace_segments(tp, rack, __LINE__, NULL); 12600 rack->r_state = TCPS_ESTABLISHED; 12601 rack->r_substate = rack_do_established; 12602 break; 12603 case TCPS_CLOSE_WAIT: 12604 rack_set_pace_segments(tp, rack, __LINE__, NULL); 12605 rack->r_state = TCPS_CLOSE_WAIT; 12606 rack->r_substate = rack_do_close_wait; 12607 break; 12608 case TCPS_FIN_WAIT_1: 12609 rack_set_pace_segments(tp, rack, __LINE__, NULL); 12610 rack->r_state = TCPS_FIN_WAIT_1; 12611 rack->r_substate = rack_do_fin_wait_1; 12612 break; 12613 case TCPS_CLOSING: 12614 rack_set_pace_segments(tp, rack, __LINE__, NULL); 12615 rack->r_state = TCPS_CLOSING; 12616 rack->r_substate = rack_do_closing; 12617 break; 12618 case TCPS_LAST_ACK: 12619 rack_set_pace_segments(tp, rack, 
__LINE__, NULL); 12620 rack->r_state = TCPS_LAST_ACK; 12621 rack->r_substate = rack_do_lastack; 12622 break; 12623 case TCPS_FIN_WAIT_2: 12624 rack_set_pace_segments(tp, rack, __LINE__, NULL); 12625 rack->r_state = TCPS_FIN_WAIT_2; 12626 rack->r_substate = rack_do_fin_wait_2; 12627 break; 12628 case TCPS_LISTEN: 12629 case TCPS_CLOSED: 12630 case TCPS_TIME_WAIT: 12631 default: 12632 break; 12633 }; 12634 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) 12635 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; 12636 12637 } 12638 12639 static void 12640 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) 12641 { 12642 /* 12643 * We received an ack, and then did not 12644 * call send or were bounced out due to the 12645 * hpts was running. Now a timer is up as well, is 12646 * it the right timer? 12647 */ 12648 struct rack_sendmap *rsm; 12649 int tmr_up; 12650 12651 tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 12652 if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT)) 12653 return; 12654 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 12655 if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) && 12656 (tmr_up == PACE_TMR_RXT)) { 12657 /* Should be an RXT */ 12658 return; 12659 } 12660 if (rsm == NULL) { 12661 /* Nothing outstanding? */ 12662 if (tp->t_flags & TF_DELACK) { 12663 if (tmr_up == PACE_TMR_DELACK) 12664 /* We are supposed to have delayed ack up and we do */ 12665 return; 12666 } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) { 12667 /* 12668 * if we hit enobufs then we would expect the possiblity 12669 * of nothing outstanding and the RXT up (and the hptsi timer). 12670 */ 12671 return; 12672 } else if (((V_tcp_always_keepalive || 12673 rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 12674 (tp->t_state <= TCPS_CLOSING)) && 12675 (tmr_up == PACE_TMR_KEEP) && 12676 (tp->snd_max == tp->snd_una)) { 12677 /* We should have keep alive up and we do */ 12678 return; 12679 } 12680 } 12681 if (SEQ_GT(tp->snd_max, tp->snd_una) && 12682 ((tmr_up == PACE_TMR_TLP) || 12683 (tmr_up == PACE_TMR_RACK) || 12684 (tmr_up == PACE_TMR_RXT))) { 12685 /* 12686 * Either a Rack, TLP or RXT is fine if we 12687 * have outstanding data. 12688 */ 12689 return; 12690 } else if (tmr_up == PACE_TMR_DELACK) { 12691 /* 12692 * If the delayed ack was going to go off 12693 * before the rtx/tlp/rack timer were going to 12694 * expire, then that would be the timer in control. 12695 * Note we don't check the time here trusting the 12696 * code is correct. 12697 */ 12698 return; 12699 } 12700 /* 12701 * Ok the timer originally started is not what we want now. 12702 * We will force the hpts to be stopped if any, and restart 12703 * with the slot set to what was in the saved slot. 
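 * If a pacer-driven output was pending (PACE_PKT_OUTPUT) and
 * its deadline still lies in the future, the unused time is
 * credited to rc_agg_early before the hpts entry is removed;
 * the old timer is then cancelled and rack_start_hpts_timer()
 * re-arms hpts with a zero slot so the proper timer gets
 * recomputed from current state.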
12704 */ 12705 if (rack->rc_inp->inp_in_hpts) { 12706 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 12707 uint32_t us_cts; 12708 12709 us_cts = tcp_get_usecs(NULL); 12710 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 12711 rack->r_early = 1; 12712 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); 12713 } 12714 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 12715 } 12716 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); 12717 } 12718 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 12719 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); 12720 } 12721 12722 12723 static void 12724 rack_do_win_updates(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tiwin, uint32_t seq, uint32_t ack, uint32_t cts, uint32_t high_seq) 12725 { 12726 tp->snd_wnd = tiwin; 12727 rack_validate_fo_sendwin_up(tp, rack); 12728 tp->snd_wl1 = seq; 12729 tp->snd_wl2 = ack; 12730 if (tp->snd_wnd > tp->max_sndwnd) 12731 tp->max_sndwnd = tp->snd_wnd; 12732 if (tp->snd_wnd < (tp->snd_max - high_seq)) { 12733 /* The peer collapsed the window */ 12734 rack_collapsed_window(rack); 12735 } else if (rack->rc_has_collapsed) 12736 rack_un_collapse_window(rack); 12737 /* Do we exit persists? */ 12738 if ((rack->rc_in_persist != 0) && 12739 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 12740 rack->r_ctl.rc_pace_min_segs))) { 12741 rack_exit_persist(tp, rack, cts); 12742 } 12743 /* Do we enter persists? */ 12744 if ((rack->rc_in_persist == 0) && 12745 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 12746 TCPS_HAVEESTABLISHED(tp->t_state) && 12747 (tp->snd_max == tp->snd_una) && 12748 sbavail(&tp->t_inpcb->inp_socket->so_snd) && 12749 (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { 12750 /* 12751 * Here the rwnd is less than 12752 * the pacing size, we are established, 12753 * nothing is outstanding, and there is 12754 * data to send. Enter persists. 
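 * The threshold used both to enter and to leave persists is
 * min(rc_high_rwnd / 2, rc_pace_min_segs); with illustrative
 * numbers, a highest-seen rwnd of 64240 and a pacing minimum of
 * 1448 give 1448 bytes, so a peer window below one segment puts
 * us into persists and growing back to at least that much takes
 * us out.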
12755 */ 12756 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 12757 } 12758 } 12759 12760 static void 12761 rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent *ae, int ackval, uint32_t high_seq) 12762 { 12763 12764 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 12765 union tcp_log_stackspecific log; 12766 struct timeval ltv; 12767 char tcp_hdr_buf[60]; 12768 struct tcphdr *th; 12769 struct timespec ts; 12770 uint32_t orig_snd_una; 12771 uint8_t xx = 0; 12772 12773 #ifdef NETFLIX_HTTP_LOGGING 12774 struct http_sendfile_track *http_req; 12775 12776 if (SEQ_GT(ae->ack, tp->snd_una)) { 12777 http_req = tcp_http_find_req_for_seq(tp, (ae->ack-1)); 12778 } else { 12779 http_req = tcp_http_find_req_for_seq(tp, ae->ack); 12780 } 12781 #endif 12782 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 12783 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 12784 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 12785 if (rack->rack_no_prr == 0) 12786 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 12787 else 12788 log.u_bbr.flex1 = 0; 12789 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 12790 log.u_bbr.use_lt_bw <<= 1; 12791 log.u_bbr.use_lt_bw |= rack->r_might_revert; 12792 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; 12793 log.u_bbr.inflight = ctf_flight_size(tp, rack->r_ctl.rc_sacked); 12794 log.u_bbr.pkts_out = tp->t_maxseg; 12795 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 12796 log.u_bbr.flex7 = 1; 12797 log.u_bbr.lost = ae->flags; 12798 log.u_bbr.cwnd_gain = ackval; 12799 log.u_bbr.pacing_gain = 0x2; 12800 if (ae->flags & TSTMP_HDWR) { 12801 /* Record the hardware timestamp if present */ 12802 log.u_bbr.flex3 = M_TSTMP; 12803 ts.tv_sec = ae->timestamp / 1000000000; 12804 ts.tv_nsec = ae->timestamp % 1000000000; 12805 ltv.tv_sec = ts.tv_sec; 12806 ltv.tv_usec = ts.tv_nsec / 1000; 12807 log.u_bbr.lt_epoch = tcp_tv_to_usectick(<v); 12808 } else if (ae->flags & TSTMP_LRO) { 12809 /* Record the LRO the arrival timestamp */ 12810 log.u_bbr.flex3 = M_TSTMP_LRO; 12811 ts.tv_sec = ae->timestamp / 1000000000; 12812 ts.tv_nsec = ae->timestamp % 1000000000; 12813 ltv.tv_sec = ts.tv_sec; 12814 ltv.tv_usec = ts.tv_nsec / 1000; 12815 log.u_bbr.flex5 = tcp_tv_to_usectick(<v); 12816 } 12817 log.u_bbr.timeStamp = tcp_get_usecs(<v); 12818 /* Log the rcv time */ 12819 log.u_bbr.delRate = ae->timestamp; 12820 #ifdef NETFLIX_HTTP_LOGGING 12821 log.u_bbr.applimited = tp->t_http_closed; 12822 log.u_bbr.applimited <<= 8; 12823 log.u_bbr.applimited |= tp->t_http_open; 12824 log.u_bbr.applimited <<= 8; 12825 log.u_bbr.applimited |= tp->t_http_req; 12826 if (http_req) { 12827 /* Copy out any client req info */ 12828 /* seconds */ 12829 log.u_bbr.pkt_epoch = (http_req->localtime / HPTS_USEC_IN_SEC); 12830 /* useconds */ 12831 log.u_bbr.delivered = (http_req->localtime % HPTS_USEC_IN_SEC); 12832 log.u_bbr.rttProp = http_req->timestamp; 12833 log.u_bbr.cur_del_rate = http_req->start; 12834 if (http_req->flags & TCP_HTTP_TRACK_FLG_OPEN) { 12835 log.u_bbr.flex8 |= 1; 12836 } else { 12837 log.u_bbr.flex8 |= 2; 12838 log.u_bbr.bw_inuse = http_req->end; 12839 } 12840 log.u_bbr.flex6 = http_req->start_seq; 12841 if (http_req->flags & TCP_HTTP_TRACK_FLG_COMP) { 12842 log.u_bbr.flex8 |= 4; 12843 log.u_bbr.epoch = http_req->end_seq; 12844 } 12845 } 12846 #endif 12847 memset(tcp_hdr_buf, 0, sizeof(tcp_hdr_buf)); 12848 th = (struct tcphdr *)tcp_hdr_buf; 12849 th->th_seq = ae->seq; 12850 th->th_ack = ae->ack; 12851 th->th_win = ae->win; 12852 /* Now fill in the ports */ 12853 th->th_sport = tp->t_inpcb->inp_fport; 12854 
th->th_dport = tp->t_inpcb->inp_lport; 12855 th->th_flags = ae->flags & 0xff; 12856 /* Now do we have a timestamp option? */ 12857 if (ae->flags & HAS_TSTMP) { 12858 u_char *cp; 12859 uint32_t val; 12860 12861 th->th_off = ((sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA) >> 2); 12862 cp = (u_char *)(th + 1); 12863 *cp = TCPOPT_NOP; 12864 cp++; 12865 *cp = TCPOPT_NOP; 12866 cp++; 12867 *cp = TCPOPT_TIMESTAMP; 12868 cp++; 12869 *cp = TCPOLEN_TIMESTAMP; 12870 cp++; 12871 val = htonl(ae->ts_value); 12872 bcopy((char *)&val, 12873 (char *)cp, sizeof(uint32_t)); 12874 val = htonl(ae->ts_echo); 12875 bcopy((char *)&val, 12876 (char *)(cp + 4), sizeof(uint32_t)); 12877 } else 12878 th->th_off = (sizeof(struct tcphdr) >> 2); 12879 12880 /* 12881 * For sane logging we need to play a little trick. 12882 * If the ack were fully processed we would have moved 12883 * snd_una to high_seq, but since compressed acks are 12884 * processed in two phases, at this point (logging) snd_una 12885 * won't be advanced. So we would see multiple acks showing 12886 * the advancement. We can prevent that by "pretending" that 12887 * snd_una was advanced and then un-advancing it so that the 12888 * logging code has the right value for tlb_snd_una. 12889 */ 12890 if (tp->snd_una != high_seq) { 12891 orig_snd_una = tp->snd_una; 12892 tp->snd_una = high_seq; 12893 xx = 1; 12894 } else 12895 xx = 0; 12896 TCP_LOG_EVENTP(tp, th, 12897 &tp->t_inpcb->inp_socket->so_rcv, 12898 &tp->t_inpcb->inp_socket->so_snd, TCP_LOG_IN, 0, 12899 0, &log, true, <v); 12900 if (xx) { 12901 tp->snd_una = orig_snd_una; 12902 } 12903 } 12904 12905 } 12906 12907 static int 12908 rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mbuf *m, int nxt_pkt, struct timeval *tv) 12909 { 12910 /* 12911 * Handle a "special" compressed ack mbuf. Each incoming 12912 * ack has only four possible dispositions: 12913 * 12914 * A) It moves the cum-ack forward 12915 * B) It is behind the cum-ack. 12916 * C) It is a window-update ack. 12917 * D) It is a dup-ack. 12918 * 12919 * Note that we can have between 1 -> TCP_COMP_ACK_ENTRIES 12920 * in the incoming mbuf. We also need to still pay attention 12921 * to nxt_pkt since there may be another packet after this 12922 * one. 12923 */ 12924 #ifdef TCP_ACCOUNTING 12925 uint64_t ts_val; 12926 uint64_t rdstc; 12927 #endif 12928 int segsiz; 12929 struct timespec ts; 12930 struct tcp_rack *rack; 12931 struct tcp_ackent *ae; 12932 uint32_t tiwin, us_cts, cts, acked, acked_amount, high_seq, win_seq, the_win, win_upd_ack; 12933 int cnt, i, did_out, ourfinisacked = 0; 12934 int win_up_req = 0; 12935 struct tcpopt to_holder, *to = NULL; 12936 int nsegs = 0; 12937 int under_pacing = 1; 12938 int recovery = 0; 12939 int idx; 12940 #ifdef TCP_ACCOUNTING 12941 sched_pin(); 12942 #endif 12943 rack = (struct tcp_rack *)tp->t_fb_ptr; 12944 if (rack->gp_ready && 12945 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) 12946 under_pacing = 0; 12947 else 12948 under_pacing = 1; 12949 12950 if (rack->r_state != tp->t_state) 12951 rack_set_state(tp, rack); 12952 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 12953 (tp->t_flags & TF_GPUTINPROG)) { 12954 /* 12955 * We have a goodput in progress 12956 * and we have entered a late state. 12957 * Do we have enough data in the sb 12958 * to handle the GPUT request? 
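 * Roughly, the measurement needs (gput_ack - gput_seq) bytes to
 * remain sendable, plus (gput_seq - snd_una) if the measured
 * range has not begun yet; when sbavail() cannot cover that the
 * goodput sample can never complete, so it is logged and
 * TF_GPUTINPROG is cleared instead of being left to linger.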
12959 */ 12960 uint32_t bytes; 12961 12962 bytes = tp->gput_ack - tp->gput_seq; 12963 if (SEQ_GT(tp->gput_seq, tp->snd_una)) 12964 bytes += tp->gput_seq - tp->snd_una; 12965 if (bytes > sbavail(&tp->t_inpcb->inp_socket->so_snd)) { 12966 /* 12967 * There are not enough bytes in the socket 12968 * buffer that have been sent to cover this 12969 * measurement. Cancel it. 12970 */ 12971 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 12972 rack->r_ctl.rc_gp_srtt /*flex1*/, 12973 tp->gput_seq, 12974 0, 0, 18, __LINE__, NULL, 0); 12975 tp->t_flags &= ~TF_GPUTINPROG; 12976 } 12977 } 12978 to = &to_holder; 12979 to->to_flags = 0; 12980 KASSERT((m->m_len >= sizeof(struct tcp_ackent)), 12981 ("tp:%p m_cmpack:%p with invalid len:%u", tp, m, m->m_len)); 12982 cnt = m->m_len / sizeof(struct tcp_ackent); 12983 idx = cnt / 5; 12984 if (idx >= MAX_NUM_OF_CNTS) 12985 idx = MAX_NUM_OF_CNTS - 1; 12986 counter_u64_add(rack_proc_comp_ack[idx], 1); 12987 counter_u64_add(rack_multi_single_eq, cnt); 12988 high_seq = tp->snd_una; 12989 the_win = tp->snd_wnd; 12990 win_seq = tp->snd_wl1; 12991 win_upd_ack = tp->snd_wl2; 12992 cts = us_cts = tcp_tv_to_usectick(tv); 12993 segsiz = ctf_fixed_maxseg(tp); 12994 if ((rack->rc_gp_dyn_mul) && 12995 (rack->use_fixed_rate == 0) && 12996 (rack->rc_always_pace)) { 12997 /* Check in on probertt */ 12998 rack_check_probe_rtt(rack, us_cts); 12999 } 13000 for (i = 0; i < cnt; i++) { 13001 #ifdef TCP_ACCOUNTING 13002 ts_val = get_cyclecount(); 13003 #endif 13004 rack_clear_rate_sample(rack); 13005 ae = ((mtod(m, struct tcp_ackent *)) + i); 13006 /* Setup the window */ 13007 tiwin = ae->win << tp->snd_scale; 13008 /* figure out the type of ack */ 13009 if (SEQ_LT(ae->ack, high_seq)) { 13010 /* Case B*/ 13011 ae->ack_val_set = ACK_BEHIND; 13012 } else if (SEQ_GT(ae->ack, high_seq)) { 13013 /* Case A */ 13014 ae->ack_val_set = ACK_CUMACK; 13015 } else if (tiwin == the_win) { 13016 /* Case D */ 13017 ae->ack_val_set = ACK_DUPACK; 13018 } else { 13019 /* Case C */ 13020 ae->ack_val_set = ACK_RWND; 13021 } 13022 rack_log_input_packet(tp, rack, ae, ae->ack_val_set, high_seq); 13023 /* Validate timestamp */ 13024 if (ae->flags & HAS_TSTMP) { 13025 /* Setup for a timestamp */ 13026 to->to_flags = TOF_TS; 13027 ae->ts_echo -= tp->ts_offset; 13028 to->to_tsecr = ae->ts_echo; 13029 to->to_tsval = ae->ts_value; 13030 /* 13031 * If echoed timestamp is later than the current time, fall back to 13032 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 13033 * were used when this connection was established. 
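 * The compressed path applies the same PAWS idea per ack entry:
 * an echoed value from the future is zeroed so it cannot poison
 * the RTT sample, and when ctf_ts_check_ac() flags an entry as
 * stale that single entry is skipped (after being charged to
 * the accounting buckets) without disturbing the rest of the
 * batch.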
13034 */ 13035 if (TSTMP_GT(ae->ts_echo, cts)) 13036 ae->ts_echo = 0; 13037 if (tp->ts_recent && 13038 TSTMP_LT(ae->ts_value, tp->ts_recent)) { 13039 if (ctf_ts_check_ac(tp, (ae->flags & 0xff))) { 13040 #ifdef TCP_ACCOUNTING 13041 rdstc = get_cyclecount(); 13042 if (rdstc > ts_val) { 13043 counter_u64_add(tcp_proc_time[ae->ack_val_set] , 13044 (rdstc - ts_val)); 13045 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13046 tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val); 13047 } 13048 } 13049 #endif 13050 continue; 13051 } 13052 } 13053 if (SEQ_LEQ(ae->seq, tp->last_ack_sent) && 13054 SEQ_LEQ(tp->last_ack_sent, ae->seq)) { 13055 tp->ts_recent_age = tcp_ts_getticks(); 13056 tp->ts_recent = ae->ts_value; 13057 } 13058 } else { 13059 /* Setup for a no options */ 13060 to->to_flags = 0; 13061 } 13062 /* Update the rcv time and perform idle reduction possibly */ 13063 if (tp->t_idle_reduce && 13064 (tp->snd_max == tp->snd_una) && 13065 ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { 13066 counter_u64_add(rack_input_idle_reduces, 1); 13067 rack_cc_after_idle(rack, tp); 13068 } 13069 tp->t_rcvtime = ticks; 13070 /* Now what about ECN? */ 13071 if (tp->t_flags2 & TF2_ECN_PERMIT) { 13072 if (ae->flags & TH_CWR) { 13073 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 13074 tp->t_flags |= TF_ACKNOW; 13075 } 13076 switch (ae->codepoint & IPTOS_ECN_MASK) { 13077 case IPTOS_ECN_CE: 13078 tp->t_flags2 |= TF2_ECN_SND_ECE; 13079 KMOD_TCPSTAT_INC(tcps_ecn_ce); 13080 break; 13081 case IPTOS_ECN_ECT0: 13082 KMOD_TCPSTAT_INC(tcps_ecn_ect0); 13083 break; 13084 case IPTOS_ECN_ECT1: 13085 KMOD_TCPSTAT_INC(tcps_ecn_ect1); 13086 break; 13087 } 13088 13089 /* Process a packet differently from RFC3168. */ 13090 cc_ecnpkt_handler_flags(tp, ae->flags, ae->codepoint); 13091 /* Congestion experienced. */ 13092 if (ae->flags & TH_ECE) { 13093 rack_cong_signal(tp, CC_ECN, ae->ack); 13094 } 13095 } 13096 #ifdef TCP_ACCOUNTING 13097 /* Count for the specific type of ack in */ 13098 counter_u64_add(tcp_cnt_counters[ae->ack_val_set], 1); 13099 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13100 tp->tcp_cnt_counters[ae->ack_val_set]++; 13101 } 13102 #endif 13103 /* 13104 * Note how we could move up these in the determination 13105 * above, but we don't so that way the timestamp checks (and ECN) 13106 * is done first before we do any processing on the ACK. 13107 * The non-compressed path through the code has this 13108 * weakness (noted by @jtl) that it actually does some 13109 * processing before verifying the timestamp information. 13110 * We don't take that path here which is why we set 13111 * the ack_val_set first, do the timestamp and ecn 13112 * processing, and then look at what we have setup. 13113 */ 13114 if (ae->ack_val_set == ACK_BEHIND) { 13115 /* 13116 * Case B flag reordering, if window is not closed 13117 * or it could be a keep-alive or persists 13118 */ 13119 if (SEQ_LT(ae->ack, tp->snd_una) && (sbspace(&so->so_rcv) > segsiz)) { 13120 counter_u64_add(rack_reorder_seen, 1); 13121 rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 13122 } 13123 } else if (ae->ack_val_set == ACK_DUPACK) { 13124 /* Case D */ 13125 13126 rack_strike_dupack(rack); 13127 } else if (ae->ack_val_set == ACK_RWND) { 13128 /* Case C */ 13129 13130 win_up_req = 1; 13131 win_upd_ack = ae->ack; 13132 win_seq = ae->seq; 13133 the_win = tiwin; 13134 } else { 13135 /* Case A */ 13136 13137 if (SEQ_GT(ae->ack, tp->snd_max)) { 13138 /* 13139 * We just send an ack since the incoming 13140 * ack is beyond the largest seq we sent. 
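 * (The war checks below rate limit how often we do even that.)
 * Acks that land inside the window instead advance high_seq,
 * refresh act_rcv_time from the LRO or hardware timestamp when
 * one accompanied the entry, and let rack_process_to_cumack()
 * retire the sendmap entries the cum-ack now covers.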
13141 */ 13142 if ((tp->t_flags & TF_ACKNOW) == 0) { 13143 ctf_ack_war_checks(tp, &rack->r_ctl.challenge_ack_ts, &rack->r_ctl.challenge_ack_cnt); 13144 if (tp->t_flags && TF_ACKNOW) 13145 rack->r_wanted_output = 1; 13146 } 13147 } else { 13148 nsegs++; 13149 /* If the window changed setup to update */ 13150 if (tiwin != tp->snd_wnd) { 13151 win_up_req = 1; 13152 win_upd_ack = ae->ack; 13153 win_seq = ae->seq; 13154 the_win = tiwin; 13155 } 13156 #ifdef TCP_ACCOUNTING 13157 /* Account for the acks */ 13158 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13159 tp->tcp_cnt_counters[CNT_OF_ACKS_IN] += (((ae->ack - high_seq) + segsiz - 1) / segsiz); 13160 } 13161 counter_u64_add(tcp_cnt_counters[CNT_OF_ACKS_IN], 13162 (((ae->ack - high_seq) + segsiz - 1) / segsiz)); 13163 #endif 13164 high_seq = ae->ack; 13165 /* Setup our act_rcv_time */ 13166 if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) { 13167 ts.tv_sec = ae->timestamp / 1000000000; 13168 ts.tv_nsec = ae->timestamp % 1000000000; 13169 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; 13170 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; 13171 } else { 13172 rack->r_ctl.act_rcv_time = *tv; 13173 } 13174 rack_process_to_cumack(tp, rack, ae->ack, cts, to); 13175 } 13176 } 13177 /* And lets be sure to commit the rtt measurements for this ack */ 13178 tcp_rack_xmit_timer_commit(rack, tp); 13179 #ifdef TCP_ACCOUNTING 13180 rdstc = get_cyclecount(); 13181 if (rdstc > ts_val) { 13182 counter_u64_add(tcp_proc_time[ae->ack_val_set] , (rdstc - ts_val)); 13183 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13184 tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val); 13185 if (ae->ack_val_set == ACK_CUMACK) 13186 tp->tcp_proc_time[CYC_HANDLE_MAP] += (rdstc - ts_val); 13187 } 13188 } 13189 #endif 13190 } 13191 #ifdef TCP_ACCOUNTING 13192 ts_val = get_cyclecount(); 13193 #endif 13194 acked_amount = acked = (high_seq - tp->snd_una); 13195 if (win_up_req) { 13196 rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts, high_seq); 13197 } 13198 if (acked) { 13199 if (rack->sack_attack_disable == 0) 13200 rack_do_decay(rack); 13201 if (acked >= segsiz) { 13202 /* 13203 * You only get credit for 13204 * MSS and greater (and you get extra 13205 * credit for larger cum-ack moves). 13206 */ 13207 int ac; 13208 13209 ac = acked / segsiz; 13210 rack->r_ctl.ack_count += ac; 13211 counter_u64_add(rack_ack_total, ac); 13212 } 13213 if (rack->r_ctl.ack_count > 0xfff00000) { 13214 /* 13215 * reduce the number to keep us under 13216 * a uint32_t. 13217 */ 13218 rack->r_ctl.ack_count /= 2; 13219 rack->r_ctl.sack_count /= 2; 13220 } 13221 if (tp->t_flags & TF_NEEDSYN) { 13222 /* 13223 * T/TCP: Connection was half-synchronized, and our SYN has 13224 * been ACK'd (so connection is now fully synchronized). Go 13225 * to non-starred state, increment snd_una for ACK of SYN, 13226 * and check if we can do window scaling. 13227 */ 13228 tp->t_flags &= ~TF_NEEDSYN; 13229 tp->snd_una++; 13230 acked_amount = acked = (high_seq - tp->snd_una); 13231 } 13232 if (acked > sbavail(&so->so_snd)) 13233 acked_amount = sbavail(&so->so_snd); 13234 #ifdef NETFLIX_EXP_DETECTION 13235 /* 13236 * We only care on a cum-ack move if we are in a sack-disabled 13237 * state. We have already added in to the ack_count, and we never 13238 * would disable on a cum-ack move, so we only care to do the 13239 * detection if it may "undo" it, i.e. we were in disabled already. 
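 * Once classified, the cum-ack is applied to recovery state
 * below: PRR is fed the newly acked bytes while fast recovery
 * is active, a cum-ack short of snd_recover is treated as a
 * partial ack, and reaching or passing it finishes recovery via
 * rack_post_recovery().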
13240 */ 13241 if (rack->sack_attack_disable) 13242 rack_do_detection(tp, rack, acked_amount, segsiz); 13243 #endif 13244 if (IN_FASTRECOVERY(tp->t_flags) && 13245 (rack->rack_no_prr == 0)) 13246 rack_update_prr(tp, rack, acked_amount, high_seq); 13247 if (IN_RECOVERY(tp->t_flags)) { 13248 if (SEQ_LT(high_seq, tp->snd_recover) && 13249 (SEQ_LT(high_seq, tp->snd_max))) { 13250 tcp_rack_partialack(tp); 13251 } else { 13252 rack_post_recovery(tp, high_seq); 13253 recovery = 1; 13254 } 13255 } 13256 /* Handle the rack-log-ack part (sendmap) */ 13257 if ((sbused(&so->so_snd) == 0) && 13258 (acked > acked_amount) && 13259 (tp->t_state >= TCPS_FIN_WAIT_1) && 13260 (tp->t_flags & TF_SENTFIN)) { 13261 /* 13262 * We must be sure our fin 13263 * was sent and acked (we can be 13264 * in FIN_WAIT_1 without having 13265 * sent the fin). 13266 */ 13267 ourfinisacked = 1; 13268 /* 13269 * Lets make sure snd_una is updated 13270 * since most likely acked_amount = 0 (it 13271 * should be). 13272 */ 13273 tp->snd_una = high_seq; 13274 } 13275 /* Did we make a RTO error? */ 13276 if ((tp->t_flags & TF_PREVVALID) && 13277 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) { 13278 tp->t_flags &= ~TF_PREVVALID; 13279 if (tp->t_rxtshift == 1 && 13280 (int)(ticks - tp->t_badrxtwin) < 0) 13281 rack_cong_signal(tp, CC_RTO_ERR, high_seq); 13282 } 13283 /* Handle the data in the socket buffer */ 13284 KMOD_TCPSTAT_ADD(tcps_rcvackpack, 1); 13285 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 13286 if (acked_amount > 0) { 13287 struct mbuf *mfree; 13288 13289 rack_ack_received(tp, rack, high_seq, nsegs, CC_ACK, recovery); 13290 SOCKBUF_LOCK(&so->so_snd); 13291 mfree = sbcut_locked(&so->so_snd, acked); 13292 tp->snd_una = high_seq; 13293 /* Note we want to hold the sb lock through the sendmap adjust */ 13294 rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una); 13295 /* Wake up the socket if we have room to write more */ 13296 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); 13297 sowwakeup_locked(so); 13298 m_freem(mfree); 13299 } 13300 /* update progress */ 13301 tp->t_acktime = ticks; 13302 rack_log_progress_event(rack, tp, tp->t_acktime, 13303 PROGRESS_UPDATE, __LINE__); 13304 /* Clear out shifts and such */ 13305 tp->t_rxtshift = 0; 13306 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 13307 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 13308 rack->rc_tlp_in_progress = 0; 13309 rack->r_ctl.rc_tlp_cnt_out = 0; 13310 /* Send recover and snd_nxt must be dragged along */ 13311 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 13312 tp->snd_recover = tp->snd_una; 13313 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 13314 tp->snd_nxt = tp->snd_una; 13315 /* 13316 * If the RXT timer is running we want to 13317 * stop it, so we can restart a TLP (or new RXT). 
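 * Cancelling here only clears the running timer; the
 * appropriate replacement timer is armed again by the
 * rack_timer_audit() call near the end of this function,
 * once we know what is still outstanding.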
13318 */ 13319 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 13320 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 13321 #ifdef NETFLIX_HTTP_LOGGING 13322 tcp_http_check_for_comp(rack->rc_tp, high_seq); 13323 #endif 13324 tp->snd_wl2 = high_seq; 13325 tp->t_dupacks = 0; 13326 if (under_pacing && 13327 (rack->use_fixed_rate == 0) && 13328 (rack->in_probe_rtt == 0) && 13329 rack->rc_gp_dyn_mul && 13330 rack->rc_always_pace) { 13331 /* Check if we are dragging bottom */ 13332 rack_check_bottom_drag(tp, rack, so, acked); 13333 } 13334 if (tp->snd_una == tp->snd_max) { 13335 tp->t_flags &= ~TF_PREVVALID; 13336 rack->r_ctl.retran_during_recovery = 0; 13337 rack->r_ctl.dsack_byte_cnt = 0; 13338 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 13339 if (rack->r_ctl.rc_went_idle_time == 0) 13340 rack->r_ctl.rc_went_idle_time = 1; 13341 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 13342 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) 13343 tp->t_acktime = 0; 13344 /* Set so we might enter persists... */ 13345 rack->r_wanted_output = 1; 13346 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 13347 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 13348 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 13349 (sbavail(&so->so_snd) == 0) && 13350 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 13351 /* 13352 * The socket was gone and the 13353 * peer sent data (not now in the past), time to 13354 * reset him. 13355 */ 13356 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 13357 /* tcp_close will kill the inp pre-log the Reset */ 13358 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 13359 #ifdef TCP_ACCOUNTING 13360 rdstc = get_cyclecount(); 13361 if (rdstc > ts_val) { 13362 counter_u64_add(tcp_proc_time[ACK_CUMACK] , (rdstc - ts_val)); 13363 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13364 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 13365 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 13366 } 13367 } 13368 #endif 13369 m_freem(m); 13370 tp = tcp_close(tp); 13371 if (tp == NULL) { 13372 #ifdef TCP_ACCOUNTING 13373 sched_unpin(); 13374 #endif 13375 return (1); 13376 } 13377 /* 13378 * We would normally do drop-with-reset which would 13379 * send back a reset. We can't since we don't have 13380 * all the needed bits. Instead lets arrange for 13381 * a call to tcp_output(). That way since we 13382 * are in the closed state we will generate a reset. 13383 * 13384 * Note if tcp_accounting is on we don't unpin since 13385 * we do that after the goto label. 13386 */ 13387 goto send_out_a_rst; 13388 } 13389 if ((sbused(&so->so_snd) == 0) && 13390 (tp->t_state >= TCPS_FIN_WAIT_1) && 13391 (tp->t_flags & TF_SENTFIN)) { 13392 /* 13393 * If we can't receive any more data, then closing user can 13394 * proceed. Starting the timer is contrary to the 13395 * specification, but if we don't get a FIN we'll hang 13396 * forever. 13397 * 13398 */ 13399 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 13400 soisdisconnected(so); 13401 tcp_timer_activate(tp, TT_2MSL, 13402 (tcp_fast_finwait2_recycle ? 13403 tcp_finwait2_timeout : 13404 TP_MAXIDLE(tp))); 13405 } 13406 if (ourfinisacked == 0) { 13407 /* 13408 * We don't change to fin-wait-2 if we have our fin acked 13409 * which means we are probably in TCPS_CLOSING. 
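 * If the FIN is already acked the state transition is
 * handled by the ourfinisacked switch below
 * (CLOSING/LAST_ACK/FIN_WAIT_1), so only the unacked
 * case falls through to FIN_WAIT_2 here.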
13410 */ 13411 tcp_state_change(tp, TCPS_FIN_WAIT_2); 13412 } 13413 } 13414 } 13415 /* Wake up the socket if we have room to write more */ 13416 if (sbavail(&so->so_snd)) { 13417 rack->r_wanted_output = 1; 13418 if (ctf_progress_timeout_check(tp, true)) { 13419 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 13420 tp, tick, PROGRESS_DROP, __LINE__); 13421 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 13422 /* 13423 * We cheat here and don't send a RST, we should send one 13424 * when the pacer drops the connection. 13425 */ 13426 #ifdef TCP_ACCOUNTING 13427 rdstc = get_cyclecount(); 13428 if (rdstc > ts_val) { 13429 counter_u64_add(tcp_proc_time[ACK_CUMACK] , (rdstc - ts_val)); 13430 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13431 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 13432 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 13433 } 13434 } 13435 sched_unpin(); 13436 #endif 13437 INP_WUNLOCK(rack->rc_inp); 13438 m_freem(m); 13439 return (1); 13440 } 13441 } 13442 if (ourfinisacked) { 13443 switch(tp->t_state) { 13444 case TCPS_CLOSING: 13445 #ifdef TCP_ACCOUNTING 13446 rdstc = get_cyclecount(); 13447 if (rdstc > ts_val) { 13448 counter_u64_add(tcp_proc_time[ACK_CUMACK] , 13449 (rdstc - ts_val)); 13450 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13451 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 13452 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 13453 } 13454 } 13455 sched_unpin(); 13456 #endif 13457 tcp_twstart(tp); 13458 m_freem(m); 13459 return (1); 13460 break; 13461 case TCPS_LAST_ACK: 13462 #ifdef TCP_ACCOUNTING 13463 rdstc = get_cyclecount(); 13464 if (rdstc > ts_val) { 13465 counter_u64_add(tcp_proc_time[ACK_CUMACK] , 13466 (rdstc - ts_val)); 13467 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13468 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 13469 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 13470 } 13471 } 13472 sched_unpin(); 13473 #endif 13474 tp = tcp_close(tp); 13475 ctf_do_drop(m, tp); 13476 return (1); 13477 break; 13478 case TCPS_FIN_WAIT_1: 13479 #ifdef TCP_ACCOUNTING 13480 rdstc = get_cyclecount(); 13481 if (rdstc > ts_val) { 13482 counter_u64_add(tcp_proc_time[ACK_CUMACK] , 13483 (rdstc - ts_val)); 13484 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13485 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 13486 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 13487 } 13488 } 13489 #endif 13490 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 13491 soisdisconnected(so); 13492 tcp_timer_activate(tp, TT_2MSL, 13493 (tcp_fast_finwait2_recycle ? 13494 tcp_finwait2_timeout : 13495 TP_MAXIDLE(tp))); 13496 } 13497 tcp_state_change(tp, TCPS_FIN_WAIT_2); 13498 break; 13499 default: 13500 break; 13501 } 13502 } 13503 if (rack->r_fast_output) { 13504 /* 13505 * We re doing fast output.. can we expand that? 
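 * The cum-ack just freed acked_amount bytes of window and
 * socket-buffer space, so rack_gain_for_fastoutput() gets
 * a chance to extend the prebuilt fast-send block with
 * that newly opened space.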
13506 */ 13507 rack_gain_for_fastoutput(rack, tp, so, acked_amount); 13508 } 13509 #ifdef TCP_ACCOUNTING 13510 rdstc = get_cyclecount(); 13511 if (rdstc > ts_val) { 13512 counter_u64_add(tcp_proc_time[ACK_CUMACK] , (rdstc - ts_val)); 13513 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13514 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 13515 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 13516 } 13517 } 13518 13519 } else if (win_up_req) { 13520 rdstc = get_cyclecount(); 13521 if (rdstc > ts_val) { 13522 counter_u64_add(tcp_proc_time[ACK_RWND] , (rdstc - ts_val)); 13523 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13524 tp->tcp_proc_time[ACK_RWND] += (rdstc - ts_val); 13525 } 13526 } 13527 #endif 13528 } 13529 /* Now is there a next packet, if so we are done */ 13530 m_freem(m); 13531 did_out = 0; 13532 if (nxt_pkt) { 13533 #ifdef TCP_ACCOUNTING 13534 sched_unpin(); 13535 #endif 13536 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 5, nsegs); 13537 return (0); 13538 } 13539 rack_handle_might_revert(tp, rack); 13540 ctf_calc_rwin(so, tp); 13541 if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) { 13542 send_out_a_rst: 13543 (void)tp->t_fb->tfb_tcp_output(tp); 13544 did_out = 1; 13545 } 13546 rack_free_trim(rack); 13547 #ifdef TCP_ACCOUNTING 13548 sched_unpin(); 13549 #endif 13550 rack_timer_audit(tp, rack, &so->so_snd); 13551 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 6, nsegs); 13552 return (0); 13553 } 13554 13555 13556 static int 13557 rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, 13558 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, 13559 int32_t nxt_pkt, struct timeval *tv) 13560 { 13561 #ifdef TCP_ACCOUNTING 13562 uint64_t ts_val; 13563 #endif 13564 int32_t thflags, retval, did_out = 0; 13565 int32_t way_out = 0; 13566 uint32_t cts; 13567 uint32_t tiwin; 13568 struct timespec ts; 13569 struct tcpopt to; 13570 struct tcp_rack *rack; 13571 struct rack_sendmap *rsm; 13572 int32_t prev_state = 0; 13573 #ifdef TCP_ACCOUNTING 13574 int ack_val_set = 0xf; 13575 #endif 13576 int nsegs; 13577 uint32_t us_cts; 13578 /* 13579 * tv passed from common code is from either M_TSTMP_LRO or 13580 * tcp_get_usecs() if no LRO m_pkthdr timestamp is present. 13581 */ 13582 rack = (struct tcp_rack *)tp->t_fb_ptr; 13583 cts = tcp_tv_to_usectick(tv); 13584 if (m->m_flags & M_ACKCMP) { 13585 return (rack_do_compressed_ack_processing(tp, so, m, nxt_pkt, tv)); 13586 } 13587 if (m->m_flags & M_ACKCMP) { 13588 panic("Impossible reach m has ackcmp? m:%p tp:%p", m, tp); 13589 } 13590 nsegs = m->m_pkthdr.lro_nsegs; 13591 counter_u64_add(rack_proc_non_comp_ack, 1); 13592 thflags = th->th_flags; 13593 #ifdef TCP_ACCOUNTING 13594 sched_pin(); 13595 if (thflags & TH_ACK) 13596 ts_val = get_cyclecount(); 13597 #endif 13598 if ((m->m_flags & M_TSTMP) || 13599 (m->m_flags & M_TSTMP_LRO)) { 13600 mbuf_tstmp2timespec(m, &ts); 13601 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; 13602 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; 13603 } else 13604 rack->r_ctl.act_rcv_time = *tv; 13605 kern_prefetch(rack, &prev_state); 13606 prev_state = 0; 13607 /* 13608 * Unscale the window into a 32-bit value. For the SYN_SENT state 13609 * the scale is zero. 13610 */ 13611 tiwin = th->th_win << tp->snd_scale; 13612 /* 13613 * Parse options on any incoming segment. 13614 */ 13615 memset(&to, 0, sizeof(to)); 13616 tcp_dooptions(&to, (u_char *)(th + 1), 13617 (th->th_off << 2) - sizeof(struct tcphdr), 13618 (thflags & TH_SYN) ? 
        TO_SYN : 0);
#ifdef TCP_ACCOUNTING
    if (thflags & TH_ACK) {
        /*
         * We have a tradeoff here. We can either do what we are
         * doing i.e. pinning to this CPU and then doing the accounting
         * <or> we could do a critical enter, setup the rdtsc and cpu
         * as in below, and then validate we are on the same CPU on
         * exit. I have chosen not to do the critical enter since
         * that often will gain you a context switch, and instead lock
         * us (line above this if) to the same CPU with sched_pin(). This
         * means we may be context switched out for a higher priority
         * interrupt but we won't be moved to another CPU.
         *
         * If this occurs (which it won't very often since we most likely
         * are running this code in interrupt context and only a higher
         * priority will bump us ... clock?) we will falsely add the
         * interrupt processing time into the ack processing time.
         * This is ok since it's a rare event.
         */
        ack_val_set = tcp_do_ack_accounting(tp, th, &to, tiwin,
            ctf_fixed_maxseg(tp));
    }
#endif
    NET_EPOCH_ASSERT();
    INP_WLOCK_ASSERT(tp->t_inpcb);
    KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
        __func__));
    KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
        __func__));
    if ((tp->t_state >= TCPS_FIN_WAIT_1) &&
        (tp->t_flags & TF_GPUTINPROG)) {
        /*
         * We have a goodput measurement in progress
         * and we have entered a late state.
         * Do we have enough data in the sb
         * to handle the GPUT request?
         */
        uint32_t bytes;

        bytes = tp->gput_ack - tp->gput_seq;
        if (SEQ_GT(tp->gput_seq, tp->snd_una))
            bytes += tp->gput_seq - tp->snd_una;
        if (bytes > sbavail(&tp->t_inpcb->inp_socket->so_snd)) {
            /*
             * There are not enough bytes in the socket
             * buffer that have been sent to cover this
             * measurement. Cancel it.
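             * Illustrative numbers: with gput_seq = 1000,
             * gput_ack = 21000 and snd_una = 500 we still need
             * 20000 + 500 = 20500 bytes, so if sbavail() is only
             * 16000 the measurement can never complete and is
             * cancelled here.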
13666 */ 13667 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 13668 rack->r_ctl.rc_gp_srtt /*flex1*/, 13669 tp->gput_seq, 13670 0, 0, 18, __LINE__, NULL, 0); 13671 tp->t_flags &= ~TF_GPUTINPROG; 13672 } 13673 } 13674 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 13675 union tcp_log_stackspecific log; 13676 struct timeval ltv; 13677 #ifdef NETFLIX_HTTP_LOGGING 13678 struct http_sendfile_track *http_req; 13679 13680 if (SEQ_GT(th->th_ack, tp->snd_una)) { 13681 http_req = tcp_http_find_req_for_seq(tp, (th->th_ack-1)); 13682 } else { 13683 http_req = tcp_http_find_req_for_seq(tp, th->th_ack); 13684 } 13685 #endif 13686 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 13687 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 13688 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 13689 if (rack->rack_no_prr == 0) 13690 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 13691 else 13692 log.u_bbr.flex1 = 0; 13693 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 13694 log.u_bbr.use_lt_bw <<= 1; 13695 log.u_bbr.use_lt_bw |= rack->r_might_revert; 13696 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; 13697 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 13698 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; 13699 log.u_bbr.flex3 = m->m_flags; 13700 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 13701 log.u_bbr.lost = thflags; 13702 log.u_bbr.pacing_gain = 0x1; 13703 #ifdef TCP_ACCOUNTING 13704 log.u_bbr.cwnd_gain = ack_val_set; 13705 #endif 13706 log.u_bbr.flex7 = 2; 13707 if (m->m_flags & M_TSTMP) { 13708 /* Record the hardware timestamp if present */ 13709 mbuf_tstmp2timespec(m, &ts); 13710 ltv.tv_sec = ts.tv_sec; 13711 ltv.tv_usec = ts.tv_nsec / 1000; 13712 log.u_bbr.lt_epoch = tcp_tv_to_usectick(<v); 13713 } else if (m->m_flags & M_TSTMP_LRO) { 13714 /* Record the LRO the arrival timestamp */ 13715 mbuf_tstmp2timespec(m, &ts); 13716 ltv.tv_sec = ts.tv_sec; 13717 ltv.tv_usec = ts.tv_nsec / 1000; 13718 log.u_bbr.flex5 = tcp_tv_to_usectick(<v); 13719 } 13720 log.u_bbr.timeStamp = tcp_get_usecs(<v); 13721 /* Log the rcv time */ 13722 log.u_bbr.delRate = m->m_pkthdr.rcv_tstmp; 13723 #ifdef NETFLIX_HTTP_LOGGING 13724 log.u_bbr.applimited = tp->t_http_closed; 13725 log.u_bbr.applimited <<= 8; 13726 log.u_bbr.applimited |= tp->t_http_open; 13727 log.u_bbr.applimited <<= 8; 13728 log.u_bbr.applimited |= tp->t_http_req; 13729 if (http_req) { 13730 /* Copy out any client req info */ 13731 /* seconds */ 13732 log.u_bbr.pkt_epoch = (http_req->localtime / HPTS_USEC_IN_SEC); 13733 /* useconds */ 13734 log.u_bbr.delivered = (http_req->localtime % HPTS_USEC_IN_SEC); 13735 log.u_bbr.rttProp = http_req->timestamp; 13736 log.u_bbr.cur_del_rate = http_req->start; 13737 if (http_req->flags & TCP_HTTP_TRACK_FLG_OPEN) { 13738 log.u_bbr.flex8 |= 1; 13739 } else { 13740 log.u_bbr.flex8 |= 2; 13741 log.u_bbr.bw_inuse = http_req->end; 13742 } 13743 log.u_bbr.flex6 = http_req->start_seq; 13744 if (http_req->flags & TCP_HTTP_TRACK_FLG_COMP) { 13745 log.u_bbr.flex8 |= 4; 13746 log.u_bbr.epoch = http_req->end_seq; 13747 } 13748 } 13749 #endif 13750 TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, 13751 tlen, &log, true, <v); 13752 } 13753 if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { 13754 way_out = 4; 13755 retval = 0; 13756 m_freem(m); 13757 goto done_with_input; 13758 } 13759 /* 13760 * If a segment with the ACK-bit set arrives in the SYN-SENT state 13761 * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9. 
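 * That is, the ACK is acceptable only when
 * ISS < SEG.ACK <= SND.MAX; anything outside that range
 * is answered with a reset below.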
13762 */ 13763 if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && 13764 (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { 13765 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 13766 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 13767 #ifdef TCP_ACCOUNTING 13768 sched_unpin(); 13769 #endif 13770 return (1); 13771 } 13772 13773 /* 13774 * Parse options on any incoming segment. 13775 */ 13776 tcp_dooptions(&to, (u_char *)(th + 1), 13777 (th->th_off << 2) - sizeof(struct tcphdr), 13778 (thflags & TH_SYN) ? TO_SYN : 0); 13779 13780 /* 13781 * If timestamps were negotiated during SYN/ACK and a 13782 * segment without a timestamp is received, silently drop 13783 * the segment, unless it is a RST segment or missing timestamps are 13784 * tolerated. 13785 * See section 3.2 of RFC 7323. 13786 */ 13787 if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS) && 13788 ((thflags & TH_RST) == 0) && (V_tcp_tolerate_missing_ts == 0)) { 13789 way_out = 5; 13790 retval = 0; 13791 m_freem(m); 13792 goto done_with_input; 13793 } 13794 13795 /* 13796 * Segment received on connection. Reset idle time and keep-alive 13797 * timer. XXX: This should be done after segment validation to 13798 * ignore broken/spoofed segs. 13799 */ 13800 if (tp->t_idle_reduce && 13801 (tp->snd_max == tp->snd_una) && 13802 ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { 13803 counter_u64_add(rack_input_idle_reduces, 1); 13804 rack_cc_after_idle(rack, tp); 13805 } 13806 tp->t_rcvtime = ticks; 13807 #ifdef STATS 13808 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); 13809 #endif 13810 if (tiwin > rack->r_ctl.rc_high_rwnd) 13811 rack->r_ctl.rc_high_rwnd = tiwin; 13812 /* 13813 * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move 13814 * this to occur after we've validated the segment. 13815 */ 13816 if (tp->t_flags2 & TF2_ECN_PERMIT) { 13817 if (thflags & TH_CWR) { 13818 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 13819 tp->t_flags |= TF_ACKNOW; 13820 } 13821 switch (iptos & IPTOS_ECN_MASK) { 13822 case IPTOS_ECN_CE: 13823 tp->t_flags2 |= TF2_ECN_SND_ECE; 13824 KMOD_TCPSTAT_INC(tcps_ecn_ce); 13825 break; 13826 case IPTOS_ECN_ECT0: 13827 KMOD_TCPSTAT_INC(tcps_ecn_ect0); 13828 break; 13829 case IPTOS_ECN_ECT1: 13830 KMOD_TCPSTAT_INC(tcps_ecn_ect1); 13831 break; 13832 } 13833 13834 /* Process a packet differently from RFC3168. */ 13835 cc_ecnpkt_handler(tp, th, iptos); 13836 13837 /* Congestion experienced. */ 13838 if (thflags & TH_ECE) { 13839 rack_cong_signal(tp, CC_ECN, th->th_ack); 13840 } 13841 } 13842 13843 /* 13844 * If echoed timestamp is later than the current time, fall back to 13845 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 13846 * were used when this connection was established. 13847 */ 13848 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 13849 to.to_tsecr -= tp->ts_offset; 13850 if (TSTMP_GT(to.to_tsecr, cts)) 13851 to.to_tsecr = 0; 13852 } 13853 13854 /* 13855 * If its the first time in we need to take care of options and 13856 * verify we can do SACK for rack! 13857 */ 13858 if (rack->r_state == 0) { 13859 /* Should be init'd by rack_init() */ 13860 KASSERT(rack->rc_inp != NULL, 13861 ("%s: rack->rc_inp unexpectedly NULL", __func__)); 13862 if (rack->rc_inp == NULL) { 13863 rack->rc_inp = tp->t_inpcb; 13864 } 13865 13866 /* 13867 * Process options only when we get SYN/ACK back. The SYN 13868 * case for incoming connections is handled in tcp_syncache. 
13869 * According to RFC1323 the window field in a SYN (i.e., a 13870 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX 13871 * this is traditional behavior, may need to be cleaned up. 13872 */ 13873 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 13874 /* Handle parallel SYN for ECN */ 13875 if (!(thflags & TH_ACK) && 13876 ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) && 13877 ((V_tcp_do_ecn == 1) || (V_tcp_do_ecn == 2))) { 13878 tp->t_flags2 |= TF2_ECN_PERMIT; 13879 tp->t_flags2 |= TF2_ECN_SND_ECE; 13880 TCPSTAT_INC(tcps_ecn_shs); 13881 } 13882 if ((to.to_flags & TOF_SCALE) && 13883 (tp->t_flags & TF_REQ_SCALE)) { 13884 tp->t_flags |= TF_RCVD_SCALE; 13885 tp->snd_scale = to.to_wscale; 13886 } else 13887 tp->t_flags &= ~TF_REQ_SCALE; 13888 /* 13889 * Initial send window. It will be updated with the 13890 * next incoming segment to the scaled value. 13891 */ 13892 tp->snd_wnd = th->th_win; 13893 rack_validate_fo_sendwin_up(tp, rack); 13894 if ((to.to_flags & TOF_TS) && 13895 (tp->t_flags & TF_REQ_TSTMP)) { 13896 tp->t_flags |= TF_RCVD_TSTMP; 13897 tp->ts_recent = to.to_tsval; 13898 tp->ts_recent_age = cts; 13899 } else 13900 tp->t_flags &= ~TF_REQ_TSTMP; 13901 if (to.to_flags & TOF_MSS) { 13902 tcp_mss(tp, to.to_mss); 13903 } 13904 if ((tp->t_flags & TF_SACK_PERMIT) && 13905 (to.to_flags & TOF_SACKPERM) == 0) 13906 tp->t_flags &= ~TF_SACK_PERMIT; 13907 if (IS_FASTOPEN(tp->t_flags)) { 13908 if (to.to_flags & TOF_FASTOPEN) { 13909 uint16_t mss; 13910 13911 if (to.to_flags & TOF_MSS) 13912 mss = to.to_mss; 13913 else 13914 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 13915 mss = TCP6_MSS; 13916 else 13917 mss = TCP_MSS; 13918 tcp_fastopen_update_cache(tp, mss, 13919 to.to_tfo_len, to.to_tfo_cookie); 13920 } else 13921 tcp_fastopen_disable_path(tp); 13922 } 13923 } 13924 /* 13925 * At this point we are at the initial call. Here we decide 13926 * if we are doing RACK or not. We do this by seeing if 13927 * TF_SACK_PERMIT is set and the sack-not-required is clear. 13928 * The code now does do dup-ack counting so if you don't 13929 * switch back you won't get rack & TLP, but you will still 13930 * get this stack. 13931 */ 13932 13933 if ((rack_sack_not_required == 0) && 13934 ((tp->t_flags & TF_SACK_PERMIT) == 0)) { 13935 tcp_switch_back_to_default(tp); 13936 (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, 13937 tlen, iptos); 13938 #ifdef TCP_ACCOUNTING 13939 sched_unpin(); 13940 #endif 13941 return (1); 13942 } 13943 tcp_set_hpts(tp->t_inpcb); 13944 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); 13945 } 13946 if (thflags & TH_FIN) 13947 tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN); 13948 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 13949 if ((rack->rc_gp_dyn_mul) && 13950 (rack->use_fixed_rate == 0) && 13951 (rack->rc_always_pace)) { 13952 /* Check in on probertt */ 13953 rack_check_probe_rtt(rack, us_cts); 13954 } 13955 rack_clear_rate_sample(rack); 13956 if (rack->forced_ack) { 13957 uint32_t us_rtt; 13958 13959 /* 13960 * A persist or keep-alive was forced out, update our 13961 * min rtt time. Note we do not worry about lost 13962 * retransmissions since KEEP-ALIVES and persists 13963 * are usually way long on times of sending (though 13964 * if we were really paranoid or worried we could 13965 * at least use timestamps if available to validate). 
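 * The sample is simply now minus forced_ack_ts and is
 * only fed to the usrtt filter and the xmit-timer below;
 * a lost probe would merely inflate one sample, which a
 * minimum-style filter should tolerate.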
13966 */ 13967 rack->forced_ack = 0; 13968 us_rtt = us_cts - rack->r_ctl.forced_ack_ts; 13969 if (us_rtt == 0) 13970 us_rtt = 1; 13971 rack_apply_updated_usrtt(rack, us_rtt, us_cts); 13972 tcp_rack_xmit_timer(rack, us_rtt, 0, us_rtt, 3, NULL, 1); 13973 } 13974 /* 13975 * This is the one exception case where we set the rack state 13976 * always. All other times (timers etc) we must have a rack-state 13977 * set (so we assure we have done the checks above for SACK). 13978 */ 13979 rack->r_ctl.rc_rcvtime = cts; 13980 if (rack->r_state != tp->t_state) 13981 rack_set_state(tp, rack); 13982 if (SEQ_GT(th->th_ack, tp->snd_una) && 13983 (rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree)) != NULL) 13984 kern_prefetch(rsm, &prev_state); 13985 prev_state = rack->r_state; 13986 retval = (*rack->r_substate) (m, th, so, 13987 tp, &to, drop_hdrlen, 13988 tlen, tiwin, thflags, nxt_pkt, iptos); 13989 #ifdef INVARIANTS 13990 if ((retval == 0) && 13991 (tp->t_inpcb == NULL)) { 13992 panic("retval:%d tp:%p t_inpcb:NULL state:%d", 13993 retval, tp, prev_state); 13994 } 13995 #endif 13996 if (retval == 0) { 13997 /* 13998 * If retval is 1 the tcb is unlocked and most likely the tp 13999 * is gone. 14000 */ 14001 INP_WLOCK_ASSERT(tp->t_inpcb); 14002 if ((rack->rc_gp_dyn_mul) && 14003 (rack->rc_always_pace) && 14004 (rack->use_fixed_rate == 0) && 14005 rack->in_probe_rtt && 14006 (rack->r_ctl.rc_time_probertt_starts == 0)) { 14007 /* 14008 * If we are going for target, lets recheck before 14009 * we output. 14010 */ 14011 rack_check_probe_rtt(rack, us_cts); 14012 } 14013 if (rack->set_pacing_done_a_iw == 0) { 14014 /* How much has been acked? */ 14015 if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) { 14016 /* We have enough to set in the pacing segment size */ 14017 rack->set_pacing_done_a_iw = 1; 14018 rack_set_pace_segments(tp, rack, __LINE__, NULL); 14019 } 14020 } 14021 tcp_rack_xmit_timer_commit(rack, tp); 14022 #ifdef TCP_ACCOUNTING 14023 /* 14024 * If we set the ack_val_se to what ack processing we are doing 14025 * we also want to track how many cycles we burned. Note 14026 * the bits after tcp_output we let be "free". This is because 14027 * we are also tracking the tcp_output times as well. Note the 14028 * use of 0xf here since we only have 11 counter (0 - 0xa) and 14029 * 0xf cannot be returned and is what we initialize it too to 14030 * indicate we are not doing the tabulations. 14031 */ 14032 if (ack_val_set != 0xf) { 14033 uint64_t crtsc; 14034 14035 crtsc = get_cyclecount(); 14036 counter_u64_add(tcp_proc_time[ack_val_set] , (crtsc - ts_val)); 14037 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 14038 tp->tcp_proc_time[ack_val_set] += (crtsc - ts_val); 14039 } 14040 } 14041 #endif 14042 if (nxt_pkt == 0) { 14043 if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) { 14044 do_output_now: 14045 did_out = 1; 14046 (void)tp->t_fb->tfb_tcp_output(tp); 14047 } 14048 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 14049 rack_free_trim(rack); 14050 } 14051 if ((nxt_pkt == 0) && 14052 ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && 14053 (SEQ_GT(tp->snd_max, tp->snd_una) || 14054 (tp->t_flags & TF_DELACK) || 14055 ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 14056 (tp->t_state <= TCPS_CLOSING)))) { 14057 /* We could not send (probably in the hpts but stopped the timer earlier)? 
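 * Below we note whether the pacer deadline
 * (rc_last_output_to) is still in the future (early) or
 * has already passed (late), clear PACE_PKT_OUTPUT, pull
 * the connection off the hpts output queue and then arm a
 * fresh timer so a keep-alive/delayed-ack style timer is
 * running again.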
*/ 14058 if ((tp->snd_max == tp->snd_una) && 14059 ((tp->t_flags & TF_DELACK) == 0) && 14060 (rack->rc_inp->inp_in_hpts) && 14061 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 14062 /* keep alive not needed if we are hptsi output yet */ 14063 ; 14064 } else { 14065 int late = 0; 14066 if (rack->rc_inp->inp_in_hpts) { 14067 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 14068 us_cts = tcp_get_usecs(NULL); 14069 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 14070 rack->r_early = 1; 14071 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); 14072 } else 14073 late = 1; 14074 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 14075 } 14076 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); 14077 } 14078 if (late && (did_out == 0)) { 14079 /* 14080 * We are late in the sending 14081 * and we did not call the output 14082 * (this probably should not happen). 14083 */ 14084 goto do_output_now; 14085 } 14086 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); 14087 } 14088 way_out = 1; 14089 } else if (nxt_pkt == 0) { 14090 /* Do we have the correct timer running? */ 14091 rack_timer_audit(tp, rack, &so->so_snd); 14092 way_out = 2; 14093 } 14094 done_with_input: 14095 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out, max(1, nsegs)); 14096 if (did_out) 14097 rack->r_wanted_output = 0; 14098 #ifdef INVARIANTS 14099 if (tp->t_inpcb == NULL) { 14100 panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d", 14101 did_out, 14102 retval, tp, prev_state); 14103 } 14104 #endif 14105 #ifdef TCP_ACCOUNTING 14106 } else { 14107 /* 14108 * Track the time (see above). 14109 */ 14110 if (ack_val_set != 0xf) { 14111 uint64_t crtsc; 14112 14113 crtsc = get_cyclecount(); 14114 counter_u64_add(tcp_proc_time[ack_val_set] , (crtsc - ts_val)); 14115 /* 14116 * Note we *DO NOT* increment the per-tcb counters since 14117 * in the else the TP may be gone!! 14118 */ 14119 } 14120 #endif 14121 } 14122 #ifdef TCP_ACCOUNTING 14123 sched_unpin(); 14124 #endif 14125 return (retval); 14126 } 14127 14128 void 14129 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, 14130 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos) 14131 { 14132 struct timeval tv; 14133 14134 /* First lets see if we have old packets */ 14135 if (tp->t_in_pkt) { 14136 if (ctf_do_queued_segments(so, tp, 1)) { 14137 m_freem(m); 14138 return; 14139 } 14140 } 14141 if (m->m_flags & M_TSTMP_LRO) { 14142 tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000; 14143 tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000; 14144 } else { 14145 /* Should not be should we kassert instead? */ 14146 tcp_get_usecs(&tv); 14147 } 14148 if (rack_do_segment_nounlock(m, th, so, tp, 14149 drop_hdrlen, tlen, iptos, 0, &tv) == 0) { 14150 INP_WUNLOCK(tp->t_inpcb); 14151 } 14152 } 14153 14154 struct rack_sendmap * 14155 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) 14156 { 14157 struct rack_sendmap *rsm = NULL; 14158 int32_t idx; 14159 uint32_t srtt = 0, thresh = 0, ts_low = 0; 14160 14161 /* Return the next guy to be re-transmitted */ 14162 if (RB_EMPTY(&rack->r_ctl.rc_mtree)) { 14163 return (NULL); 14164 } 14165 if (tp->t_flags & TF_SENTFIN) { 14166 /* retran the end FIN? 
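 * Once our FIN is out we do not pick a retransmit
 * candidate from the sendmap here; a lost FIN is
 * presumably recovered by the retransmit-timer path
 * rather than by this time-based selection.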
*/ 14167 return (NULL); 14168 } 14169 /* ok lets look at this one */ 14170 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 14171 if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) { 14172 goto check_it; 14173 } 14174 rsm = rack_find_lowest_rsm(rack); 14175 if (rsm == NULL) { 14176 return (NULL); 14177 } 14178 check_it: 14179 if (((rack->rc_tp->t_flags & TF_SACK_PERMIT) == 0) && 14180 (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { 14181 /* 14182 * No sack so we automatically do the 3 strikes and 14183 * retransmit (no rack timer would be started). 14184 */ 14185 14186 return (rsm); 14187 } 14188 if (rsm->r_flags & RACK_ACKED) { 14189 return (NULL); 14190 } 14191 if (((rsm->r_flags & RACK_SACK_PASSED) == 0) && 14192 (rsm->r_dupack < DUP_ACK_THRESHOLD)) { 14193 /* Its not yet ready */ 14194 return (NULL); 14195 } 14196 srtt = rack_grab_rtt(tp, rack); 14197 idx = rsm->r_rtr_cnt - 1; 14198 ts_low = (uint32_t)rsm->r_tim_lastsent[idx]; 14199 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 14200 if ((tsused == ts_low) || 14201 (TSTMP_LT(tsused, ts_low))) { 14202 /* No time since sending */ 14203 return (NULL); 14204 } 14205 if ((tsused - ts_low) < thresh) { 14206 /* It has not been long enough yet */ 14207 return (NULL); 14208 } 14209 if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || 14210 ((rsm->r_flags & RACK_SACK_PASSED) && 14211 (rack->sack_attack_disable == 0))) { 14212 /* 14213 * We have passed the dup-ack threshold <or> 14214 * a SACK has indicated this is missing. 14215 * Note that if you are a declared attacker 14216 * it is only the dup-ack threshold that 14217 * will cause retransmits. 14218 */ 14219 /* log retransmit reason */ 14220 rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1); 14221 rack->r_fast_output = 0; 14222 return (rsm); 14223 } 14224 return (NULL); 14225 } 14226 14227 static void 14228 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, 14229 uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, 14230 int line, struct rack_sendmap *rsm, uint8_t quality) 14231 { 14232 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 14233 union tcp_log_stackspecific log; 14234 struct timeval tv; 14235 14236 memset(&log, 0, sizeof(log)); 14237 log.u_bbr.flex1 = slot; 14238 log.u_bbr.flex2 = len; 14239 log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs; 14240 log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs; 14241 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ss; 14242 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_ca; 14243 log.u_bbr.use_lt_bw = rack->rc_ack_can_sendout_data; 14244 log.u_bbr.use_lt_bw <<= 1; 14245 log.u_bbr.use_lt_bw |= rack->r_late; 14246 log.u_bbr.use_lt_bw <<= 1; 14247 log.u_bbr.use_lt_bw |= rack->r_early; 14248 log.u_bbr.use_lt_bw <<= 1; 14249 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set; 14250 log.u_bbr.use_lt_bw <<= 1; 14251 log.u_bbr.use_lt_bw |= rack->rc_gp_filled; 14252 log.u_bbr.use_lt_bw <<= 1; 14253 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; 14254 log.u_bbr.use_lt_bw <<= 1; 14255 log.u_bbr.use_lt_bw |= rack->in_probe_rtt; 14256 log.u_bbr.use_lt_bw <<= 1; 14257 log.u_bbr.use_lt_bw |= rack->gp_ready; 14258 log.u_bbr.pkt_epoch = line; 14259 log.u_bbr.epoch = rack->r_ctl.rc_agg_delayed; 14260 log.u_bbr.lt_epoch = rack->r_ctl.rc_agg_early; 14261 log.u_bbr.applimited = rack->r_ctl.rack_per_of_gp_rec; 14262 log.u_bbr.bw_inuse = bw_est; 14263 log.u_bbr.delRate = bw; 14264 if (rack->r_ctl.gp_bw == 0) 14265 log.u_bbr.cur_del_rate = 0; 14266 else 14267 log.u_bbr.cur_del_rate = rack_get_bw(rack); 14268 log.u_bbr.rttProp = len_time; 14269 
log.u_bbr.pkts_out = rack->r_ctl.rc_rack_min_rtt; 14270 log.u_bbr.lost = rack->r_ctl.rc_probertt_sndmax_atexit; 14271 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); 14272 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) { 14273 /* We are in slow start */ 14274 log.u_bbr.flex7 = 1; 14275 } else { 14276 /* we are on congestion avoidance */ 14277 log.u_bbr.flex7 = 0; 14278 } 14279 log.u_bbr.flex8 = method; 14280 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 14281 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 14282 log.u_bbr.cwnd_gain = rack->rc_gp_saw_rec; 14283 log.u_bbr.cwnd_gain <<= 1; 14284 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; 14285 log.u_bbr.cwnd_gain <<= 1; 14286 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; 14287 log.u_bbr.bbr_substate = quality; 14288 TCP_LOG_EVENTP(rack->rc_tp, NULL, 14289 &rack->rc_inp->inp_socket->so_rcv, 14290 &rack->rc_inp->inp_socket->so_snd, 14291 BBR_LOG_HPTSI_CALC, 0, 14292 0, &log, false, &tv); 14293 } 14294 } 14295 14296 static uint32_t 14297 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss) 14298 { 14299 uint32_t new_tso, user_max; 14300 14301 user_max = rack->rc_user_set_max_segs * mss; 14302 if (rack->rc_force_max_seg) { 14303 return (user_max); 14304 } 14305 if (rack->use_fixed_rate && 14306 ((rack->r_ctl.crte == NULL) || 14307 (bw != rack->r_ctl.crte->rate))) { 14308 /* Use the user mss since we are not exactly matched */ 14309 return (user_max); 14310 } 14311 new_tso = tcp_get_pacing_burst_size(rack->rc_tp, bw, mss, rack_pace_one_seg, rack->r_ctl.crte, NULL); 14312 if (new_tso > user_max) 14313 new_tso = user_max; 14314 return (new_tso); 14315 } 14316 14317 static int32_t 14318 pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz, int *capped, uint64_t *rate_wanted, uint8_t non_paced) 14319 { 14320 uint64_t lentim, fill_bw; 14321 14322 /* Lets first see if we are full, if so continue with normal rate */ 14323 rack->r_via_fill_cw = 0; 14324 if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use) 14325 return (slot); 14326 if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd) 14327 return (slot); 14328 if (rack->r_ctl.rc_last_us_rtt == 0) 14329 return (slot); 14330 if (rack->rc_pace_fill_if_rttin_range && 14331 (rack->r_ctl.rc_last_us_rtt >= 14332 (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) { 14333 /* The rtt is huge, N * smallest, lets not fill */ 14334 return (slot); 14335 } 14336 /* 14337 * first lets calculate the b/w based on the last us-rtt 14338 * and the sndwnd. 14339 */ 14340 fill_bw = rack->r_ctl.cwnd_to_use; 14341 /* Take the rwnd if its smaller */ 14342 if (fill_bw > rack->rc_tp->snd_wnd) 14343 fill_bw = rack->rc_tp->snd_wnd; 14344 if (rack->r_fill_less_agg) { 14345 /* 14346 * Now take away the inflight (this will reduce our 14347 * aggressiveness and yeah, if we get that much out in 1RTT 14348 * we will have had acks come back and still be behind). 
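 * Example: with cwnd_to_use = 64000 bytes (and a larger
 * snd_wnd) and rc_last_us_rtt = 10000 usecs the code below
 * yields fill_bw = 64000 * 1000000 / 10000 = 6,400,000
 * bytes/sec, i.e. the rate needed to drain one full window
 * per RTT.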
14349 */ 14350 fill_bw -= ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 14351 } 14352 /* Now lets make it into a b/w */ 14353 fill_bw *= (uint64_t)HPTS_USEC_IN_SEC; 14354 fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt; 14355 /* We are below the min b/w */ 14356 if (non_paced) 14357 *rate_wanted = fill_bw; 14358 if ((fill_bw < RACK_MIN_BW) || (fill_bw < *rate_wanted)) 14359 return (slot); 14360 if (rack->r_ctl.bw_rate_cap && (fill_bw > rack->r_ctl.bw_rate_cap)) 14361 fill_bw = rack->r_ctl.bw_rate_cap; 14362 rack->r_via_fill_cw = 1; 14363 if (rack->r_rack_hw_rate_caps && 14364 (rack->r_ctl.crte != NULL)) { 14365 uint64_t high_rate; 14366 14367 high_rate = tcp_hw_highest_rate(rack->r_ctl.crte); 14368 if (fill_bw > high_rate) { 14369 /* We are capping bw at the highest rate table entry */ 14370 if (*rate_wanted > high_rate) { 14371 /* The original rate was also capped */ 14372 rack->r_via_fill_cw = 0; 14373 } 14374 rack_log_hdwr_pacing(rack, 14375 fill_bw, high_rate, __LINE__, 14376 0, 3); 14377 fill_bw = high_rate; 14378 if (capped) 14379 *capped = 1; 14380 } 14381 } else if ((rack->r_ctl.crte == NULL) && 14382 (rack->rack_hdrw_pacing == 0) && 14383 (rack->rack_hdw_pace_ena) && 14384 rack->r_rack_hw_rate_caps && 14385 (rack->rack_attempt_hdwr_pace == 0) && 14386 (rack->rc_inp->inp_route.ro_nh != NULL) && 14387 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 14388 /* 14389 * Ok we may have a first attempt that is greater than our top rate 14390 * lets check. 14391 */ 14392 uint64_t high_rate; 14393 14394 high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp); 14395 if (high_rate) { 14396 if (fill_bw > high_rate) { 14397 fill_bw = high_rate; 14398 if (capped) 14399 *capped = 1; 14400 } 14401 } 14402 } 14403 /* 14404 * Ok fill_bw holds our mythical b/w to fill the cwnd 14405 * in a rtt, what does that time wise equate too? 14406 */ 14407 lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC; 14408 lentim /= fill_bw; 14409 *rate_wanted = fill_bw; 14410 if (non_paced || (lentim < slot)) { 14411 rack_log_pacing_delay_calc(rack, len, slot, fill_bw, 14412 0, lentim, 12, __LINE__, NULL, 0); 14413 return ((int32_t)lentim); 14414 } else 14415 return (slot); 14416 } 14417 14418 static int32_t 14419 rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz) 14420 { 14421 int32_t slot = 0; 14422 int can_start_hw_pacing = 1; 14423 int err; 14424 14425 if (rack->rc_always_pace == 0) { 14426 /* 14427 * We use the most optimistic possible cwnd/srtt for 14428 * sending calculations. This will make our 14429 * calculation anticipate getting more through 14430 * quicker then possible. But thats ok we don't want 14431 * the peer to have a gap in data sending. 14432 */ 14433 uint32_t srtt, cwnd, tr_perms = 0; 14434 int32_t reduce = 0; 14435 14436 old_method: 14437 /* 14438 * We keep no precise pacing with the old method 14439 * instead we use the pacer to mitigate bursts. 
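 * Example: cwnd = 100000 bytes and srtt = 20000 usecs
 * gives tr_perms = 100000 * 1000 / 20000 = 5000 bytes per
 * msec, so a 15000 byte send gets a 3 msec slot before the
 * optional rack_slot_reduction trim and the conversion to
 * usecs.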
14440 */ 14441 if (rack->r_ctl.rc_rack_min_rtt) 14442 srtt = rack->r_ctl.rc_rack_min_rtt; 14443 else 14444 srtt = max(tp->t_srtt, 1); 14445 if (rack->r_ctl.rc_rack_largest_cwnd) 14446 cwnd = rack->r_ctl.rc_rack_largest_cwnd; 14447 else 14448 cwnd = rack->r_ctl.cwnd_to_use; 14449 /* Inflate cwnd by 1000 so srtt of usecs is in ms */ 14450 tr_perms = (cwnd * 1000) / srtt; 14451 if (tr_perms == 0) { 14452 tr_perms = ctf_fixed_maxseg(tp); 14453 } 14454 /* 14455 * Calculate how long this will take to drain, if 14456 * the calculation comes out to zero, thats ok we 14457 * will use send_a_lot to possibly spin around for 14458 * more increasing tot_len_this_send to the point 14459 * that its going to require a pace, or we hit the 14460 * cwnd. Which in that case we are just waiting for 14461 * a ACK. 14462 */ 14463 slot = len / tr_perms; 14464 /* Now do we reduce the time so we don't run dry? */ 14465 if (slot && rack_slot_reduction) { 14466 reduce = (slot / rack_slot_reduction); 14467 if (reduce < slot) { 14468 slot -= reduce; 14469 } else 14470 slot = 0; 14471 } 14472 slot *= HPTS_USEC_IN_MSEC; 14473 if (rack->rc_pace_to_cwnd) { 14474 uint64_t rate_wanted = 0; 14475 14476 slot = pace_to_fill_cwnd(rack, slot, len, segsiz, NULL, &rate_wanted, 1); 14477 rack->rc_ack_can_sendout_data = 1; 14478 rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, 0, 0, 14, __LINE__, NULL, 0); 14479 } else 14480 rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL, 0); 14481 } else { 14482 uint64_t bw_est, res, lentim, rate_wanted; 14483 uint32_t orig_val, srtt, segs, oh; 14484 int capped = 0; 14485 int prev_fill; 14486 14487 if ((rack->r_rr_config == 1) && rsm) { 14488 return (rack->r_ctl.rc_min_to); 14489 } 14490 if (rack->use_fixed_rate) { 14491 rate_wanted = bw_est = rack_get_fixed_pacing_bw(rack); 14492 } else if ((rack->r_ctl.init_rate == 0) && 14493 #ifdef NETFLIX_PEAKRATE 14494 (rack->rc_tp->t_maxpeakrate == 0) && 14495 #endif 14496 (rack->r_ctl.gp_bw == 0)) { 14497 /* no way to yet do an estimate */ 14498 bw_est = rate_wanted = 0; 14499 } else { 14500 bw_est = rack_get_bw(rack); 14501 rate_wanted = rack_get_output_bw(rack, bw_est, rsm, &capped); 14502 } 14503 if ((bw_est == 0) || (rate_wanted == 0) || 14504 ((rack->gp_ready == 0) && (rack->use_fixed_rate == 0))) { 14505 /* 14506 * No way yet to make a b/w estimate or 14507 * our raise is set incorrectly. 14508 */ 14509 goto old_method; 14510 } 14511 /* We need to account for all the overheads */ 14512 segs = (len + segsiz - 1) / segsiz; 14513 /* 14514 * We need the diff between 1514 bytes (e-mtu with e-hdr) 14515 * and how much data we put in each packet. Yes this 14516 * means we may be off if we are larger than 1500 bytes 14517 * or smaller. But this just makes us more conservative. 14518 */ 14519 if (rack_hw_rate_min && 14520 (bw_est < rack_hw_rate_min)) 14521 can_start_hw_pacing = 0; 14522 if (ETHERNET_SEGMENT_SIZE > segsiz) 14523 oh = ETHERNET_SEGMENT_SIZE - segsiz; 14524 else 14525 oh = 0; 14526 segs *= oh; 14527 lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC; 14528 res = lentim / rate_wanted; 14529 slot = (uint32_t)res; 14530 orig_val = rack->r_ctl.rc_pace_max_segs; 14531 if (rack->r_ctl.crte == NULL) { 14532 /* 14533 * Only do this if we are not hardware pacing 14534 * since if we are doing hw-pacing below we will 14535 * set make a call after setting up or changing 14536 * the rate. 
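 * In the hardware case rc_pace_max_segs is instead
 * recomputed via tcp_get_pacing_burst_size() right after
 * the rate is installed or changed below, so doing it here
 * too would be redundant.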
14537 */ 14538 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 14539 } else if (rack->rc_inp->inp_snd_tag == NULL) { 14540 /* 14541 * We lost our rate somehow, this can happen 14542 * if the interface changed underneath us. 14543 */ 14544 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 14545 rack->r_ctl.crte = NULL; 14546 /* Lets re-allow attempting to setup pacing */ 14547 rack->rack_hdrw_pacing = 0; 14548 rack->rack_attempt_hdwr_pace = 0; 14549 rack_log_hdwr_pacing(rack, 14550 rate_wanted, bw_est, __LINE__, 14551 0, 6); 14552 } 14553 /* Did we change the TSO size, if so log it */ 14554 if (rack->r_ctl.rc_pace_max_segs != orig_val) 14555 rack_log_pacing_delay_calc(rack, len, slot, orig_val, 0, 0, 15, __LINE__, NULL, 0); 14556 prev_fill = rack->r_via_fill_cw; 14557 if ((rack->rc_pace_to_cwnd) && 14558 (capped == 0) && 14559 (rack->use_fixed_rate == 0) && 14560 (rack->in_probe_rtt == 0) && 14561 (IN_FASTRECOVERY(rack->rc_tp->t_flags) == 0)) { 14562 /* 14563 * We want to pace at our rate *or* faster to 14564 * fill the cwnd to the max if its not full. 14565 */ 14566 slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz, &capped, &rate_wanted, 0); 14567 } 14568 if ((rack->rc_inp->inp_route.ro_nh != NULL) && 14569 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 14570 if ((rack->rack_hdw_pace_ena) && 14571 (can_start_hw_pacing > 0) && 14572 (rack->rack_hdrw_pacing == 0) && 14573 (rack->rack_attempt_hdwr_pace == 0)) { 14574 /* 14575 * Lets attempt to turn on hardware pacing 14576 * if we can. 14577 */ 14578 rack->rack_attempt_hdwr_pace = 1; 14579 rack->r_ctl.crte = tcp_set_pacing_rate(rack->rc_tp, 14580 rack->rc_inp->inp_route.ro_nh->nh_ifp, 14581 rate_wanted, 14582 RS_PACING_GEQ, 14583 &err, &rack->r_ctl.crte_prev_rate); 14584 if (rack->r_ctl.crte) { 14585 rack->rack_hdrw_pacing = 1; 14586 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(tp, rate_wanted, segsiz, 14587 0, rack->r_ctl.crte, 14588 NULL); 14589 rack_log_hdwr_pacing(rack, 14590 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 14591 err, 0); 14592 rack->r_ctl.last_hw_bw_req = rate_wanted; 14593 } else { 14594 counter_u64_add(rack_hw_pace_init_fail, 1); 14595 } 14596 } else if (rack->rack_hdrw_pacing && 14597 (rack->r_ctl.last_hw_bw_req != rate_wanted)) { 14598 /* Do we need to adjust our rate? */ 14599 const struct tcp_hwrate_limit_table *nrte; 14600 14601 if (rack->r_up_only && 14602 (rate_wanted < rack->r_ctl.crte->rate)) { 14603 /** 14604 * We have four possible states here 14605 * having to do with the previous time 14606 * and this time. 14607 * previous | this-time 14608 * A) 0 | 0 -- fill_cw not in the picture 14609 * B) 1 | 0 -- we were doing a fill-cw but now are not 14610 * C) 1 | 1 -- all rates from fill_cw 14611 * D) 0 | 1 -- we were doing non-fill and now we are filling 14612 * 14613 * For case A, C and D we don't allow a drop. But for 14614 * case B where we now our on our steady rate we do 14615 * allow a drop. 14616 * 14617 */ 14618 if (!((prev_fill == 1) && (rack->r_via_fill_cw == 0))) 14619 goto done_w_hdwr; 14620 } 14621 if ((rate_wanted > rack->r_ctl.crte->rate) || 14622 (rate_wanted <= rack->r_ctl.crte_prev_rate)) { 14623 if (rack_hw_rate_to_low && 14624 (bw_est < rack_hw_rate_to_low)) { 14625 /* 14626 * The pacing rate is too low for hardware, but 14627 * do allow hardware pacing to be restarted. 
14628 */ 14629 rack_log_hdwr_pacing(rack, 14630 bw_est, rack->r_ctl.crte->rate, __LINE__, 14631 0, 5); 14632 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 14633 rack->r_ctl.crte = NULL; 14634 rack->rack_attempt_hdwr_pace = 0; 14635 rack->rack_hdrw_pacing = 0; 14636 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted); 14637 goto done_w_hdwr; 14638 } 14639 nrte = tcp_chg_pacing_rate(rack->r_ctl.crte, 14640 rack->rc_tp, 14641 rack->rc_inp->inp_route.ro_nh->nh_ifp, 14642 rate_wanted, 14643 RS_PACING_GEQ, 14644 &err, &rack->r_ctl.crte_prev_rate); 14645 if (nrte == NULL) { 14646 /* Lost the rate */ 14647 rack->rack_hdrw_pacing = 0; 14648 rack->r_ctl.crte = NULL; 14649 rack_log_hdwr_pacing(rack, 14650 rate_wanted, 0, __LINE__, 14651 err, 1); 14652 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted); 14653 counter_u64_add(rack_hw_pace_lost, 1); 14654 } else if (nrte != rack->r_ctl.crte) { 14655 rack->r_ctl.crte = nrte; 14656 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(tp, rate_wanted, 14657 segsiz, 0, 14658 rack->r_ctl.crte, 14659 NULL); 14660 rack_log_hdwr_pacing(rack, 14661 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 14662 err, 2); 14663 rack->r_ctl.last_hw_bw_req = rate_wanted; 14664 } 14665 } else { 14666 /* We just need to adjust the segment size */ 14667 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted); 14668 rack_log_hdwr_pacing(rack, 14669 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 14670 0, 4); 14671 rack->r_ctl.last_hw_bw_req = rate_wanted; 14672 } 14673 } 14674 } 14675 if ((rack->r_ctl.crte != NULL) && 14676 (rack->r_ctl.crte->rate == rate_wanted)) { 14677 /* 14678 * We need to add a extra if the rates 14679 * are exactly matched. The idea is 14680 * we want the software to make sure the 14681 * queue is empty before adding more, this 14682 * gives us N MSS extra pace times where 14683 * N is our sysctl 14684 */ 14685 slot += (rack->r_ctl.crte->time_between * rack_hw_pace_extra_slots); 14686 } 14687 done_w_hdwr: 14688 if (rack_limit_time_with_srtt && 14689 (rack->use_fixed_rate == 0) && 14690 #ifdef NETFLIX_PEAKRATE 14691 (rack->rc_tp->t_maxpeakrate == 0) && 14692 #endif 14693 (rack->rack_hdrw_pacing == 0)) { 14694 /* 14695 * Sanity check, we do not allow the pacing delay 14696 * to be longer than the SRTT of the path. If it is 14697 * a slow path, then adding a packet should increase 14698 * the RTT and compensate for this i.e. the srtt will 14699 * be greater so the allowed pacing time will be greater. 14700 * 14701 * Note this restriction is not for where a peak rate 14702 * is set, we are doing fixed pacing or hardware pacing. 14703 */ 14704 if (rack->rc_tp->t_srtt) 14705 srtt = rack->rc_tp->t_srtt; 14706 else 14707 srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC; /* its in ms convert */ 14708 if (srtt < slot) { 14709 rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL, 0); 14710 slot = srtt; 14711 } 14712 } 14713 rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm, 0); 14714 } 14715 if (rack->r_ctl.crte && (rack->r_ctl.crte->rs_num_enobufs > 0)) { 14716 /* 14717 * If this rate is seeing enobufs when it 14718 * goes to send then either the nic is out 14719 * of gas or we are mis-estimating the time 14720 * somehow and not letting the queue empty 14721 * completely. Lets add to the pacing time. 
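 * The boost is time_between * rack_enobuf_hw_boost_mult,
 * clamped between rack_enobuf_hw_min and
 * rack_enobuf_hw_max, and is added on top of the slot we
 * already computed.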
14722 */ 14723 int hw_boost_delay; 14724 14725 hw_boost_delay = rack->r_ctl.crte->time_between * rack_enobuf_hw_boost_mult; 14726 if (hw_boost_delay > rack_enobuf_hw_max) 14727 hw_boost_delay = rack_enobuf_hw_max; 14728 else if (hw_boost_delay < rack_enobuf_hw_min) 14729 hw_boost_delay = rack_enobuf_hw_min; 14730 slot += hw_boost_delay; 14731 } 14732 if (slot) 14733 counter_u64_add(rack_calc_nonzero, 1); 14734 else 14735 counter_u64_add(rack_calc_zero, 1); 14736 return (slot); 14737 } 14738 14739 static void 14740 rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack, 14741 tcp_seq startseq, uint32_t sb_offset) 14742 { 14743 struct rack_sendmap *my_rsm = NULL; 14744 struct rack_sendmap fe; 14745 14746 if (tp->t_state < TCPS_ESTABLISHED) { 14747 /* 14748 * We don't start any measurements if we are 14749 * not at least established. 14750 */ 14751 return; 14752 } 14753 if (tp->t_state >= TCPS_FIN_WAIT_1) { 14754 /* 14755 * We will get no more data into the SB 14756 * this means we need to have the data available 14757 * before we start a measurement. 14758 */ 14759 14760 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) < 14761 max(rc_init_window(rack), 14762 (MIN_GP_WIN * ctf_fixed_maxseg(tp)))) { 14763 /* Nope not enough data */ 14764 return; 14765 } 14766 } 14767 tp->t_flags |= TF_GPUTINPROG; 14768 rack->r_ctl.rc_gp_lowrtt = 0xffffffff; 14769 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 14770 tp->gput_seq = startseq; 14771 rack->app_limited_needs_set = 0; 14772 if (rack->in_probe_rtt) 14773 rack->measure_saw_probe_rtt = 1; 14774 else if ((rack->measure_saw_probe_rtt) && 14775 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 14776 rack->measure_saw_probe_rtt = 0; 14777 if (rack->rc_gp_filled) 14778 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 14779 else { 14780 /* Special case initial measurement */ 14781 struct timeval tv; 14782 14783 tp->gput_ts = tcp_get_usecs(&tv); 14784 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); 14785 } 14786 /* 14787 * We take a guess out into the future, 14788 * if we have no measurement and no 14789 * initial rate, we measure the first 14790 * initial-windows worth of data to 14791 * speed up getting some GP measurement and 14792 * thus start pacing. 14793 */ 14794 if ((rack->rc_gp_filled == 0) && (rack->r_ctl.init_rate == 0)) { 14795 rack->app_limited_needs_set = 1; 14796 tp->gput_ack = startseq + max(rc_init_window(rack), 14797 (MIN_GP_WIN * ctf_fixed_maxseg(tp))); 14798 rack_log_pacing_delay_calc(rack, 14799 tp->gput_seq, 14800 tp->gput_ack, 14801 0, 14802 tp->gput_ts, 14803 rack->r_ctl.rc_app_limited_cnt, 14804 9, 14805 __LINE__, NULL, 0); 14806 return; 14807 } 14808 if (sb_offset) { 14809 /* 14810 * We are out somewhere in the sb 14811 * can we use the already outstanding data? 14812 */ 14813 if (rack->r_ctl.rc_app_limited_cnt == 0) { 14814 /* 14815 * Yes first one is good and in this case 14816 * the tp->gput_ts is correctly set based on 14817 * the last ack that arrived (no need to 14818 * set things up when an ack comes in). 14819 */ 14820 my_rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 14821 if ((my_rsm == NULL) || 14822 (my_rsm->r_rtr_cnt != 1)) { 14823 /* retransmission? */ 14824 goto use_latest; 14825 } 14826 } else { 14827 if (rack->r_ctl.rc_first_appl == NULL) { 14828 /* 14829 * If rc_first_appl is NULL 14830 * then the cnt should be 0. 14831 * This is probably an error, maybe 14832 * a KASSERT would be approprate. 
14833 */ 14834 goto use_latest; 14835 } 14836 /* 14837 * If we have a marker pointer to the last one that is 14838 * app limited we can use that, but we need to set 14839 * things up so that when it gets ack'ed we record 14840 * the ack time (if its not already acked). 14841 */ 14842 rack->app_limited_needs_set = 1; 14843 /* 14844 * We want to get to the rsm that is either 14845 * next with space i.e. over 1 MSS or the one 14846 * after that (after the app-limited). 14847 */ 14848 my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, 14849 rack->r_ctl.rc_first_appl); 14850 if (my_rsm) { 14851 if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp)) 14852 /* Have to use the next one */ 14853 my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, 14854 my_rsm); 14855 else { 14856 /* Use after the first MSS of it is acked */ 14857 tp->gput_seq = my_rsm->r_start + ctf_fixed_maxseg(tp); 14858 goto start_set; 14859 } 14860 } 14861 if ((my_rsm == NULL) || 14862 (my_rsm->r_rtr_cnt != 1)) { 14863 /* 14864 * Either its a retransmit or 14865 * the last is the app-limited one. 14866 */ 14867 goto use_latest; 14868 } 14869 } 14870 tp->gput_seq = my_rsm->r_start; 14871 start_set: 14872 if (my_rsm->r_flags & RACK_ACKED) { 14873 /* 14874 * This one has been acked use the arrival ack time 14875 */ 14876 tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival; 14877 rack->app_limited_needs_set = 0; 14878 } 14879 rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[(my_rsm->r_rtr_cnt-1)]; 14880 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 14881 rack_log_pacing_delay_calc(rack, 14882 tp->gput_seq, 14883 tp->gput_ack, 14884 (uint64_t)my_rsm, 14885 tp->gput_ts, 14886 rack->r_ctl.rc_app_limited_cnt, 14887 9, 14888 __LINE__, NULL, 0); 14889 return; 14890 } 14891 14892 use_latest: 14893 /* 14894 * We don't know how long we may have been 14895 * idle or if this is the first-send. Lets 14896 * setup the flag so we will trim off 14897 * the first ack'd data so we get a true 14898 * measurement. 14899 */ 14900 rack->app_limited_needs_set = 1; 14901 tp->gput_ack = startseq + rack_get_measure_window(tp, rack); 14902 /* Find this guy so we can pull the send time */ 14903 fe.r_start = startseq; 14904 my_rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 14905 if (my_rsm) { 14906 rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[(my_rsm->r_rtr_cnt-1)]; 14907 if (my_rsm->r_flags & RACK_ACKED) { 14908 /* 14909 * Unlikely since its probably what was 14910 * just transmitted (but I am paranoid). 14911 */ 14912 tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival; 14913 rack->app_limited_needs_set = 0; 14914 } 14915 if (SEQ_LT(my_rsm->r_start, tp->gput_seq)) { 14916 /* This also is unlikely */ 14917 tp->gput_seq = my_rsm->r_start; 14918 } 14919 } else { 14920 /* 14921 * TSNH unless we have some send-map limit, 14922 * and even at that it should not be hitting 14923 * that limit (we should have stopped sending). 
14924 */ 14925 struct timeval tv; 14926 14927 microuptime(&tv); 14928 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); 14929 } 14930 rack_log_pacing_delay_calc(rack, 14931 tp->gput_seq, 14932 tp->gput_ack, 14933 (uint64_t)my_rsm, 14934 tp->gput_ts, 14935 rack->r_ctl.rc_app_limited_cnt, 14936 9, __LINE__, NULL, 0); 14937 } 14938 14939 static inline uint32_t 14940 rack_what_can_we_send(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cwnd_to_use, 14941 uint32_t avail, int32_t sb_offset) 14942 { 14943 uint32_t len; 14944 uint32_t sendwin; 14945 14946 if (tp->snd_wnd > cwnd_to_use) 14947 sendwin = cwnd_to_use; 14948 else 14949 sendwin = tp->snd_wnd; 14950 if (ctf_outstanding(tp) >= tp->snd_wnd) { 14951 /* We never want to go over our peers rcv-window */ 14952 len = 0; 14953 } else { 14954 uint32_t flight; 14955 14956 flight = ctf_flight_size(tp, rack->r_ctl.rc_sacked); 14957 if (flight >= sendwin) { 14958 /* 14959 * We have in flight what we are allowed by cwnd (if 14960 * it was rwnd blocking it would have hit above out 14961 * >= tp->snd_wnd). 14962 */ 14963 return (0); 14964 } 14965 len = sendwin - flight; 14966 if ((len + ctf_outstanding(tp)) > tp->snd_wnd) { 14967 /* We would send too much (beyond the rwnd) */ 14968 len = tp->snd_wnd - ctf_outstanding(tp); 14969 } 14970 if ((len + sb_offset) > avail) { 14971 /* 14972 * We don't have that much in the SB, how much is 14973 * there? 14974 */ 14975 len = avail - sb_offset; 14976 } 14977 } 14978 return (len); 14979 } 14980 14981 static void 14982 rack_log_fsb(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t flags, 14983 unsigned ipoptlen, int32_t orig_len, int32_t len, int error, 14984 int rsm_is_null, int optlen, int line, uint16_t mode) 14985 { 14986 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 14987 union tcp_log_stackspecific log; 14988 struct timeval tv; 14989 14990 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 14991 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 14992 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 14993 log.u_bbr.flex1 = error; 14994 log.u_bbr.flex2 = flags; 14995 log.u_bbr.flex3 = rsm_is_null; 14996 log.u_bbr.flex4 = ipoptlen; 14997 log.u_bbr.flex5 = tp->rcv_numsacks; 14998 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 14999 log.u_bbr.flex7 = optlen; 15000 log.u_bbr.flex8 = rack->r_fsb_inited; 15001 log.u_bbr.applimited = rack->r_fast_output; 15002 log.u_bbr.bw_inuse = rack_get_bw(rack); 15003 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); 15004 log.u_bbr.cwnd_gain = mode; 15005 log.u_bbr.pkts_out = orig_len; 15006 log.u_bbr.lt_epoch = len; 15007 log.u_bbr.delivered = line; 15008 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 15009 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 15010 tcp_log_event_(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_FSB, 0, 15011 len, &log, false, NULL, NULL, 0, &tv); 15012 } 15013 } 15014 15015 15016 static struct mbuf * 15017 rack_fo_base_copym(struct mbuf *the_m, uint32_t the_off, int32_t *plen, 15018 struct rack_fast_send_blk *fsb, 15019 int32_t seglimit, int32_t segsize, int hw_tls) 15020 { 15021 #ifdef KERN_TLS 15022 struct ktls_session *tls, *ntls; 15023 struct mbuf *start; 15024 #endif 15025 struct mbuf *m, *n, **np, *smb; 15026 struct mbuf *top; 15027 int32_t off, soff; 15028 int32_t len = *plen; 15029 int32_t fragsize; 15030 int32_t len_cp = 0; 15031 uint32_t mlen, frags; 15032 15033 soff = off = the_off; 15034 smb = m = the_m; 15035 np = ⊤ 15036 top = NULL; 15037 #ifdef KERN_TLS 15038 if (hw_tls && (m->m_flags & M_EXTPG)) 15039 tls = 
m->m_epg_tls; 15040 else 15041 tls = NULL; 15042 start = m; 15043 #endif 15044 while (len > 0) { 15045 if (m == NULL) { 15046 *plen = len_cp; 15047 break; 15048 } 15049 #ifdef KERN_TLS 15050 if (hw_tls) { 15051 if (m->m_flags & M_EXTPG) 15052 ntls = m->m_epg_tls; 15053 else 15054 ntls = NULL; 15055 15056 /* 15057 * Avoid mixing TLS records with handshake 15058 * data or TLS records from different 15059 * sessions. 15060 */ 15061 if (tls != ntls) { 15062 MPASS(m != start); 15063 *plen = len_cp; 15064 break; 15065 } 15066 } 15067 #endif 15068 mlen = min(len, m->m_len - off); 15069 if (seglimit) { 15070 /* 15071 * For M_EXTPG mbufs, add 3 segments 15072 * + 1 in case we are crossing page boundaries 15073 * + 2 in case the TLS hdr/trailer are used 15074 * It is cheaper to just add the segments 15075 * than it is to take the cache miss to look 15076 * at the mbuf ext_pgs state in detail. 15077 */ 15078 if (m->m_flags & M_EXTPG) { 15079 fragsize = min(segsize, PAGE_SIZE); 15080 frags = 3; 15081 } else { 15082 fragsize = segsize; 15083 frags = 0; 15084 } 15085 15086 /* Break if we really can't fit anymore. */ 15087 if ((frags + 1) >= seglimit) { 15088 *plen = len_cp; 15089 break; 15090 } 15091 15092 /* 15093 * Reduce size if you can't copy the whole 15094 * mbuf. If we can't copy the whole mbuf, also 15095 * adjust len so the loop will end after this 15096 * mbuf. 15097 */ 15098 if ((frags + howmany(mlen, fragsize)) >= seglimit) { 15099 mlen = (seglimit - frags - 1) * fragsize; 15100 len = mlen; 15101 *plen = len_cp + len; 15102 } 15103 frags += howmany(mlen, fragsize); 15104 if (frags == 0) 15105 frags++; 15106 seglimit -= frags; 15107 KASSERT(seglimit > 0, 15108 ("%s: seglimit went too low", __func__)); 15109 } 15110 n = m_get(M_NOWAIT, m->m_type); 15111 *np = n; 15112 if (n == NULL) 15113 goto nospace; 15114 n->m_len = mlen; 15115 soff += mlen; 15116 len_cp += n->m_len; 15117 if (m->m_flags & (M_EXT|M_EXTPG)) { 15118 n->m_data = m->m_data + off; 15119 mb_dupcl(n, m); 15120 } else { 15121 bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), 15122 (u_int)n->m_len); 15123 } 15124 len -= n->m_len; 15125 off = 0; 15126 m = m->m_next; 15127 np = &n->m_next; 15128 if (len || (soff == smb->m_len)) { 15129 /* 15130 * We have more so we move forward or 15131 * we have consumed the entire mbuf and 15132 * len has fell to 0. 15133 */ 15134 soff = 0; 15135 smb = m; 15136 } 15137 15138 } 15139 if (fsb != NULL) { 15140 fsb->m = smb; 15141 fsb->off = soff; 15142 if (smb) { 15143 /* 15144 * Save off the size of the mbuf. We do 15145 * this so that we can recognize when it 15146 * has been trimmed by sbcut() as acks 15147 * come in. 15148 */ 15149 fsb->o_m_len = smb->m_len; 15150 } else { 15151 /* 15152 * This is the case where the next mbuf went to NULL. This 15153 * means with this copy we have sent everything in the sb. 15154 * In theory we could clear the fast_output flag, but lets 15155 * not since its possible that we could get more added 15156 * and acks that call the extend function which would let 15157 * us send more. 15158 */ 15159 fsb->o_m_len = 0; 15160 } 15161 } 15162 return (top); 15163 nospace: 15164 if (top) 15165 m_freem(top); 15166 return (NULL); 15167 15168 } 15169 15170 /* 15171 * This is a copy of m_copym(), taking the TSO segment size/limit 15172 * constraints into account, and advancing the sndptr as it goes. 
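 * The saved fsb.o_m_len lets the routine below notice when acks have
 * trimmed the front of the current mbuf (via sbcut()) and re-base the
 * offset before handing things to rack_fo_base_copym().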
 */
static struct mbuf *
rack_fo_m_copym(struct tcp_rack *rack, int32_t *plen,
    int32_t seglimit, int32_t segsize, struct mbuf **s_mb, int *s_soff)
{
	struct mbuf *m, *n;
	int32_t soff;

	soff = rack->r_ctl.fsb.off;
	m = rack->r_ctl.fsb.m;
	if (rack->r_ctl.fsb.o_m_len != m->m_len) {
		/*
		 * The mbuf had the front of it chopped off by an ack;
		 * we need to adjust the soff/off by that difference.
		 */
		uint32_t delta;

		delta = rack->r_ctl.fsb.o_m_len - m->m_len;
		soff -= delta;
	}
	KASSERT(soff >= 0, ("%s, negative off %d", __FUNCTION__, soff));
	KASSERT(*plen >= 0, ("%s, negative len %d", __FUNCTION__, *plen));
	KASSERT(soff < m->m_len, ("%s rack:%p len:%u m:%p m->m_len:%u < off?",
	    __FUNCTION__,
	    rack, *plen, m, m->m_len));
	/* Save off the right location before we copy and advance */
	*s_soff = soff;
	*s_mb = rack->r_ctl.fsb.m;
	n = rack_fo_base_copym(m, soff, plen,
	    &rack->r_ctl.fsb,
	    seglimit, segsize, rack->r_ctl.fsb.hw_tls);
	return (n);
}

static int
rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm,
    uint64_t ts_val, uint32_t cts, uint32_t ms_cts, struct timeval *tv, int len, uint8_t doing_tlp)
{
	/*
	 * Enter the fast retransmit path. We are given that a sched_pin is
	 * in place (if accounting is compiled in) and the cycle count taken
	 * at entry is in ts_val. The concept here is that the rsm
	 * now holds the mbuf offsets and such so we can directly transmit
	 * without a lot of overhead; the len field is already set,
	 * prohibiting us from sending too much (usually it's 1 MSS).
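	 * Only the timestamp option is rebuilt on this path, and SYN, RST
	 * and FIN segments are refused; when any precondition fails we
	 * return -1 and the caller falls back to the full output path.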
15218 */ 15219 struct ip *ip = NULL; 15220 struct udphdr *udp = NULL; 15221 struct tcphdr *th = NULL; 15222 struct mbuf *m = NULL; 15223 struct inpcb *inp; 15224 uint8_t *cpto; 15225 struct tcp_log_buffer *lgb; 15226 #ifdef TCP_ACCOUNTING 15227 uint64_t crtsc; 15228 int cnt_thru = 1; 15229 #endif 15230 struct tcpopt to; 15231 u_char opt[TCP_MAXOLEN]; 15232 uint32_t hdrlen, optlen; 15233 int32_t slot, segsiz, max_val, tso = 0, error = 0, flags, ulen = 0; 15234 uint32_t us_cts; 15235 uint32_t if_hw_tsomaxsegcount = 0, startseq; 15236 uint32_t if_hw_tsomaxsegsize; 15237 15238 #ifdef INET6 15239 struct ip6_hdr *ip6 = NULL; 15240 15241 if (rack->r_is_v6) { 15242 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 15243 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 15244 } else 15245 #endif /* INET6 */ 15246 { 15247 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 15248 hdrlen = sizeof(struct tcpiphdr); 15249 } 15250 if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) { 15251 goto failed; 15252 } 15253 if (rsm->r_flags & RACK_TLP) 15254 doing_tlp = 1; 15255 else if (doing_tlp) 15256 rsm->r_flags |= RACK_TLP; 15257 startseq = rsm->r_start; 15258 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 15259 inp = rack->rc_inp; 15260 to.to_flags = 0; 15261 flags = tcp_outflags[tp->t_state]; 15262 if (flags & (TH_SYN|TH_RST)) { 15263 goto failed; 15264 } 15265 if (rsm->r_flags & RACK_HAS_FIN) { 15266 /* We can't send a FIN here */ 15267 goto failed; 15268 } 15269 if (flags & TH_FIN) { 15270 /* We never send a FIN */ 15271 flags &= ~TH_FIN; 15272 } 15273 if (tp->t_flags & TF_RCVD_TSTMP) { 15274 to.to_tsval = ms_cts + tp->ts_offset; 15275 to.to_tsecr = tp->ts_recent; 15276 to.to_flags = TOF_TS; 15277 } 15278 optlen = tcp_addoptions(&to, opt); 15279 hdrlen += optlen; 15280 udp = rack->r_ctl.fsb.udp; 15281 if (udp) 15282 hdrlen += sizeof(struct udphdr); 15283 if (rack->r_ctl.rc_pace_max_segs) 15284 max_val = rack->r_ctl.rc_pace_max_segs; 15285 else if (rack->rc_user_set_max_segs) 15286 max_val = rack->rc_user_set_max_segs * segsiz; 15287 else 15288 max_val = len; 15289 if ((tp->t_flags & TF_TSO) && 15290 V_tcp_do_tso && 15291 (len > segsiz) && 15292 (tp->t_port == 0)) 15293 tso = 1; 15294 #ifdef INET6 15295 if (MHLEN < hdrlen + max_linkhdr) 15296 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 15297 else 15298 #endif 15299 m = m_gethdr(M_NOWAIT, MT_DATA); 15300 if (m == NULL) 15301 goto failed; 15302 m->m_data += max_linkhdr; 15303 m->m_len = hdrlen; 15304 th = rack->r_ctl.fsb.th; 15305 /* Establish the len to send */ 15306 if (len > max_val) 15307 len = max_val; 15308 if ((tso) && (len + optlen > tp->t_maxseg)) { 15309 uint32_t if_hw_tsomax; 15310 int32_t max_len; 15311 15312 /* extract TSO information */ 15313 if_hw_tsomax = tp->t_tsomax; 15314 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 15315 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 15316 /* 15317 * Check if we should limit by maximum payload 15318 * length: 15319 */ 15320 if (if_hw_tsomax != 0) { 15321 /* compute maximum TSO length */ 15322 max_len = (if_hw_tsomax - hdrlen - 15323 max_linkhdr); 15324 if (max_len <= 0) { 15325 goto failed; 15326 } else if (len > max_len) { 15327 len = max_len; 15328 } 15329 } 15330 if (len <= segsiz) { 15331 /* 15332 * In case there are too many small fragments don't 15333 * use TSO: 15334 */ 15335 tso = 0; 15336 } 15337 } else { 15338 tso = 0; 15339 } 15340 if ((tso == 0) && (len > segsiz)) 15341 len = segsiz; 15342 us_cts = tcp_get_usecs(tv); 15343 if ((len == 0) || 15344 (len <= MHLEN - hdrlen - 
max_linkhdr)) { 15345 goto failed; 15346 } 15347 th->th_seq = htonl(rsm->r_start); 15348 th->th_ack = htonl(tp->rcv_nxt); 15349 /* 15350 * The PUSH bit should only be applied 15351 * if the full retransmission is made. If 15352 * we are sending less than this is the 15353 * left hand edge and should not have 15354 * the PUSH bit. 15355 */ 15356 if ((rsm->r_flags & RACK_HAD_PUSH) && 15357 (len == (rsm->r_end - rsm->r_start))) 15358 flags |= TH_PUSH; 15359 th->th_flags = flags; 15360 th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale)); 15361 if (th->th_win == 0) { 15362 tp->t_sndzerowin++; 15363 tp->t_flags |= TF_RXWIN0SENT; 15364 } else 15365 tp->t_flags &= ~TF_RXWIN0SENT; 15366 if (rsm->r_flags & RACK_TLP) { 15367 /* 15368 * TLP should not count in retran count, but 15369 * in its own bin 15370 */ 15371 counter_u64_add(rack_tlp_retran, 1); 15372 counter_u64_add(rack_tlp_retran_bytes, len); 15373 } else { 15374 tp->t_sndrexmitpack++; 15375 KMOD_TCPSTAT_INC(tcps_sndrexmitpack); 15376 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len); 15377 } 15378 #ifdef STATS 15379 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 15380 len); 15381 #endif 15382 if (rsm->m == NULL) 15383 goto failed; 15384 if (rsm->orig_m_len != rsm->m->m_len) { 15385 /* Fix up the orig_m_len and possibly the mbuf offset */ 15386 rack_adjust_orig_mlen(rsm); 15387 } 15388 m->m_next = rack_fo_base_copym(rsm->m, rsm->soff, &len, NULL, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, rsm->r_hw_tls); 15389 if (len <= segsiz) { 15390 /* 15391 * Must have ran out of mbufs for the copy 15392 * shorten it to no longer need tso. Lets 15393 * not put on sendalot since we are low on 15394 * mbufs. 15395 */ 15396 tso = 0; 15397 } 15398 if ((m->m_next == NULL) || (len <= 0)){ 15399 goto failed; 15400 } 15401 if (udp) { 15402 if (rack->r_is_v6) 15403 ulen = hdrlen + len - sizeof(struct ip6_hdr); 15404 else 15405 ulen = hdrlen + len - sizeof(struct ip); 15406 udp->uh_ulen = htons(ulen); 15407 } 15408 m->m_pkthdr.rcvif = (struct ifnet *)0; 15409 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 15410 #ifdef INET6 15411 if (rack->r_is_v6) { 15412 if (tp->t_port) { 15413 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 15414 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 15415 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 15416 th->th_sum = htons(0); 15417 UDPSTAT_INC(udps_opackets); 15418 } else { 15419 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 15420 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 15421 th->th_sum = in6_cksum_pseudo(ip6, 15422 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 15423 0); 15424 } 15425 } 15426 #endif 15427 #if defined(INET6) && defined(INET) 15428 else 15429 #endif 15430 #ifdef INET 15431 { 15432 if (tp->t_port) { 15433 m->m_pkthdr.csum_flags = CSUM_UDP; 15434 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 15435 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 15436 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 15437 th->th_sum = htons(0); 15438 UDPSTAT_INC(udps_opackets); 15439 } else { 15440 m->m_pkthdr.csum_flags = CSUM_TCP; 15441 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 15442 th->th_sum = in_pseudo(ip->ip_src.s_addr, 15443 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 15444 IPPROTO_TCP + len + optlen)); 15445 } 15446 /* IP version must be set here for ipv4/ipv6 checking later */ 15447 KASSERT(ip->ip_v == IPVERSION, 15448 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 15449 } 15450 #endif 15451 if (tso) { 15452 KASSERT(len > tp->t_maxseg 
- optlen, 15453 ("%s: len <= tso_segsz tp:%p", __func__, tp)); 15454 m->m_pkthdr.csum_flags |= CSUM_TSO; 15455 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; 15456 } 15457 #ifdef INET6 15458 if (rack->r_is_v6) { 15459 ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit; 15460 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 15461 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 15462 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 15463 else 15464 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 15465 } 15466 #endif 15467 #if defined(INET) && defined(INET6) 15468 else 15469 #endif 15470 #ifdef INET 15471 { 15472 ip->ip_len = htons(m->m_pkthdr.len); 15473 ip->ip_ttl = rack->r_ctl.fsb.hoplimit; 15474 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 15475 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 15476 if (tp->t_port == 0 || len < V_tcp_minmss) { 15477 ip->ip_off |= htons(IP_DF); 15478 } 15479 } else { 15480 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 15481 } 15482 } 15483 #endif 15484 /* Time to copy in our header */ 15485 cpto = mtod(m, uint8_t *); 15486 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); 15487 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr)); 15488 if (optlen) { 15489 bcopy(opt, th + 1, optlen); 15490 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 15491 } else { 15492 th->th_off = sizeof(struct tcphdr) >> 2; 15493 } 15494 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 15495 union tcp_log_stackspecific log; 15496 15497 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 15498 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 15499 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 15500 if (rack->rack_no_prr) 15501 log.u_bbr.flex1 = 0; 15502 else 15503 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 15504 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 15505 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 15506 log.u_bbr.flex4 = max_val; 15507 log.u_bbr.flex5 = 0; 15508 /* Save off the early/late values */ 15509 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 15510 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 15511 log.u_bbr.bw_inuse = rack_get_bw(rack); 15512 if (doing_tlp == 0) 15513 log.u_bbr.flex8 = 1; 15514 else 15515 log.u_bbr.flex8 = 2; 15516 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); 15517 log.u_bbr.flex7 = 55; 15518 log.u_bbr.pkts_out = tp->t_maxseg; 15519 log.u_bbr.timeStamp = cts; 15520 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 15521 log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use; 15522 log.u_bbr.delivered = 0; 15523 lgb = tcp_log_event_(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, 15524 len, &log, false, NULL, NULL, 0, tv); 15525 } else 15526 lgb = NULL; 15527 #ifdef INET6 15528 if (rack->r_is_v6) { 15529 error = ip6_output(m, NULL, 15530 &inp->inp_route6, 15531 0, NULL, NULL, inp); 15532 } 15533 #endif 15534 #if defined(INET) && defined(INET6) 15535 else 15536 #endif 15537 #ifdef INET 15538 { 15539 error = ip_output(m, NULL, 15540 &inp->inp_route, 15541 0, 0, inp); 15542 } 15543 #endif 15544 m = NULL; 15545 if (lgb) { 15546 lgb->tlb_errno = error; 15547 lgb = NULL; 15548 } 15549 if (error) { 15550 goto failed; 15551 } 15552 rack_log_output(tp, &to, len, rsm->r_start, flags, error, rack_to_usec_ts(tv), 15553 rsm, RACK_SENT_FP, rsm->m, rsm->soff, rsm->r_hw_tls); 15554 if (doing_tlp && (rack->fast_rsm_hack == 0)) { 15555 rack->rc_tlp_in_progress = 1; 15556 rack->r_ctl.rc_tlp_cnt_out++; 15557 } 15558 if (error == 0) 15559 tcp_account_for_send(tp, len, 1, doing_tlp, rsm->r_hw_tls); 15560 tp->t_flags &= 
~(TF_ACKNOW | TF_DELACK);
	rack->forced_ack = 0;	/* If we send something zap the FA flag */
	if (IN_FASTRECOVERY(tp->t_flags) && rsm)
		rack->r_ctl.retran_during_recovery += len;
	{
		int idx;

		idx = (len / segsiz) + 3;
		if (idx >= TCP_MSS_ACCT_ATIMER)
			counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1);
		else
			counter_u64_add(rack_out_size[idx], 1);
	}
	if (tp->t_rtttime == 0) {
		tp->t_rtttime = ticks;
		tp->t_rtseq = startseq;
		KMOD_TCPSTAT_INC(tcps_segstimed);
	}
	counter_u64_add(rack_fto_rsm_send, 1);
	if (error && (error == ENOBUFS)) {
		slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC);
		if (rack->rc_enobuf < 0x7f)
			rack->rc_enobuf++;
		if (slot < (10 * HPTS_USEC_IN_MSEC))
			slot = 10 * HPTS_USEC_IN_MSEC;
	} else
		slot = rack_get_pacing_delay(rack, tp, len, NULL, segsiz);
	if ((slot == 0) ||
	    (rack->rc_always_pace == 0) ||
	    (rack->r_rr_config == 1)) {
		/*
		 * We have no pacing set or we
		 * are using old-style rack or
		 * we are overridden to use the old 1ms pacing.
		 */
		slot = rack->r_ctl.rc_min_to;
	}
	rack_start_hpts_timer(rack, tp, cts, slot, len, 0);
	if (rack->r_must_retran) {
		rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start);
		if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) {
			/*
			 * We have retransmitted all we need.
			 */
			rack->r_must_retran = 0;
			rack->r_ctl.rc_out_at_rto = 0;
		}
	}
#ifdef TCP_ACCOUNTING
	crtsc = get_cyclecount();
	if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
		tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru;
	}
	counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], cnt_thru);
	if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
		tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val);
	}
	counter_u64_add(tcp_proc_time[SND_OUT_DATA], (crtsc - ts_val));
	if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
		tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((len + segsiz - 1) / segsiz);
	}
	counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((len + segsiz - 1) / segsiz));
	sched_unpin();
#endif
	return (0);
failed:
	if (m)
		m_free(m);
	return (-1);
}

static void
rack_sndbuf_autoscale(struct tcp_rack *rack)
{
	/*
	 * Automatic sizing of send socket buffer. Often the send buffer
	 * size is not optimally adjusted to the actual network conditions
	 * at hand (delay bandwidth product). Setting the buffer size too
	 * small limits throughput on links with high bandwidth and high
	 * delay (e.g. trans-continental/oceanic links). Setting the
	 * buffer size too big consumes too much real kernel memory,
	 * especially with many connections on busy servers.
	 *
	 * The criteria to step up the send buffer one notch are:
	 * 1. receive window of remote host is larger than send buffer
	 *    (with a fudge factor of 5/4th);
	 * 2. send buffer is filled to 7/8th with data (so we actually
	 *    have data to make use of it);
	 * 3. send buffer fill has not hit maximal automatic size;
	 * 4. our send window (slow start and congestion controlled) is
	 *    larger than sent but unacknowledged data in send buffer.
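	 *
	 * When all of the above hold, the new reservation below is the
	 * current sb_hiwat plus rack_autosndbuf_inc percent of it (never
	 * less than V_tcp_autosndbuf_inc), capped at V_tcp_autosndbuf_max.
	 * For example, with a hypothetical 64KB sb_hiwat and a 25 percent
	 * rack_autosndbuf_inc the buffer would be re-reserved at roughly
	 * 80KB, assuming that still fits under the autosndbuf maximum.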
	 *
	 * Note that the rack version moves things much faster since
	 * we want to avoid hitting cache lines in the rack_fast_output()
	 * path so this is called much less often and thus moves
	 * the SB forward by a percentage.
	 */
	struct socket *so;
	struct tcpcb *tp;
	uint32_t sendwin, scaleup;

	tp = rack->rc_tp;
	so = rack->rc_inp->inp_socket;
	sendwin = min(rack->r_ctl.cwnd_to_use, tp->snd_wnd);
	if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
		if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
		    sbused(&so->so_snd) >=
		    (so->so_snd.sb_hiwat / 8 * 7) &&
		    sbused(&so->so_snd) < V_tcp_autosndbuf_max &&
		    sendwin >= (sbused(&so->so_snd) -
		    (tp->snd_nxt - tp->snd_una))) {
			if (rack_autosndbuf_inc)
				scaleup = (rack_autosndbuf_inc * so->so_snd.sb_hiwat) / 100;
			else
				scaleup = V_tcp_autosndbuf_inc;
			if (scaleup < V_tcp_autosndbuf_inc)
				scaleup = V_tcp_autosndbuf_inc;
			scaleup += so->so_snd.sb_hiwat;
			if (scaleup > V_tcp_autosndbuf_max)
				scaleup = V_tcp_autosndbuf_max;
			if (!sbreserve_locked(&so->so_snd, scaleup, so, curthread))
				so->so_snd.sb_flags &= ~SB_AUTOSIZE;
		}
	}
}

static int
rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val,
    uint32_t cts, uint32_t ms_cts, struct timeval *tv, long tot_len, int *send_err)
{
	/*
	 * Enter to do fast output. We are given that the sched_pin is
	 * in place (if accounting is compiled in) and the cycle count taken
	 * at entry is in place in ts_val. The idea here is that
	 * we know how many more bytes need to be sent (presumably either
	 * during pacing or to fill the cwnd, and that was greater than
	 * the max-burst). We have how much to send and all the info we
	 * need to just send.
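	 * When TSO is not in use, the code below loops (via the again
	 * label) emitting segsiz-sized segments until left_to_send, the
	 * pacing allowance (max_val), or the mbuf supply is exhausted.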
15698 */ 15699 struct ip *ip = NULL; 15700 struct udphdr *udp = NULL; 15701 struct tcphdr *th = NULL; 15702 struct mbuf *m, *s_mb; 15703 struct inpcb *inp; 15704 uint8_t *cpto; 15705 struct tcp_log_buffer *lgb; 15706 #ifdef TCP_ACCOUNTING 15707 uint64_t crtsc; 15708 #endif 15709 struct tcpopt to; 15710 u_char opt[TCP_MAXOLEN]; 15711 uint32_t hdrlen, optlen; 15712 int cnt_thru = 1; 15713 int32_t slot, segsiz, len, max_val, tso = 0, sb_offset, error = 0, flags, ulen = 0; 15714 uint32_t us_cts, s_soff; 15715 uint32_t if_hw_tsomaxsegcount = 0, startseq; 15716 uint32_t if_hw_tsomaxsegsize; 15717 uint16_t add_flag = RACK_SENT_FP; 15718 #ifdef INET6 15719 struct ip6_hdr *ip6 = NULL; 15720 15721 if (rack->r_is_v6) { 15722 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 15723 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 15724 } else 15725 #endif /* INET6 */ 15726 { 15727 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 15728 hdrlen = sizeof(struct tcpiphdr); 15729 } 15730 if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) { 15731 m = NULL; 15732 goto failed; 15733 } 15734 startseq = tp->snd_max; 15735 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 15736 inp = rack->rc_inp; 15737 len = rack->r_ctl.fsb.left_to_send; 15738 to.to_flags = 0; 15739 flags = rack->r_ctl.fsb.tcp_flags; 15740 if (tp->t_flags & TF_RCVD_TSTMP) { 15741 to.to_tsval = ms_cts + tp->ts_offset; 15742 to.to_tsecr = tp->ts_recent; 15743 to.to_flags = TOF_TS; 15744 } 15745 optlen = tcp_addoptions(&to, opt); 15746 hdrlen += optlen; 15747 udp = rack->r_ctl.fsb.udp; 15748 if (udp) 15749 hdrlen += sizeof(struct udphdr); 15750 if (rack->r_ctl.rc_pace_max_segs) 15751 max_val = rack->r_ctl.rc_pace_max_segs; 15752 else if (rack->rc_user_set_max_segs) 15753 max_val = rack->rc_user_set_max_segs * segsiz; 15754 else 15755 max_val = len; 15756 if ((tp->t_flags & TF_TSO) && 15757 V_tcp_do_tso && 15758 (len > segsiz) && 15759 (tp->t_port == 0)) 15760 tso = 1; 15761 again: 15762 #ifdef INET6 15763 if (MHLEN < hdrlen + max_linkhdr) 15764 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 15765 else 15766 #endif 15767 m = m_gethdr(M_NOWAIT, MT_DATA); 15768 if (m == NULL) 15769 goto failed; 15770 m->m_data += max_linkhdr; 15771 m->m_len = hdrlen; 15772 th = rack->r_ctl.fsb.th; 15773 /* Establish the len to send */ 15774 if (len > max_val) 15775 len = max_val; 15776 if ((tso) && (len + optlen > tp->t_maxseg)) { 15777 uint32_t if_hw_tsomax; 15778 int32_t max_len; 15779 15780 /* extract TSO information */ 15781 if_hw_tsomax = tp->t_tsomax; 15782 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 15783 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 15784 /* 15785 * Check if we should limit by maximum payload 15786 * length: 15787 */ 15788 if (if_hw_tsomax != 0) { 15789 /* compute maximum TSO length */ 15790 max_len = (if_hw_tsomax - hdrlen - 15791 max_linkhdr); 15792 if (max_len <= 0) { 15793 goto failed; 15794 } else if (len > max_len) { 15795 len = max_len; 15796 } 15797 } 15798 if (len <= segsiz) { 15799 /* 15800 * In case there are too many small fragments don't 15801 * use TSO: 15802 */ 15803 tso = 0; 15804 } 15805 } else { 15806 tso = 0; 15807 } 15808 if ((tso == 0) && (len > segsiz)) 15809 len = segsiz; 15810 us_cts = tcp_get_usecs(tv); 15811 if ((len == 0) || 15812 (len <= MHLEN - hdrlen - max_linkhdr)) { 15813 goto failed; 15814 } 15815 sb_offset = tp->snd_max - tp->snd_una; 15816 th->th_seq = htonl(tp->snd_max); 15817 th->th_ack = htonl(tp->rcv_nxt); 15818 th->th_flags = flags; 15819 th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin 
	    >> tp->rcv_scale));
	if (th->th_win == 0) {
		tp->t_sndzerowin++;
		tp->t_flags |= TF_RXWIN0SENT;
	} else
		tp->t_flags &= ~TF_RXWIN0SENT;
	tp->snd_up = tp->snd_una;	/* drag it along, it's deprecated */
	KMOD_TCPSTAT_INC(tcps_sndpack);
	KMOD_TCPSTAT_ADD(tcps_sndbyte, len);
#ifdef STATS
	stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
	    len);
#endif
	if (rack->r_ctl.fsb.m == NULL)
		goto failed;

	/* s_mb and s_soff are saved for rack_log_output */
	m->m_next = rack_fo_m_copym(rack, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize,
	    &s_mb, &s_soff);
	if (len <= segsiz) {
		/*
		 * Must have run out of mbufs for the copy;
		 * shorten it so we no longer need TSO. Let's
		 * not set sendalot since we are low on
		 * mbufs.
		 */
		tso = 0;
	}
	if (rack->r_ctl.fsb.rfo_apply_push &&
	    (len == rack->r_ctl.fsb.left_to_send)) {
		th->th_flags |= TH_PUSH;
		add_flag |= RACK_HAD_PUSH;
	}
	if ((m->m_next == NULL) || (len <= 0)) {
		goto failed;
	}
	if (udp) {
		if (rack->r_is_v6)
			ulen = hdrlen + len - sizeof(struct ip6_hdr);
		else
			ulen = hdrlen + len - sizeof(struct ip);
		udp->uh_ulen = htons(ulen);
	}
	m->m_pkthdr.rcvif = (struct ifnet *)0;
	if (tp->t_state == TCPS_ESTABLISHED &&
	    (tp->t_flags2 & TF2_ECN_PERMIT)) {
		/*
		 * If the peer has ECN, mark data packets with ECN capable
		 * transmission (ECT). Ignore pure ack packets,
		 * retransmissions.
		 */
		if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max)) {
#ifdef INET6
			if (rack->r_is_v6)
				ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
			else
#endif
				ip->ip_tos |= IPTOS_ECN_ECT0;
			KMOD_TCPSTAT_INC(tcps_ecn_ect0);
			/*
			 * Reply with proper ECN notifications.
			 * Only set CWR on new data segments.
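			 * TH_ECE is likewise folded into flags below whenever
			 * TF2_ECN_SND_ECE is pending.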
15881 */ 15882 if (tp->t_flags2 & TF2_ECN_SND_CWR) { 15883 flags |= TH_CWR; 15884 tp->t_flags2 &= ~TF2_ECN_SND_CWR; 15885 } 15886 } 15887 if (tp->t_flags2 & TF2_ECN_SND_ECE) 15888 flags |= TH_ECE; 15889 } 15890 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 15891 #ifdef INET6 15892 if (rack->r_is_v6) { 15893 if (tp->t_port) { 15894 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 15895 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 15896 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 15897 th->th_sum = htons(0); 15898 UDPSTAT_INC(udps_opackets); 15899 } else { 15900 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 15901 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 15902 th->th_sum = in6_cksum_pseudo(ip6, 15903 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 15904 0); 15905 } 15906 } 15907 #endif 15908 #if defined(INET6) && defined(INET) 15909 else 15910 #endif 15911 #ifdef INET 15912 { 15913 if (tp->t_port) { 15914 m->m_pkthdr.csum_flags = CSUM_UDP; 15915 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 15916 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 15917 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 15918 th->th_sum = htons(0); 15919 UDPSTAT_INC(udps_opackets); 15920 } else { 15921 m->m_pkthdr.csum_flags = CSUM_TCP; 15922 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 15923 th->th_sum = in_pseudo(ip->ip_src.s_addr, 15924 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 15925 IPPROTO_TCP + len + optlen)); 15926 } 15927 /* IP version must be set here for ipv4/ipv6 checking later */ 15928 KASSERT(ip->ip_v == IPVERSION, 15929 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 15930 } 15931 #endif 15932 if (tso) { 15933 KASSERT(len > tp->t_maxseg - optlen, 15934 ("%s: len <= tso_segsz tp:%p", __func__, tp)); 15935 m->m_pkthdr.csum_flags |= CSUM_TSO; 15936 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; 15937 } 15938 #ifdef INET6 15939 if (rack->r_is_v6) { 15940 ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit; 15941 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 15942 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 15943 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 15944 else 15945 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 15946 } 15947 #endif 15948 #if defined(INET) && defined(INET6) 15949 else 15950 #endif 15951 #ifdef INET 15952 { 15953 ip->ip_len = htons(m->m_pkthdr.len); 15954 ip->ip_ttl = rack->r_ctl.fsb.hoplimit; 15955 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 15956 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 15957 if (tp->t_port == 0 || len < V_tcp_minmss) { 15958 ip->ip_off |= htons(IP_DF); 15959 } 15960 } else { 15961 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 15962 } 15963 } 15964 #endif 15965 /* Time to copy in our header */ 15966 cpto = mtod(m, uint8_t *); 15967 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); 15968 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr)); 15969 if (optlen) { 15970 bcopy(opt, th + 1, optlen); 15971 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 15972 } else { 15973 th->th_off = sizeof(struct tcphdr) >> 2; 15974 } 15975 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 15976 union tcp_log_stackspecific log; 15977 15978 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 15979 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 15980 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 15981 if (rack->rack_no_prr) 15982 log.u_bbr.flex1 = 0; 15983 else 15984 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 15985 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 15986 
log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 15987 log.u_bbr.flex4 = max_val; 15988 log.u_bbr.flex5 = 0; 15989 /* Save off the early/late values */ 15990 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 15991 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 15992 log.u_bbr.bw_inuse = rack_get_bw(rack); 15993 log.u_bbr.flex8 = 0; 15994 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); 15995 log.u_bbr.flex7 = 44; 15996 log.u_bbr.pkts_out = tp->t_maxseg; 15997 log.u_bbr.timeStamp = cts; 15998 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 15999 log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use; 16000 log.u_bbr.delivered = 0; 16001 lgb = tcp_log_event_(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, 16002 len, &log, false, NULL, NULL, 0, tv); 16003 } else 16004 lgb = NULL; 16005 #ifdef INET6 16006 if (rack->r_is_v6) { 16007 error = ip6_output(m, NULL, 16008 &inp->inp_route6, 16009 0, NULL, NULL, inp); 16010 } 16011 #endif 16012 #if defined(INET) && defined(INET6) 16013 else 16014 #endif 16015 #ifdef INET 16016 { 16017 error = ip_output(m, NULL, 16018 &inp->inp_route, 16019 0, 0, inp); 16020 } 16021 #endif 16022 if (lgb) { 16023 lgb->tlb_errno = error; 16024 lgb = NULL; 16025 } 16026 if (error) { 16027 *send_err = error; 16028 m = NULL; 16029 goto failed; 16030 } 16031 rack_log_output(tp, &to, len, tp->snd_max, flags, error, rack_to_usec_ts(tv), 16032 NULL, add_flag, s_mb, s_soff, rack->r_ctl.fsb.hw_tls); 16033 m = NULL; 16034 if (tp->snd_una == tp->snd_max) { 16035 rack->r_ctl.rc_tlp_rxt_last_time = cts; 16036 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 16037 tp->t_acktime = ticks; 16038 } 16039 if (error == 0) 16040 tcp_account_for_send(tp, len, 0, 0, rack->r_ctl.fsb.hw_tls); 16041 16042 rack->forced_ack = 0; /* If we send something zap the FA flag */ 16043 tot_len += len; 16044 if ((tp->t_flags & TF_GPUTINPROG) == 0) 16045 rack_start_gp_measurement(tp, rack, tp->snd_max, sb_offset); 16046 tp->snd_max += len; 16047 tp->snd_nxt = tp->snd_max; 16048 { 16049 int idx; 16050 16051 idx = (len / segsiz) + 3; 16052 if (idx >= TCP_MSS_ACCT_ATIMER) 16053 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 16054 else 16055 counter_u64_add(rack_out_size[idx], 1); 16056 } 16057 if (len <= rack->r_ctl.fsb.left_to_send) 16058 rack->r_ctl.fsb.left_to_send -= len; 16059 else 16060 rack->r_ctl.fsb.left_to_send = 0; 16061 if (rack->r_ctl.fsb.left_to_send < segsiz) { 16062 rack->r_fast_output = 0; 16063 rack->r_ctl.fsb.left_to_send = 0; 16064 /* At the end of fast_output scale up the sb */ 16065 SOCKBUF_LOCK(&rack->rc_inp->inp_socket->so_snd); 16066 rack_sndbuf_autoscale(rack); 16067 SOCKBUF_UNLOCK(&rack->rc_inp->inp_socket->so_snd); 16068 } 16069 if (tp->t_rtttime == 0) { 16070 tp->t_rtttime = ticks; 16071 tp->t_rtseq = startseq; 16072 KMOD_TCPSTAT_INC(tcps_segstimed); 16073 } 16074 if ((rack->r_ctl.fsb.left_to_send >= segsiz) && 16075 (max_val > len) && 16076 (tso == 0)) { 16077 max_val -= len; 16078 len = segsiz; 16079 th = rack->r_ctl.fsb.th; 16080 cnt_thru++; 16081 goto again; 16082 } 16083 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 16084 counter_u64_add(rack_fto_send, 1); 16085 slot = rack_get_pacing_delay(rack, tp, tot_len, NULL, segsiz); 16086 rack_start_hpts_timer(rack, tp, cts, slot, tot_len, 0); 16087 #ifdef TCP_ACCOUNTING 16088 crtsc = get_cyclecount(); 16089 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16090 tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru; 16091 } 16092 counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], cnt_thru); 16093 if 
(tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16094 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); 16095 } 16096 counter_u64_add(tcp_proc_time[SND_OUT_DATA], (crtsc - ts_val)); 16097 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16098 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len + segsiz - 1) / segsiz); 16099 } 16100 counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((tot_len + segsiz - 1) / segsiz)); 16101 sched_unpin(); 16102 #endif 16103 return (0); 16104 failed: 16105 if (m) 16106 m_free(m); 16107 rack->r_fast_output = 0; 16108 return (-1); 16109 } 16110 16111 static int 16112 rack_output(struct tcpcb *tp) 16113 { 16114 struct socket *so; 16115 uint32_t recwin; 16116 uint32_t sb_offset, s_moff = 0; 16117 int32_t len, flags, error = 0; 16118 struct mbuf *m, *s_mb = NULL; 16119 struct mbuf *mb; 16120 uint32_t if_hw_tsomaxsegcount = 0; 16121 uint32_t if_hw_tsomaxsegsize; 16122 int32_t segsiz, minseg; 16123 long tot_len_this_send = 0; 16124 #ifdef INET 16125 struct ip *ip = NULL; 16126 #ifdef TCPDEBUG 16127 struct ipovly *ipov = NULL; 16128 #endif 16129 #endif 16130 struct udphdr *udp = NULL; 16131 struct tcp_rack *rack; 16132 struct tcphdr *th; 16133 uint8_t pass = 0; 16134 uint8_t mark = 0; 16135 uint8_t wanted_cookie = 0; 16136 u_char opt[TCP_MAXOLEN]; 16137 unsigned ipoptlen, optlen, hdrlen; 16138 #if defined(INET) || defined(INET6) 16139 unsigned ulen=0; 16140 #endif 16141 uint32_t rack_seq; 16142 16143 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 16144 unsigned ipsec_optlen = 0; 16145 16146 #endif 16147 int32_t idle, sendalot; 16148 int32_t sub_from_prr = 0; 16149 volatile int32_t sack_rxmit; 16150 struct rack_sendmap *rsm = NULL; 16151 int32_t tso, mtu; 16152 struct tcpopt to; 16153 int32_t slot = 0; 16154 int32_t sup_rack = 0; 16155 uint32_t cts, ms_cts, delayed, early; 16156 uint16_t add_flag = RACK_SENT_SP; 16157 /* The doing_tlp flag will be set by the actual rack_timeout_tlp() */ 16158 uint8_t hpts_calling, doing_tlp = 0; 16159 uint32_t cwnd_to_use, pace_max_seg; 16160 int32_t do_a_prefetch = 0; 16161 int32_t prefetch_rsm = 0; 16162 int32_t orig_len = 0; 16163 struct timeval tv; 16164 int32_t prefetch_so_done = 0; 16165 struct tcp_log_buffer *lgb; 16166 struct inpcb *inp; 16167 struct sockbuf *sb; 16168 uint64_t ts_val = 0; 16169 #ifdef TCP_ACCOUNTING 16170 uint64_t crtsc; 16171 #endif 16172 #ifdef INET6 16173 struct ip6_hdr *ip6 = NULL; 16174 int32_t isipv6; 16175 #endif 16176 uint8_t filled_all = 0; 16177 bool hw_tls = false; 16178 16179 /* setup and take the cache hits here */ 16180 rack = (struct tcp_rack *)tp->t_fb_ptr; 16181 #ifdef TCP_ACCOUNTING 16182 sched_pin(); 16183 ts_val = get_cyclecount(); 16184 #endif 16185 hpts_calling = rack->rc_inp->inp_hpts_calls; 16186 NET_EPOCH_ASSERT(); 16187 INP_WLOCK_ASSERT(rack->rc_inp); 16188 #ifdef TCP_OFFLOAD 16189 if (tp->t_flags & TF_TOE) { 16190 #ifdef TCP_ACCOUNTING 16191 sched_unpin(); 16192 #endif 16193 return (tcp_offload_output(tp)); 16194 } 16195 #endif 16196 /* 16197 * For TFO connections in SYN_RECEIVED, only allow the initial 16198 * SYN|ACK and those sent by the retransmit timer. 
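	 * Anything else is suppressed: the check below returns without
	 * sending while the SYN|ACK is still unacknowledged and no
	 * retransmit (rc_resend) is queued.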
16199 */ 16200 if (IS_FASTOPEN(tp->t_flags) && 16201 (tp->t_state == TCPS_SYN_RECEIVED) && 16202 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */ 16203 (rack->r_ctl.rc_resend == NULL)) { /* not a retransmit */ 16204 #ifdef TCP_ACCOUNTING 16205 sched_unpin(); 16206 #endif 16207 return (0); 16208 } 16209 #ifdef INET6 16210 if (rack->r_state) { 16211 /* Use the cache line loaded if possible */ 16212 isipv6 = rack->r_is_v6; 16213 } else { 16214 isipv6 = (rack->rc_inp->inp_vflag & INP_IPV6) != 0; 16215 } 16216 #endif 16217 early = 0; 16218 cts = tcp_get_usecs(&tv); 16219 ms_cts = tcp_tv_to_mssectick(&tv); 16220 if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && 16221 rack->rc_inp->inp_in_hpts) { 16222 /* 16223 * We are on the hpts for some timer but not hptsi output. 16224 * Remove from the hpts unconditionally. 16225 */ 16226 rack_timer_cancel(tp, rack, cts, __LINE__); 16227 } 16228 /* Are we pacing and late? */ 16229 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 16230 TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) { 16231 /* We are delayed */ 16232 delayed = cts - rack->r_ctl.rc_last_output_to; 16233 } else { 16234 delayed = 0; 16235 } 16236 /* Do the timers, which may override the pacer */ 16237 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 16238 if (rack_process_timers(tp, rack, cts, hpts_calling, &doing_tlp)) { 16239 counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); 16240 #ifdef TCP_ACCOUNTING 16241 sched_unpin(); 16242 #endif 16243 return (0); 16244 } 16245 } 16246 if (rack->rc_in_persist) { 16247 if (rack->rc_inp->inp_in_hpts == 0) { 16248 /* Timer is not running */ 16249 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 16250 } 16251 #ifdef TCP_ACCOUNTING 16252 sched_unpin(); 16253 #endif 16254 return (0); 16255 } 16256 if ((rack->r_timer_override) || 16257 (rack->rc_ack_can_sendout_data) || 16258 (delayed) || 16259 (tp->t_state < TCPS_ESTABLISHED)) { 16260 rack->rc_ack_can_sendout_data = 0; 16261 if (rack->rc_inp->inp_in_hpts) 16262 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 16263 } else if (rack->rc_inp->inp_in_hpts) { 16264 /* 16265 * On the hpts you can't pass even if ACKNOW is on, we will 16266 * when the hpts fires. 
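		 * The early return below still charges the cycles spent here
		 * to the SND_BLOCKED accounting bucket when TCP_ACCOUNTING is
		 * compiled in.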
16267 */ 16268 #ifdef TCP_ACCOUNTING 16269 crtsc = get_cyclecount(); 16270 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16271 tp->tcp_proc_time[SND_BLOCKED] += (crtsc - ts_val); 16272 } 16273 counter_u64_add(tcp_proc_time[SND_BLOCKED], (crtsc - ts_val)); 16274 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16275 tp->tcp_cnt_counters[SND_BLOCKED]++; 16276 } 16277 counter_u64_add(tcp_cnt_counters[SND_BLOCKED], 1); 16278 sched_unpin(); 16279 #endif 16280 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1); 16281 return (0); 16282 } 16283 rack->rc_inp->inp_hpts_calls = 0; 16284 /* Finish out both pacing early and late accounting */ 16285 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 16286 TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) { 16287 early = rack->r_ctl.rc_last_output_to - cts; 16288 } else 16289 early = 0; 16290 if (delayed) { 16291 rack->r_ctl.rc_agg_delayed += delayed; 16292 rack->r_late = 1; 16293 } else if (early) { 16294 rack->r_ctl.rc_agg_early += early; 16295 rack->r_early = 1; 16296 } 16297 /* Now that early/late accounting is done turn off the flag */ 16298 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 16299 rack->r_wanted_output = 0; 16300 rack->r_timer_override = 0; 16301 if ((tp->t_state != rack->r_state) && 16302 TCPS_HAVEESTABLISHED(tp->t_state)) { 16303 rack_set_state(tp, rack); 16304 } 16305 if ((rack->r_fast_output) && 16306 (doing_tlp == 0) && 16307 (tp->rcv_numsacks == 0)) { 16308 int ret; 16309 16310 error = 0; 16311 ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error); 16312 if (ret >= 0) 16313 return(ret); 16314 else if (error) { 16315 inp = rack->rc_inp; 16316 so = inp->inp_socket; 16317 sb = &so->so_snd; 16318 goto nomore; 16319 } 16320 } 16321 inp = rack->rc_inp; 16322 /* 16323 * For TFO connections in SYN_SENT or SYN_RECEIVED, 16324 * only allow the initial SYN or SYN|ACK and those sent 16325 * by the retransmit timer. 16326 */ 16327 if (IS_FASTOPEN(tp->t_flags) && 16328 ((tp->t_state == TCPS_SYN_RECEIVED) || 16329 (tp->t_state == TCPS_SYN_SENT)) && 16330 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ 16331 (tp->t_rxtshift == 0)) { /* not a retransmit */ 16332 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 16333 so = inp->inp_socket; 16334 sb = &so->so_snd; 16335 goto just_return_nolock; 16336 } 16337 /* 16338 * Determine length of data that should be transmitted, and flags 16339 * that will be used. If there is some data or critical controls 16340 * (SYN, RST) to send, then transmit; otherwise, investigate 16341 * further. 
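	 * The first step below is the after-idle handling: a sufficiently
	 * long quiet period can restart the congestion window
	 * (rack_cc_after_idle) and is credited toward the probe-RTT
	 * bookkeeping.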
16342 */ 16343 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); 16344 if (tp->t_idle_reduce) { 16345 if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) 16346 rack_cc_after_idle(rack, tp); 16347 } 16348 tp->t_flags &= ~TF_LASTIDLE; 16349 if (idle) { 16350 if (tp->t_flags & TF_MORETOCOME) { 16351 tp->t_flags |= TF_LASTIDLE; 16352 idle = 0; 16353 } 16354 } 16355 if ((tp->snd_una == tp->snd_max) && 16356 rack->r_ctl.rc_went_idle_time && 16357 TSTMP_GT(cts, rack->r_ctl.rc_went_idle_time)) { 16358 idle = cts - rack->r_ctl.rc_went_idle_time; 16359 if (idle > rack_min_probertt_hold) { 16360 /* Count as a probe rtt */ 16361 if (rack->in_probe_rtt == 0) { 16362 rack->r_ctl.rc_lower_rtt_us_cts = cts; 16363 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; 16364 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; 16365 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; 16366 } else { 16367 rack_exit_probertt(rack, cts); 16368 } 16369 } 16370 idle = 0; 16371 } 16372 if (rack_use_fsb && (rack->r_fsb_inited == 0) && (rack->r_state != TCPS_CLOSED)) 16373 rack_init_fsb_block(tp, rack); 16374 again: 16375 /* 16376 * If we've recently taken a timeout, snd_max will be greater than 16377 * snd_nxt. There may be SACK information that allows us to avoid 16378 * resending already delivered data. Adjust snd_nxt accordingly. 16379 */ 16380 sendalot = 0; 16381 cts = tcp_get_usecs(&tv); 16382 ms_cts = tcp_tv_to_mssectick(&tv); 16383 tso = 0; 16384 mtu = 0; 16385 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 16386 minseg = segsiz; 16387 if (rack->r_ctl.rc_pace_max_segs == 0) 16388 pace_max_seg = rack->rc_user_set_max_segs * segsiz; 16389 else 16390 pace_max_seg = rack->r_ctl.rc_pace_max_segs; 16391 sb_offset = tp->snd_max - tp->snd_una; 16392 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 16393 flags = tcp_outflags[tp->t_state]; 16394 while (rack->rc_free_cnt < rack_free_cache) { 16395 rsm = rack_alloc(rack); 16396 if (rsm == NULL) { 16397 if (inp->inp_hpts_calls) 16398 /* Retry in a ms */ 16399 slot = (1 * HPTS_USEC_IN_MSEC); 16400 so = inp->inp_socket; 16401 sb = &so->so_snd; 16402 goto just_return_nolock; 16403 } 16404 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext); 16405 rack->rc_free_cnt++; 16406 rsm = NULL; 16407 } 16408 if (inp->inp_hpts_calls) 16409 inp->inp_hpts_calls = 0; 16410 sack_rxmit = 0; 16411 len = 0; 16412 rsm = NULL; 16413 if (flags & TH_RST) { 16414 SOCKBUF_LOCK(&inp->inp_socket->so_snd); 16415 so = inp->inp_socket; 16416 sb = &so->so_snd; 16417 goto send; 16418 } 16419 if (rack->r_ctl.rc_resend) { 16420 /* Retransmit timer */ 16421 rsm = rack->r_ctl.rc_resend; 16422 rack->r_ctl.rc_resend = NULL; 16423 rsm->r_flags &= ~RACK_TLP; 16424 len = rsm->r_end - rsm->r_start; 16425 sack_rxmit = 1; 16426 sendalot = 0; 16427 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 16428 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 16429 __func__, __LINE__, 16430 rsm->r_start, tp->snd_una, tp, rack, rsm)); 16431 sb_offset = rsm->r_start - tp->snd_una; 16432 if (len >= segsiz) 16433 len = segsiz; 16434 } else if ((rsm = tcp_rack_output(tp, rack, cts)) != NULL) { 16435 /* We have a retransmit that takes precedence */ 16436 rsm->r_flags &= ~RACK_TLP; 16437 if ((!IN_FASTRECOVERY(tp->t_flags)) && 16438 ((tp->t_flags & TF_WASFRECOVERY) == 0)) { 16439 /* Enter recovery if not induced by a time-out */ 16440 rack->r_ctl.rc_rsm_start = rsm->r_start; 16441 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 16442 
rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 16443 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una); 16444 } 16445 #ifdef INVARIANTS 16446 if (SEQ_LT(rsm->r_start, tp->snd_una)) { 16447 panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n", 16448 tp, rack, rsm, rsm->r_start, tp->snd_una); 16449 } 16450 #endif 16451 len = rsm->r_end - rsm->r_start; 16452 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 16453 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 16454 __func__, __LINE__, 16455 rsm->r_start, tp->snd_una, tp, rack, rsm)); 16456 sb_offset = rsm->r_start - tp->snd_una; 16457 sendalot = 0; 16458 if (len >= segsiz) 16459 len = segsiz; 16460 if (len > 0) { 16461 sack_rxmit = 1; 16462 KMOD_TCPSTAT_INC(tcps_sack_rexmits); 16463 KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes, 16464 min(len, segsiz)); 16465 counter_u64_add(rack_rtm_prr_retran, 1); 16466 } 16467 } else if (rack->r_ctl.rc_tlpsend) { 16468 /* Tail loss probe */ 16469 long cwin; 16470 long tlen; 16471 16472 /* 16473 * Check if we can do a TLP with a RACK'd packet 16474 * this can happen if we are not doing the rack 16475 * cheat and we skipped to a TLP and it 16476 * went off. 16477 */ 16478 rsm = rack->r_ctl.rc_tlpsend; 16479 rsm->r_flags |= RACK_TLP; 16480 16481 rack->r_ctl.rc_tlpsend = NULL; 16482 sack_rxmit = 1; 16483 tlen = rsm->r_end - rsm->r_start; 16484 if (tlen > segsiz) 16485 tlen = segsiz; 16486 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 16487 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 16488 __func__, __LINE__, 16489 rsm->r_start, tp->snd_una, tp, rack, rsm)); 16490 sb_offset = rsm->r_start - tp->snd_una; 16491 cwin = min(tp->snd_wnd, tlen); 16492 len = cwin; 16493 } 16494 if (rack->r_must_retran && 16495 (rsm == NULL)) { 16496 /* 16497 * Non-Sack and we had a RTO or MTU change, we 16498 * need to retransmit until we reach 16499 * the former snd_max (rack->r_ctl.rc_snd_max_at_rto). 16500 */ 16501 if (SEQ_GT(tp->snd_max, tp->snd_una)) { 16502 int sendwin, flight; 16503 16504 sendwin = min(tp->snd_wnd, tp->snd_cwnd); 16505 flight = ctf_flight_size(tp, rack->r_ctl.rc_out_at_rto); 16506 if (flight >= sendwin) { 16507 so = inp->inp_socket; 16508 sb = &so->so_snd; 16509 goto just_return_nolock; 16510 } 16511 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 16512 KASSERT(rsm != NULL, ("rsm is NULL rack:%p r_must_retran set", rack)); 16513 if (rsm == NULL) { 16514 /* TSNH */ 16515 rack->r_must_retran = 0; 16516 rack->r_ctl.rc_out_at_rto = 0; 16517 rack->r_must_retran = 0; 16518 so = inp->inp_socket; 16519 sb = &so->so_snd; 16520 goto just_return_nolock; 16521 } 16522 sack_rxmit = 1; 16523 len = rsm->r_end - rsm->r_start; 16524 sendalot = 0; 16525 sb_offset = rsm->r_start - tp->snd_una; 16526 if (len >= segsiz) 16527 len = segsiz; 16528 } else { 16529 /* We must be done if there is nothing outstanding */ 16530 rack->r_must_retran = 0; 16531 rack->r_ctl.rc_out_at_rto = 0; 16532 } 16533 } 16534 /* 16535 * Enforce a connection sendmap count limit if set 16536 * as long as we are not retransmiting. 
16537 */ 16538 if ((rsm == NULL) && 16539 (rack->do_detection == 0) && 16540 (V_tcp_map_entries_limit > 0) && 16541 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 16542 counter_u64_add(rack_to_alloc_limited, 1); 16543 if (!rack->alloc_limit_reported) { 16544 rack->alloc_limit_reported = 1; 16545 counter_u64_add(rack_alloc_limited_conns, 1); 16546 } 16547 so = inp->inp_socket; 16548 sb = &so->so_snd; 16549 goto just_return_nolock; 16550 } 16551 if (rsm && (rsm->r_flags & RACK_HAS_FIN)) { 16552 /* we are retransmitting the fin */ 16553 len--; 16554 if (len) { 16555 /* 16556 * When retransmitting data do *not* include the 16557 * FIN. This could happen from a TLP probe. 16558 */ 16559 flags &= ~TH_FIN; 16560 } 16561 } 16562 #ifdef INVARIANTS 16563 /* For debugging */ 16564 rack->r_ctl.rc_rsm_at_retran = rsm; 16565 #endif 16566 if (rsm && rack->r_fsb_inited && rack_use_rsm_rfo && 16567 ((rsm->r_flags & RACK_HAS_FIN) == 0)) { 16568 int ret; 16569 16570 ret = rack_fast_rsm_output(tp, rack, rsm, ts_val, cts, ms_cts, &tv, len, doing_tlp); 16571 if (ret == 0) 16572 return (0); 16573 } 16574 so = inp->inp_socket; 16575 sb = &so->so_snd; 16576 if (do_a_prefetch == 0) { 16577 kern_prefetch(sb, &do_a_prefetch); 16578 do_a_prefetch = 1; 16579 } 16580 #ifdef NETFLIX_SHARED_CWND 16581 if ((tp->t_flags2 & TF2_TCP_SCWND_ALLOWED) && 16582 rack->rack_enable_scwnd) { 16583 /* We are doing cwnd sharing */ 16584 if (rack->gp_ready && 16585 (rack->rack_attempted_scwnd == 0) && 16586 (rack->r_ctl.rc_scw == NULL) && 16587 tp->t_lib) { 16588 /* The pcbid is in, lets make an attempt */ 16589 counter_u64_add(rack_try_scwnd, 1); 16590 rack->rack_attempted_scwnd = 1; 16591 rack->r_ctl.rc_scw = tcp_shared_cwnd_alloc(tp, 16592 &rack->r_ctl.rc_scw_index, 16593 segsiz); 16594 } 16595 if (rack->r_ctl.rc_scw && 16596 (rack->rack_scwnd_is_idle == 1) && 16597 sbavail(&so->so_snd)) { 16598 /* we are no longer out of data */ 16599 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 16600 rack->rack_scwnd_is_idle = 0; 16601 } 16602 if (rack->r_ctl.rc_scw) { 16603 /* First lets update and get the cwnd */ 16604 rack->r_ctl.cwnd_to_use = cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw, 16605 rack->r_ctl.rc_scw_index, 16606 tp->snd_cwnd, tp->snd_wnd, segsiz); 16607 } 16608 } 16609 #endif 16610 /* 16611 * Get standard flags, and add SYN or FIN if requested by 'hidden' 16612 * state flags. 16613 */ 16614 if (tp->t_flags & TF_NEEDFIN) 16615 flags |= TH_FIN; 16616 if (tp->t_flags & TF_NEEDSYN) 16617 flags |= TH_SYN; 16618 if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { 16619 void *end_rsm; 16620 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 16621 if (end_rsm) 16622 kern_prefetch(end_rsm, &prefetch_rsm); 16623 prefetch_rsm = 1; 16624 } 16625 SOCKBUF_LOCK(sb); 16626 /* 16627 * If snd_nxt == snd_max and we have transmitted a FIN, the 16628 * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a 16629 * negative length. This can also occur when TCP opens up its 16630 * congestion window while receiving additional duplicate acks after 16631 * fast-retransmit because TCP will reset snd_nxt to snd_max after 16632 * the fast-retransmit. 16633 * 16634 * In the normal retransmit-FIN-only case, however, snd_nxt will be 16635 * set to snd_una, the sb_offset will be 0, and the length may wind 16636 * up 0. 16637 * 16638 * If sack_rxmit is true we are retransmitting from the scoreboard 16639 * in which case len is already set. 
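	 * Otherwise len is derived below: outside of fast recovery (or with
	 * PRR disabled) from rack_what_can_we_send() or a TLP's forced new
	 * data, and inside recovery from the PRR send count
	 * (rc_prr_sndcnt), always clamped to the peer's window and to what
	 * is actually in the socket buffer.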
16640 */ 16641 if ((sack_rxmit == 0) && 16642 (TCPS_HAVEESTABLISHED(tp->t_state) || IS_FASTOPEN(tp->t_flags))) { 16643 uint32_t avail; 16644 16645 avail = sbavail(sb); 16646 if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail) 16647 sb_offset = tp->snd_nxt - tp->snd_una; 16648 else 16649 sb_offset = 0; 16650 if ((IN_FASTRECOVERY(tp->t_flags) == 0) || rack->rack_no_prr) { 16651 if (rack->r_ctl.rc_tlp_new_data) { 16652 /* TLP is forcing out new data */ 16653 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) { 16654 rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset); 16655 } 16656 if ((rack->r_ctl.rc_tlp_new_data + sb_offset) > tp->snd_wnd) { 16657 if (tp->snd_wnd > sb_offset) 16658 len = tp->snd_wnd - sb_offset; 16659 else 16660 len = 0; 16661 } else { 16662 len = rack->r_ctl.rc_tlp_new_data; 16663 } 16664 } else { 16665 len = rack_what_can_we_send(tp, rack, cwnd_to_use, avail, sb_offset); 16666 } 16667 if ((rack->r_ctl.crte == NULL) && IN_FASTRECOVERY(tp->t_flags) && (len > segsiz)) { 16668 /* 16669 * For prr=off, we need to send only 1 MSS 16670 * at a time. We do this because another sack could 16671 * be arriving that causes us to send retransmits and 16672 * we don't want to be on a long pace due to a larger send 16673 * that keeps us from sending out the retransmit. 16674 */ 16675 len = segsiz; 16676 } 16677 } else { 16678 uint32_t outstanding; 16679 /* 16680 * We are inside of a Fast recovery episode, this 16681 * is caused by a SACK or 3 dup acks. At this point 16682 * we have sent all the retransmissions and we rely 16683 * on PRR to dictate what we will send in the form of 16684 * new data. 16685 */ 16686 16687 outstanding = tp->snd_max - tp->snd_una; 16688 if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) { 16689 if (tp->snd_wnd > outstanding) { 16690 len = tp->snd_wnd - outstanding; 16691 /* Check to see if we have the data */ 16692 if ((sb_offset + len) > avail) { 16693 /* It does not all fit */ 16694 if (avail > sb_offset) 16695 len = avail - sb_offset; 16696 else 16697 len = 0; 16698 } 16699 } else { 16700 len = 0; 16701 } 16702 } else if (avail > sb_offset) { 16703 len = avail - sb_offset; 16704 } else { 16705 len = 0; 16706 } 16707 if (len > 0) { 16708 if (len > rack->r_ctl.rc_prr_sndcnt) { 16709 len = rack->r_ctl.rc_prr_sndcnt; 16710 } 16711 if (len > 0) { 16712 sub_from_prr = 1; 16713 counter_u64_add(rack_rtm_prr_newdata, 1); 16714 } 16715 } 16716 if (len > segsiz) { 16717 /* 16718 * We should never send more than a MSS when 16719 * retransmitting or sending new data in prr 16720 * mode unless the override flag is on. Most 16721 * likely the PRR algorithm is not going to 16722 * let us send a lot as well :-) 16723 */ 16724 if (rack->r_ctl.rc_prr_sendalot == 0) { 16725 len = segsiz; 16726 } 16727 } else if (len < segsiz) { 16728 /* 16729 * Do we send any? The idea here is if the 16730 * send empty's the socket buffer we want to 16731 * do it. However if not then lets just wait 16732 * for our prr_sndcnt to get bigger. 16733 */ 16734 long leftinsb; 16735 16736 leftinsb = sbavail(sb) - sb_offset; 16737 if (leftinsb > len) { 16738 /* This send does not empty the sb */ 16739 len = 0; 16740 } 16741 } 16742 } 16743 } else if (!TCPS_HAVEESTABLISHED(tp->t_state)) { 16744 /* 16745 * If you have not established 16746 * and are not doing FAST OPEN 16747 * no data please. 
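		 * (Both len and sb_offset are zeroed just below in that
		 * case.)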
16748 */ 16749 if ((sack_rxmit == 0) && 16750 (!IS_FASTOPEN(tp->t_flags))){ 16751 len = 0; 16752 sb_offset = 0; 16753 } 16754 } 16755 if (prefetch_so_done == 0) { 16756 kern_prefetch(so, &prefetch_so_done); 16757 prefetch_so_done = 1; 16758 } 16759 /* 16760 * Lop off SYN bit if it has already been sent. However, if this is 16761 * SYN-SENT state and if segment contains data and if we don't know 16762 * that foreign host supports TAO, suppress sending segment. 16763 */ 16764 if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) && 16765 ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) { 16766 /* 16767 * When sending additional segments following a TFO SYN|ACK, 16768 * do not include the SYN bit. 16769 */ 16770 if (IS_FASTOPEN(tp->t_flags) && 16771 (tp->t_state == TCPS_SYN_RECEIVED)) 16772 flags &= ~TH_SYN; 16773 } 16774 /* 16775 * Be careful not to send data and/or FIN on SYN segments. This 16776 * measure is needed to prevent interoperability problems with not 16777 * fully conformant TCP implementations. 16778 */ 16779 if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { 16780 len = 0; 16781 flags &= ~TH_FIN; 16782 } 16783 /* 16784 * On TFO sockets, ensure no data is sent in the following cases: 16785 * 16786 * - When retransmitting SYN|ACK on a passively-created socket 16787 * 16788 * - When retransmitting SYN on an actively created socket 16789 * 16790 * - When sending a zero-length cookie (cookie request) on an 16791 * actively created socket 16792 * 16793 * - When the socket is in the CLOSED state (RST is being sent) 16794 */ 16795 if (IS_FASTOPEN(tp->t_flags) && 16796 (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || 16797 ((tp->t_state == TCPS_SYN_SENT) && 16798 (tp->t_tfo_client_cookie_len == 0)) || 16799 (flags & TH_RST))) { 16800 sack_rxmit = 0; 16801 len = 0; 16802 } 16803 /* Without fast-open there should never be data sent on a SYN */ 16804 if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) { 16805 tp->snd_nxt = tp->iss; 16806 len = 0; 16807 } 16808 if ((len > segsiz) && (tcp_dsack_block_exists(tp))) { 16809 /* We only send 1 MSS if we have a DSACK block */ 16810 add_flag |= RACK_SENT_W_DSACK; 16811 len = segsiz; 16812 } 16813 orig_len = len; 16814 if (len <= 0) { 16815 /* 16816 * If FIN has been sent but not acked, but we haven't been 16817 * called to retransmit, len will be < 0. Otherwise, window 16818 * shrank after we sent into it. If window shrank to 0, 16819 * cancel pending retransmit, pull snd_nxt back to (closed) 16820 * window, and set the persist timer if it isn't already 16821 * going. If the window didn't close completely, just wait 16822 * for an ACK. 16823 * 16824 * We also do a general check here to ensure that we will 16825 * set the persist timer when we have data to send, but a 16826 * 0-byte window. This makes sure the persist timer is set 16827 * even if the packet hits one of the "goto send" lines 16828 * below. 16829 */ 16830 len = 0; 16831 if ((tp->snd_wnd == 0) && 16832 (TCPS_HAVEESTABLISHED(tp->t_state)) && 16833 (tp->snd_una == tp->snd_max) && 16834 (sb_offset < (int)sbavail(sb))) { 16835 rack_enter_persist(tp, rack, cts); 16836 } 16837 } else if ((rsm == NULL) && 16838 (doing_tlp == 0) && 16839 (len < pace_max_seg)) { 16840 /* 16841 * We are not sending a maximum sized segment for 16842 * some reason. Should we not send anything (think 16843 * sws or persists)? 
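 *
 * To give the checks below a concrete feel (hypothetical numbers):
 * with segsiz = minseg = 1460 and rc_high_rwnd = 64240,
 * min(rc_high_rwnd / 2, minseg) works out to 1460, so a peer window
 * smaller than one segment holds the send back entirely and, with
 * nothing left outstanding, drops us into persists instead.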
16844 */ 16845 if ((tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg)) && 16846 (TCPS_HAVEESTABLISHED(tp->t_state)) && 16847 (len < minseg) && 16848 (len < (int)(sbavail(sb) - sb_offset))) { 16849 /* 16850 * Here the rwnd is less than 16851 * the minimum pacing size, this is not a retransmit, 16852 * we are established and 16853 * the send is not the last in the socket buffer 16854 * we send nothing, and we may enter persists 16855 * if nothing is outstanding. 16856 */ 16857 len = 0; 16858 if (tp->snd_max == tp->snd_una) { 16859 /* 16860 * Nothing out we can 16861 * go into persists. 16862 */ 16863 rack_enter_persist(tp, rack, cts); 16864 } 16865 } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) && 16866 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && 16867 (len < (int)(sbavail(sb) - sb_offset)) && 16868 (len < minseg)) { 16869 /* 16870 * Here we are not retransmitting, and 16871 * the cwnd is not so small that we could 16872 * not send at least a min size (rxt timer 16873 * not having gone off), We have 2 segments or 16874 * more already in flight, its not the tail end 16875 * of the socket buffer and the cwnd is blocking 16876 * us from sending out a minimum pacing segment size. 16877 * Lets not send anything. 16878 */ 16879 len = 0; 16880 } else if (((tp->snd_wnd - ctf_outstanding(tp)) < 16881 min((rack->r_ctl.rc_high_rwnd/2), minseg)) && 16882 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && 16883 (len < (int)(sbavail(sb) - sb_offset)) && 16884 (TCPS_HAVEESTABLISHED(tp->t_state))) { 16885 /* 16886 * Here we have a send window but we have 16887 * filled it up and we can't send another pacing segment. 16888 * We also have in flight more than 2 segments 16889 * and we are not completing the sb i.e. we allow 16890 * the last bytes of the sb to go out even if 16891 * its not a full pacing segment. 16892 */ 16893 len = 0; 16894 } else if ((rack->r_ctl.crte != NULL) && 16895 (tp->snd_wnd >= (pace_max_seg * max(1, rack_hw_rwnd_factor))) && 16896 (cwnd_to_use >= (pace_max_seg + (4 * segsiz))) && 16897 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) >= (2 * segsiz)) && 16898 (len < (int)(sbavail(sb) - sb_offset))) { 16899 /* 16900 * Here we are doing hardware pacing, this is not a TLP, 16901 * we are not sending a pace max segment size, there is rwnd 16902 * room to send at least N pace_max_seg, the cwnd is greater 16903 * than or equal to a full pacing segments plus 4 mss and we have 2 or 16904 * more segments in flight and its not the tail of the socket buffer. 16905 * 16906 * We don't want to send instead we need to get more ack's in to 16907 * allow us to send a full pacing segment. Normally, if we are pacing 16908 * about the right speed, we should have finished our pacing 16909 * send as most of the acks have come back if we are at the 16910 * right rate. This is a bit fuzzy since return path delay 16911 * can delay the acks, which is why we want to make sure we 16912 * have cwnd space to have a bit more than a max pace segments in flight. 16913 * 16914 * If we have not gotten our acks back we are pacing at too high a 16915 * rate delaying will not hurt and will bring our GP estimate down by 16916 * injecting the delay. If we don't do this we will send 16917 * 2 MSS out in response to the acks being clocked in which 16918 * defeats the point of hw-pacing (i.e. to help us get 16919 * larger TSO's out). 16920 */ 16921 len = 0; 16922 16923 } 16924 16925 } 16926 /* len will be >= 0 after this point. 
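 * Loosely summarizing the hold-backs above: a smaller-than-minseg
 * send that would not empty the socket buffer is suppressed when the
 * receive window, the cwnd or hardware pacing is what is pinching it
 * down, on the theory that waiting for more ACKs lets us put out a
 * full pacing-sized segment instead.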
*/ 16927 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 16928 rack_sndbuf_autoscale(rack); 16929 /* 16930 * Decide if we can use TCP Segmentation Offloading (if supported by 16931 * hardware). 16932 * 16933 * TSO may only be used if we are in a pure bulk sending state. The 16934 * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP 16935 * options prevent using TSO. With TSO the TCP header is the same 16936 * (except for the sequence number) for all generated packets. This 16937 * makes it impossible to transmit any options which vary per 16938 * generated segment or packet. 16939 * 16940 * IPv4 handling has a clear separation of ip options and ip header 16941 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does 16942 * the right thing below to provide length of just ip options and thus 16943 * checking for ipoptlen is enough to decide if ip options are present. 16944 */ 16945 ipoptlen = 0; 16946 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 16947 /* 16948 * Pre-calculate here as we save another lookup into the darknesses 16949 * of IPsec that way and can actually decide if TSO is ok. 16950 */ 16951 #ifdef INET6 16952 if (isipv6 && IPSEC_ENABLED(ipv6)) 16953 ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb); 16954 #ifdef INET 16955 else 16956 #endif 16957 #endif /* INET6 */ 16958 #ifdef INET 16959 if (IPSEC_ENABLED(ipv4)) 16960 ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb); 16961 #endif /* INET */ 16962 #endif 16963 16964 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 16965 ipoptlen += ipsec_optlen; 16966 #endif 16967 if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > segsiz && 16968 (tp->t_port == 0) && 16969 ((tp->t_flags & TF_SIGNATURE) == 0) && 16970 tp->rcv_numsacks == 0 && sack_rxmit == 0 && 16971 ipoptlen == 0) 16972 tso = 1; 16973 { 16974 uint32_t outstanding; 16975 16976 outstanding = tp->snd_max - tp->snd_una; 16977 if (tp->t_flags & TF_SENTFIN) { 16978 /* 16979 * If we sent a fin, snd_max is 1 higher than 16980 * snd_una 16981 */ 16982 outstanding--; 16983 } 16984 if (sack_rxmit) { 16985 if ((rsm->r_flags & RACK_HAS_FIN) == 0) 16986 flags &= ~TH_FIN; 16987 } else { 16988 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + 16989 sbused(sb))) 16990 flags &= ~TH_FIN; 16991 } 16992 } 16993 recwin = lmin(lmax(sbspace(&so->so_rcv), 0), 16994 (long)TCP_MAXWIN << tp->rcv_scale); 16995 16996 /* 16997 * Sender silly window avoidance. We transmit under the following 16998 * conditions when len is non-zero: 16999 * 17000 * - We have a full segment (or more with TSO) - This is the last 17001 * buffer in a write()/send() and we are either idle or running 17002 * NODELAY - we've timed out (e.g. persist timer) - we have more 17003 * then 1/2 the maximum send window's worth of data (receiver may be 17004 * limited the window size) - we need to retransmit 17005 */ 17006 if (len) { 17007 if (len >= segsiz) { 17008 goto send; 17009 } 17010 /* 17011 * NOTE! on localhost connections an 'ack' from the remote 17012 * end may occur synchronously with the output and cause us 17013 * to flush a buffer queued with moretocome. 
XXX 17014 * 17015 */ 17016 if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ 17017 (idle || (tp->t_flags & TF_NODELAY)) && 17018 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && 17019 (tp->t_flags & TF_NOPUSH) == 0) { 17020 pass = 2; 17021 goto send; 17022 } 17023 if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ 17024 pass = 22; 17025 goto send; 17026 } 17027 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { 17028 pass = 4; 17029 goto send; 17030 } 17031 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */ 17032 pass = 5; 17033 goto send; 17034 } 17035 if (sack_rxmit) { 17036 pass = 6; 17037 goto send; 17038 } 17039 if (((tp->snd_wnd - ctf_outstanding(tp)) < segsiz) && 17040 (ctf_outstanding(tp) < (segsiz * 2))) { 17041 /* 17042 * We have less than two MSS outstanding (delayed ack) 17043 * and our rwnd will not let us send a full sized 17044 * MSS. Lets go ahead and let this small segment 17045 * out because we want to try to have at least two 17046 * packets inflight to not be caught by delayed ack. 17047 */ 17048 pass = 12; 17049 goto send; 17050 } 17051 } 17052 /* 17053 * Sending of standalone window updates. 17054 * 17055 * Window updates are important when we close our window due to a 17056 * full socket buffer and are opening it again after the application 17057 * reads data from it. Once the window has opened again and the 17058 * remote end starts to send again the ACK clock takes over and 17059 * provides the most current window information. 17060 * 17061 * We must avoid the silly window syndrome whereas every read from 17062 * the receive buffer, no matter how small, causes a window update 17063 * to be sent. We also should avoid sending a flurry of window 17064 * updates when the socket buffer had queued a lot of data and the 17065 * application is doing small reads. 17066 * 17067 * Prevent a flurry of pointless window updates by only sending an 17068 * update when we can increase the advertized window by more than 17069 * 1/4th of the socket buffer capacity. When the buffer is getting 17070 * full or is very small be more aggressive and send an update 17071 * whenever we can increase by two mss sized segments. In all other 17072 * situations the ACK's to new incoming data will carry further 17073 * window increases. 17074 * 17075 * Don't send an independent window update if a delayed ACK is 17076 * pending (it will get piggy-backed on it) or the remote side 17077 * already has done a half-close and won't send more data. Skip 17078 * this if the connection is in T/TCP half-open state. 17079 */ 17080 if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && 17081 !(tp->t_flags & TF_DELACK) && 17082 !TCPS_HAVERCVDFIN(tp->t_state)) { 17083 /* 17084 * "adv" is the amount we could increase the window, taking 17085 * into account that we are limited by TCP_MAXWIN << 17086 * tp->rcv_scale. 17087 */ 17088 int32_t adv; 17089 int oldwin; 17090 17091 adv = recwin; 17092 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { 17093 oldwin = (tp->rcv_adv - tp->rcv_nxt); 17094 if (adv > oldwin) 17095 adv -= oldwin; 17096 else { 17097 /* We can't increase the window */ 17098 adv = 0; 17099 } 17100 } else 17101 oldwin = 0; 17102 17103 /* 17104 * If the new window size ends up being the same as or less 17105 * than the old size when it is scaled, then don't force 17106 * a window update. 
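 *
 * Putting hypothetical numbers on the thresholds checked below: with
 * segsiz = 1460 and so_rcv.sb_hiwat = 65536, a standalone update is
 * sent once the window can grow by at least 2 * 1460 = 2920 bytes
 * and either that growth reaches 65536 / 4 = 16384 bytes or the
 * remaining advertised space has shrunk to 65536 / 8 = 8192 bytes or
 * less; a growth covering half the buffer always goes out.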
17107 */ 17108 if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale) 17109 goto dontupdate; 17110 17111 if (adv >= (int32_t)(2 * segsiz) && 17112 (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || 17113 recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) || 17114 so->so_rcv.sb_hiwat <= 8 * segsiz)) { 17115 pass = 7; 17116 goto send; 17117 } 17118 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) { 17119 pass = 23; 17120 goto send; 17121 } 17122 } 17123 dontupdate: 17124 17125 /* 17126 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW 17127 * is also a catch-all for the retransmit timer timeout case. 17128 */ 17129 if (tp->t_flags & TF_ACKNOW) { 17130 pass = 8; 17131 goto send; 17132 } 17133 if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) { 17134 pass = 9; 17135 goto send; 17136 } 17137 /* 17138 * If our state indicates that FIN should be sent and we have not 17139 * yet done so, then we need to send. 17140 */ 17141 if ((flags & TH_FIN) && 17142 (tp->snd_nxt == tp->snd_una)) { 17143 pass = 11; 17144 goto send; 17145 } 17146 /* 17147 * No reason to send a segment, just return. 17148 */ 17149 just_return: 17150 SOCKBUF_UNLOCK(sb); 17151 just_return_nolock: 17152 { 17153 int app_limited = CTF_JR_SENT_DATA; 17154 17155 if (tot_len_this_send > 0) { 17156 /* Make sure snd_nxt is up to max */ 17157 rack->r_ctl.fsb.recwin = recwin; 17158 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz); 17159 if ((error == 0) && 17160 rack_use_rfo && 17161 ((flags & (TH_SYN|TH_FIN)) == 0) && 17162 (ipoptlen == 0) && 17163 (tp->snd_nxt == tp->snd_max) && 17164 (tp->rcv_numsacks == 0) && 17165 rack->r_fsb_inited && 17166 TCPS_HAVEESTABLISHED(tp->t_state) && 17167 (rack->r_must_retran == 0) && 17168 ((tp->t_flags & TF_NEEDFIN) == 0) && 17169 (len > 0) && (orig_len > 0) && 17170 (orig_len > len) && 17171 ((orig_len - len) >= segsiz) && 17172 ((optlen == 0) || 17173 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { 17174 /* We can send at least one more MSS using our fsb */ 17175 17176 rack->r_fast_output = 1; 17177 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); 17178 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; 17179 rack->r_ctl.fsb.tcp_flags = flags; 17180 rack->r_ctl.fsb.left_to_send = orig_len - len; 17181 if (hw_tls) 17182 rack->r_ctl.fsb.hw_tls = 1; 17183 else 17184 rack->r_ctl.fsb.hw_tls = 0; 17185 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))), 17186 ("rack:%p left_to_send:%u sbavail:%u out:%u", 17187 rack, rack->r_ctl.fsb.left_to_send, sbavail(sb), 17188 (tp->snd_max - tp->snd_una))); 17189 if (rack->r_ctl.fsb.left_to_send < segsiz) 17190 rack->r_fast_output = 0; 17191 else { 17192 if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una))) 17193 rack->r_ctl.fsb.rfo_apply_push = 1; 17194 else 17195 rack->r_ctl.fsb.rfo_apply_push = 0; 17196 } 17197 } else 17198 rack->r_fast_output = 0; 17199 17200 17201 rack_log_fsb(rack, tp, so, flags, 17202 ipoptlen, orig_len, len, 0, 17203 1, optlen, __LINE__, 1); 17204 if (SEQ_GT(tp->snd_max, tp->snd_nxt)) 17205 tp->snd_nxt = tp->snd_max; 17206 } else { 17207 int end_window = 0; 17208 uint32_t seq = tp->gput_ack; 17209 17210 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 17211 if (rsm) { 17212 /* 17213 * Mark the last sent that we just-returned (hinting 17214 * that delayed ack may play a role in any rtt measurement). 
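 *
 * The classification that follows is, loosely, a priority order:
 * rwnd limited, then application limited (out of data, the SWS
 * hold-back or TF_NOPUSH), then cwnd limited, then PRR starved;
 * CTF_JR_ASSESSING is the catch-all when none of the known reasons
 * for not sending applies.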
17215 */ 17216 rsm->r_just_ret = 1; 17217 } 17218 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); 17219 rack->r_ctl.rc_agg_delayed = 0; 17220 rack->r_early = 0; 17221 rack->r_late = 0; 17222 rack->r_ctl.rc_agg_early = 0; 17223 if ((ctf_outstanding(tp) + 17224 min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)), 17225 minseg)) >= tp->snd_wnd) { 17226 /* We are limited by the rwnd */ 17227 app_limited = CTF_JR_RWND_LIMITED; 17228 if (IN_FASTRECOVERY(tp->t_flags)) 17229 rack->r_ctl.rc_prr_sndcnt = 0; 17230 } else if (ctf_outstanding(tp) >= sbavail(sb)) { 17231 /* We are limited by whats available -- app limited */ 17232 app_limited = CTF_JR_APP_LIMITED; 17233 if (IN_FASTRECOVERY(tp->t_flags)) 17234 rack->r_ctl.rc_prr_sndcnt = 0; 17235 } else if ((idle == 0) && 17236 ((tp->t_flags & TF_NODELAY) == 0) && 17237 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && 17238 (len < segsiz)) { 17239 /* 17240 * No delay is not on and the 17241 * user is sending less than 1MSS. This 17242 * brings out SWS avoidance so we 17243 * don't send. Another app-limited case. 17244 */ 17245 app_limited = CTF_JR_APP_LIMITED; 17246 } else if (tp->t_flags & TF_NOPUSH) { 17247 /* 17248 * The user has requested no push of 17249 * the last segment and we are 17250 * at the last segment. Another app 17251 * limited case. 17252 */ 17253 app_limited = CTF_JR_APP_LIMITED; 17254 } else if ((ctf_outstanding(tp) + minseg) > cwnd_to_use) { 17255 /* Its the cwnd */ 17256 app_limited = CTF_JR_CWND_LIMITED; 17257 } else if (IN_FASTRECOVERY(tp->t_flags) && 17258 (rack->rack_no_prr == 0) && 17259 (rack->r_ctl.rc_prr_sndcnt < segsiz)) { 17260 app_limited = CTF_JR_PRR; 17261 } else { 17262 /* Now why here are we not sending? */ 17263 #ifdef NOW 17264 #ifdef INVARIANTS 17265 panic("rack:%p hit JR_ASSESSING case cwnd_to_use:%u?", rack, cwnd_to_use); 17266 #endif 17267 #endif 17268 app_limited = CTF_JR_ASSESSING; 17269 } 17270 /* 17271 * App limited in some fashion, for our pacing GP 17272 * measurements we don't want any gap (even cwnd). 17273 * Close down the measurement window. 17274 */ 17275 if (rack_cwnd_block_ends_measure && 17276 ((app_limited == CTF_JR_CWND_LIMITED) || 17277 (app_limited == CTF_JR_PRR))) { 17278 /* 17279 * The reason we are not sending is 17280 * the cwnd (or prr). We have been configured 17281 * to end the measurement window in 17282 * this case. 17283 */ 17284 end_window = 1; 17285 } else if (rack_rwnd_block_ends_measure && 17286 (app_limited == CTF_JR_RWND_LIMITED)) { 17287 /* 17288 * We are rwnd limited and have been 17289 * configured to end the measurement 17290 * window in this case. 17291 */ 17292 end_window = 1; 17293 } else if (app_limited == CTF_JR_APP_LIMITED) { 17294 /* 17295 * A true application limited period, we have 17296 * ran out of data. 17297 */ 17298 end_window = 1; 17299 } else if (app_limited == CTF_JR_ASSESSING) { 17300 /* 17301 * In the assessing case we hit the end of 17302 * the if/else and had no known reason 17303 * This will panic us under invariants.. 17304 * 17305 * If we get this out in logs we need to 17306 * investagate which reason we missed. 17307 */ 17308 end_window = 1; 17309 } 17310 if (end_window) { 17311 uint8_t log = 0; 17312 17313 /* Adjust the Gput measurement */ 17314 if ((tp->t_flags & TF_GPUTINPROG) && 17315 SEQ_GT(tp->gput_ack, tp->snd_max)) { 17316 tp->gput_ack = tp->snd_max; 17317 if ((tp->gput_ack - tp->gput_seq) < (MIN_GP_WIN * segsiz)) { 17318 /* 17319 * There is not enough to measure. 
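 * Taking MIN_GP_WIN as six segments purely for the sake of the
 * numbers, with segsiz = 1460 anything under 8760 bytes between
 * gput_seq and the clamped gput_ack is considered too small a
 * sample and the goodput measurement is abandoned rather than
 * logged.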
17320 */ 17321 tp->t_flags &= ~TF_GPUTINPROG; 17322 rack_log_pacing_delay_calc(rack, (tp->gput_ack - tp->gput_seq) /*flex2*/, 17323 rack->r_ctl.rc_gp_srtt /*flex1*/, 17324 tp->gput_seq, 17325 0, 0, 18, __LINE__, NULL, 0); 17326 } else 17327 log = 1; 17328 } 17329 /* Mark the last packet has app limited */ 17330 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 17331 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { 17332 if (rack->r_ctl.rc_app_limited_cnt == 0) 17333 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; 17334 else { 17335 /* 17336 * Go out to the end app limited and mark 17337 * this new one as next and move the end_appl up 17338 * to this guy. 17339 */ 17340 if (rack->r_ctl.rc_end_appl) 17341 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; 17342 rack->r_ctl.rc_end_appl = rsm; 17343 } 17344 rsm->r_flags |= RACK_APP_LIMITED; 17345 rack->r_ctl.rc_app_limited_cnt++; 17346 } 17347 if (log) 17348 rack_log_pacing_delay_calc(rack, 17349 rack->r_ctl.rc_app_limited_cnt, seq, 17350 tp->gput_ack, 0, 0, 4, __LINE__, NULL, 0); 17351 } 17352 } 17353 if (slot) { 17354 /* set the rack tcb into the slot N */ 17355 counter_u64_add(rack_paced_segments, 1); 17356 } else if (tot_len_this_send) { 17357 counter_u64_add(rack_unpaced_segments, 1); 17358 } 17359 /* Check if we need to go into persists or not */ 17360 if ((tp->snd_max == tp->snd_una) && 17361 TCPS_HAVEESTABLISHED(tp->t_state) && 17362 sbavail(sb) && 17363 (sbavail(sb) > tp->snd_wnd) && 17364 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg))) { 17365 /* Yes lets make sure to move to persist before timer-start */ 17366 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 17367 } 17368 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack); 17369 rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use); 17370 } 17371 #ifdef NETFLIX_SHARED_CWND 17372 if ((sbavail(sb) == 0) && 17373 rack->r_ctl.rc_scw) { 17374 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 17375 rack->rack_scwnd_is_idle = 1; 17376 } 17377 #endif 17378 #ifdef TCP_ACCOUNTING 17379 if (tot_len_this_send > 0) { 17380 crtsc = get_cyclecount(); 17381 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 17382 tp->tcp_cnt_counters[SND_OUT_DATA]++; 17383 } 17384 counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], 1); 17385 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 17386 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); 17387 } 17388 counter_u64_add(tcp_proc_time[SND_OUT_DATA], (crtsc - ts_val)); 17389 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 17390 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) / segsiz); 17391 } 17392 counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((tot_len_this_send + segsiz - 1) / segsiz)); 17393 } else { 17394 crtsc = get_cyclecount(); 17395 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 17396 tp->tcp_cnt_counters[SND_LIMITED]++; 17397 } 17398 counter_u64_add(tcp_cnt_counters[SND_LIMITED], 1); 17399 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 17400 tp->tcp_proc_time[SND_LIMITED] += (crtsc - ts_val); 17401 } 17402 counter_u64_add(tcp_proc_time[SND_LIMITED], (crtsc - ts_val)); 17403 } 17404 sched_unpin(); 17405 #endif 17406 return (0); 17407 17408 send: 17409 if (rsm || sack_rxmit) 17410 counter_u64_add(rack_nfto_resend, 1); 17411 else 17412 counter_u64_add(rack_non_fto_send, 1); 17413 if ((flags & TH_FIN) && 17414 sbavail(sb)) { 17415 /* 17416 * We do not transmit a FIN 17417 * with data outstanding. 
We 17418 * need to make it so all data 17419 * is acked first. 17420 */ 17421 flags &= ~TH_FIN; 17422 } 17423 /* Enforce stack imposed max seg size if we have one */ 17424 if (rack->r_ctl.rc_pace_max_segs && 17425 (len > rack->r_ctl.rc_pace_max_segs)) { 17426 mark = 1; 17427 len = rack->r_ctl.rc_pace_max_segs; 17428 } 17429 SOCKBUF_LOCK_ASSERT(sb); 17430 if (len > 0) { 17431 if (len >= segsiz) 17432 tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; 17433 else 17434 tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; 17435 } 17436 /* 17437 * Before ESTABLISHED, force sending of initial options unless TCP 17438 * set not to do any options. NOTE: we assume that the IP/TCP header 17439 * plus TCP options always fit in a single mbuf, leaving room for a 17440 * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr) 17441 * + optlen <= MCLBYTES 17442 */ 17443 optlen = 0; 17444 #ifdef INET6 17445 if (isipv6) 17446 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 17447 else 17448 #endif 17449 hdrlen = sizeof(struct tcpiphdr); 17450 17451 /* 17452 * Compute options for segment. We only have to care about SYN and 17453 * established connection segments. Options for SYN-ACK segments 17454 * are handled in TCP syncache. 17455 */ 17456 to.to_flags = 0; 17457 if ((tp->t_flags & TF_NOOPT) == 0) { 17458 /* Maximum segment size. */ 17459 if (flags & TH_SYN) { 17460 tp->snd_nxt = tp->iss; 17461 to.to_mss = tcp_mssopt(&inp->inp_inc); 17462 if (tp->t_port) 17463 to.to_mss -= V_tcp_udp_tunneling_overhead; 17464 to.to_flags |= TOF_MSS; 17465 17466 /* 17467 * On SYN or SYN|ACK transmits on TFO connections, 17468 * only include the TFO option if it is not a 17469 * retransmit, as the presence of the TFO option may 17470 * have caused the original SYN or SYN|ACK to have 17471 * been dropped by a middlebox. 17472 */ 17473 if (IS_FASTOPEN(tp->t_flags) && 17474 (tp->t_rxtshift == 0)) { 17475 if (tp->t_state == TCPS_SYN_RECEIVED) { 17476 to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; 17477 to.to_tfo_cookie = 17478 (u_int8_t *)&tp->t_tfo_cookie.server; 17479 to.to_flags |= TOF_FASTOPEN; 17480 wanted_cookie = 1; 17481 } else if (tp->t_state == TCPS_SYN_SENT) { 17482 to.to_tfo_len = 17483 tp->t_tfo_client_cookie_len; 17484 to.to_tfo_cookie = 17485 tp->t_tfo_cookie.client; 17486 to.to_flags |= TOF_FASTOPEN; 17487 wanted_cookie = 1; 17488 /* 17489 * If we wind up having more data to 17490 * send with the SYN than can fit in 17491 * one segment, don't send any more 17492 * until the SYN|ACK comes back from 17493 * the other end. 17494 */ 17495 sendalot = 0; 17496 } 17497 } 17498 } 17499 /* Window scaling. */ 17500 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { 17501 to.to_wscale = tp->request_r_scale; 17502 to.to_flags |= TOF_SCALE; 17503 } 17504 /* Timestamps. */ 17505 if ((tp->t_flags & TF_RCVD_TSTMP) || 17506 ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { 17507 to.to_tsval = ms_cts + tp->ts_offset; 17508 to.to_tsecr = tp->ts_recent; 17509 to.to_flags |= TOF_TS; 17510 } 17511 /* Set receive buffer autosizing timestamp. */ 17512 if (tp->rfbuf_ts == 0 && 17513 (so->so_rcv.sb_flags & SB_AUTOSIZE)) 17514 tp->rfbuf_ts = tcp_ts_getticks(); 17515 /* Selective ACK's. 
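 * (With timestamps in play the 40 bytes of TCP option space only
 * leave room for about three SACK blocks; we hand tcp_addoptions()
 * everything in sackblks and let it trim to whatever fits.)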
*/ 17516 if (tp->t_flags & TF_SACK_PERMIT) { 17517 if (flags & TH_SYN) 17518 to.to_flags |= TOF_SACKPERM; 17519 else if (TCPS_HAVEESTABLISHED(tp->t_state) && 17520 tp->rcv_numsacks > 0) { 17521 to.to_flags |= TOF_SACK; 17522 to.to_nsacks = tp->rcv_numsacks; 17523 to.to_sacks = (u_char *)tp->sackblks; 17524 } 17525 } 17526 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 17527 /* TCP-MD5 (RFC2385). */ 17528 if (tp->t_flags & TF_SIGNATURE) 17529 to.to_flags |= TOF_SIGNATURE; 17530 #endif /* TCP_SIGNATURE */ 17531 17532 /* Processing the options. */ 17533 hdrlen += optlen = tcp_addoptions(&to, opt); 17534 /* 17535 * If we wanted a TFO option to be added, but it was unable 17536 * to fit, ensure no data is sent. 17537 */ 17538 if (IS_FASTOPEN(tp->t_flags) && wanted_cookie && 17539 !(to.to_flags & TOF_FASTOPEN)) 17540 len = 0; 17541 } 17542 if (tp->t_port) { 17543 if (V_tcp_udp_tunneling_port == 0) { 17544 /* The port was removed?? */ 17545 SOCKBUF_UNLOCK(&so->so_snd); 17546 #ifdef TCP_ACCOUNTING 17547 crtsc = get_cyclecount(); 17548 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 17549 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 17550 } 17551 counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1); 17552 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 17553 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 17554 } 17555 counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val)); 17556 sched_unpin(); 17557 #endif 17558 return (EHOSTUNREACH); 17559 } 17560 hdrlen += sizeof(struct udphdr); 17561 } 17562 #ifdef INET6 17563 if (isipv6) 17564 ipoptlen = ip6_optlen(tp->t_inpcb); 17565 else 17566 #endif 17567 if (tp->t_inpcb->inp_options) 17568 ipoptlen = tp->t_inpcb->inp_options->m_len - 17569 offsetof(struct ipoption, ipopt_list); 17570 else 17571 ipoptlen = 0; 17572 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 17573 ipoptlen += ipsec_optlen; 17574 #endif 17575 17576 /* 17577 * Adjust data length if insertion of options will bump the packet 17578 * length beyond the t_maxseg length. Clear the FIN bit because we 17579 * cut off the tail of the segment. 17580 */ 17581 if (len + optlen + ipoptlen > tp->t_maxseg) { 17582 if (tso) { 17583 uint32_t if_hw_tsomax; 17584 uint32_t moff; 17585 int32_t max_len; 17586 17587 /* extract TSO information */ 17588 if_hw_tsomax = tp->t_tsomax; 17589 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 17590 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 17591 KASSERT(ipoptlen == 0, 17592 ("%s: TSO can't do IP options", __func__)); 17593 17594 /* 17595 * Check if we should limit by maximum payload 17596 * length: 17597 */ 17598 if (if_hw_tsomax != 0) { 17599 /* compute maximum TSO length */ 17600 max_len = (if_hw_tsomax - hdrlen - 17601 max_linkhdr); 17602 if (max_len <= 0) { 17603 len = 0; 17604 } else if (len > max_len) { 17605 sendalot = 1; 17606 len = max_len; 17607 mark = 2; 17608 } 17609 } 17610 /* 17611 * Prevent the last segment from being fractional 17612 * unless the send sockbuf can be emptied: 17613 */ 17614 max_len = (tp->t_maxseg - optlen); 17615 if ((sb_offset + len) < sbavail(sb)) { 17616 moff = len % (u_int)max_len; 17617 if (moff != 0) { 17618 mark = 3; 17619 len -= moff; 17620 } 17621 } 17622 /* 17623 * In case there are too many small fragments don't 17624 * use TSO: 17625 */ 17626 if (len <= segsiz) { 17627 mark = 4; 17628 tso = 0; 17629 } 17630 /* 17631 * Send the FIN in a separate segment after the bulk 17632 * sending is done. We don't trust the TSO 17633 * implementations to clear the FIN flag on all but 17634 * the last segment. 
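 *
 * A rough sketch of the clamping above, with hypothetical numbers:
 * if the NIC reports if_hw_tsomax = 65535 while hdrlen = 52 and
 * max_linkhdr = 16, at most 65467 bytes ride in one TSO burst, and
 * when more data remains in the socket buffer the tail is trimmed
 * back to a multiple of (t_maxseg - optlen) so only the final pass
 * may emit a short segment.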
17635 */ 17636 if (tp->t_flags & TF_NEEDFIN) { 17637 sendalot = 4; 17638 } 17639 } else { 17640 mark = 5; 17641 if (optlen + ipoptlen >= tp->t_maxseg) { 17642 /* 17643 * Since we don't have enough space to put 17644 * the IP header chain and the TCP header in 17645 * one packet as required by RFC 7112, don't 17646 * send it. Also ensure that at least one 17647 * byte of the payload can be put into the 17648 * TCP segment. 17649 */ 17650 SOCKBUF_UNLOCK(&so->so_snd); 17651 error = EMSGSIZE; 17652 sack_rxmit = 0; 17653 goto out; 17654 } 17655 len = tp->t_maxseg - optlen - ipoptlen; 17656 sendalot = 5; 17657 } 17658 } else { 17659 tso = 0; 17660 mark = 6; 17661 } 17662 KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, 17663 ("%s: len > IP_MAXPACKET", __func__)); 17664 #ifdef DIAGNOSTIC 17665 #ifdef INET6 17666 if (max_linkhdr + hdrlen > MCLBYTES) 17667 #else 17668 if (max_linkhdr + hdrlen > MHLEN) 17669 #endif 17670 panic("tcphdr too big"); 17671 #endif 17672 17673 /* 17674 * This KASSERT is here to catch edge cases at a well defined place. 17675 * Before, those had triggered (random) panic conditions further 17676 * down. 17677 */ 17678 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 17679 if ((len == 0) && 17680 (flags & TH_FIN) && 17681 (sbused(sb))) { 17682 /* 17683 * We have outstanding data, don't send a fin by itself!. 17684 */ 17685 goto just_return; 17686 } 17687 /* 17688 * Grab a header mbuf, attaching a copy of data to be transmitted, 17689 * and initialize the header from the template for sends on this 17690 * connection. 17691 */ 17692 hw_tls = (sb->sb_flags & SB_TLS_IFNET) != 0; 17693 if (len) { 17694 uint32_t max_val; 17695 uint32_t moff; 17696 17697 if (rack->r_ctl.rc_pace_max_segs) 17698 max_val = rack->r_ctl.rc_pace_max_segs; 17699 else if (rack->rc_user_set_max_segs) 17700 max_val = rack->rc_user_set_max_segs * segsiz; 17701 else 17702 max_val = len; 17703 /* 17704 * We allow a limit on sending with hptsi. 17705 */ 17706 if (len > max_val) { 17707 mark = 7; 17708 len = max_val; 17709 } 17710 #ifdef INET6 17711 if (MHLEN < hdrlen + max_linkhdr) 17712 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 17713 else 17714 #endif 17715 m = m_gethdr(M_NOWAIT, MT_DATA); 17716 17717 if (m == NULL) { 17718 SOCKBUF_UNLOCK(sb); 17719 error = ENOBUFS; 17720 sack_rxmit = 0; 17721 goto out; 17722 } 17723 m->m_data += max_linkhdr; 17724 m->m_len = hdrlen; 17725 17726 /* 17727 * Start the m_copy functions from the closest mbuf to the 17728 * sb_offset in the socket buffer chain. 17729 */ 17730 mb = sbsndptr_noadv(sb, sb_offset, &moff); 17731 s_mb = mb; 17732 s_moff = moff; 17733 if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) { 17734 m_copydata(mb, moff, (int)len, 17735 mtod(m, caddr_t)+hdrlen); 17736 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 17737 sbsndptr_adv(sb, mb, len); 17738 m->m_len += len; 17739 } else { 17740 struct sockbuf *msb; 17741 17742 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 17743 msb = NULL; 17744 else 17745 msb = sb; 17746 m->m_next = tcp_m_copym( 17747 mb, moff, &len, 17748 if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, 17749 ((rsm == NULL) ? hw_tls : 0) 17750 #ifdef NETFLIX_COPY_ARGS 17751 , &filled_all 17752 #endif 17753 ); 17754 if (len <= (tp->t_maxseg - optlen)) { 17755 /* 17756 * Must have ran out of mbufs for the copy 17757 * shorten it to no longer need tso. Lets 17758 * not put on sendalot since we are low on 17759 * mbufs. 
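 * (len was handed to tcp_m_copym() by reference, so under mbuf
 * pressure it can come back shorter than the burst we sized for
 * TSO; once it fits in a single segment, plain segmentation is the
 * cheaper path.)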
17760 */ 17761 tso = 0; 17762 } 17763 if (m->m_next == NULL) { 17764 SOCKBUF_UNLOCK(sb); 17765 (void)m_free(m); 17766 error = ENOBUFS; 17767 sack_rxmit = 0; 17768 goto out; 17769 } 17770 } 17771 if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { 17772 if (rsm && (rsm->r_flags & RACK_TLP)) { 17773 /* 17774 * TLP should not count in retran count, but 17775 * in its own bin 17776 */ 17777 counter_u64_add(rack_tlp_retran, 1); 17778 counter_u64_add(rack_tlp_retran_bytes, len); 17779 } else { 17780 tp->t_sndrexmitpack++; 17781 KMOD_TCPSTAT_INC(tcps_sndrexmitpack); 17782 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len); 17783 } 17784 #ifdef STATS 17785 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 17786 len); 17787 #endif 17788 } else { 17789 KMOD_TCPSTAT_INC(tcps_sndpack); 17790 KMOD_TCPSTAT_ADD(tcps_sndbyte, len); 17791 #ifdef STATS 17792 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 17793 len); 17794 #endif 17795 } 17796 /* 17797 * If we're sending everything we've got, set PUSH. (This 17798 * will keep happy those implementations which only give 17799 * data to the user when a buffer fills or a PUSH comes in.) 17800 */ 17801 if (sb_offset + len == sbused(sb) && 17802 sbused(sb) && 17803 !(flags & TH_SYN)) { 17804 flags |= TH_PUSH; 17805 add_flag |= RACK_HAD_PUSH; 17806 } 17807 17808 SOCKBUF_UNLOCK(sb); 17809 } else { 17810 SOCKBUF_UNLOCK(sb); 17811 if (tp->t_flags & TF_ACKNOW) 17812 KMOD_TCPSTAT_INC(tcps_sndacks); 17813 else if (flags & (TH_SYN | TH_FIN | TH_RST)) 17814 KMOD_TCPSTAT_INC(tcps_sndctrl); 17815 else 17816 KMOD_TCPSTAT_INC(tcps_sndwinup); 17817 17818 m = m_gethdr(M_NOWAIT, MT_DATA); 17819 if (m == NULL) { 17820 error = ENOBUFS; 17821 sack_rxmit = 0; 17822 goto out; 17823 } 17824 #ifdef INET6 17825 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && 17826 MHLEN >= hdrlen) { 17827 M_ALIGN(m, hdrlen); 17828 } else 17829 #endif 17830 m->m_data += max_linkhdr; 17831 m->m_len = hdrlen; 17832 } 17833 SOCKBUF_UNLOCK_ASSERT(sb); 17834 m->m_pkthdr.rcvif = (struct ifnet *)0; 17835 #ifdef MAC 17836 mac_inpcb_create_mbuf(inp, m); 17837 #endif 17838 if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) { 17839 #ifdef INET6 17840 if (isipv6) { 17841 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 17842 } else 17843 #endif /* INET6 */ 17844 { 17845 #ifdef INET 17846 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 17847 #endif 17848 } 17849 th = rack->r_ctl.fsb.th; 17850 udp = rack->r_ctl.fsb.udp; 17851 if (udp) { 17852 #ifdef INET6 17853 if (isipv6) { 17854 ulen = hdrlen + len - sizeof(struct ip6_hdr); 17855 } else 17856 #endif /* INET6 */ 17857 { 17858 #ifdef INET 17859 ulen = hdrlen + len - sizeof(struct ip); 17860 udp->uh_ulen = htons(ulen); 17861 #endif 17862 } 17863 } 17864 } else { 17865 #ifdef INET6 17866 if (isipv6) { 17867 ip6 = mtod(m, struct ip6_hdr *); 17868 if (tp->t_port) { 17869 udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr)); 17870 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 17871 udp->uh_dport = tp->t_port; 17872 ulen = hdrlen + len - sizeof(struct ip6_hdr); 17873 udp->uh_ulen = htons(ulen); 17874 th = (struct tcphdr *)(udp + 1); 17875 } else 17876 th = (struct tcphdr *)(ip6 + 1); 17877 tcpip_fillheaders(inp, tp->t_port, ip6, th); 17878 } else 17879 #endif /* INET6 */ 17880 { 17881 #ifdef INET 17882 ip = mtod(m, struct ip *); 17883 #ifdef TCPDEBUG 17884 ipov = (struct ipovly *)ip; 17885 #endif 17886 if (tp->t_port) { 17887 udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip)); 17888 udp->uh_sport = 
htons(V_tcp_udp_tunneling_port); 17889 udp->uh_dport = tp->t_port; 17890 ulen = hdrlen + len - sizeof(struct ip); 17891 udp->uh_ulen = htons(ulen); 17892 th = (struct tcphdr *)(udp + 1); 17893 } else 17894 th = (struct tcphdr *)(ip + 1); 17895 tcpip_fillheaders(inp, tp->t_port, ip, th); 17896 #endif /* INET */ 17897 } 17898 } 17899 /* 17900 * Fill in fields, remembering maximum advertised window for use in 17901 * delaying messages about window sizes. If resending a FIN, be sure 17902 * not to use a new sequence number. 17903 */ 17904 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 17905 tp->snd_nxt == tp->snd_max) 17906 tp->snd_nxt--; 17907 /* 17908 * If we are starting a connection, send ECN setup SYN packet. If we 17909 * are on a retransmit, we may resend those bits a number of times 17910 * as per RFC 3168. 17911 */ 17912 if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) { 17913 if (tp->t_rxtshift >= 1) { 17914 if (tp->t_rxtshift <= V_tcp_ecn_maxretries) 17915 flags |= TH_ECE | TH_CWR; 17916 } else 17917 flags |= TH_ECE | TH_CWR; 17918 } 17919 /* Handle parallel SYN for ECN */ 17920 if ((tp->t_state == TCPS_SYN_RECEIVED) && 17921 (tp->t_flags2 & TF2_ECN_SND_ECE)) { 17922 flags |= TH_ECE; 17923 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 17924 } 17925 if (TCPS_HAVEESTABLISHED(tp->t_state) && 17926 (tp->t_flags2 & TF2_ECN_PERMIT)) { 17927 /* 17928 * If the peer has ECN, mark data packets with ECN capable 17929 * transmission (ECT). Ignore pure ack packets, 17930 * retransmissions. 17931 */ 17932 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && 17933 (sack_rxmit == 0)) { 17934 #ifdef INET6 17935 if (isipv6) { 17936 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); 17937 } else 17938 #endif 17939 { 17940 #ifdef INET 17941 ip->ip_tos |= IPTOS_ECN_ECT0; 17942 #endif 17943 } 17944 KMOD_TCPSTAT_INC(tcps_ecn_ect0); 17945 /* 17946 * Reply with proper ECN notifications. 17947 * Only set CWR on new data segments. 17948 */ 17949 if (tp->t_flags2 & TF2_ECN_SND_CWR) { 17950 flags |= TH_CWR; 17951 tp->t_flags2 &= ~TF2_ECN_SND_CWR; 17952 } 17953 } 17954 if (tp->t_flags2 & TF2_ECN_SND_ECE) 17955 flags |= TH_ECE; 17956 } 17957 /* 17958 * If we are doing retransmissions, then snd_nxt will not reflect 17959 * the first unsent octet. For ACK only packets, we do not want the 17960 * sequence number of the retransmitted packet, we want the sequence 17961 * number of the next unsent octet. So, if there is no data (and no 17962 * SYN or FIN), use snd_max instead of snd_nxt when filling in 17963 * ti_seq. But if we are in persist state, snd_max might reflect 17964 * one byte beyond the right edge of the window, so use snd_nxt in 17965 * that case, since we know we aren't doing a retransmission. 17966 * (retransmit and persist are mutually exclusive...) 17967 */ 17968 if (sack_rxmit == 0) { 17969 if (len || (flags & (TH_SYN | TH_FIN))) { 17970 th->th_seq = htonl(tp->snd_nxt); 17971 rack_seq = tp->snd_nxt; 17972 } else { 17973 th->th_seq = htonl(tp->snd_max); 17974 rack_seq = tp->snd_max; 17975 } 17976 } else { 17977 th->th_seq = htonl(rsm->r_start); 17978 rack_seq = rsm->r_start; 17979 } 17980 th->th_ack = htonl(tp->rcv_nxt); 17981 th->th_flags = flags; 17982 /* 17983 * Calculate receive window. Don't shrink window, but avoid silly 17984 * window syndrome. 17985 * If a RST segment is sent, advertise a window of zero. 
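 *
 * With hypothetical sizes, sb_hiwat = 65536 and segsiz = 1460: any
 * available space below min(65536 / 4, 1460) = 1460 bytes is
 * advertised as zero rather than dribbled out, and the window is
 * never pulled back below what rcv_adv has already promised the
 * peer.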
17986 */ 17987 if (flags & TH_RST) { 17988 recwin = 0; 17989 } else { 17990 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && 17991 recwin < (long)segsiz) { 17992 recwin = 0; 17993 } 17994 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && 17995 recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) 17996 recwin = (long)(tp->rcv_adv - tp->rcv_nxt); 17997 } 17998 17999 /* 18000 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or 18001 * <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> case is 18002 * handled in syncache. 18003 */ 18004 if (flags & TH_SYN) 18005 th->th_win = htons((u_short) 18006 (min(sbspace(&so->so_rcv), TCP_MAXWIN))); 18007 else { 18008 /* Avoid shrinking window with window scaling. */ 18009 recwin = roundup2(recwin, 1 << tp->rcv_scale); 18010 th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); 18011 } 18012 /* 18013 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 18014 * window. This may cause the remote transmitter to stall. This 18015 * flag tells soreceive() to disable delayed acknowledgements when 18016 * draining the buffer. This can occur if the receiver is 18017 * attempting to read more data than can be buffered prior to 18018 * transmitting on the connection. 18019 */ 18020 if (th->th_win == 0) { 18021 tp->t_sndzerowin++; 18022 tp->t_flags |= TF_RXWIN0SENT; 18023 } else 18024 tp->t_flags &= ~TF_RXWIN0SENT; 18025 tp->snd_up = tp->snd_una; /* drag it along, its deprecated */ 18026 /* Now are we using fsb?, if so copy the template data to the mbuf */ 18027 if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) { 18028 uint8_t *cpto; 18029 18030 cpto = mtod(m, uint8_t *); 18031 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); 18032 /* 18033 * We have just copied in: 18034 * IP/IP6 18035 * <optional udphdr> 18036 * tcphdr (no options) 18037 * 18038 * We need to grab the correct pointers into the mbuf 18039 * for both the tcp header, and possibly the udp header (if tunneling). 18040 * We do this by using the offset in the copy buffer and adding it 18041 * to the mbuf base pointer (cpto). 18042 */ 18043 #ifdef INET6 18044 if (isipv6) 18045 ip6 = mtod(m, struct ip6_hdr *); 18046 else 18047 #endif /* INET6 */ 18048 #ifdef INET 18049 ip = mtod(m, struct ip *); 18050 #endif /* INET */ 18051 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr)); 18052 /* If we have a udp header lets set it into the mbuf as well */ 18053 if (udp) 18054 udp = (struct udphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.udp - rack->r_ctl.fsb.tcp_ip_hdr)); 18055 } 18056 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 18057 if (to.to_flags & TOF_SIGNATURE) { 18058 /* 18059 * Calculate MD5 signature and put it into the place 18060 * determined before. 18061 * NOTE: since TCP options buffer doesn't point into 18062 * mbuf's data, calculate offset and use it. 18063 */ 18064 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, 18065 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { 18066 /* 18067 * Do not send segment if the calculation of MD5 18068 * digest has failed. 18069 */ 18070 goto out; 18071 } 18072 } 18073 #endif 18074 if (optlen) { 18075 bcopy(opt, th + 1, optlen); 18076 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 18077 } 18078 /* 18079 * Put TCP length in extended header, and then checksum extended 18080 * header and data. 
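 *
 * Only the pseudo-header portion is summed in software below;
 * csum_flags and csum_data tell the driver (or the software
 * fallback) where to fold in the one's-complement sum over the
 * TCP/UDP header and payload, so in_pseudo()/in6_cksum_pseudo()
 * cover just addresses, protocol and length.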
 */
        m->m_pkthdr.len = hdrlen + len; /* in6_cksum() needs this */
#ifdef INET6
        if (isipv6) {
                /*
                 * ip6_plen does not need to be filled now, and will be
                 * filled in ip6_output.
                 */
                if (tp->t_port) {
                        m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
                        m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
                        udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
                        th->th_sum = htons(0);
                        UDPSTAT_INC(udps_opackets);
                } else {
                        m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
                        m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
                        th->th_sum = in6_cksum_pseudo(ip6,
                            sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
                            0);
                }
        }
#endif
#if defined(INET6) && defined(INET)
        else
#endif
#ifdef INET
        {
                if (tp->t_port) {
                        m->m_pkthdr.csum_flags = CSUM_UDP;
                        m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
                        udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
                            ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
                        th->th_sum = htons(0);
                        UDPSTAT_INC(udps_opackets);
                } else {
                        m->m_pkthdr.csum_flags = CSUM_TCP;
                        m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
                        th->th_sum = in_pseudo(ip->ip_src.s_addr,
                            ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
                            IPPROTO_TCP + len + optlen));
                }
                /* IP version must be set here for ipv4/ipv6 checking later */
                KASSERT(ip->ip_v == IPVERSION,
                    ("%s: IP version incorrect: %d", __func__, ip->ip_v));
        }
#endif
        /*
         * Enable TSO and specify the size of the segments. The TCP pseudo
         * header checksum is always provided. XXX: Fixme: This is currently
         * not the case for IPv6.
         */
        if (tso) {
                KASSERT(len > tp->t_maxseg - optlen,
                    ("%s: len <= tso_segsz", __func__));
                m->m_pkthdr.csum_flags |= CSUM_TSO;
                m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
        }
        KASSERT(len + hdrlen == m_length(m, NULL),
            ("%s: mbuf chain different than expected: %d + %u != %u",
            __func__, len, hdrlen, m_length(m, NULL)));

#ifdef TCP_HHOOK
        /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
        hhook_run_tcp_est_out(tp, th, &to, len, tso);
#endif
        /* We're getting ready to send; log now. */
        if (tp->t_logstate != TCP_LOG_STATE_OFF) {
                union tcp_log_stackspecific log;

                memset(&log.u_bbr, 0, sizeof(log.u_bbr));
                log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
                log.u_bbr.ininput = rack->rc_inp->inp_in_input;
                if (rack->rack_no_prr)
                        log.u_bbr.flex1 = 0;
                else
                        log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt;
                log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
                log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
                log.u_bbr.flex4 = orig_len;
                if (filled_all)
                        log.u_bbr.flex5 = 0x80000000;
                else
                        log.u_bbr.flex5 = 0;
                /* Save off the early/late values */
                log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
                log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
                log.u_bbr.bw_inuse = rack_get_bw(rack);
                if (rsm || sack_rxmit) {
                        if (doing_tlp)
                                log.u_bbr.flex8 = 2;
                        else
                                log.u_bbr.flex8 = 1;
                } else {
                        log.u_bbr.flex8 = 0;
                }
                log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm);
                log.u_bbr.flex7 = mark;
                log.u_bbr.flex7 <<= 8;
                log.u_bbr.flex7 |= pass;
                log.u_bbr.pkts_out = tp->t_maxseg;
                log.u_bbr.timeStamp = cts;
                log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
                log.u_bbr.lt_epoch = cwnd_to_use;
                log.u_bbr.delivered = sendalot;
                lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
                    len, &log, false, NULL, NULL, 0, &tv);
        } else
                lgb = NULL;

        /*
         * Fill in IP length and desired time to live and send to IP level.
         * There should be a better way to handle ttl and tos; we could keep
         * them in the template, but need a way to checksum without them.
         */
        /*
         * m->m_pkthdr.len should have been set before cksum calculation,
         * because in6_cksum() needs it.
         */
#ifdef INET6
        if (isipv6) {
                /*
                 * We separately set hoplimit for every segment, since the
                 * user might want to change the value via setsockopt. Also,
                 * desired default hop limit might be changed via Neighbor
                 * Discovery.
                 */
                rack->r_ctl.fsb.hoplimit = ip6->ip6_hlim = in6_selecthlim(inp, NULL);

                /*
                 * Set the packet size here for the benefit of DTrace
                 * probes. ip6_output() will set it properly; it's supposed
                 * to include the option header lengths as well.
                 */
                ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6));

                if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss)
                        tp->t_flags2 |= TF2_PLPMTU_PMTUD;
                else
                        tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;

                if (tp->t_state == TCPS_SYN_SENT)
                        TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th);

                TCP_PROBE5(send, NULL, tp, ip6, tp, th);
                /* TODO: IPv6 IP6TOS_ECT bit on */
                error = ip6_output(m,
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
                    inp->in6p_outputopts,
#else
                    NULL,
#endif
                    &inp->inp_route6,
                    ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0),
                    NULL, NULL, inp);

                if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL)
                        mtu = inp->inp_route6.ro_nh->nh_mtu;
        }
#endif /* INET6 */
#if defined(INET) && defined(INET6)
        else
#endif
#ifdef INET
        {
                ip->ip_len = htons(m->m_pkthdr.len);
#ifdef INET6
                if (inp->inp_vflag & INP_IPV6PROTO)
                        ip->ip_ttl = in6_selecthlim(inp, NULL);
#endif /* INET6 */
                rack->r_ctl.fsb.hoplimit = ip->ip_ttl;
                /*
                 * If we do path MTU discovery, then we set DF on every
                 * packet. This might not be the best thing to do according
                 * to RFC3390 Section 2. However, the tcp hostcache mitigates
                 * the problem so it affects only the first tcp connection
                 * with a host.
                 *
                 * NB: Don't set DF on small MTU/MSS to have a safe
                 * fallback.
                 */
                if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
                        tp->t_flags2 |= TF2_PLPMTU_PMTUD;
                        if (tp->t_port == 0 || len < V_tcp_minmss) {
                                ip->ip_off |= htons(IP_DF);
                        }
                } else {
                        tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
                }

                if (tp->t_state == TCPS_SYN_SENT)
                        TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);

                TCP_PROBE5(send, NULL, tp, ip, tp, th);

                error = ip_output(m,
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
                    inp->inp_options,
#else
                    NULL,
#endif
                    &inp->inp_route,
                    ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0,
                    inp);
                if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL)
                        mtu = inp->inp_route.ro_nh->nh_mtu;
        }
#endif /* INET */

out:
        if (lgb) {
                lgb->tlb_errno = error;
                lgb = NULL;
        }
        /*
         * In transmit state, time the transmission and arrange for the
         * retransmit. In persist state, just set snd_max.
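 *
 * The success path below is largely bookkeeping: classify the send
 * as recovery, congestion-avoidance or slow-start for the goodput
 * logging, drain rc_prr_sndcnt by what was just sent when PRR is in
 * play, drop DSACK blocks that have now been reported, and bump the
 * per-size output counters.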
18298 */ 18299 if (error == 0) { 18300 tcp_account_for_send(tp, len, (rsm != NULL), doing_tlp, hw_tls); 18301 rack->forced_ack = 0; /* If we send something zap the FA flag */ 18302 if (rsm && (doing_tlp == 0)) { 18303 /* Set we retransmitted */ 18304 rack->rc_gp_saw_rec = 1; 18305 } else { 18306 if (cwnd_to_use > tp->snd_ssthresh) { 18307 /* Set we sent in CA */ 18308 rack->rc_gp_saw_ca = 1; 18309 } else { 18310 /* Set we sent in SS */ 18311 rack->rc_gp_saw_ss = 1; 18312 } 18313 } 18314 if (doing_tlp && (rsm == NULL)) { 18315 /* Make sure new data TLP cnt is clear */ 18316 rack->r_ctl.rc_tlp_new_data = 0; 18317 } 18318 if (TCPS_HAVEESTABLISHED(tp->t_state) && 18319 (tp->t_flags & TF_SACK_PERMIT) && 18320 tp->rcv_numsacks > 0) 18321 tcp_clean_dsack_blocks(tp); 18322 tot_len_this_send += len; 18323 if (len == 0) 18324 counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); 18325 else if (len == 1) { 18326 counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1); 18327 } else if (len > 1) { 18328 int idx; 18329 18330 idx = (len / segsiz) + 3; 18331 if (idx >= TCP_MSS_ACCT_ATIMER) 18332 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 18333 else 18334 counter_u64_add(rack_out_size[idx], 1); 18335 } 18336 } 18337 if ((rack->rack_no_prr == 0) && 18338 sub_from_prr && 18339 (error == 0)) { 18340 if (rack->r_ctl.rc_prr_sndcnt >= len) 18341 rack->r_ctl.rc_prr_sndcnt -= len; 18342 else 18343 rack->r_ctl.rc_prr_sndcnt = 0; 18344 } 18345 sub_from_prr = 0; 18346 if (doing_tlp && (rsm == NULL)) { 18347 /* New send doing a TLP */ 18348 add_flag |= RACK_TLP; 18349 } 18350 rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, 18351 rack_to_usec_ts(&tv), 18352 rsm, add_flag, s_mb, s_moff, hw_tls); 18353 18354 18355 if ((error == 0) && 18356 (len > 0) && 18357 (tp->snd_una == tp->snd_max)) 18358 rack->r_ctl.rc_tlp_rxt_last_time = cts; 18359 { 18360 tcp_seq startseq = tp->snd_nxt; 18361 18362 /* Track our lost count */ 18363 if (rsm && (doing_tlp == 0)) 18364 rack->r_ctl.rc_loss_count += rsm->r_end - rsm->r_start; 18365 /* 18366 * Advance snd_nxt over sequence space of this segment. 18367 */ 18368 if (error) 18369 /* We don't log or do anything with errors */ 18370 goto nomore; 18371 if (doing_tlp == 0) { 18372 if (rsm == NULL) { 18373 /* 18374 * Not a retransmission of some 18375 * sort, new data is going out so 18376 * clear our TLP count and flag. 18377 */ 18378 rack->rc_tlp_in_progress = 0; 18379 rack->r_ctl.rc_tlp_cnt_out = 0; 18380 } 18381 } else { 18382 /* 18383 * We have just sent a TLP, mark that it is true 18384 * and make sure our in progress is set so we 18385 * continue to check the count. 18386 */ 18387 rack->rc_tlp_in_progress = 1; 18388 rack->r_ctl.rc_tlp_cnt_out++; 18389 } 18390 if (flags & (TH_SYN | TH_FIN)) { 18391 if (flags & TH_SYN) 18392 tp->snd_nxt++; 18393 if (flags & TH_FIN) { 18394 tp->snd_nxt++; 18395 tp->t_flags |= TF_SENTFIN; 18396 } 18397 } 18398 /* In the ENOBUFS case we do *not* update snd_max */ 18399 if (sack_rxmit) 18400 goto nomore; 18401 18402 tp->snd_nxt += len; 18403 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 18404 if (tp->snd_una == tp->snd_max) { 18405 /* 18406 * Update the time we just added data since 18407 * none was outstanding. 18408 */ 18409 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 18410 tp->t_acktime = ticks; 18411 } 18412 tp->snd_max = tp->snd_nxt; 18413 /* 18414 * Time this transmission if not a retransmission and 18415 * not currently timing anything. 
18416 * This is only relevant in case of switching back to 18417 * the base stack. 18418 */ 18419 if (tp->t_rtttime == 0) { 18420 tp->t_rtttime = ticks; 18421 tp->t_rtseq = startseq; 18422 KMOD_TCPSTAT_INC(tcps_segstimed); 18423 } 18424 if (len && 18425 ((tp->t_flags & TF_GPUTINPROG) == 0)) 18426 rack_start_gp_measurement(tp, rack, startseq, sb_offset); 18427 } 18428 /* 18429 * If we are doing FO we need to update the mbuf position and subtract 18430 * this happens when the peer sends us duplicate information and 18431 * we thus want to send a DSACK. 18432 * 18433 * XXXRRS: This brings to mind a ?, when we send a DSACK block is TSO 18434 * turned off? If not then we are going to echo multiple DSACK blocks 18435 * out (with the TSO), which we should not be doing. 18436 */ 18437 if (rack->r_fast_output && len) { 18438 if (rack->r_ctl.fsb.left_to_send > len) 18439 rack->r_ctl.fsb.left_to_send -= len; 18440 else 18441 rack->r_ctl.fsb.left_to_send = 0; 18442 if (rack->r_ctl.fsb.left_to_send < segsiz) 18443 rack->r_fast_output = 0; 18444 if (rack->r_fast_output) { 18445 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); 18446 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; 18447 } 18448 } 18449 } 18450 nomore: 18451 if (error) { 18452 rack->r_ctl.rc_agg_delayed = 0; 18453 rack->r_early = 0; 18454 rack->r_late = 0; 18455 rack->r_ctl.rc_agg_early = 0; 18456 SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */ 18457 /* 18458 * Failures do not advance the seq counter above. For the 18459 * case of ENOBUFS we will fall out and retry in 1ms with 18460 * the hpts. Everything else will just have to retransmit 18461 * with the timer. 18462 * 18463 * In any case, we do not want to loop around for another 18464 * send without a good reason. 18465 */ 18466 sendalot = 0; 18467 switch (error) { 18468 case EPERM: 18469 tp->t_softerror = error; 18470 #ifdef TCP_ACCOUNTING 18471 crtsc = get_cyclecount(); 18472 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18473 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 18474 } 18475 counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1); 18476 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18477 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 18478 } 18479 counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val)); 18480 sched_unpin(); 18481 #endif 18482 return (error); 18483 case ENOBUFS: 18484 /* 18485 * Pace us right away to retry in a some 18486 * time 18487 */ 18488 slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); 18489 if (rack->rc_enobuf < 0x7f) 18490 rack->rc_enobuf++; 18491 if (slot < (10 * HPTS_USEC_IN_MSEC)) 18492 slot = 10 * HPTS_USEC_IN_MSEC; 18493 if (rack->r_ctl.crte != NULL) { 18494 counter_u64_add(rack_saw_enobuf_hw, 1); 18495 tcp_rl_log_enobuf(rack->r_ctl.crte); 18496 } 18497 counter_u64_add(rack_saw_enobuf, 1); 18498 goto enobufs; 18499 case EMSGSIZE: 18500 /* 18501 * For some reason the interface we used initially 18502 * to send segments changed to another or lowered 18503 * its MTU. If TSO was active we either got an 18504 * interface without TSO capabilits or TSO was 18505 * turned off. If we obtained mtu from ip_output() 18506 * then update it and try again. 
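 *
 * For instance (hypothetical figures), if the route's MTU dropped
 * from 9000 to 1500 mid-connection, tcp_mss_update() re-derives
 * t_maxseg from the value ip_output() handed back and the goto
 * rebuilds this very segment at the smaller size; with no usable
 * mtu we just re-arm the hpts timer roughly 10ms out and return.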
18507 */ 18508 if (tso) 18509 tp->t_flags &= ~TF_TSO; 18510 if (mtu != 0) { 18511 tcp_mss_update(tp, -1, mtu, NULL, NULL); 18512 goto again; 18513 } 18514 slot = 10 * HPTS_USEC_IN_MSEC; 18515 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); 18516 #ifdef TCP_ACCOUNTING 18517 crtsc = get_cyclecount(); 18518 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18519 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 18520 } 18521 counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1); 18522 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18523 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 18524 } 18525 counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val)); 18526 sched_unpin(); 18527 #endif 18528 return (error); 18529 case ENETUNREACH: 18530 counter_u64_add(rack_saw_enetunreach, 1); 18531 case EHOSTDOWN: 18532 case EHOSTUNREACH: 18533 case ENETDOWN: 18534 if (TCPS_HAVERCVDSYN(tp->t_state)) { 18535 tp->t_softerror = error; 18536 } 18537 /* FALLTHROUGH */ 18538 default: 18539 slot = 10 * HPTS_USEC_IN_MSEC; 18540 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); 18541 #ifdef TCP_ACCOUNTING 18542 crtsc = get_cyclecount(); 18543 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18544 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 18545 } 18546 counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1); 18547 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18548 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 18549 } 18550 counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val)); 18551 sched_unpin(); 18552 #endif 18553 return (error); 18554 } 18555 } else { 18556 rack->rc_enobuf = 0; 18557 if (IN_FASTRECOVERY(tp->t_flags) && rsm) 18558 rack->r_ctl.retran_during_recovery += len; 18559 } 18560 KMOD_TCPSTAT_INC(tcps_sndtotal); 18561 18562 /* 18563 * Data sent (as far as we can tell). If this advertises a larger 18564 * window than any other segment, then remember the size of the 18565 * advertised window. Any pending ACK has now been sent. 18566 */ 18567 if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) 18568 tp->rcv_adv = tp->rcv_nxt + recwin; 18569 18570 tp->last_ack_sent = tp->rcv_nxt; 18571 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 18572 enobufs: 18573 if (sendalot) { 18574 /* Do we need to turn off sendalot? */ 18575 if (rack->r_ctl.rc_pace_max_segs && 18576 (tot_len_this_send >= rack->r_ctl.rc_pace_max_segs)) { 18577 /* We hit our max. */ 18578 sendalot = 0; 18579 } else if ((rack->rc_user_set_max_segs) && 18580 (tot_len_this_send >= (rack->rc_user_set_max_segs * segsiz))) { 18581 /* We hit the user defined max */ 18582 sendalot = 0; 18583 } 18584 } 18585 if ((error == 0) && (flags & TH_FIN)) 18586 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN); 18587 if (flags & TH_RST) { 18588 /* 18589 * We don't send again after sending a RST. 18590 */ 18591 slot = 0; 18592 sendalot = 0; 18593 if (error == 0) 18594 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 18595 } else if ((slot == 0) && (sendalot == 0) && tot_len_this_send) { 18596 /* 18597 * Get our pacing rate, if an error 18598 * occurred in sending (ENOBUF) we would 18599 * hit the else if with slot preset. Other 18600 * errors return. 18601 */ 18602 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz); 18603 } 18604 if (rsm && 18605 (rsm->r_flags & RACK_HAS_SYN) == 0 && 18606 rack->use_rack_rr) { 18607 /* Its a retransmit and we use the rack cheat? 
*/ 18608 if ((slot == 0) || 18609 (rack->rc_always_pace == 0) || 18610 (rack->r_rr_config == 1)) { 18611 /* 18612 * We have no pacing set or we 18613 * are using old-style rack or 18614 * we are overriden to use the old 1ms pacing. 18615 */ 18616 slot = rack->r_ctl.rc_min_to; 18617 } 18618 } 18619 /* We have sent clear the flag */ 18620 rack->r_ent_rec_ns = 0; 18621 if (rack->r_must_retran) { 18622 if (rsm) { 18623 rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start); 18624 if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) { 18625 /* 18626 * We have retransmitted all. 18627 */ 18628 rack->r_must_retran = 0; 18629 rack->r_ctl.rc_out_at_rto = 0; 18630 } 18631 } else if (SEQ_GEQ(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) { 18632 /* 18633 * Sending new data will also kill 18634 * the loop. 18635 */ 18636 rack->r_must_retran = 0; 18637 rack->r_ctl.rc_out_at_rto = 0; 18638 } 18639 } 18640 rack->r_ctl.fsb.recwin = recwin; 18641 if ((tp->t_flags & (TF_WASCRECOVERY|TF_WASFRECOVERY)) && 18642 SEQ_GT(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) { 18643 /* 18644 * We hit an RTO and now have past snd_max at the RTO 18645 * clear all the WAS flags. 18646 */ 18647 tp->t_flags &= ~(TF_WASCRECOVERY|TF_WASFRECOVERY); 18648 } 18649 if (slot) { 18650 /* set the rack tcb into the slot N */ 18651 counter_u64_add(rack_paced_segments, 1); 18652 if ((error == 0) && 18653 rack_use_rfo && 18654 ((flags & (TH_SYN|TH_FIN)) == 0) && 18655 (rsm == NULL) && 18656 (tp->snd_nxt == tp->snd_max) && 18657 (ipoptlen == 0) && 18658 (tp->rcv_numsacks == 0) && 18659 rack->r_fsb_inited && 18660 TCPS_HAVEESTABLISHED(tp->t_state) && 18661 (rack->r_must_retran == 0) && 18662 ((tp->t_flags & TF_NEEDFIN) == 0) && 18663 (len > 0) && (orig_len > 0) && 18664 (orig_len > len) && 18665 ((orig_len - len) >= segsiz) && 18666 ((optlen == 0) || 18667 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { 18668 /* We can send at least one more MSS using our fsb */ 18669 18670 rack->r_fast_output = 1; 18671 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); 18672 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; 18673 rack->r_ctl.fsb.tcp_flags = flags; 18674 rack->r_ctl.fsb.left_to_send = orig_len - len; 18675 if (hw_tls) 18676 rack->r_ctl.fsb.hw_tls = 1; 18677 else 18678 rack->r_ctl.fsb.hw_tls = 0; 18679 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))), 18680 ("rack:%p left_to_send:%u sbavail:%u out:%u", 18681 rack, rack->r_ctl.fsb.left_to_send, sbavail(sb), 18682 (tp->snd_max - tp->snd_una))); 18683 if (rack->r_ctl.fsb.left_to_send < segsiz) 18684 rack->r_fast_output = 0; 18685 else { 18686 if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una))) 18687 rack->r_ctl.fsb.rfo_apply_push = 1; 18688 else 18689 rack->r_ctl.fsb.rfo_apply_push = 0; 18690 } 18691 } else 18692 rack->r_fast_output = 0; 18693 rack_log_fsb(rack, tp, so, flags, 18694 ipoptlen, orig_len, len, error, 18695 (rsm == NULL), optlen, __LINE__, 2); 18696 } else if (sendalot) { 18697 int ret; 18698 18699 if (len) 18700 counter_u64_add(rack_unpaced_segments, 1); 18701 sack_rxmit = 0; 18702 if ((error == 0) && 18703 rack_use_rfo && 18704 ((flags & (TH_SYN|TH_FIN)) == 0) && 18705 (rsm == NULL) && 18706 (ipoptlen == 0) && 18707 (tp->rcv_numsacks == 0) && 18708 (tp->snd_nxt == tp->snd_max) && 18709 (rack->r_must_retran == 0) && 18710 rack->r_fsb_inited && 18711 TCPS_HAVEESTABLISHED(tp->t_state) && 18712 ((tp->t_flags & TF_NEEDFIN) == 0) && 18713 (len > 0) && (orig_len > 0) 
&& 18714 (orig_len > len) && 18715 ((orig_len - len) >= segsiz) && 18716 ((optlen == 0) || 18717 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { 18718 /* we can use fast_output for more */ 18719 18720 rack->r_fast_output = 1; 18721 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); 18722 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; 18723 rack->r_ctl.fsb.tcp_flags = flags; 18724 rack->r_ctl.fsb.left_to_send = orig_len - len; 18725 if (hw_tls) 18726 rack->r_ctl.fsb.hw_tls = 1; 18727 else 18728 rack->r_ctl.fsb.hw_tls = 0; 18729 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))), 18730 ("rack:%p left_to_send:%u sbavail:%u out:%u", 18731 rack, rack->r_ctl.fsb.left_to_send, sbavail(sb), 18732 (tp->snd_max - tp->snd_una))); 18733 if (rack->r_ctl.fsb.left_to_send < segsiz) { 18734 rack->r_fast_output = 0; 18735 } 18736 if (rack->r_fast_output) { 18737 if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una))) 18738 rack->r_ctl.fsb.rfo_apply_push = 1; 18739 else 18740 rack->r_ctl.fsb.rfo_apply_push = 0; 18741 rack_log_fsb(rack, tp, so, flags, 18742 ipoptlen, orig_len, len, error, 18743 (rsm == NULL), optlen, __LINE__, 3); 18744 error = 0; 18745 ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error); 18746 if (ret >= 0) 18747 return (ret); 18748 else if (error) 18749 goto nomore; 18750 18751 } 18752 } 18753 goto again; 18754 } else if (len) { 18755 counter_u64_add(rack_unpaced_segments, 1); 18756 } 18757 /* Assure when we leave that snd_nxt will point to top */ 18758 if (SEQ_GT(tp->snd_max, tp->snd_nxt)) 18759 tp->snd_nxt = tp->snd_max; 18760 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0); 18761 #ifdef TCP_ACCOUNTING 18762 crtsc = get_cyclecount() - ts_val; 18763 if (tot_len_this_send) { 18764 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18765 tp->tcp_cnt_counters[SND_OUT_DATA]++; 18766 } 18767 counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], 1); 18768 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18769 tp->tcp_proc_time[SND_OUT_DATA] += crtsc; 18770 } 18771 counter_u64_add(tcp_proc_time[SND_OUT_DATA], crtsc); 18772 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18773 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) /segsiz); 18774 } 18775 counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((tot_len_this_send + segsiz - 1) /segsiz)); 18776 } else { 18777 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18778 tp->tcp_cnt_counters[SND_OUT_ACK]++; 18779 } 18780 counter_u64_add(tcp_cnt_counters[SND_OUT_ACK], 1); 18781 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18782 tp->tcp_proc_time[SND_OUT_ACK] += crtsc; 18783 } 18784 counter_u64_add(tcp_proc_time[SND_OUT_ACK], crtsc); 18785 } 18786 sched_unpin(); 18787 #endif 18788 if (error == ENOBUFS) 18789 error = 0; 18790 return (error); 18791 } 18792 18793 static void 18794 rack_update_seg(struct tcp_rack *rack) 18795 { 18796 uint32_t orig_val; 18797 18798 orig_val = rack->r_ctl.rc_pace_max_segs; 18799 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 18800 if (orig_val != rack->r_ctl.rc_pace_max_segs) 18801 rack_log_pacing_delay_calc(rack, 0, 0, orig_val, 0, 0, 15, __LINE__, NULL, 0); 18802 } 18803 18804 static void 18805 rack_mtu_change(struct tcpcb *tp) 18806 { 18807 /* 18808 * The MSS may have changed 18809 */ 18810 struct tcp_rack *rack; 18811 18812 rack = (struct tcp_rack *)tp->t_fb_ptr; 18813 if (rack->r_ctl.rc_pace_min_segs != ctf_fixed_maxseg(tp)) { 18814 /* 18815 * The MTU has changed 
we need to resend everything 18816 * since all we have sent is lost. We first fix 18817 * up the mtu though. 18818 */ 18819 rack_set_pace_segments(tp, rack, __LINE__, NULL); 18820 /* We treat this like a full retransmit timeout without the cwnd adjustment */ 18821 rack_remxt_tmr(tp); 18822 rack->r_fast_output = 0; 18823 rack->r_ctl.rc_out_at_rto = ctf_flight_size(tp, 18824 rack->r_ctl.rc_sacked); 18825 rack->r_ctl.rc_snd_max_at_rto = tp->snd_max; 18826 rack->r_must_retran = 1; 18827 18828 } 18829 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 18830 /* We don't use snd_nxt to retransmit */ 18831 tp->snd_nxt = tp->snd_max; 18832 } 18833 18834 static int 18835 rack_set_profile(struct tcp_rack *rack, int prof) 18836 { 18837 int err = EINVAL; 18838 if (prof == 1) { 18839 /* pace_always=1 */ 18840 if (rack->rc_always_pace == 0) { 18841 if (tcp_can_enable_pacing() == 0) 18842 return (EBUSY); 18843 } 18844 rack->rc_always_pace = 1; 18845 if (rack->use_fixed_rate || rack->gp_ready) 18846 rack_set_cc_pacing(rack); 18847 rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; 18848 rack->rack_attempt_hdwr_pace = 0; 18849 /* cmpack=1 */ 18850 if (rack_use_cmp_acks) 18851 rack->r_use_cmp_ack = 1; 18852 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) && 18853 rack->r_use_cmp_ack) 18854 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; 18855 /* scwnd=1 */ 18856 rack->rack_enable_scwnd = 1; 18857 /* dynamic=100 */ 18858 rack->rc_gp_dyn_mul = 1; 18859 /* gp_inc_ca */ 18860 rack->r_ctl.rack_per_of_gp_ca = 100; 18861 /* rrr_conf=3 */ 18862 rack->r_rr_config = 3; 18863 /* npush=2 */ 18864 rack->r_ctl.rc_no_push_at_mrtt = 2; 18865 /* fillcw=1 */ 18866 rack->rc_pace_to_cwnd = 1; 18867 rack->rc_pace_fill_if_rttin_range = 0; 18868 rack->rtt_limit_mul = 0; 18869 /* noprr=1 */ 18870 rack->rack_no_prr = 1; 18871 /* lscwnd=1 */ 18872 rack->r_limit_scw = 1; 18873 /* gp_inc_rec */ 18874 rack->r_ctl.rack_per_of_gp_rec = 90; 18875 err = 0; 18876 18877 } else if (prof == 3) { 18878 /* Same as profile one execept fill_cw becomes 2 (less aggressive set) */ 18879 /* pace_always=1 */ 18880 if (rack->rc_always_pace == 0) { 18881 if (tcp_can_enable_pacing() == 0) 18882 return (EBUSY); 18883 } 18884 rack->rc_always_pace = 1; 18885 if (rack->use_fixed_rate || rack->gp_ready) 18886 rack_set_cc_pacing(rack); 18887 rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; 18888 rack->rack_attempt_hdwr_pace = 0; 18889 /* cmpack=1 */ 18890 if (rack_use_cmp_acks) 18891 rack->r_use_cmp_ack = 1; 18892 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) && 18893 rack->r_use_cmp_ack) 18894 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; 18895 /* scwnd=1 */ 18896 rack->rack_enable_scwnd = 1; 18897 /* dynamic=100 */ 18898 rack->rc_gp_dyn_mul = 1; 18899 /* gp_inc_ca */ 18900 rack->r_ctl.rack_per_of_gp_ca = 100; 18901 /* rrr_conf=3 */ 18902 rack->r_rr_config = 3; 18903 /* npush=2 */ 18904 rack->r_ctl.rc_no_push_at_mrtt = 2; 18905 /* fillcw=2 */ 18906 rack->rc_pace_to_cwnd = 1; 18907 rack->r_fill_less_agg = 1; 18908 rack->rc_pace_fill_if_rttin_range = 0; 18909 rack->rtt_limit_mul = 0; 18910 /* noprr=1 */ 18911 rack->rack_no_prr = 1; 18912 /* lscwnd=1 */ 18913 rack->r_limit_scw = 1; 18914 /* gp_inc_rec */ 18915 rack->r_ctl.rack_per_of_gp_rec = 90; 18916 err = 0; 18917 18918 18919 } else if (prof == 2) { 18920 /* cmpack=1 */ 18921 if (rack->rc_always_pace == 0) { 18922 if (tcp_can_enable_pacing() == 0) 18923 return (EBUSY); 18924 } 18925 rack->rc_always_pace = 1; 18926 if (rack->use_fixed_rate || rack->gp_ready) 18927 rack_set_cc_pacing(rack); 18928 rack->r_use_cmp_ack = 
1; 18929 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state)) 18930 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; 18931 /* pace_always=1 */ 18932 rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; 18933 /* scwnd=1 */ 18934 rack->rack_enable_scwnd = 1; 18935 /* dynamic=100 */ 18936 rack->rc_gp_dyn_mul = 1; 18937 rack->r_ctl.rack_per_of_gp_ca = 100; 18938 /* rrr_conf=3 */ 18939 rack->r_rr_config = 3; 18940 /* npush=2 */ 18941 rack->r_ctl.rc_no_push_at_mrtt = 2; 18942 /* fillcw=1 */ 18943 rack->rc_pace_to_cwnd = 1; 18944 rack->rc_pace_fill_if_rttin_range = 0; 18945 rack->rtt_limit_mul = 0; 18946 /* noprr=1 */ 18947 rack->rack_no_prr = 1; 18948 /* lscwnd=0 */ 18949 rack->r_limit_scw = 0; 18950 err = 0; 18951 } else if (prof == 0) { 18952 /* This changes things back to the default settings */ 18953 err = 0; 18954 if (rack->rc_always_pace) { 18955 tcp_decrement_paced_conn(); 18956 rack_undo_cc_pacing(rack); 18957 rack->rc_always_pace = 0; 18958 } 18959 if (rack_pace_every_seg && tcp_can_enable_pacing()) { 18960 rack->rc_always_pace = 1; 18961 if (rack->use_fixed_rate || rack->gp_ready) 18962 rack_set_cc_pacing(rack); 18963 } else 18964 rack->rc_always_pace = 0; 18965 if (rack_use_cmp_acks) 18966 rack->r_use_cmp_ack = 1; 18967 else 18968 rack->r_use_cmp_ack = 0; 18969 if (rack_disable_prr) 18970 rack->rack_no_prr = 1; 18971 else 18972 rack->rack_no_prr = 0; 18973 if (rack_gp_no_rec_chg) 18974 rack->rc_gp_no_rec_chg = 1; 18975 else 18976 rack->rc_gp_no_rec_chg = 0; 18977 if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack) { 18978 rack->r_mbuf_queue = 1; 18979 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state)) 18980 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; 18981 rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; 18982 } else { 18983 rack->r_mbuf_queue = 0; 18984 rack->rc_inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 18985 } 18986 if (rack_enable_shared_cwnd) 18987 rack->rack_enable_scwnd = 1; 18988 else 18989 rack->rack_enable_scwnd = 0; 18990 if (rack_do_dyn_mul) { 18991 /* When dynamic adjustment is on CA needs to start at 100% */ 18992 rack->rc_gp_dyn_mul = 1; 18993 if (rack_do_dyn_mul >= 100) 18994 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul; 18995 } else { 18996 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca; 18997 rack->rc_gp_dyn_mul = 0; 18998 } 18999 rack->r_rr_config = 0; 19000 rack->r_ctl.rc_no_push_at_mrtt = 0; 19001 rack->rc_pace_to_cwnd = 0; 19002 rack->rc_pace_fill_if_rttin_range = 0; 19003 rack->rtt_limit_mul = 0; 19004 19005 if (rack_enable_hw_pacing) 19006 rack->rack_hdw_pace_ena = 1; 19007 else 19008 rack->rack_hdw_pace_ena = 0; 19009 if (rack_disable_prr) 19010 rack->rack_no_prr = 1; 19011 else 19012 rack->rack_no_prr = 0; 19013 if (rack_limits_scwnd) 19014 rack->r_limit_scw = 1; 19015 else 19016 rack->r_limit_scw = 0; 19017 err = 0; 19018 } 19019 return (err); 19020 } 19021 19022 static int 19023 rack_add_deferred_option(struct tcp_rack *rack, int sopt_name, uint64_t loptval) 19024 { 19025 struct deferred_opt_list *dol; 19026 19027 dol = malloc(sizeof(struct deferred_opt_list), 19028 M_TCPFSB, M_NOWAIT|M_ZERO); 19029 if (dol == NULL) { 19030 /* 19031 * No space yikes -- fail out.. 
19032 */ 19033 return (0); 19034 } 19035 dol->optname = sopt_name; 19036 dol->optval = loptval; 19037 TAILQ_INSERT_TAIL(&rack->r_ctl.opt_list, dol, next); 19038 return (1); 19039 } 19040 19041 static int 19042 rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, 19043 uint32_t optval, uint64_t loptval) 19044 { 19045 struct epoch_tracker et; 19046 struct sockopt sopt; 19047 struct cc_newreno_opts opt; 19048 uint64_t val; 19049 int error = 0; 19050 uint16_t ca, ss; 19051 19052 switch (sopt_name) { 19053 19054 case TCP_RACK_PACING_BETA: 19055 RACK_OPTS_INC(tcp_rack_beta); 19056 if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) { 19057 /* This only works for newreno. */ 19058 error = EINVAL; 19059 break; 19060 } 19061 if (rack->rc_pacing_cc_set) { 19062 /* 19063 * Set them into the real CC module 19064 * whats in the rack pcb is the old values 19065 * to be used on restoral/ 19066 */ 19067 sopt.sopt_dir = SOPT_SET; 19068 opt.name = CC_NEWRENO_BETA; 19069 opt.val = optval; 19070 if (CC_ALGO(tp)->ctl_output != NULL) 19071 error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt); 19072 else { 19073 error = ENOENT; 19074 break; 19075 } 19076 } else { 19077 /* 19078 * Not pacing yet so set it into our local 19079 * rack pcb storage. 19080 */ 19081 rack->r_ctl.rc_saved_beta.beta = optval; 19082 } 19083 break; 19084 case TCP_RACK_TIMER_SLOP: 19085 RACK_OPTS_INC(tcp_rack_timer_slop); 19086 rack->r_ctl.timer_slop = optval; 19087 if (rack->rc_tp->t_srtt) { 19088 /* 19089 * If we have an SRTT lets update t_rxtcur 19090 * to have the new slop. 19091 */ 19092 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 19093 rack_rto_min, rack_rto_max, 19094 rack->r_ctl.timer_slop); 19095 } 19096 break; 19097 case TCP_RACK_PACING_BETA_ECN: 19098 RACK_OPTS_INC(tcp_rack_beta_ecn); 19099 if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) { 19100 /* This only works for newreno. */ 19101 error = EINVAL; 19102 break; 19103 } 19104 if (rack->rc_pacing_cc_set) { 19105 /* 19106 * Set them into the real CC module 19107 * whats in the rack pcb is the old values 19108 * to be used on restoral/ 19109 */ 19110 sopt.sopt_dir = SOPT_SET; 19111 opt.name = CC_NEWRENO_BETA_ECN; 19112 opt.val = optval; 19113 if (CC_ALGO(tp)->ctl_output != NULL) 19114 error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt); 19115 else 19116 error = ENOENT; 19117 } else { 19118 /* 19119 * Not pacing yet so set it into our local 19120 * rack pcb storage. 
19121 */ 19122 rack->r_ctl.rc_saved_beta.beta_ecn = optval; 19123 rack->r_ctl.rc_saved_beta.newreno_flags = CC_NEWRENO_BETA_ECN; 19124 } 19125 break; 19126 case TCP_DEFER_OPTIONS: 19127 RACK_OPTS_INC(tcp_defer_opt); 19128 if (optval) { 19129 if (rack->gp_ready) { 19130 /* Too late */ 19131 error = EINVAL; 19132 break; 19133 } 19134 rack->defer_options = 1; 19135 } else 19136 rack->defer_options = 0; 19137 break; 19138 case TCP_RACK_MEASURE_CNT: 19139 RACK_OPTS_INC(tcp_rack_measure_cnt); 19140 if (optval && (optval <= 0xff)) { 19141 rack->r_ctl.req_measurements = optval; 19142 } else 19143 error = EINVAL; 19144 break; 19145 case TCP_REC_ABC_VAL: 19146 RACK_OPTS_INC(tcp_rec_abc_val); 19147 if (optval > 0) 19148 rack->r_use_labc_for_rec = 1; 19149 else 19150 rack->r_use_labc_for_rec = 0; 19151 break; 19152 case TCP_RACK_ABC_VAL: 19153 RACK_OPTS_INC(tcp_rack_abc_val); 19154 if ((optval > 0) && (optval < 255)) 19155 rack->rc_labc = optval; 19156 else 19157 error = EINVAL; 19158 break; 19159 case TCP_HDWR_UP_ONLY: 19160 RACK_OPTS_INC(tcp_pacing_up_only); 19161 if (optval) 19162 rack->r_up_only = 1; 19163 else 19164 rack->r_up_only = 0; 19165 break; 19166 case TCP_PACING_RATE_CAP: 19167 RACK_OPTS_INC(tcp_pacing_rate_cap); 19168 rack->r_ctl.bw_rate_cap = loptval; 19169 break; 19170 case TCP_RACK_PROFILE: 19171 RACK_OPTS_INC(tcp_profile); 19172 error = rack_set_profile(rack, optval); 19173 break; 19174 case TCP_USE_CMP_ACKS: 19175 RACK_OPTS_INC(tcp_use_cmp_acks); 19176 if ((optval == 0) && (rack->rc_inp->inp_flags2 & INP_MBUF_ACKCMP)) { 19177 /* You can't turn it off once its on! */ 19178 error = EINVAL; 19179 } else if ((optval == 1) && (rack->r_use_cmp_ack == 0)) { 19180 rack->r_use_cmp_ack = 1; 19181 rack->r_mbuf_queue = 1; 19182 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 19183 } 19184 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) 19185 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; 19186 break; 19187 case TCP_SHARED_CWND_TIME_LIMIT: 19188 RACK_OPTS_INC(tcp_lscwnd); 19189 if (optval) 19190 rack->r_limit_scw = 1; 19191 else 19192 rack->r_limit_scw = 0; 19193 break; 19194 case TCP_RACK_PACE_TO_FILL: 19195 RACK_OPTS_INC(tcp_fillcw); 19196 if (optval == 0) 19197 rack->rc_pace_to_cwnd = 0; 19198 else { 19199 rack->rc_pace_to_cwnd = 1; 19200 if (optval > 1) 19201 rack->r_fill_less_agg = 1; 19202 } 19203 if ((optval >= rack_gp_rtt_maxmul) && 19204 rack_gp_rtt_maxmul && 19205 (optval < 0xf)) { 19206 rack->rc_pace_fill_if_rttin_range = 1; 19207 rack->rtt_limit_mul = optval; 19208 } else { 19209 rack->rc_pace_fill_if_rttin_range = 0; 19210 rack->rtt_limit_mul = 0; 19211 } 19212 break; 19213 case TCP_RACK_NO_PUSH_AT_MAX: 19214 RACK_OPTS_INC(tcp_npush); 19215 if (optval == 0) 19216 rack->r_ctl.rc_no_push_at_mrtt = 0; 19217 else if (optval < 0xff) 19218 rack->r_ctl.rc_no_push_at_mrtt = optval; 19219 else 19220 error = EINVAL; 19221 break; 19222 case TCP_SHARED_CWND_ENABLE: 19223 RACK_OPTS_INC(tcp_rack_scwnd); 19224 if (optval == 0) 19225 rack->rack_enable_scwnd = 0; 19226 else 19227 rack->rack_enable_scwnd = 1; 19228 break; 19229 case TCP_RACK_MBUF_QUEUE: 19230 /* Now do we use the LRO mbuf-queue feature */ 19231 RACK_OPTS_INC(tcp_rack_mbufq); 19232 if (optval || rack->r_use_cmp_ack) 19233 rack->r_mbuf_queue = 1; 19234 else 19235 rack->r_mbuf_queue = 0; 19236 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 19237 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 19238 else 19239 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 19240 break; 19241 case 
TCP_RACK_NONRXT_CFG_RATE: 19242 RACK_OPTS_INC(tcp_rack_cfg_rate); 19243 if (optval == 0) 19244 rack->rack_rec_nonrxt_use_cr = 0; 19245 else 19246 rack->rack_rec_nonrxt_use_cr = 1; 19247 break; 19248 case TCP_NO_PRR: 19249 RACK_OPTS_INC(tcp_rack_noprr); 19250 if (optval == 0) 19251 rack->rack_no_prr = 0; 19252 else if (optval == 1) 19253 rack->rack_no_prr = 1; 19254 else if (optval == 2) 19255 rack->no_prr_addback = 1; 19256 else 19257 error = EINVAL; 19258 break; 19259 case TCP_TIMELY_DYN_ADJ: 19260 RACK_OPTS_INC(tcp_timely_dyn); 19261 if (optval == 0) 19262 rack->rc_gp_dyn_mul = 0; 19263 else { 19264 rack->rc_gp_dyn_mul = 1; 19265 if (optval >= 100) { 19266 /* 19267 * If the user sets something 100 or more 19268 * its the gp_ca value. 19269 */ 19270 rack->r_ctl.rack_per_of_gp_ca = optval; 19271 } 19272 } 19273 break; 19274 case TCP_RACK_DO_DETECTION: 19275 RACK_OPTS_INC(tcp_rack_do_detection); 19276 if (optval == 0) 19277 rack->do_detection = 0; 19278 else 19279 rack->do_detection = 1; 19280 break; 19281 case TCP_RACK_TLP_USE: 19282 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) { 19283 error = EINVAL; 19284 break; 19285 } 19286 RACK_OPTS_INC(tcp_tlp_use); 19287 rack->rack_tlp_threshold_use = optval; 19288 break; 19289 case TCP_RACK_TLP_REDUCE: 19290 /* RACK TLP cwnd reduction (bool) */ 19291 RACK_OPTS_INC(tcp_rack_tlp_reduce); 19292 rack->r_ctl.rc_tlp_cwnd_reduce = optval; 19293 break; 19294 /* Pacing related ones */ 19295 case TCP_RACK_PACE_ALWAYS: 19296 /* 19297 * zero is old rack method, 1 is new 19298 * method using a pacing rate. 19299 */ 19300 RACK_OPTS_INC(tcp_rack_pace_always); 19301 if (optval > 0) { 19302 if (rack->rc_always_pace) { 19303 error = EALREADY; 19304 break; 19305 } else if (tcp_can_enable_pacing()) { 19306 rack->rc_always_pace = 1; 19307 if (rack->use_fixed_rate || rack->gp_ready) 19308 rack_set_cc_pacing(rack); 19309 } 19310 else { 19311 error = ENOSPC; 19312 break; 19313 } 19314 } else { 19315 if (rack->rc_always_pace) { 19316 tcp_decrement_paced_conn(); 19317 rack->rc_always_pace = 0; 19318 rack_undo_cc_pacing(rack); 19319 } 19320 } 19321 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 19322 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 19323 else 19324 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 19325 /* A rate may be set irate or other, if so set seg size */ 19326 rack_update_seg(rack); 19327 break; 19328 case TCP_BBR_RACK_INIT_RATE: 19329 RACK_OPTS_INC(tcp_initial_rate); 19330 val = optval; 19331 /* Change from kbits per second to bytes per second */ 19332 val *= 1000; 19333 val /= 8; 19334 rack->r_ctl.init_rate = val; 19335 if (rack->rc_init_win != rack_default_init_window) { 19336 uint32_t win, snt; 19337 19338 /* 19339 * Options don't always get applied 19340 * in the order you think. So in order 19341 * to assure we update a cwnd we need 19342 * to check and see if we are still 19343 * where we should raise the cwnd. 
19344 */ 19345 win = rc_init_window(rack); 19346 if (SEQ_GT(tp->snd_max, tp->iss)) 19347 snt = tp->snd_max - tp->iss; 19348 else 19349 snt = 0; 19350 if ((snt < win) && 19351 (tp->snd_cwnd < win)) 19352 tp->snd_cwnd = win; 19353 } 19354 if (rack->rc_always_pace) 19355 rack_update_seg(rack); 19356 break; 19357 case TCP_BBR_IWINTSO: 19358 RACK_OPTS_INC(tcp_initial_win); 19359 if (optval && (optval <= 0xff)) { 19360 uint32_t win, snt; 19361 19362 rack->rc_init_win = optval; 19363 win = rc_init_window(rack); 19364 if (SEQ_GT(tp->snd_max, tp->iss)) 19365 snt = tp->snd_max - tp->iss; 19366 else 19367 snt = 0; 19368 if ((snt < win) && 19369 (tp->t_srtt | 19370 #ifdef NETFLIX_PEAKRATE 19371 tp->t_maxpeakrate | 19372 #endif 19373 rack->r_ctl.init_rate)) { 19374 /* 19375 * We are not past the initial window 19376 * and we have some bases for pacing, 19377 * so we need to possibly adjust up 19378 * the cwnd. Note even if we don't set 19379 * the cwnd, its still ok to raise the rc_init_win 19380 * which can be used coming out of idle when we 19381 * would have a rate. 19382 */ 19383 if (tp->snd_cwnd < win) 19384 tp->snd_cwnd = win; 19385 } 19386 if (rack->rc_always_pace) 19387 rack_update_seg(rack); 19388 } else 19389 error = EINVAL; 19390 break; 19391 case TCP_RACK_FORCE_MSEG: 19392 RACK_OPTS_INC(tcp_rack_force_max_seg); 19393 if (optval) 19394 rack->rc_force_max_seg = 1; 19395 else 19396 rack->rc_force_max_seg = 0; 19397 break; 19398 case TCP_RACK_PACE_MAX_SEG: 19399 /* Max segments size in a pace in bytes */ 19400 RACK_OPTS_INC(tcp_rack_max_seg); 19401 rack->rc_user_set_max_segs = optval; 19402 rack_set_pace_segments(tp, rack, __LINE__, NULL); 19403 break; 19404 case TCP_RACK_PACE_RATE_REC: 19405 /* Set the fixed pacing rate in Bytes per second ca */ 19406 RACK_OPTS_INC(tcp_rack_pace_rate_rec); 19407 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 19408 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) 19409 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 19410 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) 19411 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 19412 rack->use_fixed_rate = 1; 19413 if (rack->rc_always_pace) 19414 rack_set_cc_pacing(rack); 19415 rack_log_pacing_delay_calc(rack, 19416 rack->r_ctl.rc_fixed_pacing_rate_ss, 19417 rack->r_ctl.rc_fixed_pacing_rate_ca, 19418 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 19419 __LINE__, NULL,0); 19420 break; 19421 19422 case TCP_RACK_PACE_RATE_SS: 19423 /* Set the fixed pacing rate in Bytes per second ca */ 19424 RACK_OPTS_INC(tcp_rack_pace_rate_ss); 19425 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 19426 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) 19427 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 19428 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) 19429 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 19430 rack->use_fixed_rate = 1; 19431 if (rack->rc_always_pace) 19432 rack_set_cc_pacing(rack); 19433 rack_log_pacing_delay_calc(rack, 19434 rack->r_ctl.rc_fixed_pacing_rate_ss, 19435 rack->r_ctl.rc_fixed_pacing_rate_ca, 19436 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 19437 __LINE__, NULL, 0); 19438 break; 19439 19440 case TCP_RACK_PACE_RATE_CA: 19441 /* Set the fixed pacing rate in Bytes per second ca */ 19442 RACK_OPTS_INC(tcp_rack_pace_rate_ca); 19443 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 19444 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) 19445 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 19446 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) 19447 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 19448 rack->use_fixed_rate = 1; 19449 
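		/*
		 * As with the _REC and _SS variants above, setting one fixed
		 * pacing rate also seeds the other two classes while they are
		 * still zero and switches the connection to fixed-rate pacing.
		 * The rates are in bytes per second; e.g. a userspace caller
		 * wanting roughly 100 Mbit/s in congestion avoidance might do
		 * (illustrative sketch only):
		 *
		 *	uint32_t rate = 12500000;
		 *	setsockopt(s, IPPROTO_TCP, TCP_RACK_PACE_RATE_CA,
		 *	    &rate, sizeof(rate));
		 */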
if (rack->rc_always_pace) 19450 rack_set_cc_pacing(rack); 19451 rack_log_pacing_delay_calc(rack, 19452 rack->r_ctl.rc_fixed_pacing_rate_ss, 19453 rack->r_ctl.rc_fixed_pacing_rate_ca, 19454 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 19455 __LINE__, NULL, 0); 19456 break; 19457 case TCP_RACK_GP_INCREASE_REC: 19458 RACK_OPTS_INC(tcp_gp_inc_rec); 19459 rack->r_ctl.rack_per_of_gp_rec = optval; 19460 rack_log_pacing_delay_calc(rack, 19461 rack->r_ctl.rack_per_of_gp_ss, 19462 rack->r_ctl.rack_per_of_gp_ca, 19463 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 19464 __LINE__, NULL, 0); 19465 break; 19466 case TCP_RACK_GP_INCREASE_CA: 19467 RACK_OPTS_INC(tcp_gp_inc_ca); 19468 ca = optval; 19469 if (ca < 100) { 19470 /* 19471 * We don't allow any reduction 19472 * over the GP b/w. 19473 */ 19474 error = EINVAL; 19475 break; 19476 } 19477 rack->r_ctl.rack_per_of_gp_ca = ca; 19478 rack_log_pacing_delay_calc(rack, 19479 rack->r_ctl.rack_per_of_gp_ss, 19480 rack->r_ctl.rack_per_of_gp_ca, 19481 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 19482 __LINE__, NULL, 0); 19483 break; 19484 case TCP_RACK_GP_INCREASE_SS: 19485 RACK_OPTS_INC(tcp_gp_inc_ss); 19486 ss = optval; 19487 if (ss < 100) { 19488 /* 19489 * We don't allow any reduction 19490 * over the GP b/w. 19491 */ 19492 error = EINVAL; 19493 break; 19494 } 19495 rack->r_ctl.rack_per_of_gp_ss = ss; 19496 rack_log_pacing_delay_calc(rack, 19497 rack->r_ctl.rack_per_of_gp_ss, 19498 rack->r_ctl.rack_per_of_gp_ca, 19499 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 19500 __LINE__, NULL, 0); 19501 break; 19502 case TCP_RACK_RR_CONF: 19503 RACK_OPTS_INC(tcp_rack_rrr_no_conf_rate); 19504 if (optval && optval <= 3) 19505 rack->r_rr_config = optval; 19506 else 19507 rack->r_rr_config = 0; 19508 break; 19509 case TCP_HDWR_RATE_CAP: 19510 RACK_OPTS_INC(tcp_hdwr_rate_cap); 19511 if (optval) { 19512 if (rack->r_rack_hw_rate_caps == 0) 19513 rack->r_rack_hw_rate_caps = 1; 19514 else 19515 error = EALREADY; 19516 } else { 19517 rack->r_rack_hw_rate_caps = 0; 19518 } 19519 break; 19520 case TCP_BBR_HDWR_PACE: 19521 RACK_OPTS_INC(tcp_hdwr_pacing); 19522 if (optval){ 19523 if (rack->rack_hdrw_pacing == 0) { 19524 rack->rack_hdw_pace_ena = 1; 19525 rack->rack_attempt_hdwr_pace = 0; 19526 } else 19527 error = EALREADY; 19528 } else { 19529 rack->rack_hdw_pace_ena = 0; 19530 #ifdef RATELIMIT 19531 if (rack->r_ctl.crte != NULL) { 19532 rack->rack_hdrw_pacing = 0; 19533 rack->rack_attempt_hdwr_pace = 0; 19534 tcp_rel_pacing_rate(rack->r_ctl.crte, tp); 19535 rack->r_ctl.crte = NULL; 19536 } 19537 #endif 19538 } 19539 break; 19540 /* End Pacing related ones */ 19541 case TCP_RACK_PRR_SENDALOT: 19542 /* Allow PRR to send more than one seg */ 19543 RACK_OPTS_INC(tcp_rack_prr_sendalot); 19544 rack->r_ctl.rc_prr_sendalot = optval; 19545 break; 19546 case TCP_RACK_MIN_TO: 19547 /* Minimum time between rack t-o's in ms */ 19548 RACK_OPTS_INC(tcp_rack_min_to); 19549 rack->r_ctl.rc_min_to = optval; 19550 break; 19551 case TCP_RACK_EARLY_SEG: 19552 /* If early recovery max segments */ 19553 RACK_OPTS_INC(tcp_rack_early_seg); 19554 rack->r_ctl.rc_early_recovery_segs = optval; 19555 break; 19556 case TCP_RACK_REORD_THRESH: 19557 /* RACK reorder threshold (shift amount) */ 19558 RACK_OPTS_INC(tcp_rack_reord_thresh); 19559 if ((optval > 0) && (optval < 31)) 19560 rack->r_ctl.rc_reorder_shift = optval; 19561 else 19562 error = EINVAL; 19563 break; 19564 case TCP_RACK_REORD_FADE: 19565 /* Does reordering fade after ms time */ 19566 RACK_OPTS_INC(tcp_rack_reord_fade); 19567 rack->r_ctl.rc_reorder_fade = 
optval;
		break;
	case TCP_RACK_TLP_THRESH:
		/* RACK TLP threshold i.e. srtt+(srtt/N) */
		RACK_OPTS_INC(tcp_rack_tlp_thresh);
		if (optval)
			rack->r_ctl.rc_tlp_threshold = optval;
		else
			error = EINVAL;
		break;
	case TCP_BBR_USE_RACK_RR:
		RACK_OPTS_INC(tcp_rack_rr);
		if (optval)
			rack->use_rack_rr = 1;
		else
			rack->use_rack_rr = 0;
		break;
	case TCP_FAST_RSM_HACK:
		RACK_OPTS_INC(tcp_rack_fastrsm_hack);
		if (optval)
			rack->fast_rsm_hack = 1;
		else
			rack->fast_rsm_hack = 0;
		break;
	case TCP_RACK_PKT_DELAY:
		/* RACK added ms i.e. rack-rtt + reord + N */
		RACK_OPTS_INC(tcp_rack_pkt_delay);
		rack->r_ctl.rc_pkt_delay = optval;
		break;
	case TCP_DELACK:
		RACK_OPTS_INC(tcp_rack_delayed_ack);
		if (optval == 0)
			tp->t_delayed_ack = 0;
		else
			tp->t_delayed_ack = 1;
		if (tp->t_flags & TF_DELACK) {
			tp->t_flags &= ~TF_DELACK;
			tp->t_flags |= TF_ACKNOW;
			NET_EPOCH_ENTER(et);
			rack_output(tp);
			NET_EPOCH_EXIT(et);
		}
		break;

	case TCP_BBR_RACK_RTT_USE:
		RACK_OPTS_INC(tcp_rack_rtt_use);
		if ((optval != USE_RTT_HIGH) &&
		    (optval != USE_RTT_LOW) &&
		    (optval != USE_RTT_AVG))
			error = EINVAL;
		else
			rack->r_ctl.rc_rate_sample_method = optval;
		break;
	case TCP_DATA_AFTER_CLOSE:
		RACK_OPTS_INC(tcp_data_after_close);
		if (optval)
			rack->rc_allow_data_af_clo = 1;
		else
			rack->rc_allow_data_af_clo = 0;
		break;
	default:
		break;
	}
#ifdef NETFLIX_STATS
	tcp_log_socket_option(tp, sopt_name, optval, error);
#endif
	return (error);
}

static void
rack_apply_deferred_options(struct tcp_rack *rack)
{
	struct deferred_opt_list *dol, *sdol;
	uint32_t s_optval;

	TAILQ_FOREACH_SAFE(dol, &rack->r_ctl.opt_list, next, sdol) {
		TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next);
		/* Disadvantage of deferral is you lose the error return */
		s_optval = (uint32_t)dol->optval;
		(void)rack_process_option(rack->rc_tp, rack, dol->optname, s_optval, dol->optval);
		free(dol, M_TCPDO);
	}
}

static void
rack_hw_tls_change(struct tcpcb *tp, int chg)
{
	/*
	 * HW TLS state has changed; fix all
	 * rsm's in flight.
19658 */ 19659 struct tcp_rack *rack; 19660 struct rack_sendmap *rsm; 19661 19662 rack = (struct tcp_rack *)tp->t_fb_ptr; 19663 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 19664 if (chg) 19665 rsm->r_hw_tls = 1; 19666 else 19667 rsm->r_hw_tls = 0; 19668 } 19669 if (chg) 19670 rack->r_ctl.fsb.hw_tls = 1; 19671 else 19672 rack->r_ctl.fsb.hw_tls = 0; 19673 } 19674 19675 static int 19676 rack_pru_options(struct tcpcb *tp, int flags) 19677 { 19678 if (flags & PRUS_OOB) 19679 return (EOPNOTSUPP); 19680 return (0); 19681 } 19682 19683 static struct tcp_function_block __tcp_rack = { 19684 .tfb_tcp_block_name = __XSTRING(STACKNAME), 19685 .tfb_tcp_output = rack_output, 19686 .tfb_do_queued_segments = ctf_do_queued_segments, 19687 .tfb_do_segment_nounlock = rack_do_segment_nounlock, 19688 .tfb_tcp_do_segment = rack_do_segment, 19689 .tfb_tcp_ctloutput = rack_ctloutput, 19690 .tfb_tcp_fb_init = rack_init, 19691 .tfb_tcp_fb_fini = rack_fini, 19692 .tfb_tcp_timer_stop_all = rack_stopall, 19693 .tfb_tcp_timer_activate = rack_timer_activate, 19694 .tfb_tcp_timer_active = rack_timer_active, 19695 .tfb_tcp_timer_stop = rack_timer_stop, 19696 .tfb_tcp_rexmit_tmr = rack_remxt_tmr, 19697 .tfb_tcp_handoff_ok = rack_handoff_ok, 19698 .tfb_tcp_mtu_chg = rack_mtu_change, 19699 .tfb_pru_options = rack_pru_options, 19700 .tfb_hwtls_change = rack_hw_tls_change, 19701 }; 19702 19703 /* 19704 * rack_ctloutput() must drop the inpcb lock before performing copyin on 19705 * socket option arguments. When it re-acquires the lock after the copy, it 19706 * has to revalidate that the connection is still valid for the socket 19707 * option. 19708 */ 19709 static int 19710 rack_set_sockopt(struct socket *so, struct sockopt *sopt, 19711 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) 19712 { 19713 uint64_t loptval; 19714 int32_t error = 0, optval; 19715 19716 switch (sopt->sopt_name) { 19717 case TCP_RACK_TLP_REDUCE: /* URL:tlp_reduce */ 19718 /* Pacing related ones */ 19719 case TCP_RACK_PACE_ALWAYS: /* URL:pace_always */ 19720 case TCP_BBR_RACK_INIT_RATE: /* URL:irate */ 19721 case TCP_BBR_IWINTSO: /* URL:tso_iwin */ 19722 case TCP_RACK_PACE_MAX_SEG: /* URL:pace_max_seg */ 19723 case TCP_RACK_FORCE_MSEG: /* URL:force_max_seg */ 19724 case TCP_RACK_PACE_RATE_CA: /* URL:pr_ca */ 19725 case TCP_RACK_PACE_RATE_SS: /* URL:pr_ss*/ 19726 case TCP_RACK_PACE_RATE_REC: /* URL:pr_rec */ 19727 case TCP_RACK_GP_INCREASE_CA: /* URL:gp_inc_ca */ 19728 case TCP_RACK_GP_INCREASE_SS: /* URL:gp_inc_ss */ 19729 case TCP_RACK_GP_INCREASE_REC: /* URL:gp_inc_rec */ 19730 case TCP_RACK_RR_CONF: /* URL:rrr_conf */ 19731 case TCP_BBR_HDWR_PACE: /* URL:hdwrpace */ 19732 case TCP_HDWR_RATE_CAP: /* URL:hdwrcap boolean */ 19733 case TCP_PACING_RATE_CAP: /* URL:cap -- used by side-channel */ 19734 case TCP_HDWR_UP_ONLY: /* URL:uponly -- hardware pacing boolean */ 19735 /* End pacing related */ 19736 case TCP_FAST_RSM_HACK: /* URL:frsm_hack */ 19737 case TCP_DELACK: /* URL:delack (in base TCP i.e. 
tcp_hints along with cc etc ) */
	case TCP_RACK_PRR_SENDALOT:		/* URL:prr_sendalot */
	case TCP_RACK_MIN_TO:			/* URL:min_to */
	case TCP_RACK_EARLY_SEG:		/* URL:early_seg */
	case TCP_RACK_REORD_THRESH:		/* URL:reord_thresh */
	case TCP_RACK_REORD_FADE:		/* URL:reord_fade */
	case TCP_RACK_TLP_THRESH:		/* URL:tlp_thresh */
	case TCP_RACK_PKT_DELAY:		/* URL:pkt_delay */
	case TCP_RACK_TLP_USE:			/* URL:tlp_use */
	case TCP_BBR_RACK_RTT_USE:		/* URL:rttuse */
	case TCP_BBR_USE_RACK_RR:		/* URL:rackrr */
	case TCP_RACK_DO_DETECTION:		/* URL:detect */
	case TCP_NO_PRR:			/* URL:noprr */
	case TCP_TIMELY_DYN_ADJ:		/* URL:dynamic */
	case TCP_DATA_AFTER_CLOSE:		/* no URL */
	case TCP_RACK_NONRXT_CFG_RATE:		/* URL:nonrxtcr */
	case TCP_SHARED_CWND_ENABLE:		/* URL:scwnd */
	case TCP_RACK_MBUF_QUEUE:		/* URL:mqueue */
	case TCP_RACK_NO_PUSH_AT_MAX:		/* URL:npush */
	case TCP_RACK_PACE_TO_FILL:		/* URL:fillcw */
	case TCP_SHARED_CWND_TIME_LIMIT:	/* URL:lscwnd */
	case TCP_RACK_PROFILE:			/* URL:profile */
	case TCP_USE_CMP_ACKS:			/* URL:cmpack */
	case TCP_RACK_ABC_VAL:			/* URL:labc */
	case TCP_REC_ABC_VAL:			/* URL:reclabc */
	case TCP_RACK_MEASURE_CNT:		/* URL:measurecnt */
	case TCP_DEFER_OPTIONS:			/* URL:defer */
	case TCP_RACK_PACING_BETA:		/* URL:pacing_beta */
	case TCP_RACK_PACING_BETA_ECN:		/* URL:pacing_beta_ecn */
	case TCP_RACK_TIMER_SLOP:		/* URL:timer_slop */
		break;
	default:
		/* Filter off all unknown options to the base stack */
		return (tcp_default_ctloutput(so, sopt, inp, tp));
		break;
	}
	INP_WUNLOCK(inp);
	if (sopt->sopt_name == TCP_PACING_RATE_CAP) {
		error = sooptcopyin(sopt, &loptval, sizeof(loptval), sizeof(loptval));
		/*
		 * We truncate it down to 32 bits for the socket-option trace.
		 * This means rates > 34Gbps won't show right, but that's
		 * probably ok.
19779 */ 19780 optval = (uint32_t)loptval; 19781 } else { 19782 error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); 19783 /* Save it in 64 bit form too */ 19784 loptval = optval; 19785 } 19786 if (error) 19787 return (error); 19788 INP_WLOCK(inp); 19789 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 19790 INP_WUNLOCK(inp); 19791 return (ECONNRESET); 19792 } 19793 if (tp->t_fb != &__tcp_rack) { 19794 INP_WUNLOCK(inp); 19795 return (ENOPROTOOPT); 19796 } 19797 if (rack->defer_options && (rack->gp_ready == 0) && 19798 (sopt->sopt_name != TCP_DEFER_OPTIONS) && 19799 (sopt->sopt_name != TCP_RACK_PACING_BETA) && 19800 (sopt->sopt_name != TCP_RACK_PACING_BETA_ECN) && 19801 (sopt->sopt_name != TCP_RACK_MEASURE_CNT)) { 19802 /* Options are beind deferred */ 19803 if (rack_add_deferred_option(rack, sopt->sopt_name, loptval)) { 19804 INP_WUNLOCK(inp); 19805 return (0); 19806 } else { 19807 /* No memory to defer, fail */ 19808 INP_WUNLOCK(inp); 19809 return (ENOMEM); 19810 } 19811 } 19812 error = rack_process_option(tp, rack, sopt->sopt_name, optval, loptval); 19813 INP_WUNLOCK(inp); 19814 return (error); 19815 } 19816 19817 static void 19818 rack_fill_info(struct tcpcb *tp, struct tcp_info *ti) 19819 { 19820 19821 INP_WLOCK_ASSERT(tp->t_inpcb); 19822 bzero(ti, sizeof(*ti)); 19823 19824 ti->tcpi_state = tp->t_state; 19825 if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP)) 19826 ti->tcpi_options |= TCPI_OPT_TIMESTAMPS; 19827 if (tp->t_flags & TF_SACK_PERMIT) 19828 ti->tcpi_options |= TCPI_OPT_SACK; 19829 if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) { 19830 ti->tcpi_options |= TCPI_OPT_WSCALE; 19831 ti->tcpi_snd_wscale = tp->snd_scale; 19832 ti->tcpi_rcv_wscale = tp->rcv_scale; 19833 } 19834 if (tp->t_flags2 & TF2_ECN_PERMIT) 19835 ti->tcpi_options |= TCPI_OPT_ECN; 19836 if (tp->t_flags & TF_FASTOPEN) 19837 ti->tcpi_options |= TCPI_OPT_TFO; 19838 /* still kept in ticks is t_rcvtime */ 19839 ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick; 19840 /* Since we hold everything in precise useconds this is easy */ 19841 ti->tcpi_rtt = tp->t_srtt; 19842 ti->tcpi_rttvar = tp->t_rttvar; 19843 ti->tcpi_rto = tp->t_rxtcur; 19844 ti->tcpi_snd_ssthresh = tp->snd_ssthresh; 19845 ti->tcpi_snd_cwnd = tp->snd_cwnd; 19846 /* 19847 * FreeBSD-specific extension fields for tcp_info. 19848 */ 19849 ti->tcpi_rcv_space = tp->rcv_wnd; 19850 ti->tcpi_rcv_nxt = tp->rcv_nxt; 19851 ti->tcpi_snd_wnd = tp->snd_wnd; 19852 ti->tcpi_snd_bwnd = 0; /* Unused, kept for compat. 
*/ 19853 ti->tcpi_snd_nxt = tp->snd_nxt; 19854 ti->tcpi_snd_mss = tp->t_maxseg; 19855 ti->tcpi_rcv_mss = tp->t_maxseg; 19856 ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack; 19857 ti->tcpi_rcv_ooopack = tp->t_rcvoopack; 19858 ti->tcpi_snd_zerowin = tp->t_sndzerowin; 19859 #ifdef NETFLIX_STATS 19860 ti->tcpi_total_tlp = tp->t_sndtlppack; 19861 ti->tcpi_total_tlp_bytes = tp->t_sndtlpbyte; 19862 memcpy(&ti->tcpi_rxsyninfo, &tp->t_rxsyninfo, sizeof(struct tcpsyninfo)); 19863 #endif 19864 #ifdef TCP_OFFLOAD 19865 if (tp->t_flags & TF_TOE) { 19866 ti->tcpi_options |= TCPI_OPT_TOE; 19867 tcp_offload_tcp_info(tp, ti); 19868 } 19869 #endif 19870 } 19871 19872 static int 19873 rack_get_sockopt(struct socket *so, struct sockopt *sopt, 19874 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) 19875 { 19876 int32_t error, optval; 19877 uint64_t val, loptval; 19878 struct tcp_info ti; 19879 /* 19880 * Because all our options are either boolean or an int, we can just 19881 * pull everything into optval and then unlock and copy. If we ever 19882 * add a option that is not a int, then this will have quite an 19883 * impact to this routine. 19884 */ 19885 error = 0; 19886 switch (sopt->sopt_name) { 19887 case TCP_INFO: 19888 /* First get the info filled */ 19889 rack_fill_info(tp, &ti); 19890 /* Fix up the rtt related fields if needed */ 19891 INP_WUNLOCK(inp); 19892 error = sooptcopyout(sopt, &ti, sizeof ti); 19893 return (error); 19894 /* 19895 * Beta is the congestion control value for NewReno that influences how 19896 * much of a backoff happens when loss is detected. It is normally set 19897 * to 50 for 50% i.e. the cwnd is reduced to 50% of its previous value 19898 * when you exit recovery. 19899 */ 19900 case TCP_RACK_PACING_BETA: 19901 if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) 19902 error = EINVAL; 19903 else if (rack->rc_pacing_cc_set == 0) 19904 optval = rack->r_ctl.rc_saved_beta.beta; 19905 else { 19906 /* 19907 * Reach out into the CC data and report back what 19908 * I have previously set. Yeah it looks hackish but 19909 * we don't want to report the saved values. 19910 */ 19911 if (tp->ccv->cc_data) 19912 optval = ((struct newreno *)tp->ccv->cc_data)->beta; 19913 else 19914 error = EINVAL; 19915 } 19916 break; 19917 /* 19918 * Beta_ecn is the congestion control value for NewReno that influences how 19919 * much of a backoff happens when a ECN mark is detected. It is normally set 19920 * to 80 for 80% i.e. the cwnd is reduced by 20% of its previous value when 19921 * you exit recovery. Note that classic ECN has a beta of 50, it is only 19922 * ABE Ecn that uses this "less" value, but we do too with pacing :) 19923 */ 19924 19925 case TCP_RACK_PACING_BETA_ECN: 19926 if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) 19927 error = EINVAL; 19928 else if (rack->rc_pacing_cc_set == 0) 19929 optval = rack->r_ctl.rc_saved_beta.beta_ecn; 19930 else { 19931 /* 19932 * Reach out into the CC data and report back what 19933 * I have previously set. Yeah it looks hackish but 19934 * we don't want to report the saved values. 
19935 */ 19936 if (tp->ccv->cc_data) 19937 optval = ((struct newreno *)tp->ccv->cc_data)->beta_ecn; 19938 else 19939 error = EINVAL; 19940 } 19941 break; 19942 case TCP_FAST_RSM_HACK: 19943 optval = rack->fast_rsm_hack; 19944 break; 19945 case TCP_DEFER_OPTIONS: 19946 optval = rack->defer_options; 19947 break; 19948 case TCP_RACK_MEASURE_CNT: 19949 optval = rack->r_ctl.req_measurements; 19950 break; 19951 case TCP_REC_ABC_VAL: 19952 optval = rack->r_use_labc_for_rec; 19953 break; 19954 case TCP_RACK_ABC_VAL: 19955 optval = rack->rc_labc; 19956 break; 19957 case TCP_HDWR_UP_ONLY: 19958 optval= rack->r_up_only; 19959 break; 19960 case TCP_PACING_RATE_CAP: 19961 loptval = rack->r_ctl.bw_rate_cap; 19962 break; 19963 case TCP_RACK_PROFILE: 19964 /* You cannot retrieve a profile, its write only */ 19965 error = EINVAL; 19966 break; 19967 case TCP_USE_CMP_ACKS: 19968 optval = rack->r_use_cmp_ack; 19969 break; 19970 case TCP_RACK_PACE_TO_FILL: 19971 optval = rack->rc_pace_to_cwnd; 19972 if (optval && rack->r_fill_less_agg) 19973 optval++; 19974 break; 19975 case TCP_RACK_NO_PUSH_AT_MAX: 19976 optval = rack->r_ctl.rc_no_push_at_mrtt; 19977 break; 19978 case TCP_SHARED_CWND_ENABLE: 19979 optval = rack->rack_enable_scwnd; 19980 break; 19981 case TCP_RACK_NONRXT_CFG_RATE: 19982 optval = rack->rack_rec_nonrxt_use_cr; 19983 break; 19984 case TCP_NO_PRR: 19985 if (rack->rack_no_prr == 1) 19986 optval = 1; 19987 else if (rack->no_prr_addback == 1) 19988 optval = 2; 19989 else 19990 optval = 0; 19991 break; 19992 case TCP_RACK_DO_DETECTION: 19993 optval = rack->do_detection; 19994 break; 19995 case TCP_RACK_MBUF_QUEUE: 19996 /* Now do we use the LRO mbuf-queue feature */ 19997 optval = rack->r_mbuf_queue; 19998 break; 19999 case TCP_TIMELY_DYN_ADJ: 20000 optval = rack->rc_gp_dyn_mul; 20001 break; 20002 case TCP_BBR_IWINTSO: 20003 optval = rack->rc_init_win; 20004 break; 20005 case TCP_RACK_TLP_REDUCE: 20006 /* RACK TLP cwnd reduction (bool) */ 20007 optval = rack->r_ctl.rc_tlp_cwnd_reduce; 20008 break; 20009 case TCP_BBR_RACK_INIT_RATE: 20010 val = rack->r_ctl.init_rate; 20011 /* convert to kbits per sec */ 20012 val *= 8; 20013 val /= 1000; 20014 optval = (uint32_t)val; 20015 break; 20016 case TCP_RACK_FORCE_MSEG: 20017 optval = rack->rc_force_max_seg; 20018 break; 20019 case TCP_RACK_PACE_MAX_SEG: 20020 /* Max segments in a pace */ 20021 optval = rack->rc_user_set_max_segs; 20022 break; 20023 case TCP_RACK_PACE_ALWAYS: 20024 /* Use the always pace method */ 20025 optval = rack->rc_always_pace; 20026 break; 20027 case TCP_RACK_PRR_SENDALOT: 20028 /* Allow PRR to send more than one seg */ 20029 optval = rack->r_ctl.rc_prr_sendalot; 20030 break; 20031 case TCP_RACK_MIN_TO: 20032 /* Minimum time between rack t-o's in ms */ 20033 optval = rack->r_ctl.rc_min_to; 20034 break; 20035 case TCP_RACK_EARLY_SEG: 20036 /* If early recovery max segments */ 20037 optval = rack->r_ctl.rc_early_recovery_segs; 20038 break; 20039 case TCP_RACK_REORD_THRESH: 20040 /* RACK reorder threshold (shift amount) */ 20041 optval = rack->r_ctl.rc_reorder_shift; 20042 break; 20043 case TCP_RACK_REORD_FADE: 20044 /* Does reordering fade after ms time */ 20045 optval = rack->r_ctl.rc_reorder_fade; 20046 break; 20047 case TCP_BBR_USE_RACK_RR: 20048 /* Do we use the rack cheat for rxt */ 20049 optval = rack->use_rack_rr; 20050 break; 20051 case TCP_RACK_RR_CONF: 20052 optval = rack->r_rr_config; 20053 break; 20054 case TCP_HDWR_RATE_CAP: 20055 optval = rack->r_rack_hw_rate_caps; 20056 break; 20057 case TCP_BBR_HDWR_PACE: 20058 optval = 
rack->rack_hdw_pace_ena;
		break;
	case TCP_RACK_TLP_THRESH:
		/* RACK TLP threshold i.e. srtt+(srtt/N) */
		optval = rack->r_ctl.rc_tlp_threshold;
		break;
	case TCP_RACK_PKT_DELAY:
		/* RACK added ms i.e. rack-rtt + reord + N */
		optval = rack->r_ctl.rc_pkt_delay;
		break;
	case TCP_RACK_TLP_USE:
		optval = rack->rack_tlp_threshold_use;
		break;
	case TCP_RACK_PACE_RATE_CA:
		optval = rack->r_ctl.rc_fixed_pacing_rate_ca;
		break;
	case TCP_RACK_PACE_RATE_SS:
		optval = rack->r_ctl.rc_fixed_pacing_rate_ss;
		break;
	case TCP_RACK_PACE_RATE_REC:
		optval = rack->r_ctl.rc_fixed_pacing_rate_rec;
		break;
	case TCP_RACK_GP_INCREASE_SS:
		optval = rack->r_ctl.rack_per_of_gp_ss;
		break;
	case TCP_RACK_GP_INCREASE_CA:
		optval = rack->r_ctl.rack_per_of_gp_ca;
		break;
	case TCP_BBR_RACK_RTT_USE:
		optval = rack->r_ctl.rc_rate_sample_method;
		break;
	case TCP_DELACK:
		optval = tp->t_delayed_ack;
		break;
	case TCP_DATA_AFTER_CLOSE:
		optval = rack->rc_allow_data_af_clo;
		break;
	case TCP_SHARED_CWND_TIME_LIMIT:
		optval = rack->r_limit_scw;
		break;
	case TCP_RACK_TIMER_SLOP:
		optval = rack->r_ctl.timer_slop;
		break;
	default:
		return (tcp_default_ctloutput(so, sopt, inp, tp));
		break;
	}
	INP_WUNLOCK(inp);
	if (error == 0) {
		if (sopt->sopt_name == TCP_PACING_RATE_CAP)
			error = sooptcopyout(sopt, &loptval, sizeof loptval);
		else
			error = sooptcopyout(sopt, &optval, sizeof optval);
	}
	return (error);
}

static int
rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
{
	int32_t error = EINVAL;
	struct tcp_rack *rack;

	rack = (struct tcp_rack *)tp->t_fb_ptr;
	if (rack == NULL) {
		/* Huh?
*/ 20124 goto out; 20125 } 20126 if (sopt->sopt_dir == SOPT_SET) { 20127 return (rack_set_sockopt(so, sopt, inp, tp, rack)); 20128 } else if (sopt->sopt_dir == SOPT_GET) { 20129 return (rack_get_sockopt(so, sopt, inp, tp, rack)); 20130 } 20131 out: 20132 INP_WUNLOCK(inp); 20133 return (error); 20134 } 20135 20136 static const char *rack_stack_names[] = { 20137 __XSTRING(STACKNAME), 20138 #ifdef STACKALIAS 20139 __XSTRING(STACKALIAS), 20140 #endif 20141 }; 20142 20143 static int 20144 rack_ctor(void *mem, int32_t size, void *arg, int32_t how) 20145 { 20146 memset(mem, 0, size); 20147 return (0); 20148 } 20149 20150 static void 20151 rack_dtor(void *mem, int32_t size, void *arg) 20152 { 20153 20154 } 20155 20156 static bool rack_mod_inited = false; 20157 20158 static int 20159 tcp_addrack(module_t mod, int32_t type, void *data) 20160 { 20161 int32_t err = 0; 20162 int num_stacks; 20163 20164 switch (type) { 20165 case MOD_LOAD: 20166 rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map", 20167 sizeof(struct rack_sendmap), 20168 rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); 20169 20170 rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb", 20171 sizeof(struct tcp_rack), 20172 rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); 20173 20174 sysctl_ctx_init(&rack_sysctl_ctx); 20175 rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 20176 SYSCTL_STATIC_CHILDREN(_net_inet_tcp), 20177 OID_AUTO, 20178 #ifdef STACKALIAS 20179 __XSTRING(STACKALIAS), 20180 #else 20181 __XSTRING(STACKNAME), 20182 #endif 20183 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 20184 ""); 20185 if (rack_sysctl_root == NULL) { 20186 printf("Failed to add sysctl node\n"); 20187 err = EFAULT; 20188 goto free_uma; 20189 } 20190 rack_init_sysctls(); 20191 num_stacks = nitems(rack_stack_names); 20192 err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK, 20193 rack_stack_names, &num_stacks); 20194 if (err) { 20195 printf("Failed to register %s stack name for " 20196 "%s module\n", rack_stack_names[num_stacks], 20197 __XSTRING(MODNAME)); 20198 sysctl_ctx_free(&rack_sysctl_ctx); 20199 free_uma: 20200 uma_zdestroy(rack_zone); 20201 uma_zdestroy(rack_pcb_zone); 20202 rack_counter_destroy(); 20203 printf("Failed to register rack module -- err:%d\n", err); 20204 return (err); 20205 } 20206 tcp_lro_reg_mbufq(); 20207 rack_mod_inited = true; 20208 break; 20209 case MOD_QUIESCE: 20210 err = deregister_tcp_functions(&__tcp_rack, true, false); 20211 break; 20212 case MOD_UNLOAD: 20213 err = deregister_tcp_functions(&__tcp_rack, false, true); 20214 if (err == EBUSY) 20215 break; 20216 if (rack_mod_inited) { 20217 uma_zdestroy(rack_zone); 20218 uma_zdestroy(rack_pcb_zone); 20219 sysctl_ctx_free(&rack_sysctl_ctx); 20220 rack_counter_destroy(); 20221 rack_mod_inited = false; 20222 } 20223 tcp_lro_dereg_mbufq(); 20224 err = 0; 20225 break; 20226 default: 20227 return (EOPNOTSUPP); 20228 } 20229 return (err); 20230 } 20231 20232 static moduledata_t tcp_rack = { 20233 .name = __XSTRING(MODNAME), 20234 .evhand = tcp_addrack, 20235 .priv = 0 20236 }; 20237 20238 MODULE_VERSION(MODNAME, 1); 20239 DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); 20240 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1); 20241
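
/*
 * Usage sketch (illustrative only, not compiled as part of this module):
 * once the module is loaded (e.g. "kldload tcp_rack"), a userspace
 * application can move an individual socket onto this stack with the
 * TCP_FUNCTION_BLK socket option and then set RACK-specific options such
 * as a pacing profile.  The stack name is __XSTRING(STACKNAME), normally
 * "rack"; the descriptor "s" below is assumed to be an existing TCP socket.
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *	#include <string.h>
 *	#include <err.h>
 *
 *	struct tcp_function_set tfs;
 *	int profile = 1;
 *
 *	memset(&tfs, 0, sizeof(tfs));
 *	strlcpy(tfs.function_set_name, "rack", sizeof(tfs.function_set_name));
 *	if (setsockopt(s, IPPROTO_TCP, TCP_FUNCTION_BLK, &tfs, sizeof(tfs)) == -1)
 *		err(1, "TCP_FUNCTION_BLK");
 *	if (setsockopt(s, IPPROTO_TCP, TCP_RACK_PROFILE, &profile, sizeof(profile)) == -1)
 *		err(1, "TCP_RACK_PROFILE");
 *
 * Alternatively, the system-wide default stack can be selected with the
 * net.inet.tcp.functions_default sysctl.
 */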