/*-
 * Copyright (c) 2016-2020 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"
#include "opt_ratelimit.h"
#include <sys/param.h>
#include <sys/arb.h>
#include <sys/module.h>
#include <sys/kernel.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#ifdef STATS
#include <sys/qmath.h>
#include <sys/tree.h>
#include <sys/stats.h>		/* Must come after qmath.h and tree.h */
#else
#include <sys/tree.h>
#endif
#include <sys/refcount.h>
#include <sys/queue.h>
#include <sys/tim_filter.h>
#include <sys/smp.h>
#include <sys/kthread.h>
#include <sys/kern_prefetch.h>
#include <sys/protosw.h>
#ifdef TCP_ACCOUNTING
#include <sys/sched.h>
#include <machine/cpu.h>
#endif
#include <vm/uma.h>

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/vnet.h>

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#define	TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_ratelimit.h>
#include <netinet/tcp_accounting.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/cc/cc_newreno.h>
#include <netinet/tcp_fastopen.h>
#include <netinet/tcp_lro.h>
#ifdef NETFLIX_SHARED_CWND
#include <netinet/tcp_shared_cwnd.h>
#endif
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif				/* TCPDEBUG */
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif

#include <netipsec/ipsec_support.h>

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif				/* IPSEC */

#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif
#include "sack_filter.h"
#include "tcp_rack.h"
#include "rack_bbr_common.h"

uma_zone_t rack_zone;
uma_zone_t rack_pcb_zone;

#ifndef TICKS2SBT
#define	TICKS2SBT(__t)	(tick_sbt * ((sbintime_t)(__t)))
#endif

VNET_DECLARE(uint32_t, newreno_beta);
VNET_DECLARE(uint32_t, newreno_beta_ecn);
#define V_newreno_beta VNET(newreno_beta)
#define V_newreno_beta_ecn VNET(newreno_beta_ecn)

MALLOC_DEFINE(M_TCPFSB, "tcp_fsb", "TCP fast send block");
MALLOC_DEFINE(M_TCPDO, "tcp_do", "TCP deferred options");

struct sysctl_ctx_list rack_sysctl_ctx;
struct sysctl_oid *rack_sysctl_root;

#define CUM_ACKED 1
#define SACKED 2

/*
 * The RACK module incorporates a number of
 * TCP ideas that have been put out into the IETF
 * over the last few years:
 * - Matt Mathis's Rate Halving which slowly drops
 *   the congestion window so that the ack clock can
 *   be maintained during a recovery.
 * - Yuchung Cheng's RACK TCP (for which it is named) that
 *   will stop us using the number of dup acks and instead
 *   use time as the gauge of when we retransmit.
 * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
 *   of Dukkipati et al.
 * RACK depends on SACK, so if an endpoint arrives that
 * cannot do SACK the state machine below will shuttle the
 * connection back to using the "default" TCP stack that is
 * in FreeBSD.
 *
 * To implement RACK the original TCP stack was first decomposed
 * into a functional state machine with individual states
 * for each of the possible TCP connection states. The do_segment
 * function's role in life is to mandate that the connection supports SACK
 * initially and then assure that the RACK state matches the connection
 * state before calling the state's do_segment function. Each
 * state is simplified due to the fact that the original do_segment
 * has been decomposed and we *know* what state we are in (no
 * switches on the state) and all tests for SACK are gone. This
 * greatly simplifies what each state does.
 *
 * TCP output is also overwritten with a new version since it
 * must maintain the new rack scoreboard.
 *
 */
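/*
 * Module-wide defaults.  Each of the variables below backs one of the
 * sysctls registered in rack_init_sysctls(); the comments note the unit
 * where relevant (most time values are in microseconds and percentages
 * are whole numbers, e.g. 250 == 250%).
 */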
static int32_t rack_tlp_thresh = 1;
static int32_t rack_tlp_limit = 2;	/* No more than 2 TLPs w-out new data */
static int32_t rack_tlp_use_greater = 1;
static int32_t rack_reorder_thresh = 2;
static int32_t rack_reorder_fade = 60000000;	/* 0 - never fade, def 60,000,000
						 * - 60 seconds */
static uint8_t rack_req_measurements = 1;
/* Attack threshold detections */
static uint32_t rack_highest_sack_thresh_seen = 0;
static uint32_t rack_highest_move_thresh_seen = 0;
static int32_t rack_enable_hw_pacing = 0;	/* Due to CCSP keep it off by default */
static int32_t rack_hw_pace_extra_slots = 2;	/* 2 extra MSS time betweens */
static int32_t rack_hw_rate_caps = 1;		/* 1; */
static int32_t rack_hw_rate_min = 0;		/* 1500000; */
static int32_t rack_hw_rate_to_low = 0;		/* 1200000; */
static int32_t rack_hw_up_only = 1;
static int32_t rack_stats_gets_ms_rtt = 1;
static int32_t rack_prr_addbackmax = 2;

static int32_t rack_pkt_delay = 1000;
static int32_t rack_send_a_lot_in_prr = 1;
static int32_t rack_min_to = 1000;	/* Minimum rack timeout in microseconds */
static int32_t rack_verbose_logging = 0;
static int32_t rack_ignore_data_after_close = 1;
static int32_t rack_enable_shared_cwnd = 1;
static int32_t rack_use_cmp_acks = 1;
static int32_t rack_use_fsb = 1;
static int32_t rack_use_rfo = 1;
static int32_t rack_use_rsm_rfo = 1;
static int32_t rack_max_abc_post_recovery = 2;
static int32_t rack_client_low_buf = 0;
#ifdef TCP_ACCOUNTING
static int32_t rack_tcp_accounting = 0;
#endif
static int32_t rack_limits_scwnd = 1;
static int32_t rack_enable_mqueue_for_nonpaced = 0;
static int32_t rack_disable_prr = 0;
static int32_t use_rack_rr = 1;
static int32_t rack_non_rxt_use_cr = 0;		/* does a non-rxt in recovery use the configured rate (ss/ca)? */
static int32_t rack_persist_min = 250000;	/* 250ms (in usecs) */
static int32_t rack_persist_max = 2000000;	/* 2 Seconds in usec's */
static int32_t rack_sack_not_required = 1;	/* set to one to allow non-sack to use rack */
static int32_t rack_default_init_window = 0;	/* Use system default */
static int32_t rack_limit_time_with_srtt = 0;
static int32_t rack_autosndbuf_inc = 20;	/* In percentage form */
static int32_t rack_enobuf_hw_boost_mult = 2;	/* How many times the hw rate we boost slot using time_between */
static int32_t rack_enobuf_hw_max = 12000;	/* 12 ms in usecs */
static int32_t rack_enobuf_hw_min = 10000;	/* 10 ms in usecs */
static int32_t rack_hw_rwnd_factor = 2;		/* How many max_segs the rwnd must be before we hold off sending */
/*
 * Currently regular tcp has a rto_min of 30ms; the backoff goes 12 times,
 * so that ends up being a total of 122.850 seconds before a
 * connection is killed.
 */
static uint32_t rack_def_data_window = 20;
static uint32_t rack_goal_bdp = 2;
static uint32_t rack_min_srtts = 1;
static uint32_t rack_min_measure_usec = 0;
static int32_t rack_tlp_min = 10000;	/* 10ms */
static int32_t rack_rto_min = 30000;	/* 30,000 usec same as main freebsd */
static int32_t rack_rto_max = 4000000;	/* 4 seconds in usec's */
static const int32_t rack_free_cache = 2;
static int32_t rack_hptsi_segments = 40;
static int32_t rack_rate_sample_method = USE_RTT_LOW;
static int32_t rack_pace_every_seg = 0;
static int32_t rack_delayed_ack_time = 40000;	/* 40ms in usecs */
static int32_t rack_slot_reduction = 4;
static int32_t rack_wma_divisor = 8;		/* For WMA calculation */
static int32_t rack_cwnd_block_ends_measure = 0;
static int32_t rack_rwnd_block_ends_measure = 0;
static int32_t rack_def_profile = 0;

static int32_t rack_lower_cwnd_at_tlp = 0;
static int32_t rack_limited_retran = 0;
static int32_t rack_always_send_oldest = 0;
static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;

static uint16_t rack_per_of_gp_ss = 250;	/* 250 % slow-start */
static uint16_t rack_per_of_gp_ca = 200;	/* 200 % congestion-avoidance */
static uint16_t rack_per_of_gp_rec = 200;	/* 200 % of bw */

/* Probertt */
static uint16_t rack_per_of_gp_probertt = 60;	/* 60% of bw */
static uint16_t rack_per_of_gp_lowthresh = 40;	/* 40% is bottom */
static uint16_t rack_per_of_gp_probertt_reduce = 10;	/* 10% reduction */
static uint16_t rack_atexit_prtt_hbp = 130;	/* Clamp to 130% on exit prtt if highly buffered path */
static uint16_t rack_atexit_prtt = 130;		/* Clamp to 100% on exit prtt if non highly buffered path */

static uint32_t rack_max_drain_wait = 2;	/* How many gp srtt's before we give up draining */
static uint32_t rack_must_drain = 1;		/* How many GP srtt's we *must* wait */
static uint32_t rack_probertt_use_min_rtt_entry = 1;	/* Use the min to calculate the goal else gp_srtt */
static uint32_t rack_probertt_use_min_rtt_exit = 0;
static uint32_t rack_probe_rtt_sets_cwnd = 0;
static uint32_t rack_probe_rtt_safety_val = 2000000;	/* No more than 2 sec in probe-rtt */
static uint32_t rack_time_between_probertt = 9600000;	/* 9.6 sec in usecs */
static uint32_t rack_probertt_gpsrtt_cnt_mul = 0;	/* How many srtt periods does probe-rtt last top fraction */
static uint32_t rack_probertt_gpsrtt_cnt_div = 0;	/* How many srtt periods does probe-rtt last bottom fraction */
static uint32_t rack_min_probertt_hold = 40000;		/* Equal to delayed ack time */
static uint32_t rack_probertt_filter_life = 10000000;
static uint32_t rack_probertt_lower_within = 10;
static uint32_t rack_min_rtt_movement = 250000;	/* Must move at least 250ms (in microseconds) to count as a lowering */
static int32_t rack_pace_one_seg = 0;		/* Shall we pace for less than 1.4Meg 1MSS at a time */
static int32_t rack_probertt_clear_is = 1;
static int32_t rack_max_drain_hbp = 1;		/* Extra drain times gpsrtt for highly buffered paths */
static int32_t rack_hbp_thresh = 3;		/* what is the divisor max_rtt/min_rtt to decide a hbp */

/* Part of pacing */
static int32_t rack_max_per_above = 30;		/* When we go to increment stop if above 100+this% */

/* Timely information */
/* Combine these two gives the range of 'no change' to bw */
/* ie the up/down provide the upper and lower bound */
static int32_t rack_gp_per_bw_mul_up = 2;	/* 2% */
static int32_t rack_gp_per_bw_mul_down = 4;	/* 4% */
static int32_t rack_gp_rtt_maxmul = 3;		/* 3 x maxmin */
static int32_t rack_gp_rtt_minmul = 1;		/* minrtt + (minrtt/mindiv) is lower rtt */
static int32_t rack_gp_rtt_mindiv = 4;		/* minrtt + (minrtt * minmul/mindiv) is lower rtt */
static int32_t rack_gp_decrease_per = 20;	/* 20% decrease in multiplier */
static int32_t rack_gp_increase_per = 2;	/* 2% increase in multiplier */
static int32_t rack_per_lower_bound = 50;	/* Don't allow to drop below this multiplier */
static int32_t rack_per_upper_bound_ss = 0;	/* Don't allow SS to grow above this */
static int32_t rack_per_upper_bound_ca = 0;	/* Don't allow CA to grow above this */
static int32_t rack_do_dyn_mul = 0;		/* Are the rack gp multipliers dynamic */
static int32_t rack_gp_no_rec_chg = 1;		/* Prohibit recovery from reducing its multiplier */
static int32_t rack_timely_dec_clear = 6;	/* Do we clear decrement count at a value (6)? */
static int32_t rack_timely_max_push_rise = 3;	/* One round of pushing */
static int32_t rack_timely_max_push_drop = 3;	/* Three rounds of pushing */
static int32_t rack_timely_min_segs = 4;	/* 4 segment minimum */
static int32_t rack_use_max_for_nobackoff = 0;
static int32_t rack_timely_int_timely_only = 0;	/* do interim timely's only use the timely algo (no b/w changes)? */
static int32_t rack_timely_no_stopping = 0;
static int32_t rack_down_raise_thresh = 100;
static int32_t rack_req_segs = 1;
static uint64_t rack_bw_rate_cap = 0;

/* Weird delayed ack mode */
static int32_t rack_use_imac_dack = 0;
/* Rack specific counters */
counter_u64_t rack_badfr;
counter_u64_t rack_badfr_bytes;
counter_u64_t rack_rtm_prr_retran;
counter_u64_t rack_rtm_prr_newdata;
counter_u64_t rack_timestamp_mismatch;
counter_u64_t rack_reorder_seen;
counter_u64_t rack_paced_segments;
counter_u64_t rack_unpaced_segments;
counter_u64_t rack_calc_zero;
counter_u64_t rack_calc_nonzero;
counter_u64_t rack_saw_enobuf;
counter_u64_t rack_saw_enobuf_hw;
counter_u64_t rack_saw_enetunreach;
counter_u64_t rack_per_timer_hole;
counter_u64_t rack_large_ackcmp;
counter_u64_t rack_small_ackcmp;
#ifdef INVARIANTS
counter_u64_t rack_adjust_map_bw;
#endif
/* Tail loss probe counters */
counter_u64_t rack_tlp_tot;
counter_u64_t rack_tlp_newdata;
counter_u64_t rack_tlp_retran;
counter_u64_t rack_tlp_retran_bytes;
counter_u64_t rack_tlp_retran_fail;
counter_u64_t rack_to_tot;
counter_u64_t rack_to_arm_rack;
counter_u64_t rack_to_arm_tlp;
counter_u64_t rack_hot_alloc;
counter_u64_t rack_to_alloc;
counter_u64_t rack_to_alloc_hard;
counter_u64_t rack_to_alloc_emerg;
counter_u64_t rack_to_alloc_limited;
counter_u64_t rack_alloc_limited_conns;
counter_u64_t rack_split_limited;

#define MAX_NUM_OF_CNTS 13
counter_u64_t rack_proc_comp_ack[MAX_NUM_OF_CNTS];
counter_u64_t rack_multi_single_eq;
counter_u64_t rack_proc_non_comp_ack;

counter_u64_t rack_fto_send;
counter_u64_t rack_fto_rsm_send;
counter_u64_t rack_nfto_resend;
counter_u64_t rack_non_fto_send;
counter_u64_t rack_extended_rfo;

counter_u64_t rack_sack_proc_all;
counter_u64_t rack_sack_proc_short;
counter_u64_t rack_sack_proc_restart;
counter_u64_t rack_sack_attacks_detected;
counter_u64_t rack_sack_attacks_reversed;
counter_u64_t rack_sack_used_next_merge;
counter_u64_t rack_sack_splits;
counter_u64_t rack_sack_used_prev_merge;
counter_u64_t rack_sack_skipped_acked;
counter_u64_t rack_ack_total;
counter_u64_t rack_express_sack;
counter_u64_t rack_sack_total;
counter_u64_t rack_move_none;
counter_u64_t rack_move_some;

counter_u64_t rack_used_tlpmethod;
counter_u64_t rack_used_tlpmethod2;
counter_u64_t rack_enter_tlp_calc;
counter_u64_t rack_input_idle_reduces;
counter_u64_t rack_collapsed_win;
counter_u64_t rack_tlp_does_nada;
counter_u64_t rack_try_scwnd;
counter_u64_t rack_hw_pace_init_fail;
counter_u64_t rack_hw_pace_lost;
counter_u64_t rack_sbsndptr_right;
counter_u64_t rack_sbsndptr_wrong;

/* Temp CPU counters */
counter_u64_t rack_find_high;

counter_u64_t rack_progress_drops;
counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];

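/*
 * RACK_REXMTVAL() derives the retransmit timeout from the smoothed RTT
 * plus four times the RTT variance, floored at rack_rto_min.
 * RACK_TCPT_RANGESET() adds a slop value and clamps the result into
 * the [tvmin, tvmax] range.
 */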
#define	RACK_REXMTVAL(tp) max(rack_rto_min, ((tp)->t_srtt + ((tp)->t_rttvar << 2)))

#define	RACK_TCPT_RANGESET(tv, value, tvmin, tvmax, slop) do {	\
	(tv) = (value) + slop;					\
	if ((u_long)(tv) < (u_long)(tvmin))			\
		(tv) = (tvmin);					\
	if ((u_long)(tv) > (u_long)(tvmax))			\
		(tv) = (tvmax);					\
} while (0)

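/* Internal prototypes, including the per-TCP-state do_segment handlers. */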
static void
rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line);

static int
rack_process_ack(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to,
    uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
static int
rack_process_data(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static void
rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
    uint32_t th_ack, uint16_t nsegs, uint16_t type, int32_t recovery);
static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack,
    uint8_t limit_type);
static struct rack_sendmap *
rack_check_recovery_mode(struct tcpcb *tp,
    uint32_t tsused);
static void
rack_cong_signal(struct tcpcb *tp,
    uint32_t type, uint32_t ack);
static void rack_counter_destroy(void);
static int
rack_ctloutput(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp);
static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
static void
rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override);
static void
rack_do_segment(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
    uint8_t iptos);
static void rack_dtor(void *mem, int32_t size, void *arg);
static void
rack_log_alt_to_to_cancel(struct tcp_rack *rack,
    uint32_t flex1, uint32_t flex2,
    uint32_t flex3, uint32_t flex4,
    uint32_t flex5, uint32_t flex6,
    uint16_t flex7, uint8_t mod);
static void
rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
    uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line, struct rack_sendmap *rsm);
static struct rack_sendmap *
rack_find_high_nonack(struct tcp_rack *rack,
    struct rack_sendmap *rsm);
static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
static int
rack_get_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
static void
rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
    tcp_seq th_ack, int line);
static uint32_t
rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss);
static int32_t rack_handoff_ok(struct tcpcb *tp);
static int32_t rack_init(struct tcpcb *tp);
static void rack_init_sysctls(void);
static void
rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
    struct tcphdr *th, int entered_rec, int dup_ack_struck);
static void
rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
    uint32_t seq_out, uint8_t th_flags, int32_t err, uint64_t ts,
    struct rack_sendmap *hintrsm, uint16_t add_flags, struct mbuf *s_mb, uint32_t s_moff);

static void
rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm);
static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm);
static int32_t rack_output(struct tcpcb *tp);

static uint32_t
rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
    struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
    uint32_t cts, int *moved_two);
static void rack_post_recovery(struct tcpcb *tp, uint32_t th_seq);
static void rack_remxt_tmr(struct tcpcb *tp);
static int
rack_set_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
static int32_t rack_stopall(struct tcpcb *tp);
static void
rack_timer_activate(struct tcpcb *tp, uint32_t timer_type,
    uint32_t delta);
static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
static uint32_t
rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint16_t add_flag);
static void
rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag);
static int
rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack);
static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
static int
rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_closing(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_established(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_lastack(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
struct rack_sendmap *
tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
    uint32_t tsused);
static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt,
    uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt);
static void
tcp_rack_partialack(struct tcpcb *tp);
static int
rack_set_profile(struct tcp_rack *rack, int prof);
static void
rack_apply_deferred_options(struct tcp_rack *rack);

int32_t rack_clear_counter = 0;

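/*
 * When pacing is enabled, swap new-reno's beta/beta_ecn for the values
 * kept in rack->r_ctl.rc_saved_beta (pushed via the CC module's
 * ctl_output) and enable ABE through CC_NEWRENO_BETA_ECN.  The values
 * the connection had been using are saved into rc_saved_beta so that
 * rack_undo_cc_pacing() can put them back.  For any congestion control
 * other than new-reno this only marks rc_pacing_cc_set.
 */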
static void
rack_set_cc_pacing(struct tcp_rack *rack)
{
	struct sockopt sopt;
	struct cc_newreno_opts opt;
	struct newreno old, *ptr;
	struct tcpcb *tp;
	int error;

	if (rack->rc_pacing_cc_set)
		return;

	tp = rack->rc_tp;
	if (tp->cc_algo == NULL) {
		/* Tcb is leaving */
		printf("No cc algorithm?\n");
		return;
	}
	rack->rc_pacing_cc_set = 1;
	if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
		/* Not new-reno, we can't play games with beta! */
		goto out;
	}
	ptr = ((struct newreno *)tp->ccv->cc_data);
	if (CC_ALGO(tp)->ctl_output == NULL) {
		/* Huh, why does new_reno no longer have a set function? */
		printf("no ctl_output for algo:%s\n", tp->cc_algo->name);
		goto out;
	}
	if (ptr == NULL) {
		/* Just the default values */
		old.beta = V_newreno_beta;
		old.beta_ecn = V_newreno_beta_ecn;
		old.newreno_flags = 0;
	} else {
		old.beta = ptr->beta;
		old.beta_ecn = ptr->beta_ecn;
		old.newreno_flags = ptr->newreno_flags;
	}
	sopt.sopt_valsize = sizeof(struct cc_newreno_opts);
	sopt.sopt_dir = SOPT_SET;
	opt.name = CC_NEWRENO_BETA;
	opt.val = rack->r_ctl.rc_saved_beta.beta;
	error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
	if (error) {
		printf("Error returned by ctl_output %d\n", error);
		goto out;
	}
	/*
	 * Hack alert: we need to set in our newreno_flags
	 * so that ABE behavior is also applied.
	 */
	((struct newreno *)tp->ccv->cc_data)->newreno_flags = CC_NEWRENO_BETA_ECN;
	opt.name = CC_NEWRENO_BETA_ECN;
	opt.val = rack->r_ctl.rc_saved_beta.beta_ecn;
	error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt);
	if (error) {
		printf("Error returned by ctl_output %d\n", error);
		goto out;
	}
	/* Save off the original values for restoral */
	memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno));
out:
	if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		ptr = ((struct newreno *)tp->ccv->cc_data);
		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		if (ptr) {
			log.u_bbr.flex1 = ptr->beta;
			log.u_bbr.flex2 = ptr->beta_ecn;
			log.u_bbr.flex3 = ptr->newreno_flags;
		}
		log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta;
		log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn;
		log.u_bbr.flex6 = rack->r_ctl.rc_saved_beta.newreno_flags;
		log.u_bbr.flex7 = rack->gp_ready;
		log.u_bbr.flex7 <<= 1;
		log.u_bbr.flex7 |= rack->use_fixed_rate;
		log.u_bbr.flex7 <<= 1;
		log.u_bbr.flex7 |= rack->rc_pacing_cc_set;
		log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
		log.u_bbr.flex8 = 3;
		tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, error,
		    0, &log, false, NULL, NULL, 0, &tv);
	}
}

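/*
 * Reverse rack_set_cc_pacing(): restore the connection's original
 * new-reno beta/beta_ecn values and stash the pacing values back into
 * rc_saved_beta so they can be re-applied if pacing is enabled again.
 */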
static void
rack_undo_cc_pacing(struct tcp_rack *rack)
{
	struct newreno old, *ptr;
	struct tcpcb *tp;

	if (rack->rc_pacing_cc_set == 0)
		return;
	tp = rack->rc_tp;
	rack->rc_pacing_cc_set = 0;
	if (tp->cc_algo == NULL)
		/* Tcb is leaving */
		return;
	if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) {
		/* Not new-reno, nothing to do! */
		return;
	}
	ptr = ((struct newreno *)tp->ccv->cc_data);
	if (ptr == NULL) {
		/*
		 * This happens at rack_fini() if the
		 * cc module gets freed on us. In that
		 * case we lose our "new" settings but
		 * that's ok, since the tcb is going away anyway.
		 */
		return;
	}
	/* Grab out our set values */
	memcpy(&old, ptr, sizeof(struct newreno));
	/* Copy back in the original values */
	memcpy(ptr, &rack->r_ctl.rc_saved_beta, sizeof(struct newreno));
	/* Now save back the values we had set in (for when pacing is restored) */
	memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno));
	if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) {
		union tcp_log_stackspecific log;
		struct timeval tv;

		ptr = ((struct newreno *)tp->ccv->cc_data);
		memset(&log.u_bbr, 0, sizeof(log.u_bbr));
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		log.u_bbr.flex1 = ptr->beta;
		log.u_bbr.flex2 = ptr->beta_ecn;
		log.u_bbr.flex3 = ptr->newreno_flags;
		log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta;
		log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn;
		log.u_bbr.flex6 = rack->r_ctl.rc_saved_beta.newreno_flags;
		log.u_bbr.flex7 = rack->gp_ready;
		log.u_bbr.flex7 <<= 1;
		log.u_bbr.flex7 |= rack->use_fixed_rate;
		log.u_bbr.flex7 <<= 1;
		log.u_bbr.flex7 |= rack->rc_pacing_cc_set;
		log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt;
		log.u_bbr.flex8 = 4;
		tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0,
		    0, &log, false, NULL, NULL, 0, &tv);
	}
}

#ifdef NETFLIX_PEAKRATE
static inline void
rack_update_peakrate_thr(struct tcpcb *tp)
{
	/* Keep in mind that t_maxpeakrate is in B/s. */
	uint64_t peak;

	peak = uqmax((tp->t_maxseg * 2),
	    (((uint64_t)tp->t_maxpeakrate * (uint64_t)(tp->t_srtt)) / (uint64_t)HPTS_USEC_IN_SEC));
	tp->t_peakrate_thr = (uint32_t)uqmin(peak, UINT32_MAX);
}
#endif

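/*
 * Handler for the counter-clearing sysctl: writing a 1 zeroes the RACK
 * debug counters; reads simply report rack_clear_counter.
 */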
static int
sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
{
	uint32_t stat;
	int32_t error;
	int i;

	error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
	if (error || req->newptr == NULL)
		return error;

	error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
	if (error)
		return (error);
	if (stat == 1) {
#ifdef INVARIANTS
		printf("Clearing RACK counters\n");
#endif
		counter_u64_zero(rack_badfr);
		counter_u64_zero(rack_badfr_bytes);
		counter_u64_zero(rack_rtm_prr_retran);
		counter_u64_zero(rack_rtm_prr_newdata);
		counter_u64_zero(rack_timestamp_mismatch);
		counter_u64_zero(rack_reorder_seen);
		counter_u64_zero(rack_tlp_tot);
		counter_u64_zero(rack_tlp_newdata);
		counter_u64_zero(rack_tlp_retran);
		counter_u64_zero(rack_tlp_retran_bytes);
		counter_u64_zero(rack_tlp_retran_fail);
		counter_u64_zero(rack_to_tot);
		counter_u64_zero(rack_to_arm_rack);
		counter_u64_zero(rack_to_arm_tlp);
		counter_u64_zero(rack_paced_segments);
		counter_u64_zero(rack_calc_zero);
		counter_u64_zero(rack_calc_nonzero);
		counter_u64_zero(rack_unpaced_segments);
		counter_u64_zero(rack_saw_enobuf);
		counter_u64_zero(rack_saw_enobuf_hw);
		counter_u64_zero(rack_saw_enetunreach);
		counter_u64_zero(rack_per_timer_hole);
		counter_u64_zero(rack_large_ackcmp);
		counter_u64_zero(rack_small_ackcmp);
#ifdef INVARIANTS
		counter_u64_zero(rack_adjust_map_bw);
#endif
		counter_u64_zero(rack_to_alloc_hard);
		counter_u64_zero(rack_to_alloc_emerg);
		counter_u64_zero(rack_sack_proc_all);
		counter_u64_zero(rack_fto_send);
		counter_u64_zero(rack_fto_rsm_send);
		counter_u64_zero(rack_extended_rfo);
		counter_u64_zero(rack_hw_pace_init_fail);
		counter_u64_zero(rack_hw_pace_lost);
		counter_u64_zero(rack_sbsndptr_wrong);
		counter_u64_zero(rack_sbsndptr_right);
		counter_u64_zero(rack_non_fto_send);
		counter_u64_zero(rack_nfto_resend);
		counter_u64_zero(rack_sack_proc_short);
		counter_u64_zero(rack_sack_proc_restart);
		counter_u64_zero(rack_to_alloc);
		counter_u64_zero(rack_to_alloc_limited);
		counter_u64_zero(rack_alloc_limited_conns);
		counter_u64_zero(rack_split_limited);
		for (i = 0; i < MAX_NUM_OF_CNTS; i++) {
			counter_u64_zero(rack_proc_comp_ack[i]);
		}
		counter_u64_zero(rack_multi_single_eq);
		counter_u64_zero(rack_proc_non_comp_ack);
		counter_u64_zero(rack_find_high);
		counter_u64_zero(rack_sack_attacks_detected);
		counter_u64_zero(rack_sack_attacks_reversed);
		counter_u64_zero(rack_sack_used_next_merge);
		counter_u64_zero(rack_sack_used_prev_merge);
		counter_u64_zero(rack_sack_splits);
		counter_u64_zero(rack_sack_skipped_acked);
		counter_u64_zero(rack_ack_total);
		counter_u64_zero(rack_express_sack);
		counter_u64_zero(rack_sack_total);
		counter_u64_zero(rack_move_none);
		counter_u64_zero(rack_move_some);
		counter_u64_zero(rack_used_tlpmethod);
		counter_u64_zero(rack_used_tlpmethod2);
		counter_u64_zero(rack_enter_tlp_calc);
		counter_u64_zero(rack_progress_drops);
		counter_u64_zero(rack_tlp_does_nada);
		counter_u64_zero(rack_try_scwnd);
		counter_u64_zero(rack_collapsed_win);
	}
	rack_clear_counter = 0;
	return (0);
}

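/*
 * Create the RACK sysctl tree.  Sub-nodes (sack_attack, stats, probertt,
 * pacing, hdwr_pacing, timely, tlp, timers, measure and misc) hang off
 * rack_sysctl_root, and the read-only statistics counters are allocated
 * here as well.
 */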
static void
rack_init_sysctls(void)
{
	int i;
	struct sysctl_oid *rack_counters;
	struct sysctl_oid *rack_attack;
	struct sysctl_oid *rack_pacing;
	struct sysctl_oid *rack_timely;
	struct sysctl_oid *rack_timers;
	struct sysctl_oid *rack_tlp;
	struct sysctl_oid *rack_misc;
	struct sysctl_oid *rack_measure;
	struct sysctl_oid *rack_probertt;
	struct sysctl_oid *rack_hw_pacing;

	rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "sack_attack",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Rack Sack Attack Counters and Controls");
	rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "stats",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Rack Counters");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "rate_sample_method", CTLFLAG_RW,
	    &rack_rate_sample_method, USE_RTT_LOW,
	    "What method should we use for rate sampling 0=high, 1=low ");
	/* Probe rtt related controls */
	rack_probertt = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "probertt",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "ProbeRTT related Controls");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "exit_per_hpb", CTLFLAG_RW,
	    &rack_atexit_prtt_hbp, 130,
	    "What percentage above goodput do we clamp CA/SS to at exit on high-BDP path 110%");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "exit_per_nonhpb", CTLFLAG_RW,
	    &rack_atexit_prtt, 130,
	    "What percentage above goodput do we clamp CA/SS to at exit on a non high-BDP path 100%");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "gp_per_mul", CTLFLAG_RW,
	    &rack_per_of_gp_probertt, 60,
	    "What percentage of goodput do we pace at in probertt");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "gp_per_reduce", CTLFLAG_RW,
	    &rack_per_of_gp_probertt_reduce, 10,
	    "What percentage of goodput do we reduce every gp_srtt");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "gp_per_low", CTLFLAG_RW,
	    &rack_per_of_gp_lowthresh, 40,
	    "What percentage of goodput do we allow the multiplier to fall to");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "time_between", CTLFLAG_RW,
	    &rack_time_between_probertt, 96000000,
	    "How many useconds since the lowest rtt fell must pass before we enter probertt");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "safety", CTLFLAG_RW,
	    &rack_probe_rtt_safety_val, 2000000,
	    "If not zero, provides a maximum usecond that you can stay in probertt (2sec = 2000000)");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "sets_cwnd", CTLFLAG_RW,
	    &rack_probe_rtt_sets_cwnd, 0,
	    "Do we set the cwnd too (if always_lower is on)");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "maxdrainsrtts", CTLFLAG_RW,
	    &rack_max_drain_wait, 2,
	    "Maximum number of gp_srtt's to hold in drain waiting for flight to reach goal");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "mustdrainsrtts", CTLFLAG_RW,
	    &rack_must_drain, 1,
	    "We must drain this many gp_srtt's waiting for flight to reach goal");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "goal_use_min_entry", CTLFLAG_RW,
	    &rack_probertt_use_min_rtt_entry, 1,
	    "Should we use the min-rtt to calculate the goal rtt (else gp_srtt) at entry");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "goal_use_min_exit", CTLFLAG_RW,
	    &rack_probertt_use_min_rtt_exit, 0,
	    "How to set cwnd at exit, 0 - dynamic, 1 - use min-rtt, 2 - use curgprtt, 3 - entry gp-rtt");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "length_div", CTLFLAG_RW,
	    &rack_probertt_gpsrtt_cnt_div, 0,
	    "How many recent goodput srtt periods plus hold time does probertt last (bottom of fraction)");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "length_mul", CTLFLAG_RW,
	    &rack_probertt_gpsrtt_cnt_mul, 0,
	    "How many recent goodput srtt periods plus hold time does probertt last (top of fraction)");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "holdtim_at_target", CTLFLAG_RW,
	    &rack_min_probertt_hold, 200000,
	    "What is the minimum time we hold probertt at target");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "filter_life", CTLFLAG_RW,
	    &rack_probertt_filter_life, 10000000,
	    "What is the time for the filters life in useconds");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "lower_within", CTLFLAG_RW,
	    &rack_probertt_lower_within, 10,
	    "If the rtt goes lower within this percentage of the time, go into probe-rtt");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "must_move", CTLFLAG_RW,
	    &rack_min_rtt_movement, 250,
	    "How much is the minimum movement in rtt to count as a drop for probertt purposes");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "clear_is_cnts", CTLFLAG_RW,
	    &rack_probertt_clear_is, 1,
	    "Do we clear I/S counts on exiting probe-rtt");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "hbp_extra_drain", CTLFLAG_RW,
	    &rack_max_drain_hbp, 1,
	    "How many extra drain gpsrtt's do we get in highly buffered paths");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_probertt),
	    OID_AUTO, "hbp_threshold", CTLFLAG_RW,
	    &rack_hbp_thresh, 3,
	    "We are highly buffered if min_rtt_seen / max_rtt_seen > this-threshold");
	/* Pacing related sysctls */
	rack_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "pacing",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Pacing related Controls");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "max_pace_over", CTLFLAG_RW,
	    &rack_max_per_above, 30,
	    "What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "pace_to_one", CTLFLAG_RW,
	    &rack_pace_one_seg, 0,
	    "Do we allow low b/w pacing of 1MSS instead of two");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "limit_wsrtt", CTLFLAG_RW,
	    &rack_limit_time_with_srtt, 0,
	    "Do we limit pacing time based on srtt");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "init_win", CTLFLAG_RW,
	    &rack_default_init_window, 0,
	    "Do we have a rack initial window 0 = system default");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "gp_per_ss", CTLFLAG_RW,
	    &rack_per_of_gp_ss, 250,
	    "If non zero, what percentage of goodput to pace at in slow start");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "gp_per_ca", CTLFLAG_RW,
	    &rack_per_of_gp_ca, 150,
	    "If non zero, what percentage of goodput to pace at in congestion avoidance");
	SYSCTL_ADD_U16(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "gp_per_rec", CTLFLAG_RW,
	    &rack_per_of_gp_rec, 200,
	    "If non zero, what percentage of goodput to pace at in recovery");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "pace_max_seg", CTLFLAG_RW,
	    &rack_hptsi_segments, 40,
	    "What size is the max for TSO segments in pacing and burst mitigation");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "burst_reduces", CTLFLAG_RW,
	    &rack_slot_reduction, 4,
	    "When doing only burst mitigation what is the reduce divisor");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "use_pacing", CTLFLAG_RW,
	    &rack_pace_every_seg, 0,
	    "If set we use pacing, if clear we use only the original burst mitigation");
	SYSCTL_ADD_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_pacing),
	    OID_AUTO, "rate_cap", CTLFLAG_RW,
	    &rack_bw_rate_cap, 0,
	    "If set we apply this value to the absolute rate cap used by pacing");
	SYSCTL_ADD_U8(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "req_measure_cnt", CTLFLAG_RW,
	    &rack_req_measurements, 1,
	    "If doing dynamic pacing, how many measurements must be in before we start pacing?");
	/* Hardware pacing */
	rack_hw_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "hdwr_pacing",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Pacing related Controls");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "rwnd_factor", CTLFLAG_RW,
	    &rack_hw_rwnd_factor, 2,
	    "How many times does snd_wnd need to be bigger than pace_max_seg so we will hold off and get more acks?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "pace_enobuf_mult", CTLFLAG_RW,
	    &rack_enobuf_hw_boost_mult, 2,
	    "By how many time_betweens should we boost the pacing time if we see an ENOBUFS?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "pace_enobuf_max", CTLFLAG_RW,
	    &rack_enobuf_hw_max, 2,
	    "What is the max boost of the pacing time if we see an ENOBUFS?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "pace_enobuf_min", CTLFLAG_RW,
	    &rack_enobuf_hw_min, 2,
	    "What is the min boost of the pacing time if we see an ENOBUFS?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "enable", CTLFLAG_RW,
	    &rack_enable_hw_pacing, 0,
	    "Should RACK attempt to use hw pacing?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "rate_cap", CTLFLAG_RW,
	    &rack_hw_rate_caps, 1,
	    "Does the highest hardware pacing rate cap the rate we will send at?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "rate_min", CTLFLAG_RW,
	    &rack_hw_rate_min, 0,
	    "Do we need a minimum estimate of this many bytes per second in order to engage hw pacing?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "rate_to_low", CTLFLAG_RW,
	    &rack_hw_rate_to_low, 0,
	    "If we fall below this rate, dis-engage hw pacing?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "up_only", CTLFLAG_RW,
	    &rack_hw_up_only, 1,
	    "Do we allow hw pacing to lower the rate selected?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_hw_pacing),
	    OID_AUTO, "extra_mss_precise", CTLFLAG_RW,
	    &rack_hw_pace_extra_slots, 2,
	    "If the rates between software and hardware match precisely how many extra time_betweens do we get?");
	rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "timely",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Rack Timely RTT Controls");
	/* Timely based GP dynamics */
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "upper", CTLFLAG_RW,
	    &rack_gp_per_bw_mul_up, 2,
	    "Rack timely upper range for equal b/w (in percentage)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "lower", CTLFLAG_RW,
	    &rack_gp_per_bw_mul_down, 4,
	    "Rack timely lower range for equal b/w (in percentage)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "rtt_max_mul", CTLFLAG_RW,
	    &rack_gp_rtt_maxmul, 3,
	    "Rack timely multiplier of lowest rtt for rtt_max");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "rtt_min_div", CTLFLAG_RW,
	    &rack_gp_rtt_mindiv, 4,
	    "Rack timely divisor used for rtt + (rtt * mul/divisor) for check for lower rtt");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "rtt_min_mul", CTLFLAG_RW,
	    &rack_gp_rtt_minmul, 1,
	    "Rack timely multiplier used for rtt + (rtt * mul/divisor) for check for lower rtt");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "decrease", CTLFLAG_RW,
	    &rack_gp_decrease_per, 20,
	    "Rack timely decrease percentage of our GP multiplication factor");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "increase", CTLFLAG_RW,
	    &rack_gp_increase_per, 2,
	    "Rack timely increase percentage of our GP multiplication factor");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "lowerbound", CTLFLAG_RW,
	    &rack_per_lower_bound, 50,
	    "Rack timely lowest percentage we allow GP multiplier to fall to");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "upperboundss", CTLFLAG_RW,
	    &rack_per_upper_bound_ss, 0,
	    "Rack timely highest percentage we allow GP multiplier in SS to raise to (0 is no upperbound)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "upperboundca", CTLFLAG_RW,
	    &rack_per_upper_bound_ca, 0,
	    "Rack timely highest percentage we allow GP multiplier in CA to raise to (0 is no upperbound)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "dynamicgp", CTLFLAG_RW,
	    &rack_do_dyn_mul, 0,
	    "Rack timely do we enable dynamic timely goodput by default");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "no_rec_red", CTLFLAG_RW,
	    &rack_gp_no_rec_chg, 1,
	    "Rack timely do we prohibit the recovery multiplier from being lowered");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "red_clear_cnt", CTLFLAG_RW,
	    &rack_timely_dec_clear, 6,
	    "Rack timely what threshold do we count to before another boost during b/w descent");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "max_push_rise", CTLFLAG_RW,
	    &rack_timely_max_push_rise, 3,
	    "Rack timely how many times do we push up with b/w increase");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "max_push_drop", CTLFLAG_RW,
	    &rack_timely_max_push_drop, 3,
	    "Rack timely how many times do we push back on b/w descent");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "min_segs", CTLFLAG_RW,
	    &rack_timely_min_segs, 4,
	    "Rack timely when setting the cwnd what is the min num segments");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "noback_max", CTLFLAG_RW,
	    &rack_use_max_for_nobackoff, 0,
	    "Rack timely when deciding whether to backoff on a loss, do we use under max rtt else min");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "interim_timely_only", CTLFLAG_RW,
	    &rack_timely_int_timely_only, 0,
	    "Rack timely when doing interim timely's do we only do timely (no b/w consideration)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "nonstop", CTLFLAG_RW,
	    &rack_timely_no_stopping, 0,
	    "Rack timely don't stop increase");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "dec_raise_thresh", CTLFLAG_RW,
	    &rack_down_raise_thresh, 100,
	    "If the CA or SS is below this threshold raise on the first 3 b/w lowers (0=always)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timely),
	    OID_AUTO, "bottom_drag_segs", CTLFLAG_RW,
	    &rack_req_segs, 1,
	    "Bottom dragging if not this many segments outstanding and room");

	/* TLP and Rack related parameters */
	rack_tlp = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "tlp",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "TLP and Rack related Controls");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "use_rrr", CTLFLAG_RW,
	    &use_rack_rr, 1,
	    "Do we use Rack Rapid Recovery");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "post_rec_labc", CTLFLAG_RW,
	    &rack_max_abc_post_recovery, 2,
	    "Since we do early recovery, do we override the l_abc to a value, if so what?");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "nonrxt_use_cr", CTLFLAG_RW,
	    &rack_non_rxt_use_cr, 0,
	    "Do we use ss/ca rate if in recovery we are transmitting a new data chunk");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "tlpmethod", CTLFLAG_RW,
	    &rack_tlp_threshold_use, TLP_USE_TWO_ONE,
	    "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "limit", CTLFLAG_RW,
	    &rack_tlp_limit, 2,
	    "How many TLP's can be sent without sending new data");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "use_greater", CTLFLAG_RW,
	    &rack_tlp_use_greater, 1,
	    "Should we use the rack_rtt time if it is greater than srtt");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "tlpminto", CTLFLAG_RW,
	    &rack_tlp_min, 10000,
	    "TLP minimum timeout per the specification (in microseconds)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "send_oldest", CTLFLAG_RW,
	    &rack_always_send_oldest, 0,
	    "Should we always send the oldest TLP and RACK-TLP");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "rack_tlimit", CTLFLAG_RW,
	    &rack_limited_retran, 0,
	    "How many times can a rack timeout drive out sends");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW,
	    &rack_lower_cwnd_at_tlp, 0,
	    "When a TLP completes a retran should we enter recovery");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "reorder_thresh", CTLFLAG_RW,
	    &rack_reorder_thresh, 2,
	    "What factor for rack will be added when seeing reordering (shift right)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW,
	    &rack_tlp_thresh, 1,
	    "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "reorder_fade", CTLFLAG_RW,
	    &rack_reorder_fade, 60000000,
	    "Does reorder detection fade, if so how many microseconds (0 means never)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_tlp),
	    OID_AUTO, "pktdelay", CTLFLAG_RW,
	    &rack_pkt_delay, 1000,
	    "Extra RACK time (in microseconds) besides reordering thresh");

	/* Timer related controls */
	rack_timers = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "timers",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Timer related controls");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timers),
	    OID_AUTO, "persmin", CTLFLAG_RW,
	    &rack_persist_min, 250000,
	    "What is the minimum time in microseconds between persists");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timers),
	    OID_AUTO, "persmax", CTLFLAG_RW,
	    &rack_persist_max, 2000000,
	    "What is the largest delay in microseconds between persists");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timers),
	    OID_AUTO, "delayed_ack", CTLFLAG_RW,
	    &rack_delayed_ack_time, 40000,
	    "Delayed ack time (40ms in microseconds)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timers),
	    OID_AUTO, "minrto", CTLFLAG_RW,
	    &rack_rto_min, 30000,
	    "Minimum RTO in microseconds -- set with caution below 1000 due to TLP");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timers),
	    OID_AUTO, "maxrto", CTLFLAG_RW,
	    &rack_rto_max, 4000000,
	    "Maximum RTO in microseconds -- should be at least as large as min_rto");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_timers),
	    OID_AUTO, "minto", CTLFLAG_RW,
	    &rack_min_to, 1000,
	    "Minimum rack timeout in microseconds");
	/* Measure controls */
	rack_measure = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "measure",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Measure related controls");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_measure),
	    OID_AUTO, "wma_divisor", CTLFLAG_RW,
	    &rack_wma_divisor, 8,
	    "When doing b/w calculation what is the divisor for the WMA");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_measure),
	    OID_AUTO, "end_cwnd", CTLFLAG_RW,
	    &rack_cwnd_block_ends_measure, 0,
	    "Does a cwnd just-return end the measurement window (app limited)");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_measure),
	    OID_AUTO, "end_rwnd", CTLFLAG_RW,
	    &rack_rwnd_block_ends_measure, 0,
	    "Does an rwnd just-return end the measurement window (app limited -- not persists)");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_measure),
	    OID_AUTO, "min_target", CTLFLAG_RW,
	    &rack_def_data_window, 20,
	    "What is the minimum target window (in mss) for GP measurements");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_measure),
	    OID_AUTO, "goal_bdp", CTLFLAG_RW,
	    &rack_goal_bdp, 2,
	    "What is the goal BDP to measure");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_measure),
	    OID_AUTO, "min_srtts", CTLFLAG_RW,
	    &rack_min_srtts, 1,
	    "What is the goal BDP to measure");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_measure),
	    OID_AUTO, "min_measure_tim", CTLFLAG_RW,
	    &rack_min_measure_usec, 0,
	    "What is the minimum time for a measurement, if 0 this is off");
	/* Misc rack controls */
	rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "misc",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Misc related controls");
#ifdef TCP_ACCOUNTING
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_misc),
	    OID_AUTO, "tcp_acct", CTLFLAG_RW,
	    &rack_tcp_accounting, 0,
	    "Should we turn on TCP accounting for all rack sessions?");
#endif
"What is the maximum number of MSS we allow to be added back if prr can't send all its data?"); 1380 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1381 SYSCTL_CHILDREN(rack_misc), 1382 OID_AUTO, "stats_gets_ms", CTLFLAG_RW, 1383 &rack_stats_gets_ms_rtt, 1, 1384 "What do we feed the stats framework (1 = ms_rtt, 0 = us_rtt, 2 = ms_rtt from hdwr, > 2 usec rtt from hdwr)?"); 1385 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1386 SYSCTL_CHILDREN(rack_misc), 1387 OID_AUTO, "clientlowbuf", CTLFLAG_RW, 1388 &rack_client_low_buf, 0, 1389 "Client low buffer level (below this we are more aggressive in DGP exiting recovery (0 = off)?"); 1390 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1391 SYSCTL_CHILDREN(rack_misc), 1392 OID_AUTO, "defprofile", CTLFLAG_RW, 1393 &rack_def_profile, 0, 1394 "Should RACK use a default profile (0=no, num == profile num)?"); 1395 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1396 SYSCTL_CHILDREN(rack_misc), 1397 OID_AUTO, "cmpack", CTLFLAG_RW, 1398 &rack_use_cmp_acks, 1, 1399 "Should RACK have LRO send compressed acks"); 1400 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1401 SYSCTL_CHILDREN(rack_misc), 1402 OID_AUTO, "fsb", CTLFLAG_RW, 1403 &rack_use_fsb, 1, 1404 "Should RACK use the fast send block?"); 1405 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1406 SYSCTL_CHILDREN(rack_misc), 1407 OID_AUTO, "rfo", CTLFLAG_RW, 1408 &rack_use_rfo, 1, 1409 "Should RACK use rack_fast_output()?"); 1410 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1411 SYSCTL_CHILDREN(rack_misc), 1412 OID_AUTO, "rsmrfo", CTLFLAG_RW, 1413 &rack_use_rsm_rfo, 1, 1414 "Should RACK use rack_fast_rsm_output()?"); 1415 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1416 SYSCTL_CHILDREN(rack_misc), 1417 OID_AUTO, "shared_cwnd", CTLFLAG_RW, 1418 &rack_enable_shared_cwnd, 1, 1419 "Should RACK try to use the shared cwnd on connections where allowed"); 1420 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1421 SYSCTL_CHILDREN(rack_misc), 1422 OID_AUTO, "limits_on_scwnd", CTLFLAG_RW, 1423 &rack_limits_scwnd, 1, 1424 "Should RACK place low end time limits on the shared cwnd feature"); 1425 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1426 SYSCTL_CHILDREN(rack_misc), 1427 OID_AUTO, "non_paced_lro_queue", CTLFLAG_RW, 1428 &rack_enable_mqueue_for_nonpaced, 0, 1429 "Should RACK use mbuf queuing for non-paced connections"); 1430 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1431 SYSCTL_CHILDREN(rack_misc), 1432 OID_AUTO, "iMac_dack", CTLFLAG_RW, 1433 &rack_use_imac_dack, 0, 1434 "Should RACK try to emulate iMac delayed ack"); 1435 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1436 SYSCTL_CHILDREN(rack_misc), 1437 OID_AUTO, "no_prr", CTLFLAG_RW, 1438 &rack_disable_prr, 0, 1439 "Should RACK not use prr and only pace (must have pacing on)"); 1440 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1441 SYSCTL_CHILDREN(rack_misc), 1442 OID_AUTO, "bb_verbose", CTLFLAG_RW, 1443 &rack_verbose_logging, 0, 1444 "Should RACK black box logging be verbose"); 1445 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1446 SYSCTL_CHILDREN(rack_misc), 1447 OID_AUTO, "data_after_close", CTLFLAG_RW, 1448 &rack_ignore_data_after_close, 1, 1449 "Do we hold off sending a RST until all pending data is ack'd"); 1450 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1451 SYSCTL_CHILDREN(rack_misc), 1452 OID_AUTO, "no_sack_needed", CTLFLAG_RW, 1453 &rack_sack_not_required, 1, 1454 "Do we allow rack to run on connections not supporting SACK"); 1455 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1456 SYSCTL_CHILDREN(rack_misc), 1457 OID_AUTO, "prr_sendalot", CTLFLAG_RW, 1458 &rack_send_a_lot_in_prr, 1, 1459 "Send a lot in prr"); 1460 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1461 SYSCTL_CHILDREN(rack_misc), 1462 OID_AUTO, "autoscale", CTLFLAG_RW, 1463 &rack_autosndbuf_inc, 
	    20,
	    "What percentage should rack scale up its snd buffer by?");
	/* Sack Attacker detection stuff */
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_attack),
	    OID_AUTO, "detect_highsackratio", CTLFLAG_RW,
	    &rack_highest_sack_thresh_seen, 0,
	    "Highest sack to ack ratio seen");
	SYSCTL_ADD_U32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_attack),
	    OID_AUTO, "detect_highmoveratio", CTLFLAG_RW,
	    &rack_highest_move_thresh_seen, 0,
	    "Highest move to non-move ratio seen");
	rack_ack_total = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_attack),
	    OID_AUTO, "acktotal", CTLFLAG_RD,
	    &rack_ack_total,
	    "Total number of Ack's");
	rack_express_sack = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_attack),
	    OID_AUTO, "exp_sacktotal", CTLFLAG_RD,
	    &rack_express_sack,
	    "Total number of express Sack's");
	rack_sack_total = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_attack),
	    OID_AUTO, "sacktotal", CTLFLAG_RD,
	    &rack_sack_total,
	    "Total number of SACKs");
	rack_move_none = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_attack),
	    OID_AUTO, "move_none", CTLFLAG_RD,
	    &rack_move_none,
	    "Total number of SACK index reuse of positions under threshold");
	rack_move_some = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_attack),
	    OID_AUTO, "move_some", CTLFLAG_RD,
	    &rack_move_some,
	    "Total number of SACK index reuse of positions over threshold");
	rack_sack_attacks_detected = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_attack),
	    OID_AUTO, "attacks", CTLFLAG_RD,
	    &rack_sack_attacks_detected,
	    "Total number of SACK attackers that had sack disabled");
	rack_sack_attacks_reversed = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_attack),
	    OID_AUTO, "reversed", CTLFLAG_RD,
	    &rack_sack_attacks_reversed,
	    "Total number of SACK attackers that were later determined false positive");
	rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_attack),
	    OID_AUTO, "nextmerge", CTLFLAG_RD,
	    &rack_sack_used_next_merge,
	    "Total number of times we used the next merge");
	rack_sack_used_prev_merge = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_attack),
	    OID_AUTO, "prevmerge", CTLFLAG_RD,
	    &rack_sack_used_prev_merge,
	    "Total number of times we used the prev merge");
	/* Counters */
	rack_fto_send = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "fto_send", CTLFLAG_RD,
	    &rack_fto_send, "Total number of rack_fast_output sends");
	rack_fto_rsm_send = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "fto_rsm_send", CTLFLAG_RD,
	    &rack_fto_rsm_send, "Total number of rack_fast_rsm_output sends");
	rack_nfto_resend = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "nfto_resend", CTLFLAG_RD,
	    &rack_nfto_resend, "Total number of rack_output retransmissions");
	rack_non_fto_send = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "nfto_send", CTLFLAG_RD,
	    &rack_non_fto_send, "Total number of rack_output first sends");
	rack_extended_rfo = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "rfo_extended", CTLFLAG_RD,
	    &rack_extended_rfo, "Total number of times we extended rfo");

	rack_hw_pace_init_fail = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "hwpace_init_fail", CTLFLAG_RD,
	    &rack_hw_pace_init_fail, "Total number of times we failed to initialize hw pacing");
	rack_hw_pace_lost = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "hwpace_lost", CTLFLAG_RD,
	    &rack_hw_pace_lost, "Total number of times we lost hw pacing after it was set up");

	rack_badfr = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "badfr", CTLFLAG_RD,
	    &rack_badfr, "Total number of bad FRs");
	rack_badfr_bytes = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "badfr_bytes", CTLFLAG_RD,
	    &rack_badfr_bytes, "Total bytes of bad FRs");
	rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "prrsndret", CTLFLAG_RD,
	    &rack_rtm_prr_retran,
	    "Total number of prr based retransmits");
	rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "prrsndnew", CTLFLAG_RD,
	    &rack_rtm_prr_newdata,
	    "Total number of prr based new transmits");
	rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "tsnf", CTLFLAG_RD,
	    &rack_timestamp_mismatch,
	    "Total number of times we could not find the reported timestamp");
	rack_find_high = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "findhigh", CTLFLAG_RD,
	    &rack_find_high,
	    "Total number of FINs causing a find-high");
	rack_reorder_seen = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "reordering", CTLFLAG_RD,
	    &rack_reorder_seen,
	    "Total number of times we added delay due to reordering");
	rack_tlp_tot = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "tlp_to_total", CTLFLAG_RD,
	    &rack_tlp_tot,
	    "Total number of tail loss probe expirations");
	rack_tlp_newdata = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "tlp_new", CTLFLAG_RD,
	    &rack_tlp_newdata,
	    "Total number of tail loss probes sending new data");
	rack_tlp_retran = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "tlp_retran", CTLFLAG_RD,
1627 &rack_tlp_retran, 1628 "Total number of tail loss probe sending retransmitted data"); 1629 rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK); 1630 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1631 SYSCTL_CHILDREN(rack_counters), 1632 OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD, 1633 &rack_tlp_retran_bytes, 1634 "Total bytes of tail loss probe sending retransmitted data"); 1635 rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK); 1636 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1637 SYSCTL_CHILDREN(rack_counters), 1638 OID_AUTO, "tlp_retran_fail", CTLFLAG_RD, 1639 &rack_tlp_retran_fail, 1640 "Total number of tail loss probe sending retransmitted data that failed (wait for t3)"); 1641 rack_to_tot = counter_u64_alloc(M_WAITOK); 1642 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1643 SYSCTL_CHILDREN(rack_counters), 1644 OID_AUTO, "rack_to_tot", CTLFLAG_RD, 1645 &rack_to_tot, 1646 "Total number of times the rack to expired"); 1647 rack_to_arm_rack = counter_u64_alloc(M_WAITOK); 1648 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1649 SYSCTL_CHILDREN(rack_counters), 1650 OID_AUTO, "arm_rack", CTLFLAG_RD, 1651 &rack_to_arm_rack, 1652 "Total number of times the rack timer armed"); 1653 rack_to_arm_tlp = counter_u64_alloc(M_WAITOK); 1654 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1655 SYSCTL_CHILDREN(rack_counters), 1656 OID_AUTO, "arm_tlp", CTLFLAG_RD, 1657 &rack_to_arm_tlp, 1658 "Total number of times the tlp timer armed"); 1659 rack_calc_zero = counter_u64_alloc(M_WAITOK); 1660 rack_calc_nonzero = counter_u64_alloc(M_WAITOK); 1661 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1662 SYSCTL_CHILDREN(rack_counters), 1663 OID_AUTO, "calc_zero", CTLFLAG_RD, 1664 &rack_calc_zero, 1665 "Total number of times pacing time worked out to zero"); 1666 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1667 SYSCTL_CHILDREN(rack_counters), 1668 OID_AUTO, "calc_nonzero", CTLFLAG_RD, 1669 &rack_calc_nonzero, 1670 "Total number of times pacing time worked out to non-zero"); 1671 rack_paced_segments = counter_u64_alloc(M_WAITOK); 1672 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1673 SYSCTL_CHILDREN(rack_counters), 1674 OID_AUTO, "paced", CTLFLAG_RD, 1675 &rack_paced_segments, 1676 "Total number of times a segment send caused hptsi"); 1677 rack_unpaced_segments = counter_u64_alloc(M_WAITOK); 1678 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1679 SYSCTL_CHILDREN(rack_counters), 1680 OID_AUTO, "unpaced", CTLFLAG_RD, 1681 &rack_unpaced_segments, 1682 "Total number of times a segment did not cause hptsi"); 1683 rack_saw_enobuf = counter_u64_alloc(M_WAITOK); 1684 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1685 SYSCTL_CHILDREN(rack_counters), 1686 OID_AUTO, "saw_enobufs", CTLFLAG_RD, 1687 &rack_saw_enobuf, 1688 "Total number of times a sends returned enobuf for non-hdwr paced connections"); 1689 rack_saw_enobuf_hw = counter_u64_alloc(M_WAITOK); 1690 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1691 SYSCTL_CHILDREN(rack_counters), 1692 OID_AUTO, "saw_enobufs_hw", CTLFLAG_RD, 1693 &rack_saw_enobuf_hw, 1694 "Total number of times a send returned enobuf for hdwr paced connections"); 1695 rack_saw_enetunreach = counter_u64_alloc(M_WAITOK); 1696 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1697 SYSCTL_CHILDREN(rack_counters), 1698 OID_AUTO, "saw_enetunreach", CTLFLAG_RD, 1699 &rack_saw_enetunreach, 1700 "Total number of times a send received a enetunreachable"); 1701 rack_hot_alloc = counter_u64_alloc(M_WAITOK); 1702 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1703 SYSCTL_CHILDREN(rack_counters), 1704 OID_AUTO, "alloc_hot", CTLFLAG_RD, 1705 &rack_hot_alloc, 1706 "Total 
allocations from the top of our list"); 1707 rack_to_alloc = counter_u64_alloc(M_WAITOK); 1708 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1709 SYSCTL_CHILDREN(rack_counters), 1710 OID_AUTO, "allocs", CTLFLAG_RD, 1711 &rack_to_alloc, 1712 "Total allocations of tracking structures"); 1713 rack_to_alloc_hard = counter_u64_alloc(M_WAITOK); 1714 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1715 SYSCTL_CHILDREN(rack_counters), 1716 OID_AUTO, "allochard", CTLFLAG_RD, 1717 &rack_to_alloc_hard, 1718 "Total allocations done with sleeping the hard way"); 1719 rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK); 1720 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1721 SYSCTL_CHILDREN(rack_counters), 1722 OID_AUTO, "allocemerg", CTLFLAG_RD, 1723 &rack_to_alloc_emerg, 1724 "Total allocations done from emergency cache"); 1725 rack_to_alloc_limited = counter_u64_alloc(M_WAITOK); 1726 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1727 SYSCTL_CHILDREN(rack_counters), 1728 OID_AUTO, "alloc_limited", CTLFLAG_RD, 1729 &rack_to_alloc_limited, 1730 "Total allocations dropped due to limit"); 1731 rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK); 1732 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1733 SYSCTL_CHILDREN(rack_counters), 1734 OID_AUTO, "alloc_limited_conns", CTLFLAG_RD, 1735 &rack_alloc_limited_conns, 1736 "Connections with allocations dropped due to limit"); 1737 rack_split_limited = counter_u64_alloc(M_WAITOK); 1738 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1739 SYSCTL_CHILDREN(rack_counters), 1740 OID_AUTO, "split_limited", CTLFLAG_RD, 1741 &rack_split_limited, 1742 "Split allocations dropped due to limit"); 1743 1744 for (i = 0; i < MAX_NUM_OF_CNTS; i++) { 1745 char name[32]; 1746 sprintf(name, "cmp_ack_cnt_%d", i); 1747 rack_proc_comp_ack[i] = counter_u64_alloc(M_WAITOK); 1748 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1749 SYSCTL_CHILDREN(rack_counters), 1750 OID_AUTO, name, CTLFLAG_RD, 1751 &rack_proc_comp_ack[i], 1752 "Number of compressed acks we processed"); 1753 } 1754 rack_large_ackcmp = counter_u64_alloc(M_WAITOK); 1755 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1756 SYSCTL_CHILDREN(rack_counters), 1757 OID_AUTO, "cmp_large_mbufs", CTLFLAG_RD, 1758 &rack_large_ackcmp, 1759 "Number of TCP connections with large mbuf's for compressed acks"); 1760 rack_small_ackcmp = counter_u64_alloc(M_WAITOK); 1761 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1762 SYSCTL_CHILDREN(rack_counters), 1763 OID_AUTO, "cmp_small_mbufs", CTLFLAG_RD, 1764 &rack_small_ackcmp, 1765 "Number of TCP connections with small mbuf's for compressed acks"); 1766 #ifdef INVARIANTS 1767 rack_adjust_map_bw = counter_u64_alloc(M_WAITOK); 1768 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1769 SYSCTL_CHILDREN(rack_counters), 1770 OID_AUTO, "map_adjust_req", CTLFLAG_RD, 1771 &rack_adjust_map_bw, 1772 "Number of times we hit the case where the sb went up and down on a sendmap entry"); 1773 #endif 1774 rack_multi_single_eq = counter_u64_alloc(M_WAITOK); 1775 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1776 SYSCTL_CHILDREN(rack_counters), 1777 OID_AUTO, "cmp_ack_equiv", CTLFLAG_RD, 1778 &rack_multi_single_eq, 1779 "Number of compressed acks total represented"); 1780 rack_proc_non_comp_ack = counter_u64_alloc(M_WAITOK); 1781 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1782 SYSCTL_CHILDREN(rack_counters), 1783 OID_AUTO, "cmp_ack_not", CTLFLAG_RD, 1784 &rack_proc_non_comp_ack, 1785 "Number of non compresseds acks that we processed"); 1786 1787 1788 rack_sack_proc_all = counter_u64_alloc(M_WAITOK); 1789 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1790 
SYSCTL_CHILDREN(rack_counters), 1791 OID_AUTO, "sack_long", CTLFLAG_RD, 1792 &rack_sack_proc_all, 1793 "Total times we had to walk whole list for sack processing"); 1794 rack_sack_proc_restart = counter_u64_alloc(M_WAITOK); 1795 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1796 SYSCTL_CHILDREN(rack_counters), 1797 OID_AUTO, "sack_restart", CTLFLAG_RD, 1798 &rack_sack_proc_restart, 1799 "Total times we had to walk whole list due to a restart"); 1800 rack_sack_proc_short = counter_u64_alloc(M_WAITOK); 1801 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1802 SYSCTL_CHILDREN(rack_counters), 1803 OID_AUTO, "sack_short", CTLFLAG_RD, 1804 &rack_sack_proc_short, 1805 "Total times we took shortcut for sack processing"); 1806 rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK); 1807 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1808 SYSCTL_CHILDREN(rack_counters), 1809 OID_AUTO, "tlp_calc_entered", CTLFLAG_RD, 1810 &rack_enter_tlp_calc, 1811 "Total times we called calc-tlp"); 1812 rack_used_tlpmethod = counter_u64_alloc(M_WAITOK); 1813 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1814 SYSCTL_CHILDREN(rack_counters), 1815 OID_AUTO, "hit_tlp_method", CTLFLAG_RD, 1816 &rack_used_tlpmethod, 1817 "Total number of runt sacks"); 1818 rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK); 1819 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1820 SYSCTL_CHILDREN(rack_counters), 1821 OID_AUTO, "hit_tlp_method2", CTLFLAG_RD, 1822 &rack_used_tlpmethod2, 1823 "Total number of times we hit TLP method 2"); 1824 rack_sack_skipped_acked = counter_u64_alloc(M_WAITOK); 1825 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1826 SYSCTL_CHILDREN(rack_attack), 1827 OID_AUTO, "skipacked", CTLFLAG_RD, 1828 &rack_sack_skipped_acked, 1829 "Total number of times we skipped previously sacked"); 1830 rack_sack_splits = counter_u64_alloc(M_WAITOK); 1831 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1832 SYSCTL_CHILDREN(rack_attack), 1833 OID_AUTO, "ofsplit", CTLFLAG_RD, 1834 &rack_sack_splits, 1835 "Total number of times we did the old fashion tree split"); 1836 rack_progress_drops = counter_u64_alloc(M_WAITOK); 1837 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1838 SYSCTL_CHILDREN(rack_counters), 1839 OID_AUTO, "prog_drops", CTLFLAG_RD, 1840 &rack_progress_drops, 1841 "Total number of progress drops"); 1842 rack_input_idle_reduces = counter_u64_alloc(M_WAITOK); 1843 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1844 SYSCTL_CHILDREN(rack_counters), 1845 OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD, 1846 &rack_input_idle_reduces, 1847 "Total number of idle reductions on input"); 1848 rack_collapsed_win = counter_u64_alloc(M_WAITOK); 1849 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1850 SYSCTL_CHILDREN(rack_counters), 1851 OID_AUTO, "collapsed_win", CTLFLAG_RD, 1852 &rack_collapsed_win, 1853 "Total number of collapsed windows"); 1854 rack_tlp_does_nada = counter_u64_alloc(M_WAITOK); 1855 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1856 SYSCTL_CHILDREN(rack_counters), 1857 OID_AUTO, "tlp_nada", CTLFLAG_RD, 1858 &rack_tlp_does_nada, 1859 "Total number of nada tlp calls"); 1860 rack_try_scwnd = counter_u64_alloc(M_WAITOK); 1861 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1862 SYSCTL_CHILDREN(rack_counters), 1863 OID_AUTO, "tried_scwnd", CTLFLAG_RD, 1864 &rack_try_scwnd, 1865 "Total number of scwnd attempts"); 1866 1867 rack_per_timer_hole = counter_u64_alloc(M_WAITOK); 1868 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1869 SYSCTL_CHILDREN(rack_counters), 1870 OID_AUTO, "timer_hole", CTLFLAG_RD, 1871 &rack_per_timer_hole, 1872 "Total persists start in timer hole"); 1873 1874 rack_sbsndptr_wrong = 
counter_u64_alloc(M_WAITOK); 1875 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1876 SYSCTL_CHILDREN(rack_counters), 1877 OID_AUTO, "sndptr_wrong", CTLFLAG_RD, 1878 &rack_sbsndptr_wrong, "Total number of times the saved sbsndptr was incorret"); 1879 rack_sbsndptr_right = counter_u64_alloc(M_WAITOK); 1880 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1881 SYSCTL_CHILDREN(rack_counters), 1882 OID_AUTO, "sndptr_right", CTLFLAG_RD, 1883 &rack_sbsndptr_right, "Total number of times the saved sbsndptr was corret"); 1884 1885 COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); 1886 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 1887 OID_AUTO, "outsize", CTLFLAG_RD, 1888 rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes"); 1889 COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK); 1890 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 1891 OID_AUTO, "opts", CTLFLAG_RD, 1892 rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats"); 1893 SYSCTL_ADD_PROC(&rack_sysctl_ctx, 1894 SYSCTL_CHILDREN(rack_sysctl_root), 1895 OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 1896 &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters"); 1897 } 1898 1899 static __inline int 1900 rb_map_cmp(struct rack_sendmap *b, struct rack_sendmap *a) 1901 { 1902 if (SEQ_GEQ(b->r_start, a->r_start) && 1903 SEQ_LT(b->r_start, a->r_end)) { 1904 /* 1905 * The entry b is within the 1906 * block a. i.e.: 1907 * a -- |-------------| 1908 * b -- |----| 1909 * <or> 1910 * b -- |------| 1911 * <or> 1912 * b -- |-----------| 1913 */ 1914 return (0); 1915 } else if (SEQ_GEQ(b->r_start, a->r_end)) { 1916 /* 1917 * b falls as either the next 1918 * sequence block after a so a 1919 * is said to be smaller than b. 1920 * i.e: 1921 * a -- |------| 1922 * b -- |--------| 1923 * or 1924 * b -- |-----| 1925 */ 1926 return (1); 1927 } 1928 /* 1929 * Whats left is where a is 1930 * larger than b. i.e: 1931 * a -- |-------| 1932 * b -- |---| 1933 * or even possibly 1934 * b -- |--------------| 1935 */ 1936 return (-1); 1937 } 1938 1939 RB_PROTOTYPE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp); 1940 RB_GENERATE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp); 1941 1942 static uint32_t 1943 rc_init_window(struct tcp_rack *rack) 1944 { 1945 uint32_t win; 1946 1947 if (rack->rc_init_win == 0) { 1948 /* 1949 * Nothing set by the user, use the system stack 1950 * default. 1951 */ 1952 return (tcp_compute_initwnd(tcp_maxseg(rack->rc_tp))); 1953 } 1954 win = ctf_fixed_maxseg(rack->rc_tp) * rack->rc_init_win; 1955 return (win); 1956 } 1957 1958 static uint64_t 1959 rack_get_fixed_pacing_bw(struct tcp_rack *rack) 1960 { 1961 if (IN_FASTRECOVERY(rack->rc_tp->t_flags)) 1962 return (rack->r_ctl.rc_fixed_pacing_rate_rec); 1963 else if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) 1964 return (rack->r_ctl.rc_fixed_pacing_rate_ss); 1965 else 1966 return (rack->r_ctl.rc_fixed_pacing_rate_ca); 1967 } 1968 1969 static uint64_t 1970 rack_get_bw(struct tcp_rack *rack) 1971 { 1972 if (rack->use_fixed_rate) { 1973 /* Return the fixed pacing rate */ 1974 return (rack_get_fixed_pacing_bw(rack)); 1975 } 1976 if (rack->r_ctl.gp_bw == 0) { 1977 /* 1978 * We have yet no b/w measurement, 1979 * if we have a user set initial bw 1980 * return it. If we don't have that and 1981 * we have an srtt, use the tcp IW (10) to 1982 * calculate a fictional b/w over the SRTT 1983 * which is more or less a guess. 
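		 * As a worked example (illustrative numbers, not measured
		 * values): with tcp_compute_initwnd() returning, say,
		 * 10 * 1460 = 14600 bytes and an SRTT of 50000 usec, the
		 * calculation below gives 14600 * 1000000 / 50000 =
		 * 292000 bytes/sec, i.e. roughly 2.3 Mbps.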
Note 1984 * we don't use our IW from rack on purpose 1985 * so if we have like IW=30, we are not 1986 * calculating a "huge" b/w. 1987 */ 1988 uint64_t bw, srtt; 1989 if (rack->r_ctl.init_rate) 1990 return (rack->r_ctl.init_rate); 1991 1992 /* Has the user set a max peak rate? */ 1993 #ifdef NETFLIX_PEAKRATE 1994 if (rack->rc_tp->t_maxpeakrate) 1995 return (rack->rc_tp->t_maxpeakrate); 1996 #endif 1997 /* Ok lets come up with the IW guess, if we have a srtt */ 1998 if (rack->rc_tp->t_srtt == 0) { 1999 /* 2000 * Go with old pacing method 2001 * i.e. burst mitigation only. 2002 */ 2003 return (0); 2004 } 2005 /* Ok lets get the initial TCP win (not racks) */ 2006 bw = tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)); 2007 srtt = (uint64_t)rack->rc_tp->t_srtt; 2008 bw *= (uint64_t)USECS_IN_SECOND; 2009 bw /= srtt; 2010 if (rack->r_ctl.bw_rate_cap && (bw > rack->r_ctl.bw_rate_cap)) 2011 bw = rack->r_ctl.bw_rate_cap; 2012 return (bw); 2013 } else { 2014 uint64_t bw; 2015 2016 if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) { 2017 /* Averaging is done, we can return the value */ 2018 bw = rack->r_ctl.gp_bw; 2019 } else { 2020 /* Still doing initial average must calculate */ 2021 bw = rack->r_ctl.gp_bw / rack->r_ctl.num_measurements; 2022 } 2023 #ifdef NETFLIX_PEAKRATE 2024 if ((rack->rc_tp->t_maxpeakrate) && 2025 (bw > rack->rc_tp->t_maxpeakrate)) { 2026 /* The user has set a peak rate to pace at 2027 * don't allow us to pace faster than that. 2028 */ 2029 return (rack->rc_tp->t_maxpeakrate); 2030 } 2031 #endif 2032 if (rack->r_ctl.bw_rate_cap && (bw > rack->r_ctl.bw_rate_cap)) 2033 bw = rack->r_ctl.bw_rate_cap; 2034 return (bw); 2035 } 2036 } 2037 2038 static uint16_t 2039 rack_get_output_gain(struct tcp_rack *rack, struct rack_sendmap *rsm) 2040 { 2041 if (rack->use_fixed_rate) { 2042 return (100); 2043 } else if (rack->in_probe_rtt && (rsm == NULL)) 2044 return (rack->r_ctl.rack_per_of_gp_probertt); 2045 else if ((IN_FASTRECOVERY(rack->rc_tp->t_flags) && 2046 rack->r_ctl.rack_per_of_gp_rec)) { 2047 if (rsm) { 2048 /* a retransmission always use the recovery rate */ 2049 return (rack->r_ctl.rack_per_of_gp_rec); 2050 } else if (rack->rack_rec_nonrxt_use_cr) { 2051 /* Directed to use the configured rate */ 2052 goto configured_rate; 2053 } else if (rack->rack_no_prr && 2054 (rack->r_ctl.rack_per_of_gp_rec > 100)) { 2055 /* No PRR, lets just use the b/w estimate only */ 2056 return (100); 2057 } else { 2058 /* 2059 * Here we may have a non-retransmit but we 2060 * have no overrides, so just use the recovery 2061 * rate (prr is in effect). 
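			 * As a concrete illustration (150 is a hypothetical
			 * setting, not a default): rack_per_of_gp_rec = 150
			 * makes rack_get_output_bw() below scale the
			 * estimated b/w by 150/100, i.e. we pace at 1.5x
			 * the measured rate while this percentage applies.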
2062 */ 2063 return (rack->r_ctl.rack_per_of_gp_rec); 2064 } 2065 } 2066 configured_rate: 2067 /* For the configured rate we look at our cwnd vs the ssthresh */ 2068 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) 2069 return (rack->r_ctl.rack_per_of_gp_ss); 2070 else 2071 return (rack->r_ctl.rack_per_of_gp_ca); 2072 } 2073 2074 static void 2075 rack_log_hdwr_pacing(struct tcp_rack *rack, 2076 uint64_t rate, uint64_t hw_rate, int line, 2077 int error, uint16_t mod) 2078 { 2079 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2080 union tcp_log_stackspecific log; 2081 struct timeval tv; 2082 const struct ifnet *ifp; 2083 2084 memset(&log, 0, sizeof(log)); 2085 log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff); 2086 log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff); 2087 if (rack->r_ctl.crte) { 2088 ifp = rack->r_ctl.crte->ptbl->rs_ifp; 2089 } else if (rack->rc_inp->inp_route.ro_nh && 2090 rack->rc_inp->inp_route.ro_nh->nh_ifp) { 2091 ifp = rack->rc_inp->inp_route.ro_nh->nh_ifp; 2092 } else 2093 ifp = NULL; 2094 if (ifp) { 2095 log.u_bbr.flex3 = (((uint64_t)ifp >> 32) & 0x00000000ffffffff); 2096 log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff); 2097 } 2098 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2099 log.u_bbr.bw_inuse = rate; 2100 log.u_bbr.flex5 = line; 2101 log.u_bbr.flex6 = error; 2102 log.u_bbr.flex7 = mod; 2103 log.u_bbr.applimited = rack->r_ctl.rc_pace_max_segs; 2104 log.u_bbr.flex8 = rack->use_fixed_rate; 2105 log.u_bbr.flex8 <<= 1; 2106 log.u_bbr.flex8 |= rack->rack_hdrw_pacing; 2107 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; 2108 log.u_bbr.delRate = rack->r_ctl.crte_prev_rate; 2109 if (rack->r_ctl.crte) 2110 log.u_bbr.cur_del_rate = rack->r_ctl.crte->rate; 2111 else 2112 log.u_bbr.cur_del_rate = 0; 2113 log.u_bbr.rttProp = rack->r_ctl.last_hw_bw_req; 2114 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2115 &rack->rc_inp->inp_socket->so_rcv, 2116 &rack->rc_inp->inp_socket->so_snd, 2117 BBR_LOG_HDWR_PACE, 0, 2118 0, &log, false, &tv); 2119 } 2120 } 2121 2122 static uint64_t 2123 rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm, int *capped) 2124 { 2125 /* 2126 * We allow rack_per_of_gp_xx to dictate our bw rate we want. 2127 */ 2128 uint64_t bw_est, high_rate; 2129 uint64_t gain; 2130 2131 gain = (uint64_t)rack_get_output_gain(rack, rsm); 2132 bw_est = bw * gain; 2133 bw_est /= (uint64_t)100; 2134 /* Never fall below the minimum (def 64kbps) */ 2135 if (bw_est < RACK_MIN_BW) 2136 bw_est = RACK_MIN_BW; 2137 if (rack->r_rack_hw_rate_caps) { 2138 /* Rate caps are in place */ 2139 if (rack->r_ctl.crte != NULL) { 2140 /* We have a hdwr rate already */ 2141 high_rate = tcp_hw_highest_rate(rack->r_ctl.crte); 2142 if (bw_est >= high_rate) { 2143 /* We are capping bw at the highest rate table entry */ 2144 rack_log_hdwr_pacing(rack, 2145 bw_est, high_rate, __LINE__, 2146 0, 3); 2147 bw_est = high_rate; 2148 if (capped) 2149 *capped = 1; 2150 } 2151 } else if ((rack->rack_hdrw_pacing == 0) && 2152 (rack->rack_hdw_pace_ena) && 2153 (rack->rack_attempt_hdwr_pace == 0) && 2154 (rack->rc_inp->inp_route.ro_nh != NULL) && 2155 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 2156 /* 2157 * Special case, we have not yet attempted hardware 2158 * pacing, and yet we may, when we do, find out if we are 2159 * above the highest rate. We need to know the maxbw for the interface 2160 * in question (if it supports ratelimiting). We get back 2161 * a 0, if the interface is not found in the RL lists. 
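			 * To make the clamp concrete (numbers are
			 * illustrative): if the fastest entry the interface
			 * reports is 12500000 bytes/sec (100 Mbps) and
			 * bw_est works out larger, the code below sets
			 * bw_est to that hardware rate and marks *capped so
			 * the caller knows the pacing rate was limited.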
2162 */ 2163 high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp); 2164 if (high_rate) { 2165 /* Yep, we have a rate is it above this rate? */ 2166 if (bw_est > high_rate) { 2167 bw_est = high_rate; 2168 if (capped) 2169 *capped = 1; 2170 } 2171 } 2172 } 2173 } 2174 return (bw_est); 2175 } 2176 2177 static void 2178 rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod) 2179 { 2180 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2181 union tcp_log_stackspecific log; 2182 struct timeval tv; 2183 2184 if ((mod != 1) && (rack_verbose_logging == 0)) { 2185 /* 2186 * We get 3 values currently for mod 2187 * 1 - We are retransmitting and this tells the reason. 2188 * 2 - We are clearing a dup-ack count. 2189 * 3 - We are incrementing a dup-ack count. 2190 * 2191 * The clear/increment are only logged 2192 * if you have BBverbose on. 2193 */ 2194 return; 2195 } 2196 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2197 log.u_bbr.flex1 = tsused; 2198 log.u_bbr.flex2 = thresh; 2199 log.u_bbr.flex3 = rsm->r_flags; 2200 log.u_bbr.flex4 = rsm->r_dupack; 2201 log.u_bbr.flex5 = rsm->r_start; 2202 log.u_bbr.flex6 = rsm->r_end; 2203 log.u_bbr.flex8 = mod; 2204 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 2205 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 2206 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2207 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2208 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2209 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2210 log.u_bbr.pacing_gain = rack->r_must_retran; 2211 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2212 &rack->rc_inp->inp_socket->so_rcv, 2213 &rack->rc_inp->inp_socket->so_snd, 2214 BBR_LOG_SETTINGS_CHG, 0, 2215 0, &log, false, &tv); 2216 } 2217 } 2218 2219 static void 2220 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) 2221 { 2222 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2223 union tcp_log_stackspecific log; 2224 struct timeval tv; 2225 2226 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2227 log.u_bbr.flex1 = rack->rc_tp->t_srtt; 2228 log.u_bbr.flex2 = to; 2229 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; 2230 log.u_bbr.flex4 = slot; 2231 log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot; 2232 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 2233 log.u_bbr.flex7 = rack->rc_in_persist; 2234 log.u_bbr.flex8 = which; 2235 if (rack->rack_no_prr) 2236 log.u_bbr.pkts_out = 0; 2237 else 2238 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 2239 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 2240 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 2241 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2242 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2243 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2244 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2245 log.u_bbr.pacing_gain = rack->r_must_retran; 2246 log.u_bbr.lt_epoch = rack->rc_tp->t_rxtshift; 2247 log.u_bbr.lost = rack_rto_min; 2248 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2249 &rack->rc_inp->inp_socket->so_rcv, 2250 &rack->rc_inp->inp_socket->so_snd, 2251 BBR_LOG_TIMERSTAR, 0, 2252 0, &log, false, &tv); 2253 } 2254 } 2255 2256 static void 2257 rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm) 2258 { 2259 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2260 union tcp_log_stackspecific log; 2261 struct timeval tv; 2262 2263 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2264 log.u_bbr.inhpts = 
rack->rc_inp->inp_in_hpts; 2265 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 2266 log.u_bbr.flex8 = to_num; 2267 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt; 2268 log.u_bbr.flex2 = rack->rc_rack_rtt; 2269 if (rsm == NULL) 2270 log.u_bbr.flex3 = 0; 2271 else 2272 log.u_bbr.flex3 = rsm->r_end - rsm->r_start; 2273 if (rack->rack_no_prr) 2274 log.u_bbr.flex5 = 0; 2275 else 2276 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 2277 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2278 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2279 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2280 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2281 log.u_bbr.pacing_gain = rack->r_must_retran; 2282 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2283 &rack->rc_inp->inp_socket->so_rcv, 2284 &rack->rc_inp->inp_socket->so_snd, 2285 BBR_LOG_RTO, 0, 2286 0, &log, false, &tv); 2287 } 2288 } 2289 2290 static void 2291 rack_log_map_chg(struct tcpcb *tp, struct tcp_rack *rack, 2292 struct rack_sendmap *prev, 2293 struct rack_sendmap *rsm, 2294 struct rack_sendmap *next, 2295 int flag, uint32_t th_ack, int line) 2296 { 2297 if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { 2298 union tcp_log_stackspecific log; 2299 struct timeval tv; 2300 2301 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2302 log.u_bbr.flex8 = flag; 2303 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 2304 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 2305 log.u_bbr.cur_del_rate = (uint64_t)prev; 2306 log.u_bbr.delRate = (uint64_t)rsm; 2307 log.u_bbr.rttProp = (uint64_t)next; 2308 log.u_bbr.flex7 = 0; 2309 if (prev) { 2310 log.u_bbr.flex1 = prev->r_start; 2311 log.u_bbr.flex2 = prev->r_end; 2312 log.u_bbr.flex7 |= 0x4; 2313 } 2314 if (rsm) { 2315 log.u_bbr.flex3 = rsm->r_start; 2316 log.u_bbr.flex4 = rsm->r_end; 2317 log.u_bbr.flex7 |= 0x2; 2318 } 2319 if (next) { 2320 log.u_bbr.flex5 = next->r_start; 2321 log.u_bbr.flex6 = next->r_end; 2322 log.u_bbr.flex7 |= 0x1; 2323 } 2324 log.u_bbr.applimited = line; 2325 log.u_bbr.pkts_out = th_ack; 2326 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2327 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2328 if (rack->rack_no_prr) 2329 log.u_bbr.lost = 0; 2330 else 2331 log.u_bbr.lost = rack->r_ctl.rc_prr_sndcnt; 2332 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2333 &rack->rc_inp->inp_socket->so_rcv, 2334 &rack->rc_inp->inp_socket->so_snd, 2335 TCP_LOG_MAPCHG, 0, 2336 0, &log, false, &tv); 2337 } 2338 } 2339 2340 static void 2341 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t len, 2342 struct rack_sendmap *rsm, int conf) 2343 { 2344 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 2345 union tcp_log_stackspecific log; 2346 struct timeval tv; 2347 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2348 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 2349 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 2350 log.u_bbr.flex1 = t; 2351 log.u_bbr.flex2 = len; 2352 log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt; 2353 log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest; 2354 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest; 2355 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_us_rtrcnt; 2356 log.u_bbr.flex7 = conf; 2357 log.u_bbr.rttProp = (uint64_t)rack->r_ctl.rack_rs.rs_rtt_tot; 2358 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method; 2359 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2360 log.u_bbr.delivered = rack->r_ctl.rack_rs.rs_us_rtrcnt; 2361 log.u_bbr.pkts_out = rack->r_ctl.rack_rs.rs_flags; 2362 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, 
rack->r_ctl.rc_sacked); 2363 if (rsm) { 2364 log.u_bbr.pkt_epoch = rsm->r_start; 2365 log.u_bbr.lost = rsm->r_end; 2366 log.u_bbr.cwnd_gain = rsm->r_rtr_cnt; 2367 log.u_bbr.pacing_gain = rsm->r_flags; 2368 } else { 2369 /* Its a SYN */ 2370 log.u_bbr.pkt_epoch = rack->rc_tp->iss; 2371 log.u_bbr.lost = 0; 2372 log.u_bbr.cwnd_gain = 0; 2373 log.u_bbr.pacing_gain = 0; 2374 } 2375 /* Write out general bits of interest rrs here */ 2376 log.u_bbr.use_lt_bw = rack->rc_highly_buffered; 2377 log.u_bbr.use_lt_bw <<= 1; 2378 log.u_bbr.use_lt_bw |= rack->forced_ack; 2379 log.u_bbr.use_lt_bw <<= 1; 2380 log.u_bbr.use_lt_bw |= rack->rc_gp_dyn_mul; 2381 log.u_bbr.use_lt_bw <<= 1; 2382 log.u_bbr.use_lt_bw |= rack->in_probe_rtt; 2383 log.u_bbr.use_lt_bw <<= 1; 2384 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; 2385 log.u_bbr.use_lt_bw <<= 1; 2386 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set; 2387 log.u_bbr.use_lt_bw <<= 1; 2388 log.u_bbr.use_lt_bw |= rack->rc_gp_filled; 2389 log.u_bbr.use_lt_bw <<= 1; 2390 log.u_bbr.use_lt_bw |= rack->rc_dragged_bottom; 2391 log.u_bbr.applimited = rack->r_ctl.rc_target_probertt_flight; 2392 log.u_bbr.epoch = rack->r_ctl.rc_time_probertt_starts; 2393 log.u_bbr.lt_epoch = rack->r_ctl.rc_time_probertt_entered; 2394 log.u_bbr.cur_del_rate = rack->r_ctl.rc_lower_rtt_us_cts; 2395 log.u_bbr.delRate = rack->r_ctl.rc_gp_srtt; 2396 log.u_bbr.bw_inuse = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 2397 log.u_bbr.bw_inuse <<= 32; 2398 if (rsm) 2399 log.u_bbr.bw_inuse |= ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]); 2400 TCP_LOG_EVENTP(tp, NULL, 2401 &rack->rc_inp->inp_socket->so_rcv, 2402 &rack->rc_inp->inp_socket->so_snd, 2403 BBR_LOG_BBRRTT, 0, 2404 0, &log, false, &tv); 2405 2406 2407 } 2408 } 2409 2410 static void 2411 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt) 2412 { 2413 /* 2414 * Log the rtt sample we are 2415 * applying to the srtt algorithm in 2416 * useconds. 2417 */ 2418 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2419 union tcp_log_stackspecific log; 2420 struct timeval tv; 2421 2422 /* Convert our ms to a microsecond */ 2423 memset(&log, 0, sizeof(log)); 2424 log.u_bbr.flex1 = rtt; 2425 log.u_bbr.flex2 = rack->r_ctl.ack_count; 2426 log.u_bbr.flex3 = rack->r_ctl.sack_count; 2427 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; 2428 log.u_bbr.flex5 = rack->r_ctl.sack_moved_extra; 2429 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 2430 log.u_bbr.flex7 = 1; 2431 log.u_bbr.flex8 = rack->sack_attack_disable; 2432 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2433 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2434 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2435 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2436 log.u_bbr.pacing_gain = rack->r_must_retran; 2437 /* 2438 * We capture in delRate the upper 32 bits as 2439 * the confidence level we had declared, and the 2440 * lower 32 bits as the actual RTT using the arrival 2441 * timestamp. 
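		 * For example (values are illustrative), a confidence of 1
		 * and an rs_us_rtt of 2500 usec end up encoded below as
		 * (1ULL << 32) | 2500.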
2442 */ 2443 log.u_bbr.delRate = rack->r_ctl.rack_rs.confidence; 2444 log.u_bbr.delRate <<= 32; 2445 log.u_bbr.delRate |= rack->r_ctl.rack_rs.rs_us_rtt; 2446 /* Lets capture all the things that make up t_rtxcur */ 2447 log.u_bbr.applimited = rack_rto_min; 2448 log.u_bbr.epoch = rack_rto_max; 2449 log.u_bbr.lt_epoch = rack->r_ctl.timer_slop; 2450 log.u_bbr.lost = rack_rto_min; 2451 log.u_bbr.pkt_epoch = TICKS_2_USEC(tcp_rexmit_slop); 2452 log.u_bbr.rttProp = RACK_REXMTVAL(rack->rc_tp); 2453 log.u_bbr.bw_inuse = rack->r_ctl.act_rcv_time.tv_sec; 2454 log.u_bbr.bw_inuse *= HPTS_USEC_IN_SEC; 2455 log.u_bbr.bw_inuse += rack->r_ctl.act_rcv_time.tv_usec; 2456 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2457 &rack->rc_inp->inp_socket->so_rcv, 2458 &rack->rc_inp->inp_socket->so_snd, 2459 TCP_LOG_RTT, 0, 2460 0, &log, false, &tv); 2461 } 2462 } 2463 2464 static void 2465 rack_log_rtt_sample_calc(struct tcp_rack *rack, uint32_t rtt, uint32_t send_time, uint32_t ack_time, int where) 2466 { 2467 if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { 2468 union tcp_log_stackspecific log; 2469 struct timeval tv; 2470 2471 /* Convert our ms to a microsecond */ 2472 memset(&log, 0, sizeof(log)); 2473 log.u_bbr.flex1 = rtt; 2474 log.u_bbr.flex2 = send_time; 2475 log.u_bbr.flex3 = ack_time; 2476 log.u_bbr.flex4 = where; 2477 log.u_bbr.flex7 = 2; 2478 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2479 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2480 &rack->rc_inp->inp_socket->so_rcv, 2481 &rack->rc_inp->inp_socket->so_snd, 2482 TCP_LOG_RTT, 0, 2483 0, &log, false, &tv); 2484 } 2485 } 2486 2487 2488 2489 static inline void 2490 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line) 2491 { 2492 if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { 2493 union tcp_log_stackspecific log; 2494 struct timeval tv; 2495 2496 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2497 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 2498 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 2499 log.u_bbr.flex1 = line; 2500 log.u_bbr.flex2 = tick; 2501 log.u_bbr.flex3 = tp->t_maxunacktime; 2502 log.u_bbr.flex4 = tp->t_acktime; 2503 log.u_bbr.flex8 = event; 2504 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2505 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2506 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2507 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2508 log.u_bbr.pacing_gain = rack->r_must_retran; 2509 TCP_LOG_EVENTP(tp, NULL, 2510 &rack->rc_inp->inp_socket->so_rcv, 2511 &rack->rc_inp->inp_socket->so_snd, 2512 BBR_LOG_PROGRESS, 0, 2513 0, &log, false, &tv); 2514 } 2515 } 2516 2517 static void 2518 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv) 2519 { 2520 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2521 union tcp_log_stackspecific log; 2522 2523 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2524 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 2525 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 2526 log.u_bbr.flex1 = slot; 2527 if (rack->rack_no_prr) 2528 log.u_bbr.flex2 = 0; 2529 else 2530 log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt; 2531 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags); 2532 log.u_bbr.flex8 = rack->rc_in_persist; 2533 log.u_bbr.timeStamp = cts; 2534 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2535 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2536 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2537 
log.u_bbr.pacing_gain = rack->r_must_retran; 2538 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2539 &rack->rc_inp->inp_socket->so_rcv, 2540 &rack->rc_inp->inp_socket->so_snd, 2541 BBR_LOG_BBRSND, 0, 2542 0, &log, false, tv); 2543 } 2544 } 2545 2546 static void 2547 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out, int nsegs) 2548 { 2549 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2550 union tcp_log_stackspecific log; 2551 struct timeval tv; 2552 2553 memset(&log, 0, sizeof(log)); 2554 log.u_bbr.flex1 = did_out; 2555 log.u_bbr.flex2 = nxt_pkt; 2556 log.u_bbr.flex3 = way_out; 2557 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 2558 if (rack->rack_no_prr) 2559 log.u_bbr.flex5 = 0; 2560 else 2561 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 2562 log.u_bbr.flex6 = nsegs; 2563 log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs; 2564 log.u_bbr.flex7 = rack->rc_ack_can_sendout_data; /* Do we have ack-can-send set */ 2565 log.u_bbr.flex7 <<= 1; 2566 log.u_bbr.flex7 |= rack->r_fast_output; /* is fast output primed */ 2567 log.u_bbr.flex7 <<= 1; 2568 log.u_bbr.flex7 |= rack->r_wanted_output; /* Do we want output */ 2569 log.u_bbr.flex8 = rack->rc_in_persist; 2570 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 2571 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2572 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2573 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 2574 log.u_bbr.use_lt_bw <<= 1; 2575 log.u_bbr.use_lt_bw |= rack->r_might_revert; 2576 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2577 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2578 log.u_bbr.pacing_gain = rack->r_must_retran; 2579 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2580 &rack->rc_inp->inp_socket->so_rcv, 2581 &rack->rc_inp->inp_socket->so_snd, 2582 BBR_LOG_DOSEG_DONE, 0, 2583 0, &log, false, &tv); 2584 } 2585 } 2586 2587 static void 2588 rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint8_t frm) 2589 { 2590 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 2591 union tcp_log_stackspecific log; 2592 struct timeval tv; 2593 uint32_t cts; 2594 2595 memset(&log, 0, sizeof(log)); 2596 cts = tcp_get_usecs(&tv); 2597 log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs; 2598 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 2599 log.u_bbr.flex4 = arg1; 2600 log.u_bbr.flex5 = arg2; 2601 log.u_bbr.flex6 = arg3; 2602 log.u_bbr.flex8 = frm; 2603 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2604 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2605 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2606 log.u_bbr.applimited = rack->r_ctl.rc_sacked; 2607 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2608 log.u_bbr.pacing_gain = rack->r_must_retran; 2609 TCP_LOG_EVENTP(tp, NULL, 2610 &tp->t_inpcb->inp_socket->so_rcv, 2611 &tp->t_inpcb->inp_socket->so_snd, 2612 TCP_HDWR_PACE_SIZE, 0, 2613 0, &log, false, &tv); 2614 } 2615 } 2616 2617 static void 2618 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, 2619 uint8_t hpts_calling, int reason, uint32_t cwnd_to_use) 2620 { 2621 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2622 union tcp_log_stackspecific log; 2623 struct timeval tv; 2624 2625 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2626 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 2627 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 2628 log.u_bbr.flex1 = slot; 2629 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; 2630 log.u_bbr.flex4 = reason; 2631 if 
(rack->rack_no_prr) 2632 log.u_bbr.flex5 = 0; 2633 else 2634 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 2635 log.u_bbr.flex7 = hpts_calling; 2636 log.u_bbr.flex8 = rack->rc_in_persist; 2637 log.u_bbr.lt_epoch = cwnd_to_use; 2638 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2639 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2640 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2641 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2642 log.u_bbr.pacing_gain = rack->r_must_retran; 2643 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2644 &rack->rc_inp->inp_socket->so_rcv, 2645 &rack->rc_inp->inp_socket->so_snd, 2646 BBR_LOG_JUSTRET, 0, 2647 tlen, &log, false, &tv); 2648 } 2649 } 2650 2651 static void 2652 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32_t us_cts, 2653 struct timeval *tv, uint32_t flags_on_entry) 2654 { 2655 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2656 union tcp_log_stackspecific log; 2657 2658 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2659 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 2660 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 2661 log.u_bbr.flex1 = line; 2662 log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to; 2663 log.u_bbr.flex3 = flags_on_entry; 2664 log.u_bbr.flex4 = us_cts; 2665 if (rack->rack_no_prr) 2666 log.u_bbr.flex5 = 0; 2667 else 2668 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 2669 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 2670 log.u_bbr.flex7 = hpts_removed; 2671 log.u_bbr.flex8 = 1; 2672 log.u_bbr.applimited = rack->r_ctl.rc_hpts_flags; 2673 log.u_bbr.timeStamp = us_cts; 2674 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2675 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2676 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2677 log.u_bbr.pacing_gain = rack->r_must_retran; 2678 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2679 &rack->rc_inp->inp_socket->so_rcv, 2680 &rack->rc_inp->inp_socket->so_snd, 2681 BBR_LOG_TIMERCANC, 0, 2682 0, &log, false, tv); 2683 } 2684 } 2685 2686 static void 2687 rack_log_alt_to_to_cancel(struct tcp_rack *rack, 2688 uint32_t flex1, uint32_t flex2, 2689 uint32_t flex3, uint32_t flex4, 2690 uint32_t flex5, uint32_t flex6, 2691 uint16_t flex7, uint8_t mod) 2692 { 2693 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2694 union tcp_log_stackspecific log; 2695 struct timeval tv; 2696 2697 if (mod == 1) { 2698 /* No you can't use 1, its for the real to cancel */ 2699 return; 2700 } 2701 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2702 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2703 log.u_bbr.flex1 = flex1; 2704 log.u_bbr.flex2 = flex2; 2705 log.u_bbr.flex3 = flex3; 2706 log.u_bbr.flex4 = flex4; 2707 log.u_bbr.flex5 = flex5; 2708 log.u_bbr.flex6 = flex6; 2709 log.u_bbr.flex7 = flex7; 2710 log.u_bbr.flex8 = mod; 2711 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2712 &rack->rc_inp->inp_socket->so_rcv, 2713 &rack->rc_inp->inp_socket->so_snd, 2714 BBR_LOG_TIMERCANC, 0, 2715 0, &log, false, &tv); 2716 } 2717 } 2718 2719 static void 2720 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers) 2721 { 2722 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2723 union tcp_log_stackspecific log; 2724 struct timeval tv; 2725 2726 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2727 log.u_bbr.flex1 = timers; 2728 log.u_bbr.flex2 = ret; 2729 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp; 2730 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 2731 log.u_bbr.flex5 = cts; 2732 if (rack->rack_no_prr) 2733 log.u_bbr.flex6 = 0; 2734 else 2735 log.u_bbr.flex6 
= rack->r_ctl.rc_prr_sndcnt; 2736 log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; 2737 log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; 2738 log.u_bbr.pacing_gain = rack->r_must_retran; 2739 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2740 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2741 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2742 &rack->rc_inp->inp_socket->so_rcv, 2743 &rack->rc_inp->inp_socket->so_snd, 2744 BBR_LOG_TO_PROCESS, 0, 2745 0, &log, false, &tv); 2746 } 2747 } 2748 2749 static void 2750 rack_log_to_prr(struct tcp_rack *rack, int frm, int orig_cwnd) 2751 { 2752 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2753 union tcp_log_stackspecific log; 2754 struct timeval tv; 2755 2756 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2757 log.u_bbr.flex1 = rack->r_ctl.rc_prr_out; 2758 log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs; 2759 if (rack->rack_no_prr) 2760 log.u_bbr.flex3 = 0; 2761 else 2762 log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt; 2763 log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered; 2764 log.u_bbr.flex5 = rack->r_ctl.rc_sacked; 2765 log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt; 2766 log.u_bbr.flex8 = frm; 2767 log.u_bbr.pkts_out = orig_cwnd; 2768 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2769 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2770 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 2771 log.u_bbr.use_lt_bw <<= 1; 2772 log.u_bbr.use_lt_bw |= rack->r_might_revert; 2773 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2774 &rack->rc_inp->inp_socket->so_rcv, 2775 &rack->rc_inp->inp_socket->so_snd, 2776 BBR_LOG_BBRUPD, 0, 2777 0, &log, false, &tv); 2778 } 2779 } 2780 2781 #ifdef NETFLIX_EXP_DETECTION 2782 static void 2783 rack_log_sad(struct tcp_rack *rack, int event) 2784 { 2785 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2786 union tcp_log_stackspecific log; 2787 struct timeval tv; 2788 2789 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2790 log.u_bbr.flex1 = rack->r_ctl.sack_count; 2791 log.u_bbr.flex2 = rack->r_ctl.ack_count; 2792 log.u_bbr.flex3 = rack->r_ctl.sack_moved_extra; 2793 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; 2794 log.u_bbr.flex5 = rack->r_ctl.rc_num_maps_alloced; 2795 log.u_bbr.flex6 = tcp_sack_to_ack_thresh; 2796 log.u_bbr.pkts_out = tcp_sack_to_move_thresh; 2797 log.u_bbr.lt_epoch = (tcp_force_detection << 8); 2798 log.u_bbr.lt_epoch |= rack->do_detection; 2799 log.u_bbr.applimited = tcp_map_minimum; 2800 log.u_bbr.flex7 = rack->sack_attack_disable; 2801 log.u_bbr.flex8 = event; 2802 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2803 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2804 log.u_bbr.delivered = tcp_sad_decay_val; 2805 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2806 &rack->rc_inp->inp_socket->so_rcv, 2807 &rack->rc_inp->inp_socket->so_snd, 2808 TCP_SAD_DETECTION, 0, 2809 0, &log, false, &tv); 2810 } 2811 } 2812 #endif 2813 2814 static void 2815 rack_counter_destroy(void) 2816 { 2817 int i; 2818 2819 counter_u64_free(rack_fto_send); 2820 counter_u64_free(rack_fto_rsm_send); 2821 counter_u64_free(rack_nfto_resend); 2822 counter_u64_free(rack_hw_pace_init_fail); 2823 counter_u64_free(rack_hw_pace_lost); 2824 counter_u64_free(rack_non_fto_send); 2825 counter_u64_free(rack_extended_rfo); 2826 counter_u64_free(rack_ack_total); 2827 counter_u64_free(rack_express_sack); 2828 counter_u64_free(rack_sack_total); 2829 counter_u64_free(rack_move_none); 2830 counter_u64_free(rack_move_some); 2831 counter_u64_free(rack_sack_attacks_detected); 2832 counter_u64_free(rack_sack_attacks_reversed); 2833 
counter_u64_free(rack_sack_used_next_merge); 2834 counter_u64_free(rack_sack_used_prev_merge); 2835 counter_u64_free(rack_badfr); 2836 counter_u64_free(rack_badfr_bytes); 2837 counter_u64_free(rack_rtm_prr_retran); 2838 counter_u64_free(rack_rtm_prr_newdata); 2839 counter_u64_free(rack_timestamp_mismatch); 2840 counter_u64_free(rack_find_high); 2841 counter_u64_free(rack_reorder_seen); 2842 counter_u64_free(rack_tlp_tot); 2843 counter_u64_free(rack_tlp_newdata); 2844 counter_u64_free(rack_tlp_retran); 2845 counter_u64_free(rack_tlp_retran_bytes); 2846 counter_u64_free(rack_tlp_retran_fail); 2847 counter_u64_free(rack_to_tot); 2848 counter_u64_free(rack_to_arm_rack); 2849 counter_u64_free(rack_to_arm_tlp); 2850 counter_u64_free(rack_calc_zero); 2851 counter_u64_free(rack_calc_nonzero); 2852 counter_u64_free(rack_paced_segments); 2853 counter_u64_free(rack_unpaced_segments); 2854 counter_u64_free(rack_saw_enobuf); 2855 counter_u64_free(rack_saw_enobuf_hw); 2856 counter_u64_free(rack_saw_enetunreach); 2857 counter_u64_free(rack_hot_alloc); 2858 counter_u64_free(rack_to_alloc); 2859 counter_u64_free(rack_to_alloc_hard); 2860 counter_u64_free(rack_to_alloc_emerg); 2861 counter_u64_free(rack_to_alloc_limited); 2862 counter_u64_free(rack_alloc_limited_conns); 2863 counter_u64_free(rack_split_limited); 2864 for (i = 0; i < MAX_NUM_OF_CNTS; i++) { 2865 counter_u64_free(rack_proc_comp_ack[i]); 2866 } 2867 counter_u64_free(rack_multi_single_eq); 2868 counter_u64_free(rack_proc_non_comp_ack); 2869 counter_u64_free(rack_sack_proc_all); 2870 counter_u64_free(rack_sack_proc_restart); 2871 counter_u64_free(rack_sack_proc_short); 2872 counter_u64_free(rack_enter_tlp_calc); 2873 counter_u64_free(rack_used_tlpmethod); 2874 counter_u64_free(rack_used_tlpmethod2); 2875 counter_u64_free(rack_sack_skipped_acked); 2876 counter_u64_free(rack_sack_splits); 2877 counter_u64_free(rack_progress_drops); 2878 counter_u64_free(rack_input_idle_reduces); 2879 counter_u64_free(rack_collapsed_win); 2880 counter_u64_free(rack_tlp_does_nada); 2881 counter_u64_free(rack_try_scwnd); 2882 counter_u64_free(rack_per_timer_hole); 2883 counter_u64_free(rack_large_ackcmp); 2884 counter_u64_free(rack_small_ackcmp); 2885 #ifdef INVARIANTS 2886 counter_u64_free(rack_adjust_map_bw); 2887 #endif 2888 COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE); 2889 COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE); 2890 } 2891 2892 static struct rack_sendmap * 2893 rack_alloc(struct tcp_rack *rack) 2894 { 2895 struct rack_sendmap *rsm; 2896 2897 /* 2898 * First get the top of the list it in 2899 * theory is the "hottest" rsm we have, 2900 * possibly just freed by ack processing. 2901 */ 2902 if (rack->rc_free_cnt > rack_free_cache) { 2903 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 2904 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 2905 counter_u64_add(rack_hot_alloc, 1); 2906 rack->rc_free_cnt--; 2907 return (rsm); 2908 } 2909 /* 2910 * Once we get under our free cache we probably 2911 * no longer have a "hot" one available. Lets 2912 * get one from UMA. 2913 */ 2914 rsm = uma_zalloc(rack_zone, M_NOWAIT); 2915 if (rsm) { 2916 rack->r_ctl.rc_num_maps_alloced++; 2917 counter_u64_add(rack_to_alloc, 1); 2918 return (rsm); 2919 } 2920 /* 2921 * Dig in to our aux rsm's (the last two) since 2922 * UMA failed to get us one. 
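	 * (rack_free() parks freed entries at the head of rc_free and
	 * rack_free_trim() only trims the list back down to rack_free_cache
	 * entries, so a small reserve normally remains for this path.)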
2923 */ 2924 if (rack->rc_free_cnt) { 2925 counter_u64_add(rack_to_alloc_emerg, 1); 2926 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 2927 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 2928 rack->rc_free_cnt--; 2929 return (rsm); 2930 } 2931 return (NULL); 2932 } 2933 2934 static struct rack_sendmap * 2935 rack_alloc_full_limit(struct tcp_rack *rack) 2936 { 2937 if ((V_tcp_map_entries_limit > 0) && 2938 (rack->do_detection == 0) && 2939 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 2940 counter_u64_add(rack_to_alloc_limited, 1); 2941 if (!rack->alloc_limit_reported) { 2942 rack->alloc_limit_reported = 1; 2943 counter_u64_add(rack_alloc_limited_conns, 1); 2944 } 2945 return (NULL); 2946 } 2947 return (rack_alloc(rack)); 2948 } 2949 2950 /* wrapper to allocate a sendmap entry, subject to a specific limit */ 2951 static struct rack_sendmap * 2952 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type) 2953 { 2954 struct rack_sendmap *rsm; 2955 2956 if (limit_type) { 2957 /* currently there is only one limit type */ 2958 if (V_tcp_map_split_limit > 0 && 2959 (rack->do_detection == 0) && 2960 rack->r_ctl.rc_num_split_allocs >= V_tcp_map_split_limit) { 2961 counter_u64_add(rack_split_limited, 1); 2962 if (!rack->alloc_limit_reported) { 2963 rack->alloc_limit_reported = 1; 2964 counter_u64_add(rack_alloc_limited_conns, 1); 2965 } 2966 return (NULL); 2967 } 2968 } 2969 2970 /* allocate and mark in the limit type, if set */ 2971 rsm = rack_alloc(rack); 2972 if (rsm != NULL && limit_type) { 2973 rsm->r_limit_type = limit_type; 2974 rack->r_ctl.rc_num_split_allocs++; 2975 } 2976 return (rsm); 2977 } 2978 2979 static void 2980 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) 2981 { 2982 if (rsm->r_flags & RACK_APP_LIMITED) { 2983 if (rack->r_ctl.rc_app_limited_cnt > 0) { 2984 rack->r_ctl.rc_app_limited_cnt--; 2985 } 2986 } 2987 if (rsm->r_limit_type) { 2988 /* currently there is only one limit type */ 2989 rack->r_ctl.rc_num_split_allocs--; 2990 } 2991 if (rsm == rack->r_ctl.rc_first_appl) { 2992 if (rack->r_ctl.rc_app_limited_cnt == 0) 2993 rack->r_ctl.rc_first_appl = NULL; 2994 else { 2995 /* Follow the next one out */ 2996 struct rack_sendmap fe; 2997 2998 fe.r_start = rsm->r_nseq_appl; 2999 rack->r_ctl.rc_first_appl = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 3000 } 3001 } 3002 if (rsm == rack->r_ctl.rc_resend) 3003 rack->r_ctl.rc_resend = NULL; 3004 if (rsm == rack->r_ctl.rc_rsm_at_retran) 3005 rack->r_ctl.rc_rsm_at_retran = NULL; 3006 if (rsm == rack->r_ctl.rc_end_appl) 3007 rack->r_ctl.rc_end_appl = NULL; 3008 if (rack->r_ctl.rc_tlpsend == rsm) 3009 rack->r_ctl.rc_tlpsend = NULL; 3010 if (rack->r_ctl.rc_sacklast == rsm) 3011 rack->r_ctl.rc_sacklast = NULL; 3012 memset(rsm, 0, sizeof(struct rack_sendmap)); 3013 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_free, rsm, r_tnext); 3014 rack->rc_free_cnt++; 3015 } 3016 3017 static void 3018 rack_free_trim(struct tcp_rack *rack) 3019 { 3020 struct rack_sendmap *rsm; 3021 3022 /* 3023 * Free up all the tail entries until 3024 * we get our list down to the limit. 
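 * For example (illustrative numbers): with rack_free_cache set to 2
 * and rc_free_cnt at 5, the three oldest tail entries go back to UMA
 * and the two hottest entries stay cached for rack_alloc().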
3025 */ 3026 while (rack->rc_free_cnt > rack_free_cache) { 3027 rsm = TAILQ_LAST(&rack->r_ctl.rc_free, rack_head); 3028 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 3029 rack->rc_free_cnt--; 3030 uma_zfree(rack_zone, rsm); 3031 } 3032 } 3033 3034 3035 static uint32_t 3036 rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack) 3037 { 3038 uint64_t srtt, bw, len, tim; 3039 uint32_t segsiz, def_len, minl; 3040 3041 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 3042 def_len = rack_def_data_window * segsiz; 3043 if (rack->rc_gp_filled == 0) { 3044 /* 3045 * We have no measurement (IW is in flight?) so 3046 * we can only guess using our data_window sysctl 3047 * value (usually 100MSS). 3048 */ 3049 return (def_len); 3050 } 3051 /* 3052 * Now we have a number of factors to consider. 3053 * 3054 * 1) We have a desired BDP which is usually 3055 * at least 2. 3056 * 2) We have a minimum number of rtt's usually 1 SRTT 3057 * but we allow it too to be more. 3058 * 3) We want to make sure a measurement last N useconds (if 3059 * we have set rack_min_measure_usec. 3060 * 3061 * We handle the first concern here by trying to create a data 3062 * window of max(rack_def_data_window, DesiredBDP). The 3063 * second concern we handle in not letting the measurement 3064 * window end normally until at least the required SRTT's 3065 * have gone by which is done further below in 3066 * rack_enough_for_measurement(). Finally the third concern 3067 * we also handle here by calculating how long that time 3068 * would take at the current BW and then return the 3069 * max of our first calculation and that length. Note 3070 * that if rack_min_measure_usec is 0, we don't deal 3071 * with concern 3. Also for both Concern 1 and 3 an 3072 * application limited period could end the measurement 3073 * earlier. 3074 * 3075 * So lets calculate the BDP with the "known" b/w using 3076 * the SRTT has our rtt and then multiply it by the 3077 * goal. 3078 */ 3079 bw = rack_get_bw(rack); 3080 srtt = (uint64_t)tp->t_srtt; 3081 len = bw * srtt; 3082 len /= (uint64_t)HPTS_USEC_IN_SEC; 3083 len *= max(1, rack_goal_bdp); 3084 /* Now we need to round up to the nearest MSS */ 3085 len = roundup(len, segsiz); 3086 if (rack_min_measure_usec) { 3087 /* Now calculate our min length for this b/w */ 3088 tim = rack_min_measure_usec; 3089 minl = (tim * bw) / (uint64_t)HPTS_USEC_IN_SEC; 3090 if (minl == 0) 3091 minl = 1; 3092 minl = roundup(minl, segsiz); 3093 if (len < minl) 3094 len = minl; 3095 } 3096 /* 3097 * Now if we have a very small window we want 3098 * to attempt to get the window that is 3099 * as small as possible. This happens on 3100 * low b/w connections and we don't want to 3101 * span huge numbers of rtt's between measurements. 3102 * 3103 * We basically include 2 over our "MIN window" so 3104 * that the measurement can be shortened (possibly) by 3105 * an ack'ed packet. 3106 */ 3107 if (len < def_len) 3108 return (max((uint32_t)len, ((MIN_GP_WIN+2) * segsiz))); 3109 else 3110 return (max((uint32_t)len, def_len)); 3111 3112 } 3113 3114 static int 3115 rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack) 3116 { 3117 uint32_t tim, srtts, segsiz; 3118 3119 /* 3120 * Has enough time passed for the GP measurement to be valid? 
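 * In rough order the checks below are:
 * 1) everything (or exactly up to snd_max) is acked        -> yes
 * 2) th_ack has not yet reached gput_seq                   -> no
 * 3) gput_ack not reached and fewer than
 *    max(IW, MIN_GP_WIN * segsiz) bytes acked              -> no
 * 4) th_ack sits exactly at the app limited point          -> yes
 * 5) otherwise yes only once rack_min_srtts worth of
 *    gp_srtt time has elapsed since gput_ts.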
3121 */ 3122 if ((tp->snd_max == tp->snd_una) || 3123 (th_ack == tp->snd_max)){ 3124 /* All is acked */ 3125 return (1); 3126 } 3127 if (SEQ_LT(th_ack, tp->gput_seq)) { 3128 /* Not enough bytes yet */ 3129 return (0); 3130 } 3131 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 3132 if (SEQ_LT(th_ack, tp->gput_ack) && 3133 ((th_ack - tp->gput_seq) < max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { 3134 /* Not enough bytes yet */ 3135 return (0); 3136 } 3137 if (rack->r_ctl.rc_first_appl && 3138 (rack->r_ctl.rc_first_appl->r_start == th_ack)) { 3139 /* 3140 * We are up to the app limited point 3141 * we have to measure irrespective of the time.. 3142 */ 3143 return (1); 3144 } 3145 /* Now what about time? */ 3146 srtts = (rack->r_ctl.rc_gp_srtt * rack_min_srtts); 3147 tim = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - tp->gput_ts; 3148 if (tim >= srtts) { 3149 return (1); 3150 } 3151 /* Nope not even a full SRTT has passed */ 3152 return (0); 3153 } 3154 3155 static void 3156 rack_log_timely(struct tcp_rack *rack, 3157 uint32_t logged, uint64_t cur_bw, uint64_t low_bnd, 3158 uint64_t up_bnd, int line, uint8_t method) 3159 { 3160 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 3161 union tcp_log_stackspecific log; 3162 struct timeval tv; 3163 3164 memset(&log, 0, sizeof(log)); 3165 log.u_bbr.flex1 = logged; 3166 log.u_bbr.flex2 = rack->rc_gp_timely_inc_cnt; 3167 log.u_bbr.flex2 <<= 4; 3168 log.u_bbr.flex2 |= rack->rc_gp_timely_dec_cnt; 3169 log.u_bbr.flex2 <<= 4; 3170 log.u_bbr.flex2 |= rack->rc_gp_incr; 3171 log.u_bbr.flex2 <<= 4; 3172 log.u_bbr.flex2 |= rack->rc_gp_bwred; 3173 log.u_bbr.flex3 = rack->rc_gp_incr; 3174 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; 3175 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ca; 3176 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_rec; 3177 log.u_bbr.flex7 = rack->rc_gp_bwred; 3178 log.u_bbr.flex8 = method; 3179 log.u_bbr.cur_del_rate = cur_bw; 3180 log.u_bbr.delRate = low_bnd; 3181 log.u_bbr.bw_inuse = up_bnd; 3182 log.u_bbr.rttProp = rack_get_bw(rack); 3183 log.u_bbr.pkt_epoch = line; 3184 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; 3185 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3186 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3187 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; 3188 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; 3189 log.u_bbr.cwnd_gain = rack->rc_dragged_bottom; 3190 log.u_bbr.cwnd_gain <<= 1; 3191 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_rec; 3192 log.u_bbr.cwnd_gain <<= 1; 3193 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; 3194 log.u_bbr.cwnd_gain <<= 1; 3195 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; 3196 log.u_bbr.lost = rack->r_ctl.rc_loss_count; 3197 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3198 &rack->rc_inp->inp_socket->so_rcv, 3199 &rack->rc_inp->inp_socket->so_snd, 3200 TCP_TIMELY_WORK, 0, 3201 0, &log, false, &tv); 3202 } 3203 } 3204 3205 static int 3206 rack_bw_can_be_raised(struct tcp_rack *rack, uint64_t cur_bw, uint64_t last_bw_est, uint16_t mult) 3207 { 3208 /* 3209 * Before we increase we need to know if 3210 * the estimate just made was less than 3211 * our pacing goal (i.e. (cur_bw * mult) > last_bw_est) 3212 * 3213 * If we already are pacing at a fast enough 3214 * rate to push us faster there is no sense of 3215 * increasing. 3216 * 3217 * We first caculate our actual pacing rate (ss or ca multipler 3218 * times our cur_bw). 3219 * 3220 * Then we take the last measured rate and multipy by our 3221 * maximum pacing overage to give us a max allowable rate. 
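 * (That is, act_rate = cur_bw * mult / 100 and max_allow_rate =
 * last_bw_est * (100 + rack_max_per_above) / 100. With made up
 * numbers: cur_bw = 10 Mbps at mult = 120 gives act_rate = 12 Mbps,
 * and last_bw_est = 11 Mbps with the 10% overage gives a cap of
 * 12.1 Mbps, so the raise is still allowed.)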
3222 * 3223 * If our act_rate is smaller than our max_allowable rate 3224 * then we should increase. Else we should hold steady. 3225 * 3226 */ 3227 uint64_t act_rate, max_allow_rate; 3228 3229 if (rack_timely_no_stopping) 3230 return (1); 3231 3232 if ((cur_bw == 0) || (last_bw_est == 0)) { 3233 /* 3234 * Initial startup case or 3235 * everything is acked case. 3236 */ 3237 rack_log_timely(rack, mult, cur_bw, 0, 0, 3238 __LINE__, 9); 3239 return (1); 3240 } 3241 if (mult <= 100) { 3242 /* 3243 * We can always pace at or slightly above our rate. 3244 */ 3245 rack_log_timely(rack, mult, cur_bw, 0, 0, 3246 __LINE__, 9); 3247 return (1); 3248 } 3249 act_rate = cur_bw * (uint64_t)mult; 3250 act_rate /= 100; 3251 max_allow_rate = last_bw_est * ((uint64_t)rack_max_per_above + (uint64_t)100); 3252 max_allow_rate /= 100; 3253 if (act_rate < max_allow_rate) { 3254 /* 3255 * Here the rate we are actually pacing at 3256 * is smaller than 10% above our last measurement. 3257 * This means we are pacing below what we would 3258 * like to try to achieve (plus some wiggle room). 3259 */ 3260 rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate, 3261 __LINE__, 9); 3262 return (1); 3263 } else { 3264 /* 3265 * Here we are already pacing at least rack_max_per_above(10%) 3266 * what we are getting back. This indicates most likely 3267 * that we are being limited (cwnd/rwnd/app) and can't 3268 * get any more b/w. There is no sense of trying to 3269 * raise up the pacing rate its not speeding us up 3270 * and we already are pacing faster than we are getting. 3271 */ 3272 rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate, 3273 __LINE__, 8); 3274 return (0); 3275 } 3276 } 3277 3278 static void 3279 rack_validate_multipliers_at_or_above100(struct tcp_rack *rack) 3280 { 3281 /* 3282 * When we drag bottom, we want to assure 3283 * that no multiplier is below 1.0, if so 3284 * we want to restore it to at least that. 3285 */ 3286 if (rack->r_ctl.rack_per_of_gp_rec < 100) { 3287 /* This is unlikely we usually do not touch recovery */ 3288 rack->r_ctl.rack_per_of_gp_rec = 100; 3289 } 3290 if (rack->r_ctl.rack_per_of_gp_ca < 100) { 3291 rack->r_ctl.rack_per_of_gp_ca = 100; 3292 } 3293 if (rack->r_ctl.rack_per_of_gp_ss < 100) { 3294 rack->r_ctl.rack_per_of_gp_ss = 100; 3295 } 3296 } 3297 3298 static void 3299 rack_validate_multipliers_at_or_below_100(struct tcp_rack *rack) 3300 { 3301 if (rack->r_ctl.rack_per_of_gp_ca > 100) { 3302 rack->r_ctl.rack_per_of_gp_ca = 100; 3303 } 3304 if (rack->r_ctl.rack_per_of_gp_ss > 100) { 3305 rack->r_ctl.rack_per_of_gp_ss = 100; 3306 } 3307 } 3308 3309 static void 3310 rack_increase_bw_mul(struct tcp_rack *rack, int timely_says, uint64_t cur_bw, uint64_t last_bw_est, int override) 3311 { 3312 int32_t calc, logged, plus; 3313 3314 logged = 0; 3315 3316 if (override) { 3317 /* 3318 * override is passed when we are 3319 * loosing b/w and making one last 3320 * gasp at trying to not loose out 3321 * to a new-reno flow. 3322 */ 3323 goto extra_boost; 3324 } 3325 /* In classic timely we boost by 5x if we have 5 increases in a row, lets not */ 3326 if (rack->rc_gp_incr && 3327 ((rack->rc_gp_timely_inc_cnt + 1) >= RACK_TIMELY_CNT_BOOST)) { 3328 /* 3329 * Reset and get 5 strokes more before the boost. Note 3330 * that the count is 0 based so we have to add one. 
3331 */ 3332 extra_boost: 3333 plus = (uint32_t)rack_gp_increase_per * RACK_TIMELY_CNT_BOOST; 3334 rack->rc_gp_timely_inc_cnt = 0; 3335 } else 3336 plus = (uint32_t)rack_gp_increase_per; 3337 /* Must be at least 1% increase for true timely increases */ 3338 if ((plus < 1) && 3339 ((rack->r_ctl.rc_rtt_diff <= 0) || (timely_says <= 0))) 3340 plus = 1; 3341 if (rack->rc_gp_saw_rec && 3342 (rack->rc_gp_no_rec_chg == 0) && 3343 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 3344 rack->r_ctl.rack_per_of_gp_rec)) { 3345 /* We have been in recovery ding it too */ 3346 calc = rack->r_ctl.rack_per_of_gp_rec + plus; 3347 if (calc > 0xffff) 3348 calc = 0xffff; 3349 logged |= 1; 3350 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)calc; 3351 if (rack_per_upper_bound_ss && 3352 (rack->rc_dragged_bottom == 0) && 3353 (rack->r_ctl.rack_per_of_gp_rec > rack_per_upper_bound_ss)) 3354 rack->r_ctl.rack_per_of_gp_rec = rack_per_upper_bound_ss; 3355 } 3356 if (rack->rc_gp_saw_ca && 3357 (rack->rc_gp_saw_ss == 0) && 3358 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 3359 rack->r_ctl.rack_per_of_gp_ca)) { 3360 /* In CA */ 3361 calc = rack->r_ctl.rack_per_of_gp_ca + plus; 3362 if (calc > 0xffff) 3363 calc = 0xffff; 3364 logged |= 2; 3365 rack->r_ctl.rack_per_of_gp_ca = (uint16_t)calc; 3366 if (rack_per_upper_bound_ca && 3367 (rack->rc_dragged_bottom == 0) && 3368 (rack->r_ctl.rack_per_of_gp_ca > rack_per_upper_bound_ca)) 3369 rack->r_ctl.rack_per_of_gp_ca = rack_per_upper_bound_ca; 3370 } 3371 if (rack->rc_gp_saw_ss && 3372 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 3373 rack->r_ctl.rack_per_of_gp_ss)) { 3374 /* In SS */ 3375 calc = rack->r_ctl.rack_per_of_gp_ss + plus; 3376 if (calc > 0xffff) 3377 calc = 0xffff; 3378 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)calc; 3379 if (rack_per_upper_bound_ss && 3380 (rack->rc_dragged_bottom == 0) && 3381 (rack->r_ctl.rack_per_of_gp_ss > rack_per_upper_bound_ss)) 3382 rack->r_ctl.rack_per_of_gp_ss = rack_per_upper_bound_ss; 3383 logged |= 4; 3384 } 3385 if (logged && 3386 (rack->rc_gp_incr == 0)){ 3387 /* Go into increment mode */ 3388 rack->rc_gp_incr = 1; 3389 rack->rc_gp_timely_inc_cnt = 0; 3390 } 3391 if (rack->rc_gp_incr && 3392 logged && 3393 (rack->rc_gp_timely_inc_cnt < RACK_TIMELY_CNT_BOOST)) { 3394 rack->rc_gp_timely_inc_cnt++; 3395 } 3396 rack_log_timely(rack, logged, plus, 0, 0, 3397 __LINE__, 1); 3398 } 3399 3400 static uint32_t 3401 rack_get_decrease(struct tcp_rack *rack, uint32_t curper, int32_t rtt_diff) 3402 { 3403 /* 3404 * norm_grad = rtt_diff / minrtt; 3405 * new_per = curper * (1 - B * norm_grad) 3406 * 3407 * B = rack_gp_decrease_per (default 10%) 3408 * rtt_dif = input var current rtt-diff 3409 * curper = input var current percentage 3410 * minrtt = from rack filter 3411 * 3412 */ 3413 uint64_t perf; 3414 3415 perf = (((uint64_t)curper * ((uint64_t)1000000 - 3416 ((uint64_t)rack_gp_decrease_per * (uint64_t)10000 * 3417 (((uint64_t)rtt_diff * (uint64_t)1000000)/ 3418 (uint64_t)get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)))/ 3419 (uint64_t)1000000)) / 3420 (uint64_t)1000000); 3421 if (perf > curper) { 3422 /* TSNH */ 3423 perf = curper - 1; 3424 } 3425 return ((uint32_t)perf); 3426 } 3427 3428 static uint32_t 3429 rack_decrease_highrtt(struct tcp_rack *rack, uint32_t curper, uint32_t rtt) 3430 { 3431 /* 3432 * highrttthresh 3433 * result = curper * (1 - (B * ( 1 - ------ )) 3434 * gp_srtt 3435 * 3436 * B = rack_gp_decrease_per (default 10%) 3437 * highrttthresh = filter_min * rack_gp_rtt_maxmul 3438 */ 3439 uint64_t perf; 3440 uint32_t 
highrttthresh; 3441 3442 highrttthresh = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; 3443 3444 perf = (((uint64_t)curper * ((uint64_t)1000000 - 3445 ((uint64_t)rack_gp_decrease_per * ((uint64_t)1000000 - 3446 ((uint64_t)highrttthresh * (uint64_t)1000000) / 3447 (uint64_t)rtt)) / 100)) /(uint64_t)1000000); 3448 return (perf); 3449 } 3450 3451 static void 3452 rack_decrease_bw_mul(struct tcp_rack *rack, int timely_says, uint32_t rtt, int32_t rtt_diff) 3453 { 3454 uint64_t logvar, logvar2, logvar3; 3455 uint32_t logged, new_per, ss_red, ca_red, rec_red, alt, val; 3456 3457 if (rack->rc_gp_incr) { 3458 /* Turn off increment counting */ 3459 rack->rc_gp_incr = 0; 3460 rack->rc_gp_timely_inc_cnt = 0; 3461 } 3462 ss_red = ca_red = rec_red = 0; 3463 logged = 0; 3464 /* Calculate the reduction value */ 3465 if (rtt_diff < 0) { 3466 rtt_diff *= -1; 3467 } 3468 /* Must be at least 1% reduction */ 3469 if (rack->rc_gp_saw_rec && (rack->rc_gp_no_rec_chg == 0)) { 3470 /* We have been in recovery ding it too */ 3471 if (timely_says == 2) { 3472 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_rec, rtt); 3473 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 3474 if (alt < new_per) 3475 val = alt; 3476 else 3477 val = new_per; 3478 } else 3479 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 3480 if (rack->r_ctl.rack_per_of_gp_rec > val) { 3481 rec_red = (rack->r_ctl.rack_per_of_gp_rec - val); 3482 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)val; 3483 } else { 3484 rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; 3485 rec_red = 0; 3486 } 3487 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_rec) 3488 rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; 3489 logged |= 1; 3490 } 3491 if (rack->rc_gp_saw_ss) { 3492 /* Sent in SS */ 3493 if (timely_says == 2) { 3494 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ss, rtt); 3495 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 3496 if (alt < new_per) 3497 val = alt; 3498 else 3499 val = new_per; 3500 } else 3501 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff); 3502 if (rack->r_ctl.rack_per_of_gp_ss > new_per) { 3503 ss_red = rack->r_ctl.rack_per_of_gp_ss - val; 3504 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)val; 3505 } else { 3506 ss_red = new_per; 3507 rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; 3508 logvar = new_per; 3509 logvar <<= 32; 3510 logvar |= alt; 3511 logvar2 = (uint32_t)rtt; 3512 logvar2 <<= 32; 3513 logvar2 |= (uint32_t)rtt_diff; 3514 logvar3 = rack_gp_rtt_maxmul; 3515 logvar3 <<= 32; 3516 logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 3517 rack_log_timely(rack, timely_says, 3518 logvar2, logvar3, 3519 logvar, __LINE__, 10); 3520 } 3521 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ss) 3522 rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; 3523 logged |= 4; 3524 } else if (rack->rc_gp_saw_ca) { 3525 /* Sent in CA */ 3526 if (timely_says == 2) { 3527 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ca, rtt); 3528 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 3529 if (alt < new_per) 3530 val = alt; 3531 else 3532 val = new_per; 3533 } else 3534 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff); 3535 if (rack->r_ctl.rack_per_of_gp_ca > val) { 3536 ca_red = rack->r_ctl.rack_per_of_gp_ca - val; 3537 rack->r_ctl.rack_per_of_gp_ca = 
(uint16_t)val; 3538 } else { 3539 rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; 3540 ca_red = 0; 3541 logvar = new_per; 3542 logvar <<= 32; 3543 logvar |= alt; 3544 logvar2 = (uint32_t)rtt; 3545 logvar2 <<= 32; 3546 logvar2 |= (uint32_t)rtt_diff; 3547 logvar3 = rack_gp_rtt_maxmul; 3548 logvar3 <<= 32; 3549 logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 3550 rack_log_timely(rack, timely_says, 3551 logvar2, logvar3, 3552 logvar, __LINE__, 10); 3553 } 3554 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ca) 3555 rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; 3556 logged |= 2; 3557 } 3558 if (rack->rc_gp_timely_dec_cnt < 0x7) { 3559 rack->rc_gp_timely_dec_cnt++; 3560 if (rack_timely_dec_clear && 3561 (rack->rc_gp_timely_dec_cnt == rack_timely_dec_clear)) 3562 rack->rc_gp_timely_dec_cnt = 0; 3563 } 3564 logvar = ss_red; 3565 logvar <<= 32; 3566 logvar |= ca_red; 3567 rack_log_timely(rack, logged, rec_red, rack_per_lower_bound, logvar, 3568 __LINE__, 2); 3569 } 3570 3571 static void 3572 rack_log_rtt_shrinks(struct tcp_rack *rack, uint32_t us_cts, 3573 uint32_t rtt, uint32_t line, uint8_t reas) 3574 { 3575 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 3576 union tcp_log_stackspecific log; 3577 struct timeval tv; 3578 3579 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 3580 log.u_bbr.flex1 = line; 3581 log.u_bbr.flex2 = rack->r_ctl.rc_time_probertt_starts; 3582 log.u_bbr.flex3 = rack->r_ctl.rc_lower_rtt_us_cts; 3583 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; 3584 log.u_bbr.flex5 = rtt; 3585 log.u_bbr.flex6 = rack->rc_highly_buffered; 3586 log.u_bbr.flex6 <<= 1; 3587 log.u_bbr.flex6 |= rack->forced_ack; 3588 log.u_bbr.flex6 <<= 1; 3589 log.u_bbr.flex6 |= rack->rc_gp_dyn_mul; 3590 log.u_bbr.flex6 <<= 1; 3591 log.u_bbr.flex6 |= rack->in_probe_rtt; 3592 log.u_bbr.flex6 <<= 1; 3593 log.u_bbr.flex6 |= rack->measure_saw_probe_rtt; 3594 log.u_bbr.flex7 = rack->r_ctl.rack_per_of_gp_probertt; 3595 log.u_bbr.pacing_gain = rack->r_ctl.rack_per_of_gp_ca; 3596 log.u_bbr.cwnd_gain = rack->r_ctl.rack_per_of_gp_rec; 3597 log.u_bbr.flex8 = reas; 3598 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 3599 log.u_bbr.delRate = rack_get_bw(rack); 3600 log.u_bbr.cur_del_rate = rack->r_ctl.rc_highest_us_rtt; 3601 log.u_bbr.cur_del_rate <<= 32; 3602 log.u_bbr.cur_del_rate |= rack->r_ctl.rc_lowest_us_rtt; 3603 log.u_bbr.applimited = rack->r_ctl.rc_time_probertt_entered; 3604 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; 3605 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 3606 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; 3607 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; 3608 log.u_bbr.pkt_epoch = rack->r_ctl.rc_lower_rtt_us_cts; 3609 log.u_bbr.delivered = rack->r_ctl.rc_target_probertt_flight; 3610 log.u_bbr.lost = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 3611 log.u_bbr.rttProp = us_cts; 3612 log.u_bbr.rttProp <<= 32; 3613 log.u_bbr.rttProp |= rack->r_ctl.rc_entry_gp_rtt; 3614 TCP_LOG_EVENTP(rack->rc_tp, NULL, 3615 &rack->rc_inp->inp_socket->so_rcv, 3616 &rack->rc_inp->inp_socket->so_snd, 3617 BBR_LOG_RTT_SHRINKS, 0, 3618 0, &log, false, &rack->r_ctl.act_rcv_time); 3619 } 3620 } 3621 3622 static void 3623 rack_set_prtt_target(struct tcp_rack *rack, uint32_t segsiz, uint32_t rtt) 3624 { 3625 uint64_t bwdp; 3626 3627 bwdp = rack_get_bw(rack); 3628 bwdp *= (uint64_t)rtt; 3629 bwdp /= (uint64_t)HPTS_USEC_IN_SEC; 3630 rack->r_ctl.rc_target_probertt_flight = roundup((uint32_t)bwdp, segsiz); 3631 if (rack->r_ctl.rc_target_probertt_flight < (segsiz * 
rack_timely_min_segs)) { 3632 /* 3633 * A window protocol must be able to have 4 packets 3634 * outstanding as the floor in order to function 3635 * (especially considering delayed ack :D). 3636 */ 3637 rack->r_ctl.rc_target_probertt_flight = (segsiz * rack_timely_min_segs); 3638 } 3639 } 3640 3641 static void 3642 rack_enter_probertt(struct tcp_rack *rack, uint32_t us_cts) 3643 { 3644 /** 3645 * ProbeRTT is a bit different in rack_pacing than in 3646 * BBR. It is like BBR in that it uses the lowering of 3647 * the RTT as a signal that we saw something new and 3648 * counts from there for how long between. But it is 3649 * different in that its quite simple. It does not 3650 * play with the cwnd and wait until we get down 3651 * to N segments outstanding and hold that for 3652 * 200ms. Instead it just sets the pacing reduction 3653 * rate to a set percentage (70 by default) and hold 3654 * that for a number of recent GP Srtt's. 3655 */ 3656 uint32_t segsiz; 3657 3658 if (rack->rc_gp_dyn_mul == 0) 3659 return; 3660 3661 if (rack->rc_tp->snd_max == rack->rc_tp->snd_una) { 3662 /* We are idle */ 3663 return; 3664 } 3665 if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && 3666 SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { 3667 /* 3668 * Stop the goodput now, the idea here is 3669 * that future measurements with in_probe_rtt 3670 * won't register if they are not greater so 3671 * we want to get what info (if any) is available 3672 * now. 3673 */ 3674 rack_do_goodput_measurement(rack->rc_tp, rack, 3675 rack->rc_tp->snd_una, __LINE__); 3676 } 3677 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 3678 rack->r_ctl.rc_time_probertt_entered = us_cts; 3679 segsiz = min(ctf_fixed_maxseg(rack->rc_tp), 3680 rack->r_ctl.rc_pace_min_segs); 3681 rack->in_probe_rtt = 1; 3682 rack->measure_saw_probe_rtt = 1; 3683 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 3684 rack->r_ctl.rc_time_probertt_starts = 0; 3685 rack->r_ctl.rc_entry_gp_rtt = rack->r_ctl.rc_gp_srtt; 3686 if (rack_probertt_use_min_rtt_entry) 3687 rack_set_prtt_target(rack, segsiz, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); 3688 else 3689 rack_set_prtt_target(rack, segsiz, rack->r_ctl.rc_gp_srtt); 3690 rack_log_rtt_shrinks(rack, us_cts, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3691 __LINE__, RACK_RTTS_ENTERPROBE); 3692 } 3693 3694 static void 3695 rack_exit_probertt(struct tcp_rack *rack, uint32_t us_cts) 3696 { 3697 struct rack_sendmap *rsm; 3698 uint32_t segsiz; 3699 3700 segsiz = min(ctf_fixed_maxseg(rack->rc_tp), 3701 rack->r_ctl.rc_pace_min_segs); 3702 rack->in_probe_rtt = 0; 3703 if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && 3704 SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { 3705 /* 3706 * Stop the goodput now, the idea here is 3707 * that future measurements with in_probe_rtt 3708 * won't register if they are not greater so 3709 * we want to get what info (if any) is available 3710 * now. 3711 */ 3712 rack_do_goodput_measurement(rack->rc_tp, rack, 3713 rack->rc_tp->snd_una, __LINE__); 3714 } else if (rack->rc_tp->t_flags & TF_GPUTINPROG) { 3715 /* 3716 * We don't have enough data to make a measurement. 3717 * So lets just stop and start here after exiting 3718 * probe-rtt. We probably are not interested in 3719 * the results anyway. 3720 */ 3721 rack->rc_tp->t_flags &= ~TF_GPUTINPROG; 3722 } 3723 /* 3724 * Measurements through the current snd_max are going 3725 * to be limited by the slower pacing rate. 3726 * 3727 * We need to mark these as app-limited so we 3728 * don't collapse the b/w. 
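 * The marking below chains the newest rsm onto the app limited list:
 * the current rc_end_appl (if any) gets its r_nseq_appl pointed at
 * this rsm's r_start and rc_end_appl is advanced to it, so later
 * code can walk the chain starting from rc_first_appl.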
3729 */ 3730 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 3731 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { 3732 if (rack->r_ctl.rc_app_limited_cnt == 0) 3733 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; 3734 else { 3735 /* 3736 * Go out to the end app limited and mark 3737 * this new one as next and move the end_appl up 3738 * to this guy. 3739 */ 3740 if (rack->r_ctl.rc_end_appl) 3741 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; 3742 rack->r_ctl.rc_end_appl = rsm; 3743 } 3744 rsm->r_flags |= RACK_APP_LIMITED; 3745 rack->r_ctl.rc_app_limited_cnt++; 3746 } 3747 /* 3748 * Now, we need to examine our pacing rate multipliers. 3749 * If its under 100%, we need to kick it back up to 3750 * 100%. We also don't let it be over our "max" above 3751 * the actual rate i.e. 100% + rack_clamp_atexit_prtt. 3752 * Note setting clamp_atexit_prtt to 0 has the effect 3753 * of setting CA/SS to 100% always at exit (which is 3754 * the default behavior). 3755 */ 3756 if (rack_probertt_clear_is) { 3757 rack->rc_gp_incr = 0; 3758 rack->rc_gp_bwred = 0; 3759 rack->rc_gp_timely_inc_cnt = 0; 3760 rack->rc_gp_timely_dec_cnt = 0; 3761 } 3762 /* Do we do any clamping at exit? */ 3763 if (rack->rc_highly_buffered && rack_atexit_prtt_hbp) { 3764 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt_hbp; 3765 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt_hbp; 3766 } 3767 if ((rack->rc_highly_buffered == 0) && rack_atexit_prtt) { 3768 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt; 3769 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt; 3770 } 3771 /* 3772 * Lets set rtt_diff to 0, so that we will get a "boost" 3773 * after exiting. 3774 */ 3775 rack->r_ctl.rc_rtt_diff = 0; 3776 3777 /* Clear all flags so we start fresh */ 3778 rack->rc_tp->t_bytes_acked = 0; 3779 rack->rc_tp->ccv->flags &= ~CCF_ABC_SENTAWND; 3780 /* 3781 * If configured to, set the cwnd and ssthresh to 3782 * our targets. 3783 */ 3784 if (rack_probe_rtt_sets_cwnd) { 3785 uint64_t ebdp; 3786 uint32_t setto; 3787 3788 /* Set ssthresh so we get into CA once we hit our target */ 3789 if (rack_probertt_use_min_rtt_exit == 1) { 3790 /* Set to min rtt */ 3791 rack_set_prtt_target(rack, segsiz, 3792 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); 3793 } else if (rack_probertt_use_min_rtt_exit == 2) { 3794 /* Set to current gp rtt */ 3795 rack_set_prtt_target(rack, segsiz, 3796 rack->r_ctl.rc_gp_srtt); 3797 } else if (rack_probertt_use_min_rtt_exit == 3) { 3798 /* Set to entry gp rtt */ 3799 rack_set_prtt_target(rack, segsiz, 3800 rack->r_ctl.rc_entry_gp_rtt); 3801 } else { 3802 uint64_t sum; 3803 uint32_t setval; 3804 3805 sum = rack->r_ctl.rc_entry_gp_rtt; 3806 sum *= 10; 3807 sum /= (uint64_t)(max(1, rack->r_ctl.rc_gp_srtt)); 3808 if (sum >= 20) { 3809 /* 3810 * A highly buffered path needs 3811 * cwnd space for timely to work. 3812 * Lets set things up as if 3813 * we are heading back here again. 3814 */ 3815 setval = rack->r_ctl.rc_entry_gp_rtt; 3816 } else if (sum >= 15) { 3817 /* 3818 * Lets take the smaller of the 3819 * two since we are just somewhat 3820 * buffered. 3821 */ 3822 setval = rack->r_ctl.rc_gp_srtt; 3823 if (setval > rack->r_ctl.rc_entry_gp_rtt) 3824 setval = rack->r_ctl.rc_entry_gp_rtt; 3825 } else { 3826 /* 3827 * Here we are not highly buffered 3828 * and should pick the min we can to 3829 * keep from causing loss. 
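 * (The ratio above is sum = 10 * rc_entry_gp_rtt / rc_gp_srtt; for
 * example an entry rtt of 40ms against a current gp_srtt of 20ms
 * gives 20 and is treated as highly buffered, 30ms against 20ms
 * gives 15 and takes the smaller of the two rtts, and anything
 * below that lands here and uses the filtered min rtt.)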
3830 */ 3831 setval = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 3832 } 3833 rack_set_prtt_target(rack, segsiz, 3834 setval); 3835 } 3836 if (rack_probe_rtt_sets_cwnd > 1) { 3837 /* There is a percentage here to boost */ 3838 ebdp = rack->r_ctl.rc_target_probertt_flight; 3839 ebdp *= rack_probe_rtt_sets_cwnd; 3840 ebdp /= 100; 3841 setto = rack->r_ctl.rc_target_probertt_flight + ebdp; 3842 } else 3843 setto = rack->r_ctl.rc_target_probertt_flight; 3844 rack->rc_tp->snd_cwnd = roundup(setto, segsiz); 3845 if (rack->rc_tp->snd_cwnd < (segsiz * rack_timely_min_segs)) { 3846 /* Enforce a min */ 3847 rack->rc_tp->snd_cwnd = segsiz * rack_timely_min_segs; 3848 } 3849 /* If we set in the cwnd also set the ssthresh point so we are in CA */ 3850 rack->rc_tp->snd_ssthresh = (rack->rc_tp->snd_cwnd - 1); 3851 } 3852 rack_log_rtt_shrinks(rack, us_cts, 3853 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3854 __LINE__, RACK_RTTS_EXITPROBE); 3855 /* Clear times last so log has all the info */ 3856 rack->r_ctl.rc_probertt_sndmax_atexit = rack->rc_tp->snd_max; 3857 rack->r_ctl.rc_time_probertt_entered = us_cts; 3858 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 3859 rack->r_ctl.rc_time_of_last_probertt = us_cts; 3860 } 3861 3862 static void 3863 rack_check_probe_rtt(struct tcp_rack *rack, uint32_t us_cts) 3864 { 3865 /* Check in on probe-rtt */ 3866 if (rack->rc_gp_filled == 0) { 3867 /* We do not do p-rtt unless we have gp measurements */ 3868 return; 3869 } 3870 if (rack->in_probe_rtt) { 3871 uint64_t no_overflow; 3872 uint32_t endtime, must_stay; 3873 3874 if (rack->r_ctl.rc_went_idle_time && 3875 ((us_cts - rack->r_ctl.rc_went_idle_time) > rack_min_probertt_hold)) { 3876 /* 3877 * We went idle during prtt, just exit now. 3878 */ 3879 rack_exit_probertt(rack, us_cts); 3880 } else if (rack_probe_rtt_safety_val && 3881 TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered) && 3882 ((us_cts - rack->r_ctl.rc_time_probertt_entered) > rack_probe_rtt_safety_val)) { 3883 /* 3884 * Probe RTT safety value triggered! 3885 */ 3886 rack_log_rtt_shrinks(rack, us_cts, 3887 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3888 __LINE__, RACK_RTTS_SAFETY); 3889 rack_exit_probertt(rack, us_cts); 3890 } 3891 /* Calculate the max we will wait */ 3892 endtime = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_max_drain_wait); 3893 if (rack->rc_highly_buffered) 3894 endtime += (rack->r_ctl.rc_gp_srtt * rack_max_drain_hbp); 3895 /* Calculate the min we must wait */ 3896 must_stay = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_must_drain); 3897 if ((ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.rc_target_probertt_flight) && 3898 TSTMP_LT(us_cts, endtime)) { 3899 uint32_t calc; 3900 /* Do we lower more? 
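 * A sketch of the step below, with made up numbers: after two
 * gp_srtt's in probe-rtt calc is 2, so the pacing percentage becomes
 * rack_per_of_gp_probertt - (2 * rack_per_of_gp_probertt_reduce),
 * and it is never allowed below rack_per_of_gp_lowthresh.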
*/ 3901 no_exit: 3902 if (TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered)) 3903 calc = us_cts - rack->r_ctl.rc_time_probertt_entered; 3904 else 3905 calc = 0; 3906 calc /= max(rack->r_ctl.rc_gp_srtt, 1); 3907 if (calc) { 3908 /* Maybe */ 3909 calc *= rack_per_of_gp_probertt_reduce; 3910 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc; 3911 /* Limit it too */ 3912 if (rack->r_ctl.rack_per_of_gp_probertt < rack_per_of_gp_lowthresh) 3913 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh; 3914 } 3915 /* We must reach target or the time set */ 3916 return; 3917 } 3918 if (rack->r_ctl.rc_time_probertt_starts == 0) { 3919 if ((TSTMP_LT(us_cts, must_stay) && 3920 rack->rc_highly_buffered) || 3921 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > 3922 rack->r_ctl.rc_target_probertt_flight)) { 3923 /* We are not past the must_stay time */ 3924 goto no_exit; 3925 } 3926 rack_log_rtt_shrinks(rack, us_cts, 3927 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3928 __LINE__, RACK_RTTS_REACHTARGET); 3929 rack->r_ctl.rc_time_probertt_starts = us_cts; 3930 if (rack->r_ctl.rc_time_probertt_starts == 0) 3931 rack->r_ctl.rc_time_probertt_starts = 1; 3932 /* Restore back to our rate we want to pace at in prtt */ 3933 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 3934 } 3935 /* 3936 * Setup our end time, some number of gp_srtts plus 200ms. 3937 */ 3938 no_overflow = ((uint64_t)rack->r_ctl.rc_gp_srtt * 3939 (uint64_t)rack_probertt_gpsrtt_cnt_mul); 3940 if (rack_probertt_gpsrtt_cnt_div) 3941 endtime = (uint32_t)(no_overflow / (uint64_t)rack_probertt_gpsrtt_cnt_div); 3942 else 3943 endtime = 0; 3944 endtime += rack_min_probertt_hold; 3945 endtime += rack->r_ctl.rc_time_probertt_starts; 3946 if (TSTMP_GEQ(us_cts, endtime)) { 3947 /* yes, exit probertt */ 3948 rack_exit_probertt(rack, us_cts); 3949 } 3950 3951 } else if ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt) { 3952 /* Go into probertt, its been too long since we went lower */ 3953 rack_enter_probertt(rack, us_cts); 3954 } 3955 } 3956 3957 static void 3958 rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last_bw_est, 3959 uint32_t rtt, int32_t rtt_diff) 3960 { 3961 uint64_t cur_bw, up_bnd, low_bnd, subfr; 3962 uint32_t losses; 3963 3964 if ((rack->rc_gp_dyn_mul == 0) || 3965 (rack->use_fixed_rate) || 3966 (rack->in_probe_rtt) || 3967 (rack->rc_always_pace == 0)) { 3968 /* No dynamic GP multipler in play */ 3969 return; 3970 } 3971 losses = rack->r_ctl.rc_loss_count - rack->r_ctl.rc_loss_at_start; 3972 cur_bw = rack_get_bw(rack); 3973 /* Calculate our up and down range */ 3974 up_bnd = rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_up; 3975 up_bnd /= 100; 3976 up_bnd += rack->r_ctl.last_gp_comp_bw; 3977 3978 subfr = (uint64_t)rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_down; 3979 subfr /= 100; 3980 low_bnd = rack->r_ctl.last_gp_comp_bw - subfr; 3981 if ((timely_says == 2) && (rack->r_ctl.rc_no_push_at_mrtt)) { 3982 /* 3983 * This is the case where our RTT is above 3984 * the max target and we have been configured 3985 * to just do timely no bonus up stuff in that case. 3986 * 3987 * There are two configurations, set to 1, and we 3988 * just do timely if we are over our max. If its 3989 * set above 1 then we slam the multipliers down 3990 * to 100 and then decrement per timely. 
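 * (e.g. a value of 2 first clamps the SS/CA multipliers to at most
 * 100% via rack_validate_multipliers_at_or_below_100() and then
 * applies the normal timely decrease, whereas a value of 1 skips
 * the clamp and only does the decrease.)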
3991 */ 3992 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 3993 __LINE__, 3); 3994 if (rack->r_ctl.rc_no_push_at_mrtt > 1) 3995 rack_validate_multipliers_at_or_below_100(rack); 3996 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); 3997 } else if ((last_bw_est < low_bnd) && !losses) { 3998 /* 3999 * We are decreasing this is a bit complicated this 4000 * means we are loosing ground. This could be 4001 * because another flow entered and we are competing 4002 * for b/w with it. This will push the RTT up which 4003 * makes timely unusable unless we want to get shoved 4004 * into a corner and just be backed off (the age 4005 * old problem with delay based CC). 4006 * 4007 * On the other hand if it was a route change we 4008 * would like to stay somewhat contained and not 4009 * blow out the buffers. 4010 */ 4011 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 4012 __LINE__, 3); 4013 rack->r_ctl.last_gp_comp_bw = cur_bw; 4014 if (rack->rc_gp_bwred == 0) { 4015 /* Go into reduction counting */ 4016 rack->rc_gp_bwred = 1; 4017 rack->rc_gp_timely_dec_cnt = 0; 4018 } 4019 if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) || 4020 (timely_says == 0)) { 4021 /* 4022 * Push another time with a faster pacing 4023 * to try to gain back (we include override to 4024 * get a full raise factor). 4025 */ 4026 if ((rack->rc_gp_saw_ca && rack->r_ctl.rack_per_of_gp_ca <= rack_down_raise_thresh) || 4027 (rack->rc_gp_saw_ss && rack->r_ctl.rack_per_of_gp_ss <= rack_down_raise_thresh) || 4028 (timely_says == 0) || 4029 (rack_down_raise_thresh == 0)) { 4030 /* 4031 * Do an override up in b/w if we were 4032 * below the threshold or if the threshold 4033 * is zero we always do the raise. 4034 */ 4035 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 1); 4036 } else { 4037 /* Log it stays the same */ 4038 rack_log_timely(rack, 0, last_bw_est, low_bnd, 0, 4039 __LINE__, 11); 4040 } 4041 rack->rc_gp_timely_dec_cnt++; 4042 /* We are not incrementing really no-count */ 4043 rack->rc_gp_incr = 0; 4044 rack->rc_gp_timely_inc_cnt = 0; 4045 } else { 4046 /* 4047 * Lets just use the RTT 4048 * information and give up 4049 * pushing. 4050 */ 4051 goto use_timely; 4052 } 4053 } else if ((timely_says != 2) && 4054 !losses && 4055 (last_bw_est > up_bnd)) { 4056 /* 4057 * We are increasing b/w lets keep going, updating 4058 * our b/w and ignoring any timely input, unless 4059 * of course we are at our max raise (if there is one). 4060 */ 4061 4062 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 4063 __LINE__, 3); 4064 rack->r_ctl.last_gp_comp_bw = cur_bw; 4065 if (rack->rc_gp_saw_ss && 4066 rack_per_upper_bound_ss && 4067 (rack->r_ctl.rack_per_of_gp_ss == rack_per_upper_bound_ss)) { 4068 /* 4069 * In cases where we can't go higher 4070 * we should just use timely. 4071 */ 4072 goto use_timely; 4073 } 4074 if (rack->rc_gp_saw_ca && 4075 rack_per_upper_bound_ca && 4076 (rack->r_ctl.rack_per_of_gp_ca == rack_per_upper_bound_ca)) { 4077 /* 4078 * In cases where we can't go higher 4079 * we should just use timely. 
4080 */ 4081 goto use_timely; 4082 } 4083 rack->rc_gp_bwred = 0; 4084 rack->rc_gp_timely_dec_cnt = 0; 4085 /* You get a set number of pushes if timely is trying to reduce */ 4086 if ((rack->rc_gp_incr < rack_timely_max_push_rise) || (timely_says == 0)) { 4087 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 4088 } else { 4089 /* Log it stays the same */ 4090 rack_log_timely(rack, 0, last_bw_est, up_bnd, 0, 4091 __LINE__, 12); 4092 } 4093 return; 4094 } else { 4095 /* 4096 * We are staying between the lower and upper range bounds 4097 * so use timely to decide. 4098 */ 4099 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 4100 __LINE__, 3); 4101 use_timely: 4102 if (timely_says) { 4103 rack->rc_gp_incr = 0; 4104 rack->rc_gp_timely_inc_cnt = 0; 4105 if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) && 4106 !losses && 4107 (last_bw_est < low_bnd)) { 4108 /* We are loosing ground */ 4109 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 4110 rack->rc_gp_timely_dec_cnt++; 4111 /* We are not incrementing really no-count */ 4112 rack->rc_gp_incr = 0; 4113 rack->rc_gp_timely_inc_cnt = 0; 4114 } else 4115 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); 4116 } else { 4117 rack->rc_gp_bwred = 0; 4118 rack->rc_gp_timely_dec_cnt = 0; 4119 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); 4120 } 4121 } 4122 } 4123 4124 static int32_t 4125 rack_make_timely_judgement(struct tcp_rack *rack, uint32_t rtt, int32_t rtt_diff, uint32_t prev_rtt) 4126 { 4127 int32_t timely_says; 4128 uint64_t log_mult, log_rtt_a_diff; 4129 4130 log_rtt_a_diff = rtt; 4131 log_rtt_a_diff <<= 32; 4132 log_rtt_a_diff |= (uint32_t)rtt_diff; 4133 if (rtt >= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * 4134 rack_gp_rtt_maxmul)) { 4135 /* Reduce the b/w multipler */ 4136 timely_says = 2; 4137 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; 4138 log_mult <<= 32; 4139 log_mult |= prev_rtt; 4140 rack_log_timely(rack, timely_says, log_mult, 4141 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4142 log_rtt_a_diff, __LINE__, 4); 4143 } else if (rtt <= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) + 4144 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) / 4145 max(rack_gp_rtt_mindiv , 1)))) { 4146 /* Increase the b/w multipler */ 4147 log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) + 4148 ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) / 4149 max(rack_gp_rtt_mindiv , 1)); 4150 log_mult <<= 32; 4151 log_mult |= prev_rtt; 4152 timely_says = 0; 4153 rack_log_timely(rack, timely_says, log_mult , 4154 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 4155 log_rtt_a_diff, __LINE__, 5); 4156 } else { 4157 /* 4158 * Use a gradient to find it the timely gradient 4159 * is: 4160 * grad = rc_rtt_diff / min_rtt; 4161 * 4162 * anything below or equal to 0 will be 4163 * a increase indication. Anything above 4164 * zero is a decrease. Note we take care 4165 * of the actual gradient calculation 4166 * in the reduction (its not needed for 4167 * increase). 
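 * For example (illustrative values): with a filtered min_rtt of
 * 20ms, an rtt_diff of +2ms is a positive gradient and yields the
 * "reduce" verdict (1), while an rtt_diff of 0 or below yields the
 * "increase" verdict (0); the hard "reduce" verdict (2) was already
 * handled above when rtt exceeded min_rtt * rack_gp_rtt_maxmul.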
4168 */ 4169 log_mult = prev_rtt; 4170 if (rtt_diff <= 0) { 4171 /* 4172 * Rttdiff is less than zero, increase the 4173 * b/w multipler (its 0 or negative) 4174 */ 4175 timely_says = 0; 4176 rack_log_timely(rack, timely_says, log_mult, 4177 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 6); 4178 } else { 4179 /* Reduce the b/w multipler */ 4180 timely_says = 1; 4181 rack_log_timely(rack, timely_says, log_mult, 4182 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 7); 4183 } 4184 } 4185 return (timely_says); 4186 } 4187 4188 static void 4189 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, 4190 tcp_seq th_ack, int line) 4191 { 4192 uint64_t tim, bytes_ps, ltim, stim, utim; 4193 uint32_t segsiz, bytes, reqbytes, us_cts; 4194 int32_t gput, new_rtt_diff, timely_says; 4195 uint64_t resid_bw, subpart = 0, addpart = 0, srtt; 4196 int did_add = 0; 4197 4198 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 4199 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 4200 if (TSTMP_GEQ(us_cts, tp->gput_ts)) 4201 tim = us_cts - tp->gput_ts; 4202 else 4203 tim = 0; 4204 4205 if (rack->r_ctl.rc_gp_cumack_ts > rack->r_ctl.rc_gp_output_ts) 4206 stim = rack->r_ctl.rc_gp_cumack_ts - rack->r_ctl.rc_gp_output_ts; 4207 else 4208 stim = 0; 4209 /* 4210 * Use the larger of the send time or ack time. This prevents us 4211 * from being influenced by ack artifacts to come up with too 4212 * high of measurement. Note that since we are spanning over many more 4213 * bytes in most of our measurements hopefully that is less likely to 4214 * occur. 4215 */ 4216 if (tim > stim) 4217 utim = max(tim, 1); 4218 else 4219 utim = max(stim, 1); 4220 /* Lets get a msec time ltim too for the old stuff */ 4221 ltim = max(1, (utim / HPTS_USEC_IN_MSEC)); 4222 gput = (((uint64_t) (th_ack - tp->gput_seq)) << 3) / ltim; 4223 reqbytes = min(rc_init_window(rack), (MIN_GP_WIN * segsiz)); 4224 if ((tim == 0) && (stim == 0)) { 4225 /* 4226 * Invalid measurement time, maybe 4227 * all on one ack/one send? 4228 */ 4229 bytes = 0; 4230 bytes_ps = 0; 4231 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4232 0, 0, 0, 10, __LINE__, NULL); 4233 goto skip_measurement; 4234 } 4235 if (rack->r_ctl.rc_gp_lowrtt == 0xffffffff) { 4236 /* We never made a us_rtt measurement? */ 4237 bytes = 0; 4238 bytes_ps = 0; 4239 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4240 0, 0, 0, 10, __LINE__, NULL); 4241 goto skip_measurement; 4242 } 4243 /* 4244 * Calculate the maximum possible b/w this connection 4245 * could have. We base our calculation on the lowest 4246 * rtt we have seen during the measurement and the 4247 * largest rwnd the client has given us in that time. This 4248 * forms a BDP that is the maximum that we could ever 4249 * get to the client. Anything larger is not valid. 4250 * 4251 * I originally had code here that rejected measurements 4252 * where the time was less than 1/2 the latest us_rtt. 4253 * But after thinking on that I realized its wrong since 4254 * say you had a 150Mbps or even 1Gbps link, and you 4255 * were a long way away.. example I am in Europe (100ms rtt) 4256 * talking to my 1Gbps link in S.C. Now measuring say 150,000 4257 * bytes my time would be 1.2ms, and yet my rtt would say 4258 * the measurement was invalid the time was < 50ms. The 4259 * same thing is true for 150Mb (8ms of time). 4260 * 4261 * A better way I realized is to look at what the maximum 4262 * the connection could possibly do. 
This is gated on 4263 * the lowest RTT we have seen and the highest rwnd. 4264 * We should in theory never exceed that, if we are 4265 * then something on the path is storing up packets 4266 * and then feeding them all at once to our endpoint 4267 * messing up our measurement. 4268 */ 4269 rack->r_ctl.last_max_bw = rack->r_ctl.rc_gp_high_rwnd; 4270 rack->r_ctl.last_max_bw *= HPTS_USEC_IN_SEC; 4271 rack->r_ctl.last_max_bw /= rack->r_ctl.rc_gp_lowrtt; 4272 if (SEQ_LT(th_ack, tp->gput_seq)) { 4273 /* No measurement can be made */ 4274 bytes = 0; 4275 bytes_ps = 0; 4276 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4277 0, 0, 0, 10, __LINE__, NULL); 4278 goto skip_measurement; 4279 } else 4280 bytes = (th_ack - tp->gput_seq); 4281 bytes_ps = (uint64_t)bytes; 4282 /* 4283 * Don't measure a b/w for pacing unless we have gotten at least 4284 * an initial windows worth of data in this measurement interval. 4285 * 4286 * Small numbers of bytes get badly influenced by delayed ack and 4287 * other artifacts. Note we take the initial window or our 4288 * defined minimum GP (defaulting to 10 which hopefully is the 4289 * IW). 4290 */ 4291 if (rack->rc_gp_filled == 0) { 4292 /* 4293 * The initial estimate is special. We 4294 * have blasted out an IW worth of packets 4295 * without a real valid ack ts results. We 4296 * then setup the app_limited_needs_set flag, 4297 * this should get the first ack in (probably 2 4298 * MSS worth) to be recorded as the timestamp. 4299 * We thus allow a smaller number of bytes i.e. 4300 * IW - 2MSS. 4301 */ 4302 reqbytes -= (2 * segsiz); 4303 /* Also lets fill previous for our first measurement to be neutral */ 4304 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; 4305 } 4306 if ((bytes_ps < reqbytes) || rack->app_limited_needs_set) { 4307 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4308 rack->r_ctl.rc_app_limited_cnt, 4309 0, 0, 10, __LINE__, NULL); 4310 goto skip_measurement; 4311 } 4312 /* 4313 * We now need to calculate the Timely like status so 4314 * we can update (possibly) the b/w multipliers. 4315 */ 4316 new_rtt_diff = (int32_t)rack->r_ctl.rc_gp_srtt - (int32_t)rack->r_ctl.rc_prev_gp_srtt; 4317 if (rack->rc_gp_filled == 0) { 4318 /* No previous reading */ 4319 rack->r_ctl.rc_rtt_diff = new_rtt_diff; 4320 } else { 4321 if (rack->measure_saw_probe_rtt == 0) { 4322 /* 4323 * We don't want a probertt to be counted 4324 * since it will be negative incorrectly. We 4325 * expect to be reducing the RTT when we 4326 * pace at a slower rate. 4327 */ 4328 rack->r_ctl.rc_rtt_diff -= (rack->r_ctl.rc_rtt_diff / 8); 4329 rack->r_ctl.rc_rtt_diff += (new_rtt_diff / 8); 4330 } 4331 } 4332 timely_says = rack_make_timely_judgement(rack, 4333 rack->r_ctl.rc_gp_srtt, 4334 rack->r_ctl.rc_rtt_diff, 4335 rack->r_ctl.rc_prev_gp_srtt 4336 ); 4337 bytes_ps *= HPTS_USEC_IN_SEC; 4338 bytes_ps /= utim; 4339 if (bytes_ps > rack->r_ctl.last_max_bw) { 4340 /* 4341 * Something is on path playing 4342 * since this b/w is not possible based 4343 * on our BDP (highest rwnd and lowest rtt 4344 * we saw in the measurement window). 4345 * 4346 * Another option here would be to 4347 * instead skip the measurement. 
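 * To make the cap concrete (made up numbers): a largest rwnd of
 * 1,000,000 bytes seen together with a lowest rtt of 10,000 usecs
 * gives last_max_bw = 1,000,000 * 1,000,000 / 10,000 = 100,000,000
 * bytes/sec, and any bytes_ps above that gets clamped to it below.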
4348 */ 4349 rack_log_pacing_delay_calc(rack, bytes, reqbytes, 4350 bytes_ps, rack->r_ctl.last_max_bw, 0, 4351 11, __LINE__, NULL); 4352 bytes_ps = rack->r_ctl.last_max_bw; 4353 } 4354 /* We store gp for b/w in bytes per second */ 4355 if (rack->rc_gp_filled == 0) { 4356 /* Initial measurment */ 4357 if (bytes_ps) { 4358 rack->r_ctl.gp_bw = bytes_ps; 4359 rack->rc_gp_filled = 1; 4360 rack->r_ctl.num_measurements = 1; 4361 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 4362 } else { 4363 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 4364 rack->r_ctl.rc_app_limited_cnt, 4365 0, 0, 10, __LINE__, NULL); 4366 } 4367 if (rack->rc_inp->inp_in_hpts && 4368 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 4369 /* 4370 * Ok we can't trust the pacer in this case 4371 * where we transition from un-paced to paced. 4372 * Or for that matter when the burst mitigation 4373 * was making a wild guess and got it wrong. 4374 * Stop the pacer and clear up all the aggregate 4375 * delays etc. 4376 */ 4377 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 4378 rack->r_ctl.rc_hpts_flags = 0; 4379 rack->r_ctl.rc_last_output_to = 0; 4380 } 4381 did_add = 2; 4382 } else if (rack->r_ctl.num_measurements < RACK_REQ_AVG) { 4383 /* Still a small number run an average */ 4384 rack->r_ctl.gp_bw += bytes_ps; 4385 addpart = rack->r_ctl.num_measurements; 4386 rack->r_ctl.num_measurements++; 4387 if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) { 4388 /* We have collected enought to move forward */ 4389 rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_measurements; 4390 } 4391 did_add = 3; 4392 } else { 4393 /* 4394 * We want to take 1/wma of the goodput and add in to 7/8th 4395 * of the old value weighted by the srtt. So if your measurement 4396 * period is say 2 SRTT's long you would get 1/4 as the 4397 * value, if it was like 1/2 SRTT then you would get 1/16th. 4398 * 4399 * But we must be careful not to take too much i.e. if the 4400 * srtt is say 20ms and the measurement is taken over 4401 * 400ms our weight would be 400/20 i.e. 20. On the 4402 * other hand if we get a measurement over 1ms with a 4403 * 10ms rtt we only want to take a much smaller portion. 4404 */ 4405 if (rack->r_ctl.num_measurements < 0xff) { 4406 rack->r_ctl.num_measurements++; 4407 } 4408 srtt = (uint64_t)tp->t_srtt; 4409 if (srtt == 0) { 4410 /* 4411 * Strange why did t_srtt go back to zero? 4412 */ 4413 if (rack->r_ctl.rc_rack_min_rtt) 4414 srtt = rack->r_ctl.rc_rack_min_rtt; 4415 else 4416 srtt = HPTS_USEC_IN_MSEC; 4417 } 4418 /* 4419 * XXXrrs: Note for reviewers, in playing with 4420 * dynamic pacing I discovered this GP calculation 4421 * as done originally leads to some undesired results. 4422 * Basically you can get longer measurements contributing 4423 * too much to the WMA. Thus I changed it if you are doing 4424 * dynamic adjustments to only do the aportioned adjustment 4425 * if we have a very small (time wise) measurement. Longer 4426 * measurements just get there weight (defaulting to 1/8) 4427 * add to the WMA. We may want to think about changing 4428 * this to always do that for both sides i.e. dynamic 4429 * and non-dynamic... but considering lots of folks 4430 * were playing with this I did not want to change the 4431 * calculation per.se. without your thoughts.. Lawerence? 4432 * Peter?? 
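 * As a concrete illustration of the dynamic branch below (numbers
 * made up, assuming the default 1/8 weight i.e. rack_wma_divisor of
 * 8): a measurement spanning half an srtt scales both subpart and
 * addpart by utim / (srtt * 8), roughly 1/16th of the old and new
 * values, while a measurement of two srtts or more just uses the
 * flat 1/8th portions.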
4433 */ 4434 if (rack->rc_gp_dyn_mul == 0) { 4435 subpart = rack->r_ctl.gp_bw * utim; 4436 subpart /= (srtt * 8); 4437 if (subpart < (rack->r_ctl.gp_bw / 2)) { 4438 /* 4439 * The b/w update takes no more 4440 * away then 1/2 our running total 4441 * so factor it in. 4442 */ 4443 addpart = bytes_ps * utim; 4444 addpart /= (srtt * 8); 4445 } else { 4446 /* 4447 * Don't allow a single measurement 4448 * to account for more than 1/2 of the 4449 * WMA. This could happen on a retransmission 4450 * where utim becomes huge compared to 4451 * srtt (multiple retransmissions when using 4452 * the sending rate which factors in all the 4453 * transmissions from the first one). 4454 */ 4455 subpart = rack->r_ctl.gp_bw / 2; 4456 addpart = bytes_ps / 2; 4457 } 4458 resid_bw = rack->r_ctl.gp_bw - subpart; 4459 rack->r_ctl.gp_bw = resid_bw + addpart; 4460 did_add = 1; 4461 } else { 4462 if ((utim / srtt) <= 1) { 4463 /* 4464 * The b/w update was over a small period 4465 * of time. The idea here is to prevent a small 4466 * measurement time period from counting 4467 * too much. So we scale it based on the 4468 * time so it attributes less than 1/rack_wma_divisor 4469 * of its measurement. 4470 */ 4471 subpart = rack->r_ctl.gp_bw * utim; 4472 subpart /= (srtt * rack_wma_divisor); 4473 addpart = bytes_ps * utim; 4474 addpart /= (srtt * rack_wma_divisor); 4475 } else { 4476 /* 4477 * The scaled measurement was long 4478 * enough so lets just add in the 4479 * portion of the measurment i.e. 1/rack_wma_divisor 4480 */ 4481 subpart = rack->r_ctl.gp_bw / rack_wma_divisor; 4482 addpart = bytes_ps / rack_wma_divisor; 4483 } 4484 if ((rack->measure_saw_probe_rtt == 0) || 4485 (bytes_ps > rack->r_ctl.gp_bw)) { 4486 /* 4487 * For probe-rtt we only add it in 4488 * if its larger, all others we just 4489 * add in. 4490 */ 4491 did_add = 1; 4492 resid_bw = rack->r_ctl.gp_bw - subpart; 4493 rack->r_ctl.gp_bw = resid_bw + addpart; 4494 } 4495 } 4496 } 4497 if ((rack->gp_ready == 0) && 4498 (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) { 4499 /* We have enough measurements now */ 4500 rack->gp_ready = 1; 4501 rack_set_cc_pacing(rack); 4502 if (rack->defer_options) 4503 rack_apply_deferred_options(rack); 4504 } 4505 rack_log_pacing_delay_calc(rack, subpart, addpart, bytes_ps, stim, 4506 rack_get_bw(rack), 22, did_add, NULL); 4507 /* We do not update any multipliers if we are in or have seen a probe-rtt */ 4508 if ((rack->measure_saw_probe_rtt == 0) && rack->rc_gp_rtt_set) 4509 rack_update_multiplier(rack, timely_says, bytes_ps, 4510 rack->r_ctl.rc_gp_srtt, 4511 rack->r_ctl.rc_rtt_diff); 4512 rack_log_pacing_delay_calc(rack, bytes, tim, bytes_ps, stim, 4513 rack_get_bw(rack), 3, line, NULL); 4514 /* reset the gp srtt and setup the new prev */ 4515 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; 4516 /* Record the lost count for the next measurement */ 4517 rack->r_ctl.rc_loss_at_start = rack->r_ctl.rc_loss_count; 4518 /* 4519 * We restart our diffs based on the gpsrtt in the 4520 * measurement window. 4521 */ 4522 rack->rc_gp_rtt_set = 0; 4523 rack->rc_gp_saw_rec = 0; 4524 rack->rc_gp_saw_ca = 0; 4525 rack->rc_gp_saw_ss = 0; 4526 rack->rc_dragged_bottom = 0; 4527 skip_measurement: 4528 4529 #ifdef STATS 4530 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, 4531 gput); 4532 /* 4533 * XXXLAS: This is a temporary hack, and should be 4534 * chained off VOI_TCP_GPUT when stats(9) grows an 4535 * API to deal with chained VOIs. 
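 * (The update below reports the goodput delta as a percentage of
 * the previous sample, e.g. a reading 20% above the last one is
 * logged to VOI_TCP_GPUT_ND as +20.)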
4536 */ 4537 if (tp->t_stats_gput_prev > 0) 4538 stats_voi_update_abs_s32(tp->t_stats, 4539 VOI_TCP_GPUT_ND, 4540 ((gput - tp->t_stats_gput_prev) * 100) / 4541 tp->t_stats_gput_prev); 4542 #endif 4543 tp->t_flags &= ~TF_GPUTINPROG; 4544 tp->t_stats_gput_prev = gput; 4545 /* 4546 * Now are we app limited now and there is space from where we 4547 * were to where we want to go? 4548 * 4549 * We don't do the other case i.e. non-applimited here since 4550 * the next send will trigger us picking up the missing data. 4551 */ 4552 if (rack->r_ctl.rc_first_appl && 4553 TCPS_HAVEESTABLISHED(tp->t_state) && 4554 rack->r_ctl.rc_app_limited_cnt && 4555 (SEQ_GT(rack->r_ctl.rc_first_appl->r_start, th_ack)) && 4556 ((rack->r_ctl.rc_first_appl->r_start - th_ack) > 4557 max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { 4558 /* 4559 * Yep there is enough outstanding to make a measurement here. 4560 */ 4561 struct rack_sendmap *rsm, fe; 4562 4563 tp->t_flags |= TF_GPUTINPROG; 4564 rack->r_ctl.rc_gp_lowrtt = 0xffffffff; 4565 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 4566 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 4567 rack->app_limited_needs_set = 0; 4568 tp->gput_seq = th_ack; 4569 if (rack->in_probe_rtt) 4570 rack->measure_saw_probe_rtt = 1; 4571 else if ((rack->measure_saw_probe_rtt) && 4572 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 4573 rack->measure_saw_probe_rtt = 0; 4574 if ((rack->r_ctl.rc_first_appl->r_start - th_ack) >= rack_get_measure_window(tp, rack)) { 4575 /* There is a full window to gain info from */ 4576 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 4577 } else { 4578 /* We can only measure up to the applimited point */ 4579 tp->gput_ack = tp->gput_seq + (rack->r_ctl.rc_first_appl->r_start - th_ack); 4580 } 4581 /* 4582 * Now we need to find the timestamp of the send at tp->gput_seq 4583 * for the send based measurement. 4584 */ 4585 fe.r_start = tp->gput_seq; 4586 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 4587 if (rsm) { 4588 /* Ok send-based limit is set */ 4589 if (SEQ_LT(rsm->r_start, tp->gput_seq)) { 4590 /* 4591 * Move back to include the earlier part 4592 * so our ack time lines up right (this may 4593 * make an overlapping measurement but thats 4594 * ok). 4595 */ 4596 tp->gput_seq = rsm->r_start; 4597 } 4598 if (rsm->r_flags & RACK_ACKED) 4599 tp->gput_ts = (uint32_t)rsm->r_ack_arrival; 4600 else 4601 rack->app_limited_needs_set = 1; 4602 rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 4603 } else { 4604 /* 4605 * If we don't find the rsm due to some 4606 * send-limit set the current time, which 4607 * basically disables the send-limit. 
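 * (In that case the send based clock starts at roughly the same
 * instant as the ack based gput_ts set just above, so the two
 * intervals should come out about the same for this window.)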
4608 */ 4609 struct timeval tv; 4610 4611 microuptime(&tv); 4612 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); 4613 } 4614 rack_log_pacing_delay_calc(rack, 4615 tp->gput_seq, 4616 tp->gput_ack, 4617 (uint64_t)rsm, 4618 tp->gput_ts, 4619 rack->r_ctl.rc_app_limited_cnt, 4620 9, 4621 __LINE__, NULL); 4622 } 4623 } 4624 4625 /* 4626 * CC wrapper hook functions 4627 */ 4628 static void 4629 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint16_t nsegs, 4630 uint16_t type, int32_t recovery) 4631 { 4632 uint32_t prior_cwnd, acked; 4633 struct tcp_log_buffer *lgb = NULL; 4634 uint8_t labc_to_use; 4635 4636 INP_WLOCK_ASSERT(tp->t_inpcb); 4637 tp->ccv->nsegs = nsegs; 4638 acked = tp->ccv->bytes_this_ack = (th_ack - tp->snd_una); 4639 if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) { 4640 uint32_t max; 4641 4642 max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp); 4643 if (tp->ccv->bytes_this_ack > max) { 4644 tp->ccv->bytes_this_ack = max; 4645 } 4646 } 4647 #ifdef STATS 4648 stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, 4649 ((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd); 4650 #endif 4651 if ((tp->t_flags & TF_GPUTINPROG) && 4652 rack_enough_for_measurement(tp, rack, th_ack)) { 4653 /* Measure the Goodput */ 4654 rack_do_goodput_measurement(tp, rack, th_ack, __LINE__); 4655 #ifdef NETFLIX_PEAKRATE 4656 if ((type == CC_ACK) && 4657 (tp->t_maxpeakrate)) { 4658 /* 4659 * We update t_peakrate_thr. This gives us roughly 4660 * one update per round trip time. Note 4661 * it will only be used if pace_always is off i.e 4662 * we don't do this for paced flows. 4663 */ 4664 rack_update_peakrate_thr(tp); 4665 } 4666 #endif 4667 } 4668 /* Which way our we limited, if not cwnd limited no advance in CA */ 4669 if (tp->snd_cwnd <= tp->snd_wnd) 4670 tp->ccv->flags |= CCF_CWND_LIMITED; 4671 else 4672 tp->ccv->flags &= ~CCF_CWND_LIMITED; 4673 if (tp->snd_cwnd > tp->snd_ssthresh) { 4674 tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, 4675 nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp)); 4676 /* For the setting of a window past use the actual scwnd we are using */ 4677 if (tp->t_bytes_acked >= rack->r_ctl.cwnd_to_use) { 4678 tp->t_bytes_acked -= rack->r_ctl.cwnd_to_use; 4679 tp->ccv->flags |= CCF_ABC_SENTAWND; 4680 } 4681 } else { 4682 tp->ccv->flags &= ~CCF_ABC_SENTAWND; 4683 tp->t_bytes_acked = 0; 4684 } 4685 prior_cwnd = tp->snd_cwnd; 4686 if ((recovery == 0) || (rack_max_abc_post_recovery == 0) || rack->r_use_labc_for_rec || 4687 (rack_client_low_buf && (rack->client_bufferlvl < rack_client_low_buf))) 4688 labc_to_use = rack->rc_labc; 4689 else 4690 labc_to_use = rack_max_abc_post_recovery; 4691 if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { 4692 union tcp_log_stackspecific log; 4693 struct timeval tv; 4694 4695 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 4696 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 4697 log.u_bbr.flex1 = th_ack; 4698 log.u_bbr.flex2 = tp->ccv->flags; 4699 log.u_bbr.flex3 = tp->ccv->bytes_this_ack; 4700 log.u_bbr.flex4 = tp->ccv->nsegs; 4701 log.u_bbr.flex5 = labc_to_use; 4702 log.u_bbr.flex6 = prior_cwnd; 4703 log.u_bbr.flex7 = V_tcp_do_newsack; 4704 log.u_bbr.flex8 = 1; 4705 lgb = tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 4706 0, &log, false, NULL, NULL, 0, &tv); 4707 } 4708 if (CC_ALGO(tp)->ack_received != NULL) { 4709 /* XXXLAS: Find a way to live without this */ 4710 tp->ccv->curack = th_ack; 4711 tp->ccv->labc = labc_to_use; 4712 tp->ccv->flags |= CCF_USE_LOCAL_ABC; 4713 
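/*
 * Descriptive note: curack, labc and CCF_USE_LOCAL_ABC (set just above)
 * hand the cumulative ack point and the RACK-chosen ABC limit
 * (labc_to_use) to the congestion control module before its
 * ack_received handler runs below.
 */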
CC_ALGO(tp)->ack_received(tp->ccv, type); 4714 } 4715 if (lgb) { 4716 lgb->tlb_stackinfo.u_bbr.flex6 = tp->snd_cwnd; 4717 } 4718 if (rack->r_must_retran) { 4719 if (SEQ_GEQ(th_ack, rack->r_ctl.rc_snd_max_at_rto)) { 4720 /* 4721 * We now are beyond the rxt point so lets disable 4722 * the flag. 4723 */ 4724 rack->r_ctl.rc_out_at_rto = 0; 4725 rack->r_must_retran = 0; 4726 } else if ((prior_cwnd + ctf_fixed_maxseg(tp)) <= tp->snd_cwnd) { 4727 /* 4728 * Only decrement the rc_out_at_rto if the cwnd advances 4729 * at least a whole segment. Otherwise next time the peer 4730 * acks, we won't be able to send this generaly happens 4731 * when we are in Congestion Avoidance. 4732 */ 4733 if (acked <= rack->r_ctl.rc_out_at_rto){ 4734 rack->r_ctl.rc_out_at_rto -= acked; 4735 } else { 4736 rack->r_ctl.rc_out_at_rto = 0; 4737 } 4738 } 4739 } 4740 #ifdef STATS 4741 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, rack->r_ctl.cwnd_to_use); 4742 #endif 4743 if (rack->r_ctl.rc_rack_largest_cwnd < rack->r_ctl.cwnd_to_use) { 4744 rack->r_ctl.rc_rack_largest_cwnd = rack->r_ctl.cwnd_to_use; 4745 } 4746 #ifdef NETFLIX_PEAKRATE 4747 /* we enforce max peak rate if it is set and we are not pacing */ 4748 if ((rack->rc_always_pace == 0) && 4749 tp->t_peakrate_thr && 4750 (tp->snd_cwnd > tp->t_peakrate_thr)) { 4751 tp->snd_cwnd = tp->t_peakrate_thr; 4752 } 4753 #endif 4754 } 4755 4756 static void 4757 tcp_rack_partialack(struct tcpcb *tp) 4758 { 4759 struct tcp_rack *rack; 4760 4761 rack = (struct tcp_rack *)tp->t_fb_ptr; 4762 INP_WLOCK_ASSERT(tp->t_inpcb); 4763 /* 4764 * If we are doing PRR and have enough 4765 * room to send <or> we are pacing and prr 4766 * is disabled we will want to see if we 4767 * can send data (by setting r_wanted_output to 4768 * true). 4769 */ 4770 if ((rack->r_ctl.rc_prr_sndcnt > 0) || 4771 rack->rack_no_prr) 4772 rack->r_wanted_output = 1; 4773 } 4774 4775 static void 4776 rack_post_recovery(struct tcpcb *tp, uint32_t th_ack) 4777 { 4778 struct tcp_rack *rack; 4779 uint32_t orig_cwnd; 4780 4781 orig_cwnd = tp->snd_cwnd; 4782 INP_WLOCK_ASSERT(tp->t_inpcb); 4783 rack = (struct tcp_rack *)tp->t_fb_ptr; 4784 /* only alert CC if we alerted when we entered */ 4785 if (CC_ALGO(tp)->post_recovery != NULL) { 4786 tp->ccv->curack = th_ack; 4787 CC_ALGO(tp)->post_recovery(tp->ccv); 4788 if (tp->snd_cwnd < tp->snd_ssthresh) { 4789 /* 4790 * Rack has burst control and pacing 4791 * so lets not set this any lower than 4792 * snd_ssthresh per RFC-6582 (option 2). 4793 */ 4794 tp->snd_cwnd = tp->snd_ssthresh; 4795 } 4796 } 4797 if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { 4798 union tcp_log_stackspecific log; 4799 struct timeval tv; 4800 4801 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 4802 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 4803 log.u_bbr.flex1 = th_ack; 4804 log.u_bbr.flex2 = tp->ccv->flags; 4805 log.u_bbr.flex3 = tp->ccv->bytes_this_ack; 4806 log.u_bbr.flex4 = tp->ccv->nsegs; 4807 log.u_bbr.flex5 = V_tcp_abc_l_var; 4808 log.u_bbr.flex6 = orig_cwnd; 4809 log.u_bbr.flex7 = V_tcp_do_newsack; 4810 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 4811 log.u_bbr.flex8 = 2; 4812 tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, 4813 0, &log, false, NULL, NULL, 0, &tv); 4814 } 4815 if ((rack->rack_no_prr == 0) && 4816 (rack->no_prr_addback == 0) && 4817 (rack->r_ctl.rc_prr_sndcnt > 0)) { 4818 /* 4819 * Suck the next prr cnt back into cwnd, but 4820 * only do that if we are not application limited. 
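 *
 * Illustrative example (hypothetical numbers): with a 1448 byte
 * maxseg, rack_prr_addbackmax of 2 and rc_prr_sndcnt of 4000 bytes,
 * the add-back below is min(2 * 1448, 4000) = 2896 bytes.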
4821 */ 4822 if (ctf_outstanding(tp) <= sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { 4823 /* 4824 * We are allowed to add back to the cwnd the amount we did 4825 * not get out if: 4826 * a) no_prr_addback is off. 4827 * b) we are not app limited 4828 * c) we are doing prr 4829 * <and> 4830 * d) it is bounded by rack_prr_addbackmax (if addback is 0, then none). 4831 */ 4832 tp->snd_cwnd += min((ctf_fixed_maxseg(tp) * rack_prr_addbackmax), 4833 rack->r_ctl.rc_prr_sndcnt); 4834 } 4835 rack->r_ctl.rc_prr_sndcnt = 0; 4836 rack_log_to_prr(rack, 1, 0); 4837 } 4838 rack_log_to_prr(rack, 14, orig_cwnd); 4839 tp->snd_recover = tp->snd_una; 4840 EXIT_RECOVERY(tp->t_flags); 4841 } 4842 4843 static void 4844 rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack) 4845 { 4846 struct tcp_rack *rack; 4847 uint32_t ssthresh_enter, cwnd_enter, in_rec_at_entry, orig_cwnd; 4848 4849 INP_WLOCK_ASSERT(tp->t_inpcb); 4850 #ifdef STATS 4851 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type); 4852 #endif 4853 if (IN_RECOVERY(tp->t_flags) == 0) { 4854 in_rec_at_entry = 0; 4855 ssthresh_enter = tp->snd_ssthresh; 4856 cwnd_enter = tp->snd_cwnd; 4857 } else 4858 in_rec_at_entry = 1; 4859 rack = (struct tcp_rack *)tp->t_fb_ptr; 4860 switch (type) { 4861 case CC_NDUPACK: 4862 tp->t_flags &= ~TF_WASFRECOVERY; 4863 tp->t_flags &= ~TF_WASCRECOVERY; 4864 if (!IN_FASTRECOVERY(tp->t_flags)) { 4865 rack->r_ctl.rc_prr_delivered = 0; 4866 rack->r_ctl.rc_prr_out = 0; 4867 if (rack->rack_no_prr == 0) { 4868 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 4869 rack_log_to_prr(rack, 2, in_rec_at_entry); 4870 } 4871 rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una; 4872 tp->snd_recover = tp->snd_max; 4873 if (tp->t_flags2 & TF2_ECN_PERMIT) 4874 tp->t_flags2 |= TF2_ECN_SND_CWR; 4875 } 4876 break; 4877 case CC_ECN: 4878 if (!IN_CONGRECOVERY(tp->t_flags) || 4879 /* 4880 * Allow ECN reaction on ACK to CWR, if 4881 * that data segment was also CE marked. 4882 */ 4883 SEQ_GEQ(ack, tp->snd_recover)) { 4884 EXIT_CONGRECOVERY(tp->t_flags); 4885 KMOD_TCPSTAT_INC(tcps_ecn_rcwnd); 4886 tp->snd_recover = tp->snd_max + 1; 4887 if (tp->t_flags2 & TF2_ECN_PERMIT) 4888 tp->t_flags2 |= TF2_ECN_SND_CWR; 4889 } 4890 break; 4891 case CC_RTO: 4892 tp->t_dupacks = 0; 4893 tp->t_bytes_acked = 0; 4894 EXIT_RECOVERY(tp->t_flags); 4895 tp->snd_ssthresh = max(2, min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 / 4896 ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp); 4897 orig_cwnd = tp->snd_cwnd; 4898 tp->snd_cwnd = ctf_fixed_maxseg(tp); 4899 rack_log_to_prr(rack, 16, orig_cwnd); 4900 if (tp->t_flags2 & TF2_ECN_PERMIT) 4901 tp->t_flags2 |= TF2_ECN_SND_CWR; 4902 break; 4903 case CC_RTO_ERR: 4904 KMOD_TCPSTAT_INC(tcps_sndrexmitbad); 4905 /* RTO was unnecessary, so reset everything. 
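 * Note that the *_prev values restored below are the snapshots taken
 * before the (now judged spurious) RTO, so cwnd, ssthresh and the
 * recovery point all roll back to their pre-RTO state.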
*/ 4906 tp->snd_cwnd = tp->snd_cwnd_prev; 4907 tp->snd_ssthresh = tp->snd_ssthresh_prev; 4908 tp->snd_recover = tp->snd_recover_prev; 4909 if (tp->t_flags & TF_WASFRECOVERY) { 4910 ENTER_FASTRECOVERY(tp->t_flags); 4911 tp->t_flags &= ~TF_WASFRECOVERY; 4912 } 4913 if (tp->t_flags & TF_WASCRECOVERY) { 4914 ENTER_CONGRECOVERY(tp->t_flags); 4915 tp->t_flags &= ~TF_WASCRECOVERY; 4916 } 4917 tp->snd_nxt = tp->snd_max; 4918 tp->t_badrxtwin = 0; 4919 break; 4920 } 4921 if ((CC_ALGO(tp)->cong_signal != NULL) && 4922 (type != CC_RTO)){ 4923 tp->ccv->curack = ack; 4924 CC_ALGO(tp)->cong_signal(tp->ccv, type); 4925 } 4926 if ((in_rec_at_entry == 0) && IN_RECOVERY(tp->t_flags)) { 4927 rack_log_to_prr(rack, 15, cwnd_enter); 4928 rack->r_ctl.dsack_byte_cnt = 0; 4929 rack->r_ctl.retran_during_recovery = 0; 4930 rack->r_ctl.rc_cwnd_at_erec = cwnd_enter; 4931 rack->r_ctl.rc_ssthresh_at_erec = ssthresh_enter; 4932 rack->r_ent_rec_ns = 1; 4933 } 4934 } 4935 4936 static inline void 4937 rack_cc_after_idle(struct tcp_rack *rack, struct tcpcb *tp) 4938 { 4939 uint32_t i_cwnd; 4940 4941 INP_WLOCK_ASSERT(tp->t_inpcb); 4942 4943 #ifdef NETFLIX_STATS 4944 KMOD_TCPSTAT_INC(tcps_idle_restarts); 4945 if (tp->t_state == TCPS_ESTABLISHED) 4946 KMOD_TCPSTAT_INC(tcps_idle_estrestarts); 4947 #endif 4948 if (CC_ALGO(tp)->after_idle != NULL) 4949 CC_ALGO(tp)->after_idle(tp->ccv); 4950 4951 if (tp->snd_cwnd == 1) 4952 i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ 4953 else 4954 i_cwnd = rc_init_window(rack); 4955 4956 /* 4957 * Being idle is no differnt than the initial window. If the cc 4958 * clamps it down below the initial window raise it to the initial 4959 * window. 4960 */ 4961 if (tp->snd_cwnd < i_cwnd) { 4962 tp->snd_cwnd = i_cwnd; 4963 } 4964 } 4965 4966 /* 4967 * Indicate whether this ack should be delayed. We can delay the ack if 4968 * following conditions are met: 4969 * - There is no delayed ack timer in progress. 4970 * - Our last ack wasn't a 0-sized window. We never want to delay 4971 * the ack that opens up a 0-sized window. 4972 * - LRO wasn't used for this segment. We make sure by checking that the 4973 * segment size is not larger than the MSS. 4974 * - Delayed acks are enabled or this is a half-synchronized T/TCP 4975 * connection. 4976 */ 4977 #define DELAY_ACK(tp, tlen) \ 4978 (((tp->t_flags & TF_RXWIN0SENT) == 0) && \ 4979 ((tp->t_flags & TF_DELACK) == 0) && \ 4980 (tlen <= tp->t_maxseg) && \ 4981 (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN))) 4982 4983 static struct rack_sendmap * 4984 rack_find_lowest_rsm(struct tcp_rack *rack) 4985 { 4986 struct rack_sendmap *rsm; 4987 4988 /* 4989 * Walk the time-order transmitted list looking for an rsm that is 4990 * not acked. This will be the one that was sent the longest time 4991 * ago that is still outstanding. 4992 */ 4993 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 4994 if (rsm->r_flags & RACK_ACKED) { 4995 continue; 4996 } 4997 goto finish; 4998 } 4999 finish: 5000 return (rsm); 5001 } 5002 5003 static struct rack_sendmap * 5004 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm) 5005 { 5006 struct rack_sendmap *prsm; 5007 5008 /* 5009 * Walk the sequence order list backward until we hit and arrive at 5010 * the highest seq not acked. In theory when this is called it 5011 * should be the last segment (which it was not). 
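 *
 * Concretely, the reverse walk below skips entries flagged RACK_ACKED
 * or RACK_HAS_FIN and returns the first remaining entry, or NULL when
 * no such entry exists.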
5012 */ 5013 counter_u64_add(rack_find_high, 1); 5014 prsm = rsm; 5015 RB_FOREACH_REVERSE_FROM(prsm, rack_rb_tree_head, rsm) { 5016 if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) { 5017 continue; 5018 } 5019 return (prsm); 5020 } 5021 return (NULL); 5022 } 5023 5024 static uint32_t 5025 rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts) 5026 { 5027 int32_t lro; 5028 uint32_t thresh; 5029 5030 /* 5031 * lro is the flag we use to determine if we have seen reordering. 5032 * If it gets set we have seen reordering. The reorder logic either 5033 * works in one of two ways: 5034 * 5035 * If reorder-fade is configured, then we track the last time we saw 5036 * re-ordering occur. If we reach the point where enough time has 5037 * passed we no longer consider reordering as occurring. 5038 * 5039 * Or if reorder-fade is 0, then once we see reordering we consider 5040 * the connection to always be subject to reordering and just set lro 5041 * to 1. 5042 * 5043 * In the end if lro is non-zero we add the extra time for 5044 * reordering in. 5045 */ 5046 if (srtt == 0) 5047 srtt = 1; 5048 if (rack->r_ctl.rc_reorder_ts) { 5049 if (rack->r_ctl.rc_reorder_fade) { 5050 if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) { 5051 lro = cts - rack->r_ctl.rc_reorder_ts; 5052 if (lro == 0) { 5053 /* 5054 * No time has passed since the last 5055 * reorder, mark it as reordering. 5056 */ 5057 lro = 1; 5058 } 5059 } else { 5060 /* Negative time? */ 5061 lro = 0; 5062 } 5063 if (lro > rack->r_ctl.rc_reorder_fade) { 5064 /* Turn off reordering seen too */ 5065 rack->r_ctl.rc_reorder_ts = 0; 5066 lro = 0; 5067 } 5068 } else { 5069 /* Reordering does not fade */ 5070 lro = 1; 5071 } 5072 } else { 5073 lro = 0; 5074 } 5075 thresh = srtt + rack->r_ctl.rc_pkt_delay; 5076 if (lro) { 5077 /* It must be set, if not you get 1/4 rtt */ 5078 if (rack->r_ctl.rc_reorder_shift) 5079 thresh += (srtt >> rack->r_ctl.rc_reorder_shift); 5080 else 5081 thresh += (srtt >> 2); 5082 } else { 5083 thresh += 1; 5084 } 5085 /* We don't let the rack timeout be above an RTO */ 5086 if (thresh > rack->rc_tp->t_rxtcur) { 5087 thresh = rack->rc_tp->t_rxtcur; 5088 } 5089 /* And we don't want it above the RTO max either */ 5090 if (thresh > rack_rto_max) { 5091 thresh = rack_rto_max; 5092 } 5093 return (thresh); 5094 } 5095 5096 static uint32_t 5097 rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack, 5098 struct rack_sendmap *rsm, uint32_t srtt) 5099 { 5100 struct rack_sendmap *prsm; 5101 uint32_t thresh, len; 5102 int segsiz; 5103 5104 if (srtt == 0) 5105 srtt = 1; 5106 if (rack->r_ctl.rc_tlp_threshold) 5107 thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold); 5108 else 5109 thresh = (srtt * 2); 5110 5111 /* Get the previously sent packet, if any */ 5112 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 5113 counter_u64_add(rack_enter_tlp_calc, 1); 5114 len = rsm->r_end - rsm->r_start; 5115 if (rack->rack_tlp_threshold_use == TLP_USE_ID) { 5116 /* Exactly like the ID */ 5117 if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= segsiz) { 5118 uint32_t alt_thresh; 5119 /* 5120 * Compensate for delayed-ack with the d-ack time.
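 *
 * Illustrative arithmetic (hypothetical srtt of 40000): the alternative
 * threshold below works out to 40000 + 20000 + rack_delayed_ack_time,
 * and it only replaces thresh when it is the larger of the two.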
5121 */ 5122 counter_u64_add(rack_used_tlpmethod, 1); 5123 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 5124 if (alt_thresh > thresh) 5125 thresh = alt_thresh; 5126 } 5127 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) { 5128 /* 2.1 behavior */ 5129 prsm = TAILQ_PREV(rsm, rack_head, r_tnext); 5130 if (prsm && (len <= segsiz)) { 5131 /* 5132 * Two packets outstanding, thresh should be (2*srtt) + 5133 * possible inter-packet delay (if any). 5134 */ 5135 uint32_t inter_gap = 0; 5136 int idx, nidx; 5137 5138 counter_u64_add(rack_used_tlpmethod, 1); 5139 idx = rsm->r_rtr_cnt - 1; 5140 nidx = prsm->r_rtr_cnt - 1; 5141 if (rsm->r_tim_lastsent[nidx] >= prsm->r_tim_lastsent[idx]) { 5142 /* Yes it was sent later (or at the same time) */ 5143 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; 5144 } 5145 thresh += inter_gap; 5146 } else if (len <= segsiz) { 5147 /* 5148 * Possibly compensate for delayed-ack. 5149 */ 5150 uint32_t alt_thresh; 5151 5152 counter_u64_add(rack_used_tlpmethod2, 1); 5153 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 5154 if (alt_thresh > thresh) 5155 thresh = alt_thresh; 5156 } 5157 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) { 5158 /* 2.2 behavior */ 5159 if (len <= segsiz) { 5160 uint32_t alt_thresh; 5161 /* 5162 * Compensate for delayed-ack with the d-ack time. 5163 */ 5164 counter_u64_add(rack_used_tlpmethod, 1); 5165 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 5166 if (alt_thresh > thresh) 5167 thresh = alt_thresh; 5168 } 5169 } 5170 /* Not above an RTO */ 5171 if (thresh > tp->t_rxtcur) { 5172 thresh = tp->t_rxtcur; 5173 } 5174 /* Not above a RTO max */ 5175 if (thresh > rack_rto_max) { 5176 thresh = rack_rto_max; 5177 } 5178 /* Apply user supplied min TLP */ 5179 if (thresh < rack_tlp_min) { 5180 thresh = rack_tlp_min; 5181 } 5182 return (thresh); 5183 } 5184 5185 static uint32_t 5186 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack) 5187 { 5188 /* 5189 * We want the rack_rtt which is the 5190 * last rtt we measured. However if that 5191 * does not exist we fallback to the srtt (which 5192 * we probably will never do) and then as a last 5193 * resort we use RACK_INITIAL_RTO if no srtt is 5194 * yet set. 5195 */ 5196 if (rack->rc_rack_rtt) 5197 return (rack->rc_rack_rtt); 5198 else if (tp->t_srtt == 0) 5199 return (RACK_INITIAL_RTO); 5200 return (tp->t_srtt); 5201 } 5202 5203 static struct rack_sendmap * 5204 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) 5205 { 5206 /* 5207 * Check to see that we don't need to fall into recovery. We will 5208 * need to do so if our oldest transmit is past the time we should 5209 * have had an ack. 
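 *
 * Sketch of the check done below: take the oldest transmitted rsm that
 * is not yet acked, compute thresh via rack_calc_thresh_rack(), and
 * only when (tsused - its last send time) >= thresh do we treat it as
 * over-due and signal CC_NDUPACK to enter recovery.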
5210 */ 5211 struct tcp_rack *rack; 5212 struct rack_sendmap *rsm; 5213 int32_t idx; 5214 uint32_t srtt, thresh; 5215 5216 rack = (struct tcp_rack *)tp->t_fb_ptr; 5217 if (RB_EMPTY(&rack->r_ctl.rc_mtree)) { 5218 return (NULL); 5219 } 5220 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 5221 if (rsm == NULL) 5222 return (NULL); 5223 5224 if (rsm->r_flags & RACK_ACKED) { 5225 rsm = rack_find_lowest_rsm(rack); 5226 if (rsm == NULL) 5227 return (NULL); 5228 } 5229 idx = rsm->r_rtr_cnt - 1; 5230 srtt = rack_grab_rtt(tp, rack); 5231 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 5232 if (TSTMP_LT(tsused, ((uint32_t)rsm->r_tim_lastsent[idx]))) { 5233 return (NULL); 5234 } 5235 if ((tsused - ((uint32_t)rsm->r_tim_lastsent[idx])) < thresh) { 5236 return (NULL); 5237 } 5238 /* Ok if we reach here we are over-due and this guy can be sent */ 5239 if (IN_RECOVERY(tp->t_flags) == 0) { 5240 /* 5241 * For the one that enters us into recovery record undo 5242 * info. 5243 */ 5244 rack->r_ctl.rc_rsm_start = rsm->r_start; 5245 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 5246 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 5247 } 5248 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una); 5249 return (rsm); 5250 } 5251 5252 static uint32_t 5253 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack) 5254 { 5255 int32_t t; 5256 int32_t tt; 5257 uint32_t ret_val; 5258 5259 t = (tp->t_srtt + (tp->t_rttvar << 2)); 5260 RACK_TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], 5261 rack_persist_min, rack_persist_max, rack->r_ctl.timer_slop); 5262 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 5263 tp->t_rxtshift++; 5264 rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT; 5265 ret_val = (uint32_t)tt; 5266 return (ret_val); 5267 } 5268 5269 static uint32_t 5270 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack) 5271 { 5272 /* 5273 * Start the FR timer, we do this based on getting the first one in 5274 * the rc_tmap. Note that if its NULL we must stop the timer. in all 5275 * events we need to stop the running timer (if its running) before 5276 * starting the new one. 5277 */ 5278 uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse; 5279 uint32_t srtt_cur; 5280 int32_t idx; 5281 int32_t is_tlp_timer = 0; 5282 struct rack_sendmap *rsm; 5283 5284 if (rack->t_timers_stopped) { 5285 /* All timers have been stopped none are to run */ 5286 return (0); 5287 } 5288 if (rack->rc_in_persist) { 5289 /* We can't start any timer in persists */ 5290 return (rack_get_persists_timer_val(tp, rack)); 5291 } 5292 rack->rc_on_min_to = 0; 5293 if ((tp->t_state < TCPS_ESTABLISHED) || 5294 ((tp->t_flags & TF_SACK_PERMIT) == 0)) { 5295 goto activate_rxt; 5296 } 5297 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 5298 if ((rsm == NULL) || sup_rack) { 5299 /* Nothing on the send map or no rack */ 5300 activate_rxt: 5301 time_since_sent = 0; 5302 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 5303 if (rsm) { 5304 /* 5305 * Should we discount the RTX timer any? 5306 * 5307 * We want to discount it the smallest amount. 5308 * If a timer (Rack/TLP or RXT) has gone off more 5309 * recently thats the discount we want to use (now - timer time). 5310 * If the retransmit of the oldest packet was more recent then 5311 * we want to use that (now - oldest-packet-last_transmit_time). 
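 *
 * Illustrative arithmetic (hypothetical values): with t_rxtcur at
 * 200000 and the more recent of those two events 150000 ago, the RXT
 * timer below is armed for 200000 - 150000 = 50000, floored at
 * rc_min_to.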
5312 * 5313 */ 5314 idx = rsm->r_rtr_cnt - 1; 5315 if (TSTMP_GEQ(rack->r_ctl.rc_tlp_rxt_last_time, ((uint32_t)rsm->r_tim_lastsent[idx]))) 5316 tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time; 5317 else 5318 tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx]; 5319 if (TSTMP_GT(cts, tstmp_touse)) 5320 time_since_sent = cts - tstmp_touse; 5321 } 5322 if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { 5323 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; 5324 to = tp->t_rxtcur; 5325 if (to > time_since_sent) 5326 to -= time_since_sent; 5327 else 5328 to = rack->r_ctl.rc_min_to; 5329 if (to == 0) 5330 to = 1; 5331 /* Special case for KEEPINIT */ 5332 if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) && 5333 (TP_KEEPINIT(tp) != 0) && 5334 rsm) { 5335 /* 5336 * We have to put a ceiling on the rxt timer 5337 * of the keep-init timeout. 5338 */ 5339 uint32_t max_time, red; 5340 5341 max_time = TICKS_2_USEC(TP_KEEPINIT(tp)); 5342 if (TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) { 5343 red = (cts - (uint32_t)rsm->r_tim_lastsent[0]); 5344 if (red < max_time) 5345 max_time -= red; 5346 else 5347 max_time = 1; 5348 } 5349 /* Reduce timeout to the keep value if needed */ 5350 if (max_time < to) 5351 to = max_time; 5352 } 5353 return (to); 5354 } 5355 return (0); 5356 } 5357 if (rsm->r_flags & RACK_ACKED) { 5358 rsm = rack_find_lowest_rsm(rack); 5359 if (rsm == NULL) { 5360 /* No lowest? */ 5361 goto activate_rxt; 5362 } 5363 } 5364 if (rack->sack_attack_disable) { 5365 /* 5366 * We don't want to do 5367 * any TLP's if you are an attacker. 5368 * Though if you are doing what 5369 * is expected you may still have 5370 * SACK-PASSED marks. 5371 */ 5372 goto activate_rxt; 5373 } 5374 /* Convert from ms to usecs */ 5375 if ((rsm->r_flags & RACK_SACK_PASSED) || (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { 5376 if ((tp->t_flags & TF_SENTFIN) && 5377 ((tp->snd_max - tp->snd_una) == 1) && 5378 (rsm->r_flags & RACK_HAS_FIN)) { 5379 /* 5380 * We don't start a rack timer if all we have is a 5381 * FIN outstanding. 5382 */ 5383 goto activate_rxt; 5384 } 5385 if ((rack->use_rack_rr == 0) && 5386 (IN_FASTRECOVERY(tp->t_flags)) && 5387 (rack->rack_no_prr == 0) && 5388 (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { 5389 /* 5390 * We are not cheating, in recovery and 5391 * not enough ack's to yet get our next 5392 * retransmission out. 5393 * 5394 * Note that classified attackers do not 5395 * get to use the rack-cheat. 5396 */ 5397 goto activate_tlp; 5398 } 5399 srtt = rack_grab_rtt(tp, rack); 5400 thresh = rack_calc_thresh_rack(rack, srtt, cts); 5401 idx = rsm->r_rtr_cnt - 1; 5402 exp = ((uint32_t)rsm->r_tim_lastsent[idx]) + thresh; 5403 if (SEQ_GEQ(exp, cts)) { 5404 to = exp - cts; 5405 if (to < rack->r_ctl.rc_min_to) { 5406 to = rack->r_ctl.rc_min_to; 5407 if (rack->r_rr_config == 3) 5408 rack->rc_on_min_to = 1; 5409 } 5410 } else { 5411 to = rack->r_ctl.rc_min_to; 5412 if (rack->r_rr_config == 3) 5413 rack->rc_on_min_to = 1; 5414 } 5415 } else { 5416 /* Ok we need to do a TLP not RACK */ 5417 activate_tlp: 5418 if ((rack->rc_tlp_in_progress != 0) && 5419 (rack->r_ctl.rc_tlp_cnt_out >= rack_tlp_limit)) { 5420 /* 5421 * The previous send was a TLP and we have sent 5422 * N TLP's without sending new data. 5423 */ 5424 goto activate_rxt; 5425 } 5426 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 5427 if (rsm == NULL) { 5428 /* We found no rsm to TLP with. 
*/ 5429 goto activate_rxt; 5430 } 5431 if (rsm->r_flags & RACK_HAS_FIN) { 5432 /* If its a FIN we dont do TLP */ 5433 rsm = NULL; 5434 goto activate_rxt; 5435 } 5436 idx = rsm->r_rtr_cnt - 1; 5437 time_since_sent = 0; 5438 if (TSTMP_GEQ(((uint32_t)rsm->r_tim_lastsent[idx]), rack->r_ctl.rc_tlp_rxt_last_time)) 5439 tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx]; 5440 else 5441 tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time; 5442 if (TSTMP_GT(cts, tstmp_touse)) 5443 time_since_sent = cts - tstmp_touse; 5444 is_tlp_timer = 1; 5445 if (tp->t_srtt) { 5446 if ((rack->rc_srtt_measure_made == 0) && 5447 (tp->t_srtt == 1)) { 5448 /* 5449 * If another stack as run and set srtt to 1, 5450 * then the srtt was 0, so lets use the initial. 5451 */ 5452 srtt = RACK_INITIAL_RTO; 5453 } else { 5454 srtt_cur = tp->t_srtt; 5455 srtt = srtt_cur; 5456 } 5457 } else 5458 srtt = RACK_INITIAL_RTO; 5459 /* 5460 * If the SRTT is not keeping up and the 5461 * rack RTT has spiked we want to use 5462 * the last RTT not the smoothed one. 5463 */ 5464 if (rack_tlp_use_greater && 5465 tp->t_srtt && 5466 (srtt < rack_grab_rtt(tp, rack))) { 5467 srtt = rack_grab_rtt(tp, rack); 5468 } 5469 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt); 5470 if (thresh > time_since_sent) { 5471 to = thresh - time_since_sent; 5472 } else { 5473 to = rack->r_ctl.rc_min_to; 5474 rack_log_alt_to_to_cancel(rack, 5475 thresh, /* flex1 */ 5476 time_since_sent, /* flex2 */ 5477 tstmp_touse, /* flex3 */ 5478 rack->r_ctl.rc_tlp_rxt_last_time, /* flex4 */ 5479 (uint32_t)rsm->r_tim_lastsent[idx], 5480 srtt, 5481 idx, 99); 5482 } 5483 if (to < rack_tlp_min) { 5484 to = rack_tlp_min; 5485 } 5486 if (to > TICKS_2_USEC(TCPTV_REXMTMAX)) { 5487 /* 5488 * If the TLP time works out to larger than the max 5489 * RTO lets not do TLP.. just RTO. 5490 */ 5491 goto activate_rxt; 5492 } 5493 } 5494 if (is_tlp_timer == 0) { 5495 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; 5496 } else { 5497 rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; 5498 } 5499 if (to == 0) 5500 to = 1; 5501 return (to); 5502 } 5503 5504 static void 5505 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5506 { 5507 if (rack->rc_in_persist == 0) { 5508 if (tp->t_flags & TF_GPUTINPROG) { 5509 /* 5510 * Stop the goodput now, the calling of the 5511 * measurement function clears the flag. 
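 *
 * Finalizing the measurement at snd_una below means entering persist
 * (a zero window) does not leave a stale TF_GPUTINPROG measurement
 * running across the idle period.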
5512 */ 5513 rack_do_goodput_measurement(tp, rack, tp->snd_una, __LINE__); 5514 } 5515 #ifdef NETFLIX_SHARED_CWND 5516 if (rack->r_ctl.rc_scw) { 5517 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 5518 rack->rack_scwnd_is_idle = 1; 5519 } 5520 #endif 5521 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 5522 if (rack->r_ctl.rc_went_idle_time == 0) 5523 rack->r_ctl.rc_went_idle_time = 1; 5524 rack_timer_cancel(tp, rack, cts, __LINE__); 5525 tp->t_rxtshift = 0; 5526 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 5527 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 5528 rack->rc_in_persist = 1; 5529 } 5530 } 5531 5532 static void 5533 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5534 { 5535 if (rack->rc_inp->inp_in_hpts) { 5536 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 5537 rack->r_ctl.rc_hpts_flags = 0; 5538 } 5539 #ifdef NETFLIX_SHARED_CWND 5540 if (rack->r_ctl.rc_scw) { 5541 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 5542 rack->rack_scwnd_is_idle = 0; 5543 } 5544 #endif 5545 if (rack->rc_gp_dyn_mul && 5546 (rack->use_fixed_rate == 0) && 5547 (rack->rc_always_pace)) { 5548 /* 5549 * Do we count this as if a probe-rtt just 5550 * finished? 5551 */ 5552 uint32_t time_idle, idle_min; 5553 5554 time_idle = tcp_get_usecs(NULL) - rack->r_ctl.rc_went_idle_time; 5555 idle_min = rack_min_probertt_hold; 5556 if (rack_probertt_gpsrtt_cnt_div) { 5557 uint64_t extra; 5558 extra = (uint64_t)rack->r_ctl.rc_gp_srtt * 5559 (uint64_t)rack_probertt_gpsrtt_cnt_mul; 5560 extra /= (uint64_t)rack_probertt_gpsrtt_cnt_div; 5561 idle_min += (uint32_t)extra; 5562 } 5563 if (time_idle >= idle_min) { 5564 /* Yes, we count it as a probe-rtt. */ 5565 uint32_t us_cts; 5566 5567 us_cts = tcp_get_usecs(NULL); 5568 if (rack->in_probe_rtt == 0) { 5569 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 5570 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; 5571 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; 5572 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; 5573 } else { 5574 rack_exit_probertt(rack, us_cts); 5575 } 5576 } 5577 } 5578 rack->rc_in_persist = 0; 5579 rack->r_ctl.rc_went_idle_time = 0; 5580 tp->t_rxtshift = 0; 5581 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 5582 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 5583 rack->r_ctl.rc_agg_delayed = 0; 5584 rack->r_early = 0; 5585 rack->r_late = 0; 5586 rack->r_ctl.rc_agg_early = 0; 5587 } 5588 5589 static void 5590 rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts, 5591 struct hpts_diag *diag, struct timeval *tv) 5592 { 5593 if (rack_verbose_logging && rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 5594 union tcp_log_stackspecific log; 5595 5596 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 5597 log.u_bbr.flex1 = diag->p_nxt_slot; 5598 log.u_bbr.flex2 = diag->p_cur_slot; 5599 log.u_bbr.flex3 = diag->slot_req; 5600 log.u_bbr.flex4 = diag->inp_hptsslot; 5601 log.u_bbr.flex5 = diag->slot_remaining; 5602 log.u_bbr.flex6 = diag->need_new_to; 5603 log.u_bbr.flex7 = diag->p_hpts_active; 5604 log.u_bbr.flex8 = diag->p_on_min_sleep; 5605 /* Hijack other fields as needed */ 5606 log.u_bbr.epoch = diag->have_slept; 5607 log.u_bbr.lt_epoch = diag->yet_to_sleep; 5608 log.u_bbr.pkts_out = diag->co_ret; 5609 log.u_bbr.applimited = diag->hpts_sleep_time; 5610 log.u_bbr.delivered = diag->p_prev_slot; 5611 log.u_bbr.inflight = diag->p_runningtick; 5612 log.u_bbr.bw_inuse = diag->wheel_tick; 5613 log.u_bbr.rttProp 
= diag->wheel_cts; 5614 log.u_bbr.timeStamp = cts; 5615 log.u_bbr.delRate = diag->maxticks; 5616 log.u_bbr.cur_del_rate = diag->p_curtick; 5617 log.u_bbr.cur_del_rate <<= 32; 5618 log.u_bbr.cur_del_rate |= diag->p_lasttick; 5619 TCP_LOG_EVENTP(rack->rc_tp, NULL, 5620 &rack->rc_inp->inp_socket->so_rcv, 5621 &rack->rc_inp->inp_socket->so_snd, 5622 BBR_LOG_HPTSDIAG, 0, 5623 0, &log, false, tv); 5624 } 5625 5626 } 5627 5628 static void 5629 rack_log_wakeup(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb, uint32_t len, int type) 5630 { 5631 if (rack_verbose_logging && rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 5632 union tcp_log_stackspecific log; 5633 struct timeval tv; 5634 5635 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 5636 log.u_bbr.flex1 = sb->sb_flags; 5637 log.u_bbr.flex2 = len; 5638 log.u_bbr.flex3 = sb->sb_state; 5639 log.u_bbr.flex8 = type; 5640 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 5641 TCP_LOG_EVENTP(rack->rc_tp, NULL, 5642 &rack->rc_inp->inp_socket->so_rcv, 5643 &rack->rc_inp->inp_socket->so_snd, 5644 TCP_LOG_SB_WAKE, 0, 5645 len, &log, false, &tv); 5646 } 5647 } 5648 5649 static void 5650 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, 5651 int32_t slot, uint32_t tot_len_this_send, int sup_rack) 5652 { 5653 struct hpts_diag diag; 5654 struct inpcb *inp; 5655 struct timeval tv; 5656 uint32_t delayed_ack = 0; 5657 uint32_t hpts_timeout; 5658 uint32_t entry_slot = slot; 5659 uint8_t stopped; 5660 uint32_t left = 0; 5661 uint32_t us_cts; 5662 5663 inp = tp->t_inpcb; 5664 if ((tp->t_state == TCPS_CLOSED) || 5665 (tp->t_state == TCPS_LISTEN)) { 5666 return; 5667 } 5668 if (inp->inp_in_hpts) { 5669 /* Already on the pacer */ 5670 return; 5671 } 5672 stopped = rack->rc_tmr_stopped; 5673 if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { 5674 left = rack->r_ctl.rc_timer_exp - cts; 5675 } 5676 rack->r_ctl.rc_timer_exp = 0; 5677 rack->r_ctl.rc_hpts_flags = 0; 5678 us_cts = tcp_get_usecs(&tv); 5679 /* Now early/late accounting */ 5680 rack_log_pacing_delay_calc(rack, entry_slot, slot, 0, 0, 0, 26, __LINE__, NULL); 5681 if (rack->r_early && (rack->rc_ack_can_sendout_data == 0)) { 5682 /* 5683 * We have a early carry over set, 5684 * we can always add more time so we 5685 * can always make this compensation. 5686 * 5687 * Note if ack's are allowed to wake us do not 5688 * penalize the next timer for being awoke 5689 * by an ack aka the rc_agg_early (non-paced mode). 5690 */ 5691 slot += rack->r_ctl.rc_agg_early; 5692 rack->r_early = 0; 5693 rack->r_ctl.rc_agg_early = 0; 5694 } 5695 if (rack->r_late) { 5696 /* 5697 * This is harder, we can 5698 * compensate some but it 5699 * really depends on what 5700 * the current pacing time is. 5701 */ 5702 if (rack->r_ctl.rc_agg_delayed >= slot) { 5703 /* 5704 * We can't compensate for it all. 5705 * And we have to have some time 5706 * on the clock. We always have a min 5707 * 10 slots (10 x 10 i.e. 100 usecs). 
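 *
 * Sketch of the two cases handled below: if the requested slot is
 * already at or under that minimum we keep the minimum and add the
 * shortfall to rc_agg_delayed; otherwise the slot is cut down to the
 * minimum and the amount removed is credited against rc_agg_delayed.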
5708 */ 5709 if (slot <= HPTS_TICKS_PER_USEC) { 5710 /* We gain delay */ 5711 rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_USEC - slot); 5712 slot = HPTS_TICKS_PER_USEC; 5713 } else { 5714 /* We take off some */ 5715 rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_USEC); 5716 slot = HPTS_TICKS_PER_USEC; 5717 } 5718 } else { 5719 slot -= rack->r_ctl.rc_agg_delayed; 5720 rack->r_ctl.rc_agg_delayed = 0; 5721 /* Make sure we have 100 useconds at minimum */ 5722 if (slot < HPTS_TICKS_PER_USEC) { 5723 rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_USEC - slot; 5724 slot = HPTS_TICKS_PER_USEC; 5725 } 5726 if (rack->r_ctl.rc_agg_delayed == 0) 5727 rack->r_late = 0; 5728 } 5729 } 5730 if (slot) { 5731 /* We are pacing too */ 5732 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; 5733 } 5734 hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack); 5735 #ifdef NETFLIX_EXP_DETECTION 5736 if (rack->sack_attack_disable && 5737 (slot < tcp_sad_pacing_interval)) { 5738 /* 5739 * We have a potential attacker on 5740 * the line. We have possibly some 5741 * (or now) pacing time set. We want to 5742 * slow down the processing of sacks by some 5743 * amount (if it is an attacker). Set the default 5744 * slot for attackers in place (unless the orginal 5745 * interval is longer). Its stored in 5746 * micro-seconds, so lets convert to msecs. 5747 */ 5748 slot = tcp_sad_pacing_interval; 5749 } 5750 #endif 5751 if (tp->t_flags & TF_DELACK) { 5752 delayed_ack = TICKS_2_USEC(tcp_delacktime); 5753 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; 5754 } 5755 if (delayed_ack && ((hpts_timeout == 0) || 5756 (delayed_ack < hpts_timeout))) 5757 hpts_timeout = delayed_ack; 5758 else 5759 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 5760 /* 5761 * If no timers are going to run and we will fall off the hptsi 5762 * wheel, we resort to a keep-alive timer if its configured. 5763 */ 5764 if ((hpts_timeout == 0) && 5765 (slot == 0)) { 5766 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 5767 (tp->t_state <= TCPS_CLOSING)) { 5768 /* 5769 * Ok we have no timer (persists, rack, tlp, rxt or 5770 * del-ack), we don't have segments being paced. So 5771 * all that is left is the keepalive timer. 5772 */ 5773 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 5774 /* Get the established keep-alive time */ 5775 hpts_timeout = TICKS_2_USEC(TP_KEEPIDLE(tp)); 5776 } else { 5777 /* 5778 * Get the initial setup keep-alive time, 5779 * note that this is probably not going to 5780 * happen, since rack will be running a rxt timer 5781 * if a SYN of some sort is outstanding. It is 5782 * actually handled in rack_timeout_rxt(). 5783 */ 5784 hpts_timeout = TICKS_2_USEC(TP_KEEPINIT(tp)); 5785 } 5786 rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; 5787 if (rack->in_probe_rtt) { 5788 /* 5789 * We want to instead not wake up a long time from 5790 * now but to wake up about the time we would 5791 * exit probe-rtt and initiate a keep-alive ack. 5792 * This will get us out of probe-rtt and update 5793 * our min-rtt. 5794 */ 5795 hpts_timeout = rack_min_probertt_hold; 5796 } 5797 } 5798 } 5799 if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) == 5800 (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) { 5801 /* 5802 * RACK, TLP, persists and RXT timers all are restartable 5803 * based on actions input .. i.e we received a packet (ack 5804 * or sack) and that changes things (rw, or snd_una etc). 5805 * Thus we can restart them with a new value. 
For 5806 * keep-alive, delayed_ack we keep track of what was left 5807 * and restart the timer with a smaller value. 5808 */ 5809 if (left < hpts_timeout) 5810 hpts_timeout = left; 5811 } 5812 if (hpts_timeout) { 5813 /* 5814 * Hack alert for now we can't time-out over 2,147,483 5815 * seconds (a bit more than 596 hours), which is probably ok 5816 * :). 5817 */ 5818 if (hpts_timeout > 0x7ffffffe) 5819 hpts_timeout = 0x7ffffffe; 5820 rack->r_ctl.rc_timer_exp = cts + hpts_timeout; 5821 } 5822 rack_log_pacing_delay_calc(rack, entry_slot, slot, hpts_timeout, 0, 0, 27, __LINE__, NULL); 5823 if ((rack->gp_ready == 0) && 5824 (rack->use_fixed_rate == 0) && 5825 (hpts_timeout < slot) && 5826 (rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) { 5827 /* 5828 * We have no good estimate yet for the 5829 * old clunky burst mitigation or the 5830 * real pacing. And the tlp or rxt is smaller 5831 * than the pacing calculation. Lets not 5832 * pace that long since we know the calculation 5833 * so far is not accurate. 5834 */ 5835 slot = hpts_timeout; 5836 } 5837 rack->r_ctl.last_pacing_time = slot; 5838 /** 5839 * Turn off all the flags for queuing by default. The 5840 * flags have important meanings to what happens when 5841 * LRO interacts with the transport. Most likely (by default now) 5842 * mbuf_queueing and ack compression are on. So the transport 5843 * has a couple of flags that control what happens (if those 5844 * are not on then these flags won't have any effect since it 5845 * won't go through the queuing LRO path). 5846 * 5847 * INP_MBUF_QUEUE_READY - This flags says that I am busy 5848 * pacing output, so don't disturb. But 5849 * it also means LRO can wake me if there 5850 * is a SACK arrival. 5851 * 5852 * INP_DONT_SACK_QUEUE - This flag is used in conjunction 5853 * with the above flag (QUEUE_READY) and 5854 * when present it says don't even wake me 5855 * if a SACK arrives. 5856 * 5857 * The idea behind these flags is that if we are pacing we 5858 * set the MBUF_QUEUE_READY and only get woken up if 5859 * a SACK arrives (which could change things) or if 5860 * our pacing timer expires. If, however, we have a rack 5861 * timer running, then we don't even want a sack to wake 5862 * us since the rack timer has to expire before we can send. 5863 * 5864 * Other cases should usually have none of the flags set 5865 * so LRO can call into us. 5866 */ 5867 inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY); 5868 if (slot) { 5869 rack->r_ctl.rc_last_output_to = us_cts + slot; 5870 /* 5871 * A pacing timer (slot) is being set, in 5872 * such a case we cannot send (we are blocked by 5873 * the timer). So lets tell LRO that it should not 5874 * wake us unless there is a SACK. Note this only 5875 * will be effective if mbuf queueing is on or 5876 * compressed acks are being processed. 5877 */ 5878 inp->inp_flags2 |= INP_MBUF_QUEUE_READY; 5879 /* 5880 * But wait if we have a Rack timer running 5881 * even a SACK should not disturb us (with 5882 * the exception of r_rr_config 3). 5883 */ 5884 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) && 5885 (rack->r_rr_config != 3)) 5886 inp->inp_flags2 |= INP_DONT_SACK_QUEUE; 5887 if (rack->rc_ack_can_sendout_data) { 5888 /* 5889 * Ahh but wait, this is that special case 5890 * where the pacing timer can be disturbed 5891 * backout the changes (used for non-paced 5892 * burst limiting). 
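 *
 * Both INP_MBUF_QUEUE_READY and INP_DONT_SACK_QUEUE are cleared again
 * below, so when acks are allowed to send out data any inbound ack can
 * wake us even though a pacing slot is set.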
5893 */ 5894 inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY); 5895 } 5896 if ((rack->use_rack_rr) && 5897 (rack->r_rr_config < 2) && 5898 ((hpts_timeout) && (hpts_timeout < slot))) { 5899 /* 5900 * Arrange for the hpts to kick back in after the 5901 * t-o if the t-o does not cause a send. 5902 */ 5903 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(hpts_timeout), 5904 __LINE__, &diag); 5905 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 5906 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 5907 } else { 5908 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(slot), 5909 __LINE__, &diag); 5910 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 5911 rack_log_to_start(rack, cts, hpts_timeout, slot, 1); 5912 } 5913 } else if (hpts_timeout) { 5914 /* 5915 * With respect to inp_flags2 here, lets let any new acks wake 5916 * us up here. Since we are not pacing (no pacing timer), output 5917 * can happen so we should let it. If its a Rack timer, then any inbound 5918 * packet probably won't change the sending (we will be blocked) 5919 * but it may change the prr stats so letting it in (the set defaults 5920 * at the start of this block) are good enough. 5921 */ 5922 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(hpts_timeout), 5923 __LINE__, &diag); 5924 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 5925 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 5926 } else { 5927 /* No timer starting */ 5928 #ifdef INVARIANTS 5929 if (SEQ_GT(tp->snd_max, tp->snd_una)) { 5930 panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?", 5931 tp, rack, tot_len_this_send, cts, slot, hpts_timeout); 5932 } 5933 #endif 5934 } 5935 rack->rc_tmr_stopped = 0; 5936 if (slot) 5937 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv); 5938 } 5939 5940 /* 5941 * RACK Timer, here we simply do logging and house keeping. 5942 * the normal rack_output() function will call the 5943 * appropriate thing to check if we need to do a RACK retransmit. 5944 * We return 1, saying don't proceed with rack_output only 5945 * when all timers have been stopped (destroyed PCB?). 5946 */ 5947 static int 5948 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5949 { 5950 /* 5951 * This timer simply provides an internal trigger to send out data. 5952 * The check_recovery_mode call will see if there are needed 5953 * retransmissions, if so we will enter fast-recovery. The output 5954 * call may or may not do the same thing depending on sysctl 5955 * settings. 5956 */ 5957 struct rack_sendmap *rsm; 5958 5959 if (tp->t_timers->tt_flags & TT_STOPPED) { 5960 return (1); 5961 } 5962 counter_u64_add(rack_to_tot, 1); 5963 if (rack->r_state && (rack->r_state != tp->t_state)) 5964 rack_set_state(tp, rack); 5965 rack->rc_on_min_to = 0; 5966 rsm = rack_check_recovery_mode(tp, cts); 5967 rack_log_to_event(rack, RACK_TO_FRM_RACK, rsm); 5968 if (rsm) { 5969 rack->r_ctl.rc_resend = rsm; 5970 rack->r_timer_override = 1; 5971 if (rack->use_rack_rr) { 5972 /* 5973 * Don't accumulate extra pacing delay 5974 * we are allowing the rack timer to 5975 * over-ride pacing i.e. rrr takes precedence 5976 * if the pacing interval is longer than the rrr 5977 * time (in other words we get the min pacing 5978 * time versus rrr pacing time). 
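 *
 * Clearing PACE_PKT_OUTPUT below drops the pending pacing slot, so the
 * retransmit triggered by the rack timer is not additionally held back
 * by a (possibly longer) pacing interval.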
5979 */ 5980 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 5981 } 5982 } 5983 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; 5984 if (rsm == NULL) { 5985 /* restart a timer and return 1 */ 5986 rack_start_hpts_timer(rack, tp, cts, 5987 0, 0, 0); 5988 return (1); 5989 } 5990 return (0); 5991 } 5992 5993 static void 5994 rack_adjust_orig_mlen(struct rack_sendmap *rsm) 5995 { 5996 if (rsm->m->m_len > rsm->orig_m_len) { 5997 /* 5998 * Mbuf grew, caused by sbcompress, our offset does 5999 * not change. 6000 */ 6001 rsm->orig_m_len = rsm->m->m_len; 6002 } else if (rsm->m->m_len < rsm->orig_m_len) { 6003 /* 6004 * Mbuf shrank, trimmed off the top by an ack, our 6005 * offset changes. 6006 */ 6007 rsm->soff -= (rsm->orig_m_len - rsm->m->m_len); 6008 rsm->orig_m_len = rsm->m->m_len; 6009 } 6010 } 6011 6012 static void 6013 rack_setup_offset_for_rsm(struct rack_sendmap *src_rsm, struct rack_sendmap *rsm) 6014 { 6015 struct mbuf *m; 6016 uint32_t soff; 6017 6018 if (src_rsm->m && (src_rsm->orig_m_len != src_rsm->m->m_len)) { 6019 /* Fix up the orig_m_len and possibly the mbuf offset */ 6020 rack_adjust_orig_mlen(src_rsm); 6021 } 6022 m = src_rsm->m; 6023 soff = src_rsm->soff + (src_rsm->r_end - src_rsm->r_start); 6024 while (soff >= m->m_len) { 6025 /* Move out past this mbuf */ 6026 soff -= m->m_len; 6027 m = m->m_next; 6028 KASSERT((m != NULL), 6029 ("rsm:%p nrsm:%p hit at soff:%u null m", 6030 src_rsm, rsm, soff)); 6031 } 6032 rsm->m = m; 6033 rsm->soff = soff; 6034 rsm->orig_m_len = m->m_len; 6035 } 6036 6037 static __inline void 6038 rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm, 6039 struct rack_sendmap *rsm, uint32_t start) 6040 { 6041 int idx; 6042 6043 nrsm->r_start = start; 6044 nrsm->r_end = rsm->r_end; 6045 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 6046 nrsm->r_flags = rsm->r_flags; 6047 nrsm->r_dupack = rsm->r_dupack; 6048 nrsm->r_no_rtt_allowed = rsm->r_no_rtt_allowed; 6049 nrsm->r_rtr_bytes = 0; 6050 rsm->r_end = nrsm->r_start; 6051 nrsm->r_just_ret = rsm->r_just_ret; 6052 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 6053 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 6054 } 6055 /* Now if we have SYN flag we keep it on the left edge */ 6056 if (nrsm->r_flags & RACK_HAS_SYN) 6057 nrsm->r_flags &= ~RACK_HAS_SYN; 6058 /* Now if we have a FIN flag we keep it on the right edge */ 6059 if (rsm->r_flags & RACK_HAS_FIN) 6060 rsm->r_flags &= ~RACK_HAS_FIN; 6061 /* Push bit must go to the right edge as well */ 6062 if (rsm->r_flags & RACK_HAD_PUSH) 6063 rsm->r_flags &= ~RACK_HAD_PUSH; 6064 6065 /* 6066 * Now we need to find nrsm's new location in the mbuf chain 6067 * we basically calculate a new offset, which is soff + 6068 * how much is left in original rsm. Then we walk out the mbuf 6069 * chain to find the righ postion, it may be the same mbuf 6070 * or maybe not. 6071 */ 6072 KASSERT(((rsm->m != NULL) || 6073 (rsm->r_flags & (RACK_HAS_SYN|RACK_HAS_FIN))), 6074 ("rsm:%p nrsm:%p rack:%p -- rsm->m is NULL?", rsm, nrsm, rack)); 6075 if (rsm->m) 6076 rack_setup_offset_for_rsm(rsm, nrsm); 6077 } 6078 6079 static struct rack_sendmap * 6080 rack_merge_rsm(struct tcp_rack *rack, 6081 struct rack_sendmap *l_rsm, 6082 struct rack_sendmap *r_rsm) 6083 { 6084 /* 6085 * We are merging two ack'd RSM's, 6086 * the l_rsm is on the left (lower seq 6087 * values) and the r_rsm is on the right 6088 * (higher seq value). The simplest way 6089 * to merge these is to move the right 6090 * one into the left. 
I don't think there 6091 * is any reason we need to try to find 6092 * the oldest (or last oldest retransmitted). 6093 */ 6094 struct rack_sendmap *rm; 6095 6096 rack_log_map_chg(rack->rc_tp, rack, NULL, 6097 l_rsm, r_rsm, MAP_MERGE, r_rsm->r_end, __LINE__); 6098 l_rsm->r_end = r_rsm->r_end; 6099 if (l_rsm->r_dupack < r_rsm->r_dupack) 6100 l_rsm->r_dupack = r_rsm->r_dupack; 6101 if (r_rsm->r_rtr_bytes) 6102 l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes; 6103 if (r_rsm->r_in_tmap) { 6104 /* This really should not happen */ 6105 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext); 6106 r_rsm->r_in_tmap = 0; 6107 } 6108 6109 /* Now the flags */ 6110 if (r_rsm->r_flags & RACK_HAS_FIN) 6111 l_rsm->r_flags |= RACK_HAS_FIN; 6112 if (r_rsm->r_flags & RACK_TLP) 6113 l_rsm->r_flags |= RACK_TLP; 6114 if (r_rsm->r_flags & RACK_RWND_COLLAPSED) 6115 l_rsm->r_flags |= RACK_RWND_COLLAPSED; 6116 if ((r_rsm->r_flags & RACK_APP_LIMITED) && 6117 ((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) { 6118 /* 6119 * If both are app-limited then let the 6120 * free lower the count. If right is app 6121 * limited and left is not, transfer. 6122 */ 6123 l_rsm->r_flags |= RACK_APP_LIMITED; 6124 r_rsm->r_flags &= ~RACK_APP_LIMITED; 6125 if (r_rsm == rack->r_ctl.rc_first_appl) 6126 rack->r_ctl.rc_first_appl = l_rsm; 6127 } 6128 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm); 6129 #ifdef INVARIANTS 6130 if (rm != r_rsm) { 6131 panic("removing head in rack:%p rsm:%p rm:%p", 6132 rack, r_rsm, rm); 6133 } 6134 #endif 6135 if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) { 6136 /* Transfer the split limit to the map we free */ 6137 r_rsm->r_limit_type = l_rsm->r_limit_type; 6138 l_rsm->r_limit_type = 0; 6139 } 6140 rack_free(rack, r_rsm); 6141 return (l_rsm); 6142 } 6143 6144 /* 6145 * TLP Timer, here we simply setup what segment we want to 6146 * have the TLP expire on, the normal rack_output() will then 6147 * send it out. 6148 * 6149 * We return 1, saying don't proceed with rack_output only 6150 * when all timers have been stopped (destroyed PCB?). 6151 */ 6152 static int 6153 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 6154 { 6155 /* 6156 * Tail Loss Probe. 6157 */ 6158 struct rack_sendmap *rsm = NULL; 6159 struct rack_sendmap *insret; 6160 struct socket *so; 6161 uint32_t amm; 6162 uint32_t out, avail; 6163 int collapsed_win = 0; 6164 6165 if (tp->t_timers->tt_flags & TT_STOPPED) { 6166 return (1); 6167 } 6168 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 6169 /* Its not time yet */ 6170 return (0); 6171 } 6172 if (ctf_progress_timeout_check(tp, true)) { 6173 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 6174 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 6175 return (1); 6176 } 6177 /* 6178 * A TLP timer has expired. We have been idle for 2 rtts. So we now 6179 * need to figure out how to force a full MSS segment out. 6180 */ 6181 rack_log_to_event(rack, RACK_TO_FRM_TLP, NULL); 6182 rack->r_ctl.retran_during_recovery = 0; 6183 rack->r_ctl.dsack_byte_cnt = 0; 6184 counter_u64_add(rack_tlp_tot, 1); 6185 if (rack->r_state && (rack->r_state != tp->t_state)) 6186 rack_set_state(tp, rack); 6187 so = tp->t_inpcb->inp_socket; 6188 avail = sbavail(&so->so_snd); 6189 out = tp->snd_max - tp->snd_una; 6190 if (out > tp->snd_wnd) { 6191 /* special case, we need a retransmission */ 6192 collapsed_win = 1; 6193 goto need_retran; 6194 } 6195 /* 6196 * Check our send oldest always settings, and if 6197 * there is an oldest to send jump to the need_retran. 
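 *
 * That is, when the rack_always_send_oldest knob is set and the tmap is
 * not empty (checked below), the TLP prefers the oldest outstanding
 * segment over sending new data.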
6198 */ 6199 if (rack_always_send_oldest && (TAILQ_EMPTY(&rack->r_ctl.rc_tmap) == 0)) 6200 goto need_retran; 6201 6202 if (avail > out) { 6203 /* New data is available */ 6204 amm = avail - out; 6205 if (amm > ctf_fixed_maxseg(tp)) { 6206 amm = ctf_fixed_maxseg(tp); 6207 if ((amm + out) > tp->snd_wnd) { 6208 /* We are rwnd limited */ 6209 goto need_retran; 6210 } 6211 } else if (amm < ctf_fixed_maxseg(tp)) { 6212 /* not enough to fill a MTU */ 6213 goto need_retran; 6214 } 6215 if (IN_FASTRECOVERY(tp->t_flags)) { 6216 /* Unlikely */ 6217 if (rack->rack_no_prr == 0) { 6218 if (out + amm <= tp->snd_wnd) { 6219 rack->r_ctl.rc_prr_sndcnt = amm; 6220 rack_log_to_prr(rack, 4, 0); 6221 } 6222 } else 6223 goto need_retran; 6224 } else { 6225 /* Set the send-new override */ 6226 if (out + amm <= tp->snd_wnd) 6227 rack->r_ctl.rc_tlp_new_data = amm; 6228 else 6229 goto need_retran; 6230 } 6231 rack->r_ctl.rc_tlpsend = NULL; 6232 counter_u64_add(rack_tlp_newdata, 1); 6233 goto send; 6234 } 6235 need_retran: 6236 /* 6237 * Ok we need to arrange the last un-acked segment to be re-sent, or 6238 * optionally the first un-acked segment. 6239 */ 6240 if (collapsed_win == 0) { 6241 if (rack_always_send_oldest) 6242 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 6243 else { 6244 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 6245 if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { 6246 rsm = rack_find_high_nonack(rack, rsm); 6247 } 6248 } 6249 if (rsm == NULL) { 6250 counter_u64_add(rack_tlp_does_nada, 1); 6251 #ifdef TCP_BLACKBOX 6252 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 6253 #endif 6254 goto out; 6255 } 6256 } else { 6257 /* 6258 * We must find the last segment 6259 * that was acceptable by the client. 6260 */ 6261 RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 6262 if ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0) { 6263 /* Found one */ 6264 break; 6265 } 6266 } 6267 if (rsm == NULL) { 6268 /* None? if so send the first */ 6269 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 6270 if (rsm == NULL) { 6271 counter_u64_add(rack_tlp_does_nada, 1); 6272 #ifdef TCP_BLACKBOX 6273 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 6274 #endif 6275 goto out; 6276 } 6277 } 6278 } 6279 if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) { 6280 /* 6281 * We need to split this the last segment in two. 6282 */ 6283 struct rack_sendmap *nrsm; 6284 6285 nrsm = rack_alloc_full_limit(rack); 6286 if (nrsm == NULL) { 6287 /* 6288 * No memory to split, we will just exit and punt 6289 * off to the RXT timer. 
6290 */ 6291 counter_u64_add(rack_tlp_does_nada, 1); 6292 goto out; 6293 } 6294 rack_clone_rsm(rack, nrsm, rsm, 6295 (rsm->r_end - ctf_fixed_maxseg(tp))); 6296 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); 6297 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 6298 #ifdef INVARIANTS 6299 if (insret != NULL) { 6300 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 6301 nrsm, insret, rack, rsm); 6302 } 6303 #endif 6304 if (rsm->r_in_tmap) { 6305 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 6306 nrsm->r_in_tmap = 1; 6307 } 6308 rsm->r_flags &= (~RACK_HAS_FIN); 6309 rsm = nrsm; 6310 } 6311 rack->r_ctl.rc_tlpsend = rsm; 6312 send: 6313 rack->r_timer_override = 1; 6314 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 6315 return (0); 6316 out: 6317 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 6318 return (0); 6319 } 6320 6321 /* 6322 * Delayed ack Timer, here we simply need to setup the 6323 * ACK_NOW flag and remove the DELACK flag. From there 6324 * the output routine will send the ack out. 6325 * 6326 * We only return 1, saying don't proceed, if all timers 6327 * are stopped (destroyed PCB?). 6328 */ 6329 static int 6330 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 6331 { 6332 if (tp->t_timers->tt_flags & TT_STOPPED) { 6333 return (1); 6334 } 6335 rack_log_to_event(rack, RACK_TO_FRM_DELACK, NULL); 6336 tp->t_flags &= ~TF_DELACK; 6337 tp->t_flags |= TF_ACKNOW; 6338 KMOD_TCPSTAT_INC(tcps_delack); 6339 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 6340 return (0); 6341 } 6342 6343 /* 6344 * Persists timer, here we simply send the 6345 * same thing as a keepalive will. 6346 * the one byte send. 6347 * 6348 * We only return 1, saying don't proceed, if all timers 6349 * are stopped (destroyed PCB?). 6350 */ 6351 static int 6352 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 6353 { 6354 struct tcptemp *t_template; 6355 struct inpcb *inp; 6356 int32_t retval = 1; 6357 6358 inp = tp->t_inpcb; 6359 6360 if (tp->t_timers->tt_flags & TT_STOPPED) { 6361 return (1); 6362 } 6363 if (rack->rc_in_persist == 0) 6364 return (0); 6365 if (ctf_progress_timeout_check(tp, false)) { 6366 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 6367 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 6368 tcp_set_inp_to_drop(inp, ETIMEDOUT); 6369 return (1); 6370 } 6371 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 6372 /* 6373 * Persistence timer into zero window. Force a byte to be output, if 6374 * possible. 6375 */ 6376 KMOD_TCPSTAT_INC(tcps_persisttimeo); 6377 /* 6378 * Hack: if the peer is dead/unreachable, we do not time out if the 6379 * window is closed. After a full backoff, drop the connection if 6380 * the idle time (no responses to probes) reaches the maximum 6381 * backoff that we would use if retransmitting. 
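 *
 * Sketch of the drop test below: only once t_rxtshift has reached
 * TCP_MAXRXTSHIFT do we drop, and then only if the idle time is at
 * least tcp_maxpersistidle or at least RACK_REXMTVAL(tp) *
 * tcp_totbackoff.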
6382 */ 6383 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 6384 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 6385 TICKS_2_USEC(ticks - tp->t_rcvtime) >= RACK_REXMTVAL(tp) * tcp_totbackoff)) { 6386 KMOD_TCPSTAT_INC(tcps_persistdrop); 6387 retval = 1; 6388 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 6389 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 6390 goto out; 6391 } 6392 if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) && 6393 tp->snd_una == tp->snd_max) 6394 rack_exit_persist(tp, rack, cts); 6395 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; 6396 /* 6397 * If the user has closed the socket then drop a persisting 6398 * connection after a much reduced timeout. 6399 */ 6400 if (tp->t_state > TCPS_CLOSE_WAIT && 6401 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 6402 retval = 1; 6403 KMOD_TCPSTAT_INC(tcps_persistdrop); 6404 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 6405 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 6406 goto out; 6407 } 6408 t_template = tcpip_maketemplate(rack->rc_inp); 6409 if (t_template) { 6410 /* only set it if we were answered */ 6411 if (rack->forced_ack == 0) { 6412 rack->forced_ack = 1; 6413 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); 6414 } 6415 tcp_respond(tp, t_template->tt_ipgen, 6416 &t_template->tt_t, (struct mbuf *)NULL, 6417 tp->rcv_nxt, tp->snd_una - 1, 0); 6418 /* This sends an ack */ 6419 if (tp->t_flags & TF_DELACK) 6420 tp->t_flags &= ~TF_DELACK; 6421 free(t_template, M_TEMP); 6422 } 6423 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 6424 tp->t_rxtshift++; 6425 out: 6426 rack_log_to_event(rack, RACK_TO_FRM_PERSIST, NULL); 6427 rack_start_hpts_timer(rack, tp, cts, 6428 0, 0, 0); 6429 return (retval); 6430 } 6431 6432 /* 6433 * If a keepalive goes off, we had no other timers 6434 * happening. We always return 1 here since this 6435 * routine either drops the connection or sends 6436 * out a segment with respond. 6437 */ 6438 static int 6439 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 6440 { 6441 struct tcptemp *t_template; 6442 struct inpcb *inp; 6443 6444 if (tp->t_timers->tt_flags & TT_STOPPED) { 6445 return (1); 6446 } 6447 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; 6448 inp = tp->t_inpcb; 6449 rack_log_to_event(rack, RACK_TO_FRM_KEEP, NULL); 6450 /* 6451 * Keep-alive timer went off; send something or drop connection if 6452 * idle for too long. 6453 */ 6454 KMOD_TCPSTAT_INC(tcps_keeptimeo); 6455 if (tp->t_state < TCPS_ESTABLISHED) 6456 goto dropit; 6457 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 6458 tp->t_state <= TCPS_CLOSING) { 6459 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 6460 goto dropit; 6461 /* 6462 * Send a packet designed to force a response if the peer is 6463 * up and reachable: either an ACK if the connection is 6464 * still alive, or an RST if the peer has closed the 6465 * connection due to timeout or reboot. Using sequence 6466 * number tp->snd_una-1 causes the transmitted zero-length 6467 * segment to lie outside the receive window; by the 6468 * protocol spec, this requires the correspondent TCP to 6469 * respond. 
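 * For example, if snd_una is 1000 the probe below goes out with sequence 999 and no payload; a live peer must answer with an ACK, while a peer that has lost all state answers with an RST, either of which tells us what we need to know.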
6470 */ 6471 KMOD_TCPSTAT_INC(tcps_keepprobe); 6472 t_template = tcpip_maketemplate(inp); 6473 if (t_template) { 6474 if (rack->forced_ack == 0) { 6475 rack->forced_ack = 1; 6476 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); 6477 } 6478 tcp_respond(tp, t_template->tt_ipgen, 6479 &t_template->tt_t, (struct mbuf *)NULL, 6480 tp->rcv_nxt, tp->snd_una - 1, 0); 6481 free(t_template, M_TEMP); 6482 } 6483 } 6484 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 6485 return (1); 6486 dropit: 6487 KMOD_TCPSTAT_INC(tcps_keepdrops); 6488 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); 6489 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 6490 return (1); 6491 } 6492 6493 /* 6494 * Retransmit helper function, clear up all the ack 6495 * flags and take care of important book keeping. 6496 */ 6497 static void 6498 rack_remxt_tmr(struct tcpcb *tp) 6499 { 6500 /* 6501 * The retransmit timer went off, all sack'd blocks must be 6502 * un-acked. 6503 */ 6504 struct rack_sendmap *rsm, *trsm = NULL; 6505 struct tcp_rack *rack; 6506 6507 rack = (struct tcp_rack *)tp->t_fb_ptr; 6508 rack_timer_cancel(tp, rack, tcp_get_usecs(NULL), __LINE__); 6509 rack_log_to_event(rack, RACK_TO_FRM_TMR, NULL); 6510 if (rack->r_state && (rack->r_state != tp->t_state)) 6511 rack_set_state(tp, rack); 6512 /* 6513 * Ideally we would like to be able to 6514 * mark SACK-PASS on anything not acked here. 6515 * 6516 * However, if we do that we would burst out 6517 * all that data 1ms apart. This would be unwise, 6518 * so for now we will just let the normal rxt timer 6519 * and tlp timer take care of it. 6520 * 6521 * Also we really need to stick them back in sequence 6522 * order. This way we send in the proper order and any 6523 * sacks that come floating in will "re-ack" the data. 6524 * To do this we zap the tmap with an INIT and then 6525 * walk through and place every rsm in the RB tree 6526 * back in its seq ordered place. 6527 */ 6528 TAILQ_INIT(&rack->r_ctl.rc_tmap); 6529 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 6530 rsm->r_dupack = 0; 6531 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 6532 /* We must re-add it back to the tlist */ 6533 if (trsm == NULL) { 6534 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 6535 } else { 6536 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); 6537 } 6538 rsm->r_in_tmap = 1; 6539 trsm = rsm; 6540 if (rsm->r_flags & RACK_ACKED) 6541 rsm->r_flags |= RACK_WAS_ACKED; 6542 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS); 6543 } 6544 /* Clear the count (we just un-acked them) */ 6545 rack->r_ctl.rc_last_timeout_snduna = tp->snd_una; 6546 rack->r_ctl.rc_sacked = 0; 6547 rack->r_ctl.rc_sacklast = NULL; 6548 rack->r_ctl.rc_agg_delayed = 0; 6549 rack->r_early = 0; 6550 rack->r_ctl.rc_agg_early = 0; 6551 rack->r_late = 0; 6552 /* Clear the tlp rtx mark */ 6553 rack->r_ctl.rc_resend = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 6554 if (rack->r_ctl.rc_resend != NULL) 6555 rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT; 6556 rack->r_ctl.rc_prr_sndcnt = 0; 6557 rack_log_to_prr(rack, 6, 0); 6558 rack->r_timer_override = 1; 6559 if ((((tp->t_flags & TF_SACK_PERMIT) == 0) 6560 #ifdef NETFLIX_EXP_DETECTION 6561 || (rack->sack_attack_disable != 0) 6562 #endif 6563 ) && ((tp->t_flags & TF_SENTFIN) == 0)) { 6564 /* 6565 * For non-sack customers new data 6566 * needs to go out as retransmits until 6567 * we retransmit up to snd_max. 
6568 */ 6569 rack->r_must_retran = 1; 6570 rack->r_ctl.rc_out_at_rto = ctf_flight_size(rack->rc_tp, 6571 rack->r_ctl.rc_sacked); 6572 } 6573 rack->r_ctl.rc_snd_max_at_rto = tp->snd_max; 6574 } 6575 6576 static void 6577 rack_convert_rtts(struct tcpcb *tp) 6578 { 6579 if (tp->t_srtt > 1) { 6580 uint32_t val, frac; 6581 6582 val = tp->t_srtt >> TCP_RTT_SHIFT; 6583 frac = tp->t_srtt & 0x1f; 6584 tp->t_srtt = TICKS_2_USEC(val); 6585 /* 6586 * frac is the fractional part of the srtt (if any) 6587 * but its in ticks and every bit represents 6588 * 1/32nd of a hz. 6589 */ 6590 if (frac) { 6591 if (hz == 1000) { 6592 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE); 6593 } else { 6594 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE)); 6595 } 6596 tp->t_srtt += frac; 6597 } 6598 } 6599 if (tp->t_rttvar) { 6600 uint32_t val, frac; 6601 6602 val = tp->t_rttvar >> TCP_RTTVAR_SHIFT; 6603 frac = tp->t_rttvar & 0x1f; 6604 tp->t_rttvar = TICKS_2_USEC(val); 6605 /* 6606 * frac is the fractional part of the srtt (if any) 6607 * but its in ticks and every bit represents 6608 * 1/32nd of a hz. 6609 */ 6610 if (frac) { 6611 if (hz == 1000) { 6612 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE); 6613 } else { 6614 frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE)); 6615 } 6616 tp->t_rttvar += frac; 6617 } 6618 } 6619 tp->t_rxtcur = RACK_REXMTVAL(tp); 6620 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 6621 tp->t_rxtcur += TICKS_2_USEC(tcp_rexmit_slop); 6622 } 6623 if (tp->t_rxtcur > rack_rto_max) { 6624 tp->t_rxtcur = rack_rto_max; 6625 } 6626 } 6627 6628 static void 6629 rack_cc_conn_init(struct tcpcb *tp) 6630 { 6631 struct tcp_rack *rack; 6632 uint32_t srtt; 6633 6634 rack = (struct tcp_rack *)tp->t_fb_ptr; 6635 srtt = tp->t_srtt; 6636 cc_conn_init(tp); 6637 /* 6638 * Now convert to rack's internal format, 6639 * if required. 6640 */ 6641 if ((srtt == 0) && (tp->t_srtt != 0)) 6642 rack_convert_rtts(tp); 6643 /* 6644 * We want a chance to stay in slowstart as 6645 * we create a connection. TCP spec says that 6646 * initially ssthresh is infinite. For our 6647 * purposes that is the snd_wnd. 6648 */ 6649 if (tp->snd_ssthresh < tp->snd_wnd) { 6650 tp->snd_ssthresh = tp->snd_wnd; 6651 } 6652 /* 6653 * We also want to assure a IW worth of 6654 * data can get inflight. 6655 */ 6656 if (rc_init_window(rack) < tp->snd_cwnd) 6657 tp->snd_cwnd = rc_init_window(rack); 6658 } 6659 6660 /* 6661 * Re-transmit timeout! If we drop the PCB we will return 1, otherwise 6662 * we will setup to retransmit the lowest seq number outstanding. 
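 * (Rough worked example of the backoff computed further down: with t_srtt of 100ms and t_rttvar of 25ms the base value is 100 + 4*25 = 200ms, so the third consecutive timeout (t_rxtshift == 3) waits roughly 200ms * tcp_backoff[3], clamped between rack_rto_min and rack_rto_max.)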
6663 */ 6664 static int 6665 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 6666 { 6667 int32_t rexmt; 6668 struct inpcb *inp; 6669 int32_t retval = 0; 6670 bool isipv6; 6671 6672 inp = tp->t_inpcb; 6673 if (tp->t_timers->tt_flags & TT_STOPPED) { 6674 return (1); 6675 } 6676 if (ctf_progress_timeout_check(tp, false)) { 6677 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 6678 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 6679 tcp_set_inp_to_drop(inp, ETIMEDOUT); 6680 return (1); 6681 } 6682 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; 6683 rack->r_ctl.retran_during_recovery = 0; 6684 rack->r_ctl.dsack_byte_cnt = 0; 6685 if (IN_FASTRECOVERY(tp->t_flags)) 6686 tp->t_flags |= TF_WASFRECOVERY; 6687 else 6688 tp->t_flags &= ~TF_WASFRECOVERY; 6689 if (IN_CONGRECOVERY(tp->t_flags)) 6690 tp->t_flags |= TF_WASCRECOVERY; 6691 else 6692 tp->t_flags &= ~TF_WASCRECOVERY; 6693 if (TCPS_HAVEESTABLISHED(tp->t_state) && 6694 (tp->snd_una == tp->snd_max)) { 6695 /* Nothing outstanding .. nothing to do */ 6696 return (0); 6697 } 6698 /* 6699 * Rack can only run one timer at a time, so we cannot 6700 * run a KEEPINIT (gating SYN sending) and a retransmit 6701 * timer for the SYN. So if we are in a front state and 6702 * have a KEEPINIT timer we need to check the first transmit 6703 * against now to see if we have exceeded the KEEPINIT time 6704 * (if one is set). 6705 */ 6706 if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) && 6707 (TP_KEEPINIT(tp) != 0)) { 6708 struct rack_sendmap *rsm; 6709 6710 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 6711 if (rsm) { 6712 /* Ok we have something outstanding to test keepinit with */ 6713 if ((TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) && 6714 ((cts - (uint32_t)rsm->r_tim_lastsent[0]) >= TICKS_2_USEC(TP_KEEPINIT(tp)))) { 6715 /* We have exceeded the KEEPINIT time */ 6716 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); 6717 goto drop_it; 6718 } 6719 } 6720 } 6721 /* 6722 * Retransmission timer went off. Message has not been acked within 6723 * retransmit interval. Back off to a longer retransmit interval 6724 * and retransmit one segment. 6725 */ 6726 rack_remxt_tmr(tp); 6727 if ((rack->r_ctl.rc_resend == NULL) || 6728 ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) { 6729 /* 6730 * If the rwnd collapsed on 6731 * the one we are retransmitting 6732 * it does not count against the 6733 * rxt count. 6734 */ 6735 tp->t_rxtshift++; 6736 } 6737 if (tp->t_rxtshift > TCP_MAXRXTSHIFT) { 6738 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 6739 drop_it: 6740 tp->t_rxtshift = TCP_MAXRXTSHIFT; 6741 KMOD_TCPSTAT_INC(tcps_timeoutdrop); 6742 retval = 1; 6743 tcp_set_inp_to_drop(rack->rc_inp, 6744 (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT)); 6745 goto out; 6746 } 6747 if (tp->t_state == TCPS_SYN_SENT) { 6748 /* 6749 * If the SYN was retransmitted, indicate CWND to be limited 6750 * to 1 segment in cc_conn_init(). 6751 */ 6752 tp->snd_cwnd = 1; 6753 } else if (tp->t_rxtshift == 1) { 6754 /* 6755 * first retransmit; record ssthresh and cwnd so they can be 6756 * recovered if this turns out to be a "bad" retransmit. A 6757 * retransmit is considered "bad" if an ACK for this segment 6758 * is received within RTT/2 interval; the assumption here is 6759 * that the ACK was already in flight. See "On Estimating 6760 * End-to-End Network Path Properties" by Allman and Paxson 6761 * for more details. 
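 * (Concretely, t_badrxtwin below is set about srtt/2 into the future; e.g. with an srtt of 40ms, an ACK covering this data within ~20ms is taken as evidence the original was not lost, so the saved cwnd/ssthresh/recover values can be restored.)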
6762 */ 6763 tp->snd_cwnd_prev = tp->snd_cwnd; 6764 tp->snd_ssthresh_prev = tp->snd_ssthresh; 6765 tp->snd_recover_prev = tp->snd_recover; 6766 tp->t_badrxtwin = ticks + (USEC_2_TICKS(tp->t_srtt)/2); 6767 tp->t_flags |= TF_PREVVALID; 6768 } else if ((tp->t_flags & TF_RCVD_TSTMP) == 0) 6769 tp->t_flags &= ~TF_PREVVALID; 6770 KMOD_TCPSTAT_INC(tcps_rexmttimeo); 6771 if ((tp->t_state == TCPS_SYN_SENT) || 6772 (tp->t_state == TCPS_SYN_RECEIVED)) 6773 rexmt = RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]; 6774 else 6775 rexmt = max(rack_rto_min, (tp->t_srtt + (tp->t_rttvar << 2))) * tcp_backoff[tp->t_rxtshift]; 6776 6777 RACK_TCPT_RANGESET(tp->t_rxtcur, rexmt, 6778 max(rack_rto_min, rexmt), rack_rto_max, rack->r_ctl.timer_slop); 6779 /* 6780 * We enter the path for PLMTUD if connection is established or, if 6781 * connection is FIN_WAIT_1 status, reason for the last is that if 6782 * amount of data we send is very small, we could send it in couple 6783 * of packets and process straight to FIN. In that case we won't 6784 * catch ESTABLISHED state. 6785 */ 6786 #ifdef INET6 6787 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? true : false; 6788 #else 6789 isipv6 = false; 6790 #endif 6791 if (((V_tcp_pmtud_blackhole_detect == 1) || 6792 (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) || 6793 (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) && 6794 ((tp->t_state == TCPS_ESTABLISHED) || 6795 (tp->t_state == TCPS_FIN_WAIT_1))) { 6796 /* 6797 * Idea here is that at each stage of mtu probe (usually, 6798 * 1448 -> 1188 -> 524) should be given 2 chances to recover 6799 * before further clamping down. 'tp->t_rxtshift % 2 == 0' 6800 * should take care of that. 6801 */ 6802 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == 6803 (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && 6804 (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && 6805 tp->t_rxtshift % 2 == 0)) { 6806 /* 6807 * Enter Path MTU Black-hole Detection mechanism: - 6808 * Disable Path MTU Discovery (IP "DF" bit). - 6809 * Reduce MTU to lower value than what we negotiated 6810 * with peer. 6811 */ 6812 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { 6813 /* Record that we may have found a black hole. */ 6814 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 6815 /* Keep track of previous MSS. */ 6816 tp->t_pmtud_saved_maxseg = tp->t_maxseg; 6817 } 6818 6819 /* 6820 * Reduce the MSS to blackhole value or to the 6821 * default in an attempt to retransmit. 6822 */ 6823 #ifdef INET6 6824 if (isipv6 && 6825 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { 6826 /* Use the sysctl tuneable blackhole MSS. */ 6827 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; 6828 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 6829 } else if (isipv6) { 6830 /* Use the default MSS. */ 6831 tp->t_maxseg = V_tcp_v6mssdflt; 6832 /* 6833 * Disable Path MTU Discovery when we switch 6834 * to minmss. 6835 */ 6836 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 6837 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 6838 } 6839 #endif 6840 #if defined(INET6) && defined(INET) 6841 else 6842 #endif 6843 #ifdef INET 6844 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { 6845 /* Use the sysctl tuneable blackhole MSS. */ 6846 tp->t_maxseg = V_tcp_pmtud_blackhole_mss; 6847 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 6848 } else { 6849 /* Use the default MSS. */ 6850 tp->t_maxseg = V_tcp_mssdflt; 6851 /* 6852 * Disable Path MTU Discovery when we switch 6853 * to minmss. 
6854 */ 6855 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 6856 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 6857 } 6858 #endif 6859 } else { 6860 /* 6861 * If further retransmissions are still unsuccessful 6862 * with a lowered MTU, maybe this isn't a blackhole 6863 * and we restore the previous MSS and blackhole 6864 * detection flags. The limit '6' is determined by 6865 * giving each probe stage (1448, 1188, 524) 2 6866 * chances to recover. 6867 */ 6868 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 6869 (tp->t_rxtshift >= 6)) { 6870 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 6871 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 6872 tp->t_maxseg = tp->t_pmtud_saved_maxseg; 6873 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed); 6874 } 6875 } 6876 } 6877 /* 6878 * Disable RFC1323 and SACK if we haven't got any response to 6879 * our third SYN to work-around some broken terminal servers 6880 * (most of which have hopefully been retired) that have bad VJ 6881 * header compression code which trashes TCP segments containing 6882 * unknown-to-them TCP options. 6883 */ 6884 if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && 6885 (tp->t_rxtshift == 3)) 6886 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); 6887 /* 6888 * If we backed off this far, our srtt estimate is probably bogus. 6889 * Clobber it so we'll take the next rtt measurement as our srtt; 6890 * move the current srtt into rttvar to keep the current retransmit 6891 * times until then. 6892 */ 6893 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 6894 #ifdef INET6 6895 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 6896 in6_losing(tp->t_inpcb); 6897 else 6898 #endif 6899 in_losing(tp->t_inpcb); 6900 tp->t_rttvar += tp->t_srtt; 6901 tp->t_srtt = 0; 6902 } 6903 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 6904 tp->snd_recover = tp->snd_max; 6905 tp->t_flags |= TF_ACKNOW; 6906 tp->t_rtttime = 0; 6907 rack_cong_signal(tp, CC_RTO, tp->snd_una); 6908 out: 6909 return (retval); 6910 } 6911 6912 static int 6913 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling) 6914 { 6915 int32_t ret = 0; 6916 int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK); 6917 6918 if (timers == 0) { 6919 return (0); 6920 } 6921 if (tp->t_state == TCPS_LISTEN) { 6922 /* no timers on listen sockets */ 6923 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) 6924 return (0); 6925 return (1); 6926 } 6927 if ((timers & PACE_TMR_RACK) && 6928 rack->rc_on_min_to) { 6929 /* 6930 * For the rack timer when we 6931 * are on a min-timeout (which means rrr_conf = 3) 6932 * we don't want to check the timer. It may 6933 * be going off for a pace and thats ok we 6934 * want to send the retransmit (if its ready). 6935 * 6936 * If its on a normal rack timer (non-min) then 6937 * we will check if its expired. 6938 */ 6939 goto skip_time_check; 6940 } 6941 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 6942 uint32_t left; 6943 6944 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 6945 ret = -1; 6946 rack_log_to_processing(rack, cts, ret, 0); 6947 return (0); 6948 } 6949 if (hpts_calling == 0) { 6950 /* 6951 * A user send or queued mbuf (sack) has called us? We 6952 * return 0 and let the pacing guards 6953 * deal with it if they should or 6954 * should not cause a send. 6955 */ 6956 ret = -2; 6957 rack_log_to_processing(rack, cts, ret, 0); 6958 return (0); 6959 } 6960 /* 6961 * Ok our timer went off early and we are not paced false 6962 * alarm, go back to sleep. 
6963 */ 6964 ret = -3; 6965 left = rack->r_ctl.rc_timer_exp - cts; 6966 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left)); 6967 rack_log_to_processing(rack, cts, ret, left); 6968 return (1); 6969 } 6970 skip_time_check: 6971 rack->rc_tmr_stopped = 0; 6972 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; 6973 if (timers & PACE_TMR_DELACK) { 6974 ret = rack_timeout_delack(tp, rack, cts); 6975 } else if (timers & PACE_TMR_RACK) { 6976 rack->r_ctl.rc_tlp_rxt_last_time = cts; 6977 rack->r_fast_output = 0; 6978 ret = rack_timeout_rack(tp, rack, cts); 6979 } else if (timers & PACE_TMR_TLP) { 6980 rack->r_ctl.rc_tlp_rxt_last_time = cts; 6981 ret = rack_timeout_tlp(tp, rack, cts); 6982 } else if (timers & PACE_TMR_RXT) { 6983 rack->r_ctl.rc_tlp_rxt_last_time = cts; 6984 rack->r_fast_output = 0; 6985 ret = rack_timeout_rxt(tp, rack, cts); 6986 } else if (timers & PACE_TMR_PERSIT) { 6987 ret = rack_timeout_persist(tp, rack, cts); 6988 } else if (timers & PACE_TMR_KEEP) { 6989 ret = rack_timeout_keepalive(tp, rack, cts); 6990 } 6991 rack_log_to_processing(rack, cts, ret, timers); 6992 return (ret); 6993 } 6994 6995 static void 6996 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line) 6997 { 6998 struct timeval tv; 6999 uint32_t us_cts, flags_on_entry; 7000 uint8_t hpts_removed = 0; 7001 7002 flags_on_entry = rack->r_ctl.rc_hpts_flags; 7003 us_cts = tcp_get_usecs(&tv); 7004 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 7005 ((TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) || 7006 ((tp->snd_max - tp->snd_una) == 0))) { 7007 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 7008 hpts_removed = 1; 7009 /* If we were not delayed cancel out the flag. */ 7010 if ((tp->snd_max - tp->snd_una) == 0) 7011 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 7012 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry); 7013 } 7014 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 7015 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 7016 if (rack->rc_inp->inp_in_hpts && 7017 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) { 7018 /* 7019 * Canceling timer's when we have no output being 7020 * paced. We also must remove ourselves from the 7021 * hpts. 7022 */ 7023 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 7024 hpts_removed = 1; 7025 } 7026 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK); 7027 } 7028 if (hpts_removed == 0) 7029 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry); 7030 } 7031 7032 static void 7033 rack_timer_stop(struct tcpcb *tp, uint32_t timer_type) 7034 { 7035 return; 7036 } 7037 7038 static int 7039 rack_stopall(struct tcpcb *tp) 7040 { 7041 struct tcp_rack *rack; 7042 rack = (struct tcp_rack *)tp->t_fb_ptr; 7043 rack->t_timers_stopped = 1; 7044 return (0); 7045 } 7046 7047 static void 7048 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta) 7049 { 7050 return; 7051 } 7052 7053 static int 7054 rack_timer_active(struct tcpcb *tp, uint32_t timer_type) 7055 { 7056 return (0); 7057 } 7058 7059 static void 7060 rack_stop_all_timers(struct tcpcb *tp) 7061 { 7062 struct tcp_rack *rack; 7063 7064 /* 7065 * Assure no timers are running. 
7066 */ 7067 if (tcp_timer_active(tp, TT_PERSIST)) { 7068 /* We enter in persists, set the flag appropriately */ 7069 rack = (struct tcp_rack *)tp->t_fb_ptr; 7070 rack->rc_in_persist = 1; 7071 } 7072 tcp_timer_suspend(tp, TT_PERSIST); 7073 tcp_timer_suspend(tp, TT_REXMT); 7074 tcp_timer_suspend(tp, TT_KEEP); 7075 tcp_timer_suspend(tp, TT_DELACK); 7076 } 7077 7078 static void 7079 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, 7080 struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag) 7081 { 7082 int32_t idx; 7083 uint16_t stripped_flags; 7084 7085 rsm->r_rtr_cnt++; 7086 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7087 rsm->r_dupack = 0; 7088 if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) { 7089 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS; 7090 rsm->r_flags |= RACK_OVERMAX; 7091 } 7092 if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & RACK_TLP) == 0)) { 7093 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start); 7094 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start); 7095 } 7096 idx = rsm->r_rtr_cnt - 1; 7097 rsm->r_tim_lastsent[idx] = ts; 7098 stripped_flags = rsm->r_flags & ~(RACK_SENT_SP|RACK_SENT_FP); 7099 if (rsm->r_flags & RACK_ACKED) { 7100 /* Problably MTU discovery messing with us */ 7101 rsm->r_flags &= ~RACK_ACKED; 7102 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 7103 } 7104 if (rsm->r_in_tmap) { 7105 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7106 rsm->r_in_tmap = 0; 7107 } 7108 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7109 rsm->r_in_tmap = 1; 7110 if (rsm->r_flags & RACK_SACK_PASSED) { 7111 /* We have retransmitted due to the SACK pass */ 7112 rsm->r_flags &= ~RACK_SACK_PASSED; 7113 rsm->r_flags |= RACK_WAS_SACKPASS; 7114 } 7115 } 7116 7117 static uint32_t 7118 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 7119 struct rack_sendmap *rsm, uint64_t ts, int32_t *lenp, uint16_t add_flag) 7120 { 7121 /* 7122 * We (re-)transmitted starting at rsm->r_start for some length 7123 * (possibly less than r_end. 7124 */ 7125 struct rack_sendmap *nrsm, *insret; 7126 uint32_t c_end; 7127 int32_t len; 7128 7129 len = *lenp; 7130 c_end = rsm->r_start + len; 7131 if (SEQ_GEQ(c_end, rsm->r_end)) { 7132 /* 7133 * We retransmitted the whole piece or more than the whole 7134 * slopping into the next rsm. 7135 */ 7136 rack_update_rsm(tp, rack, rsm, ts, add_flag); 7137 if (c_end == rsm->r_end) { 7138 *lenp = 0; 7139 return (0); 7140 } else { 7141 int32_t act_len; 7142 7143 /* Hangs over the end return whats left */ 7144 act_len = rsm->r_end - rsm->r_start; 7145 *lenp = (len - act_len); 7146 return (rsm->r_end); 7147 } 7148 /* We don't get out of this block. */ 7149 } 7150 /* 7151 * Here we retransmitted less than the whole thing which means we 7152 * have to split this into what was transmitted and what was not. 7153 */ 7154 nrsm = rack_alloc_full_limit(rack); 7155 if (nrsm == NULL) { 7156 /* 7157 * We can't get memory, so lets not proceed. 7158 */ 7159 *lenp = 0; 7160 return (0); 7161 } 7162 /* 7163 * So here we are going to take the original rsm and make it what we 7164 * retransmitted. nrsm will be the tail portion we did not 7165 * retransmit. For example say the chunk was 1, 11 (10 bytes). And 7166 * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to 7167 * 1, 6 and the new piece will be 6, 11. 
7168 */ 7169 rack_clone_rsm(rack, nrsm, rsm, c_end); 7170 nrsm->r_dupack = 0; 7171 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); 7172 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 7173 #ifdef INVARIANTS 7174 if (insret != NULL) { 7175 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 7176 nrsm, insret, rack, rsm); 7177 } 7178 #endif 7179 if (rsm->r_in_tmap) { 7180 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 7181 nrsm->r_in_tmap = 1; 7182 } 7183 rsm->r_flags &= (~RACK_HAS_FIN); 7184 rack_update_rsm(tp, rack, rsm, ts, add_flag); 7185 /* Log a split of rsm into rsm and nrsm */ 7186 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); 7187 *lenp = 0; 7188 return (0); 7189 } 7190 7191 static void 7192 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 7193 uint32_t seq_out, uint8_t th_flags, int32_t err, uint64_t cts, 7194 struct rack_sendmap *hintrsm, uint16_t add_flag, struct mbuf *s_mb, uint32_t s_moff) 7195 { 7196 struct tcp_rack *rack; 7197 struct rack_sendmap *rsm, *nrsm, *insret, fe; 7198 register uint32_t snd_max, snd_una; 7199 7200 /* 7201 * Add to the RACK log of packets in flight or retransmitted. If 7202 * there is a TS option we will use the TS echoed, if not we will 7203 * grab a TS. 7204 * 7205 * Retransmissions will increment the count and move the ts to its 7206 * proper place. Note that if options do not include TS's then we 7207 * won't be able to effectively use the ACK for an RTT on a retran. 7208 * 7209 * Notes about r_start and r_end. Lets consider a send starting at 7210 * sequence 1 for 10 bytes. In such an example the r_start would be 7211 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. 7212 * This means that r_end is actually the first sequence for the next 7213 * slot (11). 7214 * 7215 */ 7216 /* 7217 * If err is set what do we do XXXrrs? should we not add the thing? 7218 * -- i.e. return if err != 0 or should we pretend we sent it? -- 7219 * i.e. proceed with add ** do this for now. 7220 */ 7221 INP_WLOCK_ASSERT(tp->t_inpcb); 7222 if (err) 7223 /* 7224 * We don't log errors -- we could but snd_max does not 7225 * advance in this case either. 7226 */ 7227 return; 7228 7229 if (th_flags & TH_RST) { 7230 /* 7231 * We don't log resets and we return immediately from 7232 * sending 7233 */ 7234 return; 7235 } 7236 rack = (struct tcp_rack *)tp->t_fb_ptr; 7237 snd_una = tp->snd_una; 7238 snd_max = tp->snd_max; 7239 if (th_flags & (TH_SYN | TH_FIN)) { 7240 /* 7241 * The call to rack_log_output is made before bumping 7242 * snd_max. This means we can record one extra byte on a SYN 7243 * or FIN if seq_out is adding more on and a FIN is present 7244 * (and we are not resending). 7245 */ 7246 if ((th_flags & TH_SYN) && (seq_out == tp->iss)) 7247 len++; 7248 if (th_flags & TH_FIN) 7249 len++; 7250 if (SEQ_LT(snd_max, tp->snd_nxt)) { 7251 /* 7252 * The add/update as not been done for the FIN/SYN 7253 * yet. 7254 */ 7255 snd_max = tp->snd_nxt; 7256 } 7257 } 7258 if (SEQ_LEQ((seq_out + len), snd_una)) { 7259 /* Are sending an old segment to induce an ack (keep-alive)? */ 7260 return; 7261 } 7262 if (SEQ_LT(seq_out, snd_una)) { 7263 /* huh? should we panic? 
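 * No, rather than panic we clamp the range below: e.g. if seq_out starts 100 bytes below snd_una we slide seq_out up to snd_una and shrink len so that only the still-unacked portion, if any, gets logged.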
*/ 7264 uint32_t end; 7265 7266 end = seq_out + len; 7267 seq_out = snd_una; 7268 if (SEQ_GEQ(end, seq_out)) 7269 len = end - seq_out; 7270 else 7271 len = 0; 7272 } 7273 if (len == 0) { 7274 /* We don't log zero window probes */ 7275 return; 7276 } 7277 rack->r_ctl.rc_time_last_sent = cts; 7278 if (IN_FASTRECOVERY(tp->t_flags)) { 7279 rack->r_ctl.rc_prr_out += len; 7280 } 7281 /* First question is it a retransmission or new? */ 7282 if (seq_out == snd_max) { 7283 /* Its new */ 7284 again: 7285 rsm = rack_alloc(rack); 7286 if (rsm == NULL) { 7287 /* 7288 * Hmm out of memory and the tcb got destroyed while 7289 * we tried to wait. 7290 */ 7291 return; 7292 } 7293 if (th_flags & TH_FIN) { 7294 rsm->r_flags = RACK_HAS_FIN|add_flag; 7295 } else { 7296 rsm->r_flags = add_flag; 7297 } 7298 rsm->r_tim_lastsent[0] = cts; 7299 rsm->r_rtr_cnt = 1; 7300 rsm->r_rtr_bytes = 0; 7301 if (th_flags & TH_SYN) { 7302 /* The data space is one beyond snd_una */ 7303 rsm->r_flags |= RACK_HAS_SYN; 7304 } 7305 rsm->r_start = seq_out; 7306 rsm->r_end = rsm->r_start + len; 7307 rsm->r_dupack = 0; 7308 /* 7309 * save off the mbuf location that 7310 * sndmbuf_noadv returned (which is 7311 * where we started copying from).. 7312 */ 7313 rsm->m = s_mb; 7314 rsm->soff = s_moff; 7315 /* rsm->m will be NULL if RACK_HAS_SYN or RACK_HAS_FIN is set */ 7316 if (rsm->m) { 7317 if (rsm->m->m_len <= rsm->soff) { 7318 /* 7319 * XXXrrs Question, will this happen? 7320 * 7321 * If sbsndptr is set at the correct place 7322 * then s_moff should always be somewhere 7323 * within rsm->m. But if the sbsndptr was 7324 * off then that won't be true. If it occurs 7325 * we need to walkout to the correct location. 7326 */ 7327 struct mbuf *lm; 7328 7329 lm = rsm->m; 7330 while (lm->m_len <= rsm->soff) { 7331 rsm->soff -= lm->m_len; 7332 lm = lm->m_next; 7333 KASSERT(lm != NULL, ("%s rack:%p lm goes null orig_off:%u origmb:%p rsm->soff:%u", 7334 __func__, rack, s_moff, s_mb, rsm->soff)); 7335 } 7336 rsm->m = lm; 7337 counter_u64_add(rack_sbsndptr_wrong, 1); 7338 } else 7339 counter_u64_add(rack_sbsndptr_right, 1); 7340 rsm->orig_m_len = rsm->m->m_len; 7341 } else 7342 rsm->orig_m_len = 0; 7343 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7344 /* Log a new rsm */ 7345 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_NEW, 0, __LINE__); 7346 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7347 #ifdef INVARIANTS 7348 if (insret != NULL) { 7349 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 7350 nrsm, insret, rack, rsm); 7351 } 7352 #endif 7353 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7354 rsm->r_in_tmap = 1; 7355 /* 7356 * Special case detection, is there just a single 7357 * packet outstanding when we are not in recovery? 7358 * 7359 * If this is true mark it so. 7360 */ 7361 if ((IN_FASTRECOVERY(tp->t_flags) == 0) && 7362 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) == ctf_fixed_maxseg(tp))) { 7363 struct rack_sendmap *prsm; 7364 7365 prsm = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7366 if (prsm) 7367 prsm->r_one_out_nr = 1; 7368 } 7369 return; 7370 } 7371 /* 7372 * If we reach here its a retransmission and we need to find it. 
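 * For example, a retransmission starting at sequence 5000 may land in the middle of an rsm spanning 4000-6000; the code below first tries the hint, then the tree lookup, and when the start falls inside an rsm it splits off the 4000-5000 front so the update is applied to a piece that begins exactly at seq_out.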
7373 */ 7374 memset(&fe, 0, sizeof(fe)); 7375 more: 7376 if (hintrsm && (hintrsm->r_start == seq_out)) { 7377 rsm = hintrsm; 7378 hintrsm = NULL; 7379 } else { 7380 /* No hints sorry */ 7381 rsm = NULL; 7382 } 7383 if ((rsm) && (rsm->r_start == seq_out)) { 7384 seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag); 7385 if (len == 0) { 7386 return; 7387 } else { 7388 goto more; 7389 } 7390 } 7391 /* Ok it was not the last pointer go through it the hard way. */ 7392 refind: 7393 fe.r_start = seq_out; 7394 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 7395 if (rsm) { 7396 if (rsm->r_start == seq_out) { 7397 seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag); 7398 if (len == 0) { 7399 return; 7400 } else { 7401 goto refind; 7402 } 7403 } 7404 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { 7405 /* Transmitted within this piece */ 7406 /* 7407 * Ok we must split off the front and then let the 7408 * update do the rest 7409 */ 7410 nrsm = rack_alloc_full_limit(rack); 7411 if (nrsm == NULL) { 7412 rack_update_rsm(tp, rack, rsm, cts, add_flag); 7413 return; 7414 } 7415 /* 7416 * copy rsm to nrsm and then trim the front of rsm 7417 * to not include this part. 7418 */ 7419 rack_clone_rsm(rack, nrsm, rsm, seq_out); 7420 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 7421 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); 7422 #ifdef INVARIANTS 7423 if (insret != NULL) { 7424 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 7425 nrsm, insret, rack, rsm); 7426 } 7427 #endif 7428 if (rsm->r_in_tmap) { 7429 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 7430 nrsm->r_in_tmap = 1; 7431 } 7432 rsm->r_flags &= (~RACK_HAS_FIN); 7433 seq_out = rack_update_entry(tp, rack, nrsm, cts, &len, add_flag); 7434 if (len == 0) { 7435 return; 7436 } else if (len > 0) 7437 goto refind; 7438 } 7439 } 7440 /* 7441 * Hmm not found in map did they retransmit both old and on into the 7442 * new? 7443 */ 7444 if (seq_out == tp->snd_max) { 7445 goto again; 7446 } else if (SEQ_LT(seq_out, tp->snd_max)) { 7447 #ifdef INVARIANTS 7448 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", 7449 seq_out, len, tp->snd_una, tp->snd_max); 7450 printf("Starting Dump of all rack entries\n"); 7451 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 7452 printf("rsm:%p start:%u end:%u\n", 7453 rsm, rsm->r_start, rsm->r_end); 7454 } 7455 printf("Dump complete\n"); 7456 panic("seq_out not found rack:%p tp:%p", 7457 rack, tp); 7458 #endif 7459 } else { 7460 #ifdef INVARIANTS 7461 /* 7462 * Hmm beyond sndmax? (only if we are using the new rtt-pack 7463 * flag) 7464 */ 7465 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", 7466 seq_out, len, tp->snd_max, tp); 7467 #endif 7468 } 7469 } 7470 7471 /* 7472 * Record one of the RTT updates from an ack into 7473 * our sample structure. 
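 * A single ACK can produce several samples (e.g. one for each rsm newly covered by its SACK blocks); each one is folded into rack_rs here, and the commit routine later reduces them to a single value according to rc_rate_sample_method.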
7474 */ 7475 7476 static void 7477 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, uint32_t len, uint32_t us_rtt, 7478 int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt) 7479 { 7480 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 7481 (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) { 7482 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt; 7483 } 7484 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 7485 (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) { 7486 rack->r_ctl.rack_rs.rs_rtt_highest = rtt; 7487 } 7488 if (rack->rc_tp->t_flags & TF_GPUTINPROG) { 7489 if (us_rtt < rack->r_ctl.rc_gp_lowrtt) 7490 rack->r_ctl.rc_gp_lowrtt = us_rtt; 7491 if (rack->rc_tp->snd_wnd > rack->r_ctl.rc_gp_high_rwnd) 7492 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 7493 } 7494 if ((confidence == 1) && 7495 ((rsm == NULL) || 7496 (rsm->r_just_ret) || 7497 (rsm->r_one_out_nr && 7498 len < (ctf_fixed_maxseg(rack->rc_tp) * 2)))) { 7499 /* 7500 * If the rsm had a just-return 7501 * hit on it then we can't trust the 7502 * rtt measurement for buffer determination. 7503 * Note that a confidence of 2 indicates 7504 * SACK'd, which overrides the r_just_ret or 7505 * the r_one_out_nr. If it was a CUM-ACK and 7506 * we had only two outstanding, but got an 7507 * ack for only one, then that also lowers our 7508 * confidence. 7509 */ 7510 confidence = 0; 7511 } 7512 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 7513 (rack->r_ctl.rack_rs.rs_us_rtt > us_rtt)) { 7514 if (rack->r_ctl.rack_rs.confidence == 0) { 7515 /* 7516 * We take anything with no current confidence 7517 * saved. 7518 */ 7519 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; 7520 rack->r_ctl.rack_rs.confidence = confidence; 7521 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; 7522 } else if (confidence || rack->r_ctl.rack_rs.confidence) { 7523 /* 7524 * Once we have a confident number, 7525 * we can update it with a smaller 7526 * value since this confident number 7527 * may include the DSACK time until 7528 * the next segment (the second one) arrived. 7529 */ 7530 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; 7531 rack->r_ctl.rack_rs.confidence = confidence; 7532 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; 7533 } 7534 } 7535 rack_log_rtt_upd(rack->rc_tp, rack, us_rtt, len, rsm, confidence); 7536 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID; 7537 rack->r_ctl.rack_rs.rs_rtt_tot += rtt; 7538 rack->r_ctl.rack_rs.rs_rtt_cnt++; 7539 } 7540 7541 /* 7542 * Collect new round-trip time estimate 7543 * and update averages and current timeout.
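 * For example, if one ACK produced samples of 40, 55 and 90 ms, USE_RTT_LOW selects 40ms, USE_RTT_HIGH selects 90ms and USE_RTT_AVG uses the ~61ms average, before the srtt/rttvar smoothing below is applied.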
7544 */ 7545 static void 7546 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) 7547 { 7548 int32_t delta; 7549 uint32_t o_srtt, o_var; 7550 int32_t hrtt_up = 0; 7551 int32_t rtt; 7552 7553 if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) 7554 /* No valid sample */ 7555 return; 7556 if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) { 7557 /* We are to use the lowest RTT seen in a single ack */ 7558 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; 7559 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) { 7560 /* We are to use the highest RTT seen in a single ack */ 7561 rtt = rack->r_ctl.rack_rs.rs_rtt_highest; 7562 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) { 7563 /* We are to use the average RTT seen in a single ack */ 7564 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot / 7565 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt); 7566 } else { 7567 #ifdef INVARIANTS 7568 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method); 7569 #endif 7570 return; 7571 } 7572 if (rtt == 0) 7573 rtt = 1; 7574 if (rack->rc_gp_rtt_set == 0) { 7575 /* 7576 * With no RTT we have to accept 7577 * even one we are not confident of. 7578 */ 7579 rack->r_ctl.rc_gp_srtt = rack->r_ctl.rack_rs.rs_us_rtt; 7580 rack->rc_gp_rtt_set = 1; 7581 } else if (rack->r_ctl.rack_rs.confidence) { 7582 /* update the running gp srtt */ 7583 rack->r_ctl.rc_gp_srtt -= (rack->r_ctl.rc_gp_srtt/8); 7584 rack->r_ctl.rc_gp_srtt += rack->r_ctl.rack_rs.rs_us_rtt / 8; 7585 } 7586 if (rack->r_ctl.rack_rs.confidence) { 7587 /* 7588 * record the low and high for highly buffered path computation, 7589 * we only do this if we are confident (not a retransmission). 7590 */ 7591 if (rack->r_ctl.rc_highest_us_rtt < rack->r_ctl.rack_rs.rs_us_rtt) { 7592 rack->r_ctl.rc_highest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 7593 hrtt_up = 1; 7594 } 7595 if (rack->rc_highly_buffered == 0) { 7596 /* 7597 * Currently once we declare a path has 7598 * highly buffered there is no going 7599 * back, which may be a problem... 7600 */ 7601 if ((rack->r_ctl.rc_highest_us_rtt / rack->r_ctl.rc_lowest_us_rtt) > rack_hbp_thresh) { 7602 rack_log_rtt_shrinks(rack, rack->r_ctl.rack_rs.rs_us_rtt, 7603 rack->r_ctl.rc_highest_us_rtt, 7604 rack->r_ctl.rc_lowest_us_rtt, 7605 RACK_RTTS_SEEHBP); 7606 rack->rc_highly_buffered = 1; 7607 } 7608 } 7609 } 7610 if ((rack->r_ctl.rack_rs.confidence) || 7611 (rack->r_ctl.rack_rs.rs_us_rtrcnt == 1)) { 7612 /* 7613 * If we are highly confident of it <or> it was 7614 * never retransmitted we accept it as the last us_rtt. 7615 */ 7616 rack->r_ctl.rc_last_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 7617 /* The lowest rtt can be set if its was not retransmited */ 7618 if (rack->r_ctl.rc_lowest_us_rtt > rack->r_ctl.rack_rs.rs_us_rtt) { 7619 rack->r_ctl.rc_lowest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 7620 if (rack->r_ctl.rc_lowest_us_rtt == 0) 7621 rack->r_ctl.rc_lowest_us_rtt = 1; 7622 } 7623 } 7624 o_srtt = tp->t_srtt; 7625 o_var = tp->t_rttvar; 7626 rack = (struct tcp_rack *)tp->t_fb_ptr; 7627 if (tp->t_srtt != 0) { 7628 /* 7629 * We keep a simple srtt in microseconds, like our rtt 7630 * measurement. We don't need to do any tricks with shifting 7631 * etc. Instead we just add in 1/8th of the new measurement 7632 * and subtract out 1/8 of the old srtt. We do the same with 7633 * the variance after finding the absolute value of the 7634 * difference between this sample and the current srtt. 
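 * Worked example of the update below: with srtt = 100000us, rttvar = 20000us and a new sample rtt = 60000us, delta is 40000; srtt becomes 100000 - 12500 + 7500 = 95000us and rttvar becomes 20000 - 2500 + 5000 = 22500us.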
7635 */ 7636 delta = tp->t_srtt - rtt; 7637 /* Take off 1/8th of the current sRTT */ 7638 tp->t_srtt -= (tp->t_srtt >> 3); 7639 /* Add in 1/8th of the new RTT just measured */ 7640 tp->t_srtt += (rtt >> 3); 7641 if (tp->t_srtt <= 0) 7642 tp->t_srtt = 1; 7643 /* Now lets make the absolute value of the variance */ 7644 if (delta < 0) 7645 delta = -delta; 7646 /* Subtract out 1/8th */ 7647 tp->t_rttvar -= (tp->t_rttvar >> 3); 7648 /* Add in 1/8th of the new variance we just saw */ 7649 tp->t_rttvar += (delta >> 3); 7650 if (tp->t_rttvar <= 0) 7651 tp->t_rttvar = 1; 7652 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) 7653 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 7654 } else { 7655 /* 7656 * No rtt measurement yet - use the unsmoothed rtt. Set the 7657 * variance to half the rtt (so our first retransmit happens 7658 * at 3*rtt). 7659 */ 7660 tp->t_srtt = rtt; 7661 tp->t_rttvar = rtt >> 1; 7662 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 7663 } 7664 rack->rc_srtt_measure_made = 1; 7665 KMOD_TCPSTAT_INC(tcps_rttupdated); 7666 tp->t_rttupdated++; 7667 #ifdef STATS 7668 if (rack_stats_gets_ms_rtt == 0) { 7669 /* Send in the microsecond rtt used for rxt timeout purposes */ 7670 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt)); 7671 } else if (rack_stats_gets_ms_rtt == 1) { 7672 /* Send in the millisecond rtt used for rxt timeout purposes */ 7673 int32_t ms_rtt; 7674 7675 /* Round up */ 7676 ms_rtt = (rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC; 7677 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt)); 7678 } else if (rack_stats_gets_ms_rtt == 2) { 7679 /* Send in the millisecond rtt has close to the path RTT as we can get */ 7680 int32_t ms_rtt; 7681 7682 /* Round up */ 7683 ms_rtt = (rack->r_ctl.rack_rs.rs_us_rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC; 7684 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt)); 7685 } else { 7686 /* Send in the microsecond rtt has close to the path RTT as we can get */ 7687 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt)); 7688 } 7689 7690 #endif 7691 /* 7692 * the retransmit should happen at rtt + 4 * rttvar. Because of the 7693 * way we do the smoothing, srtt and rttvar will each average +1/2 7694 * tick of bias. When we compute the retransmit timer, we want 1/2 7695 * tick of rounding and 1 extra tick because of +-1/2 tick 7696 * uncertainty in the firing of the timer. The bias will give us 7697 * exactly the 1.5 tick we need. But, because the bias is 7698 * statistical, we have to test that we don't drop below the minimum 7699 * feasible timer (which is 2 ticks). 7700 */ 7701 tp->t_rxtshift = 0; 7702 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 7703 max(rack_rto_min, rtt + 2), rack_rto_max, rack->r_ctl.timer_slop); 7704 rack_log_rtt_sample(rack, rtt); 7705 tp->t_softerror = 0; 7706 } 7707 7708 7709 static void 7710 rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts) 7711 { 7712 /* 7713 * Apply to filter the inbound us-rtt at us_cts. 
7714 */ 7715 uint32_t old_rtt; 7716 7717 old_rtt = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 7718 apply_filter_min_small(&rack->r_ctl.rc_gp_min_rtt, 7719 us_rtt, us_cts); 7720 if (rack->r_ctl.last_pacing_time && 7721 rack->rc_gp_dyn_mul && 7722 (rack->r_ctl.last_pacing_time > us_rtt)) 7723 rack->pacing_longer_than_rtt = 1; 7724 else 7725 rack->pacing_longer_than_rtt = 0; 7726 if (old_rtt > us_rtt) { 7727 /* We just hit a new lower rtt time */ 7728 rack_log_rtt_shrinks(rack, us_cts, old_rtt, 7729 __LINE__, RACK_RTTS_NEWRTT); 7730 /* 7731 * Only count it if its lower than what we saw within our 7732 * calculated range. 7733 */ 7734 if ((old_rtt - us_rtt) > rack_min_rtt_movement) { 7735 if (rack_probertt_lower_within && 7736 rack->rc_gp_dyn_mul && 7737 (rack->use_fixed_rate == 0) && 7738 (rack->rc_always_pace)) { 7739 /* 7740 * We are seeing a new lower rtt very close 7741 * to the time that we would have entered probe-rtt. 7742 * This is probably due to the fact that a peer flow 7743 * has entered probe-rtt. Lets go in now too. 7744 */ 7745 uint32_t val; 7746 7747 val = rack_probertt_lower_within * rack_time_between_probertt; 7748 val /= 100; 7749 if ((rack->in_probe_rtt == 0) && 7750 ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) { 7751 rack_enter_probertt(rack, us_cts); 7752 } 7753 } 7754 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 7755 } 7756 } 7757 } 7758 7759 static int 7760 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 7761 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack) 7762 { 7763 int32_t i, all; 7764 uint32_t t, len_acked; 7765 7766 if ((rsm->r_flags & RACK_ACKED) || 7767 (rsm->r_flags & RACK_WAS_ACKED)) 7768 /* Already done */ 7769 return (0); 7770 if (rsm->r_no_rtt_allowed) { 7771 /* Not allowed */ 7772 return (0); 7773 } 7774 if (ack_type == CUM_ACKED) { 7775 if (SEQ_GT(th_ack, rsm->r_end)) { 7776 len_acked = rsm->r_end - rsm->r_start; 7777 all = 1; 7778 } else { 7779 len_acked = th_ack - rsm->r_start; 7780 all = 0; 7781 } 7782 } else { 7783 len_acked = rsm->r_end - rsm->r_start; 7784 all = 0; 7785 } 7786 if (rsm->r_rtr_cnt == 1) { 7787 uint32_t us_rtt; 7788 7789 t = cts - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 7790 if ((int)t <= 0) 7791 t = 1; 7792 if (!tp->t_rttlow || tp->t_rttlow > t) 7793 tp->t_rttlow = t; 7794 if (!rack->r_ctl.rc_rack_min_rtt || 7795 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 7796 rack->r_ctl.rc_rack_min_rtt = t; 7797 if (rack->r_ctl.rc_rack_min_rtt == 0) { 7798 rack->r_ctl.rc_rack_min_rtt = 1; 7799 } 7800 } 7801 if (TSTMP_GT(tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])) 7802 us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 7803 else 7804 us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 7805 if (us_rtt == 0) 7806 us_rtt = 1; 7807 rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time)); 7808 if (ack_type == SACKED) { 7809 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 1); 7810 tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 2 , rsm, rsm->r_rtr_cnt); 7811 } else { 7812 /* 7813 * We need to setup what our confidence 7814 * is in this ack. 7815 * 7816 * If the rsm was app limited and it is 7817 * less than a mss in length (the end 7818 * of the send) then we have a gap. 
If we 7819 * were app limited but say we were sending 7820 * multiple MSS's then we are more confident 7821 * in it. 7822 * 7823 * When we are not app-limited then we see if 7824 * the rsm is being included in the current 7825 * measurement, we tell this by the app_limited_needs_set 7826 * flag. 7827 * 7828 * Note that being cwnd blocked is not the same as app-limited, 7829 * and the pacing delay between packets when we 7830 * are sending only 1 or 2 MSS's will also show up 7831 * in the RTT. We probably need to examine this algorithm 7832 * a bit more and enhance it to account for the delay 7833 * between rsm's. We could do that by saving off the 7834 * pacing delay of each rsm (in an rsm) and then 7835 * factoring that in somehow, though for now I am 7836 * not sure how :) 7837 */ 7838 int calc_conf = 0; 7839 7840 if (rsm->r_flags & RACK_APP_LIMITED) { 7841 if (all && (len_acked <= ctf_fixed_maxseg(tp))) 7842 calc_conf = 0; 7843 else 7844 calc_conf = 1; 7845 } else if (rack->app_limited_needs_set == 0) { 7846 calc_conf = 1; 7847 } else { 7848 calc_conf = 0; 7849 } 7850 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 2); 7851 tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 7852 calc_conf, rsm, rsm->r_rtr_cnt); 7853 } 7854 if ((rsm->r_flags & RACK_TLP) && 7855 (!IN_FASTRECOVERY(tp->t_flags))) { 7856 /* Segment was a TLP and our retrans matched */ 7857 if (rack->r_ctl.rc_tlp_cwnd_reduce) { 7858 rack->r_ctl.rc_rsm_start = tp->snd_max; 7859 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 7860 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 7861 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una); 7862 } 7863 } 7864 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 7865 /* New more recent rack_tmit_time */ 7866 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 7867 rack->rc_rack_rtt = t; 7868 } 7869 return (1); 7870 } 7871 /* 7872 * We clear the soft/rxtshift since we got an ack. 7873 * There is no assurance we will call the commit() function 7874 * so we need to clear these to avoid incorrect handling. 7875 */ 7876 tp->t_rxtshift = 0; 7877 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 7878 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 7879 tp->t_softerror = 0; 7880 if (to && (to->to_flags & TOF_TS) && 7881 (ack_type == CUM_ACKED) && 7882 (to->to_tsecr) && 7883 ((rsm->r_flags & RACK_OVERMAX) == 0)) { 7884 /* 7885 * Now which timestamp does it match? In this block the ACK 7886 * must be coming from a previous transmission. 7887 */ 7888 for (i = 0; i < rsm->r_rtr_cnt; i++) { 7889 if (rack_ts_to_msec(rsm->r_tim_lastsent[i]) == to->to_tsecr) { 7890 t = cts - (uint32_t)rsm->r_tim_lastsent[i]; 7891 if ((int)t <= 0) 7892 t = 1; 7893 if ((i + 1) < rsm->r_rtr_cnt) { 7894 /* 7895 * The peer ack'd from our previous 7896 * transmission. We have a spurious 7897 * retransmission and thus we don't 7898 * want to update our rack_rtt.
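 * For example, if the data was sent at TSval 100 and retransmitted at TSval 150, an echoed timestamp of 100 means the cum-ack covered the first copy, so the retransmission was spurious and we take no RTT or rack_rtt update from it.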
7899 */ 7900 return (0); 7901 } 7902 if (!tp->t_rttlow || tp->t_rttlow > t) 7903 tp->t_rttlow = t; 7904 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 7905 rack->r_ctl.rc_rack_min_rtt = t; 7906 if (rack->r_ctl.rc_rack_min_rtt == 0) { 7907 rack->r_ctl.rc_rack_min_rtt = 1; 7908 } 7909 } 7910 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 7911 (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 7912 /* New more recent rack_tmit_time */ 7913 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 7914 rack->rc_rack_rtt = t; 7915 } 7916 rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[i], cts, 3); 7917 tcp_rack_xmit_timer(rack, t + 1, len_acked, t, 0, rsm, 7918 rsm->r_rtr_cnt); 7919 return (1); 7920 } 7921 } 7922 goto ts_not_found; 7923 } else { 7924 /* 7925 * Ok its a SACK block that we retransmitted. or a windows 7926 * machine without timestamps. We can tell nothing from the 7927 * time-stamp since its not there or the time the peer last 7928 * recieved a segment that moved forward its cum-ack point. 7929 */ 7930 ts_not_found: 7931 i = rsm->r_rtr_cnt - 1; 7932 t = cts - (uint32_t)rsm->r_tim_lastsent[i]; 7933 if ((int)t <= 0) 7934 t = 1; 7935 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 7936 /* 7937 * We retransmitted and the ack came back in less 7938 * than the smallest rtt we have observed. We most 7939 * likely did an improper retransmit as outlined in 7940 * 6.2 Step 2 point 2 in the rack-draft so we 7941 * don't want to update our rack_rtt. We in 7942 * theory (in future) might want to think about reverting our 7943 * cwnd state but we won't for now. 7944 */ 7945 return (0); 7946 } else if (rack->r_ctl.rc_rack_min_rtt) { 7947 /* 7948 * We retransmitted it and the retransmit did the 7949 * job. 7950 */ 7951 if (!rack->r_ctl.rc_rack_min_rtt || 7952 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 7953 rack->r_ctl.rc_rack_min_rtt = t; 7954 if (rack->r_ctl.rc_rack_min_rtt == 0) { 7955 rack->r_ctl.rc_rack_min_rtt = 1; 7956 } 7957 } 7958 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, (uint32_t)rsm->r_tim_lastsent[i])) { 7959 /* New more recent rack_tmit_time */ 7960 rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[i]; 7961 rack->rc_rack_rtt = t; 7962 } 7963 return (1); 7964 } 7965 } 7966 return (0); 7967 } 7968 7969 /* 7970 * Mark the SACK_PASSED flag on all entries prior to rsm send wise. 7971 */ 7972 static void 7973 rack_log_sack_passed(struct tcpcb *tp, 7974 struct tcp_rack *rack, struct rack_sendmap *rsm) 7975 { 7976 struct rack_sendmap *nrsm; 7977 7978 nrsm = rsm; 7979 TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap, 7980 rack_head, r_tnext) { 7981 if (nrsm == rsm) { 7982 /* Skip orginal segment he is acked */ 7983 continue; 7984 } 7985 if (nrsm->r_flags & RACK_ACKED) { 7986 /* 7987 * Skip ack'd segments, though we 7988 * should not see these, since tmap 7989 * should not have ack'd segments. 7990 */ 7991 continue; 7992 } 7993 if (nrsm->r_flags & RACK_SACK_PASSED) { 7994 /* 7995 * We found one that is already marked 7996 * passed, we have been here before and 7997 * so all others below this are marked. 
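 * (The tmap is kept in send order and this walk moves from the newly sacked rsm toward earlier sends, so once an already-marked entry is found every entry before it was marked on a previous pass and we can stop.)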
7998 */ 7999 break; 8000 } 8001 nrsm->r_flags |= RACK_SACK_PASSED; 8002 nrsm->r_flags &= ~RACK_WAS_SACKPASS; 8003 } 8004 } 8005 8006 static void 8007 rack_need_set_test(struct tcpcb *tp, 8008 struct tcp_rack *rack, 8009 struct rack_sendmap *rsm, 8010 tcp_seq th_ack, 8011 int line, 8012 int use_which) 8013 { 8014 8015 if ((tp->t_flags & TF_GPUTINPROG) && 8016 SEQ_GEQ(rsm->r_end, tp->gput_seq)) { 8017 /* 8018 * We were app limited, and this ack 8019 * butts up or goes beyond the point where we want 8020 * to start our next measurement. We need 8021 * to record the new gput_ts as here and 8022 * possibly update the start sequence. 8023 */ 8024 uint32_t seq, ts; 8025 8026 if (rsm->r_rtr_cnt > 1) { 8027 /* 8028 * This is a retransmit, can we 8029 * really make any assessment at this 8030 * point? We are not really sure of 8031 * the timestamp, is it this or the 8032 * previous transmission? 8033 * 8034 * Lets wait for something better that 8035 * is not retransmitted. 8036 */ 8037 return; 8038 } 8039 seq = tp->gput_seq; 8040 ts = tp->gput_ts; 8041 rack->app_limited_needs_set = 0; 8042 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 8043 /* Do we start at a new end? */ 8044 if ((use_which == RACK_USE_BEG) && 8045 SEQ_GEQ(rsm->r_start, tp->gput_seq)) { 8046 /* 8047 * When we get an ACK that just eats 8048 * up some of the rsm, we set RACK_USE_BEG 8049 * since whats at r_start (i.e. th_ack) 8050 * is left unacked and thats where the 8051 * measurement not starts. 8052 */ 8053 tp->gput_seq = rsm->r_start; 8054 rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 8055 } 8056 if ((use_which == RACK_USE_END) && 8057 SEQ_GEQ(rsm->r_end, tp->gput_seq)) { 8058 /* 8059 * We use the end when the cumack 8060 * is moving forward and completely 8061 * deleting the rsm passed so basically 8062 * r_end holds th_ack. 8063 * 8064 * For SACK's we also want to use the end 8065 * since this piece just got sacked and 8066 * we want to target anything after that 8067 * in our measurement. 8068 */ 8069 tp->gput_seq = rsm->r_end; 8070 rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 8071 } 8072 if (use_which == RACK_USE_END_OR_THACK) { 8073 /* 8074 * special case for ack moving forward, 8075 * not a sack, we need to move all the 8076 * way up to where this ack cum-ack moves 8077 * to. 8078 */ 8079 if (SEQ_GT(th_ack, rsm->r_end)) 8080 tp->gput_seq = th_ack; 8081 else 8082 tp->gput_seq = rsm->r_end; 8083 rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 8084 } 8085 if (SEQ_GT(tp->gput_seq, tp->gput_ack)) { 8086 /* 8087 * We moved beyond this guy's range, re-calculate 8088 * the new end point. 8089 */ 8090 if (rack->rc_gp_filled == 0) { 8091 tp->gput_ack = tp->gput_seq + max(rc_init_window(rack), (MIN_GP_WIN * ctf_fixed_maxseg(tp))); 8092 } else { 8093 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 8094 } 8095 } 8096 /* 8097 * We are moving the goal post, we may be able to clear the 8098 * measure_saw_probe_rtt flag. 
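 * (It is cleared below only once we are out of probe-rtt and gput_seq has advanced to or beyond rc_probertt_sndmax_atexit, i.e. the measurement no longer overlaps data sent while probe-rtt had the pacing rate pulled down.)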
8099 */ 8100 if ((rack->in_probe_rtt == 0) && 8101 (rack->measure_saw_probe_rtt) && 8102 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 8103 rack->measure_saw_probe_rtt = 0; 8104 rack_log_pacing_delay_calc(rack, ts, tp->gput_ts, 8105 seq, tp->gput_seq, 0, 5, line, NULL); 8106 if (rack->rc_gp_filled && 8107 ((tp->gput_ack - tp->gput_seq) < 8108 max(rc_init_window(rack), (MIN_GP_WIN * 8109 ctf_fixed_maxseg(tp))))) { 8110 uint32_t ideal_amount; 8111 8112 ideal_amount = rack_get_measure_window(tp, rack); 8113 if (ideal_amount > sbavail(&tp->t_inpcb->inp_socket->so_snd)) { 8114 /* 8115 * There is no sense of continuing this measurement 8116 * because its too small to gain us anything we 8117 * trust. Skip it and that way we can start a new 8118 * measurement quicker. 8119 */ 8120 tp->t_flags &= ~TF_GPUTINPROG; 8121 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq, 8122 0, 0, 0, 6, __LINE__, NULL); 8123 } else { 8124 /* 8125 * Reset the window further out. 8126 */ 8127 tp->gput_ack = tp->gput_seq + ideal_amount; 8128 } 8129 } 8130 } 8131 } 8132 8133 static uint32_t 8134 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, 8135 struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, int *moved_two) 8136 { 8137 uint32_t start, end, changed = 0; 8138 struct rack_sendmap stack_map; 8139 struct rack_sendmap *rsm, *nrsm, fe, *insret, *prev, *next; 8140 int32_t used_ref = 1; 8141 int moved = 0; 8142 8143 start = sack->start; 8144 end = sack->end; 8145 rsm = *prsm; 8146 memset(&fe, 0, sizeof(fe)); 8147 do_rest_ofb: 8148 if ((rsm == NULL) || 8149 (SEQ_LT(end, rsm->r_start)) || 8150 (SEQ_GEQ(start, rsm->r_end)) || 8151 (SEQ_LT(start, rsm->r_start))) { 8152 /* 8153 * We are not in the right spot, 8154 * find the correct spot in the tree. 8155 */ 8156 used_ref = 0; 8157 fe.r_start = start; 8158 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 8159 moved++; 8160 } 8161 if (rsm == NULL) { 8162 /* TSNH */ 8163 goto out; 8164 } 8165 /* Ok we have an ACK for some piece of this rsm */ 8166 if (rsm->r_start != start) { 8167 if ((rsm->r_flags & RACK_ACKED) == 0) { 8168 /** 8169 * Need to split this in two pieces the before and after, 8170 * the before remains in the map, the after must be 8171 * added. In other words we have: 8172 * rsm |--------------| 8173 * sackblk |-------> 8174 * rsm will become 8175 * rsm |---| 8176 * and nrsm will be the sacked piece 8177 * nrsm |----------| 8178 * 8179 * But before we start down that path lets 8180 * see if the sack spans over on top of 8181 * the next guy and it is already sacked. 8182 */ 8183 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8184 if (next && (next->r_flags & RACK_ACKED) && 8185 SEQ_GEQ(end, next->r_start)) { 8186 /** 8187 * So the next one is already acked, and 8188 * we can thus by hookery use our stack_map 8189 * to reflect the piece being sacked and 8190 * then adjust the two tree entries moving 8191 * the start and ends around. So we start like: 8192 * rsm |------------| (not-acked) 8193 * next |-----------| (acked) 8194 * sackblk |--------> 8195 * We want to end like so: 8196 * rsm |------| (not-acked) 8197 * next |-----------------| (acked) 8198 * nrsm |-----| 8199 * Where nrsm is a temporary stack piece we 8200 * use to update all the gizmos. 
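 *
 * No new rsm is allocated for this case: the boundary at
 * 'start' simply moves from rsm into next (which already
 * carries RACK_ACKED), so only the byte accounting, the mbuf
 * offsets (rack_setup_offset_for_rsm) and the temporary stack
 * copy used for the RTT/logging updates need fixing up.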
8201 */ 8202 /* Copy up our fudge block */ 8203 nrsm = &stack_map; 8204 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 8205 /* Now adjust our tree blocks */ 8206 rsm->r_end = start; 8207 next->r_start = start; 8208 /* Now we must adjust back where next->m is */ 8209 rack_setup_offset_for_rsm(rsm, next); 8210 8211 /* We don't need to adjust rsm, it did not change */ 8212 /* Clear out the dup ack count of the remainder */ 8213 rsm->r_dupack = 0; 8214 rsm->r_just_ret = 0; 8215 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 8216 /* Now lets make sure our fudge block is right */ 8217 nrsm->r_start = start; 8218 /* Now lets update all the stats and such */ 8219 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); 8220 if (rack->app_limited_needs_set) 8221 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); 8222 changed += (nrsm->r_end - nrsm->r_start); 8223 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 8224 if (nrsm->r_flags & RACK_SACK_PASSED) { 8225 counter_u64_add(rack_reorder_seen, 1); 8226 rack->r_ctl.rc_reorder_ts = cts; 8227 } 8228 /* 8229 * Now we want to go up from rsm (the 8230 * one left un-acked) to the next one 8231 * in the tmap. We do this so when 8232 * we walk backwards we include marking 8233 * sack-passed on rsm (The one passed in 8234 * is skipped since it is generally called 8235 * on something sacked before removing it 8236 * from the tmap). 8237 */ 8238 if (rsm->r_in_tmap) { 8239 nrsm = TAILQ_NEXT(rsm, r_tnext); 8240 /* 8241 * Now that we have the next 8242 * one walk backwards from there. 8243 */ 8244 if (nrsm && nrsm->r_in_tmap) 8245 rack_log_sack_passed(tp, rack, nrsm); 8246 } 8247 /* Now are we done? */ 8248 if (SEQ_LT(end, next->r_end) || 8249 (end == next->r_end)) { 8250 /* Done with block */ 8251 goto out; 8252 } 8253 rack_log_map_chg(tp, rack, &stack_map, rsm, next, MAP_SACK_M1, end, __LINE__); 8254 counter_u64_add(rack_sack_used_next_merge, 1); 8255 /* Postion for the next block */ 8256 start = next->r_end; 8257 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, next); 8258 if (rsm == NULL) 8259 goto out; 8260 } else { 8261 /** 8262 * We can't use any hookery here, so we 8263 * need to split the map. We enter like 8264 * so: 8265 * rsm |--------| 8266 * sackblk |-----> 8267 * We will add the new block nrsm and 8268 * that will be the new portion, and then 8269 * fall through after reseting rsm. So we 8270 * split and look like this: 8271 * rsm |----| 8272 * sackblk |-----> 8273 * nrsm |---| 8274 * We then fall through reseting 8275 * rsm to nrsm, so the next block 8276 * picks it up. 8277 */ 8278 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 8279 if (nrsm == NULL) { 8280 /* 8281 * failed XXXrrs what can we do but loose the sack 8282 * info? 
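 *
 * rack_alloc_limit() may refuse the split when the allocation
 * limit is hit or memory is tight.  Dropping the SACK
 * information is safe since SACK is advisory; at worst we may
 * later retransmit data the peer already holds.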
8283 */ 8284 goto out; 8285 } 8286 counter_u64_add(rack_sack_splits, 1); 8287 rack_clone_rsm(rack, nrsm, rsm, start); 8288 rsm->r_just_ret = 0; 8289 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 8290 #ifdef INVARIANTS 8291 if (insret != NULL) { 8292 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 8293 nrsm, insret, rack, rsm); 8294 } 8295 #endif 8296 if (rsm->r_in_tmap) { 8297 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 8298 nrsm->r_in_tmap = 1; 8299 } 8300 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M2, end, __LINE__); 8301 rsm->r_flags &= (~RACK_HAS_FIN); 8302 /* Position us to point to the new nrsm that starts the sack blk */ 8303 rsm = nrsm; 8304 } 8305 } else { 8306 /* Already sacked this piece */ 8307 counter_u64_add(rack_sack_skipped_acked, 1); 8308 moved++; 8309 if (end == rsm->r_end) { 8310 /* Done with block */ 8311 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8312 goto out; 8313 } else if (SEQ_LT(end, rsm->r_end)) { 8314 /* A partial sack to a already sacked block */ 8315 moved++; 8316 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8317 goto out; 8318 } else { 8319 /* 8320 * The end goes beyond this guy 8321 * repostion the start to the 8322 * next block. 8323 */ 8324 start = rsm->r_end; 8325 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8326 if (rsm == NULL) 8327 goto out; 8328 } 8329 } 8330 } 8331 if (SEQ_GEQ(end, rsm->r_end)) { 8332 /** 8333 * The end of this block is either beyond this guy or right 8334 * at this guy. I.e.: 8335 * rsm --- |-----| 8336 * end |-----| 8337 * <or> 8338 * end |---------| 8339 */ 8340 if ((rsm->r_flags & RACK_ACKED) == 0) { 8341 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); 8342 changed += (rsm->r_end - rsm->r_start); 8343 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 8344 if (rsm->r_in_tmap) /* should be true */ 8345 rack_log_sack_passed(tp, rack, rsm); 8346 /* Is Reordering occuring? */ 8347 if (rsm->r_flags & RACK_SACK_PASSED) { 8348 rsm->r_flags &= ~RACK_SACK_PASSED; 8349 counter_u64_add(rack_reorder_seen, 1); 8350 rack->r_ctl.rc_reorder_ts = cts; 8351 } 8352 if (rack->app_limited_needs_set) 8353 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); 8354 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 8355 rsm->r_flags |= RACK_ACKED; 8356 rsm->r_flags &= ~RACK_TLP; 8357 if (rsm->r_in_tmap) { 8358 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 8359 rsm->r_in_tmap = 0; 8360 } 8361 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_SACK_M3, end, __LINE__); 8362 } else { 8363 counter_u64_add(rack_sack_skipped_acked, 1); 8364 moved++; 8365 } 8366 if (end == rsm->r_end) { 8367 /* This block only - done, setup for next */ 8368 goto out; 8369 } 8370 /* 8371 * There is more not coverend by this rsm move on 8372 * to the next block in the RB tree. 8373 */ 8374 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8375 start = rsm->r_end; 8376 rsm = nrsm; 8377 if (rsm == NULL) 8378 goto out; 8379 goto do_rest_ofb; 8380 } 8381 /** 8382 * The end of this sack block is smaller than 8383 * our rsm i.e.: 8384 * rsm --- |-----| 8385 * end |--| 8386 */ 8387 if ((rsm->r_flags & RACK_ACKED) == 0) { 8388 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8389 if (prev && (prev->r_flags & RACK_ACKED)) { 8390 /** 8391 * Goal, we want the right remainder of rsm to shrink 8392 * in place and span from (rsm->r_start = end) to rsm->r_end. 
8393 * We want to expand prev to go all the way 8394 * to prev->r_end <- end. 8395 * so in the tree we have before: 8396 * prev |--------| (acked) 8397 * rsm |-------| (non-acked) 8398 * sackblk |-| 8399 * We churn it so we end up with 8400 * prev |----------| (acked) 8401 * rsm |-----| (non-acked) 8402 * nrsm |-| (temporary) 8403 */ 8404 nrsm = &stack_map; 8405 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 8406 prev->r_end = end; 8407 rsm->r_start = end; 8408 /* Now adjust nrsm (stack copy) to be 8409 * the one that is the small 8410 * piece that was "sacked". 8411 */ 8412 nrsm->r_end = end; 8413 rsm->r_dupack = 0; 8414 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 8415 /* 8416 * Now that the rsm has had its start moved forward 8417 * lets go ahead and get its new place in the world. 8418 */ 8419 rack_setup_offset_for_rsm(prev, rsm); 8420 /* 8421 * Now nrsm is our new little piece 8422 * that is acked (which was merged 8423 * to prev). Update the rtt and changed 8424 * based on that. Also check for reordering. 8425 */ 8426 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); 8427 if (rack->app_limited_needs_set) 8428 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); 8429 changed += (nrsm->r_end - nrsm->r_start); 8430 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 8431 if (nrsm->r_flags & RACK_SACK_PASSED) { 8432 counter_u64_add(rack_reorder_seen, 1); 8433 rack->r_ctl.rc_reorder_ts = cts; 8434 } 8435 rack_log_map_chg(tp, rack, prev, &stack_map, rsm, MAP_SACK_M4, end, __LINE__); 8436 rsm = prev; 8437 counter_u64_add(rack_sack_used_prev_merge, 1); 8438 } else { 8439 /** 8440 * This is the case where our previous 8441 * block is not acked either, so we must 8442 * split the block in two. 8443 */ 8444 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 8445 if (nrsm == NULL) { 8446 /* failed rrs what can we do but loose the sack info? */ 8447 goto out; 8448 } 8449 /** 8450 * In this case nrsm becomes 8451 * nrsm->r_start = end; 8452 * nrsm->r_end = rsm->r_end; 8453 * which is un-acked. 8454 * <and> 8455 * rsm->r_end = nrsm->r_start; 8456 * i.e. the remaining un-acked 8457 * piece is left on the left 8458 * hand side. 8459 * 8460 * So we start like this 8461 * rsm |----------| (not acked) 8462 * sackblk |---| 8463 * build it so we have 8464 * rsm |---| (acked) 8465 * nrsm |------| (not acked) 8466 */ 8467 counter_u64_add(rack_sack_splits, 1); 8468 rack_clone_rsm(rack, nrsm, rsm, end); 8469 rsm->r_flags &= (~RACK_HAS_FIN); 8470 rsm->r_just_ret = 0; 8471 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 8472 #ifdef INVARIANTS 8473 if (insret != NULL) { 8474 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 8475 nrsm, insret, rack, rsm); 8476 } 8477 #endif 8478 if (rsm->r_in_tmap) { 8479 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 8480 nrsm->r_in_tmap = 1; 8481 } 8482 nrsm->r_dupack = 0; 8483 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); 8484 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); 8485 changed += (rsm->r_end - rsm->r_start); 8486 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 8487 if (rsm->r_in_tmap) /* should be true */ 8488 rack_log_sack_passed(tp, rack, rsm); 8489 /* Is Reordering occuring? 
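 * A SACK arriving for data already marked SACK_PASSED means a
 * later-sent segment was acknowledged first, i.e. the apparent
 * "miss" was reordering rather than loss, so clear the mark and
 * record when the reordering was seen.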
*/ 8490 if (rsm->r_flags & RACK_SACK_PASSED) { 8491 rsm->r_flags &= ~RACK_SACK_PASSED; 8492 counter_u64_add(rack_reorder_seen, 1); 8493 rack->r_ctl.rc_reorder_ts = cts; 8494 } 8495 if (rack->app_limited_needs_set) 8496 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); 8497 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 8498 rsm->r_flags |= RACK_ACKED; 8499 rsm->r_flags &= ~RACK_TLP; 8500 rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M5, end, __LINE__); 8501 if (rsm->r_in_tmap) { 8502 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 8503 rsm->r_in_tmap = 0; 8504 } 8505 } 8506 } else if (start != end){ 8507 /* 8508 * The block was already acked. 8509 */ 8510 counter_u64_add(rack_sack_skipped_acked, 1); 8511 moved++; 8512 } 8513 out: 8514 if (rsm && (rsm->r_flags & RACK_ACKED)) { 8515 /* 8516 * Now can we merge where we worked 8517 * with either the previous or 8518 * next block? 8519 */ 8520 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8521 while (next) { 8522 if (next->r_flags & RACK_ACKED) { 8523 /* yep this and next can be merged */ 8524 rsm = rack_merge_rsm(rack, rsm, next); 8525 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8526 } else 8527 break; 8528 } 8529 /* Now what about the previous? */ 8530 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8531 while (prev) { 8532 if (prev->r_flags & RACK_ACKED) { 8533 /* yep the previous and this can be merged */ 8534 rsm = rack_merge_rsm(rack, prev, rsm); 8535 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8536 } else 8537 break; 8538 } 8539 } 8540 if (used_ref == 0) { 8541 counter_u64_add(rack_sack_proc_all, 1); 8542 } else { 8543 counter_u64_add(rack_sack_proc_short, 1); 8544 } 8545 /* Save off the next one for quick reference. */ 8546 if (rsm) 8547 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8548 else 8549 nrsm = NULL; 8550 *prsm = rack->r_ctl.rc_sacklast = nrsm; 8551 /* Pass back the moved. */ 8552 *moved_two = moved; 8553 return (changed); 8554 } 8555 8556 static void inline 8557 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack) 8558 { 8559 struct rack_sendmap *tmap; 8560 8561 tmap = NULL; 8562 while (rsm && (rsm->r_flags & RACK_ACKED)) { 8563 /* Its no longer sacked, mark it so */ 8564 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 8565 #ifdef INVARIANTS 8566 if (rsm->r_in_tmap) { 8567 panic("rack:%p rsm:%p flags:0x%x in tmap?", 8568 rack, rsm, rsm->r_flags); 8569 } 8570 #endif 8571 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS); 8572 /* Rebuild it into our tmap */ 8573 if (tmap == NULL) { 8574 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 8575 tmap = rsm; 8576 } else { 8577 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext); 8578 tmap = rsm; 8579 } 8580 tmap->r_in_tmap = 1; 8581 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8582 } 8583 /* 8584 * Now lets possibly clear the sack filter so we start 8585 * recognizing sacks that cover this area. 
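 *
 * The sack filter suppresses blocks it believes it has already
 * processed; after a reneg those ranges have to be accepted
 * again so the scoreboard can be rebuilt, hence the reset to
 * the current cum-ack point.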
8586 */ 8587 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack); 8588 8589 } 8590 8591 static void 8592 rack_do_decay(struct tcp_rack *rack) 8593 { 8594 struct timeval res; 8595 8596 #define timersub(tvp, uvp, vvp) \ 8597 do { \ 8598 (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ 8599 (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ 8600 if ((vvp)->tv_usec < 0) { \ 8601 (vvp)->tv_sec--; \ 8602 (vvp)->tv_usec += 1000000; \ 8603 } \ 8604 } while (0) 8605 8606 timersub(&rack->r_ctl.act_rcv_time, &rack->r_ctl.rc_last_time_decay, &res); 8607 #undef timersub 8608 8609 rack->r_ctl.input_pkt++; 8610 if ((rack->rc_in_persist) || 8611 (res.tv_sec >= 1) || 8612 (rack->rc_tp->snd_max == rack->rc_tp->snd_una)) { 8613 /* 8614 * Check for decay of non-SAD, 8615 * we want all SAD detection metrics to 8616 * decay 1/4 per second (or more) passed. 8617 */ 8618 uint32_t pkt_delta; 8619 8620 pkt_delta = rack->r_ctl.input_pkt - rack->r_ctl.saved_input_pkt; 8621 /* Update our saved tracking values */ 8622 rack->r_ctl.saved_input_pkt = rack->r_ctl.input_pkt; 8623 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; 8624 /* Now do we escape without decay? */ 8625 #ifdef NETFLIX_EXP_DETECTION 8626 if (rack->rc_in_persist || 8627 (rack->rc_tp->snd_max == rack->rc_tp->snd_una) || 8628 (pkt_delta < tcp_sad_low_pps)){ 8629 /* 8630 * We don't decay idle connections 8631 * or ones that have a low input pps. 8632 */ 8633 return; 8634 } 8635 /* Decay the counters */ 8636 rack->r_ctl.ack_count = ctf_decay_count(rack->r_ctl.ack_count, 8637 tcp_sad_decay_val); 8638 rack->r_ctl.sack_count = ctf_decay_count(rack->r_ctl.sack_count, 8639 tcp_sad_decay_val); 8640 rack->r_ctl.sack_moved_extra = ctf_decay_count(rack->r_ctl.sack_moved_extra, 8641 tcp_sad_decay_val); 8642 rack->r_ctl.sack_noextra_move = ctf_decay_count(rack->r_ctl.sack_noextra_move, 8643 tcp_sad_decay_val); 8644 #endif 8645 } 8646 } 8647 8648 static void 8649 rack_process_to_cumack(struct tcpcb *tp, struct tcp_rack *rack, register uint32_t th_ack, uint32_t cts, struct tcpopt *to) 8650 { 8651 struct rack_sendmap *rsm, *rm; 8652 8653 /* 8654 * The ACK point is advancing to th_ack, we must drop off 8655 * the packets in the rack log and calculate any eligble 8656 * RTT's. 8657 */ 8658 rack->r_wanted_output = 1; 8659 more: 8660 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 8661 if (rsm == NULL) { 8662 if ((th_ack - 1) == tp->iss) { 8663 /* 8664 * For the SYN incoming case we will not 8665 * have called tcp_output for the sending of 8666 * the SYN, so there will be no map. All 8667 * other cases should probably be a panic. 8668 */ 8669 return; 8670 } 8671 if (tp->t_flags & TF_SENTFIN) { 8672 /* if we sent a FIN we often will not have map */ 8673 return; 8674 } 8675 #ifdef INVARIANTS 8676 panic("No rack map tp:%p for state:%d ack:%u rack:%p snd_una:%u snd_max:%u snd_nxt:%u\n", 8677 tp, 8678 tp->t_state, th_ack, rack, 8679 tp->snd_una, tp->snd_max, tp->snd_nxt); 8680 #endif 8681 return; 8682 } 8683 if (SEQ_LT(th_ack, rsm->r_start)) { 8684 /* Huh map is missing this */ 8685 #ifdef INVARIANTS 8686 printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n", 8687 rsm->r_start, 8688 th_ack, tp->t_state, rack->r_state); 8689 #endif 8690 return; 8691 } 8692 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED, th_ack); 8693 /* Now do we consume the whole thing? */ 8694 if (SEQ_GEQ(th_ack, rsm->r_end)) { 8695 /* Its all consumed. 
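 * th_ack covers this entire rsm, so below it is unhooked from
 * both the RB tree and the transmit queue, its SACK and
 * retransmit accounting is settled, and it is freed back to
 * the zone.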
*/ 8696 uint32_t left; 8697 uint8_t newly_acked; 8698 8699 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_FREE, rsm->r_end, __LINE__); 8700 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 8701 rsm->r_rtr_bytes = 0; 8702 /* Record the time of highest cumack sent */ 8703 rack->r_ctl.rc_gp_cumack_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 8704 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 8705 #ifdef INVARIANTS 8706 if (rm != rsm) { 8707 panic("removing head in rack:%p rsm:%p rm:%p", 8708 rack, rsm, rm); 8709 } 8710 #endif 8711 if (rsm->r_in_tmap) { 8712 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 8713 rsm->r_in_tmap = 0; 8714 } 8715 newly_acked = 1; 8716 if (rsm->r_flags & RACK_ACKED) { 8717 /* 8718 * It was acked on the scoreboard -- remove 8719 * it from total 8720 */ 8721 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 8722 newly_acked = 0; 8723 } else if (rsm->r_flags & RACK_SACK_PASSED) { 8724 /* 8725 * There are segments ACKED on the 8726 * scoreboard further up. We are seeing 8727 * reordering. 8728 */ 8729 rsm->r_flags &= ~RACK_SACK_PASSED; 8730 counter_u64_add(rack_reorder_seen, 1); 8731 rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 8732 rsm->r_flags |= RACK_ACKED; 8733 rack->r_ctl.rc_reorder_ts = cts; 8734 if (rack->r_ent_rec_ns) { 8735 /* 8736 * We have sent no more, and we saw an sack 8737 * then ack arrive. 8738 */ 8739 rack->r_might_revert = 1; 8740 } 8741 } 8742 if ((rsm->r_flags & RACK_TO_REXT) && 8743 (tp->t_flags & TF_RCVD_TSTMP) && 8744 (to->to_flags & TOF_TS) && 8745 (tp->t_flags & TF_PREVVALID)) { 8746 /* 8747 * We can use the timestamp to see 8748 * if this retransmission was from the 8749 * first transmit. If so we made a mistake. 8750 */ 8751 tp->t_flags &= ~TF_PREVVALID; 8752 if (to->to_tsecr == rack_ts_to_msec(rsm->r_tim_lastsent[0])) { 8753 /* The first transmit is what this ack is for */ 8754 rack_cong_signal(tp, CC_RTO_ERR, th_ack); 8755 } 8756 } 8757 left = th_ack - rsm->r_end; 8758 if (rack->app_limited_needs_set && newly_acked) 8759 rack_need_set_test(tp, rack, rsm, th_ack, __LINE__, RACK_USE_END_OR_THACK); 8760 /* Free back to zone */ 8761 rack_free(rack, rsm); 8762 if (left) { 8763 goto more; 8764 } 8765 /* Check for reneging */ 8766 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 8767 if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) { 8768 /* 8769 * The peer has moved snd_una up to 8770 * the edge of this send, i.e. one 8771 * that it had previously acked. The only 8772 * way that can be true if the peer threw 8773 * away data (space issues) that it had 8774 * previously sacked (else it would have 8775 * given us snd_una up to (rsm->r_end). 8776 * We need to undo the acked markings here. 8777 * 8778 * Note we have to look to make sure th_ack is 8779 * our rsm->r_start in case we get an old ack 8780 * where th_ack is behind snd_una. 8781 */ 8782 rack_peer_reneges(rack, rsm, th_ack); 8783 } 8784 return; 8785 } 8786 if (rsm->r_flags & RACK_ACKED) { 8787 /* 8788 * It was acked on the scoreboard -- remove it from 8789 * total for the part being cum-acked. 8790 */ 8791 rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); 8792 } 8793 /* 8794 * Clear the dup ack count for 8795 * the piece that remains. 8796 */ 8797 rsm->r_dupack = 0; 8798 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 8799 if (rsm->r_rtr_bytes) { 8800 /* 8801 * It was retransmitted adjust the 8802 * sack holes for what was acked. 
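 *
 * rc_holes_rxt tracks retransmitted bytes still counted in
 * flight and feeds the PRR pipe estimate (pipe = outstanding -
 * sacked + holes_rxt), so bytes the cum-ack now covers must
 * come back out of it.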
8803 */ 8804 int ack_am; 8805 8806 ack_am = (th_ack - rsm->r_start); 8807 if (ack_am >= rsm->r_rtr_bytes) { 8808 rack->r_ctl.rc_holes_rxt -= ack_am; 8809 rsm->r_rtr_bytes -= ack_am; 8810 } 8811 } 8812 /* 8813 * Update where the piece starts and record 8814 * the time of send of highest cumack sent. 8815 */ 8816 rack->r_ctl.rc_gp_cumack_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; 8817 rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_TRIM_HEAD, th_ack, __LINE__); 8818 /* Now we need to move our offset forward too */ 8819 if (rsm->m && (rsm->orig_m_len != rsm->m->m_len)) { 8820 /* Fix up the orig_m_len and possibly the mbuf offset */ 8821 rack_adjust_orig_mlen(rsm); 8822 } 8823 rsm->soff += (th_ack - rsm->r_start); 8824 rsm->r_start = th_ack; 8825 /* Now do we need to move the mbuf fwd too? */ 8826 if (rsm->m) { 8827 while (rsm->soff >= rsm->m->m_len) { 8828 rsm->soff -= rsm->m->m_len; 8829 rsm->m = rsm->m->m_next; 8830 KASSERT((rsm->m != NULL), 8831 (" nrsm:%p hit at soff:%u null m", 8832 rsm, rsm->soff)); 8833 } 8834 rsm->orig_m_len = rsm->m->m_len; 8835 } 8836 if (rack->app_limited_needs_set) 8837 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG); 8838 } 8839 8840 static void 8841 rack_handle_might_revert(struct tcpcb *tp, struct tcp_rack *rack) 8842 { 8843 struct rack_sendmap *rsm; 8844 int sack_pass_fnd = 0; 8845 8846 if (rack->r_might_revert) { 8847 /* 8848 * Ok we have reordering, have not sent anything, we 8849 * might want to revert the congestion state if nothing 8850 * further has SACK_PASSED on it. Lets check. 8851 * 8852 * We also get here when we have DSACKs come in for 8853 * all the data that we FR'd. Note that a rxt or tlp 8854 * timer clears this from happening. 8855 */ 8856 8857 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 8858 if (rsm->r_flags & RACK_SACK_PASSED) { 8859 sack_pass_fnd = 1; 8860 break; 8861 } 8862 } 8863 if (sack_pass_fnd == 0) { 8864 /* 8865 * We went into recovery 8866 * incorrectly due to reordering! 8867 */ 8868 int orig_cwnd; 8869 8870 rack->r_ent_rec_ns = 0; 8871 orig_cwnd = tp->snd_cwnd; 8872 tp->snd_cwnd = rack->r_ctl.rc_cwnd_at_erec; 8873 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at_erec; 8874 tp->snd_recover = tp->snd_una; 8875 rack_log_to_prr(rack, 14, orig_cwnd); 8876 EXIT_RECOVERY(tp->t_flags); 8877 } 8878 rack->r_might_revert = 0; 8879 } 8880 } 8881 8882 #ifdef NETFLIX_EXP_DETECTION 8883 static void 8884 rack_do_detection(struct tcpcb *tp, struct tcp_rack *rack, uint32_t bytes_this_ack, uint32_t segsiz) 8885 { 8886 if ((rack->do_detection || tcp_force_detection) && 8887 tcp_sack_to_ack_thresh && 8888 tcp_sack_to_move_thresh && 8889 ((rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum) || rack->sack_attack_disable)) { 8890 /* 8891 * We have thresholds set to find 8892 * possible attackers and disable sack. 8893 * Check them. 
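 *
 * Both ratios below are scaled by 1000.  As an illustration,
 * 4000 SACK blocks against 500 qualifying acks gives an
 * ackratio of 8000, and 300 "extra move" SACKs out of 400 total
 * moves gives a moveratio of 750; SACK processing is only
 * disabled when both ratios exceed their sysctl thresholds.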
8894 */ 8895 uint64_t ackratio, moveratio, movetotal; 8896 8897 /* Log detecting */ 8898 rack_log_sad(rack, 1); 8899 ackratio = (uint64_t)(rack->r_ctl.sack_count); 8900 ackratio *= (uint64_t)(1000); 8901 if (rack->r_ctl.ack_count) 8902 ackratio /= (uint64_t)(rack->r_ctl.ack_count); 8903 else { 8904 /* We really should not hit here */ 8905 ackratio = 1000; 8906 } 8907 if ((rack->sack_attack_disable == 0) && 8908 (ackratio > rack_highest_sack_thresh_seen)) 8909 rack_highest_sack_thresh_seen = (uint32_t)ackratio; 8910 movetotal = rack->r_ctl.sack_moved_extra; 8911 movetotal += rack->r_ctl.sack_noextra_move; 8912 moveratio = rack->r_ctl.sack_moved_extra; 8913 moveratio *= (uint64_t)1000; 8914 if (movetotal) 8915 moveratio /= movetotal; 8916 else { 8917 /* No moves, thats pretty good */ 8918 moveratio = 0; 8919 } 8920 if ((rack->sack_attack_disable == 0) && 8921 (moveratio > rack_highest_move_thresh_seen)) 8922 rack_highest_move_thresh_seen = (uint32_t)moveratio; 8923 if (rack->sack_attack_disable == 0) { 8924 if ((ackratio > tcp_sack_to_ack_thresh) && 8925 (moveratio > tcp_sack_to_move_thresh)) { 8926 /* Disable sack processing */ 8927 rack->sack_attack_disable = 1; 8928 if (rack->r_rep_attack == 0) { 8929 rack->r_rep_attack = 1; 8930 counter_u64_add(rack_sack_attacks_detected, 1); 8931 } 8932 if (tcp_attack_on_turns_on_logging) { 8933 /* 8934 * Turn on logging, used for debugging 8935 * false positives. 8936 */ 8937 rack->rc_tp->t_logstate = tcp_attack_on_turns_on_logging; 8938 } 8939 /* Clamp the cwnd at flight size */ 8940 rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd; 8941 rack->rc_tp->snd_cwnd = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 8942 rack_log_sad(rack, 2); 8943 } 8944 } else { 8945 /* We are sack-disabled check for false positives */ 8946 if ((ackratio <= tcp_restoral_thresh) || 8947 (rack->r_ctl.rc_num_maps_alloced < tcp_map_minimum)) { 8948 rack->sack_attack_disable = 0; 8949 rack_log_sad(rack, 3); 8950 /* Restart counting */ 8951 rack->r_ctl.sack_count = 0; 8952 rack->r_ctl.sack_moved_extra = 0; 8953 rack->r_ctl.sack_noextra_move = 1; 8954 rack->r_ctl.ack_count = max(1, 8955 (bytes_this_ack / segsiz)); 8956 8957 if (rack->r_rep_reverse == 0) { 8958 rack->r_rep_reverse = 1; 8959 counter_u64_add(rack_sack_attacks_reversed, 1); 8960 } 8961 /* Restore the cwnd */ 8962 if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd) 8963 rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd; 8964 } 8965 } 8966 } 8967 } 8968 #endif 8969 8970 static void 8971 rack_note_dsack(struct tcp_rack *rack, tcp_seq start, tcp_seq end) 8972 { 8973 8974 uint32_t am; 8975 8976 if (SEQ_GT(end, start)) 8977 am = end - start; 8978 else 8979 am = 0; 8980 /* 8981 * We keep track of how many DSACK blocks we get 8982 * after a recovery incident. 8983 */ 8984 rack->r_ctl.dsack_byte_cnt += am; 8985 if (!IN_FASTRECOVERY(rack->rc_tp->t_flags) && 8986 rack->r_ctl.retran_during_recovery && 8987 (rack->r_ctl.dsack_byte_cnt >= rack->r_ctl.retran_during_recovery)) { 8988 /* 8989 * False recovery most likely culprit is reordering. If 8990 * nothing else is missing we need to revert. 
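 *
 * In other words every byte we retransmitted during this
 * recovery has now been reported as a DSACK, so the peer had
 * the data all along and the recovery was almost certainly
 * spurious.  Setting r_might_revert lets
 * rack_handle_might_revert() restore the pre-recovery
 * cwnd/ssthresh, provided nothing in the transmit queue is
 * still marked SACK_PASSED.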
8991 */ 8992 rack->r_might_revert = 1; 8993 rack_handle_might_revert(rack->rc_tp, rack); 8994 rack->r_might_revert = 0; 8995 rack->r_ctl.retran_during_recovery = 0; 8996 rack->r_ctl.dsack_byte_cnt = 0; 8997 } 8998 } 8999 9000 static void 9001 rack_update_prr(struct tcpcb *tp, struct tcp_rack *rack, uint32_t changed, tcp_seq th_ack) 9002 { 9003 /* Deal with changed and PRR here (in recovery only) */ 9004 uint32_t pipe, snd_una; 9005 9006 rack->r_ctl.rc_prr_delivered += changed; 9007 9008 if (sbavail(&rack->rc_inp->inp_socket->so_snd) <= (tp->snd_max - tp->snd_una)) { 9009 /* 9010 * It is all outstanding, we are application limited 9011 * and thus we don't need more room to send anything. 9012 * Note we use tp->snd_una here and not th_ack because 9013 * the data as yet not been cut from the sb. 9014 */ 9015 rack->r_ctl.rc_prr_sndcnt = 0; 9016 return; 9017 } 9018 /* Compute prr_sndcnt */ 9019 if (SEQ_GT(tp->snd_una, th_ack)) { 9020 snd_una = tp->snd_una; 9021 } else { 9022 snd_una = th_ack; 9023 } 9024 pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt; 9025 if (pipe > tp->snd_ssthresh) { 9026 long sndcnt; 9027 9028 sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh; 9029 if (rack->r_ctl.rc_prr_recovery_fs > 0) 9030 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; 9031 else { 9032 rack->r_ctl.rc_prr_sndcnt = 0; 9033 rack_log_to_prr(rack, 9, 0); 9034 sndcnt = 0; 9035 } 9036 sndcnt++; 9037 if (sndcnt > (long)rack->r_ctl.rc_prr_out) 9038 sndcnt -= rack->r_ctl.rc_prr_out; 9039 else 9040 sndcnt = 0; 9041 rack->r_ctl.rc_prr_sndcnt = sndcnt; 9042 rack_log_to_prr(rack, 10, 0); 9043 } else { 9044 uint32_t limit; 9045 9046 if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out) 9047 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out); 9048 else 9049 limit = 0; 9050 if (changed > limit) 9051 limit = changed; 9052 limit += ctf_fixed_maxseg(tp); 9053 if (tp->snd_ssthresh > pipe) { 9054 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); 9055 rack_log_to_prr(rack, 11, 0); 9056 } else { 9057 rack->r_ctl.rc_prr_sndcnt = min(0, limit); 9058 rack_log_to_prr(rack, 12, 0); 9059 } 9060 } 9061 } 9062 9063 static void 9064 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered_recovery, int dup_ack_struck) 9065 { 9066 uint32_t changed; 9067 struct tcp_rack *rack; 9068 struct rack_sendmap *rsm; 9069 struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; 9070 register uint32_t th_ack; 9071 int32_t i, j, k, num_sack_blks = 0; 9072 uint32_t cts, acked, ack_point, sack_changed = 0; 9073 int loop_start = 0, moved_two = 0; 9074 uint32_t tsused; 9075 9076 9077 INP_WLOCK_ASSERT(tp->t_inpcb); 9078 if (th->th_flags & TH_RST) { 9079 /* We don't log resets */ 9080 return; 9081 } 9082 rack = (struct tcp_rack *)tp->t_fb_ptr; 9083 cts = tcp_get_usecs(NULL); 9084 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 9085 changed = 0; 9086 th_ack = th->th_ack; 9087 if (rack->sack_attack_disable == 0) 9088 rack_do_decay(rack); 9089 if (BYTES_THIS_ACK(tp, th) >= ctf_fixed_maxseg(rack->rc_tp)) { 9090 /* 9091 * You only get credit for 9092 * MSS and greater (and you get extra 9093 * credit for larger cum-ack moves). 9094 */ 9095 int ac; 9096 9097 ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp); 9098 rack->r_ctl.ack_count += ac; 9099 counter_u64_add(rack_ack_total, ac); 9100 } 9101 if (rack->r_ctl.ack_count > 0xfff00000) { 9102 /* 9103 * reduce the number to keep us under 9104 * a uint32_t. 
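 * Halving ack_count and sack_count together keeps the
 * sack-to-ack ratio used by the attack detection intact.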
9105 */ 9106 rack->r_ctl.ack_count /= 2; 9107 rack->r_ctl.sack_count /= 2; 9108 } 9109 if (SEQ_GT(th_ack, tp->snd_una)) { 9110 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__); 9111 tp->t_acktime = ticks; 9112 } 9113 if (rsm && SEQ_GT(th_ack, rsm->r_start)) 9114 changed = th_ack - rsm->r_start; 9115 if (changed) { 9116 rack_process_to_cumack(tp, rack, th_ack, cts, to); 9117 } 9118 if ((to->to_flags & TOF_SACK) == 0) { 9119 /* We are done nothing left and no sack. */ 9120 rack_handle_might_revert(tp, rack); 9121 /* 9122 * For cases where we struck a dup-ack 9123 * with no SACK, add to the changes so 9124 * PRR will work right. 9125 */ 9126 if (dup_ack_struck && (changed == 0)) { 9127 changed += ctf_fixed_maxseg(rack->rc_tp); 9128 } 9129 goto out; 9130 } 9131 /* Sack block processing */ 9132 if (SEQ_GT(th_ack, tp->snd_una)) 9133 ack_point = th_ack; 9134 else 9135 ack_point = tp->snd_una; 9136 for (i = 0; i < to->to_nsacks; i++) { 9137 bcopy((to->to_sacks + i * TCPOLEN_SACK), 9138 &sack, sizeof(sack)); 9139 sack.start = ntohl(sack.start); 9140 sack.end = ntohl(sack.end); 9141 if (SEQ_GT(sack.end, sack.start) && 9142 SEQ_GT(sack.start, ack_point) && 9143 SEQ_LT(sack.start, tp->snd_max) && 9144 SEQ_GT(sack.end, ack_point) && 9145 SEQ_LEQ(sack.end, tp->snd_max)) { 9146 sack_blocks[num_sack_blks] = sack; 9147 num_sack_blks++; 9148 #ifdef NETFLIX_STATS 9149 } else if (SEQ_LEQ(sack.start, th_ack) && 9150 SEQ_LEQ(sack.end, th_ack)) { 9151 /* 9152 * Its a D-SACK block. 9153 */ 9154 tcp_record_dsack(sack.start, sack.end); 9155 #endif 9156 rack_note_dsack(rack, sack.start, sack.end); 9157 } 9158 } 9159 /* 9160 * Sort the SACK blocks so we can update the rack scoreboard with 9161 * just one pass. 9162 */ 9163 num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, 9164 num_sack_blks, th->th_ack); 9165 ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks); 9166 if (num_sack_blks == 0) { 9167 /* Nothing to sack (DSACKs?) */ 9168 goto out_with_totals; 9169 } 9170 if (num_sack_blks < 2) { 9171 /* Only one, we don't need to sort */ 9172 goto do_sack_work; 9173 } 9174 /* Sort the sacks */ 9175 for (i = 0; i < num_sack_blks; i++) { 9176 for (j = i + 1; j < num_sack_blks; j++) { 9177 if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { 9178 sack = sack_blocks[i]; 9179 sack_blocks[i] = sack_blocks[j]; 9180 sack_blocks[j] = sack; 9181 } 9182 } 9183 } 9184 /* 9185 * Now are any of the sack block ends the same (yes some 9186 * implementations send these)? 9187 */ 9188 again: 9189 if (num_sack_blks == 0) 9190 goto out_with_totals; 9191 if (num_sack_blks > 1) { 9192 for (i = 0; i < num_sack_blks; i++) { 9193 for (j = i + 1; j < num_sack_blks; j++) { 9194 if (sack_blocks[i].end == sack_blocks[j].end) { 9195 /* 9196 * Ok these two have the same end we 9197 * want the smallest end and then 9198 * throw away the larger and start 9199 * again. 9200 */ 9201 if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) { 9202 /* 9203 * The second block covers 9204 * more area use that 9205 */ 9206 sack_blocks[i].start = sack_blocks[j].start; 9207 } 9208 /* 9209 * Now collapse out the dup-sack and 9210 * lower the count 9211 */ 9212 for (k = (j + 1); k < num_sack_blks; k++) { 9213 sack_blocks[j].start = sack_blocks[k].start; 9214 sack_blocks[j].end = sack_blocks[k].end; 9215 j++; 9216 } 9217 num_sack_blks--; 9218 goto again; 9219 } 9220 } 9221 } 9222 } 9223 do_sack_work: 9224 /* 9225 * First lets look to see if 9226 * we have retransmitted and 9227 * can use the transmit next? 
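 *
 * rc_tmap is kept in transmission order, so its head is the
 * oldest un-acked transmission.  When the lowest SACK block
 * overlaps it we can hand that block straight to
 * rack_proc_sack_blk() without paying for an RB-tree lookup.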
9228 */ 9229 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 9230 if (rsm && 9231 SEQ_GT(sack_blocks[0].end, rsm->r_start) && 9232 SEQ_LT(sack_blocks[0].start, rsm->r_end)) { 9233 /* 9234 * We probably did the FR and the next 9235 * SACK in continues as we would expect. 9236 */ 9237 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &moved_two); 9238 if (acked) { 9239 rack->r_wanted_output = 1; 9240 changed += acked; 9241 sack_changed += acked; 9242 } 9243 if (num_sack_blks == 1) { 9244 /* 9245 * This is what we would expect from 9246 * a normal implementation to happen 9247 * after we have retransmitted the FR, 9248 * i.e the sack-filter pushes down 9249 * to 1 block and the next to be retransmitted 9250 * is the sequence in the sack block (has more 9251 * are acked). Count this as ACK'd data to boost 9252 * up the chances of recovering any false positives. 9253 */ 9254 rack->r_ctl.ack_count += (acked / ctf_fixed_maxseg(rack->rc_tp)); 9255 counter_u64_add(rack_ack_total, (acked / ctf_fixed_maxseg(rack->rc_tp))); 9256 counter_u64_add(rack_express_sack, 1); 9257 if (rack->r_ctl.ack_count > 0xfff00000) { 9258 /* 9259 * reduce the number to keep us under 9260 * a uint32_t. 9261 */ 9262 rack->r_ctl.ack_count /= 2; 9263 rack->r_ctl.sack_count /= 2; 9264 } 9265 goto out_with_totals; 9266 } else { 9267 /* 9268 * Start the loop through the 9269 * rest of blocks, past the first block. 9270 */ 9271 moved_two = 0; 9272 loop_start = 1; 9273 } 9274 } 9275 /* Its a sack of some sort */ 9276 rack->r_ctl.sack_count++; 9277 if (rack->r_ctl.sack_count > 0xfff00000) { 9278 /* 9279 * reduce the number to keep us under 9280 * a uint32_t. 9281 */ 9282 rack->r_ctl.ack_count /= 2; 9283 rack->r_ctl.sack_count /= 2; 9284 } 9285 counter_u64_add(rack_sack_total, 1); 9286 if (rack->sack_attack_disable) { 9287 /* An attacker disablement is in place */ 9288 if (num_sack_blks > 1) { 9289 rack->r_ctl.sack_count += (num_sack_blks - 1); 9290 rack->r_ctl.sack_moved_extra++; 9291 counter_u64_add(rack_move_some, 1); 9292 if (rack->r_ctl.sack_moved_extra > 0xfff00000) { 9293 rack->r_ctl.sack_moved_extra /= 2; 9294 rack->r_ctl.sack_noextra_move /= 2; 9295 } 9296 } 9297 goto out; 9298 } 9299 rsm = rack->r_ctl.rc_sacklast; 9300 for (i = loop_start; i < num_sack_blks; i++) { 9301 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &moved_two); 9302 if (acked) { 9303 rack->r_wanted_output = 1; 9304 changed += acked; 9305 sack_changed += acked; 9306 } 9307 if (moved_two) { 9308 /* 9309 * If we did not get a SACK for at least a MSS and 9310 * had to move at all, or if we moved more than our 9311 * threshold, it counts against the "extra" move. 9312 */ 9313 rack->r_ctl.sack_moved_extra += moved_two; 9314 counter_u64_add(rack_move_some, 1); 9315 } else { 9316 /* 9317 * else we did not have to move 9318 * any more than we would expect. 9319 */ 9320 rack->r_ctl.sack_noextra_move++; 9321 counter_u64_add(rack_move_none, 1); 9322 } 9323 if (moved_two && (acked < ctf_fixed_maxseg(rack->rc_tp))) { 9324 /* 9325 * If the SACK was not a full MSS then 9326 * we add to sack_count the number of 9327 * MSS's (or possibly more than 9328 * a MSS if its a TSO send) we had to skip by. 9329 */ 9330 rack->r_ctl.sack_count += moved_two; 9331 counter_u64_add(rack_sack_total, moved_two); 9332 } 9333 /* 9334 * Now we need to setup for the next 9335 * round. First we make sure we won't 9336 * exceed the size of our uint32_t on 9337 * the various counts, and then clear out 9338 * moved_two. 
9339 */ 9340 if ((rack->r_ctl.sack_moved_extra > 0xfff00000) || 9341 (rack->r_ctl.sack_noextra_move > 0xfff00000)) { 9342 rack->r_ctl.sack_moved_extra /= 2; 9343 rack->r_ctl.sack_noextra_move /= 2; 9344 } 9345 if (rack->r_ctl.sack_count > 0xfff00000) { 9346 rack->r_ctl.ack_count /= 2; 9347 rack->r_ctl.sack_count /= 2; 9348 } 9349 moved_two = 0; 9350 } 9351 out_with_totals: 9352 if (num_sack_blks > 1) { 9353 /* 9354 * You get an extra stroke if 9355 * you have more than one sack-blk, this 9356 * could be where we are skipping forward 9357 * and the sack-filter is still working, or 9358 * it could be an attacker constantly 9359 * moving us. 9360 */ 9361 rack->r_ctl.sack_moved_extra++; 9362 counter_u64_add(rack_move_some, 1); 9363 } 9364 out: 9365 #ifdef NETFLIX_EXP_DETECTION 9366 rack_do_detection(tp, rack, BYTES_THIS_ACK(tp, th), ctf_fixed_maxseg(rack->rc_tp)); 9367 #endif 9368 if (changed) { 9369 /* Something changed cancel the rack timer */ 9370 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 9371 } 9372 tsused = tcp_get_usecs(NULL); 9373 rsm = tcp_rack_output(tp, rack, tsused); 9374 if ((!IN_FASTRECOVERY(tp->t_flags)) && 9375 rsm) { 9376 /* Enter recovery */ 9377 rack->r_ctl.rc_rsm_start = rsm->r_start; 9378 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 9379 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 9380 entered_recovery = 1; 9381 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una); 9382 /* 9383 * When we enter recovery we need to assure we send 9384 * one packet. 9385 */ 9386 if (rack->rack_no_prr == 0) { 9387 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 9388 rack_log_to_prr(rack, 8, 0); 9389 } 9390 rack->r_timer_override = 1; 9391 rack->r_early = 0; 9392 rack->r_ctl.rc_agg_early = 0; 9393 } else if (IN_FASTRECOVERY(tp->t_flags) && 9394 rsm && 9395 (rack->r_rr_config == 3)) { 9396 /* 9397 * Assure we can output and we get no 9398 * remembered pace time except the retransmit. 9399 */ 9400 rack->r_timer_override = 1; 9401 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 9402 rack->r_ctl.rc_resend = rsm; 9403 } 9404 if (IN_FASTRECOVERY(tp->t_flags) && 9405 (rack->rack_no_prr == 0) && 9406 (entered_recovery == 0)) { 9407 rack_update_prr(tp, rack, changed, th_ack); 9408 if ((rsm && (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) && 9409 ((rack->rc_inp->inp_in_hpts == 0) && 9410 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)))) { 9411 /* 9412 * If you are pacing output you don't want 9413 * to override. 9414 */ 9415 rack->r_early = 0; 9416 rack->r_ctl.rc_agg_early = 0; 9417 rack->r_timer_override = 1; 9418 } 9419 } 9420 } 9421 9422 static void 9423 rack_strike_dupack(struct tcp_rack *rack) 9424 { 9425 struct rack_sendmap *rsm; 9426 9427 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 9428 while (rsm && (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { 9429 rsm = TAILQ_NEXT(rsm, r_tnext); 9430 } 9431 if (rsm && (rsm->r_dupack < 0xff)) { 9432 rsm->r_dupack++; 9433 if (rsm->r_dupack >= DUP_ACK_THRESHOLD) { 9434 struct timeval tv; 9435 uint32_t cts; 9436 /* 9437 * Here we see if we need to retransmit. For 9438 * a SACK type connection if enough time has passed 9439 * we will get a return of the rsm. For a non-sack 9440 * connection we will get the rsm returned if the 9441 * dupack value is 3 or more. 
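 *
 * i.e. tcp_rack_output() applies the RACK time threshold when
 * SACK is available and otherwise falls back to the classic
 * three-dup-ack rule (DUP_ACK_THRESHOLD); a non-NULL return is
 * the segment to resend.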
9442 */ 9443 cts = tcp_get_usecs(&tv); 9444 rack->r_ctl.rc_resend = tcp_rack_output(rack->rc_tp, rack, cts); 9445 if (rack->r_ctl.rc_resend != NULL) { 9446 if (!IN_FASTRECOVERY(rack->rc_tp->t_flags)) { 9447 rack_cong_signal(rack->rc_tp, CC_NDUPACK, 9448 rack->rc_tp->snd_una); 9449 } 9450 rack->r_wanted_output = 1; 9451 rack->r_timer_override = 1; 9452 rack_log_retran_reason(rack, rsm, __LINE__, 1, 3); 9453 } 9454 } else { 9455 rack_log_retran_reason(rack, rsm, __LINE__, 0, 3); 9456 } 9457 } 9458 } 9459 9460 static void 9461 rack_check_bottom_drag(struct tcpcb *tp, 9462 struct tcp_rack *rack, 9463 struct socket *so, int32_t acked) 9464 { 9465 uint32_t segsiz, minseg; 9466 9467 segsiz = ctf_fixed_maxseg(tp); 9468 minseg = segsiz; 9469 9470 if (tp->snd_max == tp->snd_una) { 9471 /* 9472 * We are doing dynamic pacing and we are way 9473 * under. Basically everything got acked while 9474 * we were still waiting on the pacer to expire. 9475 * 9476 * This means we need to boost the b/w in 9477 * addition to any earlier boosting of 9478 * the multipler. 9479 */ 9480 rack->rc_dragged_bottom = 1; 9481 rack_validate_multipliers_at_or_above100(rack); 9482 /* 9483 * Lets use the segment bytes acked plus 9484 * the lowest RTT seen as the basis to 9485 * form a b/w estimate. This will be off 9486 * due to the fact that the true estimate 9487 * should be around 1/2 the time of the RTT 9488 * but we can settle for that. 9489 */ 9490 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_VALID) && 9491 acked) { 9492 uint64_t bw, calc_bw, rtt; 9493 9494 rtt = rack->r_ctl.rack_rs.rs_us_rtt; 9495 if (rtt == 0) { 9496 /* no us sample is there a ms one? */ 9497 if (rack->r_ctl.rack_rs.rs_rtt_lowest) { 9498 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; 9499 } else { 9500 goto no_measurement; 9501 } 9502 } 9503 bw = acked; 9504 calc_bw = bw * 1000000; 9505 calc_bw /= rtt; 9506 if (rack->r_ctl.last_max_bw && 9507 (rack->r_ctl.last_max_bw < calc_bw)) { 9508 /* 9509 * If we have a last calculated max bw 9510 * enforce it. 9511 */ 9512 calc_bw = rack->r_ctl.last_max_bw; 9513 } 9514 /* now plop it in */ 9515 if (rack->rc_gp_filled == 0) { 9516 if (calc_bw > ONE_POINT_TWO_MEG) { 9517 /* 9518 * If we have no measurement 9519 * don't let us set in more than 9520 * 1.2Mbps. If we are still too 9521 * low after pacing with this we 9522 * will hopefully have a max b/w 9523 * available to sanity check things. 9524 */ 9525 calc_bw = ONE_POINT_TWO_MEG; 9526 } 9527 rack->r_ctl.rc_rtt_diff = 0; 9528 rack->r_ctl.gp_bw = calc_bw; 9529 rack->rc_gp_filled = 1; 9530 if (rack->r_ctl.num_measurements < RACK_REQ_AVG) 9531 rack->r_ctl.num_measurements = RACK_REQ_AVG; 9532 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 9533 } else if (calc_bw > rack->r_ctl.gp_bw) { 9534 rack->r_ctl.rc_rtt_diff = 0; 9535 if (rack->r_ctl.num_measurements < RACK_REQ_AVG) 9536 rack->r_ctl.num_measurements = RACK_REQ_AVG; 9537 rack->r_ctl.gp_bw = calc_bw; 9538 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 9539 } else 9540 rack_increase_bw_mul(rack, -1, 0, 0, 1); 9541 if ((rack->gp_ready == 0) && 9542 (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) { 9543 /* We have enough measurements now */ 9544 rack->gp_ready = 1; 9545 rack_set_cc_pacing(rack); 9546 if (rack->defer_options) 9547 rack_apply_deferred_options(rack); 9548 } 9549 /* 9550 * For acks over 1mss we do a extra boost to simulate 9551 * where we would get 2 acks (we want 110 for the mul). 
9552 */ 9553 if (acked > segsiz) 9554 rack_increase_bw_mul(rack, -1, 0, 0, 1); 9555 } else { 9556 /* 9557 * zero rtt possibly?, settle for just an old increase. 9558 */ 9559 no_measurement: 9560 rack_increase_bw_mul(rack, -1, 0, 0, 1); 9561 } 9562 } else if ((IN_FASTRECOVERY(tp->t_flags) == 0) && 9563 (sbavail(&so->so_snd) > max((segsiz * (4 + rack_req_segs)), 9564 minseg)) && 9565 (rack->r_ctl.cwnd_to_use > max((segsiz * (rack_req_segs + 2)), minseg)) && 9566 (tp->snd_wnd > max((segsiz * (rack_req_segs + 2)), minseg)) && 9567 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) <= 9568 (segsiz * rack_req_segs))) { 9569 /* 9570 * We are doing dynamic GP pacing and 9571 * we have everything except 1MSS or less 9572 * bytes left out. We are still pacing away. 9573 * And there is data that could be sent, This 9574 * means we are inserting delayed ack time in 9575 * our measurements because we are pacing too slow. 9576 */ 9577 rack_validate_multipliers_at_or_above100(rack); 9578 rack->rc_dragged_bottom = 1; 9579 rack_increase_bw_mul(rack, -1, 0, 0, 1); 9580 } 9581 } 9582 9583 9584 9585 static void 9586 rack_gain_for_fastoutput(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t acked_amount) 9587 { 9588 /* 9589 * The fast output path is enabled and we 9590 * have moved the cumack forward. Lets see if 9591 * we can expand forward the fast path length by 9592 * that amount. What we would ideally like to 9593 * do is increase the number of bytes in the 9594 * fast path block (left_to_send) by the 9595 * acked amount. However we have to gate that 9596 * by two factors: 9597 * 1) The amount outstanding and the rwnd of the peer 9598 * (i.e. we don't want to exceed the rwnd of the peer). 9599 * <and> 9600 * 2) The amount of data left in the socket buffer (i.e. 9601 * we can't send beyond what is in the buffer). 9602 * 9603 * Note that this does not take into account any increase 9604 * in the cwnd. We will only extend the fast path by 9605 * what was acked. 9606 */ 9607 uint32_t new_total, gating_val; 9608 9609 new_total = acked_amount + rack->r_ctl.fsb.left_to_send; 9610 gating_val = min((sbavail(&so->so_snd) - (tp->snd_max - tp->snd_una)), 9611 (tp->snd_wnd - (tp->snd_max - tp->snd_una))); 9612 if (new_total <= gating_val) { 9613 /* We can increase left_to_send by the acked amount */ 9614 counter_u64_add(rack_extended_rfo, 1); 9615 rack->r_ctl.fsb.left_to_send = new_total; 9616 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(&rack->rc_inp->inp_socket->so_snd) - (tp->snd_max - tp->snd_una))), 9617 ("rack:%p left_to_send:%u sbavail:%u out:%u", 9618 rack, rack->r_ctl.fsb.left_to_send, 9619 sbavail(&rack->rc_inp->inp_socket->so_snd), 9620 (tp->snd_max - tp->snd_una))); 9621 9622 } 9623 } 9624 9625 static void 9626 rack_adjust_sendmap(struct tcp_rack *rack, struct sockbuf *sb, tcp_seq snd_una) 9627 { 9628 /* 9629 * Here any sendmap entry that points to the 9630 * beginning mbuf must be adjusted to the correct 9631 * offset. This must be called with: 9632 * 1) The socket buffer locked 9633 * 2) snd_una adjusted to its new postion. 9634 * 9635 * Note that (2) implies rack_ack_received has also 9636 * been called. 9637 * 9638 * We grab the first mbuf in the socket buffer and 9639 * then go through the front of the sendmap, recalculating 9640 * the stored offset for any sendmap entry that has 9641 * that mbuf. We must use the sb functions to do this 9642 * since its possible an add was done has well as 9643 * the subtraction we may have just completed. 
This should 9644 * not be a penalty though, since we just referenced the sb 9645 * to go in and trim off the mbufs that we freed (of course 9646 * there will be a penalty for the sendmap references though). 9647 */ 9648 struct mbuf *m; 9649 struct rack_sendmap *rsm; 9650 9651 SOCKBUF_LOCK_ASSERT(sb); 9652 m = sb->sb_mb; 9653 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 9654 if ((rsm == NULL) || (m == NULL)) { 9655 /* Nothing outstanding */ 9656 return; 9657 } 9658 while (rsm->m && (rsm->m == m)) { 9659 /* one to adjust */ 9660 #ifdef INVARIANTS 9661 struct mbuf *tm; 9662 uint32_t soff; 9663 9664 tm = sbsndmbuf(sb, (rsm->r_start - snd_una), &soff); 9665 if (rsm->orig_m_len != m->m_len) { 9666 rack_adjust_orig_mlen(rsm); 9667 } 9668 if (rsm->soff != soff) { 9669 /* 9670 * This is not a fatal error, we anticipate it 9671 * might happen (the else code), so we count it here 9672 * so that under invariant we can see that it really 9673 * does happen. 9674 */ 9675 counter_u64_add(rack_adjust_map_bw, 1); 9676 } 9677 rsm->m = tm; 9678 rsm->soff = soff; 9679 if (tm) 9680 rsm->orig_m_len = rsm->m->m_len; 9681 else 9682 rsm->orig_m_len = 0; 9683 #else 9684 rsm->m = sbsndmbuf(sb, (rsm->r_start - snd_una), &rsm->soff); 9685 if (rsm->m) 9686 rsm->orig_m_len = rsm->m->m_len; 9687 else 9688 rsm->orig_m_len = 0; 9689 #endif 9690 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, 9691 rsm); 9692 if (rsm == NULL) 9693 break; 9694 } 9695 } 9696 9697 /* 9698 * Return value of 1, we do not need to call rack_process_data(). 9699 * return value of 0, rack_process_data can be called. 9700 * For ret_val if its 0 the TCP is locked, if its non-zero 9701 * its unlocked and probably unsafe to touch the TCB. 9702 */ 9703 static int 9704 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, 9705 struct tcpcb *tp, struct tcpopt *to, 9706 uint32_t tiwin, int32_t tlen, 9707 int32_t * ofia, int32_t thflags, int32_t *ret_val) 9708 { 9709 int32_t ourfinisacked = 0; 9710 int32_t nsegs, acked_amount; 9711 int32_t acked; 9712 struct mbuf *mfree; 9713 struct tcp_rack *rack; 9714 int32_t under_pacing = 0; 9715 int32_t recovery = 0; 9716 9717 rack = (struct tcp_rack *)tp->t_fb_ptr; 9718 if (SEQ_GT(th->th_ack, tp->snd_max)) { 9719 __ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val, 9720 &rack->r_ctl.challenge_ack_ts, 9721 &rack->r_ctl.challenge_ack_cnt); 9722 rack->r_wanted_output = 1; 9723 return (1); 9724 } 9725 if (rack->gp_ready && 9726 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 9727 under_pacing = 1; 9728 } 9729 if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { 9730 int in_rec, dup_ack_struck = 0; 9731 9732 in_rec = IN_FASTRECOVERY(tp->t_flags); 9733 if (rack->rc_in_persist) { 9734 tp->t_rxtshift = 0; 9735 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 9736 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 9737 } 9738 if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd)) { 9739 rack_strike_dupack(rack); 9740 dup_ack_struck = 1; 9741 } 9742 rack_log_ack(tp, to, th, ((in_rec == 0) && IN_FASTRECOVERY(tp->t_flags)), dup_ack_struck); 9743 } 9744 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 9745 /* 9746 * Old ack, behind (or duplicate to) the last one rcv'd 9747 * Note: We mark reordering is occuring if its 9748 * less than and we have not closed our window. 
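 *
 * An ack strictly below snd_una, arriving while our own
 * receive window is still open, was most likely delivered out
 * of order by the network, so it is noted as a reordering hint.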
9749 */ 9750 if (SEQ_LT(th->th_ack, tp->snd_una) && (sbspace(&so->so_rcv) > ctf_fixed_maxseg(tp))) { 9751 counter_u64_add(rack_reorder_seen, 1); 9752 rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 9753 } 9754 return (0); 9755 } 9756 /* 9757 * If we reach this point, ACK is not a duplicate, i.e., it ACKs 9758 * something we sent. 9759 */ 9760 if (tp->t_flags & TF_NEEDSYN) { 9761 /* 9762 * T/TCP: Connection was half-synchronized, and our SYN has 9763 * been ACK'd (so connection is now fully synchronized). Go 9764 * to non-starred state, increment snd_una for ACK of SYN, 9765 * and check if we can do window scaling. 9766 */ 9767 tp->t_flags &= ~TF_NEEDSYN; 9768 tp->snd_una++; 9769 /* Do window scaling? */ 9770 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 9771 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 9772 tp->rcv_scale = tp->request_r_scale; 9773 /* Send window already scaled. */ 9774 } 9775 } 9776 nsegs = max(1, m->m_pkthdr.lro_nsegs); 9777 INP_WLOCK_ASSERT(tp->t_inpcb); 9778 9779 acked = BYTES_THIS_ACK(tp, th); 9780 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 9781 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 9782 /* 9783 * If we just performed our first retransmit, and the ACK arrives 9784 * within our recovery window, then it was a mistake to do the 9785 * retransmit in the first place. Recover our original cwnd and 9786 * ssthresh, and proceed to transmit where we left off. 9787 */ 9788 if ((tp->t_flags & TF_PREVVALID) && 9789 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) { 9790 tp->t_flags &= ~TF_PREVVALID; 9791 if (tp->t_rxtshift == 1 && 9792 (int)(ticks - tp->t_badrxtwin) < 0) 9793 rack_cong_signal(tp, CC_RTO_ERR, th->th_ack); 9794 } 9795 if (acked) { 9796 /* assure we are not backed off */ 9797 tp->t_rxtshift = 0; 9798 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 9799 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 9800 rack->rc_tlp_in_progress = 0; 9801 rack->r_ctl.rc_tlp_cnt_out = 0; 9802 /* 9803 * If it is the RXT timer we want to 9804 * stop it, so we can restart a TLP. 9805 */ 9806 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 9807 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 9808 #ifdef NETFLIX_HTTP_LOGGING 9809 tcp_http_check_for_comp(rack->rc_tp, th->th_ack); 9810 #endif 9811 } 9812 /* 9813 * If we have a timestamp reply, update smoothed round trip time. If 9814 * no timestamp is present but transmit timer is running and timed 9815 * sequence number was acked, update smoothed round trip time. Since 9816 * we now have an rtt measurement, cancel the timer backoff (cf., 9817 * Phil Karn's retransmit alg.). Recompute the initial retransmit 9818 * timer. 9819 * 9820 * Some boxes send broken timestamp replies during the SYN+ACK 9821 * phase, ignore timestamps of 0 or we could calculate a huge RTT 9822 * and blow up the retransmit timer. 9823 */ 9824 /* 9825 * If all outstanding data is acked, stop retransmit timer and 9826 * remember to restart (more output or persist). If there is more 9827 * data to be acked, restart retransmit timer, using current 9828 * (possibly backed-off) value. 9829 */ 9830 if (acked == 0) { 9831 if (ofia) 9832 *ofia = ourfinisacked; 9833 return (0); 9834 } 9835 if (IN_RECOVERY(tp->t_flags)) { 9836 if (SEQ_LT(th->th_ack, tp->snd_recover) && 9837 (SEQ_LT(th->th_ack, tp->snd_max))) { 9838 tcp_rack_partialack(tp); 9839 } else { 9840 rack_post_recovery(tp, th->th_ack); 9841 recovery = 1; 9842 } 9843 } 9844 /* 9845 * Let the congestion control algorithm update congestion control 9846 * related information. 
This typically means increasing the 9847 * congestion window. 9848 */ 9849 rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, recovery); 9850 SOCKBUF_LOCK(&so->so_snd); 9851 acked_amount = min(acked, (int)sbavail(&so->so_snd)); 9852 tp->snd_wnd -= acked_amount; 9853 mfree = sbcut_locked(&so->so_snd, acked_amount); 9854 if ((sbused(&so->so_snd) == 0) && 9855 (acked > acked_amount) && 9856 (tp->t_state >= TCPS_FIN_WAIT_1) && 9857 (tp->t_flags & TF_SENTFIN)) { 9858 /* 9859 * We must be sure our fin 9860 * was sent and acked (we can be 9861 * in FIN_WAIT_1 without having 9862 * sent the fin). 9863 */ 9864 ourfinisacked = 1; 9865 } 9866 tp->snd_una = th->th_ack; 9867 if (acked_amount && sbavail(&so->so_snd)) 9868 rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una); 9869 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); 9870 /* NB: sowwakeup_locked() does an implicit unlock. */ 9871 sowwakeup_locked(so); 9872 m_freem(mfree); 9873 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 9874 tp->snd_recover = tp->snd_una; 9875 9876 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) { 9877 tp->snd_nxt = tp->snd_una; 9878 } 9879 if (under_pacing && 9880 (rack->use_fixed_rate == 0) && 9881 (rack->in_probe_rtt == 0) && 9882 rack->rc_gp_dyn_mul && 9883 rack->rc_always_pace) { 9884 /* Check if we are dragging bottom */ 9885 rack_check_bottom_drag(tp, rack, so, acked); 9886 } 9887 if (tp->snd_una == tp->snd_max) { 9888 /* Nothing left outstanding */ 9889 tp->t_flags &= ~TF_PREVVALID; 9890 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 9891 rack->r_ctl.retran_during_recovery = 0; 9892 rack->r_ctl.dsack_byte_cnt = 0; 9893 if (rack->r_ctl.rc_went_idle_time == 0) 9894 rack->r_ctl.rc_went_idle_time = 1; 9895 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 9896 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) 9897 tp->t_acktime = 0; 9898 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 9899 /* Set need output so persist might get set */ 9900 rack->r_wanted_output = 1; 9901 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 9902 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 9903 (sbavail(&so->so_snd) == 0) && 9904 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 9905 /* 9906 * The socket was gone and the 9907 * peer sent data (now or in the past), time to 9908 * reset him. 9909 */ 9910 *ret_val = 1; 9911 /* tcp_close will kill the inp pre-log the Reset */ 9912 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 9913 tp = tcp_close(tp); 9914 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); 9915 return (1); 9916 } 9917 } 9918 if (ofia) 9919 *ofia = ourfinisacked; 9920 return (0); 9921 } 9922 9923 static void 9924 rack_collapsed_window(struct tcp_rack *rack) 9925 { 9926 /* 9927 * Now we must walk the 9928 * send map and divide the 9929 * ones left stranded. These 9930 * guys can't cause us to abort 9931 * the connection and are really 9932 * "unsent". However if a buggy 9933 * client actually did keep some 9934 * of the data i.e. collapsed the win 9935 * and refused to ack and then opened 9936 * the win and acked that data. We would 9937 * get into an ack war, the simplier 9938 * method then of just pretending we 9939 * did not send those segments something 9940 * won't work. 
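 *
 * Instead, every rsm at or beyond the new right edge of the
 * offered window is tagged RACK_RWND_COLLAPSED below.  The
 * entries stay on the scoreboard, and rack_un_collapse_window()
 * simply strips the tag again once the peer re-opens the
 * window.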
9941 */ 9942 struct rack_sendmap *rsm, *nrsm, fe, *insret; 9943 tcp_seq max_seq; 9944 9945 max_seq = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd; 9946 memset(&fe, 0, sizeof(fe)); 9947 fe.r_start = max_seq; 9948 /* Find the first seq past or at maxseq */ 9949 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 9950 if (rsm == NULL) { 9951 /* Nothing to do strange */ 9952 rack->rc_has_collapsed = 0; 9953 return; 9954 } 9955 /* 9956 * Now do we need to split at 9957 * the collapse point? 9958 */ 9959 if (SEQ_GT(max_seq, rsm->r_start)) { 9960 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 9961 if (nrsm == NULL) { 9962 /* We can't get a rsm, mark all? */ 9963 nrsm = rsm; 9964 goto no_split; 9965 } 9966 /* Clone it */ 9967 rack_clone_rsm(rack, nrsm, rsm, max_seq); 9968 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 9969 #ifdef INVARIANTS 9970 if (insret != NULL) { 9971 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 9972 nrsm, insret, rack, rsm); 9973 } 9974 #endif 9975 rack_log_map_chg(rack->rc_tp, rack, NULL, rsm, nrsm, MAP_SPLIT, max_seq, __LINE__); 9976 if (rsm->r_in_tmap) { 9977 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 9978 nrsm->r_in_tmap = 1; 9979 } 9980 /* 9981 * Set in the new RSM as the 9982 * collapsed starting point 9983 */ 9984 rsm = nrsm; 9985 } 9986 no_split: 9987 counter_u64_add(rack_collapsed_win, 1); 9988 RB_FOREACH_FROM(nrsm, rack_rb_tree_head, rsm) { 9989 nrsm->r_flags |= RACK_RWND_COLLAPSED; 9990 } 9991 rack->rc_has_collapsed = 1; 9992 } 9993 9994 static void 9995 rack_un_collapse_window(struct tcp_rack *rack) 9996 { 9997 struct rack_sendmap *rsm; 9998 9999 RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 10000 if (rsm->r_flags & RACK_RWND_COLLAPSED) 10001 rsm->r_flags &= ~RACK_RWND_COLLAPSED; 10002 else 10003 break; 10004 } 10005 rack->rc_has_collapsed = 0; 10006 } 10007 10008 static void 10009 rack_handle_delayed_ack(struct tcpcb *tp, struct tcp_rack *rack, 10010 int32_t tlen, int32_t tfo_syn) 10011 { 10012 if (DELAY_ACK(tp, tlen) || tfo_syn) { 10013 if (rack->rc_dack_mode && 10014 (tlen > 500) && 10015 (rack->rc_dack_toggle == 1)) { 10016 goto no_delayed_ack; 10017 } 10018 rack_timer_cancel(tp, rack, 10019 rack->r_ctl.rc_rcvtime, __LINE__); 10020 tp->t_flags |= TF_DELACK; 10021 } else { 10022 no_delayed_ack: 10023 rack->r_wanted_output = 1; 10024 tp->t_flags |= TF_ACKNOW; 10025 if (rack->rc_dack_mode) { 10026 if (tp->t_flags & TF_DELACK) 10027 rack->rc_dack_toggle = 1; 10028 else 10029 rack->rc_dack_toggle = 0; 10030 } 10031 } 10032 } 10033 10034 static void 10035 rack_validate_fo_sendwin_up(struct tcpcb *tp, struct tcp_rack *rack) 10036 { 10037 /* 10038 * If fast output is in progress, lets validate that 10039 * the new window did not shrink on us and make it 10040 * so fast output should end. 10041 */ 10042 if (rack->r_fast_output) { 10043 uint32_t out; 10044 10045 /* 10046 * Calculate what we will send if left as is 10047 * and compare that to our send window. 
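/*
 * Editorial sketch, not part of the stack: the decision made in
 * rack_handle_delayed_ack() above can be read as a single predicate.  When
 * the alternate "dack" mode is enabled and its toggle is set, a segment of
 * more than 500 bytes is not delayed, which roughly amounts to ACKing at
 * least every other large in-order segment immediately.  The parameter
 * names are hypothetical stand-ins for DELAY_ACK(), rc_dack_mode and
 * rc_dack_toggle.
 */
static inline int		/* 1 = delay the ACK, 0 = ACK now */
sketch_should_delay_ack(int delay_ack_ok, int tfo_syn, int dack_mode,
    int32_t seg_len, int dack_toggle_set)
{
	if (!(delay_ack_ok || tfo_syn))
		return (0);
	if (dack_mode && seg_len > 500 && dack_toggle_set)
		return (0);	/* force the alternating immediate ACK */
	return (1);
}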
10048 */ 10049 out = ctf_outstanding(tp); 10050 if ((out + rack->r_ctl.fsb.left_to_send) > tp->snd_wnd) { 10051 /* ok we have an issue */ 10052 if (out >= tp->snd_wnd) { 10053 /* Turn off fast output the window is met or collapsed */ 10054 rack->r_fast_output = 0; 10055 } else { 10056 /* we have some room left */ 10057 rack->r_ctl.fsb.left_to_send = tp->snd_wnd - out; 10058 if (rack->r_ctl.fsb.left_to_send < ctf_fixed_maxseg(tp)) { 10059 /* If not at least 1 full segment never mind */ 10060 rack->r_fast_output = 0; 10061 } 10062 } 10063 } 10064 } 10065 } 10066 10067 10068 /* 10069 * Return value of 1, the TCB is unlocked and most 10070 * likely gone, return value of 0, the TCP is still 10071 * locked. 10072 */ 10073 static int 10074 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, 10075 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 10076 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 10077 { 10078 /* 10079 * Update window information. Don't look at window if no ACK: TAC's 10080 * send garbage on first SYN. 10081 */ 10082 int32_t nsegs; 10083 int32_t tfo_syn; 10084 struct tcp_rack *rack; 10085 10086 rack = (struct tcp_rack *)tp->t_fb_ptr; 10087 INP_WLOCK_ASSERT(tp->t_inpcb); 10088 nsegs = max(1, m->m_pkthdr.lro_nsegs); 10089 if ((thflags & TH_ACK) && 10090 (SEQ_LT(tp->snd_wl1, th->th_seq) || 10091 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 10092 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 10093 /* keep track of pure window updates */ 10094 if (tlen == 0 && 10095 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 10096 KMOD_TCPSTAT_INC(tcps_rcvwinupd); 10097 tp->snd_wnd = tiwin; 10098 rack_validate_fo_sendwin_up(tp, rack); 10099 tp->snd_wl1 = th->th_seq; 10100 tp->snd_wl2 = th->th_ack; 10101 if (tp->snd_wnd > tp->max_sndwnd) 10102 tp->max_sndwnd = tp->snd_wnd; 10103 rack->r_wanted_output = 1; 10104 } else if (thflags & TH_ACK) { 10105 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { 10106 tp->snd_wnd = tiwin; 10107 rack_validate_fo_sendwin_up(tp, rack); 10108 tp->snd_wl1 = th->th_seq; 10109 tp->snd_wl2 = th->th_ack; 10110 } 10111 } 10112 if (tp->snd_wnd < ctf_outstanding(tp)) 10113 /* The peer collapsed the window */ 10114 rack_collapsed_window(rack); 10115 else if (rack->rc_has_collapsed) 10116 rack_un_collapse_window(rack); 10117 /* Was persist timer active and now we have window space? */ 10118 if ((rack->rc_in_persist != 0) && 10119 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 10120 rack->r_ctl.rc_pace_min_segs))) { 10121 rack_exit_persist(tp, rack, rack->r_ctl.rc_rcvtime); 10122 tp->snd_nxt = tp->snd_max; 10123 /* Make sure we output to start the timer */ 10124 rack->r_wanted_output = 1; 10125 } 10126 /* Do we enter persists? */ 10127 if ((rack->rc_in_persist == 0) && 10128 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 10129 TCPS_HAVEESTABLISHED(tp->t_state) && 10130 (tp->snd_max == tp->snd_una) && 10131 sbavail(&tp->t_inpcb->inp_socket->so_snd) && 10132 (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { 10133 /* 10134 * Here the rwnd is less than 10135 * the pacing size, we are established, 10136 * nothing is outstanding, and there is 10137 * data to send. Enter persists. 10138 */ 10139 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 10140 } 10141 if (tp->t_flags2 & TF2_DROP_AF_DATA) { 10142 m_freem(m); 10143 return (0); 10144 } 10145 /* 10146 * don't process the URG bit, ignore them drag 10147 * along the up. 
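/*
 * Editorial sketch, not part of the stack: the persist-exit and
 * persist-enter tests above share one threshold, the smaller of half the
 * largest window the peer has ever advertised and one pacing segment.
 * Standalone renderings, with hypothetical parameter names:
 */
static inline uint32_t
sketch_persist_threshold(uint32_t high_rwnd, uint32_t pace_min_seg)
{
	return (high_rwnd / 2 < pace_min_seg ? high_rwnd / 2 : pace_min_seg);
}

static inline int
sketch_should_enter_persist(uint32_t snd_wnd, uint32_t threshold,
    int established, int nothing_outstanding, uint32_t snd_sb_bytes)
{
	/* Tiny window, nothing in flight, and queued data that won't fit. */
	return (snd_wnd < threshold && established && nothing_outstanding &&
	    snd_sb_bytes > 0 && snd_sb_bytes > snd_wnd);
}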
10148 */ 10149 tp->rcv_up = tp->rcv_nxt; 10150 INP_WLOCK_ASSERT(tp->t_inpcb); 10151 10152 /* 10153 * Process the segment text, merging it into the TCP sequencing 10154 * queue, and arranging for acknowledgment of receipt if necessary. 10155 * This process logically involves adjusting tp->rcv_wnd as data is 10156 * presented to the user (this happens in tcp_usrreq.c, case 10157 * PRU_RCVD). If a FIN has already been received on this connection 10158 * then we just ignore the text. 10159 */ 10160 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && 10161 IS_FASTOPEN(tp->t_flags)); 10162 if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) && 10163 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 10164 tcp_seq save_start = th->th_seq; 10165 tcp_seq save_rnxt = tp->rcv_nxt; 10166 int save_tlen = tlen; 10167 10168 m_adj(m, drop_hdrlen); /* delayed header drop */ 10169 /* 10170 * Insert segment which includes th into TCP reassembly 10171 * queue with control block tp. Set thflags to whether 10172 * reassembly now includes a segment with FIN. This handles 10173 * the common case inline (segment is the next to be 10174 * received on an established connection, and the queue is 10175 * empty), avoiding linkage into and removal from the queue 10176 * and repetition of various conversions. Set DELACK for 10177 * segments received in order, but ack immediately when 10178 * segments are out of order (so fast retransmit can work). 10179 */ 10180 if (th->th_seq == tp->rcv_nxt && 10181 SEGQ_EMPTY(tp) && 10182 (TCPS_HAVEESTABLISHED(tp->t_state) || 10183 tfo_syn)) { 10184 #ifdef NETFLIX_SB_LIMITS 10185 u_int mcnt, appended; 10186 10187 if (so->so_rcv.sb_shlim) { 10188 mcnt = m_memcnt(m); 10189 appended = 0; 10190 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 10191 CFO_NOSLEEP, NULL) == false) { 10192 counter_u64_add(tcp_sb_shlim_fails, 1); 10193 m_freem(m); 10194 return (0); 10195 } 10196 } 10197 #endif 10198 rack_handle_delayed_ack(tp, rack, tlen, tfo_syn); 10199 tp->rcv_nxt += tlen; 10200 if (tlen && 10201 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 10202 (tp->t_fbyte_in == 0)) { 10203 tp->t_fbyte_in = ticks; 10204 if (tp->t_fbyte_in == 0) 10205 tp->t_fbyte_in = 1; 10206 if (tp->t_fbyte_out && tp->t_fbyte_in) 10207 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 10208 } 10209 thflags = th->th_flags & TH_FIN; 10210 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 10211 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 10212 SOCKBUF_LOCK(&so->so_rcv); 10213 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 10214 m_freem(m); 10215 } else 10216 #ifdef NETFLIX_SB_LIMITS 10217 appended = 10218 #endif 10219 sbappendstream_locked(&so->so_rcv, m, 0); 10220 10221 rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1); 10222 /* NB: sorwakeup_locked() does an implicit unlock. */ 10223 sorwakeup_locked(so); 10224 #ifdef NETFLIX_SB_LIMITS 10225 if (so->so_rcv.sb_shlim && appended != mcnt) 10226 counter_fo_release(so->so_rcv.sb_shlim, 10227 mcnt - appended); 10228 #endif 10229 } else { 10230 /* 10231 * XXX: Due to the header drop above "th" is 10232 * theoretically invalid by now. Fortunately 10233 * m_adj() doesn't actually frees any mbufs when 10234 * trimming from the head. 10235 */ 10236 tcp_seq temp = save_start; 10237 10238 thflags = tcp_reass(tp, th, &temp, &tlen, m); 10239 tp->t_flags |= TF_ACKNOW; 10240 if (tp->t_flags & TF_WAKESOR) { 10241 tp->t_flags &= ~TF_WAKESOR; 10242 /* NB: sorwakeup_locked() does an implicit unlock. 
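/*
 * Editorial sketch, not part of the stack: the first-byte accounting above
 * (t_fbyte_in) uses zero as its "not set" sentinel, so a tick value that
 * happens to be exactly zero is nudged to one.  The same idiom is used for
 * rc_went_idle_time in the ack paths.
 */
static inline int32_t
sketch_nonzero_stamp(int32_t now_ticks)
{
	/* Zero means "unset"; never record it as a real timestamp. */
	return (now_ticks == 0 ? 1 : now_ticks);
}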
*/ 10243 sorwakeup_locked(so); 10244 } 10245 } 10246 if ((tp->t_flags & TF_SACK_PERMIT) && 10247 (save_tlen > 0) && 10248 TCPS_HAVEESTABLISHED(tp->t_state)) { 10249 if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { 10250 /* 10251 * DSACK actually handled in the fastpath 10252 * above. 10253 */ 10254 RACK_OPTS_INC(tcp_sack_path_1); 10255 tcp_update_sack_list(tp, save_start, 10256 save_start + save_tlen); 10257 } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { 10258 if ((tp->rcv_numsacks >= 1) && 10259 (tp->sackblks[0].end == save_start)) { 10260 /* 10261 * Partial overlap, recorded at todrop 10262 * above. 10263 */ 10264 RACK_OPTS_INC(tcp_sack_path_2a); 10265 tcp_update_sack_list(tp, 10266 tp->sackblks[0].start, 10267 tp->sackblks[0].end); 10268 } else { 10269 RACK_OPTS_INC(tcp_sack_path_2b); 10270 tcp_update_dsack_list(tp, save_start, 10271 save_start + save_tlen); 10272 } 10273 } else if (tlen >= save_tlen) { 10274 /* Update of sackblks. */ 10275 RACK_OPTS_INC(tcp_sack_path_3); 10276 tcp_update_dsack_list(tp, save_start, 10277 save_start + save_tlen); 10278 } else if (tlen > 0) { 10279 RACK_OPTS_INC(tcp_sack_path_4); 10280 tcp_update_dsack_list(tp, save_start, 10281 save_start + tlen); 10282 } 10283 } 10284 } else { 10285 m_freem(m); 10286 thflags &= ~TH_FIN; 10287 } 10288 10289 /* 10290 * If FIN is received ACK the FIN and let the user know that the 10291 * connection is closing. 10292 */ 10293 if (thflags & TH_FIN) { 10294 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 10295 /* The socket upcall is handled by socantrcvmore. */ 10296 socantrcvmore(so); 10297 /* 10298 * If connection is half-synchronized (ie NEEDSYN 10299 * flag on) then delay ACK, so it may be piggybacked 10300 * when SYN is sent. Otherwise, since we received a 10301 * FIN then no more input can be expected, send ACK 10302 * now. 10303 */ 10304 if (tp->t_flags & TF_NEEDSYN) { 10305 rack_timer_cancel(tp, rack, 10306 rack->r_ctl.rc_rcvtime, __LINE__); 10307 tp->t_flags |= TF_DELACK; 10308 } else { 10309 tp->t_flags |= TF_ACKNOW; 10310 } 10311 tp->rcv_nxt++; 10312 } 10313 switch (tp->t_state) { 10314 /* 10315 * In SYN_RECEIVED and ESTABLISHED STATES enter the 10316 * CLOSE_WAIT state. 10317 */ 10318 case TCPS_SYN_RECEIVED: 10319 tp->t_starttime = ticks; 10320 /* FALLTHROUGH */ 10321 case TCPS_ESTABLISHED: 10322 rack_timer_cancel(tp, rack, 10323 rack->r_ctl.rc_rcvtime, __LINE__); 10324 tcp_state_change(tp, TCPS_CLOSE_WAIT); 10325 break; 10326 10327 /* 10328 * If still in FIN_WAIT_1 STATE FIN has not been 10329 * acked so enter the CLOSING state. 10330 */ 10331 case TCPS_FIN_WAIT_1: 10332 rack_timer_cancel(tp, rack, 10333 rack->r_ctl.rc_rcvtime, __LINE__); 10334 tcp_state_change(tp, TCPS_CLOSING); 10335 break; 10336 10337 /* 10338 * In FIN_WAIT_2 state enter the TIME_WAIT state, 10339 * starting the time-wait timer, turning off the 10340 * other standard timers. 10341 */ 10342 case TCPS_FIN_WAIT_2: 10343 rack_timer_cancel(tp, rack, 10344 rack->r_ctl.rc_rcvtime, __LINE__); 10345 tcp_twstart(tp); 10346 return (1); 10347 } 10348 } 10349 /* 10350 * Return any desired output. 10351 */ 10352 if ((tp->t_flags & TF_ACKNOW) || 10353 (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) { 10354 rack->r_wanted_output = 1; 10355 } 10356 INP_WLOCK_ASSERT(tp->t_inpcb); 10357 return (0); 10358 } 10359 10360 /* 10361 * Here nothing is really faster, its just that we 10362 * have broken out the fast-data path also just like 10363 * the fast-ack. 
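/*
 * Editorial sketch, not part of the stack: the fast new-data path that
 * follows only takes the easy case.  Gathered in one place, the segment
 * qualifies when it is the next expected sequence, we are not
 * retransmitting, the advertised window (if non-zero) is unchanged, no
 * SYN/FIN bookkeeping is pending, the timestamp is not old, the ACK moves
 * nothing and the payload fits in the receive buffer; anything else is
 * pushed to the slow path.  Parameter names are hypothetical.
 */
static inline int
sketch_fastnewdata_ok(tcp_seq th_seq, tcp_seq rcv_nxt, tcp_seq snd_nxt,
    tcp_seq snd_max, uint32_t tiwin, uint32_t snd_wnd, int need_syn_or_fin,
    int ts_is_old, tcp_seq th_ack, tcp_seq snd_una, int32_t tlen,
    int32_t rcv_space)
{
	return (th_seq == rcv_nxt && snd_nxt == snd_max &&
	    (tiwin == 0 || tiwin == snd_wnd) && !need_syn_or_fin &&
	    !ts_is_old && th_ack == snd_una && tlen <= rcv_space);
}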
10364 */ 10365 static int 10366 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, 10367 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 10368 uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos) 10369 { 10370 int32_t nsegs; 10371 int32_t newsize = 0; /* automatic sockbuf scaling */ 10372 struct tcp_rack *rack; 10373 #ifdef NETFLIX_SB_LIMITS 10374 u_int mcnt, appended; 10375 #endif 10376 #ifdef TCPDEBUG 10377 /* 10378 * The size of tcp_saveipgen must be the size of the max ip header, 10379 * now IPv6. 10380 */ 10381 u_char tcp_saveipgen[IP6_HDR_LEN]; 10382 struct tcphdr tcp_savetcp; 10383 short ostate = 0; 10384 10385 #endif 10386 /* 10387 * If last ACK falls within this segment's sequence numbers, record 10388 * the timestamp. NOTE that the test is modified according to the 10389 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 10390 */ 10391 if (__predict_false(th->th_seq != tp->rcv_nxt)) { 10392 return (0); 10393 } 10394 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 10395 return (0); 10396 } 10397 if (tiwin && tiwin != tp->snd_wnd) { 10398 return (0); 10399 } 10400 if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) { 10401 return (0); 10402 } 10403 if (__predict_false((to->to_flags & TOF_TS) && 10404 (TSTMP_LT(to->to_tsval, tp->ts_recent)))) { 10405 return (0); 10406 } 10407 if (__predict_false((th->th_ack != tp->snd_una))) { 10408 return (0); 10409 } 10410 if (__predict_false(tlen > sbspace(&so->so_rcv))) { 10411 return (0); 10412 } 10413 if ((to->to_flags & TOF_TS) != 0 && 10414 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 10415 tp->ts_recent_age = tcp_ts_getticks(); 10416 tp->ts_recent = to->to_tsval; 10417 } 10418 rack = (struct tcp_rack *)tp->t_fb_ptr; 10419 /* 10420 * This is a pure, in-sequence data packet with nothing on the 10421 * reassembly queue and we have enough buffer space to take it. 10422 */ 10423 nsegs = max(1, m->m_pkthdr.lro_nsegs); 10424 10425 #ifdef NETFLIX_SB_LIMITS 10426 if (so->so_rcv.sb_shlim) { 10427 mcnt = m_memcnt(m); 10428 appended = 0; 10429 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 10430 CFO_NOSLEEP, NULL) == false) { 10431 counter_u64_add(tcp_sb_shlim_fails, 1); 10432 m_freem(m); 10433 return (1); 10434 } 10435 } 10436 #endif 10437 /* Clean receiver SACK report if present */ 10438 if (tp->rcv_numsacks) 10439 tcp_clean_sackreport(tp); 10440 KMOD_TCPSTAT_INC(tcps_preddat); 10441 tp->rcv_nxt += tlen; 10442 if (tlen && 10443 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 10444 (tp->t_fbyte_in == 0)) { 10445 tp->t_fbyte_in = ticks; 10446 if (tp->t_fbyte_in == 0) 10447 tp->t_fbyte_in = 1; 10448 if (tp->t_fbyte_out && tp->t_fbyte_in) 10449 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 10450 } 10451 /* 10452 * Pull snd_wl1 up to prevent seq wrap relative to th_seq. 10453 */ 10454 tp->snd_wl1 = th->th_seq; 10455 /* 10456 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt. 10457 */ 10458 tp->rcv_up = tp->rcv_nxt; 10459 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 10460 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 10461 #ifdef TCPDEBUG 10462 if (so->so_options & SO_DEBUG) 10463 tcp_trace(TA_INPUT, ostate, tp, 10464 (void *)tcp_saveipgen, &tcp_savetcp, 0); 10465 #endif 10466 newsize = tcp_autorcvbuf(m, th, so, tp, tlen); 10467 10468 /* Add data to socket buffer. */ 10469 SOCKBUF_LOCK(&so->so_rcv); 10470 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 10471 m_freem(m); 10472 } else { 10473 /* 10474 * Set new socket buffer size. Give up when limit is 10475 * reached. 
10476 */ 10477 if (newsize) 10478 if (!sbreserve_locked(&so->so_rcv, 10479 newsize, so, NULL)) 10480 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 10481 m_adj(m, drop_hdrlen); /* delayed header drop */ 10482 #ifdef NETFLIX_SB_LIMITS 10483 appended = 10484 #endif 10485 sbappendstream_locked(&so->so_rcv, m, 0); 10486 ctf_calc_rwin(so, tp); 10487 } 10488 rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1); 10489 /* NB: sorwakeup_locked() does an implicit unlock. */ 10490 sorwakeup_locked(so); 10491 #ifdef NETFLIX_SB_LIMITS 10492 if (so->so_rcv.sb_shlim && mcnt != appended) 10493 counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended); 10494 #endif 10495 rack_handle_delayed_ack(tp, rack, tlen, 0); 10496 if (tp->snd_una == tp->snd_max) 10497 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 10498 return (1); 10499 } 10500 10501 /* 10502 * This subfunction is used to try to highly optimize the 10503 * fast path. We again allow window updates that are 10504 * in sequence to remain in the fast-path. We also add 10505 * in the __predict's to attempt to help the compiler. 10506 * Note that if we return a 0, then we can *not* process 10507 * it and the caller should push the packet into the 10508 * slow-path. 10509 */ 10510 static int 10511 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 10512 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 10513 uint32_t tiwin, int32_t nxt_pkt, uint32_t cts) 10514 { 10515 int32_t acked; 10516 int32_t nsegs; 10517 #ifdef TCPDEBUG 10518 /* 10519 * The size of tcp_saveipgen must be the size of the max ip header, 10520 * now IPv6. 10521 */ 10522 u_char tcp_saveipgen[IP6_HDR_LEN]; 10523 struct tcphdr tcp_savetcp; 10524 short ostate = 0; 10525 #endif 10526 int32_t under_pacing = 0; 10527 struct tcp_rack *rack; 10528 10529 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 10530 /* Old ack, behind (or duplicate to) the last one rcv'd */ 10531 return (0); 10532 } 10533 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { 10534 /* Above what we have sent? */ 10535 return (0); 10536 } 10537 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 10538 /* We are retransmitting */ 10539 return (0); 10540 } 10541 if (__predict_false(tiwin == 0)) { 10542 /* zero window */ 10543 return (0); 10544 } 10545 if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) { 10546 /* We need a SYN or a FIN, unlikely.. */ 10547 return (0); 10548 } 10549 if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { 10550 /* Timestamp is behind .. old ack with seq wrap? */ 10551 return (0); 10552 } 10553 if (__predict_false(IN_RECOVERY(tp->t_flags))) { 10554 /* Still recovering */ 10555 return (0); 10556 } 10557 rack = (struct tcp_rack *)tp->t_fb_ptr; 10558 if (rack->r_ctl.rc_sacked) { 10559 /* We have sack holes on our scoreboard */ 10560 return (0); 10561 } 10562 /* Ok if we reach here, we can process a fast-ack */ 10563 if (rack->gp_ready && 10564 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 10565 under_pacing = 1; 10566 } 10567 nsegs = max(1, m->m_pkthdr.lro_nsegs); 10568 rack_log_ack(tp, to, th, 0, 0); 10569 /* Did the window get updated? */ 10570 if (tiwin != tp->snd_wnd) { 10571 tp->snd_wnd = tiwin; 10572 rack_validate_fo_sendwin_up(tp, rack); 10573 tp->snd_wl1 = th->th_seq; 10574 if (tp->snd_wnd > tp->max_sndwnd) 10575 tp->max_sndwnd = tp->snd_wnd; 10576 } 10577 /* Do we exit persists? 
*/ 10578 if ((rack->rc_in_persist != 0) && 10579 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 10580 rack->r_ctl.rc_pace_min_segs))) { 10581 rack_exit_persist(tp, rack, cts); 10582 } 10583 /* Do we enter persists? */ 10584 if ((rack->rc_in_persist == 0) && 10585 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 10586 TCPS_HAVEESTABLISHED(tp->t_state) && 10587 (tp->snd_max == tp->snd_una) && 10588 sbavail(&tp->t_inpcb->inp_socket->so_snd) && 10589 (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { 10590 /* 10591 * Here the rwnd is less than 10592 * the pacing size, we are established, 10593 * nothing is outstanding, and there is 10594 * data to send. Enter persists. 10595 */ 10596 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 10597 } 10598 /* 10599 * If last ACK falls within this segment's sequence numbers, record 10600 * the timestamp. NOTE that the test is modified according to the 10601 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 10602 */ 10603 if ((to->to_flags & TOF_TS) != 0 && 10604 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 10605 tp->ts_recent_age = tcp_ts_getticks(); 10606 tp->ts_recent = to->to_tsval; 10607 } 10608 /* 10609 * This is a pure ack for outstanding data. 10610 */ 10611 KMOD_TCPSTAT_INC(tcps_predack); 10612 10613 /* 10614 * "bad retransmit" recovery. 10615 */ 10616 if ((tp->t_flags & TF_PREVVALID) && 10617 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) { 10618 tp->t_flags &= ~TF_PREVVALID; 10619 if (tp->t_rxtshift == 1 && 10620 (int)(ticks - tp->t_badrxtwin) < 0) 10621 rack_cong_signal(tp, CC_RTO_ERR, th->th_ack); 10622 } 10623 /* 10624 * Recalculate the transmit timer / rtt. 10625 * 10626 * Some boxes send broken timestamp replies during the SYN+ACK 10627 * phase, ignore timestamps of 0 or we could calculate a huge RTT 10628 * and blow up the retransmit timer. 10629 */ 10630 acked = BYTES_THIS_ACK(tp, th); 10631 10632 #ifdef TCP_HHOOK 10633 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 10634 hhook_run_tcp_est_in(tp, th, to); 10635 #endif 10636 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 10637 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 10638 if (acked) { 10639 struct mbuf *mfree; 10640 10641 rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, 0); 10642 SOCKBUF_LOCK(&so->so_snd); 10643 mfree = sbcut_locked(&so->so_snd, acked); 10644 tp->snd_una = th->th_ack; 10645 /* Note we want to hold the sb lock through the sendmap adjust */ 10646 rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una); 10647 /* Wake up the socket if we have room to write more */ 10648 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); 10649 sowwakeup_locked(so); 10650 m_freem(mfree); 10651 tp->t_rxtshift = 0; 10652 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 10653 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 10654 rack->rc_tlp_in_progress = 0; 10655 rack->r_ctl.rc_tlp_cnt_out = 0; 10656 /* 10657 * If it is the RXT timer we want to 10658 * stop it, so we can restart a TLP. 10659 */ 10660 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 10661 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 10662 #ifdef NETFLIX_HTTP_LOGGING 10663 tcp_http_check_for_comp(rack->rc_tp, th->th_ack); 10664 #endif 10665 } 10666 /* 10667 * Let the congestion control algorithm update congestion control 10668 * related information. This typically means increasing the 10669 * congestion window. 
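/*
 * Editorial sketch, not part of the stack: BYTES_THIS_ACK() used above is
 * plain sequence-space subtraction; unsigned 32-bit arithmetic keeps it
 * correct across sequence wrap as long as the ACK is not more than 2^31
 * bytes ahead of snd_una, which the earlier SEQ_GT(th->th_ack, tp->snd_max)
 * rejection in these paths already rules out.
 */
static inline uint32_t
sketch_bytes_this_ack(uint32_t th_ack, uint32_t snd_una)
{
	/* Wraps modulo 2^32, matching tcp_seq space. */
	return (th_ack - snd_una);
}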
10670 */ 10671 if (tp->snd_wnd < ctf_outstanding(tp)) { 10672 /* The peer collapsed the window */ 10673 rack_collapsed_window(rack); 10674 } else if (rack->rc_has_collapsed) 10675 rack_un_collapse_window(rack); 10676 10677 /* 10678 * Pull snd_wl2 up to prevent seq wrap relative to th_ack. 10679 */ 10680 tp->snd_wl2 = th->th_ack; 10681 tp->t_dupacks = 0; 10682 m_freem(m); 10683 /* ND6_HINT(tp); *//* Some progress has been made. */ 10684 10685 /* 10686 * If all outstanding data are acked, stop retransmit timer, 10687 * otherwise restart timer using current (possibly backed-off) 10688 * value. If process is waiting for space, wakeup/selwakeup/signal. 10689 * If data are ready to send, let tcp_output decide between more 10690 * output or persist. 10691 */ 10692 #ifdef TCPDEBUG 10693 if (so->so_options & SO_DEBUG) 10694 tcp_trace(TA_INPUT, ostate, tp, 10695 (void *)tcp_saveipgen, 10696 &tcp_savetcp, 0); 10697 #endif 10698 if (under_pacing && 10699 (rack->use_fixed_rate == 0) && 10700 (rack->in_probe_rtt == 0) && 10701 rack->rc_gp_dyn_mul && 10702 rack->rc_always_pace) { 10703 /* Check if we are dragging bottom */ 10704 rack_check_bottom_drag(tp, rack, so, acked); 10705 } 10706 if (tp->snd_una == tp->snd_max) { 10707 tp->t_flags &= ~TF_PREVVALID; 10708 rack->r_ctl.retran_during_recovery = 0; 10709 rack->r_ctl.dsack_byte_cnt = 0; 10710 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 10711 if (rack->r_ctl.rc_went_idle_time == 0) 10712 rack->r_ctl.rc_went_idle_time = 1; 10713 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 10714 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) 10715 tp->t_acktime = 0; 10716 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 10717 } 10718 if (acked && rack->r_fast_output) 10719 rack_gain_for_fastoutput(rack, tp, so, (uint32_t)acked); 10720 if (sbavail(&so->so_snd)) { 10721 rack->r_wanted_output = 1; 10722 } 10723 return (1); 10724 } 10725 10726 /* 10727 * Return value of 1, the TCB is unlocked and most 10728 * likely gone, return value of 0, the TCP is still 10729 * locked. 10730 */ 10731 static int 10732 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, 10733 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 10734 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 10735 { 10736 int32_t ret_val = 0; 10737 int32_t todrop; 10738 int32_t ourfinisacked = 0; 10739 struct tcp_rack *rack; 10740 10741 ctf_calc_rwin(so, tp); 10742 /* 10743 * If the state is SYN_SENT: if seg contains an ACK, but not for our 10744 * SYN, drop the input. if seg contains a RST, then drop the 10745 * connection. if seg does not contain SYN, then drop it. Otherwise 10746 * this is an acceptable SYN segment initialize tp->rcv_nxt and 10747 * tp->irs if seg contains ack then advance tp->snd_una if seg 10748 * contains an ECE and ECN support is enabled, the stream is ECN 10749 * capable. if SYN has been acked change to ESTABLISHED else 10750 * SYN_RCVD state arrange for segment to be acked (eventually) 10751 * continue processing rest of data/controls. 
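/*
 * Editorial sketch, not part of the stack: "an ACK, but not for our SYN"
 * in the comment above is the usual acceptability test, applied at the top
 * of the function: the acknowledgment must cover our SYN (be greater than
 * the ISS) and must not acknowledge anything we never sent.
 */
static inline int
sketch_syn_sent_ack_ok(tcp_seq th_ack, tcp_seq iss, tcp_seq snd_max)
{
	/* Acceptable iff iss < th_ack <= snd_max in sequence space. */
	return (SEQ_GT(th_ack, iss) && SEQ_LEQ(th_ack, snd_max));
}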
10752 */ 10753 if ((thflags & TH_ACK) && 10754 (SEQ_LEQ(th->th_ack, tp->iss) || 10755 SEQ_GT(th->th_ack, tp->snd_max))) { 10756 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 10757 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 10758 return (1); 10759 } 10760 if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { 10761 TCP_PROBE5(connect__refused, NULL, tp, 10762 mtod(m, const char *), tp, th); 10763 tp = tcp_drop(tp, ECONNREFUSED); 10764 ctf_do_drop(m, tp); 10765 return (1); 10766 } 10767 if (thflags & TH_RST) { 10768 ctf_do_drop(m, tp); 10769 return (1); 10770 } 10771 if (!(thflags & TH_SYN)) { 10772 ctf_do_drop(m, tp); 10773 return (1); 10774 } 10775 tp->irs = th->th_seq; 10776 tcp_rcvseqinit(tp); 10777 rack = (struct tcp_rack *)tp->t_fb_ptr; 10778 if (thflags & TH_ACK) { 10779 int tfo_partial = 0; 10780 10781 KMOD_TCPSTAT_INC(tcps_connects); 10782 soisconnected(so); 10783 #ifdef MAC 10784 mac_socketpeer_set_from_mbuf(m, so); 10785 #endif 10786 /* Do window scaling on this connection? */ 10787 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 10788 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 10789 tp->rcv_scale = tp->request_r_scale; 10790 } 10791 tp->rcv_adv += min(tp->rcv_wnd, 10792 TCP_MAXWIN << tp->rcv_scale); 10793 /* 10794 * If not all the data that was sent in the TFO SYN 10795 * has been acked, resend the remainder right away. 10796 */ 10797 if (IS_FASTOPEN(tp->t_flags) && 10798 (tp->snd_una != tp->snd_max)) { 10799 tp->snd_nxt = th->th_ack; 10800 tfo_partial = 1; 10801 } 10802 /* 10803 * If there's data, delay ACK; if there's also a FIN ACKNOW 10804 * will be turned on later. 10805 */ 10806 if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial) { 10807 rack_timer_cancel(tp, rack, 10808 rack->r_ctl.rc_rcvtime, __LINE__); 10809 tp->t_flags |= TF_DELACK; 10810 } else { 10811 rack->r_wanted_output = 1; 10812 tp->t_flags |= TF_ACKNOW; 10813 rack->rc_dack_toggle = 0; 10814 } 10815 if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) && 10816 (V_tcp_do_ecn == 1)) { 10817 tp->t_flags2 |= TF2_ECN_PERMIT; 10818 KMOD_TCPSTAT_INC(tcps_ecn_shs); 10819 } 10820 if (SEQ_GT(th->th_ack, tp->snd_una)) { 10821 /* 10822 * We advance snd_una for the 10823 * fast open case. If th_ack is 10824 * acknowledging data beyond 10825 * snd_una we can't just call 10826 * ack-processing since the 10827 * data stream in our send-map 10828 * will start at snd_una + 1 (one 10829 * beyond the SYN). If its just 10830 * equal we don't need to do that 10831 * and there is no send_map. 10832 */ 10833 tp->snd_una++; 10834 } 10835 /* 10836 * Received <SYN,ACK> in SYN_SENT[*] state. Transitions: 10837 * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 10838 */ 10839 tp->t_starttime = ticks; 10840 if (tp->t_flags & TF_NEEDFIN) { 10841 tcp_state_change(tp, TCPS_FIN_WAIT_1); 10842 tp->t_flags &= ~TF_NEEDFIN; 10843 thflags &= ~TH_SYN; 10844 } else { 10845 tcp_state_change(tp, TCPS_ESTABLISHED); 10846 TCP_PROBE5(connect__established, NULL, tp, 10847 mtod(m, const char *), tp, th); 10848 rack_cc_conn_init(tp); 10849 } 10850 } else { 10851 /* 10852 * Received initial SYN in SYN-SENT[*] state => simultaneous 10853 * open. If segment contains CC option and there is a 10854 * cached CC, apply TAO test. If it succeeds, connection is * 10855 * half-synchronized. Otherwise, do 3-way handshake: 10856 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If 10857 * there was no CC option, clear cached CC value. 
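/*
 * Editorial sketch, not part of the stack: the window-scale check above
 * only turns scaling on once both sides have asked for it, i.e. we sent
 * the option (TF_REQ_SCALE) and the peer echoed one back (TF_RCVD_SCALE).
 * The same two-flag test is repeated in the SYN_RECEIVED path below.
 */
static inline int
sketch_enable_wscale(uint32_t t_flags)
{
	return ((t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
	    (TF_RCVD_SCALE | TF_REQ_SCALE));
}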
10858 */ 10859 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); 10860 tcp_state_change(tp, TCPS_SYN_RECEIVED); 10861 } 10862 INP_WLOCK_ASSERT(tp->t_inpcb); 10863 /* 10864 * Advance th->th_seq to correspond to first data byte. If data, 10865 * trim to stay within window, dropping FIN if necessary. 10866 */ 10867 th->th_seq++; 10868 if (tlen > tp->rcv_wnd) { 10869 todrop = tlen - tp->rcv_wnd; 10870 m_adj(m, -todrop); 10871 tlen = tp->rcv_wnd; 10872 thflags &= ~TH_FIN; 10873 KMOD_TCPSTAT_INC(tcps_rcvpackafterwin); 10874 KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 10875 } 10876 tp->snd_wl1 = th->th_seq - 1; 10877 tp->rcv_up = th->th_seq; 10878 /* 10879 * Client side of transaction: already sent SYN and data. If the 10880 * remote host used T/TCP to validate the SYN, our data will be 10881 * ACK'd; if so, enter normal data segment processing in the middle 10882 * of step 5, ack processing. Otherwise, goto step 6. 10883 */ 10884 if (thflags & TH_ACK) { 10885 /* For syn-sent we need to possibly update the rtt */ 10886 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 10887 uint32_t t, mcts; 10888 10889 mcts = tcp_ts_getticks(); 10890 t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC; 10891 if (!tp->t_rttlow || tp->t_rttlow > t) 10892 tp->t_rttlow = t; 10893 rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 4); 10894 tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2); 10895 tcp_rack_xmit_timer_commit(rack, tp); 10896 } 10897 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) 10898 return (ret_val); 10899 /* We may have changed to FIN_WAIT_1 above */ 10900 if (tp->t_state == TCPS_FIN_WAIT_1) { 10901 /* 10902 * In FIN_WAIT_1 STATE in addition to the processing 10903 * for the ESTABLISHED state if our FIN is now 10904 * acknowledged then enter FIN_WAIT_2. 10905 */ 10906 if (ourfinisacked) { 10907 /* 10908 * If we can't receive any more data, then 10909 * closing user can proceed. Starting the 10910 * timer is contrary to the specification, 10911 * but if we don't get a FIN we'll hang 10912 * forever. 10913 * 10914 * XXXjl: we should release the tp also, and 10915 * use a compressed state. 10916 */ 10917 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 10918 soisdisconnected(so); 10919 tcp_timer_activate(tp, TT_2MSL, 10920 (tcp_fast_finwait2_recycle ? 10921 tcp_finwait2_timeout : 10922 TP_MAXIDLE(tp))); 10923 } 10924 tcp_state_change(tp, TCPS_FIN_WAIT_2); 10925 } 10926 } 10927 } 10928 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10929 tiwin, thflags, nxt_pkt)); 10930 } 10931 10932 /* 10933 * Return value of 1, the TCB is unlocked and most 10934 * likely gone, return value of 0, the TCP is still 10935 * locked. 
10936 */ 10937 static int 10938 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, 10939 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 10940 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 10941 { 10942 struct tcp_rack *rack; 10943 int32_t ret_val = 0; 10944 int32_t ourfinisacked = 0; 10945 10946 ctf_calc_rwin(so, tp); 10947 if ((thflags & TH_ACK) && 10948 (SEQ_LEQ(th->th_ack, tp->snd_una) || 10949 SEQ_GT(th->th_ack, tp->snd_max))) { 10950 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 10951 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 10952 return (1); 10953 } 10954 rack = (struct tcp_rack *)tp->t_fb_ptr; 10955 if (IS_FASTOPEN(tp->t_flags)) { 10956 /* 10957 * When a TFO connection is in SYN_RECEIVED, the 10958 * only valid packets are the initial SYN, a 10959 * retransmit/copy of the initial SYN (possibly with 10960 * a subset of the original data), a valid ACK, a 10961 * FIN, or a RST. 10962 */ 10963 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { 10964 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 10965 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 10966 return (1); 10967 } else if (thflags & TH_SYN) { 10968 /* non-initial SYN is ignored */ 10969 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || 10970 (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || 10971 (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { 10972 ctf_do_drop(m, NULL); 10973 return (0); 10974 } 10975 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { 10976 ctf_do_drop(m, NULL); 10977 return (0); 10978 } 10979 } 10980 if ((thflags & TH_RST) || 10981 (tp->t_fin_is_rst && (thflags & TH_FIN))) 10982 return (ctf_process_rst(m, th, so, tp)); 10983 /* 10984 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 10985 * it's less than ts_recent, drop it. 10986 */ 10987 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 10988 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 10989 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 10990 return (ret_val); 10991 } 10992 /* 10993 * In the SYN-RECEIVED state, validate that the packet belongs to 10994 * this connection before trimming the data to fit the receive 10995 * window. Check the sequence number versus IRS since we know the 10996 * sequence numbers haven't wrapped. This is a partial fix for the 10997 * "LAND" DoS attack. 10998 */ 10999 if (SEQ_LT(th->th_seq, tp->irs)) { 11000 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 11001 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11002 return (1); 11003 } 11004 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 11005 &rack->r_ctl.challenge_ack_ts, 11006 &rack->r_ctl.challenge_ack_cnt)) { 11007 return (ret_val); 11008 } 11009 /* 11010 * If last ACK falls within this segment's sequence numbers, record 11011 * its timestamp. NOTE: 1) That the test incorporates suggestions 11012 * from the latest proposal of the tcplw@cray.com list (Braden 11013 * 1993/04/26). 2) That updating only on newer timestamps interferes 11014 * with our earlier PAWS tests, so this check should be solely 11015 * predicated on the sequence space of this segment. 3) That we 11016 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 11017 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 11018 * SEG.Len, This modified check allows us to overcome RFC1323's 11019 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 11020 * p.869. 
In such cases, we can still calculate the RTT correctly 11021 * when RCV.NXT == Last.ACK.Sent. 11022 */ 11023 if ((to->to_flags & TOF_TS) != 0 && 11024 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 11025 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 11026 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 11027 tp->ts_recent_age = tcp_ts_getticks(); 11028 tp->ts_recent = to->to_tsval; 11029 } 11030 tp->snd_wnd = tiwin; 11031 rack_validate_fo_sendwin_up(tp, rack); 11032 /* 11033 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 11034 * is on (half-synchronized state), then queue data for later 11035 * processing; else drop segment and return. 11036 */ 11037 if ((thflags & TH_ACK) == 0) { 11038 if (IS_FASTOPEN(tp->t_flags)) { 11039 rack_cc_conn_init(tp); 11040 } 11041 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11042 tiwin, thflags, nxt_pkt)); 11043 } 11044 KMOD_TCPSTAT_INC(tcps_connects); 11045 soisconnected(so); 11046 /* Do window scaling? */ 11047 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 11048 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 11049 tp->rcv_scale = tp->request_r_scale; 11050 } 11051 /* 11052 * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> 11053 * FIN-WAIT-1 11054 */ 11055 tp->t_starttime = ticks; 11056 if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { 11057 tcp_fastopen_decrement_counter(tp->t_tfo_pending); 11058 tp->t_tfo_pending = NULL; 11059 } 11060 if (tp->t_flags & TF_NEEDFIN) { 11061 tcp_state_change(tp, TCPS_FIN_WAIT_1); 11062 tp->t_flags &= ~TF_NEEDFIN; 11063 } else { 11064 tcp_state_change(tp, TCPS_ESTABLISHED); 11065 TCP_PROBE5(accept__established, NULL, tp, 11066 mtod(m, const char *), tp, th); 11067 /* 11068 * TFO connections call cc_conn_init() during SYN 11069 * processing. Calling it again here for such connections 11070 * is not harmless as it would undo the snd_cwnd reduction 11071 * that occurs when a TFO SYN|ACK is retransmitted. 11072 */ 11073 if (!IS_FASTOPEN(tp->t_flags)) 11074 rack_cc_conn_init(tp); 11075 } 11076 /* 11077 * Account for the ACK of our SYN prior to 11078 * regular ACK processing below, except for 11079 * simultaneous SYN, which is handled later. 11080 */ 11081 if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN)) 11082 tp->snd_una++; 11083 /* 11084 * If segment contains data or ACK, will call tcp_reass() later; if 11085 * not, do so now to pass queued data to user. 11086 */ 11087 if (tlen == 0 && (thflags & TH_FIN) == 0) { 11088 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, 11089 (struct mbuf *)0); 11090 if (tp->t_flags & TF_WAKESOR) { 11091 tp->t_flags &= ~TF_WAKESOR; 11092 /* NB: sorwakeup_locked() does an implicit unlock. 
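/*
 * Editorial sketch, not part of the stack: the ts_recent update rule used
 * above, and repeated verbatim in every synchronized-state handler below,
 * records the peer's timestamp only when the last ACK we sent falls inside
 * this segment's sequence range, with SYN and FIN each counting as one
 * octet of sequence space.
 */
static inline int
sketch_update_ts_recent(int has_ts, tcp_seq th_seq, int32_t tlen,
    int syn_or_fin, tcp_seq last_ack_sent)
{
	return (has_ts && SEQ_LEQ(th_seq, last_ack_sent) &&
	    SEQ_LEQ(last_ack_sent, th_seq + tlen + (syn_or_fin ? 1 : 0)));
}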
*/ 11093 sorwakeup_locked(so); 11094 } 11095 } 11096 tp->snd_wl1 = th->th_seq - 1; 11097 /* For syn-recv we need to possibly update the rtt */ 11098 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 11099 uint32_t t, mcts; 11100 11101 mcts = tcp_ts_getticks(); 11102 t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC; 11103 if (!tp->t_rttlow || tp->t_rttlow > t) 11104 tp->t_rttlow = t; 11105 rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 5); 11106 tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2); 11107 tcp_rack_xmit_timer_commit(rack, tp); 11108 } 11109 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 11110 return (ret_val); 11111 } 11112 if (tp->t_state == TCPS_FIN_WAIT_1) { 11113 /* We could have went to FIN_WAIT_1 (or EST) above */ 11114 /* 11115 * In FIN_WAIT_1 STATE in addition to the processing for the 11116 * ESTABLISHED state if our FIN is now acknowledged then 11117 * enter FIN_WAIT_2. 11118 */ 11119 if (ourfinisacked) { 11120 /* 11121 * If we can't receive any more data, then closing 11122 * user can proceed. Starting the timer is contrary 11123 * to the specification, but if we don't get a FIN 11124 * we'll hang forever. 11125 * 11126 * XXXjl: we should release the tp also, and use a 11127 * compressed state. 11128 */ 11129 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 11130 soisdisconnected(so); 11131 tcp_timer_activate(tp, TT_2MSL, 11132 (tcp_fast_finwait2_recycle ? 11133 tcp_finwait2_timeout : 11134 TP_MAXIDLE(tp))); 11135 } 11136 tcp_state_change(tp, TCPS_FIN_WAIT_2); 11137 } 11138 } 11139 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11140 tiwin, thflags, nxt_pkt)); 11141 } 11142 11143 /* 11144 * Return value of 1, the TCB is unlocked and most 11145 * likely gone, return value of 0, the TCP is still 11146 * locked. 11147 */ 11148 static int 11149 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, 11150 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 11151 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 11152 { 11153 int32_t ret_val = 0; 11154 struct tcp_rack *rack; 11155 11156 /* 11157 * Header prediction: check for the two common cases of a 11158 * uni-directional data xfer. If the packet has no control flags, 11159 * is in-sequence, the window didn't change and we're not 11160 * retransmitting, it's a candidate. If the length is zero and the 11161 * ack moved forward, we're the sender side of the xfer. Just free 11162 * the data acked & wake any higher level process that was blocked 11163 * waiting for space. If the length is non-zero and the ack didn't 11164 * move, we're the receiver side. If we're getting packets in-order 11165 * (the reassembly queue is empty), add the data toc The socket 11166 * buffer and note that we need a delayed ack. Make sure that the 11167 * hidden state-flags are also off. Since we check for 11168 * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN. 
11169 */ 11170 rack = (struct tcp_rack *)tp->t_fb_ptr; 11171 if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && 11172 __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_ACK)) == TH_ACK) && 11173 __predict_true(SEGQ_EMPTY(tp)) && 11174 __predict_true(th->th_seq == tp->rcv_nxt)) { 11175 if (tlen == 0) { 11176 if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen, 11177 tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) { 11178 return (0); 11179 } 11180 } else { 11181 if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, 11182 tiwin, nxt_pkt, iptos)) { 11183 return (0); 11184 } 11185 } 11186 } 11187 ctf_calc_rwin(so, tp); 11188 11189 if ((thflags & TH_RST) || 11190 (tp->t_fin_is_rst && (thflags & TH_FIN))) 11191 return (ctf_process_rst(m, th, so, tp)); 11192 11193 /* 11194 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 11195 * synchronized state. 11196 */ 11197 if (thflags & TH_SYN) { 11198 ctf_challenge_ack(m, th, tp, &ret_val); 11199 return (ret_val); 11200 } 11201 /* 11202 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 11203 * it's less than ts_recent, drop it. 11204 */ 11205 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 11206 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 11207 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 11208 return (ret_val); 11209 } 11210 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 11211 &rack->r_ctl.challenge_ack_ts, 11212 &rack->r_ctl.challenge_ack_cnt)) { 11213 return (ret_val); 11214 } 11215 /* 11216 * If last ACK falls within this segment's sequence numbers, record 11217 * its timestamp. NOTE: 1) That the test incorporates suggestions 11218 * from the latest proposal of the tcplw@cray.com list (Braden 11219 * 1993/04/26). 2) That updating only on newer timestamps interferes 11220 * with our earlier PAWS tests, so this check should be solely 11221 * predicated on the sequence space of this segment. 3) That we 11222 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 11223 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 11224 * SEG.Len, This modified check allows us to overcome RFC1323's 11225 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 11226 * p.869. In such cases, we can still calculate the RTT correctly 11227 * when RCV.NXT == Last.ACK.Sent. 11228 */ 11229 if ((to->to_flags & TOF_TS) != 0 && 11230 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 11231 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 11232 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 11233 tp->ts_recent_age = tcp_ts_getticks(); 11234 tp->ts_recent = to->to_tsval; 11235 } 11236 /* 11237 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 11238 * is on (half-synchronized state), then queue data for later 11239 * processing; else drop segment and return. 11240 */ 11241 if ((thflags & TH_ACK) == 0) { 11242 if (tp->t_flags & TF_NEEDSYN) { 11243 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11244 tiwin, thflags, nxt_pkt)); 11245 11246 } else if (tp->t_flags & TF_ACKNOW) { 11247 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 11248 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 11249 return (ret_val); 11250 } else { 11251 ctf_do_drop(m, NULL); 11252 return (0); 11253 } 11254 } 11255 /* 11256 * Ack processing. 
11257 */ 11258 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 11259 return (ret_val); 11260 } 11261 if (sbavail(&so->so_snd)) { 11262 if (ctf_progress_timeout_check(tp, true)) { 11263 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 11264 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 11265 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11266 return (1); 11267 } 11268 } 11269 /* State changes only happen in rack_process_data() */ 11270 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11271 tiwin, thflags, nxt_pkt)); 11272 } 11273 11274 /* 11275 * Return value of 1, the TCB is unlocked and most 11276 * likely gone, return value of 0, the TCP is still 11277 * locked. 11278 */ 11279 static int 11280 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, 11281 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 11282 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 11283 { 11284 int32_t ret_val = 0; 11285 struct tcp_rack *rack; 11286 11287 rack = (struct tcp_rack *)tp->t_fb_ptr; 11288 ctf_calc_rwin(so, tp); 11289 if ((thflags & TH_RST) || 11290 (tp->t_fin_is_rst && (thflags & TH_FIN))) 11291 return (ctf_process_rst(m, th, so, tp)); 11292 /* 11293 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 11294 * synchronized state. 11295 */ 11296 if (thflags & TH_SYN) { 11297 ctf_challenge_ack(m, th, tp, &ret_val); 11298 return (ret_val); 11299 } 11300 /* 11301 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 11302 * it's less than ts_recent, drop it. 11303 */ 11304 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 11305 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 11306 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 11307 return (ret_val); 11308 } 11309 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 11310 &rack->r_ctl.challenge_ack_ts, 11311 &rack->r_ctl.challenge_ack_cnt)) { 11312 return (ret_val); 11313 } 11314 /* 11315 * If last ACK falls within this segment's sequence numbers, record 11316 * its timestamp. NOTE: 1) That the test incorporates suggestions 11317 * from the latest proposal of the tcplw@cray.com list (Braden 11318 * 1993/04/26). 2) That updating only on newer timestamps interferes 11319 * with our earlier PAWS tests, so this check should be solely 11320 * predicated on the sequence space of this segment. 3) That we 11321 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 11322 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 11323 * SEG.Len, This modified check allows us to overcome RFC1323's 11324 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 11325 * p.869. In such cases, we can still calculate the RTT correctly 11326 * when RCV.NXT == Last.ACK.Sent. 11327 */ 11328 if ((to->to_flags & TOF_TS) != 0 && 11329 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 11330 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 11331 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 11332 tp->ts_recent_age = tcp_ts_getticks(); 11333 tp->ts_recent = to->to_tsval; 11334 } 11335 /* 11336 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 11337 * is on (half-synchronized state), then queue data for later 11338 * processing; else drop segment and return. 
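/*
 * Editorial sketch, not part of the stack: every synchronized-state handler
 * in this file deals with an ACK-less segment the same way; the three
 * possible outcomes described above can be summarized as:
 */
enum sketch_no_ack_action {
	SKETCH_QUEUE_DATA,	/* half-synchronized: fall into rack_process_data() */
	SKETCH_DROP_AFTER_ACK,	/* we owe an ACK: drop the segment but send one */
	SKETCH_DROP		/* nothing to do: silently drop */
};

static inline enum sketch_no_ack_action
sketch_no_ack_disposition(int needsyn, int acknow)
{
	if (needsyn)
		return (SKETCH_QUEUE_DATA);
	if (acknow)
		return (SKETCH_DROP_AFTER_ACK);
	return (SKETCH_DROP);
}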
11339 */ 11340 if ((thflags & TH_ACK) == 0) { 11341 if (tp->t_flags & TF_NEEDSYN) { 11342 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11343 tiwin, thflags, nxt_pkt)); 11344 11345 } else if (tp->t_flags & TF_ACKNOW) { 11346 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 11347 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 11348 return (ret_val); 11349 } else { 11350 ctf_do_drop(m, NULL); 11351 return (0); 11352 } 11353 } 11354 /* 11355 * Ack processing. 11356 */ 11357 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 11358 return (ret_val); 11359 } 11360 if (sbavail(&so->so_snd)) { 11361 if (ctf_progress_timeout_check(tp, true)) { 11362 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 11363 tp, tick, PROGRESS_DROP, __LINE__); 11364 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 11365 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11366 return (1); 11367 } 11368 } 11369 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11370 tiwin, thflags, nxt_pkt)); 11371 } 11372 11373 static int 11374 rack_check_data_after_close(struct mbuf *m, 11375 struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so) 11376 { 11377 struct tcp_rack *rack; 11378 11379 rack = (struct tcp_rack *)tp->t_fb_ptr; 11380 if (rack->rc_allow_data_af_clo == 0) { 11381 close_now: 11382 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 11383 /* tcp_close will kill the inp pre-log the Reset */ 11384 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 11385 tp = tcp_close(tp); 11386 KMOD_TCPSTAT_INC(tcps_rcvafterclose); 11387 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); 11388 return (1); 11389 } 11390 if (sbavail(&so->so_snd) == 0) 11391 goto close_now; 11392 /* Ok we allow data that is ignored and a followup reset */ 11393 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 11394 tp->rcv_nxt = th->th_seq + *tlen; 11395 tp->t_flags2 |= TF2_DROP_AF_DATA; 11396 rack->r_wanted_output = 1; 11397 *tlen = 0; 11398 return (0); 11399 } 11400 11401 /* 11402 * Return value of 1, the TCB is unlocked and most 11403 * likely gone, return value of 0, the TCP is still 11404 * locked. 11405 */ 11406 static int 11407 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, 11408 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 11409 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 11410 { 11411 int32_t ret_val = 0; 11412 int32_t ourfinisacked = 0; 11413 struct tcp_rack *rack; 11414 11415 rack = (struct tcp_rack *)tp->t_fb_ptr; 11416 ctf_calc_rwin(so, tp); 11417 11418 if ((thflags & TH_RST) || 11419 (tp->t_fin_is_rst && (thflags & TH_FIN))) 11420 return (ctf_process_rst(m, th, so, tp)); 11421 /* 11422 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 11423 * synchronized state. 11424 */ 11425 if (thflags & TH_SYN) { 11426 ctf_challenge_ack(m, th, tp, &ret_val); 11427 return (ret_val); 11428 } 11429 /* 11430 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 11431 * it's less than ts_recent, drop it. 
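/*
 * Editorial sketch, not part of the stack: rack_check_data_after_close()
 * above implements a two-way policy for data that arrives after the local
 * user has gone away: reset immediately, or, when rc_allow_data_af_clo is
 * set and we still have data of our own to drain, pretend to consume the
 * payload and let TF2_DROP_AF_DATA trigger the reset later.
 */
static inline int	/* 1 = reset now, 0 = swallow the data for now */
sketch_reset_on_data_after_close(int allow_data_after_close,
    uint32_t snd_sb_bytes)
{
	if (allow_data_after_close == 0)
		return (1);
	/* Nothing left to send, so there is no reason to linger. */
	return (snd_sb_bytes == 0);
}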
11432 */ 11433 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 11434 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 11435 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 11436 return (ret_val); 11437 } 11438 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 11439 &rack->r_ctl.challenge_ack_ts, 11440 &rack->r_ctl.challenge_ack_cnt)) { 11441 return (ret_val); 11442 } 11443 /* 11444 * If new data are received on a connection after the user processes 11445 * are gone, then RST the other end. 11446 */ 11447 if ((so->so_state & SS_NOFDREF) && tlen) { 11448 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 11449 return (1); 11450 } 11451 /* 11452 * If last ACK falls within this segment's sequence numbers, record 11453 * its timestamp. NOTE: 1) That the test incorporates suggestions 11454 * from the latest proposal of the tcplw@cray.com list (Braden 11455 * 1993/04/26). 2) That updating only on newer timestamps interferes 11456 * with our earlier PAWS tests, so this check should be solely 11457 * predicated on the sequence space of this segment. 3) That we 11458 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 11459 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 11460 * SEG.Len, This modified check allows us to overcome RFC1323's 11461 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 11462 * p.869. In such cases, we can still calculate the RTT correctly 11463 * when RCV.NXT == Last.ACK.Sent. 11464 */ 11465 if ((to->to_flags & TOF_TS) != 0 && 11466 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 11467 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 11468 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 11469 tp->ts_recent_age = tcp_ts_getticks(); 11470 tp->ts_recent = to->to_tsval; 11471 } 11472 /* 11473 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 11474 * is on (half-synchronized state), then queue data for later 11475 * processing; else drop segment and return. 11476 */ 11477 if ((thflags & TH_ACK) == 0) { 11478 if (tp->t_flags & TF_NEEDSYN) { 11479 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11480 tiwin, thflags, nxt_pkt)); 11481 } else if (tp->t_flags & TF_ACKNOW) { 11482 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 11483 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 11484 return (ret_val); 11485 } else { 11486 ctf_do_drop(m, NULL); 11487 return (0); 11488 } 11489 } 11490 /* 11491 * Ack processing. 11492 */ 11493 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 11494 return (ret_val); 11495 } 11496 if (ourfinisacked) { 11497 /* 11498 * If we can't receive any more data, then closing user can 11499 * proceed. Starting the timer is contrary to the 11500 * specification, but if we don't get a FIN we'll hang 11501 * forever. 11502 * 11503 * XXXjl: we should release the tp also, and use a 11504 * compressed state. 11505 */ 11506 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 11507 soisdisconnected(so); 11508 tcp_timer_activate(tp, TT_2MSL, 11509 (tcp_fast_finwait2_recycle ? 
11510 tcp_finwait2_timeout : 11511 TP_MAXIDLE(tp))); 11512 } 11513 tcp_state_change(tp, TCPS_FIN_WAIT_2); 11514 } 11515 if (sbavail(&so->so_snd)) { 11516 if (ctf_progress_timeout_check(tp, true)) { 11517 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 11518 tp, tick, PROGRESS_DROP, __LINE__); 11519 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 11520 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11521 return (1); 11522 } 11523 } 11524 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11525 tiwin, thflags, nxt_pkt)); 11526 } 11527 11528 /* 11529 * Return value of 1, the TCB is unlocked and most 11530 * likely gone, return value of 0, the TCP is still 11531 * locked. 11532 */ 11533 static int 11534 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, 11535 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 11536 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 11537 { 11538 int32_t ret_val = 0; 11539 int32_t ourfinisacked = 0; 11540 struct tcp_rack *rack; 11541 11542 rack = (struct tcp_rack *)tp->t_fb_ptr; 11543 ctf_calc_rwin(so, tp); 11544 11545 if ((thflags & TH_RST) || 11546 (tp->t_fin_is_rst && (thflags & TH_FIN))) 11547 return (ctf_process_rst(m, th, so, tp)); 11548 /* 11549 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 11550 * synchronized state. 11551 */ 11552 if (thflags & TH_SYN) { 11553 ctf_challenge_ack(m, th, tp, &ret_val); 11554 return (ret_val); 11555 } 11556 /* 11557 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 11558 * it's less than ts_recent, drop it. 11559 */ 11560 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 11561 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 11562 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 11563 return (ret_val); 11564 } 11565 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 11566 &rack->r_ctl.challenge_ack_ts, 11567 &rack->r_ctl.challenge_ack_cnt)) { 11568 return (ret_val); 11569 } 11570 /* 11571 * If new data are received on a connection after the user processes 11572 * are gone, then RST the other end. 11573 */ 11574 if ((so->so_state & SS_NOFDREF) && tlen) { 11575 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 11576 return (1); 11577 } 11578 /* 11579 * If last ACK falls within this segment's sequence numbers, record 11580 * its timestamp. NOTE: 1) That the test incorporates suggestions 11581 * from the latest proposal of the tcplw@cray.com list (Braden 11582 * 1993/04/26). 2) That updating only on newer timestamps interferes 11583 * with our earlier PAWS tests, so this check should be solely 11584 * predicated on the sequence space of this segment. 3) That we 11585 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 11586 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 11587 * SEG.Len, This modified check allows us to overcome RFC1323's 11588 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 11589 * p.869. In such cases, we can still calculate the RTT correctly 11590 * when RCV.NXT == Last.ACK.Sent. 
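/*
 * Editorial sketch, not part of the stack: the progress check repeated at
 * the tail of each state handler only fires while data is still queued to
 * send; roughly, if the peer has not advanced our send point for longer
 * than the configured limit, the connection is torn down with ETIMEDOUT
 * and a RST.  The parameter names below are hypothetical; the real test
 * lives in ctf_progress_timeout_check().
 */
static inline int
sketch_progress_abort(uint32_t snd_sb_bytes, int32_t now_ticks,
    int32_t last_progress_ticks, int32_t limit_ticks)
{
	if (snd_sb_bytes == 0)
		return (0);	/* nothing queued, nothing to time out */
	return ((int32_t)(now_ticks - last_progress_ticks) > limit_ticks);
}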
11591 */ 11592 if ((to->to_flags & TOF_TS) != 0 && 11593 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 11594 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 11595 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 11596 tp->ts_recent_age = tcp_ts_getticks(); 11597 tp->ts_recent = to->to_tsval; 11598 } 11599 /* 11600 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 11601 * is on (half-synchronized state), then queue data for later 11602 * processing; else drop segment and return. 11603 */ 11604 if ((thflags & TH_ACK) == 0) { 11605 if (tp->t_flags & TF_NEEDSYN) { 11606 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11607 tiwin, thflags, nxt_pkt)); 11608 } else if (tp->t_flags & TF_ACKNOW) { 11609 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 11610 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 11611 return (ret_val); 11612 } else { 11613 ctf_do_drop(m, NULL); 11614 return (0); 11615 } 11616 } 11617 /* 11618 * Ack processing. 11619 */ 11620 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 11621 return (ret_val); 11622 } 11623 if (ourfinisacked) { 11624 tcp_twstart(tp); 11625 m_freem(m); 11626 return (1); 11627 } 11628 if (sbavail(&so->so_snd)) { 11629 if (ctf_progress_timeout_check(tp, true)) { 11630 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 11631 tp, tick, PROGRESS_DROP, __LINE__); 11632 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 11633 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11634 return (1); 11635 } 11636 } 11637 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11638 tiwin, thflags, nxt_pkt)); 11639 } 11640 11641 /* 11642 * Return value of 1, the TCB is unlocked and most 11643 * likely gone, return value of 0, the TCP is still 11644 * locked. 11645 */ 11646 static int 11647 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 11648 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 11649 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 11650 { 11651 int32_t ret_val = 0; 11652 int32_t ourfinisacked = 0; 11653 struct tcp_rack *rack; 11654 11655 rack = (struct tcp_rack *)tp->t_fb_ptr; 11656 ctf_calc_rwin(so, tp); 11657 11658 if ((thflags & TH_RST) || 11659 (tp->t_fin_is_rst && (thflags & TH_FIN))) 11660 return (ctf_process_rst(m, th, so, tp)); 11661 /* 11662 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 11663 * synchronized state. 11664 */ 11665 if (thflags & TH_SYN) { 11666 ctf_challenge_ack(m, th, tp, &ret_val); 11667 return (ret_val); 11668 } 11669 /* 11670 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 11671 * it's less than ts_recent, drop it. 11672 */ 11673 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 11674 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 11675 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 11676 return (ret_val); 11677 } 11678 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 11679 &rack->r_ctl.challenge_ack_ts, 11680 &rack->r_ctl.challenge_ack_cnt)) { 11681 return (ret_val); 11682 } 11683 /* 11684 * If new data are received on a connection after the user processes 11685 * are gone, then RST the other end. 11686 */ 11687 if ((so->so_state & SS_NOFDREF) && tlen) { 11688 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 11689 return (1); 11690 } 11691 /* 11692 * If last ACK falls within this segment's sequence numbers, record 11693 * its timestamp. 
NOTE: 1) That the test incorporates suggestions 11694 * from the latest proposal of the tcplw@cray.com list (Braden 11695 * 1993/04/26). 2) That updating only on newer timestamps interferes 11696 * with our earlier PAWS tests, so this check should be solely 11697 * predicated on the sequence space of this segment. 3) That we 11698 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 11699 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 11700 * SEG.Len, This modified check allows us to overcome RFC1323's 11701 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 11702 * p.869. In such cases, we can still calculate the RTT correctly 11703 * when RCV.NXT == Last.ACK.Sent. 11704 */ 11705 if ((to->to_flags & TOF_TS) != 0 && 11706 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 11707 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 11708 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 11709 tp->ts_recent_age = tcp_ts_getticks(); 11710 tp->ts_recent = to->to_tsval; 11711 } 11712 /* 11713 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 11714 * is on (half-synchronized state), then queue data for later 11715 * processing; else drop segment and return. 11716 */ 11717 if ((thflags & TH_ACK) == 0) { 11718 if (tp->t_flags & TF_NEEDSYN) { 11719 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11720 tiwin, thflags, nxt_pkt)); 11721 } else if (tp->t_flags & TF_ACKNOW) { 11722 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 11723 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 11724 return (ret_val); 11725 } else { 11726 ctf_do_drop(m, NULL); 11727 return (0); 11728 } 11729 } 11730 /* 11731 * case TCPS_LAST_ACK: Ack processing. 11732 */ 11733 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 11734 return (ret_val); 11735 } 11736 if (ourfinisacked) { 11737 tp = tcp_close(tp); 11738 ctf_do_drop(m, tp); 11739 return (1); 11740 } 11741 if (sbavail(&so->so_snd)) { 11742 if (ctf_progress_timeout_check(tp, true)) { 11743 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 11744 tp, tick, PROGRESS_DROP, __LINE__); 11745 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 11746 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11747 return (1); 11748 } 11749 } 11750 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11751 tiwin, thflags, nxt_pkt)); 11752 } 11753 11754 /* 11755 * Return value of 1, the TCB is unlocked and most 11756 * likely gone, return value of 0, the TCP is still 11757 * locked. 11758 */ 11759 static int 11760 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, 11761 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 11762 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 11763 { 11764 int32_t ret_val = 0; 11765 int32_t ourfinisacked = 0; 11766 struct tcp_rack *rack; 11767 11768 rack = (struct tcp_rack *)tp->t_fb_ptr; 11769 ctf_calc_rwin(so, tp); 11770 11771 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 11772 if ((thflags & TH_RST) || 11773 (tp->t_fin_is_rst && (thflags & TH_FIN))) 11774 return (ctf_process_rst(m, th, so, tp)); 11775 /* 11776 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 11777 * synchronized state. 11778 */ 11779 if (thflags & TH_SYN) { 11780 ctf_challenge_ack(m, th, tp, &ret_val); 11781 return (ret_val); 11782 } 11783 /* 11784 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 11785 * it's less than ts_recent, drop it. 
11786 */ 11787 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 11788 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 11789 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 11790 return (ret_val); 11791 } 11792 if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, 11793 &rack->r_ctl.challenge_ack_ts, 11794 &rack->r_ctl.challenge_ack_cnt)) { 11795 return (ret_val); 11796 } 11797 /* 11798 * If new data are received on a connection after the user processes 11799 * are gone, then RST the other end. 11800 */ 11801 if ((so->so_state & SS_NOFDREF) && 11802 tlen) { 11803 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 11804 return (1); 11805 } 11806 /* 11807 * If last ACK falls within this segment's sequence numbers, record 11808 * its timestamp. NOTE: 1) That the test incorporates suggestions 11809 * from the latest proposal of the tcplw@cray.com list (Braden 11810 * 1993/04/26). 2) That updating only on newer timestamps interferes 11811 * with our earlier PAWS tests, so this check should be solely 11812 * predicated on the sequence space of this segment. 3) That we 11813 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 11814 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 11815 * SEG.Len, This modified check allows us to overcome RFC1323's 11816 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 11817 * p.869. In such cases, we can still calculate the RTT correctly 11818 * when RCV.NXT == Last.ACK.Sent. 11819 */ 11820 if ((to->to_flags & TOF_TS) != 0 && 11821 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 11822 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 11823 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 11824 tp->ts_recent_age = tcp_ts_getticks(); 11825 tp->ts_recent = to->to_tsval; 11826 } 11827 /* 11828 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 11829 * is on (half-synchronized state), then queue data for later 11830 * processing; else drop segment and return. 11831 */ 11832 if ((thflags & TH_ACK) == 0) { 11833 if (tp->t_flags & TF_NEEDSYN) { 11834 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11835 tiwin, thflags, nxt_pkt)); 11836 } else if (tp->t_flags & TF_ACKNOW) { 11837 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 11838 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 11839 return (ret_val); 11840 } else { 11841 ctf_do_drop(m, NULL); 11842 return (0); 11843 } 11844 } 11845 /* 11846 * Ack processing. 
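 * Note that, unlike the other close-path states, we do not act on
 * ourfinisacked here: reaching FIN_WAIT_2 already means our FIN has
 * been acknowledged, so only the progress check and normal data
 * processing remain.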
11847 */ 11848 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 11849 return (ret_val); 11850 } 11851 if (sbavail(&so->so_snd)) { 11852 if (ctf_progress_timeout_check(tp, true)) { 11853 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 11854 tp, tick, PROGRESS_DROP, __LINE__); 11855 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 11856 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11857 return (1); 11858 } 11859 } 11860 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 11861 tiwin, thflags, nxt_pkt)); 11862 } 11863 11864 static void inline 11865 rack_clear_rate_sample(struct tcp_rack *rack) 11866 { 11867 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY; 11868 rack->r_ctl.rack_rs.rs_rtt_cnt = 0; 11869 rack->r_ctl.rack_rs.rs_rtt_tot = 0; 11870 } 11871 11872 static void 11873 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override) 11874 { 11875 uint64_t bw_est, rate_wanted; 11876 int chged = 0; 11877 uint32_t user_max, orig_min, orig_max; 11878 11879 orig_min = rack->r_ctl.rc_pace_min_segs; 11880 orig_max = rack->r_ctl.rc_pace_max_segs; 11881 user_max = ctf_fixed_maxseg(tp) * rack->rc_user_set_max_segs; 11882 if (ctf_fixed_maxseg(tp) != rack->r_ctl.rc_pace_min_segs) 11883 chged = 1; 11884 rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp); 11885 if (rack->use_fixed_rate || rack->rc_force_max_seg) { 11886 if (user_max != rack->r_ctl.rc_pace_max_segs) 11887 chged = 1; 11888 } 11889 if (rack->rc_force_max_seg) { 11890 rack->r_ctl.rc_pace_max_segs = user_max; 11891 } else if (rack->use_fixed_rate) { 11892 bw_est = rack_get_bw(rack); 11893 if ((rack->r_ctl.crte == NULL) || 11894 (bw_est != rack->r_ctl.crte->rate)) { 11895 rack->r_ctl.rc_pace_max_segs = user_max; 11896 } else { 11897 /* We are pacing right at the hardware rate */ 11898 uint32_t segsiz; 11899 11900 segsiz = min(ctf_fixed_maxseg(tp), 11901 rack->r_ctl.rc_pace_min_segs); 11902 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size( 11903 tp, bw_est, segsiz, 0, 11904 rack->r_ctl.crte, NULL); 11905 } 11906 } else if (rack->rc_always_pace) { 11907 if (rack->r_ctl.gp_bw || 11908 #ifdef NETFLIX_PEAKRATE 11909 rack->rc_tp->t_maxpeakrate || 11910 #endif 11911 rack->r_ctl.init_rate) { 11912 /* We have a rate of some sort set */ 11913 uint32_t orig; 11914 11915 bw_est = rack_get_bw(rack); 11916 orig = rack->r_ctl.rc_pace_max_segs; 11917 if (fill_override) 11918 rate_wanted = *fill_override; 11919 else 11920 rate_wanted = rack_get_output_bw(rack, bw_est, NULL, NULL); 11921 if (rate_wanted) { 11922 /* We have something */ 11923 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, 11924 rate_wanted, 11925 ctf_fixed_maxseg(rack->rc_tp)); 11926 } else 11927 rack->r_ctl.rc_pace_max_segs = rack->r_ctl.rc_pace_min_segs; 11928 if (orig != rack->r_ctl.rc_pace_max_segs) 11929 chged = 1; 11930 } else if ((rack->r_ctl.gp_bw == 0) && 11931 (rack->r_ctl.rc_pace_max_segs == 0)) { 11932 /* 11933 * If we have nothing limit us to bursting 11934 * out IW sized pieces. 
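 * To summarize the precedence above: a user forced segment count wins,
 * then a fixed pacing rate (sized by tcp_get_pacing_burst_size() when
 * we are pacing right at the hardware rate), then whatever b/w estimate
 * or init_rate we have, and only with no estimate at all do we fall
 * back to an IW sized burst. The result is clamped to PACE_MAX_IP_BYTES
 * just below.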
11935 */ 11936 chged = 1; 11937 rack->r_ctl.rc_pace_max_segs = rc_init_window(rack); 11938 } 11939 } 11940 if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) { 11941 chged = 1; 11942 rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES; 11943 } 11944 if (chged) 11945 rack_log_type_pacing_sizes(tp, rack, orig_min, orig_max, line, 2); 11946 } 11947 11948 11949 static void 11950 rack_init_fsb_block(struct tcpcb *tp, struct tcp_rack *rack) 11951 { 11952 #ifdef INET6 11953 struct ip6_hdr *ip6 = NULL; 11954 #endif 11955 #ifdef INET 11956 struct ip *ip = NULL; 11957 #endif 11958 struct udphdr *udp = NULL; 11959 11960 /* Ok lets fill in the fast block, it can only be used with no IP options! */ 11961 #ifdef INET6 11962 if (rack->r_is_v6) { 11963 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 11964 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 11965 if (tp->t_port) { 11966 rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr); 11967 udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr)); 11968 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 11969 udp->uh_dport = tp->t_port; 11970 rack->r_ctl.fsb.udp = udp; 11971 rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1); 11972 } else 11973 { 11974 rack->r_ctl.fsb.th = (struct tcphdr *)(ip6 + 1); 11975 rack->r_ctl.fsb.udp = NULL; 11976 } 11977 tcpip_fillheaders(rack->rc_inp, 11978 tp->t_port, 11979 ip6, rack->r_ctl.fsb.th); 11980 } else 11981 #endif /* INET6 */ 11982 { 11983 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr); 11984 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 11985 if (tp->t_port) { 11986 rack->r_ctl.fsb.tcp_ip_hdr_len += sizeof(struct udphdr); 11987 udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip)); 11988 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 11989 udp->uh_dport = tp->t_port; 11990 rack->r_ctl.fsb.udp = udp; 11991 rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1); 11992 } else 11993 { 11994 rack->r_ctl.fsb.udp = NULL; 11995 rack->r_ctl.fsb.th = (struct tcphdr *)(ip + 1); 11996 } 11997 tcpip_fillheaders(rack->rc_inp, 11998 tp->t_port, 11999 ip, rack->r_ctl.fsb.th); 12000 } 12001 rack->r_fsb_inited = 1; 12002 } 12003 12004 static int 12005 rack_init_fsb(struct tcpcb *tp, struct tcp_rack *rack) 12006 { 12007 /* 12008 * Allocate the larger of spaces V6 if available else just 12009 * V4 and include udphdr (overbook) 12010 */ 12011 #ifdef INET6 12012 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr) + sizeof(struct udphdr); 12013 #else 12014 rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr) + sizeof(struct udphdr); 12015 #endif 12016 rack->r_ctl.fsb.tcp_ip_hdr = malloc(rack->r_ctl.fsb.tcp_ip_hdr_len, 12017 M_TCPFSB, M_NOWAIT|M_ZERO); 12018 if (rack->r_ctl.fsb.tcp_ip_hdr == NULL) { 12019 return (ENOMEM); 12020 } 12021 rack->r_fsb_inited = 0; 12022 return (0); 12023 } 12024 12025 static int 12026 rack_init(struct tcpcb *tp) 12027 { 12028 struct tcp_rack *rack = NULL; 12029 struct rack_sendmap *insret; 12030 uint32_t iwin, snt, us_cts; 12031 int err; 12032 12033 tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT); 12034 if (tp->t_fb_ptr == NULL) { 12035 /* 12036 * We need to allocate memory but cant. The INP and INP_INFO 12037 * locks and they are recusive (happens during setup. 
So a 12038 * scheme to drop the locks fails :( 12039 * 12040 */ 12041 return (ENOMEM); 12042 } 12043 memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack)); 12044 12045 rack = (struct tcp_rack *)tp->t_fb_ptr; 12046 RB_INIT(&rack->r_ctl.rc_mtree); 12047 TAILQ_INIT(&rack->r_ctl.rc_free); 12048 TAILQ_INIT(&rack->r_ctl.rc_tmap); 12049 rack->rc_tp = tp; 12050 rack->rc_inp = tp->t_inpcb; 12051 /* Set the flag */ 12052 rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; 12053 /* Probably not needed but lets be sure */ 12054 rack_clear_rate_sample(rack); 12055 /* 12056 * Save off the default values, socket options will poke 12057 * at these if pacing is not on or we have not yet 12058 * reached where pacing is on (gp_ready/fixed enabled). 12059 * When they get set into the CC module (when gp_ready 12060 * is enabled or we enable fixed) then we will set these 12061 * values into the CC and place in here the old values 12062 * so we have a restoral. Then we will set the flag 12063 * rc_pacing_cc_set. That way whenever we turn off pacing 12064 * or switch off this stack, we will know to go restore 12065 * the saved values. 12066 */ 12067 rack->r_ctl.rc_saved_beta.beta = V_newreno_beta_ecn; 12068 rack->r_ctl.rc_saved_beta.beta_ecn = V_newreno_beta_ecn; 12069 /* We want abe like behavior as well */ 12070 rack->r_ctl.rc_saved_beta.newreno_flags = CC_NEWRENO_BETA_ECN; 12071 rack->r_ctl.rc_reorder_fade = rack_reorder_fade; 12072 rack->rc_allow_data_af_clo = rack_ignore_data_after_close; 12073 rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh; 12074 if (use_rack_rr) 12075 rack->use_rack_rr = 1; 12076 if (V_tcp_delack_enabled) 12077 tp->t_delayed_ack = 1; 12078 else 12079 tp->t_delayed_ack = 0; 12080 #ifdef TCP_ACCOUNTING 12081 if (rack_tcp_accounting) { 12082 tp->t_flags2 |= TF2_TCP_ACCOUNTING; 12083 } 12084 #endif 12085 if (rack_enable_shared_cwnd) 12086 rack->rack_enable_scwnd = 1; 12087 rack->rc_user_set_max_segs = rack_hptsi_segments; 12088 rack->rc_force_max_seg = 0; 12089 if (rack_use_imac_dack) 12090 rack->rc_dack_mode = 1; 12091 TAILQ_INIT(&rack->r_ctl.opt_list); 12092 rack->r_ctl.rc_reorder_shift = rack_reorder_thresh; 12093 rack->r_ctl.rc_pkt_delay = rack_pkt_delay; 12094 rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp; 12095 rack->r_ctl.rc_lowest_us_rtt = 0xffffffff; 12096 rack->r_ctl.rc_highest_us_rtt = 0; 12097 rack->r_ctl.bw_rate_cap = rack_bw_rate_cap; 12098 rack->r_ctl.timer_slop = TICKS_2_USEC(tcp_rexmit_slop); 12099 if (rack_use_cmp_acks) 12100 rack->r_use_cmp_ack = 1; 12101 if (rack_disable_prr) 12102 rack->rack_no_prr = 1; 12103 if (rack_gp_no_rec_chg) 12104 rack->rc_gp_no_rec_chg = 1; 12105 if (rack_pace_every_seg && tcp_can_enable_pacing()) { 12106 rack->rc_always_pace = 1; 12107 if (rack->use_fixed_rate || rack->gp_ready) 12108 rack_set_cc_pacing(rack); 12109 } else 12110 rack->rc_always_pace = 0; 12111 if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack) 12112 rack->r_mbuf_queue = 1; 12113 else 12114 rack->r_mbuf_queue = 0; 12115 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 12116 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 12117 else 12118 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 12119 rack_set_pace_segments(tp, rack, __LINE__, NULL); 12120 if (rack_limits_scwnd) 12121 rack->r_limit_scw = 1; 12122 else 12123 rack->r_limit_scw = 0; 12124 rack->rc_labc = V_tcp_abc_l_var; 12125 rack->r_ctl.rc_high_rwnd = tp->snd_wnd; 12126 rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 12127 rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method; 12128 
rack->rack_tlp_threshold_use = rack_tlp_threshold_use; 12129 rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; 12130 rack->r_ctl.rc_min_to = rack_min_to; 12131 microuptime(&rack->r_ctl.act_rcv_time); 12132 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; 12133 rack->r_running_late = 0; 12134 rack->r_running_early = 0; 12135 rack->rc_init_win = rack_default_init_window; 12136 rack->r_ctl.rack_per_of_gp_ss = rack_per_of_gp_ss; 12137 if (rack_hw_up_only) 12138 rack->r_up_only = 1; 12139 if (rack_do_dyn_mul) { 12140 /* When dynamic adjustment is on CA needs to start at 100% */ 12141 rack->rc_gp_dyn_mul = 1; 12142 if (rack_do_dyn_mul >= 100) 12143 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul; 12144 } else 12145 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca; 12146 rack->r_ctl.rack_per_of_gp_rec = rack_per_of_gp_rec; 12147 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 12148 rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time); 12149 setup_time_filter_small(&rack->r_ctl.rc_gp_min_rtt, FILTER_TYPE_MIN, 12150 rack_probertt_filter_life); 12151 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 12152 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 12153 rack->r_ctl.rc_time_of_last_probertt = us_cts; 12154 rack->r_ctl.challenge_ack_ts = tcp_ts_getticks(); 12155 rack->r_ctl.rc_time_probertt_starts = 0; 12156 /* We require at least one measurement, even if the sysctl is 0 */ 12157 if (rack_req_measurements) 12158 rack->r_ctl.req_measurements = rack_req_measurements; 12159 else 12160 rack->r_ctl.req_measurements = 1; 12161 if (rack_enable_hw_pacing) 12162 rack->rack_hdw_pace_ena = 1; 12163 if (rack_hw_rate_caps) 12164 rack->r_rack_hw_rate_caps = 1; 12165 /* Do we force on detection? */ 12166 #ifdef NETFLIX_EXP_DETECTION 12167 if (tcp_force_detection) 12168 rack->do_detection = 1; 12169 else 12170 #endif 12171 rack->do_detection = 0; 12172 if (rack_non_rxt_use_cr) 12173 rack->rack_rec_nonrxt_use_cr = 1; 12174 err = rack_init_fsb(tp, rack); 12175 if (err) { 12176 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 12177 tp->t_fb_ptr = NULL; 12178 return (err); 12179 } 12180 if (tp->snd_una != tp->snd_max) { 12181 /* Create a send map for the current outstanding data */ 12182 struct rack_sendmap *rsm; 12183 12184 rsm = rack_alloc(rack); 12185 if (rsm == NULL) { 12186 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 12187 tp->t_fb_ptr = NULL; 12188 return (ENOMEM); 12189 } 12190 rsm->r_no_rtt_allowed = 1; 12191 rsm->r_tim_lastsent[0] = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); 12192 rsm->r_rtr_cnt = 1; 12193 rsm->r_rtr_bytes = 0; 12194 if (tp->t_flags & TF_SENTFIN) { 12195 rsm->r_end = tp->snd_max - 1; 12196 rsm->r_flags |= RACK_HAS_FIN; 12197 } else { 12198 rsm->r_end = tp->snd_max; 12199 } 12200 if (tp->snd_una == tp->iss) { 12201 /* The data space is one beyond snd_una */ 12202 rsm->r_flags |= RACK_HAS_SYN; 12203 rsm->r_start = tp->iss; 12204 rsm->r_end = rsm->r_start + (tp->snd_max - tp->snd_una); 12205 } else 12206 rsm->r_start = tp->snd_una; 12207 rsm->r_dupack = 0; 12208 if (rack->rc_inp->inp_socket->so_snd.sb_mb != NULL) { 12209 rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 0, &rsm->soff); 12210 if (rsm->m) 12211 rsm->orig_m_len = rsm->m->m_len; 12212 else 12213 rsm->orig_m_len = 0; 12214 } else { 12215 /* 12216 * This can happen if we have a stand-alone FIN or 12217 * SYN. 
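 * (For example, a connection that has sent only a stand-alone FIN has
 * snd_una != snd_max but nothing left in the socket buffer, so there is
 * no mbuf to anchor the entry to and we leave m NULL with soff 0.)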
12218 */ 12219 rsm->m = NULL; 12220 rsm->orig_m_len = 0; 12221 rsm->soff = 0; 12222 } 12223 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 12224 #ifdef INVARIANTS 12225 if (insret != NULL) { 12226 panic("Insert in rb tree fails ret:%p rack:%p rsm:%p", 12227 insret, rack, rsm); 12228 } 12229 #endif 12230 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 12231 rsm->r_in_tmap = 1; 12232 } 12233 /* 12234 * Timers in Rack are kept in microseconds so lets 12235 * convert any initial incoming variables 12236 * from ticks into usecs. Note that we 12237 * also change the values of t_srtt and t_rttvar, if 12238 * they are non-zero. They are kept with a 5 12239 * bit decimal so we have to carefully convert 12240 * these to get the full precision. 12241 */ 12242 rack_convert_rtts(tp); 12243 tp->t_rttlow = TICKS_2_USEC(tp->t_rttlow); 12244 if (rack_def_profile) 12245 rack_set_profile(rack, rack_def_profile); 12246 /* Cancel the GP measurement in progress */ 12247 tp->t_flags &= ~TF_GPUTINPROG; 12248 if (SEQ_GT(tp->snd_max, tp->iss)) 12249 snt = tp->snd_max - tp->iss; 12250 else 12251 snt = 0; 12252 iwin = rc_init_window(rack); 12253 if (snt < iwin) { 12254 /* We are not past the initial window 12255 * so we need to make sure cwnd is 12256 * correct. 12257 */ 12258 if (tp->snd_cwnd < iwin) 12259 tp->snd_cwnd = iwin; 12260 /* 12261 * If we are within the initial window 12262 * we want ssthresh to be unlimited. Setting 12263 * it to the rwnd (which the default stack does 12264 * and older racks) is not really a good idea 12265 * since we want to be in SS and grow both the 12266 * cwnd and the rwnd (via dynamic rwnd growth). If 12267 * we set it to the rwnd then as the peer grows its 12268 * rwnd we will be stuck in CA and never hit SS. 12269 * 12270 * Its far better to raise it up high (this takes the 12271 * risk that there as been a loss already, probably 12272 * we should have an indicator in all stacks of loss 12273 * but we don't), but considering the normal use this 12274 * is a risk worth taking. The consequences of not 12275 * hitting SS are far worse than going one more time 12276 * into it early on (before we have sent even a IW). 12277 * It is highly unlikely that we will have had a loss 12278 * before getting the IW out. 12279 */ 12280 tp->snd_ssthresh = 0xffffffff; 12281 } 12282 rack_stop_all_timers(tp); 12283 /* Lets setup the fsb block */ 12284 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); 12285 rack_log_rtt_shrinks(rack, us_cts, tp->t_rxtcur, 12286 __LINE__, RACK_RTTS_INIT); 12287 return (0); 12288 } 12289 12290 static int 12291 rack_handoff_ok(struct tcpcb *tp) 12292 { 12293 if ((tp->t_state == TCPS_CLOSED) || 12294 (tp->t_state == TCPS_LISTEN)) { 12295 /* Sure no problem though it may not stick */ 12296 return (0); 12297 } 12298 if ((tp->t_state == TCPS_SYN_SENT) || 12299 (tp->t_state == TCPS_SYN_RECEIVED)) { 12300 /* 12301 * We really don't know if you support sack, 12302 * you have to get to ESTAB or beyond to tell. 12303 */ 12304 return (EAGAIN); 12305 } 12306 if ((tp->t_flags & TF_SENTFIN) && ((tp->snd_max - tp->snd_una) > 1)) { 12307 /* 12308 * Rack will only send a FIN after all data is acknowledged. 12309 * So in this case we have more data outstanding. 
We can't 12310 * switch stacks until either all data and only the FIN 12311 * is left (in which case rack_init() now knows how 12312 * to deal with that) <or> all is acknowledged and we 12313 * are only left with incoming data, though why you 12314 * would want to switch to rack after all data is acknowledged 12315 * I have no idea (rrs)! 12316 */ 12317 return (EAGAIN); 12318 } 12319 if ((tp->t_flags & TF_SACK_PERMIT) || rack_sack_not_required){ 12320 return (0); 12321 } 12322 /* 12323 * If we reach here we don't do SACK on this connection so we can 12324 * never do rack. 12325 */ 12326 return (EINVAL); 12327 } 12328 12329 12330 static void 12331 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) 12332 { 12333 int ack_cmp = 0; 12334 12335 if (tp->t_fb_ptr) { 12336 struct tcp_rack *rack; 12337 struct rack_sendmap *rsm, *nrsm, *rm; 12338 12339 rack = (struct tcp_rack *)tp->t_fb_ptr; 12340 if (tp->t_in_pkt) { 12341 /* 12342 * It is unsafe to process the packets since a 12343 * reset may be lurking in them (it's rare but it 12344 * can occur). If we were to find a RST, then we 12345 * would end up dropping the connection and the 12346 * INP lock, so when we return the caller (tcp_usrreq) 12347 * will blow up when it tries to unlock the inp. 12348 */ 12349 struct mbuf *save, *m; 12350 12351 m = tp->t_in_pkt; 12352 tp->t_in_pkt = NULL; 12353 tp->t_tail_pkt = NULL; 12354 while (m) { 12355 save = m->m_nextpkt; 12356 m->m_nextpkt = NULL; 12357 m_freem(m); 12358 m = save; 12359 } 12360 if ((tp->t_inpcb) && 12361 (tp->t_inpcb->inp_flags2 & INP_MBUF_ACKCMP)) 12362 ack_cmp = 1; 12363 if (ack_cmp) { 12364 /* Total if we used large or small (if ack-cmp was used). */ 12365 if (rack->rc_inp->inp_flags2 & INP_MBUF_L_ACKS) 12366 counter_u64_add(rack_large_ackcmp, 1); 12367 else 12368 counter_u64_add(rack_small_ackcmp, 1); 12369 } 12370 } 12371 tp->t_flags &= ~TF_FORCEDATA; 12372 #ifdef NETFLIX_SHARED_CWND 12373 if (rack->r_ctl.rc_scw) { 12374 uint32_t limit; 12375 12376 if (rack->r_limit_scw) 12377 limit = max(1, rack->r_ctl.rc_lowest_us_rtt); 12378 else 12379 limit = 0; 12380 tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw, 12381 rack->r_ctl.rc_scw_index, 12382 limit); 12383 rack->r_ctl.rc_scw = NULL; 12384 } 12385 #endif 12386 if (rack->r_ctl.fsb.tcp_ip_hdr) { 12387 free(rack->r_ctl.fsb.tcp_ip_hdr, M_TCPFSB); 12388 rack->r_ctl.fsb.tcp_ip_hdr = NULL; 12389 rack->r_ctl.fsb.th = NULL; 12390 } 12391 /* Convert srtt and rttvar back to ticks, keeping the fractional remainder. */ 12392 if (tp->t_srtt > 1) { 12393 uint32_t val, frac; 12394 12395 val = USEC_2_TICKS(tp->t_srtt); 12396 frac = tp->t_srtt % (HPTS_USEC_IN_SEC / hz); 12397 tp->t_srtt = val << TCP_RTT_SHIFT; 12398 /* 12399 * frac is the fractional part that is left 12400 * over from converting to hz and shifting. 12401 * We need to convert this to the 5 bit 12402 * remainder. 12403 */ 12404 if (frac) { 12405 if (hz == 1000) { 12406 frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC); 12407 } else { 12408 frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC); 12409 } 12410 tp->t_srtt += frac; 12411 } 12412 } 12413 if (tp->t_rttvar) { 12414 uint32_t val, frac; 12415 12416 val = USEC_2_TICKS(tp->t_rttvar); 12417 frac = tp->t_rttvar % (HPTS_USEC_IN_SEC / hz); 12418 tp->t_rttvar = val << TCP_RTTVAR_SHIFT; 12419 /* 12420 * frac is the fractional part that is left 12421 * over from converting to hz and shifting. 12422 * We need to convert this to the 5 bit 12423 * remainder.
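 * (Taking the srtt conversion above as an example: with hz = 1000 and
 * t_srtt = 30250 usec, val = 30 ticks and frac = 250 usec;
 * 250 * TCP_RTT_SCALE / 1000 = 8, so t_srtt ends up as
 * (30 << TCP_RTT_SHIFT) + 8 = 968, i.e. 30.25 ticks in the fixed point
 * form the default stack expects.)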
12424 */ 12425 if (frac) { 12426 if (hz == 1000) { 12427 frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC); 12428 } else { 12429 frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC); 12430 } 12431 tp->t_rttvar += frac; 12432 } 12433 } 12434 tp->t_rxtcur = USEC_2_TICKS(tp->t_rxtcur); 12435 tp->t_rttlow = USEC_2_TICKS(tp->t_rttlow); 12436 if (rack->rc_always_pace) { 12437 tcp_decrement_paced_conn(); 12438 rack_undo_cc_pacing(rack); 12439 rack->rc_always_pace = 0; 12440 } 12441 /* Clean up any options if they were not applied */ 12442 while (!TAILQ_EMPTY(&rack->r_ctl.opt_list)) { 12443 struct deferred_opt_list *dol; 12444 12445 dol = TAILQ_FIRST(&rack->r_ctl.opt_list); 12446 TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next); 12447 free(dol, M_TCPDO); 12448 } 12449 /* rack does not use force data but other stacks may clear it */ 12450 if (rack->r_ctl.crte != NULL) { 12451 tcp_rel_pacing_rate(rack->r_ctl.crte, tp); 12452 rack->rack_hdrw_pacing = 0; 12453 rack->r_ctl.crte = NULL; 12454 } 12455 #ifdef TCP_BLACKBOX 12456 tcp_log_flowend(tp); 12457 #endif 12458 RB_FOREACH_SAFE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm) { 12459 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 12460 #ifdef INVARIANTS 12461 if (rm != rsm) { 12462 panic("At fini, rack:%p rsm:%p rm:%p", 12463 rack, rsm, rm); 12464 } 12465 #endif 12466 uma_zfree(rack_zone, rsm); 12467 } 12468 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 12469 while (rsm) { 12470 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 12471 uma_zfree(rack_zone, rsm); 12472 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 12473 } 12474 rack->rc_free_cnt = 0; 12475 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 12476 tp->t_fb_ptr = NULL; 12477 } 12478 if (tp->t_inpcb) { 12479 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 12480 tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY; 12481 tp->t_inpcb->inp_flags2 &= ~INP_DONT_SACK_QUEUE; 12482 tp->t_inpcb->inp_flags2 &= ~INP_MBUF_ACKCMP; 12483 /* Cancel the GP measurement in progress */ 12484 tp->t_flags &= ~TF_GPUTINPROG; 12485 tp->t_inpcb->inp_flags2 &= ~INP_MBUF_L_ACKS; 12486 } 12487 /* Make sure snd_nxt is correctly set */ 12488 tp->snd_nxt = tp->snd_max; 12489 } 12490 12491 static void 12492 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) 12493 { 12494 switch (tp->t_state) { 12495 case TCPS_SYN_SENT: 12496 rack->r_state = TCPS_SYN_SENT; 12497 rack->r_substate = rack_do_syn_sent; 12498 break; 12499 case TCPS_SYN_RECEIVED: 12500 rack->r_state = TCPS_SYN_RECEIVED; 12501 rack->r_substate = rack_do_syn_recv; 12502 break; 12503 case TCPS_ESTABLISHED: 12504 rack_set_pace_segments(tp, rack, __LINE__, NULL); 12505 rack->r_state = TCPS_ESTABLISHED; 12506 rack->r_substate = rack_do_established; 12507 break; 12508 case TCPS_CLOSE_WAIT: 12509 rack_set_pace_segments(tp, rack, __LINE__, NULL); 12510 rack->r_state = TCPS_CLOSE_WAIT; 12511 rack->r_substate = rack_do_close_wait; 12512 break; 12513 case TCPS_FIN_WAIT_1: 12514 rack_set_pace_segments(tp, rack, __LINE__, NULL); 12515 rack->r_state = TCPS_FIN_WAIT_1; 12516 rack->r_substate = rack_do_fin_wait_1; 12517 break; 12518 case TCPS_CLOSING: 12519 rack_set_pace_segments(tp, rack, __LINE__, NULL); 12520 rack->r_state = TCPS_CLOSING; 12521 rack->r_substate = rack_do_closing; 12522 break; 12523 case TCPS_LAST_ACK: 12524 rack_set_pace_segments(tp, rack, __LINE__, NULL); 12525 rack->r_state = TCPS_LAST_ACK; 12526 rack->r_substate = rack_do_lastack; 12527 break; 12528 case TCPS_FIN_WAIT_2: 12529 
rack_set_pace_segments(tp, rack, __LINE__, NULL); 12530 rack->r_state = TCPS_FIN_WAIT_2; 12531 rack->r_substate = rack_do_fin_wait_2; 12532 break; 12533 case TCPS_LISTEN: 12534 case TCPS_CLOSED: 12535 case TCPS_TIME_WAIT: 12536 default: 12537 break; 12538 }; 12539 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) 12540 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; 12541 12542 } 12543 12544 static void 12545 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) 12546 { 12547 /* 12548 * We received an ack, and then did not 12549 * call send or were bounced out due to the 12550 * hpts was running. Now a timer is up as well, is 12551 * it the right timer? 12552 */ 12553 struct rack_sendmap *rsm; 12554 int tmr_up; 12555 12556 tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 12557 if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT)) 12558 return; 12559 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 12560 if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) && 12561 (tmr_up == PACE_TMR_RXT)) { 12562 /* Should be an RXT */ 12563 return; 12564 } 12565 if (rsm == NULL) { 12566 /* Nothing outstanding? */ 12567 if (tp->t_flags & TF_DELACK) { 12568 if (tmr_up == PACE_TMR_DELACK) 12569 /* We are supposed to have delayed ack up and we do */ 12570 return; 12571 } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) { 12572 /* 12573 * if we hit enobufs then we would expect the possiblity 12574 * of nothing outstanding and the RXT up (and the hptsi timer). 12575 */ 12576 return; 12577 } else if (((V_tcp_always_keepalive || 12578 rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 12579 (tp->t_state <= TCPS_CLOSING)) && 12580 (tmr_up == PACE_TMR_KEEP) && 12581 (tp->snd_max == tp->snd_una)) { 12582 /* We should have keep alive up and we do */ 12583 return; 12584 } 12585 } 12586 if (SEQ_GT(tp->snd_max, tp->snd_una) && 12587 ((tmr_up == PACE_TMR_TLP) || 12588 (tmr_up == PACE_TMR_RACK) || 12589 (tmr_up == PACE_TMR_RXT))) { 12590 /* 12591 * Either a Rack, TLP or RXT is fine if we 12592 * have outstanding data. 12593 */ 12594 return; 12595 } else if (tmr_up == PACE_TMR_DELACK) { 12596 /* 12597 * If the delayed ack was going to go off 12598 * before the rtx/tlp/rack timer were going to 12599 * expire, then that would be the timer in control. 12600 * Note we don't check the time here trusting the 12601 * code is correct. 12602 */ 12603 return; 12604 } 12605 /* 12606 * Ok the timer originally started is not what we want now. 12607 * We will force the hpts to be stopped if any, and restart 12608 * with the slot set to what was in the saved slot. 
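 * To recap the checks above, the pairings we accept as-is are: the
 * persist timer while in persists, RXT with nothing outstanding (or
 * before ESTABLISHED, or with data still queued after ENOBUFS), the
 * delayed ack timer when a delayed ack is pending, keepalive on an
 * idle connection, and RACK/TLP/RXT whenever data is outstanding.
 * Anything else falls through to the cancel and restart below.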
12609 */ 12610 if (rack->rc_inp->inp_in_hpts) { 12611 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 12612 uint32_t us_cts; 12613 12614 us_cts = tcp_get_usecs(NULL); 12615 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 12616 rack->r_early = 1; 12617 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); 12618 } 12619 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 12620 } 12621 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); 12622 } 12623 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 12624 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); 12625 } 12626 12627 12628 static void 12629 rack_do_win_updates(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tiwin, uint32_t seq, uint32_t ack, uint32_t cts, uint32_t high_seq) 12630 { 12631 tp->snd_wnd = tiwin; 12632 rack_validate_fo_sendwin_up(tp, rack); 12633 tp->snd_wl1 = seq; 12634 tp->snd_wl2 = ack; 12635 if (tp->snd_wnd > tp->max_sndwnd) 12636 tp->max_sndwnd = tp->snd_wnd; 12637 if (tp->snd_wnd < (tp->snd_max - high_seq)) { 12638 /* The peer collapsed the window */ 12639 rack_collapsed_window(rack); 12640 } else if (rack->rc_has_collapsed) 12641 rack_un_collapse_window(rack); 12642 /* Do we exit persists? */ 12643 if ((rack->rc_in_persist != 0) && 12644 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 12645 rack->r_ctl.rc_pace_min_segs))) { 12646 rack_exit_persist(tp, rack, cts); 12647 } 12648 /* Do we enter persists? */ 12649 if ((rack->rc_in_persist == 0) && 12650 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 12651 TCPS_HAVEESTABLISHED(tp->t_state) && 12652 (tp->snd_max == tp->snd_una) && 12653 sbavail(&tp->t_inpcb->inp_socket->so_snd) && 12654 (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { 12655 /* 12656 * Here the rwnd is less than 12657 * the pacing size, we are established, 12658 * nothing is outstanding, and there is 12659 * data to send. Enter persists. 
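 * As an example, with rc_high_rwnd = 65535 and rc_pace_min_segs = 1448
 * the threshold is min(32767, 1448) = 1448 bytes: we enter persists
 * when the advertised window drops below 1448 while data is queued and
 * nothing is in flight, and exit again (above) once the window is at
 * least 1448.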
12660 */ 12661 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 12662 } 12663 } 12664 12665 static void 12666 rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent *ae, int ackval, uint32_t high_seq) 12667 { 12668 12669 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 12670 union tcp_log_stackspecific log; 12671 struct timeval ltv; 12672 char tcp_hdr_buf[60]; 12673 struct tcphdr *th; 12674 struct timespec ts; 12675 uint32_t orig_snd_una; 12676 uint8_t xx = 0; 12677 12678 #ifdef NETFLIX_HTTP_LOGGING 12679 struct http_sendfile_track *http_req; 12680 12681 if (SEQ_GT(ae->ack, tp->snd_una)) { 12682 http_req = tcp_http_find_req_for_seq(tp, (ae->ack-1)); 12683 } else { 12684 http_req = tcp_http_find_req_for_seq(tp, ae->ack); 12685 } 12686 #endif 12687 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 12688 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 12689 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 12690 if (rack->rack_no_prr == 0) 12691 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 12692 else 12693 log.u_bbr.flex1 = 0; 12694 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 12695 log.u_bbr.use_lt_bw <<= 1; 12696 log.u_bbr.use_lt_bw |= rack->r_might_revert; 12697 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; 12698 log.u_bbr.inflight = ctf_flight_size(tp, rack->r_ctl.rc_sacked); 12699 log.u_bbr.pkts_out = tp->t_maxseg; 12700 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 12701 log.u_bbr.flex7 = 1; 12702 log.u_bbr.lost = ae->flags; 12703 log.u_bbr.cwnd_gain = ackval; 12704 log.u_bbr.pacing_gain = 0x2; 12705 if (ae->flags & TSTMP_HDWR) { 12706 /* Record the hardware timestamp if present */ 12707 log.u_bbr.flex3 = M_TSTMP; 12708 ts.tv_sec = ae->timestamp / 1000000000; 12709 ts.tv_nsec = ae->timestamp % 1000000000; 12710 ltv.tv_sec = ts.tv_sec; 12711 ltv.tv_usec = ts.tv_nsec / 1000; 12712 log.u_bbr.lt_epoch = tcp_tv_to_usectick(<v); 12713 } else if (ae->flags & TSTMP_LRO) { 12714 /* Record the LRO the arrival timestamp */ 12715 log.u_bbr.flex3 = M_TSTMP_LRO; 12716 ts.tv_sec = ae->timestamp / 1000000000; 12717 ts.tv_nsec = ae->timestamp % 1000000000; 12718 ltv.tv_sec = ts.tv_sec; 12719 ltv.tv_usec = ts.tv_nsec / 1000; 12720 log.u_bbr.flex5 = tcp_tv_to_usectick(<v); 12721 } 12722 log.u_bbr.timeStamp = tcp_get_usecs(<v); 12723 /* Log the rcv time */ 12724 log.u_bbr.delRate = ae->timestamp; 12725 #ifdef NETFLIX_HTTP_LOGGING 12726 log.u_bbr.applimited = tp->t_http_closed; 12727 log.u_bbr.applimited <<= 8; 12728 log.u_bbr.applimited |= tp->t_http_open; 12729 log.u_bbr.applimited <<= 8; 12730 log.u_bbr.applimited |= tp->t_http_req; 12731 if (http_req) { 12732 /* Copy out any client req info */ 12733 /* seconds */ 12734 log.u_bbr.pkt_epoch = (http_req->localtime / HPTS_USEC_IN_SEC); 12735 /* useconds */ 12736 log.u_bbr.delivered = (http_req->localtime % HPTS_USEC_IN_SEC); 12737 log.u_bbr.rttProp = http_req->timestamp; 12738 log.u_bbr.cur_del_rate = http_req->start; 12739 if (http_req->flags & TCP_HTTP_TRACK_FLG_OPEN) { 12740 log.u_bbr.flex8 |= 1; 12741 } else { 12742 log.u_bbr.flex8 |= 2; 12743 log.u_bbr.bw_inuse = http_req->end; 12744 } 12745 log.u_bbr.flex6 = http_req->start_seq; 12746 if (http_req->flags & TCP_HTTP_TRACK_FLG_COMP) { 12747 log.u_bbr.flex8 |= 4; 12748 log.u_bbr.epoch = http_req->end_seq; 12749 } 12750 } 12751 #endif 12752 memset(tcp_hdr_buf, 0, sizeof(tcp_hdr_buf)); 12753 th = (struct tcphdr *)tcp_hdr_buf; 12754 th->th_seq = ae->seq; 12755 th->th_ack = ae->ack; 12756 th->th_win = ae->win; 12757 /* Now fill in the ports */ 12758 th->th_sport = tp->t_inpcb->inp_fport; 12759 
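		/*
		 * The port orientation here mimics the segment as it
		 * arrived: the source port is the peer's (foreign) port
		 * and the destination below is our local port, so the
		 * logged header reads like the inbound TCP header.
		 */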
th->th_dport = tp->t_inpcb->inp_lport; 12760 th->th_flags = ae->flags & 0xff; 12761 /* Now do we have a timestamp option? */ 12762 if (ae->flags & HAS_TSTMP) { 12763 u_char *cp; 12764 uint32_t val; 12765 12766 th->th_off = ((sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA) >> 2); 12767 cp = (u_char *)(th + 1); 12768 *cp = TCPOPT_NOP; 12769 cp++; 12770 *cp = TCPOPT_NOP; 12771 cp++; 12772 *cp = TCPOPT_TIMESTAMP; 12773 cp++; 12774 *cp = TCPOLEN_TIMESTAMP; 12775 cp++; 12776 val = htonl(ae->ts_value); 12777 bcopy((char *)&val, 12778 (char *)cp, sizeof(uint32_t)); 12779 val = htonl(ae->ts_echo); 12780 bcopy((char *)&val, 12781 (char *)(cp + 4), sizeof(uint32_t)); 12782 } else 12783 th->th_off = (sizeof(struct tcphdr) >> 2); 12784 12785 /* 12786 * For sane logging we need to play a little trick. 12787 * If the ack were fully processed we would have moved 12788 * snd_una to high_seq, but since compressed acks are 12789 * processed in two phases, at this point (logging) snd_una 12790 * won't be advanced. So we would see multiple acks showing 12791 * the advancement. We can prevent that by "pretending" that 12792 * snd_una was advanced and then un-advancing it so that the 12793 * logging code has the right value for tlb_snd_una. 12794 */ 12795 if (tp->snd_una != high_seq) { 12796 orig_snd_una = tp->snd_una; 12797 tp->snd_una = high_seq; 12798 xx = 1; 12799 } else 12800 xx = 0; 12801 TCP_LOG_EVENTP(tp, th, 12802 &tp->t_inpcb->inp_socket->so_rcv, 12803 &tp->t_inpcb->inp_socket->so_snd, TCP_LOG_IN, 0, 12804 0, &log, true, <v); 12805 if (xx) { 12806 tp->snd_una = orig_snd_una; 12807 } 12808 } 12809 12810 } 12811 12812 static int 12813 rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mbuf *m, int nxt_pkt, struct timeval *tv) 12814 { 12815 /* 12816 * Handle a "special" compressed ack mbuf. Each incoming 12817 * ack has only four possible dispositions: 12818 * 12819 * A) It moves the cum-ack forward 12820 * B) It is behind the cum-ack. 12821 * C) It is a window-update ack. 12822 * D) It is a dup-ack. 12823 * 12824 * Note that we can have between 1 -> TCP_COMP_ACK_ENTRIES 12825 * in the incoming mbuf. We also need to still pay attention 12826 * to nxt_pkt since there may be another packet after this 12827 * one. 
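 * For illustration, if high_seq (snd_una) is 1000 and the current
 * window is 65535: an entry acking 900 is case B (behind), one acking
 * 1500 is case A (cum-ack moved), one acking 1000 with a window of
 * 65535 is case D (dup-ack), and one acking 1000 with a window of
 * 70000 is case C (window update).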
12828 */ 12829 #ifdef TCP_ACCOUNTING 12830 uint64_t ts_val; 12831 uint64_t rdstc; 12832 #endif 12833 int segsiz; 12834 struct timespec ts; 12835 struct tcp_rack *rack; 12836 struct tcp_ackent *ae; 12837 uint32_t tiwin, us_cts, cts, acked, acked_amount, high_seq, win_seq, the_win, win_upd_ack; 12838 int cnt, i, did_out, ourfinisacked = 0; 12839 int win_up_req = 0; 12840 struct tcpopt to_holder, *to = NULL; 12841 int nsegs = 0; 12842 int under_pacing = 1; 12843 int recovery = 0; 12844 int idx; 12845 #ifdef TCP_ACCOUNTING 12846 sched_pin(); 12847 #endif 12848 rack = (struct tcp_rack *)tp->t_fb_ptr; 12849 if (rack->gp_ready && 12850 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) 12851 under_pacing = 0; 12852 else 12853 under_pacing = 1; 12854 12855 if (rack->r_state != tp->t_state) 12856 rack_set_state(tp, rack); 12857 to = &to_holder; 12858 to->to_flags = 0; 12859 KASSERT((m->m_len >= sizeof(struct tcp_ackent)), 12860 ("tp:%p m_cmpack:%p with invalid len:%u", tp, m, m->m_len)); 12861 cnt = m->m_len / sizeof(struct tcp_ackent); 12862 idx = cnt / 5; 12863 if (idx >= MAX_NUM_OF_CNTS) 12864 idx = MAX_NUM_OF_CNTS - 1; 12865 counter_u64_add(rack_proc_comp_ack[idx], 1); 12866 counter_u64_add(rack_multi_single_eq, cnt); 12867 high_seq = tp->snd_una; 12868 the_win = tp->snd_wnd; 12869 win_seq = tp->snd_wl1; 12870 win_upd_ack = tp->snd_wl2; 12871 cts = us_cts = tcp_tv_to_usectick(tv); 12872 segsiz = ctf_fixed_maxseg(tp); 12873 if ((rack->rc_gp_dyn_mul) && 12874 (rack->use_fixed_rate == 0) && 12875 (rack->rc_always_pace)) { 12876 /* Check in on probertt */ 12877 rack_check_probe_rtt(rack, us_cts); 12878 } 12879 for (i = 0; i < cnt; i++) { 12880 #ifdef TCP_ACCOUNTING 12881 ts_val = get_cyclecount(); 12882 #endif 12883 rack_clear_rate_sample(rack); 12884 ae = ((mtod(m, struct tcp_ackent *)) + i); 12885 /* Setup the window */ 12886 tiwin = ae->win << tp->snd_scale; 12887 /* figure out the type of ack */ 12888 if (SEQ_LT(ae->ack, high_seq)) { 12889 /* Case B*/ 12890 ae->ack_val_set = ACK_BEHIND; 12891 } else if (SEQ_GT(ae->ack, high_seq)) { 12892 /* Case A */ 12893 ae->ack_val_set = ACK_CUMACK; 12894 } else if (tiwin == the_win) { 12895 /* Case D */ 12896 ae->ack_val_set = ACK_DUPACK; 12897 } else { 12898 /* Case C */ 12899 ae->ack_val_set = ACK_RWND; 12900 } 12901 rack_log_input_packet(tp, rack, ae, ae->ack_val_set, high_seq); 12902 /* Validate timestamp */ 12903 if (ae->flags & HAS_TSTMP) { 12904 /* Setup for a timestamp */ 12905 to->to_flags = TOF_TS; 12906 ae->ts_echo -= tp->ts_offset; 12907 to->to_tsecr = ae->ts_echo; 12908 to->to_tsval = ae->ts_value; 12909 /* 12910 * If echoed timestamp is later than the current time, fall back to 12911 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 12912 * were used when this connection was established. 
12913 */ 12914 if (TSTMP_GT(ae->ts_echo, cts)) 12915 ae->ts_echo = 0; 12916 if (tp->ts_recent && 12917 TSTMP_LT(ae->ts_value, tp->ts_recent)) { 12918 if (ctf_ts_check_ac(tp, (ae->flags & 0xff))) { 12919 #ifdef TCP_ACCOUNTING 12920 rdstc = get_cyclecount(); 12921 if (rdstc > ts_val) { 12922 counter_u64_add(tcp_proc_time[ae->ack_val_set] , 12923 (rdstc - ts_val)); 12924 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 12925 tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val); 12926 } 12927 } 12928 #endif 12929 continue; 12930 } 12931 } 12932 if (SEQ_LEQ(ae->seq, tp->last_ack_sent) && 12933 SEQ_LEQ(tp->last_ack_sent, ae->seq)) { 12934 tp->ts_recent_age = tcp_ts_getticks(); 12935 tp->ts_recent = ae->ts_value; 12936 } 12937 } else { 12938 /* Setup for a no options */ 12939 to->to_flags = 0; 12940 } 12941 /* Update the rcv time and perform idle reduction possibly */ 12942 if (tp->t_idle_reduce && 12943 (tp->snd_max == tp->snd_una) && 12944 ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { 12945 counter_u64_add(rack_input_idle_reduces, 1); 12946 rack_cc_after_idle(rack, tp); 12947 } 12948 tp->t_rcvtime = ticks; 12949 /* Now what about ECN? */ 12950 if (tp->t_flags2 & TF2_ECN_PERMIT) { 12951 if (ae->flags & TH_CWR) { 12952 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 12953 tp->t_flags |= TF_ACKNOW; 12954 } 12955 switch (ae->codepoint & IPTOS_ECN_MASK) { 12956 case IPTOS_ECN_CE: 12957 tp->t_flags2 |= TF2_ECN_SND_ECE; 12958 KMOD_TCPSTAT_INC(tcps_ecn_ce); 12959 break; 12960 case IPTOS_ECN_ECT0: 12961 KMOD_TCPSTAT_INC(tcps_ecn_ect0); 12962 break; 12963 case IPTOS_ECN_ECT1: 12964 KMOD_TCPSTAT_INC(tcps_ecn_ect1); 12965 break; 12966 } 12967 12968 /* Process a packet differently from RFC3168. */ 12969 cc_ecnpkt_handler_flags(tp, ae->flags, ae->codepoint); 12970 /* Congestion experienced. */ 12971 if (ae->flags & TH_ECE) { 12972 rack_cong_signal(tp, CC_ECN, ae->ack); 12973 } 12974 } 12975 #ifdef TCP_ACCOUNTING 12976 /* Count for the specific type of ack in */ 12977 counter_u64_add(tcp_cnt_counters[ae->ack_val_set], 1); 12978 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 12979 tp->tcp_cnt_counters[ae->ack_val_set]++; 12980 } 12981 #endif 12982 /* 12983 * Note how we could move up these in the determination 12984 * above, but we don't so that way the timestamp checks (and ECN) 12985 * is done first before we do any processing on the ACK. 12986 * The non-compressed path through the code has this 12987 * weakness (noted by @jtl) that it actually does some 12988 * processing before verifying the timestamp information. 12989 * We don't take that path here which is why we set 12990 * the ack_val_set first, do the timestamp and ecn 12991 * processing, and then look at what we have setup. 12992 */ 12993 if (ae->ack_val_set == ACK_BEHIND) { 12994 /* 12995 * Case B flag reordering, if window is not closed 12996 * or it could be a keep-alive or persists 12997 */ 12998 if (SEQ_LT(ae->ack, tp->snd_una) && (sbspace(&so->so_rcv) > segsiz)) { 12999 counter_u64_add(rack_reorder_seen, 1); 13000 rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 13001 } 13002 } else if (ae->ack_val_set == ACK_DUPACK) { 13003 /* Case D */ 13004 13005 rack_strike_dupack(rack); 13006 } else if (ae->ack_val_set == ACK_RWND) { 13007 /* Case C */ 13008 13009 win_up_req = 1; 13010 win_upd_ack = ae->ack; 13011 win_seq = ae->seq; 13012 the_win = tiwin; 13013 } else { 13014 /* Case A */ 13015 13016 if (SEQ_GT(ae->ack, tp->snd_max)) { 13017 /* 13018 * We just send an ack since the incoming 13019 * ack is beyond the largest seq we sent. 
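 * ctf_ack_war_checks() below decides, based on the challenge ack
 * timestamp and count we hand it, whether TF_ACKNOW may be set, so we
 * only ask for output when it actually set the flag; that keeps a
 * flood of out-of-range acks from turning into an ack war.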
13020 */ 13021 if ((tp->t_flags & TF_ACKNOW) == 0) { 13022 ctf_ack_war_checks(tp, &rack->r_ctl.challenge_ack_ts, &rack->r_ctl.challenge_ack_cnt); 13023 if (tp->t_flags & TF_ACKNOW) 13024 rack->r_wanted_output = 1; 13025 } 13026 } else { 13027 nsegs++; 13028 /* If the window changed, setup to update */ 13029 if (tiwin != tp->snd_wnd) { 13030 win_up_req = 1; 13031 win_upd_ack = ae->ack; 13032 win_seq = ae->seq; 13033 the_win = tiwin; 13034 } 13035 #ifdef TCP_ACCOUNTING 13036 /* Account for the acks */ 13037 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13038 tp->tcp_cnt_counters[CNT_OF_ACKS_IN] += (((ae->ack - high_seq) + segsiz - 1) / segsiz); 13039 } 13040 counter_u64_add(tcp_cnt_counters[CNT_OF_ACKS_IN], 13041 (((ae->ack - high_seq) + segsiz - 1) / segsiz)); 13042 #endif 13043 high_seq = ae->ack; 13044 /* Setup our act_rcv_time */ 13045 if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) { 13046 ts.tv_sec = ae->timestamp / 1000000000; 13047 ts.tv_nsec = ae->timestamp % 1000000000; 13048 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; 13049 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; 13050 } else { 13051 rack->r_ctl.act_rcv_time = *tv; 13052 } 13053 rack_process_to_cumack(tp, rack, ae->ack, cts, to); 13054 } 13055 } 13056 /* And let's be sure to commit the rtt measurements for this ack */ 13057 tcp_rack_xmit_timer_commit(rack, tp); 13058 #ifdef TCP_ACCOUNTING 13059 rdstc = get_cyclecount(); 13060 if (rdstc > ts_val) { 13061 counter_u64_add(tcp_proc_time[ae->ack_val_set] , (rdstc - ts_val)); 13062 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13063 tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val); 13064 if (ae->ack_val_set == ACK_CUMACK) 13065 tp->tcp_proc_time[CYC_HANDLE_MAP] += (rdstc - ts_val); 13066 } 13067 } 13068 #endif 13069 } 13070 #ifdef TCP_ACCOUNTING 13071 ts_val = get_cyclecount(); 13072 #endif 13073 acked_amount = acked = (high_seq - tp->snd_una); 13074 if (win_up_req) { 13075 rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts, high_seq); 13076 } 13077 if (acked) { 13078 if (rack->sack_attack_disable == 0) 13079 rack_do_decay(rack); 13080 if (acked >= segsiz) { 13081 /* 13082 * You only get credit for 13083 * MSS and greater (and you get extra 13084 * credit for larger cum-ack moves). 13085 */ 13086 int ac; 13087 13088 ac = acked / segsiz; 13089 rack->r_ctl.ack_count += ac; 13090 counter_u64_add(rack_ack_total, ac); 13091 } 13092 if (rack->r_ctl.ack_count > 0xfff00000) { 13093 /* 13094 * reduce the number to keep us under 13095 * a uint32_t. 13096 */ 13097 rack->r_ctl.ack_count /= 2; 13098 rack->r_ctl.sack_count /= 2; 13099 } 13100 if (tp->t_flags & TF_NEEDSYN) { 13101 /* 13102 * T/TCP: Connection was half-synchronized, and our SYN has 13103 * been ACK'd (so connection is now fully synchronized). Go 13104 * to non-starred state, increment snd_una for ACK of SYN, 13105 * and check if we can do window scaling. 13106 */ 13107 tp->t_flags &= ~TF_NEEDSYN; 13108 tp->snd_una++; 13109 acked_amount = acked = (high_seq - tp->snd_una); 13110 } 13111 if (acked > sbavail(&so->so_snd)) 13112 acked_amount = sbavail(&so->so_snd); 13113 #ifdef NETFLIX_EXP_DETECTION 13114 /* 13115 * We only care about a cum-ack move if we are in a sack-disabled 13116 * state. We have already added in to the ack_count, and we never 13117 * would disable on a cum-ack move, so we only care to do the 13118 * detection if it may "undo" it, i.e. we were in disabled already.
13119 */ 13120 if (rack->sack_attack_disable) 13121 rack_do_detection(tp, rack, acked_amount, segsiz); 13122 #endif 13123 if (IN_FASTRECOVERY(tp->t_flags) && 13124 (rack->rack_no_prr == 0)) 13125 rack_update_prr(tp, rack, acked_amount, high_seq); 13126 if (IN_RECOVERY(tp->t_flags)) { 13127 if (SEQ_LT(high_seq, tp->snd_recover) && 13128 (SEQ_LT(high_seq, tp->snd_max))) { 13129 tcp_rack_partialack(tp); 13130 } else { 13131 rack_post_recovery(tp, high_seq); 13132 recovery = 1; 13133 } 13134 } 13135 /* Handle the rack-log-ack part (sendmap) */ 13136 if ((sbused(&so->so_snd) == 0) && 13137 (acked > acked_amount) && 13138 (tp->t_state >= TCPS_FIN_WAIT_1) && 13139 (tp->t_flags & TF_SENTFIN)) { 13140 /* 13141 * We must be sure our fin 13142 * was sent and acked (we can be 13143 * in FIN_WAIT_1 without having 13144 * sent the fin). 13145 */ 13146 ourfinisacked = 1; 13147 /* 13148 * Lets make sure snd_una is updated 13149 * since most likely acked_amount = 0 (it 13150 * should be). 13151 */ 13152 tp->snd_una = high_seq; 13153 } 13154 /* Did we make a RTO error? */ 13155 if ((tp->t_flags & TF_PREVVALID) && 13156 ((tp->t_flags & TF_RCVD_TSTMP) == 0)) { 13157 tp->t_flags &= ~TF_PREVVALID; 13158 if (tp->t_rxtshift == 1 && 13159 (int)(ticks - tp->t_badrxtwin) < 0) 13160 rack_cong_signal(tp, CC_RTO_ERR, high_seq); 13161 } 13162 /* Handle the data in the socket buffer */ 13163 KMOD_TCPSTAT_ADD(tcps_rcvackpack, 1); 13164 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 13165 if (acked_amount > 0) { 13166 struct mbuf *mfree; 13167 13168 rack_ack_received(tp, rack, high_seq, nsegs, CC_ACK, recovery); 13169 SOCKBUF_LOCK(&so->so_snd); 13170 mfree = sbcut_locked(&so->so_snd, acked); 13171 tp->snd_una = high_seq; 13172 /* Note we want to hold the sb lock through the sendmap adjust */ 13173 rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una); 13174 /* Wake up the socket if we have room to write more */ 13175 rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); 13176 sowwakeup_locked(so); 13177 m_freem(mfree); 13178 } 13179 /* update progress */ 13180 tp->t_acktime = ticks; 13181 rack_log_progress_event(rack, tp, tp->t_acktime, 13182 PROGRESS_UPDATE, __LINE__); 13183 /* Clear out shifts and such */ 13184 tp->t_rxtshift = 0; 13185 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 13186 rack_rto_min, rack_rto_max, rack->r_ctl.timer_slop); 13187 rack->rc_tlp_in_progress = 0; 13188 rack->r_ctl.rc_tlp_cnt_out = 0; 13189 /* Send recover and snd_nxt must be dragged along */ 13190 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 13191 tp->snd_recover = tp->snd_una; 13192 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) 13193 tp->snd_nxt = tp->snd_una; 13194 /* 13195 * If the RXT timer is running we want to 13196 * stop it, so we can restart a TLP (or new RXT). 
13197 */ 13198 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 13199 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 13200 #ifdef NETFLIX_HTTP_LOGGING 13201 tcp_http_check_for_comp(rack->rc_tp, high_seq); 13202 #endif 13203 tp->snd_wl2 = high_seq; 13204 tp->t_dupacks = 0; 13205 if (under_pacing && 13206 (rack->use_fixed_rate == 0) && 13207 (rack->in_probe_rtt == 0) && 13208 rack->rc_gp_dyn_mul && 13209 rack->rc_always_pace) { 13210 /* Check if we are dragging bottom */ 13211 rack_check_bottom_drag(tp, rack, so, acked); 13212 } 13213 if (tp->snd_una == tp->snd_max) { 13214 tp->t_flags &= ~TF_PREVVALID; 13215 rack->r_ctl.retran_during_recovery = 0; 13216 rack->r_ctl.dsack_byte_cnt = 0; 13217 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 13218 if (rack->r_ctl.rc_went_idle_time == 0) 13219 rack->r_ctl.rc_went_idle_time = 1; 13220 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 13221 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) 13222 tp->t_acktime = 0; 13223 /* Set so we might enter persists... */ 13224 rack->r_wanted_output = 1; 13225 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 13226 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 13227 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 13228 (sbavail(&so->so_snd) == 0) && 13229 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 13230 /* 13231 * The socket was gone and the 13232 * peer sent data (not now in the past), time to 13233 * reset him. 13234 */ 13235 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 13236 /* tcp_close will kill the inp pre-log the Reset */ 13237 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 13238 #ifdef TCP_ACCOUNTING 13239 rdstc = get_cyclecount(); 13240 if (rdstc > ts_val) { 13241 counter_u64_add(tcp_proc_time[ACK_CUMACK] , (rdstc - ts_val)); 13242 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13243 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 13244 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 13245 } 13246 } 13247 #endif 13248 m_freem(m); 13249 tp = tcp_close(tp); 13250 if (tp == NULL) { 13251 #ifdef TCP_ACCOUNTING 13252 sched_unpin(); 13253 #endif 13254 return (1); 13255 } 13256 /* 13257 * We would normally do drop-with-reset which would 13258 * send back a reset. We can't since we don't have 13259 * all the needed bits. Instead lets arrange for 13260 * a call to tcp_output(). That way since we 13261 * are in the closed state we will generate a reset. 13262 * 13263 * Note if tcp_accounting is on we don't unpin since 13264 * we do that after the goto label. 13265 */ 13266 goto send_out_a_rst; 13267 } 13268 if ((sbused(&so->so_snd) == 0) && 13269 (tp->t_state >= TCPS_FIN_WAIT_1) && 13270 (tp->t_flags & TF_SENTFIN)) { 13271 /* 13272 * If we can't receive any more data, then closing user can 13273 * proceed. Starting the timer is contrary to the 13274 * specification, but if we don't get a FIN we'll hang 13275 * forever. 13276 * 13277 */ 13278 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 13279 soisdisconnected(so); 13280 tcp_timer_activate(tp, TT_2MSL, 13281 (tcp_fast_finwait2_recycle ? 13282 tcp_finwait2_timeout : 13283 TP_MAXIDLE(tp))); 13284 } 13285 if (ourfinisacked == 0) { 13286 /* 13287 * We don't change to fin-wait-2 if we have our fin acked 13288 * which means we are probably in TCPS_CLOSING. 
13289 */ 13290 tcp_state_change(tp, TCPS_FIN_WAIT_2); 13291 } 13292 } 13293 } 13294 /* Wake up the socket if we have room to write more */ 13295 if (sbavail(&so->so_snd)) { 13296 rack->r_wanted_output = 1; 13297 if (ctf_progress_timeout_check(tp, true)) { 13298 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 13299 tp, tick, PROGRESS_DROP, __LINE__); 13300 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 13301 /* 13302 * We cheat here and don't send a RST, we should send one 13303 * when the pacer drops the connection. 13304 */ 13305 #ifdef TCP_ACCOUNTING 13306 rdstc = get_cyclecount(); 13307 if (rdstc > ts_val) { 13308 counter_u64_add(tcp_proc_time[ACK_CUMACK] , (rdstc - ts_val)); 13309 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13310 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 13311 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 13312 } 13313 } 13314 sched_unpin(); 13315 #endif 13316 INP_WUNLOCK(rack->rc_inp); 13317 m_freem(m); 13318 return (1); 13319 } 13320 } 13321 if (ourfinisacked) { 13322 switch(tp->t_state) { 13323 case TCPS_CLOSING: 13324 #ifdef TCP_ACCOUNTING 13325 rdstc = get_cyclecount(); 13326 if (rdstc > ts_val) { 13327 counter_u64_add(tcp_proc_time[ACK_CUMACK] , 13328 (rdstc - ts_val)); 13329 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13330 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 13331 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 13332 } 13333 } 13334 sched_unpin(); 13335 #endif 13336 tcp_twstart(tp); 13337 m_freem(m); 13338 return (1); 13339 break; 13340 case TCPS_LAST_ACK: 13341 #ifdef TCP_ACCOUNTING 13342 rdstc = get_cyclecount(); 13343 if (rdstc > ts_val) { 13344 counter_u64_add(tcp_proc_time[ACK_CUMACK] , 13345 (rdstc - ts_val)); 13346 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13347 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 13348 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 13349 } 13350 } 13351 sched_unpin(); 13352 #endif 13353 tp = tcp_close(tp); 13354 ctf_do_drop(m, tp); 13355 return (1); 13356 break; 13357 case TCPS_FIN_WAIT_1: 13358 #ifdef TCP_ACCOUNTING 13359 rdstc = get_cyclecount(); 13360 if (rdstc > ts_val) { 13361 counter_u64_add(tcp_proc_time[ACK_CUMACK] , 13362 (rdstc - ts_val)); 13363 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13364 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 13365 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 13366 } 13367 } 13368 #endif 13369 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 13370 soisdisconnected(so); 13371 tcp_timer_activate(tp, TT_2MSL, 13372 (tcp_fast_finwait2_recycle ? 13373 tcp_finwait2_timeout : 13374 TP_MAXIDLE(tp))); 13375 } 13376 tcp_state_change(tp, TCPS_FIN_WAIT_2); 13377 break; 13378 default: 13379 break; 13380 } 13381 } 13382 if (rack->r_fast_output) { 13383 /* 13384 * We re doing fast output.. can we expand that? 
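 * Newly acked data may let the prebuilt fast-send block cover more of
 * the socket buffer; rack_gain_for_fastoutput() below extends the
 * remaining fast-output length to account for what was just acked
 * (a rough description, see that function for the exact rules).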
13385 */ 13386 rack_gain_for_fastoutput(rack, tp, so, acked_amount); 13387 } 13388 #ifdef TCP_ACCOUNTING 13389 rdstc = get_cyclecount(); 13390 if (rdstc > ts_val) { 13391 counter_u64_add(tcp_proc_time[ACK_CUMACK] , (rdstc - ts_val)); 13392 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13393 tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); 13394 tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); 13395 } 13396 } 13397 13398 } else if (win_up_req) { 13399 rdstc = get_cyclecount(); 13400 if (rdstc > ts_val) { 13401 counter_u64_add(tcp_proc_time[ACK_RWND] , (rdstc - ts_val)); 13402 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13403 tp->tcp_proc_time[ACK_RWND] += (rdstc - ts_val); 13404 } 13405 } 13406 #endif 13407 } 13408 /* Now is there a next packet, if so we are done */ 13409 m_freem(m); 13410 did_out = 0; 13411 if (nxt_pkt) { 13412 #ifdef TCP_ACCOUNTING 13413 sched_unpin(); 13414 #endif 13415 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 5, nsegs); 13416 return (0); 13417 } 13418 rack_handle_might_revert(tp, rack); 13419 ctf_calc_rwin(so, tp); 13420 if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) { 13421 send_out_a_rst: 13422 (void)tp->t_fb->tfb_tcp_output(tp); 13423 did_out = 1; 13424 } 13425 rack_free_trim(rack); 13426 #ifdef TCP_ACCOUNTING 13427 sched_unpin(); 13428 #endif 13429 rack_timer_audit(tp, rack, &so->so_snd); 13430 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 6, nsegs); 13431 return (0); 13432 } 13433 13434 13435 static int 13436 rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, 13437 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, 13438 int32_t nxt_pkt, struct timeval *tv) 13439 { 13440 #ifdef TCP_ACCOUNTING 13441 uint64_t ts_val; 13442 #endif 13443 int32_t thflags, retval, did_out = 0; 13444 int32_t way_out = 0; 13445 uint32_t cts; 13446 uint32_t tiwin; 13447 struct timespec ts; 13448 struct tcpopt to; 13449 struct tcp_rack *rack; 13450 struct rack_sendmap *rsm; 13451 int32_t prev_state = 0; 13452 #ifdef TCP_ACCOUNTING 13453 int ack_val_set = 0xf; 13454 #endif 13455 int nsegs; 13456 uint32_t us_cts; 13457 /* 13458 * tv passed from common code is from either M_TSTMP_LRO or 13459 * tcp_get_usecs() if no LRO m_pkthdr timestamp is present. 13460 */ 13461 if (m->m_flags & M_ACKCMP) { 13462 return (rack_do_compressed_ack_processing(tp, so, m, nxt_pkt, tv)); 13463 } 13464 if (m->m_flags & M_ACKCMP) { 13465 panic("Impossible reach m has ackcmp? m:%p tp:%p", m, tp); 13466 } 13467 nsegs = m->m_pkthdr.lro_nsegs; 13468 counter_u64_add(rack_proc_non_comp_ack, 1); 13469 thflags = th->th_flags; 13470 #ifdef TCP_ACCOUNTING 13471 sched_pin(); 13472 if (thflags & TH_ACK) 13473 ts_val = get_cyclecount(); 13474 #endif 13475 cts = tcp_tv_to_usectick(tv); 13476 rack = (struct tcp_rack *)tp->t_fb_ptr; 13477 13478 if ((m->m_flags & M_TSTMP) || 13479 (m->m_flags & M_TSTMP_LRO)) { 13480 mbuf_tstmp2timespec(m, &ts); 13481 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; 13482 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; 13483 } else 13484 rack->r_ctl.act_rcv_time = *tv; 13485 kern_prefetch(rack, &prev_state); 13486 prev_state = 0; 13487 /* 13488 * Unscale the window into a 32-bit value. For the SYN_SENT state 13489 * the scale is zero. 13490 */ 13491 tiwin = th->th_win << tp->snd_scale; 13492 /* 13493 * Parse options on any incoming segment. 13494 */ 13495 memset(&to, 0, sizeof(to)); 13496 tcp_dooptions(&to, (u_char *)(th + 1), 13497 (th->th_off << 2) - sizeof(struct tcphdr), 13498 (thflags & TH_SYN) ? 
TO_SYN : 0); 13499 #ifdef TCP_ACCOUNTING 13500 if (thflags & TH_ACK) { 13501 /* 13502 * We have a tradeoff here. We can either do what we are 13503 * doing i.e. pinning to this CPU and then doing the accounting 13504 * <or> we could do a critical enter, setup the rdtsc and cpu 13505 * as in below, and then validate we are on the same CPU on 13506 * exit. I have choosen to not do the critical enter since 13507 * that often will gain you a context switch, and instead lock 13508 * us (line above this if) to the same CPU with sched_pin(). This 13509 * means we may be context switched out for a higher priority 13510 * interupt but we won't be moved to another CPU. 13511 * 13512 * If this occurs (which it won't very often since we most likely 13513 * are running this code in interupt context and only a higher 13514 * priority will bump us ... clock?) we will falsely add in 13515 * to the time the interupt processing time plus the ack processing 13516 * time. This is ok since its a rare event. 13517 */ 13518 ack_val_set = tcp_do_ack_accounting(tp, th, &to, tiwin, 13519 ctf_fixed_maxseg(tp)); 13520 } 13521 #endif 13522 NET_EPOCH_ASSERT(); 13523 INP_WLOCK_ASSERT(tp->t_inpcb); 13524 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 13525 __func__)); 13526 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 13527 __func__)); 13528 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 13529 union tcp_log_stackspecific log; 13530 struct timeval ltv; 13531 #ifdef NETFLIX_HTTP_LOGGING 13532 struct http_sendfile_track *http_req; 13533 13534 if (SEQ_GT(th->th_ack, tp->snd_una)) { 13535 http_req = tcp_http_find_req_for_seq(tp, (th->th_ack-1)); 13536 } else { 13537 http_req = tcp_http_find_req_for_seq(tp, th->th_ack); 13538 } 13539 #endif 13540 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 13541 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 13542 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 13543 if (rack->rack_no_prr == 0) 13544 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 13545 else 13546 log.u_bbr.flex1 = 0; 13547 log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; 13548 log.u_bbr.use_lt_bw <<= 1; 13549 log.u_bbr.use_lt_bw |= rack->r_might_revert; 13550 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; 13551 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 13552 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; 13553 log.u_bbr.flex3 = m->m_flags; 13554 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 13555 log.u_bbr.lost = thflags; 13556 log.u_bbr.pacing_gain = 0x1; 13557 #ifdef TCP_ACCOUNTING 13558 log.u_bbr.cwnd_gain = ack_val_set; 13559 #endif 13560 log.u_bbr.flex7 = 2; 13561 if (m->m_flags & M_TSTMP) { 13562 /* Record the hardware timestamp if present */ 13563 mbuf_tstmp2timespec(m, &ts); 13564 ltv.tv_sec = ts.tv_sec; 13565 ltv.tv_usec = ts.tv_nsec / 1000; 13566 log.u_bbr.lt_epoch = tcp_tv_to_usectick(<v); 13567 } else if (m->m_flags & M_TSTMP_LRO) { 13568 /* Record the LRO the arrival timestamp */ 13569 mbuf_tstmp2timespec(m, &ts); 13570 ltv.tv_sec = ts.tv_sec; 13571 ltv.tv_usec = ts.tv_nsec / 1000; 13572 log.u_bbr.flex5 = tcp_tv_to_usectick(<v); 13573 } 13574 log.u_bbr.timeStamp = tcp_get_usecs(<v); 13575 /* Log the rcv time */ 13576 log.u_bbr.delRate = m->m_pkthdr.rcv_tstmp; 13577 #ifdef NETFLIX_HTTP_LOGGING 13578 log.u_bbr.applimited = tp->t_http_closed; 13579 log.u_bbr.applimited <<= 8; 13580 log.u_bbr.applimited |= tp->t_http_open; 13581 log.u_bbr.applimited <<= 8; 13582 log.u_bbr.applimited |= tp->t_http_req; 13583 if (http_req) { 13584 /* Copy out any client req info */ 13585 /* 
seconds */ 13586 log.u_bbr.pkt_epoch = (http_req->localtime / HPTS_USEC_IN_SEC); 13587 /* useconds */ 13588 log.u_bbr.delivered = (http_req->localtime % HPTS_USEC_IN_SEC); 13589 log.u_bbr.rttProp = http_req->timestamp; 13590 log.u_bbr.cur_del_rate = http_req->start; 13591 if (http_req->flags & TCP_HTTP_TRACK_FLG_OPEN) { 13592 log.u_bbr.flex8 |= 1; 13593 } else { 13594 log.u_bbr.flex8 |= 2; 13595 log.u_bbr.bw_inuse = http_req->end; 13596 } 13597 log.u_bbr.flex6 = http_req->start_seq; 13598 if (http_req->flags & TCP_HTTP_TRACK_FLG_COMP) { 13599 log.u_bbr.flex8 |= 4; 13600 log.u_bbr.epoch = http_req->end_seq; 13601 } 13602 } 13603 #endif 13604 TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, 13605 tlen, &log, true, <v); 13606 } 13607 if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { 13608 way_out = 4; 13609 retval = 0; 13610 m_freem(m); 13611 goto done_with_input; 13612 } 13613 /* 13614 * If a segment with the ACK-bit set arrives in the SYN-SENT state 13615 * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9. 13616 */ 13617 if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && 13618 (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { 13619 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 13620 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 13621 #ifdef TCP_ACCOUNTING 13622 sched_unpin(); 13623 #endif 13624 return (1); 13625 } 13626 13627 /* 13628 * Parse options on any incoming segment. 13629 */ 13630 tcp_dooptions(&to, (u_char *)(th + 1), 13631 (th->th_off << 2) - sizeof(struct tcphdr), 13632 (thflags & TH_SYN) ? TO_SYN : 0); 13633 13634 /* 13635 * If timestamps were negotiated during SYN/ACK and a 13636 * segment without a timestamp is received, silently drop 13637 * the segment, unless it is a RST segment or missing timestamps are 13638 * tolerated. 13639 * See section 3.2 of RFC 7323. 13640 */ 13641 if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS) && 13642 ((thflags & TH_RST) == 0) && (V_tcp_tolerate_missing_ts == 0)) { 13643 way_out = 5; 13644 retval = 0; 13645 m_freem(m); 13646 goto done_with_input; 13647 } 13648 13649 /* 13650 * Segment received on connection. Reset idle time and keep-alive 13651 * timer. XXX: This should be done after segment validation to 13652 * ignore broken/spoofed segs. 13653 */ 13654 if (tp->t_idle_reduce && 13655 (tp->snd_max == tp->snd_una) && 13656 ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { 13657 counter_u64_add(rack_input_idle_reduces, 1); 13658 rack_cc_after_idle(rack, tp); 13659 } 13660 tp->t_rcvtime = ticks; 13661 #ifdef STATS 13662 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); 13663 #endif 13664 if (tiwin > rack->r_ctl.rc_high_rwnd) 13665 rack->r_ctl.rc_high_rwnd = tiwin; 13666 /* 13667 * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move 13668 * this to occur after we've validated the segment. 13669 */ 13670 if (tp->t_flags2 & TF2_ECN_PERMIT) { 13671 if (thflags & TH_CWR) { 13672 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 13673 tp->t_flags |= TF_ACKNOW; 13674 } 13675 switch (iptos & IPTOS_ECN_MASK) { 13676 case IPTOS_ECN_CE: 13677 tp->t_flags2 |= TF2_ECN_SND_ECE; 13678 KMOD_TCPSTAT_INC(tcps_ecn_ce); 13679 break; 13680 case IPTOS_ECN_ECT0: 13681 KMOD_TCPSTAT_INC(tcps_ecn_ect0); 13682 break; 13683 case IPTOS_ECN_ECT1: 13684 KMOD_TCPSTAT_INC(tcps_ecn_ect1); 13685 break; 13686 } 13687 13688 /* Process a packet differently from RFC3168. */ 13689 cc_ecnpkt_handler(tp, th, iptos); 13690 13691 /* Congestion experienced. 
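 * An ECE from the peer is treated as a congestion event: the
 * rack_cong_signal(CC_ECN) call below lets the congestion-control
 * module cut ssthresh/cwnd much as a loss would, but without
 * retransmitting anything.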
*/ 13692 if (thflags & TH_ECE) { 13693 rack_cong_signal(tp, CC_ECN, th->th_ack); 13694 } 13695 } 13696 13697 /* 13698 * If echoed timestamp is later than the current time, fall back to 13699 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 13700 * were used when this connection was established. 13701 */ 13702 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 13703 to.to_tsecr -= tp->ts_offset; 13704 if (TSTMP_GT(to.to_tsecr, cts)) 13705 to.to_tsecr = 0; 13706 } 13707 13708 /* 13709 * If its the first time in we need to take care of options and 13710 * verify we can do SACK for rack! 13711 */ 13712 if (rack->r_state == 0) { 13713 /* Should be init'd by rack_init() */ 13714 KASSERT(rack->rc_inp != NULL, 13715 ("%s: rack->rc_inp unexpectedly NULL", __func__)); 13716 if (rack->rc_inp == NULL) { 13717 rack->rc_inp = tp->t_inpcb; 13718 } 13719 13720 /* 13721 * Process options only when we get SYN/ACK back. The SYN 13722 * case for incoming connections is handled in tcp_syncache. 13723 * According to RFC1323 the window field in a SYN (i.e., a 13724 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX 13725 * this is traditional behavior, may need to be cleaned up. 13726 */ 13727 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 13728 /* Handle parallel SYN for ECN */ 13729 if (!(thflags & TH_ACK) && 13730 ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) && 13731 ((V_tcp_do_ecn == 1) || (V_tcp_do_ecn == 2))) { 13732 tp->t_flags2 |= TF2_ECN_PERMIT; 13733 tp->t_flags2 |= TF2_ECN_SND_ECE; 13734 TCPSTAT_INC(tcps_ecn_shs); 13735 } 13736 if ((to.to_flags & TOF_SCALE) && 13737 (tp->t_flags & TF_REQ_SCALE)) { 13738 tp->t_flags |= TF_RCVD_SCALE; 13739 tp->snd_scale = to.to_wscale; 13740 } else 13741 tp->t_flags &= ~TF_REQ_SCALE; 13742 /* 13743 * Initial send window. It will be updated with the 13744 * next incoming segment to the scaled value. 13745 */ 13746 tp->snd_wnd = th->th_win; 13747 rack_validate_fo_sendwin_up(tp, rack); 13748 if ((to.to_flags & TOF_TS) && 13749 (tp->t_flags & TF_REQ_TSTMP)) { 13750 tp->t_flags |= TF_RCVD_TSTMP; 13751 tp->ts_recent = to.to_tsval; 13752 tp->ts_recent_age = cts; 13753 } else 13754 tp->t_flags &= ~TF_REQ_TSTMP; 13755 if (to.to_flags & TOF_MSS) { 13756 tcp_mss(tp, to.to_mss); 13757 } 13758 if ((tp->t_flags & TF_SACK_PERMIT) && 13759 (to.to_flags & TOF_SACKPERM) == 0) 13760 tp->t_flags &= ~TF_SACK_PERMIT; 13761 if (IS_FASTOPEN(tp->t_flags)) { 13762 if (to.to_flags & TOF_FASTOPEN) { 13763 uint16_t mss; 13764 13765 if (to.to_flags & TOF_MSS) 13766 mss = to.to_mss; 13767 else 13768 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 13769 mss = TCP6_MSS; 13770 else 13771 mss = TCP_MSS; 13772 tcp_fastopen_update_cache(tp, mss, 13773 to.to_tfo_len, to.to_tfo_cookie); 13774 } else 13775 tcp_fastopen_disable_path(tp); 13776 } 13777 } 13778 /* 13779 * At this point we are at the initial call. Here we decide 13780 * if we are doing RACK or not. We do this by seeing if 13781 * TF_SACK_PERMIT is set and the sack-not-required is clear. 13782 * The code now does do dup-ack counting so if you don't 13783 * switch back you won't get rack & TLP, but you will still 13784 * get this stack. 
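 * In short: unless rack_sack_not_required is set, a peer that did not
 * negotiate SACK is handed back to the default FreeBSD stack below via
 * tcp_switch_back_to_default().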
13785 */ 13786 13787 if ((rack_sack_not_required == 0) && 13788 ((tp->t_flags & TF_SACK_PERMIT) == 0)) { 13789 tcp_switch_back_to_default(tp); 13790 (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, 13791 tlen, iptos); 13792 #ifdef TCP_ACCOUNTING 13793 sched_unpin(); 13794 #endif 13795 return (1); 13796 } 13797 tcp_set_hpts(tp->t_inpcb); 13798 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); 13799 } 13800 if (thflags & TH_FIN) 13801 tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN); 13802 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 13803 if ((rack->rc_gp_dyn_mul) && 13804 (rack->use_fixed_rate == 0) && 13805 (rack->rc_always_pace)) { 13806 /* Check in on probertt */ 13807 rack_check_probe_rtt(rack, us_cts); 13808 } 13809 if (rack->forced_ack) { 13810 uint32_t us_rtt; 13811 13812 /* 13813 * A persist or keep-alive was forced out, update our 13814 * min rtt time. Note we do not worry about lost 13815 * retransmissions since KEEP-ALIVES and persists 13816 * are usually way long on times of sending (though 13817 * if we were really paranoid or worried we could 13818 * at least use timestamps if available to validate). 13819 */ 13820 rack->forced_ack = 0; 13821 us_rtt = us_cts - rack->r_ctl.forced_ack_ts; 13822 if (us_rtt == 0) 13823 us_rtt = 1; 13824 rack_log_rtt_upd(tp, rack, us_rtt, 0, NULL, 3); 13825 rack_apply_updated_usrtt(rack, us_rtt, us_cts); 13826 } 13827 /* 13828 * This is the one exception case where we set the rack state 13829 * always. All other times (timers etc) we must have a rack-state 13830 * set (so we assure we have done the checks above for SACK). 13831 */ 13832 rack->r_ctl.rc_rcvtime = cts; 13833 if (rack->r_state != tp->t_state) 13834 rack_set_state(tp, rack); 13835 if (SEQ_GT(th->th_ack, tp->snd_una) && 13836 (rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree)) != NULL) 13837 kern_prefetch(rsm, &prev_state); 13838 prev_state = rack->r_state; 13839 rack_clear_rate_sample(rack); 13840 retval = (*rack->r_substate) (m, th, so, 13841 tp, &to, drop_hdrlen, 13842 tlen, tiwin, thflags, nxt_pkt, iptos); 13843 #ifdef INVARIANTS 13844 if ((retval == 0) && 13845 (tp->t_inpcb == NULL)) { 13846 panic("retval:%d tp:%p t_inpcb:NULL state:%d", 13847 retval, tp, prev_state); 13848 } 13849 #endif 13850 if (retval == 0) { 13851 /* 13852 * If retval is 1 the tcb is unlocked and most likely the tp 13853 * is gone. 13854 */ 13855 INP_WLOCK_ASSERT(tp->t_inpcb); 13856 if ((rack->rc_gp_dyn_mul) && 13857 (rack->rc_always_pace) && 13858 (rack->use_fixed_rate == 0) && 13859 rack->in_probe_rtt && 13860 (rack->r_ctl.rc_time_probertt_starts == 0)) { 13861 /* 13862 * If we are going for target, lets recheck before 13863 * we output. 13864 */ 13865 rack_check_probe_rtt(rack, us_cts); 13866 } 13867 if (rack->set_pacing_done_a_iw == 0) { 13868 /* How much has been acked? */ 13869 if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) { 13870 /* We have enough to set in the pacing segment size */ 13871 rack->set_pacing_done_a_iw = 1; 13872 rack_set_pace_segments(tp, rack, __LINE__, NULL); 13873 } 13874 } 13875 tcp_rack_xmit_timer_commit(rack, tp); 13876 #ifdef TCP_ACCOUNTING 13877 /* 13878 * If we set the ack_val_se to what ack processing we are doing 13879 * we also want to track how many cycles we burned. Note 13880 * the bits after tcp_output we let be "free". This is because 13881 * we are also tracking the tcp_output times as well. 
Note the 13882 * use of 0xf here since we only have 11 counter (0 - 0xa) and 13883 * 0xf cannot be returned and is what we initialize it too to 13884 * indicate we are not doing the tabulations. 13885 */ 13886 if (ack_val_set != 0xf) { 13887 uint64_t crtsc; 13888 13889 crtsc = get_cyclecount(); 13890 counter_u64_add(tcp_proc_time[ack_val_set] , (crtsc - ts_val)); 13891 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 13892 tp->tcp_proc_time[ack_val_set] += (crtsc - ts_val); 13893 } 13894 } 13895 #endif 13896 if (nxt_pkt == 0) { 13897 if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) { 13898 do_output_now: 13899 did_out = 1; 13900 (void)tp->t_fb->tfb_tcp_output(tp); 13901 } 13902 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 13903 rack_free_trim(rack); 13904 } 13905 if ((nxt_pkt == 0) && 13906 ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && 13907 (SEQ_GT(tp->snd_max, tp->snd_una) || 13908 (tp->t_flags & TF_DELACK) || 13909 ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 13910 (tp->t_state <= TCPS_CLOSING)))) { 13911 /* We could not send (probably in the hpts but stopped the timer earlier)? */ 13912 if ((tp->snd_max == tp->snd_una) && 13913 ((tp->t_flags & TF_DELACK) == 0) && 13914 (rack->rc_inp->inp_in_hpts) && 13915 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 13916 /* keep alive not needed if we are hptsi output yet */ 13917 ; 13918 } else { 13919 int late = 0; 13920 if (rack->rc_inp->inp_in_hpts) { 13921 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 13922 us_cts = tcp_get_usecs(NULL); 13923 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 13924 rack->r_early = 1; 13925 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); 13926 } else 13927 late = 1; 13928 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 13929 } 13930 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); 13931 } 13932 if (late && (did_out == 0)) { 13933 /* 13934 * We are late in the sending 13935 * and we did not call the output 13936 * (this probably should not happen). 13937 */ 13938 goto do_output_now; 13939 } 13940 rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); 13941 } 13942 way_out = 1; 13943 } else if (nxt_pkt == 0) { 13944 /* Do we have the correct timer running? */ 13945 rack_timer_audit(tp, rack, &so->so_snd); 13946 way_out = 2; 13947 } 13948 done_with_input: 13949 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out, max(1, nsegs)); 13950 if (did_out) 13951 rack->r_wanted_output = 0; 13952 #ifdef INVARIANTS 13953 if (tp->t_inpcb == NULL) { 13954 panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d", 13955 did_out, 13956 retval, tp, prev_state); 13957 } 13958 #endif 13959 #ifdef TCP_ACCOUNTING 13960 } else { 13961 /* 13962 * Track the time (see above). 13963 */ 13964 if (ack_val_set != 0xf) { 13965 uint64_t crtsc; 13966 13967 crtsc = get_cyclecount(); 13968 counter_u64_add(tcp_proc_time[ack_val_set] , (crtsc - ts_val)); 13969 /* 13970 * Note we *DO NOT* increment the per-tcb counters since 13971 * in the else the TP may be gone!! 
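 * (A non-zero retval means the substate handler already unlocked, and
 * most likely freed, the tcb, so only the global counter is safe to
 * touch here.)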
13972 */ 13973 } 13974 #endif 13975 } 13976 #ifdef TCP_ACCOUNTING 13977 sched_unpin(); 13978 #endif 13979 return (retval); 13980 } 13981 13982 void 13983 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, 13984 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos) 13985 { 13986 struct timeval tv; 13987 13988 /* First lets see if we have old packets */ 13989 if (tp->t_in_pkt) { 13990 if (ctf_do_queued_segments(so, tp, 1)) { 13991 m_freem(m); 13992 return; 13993 } 13994 } 13995 if (m->m_flags & M_TSTMP_LRO) { 13996 tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000; 13997 tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000; 13998 } else { 13999 /* Should not be should we kassert instead? */ 14000 tcp_get_usecs(&tv); 14001 } 14002 if (rack_do_segment_nounlock(m, th, so, tp, 14003 drop_hdrlen, tlen, iptos, 0, &tv) == 0) { 14004 INP_WUNLOCK(tp->t_inpcb); 14005 } 14006 } 14007 14008 struct rack_sendmap * 14009 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) 14010 { 14011 struct rack_sendmap *rsm = NULL; 14012 int32_t idx; 14013 uint32_t srtt = 0, thresh = 0, ts_low = 0; 14014 14015 /* Return the next guy to be re-transmitted */ 14016 if (RB_EMPTY(&rack->r_ctl.rc_mtree)) { 14017 return (NULL); 14018 } 14019 if (tp->t_flags & TF_SENTFIN) { 14020 /* retran the end FIN? */ 14021 return (NULL); 14022 } 14023 /* ok lets look at this one */ 14024 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 14025 if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) { 14026 goto check_it; 14027 } 14028 rsm = rack_find_lowest_rsm(rack); 14029 if (rsm == NULL) { 14030 return (NULL); 14031 } 14032 check_it: 14033 if (((rack->rc_tp->t_flags & TF_SACK_PERMIT) == 0) && 14034 (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { 14035 /* 14036 * No sack so we automatically do the 3 strikes and 14037 * retransmit (no rack timer would be started). 14038 */ 14039 14040 return (rsm); 14041 } 14042 if (rsm->r_flags & RACK_ACKED) { 14043 return (NULL); 14044 } 14045 if (((rsm->r_flags & RACK_SACK_PASSED) == 0) && 14046 (rsm->r_dupack < DUP_ACK_THRESHOLD)) { 14047 /* Its not yet ready */ 14048 return (NULL); 14049 } 14050 srtt = rack_grab_rtt(tp, rack); 14051 idx = rsm->r_rtr_cnt - 1; 14052 ts_low = (uint32_t)rsm->r_tim_lastsent[idx]; 14053 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 14054 if ((tsused == ts_low) || 14055 (TSTMP_LT(tsused, ts_low))) { 14056 /* No time since sending */ 14057 return (NULL); 14058 } 14059 if ((tsused - ts_low) < thresh) { 14060 /* It has not been long enough yet */ 14061 return (NULL); 14062 } 14063 if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || 14064 ((rsm->r_flags & RACK_SACK_PASSED) && 14065 (rack->sack_attack_disable == 0))) { 14066 /* 14067 * We have passed the dup-ack threshold <or> 14068 * a SACK has indicated this is missing. 14069 * Note that if you are a declared attacker 14070 * it is only the dup-ack threshold that 14071 * will cause retransmits. 
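 * (sack_attack_disable is set once a connection has been flagged as a
 * possible SACK attacker; while it is set, RACK_SACK_PASSED alone is
 * not trusted as a loss indication.)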
14072 */ 14073 /* log retransmit reason */ 14074 rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1); 14075 rack->r_fast_output = 0; 14076 return (rsm); 14077 } 14078 return (NULL); 14079 } 14080 14081 static void 14082 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, 14083 uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, 14084 int line, struct rack_sendmap *rsm) 14085 { 14086 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 14087 union tcp_log_stackspecific log; 14088 struct timeval tv; 14089 14090 memset(&log, 0, sizeof(log)); 14091 log.u_bbr.flex1 = slot; 14092 log.u_bbr.flex2 = len; 14093 log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs; 14094 log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs; 14095 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ss; 14096 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_ca; 14097 log.u_bbr.use_lt_bw = rack->rc_ack_can_sendout_data; 14098 log.u_bbr.use_lt_bw <<= 1; 14099 log.u_bbr.use_lt_bw |= rack->r_late; 14100 log.u_bbr.use_lt_bw <<= 1; 14101 log.u_bbr.use_lt_bw |= rack->r_early; 14102 log.u_bbr.use_lt_bw <<= 1; 14103 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set; 14104 log.u_bbr.use_lt_bw <<= 1; 14105 log.u_bbr.use_lt_bw |= rack->rc_gp_filled; 14106 log.u_bbr.use_lt_bw <<= 1; 14107 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; 14108 log.u_bbr.use_lt_bw <<= 1; 14109 log.u_bbr.use_lt_bw |= rack->in_probe_rtt; 14110 log.u_bbr.use_lt_bw <<= 1; 14111 log.u_bbr.use_lt_bw |= rack->gp_ready; 14112 log.u_bbr.pkt_epoch = line; 14113 log.u_bbr.epoch = rack->r_ctl.rc_agg_delayed; 14114 log.u_bbr.lt_epoch = rack->r_ctl.rc_agg_early; 14115 log.u_bbr.applimited = rack->r_ctl.rack_per_of_gp_rec; 14116 log.u_bbr.bw_inuse = bw_est; 14117 log.u_bbr.delRate = bw; 14118 if (rack->r_ctl.gp_bw == 0) 14119 log.u_bbr.cur_del_rate = 0; 14120 else 14121 log.u_bbr.cur_del_rate = rack_get_bw(rack); 14122 log.u_bbr.rttProp = len_time; 14123 log.u_bbr.pkts_out = rack->r_ctl.rc_rack_min_rtt; 14124 log.u_bbr.lost = rack->r_ctl.rc_probertt_sndmax_atexit; 14125 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); 14126 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) { 14127 /* We are in slow start */ 14128 log.u_bbr.flex7 = 1; 14129 } else { 14130 /* we are on congestion avoidance */ 14131 log.u_bbr.flex7 = 0; 14132 } 14133 log.u_bbr.flex8 = method; 14134 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 14135 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 14136 log.u_bbr.cwnd_gain = rack->rc_gp_saw_rec; 14137 log.u_bbr.cwnd_gain <<= 1; 14138 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; 14139 log.u_bbr.cwnd_gain <<= 1; 14140 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; 14141 TCP_LOG_EVENTP(rack->rc_tp, NULL, 14142 &rack->rc_inp->inp_socket->so_rcv, 14143 &rack->rc_inp->inp_socket->so_snd, 14144 BBR_LOG_HPTSI_CALC, 0, 14145 0, &log, false, &tv); 14146 } 14147 } 14148 14149 static uint32_t 14150 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss) 14151 { 14152 uint32_t new_tso, user_max; 14153 14154 user_max = rack->rc_user_set_max_segs * mss; 14155 if (rack->rc_force_max_seg) { 14156 return (user_max); 14157 } 14158 if (rack->use_fixed_rate && 14159 ((rack->r_ctl.crte == NULL) || 14160 (bw != rack->r_ctl.crte->rate))) { 14161 /* Use the user mss since we are not exactly matched */ 14162 return (user_max); 14163 } 14164 new_tso = tcp_get_pacing_burst_size(rack->rc_tp, bw, mss, rack_pace_one_seg, rack->r_ctl.crte, NULL); 14165 if (new_tso > user_max) 14166 new_tso = user_max; 14167 
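/*
 * The pacing burst handed back is the size from
 * tcp_get_pacing_burst_size() clamped to the user limit; as an
 * illustrative example, with a 1448 byte mss and rc_user_set_max_segs
 * of 64 the cap works out to 64 * 1448 = 92672 bytes.
 */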
return (new_tso); 14168 } 14169 14170 static int32_t 14171 pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz, int *capped, uint64_t *rate_wanted, uint8_t non_paced) 14172 { 14173 uint64_t lentim, fill_bw; 14174 14175 /* Lets first see if we are full, if so continue with normal rate */ 14176 rack->r_via_fill_cw = 0; 14177 if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use) 14178 return (slot); 14179 if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd) 14180 return (slot); 14181 if (rack->r_ctl.rc_last_us_rtt == 0) 14182 return (slot); 14183 if (rack->rc_pace_fill_if_rttin_range && 14184 (rack->r_ctl.rc_last_us_rtt >= 14185 (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) { 14186 /* The rtt is huge, N * smallest, lets not fill */ 14187 return (slot); 14188 } 14189 /* 14190 * first lets calculate the b/w based on the last us-rtt 14191 * and the sndwnd. 14192 */ 14193 fill_bw = rack->r_ctl.cwnd_to_use; 14194 /* Take the rwnd if its smaller */ 14195 if (fill_bw > rack->rc_tp->snd_wnd) 14196 fill_bw = rack->rc_tp->snd_wnd; 14197 if (rack->r_fill_less_agg) { 14198 /* 14199 * Now take away the inflight (this will reduce our 14200 * aggressiveness and yeah, if we get that much out in 1RTT 14201 * we will have had acks come back and still be behind). 14202 */ 14203 fill_bw -= ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 14204 } 14205 /* Now lets make it into a b/w */ 14206 fill_bw *= (uint64_t)HPTS_USEC_IN_SEC; 14207 fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt; 14208 /* We are below the min b/w */ 14209 if (non_paced) 14210 *rate_wanted = fill_bw; 14211 if ((fill_bw < RACK_MIN_BW) || (fill_bw < *rate_wanted)) 14212 return (slot); 14213 if (rack->r_ctl.bw_rate_cap && (fill_bw > rack->r_ctl.bw_rate_cap)) 14214 fill_bw = rack->r_ctl.bw_rate_cap; 14215 rack->r_via_fill_cw = 1; 14216 if (rack->r_rack_hw_rate_caps && 14217 (rack->r_ctl.crte != NULL)) { 14218 uint64_t high_rate; 14219 14220 high_rate = tcp_hw_highest_rate(rack->r_ctl.crte); 14221 if (fill_bw > high_rate) { 14222 /* We are capping bw at the highest rate table entry */ 14223 if (*rate_wanted > high_rate) { 14224 /* The original rate was also capped */ 14225 rack->r_via_fill_cw = 0; 14226 } 14227 rack_log_hdwr_pacing(rack, 14228 fill_bw, high_rate, __LINE__, 14229 0, 3); 14230 fill_bw = high_rate; 14231 if (capped) 14232 *capped = 1; 14233 } 14234 } else if ((rack->r_ctl.crte == NULL) && 14235 (rack->rack_hdrw_pacing == 0) && 14236 (rack->rack_hdw_pace_ena) && 14237 rack->r_rack_hw_rate_caps && 14238 (rack->rack_attempt_hdwr_pace == 0) && 14239 (rack->rc_inp->inp_route.ro_nh != NULL) && 14240 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 14241 /* 14242 * Ok we may have a first attempt that is greater than our top rate 14243 * lets check. 14244 */ 14245 uint64_t high_rate; 14246 14247 high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp); 14248 if (high_rate) { 14249 if (fill_bw > high_rate) { 14250 fill_bw = high_rate; 14251 if (capped) 14252 *capped = 1; 14253 } 14254 } 14255 } 14256 /* 14257 * Ok fill_bw holds our mythical b/w to fill the cwnd 14258 * in a rtt, what does that time wise equate too? 
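 * lentim = len * HPTS_USEC_IN_SEC / fill_bw, i.e. the number of
 * microseconds this send occupies at fill_bw (illustratively, 14480
 * bytes at 12.5 MB/s is about 1158 usecs). We only substitute it for
 * the normally computed slot when it is the shorter of the two (or
 * when we are otherwise non-paced).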
14259 */ 14260 lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC; 14261 lentim /= fill_bw; 14262 *rate_wanted = fill_bw; 14263 if (non_paced || (lentim < slot)) { 14264 rack_log_pacing_delay_calc(rack, len, slot, fill_bw, 14265 0, lentim, 12, __LINE__, NULL); 14266 return ((int32_t)lentim); 14267 } else 14268 return (slot); 14269 } 14270 14271 static int32_t 14272 rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz) 14273 { 14274 struct rack_sendmap *lrsm; 14275 int32_t slot = 0; 14276 int can_start_hw_pacing = 1; 14277 int err; 14278 14279 if (rack->rc_always_pace == 0) { 14280 /* 14281 * We use the most optimistic possible cwnd/srtt for 14282 * sending calculations. This will make our 14283 * calculation anticipate getting more through 14284 * quicker then possible. But thats ok we don't want 14285 * the peer to have a gap in data sending. 14286 */ 14287 uint32_t srtt, cwnd, tr_perms = 0; 14288 int32_t reduce = 0; 14289 14290 old_method: 14291 /* 14292 * We keep no precise pacing with the old method 14293 * instead we use the pacer to mitigate bursts. 14294 */ 14295 if (rack->r_ctl.rc_rack_min_rtt) 14296 srtt = rack->r_ctl.rc_rack_min_rtt; 14297 else 14298 srtt = max(tp->t_srtt, 1); 14299 if (rack->r_ctl.rc_rack_largest_cwnd) 14300 cwnd = rack->r_ctl.rc_rack_largest_cwnd; 14301 else 14302 cwnd = rack->r_ctl.cwnd_to_use; 14303 /* Inflate cwnd by 1000 so srtt of usecs is in ms */ 14304 tr_perms = (cwnd * 1000) / srtt; 14305 if (tr_perms == 0) { 14306 tr_perms = ctf_fixed_maxseg(tp); 14307 } 14308 /* 14309 * Calculate how long this will take to drain, if 14310 * the calculation comes out to zero, thats ok we 14311 * will use send_a_lot to possibly spin around for 14312 * more increasing tot_len_this_send to the point 14313 * that its going to require a pace, or we hit the 14314 * cwnd. Which in that case we are just waiting for 14315 * a ACK. 14316 */ 14317 slot = len / tr_perms; 14318 /* Now do we reduce the time so we don't run dry? */ 14319 if (slot && rack_slot_reduction) { 14320 reduce = (slot / rack_slot_reduction); 14321 if (reduce < slot) { 14322 slot -= reduce; 14323 } else 14324 slot = 0; 14325 } 14326 slot *= HPTS_USEC_IN_MSEC; 14327 if (rsm == NULL) { 14328 /* 14329 * We always consider ourselves app limited with old style 14330 * that are not retransmits. This could be the initial 14331 * measurement, but thats ok its all setup and specially 14332 * handled. If another send leaks out, then that too will 14333 * be mark app-limited. 
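 * Concretely, the newest sendmap entry (RB_MAX below) gets tagged
 * RACK_APP_LIMITED and remembered in rc_first_appl so the goodput
 * measurement logic can account for the app-limited period later.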
14334 */ 14335 lrsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 14336 if (lrsm && ((lrsm->r_flags & RACK_APP_LIMITED) == 0)) { 14337 rack->r_ctl.rc_first_appl = lrsm; 14338 lrsm->r_flags |= RACK_APP_LIMITED; 14339 rack->r_ctl.rc_app_limited_cnt++; 14340 } 14341 } 14342 if (rack->rc_pace_to_cwnd) { 14343 uint64_t rate_wanted = 0; 14344 14345 slot = pace_to_fill_cwnd(rack, slot, len, segsiz, NULL, &rate_wanted, 1); 14346 rack->rc_ack_can_sendout_data = 1; 14347 rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, 0, 0, 14, __LINE__, NULL); 14348 } else 14349 rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL); 14350 } else { 14351 uint64_t bw_est, res, lentim, rate_wanted; 14352 uint32_t orig_val, srtt, segs, oh; 14353 int capped = 0; 14354 int prev_fill; 14355 14356 if ((rack->r_rr_config == 1) && rsm) { 14357 return (rack->r_ctl.rc_min_to); 14358 } 14359 if (rack->use_fixed_rate) { 14360 rate_wanted = bw_est = rack_get_fixed_pacing_bw(rack); 14361 } else if ((rack->r_ctl.init_rate == 0) && 14362 #ifdef NETFLIX_PEAKRATE 14363 (rack->rc_tp->t_maxpeakrate == 0) && 14364 #endif 14365 (rack->r_ctl.gp_bw == 0)) { 14366 /* no way to yet do an estimate */ 14367 bw_est = rate_wanted = 0; 14368 } else { 14369 bw_est = rack_get_bw(rack); 14370 rate_wanted = rack_get_output_bw(rack, bw_est, rsm, &capped); 14371 } 14372 if ((bw_est == 0) || (rate_wanted == 0) || 14373 ((rack->gp_ready == 0) && (rack->use_fixed_rate == 0))) { 14374 /* 14375 * No way yet to make a b/w estimate or 14376 * our raise is set incorrectly. 14377 */ 14378 goto old_method; 14379 } 14380 /* We need to account for all the overheads */ 14381 segs = (len + segsiz - 1) / segsiz; 14382 /* 14383 * We need the diff between 1514 bytes (e-mtu with e-hdr) 14384 * and how much data we put in each packet. Yes this 14385 * means we may be off if we are larger than 1500 bytes 14386 * or smaller. But this just makes us more conservative. 14387 */ 14388 if (rack_hw_rate_min && 14389 (bw_est < rack_hw_rate_min)) 14390 can_start_hw_pacing = 0; 14391 if (ETHERNET_SEGMENT_SIZE > segsiz) 14392 oh = ETHERNET_SEGMENT_SIZE - segsiz; 14393 else 14394 oh = 0; 14395 segs *= oh; 14396 lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC; 14397 res = lentim / rate_wanted; 14398 slot = (uint32_t)res; 14399 orig_val = rack->r_ctl.rc_pace_max_segs; 14400 if (rack->r_ctl.crte == NULL) { 14401 /* 14402 * Only do this if we are not hardware pacing 14403 * since if we are doing hw-pacing below we will 14404 * set make a call after setting up or changing 14405 * the rate. 14406 */ 14407 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 14408 } else if (rack->rc_inp->inp_snd_tag == NULL) { 14409 /* 14410 * We lost our rate somehow, this can happen 14411 * if the interface changed underneath us. 
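 * (We still hold a rate-table entry but the inpcb no longer carries a
 * send tag.) Release the stale entry and clear the hardware pacing
 * flags so a fresh attempt can be made below.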
14412 */ 14413 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 14414 rack->r_ctl.crte = NULL; 14415 /* Lets re-allow attempting to setup pacing */ 14416 rack->rack_hdrw_pacing = 0; 14417 rack->rack_attempt_hdwr_pace = 0; 14418 rack_log_hdwr_pacing(rack, 14419 rate_wanted, bw_est, __LINE__, 14420 0, 6); 14421 } 14422 /* Did we change the TSO size, if so log it */ 14423 if (rack->r_ctl.rc_pace_max_segs != orig_val) 14424 rack_log_pacing_delay_calc(rack, len, slot, orig_val, 0, 0, 15, __LINE__, NULL); 14425 prev_fill = rack->r_via_fill_cw; 14426 if ((rack->rc_pace_to_cwnd) && 14427 (capped == 0) && 14428 (rack->use_fixed_rate == 0) && 14429 (rack->in_probe_rtt == 0) && 14430 (IN_FASTRECOVERY(rack->rc_tp->t_flags) == 0)) { 14431 /* 14432 * We want to pace at our rate *or* faster to 14433 * fill the cwnd to the max if its not full. 14434 */ 14435 slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz, &capped, &rate_wanted, 0); 14436 } 14437 if ((rack->rc_inp->inp_route.ro_nh != NULL) && 14438 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 14439 if ((rack->rack_hdw_pace_ena) && 14440 (can_start_hw_pacing > 0) && 14441 (rack->rack_hdrw_pacing == 0) && 14442 (rack->rack_attempt_hdwr_pace == 0)) { 14443 /* 14444 * Lets attempt to turn on hardware pacing 14445 * if we can. 14446 */ 14447 rack->rack_attempt_hdwr_pace = 1; 14448 rack->r_ctl.crte = tcp_set_pacing_rate(rack->rc_tp, 14449 rack->rc_inp->inp_route.ro_nh->nh_ifp, 14450 rate_wanted, 14451 RS_PACING_GEQ, 14452 &err, &rack->r_ctl.crte_prev_rate); 14453 if (rack->r_ctl.crte) { 14454 rack->rack_hdrw_pacing = 1; 14455 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(tp, rate_wanted, segsiz, 14456 0, rack->r_ctl.crte, 14457 NULL); 14458 rack_log_hdwr_pacing(rack, 14459 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 14460 err, 0); 14461 rack->r_ctl.last_hw_bw_req = rate_wanted; 14462 } else { 14463 counter_u64_add(rack_hw_pace_init_fail, 1); 14464 } 14465 } else if (rack->rack_hdrw_pacing && 14466 (rack->r_ctl.last_hw_bw_req != rate_wanted)) { 14467 /* Do we need to adjust our rate? */ 14468 const struct tcp_hwrate_limit_table *nrte; 14469 14470 if (rack->r_up_only && 14471 (rate_wanted < rack->r_ctl.crte->rate)) { 14472 /** 14473 * We have four possible states here 14474 * having to do with the previous time 14475 * and this time. 14476 * previous | this-time 14477 * A) 0 | 0 -- fill_cw not in the picture 14478 * B) 1 | 0 -- we were doing a fill-cw but now are not 14479 * C) 1 | 1 -- all rates from fill_cw 14480 * D) 0 | 1 -- we were doing non-fill and now we are filling 14481 * 14482 * For case A, C and D we don't allow a drop. But for 14483 * case B where we now our on our steady rate we do 14484 * allow a drop. 14485 * 14486 */ 14487 if (!((prev_fill == 1) && (rack->r_via_fill_cw == 0))) 14488 goto done_w_hdwr; 14489 } 14490 if ((rate_wanted > rack->r_ctl.crte->rate) || 14491 (rate_wanted <= rack->r_ctl.crte_prev_rate)) { 14492 if (rack_hw_rate_to_low && 14493 (bw_est < rack_hw_rate_to_low)) { 14494 /* 14495 * The pacing rate is too low for hardware, but 14496 * do allow hardware pacing to be restarted. 
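 * We fall back to software pacing here, releasing the rate entry and
 * clearing rack_attempt_hdwr_pace, so a later pass can try hardware
 * again once bw_est rises back above the rack_hw_rate_to_low floor.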
14497 */ 14498 rack_log_hdwr_pacing(rack, 14499 bw_est, rack->r_ctl.crte->rate, __LINE__, 14500 0, 5); 14501 tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); 14502 rack->r_ctl.crte = NULL; 14503 rack->rack_attempt_hdwr_pace = 0; 14504 rack->rack_hdrw_pacing = 0; 14505 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted); 14506 goto done_w_hdwr; 14507 } 14508 nrte = tcp_chg_pacing_rate(rack->r_ctl.crte, 14509 rack->rc_tp, 14510 rack->rc_inp->inp_route.ro_nh->nh_ifp, 14511 rate_wanted, 14512 RS_PACING_GEQ, 14513 &err, &rack->r_ctl.crte_prev_rate); 14514 if (nrte == NULL) { 14515 /* Lost the rate */ 14516 rack->rack_hdrw_pacing = 0; 14517 rack->r_ctl.crte = NULL; 14518 rack_log_hdwr_pacing(rack, 14519 rate_wanted, 0, __LINE__, 14520 err, 1); 14521 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted); 14522 counter_u64_add(rack_hw_pace_lost, 1); 14523 } else if (nrte != rack->r_ctl.crte) { 14524 rack->r_ctl.crte = nrte; 14525 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(tp, rate_wanted, 14526 segsiz, 0, 14527 rack->r_ctl.crte, 14528 NULL); 14529 rack_log_hdwr_pacing(rack, 14530 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 14531 err, 2); 14532 rack->r_ctl.last_hw_bw_req = rate_wanted; 14533 } 14534 } else { 14535 /* We just need to adjust the segment size */ 14536 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted); 14537 rack_log_hdwr_pacing(rack, 14538 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 14539 0, 4); 14540 rack->r_ctl.last_hw_bw_req = rate_wanted; 14541 } 14542 } 14543 } 14544 if ((rack->r_ctl.crte != NULL) && 14545 (rack->r_ctl.crte->rate == rate_wanted)) { 14546 /* 14547 * We need to add a extra if the rates 14548 * are exactly matched. The idea is 14549 * we want the software to make sure the 14550 * queue is empty before adding more, this 14551 * gives us N MSS extra pace times where 14552 * N is our sysctl 14553 */ 14554 slot += (rack->r_ctl.crte->time_between * rack_hw_pace_extra_slots); 14555 } 14556 done_w_hdwr: 14557 if (rack_limit_time_with_srtt && 14558 (rack->use_fixed_rate == 0) && 14559 #ifdef NETFLIX_PEAKRATE 14560 (rack->rc_tp->t_maxpeakrate == 0) && 14561 #endif 14562 (rack->rack_hdrw_pacing == 0)) { 14563 /* 14564 * Sanity check, we do not allow the pacing delay 14565 * to be longer than the SRTT of the path. If it is 14566 * a slow path, then adding a packet should increase 14567 * the RTT and compensate for this i.e. the srtt will 14568 * be greater so the allowed pacing time will be greater. 14569 * 14570 * Note this restriction is not for where a peak rate 14571 * is set, we are doing fixed pacing or hardware pacing. 14572 */ 14573 if (rack->rc_tp->t_srtt) 14574 srtt = rack->rc_tp->t_srtt; 14575 else 14576 srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC; /* its in ms convert */ 14577 if (srtt < slot) { 14578 rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL); 14579 slot = srtt; 14580 } 14581 } 14582 rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm); 14583 } 14584 if (rack->r_ctl.crte && (rack->r_ctl.crte->rs_num_enobufs > 0)) { 14585 /* 14586 * If this rate is seeing enobufs when it 14587 * goes to send then either the nic is out 14588 * of gas or we are mis-estimating the time 14589 * somehow and not letting the queue empty 14590 * completely. Lets add to the pacing time. 
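 * The boost below is crte->time_between times
 * rack_enobuf_hw_boost_mult, clamped to the range
 * [rack_enobuf_hw_min, rack_enobuf_hw_max] before being added to the
 * slot.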
14591 */ 14592 int hw_boost_delay; 14593 14594 hw_boost_delay = rack->r_ctl.crte->time_between * rack_enobuf_hw_boost_mult; 14595 if (hw_boost_delay > rack_enobuf_hw_max) 14596 hw_boost_delay = rack_enobuf_hw_max; 14597 else if (hw_boost_delay < rack_enobuf_hw_min) 14598 hw_boost_delay = rack_enobuf_hw_min; 14599 slot += hw_boost_delay; 14600 } 14601 if (slot) 14602 counter_u64_add(rack_calc_nonzero, 1); 14603 else 14604 counter_u64_add(rack_calc_zero, 1); 14605 return (slot); 14606 } 14607 14608 static void 14609 rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack, 14610 tcp_seq startseq, uint32_t sb_offset) 14611 { 14612 struct rack_sendmap *my_rsm = NULL; 14613 struct rack_sendmap fe; 14614 14615 if (tp->t_state < TCPS_ESTABLISHED) { 14616 /* 14617 * We don't start any measurements if we are 14618 * not at least established. 14619 */ 14620 return; 14621 } 14622 tp->t_flags |= TF_GPUTINPROG; 14623 rack->r_ctl.rc_gp_lowrtt = 0xffffffff; 14624 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 14625 tp->gput_seq = startseq; 14626 rack->app_limited_needs_set = 0; 14627 if (rack->in_probe_rtt) 14628 rack->measure_saw_probe_rtt = 1; 14629 else if ((rack->measure_saw_probe_rtt) && 14630 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 14631 rack->measure_saw_probe_rtt = 0; 14632 if (rack->rc_gp_filled) 14633 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 14634 else { 14635 /* Special case initial measurement */ 14636 struct timeval tv; 14637 14638 tp->gput_ts = tcp_get_usecs(&tv); 14639 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); 14640 } 14641 /* 14642 * We take a guess out into the future, 14643 * if we have no measurement and no 14644 * initial rate, we measure the first 14645 * initial-windows worth of data to 14646 * speed up getting some GP measurement and 14647 * thus start pacing. 14648 */ 14649 if ((rack->rc_gp_filled == 0) && (rack->r_ctl.init_rate == 0)) { 14650 rack->app_limited_needs_set = 1; 14651 tp->gput_ack = startseq + max(rc_init_window(rack), 14652 (MIN_GP_WIN * ctf_fixed_maxseg(tp))); 14653 rack_log_pacing_delay_calc(rack, 14654 tp->gput_seq, 14655 tp->gput_ack, 14656 0, 14657 tp->gput_ts, 14658 rack->r_ctl.rc_app_limited_cnt, 14659 9, 14660 __LINE__, NULL); 14661 return; 14662 } 14663 if (sb_offset) { 14664 /* 14665 * We are out somewhere in the sb 14666 * can we use the already outstanding data? 14667 */ 14668 14669 if (rack->r_ctl.rc_app_limited_cnt == 0) { 14670 /* 14671 * Yes first one is good and in this case 14672 * the tp->gput_ts is correctly set based on 14673 * the last ack that arrived (no need to 14674 * set things up when an ack comes in). 14675 */ 14676 my_rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 14677 if ((my_rsm == NULL) || 14678 (my_rsm->r_rtr_cnt != 1)) { 14679 /* retransmission? */ 14680 goto use_latest; 14681 } 14682 } else { 14683 if (rack->r_ctl.rc_first_appl == NULL) { 14684 /* 14685 * If rc_first_appl is NULL 14686 * then the cnt should be 0. 14687 * This is probably an error, maybe 14688 * a KASSERT would be approprate. 14689 */ 14690 goto use_latest; 14691 } 14692 /* 14693 * If we have a marker pointer to the last one that is 14694 * app limited we can use that, but we need to set 14695 * things up so that when it gets ack'ed we record 14696 * the ack time (if its not already acked). 14697 */ 14698 rack->app_limited_needs_set = 1; 14699 /* 14700 * We want to get to the rsm that is either 14701 * next with space i.e. over 1 MSS or the one 14702 * after that (after the app-limited). 
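 * That is: take the entry just past rc_first_appl; if it spans more
 * than one MSS the measurement starts one MSS into it, otherwise we
 * move on to the entry after it.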
14703 */ 14704 my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, 14705 rack->r_ctl.rc_first_appl); 14706 if (my_rsm) { 14707 if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp)) 14708 /* Have to use the next one */ 14709 my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, 14710 my_rsm); 14711 else { 14712 /* Use after the first MSS of it is acked */ 14713 tp->gput_seq = my_rsm->r_start + ctf_fixed_maxseg(tp); 14714 goto start_set; 14715 } 14716 } 14717 if ((my_rsm == NULL) || 14718 (my_rsm->r_rtr_cnt != 1)) { 14719 /* 14720 * Either its a retransmit or 14721 * the last is the app-limited one. 14722 */ 14723 goto use_latest; 14724 } 14725 } 14726 tp->gput_seq = my_rsm->r_start; 14727 start_set: 14728 if (my_rsm->r_flags & RACK_ACKED) { 14729 /* 14730 * This one has been acked use the arrival ack time 14731 */ 14732 tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival; 14733 rack->app_limited_needs_set = 0; 14734 } 14735 rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[(my_rsm->r_rtr_cnt-1)]; 14736 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 14737 rack_log_pacing_delay_calc(rack, 14738 tp->gput_seq, 14739 tp->gput_ack, 14740 (uint64_t)my_rsm, 14741 tp->gput_ts, 14742 rack->r_ctl.rc_app_limited_cnt, 14743 9, 14744 __LINE__, NULL); 14745 return; 14746 } 14747 14748 use_latest: 14749 /* 14750 * We don't know how long we may have been 14751 * idle or if this is the first-send. Lets 14752 * setup the flag so we will trim off 14753 * the first ack'd data so we get a true 14754 * measurement. 14755 */ 14756 rack->app_limited_needs_set = 1; 14757 tp->gput_ack = startseq + rack_get_measure_window(tp, rack); 14758 /* Find this guy so we can pull the send time */ 14759 fe.r_start = startseq; 14760 my_rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 14761 if (my_rsm) { 14762 rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[(my_rsm->r_rtr_cnt-1)]; 14763 if (my_rsm->r_flags & RACK_ACKED) { 14764 /* 14765 * Unlikely since its probably what was 14766 * just transmitted (but I am paranoid). 14767 */ 14768 tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival; 14769 rack->app_limited_needs_set = 0; 14770 } 14771 if (SEQ_LT(my_rsm->r_start, tp->gput_seq)) { 14772 /* This also is unlikely */ 14773 tp->gput_seq = my_rsm->r_start; 14774 } 14775 } else { 14776 /* 14777 * TSNH unless we have some send-map limit, 14778 * and even at that it should not be hitting 14779 * that limit (we should have stopped sending). 14780 */ 14781 struct timeval tv; 14782 14783 microuptime(&tv); 14784 rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); 14785 } 14786 rack_log_pacing_delay_calc(rack, 14787 tp->gput_seq, 14788 tp->gput_ack, 14789 (uint64_t)my_rsm, 14790 tp->gput_ts, 14791 rack->r_ctl.rc_app_limited_cnt, 14792 9, __LINE__, NULL); 14793 } 14794 14795 static inline uint32_t 14796 rack_what_can_we_send(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cwnd_to_use, 14797 uint32_t avail, int32_t sb_offset) 14798 { 14799 uint32_t len; 14800 uint32_t sendwin; 14801 14802 if (tp->snd_wnd > cwnd_to_use) 14803 sendwin = cwnd_to_use; 14804 else 14805 sendwin = tp->snd_wnd; 14806 if (ctf_outstanding(tp) >= tp->snd_wnd) { 14807 /* We never want to go over our peers rcv-window */ 14808 len = 0; 14809 } else { 14810 uint32_t flight; 14811 14812 flight = ctf_flight_size(tp, rack->r_ctl.rc_sacked); 14813 if (flight >= sendwin) { 14814 /* 14815 * We have in flight what we are allowed by cwnd (if 14816 * it was rwnd blocking it would have hit above out 14817 * >= tp->snd_wnd). 
14818 */ 14819 return (0); 14820 } 14821 len = sendwin - flight; 14822 if ((len + ctf_outstanding(tp)) > tp->snd_wnd) { 14823 /* We would send too much (beyond the rwnd) */ 14824 len = tp->snd_wnd - ctf_outstanding(tp); 14825 } 14826 if ((len + sb_offset) > avail) { 14827 /* 14828 * We don't have that much in the SB, how much is 14829 * there? 14830 */ 14831 len = avail - sb_offset; 14832 } 14833 } 14834 return (len); 14835 } 14836 14837 static void 14838 rack_log_fsb(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t flags, 14839 unsigned ipoptlen, int32_t orig_len, int32_t len, int error, 14840 int rsm_is_null, int optlen, int line, uint16_t mode) 14841 { 14842 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 14843 union tcp_log_stackspecific log; 14844 struct timeval tv; 14845 14846 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 14847 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 14848 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 14849 log.u_bbr.flex1 = error; 14850 log.u_bbr.flex2 = flags; 14851 log.u_bbr.flex3 = rsm_is_null; 14852 log.u_bbr.flex4 = ipoptlen; 14853 log.u_bbr.flex5 = tp->rcv_numsacks; 14854 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 14855 log.u_bbr.flex7 = optlen; 14856 log.u_bbr.flex8 = rack->r_fsb_inited; 14857 log.u_bbr.applimited = rack->r_fast_output; 14858 log.u_bbr.bw_inuse = rack_get_bw(rack); 14859 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); 14860 log.u_bbr.cwnd_gain = mode; 14861 log.u_bbr.pkts_out = orig_len; 14862 log.u_bbr.lt_epoch = len; 14863 log.u_bbr.delivered = line; 14864 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 14865 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 14866 tcp_log_event_(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_FSB, 0, 14867 len, &log, false, NULL, NULL, 0, &tv); 14868 } 14869 } 14870 14871 14872 static struct mbuf * 14873 rack_fo_base_copym(struct mbuf *the_m, uint32_t the_off, int32_t *plen, 14874 struct rack_fast_send_blk *fsb, 14875 int32_t seglimit, int32_t segsize) 14876 { 14877 #ifdef KERN_TLS 14878 struct ktls_session *tls, *ntls; 14879 struct mbuf *start; 14880 #endif 14881 struct mbuf *m, *n, **np, *smb; 14882 struct mbuf *top; 14883 int32_t off, soff; 14884 int32_t len = *plen; 14885 int32_t fragsize; 14886 int32_t len_cp = 0; 14887 uint32_t mlen, frags; 14888 14889 soff = off = the_off; 14890 smb = m = the_m; 14891 np = ⊤ 14892 top = NULL; 14893 #ifdef KERN_TLS 14894 if (hw_tls && (m->m_flags & M_EXTPG)) 14895 tls = m->m_epg_tls; 14896 else 14897 tls = NULL; 14898 start = m; 14899 #endif 14900 while (len > 0) { 14901 if (m == NULL) { 14902 *plen = len_cp; 14903 break; 14904 } 14905 #ifdef KERN_TLS 14906 if (hw_tls) { 14907 if (m->m_flags & M_EXTPG) 14908 ntls = m->m_epg_tls; 14909 else 14910 ntls = NULL; 14911 14912 /* 14913 * Avoid mixing TLS records with handshake 14914 * data or TLS records from different 14915 * sessions. 14916 */ 14917 if (tls != ntls) { 14918 MPASS(m != start); 14919 *plen = len_cp; 14920 break; 14921 } 14922 } 14923 #endif 14924 mlen = min(len, m->m_len - off); 14925 if (seglimit) { 14926 /* 14927 * For M_EXTPG mbufs, add 3 segments 14928 * + 1 in case we are crossing page boundaries 14929 * + 2 in case the TLS hdr/trailer are used 14930 * It is cheaper to just add the segments 14931 * than it is to take the cache miss to look 14932 * at the mbuf ext_pgs state in detail. 
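 * This deliberately over-estimates the descriptor count; the only cost
 * of the over-estimate should be that a TSO burst gets split a bit
 * earlier than strictly necessary.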
14933 */ 14934 if (m->m_flags & M_EXTPG) { 14935 fragsize = min(segsize, PAGE_SIZE); 14936 frags = 3; 14937 } else { 14938 fragsize = segsize; 14939 frags = 0; 14940 } 14941 14942 /* Break if we really can't fit anymore. */ 14943 if ((frags + 1) >= seglimit) { 14944 *plen = len_cp; 14945 break; 14946 } 14947 14948 /* 14949 * Reduce size if you can't copy the whole 14950 * mbuf. If we can't copy the whole mbuf, also 14951 * adjust len so the loop will end after this 14952 * mbuf. 14953 */ 14954 if ((frags + howmany(mlen, fragsize)) >= seglimit) { 14955 mlen = (seglimit - frags - 1) * fragsize; 14956 len = mlen; 14957 *plen = len_cp + len; 14958 } 14959 frags += howmany(mlen, fragsize); 14960 if (frags == 0) 14961 frags++; 14962 seglimit -= frags; 14963 KASSERT(seglimit > 0, 14964 ("%s: seglimit went too low", __func__)); 14965 } 14966 n = m_get(M_NOWAIT, m->m_type); 14967 *np = n; 14968 if (n == NULL) 14969 goto nospace; 14970 n->m_len = mlen; 14971 soff += mlen; 14972 len_cp += n->m_len; 14973 if (m->m_flags & (M_EXT|M_EXTPG)) { 14974 n->m_data = m->m_data + off; 14975 mb_dupcl(n, m); 14976 } else { 14977 bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), 14978 (u_int)n->m_len); 14979 } 14980 len -= n->m_len; 14981 off = 0; 14982 m = m->m_next; 14983 np = &n->m_next; 14984 if (len || (soff == smb->m_len)) { 14985 /* 14986 * We have more so we move forward or 14987 * we have consumed the entire mbuf and 14988 * len has fell to 0. 14989 */ 14990 soff = 0; 14991 smb = m; 14992 } 14993 14994 } 14995 if (fsb != NULL) { 14996 fsb->m = smb; 14997 fsb->off = soff; 14998 if (smb) { 14999 /* 15000 * Save off the size of the mbuf. We do 15001 * this so that we can recognize when it 15002 * has been trimmed by sbcut() as acks 15003 * come in. 15004 */ 15005 fsb->o_m_len = smb->m_len; 15006 } else { 15007 /* 15008 * This is the case where the next mbuf went to NULL. This 15009 * means with this copy we have sent everything in the sb. 15010 * In theory we could clear the fast_output flag, but lets 15011 * not since its possible that we could get more added 15012 * and acks that call the extend function which would let 15013 * us send more. 15014 */ 15015 fsb->o_m_len = 0; 15016 } 15017 } 15018 return (top); 15019 nospace: 15020 if (top) 15021 m_freem(top); 15022 return (NULL); 15023 15024 } 15025 15026 /* 15027 * This is a copy of m_copym(), taking the TSO segment size/limit 15028 * constraints into account, and advancing the sndptr as it goes. 15029 */ 15030 static struct mbuf * 15031 rack_fo_m_copym(struct tcp_rack *rack, int32_t *plen, 15032 int32_t seglimit, int32_t segsize, struct mbuf **s_mb, int *s_soff) 15033 { 15034 struct mbuf *m, *n; 15035 int32_t soff; 15036 15037 soff = rack->r_ctl.fsb.off; 15038 m = rack->r_ctl.fsb.m; 15039 if (rack->r_ctl.fsb.o_m_len != m->m_len) { 15040 /* 15041 * The mbuf had the front of it chopped off by an ack 15042 * we need to adjust the soff/off by that difference. 
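 * Illustrative example: if the mbuf originally held 4096 bytes
 * (o_m_len) and acks trimmed it to 2048 via sbcut(), any saved offset
 * into it must shrink by that same 2048 byte delta.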
15043 */ 15044 uint32_t delta; 15045 15046 delta = rack->r_ctl.fsb.o_m_len - m->m_len; 15047 soff -= delta; 15048 } 15049 KASSERT(soff >= 0, ("%s, negative off %d", __FUNCTION__, soff)); 15050 KASSERT(*plen >= 0, ("%s, negative len %d", __FUNCTION__, *plen)); 15051 KASSERT(soff < m->m_len, ("%s rack:%p len:%u m:%p m->m_len:%u < off?", 15052 __FUNCTION__, 15053 rack, *plen, m, m->m_len)); 15054 /* Save off the right location before we copy and advance */ 15055 *s_soff = soff; 15056 *s_mb = rack->r_ctl.fsb.m; 15057 n = rack_fo_base_copym(m, soff, plen, 15058 &rack->r_ctl.fsb, 15059 seglimit, segsize); 15060 return (n); 15061 } 15062 15063 static int 15064 rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, 15065 uint64_t ts_val, uint32_t cts, uint32_t ms_cts, struct timeval *tv, int len) 15066 { 15067 /* 15068 * Enter the fast retransmit path. We are given that a sched_pin is 15069 * in place (if accounting is compiled in) and the cycle count taken 15070 * at the entry is in the ts_val. The concept here is that the rsm 15071 * now holds the mbuf offsets and such so we can directly transmit 15072 * without a lot of overhead; the len field is already set to 15073 * keep us from sending too much (usually it is 1 MSS). 15074 */ 15075 struct ip *ip = NULL; 15076 struct udphdr *udp = NULL; 15077 struct tcphdr *th = NULL; 15078 struct mbuf *m = NULL; 15079 struct inpcb *inp; 15080 uint8_t *cpto; 15081 struct tcp_log_buffer *lgb; 15082 #ifdef TCP_ACCOUNTING 15083 uint64_t crtsc; 15084 int cnt_thru = 1; 15085 #endif 15086 int doing_tlp = 0; 15087 struct tcpopt to; 15088 u_char opt[TCP_MAXOLEN]; 15089 uint32_t hdrlen, optlen; 15090 int32_t slot, segsiz, max_val, tso = 0, error, flags, ulen = 0; 15091 uint32_t us_cts; 15092 uint32_t if_hw_tsomaxsegcount = 0, startseq; 15093 uint32_t if_hw_tsomaxsegsize; 15094 15095 #ifdef INET6 15096 struct ip6_hdr *ip6 = NULL; 15097 15098 if (rack->r_is_v6) { 15099 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 15100 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 15101 } else 15102 #endif /* INET6 */ 15103 { 15104 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 15105 hdrlen = sizeof(struct tcpiphdr); 15106 } 15107 if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) { 15108 goto failed; 15109 } 15110 if (rsm->r_flags & RACK_TLP) 15111 doing_tlp = 1; 15112 startseq = rsm->r_start; 15113 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 15114 inp = rack->rc_inp; 15115 to.to_flags = 0; 15116 flags = tcp_outflags[tp->t_state]; 15117 if (flags & (TH_SYN|TH_RST)) { 15118 goto failed; 15119 } 15120 if (rsm->r_flags & RACK_HAS_FIN) { 15121 /* We can't send a FIN here */ 15122 goto failed; 15123 } 15124 if (flags & TH_FIN) { 15125 /* We never send a FIN */ 15126 flags &= ~TH_FIN; 15127 } 15128 if (tp->t_flags & TF_RCVD_TSTMP) { 15129 to.to_tsval = ms_cts + tp->ts_offset; 15130 to.to_tsecr = tp->ts_recent; 15131 to.to_flags = TOF_TS; 15132 } 15133 optlen = tcp_addoptions(&to, opt); 15134 hdrlen += optlen; 15135 udp = rack->r_ctl.fsb.udp; 15136 if (udp) 15137 hdrlen += sizeof(struct udphdr); 15138 if (rack->r_ctl.rc_pace_max_segs) 15139 max_val = rack->r_ctl.rc_pace_max_segs; 15140 else if (rack->rc_user_set_max_segs) 15141 max_val = rack->rc_user_set_max_segs * segsiz; 15142 else 15143 max_val = len; 15144 if ((tp->t_flags & TF_TSO) && 15145 V_tcp_do_tso && 15146 (len > segsiz) && 15147 (tp->t_port == 0)) 15148 tso = 1; 15149 #ifdef INET6 15150 if (MHLEN < hdrlen + max_linkhdr) 15151 m = m_getcl(M_NOWAIT,
MT_DATA, M_PKTHDR); 15152 else 15153 #endif 15154 m = m_gethdr(M_NOWAIT, MT_DATA); 15155 if (m == NULL) 15156 goto failed; 15157 m->m_data += max_linkhdr; 15158 m->m_len = hdrlen; 15159 th = rack->r_ctl.fsb.th; 15160 /* Establish the len to send */ 15161 if (len > max_val) 15162 len = max_val; 15163 if ((tso) && (len + optlen > tp->t_maxseg)) { 15164 uint32_t if_hw_tsomax; 15165 int32_t max_len; 15166 15167 /* extract TSO information */ 15168 if_hw_tsomax = tp->t_tsomax; 15169 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 15170 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 15171 /* 15172 * Check if we should limit by maximum payload 15173 * length: 15174 */ 15175 if (if_hw_tsomax != 0) { 15176 /* compute maximum TSO length */ 15177 max_len = (if_hw_tsomax - hdrlen - 15178 max_linkhdr); 15179 if (max_len <= 0) { 15180 goto failed; 15181 } else if (len > max_len) { 15182 len = max_len; 15183 } 15184 } 15185 if (len <= segsiz) { 15186 /* 15187 * In case there are too many small fragments don't 15188 * use TSO: 15189 */ 15190 tso = 0; 15191 } 15192 } else { 15193 tso = 0; 15194 } 15195 if ((tso == 0) && (len > segsiz)) 15196 len = segsiz; 15197 us_cts = tcp_get_usecs(tv); 15198 if ((len == 0) || 15199 (len <= MHLEN - hdrlen - max_linkhdr)) { 15200 goto failed; 15201 } 15202 th->th_seq = htonl(rsm->r_start); 15203 th->th_ack = htonl(tp->rcv_nxt); 15204 /* 15205 * The PUSH bit should only be applied 15206 * if the full retransmission is made. If 15207 * we are sending less than this is the 15208 * left hand edge and should not have 15209 * the PUSH bit. 15210 */ 15211 if ((rsm->r_flags & RACK_HAD_PUSH) && 15212 (len == (rsm->r_end - rsm->r_start))) 15213 flags |= TH_PUSH; 15214 th->th_flags = flags; 15215 th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale)); 15216 if (th->th_win == 0) { 15217 tp->t_sndzerowin++; 15218 tp->t_flags |= TF_RXWIN0SENT; 15219 } else 15220 tp->t_flags &= ~TF_RXWIN0SENT; 15221 if (rsm->r_flags & RACK_TLP) { 15222 /* 15223 * TLP should not count in retran count, but 15224 * in its own bin 15225 */ 15226 counter_u64_add(rack_tlp_retran, 1); 15227 counter_u64_add(rack_tlp_retran_bytes, len); 15228 } else { 15229 tp->t_sndrexmitpack++; 15230 KMOD_TCPSTAT_INC(tcps_sndrexmitpack); 15231 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len); 15232 } 15233 #ifdef STATS 15234 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 15235 len); 15236 #endif 15237 if (rsm->m == NULL) 15238 goto failed; 15239 if (rsm->orig_m_len != rsm->m->m_len) { 15240 /* Fix up the orig_m_len and possibly the mbuf offset */ 15241 rack_adjust_orig_mlen(rsm); 15242 } 15243 m->m_next = rack_fo_base_copym(rsm->m, rsm->soff, &len, NULL, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize); 15244 if (len <= segsiz) { 15245 /* 15246 * Must have ran out of mbufs for the copy 15247 * shorten it to no longer need tso. Lets 15248 * not put on sendalot since we are low on 15249 * mbufs. 
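* (rack_fo_base_copym() may have reduced len to what it could actually wrap in mbufs, so one plain non-TSO transmission of the shortened length is the conservative choice here.)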
15250 */ 15251 tso = 0; 15252 } 15253 if ((m->m_next == NULL) || (len <= 0)){ 15254 goto failed; 15255 } 15256 if (udp) { 15257 if (rack->r_is_v6) 15258 ulen = hdrlen + len - sizeof(struct ip6_hdr); 15259 else 15260 ulen = hdrlen + len - sizeof(struct ip); 15261 udp->uh_ulen = htons(ulen); 15262 } 15263 m->m_pkthdr.rcvif = (struct ifnet *)0; 15264 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 15265 #ifdef INET6 15266 if (rack->r_is_v6) { 15267 if (tp->t_port) { 15268 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 15269 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 15270 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 15271 th->th_sum = htons(0); 15272 UDPSTAT_INC(udps_opackets); 15273 } else { 15274 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 15275 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 15276 th->th_sum = in6_cksum_pseudo(ip6, 15277 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 15278 0); 15279 } 15280 } 15281 #endif 15282 #if defined(INET6) && defined(INET) 15283 else 15284 #endif 15285 #ifdef INET 15286 { 15287 if (tp->t_port) { 15288 m->m_pkthdr.csum_flags = CSUM_UDP; 15289 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 15290 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 15291 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 15292 th->th_sum = htons(0); 15293 UDPSTAT_INC(udps_opackets); 15294 } else { 15295 m->m_pkthdr.csum_flags = CSUM_TCP; 15296 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 15297 th->th_sum = in_pseudo(ip->ip_src.s_addr, 15298 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 15299 IPPROTO_TCP + len + optlen)); 15300 } 15301 /* IP version must be set here for ipv4/ipv6 checking later */ 15302 KASSERT(ip->ip_v == IPVERSION, 15303 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 15304 } 15305 #endif 15306 if (tso) { 15307 KASSERT(len > tp->t_maxseg - optlen, 15308 ("%s: len <= tso_segsz tp:%p", __func__, tp)); 15309 m->m_pkthdr.csum_flags |= CSUM_TSO; 15310 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; 15311 } 15312 #ifdef INET6 15313 if (rack->r_is_v6) { 15314 ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit; 15315 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 15316 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 15317 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 15318 else 15319 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 15320 } 15321 #endif 15322 #if defined(INET) && defined(INET6) 15323 else 15324 #endif 15325 #ifdef INET 15326 { 15327 ip->ip_len = htons(m->m_pkthdr.len); 15328 ip->ip_ttl = rack->r_ctl.fsb.hoplimit; 15329 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 15330 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 15331 if (tp->t_port == 0 || len < V_tcp_minmss) { 15332 ip->ip_off |= htons(IP_DF); 15333 } 15334 } else { 15335 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 15336 } 15337 } 15338 #endif 15339 /* Time to copy in our header */ 15340 cpto = mtod(m, uint8_t *); 15341 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); 15342 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr)); 15343 if (optlen) { 15344 bcopy(opt, th + 1, optlen); 15345 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 15346 } else { 15347 th->th_off = sizeof(struct tcphdr) >> 2; 15348 } 15349 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 15350 union tcp_log_stackspecific log; 15351 15352 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 15353 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 15354 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 15355 if (rack->rack_no_prr) 15356 
log.u_bbr.flex1 = 0; 15357 else 15358 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 15359 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 15360 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 15361 log.u_bbr.flex4 = max_val; 15362 log.u_bbr.flex5 = 0; 15363 /* Save off the early/late values */ 15364 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 15365 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 15366 log.u_bbr.bw_inuse = rack_get_bw(rack); 15367 log.u_bbr.flex8 = 1; 15368 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); 15369 log.u_bbr.flex7 = 55; 15370 log.u_bbr.pkts_out = tp->t_maxseg; 15371 log.u_bbr.timeStamp = cts; 15372 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 15373 log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use; 15374 log.u_bbr.delivered = 0; 15375 lgb = tcp_log_event_(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, 15376 len, &log, false, NULL, NULL, 0, tv); 15377 } else 15378 lgb = NULL; 15379 #ifdef INET6 15380 if (rack->r_is_v6) { 15381 error = ip6_output(m, NULL, 15382 &inp->inp_route6, 15383 0, NULL, NULL, inp); 15384 } 15385 #endif 15386 #if defined(INET) && defined(INET6) 15387 else 15388 #endif 15389 #ifdef INET 15390 { 15391 error = ip_output(m, NULL, 15392 &inp->inp_route, 15393 0, 0, inp); 15394 } 15395 #endif 15396 m = NULL; 15397 if (lgb) { 15398 lgb->tlb_errno = error; 15399 lgb = NULL; 15400 } 15401 if (error) { 15402 goto failed; 15403 } 15404 rack_log_output(tp, &to, len, rsm->r_start, flags, error, rack_to_usec_ts(tv), 15405 rsm, RACK_SENT_FP, rsm->m, rsm->soff); 15406 if (doing_tlp && (rack->fast_rsm_hack == 0)) { 15407 rack->rc_tlp_in_progress = 1; 15408 rack->r_ctl.rc_tlp_cnt_out++; 15409 } 15410 if (error == 0) 15411 tcp_account_for_send(tp, len, 1, doing_tlp); 15412 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 15413 rack->forced_ack = 0; /* If we send something zap the FA flag */ 15414 if (IN_FASTRECOVERY(tp->t_flags) && rsm) 15415 rack->r_ctl.retran_during_recovery += len; 15416 { 15417 int idx; 15418 15419 idx = (len / segsiz) + 3; 15420 if (idx >= TCP_MSS_ACCT_ATIMER) 15421 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 15422 else 15423 counter_u64_add(rack_out_size[idx], 1); 15424 } 15425 if (tp->t_rtttime == 0) { 15426 tp->t_rtttime = ticks; 15427 tp->t_rtseq = startseq; 15428 KMOD_TCPSTAT_INC(tcps_segstimed); 15429 } 15430 counter_u64_add(rack_fto_rsm_send, 1); 15431 if (error && (error == ENOBUFS)) { 15432 slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); 15433 if (rack->rc_enobuf < 0x7f) 15434 rack->rc_enobuf++; 15435 if (slot < (10 * HPTS_USEC_IN_MSEC)) 15436 slot = 10 * HPTS_USEC_IN_MSEC; 15437 } else 15438 slot = rack_get_pacing_delay(rack, tp, len, NULL, segsiz); 15439 if ((slot == 0) || 15440 (rack->rc_always_pace == 0) || 15441 (rack->r_rr_config == 1)) { 15442 /* 15443 * We have no pacing set or we 15444 * are using old-style rack or 15445 * we are overriden to use the old 1ms pacing. 15446 */ 15447 slot = rack->r_ctl.rc_min_to; 15448 } 15449 rack_start_hpts_timer(rack, tp, cts, slot, len, 0); 15450 if (rack->r_must_retran) { 15451 rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start); 15452 if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) { 15453 /* 15454 * We have retransmitted all we need. 
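* For example (illustrative sequence numbers): if snd_max stood at 150000 when the RTO fired (rc_snd_max_at_rto) and this rsm ends at or beyond 150000, the forced retransmission phase is complete and both flags are cleared below.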
15455 */ 15456 rack->r_must_retran = 0; 15457 rack->r_ctl.rc_out_at_rto = 0; 15458 } 15459 } 15460 #ifdef TCP_ACCOUNTING 15461 crtsc = get_cyclecount(); 15462 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 15463 tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru; 15464 } 15465 counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], cnt_thru); 15466 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 15467 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); 15468 } 15469 counter_u64_add(tcp_proc_time[SND_OUT_DATA], (crtsc - ts_val)); 15470 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 15471 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((len + segsiz - 1) / segsiz); 15472 } 15473 counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((len + segsiz - 1) / segsiz)); 15474 sched_unpin(); 15475 #endif 15476 return (0); 15477 failed: 15478 if (m) 15479 m_free(m); 15480 return (-1); 15481 } 15482 15483 static void 15484 rack_sndbuf_autoscale(struct tcp_rack *rack) 15485 { 15486 /* 15487 * Automatic sizing of send socket buffer. Often the send buffer 15488 * size is not optimally adjusted to the actual network conditions 15489 * at hand (delay bandwidth product). Setting the buffer size too 15490 * small limits throughput on links with high bandwidth and high 15491 * delay (eg. trans-continental/oceanic links). Setting the 15492 * buffer size too big consumes too much real kernel memory, 15493 * especially with many connections on busy servers. 15494 * 15495 * The criteria to step up the send buffer one notch are: 15496 * 1. receive window of remote host is larger than send buffer 15497 * (with a fudge factor of 5/4th); 15498 * 2. send buffer is filled to 7/8th with data (so we actually 15499 * have data to make use of it); 15500 * 3. send buffer fill has not hit maximal automatic size; 15501 * 4. our send window (slow start and cogestion controlled) is 15502 * larger than sent but unacknowledged data in send buffer. 15503 * 15504 * Note that the rack version moves things much faster since 15505 * we want to avoid hitting cache lines in the rack_fast_output() 15506 * path so this is called much less often and thus moves 15507 * the SB forward by a percentage. 15508 */ 15509 struct socket *so; 15510 struct tcpcb *tp; 15511 uint32_t sendwin, scaleup; 15512 15513 tp = rack->rc_tp; 15514 so = rack->rc_inp->inp_socket; 15515 sendwin = min(rack->r_ctl.cwnd_to_use, tp->snd_wnd); 15516 if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { 15517 if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat && 15518 sbused(&so->so_snd) >= 15519 (so->so_snd.sb_hiwat / 8 * 7) && 15520 sbused(&so->so_snd) < V_tcp_autosndbuf_max && 15521 sendwin >= (sbused(&so->so_snd) - 15522 (tp->snd_nxt - tp->snd_una))) { 15523 if (rack_autosndbuf_inc) 15524 scaleup = (rack_autosndbuf_inc * so->so_snd.sb_hiwat) / 100; 15525 else 15526 scaleup = V_tcp_autosndbuf_inc; 15527 if (scaleup < V_tcp_autosndbuf_inc) 15528 scaleup = V_tcp_autosndbuf_inc; 15529 scaleup += so->so_snd.sb_hiwat; 15530 if (scaleup > V_tcp_autosndbuf_max) 15531 scaleup = V_tcp_autosndbuf_max; 15532 if (!sbreserve_locked(&so->so_snd, scaleup, so, curthread)) 15533 so->so_snd.sb_flags &= ~SB_AUTOSIZE; 15534 } 15535 } 15536 } 15537 15538 static int 15539 rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val, 15540 uint32_t cts, uint32_t ms_cts, struct timeval *tv, long tot_len, int *send_err) 15541 { 15542 /* 15543 * Enter to do fast output. 
We are given that the sched_pin is 15544 * in place (if accounting is compiled in) and the cycle count taken 15545 * at entry is in place in ts_val. The idea here is that 15546 * we know how many more bytes needs to be sent (presumably either 15547 * during pacing or to fill the cwnd and that was greater than 15548 * the max-burst). We have how much to send and all the info we 15549 * need to just send. 15550 */ 15551 struct ip *ip = NULL; 15552 struct udphdr *udp = NULL; 15553 struct tcphdr *th = NULL; 15554 struct mbuf *m, *s_mb; 15555 struct inpcb *inp; 15556 uint8_t *cpto; 15557 struct tcp_log_buffer *lgb; 15558 #ifdef TCP_ACCOUNTING 15559 uint64_t crtsc; 15560 #endif 15561 struct tcpopt to; 15562 u_char opt[TCP_MAXOLEN]; 15563 uint32_t hdrlen, optlen; 15564 int cnt_thru = 1; 15565 int32_t slot, segsiz, len, max_val, tso = 0, sb_offset, error, flags, ulen = 0; 15566 uint32_t us_cts, s_soff; 15567 uint32_t if_hw_tsomaxsegcount = 0, startseq; 15568 uint32_t if_hw_tsomaxsegsize; 15569 uint16_t add_flag = RACK_SENT_FP; 15570 #ifdef INET6 15571 struct ip6_hdr *ip6 = NULL; 15572 15573 if (rack->r_is_v6) { 15574 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 15575 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 15576 } else 15577 #endif /* INET6 */ 15578 { 15579 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 15580 hdrlen = sizeof(struct tcpiphdr); 15581 } 15582 if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) { 15583 m = NULL; 15584 goto failed; 15585 } 15586 startseq = tp->snd_max; 15587 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 15588 inp = rack->rc_inp; 15589 len = rack->r_ctl.fsb.left_to_send; 15590 to.to_flags = 0; 15591 flags = rack->r_ctl.fsb.tcp_flags; 15592 if (tp->t_flags & TF_RCVD_TSTMP) { 15593 to.to_tsval = ms_cts + tp->ts_offset; 15594 to.to_tsecr = tp->ts_recent; 15595 to.to_flags = TOF_TS; 15596 } 15597 optlen = tcp_addoptions(&to, opt); 15598 hdrlen += optlen; 15599 udp = rack->r_ctl.fsb.udp; 15600 if (udp) 15601 hdrlen += sizeof(struct udphdr); 15602 if (rack->r_ctl.rc_pace_max_segs) 15603 max_val = rack->r_ctl.rc_pace_max_segs; 15604 else if (rack->rc_user_set_max_segs) 15605 max_val = rack->rc_user_set_max_segs * segsiz; 15606 else 15607 max_val = len; 15608 if ((tp->t_flags & TF_TSO) && 15609 V_tcp_do_tso && 15610 (len > segsiz) && 15611 (tp->t_port == 0)) 15612 tso = 1; 15613 again: 15614 #ifdef INET6 15615 if (MHLEN < hdrlen + max_linkhdr) 15616 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 15617 else 15618 #endif 15619 m = m_gethdr(M_NOWAIT, MT_DATA); 15620 if (m == NULL) 15621 goto failed; 15622 m->m_data += max_linkhdr; 15623 m->m_len = hdrlen; 15624 th = rack->r_ctl.fsb.th; 15625 /* Establish the len to send */ 15626 if (len > max_val) 15627 len = max_val; 15628 if ((tso) && (len + optlen > tp->t_maxseg)) { 15629 uint32_t if_hw_tsomax; 15630 int32_t max_len; 15631 15632 /* extract TSO information */ 15633 if_hw_tsomax = tp->t_tsomax; 15634 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 15635 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 15636 /* 15637 * Check if we should limit by maximum payload 15638 * length: 15639 */ 15640 if (if_hw_tsomax != 0) { 15641 /* compute maximum TSO length */ 15642 max_len = (if_hw_tsomax - hdrlen - 15643 max_linkhdr); 15644 if (max_len <= 0) { 15645 goto failed; 15646 } else if (len > max_len) { 15647 len = max_len; 15648 } 15649 } 15650 if (len <= segsiz) { 15651 /* 15652 * In case there are too many small fragments don't 15653 * use TSO: 15654 */ 15655 tso = 0; 15656 } 15657 } else { 15658 tso = 0; 15659 
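/* Either TSO was not chosen above or the payload plus options fits in a single t_maxseg, so fall back to the plain non-TSO send path. */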
} 15660 if ((tso == 0) && (len > segsiz)) 15661 len = segsiz; 15662 us_cts = tcp_get_usecs(tv); 15663 if ((len == 0) || 15664 (len <= MHLEN - hdrlen - max_linkhdr)) { 15665 goto failed; 15666 } 15667 sb_offset = tp->snd_max - tp->snd_una; 15668 th->th_seq = htonl(tp->snd_max); 15669 th->th_ack = htonl(tp->rcv_nxt); 15670 th->th_flags = flags; 15671 th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale)); 15672 if (th->th_win == 0) { 15673 tp->t_sndzerowin++; 15674 tp->t_flags |= TF_RXWIN0SENT; 15675 } else 15676 tp->t_flags &= ~TF_RXWIN0SENT; 15677 tp->snd_up = tp->snd_una; /* drag it along, its deprecated */ 15678 KMOD_TCPSTAT_INC(tcps_sndpack); 15679 KMOD_TCPSTAT_ADD(tcps_sndbyte, len); 15680 #ifdef STATS 15681 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 15682 len); 15683 #endif 15684 if (rack->r_ctl.fsb.m == NULL) 15685 goto failed; 15686 15687 /* s_mb and s_soff are saved for rack_log_output */ 15688 m->m_next = rack_fo_m_copym(rack, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, &s_mb, &s_soff); 15689 if (len <= segsiz) { 15690 /* 15691 * Must have ran out of mbufs for the copy 15692 * shorten it to no longer need tso. Lets 15693 * not put on sendalot since we are low on 15694 * mbufs. 15695 */ 15696 tso = 0; 15697 } 15698 if (rack->r_ctl.fsb.rfo_apply_push && 15699 (len == rack->r_ctl.fsb.left_to_send)) { 15700 th->th_flags |= TH_PUSH; 15701 add_flag |= RACK_HAD_PUSH; 15702 } 15703 if ((m->m_next == NULL) || (len <= 0)){ 15704 goto failed; 15705 } 15706 if (udp) { 15707 if (rack->r_is_v6) 15708 ulen = hdrlen + len - sizeof(struct ip6_hdr); 15709 else 15710 ulen = hdrlen + len - sizeof(struct ip); 15711 udp->uh_ulen = htons(ulen); 15712 } 15713 m->m_pkthdr.rcvif = (struct ifnet *)0; 15714 if (tp->t_state == TCPS_ESTABLISHED && 15715 (tp->t_flags2 & TF2_ECN_PERMIT)) { 15716 /* 15717 * If the peer has ECN, mark data packets with ECN capable 15718 * transmission (ECT). Ignore pure ack packets, 15719 * retransmissions. 15720 */ 15721 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max)) { 15722 #ifdef INET6 15723 if (rack->r_is_v6) 15724 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); 15725 else 15726 #endif 15727 ip->ip_tos |= IPTOS_ECN_ECT0; 15728 KMOD_TCPSTAT_INC(tcps_ecn_ect0); 15729 /* 15730 * Reply with proper ECN notifications. 15731 * Only set CWR on new data segments. 
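* (For instance, after an ECE from the peer forced a cwnd reduction, TF2_ECN_SND_CWR is set and the next new data segment, i.e. this one, carries CWR exactly once, while TH_ECE below keeps being echoed for as long as TF2_ECN_SND_ECE remains set.)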
15732 */ 15733 if (tp->t_flags2 & TF2_ECN_SND_CWR) { 15734 flags |= TH_CWR; 15735 tp->t_flags2 &= ~TF2_ECN_SND_CWR; 15736 } 15737 } 15738 if (tp->t_flags2 & TF2_ECN_SND_ECE) 15739 flags |= TH_ECE; 15740 } 15741 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 15742 #ifdef INET6 15743 if (rack->r_is_v6) { 15744 if (tp->t_port) { 15745 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 15746 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 15747 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 15748 th->th_sum = htons(0); 15749 UDPSTAT_INC(udps_opackets); 15750 } else { 15751 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 15752 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 15753 th->th_sum = in6_cksum_pseudo(ip6, 15754 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 15755 0); 15756 } 15757 } 15758 #endif 15759 #if defined(INET6) && defined(INET) 15760 else 15761 #endif 15762 #ifdef INET 15763 { 15764 if (tp->t_port) { 15765 m->m_pkthdr.csum_flags = CSUM_UDP; 15766 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 15767 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 15768 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 15769 th->th_sum = htons(0); 15770 UDPSTAT_INC(udps_opackets); 15771 } else { 15772 m->m_pkthdr.csum_flags = CSUM_TCP; 15773 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 15774 th->th_sum = in_pseudo(ip->ip_src.s_addr, 15775 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 15776 IPPROTO_TCP + len + optlen)); 15777 } 15778 /* IP version must be set here for ipv4/ipv6 checking later */ 15779 KASSERT(ip->ip_v == IPVERSION, 15780 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 15781 } 15782 #endif 15783 if (tso) { 15784 KASSERT(len > tp->t_maxseg - optlen, 15785 ("%s: len <= tso_segsz tp:%p", __func__, tp)); 15786 m->m_pkthdr.csum_flags |= CSUM_TSO; 15787 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; 15788 } 15789 #ifdef INET6 15790 if (rack->r_is_v6) { 15791 ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit; 15792 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 15793 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 15794 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 15795 else 15796 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 15797 } 15798 #endif 15799 #if defined(INET) && defined(INET6) 15800 else 15801 #endif 15802 #ifdef INET 15803 { 15804 ip->ip_len = htons(m->m_pkthdr.len); 15805 ip->ip_ttl = rack->r_ctl.fsb.hoplimit; 15806 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 15807 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 15808 if (tp->t_port == 0 || len < V_tcp_minmss) { 15809 ip->ip_off |= htons(IP_DF); 15810 } 15811 } else { 15812 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 15813 } 15814 } 15815 #endif 15816 /* Time to copy in our header */ 15817 cpto = mtod(m, uint8_t *); 15818 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); 15819 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr)); 15820 if (optlen) { 15821 bcopy(opt, th + 1, optlen); 15822 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 15823 } else { 15824 th->th_off = sizeof(struct tcphdr) >> 2; 15825 } 15826 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 15827 union tcp_log_stackspecific log; 15828 15829 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 15830 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 15831 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 15832 if (rack->rack_no_prr) 15833 log.u_bbr.flex1 = 0; 15834 else 15835 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 15836 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 15837 
log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 15838 log.u_bbr.flex4 = max_val; 15839 log.u_bbr.flex5 = 0; 15840 /* Save off the early/late values */ 15841 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 15842 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 15843 log.u_bbr.bw_inuse = rack_get_bw(rack); 15844 log.u_bbr.flex8 = 0; 15845 log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); 15846 log.u_bbr.flex7 = 44; 15847 log.u_bbr.pkts_out = tp->t_maxseg; 15848 log.u_bbr.timeStamp = cts; 15849 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 15850 log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use; 15851 log.u_bbr.delivered = 0; 15852 lgb = tcp_log_event_(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, 15853 len, &log, false, NULL, NULL, 0, tv); 15854 } else 15855 lgb = NULL; 15856 #ifdef INET6 15857 if (rack->r_is_v6) { 15858 error = ip6_output(m, NULL, 15859 &inp->inp_route6, 15860 0, NULL, NULL, inp); 15861 } 15862 #endif 15863 #if defined(INET) && defined(INET6) 15864 else 15865 #endif 15866 #ifdef INET 15867 { 15868 error = ip_output(m, NULL, 15869 &inp->inp_route, 15870 0, 0, inp); 15871 } 15872 #endif 15873 if (lgb) { 15874 lgb->tlb_errno = error; 15875 lgb = NULL; 15876 } 15877 if (error) { 15878 *send_err = error; 15879 m = NULL; 15880 goto failed; 15881 } 15882 rack_log_output(tp, &to, len, tp->snd_max, flags, error, rack_to_usec_ts(tv), 15883 NULL, add_flag, s_mb, s_soff); 15884 m = NULL; 15885 if (tp->snd_una == tp->snd_max) { 15886 rack->r_ctl.rc_tlp_rxt_last_time = cts; 15887 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 15888 tp->t_acktime = ticks; 15889 } 15890 if (error == 0) 15891 tcp_account_for_send(tp, len, 0, 0); 15892 15893 rack->forced_ack = 0; /* If we send something zap the FA flag */ 15894 tot_len += len; 15895 if ((tp->t_flags & TF_GPUTINPROG) == 0) 15896 rack_start_gp_measurement(tp, rack, tp->snd_max, sb_offset); 15897 tp->snd_max += len; 15898 tp->snd_nxt = tp->snd_max; 15899 { 15900 int idx; 15901 15902 idx = (len / segsiz) + 3; 15903 if (idx >= TCP_MSS_ACCT_ATIMER) 15904 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 15905 else 15906 counter_u64_add(rack_out_size[idx], 1); 15907 } 15908 if (len <= rack->r_ctl.fsb.left_to_send) 15909 rack->r_ctl.fsb.left_to_send -= len; 15910 else 15911 rack->r_ctl.fsb.left_to_send = 0; 15912 if (rack->r_ctl.fsb.left_to_send < segsiz) { 15913 rack->r_fast_output = 0; 15914 rack->r_ctl.fsb.left_to_send = 0; 15915 /* At the end of fast_output scale up the sb */ 15916 SOCKBUF_LOCK(&rack->rc_inp->inp_socket->so_snd); 15917 rack_sndbuf_autoscale(rack); 15918 SOCKBUF_UNLOCK(&rack->rc_inp->inp_socket->so_snd); 15919 } 15920 if (tp->t_rtttime == 0) { 15921 tp->t_rtttime = ticks; 15922 tp->t_rtseq = startseq; 15923 KMOD_TCPSTAT_INC(tcps_segstimed); 15924 } 15925 if ((rack->r_ctl.fsb.left_to_send >= segsiz) && 15926 (max_val > len) && 15927 (tso == 0)) { 15928 max_val -= len; 15929 len = segsiz; 15930 th = rack->r_ctl.fsb.th; 15931 cnt_thru++; 15932 goto again; 15933 } 15934 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 15935 counter_u64_add(rack_fto_send, 1); 15936 slot = rack_get_pacing_delay(rack, tp, tot_len, NULL, segsiz); 15937 rack_start_hpts_timer(rack, tp, cts, slot, tot_len, 0); 15938 #ifdef TCP_ACCOUNTING 15939 crtsc = get_cyclecount(); 15940 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 15941 tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru; 15942 } 15943 counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], cnt_thru); 15944 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 15945 
tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); 15946 } 15947 counter_u64_add(tcp_proc_time[SND_OUT_DATA], (crtsc - ts_val)); 15948 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 15949 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len + segsiz - 1) / segsiz); 15950 } 15951 counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((tot_len + segsiz - 1) / segsiz)); 15952 sched_unpin(); 15953 #endif 15954 return (0); 15955 failed: 15956 if (m) 15957 m_free(m); 15958 rack->r_fast_output = 0; 15959 return (-1); 15960 } 15961 15962 static int 15963 rack_output(struct tcpcb *tp) 15964 { 15965 struct socket *so; 15966 uint32_t recwin; 15967 uint32_t sb_offset, s_moff = 0; 15968 int32_t len, flags, error = 0; 15969 struct mbuf *m, *s_mb = NULL; 15970 struct mbuf *mb; 15971 uint32_t if_hw_tsomaxsegcount = 0; 15972 uint32_t if_hw_tsomaxsegsize; 15973 int32_t segsiz, minseg; 15974 long tot_len_this_send = 0; 15975 #ifdef INET 15976 struct ip *ip = NULL; 15977 #endif 15978 #ifdef TCPDEBUG 15979 struct ipovly *ipov = NULL; 15980 #endif 15981 struct udphdr *udp = NULL; 15982 struct tcp_rack *rack; 15983 struct tcphdr *th; 15984 uint8_t pass = 0; 15985 uint8_t mark = 0; 15986 uint8_t wanted_cookie = 0; 15987 u_char opt[TCP_MAXOLEN]; 15988 unsigned ipoptlen, optlen, hdrlen, ulen=0; 15989 uint32_t rack_seq; 15990 15991 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 15992 unsigned ipsec_optlen = 0; 15993 15994 #endif 15995 int32_t idle, sendalot; 15996 int32_t sub_from_prr = 0; 15997 volatile int32_t sack_rxmit; 15998 struct rack_sendmap *rsm = NULL; 15999 int32_t tso, mtu; 16000 struct tcpopt to; 16001 int32_t slot = 0; 16002 int32_t sup_rack = 0; 16003 uint32_t cts, ms_cts, delayed, early; 16004 uint16_t add_flag = RACK_SENT_SP; 16005 uint8_t hpts_calling, doing_tlp = 0; 16006 uint32_t cwnd_to_use, pace_max_seg; 16007 int32_t do_a_prefetch = 0; 16008 int32_t prefetch_rsm = 0; 16009 int32_t orig_len = 0; 16010 struct timeval tv; 16011 int32_t prefetch_so_done = 0; 16012 struct tcp_log_buffer *lgb; 16013 struct inpcb *inp; 16014 struct sockbuf *sb; 16015 uint64_t ts_val = 0; 16016 #ifdef TCP_ACCOUNTING 16017 uint64_t crtsc; 16018 #endif 16019 #ifdef INET6 16020 struct ip6_hdr *ip6 = NULL; 16021 int32_t isipv6; 16022 #endif 16023 uint8_t filled_all = 0; 16024 bool hw_tls = false; 16025 16026 /* setup and take the cache hits here */ 16027 rack = (struct tcp_rack *)tp->t_fb_ptr; 16028 #ifdef TCP_ACCOUNTING 16029 sched_pin(); 16030 ts_val = get_cyclecount(); 16031 #endif 16032 hpts_calling = rack->rc_inp->inp_hpts_calls; 16033 NET_EPOCH_ASSERT(); 16034 INP_WLOCK_ASSERT(rack->rc_inp); 16035 #ifdef TCP_OFFLOAD 16036 if (tp->t_flags & TF_TOE) { 16037 #ifdef TCP_ACCOUNTING 16038 sched_unpin(); 16039 #endif 16040 return (tcp_offload_output(tp)); 16041 } 16042 #endif 16043 /* 16044 * For TFO connections in SYN_RECEIVED, only allow the initial 16045 * SYN|ACK and those sent by the retransmit timer. 
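* In other words, once snd_max has advanced past snd_una the SYN|ACK is already out, and unless rc_resend was set up by the retransmit timer there is nothing this path may legitimately send yet, so we simply return.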
16046 */ 16047 if (IS_FASTOPEN(tp->t_flags) && 16048 (tp->t_state == TCPS_SYN_RECEIVED) && 16049 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */ 16050 (rack->r_ctl.rc_resend == NULL)) { /* not a retransmit */ 16051 #ifdef TCP_ACCOUNTING 16052 sched_unpin(); 16053 #endif 16054 return (0); 16055 } 16056 #ifdef INET6 16057 if (rack->r_state) { 16058 /* Use the cache line loaded if possible */ 16059 isipv6 = rack->r_is_v6; 16060 } else { 16061 isipv6 = (rack->rc_inp->inp_vflag & INP_IPV6) != 0; 16062 } 16063 #endif 16064 early = 0; 16065 cts = tcp_get_usecs(&tv); 16066 ms_cts = tcp_tv_to_mssectick(&tv); 16067 if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && 16068 rack->rc_inp->inp_in_hpts) { 16069 /* 16070 * We are on the hpts for some timer but not hptsi output. 16071 * Remove from the hpts unconditionally. 16072 */ 16073 rack_timer_cancel(tp, rack, cts, __LINE__); 16074 } 16075 /* Are we pacing and late? */ 16076 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 16077 TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) { 16078 /* We are delayed */ 16079 delayed = cts - rack->r_ctl.rc_last_output_to; 16080 } else { 16081 delayed = 0; 16082 } 16083 /* Do the timers, which may override the pacer */ 16084 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 16085 if (rack_process_timers(tp, rack, cts, hpts_calling)) { 16086 counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); 16087 #ifdef TCP_ACCOUNTING 16088 sched_unpin(); 16089 #endif 16090 return (0); 16091 } 16092 } 16093 if (rack->rc_in_persist) { 16094 if (rack->rc_inp->inp_in_hpts == 0) { 16095 /* Timer is not running */ 16096 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 16097 } 16098 #ifdef TCP_ACCOUNTING 16099 sched_unpin(); 16100 #endif 16101 return (0); 16102 } 16103 if ((rack->r_timer_override) || 16104 (rack->rc_ack_can_sendout_data) || 16105 (delayed) || 16106 (tp->t_state < TCPS_ESTABLISHED)) { 16107 rack->rc_ack_can_sendout_data = 0; 16108 if (rack->rc_inp->inp_in_hpts) 16109 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 16110 } else if (rack->rc_inp->inp_in_hpts) { 16111 /* 16112 * On the hpts you can't pass even if ACKNOW is on, we will 16113 * when the hpts fires. 
16114 */ 16115 #ifdef TCP_ACCOUNTING 16116 crtsc = get_cyclecount(); 16117 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16118 tp->tcp_proc_time[SND_BLOCKED] += (crtsc - ts_val); 16119 } 16120 counter_u64_add(tcp_proc_time[SND_BLOCKED], (crtsc - ts_val)); 16121 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 16122 tp->tcp_cnt_counters[SND_BLOCKED]++; 16123 } 16124 counter_u64_add(tcp_cnt_counters[SND_BLOCKED], 1); 16125 sched_unpin(); 16126 #endif 16127 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1); 16128 return (0); 16129 } 16130 rack->rc_inp->inp_hpts_calls = 0; 16131 /* Finish out both pacing early and late accounting */ 16132 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 16133 TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) { 16134 early = rack->r_ctl.rc_last_output_to - cts; 16135 } else 16136 early = 0; 16137 if (delayed) { 16138 rack->r_ctl.rc_agg_delayed += delayed; 16139 rack->r_late = 1; 16140 } else if (early) { 16141 rack->r_ctl.rc_agg_early += early; 16142 rack->r_early = 1; 16143 } 16144 /* Now that early/late accounting is done turn off the flag */ 16145 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 16146 rack->r_wanted_output = 0; 16147 rack->r_timer_override = 0; 16148 if ((tp->t_state != rack->r_state) && 16149 TCPS_HAVEESTABLISHED(tp->t_state)) { 16150 rack_set_state(tp, rack); 16151 } 16152 if ((rack->r_fast_output) && 16153 (tp->rcv_numsacks == 0)) { 16154 int ret; 16155 16156 error = 0; 16157 ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error); 16158 if (ret >= 0) 16159 return(ret); 16160 else if (error) { 16161 inp = rack->rc_inp; 16162 so = inp->inp_socket; 16163 sb = &so->so_snd; 16164 goto nomore; 16165 } 16166 } 16167 inp = rack->rc_inp; 16168 /* 16169 * For TFO connections in SYN_SENT or SYN_RECEIVED, 16170 * only allow the initial SYN or SYN|ACK and those sent 16171 * by the retransmit timer. 16172 */ 16173 if (IS_FASTOPEN(tp->t_flags) && 16174 ((tp->t_state == TCPS_SYN_RECEIVED) || 16175 (tp->t_state == TCPS_SYN_SENT)) && 16176 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ 16177 (tp->t_rxtshift == 0)) { /* not a retransmit */ 16178 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 16179 so = inp->inp_socket; 16180 sb = &so->so_snd; 16181 goto just_return_nolock; 16182 } 16183 /* 16184 * Determine length of data that should be transmitted, and flags 16185 * that will be used. If there is some data or critical controls 16186 * (SYN, RST) to send, then transmit; otherwise, investigate 16187 * further. 
16188 */ 16189 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); 16190 if (tp->t_idle_reduce) { 16191 if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) 16192 rack_cc_after_idle(rack, tp); 16193 } 16194 tp->t_flags &= ~TF_LASTIDLE; 16195 if (idle) { 16196 if (tp->t_flags & TF_MORETOCOME) { 16197 tp->t_flags |= TF_LASTIDLE; 16198 idle = 0; 16199 } 16200 } 16201 if ((tp->snd_una == tp->snd_max) && 16202 rack->r_ctl.rc_went_idle_time && 16203 TSTMP_GT(cts, rack->r_ctl.rc_went_idle_time)) { 16204 idle = cts - rack->r_ctl.rc_went_idle_time; 16205 if (idle > rack_min_probertt_hold) { 16206 /* Count as a probe rtt */ 16207 if (rack->in_probe_rtt == 0) { 16208 rack->r_ctl.rc_lower_rtt_us_cts = cts; 16209 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; 16210 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; 16211 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; 16212 } else { 16213 rack_exit_probertt(rack, cts); 16214 } 16215 } 16216 idle = 0; 16217 } 16218 if (rack_use_fsb && (rack->r_fsb_inited == 0)) 16219 rack_init_fsb_block(tp, rack); 16220 again: 16221 /* 16222 * If we've recently taken a timeout, snd_max will be greater than 16223 * snd_nxt. There may be SACK information that allows us to avoid 16224 * resending already delivered data. Adjust snd_nxt accordingly. 16225 */ 16226 sendalot = 0; 16227 cts = tcp_get_usecs(&tv); 16228 ms_cts = tcp_tv_to_mssectick(&tv); 16229 tso = 0; 16230 mtu = 0; 16231 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 16232 minseg = segsiz; 16233 if (rack->r_ctl.rc_pace_max_segs == 0) 16234 pace_max_seg = rack->rc_user_set_max_segs * segsiz; 16235 else 16236 pace_max_seg = rack->r_ctl.rc_pace_max_segs; 16237 sb_offset = tp->snd_max - tp->snd_una; 16238 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 16239 flags = tcp_outflags[tp->t_state]; 16240 while (rack->rc_free_cnt < rack_free_cache) { 16241 rsm = rack_alloc(rack); 16242 if (rsm == NULL) { 16243 if (inp->inp_hpts_calls) 16244 /* Retry in a ms */ 16245 slot = (1 * HPTS_USEC_IN_MSEC); 16246 so = inp->inp_socket; 16247 sb = &so->so_snd; 16248 goto just_return_nolock; 16249 } 16250 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext); 16251 rack->rc_free_cnt++; 16252 rsm = NULL; 16253 } 16254 if (inp->inp_hpts_calls) 16255 inp->inp_hpts_calls = 0; 16256 sack_rxmit = 0; 16257 len = 0; 16258 rsm = NULL; 16259 if (flags & TH_RST) { 16260 SOCKBUF_LOCK(&inp->inp_socket->so_snd); 16261 so = inp->inp_socket; 16262 sb = &so->so_snd; 16263 goto send; 16264 } 16265 if (rack->r_ctl.rc_resend) { 16266 /* Retransmit timer */ 16267 rsm = rack->r_ctl.rc_resend; 16268 rack->r_ctl.rc_resend = NULL; 16269 rsm->r_flags &= ~RACK_TLP; 16270 len = rsm->r_end - rsm->r_start; 16271 sack_rxmit = 1; 16272 sendalot = 0; 16273 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 16274 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 16275 __func__, __LINE__, 16276 rsm->r_start, tp->snd_una, tp, rack, rsm)); 16277 sb_offset = rsm->r_start - tp->snd_una; 16278 if (len >= segsiz) 16279 len = segsiz; 16280 } else if ((rsm = tcp_rack_output(tp, rack, cts)) != NULL) { 16281 /* We have a retransmit that takes precedence */ 16282 rsm->r_flags &= ~RACK_TLP; 16283 if ((!IN_FASTRECOVERY(tp->t_flags)) && 16284 ((tp->t_flags & TF_WASFRECOVERY) == 0)) { 16285 /* Enter recovery if not induced by a time-out */ 16286 rack->r_ctl.rc_rsm_start = rsm->r_start; 16287 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 16288 rack->r_ctl.rc_ssthresh_at = 
tp->snd_ssthresh; 16289 rack_cong_signal(tp, CC_NDUPACK, tp->snd_una); 16290 } 16291 #ifdef INVARIANTS 16292 if (SEQ_LT(rsm->r_start, tp->snd_una)) { 16293 panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n", 16294 tp, rack, rsm, rsm->r_start, tp->snd_una); 16295 } 16296 #endif 16297 len = rsm->r_end - rsm->r_start; 16298 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 16299 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 16300 __func__, __LINE__, 16301 rsm->r_start, tp->snd_una, tp, rack, rsm)); 16302 sb_offset = rsm->r_start - tp->snd_una; 16303 sendalot = 0; 16304 if (len >= segsiz) 16305 len = segsiz; 16306 if (len > 0) { 16307 sack_rxmit = 1; 16308 KMOD_TCPSTAT_INC(tcps_sack_rexmits); 16309 KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes, 16310 min(len, segsiz)); 16311 counter_u64_add(rack_rtm_prr_retran, 1); 16312 } 16313 } else if (rack->r_ctl.rc_tlpsend) { 16314 /* Tail loss probe */ 16315 long cwin; 16316 long tlen; 16317 16318 doing_tlp = 1; 16319 /* 16320 * Check if we can do a TLP with a RACK'd packet 16321 * this can happen if we are not doing the rack 16322 * cheat and we skipped to a TLP and it 16323 * went off. 16324 */ 16325 rsm = rack->r_ctl.rc_tlpsend; 16326 rsm->r_flags |= RACK_TLP; 16327 16328 rack->r_ctl.rc_tlpsend = NULL; 16329 sack_rxmit = 1; 16330 tlen = rsm->r_end - rsm->r_start; 16331 if (tlen > segsiz) 16332 tlen = segsiz; 16333 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 16334 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 16335 __func__, __LINE__, 16336 rsm->r_start, tp->snd_una, tp, rack, rsm)); 16337 sb_offset = rsm->r_start - tp->snd_una; 16338 cwin = min(tp->snd_wnd, tlen); 16339 len = cwin; 16340 } 16341 if (rack->r_must_retran && 16342 (rsm == NULL)) { 16343 /* 16344 * Non-Sack and we had a RTO or MTU change, we 16345 * need to retransmit until we reach 16346 * the former snd_max (rack->r_ctl.rc_snd_max_at_rto). 16347 */ 16348 if (SEQ_GT(tp->snd_max, tp->snd_una)) { 16349 int sendwin, flight; 16350 16351 sendwin = min(tp->snd_wnd, tp->snd_cwnd); 16352 flight = ctf_flight_size(tp, rack->r_ctl.rc_out_at_rto); 16353 if (flight >= sendwin) { 16354 so = inp->inp_socket; 16355 sb = &so->so_snd; 16356 goto just_return_nolock; 16357 } 16358 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 16359 KASSERT(rsm != NULL, ("rsm is NULL rack:%p r_must_retran set", rack)); 16360 if (rsm == NULL) { 16361 /* TSNH */ 16362 rack->r_must_retran = 0; 16363 rack->r_ctl.rc_out_at_rto = 0; 16364 rack->r_must_retran = 0; 16365 so = inp->inp_socket; 16366 sb = &so->so_snd; 16367 goto just_return_nolock; 16368 } 16369 sack_rxmit = 1; 16370 len = rsm->r_end - rsm->r_start; 16371 sendalot = 0; 16372 sb_offset = rsm->r_start - tp->snd_una; 16373 if (len >= segsiz) 16374 len = segsiz; 16375 } else { 16376 /* We must be done if there is nothing outstanding */ 16377 rack->r_must_retran = 0; 16378 rack->r_ctl.rc_out_at_rto = 0; 16379 } 16380 } 16381 /* 16382 * Enforce a connection sendmap count limit if set 16383 * as long as we are not retransmiting. 
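* For example, if V_tcp_map_entries_limit is capped at a few thousand entries and this connection's scoreboard has already fragmented into that many sendmap entries, we refuse to add new data here (retransmits, rsm != NULL, are still allowed) and bump the limited counters so the condition stays visible.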
16384 */ 16385 if ((rsm == NULL) && 16386 (rack->do_detection == 0) && 16387 (V_tcp_map_entries_limit > 0) && 16388 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 16389 counter_u64_add(rack_to_alloc_limited, 1); 16390 if (!rack->alloc_limit_reported) { 16391 rack->alloc_limit_reported = 1; 16392 counter_u64_add(rack_alloc_limited_conns, 1); 16393 } 16394 so = inp->inp_socket; 16395 sb = &so->so_snd; 16396 goto just_return_nolock; 16397 } 16398 if (rsm && (rsm->r_flags & RACK_HAS_FIN)) { 16399 /* we are retransmitting the fin */ 16400 len--; 16401 if (len) { 16402 /* 16403 * When retransmitting data do *not* include the 16404 * FIN. This could happen from a TLP probe. 16405 */ 16406 flags &= ~TH_FIN; 16407 } 16408 } 16409 #ifdef INVARIANTS 16410 /* For debugging */ 16411 rack->r_ctl.rc_rsm_at_retran = rsm; 16412 #endif 16413 if (rsm && rack->r_fsb_inited && rack_use_rsm_rfo && 16414 ((rsm->r_flags & RACK_HAS_FIN) == 0)) { 16415 int ret; 16416 16417 ret = rack_fast_rsm_output(tp, rack, rsm, ts_val, cts, ms_cts, &tv, len); 16418 if (ret == 0) 16419 return (0); 16420 } 16421 so = inp->inp_socket; 16422 sb = &so->so_snd; 16423 if (do_a_prefetch == 0) { 16424 kern_prefetch(sb, &do_a_prefetch); 16425 do_a_prefetch = 1; 16426 } 16427 #ifdef NETFLIX_SHARED_CWND 16428 if ((tp->t_flags2 & TF2_TCP_SCWND_ALLOWED) && 16429 rack->rack_enable_scwnd) { 16430 /* We are doing cwnd sharing */ 16431 if (rack->gp_ready && 16432 (rack->rack_attempted_scwnd == 0) && 16433 (rack->r_ctl.rc_scw == NULL) && 16434 tp->t_lib) { 16435 /* The pcbid is in, lets make an attempt */ 16436 counter_u64_add(rack_try_scwnd, 1); 16437 rack->rack_attempted_scwnd = 1; 16438 rack->r_ctl.rc_scw = tcp_shared_cwnd_alloc(tp, 16439 &rack->r_ctl.rc_scw_index, 16440 segsiz); 16441 } 16442 if (rack->r_ctl.rc_scw && 16443 (rack->rack_scwnd_is_idle == 1) && 16444 sbavail(&so->so_snd)) { 16445 /* we are no longer out of data */ 16446 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 16447 rack->rack_scwnd_is_idle = 0; 16448 } 16449 if (rack->r_ctl.rc_scw) { 16450 /* First lets update and get the cwnd */ 16451 rack->r_ctl.cwnd_to_use = cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw, 16452 rack->r_ctl.rc_scw_index, 16453 tp->snd_cwnd, tp->snd_wnd, segsiz); 16454 } 16455 } 16456 #endif 16457 /* 16458 * Get standard flags, and add SYN or FIN if requested by 'hidden' 16459 * state flags. 16460 */ 16461 if (tp->t_flags & TF_NEEDFIN) 16462 flags |= TH_FIN; 16463 if (tp->t_flags & TF_NEEDSYN) 16464 flags |= TH_SYN; 16465 if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { 16466 void *end_rsm; 16467 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 16468 if (end_rsm) 16469 kern_prefetch(end_rsm, &prefetch_rsm); 16470 prefetch_rsm = 1; 16471 } 16472 SOCKBUF_LOCK(sb); 16473 /* 16474 * If snd_nxt == snd_max and we have transmitted a FIN, the 16475 * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a 16476 * negative length. This can also occur when TCP opens up its 16477 * congestion window while receiving additional duplicate acks after 16478 * fast-retransmit because TCP will reset snd_nxt to snd_max after 16479 * the fast-retransmit. 16480 * 16481 * In the normal retransmit-FIN-only case, however, snd_nxt will be 16482 * set to snd_una, the sb_offset will be 0, and the length may wind 16483 * up 0. 16484 * 16485 * If sack_rxmit is true we are retransmitting from the scoreboard 16486 * in which case len is already set. 
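* (When a TLP wants to force out new data it sets rc_tlp_new_data; the first branch below clamps that request to what the socket buffer and the peer's window allow. Illustrative numbers: with 3000 bytes available past sb_offset but only 1200 bytes of window beyond what is already outstanding, the TLP sends 1200.)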
16487 */ 16488 if ((sack_rxmit == 0) && 16489 (TCPS_HAVEESTABLISHED(tp->t_state) || IS_FASTOPEN(tp->t_flags))) { 16490 uint32_t avail; 16491 16492 avail = sbavail(sb); 16493 if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail) 16494 sb_offset = tp->snd_nxt - tp->snd_una; 16495 else 16496 sb_offset = 0; 16497 if ((IN_FASTRECOVERY(tp->t_flags) == 0) || rack->rack_no_prr) { 16498 if (rack->r_ctl.rc_tlp_new_data) { 16499 /* TLP is forcing out new data */ 16500 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) { 16501 rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset); 16502 } 16503 if ((rack->r_ctl.rc_tlp_new_data + sb_offset) > tp->snd_wnd) { 16504 if (tp->snd_wnd > sb_offset) 16505 len = tp->snd_wnd - sb_offset; 16506 else 16507 len = 0; 16508 } else { 16509 len = rack->r_ctl.rc_tlp_new_data; 16510 } 16511 rack->r_ctl.rc_tlp_new_data = 0; 16512 doing_tlp = 1; 16513 } else { 16514 len = rack_what_can_we_send(tp, rack, cwnd_to_use, avail, sb_offset); 16515 } 16516 if ((rack->r_ctl.crte == NULL) && IN_FASTRECOVERY(tp->t_flags) && (len > segsiz)) { 16517 /* 16518 * For prr=off, we need to send only 1 MSS 16519 * at a time. We do this because another sack could 16520 * be arriving that causes us to send retransmits and 16521 * we don't want to be on a long pace due to a larger send 16522 * that keeps us from sending out the retransmit. 16523 */ 16524 len = segsiz; 16525 } 16526 } else { 16527 uint32_t outstanding; 16528 /* 16529 * We are inside of a Fast recovery episode, this 16530 * is caused by a SACK or 3 dup acks. At this point 16531 * we have sent all the retransmissions and we rely 16532 * on PRR to dictate what we will send in the form of 16533 * new data. 16534 */ 16535 16536 outstanding = tp->snd_max - tp->snd_una; 16537 if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) { 16538 if (tp->snd_wnd > outstanding) { 16539 len = tp->snd_wnd - outstanding; 16540 /* Check to see if we have the data */ 16541 if ((sb_offset + len) > avail) { 16542 /* It does not all fit */ 16543 if (avail > sb_offset) 16544 len = avail - sb_offset; 16545 else 16546 len = 0; 16547 } 16548 } else { 16549 len = 0; 16550 } 16551 } else if (avail > sb_offset) { 16552 len = avail - sb_offset; 16553 } else { 16554 len = 0; 16555 } 16556 if (len > 0) { 16557 if (len > rack->r_ctl.rc_prr_sndcnt) { 16558 len = rack->r_ctl.rc_prr_sndcnt; 16559 } 16560 if (len > 0) { 16561 sub_from_prr = 1; 16562 counter_u64_add(rack_rtm_prr_newdata, 1); 16563 } 16564 } 16565 if (len > segsiz) { 16566 /* 16567 * We should never send more than a MSS when 16568 * retransmitting or sending new data in prr 16569 * mode unless the override flag is on. Most 16570 * likely the PRR algorithm is not going to 16571 * let us send a lot as well :-) 16572 */ 16573 if (rack->r_ctl.rc_prr_sendalot == 0) { 16574 len = segsiz; 16575 } 16576 } else if (len < segsiz) { 16577 /* 16578 * Do we send any? The idea here is if the 16579 * send empty's the socket buffer we want to 16580 * do it. However if not then lets just wait 16581 * for our prr_sndcnt to get bigger. 16582 */ 16583 long leftinsb; 16584 16585 leftinsb = sbavail(sb) - sb_offset; 16586 if (leftinsb > len) { 16587 /* This send does not empty the sb */ 16588 len = 0; 16589 } 16590 } 16591 } 16592 } else if (!TCPS_HAVEESTABLISHED(tp->t_state)) { 16593 /* 16594 * If you have not established 16595 * and are not doing FAST OPEN 16596 * no data please. 
16597 */ 16598 if ((sack_rxmit == 0) && 16599 (!IS_FASTOPEN(tp->t_flags))){ 16600 len = 0; 16601 sb_offset = 0; 16602 } 16603 } 16604 if (prefetch_so_done == 0) { 16605 kern_prefetch(so, &prefetch_so_done); 16606 prefetch_so_done = 1; 16607 } 16608 /* 16609 * Lop off SYN bit if it has already been sent. However, if this is 16610 * SYN-SENT state and if segment contains data and if we don't know 16611 * that foreign host supports TAO, suppress sending segment. 16612 */ 16613 if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) && 16614 ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) { 16615 /* 16616 * When sending additional segments following a TFO SYN|ACK, 16617 * do not include the SYN bit. 16618 */ 16619 if (IS_FASTOPEN(tp->t_flags) && 16620 (tp->t_state == TCPS_SYN_RECEIVED)) 16621 flags &= ~TH_SYN; 16622 } 16623 /* 16624 * Be careful not to send data and/or FIN on SYN segments. This 16625 * measure is needed to prevent interoperability problems with not 16626 * fully conformant TCP implementations. 16627 */ 16628 if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { 16629 len = 0; 16630 flags &= ~TH_FIN; 16631 } 16632 /* 16633 * On TFO sockets, ensure no data is sent in the following cases: 16634 * 16635 * - When retransmitting SYN|ACK on a passively-created socket 16636 * 16637 * - When retransmitting SYN on an actively created socket 16638 * 16639 * - When sending a zero-length cookie (cookie request) on an 16640 * actively created socket 16641 * 16642 * - When the socket is in the CLOSED state (RST is being sent) 16643 */ 16644 if (IS_FASTOPEN(tp->t_flags) && 16645 (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || 16646 ((tp->t_state == TCPS_SYN_SENT) && 16647 (tp->t_tfo_client_cookie_len == 0)) || 16648 (flags & TH_RST))) { 16649 sack_rxmit = 0; 16650 len = 0; 16651 } 16652 /* Without fast-open there should never be data sent on a SYN */ 16653 if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) { 16654 tp->snd_nxt = tp->iss; 16655 len = 0; 16656 } 16657 if ((len > segsiz) && (tcp_dsack_block_exists(tp))) { 16658 /* We only send 1 MSS if we have a DSACK block */ 16659 add_flag |= RACK_SENT_W_DSACK; 16660 len = segsiz; 16661 } 16662 orig_len = len; 16663 if (len <= 0) { 16664 /* 16665 * If FIN has been sent but not acked, but we haven't been 16666 * called to retransmit, len will be < 0. Otherwise, window 16667 * shrank after we sent into it. If window shrank to 0, 16668 * cancel pending retransmit, pull snd_nxt back to (closed) 16669 * window, and set the persist timer if it isn't already 16670 * going. If the window didn't close completely, just wait 16671 * for an ACK. 16672 * 16673 * We also do a general check here to ensure that we will 16674 * set the persist timer when we have data to send, but a 16675 * 0-byte window. This makes sure the persist timer is set 16676 * even if the packet hits one of the "goto send" lines 16677 * below. 16678 */ 16679 len = 0; 16680 if ((tp->snd_wnd == 0) && 16681 (TCPS_HAVEESTABLISHED(tp->t_state)) && 16682 (tp->snd_una == tp->snd_max) && 16683 (sb_offset < (int)sbavail(sb))) { 16684 rack_enter_persist(tp, rack, cts); 16685 } 16686 } else if ((rsm == NULL) && 16687 (doing_tlp == 0) && 16688 (len < pace_max_seg)) { 16689 /* 16690 * We are not sending a maximum sized segment for 16691 * some reason. Should we not send anything (think 16692 * sws or persists)? 
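* Illustrative example (made-up values): with segsiz = minseg = 1448 and rc_high_rwnd = 64000, a peer window of only 600 bytes on an established connection, where this send would not drain the socket buffer, means we send nothing, and if nothing is outstanding we drop into persists rather than dribble out sub-MSS segments.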
16693 */ 16694 if ((tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg)) && 16695 (TCPS_HAVEESTABLISHED(tp->t_state)) && 16696 (len < minseg) && 16697 (len < (int)(sbavail(sb) - sb_offset))) { 16698 /* 16699 * Here the rwnd is less than 16700 * the minimum pacing size, this is not a retransmit, 16701 * we are established and 16702 * the send is not the last in the socket buffer 16703 * we send nothing, and we may enter persists 16704 * if nothing is outstanding. 16705 */ 16706 len = 0; 16707 if (tp->snd_max == tp->snd_una) { 16708 /* 16709 * Nothing out we can 16710 * go into persists. 16711 */ 16712 rack_enter_persist(tp, rack, cts); 16713 } 16714 } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) && 16715 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && 16716 (len < (int)(sbavail(sb) - sb_offset)) && 16717 (len < minseg)) { 16718 /* 16719 * Here we are not retransmitting, and 16720 * the cwnd is not so small that we could 16721 * not send at least a min size (rxt timer 16722 * not having gone off), We have 2 segments or 16723 * more already in flight, its not the tail end 16724 * of the socket buffer and the cwnd is blocking 16725 * us from sending out a minimum pacing segment size. 16726 * Lets not send anything. 16727 */ 16728 len = 0; 16729 } else if (((tp->snd_wnd - ctf_outstanding(tp)) < 16730 min((rack->r_ctl.rc_high_rwnd/2), minseg)) && 16731 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && 16732 (len < (int)(sbavail(sb) - sb_offset)) && 16733 (TCPS_HAVEESTABLISHED(tp->t_state))) { 16734 /* 16735 * Here we have a send window but we have 16736 * filled it up and we can't send another pacing segment. 16737 * We also have in flight more than 2 segments 16738 * and we are not completing the sb i.e. we allow 16739 * the last bytes of the sb to go out even if 16740 * its not a full pacing segment. 16741 */ 16742 len = 0; 16743 } else if ((rack->r_ctl.crte != NULL) && 16744 (tp->snd_wnd >= (pace_max_seg * max(1, rack_hw_rwnd_factor))) && 16745 (cwnd_to_use >= (pace_max_seg + (4 * segsiz))) && 16746 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) >= (2 * segsiz)) && 16747 (len < (int)(sbavail(sb) - sb_offset))) { 16748 /* 16749 * Here we are doing hardware pacing, this is not a TLP, 16750 * we are not sending a pace max segment size, there is rwnd 16751 * room to send at least N pace_max_seg, the cwnd is greater 16752 * than or equal to a full pacing segments plus 4 mss and we have 2 or 16753 * more segments in flight and its not the tail of the socket buffer. 16754 * 16755 * We don't want to send instead we need to get more ack's in to 16756 * allow us to send a full pacing segment. Normally, if we are pacing 16757 * about the right speed, we should have finished our pacing 16758 * send as most of the acks have come back if we are at the 16759 * right rate. This is a bit fuzzy since return path delay 16760 * can delay the acks, which is why we want to make sure we 16761 * have cwnd space to have a bit more than a max pace segments in flight. 16762 * 16763 * If we have not gotten our acks back we are pacing at too high a 16764 * rate delaying will not hurt and will bring our GP estimate down by 16765 * injecting the delay. If we don't do this we will send 16766 * 2 MSS out in response to the acks being clocked in which 16767 * defeats the point of hw-pacing (i.e. to help us get 16768 * larger TSO's out). 16769 */ 16770 len = 0; 16771 16772 } 16773 16774 } 16775 /* len will be >= 0 after this point. 
*/ 16776 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 16777 rack_sndbuf_autoscale(rack); 16778 /* 16779 * Decide if we can use TCP Segmentation Offloading (if supported by 16780 * hardware). 16781 * 16782 * TSO may only be used if we are in a pure bulk sending state. The 16783 * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP 16784 * options prevent using TSO. With TSO the TCP header is the same 16785 * (except for the sequence number) for all generated packets. This 16786 * makes it impossible to transmit any options which vary per 16787 * generated segment or packet. 16788 * 16789 * IPv4 handling has a clear separation of ip options and ip header 16790 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does 16791 * the right thing below to provide length of just ip options and thus 16792 * checking for ipoptlen is enough to decide if ip options are present. 16793 */ 16794 ipoptlen = 0; 16795 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 16796 /* 16797 * Pre-calculate here as we save another lookup into the darknesses 16798 * of IPsec that way and can actually decide if TSO is ok. 16799 */ 16800 #ifdef INET6 16801 if (isipv6 && IPSEC_ENABLED(ipv6)) 16802 ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb); 16803 #ifdef INET 16804 else 16805 #endif 16806 #endif /* INET6 */ 16807 #ifdef INET 16808 if (IPSEC_ENABLED(ipv4)) 16809 ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb); 16810 #endif /* INET */ 16811 #endif 16812 16813 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 16814 ipoptlen += ipsec_optlen; 16815 #endif 16816 if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > segsiz && 16817 (tp->t_port == 0) && 16818 ((tp->t_flags & TF_SIGNATURE) == 0) && 16819 tp->rcv_numsacks == 0 && sack_rxmit == 0 && 16820 ipoptlen == 0) 16821 tso = 1; 16822 { 16823 uint32_t outstanding; 16824 16825 outstanding = tp->snd_max - tp->snd_una; 16826 if (tp->t_flags & TF_SENTFIN) { 16827 /* 16828 * If we sent a fin, snd_max is 1 higher than 16829 * snd_una 16830 */ 16831 outstanding--; 16832 } 16833 if (sack_rxmit) { 16834 if ((rsm->r_flags & RACK_HAS_FIN) == 0) 16835 flags &= ~TH_FIN; 16836 } else { 16837 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + 16838 sbused(sb))) 16839 flags &= ~TH_FIN; 16840 } 16841 } 16842 recwin = lmin(lmax(sbspace(&so->so_rcv), 0), 16843 (long)TCP_MAXWIN << tp->rcv_scale); 16844 16845 /* 16846 * Sender silly window avoidance. We transmit under the following 16847 * conditions when len is non-zero: 16848 * 16849 * - We have a full segment (or more with TSO) - This is the last 16850 * buffer in a write()/send() and we are either idle or running 16851 * NODELAY - we've timed out (e.g. persist timer) - we have more 16852 * then 1/2 the maximum send window's worth of data (receiver may be 16853 * limited the window size) - we need to retransmit 16854 */ 16855 if (len) { 16856 if (len >= segsiz) { 16857 goto send; 16858 } 16859 /* 16860 * NOTE! on localhost connections an 'ack' from the remote 16861 * end may occur synchronously with the output and cause us 16862 * to flush a buffer queued with moretocome. 
XXX 16863 * 16864 */ 16865 if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ 16866 (idle || (tp->t_flags & TF_NODELAY)) && 16867 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && 16868 (tp->t_flags & TF_NOPUSH) == 0) { 16869 pass = 2; 16870 goto send; 16871 } 16872 if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ 16873 pass = 22; 16874 goto send; 16875 } 16876 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { 16877 pass = 4; 16878 goto send; 16879 } 16880 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */ 16881 pass = 5; 16882 goto send; 16883 } 16884 if (sack_rxmit) { 16885 pass = 6; 16886 goto send; 16887 } 16888 if (((tp->snd_wnd - ctf_outstanding(tp)) < segsiz) && 16889 (ctf_outstanding(tp) < (segsiz * 2))) { 16890 /* 16891 * We have less than two MSS outstanding (delayed ack) 16892 * and our rwnd will not let us send a full sized 16893 * MSS. Lets go ahead and let this small segment 16894 * out because we want to try to have at least two 16895 * packets inflight to not be caught by delayed ack. 16896 */ 16897 pass = 12; 16898 goto send; 16899 } 16900 } 16901 /* 16902 * Sending of standalone window updates. 16903 * 16904 * Window updates are important when we close our window due to a 16905 * full socket buffer and are opening it again after the application 16906 * reads data from it. Once the window has opened again and the 16907 * remote end starts to send again the ACK clock takes over and 16908 * provides the most current window information. 16909 * 16910 * We must avoid the silly window syndrome whereas every read from 16911 * the receive buffer, no matter how small, causes a window update 16912 * to be sent. We also should avoid sending a flurry of window 16913 * updates when the socket buffer had queued a lot of data and the 16914 * application is doing small reads. 16915 * 16916 * Prevent a flurry of pointless window updates by only sending an 16917 * update when we can increase the advertized window by more than 16918 * 1/4th of the socket buffer capacity. When the buffer is getting 16919 * full or is very small be more aggressive and send an update 16920 * whenever we can increase by two mss sized segments. In all other 16921 * situations the ACK's to new incoming data will carry further 16922 * window increases. 16923 * 16924 * Don't send an independent window update if a delayed ACK is 16925 * pending (it will get piggy-backed on it) or the remote side 16926 * already has done a half-close and won't send more data. Skip 16927 * this if the connection is in T/TCP half-open state. 16928 */ 16929 if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && 16930 !(tp->t_flags & TF_DELACK) && 16931 !TCPS_HAVERCVDFIN(tp->t_state)) { 16932 /* 16933 * "adv" is the amount we could increase the window, taking 16934 * into account that we are limited by TCP_MAXWIN << 16935 * tp->rcv_scale. 16936 */ 16937 int32_t adv; 16938 int oldwin; 16939 16940 adv = recwin; 16941 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { 16942 oldwin = (tp->rcv_adv - tp->rcv_nxt); 16943 if (adv > oldwin) 16944 adv -= oldwin; 16945 else { 16946 /* We can't increase the window */ 16947 adv = 0; 16948 } 16949 } else 16950 oldwin = 0; 16951 16952 /* 16953 * If the new window size ends up being the same as or less 16954 * than the old size when it is scaled, then don't force 16955 * a window update. 
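 * (Illustrative example: with rcv_scale == 7 the peer sees the window
 * in 128-byte units, so growing an advertised 1024 bytes by another
 * 100 bytes still scales to 8 and forces no update.)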
16956 */ 16957 if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale) 16958 goto dontupdate; 16959 16960 if (adv >= (int32_t)(2 * segsiz) && 16961 (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || 16962 recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) || 16963 so->so_rcv.sb_hiwat <= 8 * segsiz)) { 16964 pass = 7; 16965 goto send; 16966 } 16967 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) { 16968 pass = 23; 16969 goto send; 16970 } 16971 } 16972 dontupdate: 16973 16974 /* 16975 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW 16976 * is also a catch-all for the retransmit timer timeout case. 16977 */ 16978 if (tp->t_flags & TF_ACKNOW) { 16979 pass = 8; 16980 goto send; 16981 } 16982 if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) { 16983 pass = 9; 16984 goto send; 16985 } 16986 /* 16987 * If our state indicates that FIN should be sent and we have not 16988 * yet done so, then we need to send. 16989 */ 16990 if ((flags & TH_FIN) && 16991 (tp->snd_nxt == tp->snd_una)) { 16992 pass = 11; 16993 goto send; 16994 } 16995 /* 16996 * No reason to send a segment, just return. 16997 */ 16998 just_return: 16999 SOCKBUF_UNLOCK(sb); 17000 just_return_nolock: 17001 { 17002 int app_limited = CTF_JR_SENT_DATA; 17003 17004 if (tot_len_this_send > 0) { 17005 /* Make sure snd_nxt is up to max */ 17006 rack->r_ctl.fsb.recwin = recwin; 17007 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz); 17008 if ((error == 0) && 17009 rack_use_rfo && 17010 ((flags & (TH_SYN|TH_FIN)) == 0) && 17011 (ipoptlen == 0) && 17012 (tp->snd_nxt == tp->snd_max) && 17013 (tp->rcv_numsacks == 0) && 17014 rack->r_fsb_inited && 17015 TCPS_HAVEESTABLISHED(tp->t_state) && 17016 (rack->r_must_retran == 0) && 17017 ((tp->t_flags & TF_NEEDFIN) == 0) && 17018 (len > 0) && (orig_len > 0) && 17019 (orig_len > len) && 17020 ((orig_len - len) >= segsiz) && 17021 ((optlen == 0) || 17022 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { 17023 /* We can send at least one more MSS using our fsb */ 17024 17025 rack->r_fast_output = 1; 17026 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); 17027 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; 17028 rack->r_ctl.fsb.tcp_flags = flags; 17029 rack->r_ctl.fsb.left_to_send = orig_len - len; 17030 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))), 17031 ("rack:%p left_to_send:%u sbavail:%u out:%u", 17032 rack, rack->r_ctl.fsb.left_to_send, sbavail(sb), 17033 (tp->snd_max - tp->snd_una))); 17034 if (rack->r_ctl.fsb.left_to_send < segsiz) 17035 rack->r_fast_output = 0; 17036 else { 17037 if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una))) 17038 rack->r_ctl.fsb.rfo_apply_push = 1; 17039 else 17040 rack->r_ctl.fsb.rfo_apply_push = 0; 17041 } 17042 } else 17043 rack->r_fast_output = 0; 17044 17045 17046 rack_log_fsb(rack, tp, so, flags, 17047 ipoptlen, orig_len, len, 0, 17048 1, optlen, __LINE__, 1); 17049 if (SEQ_GT(tp->snd_max, tp->snd_nxt)) 17050 tp->snd_nxt = tp->snd_max; 17051 } else { 17052 int end_window = 0; 17053 uint32_t seq = tp->gput_ack; 17054 17055 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 17056 if (rsm) { 17057 /* 17058 * Mark the last sent that we just-returned (hinting 17059 * that delayed ack may play a role in any rtt measurement). 
17060 */ 17061 rsm->r_just_ret = 1; 17062 } 17063 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); 17064 rack->r_ctl.rc_agg_delayed = 0; 17065 rack->r_early = 0; 17066 rack->r_late = 0; 17067 rack->r_ctl.rc_agg_early = 0; 17068 if ((ctf_outstanding(tp) + 17069 min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)), 17070 minseg)) >= tp->snd_wnd) { 17071 /* We are limited by the rwnd */ 17072 app_limited = CTF_JR_RWND_LIMITED; 17073 if (IN_FASTRECOVERY(tp->t_flags)) 17074 rack->r_ctl.rc_prr_sndcnt = 0; 17075 } else if (ctf_outstanding(tp) >= sbavail(sb)) { 17076 /* We are limited by whats available -- app limited */ 17077 app_limited = CTF_JR_APP_LIMITED; 17078 if (IN_FASTRECOVERY(tp->t_flags)) 17079 rack->r_ctl.rc_prr_sndcnt = 0; 17080 } else if ((idle == 0) && 17081 ((tp->t_flags & TF_NODELAY) == 0) && 17082 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && 17083 (len < segsiz)) { 17084 /* 17085 * No delay is not on and the 17086 * user is sending less than 1MSS. This 17087 * brings out SWS avoidance so we 17088 * don't send. Another app-limited case. 17089 */ 17090 app_limited = CTF_JR_APP_LIMITED; 17091 } else if (tp->t_flags & TF_NOPUSH) { 17092 /* 17093 * The user has requested no push of 17094 * the last segment and we are 17095 * at the last segment. Another app 17096 * limited case. 17097 */ 17098 app_limited = CTF_JR_APP_LIMITED; 17099 } else if ((ctf_outstanding(tp) + minseg) > cwnd_to_use) { 17100 /* Its the cwnd */ 17101 app_limited = CTF_JR_CWND_LIMITED; 17102 } else if (IN_FASTRECOVERY(tp->t_flags) && 17103 (rack->rack_no_prr == 0) && 17104 (rack->r_ctl.rc_prr_sndcnt < segsiz)) { 17105 app_limited = CTF_JR_PRR; 17106 } else { 17107 /* Now why here are we not sending? */ 17108 #ifdef NOW 17109 #ifdef INVARIANTS 17110 panic("rack:%p hit JR_ASSESSING case cwnd_to_use:%u?", rack, cwnd_to_use); 17111 #endif 17112 #endif 17113 app_limited = CTF_JR_ASSESSING; 17114 } 17115 /* 17116 * App limited in some fashion, for our pacing GP 17117 * measurements we don't want any gap (even cwnd). 17118 * Close down the measurement window. 17119 */ 17120 if (rack_cwnd_block_ends_measure && 17121 ((app_limited == CTF_JR_CWND_LIMITED) || 17122 (app_limited == CTF_JR_PRR))) { 17123 /* 17124 * The reason we are not sending is 17125 * the cwnd (or prr). We have been configured 17126 * to end the measurement window in 17127 * this case. 17128 */ 17129 end_window = 1; 17130 } else if (rack_rwnd_block_ends_measure && 17131 (app_limited == CTF_JR_RWND_LIMITED)) { 17132 /* 17133 * We are rwnd limited and have been 17134 * configured to end the measurement 17135 * window in this case. 17136 */ 17137 end_window = 1; 17138 } else if (app_limited == CTF_JR_APP_LIMITED) { 17139 /* 17140 * A true application limited period, we have 17141 * ran out of data. 17142 */ 17143 end_window = 1; 17144 } else if (app_limited == CTF_JR_ASSESSING) { 17145 /* 17146 * In the assessing case we hit the end of 17147 * the if/else and had no known reason 17148 * This will panic us under invariants.. 17149 * 17150 * If we get this out in logs we need to 17151 * investagate which reason we missed. 
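 * (Whatever the trigger, setting end_window below closes out the
 * goodput measurement so that an app/cwnd/rwnd limited stretch is not
 * folded into the pacing bandwidth estimate.)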
17152 */ 17153 end_window = 1; 17154 } 17155 if (end_window) { 17156 uint8_t log = 0; 17157 17158 if ((tp->t_flags & TF_GPUTINPROG) && 17159 SEQ_GT(tp->gput_ack, tp->snd_max)) { 17160 /* Mark the last packet has app limited */ 17161 tp->gput_ack = tp->snd_max; 17162 log = 1; 17163 } 17164 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 17165 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { 17166 if (rack->r_ctl.rc_app_limited_cnt == 0) 17167 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; 17168 else { 17169 /* 17170 * Go out to the end app limited and mark 17171 * this new one as next and move the end_appl up 17172 * to this guy. 17173 */ 17174 if (rack->r_ctl.rc_end_appl) 17175 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; 17176 rack->r_ctl.rc_end_appl = rsm; 17177 } 17178 rsm->r_flags |= RACK_APP_LIMITED; 17179 rack->r_ctl.rc_app_limited_cnt++; 17180 } 17181 if (log) 17182 rack_log_pacing_delay_calc(rack, 17183 rack->r_ctl.rc_app_limited_cnt, seq, 17184 tp->gput_ack, 0, 0, 4, __LINE__, NULL); 17185 } 17186 } 17187 if (slot) { 17188 /* set the rack tcb into the slot N */ 17189 counter_u64_add(rack_paced_segments, 1); 17190 } else if (tot_len_this_send) { 17191 counter_u64_add(rack_unpaced_segments, 1); 17192 } 17193 /* Check if we need to go into persists or not */ 17194 if ((tp->snd_max == tp->snd_una) && 17195 TCPS_HAVEESTABLISHED(tp->t_state) && 17196 sbavail(sb) && 17197 (sbavail(sb) > tp->snd_wnd) && 17198 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg))) { 17199 /* Yes lets make sure to move to persist before timer-start */ 17200 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 17201 } 17202 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack); 17203 rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use); 17204 } 17205 #ifdef NETFLIX_SHARED_CWND 17206 if ((sbavail(sb) == 0) && 17207 rack->r_ctl.rc_scw) { 17208 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 17209 rack->rack_scwnd_is_idle = 1; 17210 } 17211 #endif 17212 #ifdef TCP_ACCOUNTING 17213 if (tot_len_this_send > 0) { 17214 crtsc = get_cyclecount(); 17215 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 17216 tp->tcp_cnt_counters[SND_OUT_DATA]++; 17217 } 17218 counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], 1); 17219 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 17220 tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); 17221 } 17222 counter_u64_add(tcp_proc_time[SND_OUT_DATA], (crtsc - ts_val)); 17223 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 17224 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) / segsiz); 17225 } 17226 counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((tot_len_this_send + segsiz - 1) / segsiz)); 17227 } else { 17228 crtsc = get_cyclecount(); 17229 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 17230 tp->tcp_cnt_counters[SND_LIMITED]++; 17231 } 17232 counter_u64_add(tcp_cnt_counters[SND_LIMITED], 1); 17233 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 17234 tp->tcp_proc_time[SND_LIMITED] += (crtsc - ts_val); 17235 } 17236 counter_u64_add(tcp_proc_time[SND_LIMITED], (crtsc - ts_val)); 17237 } 17238 sched_unpin(); 17239 #endif 17240 return (0); 17241 17242 send: 17243 if (rsm || sack_rxmit) 17244 counter_u64_add(rack_nfto_resend, 1); 17245 else 17246 counter_u64_add(rack_non_fto_send, 1); 17247 if ((flags & TH_FIN) && 17248 sbavail(sb)) { 17249 /* 17250 * We do not transmit a FIN 17251 * with data outstanding. We 17252 * need to make it so all data 17253 * is acked first. 
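 * (Outstanding here is judged by sbavail(sb): while anything remains in
 * the send socket buffer the FIN bit is stripped, so the FIN is carried
 * by, or follows, the final data segment instead.)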
17254 */ 17255 flags &= ~TH_FIN; 17256 } 17257 /* Enforce stack imposed max seg size if we have one */ 17258 if (rack->r_ctl.rc_pace_max_segs && 17259 (len > rack->r_ctl.rc_pace_max_segs)) { 17260 mark = 1; 17261 len = rack->r_ctl.rc_pace_max_segs; 17262 } 17263 SOCKBUF_LOCK_ASSERT(sb); 17264 if (len > 0) { 17265 if (len >= segsiz) 17266 tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; 17267 else 17268 tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; 17269 } 17270 /* 17271 * Before ESTABLISHED, force sending of initial options unless TCP 17272 * set not to do any options. NOTE: we assume that the IP/TCP header 17273 * plus TCP options always fit in a single mbuf, leaving room for a 17274 * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr) 17275 * + optlen <= MCLBYTES 17276 */ 17277 optlen = 0; 17278 #ifdef INET6 17279 if (isipv6) 17280 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 17281 else 17282 #endif 17283 hdrlen = sizeof(struct tcpiphdr); 17284 17285 /* 17286 * Compute options for segment. We only have to care about SYN and 17287 * established connection segments. Options for SYN-ACK segments 17288 * are handled in TCP syncache. 17289 */ 17290 to.to_flags = 0; 17291 if ((tp->t_flags & TF_NOOPT) == 0) { 17292 /* Maximum segment size. */ 17293 if (flags & TH_SYN) { 17294 tp->snd_nxt = tp->iss; 17295 to.to_mss = tcp_mssopt(&inp->inp_inc); 17296 if (tp->t_port) 17297 to.to_mss -= V_tcp_udp_tunneling_overhead; 17298 to.to_flags |= TOF_MSS; 17299 17300 /* 17301 * On SYN or SYN|ACK transmits on TFO connections, 17302 * only include the TFO option if it is not a 17303 * retransmit, as the presence of the TFO option may 17304 * have caused the original SYN or SYN|ACK to have 17305 * been dropped by a middlebox. 17306 */ 17307 if (IS_FASTOPEN(tp->t_flags) && 17308 (tp->t_rxtshift == 0)) { 17309 if (tp->t_state == TCPS_SYN_RECEIVED) { 17310 to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; 17311 to.to_tfo_cookie = 17312 (u_int8_t *)&tp->t_tfo_cookie.server; 17313 to.to_flags |= TOF_FASTOPEN; 17314 wanted_cookie = 1; 17315 } else if (tp->t_state == TCPS_SYN_SENT) { 17316 to.to_tfo_len = 17317 tp->t_tfo_client_cookie_len; 17318 to.to_tfo_cookie = 17319 tp->t_tfo_cookie.client; 17320 to.to_flags |= TOF_FASTOPEN; 17321 wanted_cookie = 1; 17322 /* 17323 * If we wind up having more data to 17324 * send with the SYN than can fit in 17325 * one segment, don't send any more 17326 * until the SYN|ACK comes back from 17327 * the other end. 17328 */ 17329 sendalot = 0; 17330 } 17331 } 17332 } 17333 /* Window scaling. */ 17334 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { 17335 to.to_wscale = tp->request_r_scale; 17336 to.to_flags |= TOF_SCALE; 17337 } 17338 /* Timestamps. */ 17339 if ((tp->t_flags & TF_RCVD_TSTMP) || 17340 ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { 17341 to.to_tsval = ms_cts + tp->ts_offset; 17342 to.to_tsecr = tp->ts_recent; 17343 to.to_flags |= TOF_TS; 17344 } 17345 /* Set receive buffer autosizing timestamp. */ 17346 if (tp->rfbuf_ts == 0 && 17347 (so->so_rcv.sb_flags & SB_AUTOSIZE)) 17348 tp->rfbuf_ts = tcp_ts_getticks(); 17349 /* Selective ACK's. */ 17350 if (tp->t_flags & TF_SACK_PERMIT) { 17351 if (flags & TH_SYN) 17352 to.to_flags |= TOF_SACKPERM; 17353 else if (TCPS_HAVEESTABLISHED(tp->t_state) && 17354 tp->rcv_numsacks > 0) { 17355 to.to_flags |= TOF_SACK; 17356 to.to_nsacks = tp->rcv_numsacks; 17357 to.to_sacks = (u_char *)tp->sackblks; 17358 } 17359 } 17360 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 17361 /* TCP-MD5 (RFC2385). 
*/ 17362 if (tp->t_flags & TF_SIGNATURE) 17363 to.to_flags |= TOF_SIGNATURE; 17364 #endif /* TCP_SIGNATURE */ 17365 17366 /* Processing the options. */ 17367 hdrlen += optlen = tcp_addoptions(&to, opt); 17368 /* 17369 * If we wanted a TFO option to be added, but it was unable 17370 * to fit, ensure no data is sent. 17371 */ 17372 if (IS_FASTOPEN(tp->t_flags) && wanted_cookie && 17373 !(to.to_flags & TOF_FASTOPEN)) 17374 len = 0; 17375 } 17376 if (tp->t_port) { 17377 if (V_tcp_udp_tunneling_port == 0) { 17378 /* The port was removed?? */ 17379 SOCKBUF_UNLOCK(&so->so_snd); 17380 #ifdef TCP_ACCOUNTING 17381 crtsc = get_cyclecount(); 17382 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 17383 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 17384 } 17385 counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1); 17386 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 17387 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 17388 } 17389 counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val)); 17390 sched_unpin(); 17391 #endif 17392 return (EHOSTUNREACH); 17393 } 17394 hdrlen += sizeof(struct udphdr); 17395 } 17396 #ifdef INET6 17397 if (isipv6) 17398 ipoptlen = ip6_optlen(tp->t_inpcb); 17399 else 17400 #endif 17401 if (tp->t_inpcb->inp_options) 17402 ipoptlen = tp->t_inpcb->inp_options->m_len - 17403 offsetof(struct ipoption, ipopt_list); 17404 else 17405 ipoptlen = 0; 17406 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 17407 ipoptlen += ipsec_optlen; 17408 #endif 17409 17410 /* 17411 * Adjust data length if insertion of options will bump the packet 17412 * length beyond the t_maxseg length. Clear the FIN bit because we 17413 * cut off the tail of the segment. 17414 */ 17415 if (len + optlen + ipoptlen > tp->t_maxseg) { 17416 if (tso) { 17417 uint32_t if_hw_tsomax; 17418 uint32_t moff; 17419 int32_t max_len; 17420 17421 /* extract TSO information */ 17422 if_hw_tsomax = tp->t_tsomax; 17423 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 17424 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 17425 KASSERT(ipoptlen == 0, 17426 ("%s: TSO can't do IP options", __func__)); 17427 17428 /* 17429 * Check if we should limit by maximum payload 17430 * length: 17431 */ 17432 if (if_hw_tsomax != 0) { 17433 /* compute maximum TSO length */ 17434 max_len = (if_hw_tsomax - hdrlen - 17435 max_linkhdr); 17436 if (max_len <= 0) { 17437 len = 0; 17438 } else if (len > max_len) { 17439 sendalot = 1; 17440 len = max_len; 17441 mark = 2; 17442 } 17443 } 17444 /* 17445 * Prevent the last segment from being fractional 17446 * unless the send sockbuf can be emptied: 17447 */ 17448 max_len = (tp->t_maxseg - optlen); 17449 if ((sb_offset + len) < sbavail(sb)) { 17450 moff = len % (u_int)max_len; 17451 if (moff != 0) { 17452 mark = 3; 17453 len -= moff; 17454 } 17455 } 17456 /* 17457 * In case there are too many small fragments don't 17458 * use TSO: 17459 */ 17460 if (len <= segsiz) { 17461 mark = 4; 17462 tso = 0; 17463 } 17464 /* 17465 * Send the FIN in a separate segment after the bulk 17466 * sending is done. We don't trust the TSO 17467 * implementations to clear the FIN flag on all but 17468 * the last segment. 17469 */ 17470 if (tp->t_flags & TF_NEEDFIN) { 17471 sendalot = 4; 17472 } 17473 } else { 17474 mark = 5; 17475 if (optlen + ipoptlen >= tp->t_maxseg) { 17476 /* 17477 * Since we don't have enough space to put 17478 * the IP header chain and the TCP header in 17479 * one packet as required by RFC 7112, don't 17480 * send it. Also ensure that at least one 17481 * byte of the payload can be put into the 17482 * TCP segment. 
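 * (Concretely: if the IP option chain plus TCP options consume all of
 * t_maxseg there is no room for even one payload byte, so the send is
 * aborted below with EMSGSIZE instead of emitting such a segment.)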
17483 */ 17484 SOCKBUF_UNLOCK(&so->so_snd); 17485 error = EMSGSIZE; 17486 sack_rxmit = 0; 17487 goto out; 17488 } 17489 len = tp->t_maxseg - optlen - ipoptlen; 17490 sendalot = 5; 17491 } 17492 } else { 17493 tso = 0; 17494 mark = 6; 17495 } 17496 KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, 17497 ("%s: len > IP_MAXPACKET", __func__)); 17498 #ifdef DIAGNOSTIC 17499 #ifdef INET6 17500 if (max_linkhdr + hdrlen > MCLBYTES) 17501 #else 17502 if (max_linkhdr + hdrlen > MHLEN) 17503 #endif 17504 panic("tcphdr too big"); 17505 #endif 17506 17507 /* 17508 * This KASSERT is here to catch edge cases at a well defined place. 17509 * Before, those had triggered (random) panic conditions further 17510 * down. 17511 */ 17512 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 17513 if ((len == 0) && 17514 (flags & TH_FIN) && 17515 (sbused(sb))) { 17516 /* 17517 * We have outstanding data, don't send a fin by itself!. 17518 */ 17519 goto just_return; 17520 } 17521 /* 17522 * Grab a header mbuf, attaching a copy of data to be transmitted, 17523 * and initialize the header from the template for sends on this 17524 * connection. 17525 */ 17526 hw_tls = (sb->sb_flags & SB_TLS_IFNET) != 0; 17527 if (len) { 17528 uint32_t max_val; 17529 uint32_t moff; 17530 17531 if (rack->r_ctl.rc_pace_max_segs) 17532 max_val = rack->r_ctl.rc_pace_max_segs; 17533 else if (rack->rc_user_set_max_segs) 17534 max_val = rack->rc_user_set_max_segs * segsiz; 17535 else 17536 max_val = len; 17537 /* 17538 * We allow a limit on sending with hptsi. 17539 */ 17540 if (len > max_val) { 17541 mark = 7; 17542 len = max_val; 17543 } 17544 #ifdef INET6 17545 if (MHLEN < hdrlen + max_linkhdr) 17546 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 17547 else 17548 #endif 17549 m = m_gethdr(M_NOWAIT, MT_DATA); 17550 17551 if (m == NULL) { 17552 SOCKBUF_UNLOCK(sb); 17553 error = ENOBUFS; 17554 sack_rxmit = 0; 17555 goto out; 17556 } 17557 m->m_data += max_linkhdr; 17558 m->m_len = hdrlen; 17559 17560 /* 17561 * Start the m_copy functions from the closest mbuf to the 17562 * sb_offset in the socket buffer chain. 17563 */ 17564 mb = sbsndptr_noadv(sb, sb_offset, &moff); 17565 s_mb = mb; 17566 s_moff = moff; 17567 if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) { 17568 m_copydata(mb, moff, (int)len, 17569 mtod(m, caddr_t)+hdrlen); 17570 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 17571 sbsndptr_adv(sb, mb, len); 17572 m->m_len += len; 17573 } else { 17574 struct sockbuf *msb; 17575 17576 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 17577 msb = NULL; 17578 else 17579 msb = sb; 17580 m->m_next = tcp_m_copym( 17581 mb, moff, &len, 17582 if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, 17583 ((rsm == NULL) ? hw_tls : 0) 17584 #ifdef NETFLIX_COPY_ARGS 17585 , &filled_all 17586 #endif 17587 ); 17588 if (len <= (tp->t_maxseg - optlen)) { 17589 /* 17590 * Must have ran out of mbufs for the copy 17591 * shorten it to no longer need tso. Lets 17592 * not put on sendalot since we are low on 17593 * mbufs. 
17594 */ 17595 tso = 0; 17596 } 17597 if (m->m_next == NULL) { 17598 SOCKBUF_UNLOCK(sb); 17599 (void)m_free(m); 17600 error = ENOBUFS; 17601 sack_rxmit = 0; 17602 goto out; 17603 } 17604 } 17605 if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { 17606 if (rsm && (rsm->r_flags & RACK_TLP)) { 17607 /* 17608 * TLP should not count in retran count, but 17609 * in its own bin 17610 */ 17611 counter_u64_add(rack_tlp_retran, 1); 17612 counter_u64_add(rack_tlp_retran_bytes, len); 17613 } else { 17614 tp->t_sndrexmitpack++; 17615 KMOD_TCPSTAT_INC(tcps_sndrexmitpack); 17616 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len); 17617 } 17618 #ifdef STATS 17619 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 17620 len); 17621 #endif 17622 } else { 17623 KMOD_TCPSTAT_INC(tcps_sndpack); 17624 KMOD_TCPSTAT_ADD(tcps_sndbyte, len); 17625 #ifdef STATS 17626 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 17627 len); 17628 #endif 17629 } 17630 /* 17631 * If we're sending everything we've got, set PUSH. (This 17632 * will keep happy those implementations which only give 17633 * data to the user when a buffer fills or a PUSH comes in.) 17634 */ 17635 if (sb_offset + len == sbused(sb) && 17636 sbused(sb) && 17637 !(flags & TH_SYN)) { 17638 flags |= TH_PUSH; 17639 add_flag |= RACK_HAD_PUSH; 17640 } 17641 17642 SOCKBUF_UNLOCK(sb); 17643 } else { 17644 SOCKBUF_UNLOCK(sb); 17645 if (tp->t_flags & TF_ACKNOW) 17646 KMOD_TCPSTAT_INC(tcps_sndacks); 17647 else if (flags & (TH_SYN | TH_FIN | TH_RST)) 17648 KMOD_TCPSTAT_INC(tcps_sndctrl); 17649 else 17650 KMOD_TCPSTAT_INC(tcps_sndwinup); 17651 17652 m = m_gethdr(M_NOWAIT, MT_DATA); 17653 if (m == NULL) { 17654 error = ENOBUFS; 17655 sack_rxmit = 0; 17656 goto out; 17657 } 17658 #ifdef INET6 17659 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && 17660 MHLEN >= hdrlen) { 17661 M_ALIGN(m, hdrlen); 17662 } else 17663 #endif 17664 m->m_data += max_linkhdr; 17665 m->m_len = hdrlen; 17666 } 17667 SOCKBUF_UNLOCK_ASSERT(sb); 17668 m->m_pkthdr.rcvif = (struct ifnet *)0; 17669 #ifdef MAC 17670 mac_inpcb_create_mbuf(inp, m); 17671 #endif 17672 if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) { 17673 #ifdef INET6 17674 if (isipv6) 17675 ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; 17676 else 17677 #endif /* INET6 */ 17678 ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; 17679 th = rack->r_ctl.fsb.th; 17680 udp = rack->r_ctl.fsb.udp; 17681 if (udp) { 17682 #ifdef INET6 17683 if (isipv6) 17684 ulen = hdrlen + len - sizeof(struct ip6_hdr); 17685 else 17686 #endif /* INET6 */ 17687 ulen = hdrlen + len - sizeof(struct ip); 17688 udp->uh_ulen = htons(ulen); 17689 } 17690 } else { 17691 #ifdef INET6 17692 if (isipv6) { 17693 ip6 = mtod(m, struct ip6_hdr *); 17694 if (tp->t_port) { 17695 udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr)); 17696 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 17697 udp->uh_dport = tp->t_port; 17698 ulen = hdrlen + len - sizeof(struct ip6_hdr); 17699 udp->uh_ulen = htons(ulen); 17700 th = (struct tcphdr *)(udp + 1); 17701 } else 17702 th = (struct tcphdr *)(ip6 + 1); 17703 tcpip_fillheaders(inp, tp->t_port, ip6, th); 17704 } else 17705 #endif /* INET6 */ 17706 { 17707 ip = mtod(m, struct ip *); 17708 #ifdef TCPDEBUG 17709 ipov = (struct ipovly *)ip; 17710 #endif 17711 if (tp->t_port) { 17712 udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip)); 17713 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 17714 udp->uh_dport = tp->t_port; 17715 ulen = hdrlen + len - sizeof(struct ip); 17716 udp->uh_ulen = htons(ulen); 
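/*
 * With UDP tunneling (tp->t_port != 0) the on-wire layout is
 * IP|UDP|TCP, so the TCP header sits immediately past the UDP
 * header that was just filled in.
 */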
17717 th = (struct tcphdr *)(udp + 1); 17718 } else 17719 th = (struct tcphdr *)(ip + 1); 17720 tcpip_fillheaders(inp, tp->t_port, ip, th); 17721 } 17722 } 17723 /* 17724 * Fill in fields, remembering maximum advertised window for use in 17725 * delaying messages about window sizes. If resending a FIN, be sure 17726 * not to use a new sequence number. 17727 */ 17728 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 17729 tp->snd_nxt == tp->snd_max) 17730 tp->snd_nxt--; 17731 /* 17732 * If we are starting a connection, send ECN setup SYN packet. If we 17733 * are on a retransmit, we may resend those bits a number of times 17734 * as per RFC 3168. 17735 */ 17736 if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) { 17737 if (tp->t_rxtshift >= 1) { 17738 if (tp->t_rxtshift <= V_tcp_ecn_maxretries) 17739 flags |= TH_ECE | TH_CWR; 17740 } else 17741 flags |= TH_ECE | TH_CWR; 17742 } 17743 /* Handle parallel SYN for ECN */ 17744 if ((tp->t_state == TCPS_SYN_RECEIVED) && 17745 (tp->t_flags2 & TF2_ECN_SND_ECE)) { 17746 flags |= TH_ECE; 17747 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 17748 } 17749 if (TCPS_HAVEESTABLISHED(tp->t_state) && 17750 (tp->t_flags2 & TF2_ECN_PERMIT)) { 17751 /* 17752 * If the peer has ECN, mark data packets with ECN capable 17753 * transmission (ECT). Ignore pure ack packets, 17754 * retransmissions. 17755 */ 17756 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && 17757 (sack_rxmit == 0)) { 17758 #ifdef INET6 17759 if (isipv6) 17760 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); 17761 else 17762 #endif 17763 ip->ip_tos |= IPTOS_ECN_ECT0; 17764 KMOD_TCPSTAT_INC(tcps_ecn_ect0); 17765 /* 17766 * Reply with proper ECN notifications. 17767 * Only set CWR on new data segments. 17768 */ 17769 if (tp->t_flags2 & TF2_ECN_SND_CWR) { 17770 flags |= TH_CWR; 17771 tp->t_flags2 &= ~TF2_ECN_SND_CWR; 17772 } 17773 } 17774 if (tp->t_flags2 & TF2_ECN_SND_ECE) 17775 flags |= TH_ECE; 17776 } 17777 /* 17778 * If we are doing retransmissions, then snd_nxt will not reflect 17779 * the first unsent octet. For ACK only packets, we do not want the 17780 * sequence number of the retransmitted packet, we want the sequence 17781 * number of the next unsent octet. So, if there is no data (and no 17782 * SYN or FIN), use snd_max instead of snd_nxt when filling in 17783 * ti_seq. But if we are in persist state, snd_max might reflect 17784 * one byte beyond the right edge of the window, so use snd_nxt in 17785 * that case, since we know we aren't doing a retransmission. 17786 * (retransmit and persist are mutually exclusive...) 17787 */ 17788 if (sack_rxmit == 0) { 17789 if (len || (flags & (TH_SYN | TH_FIN))) { 17790 th->th_seq = htonl(tp->snd_nxt); 17791 rack_seq = tp->snd_nxt; 17792 } else { 17793 th->th_seq = htonl(tp->snd_max); 17794 rack_seq = tp->snd_max; 17795 } 17796 } else { 17797 th->th_seq = htonl(rsm->r_start); 17798 rack_seq = rsm->r_start; 17799 } 17800 th->th_ack = htonl(tp->rcv_nxt); 17801 th->th_flags = flags; 17802 /* 17803 * Calculate receive window. Don't shrink window, but avoid silly 17804 * window syndrome. 17805 * If a RST segment is sent, advertise a window of zero. 
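 * (Illustrative: with a 64 kB receive buffer and a 1460-byte segsiz,
 * an available window under 1460 bytes is zeroed below and then pulled
 * back up to the already advertised edge (rcv_adv - rcv_nxt), so the
 * peer never sees the window shrink.)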
17806 */ 17807 if (flags & TH_RST) { 17808 recwin = 0; 17809 } else { 17810 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && 17811 recwin < (long)segsiz) { 17812 recwin = 0; 17813 } 17814 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && 17815 recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) 17816 recwin = (long)(tp->rcv_adv - tp->rcv_nxt); 17817 } 17818 17819 /* 17820 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or 17821 * <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> case is 17822 * handled in syncache. 17823 */ 17824 if (flags & TH_SYN) 17825 th->th_win = htons((u_short) 17826 (min(sbspace(&so->so_rcv), TCP_MAXWIN))); 17827 else { 17828 /* Avoid shrinking window with window scaling. */ 17829 recwin = roundup2(recwin, 1 << tp->rcv_scale); 17830 th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); 17831 } 17832 /* 17833 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 17834 * window. This may cause the remote transmitter to stall. This 17835 * flag tells soreceive() to disable delayed acknowledgements when 17836 * draining the buffer. This can occur if the receiver is 17837 * attempting to read more data than can be buffered prior to 17838 * transmitting on the connection. 17839 */ 17840 if (th->th_win == 0) { 17841 tp->t_sndzerowin++; 17842 tp->t_flags |= TF_RXWIN0SENT; 17843 } else 17844 tp->t_flags &= ~TF_RXWIN0SENT; 17845 tp->snd_up = tp->snd_una; /* drag it along, its deprecated */ 17846 /* Now are we using fsb?, if so copy the template data to the mbuf */ 17847 if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) { 17848 uint8_t *cpto; 17849 17850 cpto = mtod(m, uint8_t *); 17851 memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); 17852 /* 17853 * We have just copied in: 17854 * IP/IP6 17855 * <optional udphdr> 17856 * tcphdr (no options) 17857 * 17858 * We need to grab the correct pointers into the mbuf 17859 * for both the tcp header, and possibly the udp header (if tunneling). 17860 * We do this by using the offset in the copy buffer and adding it 17861 * to the mbuf base pointer (cpto). 17862 */ 17863 #ifdef INET6 17864 if (isipv6) 17865 ip6 = mtod(m, struct ip6_hdr *); 17866 else 17867 #endif /* INET6 */ 17868 ip = mtod(m, struct ip *); 17869 th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr)); 17870 /* If we have a udp header lets set it into the mbuf as well */ 17871 if (udp) 17872 udp = (struct udphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.udp - rack->r_ctl.fsb.tcp_ip_hdr)); 17873 } 17874 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 17875 if (to.to_flags & TOF_SIGNATURE) { 17876 /* 17877 * Calculate MD5 signature and put it into the place 17878 * determined before. 17879 * NOTE: since TCP options buffer doesn't point into 17880 * mbuf's data, calculate offset and use it. 17881 */ 17882 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, 17883 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { 17884 /* 17885 * Do not send segment if the calculation of MD5 17886 * digest has failed. 17887 */ 17888 goto out; 17889 } 17890 } 17891 #endif 17892 if (optlen) { 17893 bcopy(opt, th + 1, optlen); 17894 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 17895 } 17896 /* 17897 * Put TCP length in extended header, and then checksum extended 17898 * header and data. 
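 * (Only the pseudo-header sum is computed in software here; the
 * CSUM_* flags and the csum_data offset tell the driver or checksum
 * path where to finish the TCP/UDP checksum over the payload.)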
*/
17900 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() needs this */
17901 #ifdef INET6
17902 if (isipv6) {
17903 /*
17904 * ip6_plen does not need to be filled now, and will be filled
17905 * in ip6_output.
17906 */
17907 if (tp->t_port) {
17908 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
17909 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
17910 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
17911 th->th_sum = htons(0);
17912 UDPSTAT_INC(udps_opackets);
17913 } else {
17914 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
17915 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
17916 th->th_sum = in6_cksum_pseudo(ip6,
17917 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
17918 0);
17919 }
17920 }
17921 #endif
17922 #if defined(INET6) && defined(INET)
17923 else
17924 #endif
17925 #ifdef INET
17926 {
17927 if (tp->t_port) {
17928 m->m_pkthdr.csum_flags = CSUM_UDP;
17929 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
17930 udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
17931 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
17932 th->th_sum = htons(0);
17933 UDPSTAT_INC(udps_opackets);
17934 } else {
17935 m->m_pkthdr.csum_flags = CSUM_TCP;
17936 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
17937 th->th_sum = in_pseudo(ip->ip_src.s_addr,
17938 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
17939 IPPROTO_TCP + len + optlen));
17940 }
17941 /* IP version must be set here for ipv4/ipv6 checking later */
17942 KASSERT(ip->ip_v == IPVERSION,
17943 ("%s: IP version incorrect: %d", __func__, ip->ip_v));
17944 }
17945 #endif
17946 /*
17947 * Enable TSO and specify the size of the segments. The TCP pseudo
17948 * header checksum is always provided. XXX: Fixme: This is currently
17949 * not the case for IPv6.
17950 */
17951 if (tso) {
17952 KASSERT(len > tp->t_maxseg - optlen,
17953 ("%s: len <= tso_segsz", __func__));
17954 m->m_pkthdr.csum_flags |= CSUM_TSO;
17955 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen;
17956 }
17957 KASSERT(len + hdrlen == m_length(m, NULL),
17958 ("%s: mbuf chain different than expected: %d + %u != %u",
17959 __func__, len, hdrlen, m_length(m, NULL)));
17960
17961 #ifdef TCP_HHOOK
17962 /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */
17963 hhook_run_tcp_est_out(tp, th, &to, len, tso);
17964 #endif
17965 /* We're getting ready to send; log now.
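 * The log record below packs the diagnostic 'mark' and 'pass' codes
 * gathered on this pass into flex7 as (mark << 8) | pass.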
*/ 17966 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 17967 union tcp_log_stackspecific log; 17968 17969 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 17970 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 17971 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 17972 if (rack->rack_no_prr) 17973 log.u_bbr.flex1 = 0; 17974 else 17975 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 17976 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 17977 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 17978 log.u_bbr.flex4 = orig_len; 17979 if (filled_all) 17980 log.u_bbr.flex5 = 0x80000000; 17981 else 17982 log.u_bbr.flex5 = 0; 17983 /* Save off the early/late values */ 17984 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 17985 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 17986 log.u_bbr.bw_inuse = rack_get_bw(rack); 17987 if (rsm || sack_rxmit) { 17988 if (doing_tlp) 17989 log.u_bbr.flex8 = 2; 17990 else 17991 log.u_bbr.flex8 = 1; 17992 } else { 17993 log.u_bbr.flex8 = 0; 17994 } 17995 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); 17996 log.u_bbr.flex7 = mark; 17997 log.u_bbr.flex7 <<= 8; 17998 log.u_bbr.flex7 |= pass; 17999 log.u_bbr.pkts_out = tp->t_maxseg; 18000 log.u_bbr.timeStamp = cts; 18001 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 18002 log.u_bbr.lt_epoch = cwnd_to_use; 18003 log.u_bbr.delivered = sendalot; 18004 lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, 18005 len, &log, false, NULL, NULL, 0, &tv); 18006 } else 18007 lgb = NULL; 18008 18009 /* 18010 * Fill in IP length and desired time to live and send to IP level. 18011 * There should be a better way to handle ttl and tos; we could keep 18012 * them in the template, but need a way to checksum without them. 18013 */ 18014 /* 18015 * m->m_pkthdr.len should have been set before cksum calcuration, 18016 * because in6_cksum() need it. 18017 */ 18018 #ifdef INET6 18019 if (isipv6) { 18020 /* 18021 * we separately set hoplimit for every segment, since the 18022 * user might want to change the value via setsockopt. Also, 18023 * desired default hop limit might be changed via Neighbor 18024 * Discovery. 18025 */ 18026 rack->r_ctl.fsb.hoplimit = ip6->ip6_hlim = in6_selecthlim(inp, NULL); 18027 18028 /* 18029 * Set the packet size here for the benefit of DTrace 18030 * probes. ip6_output() will set it properly; it's supposed 18031 * to include the option header lengths as well. 18032 */ 18033 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 18034 18035 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 18036 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 18037 else 18038 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 18039 18040 if (tp->t_state == TCPS_SYN_SENT) 18041 TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); 18042 18043 TCP_PROBE5(send, NULL, tp, ip6, tp, th); 18044 /* TODO: IPv6 IP6TOS_ECT bit on */ 18045 error = ip6_output(m, 18046 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 18047 inp->in6p_outputopts, 18048 #else 18049 NULL, 18050 #endif 18051 &inp->inp_route6, 18052 ((rsm || sack_rxmit) ? 
IP_NO_SND_TAG_RL : 0),
18053 NULL, NULL, inp);
18054
18055 if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL)
18056 mtu = inp->inp_route6.ro_nh->nh_mtu;
18057 }
18058 #endif /* INET6 */
18059 #if defined(INET) && defined(INET6)
18060 else
18061 #endif
18062 #ifdef INET
18063 {
18064 ip->ip_len = htons(m->m_pkthdr.len);
18065 #ifdef INET6
18066 if (inp->inp_vflag & INP_IPV6PROTO)
18067 ip->ip_ttl = in6_selecthlim(inp, NULL);
18068 #endif /* INET6 */
18069 rack->r_ctl.fsb.hoplimit = ip->ip_ttl;
18070 /*
18071 * If we do path MTU discovery, then we set DF on every
18072 * packet. This might not be the best thing to do according
18073 * to RFC3390 Section 2. However, the tcp hostcache mitigates
18074 * the problem so it affects only the first tcp connection
18075 * with a host.
18076 *
18077 * NB: Don't set DF on small MTU/MSS to have a safe
18078 * fallback.
18079 */
18080 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
18081 tp->t_flags2 |= TF2_PLPMTU_PMTUD;
18082 if (tp->t_port == 0 || len < V_tcp_minmss) {
18083 ip->ip_off |= htons(IP_DF);
18084 }
18085 } else {
18086 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
18087 }
18088
18089 if (tp->t_state == TCPS_SYN_SENT)
18090 TCP_PROBE5(connect__request, NULL, tp, ip, tp, th);
18091
18092 TCP_PROBE5(send, NULL, tp, ip, tp, th);
18093
18094 error = ip_output(m,
18095 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
18096 inp->inp_options,
18097 #else
18098 NULL,
18099 #endif
18100 &inp->inp_route,
18101 ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0,
18102 inp);
18103 if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL)
18104 mtu = inp->inp_route.ro_nh->nh_mtu;
18105 }
18106 #endif /* INET */
18107
18108 out:
18109 if (lgb) {
18110 lgb->tlb_errno = error;
18111 lgb = NULL;
18112 }
18113 /*
18114 * In transmit state, time the transmission and arrange for the
18115 * retransmit. In persist state, just set snd_max.
18116 */ 18117 if (error == 0) { 18118 tcp_account_for_send(tp, len, (rsm != NULL), doing_tlp); 18119 rack->forced_ack = 0; /* If we send something zap the FA flag */ 18120 if (rsm && (doing_tlp == 0)) { 18121 /* Set we retransmitted */ 18122 rack->rc_gp_saw_rec = 1; 18123 } else { 18124 if (cwnd_to_use > tp->snd_ssthresh) { 18125 /* Set we sent in CA */ 18126 rack->rc_gp_saw_ca = 1; 18127 } else { 18128 /* Set we sent in SS */ 18129 rack->rc_gp_saw_ss = 1; 18130 } 18131 } 18132 if (TCPS_HAVEESTABLISHED(tp->t_state) && 18133 (tp->t_flags & TF_SACK_PERMIT) && 18134 tp->rcv_numsacks > 0) 18135 tcp_clean_dsack_blocks(tp); 18136 tot_len_this_send += len; 18137 if (len == 0) 18138 counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); 18139 else if (len == 1) { 18140 counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1); 18141 } else if (len > 1) { 18142 int idx; 18143 18144 idx = (len / segsiz) + 3; 18145 if (idx >= TCP_MSS_ACCT_ATIMER) 18146 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 18147 else 18148 counter_u64_add(rack_out_size[idx], 1); 18149 } 18150 } 18151 if ((rack->rack_no_prr == 0) && 18152 sub_from_prr && 18153 (error == 0)) { 18154 if (rack->r_ctl.rc_prr_sndcnt >= len) 18155 rack->r_ctl.rc_prr_sndcnt -= len; 18156 else 18157 rack->r_ctl.rc_prr_sndcnt = 0; 18158 } 18159 sub_from_prr = 0; 18160 if (doing_tlp && (rsm == NULL)) { 18161 /* New send doing a TLP */ 18162 add_flag |= RACK_TLP; 18163 } 18164 rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, 18165 rack_to_usec_ts(&tv), 18166 rsm, add_flag, s_mb, s_moff); 18167 18168 18169 if ((error == 0) && 18170 (len > 0) && 18171 (tp->snd_una == tp->snd_max)) 18172 rack->r_ctl.rc_tlp_rxt_last_time = cts; 18173 { 18174 tcp_seq startseq = tp->snd_nxt; 18175 18176 /* Track our lost count */ 18177 if (rsm && (doing_tlp == 0)) 18178 rack->r_ctl.rc_loss_count += rsm->r_end - rsm->r_start; 18179 /* 18180 * Advance snd_nxt over sequence space of this segment. 18181 */ 18182 if (error) 18183 /* We don't log or do anything with errors */ 18184 goto nomore; 18185 if (doing_tlp == 0) { 18186 if (rsm == NULL) { 18187 /* 18188 * Not a retransmission of some 18189 * sort, new data is going out so 18190 * clear our TLP count and flag. 18191 */ 18192 rack->rc_tlp_in_progress = 0; 18193 rack->r_ctl.rc_tlp_cnt_out = 0; 18194 } 18195 } else { 18196 /* 18197 * We have just sent a TLP, mark that it is true 18198 * and make sure our in progress is set so we 18199 * continue to check the count. 18200 */ 18201 rack->rc_tlp_in_progress = 1; 18202 rack->r_ctl.rc_tlp_cnt_out++; 18203 } 18204 if (flags & (TH_SYN | TH_FIN)) { 18205 if (flags & TH_SYN) 18206 tp->snd_nxt++; 18207 if (flags & TH_FIN) { 18208 tp->snd_nxt++; 18209 tp->t_flags |= TF_SENTFIN; 18210 } 18211 } 18212 /* In the ENOBUFS case we do *not* update snd_max */ 18213 if (sack_rxmit) 18214 goto nomore; 18215 18216 tp->snd_nxt += len; 18217 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 18218 if (tp->snd_una == tp->snd_max) { 18219 /* 18220 * Update the time we just added data since 18221 * none was outstanding. 18222 */ 18223 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 18224 tp->t_acktime = ticks; 18225 } 18226 tp->snd_max = tp->snd_nxt; 18227 /* 18228 * Time this transmission if not a retransmission and 18229 * not currently timing anything. 18230 * This is only relevant in case of switching back to 18231 * the base stack. 
18232 */ 18233 if (tp->t_rtttime == 0) { 18234 tp->t_rtttime = ticks; 18235 tp->t_rtseq = startseq; 18236 KMOD_TCPSTAT_INC(tcps_segstimed); 18237 } 18238 if (len && 18239 ((tp->t_flags & TF_GPUTINPROG) == 0)) 18240 rack_start_gp_measurement(tp, rack, startseq, sb_offset); 18241 } 18242 /* 18243 * If we are doing FO we need to update the mbuf position and subtract 18244 * this happens when the peer sends us duplicate information and 18245 * we thus want to send a DSACK. 18246 * 18247 * XXXRRS: This brings to mind a ?, when we send a DSACK block is TSO 18248 * turned off? If not then we are going to echo multiple DSACK blocks 18249 * out (with the TSO), which we should not be doing. 18250 */ 18251 if (rack->r_fast_output && len) { 18252 if (rack->r_ctl.fsb.left_to_send > len) 18253 rack->r_ctl.fsb.left_to_send -= len; 18254 else 18255 rack->r_ctl.fsb.left_to_send = 0; 18256 if (rack->r_ctl.fsb.left_to_send < segsiz) 18257 rack->r_fast_output = 0; 18258 if (rack->r_fast_output) { 18259 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); 18260 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; 18261 } 18262 } 18263 } 18264 nomore: 18265 if (error) { 18266 rack->r_ctl.rc_agg_delayed = 0; 18267 rack->r_early = 0; 18268 rack->r_late = 0; 18269 rack->r_ctl.rc_agg_early = 0; 18270 SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */ 18271 /* 18272 * Failures do not advance the seq counter above. For the 18273 * case of ENOBUFS we will fall out and retry in 1ms with 18274 * the hpts. Everything else will just have to retransmit 18275 * with the timer. 18276 * 18277 * In any case, we do not want to loop around for another 18278 * send without a good reason. 18279 */ 18280 sendalot = 0; 18281 switch (error) { 18282 case EPERM: 18283 tp->t_softerror = error; 18284 #ifdef TCP_ACCOUNTING 18285 crtsc = get_cyclecount(); 18286 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18287 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 18288 } 18289 counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1); 18290 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18291 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 18292 } 18293 counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val)); 18294 sched_unpin(); 18295 #endif 18296 return (error); 18297 case ENOBUFS: 18298 /* 18299 * Pace us right away to retry in a some 18300 * time 18301 */ 18302 slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); 18303 if (rack->rc_enobuf < 0x7f) 18304 rack->rc_enobuf++; 18305 if (slot < (10 * HPTS_USEC_IN_MSEC)) 18306 slot = 10 * HPTS_USEC_IN_MSEC; 18307 if (rack->r_ctl.crte != NULL) { 18308 counter_u64_add(rack_saw_enobuf_hw, 1); 18309 tcp_rl_log_enobuf(rack->r_ctl.crte); 18310 } 18311 counter_u64_add(rack_saw_enobuf, 1); 18312 goto enobufs; 18313 case EMSGSIZE: 18314 /* 18315 * For some reason the interface we used initially 18316 * to send segments changed to another or lowered 18317 * its MTU. If TSO was active we either got an 18318 * interface without TSO capabilits or TSO was 18319 * turned off. If we obtained mtu from ip_output() 18320 * then update it and try again. 
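 * (tcp_mss_update() below folds the MTU reported by ip_output()/
 * ip6_output() into t_maxseg and the segment is rebuilt via the
 * 'goto again'; with no MTU in hand we instead retry after a 10 ms
 * HPTS delay.)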
18321 */ 18322 if (tso) 18323 tp->t_flags &= ~TF_TSO; 18324 if (mtu != 0) { 18325 tcp_mss_update(tp, -1, mtu, NULL, NULL); 18326 goto again; 18327 } 18328 slot = 10 * HPTS_USEC_IN_MSEC; 18329 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); 18330 #ifdef TCP_ACCOUNTING 18331 crtsc = get_cyclecount(); 18332 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18333 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 18334 } 18335 counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1); 18336 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18337 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 18338 } 18339 counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val)); 18340 sched_unpin(); 18341 #endif 18342 return (error); 18343 case ENETUNREACH: 18344 counter_u64_add(rack_saw_enetunreach, 1); 18345 case EHOSTDOWN: 18346 case EHOSTUNREACH: 18347 case ENETDOWN: 18348 if (TCPS_HAVERCVDSYN(tp->t_state)) { 18349 tp->t_softerror = error; 18350 } 18351 /* FALLTHROUGH */ 18352 default: 18353 slot = 10 * HPTS_USEC_IN_MSEC; 18354 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); 18355 #ifdef TCP_ACCOUNTING 18356 crtsc = get_cyclecount(); 18357 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18358 tp->tcp_cnt_counters[SND_OUT_FAIL]++; 18359 } 18360 counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1); 18361 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18362 tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); 18363 } 18364 counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val)); 18365 sched_unpin(); 18366 #endif 18367 return (error); 18368 } 18369 } else { 18370 rack->rc_enobuf = 0; 18371 if (IN_FASTRECOVERY(tp->t_flags) && rsm) 18372 rack->r_ctl.retran_during_recovery += len; 18373 } 18374 KMOD_TCPSTAT_INC(tcps_sndtotal); 18375 18376 /* 18377 * Data sent (as far as we can tell). If this advertises a larger 18378 * window than any other segment, then remember the size of the 18379 * advertised window. Any pending ACK has now been sent. 18380 */ 18381 if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) 18382 tp->rcv_adv = tp->rcv_nxt + recwin; 18383 18384 tp->last_ack_sent = tp->rcv_nxt; 18385 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 18386 enobufs: 18387 if (sendalot) { 18388 /* Do we need to turn off sendalot? */ 18389 if (rack->r_ctl.rc_pace_max_segs && 18390 (tot_len_this_send >= rack->r_ctl.rc_pace_max_segs)) { 18391 /* We hit our max. */ 18392 sendalot = 0; 18393 } else if ((rack->rc_user_set_max_segs) && 18394 (tot_len_this_send >= (rack->rc_user_set_max_segs * segsiz))) { 18395 /* We hit the user defined max */ 18396 sendalot = 0; 18397 } 18398 } 18399 if ((error == 0) && (flags & TH_FIN)) 18400 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN); 18401 if (flags & TH_RST) { 18402 /* 18403 * We don't send again after sending a RST. 18404 */ 18405 slot = 0; 18406 sendalot = 0; 18407 if (error == 0) 18408 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 18409 } else if ((slot == 0) && (sendalot == 0) && tot_len_this_send) { 18410 /* 18411 * Get our pacing rate, if an error 18412 * occurred in sending (ENOBUF) we would 18413 * hit the else if with slot preset. Other 18414 * errors return. 18415 */ 18416 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz); 18417 } 18418 if (rsm && 18419 (rsm->r_flags & RACK_HAS_SYN) == 0 && 18420 rack->use_rack_rr) { 18421 /* Its a retransmit and we use the rack cheat? 
*/ 18422 if ((slot == 0) || 18423 (rack->rc_always_pace == 0) || 18424 (rack->r_rr_config == 1)) { 18425 /* 18426 * We have no pacing set or we 18427 * are using old-style rack or 18428 * we are overriden to use the old 1ms pacing. 18429 */ 18430 slot = rack->r_ctl.rc_min_to; 18431 } 18432 } 18433 /* We have sent clear the flag */ 18434 rack->r_ent_rec_ns = 0; 18435 if (rack->r_must_retran) { 18436 if (rsm) { 18437 rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start); 18438 if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) { 18439 /* 18440 * We have retransmitted all. 18441 */ 18442 rack->r_must_retran = 0; 18443 rack->r_ctl.rc_out_at_rto = 0; 18444 } 18445 } else if (SEQ_GEQ(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) { 18446 /* 18447 * Sending new data will also kill 18448 * the loop. 18449 */ 18450 rack->r_must_retran = 0; 18451 rack->r_ctl.rc_out_at_rto = 0; 18452 } 18453 } 18454 rack->r_ctl.fsb.recwin = recwin; 18455 if ((tp->t_flags & (TF_WASCRECOVERY|TF_WASFRECOVERY)) && 18456 SEQ_GT(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) { 18457 /* 18458 * We hit an RTO and now have past snd_max at the RTO 18459 * clear all the WAS flags. 18460 */ 18461 tp->t_flags &= ~(TF_WASCRECOVERY|TF_WASFRECOVERY); 18462 } 18463 if (slot) { 18464 /* set the rack tcb into the slot N */ 18465 counter_u64_add(rack_paced_segments, 1); 18466 if ((error == 0) && 18467 rack_use_rfo && 18468 ((flags & (TH_SYN|TH_FIN)) == 0) && 18469 (rsm == NULL) && 18470 (tp->snd_nxt == tp->snd_max) && 18471 (ipoptlen == 0) && 18472 (tp->rcv_numsacks == 0) && 18473 rack->r_fsb_inited && 18474 TCPS_HAVEESTABLISHED(tp->t_state) && 18475 (rack->r_must_retran == 0) && 18476 ((tp->t_flags & TF_NEEDFIN) == 0) && 18477 (len > 0) && (orig_len > 0) && 18478 (orig_len > len) && 18479 ((orig_len - len) >= segsiz) && 18480 ((optlen == 0) || 18481 ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { 18482 /* We can send at least one more MSS using our fsb */ 18483 18484 rack->r_fast_output = 1; 18485 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); 18486 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; 18487 rack->r_ctl.fsb.tcp_flags = flags; 18488 rack->r_ctl.fsb.left_to_send = orig_len - len; 18489 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))), 18490 ("rack:%p left_to_send:%u sbavail:%u out:%u", 18491 rack, rack->r_ctl.fsb.left_to_send, sbavail(sb), 18492 (tp->snd_max - tp->snd_una))); 18493 if (rack->r_ctl.fsb.left_to_send < segsiz) 18494 rack->r_fast_output = 0; 18495 else { 18496 if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una))) 18497 rack->r_ctl.fsb.rfo_apply_push = 1; 18498 else 18499 rack->r_ctl.fsb.rfo_apply_push = 0; 18500 } 18501 } else 18502 rack->r_fast_output = 0; 18503 rack_log_fsb(rack, tp, so, flags, 18504 ipoptlen, orig_len, len, error, 18505 (rsm == NULL), optlen, __LINE__, 2); 18506 } else if (sendalot) { 18507 int ret; 18508 18509 if (len) 18510 counter_u64_add(rack_unpaced_segments, 1); 18511 sack_rxmit = 0; 18512 if ((error == 0) && 18513 rack_use_rfo && 18514 ((flags & (TH_SYN|TH_FIN)) == 0) && 18515 (rsm == NULL) && 18516 (ipoptlen == 0) && 18517 (tp->rcv_numsacks == 0) && 18518 (tp->snd_nxt == tp->snd_max) && 18519 (rack->r_must_retran == 0) && 18520 rack->r_fsb_inited && 18521 TCPS_HAVEESTABLISHED(tp->t_state) && 18522 ((tp->t_flags & TF_NEEDFIN) == 0) && 18523 (len > 0) && (orig_len > 0) && 18524 (orig_len > len) && 18525 ((orig_len - len) >= segsiz) && 18526 ((optlen == 0) || 18527 
((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { 18528 /* we can use fast_output for more */ 18529 18530 rack->r_fast_output = 1; 18531 rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); 18532 rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; 18533 rack->r_ctl.fsb.tcp_flags = flags; 18534 rack->r_ctl.fsb.left_to_send = orig_len - len; 18535 KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))), 18536 ("rack:%p left_to_send:%u sbavail:%u out:%u", 18537 rack, rack->r_ctl.fsb.left_to_send, sbavail(sb), 18538 (tp->snd_max - tp->snd_una))); 18539 if (rack->r_ctl.fsb.left_to_send < segsiz) { 18540 rack->r_fast_output = 0; 18541 } 18542 if (rack->r_fast_output) { 18543 if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una))) 18544 rack->r_ctl.fsb.rfo_apply_push = 1; 18545 else 18546 rack->r_ctl.fsb.rfo_apply_push = 0; 18547 rack_log_fsb(rack, tp, so, flags, 18548 ipoptlen, orig_len, len, error, 18549 (rsm == NULL), optlen, __LINE__, 3); 18550 error = 0; 18551 ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error); 18552 if (ret >= 0) 18553 return (ret); 18554 else if (error) 18555 goto nomore; 18556 18557 } 18558 } 18559 goto again; 18560 } else if (len) { 18561 counter_u64_add(rack_unpaced_segments, 1); 18562 } 18563 /* Assure when we leave that snd_nxt will point to top */ 18564 if (SEQ_GT(tp->snd_max, tp->snd_nxt)) 18565 tp->snd_nxt = tp->snd_max; 18566 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0); 18567 #ifdef TCP_ACCOUNTING 18568 crtsc = get_cyclecount() - ts_val; 18569 if (tot_len_this_send) { 18570 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18571 tp->tcp_cnt_counters[SND_OUT_DATA]++; 18572 } 18573 counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], 1); 18574 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18575 tp->tcp_proc_time[SND_OUT_DATA] += crtsc; 18576 } 18577 counter_u64_add(tcp_proc_time[SND_OUT_DATA], crtsc); 18578 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18579 tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) /segsiz); 18580 } 18581 counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((tot_len_this_send + segsiz - 1) /segsiz)); 18582 } else { 18583 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18584 tp->tcp_cnt_counters[SND_OUT_ACK]++; 18585 } 18586 counter_u64_add(tcp_cnt_counters[SND_OUT_ACK], 1); 18587 if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { 18588 tp->tcp_proc_time[SND_OUT_ACK] += crtsc; 18589 } 18590 counter_u64_add(tcp_proc_time[SND_OUT_ACK], crtsc); 18591 } 18592 sched_unpin(); 18593 #endif 18594 if (error == ENOBUFS) 18595 error = 0; 18596 return (error); 18597 } 18598 18599 static void 18600 rack_update_seg(struct tcp_rack *rack) 18601 { 18602 uint32_t orig_val; 18603 18604 orig_val = rack->r_ctl.rc_pace_max_segs; 18605 rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); 18606 if (orig_val != rack->r_ctl.rc_pace_max_segs) 18607 rack_log_pacing_delay_calc(rack, 0, 0, orig_val, 0, 0, 15, __LINE__, NULL); 18608 } 18609 18610 static void 18611 rack_mtu_change(struct tcpcb *tp) 18612 { 18613 /* 18614 * The MSS may have changed 18615 */ 18616 struct tcp_rack *rack; 18617 18618 rack = (struct tcp_rack *)tp->t_fb_ptr; 18619 if (rack->r_ctl.rc_pace_min_segs != ctf_fixed_maxseg(tp)) { 18620 /* 18621 * The MTU has changed we need to resend everything 18622 * since all we have sent is lost. We first fix 18623 * up the mtu though. 
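         * (The bookkeeping below mirrors what the retransmission-timeout
         * path at the end of rack_output() does: rc_out_at_rto and
         * rc_snd_max_at_rto record how much already-sent data is
         * outstanding so that r_must_retran can be cleared once all of
         * it has been resent.)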
18624 */ 18625 rack_set_pace_segments(tp, rack, __LINE__, NULL); 18626 /* We treat this like a full retransmit timeout without the cwnd adjustment */ 18627 rack_remxt_tmr(tp); 18628 rack->r_fast_output = 0; 18629 rack->r_ctl.rc_out_at_rto = ctf_flight_size(tp, 18630 rack->r_ctl.rc_sacked); 18631 rack->r_ctl.rc_snd_max_at_rto = tp->snd_max; 18632 rack->r_must_retran = 1; 18633 18634 } 18635 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 18636 /* We don't use snd_nxt to retransmit */ 18637 tp->snd_nxt = tp->snd_max; 18638 } 18639 18640 static int 18641 rack_set_profile(struct tcp_rack *rack, int prof) 18642 { 18643 int err = EINVAL; 18644 if (prof == 1) { 18645 /* pace_always=1 */ 18646 if (rack->rc_always_pace == 0) { 18647 if (tcp_can_enable_pacing() == 0) 18648 return (EBUSY); 18649 } 18650 rack->rc_always_pace = 1; 18651 if (rack->use_fixed_rate || rack->gp_ready) 18652 rack_set_cc_pacing(rack); 18653 rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; 18654 rack->rack_attempt_hdwr_pace = 0; 18655 /* cmpack=1 */ 18656 if (rack_use_cmp_acks) 18657 rack->r_use_cmp_ack = 1; 18658 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) && 18659 rack->r_use_cmp_ack) 18660 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; 18661 /* scwnd=1 */ 18662 rack->rack_enable_scwnd = 1; 18663 /* dynamic=100 */ 18664 rack->rc_gp_dyn_mul = 1; 18665 /* gp_inc_ca */ 18666 rack->r_ctl.rack_per_of_gp_ca = 100; 18667 /* rrr_conf=3 */ 18668 rack->r_rr_config = 3; 18669 /* npush=2 */ 18670 rack->r_ctl.rc_no_push_at_mrtt = 2; 18671 /* fillcw=1 */ 18672 rack->rc_pace_to_cwnd = 1; 18673 rack->rc_pace_fill_if_rttin_range = 0; 18674 rack->rtt_limit_mul = 0; 18675 /* noprr=1 */ 18676 rack->rack_no_prr = 1; 18677 /* lscwnd=1 */ 18678 rack->r_limit_scw = 1; 18679 /* gp_inc_rec */ 18680 rack->r_ctl.rack_per_of_gp_rec = 90; 18681 err = 0; 18682 18683 } else if (prof == 3) { 18684 /* Same as profile one execept fill_cw becomes 2 (less aggressive set) */ 18685 /* pace_always=1 */ 18686 if (rack->rc_always_pace == 0) { 18687 if (tcp_can_enable_pacing() == 0) 18688 return (EBUSY); 18689 } 18690 rack->rc_always_pace = 1; 18691 if (rack->use_fixed_rate || rack->gp_ready) 18692 rack_set_cc_pacing(rack); 18693 rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; 18694 rack->rack_attempt_hdwr_pace = 0; 18695 /* cmpack=1 */ 18696 if (rack_use_cmp_acks) 18697 rack->r_use_cmp_ack = 1; 18698 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) && 18699 rack->r_use_cmp_ack) 18700 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; 18701 /* scwnd=1 */ 18702 rack->rack_enable_scwnd = 1; 18703 /* dynamic=100 */ 18704 rack->rc_gp_dyn_mul = 1; 18705 /* gp_inc_ca */ 18706 rack->r_ctl.rack_per_of_gp_ca = 100; 18707 /* rrr_conf=3 */ 18708 rack->r_rr_config = 3; 18709 /* npush=2 */ 18710 rack->r_ctl.rc_no_push_at_mrtt = 2; 18711 /* fillcw=2 */ 18712 rack->rc_pace_to_cwnd = 1; 18713 rack->r_fill_less_agg = 1; 18714 rack->rc_pace_fill_if_rttin_range = 0; 18715 rack->rtt_limit_mul = 0; 18716 /* noprr=1 */ 18717 rack->rack_no_prr = 1; 18718 /* lscwnd=1 */ 18719 rack->r_limit_scw = 1; 18720 /* gp_inc_rec */ 18721 rack->r_ctl.rack_per_of_gp_rec = 90; 18722 err = 0; 18723 18724 18725 } else if (prof == 2) { 18726 /* cmpack=1 */ 18727 if (rack->rc_always_pace == 0) { 18728 if (tcp_can_enable_pacing() == 0) 18729 return (EBUSY); 18730 } 18731 rack->rc_always_pace = 1; 18732 if (rack->use_fixed_rate || rack->gp_ready) 18733 rack_set_cc_pacing(rack); 18734 rack->r_use_cmp_ack = 1; 18735 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state)) 18736 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; 
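        /*
         * For reference, a consumer normally picks one of these canned
         * profiles from userspace with a plain setsockopt() once the
         * connection is already using this stack.  A minimal sketch, with
         * a hypothetical connected descriptor "fd" and no error handling:
         *
         *    #include <netinet/tcp.h>
         *
         *    int prof = 2;
         *    setsockopt(fd, IPPROTO_TCP, TCP_RACK_PROFILE, &prof,
         *        sizeof(prof));
         *
         * The value arrives here via rack_set_sockopt() and
         * rack_process_option() below.
         */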
18737 /* pace_always=1 */ 18738 rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; 18739 /* scwnd=1 */ 18740 rack->rack_enable_scwnd = 1; 18741 /* dynamic=100 */ 18742 rack->rc_gp_dyn_mul = 1; 18743 rack->r_ctl.rack_per_of_gp_ca = 100; 18744 /* rrr_conf=3 */ 18745 rack->r_rr_config = 3; 18746 /* npush=2 */ 18747 rack->r_ctl.rc_no_push_at_mrtt = 2; 18748 /* fillcw=1 */ 18749 rack->rc_pace_to_cwnd = 1; 18750 rack->rc_pace_fill_if_rttin_range = 0; 18751 rack->rtt_limit_mul = 0; 18752 /* noprr=1 */ 18753 rack->rack_no_prr = 1; 18754 /* lscwnd=0 */ 18755 rack->r_limit_scw = 0; 18756 err = 0; 18757 } else if (prof == 0) { 18758 /* This changes things back to the default settings */ 18759 err = 0; 18760 if (rack->rc_always_pace) { 18761 tcp_decrement_paced_conn(); 18762 rack_undo_cc_pacing(rack); 18763 rack->rc_always_pace = 0; 18764 } 18765 if (rack_pace_every_seg && tcp_can_enable_pacing()) { 18766 rack->rc_always_pace = 1; 18767 if (rack->use_fixed_rate || rack->gp_ready) 18768 rack_set_cc_pacing(rack); 18769 } else 18770 rack->rc_always_pace = 0; 18771 if (rack_use_cmp_acks) 18772 rack->r_use_cmp_ack = 1; 18773 else 18774 rack->r_use_cmp_ack = 0; 18775 if (rack_disable_prr) 18776 rack->rack_no_prr = 1; 18777 else 18778 rack->rack_no_prr = 0; 18779 if (rack_gp_no_rec_chg) 18780 rack->rc_gp_no_rec_chg = 1; 18781 else 18782 rack->rc_gp_no_rec_chg = 0; 18783 if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack) { 18784 rack->r_mbuf_queue = 1; 18785 if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state)) 18786 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; 18787 rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; 18788 } else { 18789 rack->r_mbuf_queue = 0; 18790 rack->rc_inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 18791 } 18792 if (rack_enable_shared_cwnd) 18793 rack->rack_enable_scwnd = 1; 18794 else 18795 rack->rack_enable_scwnd = 0; 18796 if (rack_do_dyn_mul) { 18797 /* When dynamic adjustment is on CA needs to start at 100% */ 18798 rack->rc_gp_dyn_mul = 1; 18799 if (rack_do_dyn_mul >= 100) 18800 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul; 18801 } else { 18802 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca; 18803 rack->rc_gp_dyn_mul = 0; 18804 } 18805 rack->r_rr_config = 0; 18806 rack->r_ctl.rc_no_push_at_mrtt = 0; 18807 rack->rc_pace_to_cwnd = 0; 18808 rack->rc_pace_fill_if_rttin_range = 0; 18809 rack->rtt_limit_mul = 0; 18810 18811 if (rack_enable_hw_pacing) 18812 rack->rack_hdw_pace_ena = 1; 18813 else 18814 rack->rack_hdw_pace_ena = 0; 18815 if (rack_disable_prr) 18816 rack->rack_no_prr = 1; 18817 else 18818 rack->rack_no_prr = 0; 18819 if (rack_limits_scwnd) 18820 rack->r_limit_scw = 1; 18821 else 18822 rack->r_limit_scw = 0; 18823 err = 0; 18824 } 18825 return (err); 18826 } 18827 18828 static int 18829 rack_add_deferred_option(struct tcp_rack *rack, int sopt_name, uint64_t loptval) 18830 { 18831 struct deferred_opt_list *dol; 18832 18833 dol = malloc(sizeof(struct deferred_opt_list), 18834 M_TCPFSB, M_NOWAIT|M_ZERO); 18835 if (dol == NULL) { 18836 /* 18837 * No space yikes -- fail out.. 
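         * (Returning 0 tells rack_set_sockopt() that the option could not
         * be queued; the caller turns that into ENOMEM rather than waiting
         * for memory, since the allocation above is M_NOWAIT.)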
18838 */ 18839 return (0); 18840 } 18841 dol->optname = sopt_name; 18842 dol->optval = loptval; 18843 TAILQ_INSERT_TAIL(&rack->r_ctl.opt_list, dol, next); 18844 return (1); 18845 } 18846 18847 static int 18848 rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, 18849 uint32_t optval, uint64_t loptval) 18850 { 18851 struct epoch_tracker et; 18852 struct sockopt sopt; 18853 struct cc_newreno_opts opt; 18854 uint64_t val; 18855 int error = 0; 18856 uint16_t ca, ss; 18857 18858 switch (sopt_name) { 18859 18860 case TCP_RACK_PACING_BETA: 18861 RACK_OPTS_INC(tcp_rack_beta); 18862 if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) { 18863 /* This only works for newreno. */ 18864 error = EINVAL; 18865 break; 18866 } 18867 if (rack->rc_pacing_cc_set) { 18868 /* 18869 * Set them into the real CC module 18870 * whats in the rack pcb is the old values 18871 * to be used on restoral/ 18872 */ 18873 sopt.sopt_dir = SOPT_SET; 18874 opt.name = CC_NEWRENO_BETA; 18875 opt.val = optval; 18876 if (CC_ALGO(tp)->ctl_output != NULL) 18877 error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt); 18878 else { 18879 error = ENOENT; 18880 break; 18881 } 18882 } else { 18883 /* 18884 * Not pacing yet so set it into our local 18885 * rack pcb storage. 18886 */ 18887 rack->r_ctl.rc_saved_beta.beta = optval; 18888 } 18889 break; 18890 case TCP_RACK_TIMER_SLOP: 18891 RACK_OPTS_INC(tcp_rack_timer_slop); 18892 rack->r_ctl.timer_slop = optval; 18893 if (rack->rc_tp->t_srtt) { 18894 /* 18895 * If we have an SRTT lets update t_rxtcur 18896 * to have the new slop. 18897 */ 18898 RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), 18899 rack_rto_min, rack_rto_max, 18900 rack->r_ctl.timer_slop); 18901 } 18902 break; 18903 case TCP_RACK_PACING_BETA_ECN: 18904 RACK_OPTS_INC(tcp_rack_beta_ecn); 18905 if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) { 18906 /* This only works for newreno. */ 18907 error = EINVAL; 18908 break; 18909 } 18910 if (rack->rc_pacing_cc_set) { 18911 /* 18912 * Set them into the real CC module 18913 * whats in the rack pcb is the old values 18914 * to be used on restoral/ 18915 */ 18916 sopt.sopt_dir = SOPT_SET; 18917 opt.name = CC_NEWRENO_BETA_ECN; 18918 opt.val = optval; 18919 if (CC_ALGO(tp)->ctl_output != NULL) 18920 error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt); 18921 else 18922 error = ENOENT; 18923 } else { 18924 /* 18925 * Not pacing yet so set it into our local 18926 * rack pcb storage. 
18927 */ 18928 rack->r_ctl.rc_saved_beta.beta_ecn = optval; 18929 rack->r_ctl.rc_saved_beta.newreno_flags = CC_NEWRENO_BETA_ECN; 18930 } 18931 break; 18932 case TCP_DEFER_OPTIONS: 18933 RACK_OPTS_INC(tcp_defer_opt); 18934 if (optval) { 18935 if (rack->gp_ready) { 18936 /* Too late */ 18937 error = EINVAL; 18938 break; 18939 } 18940 rack->defer_options = 1; 18941 } else 18942 rack->defer_options = 0; 18943 break; 18944 case TCP_RACK_MEASURE_CNT: 18945 RACK_OPTS_INC(tcp_rack_measure_cnt); 18946 if (optval && (optval <= 0xff)) { 18947 rack->r_ctl.req_measurements = optval; 18948 } else 18949 error = EINVAL; 18950 break; 18951 case TCP_REC_ABC_VAL: 18952 RACK_OPTS_INC(tcp_rec_abc_val); 18953 if (optval > 0) 18954 rack->r_use_labc_for_rec = 1; 18955 else 18956 rack->r_use_labc_for_rec = 0; 18957 break; 18958 case TCP_RACK_ABC_VAL: 18959 RACK_OPTS_INC(tcp_rack_abc_val); 18960 if ((optval > 0) && (optval < 255)) 18961 rack->rc_labc = optval; 18962 else 18963 error = EINVAL; 18964 break; 18965 case TCP_HDWR_UP_ONLY: 18966 RACK_OPTS_INC(tcp_pacing_up_only); 18967 if (optval) 18968 rack->r_up_only = 1; 18969 else 18970 rack->r_up_only = 0; 18971 break; 18972 case TCP_PACING_RATE_CAP: 18973 RACK_OPTS_INC(tcp_pacing_rate_cap); 18974 rack->r_ctl.bw_rate_cap = loptval; 18975 break; 18976 case TCP_RACK_PROFILE: 18977 RACK_OPTS_INC(tcp_profile); 18978 error = rack_set_profile(rack, optval); 18979 break; 18980 case TCP_USE_CMP_ACKS: 18981 RACK_OPTS_INC(tcp_use_cmp_acks); 18982 if ((optval == 0) && (rack->rc_inp->inp_flags2 & INP_MBUF_ACKCMP)) { 18983 /* You can't turn it off once its on! */ 18984 error = EINVAL; 18985 } else if ((optval == 1) && (rack->r_use_cmp_ack == 0)) { 18986 rack->r_use_cmp_ack = 1; 18987 rack->r_mbuf_queue = 1; 18988 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 18989 } 18990 if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) 18991 rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; 18992 break; 18993 case TCP_SHARED_CWND_TIME_LIMIT: 18994 RACK_OPTS_INC(tcp_lscwnd); 18995 if (optval) 18996 rack->r_limit_scw = 1; 18997 else 18998 rack->r_limit_scw = 0; 18999 break; 19000 case TCP_RACK_PACE_TO_FILL: 19001 RACK_OPTS_INC(tcp_fillcw); 19002 if (optval == 0) 19003 rack->rc_pace_to_cwnd = 0; 19004 else { 19005 rack->rc_pace_to_cwnd = 1; 19006 if (optval > 1) 19007 rack->r_fill_less_agg = 1; 19008 } 19009 if ((optval >= rack_gp_rtt_maxmul) && 19010 rack_gp_rtt_maxmul && 19011 (optval < 0xf)) { 19012 rack->rc_pace_fill_if_rttin_range = 1; 19013 rack->rtt_limit_mul = optval; 19014 } else { 19015 rack->rc_pace_fill_if_rttin_range = 0; 19016 rack->rtt_limit_mul = 0; 19017 } 19018 break; 19019 case TCP_RACK_NO_PUSH_AT_MAX: 19020 RACK_OPTS_INC(tcp_npush); 19021 if (optval == 0) 19022 rack->r_ctl.rc_no_push_at_mrtt = 0; 19023 else if (optval < 0xff) 19024 rack->r_ctl.rc_no_push_at_mrtt = optval; 19025 else 19026 error = EINVAL; 19027 break; 19028 case TCP_SHARED_CWND_ENABLE: 19029 RACK_OPTS_INC(tcp_rack_scwnd); 19030 if (optval == 0) 19031 rack->rack_enable_scwnd = 0; 19032 else 19033 rack->rack_enable_scwnd = 1; 19034 break; 19035 case TCP_RACK_MBUF_QUEUE: 19036 /* Now do we use the LRO mbuf-queue feature */ 19037 RACK_OPTS_INC(tcp_rack_mbufq); 19038 if (optval || rack->r_use_cmp_ack) 19039 rack->r_mbuf_queue = 1; 19040 else 19041 rack->r_mbuf_queue = 0; 19042 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 19043 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 19044 else 19045 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 19046 break; 19047 case 
TCP_RACK_NONRXT_CFG_RATE: 19048 RACK_OPTS_INC(tcp_rack_cfg_rate); 19049 if (optval == 0) 19050 rack->rack_rec_nonrxt_use_cr = 0; 19051 else 19052 rack->rack_rec_nonrxt_use_cr = 1; 19053 break; 19054 case TCP_NO_PRR: 19055 RACK_OPTS_INC(tcp_rack_noprr); 19056 if (optval == 0) 19057 rack->rack_no_prr = 0; 19058 else if (optval == 1) 19059 rack->rack_no_prr = 1; 19060 else if (optval == 2) 19061 rack->no_prr_addback = 1; 19062 else 19063 error = EINVAL; 19064 break; 19065 case TCP_TIMELY_DYN_ADJ: 19066 RACK_OPTS_INC(tcp_timely_dyn); 19067 if (optval == 0) 19068 rack->rc_gp_dyn_mul = 0; 19069 else { 19070 rack->rc_gp_dyn_mul = 1; 19071 if (optval >= 100) { 19072 /* 19073 * If the user sets something 100 or more 19074 * its the gp_ca value. 19075 */ 19076 rack->r_ctl.rack_per_of_gp_ca = optval; 19077 } 19078 } 19079 break; 19080 case TCP_RACK_DO_DETECTION: 19081 RACK_OPTS_INC(tcp_rack_do_detection); 19082 if (optval == 0) 19083 rack->do_detection = 0; 19084 else 19085 rack->do_detection = 1; 19086 break; 19087 case TCP_RACK_TLP_USE: 19088 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) { 19089 error = EINVAL; 19090 break; 19091 } 19092 RACK_OPTS_INC(tcp_tlp_use); 19093 rack->rack_tlp_threshold_use = optval; 19094 break; 19095 case TCP_RACK_TLP_REDUCE: 19096 /* RACK TLP cwnd reduction (bool) */ 19097 RACK_OPTS_INC(tcp_rack_tlp_reduce); 19098 rack->r_ctl.rc_tlp_cwnd_reduce = optval; 19099 break; 19100 /* Pacing related ones */ 19101 case TCP_RACK_PACE_ALWAYS: 19102 /* 19103 * zero is old rack method, 1 is new 19104 * method using a pacing rate. 19105 */ 19106 RACK_OPTS_INC(tcp_rack_pace_always); 19107 if (optval > 0) { 19108 if (rack->rc_always_pace) { 19109 error = EALREADY; 19110 break; 19111 } else if (tcp_can_enable_pacing()) { 19112 rack->rc_always_pace = 1; 19113 if (rack->use_fixed_rate || rack->gp_ready) 19114 rack_set_cc_pacing(rack); 19115 } 19116 else { 19117 error = ENOSPC; 19118 break; 19119 } 19120 } else { 19121 if (rack->rc_always_pace) { 19122 tcp_decrement_paced_conn(); 19123 rack->rc_always_pace = 0; 19124 rack_undo_cc_pacing(rack); 19125 } 19126 } 19127 if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) 19128 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 19129 else 19130 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 19131 /* A rate may be set irate or other, if so set seg size */ 19132 rack_update_seg(rack); 19133 break; 19134 case TCP_BBR_RACK_INIT_RATE: 19135 RACK_OPTS_INC(tcp_initial_rate); 19136 val = optval; 19137 /* Change from kbits per second to bytes per second */ 19138 val *= 1000; 19139 val /= 8; 19140 rack->r_ctl.init_rate = val; 19141 if (rack->rc_init_win != rack_default_init_window) { 19142 uint32_t win, snt; 19143 19144 /* 19145 * Options don't always get applied 19146 * in the order you think. So in order 19147 * to assure we update a cwnd we need 19148 * to check and see if we are still 19149 * where we should raise the cwnd. 
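             * (As a worked example with a hypothetical setting: an optval
             * of 10000, i.e. 10 Mbit/s, becomes 10000 * 1000 / 8 =
             * 1,250,000 bytes per second in init_rate after the
             * conversion above.)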
             */
            win = rc_init_window(rack);
            if (SEQ_GT(tp->snd_max, tp->iss))
                snt = tp->snd_max - tp->iss;
            else
                snt = 0;
            if ((snt < win) &&
                (tp->snd_cwnd < win))
                tp->snd_cwnd = win;
        }
        if (rack->rc_always_pace)
            rack_update_seg(rack);
        break;
    case TCP_BBR_IWINTSO:
        RACK_OPTS_INC(tcp_initial_win);
        if (optval && (optval <= 0xff)) {
            uint32_t win, snt;

            rack->rc_init_win = optval;
            win = rc_init_window(rack);
            if (SEQ_GT(tp->snd_max, tp->iss))
                snt = tp->snd_max - tp->iss;
            else
                snt = 0;
            if ((snt < win) &&
                (tp->t_srtt |
#ifdef NETFLIX_PEAKRATE
                 tp->t_maxpeakrate |
#endif
                 rack->r_ctl.init_rate)) {
                /*
                 * We are not past the initial window
                 * and we have some bases for pacing,
                 * so we need to possibly adjust up
                 * the cwnd. Note even if we don't set
                 * the cwnd, it's still ok to raise the rc_init_win
                 * which can be used coming out of idle when we
                 * would have a rate.
                 */
                if (tp->snd_cwnd < win)
                    tp->snd_cwnd = win;
            }
            if (rack->rc_always_pace)
                rack_update_seg(rack);
        } else
            error = EINVAL;
        break;
    case TCP_RACK_FORCE_MSEG:
        RACK_OPTS_INC(tcp_rack_force_max_seg);
        if (optval)
            rack->rc_force_max_seg = 1;
        else
            rack->rc_force_max_seg = 0;
        break;
    case TCP_RACK_PACE_MAX_SEG:
        /* Max segment size in a pace, in bytes */
        RACK_OPTS_INC(tcp_rack_max_seg);
        rack->rc_user_set_max_segs = optval;
        rack_set_pace_segments(tp, rack, __LINE__, NULL);
        break;
    case TCP_RACK_PACE_RATE_REC:
        /* Set the fixed pacing rate in bytes per second for recovery */
        RACK_OPTS_INC(tcp_rack_pace_rate_rec);
        rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
        if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0)
            rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
        if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0)
            rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
        rack->use_fixed_rate = 1;
        if (rack->rc_always_pace)
            rack_set_cc_pacing(rack);
        rack_log_pacing_delay_calc(rack,
            rack->r_ctl.rc_fixed_pacing_rate_ss,
            rack->r_ctl.rc_fixed_pacing_rate_ca,
            rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8,
            __LINE__, NULL);
        break;

    case TCP_RACK_PACE_RATE_SS:
        /* Set the fixed pacing rate in bytes per second for slow start */
        RACK_OPTS_INC(tcp_rack_pace_rate_ss);
        rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
        if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0)
            rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
        if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0)
            rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
        rack->use_fixed_rate = 1;
        if (rack->rc_always_pace)
            rack_set_cc_pacing(rack);
        rack_log_pacing_delay_calc(rack,
            rack->r_ctl.rc_fixed_pacing_rate_ss,
            rack->r_ctl.rc_fixed_pacing_rate_ca,
            rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8,
            __LINE__, NULL);
        break;

    case TCP_RACK_PACE_RATE_CA:
        /* Set the fixed pacing rate in bytes per second for congestion avoidance */
        RACK_OPTS_INC(tcp_rack_pace_rate_ca);
        rack->r_ctl.rc_fixed_pacing_rate_ca = optval;
        if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0)
            rack->r_ctl.rc_fixed_pacing_rate_ss = optval;
        if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0)
            rack->r_ctl.rc_fixed_pacing_rate_rec = optval;
        rack->use_fixed_rate = 1;
        if
(rack->rc_always_pace) 19256 rack_set_cc_pacing(rack); 19257 rack_log_pacing_delay_calc(rack, 19258 rack->r_ctl.rc_fixed_pacing_rate_ss, 19259 rack->r_ctl.rc_fixed_pacing_rate_ca, 19260 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 19261 __LINE__, NULL); 19262 break; 19263 case TCP_RACK_GP_INCREASE_REC: 19264 RACK_OPTS_INC(tcp_gp_inc_rec); 19265 rack->r_ctl.rack_per_of_gp_rec = optval; 19266 rack_log_pacing_delay_calc(rack, 19267 rack->r_ctl.rack_per_of_gp_ss, 19268 rack->r_ctl.rack_per_of_gp_ca, 19269 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 19270 __LINE__, NULL); 19271 break; 19272 case TCP_RACK_GP_INCREASE_CA: 19273 RACK_OPTS_INC(tcp_gp_inc_ca); 19274 ca = optval; 19275 if (ca < 100) { 19276 /* 19277 * We don't allow any reduction 19278 * over the GP b/w. 19279 */ 19280 error = EINVAL; 19281 break; 19282 } 19283 rack->r_ctl.rack_per_of_gp_ca = ca; 19284 rack_log_pacing_delay_calc(rack, 19285 rack->r_ctl.rack_per_of_gp_ss, 19286 rack->r_ctl.rack_per_of_gp_ca, 19287 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 19288 __LINE__, NULL); 19289 break; 19290 case TCP_RACK_GP_INCREASE_SS: 19291 RACK_OPTS_INC(tcp_gp_inc_ss); 19292 ss = optval; 19293 if (ss < 100) { 19294 /* 19295 * We don't allow any reduction 19296 * over the GP b/w. 19297 */ 19298 error = EINVAL; 19299 break; 19300 } 19301 rack->r_ctl.rack_per_of_gp_ss = ss; 19302 rack_log_pacing_delay_calc(rack, 19303 rack->r_ctl.rack_per_of_gp_ss, 19304 rack->r_ctl.rack_per_of_gp_ca, 19305 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 19306 __LINE__, NULL); 19307 break; 19308 case TCP_RACK_RR_CONF: 19309 RACK_OPTS_INC(tcp_rack_rrr_no_conf_rate); 19310 if (optval && optval <= 3) 19311 rack->r_rr_config = optval; 19312 else 19313 rack->r_rr_config = 0; 19314 break; 19315 case TCP_HDWR_RATE_CAP: 19316 RACK_OPTS_INC(tcp_hdwr_rate_cap); 19317 if (optval) { 19318 if (rack->r_rack_hw_rate_caps == 0) 19319 rack->r_rack_hw_rate_caps = 1; 19320 else 19321 error = EALREADY; 19322 } else { 19323 rack->r_rack_hw_rate_caps = 0; 19324 } 19325 break; 19326 case TCP_BBR_HDWR_PACE: 19327 RACK_OPTS_INC(tcp_hdwr_pacing); 19328 if (optval){ 19329 if (rack->rack_hdrw_pacing == 0) { 19330 rack->rack_hdw_pace_ena = 1; 19331 rack->rack_attempt_hdwr_pace = 0; 19332 } else 19333 error = EALREADY; 19334 } else { 19335 rack->rack_hdw_pace_ena = 0; 19336 #ifdef RATELIMIT 19337 if (rack->r_ctl.crte != NULL) { 19338 rack->rack_hdrw_pacing = 0; 19339 rack->rack_attempt_hdwr_pace = 0; 19340 tcp_rel_pacing_rate(rack->r_ctl.crte, tp); 19341 rack->r_ctl.crte = NULL; 19342 } 19343 #endif 19344 } 19345 break; 19346 /* End Pacing related ones */ 19347 case TCP_RACK_PRR_SENDALOT: 19348 /* Allow PRR to send more than one seg */ 19349 RACK_OPTS_INC(tcp_rack_prr_sendalot); 19350 rack->r_ctl.rc_prr_sendalot = optval; 19351 break; 19352 case TCP_RACK_MIN_TO: 19353 /* Minimum time between rack t-o's in ms */ 19354 RACK_OPTS_INC(tcp_rack_min_to); 19355 rack->r_ctl.rc_min_to = optval; 19356 break; 19357 case TCP_RACK_EARLY_SEG: 19358 /* If early recovery max segments */ 19359 RACK_OPTS_INC(tcp_rack_early_seg); 19360 rack->r_ctl.rc_early_recovery_segs = optval; 19361 break; 19362 case TCP_RACK_REORD_THRESH: 19363 /* RACK reorder threshold (shift amount) */ 19364 RACK_OPTS_INC(tcp_rack_reord_thresh); 19365 if ((optval > 0) && (optval < 31)) 19366 rack->r_ctl.rc_reorder_shift = optval; 19367 else 19368 error = EINVAL; 19369 break; 19370 case TCP_RACK_REORD_FADE: 19371 /* Does reordering fade after ms time */ 19372 RACK_OPTS_INC(tcp_rack_reord_fade); 19373 rack->r_ctl.rc_reorder_fade = optval; 19374 
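        /*
         * (The fade value above is a time in milliseconds; the companion
         * TCP_RACK_REORD_THRESH option is a shift amount, so a setting of
         * 3, for example, corresponds to roughly srtt/8 being added to
         * the reordering window while reordering has recently been seen.)
         */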
        break;
    case TCP_RACK_TLP_THRESH:
        /* RACK TLP threshold i.e. srtt+(srtt/N) */
        RACK_OPTS_INC(tcp_rack_tlp_thresh);
        if (optval)
            rack->r_ctl.rc_tlp_threshold = optval;
        else
            error = EINVAL;
        break;
    case TCP_BBR_USE_RACK_RR:
        RACK_OPTS_INC(tcp_rack_rr);
        if (optval)
            rack->use_rack_rr = 1;
        else
            rack->use_rack_rr = 0;
        break;
    case TCP_FAST_RSM_HACK:
        RACK_OPTS_INC(tcp_rack_fastrsm_hack);
        if (optval)
            rack->fast_rsm_hack = 1;
        else
            rack->fast_rsm_hack = 0;
        break;
    case TCP_RACK_PKT_DELAY:
        /* RACK added ms i.e. rack-rtt + reord + N */
        RACK_OPTS_INC(tcp_rack_pkt_delay);
        rack->r_ctl.rc_pkt_delay = optval;
        break;
    case TCP_DELACK:
        RACK_OPTS_INC(tcp_rack_delayed_ack);
        if (optval == 0)
            tp->t_delayed_ack = 0;
        else
            tp->t_delayed_ack = 1;
        if (tp->t_flags & TF_DELACK) {
            tp->t_flags &= ~TF_DELACK;
            tp->t_flags |= TF_ACKNOW;
            NET_EPOCH_ENTER(et);
            rack_output(tp);
            NET_EPOCH_EXIT(et);
        }
        break;

    case TCP_BBR_RACK_RTT_USE:
        RACK_OPTS_INC(tcp_rack_rtt_use);
        if ((optval != USE_RTT_HIGH) &&
            (optval != USE_RTT_LOW) &&
            (optval != USE_RTT_AVG))
            error = EINVAL;
        else
            rack->r_ctl.rc_rate_sample_method = optval;
        break;
    case TCP_DATA_AFTER_CLOSE:
        RACK_OPTS_INC(tcp_data_after_close);
        if (optval)
            rack->rc_allow_data_af_clo = 1;
        else
            rack->rc_allow_data_af_clo = 0;
        break;
    default:
        break;
    }
#ifdef NETFLIX_STATS
    tcp_log_socket_option(tp, sopt_name, optval, error);
#endif
    return (error);
}

static void
rack_apply_deferred_options(struct tcp_rack *rack)
{
    struct deferred_opt_list *dol, *sdol;
    uint32_t s_optval;

    TAILQ_FOREACH_SAFE(dol, &rack->r_ctl.opt_list, next, sdol) {
        TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next);
        /* A disadvantage of deferral is that you lose the error return. */
        s_optval = (uint32_t)dol->optval;
        (void)rack_process_option(rack->rc_tp, rack, dol->optname, s_optval, dol->optval);
        free(dol, M_TCPDO);
    }
}

static int
rack_pru_options(struct tcpcb *tp, int flags)
{
    if (flags & PRUS_OOB)
        return (EOPNOTSUPP);
    return (0);
}

static struct tcp_function_block __tcp_rack = {
    .tfb_tcp_block_name = __XSTRING(STACKNAME),
    .tfb_tcp_output = rack_output,
    .tfb_do_queued_segments = ctf_do_queued_segments,
    .tfb_do_segment_nounlock = rack_do_segment_nounlock,
    .tfb_tcp_do_segment = rack_do_segment,
    .tfb_tcp_ctloutput = rack_ctloutput,
    .tfb_tcp_fb_init = rack_init,
    .tfb_tcp_fb_fini = rack_fini,
    .tfb_tcp_timer_stop_all = rack_stopall,
    .tfb_tcp_timer_activate = rack_timer_activate,
    .tfb_tcp_timer_active = rack_timer_active,
    .tfb_tcp_timer_stop = rack_timer_stop,
    .tfb_tcp_rexmit_tmr = rack_remxt_tmr,
    .tfb_tcp_handoff_ok = rack_handoff_ok,
    .tfb_tcp_mtu_chg = rack_mtu_change,
    .tfb_pru_options = rack_pru_options,
};

/*
 * rack_ctloutput() must drop the inpcb lock before performing copyin on
 * socket option arguments. When it re-acquires the lock after the copy, it
 * has to revalidate that the connection is still valid for the socket
 * option.
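 * Concretely, the re-check consists of verifying that the inpcb has not
 * entered TIMEWAIT or been dropped and that tp->t_fb still points at this
 * stack; see the INP_TIMEWAIT/INP_DROPPED and t_fb tests in
 * rack_set_sockopt() below.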
19491 */ 19492 static int 19493 rack_set_sockopt(struct socket *so, struct sockopt *sopt, 19494 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) 19495 { 19496 uint64_t loptval; 19497 int32_t error = 0, optval; 19498 19499 switch (sopt->sopt_name) { 19500 case TCP_RACK_TLP_REDUCE: /* URL:tlp_reduce */ 19501 /* Pacing related ones */ 19502 case TCP_RACK_PACE_ALWAYS: /* URL:pace_always */ 19503 case TCP_BBR_RACK_INIT_RATE: /* URL:irate */ 19504 case TCP_BBR_IWINTSO: /* URL:tso_iwin */ 19505 case TCP_RACK_PACE_MAX_SEG: /* URL:pace_max_seg */ 19506 case TCP_RACK_FORCE_MSEG: /* URL:force_max_seg */ 19507 case TCP_RACK_PACE_RATE_CA: /* URL:pr_ca */ 19508 case TCP_RACK_PACE_RATE_SS: /* URL:pr_ss*/ 19509 case TCP_RACK_PACE_RATE_REC: /* URL:pr_rec */ 19510 case TCP_RACK_GP_INCREASE_CA: /* URL:gp_inc_ca */ 19511 case TCP_RACK_GP_INCREASE_SS: /* URL:gp_inc_ss */ 19512 case TCP_RACK_GP_INCREASE_REC: /* URL:gp_inc_rec */ 19513 case TCP_RACK_RR_CONF: /* URL:rrr_conf */ 19514 case TCP_BBR_HDWR_PACE: /* URL:hdwrpace */ 19515 case TCP_HDWR_RATE_CAP: /* URL: hdwrcap boolean */ 19516 case TCP_PACING_RATE_CAP: /* URL:cap-- used by side-channel */ 19517 case TCP_HDWR_UP_ONLY: /* URL:uponly -- hardware pacing boolean */ 19518 /* End pacing related */ 19519 case TCP_FAST_RSM_HACK: /* URL:frsm_hack */ 19520 case TCP_DELACK: /* URL:delack (in base TCP i.e. tcp_hints along with cc etc ) */ 19521 case TCP_RACK_PRR_SENDALOT: /* URL:prr_sendalot */ 19522 case TCP_RACK_MIN_TO: /* URL:min_to */ 19523 case TCP_RACK_EARLY_SEG: /* URL:early_seg */ 19524 case TCP_RACK_REORD_THRESH: /* URL:reord_thresh */ 19525 case TCP_RACK_REORD_FADE: /* URL:reord_fade */ 19526 case TCP_RACK_TLP_THRESH: /* URL:tlp_thresh */ 19527 case TCP_RACK_PKT_DELAY: /* URL:pkt_delay */ 19528 case TCP_RACK_TLP_USE: /* URL:tlp_use */ 19529 case TCP_BBR_RACK_RTT_USE: /* URL:rttuse */ 19530 case TCP_BBR_USE_RACK_RR: /* URL:rackrr */ 19531 case TCP_RACK_DO_DETECTION: /* URL:detect */ 19532 case TCP_NO_PRR: /* URL:noprr */ 19533 case TCP_TIMELY_DYN_ADJ: /* URL:dynamic */ 19534 case TCP_DATA_AFTER_CLOSE: /* no URL */ 19535 case TCP_RACK_NONRXT_CFG_RATE: /* URL:nonrxtcr */ 19536 case TCP_SHARED_CWND_ENABLE: /* URL:scwnd */ 19537 case TCP_RACK_MBUF_QUEUE: /* URL:mqueue */ 19538 case TCP_RACK_NO_PUSH_AT_MAX: /* URL:npush */ 19539 case TCP_RACK_PACE_TO_FILL: /* URL:fillcw */ 19540 case TCP_SHARED_CWND_TIME_LIMIT: /* URL:lscwnd */ 19541 case TCP_RACK_PROFILE: /* URL:profile */ 19542 case TCP_USE_CMP_ACKS: /* URL:cmpack */ 19543 case TCP_RACK_ABC_VAL: /* URL:labc */ 19544 case TCP_REC_ABC_VAL: /* URL:reclabc */ 19545 case TCP_RACK_MEASURE_CNT: /* URL:measurecnt */ 19546 case TCP_DEFER_OPTIONS: /* URL:defer */ 19547 case TCP_RACK_PACING_BETA: /* URL:pacing_beta */ 19548 case TCP_RACK_PACING_BETA_ECN: /* URL:pacing_beta_ecn */ 19549 case TCP_RACK_TIMER_SLOP: /* URL:timer_slop */ 19550 break; 19551 default: 19552 /* Filter off all unknown options to the base stack */ 19553 return (tcp_default_ctloutput(so, sopt, inp, tp)); 19554 break; 19555 } 19556 INP_WUNLOCK(inp); 19557 if (sopt->sopt_name == TCP_PACING_RATE_CAP) { 19558 error = sooptcopyin(sopt, &loptval, sizeof(loptval), sizeof(loptval)); 19559 /* 19560 * We truncate it down to 32 bits for the socket-option trace this 19561 * means rates > 34Gbps won't show right, but thats probably ok. 
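         * (2^32 bytes per second is about 4.29 GB/s, i.e. roughly
         * 34.4 Gbit/s, which is where the truncation starts to matter.)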
19562 */ 19563 optval = (uint32_t)loptval; 19564 } else { 19565 error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); 19566 /* Save it in 64 bit form too */ 19567 loptval = optval; 19568 } 19569 if (error) 19570 return (error); 19571 INP_WLOCK(inp); 19572 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 19573 INP_WUNLOCK(inp); 19574 return (ECONNRESET); 19575 } 19576 if (tp->t_fb != &__tcp_rack) { 19577 INP_WUNLOCK(inp); 19578 return (ENOPROTOOPT); 19579 } 19580 if (rack->defer_options && (rack->gp_ready == 0) && 19581 (sopt->sopt_name != TCP_DEFER_OPTIONS) && 19582 (sopt->sopt_name != TCP_RACK_PACING_BETA) && 19583 (sopt->sopt_name != TCP_RACK_PACING_BETA_ECN) && 19584 (sopt->sopt_name != TCP_RACK_MEASURE_CNT)) { 19585 /* Options are beind deferred */ 19586 if (rack_add_deferred_option(rack, sopt->sopt_name, loptval)) { 19587 INP_WUNLOCK(inp); 19588 return (0); 19589 } else { 19590 /* No memory to defer, fail */ 19591 INP_WUNLOCK(inp); 19592 return (ENOMEM); 19593 } 19594 } 19595 error = rack_process_option(tp, rack, sopt->sopt_name, optval, loptval); 19596 INP_WUNLOCK(inp); 19597 return (error); 19598 } 19599 19600 static void 19601 rack_fill_info(struct tcpcb *tp, struct tcp_info *ti) 19602 { 19603 19604 INP_WLOCK_ASSERT(tp->t_inpcb); 19605 bzero(ti, sizeof(*ti)); 19606 19607 ti->tcpi_state = tp->t_state; 19608 if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP)) 19609 ti->tcpi_options |= TCPI_OPT_TIMESTAMPS; 19610 if (tp->t_flags & TF_SACK_PERMIT) 19611 ti->tcpi_options |= TCPI_OPT_SACK; 19612 if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) { 19613 ti->tcpi_options |= TCPI_OPT_WSCALE; 19614 ti->tcpi_snd_wscale = tp->snd_scale; 19615 ti->tcpi_rcv_wscale = tp->rcv_scale; 19616 } 19617 if (tp->t_flags2 & TF2_ECN_PERMIT) 19618 ti->tcpi_options |= TCPI_OPT_ECN; 19619 if (tp->t_flags & TF_FASTOPEN) 19620 ti->tcpi_options |= TCPI_OPT_TFO; 19621 /* still kept in ticks is t_rcvtime */ 19622 ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick; 19623 /* Since we hold everything in precise useconds this is easy */ 19624 ti->tcpi_rtt = tp->t_srtt; 19625 ti->tcpi_rttvar = tp->t_rttvar; 19626 ti->tcpi_rto = tp->t_rxtcur; 19627 ti->tcpi_snd_ssthresh = tp->snd_ssthresh; 19628 ti->tcpi_snd_cwnd = tp->snd_cwnd; 19629 /* 19630 * FreeBSD-specific extension fields for tcp_info. 19631 */ 19632 ti->tcpi_rcv_space = tp->rcv_wnd; 19633 ti->tcpi_rcv_nxt = tp->rcv_nxt; 19634 ti->tcpi_snd_wnd = tp->snd_wnd; 19635 ti->tcpi_snd_bwnd = 0; /* Unused, kept for compat. 
*/ 19636 ti->tcpi_snd_nxt = tp->snd_nxt; 19637 ti->tcpi_snd_mss = tp->t_maxseg; 19638 ti->tcpi_rcv_mss = tp->t_maxseg; 19639 ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack; 19640 ti->tcpi_rcv_ooopack = tp->t_rcvoopack; 19641 ti->tcpi_snd_zerowin = tp->t_sndzerowin; 19642 #ifdef NETFLIX_STATS 19643 ti->tcpi_total_tlp = tp->t_sndtlppack; 19644 ti->tcpi_total_tlp_bytes = tp->t_sndtlpbyte; 19645 memcpy(&ti->tcpi_rxsyninfo, &tp->t_rxsyninfo, sizeof(struct tcpsyninfo)); 19646 #endif 19647 #ifdef TCP_OFFLOAD 19648 if (tp->t_flags & TF_TOE) { 19649 ti->tcpi_options |= TCPI_OPT_TOE; 19650 tcp_offload_tcp_info(tp, ti); 19651 } 19652 #endif 19653 } 19654 19655 static int 19656 rack_get_sockopt(struct socket *so, struct sockopt *sopt, 19657 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) 19658 { 19659 int32_t error, optval; 19660 uint64_t val, loptval; 19661 struct tcp_info ti; 19662 /* 19663 * Because all our options are either boolean or an int, we can just 19664 * pull everything into optval and then unlock and copy. If we ever 19665 * add a option that is not a int, then this will have quite an 19666 * impact to this routine. 19667 */ 19668 error = 0; 19669 switch (sopt->sopt_name) { 19670 case TCP_INFO: 19671 /* First get the info filled */ 19672 rack_fill_info(tp, &ti); 19673 /* Fix up the rtt related fields if needed */ 19674 INP_WUNLOCK(inp); 19675 error = sooptcopyout(sopt, &ti, sizeof ti); 19676 return (error); 19677 /* 19678 * Beta is the congestion control value for NewReno that influences how 19679 * much of a backoff happens when loss is detected. It is normally set 19680 * to 50 for 50% i.e. the cwnd is reduced to 50% of its previous value 19681 * when you exit recovery. 19682 */ 19683 case TCP_RACK_PACING_BETA: 19684 if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) 19685 error = EINVAL; 19686 else if (rack->rc_pacing_cc_set == 0) 19687 optval = rack->r_ctl.rc_saved_beta.beta; 19688 else { 19689 /* 19690 * Reach out into the CC data and report back what 19691 * I have previously set. Yeah it looks hackish but 19692 * we don't want to report the saved values. 19693 */ 19694 if (tp->ccv->cc_data) 19695 optval = ((struct newreno *)tp->ccv->cc_data)->beta; 19696 else 19697 error = EINVAL; 19698 } 19699 break; 19700 /* 19701 * Beta_ecn is the congestion control value for NewReno that influences how 19702 * much of a backoff happens when a ECN mark is detected. It is normally set 19703 * to 80 for 80% i.e. the cwnd is reduced by 20% of its previous value when 19704 * you exit recovery. Note that classic ECN has a beta of 50, it is only 19705 * ABE Ecn that uses this "less" value, but we do too with pacing :) 19706 */ 19707 19708 case TCP_RACK_PACING_BETA_ECN: 19709 if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) 19710 error = EINVAL; 19711 else if (rack->rc_pacing_cc_set == 0) 19712 optval = rack->r_ctl.rc_saved_beta.beta_ecn; 19713 else { 19714 /* 19715 * Reach out into the CC data and report back what 19716 * I have previously set. Yeah it looks hackish but 19717 * we don't want to report the saved values. 
19718 */ 19719 if (tp->ccv->cc_data) 19720 optval = ((struct newreno *)tp->ccv->cc_data)->beta_ecn; 19721 else 19722 error = EINVAL; 19723 } 19724 break; 19725 case TCP_FAST_RSM_HACK: 19726 optval = rack->fast_rsm_hack; 19727 break; 19728 case TCP_DEFER_OPTIONS: 19729 optval = rack->defer_options; 19730 break; 19731 case TCP_RACK_MEASURE_CNT: 19732 optval = rack->r_ctl.req_measurements; 19733 break; 19734 case TCP_REC_ABC_VAL: 19735 optval = rack->r_use_labc_for_rec; 19736 break; 19737 case TCP_RACK_ABC_VAL: 19738 optval = rack->rc_labc; 19739 break; 19740 case TCP_HDWR_UP_ONLY: 19741 optval= rack->r_up_only; 19742 break; 19743 case TCP_PACING_RATE_CAP: 19744 loptval = rack->r_ctl.bw_rate_cap; 19745 break; 19746 case TCP_RACK_PROFILE: 19747 /* You cannot retrieve a profile, its write only */ 19748 error = EINVAL; 19749 break; 19750 case TCP_USE_CMP_ACKS: 19751 optval = rack->r_use_cmp_ack; 19752 break; 19753 case TCP_RACK_PACE_TO_FILL: 19754 optval = rack->rc_pace_to_cwnd; 19755 if (optval && rack->r_fill_less_agg) 19756 optval++; 19757 break; 19758 case TCP_RACK_NO_PUSH_AT_MAX: 19759 optval = rack->r_ctl.rc_no_push_at_mrtt; 19760 break; 19761 case TCP_SHARED_CWND_ENABLE: 19762 optval = rack->rack_enable_scwnd; 19763 break; 19764 case TCP_RACK_NONRXT_CFG_RATE: 19765 optval = rack->rack_rec_nonrxt_use_cr; 19766 break; 19767 case TCP_NO_PRR: 19768 if (rack->rack_no_prr == 1) 19769 optval = 1; 19770 else if (rack->no_prr_addback == 1) 19771 optval = 2; 19772 else 19773 optval = 0; 19774 break; 19775 case TCP_RACK_DO_DETECTION: 19776 optval = rack->do_detection; 19777 break; 19778 case TCP_RACK_MBUF_QUEUE: 19779 /* Now do we use the LRO mbuf-queue feature */ 19780 optval = rack->r_mbuf_queue; 19781 break; 19782 case TCP_TIMELY_DYN_ADJ: 19783 optval = rack->rc_gp_dyn_mul; 19784 break; 19785 case TCP_BBR_IWINTSO: 19786 optval = rack->rc_init_win; 19787 break; 19788 case TCP_RACK_TLP_REDUCE: 19789 /* RACK TLP cwnd reduction (bool) */ 19790 optval = rack->r_ctl.rc_tlp_cwnd_reduce; 19791 break; 19792 case TCP_BBR_RACK_INIT_RATE: 19793 val = rack->r_ctl.init_rate; 19794 /* convert to kbits per sec */ 19795 val *= 8; 19796 val /= 1000; 19797 optval = (uint32_t)val; 19798 break; 19799 case TCP_RACK_FORCE_MSEG: 19800 optval = rack->rc_force_max_seg; 19801 break; 19802 case TCP_RACK_PACE_MAX_SEG: 19803 /* Max segments in a pace */ 19804 optval = rack->rc_user_set_max_segs; 19805 break; 19806 case TCP_RACK_PACE_ALWAYS: 19807 /* Use the always pace method */ 19808 optval = rack->rc_always_pace; 19809 break; 19810 case TCP_RACK_PRR_SENDALOT: 19811 /* Allow PRR to send more than one seg */ 19812 optval = rack->r_ctl.rc_prr_sendalot; 19813 break; 19814 case TCP_RACK_MIN_TO: 19815 /* Minimum time between rack t-o's in ms */ 19816 optval = rack->r_ctl.rc_min_to; 19817 break; 19818 case TCP_RACK_EARLY_SEG: 19819 /* If early recovery max segments */ 19820 optval = rack->r_ctl.rc_early_recovery_segs; 19821 break; 19822 case TCP_RACK_REORD_THRESH: 19823 /* RACK reorder threshold (shift amount) */ 19824 optval = rack->r_ctl.rc_reorder_shift; 19825 break; 19826 case TCP_RACK_REORD_FADE: 19827 /* Does reordering fade after ms time */ 19828 optval = rack->r_ctl.rc_reorder_fade; 19829 break; 19830 case TCP_BBR_USE_RACK_RR: 19831 /* Do we use the rack cheat for rxt */ 19832 optval = rack->use_rack_rr; 19833 break; 19834 case TCP_RACK_RR_CONF: 19835 optval = rack->r_rr_config; 19836 break; 19837 case TCP_HDWR_RATE_CAP: 19838 optval = rack->r_rack_hw_rate_caps; 19839 break; 19840 case TCP_BBR_HDWR_PACE: 19841 optval = 
rack->rack_hdw_pace_ena;
        break;
    case TCP_RACK_TLP_THRESH:
        /* RACK TLP threshold i.e. srtt+(srtt/N) */
        optval = rack->r_ctl.rc_tlp_threshold;
        break;
    case TCP_RACK_PKT_DELAY:
        /* RACK added ms i.e. rack-rtt + reord + N */
        optval = rack->r_ctl.rc_pkt_delay;
        break;
    case TCP_RACK_TLP_USE:
        optval = rack->rack_tlp_threshold_use;
        break;
    case TCP_RACK_PACE_RATE_CA:
        optval = rack->r_ctl.rc_fixed_pacing_rate_ca;
        break;
    case TCP_RACK_PACE_RATE_SS:
        optval = rack->r_ctl.rc_fixed_pacing_rate_ss;
        break;
    case TCP_RACK_PACE_RATE_REC:
        optval = rack->r_ctl.rc_fixed_pacing_rate_rec;
        break;
    case TCP_RACK_GP_INCREASE_SS:
        optval = rack->r_ctl.rack_per_of_gp_ss;
        break;
    case TCP_RACK_GP_INCREASE_CA:
        optval = rack->r_ctl.rack_per_of_gp_ca;
        break;
    case TCP_BBR_RACK_RTT_USE:
        optval = rack->r_ctl.rc_rate_sample_method;
        break;
    case TCP_DELACK:
        optval = tp->t_delayed_ack;
        break;
    case TCP_DATA_AFTER_CLOSE:
        optval = rack->rc_allow_data_af_clo;
        break;
    case TCP_SHARED_CWND_TIME_LIMIT:
        optval = rack->r_limit_scw;
        break;
    case TCP_RACK_TIMER_SLOP:
        optval = rack->r_ctl.timer_slop;
        break;
    default:
        return (tcp_default_ctloutput(so, sopt, inp, tp));
        break;
    }
    INP_WUNLOCK(inp);
    if (error == 0) {
        if (sopt->sopt_name == TCP_PACING_RATE_CAP)
            error = sooptcopyout(sopt, &loptval, sizeof loptval);
        else
            error = sooptcopyout(sopt, &optval, sizeof optval);
    }
    return (error);
}

static int
rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
{
    int32_t error = EINVAL;
    struct tcp_rack *rack;

    rack = (struct tcp_rack *)tp->t_fb_ptr;
    if (rack == NULL) {
        /* Huh?
*/ 19907 goto out; 19908 } 19909 if (sopt->sopt_dir == SOPT_SET) { 19910 return (rack_set_sockopt(so, sopt, inp, tp, rack)); 19911 } else if (sopt->sopt_dir == SOPT_GET) { 19912 return (rack_get_sockopt(so, sopt, inp, tp, rack)); 19913 } 19914 out: 19915 INP_WUNLOCK(inp); 19916 return (error); 19917 } 19918 19919 static const char *rack_stack_names[] = { 19920 __XSTRING(STACKNAME), 19921 #ifdef STACKALIAS 19922 __XSTRING(STACKALIAS), 19923 #endif 19924 }; 19925 19926 static int 19927 rack_ctor(void *mem, int32_t size, void *arg, int32_t how) 19928 { 19929 memset(mem, 0, size); 19930 return (0); 19931 } 19932 19933 static void 19934 rack_dtor(void *mem, int32_t size, void *arg) 19935 { 19936 19937 } 19938 19939 static bool rack_mod_inited = false; 19940 19941 static int 19942 tcp_addrack(module_t mod, int32_t type, void *data) 19943 { 19944 int32_t err = 0; 19945 int num_stacks; 19946 19947 switch (type) { 19948 case MOD_LOAD: 19949 rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map", 19950 sizeof(struct rack_sendmap), 19951 rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); 19952 19953 rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb", 19954 sizeof(struct tcp_rack), 19955 rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); 19956 19957 sysctl_ctx_init(&rack_sysctl_ctx); 19958 rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 19959 SYSCTL_STATIC_CHILDREN(_net_inet_tcp), 19960 OID_AUTO, 19961 #ifdef STACKALIAS 19962 __XSTRING(STACKALIAS), 19963 #else 19964 __XSTRING(STACKNAME), 19965 #endif 19966 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 19967 ""); 19968 if (rack_sysctl_root == NULL) { 19969 printf("Failed to add sysctl node\n"); 19970 err = EFAULT; 19971 goto free_uma; 19972 } 19973 rack_init_sysctls(); 19974 num_stacks = nitems(rack_stack_names); 19975 err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK, 19976 rack_stack_names, &num_stacks); 19977 if (err) { 19978 printf("Failed to register %s stack name for " 19979 "%s module\n", rack_stack_names[num_stacks], 19980 __XSTRING(MODNAME)); 19981 sysctl_ctx_free(&rack_sysctl_ctx); 19982 free_uma: 19983 uma_zdestroy(rack_zone); 19984 uma_zdestroy(rack_pcb_zone); 19985 rack_counter_destroy(); 19986 printf("Failed to register rack module -- err:%d\n", err); 19987 return (err); 19988 } 19989 tcp_lro_reg_mbufq(); 19990 rack_mod_inited = true; 19991 break; 19992 case MOD_QUIESCE: 19993 err = deregister_tcp_functions(&__tcp_rack, true, false); 19994 break; 19995 case MOD_UNLOAD: 19996 err = deregister_tcp_functions(&__tcp_rack, false, true); 19997 if (err == EBUSY) 19998 break; 19999 if (rack_mod_inited) { 20000 uma_zdestroy(rack_zone); 20001 uma_zdestroy(rack_pcb_zone); 20002 sysctl_ctx_free(&rack_sysctl_ctx); 20003 rack_counter_destroy(); 20004 rack_mod_inited = false; 20005 } 20006 tcp_lro_dereg_mbufq(); 20007 err = 0; 20008 break; 20009 default: 20010 return (EOPNOTSUPP); 20011 } 20012 return (err); 20013 } 20014 20015 static moduledata_t tcp_rack = { 20016 .name = __XSTRING(MODNAME), 20017 .evhand = tcp_addrack, 20018 .priv = 0 20019 }; 20020 20021 MODULE_VERSION(MODNAME, 1); 20022 DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); 20023 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1); 20024
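/*
 * Usage note (comment only, not compiled): once this module is loaded,
 * the stack is selected per connection through the generic TCP
 * function-block socket option rather than anything rack-specific.  A
 * minimal userspace sketch, assuming the stack registered itself under
 * the name "rack" and using a hypothetical connected socket descriptor
 * "fd":
 *
 *    #include <string.h>
 *    #include <netinet/tcp.h>
 *
 *    struct tcp_function_set tfs;
 *
 *    memset(&tfs, 0, sizeof(tfs));
 *    strlcpy(tfs.function_set_name, "rack", sizeof(tfs.function_set_name));
 *    setsockopt(fd, IPPROTO_TCP, TCP_FUNCTION_BLK, &tfs, sizeof(tfs));
 *
 * Alternatively, net.inet.tcp.functions_default can be set to "rack" via
 * sysctl to make it the default stack for new connections.
 */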