/*-
 * Copyright (c) 2016-2020 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"
#include "opt_ratelimit.h"
#include "opt_kern_tls.h"
#include <sys/param.h>
#include <sys/arb.h>
#include <sys/module.h>
#include <sys/kernel.h>
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/socket.h>
#include <sys/socketvar.h>
#ifdef KERN_TLS
#include <sys/ktls.h>
#endif
#include <sys/sysctl.h>
#include <sys/systm.h>
#ifdef STATS
#include <sys/qmath.h>
#include <sys/tree.h>
#include <sys/stats.h>		/* Must come after qmath.h and tree.h */
#else
#include <sys/tree.h>
#endif
#include <sys/refcount.h>
#include <sys/queue.h>
#include <sys/tim_filter.h>
#include <sys/smp.h>
#include <sys/kthread.h>
#include <sys/kern_prefetch.h>
#include <sys/protosw.h>

#include <vm/uma.h>

#include <net/route.h>
#include <net/route/nhop.h>
#include <net/vnet.h>

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#define	TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_ratelimit.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_fastopen.h>
#include <netinet/tcp_lro.h>
#ifdef NETFLIX_SHARED_CWND
#include <netinet/tcp_shared_cwnd.h>
#endif
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif				/* TCPDEBUG */
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif

#include <netipsec/ipsec_support.h>

#if defined(IPSEC) || defined(IPSEC_SUPPORT)
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif				/* IPSEC */

#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <machine/in_cksum.h>

#ifdef MAC
#include <security/mac/mac_framework.h>
#endif
#include "sack_filter.h"
#include "tcp_rack.h"
#include "rack_bbr_common.h"

uma_zone_t rack_zone;
uma_zone_t rack_pcb_zone;

#ifndef TICKS2SBT
#define	TICKS2SBT(__t)	(tick_sbt * ((sbintime_t)(__t)))
#endif

struct sysctl_ctx_list rack_sysctl_ctx;
struct sysctl_oid *rack_sysctl_root;

#define CUM_ACKED 1
#define SACKED 2

/*
 * The RACK module incorporates a number of
 * TCP ideas that have been put out into the IETF
 * over the last few years:
 * - Matt Mathis's Rate Halving which slowly drops
 *   the congestion window so that the ack clock can
 *   be maintained during a recovery.
 * - Yuchung Cheng's RACK TCP (for which it is named) that
 *   stops us from using the number of dup acks and instead
 *   uses time as the gauge of when we retransmit.
 * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
 *   of Dukkipati et al.
 * RACK depends on SACK, so if an endpoint arrives that
 * cannot do SACK the state machine below will shuttle the
 * connection back to using the "default" TCP stack that is
 * in FreeBSD.
 *
 * To implement RACK the original TCP stack was first decomposed
 * into a functional state machine with individual states
 * for each of the possible TCP connection states. The do_segment
 * function's role in life is to mandate that the connection supports
 * SACK initially and then assure that the RACK state matches the
 * connection state before calling the state's do_segment function.
 * Each state is simplified due to the fact that the original do_segment
 * has been decomposed and we *know* what state we are in (no
 * switches on the state) and all tests for SACK are gone. This
 * greatly simplifies what each state does.
 *
 * TCP output is also overwritten with a new version since it
 * must maintain the new rack scoreboard.
 *
 */
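/*
 * Illustrative sketch only (not part of the build): the per-state handlers
 * declared further down (rack_do_established(), rack_do_fin_wait_1(), etc.)
 * are reached through a dispatch of roughly this shape, where
 * rack_set_state() keeps the stack's current-state handler pointer in sync
 * with tp->t_state:
 *
 *	rack_set_state(tp, rack);
 *	retval = (*rack->r_substate)(m, th, so, tp, &to, drop_hdrlen,
 *	    tlen, tiwin, thflags, nxt_pkt, iptos);
 *
 * The field name r_substate and the exact argument list are assumptions
 * here and may differ in the surrounding stack; the point is that each
 * handler runs knowing both that SACK was negotiated and exactly which TCP
 * state it is in, which is what removes the per-segment state switches
 * described above.
 */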
static int32_t rack_tlp_thresh = 1;
static int32_t rack_tlp_limit = 2;	/* No more than 2 TLPs w-out new data */
static int32_t rack_tlp_use_greater = 1;
static int32_t rack_reorder_thresh = 2;
static int32_t rack_reorder_fade = 60000;	/* 0 - never fade, def 60,000
						 * - 60 seconds */
/* Attack threshold detections */
static uint32_t rack_highest_sack_thresh_seen = 0;
static uint32_t rack_highest_move_thresh_seen = 0;

static int32_t rack_pkt_delay = 1;
static int32_t rack_early_recovery = 1;
static int32_t rack_send_a_lot_in_prr = 1;
static int32_t rack_min_to = 1;		/* Number of ms minimum timeout */
static int32_t rack_verbose_logging = 0;
static int32_t rack_ignore_data_after_close = 1;
static int32_t rack_enable_shared_cwnd = 0;
static int32_t rack_limits_scwnd = 1;
static int32_t rack_enable_mqueue_for_nonpaced = 0;
static int32_t rack_disable_prr = 0;
static int32_t use_rack_rr = 1;
static int32_t rack_non_rxt_use_cr = 0;	/* does a non-rxt in recovery use the configured rate (ss/ca)? */
static int32_t rack_persist_min = 250;	/* 250ms */
static int32_t rack_persist_max = 2000;	/* 2 seconds */
static int32_t rack_sack_not_required = 0;	/* set to one to allow non-sack to use rack */
static int32_t rack_hw_tls_max_seg = 3;	/* 3 means use hw-tls single segment */
static int32_t rack_default_init_window = 0;	/* Use system default */
static int32_t rack_limit_time_with_srtt = 0;
static int32_t rack_hw_pace_adjust = 0;
/*
 * Currently regular tcp has a rto_min of 30ms; the backoff
 * goes 12 times, so that ends up being a total of 122.850
 * seconds before a connection is killed.
 */
static uint32_t rack_def_data_window = 20;
static uint32_t rack_goal_bdp = 2;
static uint32_t rack_min_srtts = 1;
static uint32_t rack_min_measure_usec = 0;
static int32_t rack_tlp_min = 10;
static int32_t rack_rto_min = 30;	/* 30ms same as main freebsd */
static int32_t rack_rto_max = 4000;	/* 4 seconds */
static const int32_t rack_free_cache = 2;
static int32_t rack_hptsi_segments = 40;
static int32_t rack_rate_sample_method = USE_RTT_LOW;
static int32_t rack_pace_every_seg = 0;
static int32_t rack_delayed_ack_time = 200;	/* 200ms */
static int32_t rack_slot_reduction = 4;
static int32_t rack_wma_divisor = 8;		/* For WMA calculation */
static int32_t rack_cwnd_block_ends_measure = 0;
static int32_t rack_rwnd_block_ends_measure = 0;

static int32_t rack_lower_cwnd_at_tlp = 0;
static int32_t rack_use_proportional_reduce = 0;
static int32_t rack_proportional_rate = 10;
static int32_t rack_tlp_max_resend = 2;
static int32_t rack_limited_retran = 0;
static int32_t rack_always_send_oldest = 0;
static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;

static uint16_t rack_per_of_gp_ss = 250;	/* 250 % slow-start */
static uint16_t rack_per_of_gp_ca = 200;	/* 200 % congestion-avoidance */
static uint16_t rack_per_of_gp_rec = 200;	/* 200 % of bw */

/* Probertt */
static uint16_t rack_per_of_gp_probertt = 60;	/* 60% of bw */
static uint16_t rack_per_of_gp_lowthresh = 40;	/* 40% is bottom */
static uint16_t rack_per_of_gp_probertt_reduce = 10;	/* 10% reduction */
static uint16_t rack_atexit_prtt_hbp = 130;	/* Clamp to 130% on exit prtt if highly buffered path */
static uint16_t rack_atexit_prtt = 130;		/* Clamp to 130% on exit prtt if non highly buffered path */

static uint32_t rack_max_drain_wait = 2;	/* How many gp srtt's before we give up draining */
static uint32_t rack_must_drain = 1;		/* How many GP srtt's we *must* wait */
static uint32_t rack_probertt_use_min_rtt_entry = 1;	/* Use the min to calculate the goal else gp_srtt */
static uint32_t rack_probertt_use_min_rtt_exit = 0;
static uint32_t rack_probe_rtt_sets_cwnd = 0;
static uint32_t rack_probe_rtt_safety_val = 2000000;	/* No more than 2 sec in probe-rtt */
static uint32_t rack_time_between_probertt = 9600000;	/* 9.6 sec in us */
static uint32_t rack_probertt_gpsrtt_cnt_mul = 0;	/* How many srtt periods does probe-rtt last top fraction */
static uint32_t rack_probertt_gpsrtt_cnt_div = 0;	/* How many srtt periods does probe-rtt last bottom fraction */
static uint32_t rack_min_probertt_hold = 200000;	/* Equal to delayed ack time */
static uint32_t rack_probertt_filter_life = 10000000;
static uint32_t rack_probertt_lower_within = 10;
static uint32_t rack_min_rtt_movement = 250;	/* Must move at least 250 useconds to count as a lowering */
static int32_t rack_pace_one_seg = 0;	/* Shall we pace for less than 1.4Meg 1MSS at a time */
static int32_t rack_probertt_clear_is = 1;
static int32_t rack_max_drain_hbp = 1;	/* Extra drain times gpsrtt for highly buffered paths */
static int32_t rack_hbp_thresh = 3;	/* what is the divisor max_rtt/min_rtt to decide a hbp */


/* Part of pacing */
static int32_t rack_max_per_above = 30;	/* When we go to increment stop if above 100+this% */

/* Timely information */
/* Combining these two gives the range of 'no change' to bw */
/* ie the up/down provide the upper and lower bound */
static int32_t rack_gp_per_bw_mul_up = 2;	/* 2% */
static int32_t rack_gp_per_bw_mul_down = 4;	/* 4% */
static int32_t rack_gp_rtt_maxmul = 3;		/* 3 x maxmin */
static int32_t rack_gp_rtt_minmul = 1;		/* minrtt + (minrtt/mindiv) is lower rtt */
static int32_t rack_gp_rtt_mindiv = 4;		/* minrtt + (minrtt * minmul/mindiv) is lower rtt */
static int32_t rack_gp_decrease_per = 20;	/* 20% decrease in multiplier */
static int32_t rack_gp_increase_per = 2;	/* 2% increase in multiplier */
static int32_t rack_per_lower_bound = 50;	/* Don't allow to drop below this multiplier */
static int32_t rack_per_upper_bound_ss = 0;	/* Don't allow SS to grow above this */
static int32_t rack_per_upper_bound_ca = 0;	/* Don't allow CA to grow above this */
static int32_t rack_do_dyn_mul = 0;		/* Are the rack gp multipliers dynamic */
static int32_t rack_gp_no_rec_chg = 1;		/* Prohibit recovery from reducing its multiplier */
static int32_t rack_timely_dec_clear = 6;	/* Do we clear decrement count at a value (6)? */
static int32_t rack_timely_max_push_rise = 3;	/* One round of pushing */
static int32_t rack_timely_max_push_drop = 3;	/* Three rounds of pushing */
static int32_t rack_timely_min_segs = 4;	/* 4 segment minimum */
static int32_t rack_use_max_for_nobackoff = 0;
static int32_t rack_timely_int_timely_only = 0;	/* do interim timely's only use the timely algo (no b/w changes)? */
static int32_t rack_timely_no_stopping = 0;
static int32_t rack_down_raise_thresh = 100;
static int32_t rack_req_segs = 1;

/* Weird delayed ack mode */
static int32_t rack_use_imac_dack = 0;
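/*
 * Note on units for the tunables above (summarizing their comments and the
 * sysctl descriptions below): the timer defaults (rack_persist_min/max,
 * rack_rto_min/max, rack_delayed_ack_time, rack_tlp_min, rack_min_to) are
 * in milliseconds, the probe-rtt controls (rack_time_between_probertt,
 * rack_probe_rtt_safety_val, rack_min_probertt_hold,
 * rack_probertt_filter_life, rack_min_rtt_movement) are in microseconds,
 * and the rack_per_of_gp_* values are percentages of the measured goodput.
 */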
/* Rack specific counters */
counter_u64_t rack_badfr;
counter_u64_t rack_badfr_bytes;
counter_u64_t rack_rtm_prr_retran;
counter_u64_t rack_rtm_prr_newdata;
counter_u64_t rack_timestamp_mismatch;
counter_u64_t rack_reorder_seen;
counter_u64_t rack_paced_segments;
counter_u64_t rack_unpaced_segments;
counter_u64_t rack_calc_zero;
counter_u64_t rack_calc_nonzero;
counter_u64_t rack_saw_enobuf;
counter_u64_t rack_saw_enetunreach;
counter_u64_t rack_per_timer_hole;

/* Tail loss probe counters */
counter_u64_t rack_tlp_tot;
counter_u64_t rack_tlp_newdata;
counter_u64_t rack_tlp_retran;
counter_u64_t rack_tlp_retran_bytes;
counter_u64_t rack_tlp_retran_fail;
counter_u64_t rack_to_tot;
counter_u64_t rack_to_arm_rack;
counter_u64_t rack_to_arm_tlp;
counter_u64_t rack_to_alloc;
counter_u64_t rack_to_alloc_hard;
counter_u64_t rack_to_alloc_emerg;
counter_u64_t rack_to_alloc_limited;
counter_u64_t rack_alloc_limited_conns;
counter_u64_t rack_split_limited;

counter_u64_t rack_sack_proc_all;
counter_u64_t rack_sack_proc_short;
counter_u64_t rack_sack_proc_restart;
counter_u64_t rack_sack_attacks_detected;
counter_u64_t rack_sack_attacks_reversed;
counter_u64_t rack_sack_used_next_merge;
counter_u64_t rack_sack_splits;
counter_u64_t rack_sack_used_prev_merge;
counter_u64_t rack_sack_skipped_acked;
counter_u64_t rack_ack_total;
counter_u64_t rack_express_sack;
counter_u64_t rack_sack_total;
counter_u64_t rack_move_none;
counter_u64_t rack_move_some;

counter_u64_t rack_used_tlpmethod;
counter_u64_t rack_used_tlpmethod2;
counter_u64_t rack_enter_tlp_calc;
counter_u64_t rack_input_idle_reduces;
counter_u64_t rack_collapsed_win;
counter_u64_t rack_tlp_does_nada;
counter_u64_t rack_try_scwnd;

/* Counters for HW TLS */
counter_u64_t rack_tls_rwnd;
counter_u64_t rack_tls_cwnd;
counter_u64_t rack_tls_app;
counter_u64_t rack_tls_other;
counter_u64_t rack_tls_filled;
counter_u64_t rack_tls_rxt;
counter_u64_t rack_tls_tlp;

/* Temp CPU counters */
counter_u64_t rack_find_high;

counter_u64_t rack_progress_drops;
counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];

static void
rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line);

static int
rack_process_ack(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to,
    uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
static int
rack_process_data(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
static void
rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack,
    struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery);
static struct rack_sendmap *rack_alloc(struct tcp_rack *rack);
static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack,
    uint8_t limit_type);
static struct rack_sendmap *
rack_check_recovery_mode(struct tcpcb *tp,
    uint32_t tsused);
static void
rack_cong_signal(struct tcpcb *tp, struct tcphdr *th,
    uint32_t type);
static void rack_counter_destroy(void);
static int
rack_ctloutput(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp);
static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
static void
rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line);
static void
rack_do_segment(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
    uint8_t iptos);
static void rack_dtor(void *mem, int32_t size, void *arg);
static void
rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
    uint32_t t, uint32_t cts);
static void
rack_log_alt_to_to_cancel(struct tcp_rack *rack,
    uint32_t flex1, uint32_t flex2,
    uint32_t flex3, uint32_t flex4,
    uint32_t flex5, uint32_t flex6,
    uint16_t flex7, uint8_t mod);
static void
rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
    uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line, struct rack_sendmap *rsm);
static struct rack_sendmap *
rack_find_high_nonack(struct tcp_rack *rack,
    struct rack_sendmap *rsm);
static struct rack_sendmap *rack_find_lowest_rsm(struct tcp_rack *rack);
static void rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm);
static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged);
static int
rack_get_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
static void
rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
    tcp_seq th_ack, int line);
static uint32_t
rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss);
static int32_t rack_handoff_ok(struct tcpcb *tp);
static int32_t rack_init(struct tcpcb *tp);
static void rack_init_sysctls(void);
static void
rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
    struct tcphdr *th);
static void
rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
    uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
    uint8_t pass, struct rack_sendmap *hintrsm, uint32_t us_cts);
static void
rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm);
static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm);
static int32_t rack_output(struct tcpcb *tp);

static uint32_t
rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack,
    struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm,
    uint32_t cts, int *moved_two);
static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th);
static void rack_remxt_tmr(struct tcpcb *tp);
static int
rack_set_sockopt(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack);
static int32_t rack_stopall(struct tcpcb *tp);
static void
rack_timer_activate(struct tcpcb *tp, uint32_t timer_type,
    uint32_t delta);
static int32_t rack_timer_active(struct tcpcb *tp, uint32_t timer_type);
static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line);
static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type);
static uint32_t
rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp);
static void
rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint32_t ts);
static int
rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack);
static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
static int
rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_closing(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_established(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_lastack(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_syn_recv(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
static int
rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
    int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos);
struct rack_sendmap *
tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
    uint32_t tsused);
static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt,
    uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt);
static void
tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th);

int32_t rack_clear_counter=0;

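/*
 * sysctl_rack_clear() below backs the "clear" node that is registered at
 * the end of rack_init_sysctls(); writing a 1 to that node zeroes the RACK
 * counters declared above.  The full OID path depends on where
 * rack_sysctl_root is attached (e.g. something like
 * net.inet.tcp.<stack>.clear).
 */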
static int
sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
{
	uint32_t stat;
	int32_t error;

	error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t));
	if (error || req->newptr == NULL)
		return error;

	error = SYSCTL_IN(req, &stat, sizeof(uint32_t));
	if (error)
		return (error);
	if (stat == 1) {
#ifdef INVARIANTS
		printf("Clearing RACK counters\n");
#endif
		counter_u64_zero(rack_badfr);
		counter_u64_zero(rack_badfr_bytes);
		counter_u64_zero(rack_rtm_prr_retran);
		counter_u64_zero(rack_rtm_prr_newdata);
		counter_u64_zero(rack_timestamp_mismatch);
		counter_u64_zero(rack_reorder_seen);
		counter_u64_zero(rack_tlp_tot);
		counter_u64_zero(rack_tlp_newdata);
		counter_u64_zero(rack_tlp_retran);
		counter_u64_zero(rack_tlp_retran_bytes);
		counter_u64_zero(rack_tlp_retran_fail);
		counter_u64_zero(rack_to_tot);
		counter_u64_zero(rack_to_arm_rack);
		counter_u64_zero(rack_to_arm_tlp);
		counter_u64_zero(rack_paced_segments);
		counter_u64_zero(rack_calc_zero);
		counter_u64_zero(rack_calc_nonzero);
		counter_u64_zero(rack_unpaced_segments);
		counter_u64_zero(rack_saw_enobuf);
		counter_u64_zero(rack_saw_enetunreach);
		counter_u64_zero(rack_per_timer_hole);
		counter_u64_zero(rack_to_alloc_hard);
		counter_u64_zero(rack_to_alloc_emerg);
		counter_u64_zero(rack_sack_proc_all);
		counter_u64_zero(rack_sack_proc_short);
		counter_u64_zero(rack_sack_proc_restart);
		counter_u64_zero(rack_to_alloc);
		counter_u64_zero(rack_to_alloc_limited);
		counter_u64_zero(rack_alloc_limited_conns);
		counter_u64_zero(rack_split_limited);
		counter_u64_zero(rack_find_high);
		counter_u64_zero(rack_tls_rwnd);
		counter_u64_zero(rack_tls_cwnd);
		counter_u64_zero(rack_tls_app);
		counter_u64_zero(rack_tls_other);
		counter_u64_zero(rack_tls_filled);
		counter_u64_zero(rack_tls_rxt);
		counter_u64_zero(rack_tls_tlp);
		counter_u64_zero(rack_sack_attacks_detected);
		counter_u64_zero(rack_sack_attacks_reversed);
		counter_u64_zero(rack_sack_used_next_merge);
		counter_u64_zero(rack_sack_used_prev_merge);
		counter_u64_zero(rack_sack_splits);
		counter_u64_zero(rack_sack_skipped_acked);
		counter_u64_zero(rack_ack_total);
		counter_u64_zero(rack_express_sack);
		counter_u64_zero(rack_sack_total);
		counter_u64_zero(rack_move_none);
		counter_u64_zero(rack_move_some);
		counter_u64_zero(rack_used_tlpmethod);
		counter_u64_zero(rack_used_tlpmethod2);
		counter_u64_zero(rack_enter_tlp_calc);
		counter_u64_zero(rack_progress_drops);
		counter_u64_zero(rack_tlp_does_nada);
		counter_u64_zero(rack_try_scwnd);
		counter_u64_zero(rack_collapsed_win);

	}
	rack_clear_counter = 0;
	return (0);
}



static void
rack_init_sysctls(void)
{
	struct sysctl_oid *rack_counters;
	struct sysctl_oid *rack_attack;
	struct sysctl_oid *rack_pacing;
	struct sysctl_oid *rack_timely;
	struct sysctl_oid *rack_timers;
	struct sysctl_oid *rack_tlp;
	struct sysctl_oid *rack_misc;
	struct sysctl_oid *rack_measure;
	struct sysctl_oid *rack_probertt;

	rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "sack_attack",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Rack Sack Attack Counters and Controls");
	rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO,
	    "stats",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
	    "Rack Counters");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "rate_sample_method", CTLFLAG_RW,
	    &rack_rate_sample_method, USE_RTT_LOW,
	    "What method should we use for rate sampling 0=high, 1=low");
	SYSCTL_ADD_S32(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "hw_tlsmax", CTLFLAG_RW,
	    &rack_hw_tls_max_seg, 3,
	    "What is the maximum number of full TLS records that will be sent at once");
	/* Probe rtt
related controls */ 636 rack_probertt = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 637 SYSCTL_CHILDREN(rack_sysctl_root), 638 OID_AUTO, 639 "probertt", 640 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 641 "ProbeRTT related Controls"); 642 SYSCTL_ADD_U16(&rack_sysctl_ctx, 643 SYSCTL_CHILDREN(rack_probertt), 644 OID_AUTO, "exit_per_hpb", CTLFLAG_RW, 645 &rack_atexit_prtt_hbp, 130, 646 "What percentage above goodput do we clamp CA/SS to at exit on high-BDP path 110%"); 647 SYSCTL_ADD_U16(&rack_sysctl_ctx, 648 SYSCTL_CHILDREN(rack_probertt), 649 OID_AUTO, "exit_per_nonhpb", CTLFLAG_RW, 650 &rack_atexit_prtt, 130, 651 "What percentage above goodput do we clamp CA/SS to at exit on a non high-BDP path 100%"); 652 SYSCTL_ADD_U16(&rack_sysctl_ctx, 653 SYSCTL_CHILDREN(rack_probertt), 654 OID_AUTO, "gp_per_mul", CTLFLAG_RW, 655 &rack_per_of_gp_probertt, 60, 656 "What percentage of goodput do we pace at in probertt"); 657 SYSCTL_ADD_U16(&rack_sysctl_ctx, 658 SYSCTL_CHILDREN(rack_probertt), 659 OID_AUTO, "gp_per_reduce", CTLFLAG_RW, 660 &rack_per_of_gp_probertt_reduce, 10, 661 "What percentage of goodput do we reduce every gp_srtt"); 662 SYSCTL_ADD_U16(&rack_sysctl_ctx, 663 SYSCTL_CHILDREN(rack_probertt), 664 OID_AUTO, "gp_per_low", CTLFLAG_RW, 665 &rack_per_of_gp_lowthresh, 40, 666 "What percentage of goodput do we allow the multiplier to fall to"); 667 SYSCTL_ADD_U32(&rack_sysctl_ctx, 668 SYSCTL_CHILDREN(rack_probertt), 669 OID_AUTO, "time_between", CTLFLAG_RW, 670 & rack_time_between_probertt, 96000000, 671 "How many useconds between the lowest rtt falling must past before we enter probertt"); 672 SYSCTL_ADD_U32(&rack_sysctl_ctx, 673 SYSCTL_CHILDREN(rack_probertt), 674 OID_AUTO, "safety", CTLFLAG_RW, 675 &rack_probe_rtt_safety_val, 2000000, 676 "If not zero, provides a maximum usecond that you can stay in probertt (2sec = 2000000)"); 677 SYSCTL_ADD_U32(&rack_sysctl_ctx, 678 SYSCTL_CHILDREN(rack_probertt), 679 OID_AUTO, "sets_cwnd", CTLFLAG_RW, 680 &rack_probe_rtt_sets_cwnd, 0, 681 "Do we set the cwnd too (if always_lower is on)"); 682 SYSCTL_ADD_U32(&rack_sysctl_ctx, 683 SYSCTL_CHILDREN(rack_probertt), 684 OID_AUTO, "maxdrainsrtts", CTLFLAG_RW, 685 &rack_max_drain_wait, 2, 686 "Maximum number of gp_srtt's to hold in drain waiting for flight to reach goal"); 687 SYSCTL_ADD_U32(&rack_sysctl_ctx, 688 SYSCTL_CHILDREN(rack_probertt), 689 OID_AUTO, "mustdrainsrtts", CTLFLAG_RW, 690 &rack_must_drain, 1, 691 "We must drain this many gp_srtt's waiting for flight to reach goal"); 692 SYSCTL_ADD_U32(&rack_sysctl_ctx, 693 SYSCTL_CHILDREN(rack_probertt), 694 OID_AUTO, "goal_use_min_entry", CTLFLAG_RW, 695 &rack_probertt_use_min_rtt_entry, 1, 696 "Should we use the min-rtt to calculate the goal rtt (else gp_srtt) at entry"); 697 SYSCTL_ADD_U32(&rack_sysctl_ctx, 698 SYSCTL_CHILDREN(rack_probertt), 699 OID_AUTO, "goal_use_min_exit", CTLFLAG_RW, 700 &rack_probertt_use_min_rtt_exit, 0, 701 "How to set cwnd at exit, 0 - dynamic, 1 - use min-rtt, 2 - use curgprtt, 3 - entry gp-rtt"); 702 SYSCTL_ADD_U32(&rack_sysctl_ctx, 703 SYSCTL_CHILDREN(rack_probertt), 704 OID_AUTO, "length_div", CTLFLAG_RW, 705 &rack_probertt_gpsrtt_cnt_div, 0, 706 "How many recent goodput srtt periods plus hold tim does probertt last (bottom of fraction)"); 707 SYSCTL_ADD_U32(&rack_sysctl_ctx, 708 SYSCTL_CHILDREN(rack_probertt), 709 OID_AUTO, "length_mul", CTLFLAG_RW, 710 &rack_probertt_gpsrtt_cnt_mul, 0, 711 "How many recent goodput srtt periods plus hold tim does probertt last (top of fraction)"); 712 SYSCTL_ADD_U32(&rack_sysctl_ctx, 713 
SYSCTL_CHILDREN(rack_probertt), 714 OID_AUTO, "holdtim_at_target", CTLFLAG_RW, 715 &rack_min_probertt_hold, 200000, 716 "What is the minimum time we hold probertt at target"); 717 SYSCTL_ADD_U32(&rack_sysctl_ctx, 718 SYSCTL_CHILDREN(rack_probertt), 719 OID_AUTO, "filter_life", CTLFLAG_RW, 720 &rack_probertt_filter_life, 10000000, 721 "What is the time for the filters life in useconds"); 722 SYSCTL_ADD_U32(&rack_sysctl_ctx, 723 SYSCTL_CHILDREN(rack_probertt), 724 OID_AUTO, "lower_within", CTLFLAG_RW, 725 &rack_probertt_lower_within, 10, 726 "If the rtt goes lower within this percentage of the time, go into probe-rtt"); 727 SYSCTL_ADD_U32(&rack_sysctl_ctx, 728 SYSCTL_CHILDREN(rack_probertt), 729 OID_AUTO, "must_move", CTLFLAG_RW, 730 &rack_min_rtt_movement, 250, 731 "How much is the minimum movement in rtt to count as a drop for probertt purposes"); 732 SYSCTL_ADD_U32(&rack_sysctl_ctx, 733 SYSCTL_CHILDREN(rack_probertt), 734 OID_AUTO, "clear_is_cnts", CTLFLAG_RW, 735 &rack_probertt_clear_is, 1, 736 "Do we clear I/S counts on exiting probe-rtt"); 737 SYSCTL_ADD_S32(&rack_sysctl_ctx, 738 SYSCTL_CHILDREN(rack_probertt), 739 OID_AUTO, "hbp_extra_drain", CTLFLAG_RW, 740 &rack_max_drain_hbp, 1, 741 "How many extra drain gpsrtt's do we get in highly buffered paths"); 742 SYSCTL_ADD_S32(&rack_sysctl_ctx, 743 SYSCTL_CHILDREN(rack_probertt), 744 OID_AUTO, "hbp_threshold", CTLFLAG_RW, 745 &rack_hbp_thresh, 3, 746 "We are highly buffered if min_rtt_seen / max_rtt_seen > this-threshold"); 747 /* Pacing related sysctls */ 748 rack_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 749 SYSCTL_CHILDREN(rack_sysctl_root), 750 OID_AUTO, 751 "pacing", 752 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 753 "Pacing related Controls"); 754 SYSCTL_ADD_S32(&rack_sysctl_ctx, 755 SYSCTL_CHILDREN(rack_pacing), 756 OID_AUTO, "max_pace_over", CTLFLAG_RW, 757 &rack_max_per_above, 30, 758 "What is the maximum allowable percentage that we can pace above (so 30 = 130% of our goal)"); 759 SYSCTL_ADD_S32(&rack_sysctl_ctx, 760 SYSCTL_CHILDREN(rack_pacing), 761 OID_AUTO, "pace_to_one", CTLFLAG_RW, 762 &rack_pace_one_seg, 0, 763 "Do we allow low b/w pacing of 1MSS instead of two"); 764 SYSCTL_ADD_S32(&rack_sysctl_ctx, 765 SYSCTL_CHILDREN(rack_pacing), 766 OID_AUTO, "limit_wsrtt", CTLFLAG_RW, 767 &rack_limit_time_with_srtt, 0, 768 "Do we limit pacing time based on srtt"); 769 SYSCTL_ADD_S32(&rack_sysctl_ctx, 770 SYSCTL_CHILDREN(rack_pacing), 771 OID_AUTO, "init_win", CTLFLAG_RW, 772 &rack_default_init_window, 0, 773 "Do we have a rack initial window 0 = system default"); 774 SYSCTL_ADD_U32(&rack_sysctl_ctx, 775 SYSCTL_CHILDREN(rack_pacing), 776 OID_AUTO, "hw_pacing_adjust", CTLFLAG_RW, 777 &rack_hw_pace_adjust, 0, 778 "What percentage do we raise the MSS by (11 = 1.1%)"); 779 SYSCTL_ADD_U16(&rack_sysctl_ctx, 780 SYSCTL_CHILDREN(rack_pacing), 781 OID_AUTO, "gp_per_ss", CTLFLAG_RW, 782 &rack_per_of_gp_ss, 250, 783 "If non zero, what percentage of goodput to pace at in slow start"); 784 SYSCTL_ADD_U16(&rack_sysctl_ctx, 785 SYSCTL_CHILDREN(rack_pacing), 786 OID_AUTO, "gp_per_ca", CTLFLAG_RW, 787 &rack_per_of_gp_ca, 150, 788 "If non zero, what percentage of goodput to pace at in congestion avoidance"); 789 SYSCTL_ADD_U16(&rack_sysctl_ctx, 790 SYSCTL_CHILDREN(rack_pacing), 791 OID_AUTO, "gp_per_rec", CTLFLAG_RW, 792 &rack_per_of_gp_rec, 200, 793 "If non zero, what percentage of goodput to pace at in recovery"); 794 SYSCTL_ADD_S32(&rack_sysctl_ctx, 795 SYSCTL_CHILDREN(rack_pacing), 796 OID_AUTO, "pace_max_seg", CTLFLAG_RW, 797 &rack_hptsi_segments, 40, 798 
"What size is the max for TSO segments in pacing and burst mitigation"); 799 SYSCTL_ADD_S32(&rack_sysctl_ctx, 800 SYSCTL_CHILDREN(rack_pacing), 801 OID_AUTO, "burst_reduces", CTLFLAG_RW, 802 &rack_slot_reduction, 4, 803 "When doing only burst mitigation what is the reduce divisor"); 804 SYSCTL_ADD_S32(&rack_sysctl_ctx, 805 SYSCTL_CHILDREN(rack_sysctl_root), 806 OID_AUTO, "use_pacing", CTLFLAG_RW, 807 &rack_pace_every_seg, 0, 808 "If set we use pacing, if clear we use only the original burst mitigation"); 809 810 rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 811 SYSCTL_CHILDREN(rack_sysctl_root), 812 OID_AUTO, 813 "timely", 814 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 815 "Rack Timely RTT Controls"); 816 /* Timely based GP dynmics */ 817 SYSCTL_ADD_S32(&rack_sysctl_ctx, 818 SYSCTL_CHILDREN(rack_timely), 819 OID_AUTO, "upper", CTLFLAG_RW, 820 &rack_gp_per_bw_mul_up, 2, 821 "Rack timely upper range for equal b/w (in percentage)"); 822 SYSCTL_ADD_S32(&rack_sysctl_ctx, 823 SYSCTL_CHILDREN(rack_timely), 824 OID_AUTO, "lower", CTLFLAG_RW, 825 &rack_gp_per_bw_mul_down, 4, 826 "Rack timely lower range for equal b/w (in percentage)"); 827 SYSCTL_ADD_S32(&rack_sysctl_ctx, 828 SYSCTL_CHILDREN(rack_timely), 829 OID_AUTO, "rtt_max_mul", CTLFLAG_RW, 830 &rack_gp_rtt_maxmul, 3, 831 "Rack timely multipler of lowest rtt for rtt_max"); 832 SYSCTL_ADD_S32(&rack_sysctl_ctx, 833 SYSCTL_CHILDREN(rack_timely), 834 OID_AUTO, "rtt_min_div", CTLFLAG_RW, 835 &rack_gp_rtt_mindiv, 4, 836 "Rack timely divisor used for rtt + (rtt * mul/divisor) for check for lower rtt"); 837 SYSCTL_ADD_S32(&rack_sysctl_ctx, 838 SYSCTL_CHILDREN(rack_timely), 839 OID_AUTO, "rtt_min_mul", CTLFLAG_RW, 840 &rack_gp_rtt_minmul, 1, 841 "Rack timely multiplier used for rtt + (rtt * mul/divisor) for check for lower rtt"); 842 SYSCTL_ADD_S32(&rack_sysctl_ctx, 843 SYSCTL_CHILDREN(rack_timely), 844 OID_AUTO, "decrease", CTLFLAG_RW, 845 &rack_gp_decrease_per, 20, 846 "Rack timely decrease percentage of our GP multiplication factor"); 847 SYSCTL_ADD_S32(&rack_sysctl_ctx, 848 SYSCTL_CHILDREN(rack_timely), 849 OID_AUTO, "increase", CTLFLAG_RW, 850 &rack_gp_increase_per, 2, 851 "Rack timely increase perentage of our GP multiplication factor"); 852 SYSCTL_ADD_S32(&rack_sysctl_ctx, 853 SYSCTL_CHILDREN(rack_timely), 854 OID_AUTO, "lowerbound", CTLFLAG_RW, 855 &rack_per_lower_bound, 50, 856 "Rack timely lowest percentage we allow GP multiplier to fall to"); 857 SYSCTL_ADD_S32(&rack_sysctl_ctx, 858 SYSCTL_CHILDREN(rack_timely), 859 OID_AUTO, "upperboundss", CTLFLAG_RW, 860 &rack_per_upper_bound_ss, 0, 861 "Rack timely higest percentage we allow GP multiplier in SS to raise to (0 is no upperbound)"); 862 SYSCTL_ADD_S32(&rack_sysctl_ctx, 863 SYSCTL_CHILDREN(rack_timely), 864 OID_AUTO, "upperboundca", CTLFLAG_RW, 865 &rack_per_upper_bound_ca, 0, 866 "Rack timely higest percentage we allow GP multiplier to CA raise to (0 is no upperbound)"); 867 SYSCTL_ADD_S32(&rack_sysctl_ctx, 868 SYSCTL_CHILDREN(rack_timely), 869 OID_AUTO, "dynamicgp", CTLFLAG_RW, 870 &rack_do_dyn_mul, 0, 871 "Rack timely do we enable dynmaic timely goodput by default"); 872 SYSCTL_ADD_S32(&rack_sysctl_ctx, 873 SYSCTL_CHILDREN(rack_timely), 874 OID_AUTO, "no_rec_red", CTLFLAG_RW, 875 &rack_gp_no_rec_chg, 1, 876 "Rack timely do we prohibit the recovery multiplier from being lowered"); 877 SYSCTL_ADD_S32(&rack_sysctl_ctx, 878 SYSCTL_CHILDREN(rack_timely), 879 OID_AUTO, "red_clear_cnt", CTLFLAG_RW, 880 &rack_timely_dec_clear, 6, 881 "Rack timely what threshold do we count to before another boost during 
b/w decent"); 882 SYSCTL_ADD_S32(&rack_sysctl_ctx, 883 SYSCTL_CHILDREN(rack_timely), 884 OID_AUTO, "max_push_rise", CTLFLAG_RW, 885 &rack_timely_max_push_rise, 3, 886 "Rack timely how many times do we push up with b/w increase"); 887 SYSCTL_ADD_S32(&rack_sysctl_ctx, 888 SYSCTL_CHILDREN(rack_timely), 889 OID_AUTO, "max_push_drop", CTLFLAG_RW, 890 &rack_timely_max_push_drop, 3, 891 "Rack timely how many times do we push back on b/w decent"); 892 SYSCTL_ADD_S32(&rack_sysctl_ctx, 893 SYSCTL_CHILDREN(rack_timely), 894 OID_AUTO, "min_segs", CTLFLAG_RW, 895 &rack_timely_min_segs, 4, 896 "Rack timely when setting the cwnd what is the min num segments"); 897 SYSCTL_ADD_S32(&rack_sysctl_ctx, 898 SYSCTL_CHILDREN(rack_timely), 899 OID_AUTO, "noback_max", CTLFLAG_RW, 900 &rack_use_max_for_nobackoff, 0, 901 "Rack timely when deciding if to backoff on a loss, do we use under max rtt else min"); 902 SYSCTL_ADD_S32(&rack_sysctl_ctx, 903 SYSCTL_CHILDREN(rack_timely), 904 OID_AUTO, "interim_timely_only", CTLFLAG_RW, 905 &rack_timely_int_timely_only, 0, 906 "Rack timely when doing interim timely's do we only do timely (no b/w consideration)"); 907 SYSCTL_ADD_S32(&rack_sysctl_ctx, 908 SYSCTL_CHILDREN(rack_timely), 909 OID_AUTO, "nonstop", CTLFLAG_RW, 910 &rack_timely_no_stopping, 0, 911 "Rack timely don't stop increase"); 912 SYSCTL_ADD_S32(&rack_sysctl_ctx, 913 SYSCTL_CHILDREN(rack_timely), 914 OID_AUTO, "dec_raise_thresh", CTLFLAG_RW, 915 &rack_down_raise_thresh, 100, 916 "If the CA or SS is below this threshold raise on the first 3 b/w lowers (0=always)"); 917 SYSCTL_ADD_S32(&rack_sysctl_ctx, 918 SYSCTL_CHILDREN(rack_timely), 919 OID_AUTO, "bottom_drag_segs", CTLFLAG_RW, 920 &rack_req_segs, 1, 921 "Bottom dragging if not these many segments outstanding and room"); 922 923 /* TLP and Rack related parameters */ 924 rack_tlp = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 925 SYSCTL_CHILDREN(rack_sysctl_root), 926 OID_AUTO, 927 "tlp", 928 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 929 "TLP and Rack related Controls"); 930 SYSCTL_ADD_S32(&rack_sysctl_ctx, 931 SYSCTL_CHILDREN(rack_tlp), 932 OID_AUTO, "use_rrr", CTLFLAG_RW, 933 &use_rack_rr, 1, 934 "Do we use Rack Rapid Recovery"); 935 SYSCTL_ADD_S32(&rack_sysctl_ctx, 936 SYSCTL_CHILDREN(rack_tlp), 937 OID_AUTO, "nonrxt_use_cr", CTLFLAG_RW, 938 &rack_non_rxt_use_cr, 0, 939 "Do we use ss/ca rate if in recovery we are transmitting a new data chunk"); 940 SYSCTL_ADD_S32(&rack_sysctl_ctx, 941 SYSCTL_CHILDREN(rack_tlp), 942 OID_AUTO, "tlpmethod", CTLFLAG_RW, 943 &rack_tlp_threshold_use, TLP_USE_TWO_ONE, 944 "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2"); 945 SYSCTL_ADD_S32(&rack_sysctl_ctx, 946 SYSCTL_CHILDREN(rack_tlp), 947 OID_AUTO, "limit", CTLFLAG_RW, 948 &rack_tlp_limit, 2, 949 "How many TLP's can be sent without sending new data"); 950 SYSCTL_ADD_S32(&rack_sysctl_ctx, 951 SYSCTL_CHILDREN(rack_tlp), 952 OID_AUTO, "use_greater", CTLFLAG_RW, 953 &rack_tlp_use_greater, 1, 954 "Should we use the rack_rtt time if its greater than srtt"); 955 SYSCTL_ADD_S32(&rack_sysctl_ctx, 956 SYSCTL_CHILDREN(rack_tlp), 957 OID_AUTO, "tlpminto", CTLFLAG_RW, 958 &rack_tlp_min, 10, 959 "TLP minimum timeout per the specification (10ms)"); 960 SYSCTL_ADD_S32(&rack_sysctl_ctx, 961 SYSCTL_CHILDREN(rack_tlp), 962 OID_AUTO, "send_oldest", CTLFLAG_RW, 963 &rack_always_send_oldest, 0, 964 "Should we always send the oldest TLP and RACK-TLP"); 965 SYSCTL_ADD_S32(&rack_sysctl_ctx, 966 SYSCTL_CHILDREN(rack_tlp), 967 OID_AUTO, "rack_tlimit", CTLFLAG_RW, 968 &rack_limited_retran, 0, 969 
"How many times can a rack timeout drive out sends"); 970 SYSCTL_ADD_S32(&rack_sysctl_ctx, 971 SYSCTL_CHILDREN(rack_tlp), 972 OID_AUTO, "tlp_retry", CTLFLAG_RW, 973 &rack_tlp_max_resend, 2, 974 "How many times does TLP retry a single segment or multiple with no ACK"); 975 SYSCTL_ADD_S32(&rack_sysctl_ctx, 976 SYSCTL_CHILDREN(rack_tlp), 977 OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW, 978 &rack_lower_cwnd_at_tlp, 0, 979 "When a TLP completes a retran should we enter recovery"); 980 SYSCTL_ADD_S32(&rack_sysctl_ctx, 981 SYSCTL_CHILDREN(rack_tlp), 982 OID_AUTO, "reorder_thresh", CTLFLAG_RW, 983 &rack_reorder_thresh, 2, 984 "What factor for rack will be added when seeing reordering (shift right)"); 985 SYSCTL_ADD_S32(&rack_sysctl_ctx, 986 SYSCTL_CHILDREN(rack_tlp), 987 OID_AUTO, "rtt_tlp_thresh", CTLFLAG_RW, 988 &rack_tlp_thresh, 1, 989 "What divisor for TLP rtt/retran will be added (1=rtt, 2=1/2 rtt etc)"); 990 SYSCTL_ADD_S32(&rack_sysctl_ctx, 991 SYSCTL_CHILDREN(rack_tlp), 992 OID_AUTO, "reorder_fade", CTLFLAG_RW, 993 &rack_reorder_fade, 0, 994 "Does reorder detection fade, if so how many ms (0 means never)"); 995 SYSCTL_ADD_S32(&rack_sysctl_ctx, 996 SYSCTL_CHILDREN(rack_tlp), 997 OID_AUTO, "pktdelay", CTLFLAG_RW, 998 &rack_pkt_delay, 1, 999 "Extra RACK time (in ms) besides reordering thresh"); 1000 1001 /* Timer related controls */ 1002 rack_timers = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1003 SYSCTL_CHILDREN(rack_sysctl_root), 1004 OID_AUTO, 1005 "timers", 1006 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1007 "Timer related controls"); 1008 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1009 SYSCTL_CHILDREN(rack_timers), 1010 OID_AUTO, "persmin", CTLFLAG_RW, 1011 &rack_persist_min, 250, 1012 "What is the minimum time in milliseconds between persists"); 1013 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1014 SYSCTL_CHILDREN(rack_timers), 1015 OID_AUTO, "persmax", CTLFLAG_RW, 1016 &rack_persist_max, 2000, 1017 "What is the largest delay in milliseconds between persists"); 1018 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1019 SYSCTL_CHILDREN(rack_timers), 1020 OID_AUTO, "delayed_ack", CTLFLAG_RW, 1021 &rack_delayed_ack_time, 200, 1022 "Delayed ack time (200ms)"); 1023 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1024 SYSCTL_CHILDREN(rack_timers), 1025 OID_AUTO, "minrto", CTLFLAG_RW, 1026 &rack_rto_min, 0, 1027 "Minimum RTO in ms -- set with caution below 1000 due to TLP"); 1028 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1029 SYSCTL_CHILDREN(rack_timers), 1030 OID_AUTO, "maxrto", CTLFLAG_RW, 1031 &rack_rto_max, 0, 1032 "Maxiumum RTO in ms -- should be at least as large as min_rto"); 1033 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1034 SYSCTL_CHILDREN(rack_timers), 1035 OID_AUTO, "minto", CTLFLAG_RW, 1036 &rack_min_to, 1, 1037 "Minimum rack timeout in milliseconds"); 1038 /* Measure controls */ 1039 rack_measure = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1040 SYSCTL_CHILDREN(rack_sysctl_root), 1041 OID_AUTO, 1042 "measure", 1043 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1044 "Measure related controls"); 1045 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1046 SYSCTL_CHILDREN(rack_measure), 1047 OID_AUTO, "wma_divisor", CTLFLAG_RW, 1048 &rack_wma_divisor, 8, 1049 "When doing b/w calculation what is the divisor for the WMA"); 1050 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1051 SYSCTL_CHILDREN(rack_measure), 1052 OID_AUTO, "end_cwnd", CTLFLAG_RW, 1053 &rack_cwnd_block_ends_measure, 0, 1054 "Does a cwnd just-return end the measurement window (app limited)"); 1055 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1056 SYSCTL_CHILDREN(rack_measure), 1057 OID_AUTO, "end_rwnd", CTLFLAG_RW, 1058 &rack_rwnd_block_ends_measure, 0, 1059 "Does an rwnd 
just-return end the measurement window (app limited -- not persists)"); 1060 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1061 SYSCTL_CHILDREN(rack_measure), 1062 OID_AUTO, "min_target", CTLFLAG_RW, 1063 &rack_def_data_window, 20, 1064 "What is the minimum target window (in mss) for a GP measurements"); 1065 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1066 SYSCTL_CHILDREN(rack_measure), 1067 OID_AUTO, "goal_bdp", CTLFLAG_RW, 1068 &rack_goal_bdp, 2, 1069 "What is the goal BDP to measure"); 1070 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1071 SYSCTL_CHILDREN(rack_measure), 1072 OID_AUTO, "min_srtts", CTLFLAG_RW, 1073 &rack_min_srtts, 1, 1074 "What is the goal BDP to measure"); 1075 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1076 SYSCTL_CHILDREN(rack_measure), 1077 OID_AUTO, "min_measure_tim", CTLFLAG_RW, 1078 &rack_min_measure_usec, 0, 1079 "What is the Minimum time time for a measurement if 0, this is off"); 1080 /* Misc rack controls */ 1081 rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 1082 SYSCTL_CHILDREN(rack_sysctl_root), 1083 OID_AUTO, 1084 "misc", 1085 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 1086 "Misc related controls"); 1087 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1088 SYSCTL_CHILDREN(rack_misc), 1089 OID_AUTO, "shared_cwnd", CTLFLAG_RW, 1090 &rack_enable_shared_cwnd, 0, 1091 "Should RACK try to use the shared cwnd on connections where allowed"); 1092 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1093 SYSCTL_CHILDREN(rack_misc), 1094 OID_AUTO, "limits_on_scwnd", CTLFLAG_RW, 1095 &rack_limits_scwnd, 1, 1096 "Should RACK place low end time limits on the shared cwnd feature"); 1097 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1098 SYSCTL_CHILDREN(rack_misc), 1099 OID_AUTO, "non_paced_lro_queue", CTLFLAG_RW, 1100 &rack_enable_mqueue_for_nonpaced, 0, 1101 "Should RACK use mbuf queuing for non-paced connections"); 1102 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1103 SYSCTL_CHILDREN(rack_misc), 1104 OID_AUTO, "iMac_dack", CTLFLAG_RW, 1105 &rack_use_imac_dack, 0, 1106 "Should RACK try to emulate iMac delayed ack"); 1107 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1108 SYSCTL_CHILDREN(rack_misc), 1109 OID_AUTO, "no_prr", CTLFLAG_RW, 1110 &rack_disable_prr, 0, 1111 "Should RACK not use prr and only pace (must have pacing on)"); 1112 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1113 SYSCTL_CHILDREN(rack_misc), 1114 OID_AUTO, "bb_verbose", CTLFLAG_RW, 1115 &rack_verbose_logging, 0, 1116 "Should RACK black box logging be verbose"); 1117 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1118 SYSCTL_CHILDREN(rack_misc), 1119 OID_AUTO, "data_after_close", CTLFLAG_RW, 1120 &rack_ignore_data_after_close, 1, 1121 "Do we hold off sending a RST until all pending data is ack'd"); 1122 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1123 SYSCTL_CHILDREN(rack_misc), 1124 OID_AUTO, "no_sack_needed", CTLFLAG_RW, 1125 &rack_sack_not_required, 0, 1126 "Do we allow rack to run on connections not supporting SACK"); 1127 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1128 SYSCTL_CHILDREN(rack_misc), 1129 OID_AUTO, "recovery_loss_prop", CTLFLAG_RW, 1130 &rack_use_proportional_reduce, 0, 1131 "Should we proportionaly reduce cwnd based on the number of losses "); 1132 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1133 SYSCTL_CHILDREN(rack_misc), 1134 OID_AUTO, "recovery_prop", CTLFLAG_RW, 1135 &rack_proportional_rate, 10, 1136 "What percent reduction per loss"); 1137 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1138 SYSCTL_CHILDREN(rack_misc), 1139 OID_AUTO, "prr_sendalot", CTLFLAG_RW, 1140 &rack_send_a_lot_in_prr, 1, 1141 "Send a lot in prr"); 1142 SYSCTL_ADD_S32(&rack_sysctl_ctx, 1143 SYSCTL_CHILDREN(rack_misc), 1144 OID_AUTO, "earlyrecovery", CTLFLAG_RW, 1145 &rack_early_recovery, 1, 1146 "Do we 
do early recovery with rack"); 1147 /* Sack Attacker detection stuff */ 1148 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1149 SYSCTL_CHILDREN(rack_attack), 1150 OID_AUTO, "detect_highsackratio", CTLFLAG_RW, 1151 &rack_highest_sack_thresh_seen, 0, 1152 "Highest sack to ack ratio seen"); 1153 SYSCTL_ADD_U32(&rack_sysctl_ctx, 1154 SYSCTL_CHILDREN(rack_attack), 1155 OID_AUTO, "detect_highmoveratio", CTLFLAG_RW, 1156 &rack_highest_move_thresh_seen, 0, 1157 "Highest move to non-move ratio seen"); 1158 rack_ack_total = counter_u64_alloc(M_WAITOK); 1159 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1160 SYSCTL_CHILDREN(rack_attack), 1161 OID_AUTO, "acktotal", CTLFLAG_RD, 1162 &rack_ack_total, 1163 "Total number of Ack's"); 1164 rack_express_sack = counter_u64_alloc(M_WAITOK); 1165 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1166 SYSCTL_CHILDREN(rack_attack), 1167 OID_AUTO, "exp_sacktotal", CTLFLAG_RD, 1168 &rack_express_sack, 1169 "Total expresss number of Sack's"); 1170 rack_sack_total = counter_u64_alloc(M_WAITOK); 1171 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1172 SYSCTL_CHILDREN(rack_attack), 1173 OID_AUTO, "sacktotal", CTLFLAG_RD, 1174 &rack_sack_total, 1175 "Total number of SACKs"); 1176 rack_move_none = counter_u64_alloc(M_WAITOK); 1177 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1178 SYSCTL_CHILDREN(rack_attack), 1179 OID_AUTO, "move_none", CTLFLAG_RD, 1180 &rack_move_none, 1181 "Total number of SACK index reuse of postions under threshold"); 1182 rack_move_some = counter_u64_alloc(M_WAITOK); 1183 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1184 SYSCTL_CHILDREN(rack_attack), 1185 OID_AUTO, "move_some", CTLFLAG_RD, 1186 &rack_move_some, 1187 "Total number of SACK index reuse of postions over threshold"); 1188 rack_sack_attacks_detected = counter_u64_alloc(M_WAITOK); 1189 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1190 SYSCTL_CHILDREN(rack_attack), 1191 OID_AUTO, "attacks", CTLFLAG_RD, 1192 &rack_sack_attacks_detected, 1193 "Total number of SACK attackers that had sack disabled"); 1194 rack_sack_attacks_reversed = counter_u64_alloc(M_WAITOK); 1195 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1196 SYSCTL_CHILDREN(rack_attack), 1197 OID_AUTO, "reversed", CTLFLAG_RD, 1198 &rack_sack_attacks_reversed, 1199 "Total number of SACK attackers that were later determined false positive"); 1200 rack_sack_used_next_merge = counter_u64_alloc(M_WAITOK); 1201 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1202 SYSCTL_CHILDREN(rack_attack), 1203 OID_AUTO, "nextmerge", CTLFLAG_RD, 1204 &rack_sack_used_next_merge, 1205 "Total number of times we used the next merge"); 1206 rack_sack_used_prev_merge = counter_u64_alloc(M_WAITOK); 1207 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1208 SYSCTL_CHILDREN(rack_attack), 1209 OID_AUTO, "prevmerge", CTLFLAG_RD, 1210 &rack_sack_used_prev_merge, 1211 "Total number of times we used the prev merge"); 1212 /* Counters */ 1213 rack_badfr = counter_u64_alloc(M_WAITOK); 1214 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1215 SYSCTL_CHILDREN(rack_counters), 1216 OID_AUTO, "badfr", CTLFLAG_RD, 1217 &rack_badfr, "Total number of bad FRs"); 1218 rack_badfr_bytes = counter_u64_alloc(M_WAITOK); 1219 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1220 SYSCTL_CHILDREN(rack_counters), 1221 OID_AUTO, "badfr_bytes", CTLFLAG_RD, 1222 &rack_badfr_bytes, "Total number of bad FRs"); 1223 rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK); 1224 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1225 SYSCTL_CHILDREN(rack_counters), 1226 OID_AUTO, "prrsndret", CTLFLAG_RD, 1227 &rack_rtm_prr_retran, 1228 "Total number of prr based retransmits"); 1229 
rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK); 1230 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1231 SYSCTL_CHILDREN(rack_counters), 1232 OID_AUTO, "prrsndnew", CTLFLAG_RD, 1233 &rack_rtm_prr_newdata, 1234 "Total number of prr based new transmits"); 1235 rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK); 1236 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1237 SYSCTL_CHILDREN(rack_counters), 1238 OID_AUTO, "tsnf", CTLFLAG_RD, 1239 &rack_timestamp_mismatch, 1240 "Total number of timestamps that we could not find the reported ts"); 1241 rack_find_high = counter_u64_alloc(M_WAITOK); 1242 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1243 SYSCTL_CHILDREN(rack_counters), 1244 OID_AUTO, "findhigh", CTLFLAG_RD, 1245 &rack_find_high, 1246 "Total number of FIN causing find-high"); 1247 rack_reorder_seen = counter_u64_alloc(M_WAITOK); 1248 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1249 SYSCTL_CHILDREN(rack_counters), 1250 OID_AUTO, "reordering", CTLFLAG_RD, 1251 &rack_reorder_seen, 1252 "Total number of times we added delay due to reordering"); 1253 rack_tlp_tot = counter_u64_alloc(M_WAITOK); 1254 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1255 SYSCTL_CHILDREN(rack_counters), 1256 OID_AUTO, "tlp_to_total", CTLFLAG_RD, 1257 &rack_tlp_tot, 1258 "Total number of tail loss probe expirations"); 1259 rack_tlp_newdata = counter_u64_alloc(M_WAITOK); 1260 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1261 SYSCTL_CHILDREN(rack_counters), 1262 OID_AUTO, "tlp_new", CTLFLAG_RD, 1263 &rack_tlp_newdata, 1264 "Total number of tail loss probe sending new data"); 1265 rack_tlp_retran = counter_u64_alloc(M_WAITOK); 1266 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1267 SYSCTL_CHILDREN(rack_counters), 1268 OID_AUTO, "tlp_retran", CTLFLAG_RD, 1269 &rack_tlp_retran, 1270 "Total number of tail loss probe sending retransmitted data"); 1271 rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK); 1272 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1273 SYSCTL_CHILDREN(rack_counters), 1274 OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD, 1275 &rack_tlp_retran_bytes, 1276 "Total bytes of tail loss probe sending retransmitted data"); 1277 rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK); 1278 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1279 SYSCTL_CHILDREN(rack_counters), 1280 OID_AUTO, "tlp_retran_fail", CTLFLAG_RD, 1281 &rack_tlp_retran_fail, 1282 "Total number of tail loss probe sending retransmitted data that failed (wait for t3)"); 1283 rack_to_tot = counter_u64_alloc(M_WAITOK); 1284 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1285 SYSCTL_CHILDREN(rack_counters), 1286 OID_AUTO, "rack_to_tot", CTLFLAG_RD, 1287 &rack_to_tot, 1288 "Total number of times the rack to expired"); 1289 rack_to_arm_rack = counter_u64_alloc(M_WAITOK); 1290 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1291 SYSCTL_CHILDREN(rack_counters), 1292 OID_AUTO, "arm_rack", CTLFLAG_RD, 1293 &rack_to_arm_rack, 1294 "Total number of times the rack timer armed"); 1295 rack_to_arm_tlp = counter_u64_alloc(M_WAITOK); 1296 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1297 SYSCTL_CHILDREN(rack_counters), 1298 OID_AUTO, "arm_tlp", CTLFLAG_RD, 1299 &rack_to_arm_tlp, 1300 "Total number of times the tlp timer armed"); 1301 rack_calc_zero = counter_u64_alloc(M_WAITOK); 1302 rack_calc_nonzero = counter_u64_alloc(M_WAITOK); 1303 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1304 SYSCTL_CHILDREN(rack_counters), 1305 OID_AUTO, "calc_zero", CTLFLAG_RD, 1306 &rack_calc_zero, 1307 "Total number of times pacing time worked out to zero"); 1308 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1309 SYSCTL_CHILDREN(rack_counters), 1310 
OID_AUTO, "calc_nonzero", CTLFLAG_RD, 1311 &rack_calc_nonzero, 1312 "Total number of times pacing time worked out to non-zero"); 1313 rack_paced_segments = counter_u64_alloc(M_WAITOK); 1314 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1315 SYSCTL_CHILDREN(rack_counters), 1316 OID_AUTO, "paced", CTLFLAG_RD, 1317 &rack_paced_segments, 1318 "Total number of times a segment send caused hptsi"); 1319 rack_unpaced_segments = counter_u64_alloc(M_WAITOK); 1320 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1321 SYSCTL_CHILDREN(rack_counters), 1322 OID_AUTO, "unpaced", CTLFLAG_RD, 1323 &rack_unpaced_segments, 1324 "Total number of times a segment did not cause hptsi"); 1325 rack_saw_enobuf = counter_u64_alloc(M_WAITOK); 1326 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1327 SYSCTL_CHILDREN(rack_counters), 1328 OID_AUTO, "saw_enobufs", CTLFLAG_RD, 1329 &rack_saw_enobuf, 1330 "Total number of times a segment did not cause hptsi"); 1331 rack_saw_enetunreach = counter_u64_alloc(M_WAITOK); 1332 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1333 SYSCTL_CHILDREN(rack_counters), 1334 OID_AUTO, "saw_enetunreach", CTLFLAG_RD, 1335 &rack_saw_enetunreach, 1336 "Total number of times a segment did not cause hptsi"); 1337 rack_to_alloc = counter_u64_alloc(M_WAITOK); 1338 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1339 SYSCTL_CHILDREN(rack_counters), 1340 OID_AUTO, "allocs", CTLFLAG_RD, 1341 &rack_to_alloc, 1342 "Total allocations of tracking structures"); 1343 rack_to_alloc_hard = counter_u64_alloc(M_WAITOK); 1344 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1345 SYSCTL_CHILDREN(rack_counters), 1346 OID_AUTO, "allochard", CTLFLAG_RD, 1347 &rack_to_alloc_hard, 1348 "Total allocations done with sleeping the hard way"); 1349 rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK); 1350 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1351 SYSCTL_CHILDREN(rack_counters), 1352 OID_AUTO, "allocemerg", CTLFLAG_RD, 1353 &rack_to_alloc_emerg, 1354 "Total allocations done from emergency cache"); 1355 rack_to_alloc_limited = counter_u64_alloc(M_WAITOK); 1356 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1357 SYSCTL_CHILDREN(rack_counters), 1358 OID_AUTO, "alloc_limited", CTLFLAG_RD, 1359 &rack_to_alloc_limited, 1360 "Total allocations dropped due to limit"); 1361 rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK); 1362 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1363 SYSCTL_CHILDREN(rack_counters), 1364 OID_AUTO, "alloc_limited_conns", CTLFLAG_RD, 1365 &rack_alloc_limited_conns, 1366 "Connections with allocations dropped due to limit"); 1367 rack_split_limited = counter_u64_alloc(M_WAITOK); 1368 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1369 SYSCTL_CHILDREN(rack_counters), 1370 OID_AUTO, "split_limited", CTLFLAG_RD, 1371 &rack_split_limited, 1372 "Split allocations dropped due to limit"); 1373 rack_sack_proc_all = counter_u64_alloc(M_WAITOK); 1374 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1375 SYSCTL_CHILDREN(rack_counters), 1376 OID_AUTO, "sack_long", CTLFLAG_RD, 1377 &rack_sack_proc_all, 1378 "Total times we had to walk whole list for sack processing"); 1379 rack_sack_proc_restart = counter_u64_alloc(M_WAITOK); 1380 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1381 SYSCTL_CHILDREN(rack_counters), 1382 OID_AUTO, "sack_restart", CTLFLAG_RD, 1383 &rack_sack_proc_restart, 1384 "Total times we had to walk whole list due to a restart"); 1385 rack_sack_proc_short = counter_u64_alloc(M_WAITOK); 1386 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1387 SYSCTL_CHILDREN(rack_counters), 1388 OID_AUTO, "sack_short", CTLFLAG_RD, 1389 &rack_sack_proc_short, 1390 "Total times we took 
a shortcut for sack processing"); 1391 rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK); 1392 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1393 SYSCTL_CHILDREN(rack_counters), 1394 OID_AUTO, "tlp_calc_entered", CTLFLAG_RD, 1395 &rack_enter_tlp_calc, 1396 "Total times we called calc-tlp"); 1397 rack_used_tlpmethod = counter_u64_alloc(M_WAITOK); 1398 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1399 SYSCTL_CHILDREN(rack_counters), 1400 OID_AUTO, "hit_tlp_method", CTLFLAG_RD, 1401 &rack_used_tlpmethod, 1402 "Total number of times we hit TLP method 1"); 1403 rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK); 1404 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1405 SYSCTL_CHILDREN(rack_counters), 1406 OID_AUTO, "hit_tlp_method2", CTLFLAG_RD, 1407 &rack_used_tlpmethod2, 1408 "Total number of times we hit TLP method 2"); 1409 rack_sack_skipped_acked = counter_u64_alloc(M_WAITOK); 1410 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1411 SYSCTL_CHILDREN(rack_attack), 1412 OID_AUTO, "skipacked", CTLFLAG_RD, 1413 &rack_sack_skipped_acked, 1414 "Total number of times we skipped previously SACKed data"); 1415 rack_sack_splits = counter_u64_alloc(M_WAITOK); 1416 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1417 SYSCTL_CHILDREN(rack_attack), 1418 OID_AUTO, "ofsplit", CTLFLAG_RD, 1419 &rack_sack_splits, 1420 "Total number of times we did the old-fashioned tree split"); 1421 rack_progress_drops = counter_u64_alloc(M_WAITOK); 1422 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1423 SYSCTL_CHILDREN(rack_counters), 1424 OID_AUTO, "prog_drops", CTLFLAG_RD, 1425 &rack_progress_drops, 1426 "Total number of progress drops"); 1427 rack_input_idle_reduces = counter_u64_alloc(M_WAITOK); 1428 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1429 SYSCTL_CHILDREN(rack_counters), 1430 OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD, 1431 &rack_input_idle_reduces, 1432 "Total number of idle reductions on input"); 1433 rack_collapsed_win = counter_u64_alloc(M_WAITOK); 1434 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1435 SYSCTL_CHILDREN(rack_counters), 1436 OID_AUTO, "collapsed_win", CTLFLAG_RD, 1437 &rack_collapsed_win, 1438 "Total number of collapsed windows"); 1439 rack_tlp_does_nada = counter_u64_alloc(M_WAITOK); 1440 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1441 SYSCTL_CHILDREN(rack_counters), 1442 OID_AUTO, "tlp_nada", CTLFLAG_RD, 1443 &rack_tlp_does_nada, 1444 "Total number of nada tlp calls"); 1445 rack_try_scwnd = counter_u64_alloc(M_WAITOK); 1446 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1447 SYSCTL_CHILDREN(rack_counters), 1448 OID_AUTO, "tried_scwnd", CTLFLAG_RD, 1449 &rack_try_scwnd, 1450 "Total number of scwnd attempts"); 1451 1452 rack_tls_rwnd = counter_u64_alloc(M_WAITOK); 1453 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1454 SYSCTL_CHILDREN(rack_counters), 1455 OID_AUTO, "tls_rwnd", CTLFLAG_RD, 1456 &rack_tls_rwnd, 1457 "Total hdwr tls rwnd limited"); 1458 rack_tls_cwnd = counter_u64_alloc(M_WAITOK); 1459 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1460 SYSCTL_CHILDREN(rack_counters), 1461 OID_AUTO, "tls_cwnd", CTLFLAG_RD, 1462 &rack_tls_cwnd, 1463 "Total hdwr tls cwnd limited"); 1464 rack_tls_app = counter_u64_alloc(M_WAITOK); 1465 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1466 SYSCTL_CHILDREN(rack_counters), 1467 OID_AUTO, "tls_app", CTLFLAG_RD, 1468 &rack_tls_app, 1469 "Total hdwr tls app limited"); 1470 rack_tls_other = counter_u64_alloc(M_WAITOK); 1471 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1472 SYSCTL_CHILDREN(rack_counters), 1473 OID_AUTO, "tls_other", CTLFLAG_RD, 1474 &rack_tls_other, 1475 "Total hdwr tls other limited"); 1476 rack_tls_filled =
counter_u64_alloc(M_WAITOK); 1477 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1478 SYSCTL_CHILDREN(rack_counters), 1479 OID_AUTO, "tls_filled", CTLFLAG_RD, 1480 &rack_tls_filled, 1481 "Total hdwr tls filled"); 1482 rack_tls_rxt = counter_u64_alloc(M_WAITOK); 1483 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1484 SYSCTL_CHILDREN(rack_counters), 1485 OID_AUTO, "tls_rxt", CTLFLAG_RD, 1486 &rack_tls_rxt, 1487 "Total hdwr rxt"); 1488 rack_tls_tlp = counter_u64_alloc(M_WAITOK); 1489 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1490 SYSCTL_CHILDREN(rack_counters), 1491 OID_AUTO, "tls_tlp", CTLFLAG_RD, 1492 &rack_tls_tlp, 1493 "Total hdwr tls tlp"); 1494 rack_per_timer_hole = counter_u64_alloc(M_WAITOK); 1495 SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, 1496 SYSCTL_CHILDREN(rack_counters), 1497 OID_AUTO, "timer_hole", CTLFLAG_RD, 1498 &rack_per_timer_hole, 1499 "Total persists start in timer hole"); 1500 COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); 1501 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 1502 OID_AUTO, "outsize", CTLFLAG_RD, 1503 rack_out_size, TCP_MSS_ACCT_SIZE, "MSS send sizes"); 1504 COUNTER_ARRAY_ALLOC(rack_opts_arry, RACK_OPTS_SIZE, M_WAITOK); 1505 SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), 1506 OID_AUTO, "opts", CTLFLAG_RD, 1507 rack_opts_arry, RACK_OPTS_SIZE, "RACK Option Stats"); 1508 SYSCTL_ADD_PROC(&rack_sysctl_ctx, 1509 SYSCTL_CHILDREN(rack_sysctl_root), 1510 OID_AUTO, "clear", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 1511 &rack_clear_counter, 0, sysctl_rack_clear, "IU", "Clear counters"); 1512 } 1513 1514 static __inline int 1515 rb_map_cmp(struct rack_sendmap *b, struct rack_sendmap *a) 1516 { 1517 if (SEQ_GEQ(b->r_start, a->r_start) && 1518 SEQ_LT(b->r_start, a->r_end)) { 1519 /* 1520 * The entry b is within the 1521 * block a. i.e.: 1522 * a -- |-------------| 1523 * b -- |----| 1524 * <or> 1525 * b -- |------| 1526 * <or> 1527 * b -- |-----------| 1528 */ 1529 return (0); 1530 } else if (SEQ_GEQ(b->r_start, a->r_end)) { 1531 /* 1532 * b falls as either the next 1533 * sequence block after a so a 1534 * is said to be smaller than b. 1535 * i.e: 1536 * a -- |------| 1537 * b -- |--------| 1538 * or 1539 * b -- |-----| 1540 */ 1541 return (1); 1542 } 1543 /* 1544 * Whats left is where a is 1545 * larger than b. i.e: 1546 * a -- |-------| 1547 * b -- |---| 1548 * or even possibly 1549 * b -- |--------------| 1550 */ 1551 return (-1); 1552 } 1553 1554 RB_PROTOTYPE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp); 1555 RB_GENERATE(rack_rb_tree_head, rack_sendmap, r_next, rb_map_cmp); 1556 1557 static uint32_t 1558 rc_init_window(struct tcp_rack *rack) 1559 { 1560 uint32_t win; 1561 1562 if (rack->rc_init_win == 0) { 1563 /* 1564 * Nothing set by the user, use the system stack 1565 * default. 
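*
* For illustration only (numbers are not taken from this file):
* with a 1448-byte MSS and an initial-window default of 10
* segments, tcp_compute_initwnd() would come out near 14480
* bytes, while a user-set rc_init_win of 20 would make the
* ctf_fixed_maxseg() path below return roughly twice that.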
1566 */ 1567 return(tcp_compute_initwnd(tcp_maxseg(rack->rc_tp))); 1568 } 1569 win = ctf_fixed_maxseg(rack->rc_tp) * rack->rc_init_win; 1570 return(win); 1571 } 1572 1573 static uint64_t 1574 rack_get_fixed_pacing_bw(struct tcp_rack *rack) 1575 { 1576 if (IN_RECOVERY(rack->rc_tp->t_flags)) 1577 return (rack->r_ctl.rc_fixed_pacing_rate_rec); 1578 else if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) 1579 return (rack->r_ctl.rc_fixed_pacing_rate_ss); 1580 else 1581 return (rack->r_ctl.rc_fixed_pacing_rate_ca); 1582 } 1583 1584 static uint64_t 1585 rack_get_bw(struct tcp_rack *rack) 1586 { 1587 if (rack->use_fixed_rate) { 1588 /* Return the fixed pacing rate */ 1589 return (rack_get_fixed_pacing_bw(rack)); 1590 } 1591 if (rack->r_ctl.gp_bw == 0) { 1592 /* 1593 * We have yet no b/w measurement, 1594 * if we have a user set initial bw 1595 * return it. If we don't have that and 1596 * we have an srtt, use the tcp IW (10) to 1597 * calculate a fictional b/w over the SRTT 1598 * which is more or less a guess. Note 1599 * we don't use our IW from rack on purpose 1600 * so if we have like IW=30, we are not 1601 * calculating a "huge" b/w. 1602 */ 1603 uint64_t bw, srtt; 1604 if (rack->r_ctl.init_rate) 1605 return (rack->r_ctl.init_rate); 1606 1607 /* Has the user set a max peak rate? */ 1608 #ifdef NETFLIX_PEAKRATE 1609 if (rack->rc_tp->t_maxpeakrate) 1610 return (rack->rc_tp->t_maxpeakrate); 1611 #endif 1612 /* Ok lets come up with the IW guess, if we have a srtt */ 1613 if (rack->rc_tp->t_srtt == 0) { 1614 /* 1615 * Go with old pacing method 1616 * i.e. burst mitigation only. 1617 */ 1618 return (0); 1619 } 1620 /* Ok lets get the initial TCP win (not racks) */ 1621 bw = tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)); 1622 srtt = ((uint64_t)TICKS_2_USEC(rack->rc_tp->t_srtt) >> TCP_RTT_SHIFT); 1623 bw *= (uint64_t)USECS_IN_SECOND; 1624 bw /= srtt; 1625 return (bw); 1626 } else { 1627 uint64_t bw; 1628 1629 if(rack->r_ctl.num_avg >= RACK_REQ_AVG) { 1630 /* Averaging is done, we can return the value */ 1631 bw = rack->r_ctl.gp_bw; 1632 } else { 1633 /* Still doing initial average must calculate */ 1634 bw = rack->r_ctl.gp_bw / rack->r_ctl.num_avg; 1635 } 1636 #ifdef NETFLIX_PEAKRATE 1637 if ((rack->rc_tp->t_maxpeakrate) && 1638 (bw > rack->rc_tp->t_maxpeakrate)) { 1639 /* The user has set a peak rate to pace at 1640 * don't allow us to pace faster than that. 1641 */ 1642 return (rack->rc_tp->t_maxpeakrate); 1643 } 1644 #endif 1645 return (bw); 1646 } 1647 } 1648 1649 static uint16_t 1650 rack_get_output_gain(struct tcp_rack *rack, struct rack_sendmap *rsm) 1651 { 1652 if (rack->use_fixed_rate) { 1653 return (100); 1654 } else if (rack->in_probe_rtt && (rsm == NULL)) 1655 return(rack->r_ctl.rack_per_of_gp_probertt); 1656 else if ((IN_RECOVERY(rack->rc_tp->t_flags) && 1657 rack->r_ctl.rack_per_of_gp_rec)) { 1658 if (rsm) { 1659 /* a retransmission always use the recovery rate */ 1660 return(rack->r_ctl.rack_per_of_gp_rec); 1661 } else if (rack->rack_rec_nonrxt_use_cr) { 1662 /* Directed to use the configured rate */ 1663 goto configured_rate; 1664 } else if (rack->rack_no_prr && 1665 (rack->r_ctl.rack_per_of_gp_rec > 100)) { 1666 /* No PRR, lets just use the b/w estimate only */ 1667 return(100); 1668 } else { 1669 /* 1670 * Here we may have a non-retransmit but we 1671 * have no overrides, so just use the recovery 1672 * rate (prr is in effect). 
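*
* (Illustrative numbers, not defaults pulled from this file:
* a rack_per_of_gp_rec of 150 would have rack_get_output_bw()
* pace such sends at 150% of the b/w estimate, while 100 means
* "pace at exactly the estimate".)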
1673 */ 1674 return(rack->r_ctl.rack_per_of_gp_rec); 1675 } 1676 } 1677 configured_rate: 1678 /* For the configured rate we look at our cwnd vs the ssthresh */ 1679 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) 1680 return (rack->r_ctl.rack_per_of_gp_ss); 1681 else 1682 return(rack->r_ctl.rack_per_of_gp_ca); 1683 } 1684 1685 static uint64_t 1686 rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm) 1687 { 1688 /* 1689 * We allow rack_per_of_gp_xx to dictate our bw rate we want. 1690 */ 1691 uint64_t bw_est; 1692 uint64_t gain; 1693 1694 gain = (uint64_t)rack_get_output_gain(rack, rsm); 1695 bw_est = bw * gain; 1696 bw_est /= (uint64_t)100; 1697 /* Never fall below the minimum (def 64kbps) */ 1698 if (bw_est < RACK_MIN_BW) 1699 bw_est = RACK_MIN_BW; 1700 return (bw_est); 1701 } 1702 1703 static void 1704 rack_log_retran_reason(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t tsused, uint32_t thresh, int mod) 1705 { 1706 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1707 union tcp_log_stackspecific log; 1708 struct timeval tv; 1709 1710 if ((mod != 1) && (rack_verbose_logging == 0)) { 1711 /* 1712 * We get 3 values currently for mod 1713 * 1 - We are retransmitting and this tells the reason. 1714 * 2 - We are clearing a dup-ack count. 1715 * 3 - We are incrementing a dup-ack count. 1716 * 1717 * The clear/increment are only logged 1718 * if you have BBverbose on. 1719 */ 1720 return; 1721 } 1722 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1723 log.u_bbr.flex1 = tsused; 1724 log.u_bbr.flex2 = thresh; 1725 log.u_bbr.flex3 = rsm->r_flags; 1726 log.u_bbr.flex4 = rsm->r_dupack; 1727 log.u_bbr.flex5 = rsm->r_start; 1728 log.u_bbr.flex6 = rsm->r_end; 1729 log.u_bbr.flex8 = mod; 1730 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1731 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1732 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1733 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1734 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1735 &rack->rc_inp->inp_socket->so_rcv, 1736 &rack->rc_inp->inp_socket->so_snd, 1737 BBR_LOG_SETTINGS_CHG, 0, 1738 0, &log, false, &tv); 1739 } 1740 } 1741 1742 1743 1744 static void 1745 rack_log_to_start(struct tcp_rack *rack, uint32_t cts, uint32_t to, int32_t slot, uint8_t which) 1746 { 1747 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1748 union tcp_log_stackspecific log; 1749 struct timeval tv; 1750 1751 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1752 log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT); 1753 log.u_bbr.flex2 = to * 1000; 1754 log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; 1755 log.u_bbr.flex4 = slot; 1756 log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot; 1757 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 1758 log.u_bbr.flex7 = rack->rc_in_persist; 1759 log.u_bbr.flex8 = which; 1760 if (rack->rack_no_prr) 1761 log.u_bbr.pkts_out = 0; 1762 else 1763 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 1764 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1765 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1766 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1767 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1768 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1769 &rack->rc_inp->inp_socket->so_rcv, 1770 &rack->rc_inp->inp_socket->so_snd, 1771 BBR_LOG_TIMERSTAR, 0, 1772 0, &log, false, &tv); 1773 } 1774 } 1775 1776 static void 1777 rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm) 1778 { 1779 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 
1780 union tcp_log_stackspecific log; 1781 struct timeval tv; 1782 1783 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1784 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1785 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1786 log.u_bbr.flex8 = to_num; 1787 log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt; 1788 log.u_bbr.flex2 = rack->rc_rack_rtt; 1789 if (rsm == NULL) 1790 log.u_bbr.flex3 = 0; 1791 else 1792 log.u_bbr.flex3 = rsm->r_end - rsm->r_start; 1793 if (rack->rack_no_prr) 1794 log.u_bbr.flex5 = 0; 1795 else 1796 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 1797 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1798 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1799 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1800 &rack->rc_inp->inp_socket->so_rcv, 1801 &rack->rc_inp->inp_socket->so_snd, 1802 BBR_LOG_RTO, 0, 1803 0, &log, false, &tv); 1804 } 1805 } 1806 1807 static void 1808 rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t len, 1809 struct rack_sendmap *rsm, int conf) 1810 { 1811 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 1812 union tcp_log_stackspecific log; 1813 struct timeval tv; 1814 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1815 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1816 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1817 log.u_bbr.flex1 = t; 1818 log.u_bbr.flex2 = len; 1819 log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt * HPTS_USEC_IN_MSEC; 1820 log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest * HPTS_USEC_IN_MSEC; 1821 log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest * HPTS_USEC_IN_MSEC; 1822 log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt; 1823 log.u_bbr.flex7 = conf; 1824 log.u_bbr.rttProp = (uint64_t)rack->r_ctl.rack_rs.rs_rtt_tot * (uint64_t)HPTS_USEC_IN_MSEC; 1825 log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method; 1826 if (rack->rack_no_prr) 1827 log.u_bbr.pkts_out = 0; 1828 else 1829 log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; 1830 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1831 log.u_bbr.delivered = rack->r_ctl.rack_rs.rs_us_rtt; 1832 log.u_bbr.pkts_out = rack->r_ctl.rack_rs.rs_flags; 1833 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1834 if (rsm) { 1835 log.u_bbr.pkt_epoch = rsm->r_start; 1836 log.u_bbr.lost = rsm->r_end; 1837 log.u_bbr.cwnd_gain = rsm->r_rtr_cnt; 1838 } else { 1839 1840 /* Its a SYN */ 1841 log.u_bbr.pkt_epoch = rack->rc_tp->iss; 1842 log.u_bbr.lost = 0; 1843 log.u_bbr.cwnd_gain = 0; 1844 } 1845 /* Write out general bits of interest rrs here */ 1846 log.u_bbr.use_lt_bw = rack->rc_highly_buffered; 1847 log.u_bbr.use_lt_bw <<= 1; 1848 log.u_bbr.use_lt_bw |= rack->forced_ack; 1849 log.u_bbr.use_lt_bw <<= 1; 1850 log.u_bbr.use_lt_bw |= rack->rc_gp_dyn_mul; 1851 log.u_bbr.use_lt_bw <<= 1; 1852 log.u_bbr.use_lt_bw |= rack->in_probe_rtt; 1853 log.u_bbr.use_lt_bw <<= 1; 1854 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; 1855 log.u_bbr.use_lt_bw <<= 1; 1856 log.u_bbr.use_lt_bw |= rack->app_limited_needs_set; 1857 log.u_bbr.use_lt_bw <<= 1; 1858 log.u_bbr.use_lt_bw |= rack->rc_gp_filled; 1859 log.u_bbr.use_lt_bw <<= 1; 1860 log.u_bbr.use_lt_bw |= rack->rc_dragged_bottom; 1861 log.u_bbr.applimited = rack->r_ctl.rc_target_probertt_flight; 1862 log.u_bbr.epoch = rack->r_ctl.rc_time_probertt_starts; 1863 log.u_bbr.lt_epoch = rack->r_ctl.rc_time_probertt_entered; 1864 log.u_bbr.cur_del_rate = rack->r_ctl.rc_lower_rtt_us_cts; 1865 log.u_bbr.delRate = rack->r_ctl.rc_gp_srtt; 1866 TCP_LOG_EVENTP(tp, NULL, 1867 &rack->rc_inp->inp_socket->so_rcv, 1868 
&rack->rc_inp->inp_socket->so_snd, 1869 BBR_LOG_BBRRTT, 0, 1870 0, &log, false, &tv); 1871 } 1872 } 1873 1874 static void 1875 rack_log_rtt_sample(struct tcp_rack *rack, uint32_t rtt) 1876 { 1877 /* 1878 * Log the rtt sample we are 1879 * applying to the srtt algorithm in 1880 * useconds. 1881 */ 1882 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1883 union tcp_log_stackspecific log; 1884 struct timeval tv; 1885 1886 /* Convert our ms to a microsecond */ 1887 memset(&log, 0, sizeof(log)); 1888 log.u_bbr.flex1 = rtt * 1000; 1889 log.u_bbr.flex2 = rack->r_ctl.ack_count; 1890 log.u_bbr.flex3 = rack->r_ctl.sack_count; 1891 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; 1892 log.u_bbr.flex5 = rack->r_ctl.sack_moved_extra; 1893 log.u_bbr.flex8 = rack->sack_attack_disable; 1894 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1895 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1896 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1897 &rack->rc_inp->inp_socket->so_rcv, 1898 &rack->rc_inp->inp_socket->so_snd, 1899 TCP_LOG_RTT, 0, 1900 0, &log, false, &tv); 1901 } 1902 } 1903 1904 1905 static inline void 1906 rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line) 1907 { 1908 if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { 1909 union tcp_log_stackspecific log; 1910 struct timeval tv; 1911 1912 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1913 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1914 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1915 log.u_bbr.flex1 = line; 1916 log.u_bbr.flex2 = tick; 1917 log.u_bbr.flex3 = tp->t_maxunacktime; 1918 log.u_bbr.flex4 = tp->t_acktime; 1919 log.u_bbr.flex8 = event; 1920 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1921 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1922 TCP_LOG_EVENTP(tp, NULL, 1923 &rack->rc_inp->inp_socket->so_rcv, 1924 &rack->rc_inp->inp_socket->so_snd, 1925 BBR_LOG_PROGRESS, 0, 1926 0, &log, false, &tv); 1927 } 1928 } 1929 1930 static void 1931 rack_log_type_bbrsnd(struct tcp_rack *rack, uint32_t len, uint32_t slot, uint32_t cts, struct timeval *tv) 1932 { 1933 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1934 union tcp_log_stackspecific log; 1935 1936 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 1937 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1938 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 1939 log.u_bbr.flex1 = slot; 1940 if (rack->rack_no_prr) 1941 log.u_bbr.flex2 = 0; 1942 else 1943 log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt; 1944 log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags); 1945 log.u_bbr.flex8 = rack->rc_in_persist; 1946 log.u_bbr.timeStamp = cts; 1947 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1948 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1949 &rack->rc_inp->inp_socket->so_rcv, 1950 &rack->rc_inp->inp_socket->so_snd, 1951 BBR_LOG_BBRSND, 0, 1952 0, &log, false, tv); 1953 } 1954 } 1955 1956 static void 1957 rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out) 1958 { 1959 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 1960 union tcp_log_stackspecific log; 1961 struct timeval tv; 1962 1963 memset(&log, 0, sizeof(log)); 1964 log.u_bbr.flex1 = did_out; 1965 log.u_bbr.flex2 = nxt_pkt; 1966 log.u_bbr.flex3 = way_out; 1967 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 1968 if (rack->rack_no_prr) 1969 log.u_bbr.flex5 = 0; 1970 else 1971 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 1972 log.u_bbr.applimited = 
rack->r_ctl.rc_pace_min_segs; 1973 log.u_bbr.flex7 = rack->r_wanted_output; 1974 log.u_bbr.flex8 = rack->rc_in_persist; 1975 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 1976 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 1977 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 1978 TCP_LOG_EVENTP(rack->rc_tp, NULL, 1979 &rack->rc_inp->inp_socket->so_rcv, 1980 &rack->rc_inp->inp_socket->so_snd, 1981 BBR_LOG_DOSEG_DONE, 0, 1982 0, &log, false, &tv); 1983 } 1984 } 1985 1986 static void 1987 rack_log_type_hrdwtso(struct tcpcb *tp, struct tcp_rack *rack, int len, int mod, int32_t orig_len, int frm) 1988 { 1989 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 1990 union tcp_log_stackspecific log; 1991 struct timeval tv; 1992 uint32_t cts; 1993 1994 memset(&log, 0, sizeof(log)); 1995 cts = tcp_get_usecs(&tv); 1996 log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs; 1997 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 1998 log.u_bbr.flex4 = len; 1999 log.u_bbr.flex5 = orig_len; 2000 log.u_bbr.flex6 = rack->r_ctl.rc_sacked; 2001 log.u_bbr.flex7 = mod; 2002 log.u_bbr.flex8 = frm; 2003 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2004 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2005 TCP_LOG_EVENTP(tp, NULL, 2006 &tp->t_inpcb->inp_socket->so_rcv, 2007 &tp->t_inpcb->inp_socket->so_snd, 2008 TCP_HDWR_TLS, 0, 2009 0, &log, false, &tv); 2010 } 2011 } 2012 2013 static void 2014 rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, 2015 uint8_t hpts_calling, int reason, uint32_t cwnd_to_use) 2016 { 2017 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2018 union tcp_log_stackspecific log; 2019 struct timeval tv; 2020 2021 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2022 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 2023 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 2024 log.u_bbr.flex1 = slot; 2025 log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; 2026 log.u_bbr.flex4 = reason; 2027 if (rack->rack_no_prr) 2028 log.u_bbr.flex5 = 0; 2029 else 2030 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 2031 log.u_bbr.flex7 = hpts_calling; 2032 log.u_bbr.flex8 = rack->rc_in_persist; 2033 log.u_bbr.lt_epoch = cwnd_to_use; 2034 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2035 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2036 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2037 &rack->rc_inp->inp_socket->so_rcv, 2038 &rack->rc_inp->inp_socket->so_snd, 2039 BBR_LOG_JUSTRET, 0, 2040 tlen, &log, false, &tv); 2041 } 2042 } 2043 2044 static void 2045 rack_log_to_cancel(struct tcp_rack *rack, int32_t hpts_removed, int line, uint32_t us_cts, 2046 struct timeval *tv, uint32_t flags_on_entry) 2047 { 2048 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2049 union tcp_log_stackspecific log; 2050 2051 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2052 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 2053 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 2054 log.u_bbr.flex1 = line; 2055 log.u_bbr.flex2 = rack->r_ctl.rc_last_output_to; 2056 log.u_bbr.flex3 = flags_on_entry; 2057 log.u_bbr.flex4 = us_cts; 2058 if (rack->rack_no_prr) 2059 log.u_bbr.flex5 = 0; 2060 else 2061 log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; 2062 log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; 2063 log.u_bbr.flex7 = hpts_removed; 2064 log.u_bbr.flex8 = 1; 2065 log.u_bbr.applimited = rack->r_ctl.rc_hpts_flags; 2066 log.u_bbr.timeStamp = us_cts; 2067 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2068 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2069 
&rack->rc_inp->inp_socket->so_rcv, 2070 &rack->rc_inp->inp_socket->so_snd, 2071 BBR_LOG_TIMERCANC, 0, 2072 0, &log, false, tv); 2073 } 2074 } 2075 2076 static void 2077 rack_log_alt_to_to_cancel(struct tcp_rack *rack, 2078 uint32_t flex1, uint32_t flex2, 2079 uint32_t flex3, uint32_t flex4, 2080 uint32_t flex5, uint32_t flex6, 2081 uint16_t flex7, uint8_t mod) 2082 { 2083 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2084 union tcp_log_stackspecific log; 2085 struct timeval tv; 2086 2087 if (mod == 1) { 2088 /* No you can't use 1, its for the real to cancel */ 2089 return; 2090 } 2091 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2092 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2093 log.u_bbr.flex1 = flex1; 2094 log.u_bbr.flex2 = flex2; 2095 log.u_bbr.flex3 = flex3; 2096 log.u_bbr.flex4 = flex4; 2097 log.u_bbr.flex5 = flex5; 2098 log.u_bbr.flex6 = flex6; 2099 log.u_bbr.flex7 = flex7; 2100 log.u_bbr.flex8 = mod; 2101 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2102 &rack->rc_inp->inp_socket->so_rcv, 2103 &rack->rc_inp->inp_socket->so_snd, 2104 BBR_LOG_TIMERCANC, 0, 2105 0, &log, false, &tv); 2106 } 2107 } 2108 2109 static void 2110 rack_log_to_processing(struct tcp_rack *rack, uint32_t cts, int32_t ret, int32_t timers) 2111 { 2112 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2113 union tcp_log_stackspecific log; 2114 struct timeval tv; 2115 2116 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2117 log.u_bbr.flex1 = timers; 2118 log.u_bbr.flex2 = ret; 2119 log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp; 2120 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 2121 log.u_bbr.flex5 = cts; 2122 if (rack->rack_no_prr) 2123 log.u_bbr.flex6 = 0; 2124 else 2125 log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt; 2126 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2127 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2128 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2129 &rack->rc_inp->inp_socket->so_rcv, 2130 &rack->rc_inp->inp_socket->so_snd, 2131 BBR_LOG_TO_PROCESS, 0, 2132 0, &log, false, &tv); 2133 } 2134 } 2135 2136 static void 2137 rack_log_to_prr(struct tcp_rack *rack, int frm, int orig_cwnd) 2138 { 2139 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2140 union tcp_log_stackspecific log; 2141 struct timeval tv; 2142 2143 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2144 log.u_bbr.flex1 = rack->r_ctl.rc_prr_out; 2145 log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs; 2146 if (rack->rack_no_prr) 2147 log.u_bbr.flex3 = 0; 2148 else 2149 log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt; 2150 log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered; 2151 log.u_bbr.flex5 = rack->r_ctl.rc_sacked; 2152 log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt; 2153 log.u_bbr.flex8 = frm; 2154 log.u_bbr.pkts_out = orig_cwnd; 2155 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2156 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2157 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2158 &rack->rc_inp->inp_socket->so_rcv, 2159 &rack->rc_inp->inp_socket->so_snd, 2160 BBR_LOG_BBRUPD, 0, 2161 0, &log, false, &tv); 2162 } 2163 } 2164 2165 #ifdef NETFLIX_EXP_DETECTION 2166 static void 2167 rack_log_sad(struct tcp_rack *rack, int event) 2168 { 2169 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2170 union tcp_log_stackspecific log; 2171 struct timeval tv; 2172 2173 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2174 log.u_bbr.flex1 = rack->r_ctl.sack_count; 2175 log.u_bbr.flex2 = rack->r_ctl.ack_count; 2176 log.u_bbr.flex3 = rack->r_ctl.sack_moved_extra; 2177 log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; 2178 log.u_bbr.flex5 = 
rack->r_ctl.rc_num_maps_alloced; 2179 log.u_bbr.flex6 = tcp_sack_to_ack_thresh; 2180 log.u_bbr.pkts_out = tcp_sack_to_move_thresh; 2181 log.u_bbr.lt_epoch = (tcp_force_detection << 8); 2182 log.u_bbr.lt_epoch |= rack->do_detection; 2183 log.u_bbr.applimited = tcp_map_minimum; 2184 log.u_bbr.flex7 = rack->sack_attack_disable; 2185 log.u_bbr.flex8 = event; 2186 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2187 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2188 log.u_bbr.delivered = tcp_sad_decay_val; 2189 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2190 &rack->rc_inp->inp_socket->so_rcv, 2191 &rack->rc_inp->inp_socket->so_snd, 2192 TCP_SAD_DETECTION, 0, 2193 0, &log, false, &tv); 2194 } 2195 } 2196 #endif 2197 2198 static void 2199 rack_counter_destroy(void) 2200 { 2201 counter_u64_free(rack_ack_total); 2202 counter_u64_free(rack_express_sack); 2203 counter_u64_free(rack_sack_total); 2204 counter_u64_free(rack_move_none); 2205 counter_u64_free(rack_move_some); 2206 counter_u64_free(rack_sack_attacks_detected); 2207 counter_u64_free(rack_sack_attacks_reversed); 2208 counter_u64_free(rack_sack_used_next_merge); 2209 counter_u64_free(rack_sack_used_prev_merge); 2210 counter_u64_free(rack_badfr); 2211 counter_u64_free(rack_badfr_bytes); 2212 counter_u64_free(rack_rtm_prr_retran); 2213 counter_u64_free(rack_rtm_prr_newdata); 2214 counter_u64_free(rack_timestamp_mismatch); 2215 counter_u64_free(rack_find_high); 2216 counter_u64_free(rack_reorder_seen); 2217 counter_u64_free(rack_tlp_tot); 2218 counter_u64_free(rack_tlp_newdata); 2219 counter_u64_free(rack_tlp_retran); 2220 counter_u64_free(rack_tlp_retran_bytes); 2221 counter_u64_free(rack_tlp_retran_fail); 2222 counter_u64_free(rack_to_tot); 2223 counter_u64_free(rack_to_arm_rack); 2224 counter_u64_free(rack_to_arm_tlp); 2225 counter_u64_free(rack_calc_zero); 2226 counter_u64_free(rack_calc_nonzero); 2227 counter_u64_free(rack_paced_segments); 2228 counter_u64_free(rack_unpaced_segments); 2229 counter_u64_free(rack_saw_enobuf); 2230 counter_u64_free(rack_saw_enetunreach); 2231 counter_u64_free(rack_to_alloc); 2232 counter_u64_free(rack_to_alloc_hard); 2233 counter_u64_free(rack_to_alloc_emerg); 2234 counter_u64_free(rack_to_alloc_limited); 2235 counter_u64_free(rack_alloc_limited_conns); 2236 counter_u64_free(rack_split_limited); 2237 counter_u64_free(rack_sack_proc_all); 2238 counter_u64_free(rack_sack_proc_restart); 2239 counter_u64_free(rack_sack_proc_short); 2240 counter_u64_free(rack_enter_tlp_calc); 2241 counter_u64_free(rack_used_tlpmethod); 2242 counter_u64_free(rack_used_tlpmethod2); 2243 counter_u64_free(rack_sack_skipped_acked); 2244 counter_u64_free(rack_sack_splits); 2245 counter_u64_free(rack_progress_drops); 2246 counter_u64_free(rack_input_idle_reduces); 2247 counter_u64_free(rack_collapsed_win); 2248 counter_u64_free(rack_tlp_does_nada); 2249 counter_u64_free(rack_try_scwnd); 2250 counter_u64_free(rack_tls_rwnd); 2251 counter_u64_free(rack_tls_cwnd); 2252 counter_u64_free(rack_tls_app); 2253 counter_u64_free(rack_tls_other); 2254 counter_u64_free(rack_tls_filled); 2255 counter_u64_free(rack_tls_rxt); 2256 counter_u64_free(rack_tls_tlp); 2257 counter_u64_free(rack_per_timer_hole); 2258 COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE); 2259 COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE); 2260 } 2261 2262 static struct rack_sendmap * 2263 rack_alloc(struct tcp_rack *rack) 2264 { 2265 struct rack_sendmap *rsm; 2266 2267 rsm = uma_zalloc(rack_zone, M_NOWAIT); 2268 if (rsm) { 2269 
rack->r_ctl.rc_num_maps_alloced++; 2270 counter_u64_add(rack_to_alloc, 1); 2271 return (rsm); 2272 } 2273 if (rack->rc_free_cnt) { 2274 counter_u64_add(rack_to_alloc_emerg, 1); 2275 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 2276 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 2277 rack->rc_free_cnt--; 2278 return (rsm); 2279 } 2280 return (NULL); 2281 } 2282 2283 static struct rack_sendmap * 2284 rack_alloc_full_limit(struct tcp_rack *rack) 2285 { 2286 if ((V_tcp_map_entries_limit > 0) && 2287 (rack->do_detection == 0) && 2288 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 2289 counter_u64_add(rack_to_alloc_limited, 1); 2290 if (!rack->alloc_limit_reported) { 2291 rack->alloc_limit_reported = 1; 2292 counter_u64_add(rack_alloc_limited_conns, 1); 2293 } 2294 return (NULL); 2295 } 2296 return (rack_alloc(rack)); 2297 } 2298 2299 /* wrapper to allocate a sendmap entry, subject to a specific limit */ 2300 static struct rack_sendmap * 2301 rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type) 2302 { 2303 struct rack_sendmap *rsm; 2304 2305 if (limit_type) { 2306 /* currently there is only one limit type */ 2307 if (V_tcp_map_split_limit > 0 && 2308 (rack->do_detection == 0) && 2309 rack->r_ctl.rc_num_split_allocs >= V_tcp_map_split_limit) { 2310 counter_u64_add(rack_split_limited, 1); 2311 if (!rack->alloc_limit_reported) { 2312 rack->alloc_limit_reported = 1; 2313 counter_u64_add(rack_alloc_limited_conns, 1); 2314 } 2315 return (NULL); 2316 } 2317 } 2318 2319 /* allocate and mark in the limit type, if set */ 2320 rsm = rack_alloc(rack); 2321 if (rsm != NULL && limit_type) { 2322 rsm->r_limit_type = limit_type; 2323 rack->r_ctl.rc_num_split_allocs++; 2324 } 2325 return (rsm); 2326 } 2327 2328 static void 2329 rack_free(struct tcp_rack *rack, struct rack_sendmap *rsm) 2330 { 2331 if (rsm->r_flags & RACK_APP_LIMITED) { 2332 if (rack->r_ctl.rc_app_limited_cnt > 0) { 2333 rack->r_ctl.rc_app_limited_cnt--; 2334 } 2335 } 2336 if (rsm->r_limit_type) { 2337 /* currently there is only one limit type */ 2338 rack->r_ctl.rc_num_split_allocs--; 2339 } 2340 if (rsm == rack->r_ctl.rc_first_appl) { 2341 if (rack->r_ctl.rc_app_limited_cnt == 0) 2342 rack->r_ctl.rc_first_appl = NULL; 2343 else { 2344 /* Follow the next one out */ 2345 struct rack_sendmap fe; 2346 2347 fe.r_start = rsm->r_nseq_appl; 2348 rack->r_ctl.rc_first_appl = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 2349 } 2350 } 2351 if (rsm == rack->r_ctl.rc_resend) 2352 rack->r_ctl.rc_resend = NULL; 2353 if (rsm == rack->r_ctl.rc_rsm_at_retran) 2354 rack->r_ctl.rc_rsm_at_retran = NULL; 2355 if (rsm == rack->r_ctl.rc_end_appl) 2356 rack->r_ctl.rc_end_appl = NULL; 2357 if (rack->r_ctl.rc_tlpsend == rsm) 2358 rack->r_ctl.rc_tlpsend = NULL; 2359 if (rack->r_ctl.rc_sacklast == rsm) 2360 rack->r_ctl.rc_sacklast = NULL; 2361 if (rack->rc_free_cnt < rack_free_cache) { 2362 memset(rsm, 0, sizeof(struct rack_sendmap)); 2363 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext); 2364 rsm->r_limit_type = 0; 2365 rack->rc_free_cnt++; 2366 return; 2367 } 2368 rack->r_ctl.rc_num_maps_alloced--; 2369 uma_zfree(rack_zone, rsm); 2370 } 2371 2372 static uint32_t 2373 rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack) 2374 { 2375 uint64_t srtt, bw, len, tim; 2376 uint32_t segsiz, def_len, minl; 2377 2378 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 2379 def_len = rack_def_data_window * segsiz; 2380 if (rack->rc_gp_filled == 0) { 2381 /* 2382 * We have no measurement (IW is in flight?) 
so 2383 * we can only guess using our data_window sysctl 2384 * value (usually 100MSS). 2385 */ 2386 return (def_len); 2387 } 2388 /* 2389 * Now we have a number of factors to consider. 2390 * 2391 * 1) We have a desired BDP which is usually 2392 * at least 2. 2393 * 2) We have a minimum number of rtt's usually 1 SRTT 2394 * but we allow it too to be more. 2395 * 3) We want to make sure a measurement last N useconds (if 2396 * we have set rack_min_measure_usec. 2397 * 2398 * We handle the first concern here by trying to create a data 2399 * window of max(rack_def_data_window, DesiredBDP). The 2400 * second concern we handle in not letting the measurement 2401 * window end normally until at least the required SRTT's 2402 * have gone by which is done further below in 2403 * rack_enough_for_measurement(). Finally the third concern 2404 * we also handle here by calculating how long that time 2405 * would take at the current BW and then return the 2406 * max of our first calculation and that length. Note 2407 * that if rack_min_measure_usec is 0, we don't deal 2408 * with concern 3. Also for both Concern 1 and 3 an 2409 * application limited period could end the measurement 2410 * earlier. 2411 * 2412 * So lets calculate the BDP with the "known" b/w using 2413 * the SRTT has our rtt and then multiply it by the 2414 * goal. 2415 */ 2416 bw = rack_get_bw(rack); 2417 srtt = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT); 2418 len = bw * srtt; 2419 len /= (uint64_t)HPTS_USEC_IN_SEC; 2420 len *= max(1, rack_goal_bdp); 2421 /* Now we need to round up to the nearest MSS */ 2422 len = roundup(len, segsiz); 2423 if (rack_min_measure_usec) { 2424 /* Now calculate our min length for this b/w */ 2425 tim = rack_min_measure_usec; 2426 minl = (tim * bw) / (uint64_t)HPTS_USEC_IN_SEC; 2427 if (minl == 0) 2428 minl = 1; 2429 minl = roundup(minl, segsiz); 2430 if (len < minl) 2431 len = minl; 2432 } 2433 /* 2434 * Now if we have a very small window we want 2435 * to attempt to get the window that is 2436 * as small as possible. This happens on 2437 * low b/w connections and we don't want to 2438 * span huge numbers of rtt's between measurements. 2439 * 2440 * We basically include 2 over our "MIN window" so 2441 * that the measurement can be shortened (possibly) by 2442 * an ack'ed packet. 2443 */ 2444 if (len < def_len) 2445 return (max((uint32_t)len, ((MIN_GP_WIN+2) * segsiz))); 2446 else 2447 return (max((uint32_t)len, def_len)); 2448 2449 } 2450 2451 static int 2452 rack_enough_for_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack) 2453 { 2454 uint32_t tim, srtts, segsiz; 2455 2456 /* 2457 * Has enough time passed for the GP measurement to be valid? 2458 */ 2459 if ((tp->snd_max == tp->snd_una) || 2460 (th_ack == tp->snd_max)){ 2461 /* All is acked */ 2462 return (1); 2463 } 2464 if (SEQ_LT(th_ack, tp->gput_seq)) { 2465 /* Not enough bytes yet */ 2466 return (0); 2467 } 2468 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 2469 if (SEQ_LT(th_ack, tp->gput_ack) && 2470 ((th_ack - tp->gput_seq) < max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { 2471 /* Not enough bytes yet */ 2472 return (0); 2473 } 2474 if (rack->r_ctl.rc_first_appl && 2475 (rack->r_ctl.rc_first_appl->r_start == th_ack)) { 2476 /* 2477 * We are up to the app limited point 2478 * we have to measure irrespective of the time.. 2479 */ 2480 return (1); 2481 } 2482 /* Now what about time? 
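* The measurement must span at least rack_min_srtts worth of the
* current goodput srtt. As a rough, made-up example: with
* rc_gp_srtt at 40000 usecs and rack_min_srtts at 1, srtts below
* is 40000, so an ack landing 35000 usecs after gput_ts is not
* yet enough and we return 0.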
*/ 2483 srtts = (rack->r_ctl.rc_gp_srtt * rack_min_srtts); 2484 tim = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - tp->gput_ts; 2485 if (tim >= srtts) { 2486 return (1); 2487 } 2488 /* Nope not even a full SRTT has passed */ 2489 return (0); 2490 } 2491 2492 2493 static void 2494 rack_log_timely(struct tcp_rack *rack, 2495 uint32_t logged, uint64_t cur_bw, uint64_t low_bnd, 2496 uint64_t up_bnd, int line, uint8_t method) 2497 { 2498 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2499 union tcp_log_stackspecific log; 2500 struct timeval tv; 2501 2502 memset(&log, 0, sizeof(log)); 2503 log.u_bbr.flex1 = logged; 2504 log.u_bbr.flex2 = rack->rc_gp_timely_inc_cnt; 2505 log.u_bbr.flex2 <<= 4; 2506 log.u_bbr.flex2 |= rack->rc_gp_timely_dec_cnt; 2507 log.u_bbr.flex2 <<= 4; 2508 log.u_bbr.flex2 |= rack->rc_gp_incr; 2509 log.u_bbr.flex2 <<= 4; 2510 log.u_bbr.flex2 |= rack->rc_gp_bwred; 2511 log.u_bbr.flex3 = rack->rc_gp_incr; 2512 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; 2513 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ca; 2514 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_rec; 2515 log.u_bbr.flex7 = rack->rc_gp_bwred; 2516 log.u_bbr.flex8 = method; 2517 log.u_bbr.cur_del_rate = cur_bw; 2518 log.u_bbr.delRate = low_bnd; 2519 log.u_bbr.bw_inuse = up_bnd; 2520 log.u_bbr.rttProp = rack_get_bw(rack); 2521 log.u_bbr.pkt_epoch = line; 2522 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; 2523 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2524 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2525 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; 2526 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; 2527 log.u_bbr.cwnd_gain = rack->rc_dragged_bottom; 2528 log.u_bbr.cwnd_gain <<= 1; 2529 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_rec; 2530 log.u_bbr.cwnd_gain <<= 1; 2531 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; 2532 log.u_bbr.cwnd_gain <<= 1; 2533 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; 2534 log.u_bbr.lost = rack->r_ctl.rc_loss_count; 2535 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2536 &rack->rc_inp->inp_socket->so_rcv, 2537 &rack->rc_inp->inp_socket->so_snd, 2538 TCP_TIMELY_WORK, 0, 2539 0, &log, false, &tv); 2540 } 2541 } 2542 2543 static int 2544 rack_bw_can_be_raised(struct tcp_rack *rack, uint64_t cur_bw, uint64_t last_bw_est, uint16_t mult) 2545 { 2546 /* 2547 * Before we increase we need to know if 2548 * the estimate just made was less than 2549 * our pacing goal (i.e. (cur_bw * mult) > last_bw_est) 2550 * 2551 * If we already are pacing at a fast enough 2552 * rate to push us faster there is no sense of 2553 * increasing. 2554 * 2555 * We first caculate our actual pacing rate (ss or ca multipler 2556 * times our cur_bw). 2557 * 2558 * Then we take the last measured rate and multipy by our 2559 * maximum pacing overage to give us a max allowable rate. 2560 * 2561 * If our act_rate is smaller than our max_allowable rate 2562 * then we should increase. Else we should hold steady. 2563 * 2564 */ 2565 uint64_t act_rate, max_allow_rate; 2566 2567 if (rack_timely_no_stopping) 2568 return (1); 2569 2570 if ((cur_bw == 0) || (last_bw_est == 0)) { 2571 /* 2572 * Initial startup case or 2573 * everything is acked case. 2574 */ 2575 rack_log_timely(rack, mult, cur_bw, 0, 0, 2576 __LINE__, 9); 2577 return (1); 2578 } 2579 if (mult <= 100) { 2580 /* 2581 * We can always pace at or slightly above our rate. 
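* (At or below 100 we are not even trying to exceed the last
* measurement, so raising is always safe. For the general case
* handled below -- numbers invented for illustration -- a mult
* of 120 on a 10 Mbps cur_bw gives an act_rate of 12 Mbps, which
* is compared against last_bw_est scaled up by rack_max_per_above
* percent; with rack_max_per_above at, say, 10, any last_bw_est
* of 11 Mbps or more still allows the increase.)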
2582 */ 2583 rack_log_timely(rack, mult, cur_bw, 0, 0, 2584 __LINE__, 9); 2585 return (1); 2586 } 2587 act_rate = cur_bw * (uint64_t)mult; 2588 act_rate /= 100; 2589 max_allow_rate = last_bw_est * ((uint64_t)rack_max_per_above + (uint64_t)100); 2590 max_allow_rate /= 100; 2591 if (act_rate < max_allow_rate) { 2592 /* 2593 * Here the rate we are actually pacing at 2594 * is smaller than 10% above our last measurement. 2595 * This means we are pacing below what we would 2596 * like to try to achieve (plus some wiggle room). 2597 */ 2598 rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate, 2599 __LINE__, 9); 2600 return (1); 2601 } else { 2602 /* 2603 * Here we are already pacing at least rack_max_per_above(10%) 2604 * what we are getting back. This indicates most likely 2605 * that we are being limited (cwnd/rwnd/app) and can't 2606 * get any more b/w. There is no sense of trying to 2607 * raise up the pacing rate its not speeding us up 2608 * and we already are pacing faster than we are getting. 2609 */ 2610 rack_log_timely(rack, mult, cur_bw, act_rate, max_allow_rate, 2611 __LINE__, 8); 2612 return (0); 2613 } 2614 } 2615 2616 static void 2617 rack_validate_multipliers_at_or_above100(struct tcp_rack *rack) 2618 { 2619 /* 2620 * When we drag bottom, we want to assure 2621 * that no multiplier is below 1.0, if so 2622 * we want to restore it to at least that. 2623 */ 2624 if (rack->r_ctl.rack_per_of_gp_rec < 100) { 2625 /* This is unlikely we usually do not touch recovery */ 2626 rack->r_ctl.rack_per_of_gp_rec = 100; 2627 } 2628 if (rack->r_ctl.rack_per_of_gp_ca < 100) { 2629 rack->r_ctl.rack_per_of_gp_ca = 100; 2630 } 2631 if (rack->r_ctl.rack_per_of_gp_ss < 100) { 2632 rack->r_ctl.rack_per_of_gp_ss = 100; 2633 } 2634 } 2635 2636 static void 2637 rack_validate_multipliers_at_or_below_100(struct tcp_rack *rack) 2638 { 2639 if (rack->r_ctl.rack_per_of_gp_ca > 100) { 2640 rack->r_ctl.rack_per_of_gp_ca = 100; 2641 } 2642 if (rack->r_ctl.rack_per_of_gp_ss > 100) { 2643 rack->r_ctl.rack_per_of_gp_ss = 100; 2644 } 2645 } 2646 2647 static void 2648 rack_increase_bw_mul(struct tcp_rack *rack, int timely_says, uint64_t cur_bw, uint64_t last_bw_est, int override) 2649 { 2650 int32_t calc, logged, plus; 2651 2652 logged = 0; 2653 2654 if (override) { 2655 /* 2656 * override is passed when we are 2657 * loosing b/w and making one last 2658 * gasp at trying to not loose out 2659 * to a new-reno flow. 2660 */ 2661 goto extra_boost; 2662 } 2663 /* In classic timely we boost by 5x if we have 5 increases in a row, lets not */ 2664 if (rack->rc_gp_incr && 2665 ((rack->rc_gp_timely_inc_cnt + 1) >= RACK_TIMELY_CNT_BOOST)) { 2666 /* 2667 * Reset and get 5 strokes more before the boost. Note 2668 * that the count is 0 based so we have to add one. 
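* As a rough example (values assumed here, check the actual
* definitions): with rack_gp_increase_per at 1 and
* RACK_TIMELY_CNT_BOOST at 5, the boosted step below is a 5
* percentage point bump instead of the usual 1.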
2669 */ 2670 extra_boost: 2671 plus = (uint32_t)rack_gp_increase_per * RACK_TIMELY_CNT_BOOST; 2672 rack->rc_gp_timely_inc_cnt = 0; 2673 } else 2674 plus = (uint32_t)rack_gp_increase_per; 2675 /* Must be at least 1% increase for true timely increases */ 2676 if ((plus < 1) && 2677 ((rack->r_ctl.rc_rtt_diff <= 0) || (timely_says <= 0))) 2678 plus = 1; 2679 if (rack->rc_gp_saw_rec && 2680 (rack->rc_gp_no_rec_chg == 0) && 2681 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 2682 rack->r_ctl.rack_per_of_gp_rec)) { 2683 /* We have been in recovery ding it too */ 2684 calc = rack->r_ctl.rack_per_of_gp_rec + plus; 2685 if (calc > 0xffff) 2686 calc = 0xffff; 2687 logged |= 1; 2688 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)calc; 2689 if (rack_per_upper_bound_ss && 2690 (rack->rc_dragged_bottom == 0) && 2691 (rack->r_ctl.rack_per_of_gp_rec > rack_per_upper_bound_ss)) 2692 rack->r_ctl.rack_per_of_gp_rec = rack_per_upper_bound_ss; 2693 } 2694 if (rack->rc_gp_saw_ca && 2695 (rack->rc_gp_saw_ss == 0) && 2696 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 2697 rack->r_ctl.rack_per_of_gp_ca)) { 2698 /* In CA */ 2699 calc = rack->r_ctl.rack_per_of_gp_ca + plus; 2700 if (calc > 0xffff) 2701 calc = 0xffff; 2702 logged |= 2; 2703 rack->r_ctl.rack_per_of_gp_ca = (uint16_t)calc; 2704 if (rack_per_upper_bound_ca && 2705 (rack->rc_dragged_bottom == 0) && 2706 (rack->r_ctl.rack_per_of_gp_ca > rack_per_upper_bound_ca)) 2707 rack->r_ctl.rack_per_of_gp_ca = rack_per_upper_bound_ca; 2708 } 2709 if (rack->rc_gp_saw_ss && 2710 rack_bw_can_be_raised(rack, cur_bw, last_bw_est, 2711 rack->r_ctl.rack_per_of_gp_ss)) { 2712 /* In SS */ 2713 calc = rack->r_ctl.rack_per_of_gp_ss + plus; 2714 if (calc > 0xffff) 2715 calc = 0xffff; 2716 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)calc; 2717 if (rack_per_upper_bound_ss && 2718 (rack->rc_dragged_bottom == 0) && 2719 (rack->r_ctl.rack_per_of_gp_ss > rack_per_upper_bound_ss)) 2720 rack->r_ctl.rack_per_of_gp_ss = rack_per_upper_bound_ss; 2721 logged |= 4; 2722 } 2723 if (logged && 2724 (rack->rc_gp_incr == 0)){ 2725 /* Go into increment mode */ 2726 rack->rc_gp_incr = 1; 2727 rack->rc_gp_timely_inc_cnt = 0; 2728 } 2729 if (rack->rc_gp_incr && 2730 logged && 2731 (rack->rc_gp_timely_inc_cnt < RACK_TIMELY_CNT_BOOST)) { 2732 rack->rc_gp_timely_inc_cnt++; 2733 } 2734 rack_log_timely(rack, logged, plus, 0, 0, 2735 __LINE__, 1); 2736 } 2737 2738 static uint32_t 2739 rack_get_decrease(struct tcp_rack *rack, uint32_t curper, int32_t rtt_diff) 2740 { 2741 /* 2742 * norm_grad = rtt_diff / minrtt; 2743 * new_per = curper * (1 - B * norm_grad) 2744 * 2745 * B = rack_gp_decrease_per (default 10%) 2746 * rtt_dif = input var current rtt-diff 2747 * curper = input var current percentage 2748 * minrtt = from rack filter 2749 * 2750 */ 2751 uint64_t perf; 2752 2753 perf = (((uint64_t)curper * ((uint64_t)1000000 - 2754 ((uint64_t)rack_gp_decrease_per * (uint64_t)10000 * 2755 (((uint64_t)rtt_diff * (uint64_t)1000000)/ 2756 (uint64_t)get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)))/ 2757 (uint64_t)1000000)) / 2758 (uint64_t)1000000); 2759 if (perf > curper) { 2760 /* TSNH */ 2761 perf = curper - 1; 2762 } 2763 return ((uint32_t)perf); 2764 } 2765 2766 static uint32_t 2767 rack_decrease_highrtt(struct tcp_rack *rack, uint32_t curper, uint32_t rtt) 2768 { 2769 /* 2770 * highrttthresh 2771 * result = curper * (1 - (B * ( 1 - ------ )) 2772 * gp_srtt 2773 * 2774 * B = rack_gp_decrease_per (default 10%) 2775 * highrttthresh = filter_min * rack_gp_rtt_maxmul 2776 */ 2777 uint64_t perf; 2778 uint32_t 
highrttthresh; 2779 2780 highrttthresh = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; 2781 2782 perf = (((uint64_t)curper * ((uint64_t)1000000 - 2783 ((uint64_t)rack_gp_decrease_per * ((uint64_t)1000000 - 2784 ((uint64_t)highrttthresh * (uint64_t)1000000) / 2785 (uint64_t)rtt)) / 100)) /(uint64_t)1000000); 2786 return (perf); 2787 } 2788 2789 2790 static void 2791 rack_decrease_bw_mul(struct tcp_rack *rack, int timely_says, uint32_t rtt, int32_t rtt_diff) 2792 { 2793 uint64_t logvar, logvar2, logvar3; 2794 uint32_t logged, new_per, ss_red, ca_red, rec_red, alt, val; 2795 2796 if (rack->rc_gp_incr) { 2797 /* Turn off increment counting */ 2798 rack->rc_gp_incr = 0; 2799 rack->rc_gp_timely_inc_cnt = 0; 2800 } 2801 ss_red = ca_red = rec_red = 0; 2802 logged = 0; 2803 /* Calculate the reduction value */ 2804 if (rtt_diff < 0) { 2805 rtt_diff *= -1; 2806 } 2807 /* Must be at least 1% reduction */ 2808 if (rack->rc_gp_saw_rec && (rack->rc_gp_no_rec_chg == 0)) { 2809 /* We have been in recovery ding it too */ 2810 if (timely_says == 2) { 2811 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_rec, rtt); 2812 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 2813 if (alt < new_per) 2814 val = alt; 2815 else 2816 val = new_per; 2817 } else 2818 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 2819 if (rack->r_ctl.rack_per_of_gp_rec > val) { 2820 rec_red = (rack->r_ctl.rack_per_of_gp_rec - val); 2821 rack->r_ctl.rack_per_of_gp_rec = (uint16_t)val; 2822 } else { 2823 rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; 2824 rec_red = 0; 2825 } 2826 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_rec) 2827 rack->r_ctl.rack_per_of_gp_rec = rack_per_lower_bound; 2828 logged |= 1; 2829 } 2830 if (rack->rc_gp_saw_ss) { 2831 /* Sent in SS */ 2832 if (timely_says == 2) { 2833 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ss, rtt); 2834 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 2835 if (alt < new_per) 2836 val = alt; 2837 else 2838 val = new_per; 2839 } else 2840 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ss, rtt_diff); 2841 if (rack->r_ctl.rack_per_of_gp_ss > new_per) { 2842 ss_red = rack->r_ctl.rack_per_of_gp_ss - val; 2843 rack->r_ctl.rack_per_of_gp_ss = (uint16_t)val; 2844 } else { 2845 ss_red = new_per; 2846 rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; 2847 logvar = new_per; 2848 logvar <<= 32; 2849 logvar |= alt; 2850 logvar2 = (uint32_t)rtt; 2851 logvar2 <<= 32; 2852 logvar2 |= (uint32_t)rtt_diff; 2853 logvar3 = rack_gp_rtt_maxmul; 2854 logvar3 <<= 32; 2855 logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 2856 rack_log_timely(rack, timely_says, 2857 logvar2, logvar3, 2858 logvar, __LINE__, 10); 2859 } 2860 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ss) 2861 rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; 2862 logged |= 4; 2863 } else if (rack->rc_gp_saw_ca) { 2864 /* Sent in CA */ 2865 if (timely_says == 2) { 2866 new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ca, rtt); 2867 alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_rec, rtt_diff); 2868 if (alt < new_per) 2869 val = alt; 2870 else 2871 val = new_per; 2872 } else 2873 val = new_per = alt = rack_get_decrease(rack, rack->r_ctl.rack_per_of_gp_ca, rtt_diff); 2874 if (rack->r_ctl.rack_per_of_gp_ca > val) { 2875 ca_red = rack->r_ctl.rack_per_of_gp_ca - val; 2876 rack->r_ctl.rack_per_of_gp_ca = 
(uint16_t)val; 2877 } else { 2878 rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; 2879 ca_red = 0; 2880 logvar = new_per; 2881 logvar <<= 32; 2882 logvar |= alt; 2883 logvar2 = (uint32_t)rtt; 2884 logvar2 <<= 32; 2885 logvar2 |= (uint32_t)rtt_diff; 2886 logvar3 = rack_gp_rtt_maxmul; 2887 logvar3 <<= 32; 2888 logvar3 |= get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 2889 rack_log_timely(rack, timely_says, 2890 logvar2, logvar3, 2891 logvar, __LINE__, 10); 2892 } 2893 if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ca) 2894 rack->r_ctl.rack_per_of_gp_ca = rack_per_lower_bound; 2895 logged |= 2; 2896 } 2897 if (rack->rc_gp_timely_dec_cnt < 0x7) { 2898 rack->rc_gp_timely_dec_cnt++; 2899 if (rack_timely_dec_clear && 2900 (rack->rc_gp_timely_dec_cnt == rack_timely_dec_clear)) 2901 rack->rc_gp_timely_dec_cnt = 0; 2902 } 2903 logvar = ss_red; 2904 logvar <<= 32; 2905 logvar |= ca_red; 2906 rack_log_timely(rack, logged, rec_red, rack_per_lower_bound, logvar, 2907 __LINE__, 2); 2908 } 2909 2910 static void 2911 rack_log_rtt_shrinks(struct tcp_rack *rack, uint32_t us_cts, 2912 uint32_t rtt, uint32_t line, uint8_t reas) 2913 { 2914 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 2915 union tcp_log_stackspecific log; 2916 struct timeval tv; 2917 2918 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 2919 log.u_bbr.flex1 = line; 2920 log.u_bbr.flex2 = rack->r_ctl.rc_time_probertt_starts; 2921 log.u_bbr.flex3 = rack->r_ctl.rc_lower_rtt_us_cts; 2922 log.u_bbr.flex4 = rack->r_ctl.rack_per_of_gp_ss; 2923 log.u_bbr.flex5 = rtt; 2924 log.u_bbr.flex6 = rack->rc_highly_buffered; 2925 log.u_bbr.flex6 <<= 1; 2926 log.u_bbr.flex6 |= rack->forced_ack; 2927 log.u_bbr.flex6 <<= 1; 2928 log.u_bbr.flex6 |= rack->rc_gp_dyn_mul; 2929 log.u_bbr.flex6 <<= 1; 2930 log.u_bbr.flex6 |= rack->in_probe_rtt; 2931 log.u_bbr.flex6 <<= 1; 2932 log.u_bbr.flex6 |= rack->measure_saw_probe_rtt; 2933 log.u_bbr.flex7 = rack->r_ctl.rack_per_of_gp_probertt; 2934 log.u_bbr.pacing_gain = rack->r_ctl.rack_per_of_gp_ca; 2935 log.u_bbr.cwnd_gain = rack->r_ctl.rack_per_of_gp_rec; 2936 log.u_bbr.flex8 = reas; 2937 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 2938 log.u_bbr.delRate = rack_get_bw(rack); 2939 log.u_bbr.cur_del_rate = rack->r_ctl.rc_highest_us_rtt; 2940 log.u_bbr.cur_del_rate <<= 32; 2941 log.u_bbr.cur_del_rate |= rack->r_ctl.rc_lowest_us_rtt; 2942 log.u_bbr.applimited = rack->r_ctl.rc_time_probertt_entered; 2943 log.u_bbr.pkts_out = rack->r_ctl.rc_rtt_diff; 2944 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 2945 log.u_bbr.epoch = rack->r_ctl.rc_gp_srtt; 2946 log.u_bbr.lt_epoch = rack->r_ctl.rc_prev_gp_srtt; 2947 log.u_bbr.pkt_epoch = rack->r_ctl.rc_lower_rtt_us_cts; 2948 log.u_bbr.delivered = rack->r_ctl.rc_target_probertt_flight; 2949 log.u_bbr.lost = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 2950 log.u_bbr.rttProp = us_cts; 2951 log.u_bbr.rttProp <<= 32; 2952 log.u_bbr.rttProp |= rack->r_ctl.rc_entry_gp_rtt; 2953 TCP_LOG_EVENTP(rack->rc_tp, NULL, 2954 &rack->rc_inp->inp_socket->so_rcv, 2955 &rack->rc_inp->inp_socket->so_snd, 2956 BBR_LOG_RTT_SHRINKS, 0, 2957 0, &log, false, &rack->r_ctl.act_rcv_time); 2958 } 2959 } 2960 2961 static void 2962 rack_set_prtt_target(struct tcp_rack *rack, uint32_t segsiz, uint32_t rtt) 2963 { 2964 uint64_t bwdp; 2965 2966 bwdp = rack_get_bw(rack); 2967 bwdp *= (uint64_t)rtt; 2968 bwdp /= (uint64_t)HPTS_USEC_IN_SEC; 2969 rack->r_ctl.rc_target_probertt_flight = roundup((uint32_t)bwdp, segsiz); 2970 if (rack->r_ctl.rc_target_probertt_flight < (segsiz * 
rack_timely_min_segs)) { 2971 /* 2972 * A window protocol must be able to have 4 packets 2973 * outstanding as the floor in order to function 2974 * (especially considering delayed ack :D). 2975 */ 2976 rack->r_ctl.rc_target_probertt_flight = (segsiz * rack_timely_min_segs); 2977 } 2978 } 2979 2980 static void 2981 rack_enter_probertt(struct tcp_rack *rack, uint32_t us_cts) 2982 { 2983 /** 2984 * ProbeRTT is a bit different in rack_pacing than in 2985 * BBR. It is like BBR in that it uses the lowering of 2986 * the RTT as a signal that we saw something new and 2987 * counts from there for how long between. But it is 2988 * different in that its quite simple. It does not 2989 * play with the cwnd and wait until we get down 2990 * to N segments outstanding and hold that for 2991 * 200ms. Instead it just sets the pacing reduction 2992 * rate to a set percentage (70 by default) and hold 2993 * that for a number of recent GP Srtt's. 2994 */ 2995 uint32_t segsiz; 2996 2997 if (rack->rc_gp_dyn_mul == 0) 2998 return; 2999 3000 if (rack->rc_tp->snd_max == rack->rc_tp->snd_una) { 3001 /* We are idle */ 3002 return; 3003 } 3004 if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && 3005 SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { 3006 /* 3007 * Stop the goodput now, the idea here is 3008 * that future measurements with in_probe_rtt 3009 * won't register if they are not greater so 3010 * we want to get what info (if any) is available 3011 * now. 3012 */ 3013 rack_do_goodput_measurement(rack->rc_tp, rack, 3014 rack->rc_tp->snd_una, __LINE__); 3015 } 3016 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 3017 rack->r_ctl.rc_time_probertt_entered = us_cts; 3018 segsiz = min(ctf_fixed_maxseg(rack->rc_tp), 3019 rack->r_ctl.rc_pace_min_segs); 3020 rack->in_probe_rtt = 1; 3021 rack->measure_saw_probe_rtt = 1; 3022 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 3023 rack->r_ctl.rc_time_probertt_starts = 0; 3024 rack->r_ctl.rc_entry_gp_rtt = rack->r_ctl.rc_gp_srtt; 3025 if (rack_probertt_use_min_rtt_entry) 3026 rack_set_prtt_target(rack, segsiz, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); 3027 else 3028 rack_set_prtt_target(rack, segsiz, rack->r_ctl.rc_gp_srtt); 3029 rack_log_rtt_shrinks(rack, us_cts, get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3030 __LINE__, RACK_RTTS_ENTERPROBE); 3031 } 3032 3033 static void 3034 rack_exit_probertt(struct tcp_rack *rack, uint32_t us_cts) 3035 { 3036 struct rack_sendmap *rsm; 3037 uint32_t segsiz; 3038 3039 segsiz = min(ctf_fixed_maxseg(rack->rc_tp), 3040 rack->r_ctl.rc_pace_min_segs); 3041 rack->in_probe_rtt = 0; 3042 if ((rack->rc_tp->t_flags & TF_GPUTINPROG) && 3043 SEQ_GT(rack->rc_tp->snd_una, rack->rc_tp->gput_seq)) { 3044 /* 3045 * Stop the goodput now, the idea here is 3046 * that future measurements with in_probe_rtt 3047 * won't register if they are not greater so 3048 * we want to get what info (if any) is available 3049 * now. 3050 */ 3051 rack_do_goodput_measurement(rack->rc_tp, rack, 3052 rack->rc_tp->snd_una, __LINE__); 3053 } else if (rack->rc_tp->t_flags & TF_GPUTINPROG) { 3054 /* 3055 * We don't have enough data to make a measurement. 3056 * So lets just stop and start here after exiting 3057 * probe-rtt. We probably are not interested in 3058 * the results anyway. 3059 */ 3060 rack->rc_tp->t_flags &= ~TF_GPUTINPROG; 3061 } 3062 /* 3063 * Measurements through the current snd_max are going 3064 * to be limited by the slower pacing rate. 3065 * 3066 * We need to mark these as app-limited so we 3067 * don't collapse the b/w. 
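* (The block below does that by taking the RB_MAX, i.e. highest
* sequence, sendmap entry, flagging it RACK_APP_LIMITED and
* chaining it onto the app-limited list via r_nseq_appl and
* rc_end_appl.)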
3068 */ 3069 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 3070 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { 3071 if (rack->r_ctl.rc_app_limited_cnt == 0) 3072 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; 3073 else { 3074 /* 3075 * Go out to the end app limited and mark 3076 * this new one as next and move the end_appl up 3077 * to this guy. 3078 */ 3079 if (rack->r_ctl.rc_end_appl) 3080 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; 3081 rack->r_ctl.rc_end_appl = rsm; 3082 } 3083 rsm->r_flags |= RACK_APP_LIMITED; 3084 rack->r_ctl.rc_app_limited_cnt++; 3085 } 3086 /* 3087 * Now, we need to examine our pacing rate multipliers. 3088 * If its under 100%, we need to kick it back up to 3089 * 100%. We also don't let it be over our "max" above 3090 * the actual rate i.e. 100% + rack_clamp_atexit_prtt. 3091 * Note setting clamp_atexit_prtt to 0 has the effect 3092 * of setting CA/SS to 100% always at exit (which is 3093 * the default behavior). 3094 */ 3095 if (rack_probertt_clear_is) { 3096 rack->rc_gp_incr = 0; 3097 rack->rc_gp_bwred = 0; 3098 rack->rc_gp_timely_inc_cnt = 0; 3099 rack->rc_gp_timely_dec_cnt = 0; 3100 } 3101 /* Do we do any clamping at exit? */ 3102 if (rack->rc_highly_buffered && rack_atexit_prtt_hbp) { 3103 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt_hbp; 3104 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt_hbp; 3105 } 3106 if ((rack->rc_highly_buffered == 0) && rack_atexit_prtt) { 3107 rack->r_ctl.rack_per_of_gp_ca = rack_atexit_prtt; 3108 rack->r_ctl.rack_per_of_gp_ss = rack_atexit_prtt; 3109 } 3110 /* 3111 * Lets set rtt_diff to 0, so that we will get a "boost" 3112 * after exiting. 3113 */ 3114 rack->r_ctl.rc_rtt_diff = 0; 3115 3116 /* Clear all flags so we start fresh */ 3117 rack->rc_tp->t_bytes_acked = 0; 3118 rack->rc_tp->ccv->flags &= ~CCF_ABC_SENTAWND; 3119 /* 3120 * If configured to, set the cwnd and ssthresh to 3121 * our targets. 3122 */ 3123 if (rack_probe_rtt_sets_cwnd) { 3124 uint64_t ebdp; 3125 uint32_t setto; 3126 3127 /* Set ssthresh so we get into CA once we hit our target */ 3128 if (rack_probertt_use_min_rtt_exit == 1) { 3129 /* Set to min rtt */ 3130 rack_set_prtt_target(rack, segsiz, 3131 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt)); 3132 } else if (rack_probertt_use_min_rtt_exit == 2) { 3133 /* Set to current gp rtt */ 3134 rack_set_prtt_target(rack, segsiz, 3135 rack->r_ctl.rc_gp_srtt); 3136 } else if (rack_probertt_use_min_rtt_exit == 3) { 3137 /* Set to entry gp rtt */ 3138 rack_set_prtt_target(rack, segsiz, 3139 rack->r_ctl.rc_entry_gp_rtt); 3140 } else { 3141 uint64_t sum; 3142 uint32_t setval; 3143 3144 sum = rack->r_ctl.rc_entry_gp_rtt; 3145 sum *= 10; 3146 sum /= (uint64_t)(max(1, rack->r_ctl.rc_gp_srtt)); 3147 if (sum >= 20) { 3148 /* 3149 * A highly buffered path needs 3150 * cwnd space for timely to work. 3151 * Lets set things up as if 3152 * we are heading back here again. 3153 */ 3154 setval = rack->r_ctl.rc_entry_gp_rtt; 3155 } else if (sum >= 15) { 3156 /* 3157 * Lets take the smaller of the 3158 * two since we are just somewhat 3159 * buffered. 3160 */ 3161 setval = rack->r_ctl.rc_gp_srtt; 3162 if (setval > rack->r_ctl.rc_entry_gp_rtt) 3163 setval = rack->r_ctl.rc_entry_gp_rtt; 3164 } else { 3165 /* 3166 * Here we are not highly buffered 3167 * and should pick the min we can to 3168 * keep from causing loss. 
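 * As a worked example of the ratio above: an entry gp rtt of 200ms
 * against a current gp_srtt of 100ms gives sum = 200 * 10 / 100 = 20,
 * the highly buffered case; 150ms vs 100ms gives 15, the middle case;
 * and an entry rtt under 1.5x the current gp_srtt lands here on the
 * min rtt.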
3169 */ 3170 setval = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 3171 } 3172 rack_set_prtt_target(rack, segsiz, 3173 setval); 3174 } 3175 if (rack_probe_rtt_sets_cwnd > 1) { 3176 /* There is a percentage here to boost */ 3177 ebdp = rack->r_ctl.rc_target_probertt_flight; 3178 ebdp *= rack_probe_rtt_sets_cwnd; 3179 ebdp /= 100; 3180 setto = rack->r_ctl.rc_target_probertt_flight + ebdp; 3181 } else 3182 setto = rack->r_ctl.rc_target_probertt_flight; 3183 rack->rc_tp->snd_cwnd = roundup(setto, segsiz); 3184 if (rack->rc_tp->snd_cwnd < (segsiz * rack_timely_min_segs)) { 3185 /* Enforce a min */ 3186 rack->rc_tp->snd_cwnd = segsiz * rack_timely_min_segs; 3187 } 3188 /* If we set in the cwnd also set the ssthresh point so we are in CA */ 3189 rack->rc_tp->snd_ssthresh = (rack->rc_tp->snd_cwnd - 1); 3190 } 3191 rack_log_rtt_shrinks(rack, us_cts, 3192 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3193 __LINE__, RACK_RTTS_EXITPROBE); 3194 /* Clear times last so log has all the info */ 3195 rack->r_ctl.rc_probertt_sndmax_atexit = rack->rc_tp->snd_max; 3196 rack->r_ctl.rc_time_probertt_entered = us_cts; 3197 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 3198 rack->r_ctl.rc_time_of_last_probertt = us_cts; 3199 } 3200 3201 static void 3202 rack_check_probe_rtt(struct tcp_rack *rack, uint32_t us_cts) 3203 { 3204 /* Check in on probe-rtt */ 3205 if (rack->rc_gp_filled == 0) { 3206 /* We do not do p-rtt unless we have gp measurements */ 3207 return; 3208 } 3209 if (rack->in_probe_rtt) { 3210 uint64_t no_overflow; 3211 uint32_t endtime, must_stay; 3212 3213 if (rack->r_ctl.rc_went_idle_time && 3214 ((us_cts - rack->r_ctl.rc_went_idle_time) > rack_min_probertt_hold)) { 3215 /* 3216 * We went idle during prtt, just exit now. 3217 */ 3218 rack_exit_probertt(rack, us_cts); 3219 } else if (rack_probe_rtt_safety_val && 3220 TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered) && 3221 ((us_cts - rack->r_ctl.rc_time_probertt_entered) > rack_probe_rtt_safety_val)) { 3222 /* 3223 * Probe RTT safety value triggered! 3224 */ 3225 rack_log_rtt_shrinks(rack, us_cts, 3226 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3227 __LINE__, RACK_RTTS_SAFETY); 3228 rack_exit_probertt(rack, us_cts); 3229 } 3230 /* Calculate the max we will wait */ 3231 endtime = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_max_drain_wait); 3232 if (rack->rc_highly_buffered) 3233 endtime += (rack->r_ctl.rc_gp_srtt * rack_max_drain_hbp); 3234 /* Calculate the min we must wait */ 3235 must_stay = rack->r_ctl.rc_time_probertt_entered + (rack->r_ctl.rc_gp_srtt * rack_must_drain); 3236 if ((ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.rc_target_probertt_flight) && 3237 TSTMP_LT(us_cts, endtime)) { 3238 uint32_t calc; 3239 /* Do we lower more? 
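 * While flight is still above the probe-rtt target we keep trimming the
 * probe-rtt pacing percentage below: roughly
 * rack_per_of_gp_probertt_reduce percentage points for every full
 * gp_srtt spent in probe-rtt so far, floored at
 * rack_per_of_gp_lowthresh, until the target (or the time limit) is
 * reached.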
*/ 3240 no_exit: 3241 if (TSTMP_GT(us_cts, rack->r_ctl.rc_time_probertt_entered)) 3242 calc = us_cts - rack->r_ctl.rc_time_probertt_entered; 3243 else 3244 calc = 0; 3245 calc /= max(rack->r_ctl.rc_gp_srtt, 1); 3246 if (calc) { 3247 /* Maybe */ 3248 calc *= rack_per_of_gp_probertt_reduce; 3249 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc; 3250 /* Limit it too */ 3251 if (rack->r_ctl.rack_per_of_gp_probertt < rack_per_of_gp_lowthresh) 3252 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh; 3253 } 3254 /* We must reach target or the time set */ 3255 return; 3256 } 3257 if (rack->r_ctl.rc_time_probertt_starts == 0) { 3258 if ((TSTMP_LT(us_cts, must_stay) && 3259 rack->rc_highly_buffered) || 3260 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > 3261 rack->r_ctl.rc_target_probertt_flight)) { 3262 /* We are not past the must_stay time */ 3263 goto no_exit; 3264 } 3265 rack_log_rtt_shrinks(rack, us_cts, 3266 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), 3267 __LINE__, RACK_RTTS_REACHTARGET); 3268 rack->r_ctl.rc_time_probertt_starts = us_cts; 3269 if (rack->r_ctl.rc_time_probertt_starts == 0) 3270 rack->r_ctl.rc_time_probertt_starts = 1; 3271 /* Restore back to our rate we want to pace at in prtt */ 3272 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 3273 } 3274 /* 3275 * Setup our end time, some number of gp_srtts plus 200ms. 3276 */ 3277 no_overflow = ((uint64_t)rack->r_ctl.rc_gp_srtt * 3278 (uint64_t)rack_probertt_gpsrtt_cnt_mul); 3279 if (rack_probertt_gpsrtt_cnt_div) 3280 endtime = (uint32_t)(no_overflow / (uint64_t)rack_probertt_gpsrtt_cnt_div); 3281 else 3282 endtime = 0; 3283 endtime += rack_min_probertt_hold; 3284 endtime += rack->r_ctl.rc_time_probertt_starts; 3285 if (TSTMP_GEQ(us_cts, endtime)) { 3286 /* yes, exit probertt */ 3287 rack_exit_probertt(rack, us_cts); 3288 } 3289 3290 } else if((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt) { 3291 /* Go into probertt, its been too long since we went lower */ 3292 rack_enter_probertt(rack, us_cts); 3293 } 3294 } 3295 3296 static void 3297 rack_update_multiplier(struct tcp_rack *rack, int32_t timely_says, uint64_t last_bw_est, 3298 uint32_t rtt, int32_t rtt_diff) 3299 { 3300 uint64_t cur_bw, up_bnd, low_bnd, subfr; 3301 uint32_t losses; 3302 3303 if ((rack->rc_gp_dyn_mul == 0) || 3304 (rack->use_fixed_rate) || 3305 (rack->in_probe_rtt) || 3306 (rack->rc_always_pace == 0)) { 3307 /* No dynamic GP multipler in play */ 3308 return; 3309 } 3310 losses = rack->r_ctl.rc_loss_count - rack->r_ctl.rc_loss_at_start; 3311 cur_bw = rack_get_bw(rack); 3312 /* Calculate our up and down range */ 3313 up_bnd = rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_up; 3314 up_bnd /= 100; 3315 up_bnd += rack->r_ctl.last_gp_comp_bw; 3316 3317 subfr = (uint64_t)rack->r_ctl.last_gp_comp_bw * (uint64_t)rack_gp_per_bw_mul_down; 3318 subfr /= 100; 3319 low_bnd = rack->r_ctl.last_gp_comp_bw - subfr; 3320 if ((timely_says == 2) && (rack->r_ctl.rc_no_push_at_mrtt)) { 3321 /* 3322 * This is the case where our RTT is above 3323 * the max target and we have been configured 3324 * to just do timely no bonus up stuff in that case. 3325 * 3326 * There are two configurations, set to 1, and we 3327 * just do timely if we are over our max. If its 3328 * set above 1 then we slam the multipliers down 3329 * to 100 and then decrement per timely. 
3330 */ 3331 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 3332 __LINE__, 3); 3333 if (rack->r_ctl.rc_no_push_at_mrtt > 1) 3334 rack_validate_multipliers_at_or_below_100(rack); 3335 rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); 3336 } else if ((last_bw_est < low_bnd) && !losses) { 3337 /* 3338 * We are decreasing this is a bit complicated this 3339 * means we are loosing ground. This could be 3340 * because another flow entered and we are competing 3341 * for b/w with it. This will push the RTT up which 3342 * makes timely unusable unless we want to get shoved 3343 * into a corner and just be backed off (the age 3344 * old problem with delay based CC). 3345 * 3346 * On the other hand if it was a route change we 3347 * would like to stay somewhat contained and not 3348 * blow out the buffers. 3349 */ 3350 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 3351 __LINE__, 3); 3352 rack->r_ctl.last_gp_comp_bw = cur_bw; 3353 if (rack->rc_gp_bwred == 0) { 3354 /* Go into reduction counting */ 3355 rack->rc_gp_bwred = 1; 3356 rack->rc_gp_timely_dec_cnt = 0; 3357 } 3358 if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) || 3359 (timely_says == 0)) { 3360 /* 3361 * Push another time with a faster pacing 3362 * to try to gain back (we include override to 3363 * get a full raise factor). 3364 */ 3365 if ((rack->rc_gp_saw_ca && rack->r_ctl.rack_per_of_gp_ca <= rack_down_raise_thresh) || 3366 (rack->rc_gp_saw_ss && rack->r_ctl.rack_per_of_gp_ss <= rack_down_raise_thresh) || 3367 (timely_says == 0) || 3368 (rack_down_raise_thresh == 0)) { 3369 /* 3370 * Do an override up in b/w if we were 3371 * below the threshold or if the threshold 3372 * is zero we always do the raise. 3373 */ 3374 rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 1); 3375 } else { 3376 /* Log it stays the same */ 3377 rack_log_timely(rack, 0, last_bw_est, low_bnd, 0, 3378 __LINE__, 11); 3379 3380 } 3381 rack->rc_gp_timely_dec_cnt++; 3382 /* We are not incrementing really no-count */ 3383 rack->rc_gp_incr = 0; 3384 rack->rc_gp_timely_inc_cnt = 0; 3385 } else { 3386 /* 3387 * Lets just use the RTT 3388 * information and give up 3389 * pushing. 3390 */ 3391 goto use_timely; 3392 } 3393 } else if ((timely_says != 2) && 3394 !losses && 3395 (last_bw_est > up_bnd)) { 3396 /* 3397 * We are increasing b/w lets keep going, updating 3398 * our b/w and ignoring any timely input, unless 3399 * of course we are at our max raise (if there is one). 3400 */ 3401 3402 rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd, 3403 __LINE__, 3); 3404 rack->r_ctl.last_gp_comp_bw = cur_bw; 3405 if (rack->rc_gp_saw_ss && 3406 rack_per_upper_bound_ss && 3407 (rack->r_ctl.rack_per_of_gp_ss == rack_per_upper_bound_ss)) { 3408 /* 3409 * In cases where we can't go higher 3410 * we should just use timely. 3411 */ 3412 goto use_timely; 3413 } 3414 if (rack->rc_gp_saw_ca && 3415 rack_per_upper_bound_ca && 3416 (rack->r_ctl.rack_per_of_gp_ca == rack_per_upper_bound_ca)) { 3417 /* 3418 * In cases where we can't go higher 3419 * we should just use timely. 
         */
        goto use_timely;
    }
    rack->rc_gp_bwred = 0;
    rack->rc_gp_timely_dec_cnt = 0;
    /* You get a set number of pushes if timely is trying to reduce */
    if ((rack->rc_gp_incr < rack_timely_max_push_rise) || (timely_says == 0)) {
        rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
    } else {
        /* Log it stays the same */
        rack_log_timely(rack, 0, last_bw_est, up_bnd, 0,
                __LINE__, 12);

    }
    return;
} else {
    /*
     * We are staying between the lower and upper range bounds
     * so use timely to decide.
     */
    rack_log_timely(rack, timely_says, cur_bw, low_bnd, up_bnd,
            __LINE__, 3);
use_timely:
    if (timely_says) {
        rack->rc_gp_incr = 0;
        rack->rc_gp_timely_inc_cnt = 0;
        if ((rack->rc_gp_timely_dec_cnt < rack_timely_max_push_drop) &&
            !losses &&
            (last_bw_est < low_bnd)) {
            /* We are losing ground */
            rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
            rack->rc_gp_timely_dec_cnt++;
            /* We are not incrementing really no-count */
            rack->rc_gp_incr = 0;
            rack->rc_gp_timely_inc_cnt = 0;
        } else
            rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff);
    } else {
        rack->rc_gp_bwred = 0;
        rack->rc_gp_timely_dec_cnt = 0;
        rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0);
    }
}
}

static int32_t
rack_make_timely_judgement(struct tcp_rack *rack, uint32_t rtt, int32_t rtt_diff, uint32_t prev_rtt)
{
    int32_t timely_says;
    uint64_t log_mult, log_rtt_a_diff;

    log_rtt_a_diff = rtt;
    log_rtt_a_diff <<= 32;
    log_rtt_a_diff |= (uint32_t)rtt_diff;
    if (rtt >= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) *
            rack_gp_rtt_maxmul)) {
        /* Reduce the b/w multiplier */
        timely_says = 2;
        log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul;
        log_mult <<= 32;
        log_mult |= prev_rtt;
        rack_log_timely(rack, timely_says, log_mult,
                get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
                log_rtt_a_diff, __LINE__, 4);
    } else if (rtt <= (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) +
               ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) /
                max(rack_gp_rtt_mindiv , 1)))) {
        /* Increase the b/w multiplier */
        log_mult = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) +
            ((get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_minmul) /
             max(rack_gp_rtt_mindiv , 1));
        log_mult <<= 32;
        log_mult |= prev_rtt;
        timely_says = 0;
        rack_log_timely(rack, timely_says, log_mult,
                get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt),
                log_rtt_a_diff, __LINE__, 5);
    } else {
        /*
         * Use a gradient to decide; the timely gradient is:
         *    grad = rc_rtt_diff / min_rtt;
         *
         * Anything at or below zero is an increase
         * indication. Anything above zero is a decrease.
         * Note we take care of the actual gradient
         * calculation in the reduction (it is not needed
         * for an increase).
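         * For example, a hypothetical rc_rtt_diff of +2000us over a
         * 20ms min_rtt is a gradient of 0.1 and signals a decrease,
         * while any zero or negative diff signals an increase without
         * the ratio ever being computed.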
3509 */ 3510 log_mult = prev_rtt; 3511 if (rtt_diff <= 0) { 3512 /* 3513 * Rttdiff is less than zero, increase the 3514 * b/w multipler (its 0 or negative) 3515 */ 3516 timely_says = 0; 3517 rack_log_timely(rack, timely_says, log_mult, 3518 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 6); 3519 } else { 3520 /* Reduce the b/w multipler */ 3521 timely_says = 1; 3522 rack_log_timely(rack, timely_says, log_mult, 3523 get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt), log_rtt_a_diff, __LINE__, 7); 3524 } 3525 } 3526 return (timely_says); 3527 } 3528 3529 static void 3530 rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, 3531 tcp_seq th_ack, int line) 3532 { 3533 uint64_t tim, bytes_ps, ltim, stim, utim; 3534 uint32_t segsiz, bytes, reqbytes, us_cts; 3535 int32_t gput, new_rtt_diff, timely_says; 3536 3537 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 3538 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 3539 if (TSTMP_GEQ(us_cts, tp->gput_ts)) 3540 tim = us_cts - tp->gput_ts; 3541 else 3542 tim = 0; 3543 3544 if (TSTMP_GT(rack->r_ctl.rc_gp_cumack_ts, rack->r_ctl.rc_gp_output_ts)) 3545 stim = rack->r_ctl.rc_gp_cumack_ts - rack->r_ctl.rc_gp_output_ts; 3546 else 3547 stim = 0; 3548 /* 3549 * Use the larger of the send time or ack time. This prevents us 3550 * from being influenced by ack artifacts to come up with too 3551 * high of measurement. Note that since we are spanning over many more 3552 * bytes in most of our measurements hopefully that is less likely to 3553 * occur. 3554 */ 3555 if (tim > stim) 3556 utim = max(tim, 1); 3557 else 3558 utim = max(stim, 1); 3559 /* Lets validate utim */ 3560 ltim = max(1, (utim/HPTS_USEC_IN_MSEC)); 3561 gput = (((uint64_t) (th_ack - tp->gput_seq)) << 3) / ltim; 3562 reqbytes = min(rc_init_window(rack), (MIN_GP_WIN * segsiz)); 3563 if ((tim == 0) && (stim == 0)) { 3564 /* 3565 * Invalid measurement time, maybe 3566 * all on one ack/one send? 3567 */ 3568 bytes = 0; 3569 bytes_ps = 0; 3570 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 3571 0, 0, 0, 10, __LINE__, NULL); 3572 goto skip_measurement; 3573 } 3574 if (rack->r_ctl.rc_gp_lowrtt == 0xffffffff) { 3575 /* We never made a us_rtt measurement? */ 3576 bytes = 0; 3577 bytes_ps = 0; 3578 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 3579 0, 0, 0, 10, __LINE__, NULL); 3580 goto skip_measurement; 3581 } 3582 /* 3583 * Calculate the maximum possible b/w this connection 3584 * could have. We base our calculation on the lowest 3585 * rtt we have seen during the measurement and the 3586 * largest rwnd the client has given us in that time. This 3587 * forms a BDP that is the maximum that we could ever 3588 * get to the client. Anything larger is not valid. 3589 * 3590 * I originally had code here that rejected measurements 3591 * where the time was less than 1/2 the latest us_rtt. 3592 * But after thinking on that I realized its wrong since 3593 * say you had a 150Mbps or even 1Gbps link, and you 3594 * were a long way away.. example I am in Europe (100ms rtt) 3595 * talking to my 1Gbps link in S.C. Now measuring say 150,000 3596 * bytes my time would be 1.2ms, and yet my rtt would say 3597 * the measurement was invalid the time was < 50ms. The 3598 * same thing is true for 150Mb (8ms of time). 3599 * 3600 * A better way I realized is to look at what the maximum 3601 * the connection could possibly do. This is gated on 3602 * the lowest RTT we have seen and the highest rwnd. 
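 * (As a hypothetical example, a 1MB max rwnd over a 10ms low rtt
 * bounds the estimate at roughly 100MB/s.)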
3603 * We should in theory never exceed that, if we are 3604 * then something on the path is storing up packets 3605 * and then feeding them all at once to our endpoint 3606 * messing up our measurement. 3607 */ 3608 rack->r_ctl.last_max_bw = rack->r_ctl.rc_gp_high_rwnd; 3609 rack->r_ctl.last_max_bw *= HPTS_USEC_IN_SEC; 3610 rack->r_ctl.last_max_bw /= rack->r_ctl.rc_gp_lowrtt; 3611 if (SEQ_LT(th_ack, tp->gput_seq)) { 3612 /* No measurement can be made */ 3613 bytes = 0; 3614 bytes_ps = 0; 3615 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 3616 0, 0, 0, 10, __LINE__, NULL); 3617 goto skip_measurement; 3618 } else 3619 bytes = (th_ack - tp->gput_seq); 3620 bytes_ps = (uint64_t)bytes; 3621 /* 3622 * Don't measure a b/w for pacing unless we have gotten at least 3623 * an initial windows worth of data in this measurement interval. 3624 * 3625 * Small numbers of bytes get badly influenced by delayed ack and 3626 * other artifacts. Note we take the initial window or our 3627 * defined minimum GP (defaulting to 10 which hopefully is the 3628 * IW). 3629 */ 3630 if (rack->rc_gp_filled == 0) { 3631 /* 3632 * The initial estimate is special. We 3633 * have blasted out an IW worth of packets 3634 * without a real valid ack ts results. We 3635 * then setup the app_limited_needs_set flag, 3636 * this should get the first ack in (probably 2 3637 * MSS worth) to be recorded as the timestamp. 3638 * We thus allow a smaller number of bytes i.e. 3639 * IW - 2MSS. 3640 */ 3641 reqbytes -= (2 * segsiz); 3642 /* Also lets fill previous for our first measurement to be neutral */ 3643 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; 3644 } 3645 if ((bytes_ps < reqbytes) || rack->app_limited_needs_set) { 3646 rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, 3647 rack->r_ctl.rc_app_limited_cnt, 3648 0, 0, 10, __LINE__, NULL); 3649 goto skip_measurement; 3650 } 3651 /* 3652 * We now need to calculate the Timely like status so 3653 * we can update (possibly) the b/w multipliers. 3654 */ 3655 new_rtt_diff = (int32_t)rack->r_ctl.rc_gp_srtt - (int32_t)rack->r_ctl.rc_prev_gp_srtt; 3656 if (rack->rc_gp_filled == 0) { 3657 /* No previous reading */ 3658 rack->r_ctl.rc_rtt_diff = new_rtt_diff; 3659 } else { 3660 if (rack->measure_saw_probe_rtt == 0) { 3661 /* 3662 * We don't want a probertt to be counted 3663 * since it will be negative incorrectly. We 3664 * expect to be reducing the RTT when we 3665 * pace at a slower rate. 3666 */ 3667 rack->r_ctl.rc_rtt_diff -= (rack->r_ctl.rc_rtt_diff / 8); 3668 rack->r_ctl.rc_rtt_diff += (new_rtt_diff / 8); 3669 } 3670 } 3671 timely_says = rack_make_timely_judgement(rack, 3672 rack->r_ctl.rc_gp_srtt, 3673 rack->r_ctl.rc_rtt_diff, 3674 rack->r_ctl.rc_prev_gp_srtt 3675 ); 3676 bytes_ps *= HPTS_USEC_IN_SEC; 3677 bytes_ps /= utim; 3678 if (bytes_ps > rack->r_ctl.last_max_bw) { 3679 /* 3680 * Something is on path playing 3681 * since this b/w is not possible based 3682 * on our BDP (highest rwnd and lowest rtt 3683 * we saw in the measurement window). 3684 * 3685 * Another option here would be to 3686 * instead skip the measurement. 
         */
        rack_log_pacing_delay_calc(rack, bytes, reqbytes,
                       bytes_ps, rack->r_ctl.last_max_bw, 0,
                       11, __LINE__, NULL);
        bytes_ps = rack->r_ctl.last_max_bw;
    }
    /* We store gp for b/w in bytes per second */
    if (rack->rc_gp_filled == 0) {
        /* Initial measurement */
        if (bytes_ps) {
            rack->r_ctl.gp_bw = bytes_ps;
            rack->rc_gp_filled = 1;
            rack->r_ctl.num_avg = 1;
            rack_set_pace_segments(rack->rc_tp, rack, __LINE__);
        } else {
            rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes,
                           rack->r_ctl.rc_app_limited_cnt,
                           0, 0, 10, __LINE__, NULL);
        }
        if (rack->rc_inp->inp_in_hpts &&
            (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) {
            /*
             * Ok we can't trust the pacer in this case
             * where we transition from un-paced to paced.
             * Or for that matter when the burst mitigation
             * was making a wild guess and got it wrong.
             * Stop the pacer and clear up all the aggregate
             * delays etc.
             */
            tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
            rack->r_ctl.rc_hpts_flags = 0;
            rack->r_ctl.rc_last_output_to = 0;
        }
    } else if (rack->r_ctl.num_avg < RACK_REQ_AVG) {
        /* Still a small number, run an average */
        rack->r_ctl.gp_bw += bytes_ps;
        rack->r_ctl.num_avg++;
        if (rack->r_ctl.num_avg >= RACK_REQ_AVG) {
            /* We have collected enough to move forward */
            rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_avg;
        }
    } else {
        /*
         * We want to blend a fraction of the new goodput into the
         * old value, weighted by how long the measurement ran
         * relative to the srtt. So if your measurement period is
         * say 2 SRTT's long you would get 1/4 as the weight; if it
         * was like 1/2 SRTT then you would get 1/16th.
         *
         * But we must be careful not to take too much i.e. if the
         * srtt is say 20ms and the measurement is taken over
         * 400ms our weight would be 400/20 i.e. 20. On the
         * other hand if we get a measurement over 1ms with a
         * 10ms rtt we only want to take a much smaller portion.
         */
        uint64_t resid_bw, subpart, addpart, srtt;

        srtt = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT);
        if (srtt == 0) {
            /*
             * Strange, why did t_srtt go back to zero?
             */
            if (rack->r_ctl.rc_rack_min_rtt)
                srtt = (rack->r_ctl.rc_rack_min_rtt * HPTS_USEC_IN_MSEC);
            else
                srtt = HPTS_USEC_IN_MSEC;
        }
        /*
         * XXXrrs: Note for reviewers, in playing with
         * dynamic pacing I discovered that this GP calculation
         * as done originally leads to some undesired results.
         * Basically you can get longer measurements contributing
         * too much to the WMA. Thus I changed it so that, if you
         * are doing dynamic adjustments, the apportioned adjustment
         * is only done when we have a very small (time wise)
         * measurement. Longer measurements just get their weight
         * (defaulting to 1/8) added to the WMA. We may want to
         * think about changing this to always do that for both
         * sides i.e. dynamic and non-dynamic... but considering
         * lots of folks were playing with this I did not want to
         * change the calculation per se without your thoughts..
         * Lawerence? Peter??
         */
        if (rack->rc_gp_dyn_mul == 0) {
            subpart = rack->r_ctl.gp_bw * utim;
            subpart /= (srtt * 8);
            if (subpart < (rack->r_ctl.gp_bw / 2)) {
                /*
                 * The b/w update takes no more
                 * away than 1/2 our running total
                 * so factor it in.
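                 * For example with a 20ms srtt, a sample spanning
                 * 10ms replaces 1/16th of the WMA (10/(20*8)), while
                 * one spanning two srtts replaces 1/4 of it, matching
                 * the weighting described above.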
3777 */ 3778 addpart = bytes_ps * utim; 3779 addpart /= (srtt * 8); 3780 } else { 3781 /* 3782 * Don't allow a single measurement 3783 * to account for more than 1/2 of the 3784 * WMA. This could happen on a retransmission 3785 * where utim becomes huge compared to 3786 * srtt (multiple retransmissions when using 3787 * the sending rate which factors in all the 3788 * transmissions from the first one). 3789 */ 3790 subpart = rack->r_ctl.gp_bw / 2; 3791 addpart = bytes_ps / 2; 3792 } 3793 resid_bw = rack->r_ctl.gp_bw - subpart; 3794 rack->r_ctl.gp_bw = resid_bw + addpart; 3795 } else { 3796 if ((utim / srtt) <= 1) { 3797 /* 3798 * The b/w update was over a small period 3799 * of time. The idea here is to prevent a small 3800 * measurement time period from counting 3801 * too much. So we scale it based on the 3802 * time so it attributes less than 1/rack_wma_divisor 3803 * of its measurement. 3804 */ 3805 subpart = rack->r_ctl.gp_bw * utim; 3806 subpart /= (srtt * rack_wma_divisor); 3807 addpart = bytes_ps * utim; 3808 addpart /= (srtt * rack_wma_divisor); 3809 } else { 3810 /* 3811 * The scaled measurement was long 3812 * enough so lets just add in the 3813 * portion of the measurment i.e. 1/rack_wma_divisor 3814 */ 3815 subpart = rack->r_ctl.gp_bw / rack_wma_divisor; 3816 addpart = bytes_ps / rack_wma_divisor; 3817 } 3818 if ((rack->measure_saw_probe_rtt == 0) || 3819 (bytes_ps > rack->r_ctl.gp_bw)) { 3820 /* 3821 * For probe-rtt we only add it in 3822 * if its larger, all others we just 3823 * add in. 3824 */ 3825 resid_bw = rack->r_ctl.gp_bw - subpart; 3826 rack->r_ctl.gp_bw = resid_bw + addpart; 3827 } 3828 } 3829 } 3830 /* We do not update any multipliers if we are in or have seen a probe-rtt */ 3831 if ((rack->measure_saw_probe_rtt == 0) && rack->rc_gp_rtt_set) 3832 rack_update_multiplier(rack, timely_says, bytes_ps, 3833 rack->r_ctl.rc_gp_srtt, 3834 rack->r_ctl.rc_rtt_diff); 3835 rack_log_pacing_delay_calc(rack, bytes, tim, bytes_ps, stim, 3836 rack_get_bw(rack), 3, line, NULL); 3837 /* reset the gp srtt and setup the new prev */ 3838 rack->r_ctl.rc_prev_gp_srtt = rack->r_ctl.rc_gp_srtt; 3839 /* Record the lost count for the next measurement */ 3840 rack->r_ctl.rc_loss_at_start = rack->r_ctl.rc_loss_count; 3841 /* 3842 * We restart our diffs based on the gpsrtt in the 3843 * measurement window. 3844 */ 3845 rack->rc_gp_rtt_set = 0; 3846 rack->rc_gp_saw_rec = 0; 3847 rack->rc_gp_saw_ca = 0; 3848 rack->rc_gp_saw_ss = 0; 3849 rack->rc_dragged_bottom = 0; 3850 skip_measurement: 3851 3852 #ifdef STATS 3853 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, 3854 gput); 3855 /* 3856 * XXXLAS: This is a temporary hack, and should be 3857 * chained off VOI_TCP_GPUT when stats(9) grows an 3858 * API to deal with chained VOIs. 3859 */ 3860 if (tp->t_stats_gput_prev > 0) 3861 stats_voi_update_abs_s32(tp->t_stats, 3862 VOI_TCP_GPUT_ND, 3863 ((gput - tp->t_stats_gput_prev) * 100) / 3864 tp->t_stats_gput_prev); 3865 #endif 3866 tp->t_flags &= ~TF_GPUTINPROG; 3867 tp->t_stats_gput_prev = gput; 3868 /* 3869 * Now are we app limited now and there is space from where we 3870 * were to where we want to go? 3871 * 3872 * We don't do the other case i.e. non-applimited here since 3873 * the next send will trigger us picking up the missing data. 
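 * Concretely, we only re-arm a measurement below if at least an
 * initial-window's worth of data (or MIN_GP_WIN segments, whichever is
 * larger) sits between th_ack and the first app-limited send.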
3874 */ 3875 if (rack->r_ctl.rc_first_appl && 3876 TCPS_HAVEESTABLISHED(tp->t_state) && 3877 rack->r_ctl.rc_app_limited_cnt && 3878 (SEQ_GT(rack->r_ctl.rc_first_appl->r_start, th_ack)) && 3879 ((rack->r_ctl.rc_first_appl->r_start - th_ack) > 3880 max(rc_init_window(rack), (MIN_GP_WIN * segsiz)))) { 3881 /* 3882 * Yep there is enough outstanding to make a measurement here. 3883 */ 3884 struct rack_sendmap *rsm, fe; 3885 3886 tp->t_flags |= TF_GPUTINPROG; 3887 rack->r_ctl.rc_gp_lowrtt = 0xffffffff; 3888 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 3889 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 3890 rack->app_limited_needs_set = 0; 3891 tp->gput_seq = th_ack; 3892 if (rack->in_probe_rtt) 3893 rack->measure_saw_probe_rtt = 1; 3894 else if ((rack->measure_saw_probe_rtt) && 3895 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 3896 rack->measure_saw_probe_rtt = 0; 3897 if ((rack->r_ctl.rc_first_appl->r_start - th_ack) >= rack_get_measure_window(tp, rack)) { 3898 /* There is a full window to gain info from */ 3899 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 3900 } else { 3901 /* We can only measure up to the applimited point */ 3902 tp->gput_ack = tp->gput_seq + (rack->r_ctl.rc_first_appl->r_start - th_ack); 3903 } 3904 /* 3905 * Now we need to find the timestamp of the send at tp->gput_seq 3906 * for the send based measurement. 3907 */ 3908 fe.r_start = tp->gput_seq; 3909 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 3910 if (rsm) { 3911 /* Ok send-based limit is set */ 3912 if (SEQ_LT(rsm->r_start, tp->gput_seq)) { 3913 /* 3914 * Move back to include the earlier part 3915 * so our ack time lines up right (this may 3916 * make an overlapping measurement but thats 3917 * ok). 3918 */ 3919 tp->gput_seq = rsm->r_start; 3920 } 3921 if (rsm->r_flags & RACK_ACKED) 3922 tp->gput_ts = rsm->r_ack_arrival; 3923 else 3924 rack->app_limited_needs_set = 1; 3925 rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; 3926 } else { 3927 /* 3928 * If we don't find the rsm due to some 3929 * send-limit set the current time, which 3930 * basically disables the send-limit. 
3931 */ 3932 rack->r_ctl.rc_gp_output_ts = tcp_get_usecs(NULL); 3933 } 3934 rack_log_pacing_delay_calc(rack, 3935 tp->gput_seq, 3936 tp->gput_ack, 3937 (uint64_t)rsm, 3938 tp->gput_ts, 3939 rack->r_ctl.rc_app_limited_cnt, 3940 9, 3941 __LINE__, NULL); 3942 } 3943 } 3944 3945 /* 3946 * CC wrapper hook functions 3947 */ 3948 static void 3949 rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs, 3950 uint16_t type, int32_t recovery) 3951 { 3952 INP_WLOCK_ASSERT(tp->t_inpcb); 3953 tp->ccv->nsegs = nsegs; 3954 tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); 3955 if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) { 3956 uint32_t max; 3957 3958 max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp); 3959 if (tp->ccv->bytes_this_ack > max) { 3960 tp->ccv->bytes_this_ack = max; 3961 } 3962 } 3963 if (rack->r_ctl.cwnd_to_use <= tp->snd_wnd) 3964 tp->ccv->flags |= CCF_CWND_LIMITED; 3965 else 3966 tp->ccv->flags &= ~CCF_CWND_LIMITED; 3967 #ifdef STATS 3968 stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, 3969 ((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd); 3970 #endif 3971 if ((tp->t_flags & TF_GPUTINPROG) && 3972 rack_enough_for_measurement(tp, rack, th->th_ack)) { 3973 /* Measure the Goodput */ 3974 rack_do_goodput_measurement(tp, rack, th->th_ack, __LINE__); 3975 #ifdef NETFLIX_PEAKRATE 3976 if ((type == CC_ACK) && 3977 (tp->t_maxpeakrate)) { 3978 /* 3979 * We update t_peakrate_thr. This gives us roughly 3980 * one update per round trip time. Note 3981 * it will only be used if pace_always is off i.e 3982 * we don't do this for paced flows. 3983 */ 3984 tcp_update_peakrate_thr(tp); 3985 } 3986 #endif 3987 } 3988 if (rack->r_ctl.cwnd_to_use > tp->snd_ssthresh) { 3989 tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, 3990 nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp)); 3991 if (tp->t_bytes_acked >= rack->r_ctl.cwnd_to_use) { 3992 tp->t_bytes_acked -= rack->r_ctl.cwnd_to_use; 3993 tp->ccv->flags |= CCF_ABC_SENTAWND; 3994 } 3995 } else { 3996 tp->ccv->flags &= ~CCF_ABC_SENTAWND; 3997 tp->t_bytes_acked = 0; 3998 } 3999 if (CC_ALGO(tp)->ack_received != NULL) { 4000 /* XXXLAS: Find a way to live without this */ 4001 tp->ccv->curack = th->th_ack; 4002 CC_ALGO(tp)->ack_received(tp->ccv, type); 4003 } 4004 #ifdef STATS 4005 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, rack->r_ctl.cwnd_to_use); 4006 #endif 4007 if (rack->r_ctl.rc_rack_largest_cwnd < rack->r_ctl.cwnd_to_use) { 4008 rack->r_ctl.rc_rack_largest_cwnd = rack->r_ctl.cwnd_to_use; 4009 } 4010 #ifdef NETFLIX_PEAKRATE 4011 /* we enforce max peak rate if it is set and we are not pacing */ 4012 if ((rack->rc_always_pace == 0) && 4013 tp->t_peakrate_thr && 4014 (tp->snd_cwnd > tp->t_peakrate_thr)) { 4015 tp->snd_cwnd = tp->t_peakrate_thr; 4016 } 4017 #endif 4018 } 4019 4020 static void 4021 tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th) 4022 { 4023 struct tcp_rack *rack; 4024 4025 rack = (struct tcp_rack *)tp->t_fb_ptr; 4026 INP_WLOCK_ASSERT(tp->t_inpcb); 4027 /* 4028 * If we are doing PRR and have enough 4029 * room to send <or> we are pacing and prr 4030 * is disabled we will want to see if we 4031 * can send data (by setting r_wanted_output to 4032 * true). 
4033 */ 4034 if ((rack->r_ctl.rc_prr_sndcnt > 0) || 4035 rack->rack_no_prr) 4036 rack->r_wanted_output = 1; 4037 } 4038 4039 static void 4040 rack_post_recovery(struct tcpcb *tp, struct tcphdr *th) 4041 { 4042 struct tcp_rack *rack; 4043 uint32_t orig_cwnd; 4044 4045 4046 orig_cwnd = tp->snd_cwnd; 4047 INP_WLOCK_ASSERT(tp->t_inpcb); 4048 rack = (struct tcp_rack *)tp->t_fb_ptr; 4049 if (rack->rc_not_backing_off == 0) { 4050 /* only alert CC if we alerted when we entered */ 4051 if (CC_ALGO(tp)->post_recovery != NULL) { 4052 tp->ccv->curack = th->th_ack; 4053 CC_ALGO(tp)->post_recovery(tp->ccv); 4054 } 4055 if (tp->snd_cwnd > tp->snd_ssthresh) { 4056 /* Drop us down to the ssthresh (1/2 cwnd at loss) */ 4057 tp->snd_cwnd = tp->snd_ssthresh; 4058 } 4059 } 4060 if ((rack->rack_no_prr == 0) && 4061 (rack->r_ctl.rc_prr_sndcnt > 0)) { 4062 /* Suck the next prr cnt back into cwnd */ 4063 tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt; 4064 rack->r_ctl.rc_prr_sndcnt = 0; 4065 rack_log_to_prr(rack, 1, 0); 4066 } 4067 rack_log_to_prr(rack, 14, orig_cwnd); 4068 tp->snd_recover = tp->snd_una; 4069 EXIT_RECOVERY(tp->t_flags); 4070 } 4071 4072 static void 4073 rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) 4074 { 4075 struct tcp_rack *rack; 4076 4077 INP_WLOCK_ASSERT(tp->t_inpcb); 4078 4079 rack = (struct tcp_rack *)tp->t_fb_ptr; 4080 switch (type) { 4081 case CC_NDUPACK: 4082 tp->t_flags &= ~TF_WASFRECOVERY; 4083 tp->t_flags &= ~TF_WASCRECOVERY; 4084 if (!IN_FASTRECOVERY(tp->t_flags)) { 4085 rack->r_ctl.rc_prr_delivered = 0; 4086 rack->r_ctl.rc_prr_out = 0; 4087 if (rack->rack_no_prr == 0) { 4088 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 4089 rack_log_to_prr(rack, 2, 0); 4090 } 4091 rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una; 4092 tp->snd_recover = tp->snd_max; 4093 if (tp->t_flags2 & TF2_ECN_PERMIT) 4094 tp->t_flags2 |= TF2_ECN_SND_CWR; 4095 } 4096 break; 4097 case CC_ECN: 4098 if (!IN_CONGRECOVERY(tp->t_flags) || 4099 /* 4100 * Allow ECN reaction on ACK to CWR, if 4101 * that data segment was also CE marked. 4102 */ 4103 SEQ_GEQ(th->th_ack, tp->snd_recover)) { 4104 EXIT_CONGRECOVERY(tp->t_flags); 4105 KMOD_TCPSTAT_INC(tcps_ecn_rcwnd); 4106 tp->snd_recover = tp->snd_max + 1; 4107 if (tp->t_flags2 & TF2_ECN_PERMIT) 4108 tp->t_flags2 |= TF2_ECN_SND_CWR; 4109 } 4110 break; 4111 case CC_RTO: 4112 tp->t_dupacks = 0; 4113 tp->t_bytes_acked = 0; 4114 EXIT_RECOVERY(tp->t_flags); 4115 tp->snd_ssthresh = max(2, min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 / 4116 ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp); 4117 tp->snd_cwnd = ctf_fixed_maxseg(tp); 4118 if (tp->t_flags2 & TF2_ECN_PERMIT) 4119 tp->t_flags2 |= TF2_ECN_SND_CWR; 4120 break; 4121 case CC_RTO_ERR: 4122 KMOD_TCPSTAT_INC(tcps_sndrexmitbad); 4123 /* RTO was unnecessary, so reset everything. */ 4124 tp->snd_cwnd = tp->snd_cwnd_prev; 4125 tp->snd_ssthresh = tp->snd_ssthresh_prev; 4126 tp->snd_recover = tp->snd_recover_prev; 4127 if (tp->t_flags & TF_WASFRECOVERY) { 4128 ENTER_FASTRECOVERY(tp->t_flags); 4129 tp->t_flags &= ~TF_WASFRECOVERY; 4130 } 4131 if (tp->t_flags & TF_WASCRECOVERY) { 4132 ENTER_CONGRECOVERY(tp->t_flags); 4133 tp->t_flags &= ~TF_WASCRECOVERY; 4134 } 4135 tp->snd_nxt = tp->snd_max; 4136 tp->t_badrxtwin = 0; 4137 break; 4138 } 4139 /* 4140 * If we are below our max rtt, don't 4141 * signal the CC control to change things. 4142 * instead set it up so that we are in 4143 * recovery but not going to back off. 
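 * The threshold used below is the lowest us rtt plus either
 * (rack_gp_rtt_maxmul - 1) times that rtt, or the
 * rack_gp_rtt_minmul/rack_gp_rtt_mindiv fraction of it, depending on
 * rack_use_max_for_nobackoff, and it only applies on highly buffered
 * paths.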
4144 */ 4145 4146 if (rack->rc_highly_buffered) { 4147 /* 4148 * Do we use the higher rtt for 4149 * our threshold to not backoff (like CDG)? 4150 */ 4151 uint32_t rtt_mul, rtt_div; 4152 4153 if (rack_use_max_for_nobackoff) { 4154 rtt_mul = (rack_gp_rtt_maxmul - 1); 4155 rtt_div = 1; 4156 } else { 4157 rtt_mul = rack_gp_rtt_minmul; 4158 rtt_div = max(rack_gp_rtt_mindiv , 1); 4159 } 4160 if (rack->r_ctl.rc_gp_srtt <= (rack->r_ctl.rc_lowest_us_rtt + 4161 ((rack->r_ctl.rc_lowest_us_rtt * rtt_mul) / 4162 rtt_div))) { 4163 /* below our min threshold */ 4164 rack->rc_not_backing_off = 1; 4165 ENTER_RECOVERY(rack->rc_tp->t_flags); 4166 rack_log_rtt_shrinks(rack, 0, 4167 rtt_mul, 4168 rtt_div, 4169 RACK_RTTS_NOBACKOFF); 4170 return; 4171 } 4172 } 4173 rack->rc_not_backing_off = 0; 4174 if (CC_ALGO(tp)->cong_signal != NULL) { 4175 if (th != NULL) 4176 tp->ccv->curack = th->th_ack; 4177 CC_ALGO(tp)->cong_signal(tp->ccv, type); 4178 } 4179 } 4180 4181 4182 4183 static inline void 4184 rack_cc_after_idle(struct tcp_rack *rack, struct tcpcb *tp) 4185 { 4186 uint32_t i_cwnd; 4187 4188 INP_WLOCK_ASSERT(tp->t_inpcb); 4189 4190 #ifdef NETFLIX_STATS 4191 KMOD_TCPSTAT_INC(tcps_idle_restarts); 4192 if (tp->t_state == TCPS_ESTABLISHED) 4193 KMOD_TCPSTAT_INC(tcps_idle_estrestarts); 4194 #endif 4195 if (CC_ALGO(tp)->after_idle != NULL) 4196 CC_ALGO(tp)->after_idle(tp->ccv); 4197 4198 if (tp->snd_cwnd == 1) 4199 i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ 4200 else 4201 i_cwnd = rc_init_window(rack); 4202 4203 /* 4204 * Being idle is no differnt than the initial window. If the cc 4205 * clamps it down below the initial window raise it to the initial 4206 * window. 4207 */ 4208 if (tp->snd_cwnd < i_cwnd) { 4209 tp->snd_cwnd = i_cwnd; 4210 } 4211 } 4212 4213 4214 /* 4215 * Indicate whether this ack should be delayed. We can delay the ack if 4216 * following conditions are met: 4217 * - There is no delayed ack timer in progress. 4218 * - Our last ack wasn't a 0-sized window. We never want to delay 4219 * the ack that opens up a 0-sized window. 4220 * - LRO wasn't used for this segment. We make sure by checking that the 4221 * segment size is not larger than the MSS. 4222 * - Delayed acks are enabled or this is a half-synchronized T/TCP 4223 * connection. 4224 */ 4225 #define DELAY_ACK(tp, tlen) \ 4226 (((tp->t_flags & TF_RXWIN0SENT) == 0) && \ 4227 ((tp->t_flags & TF_DELACK) == 0) && \ 4228 (tlen <= tp->t_maxseg) && \ 4229 (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN))) 4230 4231 static struct rack_sendmap * 4232 rack_find_lowest_rsm(struct tcp_rack *rack) 4233 { 4234 struct rack_sendmap *rsm; 4235 4236 /* 4237 * Walk the time-order transmitted list looking for an rsm that is 4238 * not acked. This will be the one that was sent the longest time 4239 * ago that is still outstanding. 4240 */ 4241 TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { 4242 if (rsm->r_flags & RACK_ACKED) { 4243 continue; 4244 } 4245 goto finish; 4246 } 4247 finish: 4248 return (rsm); 4249 } 4250 4251 static struct rack_sendmap * 4252 rack_find_high_nonack(struct tcp_rack *rack, struct rack_sendmap *rsm) 4253 { 4254 struct rack_sendmap *prsm; 4255 4256 /* 4257 * Walk the sequence order list backward until we hit and arrive at 4258 * the highest seq not acked. In theory when this is called it 4259 * should be the last segment (which it was not). 
 */
    counter_u64_add(rack_find_high, 1);
    prsm = rsm;
    RB_FOREACH_REVERSE_FROM(prsm, rack_rb_tree_head, rsm) {
        if (prsm->r_flags & (RACK_ACKED | RACK_HAS_FIN)) {
            continue;
        }
        return (prsm);
    }
    return (NULL);
}


static uint32_t
rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts)
{
    int32_t lro;
    uint32_t thresh;

    /*
     * lro is the flag we use to determine if we have seen reordering.
     * If it gets set we have seen reordering. The reorder logic either
     * works in one of two ways:
     *
     * If reorder-fade is configured, then we track the last time we saw
     * re-ordering occur. If we reach the point where enough time has
     * passed we no longer consider reordering to be occurring.
     *
     * Or if reorder-fade is 0, then once we see reordering we consider
     * the connection to always be subject to reordering and just set lro
     * to 1.
     *
     * In the end if lro is non-zero we add the extra time for
     * reordering in.
     */
    if (srtt == 0)
        srtt = 1;
    if (rack->r_ctl.rc_reorder_ts) {
        if (rack->r_ctl.rc_reorder_fade) {
            if (SEQ_GEQ(cts, rack->r_ctl.rc_reorder_ts)) {
                lro = cts - rack->r_ctl.rc_reorder_ts;
                if (lro == 0) {
                    /*
                     * No time has passed since the last
                     * reorder, mark it as reordering.
                     */
                    lro = 1;
                }
            } else {
                /* Negative time? */
                lro = 0;
            }
            if (lro > rack->r_ctl.rc_reorder_fade) {
                /* Turn off reordering seen too */
                rack->r_ctl.rc_reorder_ts = 0;
                lro = 0;
            }
        } else {
            /* Reordering does not fade */
            lro = 1;
        }
    } else {
        lro = 0;
    }
    thresh = srtt + rack->r_ctl.rc_pkt_delay;
    if (lro) {
        /* It must be set, if not you get 1/4 rtt */
        if (rack->r_ctl.rc_reorder_shift)
            thresh += (srtt >> rack->r_ctl.rc_reorder_shift);
        else
            thresh += (srtt >> 2);
    } else {
        thresh += 1;
    }
    /* We don't let the rack timeout be above a RTO */
    if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) {
        thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur);
    }
    /* And we don't want it above the RTO max either */
    if (thresh > rack_rto_max) {
        thresh = rack_rto_max;
    }
    return (thresh);
}

static uint32_t
rack_calc_thresh_tlp(struct tcpcb *tp, struct tcp_rack *rack,
    struct rack_sendmap *rsm, uint32_t srtt)
{
    struct rack_sendmap *prsm;
    uint32_t thresh, len;
    int segsiz;

    if (srtt == 0)
        srtt = 1;
    if (rack->r_ctl.rc_tlp_threshold)
        thresh = srtt + (srtt / rack->r_ctl.rc_tlp_threshold);
    else
        thresh = (srtt * 2);

    /* Get the previous sent packet, if any */
    segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
    counter_u64_add(rack_enter_tlp_calc, 1);
    len = rsm->r_end - rsm->r_start;
    if (rack->rack_tlp_threshold_use == TLP_USE_ID) {
        /* Exactly like the ID */
        if (((tp->snd_max - tp->snd_una) - rack->r_ctl.rc_sacked + rack->r_ctl.rc_holes_rxt) <= segsiz) {
            uint32_t alt_thresh;
            /*
             * Compensate for delayed-ack with the d-ack time.
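             * e.g. with a 40ms srtt and a 200ms d-ack allowance
             * (rack_delayed_ack_time) the alternative threshold is
             * 40 + 20 + 200 = 260ms, which beats a plain 2*srtt
             * threshold of 80ms, so a lone outstanding segment is not
             * probed before the peer's delayed ack could reasonably
             * have arrived.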
4370 */ 4371 counter_u64_add(rack_used_tlpmethod, 1); 4372 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 4373 if (alt_thresh > thresh) 4374 thresh = alt_thresh; 4375 } 4376 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_ONE) { 4377 /* 2.1 behavior */ 4378 prsm = TAILQ_PREV(rsm, rack_head, r_tnext); 4379 if (prsm && (len <= segsiz)) { 4380 /* 4381 * Two packets outstanding, thresh should be (2*srtt) + 4382 * possible inter-packet delay (if any). 4383 */ 4384 uint32_t inter_gap = 0; 4385 int idx, nidx; 4386 4387 counter_u64_add(rack_used_tlpmethod, 1); 4388 idx = rsm->r_rtr_cnt - 1; 4389 nidx = prsm->r_rtr_cnt - 1; 4390 if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) { 4391 /* Yes it was sent later (or at the same time) */ 4392 inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; 4393 } 4394 thresh += inter_gap; 4395 } else if (len <= segsiz) { 4396 /* 4397 * Possibly compensate for delayed-ack. 4398 */ 4399 uint32_t alt_thresh; 4400 4401 counter_u64_add(rack_used_tlpmethod2, 1); 4402 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 4403 if (alt_thresh > thresh) 4404 thresh = alt_thresh; 4405 } 4406 } else if (rack->rack_tlp_threshold_use == TLP_USE_TWO_TWO) { 4407 /* 2.2 behavior */ 4408 if (len <= segsiz) { 4409 uint32_t alt_thresh; 4410 /* 4411 * Compensate for delayed-ack with the d-ack time. 4412 */ 4413 counter_u64_add(rack_used_tlpmethod, 1); 4414 alt_thresh = srtt + (srtt / 2) + rack_delayed_ack_time; 4415 if (alt_thresh > thresh) 4416 thresh = alt_thresh; 4417 } 4418 } 4419 /* Not above an RTO */ 4420 if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) { 4421 thresh = TICKS_2_MSEC(tp->t_rxtcur); 4422 } 4423 /* Not above a RTO max */ 4424 if (thresh > rack_rto_max) { 4425 thresh = rack_rto_max; 4426 } 4427 /* Apply user supplied min TLP */ 4428 if (thresh < rack_tlp_min) { 4429 thresh = rack_tlp_min; 4430 } 4431 return (thresh); 4432 } 4433 4434 static uint32_t 4435 rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack) 4436 { 4437 /* 4438 * We want the rack_rtt which is the 4439 * last rtt we measured. However if that 4440 * does not exist we fallback to the srtt (which 4441 * we probably will never do) and then as a last 4442 * resort we use RACK_INITIAL_RTO if no srtt is 4443 * yet set. 4444 */ 4445 if (rack->rc_rack_rtt) 4446 return(rack->rc_rack_rtt); 4447 else if (tp->t_srtt == 0) 4448 return(RACK_INITIAL_RTO); 4449 return (TICKS_2_MSEC(tp->t_srtt >> TCP_RTT_SHIFT)); 4450 } 4451 4452 static struct rack_sendmap * 4453 rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused) 4454 { 4455 /* 4456 * Check to see that we don't need to fall into recovery. We will 4457 * need to do so if our oldest transmit is past the time we should 4458 * have had an ack. 
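 * "Should have had an ack" here means the oldest un-acked rsm was last
 * sent more than rack_calc_thresh_rack() ago, i.e. an srtt plus the
 * configured reordering/packet-delay allowance.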
4459 */ 4460 struct tcp_rack *rack; 4461 struct rack_sendmap *rsm; 4462 int32_t idx; 4463 uint32_t srtt, thresh; 4464 4465 rack = (struct tcp_rack *)tp->t_fb_ptr; 4466 if (RB_EMPTY(&rack->r_ctl.rc_mtree)) { 4467 return (NULL); 4468 } 4469 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 4470 if (rsm == NULL) 4471 return (NULL); 4472 4473 if (rsm->r_flags & RACK_ACKED) { 4474 rsm = rack_find_lowest_rsm(rack); 4475 if (rsm == NULL) 4476 return (NULL); 4477 } 4478 idx = rsm->r_rtr_cnt - 1; 4479 srtt = rack_grab_rtt(tp, rack); 4480 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 4481 if (TSTMP_LT(tsused, rsm->r_tim_lastsent[idx])) { 4482 return (NULL); 4483 } 4484 if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) { 4485 return (NULL); 4486 } 4487 /* Ok if we reach here we are over-due and this guy can be sent */ 4488 if (IN_RECOVERY(tp->t_flags) == 0) { 4489 /* 4490 * For the one that enters us into recovery record undo 4491 * info. 4492 */ 4493 rack->r_ctl.rc_rsm_start = rsm->r_start; 4494 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 4495 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 4496 } 4497 rack_cong_signal(tp, NULL, CC_NDUPACK); 4498 return (rsm); 4499 } 4500 4501 static uint32_t 4502 rack_get_persists_timer_val(struct tcpcb *tp, struct tcp_rack *rack) 4503 { 4504 int32_t t; 4505 int32_t tt; 4506 uint32_t ret_val; 4507 4508 t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT)); 4509 TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], 4510 rack_persist_min, rack_persist_max); 4511 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 4512 tp->t_rxtshift++; 4513 rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT; 4514 ret_val = (uint32_t)tt; 4515 return (ret_val); 4516 } 4517 4518 static uint32_t 4519 rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack) 4520 { 4521 /* 4522 * Start the FR timer, we do this based on getting the first one in 4523 * the rc_tmap. Note that if its NULL we must stop the timer. in all 4524 * events we need to stop the running timer (if its running) before 4525 * starting the new one. 
4526 */ 4527 uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse; 4528 uint32_t srtt_cur; 4529 int32_t idx; 4530 int32_t is_tlp_timer = 0; 4531 struct rack_sendmap *rsm; 4532 4533 if (rack->t_timers_stopped) { 4534 /* All timers have been stopped none are to run */ 4535 return (0); 4536 } 4537 if (rack->rc_in_persist) { 4538 /* We can't start any timer in persists */ 4539 return (rack_get_persists_timer_val(tp, rack)); 4540 } 4541 rack->rc_on_min_to = 0; 4542 if ((tp->t_state < TCPS_ESTABLISHED) || 4543 ((tp->t_flags & TF_SACK_PERMIT) == 0)) 4544 goto activate_rxt; 4545 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 4546 if ((rsm == NULL) || sup_rack) { 4547 /* Nothing on the send map */ 4548 activate_rxt: 4549 time_since_sent = 0; 4550 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 4551 if (rsm) { 4552 idx = rsm->r_rtr_cnt - 1; 4553 if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time)) 4554 tstmp_touse = rsm->r_tim_lastsent[idx]; 4555 else 4556 tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time; 4557 if (TSTMP_GT(cts, tstmp_touse)) 4558 time_since_sent = cts - tstmp_touse; 4559 } 4560 if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { 4561 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; 4562 to = TICKS_2_MSEC(tp->t_rxtcur); 4563 if (to > time_since_sent) 4564 to -= time_since_sent; 4565 else 4566 to = rack->r_ctl.rc_min_to; 4567 if (to == 0) 4568 to = 1; 4569 return (to); 4570 } 4571 return (0); 4572 } 4573 if (rsm->r_flags & RACK_ACKED) { 4574 rsm = rack_find_lowest_rsm(rack); 4575 if (rsm == NULL) { 4576 /* No lowest? */ 4577 goto activate_rxt; 4578 } 4579 } 4580 if (rack->sack_attack_disable) { 4581 /* 4582 * We don't want to do 4583 * any TLP's if you are an attacker. 4584 * Though if you are doing what 4585 * is expected you may still have 4586 * SACK-PASSED marks. 4587 */ 4588 goto activate_rxt; 4589 } 4590 /* Convert from ms to usecs */ 4591 if (rsm->r_flags & RACK_SACK_PASSED) { 4592 if ((tp->t_flags & TF_SENTFIN) && 4593 ((tp->snd_max - tp->snd_una) == 1) && 4594 (rsm->r_flags & RACK_HAS_FIN)) { 4595 /* 4596 * We don't start a rack timer if all we have is a 4597 * FIN outstanding. 4598 */ 4599 goto activate_rxt; 4600 } 4601 if ((rack->use_rack_rr == 0) && 4602 (IN_RECOVERY(tp->t_flags)) && 4603 (rack->rack_no_prr == 0) && 4604 (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { 4605 /* 4606 * We are not cheating, in recovery and 4607 * not enough ack's to yet get our next 4608 * retransmission out. 4609 * 4610 * Note that classified attackers do not 4611 * get to use the rack-cheat. 4612 */ 4613 goto activate_tlp; 4614 } 4615 srtt = rack_grab_rtt(tp, rack); 4616 thresh = rack_calc_thresh_rack(rack, srtt, cts); 4617 idx = rsm->r_rtr_cnt - 1; 4618 exp = rsm->r_tim_lastsent[idx] + thresh; 4619 if (SEQ_GEQ(exp, cts)) { 4620 to = exp - cts; 4621 if (to < rack->r_ctl.rc_min_to) { 4622 to = rack->r_ctl.rc_min_to; 4623 if (rack->r_rr_config == 3) 4624 rack->rc_on_min_to = 1; 4625 } 4626 } else { 4627 to = rack->r_ctl.rc_min_to; 4628 if (rack->r_rr_config == 3) 4629 rack->rc_on_min_to = 1; 4630 } 4631 } else { 4632 /* Ok we need to do a TLP not RACK */ 4633 activate_tlp: 4634 if ((rack->rc_tlp_in_progress != 0) && 4635 (rack->r_ctl.rc_tlp_cnt_out >= rack_tlp_limit)) { 4636 /* 4637 * The previous send was a TLP and we have sent 4638 * N TLP's without sending new data. 4639 */ 4640 goto activate_rxt; 4641 } 4642 rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 4643 if (rsm == NULL) { 4644 /* We found no rsm to TLP with. 
*/ 4645 goto activate_rxt; 4646 } 4647 if (rsm->r_flags & RACK_HAS_FIN) { 4648 /* If its a FIN we dont do TLP */ 4649 rsm = NULL; 4650 goto activate_rxt; 4651 } 4652 idx = rsm->r_rtr_cnt - 1; 4653 time_since_sent = 0; 4654 if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time)) 4655 tstmp_touse = rsm->r_tim_lastsent[idx]; 4656 else 4657 tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time; 4658 if (TSTMP_GT(cts, tstmp_touse)) 4659 time_since_sent = cts - tstmp_touse; 4660 is_tlp_timer = 1; 4661 if (tp->t_srtt) { 4662 srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); 4663 srtt = TICKS_2_MSEC(srtt_cur); 4664 } else 4665 srtt = RACK_INITIAL_RTO; 4666 /* 4667 * If the SRTT is not keeping up and the 4668 * rack RTT has spiked we want to use 4669 * the last RTT not the smoothed one. 4670 */ 4671 if (rack_tlp_use_greater && (srtt < rack_grab_rtt(tp, rack))) 4672 srtt = rack_grab_rtt(tp, rack); 4673 thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt); 4674 if (thresh > time_since_sent) 4675 to = thresh - time_since_sent; 4676 else { 4677 to = rack->r_ctl.rc_min_to; 4678 rack_log_alt_to_to_cancel(rack, 4679 thresh, /* flex1 */ 4680 time_since_sent, /* flex2 */ 4681 tstmp_touse, /* flex3 */ 4682 rack->r_ctl.rc_tlp_rxt_last_time, /* flex4 */ 4683 rsm->r_tim_lastsent[idx], 4684 srtt, 4685 idx, 99); 4686 } 4687 if (to > TCPTV_REXMTMAX) { 4688 /* 4689 * If the TLP time works out to larger than the max 4690 * RTO lets not do TLP.. just RTO. 4691 */ 4692 goto activate_rxt; 4693 } 4694 } 4695 if (is_tlp_timer == 0) { 4696 rack->r_ctl.rc_hpts_flags |= PACE_TMR_RACK; 4697 } else { 4698 rack->r_ctl.rc_hpts_flags |= PACE_TMR_TLP; 4699 } 4700 if (to == 0) 4701 to = 1; 4702 return (to); 4703 } 4704 4705 static void 4706 rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 4707 { 4708 if (rack->rc_in_persist == 0) { 4709 if (tp->t_flags & TF_GPUTINPROG) { 4710 /* 4711 * Stop the goodput now, the calling of the 4712 * measurement function clears the flag. 4713 */ 4714 rack_do_goodput_measurement(tp, rack, tp->snd_una, __LINE__); 4715 } 4716 #ifdef NETFLIX_SHARED_CWND 4717 if (rack->r_ctl.rc_scw) { 4718 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 4719 rack->rack_scwnd_is_idle = 1; 4720 } 4721 #endif 4722 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 4723 if (rack->r_ctl.rc_went_idle_time == 0) 4724 rack->r_ctl.rc_went_idle_time = 1; 4725 rack_timer_cancel(tp, rack, cts, __LINE__); 4726 tp->t_rxtshift = 0; 4727 rack->rc_in_persist = 1; 4728 } 4729 } 4730 4731 static void 4732 rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 4733 { 4734 if (rack->rc_inp->inp_in_hpts) { 4735 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 4736 rack->r_ctl.rc_hpts_flags = 0; 4737 } 4738 #ifdef NETFLIX_SHARED_CWND 4739 if (rack->r_ctl.rc_scw) { 4740 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 4741 rack->rack_scwnd_is_idle = 0; 4742 } 4743 #endif 4744 if (rack->rc_gp_dyn_mul && 4745 (rack->use_fixed_rate == 0) && 4746 (rack->rc_always_pace)) { 4747 /* 4748 * Do we count this as if a probe-rtt just 4749 * finished? 
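 * The bar used below is rack_min_probertt_hold plus
 * (gp_srtt * rack_probertt_gpsrtt_cnt_mul / rack_probertt_gpsrtt_cnt_div),
 * roughly the hold time a real probe-rtt would have used; shorter idle
 * periods do not reset the probe-rtt bookkeeping.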
4750 */ 4751 uint32_t time_idle, idle_min; 4752 4753 time_idle = tcp_get_usecs(NULL) - rack->r_ctl.rc_went_idle_time; 4754 idle_min = rack_min_probertt_hold; 4755 if (rack_probertt_gpsrtt_cnt_div) { 4756 uint64_t extra; 4757 extra = (uint64_t)rack->r_ctl.rc_gp_srtt * 4758 (uint64_t)rack_probertt_gpsrtt_cnt_mul; 4759 extra /= (uint64_t)rack_probertt_gpsrtt_cnt_div; 4760 idle_min += (uint32_t)extra; 4761 } 4762 if (time_idle >= idle_min) { 4763 /* Yes, we count it as a probe-rtt. */ 4764 uint32_t us_cts; 4765 4766 us_cts = tcp_get_usecs(NULL); 4767 if (rack->in_probe_rtt == 0) { 4768 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 4769 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; 4770 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; 4771 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; 4772 } else { 4773 rack_exit_probertt(rack, us_cts); 4774 } 4775 } 4776 4777 } 4778 rack->rc_in_persist = 0; 4779 rack->r_ctl.rc_went_idle_time = 0; 4780 tp->t_rxtshift = 0; 4781 rack->r_ctl.rc_agg_delayed = 0; 4782 rack->r_early = 0; 4783 rack->r_late = 0; 4784 rack->r_ctl.rc_agg_early = 0; 4785 } 4786 4787 static void 4788 rack_log_hpts_diag(struct tcp_rack *rack, uint32_t cts, 4789 struct hpts_diag *diag, struct timeval *tv) 4790 { 4791 if (rack_verbose_logging && rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 4792 union tcp_log_stackspecific log; 4793 4794 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 4795 log.u_bbr.flex1 = diag->p_nxt_slot; 4796 log.u_bbr.flex2 = diag->p_cur_slot; 4797 log.u_bbr.flex3 = diag->slot_req; 4798 log.u_bbr.flex4 = diag->inp_hptsslot; 4799 log.u_bbr.flex5 = diag->slot_remaining; 4800 log.u_bbr.flex6 = diag->need_new_to; 4801 log.u_bbr.flex7 = diag->p_hpts_active; 4802 log.u_bbr.flex8 = diag->p_on_min_sleep; 4803 /* Hijack other fields as needed */ 4804 log.u_bbr.epoch = diag->have_slept; 4805 log.u_bbr.lt_epoch = diag->yet_to_sleep; 4806 log.u_bbr.pkts_out = diag->co_ret; 4807 log.u_bbr.applimited = diag->hpts_sleep_time; 4808 log.u_bbr.delivered = diag->p_prev_slot; 4809 log.u_bbr.inflight = diag->p_runningtick; 4810 log.u_bbr.bw_inuse = diag->wheel_tick; 4811 log.u_bbr.rttProp = diag->wheel_cts; 4812 log.u_bbr.timeStamp = cts; 4813 log.u_bbr.delRate = diag->maxticks; 4814 log.u_bbr.cur_del_rate = diag->p_curtick; 4815 log.u_bbr.cur_del_rate <<= 32; 4816 log.u_bbr.cur_del_rate |= diag->p_lasttick; 4817 TCP_LOG_EVENTP(rack->rc_tp, NULL, 4818 &rack->rc_inp->inp_socket->so_rcv, 4819 &rack->rc_inp->inp_socket->so_snd, 4820 BBR_LOG_HPTSDIAG, 0, 4821 0, &log, false, tv); 4822 } 4823 4824 } 4825 4826 static void 4827 rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, 4828 int32_t slot, uint32_t tot_len_this_send, int sup_rack) 4829 { 4830 struct hpts_diag diag; 4831 struct inpcb *inp; 4832 struct timeval tv; 4833 uint32_t delayed_ack = 0; 4834 uint32_t hpts_timeout; 4835 uint8_t stopped; 4836 uint32_t left = 0; 4837 uint32_t us_cts; 4838 4839 inp = tp->t_inpcb; 4840 if ((tp->t_state == TCPS_CLOSED) || 4841 (tp->t_state == TCPS_LISTEN)) { 4842 return; 4843 } 4844 if (inp->inp_in_hpts) { 4845 /* Already on the pacer */ 4846 return; 4847 } 4848 stopped = rack->rc_tmr_stopped; 4849 if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { 4850 left = rack->r_ctl.rc_timer_exp - cts; 4851 } 4852 rack->r_ctl.rc_timer_exp = 0; 4853 rack->r_ctl.rc_hpts_flags = 0; 4854 us_cts = tcp_get_usecs(&tv); 4855 /* Now early/late accounting */ 4856 if (rack->r_early) { 4857 /* 4858 * We have a early carry over set, 
4859 * we can always add more time so we 4860 * can always make this compensation. 4861 */ 4862 slot += rack->r_ctl.rc_agg_early; 4863 rack->r_early = 0; 4864 rack->r_ctl.rc_agg_early = 0; 4865 } 4866 if (rack->r_late) { 4867 /* 4868 * This is harder, we can 4869 * compensate some but it 4870 * really depends on what 4871 * the current pacing time is. 4872 */ 4873 if (rack->r_ctl.rc_agg_delayed >= slot) { 4874 /* 4875 * We can't compensate for it all. 4876 * And we have to have some time 4877 * on the clock. We always have a min 4878 * 10 slots (10 x 10 i.e. 100 usecs). 4879 */ 4880 if (slot <= HPTS_TICKS_PER_USEC) { 4881 /* We gain delay */ 4882 rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_USEC - slot); 4883 slot = HPTS_TICKS_PER_USEC; 4884 } else { 4885 /* We take off some */ 4886 rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_USEC); 4887 slot = HPTS_TICKS_PER_USEC; 4888 } 4889 } else { 4890 4891 slot -= rack->r_ctl.rc_agg_delayed; 4892 rack->r_ctl.rc_agg_delayed = 0; 4893 /* Make sure we have 100 useconds at minimum */ 4894 if (slot < HPTS_TICKS_PER_USEC) { 4895 rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_USEC - slot; 4896 slot = HPTS_TICKS_PER_USEC; 4897 } 4898 if (rack->r_ctl.rc_agg_delayed == 0) 4899 rack->r_late = 0; 4900 } 4901 } 4902 if (slot) { 4903 /* We are pacing too */ 4904 rack->r_ctl.rc_hpts_flags |= PACE_PKT_OUTPUT; 4905 } 4906 hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack); 4907 #ifdef NETFLIX_EXP_DETECTION 4908 if (rack->sack_attack_disable && 4909 (slot < tcp_sad_pacing_interval)) { 4910 /* 4911 * We have a potential attacker on 4912 * the line. We have possibly some 4913 * (or now) pacing time set. We want to 4914 * slow down the processing of sacks by some 4915 * amount (if it is an attacker). Set the default 4916 * slot for attackers in place (unless the orginal 4917 * interval is longer). Its stored in 4918 * micro-seconds, so lets convert to msecs. 4919 */ 4920 slot = tcp_sad_pacing_interval; 4921 } 4922 #endif 4923 if (tp->t_flags & TF_DELACK) { 4924 delayed_ack = TICKS_2_MSEC(tcp_delacktime); 4925 rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; 4926 } 4927 if (delayed_ack && ((hpts_timeout == 0) || 4928 (delayed_ack < hpts_timeout))) 4929 hpts_timeout = delayed_ack; 4930 else 4931 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 4932 /* 4933 * If no timers are going to run and we will fall off the hptsi 4934 * wheel, we resort to a keep-alive timer if its configured. 4935 */ 4936 if ((hpts_timeout == 0) && 4937 (slot == 0)) { 4938 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 4939 (tp->t_state <= TCPS_CLOSING)) { 4940 /* 4941 * Ok we have no timer (persists, rack, tlp, rxt or 4942 * del-ack), we don't have segments being paced. So 4943 * all that is left is the keepalive timer. 4944 */ 4945 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 4946 /* Get the established keep-alive time */ 4947 hpts_timeout = TP_KEEPIDLE(tp); 4948 } else { 4949 /* Get the initial setup keep-alive time */ 4950 hpts_timeout = TP_KEEPINIT(tp); 4951 } 4952 rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; 4953 if (rack->in_probe_rtt) { 4954 /* 4955 * We want to instead not wake up a long time from 4956 * now but to wake up about the time we would 4957 * exit probe-rtt and initiate a keep-alive ack. 4958 * This will get us out of probe-rtt and update 4959 * our min-rtt. 
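 * Note that rack_min_probertt_hold is kept in microseconds
 * while hpts_timeout here is in milliseconds, hence the divide
 * by HPTS_USEC_IN_MSEC (e.g. a 200000 usec hold becomes a
 * 200 ms wakeup).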
4960 */ 4961 hpts_timeout = (rack_min_probertt_hold / HPTS_USEC_IN_MSEC); 4962 } 4963 } 4964 } 4965 if (left && (stopped & (PACE_TMR_KEEP | PACE_TMR_DELACK)) == 4966 (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK)) { 4967 /* 4968 * RACK, TLP, persists and RXT timers all are restartable 4969 * based on actions input .. i.e we received a packet (ack 4970 * or sack) and that changes things (rw, or snd_una etc). 4971 * Thus we can restart them with a new value. For 4972 * keep-alive, delayed_ack we keep track of what was left 4973 * and restart the timer with a smaller value. 4974 */ 4975 if (left < hpts_timeout) 4976 hpts_timeout = left; 4977 } 4978 if (hpts_timeout) { 4979 /* 4980 * Hack alert for now we can't time-out over 2,147,483 4981 * seconds (a bit more than 596 hours), which is probably ok 4982 * :). 4983 */ 4984 if (hpts_timeout > 0x7ffffffe) 4985 hpts_timeout = 0x7ffffffe; 4986 rack->r_ctl.rc_timer_exp = cts + hpts_timeout; 4987 } 4988 if ((rack->rc_gp_filled == 0) && 4989 (hpts_timeout < slot) && 4990 (rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) { 4991 /* 4992 * We have no good estimate yet for the 4993 * old clunky burst mitigation or the 4994 * real pacing. And the tlp or rxt is smaller 4995 * than the pacing calculation. Lets not 4996 * pace that long since we know the calculation 4997 * so far is not accurate. 4998 */ 4999 slot = hpts_timeout; 5000 } 5001 rack->r_ctl.last_pacing_time = slot; 5002 if (slot) { 5003 rack->r_ctl.rc_last_output_to = us_cts + slot; 5004 if (rack->rc_always_pace || rack->r_mbuf_queue) { 5005 if ((rack->rc_gp_filled == 0) || 5006 rack->pacing_longer_than_rtt) { 5007 inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY); 5008 } else { 5009 inp->inp_flags2 |= INP_MBUF_QUEUE_READY; 5010 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) && 5011 (rack->r_rr_config != 3)) 5012 inp->inp_flags2 |= INP_DONT_SACK_QUEUE; 5013 else 5014 inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; 5015 } 5016 } 5017 if ((rack->use_rack_rr) && 5018 (rack->r_rr_config < 2) && 5019 ((hpts_timeout) && ((hpts_timeout * HPTS_USEC_IN_MSEC) < slot))) { 5020 /* 5021 * Arrange for the hpts to kick back in after the 5022 * t-o if the t-o does not cause a send. 
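 * In this branch the rack timer (in msecs) would fire before
 * the pacing slot (in usecs) elapses, so we queue on the wheel
 * for the timer value rather than the pacer; the trailing 0/1
 * passed to rack_log_to_start() records which of the two was
 * used.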
5023 */ 5024 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout), 5025 __LINE__, &diag); 5026 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 5027 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 5028 } else { 5029 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(slot), 5030 __LINE__, &diag); 5031 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 5032 rack_log_to_start(rack, cts, hpts_timeout, slot, 1); 5033 } 5034 } else if (hpts_timeout) { 5035 if (rack->rc_always_pace || rack->r_mbuf_queue) { 5036 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) { 5037 /* For a rack timer, don't wake us */ 5038 inp->inp_flags2 |= INP_MBUF_QUEUE_READY; 5039 if (rack->r_rr_config != 3) 5040 inp->inp_flags2 |= INP_DONT_SACK_QUEUE; 5041 else 5042 inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; 5043 } else { 5044 /* All other timers wake us up */ 5045 inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY; 5046 inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; 5047 } 5048 } 5049 (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout), 5050 __LINE__, &diag); 5051 rack_log_hpts_diag(rack, us_cts, &diag, &tv); 5052 rack_log_to_start(rack, cts, hpts_timeout, slot, 0); 5053 } else { 5054 /* No timer starting */ 5055 #ifdef INVARIANTS 5056 if (SEQ_GT(tp->snd_max, tp->snd_una)) { 5057 panic("tp:%p rack:%p tlts:%d cts:%u slot:%u pto:%u -- no timer started?", 5058 tp, rack, tot_len_this_send, cts, slot, hpts_timeout); 5059 } 5060 #endif 5061 } 5062 rack->rc_tmr_stopped = 0; 5063 if (slot) 5064 rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv); 5065 } 5066 5067 /* 5068 * RACK Timer, here we simply do logging and house keeping. 5069 * the normal rack_output() function will call the 5070 * appropriate thing to check if we need to do a RACK retransmit. 5071 * We return 1, saying don't proceed with rack_output only 5072 * when all timers have been stopped (destroyed PCB?). 5073 */ 5074 static int 5075 rack_timeout_rack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5076 { 5077 /* 5078 * This timer simply provides an internal trigger to send out data. 5079 * The check_recovery_mode call will see if there are needed 5080 * retransmissions, if so we will enter fast-recovery. The output 5081 * call may or may not do the same thing depending on sysctl 5082 * settings. 5083 */ 5084 struct rack_sendmap *rsm; 5085 int32_t recovery; 5086 5087 if (tp->t_timers->tt_flags & TT_STOPPED) { 5088 return (1); 5089 } 5090 recovery = IN_RECOVERY(tp->t_flags); 5091 counter_u64_add(rack_to_tot, 1); 5092 if (rack->r_state && (rack->r_state != tp->t_state)) 5093 rack_set_state(tp, rack); 5094 rack->rc_on_min_to = 0; 5095 rsm = rack_check_recovery_mode(tp, cts); 5096 rack_log_to_event(rack, RACK_TO_FRM_RACK, rsm); 5097 if (rsm) { 5098 uint32_t rtt; 5099 5100 rack->r_ctl.rc_resend = rsm; 5101 if (rack->use_rack_rr) { 5102 /* 5103 * Don't accumulate extra pacing delay 5104 * we are allowing the rack timer to 5105 * over-ride pacing i.e. rrr takes precedence 5106 * if the pacing interval is longer than the rrr 5107 * time (in other words we get the min pacing 5108 * time versus rrr pacing time). 
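 * Setting r_timer_override and clearing PACE_PKT_OUTPUT below
 * gives up any pending pacing slot so the retransmit is not
 * held back waiting for it.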
5109 */ 5110 rack->r_timer_override = 1; 5111 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 5112 } 5113 rtt = rack->rc_rack_rtt; 5114 if (rtt == 0) 5115 rtt = 1; 5116 if (rack->rack_no_prr == 0) { 5117 if ((recovery == 0) && 5118 (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { 5119 /* 5120 * The rack-timeout that enter's us into recovery 5121 * will force out one MSS and set us up so that we 5122 * can do one more send in 2*rtt (transitioning the 5123 * rack timeout into a rack-tlp). 5124 */ 5125 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 5126 rack->r_timer_override = 1; 5127 rack_log_to_prr(rack, 3, 0); 5128 } else if ((rack->r_ctl.rc_prr_sndcnt < (rsm->r_end - rsm->r_start)) && 5129 rack->use_rack_rr) { 5130 /* 5131 * When a rack timer goes, if the rack rr is 5132 * on, arrange it so we can send a full segment 5133 * overriding prr (though we pay a price for this 5134 * for future new sends). 5135 */ 5136 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 5137 rack_log_to_prr(rack, 4, 0); 5138 } 5139 } 5140 } 5141 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; 5142 if (rsm == NULL) { 5143 /* restart a timer and return 1 */ 5144 rack_start_hpts_timer(rack, tp, cts, 5145 0, 0, 0); 5146 return (1); 5147 } 5148 return (0); 5149 } 5150 5151 static __inline void 5152 rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm, 5153 struct rack_sendmap *rsm, uint32_t start) 5154 { 5155 int idx; 5156 5157 nrsm->r_start = start; 5158 nrsm->r_end = rsm->r_end; 5159 nrsm->r_rtr_cnt = rsm->r_rtr_cnt; 5160 nrsm->r_flags = rsm->r_flags; 5161 nrsm->r_dupack = rsm->r_dupack; 5162 nrsm->usec_orig_send = rsm->usec_orig_send; 5163 nrsm->r_rtr_bytes = 0; 5164 rsm->r_end = nrsm->r_start; 5165 nrsm->r_just_ret = rsm->r_just_ret; 5166 for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { 5167 nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; 5168 } 5169 } 5170 5171 static struct rack_sendmap * 5172 rack_merge_rsm(struct tcp_rack *rack, 5173 struct rack_sendmap *l_rsm, 5174 struct rack_sendmap *r_rsm) 5175 { 5176 /* 5177 * We are merging two ack'd RSM's, 5178 * the l_rsm is on the left (lower seq 5179 * values) and the r_rsm is on the right 5180 * (higher seq value). The simplest way 5181 * to merge these is to move the right 5182 * one into the left. I don't think there 5183 * is any reason we need to try to find 5184 * the oldest (or last oldest retransmitted). 5185 */ 5186 struct rack_sendmap *rm; 5187 5188 l_rsm->r_end = r_rsm->r_end; 5189 if (l_rsm->r_dupack < r_rsm->r_dupack) 5190 l_rsm->r_dupack = r_rsm->r_dupack; 5191 if (r_rsm->r_rtr_bytes) 5192 l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes; 5193 if (r_rsm->r_in_tmap) { 5194 /* This really should not happen */ 5195 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext); 5196 r_rsm->r_in_tmap = 0; 5197 } 5198 5199 /* Now the flags */ 5200 if (r_rsm->r_flags & RACK_HAS_FIN) 5201 l_rsm->r_flags |= RACK_HAS_FIN; 5202 if (r_rsm->r_flags & RACK_TLP) 5203 l_rsm->r_flags |= RACK_TLP; 5204 if (r_rsm->r_flags & RACK_RWND_COLLAPSED) 5205 l_rsm->r_flags |= RACK_RWND_COLLAPSED; 5206 if ((r_rsm->r_flags & RACK_APP_LIMITED) && 5207 ((l_rsm->r_flags & RACK_APP_LIMITED) == 0)) { 5208 /* 5209 * If both are app-limited then let the 5210 * free lower the count. If right is app 5211 * limited and left is not, transfer. 
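 * The right map is about to be freed, so the app-limited
 * marking (and the rc_first_appl pointer, if it referenced the
 * right map) must be carried over to the surviving left map.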
5212 */ 5213 l_rsm->r_flags |= RACK_APP_LIMITED; 5214 r_rsm->r_flags &= ~RACK_APP_LIMITED; 5215 if (r_rsm == rack->r_ctl.rc_first_appl) 5216 rack->r_ctl.rc_first_appl = l_rsm; 5217 } 5218 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, r_rsm); 5219 #ifdef INVARIANTS 5220 if (rm != r_rsm) { 5221 panic("removing head in rack:%p rsm:%p rm:%p", 5222 rack, r_rsm, rm); 5223 } 5224 #endif 5225 if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) { 5226 /* Transfer the split limit to the map we free */ 5227 r_rsm->r_limit_type = l_rsm->r_limit_type; 5228 l_rsm->r_limit_type = 0; 5229 } 5230 rack_free(rack, r_rsm); 5231 return(l_rsm); 5232 } 5233 5234 /* 5235 * TLP Timer, here we simply setup what segment we want to 5236 * have the TLP expire on, the normal rack_output() will then 5237 * send it out. 5238 * 5239 * We return 1, saying don't proceed with rack_output only 5240 * when all timers have been stopped (destroyed PCB?). 5241 */ 5242 static int 5243 rack_timeout_tlp(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5244 { 5245 /* 5246 * Tail Loss Probe. 5247 */ 5248 struct rack_sendmap *rsm = NULL; 5249 struct rack_sendmap *insret; 5250 struct socket *so; 5251 uint32_t amm, old_prr_snd = 0; 5252 uint32_t out, avail; 5253 int collapsed_win = 0; 5254 5255 if (tp->t_timers->tt_flags & TT_STOPPED) { 5256 return (1); 5257 } 5258 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 5259 /* Its not time yet */ 5260 return (0); 5261 } 5262 if (ctf_progress_timeout_check(tp, true)) { 5263 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 5264 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 5265 return (1); 5266 } 5267 /* 5268 * A TLP timer has expired. We have been idle for 2 rtts. So we now 5269 * need to figure out how to force a full MSS segment out. 5270 */ 5271 rack_log_to_event(rack, RACK_TO_FRM_TLP, NULL); 5272 counter_u64_add(rack_tlp_tot, 1); 5273 if (rack->r_state && (rack->r_state != tp->t_state)) 5274 rack_set_state(tp, rack); 5275 so = tp->t_inpcb->inp_socket; 5276 #ifdef KERN_TLS 5277 if (rack->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) { 5278 /* 5279 * For hardware TLS we do *not* want to send 5280 * new data, lets instead just do a retransmission. 5281 */ 5282 goto need_retran; 5283 } 5284 #endif 5285 avail = sbavail(&so->so_snd); 5286 out = tp->snd_max - tp->snd_una; 5287 if (out > tp->snd_wnd) { 5288 /* special case, we need a retransmission */ 5289 collapsed_win = 1; 5290 goto need_retran; 5291 } 5292 /* 5293 * Check our send oldest always settings, and if 5294 * there is an oldest to send jump to the need_retran. 
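 * Otherwise the TLP prefers to send one MSS of new data when a
 * full MSS is available and fits within the receive window;
 * only when that is not possible do we fall back to re-sending
 * an already-outstanding segment below.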
5295 */ 5296 if (rack_always_send_oldest && (TAILQ_EMPTY(&rack->r_ctl.rc_tmap) == 0)) 5297 goto need_retran; 5298 5299 if (avail > out) { 5300 /* New data is available */ 5301 amm = avail - out; 5302 if (amm > ctf_fixed_maxseg(tp)) { 5303 amm = ctf_fixed_maxseg(tp); 5304 if ((amm + out) > tp->snd_wnd) { 5305 /* We are rwnd limited */ 5306 goto need_retran; 5307 } 5308 } else if (amm < ctf_fixed_maxseg(tp)) { 5309 /* not enough to fill a MTU */ 5310 goto need_retran; 5311 } 5312 if (IN_RECOVERY(tp->t_flags)) { 5313 /* Unlikely */ 5314 if (rack->rack_no_prr == 0) { 5315 old_prr_snd = rack->r_ctl.rc_prr_sndcnt; 5316 if (out + amm <= tp->snd_wnd) { 5317 rack->r_ctl.rc_prr_sndcnt = amm; 5318 rack_log_to_prr(rack, 4, 0); 5319 } 5320 } else 5321 goto need_retran; 5322 } else { 5323 /* Set the send-new override */ 5324 if (out + amm <= tp->snd_wnd) 5325 rack->r_ctl.rc_tlp_new_data = amm; 5326 else 5327 goto need_retran; 5328 } 5329 rack->r_ctl.rc_tlpsend = NULL; 5330 counter_u64_add(rack_tlp_newdata, 1); 5331 goto send; 5332 } 5333 need_retran: 5334 /* 5335 * Ok we need to arrange the last un-acked segment to be re-sent, or 5336 * optionally the first un-acked segment. 5337 */ 5338 if (collapsed_win == 0) { 5339 if (rack_always_send_oldest) 5340 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 5341 else { 5342 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 5343 if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { 5344 rsm = rack_find_high_nonack(rack, rsm); 5345 } 5346 } 5347 if (rsm == NULL) { 5348 counter_u64_add(rack_tlp_does_nada, 1); 5349 #ifdef TCP_BLACKBOX 5350 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 5351 #endif 5352 goto out; 5353 } 5354 } else { 5355 /* 5356 * We must find the last segment 5357 * that was acceptable by the client. 5358 */ 5359 RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 5360 if ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0) { 5361 /* Found one */ 5362 break; 5363 } 5364 } 5365 if (rsm == NULL) { 5366 /* None? if so send the first */ 5367 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 5368 if (rsm == NULL) { 5369 counter_u64_add(rack_tlp_does_nada, 1); 5370 #ifdef TCP_BLACKBOX 5371 tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); 5372 #endif 5373 goto out; 5374 } 5375 } 5376 } 5377 if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) { 5378 /* 5379 * We need to split this the last segment in two. 5380 */ 5381 struct rack_sendmap *nrsm; 5382 5383 5384 nrsm = rack_alloc_full_limit(rack); 5385 if (nrsm == NULL) { 5386 /* 5387 * No memory to split, we will just exit and punt 5388 * off to the RXT timer. 
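 * Without a spare sendmap block we cannot carve the trailing
 * MSS out of this oversized entry, so we skip the probe and
 * let the regular retransmit timer recover the data later.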
5389 */ 5390 counter_u64_add(rack_tlp_does_nada, 1); 5391 goto out; 5392 } 5393 rack_clone_rsm(rack, nrsm, rsm, 5394 (rsm->r_end - ctf_fixed_maxseg(tp))); 5395 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 5396 #ifdef INVARIANTS 5397 if (insret != NULL) { 5398 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 5399 nrsm, insret, rack, rsm); 5400 } 5401 #endif 5402 if (rsm->r_in_tmap) { 5403 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 5404 nrsm->r_in_tmap = 1; 5405 } 5406 rsm->r_flags &= (~RACK_HAS_FIN); 5407 rsm = nrsm; 5408 } 5409 rack->r_ctl.rc_tlpsend = rsm; 5410 send: 5411 rack->r_timer_override = 1; 5412 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 5413 return (0); 5414 out: 5415 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; 5416 return (0); 5417 } 5418 5419 /* 5420 * Delayed ack Timer, here we simply need to setup the 5421 * ACK_NOW flag and remove the DELACK flag. From there 5422 * the output routine will send the ack out. 5423 * 5424 * We only return 1, saying don't proceed, if all timers 5425 * are stopped (destroyed PCB?). 5426 */ 5427 static int 5428 rack_timeout_delack(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5429 { 5430 if (tp->t_timers->tt_flags & TT_STOPPED) { 5431 return (1); 5432 } 5433 rack_log_to_event(rack, RACK_TO_FRM_DELACK, NULL); 5434 tp->t_flags &= ~TF_DELACK; 5435 tp->t_flags |= TF_ACKNOW; 5436 KMOD_TCPSTAT_INC(tcps_delack); 5437 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_DELACK; 5438 return (0); 5439 } 5440 5441 /* 5442 * Persists timer, here we simply send the 5443 * same thing as a keepalive will. 5444 * the one byte send. 5445 * 5446 * We only return 1, saying don't proceed, if all timers 5447 * are stopped (destroyed PCB?). 5448 */ 5449 static int 5450 rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5451 { 5452 struct tcptemp *t_template; 5453 struct inpcb *inp; 5454 int32_t retval = 1; 5455 5456 inp = tp->t_inpcb; 5457 5458 if (tp->t_timers->tt_flags & TT_STOPPED) { 5459 return (1); 5460 } 5461 if (rack->rc_in_persist == 0) 5462 return (0); 5463 if (ctf_progress_timeout_check(tp, false)) { 5464 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 5465 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 5466 tcp_set_inp_to_drop(inp, ETIMEDOUT); 5467 return (1); 5468 } 5469 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); 5470 /* 5471 * Persistence timer into zero window. Force a byte to be output, if 5472 * possible. 5473 */ 5474 KMOD_TCPSTAT_INC(tcps_persisttimeo); 5475 /* 5476 * Hack: if the peer is dead/unreachable, we do not time out if the 5477 * window is closed. After a full backoff, drop the connection if 5478 * the idle time (no responses to probes) reaches the maximum 5479 * backoff that we would use if retransmitting. 5480 */ 5481 if (tp->t_rxtshift == TCP_MAXRXTSHIFT && 5482 (ticks - tp->t_rcvtime >= tcp_maxpersistidle || 5483 ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { 5484 KMOD_TCPSTAT_INC(tcps_persistdrop); 5485 retval = 1; 5486 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 5487 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 5488 goto out; 5489 } 5490 if ((sbavail(&rack->rc_inp->inp_socket->so_snd) == 0) && 5491 tp->snd_una == tp->snd_max) 5492 rack_exit_persist(tp, rack, cts); 5493 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_PERSIT; 5494 /* 5495 * If the user has closed the socket then drop a persisting 5496 * connection after a much reduced timeout. 
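 * Past CLOSE_WAIT the local side has already closed, so no
 * application remains to consume data; the zero-window probing
 * is therefore bounded by TCPTV_PERSMAX rather than the full
 * backoff schedule used above.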
5497 */ 5498 if (tp->t_state > TCPS_CLOSE_WAIT && 5499 (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) { 5500 retval = 1; 5501 KMOD_TCPSTAT_INC(tcps_persistdrop); 5502 tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); 5503 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 5504 goto out; 5505 } 5506 t_template = tcpip_maketemplate(rack->rc_inp); 5507 if (t_template) { 5508 /* only set it if we were answered */ 5509 if (rack->forced_ack == 0) { 5510 rack->forced_ack = 1; 5511 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); 5512 } 5513 tcp_respond(tp, t_template->tt_ipgen, 5514 &t_template->tt_t, (struct mbuf *)NULL, 5515 tp->rcv_nxt, tp->snd_una - 1, 0); 5516 /* This sends an ack */ 5517 if (tp->t_flags & TF_DELACK) 5518 tp->t_flags &= ~TF_DELACK; 5519 free(t_template, M_TEMP); 5520 } 5521 if (tp->t_rxtshift < TCP_MAXRXTSHIFT) 5522 tp->t_rxtshift++; 5523 out: 5524 rack_log_to_event(rack, RACK_TO_FRM_PERSIST, NULL); 5525 rack_start_hpts_timer(rack, tp, cts, 5526 0, 0, 0); 5527 return (retval); 5528 } 5529 5530 /* 5531 * If a keepalive goes off, we had no other timers 5532 * happening. We always return 1 here since this 5533 * routine either drops the connection or sends 5534 * out a segment with respond. 5535 */ 5536 static int 5537 rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5538 { 5539 struct tcptemp *t_template; 5540 struct inpcb *inp; 5541 5542 if (tp->t_timers->tt_flags & TT_STOPPED) { 5543 return (1); 5544 } 5545 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; 5546 inp = tp->t_inpcb; 5547 rack_log_to_event(rack, RACK_TO_FRM_KEEP, NULL); 5548 /* 5549 * Keep-alive timer went off; send something or drop connection if 5550 * idle for too long. 5551 */ 5552 KMOD_TCPSTAT_INC(tcps_keeptimeo); 5553 if (tp->t_state < TCPS_ESTABLISHED) 5554 goto dropit; 5555 if ((V_tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && 5556 tp->t_state <= TCPS_CLOSING) { 5557 if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) 5558 goto dropit; 5559 /* 5560 * Send a packet designed to force a response if the peer is 5561 * up and reachable: either an ACK if the connection is 5562 * still alive, or an RST if the peer has closed the 5563 * connection due to timeout or reboot. Using sequence 5564 * number tp->snd_una-1 causes the transmitted zero-length 5565 * segment to lie outside the receive window; by the 5566 * protocol spec, this requires the correspondent TCP to 5567 * respond. 5568 */ 5569 KMOD_TCPSTAT_INC(tcps_keepprobe); 5570 t_template = tcpip_maketemplate(inp); 5571 if (t_template) { 5572 if (rack->forced_ack == 0) { 5573 rack->forced_ack = 1; 5574 rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); 5575 } 5576 tcp_respond(tp, t_template->tt_ipgen, 5577 &t_template->tt_t, (struct mbuf *)NULL, 5578 tp->rcv_nxt, tp->snd_una - 1, 0); 5579 free(t_template, M_TEMP); 5580 } 5581 } 5582 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 5583 return (1); 5584 dropit: 5585 KMOD_TCPSTAT_INC(tcps_keepdrops); 5586 tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); 5587 tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); 5588 return (1); 5589 } 5590 5591 /* 5592 * Retransmit helper function, clear up all the ack 5593 * flags and take care of important book keeping. 5594 */ 5595 static void 5596 rack_remxt_tmr(struct tcpcb *tp) 5597 { 5598 /* 5599 * The retransmit timer went off, all sack'd blocks must be 5600 * un-acked. 
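 * The loop below walks the entire map: every block marked
 * RACK_ACKED is returned to the transmit list if needed,
 * tagged RACK_WAS_ACKED so we remember it had been SACKed, and
 * has its ACKED/SACK_PASSED state cleared; rc_sacked is then
 * zeroed to match.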
5601 */ 5602 struct rack_sendmap *rsm, *trsm = NULL; 5603 struct tcp_rack *rack; 5604 int32_t cnt = 0; 5605 5606 rack = (struct tcp_rack *)tp->t_fb_ptr; 5607 rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__); 5608 rack_log_to_event(rack, RACK_TO_FRM_TMR, NULL); 5609 if (rack->r_state && (rack->r_state != tp->t_state)) 5610 rack_set_state(tp, rack); 5611 /* 5612 * Ideally we would like to be able to 5613 * mark SACK-PASS on anything not acked here. 5614 * However, if we do that we would burst out 5615 * all that data 1ms apart. This would be unwise, 5616 * so for now we will just let the normal rxt timer 5617 * and tlp timer take care of it. 5618 */ 5619 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 5620 if (rsm->r_flags & RACK_ACKED) { 5621 cnt++; 5622 rsm->r_dupack = 0; 5623 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 5624 if (rsm->r_in_tmap == 0) { 5625 /* We must re-add it back to the tlist */ 5626 if (trsm == NULL) { 5627 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 5628 } else { 5629 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); 5630 } 5631 rsm->r_in_tmap = 1; 5632 } 5633 } 5634 trsm = rsm; 5635 if (rsm->r_flags & RACK_ACKED) 5636 rsm->r_flags |= RACK_WAS_ACKED; 5637 rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS); 5638 } 5639 /* Clear the count (we just un-acked them) */ 5640 rack->r_ctl.rc_sacked = 0; 5641 rack->r_ctl.rc_agg_delayed = 0; 5642 rack->r_early = 0; 5643 rack->r_ctl.rc_agg_early = 0; 5644 rack->r_late = 0; 5645 /* Clear the tlp rtx mark */ 5646 rack->r_ctl.rc_resend = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 5647 rack->r_ctl.rc_prr_sndcnt = 0; 5648 rack_log_to_prr(rack, 6, 0); 5649 rack->r_timer_override = 1; 5650 } 5651 5652 static void 5653 rack_cc_conn_init(struct tcpcb *tp) 5654 { 5655 struct tcp_rack *rack; 5656 5657 5658 rack = (struct tcp_rack *)tp->t_fb_ptr; 5659 cc_conn_init(tp); 5660 /* 5661 * We want a chance to stay in slowstart as 5662 * we create a connection. TCP spec says that 5663 * initially ssthresh is infinite. For our 5664 * purposes that is the snd_wnd. 5665 */ 5666 if (tp->snd_ssthresh < tp->snd_wnd) { 5667 tp->snd_ssthresh = tp->snd_wnd; 5668 } 5669 /* 5670 * We also want to assure a IW worth of 5671 * data can get inflight. 5672 */ 5673 if (rc_init_window(rack) < tp->snd_cwnd) 5674 tp->snd_cwnd = rc_init_window(rack); 5675 } 5676 5677 /* 5678 * Re-transmit timeout! If we drop the PCB we will return 1, otherwise 5679 * we will setup to retransmit the lowest seq number outstanding. 5680 */ 5681 static int 5682 rack_timeout_rxt(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) 5683 { 5684 int32_t rexmt; 5685 struct inpcb *inp; 5686 int32_t retval = 0; 5687 bool isipv6; 5688 5689 inp = tp->t_inpcb; 5690 if (tp->t_timers->tt_flags & TT_STOPPED) { 5691 return (1); 5692 } 5693 if (ctf_progress_timeout_check(tp, false)) { 5694 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 5695 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 5696 tcp_set_inp_to_drop(inp, ETIMEDOUT); 5697 return (1); 5698 } 5699 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; 5700 if (TCPS_HAVEESTABLISHED(tp->t_state) && 5701 (tp->snd_una == tp->snd_max)) { 5702 /* Nothing outstanding .. nothing to do */ 5703 return (0); 5704 } 5705 /* 5706 * Retransmission timer went off. Message has not been acked within 5707 * retransmit interval. Back off to a longer retransmit interval 5708 * and retransmit one segment. 
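 * The new interval computed further down is TCP_REXMTVAL(tp)
 * scaled by tcp_backoff[t_rxtshift] and clamped between
 * rack_rto_min and rack_rto_max, so each successive timeout
 * roughly doubles the wait until the cap is reached.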
5709 */ 5710 rack_remxt_tmr(tp); 5711 if ((rack->r_ctl.rc_resend == NULL) || 5712 ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) { 5713 /* 5714 * If the rwnd collapsed on 5715 * the one we are retransmitting 5716 * it does not count against the 5717 * rxt count. 5718 */ 5719 tp->t_rxtshift++; 5720 } 5721 if (tp->t_rxtshift > TCP_MAXRXTSHIFT) { 5722 tp->t_rxtshift = TCP_MAXRXTSHIFT; 5723 KMOD_TCPSTAT_INC(tcps_timeoutdrop); 5724 retval = 1; 5725 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); 5726 tcp_set_inp_to_drop(rack->rc_inp, 5727 (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT)); 5728 goto out; 5729 } 5730 if (tp->t_state == TCPS_SYN_SENT) { 5731 /* 5732 * If the SYN was retransmitted, indicate CWND to be limited 5733 * to 1 segment in cc_conn_init(). 5734 */ 5735 tp->snd_cwnd = 1; 5736 } else if (tp->t_rxtshift == 1) { 5737 /* 5738 * first retransmit; record ssthresh and cwnd so they can be 5739 * recovered if this turns out to be a "bad" retransmit. A 5740 * retransmit is considered "bad" if an ACK for this segment 5741 * is received within RTT/2 interval; the assumption here is 5742 * that the ACK was already in flight. See "On Estimating 5743 * End-to-End Network Path Properties" by Allman and Paxson 5744 * for more details. 5745 */ 5746 tp->snd_cwnd_prev = tp->snd_cwnd; 5747 tp->snd_ssthresh_prev = tp->snd_ssthresh; 5748 tp->snd_recover_prev = tp->snd_recover; 5749 if (IN_FASTRECOVERY(tp->t_flags)) 5750 tp->t_flags |= TF_WASFRECOVERY; 5751 else 5752 tp->t_flags &= ~TF_WASFRECOVERY; 5753 if (IN_CONGRECOVERY(tp->t_flags)) 5754 tp->t_flags |= TF_WASCRECOVERY; 5755 else 5756 tp->t_flags &= ~TF_WASCRECOVERY; 5757 tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); 5758 tp->t_flags |= TF_PREVVALID; 5759 } else 5760 tp->t_flags &= ~TF_PREVVALID; 5761 KMOD_TCPSTAT_INC(tcps_rexmttimeo); 5762 if ((tp->t_state == TCPS_SYN_SENT) || 5763 (tp->t_state == TCPS_SYN_RECEIVED)) 5764 rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]); 5765 else 5766 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; 5767 TCPT_RANGESET(tp->t_rxtcur, rexmt, 5768 max(MSEC_2_TICKS(rack_rto_min), rexmt), 5769 MSEC_2_TICKS(rack_rto_max)); 5770 /* 5771 * We enter the path for PLMTUD if connection is established or, if 5772 * connection is FIN_WAIT_1 status, reason for the last is that if 5773 * amount of data we send is very small, we could send it in couple 5774 * of packets and process straight to FIN. In that case we won't 5775 * catch ESTABLISHED state. 5776 */ 5777 #ifdef INET6 5778 isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? true : false; 5779 #else 5780 isipv6 = false; 5781 #endif 5782 if (((V_tcp_pmtud_blackhole_detect == 1) || 5783 (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) || 5784 (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) && 5785 ((tp->t_state == TCPS_ESTABLISHED) || 5786 (tp->t_state == TCPS_FIN_WAIT_1))) { 5787 5788 /* 5789 * Idea here is that at each stage of mtu probe (usually, 5790 * 1448 -> 1188 -> 524) should be given 2 chances to recover 5791 * before further clamping down. 'tp->t_rxtshift % 2 == 0' 5792 * should take care of that. 5793 */ 5794 if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) == 5795 (TF2_PLPMTU_PMTUD | TF2_PLPMTU_MAXSEGSNT)) && 5796 (tp->t_rxtshift >= 2 && tp->t_rxtshift < 6 && 5797 tp->t_rxtshift % 2 == 0)) { 5798 /* 5799 * Enter Path MTU Black-hole Detection mechanism: - 5800 * Disable Path MTU Discovery (IP "DF" bit). - 5801 * Reduce MTU to lower value than what we negotiated 5802 * with peer. 
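 * The current t_maxseg is stashed in t_pmtud_saved_maxseg so
 * the recovery branch below can restore it (and re-enable
 * PMTUD) if the lowered MSS does not help either.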
5803 */ 5804 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) { 5805 /* Record that we may have found a black hole. */ 5806 tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE; 5807 /* Keep track of previous MSS. */ 5808 tp->t_pmtud_saved_maxseg = tp->t_maxseg; 5809 } 5810 5811 /* 5812 * Reduce the MSS to blackhole value or to the 5813 * default in an attempt to retransmit. 5814 */ 5815 #ifdef INET6 5816 if (isipv6 && 5817 tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) { 5818 /* Use the sysctl tuneable blackhole MSS. */ 5819 tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss; 5820 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 5821 } else if (isipv6) { 5822 /* Use the default MSS. */ 5823 tp->t_maxseg = V_tcp_v6mssdflt; 5824 /* 5825 * Disable Path MTU Discovery when we switch 5826 * to minmss. 5827 */ 5828 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 5829 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 5830 } 5831 #endif 5832 #if defined(INET6) && defined(INET) 5833 else 5834 #endif 5835 #ifdef INET 5836 if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) { 5837 /* Use the sysctl tuneable blackhole MSS. */ 5838 tp->t_maxseg = V_tcp_pmtud_blackhole_mss; 5839 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated); 5840 } else { 5841 /* Use the default MSS. */ 5842 tp->t_maxseg = V_tcp_mssdflt; 5843 /* 5844 * Disable Path MTU Discovery when we switch 5845 * to minmss. 5846 */ 5847 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 5848 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss); 5849 } 5850 #endif 5851 } else { 5852 /* 5853 * If further retransmissions are still unsuccessful 5854 * with a lowered MTU, maybe this isn't a blackhole 5855 * and we restore the previous MSS and blackhole 5856 * detection flags. The limit '6' is determined by 5857 * giving each probe stage (1448, 1188, 524) 2 5858 * chances to recover. 5859 */ 5860 if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) && 5861 (tp->t_rxtshift >= 6)) { 5862 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 5863 tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE; 5864 tp->t_maxseg = tp->t_pmtud_saved_maxseg; 5865 KMOD_TCPSTAT_INC(tcps_pmtud_blackhole_failed); 5866 } 5867 } 5868 } 5869 /* 5870 * If we backed off this far, our srtt estimate is probably bogus. 5871 * Clobber it so we'll take the next rtt measurement as our srtt; 5872 * move the current srtt into rttvar to keep the current retransmit 5873 * times until then. 5874 */ 5875 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { 5876 #ifdef INET6 5877 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 5878 in6_losing(tp->t_inpcb); 5879 else 5880 #endif 5881 in_losing(tp->t_inpcb); 5882 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); 5883 tp->t_srtt = 0; 5884 } 5885 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 5886 tp->snd_recover = tp->snd_max; 5887 tp->t_flags |= TF_ACKNOW; 5888 tp->t_rtttime = 0; 5889 rack_cong_signal(tp, NULL, CC_RTO); 5890 out: 5891 return (retval); 5892 } 5893 5894 static int 5895 rack_process_timers(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, uint8_t hpts_calling) 5896 { 5897 int32_t ret = 0; 5898 int32_t timers = (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK); 5899 5900 if (timers == 0) { 5901 return (0); 5902 } 5903 if (tp->t_state == TCPS_LISTEN) { 5904 /* no timers on listen sockets */ 5905 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) 5906 return (0); 5907 return (1); 5908 } 5909 if ((timers & PACE_TMR_RACK) && 5910 rack->rc_on_min_to) { 5911 /* 5912 * For the rack timer when we 5913 * are on a min-timeout (which means rrr_conf = 3) 5914 * we don't want to check the timer. 
It may 5915 * be going off for a pace and thats ok we 5916 * want to send the retransmit (if its ready). 5917 * 5918 * If its on a normal rack timer (non-min) then 5919 * we will check if its expired. 5920 */ 5921 goto skip_time_check; 5922 } 5923 if (TSTMP_LT(cts, rack->r_ctl.rc_timer_exp)) { 5924 uint32_t left; 5925 5926 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 5927 ret = -1; 5928 rack_log_to_processing(rack, cts, ret, 0); 5929 return (0); 5930 } 5931 if (hpts_calling == 0) { 5932 /* 5933 * A user send or queued mbuf (sack) has called us? We 5934 * return 0 and let the pacing guards 5935 * deal with it if they should or 5936 * should not cause a send. 5937 */ 5938 ret = -2; 5939 rack_log_to_processing(rack, cts, ret, 0); 5940 return (0); 5941 } 5942 /* 5943 * Ok our timer went off early and we are not paced false 5944 * alarm, go back to sleep. 5945 */ 5946 ret = -3; 5947 left = rack->r_ctl.rc_timer_exp - cts; 5948 tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(left)); 5949 rack_log_to_processing(rack, cts, ret, left); 5950 return (1); 5951 } 5952 skip_time_check: 5953 rack->rc_tmr_stopped = 0; 5954 rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_MASK; 5955 if (timers & PACE_TMR_DELACK) { 5956 ret = rack_timeout_delack(tp, rack, cts); 5957 } else if (timers & PACE_TMR_RACK) { 5958 rack->r_ctl.rc_tlp_rxt_last_time = cts; 5959 ret = rack_timeout_rack(tp, rack, cts); 5960 } else if (timers & PACE_TMR_TLP) { 5961 rack->r_ctl.rc_tlp_rxt_last_time = cts; 5962 ret = rack_timeout_tlp(tp, rack, cts); 5963 } else if (timers & PACE_TMR_RXT) { 5964 rack->r_ctl.rc_tlp_rxt_last_time = cts; 5965 ret = rack_timeout_rxt(tp, rack, cts); 5966 } else if (timers & PACE_TMR_PERSIT) { 5967 ret = rack_timeout_persist(tp, rack, cts); 5968 } else if (timers & PACE_TMR_KEEP) { 5969 ret = rack_timeout_keepalive(tp, rack, cts); 5970 } 5971 rack_log_to_processing(rack, cts, ret, timers); 5972 return (ret); 5973 } 5974 5975 static void 5976 rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line) 5977 { 5978 struct timeval tv; 5979 uint32_t us_cts, flags_on_entry; 5980 uint8_t hpts_removed = 0; 5981 5982 5983 flags_on_entry = rack->r_ctl.rc_hpts_flags; 5984 us_cts = tcp_get_usecs(&tv); 5985 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 5986 ((TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) || 5987 ((tp->snd_max - tp->snd_una) == 0))) { 5988 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 5989 hpts_removed = 1; 5990 /* If we were not delayed cancel out the flag. */ 5991 if ((tp->snd_max - tp->snd_una) == 0) 5992 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 5993 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry); 5994 } 5995 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 5996 rack->rc_tmr_stopped = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 5997 if (rack->rc_inp->inp_in_hpts && 5998 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)) { 5999 /* 6000 * Canceling timer's when we have no output being 6001 * paced. We also must remove ourselves from the 6002 * hpts. 
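 * rc_tmr_stopped (set just above) remembers which timer was
 * pending so rack_start_hpts_timer() can account for the time
 * that was left on it when a timer is restarted.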
6003 */ 6004 tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); 6005 hpts_removed = 1; 6006 } 6007 rack->r_ctl.rc_hpts_flags &= ~(PACE_TMR_MASK); 6008 } 6009 if (hpts_removed == 0) 6010 rack_log_to_cancel(rack, hpts_removed, line, us_cts, &tv, flags_on_entry); 6011 } 6012 6013 static void 6014 rack_timer_stop(struct tcpcb *tp, uint32_t timer_type) 6015 { 6016 return; 6017 } 6018 6019 static int 6020 rack_stopall(struct tcpcb *tp) 6021 { 6022 struct tcp_rack *rack; 6023 rack = (struct tcp_rack *)tp->t_fb_ptr; 6024 rack->t_timers_stopped = 1; 6025 return (0); 6026 } 6027 6028 static void 6029 rack_timer_activate(struct tcpcb *tp, uint32_t timer_type, uint32_t delta) 6030 { 6031 return; 6032 } 6033 6034 static int 6035 rack_timer_active(struct tcpcb *tp, uint32_t timer_type) 6036 { 6037 return (0); 6038 } 6039 6040 static void 6041 rack_stop_all_timers(struct tcpcb *tp) 6042 { 6043 struct tcp_rack *rack; 6044 6045 /* 6046 * Assure no timers are running. 6047 */ 6048 if (tcp_timer_active(tp, TT_PERSIST)) { 6049 /* We enter in persists, set the flag appropriately */ 6050 rack = (struct tcp_rack *)tp->t_fb_ptr; 6051 rack->rc_in_persist = 1; 6052 } 6053 tcp_timer_suspend(tp, TT_PERSIST); 6054 tcp_timer_suspend(tp, TT_REXMT); 6055 tcp_timer_suspend(tp, TT_KEEP); 6056 tcp_timer_suspend(tp, TT_DELACK); 6057 } 6058 6059 static void 6060 rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, 6061 struct rack_sendmap *rsm, uint32_t ts) 6062 { 6063 int32_t idx; 6064 6065 rsm->r_rtr_cnt++; 6066 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 6067 rsm->r_dupack = 0; 6068 if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) { 6069 rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS; 6070 rsm->r_flags |= RACK_OVERMAX; 6071 } 6072 if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & RACK_TLP) == 0)) { 6073 rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start); 6074 rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start); 6075 } 6076 idx = rsm->r_rtr_cnt - 1; 6077 rsm->r_tim_lastsent[idx] = ts; 6078 if (rsm->r_flags & RACK_ACKED) { 6079 /* Problably MTU discovery messing with us */ 6080 rsm->r_flags &= ~RACK_ACKED; 6081 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 6082 } 6083 if (rsm->r_in_tmap) { 6084 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 6085 rsm->r_in_tmap = 0; 6086 } 6087 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 6088 rsm->r_in_tmap = 1; 6089 if (rsm->r_flags & RACK_SACK_PASSED) { 6090 /* We have retransmitted due to the SACK pass */ 6091 rsm->r_flags &= ~RACK_SACK_PASSED; 6092 rsm->r_flags |= RACK_WAS_SACKPASS; 6093 } 6094 } 6095 6096 6097 static uint32_t 6098 rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, 6099 struct rack_sendmap *rsm, uint32_t ts, int32_t *lenp) 6100 { 6101 /* 6102 * We (re-)transmitted starting at rsm->r_start for some length 6103 * (possibly less than r_end. 6104 */ 6105 struct rack_sendmap *nrsm, *insret; 6106 uint32_t c_end; 6107 int32_t len; 6108 6109 len = *lenp; 6110 c_end = rsm->r_start + len; 6111 if (SEQ_GEQ(c_end, rsm->r_end)) { 6112 /* 6113 * We retransmitted the whole piece or more than the whole 6114 * slopping into the next rsm. 6115 */ 6116 rack_update_rsm(tp, rack, rsm, ts); 6117 if (c_end == rsm->r_end) { 6118 *lenp = 0; 6119 return (0); 6120 } else { 6121 int32_t act_len; 6122 6123 /* Hangs over the end return whats left */ 6124 act_len = rsm->r_end - rsm->r_start; 6125 *lenp = (len - act_len); 6126 return (rsm->r_end); 6127 } 6128 /* We don't get out of this block. 
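 * (Both arms of the if above return, so control never falls
 * through to the split handling below.)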
*/ 6129 } 6130 /* 6131 * Here we retransmitted less than the whole thing which means we 6132 * have to split this into what was transmitted and what was not. 6133 */ 6134 nrsm = rack_alloc_full_limit(rack); 6135 if (nrsm == NULL) { 6136 /* 6137 * We can't get memory, so lets not proceed. 6138 */ 6139 *lenp = 0; 6140 return (0); 6141 } 6142 /* 6143 * So here we are going to take the original rsm and make it what we 6144 * retransmitted. nrsm will be the tail portion we did not 6145 * retransmit. For example say the chunk was 1, 11 (10 bytes). And 6146 * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to 6147 * 1, 6 and the new piece will be 6, 11. 6148 */ 6149 rack_clone_rsm(rack, nrsm, rsm, c_end); 6150 nrsm->r_dupack = 0; 6151 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); 6152 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 6153 #ifdef INVARIANTS 6154 if (insret != NULL) { 6155 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 6156 nrsm, insret, rack, rsm); 6157 } 6158 #endif 6159 if (rsm->r_in_tmap) { 6160 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 6161 nrsm->r_in_tmap = 1; 6162 } 6163 rsm->r_flags &= (~RACK_HAS_FIN); 6164 rack_update_rsm(tp, rack, rsm, ts); 6165 *lenp = 0; 6166 return (0); 6167 } 6168 6169 6170 static void 6171 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, 6172 uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, 6173 uint8_t pass, struct rack_sendmap *hintrsm, uint32_t us_cts) 6174 { 6175 struct tcp_rack *rack; 6176 struct rack_sendmap *rsm, *nrsm, *insret, fe; 6177 register uint32_t snd_max, snd_una; 6178 6179 /* 6180 * Add to the RACK log of packets in flight or retransmitted. If 6181 * there is a TS option we will use the TS echoed, if not we will 6182 * grab a TS. 6183 * 6184 * Retransmissions will increment the count and move the ts to its 6185 * proper place. Note that if options do not include TS's then we 6186 * won't be able to effectively use the ACK for an RTT on a retran. 6187 * 6188 * Notes about r_start and r_end. Lets consider a send starting at 6189 * sequence 1 for 10 bytes. In such an example the r_start would be 6190 * 1 (starting sequence) but the r_end would be r_start+len i.e. 11. 6191 * This means that r_end is actually the first sequence for the next 6192 * slot (11). 6193 * 6194 */ 6195 /* 6196 * If err is set what do we do XXXrrs? should we not add the thing? 6197 * -- i.e. return if err != 0 or should we pretend we sent it? -- 6198 * i.e. proceed with add ** do this for now. 6199 */ 6200 INP_WLOCK_ASSERT(tp->t_inpcb); 6201 if (err) 6202 /* 6203 * We don't log errors -- we could but snd_max does not 6204 * advance in this case either. 6205 */ 6206 return; 6207 6208 if (th_flags & TH_RST) { 6209 /* 6210 * We don't log resets and we return immediately from 6211 * sending 6212 */ 6213 return; 6214 } 6215 rack = (struct tcp_rack *)tp->t_fb_ptr; 6216 snd_una = tp->snd_una; 6217 if (SEQ_LEQ((seq_out + len), snd_una)) { 6218 /* Are sending an old segment to induce an ack (keep-alive)? */ 6219 return; 6220 } 6221 if (SEQ_LT(seq_out, snd_una)) { 6222 /* huh? should we panic? */ 6223 uint32_t end; 6224 6225 end = seq_out + len; 6226 seq_out = snd_una; 6227 if (SEQ_GEQ(end, seq_out)) 6228 len = end - seq_out; 6229 else 6230 len = 0; 6231 } 6232 snd_max = tp->snd_max; 6233 if (th_flags & (TH_SYN | TH_FIN)) { 6234 /* 6235 * The call to rack_log_output is made before bumping 6236 * snd_max. 
This means we can record one extra byte on a SYN 6237 * or FIN if seq_out is adding more on and a FIN is present 6238 * (and we are not resending). 6239 */ 6240 if ((th_flags & TH_SYN) && (seq_out == tp->iss)) 6241 len++; 6242 if (th_flags & TH_FIN) 6243 len++; 6244 if (SEQ_LT(snd_max, tp->snd_nxt)) { 6245 /* 6246 * The add/update as not been done for the FIN/SYN 6247 * yet. 6248 */ 6249 snd_max = tp->snd_nxt; 6250 } 6251 } 6252 if (len == 0) { 6253 /* We don't log zero window probes */ 6254 return; 6255 } 6256 rack->r_ctl.rc_time_last_sent = ts; 6257 if (IN_RECOVERY(tp->t_flags)) { 6258 rack->r_ctl.rc_prr_out += len; 6259 } 6260 /* First question is it a retransmission or new? */ 6261 if (seq_out == snd_max) { 6262 /* Its new */ 6263 again: 6264 rsm = rack_alloc(rack); 6265 if (rsm == NULL) { 6266 /* 6267 * Hmm out of memory and the tcb got destroyed while 6268 * we tried to wait. 6269 */ 6270 return; 6271 } 6272 if (th_flags & TH_FIN) { 6273 rsm->r_flags = RACK_HAS_FIN; 6274 } else { 6275 rsm->r_flags = 0; 6276 } 6277 rsm->r_tim_lastsent[0] = ts; 6278 rsm->r_rtr_cnt = 1; 6279 rsm->r_rtr_bytes = 0; 6280 rsm->usec_orig_send = us_cts; 6281 if (th_flags & TH_SYN) { 6282 /* The data space is one beyond snd_una */ 6283 rsm->r_flags |= RACK_HAS_SIN; 6284 rsm->r_start = seq_out + 1; 6285 rsm->r_end = rsm->r_start + (len - 1); 6286 } else { 6287 /* Normal case */ 6288 rsm->r_start = seq_out; 6289 rsm->r_end = rsm->r_start + len; 6290 } 6291 rsm->r_dupack = 0; 6292 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 6293 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 6294 #ifdef INVARIANTS 6295 if (insret != NULL) { 6296 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 6297 nrsm, insret, rack, rsm); 6298 } 6299 #endif 6300 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 6301 rsm->r_in_tmap = 1; 6302 /* 6303 * Special case detection, is there just a single 6304 * packet outstanding when we are not in recovery? 6305 * 6306 * If this is true mark it so. 6307 */ 6308 if ((IN_RECOVERY(tp->t_flags) == 0) && 6309 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) == ctf_fixed_maxseg(tp))) { 6310 struct rack_sendmap *prsm; 6311 6312 prsm = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 6313 if (prsm) 6314 prsm->r_one_out_nr = 1; 6315 } 6316 return; 6317 } 6318 /* 6319 * If we reach here its a retransmission and we need to find it. 6320 */ 6321 memset(&fe, 0, sizeof(fe)); 6322 more: 6323 if (hintrsm && (hintrsm->r_start == seq_out)) { 6324 rsm = hintrsm; 6325 hintrsm = NULL; 6326 } else { 6327 /* No hints sorry */ 6328 rsm = NULL; 6329 } 6330 if ((rsm) && (rsm->r_start == seq_out)) { 6331 seq_out = rack_update_entry(tp, rack, rsm, ts, &len); 6332 if (len == 0) { 6333 return; 6334 } else { 6335 goto more; 6336 } 6337 } 6338 /* Ok it was not the last pointer go through it the hard way. 
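 * The stack-local 'fe' serves only as a lookup key: its
 * r_start is set to the sequence being (re)sent and RB_FIND
 * locates the map entry covering it, looping until the whole
 * length has been accounted for.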
*/ 6339 refind: 6340 fe.r_start = seq_out; 6341 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 6342 if (rsm) { 6343 if (rsm->r_start == seq_out) { 6344 seq_out = rack_update_entry(tp, rack, rsm, ts, &len); 6345 if (len == 0) { 6346 return; 6347 } else { 6348 goto refind; 6349 } 6350 } 6351 if (SEQ_GEQ(seq_out, rsm->r_start) && SEQ_LT(seq_out, rsm->r_end)) { 6352 /* Transmitted within this piece */ 6353 /* 6354 * Ok we must split off the front and then let the 6355 * update do the rest 6356 */ 6357 nrsm = rack_alloc_full_limit(rack); 6358 if (nrsm == NULL) { 6359 rack_update_rsm(tp, rack, rsm, ts); 6360 return; 6361 } 6362 /* 6363 * copy rsm to nrsm and then trim the front of rsm 6364 * to not include this part. 6365 */ 6366 rack_clone_rsm(rack, nrsm, rsm, seq_out); 6367 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 6368 #ifdef INVARIANTS 6369 if (insret != NULL) { 6370 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 6371 nrsm, insret, rack, rsm); 6372 } 6373 #endif 6374 if (rsm->r_in_tmap) { 6375 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 6376 nrsm->r_in_tmap = 1; 6377 } 6378 rsm->r_flags &= (~RACK_HAS_FIN); 6379 seq_out = rack_update_entry(tp, rack, nrsm, ts, &len); 6380 if (len == 0) { 6381 return; 6382 } else if (len > 0) 6383 goto refind; 6384 } 6385 } 6386 /* 6387 * Hmm not found in map did they retransmit both old and on into the 6388 * new? 6389 */ 6390 if (seq_out == tp->snd_max) { 6391 goto again; 6392 } else if (SEQ_LT(seq_out, tp->snd_max)) { 6393 #ifdef INVARIANTS 6394 printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", 6395 seq_out, len, tp->snd_una, tp->snd_max); 6396 printf("Starting Dump of all rack entries\n"); 6397 RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 6398 printf("rsm:%p start:%u end:%u\n", 6399 rsm, rsm->r_start, rsm->r_end); 6400 } 6401 printf("Dump complete\n"); 6402 panic("seq_out not found rack:%p tp:%p", 6403 rack, tp); 6404 #endif 6405 } else { 6406 #ifdef INVARIANTS 6407 /* 6408 * Hmm beyond sndmax? (only if we are using the new rtt-pack 6409 * flag) 6410 */ 6411 panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", 6412 seq_out, len, tp->snd_max, tp); 6413 #endif 6414 } 6415 } 6416 6417 /* 6418 * Record one of the RTT updates from an ack into 6419 * our sample structure. 6420 */ 6421 6422 static void 6423 tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, uint32_t len, uint32_t us_rtt, 6424 int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt) 6425 { 6426 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 6427 (rack->r_ctl.rack_rs.rs_rtt_lowest > rtt)) { 6428 rack->r_ctl.rack_rs.rs_rtt_lowest = rtt; 6429 } 6430 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 6431 (rack->r_ctl.rack_rs.rs_rtt_highest < rtt)) { 6432 rack->r_ctl.rack_rs.rs_rtt_highest = rtt; 6433 } 6434 if (rack->rc_tp->t_flags & TF_GPUTINPROG) { 6435 if (us_rtt < rack->r_ctl.rc_gp_lowrtt) 6436 rack->r_ctl.rc_gp_lowrtt = us_rtt; 6437 if (rack->rc_tp->snd_wnd > rack->r_ctl.rc_gp_high_rwnd) 6438 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 6439 } 6440 if ((confidence == 1) && 6441 ((rsm == NULL) || 6442 (rsm->r_just_ret) || 6443 (rsm->r_one_out_nr && 6444 len < (ctf_fixed_maxseg(rack->rc_tp) * 2)))) { 6445 /* 6446 * If the rsm had a just return 6447 * hit it then we can't trust the 6448 * rtt measurement for buffer deterimination 6449 * Note that a confidence of 2, indicates 6450 * SACK'd which overrides the r_just_ret or 6451 * the r_one_out_nr. 
If it was a CUM-ACK and 6452 * we had only two outstanding, but get an 6453 * ack for only 1. Then that also lowers our 6454 * confidence. 6455 */ 6456 confidence = 0; 6457 } 6458 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) || 6459 (rack->r_ctl.rack_rs.rs_us_rtt > us_rtt)) { 6460 if (rack->r_ctl.rack_rs.confidence == 0) { 6461 /* 6462 * We take anything with no current confidence 6463 * saved. 6464 */ 6465 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; 6466 rack->r_ctl.rack_rs.confidence = confidence; 6467 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; 6468 } else if (confidence || rack->r_ctl.rack_rs.confidence) { 6469 /* 6470 * Once we have a confident number, 6471 * we can update it with a smaller 6472 * value since this confident number 6473 * may include the DSACK time until 6474 * the next segment (the second one) arrived. 6475 */ 6476 rack->r_ctl.rack_rs.rs_us_rtt = us_rtt; 6477 rack->r_ctl.rack_rs.confidence = confidence; 6478 rack->r_ctl.rack_rs.rs_us_rtrcnt = rtrcnt; 6479 } 6480 6481 } 6482 rack_log_rtt_upd(rack->rc_tp, rack, us_rtt, len, rsm, confidence); 6483 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_VALID; 6484 rack->r_ctl.rack_rs.rs_rtt_tot += rtt; 6485 rack->r_ctl.rack_rs.rs_rtt_cnt++; 6486 } 6487 6488 /* 6489 * Collect new round-trip time estimate 6490 * and update averages and current timeout. 6491 */ 6492 static void 6493 tcp_rack_xmit_timer_commit(struct tcp_rack *rack, struct tcpcb *tp) 6494 { 6495 int32_t delta; 6496 uint32_t o_srtt, o_var; 6497 int32_t hrtt_up = 0; 6498 int32_t rtt; 6499 6500 if (rack->r_ctl.rack_rs.rs_flags & RACK_RTT_EMPTY) 6501 /* No valid sample */ 6502 return; 6503 if (rack->r_ctl.rc_rate_sample_method == USE_RTT_LOW) { 6504 /* We are to use the lowest RTT seen in a single ack */ 6505 rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; 6506 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_HIGH) { 6507 /* We are to use the highest RTT seen in a single ack */ 6508 rtt = rack->r_ctl.rack_rs.rs_rtt_highest; 6509 } else if (rack->r_ctl.rc_rate_sample_method == USE_RTT_AVG) { 6510 /* We are to use the average RTT seen in a single ack */ 6511 rtt = (int32_t)(rack->r_ctl.rack_rs.rs_rtt_tot / 6512 (uint64_t)rack->r_ctl.rack_rs.rs_rtt_cnt); 6513 } else { 6514 #ifdef INVARIANTS 6515 panic("Unknown rtt variant %d", rack->r_ctl.rc_rate_sample_method); 6516 #endif 6517 return; 6518 } 6519 if (rtt == 0) 6520 rtt = 1; 6521 if (rack->rc_gp_rtt_set == 0) { 6522 /* 6523 * With no RTT we have to accept 6524 * even one we are not confident of. 6525 */ 6526 rack->r_ctl.rc_gp_srtt = rack->r_ctl.rack_rs.rs_us_rtt; 6527 rack->rc_gp_rtt_set = 1; 6528 } else if (rack->r_ctl.rack_rs.confidence) { 6529 /* update the running gp srtt */ 6530 rack->r_ctl.rc_gp_srtt -= (rack->r_ctl.rc_gp_srtt/8); 6531 rack->r_ctl.rc_gp_srtt += rack->r_ctl.rack_rs.rs_us_rtt / 8; 6532 } 6533 if (rack->r_ctl.rack_rs.confidence) { 6534 /* 6535 * record the low and high for highly buffered path computation, 6536 * we only do this if we are confident (not a retransmission). 6537 */ 6538 if (rack->r_ctl.rc_highest_us_rtt < rack->r_ctl.rack_rs.rs_us_rtt) { 6539 rack->r_ctl.rc_highest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 6540 hrtt_up = 1; 6541 } 6542 if (rack->rc_highly_buffered == 0) { 6543 /* 6544 * Currently once we declare a path has 6545 * highly buffered there is no going 6546 * back, which may be a problem... 
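 * A path is flagged highly buffered when the ratio of the
 * highest to the lowest confident us-rtt seen so far exceeds
 * rack_hbp_thresh, as checked below.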
6547 */ 6548 if ((rack->r_ctl.rc_highest_us_rtt / rack->r_ctl.rc_lowest_us_rtt) > rack_hbp_thresh) { 6549 rack_log_rtt_shrinks(rack, rack->r_ctl.rack_rs.rs_us_rtt, 6550 rack->r_ctl.rc_highest_us_rtt, 6551 rack->r_ctl.rc_lowest_us_rtt, 6552 RACK_RTTS_SEEHBP); 6553 rack->rc_highly_buffered = 1; 6554 } 6555 } 6556 } 6557 if ((rack->r_ctl.rack_rs.confidence) || 6558 (rack->r_ctl.rack_rs.rs_us_rtrcnt == 1)) { 6559 /* 6560 * If we are highly confident of it <or> it was 6561 * never retransmitted we accept it as the last us_rtt. 6562 */ 6563 rack->r_ctl.rc_last_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 6564 /* The lowest rtt can be set if its was not retransmited */ 6565 if (rack->r_ctl.rc_lowest_us_rtt > rack->r_ctl.rack_rs.rs_us_rtt) { 6566 rack->r_ctl.rc_lowest_us_rtt = rack->r_ctl.rack_rs.rs_us_rtt; 6567 if (rack->r_ctl.rc_lowest_us_rtt == 0) 6568 rack->r_ctl.rc_lowest_us_rtt = 1; 6569 } 6570 } 6571 rack_log_rtt_sample(rack, rtt); 6572 o_srtt = tp->t_srtt; 6573 o_var = tp->t_rttvar; 6574 rack = (struct tcp_rack *)tp->t_fb_ptr; 6575 if (tp->t_srtt != 0) { 6576 /* 6577 * srtt is stored as fixed point with 5 bits after the 6578 * binary point (i.e., scaled by 8). The following magic is 6579 * equivalent to the smoothing algorithm in rfc793 with an 6580 * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point). 6581 * Adjust rtt to origin 0. 6582 */ 6583 delta = ((rtt - 1) << TCP_DELTA_SHIFT) 6584 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); 6585 6586 tp->t_srtt += delta; 6587 if (tp->t_srtt <= 0) 6588 tp->t_srtt = 1; 6589 6590 /* 6591 * We accumulate a smoothed rtt variance (actually, a 6592 * smoothed mean difference), then set the retransmit timer 6593 * to smoothed rtt + 4 times the smoothed variance. rttvar 6594 * is stored as fixed point with 4 bits after the binary 6595 * point (scaled by 16). The following is equivalent to 6596 * rfc793 smoothing with an alpha of .75 (rttvar = 6597 * rttvar*3/4 + |delta| / 4). This replaces rfc793's 6598 * wired-in beta. 6599 */ 6600 if (delta < 0) 6601 delta = -delta; 6602 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); 6603 tp->t_rttvar += delta; 6604 if (tp->t_rttvar <= 0) 6605 tp->t_rttvar = 1; 6606 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) 6607 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 6608 } else { 6609 /* 6610 * No rtt measurement yet - use the unsmoothed rtt. Set the 6611 * variance to half the rtt (so our first retransmit happens 6612 * at 3*rtt). 6613 */ 6614 tp->t_srtt = rtt << TCP_RTT_SHIFT; 6615 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); 6616 tp->t_rttbest = tp->t_srtt + tp->t_rttvar; 6617 } 6618 KMOD_TCPSTAT_INC(tcps_rttupdated); 6619 tp->t_rttupdated++; 6620 #ifdef STATS 6621 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt)); 6622 #endif 6623 tp->t_rxtshift = 0; 6624 6625 /* 6626 * the retransmit should happen at rtt + 4 * rttvar. Because of the 6627 * way we do the smoothing, srtt and rttvar will each average +1/2 6628 * tick of bias. When we compute the retransmit timer, we want 1/2 6629 * tick of rounding and 1 extra tick because of +-1/2 tick 6630 * uncertainty in the firing of the timer. The bias will give us 6631 * exactly the 1.5 tick we need. But, because the bias is 6632 * statistical, we have to test that we don't drop below the minimum 6633 * feasible timer (which is 2 ticks). 
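 * The TCPT_RANGESET() below therefore floors the RTO at
 * max(rack_rto_min, rtt + 2 ticks) and caps it at
 * rack_rto_max, so the timer never undershoots the measured
 * rtt plus the two-tick uncertainty margin described above.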
6634 */ 6635 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), 6636 max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max)); 6637 tp->t_softerror = 0; 6638 } 6639 6640 static void 6641 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, 6642 uint32_t t, uint32_t cts) 6643 { 6644 /* 6645 * For this RSM, we acknowledged the data from a previous 6646 * transmission, not the last one we made. This means we did a false 6647 * retransmit. 6648 */ 6649 struct tcp_rack *rack; 6650 6651 if (rsm->r_flags & RACK_HAS_FIN) { 6652 /* 6653 * The sending of the FIN often is multiple sent when we 6654 * have everything outstanding ack'd. We ignore this case 6655 * since its over now. 6656 */ 6657 return; 6658 } 6659 if (rsm->r_flags & RACK_TLP) { 6660 /* 6661 * We expect TLP's to have this occur. 6662 */ 6663 return; 6664 } 6665 rack = (struct tcp_rack *)tp->t_fb_ptr; 6666 /* should we undo cc changes and exit recovery? */ 6667 if (IN_RECOVERY(tp->t_flags)) { 6668 if (rack->r_ctl.rc_rsm_start == rsm->r_start) { 6669 /* 6670 * Undo what we ratched down and exit recovery if 6671 * possible 6672 */ 6673 EXIT_RECOVERY(tp->t_flags); 6674 tp->snd_recover = tp->snd_una; 6675 if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd) 6676 tp->snd_cwnd = rack->r_ctl.rc_cwnd_at; 6677 if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh) 6678 tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at; 6679 } 6680 } 6681 if (rsm->r_flags & RACK_WAS_SACKPASS) { 6682 /* 6683 * We retransmitted based on a sack and the earlier 6684 * retransmission ack'd it - re-ordering is occuring. 6685 */ 6686 counter_u64_add(rack_reorder_seen, 1); 6687 rack->r_ctl.rc_reorder_ts = cts; 6688 } 6689 counter_u64_add(rack_badfr, 1); 6690 counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start)); 6691 } 6692 6693 static void 6694 rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts) 6695 { 6696 /* 6697 * Apply to filter the inbound us-rtt at us_cts. 6698 */ 6699 uint32_t old_rtt; 6700 6701 old_rtt = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt); 6702 apply_filter_min_small(&rack->r_ctl.rc_gp_min_rtt, 6703 us_rtt, us_cts); 6704 if (rack->r_ctl.last_pacing_time && 6705 rack->rc_gp_dyn_mul && 6706 (rack->r_ctl.last_pacing_time > us_rtt)) 6707 rack->pacing_longer_than_rtt = 1; 6708 else 6709 rack->pacing_longer_than_rtt = 0; 6710 if (old_rtt > us_rtt) { 6711 /* We just hit a new lower rtt time */ 6712 rack_log_rtt_shrinks(rack, us_cts, old_rtt, 6713 __LINE__, RACK_RTTS_NEWRTT); 6714 /* 6715 * Only count it if its lower than what we saw within our 6716 * calculated range. 6717 */ 6718 if ((old_rtt - us_rtt) > rack_min_rtt_movement) { 6719 if (rack_probertt_lower_within && 6720 rack->rc_gp_dyn_mul && 6721 (rack->use_fixed_rate == 0) && 6722 (rack->rc_always_pace)) { 6723 /* 6724 * We are seeing a new lower rtt very close 6725 * to the time that we would have entered probe-rtt. 6726 * This is probably due to the fact that a peer flow 6727 * has entered probe-rtt. Lets go in now too. 
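 *
 * An illustrative note, not part of the original comment: the code
 * just below only allows this shortcut near the end of the normal
 * probe-rtt interval. It takes val as rack_probertt_lower_within
 * percent of rack_time_between_probertt and enters early only when
 * the time since the last lowered-rtt mark has already reached
 * rack_time_between_probertt minus val. Assuming, say, a 30 second
 * interval and a lower-within setting of 10, early entry is only
 * possible during the final 3 seconds of that interval.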
6728 */ 6729 uint32_t val; 6730 6731 val = rack_probertt_lower_within * rack_time_between_probertt; 6732 val /= 100; 6733 if ((rack->in_probe_rtt == 0) && 6734 ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) { 6735 rack_enter_probertt(rack, us_cts); 6736 } 6737 } 6738 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 6739 } 6740 } 6741 } 6742 6743 static int 6744 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, 6745 struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack) 6746 { 6747 int32_t i; 6748 uint32_t t, len_acked; 6749 6750 if ((rsm->r_flags & RACK_ACKED) || 6751 (rsm->r_flags & RACK_WAS_ACKED)) 6752 /* Already done */ 6753 return (0); 6754 6755 if (ack_type == CUM_ACKED) { 6756 if (SEQ_GT(th_ack, rsm->r_end)) 6757 len_acked = rsm->r_end - rsm->r_start; 6758 else 6759 len_acked = th_ack - rsm->r_start; 6760 } else 6761 len_acked = rsm->r_end - rsm->r_start; 6762 if (rsm->r_rtr_cnt == 1) { 6763 uint32_t us_rtt; 6764 6765 t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 6766 if ((int)t <= 0) 6767 t = 1; 6768 if (!tp->t_rttlow || tp->t_rttlow > t) 6769 tp->t_rttlow = t; 6770 if (!rack->r_ctl.rc_rack_min_rtt || 6771 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 6772 rack->r_ctl.rc_rack_min_rtt = t; 6773 if (rack->r_ctl.rc_rack_min_rtt == 0) { 6774 rack->r_ctl.rc_rack_min_rtt = 1; 6775 } 6776 } 6777 us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - rsm->usec_orig_send; 6778 if (us_rtt == 0) 6779 us_rtt = 1; 6780 rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time)); 6781 if (ack_type == SACKED) 6782 tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 2 , rsm, rsm->r_rtr_cnt); 6783 else { 6784 /* 6785 * For cum-ack we are only confident if what 6786 * is being acked is included in a measurement. 6787 * Otherwise it could be an idle period that 6788 * includes Delayed-ack time. 6789 */ 6790 tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 6791 (rack->app_limited_needs_set ? 0 : 1), rsm, rsm->r_rtr_cnt); 6792 } 6793 if ((rsm->r_flags & RACK_TLP) && 6794 (!IN_RECOVERY(tp->t_flags))) { 6795 /* Segment was a TLP and our retrans matched */ 6796 if (rack->r_ctl.rc_tlp_cwnd_reduce) { 6797 rack->r_ctl.rc_rsm_start = tp->snd_max; 6798 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 6799 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 6800 rack_cong_signal(tp, NULL, CC_NDUPACK); 6801 /* 6802 * When we enter recovery we need to assure 6803 * we send one packet. 6804 */ 6805 if (rack->rack_no_prr == 0) { 6806 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 6807 rack_log_to_prr(rack, 7, 0); 6808 } 6809 } 6810 } 6811 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 6812 /* New more recent rack_tmit_time */ 6813 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 6814 rack->rc_rack_rtt = t; 6815 } 6816 return (1); 6817 } 6818 /* 6819 * We clear the soft/rxtshift since we got an ack. 6820 * There is no assurance we will call the commit() function 6821 * so we need to clear these to avoid incorrect handling. 6822 */ 6823 tp->t_rxtshift = 0; 6824 tp->t_softerror = 0; 6825 if ((to->to_flags & TOF_TS) && 6826 (ack_type == CUM_ACKED) && 6827 (to->to_tsecr) && 6828 ((rsm->r_flags & RACK_OVERMAX) == 0)) { 6829 /* 6830 * Now which timestamp does it match? In this block the ACK 6831 * must be coming from a previous transmission. 
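 *
 * An aside, not part of the original comment: the loop just below
 * compares the echoed timestamp (to->to_tsecr) against each recorded
 * send time in r_tim_lastsent[]. A match on an index earlier than
 * the final transmission means the peer is acknowledging an older
 * copy of this data, so rack_earlier_retran() is called to note the
 * likely spurious retransmission before the RTT is taken from that
 * matching transmission time.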
6832 */ 6833 for (i = 0; i < rsm->r_rtr_cnt; i++) { 6834 if (rsm->r_tim_lastsent[i] == to->to_tsecr) { 6835 t = cts - rsm->r_tim_lastsent[i]; 6836 if ((int)t <= 0) 6837 t = 1; 6838 if ((i + 1) < rsm->r_rtr_cnt) { 6839 /* Likely */ 6840 rack_earlier_retran(tp, rsm, t, cts); 6841 } 6842 if (!tp->t_rttlow || tp->t_rttlow > t) 6843 tp->t_rttlow = t; 6844 if (!rack->r_ctl.rc_rack_min_rtt || SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 6845 rack->r_ctl.rc_rack_min_rtt = t; 6846 if (rack->r_ctl.rc_rack_min_rtt == 0) { 6847 rack->r_ctl.rc_rack_min_rtt = 1; 6848 } 6849 } 6850 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, 6851 rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { 6852 /* New more recent rack_tmit_time */ 6853 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; 6854 rack->rc_rack_rtt = t; 6855 } 6856 tcp_rack_xmit_timer(rack, t + 1, len_acked, (t * HPTS_USEC_IN_MSEC), 0, rsm, 6857 rsm->r_rtr_cnt); 6858 return (1); 6859 } 6860 } 6861 goto ts_not_found; 6862 } else { 6863 /* 6864 * Ok its a SACK block that we retransmitted. or a windows 6865 * machine without timestamps. We can tell nothing from the 6866 * time-stamp since its not there or the time the peer last 6867 * recieved a segment that moved forward its cum-ack point. 6868 */ 6869 ts_not_found: 6870 i = rsm->r_rtr_cnt - 1; 6871 t = cts - rsm->r_tim_lastsent[i]; 6872 if ((int)t <= 0) 6873 t = 1; 6874 if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 6875 /* 6876 * We retransmitted and the ack came back in less 6877 * than the smallest rtt we have observed. We most 6878 * likey did an improper retransmit as outlined in 6879 * 4.2 Step 3 point 2 in the rack-draft. 6880 */ 6881 i = rsm->r_rtr_cnt - 2; 6882 t = cts - rsm->r_tim_lastsent[i]; 6883 rack_earlier_retran(tp, rsm, t, cts); 6884 } else if (rack->r_ctl.rc_rack_min_rtt) { 6885 /* 6886 * We retransmitted it and the retransmit did the 6887 * job. 6888 */ 6889 if (!rack->r_ctl.rc_rack_min_rtt || 6890 SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { 6891 rack->r_ctl.rc_rack_min_rtt = t; 6892 if (rack->r_ctl.rc_rack_min_rtt == 0) { 6893 rack->r_ctl.rc_rack_min_rtt = 1; 6894 } 6895 } 6896 if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) { 6897 /* New more recent rack_tmit_time */ 6898 rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i]; 6899 rack->rc_rack_rtt = t; 6900 } 6901 return (1); 6902 } 6903 } 6904 return (0); 6905 } 6906 6907 /* 6908 * Mark the SACK_PASSED flag on all entries prior to rsm send wise. 6909 */ 6910 static void 6911 rack_log_sack_passed(struct tcpcb *tp, 6912 struct tcp_rack *rack, struct rack_sendmap *rsm) 6913 { 6914 struct rack_sendmap *nrsm; 6915 6916 nrsm = rsm; 6917 TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap, 6918 rack_head, r_tnext) { 6919 if (nrsm == rsm) { 6920 /* Skip orginal segment he is acked */ 6921 continue; 6922 } 6923 if (nrsm->r_flags & RACK_ACKED) { 6924 /* 6925 * Skip ack'd segments, though we 6926 * should not see these, since tmap 6927 * should not have ack'd segments. 6928 */ 6929 continue; 6930 } 6931 if (nrsm->r_flags & RACK_SACK_PASSED) { 6932 /* 6933 * We found one that is already marked 6934 * passed, we have been here before and 6935 * so all others below this are marked. 
6936 */ 6937 break; 6938 } 6939 nrsm->r_flags |= RACK_SACK_PASSED; 6940 nrsm->r_flags &= ~RACK_WAS_SACKPASS; 6941 } 6942 } 6943 6944 static void 6945 rack_need_set_test(struct tcpcb *tp, 6946 struct tcp_rack *rack, 6947 struct rack_sendmap *rsm, 6948 tcp_seq th_ack, 6949 int line, 6950 int use_which) 6951 { 6952 6953 if ((tp->t_flags & TF_GPUTINPROG) && 6954 SEQ_GEQ(rsm->r_end, tp->gput_seq)) { 6955 /* 6956 * We were app limited, and this ack 6957 * butts up or goes beyond the point where we want 6958 * to start our next measurement. We need 6959 * to record the new gput_ts as here and 6960 * possibly update the start sequence. 6961 */ 6962 uint32_t seq, ts; 6963 6964 if (rsm->r_rtr_cnt > 1) { 6965 /* 6966 * This is a retransmit, can we 6967 * really make any assessment at this 6968 * point? We are not really sure of 6969 * the timestamp, is it this or the 6970 * previous transmission? 6971 * 6972 * Lets wait for something better that 6973 * is not retransmitted. 6974 */ 6975 return; 6976 } 6977 seq = tp->gput_seq; 6978 ts = tp->gput_ts; 6979 rack->app_limited_needs_set = 0; 6980 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 6981 /* Do we start at a new end? */ 6982 if ((use_which == RACK_USE_BEG) && 6983 SEQ_GEQ(rsm->r_start, tp->gput_seq)) { 6984 /* 6985 * When we get an ACK that just eats 6986 * up some of the rsm, we set RACK_USE_BEG 6987 * since whats at r_start (i.e. th_ack) 6988 * is left unacked and thats where the 6989 * measurement not starts. 6990 */ 6991 tp->gput_seq = rsm->r_start; 6992 rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; 6993 } 6994 if ((use_which == RACK_USE_END) && 6995 SEQ_GEQ(rsm->r_end, tp->gput_seq)) { 6996 /* 6997 * We use the end when the cumack 6998 * is moving forward and completely 6999 * deleting the rsm passed so basically 7000 * r_end holds th_ack. 7001 * 7002 * For SACK's we also want to use the end 7003 * since this piece just got sacked and 7004 * we want to target anything after that 7005 * in our measurement. 7006 */ 7007 tp->gput_seq = rsm->r_end; 7008 rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; 7009 } 7010 if (use_which == RACK_USE_END_OR_THACK) { 7011 /* 7012 * special case for ack moving forward, 7013 * not a sack, we need to move all the 7014 * way up to where this ack cum-ack moves 7015 * to. 7016 */ 7017 if (SEQ_GT(th_ack, rsm->r_end)) 7018 tp->gput_seq = th_ack; 7019 else 7020 tp->gput_seq = rsm->r_end; 7021 rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; 7022 } 7023 if (SEQ_GT(tp->gput_seq, tp->gput_ack)) { 7024 /* 7025 * We moved beyond this guy's range, re-calculate 7026 * the new end point. 7027 */ 7028 if (rack->rc_gp_filled == 0) { 7029 tp->gput_ack = tp->gput_seq + max(rc_init_window(rack), (MIN_GP_WIN * ctf_fixed_maxseg(tp))); 7030 } else { 7031 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 7032 } 7033 } 7034 /* 7035 * We are moving the goal post, we may be able to clear the 7036 * measure_saw_probe_rtt flag. 7037 */ 7038 if ((rack->in_probe_rtt == 0) && 7039 (rack->measure_saw_probe_rtt) && 7040 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 7041 rack->measure_saw_probe_rtt = 0; 7042 rack_log_pacing_delay_calc(rack, ts, tp->gput_ts, 7043 seq, tp->gput_seq, 0, 5, line, NULL); 7044 if (rack->rc_gp_filled && 7045 ((tp->gput_ack - tp->gput_seq) < 7046 max(rc_init_window(rack), (MIN_GP_WIN * 7047 ctf_fixed_maxseg(tp))))) { 7048 /* 7049 * There is no sense of continuing this measurement 7050 * because its too small to gain us anything we 7051 * trust. 
Skip it and that way we can start a new 7052 * measurement quicker. 7053 */ 7054 rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq, 7055 0, 0, 0, 6, __LINE__, NULL); 7056 tp->t_flags &= ~TF_GPUTINPROG; 7057 } 7058 } 7059 } 7060 7061 static uint32_t 7062 rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, 7063 struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, int *moved_two) 7064 { 7065 uint32_t start, end, changed = 0; 7066 struct rack_sendmap stack_map; 7067 struct rack_sendmap *rsm, *nrsm, fe, *insret, *prev, *next; 7068 int32_t used_ref = 1; 7069 int moved = 0; 7070 7071 start = sack->start; 7072 end = sack->end; 7073 rsm = *prsm; 7074 memset(&fe, 0, sizeof(fe)); 7075 do_rest_ofb: 7076 if ((rsm == NULL) || 7077 (SEQ_LT(end, rsm->r_start)) || 7078 (SEQ_GEQ(start, rsm->r_end)) || 7079 (SEQ_LT(start, rsm->r_start))) { 7080 /* 7081 * We are not in the right spot, 7082 * find the correct spot in the tree. 7083 */ 7084 used_ref = 0; 7085 fe.r_start = start; 7086 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 7087 moved++; 7088 } 7089 if (rsm == NULL) { 7090 /* TSNH */ 7091 goto out; 7092 } 7093 /* Ok we have an ACK for some piece of this rsm */ 7094 if (rsm->r_start != start) { 7095 if ((rsm->r_flags & RACK_ACKED) == 0) { 7096 /** 7097 * Need to split this in two pieces the before and after, 7098 * the before remains in the map, the after must be 7099 * added. In other words we have: 7100 * rsm |--------------| 7101 * sackblk |-------> 7102 * rsm will become 7103 * rsm |---| 7104 * and nrsm will be the sacked piece 7105 * nrsm |----------| 7106 * 7107 * But before we start down that path lets 7108 * see if the sack spans over on top of 7109 * the next guy and it is already sacked. 7110 */ 7111 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7112 if (next && (next->r_flags & RACK_ACKED) && 7113 SEQ_GEQ(end, next->r_start)) { 7114 /** 7115 * So the next one is already acked, and 7116 * we can thus by hookery use our stack_map 7117 * to reflect the piece being sacked and 7118 * then adjust the two tree entries moving 7119 * the start and ends around. So we start like: 7120 * rsm |------------| (not-acked) 7121 * next |-----------| (acked) 7122 * sackblk |--------> 7123 * We want to end like so: 7124 * rsm |------| (not-acked) 7125 * next |-----------------| (acked) 7126 * nrsm |-----| 7127 * Where nrsm is a temporary stack piece we 7128 * use to update all the gizmos. 7129 */ 7130 /* Copy up our fudge block */ 7131 nrsm = &stack_map; 7132 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 7133 /* Now adjust our tree blocks */ 7134 rsm->r_end = start; 7135 next->r_start = start; 7136 /* Clear out the dup ack count of the remainder */ 7137 rsm->r_dupack = 0; 7138 rsm->r_just_ret = 0; 7139 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7140 /* Now lets make sure our fudge block is right */ 7141 nrsm->r_start = start; 7142 /* Now lets update all the stats and such */ 7143 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); 7144 if (rack->app_limited_needs_set) 7145 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); 7146 changed += (nrsm->r_end - nrsm->r_start); 7147 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 7148 if (nrsm->r_flags & RACK_SACK_PASSED) { 7149 counter_u64_add(rack_reorder_seen, 1); 7150 rack->r_ctl.rc_reorder_ts = cts; 7151 } 7152 /* 7153 * Now we want to go up from rsm (the 7154 * one left un-acked) to the next one 7155 * in the tmap. 
We do this so when 7156 * we walk backwards we include marking 7157 * sack-passed on rsm (The one passed in 7158 * is skipped since it is generally called 7159 * on something sacked before removing it 7160 * from the tmap). 7161 */ 7162 if (rsm->r_in_tmap) { 7163 nrsm = TAILQ_NEXT(rsm, r_tnext); 7164 /* 7165 * Now that we have the next 7166 * one walk backwards from there. 7167 */ 7168 if (nrsm && nrsm->r_in_tmap) 7169 rack_log_sack_passed(tp, rack, nrsm); 7170 } 7171 /* Now are we done? */ 7172 if (SEQ_LT(end, next->r_end) || 7173 (end == next->r_end)) { 7174 /* Done with block */ 7175 goto out; 7176 } 7177 counter_u64_add(rack_sack_used_next_merge, 1); 7178 /* Postion for the next block */ 7179 start = next->r_end; 7180 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, next); 7181 if (rsm == NULL) 7182 goto out; 7183 } else { 7184 /** 7185 * We can't use any hookery here, so we 7186 * need to split the map. We enter like 7187 * so: 7188 * rsm |--------| 7189 * sackblk |-----> 7190 * We will add the new block nrsm and 7191 * that will be the new portion, and then 7192 * fall through after reseting rsm. So we 7193 * split and look like this: 7194 * rsm |----| 7195 * sackblk |-----> 7196 * nrsm |---| 7197 * We then fall through reseting 7198 * rsm to nrsm, so the next block 7199 * picks it up. 7200 */ 7201 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 7202 if (nrsm == NULL) { 7203 /* 7204 * failed XXXrrs what can we do but loose the sack 7205 * info? 7206 */ 7207 goto out; 7208 } 7209 counter_u64_add(rack_sack_splits, 1); 7210 rack_clone_rsm(rack, nrsm, rsm, start); 7211 rsm->r_just_ret = 0; 7212 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 7213 #ifdef INVARIANTS 7214 if (insret != NULL) { 7215 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 7216 nrsm, insret, rack, rsm); 7217 } 7218 #endif 7219 if (rsm->r_in_tmap) { 7220 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 7221 nrsm->r_in_tmap = 1; 7222 } 7223 rsm->r_flags &= (~RACK_HAS_FIN); 7224 /* Position us to point to the new nrsm that starts the sack blk */ 7225 rsm = nrsm; 7226 } 7227 } else { 7228 /* Already sacked this piece */ 7229 counter_u64_add(rack_sack_skipped_acked, 1); 7230 moved++; 7231 if (end == rsm->r_end) { 7232 /* Done with block */ 7233 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7234 goto out; 7235 } else if (SEQ_LT(end, rsm->r_end)) { 7236 /* A partial sack to a already sacked block */ 7237 moved++; 7238 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7239 goto out; 7240 } else { 7241 /* 7242 * The end goes beyond this guy 7243 * repostion the start to the 7244 * next block. 7245 */ 7246 start = rsm->r_end; 7247 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7248 if (rsm == NULL) 7249 goto out; 7250 } 7251 } 7252 } 7253 if (SEQ_GEQ(end, rsm->r_end)) { 7254 /** 7255 * The end of this block is either beyond this guy or right 7256 * at this guy. I.e.: 7257 * rsm --- |-----| 7258 * end |-----| 7259 * <or> 7260 * end |---------| 7261 */ 7262 if ((rsm->r_flags & RACK_ACKED) == 0) { 7263 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); 7264 changed += (rsm->r_end - rsm->r_start); 7265 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 7266 if (rsm->r_in_tmap) /* should be true */ 7267 rack_log_sack_passed(tp, rack, rsm); 7268 /* Is Reordering occuring? 
*/ 7269 if (rsm->r_flags & RACK_SACK_PASSED) { 7270 rsm->r_flags &= ~RACK_SACK_PASSED; 7271 counter_u64_add(rack_reorder_seen, 1); 7272 rack->r_ctl.rc_reorder_ts = cts; 7273 } 7274 if (rack->app_limited_needs_set) 7275 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); 7276 rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 7277 rsm->r_flags |= RACK_ACKED; 7278 rsm->r_flags &= ~RACK_TLP; 7279 if (rsm->r_in_tmap) { 7280 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7281 rsm->r_in_tmap = 0; 7282 } 7283 } else { 7284 counter_u64_add(rack_sack_skipped_acked, 1); 7285 moved++; 7286 } 7287 if (end == rsm->r_end) { 7288 /* This block only - done, setup for next */ 7289 goto out; 7290 } 7291 /* 7292 * There is more not coverend by this rsm move on 7293 * to the next block in the RB tree. 7294 */ 7295 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7296 start = rsm->r_end; 7297 rsm = nrsm; 7298 if (rsm == NULL) 7299 goto out; 7300 goto do_rest_ofb; 7301 } 7302 /** 7303 * The end of this sack block is smaller than 7304 * our rsm i.e.: 7305 * rsm --- |-----| 7306 * end |--| 7307 */ 7308 if ((rsm->r_flags & RACK_ACKED) == 0) { 7309 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7310 if (prev && (prev->r_flags & RACK_ACKED)) { 7311 /** 7312 * Goal, we want the right remainder of rsm to shrink 7313 * in place and span from (rsm->r_start = end) to rsm->r_end. 7314 * We want to expand prev to go all the way 7315 * to prev->r_end <- end. 7316 * so in the tree we have before: 7317 * prev |--------| (acked) 7318 * rsm |-------| (non-acked) 7319 * sackblk |-| 7320 * We churn it so we end up with 7321 * prev |----------| (acked) 7322 * rsm |-----| (non-acked) 7323 * nrsm |-| (temporary) 7324 */ 7325 nrsm = &stack_map; 7326 memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); 7327 prev->r_end = end; 7328 rsm->r_start = end; 7329 /* Now adjust nrsm (stack copy) to be 7330 * the one that is the small 7331 * piece that was "sacked". 7332 */ 7333 nrsm->r_end = end; 7334 rsm->r_dupack = 0; 7335 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7336 /* 7337 * Now nrsm is our new little piece 7338 * that is acked (which was merged 7339 * to prev). Update the rtt and changed 7340 * based on that. Also check for reordering. 7341 */ 7342 rack_update_rtt(tp, rack, nrsm, to, cts, SACKED, 0); 7343 if (rack->app_limited_needs_set) 7344 rack_need_set_test(tp, rack, nrsm, tp->snd_una, __LINE__, RACK_USE_END); 7345 changed += (nrsm->r_end - nrsm->r_start); 7346 rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); 7347 if (nrsm->r_flags & RACK_SACK_PASSED) { 7348 counter_u64_add(rack_reorder_seen, 1); 7349 rack->r_ctl.rc_reorder_ts = cts; 7350 } 7351 rsm = prev; 7352 counter_u64_add(rack_sack_used_prev_merge, 1); 7353 } else { 7354 /** 7355 * This is the case where our previous 7356 * block is not acked either, so we must 7357 * split the block in two. 7358 */ 7359 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 7360 if (nrsm == NULL) { 7361 /* failed rrs what can we do but loose the sack info? */ 7362 goto out; 7363 } 7364 /** 7365 * In this case nrsm becomes 7366 * nrsm->r_start = end; 7367 * nrsm->r_end = rsm->r_end; 7368 * which is un-acked. 7369 * <and> 7370 * rsm->r_end = nrsm->r_start; 7371 * i.e. the remaining un-acked 7372 * piece is left on the left 7373 * hand side. 
7374 * 7375 * So we start like this 7376 * rsm |----------| (not acked) 7377 * sackblk |---| 7378 * build it so we have 7379 * rsm |---| (acked) 7380 * nrsm |------| (not acked) 7381 */ 7382 counter_u64_add(rack_sack_splits, 1); 7383 rack_clone_rsm(rack, nrsm, rsm, end); 7384 rsm->r_flags &= (~RACK_HAS_FIN); 7385 rsm->r_just_ret = 0; 7386 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 7387 #ifdef INVARIANTS 7388 if (insret != NULL) { 7389 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 7390 nrsm, insret, rack, rsm); 7391 } 7392 #endif 7393 if (rsm->r_in_tmap) { 7394 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 7395 nrsm->r_in_tmap = 1; 7396 } 7397 nrsm->r_dupack = 0; 7398 rack_log_retran_reason(rack, nrsm, __LINE__, 0, 2); 7399 rack_update_rtt(tp, rack, rsm, to, cts, SACKED, 0); 7400 changed += (rsm->r_end - rsm->r_start); 7401 rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); 7402 if (rsm->r_in_tmap) /* should be true */ 7403 rack_log_sack_passed(tp, rack, rsm); 7404 /* Is Reordering occuring? */ 7405 if (rsm->r_flags & RACK_SACK_PASSED) { 7406 rsm->r_flags &= ~RACK_SACK_PASSED; 7407 counter_u64_add(rack_reorder_seen, 1); 7408 rack->r_ctl.rc_reorder_ts = cts; 7409 } 7410 if (rack->app_limited_needs_set) 7411 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); 7412 rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 7413 rsm->r_flags |= RACK_ACKED; 7414 rsm->r_flags &= ~RACK_TLP; 7415 if (rsm->r_in_tmap) { 7416 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7417 rsm->r_in_tmap = 0; 7418 } 7419 } 7420 } else if (start != end){ 7421 /* 7422 * The block was already acked. 7423 */ 7424 counter_u64_add(rack_sack_skipped_acked, 1); 7425 moved++; 7426 } 7427 out: 7428 if (rsm && (rsm->r_flags & RACK_ACKED)) { 7429 /* 7430 * Now can we merge where we worked 7431 * with either the previous or 7432 * next block? 7433 */ 7434 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7435 while (next) { 7436 if (next->r_flags & RACK_ACKED) { 7437 /* yep this and next can be merged */ 7438 rsm = rack_merge_rsm(rack, rsm, next); 7439 next = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7440 } else 7441 break; 7442 } 7443 /* Now what about the previous? */ 7444 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7445 while (prev) { 7446 if (prev->r_flags & RACK_ACKED) { 7447 /* yep the previous and this can be merged */ 7448 rsm = rack_merge_rsm(rack, prev, rsm); 7449 prev = RB_PREV(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7450 } else 7451 break; 7452 } 7453 } 7454 if (used_ref == 0) { 7455 counter_u64_add(rack_sack_proc_all, 1); 7456 } else { 7457 counter_u64_add(rack_sack_proc_short, 1); 7458 } 7459 /* Save off the next one for quick reference. */ 7460 if (rsm) 7461 nrsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7462 else 7463 nrsm = NULL; 7464 *prsm = rack->r_ctl.rc_sacklast = nrsm; 7465 /* Pass back the moved. 
*/ 7466 *moved_two = moved; 7467 return (changed); 7468 } 7469 7470 static void inline 7471 rack_peer_reneges(struct tcp_rack *rack, struct rack_sendmap *rsm, tcp_seq th_ack) 7472 { 7473 struct rack_sendmap *tmap; 7474 7475 tmap = NULL; 7476 while (rsm && (rsm->r_flags & RACK_ACKED)) { 7477 /* Its no longer sacked, mark it so */ 7478 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 7479 #ifdef INVARIANTS 7480 if (rsm->r_in_tmap) { 7481 panic("rack:%p rsm:%p flags:0x%x in tmap?", 7482 rack, rsm, rsm->r_flags); 7483 } 7484 #endif 7485 rsm->r_flags &= ~(RACK_ACKED|RACK_SACK_PASSED|RACK_WAS_SACKPASS); 7486 /* Rebuild it into our tmap */ 7487 if (tmap == NULL) { 7488 TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7489 tmap = rsm; 7490 } else { 7491 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, tmap, rsm, r_tnext); 7492 tmap = rsm; 7493 } 7494 tmap->r_in_tmap = 1; 7495 rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7496 } 7497 /* 7498 * Now lets possibly clear the sack filter so we start 7499 * recognizing sacks that cover this area. 7500 */ 7501 sack_filter_clear(&rack->r_ctl.rack_sf, th_ack); 7502 7503 } 7504 7505 static void 7506 rack_do_decay(struct tcp_rack *rack) 7507 { 7508 struct timeval res; 7509 7510 #define timersub(tvp, uvp, vvp) \ 7511 do { \ 7512 (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ 7513 (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ 7514 if ((vvp)->tv_usec < 0) { \ 7515 (vvp)->tv_sec--; \ 7516 (vvp)->tv_usec += 1000000; \ 7517 } \ 7518 } while (0) 7519 7520 timersub(&rack->r_ctl.act_rcv_time, &rack->r_ctl.rc_last_time_decay, &res); 7521 #undef timersub 7522 7523 rack->r_ctl.input_pkt++; 7524 if ((rack->rc_in_persist) || 7525 (res.tv_sec >= 1) || 7526 (rack->rc_tp->snd_max == rack->rc_tp->snd_una)) { 7527 /* 7528 * Check for decay of non-SAD, 7529 * we want all SAD detection metrics to 7530 * decay 1/4 per second (or more) passed. 7531 */ 7532 uint32_t pkt_delta; 7533 7534 pkt_delta = rack->r_ctl.input_pkt - rack->r_ctl.saved_input_pkt; 7535 /* Update our saved tracking values */ 7536 rack->r_ctl.saved_input_pkt = rack->r_ctl.input_pkt; 7537 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; 7538 /* Now do we escape without decay? */ 7539 #ifdef NETFLIX_EXP_DETECTION 7540 if (rack->rc_in_persist || 7541 (rack->rc_tp->snd_max == rack->rc_tp->snd_una) || 7542 (pkt_delta < tcp_sad_low_pps)){ 7543 /* 7544 * We don't decay idle connections 7545 * or ones that have a low input pps. 
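 *
 * An illustrative aside, not part of the original comment: when the
 * decay does run, the four SAD counters below are all scaled with
 * the same tcp_sad_decay_val, so with the roughly one-quarter per
 * second decay mentioned above a sack_count of 400 would drop to
 * about 300 while ack_count shrinks in the same proportion; the
 * SACK-to-ACK ratio used for attack detection is preserved while
 * stale history fades away.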
7546 */ 7547 return; 7548 } 7549 /* Decay the counters */ 7550 rack->r_ctl.ack_count = ctf_decay_count(rack->r_ctl.ack_count, 7551 tcp_sad_decay_val); 7552 rack->r_ctl.sack_count = ctf_decay_count(rack->r_ctl.sack_count, 7553 tcp_sad_decay_val); 7554 rack->r_ctl.sack_moved_extra = ctf_decay_count(rack->r_ctl.sack_moved_extra, 7555 tcp_sad_decay_val); 7556 rack->r_ctl.sack_noextra_move = ctf_decay_count(rack->r_ctl.sack_noextra_move, 7557 tcp_sad_decay_val); 7558 #endif 7559 } 7560 } 7561 7562 static void 7563 rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) 7564 { 7565 uint32_t changed, entered_recovery = 0; 7566 struct tcp_rack *rack; 7567 struct rack_sendmap *rsm, *rm; 7568 struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; 7569 register uint32_t th_ack; 7570 int32_t i, j, k, num_sack_blks = 0; 7571 uint32_t cts, acked, ack_point, sack_changed = 0; 7572 int loop_start = 0, moved_two = 0; 7573 uint32_t tsused; 7574 7575 7576 INP_WLOCK_ASSERT(tp->t_inpcb); 7577 if (th->th_flags & TH_RST) { 7578 /* We don't log resets */ 7579 return; 7580 } 7581 rack = (struct tcp_rack *)tp->t_fb_ptr; 7582 cts = tcp_ts_getticks(); 7583 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 7584 changed = 0; 7585 th_ack = th->th_ack; 7586 if (rack->sack_attack_disable == 0) 7587 rack_do_decay(rack); 7588 if (BYTES_THIS_ACK(tp, th) >= ctf_fixed_maxseg(rack->rc_tp)) { 7589 /* 7590 * You only get credit for 7591 * MSS and greater (and you get extra 7592 * credit for larger cum-ack moves). 7593 */ 7594 int ac; 7595 7596 ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp); 7597 rack->r_ctl.ack_count += ac; 7598 counter_u64_add(rack_ack_total, ac); 7599 } 7600 if (rack->r_ctl.ack_count > 0xfff00000) { 7601 /* 7602 * reduce the number to keep us under 7603 * a uint32_t. 7604 */ 7605 rack->r_ctl.ack_count /= 2; 7606 rack->r_ctl.sack_count /= 2; 7607 } 7608 if (SEQ_GT(th_ack, tp->snd_una)) { 7609 rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__); 7610 tp->t_acktime = ticks; 7611 } 7612 if (rsm && SEQ_GT(th_ack, rsm->r_start)) 7613 changed = th_ack - rsm->r_start; 7614 if (changed) { 7615 /* 7616 * The ACK point is advancing to th_ack, we must drop off 7617 * the packets in the rack log and calculate any eligble 7618 * RTT's. 7619 */ 7620 rack->r_wanted_output = 1; 7621 more: 7622 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 7623 if (rsm == NULL) { 7624 if ((th_ack - 1) == tp->iss) { 7625 /* 7626 * For the SYN incoming case we will not 7627 * have called tcp_output for the sending of 7628 * the SYN, so there will be no map. All 7629 * other cases should probably be a panic. 7630 */ 7631 goto proc_sack; 7632 } 7633 if (tp->t_flags & TF_SENTFIN) { 7634 /* if we send a FIN we will not hav a map */ 7635 goto proc_sack; 7636 } 7637 #ifdef INVARIANTS 7638 panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n", 7639 tp, 7640 th, tp->t_state, rack, 7641 tp->snd_una, tp->snd_max, tp->snd_nxt, changed); 7642 #endif 7643 goto proc_sack; 7644 } 7645 if (SEQ_LT(th_ack, rsm->r_start)) { 7646 /* Huh map is missing this */ 7647 #ifdef INVARIANTS 7648 printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n", 7649 rsm->r_start, 7650 th_ack, tp->t_state, rack->r_state); 7651 #endif 7652 goto proc_sack; 7653 } 7654 rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED, th_ack); 7655 /* Now do we consume the whole thing? */ 7656 if (SEQ_GEQ(th_ack, rsm->r_end)) { 7657 /* Its all consumed. 
*/ 7658 uint32_t left; 7659 uint8_t newly_acked; 7660 7661 rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; 7662 rsm->r_rtr_bytes = 0; 7663 /* Record the time of highest cumack sent */ 7664 rack->r_ctl.rc_gp_cumack_ts = rsm->usec_orig_send; 7665 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 7666 #ifdef INVARIANTS 7667 if (rm != rsm) { 7668 panic("removing head in rack:%p rsm:%p rm:%p", 7669 rack, rsm, rm); 7670 } 7671 #endif 7672 if (rsm->r_in_tmap) { 7673 TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); 7674 rsm->r_in_tmap = 0; 7675 } 7676 newly_acked = 1; 7677 if (rsm->r_flags & RACK_ACKED) { 7678 /* 7679 * It was acked on the scoreboard -- remove 7680 * it from total 7681 */ 7682 rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); 7683 newly_acked = 0; 7684 } else if (rsm->r_flags & RACK_SACK_PASSED) { 7685 /* 7686 * There are segments ACKED on the 7687 * scoreboard further up. We are seeing 7688 * reordering. 7689 */ 7690 rsm->r_flags &= ~RACK_SACK_PASSED; 7691 counter_u64_add(rack_reorder_seen, 1); 7692 rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 7693 rsm->r_flags |= RACK_ACKED; 7694 rack->r_ctl.rc_reorder_ts = cts; 7695 } 7696 left = th_ack - rsm->r_end; 7697 if (rack->app_limited_needs_set && newly_acked) 7698 rack_need_set_test(tp, rack, rsm, th_ack, __LINE__, RACK_USE_END_OR_THACK); 7699 /* Free back to zone */ 7700 rack_free(rack, rsm); 7701 if (left) { 7702 goto more; 7703 } 7704 goto proc_sack; 7705 } 7706 if (rsm->r_flags & RACK_ACKED) { 7707 /* 7708 * It was acked on the scoreboard -- remove it from 7709 * total for the part being cum-acked. 7710 */ 7711 rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); 7712 } 7713 /* 7714 * Clear the dup ack count for 7715 * the piece that remains. 7716 */ 7717 rsm->r_dupack = 0; 7718 rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); 7719 if (rsm->r_rtr_bytes) { 7720 /* 7721 * It was retransmitted adjust the 7722 * sack holes for what was acked. 7723 */ 7724 int ack_am; 7725 7726 ack_am = (th_ack - rsm->r_start); 7727 if (ack_am >= rsm->r_rtr_bytes) { 7728 rack->r_ctl.rc_holes_rxt -= ack_am; 7729 rsm->r_rtr_bytes -= ack_am; 7730 } 7731 } 7732 /* 7733 * Update where the piece starts and record 7734 * the time of send of highest cumack sent. 7735 */ 7736 rack->r_ctl.rc_gp_cumack_ts = rsm->usec_orig_send; 7737 rsm->r_start = th_ack; 7738 if (rack->app_limited_needs_set) 7739 rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG); 7740 7741 } 7742 proc_sack: 7743 /* Check for reneging */ 7744 rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 7745 if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) { 7746 /* 7747 * The peer has moved snd_una up to 7748 * the edge of this send, i.e. one 7749 * that it had previously acked. The only 7750 * way that can be true if the peer threw 7751 * away data (space issues) that it had 7752 * previously sacked (else it would have 7753 * given us snd_una up to (rsm->r_end). 7754 * We need to undo the acked markings here. 7755 * 7756 * Note we have to look to make sure th_ack is 7757 * our rsm->r_start in case we get an old ack 7758 * where th_ack is behind snd_una. 
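 *
 * An aside, not part of the original comment: rack_peer_reneges()
 * handles this by walking forward from the head of the send map,
 * clearing RACK_ACKED (and the SACK-passed flags) on each previously
 * sacked entry, subtracting those bytes from rc_sacked, and
 * re-linking the entries at the front of the tmap so they become
 * eligible for retransmission again; it finishes by clearing the
 * sack filter at th_ack so later SACKs covering this range are
 * honored.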
7759 */ 7760 rack_peer_reneges(rack, rsm, th->th_ack); 7761 } 7762 if ((to->to_flags & TOF_SACK) == 0) { 7763 /* We are done nothing left */ 7764 goto out; 7765 } 7766 /* Sack block processing */ 7767 if (SEQ_GT(th_ack, tp->snd_una)) 7768 ack_point = th_ack; 7769 else 7770 ack_point = tp->snd_una; 7771 for (i = 0; i < to->to_nsacks; i++) { 7772 bcopy((to->to_sacks + i * TCPOLEN_SACK), 7773 &sack, sizeof(sack)); 7774 sack.start = ntohl(sack.start); 7775 sack.end = ntohl(sack.end); 7776 if (SEQ_GT(sack.end, sack.start) && 7777 SEQ_GT(sack.start, ack_point) && 7778 SEQ_LT(sack.start, tp->snd_max) && 7779 SEQ_GT(sack.end, ack_point) && 7780 SEQ_LEQ(sack.end, tp->snd_max)) { 7781 sack_blocks[num_sack_blks] = sack; 7782 num_sack_blks++; 7783 #ifdef NETFLIX_STATS 7784 } else if (SEQ_LEQ(sack.start, th_ack) && 7785 SEQ_LEQ(sack.end, th_ack)) { 7786 /* 7787 * Its a D-SACK block. 7788 */ 7789 tcp_record_dsack(sack.start, sack.end); 7790 #endif 7791 } 7792 7793 } 7794 /* 7795 * Sort the SACK blocks so we can update the rack scoreboard with 7796 * just one pass. 7797 */ 7798 num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, 7799 num_sack_blks, th->th_ack); 7800 ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks); 7801 if (num_sack_blks == 0) { 7802 /* Nothing to sack (DSACKs?) */ 7803 goto out_with_totals; 7804 } 7805 if (num_sack_blks < 2) { 7806 /* Only one, we don't need to sort */ 7807 goto do_sack_work; 7808 } 7809 /* Sort the sacks */ 7810 for (i = 0; i < num_sack_blks; i++) { 7811 for (j = i + 1; j < num_sack_blks; j++) { 7812 if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) { 7813 sack = sack_blocks[i]; 7814 sack_blocks[i] = sack_blocks[j]; 7815 sack_blocks[j] = sack; 7816 } 7817 } 7818 } 7819 /* 7820 * Now are any of the sack block ends the same (yes some 7821 * implementations send these)? 7822 */ 7823 again: 7824 if (num_sack_blks == 0) 7825 goto out_with_totals; 7826 if (num_sack_blks > 1) { 7827 for (i = 0; i < num_sack_blks; i++) { 7828 for (j = i + 1; j < num_sack_blks; j++) { 7829 if (sack_blocks[i].end == sack_blocks[j].end) { 7830 /* 7831 * Ok these two have the same end we 7832 * want the smallest end and then 7833 * throw away the larger and start 7834 * again. 7835 */ 7836 if (SEQ_LT(sack_blocks[j].start, sack_blocks[i].start)) { 7837 /* 7838 * The second block covers 7839 * more area use that 7840 */ 7841 sack_blocks[i].start = sack_blocks[j].start; 7842 } 7843 /* 7844 * Now collapse out the dup-sack and 7845 * lower the count 7846 */ 7847 for (k = (j + 1); k < num_sack_blks; k++) { 7848 sack_blocks[j].start = sack_blocks[k].start; 7849 sack_blocks[j].end = sack_blocks[k].end; 7850 j++; 7851 } 7852 num_sack_blks--; 7853 goto again; 7854 } 7855 } 7856 } 7857 } 7858 do_sack_work: 7859 /* 7860 * First lets look to see if 7861 * we have retransmitted and 7862 * can use the transmit next? 7863 */ 7864 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 7865 if (rsm && 7866 SEQ_GT(sack_blocks[0].end, rsm->r_start) && 7867 SEQ_LT(sack_blocks[0].start, rsm->r_end)) { 7868 /* 7869 * We probably did the FR and the next 7870 * SACK in continues as we would expect. 
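 *
 * An illustrative note, not part of the original comment: this is
 * the express path. The first (lowest) sack block is processed
 * directly against the head of the tmap, and when the sack filter
 * has reduced the ACK to that single block the newly sacked bytes
 * are also credited to ack_count in MSS units, so a well-behaved
 * receiver in fast recovery is less likely to trip the SACK-attack
 * heuristics below.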
7871 */ 7872 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[0], to, &rsm, cts, &moved_two); 7873 if (acked) { 7874 rack->r_wanted_output = 1; 7875 changed += acked; 7876 sack_changed += acked; 7877 } 7878 if (num_sack_blks == 1) { 7879 /* 7880 * This is what we would expect from 7881 * a normal implementation to happen 7882 * after we have retransmitted the FR, 7883 * i.e the sack-filter pushes down 7884 * to 1 block and the next to be retransmitted 7885 * is the sequence in the sack block (has more 7886 * are acked). Count this as ACK'd data to boost 7887 * up the chances of recovering any false positives. 7888 */ 7889 rack->r_ctl.ack_count += (acked / ctf_fixed_maxseg(rack->rc_tp)); 7890 counter_u64_add(rack_ack_total, (acked / ctf_fixed_maxseg(rack->rc_tp))); 7891 counter_u64_add(rack_express_sack, 1); 7892 if (rack->r_ctl.ack_count > 0xfff00000) { 7893 /* 7894 * reduce the number to keep us under 7895 * a uint32_t. 7896 */ 7897 rack->r_ctl.ack_count /= 2; 7898 rack->r_ctl.sack_count /= 2; 7899 } 7900 goto out_with_totals; 7901 } else { 7902 /* 7903 * Start the loop through the 7904 * rest of blocks, past the first block. 7905 */ 7906 moved_two = 0; 7907 loop_start = 1; 7908 } 7909 } 7910 /* Its a sack of some sort */ 7911 rack->r_ctl.sack_count++; 7912 if (rack->r_ctl.sack_count > 0xfff00000) { 7913 /* 7914 * reduce the number to keep us under 7915 * a uint32_t. 7916 */ 7917 rack->r_ctl.ack_count /= 2; 7918 rack->r_ctl.sack_count /= 2; 7919 } 7920 counter_u64_add(rack_sack_total, 1); 7921 if (rack->sack_attack_disable) { 7922 /* An attacker disablement is in place */ 7923 if (num_sack_blks > 1) { 7924 rack->r_ctl.sack_count += (num_sack_blks - 1); 7925 rack->r_ctl.sack_moved_extra++; 7926 counter_u64_add(rack_move_some, 1); 7927 if (rack->r_ctl.sack_moved_extra > 0xfff00000) { 7928 rack->r_ctl.sack_moved_extra /= 2; 7929 rack->r_ctl.sack_noextra_move /= 2; 7930 } 7931 } 7932 goto out; 7933 } 7934 rsm = rack->r_ctl.rc_sacklast; 7935 for (i = loop_start; i < num_sack_blks; i++) { 7936 acked = rack_proc_sack_blk(tp, rack, &sack_blocks[i], to, &rsm, cts, &moved_two); 7937 if (acked) { 7938 rack->r_wanted_output = 1; 7939 changed += acked; 7940 sack_changed += acked; 7941 } 7942 if (moved_two) { 7943 /* 7944 * If we did not get a SACK for at least a MSS and 7945 * had to move at all, or if we moved more than our 7946 * threshold, it counts against the "extra" move. 7947 */ 7948 rack->r_ctl.sack_moved_extra += moved_two; 7949 counter_u64_add(rack_move_some, 1); 7950 } else { 7951 /* 7952 * else we did not have to move 7953 * any more than we would expect. 7954 */ 7955 rack->r_ctl.sack_noextra_move++; 7956 counter_u64_add(rack_move_none, 1); 7957 } 7958 if (moved_two && (acked < ctf_fixed_maxseg(rack->rc_tp))) { 7959 /* 7960 * If the SACK was not a full MSS then 7961 * we add to sack_count the number of 7962 * MSS's (or possibly more than 7963 * a MSS if its a TSO send) we had to skip by. 7964 */ 7965 rack->r_ctl.sack_count += moved_two; 7966 counter_u64_add(rack_sack_total, moved_two); 7967 } 7968 /* 7969 * Now we need to setup for the next 7970 * round. First we make sure we won't 7971 * exceed the size of our uint32_t on 7972 * the various counts, and then clear out 7973 * moved_two. 
7974 */ 7975 if ((rack->r_ctl.sack_moved_extra > 0xfff00000) || 7976 (rack->r_ctl.sack_noextra_move > 0xfff00000)) { 7977 rack->r_ctl.sack_moved_extra /= 2; 7978 rack->r_ctl.sack_noextra_move /= 2; 7979 } 7980 if (rack->r_ctl.sack_count > 0xfff00000) { 7981 rack->r_ctl.ack_count /= 2; 7982 rack->r_ctl.sack_count /= 2; 7983 } 7984 moved_two = 0; 7985 } 7986 out_with_totals: 7987 if (num_sack_blks > 1) { 7988 /* 7989 * You get an extra stroke if 7990 * you have more than one sack-blk, this 7991 * could be where we are skipping forward 7992 * and the sack-filter is still working, or 7993 * it could be an attacker constantly 7994 * moving us. 7995 */ 7996 rack->r_ctl.sack_moved_extra++; 7997 counter_u64_add(rack_move_some, 1); 7998 } 7999 out: 8000 #ifdef NETFLIX_EXP_DETECTION 8001 if ((rack->do_detection || tcp_force_detection) && 8002 tcp_sack_to_ack_thresh && 8003 tcp_sack_to_move_thresh && 8004 ((rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum) || rack->sack_attack_disable)) { 8005 /* 8006 * We have thresholds set to find 8007 * possible attackers and disable sack. 8008 * Check them. 8009 */ 8010 uint64_t ackratio, moveratio, movetotal; 8011 8012 /* Log detecting */ 8013 rack_log_sad(rack, 1); 8014 ackratio = (uint64_t)(rack->r_ctl.sack_count); 8015 ackratio *= (uint64_t)(1000); 8016 if (rack->r_ctl.ack_count) 8017 ackratio /= (uint64_t)(rack->r_ctl.ack_count); 8018 else { 8019 /* We really should not hit here */ 8020 ackratio = 1000; 8021 } 8022 if ((rack->sack_attack_disable == 0) && 8023 (ackratio > rack_highest_sack_thresh_seen)) 8024 rack_highest_sack_thresh_seen = (uint32_t)ackratio; 8025 movetotal = rack->r_ctl.sack_moved_extra; 8026 movetotal += rack->r_ctl.sack_noextra_move; 8027 moveratio = rack->r_ctl.sack_moved_extra; 8028 moveratio *= (uint64_t)1000; 8029 if (movetotal) 8030 moveratio /= movetotal; 8031 else { 8032 /* No moves, thats pretty good */ 8033 moveratio = 0; 8034 } 8035 if ((rack->sack_attack_disable == 0) && 8036 (moveratio > rack_highest_move_thresh_seen)) 8037 rack_highest_move_thresh_seen = (uint32_t)moveratio; 8038 if (rack->sack_attack_disable == 0) { 8039 if ((ackratio > tcp_sack_to_ack_thresh) && 8040 (moveratio > tcp_sack_to_move_thresh)) { 8041 /* Disable sack processing */ 8042 rack->sack_attack_disable = 1; 8043 if (rack->r_rep_attack == 0) { 8044 rack->r_rep_attack = 1; 8045 counter_u64_add(rack_sack_attacks_detected, 1); 8046 } 8047 if (tcp_attack_on_turns_on_logging) { 8048 /* 8049 * Turn on logging, used for debugging 8050 * false positives. 
8051 */ 8052 rack->rc_tp->t_logstate = tcp_attack_on_turns_on_logging; 8053 } 8054 /* Clamp the cwnd at flight size */ 8055 rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd; 8056 rack->rc_tp->snd_cwnd = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 8057 rack_log_sad(rack, 2); 8058 } 8059 } else { 8060 /* We are sack-disabled check for false positives */ 8061 if ((ackratio <= tcp_restoral_thresh) || 8062 (rack->r_ctl.rc_num_maps_alloced < tcp_map_minimum)) { 8063 rack->sack_attack_disable = 0; 8064 rack_log_sad(rack, 3); 8065 /* Restart counting */ 8066 rack->r_ctl.sack_count = 0; 8067 rack->r_ctl.sack_moved_extra = 0; 8068 rack->r_ctl.sack_noextra_move = 1; 8069 rack->r_ctl.ack_count = max(1, 8070 (BYTES_THIS_ACK(tp, th)/ctf_fixed_maxseg(rack->rc_tp))); 8071 8072 if (rack->r_rep_reverse == 0) { 8073 rack->r_rep_reverse = 1; 8074 counter_u64_add(rack_sack_attacks_reversed, 1); 8075 } 8076 /* Restore the cwnd */ 8077 if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd) 8078 rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd; 8079 } 8080 } 8081 } 8082 #endif 8083 if (changed) { 8084 /* Something changed cancel the rack timer */ 8085 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 8086 } 8087 tsused = tcp_ts_getticks(); 8088 rsm = tcp_rack_output(tp, rack, tsused); 8089 if ((!IN_RECOVERY(tp->t_flags)) && 8090 rsm) { 8091 /* Enter recovery */ 8092 rack->r_ctl.rc_rsm_start = rsm->r_start; 8093 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 8094 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 8095 entered_recovery = 1; 8096 rack_cong_signal(tp, NULL, CC_NDUPACK); 8097 /* 8098 * When we enter recovery we need to assure we send 8099 * one packet. 8100 */ 8101 if (rack->rack_no_prr == 0) { 8102 rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); 8103 rack_log_to_prr(rack, 8, 0); 8104 } 8105 rack->r_timer_override = 1; 8106 rack->r_early = 0; 8107 rack->r_ctl.rc_agg_early = 0; 8108 } else if (IN_RECOVERY(tp->t_flags) && 8109 rsm && 8110 (rack->r_rr_config == 3)) { 8111 /* 8112 * Assure we can output and we get no 8113 * remembered pace time except the retransmit. 
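 *
 * A worked example for the PRR block below, illustrative only and
 * not part of the original comment: suppose ssthresh is 50000 bytes,
 * recovery began with prr_recovery_fs = 100000 bytes in flight, and
 * the ACKs so far have delivered prr_delivered = 10000 bytes. While
 * pipe stays above ssthresh, the proportional branch allows about
 * (10000 * 50000) / 100000 = 5000 bytes, plus one, minus whatever
 * was already sent during recovery (prr_out). Once pipe drops to or
 * below ssthresh the code switches to min(ssthresh - pipe, limit),
 * rebuilding the flight toward ssthresh without a large burst.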
8114 */ 8115 rack->r_timer_override = 1; 8116 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 8117 rack->r_ctl.rc_resend = rsm; 8118 } 8119 if (IN_RECOVERY(tp->t_flags) && 8120 (rack->rack_no_prr == 0) && 8121 (entered_recovery == 0)) { 8122 /* Deal with PRR here (in recovery only) */ 8123 uint32_t pipe, snd_una; 8124 8125 rack->r_ctl.rc_prr_delivered += changed; 8126 /* Compute prr_sndcnt */ 8127 if (SEQ_GT(tp->snd_una, th_ack)) { 8128 snd_una = tp->snd_una; 8129 } else { 8130 snd_una = th_ack; 8131 } 8132 pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt; 8133 if (pipe > tp->snd_ssthresh) { 8134 long sndcnt; 8135 8136 sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh; 8137 if (rack->r_ctl.rc_prr_recovery_fs > 0) 8138 sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; 8139 else { 8140 rack->r_ctl.rc_prr_sndcnt = 0; 8141 rack_log_to_prr(rack, 9, 0); 8142 sndcnt = 0; 8143 } 8144 sndcnt++; 8145 if (sndcnt > (long)rack->r_ctl.rc_prr_out) 8146 sndcnt -= rack->r_ctl.rc_prr_out; 8147 else 8148 sndcnt = 0; 8149 rack->r_ctl.rc_prr_sndcnt = sndcnt; 8150 rack_log_to_prr(rack, 10, 0); 8151 } else { 8152 uint32_t limit; 8153 8154 if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out) 8155 limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out); 8156 else 8157 limit = 0; 8158 if (changed > limit) 8159 limit = changed; 8160 limit += ctf_fixed_maxseg(tp); 8161 if (tp->snd_ssthresh > pipe) { 8162 rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); 8163 rack_log_to_prr(rack, 11, 0); 8164 } else { 8165 rack->r_ctl.rc_prr_sndcnt = min(0, limit); 8166 rack_log_to_prr(rack, 12, 0); 8167 } 8168 } 8169 if ((rsm && (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) && 8170 ((rack->rc_inp->inp_in_hpts == 0) && 8171 ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)))) { 8172 /* 8173 * If you are pacing output you don't want 8174 * to override. 8175 */ 8176 rack->r_early = 0; 8177 rack->r_ctl.rc_agg_early = 0; 8178 rack->r_timer_override = 1; 8179 } 8180 } 8181 } 8182 8183 static void 8184 rack_strike_dupack(struct tcp_rack *rack) 8185 { 8186 struct rack_sendmap *rsm; 8187 8188 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 8189 if (rsm && (rsm->r_dupack < 0xff)) { 8190 rsm->r_dupack++; 8191 if (rsm->r_dupack >= DUP_ACK_THRESHOLD) { 8192 rack->r_wanted_output = 1; 8193 rack_log_retran_reason(rack, rsm, __LINE__, 1, 3); 8194 } else { 8195 rack_log_retran_reason(rack, rsm, __LINE__, 0, 3); 8196 } 8197 } 8198 } 8199 8200 static void 8201 rack_check_bottom_drag(struct tcpcb *tp, 8202 struct tcp_rack *rack, 8203 struct socket *so, int32_t acked) 8204 { 8205 uint32_t segsiz, minseg; 8206 8207 segsiz = ctf_fixed_maxseg(tp); 8208 if (so->so_snd.sb_flags & SB_TLS_IFNET) { 8209 minseg = rack->r_ctl.rc_pace_min_segs; 8210 } else { 8211 minseg = segsiz; 8212 } 8213 if (tp->snd_max == tp->snd_una) { 8214 /* 8215 * We are doing dynamic pacing and we are way 8216 * under. Basically everything got acked while 8217 * we were still waiting on the pacer to expire. 8218 * 8219 * This means we need to boost the b/w in 8220 * addition to any earlier boosting of 8221 * the multipler. 8222 */ 8223 rack->rc_dragged_bottom = 1; 8224 rack_validate_multipliers_at_or_above100(rack); 8225 /* 8226 * Lets use the segment bytes acked plus 8227 * the lowest RTT seen as the basis to 8228 * form a b/w estimate. This will be off 8229 * due to the fact that the true estimate 8230 * should be around 1/2 the time of the RTT 8231 * but we can settle for that. 
8232 */ 8233 if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_VALID) && 8234 acked) { 8235 uint64_t bw, calc_bw, rtt; 8236 8237 rtt = rack->r_ctl.rack_rs.rs_us_rtt; 8238 bw = acked; 8239 calc_bw = bw * 1000000; 8240 calc_bw /= rtt; 8241 if (rack->r_ctl.last_max_bw && 8242 (rack->r_ctl.last_max_bw < calc_bw)) { 8243 /* 8244 * If we have a last calculated max bw 8245 * enforce it. 8246 */ 8247 calc_bw = rack->r_ctl.last_max_bw; 8248 } 8249 /* now plop it in */ 8250 if (rack->rc_gp_filled == 0) { 8251 if (calc_bw > ONE_POINT_TWO_MEG) { 8252 /* 8253 * If we have no measurement 8254 * don't let us set in more than 8255 * 1.2Mbps. If we are still too 8256 * low after pacing with this we 8257 * will hopefully have a max b/w 8258 * available to sanity check things. 8259 */ 8260 calc_bw = ONE_POINT_TWO_MEG; 8261 } 8262 rack->r_ctl.rc_rtt_diff = 0; 8263 rack->r_ctl.gp_bw = calc_bw; 8264 rack->rc_gp_filled = 1; 8265 rack->r_ctl.num_avg = RACK_REQ_AVG; 8266 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 8267 } else if (calc_bw > rack->r_ctl.gp_bw) { 8268 rack->r_ctl.rc_rtt_diff = 0; 8269 rack->r_ctl.num_avg = RACK_REQ_AVG; 8270 rack->r_ctl.gp_bw = calc_bw; 8271 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 8272 } else 8273 rack_increase_bw_mul(rack, -1, 0, 0, 1); 8274 /* 8275 * For acks over 1mss we do a extra boost to simulate 8276 * where we would get 2 acks (we want 110 for the mul). 8277 */ 8278 if (acked > segsiz) 8279 rack_increase_bw_mul(rack, -1, 0, 0, 1); 8280 } else { 8281 /* 8282 * Huh, this should not be, settle 8283 * for just an old increase. 8284 */ 8285 rack_increase_bw_mul(rack, -1, 0, 0, 1); 8286 } 8287 } else if ((IN_RECOVERY(tp->t_flags) == 0) && 8288 (sbavail(&so->so_snd) > max((segsiz * (4 + rack_req_segs)), 8289 minseg)) && 8290 (rack->r_ctl.cwnd_to_use > max((segsiz * (rack_req_segs + 2)), minseg)) && 8291 (tp->snd_wnd > max((segsiz * (rack_req_segs + 2)), minseg)) && 8292 (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) <= 8293 (segsiz * rack_req_segs))) { 8294 /* 8295 * We are doing dynamic GP pacing and 8296 * we have everything except 1MSS or less 8297 * bytes left out. We are still pacing away. 8298 * And there is data that could be sent, This 8299 * means we are inserting delayed ack time in 8300 * our measurements because we are pacing too slow. 8301 */ 8302 rack_validate_multipliers_at_or_above100(rack); 8303 rack->rc_dragged_bottom = 1; 8304 rack_increase_bw_mul(rack, -1, 0, 0, 1); 8305 } 8306 } 8307 8308 /* 8309 * Return value of 1, we do not need to call rack_process_data(). 8310 * return value of 0, rack_process_data can be called. 8311 * For ret_val if its 0 the TCP is locked, if its non-zero 8312 * its unlocked and probably unsafe to touch the TCB. 
8313 */ 8314 static int 8315 rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, 8316 struct tcpcb *tp, struct tcpopt *to, 8317 uint32_t tiwin, int32_t tlen, 8318 int32_t * ofia, int32_t thflags, int32_t * ret_val) 8319 { 8320 int32_t ourfinisacked = 0; 8321 int32_t nsegs, acked_amount; 8322 int32_t acked; 8323 struct mbuf *mfree; 8324 struct tcp_rack *rack; 8325 int32_t under_pacing = 0; 8326 int32_t recovery = 0; 8327 8328 rack = (struct tcp_rack *)tp->t_fb_ptr; 8329 if (SEQ_GT(th->th_ack, tp->snd_max)) { 8330 ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); 8331 rack->r_wanted_output = 1; 8332 return (1); 8333 } 8334 if (rack->rc_gp_filled && 8335 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 8336 under_pacing = 1; 8337 } 8338 if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { 8339 if (rack->rc_in_persist) 8340 tp->t_rxtshift = 0; 8341 if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd)) 8342 rack_strike_dupack(rack); 8343 rack_log_ack(tp, to, th); 8344 } 8345 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 8346 /* 8347 * Old ack, behind (or duplicate to) the last one rcv'd 8348 * Note: Should mark reordering is occuring! We should also 8349 * look for sack blocks arriving e.g. ack 1, 4-4 then ack 1, 8350 * 3-3, 4-4 would be reording. As well as ack 1, 3-3 <no 8351 * retran and> ack 3 8352 */ 8353 return (0); 8354 } 8355 /* 8356 * If we reach this point, ACK is not a duplicate, i.e., it ACKs 8357 * something we sent. 8358 */ 8359 if (tp->t_flags & TF_NEEDSYN) { 8360 /* 8361 * T/TCP: Connection was half-synchronized, and our SYN has 8362 * been ACK'd (so connection is now fully synchronized). Go 8363 * to non-starred state, increment snd_una for ACK of SYN, 8364 * and check if we can do window scaling. 8365 */ 8366 tp->t_flags &= ~TF_NEEDSYN; 8367 tp->snd_una++; 8368 /* Do window scaling? */ 8369 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 8370 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 8371 tp->rcv_scale = tp->request_r_scale; 8372 /* Send window already scaled. */ 8373 } 8374 } 8375 nsegs = max(1, m->m_pkthdr.lro_nsegs); 8376 INP_WLOCK_ASSERT(tp->t_inpcb); 8377 8378 acked = BYTES_THIS_ACK(tp, th); 8379 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 8380 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 8381 /* 8382 * If we just performed our first retransmit, and the ACK arrives 8383 * within our recovery window, then it was a mistake to do the 8384 * retransmit in the first place. Recover our original cwnd and 8385 * ssthresh, and proceed to transmit where we left off. 8386 */ 8387 if (tp->t_flags & TF_PREVVALID) { 8388 tp->t_flags &= ~TF_PREVVALID; 8389 if (tp->t_rxtshift == 1 && 8390 (int)(ticks - tp->t_badrxtwin) < 0) 8391 rack_cong_signal(tp, th, CC_RTO_ERR); 8392 } 8393 if (acked) { 8394 /* assure we are not backed off */ 8395 tp->t_rxtshift = 0; 8396 rack->rc_tlp_in_progress = 0; 8397 rack->r_ctl.rc_tlp_cnt_out = 0; 8398 /* 8399 * If it is the RXT timer we want to 8400 * stop it, so we can restart a TLP. 8401 */ 8402 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 8403 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 8404 #ifdef NETFLIX_HTTP_LOGGING 8405 tcp_http_check_for_comp(rack->rc_tp, th->th_ack); 8406 #endif 8407 } 8408 /* 8409 * If we have a timestamp reply, update smoothed round trip time. If 8410 * no timestamp is present but transmit timer is running and timed 8411 * sequence number was acked, update smoothed round trip time. 
Since 8412 * we now have an rtt measurement, cancel the timer backoff (cf., 8413 * Phil Karn's retransmit alg.). Recompute the initial retransmit 8414 * timer. 8415 * 8416 * Some boxes send broken timestamp replies during the SYN+ACK 8417 * phase, ignore timestamps of 0 or we could calculate a huge RTT 8418 * and blow up the retransmit timer. 8419 */ 8420 /* 8421 * If all outstanding data is acked, stop retransmit timer and 8422 * remember to restart (more output or persist). If there is more 8423 * data to be acked, restart retransmit timer, using current 8424 * (possibly backed-off) value. 8425 */ 8426 if (acked == 0) { 8427 if (ofia) 8428 *ofia = ourfinisacked; 8429 return (0); 8430 } 8431 if (rack->r_ctl.rc_early_recovery) { 8432 if (IN_RECOVERY(tp->t_flags)) { 8433 if (SEQ_LT(th->th_ack, tp->snd_recover) && 8434 (SEQ_LT(th->th_ack, tp->snd_max))) { 8435 tcp_rack_partialack(tp, th); 8436 } else { 8437 rack_post_recovery(tp, th); 8438 recovery = 1; 8439 } 8440 } 8441 } 8442 /* 8443 * Let the congestion control algorithm update congestion control 8444 * related information. This typically means increasing the 8445 * congestion window. 8446 */ 8447 rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery); 8448 SOCKBUF_LOCK(&so->so_snd); 8449 acked_amount = min(acked, (int)sbavail(&so->so_snd)); 8450 tp->snd_wnd -= acked_amount; 8451 mfree = sbcut_locked(&so->so_snd, acked_amount); 8452 if ((sbused(&so->so_snd) == 0) && 8453 (acked > acked_amount) && 8454 (tp->t_state >= TCPS_FIN_WAIT_1) && 8455 (tp->t_flags & TF_SENTFIN)) { 8456 /* 8457 * We must be sure our fin 8458 * was sent and acked (we can be 8459 * in FIN_WAIT_1 without having 8460 * sent the fin). 8461 */ 8462 ourfinisacked = 1; 8463 } 8464 /* NB: sowwakeup_locked() does an implicit unlock. */ 8465 sowwakeup_locked(so); 8466 m_freem(mfree); 8467 if (rack->r_ctl.rc_early_recovery == 0) { 8468 if (IN_RECOVERY(tp->t_flags)) { 8469 if (SEQ_LT(th->th_ack, tp->snd_recover) && 8470 (SEQ_LT(th->th_ack, tp->snd_max))) { 8471 tcp_rack_partialack(tp, th); 8472 } else { 8473 rack_post_recovery(tp, th); 8474 } 8475 } 8476 } 8477 tp->snd_una = th->th_ack; 8478 if (SEQ_GT(tp->snd_una, tp->snd_recover)) 8479 tp->snd_recover = tp->snd_una; 8480 8481 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) { 8482 tp->snd_nxt = tp->snd_una; 8483 } 8484 if (under_pacing && 8485 (rack->use_fixed_rate == 0) && 8486 (rack->in_probe_rtt == 0) && 8487 rack->rc_gp_dyn_mul && 8488 rack->rc_always_pace) { 8489 /* Check if we are dragging bottom */ 8490 rack_check_bottom_drag(tp, rack, so, acked); 8491 } 8492 if (tp->snd_una == tp->snd_max) { 8493 /* Nothing left outstanding */ 8494 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 8495 if (rack->r_ctl.rc_went_idle_time == 0) 8496 rack->r_ctl.rc_went_idle_time = 1; 8497 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 8498 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) 8499 tp->t_acktime = 0; 8500 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 8501 /* Set need output so persist might get set */ 8502 rack->r_wanted_output = 1; 8503 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 8504 if ((tp->t_state >= TCPS_FIN_WAIT_1) && 8505 (sbavail(&so->so_snd) == 0) && 8506 (tp->t_flags2 & TF2_DROP_AF_DATA)) { 8507 /* 8508 * The socket was gone and the 8509 * peer sent data, time to 8510 * reset him. 
8511 */ 8512 *ret_val = 1; 8513 /* tcp_close will kill the inp pre-log the Reset */ 8514 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 8515 tp = tcp_close(tp); 8516 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); 8517 return (1); 8518 8519 } 8520 } 8521 if (ofia) 8522 *ofia = ourfinisacked; 8523 return (0); 8524 } 8525 8526 static void 8527 rack_collapsed_window(struct tcp_rack *rack) 8528 { 8529 /* 8530 * Now we must walk the 8531 * send map and divide the 8532 * ones left stranded. These 8533 * guys can't cause us to abort 8534 * the connection and are really 8535 * "unsent". However if a buggy 8536 * client actually did keep some 8537 * of the data i.e. collapsed the win 8538 * and refused to ack and then opened 8539 * the win and acked that data. We would 8540 * get into an ack war, the simplier 8541 * method then of just pretending we 8542 * did not send those segments something 8543 * won't work. 8544 */ 8545 struct rack_sendmap *rsm, *nrsm, fe, *insret; 8546 tcp_seq max_seq; 8547 8548 max_seq = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd; 8549 memset(&fe, 0, sizeof(fe)); 8550 fe.r_start = max_seq; 8551 /* Find the first seq past or at maxseq */ 8552 rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 8553 if (rsm == NULL) { 8554 /* Nothing to do strange */ 8555 rack->rc_has_collapsed = 0; 8556 return; 8557 } 8558 /* 8559 * Now do we need to split at 8560 * the collapse point? 8561 */ 8562 if (SEQ_GT(max_seq, rsm->r_start)) { 8563 nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT); 8564 if (nrsm == NULL) { 8565 /* We can't get a rsm, mark all? */ 8566 nrsm = rsm; 8567 goto no_split; 8568 } 8569 /* Clone it */ 8570 rack_clone_rsm(rack, nrsm, rsm, max_seq); 8571 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); 8572 #ifdef INVARIANTS 8573 if (insret != NULL) { 8574 panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", 8575 nrsm, insret, rack, rsm); 8576 } 8577 #endif 8578 if (rsm->r_in_tmap) { 8579 TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); 8580 nrsm->r_in_tmap = 1; 8581 } 8582 /* 8583 * Set in the new RSM as the 8584 * collapsed starting point 8585 */ 8586 rsm = nrsm; 8587 } 8588 no_split: 8589 counter_u64_add(rack_collapsed_win, 1); 8590 RB_FOREACH_FROM(nrsm, rack_rb_tree_head, rsm) { 8591 nrsm->r_flags |= RACK_RWND_COLLAPSED; 8592 rack->rc_has_collapsed = 1; 8593 } 8594 } 8595 8596 static void 8597 rack_un_collapse_window(struct tcp_rack *rack) 8598 { 8599 struct rack_sendmap *rsm; 8600 8601 RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { 8602 if (rsm->r_flags & RACK_RWND_COLLAPSED) 8603 rsm->r_flags &= ~RACK_RWND_COLLAPSED; 8604 else 8605 break; 8606 } 8607 rack->rc_has_collapsed = 0; 8608 } 8609 8610 static void 8611 rack_handle_delayed_ack(struct tcpcb *tp, struct tcp_rack *rack, 8612 int32_t tlen, int32_t tfo_syn) 8613 { 8614 if (DELAY_ACK(tp, tlen) || tfo_syn) { 8615 if (rack->rc_dack_mode && 8616 (tlen > 500) && 8617 (rack->rc_dack_toggle == 1)) { 8618 goto no_delayed_ack; 8619 } 8620 rack_timer_cancel(tp, rack, 8621 rack->r_ctl.rc_rcvtime, __LINE__); 8622 tp->t_flags |= TF_DELACK; 8623 } else { 8624 no_delayed_ack: 8625 rack->r_wanted_output = 1; 8626 tp->t_flags |= TF_ACKNOW; 8627 if (rack->rc_dack_mode) { 8628 if (tp->t_flags & TF_DELACK) 8629 rack->rc_dack_toggle = 1; 8630 else 8631 rack->rc_dack_toggle = 0; 8632 } 8633 } 8634 } 8635 /* 8636 * Return value of 1, the TCB is unlocked and most 8637 * likely gone, return value of 0, the TCP is still 8638 * locked. 
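 *
 * Illustrative sketch (simplified, hypothetical names, not compiled
 * here) of the window-update acceptance test applied near the top of
 * this function: a window advertisement is taken only from a segment
 * that is newer in sequence space, or equally new but carrying a newer
 * ACK, or the same ACK with a larger window (the classic RFC 793 rule).
 *
 *	#include <stdbool.h>
 *	#include <stdint.h>
 *
 *	// wrap-safe "a < b" on 32-bit sequence numbers, same idea as SEQ_LT()
 *	static bool seq_lt(uint32_t a, uint32_t b) { return ((int32_t)(a - b) < 0); }
 *
 *	static bool
 *	should_take_window(uint32_t seg_seq, uint32_t seg_ack, uint32_t seg_wnd,
 *	    uint32_t snd_wl1, uint32_t snd_wl2, uint32_t snd_wnd)
 *	{
 *		return (seq_lt(snd_wl1, seg_seq) ||
 *		    (snd_wl1 == seg_seq &&
 *		     (seq_lt(snd_wl2, seg_ack) ||
 *		      (snd_wl2 == seg_ack && seg_wnd > snd_wnd))));
 *	}
 *
 * When the test passes, snd_wnd/snd_wl1/snd_wl2 are refreshed from the
 * segment; rack also accepts a same-ACK shrink of the window so that a
 * peer pulling its window in is noticed right away.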
8639 */ 8640 static int 8641 rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, 8642 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, 8643 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) 8644 { 8645 /* 8646 * Update window information. Don't look at window if no ACK: TAC's 8647 * send garbage on first SYN. 8648 */ 8649 int32_t nsegs; 8650 int32_t tfo_syn; 8651 struct tcp_rack *rack; 8652 8653 rack = (struct tcp_rack *)tp->t_fb_ptr; 8654 INP_WLOCK_ASSERT(tp->t_inpcb); 8655 nsegs = max(1, m->m_pkthdr.lro_nsegs); 8656 if ((thflags & TH_ACK) && 8657 (SEQ_LT(tp->snd_wl1, th->th_seq) || 8658 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || 8659 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { 8660 /* keep track of pure window updates */ 8661 if (tlen == 0 && 8662 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) 8663 KMOD_TCPSTAT_INC(tcps_rcvwinupd); 8664 tp->snd_wnd = tiwin; 8665 tp->snd_wl1 = th->th_seq; 8666 tp->snd_wl2 = th->th_ack; 8667 if (tp->snd_wnd > tp->max_sndwnd) 8668 tp->max_sndwnd = tp->snd_wnd; 8669 rack->r_wanted_output = 1; 8670 } else if (thflags & TH_ACK) { 8671 if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { 8672 tp->snd_wnd = tiwin; 8673 tp->snd_wl1 = th->th_seq; 8674 tp->snd_wl2 = th->th_ack; 8675 } 8676 } 8677 if (tp->snd_wnd < ctf_outstanding(tp)) 8678 /* The peer collapsed the window */ 8679 rack_collapsed_window(rack); 8680 else if (rack->rc_has_collapsed) 8681 rack_un_collapse_window(rack); 8682 /* Was persist timer active and now we have window space? */ 8683 if ((rack->rc_in_persist != 0) && 8684 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 8685 rack->r_ctl.rc_pace_min_segs))) { 8686 rack_exit_persist(tp, rack, rack->r_ctl.rc_rcvtime); 8687 tp->snd_nxt = tp->snd_max; 8688 /* Make sure we output to start the timer */ 8689 rack->r_wanted_output = 1; 8690 } 8691 /* Do we enter persists? */ 8692 if ((rack->rc_in_persist == 0) && 8693 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 8694 TCPS_HAVEESTABLISHED(tp->t_state) && 8695 (tp->snd_max == tp->snd_una) && 8696 sbavail(&tp->t_inpcb->inp_socket->so_snd) && 8697 (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { 8698 /* 8699 * Here the rwnd is less than 8700 * the pacing size, we are established, 8701 * nothing is outstanding, and there is 8702 * data to send. Enter persists. 8703 */ 8704 tp->snd_nxt = tp->snd_una; 8705 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 8706 } 8707 if (tp->t_flags2 & TF2_DROP_AF_DATA) { 8708 m_freem(m); 8709 return (0); 8710 } 8711 /* 8712 * don't process the URG bit, ignore them drag 8713 * along the up. 8714 */ 8715 tp->rcv_up = tp->rcv_nxt; 8716 INP_WLOCK_ASSERT(tp->t_inpcb); 8717 8718 /* 8719 * Process the segment text, merging it into the TCP sequencing 8720 * queue, and arranging for acknowledgment of receipt if necessary. 8721 * This process logically involves adjusting tp->rcv_wnd as data is 8722 * presented to the user (this happens in tcp_usrreq.c, case 8723 * PRU_RCVD). If a FIN has already been received on this connection 8724 * then we just ignore the text. 
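 *
 * An illustrative sketch of the decision taken just below (hypothetical
 * helper, simplified, not compiled here): in-order data with an empty
 * reassembly queue on an established (or TFO-SYN) connection is
 * appended straight to the receive buffer; everything else goes through
 * tcp_reass() and forces an immediate ACK.
 *
 *	#include <stdbool.h>
 *	#include <stdint.h>
 *
 *	static bool
 *	can_append_directly(uint32_t seg_seq, uint32_t rcv_nxt,
 *	    bool reass_queue_empty, bool established_or_tfo_syn)
 *	{
 *		return (seg_seq == rcv_nxt && reass_queue_empty &&
 *		    established_or_tfo_syn);
 *	}
 *
 * In the direct-append case rcv_nxt advances by the segment length and
 * rack_handle_delayed_ack() chooses between TF_DELACK and TF_ACKNOW; in
 * the reassembly case tcp_reass() takes over the mbuf and TF_ACKNOW is
 * always set so the peer can fast-retransmit the missing piece.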
8725 */ 8726 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && 8727 IS_FASTOPEN(tp->t_flags)); 8728 if ((tlen || (thflags & TH_FIN) || (tfo_syn && tlen > 0)) && 8729 TCPS_HAVERCVDFIN(tp->t_state) == 0) { 8730 tcp_seq save_start = th->th_seq; 8731 tcp_seq save_rnxt = tp->rcv_nxt; 8732 int save_tlen = tlen; 8733 8734 m_adj(m, drop_hdrlen); /* delayed header drop */ 8735 /* 8736 * Insert segment which includes th into TCP reassembly 8737 * queue with control block tp. Set thflags to whether 8738 * reassembly now includes a segment with FIN. This handles 8739 * the common case inline (segment is the next to be 8740 * received on an established connection, and the queue is 8741 * empty), avoiding linkage into and removal from the queue 8742 * and repetition of various conversions. Set DELACK for 8743 * segments received in order, but ack immediately when 8744 * segments are out of order (so fast retransmit can work). 8745 */ 8746 if (th->th_seq == tp->rcv_nxt && 8747 SEGQ_EMPTY(tp) && 8748 (TCPS_HAVEESTABLISHED(tp->t_state) || 8749 tfo_syn)) { 8750 #ifdef NETFLIX_SB_LIMITS 8751 u_int mcnt, appended; 8752 8753 if (so->so_rcv.sb_shlim) { 8754 mcnt = m_memcnt(m); 8755 appended = 0; 8756 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 8757 CFO_NOSLEEP, NULL) == false) { 8758 counter_u64_add(tcp_sb_shlim_fails, 1); 8759 m_freem(m); 8760 return (0); 8761 } 8762 } 8763 #endif 8764 rack_handle_delayed_ack(tp, rack, tlen, tfo_syn); 8765 tp->rcv_nxt += tlen; 8766 if (tlen && 8767 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 8768 (tp->t_fbyte_in == 0)) { 8769 tp->t_fbyte_in = ticks; 8770 if (tp->t_fbyte_in == 0) 8771 tp->t_fbyte_in = 1; 8772 if (tp->t_fbyte_out && tp->t_fbyte_in) 8773 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 8774 } 8775 thflags = th->th_flags & TH_FIN; 8776 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 8777 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 8778 SOCKBUF_LOCK(&so->so_rcv); 8779 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 8780 m_freem(m); 8781 } else 8782 #ifdef NETFLIX_SB_LIMITS 8783 appended = 8784 #endif 8785 sbappendstream_locked(&so->so_rcv, m, 0); 8786 /* NB: sorwakeup_locked() does an implicit unlock. */ 8787 sorwakeup_locked(so); 8788 #ifdef NETFLIX_SB_LIMITS 8789 if (so->so_rcv.sb_shlim && appended != mcnt) 8790 counter_fo_release(so->so_rcv.sb_shlim, 8791 mcnt - appended); 8792 #endif 8793 } else { 8794 /* 8795 * XXX: Due to the header drop above "th" is 8796 * theoretically invalid by now. Fortunately 8797 * m_adj() doesn't actually frees any mbufs when 8798 * trimming from the head. 8799 */ 8800 tcp_seq temp = save_start; 8801 thflags = tcp_reass(tp, th, &temp, &tlen, m); 8802 tp->t_flags |= TF_ACKNOW; 8803 } 8804 if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0)) { 8805 if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { 8806 /* 8807 * DSACK actually handled in the fastpath 8808 * above. 8809 */ 8810 RACK_OPTS_INC(tcp_sack_path_1); 8811 tcp_update_sack_list(tp, save_start, 8812 save_start + save_tlen); 8813 } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { 8814 if ((tp->rcv_numsacks >= 1) && 8815 (tp->sackblks[0].end == save_start)) { 8816 /* 8817 * Partial overlap, recorded at todrop 8818 * above. 8819 */ 8820 RACK_OPTS_INC(tcp_sack_path_2a); 8821 tcp_update_sack_list(tp, 8822 tp->sackblks[0].start, 8823 tp->sackblks[0].end); 8824 } else { 8825 RACK_OPTS_INC(tcp_sack_path_2b); 8826 tcp_update_dsack_list(tp, save_start, 8827 save_start + save_tlen); 8828 } 8829 } else if (tlen >= save_tlen) { 8830 /* Update of sackblks. 
*/ 8831 RACK_OPTS_INC(tcp_sack_path_3); 8832 tcp_update_dsack_list(tp, save_start, 8833 save_start + save_tlen); 8834 } else if (tlen > 0) { 8835 RACK_OPTS_INC(tcp_sack_path_4); 8836 tcp_update_dsack_list(tp, save_start, 8837 save_start + tlen); 8838 } 8839 } 8840 } else { 8841 m_freem(m); 8842 thflags &= ~TH_FIN; 8843 } 8844 8845 /* 8846 * If FIN is received ACK the FIN and let the user know that the 8847 * connection is closing. 8848 */ 8849 if (thflags & TH_FIN) { 8850 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { 8851 socantrcvmore(so); 8852 /* 8853 * If connection is half-synchronized (ie NEEDSYN 8854 * flag on) then delay ACK, so it may be piggybacked 8855 * when SYN is sent. Otherwise, since we received a 8856 * FIN then no more input can be expected, send ACK 8857 * now. 8858 */ 8859 if (tp->t_flags & TF_NEEDSYN) { 8860 rack_timer_cancel(tp, rack, 8861 rack->r_ctl.rc_rcvtime, __LINE__); 8862 tp->t_flags |= TF_DELACK; 8863 } else { 8864 tp->t_flags |= TF_ACKNOW; 8865 } 8866 tp->rcv_nxt++; 8867 } 8868 switch (tp->t_state) { 8869 8870 /* 8871 * In SYN_RECEIVED and ESTABLISHED STATES enter the 8872 * CLOSE_WAIT state. 8873 */ 8874 case TCPS_SYN_RECEIVED: 8875 tp->t_starttime = ticks; 8876 /* FALLTHROUGH */ 8877 case TCPS_ESTABLISHED: 8878 rack_timer_cancel(tp, rack, 8879 rack->r_ctl.rc_rcvtime, __LINE__); 8880 tcp_state_change(tp, TCPS_CLOSE_WAIT); 8881 break; 8882 8883 /* 8884 * If still in FIN_WAIT_1 STATE FIN has not been 8885 * acked so enter the CLOSING state. 8886 */ 8887 case TCPS_FIN_WAIT_1: 8888 rack_timer_cancel(tp, rack, 8889 rack->r_ctl.rc_rcvtime, __LINE__); 8890 tcp_state_change(tp, TCPS_CLOSING); 8891 break; 8892 8893 /* 8894 * In FIN_WAIT_2 state enter the TIME_WAIT state, 8895 * starting the time-wait timer, turning off the 8896 * other standard timers. 8897 */ 8898 case TCPS_FIN_WAIT_2: 8899 rack_timer_cancel(tp, rack, 8900 rack->r_ctl.rc_rcvtime, __LINE__); 8901 tcp_twstart(tp); 8902 return (1); 8903 } 8904 } 8905 /* 8906 * Return any desired output. 8907 */ 8908 if ((tp->t_flags & TF_ACKNOW) || 8909 (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) { 8910 rack->r_wanted_output = 1; 8911 } 8912 INP_WLOCK_ASSERT(tp->t_inpcb); 8913 return (0); 8914 } 8915 8916 /* 8917 * Here nothing is really faster, its just that we 8918 * have broken out the fast-data path also just like 8919 * the fast-ack. 8920 */ 8921 static int 8922 rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, 8923 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 8924 uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos) 8925 { 8926 int32_t nsegs; 8927 int32_t newsize = 0; /* automatic sockbuf scaling */ 8928 struct tcp_rack *rack; 8929 #ifdef NETFLIX_SB_LIMITS 8930 u_int mcnt, appended; 8931 #endif 8932 #ifdef TCPDEBUG 8933 /* 8934 * The size of tcp_saveipgen must be the size of the max ip header, 8935 * now IPv6. 8936 */ 8937 u_char tcp_saveipgen[IP6_HDR_LEN]; 8938 struct tcphdr tcp_savetcp; 8939 short ostate = 0; 8940 8941 #endif 8942 /* 8943 * If last ACK falls within this segment's sequence numbers, record 8944 * the timestamp. NOTE that the test is modified according to the 8945 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 
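 *
 * Before that, a series of __predict_false() guards rejects anything
 * that is not a pure, in-sequence data segment.  An illustrative
 * summary of the combined test (hypothetical helper, simplified, not
 * compiled here):
 *
 *	#include <stdbool.h>
 *	#include <stdint.h>
 *
 *	static bool
 *	fast_data_path_ok(uint32_t seg_seq, uint32_t rcv_nxt,
 *	    uint32_t seg_ack, uint32_t snd_una,
 *	    bool retransmitting, bool window_changed,
 *	    bool need_syn_or_fin, bool ts_older_than_recent,
 *	    long seg_len, long rcv_buf_space)
 *	{
 *		return (seg_seq == rcv_nxt &&	// next expected byte
 *		    !retransmitting &&		// snd_nxt == snd_max
 *		    !window_changed &&		// advertised window unchanged
 *		    !need_syn_or_fin &&		// no hidden state flags
 *		    !ts_older_than_recent &&	// timestamp sanity
 *		    seg_ack == snd_una &&	// ACKs nothing new
 *		    seg_len <= rcv_buf_space);	// fits in the socket buffer
 *	}
 *
 * Failing any of these simply returns 0 and lets the caller take the
 * slow path; nothing is consumed.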
8946 */ 8947 if (__predict_false(th->th_seq != tp->rcv_nxt)) { 8948 return (0); 8949 } 8950 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 8951 return (0); 8952 } 8953 if (tiwin && tiwin != tp->snd_wnd) { 8954 return (0); 8955 } 8956 if (__predict_false((tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN)))) { 8957 return (0); 8958 } 8959 if (__predict_false((to->to_flags & TOF_TS) && 8960 (TSTMP_LT(to->to_tsval, tp->ts_recent)))) { 8961 return (0); 8962 } 8963 if (__predict_false((th->th_ack != tp->snd_una))) { 8964 return (0); 8965 } 8966 if (__predict_false(tlen > sbspace(&so->so_rcv))) { 8967 return (0); 8968 } 8969 if ((to->to_flags & TOF_TS) != 0 && 8970 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 8971 tp->ts_recent_age = tcp_ts_getticks(); 8972 tp->ts_recent = to->to_tsval; 8973 } 8974 rack = (struct tcp_rack *)tp->t_fb_ptr; 8975 /* 8976 * This is a pure, in-sequence data packet with nothing on the 8977 * reassembly queue and we have enough buffer space to take it. 8978 */ 8979 nsegs = max(1, m->m_pkthdr.lro_nsegs); 8980 8981 #ifdef NETFLIX_SB_LIMITS 8982 if (so->so_rcv.sb_shlim) { 8983 mcnt = m_memcnt(m); 8984 appended = 0; 8985 if (counter_fo_get(so->so_rcv.sb_shlim, mcnt, 8986 CFO_NOSLEEP, NULL) == false) { 8987 counter_u64_add(tcp_sb_shlim_fails, 1); 8988 m_freem(m); 8989 return (1); 8990 } 8991 } 8992 #endif 8993 /* Clean receiver SACK report if present */ 8994 if (tp->rcv_numsacks) 8995 tcp_clean_sackreport(tp); 8996 KMOD_TCPSTAT_INC(tcps_preddat); 8997 tp->rcv_nxt += tlen; 8998 if (tlen && 8999 ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) && 9000 (tp->t_fbyte_in == 0)) { 9001 tp->t_fbyte_in = ticks; 9002 if (tp->t_fbyte_in == 0) 9003 tp->t_fbyte_in = 1; 9004 if (tp->t_fbyte_out && tp->t_fbyte_in) 9005 tp->t_flags2 |= TF2_FBYTES_COMPLETE; 9006 } 9007 /* 9008 * Pull snd_wl1 up to prevent seq wrap relative to th_seq. 9009 */ 9010 tp->snd_wl1 = th->th_seq; 9011 /* 9012 * Pull rcv_up up to prevent seq wrap relative to rcv_nxt. 9013 */ 9014 tp->rcv_up = tp->rcv_nxt; 9015 KMOD_TCPSTAT_ADD(tcps_rcvpack, nsegs); 9016 KMOD_TCPSTAT_ADD(tcps_rcvbyte, tlen); 9017 #ifdef TCPDEBUG 9018 if (so->so_options & SO_DEBUG) 9019 tcp_trace(TA_INPUT, ostate, tp, 9020 (void *)tcp_saveipgen, &tcp_savetcp, 0); 9021 #endif 9022 newsize = tcp_autorcvbuf(m, th, so, tp, tlen); 9023 9024 /* Add data to socket buffer. */ 9025 SOCKBUF_LOCK(&so->so_rcv); 9026 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 9027 m_freem(m); 9028 } else { 9029 /* 9030 * Set new socket buffer size. Give up when limit is 9031 * reached. 9032 */ 9033 if (newsize) 9034 if (!sbreserve_locked(&so->so_rcv, 9035 newsize, so, NULL)) 9036 so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 9037 m_adj(m, drop_hdrlen); /* delayed header drop */ 9038 #ifdef NETFLIX_SB_LIMITS 9039 appended = 9040 #endif 9041 sbappendstream_locked(&so->so_rcv, m, 0); 9042 ctf_calc_rwin(so, tp); 9043 } 9044 /* NB: sorwakeup_locked() does an implicit unlock. */ 9045 sorwakeup_locked(so); 9046 #ifdef NETFLIX_SB_LIMITS 9047 if (so->so_rcv.sb_shlim && mcnt != appended) 9048 counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended); 9049 #endif 9050 rack_handle_delayed_ack(tp, rack, tlen, 0); 9051 if (tp->snd_una == tp->snd_max) 9052 sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); 9053 return (1); 9054 } 9055 9056 /* 9057 * This subfunction is used to try to highly optimize the 9058 * fast path. We again allow window updates that are 9059 * in sequence to remain in the fast-path. We also add 9060 * in the __predict's to attempt to help the compiler. 
9061 * Note that if we return a 0, then we can *not* process 9062 * it and the caller should push the packet into the 9063 * slow-path. 9064 */ 9065 static int 9066 rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 9067 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9068 uint32_t tiwin, int32_t nxt_pkt, uint32_t cts) 9069 { 9070 int32_t acked; 9071 int32_t nsegs; 9072 #ifdef TCPDEBUG 9073 /* 9074 * The size of tcp_saveipgen must be the size of the max ip header, 9075 * now IPv6. 9076 */ 9077 u_char tcp_saveipgen[IP6_HDR_LEN]; 9078 struct tcphdr tcp_savetcp; 9079 short ostate = 0; 9080 #endif 9081 int32_t under_pacing = 0; 9082 struct tcp_rack *rack; 9083 9084 if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { 9085 /* Old ack, behind (or duplicate to) the last one rcv'd */ 9086 return (0); 9087 } 9088 if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) { 9089 /* Above what we have sent? */ 9090 return (0); 9091 } 9092 if (__predict_false(tp->snd_nxt != tp->snd_max)) { 9093 /* We are retransmitting */ 9094 return (0); 9095 } 9096 if (__predict_false(tiwin == 0)) { 9097 /* zero window */ 9098 return (0); 9099 } 9100 if (__predict_false(tp->t_flags & (TF_NEEDSYN | TF_NEEDFIN))) { 9101 /* We need a SYN or a FIN, unlikely.. */ 9102 return (0); 9103 } 9104 if ((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) { 9105 /* Timestamp is behind .. old ack with seq wrap? */ 9106 return (0); 9107 } 9108 if (__predict_false(IN_RECOVERY(tp->t_flags))) { 9109 /* Still recovering */ 9110 return (0); 9111 } 9112 rack = (struct tcp_rack *)tp->t_fb_ptr; 9113 if (rack->r_ctl.rc_sacked) { 9114 /* We have sack holes on our scoreboard */ 9115 return (0); 9116 } 9117 /* Ok if we reach here, we can process a fast-ack */ 9118 if (rack->rc_gp_filled && 9119 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 9120 under_pacing = 1; 9121 } 9122 nsegs = max(1, m->m_pkthdr.lro_nsegs); 9123 rack_log_ack(tp, to, th); 9124 /* Did the window get updated? */ 9125 if (tiwin != tp->snd_wnd) { 9126 tp->snd_wnd = tiwin; 9127 tp->snd_wl1 = th->th_seq; 9128 if (tp->snd_wnd > tp->max_sndwnd) 9129 tp->max_sndwnd = tp->snd_wnd; 9130 } 9131 /* Do we exit persists? */ 9132 if ((rack->rc_in_persist != 0) && 9133 (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), 9134 rack->r_ctl.rc_pace_min_segs))) { 9135 rack_exit_persist(tp, rack, cts); 9136 } 9137 /* Do we enter persists? */ 9138 if ((rack->rc_in_persist == 0) && 9139 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && 9140 TCPS_HAVEESTABLISHED(tp->t_state) && 9141 (tp->snd_max == tp->snd_una) && 9142 sbavail(&tp->t_inpcb->inp_socket->so_snd) && 9143 (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { 9144 /* 9145 * Here the rwnd is less than 9146 * the pacing size, we are established, 9147 * nothing is outstanding, and there is 9148 * data to send. Enter persists. 9149 */ 9150 tp->snd_nxt = tp->snd_una; 9151 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 9152 } 9153 /* 9154 * If last ACK falls within this segment's sequence numbers, record 9155 * the timestamp. NOTE that the test is modified according to the 9156 * latest proposal of the tcplw@cray.com list (Braden 1993/04/26). 9157 */ 9158 if ((to->to_flags & TOF_TS) != 0 && 9159 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { 9160 tp->ts_recent_age = tcp_ts_getticks(); 9161 tp->ts_recent = to->to_tsval; 9162 } 9163 /* 9164 * This is a pure ack for outstanding data. 
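 *
 * An illustrative sketch of the accounting done below (simplified,
 * hypothetical names, not compiled here): the number of newly acked
 * bytes is the distance from snd_una to the ACK, those bytes are
 * dropped from the send buffer, and snd_una/snd_wl2 are pulled forward.
 *
 *	#include <stdint.h>
 *
 *	static uint32_t
 *	pure_ack_advance(uint32_t seg_ack, uint32_t *snd_una,
 *	    uint32_t *snd_wl2, int *dupacks)
 *	{
 *		uint32_t acked = seg_ack - *snd_una;	// BYTES_THIS_ACK()
 *
 *		// caller drops 'acked' bytes from the send socket buffer
 *		*snd_una = seg_ack;
 *		*snd_wl2 = seg_ack;
 *		*dupacks = 0;
 *		return (acked);
 *	}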
9165 */ 9166 KMOD_TCPSTAT_INC(tcps_predack); 9167 9168 /* 9169 * "bad retransmit" recovery. 9170 */ 9171 if (tp->t_flags & TF_PREVVALID) { 9172 tp->t_flags &= ~TF_PREVVALID; 9173 if (tp->t_rxtshift == 1 && 9174 (int)(ticks - tp->t_badrxtwin) < 0) 9175 rack_cong_signal(tp, th, CC_RTO_ERR); 9176 } 9177 /* 9178 * Recalculate the transmit timer / rtt. 9179 * 9180 * Some boxes send broken timestamp replies during the SYN+ACK 9181 * phase, ignore timestamps of 0 or we could calculate a huge RTT 9182 * and blow up the retransmit timer. 9183 */ 9184 acked = BYTES_THIS_ACK(tp, th); 9185 9186 #ifdef TCP_HHOOK 9187 /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ 9188 hhook_run_tcp_est_in(tp, th, to); 9189 #endif 9190 9191 KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); 9192 KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); 9193 sbdrop(&so->so_snd, acked); 9194 if (acked) { 9195 /* assure we are not backed off */ 9196 tp->t_rxtshift = 0; 9197 rack->rc_tlp_in_progress = 0; 9198 rack->r_ctl.rc_tlp_cnt_out = 0; 9199 /* 9200 * If it is the RXT timer we want to 9201 * stop it, so we can restart a TLP. 9202 */ 9203 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) 9204 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 9205 #ifdef NETFLIX_HTTP_LOGGING 9206 tcp_http_check_for_comp(rack->rc_tp, th->th_ack); 9207 #endif 9208 } 9209 /* 9210 * Let the congestion control algorithm update congestion control 9211 * related information. This typically means increasing the 9212 * congestion window. 9213 */ 9214 rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0); 9215 9216 tp->snd_una = th->th_ack; 9217 if (tp->snd_wnd < ctf_outstanding(tp)) { 9218 /* The peer collapsed the window */ 9219 rack_collapsed_window(rack); 9220 } else if (rack->rc_has_collapsed) 9221 rack_un_collapse_window(rack); 9222 9223 /* 9224 * Pull snd_wl2 up to prevent seq wrap relative to th_ack. 9225 */ 9226 tp->snd_wl2 = th->th_ack; 9227 tp->t_dupacks = 0; 9228 m_freem(m); 9229 /* ND6_HINT(tp); *//* Some progress has been made. */ 9230 9231 /* 9232 * If all outstanding data are acked, stop retransmit timer, 9233 * otherwise restart timer using current (possibly backed-off) 9234 * value. If process is waiting for space, wakeup/selwakeup/signal. 9235 * If data are ready to send, let tcp_output decide between more 9236 * output or persist. 9237 */ 9238 #ifdef TCPDEBUG 9239 if (so->so_options & SO_DEBUG) 9240 tcp_trace(TA_INPUT, ostate, tp, 9241 (void *)tcp_saveipgen, 9242 &tcp_savetcp, 0); 9243 #endif 9244 if (under_pacing && 9245 (rack->use_fixed_rate == 0) && 9246 (rack->in_probe_rtt == 0) && 9247 rack->rc_gp_dyn_mul && 9248 rack->rc_always_pace) { 9249 /* Check if we are dragging bottom */ 9250 rack_check_bottom_drag(tp, rack, so, acked); 9251 } 9252 if (tp->snd_una == tp->snd_max) { 9253 rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); 9254 if (rack->r_ctl.rc_went_idle_time == 0) 9255 rack->r_ctl.rc_went_idle_time = 1; 9256 rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); 9257 if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) 9258 tp->t_acktime = 0; 9259 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 9260 } 9261 /* Wake up the socket if we have room to write more */ 9262 sowwakeup(so); 9263 if (sbavail(&so->so_snd)) { 9264 rack->r_wanted_output = 1; 9265 } 9266 return (1); 9267 } 9268 9269 /* 9270 * Return value of 1, the TCB is unlocked and most 9271 * likely gone, return value of 0, the TCP is still 9272 * locked. 
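 *
 * An illustrative sketch of the first check below (simplified,
 * hypothetical names, not compiled here): in SYN_SENT an arriving ACK
 * is acceptable only if it acknowledges our SYN, i.e. it must fall in
 * (iss, snd_max]; anything else is answered with a RST.
 *
 *	#include <stdbool.h>
 *	#include <stdint.h>
 *
 *	static bool seq_leq(uint32_t a, uint32_t b) { return ((int32_t)(a - b) <= 0); }
 *	static bool seq_gt(uint32_t a, uint32_t b) { return ((int32_t)(a - b) > 0); }
 *
 *	static bool
 *	syn_sent_ack_acceptable(uint32_t seg_ack, uint32_t iss, uint32_t snd_max)
 *	{
 *		return (!(seq_leq(seg_ack, iss) || seq_gt(seg_ack, snd_max)));
 *	}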
9273 */ 9274 static int 9275 rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, 9276 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9277 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9278 { 9279 int32_t ret_val = 0; 9280 int32_t todrop; 9281 int32_t ourfinisacked = 0; 9282 struct tcp_rack *rack; 9283 9284 ctf_calc_rwin(so, tp); 9285 /* 9286 * If the state is SYN_SENT: if seg contains an ACK, but not for our 9287 * SYN, drop the input. if seg contains a RST, then drop the 9288 * connection. if seg does not contain SYN, then drop it. Otherwise 9289 * this is an acceptable SYN segment initialize tp->rcv_nxt and 9290 * tp->irs if seg contains ack then advance tp->snd_una if seg 9291 * contains an ECE and ECN support is enabled, the stream is ECN 9292 * capable. if SYN has been acked change to ESTABLISHED else 9293 * SYN_RCVD state arrange for segment to be acked (eventually) 9294 * continue processing rest of data/controls. 9295 */ 9296 if ((thflags & TH_ACK) && 9297 (SEQ_LEQ(th->th_ack, tp->iss) || 9298 SEQ_GT(th->th_ack, tp->snd_max))) { 9299 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 9300 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9301 return (1); 9302 } 9303 if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { 9304 TCP_PROBE5(connect__refused, NULL, tp, 9305 mtod(m, const char *), tp, th); 9306 tp = tcp_drop(tp, ECONNREFUSED); 9307 ctf_do_drop(m, tp); 9308 return (1); 9309 } 9310 if (thflags & TH_RST) { 9311 ctf_do_drop(m, tp); 9312 return (1); 9313 } 9314 if (!(thflags & TH_SYN)) { 9315 ctf_do_drop(m, tp); 9316 return (1); 9317 } 9318 tp->irs = th->th_seq; 9319 tcp_rcvseqinit(tp); 9320 rack = (struct tcp_rack *)tp->t_fb_ptr; 9321 if (thflags & TH_ACK) { 9322 int tfo_partial = 0; 9323 9324 KMOD_TCPSTAT_INC(tcps_connects); 9325 soisconnected(so); 9326 #ifdef MAC 9327 mac_socketpeer_set_from_mbuf(m, so); 9328 #endif 9329 /* Do window scaling on this connection? */ 9330 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 9331 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 9332 tp->rcv_scale = tp->request_r_scale; 9333 } 9334 tp->rcv_adv += min(tp->rcv_wnd, 9335 TCP_MAXWIN << tp->rcv_scale); 9336 /* 9337 * If not all the data that was sent in the TFO SYN 9338 * has been acked, resend the remainder right away. 9339 */ 9340 if (IS_FASTOPEN(tp->t_flags) && 9341 (tp->snd_una != tp->snd_max)) { 9342 tp->snd_nxt = th->th_ack; 9343 tfo_partial = 1; 9344 } 9345 /* 9346 * If there's data, delay ACK; if there's also a FIN ACKNOW 9347 * will be turned on later. 9348 */ 9349 if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial) { 9350 rack_timer_cancel(tp, rack, 9351 rack->r_ctl.rc_rcvtime, __LINE__); 9352 tp->t_flags |= TF_DELACK; 9353 } else { 9354 rack->r_wanted_output = 1; 9355 tp->t_flags |= TF_ACKNOW; 9356 rack->rc_dack_toggle = 0; 9357 } 9358 if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) && 9359 (V_tcp_do_ecn == 1)) { 9360 tp->t_flags2 |= TF2_ECN_PERMIT; 9361 KMOD_TCPSTAT_INC(tcps_ecn_shs); 9362 } 9363 if (SEQ_GT(th->th_ack, tp->snd_una)) { 9364 /* 9365 * We advance snd_una for the 9366 * fast open case. If th_ack is 9367 * acknowledging data beyond 9368 * snd_una we can't just call 9369 * ack-processing since the 9370 * data stream in our send-map 9371 * will start at snd_una + 1 (one 9372 * beyond the SYN). If its just 9373 * equal we don't need to do that 9374 * and there is no send_map. 9375 */ 9376 tp->snd_una++; 9377 } 9378 /* 9379 * Received <SYN,ACK> in SYN_SENT[*] state. 
Transitions: 9380 * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 9381 */ 9382 tp->t_starttime = ticks; 9383 if (tp->t_flags & TF_NEEDFIN) { 9384 tcp_state_change(tp, TCPS_FIN_WAIT_1); 9385 tp->t_flags &= ~TF_NEEDFIN; 9386 thflags &= ~TH_SYN; 9387 } else { 9388 tcp_state_change(tp, TCPS_ESTABLISHED); 9389 TCP_PROBE5(connect__established, NULL, tp, 9390 mtod(m, const char *), tp, th); 9391 rack_cc_conn_init(tp); 9392 } 9393 } else { 9394 /* 9395 * Received initial SYN in SYN-SENT[*] state => simultaneous 9396 * open. If segment contains CC option and there is a 9397 * cached CC, apply TAO test. If it succeeds, connection is * 9398 * half-synchronized. Otherwise, do 3-way handshake: 9399 * SYN-SENT -> SYN-RECEIVED SYN-SENT* -> SYN-RECEIVED* If 9400 * there was no CC option, clear cached CC value. 9401 */ 9402 tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); 9403 tcp_state_change(tp, TCPS_SYN_RECEIVED); 9404 } 9405 INP_WLOCK_ASSERT(tp->t_inpcb); 9406 /* 9407 * Advance th->th_seq to correspond to first data byte. If data, 9408 * trim to stay within window, dropping FIN if necessary. 9409 */ 9410 th->th_seq++; 9411 if (tlen > tp->rcv_wnd) { 9412 todrop = tlen - tp->rcv_wnd; 9413 m_adj(m, -todrop); 9414 tlen = tp->rcv_wnd; 9415 thflags &= ~TH_FIN; 9416 KMOD_TCPSTAT_INC(tcps_rcvpackafterwin); 9417 KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); 9418 } 9419 tp->snd_wl1 = th->th_seq - 1; 9420 tp->rcv_up = th->th_seq; 9421 /* 9422 * Client side of transaction: already sent SYN and data. If the 9423 * remote host used T/TCP to validate the SYN, our data will be 9424 * ACK'd; if so, enter normal data segment processing in the middle 9425 * of step 5, ack processing. Otherwise, goto step 6. 9426 */ 9427 if (thflags & TH_ACK) { 9428 /* For syn-sent we need to possibly update the rtt */ 9429 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 9430 uint32_t t; 9431 9432 t = tcp_ts_getticks() - to->to_tsecr; 9433 if (!tp->t_rttlow || tp->t_rttlow > t) 9434 tp->t_rttlow = t; 9435 tcp_rack_xmit_timer(rack, t + 1, 1, (t * HPTS_USEC_IN_MSEC), 0, NULL, 2); 9436 tcp_rack_xmit_timer_commit(rack, tp); 9437 } 9438 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) 9439 return (ret_val); 9440 /* We may have changed to FIN_WAIT_1 above */ 9441 if (tp->t_state == TCPS_FIN_WAIT_1) { 9442 /* 9443 * In FIN_WAIT_1 STATE in addition to the processing 9444 * for the ESTABLISHED state if our FIN is now 9445 * acknowledged then enter FIN_WAIT_2. 9446 */ 9447 if (ourfinisacked) { 9448 /* 9449 * If we can't receive any more data, then 9450 * closing user can proceed. Starting the 9451 * timer is contrary to the specification, 9452 * but if we don't get a FIN we'll hang 9453 * forever. 9454 * 9455 * XXXjl: we should release the tp also, and 9456 * use a compressed state. 9457 */ 9458 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 9459 soisdisconnected(so); 9460 tcp_timer_activate(tp, TT_2MSL, 9461 (tcp_fast_finwait2_recycle ? 9462 tcp_finwait2_timeout : 9463 TP_MAXIDLE(tp))); 9464 } 9465 tcp_state_change(tp, TCPS_FIN_WAIT_2); 9466 } 9467 } 9468 } 9469 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9470 tiwin, thflags, nxt_pkt)); 9471 } 9472 9473 /* 9474 * Return value of 1, the TCB is unlocked and most 9475 * likely gone, return value of 0, the TCP is still 9476 * locked. 
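 *
 * Both this handler and the SYN_SENT one above seed the RTT machinery
 * from the timestamp echo when one is present.  An illustrative sketch
 * of that sample (simplified, hypothetical names, not compiled here):
 *
 *	#include <stdint.h>
 *
 *	static uint32_t
 *	rtt_from_ts_echo(uint32_t now_ticks, uint32_t ts_ecr, uint32_t *rttlow)
 *	{
 *		uint32_t t = now_ticks - ts_ecr;	// ticks since we sent it
 *
 *		if (*rttlow == 0 || *rttlow > t)
 *			*rttlow = t;			// track the lowest RTT seen
 *		return (t);				// fed to the rack xmit timer
 *	}
 *
 * The real code only does this when TOF_TS is set and to_tsecr is
 * non-zero, since some peers echo zero timestamps during the handshake.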
9477 */ 9478 static int 9479 rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so, 9480 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9481 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9482 { 9483 struct tcp_rack *rack; 9484 int32_t ret_val = 0; 9485 int32_t ourfinisacked = 0; 9486 9487 ctf_calc_rwin(so, tp); 9488 if ((thflags & TH_ACK) && 9489 (SEQ_LEQ(th->th_ack, tp->snd_una) || 9490 SEQ_GT(th->th_ack, tp->snd_max))) { 9491 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 9492 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9493 return (1); 9494 } 9495 rack = (struct tcp_rack *)tp->t_fb_ptr; 9496 if (IS_FASTOPEN(tp->t_flags)) { 9497 /* 9498 * When a TFO connection is in SYN_RECEIVED, the 9499 * only valid packets are the initial SYN, a 9500 * retransmit/copy of the initial SYN (possibly with 9501 * a subset of the original data), a valid ACK, a 9502 * FIN, or a RST. 9503 */ 9504 if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { 9505 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 9506 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9507 return (1); 9508 } else if (thflags & TH_SYN) { 9509 /* non-initial SYN is ignored */ 9510 if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || 9511 (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || 9512 (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { 9513 ctf_do_drop(m, NULL); 9514 return (0); 9515 } 9516 } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { 9517 ctf_do_drop(m, NULL); 9518 return (0); 9519 } 9520 } 9521 if ((thflags & TH_RST) || 9522 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9523 return (ctf_process_rst(m, th, so, tp)); 9524 /* 9525 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9526 * it's less than ts_recent, drop it. 9527 */ 9528 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9529 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9530 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9531 return (ret_val); 9532 } 9533 /* 9534 * In the SYN-RECEIVED state, validate that the packet belongs to 9535 * this connection before trimming the data to fit the receive 9536 * window. Check the sequence number versus IRS since we know the 9537 * sequence numbers haven't wrapped. This is a partial fix for the 9538 * "LAND" DoS attack. 9539 */ 9540 if (SEQ_LT(th->th_seq, tp->irs)) { 9541 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 9542 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9543 return (1); 9544 } 9545 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9546 return (ret_val); 9547 } 9548 /* 9549 * If last ACK falls within this segment's sequence numbers, record 9550 * its timestamp. NOTE: 1) That the test incorporates suggestions 9551 * from the latest proposal of the tcplw@cray.com list (Braden 9552 * 1993/04/26). 2) That updating only on newer timestamps interferes 9553 * with our earlier PAWS tests, so this check should be solely 9554 * predicated on the sequence space of this segment. 3) That we 9555 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9556 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9557 * SEG.Len, This modified check allows us to overcome RFC1323's 9558 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9559 * p.869. In such cases, we can still calculate the RTT correctly 9560 * when RCV.NXT == Last.ACK.Sent. 
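 *
 * An illustrative sketch of that modified boundary check (simplified,
 * hypothetical names, not compiled here); SYN and FIN each occupy one
 * unit of sequence space, hence the extra +1:
 *
 *	#include <stdbool.h>
 *	#include <stdint.h>
 *
 *	static bool seq_leq(uint32_t a, uint32_t b) { return ((int32_t)(a - b) <= 0); }
 *
 *	static bool
 *	should_record_ts(uint32_t seg_seq, uint32_t seg_len, bool syn_or_fin,
 *	    uint32_t last_ack_sent)
 *	{
 *		return (seq_leq(seg_seq, last_ack_sent) &&
 *		    seq_leq(last_ack_sent,
 *			seg_seq + seg_len + (syn_or_fin ? 1 : 0)));
 *	}
 *
 * When this holds, ts_recent is refreshed from the segment's TSval and
 * ts_recent_age from the current timestamp clock.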
9561 */ 9562 if ((to->to_flags & TOF_TS) != 0 && 9563 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9564 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9565 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9566 tp->ts_recent_age = tcp_ts_getticks(); 9567 tp->ts_recent = to->to_tsval; 9568 } 9569 tp->snd_wnd = tiwin; 9570 /* 9571 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9572 * is on (half-synchronized state), then queue data for later 9573 * processing; else drop segment and return. 9574 */ 9575 if ((thflags & TH_ACK) == 0) { 9576 if (IS_FASTOPEN(tp->t_flags)) { 9577 rack_cc_conn_init(tp); 9578 } 9579 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9580 tiwin, thflags, nxt_pkt)); 9581 } 9582 KMOD_TCPSTAT_INC(tcps_connects); 9583 soisconnected(so); 9584 /* Do window scaling? */ 9585 if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == 9586 (TF_RCVD_SCALE | TF_REQ_SCALE)) { 9587 tp->rcv_scale = tp->request_r_scale; 9588 } 9589 /* 9590 * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> 9591 * FIN-WAIT-1 9592 */ 9593 tp->t_starttime = ticks; 9594 if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { 9595 tcp_fastopen_decrement_counter(tp->t_tfo_pending); 9596 tp->t_tfo_pending = NULL; 9597 } 9598 if (tp->t_flags & TF_NEEDFIN) { 9599 tcp_state_change(tp, TCPS_FIN_WAIT_1); 9600 tp->t_flags &= ~TF_NEEDFIN; 9601 } else { 9602 tcp_state_change(tp, TCPS_ESTABLISHED); 9603 TCP_PROBE5(accept__established, NULL, tp, 9604 mtod(m, const char *), tp, th); 9605 /* 9606 * TFO connections call cc_conn_init() during SYN 9607 * processing. Calling it again here for such connections 9608 * is not harmless as it would undo the snd_cwnd reduction 9609 * that occurs when a TFO SYN|ACK is retransmitted. 9610 */ 9611 if (!IS_FASTOPEN(tp->t_flags)) 9612 rack_cc_conn_init(tp); 9613 } 9614 /* 9615 * Account for the ACK of our SYN prior to 9616 * regular ACK processing below, except for 9617 * simultaneous SYN, which is handled later. 9618 */ 9619 if (SEQ_GT(th->th_ack, tp->snd_una) && !(tp->t_flags & TF_NEEDSYN)) 9620 tp->snd_una++; 9621 /* 9622 * If segment contains data or ACK, will call tcp_reass() later; if 9623 * not, do so now to pass queued data to user. 9624 */ 9625 if (tlen == 0 && (thflags & TH_FIN) == 0) 9626 (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, 9627 (struct mbuf *)0); 9628 tp->snd_wl1 = th->th_seq - 1; 9629 /* For syn-recv we need to possibly update the rtt */ 9630 if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { 9631 uint32_t t; 9632 9633 t = tcp_ts_getticks() - to->to_tsecr; 9634 if (!tp->t_rttlow || tp->t_rttlow > t) 9635 tp->t_rttlow = t; 9636 tcp_rack_xmit_timer(rack, t + 1, 1, (t * HPTS_USEC_IN_MSEC), 0, NULL, 2); 9637 tcp_rack_xmit_timer_commit(rack, tp); 9638 } 9639 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 9640 return (ret_val); 9641 } 9642 if (tp->t_state == TCPS_FIN_WAIT_1) { 9643 /* We could have went to FIN_WAIT_1 (or EST) above */ 9644 /* 9645 * In FIN_WAIT_1 STATE in addition to the processing for the 9646 * ESTABLISHED state if our FIN is now acknowledged then 9647 * enter FIN_WAIT_2. 9648 */ 9649 if (ourfinisacked) { 9650 /* 9651 * If we can't receive any more data, then closing 9652 * user can proceed. Starting the timer is contrary 9653 * to the specification, but if we don't get a FIN 9654 * we'll hang forever. 9655 * 9656 * XXXjl: we should release the tp also, and use a 9657 * compressed state. 
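 *
 * (Illustrative note: when the receive side has already been shut down,
 * the code below marks the socket disconnected and arms the 2MSL timer
 * with either the short tcp_finwait2_timeout, if fast FIN_WAIT_2
 * recycling is enabled, or the full TP_MAXIDLE() value, so a peer that
 * never sends its FIN cannot pin the connection indefinitely.)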
9658 */ 9659 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 9660 soisdisconnected(so); 9661 tcp_timer_activate(tp, TT_2MSL, 9662 (tcp_fast_finwait2_recycle ? 9663 tcp_finwait2_timeout : 9664 TP_MAXIDLE(tp))); 9665 } 9666 tcp_state_change(tp, TCPS_FIN_WAIT_2); 9667 } 9668 } 9669 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9670 tiwin, thflags, nxt_pkt)); 9671 } 9672 9673 /* 9674 * Return value of 1, the TCB is unlocked and most 9675 * likely gone, return value of 0, the TCP is still 9676 * locked. 9677 */ 9678 static int 9679 rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, 9680 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9681 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9682 { 9683 int32_t ret_val = 0; 9684 struct tcp_rack *rack; 9685 9686 /* 9687 * Header prediction: check for the two common cases of a 9688 * uni-directional data xfer. If the packet has no control flags, 9689 * is in-sequence, the window didn't change and we're not 9690 * retransmitting, it's a candidate. If the length is zero and the 9691 * ack moved forward, we're the sender side of the xfer. Just free 9692 * the data acked & wake any higher level process that was blocked 9693 * waiting for space. If the length is non-zero and the ack didn't 9694 * move, we're the receiver side. If we're getting packets in-order 9695 * (the reassembly queue is empty), add the data toc The socket 9696 * buffer and note that we need a delayed ack. Make sure that the 9697 * hidden state-flags are also off. Since we check for 9698 * TCPS_ESTABLISHED first, it can only be TH_NEEDSYN. 9699 */ 9700 rack = (struct tcp_rack *)tp->t_fb_ptr; 9701 if (__predict_true(((to->to_flags & TOF_SACK) == 0)) && 9702 __predict_true((thflags & (TH_SYN | TH_FIN | TH_RST | TH_ACK)) == TH_ACK) && 9703 __predict_true(SEGQ_EMPTY(tp)) && 9704 __predict_true(th->th_seq == tp->rcv_nxt)) { 9705 if (tlen == 0) { 9706 if (rack_fastack(m, th, so, tp, to, drop_hdrlen, tlen, 9707 tiwin, nxt_pkt, rack->r_ctl.rc_rcvtime)) { 9708 return (0); 9709 } 9710 } else { 9711 if (rack_do_fastnewdata(m, th, so, tp, to, drop_hdrlen, tlen, 9712 tiwin, nxt_pkt, iptos)) { 9713 return (0); 9714 } 9715 } 9716 } 9717 ctf_calc_rwin(so, tp); 9718 9719 if ((thflags & TH_RST) || 9720 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9721 return (ctf_process_rst(m, th, so, tp)); 9722 9723 /* 9724 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 9725 * synchronized state. 9726 */ 9727 if (thflags & TH_SYN) { 9728 ctf_challenge_ack(m, th, tp, &ret_val); 9729 return (ret_val); 9730 } 9731 /* 9732 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9733 * it's less than ts_recent, drop it. 9734 */ 9735 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9736 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9737 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9738 return (ret_val); 9739 } 9740 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9741 return (ret_val); 9742 } 9743 /* 9744 * If last ACK falls within this segment's sequence numbers, record 9745 * its timestamp. NOTE: 1) That the test incorporates suggestions 9746 * from the latest proposal of the tcplw@cray.com list (Braden 9747 * 1993/04/26). 2) That updating only on newer timestamps interferes 9748 * with our earlier PAWS tests, so this check should be solely 9749 * predicated on the sequence space of this segment. 
3) That we 9750 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9751 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9752 * SEG.Len, This modified check allows us to overcome RFC1323's 9753 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9754 * p.869. In such cases, we can still calculate the RTT correctly 9755 * when RCV.NXT == Last.ACK.Sent. 9756 */ 9757 if ((to->to_flags & TOF_TS) != 0 && 9758 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9759 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9760 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9761 tp->ts_recent_age = tcp_ts_getticks(); 9762 tp->ts_recent = to->to_tsval; 9763 } 9764 /* 9765 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9766 * is on (half-synchronized state), then queue data for later 9767 * processing; else drop segment and return. 9768 */ 9769 if ((thflags & TH_ACK) == 0) { 9770 if (tp->t_flags & TF_NEEDSYN) { 9771 9772 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9773 tiwin, thflags, nxt_pkt)); 9774 9775 } else if (tp->t_flags & TF_ACKNOW) { 9776 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 9777 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output= 1; 9778 return (ret_val); 9779 } else { 9780 ctf_do_drop(m, NULL); 9781 return (0); 9782 } 9783 } 9784 /* 9785 * Ack processing. 9786 */ 9787 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 9788 return (ret_val); 9789 } 9790 if (sbavail(&so->so_snd)) { 9791 if (ctf_progress_timeout_check(tp, true)) { 9792 rack_log_progress_event(rack, tp, tick, PROGRESS_DROP, __LINE__); 9793 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 9794 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9795 return (1); 9796 } 9797 } 9798 /* State changes only happen in rack_process_data() */ 9799 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9800 tiwin, thflags, nxt_pkt)); 9801 } 9802 9803 /* 9804 * Return value of 1, the TCB is unlocked and most 9805 * likely gone, return value of 0, the TCP is still 9806 * locked. 9807 */ 9808 static int 9809 rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, 9810 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9811 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9812 { 9813 int32_t ret_val = 0; 9814 9815 ctf_calc_rwin(so, tp); 9816 if ((thflags & TH_RST) || 9817 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9818 return (ctf_process_rst(m, th, so, tp)); 9819 /* 9820 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 9821 * synchronized state. 9822 */ 9823 if (thflags & TH_SYN) { 9824 ctf_challenge_ack(m, th, tp, &ret_val); 9825 return (ret_val); 9826 } 9827 /* 9828 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9829 * it's less than ts_recent, drop it. 9830 */ 9831 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9832 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9833 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9834 return (ret_val); 9835 } 9836 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9837 return (ret_val); 9838 } 9839 /* 9840 * If last ACK falls within this segment's sequence numbers, record 9841 * its timestamp. NOTE: 1) That the test incorporates suggestions 9842 * from the latest proposal of the tcplw@cray.com list (Braden 9843 * 1993/04/26). 
2) That updating only on newer timestamps interferes 9844 * with our earlier PAWS tests, so this check should be solely 9845 * predicated on the sequence space of this segment. 3) That we 9846 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9847 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9848 * SEG.Len, This modified check allows us to overcome RFC1323's 9849 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9850 * p.869. In such cases, we can still calculate the RTT correctly 9851 * when RCV.NXT == Last.ACK.Sent. 9852 */ 9853 if ((to->to_flags & TOF_TS) != 0 && 9854 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9855 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9856 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9857 tp->ts_recent_age = tcp_ts_getticks(); 9858 tp->ts_recent = to->to_tsval; 9859 } 9860 /* 9861 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9862 * is on (half-synchronized state), then queue data for later 9863 * processing; else drop segment and return. 9864 */ 9865 if ((thflags & TH_ACK) == 0) { 9866 if (tp->t_flags & TF_NEEDSYN) { 9867 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9868 tiwin, thflags, nxt_pkt)); 9869 9870 } else if (tp->t_flags & TF_ACKNOW) { 9871 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 9872 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 9873 return (ret_val); 9874 } else { 9875 ctf_do_drop(m, NULL); 9876 return (0); 9877 } 9878 } 9879 /* 9880 * Ack processing. 9881 */ 9882 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { 9883 return (ret_val); 9884 } 9885 if (sbavail(&so->so_snd)) { 9886 if (ctf_progress_timeout_check(tp, true)) { 9887 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 9888 tp, tick, PROGRESS_DROP, __LINE__); 9889 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 9890 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 9891 return (1); 9892 } 9893 } 9894 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 9895 tiwin, thflags, nxt_pkt)); 9896 } 9897 9898 static int 9899 rack_check_data_after_close(struct mbuf *m, 9900 struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so) 9901 { 9902 struct tcp_rack *rack; 9903 9904 rack = (struct tcp_rack *)tp->t_fb_ptr; 9905 if (rack->rc_allow_data_af_clo == 0) { 9906 close_now: 9907 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 9908 /* tcp_close will kill the inp pre-log the Reset */ 9909 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 9910 tp = tcp_close(tp); 9911 KMOD_TCPSTAT_INC(tcps_rcvafterclose); 9912 ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); 9913 return (1); 9914 } 9915 if (sbavail(&so->so_snd) == 0) 9916 goto close_now; 9917 /* Ok we allow data that is ignored and a followup reset */ 9918 tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE); 9919 tp->rcv_nxt = th->th_seq + *tlen; 9920 tp->t_flags2 |= TF2_DROP_AF_DATA; 9921 rack->r_wanted_output = 1; 9922 *tlen = 0; 9923 return (0); 9924 } 9925 9926 /* 9927 * Return value of 1, the TCB is unlocked and most 9928 * likely gone, return value of 0, the TCP is still 9929 * locked. 
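 *
 * FIN_WAIT_1 (and the later states below) may still see data arriving
 * after the owning process has gone away; that case is routed through
 * rack_check_data_after_close() above.  An illustrative sketch of that
 * decision (simplified, hypothetical names, not compiled here):
 *
 *	#include <stdbool.h>
 *
 *	// Returns true when the connection should be torn down with a RST:
 *	// either data-after-close is not allowed at all, or nothing is
 *	// left in the send buffer worth delivering.  Otherwise the data is
 *	// swallowed (rcv_nxt advanced, tlen zeroed) and a follow-up reset
 *	// is arranged via the drop-after-data flag.
 *	static bool
 *	must_reset_on_data_after_close(bool allow_data_after_close,
 *	    long send_buf_bytes)
 *	{
 *		return (!allow_data_after_close || send_buf_bytes == 0);
 *	}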
9930 */ 9931 static int 9932 rack_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so, 9933 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 9934 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 9935 { 9936 int32_t ret_val = 0; 9937 int32_t ourfinisacked = 0; 9938 9939 ctf_calc_rwin(so, tp); 9940 9941 if ((thflags & TH_RST) || 9942 (tp->t_fin_is_rst && (thflags & TH_FIN))) 9943 return (ctf_process_rst(m, th, so, tp)); 9944 /* 9945 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 9946 * synchronized state. 9947 */ 9948 if (thflags & TH_SYN) { 9949 ctf_challenge_ack(m, th, tp, &ret_val); 9950 return (ret_val); 9951 } 9952 /* 9953 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 9954 * it's less than ts_recent, drop it. 9955 */ 9956 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 9957 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 9958 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 9959 return (ret_val); 9960 } 9961 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 9962 return (ret_val); 9963 } 9964 /* 9965 * If new data are received on a connection after the user processes 9966 * are gone, then RST the other end. 9967 */ 9968 if ((so->so_state & SS_NOFDREF) && tlen) { 9969 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 9970 return (1); 9971 } 9972 /* 9973 * If last ACK falls within this segment's sequence numbers, record 9974 * its timestamp. NOTE: 1) That the test incorporates suggestions 9975 * from the latest proposal of the tcplw@cray.com list (Braden 9976 * 1993/04/26). 2) That updating only on newer timestamps interferes 9977 * with our earlier PAWS tests, so this check should be solely 9978 * predicated on the sequence space of this segment. 3) That we 9979 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 9980 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 9981 * SEG.Len, This modified check allows us to overcome RFC1323's 9982 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 9983 * p.869. In such cases, we can still calculate the RTT correctly 9984 * when RCV.NXT == Last.ACK.Sent. 9985 */ 9986 if ((to->to_flags & TOF_TS) != 0 && 9987 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 9988 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 9989 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 9990 tp->ts_recent_age = tcp_ts_getticks(); 9991 tp->ts_recent = to->to_tsval; 9992 } 9993 /* 9994 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 9995 * is on (half-synchronized state), then queue data for later 9996 * processing; else drop segment and return. 9997 */ 9998 if ((thflags & TH_ACK) == 0) { 9999 if (tp->t_flags & TF_NEEDSYN) { 10000 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10001 tiwin, thflags, nxt_pkt)); 10002 } else if (tp->t_flags & TF_ACKNOW) { 10003 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 10004 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 10005 return (ret_val); 10006 } else { 10007 ctf_do_drop(m, NULL); 10008 return (0); 10009 } 10010 } 10011 /* 10012 * Ack processing. 10013 */ 10014 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 10015 return (ret_val); 10016 } 10017 if (ourfinisacked) { 10018 /* 10019 * If we can't receive any more data, then closing user can 10020 * proceed. Starting the timer is contrary to the 10021 * specification, but if we don't get a FIN we'll hang 10022 * forever. 
10023 * 10024 * XXXjl: we should release the tp also, and use a 10025 * compressed state. 10026 */ 10027 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { 10028 soisdisconnected(so); 10029 tcp_timer_activate(tp, TT_2MSL, 10030 (tcp_fast_finwait2_recycle ? 10031 tcp_finwait2_timeout : 10032 TP_MAXIDLE(tp))); 10033 } 10034 tcp_state_change(tp, TCPS_FIN_WAIT_2); 10035 } 10036 if (sbavail(&so->so_snd)) { 10037 if (ctf_progress_timeout_check(tp, true)) { 10038 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 10039 tp, tick, PROGRESS_DROP, __LINE__); 10040 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 10041 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 10042 return (1); 10043 } 10044 } 10045 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10046 tiwin, thflags, nxt_pkt)); 10047 } 10048 10049 /* 10050 * Return value of 1, the TCB is unlocked and most 10051 * likely gone, return value of 0, the TCP is still 10052 * locked. 10053 */ 10054 static int 10055 rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, 10056 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 10057 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 10058 { 10059 int32_t ret_val = 0; 10060 int32_t ourfinisacked = 0; 10061 10062 ctf_calc_rwin(so, tp); 10063 10064 if ((thflags & TH_RST) || 10065 (tp->t_fin_is_rst && (thflags & TH_FIN))) 10066 return (ctf_process_rst(m, th, so, tp)); 10067 /* 10068 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 10069 * synchronized state. 10070 */ 10071 if (thflags & TH_SYN) { 10072 ctf_challenge_ack(m, th, tp, &ret_val); 10073 return (ret_val); 10074 } 10075 /* 10076 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 10077 * it's less than ts_recent, drop it. 10078 */ 10079 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 10080 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 10081 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 10082 return (ret_val); 10083 } 10084 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 10085 return (ret_val); 10086 } 10087 /* 10088 * If new data are received on a connection after the user processes 10089 * are gone, then RST the other end. 10090 */ 10091 if ((so->so_state & SS_NOFDREF) && tlen) { 10092 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 10093 return (1); 10094 } 10095 /* 10096 * If last ACK falls within this segment's sequence numbers, record 10097 * its timestamp. NOTE: 1) That the test incorporates suggestions 10098 * from the latest proposal of the tcplw@cray.com list (Braden 10099 * 1993/04/26). 2) That updating only on newer timestamps interferes 10100 * with our earlier PAWS tests, so this check should be solely 10101 * predicated on the sequence space of this segment. 3) That we 10102 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 10103 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 10104 * SEG.Len, This modified check allows us to overcome RFC1323's 10105 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 10106 * p.869. In such cases, we can still calculate the RTT correctly 10107 * when RCV.NXT == Last.ACK.Sent. 
10108 */ 10109 if ((to->to_flags & TOF_TS) != 0 && 10110 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 10111 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 10112 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 10113 tp->ts_recent_age = tcp_ts_getticks(); 10114 tp->ts_recent = to->to_tsval; 10115 } 10116 /* 10117 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 10118 * is on (half-synchronized state), then queue data for later 10119 * processing; else drop segment and return. 10120 */ 10121 if ((thflags & TH_ACK) == 0) { 10122 if (tp->t_flags & TF_NEEDSYN) { 10123 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10124 tiwin, thflags, nxt_pkt)); 10125 } else if (tp->t_flags & TF_ACKNOW) { 10126 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 10127 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output= 1; 10128 return (ret_val); 10129 } else { 10130 ctf_do_drop(m, NULL); 10131 return (0); 10132 } 10133 } 10134 /* 10135 * Ack processing. 10136 */ 10137 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 10138 return (ret_val); 10139 } 10140 if (ourfinisacked) { 10141 tcp_twstart(tp); 10142 m_freem(m); 10143 return (1); 10144 } 10145 if (sbavail(&so->so_snd)) { 10146 if (ctf_progress_timeout_check(tp, true)) { 10147 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 10148 tp, tick, PROGRESS_DROP, __LINE__); 10149 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 10150 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 10151 return (1); 10152 } 10153 } 10154 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10155 tiwin, thflags, nxt_pkt)); 10156 } 10157 10158 /* 10159 * Return value of 1, the TCB is unlocked and most 10160 * likely gone, return value of 0, the TCP is still 10161 * locked. 10162 */ 10163 static int 10164 rack_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so, 10165 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 10166 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 10167 { 10168 int32_t ret_val = 0; 10169 int32_t ourfinisacked = 0; 10170 10171 ctf_calc_rwin(so, tp); 10172 10173 if ((thflags & TH_RST) || 10174 (tp->t_fin_is_rst && (thflags & TH_FIN))) 10175 return (ctf_process_rst(m, th, so, tp)); 10176 /* 10177 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 10178 * synchronized state. 10179 */ 10180 if (thflags & TH_SYN) { 10181 ctf_challenge_ack(m, th, tp, &ret_val); 10182 return (ret_val); 10183 } 10184 /* 10185 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 10186 * it's less than ts_recent, drop it. 10187 */ 10188 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 10189 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 10190 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 10191 return (ret_val); 10192 } 10193 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 10194 return (ret_val); 10195 } 10196 /* 10197 * If new data are received on a connection after the user processes 10198 * are gone, then RST the other end. 10199 */ 10200 if ((so->so_state & SS_NOFDREF) && tlen) { 10201 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 10202 return (1); 10203 } 10204 /* 10205 * If last ACK falls within this segment's sequence numbers, record 10206 * its timestamp. NOTE: 1) That the test incorporates suggestions 10207 * from the latest proposal of the tcplw@cray.com list (Braden 10208 * 1993/04/26). 
2) That updating only on newer timestamps interferes 10209 * with our earlier PAWS tests, so this check should be solely 10210 * predicated on the sequence space of this segment. 3) That we 10211 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 10212 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 10213 * SEG.Len, This modified check allows us to overcome RFC1323's 10214 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 10215 * p.869. In such cases, we can still calculate the RTT correctly 10216 * when RCV.NXT == Last.ACK.Sent. 10217 */ 10218 if ((to->to_flags & TOF_TS) != 0 && 10219 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 10220 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 10221 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 10222 tp->ts_recent_age = tcp_ts_getticks(); 10223 tp->ts_recent = to->to_tsval; 10224 } 10225 /* 10226 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 10227 * is on (half-synchronized state), then queue data for later 10228 * processing; else drop segment and return. 10229 */ 10230 if ((thflags & TH_ACK) == 0) { 10231 if (tp->t_flags & TF_NEEDSYN) { 10232 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10233 tiwin, thflags, nxt_pkt)); 10234 } else if (tp->t_flags & TF_ACKNOW) { 10235 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 10236 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 10237 return (ret_val); 10238 } else { 10239 ctf_do_drop(m, NULL); 10240 return (0); 10241 } 10242 } 10243 /* 10244 * case TCPS_LAST_ACK: Ack processing. 10245 */ 10246 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 10247 return (ret_val); 10248 } 10249 if (ourfinisacked) { 10250 tp = tcp_close(tp); 10251 ctf_do_drop(m, tp); 10252 return (1); 10253 } 10254 if (sbavail(&so->so_snd)) { 10255 if (ctf_progress_timeout_check(tp, true)) { 10256 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 10257 tp, tick, PROGRESS_DROP, __LINE__); 10258 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 10259 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 10260 return (1); 10261 } 10262 } 10263 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10264 tiwin, thflags, nxt_pkt)); 10265 } 10266 10267 10268 /* 10269 * Return value of 1, the TCB is unlocked and most 10270 * likely gone, return value of 0, the TCP is still 10271 * locked. 10272 */ 10273 static int 10274 rack_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so, 10275 struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, 10276 uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) 10277 { 10278 int32_t ret_val = 0; 10279 int32_t ourfinisacked = 0; 10280 10281 ctf_calc_rwin(so, tp); 10282 10283 /* Reset receive buffer auto scaling when not in bulk receive mode. */ 10284 if ((thflags & TH_RST) || 10285 (tp->t_fin_is_rst && (thflags & TH_FIN))) 10286 return (ctf_process_rst(m, th, so, tp)); 10287 /* 10288 * RFC5961 Section 4.2 Send challenge ACK for any SYN in 10289 * synchronized state. 10290 */ 10291 if (thflags & TH_SYN) { 10292 ctf_challenge_ack(m, th, tp, &ret_val); 10293 return (ret_val); 10294 } 10295 /* 10296 * RFC 1323 PAWS: If we have a timestamp reply on this segment and 10297 * it's less than ts_recent, drop it. 
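 * (For example, if ts_recent is 5000 and the arriving segment carries a
 * tsval of 4990, it is treated as an old segment and handed to
 * ctf_ts_check() below; the numbers are purely illustrative.)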
10298 */ 10299 if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && 10300 TSTMP_LT(to->to_tsval, tp->ts_recent)) { 10301 if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) 10302 return (ret_val); 10303 } 10304 if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { 10305 return (ret_val); 10306 } 10307 /* 10308 * If new data are received on a connection after the user processes 10309 * are gone, then RST the other end. 10310 */ 10311 if ((so->so_state & SS_NOFDREF) && 10312 tlen) { 10313 if (rack_check_data_after_close(m, tp, &tlen, th, so)) 10314 return (1); 10315 } 10316 /* 10317 * If last ACK falls within this segment's sequence numbers, record 10318 * its timestamp. NOTE: 1) That the test incorporates suggestions 10319 * from the latest proposal of the tcplw@cray.com list (Braden 10320 * 1993/04/26). 2) That updating only on newer timestamps interferes 10321 * with our earlier PAWS tests, so this check should be solely 10322 * predicated on the sequence space of this segment. 3) That we 10323 * modify the segment boundary check to be Last.ACK.Sent <= SEG.SEQ 10324 * + SEG.Len instead of RFC1323's Last.ACK.Sent < SEG.SEQ + 10325 * SEG.Len, This modified check allows us to overcome RFC1323's 10326 * limitations as described in Stevens TCP/IP Illustrated Vol. 2 10327 * p.869. In such cases, we can still calculate the RTT correctly 10328 * when RCV.NXT == Last.ACK.Sent. 10329 */ 10330 if ((to->to_flags & TOF_TS) != 0 && 10331 SEQ_LEQ(th->th_seq, tp->last_ack_sent) && 10332 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + 10333 ((thflags & (TH_SYN | TH_FIN)) != 0))) { 10334 tp->ts_recent_age = tcp_ts_getticks(); 10335 tp->ts_recent = to->to_tsval; 10336 } 10337 /* 10338 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag 10339 * is on (half-synchronized state), then queue data for later 10340 * processing; else drop segment and return. 10341 */ 10342 if ((thflags & TH_ACK) == 0) { 10343 if (tp->t_flags & TF_NEEDSYN) { 10344 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10345 tiwin, thflags, nxt_pkt)); 10346 } else if (tp->t_flags & TF_ACKNOW) { 10347 ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); 10348 ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; 10349 return (ret_val); 10350 } else { 10351 ctf_do_drop(m, NULL); 10352 return (0); 10353 } 10354 } 10355 /* 10356 * Ack processing. 
10357 */ 10358 if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { 10359 return (ret_val); 10360 } 10361 if (sbavail(&so->so_snd)) { 10362 if (ctf_progress_timeout_check(tp, true)) { 10363 rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, 10364 tp, tick, PROGRESS_DROP, __LINE__); 10365 tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); 10366 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 10367 return (1); 10368 } 10369 } 10370 return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, 10371 tiwin, thflags, nxt_pkt)); 10372 } 10373 10374 static void inline 10375 rack_clear_rate_sample(struct tcp_rack *rack) 10376 { 10377 rack->r_ctl.rack_rs.rs_flags = RACK_RTT_EMPTY; 10378 rack->r_ctl.rack_rs.rs_rtt_cnt = 0; 10379 rack->r_ctl.rack_rs.rs_rtt_tot = 0; 10380 } 10381 10382 static void 10383 rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line) 10384 { 10385 uint64_t bw_est, rate_wanted; 10386 uint32_t tls_seg = 0; 10387 int chged = 0; 10388 uint32_t user_max; 10389 10390 user_max = ctf_fixed_maxseg(tp) * rack->rc_user_set_max_segs; 10391 #ifdef KERN_TLS 10392 if (rack->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) { 10393 tls_seg = ctf_get_opt_tls_size(rack->rc_inp->inp_socket, rack->rc_tp->snd_wnd); 10394 if (tls_seg != rack->r_ctl.rc_pace_min_segs) 10395 chged = 1; 10396 rack->r_ctl.rc_pace_min_segs = tls_seg; 10397 } else 10398 #endif 10399 { 10400 if (ctf_fixed_maxseg(tp) != rack->r_ctl.rc_pace_min_segs) 10401 chged = 1; 10402 rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp); 10403 } 10404 if (rack->use_fixed_rate || rack->rc_force_max_seg) { 10405 if (user_max != rack->r_ctl.rc_pace_max_segs) 10406 chged = 1; 10407 } 10408 if (rack->rc_force_max_seg) { 10409 rack->r_ctl.rc_pace_max_segs = user_max; 10410 } else if (rack->use_fixed_rate) { 10411 bw_est = rack_get_bw(rack); 10412 if ((rack->r_ctl.crte == NULL) || 10413 (bw_est != rack->r_ctl.crte->rate)) { 10414 rack->r_ctl.rc_pace_max_segs = user_max; 10415 } else { 10416 /* We are pacing right at the hardware rate */ 10417 uint32_t segsiz; 10418 10419 segsiz = min(ctf_fixed_maxseg(tp), 10420 rack->r_ctl.rc_pace_min_segs); 10421 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size( 10422 bw_est, segsiz, 0, 10423 rack->r_ctl.crte, NULL); 10424 } 10425 } else if (rack->rc_always_pace) { 10426 if (rack->r_ctl.gp_bw || 10427 #ifdef NETFLIX_PEAKRATE 10428 rack->rc_tp->t_maxpeakrate || 10429 #endif 10430 rack->r_ctl.init_rate) { 10431 /* We have a rate of some sort set */ 10432 uint32_t orig; 10433 10434 bw_est = rack_get_bw(rack); 10435 orig = rack->r_ctl.rc_pace_max_segs; 10436 rate_wanted = rack_get_output_bw(rack, bw_est, NULL); 10437 if (rate_wanted) { 10438 /* We have something */ 10439 rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, 10440 rate_wanted, 10441 ctf_fixed_maxseg(rack->rc_tp)); 10442 } else 10443 rack->r_ctl.rc_pace_max_segs = rack->r_ctl.rc_pace_min_segs; 10444 if (orig != rack->r_ctl.rc_pace_max_segs) 10445 chged = 1; 10446 } else if ((rack->r_ctl.gp_bw == 0) && 10447 (rack->r_ctl.rc_pace_max_segs == 0)) { 10448 /* 10449 * If we have nothing limit us to bursting 10450 * out IW sized pieces. 
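 * (Rough illustration, assuming the usual initial window of about 10
 * segments and a 1460-byte MSS: rc_init_window() would then cap pacing
 * bursts near 14600 bytes; the exact figure depends on the configured
 * initial-window settings and the connection's MSS.)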
10451 */
10452 chged = 1;
10453 rack->r_ctl.rc_pace_max_segs = rc_init_window(rack);
10454 }
10455 }
10456 if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) {
10457 chged = 1;
10458 rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES;
10459 }
10460 #ifdef KERN_TLS
10461 uint32_t orig;
10462 
10463 if (tls_seg != 0) {
10464 orig = rack->r_ctl.rc_pace_max_segs;
10465 if (rack_hw_tls_max_seg > 1) {
10466 rack->r_ctl.rc_pace_max_segs /= tls_seg;
10467 if (rack_hw_tls_max_seg > rack->r_ctl.rc_pace_max_segs)
10468 rack->r_ctl.rc_pace_max_segs = rack_hw_tls_max_seg;
10469 } else {
10470 rack->r_ctl.rc_pace_max_segs = 1;
10471 }
10472 if (rack->r_ctl.rc_pace_max_segs == 0)
10473 rack->r_ctl.rc_pace_max_segs = 1;
10474 rack->r_ctl.rc_pace_max_segs *= tls_seg;
10475 if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) {
10476 /* We can't go over the max bytes (usually 64k) */
10477 rack->r_ctl.rc_pace_max_segs = ((PACE_MAX_IP_BYTES / tls_seg) * tls_seg);
10478 }
10479 if (orig != rack->r_ctl.rc_pace_max_segs)
10480 chged = 1;
10481 }
10482 #endif
10483 if (chged)
10484 rack_log_type_hrdwtso(tp, rack, tls_seg, rack->rc_inp->inp_socket->so_snd.sb_flags, line, 2);
10485 }
10486 
10487 static int
10488 rack_init(struct tcpcb *tp)
10489 {
10490 struct tcp_rack *rack = NULL;
10491 struct rack_sendmap *insret;
10492 uint32_t iwin, snt, us_cts;
10493 
10494 tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
10495 if (tp->t_fb_ptr == NULL) {
10496 /*
10497 * We need to allocate memory but can't. The INP and INP_INFO
10498 * locks are held and they are recursive (this happens during
10499 * setup), so a scheme to drop the locks fails :(
10500 *
10501 */
10502 return (ENOMEM);
10503 }
10504 memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack));
10505 
10506 rack = (struct tcp_rack *)tp->t_fb_ptr;
10507 RB_INIT(&rack->r_ctl.rc_mtree);
10508 TAILQ_INIT(&rack->r_ctl.rc_free);
10509 TAILQ_INIT(&rack->r_ctl.rc_tmap);
10510 rack->rc_tp = tp;
10511 if (tp->t_inpcb) {
10512 rack->rc_inp = tp->t_inpcb;
10513 }
10514 /* Probably not needed but let's be sure */
10515 rack_clear_rate_sample(rack);
10516 rack->r_ctl.rc_reorder_fade = rack_reorder_fade;
10517 rack->rc_allow_data_af_clo = rack_ignore_data_after_close;
10518 rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh;
10519 if (use_rack_rr)
10520 rack->use_rack_rr = 1;
10521 if (V_tcp_delack_enabled)
10522 tp->t_delayed_ack = 1;
10523 else
10524 tp->t_delayed_ack = 0;
10525 if (rack_enable_shared_cwnd)
10526 rack->rack_enable_scwnd = 1;
10527 rack->rc_user_set_max_segs = rack_hptsi_segments;
10528 rack->rc_force_max_seg = 0;
10529 if (rack_use_imac_dack)
10530 rack->rc_dack_mode = 1;
10531 rack->r_ctl.rc_reorder_shift = rack_reorder_thresh;
10532 rack->r_ctl.rc_pkt_delay = rack_pkt_delay;
10533 rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce;
10534 rack->r_ctl.rc_prop_rate = rack_proportional_rate;
10535 rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp;
10536 rack->r_ctl.rc_early_recovery = rack_early_recovery;
10537 rack->r_ctl.rc_lowest_us_rtt = 0xffffffff;
10538 rack->r_ctl.rc_highest_us_rtt = 0;
10539 if (rack_disable_prr)
10540 rack->rack_no_prr = 1;
10541 if (rack_gp_no_rec_chg)
10542 rack->rc_gp_no_rec_chg = 1;
10543 rack->rc_always_pace = rack_pace_every_seg;
10544 if (rack_enable_mqueue_for_nonpaced)
10545 rack->r_mbuf_queue = 1;
10546 else
10547 rack->r_mbuf_queue = 0;
10548 if (rack->r_mbuf_queue || rack->rc_always_pace)
10549 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
10550 else
10551 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
10552 
rack_set_pace_segments(tp, rack, __LINE__); 10553 if (rack_limits_scwnd) 10554 rack->r_limit_scw = 1; 10555 else 10556 rack->r_limit_scw = 0; 10557 rack->r_ctl.rc_high_rwnd = tp->snd_wnd; 10558 rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 10559 rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method; 10560 rack->rack_tlp_threshold_use = rack_tlp_threshold_use; 10561 rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; 10562 rack->r_ctl.rc_min_to = rack_min_to; 10563 microuptime(&rack->r_ctl.act_rcv_time); 10564 rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; 10565 rack->r_running_late = 0; 10566 rack->r_running_early = 0; 10567 rack->rc_init_win = rack_default_init_window; 10568 rack->r_ctl.rack_per_of_gp_ss = rack_per_of_gp_ss; 10569 if (rack_do_dyn_mul) { 10570 /* When dynamic adjustment is on CA needs to start at 100% */ 10571 rack->rc_gp_dyn_mul = 1; 10572 if (rack_do_dyn_mul >= 100) 10573 rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul; 10574 } else 10575 rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca; 10576 rack->r_ctl.rack_per_of_gp_rec = rack_per_of_gp_rec; 10577 rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; 10578 rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time); 10579 setup_time_filter_small(&rack->r_ctl.rc_gp_min_rtt, FILTER_TYPE_MIN, 10580 rack_probertt_filter_life); 10581 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 10582 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 10583 rack->r_ctl.rc_time_of_last_probertt = us_cts; 10584 rack->r_ctl.rc_time_probertt_starts = 0; 10585 /* Do we force on detection? */ 10586 #ifdef NETFLIX_EXP_DETECTION 10587 if (tcp_force_detection) 10588 rack->do_detection = 1; 10589 else 10590 #endif 10591 rack->do_detection = 0; 10592 if (rack_non_rxt_use_cr) 10593 rack->rack_rec_nonrxt_use_cr = 1; 10594 if (tp->snd_una != tp->snd_max) { 10595 /* Create a send map for the current outstanding data */ 10596 struct rack_sendmap *rsm; 10597 10598 rsm = rack_alloc(rack); 10599 if (rsm == NULL) { 10600 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 10601 tp->t_fb_ptr = NULL; 10602 return (ENOMEM); 10603 } 10604 rsm->r_flags = RACK_OVERMAX; 10605 rsm->r_tim_lastsent[0] = rack->r_ctl.rc_tlp_rxt_last_time; 10606 rsm->r_rtr_cnt = 1; 10607 rsm->r_rtr_bytes = 0; 10608 rsm->r_start = tp->snd_una; 10609 rsm->r_end = tp->snd_max; 10610 rsm->usec_orig_send = us_cts; 10611 rsm->r_dupack = 0; 10612 insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 10613 #ifdef INVARIANTS 10614 if (insret != NULL) { 10615 panic("Insert in rb tree fails ret:%p rack:%p rsm:%p", 10616 insret, rack, rsm); 10617 } 10618 #endif 10619 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); 10620 rsm->r_in_tmap = 1; 10621 } 10622 /* Cancel the GP measurement in progress */ 10623 tp->t_flags &= ~TF_GPUTINPROG; 10624 if (SEQ_GT(tp->snd_max, tp->iss)) 10625 snt = tp->snd_max - tp->iss; 10626 else 10627 snt = 0; 10628 iwin = rc_init_window(rack); 10629 if (snt < iwin) { 10630 /* We are not past the initial window 10631 * so we need to make sure cwnd is 10632 * correct. 10633 */ 10634 if (tp->snd_cwnd < iwin) 10635 tp->snd_cwnd = iwin; 10636 /* 10637 * If we are within the initial window 10638 * we want ssthresh to be unlimited. Setting 10639 * it to the rwnd (which the default stack does 10640 * and older racks) is not really a good idea 10641 * since we want to be in SS and grow both the 10642 * cwnd and the rwnd (via dynamic rwnd growth). 
If 10643 * we set it to the rwnd then as the peer grows its 10644 * rwnd we will be stuck in CA and never hit SS. 10645 * 10646 * Its far better to raise it up high (this takes the 10647 * risk that there as been a loss already, probably 10648 * we should have an indicator in all stacks of loss 10649 * but we don't), but considering the normal use this 10650 * is a risk worth taking. The consequences of not 10651 * hitting SS are far worse than going one more time 10652 * into it early on (before we have sent even a IW). 10653 * It is highly unlikely that we will have had a loss 10654 * before getting the IW out. 10655 */ 10656 tp->snd_ssthresh = 0xffffffff; 10657 } 10658 rack_stop_all_timers(tp); 10659 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); 10660 rack_log_rtt_shrinks(rack, us_cts, 0, 10661 __LINE__, RACK_RTTS_INIT); 10662 return (0); 10663 } 10664 10665 static int 10666 rack_handoff_ok(struct tcpcb *tp) 10667 { 10668 if ((tp->t_state == TCPS_CLOSED) || 10669 (tp->t_state == TCPS_LISTEN)) { 10670 /* Sure no problem though it may not stick */ 10671 return (0); 10672 } 10673 if ((tp->t_state == TCPS_SYN_SENT) || 10674 (tp->t_state == TCPS_SYN_RECEIVED)) { 10675 /* 10676 * We really don't know you have to get to ESTAB or beyond 10677 * to tell. 10678 */ 10679 return (EAGAIN); 10680 } 10681 if ((tp->t_flags & TF_SACK_PERMIT) || rack_sack_not_required){ 10682 return (0); 10683 } 10684 /* 10685 * If we reach here we don't do SACK on this connection so we can 10686 * never do rack. 10687 */ 10688 return (EINVAL); 10689 } 10690 10691 static void 10692 rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) 10693 { 10694 if (tp->t_fb_ptr) { 10695 struct tcp_rack *rack; 10696 struct rack_sendmap *rsm, *nrsm, *rm; 10697 10698 rack = (struct tcp_rack *)tp->t_fb_ptr; 10699 #ifdef NETFLIX_SHARED_CWND 10700 if (rack->r_ctl.rc_scw) { 10701 uint32_t limit; 10702 10703 if (rack->r_limit_scw) 10704 limit = max(1, rack->r_ctl.rc_lowest_us_rtt); 10705 else 10706 limit = 0; 10707 tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw, 10708 rack->r_ctl.rc_scw_index, 10709 limit); 10710 rack->r_ctl.rc_scw = NULL; 10711 } 10712 #endif 10713 /* rack does not use force data but other stacks may clear it */ 10714 tp->t_flags &= ~TF_FORCEDATA; 10715 if (tp->t_inpcb) { 10716 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 10717 tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY; 10718 tp->t_inpcb->inp_flags2 &= ~INP_DONT_SACK_QUEUE; 10719 } 10720 #ifdef TCP_BLACKBOX 10721 tcp_log_flowend(tp); 10722 #endif 10723 RB_FOREACH_SAFE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm) { 10724 rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); 10725 #ifdef INVARIANTS 10726 if (rm != rsm) { 10727 panic("At fini, rack:%p rsm:%p rm:%p", 10728 rack, rsm, rm); 10729 } 10730 #endif 10731 uma_zfree(rack_zone, rsm); 10732 } 10733 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 10734 while (rsm) { 10735 TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); 10736 uma_zfree(rack_zone, rsm); 10737 rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); 10738 } 10739 rack->rc_free_cnt = 0; 10740 uma_zfree(rack_pcb_zone, tp->t_fb_ptr); 10741 tp->t_fb_ptr = NULL; 10742 } 10743 /* Cancel the GP measurement in progress */ 10744 tp->t_flags &= ~TF_GPUTINPROG; 10745 /* Make sure snd_nxt is correctly set */ 10746 tp->snd_nxt = tp->snd_max; 10747 } 10748 10749 10750 static void 10751 rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) 10752 { 10753 switch (tp->t_state) { 10754 case TCPS_SYN_SENT: 10755 rack->r_state = TCPS_SYN_SENT; 10756 
rack->r_substate = rack_do_syn_sent; 10757 break; 10758 case TCPS_SYN_RECEIVED: 10759 rack->r_state = TCPS_SYN_RECEIVED; 10760 rack->r_substate = rack_do_syn_recv; 10761 break; 10762 case TCPS_ESTABLISHED: 10763 rack_set_pace_segments(tp, rack, __LINE__); 10764 rack->r_state = TCPS_ESTABLISHED; 10765 rack->r_substate = rack_do_established; 10766 break; 10767 case TCPS_CLOSE_WAIT: 10768 rack->r_state = TCPS_CLOSE_WAIT; 10769 rack->r_substate = rack_do_close_wait; 10770 break; 10771 case TCPS_FIN_WAIT_1: 10772 rack->r_state = TCPS_FIN_WAIT_1; 10773 rack->r_substate = rack_do_fin_wait_1; 10774 break; 10775 case TCPS_CLOSING: 10776 rack->r_state = TCPS_CLOSING; 10777 rack->r_substate = rack_do_closing; 10778 break; 10779 case TCPS_LAST_ACK: 10780 rack->r_state = TCPS_LAST_ACK; 10781 rack->r_substate = rack_do_lastack; 10782 break; 10783 case TCPS_FIN_WAIT_2: 10784 rack->r_state = TCPS_FIN_WAIT_2; 10785 rack->r_substate = rack_do_fin_wait_2; 10786 break; 10787 case TCPS_LISTEN: 10788 case TCPS_CLOSED: 10789 case TCPS_TIME_WAIT: 10790 default: 10791 break; 10792 }; 10793 } 10794 10795 10796 static void 10797 rack_timer_audit(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb) 10798 { 10799 /* 10800 * We received an ack, and then did not 10801 * call send or were bounced out due to the 10802 * hpts was running. Now a timer is up as well, is 10803 * it the right timer? 10804 */ 10805 struct rack_sendmap *rsm; 10806 int tmr_up; 10807 10808 tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; 10809 if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT)) 10810 return; 10811 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 10812 if (((rsm == NULL) || (tp->t_state < TCPS_ESTABLISHED)) && 10813 (tmr_up == PACE_TMR_RXT)) { 10814 /* Should be an RXT */ 10815 return; 10816 } 10817 if (rsm == NULL) { 10818 /* Nothing outstanding? */ 10819 if (tp->t_flags & TF_DELACK) { 10820 if (tmr_up == PACE_TMR_DELACK) 10821 /* We are supposed to have delayed ack up and we do */ 10822 return; 10823 } else if (sbavail(&tp->t_inpcb->inp_socket->so_snd) && (tmr_up == PACE_TMR_RXT)) { 10824 /* 10825 * if we hit enobufs then we would expect the possiblity 10826 * of nothing outstanding and the RXT up (and the hptsi timer). 10827 */ 10828 return; 10829 } else if (((V_tcp_always_keepalive || 10830 rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 10831 (tp->t_state <= TCPS_CLOSING)) && 10832 (tmr_up == PACE_TMR_KEEP) && 10833 (tp->snd_max == tp->snd_una)) { 10834 /* We should have keep alive up and we do */ 10835 return; 10836 } 10837 } 10838 if (SEQ_GT(tp->snd_max, tp->snd_una) && 10839 ((tmr_up == PACE_TMR_TLP) || 10840 (tmr_up == PACE_TMR_RACK) || 10841 (tmr_up == PACE_TMR_RXT))) { 10842 /* 10843 * Either a Rack, TLP or RXT is fine if we 10844 * have outstanding data. 10845 */ 10846 return; 10847 } else if (tmr_up == PACE_TMR_DELACK) { 10848 /* 10849 * If the delayed ack was going to go off 10850 * before the rtx/tlp/rack timer were going to 10851 * expire, then that would be the timer in control. 10852 * Note we don't check the time here trusting the 10853 * code is correct. 10854 */ 10855 return; 10856 } 10857 /* 10858 * Ok the timer originally started is not what we want now. 10859 * We will force the hpts to be stopped if any, and restart 10860 * with the slot set to what was in the saved slot. 
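 * (Descriptive note on the block below: if the saved pacing deadline
 * rc_last_output_to is still in the future when we pull the connection
 * off the hpts, the unused time is credited to rc_agg_early so later
 * pacing decisions can account for being taken off the wheel early.)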
10861 */ 10862 if (rack->rc_inp->inp_in_hpts) { 10863 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 10864 uint32_t us_cts; 10865 10866 us_cts = tcp_get_usecs(NULL); 10867 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 10868 rack->r_early = 1; 10869 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); 10870 } 10871 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 10872 } 10873 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); 10874 } 10875 rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); 10876 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); 10877 } 10878 10879 static int 10880 rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, 10881 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, 10882 int32_t nxt_pkt, struct timeval *tv) 10883 { 10884 int32_t thflags, retval, did_out = 0; 10885 int32_t way_out = 0; 10886 uint32_t cts; 10887 uint32_t tiwin; 10888 struct timespec ts; 10889 struct tcpopt to; 10890 struct tcp_rack *rack; 10891 struct rack_sendmap *rsm; 10892 int32_t prev_state = 0; 10893 uint32_t us_cts; 10894 /* 10895 * tv passed from common code is from either M_TSTMP_LRO or 10896 * tcp_get_usecs() if no LRO m_pkthdr timestamp is present. The 10897 * rack_pacing stack assumes tv always refers to 'now', so we overwrite 10898 * tv here to guarantee that. 10899 */ 10900 if (m->m_flags & M_TSTMP_LRO) 10901 tcp_get_usecs(tv); 10902 10903 cts = tcp_tv_to_mssectick(tv); 10904 rack = (struct tcp_rack *)tp->t_fb_ptr; 10905 10906 if ((m->m_flags & M_TSTMP) || 10907 (m->m_flags & M_TSTMP_LRO)) { 10908 mbuf_tstmp2timespec(m, &ts); 10909 rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; 10910 rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; 10911 } else 10912 rack->r_ctl.act_rcv_time = *tv; 10913 kern_prefetch(rack, &prev_state); 10914 prev_state = 0; 10915 thflags = th->th_flags; 10916 10917 NET_EPOCH_ASSERT(); 10918 INP_WLOCK_ASSERT(tp->t_inpcb); 10919 KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", 10920 __func__)); 10921 KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", 10922 __func__)); 10923 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 10924 union tcp_log_stackspecific log; 10925 struct timeval ltv; 10926 #ifdef NETFLIX_HTTP_LOGGING 10927 struct http_sendfile_track *http_req; 10928 10929 if (SEQ_GT(th->th_ack, tp->snd_una)) { 10930 http_req = tcp_http_find_req_for_seq(tp, (th->th_ack-1)); 10931 } else { 10932 http_req = tcp_http_find_req_for_seq(tp, th->th_ack); 10933 } 10934 #endif 10935 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 10936 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 10937 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 10938 if (rack->rack_no_prr == 0) 10939 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 10940 else 10941 log.u_bbr.flex1 = 0; 10942 log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; 10943 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 10944 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; 10945 log.u_bbr.flex3 = m->m_flags; 10946 log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; 10947 if (m->m_flags & M_TSTMP) { 10948 /* Record the hardware timestamp if present */ 10949 mbuf_tstmp2timespec(m, &ts); 10950 ltv.tv_sec = ts.tv_sec; 10951 ltv.tv_usec = ts.tv_nsec / 1000; 10952 log.u_bbr.lt_epoch = tcp_tv_to_usectick(<v); 10953 } else if (m->m_flags & M_TSTMP_LRO) { 10954 /* Record the LRO the arrival timestamp */ 10955 mbuf_tstmp2timespec(m, &ts); 10956 ltv.tv_sec = ts.tv_sec; 10957 ltv.tv_usec = ts.tv_nsec / 1000; 10958 log.u_bbr.flex5 = 
tcp_tv_to_usectick(<v); 10959 } 10960 log.u_bbr.timeStamp = tcp_get_usecs(<v); 10961 /* Log the rcv time */ 10962 log.u_bbr.delRate = m->m_pkthdr.rcv_tstmp; 10963 #ifdef NETFLIX_HTTP_LOGGING 10964 log.u_bbr.applimited = tp->t_http_closed; 10965 log.u_bbr.applimited <<= 8; 10966 log.u_bbr.applimited |= tp->t_http_open; 10967 log.u_bbr.applimited <<= 8; 10968 log.u_bbr.applimited |= tp->t_http_req; 10969 if (http_req) { 10970 /* Copy out any client req info */ 10971 /* seconds */ 10972 log.u_bbr.pkt_epoch = (http_req->localtime / HPTS_USEC_IN_SEC); 10973 /* useconds */ 10974 log.u_bbr.delivered = (http_req->localtime % HPTS_USEC_IN_SEC); 10975 log.u_bbr.rttProp = http_req->timestamp; 10976 log.u_bbr.cur_del_rate = http_req->start; 10977 if (http_req->flags & TCP_HTTP_TRACK_FLG_OPEN) { 10978 log.u_bbr.flex8 |= 1; 10979 } else { 10980 log.u_bbr.flex8 |= 2; 10981 log.u_bbr.bw_inuse = http_req->end; 10982 } 10983 log.u_bbr.flex6 = http_req->start_seq; 10984 if (http_req->flags & TCP_HTTP_TRACK_FLG_COMP) { 10985 log.u_bbr.flex8 |= 4; 10986 log.u_bbr.epoch = http_req->end_seq; 10987 } 10988 } 10989 #endif 10990 TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, 10991 tlen, &log, true, <v); 10992 } 10993 if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { 10994 way_out = 4; 10995 retval = 0; 10996 goto done_with_input; 10997 } 10998 /* 10999 * If a segment with the ACK-bit set arrives in the SYN-SENT state 11000 * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9. 11001 */ 11002 if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && 11003 (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { 11004 tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); 11005 ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); 11006 return(1); 11007 } 11008 /* 11009 * Segment received on connection. Reset idle time and keep-alive 11010 * timer. XXX: This should be done after segment validation to 11011 * ignore broken/spoofed segs. 11012 */ 11013 if (tp->t_idle_reduce && 11014 (tp->snd_max == tp->snd_una) && 11015 ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { 11016 counter_u64_add(rack_input_idle_reduces, 1); 11017 rack_cc_after_idle(rack, tp); 11018 } 11019 tp->t_rcvtime = ticks; 11020 /* 11021 * Unscale the window into a 32-bit value. For the SYN_SENT state 11022 * the scale is zero. 11023 */ 11024 tiwin = th->th_win << tp->snd_scale; 11025 #ifdef STATS 11026 stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); 11027 #endif 11028 if (tiwin > rack->r_ctl.rc_high_rwnd) 11029 rack->r_ctl.rc_high_rwnd = tiwin; 11030 /* 11031 * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move 11032 * this to occur after we've validated the segment. 11033 */ 11034 if (tp->t_flags2 & TF2_ECN_PERMIT) { 11035 if (thflags & TH_CWR) { 11036 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 11037 tp->t_flags |= TF_ACKNOW; 11038 } 11039 switch (iptos & IPTOS_ECN_MASK) { 11040 case IPTOS_ECN_CE: 11041 tp->t_flags2 |= TF2_ECN_SND_ECE; 11042 KMOD_TCPSTAT_INC(tcps_ecn_ce); 11043 break; 11044 case IPTOS_ECN_ECT0: 11045 KMOD_TCPSTAT_INC(tcps_ecn_ect0); 11046 break; 11047 case IPTOS_ECN_ECT1: 11048 KMOD_TCPSTAT_INC(tcps_ecn_ect1); 11049 break; 11050 } 11051 11052 /* Process a packet differently from RFC3168. */ 11053 cc_ecnpkt_handler(tp, th, iptos); 11054 11055 /* Congestion experienced. */ 11056 if (thflags & TH_ECE) { 11057 rack_cong_signal(tp, th, CC_ECN); 11058 } 11059 } 11060 /* 11061 * Parse options on any incoming segment. 
11062 */ 11063 tcp_dooptions(&to, (u_char *)(th + 1), 11064 (th->th_off << 2) - sizeof(struct tcphdr), 11065 (thflags & TH_SYN) ? TO_SYN : 0); 11066 11067 /* 11068 * If echoed timestamp is later than the current time, fall back to 11069 * non RFC1323 RTT calculation. Normalize timestamp if syncookies 11070 * were used when this connection was established. 11071 */ 11072 if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { 11073 to.to_tsecr -= tp->ts_offset; 11074 if (TSTMP_GT(to.to_tsecr, cts)) 11075 to.to_tsecr = 0; 11076 } 11077 11078 /* 11079 * If its the first time in we need to take care of options and 11080 * verify we can do SACK for rack! 11081 */ 11082 if (rack->r_state == 0) { 11083 /* Should be init'd by rack_init() */ 11084 KASSERT(rack->rc_inp != NULL, 11085 ("%s: rack->rc_inp unexpectedly NULL", __func__)); 11086 if (rack->rc_inp == NULL) { 11087 rack->rc_inp = tp->t_inpcb; 11088 } 11089 11090 /* 11091 * Process options only when we get SYN/ACK back. The SYN 11092 * case for incoming connections is handled in tcp_syncache. 11093 * According to RFC1323 the window field in a SYN (i.e., a 11094 * <SYN> or <SYN,ACK>) segment itself is never scaled. XXX 11095 * this is traditional behavior, may need to be cleaned up. 11096 */ 11097 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { 11098 /* Handle parallel SYN for ECN */ 11099 if (!(thflags & TH_ACK) && 11100 ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) && 11101 ((V_tcp_do_ecn == 1) || (V_tcp_do_ecn == 2))) { 11102 tp->t_flags2 |= TF2_ECN_PERMIT; 11103 tp->t_flags2 |= TF2_ECN_SND_ECE; 11104 TCPSTAT_INC(tcps_ecn_shs); 11105 } 11106 if ((to.to_flags & TOF_SCALE) && 11107 (tp->t_flags & TF_REQ_SCALE)) { 11108 tp->t_flags |= TF_RCVD_SCALE; 11109 tp->snd_scale = to.to_wscale; 11110 } else 11111 tp->t_flags &= ~TF_REQ_SCALE; 11112 /* 11113 * Initial send window. It will be updated with the 11114 * next incoming segment to the scaled value. 11115 */ 11116 tp->snd_wnd = th->th_win; 11117 if ((to.to_flags & TOF_TS) && 11118 (tp->t_flags & TF_REQ_TSTMP)) { 11119 tp->t_flags |= TF_RCVD_TSTMP; 11120 tp->ts_recent = to.to_tsval; 11121 tp->ts_recent_age = cts; 11122 } else 11123 tp->t_flags &= ~TF_REQ_TSTMP; 11124 if (to.to_flags & TOF_MSS) 11125 tcp_mss(tp, to.to_mss); 11126 if ((tp->t_flags & TF_SACK_PERMIT) && 11127 (to.to_flags & TOF_SACKPERM) == 0) 11128 tp->t_flags &= ~TF_SACK_PERMIT; 11129 if (IS_FASTOPEN(tp->t_flags)) { 11130 if (to.to_flags & TOF_FASTOPEN) { 11131 uint16_t mss; 11132 11133 if (to.to_flags & TOF_MSS) 11134 mss = to.to_mss; 11135 else 11136 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) 11137 mss = TCP6_MSS; 11138 else 11139 mss = TCP_MSS; 11140 tcp_fastopen_update_cache(tp, mss, 11141 to.to_tfo_len, to.to_tfo_cookie); 11142 } else 11143 tcp_fastopen_disable_path(tp); 11144 } 11145 } 11146 /* 11147 * At this point we are at the initial call. Here we decide 11148 * if we are doing RACK or not. We do this by seeing if 11149 * TF_SACK_PERMIT is set and the sack-not-required is clear. 11150 * The code now does do dup-ack counting so if you don't 11151 * switch back you won't get rack & TLP, but you will still 11152 * get this stack. 
11153 */ 11154 11155 if ((rack_sack_not_required == 0) && 11156 ((tp->t_flags & TF_SACK_PERMIT) == 0)) { 11157 tcp_switch_back_to_default(tp); 11158 (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, 11159 tlen, iptos); 11160 return (1); 11161 } 11162 /* Set the flag */ 11163 rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; 11164 tcp_set_hpts(tp->t_inpcb); 11165 sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); 11166 } 11167 if (thflags & TH_FIN) 11168 tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN); 11169 us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 11170 if ((rack->rc_gp_dyn_mul) && 11171 (rack->use_fixed_rate == 0) && 11172 (rack->rc_always_pace)) { 11173 /* Check in on probertt */ 11174 rack_check_probe_rtt(rack, us_cts); 11175 } 11176 if (rack->forced_ack) { 11177 uint32_t us_rtt; 11178 11179 /* 11180 * A persist or keep-alive was forced out, update our 11181 * min rtt time. Note we do not worry about lost 11182 * retransmissions since KEEP-ALIVES and persists 11183 * are usually way long on times of sending (though 11184 * if we were really paranoid or worried we could 11185 * at least use timestamps if available to validate). 11186 */ 11187 rack->forced_ack = 0; 11188 us_rtt = us_cts - rack->r_ctl.forced_ack_ts; 11189 if (us_rtt == 0) 11190 us_rtt = 1; 11191 rack_log_rtt_upd(tp, rack, us_rtt, 0, NULL, 3); 11192 rack_apply_updated_usrtt(rack, us_rtt, us_cts); 11193 } 11194 /* 11195 * This is the one exception case where we set the rack state 11196 * always. All other times (timers etc) we must have a rack-state 11197 * set (so we assure we have done the checks above for SACK). 11198 */ 11199 rack->r_ctl.rc_rcvtime = cts; 11200 if (rack->r_state != tp->t_state) 11201 rack_set_state(tp, rack); 11202 if (SEQ_GT(th->th_ack, tp->snd_una) && 11203 (rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree)) != NULL) 11204 kern_prefetch(rsm, &prev_state); 11205 prev_state = rack->r_state; 11206 rack_clear_rate_sample(rack); 11207 retval = (*rack->r_substate) (m, th, so, 11208 tp, &to, drop_hdrlen, 11209 tlen, tiwin, thflags, nxt_pkt, iptos); 11210 #ifdef INVARIANTS 11211 if ((retval == 0) && 11212 (tp->t_inpcb == NULL)) { 11213 panic("retval:%d tp:%p t_inpcb:NULL state:%d", 11214 retval, tp, prev_state); 11215 } 11216 #endif 11217 if (retval == 0) { 11218 /* 11219 * If retval is 1 the tcb is unlocked and most likely the tp 11220 * is gone. 11221 */ 11222 INP_WLOCK_ASSERT(tp->t_inpcb); 11223 if ((rack->rc_gp_dyn_mul) && 11224 (rack->rc_always_pace) && 11225 (rack->use_fixed_rate == 0) && 11226 rack->in_probe_rtt && 11227 (rack->r_ctl.rc_time_probertt_starts == 0)) { 11228 /* 11229 * If we are going for target, lets recheck before 11230 * we output. 11231 */ 11232 rack_check_probe_rtt(rack, us_cts); 11233 } 11234 if (rack->set_pacing_done_a_iw == 0) { 11235 /* How much has been acked? 
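 * For example (MSS assumed only for illustration): with a 1460-byte
 * MSS the check below waits until more than 10 * 1460 = 14600 bytes
 * beyond the ISS have been cumulatively acked before it locks in the
 * pacing segment sizes.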
*/ 11236 if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) { 11237 /* We have enough to set in the pacing segment size */ 11238 rack->set_pacing_done_a_iw = 1; 11239 rack_set_pace_segments(tp, rack, __LINE__); 11240 } 11241 } 11242 tcp_rack_xmit_timer_commit(rack, tp); 11243 if (nxt_pkt == 0) { 11244 if (rack->r_wanted_output != 0) { 11245 do_output_now: 11246 did_out = 1; 11247 (void)tp->t_fb->tfb_tcp_output(tp); 11248 } 11249 rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); 11250 } 11251 if ((nxt_pkt == 0) && 11252 ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && 11253 (SEQ_GT(tp->snd_max, tp->snd_una) || 11254 (tp->t_flags & TF_DELACK) || 11255 ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && 11256 (tp->t_state <= TCPS_CLOSING)))) { 11257 /* We could not send (probably in the hpts but stopped the timer earlier)? */ 11258 if ((tp->snd_max == tp->snd_una) && 11259 ((tp->t_flags & TF_DELACK) == 0) && 11260 (rack->rc_inp->inp_in_hpts) && 11261 (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { 11262 /* keep alive not needed if we are hptsi output yet */ 11263 ; 11264 } else { 11265 int late = 0; 11266 if (rack->rc_inp->inp_in_hpts) { 11267 if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { 11268 us_cts = tcp_get_usecs(NULL); 11269 if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 11270 rack->r_early = 1; 11271 rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); 11272 } else 11273 late = 1; 11274 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 11275 } 11276 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); 11277 } 11278 if (late && (did_out == 0)) { 11279 /* 11280 * We are late in the sending 11281 * and we did not call the output 11282 * (this probably should not happen). 11283 */ 11284 goto do_output_now; 11285 } 11286 rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); 11287 } 11288 way_out = 1; 11289 } else if (nxt_pkt == 0) { 11290 /* Do we have the correct timer running? */ 11291 rack_timer_audit(tp, rack, &so->so_snd); 11292 way_out = 2; 11293 } 11294 done_with_input: 11295 rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out); 11296 if (did_out) 11297 rack->r_wanted_output = 0; 11298 #ifdef INVARIANTS 11299 if (tp->t_inpcb == NULL) { 11300 panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d", 11301 did_out, 11302 retval, tp, prev_state); 11303 } 11304 #endif 11305 } 11306 return (retval); 11307 } 11308 11309 void 11310 rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, 11311 struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos) 11312 { 11313 struct timeval tv; 11314 11315 /* First lets see if we have old packets */ 11316 if (tp->t_in_pkt) { 11317 if (ctf_do_queued_segments(so, tp, 1)) { 11318 m_freem(m); 11319 return; 11320 } 11321 } 11322 if (m->m_flags & M_TSTMP_LRO) { 11323 tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000; 11324 tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000; 11325 } else { 11326 /* Should not be should we kassert instead? 
*/ 11327 tcp_get_usecs(&tv); 11328 } 11329 if(rack_do_segment_nounlock(m, th, so, tp, 11330 drop_hdrlen, tlen, iptos, 0, &tv) == 0) 11331 INP_WUNLOCK(tp->t_inpcb); 11332 } 11333 11334 struct rack_sendmap * 11335 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) 11336 { 11337 struct rack_sendmap *rsm = NULL; 11338 int32_t idx; 11339 uint32_t srtt = 0, thresh = 0, ts_low = 0; 11340 11341 /* Return the next guy to be re-transmitted */ 11342 if (RB_EMPTY(&rack->r_ctl.rc_mtree)) { 11343 return (NULL); 11344 } 11345 if (tp->t_flags & TF_SENTFIN) { 11346 /* retran the end FIN? */ 11347 return (NULL); 11348 } 11349 /* ok lets look at this one */ 11350 rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); 11351 if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) { 11352 goto check_it; 11353 } 11354 rsm = rack_find_lowest_rsm(rack); 11355 if (rsm == NULL) { 11356 return (NULL); 11357 } 11358 check_it: 11359 if (rsm->r_flags & RACK_ACKED) { 11360 return (NULL); 11361 } 11362 if ((rsm->r_flags & RACK_SACK_PASSED) == 0) { 11363 /* Its not yet ready */ 11364 return (NULL); 11365 } 11366 srtt = rack_grab_rtt(tp, rack); 11367 idx = rsm->r_rtr_cnt - 1; 11368 ts_low = rsm->r_tim_lastsent[idx]; 11369 thresh = rack_calc_thresh_rack(rack, srtt, tsused); 11370 if ((tsused == ts_low) || 11371 (TSTMP_LT(tsused, ts_low))) { 11372 /* No time since sending */ 11373 return (NULL); 11374 } 11375 if ((tsused - ts_low) < thresh) { 11376 /* It has not been long enough yet */ 11377 return (NULL); 11378 } 11379 if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || 11380 ((rsm->r_flags & RACK_SACK_PASSED) && 11381 (rack->sack_attack_disable == 0))) { 11382 /* 11383 * We have passed the dup-ack threshold <or> 11384 * a SACK has indicated this is missing. 11385 * Note that if you are a declared attacker 11386 * it is only the dup-ack threshold that 11387 * will cause retransmits. 
11388 */ 11389 /* log retransmit reason */ 11390 rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1); 11391 return (rsm); 11392 } 11393 return (NULL); 11394 } 11395 11396 static void 11397 rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, 11398 uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, 11399 int line, struct rack_sendmap *rsm) 11400 { 11401 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 11402 union tcp_log_stackspecific log; 11403 struct timeval tv; 11404 11405 memset(&log, 0, sizeof(log)); 11406 log.u_bbr.flex1 = slot; 11407 log.u_bbr.flex2 = len; 11408 log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs; 11409 log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs; 11410 log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ss; 11411 log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_ca; 11412 log.u_bbr.use_lt_bw = rack->app_limited_needs_set; 11413 log.u_bbr.use_lt_bw <<= 1; 11414 log.u_bbr.use_lt_bw = rack->rc_gp_filled; 11415 log.u_bbr.use_lt_bw <<= 1; 11416 log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; 11417 log.u_bbr.use_lt_bw <<= 1; 11418 log.u_bbr.use_lt_bw |= rack->in_probe_rtt; 11419 log.u_bbr.pkt_epoch = line; 11420 log.u_bbr.applimited = rack->r_ctl.rack_per_of_gp_rec; 11421 log.u_bbr.bw_inuse = bw_est; 11422 log.u_bbr.delRate = bw; 11423 if (rack->r_ctl.gp_bw == 0) 11424 log.u_bbr.cur_del_rate = 0; 11425 else 11426 log.u_bbr.cur_del_rate = rack_get_bw(rack); 11427 log.u_bbr.rttProp = len_time; 11428 log.u_bbr.pkts_out = rack->r_ctl.rc_rack_min_rtt; 11429 log.u_bbr.lost = rack->r_ctl.rc_probertt_sndmax_atexit; 11430 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); 11431 if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) { 11432 /* We are in slow start */ 11433 log.u_bbr.flex7 = 1; 11434 } else { 11435 /* we are on congestion avoidance */ 11436 log.u_bbr.flex7 = 0; 11437 } 11438 log.u_bbr.flex8 = method; 11439 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 11440 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 11441 log.u_bbr.cwnd_gain = rack->rc_gp_saw_rec; 11442 log.u_bbr.cwnd_gain <<= 1; 11443 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; 11444 log.u_bbr.cwnd_gain <<= 1; 11445 log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; 11446 TCP_LOG_EVENTP(rack->rc_tp, NULL, 11447 &rack->rc_inp->inp_socket->so_rcv, 11448 &rack->rc_inp->inp_socket->so_snd, 11449 BBR_LOG_HPTSI_CALC, 0, 11450 0, &log, false, &tv); 11451 } 11452 } 11453 11454 static uint32_t 11455 rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss) 11456 { 11457 uint32_t new_tso, user_max; 11458 11459 user_max = rack->rc_user_set_max_segs * mss; 11460 if (rack->rc_force_max_seg) { 11461 return (user_max); 11462 } 11463 if (rack->use_fixed_rate && 11464 ((rack->r_ctl.crte == NULL) || 11465 (bw != rack->r_ctl.crte->rate))) { 11466 /* Use the user mss since we are not exactly matched */ 11467 return (user_max); 11468 } 11469 new_tso = tcp_get_pacing_burst_size(bw, mss, rack_pace_one_seg, rack->r_ctl.crte, NULL); 11470 if (new_tso > user_max) 11471 new_tso = user_max; 11472 return(new_tso); 11473 } 11474 11475 static void 11476 rack_log_hdwr_pacing(struct tcp_rack *rack, const struct ifnet *ifp, 11477 uint64_t rate, uint64_t hw_rate, int line, 11478 int error) 11479 { 11480 if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { 11481 union tcp_log_stackspecific log; 11482 struct timeval tv; 11483 11484 memset(&log, 0, sizeof(log)); 11485 log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff); 11486 log.u_bbr.flex2 = (hw_rate & 
0x00000000ffffffff);
11487 log.u_bbr.flex3 = (((uint64_t)ifp >> 32) & 0x00000000ffffffff);
11488 log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff);
11489 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
11490 log.u_bbr.bw_inuse = rate;
11491 log.u_bbr.flex5 = line;
11492 log.u_bbr.flex6 = error;
11493 log.u_bbr.applimited = rack->r_ctl.rc_pace_max_segs;
11494 log.u_bbr.flex8 = rack->use_fixed_rate;
11495 log.u_bbr.flex8 <<= 1;
11496 log.u_bbr.flex8 |= rack->rack_hdrw_pacing;
11497 log.u_bbr.pkts_out = rack->rc_tp->t_maxseg;
11498 TCP_LOG_EVENTP(rack->rc_tp, NULL,
11499 &rack->rc_inp->inp_socket->so_rcv,
11500 &rack->rc_inp->inp_socket->so_snd,
11501 BBR_LOG_HDWR_PACE, 0,
11502 0, &log, false, &tv);
11503 }
11504 }
11505 
11506 static int32_t
11507 pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz)
11508 {
11509 uint64_t lentim, fill_bw;
11510 
11511 /* Let's first see if we are full; if so, continue with the normal rate */
11512 if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use)
11513 return (slot);
11514 if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd)
11515 return (slot);
11516 if (rack->r_ctl.rc_last_us_rtt == 0)
11517 return (slot);
11518 if (rack->rc_pace_fill_if_rttin_range &&
11519 (rack->r_ctl.rc_last_us_rtt >=
11520 (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) {
11521 /* The rtt is huge, N * smallest, let's not fill */
11522 return (slot);
11523 }
11524 /*
11525 * First let's calculate the b/w based on the last us-rtt
11526 * and the sndwnd.
11527 */
11528 fill_bw = rack->r_ctl.cwnd_to_use;
11529 /* Take the rwnd if it's smaller */
11530 if (fill_bw > rack->rc_tp->snd_wnd)
11531 fill_bw = rack->rc_tp->snd_wnd;
11532 fill_bw *= (uint64_t)HPTS_USEC_IN_SEC;
11533 fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt;
11534 /* We are below the min b/w */
11535 if (fill_bw < RACK_MIN_BW)
11536 return (slot);
11537 /*
11538 * OK, fill_bw holds our mythical b/w to fill the cwnd
11539 * in an rtt; what does that equate to time-wise?
11540 */
11541 lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC;
11542 lentim /= fill_bw;
11543 if (lentim < slot) {
11544 rack_log_pacing_delay_calc(rack, len, slot, fill_bw,
11545 0, lentim, 12, __LINE__, NULL);
11546 return ((int32_t)lentim);
11547 } else
11548 return (slot);
11549 }
11550 
11551 static int32_t
11552 rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz)
11553 {
11554 struct rack_sendmap *lrsm;
11555 int32_t slot = 0;
11556 int err;
11557 
11558 if (rack->rc_always_pace == 0) {
11559 /*
11560 * We use the most optimistic possible cwnd/srtt for
11561 * sending calculations. This will make our
11562 * calculation anticipate getting more through
11563 * quicker than possible. But that's ok, we don't want
11564 * the peer to have a gap in data sending.
11565 */
11566 uint32_t srtt, cwnd, tr_perms = 0;
11567 int32_t reduce = 0;
11568 
11569 old_method:
11570 /*
11571 * We keep no precise pacing with the old method;
11572 * instead we use the pacer to mitigate bursts.
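 * A rough worked example (all values assumed for illustration): with
 * cwnd = 100000 bytes and srtt = 50 ms, tr_perms = 2000 bytes/ms; a
 * 20000-byte send then gives slot = 10 ms, and with a slot reduction
 * divisor of 4 we shave off 10 / 4 = 2 ms, leaving 8 ms (8000 usec)
 * of pacing delay after the HPTS_USEC_IN_MSEC conversion.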
11573 */
11574 rack->r_ctl.rc_agg_delayed = 0;
11575 rack->r_early = 0;
11576 rack->r_late = 0;
11577 rack->r_ctl.rc_agg_early = 0;
11578 if (rack->r_ctl.rc_rack_min_rtt)
11579 srtt = rack->r_ctl.rc_rack_min_rtt;
11580 else
11581 srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT));
11582 if (rack->r_ctl.rc_rack_largest_cwnd)
11583 cwnd = rack->r_ctl.rc_rack_largest_cwnd;
11584 else
11585 cwnd = rack->r_ctl.cwnd_to_use;
11586 tr_perms = cwnd / srtt;
11587 if (tr_perms == 0) {
11588 tr_perms = ctf_fixed_maxseg(tp);
11589 }
11590 /*
11591 * Calculate how long this will take to drain. If
11592 * the calculation comes out to zero, that's ok; we
11593 * will use send_a_lot to possibly spin around for
11594 * more, increasing tot_len_this_send to the point
11595 * that it's going to require a pace, or we hit the
11596 * cwnd. In that case we are just waiting for
11597 * an ACK.
11598 */
11599 slot = len / tr_perms;
11600 /* Now do we reduce the time so we don't run dry? */
11601 if (slot && rack_slot_reduction) {
11602 reduce = (slot / rack_slot_reduction);
11603 if (reduce < slot) {
11604 slot -= reduce;
11605 } else
11606 slot = 0;
11607 }
11608 slot *= HPTS_USEC_IN_MSEC;
11609 if (rsm == NULL) {
11610 /*
11611 * With the old style we always consider sends that
11612 * are not retransmits app limited. This could be the
11613 * initial measurement, but that's ok; it's all set up
11614 * and specially handled. If another send leaks out,
11615 * then that too will be marked app-limited.
11616 */
11617 lrsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree);
11618 if (lrsm && ((lrsm->r_flags & RACK_APP_LIMITED) == 0)) {
11619 rack->r_ctl.rc_first_appl = lrsm;
11620 lrsm->r_flags |= RACK_APP_LIMITED;
11621 rack->r_ctl.rc_app_limited_cnt++;
11622 }
11623 }
11624 rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL);
11625 } else {
11626 uint64_t bw_est, res, lentim, rate_wanted;
11627 uint32_t orig_val, srtt, segs, oh;
11628 
11629 if ((rack->r_rr_config == 1) && rsm) {
11630 return (rack->r_ctl.rc_min_to * HPTS_USEC_IN_MSEC);
11631 }
11632 if (rack->use_fixed_rate) {
11633 rate_wanted = bw_est = rack_get_fixed_pacing_bw(rack);
11634 } else if ((rack->r_ctl.init_rate == 0) &&
11635 #ifdef NETFLIX_PEAKRATE
11636 (rack->rc_tp->t_maxpeakrate == 0) &&
11637 #endif
11638 (rack->r_ctl.gp_bw == 0)) {
11639 /* no way yet to do an estimate */
11640 bw_est = rate_wanted = 0;
11641 } else {
11642 bw_est = rack_get_bw(rack);
11643 rate_wanted = rack_get_output_bw(rack, bw_est, rsm);
11644 }
11645 if ((bw_est == 0) || (rate_wanted == 0)) {
11646 /*
11647 * No way yet to make a b/w estimate, or
11648 * our rate is set incorrectly.
11649 */
11650 goto old_method;
11651 }
11652 /* We need to account for all the overheads */
11653 segs = (len + segsiz - 1) / segsiz;
11654 /*
11655 * We need the diff between 1514 bytes (e-mtu with e-hdr)
11656 * and how much data we put in each packet. Yes this
11657 * means we may be off if we are larger than 1500 bytes
11658 * or smaller. But this just makes us more conservative.
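 * A sketch with assumed numbers: len = 14480 and segsiz = 1448 give
 * segs = 10; oh = 1514 - 1448 = 66 bytes of estimated per-packet
 * overhead, so segs *= oh becomes 660. At rate_wanted = 12500000
 * bytes/sec (100 Mbps) the drain time works out to
 * (14480 + 660) * 1000000 / 12500000 ~= 1211 usec for the slot.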
11659 */ 11660 if (ETHERNET_SEGMENT_SIZE > segsiz) 11661 oh = ETHERNET_SEGMENT_SIZE - segsiz; 11662 else 11663 oh = 0; 11664 segs *= oh; 11665 lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC; 11666 res = lentim / rate_wanted; 11667 slot = (uint32_t)res; 11668 orig_val = rack->r_ctl.rc_pace_max_segs; 11669 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 11670 #ifdef KERN_TLS 11671 /* For TLS we need to override this, possibly */ 11672 if (rack->rc_inp->inp_socket->so_snd.sb_flags & SB_TLS_IFNET) { 11673 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 11674 } 11675 #endif 11676 /* Did we change the TSO size, if so log it */ 11677 if (rack->r_ctl.rc_pace_max_segs != orig_val) 11678 rack_log_pacing_delay_calc(rack, len, slot, orig_val, 0, 0, 15, __LINE__, NULL); 11679 if ((rack->rc_pace_to_cwnd) && 11680 (rack->in_probe_rtt == 0) && 11681 (IN_RECOVERY(rack->rc_tp->t_flags) == 0)) { 11682 /* 11683 * We want to pace at our rate *or* faster to 11684 * fill the cwnd to the max if its not full. 11685 */ 11686 slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz); 11687 } 11688 if ((rack->rc_inp->inp_route.ro_nh != NULL) && 11689 (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { 11690 if ((rack->rack_hdw_pace_ena) && 11691 (rack->rack_hdrw_pacing == 0) && 11692 (rack->rack_attempt_hdwr_pace == 0)) { 11693 /* 11694 * Lets attempt to turn on hardware pacing 11695 * if we can. 11696 */ 11697 rack->rack_attempt_hdwr_pace = 1; 11698 rack->r_ctl.crte = tcp_set_pacing_rate(rack->rc_tp, 11699 rack->rc_inp->inp_route.ro_nh->nh_ifp, 11700 rate_wanted, 11701 RS_PACING_GEQ, 11702 &err); 11703 if (rack->r_ctl.crte) { 11704 rack->rack_hdrw_pacing = 1; 11705 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(rate_wanted, segsiz, 11706 0, rack->r_ctl.crte, 11707 NULL); 11708 rack_log_hdwr_pacing(rack, rack->rc_inp->inp_route.ro_nh->nh_ifp, 11709 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 11710 err); 11711 } 11712 } else if (rack->rack_hdrw_pacing && 11713 (rack->r_ctl.crte->rate != rate_wanted)) { 11714 /* Do we need to adjust our rate? */ 11715 const struct tcp_hwrate_limit_table *nrte; 11716 11717 nrte = tcp_chg_pacing_rate(rack->r_ctl.crte, 11718 rack->rc_tp, 11719 rack->rc_inp->inp_route.ro_nh->nh_ifp, 11720 rate_wanted, 11721 RS_PACING_GEQ, 11722 &err); 11723 if (nrte == NULL) { 11724 /* Lost the rate */ 11725 rack->rack_hdrw_pacing = 0; 11726 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 11727 } else if (nrte != rack->r_ctl.crte) { 11728 rack->r_ctl.crte = nrte; 11729 rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(rate_wanted, 11730 segsiz, 0, 11731 rack->r_ctl.crte, 11732 NULL); 11733 rack_log_hdwr_pacing(rack, rack->rc_inp->inp_route.ro_nh->nh_ifp, 11734 rate_wanted, rack->r_ctl.crte->rate, __LINE__, 11735 err); 11736 } 11737 11738 } 11739 } 11740 if (rack_limit_time_with_srtt && 11741 (rack->use_fixed_rate == 0) && 11742 #ifdef NETFLIX_PEAKRATE 11743 (rack->rc_tp->t_maxpeakrate == 0) && 11744 #endif 11745 (rack->rack_hdrw_pacing == 0)) { 11746 /* 11747 * Sanity check, we do not allow the pacing delay 11748 * to be longer than the SRTT of the path. If it is 11749 * a slow path, then adding a packet should increase 11750 * the RTT and compensate for this i.e. the srtt will 11751 * be greater so the allowed pacing time will be greater. 11752 * 11753 * Note this restriction is not for where a peak rate 11754 * is set, we are doing fixed pacing or hardware pacing. 
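 * (Illustration: if the computed pacing delay were 45000 usec but the
 * path's srtt works out to 30000 usec, the check below clamps the slot
 * to 30000 usec; the values are assumed for the example.)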
11755 */ 11756 if (rack->rc_tp->t_srtt) 11757 srtt = (TICKS_2_USEC(rack->rc_tp->t_srtt) >> TCP_RTT_SHIFT); 11758 else 11759 srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC; /* its in ms convert */ 11760 if (srtt < slot) { 11761 rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL); 11762 slot = srtt; 11763 } 11764 } 11765 rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm); 11766 } 11767 if (slot) 11768 counter_u64_add(rack_calc_nonzero, 1); 11769 else 11770 counter_u64_add(rack_calc_zero, 1); 11771 return (slot); 11772 } 11773 11774 static void 11775 rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack, 11776 tcp_seq startseq, uint32_t sb_offset) 11777 { 11778 struct rack_sendmap *my_rsm = NULL; 11779 struct rack_sendmap fe; 11780 11781 if (tp->t_state < TCPS_ESTABLISHED) { 11782 /* 11783 * We don't start any measurements if we are 11784 * not at least established. 11785 */ 11786 return; 11787 } 11788 tp->t_flags |= TF_GPUTINPROG; 11789 rack->r_ctl.rc_gp_lowrtt = 0xffffffff; 11790 rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; 11791 tp->gput_seq = startseq; 11792 rack->app_limited_needs_set = 0; 11793 if (rack->in_probe_rtt) 11794 rack->measure_saw_probe_rtt = 1; 11795 else if ((rack->measure_saw_probe_rtt) && 11796 (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) 11797 rack->measure_saw_probe_rtt = 0; 11798 if (rack->rc_gp_filled) 11799 tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); 11800 else { 11801 /* Special case initial measurement */ 11802 rack->r_ctl.rc_gp_output_ts = tp->gput_ts = tcp_get_usecs(NULL); 11803 } 11804 /* 11805 * We take a guess out into the future, 11806 * if we have no measurement and no 11807 * initial rate, we measure the first 11808 * initial-windows worth of data to 11809 * speed up getting some GP measurement and 11810 * thus start pacing. 11811 */ 11812 if ((rack->rc_gp_filled == 0) && (rack->r_ctl.init_rate == 0)) { 11813 rack->app_limited_needs_set = 1; 11814 tp->gput_ack = startseq + max(rc_init_window(rack), 11815 (MIN_GP_WIN * ctf_fixed_maxseg(tp))); 11816 rack_log_pacing_delay_calc(rack, 11817 tp->gput_seq, 11818 tp->gput_ack, 11819 0, 11820 tp->gput_ts, 11821 rack->r_ctl.rc_app_limited_cnt, 11822 9, 11823 __LINE__, NULL); 11824 return; 11825 } 11826 if (sb_offset) { 11827 /* 11828 * We are out somewhere in the sb 11829 * can we use the already outstanding data? 11830 */ 11831 11832 if (rack->r_ctl.rc_app_limited_cnt == 0) { 11833 /* 11834 * Yes first one is good and in this case 11835 * the tp->gput_ts is correctly set based on 11836 * the last ack that arrived (no need to 11837 * set things up when an ack comes in). 11838 */ 11839 my_rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 11840 if ((my_rsm == NULL) || 11841 (my_rsm->r_rtr_cnt != 1)) { 11842 /* retransmission? */ 11843 goto use_latest; 11844 } 11845 } else { 11846 if (rack->r_ctl.rc_first_appl == NULL) { 11847 /* 11848 * If rc_first_appl is NULL 11849 * then the cnt should be 0. 11850 * This is probably an error, maybe 11851 * a KASSERT would be approprate. 11852 */ 11853 goto use_latest; 11854 } 11855 /* 11856 * If we have a marker pointer to the last one that is 11857 * app limited we can use that, but we need to set 11858 * things up so that when it gets ack'ed we record 11859 * the ack time (if its not already acked). 11860 */ 11861 rack->app_limited_needs_set = 1; 11862 /* 11863 * We want to get to the rsm that is either 11864 * next with space i.e. 
over 1 MSS or the one 11865 * after that (after the app-limited). 11866 */ 11867 my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, 11868 rack->r_ctl.rc_first_appl); 11869 if (my_rsm) { 11870 if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp)) 11871 /* Have to use the next one */ 11872 my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, 11873 my_rsm); 11874 else { 11875 /* Use after the first MSS of it is acked */ 11876 tp->gput_seq = my_rsm->r_start + ctf_fixed_maxseg(tp); 11877 goto start_set; 11878 } 11879 } 11880 if ((my_rsm == NULL) || 11881 (my_rsm->r_rtr_cnt != 1)) { 11882 /* 11883 * Either its a retransmit or 11884 * the last is the app-limited one. 11885 */ 11886 goto use_latest; 11887 } 11888 } 11889 tp->gput_seq = my_rsm->r_start; 11890 start_set: 11891 if (my_rsm->r_flags & RACK_ACKED) { 11892 /* 11893 * This one has been acked use the arrival ack time 11894 */ 11895 tp->gput_ts = my_rsm->r_ack_arrival; 11896 rack->app_limited_needs_set = 0; 11897 } 11898 rack->r_ctl.rc_gp_output_ts = my_rsm->usec_orig_send; 11899 tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); 11900 rack_log_pacing_delay_calc(rack, 11901 tp->gput_seq, 11902 tp->gput_ack, 11903 (uint64_t)my_rsm, 11904 tp->gput_ts, 11905 rack->r_ctl.rc_app_limited_cnt, 11906 9, 11907 __LINE__, NULL); 11908 return; 11909 } 11910 11911 use_latest: 11912 /* 11913 * We don't know how long we may have been 11914 * idle or if this is the first-send. Lets 11915 * setup the flag so we will trim off 11916 * the first ack'd data so we get a true 11917 * measurement. 11918 */ 11919 rack->app_limited_needs_set = 1; 11920 tp->gput_ack = startseq + rack_get_measure_window(tp, rack); 11921 /* Find this guy so we can pull the send time */ 11922 fe.r_start = startseq; 11923 my_rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); 11924 if (my_rsm) { 11925 rack->r_ctl.rc_gp_output_ts = my_rsm->usec_orig_send; 11926 if (my_rsm->r_flags & RACK_ACKED) { 11927 /* 11928 * Unlikely since its probably what was 11929 * just transmitted (but I am paranoid). 11930 */ 11931 tp->gput_ts = my_rsm->r_ack_arrival; 11932 rack->app_limited_needs_set = 0; 11933 } 11934 if (SEQ_LT(my_rsm->r_start, tp->gput_seq)) { 11935 /* This also is unlikely */ 11936 tp->gput_seq = my_rsm->r_start; 11937 } 11938 } else { 11939 /* 11940 * TSNH unless we have some send-map limit, 11941 * and even at that it should not be hitting 11942 * that limit (we should have stopped sending). 11943 */ 11944 rack->r_ctl.rc_gp_output_ts = tcp_get_usecs(NULL); 11945 } 11946 rack_log_pacing_delay_calc(rack, 11947 tp->gput_seq, 11948 tp->gput_ack, 11949 (uint64_t)my_rsm, 11950 tp->gput_ts, 11951 rack->r_ctl.rc_app_limited_cnt, 11952 9, __LINE__, NULL); 11953 } 11954 11955 static inline uint32_t 11956 rack_what_can_we_send(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cwnd_to_use, 11957 uint32_t avail, int32_t sb_offset) 11958 { 11959 uint32_t len; 11960 uint32_t sendwin; 11961 11962 if (tp->snd_wnd > cwnd_to_use) 11963 sendwin = cwnd_to_use; 11964 else 11965 sendwin = tp->snd_wnd; 11966 if (ctf_outstanding(tp) >= tp->snd_wnd) { 11967 /* We never want to go over our peers rcv-window */ 11968 len = 0; 11969 } else { 11970 uint32_t flight; 11971 11972 flight = ctf_flight_size(tp, rack->r_ctl.rc_sacked); 11973 if (flight >= sendwin) { 11974 /* 11975 * We have in flight what we are allowed by cwnd (if 11976 * it was rwnd blocking it would have hit above out 11977 * >= tp->snd_wnd). 
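 *
 * Illustrative example (hypothetical numbers): with sendwin =
 * min(cwnd, rwnd) worth 20 segments and 20 or more segments
 * already in flight we are cwnd limited and return 0 here;
 * otherwise the code below offers sendwin - flight bytes,
 * trimmed so that outstanding data never exceeds the peer's
 * receive window or what actually remains in the socket buffer.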
11978 */ 11979 return (0); 11980 } 11981 len = sendwin - flight; 11982 if ((len + ctf_outstanding(tp)) > tp->snd_wnd) { 11983 /* We would send too much (beyond the rwnd) */ 11984 len = tp->snd_wnd - ctf_outstanding(tp); 11985 } 11986 if ((len + sb_offset) > avail) { 11987 /* 11988 * We don't have that much in the SB, how much is 11989 * there? 11990 */ 11991 len = avail - sb_offset; 11992 } 11993 } 11994 return (len); 11995 } 11996 11997 static int 11998 rack_output(struct tcpcb *tp) 11999 { 12000 struct socket *so; 12001 uint32_t recwin; 12002 uint32_t sb_offset; 12003 int32_t len, flags, error = 0; 12004 struct mbuf *m; 12005 struct mbuf *mb; 12006 uint32_t if_hw_tsomaxsegcount = 0; 12007 uint32_t if_hw_tsomaxsegsize; 12008 int32_t segsiz, minseg; 12009 long tot_len_this_send = 0; 12010 struct ip *ip = NULL; 12011 #ifdef TCPDEBUG 12012 struct ipovly *ipov = NULL; 12013 #endif 12014 struct udphdr *udp = NULL; 12015 struct tcp_rack *rack; 12016 struct tcphdr *th; 12017 uint8_t pass = 0; 12018 uint8_t mark = 0; 12019 uint8_t wanted_cookie = 0; 12020 u_char opt[TCP_MAXOLEN]; 12021 unsigned ipoptlen, optlen, hdrlen, ulen=0; 12022 uint32_t rack_seq; 12023 12024 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 12025 unsigned ipsec_optlen = 0; 12026 12027 #endif 12028 int32_t idle, sendalot; 12029 int32_t sub_from_prr = 0; 12030 volatile int32_t sack_rxmit; 12031 struct rack_sendmap *rsm = NULL; 12032 int32_t tso, mtu; 12033 struct tcpopt to; 12034 int32_t slot = 0; 12035 int32_t sup_rack = 0; 12036 uint32_t cts, us_cts, delayed, early; 12037 uint8_t hpts_calling, new_data_tlp = 0, doing_tlp = 0; 12038 uint32_t cwnd_to_use; 12039 int32_t do_a_prefetch; 12040 int32_t prefetch_rsm = 0; 12041 int force_tso = 0; 12042 int32_t orig_len; 12043 struct timeval tv; 12044 int32_t prefetch_so_done = 0; 12045 struct tcp_log_buffer *lgb = NULL; 12046 struct inpcb *inp; 12047 struct sockbuf *sb; 12048 #ifdef INET6 12049 struct ip6_hdr *ip6 = NULL; 12050 int32_t isipv6; 12051 #endif 12052 uint8_t filled_all = 0; 12053 bool hw_tls = false; 12054 12055 /* setup and take the cache hits here */ 12056 rack = (struct tcp_rack *)tp->t_fb_ptr; 12057 inp = rack->rc_inp; 12058 so = inp->inp_socket; 12059 sb = &so->so_snd; 12060 kern_prefetch(sb, &do_a_prefetch); 12061 do_a_prefetch = 1; 12062 hpts_calling = inp->inp_hpts_calls; 12063 #ifdef KERN_TLS 12064 hw_tls = (so->so_snd.sb_flags & SB_TLS_IFNET) != 0; 12065 #endif 12066 12067 NET_EPOCH_ASSERT(); 12068 INP_WLOCK_ASSERT(inp); 12069 #ifdef TCP_OFFLOAD 12070 if (tp->t_flags & TF_TOE) 12071 return (tcp_offload_output(tp)); 12072 #endif 12073 /* 12074 * For TFO connections in SYN_RECEIVED, only allow the initial 12075 * SYN|ACK and those sent by the retransmit timer. 12076 */ 12077 if (IS_FASTOPEN(tp->t_flags) && 12078 (tp->t_state == TCPS_SYN_RECEIVED) && 12079 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */ 12080 (rack->r_ctl.rc_resend == NULL)) /* not a retransmit */ 12081 return (0); 12082 #ifdef INET6 12083 if (rack->r_state) { 12084 /* Use the cache line loaded if possible */ 12085 isipv6 = rack->r_is_v6; 12086 } else { 12087 isipv6 = (inp->inp_vflag & INP_IPV6) != 0; 12088 } 12089 #endif 12090 early = 0; 12091 us_cts = tcp_get_usecs(&tv); 12092 cts = tcp_tv_to_mssectick(&tv); 12093 if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && 12094 inp->inp_in_hpts) { 12095 /* 12096 * We are on the hpts for some timer but not hptsi output. 12097 * Remove from the hpts unconditionally. 
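 *
 * (Any timer that is still needed is re-armed before
 * rack_output() returns, e.g. via the rack_start_hpts_timer()
 * call in the just-return path further below.)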
12098 */ 12099 rack_timer_cancel(tp, rack, cts, __LINE__); 12100 } 12101 /* Are we pacing and late? */ 12102 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 12103 TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) { 12104 /* We are delayed */ 12105 delayed = us_cts - rack->r_ctl.rc_last_output_to; 12106 } else { 12107 delayed = 0; 12108 } 12109 /* Do the timers, which may override the pacer */ 12110 if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { 12111 if (rack_process_timers(tp, rack, cts, hpts_calling)) { 12112 counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); 12113 return (0); 12114 } 12115 } 12116 if ((rack->r_timer_override) || 12117 (delayed) || 12118 (tp->t_state < TCPS_ESTABLISHED)) { 12119 if (tp->t_inpcb->inp_in_hpts) 12120 tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); 12121 } else if (tp->t_inpcb->inp_in_hpts) { 12122 /* 12123 * On the hpts you can't pass even if ACKNOW is on, we will 12124 * when the hpts fires. 12125 */ 12126 counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1); 12127 return (0); 12128 } 12129 inp->inp_hpts_calls = 0; 12130 /* Finish out both pacing early and late accounting */ 12131 if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && 12132 TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { 12133 early = rack->r_ctl.rc_last_output_to - us_cts; 12134 } else 12135 early = 0; 12136 if (delayed) { 12137 rack->r_ctl.rc_agg_delayed += delayed; 12138 rack->r_late = 1; 12139 } else if (early) { 12140 rack->r_ctl.rc_agg_early += early; 12141 rack->r_early = 1; 12142 } 12143 /* Now that early/late accounting is done turn off the flag */ 12144 rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; 12145 rack->r_wanted_output = 0; 12146 rack->r_timer_override = 0; 12147 /* 12148 * For TFO connections in SYN_SENT or SYN_RECEIVED, 12149 * only allow the initial SYN or SYN|ACK and those sent 12150 * by the retransmit timer. 12151 */ 12152 if (IS_FASTOPEN(tp->t_flags) && 12153 ((tp->t_state == TCPS_SYN_RECEIVED) || 12154 (tp->t_state == TCPS_SYN_SENT)) && 12155 SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ 12156 (tp->t_rxtshift == 0)) { /* not a retransmit */ 12157 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 12158 goto just_return_nolock; 12159 } 12160 /* 12161 * Determine length of data that should be transmitted, and flags 12162 * that will be used. If there is some data or critical controls 12163 * (SYN, RST) to send, then transmit; otherwise, investigate 12164 * further. 
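 *
 * Just below, an idle gap at snd_una == snd_max that lasted
 * longer than rack_min_probertt_hold is also credited as a
 * probe-RTT: the probe-RTT bookkeeping timestamps are refreshed
 * if no probe was in progress, otherwise the in-progress probe
 * is ended via rack_exit_probertt().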
12165 */ 12166 idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); 12167 if (tp->t_idle_reduce) { 12168 if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) 12169 rack_cc_after_idle(rack, tp); 12170 } 12171 tp->t_flags &= ~TF_LASTIDLE; 12172 if (idle) { 12173 if (tp->t_flags & TF_MORETOCOME) { 12174 tp->t_flags |= TF_LASTIDLE; 12175 idle = 0; 12176 } 12177 } 12178 if ((tp->snd_una == tp->snd_max) && 12179 rack->r_ctl.rc_went_idle_time && 12180 TSTMP_GT(us_cts, rack->r_ctl.rc_went_idle_time)) { 12181 idle = us_cts - rack->r_ctl.rc_went_idle_time; 12182 if (idle > rack_min_probertt_hold) { 12183 /* Count as a probe rtt */ 12184 if (rack->in_probe_rtt == 0) { 12185 rack->r_ctl.rc_lower_rtt_us_cts = us_cts; 12186 rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; 12187 rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; 12188 rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; 12189 } else { 12190 rack_exit_probertt(rack, us_cts); 12191 } 12192 } 12193 idle = 0; 12194 } 12195 again: 12196 /* 12197 * If we've recently taken a timeout, snd_max will be greater than 12198 * snd_nxt. There may be SACK information that allows us to avoid 12199 * resending already delivered data. Adjust snd_nxt accordingly. 12200 */ 12201 sendalot = 0; 12202 us_cts = tcp_get_usecs(&tv); 12203 cts = tcp_tv_to_mssectick(&tv); 12204 tso = 0; 12205 mtu = 0; 12206 segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); 12207 if (so->so_snd.sb_flags & SB_TLS_IFNET) { 12208 minseg = rack->r_ctl.rc_pace_min_segs; 12209 } else { 12210 minseg = segsiz; 12211 } 12212 sb_offset = tp->snd_max - tp->snd_una; 12213 cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; 12214 #ifdef NETFLIX_SHARED_CWND 12215 if ((tp->t_flags2 & TF2_TCP_SCWND_ALLOWED) && 12216 rack->rack_enable_scwnd) { 12217 /* We are doing cwnd sharing */ 12218 if (rack->rc_gp_filled && 12219 (rack->rack_attempted_scwnd == 0) && 12220 (rack->r_ctl.rc_scw == NULL) && 12221 tp->t_lib) { 12222 /* The pcbid is in, lets make an attempt */ 12223 counter_u64_add(rack_try_scwnd, 1); 12224 rack->rack_attempted_scwnd = 1; 12225 rack->r_ctl.rc_scw = tcp_shared_cwnd_alloc(tp, 12226 &rack->r_ctl.rc_scw_index, 12227 segsiz); 12228 } 12229 if (rack->r_ctl.rc_scw && 12230 (rack->rack_scwnd_is_idle == 1) && 12231 (rack->rc_in_persist == 0) && 12232 sbavail(sb)) { 12233 /* we are no longer out of data */ 12234 tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 12235 rack->rack_scwnd_is_idle = 0; 12236 } 12237 if (rack->r_ctl.rc_scw) { 12238 /* First lets update and get the cwnd */ 12239 rack->r_ctl.cwnd_to_use = cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw, 12240 rack->r_ctl.rc_scw_index, 12241 tp->snd_cwnd, tp->snd_wnd, segsiz); 12242 } 12243 } 12244 #endif 12245 flags = tcp_outflags[tp->t_state]; 12246 while (rack->rc_free_cnt < rack_free_cache) { 12247 rsm = rack_alloc(rack); 12248 if (rsm == NULL) { 12249 if (inp->inp_hpts_calls) 12250 /* Retry in a ms */ 12251 slot = (1 * HPTS_USEC_IN_MSEC); 12252 goto just_return_nolock; 12253 } 12254 TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext); 12255 rack->rc_free_cnt++; 12256 rsm = NULL; 12257 } 12258 if (inp->inp_hpts_calls) 12259 inp->inp_hpts_calls = 0; 12260 sack_rxmit = 0; 12261 len = 0; 12262 rsm = NULL; 12263 if (flags & TH_RST) { 12264 SOCKBUF_LOCK(sb); 12265 goto send; 12266 } 12267 if (rack->r_ctl.rc_resend) { 12268 /* Retransmit timer */ 12269 rsm = rack->r_ctl.rc_resend; 12270 rack->r_ctl.rc_resend = 
NULL; 12271 rsm->r_flags &= ~RACK_TLP; 12272 len = rsm->r_end - rsm->r_start; 12273 sack_rxmit = 1; 12274 sendalot = 0; 12275 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 12276 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 12277 __func__, __LINE__, 12278 rsm->r_start, tp->snd_una, tp, rack, rsm)); 12279 sb_offset = rsm->r_start - tp->snd_una; 12280 if (len >= segsiz) 12281 len = segsiz; 12282 } else if ((rack->rc_in_persist == 0) && 12283 ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) { 12284 /* We have a retransmit that takes precedence */ 12285 rsm->r_flags &= ~RACK_TLP; 12286 if ((!IN_RECOVERY(tp->t_flags)) && 12287 ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) { 12288 /* Enter recovery if not induced by a time-out */ 12289 rack->r_ctl.rc_rsm_start = rsm->r_start; 12290 rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; 12291 rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; 12292 rack_cong_signal(tp, NULL, CC_NDUPACK); 12293 /* 12294 * When we enter recovery we need to assure we send 12295 * one packet. 12296 */ 12297 if (rack->rack_no_prr == 0) { 12298 rack->r_ctl.rc_prr_sndcnt = segsiz; 12299 rack_log_to_prr(rack, 13, 0); 12300 } 12301 } 12302 #ifdef INVARIANTS 12303 if (SEQ_LT(rsm->r_start, tp->snd_una)) { 12304 panic("Huh, tp:%p rack:%p rsm:%p start:%u < snd_una:%u\n", 12305 tp, rack, rsm, rsm->r_start, tp->snd_una); 12306 } 12307 #endif 12308 len = rsm->r_end - rsm->r_start; 12309 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 12310 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 12311 __func__, __LINE__, 12312 rsm->r_start, tp->snd_una, tp, rack, rsm)); 12313 sb_offset = rsm->r_start - tp->snd_una; 12314 /* Can we send it within the PRR boundary? */ 12315 if (rack->rack_no_prr == 0) { 12316 if ((rack->use_rack_rr == 0) && (len > rack->r_ctl.rc_prr_sndcnt)) { 12317 /* It does not fit */ 12318 if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) > len) && 12319 (rack->r_ctl.rc_prr_sndcnt < segsiz)) { 12320 /* 12321 * prr is less than a segment, we 12322 * have more acks due in besides 12323 * what we need to resend. Lets not send 12324 * to avoid sending small pieces of 12325 * what we need to retransmit. 12326 */ 12327 len = 0; 12328 goto just_return_nolock; 12329 } 12330 len = rack->r_ctl.rc_prr_sndcnt; 12331 } 12332 } 12333 sendalot = 0; 12334 if (len >= segsiz) 12335 len = segsiz; 12336 if (len > 0) { 12337 sub_from_prr = 1; 12338 sack_rxmit = 1; 12339 KMOD_TCPSTAT_INC(tcps_sack_rexmits); 12340 KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes, 12341 min(len, segsiz)); 12342 counter_u64_add(rack_rtm_prr_retran, 1); 12343 } 12344 } else if (rack->r_ctl.rc_tlpsend) { 12345 /* Tail loss probe */ 12346 long cwin; 12347 long tlen; 12348 12349 doing_tlp = 1; 12350 /* 12351 * Check if we can do a TLP with a RACK'd packet 12352 * this can happen if we are not doing the rack 12353 * cheat and we skipped to a TLP and it 12354 * went off. 12355 */ 12356 rsm = rack->r_ctl.rc_tlpsend; 12357 rsm->r_flags |= RACK_TLP; 12358 rack->r_ctl.rc_tlpsend = NULL; 12359 sack_rxmit = 1; 12360 tlen = rsm->r_end - rsm->r_start; 12361 if (tlen > segsiz) 12362 tlen = segsiz; 12363 KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), 12364 ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", 12365 __func__, __LINE__, 12366 rsm->r_start, tp->snd_una, tp, rack, rsm)); 12367 sb_offset = rsm->r_start - tp->snd_una; 12368 cwin = min(tp->snd_wnd, tlen); 12369 len = cwin; 12370 } 12371 /* 12372 * Enforce a connection sendmap count limit if set 12373 * as long as we are not retransmiting. 
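 *
 * Illustrative example (hypothetical limit): if
 * V_tcp_map_entries_limit were 1024 and this connection already
 * held 1024 sendmap entries, the check below would refuse to
 * queue any new (non-retransmit) data and bump the limited
 * counters until acknowledgements free up map entries.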
12374 */ 12375 if ((rsm == NULL) && 12376 (rack->do_detection == 0) && 12377 (V_tcp_map_entries_limit > 0) && 12378 (rack->r_ctl.rc_num_maps_alloced >= V_tcp_map_entries_limit)) { 12379 counter_u64_add(rack_to_alloc_limited, 1); 12380 if (!rack->alloc_limit_reported) { 12381 rack->alloc_limit_reported = 1; 12382 counter_u64_add(rack_alloc_limited_conns, 1); 12383 } 12384 goto just_return_nolock; 12385 } 12386 if (rsm && (rsm->r_flags & RACK_HAS_FIN)) { 12387 /* we are retransmitting the fin */ 12388 len--; 12389 if (len) { 12390 /* 12391 * When retransmitting data do *not* include the 12392 * FIN. This could happen from a TLP probe. 12393 */ 12394 flags &= ~TH_FIN; 12395 } 12396 } 12397 #ifdef INVARIANTS 12398 /* For debugging */ 12399 rack->r_ctl.rc_rsm_at_retran = rsm; 12400 #endif 12401 /* 12402 * Get standard flags, and add SYN or FIN if requested by 'hidden' 12403 * state flags. 12404 */ 12405 if (tp->t_flags & TF_NEEDFIN) 12406 flags |= TH_FIN; 12407 if (tp->t_flags & TF_NEEDSYN) 12408 flags |= TH_SYN; 12409 if ((sack_rxmit == 0) && (prefetch_rsm == 0)) { 12410 void *end_rsm; 12411 end_rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); 12412 if (end_rsm) 12413 kern_prefetch(end_rsm, &prefetch_rsm); 12414 prefetch_rsm = 1; 12415 } 12416 SOCKBUF_LOCK(sb); 12417 /* 12418 * If snd_nxt == snd_max and we have transmitted a FIN, the 12419 * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a 12420 * negative length. This can also occur when TCP opens up its 12421 * congestion window while receiving additional duplicate acks after 12422 * fast-retransmit because TCP will reset snd_nxt to snd_max after 12423 * the fast-retransmit. 12424 * 12425 * In the normal retransmit-FIN-only case, however, snd_nxt will be 12426 * set to snd_una, the sb_offset will be 0, and the length may wind 12427 * up 0. 12428 * 12429 * If sack_rxmit is true we are retransmitting from the scoreboard 12430 * in which case len is already set. 12431 */ 12432 if ((sack_rxmit == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) { 12433 uint32_t avail; 12434 12435 avail = sbavail(sb); 12436 if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail) 12437 sb_offset = tp->snd_nxt - tp->snd_una; 12438 else 12439 sb_offset = 0; 12440 if ((IN_RECOVERY(tp->t_flags) == 0) || rack->rack_no_prr) { 12441 if (rack->r_ctl.rc_tlp_new_data) { 12442 /* TLP is forcing out new data */ 12443 if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) { 12444 rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset); 12445 } 12446 if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd) 12447 len = tp->snd_wnd; 12448 else 12449 len = rack->r_ctl.rc_tlp_new_data; 12450 rack->r_ctl.rc_tlp_new_data = 0; 12451 new_data_tlp = doing_tlp = 1; 12452 } else 12453 len = rack_what_can_we_send(tp, rack, cwnd_to_use, avail, sb_offset); 12454 if (IN_RECOVERY(tp->t_flags) && (len > segsiz)) { 12455 /* 12456 * For prr=off, we need to send only 1 MSS 12457 * at a time. We do this because another sack could 12458 * be arriving that causes us to send retransmits and 12459 * we don't want to be on a long pace due to a larger send 12460 * that keeps us from sending out the retransmit. 12461 */ 12462 len = segsiz; 12463 } 12464 } else { 12465 uint32_t outstanding; 12466 12467 /* 12468 * We are inside of a SACK recovery episode and are 12469 * sending new data, having retransmitted all the 12470 * data possible so far in the scoreboard. 
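 *
 * Illustrative example (hypothetical numbers): with 2920 bytes
 * of PRR send credit (rc_prr_sndcnt) and 10000 bytes of unsent
 * data available, the code below offers at most 2920 bytes,
 * trims that to a single segment unless rc_prr_sendalot is set,
 * and skips a sub-MSS send entirely unless it would empty the
 * socket buffer.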
12471 */ 12472 outstanding = tp->snd_max - tp->snd_una; 12473 if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) { 12474 if (tp->snd_wnd > outstanding) { 12475 len = tp->snd_wnd - outstanding; 12476 /* Check to see if we have the data */ 12477 if ((sb_offset + len) > avail) { 12478 /* It does not all fit */ 12479 if (avail > sb_offset) 12480 len = avail - sb_offset; 12481 else 12482 len = 0; 12483 } 12484 } else 12485 len = 0; 12486 } else if (avail > sb_offset) 12487 len = avail - sb_offset; 12488 else 12489 len = 0; 12490 if (len > 0) { 12491 if (len > rack->r_ctl.rc_prr_sndcnt) 12492 len = rack->r_ctl.rc_prr_sndcnt; 12493 if (len > 0) { 12494 sub_from_prr = 1; 12495 counter_u64_add(rack_rtm_prr_newdata, 1); 12496 } 12497 } 12498 if (len > segsiz) { 12499 /* 12500 * We should never send more than a MSS when 12501 * retransmitting or sending new data in prr 12502 * mode unless the override flag is on. Most 12503 * likely the PRR algorithm is not going to 12504 * let us send a lot as well :-) 12505 */ 12506 if (rack->r_ctl.rc_prr_sendalot == 0) 12507 len = segsiz; 12508 } else if (len < segsiz) { 12509 /* 12510 * Do we send any? The idea here is if the 12511 * send empty's the socket buffer we want to 12512 * do it. However if not then lets just wait 12513 * for our prr_sndcnt to get bigger. 12514 */ 12515 long leftinsb; 12516 12517 leftinsb = sbavail(sb) - sb_offset; 12518 if (leftinsb > len) { 12519 /* This send does not empty the sb */ 12520 len = 0; 12521 } 12522 } 12523 } 12524 } else if (!TCPS_HAVEESTABLISHED(tp->t_state)) { 12525 /* 12526 * If you have not established 12527 * and are not doing FAST OPEN 12528 * no data please. 12529 */ 12530 if ((sack_rxmit == 0) && 12531 (!IS_FASTOPEN(tp->t_flags))){ 12532 len = 0; 12533 sb_offset = 0; 12534 } 12535 } 12536 if (prefetch_so_done == 0) { 12537 kern_prefetch(so, &prefetch_so_done); 12538 prefetch_so_done = 1; 12539 } 12540 /* 12541 * Lop off SYN bit if it has already been sent. However, if this is 12542 * SYN-SENT state and if segment contains data and if we don't know 12543 * that foreign host supports TAO, suppress sending segment. 12544 */ 12545 if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) && 12546 ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) { 12547 /* 12548 * When sending additional segments following a TFO SYN|ACK, 12549 * do not include the SYN bit. 12550 */ 12551 if (IS_FASTOPEN(tp->t_flags) && 12552 (tp->t_state == TCPS_SYN_RECEIVED)) 12553 flags &= ~TH_SYN; 12554 } 12555 /* 12556 * Be careful not to send data and/or FIN on SYN segments. This 12557 * measure is needed to prevent interoperability problems with not 12558 * fully conformant TCP implementations. 
12559 */ 12560 if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { 12561 len = 0; 12562 flags &= ~TH_FIN; 12563 } 12564 /* 12565 * On TFO sockets, ensure no data is sent in the following cases: 12566 * 12567 * - When retransmitting SYN|ACK on a passively-created socket 12568 * 12569 * - When retransmitting SYN on an actively created socket 12570 * 12571 * - When sending a zero-length cookie (cookie request) on an 12572 * actively created socket 12573 * 12574 * - When the socket is in the CLOSED state (RST is being sent) 12575 */ 12576 if (IS_FASTOPEN(tp->t_flags) && 12577 (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || 12578 ((tp->t_state == TCPS_SYN_SENT) && 12579 (tp->t_tfo_client_cookie_len == 0)) || 12580 (flags & TH_RST))) { 12581 sack_rxmit = 0; 12582 len = 0; 12583 } 12584 /* Without fast-open there should never be data sent on a SYN */ 12585 if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) { 12586 tp->snd_nxt = tp->iss; 12587 len = 0; 12588 } 12589 orig_len = len; 12590 if (len <= 0) { 12591 /* 12592 * If FIN has been sent but not acked, but we haven't been 12593 * called to retransmit, len will be < 0. Otherwise, window 12594 * shrank after we sent into it. If window shrank to 0, 12595 * cancel pending retransmit, pull snd_nxt back to (closed) 12596 * window, and set the persist timer if it isn't already 12597 * going. If the window didn't close completely, just wait 12598 * for an ACK. 12599 * 12600 * We also do a general check here to ensure that we will 12601 * set the persist timer when we have data to send, but a 12602 * 0-byte window. This makes sure the persist timer is set 12603 * even if the packet hits one of the "goto send" lines 12604 * below. 12605 */ 12606 len = 0; 12607 if ((tp->snd_wnd == 0) && 12608 (TCPS_HAVEESTABLISHED(tp->t_state)) && 12609 (tp->snd_una == tp->snd_max) && 12610 (sb_offset < (int)sbavail(sb))) { 12611 tp->snd_nxt = tp->snd_una; 12612 rack_enter_persist(tp, rack, cts); 12613 } 12614 } else if ((rsm == NULL) && 12615 ((doing_tlp == 0) || (new_data_tlp == 1)) && 12616 (len < rack->r_ctl.rc_pace_max_segs)) { 12617 /* 12618 * We are not sending a maximum sized segment for 12619 * some reason. Should we not send anything (think 12620 * sws or persists)? 12621 */ 12622 if ((tp->snd_wnd < min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)), minseg)) && 12623 (TCPS_HAVEESTABLISHED(tp->t_state)) && 12624 (len < minseg) && 12625 (len < (int)(sbavail(sb) - sb_offset))) { 12626 /* 12627 * Here the rwnd is less than 12628 * the minimum pacing size, this is not a retransmit, 12629 * we are established and 12630 * the send is not the last in the socket buffer 12631 * we send nothing, and we may enter persists 12632 * if nothing is outstanding. 12633 */ 12634 len = 0; 12635 if (tp->snd_max == tp->snd_una) { 12636 /* 12637 * Nothing out we can 12638 * go into persists. 12639 */ 12640 rack_enter_persist(tp, rack, cts); 12641 tp->snd_nxt = tp->snd_una; 12642 } 12643 } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) && 12644 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && 12645 (len < (int)(sbavail(sb) - sb_offset)) && 12646 (len < minseg)) { 12647 /* 12648 * Here we are not retransmitting, and 12649 * the cwnd is not so small that we could 12650 * not send at least a min size (rxt timer 12651 * not having gone off), We have 2 segments or 12652 * more already in flight, its not the tail end 12653 * of the socket buffer and the cwnd is blocking 12654 * us from sending out a minimum pacing segment size. 12655 * Lets not send anything. 
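 *
 * Illustrative example (hypothetical numbers): with a cwnd of
 * four or more segments, three segments already in flight, and
 * the congestion window leaving room for only 700 more bytes
 * that would not finish off the socket buffer, we hold off and
 * wait for acknowledgements rather than emit a runt that
 * defeats pacing.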
12656 */ 12657 len = 0; 12658 } else if (((tp->snd_wnd - ctf_outstanding(tp)) < 12659 min((rack->r_ctl.rc_high_rwnd/2), minseg)) && 12660 (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && 12661 (len < (int)(sbavail(sb) - sb_offset)) && 12662 (TCPS_HAVEESTABLISHED(tp->t_state))) { 12663 /* 12664 * Here we have a send window but we have 12665 * filled it up and we can't send another pacing segment. 12666 * We also have in flight more than 2 segments 12667 * and we are not completing the sb i.e. we allow 12668 * the last bytes of the sb to go out even if 12669 * its not a full pacing segment. 12670 */ 12671 len = 0; 12672 } 12673 } 12674 /* len will be >= 0 after this point. */ 12675 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 12676 tcp_sndbuf_autoscale(tp, so, min(tp->snd_wnd, cwnd_to_use)); 12677 /* 12678 * Decide if we can use TCP Segmentation Offloading (if supported by 12679 * hardware). 12680 * 12681 * TSO may only be used if we are in a pure bulk sending state. The 12682 * presence of TCP-MD5, SACK retransmits, SACK advertizements and IP 12683 * options prevent using TSO. With TSO the TCP header is the same 12684 * (except for the sequence number) for all generated packets. This 12685 * makes it impossible to transmit any options which vary per 12686 * generated segment or packet. 12687 * 12688 * IPv4 handling has a clear separation of ip options and ip header 12689 * flags while IPv6 combines both in in6p_outputopts. ip6_optlen() does 12690 * the right thing below to provide length of just ip options and thus 12691 * checking for ipoptlen is enough to decide if ip options are present. 12692 */ 12693 12694 #ifdef INET6 12695 if (isipv6) 12696 ipoptlen = ip6_optlen(tp->t_inpcb); 12697 else 12698 #endif 12699 if (tp->t_inpcb->inp_options) 12700 ipoptlen = tp->t_inpcb->inp_options->m_len - 12701 offsetof(struct ipoption, ipopt_list); 12702 else 12703 ipoptlen = 0; 12704 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 12705 /* 12706 * Pre-calculate here as we save another lookup into the darknesses 12707 * of IPsec that way and can actually decide if TSO is ok. 12708 */ 12709 #ifdef INET6 12710 if (isipv6 && IPSEC_ENABLED(ipv6)) 12711 ipsec_optlen = IPSEC_HDRSIZE(ipv6, tp->t_inpcb); 12712 #ifdef INET 12713 else 12714 #endif 12715 #endif /* INET6 */ 12716 #ifdef INET 12717 if (IPSEC_ENABLED(ipv4)) 12718 ipsec_optlen = IPSEC_HDRSIZE(ipv4, tp->t_inpcb); 12719 #endif /* INET */ 12720 #endif 12721 12722 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 12723 ipoptlen += ipsec_optlen; 12724 #endif 12725 if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > segsiz && 12726 (tp->t_port == 0) && 12727 ((tp->t_flags & TF_SIGNATURE) == 0) && 12728 tp->rcv_numsacks == 0 && sack_rxmit == 0 && 12729 ipoptlen == 0) 12730 tso = 1; 12731 { 12732 uint32_t outstanding; 12733 12734 outstanding = tp->snd_max - tp->snd_una; 12735 if (tp->t_flags & TF_SENTFIN) { 12736 /* 12737 * If we sent a fin, snd_max is 1 higher than 12738 * snd_una 12739 */ 12740 outstanding--; 12741 } 12742 if (sack_rxmit) { 12743 if ((rsm->r_flags & RACK_HAS_FIN) == 0) 12744 flags &= ~TH_FIN; 12745 } else { 12746 if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + 12747 sbused(sb))) 12748 flags &= ~TH_FIN; 12749 } 12750 } 12751 recwin = sbspace(&so->so_rcv); 12752 12753 /* 12754 * Sender silly window avoidance. 
We transmit under the following 12755 * conditions when len is non-zero: 12756 * 12757 * - We have a full segment (or more with TSO) - This is the last 12758 * buffer in a write()/send() and we are either idle or running 12759 * NODELAY - we've timed out (e.g. persist timer) - we have more 12760 * then 1/2 the maximum send window's worth of data (receiver may be 12761 * limited the window size) - we need to retransmit 12762 */ 12763 if (len) { 12764 if (len >= segsiz) { 12765 goto send; 12766 } 12767 /* 12768 * NOTE! on localhost connections an 'ack' from the remote 12769 * end may occur synchronously with the output and cause us 12770 * to flush a buffer queued with moretocome. XXX 12771 * 12772 */ 12773 if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ 12774 (idle || (tp->t_flags & TF_NODELAY)) && 12775 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && 12776 (tp->t_flags & TF_NOPUSH) == 0) { 12777 pass = 2; 12778 goto send; 12779 } 12780 if ((tp->snd_una == tp->snd_max) && len) { /* Nothing outstanding */ 12781 pass = 22; 12782 goto send; 12783 } 12784 if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) { 12785 pass = 4; 12786 goto send; 12787 } 12788 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */ 12789 pass = 5; 12790 goto send; 12791 } 12792 if (sack_rxmit) { 12793 pass = 6; 12794 goto send; 12795 } 12796 if (((tp->snd_wnd - ctf_outstanding(tp)) < segsiz) && 12797 (ctf_outstanding(tp) < (segsiz * 2))) { 12798 /* 12799 * We have less than two MSS outstanding (delayed ack) 12800 * and our rwnd will not let us send a full sized 12801 * MSS. Lets go ahead and let this small segment 12802 * out because we want to try to have at least two 12803 * packets inflight to not be caught by delayed ack. 12804 */ 12805 pass = 12; 12806 goto send; 12807 } 12808 } 12809 /* 12810 * Sending of standalone window updates. 12811 * 12812 * Window updates are important when we close our window due to a 12813 * full socket buffer and are opening it again after the application 12814 * reads data from it. Once the window has opened again and the 12815 * remote end starts to send again the ACK clock takes over and 12816 * provides the most current window information. 12817 * 12818 * We must avoid the silly window syndrome whereas every read from 12819 * the receive buffer, no matter how small, causes a window update 12820 * to be sent. We also should avoid sending a flurry of window 12821 * updates when the socket buffer had queued a lot of data and the 12822 * application is doing small reads. 12823 * 12824 * Prevent a flurry of pointless window updates by only sending an 12825 * update when we can increase the advertized window by more than 12826 * 1/4th of the socket buffer capacity. When the buffer is getting 12827 * full or is very small be more aggressive and send an update 12828 * whenever we can increase by two mss sized segments. In all other 12829 * situations the ACK's to new incoming data will carry further 12830 * window increases. 12831 * 12832 * Don't send an independent window update if a delayed ACK is 12833 * pending (it will get piggy-backed on it) or the remote side 12834 * already has done a half-close and won't send more data. Skip 12835 * this if the connection is in T/TCP half-open state. 
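 *
 * Illustrative example (hypothetical numbers): with a 64 KB
 * receive buffer and a 1460 byte MSS, a standalone update goes
 * out only once the window can open by at least 16 KB (a
 * quarter of the buffer), or by two MSS once less than an
 * eighth of the buffer remains free; smaller openings simply
 * ride along on the next data-driven ACK.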
12836 */ 12837 if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && 12838 !(tp->t_flags & TF_DELACK) && 12839 !TCPS_HAVERCVDFIN(tp->t_state)) { 12840 /* 12841 * "adv" is the amount we could increase the window, taking 12842 * into account that we are limited by TCP_MAXWIN << 12843 * tp->rcv_scale. 12844 */ 12845 int32_t adv; 12846 int oldwin; 12847 12848 adv = recwin; 12849 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) { 12850 oldwin = (tp->rcv_adv - tp->rcv_nxt); 12851 if (adv > oldwin) 12852 adv -= oldwin; 12853 else { 12854 /* We can't increase the window */ 12855 adv = 0; 12856 } 12857 } else 12858 oldwin = 0; 12859 12860 /* 12861 * If the new window size ends up being the same as or less 12862 * than the old size when it is scaled, then don't force 12863 * a window update. 12864 */ 12865 if (oldwin >> tp->rcv_scale >= (adv + oldwin) >> tp->rcv_scale) 12866 goto dontupdate; 12867 12868 if (adv >= (int32_t)(2 * segsiz) && 12869 (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || 12870 recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) || 12871 so->so_rcv.sb_hiwat <= 8 * segsiz)) { 12872 pass = 7; 12873 goto send; 12874 } 12875 if (2 * adv >= (int32_t) so->so_rcv.sb_hiwat) { 12876 pass = 23; 12877 goto send; 12878 } 12879 } 12880 dontupdate: 12881 12882 /* 12883 * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW 12884 * is also a catch-all for the retransmit timer timeout case. 12885 */ 12886 if (tp->t_flags & TF_ACKNOW) { 12887 pass = 8; 12888 goto send; 12889 } 12890 if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) { 12891 pass = 9; 12892 goto send; 12893 } 12894 /* 12895 * If our state indicates that FIN should be sent and we have not 12896 * yet done so, then we need to send. 12897 */ 12898 if ((flags & TH_FIN) && 12899 (tp->snd_nxt == tp->snd_una)) { 12900 pass = 11; 12901 goto send; 12902 } 12903 /* 12904 * No reason to send a segment, just return. 12905 */ 12906 just_return: 12907 SOCKBUF_UNLOCK(sb); 12908 just_return_nolock: 12909 { 12910 int app_limited = CTF_JR_SENT_DATA; 12911 12912 if (tot_len_this_send > 0) { 12913 /* Make sure snd_nxt is up to max */ 12914 if (SEQ_GT(tp->snd_max, tp->snd_nxt)) 12915 tp->snd_nxt = tp->snd_max; 12916 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz); 12917 } else { 12918 int end_window = 0; 12919 uint32_t seq = tp->gput_ack; 12920 12921 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 12922 if (rsm) { 12923 /* 12924 * Mark the last sent that we just-returned (hinting 12925 * that delayed ack may play a role in any rtt measurement). 12926 */ 12927 rsm->r_just_ret = 1; 12928 } 12929 counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); 12930 rack->r_ctl.rc_agg_delayed = 0; 12931 rack->r_early = 0; 12932 rack->r_late = 0; 12933 rack->r_ctl.rc_agg_early = 0; 12934 if ((ctf_outstanding(tp) + 12935 min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)), 12936 minseg)) >= tp->snd_wnd) { 12937 /* We are limited by the rwnd */ 12938 app_limited = CTF_JR_RWND_LIMITED; 12939 } else if (ctf_outstanding(tp) >= sbavail(sb)) { 12940 /* We are limited by whats available -- app limited */ 12941 app_limited = CTF_JR_APP_LIMITED; 12942 } else if ((idle == 0) && 12943 ((tp->t_flags & TF_NODELAY) == 0) && 12944 ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && 12945 (len < segsiz)) { 12946 /* 12947 * No delay is not on and the 12948 * user is sending less than 1MSS. This 12949 * brings out SWS avoidance so we 12950 * don't send. Another app-limited case. 
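 *
 * In effect the app-limited sendmap entries form a chain: each
 * previous tail records the start sequence of the newer entry
 * in r_nseq_appl, and rc_end_appl is advanced so it always
 * points at the most recently marked entry.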
12951 */ 12952 app_limited = CTF_JR_APP_LIMITED; 12953 } else if (tp->t_flags & TF_NOPUSH) { 12954 /* 12955 * The user has requested no push of 12956 * the last segment and we are 12957 * at the last segment. Another app 12958 * limited case. 12959 */ 12960 app_limited = CTF_JR_APP_LIMITED; 12961 } else if ((ctf_outstanding(tp) + minseg) > cwnd_to_use) { 12962 /* Its the cwnd */ 12963 app_limited = CTF_JR_CWND_LIMITED; 12964 } else if (rack->rc_in_persist == 1) { 12965 /* We are in persists */ 12966 app_limited = CTF_JR_PERSISTS; 12967 } else if (IN_RECOVERY(tp->t_flags) && 12968 (rack->rack_no_prr == 0) && 12969 (rack->r_ctl.rc_prr_sndcnt < segsiz)) { 12970 app_limited = CTF_JR_PRR; 12971 } else { 12972 /* Now why here are we not sending? */ 12973 #ifdef NOW 12974 #ifdef INVARIANTS 12975 panic("rack:%p hit JR_ASSESSING case cwnd_to_use:%u?", rack, cwnd_to_use); 12976 #endif 12977 #endif 12978 app_limited = CTF_JR_ASSESSING; 12979 } 12980 /* 12981 * App limited in some fashion, for our pacing GP 12982 * measurements we don't want any gap (even cwnd). 12983 * Close down the measurement window. 12984 */ 12985 if (rack_cwnd_block_ends_measure && 12986 ((app_limited == CTF_JR_CWND_LIMITED) || 12987 (app_limited == CTF_JR_PRR))) { 12988 /* 12989 * The reason we are not sending is 12990 * the cwnd (or prr). We have been configured 12991 * to end the measurement window in 12992 * this case. 12993 */ 12994 end_window = 1; 12995 } else if (app_limited == CTF_JR_PERSISTS) { 12996 /* 12997 * We never end the measurement window 12998 * in persists, though in theory we 12999 * should be only entering after everything 13000 * is acknowledged (so we will probably 13001 * never come here). 13002 */ 13003 end_window = 0; 13004 } else if (rack_rwnd_block_ends_measure && 13005 (app_limited == CTF_JR_RWND_LIMITED)) { 13006 /* 13007 * We are rwnd limited and have been 13008 * configured to end the measurement 13009 * window in this case. 13010 */ 13011 end_window = 1; 13012 } else if (app_limited == CTF_JR_APP_LIMITED) { 13013 /* 13014 * A true application limited period, we have 13015 * ran out of data. 13016 */ 13017 end_window = 1; 13018 } else if (app_limited == CTF_JR_ASSESSING) { 13019 /* 13020 * In the assessing case we hit the end of 13021 * the if/else and had no known reason 13022 * This will panic us under invariants.. 13023 * 13024 * If we get this out in logs we need to 13025 * investagate which reason we missed. 13026 */ 13027 end_window = 1; 13028 } 13029 if (end_window) { 13030 uint8_t log = 0; 13031 13032 if ((tp->t_flags & TF_GPUTINPROG) && 13033 SEQ_GT(tp->gput_ack, tp->snd_max)) { 13034 /* Mark the last packet has app limited */ 13035 tp->gput_ack = tp->snd_max; 13036 log = 1; 13037 } 13038 rsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); 13039 if (rsm && ((rsm->r_flags & RACK_APP_LIMITED) == 0)) { 13040 if (rack->r_ctl.rc_app_limited_cnt == 0) 13041 rack->r_ctl.rc_end_appl = rack->r_ctl.rc_first_appl = rsm; 13042 else { 13043 /* 13044 * Go out to the end app limited and mark 13045 * this new one as next and move the end_appl up 13046 * to this guy. 
13047 */ 13048 if (rack->r_ctl.rc_end_appl) 13049 rack->r_ctl.rc_end_appl->r_nseq_appl = rsm->r_start; 13050 rack->r_ctl.rc_end_appl = rsm; 13051 } 13052 rsm->r_flags |= RACK_APP_LIMITED; 13053 rack->r_ctl.rc_app_limited_cnt++; 13054 } 13055 if (log) 13056 rack_log_pacing_delay_calc(rack, 13057 rack->r_ctl.rc_app_limited_cnt, seq, 13058 tp->gput_ack, 0, 0, 4, __LINE__, NULL); 13059 } 13060 } 13061 if (slot) { 13062 /* set the rack tcb into the slot N */ 13063 counter_u64_add(rack_paced_segments, 1); 13064 } else if (tot_len_this_send) { 13065 counter_u64_add(rack_unpaced_segments, 1); 13066 } 13067 /* Check if we need to go into persists or not */ 13068 if ((rack->rc_in_persist == 0) && 13069 (tp->snd_max == tp->snd_una) && 13070 TCPS_HAVEESTABLISHED(tp->t_state) && 13071 sbavail(sb) && 13072 (sbavail(sb) > tp->snd_wnd) && 13073 (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg))) { 13074 /* Yes lets make sure to move to persist before timer-start */ 13075 rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); 13076 } 13077 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack); 13078 rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use); 13079 } 13080 #ifdef NETFLIX_SHARED_CWND 13081 if ((sbavail(sb) == 0) && 13082 rack->r_ctl.rc_scw) { 13083 tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); 13084 rack->rack_scwnd_is_idle = 1; 13085 } 13086 #endif 13087 return (0); 13088 13089 send: 13090 if ((flags & TH_FIN) && 13091 sbavail(sb)) { 13092 /* 13093 * We do not transmit a FIN 13094 * with data outstanding. We 13095 * need to make it so all data 13096 * is acked first. 13097 */ 13098 flags &= ~TH_FIN; 13099 } 13100 /* Enforce stack imposed max seg size if we have one */ 13101 if (rack->r_ctl.rc_pace_max_segs && 13102 (len > rack->r_ctl.rc_pace_max_segs)) { 13103 mark = 1; 13104 len = rack->r_ctl.rc_pace_max_segs; 13105 } 13106 SOCKBUF_LOCK_ASSERT(sb); 13107 if (len > 0) { 13108 if (len >= segsiz) 13109 tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; 13110 else 13111 tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; 13112 } 13113 /* 13114 * Before ESTABLISHED, force sending of initial options unless TCP 13115 * set not to do any options. NOTE: we assume that the IP/TCP header 13116 * plus TCP options always fit in a single mbuf, leaving room for a 13117 * maximum link header, i.e. max_linkhdr + sizeof (struct tcpiphdr) 13118 * + optlen <= MCLBYTES 13119 */ 13120 optlen = 0; 13121 #ifdef INET6 13122 if (isipv6) 13123 hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); 13124 else 13125 #endif 13126 hdrlen = sizeof(struct tcpiphdr); 13127 13128 /* 13129 * Compute options for segment. We only have to care about SYN and 13130 * established connection segments. Options for SYN-ACK segments 13131 * are handled in TCP syncache. 13132 */ 13133 to.to_flags = 0; 13134 if ((tp->t_flags & TF_NOOPT) == 0) { 13135 /* Maximum segment size. */ 13136 if (flags & TH_SYN) { 13137 tp->snd_nxt = tp->iss; 13138 to.to_mss = tcp_mssopt(&inp->inp_inc); 13139 #ifdef NETFLIX_TCPOUDP 13140 if (tp->t_port) 13141 to.to_mss -= V_tcp_udp_tunneling_overhead; 13142 #endif 13143 to.to_flags |= TOF_MSS; 13144 13145 /* 13146 * On SYN or SYN|ACK transmits on TFO connections, 13147 * only include the TFO option if it is not a 13148 * retransmit, as the presence of the TFO option may 13149 * have caused the original SYN or SYN|ACK to have 13150 * been dropped by a middlebox. 
13151 */ 13152 if (IS_FASTOPEN(tp->t_flags) && 13153 (tp->t_rxtshift == 0)) { 13154 if (tp->t_state == TCPS_SYN_RECEIVED) { 13155 to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; 13156 to.to_tfo_cookie = 13157 (u_int8_t *)&tp->t_tfo_cookie.server; 13158 to.to_flags |= TOF_FASTOPEN; 13159 wanted_cookie = 1; 13160 } else if (tp->t_state == TCPS_SYN_SENT) { 13161 to.to_tfo_len = 13162 tp->t_tfo_client_cookie_len; 13163 to.to_tfo_cookie = 13164 tp->t_tfo_cookie.client; 13165 to.to_flags |= TOF_FASTOPEN; 13166 wanted_cookie = 1; 13167 /* 13168 * If we wind up having more data to 13169 * send with the SYN than can fit in 13170 * one segment, don't send any more 13171 * until the SYN|ACK comes back from 13172 * the other end. 13173 */ 13174 sendalot = 0; 13175 } 13176 } 13177 } 13178 /* Window scaling. */ 13179 if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { 13180 to.to_wscale = tp->request_r_scale; 13181 to.to_flags |= TOF_SCALE; 13182 } 13183 /* Timestamps. */ 13184 if ((tp->t_flags & TF_RCVD_TSTMP) || 13185 ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { 13186 to.to_tsval = cts + tp->ts_offset; 13187 to.to_tsecr = tp->ts_recent; 13188 to.to_flags |= TOF_TS; 13189 } 13190 /* Set receive buffer autosizing timestamp. */ 13191 if (tp->rfbuf_ts == 0 && 13192 (so->so_rcv.sb_flags & SB_AUTOSIZE)) 13193 tp->rfbuf_ts = tcp_ts_getticks(); 13194 /* Selective ACK's. */ 13195 if (flags & TH_SYN) 13196 to.to_flags |= TOF_SACKPERM; 13197 else if (TCPS_HAVEESTABLISHED(tp->t_state) && 13198 tp->rcv_numsacks > 0) { 13199 to.to_flags |= TOF_SACK; 13200 to.to_nsacks = tp->rcv_numsacks; 13201 to.to_sacks = (u_char *)tp->sackblks; 13202 } 13203 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 13204 /* TCP-MD5 (RFC2385). */ 13205 if (tp->t_flags & TF_SIGNATURE) 13206 to.to_flags |= TOF_SIGNATURE; 13207 #endif /* TCP_SIGNATURE */ 13208 13209 /* Processing the options. */ 13210 hdrlen += optlen = tcp_addoptions(&to, opt); 13211 /* 13212 * If we wanted a TFO option to be added, but it was unable 13213 * to fit, ensure no data is sent. 13214 */ 13215 if (IS_FASTOPEN(tp->t_flags) && wanted_cookie && 13216 !(to.to_flags & TOF_FASTOPEN)) 13217 len = 0; 13218 } 13219 #ifdef NETFLIX_TCPOUDP 13220 if (tp->t_port) { 13221 if (V_tcp_udp_tunneling_port == 0) { 13222 /* The port was removed?? */ 13223 SOCKBUF_UNLOCK(&so->so_snd); 13224 return (EHOSTUNREACH); 13225 } 13226 hdrlen += sizeof(struct udphdr); 13227 } 13228 #endif 13229 #ifdef INET6 13230 if (isipv6) 13231 ipoptlen = ip6_optlen(tp->t_inpcb); 13232 else 13233 #endif 13234 if (tp->t_inpcb->inp_options) 13235 ipoptlen = tp->t_inpcb->inp_options->m_len - 13236 offsetof(struct ipoption, ipopt_list); 13237 else 13238 ipoptlen = 0; 13239 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 13240 ipoptlen += ipsec_optlen; 13241 #endif 13242 13243 #ifdef KERN_TLS 13244 /* force TSO for so TLS offload can get mss */ 13245 if (sb->sb_flags & SB_TLS_IFNET) { 13246 force_tso = 1; 13247 } 13248 #endif 13249 /* 13250 * Adjust data length if insertion of options will bump the packet 13251 * length beyond the t_maxseg length. Clear the FIN bit because we 13252 * cut off the tail of the segment. 
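 *
 * Illustrative example (hypothetical numbers): with
 * t_maxseg = 1460 and 12 bytes of timestamp options, a non-TSO
 * send is cut below to 1460 - 12 = 1448 bytes of payload; with
 * TSO the length is instead rounded down to a multiple of the
 * usable segment size (1448 here) unless this send reaches the
 * end of the socket buffer or hardware TLS is in use.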
13253 */ 13254 if (len + optlen + ipoptlen > tp->t_maxseg) { 13255 if (tso) { 13256 uint32_t if_hw_tsomax; 13257 uint32_t moff; 13258 int32_t max_len; 13259 13260 /* extract TSO information */ 13261 if_hw_tsomax = tp->t_tsomax; 13262 if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; 13263 if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; 13264 KASSERT(ipoptlen == 0, 13265 ("%s: TSO can't do IP options", __func__)); 13266 13267 /* 13268 * Check if we should limit by maximum payload 13269 * length: 13270 */ 13271 if (if_hw_tsomax != 0) { 13272 /* compute maximum TSO length */ 13273 max_len = (if_hw_tsomax - hdrlen - 13274 max_linkhdr); 13275 if (max_len <= 0) { 13276 len = 0; 13277 } else if (len > max_len) { 13278 sendalot = 1; 13279 len = max_len; 13280 mark = 2; 13281 } 13282 } 13283 /* 13284 * Prevent the last segment from being fractional 13285 * unless the send sockbuf can be emptied: 13286 */ 13287 max_len = (tp->t_maxseg - optlen); 13288 if (((sb_offset + len) < sbavail(sb)) && 13289 (hw_tls == 0)) { 13290 moff = len % (u_int)max_len; 13291 if (moff != 0) { 13292 mark = 3; 13293 len -= moff; 13294 } 13295 } 13296 /* 13297 * In case there are too many small fragments don't 13298 * use TSO: 13299 */ 13300 if (len <= segsiz) { 13301 mark = 4; 13302 tso = 0; 13303 } 13304 /* 13305 * Send the FIN in a separate segment after the bulk 13306 * sending is done. We don't trust the TSO 13307 * implementations to clear the FIN flag on all but 13308 * the last segment. 13309 */ 13310 if (tp->t_flags & TF_NEEDFIN) { 13311 sendalot = 4; 13312 } 13313 } else { 13314 mark = 5; 13315 if (optlen + ipoptlen >= tp->t_maxseg) { 13316 /* 13317 * Since we don't have enough space to put 13318 * the IP header chain and the TCP header in 13319 * one packet as required by RFC 7112, don't 13320 * send it. Also ensure that at least one 13321 * byte of the payload can be put into the 13322 * TCP segment. 13323 */ 13324 SOCKBUF_UNLOCK(&so->so_snd); 13325 error = EMSGSIZE; 13326 sack_rxmit = 0; 13327 goto out; 13328 } 13329 len = tp->t_maxseg - optlen - ipoptlen; 13330 sendalot = 5; 13331 } 13332 } else { 13333 tso = 0; 13334 mark = 6; 13335 } 13336 KASSERT(len + hdrlen + ipoptlen <= IP_MAXPACKET, 13337 ("%s: len > IP_MAXPACKET", __func__)); 13338 #ifdef DIAGNOSTIC 13339 #ifdef INET6 13340 if (max_linkhdr + hdrlen > MCLBYTES) 13341 #else 13342 if (max_linkhdr + hdrlen > MHLEN) 13343 #endif 13344 panic("tcphdr too big"); 13345 #endif 13346 13347 /* 13348 * This KASSERT is here to catch edge cases at a well defined place. 13349 * Before, those had triggered (random) panic conditions further 13350 * down. 13351 */ 13352 KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); 13353 if ((len == 0) && 13354 (flags & TH_FIN) && 13355 (sbused(sb))) { 13356 /* 13357 * We have outstanding data, don't send a fin by itself!. 13358 */ 13359 goto just_return; 13360 } 13361 /* 13362 * Grab a header mbuf, attaching a copy of data to be transmitted, 13363 * and initialize the header from the template for sends on this 13364 * connection. 13365 */ 13366 if (len) { 13367 uint32_t max_val; 13368 uint32_t moff; 13369 13370 if (rack->r_ctl.rc_pace_max_segs) 13371 max_val = rack->r_ctl.rc_pace_max_segs; 13372 else if (rack->rc_user_set_max_segs) 13373 max_val = rack->rc_user_set_max_segs * segsiz; 13374 else 13375 max_val = len; 13376 /* 13377 * We allow a limit on sending with hptsi. 
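 *
 * Illustrative example (hypothetical numbers): if the pacer's
 * burst limit (rc_pace_max_segs) works out to 10 * 1460 = 14600
 * bytes and 40000 bytes are otherwise eligible, only 14600
 * bytes go out in this pass.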
13378 */ 13379 if (len > max_val) { 13380 mark = 7; 13381 len = max_val; 13382 } 13383 #ifdef INET6 13384 if (MHLEN < hdrlen + max_linkhdr) 13385 m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); 13386 else 13387 #endif 13388 m = m_gethdr(M_NOWAIT, MT_DATA); 13389 13390 if (m == NULL) { 13391 SOCKBUF_UNLOCK(sb); 13392 error = ENOBUFS; 13393 sack_rxmit = 0; 13394 goto out; 13395 } 13396 m->m_data += max_linkhdr; 13397 m->m_len = hdrlen; 13398 13399 /* 13400 * Start the m_copy functions from the closest mbuf to the 13401 * sb_offset in the socket buffer chain. 13402 */ 13403 mb = sbsndptr_noadv(sb, sb_offset, &moff); 13404 if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) { 13405 m_copydata(mb, moff, (int)len, 13406 mtod(m, caddr_t)+hdrlen); 13407 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 13408 sbsndptr_adv(sb, mb, len); 13409 m->m_len += len; 13410 } else { 13411 struct sockbuf *msb; 13412 13413 if (SEQ_LT(tp->snd_nxt, tp->snd_max)) 13414 msb = NULL; 13415 else 13416 msb = sb; 13417 m->m_next = tcp_m_copym( 13418 mb, moff, &len, 13419 if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, 13420 ((rsm == NULL) ? hw_tls : 0) 13421 #ifdef NETFLIX_COPY_ARGS 13422 , &filled_all 13423 #endif 13424 ); 13425 if (len <= (tp->t_maxseg - optlen)) { 13426 /* 13427 * Must have ran out of mbufs for the copy 13428 * shorten it to no longer need tso. Lets 13429 * not put on sendalot since we are low on 13430 * mbufs. 13431 */ 13432 tso = 0; 13433 } 13434 if (m->m_next == NULL) { 13435 SOCKBUF_UNLOCK(sb); 13436 (void)m_free(m); 13437 error = ENOBUFS; 13438 sack_rxmit = 0; 13439 goto out; 13440 } 13441 } 13442 if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { 13443 if (rsm && (rsm->r_flags & RACK_TLP)) { 13444 /* 13445 * TLP should not count in retran count, but 13446 * in its own bin 13447 */ 13448 counter_u64_add(rack_tlp_retran, 1); 13449 counter_u64_add(rack_tlp_retran_bytes, len); 13450 } else { 13451 tp->t_sndrexmitpack++; 13452 KMOD_TCPSTAT_INC(tcps_sndrexmitpack); 13453 KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len); 13454 } 13455 #ifdef STATS 13456 stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, 13457 len); 13458 #endif 13459 } else { 13460 KMOD_TCPSTAT_INC(tcps_sndpack); 13461 KMOD_TCPSTAT_ADD(tcps_sndbyte, len); 13462 #ifdef STATS 13463 stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, 13464 len); 13465 #endif 13466 } 13467 /* 13468 * If we're sending everything we've got, set PUSH. (This 13469 * will keep happy those implementations which only give 13470 * data to the user when a buffer fills or a PUSH comes in.) 
13471 */ 13472 if (sb_offset + len == sbused(sb) && 13473 sbused(sb) && 13474 !(flags & TH_SYN)) 13475 flags |= TH_PUSH; 13476 13477 SOCKBUF_UNLOCK(sb); 13478 } else { 13479 SOCKBUF_UNLOCK(sb); 13480 if (tp->t_flags & TF_ACKNOW) 13481 KMOD_TCPSTAT_INC(tcps_sndacks); 13482 else if (flags & (TH_SYN | TH_FIN | TH_RST)) 13483 KMOD_TCPSTAT_INC(tcps_sndctrl); 13484 else 13485 KMOD_TCPSTAT_INC(tcps_sndwinup); 13486 13487 m = m_gethdr(M_NOWAIT, MT_DATA); 13488 if (m == NULL) { 13489 error = ENOBUFS; 13490 sack_rxmit = 0; 13491 goto out; 13492 } 13493 #ifdef INET6 13494 if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && 13495 MHLEN >= hdrlen) { 13496 M_ALIGN(m, hdrlen); 13497 } else 13498 #endif 13499 m->m_data += max_linkhdr; 13500 m->m_len = hdrlen; 13501 } 13502 SOCKBUF_UNLOCK_ASSERT(sb); 13503 m->m_pkthdr.rcvif = (struct ifnet *)0; 13504 #ifdef MAC 13505 mac_inpcb_create_mbuf(inp, m); 13506 #endif 13507 #ifdef INET6 13508 if (isipv6) { 13509 ip6 = mtod(m, struct ip6_hdr *); 13510 #ifdef NETFLIX_TCPOUDP 13511 if (tp->t_port) { 13512 udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); 13513 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 13514 udp->uh_dport = tp->t_port; 13515 ulen = hdrlen + len - sizeof(struct ip6_hdr); 13516 udp->uh_ulen = htons(ulen); 13517 th = (struct tcphdr *)(udp + 1); 13518 } else 13519 #endif 13520 th = (struct tcphdr *)(ip6 + 1); 13521 tcpip_fillheaders(inp, 13522 #ifdef NETFLIX_TCPOUDP 13523 tp->t_port, 13524 #endif 13525 ip6, th); 13526 } else 13527 #endif /* INET6 */ 13528 { 13529 ip = mtod(m, struct ip *); 13530 #ifdef TCPDEBUG 13531 ipov = (struct ipovly *)ip; 13532 #endif 13533 #ifdef NETFLIX_TCPOUDP 13534 if (tp->t_port) { 13535 udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); 13536 udp->uh_sport = htons(V_tcp_udp_tunneling_port); 13537 udp->uh_dport = tp->t_port; 13538 ulen = hdrlen + len - sizeof(struct ip); 13539 udp->uh_ulen = htons(ulen); 13540 th = (struct tcphdr *)(udp + 1); 13541 } else 13542 #endif 13543 th = (struct tcphdr *)(ip + 1); 13544 tcpip_fillheaders(inp, 13545 #ifdef NETFLIX_TCPOUDP 13546 tp->t_port, 13547 #endif 13548 ip, th); 13549 } 13550 /* 13551 * Fill in fields, remembering maximum advertised window for use in 13552 * delaying messages about window sizes. If resending a FIN, be sure 13553 * not to use a new sequence number. 13554 */ 13555 if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 13556 tp->snd_nxt == tp->snd_max) 13557 tp->snd_nxt--; 13558 /* 13559 * If we are starting a connection, send ECN setup SYN packet. If we 13560 * are on a retransmit, we may resend those bits a number of times 13561 * as per RFC 3168. 13562 */ 13563 if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 1) { 13564 if (tp->t_rxtshift >= 1) { 13565 if (tp->t_rxtshift <= V_tcp_ecn_maxretries) 13566 flags |= TH_ECE | TH_CWR; 13567 } else 13568 flags |= TH_ECE | TH_CWR; 13569 } 13570 /* Handle parallel SYN for ECN */ 13571 if ((tp->t_state == TCPS_SYN_RECEIVED) && 13572 (tp->t_flags2 & TF2_ECN_SND_ECE)) { 13573 flags |= TH_ECE; 13574 tp->t_flags2 &= ~TF2_ECN_SND_ECE; 13575 } 13576 if (tp->t_state == TCPS_ESTABLISHED && 13577 (tp->t_flags2 & TF2_ECN_PERMIT)) { 13578 /* 13579 * If the peer has ECN, mark data packets with ECN capable 13580 * transmission (ECT). Ignore pure ack packets, 13581 * retransmissions. 
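 *
 * Concretely, the check below marks ECT0 only on new data
 * (len > 0, snd_nxt at or past snd_max, and not a SACK
 * retransmission); a pending CWR is echoed only on such
 * new-data segments, while a pending ECE is echoed on any
 * segment sent in this state.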
13582 */ 13583 if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && 13584 (sack_rxmit == 0)) { 13585 #ifdef INET6 13586 if (isipv6) 13587 ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); 13588 else 13589 #endif 13590 ip->ip_tos |= IPTOS_ECN_ECT0; 13591 KMOD_TCPSTAT_INC(tcps_ecn_ect0); 13592 /* 13593 * Reply with proper ECN notifications. 13594 * Only set CWR on new data segments. 13595 */ 13596 if (tp->t_flags2 & TF2_ECN_SND_CWR) { 13597 flags |= TH_CWR; 13598 tp->t_flags2 &= ~TF2_ECN_SND_CWR; 13599 } 13600 } 13601 if (tp->t_flags2 & TF2_ECN_SND_ECE) 13602 flags |= TH_ECE; 13603 } 13604 /* 13605 * If we are doing retransmissions, then snd_nxt will not reflect 13606 * the first unsent octet. For ACK only packets, we do not want the 13607 * sequence number of the retransmitted packet, we want the sequence 13608 * number of the next unsent octet. So, if there is no data (and no 13609 * SYN or FIN), use snd_max instead of snd_nxt when filling in 13610 * ti_seq. But if we are in persist state, snd_max might reflect 13611 * one byte beyond the right edge of the window, so use snd_nxt in 13612 * that case, since we know we aren't doing a retransmission. 13613 * (retransmit and persist are mutually exclusive...) 13614 */ 13615 if (sack_rxmit == 0) { 13616 if (len || (flags & (TH_SYN | TH_FIN)) || 13617 rack->rc_in_persist) { 13618 th->th_seq = htonl(tp->snd_nxt); 13619 rack_seq = tp->snd_nxt; 13620 } else if (flags & TH_RST) { 13621 /* 13622 * For a Reset send the last cum ack in sequence 13623 * (this like any other choice may still generate a 13624 * challenge ack, if a ack-update packet is in 13625 * flight). 13626 */ 13627 th->th_seq = htonl(tp->snd_una); 13628 rack_seq = tp->snd_una; 13629 } else { 13630 th->th_seq = htonl(tp->snd_max); 13631 rack_seq = tp->snd_max; 13632 } 13633 } else { 13634 th->th_seq = htonl(rsm->r_start); 13635 rack_seq = rsm->r_start; 13636 } 13637 th->th_ack = htonl(tp->rcv_nxt); 13638 if (optlen) { 13639 bcopy(opt, th + 1, optlen); 13640 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 13641 } 13642 th->th_flags = flags; 13643 /* 13644 * Calculate receive window. Don't shrink window, but avoid silly 13645 * window syndrome. 13646 * If a RST segment is sent, advertise a window of zero. 13647 */ 13648 if (flags & TH_RST) { 13649 recwin = 0; 13650 } else { 13651 if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && 13652 recwin < (long)segsiz) 13653 recwin = 0; 13654 if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && 13655 recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) 13656 recwin = (long)(tp->rcv_adv - tp->rcv_nxt); 13657 if (recwin > (long)TCP_MAXWIN << tp->rcv_scale) 13658 recwin = (long)TCP_MAXWIN << tp->rcv_scale; 13659 } 13660 13661 /* 13662 * According to RFC1323 the window field in a SYN (i.e., a <SYN> or 13663 * <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> case is 13664 * handled in syncache. 13665 */ 13666 if (flags & TH_SYN) 13667 th->th_win = htons((u_short) 13668 (min(sbspace(&so->so_rcv), TCP_MAXWIN))); 13669 else { 13670 /* Avoid shrinking window with window scaling. */ 13671 recwin = roundup2(recwin, 1 << tp->rcv_scale); 13672 th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); 13673 } 13674 /* 13675 * Adjust the RXWIN0SENT flag - indicate that we have advertised a 0 13676 * window. This may cause the remote transmitter to stall. This 13677 * flag tells soreceive() to disable delayed acknowledgements when 13678 * draining the buffer. 
This can occur if the receiver is 13679 * attempting to read more data than can be buffered prior to 13680 * transmitting on the connection. 13681 */ 13682 if (th->th_win == 0) { 13683 tp->t_sndzerowin++; 13684 tp->t_flags |= TF_RXWIN0SENT; 13685 } else 13686 tp->t_flags &= ~TF_RXWIN0SENT; 13687 tp->snd_up = tp->snd_una; /* drag it along, its deprecated */ 13688 13689 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) 13690 if (to.to_flags & TOF_SIGNATURE) { 13691 /* 13692 * Calculate MD5 signature and put it into the place 13693 * determined before. 13694 * NOTE: since TCP options buffer doesn't point into 13695 * mbuf's data, calculate offset and use it. 13696 */ 13697 if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, th, 13698 (u_char *)(th + 1) + (to.to_signature - opt)) != 0) { 13699 /* 13700 * Do not send segment if the calculation of MD5 13701 * digest has failed. 13702 */ 13703 goto out; 13704 } 13705 } 13706 #endif 13707 13708 /* 13709 * Put TCP length in extended header, and then checksum extended 13710 * header and data. 13711 */ 13712 m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ 13713 #ifdef INET6 13714 if (isipv6) { 13715 /* 13716 * ip6_plen is not need to be filled now, and will be filled 13717 * in ip6_output. 13718 */ 13719 if (tp->t_port) { 13720 m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; 13721 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 13722 udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); 13723 th->th_sum = htons(0); 13724 UDPSTAT_INC(udps_opackets); 13725 } else { 13726 m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 13727 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 13728 th->th_sum = in6_cksum_pseudo(ip6, 13729 sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 13730 0); 13731 } 13732 } 13733 #endif 13734 #if defined(INET6) && defined(INET) 13735 else 13736 #endif 13737 #ifdef INET 13738 { 13739 if (tp->t_port) { 13740 m->m_pkthdr.csum_flags = CSUM_UDP; 13741 m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); 13742 udp->uh_sum = in_pseudo(ip->ip_src.s_addr, 13743 ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); 13744 th->th_sum = htons(0); 13745 UDPSTAT_INC(udps_opackets); 13746 } else { 13747 m->m_pkthdr.csum_flags = CSUM_TCP; 13748 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 13749 th->th_sum = in_pseudo(ip->ip_src.s_addr, 13750 ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + 13751 IPPROTO_TCP + len + optlen)); 13752 } 13753 /* IP version must be set here for ipv4/ipv6 checking later */ 13754 KASSERT(ip->ip_v == IPVERSION, 13755 ("%s: IP version incorrect: %d", __func__, ip->ip_v)); 13756 } 13757 #endif 13758 /* 13759 * Enable TSO and specify the size of the segments. The TCP pseudo 13760 * header checksum is always provided. XXX: Fixme: This is currently 13761 * not the case for IPv6. 13762 */ 13763 if (tso || force_tso) { 13764 KASSERT(force_tso || len > tp->t_maxseg - optlen, 13765 ("%s: len <= tso_segsz", __func__)); 13766 m->m_pkthdr.csum_flags |= CSUM_TSO; 13767 m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; 13768 } 13769 KASSERT(len + hdrlen == m_length(m, NULL), 13770 ("%s: mbuf chain different than expected: %d + %u != %u", 13771 __func__, len, hdrlen, m_length(m, NULL))); 13772 13773 #ifdef TCP_HHOOK 13774 /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ 13775 hhook_run_tcp_est_out(tp, th, &to, len, tso); 13776 #endif 13777 #ifdef TCPDEBUG 13778 /* 13779 * Trace. 
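 * (With SO_DEBUG set, tcp_trace() records this segment in the TCPDEBUG
 * trace buffer; ih_len is temporarily overwritten with the full packet
 * length for that record and restored immediately afterwards.)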
13780 */ 13781 if (so->so_options & SO_DEBUG) { 13782 u_short save = 0; 13783 13784 #ifdef INET6 13785 if (!isipv6) 13786 #endif 13787 { 13788 save = ipov->ih_len; 13789 ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + 13790 * (th->th_off << 2) */ ); 13791 } 13792 tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); 13793 #ifdef INET6 13794 if (!isipv6) 13795 #endif 13796 ipov->ih_len = save; 13797 } 13798 #endif /* TCPDEBUG */ 13799 13800 /* We're getting ready to send; log now. */ 13801 if (tp->t_logstate != TCP_LOG_STATE_OFF) { 13802 union tcp_log_stackspecific log; 13803 struct timeval tv; 13804 13805 memset(&log.u_bbr, 0, sizeof(log.u_bbr)); 13806 log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; 13807 log.u_bbr.ininput = rack->rc_inp->inp_in_input; 13808 if (rack->rack_no_prr) 13809 log.u_bbr.flex1 = 0; 13810 else 13811 log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; 13812 log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; 13813 log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; 13814 log.u_bbr.flex4 = orig_len; 13815 if (filled_all) 13816 log.u_bbr.flex5 = 0x80000000; 13817 else 13818 log.u_bbr.flex5 = 0; 13819 /* Save off the early/late values */ 13820 log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; 13821 log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; 13822 log.u_bbr.bw_inuse = rack_get_bw(rack); 13823 if (rsm || sack_rxmit) { 13824 if (doing_tlp) 13825 log.u_bbr.flex8 = 2; 13826 else 13827 log.u_bbr.flex8 = 1; 13828 } else { 13829 log.u_bbr.flex8 = 0; 13830 } 13831 log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); 13832 log.u_bbr.flex7 = mark; 13833 log.u_bbr.pkts_out = tp->t_maxseg; 13834 log.u_bbr.timeStamp = tcp_get_usecs(&tv); 13835 log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); 13836 log.u_bbr.lt_epoch = cwnd_to_use; 13837 log.u_bbr.delivered = sendalot; 13838 lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, 13839 len, &log, false, NULL, NULL, 0, &tv); 13840 } else 13841 lgb = NULL; 13842 13843 /* 13844 * Fill in IP length and desired time to live and send to IP level. 13845 * There should be a better way to handle ttl and tos; we could keep 13846 * them in the template, but need a way to checksum without them. 13847 */ 13848 /* 13849 * m->m_pkthdr.len should have been set before cksum calcuration, 13850 * because in6_cksum() need it. 13851 */ 13852 #ifdef INET6 13853 if (isipv6) { 13854 /* 13855 * we separately set hoplimit for every segment, since the 13856 * user might want to change the value via setsockopt. Also, 13857 * desired default hop limit might be changed via Neighbor 13858 * Discovery. 13859 */ 13860 ip6->ip6_hlim = in6_selecthlim(inp, NULL); 13861 13862 /* 13863 * Set the packet size here for the benefit of DTrace 13864 * probes. ip6_output() will set it properly; it's supposed 13865 * to include the option header lengths as well. 13866 */ 13867 ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); 13868 13869 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) 13870 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 13871 else 13872 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 13873 13874 if (tp->t_state == TCPS_SYN_SENT) 13875 TCP_PROBE5(connect__request, NULL, tp, ip6, tp, th); 13876 13877 TCP_PROBE5(send, NULL, tp, ip6, tp, th); 13878 /* TODO: IPv6 IP6TOS_ECT bit on */ 13879 error = ip6_output(m, inp->in6p_outputopts, 13880 &inp->inp_route6, 13881 ((rsm || sack_rxmit) ? 
IP_NO_SND_TAG_RL : 0), 13882 NULL, NULL, inp); 13883 13884 if (error == EMSGSIZE && inp->inp_route6.ro_nh != NULL) 13885 mtu = inp->inp_route6.ro_nh->nh_mtu; 13886 } 13887 #endif /* INET6 */ 13888 #if defined(INET) && defined(INET6) 13889 else 13890 #endif 13891 #ifdef INET 13892 { 13893 ip->ip_len = htons(m->m_pkthdr.len); 13894 #ifdef INET6 13895 if (inp->inp_vflag & INP_IPV6PROTO) 13896 ip->ip_ttl = in6_selecthlim(inp, NULL); 13897 #endif /* INET6 */ 13898 /* 13899 * If we do path MTU discovery, then we set DF on every 13900 * packet. This might not be the best thing to do according 13901 * to RFC3390 Section 2. However the tcp hostcache migitates 13902 * the problem so it affects only the first tcp connection 13903 * with a host. 13904 * 13905 * NB: Don't set DF on small MTU/MSS to have a safe 13906 * fallback. 13907 */ 13908 if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { 13909 tp->t_flags2 |= TF2_PLPMTU_PMTUD; 13910 if (tp->t_port == 0 || len < V_tcp_minmss) { 13911 ip->ip_off |= htons(IP_DF); 13912 } 13913 } else { 13914 tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; 13915 } 13916 13917 if (tp->t_state == TCPS_SYN_SENT) 13918 TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); 13919 13920 TCP_PROBE5(send, NULL, tp, ip, tp, th); 13921 13922 error = ip_output(m, inp->inp_options, &inp->inp_route, 13923 ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0, 13924 inp); 13925 if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL) 13926 mtu = inp->inp_route.ro_nh->nh_mtu; 13927 } 13928 #endif /* INET */ 13929 13930 out: 13931 if (lgb) { 13932 lgb->tlb_errno = error; 13933 lgb = NULL; 13934 } 13935 /* 13936 * In transmit state, time the transmission and arrange for the 13937 * retransmit. In persist state, just set snd_max. 13938 */ 13939 if (error == 0) { 13940 rack->forced_ack = 0; /* If we send something zap the FA flag */ 13941 if (rsm && (doing_tlp == 0)) { 13942 /* Set we retransmitted */ 13943 rack->rc_gp_saw_rec = 1; 13944 } else { 13945 if (cwnd_to_use > tp->snd_ssthresh) { 13946 /* Set we sent in CA */ 13947 rack->rc_gp_saw_ca = 1; 13948 } else { 13949 /* Set we sent in SS */ 13950 rack->rc_gp_saw_ss = 1; 13951 } 13952 } 13953 if (TCPS_HAVEESTABLISHED(tp->t_state) && 13954 (tp->t_flags & TF_SACK_PERMIT) && 13955 tp->rcv_numsacks > 0) 13956 tcp_clean_dsack_blocks(tp); 13957 tot_len_this_send += len; 13958 if (len == 0) 13959 counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); 13960 else if (len == 1) { 13961 counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1); 13962 } else if (len > 1) { 13963 int idx; 13964 13965 idx = (len / segsiz) + 3; 13966 if (idx >= TCP_MSS_ACCT_ATIMER) 13967 counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); 13968 else 13969 counter_u64_add(rack_out_size[idx], 1); 13970 } 13971 if (hw_tls && len > 0) { 13972 if (filled_all) { 13973 counter_u64_add(rack_tls_filled, 1); 13974 rack_log_type_hrdwtso(tp, rack, len, 0, orig_len, 1); 13975 } else { 13976 if (rsm) { 13977 counter_u64_add(rack_tls_rxt, 1); 13978 rack_log_type_hrdwtso(tp, rack, len, 2, orig_len, 1); 13979 } else if (doing_tlp) { 13980 counter_u64_add(rack_tls_tlp, 1); 13981 rack_log_type_hrdwtso(tp, rack, len, 3, orig_len, 1); 13982 } else if ( (ctf_outstanding(tp) + minseg) > sbavail(sb)) { 13983 counter_u64_add(rack_tls_app, 1); 13984 rack_log_type_hrdwtso(tp, rack, len, 4, orig_len, 1); 13985 } else if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) + minseg) > cwnd_to_use) { 13986 counter_u64_add(rack_tls_cwnd, 1); 13987 rack_log_type_hrdwtso(tp, rack, len, 5, orig_len, 1); 13988 } else 
if ((ctf_outstanding(tp) + minseg) > tp->snd_wnd) { 13989 counter_u64_add(rack_tls_rwnd, 1); 13990 rack_log_type_hrdwtso(tp, rack, len, 6, orig_len, 1); 13991 } else { 13992 rack_log_type_hrdwtso(tp, rack, len, 7, orig_len, 1); 13993 counter_u64_add(rack_tls_other, 1); 13994 } 13995 } 13996 } 13997 } 13998 if (rack->rack_no_prr == 0) { 13999 if (sub_from_prr && (error == 0)) { 14000 if (rack->r_ctl.rc_prr_sndcnt >= len) 14001 rack->r_ctl.rc_prr_sndcnt -= len; 14002 else 14003 rack->r_ctl.rc_prr_sndcnt = 0; 14004 } 14005 } 14006 sub_from_prr = 0; 14007 rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts, 14008 pass, rsm, us_cts); 14009 if ((error == 0) && 14010 (len > 0) && 14011 (tp->snd_una == tp->snd_max)) 14012 rack->r_ctl.rc_tlp_rxt_last_time = cts; 14013 /* Now are we in persists? */ 14014 if (rack->rc_in_persist == 0) { 14015 tcp_seq startseq = tp->snd_nxt; 14016 14017 /* Track our lost count */ 14018 if (rsm && (doing_tlp == 0)) 14019 rack->r_ctl.rc_loss_count += rsm->r_end - rsm->r_start; 14020 /* 14021 * Advance snd_nxt over sequence space of this segment. 14022 */ 14023 if (error) 14024 /* We don't log or do anything with errors */ 14025 goto nomore; 14026 if (doing_tlp == 0) { 14027 if (rsm == NULL) { 14028 /* 14029 * Not a retransmission of some 14030 * sort, new data is going out so 14031 * clear our TLP count and flag. 14032 */ 14033 rack->rc_tlp_in_progress = 0; 14034 rack->r_ctl.rc_tlp_cnt_out = 0; 14035 } 14036 } else { 14037 /* 14038 * We have just sent a TLP, mark that it is true 14039 * and make sure our in progress is set so we 14040 * continue to check the count. 14041 */ 14042 rack->rc_tlp_in_progress = 1; 14043 rack->r_ctl.rc_tlp_cnt_out++; 14044 } 14045 if (flags & (TH_SYN | TH_FIN)) { 14046 if (flags & TH_SYN) 14047 tp->snd_nxt++; 14048 if (flags & TH_FIN) { 14049 tp->snd_nxt++; 14050 tp->t_flags |= TF_SENTFIN; 14051 } 14052 } 14053 /* In the ENOBUFS case we do *not* update snd_max */ 14054 if (sack_rxmit) 14055 goto nomore; 14056 14057 tp->snd_nxt += len; 14058 if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { 14059 if (tp->snd_una == tp->snd_max) { 14060 /* 14061 * Update the time we just added data since 14062 * none was outstanding. 14063 */ 14064 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 14065 tp->t_acktime = ticks; 14066 } 14067 tp->snd_max = tp->snd_nxt; 14068 /* 14069 * Time this transmission if not a retransmission and 14070 * not currently timing anything. 14071 * This is only relevant in case of switching back to 14072 * the base stack. 14073 */ 14074 if (tp->t_rtttime == 0) { 14075 tp->t_rtttime = ticks; 14076 tp->t_rtseq = startseq; 14077 KMOD_TCPSTAT_INC(tcps_segstimed); 14078 } 14079 if (len && 14080 ((tp->t_flags & TF_GPUTINPROG) == 0)) 14081 rack_start_gp_measurement(tp, rack, startseq, sb_offset); 14082 } 14083 } else { 14084 /* 14085 * Persist case, update snd_max but since we are in persist 14086 * mode (no window) we do not update snd_nxt. 14087 */ 14088 int32_t xlen = len; 14089 14090 if (error) 14091 goto nomore; 14092 14093 if (flags & TH_SYN) 14094 ++xlen; 14095 if (flags & TH_FIN) { 14096 ++xlen; 14097 tp->t_flags |= TF_SENTFIN; 14098 } 14099 /* In the ENOBUFS case we do *not* update snd_max */ 14100 if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) { 14101 if (tp->snd_una == tp->snd_max) { 14102 /* 14103 * Update the time we just added data since 14104 * none was outstanding. 
14105 */ 14106 rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); 14107 tp->t_acktime = ticks; 14108 } 14109 tp->snd_max = tp->snd_nxt + len; 14110 } 14111 } 14112 nomore: 14113 if (error) { 14114 rack->r_ctl.rc_agg_delayed = 0; 14115 rack->r_early = 0; 14116 rack->r_late = 0; 14117 rack->r_ctl.rc_agg_early = 0; 14118 SOCKBUF_UNLOCK_ASSERT(sb); /* Check gotos. */ 14119 /* 14120 * Failures do not advance the seq counter above. For the 14121 * case of ENOBUFS we will fall out and retry in 1ms with 14122 * the hpts. Everything else will just have to retransmit 14123 * with the timer. 14124 * 14125 * In any case, we do not want to loop around for another 14126 * send without a good reason. 14127 */ 14128 sendalot = 0; 14129 switch (error) { 14130 case EPERM: 14131 tp->t_softerror = error; 14132 return (error); 14133 case ENOBUFS: 14134 if (slot == 0) { 14135 /* 14136 * Pace us right away to retry in a some 14137 * time 14138 */ 14139 slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); 14140 if (rack->rc_enobuf < 126) 14141 rack->rc_enobuf++; 14142 if (slot > ((rack->rc_rack_rtt / 2) * HPTS_USEC_IN_MSEC)) { 14143 slot = (rack->rc_rack_rtt / 2) * HPTS_USEC_IN_MSEC; 14144 } 14145 if (slot < (10 * HPTS_USEC_IN_MSEC)) 14146 slot = 10 * HPTS_USEC_IN_MSEC; 14147 } 14148 counter_u64_add(rack_saw_enobuf, 1); 14149 error = 0; 14150 goto enobufs; 14151 case EMSGSIZE: 14152 /* 14153 * For some reason the interface we used initially 14154 * to send segments changed to another or lowered 14155 * its MTU. If TSO was active we either got an 14156 * interface without TSO capabilits or TSO was 14157 * turned off. If we obtained mtu from ip_output() 14158 * then update it and try again. 14159 */ 14160 if (tso) 14161 tp->t_flags &= ~TF_TSO; 14162 if (mtu != 0) { 14163 tcp_mss_update(tp, -1, mtu, NULL, NULL); 14164 goto again; 14165 } 14166 slot = 10 * HPTS_USEC_IN_MSEC; 14167 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); 14168 return (error); 14169 case ENETUNREACH: 14170 counter_u64_add(rack_saw_enetunreach, 1); 14171 case EHOSTDOWN: 14172 case EHOSTUNREACH: 14173 case ENETDOWN: 14174 if (TCPS_HAVERCVDSYN(tp->t_state)) { 14175 tp->t_softerror = error; 14176 } 14177 /* FALLTHROUGH */ 14178 default: 14179 slot = 10 * HPTS_USEC_IN_MSEC; 14180 rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); 14181 return (error); 14182 } 14183 } else { 14184 rack->rc_enobuf = 0; 14185 } 14186 KMOD_TCPSTAT_INC(tcps_sndtotal); 14187 14188 /* 14189 * Data sent (as far as we can tell). If this advertises a larger 14190 * window than any other segment, then remember the size of the 14191 * advertised window. Any pending ACK has now been sent. 14192 */ 14193 if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) 14194 tp->rcv_adv = tp->rcv_nxt + recwin; 14195 tp->last_ack_sent = tp->rcv_nxt; 14196 tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); 14197 enobufs: 14198 /* Assure when we leave that snd_nxt will point to top */ 14199 if (SEQ_GT(tp->snd_max, tp->snd_nxt)) 14200 tp->snd_nxt = tp->snd_max; 14201 if (sendalot) { 14202 /* Do we need to turn off sendalot? */ 14203 if (rack->r_ctl.rc_pace_max_segs && 14204 (tot_len_this_send >= rack->r_ctl.rc_pace_max_segs)) { 14205 /* We hit our max. 
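Once tot_len_this_send has reached rc_pace_max_segs we stop the send loop here; the hpts timer armed at the end of this routine is what brings us back into rack_output() to pace out whatever is still queued.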
*/ 14206 sendalot = 0; 14207 } else if ((rack->rc_user_set_max_segs) && 14208 (tot_len_this_send >= (rack->rc_user_set_max_segs * segsiz))) { 14209 /* We hit the user defined max */ 14210 sendalot = 0; 14211 } 14212 } 14213 if ((error == 0) && (flags & TH_FIN)) 14214 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN); 14215 if (flags & TH_RST) { 14216 /* 14217 * We don't send again after sending a RST. 14218 */ 14219 slot = 0; 14220 sendalot = 0; 14221 if (error == 0) 14222 tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); 14223 } else if ((slot == 0) && (sendalot == 0) && tot_len_this_send) { 14224 /* 14225 * Get our pacing rate; if an error 14226 * occurred in sending (ENOBUFS) we would 14227 * hit the else if with slot preset. Other 14228 * errors return. 14229 */ 14230 slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz); 14231 } 14232 if (rsm && 14233 rack->use_rack_rr) { 14234 /* It's a retransmit and we use the rack cheat? */ 14235 if ((slot == 0) || 14236 (rack->rc_always_pace == 0) || 14237 (rack->r_rr_config == 1)) { 14238 /* 14239 * We have no pacing set or we 14240 * are using old-style rack or 14241 * we are overridden to use the old 1ms pacing. 14242 */ 14243 slot = rack->r_ctl.rc_min_to * HPTS_USEC_IN_MSEC; 14244 } 14245 } 14246 if (slot) { 14247 /* set the rack tcb into the slot N */ 14248 counter_u64_add(rack_paced_segments, 1); 14249 } else if (sendalot) { 14250 if (len) 14251 counter_u64_add(rack_unpaced_segments, 1); 14252 sack_rxmit = 0; 14253 goto again; 14254 } else if (len) { 14255 counter_u64_add(rack_unpaced_segments, 1); 14256 } 14257 rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0); 14258 return (error); 14259 } 14260 14261 static void 14262 rack_update_seg(struct tcp_rack *rack) 14263 { 14264 uint32_t orig_val; 14265 14266 orig_val = rack->r_ctl.rc_pace_max_segs; 14267 rack_set_pace_segments(rack->rc_tp, rack, __LINE__); 14268 if (orig_val != rack->r_ctl.rc_pace_max_segs) 14269 rack_log_pacing_delay_calc(rack, 0, 0, orig_val, 0, 0, 15, __LINE__, NULL); 14270 } 14271 14272 /* 14273 * rack_ctloutput() must drop the inpcb lock before performing copyin on 14274 * socket option arguments. When it re-acquires the lock after the copy, it 14275 * has to revalidate that the connection is still valid for the socket 14276 * option.
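 *
 * The sequence rack_set_sockopt() uses below is roughly (a sketch of the
 * code that follows, not a separate API):
 *
 *	INP_WUNLOCK(inp);
 *	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
 *	INP_WLOCK(inp);
 *	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 *		INP_WUNLOCK(inp);
 *		return (ECONNRESET);
 *	}
 *	tp = intotcpcb(inp);	/* re-fetch state after reacquiring */
 *
 * From userland these remain ordinary TCP-level socket options, e.g.
 * setsockopt(s, IPPROTO_TCP, TCP_RACK_PROFILE, &one, sizeof(one)) once
 * the connection has been handed off to this stack.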
14277 */ 14278 static int 14279 rack_set_sockopt(struct socket *so, struct sockopt *sopt, 14280 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) 14281 { 14282 struct epoch_tracker et; 14283 uint64_t val; 14284 int32_t error = 0, optval; 14285 uint16_t ca, ss; 14286 14287 14288 switch (sopt->sopt_name) { 14289 case TCP_RACK_PROP_RATE: /* URL:prop_rate */ 14290 case TCP_RACK_PROP : /* URL:prop */ 14291 case TCP_RACK_TLP_REDUCE: /* URL:tlp_reduce */ 14292 case TCP_RACK_EARLY_RECOV: /* URL:early_recov */ 14293 case TCP_RACK_PACE_REDUCE: /* Not used */ 14294 /* Pacing related ones */ 14295 case TCP_RACK_PACE_ALWAYS: /* URL:pace_always */ 14296 case TCP_BBR_RACK_INIT_RATE: /* URL:irate */ 14297 case TCP_BBR_IWINTSO: /* URL:tso_iwin */ 14298 case TCP_RACK_PACE_MAX_SEG: /* URL:pace_max_seg */ 14299 case TCP_RACK_FORCE_MSEG: /* URL:force_max_seg */ 14300 case TCP_RACK_PACE_RATE_CA: /* URL:pr_ca */ 14301 case TCP_RACK_PACE_RATE_SS: /* URL:pr_ss*/ 14302 case TCP_RACK_PACE_RATE_REC: /* URL:pr_rec */ 14303 case TCP_RACK_GP_INCREASE_CA: /* URL:gp_inc_ca */ 14304 case TCP_RACK_GP_INCREASE_SS: /* URL:gp_inc_ss */ 14305 case TCP_RACK_GP_INCREASE_REC: /* URL:gp_inc_rec */ 14306 case TCP_RACK_RR_CONF: /* URL:rrr_conf */ 14307 case TCP_BBR_HDWR_PACE: /* URL:hdwrpace */ 14308 /* End pacing related */ 14309 case TCP_DELACK: 14310 case TCP_RACK_PRR_SENDALOT: /* URL:prr_sendalot */ 14311 case TCP_RACK_MIN_TO: /* URL:min_to */ 14312 case TCP_RACK_EARLY_SEG: /* URL:early_seg */ 14313 case TCP_RACK_REORD_THRESH: /* URL:reord_thresh */ 14314 case TCP_RACK_REORD_FADE: /* URL:reord_fade */ 14315 case TCP_RACK_TLP_THRESH: /* URL:tlp_thresh */ 14316 case TCP_RACK_PKT_DELAY: /* URL:pkt_delay */ 14317 case TCP_RACK_TLP_USE: /* URL:tlp_use */ 14318 case TCP_RACK_TLP_INC_VAR: /* URL:tlp_inc_var */ 14319 case TCP_RACK_IDLE_REDUCE_HIGH: /* URL:idle_reduce_high */ 14320 case TCP_BBR_RACK_RTT_USE: /* URL:rttuse */ 14321 case TCP_BBR_USE_RACK_RR: /* URL:rackrr */ 14322 case TCP_RACK_DO_DETECTION: /* URL:detect */ 14323 case TCP_NO_PRR: /* URL:noprr */ 14324 case TCP_TIMELY_DYN_ADJ: /* URL:dynamic */ 14325 case TCP_DATA_AFTER_CLOSE: 14326 case TCP_RACK_NONRXT_CFG_RATE: /* URL:nonrxtcr */ 14327 case TCP_SHARED_CWND_ENABLE: /* URL:scwnd */ 14328 case TCP_RACK_MBUF_QUEUE: /* URL:mqueue */ 14329 case TCP_RACK_NO_PUSH_AT_MAX: /* URL:npush */ 14330 case TCP_RACK_PACE_TO_FILL: /* URL:fillcw */ 14331 case TCP_SHARED_CWND_TIME_LIMIT: /* URL:lscwnd */ 14332 case TCP_RACK_PROFILE: /* URL:profile */ 14333 break; 14334 default: 14335 return (tcp_default_ctloutput(so, sopt, inp, tp)); 14336 break; 14337 } 14338 INP_WUNLOCK(inp); 14339 error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); 14340 if (error) 14341 return (error); 14342 INP_WLOCK(inp); 14343 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 14344 INP_WUNLOCK(inp); 14345 return (ECONNRESET); 14346 } 14347 tp = intotcpcb(inp); 14348 rack = (struct tcp_rack *)tp->t_fb_ptr; 14349 switch (sopt->sopt_name) { 14350 case TCP_RACK_PROFILE: 14351 RACK_OPTS_INC(tcp_profile); 14352 if (optval == 1) { 14353 /* pace_always=1 */ 14354 rack->rc_always_pace = 1; 14355 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 14356 /* scwnd=1 */ 14357 rack->rack_enable_scwnd = 1; 14358 /* dynamic=100 */ 14359 rack->rc_gp_dyn_mul = 1; 14360 rack->r_ctl.rack_per_of_gp_ca = 100; 14361 /* rrr_conf=3 */ 14362 rack->r_rr_config = 3; 14363 /* npush=2 */ 14364 rack->r_ctl.rc_no_push_at_mrtt = 2; 14365 /* fillcw=1 */ 14366 rack->rc_pace_to_cwnd = 1; 14367 rack->rc_pace_fill_if_rttin_range = 
0; 14368 rack->rtt_limit_mul = 0; 14369 /* noprr=1 */ 14370 rack->rack_no_prr = 1; 14371 /* lscwnd=1 */ 14372 rack->r_limit_scw = 1; 14373 } else if (optval == 2) { 14374 /* pace_always=1 */ 14375 rack->rc_always_pace = 1; 14376 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 14377 /* scwnd=1 */ 14378 rack->rack_enable_scwnd = 1; 14379 /* dynamic=100 */ 14380 rack->rc_gp_dyn_mul = 1; 14381 rack->r_ctl.rack_per_of_gp_ca = 100; 14382 /* rrr_conf=3 */ 14383 rack->r_rr_config = 3; 14384 /* npush=2 */ 14385 rack->r_ctl.rc_no_push_at_mrtt = 2; 14386 /* fillcw=1 */ 14387 rack->rc_pace_to_cwnd = 1; 14388 rack->rc_pace_fill_if_rttin_range = 0; 14389 rack->rtt_limit_mul = 0; 14390 /* noprr=1 */ 14391 rack->rack_no_prr = 1; 14392 /* lscwnd=0 */ 14393 rack->r_limit_scw = 0; 14394 } 14395 break; 14396 case TCP_SHARED_CWND_TIME_LIMIT: 14397 RACK_OPTS_INC(tcp_lscwnd); 14398 if (optval) 14399 rack->r_limit_scw = 1; 14400 else 14401 rack->r_limit_scw = 0; 14402 break; 14403 case TCP_RACK_PACE_TO_FILL: 14404 RACK_OPTS_INC(tcp_fillcw); 14405 if (optval == 0) 14406 rack->rc_pace_to_cwnd = 0; 14407 else 14408 rack->rc_pace_to_cwnd = 1; 14409 if ((optval >= rack_gp_rtt_maxmul) && 14410 rack_gp_rtt_maxmul && 14411 (optval < 0xf)) { 14412 rack->rc_pace_fill_if_rttin_range = 1; 14413 rack->rtt_limit_mul = optval; 14414 } else { 14415 rack->rc_pace_fill_if_rttin_range = 0; 14416 rack->rtt_limit_mul = 0; 14417 } 14418 break; 14419 case TCP_RACK_NO_PUSH_AT_MAX: 14420 RACK_OPTS_INC(tcp_npush); 14421 if (optval == 0) 14422 rack->r_ctl.rc_no_push_at_mrtt = 0; 14423 else if (optval < 0xff) 14424 rack->r_ctl.rc_no_push_at_mrtt = optval; 14425 else 14426 error = EINVAL; 14427 break; 14428 case TCP_SHARED_CWND_ENABLE: 14429 RACK_OPTS_INC(tcp_rack_scwnd); 14430 if (optval == 0) 14431 rack->rack_enable_scwnd = 0; 14432 else 14433 rack->rack_enable_scwnd = 1; 14434 break; 14435 case TCP_RACK_MBUF_QUEUE: 14436 /* Now do we use the LRO mbuf-queue feature */ 14437 RACK_OPTS_INC(tcp_rack_mbufq); 14438 if (optval) 14439 rack->r_mbuf_queue = 1; 14440 else 14441 rack->r_mbuf_queue = 0; 14442 if (rack->r_mbuf_queue || rack->rc_always_pace) 14443 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 14444 else 14445 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 14446 break; 14447 case TCP_RACK_NONRXT_CFG_RATE: 14448 RACK_OPTS_INC(tcp_rack_cfg_rate); 14449 if (optval == 0) 14450 rack->rack_rec_nonrxt_use_cr = 0; 14451 else 14452 rack->rack_rec_nonrxt_use_cr = 1; 14453 break; 14454 case TCP_NO_PRR: 14455 RACK_OPTS_INC(tcp_rack_noprr); 14456 if (optval == 0) 14457 rack->rack_no_prr = 0; 14458 else 14459 rack->rack_no_prr = 1; 14460 break; 14461 case TCP_TIMELY_DYN_ADJ: 14462 RACK_OPTS_INC(tcp_timely_dyn); 14463 if (optval == 0) 14464 rack->rc_gp_dyn_mul = 0; 14465 else { 14466 rack->rc_gp_dyn_mul = 1; 14467 if (optval >= 100) { 14468 /* 14469 * If the user sets something 100 or more 14470 * its the gp_ca value. 
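 * Values below 100 simply enable the dynamic (Timely-style) multiplier
 * with its defaults; a value of 100 or more is additionally taken as the
 * CA percent-of-goodput gain, so e.g. 150 would presumably pace
 * congestion avoidance at about 150% of the measured goodput rate.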
14471 */ 14472 rack->r_ctl.rack_per_of_gp_ca = optval; 14473 } 14474 } 14475 break; 14476 case TCP_RACK_DO_DETECTION: 14477 RACK_OPTS_INC(tcp_rack_do_detection); 14478 if (optval == 0) 14479 rack->do_detection = 0; 14480 else 14481 rack->do_detection = 1; 14482 break; 14483 case TCP_RACK_PROP_RATE: 14484 if ((optval <= 0) || (optval >= 100)) { 14485 error = EINVAL; 14486 break; 14487 } 14488 RACK_OPTS_INC(tcp_rack_prop_rate); 14489 rack->r_ctl.rc_prop_rate = optval; 14490 break; 14491 case TCP_RACK_TLP_USE: 14492 if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) { 14493 error = EINVAL; 14494 break; 14495 } 14496 RACK_OPTS_INC(tcp_tlp_use); 14497 rack->rack_tlp_threshold_use = optval; 14498 break; 14499 case TCP_RACK_PROP: 14500 /* RACK proportional rate reduction (bool) */ 14501 RACK_OPTS_INC(tcp_rack_prop); 14502 rack->r_ctl.rc_prop_reduce = optval; 14503 break; 14504 case TCP_RACK_TLP_REDUCE: 14505 /* RACK TLP cwnd reduction (bool) */ 14506 RACK_OPTS_INC(tcp_rack_tlp_reduce); 14507 rack->r_ctl.rc_tlp_cwnd_reduce = optval; 14508 break; 14509 case TCP_RACK_EARLY_RECOV: 14510 /* Should recovery happen early (bool) */ 14511 RACK_OPTS_INC(tcp_rack_early_recov); 14512 rack->r_ctl.rc_early_recovery = optval; 14513 break; 14514 14515 /* Pacing related ones */ 14516 case TCP_RACK_PACE_ALWAYS: 14517 /* 14518 * zero is old rack method, 1 is new 14519 * method using a pacing rate. 14520 */ 14521 RACK_OPTS_INC(tcp_rack_pace_always); 14522 if (optval > 0) 14523 rack->rc_always_pace = 1; 14524 else 14525 rack->rc_always_pace = 0; 14526 if (rack->r_mbuf_queue || rack->rc_always_pace) 14527 tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; 14528 else 14529 tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; 14530 /* A rate may be set irate or other, if so set seg size */ 14531 rack_update_seg(rack); 14532 break; 14533 case TCP_BBR_RACK_INIT_RATE: 14534 RACK_OPTS_INC(tcp_initial_rate); 14535 val = optval; 14536 /* Change from kbits per second to bytes per second */ 14537 val *= 1000; 14538 val /= 8; 14539 rack->r_ctl.init_rate = val; 14540 if (rack->rc_init_win != rack_default_init_window) { 14541 uint32_t win, snt; 14542 14543 /* 14544 * Options don't always get applied 14545 * in the order you think. So in order 14546 * to assure we update a cwnd we need 14547 * to check and see if we are still 14548 * where we should raise the cwnd. 14549 */ 14550 win = rc_init_window(rack); 14551 if (SEQ_GT(tp->snd_max, tp->iss)) 14552 snt = tp->snd_max - tp->iss; 14553 else 14554 snt = 0; 14555 if ((snt < win) && 14556 (tp->snd_cwnd < win)) 14557 tp->snd_cwnd = win; 14558 } 14559 if (rack->rc_always_pace) 14560 rack_update_seg(rack); 14561 break; 14562 case TCP_BBR_IWINTSO: 14563 RACK_OPTS_INC(tcp_initial_win); 14564 if (optval && (optval <= 0xff)) { 14565 uint32_t win, snt; 14566 14567 rack->rc_init_win = optval; 14568 win = rc_init_window(rack); 14569 if (SEQ_GT(tp->snd_max, tp->iss)) 14570 snt = tp->snd_max - tp->iss; 14571 else 14572 snt = 0; 14573 if ((snt < win) && 14574 (tp->t_srtt | 14575 #ifdef NETFLIX_PEAKRATE 14576 tp->t_maxpeakrate | 14577 #endif 14578 rack->r_ctl.init_rate)) { 14579 /* 14580 * We are not past the initial window 14581 * and we have some bases for pacing, 14582 * so we need to possibly adjust up 14583 * the cwnd. Note even if we don't set 14584 * the cwnd, its still ok to raise the rc_init_win 14585 * which can be used coming out of idle when we 14586 * would have a rate. 
14587 */ 14588 if (tp->snd_cwnd < win) 14589 tp->snd_cwnd = win; 14590 } 14591 if (rack->rc_always_pace) 14592 rack_update_seg(rack); 14593 } else 14594 error = EINVAL; 14595 break; 14596 case TCP_RACK_FORCE_MSEG: 14597 RACK_OPTS_INC(tcp_rack_force_max_seg); 14598 if (optval) 14599 rack->rc_force_max_seg = 1; 14600 else 14601 rack->rc_force_max_seg = 0; 14602 break; 14603 case TCP_RACK_PACE_MAX_SEG: 14604 /* Max segments size in a pace in bytes */ 14605 RACK_OPTS_INC(tcp_rack_max_seg); 14606 rack->rc_user_set_max_segs = optval; 14607 rack_set_pace_segments(tp, rack, __LINE__); 14608 break; 14609 case TCP_RACK_PACE_RATE_REC: 14610 /* Set the fixed pacing rate in Bytes per second ca */ 14611 RACK_OPTS_INC(tcp_rack_pace_rate_rec); 14612 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 14613 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) 14614 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 14615 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) 14616 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 14617 rack->use_fixed_rate = 1; 14618 rack_log_pacing_delay_calc(rack, 14619 rack->r_ctl.rc_fixed_pacing_rate_ss, 14620 rack->r_ctl.rc_fixed_pacing_rate_ca, 14621 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 14622 __LINE__, NULL); 14623 break; 14624 14625 case TCP_RACK_PACE_RATE_SS: 14626 /* Set the fixed pacing rate in Bytes per second ca */ 14627 RACK_OPTS_INC(tcp_rack_pace_rate_ss); 14628 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 14629 if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) 14630 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 14631 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) 14632 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 14633 rack->use_fixed_rate = 1; 14634 rack_log_pacing_delay_calc(rack, 14635 rack->r_ctl.rc_fixed_pacing_rate_ss, 14636 rack->r_ctl.rc_fixed_pacing_rate_ca, 14637 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 14638 __LINE__, NULL); 14639 break; 14640 14641 case TCP_RACK_PACE_RATE_CA: 14642 /* Set the fixed pacing rate in Bytes per second ca */ 14643 RACK_OPTS_INC(tcp_rack_pace_rate_ca); 14644 rack->r_ctl.rc_fixed_pacing_rate_ca = optval; 14645 if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) 14646 rack->r_ctl.rc_fixed_pacing_rate_ss = optval; 14647 if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) 14648 rack->r_ctl.rc_fixed_pacing_rate_rec = optval; 14649 rack->use_fixed_rate = 1; 14650 rack_log_pacing_delay_calc(rack, 14651 rack->r_ctl.rc_fixed_pacing_rate_ss, 14652 rack->r_ctl.rc_fixed_pacing_rate_ca, 14653 rack->r_ctl.rc_fixed_pacing_rate_rec, 0, 0, 8, 14654 __LINE__, NULL); 14655 break; 14656 case TCP_RACK_GP_INCREASE_REC: 14657 RACK_OPTS_INC(tcp_gp_inc_rec); 14658 rack->r_ctl.rack_per_of_gp_rec = optval; 14659 rack_log_pacing_delay_calc(rack, 14660 rack->r_ctl.rack_per_of_gp_ss, 14661 rack->r_ctl.rack_per_of_gp_ca, 14662 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 14663 __LINE__, NULL); 14664 break; 14665 case TCP_RACK_GP_INCREASE_CA: 14666 RACK_OPTS_INC(tcp_gp_inc_ca); 14667 ca = optval; 14668 if (ca < 100) { 14669 /* 14670 * We don't allow any reduction 14671 * over the GP b/w. 14672 */ 14673 error = EINVAL; 14674 break; 14675 } 14676 rack->r_ctl.rack_per_of_gp_ca = ca; 14677 rack_log_pacing_delay_calc(rack, 14678 rack->r_ctl.rack_per_of_gp_ss, 14679 rack->r_ctl.rack_per_of_gp_ca, 14680 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 14681 __LINE__, NULL); 14682 break; 14683 case TCP_RACK_GP_INCREASE_SS: 14684 RACK_OPTS_INC(tcp_gp_inc_ss); 14685 ss = optval; 14686 if (ss < 100) { 14687 /* 14688 * We don't allow any reduction 14689 * over the GP b/w. 
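 * These gains are a percent of the goodput-based bandwidth estimate;
 * 100 means "pace at exactly the measured rate", so anything smaller
 * would deliberately pace below goodput and is rejected with EINVAL.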
14690 */ 14691 error = EINVAL; 14692 break; 14693 } 14694 rack->r_ctl.rack_per_of_gp_ss = ss; 14695 rack_log_pacing_delay_calc(rack, 14696 rack->r_ctl.rack_per_of_gp_ss, 14697 rack->r_ctl.rack_per_of_gp_ca, 14698 rack->r_ctl.rack_per_of_gp_rec, 0, 0, 1, 14699 __LINE__, NULL); 14700 break; 14701 case TCP_RACK_RR_CONF: 14702 RACK_OPTS_INC(tcp_rack_rrr_no_conf_rate); 14703 if (optval && optval <= 3) 14704 rack->r_rr_config = optval; 14705 else 14706 rack->r_rr_config = 0; 14707 break; 14708 case TCP_BBR_HDWR_PACE: 14709 RACK_OPTS_INC(tcp_hdwr_pacing); 14710 if (optval){ 14711 if (rack->rack_hdrw_pacing == 0) { 14712 rack->rack_hdw_pace_ena = 1; 14713 rack->rack_attempt_hdwr_pace = 0; 14714 } else 14715 error = EALREADY; 14716 } else { 14717 rack->rack_hdw_pace_ena = 0; 14718 #ifdef RATELIMIT 14719 if (rack->rack_hdrw_pacing) { 14720 rack->rack_hdrw_pacing = 0; 14721 in_pcbdetach_txrtlmt(rack->rc_inp); 14722 } 14723 #endif 14724 } 14725 break; 14726 /* End Pacing related ones */ 14727 case TCP_RACK_PRR_SENDALOT: 14728 /* Allow PRR to send more than one seg */ 14729 RACK_OPTS_INC(tcp_rack_prr_sendalot); 14730 rack->r_ctl.rc_prr_sendalot = optval; 14731 break; 14732 case TCP_RACK_MIN_TO: 14733 /* Minimum time between rack t-o's in ms */ 14734 RACK_OPTS_INC(tcp_rack_min_to); 14735 rack->r_ctl.rc_min_to = optval; 14736 break; 14737 case TCP_RACK_EARLY_SEG: 14738 /* If early recovery max segments */ 14739 RACK_OPTS_INC(tcp_rack_early_seg); 14740 rack->r_ctl.rc_early_recovery_segs = optval; 14741 break; 14742 case TCP_RACK_REORD_THRESH: 14743 /* RACK reorder threshold (shift amount) */ 14744 RACK_OPTS_INC(tcp_rack_reord_thresh); 14745 if ((optval > 0) && (optval < 31)) 14746 rack->r_ctl.rc_reorder_shift = optval; 14747 else 14748 error = EINVAL; 14749 break; 14750 case TCP_RACK_REORD_FADE: 14751 /* Does reordering fade after ms time */ 14752 RACK_OPTS_INC(tcp_rack_reord_fade); 14753 rack->r_ctl.rc_reorder_fade = optval; 14754 break; 14755 case TCP_RACK_TLP_THRESH: 14756 /* RACK TLP theshold i.e. srtt+(srtt/N) */ 14757 RACK_OPTS_INC(tcp_rack_tlp_thresh); 14758 if (optval) 14759 rack->r_ctl.rc_tlp_threshold = optval; 14760 else 14761 error = EINVAL; 14762 break; 14763 case TCP_BBR_USE_RACK_RR: 14764 RACK_OPTS_INC(tcp_rack_rr); 14765 if (optval) 14766 rack->use_rack_rr = 1; 14767 else 14768 rack->use_rack_rr = 0; 14769 break; 14770 case TCP_RACK_PKT_DELAY: 14771 /* RACK added ms i.e. 
rack-rtt + reord + N */ 14772 RACK_OPTS_INC(tcp_rack_pkt_delay); 14773 rack->r_ctl.rc_pkt_delay = optval; 14774 break; 14775 case TCP_RACK_TLP_INC_VAR: 14776 /* Does TLP include rtt variance in t-o */ 14777 error = EINVAL; 14778 break; 14779 case TCP_RACK_IDLE_REDUCE_HIGH: 14780 error = EINVAL; 14781 break; 14782 case TCP_DELACK: 14783 if (optval == 0) 14784 tp->t_delayed_ack = 0; 14785 else 14786 tp->t_delayed_ack = 1; 14787 if (tp->t_flags & TF_DELACK) { 14788 tp->t_flags &= ~TF_DELACK; 14789 tp->t_flags |= TF_ACKNOW; 14790 NET_EPOCH_ENTER(et); 14791 rack_output(tp); 14792 NET_EPOCH_EXIT(et); 14793 } 14794 break; 14795 14796 case TCP_BBR_RACK_RTT_USE: 14797 if ((optval != USE_RTT_HIGH) && 14798 (optval != USE_RTT_LOW) && 14799 (optval != USE_RTT_AVG)) 14800 error = EINVAL; 14801 else 14802 rack->r_ctl.rc_rate_sample_method = optval; 14803 break; 14804 case TCP_DATA_AFTER_CLOSE: 14805 if (optval) 14806 rack->rc_allow_data_af_clo = 1; 14807 else 14808 rack->rc_allow_data_af_clo = 0; 14809 break; 14810 case TCP_RACK_PACE_REDUCE: 14811 /* sysctl only now */ 14812 error = EINVAL; 14813 break; 14814 default: 14815 return (tcp_default_ctloutput(so, sopt, inp, tp)); 14816 break; 14817 } 14818 #ifdef NETFLIX_STATS 14819 tcp_log_socket_option(tp, sopt->sopt_name, optval, error); 14820 #endif 14821 INP_WUNLOCK(inp); 14822 return (error); 14823 } 14824 14825 static int 14826 rack_get_sockopt(struct socket *so, struct sockopt *sopt, 14827 struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) 14828 { 14829 int32_t error, optval; 14830 uint64_t val; 14831 /* 14832 * Because all our options are either boolean or an int, we can just 14833 * pull everything into optval and then unlock and copy. If we ever 14834 * add an option that is not an int, then this will have quite an 14835 * impact on this routine.
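 *
 * The get side is the mirror image of the set side (again only a sketch
 * of the code below): the value is read into optval while the inpcb
 * lock is still held, then
 *
 *	INP_WUNLOCK(inp);
 *	if (error == 0)
 *		error = sooptcopyout(sopt, &optval, sizeof(optval));
 *
 * so the copyout never runs with the lock held.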
14836 */ 14837 error = 0; 14838 switch (sopt->sopt_name) { 14839 case TCP_RACK_PROFILE: 14840 /* You cannot retrieve a profile, its write only */ 14841 error = EINVAL; 14842 break; 14843 case TCP_RACK_PACE_TO_FILL: 14844 optval = rack->rc_pace_to_cwnd; 14845 break; 14846 case TCP_RACK_NO_PUSH_AT_MAX: 14847 optval = rack->r_ctl.rc_no_push_at_mrtt; 14848 break; 14849 case TCP_SHARED_CWND_ENABLE: 14850 optval = rack->rack_enable_scwnd; 14851 break; 14852 case TCP_RACK_NONRXT_CFG_RATE: 14853 optval = rack->rack_rec_nonrxt_use_cr; 14854 break; 14855 case TCP_NO_PRR: 14856 optval = rack->rack_no_prr; 14857 break; 14858 case TCP_RACK_DO_DETECTION: 14859 optval = rack->do_detection; 14860 break; 14861 case TCP_RACK_MBUF_QUEUE: 14862 /* Now do we use the LRO mbuf-queue feature */ 14863 optval = rack->r_mbuf_queue; 14864 break; 14865 case TCP_TIMELY_DYN_ADJ: 14866 optval = rack->rc_gp_dyn_mul; 14867 break; 14868 case TCP_BBR_IWINTSO: 14869 optval = rack->rc_init_win; 14870 break; 14871 case TCP_RACK_PROP_RATE: 14872 optval = rack->r_ctl.rc_prop_rate; 14873 break; 14874 case TCP_RACK_PROP: 14875 /* RACK proportional rate reduction (bool) */ 14876 optval = rack->r_ctl.rc_prop_reduce; 14877 break; 14878 case TCP_RACK_TLP_REDUCE: 14879 /* RACK TLP cwnd reduction (bool) */ 14880 optval = rack->r_ctl.rc_tlp_cwnd_reduce; 14881 break; 14882 case TCP_RACK_EARLY_RECOV: 14883 /* Should recovery happen early (bool) */ 14884 optval = rack->r_ctl.rc_early_recovery; 14885 break; 14886 case TCP_RACK_PACE_REDUCE: 14887 /* RACK Hptsi reduction factor (divisor) */ 14888 error = EINVAL; 14889 break; 14890 case TCP_BBR_RACK_INIT_RATE: 14891 val = rack->r_ctl.init_rate; 14892 /* convert to kbits per sec */ 14893 val *= 8; 14894 val /= 1000; 14895 optval = (uint32_t)val; 14896 break; 14897 case TCP_RACK_FORCE_MSEG: 14898 optval = rack->rc_force_max_seg; 14899 break; 14900 case TCP_RACK_PACE_MAX_SEG: 14901 /* Max segments in a pace */ 14902 optval = rack->rc_user_set_max_segs; 14903 break; 14904 case TCP_RACK_PACE_ALWAYS: 14905 /* Use the always pace method */ 14906 optval = rack->rc_always_pace; 14907 break; 14908 case TCP_RACK_PRR_SENDALOT: 14909 /* Allow PRR to send more than one seg */ 14910 optval = rack->r_ctl.rc_prr_sendalot; 14911 break; 14912 case TCP_RACK_MIN_TO: 14913 /* Minimum time between rack t-o's in ms */ 14914 optval = rack->r_ctl.rc_min_to; 14915 break; 14916 case TCP_RACK_EARLY_SEG: 14917 /* If early recovery max segments */ 14918 optval = rack->r_ctl.rc_early_recovery_segs; 14919 break; 14920 case TCP_RACK_REORD_THRESH: 14921 /* RACK reorder threshold (shift amount) */ 14922 optval = rack->r_ctl.rc_reorder_shift; 14923 break; 14924 case TCP_RACK_REORD_FADE: 14925 /* Does reordering fade after ms time */ 14926 optval = rack->r_ctl.rc_reorder_fade; 14927 break; 14928 case TCP_BBR_USE_RACK_RR: 14929 /* Do we use the rack cheat for rxt */ 14930 optval = rack->use_rack_rr; 14931 break; 14932 case TCP_RACK_RR_CONF: 14933 optval = rack->r_rr_config; 14934 break; 14935 case TCP_BBR_HDWR_PACE: 14936 optval = rack->rack_hdw_pace_ena; 14937 break; 14938 case TCP_RACK_TLP_THRESH: 14939 /* RACK TLP theshold i.e. srtt+(srtt/N) */ 14940 optval = rack->r_ctl.rc_tlp_threshold; 14941 break; 14942 case TCP_RACK_PKT_DELAY: 14943 /* RACK added ms i.e. 
rack-rtt + reord + N */ 14944 optval = rack->r_ctl.rc_pkt_delay; 14945 break; 14946 case TCP_RACK_TLP_USE: 14947 optval = rack->rack_tlp_threshold_use; 14948 break; 14949 case TCP_RACK_TLP_INC_VAR: 14950 /* Does TLP include rtt variance in t-o */ 14951 error = EINVAL; 14952 break; 14953 case TCP_RACK_IDLE_REDUCE_HIGH: 14954 error = EINVAL; 14955 break; 14956 case TCP_RACK_PACE_RATE_CA: 14957 optval = rack->r_ctl.rc_fixed_pacing_rate_ca; 14958 break; 14959 case TCP_RACK_PACE_RATE_SS: 14960 optval = rack->r_ctl.rc_fixed_pacing_rate_ss; 14961 break; 14962 case TCP_RACK_PACE_RATE_REC: 14963 optval = rack->r_ctl.rc_fixed_pacing_rate_rec; 14964 break; 14965 case TCP_RACK_GP_INCREASE_SS: 14966 optval = rack->r_ctl.rack_per_of_gp_ss; 14967 break; 14968 case TCP_RACK_GP_INCREASE_CA: 14969 optval = rack->r_ctl.rack_per_of_gp_ca; 14970 break; 14971 case TCP_BBR_RACK_RTT_USE: 14972 optval = rack->r_ctl.rc_rate_sample_method; 14973 break; 14974 case TCP_DELACK: 14975 optval = tp->t_delayed_ack; 14976 break; 14977 case TCP_DATA_AFTER_CLOSE: 14978 optval = rack->rc_allow_data_af_clo; 14979 break; 14980 case TCP_SHARED_CWND_TIME_LIMIT: 14981 optval = rack->r_limit_scw; 14982 break; 14983 default: 14984 return (tcp_default_ctloutput(so, sopt, inp, tp)); 14985 break; 14986 } 14987 INP_WUNLOCK(inp); 14988 if (error == 0) { 14989 error = sooptcopyout(sopt, &optval, sizeof optval); 14990 } 14991 return (error); 14992 } 14993 14994 static int 14995 rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp) 14996 { 14997 int32_t error = EINVAL; 14998 struct tcp_rack *rack; 14999 15000 rack = (struct tcp_rack *)tp->t_fb_ptr; 15001 if (rack == NULL) { 15002 /* Huh? */ 15003 goto out; 15004 } 15005 if (sopt->sopt_dir == SOPT_SET) { 15006 return (rack_set_sockopt(so, sopt, inp, tp, rack)); 15007 } else if (sopt->sopt_dir == SOPT_GET) { 15008 return (rack_get_sockopt(so, sopt, inp, tp, rack)); 15009 } 15010 out: 15011 INP_WUNLOCK(inp); 15012 return (error); 15013 } 15014 15015 static int 15016 rack_pru_options(struct tcpcb *tp, int flags) 15017 { 15018 if (flags & PRUS_OOB) 15019 return (EOPNOTSUPP); 15020 return (0); 15021 } 15022 15023 static struct tcp_function_block __tcp_rack = { 15024 .tfb_tcp_block_name = __XSTRING(STACKNAME), 15025 .tfb_tcp_output = rack_output, 15026 .tfb_do_queued_segments = ctf_do_queued_segments, 15027 .tfb_do_segment_nounlock = rack_do_segment_nounlock, 15028 .tfb_tcp_do_segment = rack_do_segment, 15029 .tfb_tcp_ctloutput = rack_ctloutput, 15030 .tfb_tcp_fb_init = rack_init, 15031 .tfb_tcp_fb_fini = rack_fini, 15032 .tfb_tcp_timer_stop_all = rack_stopall, 15033 .tfb_tcp_timer_activate = rack_timer_activate, 15034 .tfb_tcp_timer_active = rack_timer_active, 15035 .tfb_tcp_timer_stop = rack_timer_stop, 15036 .tfb_tcp_rexmit_tmr = rack_remxt_tmr, 15037 .tfb_tcp_handoff_ok = rack_handoff_ok, 15038 .tfb_pru_options = rack_pru_options, 15039 }; 15040 15041 static const char *rack_stack_names[] = { 15042 __XSTRING(STACKNAME), 15043 #ifdef STACKALIAS 15044 __XSTRING(STACKALIAS), 15045 #endif 15046 }; 15047 15048 static int 15049 rack_ctor(void *mem, int32_t size, void *arg, int32_t how) 15050 { 15051 memset(mem, 0, size); 15052 return (0); 15053 } 15054 15055 static void 15056 rack_dtor(void *mem, int32_t size, void *arg) 15057 { 15058 15059 } 15060 15061 static bool rack_mod_inited = false; 15062 15063 static int 15064 tcp_addrack(module_t mod, int32_t type, void *data) 15065 { 15066 int32_t err = 0; 15067 int num_stacks; 15068 15069 switch (type) { 15070 case
MOD_LOAD: 15071 rack_zone = uma_zcreate(__XSTRING(MODNAME) "_map", 15072 sizeof(struct rack_sendmap), 15073 rack_ctor, rack_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); 15074 15075 rack_pcb_zone = uma_zcreate(__XSTRING(MODNAME) "_pcb", 15076 sizeof(struct tcp_rack), 15077 rack_ctor, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); 15078 15079 sysctl_ctx_init(&rack_sysctl_ctx); 15080 rack_sysctl_root = SYSCTL_ADD_NODE(&rack_sysctl_ctx, 15081 SYSCTL_STATIC_CHILDREN(_net_inet_tcp), 15082 OID_AUTO, 15083 #ifdef STACKALIAS 15084 __XSTRING(STACKALIAS), 15085 #else 15086 __XSTRING(STACKNAME), 15087 #endif 15088 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 15089 ""); 15090 if (rack_sysctl_root == NULL) { 15091 printf("Failed to add sysctl node\n"); 15092 err = EFAULT; 15093 goto free_uma; 15094 } 15095 rack_init_sysctls(); 15096 num_stacks = nitems(rack_stack_names); 15097 err = register_tcp_functions_as_names(&__tcp_rack, M_WAITOK, 15098 rack_stack_names, &num_stacks); 15099 if (err) { 15100 printf("Failed to register %s stack name for " 15101 "%s module\n", rack_stack_names[num_stacks], 15102 __XSTRING(MODNAME)); 15103 sysctl_ctx_free(&rack_sysctl_ctx); 15104 free_uma: 15105 uma_zdestroy(rack_zone); 15106 uma_zdestroy(rack_pcb_zone); 15107 rack_counter_destroy(); 15108 printf("Failed to register rack module -- err:%d\n", err); 15109 return (err); 15110 } 15111 tcp_lro_reg_mbufq(); 15112 rack_mod_inited = true; 15113 break; 15114 case MOD_QUIESCE: 15115 err = deregister_tcp_functions(&__tcp_rack, true, false); 15116 break; 15117 case MOD_UNLOAD: 15118 err = deregister_tcp_functions(&__tcp_rack, false, true); 15119 if (err == EBUSY) 15120 break; 15121 if (rack_mod_inited) { 15122 uma_zdestroy(rack_zone); 15123 uma_zdestroy(rack_pcb_zone); 15124 sysctl_ctx_free(&rack_sysctl_ctx); 15125 rack_counter_destroy(); 15126 rack_mod_inited = false; 15127 } 15128 tcp_lro_dereg_mbufq(); 15129 err = 0; 15130 break; 15131 default: 15132 return (EOPNOTSUPP); 15133 } 15134 return (err); 15135 } 15136 15137 static moduledata_t tcp_rack = { 15138 .name = __XSTRING(MODNAME), 15139 .evhand = tcp_addrack, 15140 .priv = 0 15141 }; 15142 15143 MODULE_VERSION(MODNAME, 1); 15144 DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); 15145 MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1); 15146